示例#1
0
    def parse_item(self, response):
        """Build a PoiItem from a user profile page (site_id 26).

        Returns None when the display name, the avatar or the ``user_id``
        request meta value is missing; otherwise returns the populated item.
        """
        item = PoiItem()

        try:
            item['name'] = response.css(
                'span#profileVerifyName::text').extract()[0]
            item['avatar'] = response.css(
                'a.photo-trigger img::attr(src)').extract()[0]

            item['site_id'] = 26
            item['user_id'] = str(response.meta['user_id'])
        except (IndexError, KeyError):
            # IndexError: a selector matched nothing; KeyError: no user_id in
            # the request meta.  Either way this is not a usable profile.
            return

        # The basic-info spans are read by position:
        # 0 gender, 1 age, 2 height, 3 location.
        info = response.css('div.profile-basic-info-left span')

        gender_text = info[0].css('::text').extract()[0]
        item['gender'] = 'M' if gender_text == u'男' else 'F'

        # Only a number of years is available, so the birthday is stored as
        # "<current year - age>-00-00".
        age = int(info[1].css('em::text').extract()[0])
        item['birthday'] = str(datetime.date.today().year - age) + '-00-00'

        item['height'] = info[2].css('em::text').extract()[0] + 'cm'

        item['location'] = info[3].css('::text').extract()[0]

        item['score'] = int(
            response.css('span.js-totalPercent::text').extract()[0])
        return item
示例#2
0
    def parse_item_json(self, response):
        """Fill a PoiItem from a JSON user payload and request the detail page.

        Yields one Request for the user's detail page, carrying the item in
        ``meta``, when the user's name is non-empty; yields nothing otherwise.
        """
        payload = json.loads(response.body)
        profile = PoiItem()
        user = payload['user']

        profile['site_id'] = 3
        profile['user_id'] = user['id']
        profile['name'] = user['name']

        if user['sex'] == 1:
            profile['gender'] = 'M'
        else:
            profile['gender'] = 'F'

        profile['level'] = user['tb_age']
        profile['followers'] = int(user['fans_num'])
        profile['following'] = int(user['concern_num'])

        # Falsy counters (empty string, None, 0) are stored as 0.
        profile['post_num'] = int(user['post_num'] or '0')
        profile['reply_num'] = int(user['repost_num'] or '0')

        profile['description'] = user['intro']

        portrait_base = 'http://tb.himg.baidu.com/sys/portrait/item/'
        profile['avatar'] = portrait_base + user['portrait']

        if not profile['name']:
            return

        detail_url = 'http://www.baidu.com/p/' + profile['name'] + '/detail'
        yield Request(detail_url,
                      callback=self.parse_item_detail,
                      meta={'item': profile},
                      priority=10)
示例#3
0
    def parse_item(self, response):
        item = PoiItem()

        try:
            item['name'] = response.css('div.portrait img::attr(alt)').extract()[0]
            item['avatar'] = response.css('div.portrait img::attr(src)').extract()[0]
            item['gender'] = 'M' if 'male' in response.css('div.portrait i::attr(class)').extract()[0] else 'F'

            # If name is available, then this item is valid.
            item['site_id'] = 1 # Site id of tianya.
            item['user_id'] = str(response.meta['user_id'])
        except:
            return

        item['description'] = ''.join(response.css('div.profile p::text').extract())
        item['followers'] = int(response.css('div.link-box a::text').extract()[1])
        item['following'] = int(response.css('div.link-box a::text').extract()[0])
        item['score'] = int(response.css('div.userinfo').re(ur'分</span>(.*?)</p>')[0])
        item['login_num'] = int(response.css('div.userinfo').re(ur'登录次数</span>(.*?)</p>')[0])
        item['last_login_time'] = response.css('div.userinfo').re(ur'最新登录</span>(.*?)</p>')[0]
        item['reg_time'] = response.css('div.userinfo').re(ur'注册日期</span>(.*?)</p>')[0]

        # May not have location or level.
        r = response.css('div.userinfo').re(ur'区</span>(.*?)</p>')
        if r: item['location'] = r[0]
        r = response.css('div.userinfo').re(ur'_blank">(.*?)</a>')
        if r: item['level'] = r[0]

        # Inexistence indicats zero.
        r = response.css('div.mod-hd').re(ur'主贴(\d+)')
        item['post_num'] = int(r[0]) if r else 0
        r = response.css('div.mod-hd').re(ur'回帖(\d+)')
        item['reply_num'] = int(r[0]) if r else 0

        return item
示例#4
0
    def parse_item(self, response):
        """Build a PoiItem from a user profile page (site_id 20).

        Returns None when the page title is exactly u'提示消息' (presumably
        the site's notice page shown instead of a profile -- confirm).
        Otherwise the name is the title minus its last five characters, and
        the optional fields are sliced out of the raw ``<dl>`` markup.
        """
        item = PoiItem()

        title = response.xpath("//title/text()").extract()[0]
        if title == u'提示消息':
            return

        # Raw HTML of each <dl> in the basics block; fields are addressed by
        # position and cut out with fixed character offsets.
        info0 = response.xpath(
            "//div[@class='person_basics']/div[1]/div[1]/dl").extract()

        item['name'] = title[0:-5]
        item['avatar'] = response.xpath(
            "//p[@class='per_img']/a/img/@src").extract()[0]
        item['site_id'] = 20
        item['user_id'] = str(response.meta['user_id'])

        education_level = {
            u'高中': u'1',
            u'大学本科': u'4',
            u'硕士': u'5',
            u'中专': u'2',
            u'大专': u'3',
            u'博士': u'6',
            u'其他': u'7',
        }

        # An entry whose markup contains 'span' is skipped (presumably an
        # unfilled placeholder -- confirm against a live page).
        if u'span' not in info0[6]:
            item['occupation'] = info0[6][35:-10]
        if u'span' not in info0[7]:
            education = info0[7][35:-10]
            if education in education_level:
                item['education_level'] = education_level[education]
        if u'span' not in info0[1]:
            item['height'] = info0[1][35:-10]
        if u'span' not in info0[8]:
            item['salary'] = info0[8][35:-10]
        if u'span' not in info0[2]:
            item['location'] = info0[2][34:-10]
        if u'span' not in info0[0]:
            # The sliced number is subtracted from the current year, so the
            # birthday is stored as "<year>-00-00".
            item['birthday'] = str(datetime.date.today().year -
                                   int(info0[0][35:-23])) + '-00-00'

        return item
示例#5
0
    def parse_item(self, response):
        """Build a PoiItem from a user profile page (site_id 17).

        Returns None when a required selector matches nothing, the
        ``dd.d01`` list is too short for the positional fields, or the
        ``user_id`` request meta value is missing.  Education and marital
        status are stored only when the page text maps to a known code;
        salary and location are skipped when shown as u'保密'.
        """
        education_level = {
            u'初中': u'1',
            u'中专/职高/技校': u'3',
            u'高中': u'1',
            u'大专': u'2',
            u'本科': u'4',
            u'硕士': u'5',
            u'博士': u'6',
            u'博士后': u'6',
        }
        marital_status = {
            u'已婚': u'M',
            u'单身': u'S',
            u'未婚': u'S',
            u'离异': u'D',
            u'丧偶': u'W',
        }
        withheld = u'保密'

        item = PoiItem()

        try:
            # Extract each text list once; the fields below read it by
            # position.
            d01 = response.css('dl.dl02 dd.d01::text').extract()
            d02 = response.css('dl.dl02 dd.d02::text').extract()

            item['name'] = response.css('dl.dl02 dt::text').extract()[0]
            item['avatar'] = response.css(
                'img#myImagePhoto_mid::attr(src)').extract()[0]
            item['site_id'] = 17
            item['user_id'] = str(response.meta['user_id'])
            item['gender'] = 'M' if u'男' in d01[0] else 'F'
            item['occupation'] = d02[0]

            # Unknown or withheld text is skipped rather than raising
            # KeyError and discarding the whole item.
            education = d01[8].strip()
            if education in education_level:
                item['education_level'] = education_level[education]
            item['height'] = d01[4]

            marital = d01[3].strip()
            if marital in marital_status:
                item['marital_status'] = marital_status[marital]

            # Check the privacy marker on the element that is actually
            # stored (index 2), not on index 0.
            if len(d02) > 2 and d02[2].strip() != withheld:
                item['salary'] = d02[2].strip()
            if d01[2].strip() != withheld:
                item['location'] = d01[2].strip()

            item['body_size'] = d01[5]
            item['looks'] = d01[6]
        except (IndexError, KeyError):
            # IndexError: selector matched nothing / list too short;
            # KeyError: no user_id in the request meta.
            return

        return item
示例#6
0
    def parse_item(self, response):
        """Build a PoiItem from a profile page embedding a memberInfo JSON blob.

        The body is decoded as GBK and the JSON argument of the
        ``memberInfo : $.parseJSON('...')`` call is parsed.  Returns None
        when that blob is not found or lacks the name/photo/sex keys, or
        when the ``user_id`` request meta value is missing.  The description
        is optional and is skipped when it is the site's default text.
        """
        item = PoiItem()
        # Decode once; both regexes search the same text.
        body = response.body.decode('gbk')

        member = re.search(r"memberInfo : \$.parseJSON\('(.*?)'\),", body,
                           re.S)
        if member is None:
            # No embedded member data: nothing to build an item from.
            return
        ans = json.loads(member.group(1))

        try:
            item['name'] = ans["fullName"]
            item['avatar'] = ans["photo"]
            item['gender'] = 'M' if ans["sex"] == 0 else 'F'
            # If name is available, then this item is valid.
            item['site_id'] = 9  # Site id of zhenai.
            item['user_id'] = str(response.meta['user_id'])
        except KeyError:
            return

        marital_status = {
            u'未婚': u'S',
            u'离异': u'D',
            u'丧偶': u'W',
        }
        if ans["marriage"] in marital_status:
            item['marital_status'] = marital_status[ans["marriage"]]

        education_level = {
            u'高中及以下': u'1',
            u'大学本科': u'4',
            u'硕士': u'5',
            u'中专': u'2',
            u'大专': u'3',
            u'博士': u'6',
            u'其他': u'7',
        }
        if ans["education"] in education_level:
            item['education_level'] = education_level[ans["education"]]

        item['height'] = ans["height"]
        item['occupation'] = ans["occupation"]
        item['salary'] = ans["salary"]
        item['location'] = ans["workCity"]
        # Only an age is available, so the birthday is "<birth year>-00-00".
        item['birthday'] = str(datetime.date.today().year -
                               int(ans["age"])) + '-00-00'

        described = re.search(
            '<p class="fs14 lh20 c5e slider-area-js">(.*?)<span class="info-mark"></span></p>',
            body, re.S)
        if described is not None:
            description = described.group(1)
            if u"Ta有点害羞,需要你的鼓励" not in description:
                # Collapse all whitespace out of the description.
                item['description'] = "".join(description.split())
        return item
示例#7
0
    def parse_item(self, response):
        """Build a PoiItem from a user profile page (site_id 21).

        Returns None when the heading, the avatar, a positional ``<li>``
        entry or the ``user_id`` request meta value is missing, or when the
        age text is not a number.  The ``<li>`` texts are read by position
        with a fixed-length label prefix sliced off each one.
        """
        item = PoiItem()

        marital_status = {
            u'未婚': u'S',
            u'离异': u'D',
            u'丧偶': u'W',
        }
        education_level = {
            u'高中': u'1',
            u'本科': u'4',
            u'硕士': u'5',
            u'中专': u'2',
            u'大专': u'3',
            u'博士': u'6',
            u'其他': u'7',
        }

        try:
            user_id = str(response.meta['user_id'])
            heading = response.xpath(
                "//div[@class='pp_s']/p[1]/strong[1]/text()").extract()[0]
            info = response.xpath(
                "//div[@class='pp_x']/ul[1]/li/text()").extract()

            # The name is the heading minus a trailing part whose length is
            # five characters plus the length of the user id.
            item['name'] = heading[0:-5 - len(user_id)]
            item['avatar'] = 'http://www.51findlove.cn/' + response.xpath(
                "//div[@class='happ_img']/img/@src").extract()[0]
            item['site_id'] = 21
            item['user_id'] = user_id

            marital = info[4][5:]
            if marital in marital_status:
                item['marital_status'] = marital_status[marital]

            education = info[2][3:]
            if education in education_level:
                item['education_level'] = education_level[education]

            item['height'] = info[1][3:]
            item['salary'] = info[5][3:]
            item['location'] = info[6][3:]
            # Only an age is available, so the birthday is
            # "<birth year>-00-00".
            item['birthday'] = str(datetime.date.today().year -
                                   int(info[0][3:-1])) + '-00-00'
        except (IndexError, KeyError, ValueError):
            # IndexError: missing node; KeyError: no user_id in the request
            # meta; ValueError: age text is not numeric.
            return
        return item
示例#8
0
    def parse_item(self, response):
        """Build a PoiItem from a user profile page (site_id 23).

        Two positional text lists are used: ``info`` (paragraphs of the
        ``jibenxinxi`` block, each with a three-character label prefix) and
        ``info2`` (second-column cells of a nested table).  Returns None
        when any required entry or the ``user_id`` request meta value is
        missing, or when the age text is not a number.
        """
        item = PoiItem()
        info = response.xpath("//div[@id='jibenxinxi']/p/text()").extract()

        info2 = response.xpath(
            "//table[1]/tr[1]/td[1]/table[1]/tr/td[2]/text()").extract()

        marital_status = {
            u'未婚': u'S',
            u'离异': u'D',
            u'丧偶': u'W',
        }
        education_level = {
            u'高中': u'1',
            u'大学本科': u'4',
            u'硕士': u'5',
            u'中专': u'2',
            u'大专': u'3',
            u'博士': u'6',
            u'其他': u'7',
        }

        try:
            item['name'] = info[0][3:]
            item['avatar'] = response.xpath(
                "//div[@id='bigface']/div/img/@src").extract()[0]
            item['gender'] = 'M' if info[3][3:] == u'男' else 'F'
            item['site_id'] = 23
            item['user_id'] = str(response.meta['user_id'])

            item['occupation'] = info[6][3:]

            if info2[0] in marital_status:
                item['marital_status'] = marital_status[info2[0]]
            if info2[3] in education_level:
                item['education_level'] = education_level[info2[3]]

            item['height'] = info2[1]
            item['salary'] = info2[6]
            item['location'] = info[5][3:]
            # Only an age is available, so the birthday is
            # "<birth year>-00-00".
            item['birthday'] = str(datetime.date.today().year -
                                   int(info[4][3:-1])) + '-00-00'
        except (IndexError, KeyError, ValueError):
            # IndexError: missing node; KeyError: no user_id in the request
            # meta; ValueError: age text is not numeric.
            return
        return item
示例#9
0
 def parse_item(self, response):
     """Yield one PoiItem whose fields are the text nodes at fixed XPaths.

     Each field receives the full list returned by ``extract()`` (possibly
     empty), not a single string.
     """
     field_paths = (
         ('name', '/html/body/div[2]/div[1]/div[1]/h1/text()'),
         ('address', '/html/body/div[2]/div[1]/div[2]/ul/li[4]/text()'),
         ('category', '/html/body/div[2]/div[1]/div[2]/ul/li[6]/text()'),
         ('wgs_84', '/html/body/div[2]/div[1]/div[2]/ul/li[7]/text()'),
         ('gcj_02', '/html/body/div[2]/div[1]/div[2]/ul/li[8]/text()'),
         ('bd_09', '/html/body/div[2]/div[1]/div[2]/ul/li[9]/text()'),
     )

     poi = PoiItem()
     for field, path in field_paths:
         poi[field] = response.xpath(path).extract()
     yield poi
示例#10
0
    def parse_item(self, response):
        item = PoiItem()
        root = response.xpath('//html')
        try:
            name = response.css('div#member_messages h3 a::text').extract()[0]
            if name == u'编辑' or name == '':
                return
            item['name'] = name

            # If name is available, then this item is valid.
            item['site_id'] = 13  # Site id of tianya.
            item['user_id'] = str(response.meta['user_id'])
        except:
            return
        desc = response.css(
            'div#member_messages h3 span::text').extract()[0].strip()
        if desc != '':
            item['description'] = desc

        avatar = response.css('div.face a img::attr(src)').extract()[0]
        if avatar[0:4] == 'http':
            item['avatar'] = avatar
        else:
            item['avatar'] = 'http://www.easydong.com' + avatar

        r = root.re(ur'<li><span>性别:</span>(.*?)</li>')
        if r and r[0].strip() != '':
            item['gender'] = 'M' if u'男' in r[0] else 'F'

        r = root.re(ur'<li><span>生日:</span>(.*?)</li>')
        if r and r[0].strip() != '':
            birthday = r[0].strip()
            if len(birthday) < 10:
                item['birthday'] = u'0000-' + birthday
            else:
                item['birthday'] = birthday

        r = root.re(ur'<li><span>易动积分:</span>(\d+)</li>')
        if r and r[0].strip() != '': item['score'] = int(r[0].strip())

        r = root.re(ur'<li><span>运动主场:</span>(.*?)</li>')
        if r and r[0].strip() != '': item['location'] = r[0].strip()

        r = root.re(ur'<li><span>体育爱好:</span>(.*?)</li>')
        if r and r[0].strip() != '': item['favorites'] = r[0].strip()

        return item
示例#11
0
文件: 019-7rdao.py 项目: songboyu/poi
    def parse_item(self, response):
        """Build a PoiItem from the ``baseInfo`` block of a profile page (site_id 19).

        The ``<dd>`` text nodes are read by position.  Nothing is guarded:
        a missing node, a missing ``user_id`` meta value or a non-numeric
        age raises.
        """
        profile = PoiItem()
        display_name = response.xpath(
            "//div[@id='baseInfo']/dl/dd[2]/a/text()").extract()[0]
        fields = response.xpath("//div[@id='baseInfo']/dl/dd/text()").extract()

        profile['name'] = display_name
        avatar_sources = response.xpath("//a[@class='img']/img/@src").extract()
        profile['avatar'] = avatar_sources[0]
        if fields[0] == u'男':
            profile['gender'] = 'M'
        else:
            profile['gender'] = 'F'
        profile['site_id'] = 19
        profile['user_id'] = str(response.meta['user_id'])

        profile['occupation'] = fields[6]

        marital_codes = {u'未婚': u'S', u'离异': u'D', u'丧偶': u'W'}
        marital_code = marital_codes.get(fields[4])
        if marital_code is not None:
            profile['marital_status'] = marital_code

        education_codes = {
            u'高中': u'1',
            u'中专': u'2',
            u'大专': u'3',
            u'大学本科': u'4',
            u'硕士': u'5',
            u'博士': u'6',
            u'其他': u'7',
        }
        education_code = education_codes.get(fields[5])
        if education_code is not None:
            profile['education_level'] = education_code

        profile['height'] = fields[1]
        profile['salary'] = fields[7]
        # Drop the last two characters of the location text.
        profile['location'] = fields[8][0:-2]

        # Only an age is available, so the birthday is "<birth year>-00-00".
        birth_year = datetime.date.today().year - int(fields[2])
        profile['birthday'] = str(birth_year) + '-00-00'
        return profile
示例#12
0
    def parse_item(self, response):
        """Build a PoiItem from a user profile page (site_id 10).

        Returns None when the name or the ``user_id`` request meta value is
        missing.  The '/'-separated summary line supplies location, height,
        education and occupation by position; education is stored only when
        its text maps to a known code.
        """
        item = PoiItem()

        try:
            item['name'] = response.css('p.nametit b::text').extract()[0]

            # If name is available, then this item is valid.
            item['site_id'] = 10
            item['user_id'] = str(response.meta['user_id'])
        except (IndexError, KeyError):
            # IndexError: the name selector matched nothing; KeyError: no
            # user_id in the request meta.
            return

        item['avatar'] = response.css('img#changesize::attr(src)').extract()[0]

        # The same span carries both the gender character and the age digits.
        title_span = response.css('p.nametit span::text').extract()[0]
        item['gender'] = 'M' if u'男' in title_span else 'F'
        age = int(re.findall(r'(\d+)', title_span)[0])
        # Only an age is available, so the birthday is "<birth year>-00-00".
        item['birthday'] = str(datetime.date.today().year - age) + '-00-00'

        text = response.css('div.font12::text').extract()[0].replace(
            '&nbsp;', '').split('/')
        item['location'] = text[0].strip()
        item['height'] = text[1].strip()
        education_level = {
            u'专科以下': u'1',
            u'专科': u'3',
            u'本科': u'4',
            u'硕士': u'5',
            u'博士': u'6',
            u'博士后': u'6',
        }
        # Unknown education text is skipped instead of raising KeyError.
        education = text[2].strip()
        if education in education_level:
            item['education_level'] = education_level[education]
        item['occupation'] = text[3].strip().replace(u'职业:', '')

        # The two literal strings below are treated as "no description".
        desc = response.css('p#updateDesc span::text').extract()[0]
        if desc != u'评论一下嘛…' and desc != u'在照片上写点什么':
            item['description'] = desc

        # An <img> inside the name element selects the first level label.
        r = response.css('p.nametit b img').extract()
        if r:
            item['level'] = '认证'
        else:
            item['level'] = '未认证'

        return item
示例#13
0
    def parse_item(self, response):
        item = PoiItem()

        try:
            item['name'] = response.css('a.founder::text').extract()[0]
            item['avatar'] = response.css('div.photo img::attr(src)').extract()[0]

            item['site_id'] = 8
            item['user_id'] = str(response.meta['user_id'])
        except:
            return

        r = response.css('div.info').re(ur'常居地:</span>(.*?)<')
        if r: item['location'] = r[0].strip()

        r = response.css('div.intro::text').extract()
        if r: item['description'] = r[0].strip()

        return item
示例#14
0
    def parse_item(self, response):
        """Build a PoiItem from the ``data`` object of a JSON response (site_id 25).

        Returns None when ``uid`` is falsy, or when the nick, the avatar or
        the ``user_id`` request meta value is missing.  The birthday is set
        only when the age text (minus its last character) is numeric, and
        the education level only when its text maps to a known code.
        """
        item = PoiItem()
        j = json.loads(response.body)['data']
        if not j['uid']:
            return
        try:
            item['name'] = j['nick']
            item['avatar'] = j['avatar_150']

            # If name is available, then this item is valid.
            item['site_id'] = 25
            item['user_id'] = str(response.meta['user_id'])
        except KeyError:
            return

        # The age value carries one trailing character that is dropped
        # before conversion.
        age = j['age'][0:-1]
        try:
            item['birthday'] = str(datetime.date.today().year -
                                   int(age)) + '-00-00'
        except ValueError:
            # Non-numeric age: leave the birthday unset.
            pass

        item['height'] = j['height']
        item['description'] = j['intro']
        item['location'] = j['location']
        item['salary'] = j['salary']
        item['gender'] = 'M' if j['sex'] == '1' else 'F'
        education_level = {
            u'初中': u'1',
            u'中专/职高/技校': u'2',
            u'高中/中专': u'2',
            u'大专以下': u'2',
            u'大专': u'3',
            u'本科': u'4',
            u'硕士': u'5',
            u'博士': u'6',
            u'博士后': u'6',
        }
        # Unknown education text is skipped instead of raising KeyError.
        education = j['education'].strip()
        if education in education_level:
            item['education_level'] = education_level[education]
        return item
示例#15
0
    def parse(self, response):
        """Parse one listing page and schedule the next page, city or category.

        Yields one PoiItem per table row on every page, including the last
        page of a listing.  Then yields a single Request: the next page of
        the current category/city, or, on the last page, the first page of
        the next city (moving to the next category once the cities are
        exhausted).  When no category is left the generator simply ends.
        """
        # Current page number and total page count, read from the pager.
        page_cur = int(
            response.xpath(
                '//ul[@class="pagination pagination-sm mar-t5"]/li[@class="active"]/a/text()'
            ).extract()[0])
        page_num = int(
            response.xpath(
                '//ul[@class="pagination pagination-sm mar-t5"]/li[last()]/a/text()'
            ).extract()[0])

        zh = resolve(self.url)
        category_cur = zh[0]
        city_cur = zh[1]

        def cell(tr, column, default=''):
            """Return the first text node of the given <td>, or *default*."""
            values = tr.xpath('./td[%d]/text()' % column).extract()
            return values[0] if values else default

        def unescaped_cell(tr, column):
            """Like cell(), HTML-unescaped; an absent cell gives ''."""
            values = tr.xpath('./td[%d]/text()' % column).extract()
            return html.unescape(values[0]) if values else ''

        for tr in response.xpath('//tbody/tr'):
            # A fresh item per row: re-yielding one shared instance lets a
            # later row overwrite an item that is still being processed.
            item = PoiItem()
            # Name, province and city are required; a row without them
            # raises IndexError.
            item['name'] = html.unescape(
                tr.xpath('./td[2]/text()').extract()[0])
            item['province'] = html.unescape(
                tr.xpath('./td[3]/text()').extract()[0])
            item['city'] = html.unescape(
                tr.xpath('./td[4]/text()').extract()[0])

            item['district'] = unescaped_cell(tr, 5)
            item['code'] = cell(tr, 6)
            item['phone_no'] = cell(tr, 7)
            item['region'] = unescaped_cell(tr, 8)
            item['location'] = unescaped_cell(tr, 9)
            item['category'] = cell(tr, 10)
            item['sub_category'] = cell(tr, 11)
            item['longitude'] = cell(tr, 12, 0)
            item['latitude'] = cell(tr, 13, 0)
            yield item

        if page_cur < page_num:
            # Next page: the URL suffix is the current page number times 30.
            url = (self.baseurl + category_cur + '-' + city_cur + '/' +
                   str(page_cur * 30))
        else:
            # Next city.
            self.cityindex += 1
            if self.cityindex == len(self.citymap):
                # Next category.
                self.cateIndex += 1
                # NOTE(review): the bound uses self.category while the lookup
                # uses self.categorymap -- confirm both have the same length.
                if self.cateIndex >= len(self.category):
                    # Nothing left to crawl: end the generator instead of
                    # calling exit(0) from inside a callback.
                    return
                category_cur = self.categorymap[self.cateIndex]
                self.cityindex = 0
            city_cur = self.citymap[self.cityindex]
            url = self.baseurl + category_cur + '-' + city_cur + '/'

        self.url = url
        yield Request(url, callback=self.parse, dont_filter=True)
示例#16
0
    def parse_item(self, response):
        item = PoiItem()
        root = response.xpath('//html')
        try:
            item['name'] = response.css(
                'strong#userNameStrong::text').extract()[0]

            # If name is available, then this item is valid.
            item['site_id'] = 12  # Site id of tianya.
            item['user_id'] = str(response.meta['user_id'])
        except:
            return

        r = root.re(ur'"defaultUrl":"(.*?)"')
        if r:
            item['avatar'] = r[0]
        else:
            item['avatar'] = 'http://profile.baihe.com/new/' + response.css(
                'div#simplePhotoDiv img::attr(src)').extract()[0]

        gender = root.re(ur'var gender_topSendMsg_name_TA = \'(.*?)\';')[0]
        item['gender'] = 'M' if u'他' in gender else 'F'

        birthday = root.re(ur'var oppAge = (\d+);')[0]
        item['birthday'] = str(datetime.date.today().year -
                               int(birthday)) + '-00-00'
        item['height'] = root.re(ur'<strong>身高:</strong><p>(.*?)</p>')[0]

        marital_status = {u'已婚': u'M', u'未婚': u'S', u'离异': u'D', u'丧偶': u'W'}
        r = root.re(ur'<strong>婚姻状况:</strong><p>(.*?)</p>')
        if r: item['marital_status'] = marital_status[r[0].strip()]

        education_level = {
            u'初中': u'1',
            u'中专/职高/技校': u'3',
            u'高中': u'1',
            u'大专': u'2',
            u'本科': u'4',
            u'硕士': u'5',
            u'博士': u'6',
            u'博士后': u'6',
        }
        r = root.re(ur'<strong>学历:</strong><p>(.*?)</p>')
        if r and r[0].strip() != u'以后告诉你':
            item['education_level'] = education_level[r[0].strip()]

        r = root.re(ur'<strong>职业:</strong><p>(.*?)</p>')
        if r and r[0].strip() != u'以后告诉你': item['occupation'] = r[0].strip()

        r = root.re(ur'<strong>月薪:</strong><p>(.*?)</p>')
        if r and r[0].strip() != u'以后告诉你': item['salary'] = r[0].strip()

        r = root.re(ur'<strong>来自:</strong><p>(.*?)</p>')
        if r and r[0].strip() != u'以后告诉你': item['location'] = r[0].strip()

        r = response.css('td#_item_want_know_1::text').extract()
        if r and r[0].strip() != u'以后告诉你': item['hometown'] = r[0].strip()

        item['experience'] = ''
        r = response.css('td#_item_want_know_2::text').extract()
        if r and r[0].strip() != u'以后告诉你':
            item['experience'] += u'毕业学校:' + r[0].strip()

        r = response.css('td#_item_want_know_10::text').extract()
        if r and r[0].strip() != u'以后告诉你':
            item['experience'] += u'  公司行业:' + r[0].strip()

        r = response.css('td#_item_want_know_3::text').extract()
        if r and r[0].strip() != u'以后告诉你': item['body_size'] = r[0].strip()

        r = response.css('td#_item_want_know_5::text').extract()
        if r and r[0].strip() != u'以后告诉你': item['weight'] = r[0].strip()

        r = response.css('td#_item_want_know_9::text').extract()
        if r and r[0].strip() != u'以后告诉你': item['looks'] = r[0].strip()

        bloodType = {
            u'O型': u'1',
            u'A型': u'2',
            u'B型': u'3',
            u'AB型': u'4',
            u'其他型': u'5'
        }
        r = response.css('td#_item_want_know_7::text').extract()
        if r and r[0].strip() != u'以后告诉你':
            item['blood_type'] = bloodType[r[0].strip()]

        item['description'] = response.css(
            'div.pro_details pre::text').extract()[0]

        return item
示例#17
0
    def parse_item(self, response):
        item = PoiItem()
        root = response.xpath('//html')
        try:
            item['name'] = response.css('dd.nickname a::text').extract()[0]

            # If name is available, then this item is valid.
            item['site_id'] = 11  # Site id of tianya.
            item['user_id'] = str(response.meta['user_id'])
        except:
            return

        item['avatar'] = response.css('a.img img::attr(src)').extract()[0]
        item['gender'] = 'M' if u'男' in response.css(
            'dd.f::text').extract()[0] else 'F'
        birthday = root.re(ur'出生年月.*?(\d+)')[0]
        item['birthday'] = birthday[:4] + '-' + birthday[4:] + '-00'
        item['height'] = root.re(ur'(\d+)</dd><dt>年龄:')[0]

        marital_status = {u'未婚': u'S', u'非单身': u'L', u'离异': u'D', u'丧偶': u'W'}
        r = root.re(ur'婚姻状况:</dt><dd class="f">(.*?)</dd><dt>')
        if r: item['marital_status'] = marital_status[r[0].strip()]

        education_level = {
            u'初中': u'1',
            u'高中': u'1',
            u'本科': u'4',
            u'硕士': u'5',
            u'中专': u'2',
            u'专科': u'3',
            u'博士': u'6',
        }
        r = root.re(ur'最高学历:</dt><dd>(.*?)</dd><dt>')
        if r: item['education_level'] = education_level[r[0].strip()]

        r = root.re(ur'从事职业:</dt><dd class="f">(.*?)</dd><dt>')
        if r: item['occupation'] = r[0].strip()

        r = root.re(ur'年收入:</dt><dd>(.*?)</dd><dt>')
        if r: item['salary'] = r[0].strip()

        r = root.re(ur'现居住地:</dt><dd class="f">(.*?)</dd><dt>')
        if r: item['location'] = re.sub(r'<(.*?)>', '', r[0]).strip()

        r = root.re(ur'最后在线时间:(\d+-\d+-\d+ \d+:\d+:\d+)')
        if r: item['last_login_time'] = r[0].strip()

        r = root.re(ur'自我介绍:</b>(.*?)</li>')
        if r: item['description'] = r[0].strip()

        r = root.re(ur'<li><b>我的个性:</b>(.*?)</li>')
        if r: item['personality'] = r[0].strip()

        r = root.re(ur'<li><b>兴趣爱好:</b>(.*?)</li>')
        if r: item['favorites'] = r[0].strip()

        r = root.re(ur'<li><b>我的外貌:</b>(.*?)</li>')
        if r: item['looks'] = r[0].strip()

        r = root.re(ur'诚信值:(\d+)')
        if r: item['score'] = int(r[0].strip())

        r = root.re(ur'(\d+)</span></dd><dt>性别:')
        if r: item['level'] = r[0].strip()

        return item
示例#18
0
    def parse_item(self, response):
        
        item = PoiItem()
        try:
            item['name'] = response.css('span.name::text').extract()[0]
            item['avatar'] = response.css('img#show_pic_1::attr(src)').extract()[0]

            item['site_id'] = 27 # Site id of zhenai.,
            item['user_id'] = str(response.meta['user_id'])
        except:
            return

        info = response.css('dd.userinfo-m').extract()[0]
        item['gender'] = 'M' if re.findall(ur'\((\d+),sex\)', info)[0]=='0' else 'F'
        birthday = re.findall(ur'(\d+)岁', info)[0]
        item['birthday'] = str(datetime.date.today().year - int(birthday)) + '-00-00'

        try:
            item['location'] = self.getAttr(re.findall(r'userdetail.*?\((.*?)\)', info)[0]) + ' ' + self.getAttr(re.findall(r'userdetail.*?\((.*?)\)', info)[1])
        except:
            pass

        info = response.css('dl.infoList dd')
        try:
            r = self.getAttr(re.findall(r'userdetail.*?\((.*?)\)', info[0].extract())[0])
            if r and r!=u'不限' and r!=u'请选择': info['height'] = r+'cm'
        except:
            pass

        try:
            r = self.getAttr(re.findall(r'userdetail.*?\((.*?)\)', info[2].extract())[0])
            if r and r!=u'不限' and r!=u'请选择': info['occupation'] = r
        except:
            pass

        try:
            r = self.getAttr(re.findall(r'userdetail.*?\((.*?)\)', info[7].extract())[0])
            if r and r!=u'不限' and r!=u'请选择': info['weight'] = r
        except:
            pass

        marital_status = {
            u'已婚':u'M',
            u'未婚':u'S',
            u'离异':u'D',
            u'丧偶':u'W'
        }
        try:
            r = self.getAttr(re.findall(r'userdetail.*?\((.*?)\)', info[8].extract())[0])
            if r.strip()!=u'不限' and r!=u'请选择': item['marital_status'] = marital_status[r.strip()]
        except:
            pass

        try:
            r = self.getAttr(re.findall(r'userdetail.*?\((.*?)\)', info[9].extract())[0])
            if r and r!=u'不限' and r!=u'请选择': info['salary1'] = r
        except:
            pass

        #["-1,不限","3,高中及以下","4,大专","5,大学本科","6,硕士","7,博士"],
        education_level = {
            u'高中及以下':u'1',
            u'大专':u'2',
            u'大学本科':u'4',
            u'硕士':u'5',
            u'博士':u'6',
        }
        try:
            r = self.getAttr(re.findall(r'userdetail.*?\((.*?)\)', info[11].extract())[0])
            if r.strip()!=u'不限' and r!=u'请选择': item['education_level'] = education_level[r.strip()]
        except:
            pass

        try:
            item['description'] = response.css('div.InfoData div.Data::text').extract()[0].strip()
        except:
            pass
        return item
示例#19
0
    def parse_item(self, response):
        item = PoiItem()

        html = response.body.decode('gbk')
        try:
            item['name'] = re.findall(ur'我是<em>(.*?)<\/em>', html)[0]
            item['avatar'] = 'http://www.hongniang.com' + response.css(
                'a.da-pic img::attr(src)').extract()[0]
            # If name is available, then this item is valid.
            item['site_id'] = 22  # Site id of tianya.
            item['user_id'] = str(response.meta['user_id'])
        except:
            return

        item['gender'] = 'M' if u'男' in html else 'F'

        try:
            item['hometown'] = re.findall(ur'>(.*?)</a></strong>人', html)[0]
        except:
            pass

        try:
            age = re.findall(ur'今年<em>(\d+)岁</em>', html)[0]
            item['birthday'] = str(datetime.date.today().year -
                                   int(age)) + '-00-00'
        except:
            pass

        try:
            item['height'] = re.findall(
                ur'身高<strong style=" color: #FB8B38">(.*?)</strong>', html)[0]
        except:
            pass

        try:
            item['location'] = re.findall(
                ur'工作在<strong ><a target="_blank" style=" color:#5E83EF; font-weight:bold;">(.*?)<',
                html)[0]
        except:
            pass

        try:
            item['salary'] = re.findall(ur'年入<em>(.*?)</em>', html)[0]
        except:
            pass

        try:
            marital_status = {
                u'已婚': u'M',
                u'未婚': u'S',
                u'离异': u'D',
                u'离异单身': u'D',
                u'离异带孩': u'D',
                u'丧偶': u'W',
                u'丧偶单身': u'W',
                u'丧偶带孩': u'W'
            }
            r = re.findall(ur'color:#7AAE1F">(.*?)</strong>', html)
            if r: item['marital_status'] = marital_status[r[0].strip()]
        except:
            pass

        try:
            education_level = {
                u'初中': u'1',
                u'中专/职高/技校': u'2',
                u'高中/中专': u'2',
                u'大专以下': u'2',
                u'大专': u'3',
                u'本科': u'4',
                u'硕士': u'5',
                u'博士': u'6',
                u'博士后': u'6',
            }
            r = re.findall(ur'</strong>,<em>(.*?)学历</em>', html)
            if r: item['education_level'] = education_level[r[0].strip()]
        except:
            pass

        return item
示例#20
0
    def parse_item(self, response):
        """Fill a PoiItem from a profile page (site id 5) and request counters.

        Returns None when the name, the avatar or the ``user_id`` meta value
        is missing.  Otherwise returns a FormRequest to the ajax endpoint
        that carries the partially filled item in ``meta['item']`` and is
        handled by ``self.parse_item_get_ajax``.
        """
        item = PoiItem()
        try:
            name_row = response.css('ul.hpUserInfoUl li')[0]
            item['name'] = name_row.css('div.oh::text').extract()[0]
            item['avatar'] = response.css('img.br4::attr(src)').extract()[0]

            # If name is available, then this item is valid.
            item['site_id'] = 5
            item['user_id'] = str(response.meta['user_id'])
        except:
            return

        info_rows = response.css('ul.hpUserInfoUl li')

        def row_text(index):
            # Text nodes of the value cell in the index-th info row.
            return info_rows[index].css('div.oh::text').extract()

        location = row_text(1)
        if location:
            item['location'] = location[0]

        gender = row_text(2)
        if gender:
            if u'男' in gender[0]:
                item['gender'] = 'M'
            else:
                item['gender'] = 'F'

        birthday = row_text(3)
        if birthday and birthday[0] != u'无':
            # Turn the "Y 年 M 月 D 日" layout into "Y-M-D".
            text = birthday[0].replace(u' 年 ', '-')
            text = text.replace(u' 月 ', '-')
            item['birthday'] = text.replace(u' 日', '')

        item['login_num'] = row_text(4)[0]

        last_login = row_text(5)
        if last_login and last_login[0] != u'无':
            item['last_login_time'] = last_login[0]

        description = row_text(6)
        if description:
            item['description'] = description[0]

        counters = response.css('div.user-sns-count li')
        item['score'] = int(counters[3].css('a.num::attr(title)').extract()[0])
        level_number = counters[2].css('a.num::text').extract()[0]
        level_title = response.css('div.levelBox div::text').extract()[-1]
        item['level'] = level_number + u'级 - ' + level_title

        item['reg_time'] = response.css(
            'div.hpUserInfo2 span.c999::text').extract()[0]

        payload = json.dumps({
            'header': {},
            'req': {
                'User/SubCount': {
                    'uid': item['user_id']
                },
                'User/SnsCount': {
                    'uid': item['user_id']
                }
            }
        })
        return FormRequest(
            'http://hi.mop.com/ajax/get',
            headers={'X-Requested-With': 'XMLHttpRequest'},
            formdata={
                'data': payload,
                'date': str(int(time.time() * 1000))
            },
            callback=self.parse_item_get_ajax,
            meta={'item': item},
            priority=20,
        )
示例#21
0
文件: 002wangyi.py 项目: songboyu/poi
    def parse_profile(self, response):
        """Parse a user's profile page.

        @url http://lvxiaobin99.blog.163.com/profile
        """

        item = PoiItem()
        root = response.xpath('//html')
        username = response.url.split('/')[2].split('.')[0]
        self.log('Profile=> ' + username, level=scrapy.log.DEBUG)

        # ID attributes.
        item['site_id'] = 2

        item['user_id'] = username

        # Personal attributes.
        r = root.re(r"nickName:'(.*?)'")
        if r: item['name'] = r[0]

        item[
            'avatar'] = 'http://os.blog.163.com/common/ava.s?host=' + username + '&b=1'

        r = root.re(ur'介绍:</td>[\s\S]*?>([\s\S]*?)<')
        if r: item['description'] = r[0].strip()

        r = root.re(r'marital=(\w)')
        if r: item['marital_status'] = r[0]

        r = root.re(r'education=(\d)')
        if r: item['education_level'] = r[0]

        r1 = root.re(r'industry=.*?>(.*?)<')
        r2 = root.re(r'occupation=.*?>(.*?)<')
        if r1 or r2: item['occupation'] = r1[0] + ' ' + r2[0]

        r = root.re(r'salary=.*?>(.*?)<')
        if r: item['salary'] = r[0]

        r = root.re(r'skills=([%\w]+)')
        if r: item['speciality'] = urllib.unquote(r[0].encode('utf8'))

        r = root.re(r'characteristics=[^"]+')
        if r:
            item['personality'] = urllib.unquote(' '.join(
                re.findall('%[%\w]+', ' '.join(r))).encode('utf8'))

        r = root.re(r'favorite\w+=[^"]+')
        if r:
            item['favorites'] = urllib.unquote(' '.join(
                re.findall('%[%\w]+', ' '.join(r))).encode('utf8'))

        r = ' '.join(root.xpath('//div[@class="biograph"]//text()').extract())
        if r: item['experience'] = re.sub(r'\s\s+', ' ', r).strip()

        # Body attributes.
        r = root.re(r'gender=(\w)')
        if r: item['gender'] = r[0]

        r = root.re(r'weight=.*?>(.*?)<')
        if r: item['weight'] = r[0]

        r = root.re(r'height=.*?>(.*?)<')
        if r: item['height'] = r[0]

        r = root.re(r'bodyShape=.*?>(.*?)<')
        if r: item['body_size'] = r[0]

        r = root.re(r'appearance=.*?>(.*?)<')
        if r: item['looks'] = r[0]

        r = root.re(r'bloodType=(\d)')
        if r: item['blood_type'] = r[0]

        # Contact attributes.
        r = root.re(ur'E-Mail :</td>[\s\S]*?>([\s\S]*?)<')
        if r: item['email'] = r[0].strip()

        r = root.re(ur'QQ:</td>[\s\S]*?>([\s\S]*?)<')
        if r: item['qq'] = r[0].strip()

        r = root.re(ur'移动电话:</td>[\s\S]*?>([\s\S]*?)<')
        if r: item['cellphone'] = r[0].strip()

        r = root.re(ur'固定电话:</td>[\s\S]*?>([\s\S]*?)<')
        if r: item['telephone'] = r[0].strip()

        # Time attributes.
        item['reg_time'] = timestamp2datetime(root.re(r'creatTime:(\d+)')[0])

        item['last_update_time'] = timestamp2datetime(
            root.re(r'updateTime:(\d+)')[0])

        item['last_login_time'] = timestamp2datetime(
            root.re(r'lastLoginTime:(\d+)')[0])

        r = root.re(r'birthDate=([^"]+)')
        if r: item['birthday'] = r[0]

        # Location attributes.
        r = root.re(r'type=1[^"]+')
        if r:
            item['location'] = urllib.unquote(' '.join(
                re.findall('%[%\w]+', r[-1])).encode('utf8'))

        r = root.re(r'type=4[^"]+')
        if r:
            item['hometown'] = urllib.unquote(' '.join(
                re.findall('%[%\w]+', r[-1])).encode('utf8'))

        userid = root.re(r'userId:(\d+)')[0]
        yield self.level_request(userid, item)
示例#22
0
    def parse_item(self, response):
        """Parse one profile page (site id 7), then request the next uid.

        ``self.uid`` is incremented on every call.  When the name and avatar
        are present the populated item is yielded, followed by a request for
        the profile of the current ``self.uid``.  When they are missing only
        that follow-up request is yielded.
        """
        self.uid += 1
        if '继续浏览' in response.body:
            self.validateuser(self.uid)

        def next_profile_request():
            # Called at yield time so self.uid and self.cookies are read
            # at the same moment as in the inline construction.
            return Request(
                'http://www.renren.com/' + str(self.uid) +
                '/profile?v=info_timeline',
                headers={
                    'Cookie':
                    self.cookies,
                    'Pragma':
                    'no-cache',
                    'Referer':
                    'http://www.renren.com/343633795/profile',
                    'User-Agent':
                    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.94 Safari/537.36'
                },
                callback=self.parse_item,
                meta={'user_id': self.uid})

        item = PoiItem()

        try:
            item['name'] = response.css('h1.avatar_title::text').extract()[0]
            item['avatar'] = response.css(
                'img#userpic::attr(src)').extract()[0]

            item['site_id'] = 7  # Site id
            item['user_id'] = str(response.meta['user_id'])
        except:
            # Not a usable profile: skip it but keep walking the uid space.
            yield next_profile_request()
            return

        # Work and school lines are concatenated into one experience string.
        item['experience'] = ''
        for selector in ('li.work span::text', 'li.school span::text'):
            found = response.css(selector).extract()
            if found:
                item['experience'] += found[0]

        birthday_line = response.css('li.birthday span::text').extract()
        if birthday_line:
            if u'男' in birthday_line[0]:
                item['gender'] = 'M'
            else:
                item['gender'] = 'F'

        hometown = response.css('li.hometown::text').extract()
        if hometown:
            item['hometown'] = hometown[0]

        address = response.css('li.address::text').extract()
        if address:
            item['location'] = address[0]

        yield item
        yield next_profile_request()
示例#23
0
    def parse_item(self, response):
        item = PoiItem()
        root = response.xpath('//html')
        try:
            name = response.css('div.card-photo a::attr(title)').extract()[0]
            if name.strip() == '':
                return
            item['name'] = name

            # If name is available, then this item is valid.
            item['site_id'] = 14 # Site id of tianya.
            item['user_id'] = str(response.meta['user_id']) 
        except:
            return

        r = response.css('div.card-photo img::attr(src)').extract()
        if r: item['avatar'] = r[0]

        experience = ''
        r = response.css('ul.card-info a.clr-333::attr(title)').extract()
        if r:  experience += ' '.join(r)

        r = root.re(ur'<li><span class="clr-999">工作经验:</span>(.*?)</li>')
        if r:  experience += r[0]

        item['experience'] = experience


        r = response.css('ul.card-info li')[-2].css('a::text').extract()
        if r:
            item['followers'] = int(r[0].strip())
            item['following'] = int(r[1].strip())

        r = root.re(ur'<li><span class="clr-999">性  别:</span>(.*?)</li>')
        if r and r[0].strip() != '': item['gender'] = 'M' if u'男' in r[0].strip() else 'F'

        r = root.re(ur'<li><span class="clr-999">年  龄:</span>([\s\S]*?)</li>')
        if r and r[0].strip() != '': 
            r = re.findall(r'\((.*?)\)', r[0].strip())
            if r:
                birthday = r[0]
                if len(birthday) < 10:
                    item['birthday'] = u'0000-'+birthday
                else:
                    item['birthday'] = birthday

        r = root.re(ur'<li><span class="clr-999">身  高:</span>[\s]*(.+)')
        if r and r[0].strip() != '': item['height'] = r[0].strip()

        r = root.re(ur'<li><span class="clr-999">居 住 地 :</span>(.*?)</li>')
        if r and r[0].strip() != '': item['location'] = r[0].strip()

        r = root.re(ur'<li><span class="clr-999">户  籍:</span>(.*?)</li>')
        if r and r[0].strip() != '': item['hometown'] = r[0].strip()

        r = root.re(ur'<li><span>体育爱好:</span>(.*?)</li>')
        if r and r[0].strip() != '': item['favorites'] = r[0].strip()

        marital_status = {
            u'已婚':u'M',
            u'未婚':u'S'
        }
        r = root.re(ur'<li><span class="clr-999">婚姻状况:</span>(.*?)</li>')
        if r and r[0].strip()!=u'保密': item['marital_status'] = marital_status[r[0].strip()]

        education_level = {
            u'初中':u'1',
            u'高中':u'1',
            u'中技':u'2',
            u'中专':u'2',
            u'大专':u'3',
            u'本科':u'4',
            u'硕士':u'5',
            u'MBA':u'5',
            u'博士':u'6',
        }
        r = root.re(ur'<li><span class="clr-999">最高学历:</span>(.*?)</li>')
        if r: item['education_level'] = education_level[r[0].strip()]

        r = response.css('p.clr-333::text').extract()
        if r: item['description'] = r[0]
        return item
示例#24
0
文件: 028-hzxq.py 项目: songboyu/poi
    def parse_item(self, response):
        """Build a PoiItem from a profile page (site id 28).

        The page lists attributes as <dt> label / <dd> value pairs in two
        containers.  Any exception while parsing makes the method return
        None; otherwise the populated item is returned.
        """
        blood_codes = {
            u'O型': u'1',
            u'A型': u'2',
            u'B型': u'3',
            u'AB型': u'4',
            u'其他型': u'5'
        }
        education_codes = {
            u'初中': u'1',
            u'中专/职高/技校': u'3',
            u'高中': u'1',
            u'大专': u'2',
            u'本科': u'4',
            u'硕士': u'5',
            u'博士': u'6',
            u'博士后': u'6',
        }
        marital_codes = {
            u'已婚': u'M',
            u'恋爱中': u'S',
            u'未婚': u'S',
            u'离异': u'D',
            u'丧偶': u'W'
        }
        # Values shown when the user hid or skipped an answer.
        hidden = (u'保密', u'未填')

        item = PoiItem()

        try:
            item['name'] = response.xpath("//a[@class='sexico1']/text()").extract()[0]
            item['avatar'] = response.xpath("//div[@class='U110MP']/img/@src").extract()[0]
            item['site_id'] = 28
            item['user_id'] = str(response.meta['user_id'])

            # First container: blood type, education, salary, height.
            labels = response.xpath("//div[@class='UmainL3content1']/dt/text()").extract()
            values = response.xpath("//div[@class='UmainL3content1']/dd/text()").extract()
            for idx, label in enumerate(labels):
                # The value is indexed only inside a matching branch, so an
                # unmatched trailing label never touches ``values``.
                if u'血' in label:
                    if values[idx] in blood_codes:
                        item['blood_type'] = blood_codes[values[idx]]
                elif u'学' in label:
                    if values[idx] in education_codes:
                        item['education_level'] = education_codes[values[idx]]
                elif u'月' in label:
                    if values[idx] not in hidden:
                        item['salary'] = values[idx]
                elif u'身' in label:
                    if values[idx] not in hidden:
                        item['height'] = values[idx]

            # Second container: marital status, weight, occupation, birthday.
            labels = response.xpath("//div[@class='UmainL3content2']/dt/text()").extract()
            values = response.xpath("//div[@class='UmainL3content2']/dd/text()").extract()
            for idx, label in enumerate(labels):
                if u'婚姻状况' in label:
                    if values[idx] in marital_codes:
                        item['marital_status'] = marital_codes[values[idx]]
                elif u'体' in label:
                    if values[idx] not in hidden:
                        item['weight'] = values[idx]
                elif u'职' in label:
                    if values[idx] not in hidden:
                        item['occupation'] = values[idx]
                elif u'年' in label:
                    if values[idx] not in hidden:
                        # Keep the ten characters before the final one.
                        item['birthday'] = values[idx][-11:-1]
        except:
            return

        return item
示例#25
0
文件: 030-ganji.py 项目: songboyu/poi
    def parse_item(self, response):
        """Build a PoiItem from a profile page (site id 30).

        Returns None when the name, the avatar or the ``user_id`` meta value
        is missing.  The remaining fields are read positionally from the
        page and raise (IndexError / KeyError / ValueError) when the layout
        or a value is not the expected one.
        """
        item = PoiItem()

        try:
            item['name'] = response.css(
                'div.z-xql span.mr-5::text').extract()[0]
            item['avatar'] = response.css(
                'div.z-img img::attr(src)').extract()[0]

            item['site_id'] = 30
            item['user_id'] = str(response.meta['user_id'])
        except Exception:
            return

        info = response.css('p.z-detail span')

        item['gender'] = 'M' if info[0].css(
            '::text').extract()[0] == u'男' else 'F'

        # Only the age is published, so month and day are stored as zeros.
        item['birthday'] = str(datetime.date.today().year - int(info[1].css(
            '::text').extract()[0])) + '-00-00'

        # The first two characters of the last span are dropped.
        item['location'] = info[-1].css('::text').extract()[0][2:]

        info = response.css('div.z-subcon2 td.w2')

        education_level = {
            u'初中': u'1',
            u'中专/职高/技校': u'2',
            u'高中及以下': u'1',
            u'大专以下': u'2',
            u'大专': u'3',
            u'本科': u'4',
            u'硕士': u'5',
            u'博士': u'6',
            u'博士后': u'6',
        }
        r = info[1].css('span::text').extract()
        if r: item['education_level'] = education_level[r[0].strip()]

        marital_status = {
            u'已婚': u'M',
            u'未婚': u'S',
            u'离异': u'D',
            u'离异单身': u'D',
            u'离异带孩': u'D',
            u'丧偶': u'W',
            u'丧偶单身': u'W',
            u'丧偶带孩': u'W'
        }
        r = info[2].css('span::text').extract()
        if r and r[0] != u'未填写':
            # Store the code under 'marital_status'.  The original wrote it
            # to 'education_level', clobbering the value set just above.
            item['marital_status'] = marital_status[r[0].strip()]

        item['description'] = response.css('div.z-db')[0].css(
            '::text').extract()[0]
        return item
示例#26
0
    def parse_item(self, response):
        """Build a PoiItem from a member page (site id 6).

        Returns None when the name, the avatar or the ``user_id`` meta value
        is missing.  The remaining fields are read positionally and raise
        when the page layout or a value is not the expected one.
        """
        marital_codes = {
            u'未婚': u'S',
            u'已婚': u'M',
            u'恋爱': u'L',
            u'分居': u'P',
            u'离异': u'D',
            u'离异,无小孩': u'D',
            u'离异,有小孩归对方': u'D',
            u'离异,有小孩归自己': u'D',
            u'丧偶': u'W',
            u'丧偶,无小孩': u'W',
            u'丧偶,有小孩归对方': u'W',
            u'丧偶,有小孩归自己': u'W',
        }
        education_codes = {
            u'初中': u'1',
            u'高中': u'1',
            u'本科': u'4',
            u'双学士': u'5',
            u'硕士': u'5',
            u'小学': u'7',
            u'高中中专及以下': u'1',
            u'中专或相当学历': u'2',
            u'大专': u'3',
            u'博士': u'6',
            u'其他': u'7',
        }
        blood_codes = {
            u'O型': u'1',
            u'A型': u'2',
            u'B型': u'3',
            u'AB型': u'4',
            u'其它': u'5'
        }

        item = PoiItem()

        try:
            item['name'] = response.css('h4::text').extract()[0]
            item['avatar'] = response.css(
                'img.img_absolute::attr(_src)').extract()[0]

            # If name is available, then this item is valid.
            item['site_id'] = 6
            item['user_id'] = str(response.meta['user_id'])
        except:
            return
        item['score'] = int(response.css('h6::text').extract()[0])

        # Plain text level first; fall back to the titles of the level links.
        try:
            item['level'] = response.css('span.member_dj::text').extract()[0]
        except:
            level_titles = response.css('span.member_dj a::attr(title)').extract()
            item['level'] = ' '.join(level_titles)

        item['description'] = response.css('div.js_text::text').extract()[0]

        headline = response.css('h6.member_name::text').extract()[0]
        age, marital, location = headline.split(u',')

        # Only the age is published, so month and day are stored as zeros.
        birth_year = datetime.date.today().year - int(age[0:2])
        item['birthday'] = str(birth_year) + '-00-00'
        item['marital_status'] = marital_codes[marital]
        item['location'] = location[2:]

        info_rows = response.css('ul.member_info_list li')

        def info_value(index):
            # Raw first <em> text of the index-th row, or None when the row
            # is empty or shows one of the "no answer" placeholders.
            texts = info_rows[index].css('em::text').extract()
            if texts and texts[0] != u'--' and texts[0] != u'保密':
                return texts[0]
            return None

        education = info_value(0)
        if education is not None:
            item['education_level'] = education_codes[education.strip()]

        height = info_value(1)
        if height is not None:
            item['height'] = height

        salary = info_value(3)
        if salary is not None:
            item['salary'] = salary

        weight = info_value(5)
        if weight is not None:
            item['weight'] = weight

        blood = info_value(9)
        if blood is not None:
            item['blood_type'] = blood_codes[blood.strip()]

        box = response.css('div.js_box')[4]
        if box:
            # Concatenate "label value " for every answered row of the box.
            pieces = []
            for row in box.css('li.fn-clear'):
                answer = row.css('em::text').extract()[0]
                if answer != '--':
                    label = row.css('span::text').extract()[0]
                    pieces.append(label + answer + ' ')
            item['experience'] = ''.join(pieces)

        return item
示例#27
0
文件: 018-lol99.py 项目: songboyu/poi
    def parse_item(self, response):
        """Build a PoiItem from a profile page (site id 18).

        Mandatory fields and the labelled attribute lines are parsed inside
        one guard: any exception makes the method return None, otherwise the
        populated item is returned.
        """
        education_codes = {
            u'初中': u'1',
            u'中专/职高/技校': u'3',
            u'高中': u'1',
            u'大专': u'2',
            u'本科': u'4',
            u'硕士': u'5',
            u'博士': u'6',
            u'博士后': u'6',
        }
        marital_codes = {
            u'已婚': u'M',
            u'单身': u'S',
            u'未婚': u'S',
            u'离异': u'D',
            u'丧偶': u'W'
        }

        item = PoiItem()

        try:
            heading = response.css('div.right_content h2::text').extract()[0]
            # Keep only the text before the first '[' of the heading.
            item['name'] = heading.split('[')[0].rstrip()
            item['avatar'] = response.css(
                'div.pic img::attr(src)').extract()[0]

            item['site_id'] = 18
            item['user_id'] = str(response.meta['user_id'])
            item['description'] = response.css(
                'div.box_content p::text').extract()[0]

            lines = response.xpath(
                "//div[@class='right_content']/p/span/text()").extract()
            for line in lines:
                # Each line is matched on its label; the value is taken by
                # slicing off a fixed-width prefix.
                if u'男' in line:
                    item['gender'] = 'M'
                elif u'女' in line:
                    item['gender'] = 'F'
                elif u'学历' in line:
                    education = line[3:]
                    if education in education_codes:
                        item['education_level'] = education_codes[education]
                elif u'所在地' in line:
                    item['location'] = line[4:]
                elif u'籍贯' in line:
                    item['hometown'] = line[3:]
                elif u'婚姻状况' in line:
                    item['marital_status'] = marital_codes[line[5:]]
                elif u'月薪' in line:
                    item['salary'] = line[3:]
                elif u'身高' in line:
                    item['height'] = line[3:]
        except:
            return

        return item
示例#28
0
    def parse_item(self, response):
        """Build a PoiItem from a profile page (site id 24).

        Attributes are published as "label + value" text lines in two
        containers; each value is taken by slicing off a fixed-width prefix.
        Any exception while parsing makes the method return None, otherwise
        the populated item is returned.
        """
        marital_status = {
            u'已婚': u'M',
            u'恋爱中': u'S',
            u'未婚': u'S',
            u'离异': u'D',
            u'丧偶': u'W'
        }
        blood_type = {
            u'O型': u'1',
            u'A型': u'2',
            u'B型': u'3',
            u'AB型': u'4',
            u'其他型': u'5'
        }
        education_level = {
            u'初中': u'1',
            u'中专/职高/技校': u'3',
            u'高中': u'1',
            u'大专': u'2',
            u'本科': u'4',
            u'硕士': u'5',
            u'博士': u'6',
            u'博士后': u'6',
        }

        item = PoiItem()

        try:
            item['name'] = response.xpath(
                "//p[@class='name']/strong/text()").extract()[0]
            item['avatar'] = response.xpath(
                "//div[@class='photo']/img/@src").extract()[0]

            item['site_id'] = 24
            item['user_id'] = str(response.meta['user_id'])

            lines = response.xpath(
                "//div[@style='background:none;']/dl/dd/text()").extract()
            for line in lines:
                if u'男' in line:
                    item['gender'] = 'M'
                elif u'女' in line:
                    item['gender'] = 'F'
                elif u'身高' in line:
                    item['height'] = line[4:]
                elif u'体重' in line:
                    item['weight'] = line[4:]
                elif u'情感状态' in line:
                    # Look up the sliced value.  The original indexed the
                    # table with the whole line, which always raised
                    # KeyError and made the outer guard drop the item.
                    status = line[6:]
                    if status in marital_status:
                        item['marital_status'] = marital_status[status]

            lines = response.xpath(
                "//div[@class='pro_details']/dl/dd/text()").extract()
            for line in lines:
                if u'兴趣爱好' in line:
                    item['favorites'] = line[6:]
                elif u'血型' in line:
                    if line[4:] in blood_type:
                        item['blood_type'] = blood_type[line[4:]]
                elif u'工作地区' in line:
                    item['location'] = line[5:]
                elif u'学历' in line:
                    if line[3:] in education_level:
                        item['education_level'] = education_level[line[3:]]
                elif u'职业' in line:
                    item['occupation'] = line[3:]
                elif u'年收入' in line:
                    item['salary'] = line[4:]

            paragraphs = response.xpath(
                "//div[@class='pro_details']/p/text()").extract()
            # The first text node is skipped; the rest form the description.
            item['description'] = u''.join(paragraphs[1:])

        except Exception:
            return

        return item
示例#29
0
    def parse_item(self, response):
        """Build a PoiItem from a profile page (site id 16).

        Every field is read inside a single guard: any exception (missing
        element, unknown education / blood / marital value, ...) makes the
        method return None; otherwise the populated item is returned.
        """
        education_codes = {
            u'初中': u'1',
            u'中专/职高/技校': u'3',
            u'高中': u'1',
            u'大专': u'2',
            u'本科': u'4',
            u'硕士': u'5',
            u'博士': u'6',
            u'博士后': u'6',
        }
        blood_codes = {
            u'O型': u'1',
            u'A型': u'2',
            u'B型': u'3',
            u'AB型': u'4',
            u'其他型': u'5'
        }
        marital_codes = {
            u'已婚': u'M',
            u'未婚': u'S',
            u'离异': u'D',
            u'丧偶': u'W'
        }

        item = PoiItem()

        def texts(selector):
            # All text results for a CSS selector on this page.
            return response.css(selector).extract()

        def visible(found):
            # True when a value exists and is not the "secret" placeholder.
            return bool(found) and found[0].strip() != u'保密'

        try:
            item['name'] = texts('a.profile-netname h1::text')[0]
            item['avatar'] = texts('div.profile-user-img-box img::attr(src)')[0]

            item['site_id'] = 16
            item['user_id'] = str(response.meta['user_id'])

            if u'先生' in texts('span#profile_sex::text')[0]:
                item['gender'] = 'M'
            else:
                item['gender'] = 'F'
            item['occupation'] = texts('span#profile_occupation::text')[0]

            education = texts('span#profile_education::text')
            if visible(education):
                item['education_level'] = education_codes[education[0].strip()]

            item['height'] = texts('span#profile_height::text')[0]
            item['weight'] = texts('span#profile_weight::text')[0]

            blood = texts('span#profile_blood_type::text')
            if visible(blood):
                item['blood_type'] = blood_codes[blood[0].strip()]

            marital = texts('span#profile_marital::text')
            if visible(marital):
                item['marital_status'] = marital_codes[marital[0].strip()]

            income = texts('span#profile_income::text')
            if visible(income):
                item['salary'] = income[0].strip()

            residence = texts('span#profile_r_state_id a[target=_blank]::text')
            if visible(residence):
                item['location'] = residence[0].strip()

            item['hometown'] = texts('span#profile_n_state_id::text')[0]

        except:
            return

        return item