def parse_item(self, response):
    """Build a ``PoiItem`` (site_id 26) from a user profile page.

    Returns ``None`` when the name, the avatar or ``response.meta['user_id']``
    cannot be read; otherwise returns the populated item.  The lookups after
    the identity fields are unguarded, so a page that lacks one of them
    raises (typically ``IndexError``).
    """
    profile = PoiItem()

    # Identity fields: any failure here drops the whole item.
    try:
        nickname = response.css('span#profileVerifyName::text').extract()[0]
        portrait = response.css('a.photo-trigger img::attr(src)').extract()[0]
        profile['name'] = nickname
        profile['avatar'] = portrait
        profile['site_id'] = 26
        profile['user_id'] = str(response.meta['user_id'])
    except:
        return

    spans = response.css('div.profile-basic-info-left span')

    gender_text = spans[0].css('::text').extract()[0]
    if gender_text == u'男':
        profile['gender'] = 'M'
    else:
        profile['gender'] = 'F'

    # The number in the second span is subtracted from the current year
    # (presumably it is the age); month and day are stored as "00".
    this_year = datetime.date.today().year
    years = int(spans[1].css('em::text').extract()[0])
    profile['birthday'] = str(this_year - years) + '-00-00'

    height_value = spans[2].css('em::text').extract()[0]
    profile['height'] = height_value + 'cm'

    profile['location'] = spans[3].css('::text').extract()[0]

    score_text = response.css('span.js-totalPercent::text').extract()[0]
    profile['score'] = int(score_text)
    return profile
def parse_item_json(self, response):
    """Turn a JSON user payload into a ``PoiItem`` (site_id 3).

    The response body must be JSON with a top-level ``user`` object.  When
    the user has a non-empty name, a follow-up ``Request`` for the detail
    page is yielded with the partially filled item in ``meta['item']``;
    otherwise nothing is yielded.
    """
    user = json.loads(response.body)['user']

    item = PoiItem()
    item['site_id'] = 3
    item['user_id'] = user['id']                   # id
    item['name'] = user['name']                    # name
    if user['sex'] == 1:                           # gender
        item['gender'] = 'M'
    else:
        item['gender'] = 'F'
    item['level'] = user['tb_age']                 # forum age
    item['followers'] = int(user['fans_num'])      # follower count
    item['following'] = int(user['concern_num'])   # following count
    # item['my_like_num'] = user['my_like_num']    # liked forums (disabled)
    # Empty / missing-looking counters are treated as zero.
    item['post_num'] = int(user['post_num'] or '0')      # total posts
    item['reply_num'] = int(user['repost_num'] or '0')   # replies
    # item['thread_num'] = user['thread_num']      # threads (disabled)
    item['description'] = user['intro']            # introduction
    item['avatar'] = (
        'http://tb.himg.baidu.com/sys/portrait/item/' + user['portrait']
    )                                              # avatar

    if not item['name']:
        return
    detail_url = 'http://www.baidu.com/p/' + item['name'] + '/detail'
    yield Request(detail_url,
                  callback=self.parse_item_detail,
                  meta={'item': item},
                  priority=10)
def parse_item(self, response): item = PoiItem() try: item['name'] = response.css('div.portrait img::attr(alt)').extract()[0] item['avatar'] = response.css('div.portrait img::attr(src)').extract()[0] item['gender'] = 'M' if 'male' in response.css('div.portrait i::attr(class)').extract()[0] else 'F' # If name is available, then this item is valid. item['site_id'] = 1 # Site id of tianya. item['user_id'] = str(response.meta['user_id']) except: return item['description'] = ''.join(response.css('div.profile p::text').extract()) item['followers'] = int(response.css('div.link-box a::text').extract()[1]) item['following'] = int(response.css('div.link-box a::text').extract()[0]) item['score'] = int(response.css('div.userinfo').re(ur'分</span>(.*?)</p>')[0]) item['login_num'] = int(response.css('div.userinfo').re(ur'登录次数</span>(.*?)</p>')[0]) item['last_login_time'] = response.css('div.userinfo').re(ur'最新登录</span>(.*?)</p>')[0] item['reg_time'] = response.css('div.userinfo').re(ur'注册日期</span>(.*?)</p>')[0] # May not have location or level. r = response.css('div.userinfo').re(ur'区</span>(.*?)</p>') if r: item['location'] = r[0] r = response.css('div.userinfo').re(ur'_blank">(.*?)</a>') if r: item['level'] = r[0] # Inexistence indicats zero. r = response.css('div.mod-hd').re(ur'主贴(\d+)') item['post_num'] = int(r[0]) if r else 0 r = response.css('div.mod-hd').re(ur'回帖(\d+)') item['reply_num'] = int(r[0]) if r else 0 return item
def parse_item(self, response):
    """Build a ``PoiItem`` (site_id 20) from a user profile page.

    Returns ``None`` when the page title is exactly ``提示消息`` ("notice
    message"), otherwise the populated item.  The name is the page title
    minus its last five characters.  Optional attributes are cut out of the
    raw ``<dl>`` markup with fixed offsets and are skipped when the entry
    contains the substring ``span``.

    Raises ``IndexError`` / ``KeyError`` when the title, the avatar, one of
    the ``<dl>`` entries or ``response.meta['user_id']`` is missing.
    """
    # Evaluate the title once; it decides validity and supplies the name.
    title = response.xpath("//title/text()").extract()[0]
    if title == u'提示消息':
        return

    item = PoiItem()
    info0 = response.xpath(
        "//div[@class='person_basics']/div[1]/div[1]/dl").extract()

    item['name'] = title[0:-5]
    item['avatar'] = response.xpath(
        "//p[@class='per_img']/a/img/@src").extract()[0]
    item['site_id'] = 20
    item['user_id'] = str(response.meta['user_id'])

    education_level = {
        u'高中': u'1',
        u'大学本科': u'4',
        u'硕士': u'5',
        u'中专': u'2',
        u'大专': u'3',
        u'博士': u'6',
        u'其他': u'7',
    }

    # NOTE(review): the slice offsets presumably strip the fixed markup
    # around each value -- confirm against a live page before changing them.
    if u'span' not in info0[6]:
        item['occupation'] = info0[6][35:-10]
    if u'span' not in info0[7]:
        education = info0[7][35:-10]
        if education in education_level:
            item['education_level'] = education_level[education]
    if u'span' not in info0[1]:
        item['height'] = info0[1][35:-10]
    if u'span' not in info0[8]:
        item['salary'] = info0[8][35:-10]
    if u'span' not in info0[2]:
        item['location'] = info0[2][34:-10]
    if u'span' not in info0[0]:
        # Current year minus the extracted number; month/day stay "00".
        item['birthday'] = str(datetime.date.today().year -
                               int(info0[0][35:-23])) + '-00-00'
    return item
def parse_item(self, response):
    """Build a ``PoiItem`` (site_id 17) from a user profile page.

    Every lookup runs inside a single ``try`` block: if any selector comes
    back short, a label is missing from a mapping table, or ``user_id`` is
    absent from ``response.meta``, the method returns ``None`` instead of
    an item.
    """
    education_codes = {
        u'初中': u'1',
        u'中专/职高/技校': u'3',
        u'高中': u'1',
        u'大专': u'2',
        u'本科': u'4',
        u'硕士': u'5',
        u'博士': u'6',
        u'博士后': u'6',
    }
    marital_codes = {
        u'已婚': u'M',
        u'单身': u'S',
        u'未婚': u'S',
        u'离异': u'D',
        u'丧偶': u'W'
    }
    hidden = u'保密'  # literal shown when the user keeps a field private

    item = PoiItem()
    try:
        item['name'] = response.css('dl.dl02 dt::text').extract()[0]
        item['avatar'] = response.css(
            'img#myImagePhoto_mid::attr(src)').extract()[0]
        item['site_id'] = 17
        item['user_id'] = str(response.meta['user_id'])

        # Text nodes of the two <dd> groups; the selectors are evaluated
        # once and indexed below.
        basics = response.css('dl.dl02 dd.d01::text').extract()
        career = response.css('dl.dl02 dd.d02::text').extract()

        # The [0] lookups double as the non-empty check for both lists.
        item['gender'] = 'M' if u'男' in basics[0] else 'F'
        item['occupation'] = career[0]

        schooling = basics[8].strip()
        if schooling != hidden:
            item['education_level'] = education_codes[schooling]

        item['height'] = basics[4]

        marriage = basics[3].strip()
        if marriage != hidden:
            item['marital_status'] = marital_codes[marriage]

        # NOTE(review): salary and location are gated on entry 0 but store
        # entry 2 -- kept as in the original; confirm this is intended.
        if career[0].strip() != hidden:
            item['salary'] = career[2].strip()
        if basics[0].strip() != hidden:
            item['location'] = basics[2].strip()

        item['body_size'] = basics[5]
        item['looks'] = basics[6]
    except:
        return
    return item
def parse_item(self, response):
    """Build a ``PoiItem`` (site_id 9) from a GBK-encoded profile page.

    The profile data is read from the JSON blob the page passes to
    ``$.parseJSON`` under ``memberInfo``.  Returns ``None`` when that blob
    is absent or lacks the name/photo/sex keys (or ``user_id`` is missing
    from ``response.meta``).  The free-text description is optional: it is
    skipped when its paragraph is missing or holds the site's placeholder.

    Raises ``KeyError`` / ``ValueError`` when one of the remaining JSON
    keys is absent or ``age`` is not numeric.
    """
    # Decode once; both regexes run on the same text.
    page = response.body.decode('gbk')

    member_match = re.search(
        r"memberInfo : \$.parseJSON\('(.*?)'\),", page, re.S)
    if member_match is None:
        # No embedded member JSON: not a profile page.
        return
    ans = json.loads(member_match.group(1))

    item = PoiItem()
    try:
        item['name'] = ans["fullName"]
        item['avatar'] = ans["photo"]
        item['gender'] = 'M' if ans["sex"] == 0 else 'F'
        # If name is available, then this item is valid.
        item['site_id'] = 9  # Site id of zhenai.
        item['user_id'] = str(response.meta['user_id'])
    except Exception:
        return

    marital_status = {
        u'未婚': u'S',
        u'离异': u'D',
        u'丧偶': u'W',
    }
    if ans["marriage"] in marital_status:
        item['marital_status'] = marital_status[ans["marriage"]]

    education_level = {
        u'高中及以下': u'1',
        u'大学本科': u'4',
        u'硕士': u'5',
        u'中专': u'2',
        u'大专': u'3',
        u'博士': u'6',
        u'其他': u'7',
    }
    if ans["education"] in education_level:
        item['education_level'] = education_level[ans["education"]]

    item['height'] = ans["height"]
    item['occupation'] = ans["occupation"]
    item['salary'] = ans["salary"]
    item['location'] = ans["workCity"]
    # Birth year is derived from the age; month/day stay "00".
    item['birthday'] = str(datetime.date.today().year -
                           int(ans["age"])) + '-00-00'

    description_match = re.search(
        r'<p class="fs14 lh20 c5e slider-area-js">(.*?)'
        r'<span class="info-mark"></span></p>', page, re.S)
    if description_match is not None:
        description = description_match.group(1)
        # Skip the site's default "shy user" placeholder text.
        if u"Ta有点害羞,需要你的鼓励" not in description:
            # Remove all whitespace, including line breaks.
            item['description'] = "".join(description.split())
    return item
def parse_item(self, response):
    """Build a ``PoiItem`` (site_id 21) from a user profile page.

    The heading text is trimmed by five characters plus the length of the
    user id to obtain the name; a missing heading or ``user_id`` raises.
    All remaining lookups are guarded: any failure there makes the method
    return ``None``.
    """
    marital_codes = {
        u'未婚': u'S',
        u'离异': u'D',
        u'丧偶': u'W',
    }
    education_codes = {
        u'高中': u'1',
        u'本科': u'4',
        u'硕士': u'5',
        u'中专': u'2',
        u'大专': u'3',
        u'博士': u'6',
        u'其他': u'7',
    }

    item = PoiItem()

    heading = response.xpath(
        "//div[@class='pp_s']/p[1]/strong[1]/text()").extract()[0]
    id_length = len(str(response.meta['user_id']))
    name = heading[0:-5 - id_length]

    # Each <li> text starts with a short label that the slices below drop.
    rows = response.xpath("//div[@class='pp_x']/ul[1]/li/text()").extract()

    try:
        item['name'] = name
        avatar_path = response.xpath(
            "//div[@class='happ_img']/img/@src").extract()[0]
        item['avatar'] = 'http://www.51findlove.cn/' + avatar_path
        # If name is available, then this item is valid.
        item['site_id'] = 21
        item['user_id'] = str(response.meta['user_id'])

        marriage = rows[4][5:]
        if marriage in marital_codes:
            item['marital_status'] = marital_codes[marriage]

        schooling = rows[2][3:]
        if schooling in education_codes:
            item['education_level'] = education_codes[schooling]

        item['height'] = rows[1][3:]
        item['salary'] = rows[5][3:]
        item['location'] = rows[6][3:]

        # Current year minus the number in the first row; month/day "00".
        years = int(rows[0][3:-1])
        item['birthday'] = str(datetime.date.today().year - years) + '-00-00'
    except:
        return
    return item
def parse_item(self, response):
    """Build a ``PoiItem`` (site_id 23) from a user profile page.

    Two text lists are read: the paragraphs of ``div#jibenxinxi`` (each
    prefixed by a three-character label) and the second column of the first
    nested table.  Any failure while filling the item makes the method
    return ``None``.
    """
    marital_codes = {
        u'未婚': u'S',
        u'离异': u'D',
        u'丧偶': u'W',
    }
    education_codes = {
        u'高中': u'1',
        u'大学本科': u'4',
        u'硕士': u'5',
        u'中专': u'2',
        u'大专': u'3',
        u'博士': u'6',
        u'其他': u'7',
    }

    item = PoiItem()
    basics = response.xpath("//div[@id='jibenxinxi']/p/text()").extract()
    details = response.xpath(
        "//table[1]/tr[1]/td[1]/table[1]/tr/td[2]/text()").extract()

    try:
        item['name'] = basics[0][3:]
        item['avatar'] = response.xpath(
            "//div[@id='bigface']/div/img/@src").extract()[0]
        if basics[3][3:] == u'男':
            item['gender'] = 'M'
        else:
            item['gender'] = 'F'
        # If name is available, then this item is valid.
        item['site_id'] = 23
        item['user_id'] = str(response.meta['user_id'])

        item['occupation'] = basics[6][3:]

        marriage = details[0]
        if marriage in marital_codes:
            item['marital_status'] = marital_codes[marriage]

        schooling = details[3]
        if schooling in education_codes:
            item['education_level'] = education_codes[schooling]

        item['height'] = details[1]
        item['salary'] = details[6]
        item['location'] = basics[5][3:]

        # Current year minus the number in the fifth paragraph (label and
        # trailing character removed); month/day stay "00".
        years = int(basics[4][3:-1])
        item['birthday'] = str(datetime.date.today().year - years) + '-00-00'
    except:
        return
    return item
def parse_item(self, response):
    """Yield one ``PoiItem`` scraped from a POI detail page.

    Each field receives the full list returned by ``extract()`` for its
    absolute XPath (possibly empty); no element is unwrapped.
    """
    detail_list = '/html/body/div[2]/div[1]/div[2]/ul'
    field_paths = (
        ('name', '/html/body/div[2]/div[1]/div[1]/h1/text()'),
        ('address', detail_list + '/li[4]/text()'),
        ('category', detail_list + '/li[6]/text()'),
        ('wgs_84', detail_list + '/li[7]/text()'),
        ('gcj_02', detail_list + '/li[8]/text()'),
        ('bd_09', detail_list + '/li[9]/text()'),
    )

    item = PoiItem()
    for field, path in field_paths:
        item[field] = response.xpath(path).extract()
    yield item
def parse_item(self, response): item = PoiItem() root = response.xpath('//html') try: name = response.css('div#member_messages h3 a::text').extract()[0] if name == u'编辑' or name == '': return item['name'] = name # If name is available, then this item is valid. item['site_id'] = 13 # Site id of tianya. item['user_id'] = str(response.meta['user_id']) except: return desc = response.css( 'div#member_messages h3 span::text').extract()[0].strip() if desc != '': item['description'] = desc avatar = response.css('div.face a img::attr(src)').extract()[0] if avatar[0:4] == 'http': item['avatar'] = avatar else: item['avatar'] = 'http://www.easydong.com' + avatar r = root.re(ur'<li><span>性别:</span>(.*?)</li>') if r and r[0].strip() != '': item['gender'] = 'M' if u'男' in r[0] else 'F' r = root.re(ur'<li><span>生日:</span>(.*?)</li>') if r and r[0].strip() != '': birthday = r[0].strip() if len(birthday) < 10: item['birthday'] = u'0000-' + birthday else: item['birthday'] = birthday r = root.re(ur'<li><span>易动积分:</span>(\d+)</li>') if r and r[0].strip() != '': item['score'] = int(r[0].strip()) r = root.re(ur'<li><span>运动主场:</span>(.*?)</li>') if r and r[0].strip() != '': item['location'] = r[0].strip() r = root.re(ur'<li><span>体育爱好:</span>(.*?)</li>') if r and r[0].strip() != '': item['favorites'] = r[0].strip() return item
def parse_item(self, response):
    """Build a ``PoiItem`` (site_id 19) from a user profile page.

    All values come from the ``<dd>`` text nodes of ``div#baseInfo``,
    addressed by position.  Nothing is guarded: a missing node, a
    non-numeric value in the third entry or a missing ``user_id`` in
    ``response.meta`` raises.
    """
    marital_codes = {
        u'未婚': u'S',
        u'离异': u'D',
        u'丧偶': u'W',
    }
    education_codes = {
        u'高中': u'1',
        u'大学本科': u'4',
        u'硕士': u'5',
        u'中专': u'2',
        u'大专': u'3',
        u'博士': u'6',
        u'其他': u'7',
    }

    item = PoiItem()
    display_name = response.xpath(
        "//div[@id='baseInfo']/dl/dd[2]/a/text()").extract()[0]
    fields = response.xpath("//div[@id='baseInfo']/dl/dd/text()").extract()

    item['name'] = display_name
    item['avatar'] = response.xpath(
        "//a[@class='img']/img/@src").extract()[0]
    if fields[0] == u'男':
        item['gender'] = 'M'
    else:
        item['gender'] = 'F'
    # If name is available, then this item is valid.
    item['site_id'] = 19
    item['user_id'] = str(response.meta['user_id'])

    item['occupation'] = fields[6]

    marriage = fields[4]
    if marriage in marital_codes:
        item['marital_status'] = marital_codes[marriage]

    schooling = fields[5]
    if schooling in education_codes:
        item['education_level'] = education_codes[schooling]

    item['height'] = fields[1]
    item['salary'] = fields[7]
    # The last two characters of the location entry are dropped.
    item['location'] = fields[8][0:-2]

    # Current year minus the third entry; month/day stay "00".
    this_year = datetime.date.today().year
    item['birthday'] = str(this_year - int(fields[2])) + '-00-00'
    return item
def parse_item(self, response):
    """Build a ``PoiItem`` (site_id 10) from a user profile page.

    Returns ``None`` when the name or ``response.meta['user_id']`` cannot
    be read.  Everything after that is mandatory: a missing node, an
    unknown education label or a summary line with fewer than four
    ``/``-separated parts raises.
    """
    item = PoiItem()
    try:
        item['name'] = response.css('p.nametit b::text').extract()[0]
        # If name is available, then this item is valid.
        item['site_id'] = 10
        item['user_id'] = str(response.meta['user_id'])
    except:
        return

    item['avatar'] = response.css('img#changesize::attr(src)').extract()[0]

    # The span next to the name carries both the gender and a number that
    # is subtracted from the current year.
    headline = response.css('p.nametit span::text').extract()[0]
    if u'男' in headline:
        item['gender'] = 'M'
    else:
        item['gender'] = 'F'
    this_year = datetime.date.today().year
    first_number = re.findall(r'(\d+)', headline)[0]
    item['birthday'] = str(this_year - int(first_number)) + '-00-00'

    # Summary line: location / height / education / occupation.
    summary = response.css('div.font12::text').extract()[0]
    parts = summary.replace(' ', '').split('/')
    item['location'] = parts[0].strip()
    item['height'] = parts[1].strip()

    education_codes = {
        u'专科以下': u'1',
        u'专科': u'3',
        u'本科': u'4',
        u'硕士': u'5',
        u'博士': u'6',
        u'博士后': u'6',
    }
    item['education_level'] = education_codes[parts[2].strip()]
    item['occupation'] = parts[3].strip().replace(u'职业:', '')

    # The two literals below are the page's placeholder texts.
    desc = response.css('p#updateDesc span::text').extract()[0]
    if desc not in (u'评论一下嘛…', u'在照片上写点什么'):
        item['description'] = desc

    # An <img> inside the name element selects the first level label,
    # its absence the second.
    has_badge = response.css('p.nametit b img').extract()
    item['level'] = '认证' if has_badge else '未认证'
    return item
def parse_item(self, response): item = PoiItem() try: item['name'] = response.css('a.founder::text').extract()[0] item['avatar'] = response.css('div.photo img::attr(src)').extract()[0] item['site_id'] = 8 item['user_id'] = str(response.meta['user_id']) except: return r = response.css('div.info').re(ur'常居地:</span>(.*?)<') if r: item['location'] = r[0].strip() r = response.css('div.intro::text').extract() if r: item['description'] = r[0].strip() return item
def parse_item(self, response):
    """Build a ``PoiItem`` (site_id 25) from a JSON user payload.

    The response body must be JSON with a ``data`` object.  Returns
    ``None`` when ``data['uid']`` is falsy or when nick, avatar or
    ``response.meta['user_id']`` cannot be read.  The birthday is
    best-effort (skipped when the age is not numeric) and the education
    level is only stored for labels present in the mapping table.

    Raises ``KeyError`` when one of the other expected keys is absent.
    """
    j = json.loads(response.body)['data']
    if not j['uid']:
        return

    item = PoiItem()
    try:
        item['name'] = j['nick']
        item['avatar'] = j['avatar_150']
        # If name is available, then this item is valid.
        item['site_id'] = 25
        item['user_id'] = str(response.meta['user_id'])
    except Exception:
        return

    # The last character of the age string is dropped before conversion
    # (presumably a unit suffix); month/day stay "00".
    age = j['age'][0:-1]
    try:
        item['birthday'] = str(datetime.date.today().year -
                               int(age)) + '-00-00'
    except (TypeError, ValueError):
        pass

    item['height'] = j['height']
    item['description'] = j['intro']
    item['location'] = j['location']
    item['salary'] = j['salary']
    item['gender'] = 'M' if j['sex'] == '1' else 'F'

    education_level = {
        u'初中': u'1',
        u'中专/职高/技校': u'2',
        u'高中/中专': u'2',
        u'大专以下': u'2',
        u'大专': u'3',
        u'本科': u'4',
        u'硕士': u'5',
        u'博士': u'6',
        u'博士后': u'6',
    }
    # An unknown or empty label leaves the field unset instead of raising
    # KeyError and losing the whole item.
    education = j['education'].strip()
    if education in education_level:
        item['education_level'] = education_level[education]
    return item
def parse(self, response):
    """Scrape one listing page and schedule the next page, city or category.

    Yields one fresh ``PoiItem`` per table row, then a ``Request`` for:

    * the next page of the same category/city (offset ``page_cur * 30``)
      while ``page_cur < page_num``; or
    * the first page of the next city, moving on to the next category once
      every city has been visited.

    The generator ends without scheduling anything once every category has
    been processed.  ``self.url``, ``self.cityindex`` and ``self.cateIndex``
    are updated as the crawl advances.

    Raises ``IndexError`` / ``ValueError`` when the pagination widget or a
    row's name/province/city cell is missing.
    """
    # Current page number and total number of pages.
    pager = '//ul[@class="pagination pagination-sm mar-t5"]'
    page_cur = int(
        response.xpath(pager + '/li[@class="active"]/a/text()').extract()[0])
    page_num = int(
        response.xpath(pager + '/li[last()]/a/text()').extract()[0])

    zh = resolve(self.url)
    category_cur = zh[0]
    city_cur = zh[1]

    def cell(tr, index, default=''):
        # Text of the index-th <td>, or `default` when the cell is empty.
        texts = tr.xpath('./td[%d]/text()' % index).extract()
        return texts[0] if texts else default

    def unescaped_cell(tr, index):
        # Like cell(), with HTML entities decoded.
        text = cell(tr, index, None)
        return '' if text is None else html.unescape(text)

    # Rows are scraped on every page, including the last one of a listing
    # (previously that page was only used to trigger the city switch).
    for tr in response.xpath('//tbody/tr'):
        # A new item per row: re-yielding one shared instance lets later
        # rows overwrite items that are still in the pipeline.
        item = PoiItem()
        item['name'] = html.unescape(tr.xpath('./td[2]/text()').extract()[0])
        item['province'] = html.unescape(
            tr.xpath('./td[3]/text()').extract()[0])
        item['city'] = html.unescape(tr.xpath('./td[4]/text()').extract()[0])
        item['district'] = unescaped_cell(tr, 5)
        item['code'] = cell(tr, 6)
        item['phone_no'] = cell(tr, 7)
        item['region'] = unescaped_cell(tr, 8)
        item['location'] = unescaped_cell(tr, 9)
        item['category'] = cell(tr, 10)
        item['sub_category'] = cell(tr, 11)
        item['longitude'] = cell(tr, 12, 0)
        item['latitude'] = cell(tr, 13, 0)
        yield item

    if page_cur < page_num:
        # Next page of the same listing.
        url = (self.baseurl + category_cur + '-' + city_cur + '/' +
               str(page_cur * 30))
    else:
        # Next city.
        self.cityindex += 1
        if self.cityindex == len(self.citymap):
            # Next category.
            self.cateIndex += 1
            if self.cateIndex >= len(self.category):
                # Nothing left: end the generator instead of calling
                # exit() from inside a callback.
                return
            category_cur = self.categorymap[self.cateIndex]
            self.cityindex = 0
        city_cur = self.citymap[self.cityindex]
        url = self.baseurl + category_cur + '-' + city_cur + '/'

    self.url = url
    yield Request(url, callback=self.parse, dont_filter=True)
def parse_item(self, response): item = PoiItem() root = response.xpath('//html') try: item['name'] = response.css( 'strong#userNameStrong::text').extract()[0] # If name is available, then this item is valid. item['site_id'] = 12 # Site id of tianya. item['user_id'] = str(response.meta['user_id']) except: return r = root.re(ur'"defaultUrl":"(.*?)"') if r: item['avatar'] = r[0] else: item['avatar'] = 'http://profile.baihe.com/new/' + response.css( 'div#simplePhotoDiv img::attr(src)').extract()[0] gender = root.re(ur'var gender_topSendMsg_name_TA = \'(.*?)\';')[0] item['gender'] = 'M' if u'他' in gender else 'F' birthday = root.re(ur'var oppAge = (\d+);')[0] item['birthday'] = str(datetime.date.today().year - int(birthday)) + '-00-00' item['height'] = root.re(ur'<strong>身高:</strong><p>(.*?)</p>')[0] marital_status = {u'已婚': u'M', u'未婚': u'S', u'离异': u'D', u'丧偶': u'W'} r = root.re(ur'<strong>婚姻状况:</strong><p>(.*?)</p>') if r: item['marital_status'] = marital_status[r[0].strip()] education_level = { u'初中': u'1', u'中专/职高/技校': u'3', u'高中': u'1', u'大专': u'2', u'本科': u'4', u'硕士': u'5', u'博士': u'6', u'博士后': u'6', } r = root.re(ur'<strong>学历:</strong><p>(.*?)</p>') if r and r[0].strip() != u'以后告诉你': item['education_level'] = education_level[r[0].strip()] r = root.re(ur'<strong>职业:</strong><p>(.*?)</p>') if r and r[0].strip() != u'以后告诉你': item['occupation'] = r[0].strip() r = root.re(ur'<strong>月薪:</strong><p>(.*?)</p>') if r and r[0].strip() != u'以后告诉你': item['salary'] = r[0].strip() r = root.re(ur'<strong>来自:</strong><p>(.*?)</p>') if r and r[0].strip() != u'以后告诉你': item['location'] = r[0].strip() r = response.css('td#_item_want_know_1::text').extract() if r and r[0].strip() != u'以后告诉你': item['hometown'] = r[0].strip() item['experience'] = '' r = response.css('td#_item_want_know_2::text').extract() if r and r[0].strip() != u'以后告诉你': item['experience'] += u'毕业学校:' + r[0].strip() r = response.css('td#_item_want_know_10::text').extract() if r and r[0].strip() != u'以后告诉你': 
item['experience'] += u' 公司行业:' + r[0].strip() r = response.css('td#_item_want_know_3::text').extract() if r and r[0].strip() != u'以后告诉你': item['body_size'] = r[0].strip() r = response.css('td#_item_want_know_5::text').extract() if r and r[0].strip() != u'以后告诉你': item['weight'] = r[0].strip() r = response.css('td#_item_want_know_9::text').extract() if r and r[0].strip() != u'以后告诉你': item['looks'] = r[0].strip() bloodType = { u'O型': u'1', u'A型': u'2', u'B型': u'3', u'AB型': u'4', u'其他型': u'5' } r = response.css('td#_item_want_know_7::text').extract() if r and r[0].strip() != u'以后告诉你': item['blood_type'] = bloodType[r[0].strip()] item['description'] = response.css( 'div.pro_details pre::text').extract()[0] return item
def parse_item(self, response): item = PoiItem() root = response.xpath('//html') try: item['name'] = response.css('dd.nickname a::text').extract()[0] # If name is available, then this item is valid. item['site_id'] = 11 # Site id of tianya. item['user_id'] = str(response.meta['user_id']) except: return item['avatar'] = response.css('a.img img::attr(src)').extract()[0] item['gender'] = 'M' if u'男' in response.css( 'dd.f::text').extract()[0] else 'F' birthday = root.re(ur'出生年月.*?(\d+)')[0] item['birthday'] = birthday[:4] + '-' + birthday[4:] + '-00' item['height'] = root.re(ur'(\d+)</dd><dt>年龄:')[0] marital_status = {u'未婚': u'S', u'非单身': u'L', u'离异': u'D', u'丧偶': u'W'} r = root.re(ur'婚姻状况:</dt><dd class="f">(.*?)</dd><dt>') if r: item['marital_status'] = marital_status[r[0].strip()] education_level = { u'初中': u'1', u'高中': u'1', u'本科': u'4', u'硕士': u'5', u'中专': u'2', u'专科': u'3', u'博士': u'6', } r = root.re(ur'最高学历:</dt><dd>(.*?)</dd><dt>') if r: item['education_level'] = education_level[r[0].strip()] r = root.re(ur'从事职业:</dt><dd class="f">(.*?)</dd><dt>') if r: item['occupation'] = r[0].strip() r = root.re(ur'年收入:</dt><dd>(.*?)</dd><dt>') if r: item['salary'] = r[0].strip() r = root.re(ur'现居住地:</dt><dd class="f">(.*?)</dd><dt>') if r: item['location'] = re.sub(r'<(.*?)>', '', r[0]).strip() r = root.re(ur'最后在线时间:(\d+-\d+-\d+ \d+:\d+:\d+)') if r: item['last_login_time'] = r[0].strip() r = root.re(ur'自我介绍:</b>(.*?)</li>') if r: item['description'] = r[0].strip() r = root.re(ur'<li><b>我的个性:</b>(.*?)</li>') if r: item['personality'] = r[0].strip() r = root.re(ur'<li><b>兴趣爱好:</b>(.*?)</li>') if r: item['favorites'] = r[0].strip() r = root.re(ur'<li><b>我的外貌:</b>(.*?)</li>') if r: item['looks'] = r[0].strip() r = root.re(ur'诚信值:(\d+)') if r: item['score'] = int(r[0].strip()) r = root.re(ur'(\d+)</span></dd><dt>性别:') if r: item['level'] = r[0].strip() return item
def parse_item(self, response): item = PoiItem() try: item['name'] = response.css('span.name::text').extract()[0] item['avatar'] = response.css('img#show_pic_1::attr(src)').extract()[0] item['site_id'] = 27 # Site id of zhenai., item['user_id'] = str(response.meta['user_id']) except: return info = response.css('dd.userinfo-m').extract()[0] item['gender'] = 'M' if re.findall(ur'\((\d+),sex\)', info)[0]=='0' else 'F' birthday = re.findall(ur'(\d+)岁', info)[0] item['birthday'] = str(datetime.date.today().year - int(birthday)) + '-00-00' try: item['location'] = self.getAttr(re.findall(r'userdetail.*?\((.*?)\)', info)[0]) + ' ' + self.getAttr(re.findall(r'userdetail.*?\((.*?)\)', info)[1]) except: pass info = response.css('dl.infoList dd') try: r = self.getAttr(re.findall(r'userdetail.*?\((.*?)\)', info[0].extract())[0]) if r and r!=u'不限' and r!=u'请选择': info['height'] = r+'cm' except: pass try: r = self.getAttr(re.findall(r'userdetail.*?\((.*?)\)', info[2].extract())[0]) if r and r!=u'不限' and r!=u'请选择': info['occupation'] = r except: pass try: r = self.getAttr(re.findall(r'userdetail.*?\((.*?)\)', info[7].extract())[0]) if r and r!=u'不限' and r!=u'请选择': info['weight'] = r except: pass marital_status = { u'已婚':u'M', u'未婚':u'S', u'离异':u'D', u'丧偶':u'W' } try: r = self.getAttr(re.findall(r'userdetail.*?\((.*?)\)', info[8].extract())[0]) if r.strip()!=u'不限' and r!=u'请选择': item['marital_status'] = marital_status[r.strip()] except: pass try: r = self.getAttr(re.findall(r'userdetail.*?\((.*?)\)', info[9].extract())[0]) if r and r!=u'不限' and r!=u'请选择': info['salary1'] = r except: pass #["-1,不限","3,高中及以下","4,大专","5,大学本科","6,硕士","7,博士"], education_level = { u'高中及以下':u'1', u'大专':u'2', u'大学本科':u'4', u'硕士':u'5', u'博士':u'6', } try: r = self.getAttr(re.findall(r'userdetail.*?\((.*?)\)', info[11].extract())[0]) if r.strip()!=u'不限' and r!=u'请选择': item['education_level'] = education_level[r.strip()] except: pass try: item['description'] = response.css('div.InfoData 
div.Data::text').extract()[0].strip() except: pass return item
def parse_item(self, response): item = PoiItem() html = response.body.decode('gbk') try: item['name'] = re.findall(ur'我是<em>(.*?)<\/em>', html)[0] item['avatar'] = 'http://www.hongniang.com' + response.css( 'a.da-pic img::attr(src)').extract()[0] # If name is available, then this item is valid. item['site_id'] = 22 # Site id of tianya. item['user_id'] = str(response.meta['user_id']) except: return item['gender'] = 'M' if u'男' in html else 'F' try: item['hometown'] = re.findall(ur'>(.*?)</a></strong>人', html)[0] except: pass try: age = re.findall(ur'今年<em>(\d+)岁</em>', html)[0] item['birthday'] = str(datetime.date.today().year - int(age)) + '-00-00' except: pass try: item['height'] = re.findall( ur'身高<strong style=" color: #FB8B38">(.*?)</strong>', html)[0] except: pass try: item['location'] = re.findall( ur'工作在<strong ><a target="_blank" style=" color:#5E83EF; font-weight:bold;">(.*?)<', html)[0] except: pass try: item['salary'] = re.findall(ur'年入<em>(.*?)</em>', html)[0] except: pass try: marital_status = { u'已婚': u'M', u'未婚': u'S', u'离异': u'D', u'离异单身': u'D', u'离异带孩': u'D', u'丧偶': u'W', u'丧偶单身': u'W', u'丧偶带孩': u'W' } r = re.findall(ur'color:#7AAE1F">(.*?)</strong>', html) if r: item['marital_status'] = marital_status[r[0].strip()] except: pass try: education_level = { u'初中': u'1', u'中专/职高/技校': u'2', u'高中/中专': u'2', u'大专以下': u'2', u'大专': u'3', u'本科': u'4', u'硕士': u'5', u'博士': u'6', u'博士后': u'6', } r = re.findall(ur'</strong>,<em>(.*?)学历</em>', html) if r: item['education_level'] = education_level[r[0].strip()] except: pass return item
def parse_item(self, response):
    """Build a ``PoiItem`` (site_id 5) and request the user's AJAX counters.

    Returns ``None`` when the name, the avatar or
    ``response.meta['user_id']`` cannot be read.  Otherwise returns a
    ``FormRequest`` to ``http://hi.mop.com/ajax/get`` that carries the
    partially filled item in ``meta['item']`` and is handled by
    ``self.parse_item_get_ajax``.

    The info list is addressed by position; a page with fewer entries, or
    without the login count, score, level or registration time, raises
    ``IndexError``.
    """
    item = PoiItem()
    rows = response.css('ul.hpUserInfoUl li')

    def row_texts(index):
        # Text nodes of the value <div> in the index-th info row.
        return rows[index].css('div.oh::text').extract()

    try:
        item['name'] = row_texts(0)[0]
        item['avatar'] = response.css('img.br4::attr(src)').extract()[0]
        # If name is available, then this item is valid.
        item['site_id'] = 5
        item['user_id'] = str(response.meta['user_id'])
    except:
        return

    texts = row_texts(1)
    if texts:
        item['location'] = texts[0]

    texts = row_texts(2)
    if texts:
        item['gender'] = 'M' if u'男' in texts[0] else 'F'

    # u'无' is the literal shown when a value is not set.
    texts = row_texts(3)
    if texts and texts[0] != u'无':
        # Turn the "Y 年 M 月 D 日" form into "Y-M-D".
        birthday = texts[0].replace(u' 年 ', '-')
        birthday = birthday.replace(u' 月 ', '-')
        item['birthday'] = birthday.replace(u' 日', '')

    item['login_num'] = row_texts(4)[0]

    texts = row_texts(5)
    if texts and texts[0] != u'无':
        item['last_login_time'] = texts[0]

    texts = row_texts(6)
    if texts:
        item['description'] = texts[0]

    counters = response.css('div.user-sns-count li')
    item['score'] = int(counters[3].css('a.num::attr(title)').extract()[0])
    level_number = counters[2].css('a.num::text').extract()[0]
    level_title = response.css('div.levelBox div::text').extract()[-1]
    item['level'] = level_number + u'级 - ' + level_title

    item['reg_time'] = response.css(
        'div.hpUserInfo2 span.c999::text').extract()[0]

    payload = {
        'header': {},
        'req': {
            'User/SubCount': {
                'uid': item['user_id']
            },
            'User/SnsCount': {
                'uid': item['user_id']
            }
        }
    }
    return FormRequest(
        'http://hi.mop.com/ajax/get',
        headers={'X-Requested-With': 'XMLHttpRequest'},
        formdata={
            'data': json.dumps(payload),
            # Current time in milliseconds.
            'date': str(int(time.time() * 1000))
        },
        callback=self.parse_item_get_ajax,
        meta={'item': item},
        priority=20,
    )
def parse_profile(self, response): """Parse a user's profile page. @url http://lvxiaobin99.blog.163.com/profile """ item = PoiItem() root = response.xpath('//html') username = response.url.split('/')[2].split('.')[0] self.log('Profile=> ' + username, level=scrapy.log.DEBUG) # ID attributes. item['site_id'] = 2 item['user_id'] = username # Personal attributes. r = root.re(r"nickName:'(.*?)'") if r: item['name'] = r[0] item[ 'avatar'] = 'http://os.blog.163.com/common/ava.s?host=' + username + '&b=1' r = root.re(ur'介绍:</td>[\s\S]*?>([\s\S]*?)<') if r: item['description'] = r[0].strip() r = root.re(r'marital=(\w)') if r: item['marital_status'] = r[0] r = root.re(r'education=(\d)') if r: item['education_level'] = r[0] r1 = root.re(r'industry=.*?>(.*?)<') r2 = root.re(r'occupation=.*?>(.*?)<') if r1 or r2: item['occupation'] = r1[0] + ' ' + r2[0] r = root.re(r'salary=.*?>(.*?)<') if r: item['salary'] = r[0] r = root.re(r'skills=([%\w]+)') if r: item['speciality'] = urllib.unquote(r[0].encode('utf8')) r = root.re(r'characteristics=[^"]+') if r: item['personality'] = urllib.unquote(' '.join( re.findall('%[%\w]+', ' '.join(r))).encode('utf8')) r = root.re(r'favorite\w+=[^"]+') if r: item['favorites'] = urllib.unquote(' '.join( re.findall('%[%\w]+', ' '.join(r))).encode('utf8')) r = ' '.join(root.xpath('//div[@class="biograph"]//text()').extract()) if r: item['experience'] = re.sub(r'\s\s+', ' ', r).strip() # Body attributes. r = root.re(r'gender=(\w)') if r: item['gender'] = r[0] r = root.re(r'weight=.*?>(.*?)<') if r: item['weight'] = r[0] r = root.re(r'height=.*?>(.*?)<') if r: item['height'] = r[0] r = root.re(r'bodyShape=.*?>(.*?)<') if r: item['body_size'] = r[0] r = root.re(r'appearance=.*?>(.*?)<') if r: item['looks'] = r[0] r = root.re(r'bloodType=(\d)') if r: item['blood_type'] = r[0] # Contact attributes. 
r = root.re(ur'E-Mail :</td>[\s\S]*?>([\s\S]*?)<') if r: item['email'] = r[0].strip() r = root.re(ur'QQ:</td>[\s\S]*?>([\s\S]*?)<') if r: item['qq'] = r[0].strip() r = root.re(ur'移动电话:</td>[\s\S]*?>([\s\S]*?)<') if r: item['cellphone'] = r[0].strip() r = root.re(ur'固定电话:</td>[\s\S]*?>([\s\S]*?)<') if r: item['telephone'] = r[0].strip() # Time attributes. item['reg_time'] = timestamp2datetime(root.re(r'creatTime:(\d+)')[0]) item['last_update_time'] = timestamp2datetime( root.re(r'updateTime:(\d+)')[0]) item['last_login_time'] = timestamp2datetime( root.re(r'lastLoginTime:(\d+)')[0]) r = root.re(r'birthDate=([^"]+)') if r: item['birthday'] = r[0] # Location attributes. r = root.re(r'type=1[^"]+') if r: item['location'] = urllib.unquote(' '.join( re.findall('%[%\w]+', r[-1])).encode('utf8')) r = root.re(r'type=4[^"]+') if r: item['hometown'] = urllib.unquote(' '.join( re.findall('%[%\w]+', r[-1])).encode('utf8')) userid = root.re(r'userId:(\d+)')[0] yield self.level_request(userid, item)
def parse_item(self, response):
    """Parse one profile page and chain to the next user id.

    ``self.uid`` is incremented on every call, and ``self.validateuser`` is
    invoked when the body contains the text ``继续浏览`` ("continue
    browsing").  If the name, the avatar or ``response.meta['user_id']``
    cannot be read, only the request for the next profile is yielded;
    otherwise the filled ``PoiItem`` (site_id 7) is yielded first and the
    next-profile request second.
    """
    self.uid += 1
    if '继续浏览' in response.body:
        self.validateuser(self.uid)

    def next_profile_request():
        # Built at yield time so it always uses the current self.uid.
        return Request(
            'http://www.renren.com/' + str(self.uid) +
            '/profile?v=info_timeline',
            headers={
                'Cookie': self.cookies,
                'Pragma': 'no-cache',
                'Referer': 'http://www.renren.com/343633795/profile',
                'User-Agent':
                'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.94 Safari/537.36'
            },
            callback=self.parse_item,
            meta={'user_id': self.uid})

    item = PoiItem()
    try:
        item['name'] = response.css('h1.avatar_title::text').extract()[0]
        item['avatar'] = response.css(
            'img#userpic::attr(src)').extract()[0]
        item['site_id'] = 7  # Site id
        item['user_id'] = str(response.meta['user_id'])
    except:
        # Unusable page: skip the item but keep the chain going.
        yield next_profile_request()
        return

    # Experience concatenates the work and school entries.
    item['experience'] = ''
    for selector in ('li.work span::text', 'li.school span::text'):
        texts = response.css(selector).extract()
        if texts:
            item['experience'] += texts[0]

    # The gender is read from the birthday entry's text.
    texts = response.css('li.birthday span::text').extract()
    if texts:
        item['gender'] = 'M' if u'男' in texts[0] else 'F'

    texts = response.css('li.hometown::text').extract()
    if texts:
        item['hometown'] = texts[0]

    texts = response.css('li.address::text').extract()
    if texts:
        item['location'] = texts[0]

    yield item
    yield next_profile_request()
def parse_item(self, response): item = PoiItem() root = response.xpath('//html') try: name = response.css('div.card-photo a::attr(title)').extract()[0] if name.strip() == '': return item['name'] = name # If name is available, then this item is valid. item['site_id'] = 14 # Site id of tianya. item['user_id'] = str(response.meta['user_id']) except: return r = response.css('div.card-photo img::attr(src)').extract() if r: item['avatar'] = r[0] experience = '' r = response.css('ul.card-info a.clr-333::attr(title)').extract() if r: experience += ' '.join(r) r = root.re(ur'<li><span class="clr-999">工作经验:</span>(.*?)</li>') if r: experience += r[0] item['experience'] = experience r = response.css('ul.card-info li')[-2].css('a::text').extract() if r: item['followers'] = int(r[0].strip()) item['following'] = int(r[1].strip()) r = root.re(ur'<li><span class="clr-999">性 别:</span>(.*?)</li>') if r and r[0].strip() != '': item['gender'] = 'M' if u'男' in r[0].strip() else 'F' r = root.re(ur'<li><span class="clr-999">年 龄:</span>([\s\S]*?)</li>') if r and r[0].strip() != '': r = re.findall(r'\((.*?)\)', r[0].strip()) if r: birthday = r[0] if len(birthday) < 10: item['birthday'] = u'0000-'+birthday else: item['birthday'] = birthday r = root.re(ur'<li><span class="clr-999">身 高:</span>[\s]*(.+)') if r and r[0].strip() != '': item['height'] = r[0].strip() r = root.re(ur'<li><span class="clr-999">居 住 地 :</span>(.*?)</li>') if r and r[0].strip() != '': item['location'] = r[0].strip() r = root.re(ur'<li><span class="clr-999">户 籍:</span>(.*?)</li>') if r and r[0].strip() != '': item['hometown'] = r[0].strip() r = root.re(ur'<li><span>体育爱好:</span>(.*?)</li>') if r and r[0].strip() != '': item['favorites'] = r[0].strip() marital_status = { u'已婚':u'M', u'未婚':u'S' } r = root.re(ur'<li><span class="clr-999">婚姻状况:</span>(.*?)</li>') if r and r[0].strip()!=u'保密': item['marital_status'] = marital_status[r[0].strip()] education_level = { u'初中':u'1', u'高中':u'1', u'中技':u'2', u'中专':u'2', u'大专':u'3', 
u'本科':u'4', u'硕士':u'5', u'MBA':u'5', u'博士':u'6', } r = root.re(ur'<li><span class="clr-999">最高学历:</span>(.*?)</li>') if r: item['education_level'] = education_level[r[0].strip()] r = response.css('p.clr-333::text').extract() if r: item['description'] = r[0] return item
def parse_item(self, response):
    """Build a PoiItem from a profile page (site id 28).

    Two ``dt``/``dd`` tables are read: each ``dt`` text is a label and the
    ``dd`` text at the same position is its value.  Values equal to
    u'保密' or u'未填' are ignored, and coded fields are only set for
    values present in the mapping tables.  Any exception raised while
    parsing makes the method return ``None``.
    """
    hidden = (u'保密', u'未填')
    blood_codes = {
        u'O型': u'1',
        u'A型': u'2',
        u'B型': u'3',
        u'AB型': u'4',
        u'其他型': u'5',
    }
    education_codes = {
        u'初中': u'1',
        u'中专/职高/技校': u'3',
        u'高中': u'1',
        u'大专': u'2',
        u'本科': u'4',
        u'硕士': u'5',
        u'博士': u'6',
        u'博士后': u'6',
    }
    marital_codes = {
        u'已婚': u'M',
        u'恋爱中': u'S',
        u'未婚': u'S',
        u'离异': u'D',
        u'丧偶': u'W',
    }

    item = PoiItem()
    try:
        item['name'] = response.xpath(
            "//a[@class='sexico1']/text()").extract()[0]
        item['avatar'] = response.xpath(
            "//div[@class='U110MP']/img/@src").extract()[0]
        item['site_id'] = 28
        item['user_id'] = str(response.meta['user_id'])

        # First table: blood type, education, salary, height.
        labels = response.xpath(
            "//div[@class='UmainL3content1']/dt/text()").extract()
        values = response.xpath(
            "//div[@class='UmainL3content1']/dd/text()").extract()
        for pos, label in enumerate(labels):
            # The value is only read inside a matching branch, so a label
            # without a counterpart value matters only when it is used.
            if u'血' in label:
                value = values[pos]
                if value in blood_codes:
                    item['blood_type'] = blood_codes[value]
            elif u'学' in label:
                value = values[pos]
                if value in education_codes:
                    item['education_level'] = education_codes[value]
            elif u'月' in label:
                value = values[pos]
                if value not in hidden:
                    item['salary'] = value
            elif u'身' in label:
                value = values[pos]
                if value not in hidden:
                    item['height'] = value

        # Second table: marital status, weight, occupation, birthday.
        labels = response.xpath(
            "//div[@class='UmainL3content2']/dt/text()").extract()
        values = response.xpath(
            "//div[@class='UmainL3content2']/dd/text()").extract()
        for pos, label in enumerate(labels):
            if u'婚姻状况' in label:
                value = values[pos]
                if value in marital_codes:
                    item['marital_status'] = marital_codes[value]
            elif u'体' in label:
                value = values[pos]
                if value not in hidden:
                    item['weight'] = value
            elif u'职' in label:
                value = values[pos]
                if value not in hidden:
                    item['occupation'] = value
            elif u'年' in label:
                value = values[pos]
                if value not in hidden:
                    # Keep the ten characters before the final character.
                    item['birthday'] = value[-11:-1]
    except:
        return
    return item
def parse_item(self, response):
    """Build a PoiItem from a profile page (site id 30).

    Returns ``None`` when the name, avatar or ``user_id`` meta value cannot
    be read.  Gender, birthday, location and description are read
    unconditionally.  Education level and marital status are only set when
    the page value is present in the corresponding mapping table.

    The marital status code is stored under ``marital_status``.  It was
    previously written to ``education_level``, which overwrote the
    education code and left ``marital_status`` unset.
    """
    item = PoiItem()
    try:
        item['name'] = response.css(
            'div.z-xql span.mr-5::text').extract()[0]
        item['avatar'] = response.css(
            'div.z-img img::attr(src)').extract()[0]
        item['site_id'] = 30
        item['user_id'] = str(response.meta['user_id'])
    except Exception:
        return

    info = response.css('p.z-detail span')
    item['gender'] = 'M' if info[0].css(
        '::text').extract()[0] == u'男' else 'F'
    # The page shows an age; it is turned into a birth year with
    # placeholder month and day.
    item['birthday'] = str(datetime.date.today().year - int(info[1].css(
        '::text').extract()[0])) + '-00-00'
    # The first two characters of the last span are dropped.
    item['location'] = info[-1].css('::text').extract()[0][2:]

    info = response.css('div.z-subcon2 td.w2')
    # item['height'] = info[0].css('span::text').extract()[0]
    education_level = {
        u'初中': u'1',
        u'中专/职高/技校': u'2',
        u'高中及以下': u'1',
        u'大专以下': u'2',
        u'大专': u'3',
        u'本科': u'4',
        u'硕士': u'5',
        u'博士': u'6',
        u'博士后': u'6',
    }
    education = info[1].css('span::text').extract()
    if education:
        # Unlisted values are skipped rather than raising KeyError.
        education_code = education_level.get(education[0].strip())
        if education_code is not None:
            item['education_level'] = education_code

    marital_status = {
        u'已婚': u'M',
        u'未婚': u'S',
        u'离异': u'D',
        u'离异单身': u'D',
        u'离异带孩': u'D',
        u'丧偶': u'W',
        u'丧偶单身': u'W',
        u'丧偶带孩': u'W',
    }
    marital = info[2].css('span::text').extract()
    if marital:
        # u'未填写' and any other unlisted value are simply not in the table.
        marital_code = marital_status.get(marital[0].strip())
        if marital_code is not None:
            item['marital_status'] = marital_code

    item['description'] = response.css('div.z-db')[0].css(
        '::text').extract()[0]
    return item
def parse_item(self, response):
    """Build a PoiItem from a profile page (site id 6).

    Returns ``None`` when the name, avatar or ``user_id`` meta value cannot
    be read.

    The ``h6.member_name`` text is split on u',': the first part is the
    age, the last part is the location, and everything in between is the
    marital status.  This lets the status values that themselves contain a
    comma (e.g. u'离异,无小孩') match their table entries.  Marital status,
    education level and blood type are only set when the page value is
    present in the mapping table.

    Raises:
        ValueError: if the header has fewer than three comma-separated
            parts, or the score / age text is not an integer.
        IndexError: if a mandatory element (score, description, header,
            info-list row, fifth ``div.js_box``) is missing.
    """
    item = PoiItem()
    try:
        item['name'] = response.css('h4::text').extract()[0]
        item['avatar'] = response.css(
            'img.img_absolute::attr(_src)').extract()[0]
        # If name is available, then this item is valid.
        item['site_id'] = 6  # Site id for this spider.
        item['user_id'] = str(response.meta['user_id'])
    except Exception:
        return

    item['score'] = int(response.css('h6::text').extract()[0])
    try:
        item['level'] = response.css('span.member_dj::text').extract()[0]
    except Exception:
        # No direct text: fall back to the titles of the links inside.
        item['level'] = ' '.join(
            response.css('span.member_dj a::attr(title)').extract())
    item['description'] = response.css('div.js_text::text').extract()[0]

    header = response.css('h6.member_name::text').extract()[0]
    parts = header.split(u',')
    if len(parts) < 3:
        raise ValueError('unexpected member_name header: %r' % (header,))
    age = parts[0]
    location = parts[-1]
    marital = u','.join(parts[1:-1])

    # The first two characters of the age part are the age in years; it is
    # turned into a birth year with placeholder month and day.
    item['birthday'] = str(datetime.date.today().year -
                           int(age[0:2])) + '-00-00'
    marital_status = {
        u'未婚': u'S',
        u'已婚': u'M',
        u'恋爱': u'L',
        u'分居': u'P',
        u'离异': u'D',
        u'离异,无小孩': u'D',
        u'离异,有小孩归对方': u'D',
        u'离异,有小孩归自己': u'D',
        u'丧偶': u'W',
        u'丧偶,无小孩': u'W',
        u'丧偶,有小孩归对方': u'W',
        u'丧偶,有小孩归自己': u'W',
    }
    marital_code = marital_status.get(marital.strip())
    if marital_code is not None:
        item['marital_status'] = marital_code
    # The first two characters of the location part are dropped.
    item['location'] = location[2:]

    rows = response.css('ul.member_info_list li')

    def disclosed(index):
        # First <em> text of row ``index``, or None when it is missing or
        # is one of the "not disclosed" markers.
        found = rows[index].css('em::text').extract()
        if found and found[0] != u'--' and found[0] != u'保密':
            return found[0]
        return None

    education_level = {
        u'初中': u'1',
        u'高中': u'1',
        u'本科': u'4',
        u'双学士': u'5',
        u'硕士': u'5',
        u'小学': u'7',
        u'高中中专及以下': u'1',
        u'中专或相当学历': u'2',
        u'大专': u'3',
        u'博士': u'6',
        u'其他': u'7',
    }
    education = disclosed(0)
    if education is not None:
        education_code = education_level.get(education.strip())
        if education_code is not None:
            item['education_level'] = education_code

    height = disclosed(1)
    if height is not None:
        item['height'] = height
    salary = disclosed(3)
    if salary is not None:
        item['salary'] = salary
    weight = disclosed(5)
    if weight is not None:
        item['weight'] = weight

    bloodType = {
        u'O型': u'1',
        u'A型': u'2',
        u'B型': u'3',
        u'AB型': u'4',
        u'其它': u'5',
    }
    blood = disclosed(9)
    if blood is not None:
        blood_code = bloodType.get(blood.strip())
        if blood_code is not None:
            item['blood_type'] = blood_code

    # Experience: "label value " for every filled row of the fifth box.
    pieces = []
    box = response.css('div.js_box')[4]
    if box:
        for li in box.css('li.fn-clear'):
            value = li.css('em::text').extract()[0]
            if value != '--':
                pieces.append(
                    li.css('span::text').extract()[0] + value + ' ')
    item['experience'] = ''.join(pieces)
    return item
def parse_item(self, response):
    """Build a PoiItem from a profile page (site id 18).

    The name is the heading text before the first '[' with trailing
    whitespace removed.  Every span text under the ``right_content`` block
    is matched against a list of labels in order; the first match wins and
    the value is the text after the label prefix.  Education level and
    marital status are only set when the value is present in the mapping
    table, so an unlisted marital status no longer discards the item.

    Returns ``None`` if any exception is raised while parsing.
    """
    education_level = {
        u'初中': u'1',
        u'中专/职高/技校': u'3',
        u'高中': u'1',
        u'大专': u'2',
        u'本科': u'4',
        u'硕士': u'5',
        u'博士': u'6',
        u'博士后': u'6',
    }
    marital_status = {
        u'已婚': u'M',
        u'单身': u'S',
        u'未婚': u'S',
        u'离异': u'D',
        u'丧偶': u'W',
    }

    item = PoiItem()
    try:
        user_name = response.css('div.right_content h2::text').extract()[0]
        item['name'] = user_name.split('[')[0].rstrip()
        item['avatar'] = response.css(
            'div.pic img::attr(src)').extract()[0]
        item['site_id'] = 18
        item['user_id'] = str(response.meta['user_id'])
        item['description'] = response.css(
            'div.box_content p::text').extract()[0]

        spans = response.xpath(
            "//div[@class='right_content']/p/span/text()").extract()
        for text in spans:
            if u'男' in text:
                item['gender'] = 'M'
            elif u'女' in text:
                item['gender'] = 'F'
            elif u'学历' in text:
                education = text[3:]
                # u'保密' and other unlisted values are not in the table.
                if education in education_level:
                    item['education_level'] = education_level[education]
            elif u'所在地' in text:
                item['location'] = text[4:]
            elif u'籍贯' in text:
                item['hometown'] = text[3:]
            elif u'婚姻状况' in text:
                marital = text[5:]
                if marital in marital_status:
                    item['marital_status'] = marital_status[marital]
            elif u'月薪' in text:
                item['salary'] = text[3:]
            elif u'身高' in text:
                item['height'] = text[3:]
    except Exception:
        return
    return item
def parse_item(self, response):
    """Build a PoiItem from a profile page (site id 24).

    Two lists of ``dd`` texts are scanned; each text is matched against a
    list of labels in order and the value is the text after the label
    prefix.  Coded fields (marital status, blood type, education level)
    are only set when the value is present in the mapping table.  The
    description is the concatenation of every ``pro_details`` paragraph
    text except the first.

    The marital status lookup uses the same sliced value as the membership
    test.  It previously looked up the whole ``dd`` text, which raised
    ``KeyError`` and caused the entire item to be dropped.

    Returns ``None`` if any exception is raised while parsing.
    """
    marital_status = {
        u'已婚': u'M',
        u'恋爱中': u'S',
        u'未婚': u'S',
        u'离异': u'D',
        u'丧偶': u'W',
    }
    bloodType = {
        u'O型': u'1',
        u'A型': u'2',
        u'B型': u'3',
        u'AB型': u'4',
        u'其他型': u'5',
    }
    education_level = {
        u'初中': u'1',
        u'中专/职高/技校': u'3',
        u'高中': u'1',
        u'大专': u'2',
        u'本科': u'4',
        u'硕士': u'5',
        u'博士': u'6',
        u'博士后': u'6',
    }

    item = PoiItem()
    try:
        item['name'] = response.xpath(
            "//p[@class='name']/strong/text()").extract()[0]
        item['avatar'] = response.xpath(
            "//div[@class='photo']/img/@src").extract()[0]
        item['site_id'] = 24
        item['user_id'] = str(response.meta['user_id'])

        basics = response.xpath(
            "//div[@style='background:none;']/dl/dd/text()").extract()
        for text in basics:
            if u'男' in text:
                item['gender'] = 'M'
            elif u'女' in text:
                item['gender'] = 'F'
            elif u'身高' in text:
                item['height'] = text[4:]
            elif u'体重' in text:
                item['weight'] = text[4:]
            elif u'情感状态' in text:
                marital = text[6:]
                if marital in marital_status:
                    item['marital_status'] = marital_status[marital]

        details = response.xpath(
            "//div[@class='pro_details']/dl/dd/text()").extract()
        for text in details:
            if u'兴趣爱好' in text:
                item['favorites'] = text[6:]
            elif u'血型' in text:
                blood = text[4:]
                if blood in bloodType:
                    item['blood_type'] = bloodType[blood]
            elif u'工作地区' in text:
                item['location'] = text[5:]
            elif u'学历' in text:
                education = text[3:]
                if education in education_level:
                    item['education_level'] = education_level[education]
            elif u'职业' in text:
                item['occupation'] = text[3:]
            elif u'年收入' in text:
                item['salary'] = text[4:]

        paragraphs = response.xpath(
            "//div[@class='pro_details']/p/text()").extract()
        # The first paragraph text is skipped.
        item['description'] = u''.join(paragraphs[1:])
    except Exception:
        return
    return item
def parse_item(self, response):
    """Build a PoiItem from a profile page (site id 16).

    Name, avatar, gender, occupation, height, weight and hometown are
    mandatory.  Education level, blood type, marital status, salary and
    location are optional and skipped when absent or equal to u'保密'.
    Any exception raised while parsing, including an optional coded value
    that is missing from its mapping table, makes the method return
    ``None``.
    """
    education_codes = {
        u'初中': u'1',
        u'中专/职高/技校': u'3',
        u'高中': u'1',
        u'大专': u'2',
        u'本科': u'4',
        u'硕士': u'5',
        u'博士': u'6',
        u'博士后': u'6',
    }
    blood_codes = {
        u'O型': u'1',
        u'A型': u'2',
        u'B型': u'3',
        u'AB型': u'4',
        u'其他型': u'5',
    }
    marital_codes = {
        u'已婚': u'M',
        u'未婚': u'S',
        u'离异': u'D',
        u'丧偶': u'W',
    }

    def required(selector):
        # First match of ``selector``; IndexError when nothing matches.
        return response.css(selector).extract()[0]

    def optional(selector):
        # Stripped first match of ``selector``, or None when nothing
        # matches or the value is the "confidential" marker.
        matches = response.css(selector).extract()
        if not matches:
            return None
        text = matches[0].strip()
        if text == u'保密':
            return None
        return text

    item = PoiItem()
    try:
        item['name'] = required('a.profile-netname h1::text')
        item['avatar'] = required('div.profile-user-img-box img::attr(src)')
        item['site_id'] = 16
        item['user_id'] = str(response.meta['user_id'])

        if u'先生' in required('span#profile_sex::text'):
            item['gender'] = 'M'
        else:
            item['gender'] = 'F'
        item['occupation'] = required('span#profile_occupation::text')

        education = optional('span#profile_education::text')
        if education is not None:
            item['education_level'] = education_codes[education]

        item['height'] = required('span#profile_height::text')
        item['weight'] = required('span#profile_weight::text')

        blood = optional('span#profile_blood_type::text')
        if blood is not None:
            item['blood_type'] = blood_codes[blood]

        marital = optional('span#profile_marital::text')
        if marital is not None:
            item['marital_status'] = marital_codes[marital]

        income = optional('span#profile_income::text')
        if income is not None:
            item['salary'] = income

        residence = optional(
            'span#profile_r_state_id a[target=_blank]::text')
        if residence is not None:
            item['location'] = residence

        item['hometown'] = required('span#profile_n_state_id::text')
    except:
        return
    return item