from scrapy.selector import Selector


def detailParse(self, response):
    # Detail-page callback: finish the item handed over from the list page.
    item = response.meta['item']
    fcontent = Selector(response=response).xpath(
        '//span[@property="v:summary"]/text()')
    # extract_first(default='') guards against pages without a summary node,
    # which would otherwise make .strip() raise AttributeError on None.
    item['fcontent'] = fcontent.extract_first(default='').strip()
    yield item
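# The response.meta['item'] hand-off above implies a companion list-page
# callback that builds the item and schedules the detail request. Below is a
# minimal sketch of that pattern, assuming a Douban-style movie list (the
# v:summary microdata property is what Douban detail pages carry); the spider
# name, start URL, and list-page XPaths are illustrative assumptions, not
# taken from the original code. detailParse would live on this spider as a
# method.
import scrapy


class FilmSpider(scrapy.Spider):
    name = 'film'  # assumed
    start_urls = ['https://movie.douban.com/top250']  # assumed list page

    def parse(self, response):
        for movie in response.xpath('//div[@class="hd"]/a'):
            item = {'fname': movie.xpath(
                './span[@class="title"]/text()').extract_first()}
            detail_url = movie.xpath('./@href').extract_first()
            # Pass the half-built item along; detailParse fills in fcontent.
            yield scrapy.Request(detail_url,
                                 callback=self.detailParse,
                                 meta={'item': item})
# On current Scrapy versions, cb_kwargs is the preferred channel for this
# hand-off, but meta matches the style of the original callback.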
from scrapy.selector import Selector


def get_all_url(self):
    # Collect every post URL from the paginated "my posts" list pages.
    zero_page_url = 'http://www.baixing.com/w/posts/myPosts/all?page=0'
    self.login()
    zero_page_response = self.session.get(zero_page_url, headers=self.headers)
    # Read the total page count from the pagination bar; the requests
    # response is wrapped via text=, since scrapy.Selector expects either
    # a Scrapy response or raw text.
    page_num = Selector(text=zero_page_response.text).xpath(
        '//ol[@class="page-nav"]/li[last()-1]/a/text()')
    if page_num:
        page_num = int(page_num.extract_first())
    else:
        page_num = 1  # no pagination bar: only page 0 exists
    print('Fetched page -0- OK')
    self.file.write('Fetched page -0- OK\n')
    # Collect all post URLs on page 0.
    zero_id_list = Selector(text=zero_page_response.text).xpath(
        '//*[@id="posts-ad-items"]/li/a/@href').extract()
    for id_ in zero_id_list:
        self.file.write(id_)
        self.file.write('\n')
    # Drop the trailing "0" to get the URL template for the later pages.
    init_url = zero_page_url[:-1]
    for index in range(1, page_num):
        url_ = init_url + str(index)
        res = self.session.get(url_, headers=self.headers)
        if res.status_code == 200:
            print('Fetched page -{}- OK'.format(index))
            id_list = Selector(text=res.text).xpath(
                '//*[@id="posts-ad-items"]/li/a/@href').extract()
            self.file.write('Fetched page -{}- OK\n'.format(index))
            for id_ in id_list:
                self.file.write(id_)
                self.file.write('\n')
        else:
            print('Failed to fetch page -{}-'.format(index))
            self.file.write('Failed to fetch page -{}-\n'.format(index))
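# get_all_url relies on state the surrounding class provides: an
# authenticated requests.Session, default headers, a log-file handle, and a
# login() method. Below is a hypothetical skeleton of that class, assuming
# those attribute names; the class name, constructor argument, and log-file
# name are invented for the sketch.
import requests


class BaixingPoster:
    def __init__(self, logfile='posts.log'):
        self.session = requests.Session()
        self.headers = {'User-Agent': 'Mozilla/5.0'}
        self.file = open(logfile, 'w', encoding='utf-8')

    def login(self):
        # Site-specific authentication is omitted; whatever it does, it
        # must leave valid cookies on self.session so the "my posts"
        # pages are reachable.
        raise NotImplementedError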
import json
import re

from scrapy.selector import Selector

# `items` (defining userItem) and `get_userID_from_mainPage` are
# project-local helpers this function depends on.


def get_userItem(body, user_bit):
    """
    :type body: str
    :param body: HTML source of the user's profile page
    :param user_bit: accepted for the caller's bookkeeping; not used here
    :return: item: populated user-info item
             concern_link: link to the users this user follows
             fans_link: link to this user's followers
    """
    item = items.userItem()
    # User ID
    user_id = int(get_userID_from_mainPage(body))
    item['id'] = user_id
    # Screen name, embedded in an inline $CONFIG script variable
    p_name = re.compile(r'\$CONFIG\[\'onick\'\]=\'(.*)\'')
    try:
        name_ = p_name.search(body).group(1)
        item['name'] = name_
    except AttributeError:  # pattern not found on the page
        item['name'] = ''
    # The profile widgets are embedded as JSON inside FM.view(...) script
    # blocks; route each block's HTML payload by its domid.
    script_heads = Selector(text=body).xpath('//script/text()').extract()
    head_body = ''
    content_body = ''
    num_body = ''
    relation_body = ''
    for script_head in script_heads:
        p_head = re.compile(r'^\s*FM\.view\(\{(.*)\}\)', re.M)
        p_head_s = p_head.search(
            script_head.replace('\n', '').replace('\t', ''))
        if p_head_s:
            json_head = '{' + p_head_s.group(1) + '}'
            data = json.loads(json_head)
            if data.get('domid'):
                if 'Pl_Official_Headerv6' in data.get('domid'):
                    # Verified badge and gender live in this block
                    head_body = data.get('html')
                elif 'Pl_Core_UserInfo' in data.get('domid'):
                    # Verification description, address, birthday
                    content_body = data.get('html')
                elif 'Pl_Core_T8CustomTriColumn' in data.get('domid'):
                    num_body = data.get('html')
                elif 'Pl_Core_UserGrid' in data.get('domid'):
                    relation_body = data.get('html')
    if head_body:
        # Verified ("V") status
        v = Selector(text=head_body).xpath(
            "//em[@class='W_icon icon_pf_approve']").extract_first()
        v_co = Selector(text=head_body).xpath(
            "//em[contains(@class, 'icon_pf_approve_co')]").extract_first()
        if v:
            item['is_v'] = 1  # verified individual
        elif v_co:
            item['is_v'] = 2  # verified organization
        else:
            item['is_v'] = 0
        # Gender
        female = Selector(text=head_body).xpath(
            "//i[@class='W_icon icon_pf_female']").extract_first()
        male = Selector(text=head_body).xpath(
            "//i[@class='W_icon icon_pf_male']").extract_first()
        if female:
            item['sex'] = 1  # female
        elif male:
            item['sex'] = 2  # male
        else:
            item['sex'] = 0  # unknown or parse error
        # Verification description
        v_des = Selector(text=head_body).xpath(
            "//div[@class='pf_intro']/text()")
        if v_des:
            item['v_des'] = v_des.extract_first().strip()
        else:
            item['v_des'] = ''
    else:
        item['is_v'] = -1  # parse error
        item['sex'] = 0
        item['v_des'] = ''
    # Weibo level
    if content_body:
        level_ = Selector(text=content_body).xpath(
            "//a[contains(@class,'W_icon_level')]/span/text()"
        ).extract_first()
        if level_:
            # The node text looks like "Lv.30"; strip the prefix characters
            item['level'] = int(level_.strip('Lv. '))
        else:
            item['level'] = -1
        # Address and birthday
        item['address'] = ''
        item['birthday'] = ''
        contents = Selector(text=content_body).xpath(
            "//div[@class='detail']/ul[@class='ul_detail']"
            "/li[@class='item S_line2 clearfix']")
        for content in contents:
            icon = content.xpath(
                "./span[contains(@class, 'item_ico W_f')]/em/@class"
            ).extract_first()
            if icon == 'W_ficon ficon_cd_place S_ficon':
                addr = content.xpath(
                    "./span[contains(@class, 'item_text W_f')]/text()"
                ).extract_first().strip()
                item['address'] = addr
            elif icon == 'W_ficon ficon_constellation S_ficon':
                birthday = content.xpath(
                    "./span[contains(@class, 'item_text W_f')]/text()"
                ).extract_first().strip()
                item['birthday'] = birthday
    else:
        item['level'] = -1
        item['address'] = ''
        item['birthday'] = ''
    # Counts of followings, followers and posts
    concern_link = ''
    fans_link = ''
    if num_body:
        num_infos = Selector(text=num_body).xpath(
            "//a[@class='t_link S_txt1']")
        has_link = False
        if num_infos:
            text_xpath = "./span[@class='S_txt2']/text()"
            num_xpath = "./strong[contains(@class, 'W_f')]/text()"
            has_link = True
        else:
            # Some layouts render the counters as table cells without links
            num_infos = Selector(text=num_body).xpath(
                "//td[@class='S_line1']")
            text_xpath = "./span[@class='S_txt2']/text()"
            num_xpath = "./strong[contains(@class,'W_f')]/text()"
        for num_info in num_infos:
            info = num_info.xpath(text_xpath).extract_first()
            num = num_info.xpath(num_xpath).extract_first()
            if num:
                num = int(num.strip())
            else:
                num = -1
            if info == '关注':  # followings
                item['concern_num'] = num
                if has_link:
                    concern_link = num_info.xpath("./@href").extract_first()
            elif info == '粉丝':  # followers
                item['fans_num'] = num
                if has_link:
                    fans_link = num_info.xpath("./@href").extract_first()
            elif info == '微博':  # posts
                item['blog_num'] = num
        # Fall back to the relation block for the follow/fan links
        if not has_link:
            relation_infos = Selector(text=relation_body).xpath(
                "//div[@class='obj_name']/h2[contains(@class, "
                "'main_title W_fb W_f')]/a[contains(@class, 'S_txt')]")
            for relation in relation_infos:
                link = relation.xpath("./@href").extract_first()
                text = relation.xpath('./text()').extract_first()
                if '关注' in text:
                    concern_link = link
                elif '粉丝' in text:
                    fans_link = link
    else:
        item['concern_num'] = -1  # unknown or parse error
        item['fans_num'] = -1
        item['blog_num'] = -1
    return item, concern_link, fans_link
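# A hypothetical driver for get_userItem: fetch a profile page with an
# already-authenticated requests.Session and hand the raw HTML over. The
# profile URL pattern and the user_bit value of 0 are assumptions (the
# function accepts user_bit but never reads it), and weibo.com generally
# serves these FM.view() blocks only to logged-in sessions.
import requests


def crawl_user(session, uid):
    resp = session.get('https://weibo.com/u/{}'.format(uid))
    item, concern_link, fans_link = get_userItem(resp.text, user_bit=0)
    print(item['name'], item['concern_num'], item['fans_num'])
    return item, concern_link, fans_link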