def parse_art(self, response):
    """Parse a People's Daily (人民网) article page and yield one ArticleItem.

    Articles published before the hard-coded cut-off date are skipped.
    Any extraction failure is caught and printed, never raised.
    """
    earliest = '2019年03月01日'  # TODO 设置起始时间 -- crawl cut-off date
    try:
        header_text = response.xpath(
            '//div[@class="box01"]/div[@class="fl"]/text()').extract_first()
        published = header_text.replace(' 来源:', '')
        # Lexicographic compare works because both strings share the
        # zero-padded 年/月/日 layout.
        if published < earliest:
            return  # older than the cut-off: skip this article
        item = ArticleItem()
        item["title"] = response.xpath('//h1/text()').extract_first().strip()
        item["author"] = response.xpath(
            '//div[@class="edit clearfix"]/text()').extract_first()
        item["push_time"] = published
        item["source"] = response.xpath(
            '//div[@class="box01"]/div[@class="fl"]/a/text()').extract_first()
        body = response.xpath('//div[@class="box_con"]')
        item["detail"] = body.xpath('string(.)').extract_first().strip()
        item["url"] = response.url
        item['catch_time'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        item['collection'] = "人民网"
        if 'auto' in response.url:
            item['type'] = '汽车'
        yield item
    except Exception as e:
        print("解析失败", e.__traceback__.tb_lineno, e)
def parse_detail(self, response):
    """Parse a 110法律咨询 forum thread page, walking posts newest-first.

    On page 1 of a multi-page thread the spider jumps to the last page,
    then follows "prev" links backwards; traversal stops at the first post
    older than ``self.start_time_date``.
    """
    try:
        # Page 1 of a multi-page thread has a "next" link but no "prev":
        # jump straight to the last page and crawl backwards from there.
        if response.xpath('//a[@class="next"]') and not response.xpath(
                '//a[@class="prev"]'):
            last_page_url = response.xpath(
                '//div[@class="pages_btns"][1]/div[@class="pages"]/a[last()]/@href'
            ).extract_first()
            last_page_url = self.base_url + last_page_url
            yield scrapy.Request(last_page_url, callback=self.parse_detail)
            return
        title = response.xpath('//h1/text()').extract_first()
        item_num = 0
        while True:  # walk posts on this page in reverse document order
            try:
                reply = response.xpath(
                    '//div[@class="mainbox viewthread"][last()-{}]'.format(
                        item_num))
                if not reply:
                    # This page is exhausted: follow the "prev" link, if any.
                    before_page_url = response.xpath(
                        '//div[@class="pages_btns"][1]/div[@class="pages"]/a[@class="prev"]/@href'
                    ).extract_first()
                    if before_page_url:
                        before_page_url = self.base_url + before_page_url
                        yield scrapy.Request(before_page_url,
                                             callback=self.parse_detail)
                    break
                item = ArticleItem()
                item["title"] = title
                item["author"] = reply.xpath(
                    './/td[@class="postauthor"]/cite/a/text()').extract_first()
                push_time_str = reply.xpath(
                    './/div[@class="postinfo"]/text()[5]').extract_first().strip()
                push_time_str = push_time_str.replace("发表于 ", '')
                # BUG FIX: the format string was "%Y-%m-%d %H:%S"
                # (hour:SECOND), which silently parsed the minutes field as
                # seconds; forum timestamps are hour:minute.
                push_time_date = datetime.strptime(push_time_str,
                                                   "%Y-%m-%d %H:%M")
                if push_time_date < self.start_time_date:
                    return  # everything from here back is older: stop
                item["push_time"] = push_time_date
                item["source"] = None  # forum posts carry no source field
                art_path = reply.xpath(
                    './/div[@class="postmessage defaultpost"]')
                item["detail"] = art_path.xpath(
                    'string(.)').extract_first().strip()
                item["url"] = response.url
                item['catch_time'] = time.strftime("%Y-%m-%d %H:%M:%S",
                                                   time.localtime())
                item['collection'] = "110法律咨询"
                item['kw'] = None
                item['type'] = None
                yield item
                item_num += 1
            except Exception as e:
                print("解析失败", e.__traceback__.tb_lineno, e)
                break
    except Exception as e:
        print("解析失败", e.__traceback__.tb_lineno, e)
def parse_art(self, response):
    """Parse a Xinhua (新华网) article page and yield one ArticleItem.

    Extraction errors are caught and printed, never raised.
    """
    try:
        item = ArticleItem()
        item["title"] = response.xpath(
            '//div[@class="h-title"]/text()').extract_first().strip()
        item["author"] = response.xpath(
            '//span[@class="p-jc"]/text()[2]').extract_first().strip()
        item["push_time"] = response.xpath(
            '//div[@class="h-info"]/span[1]/text()').extract_first()
        item["source"] = response.xpath(
            '//div[@class="h-info"]//em[@id="source"]/text()'
        ).extract_first().strip()
        body = response.xpath('//div[@id="p-detail"]')
        item["detail"] = body.xpath('string(.)').extract_first().strip()
        item["url"] = response.url
        item['catch_time'] = time.strftime("%Y-%m-%d %H:%M:%S",
                                           time.localtime())
        item['collection'] = "新华网"
        # Map a URL keyword to a category. Checked in the same order as
        # the original if-chain, so a later match still wins.
        categories = (
            ("tech", '科技'),
            ('auto', '汽车'),
            ('energy', '能源'),
            ('money', '金融'),
        )
        for keyword, label in categories:
            if keyword in response.url:
                item['type'] = label
        yield item
    except Exception as e:
        print("解析失败", e.__traceback__.tb_lineno, e)
def parse_reply(self, response):
    """Parse a Xinhua forum reply page; yield one ArticleItem per reply.

    Fix: the thread title is identical for every reply on the page, so it
    is now extracted once before the loop instead of re-running the same
    two xpath queries on every iteration.
    """
    try:
        # Loop-invariant thread title, with a fallback for pages whose
        # <h1> contains only a single <span>.
        title = response.xpath('//h1/span[2]/text()').extract_first()
        if title is None:
            title = response.xpath('//h1/span/text()').extract_first()
        for each in response.xpath('//div[@id="postreply"]/dl'):
            item = ArticleItem()
            item["title"] = title
            item["author"] = each.xpath(
                './dd/ul[1]/li[1]/a/text()').extract_first().strip()
            item["push_time"] = each.xpath(
                './dd/ul[1]/li[2]/span/text()').extract_first()
            item["source"] = None  # forum replies carry no source field
            art_path = each.xpath('./dd/div[@id]')
            item["detail"] = art_path.xpath(
                'string(.)').extract_first().strip()
            item["url"] = response.url
            item['catch_time'] = time.strftime("%Y-%m-%d %H:%M:%S",
                                               time.localtime())
            item['collection'] = "新华网"
            item['type'] = '帖子回复'
            yield item
    except Exception as e:
        print("解析失败", e.__traceback__.tb_lineno, e)
def parse_art(self, response):
    """Parse a Wuyouzhixiang (乌有之乡) article page and yield one ArticleItem.

    Articles dated before 2018-03-01 are skipped.

    Fix: the timestamp extraction and ``strptime`` call previously ran
    *outside* the try/except, so a missing or malformed date string crashed
    the callback instead of being logged like every other parse error in
    this file.
    """
    try:
        time_str = response.xpath(
            '//div[@class="f-fl"]/span[3]/text()').extract_first()
        push_time = datetime.strptime(time_str, "%Y-%m-%d")  # publish date
        start_time = datetime.strptime('2018-03-01', "%Y-%m-%d")  # cut-off
        if push_time >= start_time:
            item = ArticleItem()
            item["title"] = response.xpath(
                '//h1/text()').extract_first().strip()
            item["author"] = response.xpath(
                '//div[@class="f-fl"]/span[1]/text()').extract_first()
            item["push_time"] = time_str
            source = response.xpath(
                '//div[@class="f-fl"]/text()[6]').extract_first().strip()
            item["source"] = source
            art_path = response.xpath('//article')
            item["url"] = response.url
            item["detail"] = art_path.xpath(
                'string(.)').extract_first().strip()
            item['catch_time'] = time.strftime("%Y-%m-%d %H:%M:%S",
                                               time.localtime())
            item['collection'] = "乌有之乡网刊(2018/3)"
            item['type'] = response.xpath(
                '//span[@class="s-last"]/a/text()').extract_first()
            yield item
    except Exception as e:
        print("解析失败", e.__traceback__.tb_lineno, e)
def parse_detail(self, response):
    """Parse a Tianya (天涯) thread page: the host post, then its replies.

    Replies are walked in reverse document order; when a page is exhausted
    the spider follows the "上页" (previous page) link. Posts timestamped
    before ``self.start_time`` stop the traversal.

    Fix: ``response.xpath(...)`` returns a SelectorList, never ``None``,
    so the old ``if reply is None`` check could never fire and the
    previous-page link was never followed; the check is now ``if not
    reply``. The inner bare ``except:`` is also narrowed to ``Exception``
    and now logs the failure.
    """
    try:
        # Page 1 of a multi-page thread (has "下页" but no "上页"):
        # jump straight to the last page and crawl backwards from there.
        if response.xpath('//a[text()="下页"]') and not response.xpath('//a[text()="上页"]'):
            last_page_url = response.xpath(
                '//div[@class="mb15 cf"]/div[@class="atl-pages"]/form/a[last()-1]/@href').extract_first()
            last_page_url = self.base_url + last_page_url
            yield scrapy.Request(last_page_url, callback=self.parse_detail)
            return
        title = response.xpath('//*[@id="post_head"]/h1/span[1]/span/text()').extract_first()
        host_path = response.xpath('//div[@class="atl-item host-item"]')
        if host_path:  # the original (host) post is present on this page
            push_time_str = response.xpath(
                '//*[@id="post_head"]/div[2]/div[2]/span[2]/text()').extract_first()
            push_time = push_time_str.replace('时间:', '')
            if push_time > self.start_time:  # inside the crawl window
                item = ArticleItem()
                item["title"] = title
                item["author"] = response.xpath(
                    '//*[@id="post_head"]/div[2]/div[2]/span[1]/a[1]/text()').extract_first()
                item["push_time"] = push_time
                item["source"] = None
                art_path = host_path.xpath('.//div[@class="bbs-content clearfix"]')
                item["detail"] = art_path.xpath('string(.)').extract_first().strip()
                item["url"] = response.url
                item['catch_time'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                item['collection'] = "(天涯网)" + "自动驾驶"
                item['kw'] = self.kw
                item['type'] = None
                yield item
        item_num = 0
        while True:  # walk replies in reverse document order
            try:
                reply = response.xpath('//div[@class="atl-item"][last()-{}]'.format(item_num))
                # BUG FIX: was `if reply is None:`, which never matched an
                # empty SelectorList, so pagination was unreachable.
                if not reply:
                    before_page_url = response.xpath(
                        '//div[@class="mb15 cf"]/div[@class="atl-pages"]/form/a[text()="上页"]/@href'
                    ).extract_first()
                    if before_page_url:
                        before_page_url = self.base_url + before_page_url
                        yield scrapy.Request(before_page_url, callback=self.parse_detail)
                    break
                item = ArticleItem()
                item["title"] = title
                item["author"] = reply.xpath('./@_host').extract_first()
                push_time = reply.xpath('./@js_restime').extract_first()
                if push_time < self.start_time:
                    break  # replies from here back are older: stop
                item["push_time"] = push_time
                item["source"] = None
                art_path = reply.xpath('.//div[@class="bbs-content"]')
                item["detail"] = art_path.xpath('string(.)').extract_first().strip()
                item["url"] = response.url
                item['catch_time'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                item['collection'] = "(天涯网)" + "自动驾驶"
                item['kw'] = self.kw
                item['type'] = None
                yield item
                item_num += 1
            except Exception as e:
                # Narrowed from a bare `except:`; keep the best-effort break.
                print("解析失败", e.__traceback__.tb_lineno, e)
                break
    except Exception as e:
        print("解析失败", e.__traceback__.tb_lineno, e)
def parse_detail(self, response):
    """Parse a Xinhua forum thread page: yield the post, then its replies.

    Multi-page comment threads schedule one request per comment page via
    ``parse_reply``; single-page comments are parsed inline.

    Fix: the single-page branch duplicated ``parse_reply``'s extraction
    code line for line; it now delegates to that method (which does its
    own error logging), removing the duplication.
    """
    try:
        item = ArticleItem()
        item["title"] = response.xpath(
            '//h1/span[2]/text()').extract_first()
        if item["title"] is None:  # fallback: <h1> with a single <span>
            item["title"] = response.xpath(
                '//h1/span/text()').extract_first()
        item["author"] = response.xpath(
            '//ul[@class="de-xx clear"]/li[2]/a/text()').extract_first(
        ).strip()
        item["push_time"] = response.xpath(
            '//ul[@class="de-xx clear"]/li[@class="fr"]/span/text()'
        ).extract_first()
        item["source"] = None  # forum posts carry no source field
        art_path = response.xpath('//div[@id="message_"]')
        item["detail"] = art_path.xpath(
            'string(.)').extract_first().strip()
        item["url"] = response.url
        item['catch_time'] = time.strftime("%Y-%m-%d %H:%M:%S",
                                           time.localtime())
        item['collection'] = "新华网"
        item['type'] = '帖子'
        yield item
        reply_page = response.xpath('//div[@class="lt-page clear"]')
        if reply_page:  # multi-page comments: one request per page
            for each in response.xpath(
                    '//div[@class="lt-page clear"][1]/ul[@class="fl1"]/li'):
                url_str = each.xpath('./a/@href').extract_first()
                reply_page_url = 'http://forum.home.news.cn' + url_str
                yield scrapy.Request(url=reply_page_url,
                                     callback=self.parse_reply)
        elif response.xpath('//div[@id="postreply"]/dl'):  # single page
            # Delegate instead of duplicating parse_reply's code.
            for reply_item in self.parse_reply(response):
                yield reply_item
    except Exception as e:
        print("解析失败", e.__traceback__.tb_lineno, e)