Exemplo n.º 1
0
    def parse_art(self, response):
        """Parse a People's Daily (人民网) article page and yield one ArticleItem.

        Articles published before the hard-coded start date are skipped; any
        parse failure is logged and swallowed so the crawl keeps going.
        """
        earliest = '2019年03月01日'  # TODO: make the start date configurable
        try:
            raw_time = response.xpath(
                '//div[@class="box01"]/div[@class="fl"]/text()'
            ).extract_first()
            push_time = raw_time.replace('  来源:', '')
            # Fixed-width date strings, so lexicographic order matches
            # chronological order.
            if push_time >= earliest:
                item = ArticleItem()
                item["title"] = response.xpath(
                    '//h1/text()').extract_first().strip()
                item["author"] = response.xpath(
                    '//div[@class="edit clearfix"]/text()'
                ).extract_first()
                item["push_time"] = push_time
                item["source"] = response.xpath(
                    '//div[@class="box01"]/div[@class="fl"]/a/text()'
                ).extract_first()
                body = response.xpath('//div[@class="box_con"]')
                item["detail"] = body.xpath('string(.)').extract_first().strip()
                item["url"] = response.url
                item['catch_time'] = time.strftime(
                    "%Y-%m-%d %H:%M:%S", time.localtime())
                item['collection'] = "人民网"
                if 'auto' in response.url:
                    item['type'] = '汽车'
                yield item
        except Exception as e:
            print("解析失败", e.__traceback__.tb_lineno, e)
Exemplo n.º 2
0
 def parse_detail(self, response):
     """Parse a 110法律咨询 forum thread page, yielding one ArticleItem per post.

     If the page has a "next" link but no "prev" link it is page 1 of a
     multi-page thread, so jump straight to the last page and walk backwards.
     Posts are traversed newest-first; once a post is older than
     self.start_time_date the whole thread is abandoned.
     """
     try:
         if response.xpath('//a[@class="next"]') and not response.xpath(
                 '//a[@class="prev"]'):  # page 1 of a multi-page thread
             last_page_url = response.xpath(
                 '//div[@class="pages_btns"][1]/div[@class="pages"]/a[last()]/@href'
             ).extract_first()
             last_page_url = self.base_url + last_page_url
             yield scrapy.Request(last_page_url, callback=self.parse_detail)
             return
         title = response.xpath('//h1/text()').extract_first()
         item_num = 0
         while True:  # iterate posts from the bottom of the page upward
             try:
                 reply = response.xpath(
                     '//div[@class="mainbox viewthread"][last()-{}]'.format(
                         item_num))
                 if not reply:  # page exhausted — follow the "prev" link
                     before_page_url = response.xpath(
                         '//div[@class="pages_btns"][1]/div[@class="pages"]/a[@class="prev"]/@href'
                     ).extract_first()
                     if before_page_url:
                         before_page_url = self.base_url + before_page_url
                         yield scrapy.Request(before_page_url,
                                              callback=self.parse_detail)
                     break
                 item = ArticleItem()
                 item["title"] = title
                 item["author"] = reply.xpath(
                     './/td[@class="postauthor"]/cite/a/text()'
                 ).extract_first()
                 push_time_str = reply.xpath(
                     './/div[@class="postinfo"]/text()[5]').extract_first(
                     ).strip()
                 push_time_str = push_time_str.replace("发表于 ", '')
                 # BUG FIX: the format was "%Y-%m-%d %H:%S" (hour:SECOND),
                 # which parsed an "HH:MM" timestamp with the minutes landing
                 # in the seconds field and minutes always 0. The page shows
                 # hour:minute, so "%H:%M" is the correct format.
                 push_time_date = datetime.strptime(push_time_str,
                                                    "%Y-%m-%d %H:%M")
                 if push_time_date < self.start_time_date:
                     return  # everything older from here on — stop the thread
                 item["push_time"] = push_time_date
                 item["source"] = None  # forum posts carry no source field
                 art_path = reply.xpath(
                     './/div[@class="postmessage defaultpost"]')
                 item["detail"] = art_path.xpath(
                     'string(.)').extract_first().strip()
                 item["url"] = response.url
                 item['catch_time'] = time.strftime("%Y-%m-%d %H:%M:%S",
                                                    time.localtime())
                 item['collection'] = "110法律咨询"
                 item['kw'] = None
                 item['type'] = None
                 yield item
                 item_num += 1
             except Exception as e:
                 print("解析失败", e.__traceback__.tb_lineno, e)
                 break
     except Exception as e:
         print("解析失败", e.__traceback__.tb_lineno, e)
    def parse_art(self, response):
        """Parse a Xinhua (新华网) article page and yield one ArticleItem.

        The article type is inferred from keywords in the URL; failures are
        logged and swallowed so the crawl keeps going.
        """
        try:
            item = ArticleItem()
            item["title"] = response.xpath(
                '//div[@class="h-title"]/text()').extract_first().strip()
            item["author"] = response.xpath(
                '//span[@class="p-jc"]/text()[2]').extract_first().strip()
            item["push_time"] = response.xpath(
                '//div[@class="h-info"]/span[1]/text()').extract_first()
            item["source"] = response.xpath(
                '//div[@class="h-info"]//em[@id="source"]/text()'
            ).extract_first().strip()
            body = response.xpath('//div[@id="p-detail"]')
            item["detail"] = body.xpath('string(.)').extract_first().strip()
            item["url"] = response.url
            item['catch_time'] = time.strftime(
                "%Y-%m-%d %H:%M:%S", time.localtime())
            item['collection'] = "新华网"
            # Same semantics as the original chain of independent ifs:
            # a later keyword match overrides an earlier one.
            for keyword, category in (("tech", '科技'), ('auto', '汽车'),
                                      ('energy', '能源'), ('money', '金融')):
                if keyword in response.url:
                    item['type'] = category
            yield item
        except Exception as e:
            print("解析失败", e.__traceback__.tb_lineno, e)
Exemplo n.º 4
0
    def parse_reply(self, response):
        """Parse the replies on a 新华网 forum page, yielding one ArticleItem
        per reply.

        Any parse failure aborts the whole page (logged, not re-raised).
        """
        try:
            # The thread title is a property of the page, not of each reply:
            # extract it once (with fallback) instead of re-querying the DOM
            # on every loop iteration as the original did.
            title = response.xpath('//h1/span[2]/text()').extract_first()
            if title is None:
                title = response.xpath('//h1/span/text()').extract_first()
            for each in response.xpath('//div[@id="postreply"]/dl'):
                item = ArticleItem()
                item["title"] = title
                item["author"] = each.xpath(
                    './dd/ul[1]/li[1]/a/text()').extract_first().strip()
                item["push_time"] = each.xpath(
                    './dd/ul[1]/li[2]/span/text()').extract_first()
                item["source"] = None  # forum replies carry no source field
                art_path = each.xpath('./dd/div[@id]')
                item["detail"] = art_path.xpath(
                    'string(.)').extract_first().strip()
                item["url"] = response.url
                item['catch_time'] = time.strftime("%Y-%m-%d %H:%M:%S",
                                                   time.localtime())
                item['collection'] = "新华网"
                item['type'] = '帖子回复'
                yield item
        except Exception as e:
            print("解析失败", e.__traceback__.tb_lineno, e)
Exemplo n.º 5
0
 def parse_art(self, response):
     """Parse a Wuyouzhixiang (乌有之乡) article page and yield one ArticleItem
     when it was published on or after 2018-03-01."""
     published_str = response.xpath(
         '//div[@class="f-fl"]/span[3]/text()').extract_first()
     published = datetime.strptime(published_str, "%Y-%m-%d")
     cutoff = datetime.strptime('2018-03-01', "%Y-%m-%d")
     # Guard clause: too old — nothing to yield.
     if published < cutoff:
         return
     try:
         item = ArticleItem()
         item["title"] = response.xpath(
             '//h1/text()').extract_first().strip()
         item["author"] = response.xpath(
             '//div[@class="f-fl"]/span[1]/text()').extract_first()
         item["push_time"] = published_str
         item["source"] = response.xpath(
             '//div[@class="f-fl"]/text()[6]').extract_first().strip()
         body = response.xpath('//article')
         item["url"] = response.url
         item["detail"] = body.xpath('string(.)').extract_first().strip()
         item['catch_time'] = time.strftime(
             "%Y-%m-%d %H:%M:%S", time.localtime())
         item['collection'] = "乌有之乡网刊(2018/3)"
         item['type'] = response.xpath(
             '//span[@class="s-last"]/a/text()').extract_first()
         yield item
     except Exception as e:
         print("解析失败", e.__traceback__.tb_lineno, e)
Exemplo n.º 6
0
    def parse_detail(self, response):
        """Parse a Tianya (天涯) thread page for the "自动驾驶" crawl.

        Yields the host post (when present and recent enough), then walks the
        replies from the bottom of the page upward, following the "上页" link
        until a reply older than self.start_time is reached.
        """
        try:
            if response.xpath('//a[text()="下页"]') and not response.xpath('//a[text()="上页"]'):
                # Page 1 of a multi-page thread: jump straight to the last
                # page and walk backwards from there.
                last_page_url = response.xpath(
                    '//div[@class="mb15 cf"]/div[@class="atl-pages"]/form/a[last()-1]/@href').extract_first()
                last_page_url = self.base_url + last_page_url
                yield scrapy.Request(last_page_url, callback=self.parse_detail)
                return
            title = response.xpath('//*[@id="post_head"]/h1/span[1]/span/text()').extract_first()
            host_path = response.xpath('//div[@class="atl-item host-item"]')
            if host_path:  # the host (opening) post is on this page
                push_time_str = response.xpath('//*[@id="post_head"]/div[2]/div[2]/span[2]/text()').extract_first()
                push_time = push_time_str.replace('时间:', '')
                if push_time > self.start_time:  # recent enough to keep
                    item = ArticleItem()
                    item["title"] = title
                    item["author"] = response.xpath(
                        '//*[@id="post_head"]/div[2]/div[2]/span[1]/a[1]/text()').extract_first()
                    item["push_time"] = push_time
                    item["source"] = None  # forum posts carry no source field
                    art_path = host_path.xpath('.//div[@class="bbs-content clearfix"]')
                    item["detail"] = art_path.xpath('string(.)').extract_first().strip()
                    item["url"] = response.url
                    item['catch_time'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                    item['collection'] = "(天涯网)" + "自动驾驶"
                    item['kw'] = self.kw
                    item['type'] = None
                    yield item

            item_num = 0
            while True:  # iterate replies from the bottom of the page upward
                try:
                    reply = response.xpath('//div[@class="atl-item"][last()-{}]'.format(item_num))
                    # BUG FIX: response.xpath() returns a (possibly empty)
                    # SelectorList, never None, so the original
                    # "if reply is None" branch was unreachable and paging
                    # backwards only ever happened via a TypeError swallowed
                    # by the bare except below. Test emptiness instead
                    # (matching the sibling spider's "if not reply").
                    if not reply:  # page exhausted — follow the "上页" link
                        before_page_url = response.xpath(
                            '//div[@class="mb15 cf"]/div[@class="atl-pages"]/form/a[text()="上页"]/@href').extract_first()
                        if before_page_url:
                            before_page_url = self.base_url + before_page_url
                            yield scrapy.Request(before_page_url, callback=self.parse_detail)
                        break
                    item = ArticleItem()
                    item["title"] = title
                    item["author"] = reply.xpath('./@_host').extract_first()
                    push_time = reply.xpath('./@js_restime').extract_first()
                    if push_time < self.start_time:
                        break  # everything older from here on — stop the page
                    item["push_time"] = push_time
                    item["source"] = None
                    art_path = reply.xpath('.//div[@class="bbs-content"]')
                    item["detail"] = art_path.xpath('string(.)').extract_first().strip()

                    item["url"] = response.url
                    item['catch_time'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                    item['collection'] = "(天涯网)" + "自动驾驶"
                    item['kw'] = self.kw
                    item['type'] = None
                    yield item
                    item_num += 1
                except Exception:  # narrowed from a bare except; best-effort
                    break
        except Exception as e:
            print("解析失败", e.__traceback__.tb_lineno, e)
Exemplo n.º 7
0
    def parse_detail(self, response):
        """Parse a 新华网 forum post page.

        Yields the post itself, then either schedules the per-page reply
        requests (multi-page threads) or yields the inline replies (single
        page of replies). Failures are logged and swallowed.
        """
        try:
            # The thread title (with its single-span fallback) is needed for
            # the post AND for every inline reply; compute it once up front
            # instead of re-querying the DOM per reply as the original did.
            title = response.xpath('//h1/span[2]/text()').extract_first()
            if title is None:
                title = response.xpath('//h1/span/text()').extract_first()

            item = ArticleItem()
            item["title"] = title
            item["author"] = response.xpath(
                '//ul[@class="de-xx clear"]/li[2]/a/text()').extract_first(
                ).strip()
            item["push_time"] = response.xpath(
                '//ul[@class="de-xx clear"]/li[@class="fr"]/span/text()'
            ).extract_first()
            item["source"] = None  # forum posts carry no source field
            art_path = response.xpath('//div[@id="message_"]')
            item["detail"] = art_path.xpath(
                'string(.)').extract_first().strip()

            item["url"] = response.url
            item['catch_time'] = time.strftime("%Y-%m-%d %H:%M:%S",
                                               time.localtime())
            item['collection'] = "新华网"
            item['type'] = '帖子'
            yield item

            reply_page = response.xpath('//div[@class="lt-page clear"]')
            if reply_page:  # replies span multiple pages — crawl each one
                for each in response.xpath(
                        '//div[@class="lt-page clear"][1]/ul[@class="fl1"]/li'
                ):
                    url_str = each.xpath('./a/@href').extract_first()
                    reply_page_url = 'http://forum.home.news.cn' + url_str
                    yield scrapy.Request(url=reply_page_url,
                                         callback=self.parse_reply)

            elif response.xpath('//div[@id="postreply"]/dl'):  # single page
                try:
                    for each in response.xpath('//div[@id="postreply"]/dl'):
                        item = ArticleItem()
                        item["title"] = title  # reuse the hoisted title
                        item["author"] = each.xpath('./dd/ul[1]/li[1]/a/text()'
                                                    ).extract_first().strip()
                        item["push_time"] = each.xpath(
                            './dd/ul[1]/li[2]/span/text()').extract_first()
                        item["source"] = None
                        art_path = each.xpath('./dd/div[@id]')
                        item["detail"] = art_path.xpath(
                            'string(.)').extract_first().strip()

                        item["url"] = response.url
                        item['catch_time'] = time.strftime(
                            "%Y-%m-%d %H:%M:%S", time.localtime())
                        item['collection'] = "新华网"
                        item['type'] = '帖子回复'
                        yield item
                except Exception as e:
                    print("解析失败", e.__traceback__.tb_lineno, e)
        except Exception as e:
            print("解析失败", e.__traceback__.tb_lineno, e)