def parse(self, response):
    """Parse a spp.gov.cn news page: yield one item per article found,
    then follow every in-site '/spp/' link for further crawling.

    :param response: scrapy Response for the fetched page.
    :yields: GovermentNewsItem for the article (if complete) and
             scrapy.Request objects for follow-up pages.
    """
    title = response.css('.detail_tit::text').extract()  # article title
    content = response.css('#fontzoom p::text').extract()  # body paragraphs
    # Date/source line, e.g. "时间:2019-.. 来源:.."; extract_first may return None.
    publish_date = response.xpath('//div[@class="detail_extend1 fl"]//text()').extract_first()

    # Truthiness guard also covers publish_date being None — the original
    # called len() on it and would raise TypeError on pages without the node.
    if title and publish_date and content:
        news = GovermentNewsItem()
        news['title'] = title[0]
        news['content'] = content
        # re.findall returns a list; kept as-is to preserve the stored format.
        news['publish_date'] = re.findall(r'时间:(\d+-\d+\-\d+).*?来源:.*?', publish_date, re.S)
        news['source'] = re.findall(r'时间.*?来源:(\w+)', publish_date, re.S)
        yield news

    page = response.css('a::attr(href)').extract()
    if len(page) == 0:
        sleep(1)  # back off briefly when the page yielded no links at all
    page = list(filter(lambda x: x != '', page))
    page_company = list(filter(lambda x: '/spp/' in x, page))

    for p in page_company:
        # Prefix relative URLs with the site root; absolute ones pass through.
        # BUGFIX: the original yielded absolute ('http') URLs twice — once
        # inside the if-branch and once after it. Each URL is now yielded once.
        if "http" not in p:
            p = "https://www.spp.gov.cn/" + p
        yield scrapy.Request(url=p, callback=self.parse, headers=self.headers1,
                             meta={'dont_merge_cookies': True})
Exemplo n.º 2
0
    def parse(self, response):
        """Parse a sc.gov.cn news page: yield the article item (when the
        required fields are present), then follow in-site links whose
        path contains '/2019'.

        :param response: scrapy Response for the fetched page.
        :yields: GovermentNewsItem and scrapy.Request objects.
        """
        title = response.xpath('/html/head/title//text()').extract()
        content = response.xpath('//*[@id="cmsArticleContent"]/ucapcontent/p//text()').extract()
        publish_date = response.xpath('//*[@id="articleattribute"]/li[1]//text()').extract()
        source = response.xpath('//*[@id="articleattribute"]/li[2]//text()').extract()

        # Guard every list that is indexed below (title[0], publish_date[0]),
        # not only content — the original raised IndexError on pages that had
        # body text but lacked a title or date node.
        if content and title and publish_date:
            items = GovermentNewsItem()
            items['title'] = title[0].replace('\r', '').replace(' ', '').replace('\n', '')
            items['content'] = content
            items['publish_date'] = publish_date[0].replace('\r', '').replace(' ', '').replace('\n', '')
            items['source'] = source
            yield items

        urls = response.css('#content a::attr(href)').extract()
        if len(urls) == 0:
            sleep(1)  # back off briefly when no links were found
        urls = list(filter(lambda x: x != '', urls))
        urls_company = list(filter(lambda x: '/2019' in x, urls))

        for u in urls_company:
            # Prefix relative URLs with the site root; absolute ones pass through.
            if 'http' not in u:
                u = 'http://www.sc.gov.cn' + u
            yield scrapy.Request(url=u, callback=self.parse, meta={'dont_merge_cookies': True})
Exemplo n.º 3
0
 def CN(self,response):
     """Extract a single gov.cn article page into a GovermentNewsItem
     (content paragraphs, headline, publish date, source) and yield it."""
     xp = response.xpath
     item = GovermentNewsItem()
     item['title'] = xp('//h1/text()').extract()
     item['content'] = xp('//*[@id="UCAP-CONTENT"]/p//text()').extract()
     # Keep only the first date text node, preserving the list type.
     item['publish_date'] = xp('//div[@class="pages-date"]/text()').extract()[:1]
     item['source'] = xp('//div[@class="pages-date"]/span/text()').extract()
     yield item
Exemplo n.º 4
0
    def parse(self, response):
        """Parse an '/art/'-style news page: yield the article item, then
        follow every '/art/' link found in the raw HTML.

        :param response: scrapy Response for the fetched page.
        :yields: GovermentNewsItem and scrapy.Request objects.
        """
        # BUGFIX: the original xpath ended in '//text' (an element named
        # 'text', which never matches) so title was always empty and no item
        # was ever yielded; '//text()' selects the title's text nodes.
        title = response.xpath('/html/head/title//text()').extract()
        content = response.css('#zoom p::text').extract()
        source = response.xpath(
            '//*[@id="zoom"]/div/font[1]/text()[2]').extract()
        time = re.findall(r'<font>发布时间:(.*?)</font>', response.text, re.M)

        # Guard title as well as content: title[0] is indexed below.
        if content and title:
            items = GovermentNewsItem()
            items['title'] = title[0].replace('<p>', '').replace(
                '\r', '').replace(' ', '').replace('</p>',
                                                   '').replace('\n', '')
            items['content'] = content
            items['time'] = time
            items['source'] = source
            yield items

        # Link discovery via regex on the raw HTML (debug print removed).
        urls = re.findall(r'<a href="(.*?)" tar', response.text, re.M)
        urls = list(filter(lambda x: x != '', urls))
        urls_company = list(filter(lambda x: '/art/' in x, urls))

        for u in urls_company:
            yield scrapy.Request(url=u,
                                 callback=self.parse,
                                 meta={'dont_merge_cookies': True})
Exemplo n.º 5
0
    def parse(self, response):
        """Parse a cq.gov.cn article page: yield the article item, then
        follow every 'content_' link found on the page.

        :param response: scrapy Response for the fetched page.
        :yields: GovermentNewsItem and scrapy.Request objects.
        """
        title = response.xpath(
            '/html/body/div/div[3]/div/div/article/div/h2//text()').extract()
        content = response.xpath(
            '/html/body/div/div[3]/div/div/article/div/div[2]//text()'
        ).extract()

        # content[1] is read below, so require at least TWO text nodes —
        # the original only checked non-emptiness and raised IndexError on
        # pages whose content div held a single text node.
        if title and len(content) > 1:
            items = GovermentNewsItem()
            items['title'] = title[0].replace('\r', '').replace(' ',
                                                                '').replace(
                                                                    '\n', '')
            items['content'] = content[1].replace('\r',
                                                  '').replace(' ', '').replace(
                                                      '\n', '')
            items['publish_date'] = response.xpath(
                '/html/body/div/div[3]/div/div/article/div/div[1]/span/span[2]//text()'
            ).extract()
            items['source'] = response.xpath(
                '/html/body/div/div[3]/div/div/article/div/div[1]/span/span[1]//text()'
            ).extract()
            yield items

        urls = response.css('a::attr(href)').extract()
        if len(urls) == 0:
            sleep(1)  # back off briefly when no links were found
        urls = list(filter(lambda x: x != '', urls))
        urls_company = list(filter(lambda x: 'content_' in x, urls))

        for u in urls_company:
            # Prefix relative URLs with the site root; absolute ones pass through.
            if 'http' not in u:
                u = 'http://www.cq.gov.cn' + u
            yield scrapy.Request(url=u,
                                 callback=self.parse,
                                 meta={'dont_merge_cookies': True})