Exemplo n.º 1
0
    def content_parse(self, response):
        """Build a CctvOpinionmonitorItem from a Washington Post article page.

        The page is skipped (None is returned) when no ``YYYY-M-D`` date can
        be read from the ``content`` attribute of ``.author-timestamp``, or
        when ``helper.compare_time(date, self.limittime)`` is negative.

        :param response: response exposing ``css``, ``xpath`` and ``url``.
        :return: the populated item, or None when the page is skipped.
        """
        # extract_first() gives None when the element is missing; feeding None
        # to re.findall raises TypeError, so treat it as "no date found".
        timestamp = response.css(
            '.author-timestamp::attr(content)').extract_first()
        date = re.findall(r'\d{4}-\d+-\d+', timestamp) if timestamp else []
        if not date:
            return
        # NOTE(review): presumably a negative result means the article predates
        # self.limittime -- confirm against helper.compare_time.
        if helper.compare_time(date[0], self.limittime) < 0:
            return

        pipleitem = CctvOpinionmonitorItem()

        pipleitem['date'] = date[0]
        # Everything after the first "<two digits>/" in the URL is the id.
        pipleitem['id'] = re.findall(r'\d{2}/(\S*)', response.url)[0]
        pipleitem['url'] = response.url
        pipleitem['title'] = response.xpath(
            '//h1[@itemprop="headline"]/text()').extract_first()
        pipleitem['source'] = 'The Washington Post'
        pipleitem['editor'] = response.css(
            '.author-byline .author-name::text').extract_first()
        pipleitem['content'] = helper.list2str(
            response.xpath('string(//div[@id="article-body"])').extract())
        pipleitem['image_urls'] = helper.list2str(
            response.css('#article-body img::attr(src)').extract())
        # NOTE(review): this container class differs from the "article-body"
        # element used for content/images above; it may have been copied from
        # another spider -- confirm the selector matches this site.
        pipleitem['video_urls'] = helper.list2str(
            response.xpath('//div[@class="news-content info-content"]').css(
                ' video::attr(src)').extract())
        # Engagement counters are not collected for this source.
        pipleitem['share'] = None
        pipleitem['like'] = None
        pipleitem['dislike'] = None
        pipleitem['comment'] = None
        pipleitem['crawl_time'] = helper.get_localtimestamp()

        return pipleitem
    def content_parse(self, response):
        """Build a CctvOpinionmonitorItem from a Reuters article page.

        The page is skipped (None is returned) when the
        ``analyticsAttributes.articleDate`` meta tag yields no ``YYYY-M-D``
        date, or when ``helper.compare_time(date, self.limittime)`` is
        negative.

        :param response: response exposing ``css``, ``xpath`` and ``url``.
        :return: the populated item, or None when the page is skipped.
        """
        # extract_first() gives None when the meta tag is missing; feeding None
        # to re.findall raises TypeError, so treat it as "no date found".
        raw_date = response.xpath(
            '//meta[@name="analyticsAttributes.articleDate"]/@content'
        ).extract_first()
        date = re.findall(r'\d{4}-\d+-\d+', raw_date) if raw_date else []
        if not date:
            return
        # NOTE(review): presumably a negative result means the article predates
        # self.limittime -- confirm against helper.compare_time.
        if helper.compare_time(date[0], self.limittime) < 0:
            return

        pipleitem = CctvOpinionmonitorItem()

        pipleitem['date'] = date[0]
        # The id is the first run of non-space characters starting at "id".
        pipleitem['id'] = re.findall(r'id\S*', response.url)[0]
        pipleitem['url'] = response.url
        pipleitem['title'] = response.xpath(
            '//h1[@class="ArticleHeader_headline"]/text()').extract_first()
        pipleitem['source'] = 'Reuters'
        pipleitem['editor'] = response.xpath(
            '//meta[@name="Author"]/@content').extract_first()
        pipleitem['content'] = helper.list2str(
            response.xpath(
                'string(//div[@class="StandardArticleBody_body"])').extract())
        pipleitem['image_urls'] = helper.list2str(
            response.css('.StandardArticleBody_body img::attr(src)').extract())
        # Videos and engagement counters are not collected for this source.
        pipleitem['video_urls'] = None
        pipleitem['share'] = None
        pipleitem['like'] = None
        pipleitem['dislike'] = None
        pipleitem['comment'] = None
        pipleitem['crawl_time'] = helper.get_localtimestamp()

        return pipleitem
    def content_parse(self, response):
        """Build a CctvOpinionmonitorItem from an article page whose date was
        passed along in ``response.meta['date']``.

        The page is skipped (None is returned) when that date is empty or
        None, or when ``helper.compare_time(date, self.limittime)`` is
        negative.

        :param response: response exposing ``css``, ``xpath``, ``url`` and a
            ``meta`` mapping that contains a ``'date'`` entry.
        :return: the populated item, or None when the page is skipped.
        """
        date = response.meta['date']
        # Truthiness covers both '' and None; len(None) would raise TypeError.
        if not date:
            return
        # NOTE(review): presumably a negative result means the article predates
        # self.limittime -- confirm against helper.compare_time.
        if helper.compare_time(date, self.limittime) < 0:
            return

        pipleitem = CctvOpinionmonitorItem()

        pipleitem['date'] = date
        # Captures the lowercase-alphanumeric token following "doc-" in the URL.
        pipleitem['id'] = re.findall(r'doc-([a-z\d]*).', response.url)[0]
        pipleitem['url'] = response.url
        pipleitem['title'] = response.xpath(
            '//h1[@id="artibodyTitle"]/text()').extract_first()
        # The source name is read from the last child of the "p.info" line.
        pipleitem['source'] = response.css(
            'p.info *:nth-last-child(1)::text').extract_first()
        pipleitem['editor'] = None
        body_text = helper.list2str(
            response.xpath('string(//div[@id="artibody"])').extract())
        # Strip ideographic (full-width) spaces from the body text.
        pipleitem['content'] = body_text.replace(u'\u3000', u'')
        pipleitem['image_urls'] = helper.list2str(
            response.css('#artibody img::attr(src)').extract())
        # Videos and engagement counters are not collected for this source.
        pipleitem['video_urls'] = None
        pipleitem['share'] = None
        pipleitem['like'] = None
        pipleitem['dislike'] = None
        pipleitem['comment'] = None
        pipleitem['crawl_time'] = helper.get_localtimestamp()

        return pipleitem
    def content_parse(self, response):
        """Build a CctvOpinionmonitorItem from an arbitrary page using only
        generic selectors.

        The date is the first ``YYYY-M-D`` pattern found in the serialized
        ``<head>`` element. The page is skipped (None is returned) when no
        such date exists or when
        ``helper.compare_time(date, self.limittime)`` is negative.

        :param response: response exposing ``css`` and ``url``.
        :return: the populated item, or None when the page is skipped.
        """
        # extract_first() gives None when there is no <head>; feeding None to
        # re.findall raises TypeError, so treat it as "no date found".
        head_html = response.css('head').extract_first()
        matches = re.findall(r'\d{4}-\d+-\d+', head_html) if head_html else []
        if not matches:
            return
        date = matches[0]
        # NOTE(review): presumably a negative result means the page predates
        # self.limittime -- confirm against helper.compare_time.
        if helper.compare_time(date, self.limittime) < 0:
            return

        pipleitem = CctvOpinionmonitorItem()

        pipleitem['date'] = date
        # No site-specific markup is assumed, so id/source/editor/content
        # are left unset.
        pipleitem['id'] = None
        pipleitem['url'] = response.url
        pipleitem['title'] = response.css('head title::text').extract_first()
        pipleitem['source'] = None
        pipleitem['editor'] = None
        pipleitem['content'] = None
        pipleitem['image_urls'] = helper.list2str(
            response.css('img::attr(src)').extract())
        pipleitem['video_urls'] = helper.list2str(
            response.css('video::attr(src)').extract())
        pipleitem['share'] = None
        pipleitem['like'] = None
        pipleitem['dislike'] = None
        pipleitem['comment'] = None
        pipleitem['crawl_time'] = helper.get_localtimestamp()

        return pipleitem
    def content_parse(self, response):
        """Build a CctvOpinionmonitorItem from an article page that exposes
        its metadata through ``<meta>`` tags (``article.published``,
        ``page.content.source``, ``author``).

        The page is skipped (None is returned) when the ``article.published``
        meta tag yields no ``YYYY-M-D`` date, or when
        ``helper.compare_time(date, self.limittime)`` is negative.

        :param response: response exposing ``css``, ``xpath`` and ``url``.
        :return: the populated item, or None when the page is skipped.
        """
        # extract_first() gives None when the meta tag is missing; feeding None
        # to re.findall raises TypeError, so treat it as "no date found".
        published = response.xpath(
            '//meta[@name="article.published"]/@content').extract_first()
        date = re.findall(r'\d{4}-\d+-\d+', published) if published else []
        if not date:
            return
        # NOTE(review): presumably a negative result means the article predates
        # self.limittime -- confirm against helper.compare_time.
        if helper.compare_time(date[0], self.limittime) < 0:
            return

        pipleitem = CctvOpinionmonitorItem()

        pipleitem['date'] = date[0]
        # The id keeps the leading "-" that precedes the 5+ digit run.
        pipleitem['id'] = re.findall(r'-\d{5,}', response.url)[0]
        pipleitem['url'] = response.url
        pipleitem['title'] = response.css('head title::text').extract_first()
        pipleitem['source'] = response.xpath(
            '//meta[@name="page.content.source"]/@content').extract_first()
        pipleitem['editor'] = response.xpath(
            '//meta[@name="author"]/@content').extract_first()
        pipleitem['content'] = helper.list2str(
            response.xpath(
                'string(//div[@class="wsj-snippet-body"])').extract())
        # Media and engagement counters are not collected for this source.
        pipleitem['image_urls'] = None
        pipleitem['video_urls'] = None
        pipleitem['share'] = None
        pipleitem['like'] = None
        pipleitem['dislike'] = None
        pipleitem['comment'] = None
        pipleitem['crawl_time'] = helper.get_localtimestamp()

        return pipleitem
Exemplo n.º 6
0
    def content_parse(self, response):
        """Build a CctvOpinionmonitorItem from a news page whose id was passed
        along in ``response.meta['id']``.

        The page is skipped (None is returned) when the ``.horn-txt p`` text
        is missing or contains no ``YYYY-M-D [time]`` pattern, or when
        ``helper.compare_time(date, self.limittime)`` is negative.

        :param response: response exposing ``css``, ``xpath``, ``url`` and a
            ``meta`` mapping that contains an ``'id'`` entry.
        :return: the populated item, or None when the page is skipped.
        """
        date_text = response.css('.horn-txt p::text').extract_first()
        if date_text is None:
            return
        # The text may hold no date at all; indexing [0] on an empty match
        # list would raise IndexError, so skip the page instead.
        date_matches = re.findall(r'\d{4}-\d+-\d+\s*[\d:]*', date_text)
        if not date_matches:
            return
        datestr = date_matches[0]
        # NOTE(review): presumably a negative result means the article predates
        # self.limittime -- confirm against helper.compare_time.
        if helper.compare_time(datestr, self.limittime) < 0:
            return

        pipleitem = CctvOpinionmonitorItem()

        pipleitem['date'] = datestr
        pipleitem['id'] = response.meta['id']
        pipleitem['url'] = response.url
        pipleitem['title'] = response.css(
            '.new-headline::text').extract_first()
        pipleitem['source'] = '华人头条'
        pipleitem['editor'] = response.css(
            '.horn-mark p::text').extract_first()
        pipleitem['content'] = helper.list2str(
            response.xpath(
                'string(//div[@class="news-content info-content"])').extract())
        pipleitem['image_urls'] = helper.list2str(
            response.xpath('//div[@class="news-content info-content"]').css(
                ' img::attr(src)').extract())
        pipleitem['video_urls'] = helper.list2str(
            response.xpath('//div[@class="news-content info-content"]').css(
                ' video::attr(src)').extract())
        pipleitem['share'] = None
        pipleitem['like'] = None
        pipleitem['dislike'] = None

        # re.findall never returns None, so test for an empty result; also
        # guard against the counter element being absent (extract_first()
        # returns None, which re.findall cannot accept).
        comment_text = response.css('.comment-num span::text').extract_first()
        comment_count = re.findall(r'\d+', comment_text) if comment_text else []
        pipleitem['comment'] = comment_count[0] if comment_count else None

        pipleitem['crawl_time'] = helper.get_localtimestamp()

        return pipleitem
# -*- coding: utf-8 -*-