Exemplo n.º 1
0
    def content_parse(self, response):
        """Build an item from a Washington Post article response.

        Returns ``None`` (nothing is emitted) when no ``YYYY-M-D`` date can
        be read from ``.author-timestamp`` or when
        ``helper.compare_time(date, self.limittime)`` is negative
        (presumably: the article is older than the crawl limit -- confirm
        against ``helper``). Otherwise returns a populated
        ``CctvOpinionmonitorItem``.
        """
        # extract_first() yields None when the element is missing; fall back
        # to '' so re.findall does not raise TypeError on such pages.
        timestamp = response.css(
            '.author-timestamp::attr(content)').extract_first() or ''
        date = re.findall(r'\d{4}-\d+-\d+', timestamp)
        if not date:
            return
        if helper.compare_time(date[0], self.limittime) < 0:
            return

        pipleitem = CctvOpinionmonitorItem()

        pipleitem['date'] = date[0]
        # The id is whatever follows the first "NN/" in the URL; None is
        # stored when the URL does not have that shape (instead of raising
        # IndexError and losing the whole article).
        id_match = re.search(r'\d{2}/(\S*)', response.url)
        pipleitem['id'] = id_match.group(1) if id_match else None
        pipleitem['url'] = response.url
        pipleitem['title'] = response.xpath(
            '//h1[@itemprop="headline"]/text()').extract_first()
        pipleitem['source'] = 'The Washington Post'
        pipleitem['editor'] = response.css(
            '.author-byline .author-name::text').extract_first()
        pipleitem['content'] = helper.list2str(
            response.xpath('string(//div[@id="article-body"])').extract())
        pipleitem['image_urls'] = helper.list2str(
            response.css('#article-body img::attr(src)').extract())
        # Videos are looked up inside #article-body, like the content and
        # images above. The earlier selector searched
        # div.news-content.info-content, a container none of the other
        # selectors in this method reference.
        pipleitem['video_urls'] = helper.list2str(
            response.css('#article-body video::attr(src)').extract())
        # Engagement counters are not collected for this source.
        pipleitem['share'] = None
        pipleitem['like'] = None
        pipleitem['dislike'] = None
        pipleitem['comment'] = None
        pipleitem['crawl_time'] = helper.get_localtimestamp()

        return pipleitem
    def content_parse(self, response):
        """Build an item from a Reuters article response.

        Returns ``None`` (nothing is emitted) when no ``YYYY-M-D`` date can
        be read from the ``analyticsAttributes.articleDate`` meta tag or
        when ``helper.compare_time(date, self.limittime)`` is negative
        (presumably: the article is older than the crawl limit -- confirm
        against ``helper``). Otherwise returns a populated
        ``CctvOpinionmonitorItem``.
        """
        # extract_first() yields None when the meta tag is missing; fall
        # back to '' so re.findall does not raise TypeError on such pages.
        article_date = response.xpath(
            '//meta[@name="analyticsAttributes.articleDate"]/@content'
        ).extract_first() or ''
        date = re.findall(r'\d{4}-\d+-\d+', article_date)
        if not date:
            return
        if helper.compare_time(date[0], self.limittime) < 0:
            return

        pipleitem = CctvOpinionmonitorItem()

        pipleitem['date'] = date[0]
        # The id is the first "id..." run of non-space characters in the
        # URL; None is stored when there is none (instead of IndexError).
        id_match = re.search(r'id\S*', response.url)
        pipleitem['id'] = id_match.group(0) if id_match else None
        pipleitem['url'] = response.url
        pipleitem['title'] = response.xpath(
            '//h1[@class="ArticleHeader_headline"]/text()').extract_first()
        pipleitem['source'] = 'Reuters'
        # The author is read from the "Author" meta tag rather than from
        # the .BylineBar_byline element.
        pipleitem['editor'] = response.xpath(
            '//meta[@name="Author"]/@content').extract_first()
        pipleitem['content'] = helper.list2str(
            response.xpath(
                'string(//div[@class="StandardArticleBody_body"])').extract())
        pipleitem['image_urls'] = helper.list2str(
            response.css('.StandardArticleBody_body img::attr(src)').extract())
        # Videos and engagement counters are not collected for this source.
        pipleitem['video_urls'] = None
        pipleitem['share'] = None
        pipleitem['like'] = None
        pipleitem['dislike'] = None
        pipleitem['comment'] = None
        pipleitem['crawl_time'] = helper.get_localtimestamp()

        return pipleitem
    def content_parse(self, response):
        """Build an item from an article page whose date arrives via meta.

        The publication date is taken from ``response.meta['date']``, set
        by whichever request scheduled this callback. Returns ``None``
        (nothing is emitted) when that date is missing or empty, or when
        ``helper.compare_time(date, self.limittime)`` is negative
        (presumably: the article is older than the crawl limit -- confirm
        against ``helper``). Otherwise returns a populated
        ``CctvOpinionmonitorItem``.
        """
        # .get() plus a truthiness test treats an absent key, None and ''
        # alike; the earlier len(date) check raised on a missing or None date.
        date = response.meta.get('date')
        if not date:
            return
        if helper.compare_time(date, self.limittime) < 0:
            return

        pipleitem = CctvOpinionmonitorItem()

        pipleitem['date'] = date
        # The id is the lowercase-alphanumeric run after "doc-" (the pattern
        # also requires one more character after it); None is stored when
        # the URL does not match (instead of raising IndexError).
        id_match = re.search(r'doc-([a-z\d]*).', response.url)
        pipleitem['id'] = id_match.group(1) if id_match else None
        pipleitem['url'] = response.url
        pipleitem['title'] = response.xpath(
            '//h1[@id="artibodyTitle"]/text()').extract_first()
        # The source name is the text of the last child inside p.info.
        pipleitem['source'] = response.css(
            'p.info *:nth-last-child(1)::text').extract_first()
        pipleitem['editor'] = None
        body_text = helper.list2str(
            response.xpath('string(//div[@id="artibody"])').extract())
        # Drop ideographic spaces (U+3000) from the body text.
        pipleitem['content'] = body_text.replace(u'\u3000', u'')
        pipleitem['image_urls'] = helper.list2str(
            response.css('#artibody img::attr(src)').extract())
        # Videos and engagement counters are not collected for this source.
        pipleitem['video_urls'] = None
        pipleitem['share'] = None
        pipleitem['like'] = None
        pipleitem['dislike'] = None
        pipleitem['comment'] = None
        pipleitem['crawl_time'] = helper.get_localtimestamp()

        return pipleitem
    def content_parse(self, response):
        """Build an item from an article page described by ``article.*`` meta tags.

        Returns ``None`` (nothing is emitted) when no ``YYYY-M-D`` date can
        be read from the ``article.published`` meta tag or when
        ``helper.compare_time(date, self.limittime)`` is negative
        (presumably: the article is older than the crawl limit -- confirm
        against ``helper``). Otherwise returns a populated
        ``CctvOpinionmonitorItem``.
        """
        # extract_first() yields None when the meta tag is missing; fall
        # back to '' so re.findall does not raise TypeError on such pages.
        published = response.xpath(
            '//meta[@name="article.published"]/@content'
        ).extract_first() or ''
        date = re.findall(r'\d{4}-\d+-\d+', published)
        if not date:
            return
        if helper.compare_time(date[0], self.limittime) < 0:
            return

        pipleitem = CctvOpinionmonitorItem()

        pipleitem['date'] = date[0]
        # The id is the first "-" followed by five or more digits (the dash
        # is part of the stored value); None is stored when the URL has no
        # such run (instead of raising IndexError).
        id_match = re.search(r'-\d{5,}', response.url)
        pipleitem['id'] = id_match.group(0) if id_match else None
        pipleitem['url'] = response.url
        # The title comes from the document <title>, not from the
        # h1.wsj-article-headline element.
        pipleitem['title'] = response.css('head title::text').extract_first()
        pipleitem['source'] = response.xpath(
            '//meta[@name="page.content.source"]/@content').extract_first()
        pipleitem['editor'] = response.xpath(
            '//meta[@name="author"]/@content').extract_first()
        pipleitem['content'] = helper.list2str(
            response.xpath(
                'string(//div[@class="wsj-snippet-body"])').extract())
        # Media and engagement counters are not collected for this source.
        pipleitem['image_urls'] = None
        pipleitem['video_urls'] = None
        pipleitem['share'] = None
        pipleitem['like'] = None
        pipleitem['dislike'] = None
        pipleitem['comment'] = None
        pipleitem['crawl_time'] = helper.get_localtimestamp()

        return pipleitem
    def content_parse(self, response):
        """Build a minimal item from an arbitrary page.

        The date is the first ``YYYY-M-D`` pattern found anywhere in the
        serialized ``<head>`` element. Returns ``None`` (nothing is emitted)
        when there is no such pattern or when
        ``helper.compare_time(date, self.limittime)`` is negative
        (presumably: the page is older than the crawl limit -- confirm
        against ``helper``). Otherwise returns a ``CctvOpinionmonitorItem``
        carrying only the date, URL, title and media URLs.
        """
        # extract_first() yields None for a document without <head>; fall
        # back to '' so re.findall does not raise TypeError.
        head_html = response.css('head').extract_first() or ''
        matches = re.findall(r'\d{4}-\d+-\d+', head_html)
        if not matches:
            return
        date = matches[0]
        if helper.compare_time(date, self.limittime) < 0:
            return

        pipleitem = CctvOpinionmonitorItem()

        pipleitem['date'] = date
        pipleitem['id'] = None
        pipleitem['url'] = response.url
        pipleitem['title'] = response.css('head title::text').extract_first()
        pipleitem['source'] = None
        pipleitem['editor'] = None
        pipleitem['content'] = None
        # Media URLs are gathered from the whole page, not a specific
        # article container.
        pipleitem['image_urls'] = helper.list2str(
            response.css('img::attr(src)').extract())
        pipleitem['video_urls'] = helper.list2str(
            response.css('video::attr(src)').extract())
        pipleitem['share'] = None
        pipleitem['like'] = None
        pipleitem['dislike'] = None
        pipleitem['comment'] = None
        pipleitem['crawl_time'] = helper.get_localtimestamp()

        return pipleitem
    def content_parse(self, response):
        """Build an item from a YouTube watch page.

        Returns ``None`` (nothing is emitted) when the date element is
        missing, empty or not a ``str``, when its text contains
        ``getSimpleString`` or ``ago``, or when
        ``helper.compare_time(helper.formatTime(date), self.limittime)`` is
        negative (presumably: the video is older than the crawl limit --
        confirm against ``helper``). Otherwise returns a populated
        ``CctvOpinionmonitorItem`` including view, like, dislike and
        comment counts.
        """
        date = response.xpath(
            '//span[@class="date style-scope ytd-video-secondary-info-renderer"]/text()'
        ).extract_first()
        log.msg(message=date, level=log.WARNING)
        # A missing element gives None; the earlier len(date) call raised
        # TypeError on it before its own None check could run.
        if not date or not isinstance(date, str):
            return
        # Skip dates containing "getSimpleString" (presumably an unrendered
        # placeholder) or "ago" (presumably a relative date). Substring
        # tests replace find(...) > 0, which missed a match at index 0.
        if 'getSimpleString' in date or 'ago' in date:
            return
        # Format once and reuse the result for both the check and the item.
        formatted_date = helper.formatTime(date)
        if helper.compare_time(formatted_date, self.limittime) < 0:
            return

        pipleitem = CctvOpinionmonitorItem()

        pipleitem['date'] = formatted_date
        # NOTE(review): the trailing "." in this pattern consumes one
        # character, so when the video id is the last thing in the URL its
        # final character is left out of the captured group -- confirm
        # whether that is intended before changing it. None is stored when
        # the URL has no "v=" parameter (instead of raising IndexError).
        id_match = re.search(r'v=(\S*).', response.url)
        pipleitem['id'] = id_match.group(1) if id_match else None
        pipleitem['url'] = response.url
        pipleitem['title'] = response.xpath(
            '//div[@id="container"]/h1/yt-formatted-string/text()'
        ).extract_first()
        pipleitem['source'] = 'Youtube'
        pipleitem['editor'] = response.css(
            '#owner-name a::text').extract_first()
        pipleitem['content'] = None
        # r'\d*' also matches the empty string, so the first match is ''
        # whenever the text does not start with a digit. "or ''" keeps a
        # missing element from raising TypeError inside re.findall.
        views_text = response.xpath(
            '//span[@class="view-count style-scope yt-view-count-renderer"]/text()'
        ).extract_first()
        views = re.findall(r'\d*', views_text or '')
        if views:
            pipleitem['views'] = views[0]
        pipleitem['image_urls'] = None
        pipleitem['video_urls'] = response.css(
            'video::attr(src)').extract_first()
        pipleitem['share'] = None
        # The first two aria-labels are used as the like and dislike
        # labels; any label containing "No" counts as zero.
        labels = response.xpath(
            '//yt-formatted-string[@id="text"]/@aria-label').extract()
        if len(labels) < 2:
            labels = ['0', '0']
        labels = ['0' if re.search('No', label) else label
                  for label in labels]
        pipleitem['like'] = re.findall(r'\d*', labels[0])[0]
        pipleitem['dislike'] = re.findall(r'\d*', labels[1])[0]

        # The comment count defaults to '0' and is replaced only when the
        # counter text starts with digits.
        pipleitem['comment'] = '0'
        comment_text = response.xpath(
            '//h2[@id="count"]/yt-formatted-string/text()').extract_first()
        comment = re.findall(r'\d*', comment_text or '')[0]
        if comment:
            pipleitem['comment'] = comment
        pipleitem['crawl_time'] = helper.get_localtimestamp()

        return pipleitem
Exemplo n.º 7
0
    def content_parse(self, response):
        """Build an item from an article page of the '华人头条' site.

        Returns ``None`` (nothing is emitted) when ``.horn-txt p`` is
        missing or holds no ``YYYY-M-D [H:M:S]`` date, or when
        ``helper.compare_time(date, self.limittime)`` is negative
        (presumably: the article is older than the crawl limit -- confirm
        against ``helper``). Otherwise returns a populated
        ``CctvOpinionmonitorItem``; the article id is taken from
        ``response.meta['id']``.
        """
        raw_date = response.css('.horn-txt p::text').extract_first()
        if raw_date is None:
            return
        # Text without a recognizable date is skipped too; indexing the
        # empty match list used to raise IndexError here.
        date_matches = re.findall(r'\d{4}-\d+-\d+\s*[\d:]*', raw_date)
        if not date_matches:
            return
        datestr = date_matches[0]
        if helper.compare_time(datestr, self.limittime) < 0:
            return

        pipleitem = CctvOpinionmonitorItem()

        pipleitem['date'] = datestr
        pipleitem['id'] = response.meta['id']
        pipleitem['url'] = response.url
        pipleitem['title'] = response.css(
            '.new-headline::text').extract_first()
        pipleitem['source'] = '华人头条'
        pipleitem['editor'] = response.css(
            '.horn-mark p::text').extract_first()
        pipleitem['content'] = helper.list2str(
            response.xpath(
                'string(//div[@class="news-content info-content"])').extract())
        pipleitem['image_urls'] = helper.list2str(
            response.xpath('//div[@class="news-content info-content"]').css(
                ' img::attr(src)').extract())
        pipleitem['video_urls'] = helper.list2str(
            response.xpath('//div[@class="news-content info-content"]').css(
                ' video::attr(src)').extract())
        pipleitem['share'] = None
        pipleitem['like'] = None
        pipleitem['dislike'] = None

        # re.findall always returns a list, so the earlier "!= None" test
        # could never select the None branch. Check for an empty result
        # instead, and tolerate a missing counter element.
        comment_text = response.css('.comment-num span::text').extract_first()
        comment_count = re.findall(r'\d+', comment_text or '')
        pipleitem['comment'] = comment_count[0] if comment_count else None

        pipleitem['crawl_time'] = helper.get_localtimestamp()

        return pipleitem
Exemplo n.º 8
0
    def content_parse(self, response):
        """Build an item from an article page of the '今日华人网' site.

        No date is extracted and no date filtering is applied, so an item
        is returned for every response. The title is taken from
        ``response.meta['title']`` and the id from the ``no=`` parameter of
        the URL.
        """
        pipleitem = CctvOpinionmonitorItem()

        pipleitem['date'] = None
        # The id is the lowercase-alphanumeric run after "no="; None is
        # stored when the URL carries no such parameter (instead of raising
        # IndexError and losing the whole article).
        id_match = re.search(r'no=([a-z\d]*)', response.url)
        pipleitem['id'] = id_match.group(1) if id_match else None
        pipleitem['url'] = response.url
        pipleitem['title'] = response.meta['title']
        # The body is the text of the first <p> carrying this exact inline
        # style attribute.
        pipleitem['content'] = response.xpath(
            'string(//p[@style="margin-right:8px;margin-bottom:18px"])'
        ).extract_first()
        pipleitem['source'] = '今日华人网'
        pipleitem['editor'] = None
        # Media and engagement counters are not collected for this source.
        pipleitem['image_urls'] = None
        pipleitem['video_urls'] = None
        pipleitem['share'] = None
        pipleitem['like'] = None
        pipleitem['dislike'] = None
        pipleitem['comment'] = None
        pipleitem['crawl_time'] = helper.get_localtimestamp()

        return pipleitem
# -*- coding: utf-8 -*-