示例#1
0
    def parse(self, response):
        """Parse an article page and yield a populated ScrapenewsItem.

        Skips the page when the publication date element is missing.
        """
        canonical_url = response.xpath(
            '//link[@rel="canonical"]/@href').extract_first()

        title = response.xpath('//h1/span/text()').extract_first()
        self.logger.info('%s %s', response.url, title)
        article_body = response.css('div.text')
        if article_body:
            # join multiple text sections
            body_html = " ".join(article_body.extract())
            byline = response.css('span.article-author').xpath(
                'span/text()').extract_first()

            publication_date_str = response.css(
                'span.article-pub-date::text').extract_first()
            if not publication_date_str:
                # extract_first() returns None when the element is missing;
                # calling .strip() on None would raise AttributeError.
                self.logger.info("No publication date found for %s",
                                 response.url)
                return
            # e.g. '30 August 2018'
            publication_date = datetime.strptime(
                publication_date_str.strip(), '%d %B %Y')
            # naive datetime, e.g. datetime.datetime(2018, 8, 30, 0, 0)
            publication_date = SAST.localize(publication_date)

            item = ScrapenewsItem()
            item['body_html'] = body_html
            item['title'] = title
            item['byline'] = byline
            item['published_at'] = publication_date.isoformat()
            item['retrieved_at'] = datetime.utcnow().isoformat()
            item['url'] = canonical_url
            # second-to-last path segment serves as the article slug
            item['file_name'] = response.url.split('/')[-2]
            item['spider_name'] = self.name
            item['publication_name'] = self.publication_name

            yield item
示例#2
0
    def parse(self, response):
        """Parse an article page and yield a populated ScrapenewsItem.

        Skips the page when the publication date element is missing.
        """
        canonical_url = response.xpath(
            '//link[@rel="canonical"]/@href').extract_first()
        title = response.xpath('//h1/span/text()').extract_first()
        self.logger.info('%s %s', response.url, title)
        # should we be using canonical_url instead of response.url for the above?
        article_body = response.css('div.article-widget-text')

        if article_body:
            body_html = " ".join(article_body.css('::text').extract())
            byline = response.css('span.article-author').xpath(
                '@data-author').extract_first()

            publication_date_str = response.css(
                'span.article-pub-date::text').extract_first()
            if not publication_date_str:
                # extract_first() returns None when the element is missing;
                # calling .strip() on None would raise AttributeError.
                self.logger.info("No publication date found for %s",
                                 response.url)
                return
            # e.g. u'26 June 2018 - 07:28'
            publication_date = datetime.strptime(
                publication_date_str.strip(), '%d %B %Y - %H:%M')
            publication_date = SAST.localize(publication_date)

            item = ScrapenewsItem()
            item['body_html'] = body_html
            item['title'] = title
            item['byline'] = byline
            item['published_at'] = publication_date.isoformat()
            item['retrieved_at'] = datetime.utcnow().isoformat()
            item['url'] = canonical_url
            item['file_name'] = response.url.split('/')[-2]
            # should we be using canonical_url instead of response.url for the above?
            item['spider_name'] = self.name
            item['publication_name'] = self.publication_name

            yield item
示例#3
0
    def parse_item(self, response):
        """Yield a ScrapenewsItem for pages whose og:type is 'activity'."""
        title = response.xpath(
            '//h1[contains(@class, "entry-title")]/text()').extract_first()
        self.logger.info('%s %s', response.url, title)
        og_type = response.xpath(
            '//meta[@property="og:type"]/@content').extract_first()
        if og_type != 'activity':
            return

        body_html = response.xpath(
            '//div[contains(@class, "td-post-content")]').extract_first()
        published_at = response.xpath('//time/@datetime').extract_first()

        if not body_html:
            self.logger.info("No body found for %s", response.url)
            return

        item = ScrapenewsItem()
        item['body_html'] = body_html
        item['title'] = title
        item['published_at'] = published_at
        item['retrieved_at'] = datetime.utcnow().isoformat()
        item['url'] = response.url
        item['file_name'] = response.url.split('/')[-1]
        item['publication_name'] = self.publication_name
        item['spider_name'] = self.name

        yield item
示例#4
0
    def parse(self, response):
        """Parse an article page, preferring the canonical URL when present.

        Opinion pieces under /opinionistas are deliberately skipped.
        """
        url = response.xpath(
            '//link[@rel="canonical"]/@href').extract_first() or response.url

        if '/opinionistas' in url:
            self.logger.info("Ignoring %s", url)
            return

        title = response.xpath(
            '//div[@class="titles"]/h1/text()').extract_first()
        self.logger.info('%s %s', url, title)

        article_body = response.xpath('//div[@class="article-container"]')
        if not article_body:
            return

        byline = response.xpath(
            '//meta[@name="author"]/@content').extract_first()
        publication_date_str = response.xpath(
            '//meta[@name="published"]/@content').extract_first()
        published = SAST.localize(
            datetime.strptime(publication_date_str, '%Y-%m-%d'))

        item = ScrapenewsItem()
        item['body_html'] = article_body.extract_first()
        item['title'] = title
        item['byline'] = byline
        item['published_at'] = published.isoformat()
        item['retrieved_at'] = datetime.utcnow().isoformat()
        item['url'] = url
        # second-to-last path segment serves as the article slug
        item['file_name'] = url.split('/')[-2]
        item['spider_name'] = self.name
        item['publication_name'] = self.publication_name

        yield item
示例#5
0
    def parse(self, response):
        """Parse an article page and yield a populated ScrapenewsItem.

        Skips the page when no publication date can be extracted.
        """
        title = response.xpath('//header/h1/text()').extract_first()
        self.logger.info('%s %s', response.url, title)
        article_body = response.xpath('//div[@itemprop="articleBody"]')
        if article_body:
            body_html = article_body.extract_first()
            byline = response.xpath(
                '//span[@itemprop="author"]/strong/text()').extract_first()
            publication_date_str = response.xpath(
                '//span[@itemprop="datePublished"]/@content').extract_first()
            if not publication_date_str:
                # extract_first() returns None when the element is missing;
                # calling .strip() on None would raise AttributeError.
                self.logger.info("No publication date found for %s",
                                 response.url)
                return

            # keep only the 'YYYY-MM-DDTHH:MM' prefix before parsing
            publication_date_str = publication_date_str.strip()[:16]
            publication_date = datetime.strptime(publication_date_str,
                                                 '%Y-%m-%dT%H:%M')
            publication_date = SAST.localize(publication_date)

            item = ScrapenewsItem()
            item['body_html'] = body_html
            item['title'] = title
            item['byline'] = byline
            item['published_at'] = publication_date.isoformat()
            item['retrieved_at'] = datetime.utcnow().isoformat()
            item['url'] = response.url
            item['file_name'] = response.url.split('/')[-1]
            item['spider_name'] = self.name

            item['publication_name'] = self.publication_name

            yield item
示例#6
0
    def parse_item(self, response):
        """Yield a ScrapenewsItem for pages whose og:type is 'article'."""
        canonical_url = response.xpath(
            '//link[@rel="canonical"]/@href').extract_first()
        title = response.css('h2 > span').xpath('text()').extract_first()
        self.logger.info('%s %s', response.url, title)
        # should we be using canonical_url instead of response.url for the above?
        og_type = response.xpath(
            '//meta[@property="og:type"]/@content').extract_first()
        if og_type != 'article':
            return

        body_html = " ".join(response.css('article.article-full p').extract())
        byline = response.css(
            '.byline span[itemprop="author"] a ::text').extract_first()

        publication_date_str = response.xpath(
            '//meta[@itemprop="datePublished"]/@content').extract_first()
        # alas no time portion: possibly use timelib to 
        published = SAST.localize(
            datetime.strptime(publication_date_str, '%Y-%m-%d'))

        if not body_html:
            self.logger.info("No body found for %s", response.url)
            return

        item = ScrapenewsItem()
        item['body_html'] = body_html
        item['title'] = title
        item['byline'] = byline
        item['published_at'] = published.isoformat()
        item['retrieved_at'] = datetime.utcnow().isoformat()
        item['url'] = canonical_url
        # last path segment is used as the file name
        item['file_name'] = response.url.split('/').pop()
        item['spider_name'] = self.name
        item['publication_name'] = self.publication_name

        yield item
示例#7
0
    def parse_item(self, response):
        """Yield a ScrapenewsItem for pages whose og:type is 'article'.

        Guards against missing byline/title/date elements, which would
        otherwise crash with AttributeError or IndexError.
        """
        canonical_url = response.xpath(
            '//link[@rel="canonical"]/@href').extract_first()
        title = response.xpath('//h1/text()').extract_first()
        self.logger.info('%s %s', response.url, title)
        # should we be using canonical_url instead of response.url for the above?
        og_type = response.xpath(
            '//meta[@property="og:type"]/@content').extract_first()

        if og_type == 'article':
            article_body = response.css('div.post-content')
            body_html = " ".join(article_body.css('::text').extract())

            # extract_first() may return None; guard before stripping
            byline = response.css('span.author::text').extract_first()
            if byline:
                byline = byline.strip()

            # When extracting, ignore the first line as it is a comment.
            date_texts = response.css('span.create::text').extract()
            if len(date_texts) < 2:
                self.logger.info("No publication date found for %s",
                                 response.url)
                return
            publication_date_str = date_texts[1].strip()
            publication_date = datetime.strptime(publication_date_str,
                                                 '%d %B %Y, %I:%M %p')
            publication_date = SAST.localize(publication_date)

            if body_html:
                item = ScrapenewsItem()
                item['body_html'] = body_html
                # title may be None when the <h1> is missing
                item['title'] = title.strip() if title else title
                item['byline'] = byline
                item['published_at'] = publication_date.isoformat()
                item['retrieved_at'] = datetime.utcnow().isoformat()
                item['url'] = canonical_url
                item['file_name'] = response.url.split('/')[-2]
                # should we be using canonical_url instead of response.url for the above?
                item['spider_name'] = self.name
                item['publication_name'] = self.publication_name

                yield item

            else:
                self.logger.info("No body found for %s", response.url)
示例#8
0
    def parse_item(self, response):
        """Yield a ScrapenewsItem for pages whose og:type is 'article'."""
        canonical_url = response.xpath(
            '//link[@rel="canonical"]/@href').extract_first()
        title = response.xpath(
            '//h1[@class="entry-title"]/text()').extract_first()
        self.logger.info('%s %s', response.url, title)
        # should we be using canonical_url instead of response.url for the above?
        og_type = response.xpath(
            '//meta[@property="og:type"]/@content').extract_first()
        if og_type == 'article':
            article_body = response.css('div.td-post-content')
            # Use a relative XPath ('.//p'): '//p' is absolute and selects
            # every <p> in the whole document, not just the article body.
            body_html = " ".join(article_body.xpath('.//p').extract())

            author_texts = response.css(
                'div.td-post-author-name').css('::text').extract()
            # The author name is the third text node; guard against a
            # shorter list to avoid an IndexError.
            byline = author_texts[2] if len(author_texts) > 2 else None

            publication_date_str = response.xpath(
                '//time/@datetime').extract_first()
            if not publication_date_str:
                # slicing None below would raise TypeError
                self.logger.info("No publication date found for %s",
                                 response.url)
                return
            # e.g. u'2018-06-14T11:00:00+00:00' -- drop the UTC offset
            publication_date = datetime.strptime(publication_date_str[0:19],
                                                 '%Y-%m-%dT%H:%M:%S')
            publication_date = SAST.localize(publication_date)

            if body_html:
                item = ScrapenewsItem()
                item['body_html'] = body_html
                item['title'] = title
                item['byline'] = byline
                item['published_at'] = publication_date.isoformat()
                item['retrieved_at'] = datetime.utcnow().isoformat()
                item['url'] = canonical_url
                item['file_name'] = response.url.split('/')[-2]
                item['spider_name'] = self.name
                item['publication_name'] = self.publication_name

                yield item
            else:
                self.logger.info("No body found for %s", response.url)
示例#9
0
    def parse_item(self, response):
        """Yield a ScrapenewsItem, resolving the publication from the URL.

        Premium (subscriber-only) articles and pages without a body or
        publication date are skipped.
        """
        canonical_url = response.xpath(
            '//link[@rel="canonical"]/@href').extract_first()
        if canonical_url:
            url = canonical_url
        else:
            url = response.url

        title = response.css('h1.article-title-primary').xpath(
            'span/text()').extract_first()
        self.logger.info('%s %s', url, title)

        # Ignore premium content articles
        is_premium_content = response.css('div.premium-alert').xpath(
            "h3/text()").extract_first(
            ) == 'This article is reserved for our subscribers.'
        if is_premium_content:
            self.logger.info("Ignoring premium content %s", url)
            return

        article_body = response.css('div.article-widget-text')
        if article_body:
            # join multiple text sections
            body_html = " ".join(article_body.extract())
            byline = response.css('span.heading-author').xpath(
                'text()').extract_first()
            publication_date_str = response.css('div.article-pub-date').xpath(
                'text()').extract_first()
            if not publication_date_str:
                # extract_first() returns None when the element is missing;
                # calling .strip() on None would raise AttributeError.
                self.logger.info("No publication date found for %s", url)
                return
            publication_date = datetime.strptime(
                publication_date_str.strip(), '%d %B %Y - %H:%M')
            publication_date = SAST.localize(publication_date)

            item = ScrapenewsItem()
            item['body_html'] = body_html
            item['title'] = title
            item['byline'] = byline
            item['published_at'] = publication_date.isoformat()
            item['retrieved_at'] = datetime.utcnow().isoformat()
            item['url'] = url
            item['file_name'] = url.split('/')[-2]
            item['spider_name'] = self.name

            # map the URL's first path segment to a publication title
            publication_urls = {
                'bd': 'Business Day',
                'fm': 'Financial Mail',
                'rdm': 'Rand Daily Mail',
                'bt': 'Business Times',
                'ft': 'Financial Times'
            }

            url_part = url.split('/')[3]

            item['publication_name'] = publication_urls.get(
                url_part, 'Business Day')

            yield item
        else:
            self.logger.info("No body found for %s", response.url)
示例#10
0
    def parse_item(self, response):
        """Yield a ScrapenewsItem for pages whose og:type is 'article'.

        The stored body combines the subtitle, the primary photo caption
        (when present) and the article text.
        """
        og_url = response.xpath(
            '//meta[@property="og:url"]/@content').extract_first()
        # no 'canonical' that I could find
        title = response.xpath('//h1/text()').extract_first()
        self.logger.info('%s %s', response.url, title)
        # should we be using og_url instead of response.url for the above?
        og_type = response.xpath(
            '//meta[@property="og:type"]/@content').extract_first()
        if og_type != 'article':
            return

        subtitle = response.xpath('//p[@id="article_subtitle"]').css(
            '::text').extract_first()
        photo_caption = response.xpath(
            '//figcaption[@id="article_primary_image_caption"]/text()'
        ).extract_first()
        article_body = " ".join(
            response.xpath('//div[@id="article_body"]').css(
                '::text').extract())
        # concatenate whichever optional fragments are present
        body_html = "".join(
            part for part in (subtitle, photo_caption, article_body) if part)

        byline = response.xpath('//a[@rel="author"]/text()').extract_first()
        publication_date_str = response.xpath(
            '//time/@datetime').extract_first()
        published = SAST.localize(
            datetime.strptime(publication_date_str, '%Y-%m-%d'))

        if not body_html:
            self.logger.info("No body found for %s", response.url)
            return

        item = ScrapenewsItem()
        item['body_html'] = body_html
        item['title'] = title
        item['byline'] = byline
        item['published_at'] = published.isoformat()
        item['retrieved_at'] = datetime.utcnow().isoformat()
        item['url'] = og_url
        item['file_name'] = response.url.split('/')[-2]
        # should we be using og_url instead of response.url for the above?
        item['spider_name'] = self.name
        item['publication_name'] = self.publication_name

        yield item
示例#11
0
    def parse_item(self, response):
        """Scrape a news article page, skipping non-news categories."""
        # Whitelist categories that are actually news
        if not response.css('.post-categories').xpath(
                'li/a[contains(@href, "news-headlines")]'):
            self.logger.info("Skipping non-news article %s", response.url)
            return

        canonical_url = response.xpath(
            '//link[@rel="canonical"]/@href').extract_first()
        title = response.xpath(
            '//h1[@class="entry-title"]/text()').extract_first()
        og_type = response.xpath(
            '//meta[@property="og:type"]/@content').extract_first()
        if og_type != 'article':
            return

        body_html = response.css('div.entry-content').extract_first()
        byline = response.css('div.author-name').css('::text').extract_first()

        # e.g. u'2018-06-14T11:00:00+00:00' -- drop the offset, localize
        publication_date_str = response.xpath(
            '//time/@datetime').extract_first()
        published = SAST.localize(datetime.strptime(
            publication_date_str[0:19], '%Y-%m-%dT%H:%M:%S'))

        if not body_html:
            self.logger.info("No body found for %s", canonical_url)
            return

        item = ScrapenewsItem()
        item['body_html'] = body_html
        item['title'] = title
        item['byline'] = byline
        item['published_at'] = published.isoformat()
        item['retrieved_at'] = datetime.utcnow().isoformat()
        item['url'] = canonical_url
        item['file_name'] = response.url.split('/')[-2]
        item['spider_name'] = self.name
        item['publication_name'] = self.publication_name

        yield item
示例#12
0
    def parse(self, response):
        """Yield a ScrapenewsItem, skipping excluded and syndicated content."""
        canonical_url = response.xpath(
            '//link[@rel="canonical"]/@href').extract_first()

        ## Skip excluded sections
        section = response.css('a.section').xpath('text()').extract_first()
        if section and section.lower() in IGNORE_SECTIONS:
            self.logger.info("Skipping %s because section is %s",
                             canonical_url, section)
            return

        ## Skip syndicated content, detected by marker strings near the end
        body_html = "".join(response.css("#body_content p").extract())
        body_text = remove_tags(body_html)
        suffix = body_text[-20:]
        for marker in SKIP_STRINGS:
            if marker in suffix:
                self.logger.info("Skipping %s because suffix %r contains %r",
                                 canonical_url, suffix, marker)
                return

        publication_date_str = response.xpath(
            '//meta[@name="publicationdate"]/@content').extract_first()
        published = SAST.localize(
            datetime.strptime(publication_date_str, '%d/%m/%Y'))

        item = ScrapenewsItem()
        item['body_html'] = response.css("#body_content").extract_first()
        item['title'] = response.xpath(
            '//meta[@name="title"]/@content').extract_first()
        item['byline'] = response.xpath(
            '//meta[@name="author"]/@content').extract_first()
        item['published_at'] = published.isoformat()
        item['retrieved_at'] = datetime.utcnow().isoformat()
        item['url'] = canonical_url
        item['file_name'] = response.url.split('/')[-1]
        item['spider_name'] = self.name
        item['publication_name'] = self.publication_name

        yield item
示例#13
0
    def parse(self, response):
        """Parse a /News/ article page and yield a ScrapenewsItem.

        Appends the accreditation source (e.g. a wire service) to the
        publication name when one is present.
        """
        if '/News/' not in response.url:
            self.logger.info("Ignoring %s", response.url)
            return

        title = response.xpath(
            '//div[contains(@class, "article_details")]/h1/text()'
        ).extract_first()
        self.logger.info('%s %s', response.url, title)
        article_body = response.xpath('//article[@id="article-body"]')
        if article_body:
            body_html = article_body.extract_first()
            byline = response.xpath(
                '//div[contains(@class, "ByLineWidth")]/p/text()'
            ).extract_first()
            publication_date_str = response.xpath(
                '//span[@id="spnDate"]/text()').extract_first()
            accreditation = response.xpath(
                '//div[contains(@class, "ByLineWidth")]/div[contains(@class, "accreditation")]/a/@href'
            ).extract_first()

            if not publication_date_str:
                # strptime(None) would raise a TypeError
                self.logger.info("No publication date found for %s",
                                 response.url)
                return
            publication_date = datetime.strptime(publication_date_str,
                                                 '%Y-%m-%d %H:%M')
            publication_date = SAST.localize(publication_date)

            item = ScrapenewsItem()
            item['body_html'] = body_html
            item['title'] = title
            item['byline'] = byline
            item['published_at'] = publication_date.isoformat()
            item['retrieved_at'] = datetime.utcnow().isoformat()
            item['url'] = response.url
            item['file_name'] = response.url.split('/')[-1]
            item['spider_name'] = self.name

            item['publication_name'] = self.publication_name
            if accreditation:
                # accreditation is an href starting with '/'; drop the slash
                item['publication_name'] += " with " + accreditation[1:]

            yield item
        else:
            # Was an unconditional empty log line; report the miss instead.
            self.logger.info("No body found for %s", response.url)
示例#14
0
    def parse_item(self, response):
        """Yield a ScrapenewsItem for pages whose og:type is 'article'."""
        canonical_url = response.xpath(
            '//link[@rel="canonical"]/@href').extract_first()
        title = response.xpath('//h1/text()').extract_first()
        self.logger.info('%s %s', response.url, title)
        # should we be using canonical_url instead of response.url for the above?
        og_type = response.xpath(
            '//meta[@property="og:type"]/@content').extract_first()
        if og_type == 'article':
            article_body = response.css('div.article-body')
            # Use a relative XPath ('.//p'): '//p' is absolute and matches
            # every <p> in the whole document, not just the article body.
            body_html = " ".join(
                article_body.xpath('.//p').css('::text').extract())
            byline = response.xpath(
                '//strong[@itemprop="name"]/text()').extract_first()
            publication_date_str = response.xpath(
                '//meta[@itemprop="datePublished"]/@content').extract_first()
            if not publication_date_str:
                # slicing None below would raise TypeError
                self.logger.info("No publication date found for %s",
                                 response.url)
                return

            # e.g. '2020-01-30T08:22:00.000Z' -- keep seconds precision only
            publication_date = datetime.strptime(publication_date_str[:19],
                                                 '%Y-%m-%dT%H:%M:%S')
            publication_date = SAST.localize(publication_date)

            if body_html:
                item = ScrapenewsItem()
                item['body_html'] = body_html
                item['title'] = title
                item['byline'] = byline
                item['published_at'] = publication_date.isoformat()
                item['retrieved_at'] = datetime.utcnow().isoformat()
                item['url'] = canonical_url
                item['file_name'] = response.url.split('/')[-1]
                item['publication_name'] = self.publication_name
                item['spider_name'] = self.name

                yield item
            else:
                self.logger.info("No body found for %s", response.url)
示例#15
0
    def parse_item(self, response):
        """Yield a ScrapenewsItem built from an article page."""
        title = response.css('header.article-header h1').xpath(
            'text()').extract_first()
        self.logger.info('%s %s', response.url, title)

        published_at = response.css('.article-meta time').xpath(
            '@datetime').extract_first()
        body_html = response.css('.article-text').extract_first()

        if not body_html:
            self.logger.info("No body found for %s", response.url)
            return

        item = ScrapenewsItem()
        item['body_html'] = body_html
        item['title'] = title
        item['published_at'] = published_at
        item['retrieved_at'] = datetime.utcnow().isoformat()
        item['url'] = response.url
        item['file_name'] = response.url.split('/')[-1]
        item['publication_name'] = self.publication_name
        item['spider_name'] = self.name

        yield item
示例#16
0
    def parse(self, response):
        """Parse an article page and yield a populated ScrapenewsItem.

        Skips the page when the publication date element is missing.
        """
        canonical_url = response.xpath(
            '//link[@rel="canonical"]/@href').extract_first()

        title = response.xpath('//h1/span/text()').extract_first()
        self.logger.info('%s %s', response.url, title)
        article_body = response.css('div.article-widget-text')
        if article_body:
            # join multiple text sections
            body_html = " ".join(article_body.extract())
            # byline text starts with a leading word ('By'); drop it
            byline_includes_by = response.xpath(
                '//span[@class="heading-author"]/text()').extract()
            byline_str = " ".join(byline_includes_by)
            byline = " ".join(byline_str.split(" ")[1:])

            publication_date_str = response.css('div.article-pub-date').xpath(
                'text()').extract_first()
            if not publication_date_str:
                # extract()[0] would raise IndexError when the element is
                # missing; use extract_first() and bail out instead.
                self.logger.info("No publication date found for %s",
                                 response.url)
                return
            # e.g. '03 May 2018 - 19:17'
            publication_date = datetime.strptime(
                publication_date_str.strip(), '%d %B %Y - %H:%M')
            # naive datetime, e.g. datetime.datetime(2018, 5, 3, 19, 17)
            publication_date = SAST.localize(publication_date)

            item = ScrapenewsItem()
            item['body_html'] = body_html
            item['title'] = title
            item['byline'] = byline
            item['published_at'] = publication_date.isoformat()
            item['retrieved_at'] = datetime.utcnow().isoformat()
            item['url'] = canonical_url
            item['file_name'] = response.url.split('/')[-2]
            item['spider_name'] = self.name
            item['publication_name'] = self.publication_name

            yield item