예제 #1
0
    def parse(self, response, **kwargs):
        """
        Extract article metadata from a crawled page and append one JSON
        record per author link to ``articles.json``.

        Nothing is written when the page has no title under
        ``//*[@id="wrap"]/h1``, when no author links are found, or when the
        parsed article date is older than the ``LastExecutionDate`` value
        stored in ``conf.crawl_date[0]``.

        :param response: crawled page; must provide ``xpath()`` and ``url``.
        :param kwargs: accepted but not used by this method.
        :return: None. Records are emitted through ``conf.write_data_append``.
        """
        title = response.xpath('//*[@id="wrap"]/h1/text()').extract_first()
        if not title:
            return

        # Use the public attribute rather than the private ``_get_url()``.
        url_to_full_version = response.url
        # Teaser: the first 160 characters of the concatenated paragraph text.
        first_160 = ''.join(
            response.xpath(
                '//*[@id="woe"]/section/div/p/text()').extract())[:160]
        base_date = response.xpath(
            '//*[@id="wrap"]/div/div[2]/text()').extract_first()
        # Clean the raw date text, drop its last two characters, then parse it
        # with the '%b %d, %Y' format.
        date_formatted = conf.exec_func_chain(base_date, [
            conf.clean_records_regex, lambda v: v[0:-2],
            lambda v: conf.parse_dtts(v, '%b %d, %Y')
        ])

        tags = response.xpath(
            '//*[@id="woe"]/section[3]/div/div/a/text()').extract()
        authors_section = response.xpath(
            '//*[@id="wrap"]/div/div[1]/div/span/a')
        if not authors_section:
            # No author links means no records; as before, the date
            # comparison below is never evaluated in this case.
            return

        # The freshness check does not depend on the author row, so evaluate
        # it once instead of once per author.
        is_fresh = date_formatted >= conf.crawl_date[0].get(
            'LastExecutionDate')
        if not is_fresh:
            return

        for row in authors_section:
            # Query the row selector directly with a relative XPath instead
            # of serializing the node and re-parsing it for every lookup.
            full_author_url = row.xpath('.//@href').extract_first()
            if full_author_url is None:
                # An anchor without an href cannot yield an author URL or
                # key; skip it instead of failing on ``None.rsplit``.
                continue
            author_fullname = conf.clean_records_regex(
                row.xpath('.//span/text()').extract_first())
            record = {
                'title': title,
                'urlFullVersion': url_to_full_version,
                'first160': first_160,
                'dateFormatted': date_formatted,
                'tags': tags,
                'authorUrl': f"{conf.gd_base_url}{full_author_url}",
                'authorName': author_fullname,
                # Second-to-last path segment of the href; presumably hrefs
                # end with a trailing slash -- TODO confirm against the site.
                'author_key': full_author_url.rsplit('/')[-2],
            }
            conf.write_data_append('articles.json', json.dumps(record))