def parse(self, response, **kwargs):
    """
    Extract article data from a crawled article page and append one
    JSON record per author to the 'articles.json' output file.

    Only pages with a resolvable title are processed, and records are
    written only when the parsed article date is on or after the
    crawler's recorded LastExecutionDate.

    :param response: Scrapy Response for a crawled article page.
    """
    title = response.xpath('//*[@id="wrap"]/h1/text()').extract_first()
    if not title:
        # Not an article page (or layout changed) — nothing to extract.
        return

    # Use the public `url` attribute instead of the private
    # Response._get_url() accessor.
    url_to_full_version = response.url

    first_160 = ''.join(
        response.xpath(
            '//*[@id="woe"]/section/div/p/text()').extract())[:160]

    base_date = response.xpath(
        '//*[@id="wrap"]/div/div[2]/text()').extract_first()
    date_formatted = conf.exec_func_chain(base_date, [
        conf.clean_records_regex,
        lambda v: v[0:-2],  # strip trailing characters before date parsing
        lambda v: conf.parse_dtts(v, '%b %d, %Y'),
    ])

    tags = response.xpath(
        '//*[@id="woe"]/section[3]/div/div/a/text()').extract()

    # The freshness check does not depend on the author row, so evaluate
    # it once instead of once per author inside the loop.
    if date_formatted < conf.crawl_date[0].get('LastExecutionDate'):
        return

    authors_section = response.xpath(
        '//*[@id="wrap"]/div/div[1]/div/span/a')
    for row in authors_section:
        # Query the row selector directly with relative XPath; the
        # original re-parsed each node via Selector(text=row.extract())
        # and used the malformed expression '///@href'.
        full_author_url = row.xpath('./@href').extract_first()
        author_fullname = conf.clean_records_regex(
            row.xpath('.//span/text()').extract_first())

        conf.write_data_append(
            'articles.json',
            json.dumps({
                'title': title,
                'urlFullVersion': url_to_full_version,
                'first160': first_160,
                'dateFormatted': date_formatted,
                'tags': tags,
                'authorUrl': f"{conf.gd_base_url}{full_author_url}",
                'authorName': author_fullname,
                # Second-to-last path segment is the author's slug/key.
                'author_key': full_author_url.rsplit('/')[-2],
            }))