def get_metadata(response):
    """Best-effort extraction of title and date from an article page.

    Parameters:
        response: HTTP response object exposing a ``.content`` bytes payload.

    Returns:
        dict with optional ``'title'`` and ``'date'`` keys; fields that
        cannot be extracted are simply omitted rather than raising.
    """
    # `lxml` doesn't support <meta charset> tags, replace it for http-equiv.
    raw_content = response.content.replace(
        b'<meta charset="utf-8">',
        b'<meta http-equiv="content-type" content="text/html; charset=utf-8"/>'
    )
    root = html.fromstring(raw_content)

    metadata = {}
    try:
        metadata['title'] = root\
            .cssselect('.main-content .nota .text > h3')[0]\
            .text_content().strip()
    # A bare `except:` would also trap KeyboardInterrupt/SystemExit;
    # `Exception` keeps the best-effort behavior without hiding those.
    except Exception:
        pass
    try:
        raw_date = root.cssselect('p.publicado')[0].text_content().strip()
        date = parse_date(raw_date.split('|')[0])
        if date:
            metadata['date'] = date
    except Exception:
        pass
    return metadata
def parse_thread(self, response):
    """Parse one page of a forum thread and yield it as an article item.

    Skips error pages and near-empty bodies; follows rel=next links,
    scheduling the next page under the same thread_id with page_id + 1.
    """
    thread_id = response.meta['thread_id']
    page_id = response.meta['page_id']

    # Bail out on explicit error pages or bodies too small to be a thread.
    if (response.xpath("//div[@class='standard_error']")
            or len(response.body) < 500):
        logger.info("not_found thread_id=%s page_id=%s", thread_id, page_id)
        return

    def joined(pieces, sep):
        # Strip each fragment and drop the empty ones before joining.
        return sep.join(p.strip() for p in pieces if p.strip())

    title = joined(
        response.xpath("//span[@class='threadtitle']//text()").extract(), " ")
    content = joined(
        response.xpath("//div[@class='content']//text()").extract(), "\n")
    pub_date = parse_date(joined(
        response.xpath("(//span[@class='date'])[1]//text()").extract(), " "))

    logger.info("parsed thread_id=%s page_id=%s", thread_id, page_id)

    # Prepare and yield the parsed article.
    full_content = "\n".join([title, content])
    yield {
        'content': full_content,
        'content_type': 'clean',
        'tags': ['forum', 'Argentina', 'spanish'],
        'word_count': len(full_content.split()),
        'data_source': 'gxzone.com',
        'entry': {
            'date_scraped': datetime.now(),
            'source_id': "{}@@{}".format(thread_id, page_id),
        },
        'title': title,
        'url': response.url,
        'date': pub_date,
    }

    # Go to the next page too.
    next_page = response.xpath("//a[@rel='next']/@href").extract_first()
    if next_page:
        request = Request(response.urljoin(next_page),
                          callback=self.parse_thread)
        request.meta['thread_id'] = thread_id
        request.meta['page_id'] = page_id + 1
        yield request
def parse_article(self, response):
    """Parse an elpais.com article page and yield it as an item.

    The article id is derived from the last path segment of the URL.
    Missing title or unparseable dates no longer crash the parse.
    """
    article_id = response.url.split('/')[-1].split('.')[0]

    title = response.xpath(
        "//h1[contains(@class, 'articulo-titulo') or "
        "contains(@id, 'titulo_noticia')]//text()").extract_first()
    # extract_first() returns None when nothing matches; "\n".join below
    # requires a string, so fall back to the empty string.
    title = title or ''

    summary = response.xpath(
        "//div[contains(@class, 'articulo-subtitulos') or "
        "contains(@id, 'subtitulo_noticia')]//text()").extract()
    summary = "\n".join([s.strip() for s in summary if s.strip()])

    content = response.xpath(
        "//div[contains(@id, 'cuerpo_noticia')]//text()").extract()
    content = "\n".join([c.strip() for c in content if c.strip()])

    pub_date = response.xpath(
        "//a[@title='Ver todas las noticias de esta fecha']/text()"
    ).extract()
    pub_date = " ".join([c.strip() for c in pub_date if c.strip()])
    pub_date = parse_date(pub_date)

    # Prepare the parsed article.
    full_content = "\n".join([title, summary, content])
    word_count = len(full_content.split())
    article = {
        'content': full_content,
        'content_type': 'clean',
        'tags': ['news', 'Spain', 'spanish'],
        'word_count': word_count,
        'data_source': 'elpais.com',
        'entry': {
            'date_scraped': datetime.now(),
            'source_id': article_id,
        },
        'title': title,
        'url': response.url,
        'date': pub_date,
    }

    # parse_date may return None for unparseable dates; guard before
    # calling isoformat() (previously this raised AttributeError).
    logger.info("parsed article_id=%s pub_date=%s", article_id,
                pub_date.isoformat().split('T')[0] if pub_date else None)
    yield article
def get_metadata(response):
    """Best-effort extraction of title and date from an article page.

    Returns a dict with optional ``'title'`` and ``'date'`` keys;
    fields that cannot be extracted are omitted rather than raising.
    """
    root = html.fromstring(response.content)

    metadata = {}
    try:
        metadata['title'] = root.cssselect('header > h1')[0]\
            .text_content().strip()
    # `Exception` (not a bare `except:`) keeps the best-effort behavior
    # without also trapping KeyboardInterrupt/SystemExit.
    except Exception:
        pass
    try:
        raw_date = root.cssselect('time.fecha')[0].get('datetime')
        date = parse_date(raw_date.strip())
        if date:
            metadata['date'] = date
    except Exception:
        pass
    return metadata
def get_metadata(response):
    """Best-effort extraction of title and date from an article page.

    The date is taken from the author-details line, after the first
    ``|`` separator. Fields that cannot be extracted are omitted.
    """
    root = html.fromstring(response.content)

    metadata = {}
    try:
        metadata['title'] = root.cssselect('h1.article-header')[0]\
            .text_content().strip()
    # `Exception` (not a bare `except:`) keeps the best-effort behavior
    # without also trapping KeyboardInterrupt/SystemExit.
    except Exception:
        pass
    try:
        raw_date = root.cssselect('span.author-details')[0].text_content()
        date = parse_date(u' '.join(raw_date.split('|')[1:]))
        if date:
            metadata['date'] = date
    except Exception:
        pass
    return metadata
def get_metadata(response):
    """Best-effort extraction of title and date from an article page.

    Returns a dict with optional ``'title'`` and ``'date'`` keys;
    fields that cannot be extracted are omitted rather than raising.
    """
    root = html.fromstring(response.content)

    metadata = {}
    try:
        metadata['title'] = root.cssselect('h1.detail-main-title')[0]\
            .text_content().strip()
    # `Exception` (not a bare `except:`) keeps the best-effort behavior
    # without also trapping KeyboardInterrupt/SystemExit.
    except Exception:
        pass
    try:
        raw_date = root.cssselect('span.detail-date')[0].text_content().strip()
        date = parse_date(raw_date)
        if date:
            metadata['date'] = date
    except Exception:
        pass
    return metadata
def get_metadata(response):
    """Best-effort extraction of title and date from an article page.

    Returns a dict with optional ``'title'`` and ``'date'`` keys;
    fields that cannot be extracted are omitted rather than raising.
    """
    root = html.fromstring(response.content)

    metadata = {}
    try:
        metadata['title'] = root.cssselect('h1.entry-title')[0].text_content()
    # `Exception` (not a bare `except:`) keeps the best-effort behavior
    # without also trapping KeyboardInterrupt/SystemExit.
    except Exception:
        pass
    try:
        raw_date = root.cssselect('p.post-meta > span.updated')[0]\
            .text_content().strip()
        date = parse_date(raw_date.strip())
        if date:
            metadata['date'] = date
    except Exception:
        pass
    return metadata
def get_metadata(response):
    """Best-effort extraction of title and date from an article page.

    Returns a dict with optional ``'title'`` and ``'date'`` keys;
    fields that cannot be extracted are omitted rather than raising.
    """
    root = html.fromstring(response.content)

    metadata = {}
    try:
        metadata['title'] = root.xpath("//h1[@itemprop='headline']")[0]\
            .text_content().strip()
    # `Exception` (not a bare `except:`) keeps the best-effort behavior
    # without also trapping KeyboardInterrupt/SystemExit.
    except Exception:
        pass
    try:
        raw_date = root.xpath("//div[@class='fecha']")[0]\
            .text_content().strip()
        date = parse_date(raw_date)
        if date:
            metadata['date'] = date
    except Exception:
        pass
    return metadata
def get_metadata(response):
    """Best-effort extraction of title and date from an article page.

    Uses schema.org microdata (``itemprop``) attributes to locate the
    fields. Fields that cannot be extracted are omitted.
    """
    root = html.fromstring(response.content)

    metadata = {}
    try:
        metadata['title'] = root.xpath("//h1[@itemprop='name']")[0]\
            .text_content().strip()
    # `Exception` (not a bare `except:`) keeps the best-effort behavior
    # without also trapping KeyboardInterrupt/SystemExit.
    except Exception:
        pass
    try:
        raw_date = root.xpath("//*[@itemprop='datePublished']")[0]\
            .text_content().strip()
        date = parse_date(raw_date)
        if date:
            metadata['date'] = date
    except Exception:
        pass
    return metadata
def get_metadata(response):
    """Best-effort extraction of title and date from an article page.

    The date comes from the second text node inside the tools box span.
    Fields that cannot be extracted are omitted rather than raising.
    """
    root = html.fromstring(response.content)

    metadata = {}
    try:
        metadata['title'] = root.cssselect(
            'td.contenidos h1')[0].text_content().strip()
    # `Exception` (not a bare `except:`) keeps the best-effort behavior
    # without also trapping KeyboardInterrupt/SystemExit.
    except Exception:
        pass
    try:
        raw_date = root.xpath(
            '//div[@class="cajaHerramientas"]/span/text()[2]')[0].strip()
        date = parse_date(raw_date)
        if date:
            metadata['date'] = date
    except Exception:
        pass
    return metadata
def get_metadata(response):
    """Best-effort extraction of title and date from an article page.

    The date is the last comma-separated chunk of the first item in the
    ``#menu_inf`` list. Fields that cannot be extracted are omitted.
    """
    root = html.fromstring(response.content)

    metadata = {}
    try:
        metadata['title'] = root.cssselect('div.not_comp h1')[0]\
            .text_content().strip()
    # `Exception` (not a bare `except:`) keeps the best-effort behavior
    # without also trapping KeyboardInterrupt/SystemExit.
    except Exception:
        pass
    try:
        raw_date = root.cssselect('ul#menu_inf > li')[0]\
            .text_content().strip().split(', ')[-1]
        date = parse_date(raw_date)
        if date:
            metadata['date'] = date
    except Exception:
        pass
    return metadata
def get_metadata(response):
    """Best-effort extraction of title and date from an article page.

    Returns a dict with optional ``'title'`` and ``'date'`` keys;
    fields that cannot be extracted are omitted rather than raising.
    """
    root = html.fromstring(response.content)

    metadata = {}
    try:
        metadata['title'] = root\
            .cssselect('#cuDetalle_cuTitular_tituloNoticia')[0]\
            .text_content().strip()
    # `Exception` (not a bare `except:`) keeps the best-effort behavior
    # without also trapping KeyboardInterrupt/SystemExit.
    except Exception:
        pass
    try:
        raw_date = root.cssselect('#cuDetalle_cuCreditos_fecha')[0]\
            .text_content().strip()
        date = parse_date(raw_date)
        if date:
            metadata['date'] = date
    except Exception:
        pass
    return metadata
def get_metadata(response):
    """Best-effort extraction of title and date from an article page.

    The date comes from the ``content`` attribute of a datePublished
    meta tag. Fields that cannot be extracted are omitted.
    """
    root = html.fromstring(response.content)

    metadata = {}
    try:
        metadata['title'] = root.xpath(
            '//h1[contains(@itemprop, "headline") '
            'or contains(@id, "article-title")]')[0].text_content()
    # `Exception` (not a bare `except:`) keeps the best-effort behavior
    # without also trapping KeyboardInterrupt/SystemExit.
    except Exception:
        pass
    try:
        # .get('content') may return None; the .strip() AttributeError is
        # absorbed by the handler below, same as a missing element.
        raw_date = root.xpath('//meta[@itemprop="datePublished"]')[0].get(
            'content')
        date = parse_date(raw_date.strip())
        if date:
            metadata['date'] = date
    except Exception:
        pass
    return metadata
def get_metadata(response):
    """Best-effort extraction of title and date from an article page.

    Returns a dict with optional ``'title'`` and ``'date'`` keys;
    fields that cannot be extracted are omitted rather than raising.
    """
    root = html.fromstring(response.content)

    metadata = {}
    try:
        metadata['title'] = root.cssselect(
            '#contenido-general-articulo > header > h1')[0].text_content(
            ).strip()
    # `Exception` (not a bare `except:`) keeps the best-effort behavior
    # without also trapping KeyboardInterrupt/SystemExit.
    except Exception:
        pass
    try:
        raw_date = root.cssselect('div.dia-publicacion')[0]\
            .text_content().strip()
        date = parse_date(raw_date)
        if date:
            metadata['date'] = date
    except Exception:
        pass
    return metadata
def _parse_pantallazo_com_uy(root):
    """Extract date, title and content from a pantallazo.com.uy page.

    We use the overtitle as the actual article title, as it's more
    detailed.
    """
    # All the interesting nodes live under the same container div.
    box = "//div[@class='gral_ucdeest']"

    overtitle = root.xpath(box + "/h1/text()")[0]
    title = root.xpath(box + "/h5/text()")[0]
    summary = root.xpath(box + "/h3/text()")[0]
    body = u"\n".join(
        root.xpath(box + "/h4/following-sibling::p/text()"))

    date = parse_date(root.xpath(box + "/h6/text()")[0])

    return {
        'date': date,
        'title': overtitle,
        'content': u'\n'.join([overtitle, title, summary, body]).strip(),
    }
def _parse_montevideo_com_uy(root):
    """Extract date, title and content from a montevideo.com.uy page.

    We use the overtitle as the actual article title, as it's more
    detailed.
    """
    overtitle = root.xpath("//h1[@itemprop='headline']/text()")[0]
    title = root.xpath(
        "//h1[@itemprop='headline']/following-sibling::h4[1]/text()")[0]
    summary = root.xpath("//h2[@itemprop='articleSection']/text()")[0]
    body = u"\n".join(root.xpath(
        "//h3[@itemprop='articleBody']/following-sibling::p/text()"))

    raw_date = root.xpath("//time")[0].text_content()

    return {
        'date': parse_date(raw_date),
        'title': overtitle,
        'content': u'\n'.join([overtitle, title, summary, body]).strip(),
    }
def _parse_futbol_com_uy(root):
    """Extract date, title and content from a futbol.com.uy page.

    We use the overtitle as the actual article title, as it's more
    detailed.
    """
    # Headline nodes hang off the same two-column container.
    box = "//div[@class='doscolsteizq']"

    overtitle = root.xpath(box + "/h3/text()")[0]
    title = root.xpath(box + "/h1/text()")[0]
    summary = root.xpath(box + "/div[@id='txt']/h5/text()")[0]
    body = u"\n".join(root.xpath(
        box + "/div[@id='txt']/h6/following-sibling::p/text()"))

    raw_date = root.xpath("//div[@class='fecharedesizq']/h4")[0].text_content()

    return {
        'date': parse_date(raw_date),
        'title': overtitle,
        'content': u'\n'.join([overtitle, title, summary, body]).strip(),
    }