Example #1
File: _180.py Project: nagitsu/nabu
def get_metadata(response):
    # `lxml` doesn't support <meta charset> tags; replace it with an http-equiv one.
    raw_content = response.content.replace(
        b'<meta charset="utf-8">',
        b'<meta http-equiv="content-type" content="text/html; charset=utf-8"/>'
    )
    root = html.fromstring(raw_content)

    metadata = {}
    try:
        metadata['title'] = root\
            .cssselect('.main-content .nota .text > h3')[0]\
            .text_content().strip()
    except Exception:
        pass  # title element missing; leave the field unset

    try:
        raw_date = root.cssselect('p.publicado')[0].text_content().strip()
        date = parse_date(raw_date.split('|')[0])
        if date:
            metadata['date'] = date
    except Exception:
        pass  # date element missing; leave the field unset

    return metadata
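A minimal usage sketch for the function above, assuming the page is fetched with `requests` and that `parse_date` behaves roughly like `dateutil`'s parser (both are stand-ins for the project's own wiring; the URL is hypothetical):

import requests
from dateutil.parser import parse as parse_date  # stand-in for the project's helper
from lxml import html

response = requests.get('https://example.com/nota/180')  # hypothetical URL
metadata = get_metadata(response)
print(metadata.get('title'), metadata.get('date'))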
Example #2
File: gxzone.py Project: nagitsu/nabu
    def parse_thread(self, response):
        thread_id = response.meta['thread_id']
        page_id = response.meta['page_id']

        is_error = response.xpath("//div[@class='standard_error']")
        if is_error or len(response.body) < 500:
            logger.info("not_found thread_id=%s page_id=%s", thread_id,
                        page_id)
            return

        title = response.xpath(
            "//span[@class='threadtitle']//text()").extract()
        title = " ".join([t.strip() for t in title if t.strip()])

        content = response.xpath("//div[@class='content']//text()").extract()
        content = "\n".join([t.strip() for t in content if t.strip()])

        pub_date = response.xpath(
            "(//span[@class='date'])[1]//text()").extract()
        pub_date = " ".join([c.strip() for c in pub_date if c.strip()])
        pub_date = parse_date(pub_date)

        logger.info("parsed thread_id=%s page_id=%s", thread_id, page_id)

        # Prepare and yield the parsed article.
        document_id = "{}@@{}".format(thread_id, page_id)
        full_content = "\n".join([title, content])
        word_count = len(full_content.split())
        article = {
            'content': full_content,
            'content_type': 'clean',
            'tags': ['forum', 'Argentina', 'spanish'],
            'word_count': word_count,
            'data_source': 'gxzone.com',
            'entry': {
                'date_scraped': datetime.now(),
                'source_id': document_id,
            },
            'title': title,
            'url': response.url,
            'date': pub_date,
        }

        yield article

        # Go to the next page too.
        next_page = response.xpath("//a[@rel='next']/@href").extract_first()
        if next_page:
            request = Request(response.urljoin(next_page),
                              callback=self.parse_thread)
            request.meta['thread_id'] = thread_id
            request.meta['page_id'] = page_id + 1
            yield request
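One note on the pagination step above: `thread_id` and `page_id` are threaded through `request.meta` by mutating the request after construction. Scrapy's `Request` also accepts the mapping directly via its `meta` keyword, which makes the state hand-off a single expression (an equivalent sketch, not the source's code):

        next_page = response.xpath("//a[@rel='next']/@href").extract_first()
        if next_page:
            yield Request(
                response.urljoin(next_page),
                callback=self.parse_thread,
                meta={'thread_id': thread_id, 'page_id': page_id + 1},
            )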
Example #3
    def parse_article(self, response):
        article_id = response.url.split('/')[-1].split('.')[0]

        title = response.xpath(
            "//h1[contains(@class, 'articulo-titulo') or "
            "contains(@id, 'titulo_noticia')]//text()").extract_first()

        summary = response.xpath(
            "//div[contains(@class, 'articulo-subtitulos') or "
            "contains(@id, 'subtitulo_noticia')]//text()").extract()
        summary = "\n".join([s.strip() for s in summary if s.strip()])

        content = response.xpath(
            "//div[contains(@id, 'cuerpo_noticia')]//text()").extract()
        content = "\n".join([c.strip() for c in content if c.strip()])

        pub_date = response.xpath(
            "//a[@title='Ver todas las noticias de esta fecha']/text()"
        ).extract()
        pub_date = " ".join([c.strip() for c in pub_date if c.strip()])
        pub_date = parse_date(pub_date)

        # Prepare the parsed article.
        full_content = "\n".join([title, summary, content])
        word_count = len(full_content.split())
        article = {
            'content': full_content,
            'content_type': 'clean',
            'tags': ['news', 'Spain', 'spanish'],
            'word_count': word_count,
            'data_source': 'elpais.com',
            'entry': {
                'date_scraped': datetime.now(),
                'source_id': article_id,
            },
            'title': title,
            'url': response.url,
            'date': pub_date,
        }

        logger.info("parsed article_id=%s pub_date=%s", article_id,
                    pub_date.isoformat().split('T')[0])

        yield article
Example #4
def get_metadata(response):
    root = html.fromstring(response.content)

    metadata = {}
    try:
        metadata['title'] = root.cssselect('header > h1')[0]\
                                .text_content().strip()
    except Exception:
        pass

    try:
        raw_date = root.cssselect('time.fecha')[0].get('datetime')
        date = parse_date(raw_date.strip())
        if date:
            metadata['date'] = date
    except Exception:
        pass

    return metadata
Example #5
File: univision.py Project: nagitsu/nabu
def get_metadata(response):
    root = html.fromstring(response.content)

    metadata = {}
    try:
        metadata['title'] = root.cssselect('h1.article-header')[0]\
                                .text_content().strip()
    except Exception:
        pass

    try:
        raw_date = root.cssselect('span.author-details')[0].text_content()
        date = parse_date(u' '.join(raw_date.split('|')[1:]))
        if date:
            metadata['date'] = date
    except Exception:
        pass

    return metadata
Example #6
def get_metadata(response):
    root = html.fromstring(response.content)

    metadata = {}
    try:
        metadata['title'] = root.cssselect('h1.detail-main-title')[0]\
                                .text_content().strip()
    except Exception:
        pass

    try:
        raw_date = root.cssselect('span.detail-date')[0].text_content().strip()
        date = parse_date(raw_date)
        if date:
            metadata['date'] = date
    except Exception:
        pass

    return metadata
Example #7
def get_metadata(response):
    root = html.fromstring(response.content)

    metadata = {}
    try:
        metadata['title'] = root.cssselect('h1.entry-title')[0].text_content()
    except Exception:
        pass

    try:
        raw_date = root.cssselect('p.post-meta > span.updated')[0]\
                       .text_content().strip()
        date = parse_date(raw_date)
        if date:
            metadata['date'] = date
    except Exception:
        pass

    return metadata
Example #8
File: lanacion.py Project: nagitsu/nabu
def get_metadata(response):
    root = html.fromstring(response.content)

    metadata = {}
    try:
        metadata['title'] = root.xpath("//h1[@itemprop='headline']")[0]\
                                .text_content().strip()
    except Exception:
        pass

    try:
        raw_date = root.xpath("//div[@class='fecha']")[0]\
                       .text_content().strip()
        date = parse_date(raw_date)
        if date:
            metadata['date'] = date
    except Exception:
        pass

    return metadata
Example #9
def get_metadata(response):
    root = html.fromstring(response.content)

    metadata = {}
    try:
        metadata['title'] = root.xpath("//h1[@itemprop='name']")[0]\
                                .text_content().strip()
    except Exception:
        pass

    try:
        raw_date = root.xpath("//*[@itemprop='datePublished']")[0]\
                       .text_content().strip()
        date = parse_date(raw_date)
        if date:
            metadata['date'] = date
    except Exception:
        pass

    return metadata
Example #10
def get_metadata(response):
    root = html.fromstring(response.content)

    metadata = {}
    try:
        metadata['title'] = root.cssselect(
            'td.contenidos h1')[0].text_content().strip()
    except Exception:
        pass

    try:
        raw_date = root.xpath(
            '//div[@class="cajaHerramientas"]/span/text()[2]')[0].strip()
        date = parse_date(raw_date)
        if date:
            metadata['date'] = date
    except Exception:
        pass

    return metadata
Example #11
File: eldiario.py Project: nagitsu/nabu
def get_metadata(response):
    root = html.fromstring(response.content)

    metadata = {}
    try:
        metadata['title'] = root.cssselect('div.not_comp h1')[0]\
                                .text_content().strip()
    except Exception:
        pass

    try:
        raw_date = root.cssselect('ul#menu_inf > li')[0]\
                       .text_content().strip().split(', ')[-1]
        date = parse_date(raw_date)
        if date:
            metadata['date'] = date
    except Exception:
        pass

    return metadata
Example #12
File: elmercurio.py Project: nagitsu/nabu
def get_metadata(response):
    root = html.fromstring(response.content)

    metadata = {}
    try:
        metadata['title'] = root\
            .cssselect('#cuDetalle_cuTitular_tituloNoticia')[0]\
            .text_content().strip()
    except Exception:
        pass

    try:
        raw_date = root.cssselect('#cuDetalle_cuCreditos_fecha')[0]\
                       .text_content().strip()
        date = parse_date(raw_date)
        if date:
            metadata['date'] = date
    except Exception:
        pass

    return metadata
Example #13
File: lared21.py Project: nagitsu/nabu
def get_metadata(response):
    root = html.fromstring(response.content)

    metadata = {}
    try:
        metadata['title'] = root.xpath(
            '//h1[contains(@itemprop, "headline") '
            'or contains(@id, "article-title")]')[0].text_content()
    except Exception:
        pass

    try:
        raw_date = root.xpath('//meta[@itemprop="datePublished"]')[0].get(
            'content')
        date = parse_date(raw_date.strip())
        if date:
            metadata['date'] = date
    except Exception:
        pass

    return metadata
Example #14
def get_metadata(response):
    root = html.fromstring(response.content)

    metadata = {}
    try:
        metadata['title'] = root.cssselect(
            '#contenido-general-articulo > header > h1')[0].text_content(
            ).strip()
    except Exception:
        pass

    try:
        raw_date = root.cssselect('div.dia-publicacion')[0]\
                       .text_content().strip()
        date = parse_date(raw_date)
        if date:
            metadata['date'] = date
    except Exception:
        pass

    return metadata
Example #15
def _parse_pantallazo_com_uy(root):
    # We use the overtitle as the actual article title, as it's more
    # detailed.
    overtitle = root.xpath("//div[@class='gral_ucdeest']/h1/text()")[0]

    title = root.xpath("//div[@class='gral_ucdeest']/h5/text()")[0]
    summary = root.xpath("//div[@class='gral_ucdeest']/h3/text()")[0]
    article = u"\n".join(
        root.xpath("//div[@class='gral_ucdeest']/h4/"
                   "following-sibling::p/text()"))
    content = u'\n'.join([overtitle, title, summary, article]).strip()

    raw_date = root.xpath("//div[@class='gral_ucdeest']/h6/text()")[0]
    date = parse_date(raw_date)

    article = {
        'date': date,
        'title': overtitle,
        'content': content,
    }

    return article
Example #16
def _parse_montevideo_com_uy(root):
    # We use the overtitle as the actual article title, as it's more
    # detailed.
    overtitle = root.xpath("//h1[@itemprop='headline']/text()")[0]

    title = root.xpath(
        "//h1[@itemprop='headline']/following-sibling::h4[1]/text()")[0]
    summary = root.xpath("//h2[@itemprop='articleSection']/text()")[0]
    article = u"\n".join(
        root.xpath(
            "//h3[@itemprop='articleBody']/following-sibling::p/text()"))
    content = u'\n'.join([overtitle, title, summary, article]).strip()

    raw_date = root.xpath("//time")[0].text_content()
    date = parse_date(raw_date)

    article = {
        'date': date,
        'title': overtitle,
        'content': content,
    }

    return article
Example #17
def _parse_futbol_com_uy(root):
    # We use the overtitle as the actual article title, as it's more
    # detailed.
    overtitle = root.xpath("//div[@class='doscolsteizq']/h3/text()")[0]

    title = root.xpath("//div[@class='doscolsteizq']/h1/text()")[0]
    summary = root.xpath(
        "//div[@class='doscolsteizq']/div[@id='txt']/h5/text()")[0]
    article = u"\n".join(
        root.xpath("//div[@class='doscolsteizq']/div[@id='txt']/h6/"
                   "following-sibling::p/text()"))
    content = u'\n'.join([overtitle, title, summary, article]).strip()

    raw_date = root.xpath("//div[@class='fecharedesizq']/h4")[0].text_content()
    date = parse_date(raw_date)

    article = {
        'date': date,
        'title': overtitle,
        'content': content,
    }

    return article
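The three `_parse_*` helpers above are site-specific variants of the same extraction shape. A hedged sketch of how a caller might dispatch between them by domain (the `PARSERS` table and `parse_article` wrapper are hypothetical, not taken from the project):

from urllib.parse import urlparse

# Hypothetical dispatch table; the project may wire these differently.
PARSERS = {
    'pantallazo.com.uy': _parse_pantallazo_com_uy,
    'montevideo.com.uy': _parse_montevideo_com_uy,
    'futbol.com.uy': _parse_futbol_com_uy,
}

def parse_article(url, root):
    domain = urlparse(url).netloc
    if domain.startswith('www.'):
        domain = domain[len('www.'):]
    parser = PARSERS.get(domain)
    return parser(root) if parser else None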