Example #1
def parse_url(url):
    response = utils.request(url)
    soup = response['soup']
    html = response['html']

    try:
        body = utils.find_one_tag(soup, "p", {"class": "description-text"})

    except utils.ParserError:
        body = utils.find_one_tag(soup, "p", {"class": "article-text row"})

    body = "".join([p.text for p in body.findAll("p")])

    app = utils.find_application_json(soup, 'headline')
    headline = app['headline']
    published = app['datePublished']

    return {
        **paper,
        "body": body,
        "headline": headline,
        "article_url": url,
        "html": html,
        "article_id": get_article_id(url),
        "date_published": published,
    }
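All of these parsers lean on the same small utils module, which is not shown in this section. The sketch below is one plausible implementation, inferred purely from how the helpers are called in the examples; the function bodies, the requests/BeautifulSoup choices and the error wording are assumptions, not the original code (Example #6 also expects request to return an {'error': ...} dict on failure, which this sketch omits).

import json

import requests
from bs4 import BeautifulSoup


class ParserError(Exception):
    """Raised when an expected tag or ld+json block is missing (assumed behaviour)."""


def request(url, headers=None):
    #  fetch the page once and hand back both the raw html and the parsed soup
    res = requests.get(url, headers=headers)
    return {'html': res.text, 'soup': BeautifulSoup(res.text, 'html.parser')}


def find_one_tag(soup, name, attrs=None):
    #  like soup.find, but failing loudly so callers can try a fallback selector
    tag = soup.find(name, attrs=attrs or {})
    if tag is None:
        raise ParserError(f'could not find <{name}> with attrs {attrs}')
    return tag


def find_application_json(soup, find='headline'):
    #  return the first ld+json block that carries the requested key
    for script in soup.findAll('script', {'type': 'application/ld+json'}):
        data = json.loads(script.string)
        if isinstance(data, dict) and find in data:
            return data
    raise ParserError(f'no application/ld+json block with key {find}')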
Example #2
def parse_url(url):
    response = utils.request(url)
    soup = response['soup']
    html = response['html']

    try:
        body = utils.find_one_tag(soup, 'div', {'id': 'article-body'})
    except utils.ParserError:
        body = utils.find_one_tag(soup, 'section', {"class": "article__main"})

    body = "".join([p.text for p in body.findAll("p")])

    app = utils.find_application_json(soup, find='headline')
    headline = app['headline']
    published = app['datePublished']

    return {
        **nzherald,
        "body": body,
        "headline": headline,
        "article_url": url,
        "html": html,
        "article_id": get_article_id(url),
        "date_published": published,
    }
Example #3
def parse_url(url):
    response = utils.request(url)
    soup = response['soup']
    html = response['html']

    body = utils.find_one_tag(soup, "section", {"name": "articleBody"})
    body = "".join([p.text for p in body.findAll("p")])
    noise = [
        "The Times is committed to publishing a diversity of letters to the editor. We’d like to hear what you think about this or any of our articles. Here are some tips. And here’s our email: [email protected] The New York Times Opinion section on Facebook, Twitter (@NYTopinion) and Instagram.",
        "Want climate news in your inbox? Sign up here for Climate Fwd:, our email newsletter.",
        "For more news on climate and the environment, follow @NYTClimate on Twitter.",
    ]
    #  strip the known boilerplate sentences from the body
    for n in noise:
        body = body.replace(n, "")

    app = utils.find_application_json(soup, 'headline')
    headline = app['headline']
    published = app['datePublished']

    return {
        **nytimes,
        "body": body,
        "headline": headline,
        "article_url": url,
        "html": html,
        "article_id": get_article_id(url),
        "date_published": published,
    }
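The noise-stripping loop recurs whenever a site injects boilerplate into the article body, so it is worth pulling out. A small helper along these lines (hypothetical, not part of the original module) keeps the parser flat:

def strip_noise(text, noise):
    #  remove each known boilerplate sentence from the article body
    for n in noise:
        text = text.replace(n, '')
    return text

It would replace the inline loop above with body = strip_noise(body, noise).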
Example #4
def parse_url(url):
    response = utils.request(url)
    soup = response['soup']
    html = response['html']

    try:
        body = utils.find_one_tag(soup, 'article')
        body = ''.join([p.text for p in body.findAll('p')])

    except utils.ParserError:
        body = utils.find_one_tag(
            soup, "div", {
                "class": "article-body js-article-container",
                "itemprop": "articleBody"
            })
        body = body.findAll("p")
        body = "".join(p.text for p in body
                       if "c-letters-cta__text" not in p.attrs.values())

    app = utils.find_application_json(soup, 'headline')

    headline = app['headline']
    #  the headline is sometimes "" in the ld+json
    if headline == "":
        headline = utils.find_one_tag(soup, "h1", {
            "class": "c-article-header__hed"
        }).text

    published = app['datePublished']

    return {
        **atlantic,
        "body": body,
        "headline": headline,
        "article_url": url,
        "html": html,
        "article_id": get_article_id(url),
        "date_published": published,
    }
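Examples #1, #2, #4 and #7 all fall through a chain of selectors with try/except utils.ParserError. A generic helper in that spirit (hypothetical, not in the original module) would capture the pattern:

def find_first_match(soup, selectors):
    #  try each (name, attrs) pair in turn, returning the first tag that matches
    for name, attrs in selectors:
        try:
            return utils.find_one_tag(soup, name, attrs)
        except utils.ParserError:
            continue
    raise utils.ParserError(f'none of {selectors} matched')

For the parser above, the tag lookup would then read:

body = find_first_match(soup, [
    ('article', None),
    ('div', {'class': 'article-body js-article-container', 'itemprop': 'articleBody'}),
])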
Example #5
def parse_url(url):
    #  a browser user agent, since some sites block the default requests one
    headers = {'User-Agent': 'Mozilla/5.0'}
    response = utils.request(url, headers=headers)
    soup = response['soup']
    html = response['html']

    body = utils.find_one_tag(soup, 'div', {'id': 'main'})
    body = [p.text for p in body.findAll('p') if p.attrs == {} or p.attrs == {'dir': 'ltr'}]
    body = ''.join(body)

    app = utils.find_application_json(soup, 'headline')
    headline = app['headline']
    published = app['datePublished']

    return {
        **independent,
        "body": body,
        "headline": headline,
        "article_url": url,
        "html": html,
        "article_id": get_article_id(url),
        "date_published": published,
    }
Example #6
def parse_url(url):
    response = utils.request(url)
    if 'error' in response:
        return {'error': response['error']}
    html = response['html']
    soup = response['soup']

    body = utils.find_one_tag(soup, 'article')
    text_blocks = body.findAll("div", attrs={'data-component': 'text-block'})

    body = []
    for block in text_blocks:
        body.extend(block.findAll("p", attrs={'class': None}))

    deep_body = []
    for p_tag in body:
        #  style tags were slipping into the p tag
        for s in p_tag('style'):
            s.decompose()

        text = p_tag.get_text()
        #  skip the last paragraph when it is a link, often to Twitter or 'Read more here'
        if not (p_tag.find('a') and p_tag is body[-1]):
            deep_body.append(text)
    body = "".join(deep_body)
    app = utils.find_application_json(soup, find='headline')
    return {
        "newspaper_id": "bbc",
        "body": body,
        "article_id": get_bbc_article_id(url),
        "headline": app['headline'],
        "article_url": url,
        "html": html,
        "date_published": app["datePublished"],
    }
Example #7
def parse_url(url):
    response = utils.request(url)
    soup = response['soup']
    html = response['html']

    try:
        body = utils.find_one_tag(soup, "div", {"class": "main-article-body"})
        body = body.findAll("p")

    except utils.ParserError:
        #  possible to have multiple 'text section' divs
        body = soup.findAll("div", {"class": "text section"})
        p_tags = []
        for b in body:
            p_tags.extend(b.findAll("p"))
        body = p_tags

    if len(body) == 0:
        body = utils.find_one_tag(soup, "div", {"class": "wysiwyg wysiwyg--all-content"})
        body = body.findAll("p")

    body = "".join([p.text for p in body])

    app = utils.find_application_json(soup, 'headline')
    headline = app['headline']
    published = app['datePublished']

    return {
        **aljazeera,
        "body": body,
        "headline": headline,
        "article_url": url,
        "html": html,
        "article_id": get_article_id(url),
        "date_published": strip_aljazzera_dt(app["datePublished"]),
    }
Example #8
def parse_url(url):
    response = utils.request(url)
    soup = response['soup']
    html = response['html']

    try:
        body = utils.find_one_tag(soup, "div", {"class": "article-body"})

    except utils.ParserError:
        body = utils.find_one_tag(
            soup, "div", {"class": "ent-article-body ent-layout-centered"})

    new_body = []
    for p in body.findAll("p"):

        if 'data-elm-loc' in p.attrs.keys():
            new_body.append(p.text)

        if 'class' in p.attrs.keys():
            if 'font--body' in p.attrs['class']:
                new_body.append(p.text)

    body = "".join(new_body)
    app = utils.find_application_json(soup, 'headline')
    headline = app['headline']
    published = app['datePublished']

    return {
        **washington_post,
        "body": body,
        "headline": headline,
        "article_url": url,
        "html": html,
        "article_id": get_article_id(url),
        "date_published": published,
    }
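Every parser shares the same call shape and return keys, so downstream code can treat the outlets uniformly. A minimal usage sketch (the URL is a placeholder, and the dispatch from URL to the right outlet's parse_url is not shown in these examples):

article = parse_url('https://www.washingtonpost.com/example-article')
print(article['headline'], article['date_published'])
#  the record also carries body, article_url, html, article_id and the outlet metadata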