Пример #1
0
def get_article(session, entry):
    """Build an article dict for a feed entry attributed to The Washington Post.

    The original link is read from ``entry['pheedo_origlink']``.  The last
    path segment has ``_story.`` rewritten to ``_print.`` before fetching
    (presumably the printer-friendly page -- confirm against the site).

    Returns ``None`` when the entry has no ``pheedo_origlink`` key or the
    link contains ``_blog.``; otherwise a dict with the keys ``title``,
    ``author``, ``link``, ``key``, ``date``, ``attribution`` and ``body``.

    NOTE(review): assumes ``fetch_page`` returns ``(doc, css)`` where
    ``css(selector)`` yields a list of lxml-style elements -- confirm.
    """
    if 'pheedo_origlink' not in entry:
        return None
    link = entry['pheedo_origlink']
    if '_blog.' in link:
        return None

    # Only the final path segment is rewritten; the rest of the URL is kept.
    prefix, page = link.rsplit('/', 1)
    print_url = '/'.join((prefix, page.replace('_story.', '_print.')))
    _, css = fetch_page(session, print_url)

    container = css('#content').pop()

    # Headline: read its text, then drop it so it is not repeated in the body.
    heading = container.find('h1')
    headline = heading.xpath('string()').strip()
    container.remove(heading)

    # The <h3> carries both the timestamp <span> and the author text.
    meta = container.find('h3')
    stamp = meta.find('span')
    epoch_ms = int(stamp.get('epochtime'))
    published = datetime.fromtimestamp(epoch_ms / 1000)
    # Remove the span first so only the author text remains in the <h3>.
    meta.remove(stamp)
    byline = meta.xpath('string()').strip()
    container.remove(meta)

    # Whatever is left of the container is serialized as the body.
    markup = html.tostring(container)
    return dict(
        title=headline,
        author=byline,
        link=link,
        key=link,
        date=published,
        attribution='The Washington Post',
        body=markup,
    )
Пример #2
0
def get_article(session, entry):
    """Build an article dict for a feed entry attributed to The New York Times.

    The original link is read from ``entry['pheedo_origlink']``.  Everything
    after the last ``?`` is replaced with ``?pagewanted=print`` and the page
    is fetched with the original link sent as the ``Referer`` header.

    Returns ``None`` when the entry has no ``pheedo_origlink`` key or the
    page has no ``.timestamp`` element; otherwise a dict with the keys
    ``title``, ``author``, ``link``, ``key``, ``date``, ``attribution`` and
    ``body``.

    Raises ``ValueError`` when the timestamp text matches neither accepted
    date format.

    NOTE(review): assumes ``fetch_page`` returns ``(doc, css)`` where
    ``css(selector)`` yields a list of lxml-style elements -- confirm.
    """
    if 'pheedo_origlink' not in entry:
        return None
    link = entry['pheedo_origlink']

    base = link.rsplit('?', 1)[0]
    print_url = base + '?pagewanted=print'
    _, css = fetch_page(session, print_url, headers={'Referer': link})

    def select(primary, fallback):
        # Use the fallback selector only when the primary matches nothing.
        matches = css(primary)
        if len(matches) == 0:
            matches = css(fallback)
        return matches

    stamps = css('.timestamp')
    if len(stamps) == 0:
        return None
    stamp_text = stamps.pop().text.strip()
    try:
        published = datetime.strptime(stamp_text, '%B %d, %Y')
    except ValueError:
        # Second accepted form: same date followed by a trailing comma.
        published = datetime.strptime(stamp_text, '%B %d, %Y,')

    headlines = select('nyt_headline', 'h3.entry-title')
    bylines = select('nyt_byline', '.byline')
    bodies = select('#articleBody', '.entry-content')

    title = headlines.pop().text
    author = bylines.pop().xpath('string()').strip()
    markup = html.tostring(bodies.pop())
    return dict(
        title=title,
        author=author,
        link=link,
        key=link,
        date=published,
        attribution='The New York Times',
        body=markup,
    )
Пример #3
0
def get_article(session, entry):
    """Build an article dict from the page at ``entry.link``.

    The body is the ``stringify`` output of every ``.ap_para`` element joined
    with newlines.  The date is parsed from the ``.ap_dt_stmp`` text using
    the format ``'%b. %d, %Y'``.  The author comes from ``.ap_by`` and is an
    empty string when no such element exists.  The attribution is the string
    value of the ``#CopyrightLine`` element.

    Returns a dict with the keys ``title``, ``author``, ``link``, ``key``
    (``entry.id``), ``date``, ``attribution`` and ``body``.

    NOTE(review): assumes ``fetch_page`` returns ``(doc, css)`` where
    ``css(selector)`` yields a list of lxml-style elements -- confirm.
    """
    _, css = fetch_page(session, entry.link)

    paragraphs = (stringify(node) for node in css('.ap_para'))
    text = '\n'.join(paragraphs)

    stamp = css('.ap_dt_stmp').pop()
    published = datetime.strptime(stamp.text, '%b. %d, %Y')

    bylines = css('.ap_by')
    if len(bylines):
        author = bylines.pop().text
    else:
        author = ''

    headline = css('.ap_head').pop().text
    return {
        'title': headline,
        'author': author,
        'link': entry.link,
        'key': entry.id,
        'date': published,
        'attribution': css('#CopyrightLine').pop().xpath('string()'),
        'body': text,
    }
Пример #4
0
def get_article(session, entry):
    """Build an article dict from the page at ``entry.link``.

    The publication date is taken from the ``datetime`` attribute of a
    ``<time>`` element when one is usable, otherwise from the text of a
    ``.date`` element (its first space-separated word is dropped, then
    ``'%d %B %Y'`` and ``'%d %b %Y'`` are tried).

    Returns ``None`` when no date source is present or the page has no
    ``#article-body-blocks`` element; otherwise a dict with the keys
    ``title``, ``author``, ``link``, ``key`` (``entry.id``), ``date``,
    ``attribution`` and ``body``.

    Raises ``ValueError`` when a date string is present but matches none of
    the accepted formats.

    NOTE(review): assumes ``fetch_page`` returns ``(doc, css)`` where
    ``css(selector)`` yields a list of lxml-style elements -- confirm.
    """
    doc, css = fetch_page(session, entry.link)

    time = css("time")
    date_ = css(".date")
    date = None
    if len(time):
        stamp = time.pop().get("datetime")
        # A <time> element without a datetime attribute is not usable;
        # fall through to the ".date" element instead of failing on None.
        if stamp:
            date = _parse_time_attribute(stamp)
    if date is None and len(date_):
        # Drop the first word of the text before parsing the rest.
        date_text = date_.pop().text.split(" ", 1)[-1]
        try:
            date = datetime.strptime(date_text, "%d %B %Y")
        except ValueError:
            date = datetime.strptime(date_text, "%d %b %Y")
    if date is None:
        # No date source on the page: skip the entry, in the same way a
        # missing body is handled below, rather than hitting an unbound name.
        return None

    body = css("#article-body-blocks")
    if not len(body):
        return None
    body = html.tostring(body.pop())
    author = css("a.contributor")
    author = author.pop().text.strip() if len(author) else ""
    article = {
        "title": css("h1").pop().text,
        "author": author,
        "link": entry.link,
        "key": entry.id,
        "date": date,
        "attribution": css("#copyright-links li").pop().xpath("string()"),
        "body": body,
    }
    return article


def _parse_time_attribute(stamp):
    """Parse the leading date (and time, when present) of a ``datetime`` attribute.

    ``YYYY-MM-DDTHH:MM`` is tried on the first 16 characters, then
    ``YYYY-MM-DD`` on the first 10.  Anything after those prefixes (seconds,
    a zone name or an offset) is ignored, so the result is a naive
    ``datetime``.

    Raises ``ValueError`` when neither form matches.
    """
    try:
        # "YYYY-MM-DDTHH:MM" is 16 characters long; slicing to 15 would cut
        # off the last minute digit and parse e.g. 17:45 as 17:04.
        return datetime.strptime(stamp[:16], "%Y-%m-%dT%H:%M")
    except ValueError:
        return datetime.strptime(stamp[:10], "%Y-%m-%d")