def get_article(session, entry):
    """Fetch and parse a Washington Post article referenced by a feed entry.

    Returns an article dict, or None for entries without a pheedo origin
    link or for blog posts (which use a different page template).
    """
    if 'pheedo_origlink' not in entry:
        return
    link = entry['pheedo_origlink']
    # Blog posts don't follow the _story/_print URL scheme; skip them.
    if '_blog.' in link:
        return
    # Rewrite the story URL to its printer-friendly variant.
    base, page = link.rsplit('/', 1)
    url = base + '/' + page.replace('_story.', '_print.')
    doc, css = fetch_page(session, url)

    content = css('#content').pop()

    heading = content.find('h1')
    title = heading.xpath('string()').strip()
    content.remove(heading)

    meta = content.find('h3')
    stamp = meta.find('span')
    # 'epochtime' is in milliseconds, hence the division by 1000.
    date = datetime.fromtimestamp(int(stamp.get('epochtime')) / 1000)
    meta.remove(stamp)
    author = meta.xpath('string()').strip()
    content.remove(meta)

    return {
        'title': title,
        'author': author,
        'link': link,
        'key': link,
        'date': date,
        'attribution': 'The Washington Post',
        'body': html.tostring(content),
    }
def get_article(session, entry):
    """Fetch and parse a New York Times article referenced by a feed entry.

    Returns an article dict, or None when the entry lacks an origin link
    or the fetched page carries no timestamp.
    """
    if 'pheedo_origlink' not in entry:
        return
    link = entry['pheedo_origlink']
    # Request the single-page print view of the story.
    url = link.rsplit('?', 1)[0] + '?pagewanted=print'
    doc, css = fetch_page(session, url, headers={'Referer': link})

    stamps = css('.timestamp')
    if not len(stamps):
        return
    date_text = stamps.pop().text.strip()
    try:
        date = datetime.strptime(date_text, '%B %d, %Y')
    except ValueError:
        # Some pages carry a trailing comma after the year.
        date = datetime.strptime(date_text, '%B %d, %Y,')

    # Each element comes in two flavors depending on the page template.
    headline = css('nyt_headline')
    if not len(headline):
        headline = css('h3.entry-title')
    byline = css('nyt_byline')
    if not len(byline):
        byline = css('.byline')
    body = css('#articleBody')
    if not len(body):
        body = css('.entry-content')

    return {
        'title': headline.pop().text,
        'author': byline.pop().xpath('string()').strip(),
        'link': link,
        'key': link,
        'date': date,
        'attribution': 'The New York Times',
        'body': html.tostring(body.pop()),
    }
def get_article(session, entry):
    """Fetch and parse an Associated Press article referenced by a feed entry.

    Returns an article dict; the attribution is read from the page's
    copyright line rather than hard-coded.
    """
    doc, css = fetch_page(session, entry.link)

    paragraphs = [stringify(p) for p in css('.ap_para')]
    published = datetime.strptime(css('.ap_dt_stmp').pop().text, '%b. %d, %Y')

    bylines = css('.ap_by')
    author = bylines.pop().text if len(bylines) else ''

    return {
        'title': css('.ap_head').pop().text,
        'author': author,
        'link': entry.link,
        'key': entry.id,
        'date': published,
        'attribution': css('#CopyrightLine').pop().xpath('string()'),
        'body': '\n'.join(paragraphs),
    }
def get_article(session, entry):
    """Fetch and parse a Guardian article referenced by a feed entry.

    Returns an article dict, or None when the page has no parsable
    date or no article body.
    """
    doc, css = fetch_page(session, entry.link)

    time_tags = css("time")
    date_tags = css(".date")
    if len(time_tags):
        # Machine-readable stamp, presumably ISO-style, e.g.
        # "2012-05-14T10:30+01:00" — TODO confirm against live pages.
        # Truncate to 16 chars ("YYYY-MM-DDTHH:MM") so any timezone
        # suffix is dropped without cutting into the minutes field; the
        # previous [:15] slice silently lost the final minute digit
        # (10:30 parsed as 10:03).
        raw = time_tags.pop().get("datetime")[:16]
        try:
            date = datetime.strptime(raw, "%Y-%m-%dT%H:%M%Z")
        except ValueError:
            try:
                date = datetime.strptime(raw, "%Y-%m-%dT%H:%M")
            except ValueError:
                date = datetime.strptime(raw, "%Y-%m-%d")
    elif len(date_tags):
        # Human-readable stamp; drop the leading word (weekday) before
        # parsing "<day> <month> <year>".
        raw = date_tags.pop().text.split(" ", 1)[-1]
        try:
            date = datetime.strptime(raw, "%d %B %Y")
        except ValueError:
            date = datetime.strptime(raw, "%d %b %Y")
    else:
        # No recognizable date element: skip the entry instead of hitting
        # a NameError on the unbound `date` below (previous behavior).
        return None

    body = css("#article-body-blocks")
    if not len(body):
        return None
    body = html.tostring(body.pop())

    author = css("a.contributor")
    author = author.pop().text.strip() if len(author) else ""

    return {
        "title": css("h1").pop().text,
        "author": author,
        "link": entry.link,
        "key": entry.id,
        "date": date,
        "attribution": css("#copyright-links li").pop().xpath("string()"),
        "body": body,
    }