def get_news_by_url(url):
    """Scrape one 'elfinancierocr' article page into a news message.

    Downloads ``url``, parses it with BeautifulSoup and fills a ``News``
    object with title, post time, author, URL, retrieval date, source,
    content and encoding, then passes the message through
    ``message.add_embers_ids``.

    Args:
        url: Address of the article page to fetch.

    Returns:
        The value produced by ``message.add_embers_ids(news.news)``, or
        ``None`` if any step raised (the failure is logged through
        ``log.exception``).
    """
    news = News()
    try:
        # Close the HTTP response explicitly; BeautifulSoup only needs it
        # while the document is being built.
        response = urllib2.urlopen(url)
        try:
            soup = BeautifulSoup(response)
        finally:
            response.close()

        # title
        title = soup.find("div", "pg-story-head md").find("h2").text
        news.set_title(title)

        # The dateline is split on "/": field 0 is used as the author and
        # field 1 as the post time.
        dateline = soup.find("p", "dateline").text
        author_posttime = (
            dateline.replace("\n", "").lower().replace("\t", "").split("/")
        )

        # postTime
        raw_time = author_posttime[1]
        # Remember the meridiem marker before stripping it: parsing "%I"
        # without it would store every afternoon time as a morning time.
        is_pm = "pm" in raw_time
        is_am = "am" in raw_time
        clock_text = raw_time.replace("pm", "").replace("am", "").strip()
        t_format = "%d %b %Y, %I:%M"
        parsed = datetime.strptime(clock_text, t_format)
        # Apply the 12-hour -> 24-hour correction by hand so it does not
        # depend on the locale's %p strings.
        if is_pm:
            parsed = parsed.replace(hour=parsed.hour % 12 + 12)
        elif is_am:
            parsed = parsed.replace(hour=parsed.hour % 12)
        news.set_posttime(parsed.isoformat())

        # author
        # NOTE(review): the value is lower-cased and not stripped because it
        # comes from the normalised dateline -- confirm this is intended.
        author = author_posttime[0]
        news.set_author(author)

        # url
        news.set_url(url)

        # date: moment of retrieval, in UTC
        date = datetime.utcnow().isoformat()
        news.set_date(date)

        # source
        source = 'elfinancierocr'
        news.set_source(source)

        # content: text of every <p> in the story body, joined by spaces
        paragraphs = soup.find("div", "pg-story-body mce").find_all('p')
        content = " ".join(unicode(p.text) for p in paragraphs)
        news.set_content(content)

        # encoding
        encoding = 'utf-8'
        news.set_encoding(encoding)

        news.news = message.add_embers_ids(news.news)
        return news.news
    except Exception:
        # Best-effort scraping: log and skip the article, but let
        # KeyboardInterrupt / SystemExit propagate.
        log.exception("Exceptopn when extracting %s %s" % (url, sys.exc_info()[0]))
        return None
def get_news_by_url(url):
    """Scrape one 'lta_reuters' article page into a news message.

    Downloads ``url``, parses it with BeautifulSoup and fills a ``News``
    object with title, post time, author, URL, retrieval date, source,
    content and encoding, then passes the message through
    ``message.add_embers_ids``.

    Args:
        url: Address of the article page to fetch.

    Returns:
        The value produced by ``message.add_embers_ids(news.news)``, or
        ``None`` if any step raised (the failure is logged through
        ``log.exception``).
    """
    news = News()
    try:
        # Close the HTTP response explicitly; BeautifulSoup only needs it
        # while the document is being built.
        response = urllib2.urlopen(url)
        try:
            soup = BeautifulSoup(response)
        finally:
            response.close()

        # title: text of the first <h1> on the page
        title = soup.find_all("h1")[0].text
        news.set_title(title)

        # postTime: taken from the REVISION_DATE <meta> tag
        # NOTE(review): strptime's %Z accepts only a limited set of zone
        # names and yields a naive datetime -- confirm the feed always uses
        # a zone name it recognises.
        post_time = soup.select('meta[name="REVISION_DATE"]')[0]["content"]
        t_format = "%a %b %d %H:%M:%S %Z %Y"
        post_time = datetime.strptime(post_time, t_format).isoformat()
        news.set_posttime(post_time)

        # author: taken from the Author <meta> tag
        author = soup.select('meta[name="Author"]')[0]["content"]
        news.set_author(author)

        # url
        news.set_url(url)

        # date: moment of retrieval, in UTC
        date = datetime.utcnow().isoformat()
        news.set_date(date)

        # source
        source = "lta_reuters"
        news.set_source(source)

        # content: text of every <p> under the element with
        # id="resizeableText", joined by spaces
        paragraphs = soup.find(id="resizeableText").find_all("p")
        content = " ".join(unicode(p.text) for p in paragraphs)
        news.set_content(content)

        # encoding
        encoding = "utf-8"
        news.set_encoding(encoding)

        news.news = message.add_embers_ids(news.news)
        return news.news
    except Exception:
        # Best-effort scraping: log and skip the article, but let
        # KeyboardInterrupt / SystemExit propagate.
        log.exception("Exceptopn when extracting %s %s" % (url, sys.exc_info()[0]))
        return None