def test():
    """Fetch the URL given as the first CLI argument and dump what Article extracts.

    The page is downloaded with ``requests`` (using a desktop-browser
    User-Agent header) and the decoded markup is handed to ``Article.parse``.
    Two elapsed times are printed, both measured from just before the
    ``Article`` object is constructed: one after construction and one after
    parsing. The HTTP download itself happens before the timer starts and is
    therefore not included. Finally the title, top image, image list, text
    and the result of ``is_valid_body()`` are printed.
    """
    target = sys.argv[1]
    request_headers = {
        'User-Agent': (
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/39.0.2171.95 Safari/537.36'
        ),
    }
    response = requests.get(target, headers=request_headers)

    started = time.time()
    article = Article(target, language='zh', keep_article_html=True)
    # Single-argument print(...) behaves the same as a print statement on
    # Python 2 and as a function call on Python 3.
    print(time.time() - started)

    markup = to_unicode(response.content)
    article.parse(url=target, html=markup)
    print(time.time() - started)

    # Attributes are read and printed one at a time, in this order.
    for field in ('title', 'top_img', 'imgs', 'text'):
        print(getattr(article, field))
    # print(article.article_html)
    print(article.is_valid_body())
class ArticleParser(object):
    """Download and parse a single article and expose its main fields.

    Wraps an ``Article`` instance and offers read access to its title, body
    text, source URL and publication date, plus ``save()`` to dump those
    fields to a text file.

    The class derives from ``object`` explicitly: on Python 2 an old-style
    class would not honour the ``date`` property setter (assignment would
    just create an instance attribute). This is harmless on Python 3.
    """

    def __init__(self, url):
        """Download and parse the article at *url* immediately.

        Both the download and the parse run inside the constructor, so any
        error they raise propagates to the caller of ``ArticleParser(url)``.
        """
        self._article = Article(url)
        self._article.download()
        self._article.parse()
        # Cache for the ``text`` property; None means "not computed yet".
        self._text = None

    @property
    def title(self):
        """Title reported by the wrapped article."""
        return self._article.title

    @property
    def text(self):
        """Body text of the article, computed on first access and cached.

        When the wrapped article's ``is_valid_body()`` is true its own
        ``text`` is used. Otherwise the paragraphs that ``justext`` returns
        for the article's HTML (with the "English" stoplist) are joined with
        newlines.
        """
        # Compare against None instead of testing truthiness so that an
        # empty body is cached as well and not recomputed on every access.
        if self._text is None:
            if self._article.is_valid_body():
                self._text = self._article.text
            else:
                paragraphs = justext.justext(
                    self._article.html, justext.get_stoplist("English"))
                self._text = '\n'.join(p.text for p in paragraphs)
        return self._text

    @property
    def source_url(self):
        """Source URL reported by the wrapped article."""
        return self._article.source_url

    @property
    def date(self):
        """Publication date stored on the wrapped article (``publish_date``)."""
        return self._article.publish_date

    @date.setter
    def date(self, value):
        """Override the publication date stored on the wrapped article."""
        self._article.publish_date = value

    def save(self):
        """Write the article to ``../news/<title>`` as plain text.

        The file name is the title with spaces replaced by underscores; path
        separators in the title are replaced as well so the file always
        lands directly inside ``../news/``. The file holds four
        newline-terminated fields, in order: ISO-formatted date, title,
        source URL, text.

        Raises:
            AttributeError: if ``date`` has no ``isoformat()`` method (for
                example when it is None). This is raised before the file is
                opened, so no empty file is left behind.
        """
        import os  # local import: only needed for the separator constants

        filename = self.title.replace(" ", "_")
        # Titles come from page content and may contain "/" (or "\\" on
        # Windows); left in place they would address a different directory.
        for separator in (os.sep, os.altsep):
            if separator:
                filename = filename.replace(separator, "_")

        # Evaluate every field before touching the file system so a failure
        # here cannot leave a truncated file on disk.
        fields = [self.date.isoformat(), self.title, self.source_url,
                  self.text]

        # The context manager closes the handle even if a write fails.
        with open("../news/" + filename, "w") as handle:
            for field in fields:
                handle.write(field + "\n")
def get_news_data(url, num_words=None):
    """Retrieve information about the news article at *url*.

    Downloads and parses the article, runs its ``nlp()`` step, and bundles
    the result into a ``News`` object.

    Args:
        url: address of the article to fetch.
        num_words: when given, it is forwarded to ``summarize`` as
            ``words=``; when None, ``summarize`` is called with the text
            only.

    Returns:
        ``News`` built from, in positional order: title, author list,
        meta description, full text, the article's own summary, the summary
        produced by ``summarize``, a validity flag (``is_valid_body()`` and
        ``is_valid_url()``), the ``og`` site name, and the meta generator.
        Meta fields that the page does not provide are passed as None
        instead of raising ``KeyError``.
    """
    article = Article(url)
    article.download()
    article.parse()
    article.nlp()

    metadata = article.meta_data

    if num_words is None:
        summary_ = summarize(article.text)
    else:
        summary_ = summarize(article.text, words=num_words)

    # The meta-tag author goes first when present; a missing or empty value
    # is skipped so the list never starts with a placeholder entry.
    primary_author = metadata.get('author')
    authors = [primary_author] if primary_author else []
    for author in article.authors:
        if author not in authors:
            authors.append(author)

    # Many pages carry no 'og' metadata at all; only look up the site name
    # when the 'og' entry is actually a mapping.
    open_graph = metadata.get('og')
    if isinstance(open_graph, dict):
        site_name = open_graph.get('site_name')
    else:
        site_name = None

    return News(
        article.title,
        authors,
        metadata.get('description'),
        article.text,
        article.summary,
        summary_,
        article.is_valid_body() and article.is_valid_url(),
        site_name,
        metadata.get('generator'),
    )