Пример #1
0
def test():
    """Fetch the URL given on the command line and print what was extracted.

    The page is requested with a desktop-browser User-Agent header, the
    response body is converted with ``to_unicode`` and handed to
    ``Article.parse``. Two elapsed-time readings are printed (after building
    the ``Article`` and after parsing); the clock starts once the HTTP
    request has returned, so network time is not included.

    Raises:
        IndexError: if no URL was passed as ``sys.argv[1]``.
    """
    url = sys.argv[1]
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
    r = requests.get(url, headers=headers)
    t = time.time()
    a = Article(url, language='zh', keep_article_html=True)
    # Single-argument print(...) behaves the same on Python 2 and Python 3,
    # whereas the bare print statement is a SyntaxError on Python 3.
    print(time.time() - t)
    html = to_unicode(r.content)
    a.parse(url=url, html=html)

    print(time.time() - t)

    print(a.title)
    print(a.top_img)
    print(a.imgs)
    print(a.text)
    # print(a.article_html)
    print(a.is_valid_body())
Пример #2
0
class ArticleParser:
    """Thin wrapper around an ``Article`` that adds a text fallback and saving.

    The wrapped article is downloaded and parsed as soon as the parser is
    constructed. ``title``, ``source_url`` and ``date`` are read straight from
    the article; ``date`` can also be assigned, which overwrites the
    article's ``publish_date``.
    """

    def __init__(self, url):
        """Create the article for *url*, then download and parse it."""
        self._article = Article(url)
        self._article.download()
        self._article.parse()
        # Lazily computed by the ``text`` property; None means "not yet".
        self._text = None

    @property
    def title(self):
        """Title reported by the wrapped article."""
        return self._article.title

    @property
    def text(self):
        """Article body text, computed on first access and then cached.

        Uses the article's own ``text`` when ``is_valid_body()`` is true;
        otherwise joins, one per line, the ``text`` of every paragraph that
        ``justext.justext`` returns for the article HTML with the "English"
        stoplist.
        """
        # Compare against the None sentinel rather than truthiness so that an
        # empty extraction result is cached too instead of being recomputed
        # on every access.
        if self._text is None:
            if self._article.is_valid_body():
                self._text = self._article.text
            else:
                self._text = '\n'.join(p.text for p in justext.justext(
                    self._article.html, justext.get_stoplist("English")))

        return self._text

    @property
    def source_url(self):
        """``source_url`` of the wrapped article."""
        return self._article.source_url

    @property
    def date(self):
        """``publish_date`` of the wrapped article."""
        return self._article.publish_date

    @date.setter
    def date(self, value):
        self._article.publish_date = value

    def save(self):
        """Write date, title, source URL and text to ``../news/<title>``.

        The file name is the title with spaces replaced by underscores, and
        the path is relative to the current working directory. Each field is
        written on its own line, with the date in ISO format.

        Raises:
            AttributeError: if ``date`` is None (it has no ``isoformat``).
        """
        # Render every field before touching the filesystem: if one of them
        # fails (e.g. the date is None) no empty or truncated file is left
        # behind.
        fields = (self.date.isoformat(), self.title, self.source_url,
                  self.text)
        path = "../news/" + self.title.replace(" ", "_")
        # The context manager closes the handle even when a write fails.
        with open(path, "w") as f:
            f.write("".join(field + "\n" for field in fields))
Пример #3
0
def get_news_data(url, num_words=None):
    """Download the article at *url* and bundle its details into a ``News``.

    The article is downloaded, parsed and passed through ``nlp()``. A second
    summary is produced by calling ``summarize`` on the article text; when
    *num_words* is given it is forwarded as the ``words`` keyword, otherwise
    ``summarize`` is called with the text only.

    The author list starts with ``meta_data['author']`` and is extended with
    each entry of ``article.authors`` not already present.

    Raises:
        KeyError: if a metadata key read here is absent and the metadata
            mapping does not supply a default.
    """
    page = Article(url)
    page.download()
    page.parse()
    page.nlp()

    meta = page.meta_data

    # Only pass ``words`` when the caller asked for a specific length.
    summarize_options = {} if num_words is None else {'words': num_words}
    generated_summary = summarize(page.text, **summarize_options)

    # Metadata author first, then any parsed authors not already listed.
    authors = [meta['author']]
    for name in page.authors:
        if name in authors:
            continue
        authors.append(name)

    # Fields are gathered in the order the News constructor expects them.
    title = page.title
    description = meta['description']
    body = page.text
    nlp_summary = page.summary
    looks_valid = page.is_valid_body() and page.is_valid_url()
    site_name = meta['og']['site_name']
    generator = meta['generator']

    return News(title, authors, description, body, nlp_summary,
                generated_summary, looks_valid, site_name, generator)