Пример #1
0
def get_news(url, number, nthreads=10):
    """
    Retrieve news article texts from the specified source.

    Parameters:
        url (str): root URL of the news site to scrape.
        number (int): how many articles to scrape.
        nthreads (int): worker threads used by the scraper (default 10).

    Returns:
        list of str: the plain-text body of each scraped article.
    """
    news = NewsScraper(url, nthreads=nthreads)
    news.pull()
    news.scrape(number)
    texts = [article['text'] for article in news.polished()]
    # Parenthesized print is valid under both Python 2 and Python 3
    # (the original bare `print` statement is Python-2-only).
    print("Scraped %d articles" % len(texts))

    return texts
Пример #2
0
    { "word1": 2, "word2": 1, "word3": 1 .... }
    it returns the inverse dictionary:
    { 1: ["word1", "word3"], 2: ["word2"] .... }
    """
    words_by_part = {}
    for elem in partition:
        if partition[elem] not in words_by_part:
            words_by_part[partition[elem]] = [elem]
        else:
            words_by_part[partition[elem]].append(elem)

    return words_by_part

t0 = time.time()

# Scrape 10 articles from CNN using 10 worker threads.
news = NewsScraper('http://cnn.com', nthreads=10)
news.pull()
news.scrape(10)
# Lazy generator of article bodies; it is consumed once by gb.load_texts below.
texts = (article['text'] for article in news.polished())

t1 = time.time()
# Parenthesized print works on both Python 2 and Python 3
# (the original bare `print` statement is Python-2-only).
print("Data retrieved in %.2f sec" % (t1 - t0))

# Create a graph builder from the project's text-cleaning function.
gb = SimpleGraphBuilder(text_processing.clean_punctuation_and_stopwords)

gb.load_texts(texts)

# Show texts in the builder.
# NOTE(review): re-enabling this loop before load_texts would exhaust the
# `texts` generator, leaving the builder empty — materialize a list first.
# for text in texts:
#     print text
Пример #3
0
    it returns the inverse dictionary:
    { 1: ["word1", "word3"], 2: ["word2"] .... }
    """
    words_by_part = {}
    for elem in partition:
        if partition[elem] not in words_by_part:
            words_by_part[partition[elem]] = [elem]
        else:
            words_by_part[partition[elem]].append(elem)

    return words_by_part


t0 = time.time()

news = NewsScraper('http://cnn.com', nthreads=10)
news.pull()
news.scrape(10)
texts = (article['text'] for article in news.polished())

t1 = time.time()
print "Data retrieved in %.2f sec" % (t1 - t0)

# Create a graph builder
gb = SimpleGraphBuilder(text_processing.clean_punctuation_and_stopwords)

gb.load_texts(texts)

# Show texts in the builder
# for text in texts:
#     print text