import time

# NewsScraper, SimpleGraphBuilder and text_processing come from this
# project's own modules.


def get_news(url, number, nthreads=10):
    """ Retrieves news from the specified source """
    t0 = time.time()
    news = NewsScraper(url, nthreads=nthreads)
    news.pull()
    news.scrape(number)
    texts = [article['text'] for article in news.polished()]
    print "Scraped %d articles in %.2f sec" % (len(texts), time.time() - t0)
    return texts
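# Example usage (a sketch, not part of the original script; the URL matches
# the one used below and the article count is arbitrary):
#
#     texts = get_news('http://cnn.com', 10, nthreads=10)
#     for text in texts[:2]:
#         print text[:80]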
{ "word1": 2, "word2": 1, "word3": 1 .... } it returns the inverse dictionary: { 1: ["word1", "word3"], 2: ["word2"] .... } """ words_by_part = {} for elem in partition: if partition[elem] not in words_by_part: words_by_part[partition[elem]] = [elem] else: words_by_part[partition[elem]].append(elem) return words_by_part t0 = time.time() news = NewsScraper('http://cnn.com', nthreads = 10) news.pull() news.scrape(10) texts = (article['text'] for article in news.polished()) t1 = time.time() print "Data retrieved in %.2f sec" %(t1-t0) # Create a graph builder gb = SimpleGraphBuilder(text_processing.clean_punctuation_and_stopwords) gb.load_texts(texts) # Show texts in the builder # for text in texts: # print text