def get_topics_non_dictionary_overlapping(num_news, k, url='http://cnn.com'):
    """Build a word graph from the non-dictionary words of scraped news
    articles and compute k overlapping clusters over it.

    Parameters: num_news -- how many articles to fetch from `url`;
    k -- number of overlapping clusters to extract; url -- news site to
    scrape (defaults to CNN).

    Returns the constructed graph. The clusters are computed but not
    returned (the printing call below is currently disabled).
    """
    articles = get_news(url, num_news)

    # Keep only words absent from the dictionary; do not stem them.
    builder = SimpleGraphBuilder(text_processing.only_non_dictionary_words,
                                 stem_words=False)
    builder.load_texts(articles)
    graph = builder.create_graph()
    print("Graph built")

    clusters = graph_cluster.get_overlap_clusters(graph, k, 1)
    # print_topics_from_partitions(graph, clusters, 10)
    return graph
def get_topics_by_standard_words(num_news, draw=False, url='http://cnn.com'):
    """Scrape news articles, build a word graph after punctuation/stop-word
    cleaning, partition it with Louvain community detection, and print the
    top words of each community.

    Parameters: num_news -- how many articles to fetch from `url`;
    draw -- when True, render the partitioned graph with matplotlib;
    url -- news site to scrape (defaults to CNN).

    Returns the constructed graph.
    """
    articles = get_news(url, num_news)

    builder = SimpleGraphBuilder(text_processing.clean_punctuation_and_stopwords)
    builder.load_texts(articles)
    graph = builder.create_graph()
    print("Graph built")

    # Louvain community detection: each community is a candidate topic.
    partition = community.best_partition(graph)
    words_by_part = get_words_by_partition(partition)

    mod = community.modularity(partition, graph)
    print("modularity:", mod)

    print_topics_from_partitions(graph, words_by_part, 10)

    if draw:
        # Color every node by the id of the community it belongs to.
        values = [partition.get(node) for node in graph.nodes()]
        nx.draw_spring(graph,
                       cmap=plt.get_cmap('jet'),
                       node_color=values,
                       node_size=30,
                       with_labels=False)
        plt.show()

    return graph
# NOTE(review): this chunk starts mid-function -- the enclosing `def`
# (presumably get_words_by_partition) lies above the visible region.
# These two statements group elements under their partition id and
# return the resulting mapping.
        words_by_part[partition[elem]].append(elem)
    return words_by_part


# --- script section: scrape articles, time the retrieval, build the graph ---
# NOTE(review): this entire script section appears again later in the file
# with only whitespace differences -- possibly merge residue; confirm which
# copy is intended to survive.
t0 = time.time()
news = NewsScraper('http://cnn.com', nthreads = 10)
news.pull()
news.scrape(10)  # number of articles to scrape
# Lazy generator over the article bodies; consumed once by load_texts below.
texts = (article['text'] for article in news.polished())
t1 = time.time()
print "Data retrieved in %.2f sec" %(t1-t0)

# Create a graph builder
gb = SimpleGraphBuilder(text_processing.clean_punctuation_and_stopwords)
gb.load_texts(texts)

# Show texts in the builder
# for text in texts:
#     print text
#     print "##################################################"
#
# print "##################################################"
# print "TOKENIZED SENTENCES"
# print "##################################################"

# Show tokenized sentences (only the first loaded text).
for text in gb.text_sentences[:1]:
    print "##################################################"
# NOTE(review): this chunk starts mid-function -- the enclosing `def` is not
# visible here; this is its final statement returning the partition mapping.
    return words_by_part


# --- script section: scrape articles, time the retrieval, build the graph ---
# NOTE(review): this script section duplicates the one earlier in the file
# (only whitespace differs) -- possibly merge residue; confirm which copy
# should be kept.
t0 = time.time()
news = NewsScraper('http://cnn.com', nthreads=10)
news.pull()
news.scrape(10)  # number of articles to scrape
# Lazy generator over the article bodies; consumed once by load_texts below.
texts = (article['text'] for article in news.polished())
t1 = time.time()
print "Data retrieved in %.2f sec" % (t1 - t0)

# Create a graph builder
gb = SimpleGraphBuilder(text_processing.clean_punctuation_and_stopwords)
gb.load_texts(texts)

# Show texts in the builder
# for text in texts:
#     print text
#     print "##################################################"
#
# print "##################################################"
# print "TOKENIZED SENTENCES"
# print "##################################################"

# Show tokenized sentences (only the first loaded text).
for text in gb.text_sentences[:1]:
    print "##################################################"