import spacy
from gensim.models import word2vec
from scattertext import SampleCorpora, CorpusFromParsedDocuments, word_similarity_explorer_gensim, Word2VecFromParsedCorpus
from scattertext.termsignificance import ScaledFScoreSignificance


def main():
    nlp = spacy.load('en')
    # nlp = whitespace_nlp_with_sentences
    convention_df = SampleCorpora.ConventionData2012.get_data()
    convention_df['parsed'] = convention_df.text.apply(nlp)
    corpus = (CorpusFromParsedDocuments(convention_df, category_col='party', parsed_col='parsed')
              .build()
              .get_unigram_corpus())
    # Skip-gram model with hierarchical softmax; ten passes over the corpus.
    model = word2vec.Word2Vec(size=100, alpha=0.025, window=5, min_count=5,
                              max_vocab_size=None, sample=0, seed=1, workers=1,
                              min_alpha=0.0001, sg=1, hs=1, negative=0,
                              cbow_mean=0, iter=10, null_word=0,
                              trim_rule=None, sorted_vocab=1)
    html = word_similarity_explorer_gensim(corpus,
                                           category='democrat',
                                           target_term='jobs',
                                           category_name='Democratic',
                                           not_category_name='Republican',
                                           minimum_term_frequency=5,
                                           width_in_pixels=1000,
                                           metadata=convention_df['speaker'],
                                           word2vec=Word2VecFromParsedCorpus(corpus, model).train(),
                                           term_significance=ScaledFScoreSignificance(),
                                           max_p_val=0.05,
                                           save_svg_button=True,
                                           d3_url='scattertext/data/viz/scripts/d3.min.js',
                                           d3_scale_chromatic_url='scattertext/data/viz/scripts/d3-scale-chromatic.v1.min.js')
    open('./demo_gensim_similarity.html', 'wb').write(html.encode('utf-8'))
    print('Open ./demo_gensim_similarity.html in Chrome or Firefox.')
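# A spaCy-free variant (a sketch): the commented-out whitespace_nlp_with_sentences
# line above refers to scattertext's bundled whitespace tokenizer, which can stand
# in for the spaCy parse when only unigram statistics are needed; the rest of the
# pipeline is unchanged.
from scattertext import whitespace_nlp_with_sentences

convention_df = SampleCorpora.ConventionData2012.get_data()
convention_df['parsed'] = convention_df.text.apply(whitespace_nlp_with_sentences)
corpus = (CorpusFromParsedDocuments(convention_df, category_col='party', parsed_col='parsed')
          .build()
          .get_unigram_corpus())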
def main():
    nlp = spacy.load('en')  # spacy.en.English() was removed in spaCy 2.x
    convention_df = SampleCorpora.ConventionData2012.get_data()
    convention_df['parsed'] = convention_df.text.apply(nlp)
    corpus = CorpusFromParsedDocuments(convention_df, category_col='party', parsed_col='parsed').build()
    # 300-dimensional skip-gram model; a single pass (iter=1) over the corpus.
    model = word2vec.Word2Vec(size=300, alpha=0.025, window=5, min_count=5,
                              max_vocab_size=None, sample=0, seed=1, workers=1,
                              min_alpha=0.0001, sg=1, hs=1, negative=0,
                              cbow_mean=0, iter=1, null_word=0,
                              trim_rule=None, sorted_vocab=1)
    html = word_similarity_explorer_gensim(corpus,
                                           category='democrat',
                                           category_name='Democratic',
                                           not_category_name='Republican',
                                           target_term='jobs',
                                           minimum_term_frequency=5,
                                           pmi_filter_thresold=4,  # spelled this way in older scattertext releases
                                           width_in_pixels=1000,
                                           metadata=convention_df['speaker'],
                                           word2vec=Word2VecFromParsedCorpus(corpus, model).train(),
                                           max_p_val=0.1,
                                           save_svg_button=True)
    open('./demo_gensim_similarity.html', 'wb').write(html.encode('utf-8'))
    print('Open ./demo_gensim_similarity.html in Chrome or Firefox.')
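# Optional sanity check (a sketch, assuming the corpus and model built in main()
# are in scope, and that Word2VecFromParsedCorpus.train() returns the trained
# gensim model, as the calls above suggest): inspect the nearest neighbours of
# the target term before rendering the plot.
trained_model = Word2VecFromParsedCorpus(corpus, model).train()
print(trained_model.wv.most_similar('jobs', topn=10))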
import collections
import os

import pandas as pd
import scattertext as st
from nltk.tokenize import word_tokenize
from scattertext import word_similarity_explorer_gensim
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS


# Assumes module-level globals: nlp (a spaCy model), stemmer, stopWords,
# and model (a trained gensim word2vec model).
def Plot_Clusters_Kmeans(outputfile_name, nbr_clusters, path):
    list_stop_words = [stemmer.stem(w) for w in stopWords]

    # Each cluster vs. all other clusters, on raw text.
    # error_bad_lines is deprecated in newer pandas; use on_bad_lines='skip' there.
    text2kw_clusters = pd.read_csv(outputfile_name, error_bad_lines=False, encoding='utf-8')
    text2kw_clusters['Cluster'] = text2kw_clusters['Cluster'].astype(str)
    corpus = (st.CorpusFromPandas(text2kw_clusters, category_col='Cluster', text_col='Text', nlp=nlp)
              .build()
              .remove_terms(ENGLISH_STOP_WORDS, ignore_absences=True)
              .get_unigram_corpus()
              .compact(st.ClassPercentageCompactor(term_count=2,
                                                   term_ranker=st.OncePerDocFrequencyRanker)))
    for i in range(nbr_clusters):
        directory = os.path.join(path, str(i))
        os.makedirs(directory, exist_ok=True)
        html = st.produce_scattertext_explorer(corpus,
                                               category=str(i),
                                               category_name=str(i) + " Category",
                                               not_category_name='Other Categories',
                                               metadata=text2kw_clusters['Date'],
                                               minimum_term_frequency=50)
        filename = os.path.join(directory, str(i) + "_Category-VS-other categories.html")
        open(filename, 'wb+').write(html.encode('utf-8'))

    # Same comparison, using word2vec similarity to each cluster's most
    # frequent non-stop-word token as the target term.
    text2kw_clusters = pd.read_csv(outputfile_name, error_bad_lines=False, encoding='utf-8')
    text2kw_clusters['Cluster'] = text2kw_clusters['Cluster'].astype(str)
    text2kw_clusters['Text'] = text2kw_clusters['Text'].apply(nlp)
    corpus = (st.CorpusFromParsedDocuments(text2kw_clusters, category_col='Cluster', parsed_col='Text')
              .build()
              .remove_terms(ENGLISH_STOP_WORDS, ignore_absences=True))
    for i in range(nbr_clusters):
        directory = os.path.join(path, str(i))
        m = text2kw_clusters[text2kw_clusters["Cluster"] == str(i)]
        # Note: this filter stems each whole review string, so it rarely
        # excludes anything; stop words are dropped token-by-token below.
        liste = [word_tokenize(str(x)) for x in m["processedReviews"]
                 if stemmer.stem(str(x)) not in list_stop_words]
        words = [token for tokens in liste for token in tokens
                 if token not in list_stop_words]
        c = collections.Counter(words).most_common()
        html = word_similarity_explorer_gensim(corpus,
                                               category=str(i),
                                               category_name=str(i) + " Category",
                                               not_category_name='Other Categories',
                                               minimum_term_frequency=int(text2kw_clusters.shape[0] * 0.005),
                                               target_term=stemmer.stem(c[0][0]),
                                               # pmi_threshold_coefficient=4,
                                               width_in_pixels=1000,
                                               metadata=text2kw_clusters['Date'],
                                               word2vec=model,
                                               max_p_val=0.05,
                                               save_svg_button=True)
        filename = os.path.join(directory, str(i) + "_w2v_Category-VS-other categories.html")
        open(filename, 'wb+').write(html.encode('utf-8'))

    # Positive vs. negative verbatims within each cluster, on raw text.
    for i in range(nbr_clusters):
        directory = os.path.join(path, str(i))
        text2kw_clusters = pd.read_csv(outputfile_name, error_bad_lines=False, encoding='utf-8')
        text2kw_clusters['Cluster'] = text2kw_clusters['Cluster'].astype(str)
        text2kw_clusters['Sentiments'] = text2kw_clusters['Sentiments'].astype(str)
        text2kw_clusters['Date'] = text2kw_clusters['Date'].astype(str)
        text2kw_clusters = text2kw_clusters.loc[text2kw_clusters["Cluster"] == str(i)]
        text2kw_clusters = text2kw_clusters.loc[text2kw_clusters["Sentiments"] != "neutral"]
        corpus = (st.CorpusFromPandas(text2kw_clusters, category_col='Sentiments', text_col='Text', nlp=nlp)
                  .build()
                  .remove_terms(ENGLISH_STOP_WORDS, ignore_absences=True))
        html = st.produce_scattertext_explorer(corpus,
                                               category="positive",
                                               category_name="Positive Verbatims",
                                               not_category_name='Negative Verbatims',
                                               metadata=text2kw_clusters['Date'],
                                               minimum_term_frequency=int(text2kw_clusters.shape[0] * 0.005))
        filename = os.path.join(directory, str(i) + "_Positive_Category-VS-Negative_Category.html")
        open(filename, 'wb+').write(html.encode('utf-8'))

    # Positive vs. negative verbatims within each cluster, word2vec version.
    for i in range(nbr_clusters):
        directory = os.path.join(path, str(i))
        text2kw_clusters = pd.read_csv(outputfile_name, error_bad_lines=False, encoding='utf-8')
        text2kw_clusters['Cluster'] = text2kw_clusters['Cluster'].astype(str)
        text2kw_clusters['Sentiments'] = text2kw_clusters['Sentiments'].astype(str)
        text2kw_clusters = text2kw_clusters.loc[text2kw_clusters["Cluster"] == str(i)]
        text2kw_clusters = text2kw_clusters.loc[text2kw_clusters["Sentiments"] != "neutral"]
        text2kw_clusters['Text'] = text2kw_clusters['Text'].apply(nlp)
        liste = [word_tokenize(str(x)) for x in text2kw_clusters["processedReviews"]
                 if stemmer.stem(str(x)) not in list_stop_words]
        words = [token for tokens in liste for token in tokens
                 if token not in list_stop_words]
        c = collections.Counter(words).most_common()
        corpus = (st.CorpusFromParsedDocuments(text2kw_clusters, category_col='Sentiments', parsed_col='Text')
                  .build()
                  .remove_terms(ENGLISH_STOP_WORDS, ignore_absences=True))
        html = word_similarity_explorer_gensim(corpus,
                                               category="positive",
                                               category_name="Positive Verbatims",
                                               not_category_name='Negative Verbatims',
                                               minimum_term_frequency=int(text2kw_clusters.shape[0] * 0.005),
                                               target_term=stemmer.stem(c[0][0]),
                                               # pmi_threshold_coefficient=4,
                                               width_in_pixels=1000,
                                               metadata=text2kw_clusters['Date'],
                                               word2vec=model,
                                               max_p_val=0.05,
                                               save_svg_button=True)
        filename = os.path.join(directory, str(i) + "_w2v__Positive_Category-VS-Negative_Category.html")
        open(filename, 'wb+').write(html.encode('utf-8'))
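# A usage sketch for Plot_Clusters_Kmeans: the module-level globals it relies
# on are wired up here with hypothetical choices (NLTK Snowball stemmer and
# stop-word list, a previously saved gensim model) and placeholder file paths.
import spacy
from gensim.models import word2vec
from nltk.corpus import stopwords  # requires the nltk stopwords data download
from nltk.stem.snowball import SnowballStemmer

nlp = spacy.load('en')
stemmer = SnowballStemmer('english')
stopWords = stopwords.words('english')
model = word2vec.Word2Vec.load('w2v_model.dat')  # hypothetical model file

Plot_Clusters_Kmeans('clustered_reviews.csv', nbr_clusters=3, path='output')  # hypothetical paths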
from gensim.models import word2vec
from scattertext import CorpusFromParsedDocuments, Word2VecFromParsedCorpus, word_similarity_explorer_gensim

# df_2 is assumed to hold the two authors' articles, with spaCy parses in its
# 'parsed' column (see the preparation sketch below).
build_corpus = CorpusFromParsedDocuments(df_2, category_col='author', parsed_col='parsed').build()
build_model = word2vec.Word2Vec(size=300, alpha=0.025, window=5, min_count=5,
                                max_vocab_size=None, sample=0, seed=1, workers=1,
                                min_alpha=0.0001, sg=1, hs=1, negative=0,
                                cbow_mean=0, iter=1, null_word=0,
                                trim_rule=None, sorted_vocab=1)
html = word_similarity_explorer_gensim(build_corpus,
                                       category='GLOBE EDITORIAL',
                                       category_name='GLOBE EDITORIAL',
                                       not_category_name='Jeffrey Simpson',
                                       target_term='obama',
                                       minimum_term_frequency=100,
                                       pmi_threshold_coefficient=4,
                                       width_in_pixels=1000,
                                       metadata=df_2['author'],
                                       word2vec=Word2VecFromParsedCorpus(build_corpus, build_model).train(),
                                       max_p_val=0.05,
                                       save_svg_button=True)
open('../output/gensim_similarity_top_2_authors.html', 'wb').write(html.encode('utf-8'))
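# A sketch of the assumed input: df_2 needs an 'author' category column and a
# 'parsed' column of spaCy docs. The source DataFrame df and its 'text' column
# are hypothetical stand-ins.
import spacy

nlp = spacy.load('en')
df_2 = df[df['author'].isin(['GLOBE EDITORIAL', 'Jeffrey Simpson'])].copy()
df_2['parsed'] = df_2['text'].apply(nlp)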
import spacy
from gensim.models import word2vec
from scattertext import CorpusFromParsedDocuments, SampleCorpora, word_similarity_explorer_gensim

# word_similarity_explorer_gensim requires a corpus to plot, so the parsed
# convention corpus is built here before rendering.
nlp = spacy.load('en')
convention_df = SampleCorpora.ConventionData2012.get_data()
convention_df['parsed'] = convention_df.text.apply(nlp)
corpus = CorpusFromParsedDocuments(convention_df, category_col='party', parsed_col='parsed').build()

# Load a word2vec model trained offline rather than training one per run.
filename = '../data/output/w2v_model_cbow_win_5_dim_300_iter_20_mc_10.dat'
model = word2vec.Word2Vec.load(filename)

html = word_similarity_explorer_gensim(corpus,
                                       category='democrat',
                                       category_name='Democratic',
                                       not_category_name='Republican',
                                       target_term='jobs',
                                       minimum_term_frequency=5,
                                       pmi_threshold_coefficient=4,
                                       width_in_pixels=1000,
                                       metadata=convention_df['speaker'],
                                       word2vec=model,
                                       max_p_val=0.05,
                                       save_svg_button=True)
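# How a model like the one loaded above might have been produced (a sketch;
# the hyperparameters mirror the filename: CBOW, window 5, 300 dimensions,
# 20 iterations, min_count 10; the `corpus` built above is assumed in scope).
from scattertext import Word2VecFromParsedCorpus

base_model = word2vec.Word2Vec(size=300, window=5, min_count=10, iter=20, sg=0)  # sg=0 selects CBOW
trained = Word2VecFromParsedCorpus(corpus, base_model).train()
trained.save('../data/output/w2v_model_cbow_win_5_dim_300_iter_20_mc_10.dat')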