import spacy
from gensim.models import word2vec

from scattertext import SampleCorpora, word_similarity_explorer_gensim, Word2VecFromParsedCorpus
from scattertext.CorpusFromParsedDocuments import CorpusFromParsedDocuments
from scattertext.termsignificance.ScaledFScoreSignificance import ScaledFScoreSignificance


def main():
    nlp = spacy.load('en')
    # nlp = whitespace_nlp_with_sentences
    convention_df = SampleCorpora.ConventionData2012.get_data()
    convention_df['parsed'] = convention_df.text.apply(nlp)
    corpus = (CorpusFromParsedDocuments(convention_df,
                                        category_col='party',
                                        parsed_col='parsed')
              .build()
              .get_unigram_corpus())
    # Train skip-gram (sg=1) embeddings with hierarchical softmax on the corpus.
    model = word2vec.Word2Vec(size=100, alpha=0.025, window=5, min_count=5,
                              max_vocab_size=None, sample=0, seed=1, workers=1,
                              min_alpha=0.0001, sg=1, hs=1, negative=0,
                              cbow_mean=0, iter=10, null_word=0,
                              trim_rule=None, sorted_vocab=1)
    html = word_similarity_explorer_gensim(corpus,
                                           category='democrat',
                                           target_term='jobs',
                                           category_name='Democratic',
                                           not_category_name='Republican',
                                           minimum_term_frequency=5,
                                           width_in_pixels=1000,
                                           metadata=convention_df['speaker'],
                                           word2vec=Word2VecFromParsedCorpus(corpus, model).train(),
                                           term_significance=ScaledFScoreSignificance(),
                                           max_p_val=0.05,
                                           save_svg_button=True,
                                           d3_url='scattertext/data/viz/scripts/d3.min.js',
                                           d3_scale_chromatic_url='scattertext/data/viz/scripts/d3-scale-chromatic.v1.min.js')
    open('./demo_gensim_similarity.html', 'wb').write(html.encode('utf-8'))
    print('Open ./demo_gensim_similarity.html in Chrome or Firefox.')


if __name__ == '__main__':
    main()
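The commented-out line in main() above points at scattertext's built-in whitespace tokenizer, which avoids downloading a spaCy model. A minimal self-contained sketch of that substitution, assuming rough whitespace tokenization is acceptable for a demo:

# Swap the spaCy pipeline for scattertext's lightweight tokenizer.
from scattertext import SampleCorpora, whitespace_nlp_with_sentences

convention_df = SampleCorpora.ConventionData2012.get_data()
convention_df['parsed'] = convention_df.text.apply(whitespace_nlp_with_sentences)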
import spacy
from gensim.models import word2vec

from scattertext import SampleCorpora, word_similarity_explorer_gensim, Word2VecFromParsedCorpus
from scattertext.CorpusFromParsedDocuments import CorpusFromParsedDocuments


def main():
    nlp = spacy.load('en')  # spacy.en.English() was removed in spaCy 2.x
    convention_df = SampleCorpora.ConventionData2012.get_data()
    convention_df['parsed'] = convention_df.text.apply(nlp)
    corpus = CorpusFromParsedDocuments(convention_df,
                                       category_col='party',
                                       parsed_col='parsed').build()
    model = word2vec.Word2Vec(size=300, alpha=0.025, window=5, min_count=5,
                              max_vocab_size=None, sample=0, seed=1, workers=1,
                              min_alpha=0.0001, sg=1, hs=1, negative=0,
                              cbow_mean=0, iter=1, null_word=0,
                              trim_rule=None, sorted_vocab=1)
    html = word_similarity_explorer_gensim(corpus,
                                           category='democrat',
                                           category_name='Democratic',
                                           not_category_name='Republican',
                                           target_term='jobs',
                                           minimum_term_frequency=5,
                                           pmi_threshold_coefficient=4,
                                           width_in_pixels=1000,
                                           metadata=convention_df['speaker'],
                                           word2vec=Word2VecFromParsedCorpus(corpus, model).train(),
                                           max_p_val=0.1,
                                           save_svg_button=True)
    open('./demo_gensim_similarity.html', 'wb').write(html.encode('utf-8'))
    print('Open ./demo_gensim_similarity.html in Chrome or Firefox.')


if __name__ == '__main__':
    main()
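Note that gensim 4.0 renamed two of the Word2Vec constructor arguments used above (size became vector_size, and iter became epochs). Under gensim >= 4.0 the same model would be built as follows; only the keyword names change:

# gensim >= 4.0 variant of the constructor call above (same hyperparameters).
model = word2vec.Word2Vec(vector_size=300, alpha=0.025, window=5, min_count=5,
                          max_vocab_size=None, sample=0, seed=1, workers=1,
                          min_alpha=0.0001, sg=1, hs=1, negative=0,
                          cbow_mean=0, epochs=1, null_word=0,
                          trim_rule=None, sorted_vocab=1)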
# df_2 is assumed to have been prepared earlier: a DataFrame with an 'author'
# column and a 'parsed' column of spaCy-parsed documents.
build_corpus = CorpusFromParsedDocuments(df_2,
                                         category_col='author',
                                         parsed_col='parsed').build()
build_model = word2vec.Word2Vec(size=300, alpha=0.025, window=5, min_count=5,
                                max_vocab_size=None, sample=0, seed=1, workers=1,
                                min_alpha=0.0001, sg=1, hs=1, negative=0,
                                cbow_mean=0, iter=1, null_word=0,
                                trim_rule=None, sorted_vocab=1)
html = word_similarity_explorer_gensim(build_corpus,
                                       category='GLOBE EDITORIAL',
                                       category_name='GLOBE EDITORIAL',
                                       not_category_name='Jeffrey Simpson',
                                       target_term='obama',
                                       minimum_term_frequency=100,
                                       pmi_threshold_coefficient=4,
                                       width_in_pixels=1000,
                                       metadata=df_2['author'],
                                       word2vec=Word2VecFromParsedCorpus(build_corpus, build_model).train(),
                                       max_p_val=0.05,
                                       save_svg_button=True)
open('../output/gensim_similarity_top_2_authors.html', 'wb').write(html.encode('utf-8'))
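To sanity-check the embeddings behind the similarity coloring, the trained gensim model can be queried directly. A minimal sketch, assuming (as the word2vec= argument above suggests) that Word2VecFromParsedCorpus(...).train() returns a trained gensim Word2Vec model; trained_model is a hypothetical name for that return value:

# Print the five nearest neighbors of the target term in the learned space.
trained_model = Word2VecFromParsedCorpus(build_corpus, build_model).train()
for term, score in trained_model.wv.most_similar('obama', topn=5):
    print(term, round(score, 3))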