# Imports assumed for this standalone snippet.
import spacy
from gensim.models import word2vec
from scattertext import (SampleCorpora, CorpusFromParsedDocuments,
                         word_similarity_explorer_gensim, Word2VecFromParsedCorpus)
from scattertext.termsignificance import ScaledFScoreSignificance


def main():
	nlp = spacy.load('en_core_web_sm')  # the bare 'en' shortcut was removed in spaCy 3
	#nlp = whitespace_nlp_with_sentences
	convention_df = SampleCorpora.ConventionData2012.get_data()
	convention_df['parsed'] = convention_df.text.apply(nlp)
	corpus = (CorpusFromParsedDocuments(convention_df,
	                                   category_col='party',
	                                   parsed_col='parsed')
	          .build()
	          .get_unigram_corpus())
	model = word2vec.Word2Vec(size=100,
	                          alpha=0.025,
	                          window=5,
	                          min_count=5,
	                          max_vocab_size=None,
	                          sample=0,
	                          seed=1,
	                          workers=1,
	                          min_alpha=0.0001,
	                          sg=1,
	                          hs=1,
	                          negative=0,
	                          cbow_mean=0,
	                          iter=10,
	                          null_word=0,
	                          trim_rule=None,
	                          sorted_vocab=1)
	html = word_similarity_explorer_gensim(corpus,
	                                       category='democrat',
	                                       target_term='jobs',
	                                       category_name='Democratic',
	                                       not_category_name='Republican',
	                                       minimum_term_frequency=5,
	                                       width_in_pixels=1000,
	                                       metadata=convention_df['speaker'],
	                                       word2vec=Word2VecFromParsedCorpus(corpus, model).train(),
	                                       term_significance=ScaledFScoreSignificance(),
	                                       max_p_val=0.05,
	                                       save_svg_button=True,
	                                       d3_url='scattertext/data/viz/scripts/d3.min.js',
	                                       d3_scale_chromatic_url='scattertext/data/viz/scripts/d3-scale-chromatic.v1.min.js')
	open('./demo_gensim_similarity.html', 'wb').write(html.encode('utf-8'))
	print('Open ./demo_gensim_similarity.html in Chrome or Firefox.')
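# Note: the Word2Vec keyword arguments above use the gensim 3.x API. In gensim >= 4.0,
# `size` was renamed to `vector_size` and `iter` to `epochs`, so a roughly equivalent
# constructor call (a sketch, not taken from the original demo) would be:
#
#     model = word2vec.Word2Vec(vector_size=100, alpha=0.025, window=5, min_count=5,
#                               max_vocab_size=None, sample=0, seed=1, workers=1,
#                               min_alpha=0.0001, sg=1, hs=1, negative=0, cbow_mean=0,
#                               epochs=10, null_word=0, trim_rule=None, sorted_vocab=1)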
Example #2
def main():
    nlp = spacy.load('en_core_web_sm')  # spacy.en.English() is the long-removed spaCy 1.x API
    convention_df = SampleCorpora.ConventionData2012.get_data()
    convention_df['parsed'] = convention_df.text.apply(nlp)
    corpus = CorpusFromParsedDocuments(convention_df,
                                       category_col='party',
                                       parsed_col='parsed').build()
    model = word2vec.Word2Vec(size=300,
                              alpha=0.025,
                              window=5,
                              min_count=5,
                              max_vocab_size=None,
                              sample=0,
                              seed=1,
                              workers=1,
                              min_alpha=0.0001,
                              sg=1,
                              hs=1,
                              negative=0,
                              cbow_mean=0,
                              iter=1,
                              null_word=0,
                              trim_rule=None,
                              sorted_vocab=1)
    html = word_similarity_explorer_gensim(corpus,
                                           category='democrat',
                                           category_name='Democratic',
                                           not_category_name='Republican',
                                           target_term='jobs',
                                           minimum_term_frequency=5,
                                           pmi_threshold_coefficient=4,  # renamed from the misspelled pmi_filter_thresold in older scattertext
                                           width_in_pixels=1000,
                                           metadata=convention_df['speaker'],
                                           word2vec=Word2VecFromParsedCorpus(
                                               corpus, model).train(),
                                           max_p_val=0.1,
                                           save_svg_button=True)
    open('./demo_gensim_similarity.html', 'wb').write(html.encode('utf-8'))
    print('Open ./demo_gensim_similarity.html in Chrome or Firefox.')
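# Neither demo defines an entry point; assuming the snippet is saved as a standalone
# script, a minimal runner would be:
#
#     if __name__ == '__main__':
#         main()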
Example #3
def Plot_Clusters_Kmeans(outputfile_name, nbr_clusters, path):
    # Assumes module-level globals defined elsewhere: nlp (spaCy pipeline), stemmer,
    # stopWords, model (a trained gensim word2vec model), ENGLISH_STOP_WORDS,
    # word_tokenize, plus the pandas/os/collections and scattertext (st) imports.
    list_stop_words = [stemmer.stem(word) for word in stopWords]

    text2kw_clusters = pd.read_csv(outputfile_name, error_bad_lines=False, encoding='utf-8')
    text2kw_clusters['Cluster'] = text2kw_clusters['Cluster'].astype(str)
    corpus = (st.CorpusFromPandas(text2kw_clusters, category_col='Cluster', text_col='Text', nlp=nlp)
              .build()
              .remove_terms(ENGLISH_STOP_WORDS, ignore_absences=True)
              .get_unigram_corpus()
              .compact(st.ClassPercentageCompactor(term_count=2,
                                                   term_ranker=st.OncePerDocFrequencyRanker)))

    for i in range(nbr_clusters):
        directory = os.path.join(path, str(i))
        try:
            os.mkdir(directory)
        except FileExistsError:
            pass

        html = st.produce_scattertext_explorer(corpus, category=str(i), category_name=str(i)+" Category",
                                               not_category_name='Other Categories',
                                               metadata=text2kw_clusters['Date'],
                                               minimum_term_frequency=50)
        filename = os.path.join(directory, str(i) + "_Category-VS-other categories.html")
        open(filename, 'wb+').write(html.encode('utf-8'))

    text2kw_clusters = pd.read_csv(outputfile_name, error_bad_lines=False, encoding='utf-8')
    text2kw_clusters['Cluster'] = text2kw_clusters['Cluster'].astype(str)
    text2kw_clusters['Text'] = text2kw_clusters['Text'].apply(nlp)
    corpus = (st.CorpusFromParsedDocuments(text2kw_clusters, category_col='Cluster', parsed_col='Text')
              .build()
              .remove_terms(ENGLISH_STOP_WORDS, ignore_absences=True))
    for i in range(nbr_clusters):
        directory = os.path.join(path, str(i))

        m = text2kw_clusters[text2kw_clusters["Cluster"] == str(i)]
        # Tokenize each review in this cluster and drop stop-word tokens before counting.
        liste = [word_tokenize(str(x)) for x in m["processedReviews"]
                 if stemmer.stem(str(x)) not in list_stop_words]
        words = [token for tokens in liste for token in tokens
                 if token not in list_stop_words]

        counter = collections.Counter(words)
        c = counter.most_common()
        html = word_similarity_explorer_gensim(corpus, category=str(i), category_name=str(i)+" Category",
                                               not_category_name='Other Categories',
                                               minimum_term_frequency=int(text2kw_clusters.shape[0]*0.005),
                                               target_term=stemmer.stem(c[0][0]),
                                               # pmi_threshold_coefficient=4,
                                               width_in_pixels=1000,
                                               metadata=text2kw_clusters['Date'],
                                               word2vec=model,
                                               max_p_val=0.05,
                                               save_svg_button=True)
        filename = os.path.join(directory, str(i) + "_w2v_Category-VS-other categories.html")
        open(filename, 'wb+').write(html.encode('utf-8'))

    for i in range(nbr_clusters):
        directory = os.path.join(path, str(i))

        text2kw_clusters = pd.read_csv(outputfile_name, error_bad_lines=False, encoding='utf-8')
        text2kw_clusters['Cluster'] = text2kw_clusters['Cluster'].astype(str)
        text2kw_clusters['Sentiments'] = text2kw_clusters['Sentiments'].astype(str)
        text2kw_clusters['Date'] = text2kw_clusters['Date'].astype(str)
        text2kw_clusters = text2kw_clusters.loc[text2kw_clusters["Cluster"] == str(i)]
        text2kw_clusters = text2kw_clusters.loc[text2kw_clusters["Sentiments"] != "neutral"]
        corpus = (st.CorpusFromPandas(text2kw_clusters, category_col='Sentiments', text_col='Text', nlp=nlp).build().
                  remove_terms(ENGLISH_STOP_WORDS, ignore_absences=True))
        html = st.produce_scattertext_explorer(corpus, category="positive", category_name="Positive Verbatims",
                                               not_category_name='Negative Verbatims',
                                               metadata=text2kw_clusters['Date'],
                                               minimum_term_frequency=int(text2kw_clusters.shape[0]*0.005))
        filename = os.path.join(directory, str(i) + "_Positive_Category-VS-Negative_Category.html")
        open(filename, 'wb+').write(html.encode('utf-8'))

    for i in range(nbr_clusters):
        directory = os.path.join(path, str(i))

        text2kw_clusters = pd.read_csv(outputfile_name, error_bad_lines=False, encoding='utf-8')
        text2kw_clusters['Cluster'] = text2kw_clusters['Cluster'].astype(str)
        text2kw_clusters['Sentiments'] = text2kw_clusters['Sentiments'].astype(str)
        text2kw_clusters = text2kw_clusters.loc[text2kw_clusters["Cluster"] == str(i)]
        text2kw_clusters = text2kw_clusters.loc[text2kw_clusters["Sentiments"] != "neutral"]
        text2kw_clusters['Text'] = text2kw_clusters['Text'].apply(nlp)
        # Same stop-word filtering and token counting as above, restricted to this cluster.
        liste = [word_tokenize(str(x)) for x in text2kw_clusters["processedReviews"]
                 if stemmer.stem(str(x)) not in list_stop_words]
        words = [token for tokens in liste for token in tokens
                 if token not in list_stop_words]
        counter = collections.Counter(words)
        c = counter.most_common()

        corpus = (st.CorpusFromParsedDocuments(text2kw_clusters, category_col='Sentiments', parsed_col='Text')
                  .build()
                  .remove_terms(ENGLISH_STOP_WORDS, ignore_absences=True))
        html = word_similarity_explorer_gensim(corpus, category="positive", category_name="Positive Verbatims",
                                               not_category_name='Negative Verbatims',
                                               minimum_term_frequency=int(text2kw_clusters.shape[0]*0.005),
                                               target_term=stemmer.stem(c[0][0]),
                                               # pmi_threshold_coefficient=4,
                                               width_in_pixels=1000,
                                               metadata=text2kw_clusters['Date'],
                                               word2vec=model,
                                               max_p_val=0.05,
                                               save_svg_button=True)
        filename = os.path.join(directory, str(i) + "_w2v__Positive_Category-VS-Negative_Category.html")
        open(filename, 'wb+').write(html.encode('utf-8'))
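# Hypothetical usage sketch for Plot_Clusters_Kmeans; the CSV name, cluster count and
# output path below are placeholders. The CSV must contain 'Cluster', 'Text', 'Date',
# 'Sentiments' and 'processedReviews' columns, and the module-level globals listed at
# the top of the function must already be set up.
#
#     Plot_Clusters_Kmeans('clustered_reviews.csv', nbr_clusters=5, path=r'C:\output')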
build_corpus = CorpusFromParsedDocuments(df_2, category_col='author', parsed_col='parsed').build()

build_model = word2vec.Word2Vec(size=300,
                                alpha=0.025,
                                window=5,
                                min_count=5,
                                max_vocab_size=None,
                                sample=0,
                                seed=1,
                                workers=1,
                                min_alpha=0.0001,
                                sg=1,
                                hs=1,
                                negative=0,
                                cbow_mean=0,
                                iter=1,
                                null_word=0,
                                trim_rule=None,
                                sorted_vocab=1)

html = word_similarity_explorer_gensim(build_corpus,
                                       category='GLOBE EDITORIAL',
                                       category_name='GLOBE EDITORIAL',
                                       not_category_name='Jeffrey Simpson',
                                       target_term='obama',
                                       minimum_term_frequency=100,
                                       pmi_threshold_coefficient=4,
                                       width_in_pixels=1000,
                                       metadata=df_2['author'],
                                       word2vec=Word2VecFromParsedCorpus(build_corpus, build_model).train(),
                                       max_p_val=0.05,
                                       save_svg_button=True)

open('../output/gensim_similarity_top_2_authors.html', 'wb').write(html.encode('utf-8'))
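# The snippet above assumes a DataFrame df_2 holding one row per document, with an
# 'author' column and a spaCy-parsed 'parsed' column. A minimal sketch of how it might
# be prepared (the CSV name and its 'text' column are assumptions):
#
#     import spacy
#     import pandas as pd
#     nlp = spacy.load('en_core_web_sm')
#     df_2 = pd.read_csv('articles.csv')        # needs 'author' and 'text' columns
#     df_2['parsed'] = df_2['text'].apply(nlp)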
#nlp = spacy.en.English()
#convention_df = SampleCorpora.ConventionData2012.get_data()
#convention_df['parsed'] = convention_df.text.apply(nlp)
#corpus = CorpusFromParsedDocuments(convention_df, category_col='party', parsed_col='parsed').build()

filename = '../data/output/w2v_model_cbow_win_5_dim_300_iter_20_mc_10.dat'

model = word2vec.Word2Vec.load(filename)
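# The .dat file is assumed to be a gensim Word2Vec model saved earlier via model.save().
# Judging by the file name (CBOW, window 5, 300 dimensions, 20 iterations, min_count 10),
# it might have been produced with something along these lines (gensim 3.x keyword names):
#
#     model = word2vec.Word2Vec(sentences, sg=0, window=5, size=300, iter=20, min_count=10)
#     model.save(filename)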

html = word_similarity_explorer_gensim(
    None,  # placeholder: a built Corpus (e.g. from the commented-out lines above) is required here
    category='democrat',
    category_name='Democratic',
    not_category_name='Republican',
    target_term='jobs',
    minimum_term_frequency=5,
    pmi_threshold_coefficient=4,
    width_in_pixels=1000,
    #metadata=convention_df['speaker'],
    word2vec=model,
    max_p_val=0.05,
    save_svg_button=True)

#html = word_similarity_explorer_gensim(corpus,
#                                       category='democrat',
#                                       category_name='Democratic',
#                                       not_category_name='Republican',
#                                       target_term='jobs',
#                                       minimum_term_frequency=5,
#                                       pmi_threshold_coefficient=4,
#                                       width_in_pixels=1000,