def gitc(self, dataframe):
    """Build a General Inquirer scattertext corpus from *dataframe* and open
    a frequency-explorer visualization in the default web browser.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Expected to contain 'Document Type', 'Text' and 'Document' columns
        (categories, raw text and per-document metadata respectively).
    """
    # Topic-model style features drawn from the Harvard General Inquirer lexicon.
    general_inquirer_feature_builder = st.FeatsFromGeneralInquirer()
    corpus = st.CorpusFromPandas(
        dataframe,
        category_col='Document Type',
        text_col='Text',
        nlp=st.whitespace_nlp_with_sentences,
        feats_from_spacy_doc=general_inquirer_feature_builder).build()
    html = st.produce_frequency_explorer(
        corpus,
        category='submission',
        category_name='Submission',
        not_category_name='Standard',
        use_non_text_features=True,
        use_full_doc=True,
        term_scorer=st.LogOddsRatioUninformativeDirichletPrior(),
        grey_threshold=1.96,  # ~p < .05 two-tailed z-score cutoff
        width_in_pixels=1000,
        metadata=dataframe['Document'],
        topic_model_term_lists=general_inquirer_feature_builder
        .get_top_model_term_lists())
    logger.getLogger().info("Opening GITC-Visual")
    # Use a context manager so the handle is flushed and closed before the
    # browser attempts to read the file (the original leaked the handle).
    with open(self.gitc_file, 'wb') as out:
        out.write(html.encode('utf-8'))
    webbrowser.open("file://" + self.gitc_file)
# Demo: rank movie-review unigrams with a Beta-posterior term scorer and
# render a scattertext frequency explorer.
import scattertext as st

movie_df = st.SampleCorpora.RottenTomatoes.get_data()
corpus = st.CorpusFromPandas(
    movie_df,
    category_col='category',
    text_col='text',
    nlp=st.whitespace_nlp_with_sentences).build().get_unigram_corpus()

# Score terms by posterior category probabilities: 'fresh' vs. 'rotten'.
beta_posterior = st.BetaPosterior(corpus).set_categories('fresh', ['rotten'])
score_df = beta_posterior.get_score_df()
print("Top Fresh Terms")
print(score_df.sort_values(by='cat_p').head())
print("Top Rotten Terms")
print(score_df.sort_values(by='ncat_p').head())

html = st.produce_frequency_explorer(corpus,
                                     category='fresh',
                                     not_category_name='rotten',
                                     term_scorer=beta_posterior,
                                     grey_threshold=1.96)
file_name = 'demo_beta_posterior.html'
# Context manager guarantees the handle is closed (original leaked it).
with open(file_name, 'wb') as out:
    out.write(html.encode('utf-8'))
print('Open %s in Chrome or Firefox.' % file_name)
# NOTE(review): fragment of a random-forest feature-importance demo. The first
# statement appends a shuffled-accuracy importance score and presumably sat
# inside a per-feature loop whose header is not visible here — confirm against
# the full script. The remainder prints score rankings, builds a per-term score
# Series, and renders a frequency explorer to 'demo_rf.html'.
# NOTE(review): open(...).write(...) leaks the file handle; prefer `with`.
scores[corpus.get_terms()[feati]].append((acc - shuff_acc) / acc) print("Features sorted by their score:") print( sorted([(round(np.mean(score), 4), feat) for feat, score in scores.items()], reverse=True)) print("Features sorted by their pred diff:") print( sorted([(round(np.mean(score), 4), feat) for feat, score in pred_diff.items()], reverse=True)) term_scores = pd.Series(index=corpus.get_terms()) top_terms = pd.Series(scores).apply(np.mean) term_scores.loc[top_terms.index] = top_terms.values term_scores = term_scores.fillna(0) html = st.produce_frequency_explorer(corpus, category='Positive', not_categories=['Negative'], neutral_categories=['Plot'], scores=term_scores.values, metadata=movie_df['movie_name'], grey_threshold=0, show_neutral=True) file_name = 'demo_rf.html' open(file_name, 'wb').write(html.encode('utf-8')) print('./' + file_name)
# Visualize 2012 convention speeches using General Inquirer lexicon features,
# scored with a log-odds-ratio (uninformative Dirichlet prior) scorer.
import scattertext as st

convention_df = st.SampleCorpora.ConventionData2012.get_data()
gi_feats = st.FeatsFromGeneralInquirer()

corpus = st.CorpusFromPandas(convention_df,
                             category_col='party',
                             text_col='text',
                             nlp=st.whitespace_nlp_with_sentences,
                             feats_from_spacy_doc=gi_feats).build()

html = st.produce_frequency_explorer(
    corpus,
    category='democrat',
    category_name='Democratic',
    not_category_name='Republican',
    metadata=convention_df['speaker'],
    use_non_text_features=True,
    use_full_doc=True,
    term_scorer=st.LogOddsRatioUninformativeDirichletPrior(),
    grey_threshold=1.96,
    width_in_pixels=1000,
    topic_model_term_lists=gi_feats.get_top_model_term_lists(),
    metadata_descriptions=gi_feats.get_definitions())

fn = 'demo_general_inquirer_frequency_plot.html'
with open(fn, 'wb') as out:
    out.write(html.encode('utf-8'))
print('Open ./%s in Chrome.' % (fn))
# NOTE(review): fragment of a scikit-learn + scattertext demo. It opens
# mid-call (C=..., tol=... close a classifier constructor begun outside this
# view) and ends mid-call (a second produce_frequency_explorer invocation is
# left unfinished). It fits a linear classifier on tf-idf features, builds a
# CorpusFromScikit over the 20-newsgroups data, and plots 'alt.atheism' terms
# first by classifier coefficients, then by scaled-F-scores.
# NOTE(review): open(...).write(...) leaks the file handle; prefer `with`.
C=1.0 / tfidf_X.shape[0], tol=1e-3) clf.fit(tfidf_X, newsgroups_train.target) corpus = st.CorpusFromScikit(X=CountVectorizer( vocabulary=vectorizer.vocabulary_).fit_transform(newsgroups_train.data), y=newsgroups_train.target, feature_vocabulary=vectorizer.vocabulary_, category_names=newsgroups_train.target_names, raw_texts=newsgroups_train.data).build() html = st.produce_frequency_explorer( corpus, 'alt.atheism', scores=clf.coef_[0], use_term_significance=False, terms_to_include=st.AutoTermSelector.get_selected_terms( corpus, clf.coef_[0]), metadata=[ '/'.join(fn.split('/')[-2:]) for fn in newsgroups_train.filenames ]) file_name = "demo_sklearn.html" open(file_name, 'wb').write(html.encode('utf-8')) print("open " + file_name) sfs = (corpus.get_scaled_f_scores('alt.atheism') - 0.5) * 2 html = st.produce_frequency_explorer( corpus, 'alt.atheism', scores=sfs, use_term_significance=False,
# Demo: rank convention-speech unigrams by delta Jensen-Shannon divergence,
# compacting the vocabulary to the top 1000 terms by JSD.
from scattertext.termscoring.DeltaJSDivergence import DeltaJSDivergence
from scattertext.termcompaction.AssociationCompactor import JSDCompactor
from scattertext import SampleCorpora, whitespace_nlp_with_sentences, produce_frequency_explorer
from scattertext import dense_rank  # BUGFIX: dense_rank was used below but never imported
from scattertext.CorpusFromPandas import CorpusFromPandas

convention_df = SampleCorpora.ConventionData2012.get_data()
corpus = CorpusFromPandas(
    convention_df,
    category_col='party',
    text_col='text',
    nlp=whitespace_nlp_with_sentences).build().get_unigram_corpus().compact(
        JSDCompactor(1000))

html = produce_frequency_explorer(corpus,
                                  category='democrat',
                                  category_name='Democratic',
                                  not_category_name='Republican',
                                  minimum_term_frequency=0,
                                  pmi_threshold_coefficient=0,
                                  width_in_pixels=1000,
                                  metadata=convention_df['speaker'],
                                  term_scorer=DeltaJSDivergence(),
                                  transform=dense_rank,
                                  term_metadata_df=corpus.get_term_freq_df(''),
                                  enable_term_category_description=False)
# Context manager guarantees the handle is closed (original leaked it).
with open('./demo_JSDivergence.html', 'wb') as out:
    out.write(html.encode('utf-8'))
print('Open ./demo_JSDivergence.html in Chrome or Firefox.')
# Demo: score convention-speech terms with Cohen's d effect size, using a
# class-balanced frequency ranker and a class-percentage-compacted corpus.
from scattertext import SampleCorpora, whitespace_nlp_with_sentences, CohensD, produce_frequency_explorer, \
    OncePerDocFrequencyRanker
from scattertext.termcompaction.ClassPercentageCompactor import ClassPercentageCompactor
from scattertext import produce_scattertext_explorer
from scattertext.CorpusFromPandas import CorpusFromPandas
from scattertext.termranking import ClassBalancedFrequencyRanker
from scattertext.termscoring.ScaledFScore import ScaledFScorePresets

convention_df = SampleCorpora.ConventionData2012.get_data()
corpus = (CorpusFromPandas(
    convention_df,
    category_col='party',
    text_col='text',
    nlp=whitespace_nlp_with_sentences).build().compact(
        ClassPercentageCompactor(term_ranker=OncePerDocFrequencyRanker)))

html = produce_frequency_explorer(
    corpus,
    category='democrat',
    category_name='Democratic',
    not_category_name='Republican',
    term_scorer=CohensD(corpus).set_term_ranker(
        ClassBalancedFrequencyRanker).set_categories('democrat', ['republican']),
    metadata=convention_df['speaker'],
    grey_threshold=0,
    show_neutral=True)

file_name = 'demo_cohens_d.html'
# Context manager guarantees the handle is closed (original leaked it).
with open(file_name, 'wb') as out:
    out.write(html.encode('utf-8'))
print('Open ./demo_cohens_d.html in Chrome or Firefox.')
# Demo: relative-entropy term scoring over Rotten Tomatoes reviews, with
# 'Plot' descriptions treated as a neutral category.
import scattertext as st

movie_df = st.SampleCorpora.RottenTomatoes.get_data()
# Map raw labels to display labels before building the corpus.
movie_df.category = movie_df.category\
    .apply(lambda x: {'rotten': 'Negative', 'fresh': 'Positive', 'plot': 'Plot'}[x])

corpus = st.CorpusFromPandas(
    movie_df,
    category_col='category',
    text_col='text',
    nlp=st.whitespace_nlp_with_sentences
).build().get_unigram_corpus()

term_scorer = (st.RelativeEntropy(corpus)
               .set_categories('Positive', ['Negative'], ['Plot']))

html = st.produce_frequency_explorer(
    corpus,
    category='Positive',
    not_categories=['Negative'],
    neutral_categories=['Plot'],
    term_scorer=term_scorer,
    metadata=movie_df['movie_name'],
    grey_threshold=0,
    show_neutral=True
)
file_name = 'demo_relative_entropy.html'
# Context manager guarantees the handle is closed (original leaked it).
with open(file_name, 'wb') as out:
    out.write(html.encode('utf-8'))
print('./' + file_name)
# Demo: single-speaker plot — Barack Obama's terms vs. all other speakers,
# scored with a symmetric (-1..1) scaled F-score.
import scattertext as st

convention_df = st.SampleCorpora.ConventionData2012.get_data()
corpus = (st.CorpusFromPandas(
    convention_df,
    category_col='speaker',
    text_col='text',
    nlp=st.whitespace_nlp_with_sentences).build().get_unigram_corpus())

html = st.produce_frequency_explorer(corpus,
                                     category='BARACK OBAMA',
                                     term_scorer=st.ScaledFScorePresets(
                                         one_to_neg_one=True,
                                         use_score_difference=True),
                                     metadata=convention_df['speaker'],
                                     grey_threshold=0)
file_name = 'demo_obama.html'
# Context manager guarantees the handle is closed (original leaked it).
with open(file_name, 'wb') as out:
    out.write(html.encode('utf-8'))
print('Open ./%s in Chrome.' % (file_name))
# Demo: tf-idf-style scoring (credibility-adjustment disabled) with every
# document context included in the visualization.
import scattertext as st

convention_df = st.SampleCorpora.ConventionData2012.get_data()
corpus = (st.CorpusFromPandas(
    convention_df,
    category_col='party',
    text_col='text',
    nlp=st.whitespace_nlp_with_sentences).build().get_unigram_corpus())

term_scorer = st.CredTFIDF(corpus, use_l2_norm=False, use_cred=False).set_categories(
    'democrat', ['republican'])
print(term_scorer.get_score_df().sort_values(by='delta_cred_tf_idf',
                                             ascending=False).head())

html = st.produce_frequency_explorer(corpus,
                                     category='democrat',
                                     category_name='Democratic',
                                     not_category_name='Republican',
                                     term_scorer=term_scorer,
                                     metadata=convention_df['speaker'],
                                     grey_threshold=0,
                                     include_all_contexts=True)
file_name = 'demo_include_all_contexts.html'
# Context manager guarantees the handle is closed (original leaked it).
with open(file_name, 'wb') as out:
    out.write(html.encode('utf-8'))
print('Open %s in Chrome or Firefox.' % file_name)
# NOTE(review): fragment of a JSD demo with custom term HTML. The leading
# `for name, key in [...]` is the tail of a generator expression feeding a
# string join begun outside this view (it builds a JavaScript snippet shown in
# the term-description panel). The rest renders the explorer with custom
# header sorting by the per-term RankDiff metadata.
# NOTE(review): open(...).write(...) leaks the file handle; prefer `with`.
for name, key in [('Democratic Rank', 'DemocraticRank'), ('Republican Rank', 'RepublicanRank'), ('Rank Difference Score', 'RankDiff')]) + '+ "</span>" ;})' html = produce_frequency_explorer( corpus, category='democrat', category_name='Democratic', not_category_name='Republican', minimum_term_frequency=0, pmi_threshold_coefficient=0, width_in_pixels=1000, metadata=convention_df['speaker'], term_scorer=DeltaJSDivergence(), transform=dense_rank, term_metadata_df=term_etc_df, get_custom_term_html=get_custom_term_html, enable_term_category_description=False, header_names={ 'upper': 'Top Dem. RankDiff', 'lower': 'Top GOP RankDiff' }, header_sorting_algos={ 'upper': '(function(a, b) {return b.etc.RankDiff - a.etc.RankDiff})', 'lower': '(function(a, b) {return a.etc.RankDiff - b.etc.RankDiff})' }) open('./demo_JSDivergence.html', 'wb').write(html.encode('utf-8')) print('Open ./demo_JSDivergence.html in Chrome or Firefox.')
# Demo: Hedges' r (robust effect size) scoring of convention-speech unigrams.
from scattertext import SampleCorpora, whitespace_nlp_with_sentences, produce_frequency_explorer, HedgesR
from scattertext.CorpusFromPandas import CorpusFromPandas

convention_df = SampleCorpora.ConventionData2012.get_data()
corpus = (CorpusFromPandas(convention_df,
                           category_col='party',
                           text_col='text',
                           nlp=whitespace_nlp_with_sentences)
          .build()
          .get_unigram_corpus())

html = produce_frequency_explorer(
    corpus,
    category='democrat',
    category_name='Democratic',
    not_category_name='Republican',
    term_scorer=HedgesR(corpus),
    metadata=convention_df['speaker'],
    grey_threshold=0
)
file_name = 'demo_hedges_r.html'
# Context manager guarantees the handle is closed (original leaked it).
with open(file_name, 'wb') as out:
    out.write(html.encode('utf-8'))
print('Open ./%s in Chrome.' % (file_name))
# Frequency explorer over 2012 convention speeches, with terms grouped by
# General Inquirer lexicon categories and scored via log-odds ratio under an
# uninformative Dirichlet prior.
import scattertext as st

convention_df = st.SampleCorpora.ConventionData2012.get_data()
feature_builder = st.FeatsFromGeneralInquirer()

corpus = st.CorpusFromPandas(
    convention_df,
    category_col='party',
    text_col='text',
    nlp=st.whitespace_nlp_with_sentences,
    feats_from_spacy_doc=feature_builder,
).build()

html = st.produce_frequency_explorer(
    corpus,
    category='democrat',
    category_name='Democratic',
    not_category_name='Republican',
    metadata=convention_df['speaker'],
    use_non_text_features=True,
    use_full_doc=True,
    term_scorer=st.LogOddsRatioUninformativeDirichletPrior(),
    grey_threshold=1.96,
    width_in_pixels=1000,
    topic_model_term_lists=feature_builder.get_top_model_term_lists(),
    metadata_descriptions=feature_builder.get_definitions(),
)

fn = 'demo_general_inquirer_frequency_plot.html'
with open(fn, 'wb') as out:
    out.write(html.encode('utf-8'))
print('Open ./%s in Chrome.' % (fn))
# Demo: log-odds ratio with an informative Dirichlet prior built from all
# categories' term frequencies.
from scattertext.termcompaction.CompactTerms import CompactTerms
import scattertext as st
from scattertext import LogOddsRatioInformativeDirichletPrior

fn = 'demo_log_odds_ratio_prior.html'
df = st.SampleCorpora.RottenTomatoes.get_data()
corpus = (st.CorpusFromPandas(df,
                              category_col='category',
                              text_col='text',
                              nlp=st.whitespace_nlp_with_sentences)
          .build())

# Build priors from counts across all categories (a starting count of 1
# smooths unseen terms).
priors = (st.PriorFactory(corpus,
                          category='fresh',
                          not_categories=['rotten'],
                          starting_count=1)
          #.use_general_term_frequencies()
          .use_all_categories()
          .get_priors())

# Context manager guarantees the handle is closed (original leaked it).
with open(fn, 'wb') as out:
    out.write(
        st.produce_frequency_explorer(
            corpus,
            category='fresh',
            not_categories=['rotten'],
            metadata=df['movie_name'],
            term_scorer=LogOddsRatioInformativeDirichletPrior(priors, 1),
        ).encode('utf-8'))
print(fn)
# NOTE(review): fragment of a Moral Foundations demo. It opens mid-call — the
# first tokens are keyword arguments completing an st.CorpusFromPandas(...)
# invocation begun outside this view. It then scores metadata features with
# Cohen's d, dumps the score table to demo_moral_foundations.csv, and renders
# an explorer to demo_moral_foundations.html (write already uses `with`).
convention_df, category_col='party', text_col='text', nlp=st.whitespace_nlp_with_sentences, feats_from_spacy_doc=moral_foundations_feats).build() cohens_d_scorer = st.CohensD(corpus).use_metadata() term_scorer = cohens_d_scorer.set_categories('democrat', ['republican']) mfd_df = term_scorer.get_score_df() print(mfd_df.head()) mfd_df.to_csv('demo_moral_foundations.csv') print('See demo_moral_foundations.csv for the output.') html = st.produce_frequency_explorer( corpus, category='democrat', category_name='Democratic', not_category_name='Republican', metadata=convention_df['speaker'], use_non_text_features=True, use_full_doc=True, term_scorer=st.CohensD(corpus).use_metadata(), grey_threshold=0, width_in_pixels=1000, topic_model_term_lists=moral_foundations_feats.get_top_model_term_lists(), metadata_descriptions=moral_foundations_feats.get_definitions()) fn = 'demo_moral_foundations.html' with open(fn, 'wb') as out: out.write(html.encode('utf-8')) print('Open ./%s in Chrome.' % (fn))
# Demo: scaled-F-score plot of drug-review sentiment (negative vs. positive
# comments on oral Metformin), using satisfaction-score comments gathered by
# a project-local utils_data helper.
all_satisfaction_score_comment_in_all_conds = utils_data.get_all_satisfaction_score_comment_in_all_conds()
columns = ['senti_on_Metfor_oral', 'feature', 'review']
all_satisfaction_score_comment_in_all_conds_df = pd.DataFrame(
    all_satisfaction_score_comment_in_all_conds, index=None, columns=columns)
# print("all_satisfaction_score_comment_in_all_conds_df",all_satisfaction_score_comment_in_all_conds_df)

corpus = CorpusFromPandas(
    all_satisfaction_score_comment_in_all_conds_df,
    category_col='senti_on_Metfor_oral',
    text_col='review',
    nlp=whitespace_nlp_with_sentences).build().get_unigram_corpus()

html = produce_frequency_explorer(
    corpus,
    category='negative',
    category_name='Negative',
    not_category_name='Positive',
    minimum_term_frequency=5,
    width_in_pixels=1000,
    term_scorer=ScaledFScorePresetsNeg1To1(beta=1, scaler_algo='normcdf'),
    grey_threshold=0,
    y_axis_values=[-1, 0, 1],
    metadata=all_satisfaction_score_comment_in_all_conds_df['feature'])

fn = '/mnt/1T-5e7/mycodehtml/Data_mining/Visualization/Scattertext/demo_scaled_f_score.html'
# Context manager guarantees the handle is closed (original leaked it).
with open(fn, 'wb') as out:
    out.write(html.encode('utf-8'))
print('Open ' + fn + ' in Chrome or Firefox.')
# Demo: scaled F-score (normal-CDF scaling, -1..1 range) over convention
# speeches.
from scattertext import SampleCorpora, whitespace_nlp_with_sentences, produce_frequency_explorer
from scattertext.CorpusFromPandas import CorpusFromPandas
from scattertext.termscoring.ScaledFScore import ScaledFScorePresetsNeg1To1

convention_df = SampleCorpora.ConventionData2012.get_data()
corpus = CorpusFromPandas(convention_df,
                          category_col='party',
                          text_col='text',
                          nlp=whitespace_nlp_with_sentences).build()

html = produce_frequency_explorer(corpus,
                                  category='democrat',
                                  category_name='Democratic',
                                  not_category_name='Republican',
                                  minimum_term_frequency=5,
                                  width_in_pixels=1000,
                                  term_scorer=ScaledFScorePresetsNeg1To1(
                                      beta=1, scaler_algo='normcdf'),
                                  grey_threshold=0,
                                  y_axis_values=[-1, 0, 1],
                                  metadata=convention_df['speaker'])
fn = './demo_scaled_f_score.html'
# Context manager guarantees the handle is closed (original leaked it).
with open(fn, 'wb') as out:
    out.write(html.encode('utf-8'))
print('Open ' + fn + ' in Chrome or Firefox.')
grey_threshold=0 ) file_name = 'demo_mann_whitney.html' open(file_name, 'wb').write(html.encode('utf-8')) print('Open %s in Chrome or Firefox.' % file_name) ''' movie_df = st.SampleCorpora.RottenTomatoes.get_data() corpus = st.CorpusFromPandas(movie_df, category_col='category', text_col='text', nlp=st.whitespace_nlp_with_sentences).build() corpus = corpus.get_unigram_corpus() score_df = st.MannWhitneyU(corpus).set_categories( 'plot', ['fresh', 'rotten']).get_score_df('fdr_bh') print(score_df.sort_values(by='mwu_z', ascending=False).head()) print(score_df.sort_values(by='mwu_z', ascending=False).tail()) html = st.produce_frequency_explorer(corpus, category='plot', y_label='Mann Whitney FDR-BH Z', scores=score_df.mwu_z, grey_threshold=0) file_name = 'demo_mann_whitney.html' open(file_name, 'wb').write(html.encode('utf-8')) print('Open %s in Chrome or Firefox.' % file_name)
# NOTE(review): fragment of a Mann-Whitney U demo. The opening tokens close a
# call begun outside this view, and the lone ''' pairs with a triple quote
# outside this view (presumably commenting out an earlier version) — confirm
# against the full file before editing. The live tail scores unigrams with a
# BH-FDR-corrected Mann-Whitney z and writes demo_mann_whitney.html.
# NOTE(review): open(...).write(...) leaks the file handle; prefer `with`.
# NOTE(review): fragment of a scikit-learn + scattertext demo (a second copy
# of the demo at another point in this file). It opens mid-call (C=..., tol=...
# complete a classifier constructor begun outside this view) and ends mid-call
# in a second produce_frequency_explorer invocation. It fits the classifier,
# builds a CorpusFromScikit over 20-newsgroups, and plots 'alt.atheism' terms
# by coefficient, then begins a scaled-F-score variant.
# NOTE(review): open(...).write(...) leaks the file handle; prefer `with`.
C=1.0 / tfidf_X.shape[0], tol=1e-3) clf.fit(tfidf_X, newsgroups_train.target) corpus = st.CorpusFromScikit( X=CountVectorizer(vocabulary=vectorizer.vocabulary_).fit_transform(newsgroups_train.data), y=newsgroups_train.target, feature_vocabulary=vectorizer.vocabulary_, category_names=newsgroups_train.target_names, raw_texts=newsgroups_train.data ).build() html = st.produce_frequency_explorer( corpus, 'alt.atheism', scores=clf.coef_[0], use_term_significance=False, terms_to_include=st.AutoTermSelector.get_selected_terms(corpus, clf.coef_[0]), metadata = ['/'.join(fn.split('/')[-2:]) for fn in newsgroups_train.filenames] ) file_name = "demo_sklearn.html" open(file_name, 'wb').write(html.encode('utf-8')) print("open " + file_name) sfs = (corpus.get_scaled_f_scores('alt.atheism') - 0.5) * 2 html = st.produce_frequency_explorer( corpus, 'alt.atheism', scores=sfs, use_term_significance=False, terms_to_include=st.AutoTermSelector.get_selected_terms(corpus, sfs),
# Demo: Hedges' r scoring over convention-speech unigrams (unigram-corpus
# variant of the Hedges' r demo elsewhere in this file).
from scattertext import SampleCorpora, whitespace_nlp_with_sentences, produce_frequency_explorer, HedgesR
from scattertext.CorpusFromPandas import CorpusFromPandas

convention_df = SampleCorpora.ConventionData2012.get_data()
corpus = (CorpusFromPandas(
    convention_df,
    category_col='party',
    text_col='text',
    nlp=whitespace_nlp_with_sentences).build().get_unigram_corpus())

html = produce_frequency_explorer(corpus,
                                  category='democrat',
                                  category_name='Democratic',
                                  not_category_name='Republican',
                                  term_scorer=HedgesR(corpus),
                                  metadata=convention_df['speaker'],
                                  grey_threshold=0)
file_name = 'demo_hedges_r.html'
# Context manager guarantees the handle is closed (original leaked it).
with open(file_name, 'wb') as out:
    out.write(html.encode('utf-8'))
print('Open ./%s in Chrome.' % (file_name))
# Demo: credibility-adjusted tf-idf scoring of movie reviews ('fresh' vs.
# 'rotten'; plot descriptions removed from the corpus).
import scattertext as st

movie_df = st.SampleCorpora.RottenTomatoes.get_data()
corpus = st.CorpusFromPandas(movie_df,
                             category_col='category',
                             text_col='text',
                             nlp=st.whitespace_nlp_with_sentences).build(
                             ).get_unigram_corpus().remove_categories(['plot'])

term_scorer = st.CredTFIDF(corpus).set_categories('fresh', ['rotten'])
print(term_scorer.get_score_df().sort_values(by='delta_cred_tf_idf',
                                             ascending=False).head())

html = st.produce_frequency_explorer(corpus,
                                     category='fresh',
                                     not_category_name='rotten',
                                     term_scorer=term_scorer,
                                     metadata=corpus.get_df()['movie_name'],
                                     grey_threshold=0)
file_name = 'demo_cred_tfidf.html'
# Context manager guarantees the handle is closed (original leaked it).
with open(file_name, 'wb') as out:
    out.write(html.encode('utf-8'))
print('Open %s in Chrome or Firefox.' % file_name)