# Demo: pairplot of 2012 convention speeches, using General Inquirer
# lexicon categories as metadata features, one point per speaker.
# NOTE(review): KernelPCA, NMF, RobustScaler and the statsmodels PCA are
# imported but unused in this script — possibly leftovers from
# experimenting with alternative CategoryProjector settings. Kept in case
# a larger file context relies on them; confirm before deleting.
from sklearn.decomposition import KernelPCA, NMF
from sklearn.preprocessing import RobustScaler
from statsmodels.multivariate.pca import PCA

import scattertext as st

convention_df = st.SampleCorpora.ConventionData2012.get_data()
general_inquirer_feature_builder = st.FeatsFromGeneralInquirer()

# Build a speaker-categorized corpus whose metadata features are
# General Inquirer topic-lexicon matches, then restrict to unigrams.
corpus = st.CorpusFromPandas(
    convention_df,
    category_col='speaker',
    text_col='text',
    nlp=st.whitespace_nlp_with_sentences,
    feats_from_spacy_doc=general_inquirer_feature_builder,
).build().get_unigram_corpus()

html = st.produce_pairplot(
    corpus,
    use_metadata=True,
    # compactor=None keeps all metadata features instead of pruning them.
    category_projector=st.CategoryProjector(compactor=None),
    topic_model_term_lists=general_inquirer_feature_builder.get_top_model_term_lists(),
    topic_model_preview_size=100,
    metadata_descriptions=general_inquirer_feature_builder.get_definitions(),
    metadata=convention_df['party'] + ': ' + convention_df['speaker'],
)

file_name = 'convention_pair_plot_geninq.html'
# Context manager guarantees the handle is flushed and closed.
with open(file_name, 'wb') as out_file:
    out_file.write(html.encode('utf-8'))
print('./' + file_name)
# Demo: category-focused pairplot of the 20 Newsgroups training set,
# centered on the 'alt.atheism' category.
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import PCA

import scattertext as st

newsgroups_train = fetch_20newsgroups(
    subset='train',
    remove=('headers', 'footers', 'quotes'),
)

# Fit TF-IDF first purely to establish a shared vocabulary, then build a
# raw count matrix over that same vocabulary for the corpus.
vectorizer = TfidfVectorizer()
tfidf_X = vectorizer.fit_transform(newsgroups_train.data)

corpus = st.CorpusFromScikit(
    X=CountVectorizer(vocabulary=vectorizer.vocabulary_).fit_transform(
        newsgroups_train.data),
    y=newsgroups_train.target,
    feature_vocabulary=vectorizer.vocabulary_,
    category_names=newsgroups_train.target_names,
    raw_texts=newsgroups_train.data,
).build().get_unigram_corpus()

html = st.produce_category_focused_pairplot(
    corpus=corpus,
    # Project the category space down to 10 principal components.
    category_projector=st.CategoryProjector(projector=PCA(10)),
    category='alt.atheism',
)

file_name = 'demo_pair_plot_category_focused.html'
# Context manager guarantees the handle is flushed and closed.
with open(file_name, 'wb') as out_file:
    out_file.write(html.encode('utf-8'))
print('Open ./%s in Chrome.' % (file_name))
# Demo: pairplot of Rotten Tomatoes reviews grouped by movie, with the
# category space embedded via UMAP (cosine metric) and a halo displayed.
# NOTE(review): TfidfTransformer is imported but unused here; kept in
# case a larger file context relies on it — confirm before deleting.
import umap
from sklearn.feature_extraction.text import TfidfTransformer

import scattertext as st

movie_df = st.SampleCorpora.RottenTomatoes.get_data()
# Map raw sentiment labels onto human-readable category names.
movie_df.category = movie_df.category \
    .apply(lambda x: {'rotten': 'Negative',
                      'fresh': 'Positive',
                      'plot': 'Plot'}[x])

corpus = st.CorpusFromPandas(
    movie_df,
    category_col='movie_name',
    text_col='text',
    nlp=st.whitespace_nlp_with_sentences,
).build().get_stoplisted_unigram_corpus()

category_projection = st.CategoryProjector(
    projector=umap.UMAP(metric='cosine')
).project(corpus)

html = st.produce_pairplot(
    corpus,
    # Alternative: let scattertext search for the optimal projection.
    # category_projection=st.get_optimal_category_projection(corpus, verbose=True),
    category_projection=category_projection,
    metadata=movie_df['category'] + ': ' + movie_df['movie_name'],
    scaler=st.Scalers.scale_0_to_1,
    show_halo=True,
    d3_url_struct=st.D3URLs(
        d3_scale_chromatic_url='scattertext/data/viz/scripts/d3-scale-chromatic.v1.min.js',
        d3_url='scattertext/data/viz/scripts/d3.min.js'),
    default_to_term_comparison=False,
)

file_name = 'movie_pair_plot_umap.html'
# BUG FIX: the original script assigned file_name but never wrote the
# generated HTML out; restored the write step used by the sibling demos.
with open(file_name, 'wb') as out_file:
    out_file.write(html.encode('utf-8'))
print('./' + file_name)
# Demo: pairplot of Rotten Tomatoes reviews grouped by movie; the
# category space is TF-IDF normalized, then embedded via UMAP.
# BUG FIX: the original script called umap.UMAP(...) without importing
# umap, which raised NameError at runtime.
import umap
from sklearn.feature_extraction.text import TfidfTransformer

import scattertext as st

movie_df = st.SampleCorpora.RottenTomatoes.get_data()
# Map raw sentiment labels onto human-readable category names.
movie_df.category = movie_df.category \
    .apply(lambda x: {'rotten': 'Negative',
                      'fresh': 'Positive',
                      'plot': 'Plot'}[x])

corpus = st.CorpusFromPandas(
    movie_df,
    category_col='movie_name',
    text_col='text',
    nlp=st.whitespace_nlp_with_sentences,
).build().get_stoplisted_unigram_corpus()

category_projection = st.CategoryProjector(
    selector=None,                      # keep all terms (no term selection)
    normalizer=TfidfTransformer(),      # TF-IDF weight the category vectors
    projector=umap.UMAP(min_dist=0.5, metric='cosine'),
).project(corpus)

html = st.produce_pairplot(
    corpus,
    # Alternative: let scattertext search for the optimal projection.
    # category_projection=st.get_optimal_category_projection(corpus, verbose=True),
    category_projection=category_projection,
    metadata=movie_df['category'] + ': ' + movie_df['movie_name'],
    scaler=st.Scalers.scale_0_to_1,
    show_halo=False,
    d3_url_struct=st.D3URLs(
        d3_scale_chromatic_url='scattertext/data/viz/scripts/d3-scale-chromatic.v1.min.js',
        d3_url='scattertext/data/viz/scripts/d3.min.js'),
)

file_name = 'movie_pair_plot_umap.html'
# BUG FIX: the original script assigned file_name but never wrote the
# generated HTML out; restored the write step used by the sibling demos.
with open(file_name, 'wb') as out_file:
    out_file.write(html.encode('utf-8'))
print('./' + file_name)
# Demo: pairplot of 2012 convention speeches using Empath lexicon
# categories as metadata features, one point per speaker.
import scattertext as st
import scattertext.categoryprojector.pairplot

convention_df = st.SampleCorpora.ConventionData2012.get_data()
empath_feature_builder = st.FeatsFromOnlyEmpath()

# Build a speaker-categorized corpus whose metadata features come from
# the Empath topic lexicon, then restrict to unigrams.
corpus = st.CorpusFromPandas(
    convention_df,
    category_col='speaker',
    text_col='text',
    nlp=st.whitespace_nlp_with_sentences,
    feats_from_spacy_doc=empath_feature_builder,
).build().get_unigram_corpus()

# Called through the explicit module path (rather than st.produce_pairplot)
# — same function; presumably done here to exercise the full import path.
html = scattertext.categoryprojector.pairplot.produce_pairplot(
    corpus,
    use_metadata=True,
    # selector=None keeps all metadata features instead of selecting a subset.
    category_projector=st.CategoryProjector(selector=None),
    topic_model_term_lists=empath_feature_builder.get_top_model_term_lists(),
    metadata=convention_df['party'] + ': ' + convention_df['speaker'],
)

file_name = 'convention_pair_plot_empath.html'
# Context manager guarantees the handle is flushed and closed.
with open(file_name, 'wb') as out_file:
    out_file.write(html.encode('utf-8'))
print('./' + file_name)
# Demo: pairplot of Rotten Tomatoes reviews grouped by movie, with the
# category space embedded via PHATE.
# BUG FIX: the original script called phate.PHATE() without importing
# phate, which raised NameError at runtime.
import phate

import scattertext as st

movie_df = st.SampleCorpora.RottenTomatoes.get_data()
# Map raw sentiment labels onto human-readable category names.
movie_df.category = movie_df.category \
    .apply(lambda x: {'rotten': 'Negative',
                      'fresh': 'Positive',
                      'plot': 'Plot'}[x])

corpus = st.CorpusFromPandas(
    movie_df,
    category_col='movie_name',
    text_col='text',
    nlp=st.whitespace_nlp_with_sentences,
).build().get_stoplisted_unigram_corpus()

html = st.produce_pairplot(
    corpus,
    category_projector=st.CategoryProjector(projector=phate.PHATE()),
    metadata=movie_df['category'] + ': ' + movie_df['movie_name'],
    # Options below were disabled in the original demo; kept for reference.
    # scaler=st.Scalers.scale_0_to_1,
    # show_halo=False,
    # d3_url_struct=st.D3URLs(
    #     d3_scale_chromatic_url='scattertext/data/viz/scripts/d3-scale-chromatic.v1.min.js',
    #     d3_url='scattertext/data/viz/scripts/d3.min.js'
    # ),
    default_to_term_comparison=False,
)

file_name = 'movie_pair_plot_phates.html'
# Context manager guarantees the handle is flushed and closed.
with open(file_name, 'wb') as out_file:
    out_file.write(html.encode('utf-8'))
print('./' + file_name)