# Demo script: builds a pair plot of the Rotten Tomatoes sample corpus, using
# each movie name as a category, and writes it to movie_pair_plot.html.
import scattertext as st  # `st` is used throughout but was not imported in this script
from scattertext.cartegoryprojector.OptimalProjection import get_optimal_category_projection
from scattertext.termcompaction.AssociationCompactor import ScorePercentileCompactor, AssociationCompactor
from scattertext.termscoring import ScaledFScore

movie_df = st.SampleCorpora.RottenTomatoes.get_data()

# Replace the raw labels with display names; an unmapped label raises KeyError.
category_labels = {'rotten': 'Negative', 'fresh': 'Positive', 'plot': 'Plot'}
movie_df['category'] = movie_df['category'].apply(lambda label: category_labels[label])

corpus = st.CorpusFromPandas(
    movie_df,
    category_col='movie_name',
    text_col='text',
    nlp=st.whitespace_nlp_with_sentences,
).build().get_unigram_corpus()

# Disabled alternative that builds an explicit category projection.
# CategoryProjector, RankDifference and PCA are not imported in this script
# and would have to be before re-enabling it.
# category_projection = get_optimal_category_projection(
#     corpus,
#     n_dims=2,
#     n_steps=20,
#     projector=lambda n_terms, n_dims: CategoryProjector(
#         AssociationCompactor(n_terms, scorer=RankDifference),
#         projector=PCA(n_dims)))

html = st.produce_pairplot(
    corpus,
    # category_projection=category_projection,
    # One metadata string per row: "<label>: <movie name>".
    metadata=movie_df['category'] + ': ' + movie_df['movie_name'],
)

file_name = 'movie_pair_plot.html'
# Use a context manager so the file is flushed and closed deterministically.
with open(file_name, 'wb') as out_file:
    out_file.write(html.encode('utf-8'))
print('./' + file_name)
# Demo script: builds a pair plot of the 2012 convention speeches, one
# category per speaker, using General Inquirer features as metadata, and
# writes it to convention_pair_plot_geninq.html.
from sklearn.decomposition import KernelPCA, NMF
from sklearn.preprocessing import RobustScaler
from statsmodels.multivariate.pca import PCA

import scattertext as st

convention_df = st.SampleCorpora.ConventionData2012.get_data()

# The same builder supplies the document features, the topic term lists and
# the metadata descriptions passed to the plot below.
general_inquirer_feature_builder = st.FeatsFromGeneralInquirer()

corpus = st.CorpusFromPandas(
    convention_df,
    category_col='speaker',
    text_col='text',
    nlp=st.whitespace_nlp_with_sentences,
    feats_from_spacy_doc=general_inquirer_feature_builder,
).build().get_unigram_corpus()

html = st.produce_pairplot(
    corpus,
    use_metadata=True,
    category_projector=st.CategoryProjector(compactor=None),
    topic_model_term_lists=general_inquirer_feature_builder.get_top_model_term_lists(),
    topic_model_preview_size=100,
    metadata_descriptions=general_inquirer_feature_builder.get_definitions(),
    # One metadata string per row: "<party>: <speaker>".
    metadata=convention_df['party'] + ': ' + convention_df['speaker'],
)

file_name = 'convention_pair_plot_geninq.html'
# Use a context manager so the file is flushed and closed deterministically.
with open(file_name, 'wb') as out_file:
    out_file.write(html.encode('utf-8'))
print('./' + file_name)
# Demo script: builds a pair plot of the Rotten Tomatoes sample corpus, one
# category per movie, using the result of get_optimal_category_projection,
# and writes it to movie_pair_plot_movies_mirror.html.
import scattertext as st

movie_df = st.SampleCorpora.RottenTomatoes.get_data()

# Replace the raw labels with display names; an unmapped label raises KeyError.
category_labels = {'rotten': 'Negative', 'fresh': 'Positive', 'plot': 'Plot'}
movie_df['category'] = movie_df['category'].apply(lambda label: category_labels[label])

# Show movie names with spaces instead of underscores.
movie_df['movie_name'] = movie_df['movie_name'].apply(lambda name: name.replace('_', ' '))

corpus = st.CorpusFromPandas(
    movie_df,
    category_col='movie_name',
    text_col='text',
    nlp=st.whitespace_nlp_with_sentences,
).build().get_stoplisted_unigram_corpus()

html = st.produce_pairplot(
    corpus,
    category_projection=st.get_optimal_category_projection(corpus, verbose=True),
    # One metadata string per row: "<label>: <movie name>".
    metadata=movie_df['category'] + ': ' + movie_df['movie_name'],
    # D3 script locations handed to the plot (relative paths).
    d3_url_struct=st.D3URLs(
        d3_scale_chromatic_url='scattertext/data/viz/scripts/d3-scale-chromatic.v1.min.js',
        d3_url='scattertext/data/viz/scripts/d3.min.js',
    ),
    default_to_term_comparison=False,
)

file_name = 'movie_pair_plot_movies_mirror.html'
# Use a context manager so the file is flushed and closed deterministically.
with open(file_name, 'wb') as out_file:
    out_file.write(html.encode('utf-8'))
print('./' + file_name)
# Demo script: builds a pair plot of the Rotten Tomatoes sample corpus, one
# category per movie, using the result of get_optimal_category_projection,
# and writes it to movie_pair_plot.html.
import scattertext as st

movie_df = st.SampleCorpora.RottenTomatoes.get_data()

# Replace the raw labels with display names; an unmapped label raises KeyError.
category_labels = {'rotten': 'Negative', 'fresh': 'Positive', 'plot': 'Plot'}
movie_df['category'] = movie_df['category'].apply(lambda label: category_labels[label])

corpus = st.CorpusFromPandas(
    movie_df,
    category_col='movie_name',
    text_col='text',
    nlp=st.whitespace_nlp_with_sentences,
).build().get_unigram_corpus()

html = st.produce_pairplot(
    corpus,
    category_projection=st.get_optimal_category_projection(corpus),
    # One metadata string per row: "<label>: <movie name>".
    metadata=movie_df['category'] + ': ' + movie_df['movie_name'],
)

file_name = 'movie_pair_plot.html'
# Use a context manager so the file is flushed and closed deterministically.
with open(file_name, 'wb') as out_file:
    out_file.write(html.encode('utf-8'))
print('./' + file_name)
# Demo script: builds a pair plot of the 2012 convention speeches, one
# category per speaker, and writes it to convention_pair_plot.html.
from sklearn.decomposition import KernelPCA
from sklearn.preprocessing import RobustScaler

import scattertext as st

convention_df = st.SampleCorpora.ConventionData2012.get_data()

corpus = st.CorpusFromPandas(
    convention_df,
    category_col='speaker',
    text_col='text',
    nlp=st.whitespace_nlp_with_sentences,
).build().get_unigram_corpus()

html = st.produce_pairplot(
    corpus,
    # One metadata string per row: "<party>: <speaker>".
    metadata=convention_df['party'] + ': ' + convention_df['speaker'],
)

file_name = 'convention_pair_plot.html'
# Use a context manager so the file is flushed and closed deterministically.
with open(file_name, 'wb') as out_file:
    out_file.write(html.encode('utf-8'))
print('./' + file_name)
# Demo script: builds a pair plot of the 20 Newsgroups training set from
# scikit-learn count features and writes it to demo_pair_plot.html.
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import f1_score

import scattertext as st

newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))

# Fitting the TF-IDF vectorizer provides the vocabulary reused below.
# NOTE(review): tfidf_X itself is not read anywhere in this script.
vectorizer = TfidfVectorizer()
tfidf_X = vectorizer.fit_transform(newsgroups_train.data)

# Count features computed over the vocabulary fitted above.
term_counts = CountVectorizer(vocabulary=vectorizer.vocabulary_).fit_transform(
    newsgroups_train.data)

corpus = st.CorpusFromScikit(
    X=term_counts,
    y=newsgroups_train.target,
    feature_vocabulary=vectorizer.vocabulary_,
    category_names=newsgroups_train.target_names,
    raw_texts=newsgroups_train.data,
).build().get_unigram_corpus()

html = st.produce_pairplot(corpus)

file_name = 'demo_pair_plot.html'
# Use a context manager so the file is flushed and closed deterministically.
with open(file_name, 'wb') as out_file:
    out_file.write(html.encode('utf-8'))
print('Open ./%s in Chrome.' % (file_name))
# Demo script: builds a pair plot of the 2012 convention speeches, one
# category per speaker, using Empath features as metadata, and writes it to
# convention_pair_plot_empath.html.
import scattertext as st

convention_df = st.SampleCorpora.ConventionData2012.get_data()

# The same builder supplies the document features and the topic term lists
# passed to the plot below.
empath_feature_builder = st.FeatsFromOnlyEmpath()

corpus = st.CorpusFromPandas(
    convention_df,
    category_col='speaker',
    text_col='text',
    nlp=st.whitespace_nlp_with_sentences,
    feats_from_spacy_doc=empath_feature_builder,
).build().get_unigram_corpus()

html = st.produce_pairplot(
    corpus,
    use_metadata=True,
    category_projector=st.CategoryProjector(compactor=None),
    topic_model_term_lists=empath_feature_builder.get_top_model_term_lists(),
    # One metadata string per row: "<party>: <speaker>".
    metadata=convention_df['party'] + ': ' + convention_df['speaker'],
)

file_name = 'convention_pair_plot_empath.html'
# Use a context manager so the file is flushed and closed deterministically.
with open(file_name, 'wb') as out_file:
    out_file.write(html.encode('utf-8'))
print('./' + file_name)
# Demo script: builds a pair plot of the Rotten Tomatoes sample corpus, one
# category per movie, projected with Doc2VecCategoryProjector, and writes it
# to movie_pair_plot_d2v.html.
import scattertext as st
# NOTE(review): gensim is not referenced directly below; presumably it is
# required by Doc2VecCategoryProjector - confirm before removing.
import gensim

movie_df = st.SampleCorpora.RottenTomatoes.get_data()

# Replace the raw labels with display names; an unmapped label raises KeyError.
category_labels = {'rotten': 'Negative', 'fresh': 'Positive', 'plot': 'Plot'}
movie_df['category'] = movie_df['category'].apply(lambda label: category_labels[label])

# Parse each text up front; the corpus is built from this column.
movie_df['parse'] = movie_df.text.apply(st.whitespace_nlp_with_sentences)

corpus = st.CorpusFromParsedDocuments(
    movie_df,
    category_col='movie_name',
    parsed_col='parse',
).build().get_stoplisted_unigram_corpus()

category_projection = st.Doc2VecCategoryProjector().project(corpus)

html = st.produce_pairplot(
    corpus,
    category_projection=category_projection,
    # One metadata string per row: "<label>: <movie name>".
    metadata=movie_df['category'] + ': ' + movie_df['movie_name'],
    scaler=st.Scalers.scale_0_to_1,
    # D3 script locations handed to the plot (relative paths).
    d3_url_struct=st.D3URLs(
        d3_scale_chromatic_url='scattertext/data/viz/scripts/d3-scale-chromatic.v1.min.js',
        d3_url='scattertext/data/viz/scripts/d3.min.js',
    ),
)

file_name = 'movie_pair_plot_d2v.html'
# Use a context manager so the file is flushed and closed deterministically.
with open(file_name, 'wb') as out_file:
    out_file.write(html.encode('utf-8'))
print('./' + file_name)
# Demo script: builds a pair plot of the stop-listed Rotten Tomatoes sample
# corpus, one category per movie, using the result of
# get_optimal_category_projection, and writes it to movie_pair_plot.html.
import scattertext as st

movie_df = st.SampleCorpora.RottenTomatoes.get_data()

# Replace the raw labels with display names; an unmapped label raises KeyError.
category_labels = {'rotten': 'Negative', 'fresh': 'Positive', 'plot': 'Plot'}
movie_df['category'] = movie_df['category'].apply(lambda label: category_labels[label])

corpus = st.CorpusFromPandas(
    movie_df,
    category_col='movie_name',
    text_col='text',
    nlp=st.whitespace_nlp_with_sentences,
).build().get_stoplisted_unigram_corpus()

html = st.produce_pairplot(
    corpus,
    category_projection=st.get_optimal_category_projection(corpus, verbose=True),
    # One metadata string per row: "<label>: <movie name>".
    metadata=movie_df['category'] + ': ' + movie_df['movie_name'],
    # D3 script locations handed to the plot (relative paths).
    d3_url_struct=st.D3URLs(
        d3_scale_chromatic_url='scattertext/data/viz/scripts/d3-scale-chromatic.v1.min.js',
        d3_url='scattertext/data/viz/scripts/d3.min.js',
    ),
)

file_name = 'movie_pair_plot.html'
# Use a context manager so the file is flushed and closed deterministically.
with open(file_name, 'wb') as out_file:
    out_file.write(html.encode('utf-8'))
print('./' + file_name)
# Demo script: builds a pair plot of the Rotten Tomatoes sample corpus, one
# category per movie, projected with Doc2VecCategoryProjector, and writes it
# to movie_pair_plot_d2v.html.
# NOTE(review): gensim is not referenced directly below; presumably it is
# required by Doc2VecCategoryProjector - confirm before removing.
import gensim

import scattertext as st  # `st` is used throughout but was not imported in this script

movie_df = st.SampleCorpora.RottenTomatoes.get_data()

# Replace the raw labels with display names; an unmapped label raises KeyError.
category_labels = {'rotten': 'Negative', 'fresh': 'Positive', 'plot': 'Plot'}
movie_df['category'] = movie_df['category'].apply(lambda label: category_labels[label])

# Parse each text up front; the corpus is built from this column.
movie_df['parse'] = movie_df.text.apply(st.whitespace_nlp_with_sentences)

corpus = st.CorpusFromParsedDocuments(
    movie_df,
    category_col='movie_name',
    parsed_col='parse',
).build().get_stoplisted_unigram_corpus()

category_projection = st.Doc2VecCategoryProjector().project(corpus)

html = st.produce_pairplot(
    corpus,
    category_projection=category_projection,
    # One metadata string per row: "<label>: <movie name>".
    metadata=movie_df['category'] + ': ' + movie_df['movie_name'],
    scaler=st.Scalers.scale_0_to_1,
    # D3 script locations handed to the plot (relative paths).
    d3_url_struct=st.D3URLs(
        d3_scale_chromatic_url='scattertext/data/viz/scripts/d3-scale-chromatic.v1.min.js',
        d3_url='scattertext/data/viz/scripts/d3.min.js',
    ),
)

file_name = 'movie_pair_plot_d2v.html'
# Use a context manager so the file is flushed and closed deterministically.
with open(file_name, 'wb') as out_file:
    out_file.write(html.encode('utf-8'))
print('./' + file_name)
# Demo script: builds a pair plot of the Rotten Tomatoes sample corpus, one
# category per movie, with no explicit projection arguments, and writes it to
# movie_pair_plot.html.
import scattertext as st

movie_df = st.SampleCorpora.RottenTomatoes.get_data()

# Replace the raw labels with display names; an unmapped label raises KeyError.
category_labels = {'rotten': 'Negative', 'fresh': 'Positive', 'plot': 'Plot'}
movie_df['category'] = movie_df['category'].apply(lambda label: category_labels[label])

corpus = st.CorpusFromPandas(
    movie_df,
    category_col='movie_name',
    text_col='text',
    nlp=st.whitespace_nlp_with_sentences,
).build().get_unigram_corpus()

html = st.produce_pairplot(
    corpus,
    # One metadata string per row: "<label>: <movie name>".
    metadata=movie_df['category'] + ': ' + movie_df['movie_name'],
)

file_name = 'movie_pair_plot.html'
# Use a context manager so the file is flushed and closed deterministically.
with open(file_name, 'wb') as out_file:
    out_file.write(html.encode('utf-8'))
print('./' + file_name)
# Demo script: builds a pair plot of the stop-listed Rotten Tomatoes sample
# corpus, one category per movie, projected with TF-IDF normalization and
# UMAP, and writes it to movie_pair_plot_umap.html.
# The three names imported here are all referenced below but were not
# imported in this script.
import umap
from sklearn.feature_extraction.text import TfidfTransformer

import scattertext as st

movie_df = st.SampleCorpora.RottenTomatoes.get_data()

# Replace the raw labels with display names; an unmapped label raises KeyError.
category_labels = {'rotten': 'Negative', 'fresh': 'Positive', 'plot': 'Plot'}
movie_df['category'] = movie_df['category'].apply(lambda label: category_labels[label])

corpus = st.CorpusFromPandas(
    movie_df,
    category_col='movie_name',
    text_col='text',
    nlp=st.whitespace_nlp_with_sentences,
).build().get_stoplisted_unigram_corpus()

category_projection = st.CategoryProjector(
    selector=None,
    normalizer=TfidfTransformer(),
    projector=umap.UMAP(min_dist=0.5, metric='cosine'),
).project(corpus)

html = st.produce_pairplot(
    corpus,
    # Disabled alternative:
    # category_projection=st.get_optimal_category_projection(corpus, verbose=True),
    category_projection=category_projection,
    # One metadata string per row: "<label>: <movie name>".
    metadata=movie_df['category'] + ': ' + movie_df['movie_name'],
    scaler=st.Scalers.scale_0_to_1,
    show_halo=False,
    # D3 script locations handed to the plot (relative paths).
    d3_url_struct=st.D3URLs(
        d3_scale_chromatic_url='scattertext/data/viz/scripts/d3-scale-chromatic.v1.min.js',
        d3_url='scattertext/data/viz/scripts/d3.min.js',
    ),
)

file_name = 'movie_pair_plot_umap.html'
# Use a context manager so the file is flushed and closed deterministically.
with open(file_name, 'wb') as out_file:
    out_file.write(html.encode('utf-8'))
print('./' + file_name)
# Demo script: builds a pair plot of the stop-listed Rotten Tomatoes sample
# corpus, one category per movie, projected with PHATE, and writes it to
# movie_pair_plot_phates.html.
import phate  # `phate.PHATE` is used below but was not imported in this script

import scattertext as st

movie_df = st.SampleCorpora.RottenTomatoes.get_data()

# Replace the raw labels with display names; an unmapped label raises KeyError.
category_labels = {'rotten': 'Negative', 'fresh': 'Positive', 'plot': 'Plot'}
movie_df['category'] = movie_df['category'].apply(lambda label: category_labels[label])

corpus = st.CorpusFromPandas(
    movie_df,
    category_col='movie_name',
    text_col='text',
    nlp=st.whitespace_nlp_with_sentences,
).build().get_stoplisted_unigram_corpus()

html = st.produce_pairplot(
    corpus,
    category_projector=st.CategoryProjector(projector=phate.PHATE()),
    # One metadata string per row: "<label>: <movie name>".
    metadata=movie_df['category'] + ': ' + movie_df['movie_name'],
    # Disabled options, kept for reference:
    # scaler=st.Scalers.scale_0_to_1,
    # show_halo=False,
    # d3_url_struct=st.D3URLs(
    #     d3_scale_chromatic_url='scattertext/data/viz/scripts/d3-scale-chromatic.v1.min.js',
    #     d3_url='scattertext/data/viz/scripts/d3.min.js'
    # ),
    default_to_term_comparison=False,
)

file_name = 'movie_pair_plot_phates.html'
# Use a context manager so the file is flushed and closed deterministically.
with open(file_name, 'wb') as out_file:
    out_file.write(html.encode('utf-8'))
print('./' + file_name)