import nltk
import pandas as pd
import scattertext as st


def create_scatter_text(writers, names, messages, nonames=False):
    my_df = pd.DataFrame({"author": names, "message": messages})
    nlp = st.tweet_tokenizier_factory(nltk.tokenize.TweetTokenizer())
    my_df['parse'] = my_df['message'].apply(nlp)
    corpus = st.CorpusFromParsedDocuments(
        my_df, category_col='author', parsed_col='parse'
    ).build().get_unigram_corpus().compact(st.AssociationCompactor(2000))
    if nonames:
        html = st.produce_scattertext_explorer(corpus,
                                               category=writers[0],
                                               category_name="Author_0",
                                               not_category_name="Author_1",
                                               minimum_term_frequency=0,
                                               pmi_threshold_coefficient=0,
                                               width_in_pixels=1000,
                                               transform=st.Scalers.dense_rank)
    else:
        html = st.produce_scattertext_explorer(corpus,
                                               category=writers[0],
                                               category_name=writers[0],
                                               not_category_name=writers[1],
                                               minimum_term_frequency=0,
                                               pmi_threshold_coefficient=0,
                                               width_in_pixels=1000,
                                               transform=st.Scalers.dense_rank)
    # The with-block closes the file on exit; no explicit close() is needed.
    with open('./demo_compact.html', 'w') as f:
        f.write(html)
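# Usage sketch (illustrative, not part of the original snippet): the author names and
# messages below are hypothetical placeholder data that show how create_scatter_text
# is called. The explorer HTML is written to ./demo_compact.html, the path hard-coded
# in the function above.
example_names = ["alice", "bob", "alice", "bob"]
example_messages = [
    "Big news today!! #excited",
    "Reading through the new release notes.",
    "Can't wait for the weekend :)",
    "Benchmarks look solid so far.",
]
create_scatter_text(writers=["alice", "bob"],
                    names=example_names,
                    messages=example_messages)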
import scattertext as st

df = st.SampleCorpora.ConventionData2012.get_data().assign(
    parse=lambda df: df.text.apply(st.whitespace_nlp_with_sentences))

corpus = st.CorpusFromParsedDocuments(
    df, category_col='party', parsed_col='parse'
).build().get_unigram_corpus().compact(st.AssociationCompactor(2000))

html = st.produce_scattertext_explorer(corpus,
                                       category='democrat',
                                       category_name='Democratic',
                                       not_category_name='Republican',
                                       minimum_term_frequency=0,
                                       pmi_threshold_coefficient=0,
                                       width_in_pixels=1000,
                                       metadata=corpus.get_df()['speaker'],
                                       transform=st.Scalers.dense_rank,
                                       show_diagonal=False,
                                       max_overlapping=3,
                                       vertical_lines=0.5)
open('./demo_vertical_lines.html', 'w').write(html)
print('open ./demo_vertical_lines.html in Chrome')
import pandas as pd
import scattertext as st

df = st.SampleCorpora.RottenTomatoes.get_data()
df['parse'] = df['text'].apply(st.whitespace_nlp_with_sentences)

corpus = (st.CorpusFromParsedDocuments(df, category_col='category', parsed_col='parse')
          .build()
          .get_unigram_corpus()
          .compact(st.AssociationCompactor(1000)))

corpus, axes = st.EmbeddingsResolver(corpus).set_embeddings_model().project_embeddings()
term_colors = st.CategoryColorAssigner(corpus).get_term_colors()

html = st.produce_pca_explorer(corpus,
                               category='fresh',
                               not_categories=['rotten'],
                               neutral_categories=['plot'],
                               metadata=df['movie_name'],
                               width_in_pixels=1000,
                               show_axes=False,
                               use_full_doc=True,
                               projection=axes,
                               term_colors=term_colors,
                               show_characteristic=False,
                               show_top_terms=False,
                               unified_context=True,
                               show_category_headings=True,
                               show_cross_axes=False,
                               include_term_category_counts=True,
                               color_func="(function(d) {return modelInfo.term_colors[d.term]})")
file_name = 'demo_unified_context.html'
open(file_name, 'w').write(html)
print('open ./' + file_name + ' in Chrome')
import scattertext as st
import spacy
import pytextrank  # registers the 'textrank' spaCy pipeline component

# PyTextRankPhrases expects parses produced by a spaCy pipeline with PyTextRank attached.
nlp = spacy.load('en_core_web_sm')
nlp.add_pipe('textrank', last=True)

convention_df = st.SampleCorpora.ConventionData2012.get_data().assign(
    parse=lambda df: df.text.apply(nlp),
    party=lambda df: df.party.apply(
        {'democrat': 'Democratic', 'republican': 'Republican'}.get
    )
)

corpus = st.CorpusFromParsedDocuments(
    convention_df,
    category_col='party',
    parsed_col='parse',
    feats_from_spacy_doc=st.PyTextRankPhrases()
).build().compact(
    st.AssociationCompactor(2000, use_non_text_features=True)
)


class Stats_Graph_Manager:
    def __init__(self):
        pass


class graph:
    def __init__(self, gdict=None):
        # Store the supplied graph dictionary, defaulting to an empty mapping.
        if gdict is None:
            gdict = {}
        self.gdict = gdict
import scattertext as st

df = st.SampleCorpora.ConventionData2012.get_data().assign(
    parse=lambda df: df.text.apply(st.whitespace_nlp_with_sentences)
)

corpus = st.CorpusFromParsedDocuments(
    df, category_col='party', parsed_col='parse'
).build().get_unigram_corpus().compact(st.AssociationCompactor(2000))

html = st.produce_scattertext_explorer(
    corpus,
    category='democrat',
    category_name='Democratic',
    not_category_name='Republican',
    minimum_term_frequency=0,
    pmi_threshold_coefficient=0,
    width_in_pixels=1000,
    metadata=corpus.get_df()['speaker'],
    transform=st.Scalers.dense_rank,
    show_diagonal=True,
    max_overlapping=3
)
open('./demo_compact.html', 'w').write(html)
print('open ./demo_compact.html in Chrome')