Exemplo n.º 1
0
from scattertext.cartegoryprojector.OptimalProjection import get_optimal_category_projection
from scattertext.termcompaction.AssociationCompactor import ScorePercentileCompactor, AssociationCompactor
from scattertext.termscoring import ScaledFScore

# Example: pair plot of the Rotten Tomatoes sample corpus, using each movie
# name as a category.
# `st` is used throughout this script but no import precedes it in the file,
# so bring it into scope here.
import scattertext as st

# Display names for the raw dataset labels; an unexpected label raises KeyError.
category_labels = {'rotten': 'Negative', 'fresh': 'Positive', 'plot': 'Plot'}

movie_df = st.SampleCorpora.RottenTomatoes.get_data()
movie_df.category = movie_df.category.apply(lambda x: category_labels[x])

corpus = st.CorpusFromPandas(
    movie_df,
    category_col='movie_name',
    text_col='text',
    nlp=st.whitespace_nlp_with_sentences
).build().get_unigram_corpus()

# Disabled experiment, kept for reference. It needs CategoryProjector,
# RankDifference and PCA, none of which are imported in the visible source.
# category_projection = get_optimal_category_projection(
#     corpus,
#     n_dims=2,
#     n_steps=20,
#     projector=lambda n_terms, n_dims: CategoryProjector(AssociationCompactor(n_terms, scorer=RankDifference),
#                                                         projector=PCA(n_dims)))

html = st.produce_pairplot(corpus,
                           # category_projection=category_projection,
                           metadata=movie_df['category'] + ': ' + movie_df['movie_name'])

file_name = 'movie_pair_plot.html'
# Write through a context manager so the handle is flushed and closed
# deterministically instead of being left to the garbage collector.
with open(file_name, 'wb') as out_file:
    out_file.write(html.encode('utf-8'))
print('./' + file_name)
Exemplo n.º 2
0
from sklearn.decomposition import KernelPCA, NMF
from sklearn.preprocessing import RobustScaler
from statsmodels.multivariate.pca import PCA

import scattertext as st

# Example: pair plot of the ConventionData2012 sample corpus, one category per
# speaker, with features supplied by the General Inquirer feature builder.
convention_df = st.SampleCorpora.ConventionData2012.get_data()
general_inquirer_feature_builder = st.FeatsFromGeneralInquirer()

corpus = st.CorpusFromPandas(
    convention_df,
    category_col='speaker',
    text_col='text',
    nlp=st.whitespace_nlp_with_sentences,
    feats_from_spacy_doc=general_inquirer_feature_builder,
).build().get_unigram_corpus()

# The same feature builder supplies both the topic term lists and the
# per-topic descriptions handed to the plot.
topic_term_lists = general_inquirer_feature_builder.get_top_model_term_lists()
topic_descriptions = general_inquirer_feature_builder.get_definitions()

html = st.produce_pairplot(
    corpus,
    use_metadata=True,
    category_projector=st.CategoryProjector(compactor=None),
    topic_model_term_lists=topic_term_lists,
    topic_model_preview_size=100,
    metadata_descriptions=topic_descriptions,
    metadata=convention_df['party'] + ': ' + convention_df['speaker'])

file_name = 'convention_pair_plot_geninq.html'
# Write through a context manager so the handle is flushed and closed
# deterministically instead of being left to the garbage collector.
with open(file_name, 'wb') as out_file:
    out_file.write(html.encode('utf-8'))
print('./' + file_name)
import scattertext as st

# Example: pair plot of the Rotten Tomatoes sample corpus with an optimised
# category projection and D3 scripts referenced by relative path.

# Display names for the raw dataset labels; an unexpected label raises KeyError.
category_labels = {'rotten': 'Negative', 'fresh': 'Positive', 'plot': 'Plot'}

movie_df = st.SampleCorpora.RottenTomatoes.get_data()
movie_df.category = movie_df.category.apply(lambda x: category_labels[x])
# Movie names use underscores as word separators; show them with spaces.
movie_df.movie_name = movie_df.movie_name.apply(lambda x: x.replace('_', ' '))

corpus = st.CorpusFromPandas(
    movie_df,
    category_col='movie_name',
    text_col='text',
    nlp=st.whitespace_nlp_with_sentences
).build().get_stoplisted_unigram_corpus()

category_projection = st.get_optimal_category_projection(corpus, verbose=True)

html = st.produce_pairplot(
    corpus,
    category_projection=category_projection,
    metadata=movie_df['category'] + ': ' + movie_df['movie_name'],
    d3_url_struct=st.D3URLs(
        d3_scale_chromatic_url='scattertext/data/viz/scripts/d3-scale-chromatic.v1.min.js',
        d3_url='scattertext/data/viz/scripts/d3.min.js'
    ),
    default_to_term_comparison=False
)

file_name = 'movie_pair_plot_movies_mirror.html'
# Write through a context manager so the handle is flushed and closed
# deterministically instead of being left to the garbage collector.
with open(file_name, 'wb') as out_file:
    out_file.write(html.encode('utf-8'))
print('./' + file_name)
Exemplo n.º 4
0
import scattertext as st
# Example: pair plot of the Rotten Tomatoes sample corpus using the category
# projection returned by get_optimal_category_projection.

# Display names for the raw dataset labels; an unexpected label raises KeyError.
category_labels = {'rotten': 'Negative', 'fresh': 'Positive', 'plot': 'Plot'}

movie_df = st.SampleCorpora.RottenTomatoes.get_data()
movie_df.category = movie_df.category.apply(lambda x: category_labels[x])

corpus = st.CorpusFromPandas(
    movie_df,
    category_col='movie_name',
    text_col='text',
    nlp=st.whitespace_nlp_with_sentences
).build().get_unigram_corpus()

html = st.produce_pairplot(
    corpus,
    category_projection=st.get_optimal_category_projection(corpus),
    metadata=movie_df['category'] + ': ' + movie_df['movie_name'])

file_name = 'movie_pair_plot.html'
# Write through a context manager so the handle is flushed and closed
# deterministically instead of being left to the garbage collector.
with open(file_name, 'wb') as out_file:
    out_file.write(html.encode('utf-8'))
print('./' + file_name)
Exemplo n.º 5
0
from sklearn.decomposition import KernelPCA
from sklearn.preprocessing import RobustScaler

import scattertext as st

# Example: pair plot of the ConventionData2012 sample corpus, one category per
# speaker, with default projection settings.
convention_df = st.SampleCorpora.ConventionData2012.get_data()

corpus = st.CorpusFromPandas(
    convention_df,
    category_col='speaker',
    text_col='text',
    nlp=st.whitespace_nlp_with_sentences
).build().get_unigram_corpus()

# Each document is labelled "<party>: <speaker>" in the plot metadata.
document_labels = convention_df['party'] + ': ' + convention_df['speaker']
html = st.produce_pairplot(corpus, metadata=document_labels)

file_name = 'convention_pair_plot.html'
# Write through a context manager so the handle is flushed and closed
# deterministically instead of being left to the garbage collector.
with open(file_name, 'wb') as out_file:
    out_file.write(html.encode('utf-8'))
print('./' + file_name)
Exemplo n.º 6
0
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import f1_score
import scattertext as st

# Example: pair plot built from a scikit-learn term-count matrix over the
# 20 newsgroups training set.
newsgroups_train = fetch_20newsgroups(subset='train',
                                      remove=('headers', 'footers', 'quotes'))

# Fitting the TF-IDF vectorizer populates `vocabulary_`, which is reused below
# so the count matrix columns and `feature_vocabulary` share one index.
vectorizer = TfidfVectorizer()
tfidf_X = vectorizer.fit_transform(newsgroups_train.data)

count_X = CountVectorizer(vocabulary=vectorizer.vocabulary_).fit_transform(
    newsgroups_train.data)

corpus = st.CorpusFromScikit(
    X=count_X,
    y=newsgroups_train.target,
    feature_vocabulary=vectorizer.vocabulary_,
    category_names=newsgroups_train.target_names,
    raw_texts=newsgroups_train.data).build().get_unigram_corpus()

html = st.produce_pairplot(corpus)
file_name = 'demo_pair_plot.html'
# Write through a context manager so the handle is flushed and closed
# deterministically instead of being left to the garbage collector.
with open(file_name, 'wb') as out_file:
    out_file.write(html.encode('utf-8'))
print('Open ./%s in Chrome.' % file_name)
import scattertext as st

# Example: pair plot of the ConventionData2012 sample corpus, one category per
# speaker, with features supplied by the Empath-only feature builder.
convention_df = st.SampleCorpora.ConventionData2012.get_data()
empath_feature_builder = st.FeatsFromOnlyEmpath()

corpus = st.CorpusFromPandas(
    convention_df,
    category_col='speaker',
    text_col='text',
    nlp=st.whitespace_nlp_with_sentences,
    feats_from_spacy_doc=empath_feature_builder
).build().get_unigram_corpus()

# The feature builder also supplies the topic term lists handed to the plot.
topic_term_lists = empath_feature_builder.get_top_model_term_lists()

html = st.produce_pairplot(
    corpus,
    use_metadata=True,
    category_projector=st.CategoryProjector(compactor=None),
    topic_model_term_lists=topic_term_lists,
    metadata=convention_df['party'] + ': ' + convention_df['speaker'])

file_name = 'convention_pair_plot_empath.html'
# Write through a context manager so the handle is flushed and closed
# deterministically instead of being left to the garbage collector.
with open(file_name, 'wb') as out_file:
    out_file.write(html.encode('utf-8'))
print('./' + file_name)
Exemplo n.º 8
0
import scattertext as st
import gensim

# Example: pair plot of the Rotten Tomatoes sample corpus with categories
# projected by Doc2VecCategoryProjector.

# Display names for the raw dataset labels; an unexpected label raises KeyError.
category_labels = {'rotten': 'Negative', 'fresh': 'Positive', 'plot': 'Plot'}

movie_df = st.SampleCorpora.RottenTomatoes.get_data()
movie_df.category = movie_df.category.apply(lambda x: category_labels[x])
# Parse each text up front; the corpus below is built from this parsed column.
movie_df['parse'] = movie_df.text.apply(st.whitespace_nlp_with_sentences)

corpus = st.CorpusFromParsedDocuments(
    movie_df,
    category_col='movie_name',
    parsed_col='parse'
).build().get_stoplisted_unigram_corpus()

category_projection = st.Doc2VecCategoryProjector().project(corpus)

html = st.produce_pairplot(
    corpus,
    category_projection=category_projection,
    metadata=movie_df['category'] + ': ' + movie_df['movie_name'],
    scaler=st.Scalers.scale_0_to_1,
    d3_url_struct=st.D3URLs(
        d3_scale_chromatic_url='scattertext/data/viz/scripts/d3-scale-chromatic.v1.min.js',
        d3_url='scattertext/data/viz/scripts/d3.min.js'))

file_name = 'movie_pair_plot_d2v.html'
# Write through a context manager so the handle is flushed and closed
# deterministically instead of being left to the garbage collector.
with open(file_name, 'wb') as out_file:
    out_file.write(html.encode('utf-8'))
print('./' + file_name)
import scattertext as st

# Example: pair plot of the Rotten Tomatoes sample corpus with an optimised
# category projection and D3 scripts referenced by relative path.

# Display names for the raw dataset labels; an unexpected label raises KeyError.
category_labels = {'rotten': 'Negative', 'fresh': 'Positive', 'plot': 'Plot'}

movie_df = st.SampleCorpora.RottenTomatoes.get_data()
movie_df.category = movie_df.category.apply(lambda x: category_labels[x])

corpus = st.CorpusFromPandas(
    movie_df,
    category_col='movie_name',
    text_col='text',
    nlp=st.whitespace_nlp_with_sentences
).build().get_stoplisted_unigram_corpus()

category_projection = st.get_optimal_category_projection(corpus, verbose=True)

html = st.produce_pairplot(
    corpus,
    category_projection=category_projection,
    metadata=movie_df['category'] + ': ' + movie_df['movie_name'],
    d3_url_struct=st.D3URLs(
        d3_scale_chromatic_url='scattertext/data/viz/scripts/d3-scale-chromatic.v1.min.js',
        d3_url='scattertext/data/viz/scripts/d3.min.js'
    )
)

file_name = 'movie_pair_plot.html'
# Write through a context manager so the handle is flushed and closed
# deterministically instead of being left to the garbage collector.
with open(file_name, 'wb') as out_file:
    out_file.write(html.encode('utf-8'))
print('./' + file_name)
import gensim

# Example: pair plot of the Rotten Tomatoes sample corpus with categories
# projected by Doc2VecCategoryProjector.

# Display names for the raw dataset labels; an unexpected label raises KeyError.
category_labels = {'rotten': 'Negative', 'fresh': 'Positive', 'plot': 'Plot'}

movie_df = st.SampleCorpora.RottenTomatoes.get_data()
movie_df.category = movie_df.category.apply(lambda x: category_labels[x])
# Parse each text up front; the corpus below is built from this parsed column.
movie_df['parse'] = movie_df.text.apply(st.whitespace_nlp_with_sentences)

corpus = st.CorpusFromParsedDocuments(
    movie_df,
    category_col='movie_name',
    parsed_col='parse'
).build().get_stoplisted_unigram_corpus()

category_projection = st.Doc2VecCategoryProjector().project(corpus)

html = st.produce_pairplot(
    corpus,
    category_projection=category_projection,
    metadata=movie_df['category'] + ': ' + movie_df['movie_name'],
    scaler=st.Scalers.scale_0_to_1,
    d3_url_struct=st.D3URLs(
        d3_scale_chromatic_url='scattertext/data/viz/scripts/d3-scale-chromatic.v1.min.js',
        d3_url='scattertext/data/viz/scripts/d3.min.js'
    )
)

file_name = 'movie_pair_plot_d2v.html'
# Write through a context manager so the handle is flushed and closed
# deterministically instead of being left to the garbage collector.
with open(file_name, 'wb') as out_file:
    out_file.write(html.encode('utf-8'))
print('./' + file_name)
Exemplo n.º 11
0
import scattertext as st

# Example: pair plot of the Rotten Tomatoes sample corpus, one category per
# movie, with default projection settings.

# Display names for the raw dataset labels; an unexpected label raises KeyError.
category_labels = {'rotten': 'Negative', 'fresh': 'Positive', 'plot': 'Plot'}

movie_df = st.SampleCorpora.RottenTomatoes.get_data()
movie_df.category = movie_df.category.apply(lambda x: category_labels[x])

corpus = st.CorpusFromPandas(
    movie_df,
    category_col='movie_name',
    text_col='text',
    nlp=st.whitespace_nlp_with_sentences
).build().get_unigram_corpus()

# Each document is labelled "<category>: <movie name>" in the plot metadata.
document_labels = movie_df['category'] + ': ' + movie_df['movie_name']
html = st.produce_pairplot(corpus, metadata=document_labels)

file_name = 'movie_pair_plot.html'
# Write through a context manager so the handle is flushed and closed
# deterministically instead of being left to the garbage collector.
with open(file_name, 'wb') as out_file:
    out_file.write(html.encode('utf-8'))
print('./' + file_name)
# Example: pair plot of the Rotten Tomatoes sample corpus with categories
# projected through TF-IDF normalisation followed by UMAP.
# TfidfTransformer is used below but never imported in the visible source.
from sklearn.feature_extraction.text import TfidfTransformer

# NOTE(review): `umap` is also referenced below without a visible import;
# this script needs `import umap` (umap-learn) in scope to run -- confirm
# against the original file header.

# Display names for the raw dataset labels; an unexpected label raises KeyError.
category_labels = {'rotten': 'Negative', 'fresh': 'Positive', 'plot': 'Plot'}

movie_df = st.SampleCorpora.RottenTomatoes.get_data()
movie_df.category = movie_df.category.apply(lambda x: category_labels[x])

corpus = st.CorpusFromPandas(
    movie_df,
    category_col='movie_name',
    text_col='text',
    nlp=st.whitespace_nlp_with_sentences
).build().get_stoplisted_unigram_corpus()

category_projection = st.CategoryProjector(
    selector=None,
    normalizer=TfidfTransformer(),
    projector=umap.UMAP(min_dist=0.5, metric='cosine')).project(corpus)

html = st.produce_pairplot(
    corpus,
    # category_projection=st.get_optimal_category_projection(corpus, verbose=True),
    category_projection=category_projection,
    metadata=movie_df['category'] + ': ' + movie_df['movie_name'],
    scaler=st.Scalers.scale_0_to_1,
    show_halo=False,
    d3_url_struct=st.D3URLs(
        d3_scale_chromatic_url='scattertext/data/viz/scripts/d3-scale-chromatic.v1.min.js',
        d3_url='scattertext/data/viz/scripts/d3.min.js'))

file_name = 'movie_pair_plot_umap.html'
# Write through a context manager so the handle is flushed and closed
# deterministically instead of being left to the garbage collector.
with open(file_name, 'wb') as out_file:
    out_file.write(html.encode('utf-8'))
print('./' + file_name)
Exemplo n.º 13
0
import scattertext as st

# Example: pair plot of the Rotten Tomatoes sample corpus with categories
# projected by PHATE.
# NOTE(review): `phate` is referenced below without a visible import; this
# script needs `import phate` in scope to run -- confirm against the original
# file header.

# Display names for the raw dataset labels; an unexpected label raises KeyError.
category_labels = {'rotten': 'Negative', 'fresh': 'Positive', 'plot': 'Plot'}

movie_df = st.SampleCorpora.RottenTomatoes.get_data()
movie_df.category = movie_df.category.apply(lambda x: category_labels[x])

corpus = st.CorpusFromPandas(
    movie_df,
    category_col='movie_name',
    text_col='text',
    nlp=st.whitespace_nlp_with_sentences
).build().get_stoplisted_unigram_corpus()

html = st.produce_pairplot(
    corpus,
    category_projector=st.CategoryProjector(projector=phate.PHATE()),
    metadata=movie_df['category'] + ': ' + movie_df['movie_name'],
    # Optional settings left disabled in this example:
    # scaler=st.Scalers.scale_0_to_1,
    # show_halo=False,
    # d3_url_struct=st.D3URLs(
    #     d3_scale_chromatic_url='scattertext/data/viz/scripts/d3-scale-chromatic.v1.min.js',
    #     d3_url='scattertext/data/viz/scripts/d3.min.js'
    # ),
    default_to_term_comparison=False
)

file_name = 'movie_pair_plot_phates.html'
# Write through a context manager so the handle is flushed and closed
# deterministically instead of being left to the garbage collector.
with open(file_name, 'wb') as out_file:
    out_file.write(html.encode('utf-8'))
print('./' + file_name)