Example No. 1
def main():
    shisei = _parse_geutenberg(
        'http://www.gutenberg.org/files/31617/31617-0.txt')
    horadanshaku = _parse_geutenberg(
        'http://www.gutenberg.org/files/34084/34084-0.txt')
    df = pd.DataFrame({
        'text': [shisei, horadanshaku],
        'title': ['Shisei', 'Horadanshaku tabimiyage'],
        'author': ['Akutagawa Ryunosuke', 'Kuni Sasaki']
    })

    df['text'] = df['text'].apply(st.japanese_nlp)
    corpus = st.CorpusFromParsedDocuments(df,
                                          category_col='title',
                                          parsed_col='text').build()
    html = st.produce_scattertext_explorer(
        corpus,
        category='Shisei',
        category_name='Shisei',
        not_category_name='Horadanshaku tabimiyage',
        minimum_term_frequency=5,
        width_in_pixels=1000,
        metadata=df['title'] + ' by ' + df['author'],
        asian_mode=True)
    open('./demo_japanese.html', 'w').write(html)
    print('Open ./demo_japanese.html in Chrome or Firefox.')
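The helper _parse_geutenberg is referenced above but not included in the snippet. As a rough sketch (an assumption, not the original implementation), it could simply download the file and strip the Project Gutenberg boilerplate:

from urllib.request import urlopen

def _parse_geutenberg(url):
    # Hypothetical helper: fetch the text and keep only the body between the
    # standard Project Gutenberg START/END markers (marker wording varies).
    raw = urlopen(url).read().decode('utf-8', errors='ignore')
    start = raw.find('*** START')
    end = raw.find('*** END')
    return raw[start:end] if start != -1 and end != -1 else raw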
Example No. 2

    def scattertext_function(self):

        ## START
        nlp = spacy.load('en_core_web_sm')
        convention_df = pd.read_csv(
            "After_Classification/After_Classification_NY_6.csv")
        convention_df['parsed'] = convention_df.tweet.apply(nlp)

        ##Index(['Unnamed: 0', 'Date', 'name', 'tweet', 'death', 'Classification'], dtype='object')
        # print("Document Count")
        # print(convention_df.groupby('Classification')['tweet'].count())
        # print("Word Count")
        # print(convention_df.groupby('Classification').apply(lambda x: x.tweet.apply(lambda x: len(x.split())).sum()))
        # print(type(convention_df))

        ##Convert Dataframe into Scattertext Corpus
        corpus = st.CorpusFromParsedDocuments(convention_df,
                                              category_col='Classification',
                                              parsed_col='parsed').build()
        # Diagnostics: inspect the scaler and the terms most characteristic of
        # this corpus relative to general English (results are not stored).
        print(type(st.Scalers.log_scale_standardize))
        print(list(corpus.get_scaled_f_scores_vs_background().index[:10]))
        html = st.produce_scattertext_explorer(
            corpus,
            category='pos',
            category_name='POS',
            not_category_name='NEG',
            minimum_term_frequency=5,
            width_in_pixels=1000,
            transform=st.Scalers.log_scale_standardize)

        file_name_1 = 'After_Classification_NY_6.html'
        open(file_name_1, 'wb').write(html.encode('utf-8'))
        print(IFrame(src=file_name_1, width=1200, height=700))
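        # Note: printing an IPython IFrame only prints its repr; in a notebook,
        # display(IFrame(...)) or returning it from the cell renders the HTML.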
Example No. 3
def create_scatter_text(writers, names, messages, nonames=False):
    my_df = pd.DataFrame({"author": names, "message": messages})
    nlp = st.tweet_tokenizier_factory(nltk.tokenize.TweetTokenizer())
    my_df['parse'] = my_df['message'].apply(nlp)

    corpus = st.CorpusFromParsedDocuments(
        my_df, category_col='author',
        parsed_col='parse').build().get_unigram_corpus().compact(
            st.AssociationCompactor(2000))

    if nonames:
        html = st.produce_scattertext_explorer(corpus,
                                               category=writers[0],
                                               category_name="Author_0",
                                               not_category_name="Author_1",
                                               minimum_term_frequency=0,
                                               pmi_threshold_coefficient=0,
                                               width_in_pixels=1000,
                                               transform=st.Scalers.dense_rank)
    else:
        html = st.produce_scattertext_explorer(corpus,
                                               category=writers[0],
                                               category_name=writers[0],
                                               not_category_name=writers[1],
                                               minimum_term_frequency=0,
                                               pmi_threshold_coefficient=0,
                                               width_in_pixels=1000,
                                               transform=st.Scalers.dense_rank)

    with open('./demo_compact.html', 'w') as f:
        f.write(html)
Example No. 4
    def create_corpus(self):

        # load cleaned df
        convention_df = self.clean_texts()

        # create parsed corpus
        # Word count per bias group, computed for inspection only (the result
        # is not stored).
        convention_df.groupby('bias').apply(
            lambda x: x.text.apply(lambda x: len(x.split())).sum())
        convention_df['parsed'] = convention_df.text.apply(nlp)
        corpus = st.CorpusFromParsedDocuments(convention_df,
                                              category_col='bias',
                                              parsed_col='parsed').build()

        # remove stop words
        stop_word_list = [
            'via getty', 'inbox', 'subscribe', '×', 'close ×', 'screen close',
            'full screen', 'buy second', 'second continue', 'story continued',
            'llc permission', '―', 'xe', '\\xe2\\x80\\x99', 'news',
            'for reprint', 'llc', 'post', 'click', 'to', '’ve',
            'unsupported on', 'share', 'that ’s', 'still', 'got', 'it', '37',
            'of his', 'this report', 'ofs', 'fox', 'photos', '’m', 'is the',
            's.', 'around', 'times', 'also', 'the', 'copyright',
            'washington times', 'mr', 'press', 'wait', 'associated',
            'unsubscribe', 'view', 'photo wait', 'http', '#',
            'associated press', 'more videos', 'get', 'just watched',
            'permission', 'however', 'b.', 'ms.', 'here©', 'device',
            'copyright ©', 'paste', '10', 'the associated', 'contributed to',
            'hide', 'and his', 'videos', 'said mr.', '_', '©', 'contributed',
            'embed', 'n’t', '/', 'something', 'i', 'that they', 'read',
            'for a', 'playback', 'must watch', 'washington post', 'just',
            'to get', 'r', 'read more', 'toggle', 'more', 'i ’m', 'follow',
            'is', 'https', ' ', 'said', 'mr.', 'unsupported', 'or blog',
            'your device', 'for', 'cnn', 'of 76', 'that', 'ms', 'andhis',
            'click here', 'or share', 'replay', 'press contributed', 'they',
            'must', 'prof', 'www', 'it ’s', 'told', '’re', 'the washington',
            '1', "'s rise", '© 2018', 'to this', 'skip', 'around the', 'blog',
            'cut', 'told fox', 'mrs.', 'hide caption', 'ad', 'watched',
            '/ the', 'replay more', 'and the', '’s', '2018', 'copy', '&',
            'read or', 'reprint permission', 'are', 'told cnn', 'watch',
            'here for', 'also said', 'copy this', 'reprint', 'report',
            'advertisement', 'mrs', 'caption', 'autoplay', 'fox news', 'dr',
            'enlarge', 'times llc', '76', 'photo', 'this'
        ]
        stop_word_list = list(set(stop_word_list))

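        # Keep only the stop words that actually occur in the corpus;
        # remove_terms raises an error for unknown terms unless
        # ignore_absences=True is passed.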
        update_stop = []
        for term in stop_word_list:
            if term in corpus._term_idx_store:
                update_stop.append(term)
        corpus = corpus.remove_terms(update_stop)

        return corpus
Example No. 5
    def get_scattertext_corpus(df,
                               dep_data_col,
                               group1_name,
                               group2_name,
                               lang="en"):

        cut_off = 3  # just an example! Ultimately the user must specify how the groups should be determined (unless already-labelled data is uploaded)
        df.loc[df[dep_data_col] > cut_off, 'label'] = group1_name
        df.loc[df[dep_data_col] < cut_off, 'label'] = group2_name
        df.dropna(inplace=True, axis=0)

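        # Note: despite the column name, this keeps the surface token text;
        # tok.lemma_ would be needed to obtain true lemmas.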
        df["lemmas"] = df["nlp"].apply(lambda doc: [
            tok.text for tok in doc
            if not tok.is_punct and not tok.is_stop and len(tok.text) > 1
        ])
        df["lemmas"] = df["lemmas"].apply(lambda text: nlp(" ".join(text)))
        st_corpus = scatter_text.CorpusFromParsedDocuments(
            df, category_col='label',
            parsed_col='lemmas').build().remove_terms(stopwords,
                                                      ignore_absences=True)
        return st_corpus
Example No. 6
    def empath(self, dataframe):
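        # FeatsFromOnlyEmpath scores each document against Empath's topic
        # lexicons and requires the separate empath package to be installed.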
        feat_builder = st.FeatsFromOnlyEmpath()
        empath_corpus = st.CorpusFromParsedDocuments(
            dataframe,
            category_col='Document Type',
            feats_from_spacy_doc=feat_builder,
            parsed_col='Text').build()

        html = st.produce_scattertext_explorer(
            empath_corpus,
            category='submission',
            category_name='Submission',
            not_category_name='Standard',
            width_in_pixels=1000,
            metadata=dataframe['Document'],
            use_non_text_features=True,
            use_full_doc=True,
            topic_model_term_lists=feat_builder.get_top_model_term_lists())

        logger.getLogger().info("Opening Empath Visual")
        open(self.empath_file, 'wb').write(html.encode('utf-8'))
        webbrowser.open("file://" + self.empath_file)
Example No. 7
            zf.open('dashboard_x_usa_x_filter_nativeretweets.xlsx'))
    df['first_name'] = df['User Name'].apply(lambda x: x.split()[0].lower(
    ) if type(x) == str and len(x.split()) > 0 else x)
    male_prob = agefromname.AgeFromName().get_all_name_male_prob()
    df_aug = pd.merge(df, male_prob, left_on='first_name', right_index=True)
    df_aug['gender'] = df_aug['prob'].apply(lambda x: 'm' if x > 0.9 else 'f'
                                            if x < 0.1 else '?')
    df_mf = df_aug[df_aug['gender'].isin(['m', 'f'])]
    df_mf.to_csv('emoji_data.csv', index=False)

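# Tokenize tweets with NLTK's TweetTokenizer so emoji and @-handles survive as
# single tokens.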
nlp = st.tweet_tokenzier_factory(nltk.tokenize.TweetTokenizer())
df_mf['parse'] = df_mf['Tweet content'].apply(nlp)

corpus = st.CorpusFromParsedDocuments(
    df_mf,
    parsed_col='parse',
    category_col='gender',
    feats_from_spacy_doc=st.FeatsFromSpacyDocOnlyEmoji()).build()

html = st.produce_scattertext_explorer(
    corpus,
    category='f',
    category_name='Female',
    not_category_name='Male',
    use_full_doc=True,
    term_ranker=OncePerDocFrequencyRanker,
    sort_by_dist=False,
    metadata=(df_mf['User Name'] + ' (@' + df_mf['Nickname'] + ') ' +
              df_mf['Date'].astype(str)),
    width_in_pixels=1000)
Example No. 8
import scattertext as st
import spacy
nlp = spacy.load('en_core_web_sm')

df = st.SampleCorpora.ConventionData2012.get_data().assign(
    parse=lambda df: df.text.apply(nlp))

corpus = st.CorpusFromParsedDocuments(
    df,
    category_col='party',
    parsed_col='parse',
    feats_from_spacy_doc=st.FeatsFromSpacyDoc(
        use_lemmas=True)).build().get_unigram_corpus().compact(
            st.AssociationCompactor(2000))

html = st.produce_scattertext_explorer(corpus,
                                       category='democrat',
                                       category_name='Democratic',
                                       not_category_name='Republican',
                                       minimum_term_frequency=0,
                                       pmi_threshold_coefficient=0,
                                       width_in_pixels=1000,
                                       metadata=corpus.get_df()['speaker'],
                                       transform=st.Scalers.dense_rank,
                                       max_overlapping=3)
open('./demo_lemmas.html', 'w').write(html)
print('open ./demo_lemmas.html in Chrome')
Example No. 9
nlp = spacy.load('en')
convention_df = st.SampleCorpora.ConventionData2012.get_data().assign(
    parse=lambda df: df.text.apply(nlp),
    party=lambda df: df.party.apply(
        {'democrat': 'Democratic',
         'republican': 'Republican'}.get
    )
)
corpus = st.CorpusFromParsedDocuments(
    convention_df,
    category_col='party',
    parsed_col='parse',
    feats_from_spacy_doc=st.PyTextRankPhrases()
).build(
).compact(
    st.AssociationCompactor(2000, use_non_text_features=True)
)
class Stats_Graph_Manager:
    def __init__(self):
        pass
Example No. 10
import time

import pandas as pd
import spacy

import scattertext as st

nlp = spacy.load('en', parser=False)
t0 = time.time()
print('reading dataset')
reviews_df = pd.read_csv(
    'https://github.com/JasonKessler/ICLR18ReviewVis/raw/master/iclr2018_reviews.csv.bz2'
)
print('parsing', time.time() - t0, 's')
reviews_df['parse'] = reviews_df['review'].apply(
    st.whitespace_nlp_with_sentences)
print('building full corpus', time.time() - t0, 's')
full_corpus = (
    st.CorpusFromParsedDocuments(
        reviews_df,
        category_col='category',
        parsed_col='parse',
        #feats_from_spacy_doc=st.PhraseMachinePhrases()
    ).build())

term_ranker = st.OncePerDocFrequencyRanker
corpus = (full_corpus.keep_only_these_categories([
    'Accept, Positive', 'Accept, Negative', 'Reject, Positive',
    'Reject, Negative'
], False).get_unigram_corpus().compact(
    st.ClassPercentageCompactor(term_count=5)))

print('finding priors', time.time() - t0, 's')
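# Smooth term scores with priors estimated from all categories of the full,
# uncompacted corpus.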
priors = (st.PriorFactory(
    full_corpus, starting_count=0.01).use_all_categories().get_priors())
print('building four square', time.time() - t0, 's')
Example No. 11
      spacy.explain("prep"))  # to understand tags
noun_chunks_df = pd.DataFrame()

for i, chunk in enumerate(parsed_review.noun_chunks):
    noun_chunks_df.loc[i, 'text'] = chunk.text
    noun_chunks_df.loc[i, 'root'] = chunk.root
    noun_chunks_df.loc[i, 'root.text'] = chunk.root.text
    noun_chunks_df.loc[i, 'root.dep_'] = chunk.root.dep_
    noun_chunks_df.loc[i, 'root.head.text'] = chunk.root.head.text

print(noun_chunks_df[:20])

nlp = spacy.load('en_core_web_sm', disable=["tagger", "ner"])
train_df['parsed'] = train_df.Text[49500:50500].apply(nlp)
corpus = st.CorpusFromParsedDocuments(train_df[49500:50500],
                                      category_col='Score',
                                      parsed_col='parsed').build()

from sense2vec.vectors import VectorMap

s2v = Sense2VecComponent('data/reddit_vectors-1.1.0/reddit_vectors-1.1.0')
spacy_tok.add_pipe(s2v)
doc = spacy_tok(u"dessert.")
freq = doc[0]._.s2v_freq
vector = doc[0]._.s2v_vec
most_similar = doc[0]._.s2v_most_similar(5)
print(most_similar, freq)

doc = spacy_tok(u"burger")
most_similar = doc[0]._.s2v_most_similar(4)
print(most_similar)
Example No. 12
import tempfile

import sentencepiece as spm


def train_sentence_piece_tokenizer(documents, vocab_size):
    with tempfile.NamedTemporaryFile(delete=True) as tempf:
        with tempfile.NamedTemporaryFile(delete=True) as tempm:
            tempf.write(('\n'.join(documents)).encode())
            mod = spm.SentencePieceTrainer.Train(
                '--input=%s --model_prefix=%s --vocab_size=%s' %
                (tempf.name, tempm.name, vocab_size))
            sp = spm.SentencePieceProcessor()
            sp.load(tempm.name + '.model')
    return sp


sp = train_sentence_piece_tokenizer(convention_df.text.values, 2000)

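# The learned SentencePiece subword units are attached as non-text features,
# which is why the explorer below is called with use_non_text_features=True.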
corpus = st.CorpusFromParsedDocuments(
    convention_df,
    parsed_col='parse',
    category_col='party',
    feats_from_spacy_doc=st.FeatsFromSentencePiece(sp)).build()

html = st.produce_scattertext_explorer(
    corpus,
    category='democrat',
    category_name='Democratic',
    not_category_name='Republican',
    sort_by_dist=False,
    metadata=convention_df['party'] + ': ' + convention_df['speaker'],
    term_scorer=st.RankDifference(),
    transform=st.Scalers.dense_rank,
    use_non_text_features=True,
    use_full_doc=True,
)
Example No. 13

import scattertext as st
from scattertext import RankDifference

convention_df = st.SampleCorpora.ConventionData2012.get_data()
convention_df['parse'] = convention_df['text'].apply(
    st.whitespace_nlp_with_sentences)

unigram_corpus = (st.CorpusFromParsedDocuments(
    convention_df, category_col='party',
    parsed_col='parse').build().get_stoplisted_unigram_corpus())

topic_model = (
    st.SentencesForTopicModeling(unigram_corpus).get_topics_from_terms(
        [
            'obama', 'romney', 'democrats', 'republicans', 'health',
            'military', 'taxes', 'education', 'olympics', 'auto', 'iraq',
            'iran', 'israel'
        ],
        scorer=RankDifference(),
        num_terms_per_topic=20))

topic_feature_builder = st.FeatsFromTopicModel(topic_model)

topic_corpus = st.CorpusFromParsedDocuments(
    convention_df,
    category_col='party',
    parsed_col='parse',
    feats_from_spacy_doc=topic_feature_builder).build()

html = st.produce_scattertext_explorer(
    topic_corpus,
Example No. 14
df_1 = df.groupby( [ "author"] ).size().reset_index(name='Counts')
df_1 = df_1.sort_values(by=['Counts'], ascending=False)
df_1 = df_1.head(2)


df_2 = df_1.merge(df, on='author', how='inner')

df_2 = df_2.sort_values(by=['Counts'], ascending=False)

build_corpus = st.CorpusFromPandas(df_2, category_col='author', text_col='clean_article_text', nlp=nlp).build()
df_freq = build_corpus.get_term_freq_df()
df_freq['GLOBE EDITORIAL SCORE'] = build_corpus.get_scaled_f_scores('GLOBE EDITORIAL')
df_freq['Jeffrey Simpson Score'] = build_corpus.get_scaled_f_scores('Jeffrey Simpson')

html = st.produce_scattertext_explorer(build_corpus,
          category='GLOBE EDITORIAL',
          category_name='GLOBE EDITORIAL',
          not_category_name='Jeffrey Simpson',
          width_in_pixels=1000,
          metadata=df_2['author'])


open("../output/Top_2_Authors.html", 'wb').write(html.encode('utf-8'))

#visualizing Empath topics and categories instead of terms

build_feats = st.FeatsFromOnlyEmpath()
build_corpus_2 = st.CorpusFromParsedDocuments(df_2,
                                              category_col='author',
                                              feats_from_spacy_doc=build_feats,
                                              parsed_col='clean_article_text').build()
html = st.produce_scattertext_explorer(build_corpus_2,
                                       category='GLOBE EDITORIAL',
                                       category_name='GLOBE EDITORIAL',
                                       not_category_name='Jeffrey Simpson',
                                       width_in_pixels=1000,
                                       metadata=df_2['author'],
                                       use_non_text_features=True,
                                       use_full_doc=True,
                                       topic_model_term_lists=build_feats.get_top_model_term_lists())

open("../output/Top_2_Authors-Empath.html", 'wb').write(html.encode('utf-8'))
Example No. 15
import scattertext as st
import gensim

movie_df = st.SampleCorpora.RottenTomatoes.get_data()
movie_df.category = movie_df.category \
    .apply(lambda x: {'rotten': 'Negative', 'fresh': 'Positive', 'plot': 'Plot'}[x])
movie_df['parse'] = movie_df.text.apply(st.whitespace_nlp_with_sentences)

corpus = st.CorpusFromParsedDocuments(
    movie_df, category_col='movie_name',
    parsed_col='parse').build().get_stoplisted_unigram_corpus()

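# Doc2VecCategoryProjector trains a gensim Doc2Vec model and projects each
# category (here, each movie) onto the two dimensions used by the pair plot.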
category_projection = st.Doc2VecCategoryProjector().project(corpus)

html = st.produce_pairplot(
    corpus,
    category_projection=category_projection,
    metadata=movie_df['category'] + ': ' + movie_df['movie_name'],
    scaler=st.Scalers.scale_0_to_1,
    d3_url_struct=st.D3URLs(
        d3_scale_chromatic_url=
        'scattertext/data/viz/scripts/d3-scale-chromatic.v1.min.js',
        d3_url='scattertext/data/viz/scripts/d3.min.js'))

file_name = 'movie_pair_plot_d2v.html'
open(file_name, 'wb').write(html.encode('utf-8'))
print('./' + file_name)
Example No. 16
import scattertext as st
import spacy

nlp = spacy.load('en_core_web_sm')

df = st.SampleCorpora.ConventionData2012.get_data().assign(
    parse=lambda df: list(nlp.pipe(df.text)))

corpus = st.CorpusFromParsedDocuments(
    df,
    category_col='party',
    parsed_col='parse',
    feats_from_spacy_doc=st.SpacyEntities(
        entity_types_to_use=['NAME', 'LOC'])).build()

html = st.produce_scattertext_explorer(corpus,
                                       category='democrat',
                                       category_name='Democratic',
                                       not_category_name='Republican',
                                       minimum_term_frequency=0,
                                       pmi_threshold_coefficient=0,
                                       width_in_pixels=1000,
                                       metadata=corpus.get_df()['speaker'],
                                       transform=st.Scalers.dense_rank,
                                       max_overlapping=10,
                                       max_docs_per_category=0)
open('./demo_names2.html', 'w').write(html)
print('open ./demo_names2.html in Chrome')
Example No. 17
def Plot_Clusters_Kmeans(outputfile_name, nbr_clusters, path):
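    # For each k-means cluster, write several Scattertext HTML views: the
    # cluster vs. all other clusters, a gensim word-similarity view centred on
    # the cluster's most frequent term, and positive vs. negative verbatims
    # (plain and word-similarity variants).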
    list_stop_words = [stemmer.stem(stopWords[i]) for i in range(len(stopWords))]

    text2kw_clusters = pd.read_csv(outputfile_name, error_bad_lines=False, encoding='utf-8')
    text2kw_clusters['Cluster'] = text2kw_clusters['Cluster'].astype(str)
    corpus = (st.CorpusFromPandas(text2kw_clusters, category_col='Cluster', text_col='Text', nlp=nlp).build(). \
              remove_terms(ENGLISH_STOP_WORDS, ignore_absences=True).
              get_unigram_corpus().compact(st.ClassPercentageCompactor(term_count=2,
                                                                       term_ranker=st.OncePerDocFrequencyRanker)))

    for i in range(nbr_clusters):
        directory = path+r"\\"+str(i)+r"\\"
        try:
            os.mkdir(directory)
        except FileExistsError:
            pass

        html = st.produce_scattertext_explorer(corpus, category=str(i), category_name=str(i)+" Category",
                                               not_category_name='Other Categories',
                                               metadata=text2kw_clusters['Date'],
                                               minimum_term_frequency=50)
        filename = directory+str(i)+"_Category-VS-other categories.html"
        open(filename, 'wb+').write(html.encode('utf-8'))

    text2kw_clusters = pd.read_csv(outputfile_name, error_bad_lines=False, encoding='utf-8')
    text2kw_clusters['Cluster'] = text2kw_clusters['Cluster'].astype(str)
    text2kw_clusters['Text'] = text2kw_clusters['Text'].apply(nlp)
    corpus = (st.CorpusFromParsedDocuments(text2kw_clusters, category_col='Cluster', parsed_col='Text')).build(). \
        remove_terms(ENGLISH_STOP_WORDS, ignore_absences=True)
    for i in range(nbr_clusters):
        directory = path+r"\\"+str(i)+r"\\"

        m = text2kw_clusters[text2kw_clusters["Cluster"] == str(i)]
        liste = [word_tokenize(str(x)) for x in m["processedReviews"] if not stemmer.stem(str(x)) in list_stop_words]
        words = []
        for j in range(len(liste)):
            for k in range(len(liste[j])):
                if not (liste[j][k] in list_stop_words):
                    try:
                        words.append(liste[j][k])
                    except:
                        pass

        counter = collections.Counter(words)
        c = counter.most_common()
        html = word_similarity_explorer_gensim(corpus, category=str(i), category_name=str(i)+" Category",
                                               not_category_name='Other Categories',
                                               minimum_term_frequency=int(text2kw_clusters.shape[0]*0.005),
                                               target_term=stemmer.stem(c[0][0]),
                                               # pmi_threshold_coefficient=4,
                                               width_in_pixels=1000,
                                               metadata=text2kw_clusters['Date'],
                                               word2vec=model,
                                               max_p_val=0.05,
                                               save_svg_button=True)
        filename = directory+str(i)+"_w2v_Category-VS-other categories.html"
        open(filename, 'wb+').write(html.encode('utf-8'))

    for i in range(nbr_clusters):
        directory = path+r"\\"+str(i)+r"\\"

        text2kw_clusters = pd.read_csv(outputfile_name, error_bad_lines=False, encoding='utf-8')
        text2kw_clusters['Cluster'] = text2kw_clusters['Cluster'].astype(str)
        text2kw_clusters['Sentiments'] = text2kw_clusters['Sentiments'].astype(str)
        text2kw_clusters['Date'] = text2kw_clusters['Date'].astype(str)
        text2kw_clusters = text2kw_clusters.loc[text2kw_clusters["Cluster"] == str(i)]
        text2kw_clusters = text2kw_clusters.loc[text2kw_clusters["Sentiments"] != "neutral"]
        corpus = (st.CorpusFromPandas(text2kw_clusters, category_col='Sentiments', text_col='Text', nlp=nlp).build().
                  remove_terms(ENGLISH_STOP_WORDS, ignore_absences=True))
        html = st.produce_scattertext_explorer(corpus, category="positive", category_name="Positive Verbatims",
                                               not_category_name='Negative Verbatims',
                                               metadata=text2kw_clusters['Date'],
                                               minimum_term_frequency=int(text2kw_clusters.shape[0]*0.005))
        filename = directory+str(i)+"_Positive_Category-VS-Negative_Category.html"
        open(filename, 'wb+').write(html.encode('utf-8'))

    for i in range(nbr_clusters):
        directory = path+r"\\"+str(i)+r"\\"

        text2kw_clusters = pd.read_csv(outputfile_name, error_bad_lines=False, encoding='utf-8')
        text2kw_clusters['Cluster'] = text2kw_clusters['Cluster'].astype(str)
        text2kw_clusters['Sentiments'] = text2kw_clusters['Sentiments'].astype(str)
        text2kw_clusters = text2kw_clusters.loc[text2kw_clusters["Cluster"] == str(i)]
        text2kw_clusters = text2kw_clusters.loc[text2kw_clusters["Sentiments"] != "neutral"]
        text2kw_clusters['Text'] = text2kw_clusters['Text'].apply(nlp)
        liste = [word_tokenize(str(x)) for x in text2kw_clusters["processedReviews"] if
                 not stemmer.stem(str(x)) in list_stop_words]
        words = []
        for j in range(len(liste)):
            for k in range(len(liste[j])):
                if not (liste[j][k] in list_stop_words):
                    try:
                        words.append(liste[j][k])
                    except:
                        pass
        counter = collections.Counter(words)
        c = counter.most_common()

        corpus = (st.CorpusFromParsedDocuments(text2kw_clusters, category_col='Sentiments', parsed_col='Text')).build(). \
            remove_terms(ENGLISH_STOP_WORDS, ignore_absences=True)
        html = word_similarity_explorer_gensim(corpus, category="positive", category_name="Positive Verbatims",
                                               not_category_name='Negative Verbatims',
                                               minimum_term_frequency=int(text2kw_clusters.shape[0]*0.005),
                                               target_term=stemmer.stem(c[0][0]),
                                               # pmi_threshold_coefficient=4,
                                               width_in_pixels=1000,
                                               metadata=text2kw_clusters['Date'],
                                               word2vec=model,
                                               max_p_val=0.05,
                                               save_svg_button=True)
        filename = directory+str(i)+"_w2v__Positive_Category-VS-Negative_Category.html"
        open(filename, 'wb+').write(html.encode('utf-8'))
Example No. 18

data = [
	{'text': "I don't think you'll want to.", 'category': 'a'},
	{'text': "You'll have a didn't a-b #dfs .", 'category': 'a'},
	{'text': "You'll shoudn't #have a, didn't a-b #dfs .", 'category': 'a'},
	{'text': "Can't not get along to didn't.", 'category': 'b'},
	{'text': "Can't try aba-ba alo33ng to didn't.", 'category': 'b'},
	{'text': "Can't no't g'e't al33ong 3to5.", 'category': 'b'},
	{'text': "You haven't changed a b'it.", 'category': 'c'},
	{'text': "You haven't changed a b'it.", 'category': 'c'},
	{'text': "You haven't ch5ng3d a bit.", 'category': 'c'}
]

df = pd.DataFrame(data)
df['parse'] = df.text.apply(lambda x: st.whitespace_nlp_with_sentences(x, tok_splitter_re=re.compile('( )')))
corpus = st.CorpusFromParsedDocuments(df, parsed_col='parse', category_col='category').build().get_unigram_corpus()

semiotic_square = st.SemioticSquare(
	corpus,
	category_a='a',
	category_b='b',
	neutral_categories=['c'],
	scorer=st.RankDifference(),
	labels={'not_a_and_not_b': 'Plot Descriptions',
	        'a_and_b': 'Reviews',
	        'a_and_not_b': 'Positive',
	        'b_and_not_a': 'Negative',
	        'a':'',
	        'b':'',
	        'not_a':'',
	        'not_b': ''})
Example No. 19

        'income'
    ],
    'jobs':
    ['jobs', 'workers', 'labor', 'employment', 'worker', 'employee', 'job'],
    'patriotic':
    ['america', 'country', 'flag', 'americans', 'patriotism', 'patriotic'],
    'family': [
        'mother', 'father', 'mom', 'dad', 'sister', 'brother', 'grandfather',
        'grandmother', 'son', 'daughter'
    ]
}
topic_feature_builder = st.FeatsFromTopicModel(topic_model)

topic_corpus = st.CorpusFromParsedDocuments(
    convention_df,
    category_col='party',
    parsed_col='parse',
    feats_from_spacy_doc=topic_feature_builder).build()

html = st.produce_scattertext_explorer(
    topic_corpus,
    category='democrat',
    category_name='Democratic',
    not_category_name='Republican',
    width_in_pixels=1000,
    metadata=convention_df['speaker'],
    use_non_text_features=True,
    use_full_doc=True,
    pmi_threshold_coefficient=0,
    topic_model_term_lists=topic_feature_builder.get_top_model_term_lists())
Example No. 20
import scattertext as st
import pandas as pd
from sklearn.feature_extraction.text import TfidfTransformer
from scipy.sparse.linalg import svds

convention_df = st.SampleCorpora.ConventionData2012.get_data()
convention_df['parse'] = convention_df['text'].apply(
    st.whitespace_nlp_with_sentences)
corpus = (st.CorpusFromParsedDocuments(convention_df,
                                       category_col='party',
                                       parsed_col='parse').build().
          get_stoplisted_unigram_corpus().remove_infrequent_words(
              minimum_term_count=3, term_ranker=st.OncePerDocFrequencyRanker))
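# TF-IDF weight the counts and transpose so rows correspond to terms; a rank-3
# truncated SVD then yields the low-dimensional term embeddings plotted below.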
embeddings = TfidfTransformer().fit_transform(corpus.get_term_doc_mat()).T
U, S, VT = svds(embeddings, k=3, maxiter=20000, which='LM')

x_dim = 0
y_dim = 1
projection = pd.DataFrame({
    'term': corpus.get_terms(),
    'x': U.T[x_dim],
    'y': U.T[y_dim]
}).set_index('term')

html = st.produce_pca_explorer(
    corpus,
    category='democrat',
    category_name='Democratic',
    not_category_name='Republican',
    projection=projection,
    metadata=convention_df['speaker'],
Example No. 21
# ================================================================================
all_satisfaction_score_comment_in_all_conds = utils_data.get_all_satisfaction_score_comment_in_all_conds(
)

# ================================================================================
columns = ['senti_on_Metfor_oral', 'feature', 'review']
all_satisfaction_score_comment_in_all_conds_df = pd.DataFrame(
    all_satisfaction_score_comment_in_all_conds, index=None, columns=columns)

all_satisfaction_score_comment_in_all_conds_df[
    'parse'] = all_satisfaction_score_comment_in_all_conds_df['review'].apply(
        st.whitespace_nlp_with_sentences)

# ================================================================================
corpus = (st.CorpusFromParsedDocuments(
    all_satisfaction_score_comment_in_all_conds_df,
    category_col='senti_on_Metfor_oral',
    parsed_col='parse').build().get_stoplisted_unigram_corpus())

# ================================================================================
html = st.produce_projection_explorer(
    corpus,
    category='negative',
    category_name='Negative',
    not_category_name='Positive',
    metadata=all_satisfaction_score_comment_in_all_conds_df.feature,
    width_in_pixels=1000)

# ================================================================================
file_name = '/mnt/1T-5e7/mycodehtml/Data_mining/Visualization/Scattertext/demo_tsne_style.html'
open(file_name, 'wb').write(html.encode('utf-8'))
print('Open', file_name, 'in chrome')
Example No. 22

from sklearn.decomposition import TruncatedSVD

import scattertext as st
from scattertext import ClassPercentageCompactor, CSRMatrixFactory
from scattertext.representations.CorpusSentenceIterator import CorpusSentenceIterator

convention_df = st.SampleCorpora.ConventionData2012.get_data()
convention_df['parse'] = convention_df['text'].apply(
    st.whitespace_nlp_with_sentences)

corpus = (st.CorpusFromParsedDocuments(
    convention_df, category_col='party',
    parsed_col='parse').build().get_stoplisted_unigram_corpus().select(
        ClassPercentageCompactor(term_count=3)))

html = st.produce_projection_explorer(corpus,
                                      embeddings=corpus.get_term_doc_mat(),
                                      projection_model=TruncatedSVD(
                                          n_components=30, n_iter=10),
                                      x_dim=0,
                                      y_dim=1,
                                      category='democrat',
                                      category_name='Democratic',
                                      not_category_name='Republican',
                                      metadata=convention_df.speaker,
                                      width_in_pixels=1000)
file_name = 'demo_bow_pca.html'
open(file_name, 'wb').write(html.encode('utf-8'))
print('Open', file_name, 'in chrome')
Example No. 23
################################################Scatterplot2###################################################################

import pandas as pd
from sklearn.feature_extraction.text import TfidfTransformer
import scattertext as st
from scipy.sparse.linalg import svds

 
Data_join['parse'] = Data_join['content'].apply(st.whitespace_nlp_with_sentences)

 
#Corpus for scatterplot2
corpus = (st.CorpusFromParsedDocuments(Data_join,
                                       category_col='review',
                                       parsed_col='parse')
              .build()
              .get_stoplisted_unigram_corpus()) 

 

corpus = corpus.add_doc_names_as_metadata(corpus.get_df()['author'])
corpus.get_df()['content']
len(corpus.get_metadata())

print(corpus.get_term_doc_mat()) 

# Build embeddings: TF-IDF weight the term-document matrix, then factor it
# with a truncated SVD.
embeddings = TfidfTransformer().fit_transform(corpus.get_term_doc_mat())
u, s, vt = svds(embeddings, k=167, maxiter=20000, which='LM')
projection = pd.DataFrame({'term': corpus.get_metadata(), 'x': u.T[0], 'y': u.T[1]}).set_index('term')
Example No. 24

    df = pd.read_csv('./complete_data.tsv', sep='\t')
    data_nix_ken = pd.read_csv('./stats/data_nix_ken.tsv', sep='\t')

    if not os.path.exists('plots'):
        os.makedirs('plots')

    # Scattertext attack vs support (only responses)
    # https://kanoki.org/2019/03/17/text-data-visualization-in-python/
    # https://github.com/JasonKessler/scattertext
    nlp = spacy.load('en_core_web_sm')
    for data_set in ['debate_test', 'debate_train', 'procon', 'political']:
        df_plot = df.loc[(df['org_dataset'] == data_set)
                         & (df['label'].isin(['attack', 'support']))]
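        # df_plot is a slice of df; adding df_plot = df_plot.copy() here would
        # avoid pandas' SettingWithCopyWarning when the 'parsed' column is set.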
        df_plot['parsed'] = df_plot['response'].apply(nlp)
        corpus = st.CorpusFromParsedDocuments(df_plot,
                                              category_col='label',
                                              parsed_col='parsed').build()
        html = st.produce_scattertext_explorer(
            corpus,
            category='attack',
            not_category_name='support',
            width_in_pixels=1000,
            minimum_term_frequency=5,
            transform=st.Scalers.log_scale_standardize,
            use_full_doc=True)
        file_name = './plots/scattertext_attack_support' + data_set + '.html'
        with open(file_name, 'wb') as file:
            file.write(html.encode('utf-8'))

    # Scattertext Nixon vs Kennedy
    df_plot = data_nix_ken
Example No. 25
import scattertext as st
import spacy

nlp = spacy.blank('en')  # spacy.blank expects a language code, not a model name
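# Drop tokenizer exception rules that contain apostrophes so contractions such
# as "don't" stay single tokens instead of being split.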
nlp.tokenizer.rules = {key: value for key, value in nlp.tokenizer.rules.items()
                       if "'" not in key and "’" not in key and "‘" not in key}
nlp.add_pipe(nlp.create_pipe('sentencizer'))

df = st.SampleCorpora.ConventionData2012.get_data().assign(
    parse=lambda df: df.text.apply(nlp)
)

corpus = st.CorpusFromParsedDocuments(
    df, category_col='party', parsed_col='parse'
).build().compact(st.ClassPercentageCompactor(term_count=10))

html = st.produce_scattertext_explorer(
    corpus,
    category='democrat',
    category_name='Democratic',
    not_category_name='Republican',
    minimum_term_frequency=0, pmi_threshold_coefficient=0,
    width_in_pixels=1000, metadata=corpus.get_df()['speaker'],
    transform=st.Scalers.dense_rank,
    show_diagonal=False,
    max_overlapping=3
)
open('./demo_with_apostrophes.html', 'w').write(html)
print('open ./demo_with_apostrophes.html in Chrome')
Example No. 26
import scattertext as st

df = st.SampleCorpora.ConventionData2012.get_data().assign(
    parse=lambda df: df.text.apply(st.whitespace_nlp_with_sentences))

corpus = st.CorpusFromParsedDocuments(
    df, category_col='party',
    parsed_col='parse').build().get_unigram_corpus().compact(
        st.AssociationCompactor(2000))

html = st.produce_scattertext_explorer(corpus,
                                       category='democrat',
                                       category_name='Democratic',
                                       not_category_name='Republican',
                                       minimum_term_frequency=0,
                                       pmi_threshold_coefficient=0,
                                       width_in_pixels=1000,
                                       metadata=corpus.get_df()['speaker'],
                                       transform=st.Scalers.dense_rank,
                                       show_diagonal=False,
                                       max_overlapping=3,
                                       vertical_lines=0.5)
open('./demo_vertical_lines.html', 'w').write(html)
print('open ./demo_vertical_lines.html in Chrome')
Example No. 27

import scattertext as st

convention_df = st.SampleCorpora.ConventionData2012.get_data()
convention_df['parse'] = convention_df['text'].apply(
    st.whitespace_nlp_with_sentences)

corpus = (st.CorpusFromParsedDocuments(
    convention_df, category_col='party',
    parsed_col='parse').build().get_stoplisted_unigram_corpus())

html = st.produce_projection_explorer(
    corpus,
    category='democrat',
    category_name='Democratic',
    not_category_name='Republican',
    metadata=convention_df.speaker,
    color_func=
    '''(function(d) {return d.s > 0.5 ? d3.interpolateRdYlBu(0.6) : d3.interpolateRdYlBu(0.4) })''',
    center_label_over_points=True,
    censor_points=True,
    width_in_pixels=1000)
file_name = 'demo_tsne_style_for_publication.html'
open(file_name, 'wb').write(html.encode('utf-8'))
print('Open', file_name, 'in chrome')