def get_test_semiotic_square():
    df = pd.DataFrame(data=np.array(get_docs_categories_semiotic()).T,
                      columns=['category', 'text'])
    corpus = CorpusFromPandas(df, 'category', 'text',
                              nlp=whitespace_nlp).build()
    semsq = SemioticSquare(corpus, 'hamlet', 'jay-z/r. kelly', ['swift'])
    return semsq
 def setUp(cls):
     categories, documents = get_docs_categories()
     cls.df = pd.DataFrame({'category': categories, 'text': documents})
     cls.corpus = CorpusFromPandas(cls.df,
                                   'category',
                                   'text',
                                   nlp=whitespace_nlp).build()
Example #3
	def test_keep_only_these_categories(self):
		df = pd.DataFrame(data=np.array(get_docs_categories_semiotic()).T,
		                  columns=['category', 'text'])
		corpus = CorpusFromPandas(df, 'category', 'text', nlp=whitespace_nlp).build()
		hamlet_swift_corpus = corpus.keep_only_these_categories(['hamlet', 'swift'])
		self.assertEqual(hamlet_swift_corpus.get_categories(), ['hamlet', 'swift'])
		self.assertGreater(len(corpus.get_terms()), len(hamlet_swift_corpus.get_terms()))
		with self.assertRaises(AssertionError):
			corpus.keep_only_these_categories(['hamlet', 'swift', 'asdjklasfd'])
		corpus.keep_only_these_categories(['hamlet', 'swift', 'asdjklasfd'], True)
Example #5
 def test_constructor(self):
     df = pd.DataFrame(data=np.array(get_docs_categories_semiotic()).T,
                       columns=['category', 'text'])
     corpus = CorpusFromPandas(df, 'category', 'text',
                               nlp=whitespace_nlp).build()
     SemioticSquare(corpus, 'hamlet', 'jay-z/r. kelly', ['swift'])
     with self.assertRaises(AssertionError):
         SemioticSquare(corpus, 'XXXhamlet', 'jay-z/r. kelly', ['swift'])
     with self.assertRaises(AssertionError):
         SemioticSquare(corpus, 'hamlet', 'jay-z/r. kellyXXX', ['swift'])
     with self.assertRaises(AssertionError):
         SemioticSquare(corpus, 'hamlet', 'jay-z/r. kelly',
                        ['swift', 'asd'])
     with self.assertRaises(EmptyNeutralCategoriesError):
         SemioticSquare(corpus, 'hamlet', 'jay-z/r. kelly', [])
Example #6
 def setUp(cls):
     cls.categories, cls.documents = get_docs_categories()
     cls.parsed_docs = []
     for doc in cls.documents:
         cls.parsed_docs.append(whitespace_nlp(doc))
     cls.df = pd.DataFrame({
         'category': cls.categories,
         'parsed': cls.parsed_docs,
         'orig': [d.upper() for d in cls.documents]
     })
     cls.parsed_corpus = CorpusFromParsedDocuments(cls.df, 'category',
                                                   'parsed').build()
     cls.corpus = CorpusFromPandas(cls.df,
                                   'category',
                                   'orig',
                                   nlp=whitespace_nlp).build()
Example #7
    def test_remove_categories(self):
        df = pd.DataFrame(data=np.array(get_docs_categories_semiotic()).T,
                          columns=['category', 'text'])
        corpus = CorpusFromPandas(df, 'category', 'text', nlp=whitespace_nlp).build()
        swiftless = corpus.remove_categories(['swift'])

        swiftless_constructed = CorpusFromPandas(df[df['category'] != 'swift'],
                                                 'category',
                                                 'text',
                                                 nlp=whitespace_nlp).build()
        np.testing.assert_equal(
            [i for i in corpus._y if i != corpus.get_categories().index('swift')],
            swiftless._y
        )
        self.assertEqual(swiftless._y.shape[0], swiftless._X.shape[0])
        self.assertEqual(swiftless_constructed._X.shape, swiftless._X.shape)
        self.assertEqual(set(swiftless_constructed.get_terms()),
                         set(swiftless.get_terms()))
        pd.testing.assert_series_equal(swiftless_constructed.get_texts(),
                                       swiftless.get_texts())
        np.testing.assert_equal(swiftless.get_category_names_by_row(),
                                swiftless_constructed.get_category_names_by_row())
Example #8
def main():
    convention_df = SampleCorpora.ConventionData2012.get_data()

    corpus = CorpusFromPandas(
        convention_df,
        category_col='party',
        text_col='text',
        nlp=whitespace_nlp_with_sentences,
        feats_from_spacy_doc=FeatsFromGeneralInquirer()).build()
    html = produce_scattertext_explorer(corpus,
                                        category='democrat',
                                        category_name='Democratic',
                                        not_category_name='Republican',
                                        width_in_pixels=1000,
                                        metadata=convention_df['speaker'],
                                        use_non_text_features=True,
                                        use_full_doc=True)
    open('./demo_general_inquirer.html', 'wb').write(html.encode('utf-8'))
    print('Open ./demo_general_inquirer.html in Chrome or Firefox.')
Example #9
def main():
	nlp = spacy.load('en_core_web_sm')
	convention_df = SampleCorpora.ConventionData2012.get_data()
	corpus = CorpusFromPandas(convention_df,
	                          category_col='party',
	                          text_col='text',
	                          nlp=nlp).build()
	html = word_similarity_explorer(corpus,
	                                category='democrat',
	                                category_name='Democratic',
	                                not_category_name='Republican',
	                                target_term='jobs',
	                                minimum_term_frequency=5,
	                                width_in_pixels=1000,
	                                metadata=convention_df['speaker'],
	                                alpha=0.01,
	                                max_p_val=0.1,
	                                save_svg_button=True)
	open('./demo_similarity.html', 'wb').write(html.encode('utf-8'))
	print('Open ./demo_similarity.html in Chrome or Firefox.')
Example #10
    def test_remove_categories(self):
        df = pd.DataFrame(data=np.array(get_docs_categories_semiotic()).T,
                          columns=['category', 'text'])
        corpus = CorpusFromPandas(df, 'category', 'text', nlp=whitespace_nlp).build()
        swiftless = corpus.remove_categories(['swift'])

        swiftless_constructed = CorpusFromPandas(df[df['category'] != 'swift'],
                                                 'category',
                                                 'text',
                                                 nlp=whitespace_nlp).build()
        np.testing.assert_equal(
            [i for i in corpus._y if i != corpus.get_categories().index('swift')],
            swiftless._y
        )
        self.assertEqual(swiftless._y.shape[0], swiftless._X.shape[0])
        self.assertEqual(swiftless_constructed._X.shape, swiftless._X.shape)
        self.assertEqual(set(swiftless_constructed.get_terms()),
                         set(swiftless.get_terms()))
        pd.testing.assert_series_equal(swiftless_constructed.get_texts(),
                                       swiftless.get_texts())
Example #11
from scattertext import SampleCorpora, whitespace_nlp_with_sentences
from scattertext import produce_scattertext_explorer
from scattertext.CorpusFromPandas import CorpusFromPandas
from scattertext.termscoring.ScaledFScore import ScaledFScorePresets

convention_df = SampleCorpora.ConventionData2012.get_data()
corpus = CorpusFromPandas(convention_df,
                          category_col='party',
                          text_col='text',
                          nlp=whitespace_nlp_with_sentences).build()

html = produce_scattertext_explorer(
    corpus,
    category='democrat',
    category_name='Democratic',
    not_category_name='Republican',
    minimum_term_frequency=5,
    pmi_threshold_coefficient=8,
    width_in_pixels=1000,
    metadata=convention_df['speaker'],
    #term_scorer=ScaledFScorePresets(one_to_neg_one=True, beta=1),
    #d3_scale_chromatic_url='scattertext/data/viz/scripts/d3-scale-chromatic.v1.min.js',
    #d3_url='scattertext/data/viz/scripts/d3.min.js',
)

open('./demo.html', 'wb').write(html.encode('utf-8'))
print('Open ./demo.html in Chrome or Firefox.')
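The demos in these examples persist the generated HTML with open(...).write(html.encode('utf-8')). A purely stylistic alternative, sketched here under the assumption that the same html string is in scope, uses a context manager so the file handle is closed even if the write fails:

# Equivalent to open('./demo.html', 'wb').write(html.encode('utf-8')),
# but the context manager guarantees the handle is closed and flushed.
with open('./demo.html', 'w', encoding='utf-8') as out_file:
    out_file.write(html)
print('Open ./demo.html in Chrome or Firefox.')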
Example #12

# ================================================================================
all_satisfaction_score_comment_in_all_conds = utils_data.get_all_satisfaction_score_comment_in_all_conds()

columns = ['senti_on_Metfor_oral', 'feature', 'review']
all_satisfaction_score_comment_in_all_conds_df = pd.DataFrame(
    all_satisfaction_score_comment_in_all_conds, index=None, columns=columns)
# print("all_satisfaction_score_comment_in_all_conds_df", all_satisfaction_score_comment_in_all_conds_df)

# ================================================================================
corpus = CorpusFromPandas(
    all_satisfaction_score_comment_in_all_conds_df,
    category_col='senti_on_Metfor_oral',
    text_col='review',
    nlp=whitespace_nlp_with_sentences).build().get_unigram_corpus()

# ================================================================================
html = produce_frequency_explorer(
    corpus,
    category='negative',
    category_name='Negative',
    not_category_name='Positive',
    minimum_term_frequency=5,
    width_in_pixels=1000,
    term_scorer=ScaledFScorePresetsNeg1To1(beta=1, scaler_algo='normcdf'),
    grey_threshold=0,
    y_axis_values=[-1, 0, 1],
    metadata=all_satisfaction_score_comment_in_all_conds_df['feature'])

# ================================================================================
fn = '/mnt/1T-5e7/mycodehtml/Data_mining/Visualization/Scattertext/demo_scaled_f_score.html'
open(fn, 'wb').write(html.encode('utf-8'))
print('Open ' + fn + ' in Chrome or Firefox.')
Example #13
from scattertext import SampleCorpora, produce_fightin_words_explorer
from scattertext.CorpusFromPandas import CorpusFromPandas
from scattertext.WhitespaceNLP import whitespace_nlp_with_sentences
from scattertext.termsignificance.LogOddsRatioInformativeDirichletPiror import LogOddsRatioInformativeDirichletPrior

convention_df = SampleCorpora.ConventionData2012.get_data()
corpus = CorpusFromPandas(convention_df,
                          category_col='party',
                          text_col='text',
                          nlp=whitespace_nlp_with_sentences).build()
'''
term_freq_df = corpus.get_term_freq_df()
frequencies_scaled = scale(np.log(term_freq_df.sum(axis=1).values))
zeta_i_j = (LogOddsRatioUninformativeDirichletPrior()
	.get_zeta_i_j_given_separate_counts(term_freq_df['democrat freq'],
                                      term_freq_df['republican freq']))
zeta_scaled_for_charting = scale_neg_1_to_1_with_zero_mean_abs_max(zeta_i_j)

html = produce_scattertext_explorer(corpus,
                                    category='democrat',
                                    category_name='Democratic',
                                    not_category_name='Republican',
                                    minimum_term_frequency=5,
                                    width_in_pixels=1000,
                                    x_coords=frequencies_scaled,
                                    y_coords=zeta_scaled_for_charting,
                                    scores=zeta_i_j,
                                    sort_by_dist=False,
                                    metadata=convention_df['speaker'],
                                    x_label='Log Frequency',
                                    y_label='Log Odds Ratio w/ Prior (a_w=0.01)')
Example #14
import spacy
import csv, pandas as pd

from scattertext import produce_scattertext_explorer
from scattertext.CorpusFromPandas import CorpusFromPandas

nlp = spacy.load('en_core_web_sm')
platforms = pd.read_csv('official-candidate-platforms.csv')

corpus = CorpusFromPandas(platforms,
                          category_col='incumbent',
                          text_col='platform',
                          nlp=nlp).build()

incumbent = produce_scattertext_explorer(corpus,
                                         category='incumbent',
                                         category_name='Incumbent',
                                         not_category_name='Challenger',
                                         minimum_term_frequency=8,
                                         width_in_pixels=1000,
                                         filter_unigrams=True,
                                         metadata=platforms['meta'])

office = CorpusFromPandas(platforms,
                          category_col='position',
                          text_col='platform',
                          nlp=nlp).build()

mayor = produce_scattertext_explorer(office,
                                     category='mayor',
                                     category_name='Mayor',
Example #15
import spacy

from scattertext import SampleCorpora, PhraseMachinePhrases, dense_rank, RankDifference, AssociationCompactor, produce_scattertext_explorer
from scattertext.CorpusFromPandas import CorpusFromPandas

convention_df = SampleCorpora.ConventionData2012.get_data()
corpus = (CorpusFromPandas(convention_df,
                           category_col='party',
                           text_col='text',
                           feats_from_spacy_doc=PhraseMachinePhrases(),
                           nlp=spacy.load('en_core_web_sm', disable=['parser'])).build().compact(
                               AssociationCompactor(4000)))

html = produce_scattertext_explorer(corpus,
                                    category='democrat',
                                    category_name='Democratic',
                                    not_category_name='Republican',
                                    minimum_term_frequency=0,
                                    pmi_threshold_coefficient=0,
                                    transform=dense_rank,
                                    metadata=corpus.get_df()['speaker'],
                                    term_scorer=RankDifference(),
                                    width_in_pixels=1000)
open('./demo_phrase_machine.html', 'wb').write(html.encode('utf-8'))
print('Open ./demo_phrase_machine.html in Chrome or Firefox.')
Example #16
def get_test_corpus():
    df = pd.DataFrame(data=np.array(get_docs_categories_semiotic()).T,
                      columns=['category', 'text'])
    corpus = CorpusFromPandas(df, 'category', 'text',
                              nlp=whitespace_nlp).build()
    return corpus
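The test helpers above depend on fixture functions such as get_docs_categories_semiotic(). A minimal, self-contained sketch of the same pattern, using made-up toy data and assuming whitespace_nlp lives in scattertext.WhitespaceNLP alongside whitespace_nlp_with_sentences, builds a corpus directly from an in-memory DataFrame and inspects it:

import pandas as pd

from scattertext.CorpusFromPandas import CorpusFromPandas
from scattertext.WhitespaceNLP import whitespace_nlp

# Toy data for illustration only; any DataFrame with a category column and
# a text column will do.
df = pd.DataFrame({
    'category': ['hamlet', 'hamlet', 'swift', 'swift'],
    'text': ['to be or not to be that is the question',
             'the slings and arrows of outrageous fortune',
             'shake it off shake it off',
             'i stay out too late got nothing in my brain']
})
corpus = CorpusFromPandas(df, 'category', 'text', nlp=whitespace_nlp).build()

print(corpus.get_categories())           # distinct values of the category column
print(len(corpus.get_terms()))           # vocabulary size after tokenization
print(corpus.get_term_freq_df().head())  # per-category term frequency counts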
Example #17
from scattertext.Scalers import dense_rank

from scattertext.termscoring.RankDifference import RankDifference

from scattertext.termcompaction.AssociationCompactor import AssociationCompactor

from scattertext import SampleCorpora, whitespace_nlp_with_sentences, produce_scattertext_explorer
from scattertext.CorpusFromPandas import CorpusFromPandas

convention_df = SampleCorpora.ConventionData2012.get_data()
corpus = CorpusFromPandas(
    convention_df,
    category_col='party',
    text_col='text',
    nlp=whitespace_nlp_with_sentences).build().get_unigram_corpus().compact(
        AssociationCompactor(4000))

html = produce_scattertext_explorer(corpus,
                                    category='democrat',
                                    category_name='Democratic',
                                    not_category_name='Republican',
                                    minimum_term_frequency=0,
                                    pmi_threshold_coefficient=0,
                                    width_in_pixels=1000,
                                    metadata=convention_df['speaker'],
                                    term_scorer=RankDifference(),
                                    transform=dense_rank)

open('./demo_dense_rank.html', 'wb').write(html.encode('utf-8'))
print('Open ./demo_dense_rank.html in Chrome or Firefox.')
Example #18
from scattertext.Scalers import dense_rank
from scattertext.termscoring.DeltaJSDivergence import DeltaJSDivergence

from scattertext.termcompaction.AssociationCompactor import JSDCompactor

from scattertext import SampleCorpora, whitespace_nlp_with_sentences, produce_frequency_explorer, RankDifference
from scattertext.CorpusFromPandas import CorpusFromPandas

convention_df = SampleCorpora.ConventionData2012.get_data()
corpus = CorpusFromPandas(
    convention_df,
    category_col='party',
    text_col='text',
    nlp=whitespace_nlp_with_sentences).build().get_unigram_corpus().compact(
        JSDCompactor(1000))

term_etc_df = corpus.get_term_freq_df('').assign(
    DemocraticRank=lambda df: dense_rank(df['democrat']),
    RepublicanRank=lambda df: dense_rank(df['republican']),
    RankDiff=lambda df: RankDifference().get_scores(df['democrat'], df[
        'republican']),
)

get_custom_term_html = '(function(x) {return "Term: " + x.term + "<span class=topic_preview>"' + ' '.join(
    f''' + "<br>{name}: " + x.etc.{key}.toFixed(5)'''
    for name, key in [('Democratic Rank', 'DemocraticRank'),
                      ('Republican Rank',
                       'RepublicanRank'), ('Rank Difference Score',
                                           'RankDiff')]) + '+ "</span>" ;})'

html = produce_frequency_explorer(
Example #19
 def _get_test_corpus(self):
     cats, docs = get_docs_categories_four()
     df = pd.DataFrame({'category': cats, 'text': docs})
     corpus = CorpusFromPandas(df, 'category', 'text',
                               nlp=whitespace_nlp).build()
     return corpus
Example #20
import spacy

from scattertext import SampleCorpora, PhraseMachinePhrases
from scattertext import produce_scattertext_explorer
from scattertext.CorpusFromPandas import CorpusFromPandas
from scattertext.termcompaction.CompactTerms import CompactTerms

convention_df = SampleCorpora.ConventionData2012.get_data()
corpus = (CorpusFromPandas(convention_df,
                           category_col='party',
                           text_col='text',
                           feats_from_spacy_doc=PhraseMachinePhrases(),
                           nlp=spacy.load('en_core_web_sm', disable=['parser'])).build().compact(
                               CompactTerms(minimum_term_count=2)))

html = produce_scattertext_explorer(corpus,
                                    category='democrat',
                                    category_name='Democratic',
                                    not_category_name='Republican',
                                    minimum_term_frequency=2,
                                    pmi_threshold_coefficient=0,
                                    width_in_pixels=1000,
                                    metadata=convention_df['speaker'])
open('./demo_phrase_machine.html', 'wb').write(html.encode('utf-8'))
print('Open ./demo_phrase_machine.html in Chrome or Firefox.')
Example #21
import spacy

from scattertext import SampleCorpora, produce_scattertext_explorer
from scattertext.CorpusFromPandas import CorpusFromPandas
from scattertext.termscoring.LogOddsUniformativePriorScore import LogOddsUninformativePriorScore

nlp = spacy.load('en_core_web_sm')
convention_df = SampleCorpora.ConventionData2012.get_data()
corpus = CorpusFromPandas(convention_df,
                          category_col='party',
                          text_col='text',
                          nlp=nlp).build()
term_freq_df = corpus.get_term_freq_df()
scores = -(LogOddsUninformativePriorScore.get_thresholded_score(
    term_freq_df['democrat freq'],
    term_freq_df['republican freq'],
    alpha_w=2.,
    threshold=0.1))
html = produce_scattertext_explorer(corpus,
                                    category='democrat',
                                    category_name='Democratic',
                                    not_category_name='Republican',
                                    scores=scores,
                                    sort_by_dist=False,
                                    gray_zero_scores=True,
                                    minimum_term_frequency=5,
                                    width_in_pixels=1000,
                                    metadata=convention_df['speaker'])
open('./demo_insignificant_greyed_out.html', 'wb').write(html.encode('utf-8'))
print('Open ./demo_insignificant_greyed_out.html in Chrome or Firefox.')
Example #22

# ================================================================================
all_satisfaction_score_comment_in_all_conds = utils_data.get_all_satisfaction_score_comment_in_all_conds(
)

columns = ['senti_on_Metfor_oral', 'feature', 'review']
all_satisfaction_score_comment_in_all_conds_df = pd.DataFrame(
    all_satisfaction_score_comment_in_all_conds, index=None, columns=columns)

# ================================================================================
corpus = (CorpusFromPandas(all_satisfaction_score_comment_in_all_conds_df,
                           category_col='senti_on_Metfor_oral',
                           text_col='review',
                           feats_from_spacy_doc=PhraseMachinePhrases(),
                           nlp=spacy.load('en_core_web_sm', disable=['parser'])).build().compact(
                               CompactTerms(minimum_term_count=2)))

# ================================================================================
html = produce_scattertext_explorer(
    corpus,
    category='negative',
    category_name='Negative',
    not_category_name='Positive',
    minimum_term_frequency=2,
    pmi_threshold_coefficient=0,
    width_in_pixels=1000,
    metadata=all_satisfaction_score_comment_in_all_conds_df['feature'])

# ================================================================================
Example #23
def main():
    nlp = spacy.load('en_core_web_sm')

    # ================================================================================
    # convention_df = SampleCorpora.ConventionData2012.get_data()
    # print("convention_df",convention_df)
    #         party         speaker  \
    # 0    democrat    BARACK OBAMA
    # 1    democrat    MICHELLE OBAMA
    # 2    democrat    RICHARD DURBIN
    # 3    democrat    JOSEPH BIDEN
    # 4    democrat    JILL BIDEN
    # 5    democrat    ANGIE FLORES

    #      text
    # 0    Thank you. Thank you. Thank you. Thank you so much.Thank you.Thank you so much. Thank you. Thank you very
    # 1    Thank you so much. Tonight, I am so thrilled and so honored and so proud to introduce the love of my life
    # 2    Thank you. It is a singular honor to be here tonight. Eight years ago in Boston, I introduced you to a sta
    # 3    Hey, Delaware. \nAnd my favorite Democrat, Jilly, I want you to know that Beau and Hunt and Ashley and I —
    # 4    Hello. \nThank you, Angie. I'm so proud of how far you've come.\nI'm so proud to stand before you tonight
    # 5    My name is Angie Flores and I am a student at Miami-Dade College. \nWhen you grow up in a family where get

    # print("convention_df",convention_df.shape)
    # (189, 3)

    # df1=convention_df.iloc[:10,:]
    # df2=convention_df.iloc[150:160,:]
    # df_cat=pd.concat([df1,df2],axis=0)
    # # print("df_cat",df_cat.shape)
    # # (20, 3)
    # convention_df=df_cat

    # ================================================================================
    # convention_df=pd.read_csv('/mnt/1T-5e7/mycodehtml/Data_mining/Visualization/Scattertext/Data/WebMD_Metformin_oral/text.csv',encoding='utf8',error_bad_lines=False)
    # print("convention_df",convention_df.shape)

    # ================================================================================
    # corpus = CorpusFromPandas(convention_df,
    #                           category_col='party',
    #                           text_col='text',
    #                           nlp=nlp).build()

    # ================================================================================
    all_satisfaction_score_comment_in_all_conds = utils_data.get_all_satisfaction_score_comment_in_all_conds(
    )
    # print("all_satisfaction_score_comment_in_all_conds",all_satisfaction_score_comment_in_all_conds)
    # [['negative', 'Satisfaction', 'after a week----mouth ulccers,cudnt talk,eat,drink for 5 days....whole body burnt,headache, fatigue....quit---am slowly getting better, wudnt give to my worst

    # print("all_satisfaction_score_comment_in_all_conds",len(all_satisfaction_score_comment_in_all_conds))
    # 1402

    # ================================================================================
    columns = ['senti_on_Metfor_oral', 'feature', 'review']
    all_satisfaction_score_comment_in_all_conds_df = pd.DataFrame(
        all_satisfaction_score_comment_in_all_conds,
        index=None,
        columns=columns)

    # ================================================================================
    corpus = CorpusFromPandas(all_satisfaction_score_comment_in_all_conds_df,
                              category_col='senti_on_Metfor_oral',
                              text_col='review',
                              nlp=nlp).build()

    # ================================================================================
    html = word_similarity_explorer(
        corpus,
        category='negative',
        category_name='Negative',
        not_category_name='Positive',
        target_term='jobs',
        minimum_term_frequency=5,
        width_in_pixels=1000,
        metadata=all_satisfaction_score_comment_in_all_conds_df['feature'],
        alpha=0.01,
        max_p_val=0.1,
        save_svg_button=True)

    # ================================================================================
    open(
        '/mnt/1T-5e7/mycodehtml/Data_mining/Visualization/Scattertext/demo_similarity.html',
        'wb').write(html.encode('utf-8'))
    print('Open ./demo_similarity.html in Chrome or Firefox.')
Example #24
from scattertext import SampleCorpora, whitespace_nlp_with_sentences, CohensD, produce_frequency_explorer, \
    OncePerDocFrequencyRanker
from scattertext.termcompaction.ClassPercentageCompactor import ClassPercentageCompactor
from scattertext import produce_scattertext_explorer
from scattertext.CorpusFromPandas import CorpusFromPandas
from scattertext.termranking import ClassBalancedFrequencyRanker
from scattertext.termscoring.ScaledFScore import ScaledFScorePresets

convention_df = SampleCorpora.ConventionData2012.get_data()
corpus = (CorpusFromPandas(
    convention_df,
    category_col='party',
    text_col='text',
    nlp=whitespace_nlp_with_sentences).build().compact(
        ClassPercentageCompactor(term_ranker=OncePerDocFrequencyRanker)))

html = produce_frequency_explorer(
    corpus,
    category='democrat',
    category_name='Democratic',
    not_category_name='Republican',
    term_scorer=CohensD(corpus).set_term_ranker(
        ClassBalancedFrequencyRanker).set_categories('democrat',
                                                     ['republican']),
    metadata=convention_df['speaker'],
    grey_threshold=0,
    show_neutral=True)
file_name = 'demo_cohens_d.html'
open(file_name, 'wb').write(html.encode('utf-8'))
print('Open ./demo_cohens_d.html in Chrome or Firefox.')
Example #25
import spacy

from scattertext import SampleCorpora, whitespace_nlp_with_sentences, PhraseMachinePhrases
from scattertext import produce_scattertext_explorer
from scattertext.CorpusFromPandas import CorpusFromPandas

convention_df = SampleCorpora.ConventionData2012.get_data()
corpus = CorpusFromPandas(convention_df,
                          category_col='party',
                          text_col='text',
                          feats_from_spacy_doc=PhraseMachinePhrases(),
                          nlp=whitespace_nlp_with_sentences).build()

html = produce_scattertext_explorer(corpus,
                                    category='democrat',
                                    category_name='Democratic',
                                    not_category_name='Republican',
                                    minimum_term_frequency=5,
                                    pmi_threshold_coefficient=8,
                                    width_in_pixels=1000,
                                    metadata=convention_df['speaker'])
open('./demo.html', 'wb').write(html.encode('utf-8'))
print('Open ./demo.html in Chrome or Firefox.')
Example #26
	def test_one_word_per_docs(self):
		records = [(0, 'verified', 'RAs'), (1, 'view', 'RAs'), (2, 'laminectomy', 'RAs'), (3, 'recognition', 'RAs'),
		           (4, 'possibility', 'RAs'), (5, 'possibility', 'RAs'), (6, 'possibility', 'RAs'),
		           (7, 'observations', 'RAs'), (8, 'observation', 'RAs'), (9, 'observation', 'RAs'),
		           (10, 'observation', 'RAs'), (11, 'observation', 'RAs'), (12, 'observation', 'RAs'),
		           (13, 'implication', 'RAs'), (14, 'idea', 'RAs'), (15, 'hypothesis', 'RAs'), (16, 'fact', 'RAs'),
		           (17, 'fact', 'RAs'), (18, 'fact', 'RAs'), (19, 'fact', 'RAs'), (20, 'fact', 'RAs'),
		           (21, 'surprising', 'RAs'), (22, 'surprising', 'RAs'), (23, 'surprising', 'RAs'), (24, 'suggests', 'RAs'),
		           (25, 'suggests', 'RAs'), (26, 'suggests', 'RAs'), (27, 'suggests', 'RAs'), (28, 'suggests', 'RAs'),
		           (29, 'suggests', 'RAs'), (30, 'suggests', 'RAs'), (31, 'suggests', 'RAs'), (32, 'suggests', 'RAs'),
		           (33, 'suggests', 'RAs'), (34, 'suggests', 'RAs'), (35, 'suggests', 'RAs'), (36, 'suggests', 'RAs'),
		           (37, 'suggests', 'RAs'), (38, 'suggests', 'RAs'), (39, 'suggests', 'RAs'), (40, 'suggests', 'RAs'),
		           (41, 'suggests', 'RAs'), (42, 'suggests', 'RAs'), (43, 'suggests', 'RAs'), (44, 'suggests', 'RAs'),
		           (45, 'suggests', 'RAs'), (46, 'suggests', 'RAs'), (47, 'suggests', 'RAs'), (48, 'suggesting', 'RAs'),
		           (49, 'suggesting', 'RAs'), (50, 'suggesting', 'RAs'), (51, 'suggesting', 'RAs'),
		           (52, 'suggesting', 'RAs'), (53, 'suggesting', 'RAs'), (54, 'suggesting', 'RAs'),
		           (55, 'suggesting', 'RAs'), (56, 'suggesting', 'RAs'), (57, 'suggesting', 'RAs'),
		           (58, 'suggesting', 'RAs'), (59, 'suggesting', 'RAs'), (60, 'suggesting', 'RAs'),
		           (61, 'suggesting', 'RAs'), (62, 'suggesting', 'RAs'), (63, 'suggesting', 'RAs'),
		           (64, 'suggesting', 'RAs'), (65, 'suggesting', 'RAs'), (66, 'suggesting', 'RAs'),
		           (67, 'suggesting', 'RAs'), (68, 'suggesting', 'RAs'), (69, 'suggesting', 'RAs'),
		           (70, 'suggesting', 'RAs'), (71, 'suggesting', 'RAs'), (72, 'suggesting', 'RAs'),
		           (73, 'suggesting', 'RAs'), (74, 'suggesting', 'RAs'), (75, 'suggested', 'RAs'), (76, 'suggested', 'RAs'),
		           (77, 'suggested', 'RAs'), (78, 'suggested', 'RAs'), (79, 'suggested', 'RAs'), (80, 'suggest', 'RAs'),
		           (81, 'suggest', 'RAs'), (82, 'suggest', 'RAs'), (83, 'suggest', 'RAs'), (84, 'suggest', 'RAs'),
		           (85, 'suggest', 'RAs'), (86, 'suggest', 'RAs'), (87, 'suggest', 'RAs'), (88, 'suggest', 'RAs'),
		           (89, 'suggest', 'RAs'), (90, 'suggest', 'RAs'), (91, 'suggest', 'RAs'), (92, 'suggest', 'RAs'),
		           (93, 'suggest', 'RAs'), (94, 'suggest', 'RAs'), (95, 'suggest', 'RAs'), (96, 'suggest', 'RAs'),
		           (97, 'suggest', 'RAs'), (98, 'suggest', 'RAs'), (99, 'suggest', 'RAs'), (100, 'suggest', 'RAs'),
		           (101, 'suggest', 'RAs'), (102, 'suggest', 'RAs'), (103, 'suggest', 'RAs'), (104, 'suggest', 'RAs'),
		           (105, 'suggest', 'RAs'), (106, 'suggest', 'RAs'), (107, 'suggest', 'RAs'), (108, 'suggest', 'RAs'),
		           (109, 'suggest', 'RAs'), (110, 'suggest', 'RAs'), (111, 'suggest', 'RAs'), (112, 'suggest', 'RAs'),
		           (113, 'suggest', 'RAs'), (114, 'suggest', 'RAs'), (115, 'suggest', 'RAs'), (116, 'suggest', 'RAs'),
		           (117, 'suggest', 'RAs'), (118, 'suggest', 'RAs'), (119, 'suggest', 'RAs'), (120, 'suggest', 'RAs'),
		           (121, 'suggest', 'RAs'), (122, 'suggest', 'RAs'), (123, 'suggest', 'RAs'), (124, 'suggest', 'RAs'),
		           (125, 'suggest', 'RAs'), (126, 'suggest', 'RAs'), (127, 'suggest', 'RAs'), (128, 'speculate', 'RAs'),
		           (129, 'speculate', 'RAs'), (130, 'speculate', 'RAs'), (131, 'shows', 'RAs'), (132, 'shows', 'RAs'),
		           (133, 'shows', 'RAs'), (134, 'shows', 'RAs'), (135, 'shows', 'RAs'), (136, 'shown', 'RAs'),
		           (137, 'shown', 'RAs'), (138, 'shown', 'RAs'), (139, 'shown', 'RAs'), (140, 'showing', 'RAs'),
		           (141, 'showing', 'RAs'), (142, 'showing', 'RAs'), (143, 'showing', 'RAs'), (144, 'showing', 'RAs'),
		           (145, 'showing', 'RAs'), (146, 'showed', 'RAs'), (147, 'showed', 'RAs'), (148, 'showed', 'RAs'),
		           (149, 'showed', 'RAs'), (150, 'showed', 'RAs'), (151, 'showed', 'RAs'), (152, 'showed', 'RAs'),
		           (153, 'showed', 'RAs'), (154, 'showed', 'RAs'), (155, 'showed', 'RAs'), (156, 'showed', 'RAs'),
		           (157, 'showed', 'RAs'), (158, 'showed', 'RAs'), (159, 'showed', 'RAs'), (160, 'showed', 'RAs'),
		           (161, 'showed', 'RAs'), (162, 'showed', 'RAs'), (163, 'showed', 'RAs'), (164, 'showed', 'RAs'),
		           (165, 'showed', 'RAs'), (166, 'showed', 'RAs'), (167, 'showed', 'RAs'), (168, 'showed', 'RAs'),
		           (169, 'show', 'RAs'), (170, 'show', 'RAs'), (171, 'show', 'RAs'), (172, 'show', 'RAs'),
		           (173, 'show', 'RAs'), (174, 'show', 'RAs'), (175, 'show', 'RAs'), (176, 'show', 'RAs'),
		           (177, 'show', 'RAs'), (178, 'show', 'RAs'), (179, 'show', 'RAs'), (180, 'show', 'RAs'),
		           (181, 'show', 'RAs'), (182, 'show', 'RAs'), (183, 'show', 'RAs'), (184, 'show', 'RAs'),
		           (185, 'show', 'RAs'), (186, 'show', 'RAs'), (187, 'show', 'RAs'), (188, 'show', 'RAs'),
		           (189, 'show', 'RAs'), (190, 'show', 'RAs'), (191, 'show', 'RAs'), (192, 'revealing', 'RAs'),
		           (193, 'revealed', 'RAs'), (194, 'revealed', 'RAs'), (195, 'revealed', 'RAs'), (196, 'revealed', 'RAs'),
		           (197, 'revealed', 'RAs'), (198, 'revealed', 'RAs'), (199, 'reveal', 'RAs'), (200, 'requires', 'RAs'),
		           (201, 'requires', 'RAs'), (202, 'requires', 'RAs'), (203, 'report', 'RAs'), (204, 'report', 'RAs'),
		           (205, 'reasoned', 'RAs'), (206, 'reasoned', 'RAs'), (207, 'reasoned', 'RAs'), (208, 'reasoned', 'RAs'),
		           (209, 'rationale', 'RAs'), (210, 'observations', 'RAs'), (211, 'findings', 'RAs'),
		           (212, 'postulated', 'RAs'), (213, 'postulate', 'RAs'), (214, 'possible', 'RAs'),
		           (215, 'possible', 'RAs'), (216, 'possible', 'RAs'), (217, 'possible', 'RAs'), (218, 'possible', 'RAs'),
		           (219, 'possible', 'RAs'), (220, 'possible', 'RAs'), (221, 'possible', 'RAs'), (222, 'possible', 'RAs'),
		           (223, 'possible', 'RAs'), (224, 'possible', 'RAs'), (225, 'possible', 'RAs'), (226, 'possible', 'RAs'),
		           (227, 'possible', 'RAs'), (228, 'possibility', 'RAs'), (229, 'possibility', 'RAs'),
		           (230, 'possibility', 'RAs'), (231, 'possibility', 'RAs'), (232, 'possibility', 'RAs'),
		           (233, 'possibility', 'RAs'), (234, 'possibility', 'RAs'), (235, 'possibility', 'RAs'),
		           (236, 'explanation', 'RAs'), (237, 'possibility', 'RAs'), (238, 'One', 'RAs'),
		           (239, 'interpretation', 'RAs'), (240, 'observed', 'RAs'), (241, 'observed', 'RAs'),
		           (242, 'observations', 'RAs'), (243, 'observations', 'RAs'), (244, 'observation', 'RAs'),
		           (245, 'noteworthy', 'RAs'), (246, 'noted', 'RAs'), (247, 'noted', 'RAs'), (248, 'noted', 'RAs'),
		           (249, 'note', 'RAs'), (250, 'known', 'RAs'), (251, 'evidence', 'RAs'), (252, 'doubt', 'RAs'),
		           (253, 'means', 'RAs'), (254, 'means', 'RAs'), (255, 'likely', 'RAs'), (256, 'likely', 'RAs'),
		           (257, 'likely', 'RAs'), (258, 'likely', 'RAs'), (259, 'likely', 'RAs'), (260, 'likely', 'RAs'),
		           (261, 'likely', 'RAs'), (262, 'likely', 'RAs'), (263, 'likely', 'RAs'), (264, 'possible', 'RAs'),
		           (265, 'possible', 'RAs'), (266, 'possible', 'RAs'), (267, 'interesting', 'RAs'), (268, 'infer', 'RAs'),
		           (269, 'inevitable', 'RAs'), (270, 'indicating', 'RAs'), (271, 'indicating', 'RAs'),
		           (272, 'indicating', 'RAs'), (273, 'indicating', 'RAs'), (274, 'indicating', 'RAs'),
		           (275, 'indicating', 'RAs'), (276, 'indicating', 'RAs'), (277, 'indicating', 'RAs'),
		           (278, 'indicating', 'RAs'), (279, 'indicates', 'RAs'), (280, 'indicates', 'RAs'),
		           (281, 'indicates', 'RAs'), (282, 'indicates', 'RAs'), (283, 'indicates', 'RAs'),
		           (284, 'indicates', 'RAs'), (285, 'indicated', 'RAs'), (286, 'indicated', 'RAs'),
		           (287, 'indicated', 'RAs'), (288, 'indicated', 'RAs'), (289, 'indicated', 'RAs'),
		           (290, 'indicated', 'RAs'), (291, 'indicated', 'RAs'), (292, 'indicate', 'RAs'), (293, 'indicate', 'RAs'),
		           (294, 'indicate', 'RAs'), (295, 'indicate', 'RAs'), (296, 'indicate', 'RAs'), (297, 'indicate', 'RAs'),
		           (298, 'indicate', 'RAs'), (299, 'indicate', 'RAs'), (300, 'indicate', 'RAs'), (301, 'indicate', 'RAs'),
		           (302, 'indicate', 'RAs'), (303, 'indicate', 'RAs'), (304, 'indicate', 'RAs'), (305, 'indicate', 'RAs'),
		           (306, 'indicate', 'RAs'), (307, 'indicate', 'RAs'), (308, 'indicate', 'RAs'), (309, 'indicate', 'RAs'),
		           (310, 'indicate', 'RAs'), (311, 'indicate', 'RAs'), (312, 'indicate', 'RAs'), (313, 'indicate', 'RAs'),
		           (314, 'implying', 'RAs'), (315, 'imply', 'RAs'), (316, 'imply', 'RAs'), (317, 'implies', 'RAs'),
		           (318, 'idea', 'RAs'), (319, 'idea', 'RAs'), (320, 'hypothesized', 'RAs'), (321, 'hypothesized', 'RAs'),
		           (322, 'hypothesized', 'RAs'), (323, 'shown', 'RAs'), (324, 'given', 'RAs'), (325, 'given', 'RAs'),
		           (326, 'given', 'RAs'), (327, 'given', 'RAs'), (328, 'evidence', 'RAs'), (329, 'found', 'RAs'),
		           (330, 'found', 'RAs'), (331, 'found', 'RAs'), (332, 'found', 'RAs'), (333, 'found', 'RAs'),
		           (334, 'found', 'RAs'), (335, 'found', 'RAs'), (336, 'found', 'RAs'), (337, 'found', 'RAs'),
		           (338, 'found', 'RAs'), (339, 'found', 'RAs'), (340, 'found', 'RAs'), (341, 'found', 'RAs'),
		           (342, 'found', 'RAs'), (343, 'found', 'RAs'), (344, 'found', 'RAs'), (345, 'found', 'RAs'),
		           (346, 'found', 'RAs'), (347, 'found', 'RAs'), (348, 'found', 'RAs'), (349, 'found', 'RAs'),
		           (350, 'found', 'RAs'), (351, 'finding', 'RAs'), (352, 'find', 'RAs'), (353, 'feel', 'RAs'),
		           (354, 'fact', 'RAs'), (355, 'extent', 'RAs'), (356, 'expected', 'RAs'), (357, 'evidence', 'RAs'),
		           (358, 'evidence', 'RAs'), (359, 'evidence', 'RAs'), (360, 'evidence', 'RAs'), (361, 'estimated', 'RAs'),
		           (362, 'estimated', 'RAs'), (363, 'estimated', 'RAs'), (364, 'estimate', 'RAs'),
		           (365, 'established', 'RAs'), (366, 'established', 'RAs'), (367, 'emphasize', 'RAs'),
		           (368, 'determined', 'RAs'), (369, 'demonstration', 'RAs'), (370, 'demonstrating', 'RAs'),
		           (371, 'demonstrates', 'RAs'), (372, 'demonstrated', 'RAs'), (373, 'demonstrated', 'RAs'),
		           (374, 'demonstrate', 'RAs'), (375, 'demonstrate', 'RAs'), (376, 'demonstrate', 'RAs'),
		           (377, 'demonstrate', 'RAs'), (378, 'demonstrate', 'RAs'), (379, 'demonstrate', 'RAs'),
		           (380, 'demonstrate', 'RAs'), (381, 'demonstrate', 'RAs'), (382, 'demonstrate', 'RAs'),
		           (383, 'argued', 'RAs'), (384, 'confirming', 'RAs'), (385, 'confirming', 'RAs'),
		           (386, 'confirming', 'RAs'), (387, 'confirming', 'RAs'), (388, 'confirmed', 'RAs'),
		           (389, 'confirmed', 'RAs'), (390, 'confirmed', 'RAs'), (391, 'confirmed', 'RAs'),
		           (392, 'confirmed', 'RAs'), (393, 'confirm', 'RAs'), (394, 'confirm', 'RAs'), (395, 'conclusion', 'RAs'),
		           (396, 'conclude', 'RAs'), (397, 'conclude', 'RAs'), (398, 'conclude', 'RAs'), (399, 'conclude', 'RAs'),
		           (400, 'conclude', 'RAs'), (401, 'conclude', 'RAs'), (402, 'conclude', 'RAs'), (403, 'conclude', 'RAs'),
		           (404, 'believe', 'RAs'), (405, 'believe', 'RAs'), (406, 'believe', 'RAs'), (407, 'believe', 'RAs'),
		           (408, 'believe', 'RAs'), (409, 'appears', 'RAs'), (410, 'appeared', 'RAs'), (411, 'appeared', 'RAs'),
		           (412, 'anticipated', 'RAs'), (413, 'acknowledged', 'RAs'), (414, 'acknowledge', 'RAs'),
		           (415, 'accept', 'RAs'), (416, 'limitation', 'RAs'), (417, 'explanation', 'RAs'), (418, 'finding', 'RAs'),
		           (419, 'decision', 'RAs'), (420, 'well-known', 'RAs'), (421, 'view', 'RAs'), (422, 'observation', 'RAs'),
		           (423, 'fact', 'RAs'), (424, 'fact', 'RAs'), (425, 'reports', 'RAs'), (426, 'possibility', 'RAs'),
		           (427, 'indication', 'RAs'), (428, 'exclude', 'RAs'), (429, 'reported', 'RAs'), (430, 'indicated', 'RAs'),
		           (431, 'observation', 'RAs'), (432, 'observation', 'RAs'), (433, 'suggests', 'RAs'),
		           (434, 'suggesting', 'RAs'), (435, 'suggesting', 'RAs'), (436, 'suggesting', 'RAs'),
		           (437, 'suggesting', 'RAs'), (438, 'suggesting', 'RAs'), (439, 'suggested', 'RAs'),
		           (440, 'suggested', 'RAs'), (441, 'suggested', 'RAs'), (442, 'suggested', 'RAs'),
		           (443, 'suggested', 'RAs'), (444, 'suggested', 'RAs'), (445, 'suggested', 'RAs'),
		           (446, 'suggested', 'RAs'), (447, 'suggested', 'RAs'), (448, 'suggest', 'RAs'), (449, 'suggest', 'RAs'),
		           (450, 'suggest', 'RAs'), (451, 'suggest', 'RAs'), (452, 'shown', 'RAs'), (453, 'shown', 'RAs'),
		           (454, 'shown', 'RAs'), (455, 'shown', 'RAs'), (456, 'shown', 'RAs'), (457, 'shown', 'RAs'),
		           (458, 'shown', 'RAs'), (459, 'shown', 'RAs'), (460, 'shown', 'RAs'), (461, 'shown', 'RAs'),
		           (462, 'shown', 'RAs'), (463, 'shown', 'RAs'), (464, 'shown', 'RAs'), (465, 'showing', 'RAs'),
		           (466, 'showed', 'RAs'), (467, 'showed', 'RAs'), (468, 'showed', 'RAs'), (469, 'showed', 'RAs'),
		           (470, 'showed', 'RAs'), (471, 'show', 'RAs'), (472, 'show', 'RAs'), (473, 'show', 'RAs'),
		           (474, 'revealed', 'RAs'), (475, 'revealed', 'RAs'), (476, 'revealed', 'RAs'), (477, 'reported', 'RAs'),
		           (478, 'reported', 'RAs'), (479, 'reported', 'RAs'), (480, 'reported', 'RAs'), (481, 'reported', 'RAs'),
		           (482, 'reported', 'RAs'), (483, 'evidence', 'RAs'), (484, 'proposed', 'RAs'), (485, 'reports', 'RAs'),
		           (486, 'observations', 'RAs'), (487, 'postulated', 'RAs'), (488, 'observations', 'RAs'),
		           (489, 'observations', 'RAs'), (490, 'observation', 'RAs'), (491, 'notion', 'RAs'), (492, 'noted', 'RAs'),
		           (493, 'noted', 'RAs'), (494, 'thought', 'RAs'), (495, 'increasing', 'RAs'), (496, 'indicates', 'RAs'),
		           (497, 'indicated', 'RAs'), (498, 'indicate', 'RAs'), (499, 'indicate', 'RAs'), (500, 'evidence', 'RAs'),
		           (501, 'hypothesized', 'RAs'), (502, 'found', 'RAs'), (503, 'found', 'RAs'), (504, 'found', 'RAs'),
		           (505, 'found', 'RAs'), (506, 'found', 'RAs'), (507, 'found', 'RAs'), (508, 'found', 'RAs'),
		           (509, 'found', 'RAs'), (510, 'found', 'RAs'), (511, 'found', 'RAs'), (512, 'findings', 'RAs'),
		           (513, 'findings', 'RAs'), (514, 'findings', 'RAs'), (515, 'find', 'RAs'), (516, 'evidence', 'RAs'),
		           (517, 'evidence', 'RAs'), (518, 'established', 'RAs'), (519, 'established', 'RAs'),
		           (520, 'documented', 'RAs'), (521, 'demonstrated', 'RAs'), (522, 'demonstrated', 'RAs'),
		           (523, 'demonstrated', 'RAs'), (524, 'demonstrated', 'RAs'), (525, 'demonstrated', 'RAs'),
		           (526, 'demonstrated', 'RAs'), (527, 'demonstrated', 'RAs'), (528, 'demonstrated', 'RAs'),
		           (529, 'demonstrated', 'RAs'), (530, 'confirmed', 'RAs'), (531, 'concluded', 'RAs'),
		           (532, 'claimed', 'RAs'), (533, 'believed', 'RAs'), (534, 'argued', 'RAs'), (535, 'reports', 'RAs'),
		           (536, 'prove', 'RAs'), (537, 'confirm', 'RAs'), (538, 'show', 'RAs'), (539, 'types', 'RAs'),
		           (540, 'analysis', 'RAs'), (541, 'fact', 'RAs'), (542, 'showing', 'RAs'), (543, 'recognize', 'RAs'),
		           (544, 'reassuring', 'RAs'), (545, 'provided', 'RAs'), (546, 'note', 'RAs'), (547, 'limitation', 'RAs'),
		           (548, 'knowing', 'RAs'), (549, 'expected', 'RAs'), (550, 'indicating', 'RAs'), (551, 'indicates', 'RAs'),
		           (552, 'indicated', 'RAs'), (553, 'included', 'RAs'), (554, 'given', 'RAs'), (555, 'estimated', 'RAs'),
		           (556, 'estimated', 'RAs'), (557, 'established', 'RAs'), (558, 'ensured', 'RAs'), (559, 'ensure', 'RAs'),
		           (560, 'ensure', 'RAs'), (561, 'ensure', 'RAs'), (562, 'effect', 'RAs'), (563, 'dependence', 'RAs'),
		           (564, 'confirm', 'RAs'), (565, 'confirm', 'RAs'), (566, 'condition', 'RAs'), (567, 'assuming', 'RAs'),
		           (568, 'assumed', 'RAs'), (569, 'acknowledge', 'RAs'), (570, 'method', 'RAs'), (571, 'limitation', 'RAs'),
		           (572, 'difference', 'RAs'), (573, 'length', 'RAs'), (574, 'view', 'RAs'), (575, 'theory', 'RAs'),
		           (576, 'notion', 'RAs'), (577, 'notion', 'RAs'), (578, 'idea', 'RAs'), (579, 'hypothesis', 'RAs'),
		           (580, 'suggests', 'RAs'), (581, 'recognises', 'RAs'), (582, 'probability', 'RAs'),
		           (583, 'postulated', 'RAs'), (584, 'postulated', 'RAs'), (585, 'hypothesis', 'RAs'),
		           (586, 'hypothesis', 'RAs'), (587, 'hypothesis', 'RAs'), (588, 'account', 'RAs'), (589, 'account', 'RAs'),
		           (590, 'theory', 'RAs'), (591, 'idea', 'RAs'), (592, 'unlikely', 'RAs'), (593, 'understand', 'RAs'),
		           (594, 'uncovered', 'RAs'), (595, 'time', 'RAs'), (596, 'potential', 'RAs'), (597, 'possibility', 'RAs'),
		           (598, 'finding', 'RAs'), (599, 'fact', 'RAs'), (600, 'fact', 'RAs'), (601, 'plausibility', 'RAs'),
		           (602, 'suggests', 'RAs'), (603, 'suggests', 'RAs'), (604, 'suggests', 'RAs'), (605, 'suggests', 'RAs'),
		           (606, 'suggests', 'RAs'), (607, 'suggesting', 'RAs'), (608, 'suggesting', 'RAs'),
		           (609, 'suggesting', 'RAs'), (610, 'suggesting', 'RAs'), (611, 'suggesting', 'RAs'),
		           (612, 'suggesting', 'RAs'), (613, 'suggesting', 'RAs'), (614, 'suggesting', 'RAs'),
		           (615, 'suggesting', 'RAs'), (616, 'suggesting', 'RAs'), (617, 'suggesting', 'RAs'),
		           (618, 'suggested', 'RAs'), (619, 'suggest', 'RAs'), (620, 'suggest', 'RAs'), (621, 'suggest', 'RAs'),
		           (622, 'suggest', 'RAs'), (623, 'suggest', 'RAs'), (624, 'suggest', 'RAs'), (625, 'suggest', 'RAs'),
		           (626, 'suggest', 'RAs'), (627, 'suggest', 'RAs'), (628, 'suggest', 'RAs'), (629, 'suggest', 'RAs'),
		           (630, 'suggest', 'RAs'), (631, 'suggest', 'RAs'), (632, 'suggest', 'RAs'), (633, 'suggest', 'RAs'),
		           (634, 'suggest', 'RAs'), (635, 'suggest', 'RAs'), (636, 'shows', 'RAs'), (637, 'shown', 'RAs'),
		           (638, 'shown', 'RAs'), (639, 'shown', 'RAs'), (640, 'shown', 'RAs'), (641, 'showing', 'RAs'),
		           (642, 'showing', 'RAs'), (643, 'showing', 'RAs'), (644, 'showing', 'RAs'), (645, 'showing', 'RAs'),
		           (646, 'showing', 'RAs'), (647, 'showing', 'RAs'), (648, 'showing', 'RAs'), (649, 'showed', 'RAs'),
		           (650, 'showed', 'RAs'), (651, 'showed', 'RAs'), (652, 'showed', 'RAs'), (653, 'showed', 'RAs'),
		           (654, 'show', 'RAs'), (655, 'show', 'RAs'), (656, 'show', 'RAs'), (657, 'show', 'RAs'),
		           (658, 'show', 'RAs'), (659, 'show', 'RAs'), (660, 'show', 'RAs'), (661, 'revealed', 'RAs'),
		           (662, 'revealed', 'RAs'), (663, 'revealed', 'RAs'), (664, 'revealed', 'RAs'), (665, 'revealed', 'RAs'),
		           (666, 'reported', 'RAs'), (667, 'possible', 'RAs'), (668, 'possible', 'RAs'), (669, 'possible', 'RAs'),
		           (670, 'possible', 'RAs'), (671, 'possible', 'RAs'), (672, 'possible', 'RAs'), (673, 'possible', 'RAs'),
		           (674, 'possible', 'RAs'), (675, 'observation', 'RAs'), (676, 'hypothesis', 'RAs'),
		           (677, 'observed', 'RAs'), (678, 'observed', 'RAs'), (679, 'observed', 'RAs'), (680, 'observed', 'RAs'),
		           (681, 'observed', 'RAs'), (682, 'observed', 'RAs'), (683, 'observed', 'RAs'), (684, 'observed', 'RAs'),
		           (685, 'observed', 'RAs'), (686, 'observed', 'RAs'), (687, 'observed', 'RAs'), (688, 'observed', 'RAs'),
		           (689, 'observed', 'RAs'), (690, 'observed', 'RAs'), (691, 'noted', 'RAs'), (692, 'note', 'RAs'),
		           (693, 'note', 'RAs'), (694, 'lower', 'RAs'), (695, 'likely', 'RAs'), (696, 'indicating', 'RAs'),
		           (697, 'indicating', 'RAs'), (698, 'indicating', 'RAs'), (699, 'indicates', 'RAs'),
		           (700, 'indicated', 'RAs'), (701, 'indicate', 'RAs'), (702, 'indicate', 'RAs'), (703, 'indicate', 'RAs'),
		           (704, 'indicate', 'RAs'), (705, 'illustrate', 'RAs'), (706, 'illustrate', 'RAs'),
		           (707, 'hypothesized', 'RAs'), (708, 'higher', 'RAs'), (709, 'given', 'RAs'), (710, 'Given', 'RAs'),
		           (711, 'found', 'RAs'), (712, 'found', 'RAs'), (713, 'found', 'RAs'), (714, 'found', 'RAs'),
		           (715, 'found', 'RAs'), (716, 'found', 'RAs'), (717, 'found', 'RAs'), (718, 'found', 'RAs'),
		           (719, 'found', 'RAs'), (720, 'found', 'RAs'), (721, 'found', 'RAs'), (722, 'found', 'RAs'),
		           (723, 'found', 'RAs'), (724, 'found', 'RAs'), (725, 'found', 'RAs'), (726, 'found', 'RAs'),
		           (727, 'found', 'RAs'), (728, 'feasible', 'RAs'), (729, 'Evidence', 'RAs'), (730, 'established', 'RAs'),
		           (731, 'discovered', 'RAs'), (732, 'determined', 'RAs'), (733, 'demonstrating', 'RAs'),
		           (734, 'demonstrated', 'RAs'), (735, 'demonstrated', 'RAs'), (736, 'demonstrate', 'RAs'),
		           (737, 'demonstrate', 'RAs'), (738, 'demonstrate', 'RAs'), (739, 'demonstrate', 'RAs'),
		           (740, 'demonstrate', 'RAs'), (741, 'demonstrate', 'RAs'), (742, 'demonstrate', 'RAs'),
		           (743, 'confirming', 'RAs'), (744, 'confirming', 'RAs'), (745, 'confirmed', 'RAs'),
		           (746, 'confirm', 'RAs'), (747, 'confirm', 'RAs'), (748, 'conclude', 'RAs'), (749, 'conclude', 'RAs'),
		           (750, 'conclude', 'RAs'), (751, 'interpretation', 'RAs'), (752, 'observed', 'RAs'),
		           (753, 'Given', 'RAs'), (754, 'given', 'RAs'), (755, 'evidence', 'RAs'), (756, 'hypothesis', 'RAs'),
		           (757, 'notion', 'RAs'), (758, 'fact', 'RAs'), (759, 'discovery', 'RAs'), (760, 'suggests', 'RAs'),
		           (761, 'suggests', 'RAs'), (762, 'suggests', 'RAs'), (763, 'suggested', 'RAs'), (764, 'evidence', 'RAs'),
		           (765, 'shown', 'RAs'), (766, 'shown', 'RAs'), (767, 'shown', 'RAs'), (768, 'shown', 'RAs'),
		           (769, 'shown', 'RAs'), (770, 'shown', 'RAs'), (771, 'shown', 'RAs'), (772, 'shown', 'RAs'),
		           (773, 'showed', 'RAs'), (774, 'show', 'RAs'), (775, 'show', 'RAs'), (776, 'revealed', 'RAs'),
		           (777, 'revealed', 'RAs'), (778, 'reported', 'RAs'), (779, 'reported', 'RAs'), (780, 'reported', 'RAs'),
		           (781, 'reported', 'RAs'), (782, 'recommending', 'RAs'), (783, 'reported', 'RAs'),
		           (784, 'indicates', 'RAs'), (785, 'indicates', 'RAs'), (786, 'indicates', 'RAs'),
		           (787, 'indicates', 'RAs'), (788, 'indicate', 'RAs'), (789, 'hypothesis', 'RAs'), (790, 'found', 'RAs'),
		           (791, 'demonstrated', 'RAs'), (792, 'demonstrated', 'RAs'), (793, 'confirmed', 'RAs'),
		           (794, 'confirm', 'RAs'), (795, 'awareness', 'RAs'), (796, 'caveat', 'RAs'), (797, 'fact', 'RAs'),
		           (798, 'show', 'RAs'), (799, 'reasoned', 'RAs'), (800, 'posit', 'RAs'), (801, 'hypothesized', 'RAs'),
		           (802, 'hypothesized', 'RAs'), (803, 'hypothesized', 'RAs'), (804, 'hypothesized', 'RAs'),
		           (805, 'hypothesized', 'RAs'), (806, 'hypothesized', 'RAs'), (807, 'hypothesized', 'RAs'),
		           (808, 'envision', 'RAs'), (809, 'believe', 'RAs'), (810, 'anticipated', 'RAs'),
		           (811, 'anticipate', 'RAs'), (812, 'ensure', 'RAs'), (813, 'possibility', 'RAs'),
		           (814, 'suggests', 'RAs'), (815, 'suggests', 'RAs'), (816, 'shown', 'RAs'), (817, 'seems', 'RAs'),
		           (818, 'probability', 'RAs'), (819, 'possible', 'RAs'), (820, 'noting', 'RAs'), (821, 'note', 'RAs'),
		           (822, 'given', 'RAs'), (823, 'exclude', 'RAs'), (824, 'assumption', 'RAs'), (825, 'assumption', 'RAs'),
		           (826, 'assumption', 'RAs'), (827, 'assumed', 'RAs'), (828, 'acknowledge', 'RAs'),
		           (829, 'limitation', 'RAs'), (830, 'hypothesis', 'RAs'), (831, 'suggesting', 'RAs'),
		           (832, 'possibility', 'RAs'), (833, 'hypothesis ', 'RAs'), (834, 'What', 'Theses'),
		           (835, 'unlikely', 'Theses'), (836, 'unlikely', 'Theses'), (837, 'unlikely', 'Theses'),
		           (838, 'speculation', 'Theses'), (839, 'result', 'Theses'), (840, 'question', 'Theses'),
		           (841, 'problem', 'Theses'), (842, 'possibility', 'Theses'), (843, 'observations', 'Theses'),
		           (844, 'observation', 'Theses'), (845, 'indication', 'Theses'), (846, 'evidence', 'Theses'),
		           (847, 'evidence', 'Theses'), (848, 'findings', 'Theses'), (849, 'fact', 'Theses'),
		           (850, 'fact', 'Theses'), (851, 'fact', 'Theses'), (852, 'fact', 'Theses'), (853, 'fact', 'Theses'),
		           (854, 'fact', 'Theses'), (855, 'expectation', 'Theses'), (856, 'observation', 'Theses'),
		           (857, 'discordance', 'Theses'), (858, 'observation', 'Theses'), (859, 'evidence', 'Theses'),
		           (860, 'conclusion', 'Theses'), (861, 'surprising', 'Theses'), (862, 'surprising', 'Theses'),
		           (863, 'surprising', 'Theses'), (864, 'surprising', 'Theses'), (865, 'suggests', 'Theses'),
		           (866, 'suggests', 'Theses'), (867, 'suggests', 'Theses'), (868, 'suggests', 'Theses'),
		           (869, 'suggests', 'Theses'), (870, 'suggests', 'Theses'), (871, 'suggests', 'Theses'),
		           (872, 'suggests', 'Theses'), (873, 'suggests', 'Theses'), (874, 'suggests', 'Theses'),
		           (875, 'suggests', 'Theses'), (876, 'suggests', 'Theses'), (877, 'suggests', 'Theses'),
		           (878, 'suggests', 'Theses'), (879, 'suggests', 'Theses'), (880, 'suggests', 'Theses'),
		           (881, 'suggests', 'Theses'), (882, 'suggests', 'Theses'), (883, 'suggests', 'Theses'),
		           (884, 'suggests', 'Theses'), (885, 'suggests', 'Theses'), (886, 'suggests', 'Theses'),
		           (887, 'suggests', 'Theses'), (888, 'suggests', 'Theses'), (889, 'suggests', 'Theses'),
		           (890, 'suggests', 'Theses'), (891, 'suggests', 'Theses'), (892, 'suggests', 'Theses'),
		           (893, 'suggests', 'Theses'), (894, 'suggests', 'Theses'), (895, 'suggests', 'Theses'),
		           (896, 'suggesting', 'Theses'), (897, 'suggesting', 'Theses'), (898, 'suggesting', 'Theses'),
		           (899, 'suggesting', 'Theses'), (900, 'suggesting', 'Theses'), (901, 'suggesting', 'Theses'),
		           (902, 'suggesting', 'Theses'), (903, 'suggesting', 'Theses'), (904, 'suggesting', 'Theses'),
		           (905, 'suggesting', 'Theses'), (906, 'suggesting', 'Theses'), (907, 'suggesting', 'Theses'),
		           (908, 'suggesting', 'Theses'), (909, 'suggesting', 'Theses'), (910, 'suggesting', 'Theses'),
		           (911, 'suggested', 'Theses'), (912, 'suggested', 'Theses'), (913, 'suggested', 'Theses'),
		           (914, 'suggested', 'Theses'), (915, 'suggested', 'Theses'), (916, 'suggested', 'Theses'),
		           (917, 'suggested', 'Theses'), (918, 'suggested', 'Theses'), (919, 'suggested', 'Theses'),
		           (920, 'suggest', 'Theses'), (921, 'suggest', 'Theses'), (922, 'suggest', 'Theses'),
		           (923, 'suggest', 'Theses'), (924, 'suggest', 'Theses'), (925, 'suggest', 'Theses'),
		           (926, 'suggest', 'Theses'), (927, 'suggest', 'Theses'), (928, 'suggest', 'Theses'),
		           (929, 'suggest', 'Theses'), (930, 'suggest', 'Theses'), (931, 'suggest', 'Theses'),
		           (932, 'suggest', 'Theses'), (933, 'suggest', 'Theses'), (934, 'suggest', 'Theses'),
		           (935, 'suggest', 'Theses'), (936, 'suggest', 'Theses'), (937, 'suggest', 'Theses'),
		           (938, 'suggest', 'Theses'), (939, 'suggest', 'Theses'), (940, 'suggest', 'Theses'),
		           (941, 'suggest', 'Theses'), (942, 'suggest', 'Theses'), (943, 'suggest', 'Theses'),
		           (944, 'suggest', 'Theses'), (945, 'suggest', 'Theses'), (946, 'suggest', 'Theses'),
		           (947, 'striking', 'Theses'), (948, 'striking', 'Theses'), (949, 'speculated', 'Theses'),
		           (950, 'speculated', 'Theses'), (951, 'speculated', 'Theses'), (952, 'speculate', 'Theses'),
		           (953, 'signifying', 'Theses'), (954, 'shows', 'Theses'), (955, 'shows', 'Theses'),
		           (956, 'shows', 'Theses'), (957, 'shows', 'Theses'), (958, 'shows', 'Theses'), (959, 'shows', 'Theses'),
		           (960, 'shown', 'Theses'), (961, 'shown', 'Theses'), (962, 'shown', 'Theses'), (963, 'shown', 'Theses'),
		           (964, 'shown', 'Theses'), (965, 'shown', 'Theses'), (966, 'shown', 'Theses'), (967, 'shown', 'Theses'),
		           (968, 'shown', 'Theses'), (969, 'showing', 'Theses'), (970, 'showed', 'Theses'),
		           (971, 'showed', 'Theses'), (972, 'showed', 'Theses'), (973, 'showed', 'Theses'),
		           (974, 'showed', 'Theses'), (975, 'showed', 'Theses'), (976, 'showed', 'Theses'),
		           (977, 'showed', 'Theses'), (978, 'showed', 'Theses'), (979, 'showed', 'Theses'),
		           (980, 'showed', 'Theses'), (981, 'showed', 'Theses'), (982, 'showed', 'Theses'),
		           (983, 'showed', 'Theses'), (984, 'showed', 'Theses'), (985, 'showed', 'Theses'),
		           (986, 'showed', 'Theses'), (987, 'showed', 'Theses'), (988, 'showed', 'Theses'),
		           (989, 'showed', 'Theses'), (990, 'showed', 'Theses'), (991, 'showed', 'Theses'), (992, 'show', 'Theses'),
		           (993, 'show', 'Theses'), (994, 'show', 'Theses'), (995, 'show', 'Theses'), (996, 'seem', 'Theses'),
		           (997, 'revealed', 'Theses'), (998, 'revealed', 'Theses'), (999, 'revealed', 'Theses'),
		           (1000, 'report', 'Theses'), (1001, 'recognized', 'Theses'), (1002, 'proposed', 'Theses'),
		           (1003, 'predicting', 'Theses'), (1004, 'possible', 'Theses'), (1005, 'possible', 'Theses'),
		           (1006, 'possible', 'Theses'), (1007, 'possible', 'Theses'), (1008, 'possible', 'Theses'),
		           (1009, 'possible', 'Theses'), (1010, 'possible', 'Theses'), (1011, 'possible', 'Theses'),
		           (1012, 'plausible', 'Theses'), (1013, 'plausible', 'Theses'), (1014, 'observation', 'Theses'),
		           (1015, 'observed', 'Theses'), (1016, 'observed', 'Theses'), (1017, 'observed', 'Theses'),
		           (1018, 'observed', 'Theses'), (1019, 'noting', 'Theses'), (1020, 'noticeable', 'Theses'),
		           (1021, 'noteworthy', 'Theses'), (1022, 'noteworthy', 'Theses'), (1023, 'noted', 'Theses'),
		           (1024, 'noted', 'Theses'), (1025, 'noted', 'Theses'), (1026, 'Note', 'Theses'), (1027, 'note', 'Theses'),
		           (1028, 'note', 'Theses'), (1029, 'note', 'Theses'), (1030, 'note', 'Theses'), (1031, 'means', 'Theses'),
		           (1032, 'meaning', 'Theses'), (1033, 'meaning', 'Theses'), (1034, 'meaning', 'Theses'),
		           (1035, 'mean', 'Theses'), (1036, 'mean', 'Theses'), (1037, 'likely', 'Theses'),
		           (1038, 'likely', 'Theses'), (1039, 'likely', 'Theses'), (1040, 'likely', 'Theses'),
		           (1041, 'likely', 'Theses'), (1042, 'likely', 'Theses'), (1043, 'suggested', 'Theses'),
		           (1044, 'interesting', 'Theses'), (1045, 'interesting', 'Theses'), (1046, 'indicating', 'Theses'),
		           (1047, 'indicating', 'Theses'), (1048, 'indicating', 'Theses'), (1049, 'indicating', 'Theses'),
		           (1050, 'indicating', 'Theses'), (1051, 'indicating', 'Theses'), (1052, 'indicating', 'Theses'),
		           (1053, 'indicating', 'Theses'), (1054, 'indicating', 'Theses'), (1055, 'indicating', 'Theses'),
		           (1056, 'indicating', 'Theses'), (1057, 'indicating', 'Theses'), (1058, 'indicating', 'Theses'),
		           (1059, 'indicating', 'Theses'), (1060, 'indicating', 'Theses'), (1061, 'indicates', 'Theses'),
		           (1062, 'indicates', 'Theses'), (1063, 'indicates', 'Theses'), (1064, 'indicates', 'Theses'),
		           (1065, 'indicates', 'Theses'), (1066, 'indicates', 'Theses'), (1067, 'indicates', 'Theses'),
		           (1068, 'indicates', 'Theses'), (1069, 'indicates', 'Theses'), (1070, 'indicated', 'Theses'),
		           (1071, 'indicated', 'Theses'), (1072, 'indicated', 'Theses'), (1073, 'indicated', 'Theses'),
		           (1074, 'indicated', 'Theses'), (1075, 'indicated', 'Theses'), (1076, 'indicated', 'Theses'),
		           (1077, 'indicate', 'Theses'), (1078, 'indicate', 'Theses'), (1079, 'indicate', 'Theses'),
		           (1080, 'indicate', 'Theses'), (1081, 'indicate', 'Theses'), (1082, 'indicate', 'Theses'),
		           (1083, 'indicate', 'Theses'), (1084, 'indicate', 'Theses'), (1085, 'indicate', 'Theses'),
		           (1086, 'implying', 'Theses'), (1087, 'imply', 'Theses'), (1088, 'Given', 'Theses'),
		           (1089, 'Given', 'Theses'), (1090, 'Given', 'Theses'), (1091, 'Given', 'Theses'),
		           (1092, 'Given', 'Theses'), (1093, 'given', 'Theses'), (1094, 'given', 'Theses'),
		           (1095, 'found', 'Theses'), (1096, 'found', 'Theses'), (1097, 'found', 'Theses'),
		           (1098, 'found', 'Theses'), (1099, 'found', 'Theses'), (1100, 'found', 'Theses'),
		           (1101, 'found', 'Theses'), (1102, 'found', 'Theses'), (1103, 'found', 'Theses'),
		           (1104, 'found', 'Theses'), (1105, 'found', 'Theses'), (1106, 'found', 'Theses'),
		           (1107, 'found', 'Theses'), (1108, 'found', 'Theses'), (1109, 'found', 'Theses'),
		           (1110, 'evidence', 'Theses'), (1111, 'established', 'Theses'), (1112, 'established', 'Theses'),
		           (1113, 'ensure', 'Theses'), (1114, 'doubt', 'Theses'), (1115, 'discovered', 'Theses'),
		           (1116, 'determining', 'Theses'), (1117, 'demonstrating', 'Theses'), (1118, 'demonstrating', 'Theses'),
		           (1119, 'demonstrating', 'Theses'), (1120, 'demonstrating', 'Theses'), (1121, 'demonstrating', 'Theses'),
		           (1122, 'demonstrates', 'Theses'), (1123, 'demonstrates', 'Theses'), (1124, 'demonstrates', 'Theses'),
		           (1125, 'demonstrates', 'Theses'), (1126, 'demonstrates', 'Theses'), (1127, 'demonstrated', 'Theses'),
		           (1128, 'demonstrated', 'Theses'), (1129, 'demonstrated', 'Theses'), (1130, 'demonstrated', 'Theses'),
		           (1131, 'demonstrated', 'Theses'), (1132, 'demonstrated', 'Theses'), (1133, 'demonstrated', 'Theses'),
		           (1134, 'demonstrated', 'Theses'), (1135, 'demonstrated', 'Theses'), (1136, 'demonstrated', 'Theses'),
		           (1137, 'demonstrated', 'Theses'), (1138, 'demonstrated', 'Theses'), (1139, 'demonstrated', 'Theses'),
		           (1140, 'demonstrated', 'Theses'), (1141, 'demonstrated', 'Theses'), (1142, 'demonstrated', 'Theses'),
		           (1143, 'demonstrated', 'Theses'), (1144, 'demonstrated', 'Theses'), (1145, 'demonstrated', 'Theses'),
		           (1146, 'demonstrated', 'Theses'), (1147, 'demonstrated', 'Theses'), (1148, 'demonstrated', 'Theses'),
		           (1149, 'demonstrated', 'Theses'), (1150, 'demonstrated', 'Theses'), (1151, 'demonstrated', 'Theses'),
		           (1152, 'demonstrated', 'Theses'), (1153, 'demonstrated', 'Theses'), (1154, 'demonstrated', 'Theses'),
		           (1155, 'demonstrated', 'Theses'), (1156, 'demonstrated', 'Theses'), (1157, 'demonstrated', 'Theses'),
		           (1158, 'demonstrated', 'Theses'), (1159, 'demonstrated', 'Theses'), (1160, 'demonstrated', 'Theses'),
		           (1161, 'demonstrated', 'Theses'), (1162, 'demonstrated', 'Theses'), (1163, 'demonstrated', 'Theses'),
		           (1164, 'demonstrated', 'Theses'), (1165, 'demonstrated', 'Theses'), (1166, 'demonstrated', 'Theses'),
		           (1167, 'demonstrated', 'Theses'), (1168, 'demonstrate', 'Theses'), (1169, 'demonstrate', 'Theses'),
		           (1170, 'demonstrate', 'Theses'), (1171, 'demonstrate', 'Theses'), (1172, 'demonstrate', 'Theses'),
		           (1173, 'corroborated', 'Theses'), (1174, 'Considering', 'Theses'), (1175, 'Considering', 'Theses'),
		           (1176, 'confirms', 'Theses'), (1177, 'confirms', 'Theses'), (1178, 'confirming', 'Theses'),
		           (1179, 'confirming', 'Theses'), (1180, 'confirmed', 'Theses'), (1181, 'confirmed', 'Theses'),
		           (1182, 'confirmed', 'Theses'), (1183, 'confirmed', 'Theses'), (1184, 'concluded', 'Theses'),
		           (1185, 'conclude', 'Theses'), (1186, 'conclude', 'Theses'), (1187, 'concerns', 'Theses'),
		           (1188, 'clear', 'Theses'), (1189, 'assuming', 'Theses'), (1190, 'appears', 'Theses'),
		           (1191, 'appears', 'Theses'), (1192, 'appears', 'Theses'), (1193, 'appears', 'Theses'),
		           (1194, 'appeared', 'Theses'), (1195, 'apparent', 'Theses'), (1196, 'observation', 'Theses'),
		           (1197, 'agrees', 'Theses'), (1198, 'agreed', 'Theses'), (1199, 'affirming', 'Theses'),
		           (1200, 'thought', 'Theses'), (1201, 'thought', 'Theses'), (1202, 'thought', 'Theses'),
		           (1203, 'thought', 'Theses'), (1204, 'thought', 'Theses'), (1205, 'thought', 'Theses'),
		           (1206, 'thought', 'Theses'), (1207, 'thought', 'Theses'), (1208, 'thought', 'Theses'),
		           (1209, 'thought', 'Theses'), (1210, 'thickened', 'Theses'), (1211, 'findings', 'Theses'),
		           (1212, 'fact', 'Theses'), (1213, 'fact', 'Theses'), (1214, 'fact', 'Theses'), (1215, 'fact', 'Theses'),
		           (1216, 'demonstration', 'Theses'), (1217, 'evidence', 'Theses'), (1218, 'surprising', 'Theses'),
		           (1219, 'surprising', 'Theses'), (1220, 'surprise', 'Theses'), (1221, 'supported', 'Theses'),
		           (1222, 'suggests', 'Theses'), (1223, 'suggests', 'Theses'), (1224, 'suggests', 'Theses'),
		           (1225, 'suggests', 'Theses'), (1226, 'suggests', 'Theses'), (1227, 'suggests', 'Theses'),
		           (1228, 'suggests', 'Theses'), (1229, 'suggests', 'Theses'), (1230, 'suggests', 'Theses'),
		           (1231, 'suggests', 'Theses'), (1232, 'suggesting', 'Theses'), (1233, 'suggesting', 'Theses'),
		           (1234, 'suggesting', 'Theses'), (1235, 'suggesting', 'Theses'), (1236, 'suggesting', 'Theses'),
		           (1237, 'suggesting', 'Theses'), (1238, 'suggesting', 'Theses'), (1239, 'suggested', 'Theses'),
		           (1240, 'suggested', 'Theses'), (1241, 'suggested', 'Theses'), (1242, 'suggested', 'Theses'),
		           (1243, 'suggested', 'Theses'), (1244, 'suggested', 'Theses'), (1245, 'suggested', 'Theses'),
		           (1246, 'suggested', 'Theses'), (1247, 'suggested', 'Theses'), (1248, 'suggested', 'Theses'),
		           (1249, 'suggested', 'Theses'), (1250, 'suggested', 'Theses'), (1251, 'suggested', 'Theses'),
		           (1252, 'suggest', 'Theses'), (1253, 'suggest', 'Theses'), (1254, 'suggest', 'Theses'),
		           (1255, 'suggest', 'Theses'), (1256, 'suggest', 'Theses'), (1257, 'suggest', 'Theses'),
		           (1258, 'submit', 'Theses'), (1259, 'stating', 'Theses'), (1260, 'shows', 'Theses'),
		           (1261, 'shown', 'Theses'), (1262, 'shown', 'Theses'), (1263, 'shown', 'Theses'),
		           (1264, 'shown', 'Theses'), (1265, 'shown', 'Theses'), (1266, 'shown', 'Theses'),
		           (1267, 'shown', 'Theses'), (1268, 'shown', 'Theses'), (1269, 'shown', 'Theses'),
		           (1270, 'shown', 'Theses'), (1271, 'shown', 'Theses'), (1272, 'shown', 'Theses'),
		           (1273, 'shown', 'Theses'), (1274, 'shown', 'Theses'), (1275, 'shown', 'Theses'),
		           (1276, 'shown', 'Theses'), (1277, 'showing', 'Theses'), (1278, 'showed', 'Theses'),
		           (1279, 'showed', 'Theses'), (1280, 'showed', 'Theses'), (1281, 'showed', 'Theses'),
		           (1282, 'showed', 'Theses'), (1283, 'showed', 'Theses'), (1284, 'showed', 'Theses'),
		           (1285, 'showed', 'Theses'), (1286, 'showed', 'Theses'), (1287, 'showed', 'Theses'),
		           (1288, 'showed', 'Theses'), (1289, 'showed', 'Theses'), (1290, 'showed', 'Theses'),
		           (1291, 'showed', 'Theses'), (1292, 'showed', 'Theses'), (1293, 'showed', 'Theses'),
		           (1294, 'seen', 'Theses'), (1295, 'seems', 'Theses'), (1296, 'revealed', 'Theses'),
		           (1297, 'reveal', 'Theses'), (1298, 'reported', 'Theses'), (1299, 'reported', 'Theses'),
		           (1300, 'reported', 'Theses'), (1301, 'reported', 'Theses'), (1302, 'reported', 'Theses'),
		           (1303, 'reported', 'Theses'), (1304, 'reported', 'Theses'), (1305, 'reported', 'Theses'),
		           (1306, 'reported', 'Theses'), (1307, 'reported', 'Theses'), (1308, 'reported', 'Theses'),
		           (1309, 'reported', 'Theses'), (1310, 'reported', 'Theses'), (1311, 'data', 'Theses'),
		           (1312, 'recommended', 'Theses'), (1313, 'recognised', 'Theses'), (1314, 'recognised', 'Theses'),
		           (1315, 'recognised', 'Theses'), (1316, 'recognised', 'Theses'), (1317, 'proposed', 'Theses'),
		           (1318, 'propose', 'Theses'), (1319, 'propose', 'Theses'), (1320, 'projected', 'Theses'),
		           (1321, 'postulated', 'Theses'), (1322, 'plausible', 'Theses'), (1323, 'understanding', 'Theses'),
		           (1324, 'observed', 'Theses'), (1325, 'mentioning', 'Theses'), (1326, 'means', 'Theses'),
		           (1327, 'likely', 'Theses'), (1328, 'likely', 'Theses'), (1329, 'likely', 'Theses'),
		           (1330, 'likely', 'Theses'), (1331, 'likely', 'Theses'), (1332, 'known', 'Theses'),
		           (1333, 'known', 'Theses'), (1334, 'indicating', 'Theses'), (1335, 'indicates', 'Theses'),
		           (1336, 'indicates', 'Theses'), (1337, 'indicates', 'Theses'), (1338, 'indicated', 'Theses'),
		           (1339, 'indicated', 'Theses'), (1340, 'indicated', 'Theses'), (1341, 'indicate', 'Theses'),
		           (1342, 'implying', 'Theses'), (1343, 'implying', 'Theses'), (1344, 'implicating', 'Theses'),
		           (1345, 'highlight', 'Theses'), (1346, 'shown', 'Theses'), (1347, 'shown', 'Theses'),
		           (1348, 'shown', 'Theses'), (1349, 'Given', 'Theses'), (1350, 'found', 'Theses'),
		           (1351, 'found', 'Theses'), (1352, 'found', 'Theses'), (1353, 'found', 'Theses'),
		           (1354, 'found', 'Theses'), (1355, 'found', 'Theses'), (1356, 'found', 'Theses'),
		           (1357, 'evidence', 'Theses'), (1358, 'evidence', 'Theses'), (1359, 'evidence', 'Theses'),
		           (1360, 'evidence', 'Theses'), (1361, 'evidence', 'Theses'), (1362, 'estimated', 'Theses'),
		           (1363, 'estimated', 'Theses'), (1364, 'estimated', 'Theses'), (1365, 'established', 'Theses'),
		           (1366, 'documented', 'Theses'), (1367, 'documented', 'Theses'), (1368, 'discovered', 'Theses'),
		           (1369, 'described', 'Theses'), (1370, 'demonstrated', 'Theses'), (1371, 'demonstrated', 'Theses'),
		           (1372, 'demonstrated', 'Theses'), (1373, 'demonstrated', 'Theses'), (1374, 'demonstrated', 'Theses'),
		           (1375, 'demonstrated', 'Theses'), (1376, 'demonstrated', 'Theses'), (1377, 'demonstrated', 'Theses'),
		           (1378, 'demonstrated', 'Theses'), (1379, 'demonstrated', 'Theses'), (1380, 'demonstrated', 'Theses'),
		           (1381, 'demonstrated', 'Theses'), (1382, 'demonstrated', 'Theses'), (1383, 'demonstrated', 'Theses'),
		           (1384, 'demonstrated', 'Theses'), (1385, 'demonstrated', 'Theses'), (1386, 'demonstrated', 'Theses'),
		           (1387, 'demonstrated', 'Theses'), (1388, 'demonstrated', 'Theses'), (1389, 'demonstrated', 'Theses'),
		           (1390, 'demonstrated', 'Theses'), (1391, 'demonstrate', 'Theses'), (1392, 'demonstrate', 'Theses'),
		           (1393, 'credible', 'Theses'), (1394, 'consistent', 'Theses'), (1395, 'Considering', 'Theses'),
		           (1396, 'Considering', 'Theses'), (1397, 'considering', 'Theses'), (1398, 'considering', 'Theses'),
		           (1399, 'confirmed', 'Theses'), (1400, 'confirmed', 'Theses'), (1401, 'concluded', 'Theses'),
		           (1402, 'concluded', 'Theses'), (1403, 'believed', 'Theses'), (1404, 'believed', 'Theses'),
		           (1405, 'believed', 'Theses'), (1406, 'believed', 'Theses'), (1407, 'believed', 'Theses'),
		           (1408, 'believe', 'Theses'), (1409, 'recognition', 'Theses'), (1410, 'acknowledging', 'Theses'),
		           (1411, 'accepted', 'Theses'), (1412, 'accepted', 'Theses'), (1413, 'range', 'Theses'),
		           (1414, 'hope', 'Theses'), (1415, 'hoped', 'Theses'), (1416, 'hoped', 'Theses'),
		           (1417, 'hoped', 'Theses'), (1418, 'hoped', 'Theses'), (1419, 'hoped', 'Theses'),
		           (1420, 'hoped', 'Theses'), (1421, 'hoped', 'Theses'), (1422, 'hoped', 'Theses'),
		           (1423, 'hoped', 'Theses'), (1424, 'hoped', 'Theses'), (1425, 'expect', 'Theses'),
		           (1426, 'confirm', 'Theses'), (1427, 'ensure', 'Theses'), (1428, 'affirm', 'Theses'),
		           (1429, 'rationale', 'Theses'), (1430, 'probability', 'Theses'), (1431, 'observation', 'Theses'),
		           (1432, 'fact', 'Theses'), (1433, 'difference', 'Theses'), (1434, 'difference', 'Theses'),
		           (1435, 'suggests', 'Theses'), (1436, 'suggests', 'Theses'), (1437, 'showed', 'Theses'),
		           (1438, 'recognize', 'Theses'), (1439, 'probability', 'Theses'), (1440, 'application', 'Theses'),
		           (1441, 'noteworthy', 'Theses'), (1442, 'noteworthy', 'Theses'), (1443, 'noted', 'Theses'),
		           (1444, 'noted', 'Theses'), (1445, 'doubt', 'Theses'), (1446, 'method', 'Theses'),
		           (1447, 'insist', 'Theses'), (1448, 'found', 'Theses'), (1449, 'ensure', 'Theses'),
		           (1450, 'ensure', 'Theses'), (1451, 'ensure', 'Theses'), (1452, 'ensure', 'Theses'),
		           (1453, 'ensure    ', 'Theses'), (1454, 'ensure', 'Theses'), (1455, 'demonstrated', 'Theses'),
		           (1456, 'corroborated', 'Theses'), (1457, 'concern', 'Theses'), (1458, 'check', 'Theses'),
		           (1459, 'assumes', 'Theses'), (1460, 'assumes', 'Theses'), (1461, 'ascertained', 'Theses'),
		           (1462, 'reason', 'Theses'), (1463, 'unlikely', 'Theses'), (1464, 'thought', 'Theses'),
		           (1465, 'thought', 'Theses'), (1466, 'advantage', 'Theses'), (1467, 'idea', 'Theses'),
		           (1468, 'hypothesis', 'Theses'), (1469, 'hypothesis', 'Theses'), (1470, 'hypothesis', 'Theses'),
		           (1471, 'hypothesis', 'Theses'), (1472, 'hypothesis', 'Theses'), (1473, 'hypothesis', 'Theses'),
		           (1474, 'fact', 'Theses'), (1475, 'hypothesis', 'Theses'), (1476, 'suggesting', 'Theses'),
		           (1477, 'suggest', 'Theses'), (1478, 'proposed', 'Theses'), (1479, 'presumed', 'Theses'),
		           (1480, 'postulate', 'Theses'), (1481, 'possible', 'Theses'), (1482, 'possible', 'Theses'),
		           (1483, 'possibility', 'Theses'), (1484, 'model', 'Theses'), (1485, 'likely', 'Theses'),
		           (1486, 'likely', 'Theses'), (1487, 'likely', 'Theses'), (1488, 'concern', 'Theses'),
		           (1489, 'implies', 'Theses'), (1490, 'hypothesized', 'Theses'), (1491, 'hypothesized', 'Theses'),
		           (1492, 'hypothesize', 'Theses'), (1493, 'hypothesize', 'Theses'), (1494, 'hypothesised', 'Theses'),
		           (1495, 'hypothesised', 'Theses'), (1496, 'Hypothesis', 'Theses'), (1497, 'hypothesis', 'Theses'),
		           (1498, 'hypothesis', 'Theses'), (1499, 'hypothesis', 'Theses'), (1500, 'hypothesis', 'Theses'),
		           (1501, 'hoped', 'Theses'), (1502, 'hoped', 'Theses'), (1503, 'hoped', 'Theses'),
		           (1504, 'hoped', 'Theses'), (1505, 'expected', 'Theses'), (1506, 'expected', 'Theses'),
		           (1507, 'expected', 'Theses'), (1508, 'demonstrating', 'Theses'), (1509, 'Assuming', 'Theses'),
		           (1510, 'anticipated', 'Theses'), (1511, 'theory', 'Theses'), (1512, 'hypothesise', 'Theses')]
		df = (pd.DataFrame
		      .from_records(records, columns=['index', 'Text', 'Genre'])
		      .set_index('index'))
		c = TermDocMatrixFromPandas(df,
		                            category_col='Genre',
		                            text_col='Text',
		                            nlp=whitespace_nlp).build()
		c.get_term_freq_df()  # smoke-test: build the TermDocMatrix's term-frequency DataFrame (result unused)
		c = CorpusFromPandas(df,
		                     category_col='Genre',
		                     text_col='Text',
		                     nlp=whitespace_nlp).build()
		df = c.get_term_freq_df()
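Continuing from the block above, where df now holds the Corpus's term-frequency DataFrame, a minimal sketch of how it might be inspected (the ' freq' column suffix assumes get_term_freq_df()'s default label_append):

# Sketch: the ten most frequent hedging terms in each genre.
# Column names assume get_term_freq_df()'s default ' freq' suffix.
print(df.sort_values('Theses freq', ascending=False).head(10))
print(df.sort_values('RAs freq', ascending=False).head(10))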
Example #27
import spacy

from scattertext import SampleCorpora
from scattertext import produce_scattertext_explorer, produce_scattertext_html
from scattertext.CorpusFromPandas import CorpusFromPandas

nlp = spacy.load('en_core_web_sm')  # the 'en' shortcut was removed in spaCy v3; load the English model by its package name
convention_df = SampleCorpora.ConventionData2012.get_data()
corpus = CorpusFromPandas(convention_df,
                          category_col='party',
                          text_col='text',
                          nlp=nlp).build()

html = produce_scattertext_html(corpus,
                                category='democrat',
                                category_name='Democratic',
                                not_category_name='Republican',
                                minimum_term_frequency=5,
                                pmi_filter_thresold=4,
                                width_in_pixels=1000)
open('./simple.html', 'wb').write(html.encode('utf-8'))
print('Open ./simple.html in Chrome or Firefox.')
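The import of produce_scattertext_explorer above is otherwise unused; as a rough sketch on the same corpus, the interactive explorer variant could be produced as follows (the output file name and the parameter values are illustrative assumptions, not part of the original example):

# Sketch: interactive explorer for the same corpus; parameter values are illustrative.
explorer_html = produce_scattertext_explorer(corpus,
                                             category='democrat',
                                             category_name='Democratic',
                                             not_category_name='Republican',
                                             minimum_term_frequency=5,
                                             width_in_pixels=1000,
                                             metadata=convention_df['speaker'])
open('./simple_explorer.html', 'wb').write(explorer_html.encode('utf-8'))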
 def test_chinese_error(self):
     with self.assertRaises(Exception):
         CorpusFromPandas(self.df, 'category', 'text',
                          nlp=chinese_nlp).build()
Example #29
import numpy as np
import spacy
from sklearn.linear_model import LogisticRegression

from scattertext import SampleCorpora
from scattertext import produce_scattertext_explorer
from scattertext.CorpusFromPandas import CorpusFromPandas

nlp = spacy.load('en_core_web_sm')  # the 'en' shortcut was removed in spaCy v3; load the English model by its package name
convention_df = SampleCorpora.ConventionData2012.get_data()
corpus = CorpusFromPandas(convention_df,
                          category_col='party',
                          text_col='text',
                          nlp=nlp).build()

term_freq_df = corpus.get_term_freq_df()

def scale(ar):
	# min-max scale an array to the range [0, 1]
	return (ar - ar.min()) / (ar.max() - ar.min())

def zero_centered_scale(ar):
	# scale positive and negative values separately, then shift so the result lies in
	# [0, 1] with neutral scores at 0.5; note that ar is modified in place
	ar[ar > 0] = scale(ar[ar > 0])
	ar[ar < 0] = -scale(-ar[ar < 0])
	return (ar + 1) / 2.


# log-transform each term's total frequency and scale it to [0, 1]
frequencies_scaled = scale(np.log(term_freq_df.sum(axis=1).values))
# per-term logistic regression coefficients for predicting the 'democrat' category
scores = corpus.get_logreg_coefs('democrat',
                                 LogisticRegression(penalty='l2', C=10, max_iter=10000, n_jobs=-1))
scores_scaled = zero_centered_scale(scores)  # Democratic-leaning terms map above 0.5, Republican-leaning below
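The snippet ends after computing scores_scaled even though produce_scattertext_explorer is imported; a hedged sketch of how the scaled coefficients might be passed to the explorer is shown below (the scores keyword and the output file name are assumptions, not taken from the original):

# Sketch (assumed usage): color terms by the scaled logistic-regression coefficients.
html = produce_scattertext_explorer(corpus,
                                    category='democrat',
                                    category_name='Democratic',
                                    not_category_name='Republican',
                                    minimum_term_frequency=5,
                                    width_in_pixels=1000,
                                    scores=scores_scaled,
                                    metadata=convention_df['speaker'])
open('./demo_logreg.html', 'wb').write(html.encode('utf-8'))
print('Open ./demo_logreg.html in Chrome or Firefox.')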
Example #30
from scattertext.Scalers import dense_rank
from scattertext.termscoring.DeltaJSDivergence import DeltaJSDivergence

from scattertext.termcompaction.AssociationCompactor import JSDCompactor

from scattertext import SampleCorpora, whitespace_nlp_with_sentences, produce_frequency_explorer
from scattertext.CorpusFromPandas import CorpusFromPandas

convention_df = SampleCorpora.ConventionData2012.get_data()
corpus = CorpusFromPandas(
    convention_df,
    category_col='party',
    text_col='text',
    nlp=whitespace_nlp_with_sentences).build().get_unigram_corpus().compact(
        JSDCompactor(1000))

html = produce_frequency_explorer(corpus,
                                  category='democrat',
                                  category_name='Democratic',
                                  not_category_name='Republican',
                                  minimum_term_frequency=0,
                                  pmi_threshold_coefficient=0,
                                  width_in_pixels=1000,
                                  metadata=convention_df['speaker'],
                                  term_scorer=DeltaJSDivergence(),
                                  transform=dense_rank,
                                  term_metadata_df=corpus.get_term_freq_df(''),
                                  enable_term_category_description=False)

open('./demo_JSDivergence.html', 'wb').write(html.encode('utf-8'))
print('Open ./demo_JSDivergence.html in Chrome or Firefox.')
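A small follow-up sketch for checking what JSDCompactor(1000) left in the vocabulary; the 'democrat freq' column name assumes get_term_freq_df()'s default ' freq' suffix:

# Sketch: verify the compacted vocabulary size and peek at the most frequent surviving terms.
compact_df = corpus.get_term_freq_df()
print(len(compact_df))  # at most 1000 unigrams after JSDCompactor(1000)
print(compact_df.sort_values('democrat freq', ascending=False).head(10))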