Пример #1
0
	def test_keep_only_these_categories(self):
		"""Verify that keep_only_these_categories narrows a corpus to the named categories.

		The filtered corpus must report exactly ['hamlet', 'swift'] as its
		categories and expose fewer terms than the unfiltered corpus.  Asking
		for a category the corpus does not contain must raise AssertionError,
		unless True is passed as the second positional argument.
		"""
		# pd.np was deprecated in pandas 1.0 and removed in 2.0, so numpy is
		# imported directly instead of being reached through the pandas namespace.
		import numpy as np

		# The fixture is transposed so that each row is one (category, text) pair.
		df = pd.DataFrame(data=np.array(get_docs_categories_semiotic()).T,
		                  columns=['category', 'text'])
		corpus = CorpusFromPandas(df, 'category', 'text', nlp=whitespace_nlp).build()
		hamlet_swift_corpus = corpus.keep_only_these_categories(['hamlet', 'swift'])
		self.assertEqual(hamlet_swift_corpus.get_categories(), ['hamlet', 'swift'])
		self.assertGreater(len(corpus.get_terms()), len(hamlet_swift_corpus.get_terms()))
		# A category name that is absent from the corpus is rejected by default ...
		with self.assertRaises(AssertionError):
			corpus.keep_only_these_categories(['hamlet', 'swift', 'asdjklasfd'])
		# ... but the same request must not raise when the second argument is True
		# (presumably an "ignore unknown categories" flag -- confirm against the
		# keep_only_these_categories signature).
		corpus.keep_only_these_categories(['hamlet', 'swift', 'asdjklasfd'], True)
Пример #2
0
 def test_keep_only_these_categories(self):
     """Verify that keep_only_these_categories narrows a corpus to the named categories.

     The filtered corpus must report exactly ['hamlet', 'swift'] as its
     categories and expose fewer terms than the unfiltered corpus.  Asking
     for a category the corpus does not contain must raise AssertionError,
     unless True is passed as the second positional argument.
     """
     # pd.np was deprecated in pandas 1.0 and removed in 2.0, so numpy is
     # imported directly instead of being reached through the pandas namespace.
     import numpy as np

     # The fixture is transposed so that each row is one (category, text) pair.
     df = pd.DataFrame(data=np.array(get_docs_categories_semiotic()).T,
                       columns=['category', 'text'])
     corpus = CorpusFromPandas(df, 'category', 'text', nlp=whitespace_nlp).build()
     hamlet_swift_corpus = corpus.keep_only_these_categories(['hamlet', 'swift'])
     self.assertEqual(hamlet_swift_corpus.get_categories(), ['hamlet', 'swift'])
     self.assertGreater(len(corpus.get_terms()), len(hamlet_swift_corpus.get_terms()))
     # A category name that is absent from the corpus is rejected by default ...
     with self.assertRaises(AssertionError):
         corpus.keep_only_these_categories(['hamlet', 'swift', 'asdjklasfd'])
     # ... but the same request must not raise when the second argument is True
     # (presumably an "ignore unknown categories" flag -- confirm against the
     # keep_only_these_categories signature).
     corpus.keep_only_these_categories(['hamlet', 'swift', 'asdjklasfd'], True)
Пример #3
0
    def test_remove_categories(self):
        """Verify that remove_categories(['swift']) matches a corpus built without 'swift'.

        The corpus produced by removing the 'swift' category is compared
        against a reference corpus constructed directly from the rows of the
        same data frame whose category is not 'swift'.
        """
        # np is used directly: pd.np was deprecated in pandas 1.0 and removed
        # in 2.0, so going through the pandas namespace fails on current pandas.
        df = pd.DataFrame(data=np.array(get_docs_categories_semiotic()).T,
                          columns=['category', 'text'])
        corpus = CorpusFromPandas(df, 'category', 'text',
                                  nlp=whitespace_nlp).build()
        swiftless = corpus.remove_categories(['swift'])

        # Reference corpus: same pipeline, but 'swift' rows are dropped up front.
        swiftless_constructed = CorpusFromPandas(df[df['category'] != 'swift'],
                                                 'category',
                                                 'text',
                                                 nlp=whitespace_nlp).build()
        # The remaining labels must equal the original labels with every
        # occurrence of the 'swift' category index filtered out.
        np.testing.assert_equal([
            i for i in corpus._y if i != corpus.get_categories().index('swift')
        ], swiftless._y)
        # Labels and the document matrix must stay row-aligned.
        self.assertEqual(swiftless._y.shape[0], swiftless._X.shape[0])
        # Both routes must agree on matrix shape, vocabulary and texts.
        self.assertEqual(swiftless_constructed._X.shape, swiftless._X.shape)
        self.assertEqual(set(swiftless_constructed.get_terms()),
                         set(swiftless.get_terms()))
        pd.testing.assert_series_equal(swiftless_constructed.get_texts(),
                                       swiftless.get_texts())
Пример #4
0
    def test_remove_categories(self):
        """Verify that remove_categories(['swift']) matches a corpus built without 'swift'.

        The corpus produced by removing the 'swift' category is compared
        against a reference corpus constructed directly from the rows of the
        same data frame whose category is not 'swift'.
        """
        # np is used directly: pd.np was deprecated in pandas 1.0 and removed
        # in 2.0, so going through the pandas namespace fails on current pandas.
        df = pd.DataFrame(data=np.array(get_docs_categories_semiotic()).T,
                          columns=['category', 'text'])
        corpus = CorpusFromPandas(df, 'category', 'text', nlp=whitespace_nlp).build()
        swiftless = corpus.remove_categories(['swift'])

        # Reference corpus: same pipeline, but 'swift' rows are dropped up front.
        swiftless_constructed = CorpusFromPandas(df[df['category'] != 'swift'],
                                                 'category',
                                                 'text',
                                                 nlp=whitespace_nlp).build()
        # The remaining labels must equal the original labels with every
        # occurrence of the 'swift' category index filtered out.
        np.testing.assert_equal(
            [i for i in corpus._y if i != corpus.get_categories().index('swift')],
            swiftless._y
        )
        # Labels and the document matrix must stay row-aligned.
        self.assertEqual(swiftless._y.shape[0], swiftless._X.shape[0])
        # Both routes must agree on matrix shape, vocabulary and texts.
        self.assertEqual(swiftless_constructed._X.shape, swiftless._X.shape)
        self.assertEqual(set(swiftless_constructed.get_terms()),
                         set(swiftless.get_terms()))
        pd.testing.assert_series_equal(swiftless_constructed.get_texts(),
                                       swiftless.get_texts())
Пример #5
0
                                    not_category_name='Republican',
                                    minimum_term_frequency=5,
                                    width_in_pixels=1000,
                                    x_coords=frequencies_scaled,
                                    y_coords=zeta_scaled_for_charting,
                                    scores=zeta_i_j,
                                    sort_by_dist=False,
                                    metadata=convention_df['speaker'],
                                    x_label='Log Frequency',
                                    y_label='Log Odds Ratio w/ Prior (a_w=0.01)')
'''

# Fetch the per-term corpus/background counts and keep only the rows whose
# corpus count is positive: where() blanks the rows that fail the condition
# and dropna() then discards rows containing missing values.
bg_df = corpus.get_term_and_background_counts()
bg_df = bg_df.where(lambda counts: counts['corpus'] > 0).dropna()

# Fold the corpus counts into the background counts.
# NOTE(review): in the code visible here bg_df is afterwards only consulted
# for its index; the incremented column is not read again -- confirm whether
# `priors` below was meant to be taken from it.
bg_df['background'] += bg_df['corpus']

# Drop every corpus term that did not survive the filtering above.
corpus_bg = corpus.remove_terms(
    set(corpus.get_terms()).difference(bg_df.index)
)

# Background counts for the reduced corpus, ordered like its term list.
priors = corpus_bg.get_term_and_background_counts()
priors = priors.reindex(corpus_bg.get_terms())['background']

# The meaning of the second positional argument (10) is defined by
# LogOddsRatioInformativeDirichletPrior -- see that class for details.
term_scorer = LogOddsRatioInformativeDirichletPrior(
    priors.values,
    10,
)

# JavaScript callback source, passed below as get_tooltip_content.
tooltip_context = '''(function(d) {
	return d.term+"<br/>Count ratio (per 25k): "+d.cat25k+":"+d.ncat25k+"<br/>Z-score: "+ Number(Math.round(d.os+'e3')+'e-3');
})'''

html = produce_fightin_words_explorer(
    corpus_bg,
    category='democrat',
    category_name='Democratic',
    not_category_name='Republican',
    minimum_term_frequency=5,
    get_tooltip_content=tooltip_context,
    term_scorer=term_scorer,
)