def test_score_transform(self):
     """Check that an identity ``score_transform`` changes the emitted scores.

     The 'hamlet' chart dict is built twice from the test corpus: once
     without a ``score_transform`` argument and once with ``lambda x: x``.
     The sums of the per-datum ``'s'`` values must differ.
     """
     corpus = get_test_corpus()
     default_dict = ScatterChart(
         term_doc_matrix=corpus,
         minimum_term_frequency=0).to_dict('hamlet')
     identity_dict = ScatterChart(
         term_doc_matrix=corpus,
         minimum_term_frequency=0,
         score_transform=lambda x: x).to_dict('hamlet')
     # A TestCase assertion is used instead of a bare ``assert``: it is not
     # removed under ``python -O`` and it reports both sums on failure.
     self.assertNotEqual(
         sum(datum['s'] for datum in default_dict['data']),
         sum(datum['s'] for datum in identity_dict['data']))
示例#2
0
    def test_neutral_categories(self):
        """Exercise ``use_neutral_categories`` with and without ``drop_zero_priors``.

        Both factories use 'hamlet' as the category, 'swift' as the only
        not-category and a starting count of 0.001.
        """
        smoothing = 0.001

        # Without dropping: the smallest prior equals the starting count and
        # there are as many priors as ``corpus._X`` has columns.
        corpus = get_test_corpus()
        factory = PriorFactory(corpus,
                               'hamlet',
                               starting_count=smoothing,
                               not_categories=['swift'])
        priors = factory.use_neutral_categories().get_priors()
        self.assertEqual(priors.min(), smoothing)
        self.assertEqual(priors.shape[0], corpus._X.shape[1])

        # With zero priors dropped (on a freshly loaded corpus): the priors
        # must equal the positive 'jay-z/r. kelly freq' counts plus the
        # starting count.
        corpus = get_test_corpus()
        factory = PriorFactory(corpus,
                               'hamlet',
                               starting_count=smoothing,
                               not_categories=['swift'])
        priors = (factory.use_neutral_categories()
                  .drop_zero_priors()
                  .get_priors())

        neutral_freq = corpus.get_term_freq_df()['jay-z/r. kelly freq']
        jzcnts = neutral_freq.where(lambda x: x > 0).dropna()
        np.testing.assert_equal(priors.values, jzcnts.values + smoothing)
示例#3
0
 def test_all_categories(self):
     """Check ``use_all_categories`` priors against summed term frequencies.

     With ``starting_count=0`` the built priors must have one entry per row
     of ``corpus.get_term_freq_df()`` and equal that frame's row sums.
     """
     corpus = get_test_corpus()
     factory = PriorFactory(corpus, starting_count=0, category='hamlet')
     # build() returns a pair; only the priors are inspected here.
     priors, _ = factory.use_all_categories().build()

     term_freqs = corpus.get_term_freq_df()
     self.assertEqual(len(priors), len(term_freqs))

     expected_totals = corpus.get_term_freq_df().sum(axis=1).values
     np.testing.assert_equal(priors.values, expected_totals)
	def test_get_custom_term_frequencies(self):
		"""Check priors built from a user-supplied term-frequency Series.

		Custom counts ``{'halt': 3, 'i': 8}`` with a starting count of 0.04
		and zero priors dropped must leave exactly those two terms in the
		returned corpus, with priors 3.04 and 8.04.
		"""
		corpus = get_test_corpus()
		builder = PriorFactory(corpus, starting_count=0.04)
		builder = builder.use_custom_term_frequencies(
			pd.Series({'halt': 3, 'i': 8}))
		builder = builder.drop_zero_priors()

		priors, clean_corpus = builder.build()

		remaining_terms = set(clean_corpus.get_terms())
		self.assertEqual(remaining_terms, {'i', 'halt'})
		# Sort so the comparison does not depend on term order.
		np.testing.assert_equal(priors.sort_values().values, [3.04, 8.04])
	def test_multi_categories(self):
		"""Compare explorer dicts built with and without ``not_categories``.

		Restricting the not-categories to 'swift' must change
		``info['not_category_internal_names']`` but leave the document
		labels and document categories unchanged.
		"""
		corpus = get_test_corpus()

		explorer = ScatterChartExplorer(corpus=corpus, minimum_term_frequency=0)
		versus_all = explorer.to_dict('hamlet')

		explorer = ScatterChartExplorer(corpus=corpus, minimum_term_frequency=0)
		versus_swift = explorer.to_dict('hamlet', not_categories=['swift'])

		names_all = set(versus_all['info']['not_category_internal_names'])
		names_swift = set(versus_swift['info']['not_category_internal_names'])
		self.assertNotEqual(names_all, names_swift)

		# Same check for each per-document field, in the original order.
		for field in ('labels', 'categories'):
			self.assertEqual(list(versus_all['docs'][field]),
			                 list(versus_swift['docs'][field]))
示例#6
0
 def test_multi_categories(self):
     """Compare chart dicts built with and without ``not_categories``.

     Restricting the not-categories to 'swift' must change
     ``info['not_category_internal_names']``, while ``info['categories']``
     must equal ``corpus.get_categories()`` in both dicts.
     """
     corpus = get_test_corpus()

     chart = ScatterChart(term_doc_matrix=corpus, minimum_term_frequency=0)
     versus_all = chart.to_dict('hamlet')

     chart = ScatterChart(term_doc_matrix=corpus, minimum_term_frequency=0)
     versus_swift = chart.to_dict('hamlet', not_categories=['swift'])

     self.assertNotEqual(
         set(versus_all['info']['not_category_internal_names']),
         set(versus_swift['info']['not_category_internal_names']))

     # Both dicts must report the corpus's full category list.
     for chart_dict in (versus_all, versus_swift):
         self.assertEqual(chart_dict['info']['categories'],
                          corpus.get_categories())
示例#7
0
 def test_score_transform(self):
     """Check that an identity ``score_transform`` changes the emitted scores.

     The 'hamlet' chart dict is built twice from the test corpus: once
     without a ``score_transform`` argument and once with ``lambda x: x``.
     The sums of the per-datum ``'s'`` values must differ.
     """
     corpus = get_test_corpus()

     sc = ScatterChart(term_doc_matrix=corpus, minimum_term_frequency=0)
     untransformed = sc.to_dict('hamlet')

     sc = ScatterChart(term_doc_matrix=corpus,
                       minimum_term_frequency=0,
                       score_transform=lambda x: x)
     identity = sc.to_dict('hamlet')

     def total_score(chart_dict):
         # Sum the 's' field over every datum of a chart dict.
         return sum(datum['s'] for datum in chart_dict['data'])

     # A TestCase assertion is used instead of a bare ``assert``: it is not
     # removed under ``python -O`` and it reports both sums on failure.
     self.assertNotEqual(total_score(untransformed), total_score(identity))
 def test_multi_categories(self):
     """Verify the effect of ``not_categories`` on a ScatterChart dict.

     The 'hamlet' dict built against all other categories and the one built
     against only 'swift' must disagree on
     ``info['not_category_internal_names']`` and agree with
     ``corpus.get_categories()`` on ``info['categories']``.
     """
     corpus = get_test_corpus()
     chart_kwargs = dict(term_doc_matrix=corpus, minimum_term_frequency=0)

     j_vs_all = ScatterChart(**chart_kwargs).to_dict('hamlet')
     j_vs_swift = ScatterChart(**chart_kwargs).to_dict(
         'hamlet', not_categories=['swift'])

     info_all = j_vs_all['info']
     info_swift = j_vs_swift['info']

     self.assertNotEqual(set(info_all['not_category_internal_names']),
                         set(info_swift['not_category_internal_names']))
     self.assertEqual(info_all['categories'], corpus.get_categories())
     self.assertEqual(info_swift['categories'], corpus.get_categories())
 def test_align_to_target(self):
     """Check that ``align_to_target`` makes priors usable on a reduced corpus.

     Priors built from the full corpus must make scoring the corpus with
     'swift' removed raise ``ValueError``. After ``align_to_target`` the
     same scoring call must complete without raising.
     """
     full_corpus = get_test_corpus()
     corpus = full_corpus.remove_categories(['swift'])

     # Unaligned priors: scoring the reduced corpus is expected to fail.
     unaligned = PriorFactory(full_corpus).use_all_categories().get_priors()
     with self.assertRaises(ValueError):
         scorer = LogOddsRatioInformativeDirichletPrior(unaligned)
         scorer.get_scores(*corpus.get_term_freq_df().values.T)

     # Aligned priors: the same call must go through.
     aligned_factory = PriorFactory(full_corpus).use_all_categories()
     aligned = aligned_factory.align_to_target(corpus).get_priors()
     scorer = LogOddsRatioInformativeDirichletPrior(aligned)
     scorer.get_scores(*corpus.get_term_freq_df().values.T)
 def test_multi_categories(self):
     """Verify the effect of ``not_categories`` on a ScatterChartExplorer dict.

     The 'hamlet' dict built against all other categories and the one built
     against only 'swift' must disagree on
     ``info['not_category_internal_names']`` and carry identical document
     labels and document categories.
     """
     corpus = get_test_corpus()
     explorer_kwargs = dict(corpus=corpus, minimum_term_frequency=0)

     j_vs_all = ScatterChartExplorer(**explorer_kwargs).to_dict('hamlet')
     j_vs_swift = ScatterChartExplorer(**explorer_kwargs).to_dict(
         'hamlet', not_categories=['swift'])

     self.assertNotEqual(
         set(j_vs_all['info']['not_category_internal_names']),
         set(j_vs_swift['info']['not_category_internal_names']))

     docs_all = j_vs_all['docs']
     docs_swift = j_vs_swift['docs']
     self.assertEqual(list(docs_all['labels']), list(docs_swift['labels']))
     self.assertEqual(list(docs_all['categories']),
                      list(docs_swift['categories']))
示例#11
0
    def test_get_general_term_frequencies(self):
        """Check priors combining general term frequencies with all categories.

        The expected prior is rebuilt by hand: the corpus's term/doc-count
        frame is left-joined on its index with the 'background' column of
        ``get_term_and_background_counts()``, gaps are filled with 0 and
        each row is summed. The built priors must match within
        ``assert_allclose`` tolerance.
        """
        corpus = get_test_corpus()

        factory = PriorFactory(corpus,
                               category='hamlet',
                               not_categories=['swift'],
                               starting_count=0)
        factory = factory.use_general_term_frequencies()
        factory = factory.use_all_categories()
        priors, clean_corpus = factory.build()

        doc_counts = corpus.get_term_doc_count_df()
        background = corpus.get_term_and_background_counts()[['background']]
        combined = pd.merge(doc_counts,
                            background,
                            left_index=True,
                            right_index=True,
                            how='left')
        expected_prior = combined.fillna(0.).sum(axis=1)

        np.testing.assert_allclose(priors.values, expected_prior.values)
示例#12
0
 def test_resuse_is_disabled(self):
     """A second ``to_dict`` call on the same ScatterChart must raise."""
     chart = ScatterChart(term_doc_matrix=get_test_corpus(),
                          minimum_term_frequency=0)
     # The first call is expected to succeed.
     chart.to_dict('hamlet')
     # Calling again on the same instance is expected to fail.
     with self.assertRaises(Exception):
         chart.to_dict('hamlet')
 def test_resuse_is_disabled(self):
     """Check that a ScatterChart instance cannot produce a dict twice."""
     corpus = get_test_corpus()
     single_use_chart = ScatterChart(term_doc_matrix=corpus,
                                     minimum_term_frequency=0)
     single_use_chart.to_dict('hamlet')
     # Callable form of assertRaises: invokes to_dict('hamlet') a second
     # time and requires it to raise.
     self.assertRaises(Exception, single_use_chart.to_dict, 'hamlet')