def test_metadata(self):
    """Check that metadata passed to ``to_dict`` appears under ``j['docs']``.

    The whole ``docs`` entry (labels, categories, meta and texts) is
    compared against an expected literal.
    """
    corpus = build_hamlet_jz_corpus()
    meta = ['one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight']
    explorer = ScatterChartExplorer(corpus, minimum_term_frequency=0)
    j = explorer.to_dict('hamlet', metadata=meta)
    # Do not truncate the diff if the comparison below fails.
    self.maxDiff = None
    # Normalise labels to a plain list before comparing against a list literal.
    j['docs']['labels'] = list(j['docs']['labels'])
    expected_texts = [
        "what art thou that usurp'st this time of night,",
        'together with that fair and warlike form',
        'in which the majesty of buried denmark',
        'did sometimes march? by heaven i charge thee, speak!',
        'halt! who goes there?',
        'it is i sire tone from brooklyn.',
        'well, speak up man what is it?',
        'news from the east sire! the best of both worlds has returned!',
    ]
    expected_docs = {
        'labels': [0, 0, 0, 0, 1, 1, 1, 1],
        'categories': ['hamlet', 'jay-z/r. kelly'],
        'meta': ['one', 'two', 'three', 'four',
                 'five', 'six', 'seven', 'eight'],
        'texts': expected_texts,
    }
    self.assertEqual(j['docs'], expected_docs)
def test_metadata(self):
    """Verify the ``docs`` section of ``to_dict`` when metadata is supplied.

    Labels, categories, the metadata list and the document texts are all
    compared in a single equality assertion.
    """
    corpus = build_hamlet_jz_corpus()
    meta = ['one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight']
    chart = ScatterChartExplorer(corpus, minimum_term_frequency=0)
    j = chart.to_dict('hamlet', metadata=meta)
    # Unlimited diff output makes a mismatch in the long text list readable.
    self.maxDiff = None
    # Labels are converted to a plain list so they can equal a list literal.
    j['docs']['labels'] = list(j['docs']['labels'])
    expected_docs = dict(
        labels=[0, 0, 0, 0, 1, 1, 1, 1],
        categories=['hamlet', 'jay-z/r. kelly'],
        meta=['one', 'two', 'three', 'four',
              'five', 'six', 'seven', 'eight'],
        texts=[
            "what art thou that usurp'st this time of night,",
            'together with that fair and warlike form',
            'in which the majesty of buried denmark',
            'did sometimes march? by heaven i charge thee, speak!',
            'halt! who goes there?',
            'it is i sire tone from brooklyn.',
            'well, speak up man what is it?',
            'news from the east sire! the best of both worlds has returned!',
        ],
    )
    self.assertEqual(j['docs'], expected_docs)
def test_get_name_hedges(self):
    """Check the name reported by ``HedgesR`` and the number of scores.

    ``get_scores()`` must return as many entries as the corpus has terms.
    """
    corpus = build_hamlet_jz_corpus()
    name = HedgesR(corpus).set_categories('hamlet').get_name()
    self.assertEqual(name, "Hedge's r")
    # A fresh scorer is built for the score-length check.
    scores = HedgesR(corpus).set_categories('hamlet').get_scores()
    self.assertEqual(len(scores), corpus.get_num_terms())
def test_get_cohens_d_scores_zero_robust(self):
    """Check the first five ``CohensD`` scores after zeroing row 1 of ``_X``.

    Scores are computed with ``OncePerDocFrequencyRanker`` for the
    'hamlet' category and compared approximately.
    """
    corpus = build_hamlet_jz_corpus()
    # Blank out row 1 of the corpus matrix before scoring.
    corpus._X[1, :] = 0
    scorer = CohensD(corpus).set_term_ranker(OncePerDocFrequencyRanker)
    scorer = scorer.set_categories('hamlet')
    expected_head = [-0.2303607, 0.8838835, 0.8838835, 0.8838835, 0.8838835]
    np.testing.assert_almost_equal(scorer.get_scores()[:5], expected_head)
def test_get_cohens_d_score_df(self):
    """Check the column names of ``CohensD.get_score_df()``.

    Column order is ignored; only the set of names is compared.
    """
    corpus = build_hamlet_jz_corpus()
    scorer = CohensD(corpus).set_term_ranker(OncePerDocFrequencyRanker)
    score_df = scorer.set_categories('hamlet').get_score_df()
    expected_columns = {
        'cohens_d', 'cohens_d_se', 'cohens_d_z', 'cohens_d_p',
        'hedges_r', 'hedges_r_se', 'hedges_r_z', 'hedges_r_p',
        'm1', 'm2',
    }
    self.assertEqual(set(score_df.columns), expected_columns)
def test_get_score_df(self):
    """Check the first five ``BetaPosterior`` scores for 'hamlet'.

    ``get_score_df()`` is also called, but only the ``get_scores()``
    output is compared (approximately) against expected values.
    """
    corpus = build_hamlet_jz_corpus()
    scorer = BetaPosterior(corpus).set_categories('hamlet')
    # Call get_score_df() first; its result is not inspected below.
    scorer.get_score_df()
    expected_head = [
        -0.3194860824225506,
        1.0294085051562822,
        1.0294085051562822,
        1.234664219528909,
        1.0294085051562822,
    ]
    np.testing.assert_almost_equal(scorer.get_scores()[:5], expected_head)
def test_get_score_df(self):
    """Check the column names of ``CredTFIDF.get_score_df()``.

    Column order is ignored; only the set of names is compared.
    """
    corpus = build_hamlet_jz_corpus()
    score_df = CredTFIDF(corpus).set_categories('hamlet').get_score_df()
    expected_columns = {'pos_cred_tfidf', 'neg_cred_tfidf', 'delta_cred_tf_idf'}
    self.assertEqual(set(score_df.columns), expected_columns)
def test_hide_terms(self):
    """Check that ``hide_terms`` flags exactly the requested terms.

    ``hide_terms`` must return a ``ScatterChartExplorer``. In the resulting
    ``to_dict`` data, hidden terms carry ``display == False`` and all other
    terms have no ``display`` key at all.
    """
    corpus = build_hamlet_jz_corpus().get_unigram_corpus()
    terms_to_hide = ['thou', 'heaven']
    sc = ScatterChartExplorer(
        corpus, minimum_term_frequency=0).hide_terms(terms_to_hide)
    # assertEquals is a deprecated alias that was removed in Python 3.12.
    self.assertEqual(type(sc), ScatterChartExplorer)
    j = sc.to_dict('hamlet', include_term_category_counts=True)
    self.assertTrue(all(
        'display' in t and t['display'] == False
        for t in j['data'] if t['term'] in terms_to_hide))
    self.assertTrue(all(
        'display' not in t
        for t in j['data'] if t['term'] not in terms_to_hide))
def test_get_cohens_d_scores(self):
    """Check the first nine values of ``CohensD.get_cohens_d_scores()``.

    Scores are computed with ``OncePerDocFrequencyRanker`` for the
    'hamlet' category and compared approximately.
    """
    corpus = build_hamlet_jz_corpus()
    scorer = CohensD(corpus).set_term_ranker(OncePerDocFrequencyRanker)
    scorer = scorer.set_categories('hamlet')
    expected_head = [
        0., 0.8242361, 0.8242361, 1.4276187, 0.8242361,
        0.8242361, 0.8242361, 0.8242361, 0.5395892,
    ]
    np.testing.assert_almost_equal(
        scorer.get_cohens_d_scores()[:9], expected_head)
def test_to_dict(self):
    """Check the layout of ``ScatterChartExplorer.to_dict('hamlet')``.

    Verifies the top-level keys, the ``info`` keys, the document labels and
    texts, the field names on the entry for the term 'art', and the keys of
    the ``docs`` section.
    """
    np.random.seed(0)
    random.seed(0)
    corpus = build_hamlet_jz_corpus()
    j = ScatterChartExplorer(corpus, minimum_term_frequency=0).to_dict('hamlet')
    self.assertEqual(set(j.keys()), {'info', 'data', 'docs'})
    self.assertEqual(
        set(j['info'].keys()),
        {
            'not_category_name', 'category_name', 'category_terms',
            'not_category_internal_names', 'not_category_terms',
            'category_internal_name', 'categories',
            'neutral_category_name', 'extra_category_name',
            'neutral_category_internal_names',
            'extra_category_internal_names',
        })
    self.assertEqual(list(j['docs']['labels']), [0, 0, 0, 0, 1, 1, 1, 1])
    self.assertEqual(list(j['docs']['texts']), [
        "what art thou that usurp'st this time of night,",
        'together with that fair and warlike form',
        'in which the majesty of buried denmark',
        'did sometimes march? by heaven i charge thee, speak!',
        'halt! who goes there?',
        'it is i sire tone from brooklyn.',
        'well, speak up man what is it?',
        'news from the east sire! the best of both worlds has returned!',
    ])
    expected = {
        'y': 0.5, 'ncat': 0, 'ncat25k': 0, 'bg': 5, 'cat': 1, 's': 0.5,
        'term': 'art', 'os': 0.5192, 'extra': 0, 'extra25k': 0,
        'cat25k': 758, 'x': 0.06, 'neut': 0, 'neut25k': 0, 'ox': 5, 'oy': 3,
    }
    actual = [t for t in j['data'] if t['term'] == 'art'][0]
    # Only the field names and the term itself are compared; the numeric
    # values in `expected` are not checked against `actual`.
    self.assertEqual(set(expected.keys()), set(actual.keys()))
    self.assertEqual(expected['term'], actual['term'])
    self.assertEqual(set(j['docs'].keys()), {'texts', 'labels', 'categories'})
def test_include_term_category_counts(self):
    """Check the ``termCounts`` section produced by ``to_dict``.

    With ``include_term_category_counts=True`` the result must contain a
    ``termCounts`` entry per category, every (freq, docs) pair must satisfy
    ``freq >= docs``, and the union of keys must cover every corpus term.
    """
    corpus = build_hamlet_jz_corpus().get_unigram_corpus()
    explorer = ScatterChartExplorer(corpus, minimum_term_frequency=0)
    j = explorer.to_dict('hamlet', include_term_category_counts=True)
    self.assertEqual(set(j.keys()), {'info', 'data', 'docs', 'termCounts'})
    self.assertEqual(len(j['termCounts']), corpus.get_num_categories())
    seen_term_idxs = set()
    for cat_counts in j['termCounts']:
        seen_term_idxs.update(cat_counts.keys())
        freq_covers_docs = [
            freq >= docs for freq, docs in cat_counts.values()
        ]
        self.assertTrue(all(freq_covers_docs))
    self.assertEqual(len(seen_term_idxs), corpus.get_num_terms())
def test_include_term_category_counts(self):
    """Verify per-category term counts returned by ``to_dict``.

    Checks the top-level keys, that ``termCounts`` has one entry per corpus
    category, that each (freq, docs) pair has ``freq >= docs``, and that
    the keys seen across categories number ``corpus.get_num_terms()``.
    """
    corpus = build_hamlet_jz_corpus().get_unigram_corpus()
    chart = ScatterChartExplorer(corpus, minimum_term_frequency=0)
    j = chart.to_dict('hamlet', include_term_category_counts=True)
    expected_keys = {'info', 'data', 'docs', 'termCounts'}
    self.assertEqual(set(j.keys()), expected_keys)
    self.assertEqual(len(j['termCounts']), corpus.get_num_categories())
    all_term_keys = set()
    for cat_counts in j['termCounts']:
        all_term_keys = all_term_keys | set(cat_counts.keys())
        pair_checks = [freq >= docs for freq, docs in cat_counts.values()]
        self.assertTrue(all(pair_checks))
    self.assertEqual(len(all_term_keys), corpus.get_num_terms())
def test_get_score_df(self):
    """Check ``CredTFIDF`` scores and the column order of its score frame.

    Scores are computed with ``OncePerDocFrequencyRanker`` for the
    'hamlet' category; the first five are compared approximately, and
    ``get_score_df()`` must expose exactly the three expected columns in
    order.
    """
    corpus = build_hamlet_jz_corpus()
    tfidf = (CredTFIDF(corpus)
             .set_term_ranker(OncePerDocFrequencyRanker)
             .set_categories('hamlet'))
    np.testing.assert_almost_equal(tfidf.get_scores()[:5], [
        3.0757237e-05, 4.1256023e-02, 4.1256023e-02, 5.5708409e-02,
        4.1256023e-02,
    ])
    # The leftover debug print of the first score row was removed; the
    # assertion below still exercises get_score_df().
    self.assertEqual(
        list(tfidf.get_score_df().columns),
        ['pos_cred_tfidf', 'neg_cred_tfidf', 'delta_cred_tf_idf'])
def test_hide_terms(self):
    """Check that ``hide_terms`` marks only the requested terms as hidden.

    The call must return a ``ScatterChartExplorer``. In the ``to_dict``
    data, each hidden term has ``display == False`` and every other term
    lacks a ``display`` key.
    """
    corpus = build_hamlet_jz_corpus().get_unigram_corpus()
    terms_to_hide = ['thou', 'heaven']
    sc = ScatterChartExplorer(
        corpus, minimum_term_frequency=0).hide_terms(terms_to_hide)
    # assertEquals is a deprecated alias that was removed in Python 3.12.
    self.assertEqual(type(sc), ScatterChartExplorer)
    j = sc.to_dict('hamlet', include_term_category_counts=True)
    hidden_entries = [t for t in j['data'] if t['term'] in terms_to_hide]
    shown_entries = [t for t in j['data'] if t['term'] not in terms_to_hide]
    self.assertTrue(all(
        'display' in t and t['display'] == False for t in hidden_entries))
    self.assertTrue(all('display' not in t for t in shown_entries))
def test_get_name(self):
    """Check that ``CohensD`` reports the name "Cohen's d"."""
    corpus = build_hamlet_jz_corpus()
    scorer = CohensD(corpus).set_categories('hamlet')
    self.assertEqual(scorer.get_name(), "Cohen's d")
def test_get_cohens_d_scores(self):
    """Check the first five ``CohensD.get_scores()`` values for 'hamlet'.

    Scores are computed with ``OncePerDocFrequencyRanker`` and compared
    approximately.
    """
    corpus = build_hamlet_jz_corpus()
    scorer = CohensD(corpus).set_term_ranker(OncePerDocFrequencyRanker)
    scorer = scorer.set_categories('hamlet')
    expected_head = [-0.2127981, 0.8164966, 0.8164966, 1.3669723, 0.8164966]
    np.testing.assert_almost_equal(scorer.get_scores()[:5], expected_head)
def test_to_dict(self):
    """Check ``to_dict`` output with and without injected term metadata.

    First verifies the top-level keys, ``info`` keys, document labels and
    texts, the field names on the 'art' entry, and the ``docs`` keys. Then
    rebuilds the chart with ``inject_term_metadata`` and checks that 'art'
    carries the injected dict under ``etc`` while another term has an
    empty ``etc``.
    """
    np.random.seed(0)
    random.seed(0)
    corpus = build_hamlet_jz_corpus()
    j = ScatterChartExplorer(corpus, minimum_term_frequency=0).to_dict('hamlet')
    self.assertEqual(set(j.keys()), {'info', 'data', 'docs'})
    self.assertEqual(
        set(j['info'].keys()),
        {
            'not_category_name', 'category_name', 'category_terms',
            'not_category_internal_names', 'not_category_terms',
            'category_internal_name', 'categories',
            'neutral_category_name', 'extra_category_name',
            'neutral_category_internal_names',
            'extra_category_internal_names',
        })
    self.assertEqual(list(j['docs']['labels']), [0, 0, 0, 0, 1, 1, 1, 1])
    self.assertEqual(list(j['docs']['texts']), [
        "what art thou that usurp'st this time of night,",
        'together with that fair and warlike form',
        'in which the majesty of buried denmark',
        'did sometimes march? by heaven i charge thee, speak!',
        'halt! who goes there?',
        'it is i sire tone from brooklyn.',
        'well, speak up man what is it?',
        'news from the east sire! the best of both worlds has returned!',
    ])
    expected = {
        'y': 0.5, 'ncat': 0, 'ncat25k': 0, 'bg': 5, 'cat': 1, 's': 0.5,
        'term': 'art', 'os': 0.5192, 'extra': 0, 'extra25k': 0,
        'cat25k': 758, 'x': 0.06, 'neut': 0, 'neut25k': 0, 'ox': 5, 'oy': 3,
    }
    actual = [t for t in j['data'] if t['term'] == 'art'][0]
    # Only the field names and the term itself are compared; the numeric
    # values in `expected` are not checked against `actual`.
    self.assertEqual(set(expected.keys()), set(actual.keys()))
    self.assertEqual(expected['term'], actual['term'])
    self.assertEqual(set(j['docs'].keys()), {'texts', 'labels', 'categories'})

    # Second pass: inject metadata for 'art' and rebuild the dictionary.
    j = (ScatterChartExplorer(corpus, minimum_term_frequency=0)
         .inject_term_metadata(
             {'art': {'display': 'blah blah blah', 'color': 'red'}})
         .to_dict('hamlet'))
    actual = [t for t in j['data'] if t['term'] == 'art'][0]
    expected = {
        'y': 0.5, 'ncat': 0, 'ncat25k': 0, 'bg': 5, 'cat': 1, 's': 0.5,
        'term': 'art', 'os': 0.5192, 'extra': 0, 'extra25k': 0,
        'cat25k': 758, 'x': 0.06, 'neut': 0, 'neut25k': 0, 'ox': 5, 'oy': 3,
        'etc': {'display': 'blah blah blah', 'color': 'red'},
    }
    self.assertEqual(set(actual.keys()), set(expected.keys()))
    self.assertEqual(actual['etc'], expected['etc'])
    # Any other term has the same fields but an empty 'etc'.
    actual = [t for t in j['data'] if t['term'] != 'art'][0]
    self.assertEqual(set(actual.keys()), set(expected.keys()))
    self.assertEqual(actual['etc'], {})
def test_get_name(self):
    """Verify the display name returned by ``CohensD.get_name()``."""
    corpus = build_hamlet_jz_corpus()
    name = CohensD(corpus).set_categories('hamlet').get_name()
    self.assertEqual(name, "Cohen's d")
def test_get_name_hedges(self):
    """Verify ``HedgesR``'s name and that it returns one score per term."""
    corpus = build_hamlet_jz_corpus()
    named_scorer = HedgesR(corpus).set_categories('hamlet')
    self.assertEqual(named_scorer.get_name(), "Hedge's r")
    # A second, independent scorer is built for the length check.
    counted_scorer = HedgesR(corpus).set_categories('hamlet')
    self.assertEqual(len(counted_scorer.get_scores()), corpus.get_num_terms())
def test_get_name(self):
    """Check that ``BetaPosterior`` reports the name 'Beta Posterior'."""
    corpus = build_hamlet_jz_corpus()
    scorer = BetaPosterior(corpus)
    self.assertEqual(scorer.get_name(), 'Beta Posterior')
def setUpClass(cls):
    """Build the hamlet/jay-z corpus and store it on the class as ``corpus``."""
    hamlet_jz_corpus = build_hamlet_jz_corpus()
    cls.corpus = hamlet_jz_corpus
def test_get_name(self):
    """Check that ``CredTFIDF`` reports the name 'Delta mean cred-tf-idf'."""
    corpus = build_hamlet_jz_corpus()
    scorer = CredTFIDF(corpus)
    self.assertEqual(scorer.get_name(), 'Delta mean cred-tf-idf')
def test_get_cohens_d_scores(self):
    """Check the first five ``get_scores()`` values for the 'hamlet' category.

    The scorer is ``CohensD`` configured with ``OncePerDocFrequencyRanker``;
    values are compared approximately.
    """
    corpus = build_hamlet_jz_corpus()
    scorer = CohensD(corpus).set_term_ranker(OncePerDocFrequencyRanker)
    scorer = scorer.set_categories('hamlet')
    expected_head = [-0.2884615, 0.625, 0.625, 0.9919727, 0.625]
    np.testing.assert_almost_equal(scorer.get_scores()[:5], expected_head)
def setUpClass(cls):
    """Create the hamlet/jay-z corpus once and attach it as ``cls.corpus``."""
    shared_corpus = build_hamlet_jz_corpus()
    cls.corpus = shared_corpus
def test_get_cohens_d_scores(self):
    """Verify the leading five ``CohensD`` scores for 'hamlet'.

    Uses ``OncePerDocFrequencyRanker`` as the term ranker and an
    approximate comparison.
    """
    corpus = build_hamlet_jz_corpus()
    ranked = CohensD(corpus).set_term_ranker(OncePerDocFrequencyRanker)
    head_scores = ranked.set_categories('hamlet').get_scores()[:5]
    expected_head = [-0.2127981, 0.8164966, 0.8164966, 1.3669723, 0.8164966]
    np.testing.assert_almost_equal(head_scores, expected_head)