def test_main(self):
		"""Check the 'texts' and 'labels' entries of get_labels_and_texts().

		The output must contain both keys. Converted to lists, 'labels' must
		equal the parsed corpus' ``_y`` values cast to int, and 'texts' must
		equal the parsed corpus' ``get_texts()`` values.
		"""
		d = DocsAndLabelsFromCorpus(self.parsed_corpus)
		output = d.get_labels_and_texts()
		# assertIn names the missing key and the container on failure, whereas
		# assertTrue(key in output) only reports "False is not true".
		self.assertIn('texts', output)
		self.assertIn('labels', output)
		self.assertEqual(self.parsed_corpus._y.astype(int).tolist(),
		                 list(output['labels']))
		self.assertEqual(self.parsed_corpus.get_texts().tolist(),
		                 list(output['texts']))
Exemplo n.º 2
0
 def _make_docs_getter(self, max_docs_per_category):
     """Build the docs-and-labels getter for ``self.term_doc_matrix``.

     :param max_docs_per_category: None to build a DocsAndLabelsFromCorpus;
         any other value is passed to DocsAndLabelsFromCorpusSample.
     :return: the getter, or the result of its ``use_non_text_features()``
         call when ``self.scatterchartdata.use_non_text_features`` is truthy.
     """
     if max_docs_per_category is not None:
         getter = DocsAndLabelsFromCorpusSample(self.term_doc_matrix,
                                                max_docs_per_category)
     else:
         getter = DocsAndLabelsFromCorpus(self.term_doc_matrix)
     if not self.scatterchartdata.use_non_text_features:
         return getter
     return getter.use_non_text_features()
Exemplo n.º 3
0
 def test_main(self):
     """Check the 'texts' and 'labels' entries of get_labels_and_texts().

     The output must contain both keys; 'labels' must equal the parsed
     corpus' ``_y`` values cast to int (as a list) and 'texts' must equal
     the parsed corpus' ``get_texts()`` values (as a list).
     """
     d = DocsAndLabelsFromCorpus(self.parsed_corpus)
     output = d.get_labels_and_texts()
     # assertIn names the missing key and the container on failure, whereas
     # assertTrue(key in output) only reports "False is not true".
     self.assertIn('texts', output)
     self.assertIn('labels', output)
     self.assertEqual(
         self.parsed_corpus._y.astype(int).tolist(), output['labels'])
     self.assertEqual(self.parsed_corpus.get_texts().tolist(),
                      output['texts'])
Exemplo n.º 4
0
	def test_metadata(self):
		"""Check that get_labels_and_texts_and_meta() returns the metadata given.

		The output must contain 'texts', 'labels' and 'meta' entries, and the
		'meta' entry must equal the metadata list that was passed in.
		"""
		d = DocsAndLabelsFromCorpus(self.corpus)
		metadata = ['element 0 0', 'element 1 0', 'element 2 0', 'element 3 0',
		            'element 4 1', 'element 5 1', 'element 6 1',
		            'element 7 1', 'element 8 1', 'element 9 2']
		output = d.get_labels_and_texts_and_meta(metadata)
		# assertIn names the missing key and the container on failure, whereas
		# assertTrue(key in output) only reports "False is not true".
		self.assertIn('texts', output)
		self.assertIn('labels', output)
		self.assertIn('meta', output)
		self.assertEqual(output['meta'], metadata)
Exemplo n.º 5
0
	def test_extra_features(self):
		"""Compare the full output of a non-text-feature getter to a fixture.

		Builds the corpus with build_hamlet_jz_corpus_with_meta(), enables
		non-text features on the getter, passes one 'meta<i>' string per
		document, and expects the exact dictionary spelled out below.
		"""
		corpus = build_hamlet_jz_corpus_with_meta()
		getter = DocsAndLabelsFromCorpus(corpus).use_non_text_features()
		metadata = ['meta%s' % (doc_idx) for doc_idx in range(corpus.get_num_docs())]
		output = getter.get_labels_and_texts_and_meta(metadata)
		expected_texts = [
			"what art thou that usurp'st this time of night,",
			'together with that fair and warlike form',
			'in which the majesty of buried denmark',
			'did sometimes march? by heaven i charge thee, speak!',
			'halt! who goes there?',
			'it is i sire tone from brooklyn.',
			'well, speak up man what is it?',
			'news from the east sire! the best of both worlds has returned!',
		]
		expected_extra = [
			{'cat3': 1, 'cat4': 2},
			{'cat4': 2},
			{'cat5': 1, 'cat3': 2},
			{'cat9': 1, 'cat6': 2},
			{'cat3': 1, 'cat4': 2},
			{'cat1': 2, 'cat2': 1},
			{'cat5': 1, 'cat2': 2},
			{'cat3': 2, 'cat4': 1},
		]
		expected = {
			'categories': ['hamlet', 'jay-z/r. kelly'],
			'texts': expected_texts,
			'meta': ['meta0', 'meta1', 'meta2', 'meta3', 'meta4', 'meta5', 'meta6', 'meta7'],
			'labels': [0, 0, 0, 0, 1, 1, 1, 1],
			'extra': expected_extra,
		}
		self.assertEqual(output, expected)
	def test_metadata(self):
		"""Check that get_labels_and_texts_and_meta() returns the metadata given.

		Uses the parsed corpus. The output must contain 'texts', 'labels' and
		'meta' entries, and the 'meta' entry must equal the metadata list that
		was passed in.
		"""
		d = DocsAndLabelsFromCorpus(self.parsed_corpus)
		metadata = ['element 0 0', 'element 1 0', 'element 2 0', 'element 3 0',
		            'element 4 1', 'element 5 1', 'element 6 1',
		            'element 7 1', 'element 8 1', 'element 9 2']
		output = d.get_labels_and_texts_and_meta(metadata)
		# assertIn names the missing key and the container on failure, whereas
		# assertTrue(key in output) only reports "False is not true".
		self.assertIn('texts', output)
		self.assertIn('labels', output)
		self.assertIn('meta', output)
		self.assertEqual(output['meta'], metadata)
 def _make_docs_getter(self, max_docs_per_category, alternative_text_field):
     """Build the docs-and-labels getter for ``self.term_doc_matrix``.

     :param max_docs_per_category: None to build a DocsAndLabelsFromCorpus;
         any other value is passed to DocsAndLabelsFromCorpusSample.
     :param alternative_text_field: forwarded unchanged to whichever getter
         is constructed, as the ``alternative_text_field`` keyword.
     :return: the getter, or the result of its ``use_non_text_features()``
         call when ``self.scatterchartdata.use_non_text_features`` is truthy.
     """
     if max_docs_per_category is not None:
         getter = DocsAndLabelsFromCorpusSample(
             self.term_doc_matrix,
             max_docs_per_category,
             alternative_text_field=alternative_text_field,
         )
     else:
         getter = DocsAndLabelsFromCorpus(
             self.term_doc_matrix,
             alternative_text_field=alternative_text_field,
         )
     if not self.scatterchartdata.use_non_text_features:
         return getter
     return getter.use_non_text_features()
Exemplo n.º 8
0
	def test_categories(self):
		"""Check the 'categories' entry for both the sampling and plain getters.

		For each getter, the category list must be the same three names in
		the output of get_labels_and_texts() and in the output of
		get_labels_and_texts_and_meta().
		"""
		expected_categories = ['hamlet', 'jay-z/r. kelly', '???']
		getters = [DocsAndLabelsFromCorpusSample(self.corpus, 1),
		           DocsAndLabelsFromCorpus(self.corpus)]
		for getter in getters:
			plain_output = getter.get_labels_and_texts()
			self.assertEqual(plain_output['categories'], expected_categories)
			# A fresh metadata list is built for every getter.
			metadata = [
				'element 0 0', 'element 1 0', 'element 2 0', 'element 3 0',
				'element 4 1', 'element 5 1', 'element 6 1',
				'element 7 1', 'element 8 1', 'element 9 2',
			]
			meta_output = getter.get_labels_and_texts_and_meta(metadata)
			self.assertEqual(meta_output['categories'], expected_categories)
	def test_extra_features(self):
		"""Compare the full output of a non-text-feature getter to a fixture.

		Builds the corpus with build_hamlet_jz_corpus_with_meta(), enables
		non-text features on the getter, passes one 'meta<i>' string per
		document, and expects the exact dictionary spelled out below, in
		which every document carries the extra features ``{'cat1': 2}``.
		"""
		corpus = build_hamlet_jz_corpus_with_meta()
		d = DocsAndLabelsFromCorpus(corpus).use_non_text_features()
		metadata = ['meta%s' % (i) for i in range(corpus.get_num_docs())]
		output = d.get_labels_and_texts_and_meta(metadata)
		# The earlier per-document 'cat3'/'cat4'/... list was a dead store: it
		# was overwritten before use, so only the value actually compared
		# against is kept.
		extra_val = [{'cat1': 2} for _ in range(8)]
		# Convert 'labels' to a plain list so it can be compared with the
		# list literal in the expected dictionary.
		output['labels'] = list(output['labels'])
		self.assertEqual(output, {'categories': ['hamlet', 'jay-z/r. kelly'],
		                          'texts': ["what art thou that usurp'st this time of night,",
		                                    'together with that fair and warlike form',
		                                    'in which the majesty of buried denmark',
		                                    'did sometimes march? by heaven i charge thee, speak!', 'halt! who goes there?',
		                                    'it is i sire tone from brooklyn.', 'well, speak up man what is it?',
		                                    'news from the east sire! the best of both worlds has returned!'],
		                          'meta': ['meta0', 'meta1', 'meta2', 'meta3', 'meta4', 'meta5', 'meta6', 'meta7'],
		                          'labels': [0, 0, 0, 0, 1, 1, 1, 1],
		                          'extra': extra_val}
		                 )
Exemplo n.º 10
0
 def test_alternative_text_field(self):
     """Exercise the ``alternative_text_field`` option of both docs getters.

     Asserts that:

     * DocsAndLabelsFromCorpus can be constructed from ``self.corpus`` and
       from ``self.parsed_corpus`` without the option;
     * constructing it from ``self.corpus`` with
       ``alternative_text_field='orig'`` raises
       CorpusShouldBeParsedCorpusException;
     * for both getter classes built on ``self.parsed_corpus``, the first
       returned text equals its upper-cased form when the option is
       ``'orig'`` and differs from it when the option is omitted.
     """
     DocsAndLabelsFromCorpus(self.corpus)
     DocsAndLabelsFromCorpus(self.parsed_corpus)
     with self.assertRaises(CorpusShouldBeParsedCorpusException):
         DocsAndLabelsFromCorpus(self.corpus, alternative_text_field='orig')

     # Texts are fetched once per getter so that every assertion compares a
     # text with the upper-cased form of that same text. Two separate calls
     # are not guaranteed to return the same documents for the sampling
     # getter (NOTE(review): presumably it re-samples per call -- confirm).
     d = DocsAndLabelsFromCorpus(self.parsed_corpus,
                                 alternative_text_field='orig')
     texts = d.get_labels_and_texts()['texts']
     self.assertEqual(texts[0], texts[0].upper())

     d = DocsAndLabelsFromCorpus(self.parsed_corpus)
     texts = d.get_labels_and_texts()['texts']
     self.assertNotEqual(texts[0], texts[0].upper())

     d = DocsAndLabelsFromCorpusSample(self.parsed_corpus,
                                       2,
                                       alternative_text_field='orig',
                                       seed=0)
     texts = d.get_labels_and_texts()['texts']
     self.assertEqual(texts[0], texts[0].upper())

     d = DocsAndLabelsFromCorpusSample(self.parsed_corpus, 2)
     texts = d.get_labels_and_texts()['texts']
     self.assertNotEqual(texts[0], texts[0].upper())
	def test_alternative_text_field(self):
		"""Exercise the ``alternative_text_field`` option of both docs getters.

		Asserts that:

		* DocsAndLabelsFromCorpus can be constructed from ``self.corpus`` and
		  from ``self.parsed_corpus`` without the option;
		* constructing it from ``self.corpus`` with
		  ``alternative_text_field='orig'`` raises
		  CorpusShouldBeParsedCorpusException;
		* for both getter classes built on ``self.parsed_corpus``, the first
		  returned text equals its upper-cased form when the option is
		  ``'orig'`` and differs from it when the option is omitted.
		"""
		DocsAndLabelsFromCorpus(self.corpus)
		DocsAndLabelsFromCorpus(self.parsed_corpus)
		with self.assertRaises(CorpusShouldBeParsedCorpusException):
			DocsAndLabelsFromCorpus(self.corpus, alternative_text_field='orig')

		# Texts are fetched once per getter so that every assertion compares a
		# text with the upper-cased form of that same text. Two separate calls
		# are not guaranteed to return the same documents for the sampling
		# getter (NOTE(review): presumably it re-samples per call -- confirm).
		d = DocsAndLabelsFromCorpus(self.parsed_corpus, alternative_text_field='orig')
		texts = d.get_labels_and_texts()['texts']
		self.assertEqual(texts[0], texts[0].upper())

		d = DocsAndLabelsFromCorpus(self.parsed_corpus)
		texts = d.get_labels_and_texts()['texts']
		self.assertNotEqual(texts[0], texts[0].upper())

		d = DocsAndLabelsFromCorpusSample(self.parsed_corpus, 2, alternative_text_field='orig', seed=0)
		texts = d.get_labels_and_texts()['texts']
		self.assertEqual(texts[0], texts[0].upper())

		d = DocsAndLabelsFromCorpusSample(self.parsed_corpus, 2)
		texts = d.get_labels_and_texts()['texts']
		self.assertNotEqual(texts[0], texts[0].upper())