Exemplo n.º 1
0
 def _make_docs_getter(self, max_docs_per_category):
     """Build the object used to fetch documents and labels.

     When ``max_docs_per_category`` is ``None`` a ``DocsAndLabelsFromCorpus``
     is built over ``self.term_doc_matrix``; otherwise a
     ``DocsAndLabelsFromCorpusSample`` is built over it and handed the cap.
     If ``self.scatterchartdata.use_non_text_features`` is truthy, the value
     returned by the getter's ``use_non_text_features()`` is returned
     instead of the getter itself.
     """
     getter = (
         DocsAndLabelsFromCorpus(self.term_doc_matrix)
         if max_docs_per_category is None
         else DocsAndLabelsFromCorpusSample(self.term_doc_matrix,
                                            max_docs_per_category)
     )
     if not self.scatterchartdata.use_non_text_features:
         return getter
     return getter.use_non_text_features()
Exemplo n.º 2
0
	def test_categories(self):
		"""Both getter flavours report the same category names.

		The ``'categories'`` entry is checked on the plain output and on the
		output produced when per-document metadata is supplied.
		"""
		expected = ['hamlet', 'jay-z/r. kelly', '???']
		sampled = DocsAndLabelsFromCorpusSample(self.corpus, 1)
		complete = DocsAndLabelsFromCorpus(self.corpus)
		for getter in (sampled, complete):
			plain = getter.get_labels_and_texts()
			self.assertEqual(plain['categories'], expected)
			# Built anew for each getter so no list is shared between calls.
			metadata = [
				'element 0 0', 'element 1 0', 'element 2 0', 'element 3 0',
				'element 4 1', 'element 5 1', 'element 6 1',
				'element 7 1', 'element 8 1', 'element 9 2',
			]
			with_meta = getter.get_labels_and_texts_and_meta(metadata)
			self.assertEqual(with_meta['categories'], expected)
Exemplo n.º 3
0
    def test_max_per_category(self):
        """Check that sampling keeps at most two documents per label.

        Three ``DocsAndLabelsFromCorpusSample`` configurations built with
        ``max_per_category=2`` are exercised: output with metadata, plain
        output, and plain output after ``use_non_text_features()``.  For each
        one the test checks which of the ``'texts'``, ``'labels'``, ``'meta'``
        and ``'extra'`` keys are present, that no label collects more than two
        texts, and that the texts grouped by label can be passed to
        ``json.dumps``.
        """

        def check_output(output, has_meta, has_extra):
            """Assert the expected keys and the per-label cap on ``output``."""
            self.assertIn('texts', output)
            self.assertIn('labels', output)
            (self.assertIn if has_meta else self.assertNotIn)('meta', output)
            (self.assertIn if has_extra else self.assertNotIn)('extra', output)

            columns = [output['texts'], output['labels']]
            if has_meta:
                # Zipped alongside texts and labels so that only rows which
                # also have a metadata entry are counted.
                columns.append(output['meta'])

            texts_by_label = {}
            for row in zip(*columns):
                texts_by_label.setdefault(row[1], []).append(row[0])
            for texts in texts_by_label.values():
                self.assertLessEqual(len(texts), 2)
            # json.dumps raises if the grouped result cannot be serialised.
            json.dumps(texts_by_label)

        docs_and_labels = DocsAndLabelsFromCorpusSample(self.parsed_corpus,
                                                        max_per_category=2,
                                                        seed=0)
        metadata = np.array([
            'element 0 0', 'element 1 0', 'element 2 0', 'element 3 0',
            'element 4 1', 'element 5 1', 'element 6 1', 'element 7 1',
            'element 8 1', 'element 9 2'
        ])
        check_output(docs_and_labels.get_labels_and_texts_and_meta(metadata),
                     has_meta=True, has_extra=False)

        docs_and_labels = DocsAndLabelsFromCorpusSample(self.parsed_corpus,
                                                        max_per_category=2)
        check_output(docs_and_labels.get_labels_and_texts(),
                     has_meta=False, has_extra=False)

        docs_and_labels = DocsAndLabelsFromCorpusSample(
            self.parsed_corpus, max_per_category=2).use_non_text_features()
        check_output(docs_and_labels.get_labels_and_texts(),
                     has_meta=False, has_extra=True)
Exemplo n.º 4
0
 def test_alternative_text_field(self):
     """Check the displayed text with and without ``alternative_text_field``.

     Building a getter with ``alternative_text_field='orig'`` on
     ``self.corpus`` must raise ``CorpusShouldBeParsedCorpusException``.
     On ``self.parsed_corpus`` the first returned text must equal its own
     ``upper()`` form when ``'orig'`` is requested and must differ from it
     otherwise, for both the full getter and the sampling getter.
     """

     def first_text(getter):
         # Fetch the output once and reuse the value, so both sides of each
         # comparison refer to the same document.  NOTE(review): the
         # unseeded sampling getter may not return the same sample on two
         # separate calls -- confirm against its implementation.
         return getter.get_labels_and_texts()['texts'][0]

     # Plain construction must not raise for either corpus.
     DocsAndLabelsFromCorpus(self.corpus)
     DocsAndLabelsFromCorpus(self.parsed_corpus)
     with self.assertRaises(CorpusShouldBeParsedCorpusException):
         DocsAndLabelsFromCorpus(self.corpus, alternative_text_field='orig')

     text = first_text(DocsAndLabelsFromCorpus(self.parsed_corpus,
                                               alternative_text_field='orig'))
     self.assertEqual(text, text.upper())

     text = first_text(DocsAndLabelsFromCorpus(self.parsed_corpus))
     self.assertNotEqual(text, text.upper())

     text = first_text(DocsAndLabelsFromCorpusSample(self.parsed_corpus,
                                                     2,
                                                     alternative_text_field='orig',
                                                     seed=0))
     self.assertEqual(text, text.upper())

     text = first_text(DocsAndLabelsFromCorpusSample(self.parsed_corpus, 2))
     self.assertNotEqual(text, text.upper())
	def test_max_per_category(self):
		"""Check that sampling keeps at most two documents per label.

		Three ``DocsAndLabelsFromCorpusSample`` configurations built with
		``max_per_category=2`` are exercised: output with metadata, plain
		output, and plain output after ``use_non_text_features()``.  For each
		one the test checks which of the ``'texts'``, ``'labels'``, ``'meta'``
		and ``'extra'`` keys are present, that no label collects more than two
		texts, and that the texts grouped by label can be passed to
		``json.dumps``.
		"""

		def check_output(output, has_meta, has_extra):
			"""Assert the expected keys and the per-label cap on ``output``."""
			self.assertIn('texts', output)
			self.assertIn('labels', output)
			(self.assertIn if has_meta else self.assertNotIn)('meta', output)
			(self.assertIn if has_extra else self.assertNotIn)('extra', output)

			columns = [output['texts'], output['labels']]
			if has_meta:
				# Zipped alongside texts and labels so that only rows which
				# also have a metadata entry are counted.
				columns.append(output['meta'])

			texts_by_label = {}
			for row in zip(*columns):
				texts_by_label.setdefault(row[1], []).append(row[0])
			for texts in texts_by_label.values():
				self.assertLessEqual(len(texts), 2)
			# json.dumps raises if the grouped result cannot be serialised.
			json.dumps(texts_by_label)

		docs_and_labels = DocsAndLabelsFromCorpusSample(self.parsed_corpus, max_per_category=2, seed=0)
		metadata = np.array([
			'element 0 0', 'element 1 0', 'element 2 0', 'element 3 0',
			'element 4 1', 'element 5 1', 'element 6 1',
			'element 7 1', 'element 8 1', 'element 9 2',
		])
		check_output(docs_and_labels.get_labels_and_texts_and_meta(metadata),
			has_meta=True, has_extra=False)

		docs_and_labels = DocsAndLabelsFromCorpusSample(self.parsed_corpus, max_per_category=2)
		check_output(docs_and_labels.get_labels_and_texts(),
			has_meta=False, has_extra=False)

		docs_and_labels = DocsAndLabelsFromCorpusSample(self.parsed_corpus, max_per_category=2).use_non_text_features()
		check_output(docs_and_labels.get_labels_and_texts(),
			has_meta=False, has_extra=True)