Example #1
def test_add_metadata(self):
    hamlet = get_hamlet_term_doc_matrix()
    meta_index_store = IndexStore()
    meta_fact = CSRMatrixFactory()
    # Build a metadata matrix with one entry per document.
    for i in range(hamlet.get_num_docs()):
        meta_fact[i, i] = meta_index_store.getidx(str(i))
    other_hamlet = hamlet.add_metadata(meta_fact.get_csr_matrix(),
                                       meta_index_store)
    assert other_hamlet != hamlet
    meta_index_store = IndexStore()
    meta_fact = CSRMatrixFactory()
    # A matrix with five fewer rows than documents must be rejected.
    for i in range(hamlet.get_num_docs() - 5):
        meta_fact[i, i] = meta_index_store.getidx(str(i))
    with self.assertRaises(AssertionError):
        hamlet.add_metadata(meta_fact.get_csr_matrix(), meta_index_store)
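The test above exercises the IndexStore/CSRMatrixFactory pairing used throughout these examples: the factory collects entries for a sparse matrix while the store interns strings as integer indices. A minimal standalone sketch of that pattern (the import paths are an assumption based on scattertext's module layout):

from scattertext.indexstore import IndexStore
from scattertext.CSRMatrixTools import CSRMatrixFactory

index_store = IndexStore()
factory = CSRMatrixFactory()
for doc_i in range(3):  # pretend there are three documents
    # getidx returns a stable integer index for the string,
    # registering it the first time it is seen
    factory[doc_i, index_store.getidx(str(doc_i))] = 1
meta_matrix = factory.get_csr_matrix()  # a 3x3 scipy.sparse CSR matrix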
Example #2
    def _get_term_indices_to_compact_from_term_freqs(self, term_freqs,
                                                     term_doc_matrix,
                                                     non_text):
        idx = IndexStore()
        tdf_vals = term_freqs.values
        valid_terms_mask = tdf_vals.sum(axis=1) >= self.minimum_term_count
        tdf_vals = term_freqs[valid_terms_mask].values
        terms = np.array(term_freqs.index)[valid_terms_mask]

        lengths = []
        fact = CSRMatrixFactory()
        for i, t in enumerate(terms):
            for tok in t.split():
                fact[i, idx.getidx(tok)] = 1
            lengths.append(len(t.split()))
        lengths = np.array(lengths)
        mat = fact.get_csr_matrix()

        coocs = lengths - (mat * mat.T)
        pairs = np.argwhere(coocs == 0).T
        pairs = self._limit_to_non_identical_terms(pairs)
        pairs = self._limit_to_pairs_of_bigrams_and_a_constituent_unigram(
            pairs, terms)
        pairs = self._limit_to_redundant_unigrams(pairs, tdf_vals)
        idx_store = term_doc_matrix._get_relevant_idx_store(non_text)
        redundant_terms = idx_store.getidxstrictbatch(terms[np.unique(
            pairs[:, 1])])
        infrequent_terms = np.argwhere(~valid_terms_mask).T[0]
        terms_to_remove = np.concatenate([redundant_terms, infrequent_terms])
        return terms_to_remove
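The pivotal step above is coocs = lengths - (mat * mat.T): each row of mat is a 0/1 indicator over a term's tokens, so entry (i, j) of mat * mat.T counts the tokens shared by terms i and j, and after broadcasting, coocs[i, j] == 0 precisely when every token of term j occurs inside term i. A self-contained toy run of that subset test, using only numpy and scipy (the example terms are invented):

import numpy as np
from scipy.sparse import csr_matrix

terms = ['speak', 'speak loud', 'loud']  # one bigram and its two unigrams
vocab = {'speak': 0, 'loud': 1}
rows, cols = [], []
for i, t in enumerate(terms):
    for tok in t.split():
        rows.append(i)
        cols.append(vocab[tok])
mat = csr_matrix((np.ones(len(rows)), (rows, cols)),
                 shape=(len(terms), len(vocab)))
lengths = np.array([len(t.split()) for t in terms])

# lengths[j] - shared_token_count(i, j) is zero exactly when all of
# term j's tokens occur in term i
coocs = lengths - (mat @ mat.T).toarray()
pairs = np.argwhere(coocs == 0)
pairs = pairs[pairs[:, 0] != pairs[:, 1]]  # the diagonal is always zero
print([(terms[i], terms[j]) for i, j in pairs])
# [('speak loud', 'speak'), ('speak loud', 'loud')]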
Example #3
    def use_external_metadata_lists(self, metadata_lists):
        '''
        Takes a list of string lists. Each inner list holds the metadata
        strings to associate with the corresponding document.

        :param metadata_lists: List[List[str]]
        :return: new TermDocMatrix
        '''
        metadata_index_store = IndexStore()
        metadata_csr_factory = CSRMatrixFactory()
        assert len(metadata_lists) == self.get_num_docs()
        for doc_i, metadata_list in enumerate(metadata_lists):
            for metadatum in metadata_list:
                # Mark document doc_i as carrying this metadata string.
                metadata_csr_factory[
                    doc_i, metadata_index_store.getidx(metadatum)] = 1

        return self._make_new_term_doc_matrix(
            new_mX=metadata_csr_factory.get_csr_matrix(dtype=int),
            new_metadata_idx_store=metadata_index_store,
            new_y_mask=self._y == self._y)
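A short usage sketch for the method above; the TermDocMatrix instance tdm and the metadata strings are hypothetical, but the call shape (one list of strings per document) follows the assertion in the code:

# hypothetical: tdm is an existing TermDocMatrix over three documents
metadata_lists = [
    ['act1', 'soliloquy'],  # metadata for document 0
    ['act1'],               # metadata for document 1
    ['act5', 'duel'],       # metadata for document 2
]
assert len(metadata_lists) == tdm.get_num_docs()
tdm_with_meta = tdm.use_external_metadata_lists(metadata_lists)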
Example #4
def _get_term_indices_to_compact_from_term_freqs(self, term_freqs):
    fact = CSRMatrixFactory()
    idx = IndexStore()
    tdf_vals = term_freqs.values
    valid_terms_mask = tdf_vals.sum(axis=1) >= self.minimum_term_count
    tdf_vals = term_freqs[valid_terms_mask].values
    terms = np.array(term_freqs.index)[valid_terms_mask]
    lengths = []
    for i, t in enumerate(terms):
        for tok in t.split():
            fact[i, idx.getidx(tok)] = 1
        lengths.append(len(t.split()))
    lengths = np.array(lengths)
    mat = fact.get_csr_matrix()
    # coocs[i, j] == 0 iff every token of term j also appears in term i.
    coocs = lengths - (mat * mat.T)
    pairs = np.argwhere(coocs == 0).T
    # Keep non-identical substring pairs whose shorter term is never more
    # frequent than the longer term in any category.
    pairs = pairs.T[(pairs[0] != pairs[1])]
    pairs = pairs[np.array([terms[i[1]] in terms[i[0]] for i in pairs])]
    pairs = pairs[np.all(tdf_vals[pairs[:, 1]] <= tdf_vals[pairs[:, 0]],
                         axis=1)]
    idx_store = self.term_doc_matrix._term_idx_store
    redundant_terms = idx_store.getidxstrictbatch(terms[np.unique(
        pairs[:, 1])])
    infrequent_terms = np.argwhere(~valid_terms_mask).T[0]
    terms_to_remove = np.concatenate([redundant_terms, infrequent_terms])
    return terms_to_remove
Example #5
	def _get_term_indices_to_compact_from_term_freqs(self, term_freqs, term_doc_matrix):
		idx = IndexStore()
		tdf_vals = term_freqs.values
		valid_terms_mask = tdf_vals.sum(axis=1) >= self.minimum_term_count
		tdf_vals = term_freqs[valid_terms_mask].values
		terms = np.array(term_freqs.index)[valid_terms_mask]

		lengths = []
		fact = CSRMatrixFactory()
		for i, t in enumerate(terms):
			for tok in t.split():
				fact[i, idx.getidx(tok)] = 1
			lengths.append(len(t.split()))
		lengths = np.array(lengths)
		mat = fact.get_csr_matrix()

		coocs = lengths - (mat * mat.T)
		pairs = np.argwhere(coocs == 0).T
		pairs = self._limit_to_non_identical_terms(pairs)
		pairs = self._limit_to_pairs_of_bigrams_and_a_constituent_unigram(pairs, terms)
		pairs = self._limit_to_redundant_unigrams(pairs, tdf_vals)
		idx_store = term_doc_matrix._term_idx_store
		redundant_terms = idx_store.getidxstrictbatch(terms[np.unique(pairs[:, 1])])
		infrequent_terms = np.argwhere(~valid_terms_mask).T[0]
		terms_to_remove = np.concatenate([redundant_terms, infrequent_terms])
		return terms_to_remove
Example #6
	def test_build(self):
		from sklearn.feature_extraction.text import CountVectorizer
		categories, docs = get_docs_categories_semiotic()
		idx_store = IndexStore()
		y = np.array([idx_store.getidx(c) for c in categories])
		count_vectorizer = CountVectorizer()
		X_counts = count_vectorizer.fit_transform(docs)
		term_doc_mat = TermDocMatrixFromScikit(
			X=X_counts,
			y=y,
			feature_vocabulary=count_vectorizer.vocabulary_,
			category_names=idx_store.values()).build()
		self.assertEqual(term_doc_mat.get_categories()[:2], ['hamlet', 'jay-z/r. kelly'])
		self.assertEqual(term_doc_mat
		                 .get_term_freq_df()
		                 .assign(score=term_doc_mat.get_scaled_f_scores('hamlet'))
		                 .sort_values(by='score', ascending=False).index.tolist()[:5],
		                 ['that', 'march', 'did', 'majesty', 'sometimes'])