def _get_term_indices_to_compact_from_term_freqs(self, term_freqs, term_doc_matrix, non_text):
    idx = IndexStore()
    tdf_vals = term_freqs.values
    # Mark terms which occur at least minimum_term_count times across all categories.
    valid_terms_mask = tdf_vals.sum(axis=1) >= self.minimum_term_count
    tdf_vals = term_freqs[valid_terms_mask].values
    terms = np.array(term_freqs.index)[valid_terms_mask]

    # Build a binary term-by-token indicator matrix, recording each term's token count.
    lengths = []
    fact = CSRMatrixFactory()
    for i, t in enumerate(terms):
        for tok in t.split():
            fact[i, idx.getidx(tok)] = 1
        lengths.append(len(t.split()))
    lengths = np.array(lengths)
    mat = fact.get_csr_matrix()

    # (mat * mat.T)[i, j] counts tokens shared by terms i and j, so the difference
    # is zero exactly when every token of term j also occurs in term i.
    coocs = lengths - (mat * mat.T)
    pairs = np.argwhere(coocs == 0).T
    pairs = self._limit_to_non_identical_terms(pairs)
    pairs = self._limit_to_pairs_of_bigrams_and_a_constituent_unigram(pairs, terms)
    pairs = self._limit_to_redundant_unigrams(pairs, tdf_vals)
    idx_store = term_doc_matrix._get_relevant_idx_store(non_text)
    redundant_terms = idx_store.getidxstrictbatch(terms[np.unique(pairs[:, 1])])
    infrequent_terms = np.argwhere(~valid_terms_mask).T[0]
    terms_to_remove = np.concatenate([redundant_terms, infrequent_terms])
    return terms_to_remove
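# A minimal standalone sketch (plain numpy/scipy in place of CSRMatrixFactory) of the
# containment test above: with a binary term-by-token matrix, (mat * mat.T)[i, j]
# counts the tokens shared by terms i and j, so lengths[j] - (mat * mat.T)[i, j] == 0
# exactly when every token of term j occurs in term i. The terms below are invented.
import numpy as np
from scipy.sparse import csr_matrix

terms = np.array(['the big', 'big', 'dog'])
token_idx = {tok: k for k, tok in enumerate(sorted({tok for t in terms for tok in t.split()}))}
rows, cols = zip(*[(i, token_idx[tok]) for i, t in enumerate(terms) for tok in t.split()])
mat = csr_matrix((np.ones(len(rows)), (rows, cols)), shape=(len(terms), len(token_idx)))
lengths = np.array([len(t.split()) for t in terms])
coocs = lengths - (mat * mat.T).toarray()
print(np.argwhere(coocs == 0))  # includes [0 1]: 'big' is contained in 'the big'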
def use_external_metadata_lists(self, metadata_lists):
    '''
    Takes a list of string lists. Each list contains the metadata to
    associate with its corresponding document.

    :param metadata_lists: List[List[str]]
    :return: new TermDocMatrix
    '''
    metadata_index_store = IndexStore()
    metadata_csr_factory = CSRMatrixFactory()
    assert len(metadata_lists) == self.get_num_docs()
    for doc_i, metadata_list in enumerate(metadata_lists):
        for metadatum in metadata_list:
            metadata_csr_factory[doc_i, metadata_index_store.getidx(metadatum)] = 1
    return self._make_new_term_doc_matrix(
        new_mX=metadata_csr_factory.get_csr_matrix(dtype=int),
        new_metadata_idx_store=metadata_index_store,
        new_y_mask=self._y == self._y)
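# A hedged, self-contained sketch of the core of use_external_metadata_lists: an
# IndexStore assigns each distinct metadata string a column, and a CSRMatrixFactory
# builds the document-by-metadata indicator matrix. The import paths assume
# scattertext's module layout.
from scattertext.indexstore import IndexStore
from scattertext.CSRMatrixTools import CSRMatrixFactory

metadata_lists = [['source:blog', 'year:2019'], ['source:news']]
metadata_index_store = IndexStore()
metadata_csr_factory = CSRMatrixFactory()
for doc_i, metadata_list in enumerate(metadata_lists):
    for metadatum in metadata_list:
        metadata_csr_factory[doc_i, metadata_index_store.getidx(metadatum)] = 1
mX = metadata_csr_factory.get_csr_matrix(dtype=int)  # 2 docs x 3 metadata columns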
def _get_term_indices_to_compact_from_term_freqs(self, term_freqs, term_doc_matrix):
    idx = IndexStore()
    tdf_vals = term_freqs.values
    valid_terms_mask = tdf_vals.sum(axis=1) >= self.minimum_term_count
    tdf_vals = term_freqs[valid_terms_mask].values
    terms = np.array(term_freqs.index)[valid_terms_mask]
    lengths = []
    fact = CSRMatrixFactory()
    for i, t in enumerate(terms):
        for tok in t.split():
            fact[i, idx.getidx(tok)] = 1
        lengths.append(len(t.split()))
    lengths = np.array(lengths)
    mat = fact.get_csr_matrix()
    coocs = lengths - (mat * mat.T)
    pairs = np.argwhere(coocs == 0).T
    pairs = self._limit_to_non_identical_terms(pairs)
    pairs = self._limit_to_pairs_of_bigrams_and_a_constituent_unigram(pairs, terms)
    pairs = self._limit_to_redundant_unigrams(pairs, tdf_vals)
    idx_store = term_doc_matrix._term_idx_store
    redundant_terms = idx_store.getidxstrictbatch(terms[np.unique(pairs[:, 1])])
    infrequent_terms = np.argwhere(~valid_terms_mask).T[0]
    terms_to_remove = np.concatenate([redundant_terms, infrequent_terms])
    return terms_to_remove
def _get_term_indices_to_compact_from_term_freqs(self, term_freqs):
    fact = CSRMatrixFactory()
    idx = IndexStore()
    tdf_vals = term_freqs.values
    valid_terms_mask = tdf_vals.sum(axis=1) >= self.minimum_term_count
    tdf_vals = term_freqs[valid_terms_mask].values
    terms = np.array(term_freqs.index)[valid_terms_mask]
    lengths = []
    for i, t in enumerate(terms):
        for tok in t.split():
            fact[i, idx.getidx(tok)] = 1
        lengths.append(len(t.split()))
    lengths = np.array(lengths)
    mat = fact.get_csr_matrix()
    coocs = lengths - (mat * mat.T)
    pairs = np.argwhere(coocs == 0).T
    # Drop self-pairs, then keep only pairs where the second term is a substring
    # of the first and its per-category counts never exceed the first's.
    pairs = pairs.T[(pairs[0] != pairs[1])]
    pairs = pairs[np.array([terms[i[1]] in terms[i[0]] for i in pairs])]
    pairs = pairs[np.all(tdf_vals[pairs[:, 1]] <= tdf_vals[pairs[:, 0]], axis=1)]
    idx_store = self.term_doc_matrix._term_idx_store
    redundant_terms = idx_store.getidxstrictbatch(terms[np.unique(pairs[:, 1])])
    infrequent_terms = np.argwhere(~valid_terms_mask).T[0]
    terms_to_remove = np.concatenate([redundant_terms, infrequent_terms])
    return terms_to_remove
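# A small sketch of the final frequency filter above: a contained term is only marked
# redundant when its per-category counts never exceed those of the containing term.
# The counts below are invented.
import numpy as np

tdf_vals = np.array([[10, 4],    # 0: containing bigram
                     [10, 4],    # 1: dominated unigram -> redundant
                     [12, 4]])   # 2: exceeds the bigram in one category -> kept
pairs = np.array([[0, 1], [0, 2]])
keep = np.all(tdf_vals[pairs[:, 1]] <= tdf_vals[pairs[:, 0]], axis=1)
print(pairs[keep])  # [[0 1]] -- only the dominated pair survives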
def __init__(self,
             X: csr_matrix,
             term_vocabulary: List[str],
             mX: Optional[csr_matrix] = None,
             y: Optional[np.ndarray] = None,
             category_names: Optional[List[str]] = None,
             metadata_vocabulary: Optional[List[str]] = None,
             text_df: Optional[pd.DataFrame] = None,
             text_col: Optional[str] = None,
             parsed_col: Optional[str] = None,
             category_col: Optional[str] = None,
             unigram_frequency_path: Optional[str] = None):
    '''
    Parameters
    ----------
    X: csr_matrix; term-document frequency matrix; columns represent terms
        and rows documents
    term_vocabulary: List[str]; each entry corresponds to a term
    mX: Optional[csr_matrix]; metadata csr matrix
    y: Optional[np.ndarray[int]]; indices of category names for each document
    category_names: Optional[List[str]]; names of categories for y
    metadata_vocabulary: Optional[List[str]]; each entry corresponds to a
        metadata column of mX
    text_df: Optional[pd.DataFrame]; contains a column with the raw document text
    text_col: Optional[str]; name of the column containing the text of each document
    parsed_col: Optional[str]; name of the column containing the parsed text of
        each document
    category_col: Optional[str]; name of the column containing each document's category
    unigram_frequency_path: Optional[str] (see TermDocMatrix)
    '''
    self.X = X
    self.term_idx_store = IndexStoreFromList.build(term_vocabulary)
    assert self.X.shape[1] == len(term_vocabulary)
    if y is None:
        self.y = np.zeros(self.X.shape[0], dtype=int)
        self.category_idx_store = IndexStoreFromList.build(['_'])
        assert category_names is None
    else:
        self.y = y
        assert len(category_names) == len(set(y))
        self.category_idx_store = IndexStoreFromList.build(category_names)
    if metadata_vocabulary is not None:
        assert mX.shape[1] == len(metadata_vocabulary)
        self.mX = mX
        self.metadata_idx_store = IndexStoreFromList.build(metadata_vocabulary)
    else:
        assert mX is None
        self.mX = csr_matrix((0, 0))
        self.metadata_idx_store = IndexStore()
    self.text_df = text_df
    if parsed_col is not None:
        assert parsed_col in text_df
    if text_col is not None:
        assert text_col in text_df
    if category_col is not None:
        assert category_col in text_df
    self.category_col = category_col
    self.text_col = text_col
    self.parsed_col = parsed_col
    self.unigram_frequency_path = unigram_frequency_path
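# A hedged construction sketch; `DocumentMatrix` is a stand-in name for whichever
# class owns the __init__ above, and the documents and vocabulary are invented.
# Two documents, three terms, no categories or metadata.
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix

dm = DocumentMatrix(
    X=csr_matrix(np.array([[1, 0, 2], [0, 3, 1]])),
    term_vocabulary=['cat', 'dog', 'fish'],
    text_df=pd.DataFrame({'text': ['cat fish fish', 'dog dog dog fish']}),
    text_col='text')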
def test_add_metadata(self):
    hamlet = get_hamlet_term_doc_matrix()
    meta_index_store = IndexStore()
    meta_fact = CSRMatrixFactory()
    for i in range(hamlet.get_num_docs()):
        meta_fact[i, i] = meta_index_store.getidx(str(i))
    other_hamlet = hamlet.add_metadata(meta_fact.get_csr_matrix(), meta_index_store)
    assert other_hamlet != hamlet

    meta_index_store = IndexStore()
    meta_fact = CSRMatrixFactory()
    for i in range(hamlet.get_num_docs() - 5):
        meta_fact[i, i] = meta_index_store.getidx(str(i))
    with self.assertRaises(AssertionError):
        hamlet.add_metadata(meta_fact.get_csr_matrix(), meta_index_store)
def build(term_to_index_dict):
    '''
    Parameters
    ----------
    term_to_index_dict: dict; term -> idx dictionary

    Returns
    -------
    IndexStore
    '''
    idxstore = IndexStore()
    idxstore._val2i = term_to_index_dict
    idxstore._next_i = len(term_to_index_dict)
    idxstore._i2val = [None for _ in range(idxstore._next_i)]
    for term, idx in idxstore._val2i.items():
        idxstore._i2val[idx] = term
    return idxstore
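# A quick round-trip sketch, assuming the build above is IndexStoreFromDict.build and
# that the store exposes scattertext's getidx/getval accessors: the dict's indices
# become positions in the store's value list, so the two lookups invert one another.
from scattertext.indexstore import IndexStoreFromDict

store = IndexStoreFromDict.build({'cat': 0, 'fish': 1, 'dog': 2})
assert store.getval(2) == 'dog'
assert store.getidx('fish') == 1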
def test_build(self):
    from sklearn.feature_extraction.text import CountVectorizer
    categories, docs = get_docs_categories_semiotic()
    idx_store = IndexStore()
    y = np.array([idx_store.getidx(c) for c in categories])
    count_vectorizer = CountVectorizer()
    X_counts = count_vectorizer.fit_transform(docs)
    term_doc_mat = TermDocMatrixFromScikit(
        X=X_counts,
        y=y,
        feature_vocabulary=count_vectorizer.vocabulary_,
        category_names=idx_store.values()).build()
    self.assertEqual(term_doc_mat.get_categories()[:2],
                     ['hamlet', 'jay-z/r. kelly'])
    self.assertEqual(term_doc_mat
                     .get_term_freq_df()
                     .assign(score=term_doc_mat.get_scaled_f_scores('hamlet'))
                     .sort_values(by='score', ascending=False).index.tolist()[:5],
                     ['that', 'march', 'did', 'majesty', 'sometimes'])
def _get_build_kwargs(self):
    constructor_kwargs = {
        'X': csr_matrix(self.term_freq_df.values.T),
        'mX': csr_matrix((0, 0)),
        'y': np.array(range(len(self.term_freq_df.columns))),
        'term_idx_store': IndexStoreFromList.build(self.term_freq_df.index.values),
        'metadata_idx_store': IndexStore(),
        'category_idx_store': IndexStoreFromList.build(self.term_freq_df.columns),
        'unigram_frequency_path': self.unigram_frequency_path
    }
    return constructor_kwargs
def _get_build_kwargs(self):
    constructor_kwargs = {
        'X': self.X,
        'mX': csr_matrix((0, 0)),
        'y': self.y,
        'term_idx_store': IndexStoreFromDict.build(self.feature_vocabulary),
        'metadata_idx_store': IndexStore(),
        'category_idx_store': IndexStoreFromList.build(self.category_names),
        'unigram_frequency_path': self.unigram_frequency_path
    }
    return constructor_kwargs
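# Presumably both _get_build_kwargs variants above feed the TermDocMatrix
# constructor; a hedged sketch of the corresponding build() call site:
def build(self):
    constructor_kwargs = self._get_build_kwargs()
    return TermDocMatrix(**constructor_kwargs)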