def __init__(self, df, category_col, parsed_col, feats_from_spacy_doc=None):
    '''
    Store the inputs and create the empty index stores and matrix
    factories used later when the corpus is built.

    Parameters
    ----------
    df : pd.DataFrame
        contains category_col and parsed_col, where parsed_col is
        entirely spacy docs
    category_col : str
        name of category column in df
    parsed_col : str
        name of spacy parsed column in df
    feats_from_spacy_doc : FeatsFromSpacyDoc, optional
        feature extractor; when omitted (or None) a new
        FeatsFromSpacyDoc() is created for this instance
    '''
    if feats_from_spacy_doc is None:
        # Create the default here rather than in the signature, so that
        # instances do not all share one object built at definition time.
        feats_from_spacy_doc = FeatsFromSpacyDoc()
    self._df = df.reset_index()
    self._category_col = category_col
    self._parsed_col = parsed_col
    self._category_idx_store = IndexStore()
    self._X_factory = CSRMatrixFactory()
    self._mX_factory = CSRMatrixFactory()
    self._term_idx_store = IndexStore()
    self._metadata_idx_store = IndexStore()
    self._feats_from_spacy_doc = feats_from_spacy_doc
class CorpusFromParsedDocuments(object):
    '''Builds a ParsedCorpus from a data frame holding pre-parsed documents.'''

    def __init__(self, df, category_col, parsed_col, feats_from_spacy_doc=None):
        '''
        Parameters
        ----------
        df : pd.DataFrame
            contains category_col and parsed_col, where parsed_col is
            entirely spacy docs
        category_col : str
            name of category column in df
        parsed_col : str
            name of spacy parsed column in df
        feats_from_spacy_doc : FeatsFromSpacyDoc, optional
            feature extractor; when omitted (or None) a new
            FeatsFromSpacyDoc() is created for this instance
        '''
        if feats_from_spacy_doc is None:
            # Create the default here rather than in the signature, so that
            # instances do not all share one object built at definition time.
            feats_from_spacy_doc = FeatsFromSpacyDoc()
        # reset_index() gives the frame a fresh default index; row.name is
        # used as the matrix row index in _add_to_x_factory.
        self._df = df.reset_index()
        self._category_col = category_col
        self._parsed_col = parsed_col
        self._category_idx_store = IndexStore()
        self._X_factory = CSRMatrixFactory()
        self._mX_factory = CSRMatrixFactory()
        self._term_idx_store = IndexStore()
        self._metadata_idx_store = IndexStore()
        self._feats_from_spacy_doc = feats_from_spacy_doc

    def build(self):
        '''Constructs the term doc matrix.

        Returns
        -------
        scattertext.ParsedCorpus.ParsedCorpus
        '''
        self._y = self._get_y_and_populate_category_idx_store()
        # apply() is called only for its per-row side effects on the two
        # matrix factories; its return value is discarded.
        self._df.apply(self._add_to_x_factory, axis=1)
        last_row_idx = len(self._y) - 1
        self._X = self._X_factory.set_last_row_idx(last_row_idx).get_csr_matrix()
        self._mX = self._mX_factory.set_last_row_idx(last_row_idx).get_csr_matrix()
        return ParsedCorpus(self._df,
                            self._X,
                            self._mX,
                            self._y,
                            self._term_idx_store,
                            self._category_idx_store,
                            self._metadata_idx_store,
                            self._parsed_col,
                            self._category_col)

    def _get_y_and_populate_category_idx_store(self):
        '''
        Map every value of the category column through the category index
        store and return the resulting indices.

        Returns
        -------
        np.array
        '''
        return np.array(self._df[self._category_col].apply(
            self._category_idx_store.getidx))

    def _add_to_x_factory(self, row):
        '''
        Record one document's term counts and metadata values in the
        matrix factories, at the row given by row.name.

        Parameters
        ----------
        row : pd.Series
            one row of the data frame
        '''
        parsed_text = row[self._parsed_col]
        for term, count in self._feats_from_spacy_doc.get_feats(
                parsed_text).items():
            term_idx = self._term_idx_store.getidx(term)
            self._X_factory[row.name, term_idx] = count
        for meta, val in self._feats_from_spacy_doc.get_doc_metadata(
                parsed_text).items():
            meta_idx = self._metadata_idx_store.getidx(meta)
            self._mX_factory[row.name, meta_idx] = val
def init_term_doc_matrix_variables():
    '''
    Create the empty containers used while building a term-document matrix.

    Returns
    -------
    tuple
        (X_factory, mX_factory, category_idx_store, term_idx_store,
        metadata_idx_store, y), where the factories are new
        CSRMatrixFactory objects, the stores are new IndexStore objects
        and y is an empty list.
    '''
    # X_factory, mX_factory
    factories = CSRMatrixFactory(), CSRMatrixFactory()
    # category_idx_store, term_idx_store, metadata_idx_store
    idx_stores = IndexStore(), IndexStore(), IndexStore()
    return factories + idx_stores + ([],)
def test_main(self):
    '''Exercise the basic IndexStore lookups, membership tests and listing.'''
    store = IndexStore()
    # The test expects indices in first-seen order, and the same index
    # again when a value is repeated.
    for val, expected_idx in (('a', 0), ('b', 1), ('a', 0)):
        self.assertEqual(store.getidx(val), expected_idx)
    for idx, expected_val in enumerate(('a', 'b')):
        self.assertEqual(store.getval(idx), expected_val)
    self.assertIn('a', store)
    self.assertNotIn('c', store)
    self.assertEqual(set(store.values()), {'a', 'b'})
    # `in` is expected to test values, not indices; indices go through hasidx.
    self.assertNotIn(0, store)
    self.assertTrue(store.hasidx(0))
    self.assertFalse(store.hasidx(2))
    self.assertEqual(store.getnumvals(), 2)
    self.assertEqual(list(store.items()), [(0, 'a'), (1, 'b')])
def test_getidxstrict(self):
    '''getidxstrict is expected to raise KeyError for a value never added.'''
    store = IndexStore()
    for val, expected_idx in (('a', 0), ('b', 1), ('a', 0)):
        self.assertEqual(store.getidx(val), expected_idx)
    self.assertRaises(KeyError, store.getidxstrict, 'c')
def _build_from_category_spacy_doc_iter(self, category_doc_iter):
    '''
    Build a TermDocMatrix from (category, parsed document) pairs, using
    new, empty index stores for terms, categories and metadata.

    Parameters
    ----------
    category_doc_iter : iterator of (string category name, spacy.tokens.doc.Doc) pairs

    Returns
    -------
    t : TermDocMatrix
    '''
    stores = dict(term_idx_store=IndexStore(),
                  category_idx_store=IndexStore(),
                  metadata_idx_store=IndexStore())
    # The helper takes the stores positionally: category, term, metadata.
    X, mX, y = self._get_features_and_labels_from_documents_and_indexes(
        category_doc_iter,
        stores['category_idx_store'],
        stores['term_idx_store'],
        stores['metadata_idx_store'])
    return TermDocMatrix(X, mX, y, **stores)
def test_batch_delete(self):
    '''
    Check batch deletion by value and by index: unknown entries are
    expected to raise, and the store deleted from must stay unchanged.
    '''
    store = IndexStore()
    for expected_idx, val in enumerate('abcd'):
        self.assertEqual(store.getidx(val), expected_idx)

    # 'e' was never added to the store.
    with self.assertRaises(KeyError):
        store.batch_delete_vals(['e', 'c'])
    pruned = store.batch_delete_vals(['b', 'c'])
    # Two values remain in the result, so the next new values get 2 and 3.
    for val, expected_idx in (('a', 0), ('c', 2), ('e', 3)):
        self.assertEqual(pruned.getidx(val), expected_idx)

    # The store that was deleted from still maps every value as before.
    for val, expected_idx in (('d', 3), ('c', 2), ('b', 1), ('a', 0)):
        self.assertEqual(store.getidx(val), expected_idx)

    # Index 5 does not exist in the store.
    with self.assertRaises(ValueError):
        store.batch_delete_idx([5, 1])
    pruned = store.batch_delete_idx([2, 1])
    for val, expected_idx in (('a', 0), ('c', 2), ('e', 3)):
        self.assertEqual(pruned.getidx(val), expected_idx)
def test_batch_delete_extra(self):
    '''
    After deleting values, the remaining ones are expected to be
    renumbered contiguously; deleting nothing should change nothing.
    '''
    store = IndexStore()
    for expected_idx, val in enumerate('abcdef'):
        self.assertEqual(store.getidx(val), expected_idx)
    without_b_e = store.batch_delete_vals(['b', 'e'])
    self.assertEqual(list(without_b_e.items()), list(enumerate('acdf')))
    after_empty_delete = without_b_e.batch_delete_vals([])
    self.assertEqual(list(without_b_e.items()),
                     list(after_empty_delete.items()))
def build_from_category_whitespace_delimited_text(category_text_iter):
    '''
    Build a TermDocMatrix of unigram and bigram counts from raw text.

    Each text has string.punctuation stripped from its two ends, is
    lower-cased, and is split into sentences on newlines; each sentence is
    split into tokens on whitespace. Bigrams are formed within a sentence
    only. No metadata is extracted, so the metadata matrix factory and
    metadata index store are passed on empty.

    Parameters
    ----------
    category_text_iter
        iterator of (string category name, one line per sentence,
        whitespace-delimited text) pairs

    Returns
    -------
    TermDocMatrix
    '''
    y = []
    X_factory = CSRMatrixFactory()
    term_idx_store = IndexStore()
    category_idx_store = IndexStore()
    mX_factory = CSRMatrixFactory()
    metadata_idx_store = IndexStore()
    for doci, (category, text) in enumerate(category_text_iter):
        y.append(category_idx_store.getidx(category))
        # Counts are keyed by term index and accumulated over the whole
        # document before being written to the matrix factory.
        term_freq = Counter()
        for sent in text.strip(string.punctuation).lower().split('\n'):
            unigrams = sent.strip().split()
            bigrams = [' '.join(pair) for pair in zip(unigrams, unigrams[1:])]
            for term in unigrams + bigrams:
                term_freq[term_idx_store.getidx(term)] += 1
        for word_idx, freq in term_freq.items():
            X_factory[doci, word_idx] = freq
    return TermDocMatrix(X=X_factory.get_csr_matrix(),
                         mX=mX_factory.get_csr_matrix(),
                         y=np.array(y),
                         term_idx_store=term_idx_store,
                         metadata_idx_store=metadata_idx_store,
                         category_idx_store=category_idx_store)
def term_group_freq_df(self, group_col):
    # type: (str) -> pd.DataFrame
    '''
    Returns a dataframe indexed on the number of groups a term occurred in.

    The term-document matrix is first re-keyed by the group rows returned
    from _get_group_docids_and_index_store, its positive entries are
    turned into ones, and the result is then re-keyed by category.

    Parameters
    ----------
    group_col : str
        passed through to _get_group_docids_and_index_store; presumably
        the name of the data frame column holding each document's group
        (confirm against that helper)

    Returns
    -------
    pd.DataFrame
    '''
    group_idx_store = IndexStore()
    X = self._X
    group_idx_to_cat_idx, row_group_cat \
        = self._get_group_docids_and_index_store(X, group_col, group_idx_store)
    newX = self._change_document_type_in_matrix(X, row_group_cat)
    newX = self._make_all_positive_data_ones(newX)
    group_row = newX.tocoo().row
    # Match against the untouched group rows and write into a copy.
    # Relabelling in place could rewrite a row twice when a category
    # index written earlier equals a group index handled later.
    category_row = group_row.copy()
    for group_idx, cat_idx in group_idx_to_cat_idx.items():
        category_row[group_row == group_idx] = cat_idx
    catX = self._change_document_type_in_matrix(newX, category_row)
    return self._term_freq_df_from_matrix(catX)