def test_getidxstrict(self):
    '''Check that getidxstrict raises KeyError for a value never indexed.'''
    store = IndexStore()
    # Populate through getidx; asking for 'a' a second time must give back
    # the index it was assigned the first time.
    for value, expected_idx in (('a', 0), ('b', 1), ('a', 0)):
        self.assertEqual(store.getidx(value), expected_idx)
    # 'c' was never passed to getidx, so the strict lookup is expected to fail.
    self.assertRaises(KeyError, store.getidxstrict, 'c')
class CorpusFromParsedDocuments(object):
    '''Builds a ParsedCorpus from a data frame holding categories and parsed documents.'''

    def __init__(self,
                 df,
                 category_col,
                 parsed_col,
                 feats_from_spacy_doc=None):
        '''
        Parameters
        ----------
        df : pd.DataFrame
            contains category_col and parsed_col, where parsed_col is
            entirely spacy docs
        category_col : str
            name of category column in df
        parsed_col : str
            name of spacy parsed column in df
        feats_from_spacy_doc : FeatsFromSpacyDoc, optional
            feature extractor; a new FeatsFromSpacyDoc() is created for
            each instance when this is omitted or None
        '''
        # A default written as `FeatsFromSpacyDoc()` in the signature is
        # evaluated once, at definition time, and would then be shared by
        # every builder.  Create it per instance instead.
        if feats_from_spacy_doc is None:
            feats_from_spacy_doc = FeatsFromSpacyDoc()
        # reset_index() so that each row's label (row.name) is its 0-based
        # position, which _add_to_x_factory uses as the matrix row.
        self._df = df.reset_index()
        self._category_col = category_col
        self._parsed_col = parsed_col
        self._category_idx_store = IndexStore()
        self._X_factory = CSRMatrixFactory()
        self._mX_factory = CSRMatrixFactory()
        self._term_idx_store = IndexStore()
        self._metadata_idx_store = IndexStore()
        self._feats_from_spacy_doc = feats_from_spacy_doc

    def build(self):
        '''Constructs the term doc matrix.

        Returns
        -------
        scattertext.ParsedCorpus.ParsedCorpus
        '''
        self._y = self._get_y_and_populate_category_idx_store()
        # Called for its side effects only: every row adds its term and
        # metadata entries to the two matrix factories.
        self._df.apply(self._add_to_x_factory, axis=1)
        # Both matrices are told the index of the last document row before
        # being materialised.
        last_row_idx = len(self._y) - 1
        self._X = self._X_factory.set_last_row_idx(last_row_idx).get_csr_matrix()
        self._mX = self._mX_factory.set_last_row_idx(last_row_idx).get_csr_matrix()
        return ParsedCorpus(self._df,
                            self._X,
                            self._mX,
                            self._y,
                            self._term_idx_store,
                            self._category_idx_store,
                            self._metadata_idx_store,
                            self._parsed_col,
                            self._category_col)

    def _get_y_and_populate_category_idx_store(self):
        '''Maps each row's category to its index in the category index store.

        Returns
        -------
        np.array
            one category index per row of the data frame
        '''
        return np.array(self._df[self._category_col].apply(
            self._category_idx_store.getidx))

    def _add_to_x_factory(self, row):
        '''Adds one row's term counts and document metadata to the factories.

        Parameters
        ----------
        row : pd.Series
            a row of the data frame; row.name is used as the matrix row index
        '''
        parsed_text = row[self._parsed_col]
        term_counts = self._feats_from_spacy_doc.get_feats(parsed_text)
        for term, count in term_counts.items():
            term_idx = self._term_idx_store.getidx(term)
            self._X_factory[row.name, term_idx] = count
        doc_metadata = self._feats_from_spacy_doc.get_doc_metadata(parsed_text)
        for meta, val in doc_metadata.items():
            meta_idx = self._metadata_idx_store.getidx(meta)
            self._mX_factory[row.name, meta_idx] = val
def test_main(self):
    '''Exercise getidx, getval, membership, values, hasidx, getnumvals and items.'''
    store = IndexStore()
    # Indices are handed out in insertion order; a repeated value keeps its index.
    for value, expected_idx in (('a', 0), ('b', 1), ('a', 0)):
        self.assertEqual(store.getidx(value), expected_idx)
    # getval is the reverse lookup, from index to value.
    for idx, expected_value in enumerate(('a', 'b')):
        self.assertEqual(store.getval(idx), expected_value)
    # Membership is tested against values, not indices ...
    self.assertIn('a', store)
    self.assertNotIn('c', store)
    self.assertEqual(set(store.values()), {'a', 'b'})
    self.assertNotIn(0, store)
    # ... while hasidx is the check for indices.
    self.assertTrue(store.hasidx(0))
    self.assertFalse(store.hasidx(2))
    self.assertEqual(store.getnumvals(), 2)
    self.assertEqual(list(store.items()), [(0, 'a'), (1, 'b')])
def test_batch_delete_extra(self):
    '''Check batch_delete_vals re-numbers survivors and accepts an empty list.'''
    store = IndexStore()
    for expected_idx, value in enumerate('abcdef'):
        self.assertEqual(store.getidx(value), expected_idx)

    # Dropping 'b' and 'e' should leave the other values, numbered 0..3.
    pruned = store.batch_delete_vals(['b', 'e'])
    self.assertEqual(list(pruned.items()),
                     [(0, 'a'), (1, 'c'), (2, 'd'), (3, 'f')])

    # Deleting nothing should produce a store with the same items.
    untouched = pruned.batch_delete_vals([])
    self.assertEqual(list(pruned.items()), list(untouched.items()))
def test_batch_delete(self):
    '''Check batch_delete_vals / batch_delete_idx results and error cases.'''
    store = IndexStore()
    for expected_idx, value in enumerate('abcd'):
        self.assertEqual(store.getidx(value), expected_idx)

    # The order of the getidx checks below matters: getidx on a value that is
    # absent from the store under test assigns it the next index.
    after_delete = (('a', 0), ('c', 2), ('e', 3))

    # 'e' was never added to the store, so deleting it by value must fail.
    with self.assertRaises(KeyError):
        store.batch_delete_vals(['e', 'c'])
    by_value = store.batch_delete_vals(['b', 'c'])
    for value, expected_idx in after_delete:
        self.assertEqual(by_value.getidx(value), expected_idx)

    # The original store must still map every value to its first index.
    for value, expected_idx in (('d', 3), ('c', 2), ('b', 1), ('a', 0)):
        self.assertEqual(store.getidx(value), expected_idx)

    # Only indices 0..3 were assigned, so index 5 must be rejected.
    with self.assertRaises(ValueError):
        store.batch_delete_idx([5, 1])
    by_index = store.batch_delete_idx([2, 1])
    for value, expected_idx in after_delete:
        self.assertEqual(by_index.getidx(value), expected_idx)
def build_from_category_whitespace_delimited_text(category_text_iter):
    '''
    Builds a TermDocMatrix of unigram and bigram counts from text that is
    already tokenized by whitespace.

    Parameters
    ----------
    category_text_iter
        iterator of (string category name, one line per sentence,
        whitespace-delimited text) pairs

    Returns
    -------
    TermDocMatrix
    '''
    y = []
    X_factory = CSRMatrixFactory()
    mX_factory = CSRMatrixFactory()
    term_idx_store = IndexStore()
    category_idx_store = IndexStore()
    # No metadata is extracted here, so this store and mX_factory stay empty.
    metadata_idx_store = IndexStore()

    for doci, (category, text) in enumerate(category_text_iter):
        y.append(category_idx_store.getidx(category))
        term_freq = Counter()
        # NOTE: str.strip(chars) only trims punctuation from the two ends of
        # the whole text; punctuation inside the text is kept.
        for sent in text.strip(string.punctuation).lower().split('\n'):
            unigrams = sent.split()
            # Bigrams never span a sentence (line) boundary.
            bigrams = [' '.join(pair) for pair in zip(unigrams, unigrams[1:])]
            for term in unigrams + bigrams:
                term_freq[term_idx_store.getidx(term)] += 1
        for word_idx, freq in term_freq.items():
            X_factory[doci, word_idx] = freq

    if y:
        # Same pattern as CorpusFromParsedDocuments.build: tell both
        # factories the last document row, so documents that contributed no
        # entries (trailing empty documents, and every row of the metadata
        # matrix) are still counted.
        # NOTE(review): assumes set_last_row_idx pads the matrix out to that
        # row -- confirm against CSRMatrixFactory.
        last_row_idx = len(y) - 1
        X = X_factory.set_last_row_idx(last_row_idx).get_csr_matrix()
        mX = mX_factory.set_last_row_idx(last_row_idx).get_csr_matrix()
    else:
        # No documents at all: keep the factories' default shape.
        X = X_factory.get_csr_matrix()
        mX = mX_factory.get_csr_matrix()

    return TermDocMatrix(X=X,
                         mX=mX,
                         y=np.array(y),
                         term_idx_store=term_idx_store,
                         metadata_idx_store=metadata_idx_store,
                         category_idx_store=category_idx_store)