def setUp(cls):
    """Build the fixtures: a category/text DataFrame and the corpus made from it.

    Stores the DataFrame on ``cls.df`` and the result of
    ``CorpusFromPandas(...).build()`` on ``cls.corpus``; ``whitespace_nlp``
    is passed as the ``nlp`` callable.
    """
    labels, texts = get_docs_categories()
    frame = pd.DataFrame({'category': labels, 'text': texts})
    cls.df = frame
    factory = CorpusFromPandas(frame, 'category', 'text', nlp=whitespace_nlp)
    cls.corpus = factory.build()
def setUp(cls):
    """Create ``cls.df`` (category and text columns) and ``cls.corpus``.

    The corpus is produced by ``CorpusFromPandas`` from the 'category' and
    'text' columns, with ``whitespace_nlp`` supplied as the ``nlp`` argument.
    """
    doc_categories, doc_texts = get_docs_categories()
    columns = {'category': doc_categories, 'text': doc_texts}
    cls.df = pd.DataFrame(columns)
    corpus_factory = CorpusFromPandas(
        cls.df, 'category', 'text', nlp=whitespace_nlp
    )
    cls.corpus = corpus_factory.build()
def setUp(cls):
    """Prepare parsed documents and a ``CorpusFromParsedDocuments`` factory.

    Sets ``cls.categories``, ``cls.documents``, ``cls.parsed_docs`` (each
    document run through ``whitespace_nlp``), ``cls.df`` and
    ``cls.corpus_fact``.
    """
    cls.categories, cls.documents = get_docs_categories()
    cls.parsed_docs = [whitespace_nlp(text) for text in cls.documents]
    cls.df = pd.DataFrame({
        'category': cls.categories,
        'parsed': cls.parsed_docs,
    })
    # Only the factory is stored here; build() is not called in this fixture.
    cls.corpus_fact = CorpusFromParsedDocuments(cls.df, 'category', 'parsed')
def setUp(cls):
    """Set up parsed-document fixtures and an unbuilt corpus factory.

    Each document is passed through ``whitespace_nlp``; the parsed results
    and the categories form ``cls.df``, which is handed to
    ``CorpusFromParsedDocuments`` and kept as ``cls.corpus_fact``.
    """
    cls.categories, cls.documents = get_docs_categories()
    cls.parsed_docs = [whitespace_nlp(raw_doc) for raw_doc in cls.documents]
    frame_columns = {'category': cls.categories, 'parsed': cls.parsed_docs}
    cls.df = pd.DataFrame(frame_columns)
    # The factory itself is the fixture; build() is not invoked here.
    cls.corpus_fact = CorpusFromParsedDocuments(cls.df, 'category', 'parsed')
def setUp(cls):
    """Build two corpora from the same documents.

    ``cls.parsed_corpus`` is built from the pre-parsed 'parsed' column via
    ``CorpusFromParsedDocuments``; ``cls.corpus`` is built from the
    upper-cased 'orig' column via ``CorpusFromPandas`` with
    ``whitespace_nlp`` as the ``nlp`` callable.
    """
    cls.categories, cls.documents = get_docs_categories()
    cls.parsed_docs = [whitespace_nlp(text) for text in cls.documents]
    upper_cased = [text.upper() for text in cls.documents]
    cls.df = pd.DataFrame({
        'category': cls.categories,
        'parsed': cls.parsed_docs,
        'orig': upper_cased,
    })

    parsed_factory = CorpusFromParsedDocuments(cls.df, 'category', 'parsed')
    cls.parsed_corpus = parsed_factory.build()

    raw_factory = CorpusFromPandas(cls.df, 'category', 'orig', nlp=whitespace_nlp)
    cls.corpus = raw_factory.build()
def setUp(cls):
    """Build a parsed-document corpus whose DataFrame carries extra columns.

    Besides 'category' and 'parsed', ``cls.df`` holds an 'author' column and
    a 'document_lengths' column (``len`` of each raw document). The corpus
    stored on ``cls.corpus`` is built from 'category' and 'parsed' only.
    """
    cls.categories, cls.documents = get_docs_categories()
    cls.parsed_docs = [whitespace_nlp(text) for text in cls.documents]
    # Ten hard-coded author labels; the DataFrame needs one per document.
    authors = ['a', 'a', 'c', 'c', 'c', 'c', 'd', 'd', 'e', 'e']
    lengths = [len(text) for text in cls.documents]
    cls.df = pd.DataFrame({
        'category': cls.categories,
        'author': authors,
        'parsed': cls.parsed_docs,
        'document_lengths': lengths,
    })
    factory = CorpusFromParsedDocuments(cls.df, 'category', 'parsed')
    cls.corpus = factory.build()
def setUp(cls):
    """Create ``cls.df`` with author and length columns, then build the corpus.

    Documents are parsed with ``whitespace_nlp``. The DataFrame combines
    categories, a fixed list of author labels, the parsed documents and each
    raw document's ``len``. ``cls.corpus`` comes from
    ``CorpusFromParsedDocuments`` over the 'category' and 'parsed' columns.
    """
    cls.categories, cls.documents = get_docs_categories()
    cls.parsed_docs = [whitespace_nlp(raw_doc) for raw_doc in cls.documents]
    # Fixed author labels (ten entries), one per document row.
    author_labels = ['a', 'a', 'c', 'c', 'c', 'c', 'd', 'd', 'e', 'e']
    doc_lengths = [len(raw_doc) for raw_doc in cls.documents]
    frame_columns = {
        'category': cls.categories,
        'author': author_labels,
        'parsed': cls.parsed_docs,
        'document_lengths': doc_lengths,
    }
    cls.df = pd.DataFrame(frame_columns)
    cls.corpus = CorpusFromParsedDocuments(
        cls.df, 'category', 'parsed'
    ).build()
def test_term_category_matrix_from_pandas_without_categories(self):
    # Compare the category-less matrix with one built from the same
    # documents *with* categories: same terms, same document count and the
    # same stored term-document values are expected.
    uncategorized = get_term_doc_matrix_without_categories()

    labels, texts = get_docs_categories()
    frame = pd.DataFrame({'text': texts, 'categories': labels})
    categorized = TermDocMatrixFromPandas(
        frame,
        text_col='text',
        category_col='categories',
        nlp=whitespace_nlp,
    ).build()

    self.assertIsInstance(uncategorized, TermDocMatrixWithoutCategories)
    self.assertEqual(uncategorized.get_terms(), categorized.get_terms())
    self.assertEqual(uncategorized.get_num_docs(), categorized.get_num_docs())
    np.testing.assert_equal(
        uncategorized.get_term_doc_mat().data,
        categorized.get_term_doc_mat().data,
    )
def test_get_term_df(self):
    # Check per-category frequencies for two terms in the frame returned by
    # get_term_freq_df().
    categories, documents = get_docs_categories()
    df = pd.DataFrame({'category': categories, 'text': documents})
    tdm_factory = TermDocMatrixFromPandas(df, 'category', 'text', nlp=whitespace_nlp)
    term_doc_matrix = tdm_factory.build()
    term_df = term_doc_matrix.get_term_freq_df()
    # Rows are looked up by term label. `.loc` replaces the original `.ix`
    # indexer, which no longer exists in pandas >= 1.0.
    self.assertEqual(
        dict(term_df.loc['speak up']),
        {'??? freq': 2, 'hamlet freq': 0, 'jay-z/r. kelly freq': 1},
    )
    self.assertEqual(
        dict(term_df.loc['that']),
        {'??? freq': 0, 'hamlet freq': 2, 'jay-z/r. kelly freq': 0},
    )
def test_get_term_df(self):
    # Verify the per-category frequency row of two terms in the DataFrame
    # returned by get_term_freq_df().
    categories, documents = get_docs_categories()
    df = pd.DataFrame({'category': categories, 'text': documents})
    term_doc_matrix = TermDocMatrixFromPandas(
        df, 'category', 'text', nlp=whitespace_nlp
    ).build()
    term_df = term_doc_matrix.get_term_freq_df()

    expected_rows = {
        'speak up': {'??? freq': 2, 'hamlet freq': 0, 'jay-z/r. kelly freq': 1},
        'that': {'??? freq': 0, 'hamlet freq': 2, 'jay-z/r. kelly freq': 0},
    }
    for term, expected in expected_rows.items():
        # Label-based lookup via `.loc`; the original `.ix` indexer was
        # removed from pandas in 1.0.
        self.assertEqual(dict(term_df.loc[term]), expected)
def setUp(cls):
    """Create a parsed-document corpus and a raw-text corpus over one frame.

    ``cls.df`` has three columns: 'category', 'parsed' (documents run
    through ``whitespace_nlp``) and 'orig' (documents upper-cased).
    ``cls.parsed_corpus`` is built from 'parsed' and ``cls.corpus`` from
    'orig'.
    """
    cls.categories, cls.documents = get_docs_categories()
    cls.parsed_docs = [whitespace_nlp(raw_doc) for raw_doc in cls.documents]
    shouted = [raw_doc.upper() for raw_doc in cls.documents]
    frame_columns = {
        'category': cls.categories,
        'parsed': cls.parsed_docs,
        'orig': shouted,
    }
    cls.df = pd.DataFrame(frame_columns)
    cls.parsed_corpus = CorpusFromParsedDocuments(
        cls.df, 'category', 'parsed'
    ).build()
    cls.corpus = CorpusFromPandas(
        cls.df, 'category', 'orig', nlp=whitespace_nlp
    ).build()
def test_main(self):
    # Exercise CategoryColorAssigner twice: first over the corpus terms,
    # then over metadata features once a category-indicator matrix has been
    # attached to the corpus.
    labels, texts = get_docs_categories()
    frame = pd.DataFrame({'category': labels, 'text': texts})
    corpus = CorpusFromPandas(frame, 'category', 'text', nlp=whitespace_nlp).build()

    expected_category_colors = {
        '???': [255, 127, 14],
        'hamlet': [174, 199, 232],
        'jay-z/r. kelly': [31, 119, 180],
    }
    category_colors = CategoryColorAssigner(corpus).get_category_colors()
    self.assertEqual(category_colors.to_dict(), expected_category_colors)

    term_colors = CategoryColorAssigner(corpus).get_term_colors()
    self.assertEqual(term_colors['this time'], 'aec7e8')
    self.assertEqual(term_colors['sire'], '1f77b4')
    self.assertEqual(len(term_colors), corpus.get_num_terms())

    # Build a document-by-category indicator matrix: entry (row, column of
    # that row's category) is set to 1.
    matrix_factory = CSRMatrixFactory()
    category_index = IndexStore()
    for row, label in enumerate(frame['category']):
        matrix_factory[row, category_index.getidx(label)] = 1
    corpus = corpus.add_metadata(matrix_factory.get_csr_matrix(), category_index)

    meta_colors = CategoryColorAssigner(
        corpus, use_non_text_features=True
    ).get_term_colors()
    expected_meta_colors = {
        'hamlet': 'aec7e8',
        'jay-z/r. kelly': '1f77b4',
        '???': 'ff7f0e',
    }
    self.assertEqual(meta_colors, expected_meta_colors)
    # The default (text-term) colouring must differ from the metadata one.
    self.assertNotEqual(CategoryColorAssigner(corpus).get_term_colors(), meta_colors)
def setUp(cls):
    """Build ``cls.df`` with a derived 'parsed' column and ``cls.corpus``.

    The 'parsed' column is produced by applying ``whitespace_nlp`` to every
    value of the 'text' column; the corpus is then built from 'category'
    and 'parsed' via ``CorpusFromParsedDocuments``.
    """
    labels, texts = get_docs_categories()
    frame = pd.DataFrame({'category': labels, 'text': texts})
    frame['parsed'] = frame['text'].apply(whitespace_nlp)
    cls.df = frame
    factory = CorpusFromParsedDocuments(frame, 'category', 'parsed')
    cls.corpus = factory.build()
def get_term_doc_matrix_without_categories():
    """Return a term-document matrix built from the sample documents only.

    The documents from ``get_docs_categories()`` are placed in a
    single-column ('text') DataFrame and passed to
    ``TermDocMatrixWithoutCategoriesFromPandas`` with ``whitespace_nlp`` as
    the ``nlp`` callable.

    Returns:
        Whatever ``TermDocMatrixWithoutCategoriesFromPandas(...).build()``
        produces.
    """
    # The categories are deliberately discarded; only the texts are used.
    _, documents = get_docs_categories()
    df = pd.DataFrame({'text': documents})
    return TermDocMatrixWithoutCategoriesFromPandas(
        df, 'text', nlp=whitespace_nlp
    ).build()