def test_whitespace_nlp(self):
    """Exercise whitespace_nlp tokenization and Tok/Doc string behavior."""
    text = '''Hi! My name is Jason. You can call me Mr. J. Is that your name too? Ha. Ha ha. '''
    parsed = whitespace_nlp(text)
    # The whitespace tokenizer yields 73 tokens and a single sentence here.
    self.assertEqual(len(list(parsed)), 73)
    self.assertEqual(len(parsed.sents), 1)
    name_tok = Tok('WORD', 'Jason', 'jason', 'Name', 'NNP')
    self.assertEqual(len(name_tok), 5)
    self.assertEqual(str(name_tok), 'jason')
    # When a raw string is supplied, str(Doc) returns it verbatim.
    doc_with_raw = Doc(
        [[Tok('WORD', 'Jason', 'jason', 'Name', 'NNP'),
          Tok('WORD', 'a', 'a', 'Name', 'NNP')]],
        raw='asdfbasdfasd')
    self.assertEqual(str(doc_with_raw), 'asdfbasdfasd')
    # Without a raw string, str(Doc) joins the tokens' lowercase forms.
    doc_without_raw = Doc(
        [[Tok('WORD', 'Blah', 'blah', 'Name', 'NNP'),
          Tok('Space', ' ', ' ', ' ', ' '),
          Tok('WORD', 'a', 'a', 'Name', 'NNP')]])
    self.assertEqual(str(doc_without_raw), 'blah a')
def build_hamlet_jz_df():
    # type: () -> pd.DataFrame
    """Build a DataFrame of whitespace-parsed Hamlet/JZ documents.

    Documents starting with '[' are blanked out by the cleaner and then
    removed by the non-empty filter, so the result contains only rows whose
    parsed text is non-empty.

    Returns:
        pd.DataFrame with 'category' and 'parsed' columns.
    """
    categories, documents = get_docs_categories()

    # PEP 8 (E731): use a def rather than assigning a lambda to a name.
    def clean_function(text):
        # Blank out bracketed documents so they are dropped below.
        return '' if text.startswith('[') else text

    df = pd.DataFrame({
        'category': categories,
        'parsed': [whitespace_nlp(clean_function(doc)) for doc in documents]
    })
    # Keep only rows whose parsed representation is non-empty after stripping.
    df = df[df['parsed'].apply(lambda x: len(str(x).strip()) > 0)]
    return df
def setUp(cls):
    """Build the shared fixture: categories, parsed docs, DataFrame, corpus."""
    cls.categories, cls.documents = get_docs_categories()
    # Parse every document with the whitespace tokenizer.
    cls.parsed_docs = [whitespace_nlp(doc) for doc in cls.documents]
    cls.df = pd.DataFrame({
        'category': cls.categories,
        'author': ['a', 'a', 'c', 'c', 'c', 'c', 'd', 'd', 'e', 'e'],
        'parsed': cls.parsed_docs,
        'document_lengths': [len(doc) for doc in cls.documents]
    })
    cls.corpus = CorpusFromParsedDocuments(cls.df, 'category', 'parsed').build()
def build_hamlet_jz_corpus():
    # type: () -> Corpus
    """Build a Corpus of whitespace-parsed Hamlet/JZ documents.

    Documents starting with '[' are blanked out and then filtered away, so
    the corpus is built only from rows with non-empty parsed text.

    Returns:
        Corpus built via CorpusFromParsedDocuments.
    """
    categories, documents = get_docs_categories()

    # PEP 8 (E731): use a def rather than assigning a lambda to a name.
    def clean_function(text):
        # Blank out bracketed documents so they are dropped below.
        return '' if text.startswith('[') else text

    df = pd.DataFrame({
        'category': categories,
        'parsed': [whitespace_nlp(clean_function(doc)) for doc in documents]
    })
    # Keep only rows whose parsed representation is non-empty after stripping.
    df = df[df['parsed'].apply(lambda x: len(str(x).strip()) > 0)]
    return CorpusFromParsedDocuments(df=df,
                                     category_col='category',
                                     parsed_col='parsed').build()
def setUp(cls):
    """Assemble the test fixture: documents, metadata DataFrame, and corpus."""
    cls.categories, cls.documents = get_docs_categories()
    parsed = []
    for document in cls.documents:
        parsed.append(whitespace_nlp(document))
    cls.parsed_docs = parsed
    lengths = [len(document) for document in cls.documents]
    cls.df = pd.DataFrame({'category': cls.categories,
                           'author': ['a', 'a', 'c', 'c', 'c',
                                      'c', 'd', 'd', 'e', 'e'],
                           'parsed': cls.parsed_docs,
                           'document_lengths': lengths})
    cls.corpus = CorpusFromParsedDocuments(cls.df, 'category', 'parsed').build()
def build_hamlet_jz_corpus_with_meta():
    # type: () -> Corpus
    """Build a Hamlet/JZ corpus with mock Empath metadata features.

    A stand-in for the Empath analyzer maps up to the first three tokens of a
    parsed doc to features named 'cat<token length>' with their enumeration
    index as the value.

    Returns:
        Corpus built with FeatsFromSpacyDocAndEmpath using the mock analyzer.
    """
    def empath_mock(doc, **kwargs):
        # Fake Empath: take at most three tokens; key on token length.
        # NOTE(review): tokens of equal length collide on the same key,
        # keeping only the last index — presumably acceptable for this mock.
        toks = list(doc)
        num_toks = min(3, len(toks))
        return {'cat' + str(len(tok)): val
                for val, tok in enumerate(toks[:num_toks])}

    categories, documents = get_docs_categories()

    # PEP 8 (E731): use a def rather than assigning a lambda to a name.
    def clean_function(text):
        # Blank out bracketed documents so they are dropped below.
        return '' if text.startswith('[') else text

    df = pd.DataFrame({
        'category': categories,
        'parsed': [whitespace_nlp(clean_function(doc)) for doc in documents]
    })
    # Keep only rows whose parsed representation is non-empty after stripping.
    df = df[df['parsed'].apply(lambda x: len(str(x).strip()) > 0)]
    return CorpusFromParsedDocuments(
        df=df,
        category_col='category',
        parsed_col='parsed',
        feats_from_spacy_doc=FeatsFromSpacyDocAndEmpath(
            empath_analyze_function=empath_mock)
    ).build()
def test_whitespace_nlp(self):
    """Check whitespace_nlp token/sentence counts and Tok/Doc stringification."""
    sample = '''Hi! My name is Jason. You can call me Mr. J. Is that your name too? Ha. Ha ha. '''
    analyzed = whitespace_nlp(sample)
    # Expect 55 tokens and one sentence from the whitespace tokenizer.
    self.assertEqual(len(list(analyzed)), 55)
    self.assertEqual(len(analyzed.sents), 1)
    sample_tok = Tok('WORD', 'Jason', 'jason', 'Name', 'NNP')
    self.assertEqual(len(sample_tok), 5)
    self.assertEqual(str(sample_tok), 'jason')
    # A Doc constructed with raw= stringifies to that raw text.
    raw_doc = Doc([[Tok('WORD', 'Jason', 'jason', 'Name', 'NNP'),
                    Tok('WORD', 'a', 'a', 'Name', 'NNP')]],
                  raw='asdfbasdfasd')
    self.assertEqual(str(raw_doc), 'asdfbasdfasd')
    # A Doc without raw= stringifies by joining its tokens' lowercase forms.
    joined_doc = Doc([[Tok('WORD', 'Blah', 'blah', 'Name', 'NNP'),
                       Tok('Space', ' ', ' ', ' ', ' '),
                       Tok('WORD', 'a', 'a', 'Name', 'NNP')]])
    self.assertEqual(str(joined_doc), 'blah a')
def build_hamlet_jz_corpus_with_meta():
    # type: () -> Corpus
    """Build a Hamlet/JZ corpus with mock Empath metadata features.

    The mock analyzer splits the document text on whitespace, takes at most
    three tokens, and maps each to a feature named 'cat<token length>' whose
    value is the token's enumeration index.

    Returns:
        Corpus built with FeatsFromSpacyDocAndEmpath using the mock analyzer.
    """
    def empath_mock(doc, **kwargs):
        # Fake Empath: whitespace-split, cap at three tokens, key on length.
        # NOTE(review): equal-length tokens collide on the same key, keeping
        # only the last index — presumably acceptable for this mock.
        toks = doc.split()
        num_toks = min(3, len(toks))
        return {
            'cat' + str(len(tok)): val
            for val, tok in enumerate(toks[:num_toks])
        }

    categories, documents = get_docs_categories()

    # PEP 8 (E731): use a def rather than assigning a lambda to a name.
    def clean_function(text):
        # Blank out bracketed documents so they are dropped below.
        return '' if text.startswith('[') else text

    df = pd.DataFrame({
        'category': categories,
        'parsed': [whitespace_nlp(clean_function(doc)) for doc in documents]
    })
    # Keep only rows whose parsed representation is non-empty after stripping.
    df = df[df['parsed'].apply(lambda x: len(str(x).strip()) > 0)]
    return CorpusFromParsedDocuments(
        df=df,
        category_col='category',
        parsed_col='parsed',
        feats_from_spacy_doc=FeatsFromSpacyDocAndEmpath(
            empath_analyze_function=empath_mock)).build()
def test_get_feats(self):
    """Unigram features should be case-folded term frequencies."""
    parsed = whitespace_nlp("A a bb cc.")
    observed = UnigramsFromSpacyDoc().get_feats(parsed)
    expected = Counter({'a': 2, 'bb': 1, 'cc': 1})
    self.assertEqual(expected, observed)