Пример #1
0
 def test_main(self):
     """Smoke-test featurising and classifying one unseen document.

     Builds a term-document matrix with 'GPE' entities censored, fits a
     passive-aggressive classifier on its l1-normalised tf-idf
     representation, then runs ``predict`` and ``decision_function`` on a
     new document featurised against the same term index. The test makes
     no assertions; it only checks that the pipeline runs end to end.
     """

     def clean_function(text):
         # Texts that open with '[' are blanked out entirely.
         return '' if text.startswith('[') else text

     categories, documents = get_docs_categories()
     entity_types = {'GPE'}
     term_doc_mat = TermDocMatrixFactory(
         category_text_iter=zip(categories, documents),
         clean_function=clean_function,
         nlp=_testing_nlp,
         feats_from_spacy_doc=FeatsFromSpacyDoc(
             entity_types_to_censor=entity_types)).build()
     # NOTE(review): later scikit-learn releases replaced `n_iter` with
     # `max_iter` -- confirm against the version this suite is pinned to.
     clf = PassiveAggressiveClassifier(n_iter=5,
                                       C=0.5,
                                       n_jobs=-1,
                                       random_state=0)
     fdc = FeatsFromDoc(
         term_doc_mat._term_idx_store,
         clean_function=clean_function,
         feats_from_spacy_doc=FeatsFromSpacyDoc(
             entity_types_to_censor=entity_types)).set_nlp(_testing_nlp)
     tfidf = TfidfTransformer(norm='l1')
     X = tfidf.fit_transform(term_doc_mat._X)
     clf.fit(X, term_doc_mat._y)
     X_to_predict = fdc.feats_from_doc('Did sometimes march UNKNOWNWORD')
     # The classifier was fitted on tf-idf features, so both calls below must
     # receive the new document in that same space, not as raw features.
     X_to_predict_tfidf = tfidf.transform(X_to_predict)
     pred = clf.predict(X_to_predict_tfidf)
     dec = clf.decision_function(X_to_predict_tfidf)
Пример #2
0
 def test_entity_tags(self):
     """Check that censored entity types and tag types replace raw tokens.

     The same document is featurised twice: first censoring only the 'BAD'
     entity type, then additionally censoring the 'NNP' tag type.
     """
     tagged_doc = whitespace_nlp("A a bb cc Bob.",
                                 {'bb': 'BAD'},
                                 {'Bob': 'NNP'})

     # Entity censoring only: 'bb' is expected as '_BAD', 'bob' is kept.
     entity_censor = FeatsFromSpacyDoc(entity_types_to_censor={'BAD'})
     observed = entity_censor.get_feats(tagged_doc)
     expected_entity_only = Counter({
         'a': 2,
         'a _BAD': 1,
         '_BAD cc': 1,
         'cc': 1,
         'a a': 1,
         '_BAD': 1,
         'bob': 1,
         'cc bob': 1,
     })
     self.assertEqual(expected_entity_only, observed)

     # Entity and tag censoring: 'bob' is now expected as 'NNP'.
     entity_and_tag_censor = FeatsFromSpacyDoc(
         entity_types_to_censor={'BAD'},
         tag_types_to_censor={'NNP'})
     observed = entity_and_tag_censor.get_feats(tagged_doc)
     expected_entity_and_tag = Counter({
         'a': 2,
         'a _BAD': 1,
         '_BAD cc': 1,
         'cc': 1,
         'a a': 1,
         '_BAD': 1,
         'NNP': 1,
         'cc NNP': 1,
     })
     self.assertEqual(expected_entity_and_tag, observed)
 def __init__(self,
              topic_model,
              use_lemmas=False,
              entity_types_to_censor=None,
              entity_types_to_use=None,
              tag_types_to_censor=None,
              strip_final_period=False,
              keyword_processor_args=None):
     """Register every keyphrase of a topic model with a keyword processor.

     Parameters
     ----------
     topic_model : mapping
         Its values are iterables of keyphrases; the union of all of them
         is added to the keyword processor. An empty mapping is accepted
         and registers no keyphrases.
     use_lemmas : bool
         Forwarded to ``FeatsFromSpacyDoc.__init__``.
     entity_types_to_censor : set, optional
         Forwarded to ``FeatsFromSpacyDoc.__init__``; a fresh empty set is
         used when omitted.
     entity_types_to_use : optional
         Accepted for signature compatibility; not used by this
         constructor.
     tag_types_to_censor : set, optional
         Forwarded to ``FeatsFromSpacyDoc.__init__``; a fresh empty set is
         used when omitted.
     strip_final_period : bool
         Forwarded to ``FeatsFromSpacyDoc.__init__``.
     keyword_processor_args : dict, optional
         Keyword arguments for ``flashtext.KeywordProcessor``; defaults to
         ``{'case_sensitive': False}``.
     """
     from flashtext import KeywordProcessor

     # Build the defaults per call so instances never share one mutable
     # default object.
     if entity_types_to_censor is None:
         entity_types_to_censor = set()
     if tag_types_to_censor is None:
         tag_types_to_censor = set()
     if keyword_processor_args is None:
         keyword_processor_args = {'case_sensitive': False}

     self._keyword_processor = KeywordProcessor(**keyword_processor_args)
     self._topic_model = topic_model
     # set().union(*...) also copes with an empty topic model, where a
     # reduce() without an initial value would raise.
     for keyphrase in set().union(*topic_model.values()):
         self._keyword_processor.add_keyword(keyphrase)
     FeatsFromSpacyDoc.__init__(self, use_lemmas, entity_types_to_censor,
                                tag_types_to_censor, strip_final_period)
     FeatsFromTopicModelBase.__init__(self, topic_model)
Пример #4
0
 def test_lemmas(self):
     """Check unigram and bigram counts when ``use_lemmas=True``.

     The expected counts contain 'dd' where the input text has 'ddddd'.
     """
     parsed = whitespace_nlp("A a bb ddddd.")
     lemma_extractor = FeatsFromSpacyDoc(use_lemmas=True)
     observed = lemma_extractor.get_feats(parsed)
     expected = Counter({
         # unigrams
         'a': 2,
         'bb': 1,
         'dd': 1,
         # bigrams
         'a a': 1,
         'a bb': 1,
         'bb dd': 1,
     })
     self.assertEqual(expected, observed)
Пример #5
0
 def test_main(self):
     """Check unigram and bigram counts from a default ``FeatsFromSpacyDoc``."""
     parsed = whitespace_nlp("A a bb cc.")
     default_extractor = FeatsFromSpacyDoc()
     observed = default_extractor.get_feats(parsed)
     expected = Counter({
         # unigrams
         'a': 2,
         'bb': 1,
         'cc': 1,
         # bigrams
         'a a': 1,
         'a bb': 1,
         'bb cc': 1,
     })
     self.assertEqual(expected, observed)
Пример #6
0
 def test_build_censor_entities(self):
     """Check that censoring 'GPE' entities is reflected in the term index.

     After the build, '_GPE' must be among the terms of the term-frequency
     frame and 'brooklyn' must not be.
     """

     def clean_function(text):
         # Texts that open with '[' are blanked out entirely.
         return '' if text.startswith('[') else text

     categories, documents = get_docs_categories()
     factory = TermDocMatrixFactory(
         category_text_iter=zip(categories, documents),
         clean_function=clean_function,
         nlp=_testing_nlp,
         feats_from_spacy_doc=FeatsFromSpacyDoc(
             entity_types_to_censor={'GPE'}))
     term_doc_mat = factory.build()

     self.assertIn('_GPE',
                   set(term_doc_mat.get_term_freq_df().index))
     self.assertNotIn('brooklyn',
                      set(term_doc_mat.get_term_freq_df().index))
Пример #7
0
 def test_entity_types_to_censor_not_a_set(self):
     """A plain string for ``entity_types_to_censor`` must raise AssertionError."""
     parsed = whitespace_nlp("A a bb cc.", {'bb': 'A'})

     def build_and_extract():
         # Construction and extraction both sit inside the guarded call, so
         # the assertion may fire in either step.
         FeatsFromSpacyDoc(entity_types_to_censor='A').get_feats(parsed)

     self.assertRaises(AssertionError, build_and_extract)
Пример #8
0
 def test_empty(self):
     """Check that an empty document produces an empty feature counter."""
     empty_doc = whitespace_nlp("")
     observed = FeatsFromSpacyDoc().get_feats(empty_doc)
     no_features = Counter()
     self.assertEqual(no_features, observed)
Пример #9
0
    def test_strip_final_period(self):
        doc = bad_whitespace_nlp('''I CAN'T ANSWER THAT
 QUESTION.
 I HAVE NOT ASKED THEM
 SPECIFICALLY IF THEY HAVE
 ENOUGH.''')
        feats = FeatsFromSpacyDoc().get_feats(doc)
        print(feats)
        self.assertEqual(
            feats,
            Counter({
                'i': 2,
                'have': 2,
                'that question.': 1,
                'answer': 1,
                'question.': 1,
                'enough.': 1,
                'i have': 1,
                'them specifically': 1,
                'have enough.': 1,
                'not asked': 1,
                'they have': 1,
                'have not': 1,
                'specifically': 1,
                'answer that': 1,
                'question. i': 1,
                "can't": 1,
                'if': 1,
                'they': 1,
                "can't answer": 1,
                'asked': 1,
                'them': 1,
                'if they': 1,
                'asked them': 1,
                'that': 1,
                'not': 1,
                "i can't": 1,
                'specifically if': 1
            }))
        feats = FeatsFromSpacyDoc(strip_final_period=True).get_feats(doc)
        print(feats)
        self.assertEqual(
            feats,
            Counter({
                'i': 2,
                'have': 2,
                'that question': 1,
                'answer': 1,
                'question': 1,
                'enough': 1,
                'i have': 1,
                'them specifically': 1,
                'have enough': 1,
                'not asked': 1,
                'they have': 1,
                'have not': 1,
                'specifically': 1,
                'answer that': 1,
                'question i': 1,
                "can't": 1,
                'if': 1,
                'they': 1,
                "can't answer": 1,
                'asked': 1,
                'them': 1,
                'if they': 1,
                'asked them': 1,
                'that': 1,
                'not': 1,
                "i can't": 1,
                'specifically if': 1
            }))