Example #1
 def test_entity_tags(self):
     doc = whitespace_nlp("A a bb cc Bob.", {'bb': 'BAD'}, {'Bob': 'NNP'})
     term_freq = FeatsFromSpacyDoc(
         entity_types_to_censor=set(['BAD'])).get_feats(doc)
     self.assertEqual(
         Counter({
             'a': 2,
             'a _BAD': 1,
             '_BAD cc': 1,
             'cc': 1,
             'a a': 1,
             '_BAD': 1,
             'bob': 1,
             'cc bob': 1
         }), term_freq)
     term_freq = FeatsFromSpacyDoc(entity_types_to_censor=set(['BAD']),
                                   tag_types_to_censor=set(
                                       ['NNP'])).get_feats(doc)
     self.assertEqual(
         Counter({
             'a': 2,
             'a _BAD': 1,
             '_BAD cc': 1,
             'cc': 1,
             'a a': 1,
             '_BAD': 1,
             'NNP': 1,
             'cc NNP': 1
         }), term_freq)
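
For orientation, here is a minimal sketch of the same censoring outside the test harness. It assumes spaCy with the en_core_web_sm model is installed and that FeatsFromSpacyDoc is imported from scattertext; the '_PERSON'/'_GPE' placeholders follow the '_BAD' convention seen in the test above.

    import spacy
    from scattertext import FeatsFromSpacyDoc

    nlp = spacy.load('en_core_web_sm')
    doc = nlp("Bob met Alice in Paris.")
    # Tokens recognized as PERSON or GPE entities are replaced by their
    # entity-type label (e.g. '_PERSON') before unigram/bigram counting.
    feats = FeatsFromSpacyDoc(entity_types_to_censor={'PERSON', 'GPE'}).get_feats(doc)
    print(feats.most_common(5))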
Example #2
    def __init__(self,
                 use_lemmas=False,
                 entity_types_to_censor=set(),
                 tag_types_to_censor=set(),
                 strip_final_period=False,
                 empath_analyze_function=None,
                 **kwargs):
        '''
        Parameters
        ----------
        empath_analyze_function: function (default=empath.Empath().analyze)
            Function that produces a dictionary mapping Empath categories to counts.

        Other parameters from FeatsFromSpacyDoc.__init__
        '''
        if empath_analyze_function is None:
            try:
                import empath
            except ImportError:
                raise Exception(
                    "Please install the empath library to use FeatsFromSpacyDocAndEmpath."
                )
            self._empath_analyze_function = empath.Empath().analyze
        else:
            self._empath_analyze_function = partial(
                empath_analyze_function, kwargs={'tokenizer': 'bigram'})
        FeatsFromSpacyDoc.__init__(self, use_lemmas, entity_types_to_censor,
                                   tag_types_to_censor, strip_final_period)
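
A hedged usage sketch: it assumes the empath package is installed and that, as elsewhere in scattertext, the Empath categories surface through the extractor's get_doc_metadata side while get_feats keeps returning ordinary term counts.

    import spacy
    from scattertext import FeatsFromSpacyDocAndEmpath

    nlp = spacy.load('en_core_web_sm')
    doc = nlp("I was terrified at first, then overjoyed.")
    extractor = FeatsFromSpacyDocAndEmpath()
    print(extractor.get_feats(doc))         # unigram/bigram counts, as in FeatsFromSpacyDoc
    print(extractor.get_doc_metadata(doc))  # Empath category scores for the document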
Example #3
 def __init__(self,
              use_lemmas=False,
              entity_types_to_censor=set(),
              tag_types_to_censor=set(),
              strip_final_period=False):
     FeatsFromSpacyDoc.__init__(self, use_lemmas, entity_types_to_censor,
                                tag_types_to_censor, strip_final_period)
     self._include_chunks = False
     self._rank_smoothing_constant = 0
Example #4
 def __init__(self,
              use_lemmas=False,
              entity_types_to_censor=set(),
              entity_types_to_use=None,
              tag_types_to_censor=set(),
              strip_final_period=False):
     self._entity_types_to_use = entity_types_to_use
     FeatsFromSpacyDoc.__init__(self, use_lemmas, entity_types_to_censor,
                                tag_types_to_censor, strip_final_period)
Example #5
    def __init__(self,
                 df,
                 category_col,
                 parsed_col,
                 feats_from_spacy_doc=FeatsFromSpacyDoc()):
        '''
        Parameters
        ----------
        df : pd.DataFrame
            contains category_col and parsed_col, where parsed_col consists
            entirely of spaCy docs
        category_col : str
            name of the category column in df
        parsed_col : str
            name of the spaCy-parsed column in df
        feats_from_spacy_doc : FeatsFromSpacyDoc
        '''
        self._df = df.reset_index()
        self._category_col = category_col
        self._parsed_col = parsed_col
        self._category_idx_store = IndexStore()
        self._X_factory = CSRMatrixFactory()
        self._mX_factory = CSRMatrixFactory()
        self._term_idx_store = IndexStore()
        self._metadata_idx_store = IndexStore()
        self._feats_from_spacy_doc = feats_from_spacy_doc
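
This looks like the constructor of scattertext's CorpusFromParsedDocuments; if so, a minimal sketch of driving it end to end, assuming a DataFrame whose parsed column already holds spaCy docs:

    import pandas as pd
    import spacy
    from scattertext import CorpusFromParsedDocuments, FeatsFromSpacyDoc

    nlp = spacy.load('en_core_web_sm')
    df = pd.DataFrame({
        'category': ['greeting', 'farewell'],
        'text': ['Hello there, friend.', 'Goodbye, cruel world.'],
    })
    df['parse'] = df['text'].apply(nlp)  # parse up front; the corpus expects spaCy docs
    corpus = CorpusFromParsedDocuments(
        df, category_col='category', parsed_col='parse',
        feats_from_spacy_doc=FeatsFromSpacyDoc()).build()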
Example #6
 def __init__(self,
              topic_model,
              use_lemmas=False,
              entity_types_to_censor=set(),
              entity_types_to_use=None,
              tag_types_to_censor=set(),
              strip_final_period=False,
              keyword_processor_args={'case_sensitive': False}):
     self._keyword_processor = KeywordProcessor(**keyword_processor_args)
     self._topic_model = topic_model
     for keyphrase in reduce(lambda x, y: set(x) | set(y),
                             topic_model.values()):
         self._keyword_processor.add_keyword(keyphrase)
     FeatsFromSpacyDoc.__init__(self, use_lemmas, entity_types_to_censor,
                                tag_types_to_censor, strip_final_period)
     FeatsFromTopicModelBase.__init__(self, topic_model)
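
This matches the shape of scattertext's topic-model extractors: the topic model is just a dict mapping topic names to keyphrase lists, and flashtext's KeywordProcessor finds those phrases in each document. A hedged sketch, assuming the class shown is scattertext's FeatsFromTopicModel and that topic hits surface through get_doc_metadata:

    import spacy
    from scattertext import FeatsFromTopicModel

    topic_model = {
        'money': ['money', 'bank', 'price'],
        'military': ['war', 'troops', 'army'],
    }
    # Phrase matching is case-insensitive per the keyword_processor_args
    # default above; each hit is credited to its topic label.
    extractor = FeatsFromTopicModel(topic_model)
    nlp = spacy.load('en_core_web_sm')
    print(extractor.get_doc_metadata(nlp("The army needs money.")))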
Example #7
 def test_lemmas(self):
     doc = whitespace_nlp("A a bb ddddd.")
     term_freq = FeatsFromSpacyDoc(use_lemmas=True).get_feats(doc)
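     # Note: whitespace_nlp is scattertext's test double; it appears to fake
     # lemmatization by truncating tokens to two characters, hence 'dd' for 'ddddd'.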
     self.assertEqual(
         Counter({
             'a': 2,
             'bb': 1,
             'a bb': 1,
             'dd': 1,
             'a a': 1,
             'bb dd': 1
         }), term_freq)
Example #8
 def test_main(self):
     doc = whitespace_nlp("A a bb cc.")
     term_freq = FeatsFromSpacyDoc().get_feats(doc)
     self.assertEqual(
         Counter({
             'a': 2,
             'bb': 1,
             'a bb': 1,
             'cc': 1,
             'a a': 1,
             'bb cc': 1
         }), term_freq)
Example #9
    def censor_entity_types(self, entity_types):
        # type: (set) -> TermDocMatrixFactory
        '''
        Entity types to exclude from feature construction. Terms matching the
        specified entity types will be labeled by their entity type instead of
        by their lower-case orthographic form or lemma.

        Parameters
        ----------
        entity_types : set of entity types output by spaCy, e.g.
          'TIME', 'WORK_OF_ART', 'PERSON', 'MONEY', 'ORG', 'ORDINAL', 'DATE',
          'CARDINAL', 'LAW', 'QUANTITY', 'GPE', 'PERCENT'

        Returns
        ---------
        self
        '''
        assert type(entity_types) == set
        self._entity_types_to_censor = entity_types
        self._feats_from_spacy_doc = FeatsFromSpacyDoc(
            use_lemmas=self._use_lemmas,
            entity_types_to_censor=self._entity_types_to_censor)
        return self
Example #10
 def __init__(self,
              df,
              parsed_col,
              feats_from_spacy_doc=FeatsFromSpacyDoc()):
     '''
     Parameters
     ----------
     df : pd.DataFrame
      contains parsed_col, where parsed_col consists entirely of spaCy docs
     parsed_col : str
         name of the spaCy-parsed column in df
     feats_from_spacy_doc : FeatsFromSpacyDoc
     '''
     self.df = df
     self.parsed_col = parsed_col
     self.feats_from_spacy_doc = feats_from_spacy_doc
Example #11
 def test_singleton_with_sentences(self):
     doc = whitespace_nlp_with_sentences("Blah")
     term_freq = FeatsFromSpacyDoc().get_feats(doc)
     self.assertEqual(Counter({'blah': 1}), term_freq)
Example #13
class TermDocMatrixFactory(object):
    def __init__(self,
                 category_text_iter=None,
                 clean_function=lambda x: x,
                 nlp=None,
                 feats_from_spacy_doc=None):
        """
        Class for easy construction of a term document matrix.
       This class lets you define an iterator for each document (text_iter),
       an iterator for each document's category name (category_iter),
       and a document cleaning function that's applied to each document
       before it's parsed.

       Parameters
       ----------
       category_text_iter : iter<(str: category, unicode: document)>
           An iterator of pairs. The first element is a string category
           name, the second the text of a document.  You can also set this
           using the function set_category_text_iter.
       clean_function : function (default lambda x: x)
           A function that strips invalid characters out of a string, returning
           the new string.
       nlp : spacy.load('en_core_web_sm') (default None)
           The spaCy parser used to parse documents.  If it's None,
           the class will go through the expensive operation of
           creating one to parse the text
       feats_from_spacy_doc : FeatsFromSpacyDoc (default None)
           Class for extraction of features from spacy
       Attributes
       ----------
       _clean_function : function
           function that takes a unicode document and returns
           a cleaned version of that document
       _text_iter : iter<unicode>
           an iterator that iterates through the unicode text of each
            document
       _category_iter : iter<str>
           an iterator the same size as text iter that gives a string or
           unicode name of each document category
       Examples
       --------
       >>> import scattertext as ST
       >>> documents = [u"What art thou that usurp'st this time of night,",
       ... u'Together with that fair and warlike form',
       ... u'In which the majesty of buried Denmark',
       ... u'Did sometimes march? by heaven I charge thee, speak!',
       ... u'Halt! Who goes there?',
       ... u'[Intro]',
       ... u'It is I sire Tone from Brooklyn.',
       ... u'Well, speak up man what is it?',
       ... u'News from the East sire! THE BEST OF BOTH WORLDS HAS RETURNED!']
       >>> categories = ['hamlet'] * 4 + ['jay-z/r. kelly'] * 5
       >>> clean_function = lambda text: '' if text.startswith('[') else text
       >>> term_doc_mat = ST.TermDocMatrixFactory(category_text_iter = zip(categories, documents),clean_function = clean_function).build()
        """
        self._category_text_iter = category_text_iter
        self._clean_function = clean_function
        self._nlp = nlp
        self._entity_types_to_censor = set()
        if feats_from_spacy_doc is None:
            self._feats_from_spacy_doc = FeatsFromSpacyDoc()
        else:
            self._feats_from_spacy_doc = feats_from_spacy_doc

    def set_category_text_iter(self, category_text_iter):
        """Initializes the category_text_iter

       Parameters
       ----------
       category_text_iter : iter<(str: category, unicode: document)>
               An iterator of pairs. The first element is a string category
               name, the second the text of a document.

         Returns
         ----------
         self: TermDocMatrixFactory
        """

        self._category_text_iter = category_text_iter
        return self

    def set_nlp(self, nlp):
        """Adds a spaCy-compatible nlp function

       Parameters
       ----------
       nlp : spacy model

         Returns
         ----------
         self: TermDocMatrixFactory
        """

        self._nlp = nlp
        return self

    def build(self):
        """Generate a TermDocMatrix from data in parameters.

         Returns
         ----------
         term_doc_matrix : TermDocMatrix
            The object that this factory class builds.
        """
        if self._category_text_iter is None:
            raise CategoryTextIterNotSetError()
        nlp = self.get_nlp()

        category_document_iter = (
            (category, self._clean_function(raw_text))
            for category, raw_text in self._category_text_iter)
        term_doc_matrix = self._build_from_category_spacy_doc_iter(
            ((category, nlp(text))
             for (category, text) in category_document_iter
             if text.strip() != ''))
        return term_doc_matrix

    def get_nlp(self):
        nlp = self._nlp
        if nlp is None:
            import spacy
            nlp = spacy.load('en_core_web_sm')
        return nlp

    def censor_entity_types(self, entity_types):
        # type: (set) -> TermDocMatrixFactory
        '''
        Entity types to exclude from feature construction. Terms matching the
        specified entity types will be labeled by their entity type instead of
        by their lower-case orthographic form or lemma.

        Parameters
        ----------
        entity_types : set of entity types output by spaCy, e.g.
          'TIME', 'WORK_OF_ART', 'PERSON', 'MONEY', 'ORG', 'ORDINAL', 'DATE',
          'CARDINAL', 'LAW', 'QUANTITY', 'GPE', 'PERCENT'

        Returns
        ---------
        self
        '''
        assert type(entity_types) == set
        self._entity_types_to_censor = entity_types
        self._feats_from_spacy_doc = FeatsFromSpacyDoc(
            use_lemmas=self._use_lemmas,
            entity_types_to_censor=self._entity_types_to_censor)
        return self

    def _build_from_category_spacy_doc_iter(self, category_doc_iter):
        '''
        Parameters
        ----------
        category_doc_iter : iterator of (string category name, spacy.tokens.doc.Doc) pairs

        Returns
        ----------
        t : TermDocMatrix
        '''
        term_idx_store = IndexStore()
        category_idx_store = IndexStore()
        metadata_idx_store = IndexStore()
        X, mX, y = self._get_features_and_labels_from_documents_and_indexes \
            (category_doc_iter,
             category_idx_store,
             term_idx_store,
             metadata_idx_store)
        return TermDocMatrix(X,
                             mX,
                             y,
                             term_idx_store=term_idx_store,
                             category_idx_store=category_idx_store,
                             metadata_idx_store=metadata_idx_store)

    def _get_features_and_labels_from_documents_and_indexes(
            self, category_doc_iter, category_idx_store, term_idx_store,
            metadata_idx_store):
        y = []
        X_factory = CSRMatrixFactory()
        mX_factory = CSRMatrixFactory()
        for document_index, (category,
                             parsed_text) in enumerate(category_doc_iter):
            self._register_doc_and_category(X_factory, mX_factory, category,
                                            category_idx_store, document_index,
                                            parsed_text, term_idx_store,
                                            metadata_idx_store, y)
        X = X_factory.get_csr_matrix()
        mX = mX_factory.get_csr_matrix()
        y = np.array(y)
        return X, mX, y

    def _old_register_doc_and_category(self, X_factory, category,
                                       category_idx_store, document_index,
                                       parsed_text, term_idx_store, y):
        y.append(category_idx_store.getidx(category))
        document_features = self._get_features_from_parsed_text(
            parsed_text, term_idx_store)
        self._register_document_features_with_X_factory \
            (X_factory, document_index, document_features)

    def _register_doc_and_category(self, X_factory, mX_factory, category,
                                   category_idx_store, document_index,
                                   parsed_text, term_idx_store,
                                   metadata_idx_store, y):
        self._register_doc(X_factory, mX_factory, document_index, parsed_text,
                           term_idx_store, metadata_idx_store)
        self._register_category(category, category_idx_store, y)

    def _register_doc(self, X_factory, mX_factory, document_index, parsed_text,
                      term_idx_store, metadata_idx_store):
        for term, count in self._feats_from_spacy_doc.get_feats(
                parsed_text).items():
            term_idx = term_idx_store.getidx(term)
            X_factory[document_index, term_idx] = count
        for term, val in self._feats_from_spacy_doc.get_doc_metadata(
                parsed_text).items():
            meta_idx = metadata_idx_store.getidx(term)
            mX_factory[document_index, meta_idx] = val

    def _register_category(self, category, category_idx_store, y):
        y.append(category_idx_store.getidx(category))

    def _register_document_features_with_X_factory(self, X_factory, doci,
                                                   term_freq):
        for word_idx, freq in term_freq.items():
            X_factory[doci, word_idx] = freq

    def _get_features_from_parsed_text(self, parsed_text, term_idx_store):
        return {
            term_idx_store.getidxstrict(k): v
            for k, v in self._feats_from_spacy_doc.get_feats(
                parsed_text).items() if k in term_idx_store
        }
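
One thing worth noting when wiring this factory up: censor_entity_types rebuilds the extractor from self._use_lemmas, but the __init__ above never assigns that attribute, so passing a pre-configured FeatsFromSpacyDoc is the safer route. A hedged end-to-end sketch, under the same en_core_web_sm assumption as above:

    import spacy
    from scattertext import TermDocMatrixFactory, FeatsFromSpacyDoc

    categories = ['pos', 'neg']
    texts = ['I loved every minute of it.', 'Alice hated it.']
    factory = TermDocMatrixFactory(
        category_text_iter=zip(categories, texts),
        feats_from_spacy_doc=FeatsFromSpacyDoc(entity_types_to_censor={'PERSON'}),
    ).set_nlp(spacy.load('en_core_web_sm'))
    term_doc_matrix = factory.build()  # cleans, parses, and assembles the TermDocMatrix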
Example #14
 def test_entity_types_to_censor_not_a_set(self):
     doc = whitespace_nlp("A a bb cc.", {'bb': 'A'})
     with self.assertRaises(AssertionError):
         FeatsFromSpacyDoc(entity_types_to_censor='A').get_feats(doc)
Example #15
 def test_empty(self):
     doc = whitespace_nlp("")
     term_freq = FeatsFromSpacyDoc().get_feats(doc)
     self.assertEqual(Counter(), term_freq)
Example #16
    def test_strip_final_period(self):
        doc = bad_whitespace_nlp('''I CAN'T ANSWER THAT
 QUESTION.
 I HAVE NOT ASKED THEM
 SPECIFICALLY IF THEY HAVE
 ENOUGH.''')
        feats = FeatsFromSpacyDoc().get_feats(doc)
        self.assertEqual(
            feats,
            Counter({
                'i': 2,
                'have': 2,
                'that question.': 1,
                'answer': 1,
                'question.': 1,
                'enough.': 1,
                'i have': 1,
                'them specifically': 1,
                'have enough.': 1,
                'not asked': 1,
                'they have': 1,
                'have not': 1,
                'specifically': 1,
                'answer that': 1,
                'question. i': 1,
                "can't": 1,
                'if': 1,
                'they': 1,
                "can't answer": 1,
                'asked': 1,
                'them': 1,
                'if they': 1,
                'asked them': 1,
                'that': 1,
                'not': 1,
                "i can't": 1,
                'specifically if': 1
            }))
        feats = FeatsFromSpacyDoc(strip_final_period=True).get_feats(doc)
        self.assertEqual(
            feats,
            Counter({
                'i': 2,
                'have': 2,
                'that question': 1,
                'answer': 1,
                'question': 1,
                'enough': 1,
                'i have': 1,
                'them specifically': 1,
                'have enough': 1,
                'not asked': 1,
                'they have': 1,
                'have not': 1,
                'specifically': 1,
                'answer that': 1,
                'question i': 1,
                "can't": 1,
                'if': 1,
                'they': 1,
                "can't answer": 1,
                'asked': 1,
                'them': 1,
                'if they': 1,
                'asked them': 1,
                'that': 1,
                'not': 1,
                "i can't": 1,
                'specifically if': 1
            }))
Example #17
 def __init__(self, extractor=None, **args):
     import pyate
     self._extractor = pyate.combo_basic if extractor is None else extractor
     FeatsFromSpacyDoc.__init__(self, **args)
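
A hedged sketch of what the default extractor does on its own; it assumes pyate is installed, where combo_basic takes a raw string and returns a pandas Series of candidate-term scores:

    import pyate

    text = "Term extraction pulls domain-specific multiword expressions from raw text."
    scores = pyate.combo_basic(text)  # Series indexed by candidate term
    print(scores.sort_values(ascending=False).head())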