def test_entity_tags(self):
    '''Censored entity types appear as _TYPE tokens; censored tag types
    appear as the raw tag, in both unigrams and bigrams.'''
    parsed = whitespace_nlp("A a bb cc Bob.", {'bb': 'BAD'}, {'Bob': 'NNP'})

    entity_censored = FeatsFromSpacyDoc(
        entity_types_to_censor=set(['BAD'])).get_feats(parsed)
    self.assertEqual(
        Counter({'a': 2, 'a _BAD': 1, '_BAD cc': 1, 'cc': 1,
                 'a a': 1, '_BAD': 1, 'bob': 1, 'cc bob': 1}),
        entity_censored)

    tag_censored = FeatsFromSpacyDoc(
        entity_types_to_censor=set(['BAD']),
        tag_types_to_censor=set(['NNP'])).get_feats(parsed)
    self.assertEqual(
        Counter({'a': 2, 'a _BAD': 1, '_BAD cc': 1, 'cc': 1,
                 'a a': 1, '_BAD': 1, 'NNP': 1, 'cc NNP': 1}),
        tag_censored)
def __init__(self,
             use_lemmas=False,
             entity_types_to_censor=None,
             tag_types_to_censor=None,
             strip_final_period=False,
             empath_analyze_function=None,
             **kwargs):
    '''
    Parameters
    ----------
    use_lemmas : bool, default False
        Passed through to FeatsFromSpacyDoc.
    entity_types_to_censor : set, optional
        Entity types to replace with their labels; defaults to an empty set.
        (A None sentinel is used instead of a mutable ``set()`` default.)
    tag_types_to_censor : set, optional
        Tag types to replace with their labels; defaults to an empty set.
    strip_final_period : bool, default False
        Passed through to FeatsFromSpacyDoc.
    empath_analyze_function : function (default=empath.Empath().analyze)
        Function that produces a dictionary mapping Empath categories to
        values.  Other parameters from FeatsFromSpacyDoc.__init__.
    kwargs : dict
        Accepted for signature compatibility; not otherwise used here.

    Raises
    ------
    Exception
        If empath is not installed and no analyze function was supplied.
    '''
    if empath_analyze_function is None:
        try:
            import empath
        except ImportError:
            raise Exception(
                "Please install the empath library to use FeatsFromSpacyDocAndEmpath."
            )
        self._empath_analyze_function = empath.Empath().analyze
    else:
        # NOTE(review): this passes a literal keyword argument named
        # ``kwargs`` to the supplied function — confirm the callable
        # actually accepts a ``kwargs=`` parameter.
        self._empath_analyze_function = partial(
            empath_analyze_function, kwargs={'tokenizer': 'bigram'})
    FeatsFromSpacyDoc.__init__(
        self,
        use_lemmas,
        set() if entity_types_to_censor is None else entity_types_to_censor,
        set() if tag_types_to_censor is None else tag_types_to_censor,
        strip_final_period)
def __init__(self,
             use_lemmas=False,
             entity_types_to_censor=None,
             tag_types_to_censor=None,
             strip_final_period=False):
    '''
    Parameters
    ----------
    use_lemmas : bool, default False
        Passed through to FeatsFromSpacyDoc.
    entity_types_to_censor : set, optional
        Entity types to replace with their labels; defaults to an empty set.
        (A None sentinel is used instead of a mutable ``set()`` default.)
    tag_types_to_censor : set, optional
        Tag types to replace with their labels; defaults to an empty set.
    strip_final_period : bool, default False
        Passed through to FeatsFromSpacyDoc.
    '''
    FeatsFromSpacyDoc.__init__(
        self,
        use_lemmas,
        set() if entity_types_to_censor is None else entity_types_to_censor,
        set() if tag_types_to_censor is None else tag_types_to_censor,
        strip_final_period)
    # Chunk inclusion and rank smoothing are fixed here; subclasses or
    # callers may override these attributes after construction.
    self._include_chunks = False
    self._rank_smoothing_constant = 0
def __init__(self,
             use_lemmas=False,
             entity_types_to_censor=None,
             entity_types_to_use=None,
             tag_types_to_censor=None,
             strip_final_period=False):
    '''
    Parameters
    ----------
    use_lemmas : bool, default False
        Passed through to FeatsFromSpacyDoc.
    entity_types_to_censor : set, optional
        Entity types to replace with their labels; defaults to an empty set.
        (A None sentinel is used instead of a mutable ``set()`` default.)
    entity_types_to_use : set, optional
        If not None, restrict extraction to these entity types.
    tag_types_to_censor : set, optional
        Tag types to replace with their labels; defaults to an empty set.
    strip_final_period : bool, default False
        Passed through to FeatsFromSpacyDoc.
    '''
    self._entity_types_to_use = entity_types_to_use
    FeatsFromSpacyDoc.__init__(
        self,
        use_lemmas,
        set() if entity_types_to_censor is None else entity_types_to_censor,
        set() if tag_types_to_censor is None else tag_types_to_censor,
        strip_final_period)
def __init__(self,
             df,
             category_col,
             parsed_col,
             feats_from_spacy_doc=None):
    '''
    Parameters
    ----------
    df : pd.DataFrame
        Contains category_col and parsed_col, where parsed_col consists
        entirely of spaCy docs.
    category_col : str
        Name of the category column in df.
    parsed_col : str
        Name of the spaCy-parsed column in df.
    feats_from_spacy_doc : FeatsFromSpacyDoc, optional
        Feature extractor.  A fresh instance is created when None — this
        avoids the mutable-default pitfall of sharing a single default
        FeatsFromSpacyDoc() instance across every construction.
    '''
    if feats_from_spacy_doc is None:
        feats_from_spacy_doc = FeatsFromSpacyDoc()
    self._df = df.reset_index()
    self._category_col = category_col
    self._parsed_col = parsed_col
    self._category_idx_store = IndexStore()
    self._X_factory = CSRMatrixFactory()
    self._mX_factory = CSRMatrixFactory()
    self._term_idx_store = IndexStore()
    self._metadata_idx_store = IndexStore()
    self._feats_from_spacy_doc = feats_from_spacy_doc
def __init__(self,
             topic_model,
             use_lemmas=False,
             entity_types_to_censor=None,
             entity_types_to_use=None,
             tag_types_to_censor=None,
             strip_final_period=False,
             keyword_processor_args=None):
    '''
    Parameters
    ----------
    topic_model : dict
        Maps topic names to iterables of keyphrases.
    use_lemmas : bool, default False
        Passed through to FeatsFromSpacyDoc.
    entity_types_to_censor : set, optional
        Defaults to an empty set (None sentinel replaces the mutable
        ``set()`` default).
    entity_types_to_use : set, optional
        Accepted for signature compatibility; not used in this
        constructor.  # NOTE(review): confirm whether this was intended
        to be forwarded somewhere.
    tag_types_to_censor : set, optional
        Defaults to an empty set.
    strip_final_period : bool, default False
        Passed through to FeatsFromSpacyDoc.
    keyword_processor_args : dict, optional
        Keyword arguments for flashtext's KeywordProcessor; defaults to
        {'case_sensitive': False} (None sentinel replaces the mutable
        dict default).
    '''
    if keyword_processor_args is None:
        keyword_processor_args = {'case_sensitive': False}
    self._keyword_processor = KeywordProcessor(**keyword_processor_args)
    self._topic_model = topic_model
    # Initial value set() makes this robust to an empty topic model,
    # where reduce without an initializer would raise TypeError.
    for keyphrase in reduce(lambda x, y: set(x) | set(y),
                            topic_model.values(), set()):
        self._keyword_processor.add_keyword(keyphrase)
    FeatsFromSpacyDoc.__init__(
        self,
        use_lemmas,
        set() if entity_types_to_censor is None else entity_types_to_censor,
        set() if tag_types_to_censor is None else tag_types_to_censor,
        strip_final_period)
    FeatsFromTopicModelBase.__init__(self, topic_model)
def test_lemmas(self):
    '''With use_lemmas=True, features come from token lemmas.'''
    parsed = whitespace_nlp("A a bb ddddd.")
    observed = FeatsFromSpacyDoc(use_lemmas=True).get_feats(parsed)
    expected = Counter({
        'a': 2,
        'bb': 1,
        'a bb': 1,
        'dd': 1,
        'a a': 1,
        'bb dd': 1
    })
    self.assertEqual(expected, observed)
def test_main(self):
    '''Default extraction yields lower-cased unigrams and bigrams.'''
    parsed = whitespace_nlp("A a bb cc.")
    observed = FeatsFromSpacyDoc().get_feats(parsed)
    expected = Counter({
        'a': 2,
        'bb': 1,
        'a bb': 1,
        'cc': 1,
        'a a': 1,
        'bb cc': 1
    })
    self.assertEqual(expected, observed)
def censor_entity_types(self, entity_types):
    # type: (set) -> TermDocMatrixFactory
    '''Exclude the given entity types from feature construction.

    Terms matching the specified entities are labeled by their entity
    type instead of their lower-case orthographic form or lemma.

    Parameters
    ----------
    entity_types : set
        Entity types output by spaCy, e.g. 'TIME', 'WORK_OF_ART',
        'PERSON', 'MONEY', 'ORG', 'ORDINAL', 'DATE', 'CARDINAL', 'LAW',
        'QUANTITY', 'GPE', 'PERCENT'.

    Returns
    -------
    self
    '''
    # Must be exactly a set (subclasses intentionally rejected).
    assert type(entity_types) == set
    self._entity_types_to_censor = entity_types
    # NOTE(review): assumes the owning object has already set
    # self._use_lemmas — confirm it is initialized in __init__.
    self._feats_from_spacy_doc = FeatsFromSpacyDoc(
        use_lemmas=self._use_lemmas,
        entity_types_to_censor=entity_types)
    return self
def __init__(self, df, parsed_col, feats_from_spacy_doc=None):
    '''
    Parameters
    ----------
    df : pd.DataFrame
        Contains parsed_col, a column consisting entirely of spaCy docs.
    parsed_col : str
        Name of the spaCy-parsed column in df.
    feats_from_spacy_doc : FeatsFromSpacyDoc, optional
        Feature extractor.  A fresh instance is created when None — this
        avoids the mutable-default pitfall of sharing one default
        FeatsFromSpacyDoc() instance across every construction.
    '''
    if feats_from_spacy_doc is None:
        feats_from_spacy_doc = FeatsFromSpacyDoc()
    self.df = df
    self.parsed_col = parsed_col
    self.feats_from_spacy_doc = feats_from_spacy_doc
def test_singleton_with_sentences(self):
    '''A single-token doc with sentence structure yields one unigram.'''
    parsed = whitespace_nlp_with_sentences("Blah")
    observed = FeatsFromSpacyDoc().get_feats(parsed)
    self.assertEqual(Counter({'blah': 1}), observed)
def __init__(self,
             category_text_iter=None,
             clean_function=lambda x: x,
             nlp=None,
             feats_from_spacy_doc=None):
    """Set up easy construction of a term-document matrix.

    Lets you define an iterator of (category, document text) pairs and a
    document-cleaning function applied to each document before parsing.

    Parameters
    ----------
    category_text_iter : iter<str: category, unicode: document)>
        An iterator of pairs: a string category name and the text of a
        document.  May also be set later via set_category_text_iter.
    clean_function : function (default lambda x: x)
        Strips invalid characters out of a string, returning the new
        string.
    nlp : spacy.load('en_core_web_sm') (default None)
        The spaCy parser used to parse documents.  If None, the class
        will go through the expensive operation of creating one.
    feats_from_spacy_doc : FeatsFromSpacyDoc (default None)
        Class for extraction of features from spaCy; a default
        FeatsFromSpacyDoc is created when None.

    Examples
    --------
    >>> import scattertext as ST
    >>> documents = [u'What art thou that usurpst this time of night,',
    ...              u'Together with that fair and warlike form']
    >>> categories = ['hamlet'] * 2
    >>> clean_function = lambda text: '' if text.startswith('[') else text
    >>> term_doc_mat = ST.TermDocMatrixFactory(
    ...     category_text_iter=zip(categories, documents),
    ...     clean_function=clean_function).build()
    """
    self._category_text_iter = category_text_iter
    self._clean_function = clean_function
    self._nlp = nlp
    self._entity_types_to_censor = set()
    self._feats_from_spacy_doc = (FeatsFromSpacyDoc()
                                  if feats_from_spacy_doc is None
                                  else feats_from_spacy_doc)
class TermDocMatrixFactory(object):
    '''Builds a TermDocMatrix from an iterator of (category, text) pairs.'''

    def __init__(self,
                 category_text_iter=None,
                 clean_function=lambda x: x,
                 nlp=None,
                 feats_from_spacy_doc=None):
        """Class for easy construction of a term-document matrix.

        Lets you define an iterator of (category, document text) pairs and
        a document-cleaning function applied to each document before it is
        parsed.

        Parameters
        ----------
        category_text_iter : iter<str: category, unicode: document)>
            An iterator of pairs: a string category name and the text of a
            document.  May also be set later via set_category_text_iter.
        clean_function : function (default lambda x: x)
            A function that strips invalid characters out of a string,
            returning the new string.
        nlp : spacy.load('en_core_web_sm') (default None)
            The spaCy parser used to parse documents.  If None, the class
            will go through the expensive operation of creating one.
        feats_from_spacy_doc : FeatsFromSpacyDoc (default None)
            Class for extraction of features from spaCy; a default
            FeatsFromSpacyDoc is created when None.

        Examples
        --------
        >>> import scattertext as ST
        >>> documents = [u'What art thou that usurpst this time of night,',
        ...              u'Together with that fair and warlike form']
        >>> categories = ['hamlet'] * 2
        >>> clean_function = lambda text: '' if text.startswith('[') else text
        >>> term_doc_mat = ST.TermDocMatrixFactory(
        ...     category_text_iter=zip(categories, documents),
        ...     clean_function=clean_function).build()
        """
        self._category_text_iter = category_text_iter
        self._clean_function = clean_function
        self._nlp = nlp
        self._entity_types_to_censor = set()
        # Bug fix: censor_entity_types reads self._use_lemmas, but no
        # method ever set it, so calling censor_entity_types raised
        # AttributeError.  Initialize to False (FeatsFromSpacyDoc's
        # default).
        self._use_lemmas = False
        if feats_from_spacy_doc is None:
            self._feats_from_spacy_doc = FeatsFromSpacyDoc()
        else:
            self._feats_from_spacy_doc = feats_from_spacy_doc

    def set_category_text_iter(self, category_text_iter):
        """Initializes the category_text_iter.

        Parameters
        ----------
        category_text_iter : iter<str: category, unicode: document)>
            An iterator of pairs: a string category name and document text.

        Returns
        -------
        self : TermDocMatrixFactory
        """
        self._category_text_iter = category_text_iter
        return self

    def set_nlp(self, nlp):
        """Adds a spaCy-compatible nlp function.

        Parameters
        ----------
        nlp : spacy model

        Returns
        -------
        self : TermDocMatrixFactory
        """
        self._nlp = nlp
        return self

    def build(self):
        """Generate a TermDocMatrix from the configured data.

        Returns
        -------
        term_doc_matrix : TermDocMatrix
            The object that this factory class builds.

        Raises
        ------
        CategoryTextIterNotSetError
            If no category/text iterator was configured.
        """
        if self._category_text_iter is None:
            raise CategoryTextIterNotSetError()
        nlp = self.get_nlp()
        # Clean each document before parsing; skip whitespace-only docs.
        category_document_iter = (
            (category, self._clean_function(raw_text))
            for category, raw_text in self._category_text_iter
        )
        term_doc_matrix = self._build_from_category_spacy_doc_iter(
            ((category, nlp(text))
             for (category, text) in category_document_iter
             if text.strip() != ''))
        return term_doc_matrix

    def get_nlp(self):
        """Return the configured spaCy model, loading a default if unset."""
        nlp = self._nlp
        if nlp is None:
            import spacy
            nlp = spacy.load('en_core_web_sm')
        return nlp

    def censor_entity_types(self, entity_types):
        # type: (set) -> TermDocMatrixFactory
        '''Entity types to exclude from feature construction.

        Terms matching specified entities, instead of being labeled by
        their lower case orthographic form or lemma, will be labeled by
        their entity type.

        Parameters
        ----------
        entity_types : set
            Entity types output by spaCy, e.g. 'TIME', 'WORK_OF_ART',
            'PERSON', 'MONEY', 'ORG', 'ORDINAL', 'DATE', 'CARDINAL',
            'LAW', 'QUANTITY', 'GPE', 'PERCENT'.

        Returns
        -------
        self
        '''
        assert type(entity_types) == set
        self._entity_types_to_censor = entity_types
        self._feats_from_spacy_doc = FeatsFromSpacyDoc(
            use_lemmas=self._use_lemmas,
            entity_types_to_censor=self._entity_types_to_censor)
        return self

    def _build_from_category_spacy_doc_iter(self, category_doc_iter):
        '''Assemble a TermDocMatrix from parsed documents.

        Parameters
        ----------
        category_doc_iter : iterator of
            (string category name, spacy.tokens.doc.Doc) pairs

        Returns
        -------
        t : TermDocMatrix
        '''
        term_idx_store = IndexStore()
        category_idx_store = IndexStore()
        metadata_idx_store = IndexStore()
        X, mX, y = self._get_features_and_labels_from_documents_and_indexes(
            category_doc_iter, category_idx_store, term_idx_store,
            metadata_idx_store)
        return TermDocMatrix(X,
                             mX,
                             y,
                             term_idx_store=term_idx_store,
                             category_idx_store=category_idx_store,
                             metadata_idx_store=metadata_idx_store)

    def _get_features_and_labels_from_documents_and_indexes(
            self, category_doc_iter, category_idx_store, term_idx_store,
            metadata_idx_store):
        # Accumulates term and metadata counts per document into sparse
        # matrix factories, and category labels into y.
        y = []
        X_factory = CSRMatrixFactory()
        mX_factory = CSRMatrixFactory()
        for document_index, (category,
                             parsed_text) in enumerate(category_doc_iter):
            self._register_doc_and_category(X_factory, mX_factory, category,
                                            category_idx_store,
                                            document_index, parsed_text,
                                            term_idx_store,
                                            metadata_idx_store, y)
        X = X_factory.get_csr_matrix()
        mX = mX_factory.get_csr_matrix()
        y = np.array(y)
        return X, mX, y

    def _old_register_doc_and_category(self, X_factory, category,
                                       category_idx_store, document_index,
                                       parsed_text, term_idx_store, y):
        # Legacy registration path; superseded by _register_doc_and_category.
        y.append(category_idx_store.getidx(category))
        document_features = self._get_features_from_parsed_text(
            parsed_text, term_idx_store)
        self._register_document_features_with_X_factory(
            X_factory, document_index, document_features)

    def _register_doc_and_category(self, X_factory, mX_factory, category,
                                   category_idx_store, document_index,
                                   parsed_text, term_idx_store,
                                   metadata_idx_store, y):
        # Record one document's features and its category label.
        self._register_doc(X_factory, mX_factory, document_index, parsed_text,
                           term_idx_store, metadata_idx_store)
        self._register_category(category, category_idx_store, y)

    def _register_doc(self, X_factory, mX_factory, document_index,
                      parsed_text, term_idx_store, metadata_idx_store):
        # Term counts go into X; per-document metadata values into mX.
        for term, count in self._feats_from_spacy_doc.get_feats(
                parsed_text).items():
            term_idx = term_idx_store.getidx(term)
            X_factory[document_index, term_idx] = count
        for term, val in self._feats_from_spacy_doc.get_doc_metadata(
                parsed_text).items():
            meta_idx = metadata_idx_store.getidx(term)
            mX_factory[document_index, meta_idx] = val

    def _register_category(self, category, category_idx_store, y):
        y.append(category_idx_store.getidx(category))

    def _register_document_features_with_X_factory(self, X_factory, doci,
                                                   term_freq):
        for word_idx, freq in term_freq.items():
            X_factory[doci, word_idx] = freq

    def _get_features_from_parsed_text(self, parsed_text, term_idx_store):
        # Only terms already present in term_idx_store are kept
        # (getidxstrict raises on unseen terms, so they are filtered first).
        return {
            term_idx_store.getidxstrict(k): v
            for k, v in self._feats_from_spacy_doc.get_feats(
                parsed_text).items() if k in term_idx_store
        }
def test_entity_types_to_censor_not_a_set(self):
    '''Passing a non-set entity_types_to_censor triggers the type assert.'''
    parsed = whitespace_nlp("A a bb cc.", {'bb': 'A'})
    with self.assertRaises(AssertionError):
        FeatsFromSpacyDoc(entity_types_to_censor='A').get_feats(parsed)
def test_empty(self):
    '''An empty document yields no features.'''
    observed = FeatsFromSpacyDoc().get_feats(whitespace_nlp(""))
    self.assertEqual(Counter(), observed)
def test_strip_final_period(self):
    '''strip_final_period=True drops trailing periods from features.'''
    parsed = bad_whitespace_nlp('''I CAN'T ANSWER THAT QUESTION. I HAVE NOT ASKED THEM SPECIFICALLY IF THEY HAVE ENOUGH.''')

    with_periods = Counter({
        'i': 2, 'have': 2, 'that question.': 1, 'answer': 1,
        'question.': 1, 'enough.': 1, 'i have': 1, 'them specifically': 1,
        'have enough.': 1, 'not asked': 1, 'they have': 1, 'have not': 1,
        'specifically': 1, 'answer that': 1, 'question. i': 1, "can't": 1,
        'if': 1, 'they': 1, "can't answer": 1, 'asked': 1, 'them': 1,
        'if they': 1, 'asked them': 1, 'that': 1, 'not': 1, "i can't": 1,
        'specifically if': 1
    })
    self.assertEqual(FeatsFromSpacyDoc().get_feats(parsed), with_periods)

    without_periods = Counter({
        'i': 2, 'have': 2, 'that question': 1, 'answer': 1, 'question': 1,
        'enough': 1, 'i have': 1, 'them specifically': 1, 'have enough': 1,
        'not asked': 1, 'they have': 1, 'have not': 1, 'specifically': 1,
        'answer that': 1, 'question i': 1, "can't": 1, 'if': 1, 'they': 1,
        "can't answer": 1, 'asked': 1, 'them': 1, 'if they': 1,
        'asked them': 1, 'that': 1, 'not': 1, "i can't": 1,
        'specifically if': 1
    })
    self.assertEqual(
        FeatsFromSpacyDoc(strip_final_period=True).get_feats(parsed),
        without_periods)
def __init__(self, extractor=None, **args):
    '''
    Parameters
    ----------
    extractor : callable, optional
        Term extractor; pyate's combo_basic is used when None.
    args : dict
        Remaining keyword arguments forwarded to FeatsFromSpacyDoc.
    '''
    import pyate
    if extractor is None:
        self._extractor = pyate.combo_basic
    else:
        self._extractor = extractor
    FeatsFromSpacyDoc.__init__(self, **args)