def test_pos(self, spacy_doc):
    result1 = list(extract.words(spacy_doc, include_pos={"NOUN"}))
    result2 = list(extract.words(spacy_doc, include_pos="NOUN"))
    assert all(tok.pos_ == "NOUN" for tok in result1)
    assert all(tok.pos_ == "NOUN" for tok in result2)
    result3 = list(extract.words(spacy_doc, exclude_pos={"NOUN"}))
    result4 = list(extract.words(spacy_doc, exclude_pos="NOUN"))
    assert not any(tok.pos_ == "NOUN" for tok in result3)
    assert not any(tok.pos_ == "NOUN" for tok in result4)
def word_movers(doc1, doc2, metric='cosine'):
    """
    Measure the semantic similarity between two documents using Word Movers
    Distance.

    Args:
        doc1 (``textacy.Doc`` or ``spacy.Doc``)
        doc2 (``textacy.Doc`` or ``spacy.Doc``)
        metric ({'cosine', 'euclidean', 'l1', 'l2', 'manhattan'})

    Returns:
        float: similarity between `doc1` and `doc2` in the interval [0.0, 1.0],
        where larger values correspond to more similar documents

    References:
        Ofir Pele and Michael Werman, "A linear time histogram metric for improved
        SIFT matching," in Computer Vision - ECCV 2008, Marseille, France, 2008.
        Ofir Pele and Michael Werman, "Fast and robust earth mover's distances,"
        in Proc. 2009 IEEE 12th Int. Conf. on Computer Vision, Kyoto, Japan, 2009.
        Kusner, Matt J., et al. "From word embeddings to document distances."
        Proceedings of the 32nd International Conference on Machine Learning
        (ICML 2015). 2015. http://jmlr.org/proceedings/papers/v37/kusnerb15.pdf
    """
    stringstore = StringStore()

    n = 0
    word_vecs = []
    for word in itertoolz.concatv(extract.words(doc1), extract.words(doc2)):
        if word.has_vector:
            if stringstore[word.text] - 1 == n:  # stringstore[0] always empty space
                word_vecs.append(word.vector)
                n += 1
    distance_mat = pairwise_distances(np.array(word_vecs), metric=metric).astype(np.double)
    distance_mat /= distance_mat.max()

    vec1 = collections.Counter(
        stringstore[word.text] - 1
        for word in extract.words(doc1)
        if word.has_vector)
    vec1 = np.array([vec1[word_idx]
                     for word_idx in range(len(stringstore))]).astype(np.double)
    vec1 /= vec1.sum()  # normalize word counts

    vec2 = collections.Counter(
        stringstore[word.text] - 1
        for word in extract.words(doc2)
        if word.has_vector)
    vec2 = np.array([vec2[word_idx]
                     for word_idx in range(len(stringstore))]).astype(np.double)
    vec2 /= vec2.sum()  # normalize word counts

    return 1.0 - emd(vec1, vec2, distance_mat)
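# Usage sketch (illustrative, not from the original source): computing Word Movers
# similarity between two parsed docs with the function defined above. Assumes a
# spaCy pipeline whose vocabulary includes word vectors (e.g. "en_core_web_md");
# without vectors, no words pass the `has_vector` check. Note that the sequential
# StringStore indexing above appears to assume an older spaCy (1.x), where string
# IDs were consecutive integers rather than hashes.
import spacy

nlp = spacy.load("en_core_web_md")
doc1 = nlp("The cat sat quietly on the mat.")
doc2 = nlp("A kitten rested calmly on the rug.")

similarity = word_movers(doc1, doc2, metric="cosine")
print(similarity)  # float in [0.0, 1.0]; larger means more similar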
def words(self, **kwargs):
    """
    Extract an ordered sequence of words from a spacy-parsed doc, optionally
    filtering words by part-of-speech (etc.) and frequency.

    Args:
        **kwargs:
            filter_stops (bool, optional): if True, remove stop words from word list
            filter_punct (bool, optional): if True, remove punctuation from word list
            filter_nums (bool, optional): if True, remove number-like words
                (e.g. 10, 'ten') from word list
            good_pos_tags (set[str], optional): remove words whose part-of-speech tag
                is NOT in the specified tags (drawn from the universal POS tag set)
            bad_pos_tags (set[str], optional): remove words whose part-of-speech tag
                IS in the specified tags (drawn from the universal POS tag set)
            min_freq (int, optional): remove words that occur in `doc` fewer than
                `min_freq` times

    Yields:
        ``spacy.Token``: the next token passing all specified filters,
        in order of appearance in the document

    .. seealso:: :func:`extract.words() <textacy.extract.words>`
    """
    for word in extract.words(self.spacy_doc, **kwargs):
        yield word
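# Usage sketch (illustrative, not from the original source): calling textacy's
# extract.words() directly, which the method above wraps. The POS-filtering kwarg
# differs across the versions shown in this file: older code uses
# good_pos_tags/bad_pos_tags, newer code uses include_pos/exclude_pos. The spaCy
# model name and sample text below are assumptions.
import spacy
from textacy import extract

nlp = spacy.load("en_core_web_sm")
doc = nlp("Two weeks ago, I was in Kuwait participating in an I.M.F. seminar.")

# drop stop words, punctuation, and number-like tokens
words = list(extract.words(doc, filter_stops=True, filter_punct=True, filter_nums=True))
print([tok.text for tok in words])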
def __init__(self, doc):
    if isinstance(doc, SpacyDoc):
        lang = doc.vocab.lang
        self.n_sents = sum(1 for _ in doc.sents)
    elif isinstance(doc, textacy.Doc):
        lang = doc.lang
        self.n_sents = doc.n_sents
    else:
        raise ValueError('``doc`` must be a ``textacy.Doc`` or ``spacy.Doc``')
    # get objs for basic count computations
    hyphenator = data.load_hyphenator(lang=lang)
    words = tuple(
        extract.words(doc, filter_punct=True, filter_stops=False, filter_nums=False))
    syllables_per_word = tuple(
        len(hyphenator.positions(word.lower_)) + 1 for word in words)
    chars_per_word = tuple(len(word) for word in words)
    # compute basic counts needed for most readability stats
    self.n_words = len(words)
    self.n_unique_words = len({word.lower for word in words})
    self.n_chars = sum(chars_per_word)
    self.n_long_words = sum(1 for cpw in chars_per_word if cpw >= 7)
    self.n_syllables = sum(syllables_per_word)
    self.n_monosyllable_words = sum(1 for spw in syllables_per_word if spw == 1)
    self.n_polysyllable_words = sum(1 for spw in syllables_per_word if spw >= 3)
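# Usage sketch (illustrative, not from the original source): the __init__ above
# belongs to textacy's TextStats class (named in the deprecation notice further
# down); instantiating it precomputes the counts that the readability formulas
# consume. The spaCy model name and sample text are assumptions.
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Readability statistics need sentence, word, and syllable counts.")

ts = TextStats(doc)
print(ts.n_sents, ts.n_words, ts.n_syllables, ts.n_polysyllable_words)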
def doc_to_gensim(doc, lemmatize=True,
                  filter_stops=True, filter_punct=True, filter_nums=False):
    """
    Convert a single ``spacy.Doc`` into a gensim dictionary and bag-of-words document.

    Args:
        doc (``spacy.Doc``)
        lemmatize (bool): if True, use lemmatized strings for words; otherwise,
            use the original form of the string as it appears in ``doc``
        filter_stops (bool): if True, remove stop words from word list
        filter_punct (bool): if True, remove punctuation from word list
        filter_nums (bool): if True, remove numbers from word list

    Returns:
        :class:`gensim.Dictionary <gensim.corpora.dictionary.Dictionary>`:
            integer word ID to word string mapping
        list((int, int)): bag-of-words document, a list of
            (integer word ID, word count) 2-tuples
    """
    gdict = Dictionary()
    words = extract.words(doc,
                          filter_stops=filter_stops,
                          filter_punct=filter_punct,
                          filter_nums=filter_nums)
    if lemmatize is True:
        gdoc = gdict.doc2bow((word.lemma_ for word in words), allow_update=True)
    else:
        gdoc = gdict.doc2bow((word.orth_ for word in words), allow_update=True)
    return (gdict, gdoc)
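# Usage sketch (illustrative, not from the original source): converting a parsed
# doc into a gensim dictionary and bag-of-words with the function above. The spaCy
# model name and sample text are assumptions.
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Gensim topic models expect bag-of-words input, not raw text.")

gdict, gdoc = doc_to_gensim(doc, lemmatize=True)
print(gdict.token2id)  # {word_string: word_id, ...}
print(gdoc)            # [(word_id, count), ...]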
def pos_density(doc):
    tag_counts = Counter([token.tag_ for token in doc])
    pos_counts = Counter([token.pos_ for token in doc])
    n_sentences = _count_iter_items(doc.sents)
    # OLD: we use tokens and not words as in the PhD thesis; maybe we should change it
    # n_tokens = sum([len(sent) for sent in doc.sents])
    n_words = _count_iter_items(
        extract.words(doc, filter_punct=True, filter_stops=False, filter_nums=False))
    return {
        'num_comma': tag_counts[','] / n_sentences,  # commas / total sentences
        'nouns': (pos_counts['NOUN'] + pos_counts['PROPN']) / n_words,  # (nouns + proper nouns) / all words
        'propernouns': pos_counts['PROPN'] / n_words,  # proper nouns / all words
        'pronouns': (tag_counts['PRP'] + tag_counts['PRP$']) / n_words,  # pronouns / all words
        # should we use ADP here?!?!
        'conj': (pos_counts['CONJ'] + pos_counts['ADP']) / n_words,  # conjunctions / all words
        'adj': pos_counts['ADJ'] / n_words,  # adjectives / all words
        'ver': (pos_counts['VERB'] - tag_counts['MD']) / n_words,  # non-modal verbs / all words
        'interj': pos_counts['INTJ'] / n_sentences,  # interjections / total sentences
        'adverbs': pos_counts['ADV'] / n_sentences,  # adverbs / total sentences
        'modals': tag_counts['MD'] / n_sentences,  # modal verbs / total sentences
        'perpro': tag_counts['PRP'] / n_sentences,  # personal pronouns / total sentences
        'whpro': (tag_counts['WP'] + tag_counts['WP$']) / n_sentences,  # wh- pronouns / total sentences
        'numfuncwords': (tag_counts['BES'] + tag_counts['CC'] + tag_counts['DT'] +
                         tag_counts['EX'] + tag_counts['HVS'] + tag_counts['IN'] +
                         tag_counts['MD'] + tag_counts['PRP'] + tag_counts['PRP$'] +
                         tag_counts['RP'] + tag_counts['TO'] +
                         tag_counts['UH']) / n_sentences,  # function words / total sentences
        'numdet': pos_counts['DET'] / n_sentences,  # determiners / total sentences
        'numvb': tag_counts['VB'] / n_sentences,  # VB tags / total sentences
        'numvbd': tag_counts['VBD'] / n_sentences,  # VBD tags / total sentences
        'numvbg': tag_counts['VBG'] / n_sentences,  # VBG tags / total sentences
        'numvbn': tag_counts['VBN'] / n_sentences,  # VBN tags / total sentences
        'numvbp': tag_counts['VBP'] / n_sentences,  # VBP tags / total sentences
    }
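# Usage sketch (illustrative, not from the original source): the helper
# _count_iter_items used above is not shown in this file; a minimal stand-in that
# just counts items in an iterator is assumed below, as are the spaCy model name
# and sample text.
import spacy

def _count_iter_items(iterable):
    # hypothetical stand-in for the missing helper: count items lazily
    return sum(1 for _ in iterable)

nlp = spacy.load("en_core_web_sm")
doc = nlp("The quick brown fox jumps over the lazy dog. It barked loudly.")
print(pos_density(doc))  # dict of POS ratios per word / per sentence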
def test_words(self):
    expected = [
        'Two', 'weeks', 'ago', ',', 'I', 'was', 'in', 'Kuwait', 'participating',
        'in', 'an', 'I.M.F.', 'seminar', 'for', 'Arab', 'educators', '.', 'For',
        '30', 'minutes', ',', 'we', 'discussed', 'the', 'impact']
    observed = [tok.orth_ for tok in extract.words(
        self.spacy_doc, filter_stops=False, filter_punct=False,
        filter_nums=False)][:25]
    self.assertEqual(observed, expected)
def test_words_filter(spacy_doc):
    result = [
        tok for tok in extract.words(
            spacy_doc, filter_stops=True, filter_punct=True, filter_nums=True)
    ]
    assert not any(tok.is_stop for tok in result)
    assert not any(tok.is_punct for tok in result)
    assert not any(tok.like_num for tok in result)
def words(self, **kwargs):
    """
    Extract an ordered sequence of words from a spacy-parsed doc, optionally
    filtering words by part-of-speech (etc.) and frequency.

    .. seealso:: :func:`extract.words() <textacy.extract.words>` for all function kwargs.
    """
    return extract.words(self.spacy_doc, **kwargs)
def test_words_min_freq(self):
    expected = [
        ',', 'I', 'was', 'in', 'in', 'an', 'for', '.', 'For', ',', 'we', 'the',
        'education', 'in', 'the', '.', 'And', 'an', 'education', 'and', 'asked',
        'he', 'ask', '"', 'I']
    observed = [tok.orth_ for tok in extract.words(
        self.spacy_doc, filter_stops=False, filter_punct=False,
        filter_nums=False, min_freq=2)][:25]
    self.assertEqual(observed, expected)
def test_words_good_tags(self):
    expected = [
        'weeks', 'seminar', 'educators', 'minutes', 'impact', 'technology',
        'trends', 'education', 'education', 'official', 'hand', 'question',
        'mosques', 'sorrow', 'what', 'kids']
    observed = [tok.orth_ for tok in extract.words(
        self.spacy_doc, filter_stops=False, filter_punct=False,
        filter_nums=False, good_pos_tags={'NOUN'})][:25]
    self.assertEqual(observed, expected)
def test_words_filter(self):
    expected = [
        'weeks', 'ago', 'Kuwait', 'participating', 'I.M.F.', 'seminar', 'Arab',
        'educators', 'minutes', 'discussed', 'impact', 'technology', 'trends',
        'education', 'Middle', 'East', 'Egyptian', 'education', 'official',
        'raised', 'hand', 'asked', 'ask', 'personal', 'question']
    observed = [tok.orth_ for tok in extract.words(
        self.spacy_doc, filter_stops=True, filter_punct=True,
        filter_nums=True)][:25]
    self.assertEqual(observed, expected)
def test_words_good_tags(spacy_doc):
    result = [
        tok for tok in extract.words(spacy_doc,
                                     filter_stops=False, filter_punct=False,
                                     filter_nums=False, include_pos={'NOUN'})
    ]
    assert all(tok.pos_ == 'NOUN' for tok in result)
def test_filter(self, spacy_doc):
    result = list(
        extract.words(
            spacy_doc, filter_stops=True, filter_punct=True, filter_nums=True
        )
    )
    assert not any(tok.is_stop for tok in result)
    assert not any(tok.is_punct for tok in result)
    assert not any(tok.like_num for tok in result)
def test_words_good_tags(self):
    expected = [
        'weeks', 'I', 'Kuwait', 'I.M.F.', 'seminar', 'educators', 'minutes',
        'we', 'impact', 'technology', 'trends', 'education', 'Middle', 'East',
        'education', 'official', 'hand', 'he', 'me', 'question', 'I', 'Donald',
        'Trump', 'we', 'mosques']
    observed = [tok.orth_ for tok in extract.words(
        self.spacy_doc, filter_stops=False, filter_punct=False,
        filter_nums=False, good_pos_tags={'NOUN'})][:25]
    self.assertEqual(observed, expected)
def readability_stats(doc):
    """
    Get calculated values for a variety of statistics related to the "readability"
    of a text: Flesch-Kincaid Grade Level, Flesch Reading Ease, SMOG Index,
    Gunning-Fog Index, Coleman-Liau Index, and Automated Readability Index.

    Also includes constituent values needed to compute the stats, e.g. word count.

    **DEPRECATED**

    Args:
        doc (:class:`textacy.Doc <textacy.document.Doc>`)

    Returns:
        dict: mapping of readability statistic name (str) to value (int or float)

    Raises:
        NotImplementedError: if ``doc`` is not English language. sorry.
    """
    msg = '`readability_stats()` function is deprecated; use `TextStats` class instead'
    with warnings.catch_warnings():
        warnings.simplefilter('once', DeprecationWarning)
        warnings.warn(msg, DeprecationWarning)
    if doc.lang != 'en':
        raise NotImplementedError('non-English NLP is not ready yet, sorry')
    n_sents = doc.n_sents
    words = list(extract.words(doc, filter_punct=True, filter_stops=False, filter_nums=False))
    n_words = len(words)
    if n_words == 0:
        logging.warning("readability stats can't be computed because doc has 0 words")
        return None
    n_unique_words = len({word.lower for word in words})
    n_chars = sum(len(word) for word in words)
    hyphenator = data.load_hyphenator(lang='en')
    syllables_per_word = [len(hyphenator.positions(word.lower_)) + 1 for word in words]
    n_syllables = sum(syllables_per_word)
    n_polysyllable_words = sum(1 for n in syllables_per_word if n >= 3)
    return {'n_sents': n_sents,
            'n_words': n_words,
            'n_unique_words': n_unique_words,
            'n_chars': n_chars,
            'n_syllables': n_syllables,
            'n_polysyllable_words': n_polysyllable_words,
            'flesch_kincaid_grade_level': flesch_kincaid_grade_level(n_syllables, n_words, n_sents),
            'flesch_readability_ease': flesch_readability_ease(n_syllables, n_words, n_sents),
            'smog_index': smog_index(n_polysyllable_words, n_sents),
            'gunning_fog_index': gunning_fog_index(n_words, n_polysyllable_words, n_sents),
            'coleman_liau_index': coleman_liau_index(n_chars, n_words, n_sents),
            'automated_readability_index': automated_readability_index(n_chars, n_words, n_sents)}
def test_words_min_freq(spacy_doc):
    counts = collections.Counter()
    counts.update(tok.lower_ for tok in spacy_doc)
    result = [
        tok for tok in extract.words(spacy_doc,
                                     filter_stops=False, filter_punct=False,
                                     filter_nums=False, min_freq=2)
    ]
    assert all(counts[tok.lower_] >= 2 for tok in result)
def sents_to_semantic_network(sents, edge_weighting='cosine'):
    """
    Convert a list of sentences into a semantic network, where each sentence is
    represented by a node with edges linking it to other sentences weighted by
    the (cosine or jaccard) similarity of their constituent words.

    Args:
        sents (list(str) or list(:class:`spacy.Span`))
        edge_weighting (str {'cosine', 'jaccard'}, optional): similarity metric
            to use for weighting edges between sentences; if 'cosine', use the
            cosine similarity between sentences represented as tf-idf word vectors;
            if 'jaccard', use the set intersection divided by the set union of
            all words in a given sentence pair

    Returns:
        :class:`networkx.Graph()`: nodes are the integer indexes of the sentences
            in the input ``sents`` list, *not* the actual text of the sentences!

    Notes:
        * If passing sentences as strings, be sure to filter out stopwords,
          punctuation, certain parts of speech, etc. beforehand
        * Consider normalizing the strings so that like terms are counted together
          (see :func:`normalized_str() <textacy.spacy_utils.normalized_str>`)
    """
    n_sents = len(sents)

    if isinstance(sents[0], str):
        pass
    elif isinstance(sents[0], spacy_span):
        sents = [
            ' '.join(
                normalized_str(tok)
                for tok in extract.words(sent, filter_stops=True,
                                         filter_punct=True, filter_nums=False))
            for sent in sents
        ]
    else:
        msg = 'Input sents must be strings or spacy Spans, not {}.'.format(type(sents[0]))
        raise TypeError(msg)

    if edge_weighting == 'cosine':
        term_sent_matrix = TfidfVectorizer().fit_transform(sents)
    elif edge_weighting == 'jaccard':
        term_sent_matrix = CountVectorizer(binary=True).fit_transform(sents)
    weights = (term_sent_matrix * term_sent_matrix.T).A.tolist()

    graph = nx.Graph()
    graph.add_edges_from(
        (i, j, {'weight': weights[i][j]})
        for i in range(n_sents)
        for j in range(i + 1, n_sents))

    return graph
def sents_to_semantic_network(sents, edge_weighting='cosine'):
    """
    Convert a list of sentences into a semantic network, where each sentence is
    represented by a node with edges linking it to other sentences weighted by
    the (cosine or jaccard) similarity of their constituent words.

    Args:
        sents (list(str) or list(:class:`spacy.Span`))
        edge_weighting (str {'cosine', 'jaccard'}, optional): similarity metric
            to use for weighting edges between sentences; if 'cosine', use the
            cosine similarity between sentences represented as tf-idf word vectors;
            if 'jaccard', use the set intersection divided by the set union of
            all words in a given sentence pair

    Returns:
        :class:`networkx.Graph`: nodes are the integer indexes of the sentences
            in the input ``sents`` list, *not* the actual text of the sentences!

    Notes:
        * If passing sentences as strings, be sure to filter out stopwords,
          punctuation, certain parts of speech, etc. beforehand
        * Consider normalizing the strings so that like terms are counted together
          (see :func:`normalized_str() <textacy.spacy_utils.normalized_str>`)
    """
    n_sents = len(sents)

    if isinstance(sents[0], unicode_type):
        pass
    elif isinstance(sents[0], SpacySpan):
        sents = [' '.join(normalized_str(tok)
                          for tok in extract.words(sent, filter_stops=True,
                                                   filter_punct=True, filter_nums=False))
                 for sent in sents]
    else:
        msg = 'Input sents must be strings or spacy Spans, not {}.'.format(type(sents[0]))
        raise TypeError(msg)

    if edge_weighting == 'cosine':
        term_sent_matrix = TfidfVectorizer().fit_transform(sents)
    elif edge_weighting == 'jaccard':
        term_sent_matrix = CountVectorizer(binary=True).fit_transform(sents)
    weights = (term_sent_matrix * term_sent_matrix.T).A.tolist()

    graph = nx.Graph()
    graph.add_edges_from(
        (i, j, {'weight': weights[i][j]})
        for i in range(n_sents)
        for j in range(i + 1, n_sents))

    return graph
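# Usage sketch (illustrative, not from the original source): building a sentence
# similarity graph from a parsed doc's sentence spans with the function above;
# spaCy Spans are passed in directly so the function handles word filtering and
# normalization itself. The spaCy model name and sample text are assumptions.
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Cats sleep most of the day. Dogs need regular walks. "
          "Cats and dogs can live together.")

graph = sents_to_semantic_network(list(doc.sents), edge_weighting="cosine")
print(graph.edges(data=True))  # [(0, 1, {'weight': ...}), ...]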
def readability_stats(doc):
    """
    Get calculated values for a variety of statistics related to the "readability"
    of a text: Flesch-Kincaid Grade Level, Flesch Reading Ease, SMOG Index,
    Gunning-Fog Index, Coleman-Liau Index, and Automated Readability Index.

    Also includes constituent values needed to compute the stats, e.g. word count.

    Args:
        doc (:class:`textacy.Doc <textacy.document.Doc>`)

    Returns:
        dict: mapping of readability statistic name (str) to value (int or float)

    Raises:
        NotImplementedError: if ``doc`` is not English language. sorry.
    """
    if doc.lang != 'en':
        raise NotImplementedError('non-English NLP is not ready yet, sorry')
    n_sents = doc.n_sents
    words = list(extract.words(doc, filter_punct=True, filter_stops=False, filter_nums=False))
    n_words = len(words)
    n_unique_words = len({word.lower for word in words})
    n_chars = sum(len(word) for word in words)
    hyphenator = data.load_hyphenator(lang='en')
    syllables_per_word = [len(hyphenator.positions(word.lower_)) + 1 for word in words]
    n_syllables = sum(syllables_per_word)
    n_polysyllable_words = sum(1 for n in syllables_per_word if n >= 3)
    return {'n_sents': n_sents,
            'n_words': n_words,
            'n_unique_words': n_unique_words,
            'n_chars': n_chars,
            'n_syllables': n_syllables,
            'n_polysyllable_words': n_polysyllable_words,
            'flesch_kincaid_grade_level': flesch_kincaid_grade_level(n_syllables, n_words, n_sents),
            'flesch_readability_ease': flesch_readability_ease(n_syllables, n_words, n_sents),
            'smog_index': smog_index(n_polysyllable_words, n_sents),
            'gunning_fog_index': gunning_fog_index(n_words, n_polysyllable_words, n_sents),
            'coleman_liau_index': coleman_liau_index(n_chars, n_words, n_sents),
            'automated_readability_index': automated_readability_index(n_chars, n_words, n_sents)}
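# Usage sketch (illustrative, not from the original source): the function above
# expects a textacy Doc (it reads doc.lang and doc.n_sents), not a raw spaCy doc.
# How the Doc is constructed varies by textacy version; the constructor call
# below is an assumption.
import textacy

doc = textacy.Doc("Shorter sentences with short words are generally easier to read.",
                  lang="en")
stats = readability_stats(doc)
print(stats["flesch_kincaid_grade_level"], stats["flesch_readability_ease"])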
def words(doc):
    return list(
        extract.words(doc, filter_punct=True, filter_stops=False, filter_nums=False))
def test_words_good_tags(spacy_doc):
    result1 = list(extract.words(spacy_doc, include_pos={"NOUN"}))
    result2 = list(extract.words(spacy_doc, include_pos="NOUN"))
    assert all(tok.pos_ == "NOUN" for tok in result1)
    assert all(tok.pos_ == "NOUN" for tok in result2)
def test_default(self, spacy_doc):
    result = list(extract.words(spacy_doc))
    assert all(isinstance(tok, Token) for tok in result)
    assert not any(tok.is_space for tok in result)
def test_min_freq(self, spacy_doc):
    counts = collections.Counter()
    counts.update(tok.lower_ for tok in spacy_doc)
    result = list(extract.words(spacy_doc, min_freq=2))
    assert all(counts[tok.lower_] >= 2 for tok in result)