Example #1
 def test_pos(self, spacy_doc):
     result1 = list(extract.words(spacy_doc, include_pos={"NOUN"}))
     result2 = list(extract.words(spacy_doc, include_pos="NOUN"))
     assert all(tok.pos_ == "NOUN" for tok in result1)
     assert all(tok.pos_ == "NOUN" for tok in result2)
     result3 = list(extract.words(spacy_doc, exclude_pos={"NOUN"}))
     result4 = list(extract.words(spacy_doc, exclude_pos="NOUN"))
     assert not any(tok.pos_ == "NOUN" for tok in result3)
     assert not any(tok.pos_ == "NOUN" for tok in result4)
Example #2
def word_movers(doc1, doc2, metric='cosine'):
    """
    Measure the semantic similarity between two documents using Word Mover's
    Distance.

    Args:
        doc1 (``textacy.Doc`` or ``spacy.Doc``)
        doc2 (``textacy.Doc`` or ``spacy.Doc``)
        metric ({'cosine', 'euclidean', 'l1', 'l2', 'manhattan'})

    Returns:
        float: similarity between `doc1` and `doc2` in the interval [0.0, 1.0],
            where larger values correspond to more similar documents

    References:
        Ofir Pele and Michael Werman, "A linear time histogram metric for improved
            SIFT matching," in Computer Vision - ECCV 2008, Marseille, France, 2008.
        Ofir Pele and Michael Werman, "Fast and robust earth mover's distances,"
            in Proc. 2009 IEEE 12th Int. Conf. on Computer Vision, Kyoto, Japan, 2009.
        Kusner, Matt J., et al. "From word embeddings to document distances."
            Proceedings of the 32nd International Conference on Machine Learning
            (ICML 2015). 2015. http://jmlr.org/proceedings/papers/v37/kusnerb15.pdf
    """
    stringstore = StringStore()

    n = 0
    word_vecs = []
    for word in itertoolz.concatv(extract.words(doc1), extract.words(doc2)):
        if word.has_vector:
            if stringstore[word.text] - 1 == n:  # stringstore[0] always empty space
                word_vecs.append(word.vector)
                n += 1
    distance_mat = pairwise_distances(np.array(word_vecs), metric=metric).astype(np.double)
    distance_mat /= distance_mat.max()

    vec1 = collections.Counter(stringstore[word.text] - 1
                               for word in extract.words(doc1)
                               if word.has_vector)
    vec1 = np.array([vec1[word_idx] for word_idx in range(len(stringstore))]).astype(np.double)
    vec1 /= vec1.sum()  # normalize word counts

    vec2 = collections.Counter(stringstore[word.text] - 1
                               for word in extract.words(doc2)
                               if word.has_vector)
    vec2 = np.array([vec2[word_idx] for word_idx in range(len(stringstore))]).astype(np.double)
    vec2 /= vec2.sum()  # normalize word counts

    return 1.0 - emd(vec1, vec2, distance_mat)
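A hedged usage sketch for the function above: in older textacy releases it lived at textacy.similarity.word_movers, and it needs a spaCy model that ships word vectors; treat the module path and model name below as assumptions.

# Usage sketch (assumptions: older textacy exposing similarity.word_movers,
# and a vector-bearing spaCy model such as "en_core_web_md").
import spacy
from textacy import similarity

nlp = spacy.load("en_core_web_md")
doc1 = nlp("The cat sat on the mat.")
doc2 = nlp("A kitten rested on the rug.")

score = similarity.word_movers(doc1, doc2, metric="cosine")
print(round(score, 3))  # values nearer 1.0 indicate more similar documents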
Example #3
def word_movers(doc1, doc2, metric='cosine'):
    """
    Measure the semantic similarity between two documents using Word Mover's
    Distance.

    Args:
        doc1 (``textacy.Doc`` or ``spacy.Doc``)
        doc2 (``textacy.Doc`` or ``spacy.Doc``)
        metric ({'cosine', 'euclidean', 'l1', 'l2', 'manhattan'})

    Returns:
        float: similarity between `doc1` and `doc2` in the interval [0.0, 1.0],
            where larger values correspond to more similar documents

    References:
        Ofir Pele and Michael Werman, "A linear time histogram metric for improved
            SIFT matching," in Computer Vision - ECCV 2008, Marseille, France, 2008.
        Ofir Pele and Michael Werman, "Fast and robust earth mover's distances,"
            in Proc. 2009 IEEE 12th Int. Conf. on Computer Vision, Kyoto, Japan, 2009.
        Kusner, Matt J., et al. "From word embeddings to document distances."
            Proceedings of the 32nd International Conference on Machine Learning
            (ICML 2015). 2015. http://jmlr.org/proceedings/papers/v37/kusnerb15.pdf
    """
    stringstore = StringStore()

    n = 0
    word_vecs = []
    for word in itertoolz.concatv(extract.words(doc1), extract.words(doc2)):
        if word.has_vector:
            if stringstore[word.text] - 1 == n:  # stringstore[0] always empty space
                word_vecs.append(word.vector)
                n += 1
    distance_mat = pairwise_distances(np.array(word_vecs), metric=metric).astype(np.double)
    distance_mat /= distance_mat.max()

    vec1 = collections.Counter(
        stringstore[word.text] - 1
        for word in extract.words(doc1)
        if word.has_vector)
    vec1 = np.array([vec1[word_idx] for word_idx in range(len(stringstore))]).astype(np.double)
    vec1 /= vec1.sum()  # normalize word counts

    vec2 = collections.Counter(
        stringstore[word.text] - 1
        for word in extract.words(doc2)
        if word.has_vector)
    vec2 = np.array([vec2[word_idx] for word_idx in range(len(stringstore))]).astype(np.double)
    vec2 /= vec2.sum()  # normalize word counts

    return 1.0 - emd(vec1, vec2, distance_mat)
Example #4
    def words(self, **kwargs):
        """
        Extract an ordered sequence of words from a spacy-parsed doc, optionally
        filtering words by part-of-speech (etc.) and frequency.

        Args:
            **kwargs:
                filter_stops (bool, optional): if True, remove stop words from word list
                filter_punct (bool, optional): if True, remove punctuation from word list
                filter_nums (bool, optional): if True, remove number-like words
                    (e.g. 10, 'ten') from word list
                good_pos_tags (set[str], optional): remove words whose part-of-speech tag
                    is NOT among the specified tags, drawn from the universal POS tag set
                bad_pos_tags (set[str], optional): remove words whose part-of-speech tag
                    IS among the specified tags, drawn from the universal POS tag set
                min_freq (int, optional): remove words that occur in `doc` fewer than
                    `min_freq` times

        Yields:
            ``spacy.Token``: the next token passing all specified filters,
                in order of appearance in the document

        .. seealso:: :func:`extract.words() <textacy.extract.words>`
        """
        for word in extract.words(self.spacy_doc, **kwargs):
            yield word
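A hedged sketch of calling this method; it assumes the older textacy.Doc wrapper that exposes .words() with the good_pos_tags kwarg documented above, and the constructor shown is itself an assumption about that older API.

# Usage sketch (assumption: an older textacy whose Doc wrapper takes raw text
# and a lang code, and whose .words() understands good_pos_tags).
import textacy

doc = textacy.Doc("For 30 minutes, we discussed the impact of technology trends.", lang="en")
nouns = list(doc.words(filter_stops=False, filter_punct=False, good_pos_tags={"NOUN"}))
print([tok.orth_ for tok in nouns])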
Example #5
    def words(self, **kwargs):
        """
        Extract an ordered sequence of words from a spacy-parsed doc, optionally
        filtering words by part-of-speech (etc.) and frequency.

        Args:
            **kwargs:
                filter_stops (bool, optional): if True, remove stop words from word list
                filter_punct (bool, optional): if True, remove punctuation from word list
                filter_nums (bool, optional): if True, remove number-like words
                    (e.g. 10, 'ten') from word list
                good_pos_tags (set[str], optional): remove words whose part-of-speech tag
                    is NOT among the specified tags, drawn from the universal POS tag set
                bad_pos_tags (set[str], optional): remove words whose part-of-speech tag
                    IS among the specified tags, drawn from the universal POS tag set
                min_freq (int, optional): remove words that occur in `doc` fewer than
                    `min_freq` times

        Yields:
            ``spacy.Token``: the next token passing all specified filters,
                in order of appearance in the document

        .. seealso:: :func:`extract.words() <textacy.extract.words>`
        """
        for word in extract.words(self.spacy_doc, **kwargs):
            yield word
Example #6
 def __init__(self, doc):
     if isinstance(doc, SpacyDoc):
         lang = doc.vocab.lang
         self.n_sents = sum(1 for _ in doc.sents)
     elif isinstance(doc, textacy.Doc):
         lang = doc.lang
         self.n_sents = doc.n_sents
     else:
         raise ValueError(
             '``doc`` must be a ``textacy.Doc`` or ``spacy.Doc``')
     # get objs for basic count computations
     hyphenator = data.load_hyphenator(lang=lang)
     words = tuple(
         extract.words(doc,
                       filter_punct=True,
                       filter_stops=False,
                       filter_nums=False))
     syllables_per_word = tuple(
         len(hyphenator.positions(word.lower_)) + 1 for word in words)
     chars_per_word = tuple(len(word) for word in words)
     # compute basic counts needed for most readability stats
     self.n_words = len(words)
     self.n_unique_words = len({word.lower for word in words})
     self.n_chars = sum(chars_per_word)
     self.n_long_words = sum(1 for cpw in chars_per_word if cpw >= 7)
     self.n_syllables = sum(syllables_per_word)
     self.n_monosyllable_words = sum(1 for spw in syllables_per_word
                                     if spw == 1)
     self.n_polysyllable_words = sum(1 for spw in syllables_per_word
                                     if spw >= 3)
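The counts gathered in this constructor are exactly the inputs that standard readability formulas consume. As one concrete instance, a sketch of the Flesch-Kincaid grade level using its widely published coefficients (the helper name here is illustrative):

# Flesch-Kincaid grade level from the counts computed above (standard coefficients).
def flesch_kincaid_grade_level(n_syllables, n_words, n_sents):
    return 11.8 * (n_syllables / n_words) + 0.39 * (n_words / n_sents) - 15.59

# e.g. 150 syllables spread over 100 words and 5 sentences:
print(flesch_kincaid_grade_level(150, 100, 5))  # ~9.9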
Example #7
def doc_to_gensim(doc, lemmatize=True,
                  filter_stops=True, filter_punct=True, filter_nums=False):
    """
    Convert a single ``spacy.Doc`` into a gensim dictionary and bag-of-words document.

    Args:
        doc (``spacy.Doc``)
        lemmatize (bool): if True, use lemmatized strings for words; otherwise,
            use the original form of the string as it appears in ``doc``
        filter_stops (bool): if True, remove stop words from word list
        filter_punct (bool): if True, remove punctuation from word list
        filter_nums (bool): if True, remove numbers from word list

    Returns:
        :class:`gensim.Dictionary <gensim.corpora.dictionary.Dictionary>`:
            integer word ID to word string mapping
        list((int, int)): bag-of-words document, a list of (integer word ID, word count)
            2-tuples
    """
    gdict = Dictionary()
    words = extract.words(doc,
                          filter_stops=filter_stops,
                          filter_punct=filter_punct,
                          filter_nums=filter_nums)
    if lemmatize is True:
        gdoc = gdict.doc2bow((word.lemma_ for word in words), allow_update=True)
    else:
        gdoc = gdict.doc2bow((word.orth_ for word in words), allow_update=True)

    return (gdict, gdoc)
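A hedged usage sketch for doc_to_gensim(): it assumes gensim is installed and that doc is any spaCy-parsed document; the spaCy model name below is an assumption.

# Usage sketch (assumptions: gensim installed, spaCy "en_core_web_sm" available).
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("An Egyptian education official raised his hand and asked a question.")

gdict, bow = doc_to_gensim(doc, lemmatize=True, filter_stops=True)
print([(gdict[word_id], count) for word_id, count in bow])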
Example #8
def doc_to_gensim(doc, lemmatize=True,
                  filter_stops=True, filter_punct=True, filter_nums=False):
    """
    Convert a single ``spacy.Doc`` into a gensim dictionary and bag-of-words document.

    Args:
        doc (``spacy.Doc``)
        lemmatize (bool): if True, use lemmatized strings for words; otherwise,
            use the original form of the string as it appears in ``doc``
        filter_stops (bool): if True, remove stop words from word list
        filter_punct (bool): if True, remove punctuation from word list
        filter_nums (bool): if True, remove numbers from word list

    Returns:
        :class:`gensim.Dictionary <gensim.corpora.dictionary.Dictionary>`:
            integer word ID to word string mapping
        list((int, int)): bag-of-words document, a list of (integer word ID, word count)
            2-tuples
    """
    gdict = Dictionary()
    words = extract.words(doc,
                          filter_stops=filter_stops,
                          filter_punct=filter_punct,
                          filter_nums=filter_nums)
    if lemmatize is True:
        gdoc = gdict.doc2bow((word.lemma_ for word in words), allow_update=True)
    else:
        gdoc = gdict.doc2bow((word.orth_ for word in words), allow_update=True)

    return (gdict, gdoc)
Example #9
def pos_density(doc):
    tag_counts = Counter([token.tag_ for token in doc])
    pos_counts = Counter([token.pos_ for token in doc])

    n_sentences = _count_iter_items(doc.sents)

    # OLD: we use tokens and not word as in the phd, maybe we should change it
    # n_tokens = sum([len(sent) for sent in doc.sents])
    n_words = _count_iter_items(
        extract.words(doc,
                      filter_punct=True,
                      filter_stops=False,
                      filter_nums=False))

    return {
        'num_comma': tag_counts[','] / n_sentences,  # punctuation mark, comma / sentences
        'nouns': (pos_counts['NOUN'] + pos_counts['PROPN']) / n_words,  # (nouns + proper nouns) / all words
        'propernouns': pos_counts['PROPN'] / n_words,  # proper nouns / all words
        'pronouns': (tag_counts['PRP'] + tag_counts['PRP$']) / n_words,  # pronouns / all words
        # should we use ADP here?!?!
        'conj': (pos_counts['CONJ'] + pos_counts['ADP']) / n_words,  # conjunctions / all words
        'adj': pos_counts['ADJ'] / n_words,  # adjectives / all words
        'ver': (pos_counts['VERB'] - tag_counts['MD']) / n_words,  # non-modal verbs / all words
        'interj': pos_counts['INTJ'] / n_sentences,  # interjections / total sentences
        'adverbs': pos_counts['ADV'] / n_sentences,  # adverbs / total sentences
        'modals': tag_counts['MD'] / n_sentences,  # modal verbs / total sentences
        'perpro': tag_counts['PRP'] / n_sentences,  # personal pronouns / total sentences
        'whpro': (tag_counts['WP'] + tag_counts['WP$']) / n_sentences,  # wh- pronouns / total sentences
        'numfuncwords': (
            tag_counts['BES'] + tag_counts['CC'] + tag_counts['DT'] +
            tag_counts['EX'] + tag_counts['HVS'] + tag_counts['IN'] +
            tag_counts['MD'] + tag_counts['PRP'] + tag_counts['PRP$'] +
            tag_counts['RP'] + tag_counts['TO'] + tag_counts['UH']
        ) / n_sentences,  # function words / total sentences
        'numdet': pos_counts['DET'] / n_sentences,  # determiners / total sentences
        'numvb': tag_counts['VB'] / n_sentences,  # VB tags / total sentences
        'numvbd': tag_counts['VBD'] / n_sentences,  # VBD tags / total sentences
        'numvbg': tag_counts['VBG'] / n_sentences,  # VBG tags / total sentences
        'numvbn': tag_counts['VBN'] / n_sentences,  # VBN tags / total sentences
        'numvbp': tag_counts['VBP'] / n_sentences,  # VBP tags / total sentences
    }
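A hedged usage sketch for pos_density(): the tag names above presuppose an English pipeline, so the sketch assumes en_core_web_sm; _count_iter_items is the helper referenced in the function body.

# Usage sketch (assumption: English spaCy pipeline, since the PTB tags above are English-specific).
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Two weeks ago, I was in Kuwait. We discussed the impact of technology on education.")

features = pos_density(doc)
print(features["nouns"], features["modals"], features["num_comma"])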
Example #10
 def test_words(self):
     expected = [
         'Two', 'weeks', 'ago', ',', 'I', 'was', 'in', 'Kuwait', 'participating',
         'in', 'an', 'I.M.F.', 'seminar', 'for', 'Arab', 'educators', '.', 'For',
         '30', 'minutes', ',', 'we', 'discussed', 'the', 'impact']
     observed = [tok.orth_ for tok in extract.words(
         self.spacy_doc, filter_stops=False, filter_punct=False, filter_nums=False)][:25]
     self.assertEqual(observed, expected)
Example #11
def test_words_filter(spacy_doc):
    result = [
        tok for tok in extract.words(
            spacy_doc, filter_stops=True, filter_punct=True, filter_nums=True)
    ]
    assert not any(tok.is_stop for tok in result)
    assert not any(tok.is_punct for tok in result)
    assert not any(tok.like_num for tok in result)
Example #12
    def words(self, **kwargs):
        """
        Extract an ordered sequence of words from a spacy-parsed doc, optionally
        filtering words by part-of-speech (etc.) and frequency.

        .. seealso:: :func:`extract.words() <textacy.extract.words>` for all function kwargs.
        """
        return extract.words(self.spacy_doc, **kwargs)
Example #13
    def words(self, **kwargs):
        """
        Extract an ordered sequence of words from a spacy-parsed doc, optionally
        filtering words by part-of-speech (etc.) and frequency.

        .. seealso:: :func:`extract.words() <textacy.extract.words>` for all function kwargs.
        """
        return extract.words(self.spacy_doc, **kwargs)
Example #14
 def test_words_min_freq(self):
     expected = [
         ',', 'I', 'was', 'in', 'in', 'an', 'for', '.', 'For', ',', 'we', 'the',
         'education', 'in', 'the', '.', 'And', 'an', 'education', 'and', 'asked',
         'he', 'ask', '"', 'I']
     observed = [tok.orth_ for tok in extract.words(
         self.spacy_doc, filter_stops=False, filter_punct=False, filter_nums=False,
         min_freq=2)][:25]
     self.assertEqual(observed, expected)
Example #15
 def test_words_good_tags(self):
     expected = [
         'weeks', 'seminar', 'educators', 'minutes', 'impact', 'technology',
         'trends', 'education', 'education', 'official', 'hand', 'question',
         'mosques', 'sorrow', 'what', 'kids']
     observed = [tok.orth_ for tok in extract.words(
         self.spacy_doc, filter_stops=False, filter_punct=False, filter_nums=False,
         good_pos_tags={'NOUN'})][:25]
     self.assertEqual(observed, expected)
Example #16
 def test_words_filter(self):
     expected = [
         'weeks', 'ago', 'Kuwait', 'participating', 'I.M.F.', 'seminar', 'Arab',
         'educators', 'minutes', 'discussed', 'impact', 'technology', 'trends',
         'education', 'Middle', 'East', 'Egyptian', 'education', 'official',
         'raised', 'hand', 'asked', 'ask', 'personal', 'question']
     observed = [tok.orth_ for tok in extract.words(
         self.spacy_doc, filter_stops=True, filter_punct=True, filter_nums=True)][:25]
     self.assertEqual(observed, expected)
Example #17
def test_words_good_tags(spacy_doc):
    result = [
        tok for tok in extract.words(spacy_doc,
                                     filter_stops=False,
                                     filter_punct=False,
                                     filter_nums=False,
                                     include_pos={'NOUN'})
    ]
    assert all(tok.pos_ == 'NOUN' for tok in result)
Example #18
 def test_filter(self, spacy_doc):
     result = list(
         extract.words(
             spacy_doc, filter_stops=True, filter_punct=True, filter_nums=True
         )
     )
     assert not any(tok.is_stop for tok in result)
     assert not any(tok.is_punct for tok in result)
     assert not any(tok.like_num for tok in result)
Example #19
 def test_words_good_tags(self):
     expected = [
         'weeks', 'I', 'Kuwait', 'I.M.F.', 'seminar', 'educators', 'minutes',
         'we', 'impact', 'technology', 'trends', 'education', 'Middle', 'East',
         'education', 'official', 'hand', 'he', 'me', 'question', 'I', 'Donald',
         'Trump', 'we', 'mosques']
     observed = [tok.orth_ for tok in extract.words(
         self.spacy_doc, filter_stops=False, filter_punct=False, filter_nums=False,
         good_pos_tags={'NOUN'})][:25]
     self.assertEqual(observed, expected)
Example #20
def readability_stats(doc):
    """
    Get calculated values for a variety of statistics related to the "readability"
    of a text: Flesch-Kincaid Grade Level, Flesch Reading Ease, SMOG Index,
    Gunning-Fog Index, Coleman-Liau Index, and Automated Readability Index.

    Also includes constituent values needed to compute the stats, e.g. word count.

    **DEPRECATED**

    Args:
        doc (:class:`textacy.Doc <textacy.document.Doc>`)

    Returns:
        dict: mapping of readability statistic name (str) to value (int or float)

    Raises:
        NotImplementedError: if ``doc`` is not an English-language document (sorry).
    """
    msg = '`readability_stats()` function is deprecated; use `TextStats` class instead'
    with warnings.catch_warnings():
        warnings.simplefilter('once', DeprecationWarning)
        warnings.warn(msg, DeprecationWarning)

    if doc.lang != 'en':
        raise NotImplementedError('non-English NLP is not ready yet, sorry')

    n_sents = doc.n_sents

    words = list(extract.words(doc, filter_punct=True, filter_stops=False, filter_nums=False))
    n_words = len(words)
    if n_words == 0:
        logging.warning("readability stats can't be computed because doc has 0 words")
        return None
    n_unique_words = len({word.lower for word in words})
    n_chars = sum(len(word) for word in words)

    hyphenator = data.load_hyphenator(lang='en')
    syllables_per_word = [len(hyphenator.positions(word.lower_)) + 1 for word in words]
    n_syllables = sum(syllables_per_word)
    n_polysyllable_words = sum(1 for n in syllables_per_word if n >= 3)

    return {'n_sents': n_sents,
            'n_words': n_words,
            'n_unique_words': n_unique_words,
            'n_chars': n_chars,
            'n_syllables': n_syllables,
            'n_polysyllable_words': n_polysyllable_words,
            'flesch_kincaid_grade_level': flesch_kincaid_grade_level(n_syllables, n_words, n_sents),
            'flesch_readability_ease': flesch_readability_ease(n_syllables, n_words, n_sents),
            'smog_index': smog_index(n_polysyllable_words, n_sents),
            'gunning_fog_index': gunning_fog_index(n_words, n_polysyllable_words, n_sents),
            'coleman_liau_index': coleman_liau_index(n_chars, n_words, n_sents),
            'automated_readability_index': automated_readability_index(n_chars, n_words, n_sents)}
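A hedged usage sketch for the deprecated function above; it assumes the older textacy.Doc wrapper (English-only, as the docstring notes), whose constructor as shown here is itself an assumption about that API.

# Usage sketch (assumption: older textacy.Doc wrapper; English text only).
import textacy

doc = textacy.Doc(
    "For 30 minutes, we discussed the impact of technology trends on education "
    "in the Middle East.",
    lang="en")
stats = readability_stats(doc)
print(stats["n_words"], stats["flesch_kincaid_grade_level"])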
Example #21
def test_words_min_freq(spacy_doc):
    counts = collections.Counter()
    counts.update(tok.lower_ for tok in spacy_doc)
    result = [
        tok for tok in extract.words(spacy_doc,
                                     filter_stops=False,
                                     filter_punct=False,
                                     filter_nums=False,
                                     min_freq=2)
    ]
    assert all(counts[tok.lower_] >= 2 for tok in result)
Example #22
def sents_to_semantic_network(sents, edge_weighting='cosine'):
    """
    Convert a list of sentences into a semantic network, where each sentence is
    represented by a node with edges linking it to other sentences weighted by
    the (cosine or jaccard) similarity of their constituent words.

    Args:
        sents (list(str) or list(:class:`spacy.Span`))
        edge_weighting (str {'cosine', 'jaccard'}, optional): similarity metric
            to use for weighting edges between sentences; if 'cosine', use the
            cosine similarity between sentences represented as tf-idf word vectors;
            if 'jaccard', use the set intersection divided by the set union of
            all words in a given sentence pair

    Returns:
        :class:`networkx.Graph()`: nodes are the integer indexes of the sentences
            in the input ``sents`` list, *not* the actual text of the sentences!

    Notes:
        * If passing sentences as strings, be sure to filter out stopwords, punctuation,
          certain parts of speech, etc. beforehand
        * Consider normalizing the strings so that like terms are counted together
          (see :func:`normalized_str() <textacy.spacy_utils.normalized_str>`)
    """
    n_sents = len(sents)
    if isinstance(sents[0], str):
        pass
    elif isinstance(sents[0], spacy_span):
        sents = [
            ' '.join(
                normalized_str(tok)
                for tok in extract.words(sent,
                                         filter_stops=True,
                                         filter_punct=True,
                                         filter_nums=False)) for sent in sents
        ]
    else:
        msg = 'Input sents must be strings or spacy Spans, not {}.'.format(
            type(sents[0]))
        raise TypeError(msg)

    if edge_weighting == 'cosine':
        term_sent_matrix = TfidfVectorizer().fit_transform(sents)
    elif edge_weighting == 'jaccard':
        term_sent_matrix = CountVectorizer(binary=True).fit_transform(sents)
    weights = (term_sent_matrix * term_sent_matrix.T).A.tolist()

    graph = nx.Graph()
    graph.add_edges_from((i, j, {
        'weight': weights[i][j]
    }) for i in range(n_sents) for j in range(i + 1, n_sents))

    return graph
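A hedged sketch of one common use of the resulting graph: ranking sentences by weighted PageRank centrality, roughly in the spirit of TextRank. It assumes networkx and a parsed spaCy document; the model name is an assumption.

# Usage sketch (assumptions: networkx installed, spaCy "en_core_web_sm" available).
import spacy
import networkx as nx

nlp = spacy.load("en_core_web_sm")
doc = nlp("Kuwait hosted an I.M.F. seminar for Arab educators. "
          "The educators discussed technology trends. "
          "The seminar focused on education in the Middle East.")

sents = list(doc.sents)
graph = sents_to_semantic_network(sents, edge_weighting="cosine")
ranks = nx.pagerank(graph)  # uses the 'weight' edge attribute by default
top = sorted(ranks, key=ranks.get, reverse=True)[:2]
print([sents[i].text for i in top])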
Example #23
 def test_words_min_freq(self):
     expected = [
         ',', 'I', 'was', 'in', 'in', 'an', 'for', '.', 'For', ',', 'we',
         'the', 'education', 'in', 'the', '.', 'And', 'an', 'education',
         'and', 'asked', 'he', 'ask', '"', 'I'
     ]
     observed = [
         tok.orth_ for tok in extract.words(self.spacy_doc,
                                            filter_stops=False,
                                            filter_punct=False,
                                            filter_nums=False,
                                            min_freq=2)
     ][:25]
     self.assertEqual(observed, expected)
Example #24
 def test_words_good_tags(self):
     expected = [
         'weeks', 'seminar', 'educators', 'minutes', 'impact', 'technology',
         'trends', 'education', 'education', 'official', 'hand', 'question',
         'mosques', 'sorrow', 'what', 'kids'
     ]
     observed = [
         tok.orth_ for tok in extract.words(self.spacy_doc,
                                            filter_stops=False,
                                            filter_punct=False,
                                            filter_nums=False,
                                            good_pos_tags={'NOUN'})
     ][:25]
     self.assertEqual(observed, expected)
Example #25
 def test_words(self):
     expected = [
         'Two', 'weeks', 'ago', ',', 'I', 'was', 'in', 'Kuwait',
         'participating', 'in', 'an', 'I.M.F.', 'seminar', 'for', 'Arab',
         'educators', '.', 'For', '30', 'minutes', ',', 'we', 'discussed',
         'the', 'impact'
     ]
     observed = [
         tok.orth_ for tok in extract.words(self.spacy_doc,
                                            filter_stops=False,
                                            filter_punct=False,
                                            filter_nums=False)
     ][:25]
     self.assertEqual(observed, expected)
Example #26
 def test_words_filter(self):
     expected = [
         'weeks', 'ago', 'Kuwait', 'participating', 'I.M.F.', 'seminar',
         'Arab', 'educators', 'minutes', 'discussed', 'impact',
         'technology', 'trends', 'education', 'Middle', 'East', 'Egyptian',
         'education', 'official', 'raised', 'hand', 'asked', 'ask',
         'personal', 'question'
     ]
     observed = [
         tok.orth_ for tok in extract.words(self.spacy_doc,
                                            filter_stops=True,
                                            filter_punct=True,
                                            filter_nums=True)
     ][:25]
     self.assertEqual(observed, expected)
Example #27
def sents_to_semantic_network(sents,
                              edge_weighting='cosine'):
    """
    Convert a list of sentences into a semantic network, where each sentence is
    represented by a node with edges linking it to other sentences weighted by
    the (cosine or jaccard) similarity of their constituent words.

    Args:
        sents (list(str) or list(:class:`spacy.Span`))
        edge_weighting (str {'cosine', 'jaccard'}, optional): similarity metric
            to use for weighting edges between sentences; if 'cosine', use the
            cosine similarity between sentences represented as tf-idf word vectors;
            if 'jaccard', use the set intersection divided by the set union of
            all words in a given sentence pair

    Returns:
        :class:`networkx.Graph`: nodes are the integer indexes of the sentences
            in the input ``sents`` list, *not* the actual text of the sentences!

    Notes:
        * If passing sentences as strings, be sure to filter out stopwords, punctuation,
          certain parts of speech, etc. beforehand
        * Consider normalizing the strings so that like terms are counted together
          (see :func:`normalized_str() <textacy.spacy_utils.normalized_str>`)
    """
    n_sents = len(sents)
    if isinstance(sents[0], unicode_type):
        pass
    elif isinstance(sents[0], SpacySpan):
        sents = [' '.join(normalized_str(tok) for tok in
                          extract.words(sent, filter_stops=True, filter_punct=True, filter_nums=False))
                 for sent in sents]
    else:
        msg = 'Input sents must be strings or spacy Spans, not {}.'.format(type(sents[0]))
        raise TypeError(msg)

    if edge_weighting == 'cosine':
        term_sent_matrix = TfidfVectorizer().fit_transform(sents)
    elif edge_weighting == 'jaccard':
        term_sent_matrix = CountVectorizer(binary=True).fit_transform(sents)
    weights = (term_sent_matrix * term_sent_matrix.T).A.tolist()

    graph = nx.Graph()
    graph.add_edges_from(
        (i, j, {'weight': weights[i][j]})
        for i in range(n_sents) for j in range(i + 1, n_sents))

    return graph
Example #28
def readability_stats(doc):
    """
    Get calculated values for a variety of statistics related to the "readability"
    of a text: Flesch-Kincaid Grade Level, Flesch Reading Ease, SMOG Index,
    Gunning-Fog Index, Coleman-Liau Index, and Automated Readability Index.

    Also includes constituent values needed to compute the stats, e.g. word count.

    Args:
        doc (:class:`textacy.Doc <textacy.document.Doc>`)

    Returns:
        dict: mapping of readability statistic name (str) to value (int or float)

    Raises:
        NotImplementedError: if ``doc`` is not an English-language document (sorry).
    """
    if doc.lang != 'en':
        raise NotImplementedError('non-English NLP is not ready yet, sorry')

    n_sents = doc.n_sents

    words = list(extract.words(doc, filter_punct=True, filter_stops=False, filter_nums=False))
    n_words = len(words)
    n_unique_words = len({word.lower for word in words})
    n_chars = sum(len(word) for word in words)

    hyphenator = data.load_hyphenator(lang='en')
    syllables_per_word = [len(hyphenator.positions(word.lower_)) + 1 for word in words]
    n_syllables = sum(syllables_per_word)
    n_polysyllable_words = sum(1 for n in syllables_per_word if n >= 3)

    return {'n_sents': n_sents,
            'n_words': n_words,
            'n_unique_words': n_unique_words,
            'n_chars': n_chars,
            'n_syllables': n_syllables,
            'n_polysyllable_words': n_polysyllable_words,
            'flesch_kincaid_grade_level': flesch_kincaid_grade_level(n_syllables, n_words, n_sents),
            'flesch_readability_ease': flesch_readability_ease(n_syllables, n_words, n_sents),
            'smog_index': smog_index(n_polysyllable_words, n_sents),
            'gunning_fog_index': gunning_fog_index(n_words, n_polysyllable_words, n_sents),
            'coleman_liau_index': coleman_liau_index(n_chars, n_words, n_sents),
            'automated_readability_index': automated_readability_index(n_chars, n_words, n_sents)}
Example #29
def words(doc):
    return list(
        extract.words(doc,
                      filter_punct=True,
                      filter_stops=False,
                      filter_nums=False))
Example #30
def test_words_good_tags(spacy_doc):
    result1 = list(extract.words(spacy_doc, include_pos={"NOUN"}))
    result2 = list(extract.words(spacy_doc, include_pos="NOUN"))
    assert all(tok.pos_ == "NOUN" for tok in result1)
    assert all(tok.pos_ == "NOUN" for tok in result2)
Example #31
 def test_default(self, spacy_doc):
     result = list(extract.words(spacy_doc))
     assert all(isinstance(tok, Token) for tok in result)
     assert not any(tok.is_space for tok in result)
Example #32
 def test_min_freq(self, spacy_doc):
     counts = collections.Counter()
     counts.update(tok.lower_ for tok in spacy_doc)
     result = list(extract.words(spacy_doc, min_freq=2))
     assert all(counts[tok.lower_] >= 2 for tok in result)