Example #1
0
 def normalize_list(self, text):
     """
     Return a list of word stems that appear in *text*.

     Tokens rejected by ``self.good_lemma`` are dropped, and an initial
     'to' is stripped, as the original docstring promised.  If filtering
     leaves nothing, the raw text is returned wrapped in a list so the
     return type is always a list.
     """
     pieces = [self.snowball_stem(word) for word in tokenize_list(text)
               if self.good_lemma(word)]
     # Strip a leading infinitive marker; the previous version documented
     # this but never did it.  Guard on non-emptiness before indexing.
     if pieces and pieces[0] == 'to':
         pieces = pieces[1:]
     if not pieces:
         # Wrap in a list (was a bare string) to match the sibling
         # normalize_list implementations and the function's contract.
         return [text]
     return pieces
Example #2
0
 def normalize_list(self, text):
     """
     Return a list of word stems that appear in *text*.

     Tokens rejected by ``self.good_lemma`` are dropped, and an initial
     'to' is stripped, as the original docstring promised.  If filtering
     leaves nothing, the raw text is returned wrapped in a list so the
     return type is always a list.
     """
     pieces = [
         self.snowball_stem(word) for word in tokenize_list(text)
         if self.good_lemma(word)
     ]
     # Strip a leading infinitive marker; the previous version documented
     # this but never did it.  Guard on non-emptiness before indexing.
     if pieces and pieces[0] == 'to':
         pieces = pieces[1:]
     if not pieces:
         # Wrap in a list (was a bare string) to match the sibling
         # normalize_list implementations and the function's contract.
         return [text]
     return pieces
Example #3
0
def normalize_list(text):
    """
    Get a list of word stems that appear in the text. Stopwords and an
    initial 'to' will be stripped, unless this leaves nothing in the stem.
    """
    pieces = [morphy_stem(word) for word in tokenize_list(text)]
    pieces = [piece for piece in pieces if good_lemma(piece)]
    # Strip the leading 'to' *before* the emptiness check: in the previous
    # order, an input stemming to just ['to'] returned an empty list.
    if pieces and pieces[0] == 'to':
        pieces = pieces[1:]
    if not pieces:
        # Wrap in a list (was a bare string) so the return type is always
        # a list, matching the sibling normalize_list implementations.
        return [text]
    return pieces
Example #4
0
def tag_and_stem(text):
    """
    Return a list of (stem, tag, token) triples for *text*:

    - stem: the word's uninflected form
    - tag: the word's part of speech
    - token: the original word, so we can reconstruct it later
    """
    tagged_tokens = nltk.pos_tag(tokenize_list(preprocess_text(text)))
    # Bracket-like tokens keep themselves as the "stem" and take their tag
    # from BRACKET_DIC; everything else is stemmed with its POS tag.
    return [
        (word, BRACKET_DIC[word], word) if word in BRACKET_DIC
        else (morphy_stem(word, pos), pos, word)
        for word, pos in tagged_tokens
    ]
Example #5
0
def tag_and_stem(text):
    """
    Return a list of (stem, tag, token) triples for *text*:

    - stem: the word's uninflected form
    - tag: the word's part of speech
    - token: the original word, so we can reconstruct it later
    """
    def make_triple(token, tag):
        # Bracket-like tokens are their own "stem" and get their tag from
        # BRACKET_DIC; ordinary words are stemmed with their POS tag.
        if token in BRACKET_DIC:
            return (token, BRACKET_DIC[token], token)
        return (morphy_stem(token, tag), tag, token)

    tagged = nltk.pos_tag(tokenize_list(preprocess_text(text)))
    return [make_triple(token, tag) for token, tag in tagged]
Example #6
0
def normalize_list(text):
    """
    Get a list of word stems that appear in the text. Stopwords and an initial
    'to' will be stripped, unless this leaves nothing in the stem.

    >>> normalize_list('the dog')
    [u'dog']
    >>> normalize_list('big dogs')
    [u'big', u'dog']
    >>> normalize_list('the')
    [u'the']
    """
    text = preprocess_text(text)
    pieces = [morphy_stem(word) for word in tokenize_list(text)]
    pieces = [piece for piece in pieces if good_lemma(piece)]
    # Strip the leading 'to' *before* the emptiness check: in the previous
    # order, an input stemming to just ['to'] passed the check and then
    # returned [], breaking the "unless this leaves nothing" promise.
    if pieces and pieces[0] == 'to':
        pieces = pieces[1:]
    if not pieces:
        return [text]
    return pieces
Example #7
0
def normalize_list(text):
    """
    Get a list of word stems that appear in the text. Stopwords and an initial
    'to' will be stripped, unless this leaves nothing in the stem.

    >>> normalize_list('the dog')
    [u'dog']
    >>> normalize_list('big dogs')
    [u'big', u'dog']
    >>> normalize_list('the')
    [u'the']
    """
    text = preprocess_text(text)
    pieces = [morphy_stem(word) for word in tokenize_list(text)]
    pieces = [piece for piece in pieces if good_lemma(piece)]
    # Strip the leading 'to' *before* the emptiness check: in the previous
    # order, an input stemming to just ['to'] passed the check and then
    # returned [], breaking the "unless this leaves nothing" promise.
    if pieces and pieces[0] == 'to':
        pieces = pieces[1:]
    if not pieces:
        return [text]
    return pieces