Example #1
from textblob.tokenizers import WordTokenizer  # assumed TextBlob imports
from textblob.utils import strip_punc

def contains_extractor(document):
    '''A basic document feature extractor that returns a dict of words that
    the document contains.
    '''
    tokenizer = WordTokenizer()
    if isinstance(document, str):  # ``basestring`` in the original Python 2 code
        tokens = set(strip_punc(w, all=False)
                     for w in tokenizer.itokenize(document, include_punc=False))
    else:
        tokens = set(strip_punc(w, all=False) for w in document)
    features = {u'contains({0})'.format(w): True for w in tokens}
    return features
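For a quick sanity check, here is what this extractor returns on a short sentence (the input text is ours; token boundaries come from NLTK's Treebank-style tokenizer, so the contraction suffix "'s" is kept as its own token and the trailing period is dropped by include_punc=False):

features = contains_extractor("It's a nice day.")
print(sorted(features))
# ["contains('s)", 'contains(It)', 'contains(a)', 'contains(day)', 'contains(nice)']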
Example #2
    def tokenize(self, text, include_punc=True):
        '''Return a list of word tokens.

        :param text: A string of text.
        :param include_punc: (optional) Whether to include punctuation as
            separate tokens. Defaults to ``True``.
        '''
        tokens = nltk.tokenize.word_tokenize(text)
        if include_punc:
            return tokens
        else:
            # Return each word token, stripping punctuation unless the token
            # comes from a contraction, e.g.
            #   "Let's" => ["Let", "'s"]
            #   "Can't" => ["Ca", "n't"]
            #   "home." => ["home"]
            return [word if word.startswith("'") else strip_punc(word, all=False)
                    for word in tokens if strip_punc(word, all=False)]
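A brief usage sketch of the behavior described in the comments above (assuming TextBlob's WordTokenizer, which wraps nltk.tokenize.word_tokenize as shown; output follows NLTK's Treebank conventions):

from textblob.tokenizers import WordTokenizer

tokenizer = WordTokenizer()
print(tokenizer.tokenize("Can't stop, won't stop."))
# ['Ca', "n't", 'stop', ',', 'wo', "n't", 'stop', '.']
print(tokenizer.tokenize("Can't stop, won't stop.", include_punc=False))
# ['Ca', "n't", 'stop', 'wo', "n't", 'stop']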
Example #3
from textblob.tokenizers import WordTokenizer  # assumed TextBlob imports
from textblob.utils import strip_punc

def basic_extractor(document, train_set):
    '''A basic document feature extractor that returns a dict indicating
    which words in ``train_set`` are contained in ``document``.

    :param document: The text to extract features from. Can be a string or an iterable.
    :param train_set: Training data set, a list of tuples of the form
        ``(words, label)``.
    '''
    tokenizer = WordTokenizer()
    # _get_words_from_dataset is a private helper defined alongside this
    # function in textblob.classifiers; it collects the training vocabulary.
    word_features = _get_words_from_dataset(train_set)
    if isinstance(document, str):  # ``basestring`` in the original Python 2 code
        tokens = set(strip_punc(w, all=False)
                     for w in tokenizer.itokenize(document, include_punc=False))
    else:
        tokens = set(strip_punc(w, all=False) for w in document)
    features = {u'contains({0})'.format(word): (word in tokens)
                for word in word_features}
    return features
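A usage sketch with a made-up two-document training set (the data below is purely illustrative):

train = [(['I', 'love', 'this', 'sandwich'], 'pos'),
         (['I', 'hate', 'rain'], 'neg')]
print(basic_extractor('I love rain', train))
# Key order may vary:
# {'contains(I)': True, 'contains(love)': True, 'contains(this)': False,
#  'contains(sandwich)': False, 'contains(hate)': False, 'contains(rain)': True}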
Example #4
    def test_strip_punc(self):
        assert_equal(strip_punc(self.text),
                     'this Has Punctuation ')
Example #5
File: run.py  Project: MansMeg/textfeel-web
def noun_phrases():
    text = get_text(request)
    noun_phrases = set(TextBlob(text).noun_phrases)
    # Strip punctuation from ends of noun phrases and exclude long phrases
    stripped = [strip_punc(np) for np in noun_phrases if len(np.split()) <= 5]
    return jsonify({"result": stripped})
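Outside of Flask, the same pipeline looks like this (the sample text is ours, and the exact noun phrases depend on TextBlob's noun-phrase chunker):

from textblob import TextBlob
from textblob.utils import strip_punc

text = "Machine learning powers modern search engines!"
noun_phrases = set(TextBlob(text).noun_phrases)
stripped = [strip_punc(np) for np in noun_phrases if len(np.split()) <= 5]
print(stripped)  # e.g. ['machine learning', 'modern search engines'] -- chunker-dependent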
Example #6
    def test_strip_punc_all(self):
        assert_equal(strip_punc(self.text, all=True), 'this Has Punctuation')
Example #7
    def test_strip_punc(self):
        assert_equal(strip_punc(self.text), 'this. Has. Punctuation')
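Taken together, Examples #6 and #7 pin down strip_punc's contract: the default strips punctuation only from the ends of the string, while all=True removes every punctuation character (Example #4 appears to come from an older version with a different default). A minimal sketch consistent with those two tests:

import re
import string

def strip_punc(s, all=False):
    '''Remove punctuation from ``s``. Strips only leading and trailing
    punctuation by default; removes every punctuation character when
    ``all`` is True.
    '''
    if all:
        return re.sub('[{0}]'.format(re.escape(string.punctuation)), '', s.strip())
    return s.strip().strip(string.punctuation)

If self.text is, say, 'this. Has. Punctuation?', the default call leaves the interior periods and drops only the trailing '?', while all=True removes the periods as well.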