def contains_extractor(document):
    '''A basic document feature extractor that returns a dict of words
    that the document contains.
    '''
    tokenizer = WordTokenizer()
    # Strings get tokenized first; any other iterable is assumed to
    # already be a sequence of word tokens.
    if isinstance(document, basestring):
        words = tokenizer.itokenize(document, include_punc=False)
    else:
        words = document
    # De-duplicate after trimming edge punctuation from each token.
    tokens = set(strip_punc(w, all=False) for w in words)
    features = {}
    for word in tokens:
        features[u'contains({0})'.format(word)] = True
    return features
def tokenize(self, text, include_punc=True):
    '''Return a list of word tokens.

    :param text: string of text.
    :param include_punc: (optional) whether to include punctuation as separate
        tokens. Default to True.
    '''
    tokens = nltk.tokenize.word_tokenize(text)
    if include_punc:
        return tokens
    # Strip punctuation from each token unless the token comes from a
    # contraction split, e.g.
    #   "Let's"  => ["Let", "'s"]
    #   "Can't"  => ["Ca", "n't"]
    #   "home."  => ["home"]
    # Tokens that strip down to the empty string (pure punctuation) are
    # dropped entirely.
    words = []
    for word in tokens:
        # Compute the stripped form once per token; the original evaluated
        # strip_punc twice (once as the filter, once as the value).
        stripped = strip_punc(word, all=False)
        if stripped:
            # Keep contraction suffixes (leading apostrophe) verbatim.
            words.append(word if word.startswith("'") else stripped)
    return words
def basic_extractor(document, train_set):
    '''A basic document feature extractor that returns a dict indicating
    what words in ``train_set`` are contained in ``document``.

    :param document: The text to extract features from. Can be a string
        or an iterable.
    :param train_set: Training data set, a list of tuples of the form
        ``(words, label)``.
    '''
    tokenizer = WordTokenizer()
    word_features = _get_words_from_dataset(train_set)
    # Strings are tokenized first; other iterables are treated as
    # pre-tokenized sequences of words.
    if isinstance(document, basestring):
        words = tokenizer.itokenize(document, include_punc=False)
    else:
        words = document
    tokens = set(strip_punc(w, all=False) for w in words)
    # One boolean feature per known training word: present in document?
    features = {}
    for word in word_features:
        features[u'contains({0})'.format(word)] = word in tokens
    return features
def test_strip_punc(self):
    # NOTE(review): the expected value ends with a trailing space --
    # confirm that is intended and not a typo in the fixture.
    expected = 'this Has Punctuation '
    assert_equal(strip_punc(self.text), expected)
def noun_phrases():
    '''Return the noun phrases of the request text as a JSON response.'''
    text = get_text(request)
    phrases = set(TextBlob(text).noun_phrases)
    # Strip punctuation from the ends of each phrase and exclude any
    # phrase longer than five words.
    stripped = []
    for phrase in phrases:
        if len(phrase.split()) <= 5:
            stripped.append(strip_punc(phrase))
    return jsonify({"result": stripped})
def test_strip_punc_all(self): assert_equal(strip_punc(self.text, all=True), 'this Has Punctuation')
def test_strip_punc(self):
    # Default call: the expected output keeps the interior periods.
    expected = 'this. Has. Punctuation'
    assert_equal(strip_punc(self.text), expected)