def word_tokenize(self, sentences, include_punc=True):
    #: Do not process empty strings (Issue #3)
    if sentences.strip() == "":
        return []
    _tokens = sentences.split(" ")
    #: Handle strings consisting of a single punctuation mark separately (Issue #4)
    if len(_tokens) == 1:
        if _tokens[0] in PUNCTUATION:
            if include_punc:
                return _tokens
            else:
                return []
    if include_punc:
        last_word = _tokens[-1]
        # Make sure that you do not separate '.' tokens into ['', '.']
        # (Issue #5)
        if last_word.endswith('.') and len(last_word) > 1:
            _tokens = _tokens[:-1] + [last_word[:-1], '.']
        return _tokens
    else:
        # Return each word token
        # Strips punctuation unless the word comes from a contraction
        # e.g. "gibt's" => ["gibt", "'s"] in "Heute gibt's viel zu tun!"
        # e.g. "hat's" => ["hat", "'s"]
        # e.g. "home." => ['home']
        words = [word if word.startswith("'") else strip_punc(word, all=False)
                 for word in _tokens if strip_punc(word, all=False)]
        return list(words)
def _get_document_tokens(document):
    if isinstance(document, basestring):
        tokens = set((strip_punc(w, all=False)
                      for w in word_tokenize(document, include_punc=False)))
    else:
        tokens = set(strip_punc(w, all=False) for w in document)
    return tokens
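# Hedged standalone sketch (not from the library): it reproduces the
# include_punc=False branch of the split-based word_tokenize above using only
# textblob.utils.strip_punc, so the contraction-preserving filter can be
# checked without the surrounding tokenizer class.
from textblob.utils import strip_punc

def demo_word_tokens(sentence):
    tokens = sentence.split(" ")
    return [w if w.startswith("'") else strip_punc(w, all=False)
            for w in tokens if strip_punc(w, all=False)]

print(demo_word_tokens("Heute gibt's viel zu tun!"))
# e.g. ['Heute', "gibt's", 'viel', 'zu', 'tun']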
def contains_extractor(document):
    '''A basic document feature extractor that returns a dict of words
    that the document contains.
    '''
    if isinstance(document, basestring):
        tokens = set([strip_punc(w, all=False)
                      for w in word_tokenize(document, include_punc=False)])
    else:
        tokens = set((strip_punc(w, all=False) for w in document))
    features = dict((u'contains({0})'.format(w), True) for w in tokens)
    return features
def contains_extractor(document):
    '''A basic document feature extractor that returns a dict of words
    that the document contains.
    '''
    tokenizer = WordTokenizer()
    if isinstance(document, basestring):
        tokens = set([strip_punc(w, all=False)
                      for w in tokenizer.itokenize(document, include_punc=False)])
    else:
        tokens = set((strip_punc(w, all=False) for w in document))
    features = dict((u'contains({0})'.format(w), True) for w in tokens)
    return features
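# Hedged usage sketch for contains_extractor; the import path assumes the
# function lives in textblob.classifiers -- adjust it to wherever the snippet
# above is defined in your codebase.
from textblob.classifiers import contains_extractor

print(contains_extractor("The weather is nice today"))
# e.g. {'contains(The)': True, 'contains(weather)': True, 'contains(is)': True,
#       'contains(nice)': True, 'contains(today)': True}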
def basic_extractor(document, train_set):
    # Variant of basic_extractor that delegates tokenization to the
    # _get_document_tokens helper defined above.
    word_features = _get_words_from_dataset(train_set)
    tokens = _get_document_tokens(document)
    features = dict(((u'contains({0})'.format(word), (word in tokens))
                     for word in word_features))
    return features
def basic_extractor(document, train_set):
    '''A basic document feature extractor that returns a dict indicating
    what words in ``train_set`` are contained in ``document``.

    :param document: The text to extract features from. Can be a string or an iterable.
    :param train_set: Training data set, a list of tuples of the form
        ``(words, label)``.
    '''
    word_features = _get_words_from_dataset(train_set)
    if isinstance(document, basestring):
        tokens = set((strip_punc(w, all=False)
                      for w in word_tokenize(document, include_punc=False)))
    else:
        tokens = set(strip_punc(w, all=False) for w in document)
    features = dict(((u'contains({0})'.format(word), (word in tokens))
                     for word in word_features))
    return features
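# Hedged usage sketch for basic_extractor with a toy training set; the import
# path (textblob.classifiers) is an assumption -- adjust if the function is
# defined elsewhere in your project.
from textblob.classifiers import basic_extractor

train = [("I love this sandwich", "pos"),
         ("I hate rain", "neg")]
print(basic_extractor("I love rain", train))
# e.g. {'contains(love)': True, 'contains(hate)': False,
#       'contains(sandwich)': False, 'contains(rain)': True, ...}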
def tokenize(self, text, include_punc=True):
    '''Return a list of word tokens.

    :param text: string of text.
    :param include_punc: (optional) whether to include punctuation as separate tokens.
        Defaults to True.
    '''
    tokens = nltk.tokenize.word_tokenize(text)
    if include_punc:
        return tokens
    else:
        # Return each word token
        # Strips punctuation unless the word comes from a contraction
        # e.g. "Let's" => ["Let", "'s"]
        # e.g. "Can't" => ["Ca", "n't"]
        # e.g. "home." => ['home']
        return [word if word.startswith("'") else strip_punc(word, all=False)
                for word in tokens if strip_punc(word, all=False)]
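# Hedged usage sketch of the tokenize() method above via textblob's
# WordTokenizer, which is assumed to be the class this snippet belongs to.
from textblob.tokenizers import WordTokenizer

wt = WordTokenizer()
print(wt.tokenize("Let's go home.", include_punc=False))
# e.g. ['Let', "'s", 'go', 'home']
print(wt.tokenize("Let's go home.", include_punc=True))
# e.g. ['Let', "'s", 'go', 'home', '.']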
def word_tokenize(self, text, include_punc=True):
    """The Treebank tokenizer uses regular expressions to tokenize text as in
    Penn Treebank.

    It assumes that the text has already been segmented into sentences,
    e.g. using ``self.sent_tokenize()``.

    This tokenizer performs the following steps:

    - split standard contractions, e.g. ``don't`` -> ``do n't``
      and ``they'll`` -> ``they 'll``
    - treat most punctuation characters as separate tokens
    - split off commas and single quotes, when followed by whitespace
    - separate periods that appear at the end of line

    Source: NLTK's docstring of ``TreebankWordTokenizer`` (accessed: 02/10/2014)
    """
    #: Do not process empty strings (Issue #3)
    if text.strip() == "":
        return []
    _tokens = self.word_tok.tokenize(text)
    #: Handle strings consisting of a single punctuation mark separately (Issue #4)
    if len(_tokens) == 1:
        if _tokens[0] in PUNCTUATION:
            if include_punc:
                return _tokens
            else:
                return []
    if include_punc:
        return _tokens
    else:
        # Return each word token
        # Strips punctuation unless the word comes from a contraction
        # e.g. "gibt's" => ["gibt", "'s"] in "Heute gibt's viel zu tun!"
        # e.g. "hat's" => ["hat", "'s"]
        # e.g. "home." => ['home']
        words = [word if word.startswith("'") else strip_punc(word, all=False)
                 for word in _tokens if strip_punc(word, all=False)]
        return list(words)
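# Sketch of the Treebank step the method above delegates to; it assumes
# self.word_tok is an nltk.tokenize.TreebankWordTokenizer instance, which is
# what the docstring suggests.
from nltk.tokenize import TreebankWordTokenizer

word_tok = TreebankWordTokenizer()
print(word_tok.tokenize("Heute gibt's viel zu tun!"))
# e.g. ['Heute', 'gibt', "'s", 'viel', 'zu', 'tun', '!']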
def noun_phrases():
    text = get_text(request)
    noun_phrases = set(TextBlob(text).noun_phrases)
    # Strip punctuation from ends of noun phrases and exclude long phrases
    stripped = [strip_punc(np) for np in noun_phrases if len(np.split()) <= 5]
    return jsonify({"result": stripped})
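# Standalone sketch of the endpoint's core logic without the Flask plumbing
# (get_text/jsonify are assumed to come from the surrounding app); max_words
# is a hypothetical parameter standing in for the hard-coded limit of 5.
from textblob import TextBlob
from textblob.utils import strip_punc

def extract_noun_phrases(text, max_words=5):
    phrases = set(TextBlob(text).noun_phrases)
    # Strip punctuation from the ends of each phrase and drop long phrases
    return [strip_punc(np) for np in phrases if len(np.split()) <= max_words]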
import nltk
nltk.download('stopwords', download_dir='.')
from nltk.corpus import stopwords
nltk.data.path.append('.')
stop_words = stopwords.words('english')

from textblob.utils import strip_punc
tokenized = sc.textFile('wasb:///example/data/RomeoAndJuliet.txt')\
              .map(lambda line: strip_punc(line, all=True).lower())\
              .flatMap(lambda line: line.split())
filtered = tokenized.filter(lambda word: word not in stop_words)

from operator import add
word_counts = filtered.map(lambda word: (word, 1)).reduceByKey(add)
filtered_counts = word_counts.filter(lambda item: item[1] >= 60)

from operator import itemgetter
sorted_items = sorted(filtered_counts.collect(),
                      key=itemgetter(1), reverse=True)

max_len = max([len(word) for word, count in sorted_items])
for word, count in sorted_items:
    print('{:>{width}}: {}'.format(word, count, width=max_len))
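# Local, non-Spark sketch of the same word-count pipeline, handy for checking
# the strip_punc/stopword logic without a cluster; the file path is
# illustrative and assumes a local copy of the text.
from collections import Counter

from nltk.corpus import stopwords
from textblob.utils import strip_punc

stop_words = set(stopwords.words('english'))
with open('RomeoAndJuliet.txt') as f:
    counts = Counter(word
                     for line in f
                     for word in strip_punc(line, all=True).lower().split()
                     if word not in stop_words)
for word, count in counts.most_common(20):
    print('{}: {}'.format(word, count))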
def test_strip_punc_all(self):
    assert_equal(strip_punc(self.text, all=True),
                 'this Has Punctuation')
def test_strip_punc(self):
    assert_equal(strip_punc(self.text),
                 'this. Has. Punctuation')
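# Hedged sketch of a fixture consistent with both assertions above; the exact
# self.text used in the real test suite may differ -- this value merely
# satisfies the expected outputs shown.
from textblob.utils import strip_punc

text = 'this. Has. Punctuation?!'
assert strip_punc(text) == 'this. Has. Punctuation'           # ends only
assert strip_punc(text, all=True) == 'this Has Punctuation'   # all punctuation removed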