class TestWordTokenizer(unittest.TestCase):

    def setUp(self):
        self.tokenizer = WordTokenizer()
        self.text = "Python is a high-level programming language."

    def tearDown(self):
        pass

    def test_tokenize(self):
        assert_equal(self.tokenizer.tokenize(self.text),
                     ['Python', 'is', 'a', 'high-level', 'programming', 'language', '.'])

    def test_exclude_punc(self):
        assert_equal(self.tokenizer.tokenize(self.text, include_punc=False),
                     ['Python', 'is', 'a', 'high-level', 'programming', 'language'])

    def test_itokenize(self):
        gen = self.tokenizer.itokenize(self.text)
        assert_equal(next(gen), "Python")
        assert_equal(next(gen), "is")

    def test_word_tokenize(self):
        tokens = word_tokenize(self.text)
        assert_true(is_generator(tokens))
        assert_equal(list(tokens), self.tokenizer.tokenize(self.text))

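# Quick interactive check of the same WordTokenizer API the tests above exercise;
# the expected outputs simply mirror the assertions in the test case.
from textblob.tokenizers import WordTokenizer

wt = WordTokenizer()
print(wt.tokenize("Python is a high-level programming language."))
# ['Python', 'is', 'a', 'high-level', 'programming', 'language', '.']
print(list(wt.itokenize("Python is a high-level programming language.", include_punc=False)))
# ['Python', 'is', 'a', 'high-level', 'programming', 'language']
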
from collections import Counter
from typing import List

from textblob.tokenizers import WordTokenizer


def find_ngrams(tweets: List[str], n: int, top: int):
    """Count word n-grams across the tweets and print the `top` most common."""
    ngram_counter: Counter = Counter()
    for tweet in tweets:
        tokenizer = WordTokenizer()
        tokens = tokenizer.tokenize(tweet, include_punc=True)
        # Slide a window of size n over the tokens; the +1 keeps the final n-gram
        for i in range(len(tokens) - n + 1):
            subwords = ' '.join(tokens[i:i + n])
            ngram_counter[subwords] += 1
    print(ngram_counter.most_common(top))

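# A quick, hypothetical check of find_ngrams above (the sample tweets are made up
# for illustration):
sample_tweets = [
    "python is great",
    "python is fun",
]
find_ngrams(sample_tweets, n=2, top=3)
# Prints something like: [('python is', 2), ('is great', 1), ('is fun', 1)]
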
def analyze(self, text):
    """Return the sentiment as a tuple of the form:
    ``(classification, pos_probability, neg_probability)``
    """
    # Lazily train the classifier
    super(NaiveBayesAnalyzer, self).analyze(text)
    tokenizer = WordTokenizer()
    tokens = tokenizer.tokenize(text, include_punc=False)
    filtered = [t.lower() for t in tokens if len(t) >= 3]
    feats = self._extract_feats(filtered)
    prob_dist = self._classifier.prob_classify(feats)
    # classification, p_pos, p_neg
    return prob_dist.max(), prob_dist.prob('pos'), prob_dist.prob('neg')

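# The analyze method above is TextBlob's NaiveBayesAnalyzer.analyze; the usual way
# to reach it is through the public TextBlob API. A minimal sketch, assuming
# textblob and the NLTK movie_reviews corpus are installed (the first call lazily
# trains the classifier, so it can take a while):
from textblob import TextBlob
from textblob.sentiments import NaiveBayesAnalyzer

blob = TextBlob("TextBlob makes text processing simple.", analyzer=NaiveBayesAnalyzer())
print(blob.sentiment)  # Sentiment(classification='pos' or 'neg', p_pos=..., p_neg=...)
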
import string

from textblob.tokenizers import SentenceTokenizer, WordTokenizer


class CharacterSkipGramAnalyzer(object):

    def __init__(self):
        self.sentencer = SentenceTokenizer()
        self.worder = WordTokenizer()

    def __call__(self, doc):
        tokens = []
        for sent in self.sentencer.tokenize(doc.lower()):
            # Drop punctuation, then split the sentence into words
            words = ''.join([ch for ch in sent if ch not in string.punctuation])
            words = self.worder.tokenize(words)
            for word in words:
                tokens.append(word.strip())
                # For words longer than two characters, also emit every
                # "skip one character" variant of the word
                if len(word) > 2:
                    for j in range(len(word)):
                        term = word[:j] + word[j + 1:]
                        tokens.append(term.strip())
        return tokens

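# CharacterSkipGramAnalyzer is shaped like the callable "analyzer" that
# scikit-learn vectorizers accept. A hedged sketch of that pairing, assuming
# scikit-learn is installed (this wiring is an illustration, not part of the
# original code):
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(analyzer=CharacterSkipGramAnalyzer())
X = vectorizer.fit_transform(["How are you? I am fine!"])
print(vectorizer.get_feature_names_out())
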
def tokenize_text(self, block):
    """Run the text string through TextBlob's tokenizer/lemmatizer."""

    def lemmatize_word(word):
        w = Word(word)
        return w.lemmatize().lower()

    tokenizer = WordTokenizer()
    tokens = tokenizer.tokenize(block)
    # ignoredwords is assumed to be a collection of words defined elsewhere
    filtered_words = [word.lower() for word in tokens if word not in ignoredwords]
    results = list(map(lemmatize_word, filtered_words))
    # Alternative: lemmatize in parallel with a worker pool
    # pool = Pool(5)
    # results = pool.map(self.lemmatize_word, tokens)
    # pool.close()
    # pool.join()
    return results

def correct(self):
    """Attempt to correct the spelling of a blob.

    .. versionadded:: 0.6.0

    :rtype: BaseBlob
    """
    tok = WordTokenizer()
    corrected = (Word(w).correct() for w in tok.tokenize(self.raw, include_punc=True))
    # Separate each token with a space unless the token is punctuation
    ret = ''
    for i, word in enumerate(corrected):
        # Avoid an extra space at the beginning
        if word in pystring.punctuation or i == 0:
            ret = ''.join([ret, word])
        else:
            ret = ' '.join([ret, word])
    return self.__class__(ret)

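# The correct() method above is (an older version of) TextBlob's spelling
# correction; through the public API it is simply:
from textblob import TextBlob

blob = TextBlob("I havv goood speling!")
print(blob.correct())  # e.g. "I have good spelling!" (exact output may vary)
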
def clean_tweet(tweet: str, should_remove_stopwords: bool = False) -> CleanedTweet:
    # Extract tokens from the tweet
    tokenizer = WordTokenizer()
    tokens = tokenizer.tokenize(tweet, include_punc=True)
    cleaned_tokens: List[str] = []
    for token in tokens:
        # Strip special characters and lowercase the token
        t = SPECIAL_CHARS.sub('', token).lower()
        # Standardize the HTML-escaped ampersand
        if t == 'amp':
            t = 'and'
        # Skip links, empty strings, and (optionally) stopwords
        if should_keep_token(t, should_remove_stopwords):
            cleaned_tokens.append(t)
    cleaned_tweet = ' '.join(cleaned_tokens)
    return CleanedTweet(text=cleaned_tweet, num_tokens=len(cleaned_tokens))

def transform(self, texts):
    """Transform data.

    :texts: The texts to count word lengths in
    :returns: list of counts for each text
    """
    mini, maxi = self.span
    # One bucket per word length in the inclusive range [mini, maxi]
    num_counts = maxi - mini + 1
    wt = WordTokenizer()
    tokens = [wt.tokenize(text) for text in texts]
    text_len_dist = []
    for line_tokens in tokens:
        counter = [0] * num_counts
        for word in line_tokens:
            word_len = len(word)
            if mini <= word_len <= maxi:
                # Index relative to mini so the first bucket is word length mini
                counter[word_len - mini] += 1
        text_len_dist.append(list(counter))
    return text_len_dist

import string

from FeatureExtraction.mainExtractor import CharacterAnalyzer
from textblob.tokenizers import SentenceTokenizer, WordTokenizer

sentencer = SentenceTokenizer()
worder = WordTokenizer()

sentences = ['How are you? I am fine!']
tokens = []
for sent in sentencer.tokenize(sentences[0].lower()):
    # Drop punctuation, then split the sentence into words
    words = ''.join([ch for ch in sent if ch not in string.punctuation])
    words = worder.tokenize(words)
    for word in words:
        tokens.append(word.strip())
        # Emit "skip one character" variants of longer words
        if len(word) > 2:
            for j in range(len(word)):
                term = word[:j] + word[j + 1:]
                tokens.append(term.strip())
print(tokens)