Example #1
class TestWordTokenizer(unittest.TestCase):

    def setUp(self):
        self.tokenizer = WordTokenizer()
        self.text = "Python is a high-level programming language."

    def tearDown(self):
        pass

    def test_tokenize(self):
        assert_equal(self.tokenizer.tokenize(self.text),
            ['Python', 'is', 'a', 'high-level', 'programming',
            'language', '.'])

    def test_exclude_punc(self):
        assert_equal(self.tokenizer.tokenize(self.text, include_punc=False),
            ['Python', 'is', 'a', 'high-level', 'programming',
            'language'])

    def test_itokenize(self):
        gen = self.tokenizer.itokenize(self.text)
        assert_equal(next(gen), "Python")
        assert_equal(next(gen), "is")

    def test_word_tokenize(self):
        tokens = word_tokenize(self.text)
        assert_true(is_generator(tokens))
        assert_equal(list(tokens), self.tokenizer.tokenize(self.text))
Example #2
class TestWordTokenizer(unittest.TestCase):

    '''An example unit test case.'''

    def setUp(self):
        self.tokenizer = WordTokenizer()
        self.text = "Python is a high-level programming language."

    def tearDown(self):
        pass

    def test_tokenize(self):
        assert_equal(self.tokenizer.tokenize(self.text),
            ['Python', 'is', 'a', 'high-level', 'programming',
            'language', '.'])

    def test_exclude_punc(self):
        assert_equal(self.tokenizer.tokenize(self.text, include_punc=False),
            ['Python', 'is', 'a', 'high-level', 'programming',
            'language'])

    def test_itokenize(self):
        gen = self.tokenizer.itokenize(self.text)
        assert_equal(next(gen), "Python")
        assert_equal(next(gen), "is")
Example #4
from collections import Counter
from typing import List

from textblob.tokenizers import WordTokenizer


def find_ngrams(tweets: List[str], n: int, top: int) -> None:
    """Count word n-grams across the tweets and print the top most common ones."""
    ngram_counter: Counter = Counter()
    tokenizer = WordTokenizer()

    for tweet in tweets:
        tokens = tokenizer.tokenize(tweet, include_punc=True)

        # Use len(tokens) - n + 1 so the final n-gram is not dropped
        for i in range(len(tokens) - n + 1):
            subwords = ' '.join(tokens[i:i + n])
            ngram_counter[subwords] += 1

    print(ngram_counter.most_common(top))
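A quick sanity check with made-up input (the tweet strings below are placeholders, not data from the original project):

sample_tweets = [
    "python is fun",
    "python is dynamically typed",
]
find_ngrams(sample_tweets, n=2, top=3)
# prints something like: [('python is', 2), ('is fun', 1), ('is dynamically', 1)]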
Example #5
    def analyze(self, text):
        """Return the sentiment as a tuple of the form:
        ``(classification, pos_probability, neg_probability)``
        """
        # Lazily train the classifier
        super(NaiveBayesAnalyzer, self).analyze(text)
        tokenizer = WordTokenizer()
        tokens = tokenizer.tokenize(text, include_punc=False)
        filtered = [t.lower() for t in tokens if len(t) >= 3]
        feats = self._extract_feats(filtered)
        prob_dist = self._classifier.prob_classify(feats)
        # classification, p_pos, p_neg
        return prob_dist.max(), prob_dist.prob('pos'), prob_dist.prob('neg')
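The feature extractor itself is not shown above. A plausible bag-of-words shape for feats (an assumption for illustration, not necessarily TextBlob's actual implementation) is a dict of feature name to value, which is what NLTK's NaiveBayesClassifier consumes:

def extract_feats(words):
    # Mark each word as present; NLTK classifiers take dicts of feature -> value
    return {'contains({0})'.format(w): True for w in words}

print(extract_feats(['python', 'high-level', 'language']))
# {'contains(python)': True, 'contains(high-level)': True, 'contains(language)': True}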
Example #7
class TestWordTokenizer(unittest.TestCase):
    '''An example unit test case.'''
    def setUp(self):
        self.tokenizer = WordTokenizer()
        self.text = "Python is a high-level programming language."

    def tearDown(self):
        pass

    def test_tokenize(self):
        assert_equal(self.tokenizer.tokenize(self.text), [
            'Python', 'is', 'a', 'high-level', 'programming', 'language', '.'
        ])

    def test_exclude_punc(self):
        assert_equal(
            self.tokenizer.tokenize(self.text, include_punc=False),
            ['Python', 'is', 'a', 'high-level', 'programming', 'language'])

    def test_itokenize(self):
        gen = self.tokenizer.itokenize(self.text)
        assert_equal(next(gen), "Python")
        assert_equal(next(gen), "is")
Example #8
import string

from textblob import Word
from textblob.tokenizers import SentenceTokenizer, WordTokenizer


class CharacterSkipGramAnalyzer(object):
    def __init__(self):
        self.sentencer = SentenceTokenizer()
        self.worder = WordTokenizer()

    def __call__(self, doc):
        tokens = []
        for sent in self.sentencer.tokenize(doc.lower()):
            # Strip punctuation characters, then split the sentence into words
            words = ''.join([ch for ch in sent if ch not in string.punctuation])
            words = self.worder.tokenize(words)

            for word in words:
                tokens.append(word.strip())
                # For longer words, also add every variant with one character deleted
                if len(word) > 2:
                    for j in range(len(word)):
                        term = word[:j] + word[j + 1:]
                        tokens.append(term.strip())
        return tokens

    def tokenize_text(self, block):
        '''
        Run the text string through TextBlob's tokenizer and lemmatizer.
        '''
        def lemmatize_word(word):
            w = Word(word)
            return w.lemmatize().lower()

        tokenizer = WordTokenizer()
        tokens = tokenizer.tokenize(block)

        # `ignoredwords` (a stopword set) is defined elsewhere in the original project
        filtered_words = [word.lower() for word in tokens if word not in ignoredwords]
        results = list(map(lemmatize_word, filtered_words))
        # pool = Pool(5)
        # results = pool.map(self.lemmatize_word, tokens)
        # pool.close()
        # pool.join()
        return results
Example #10
File: blob.py  Project: DDani/TextBlob
    def correct(self):
        '''Attempt to correct the spelling of a blob.

        .. versionadded:: 0.6.0

        :rtype: BaseBlob
        '''
        tok = WordTokenizer()
        corrected = (Word(w).correct() for w in tok.tokenize(self.raw, include_punc=True))
        # Separate each token with a space unless the token is punctuation
        ret = ''
        for i, word in enumerate(corrected):
            # Avoid an extra space at the beginning
            if word in pystring.punctuation or i == 0:
                ret = ''.join([ret, word])
            else:
                ret = ' '.join([ret, word])
        return self.__class__(ret)
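For reference, this method is normally reached through the public TextBlob API; a quick usage sketch (the exact corrected output depends on the spelling model bundled with the installed TextBlob version):

from textblob import TextBlob

blob = TextBlob("I havv goood speling!")
print(blob.correct())  # typically: "I have good spelling!"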
Example #12
def clean_tweet(tweet: str,
                should_remove_stopwords: bool = False) -> CleanedTweet:
    # Extract tokens from each tweet
    tokenizer = WordTokenizer()
    tokens = tokenizer.tokenize(tweet, include_punc=True)

    cleaned_tokens: List[str] = []
    for token in tokens:
        t = SPECIAL_CHARS.sub('', token).lower()

        # Substitute the & symbol to standardize text
        if t == 'amp':
            t = 'and'

        # Skip all links and empty strings
        if should_keep_token(t, should_remove_stopwords):
            cleaned_tokens.append(t)  # already lowercased above

    cleaned_tweet = ' '.join(cleaned_tokens)
    return CleanedTweet(text=cleaned_tweet, num_tokens=len(cleaned_tokens))
Example #13
    def transform(self, texts):
        """Transform data.

        :texts: The texts to count word lengths in
        :returns: list of counts for each text

        """
        mini, maxi = self.span
        num_counts = maxi - mini + 1  # one bucket per word length in [mini, maxi]
        wt = WordTokenizer()
        tokens = [wt.tokenize(text) for text in texts]
        text_len_dist = []
        for line_tokens in tokens:
            counter = [0] * num_counts
            for word in line_tokens:
                word_len = len(word)
                if mini <= word_len <= maxi:
                    # Offset by the lower bound so a word of length maxi stays in range
                    counter[word_len - mini] += 1
            text_len_dist.append(list(counter))
        return text_len_dist
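For a single text, the word-length histogram built by the method above looks like this (a standalone sketch, not code from the project; the span bounds are assumed to be inclusive):

from textblob.tokenizers import WordTokenizer

mini, maxi = 1, 11  # assumed span
counter = [0] * (maxi - mini + 1)
for word in WordTokenizer().tokenize("Python is a high-level programming language."):
    if mini <= len(word) <= maxi:
        counter[len(word) - mini] += 1
print(counter)  # bucket i holds the number of words of length mini + i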
Example #15
import string

from FeatureExtraction.mainExtractor import CharacterAnalyzer
from textblob.tokenizers import SentenceTokenizer, WordTokenizer

sentencer = SentenceTokenizer()
worder = WordTokenizer()

sentences = ['How are you? I am fine!']

tokens = []
for sent in sentencer.tokenize(sentences[0].lower()):
    words = ''.join([ch for ch in sent if ch not in string.punctuation])
    words = worder.tokenize(words)

    for word in words:
        tokens.append(word.strip())
        if len(word) > 2:
            for j in range(0, len(word)):
                term = word[:j] + word[j + 1:]
                tokens.append(term.strip())

print(tokens)
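For a single word, the inner loop expands it into every variant with one character deleted; a quick check of that step in isolation:

word = 'fine'
variants = [word[:j] + word[j + 1:] for j in range(len(word))]
print(variants)  # ['ine', 'fne', 'fie', 'fin']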