Example #1
class TestWordTokenizer(unittest.TestCase):

    '''An example unit test case.'''

    def setUp(self):
        self.tokenizer = WordTokenizer()
        self.text = "Python is a high-level programming language."

    def tearDown(self):
        pass

    def test_tokenize(self):
        assert_equal(self.tokenizer.tokenize(self.text),
            ['Python', 'is', 'a', 'high-level', 'programming',
            'language', '.'])

    def test_exclude_punc(self):
        assert_equal(self.tokenizer.tokenize(self.text, include_punc=False),
            ['Python', 'is', 'a', 'high-level', 'programming',
            'language'])

    def test_itokenize(self):
        gen = self.tokenizer.itokenize(self.text)
        assert_equal(next(gen), "Python")
        assert_equal(next(gen), "is")
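For reference, a minimal standalone sketch (assuming only that textblob is installed) of the behaviour these assertions exercise:

from textblob.tokenizers import WordTokenizer

tokenizer = WordTokenizer()
text = "Python is a high-level programming language."

# tokenize() returns a list of tokens; punctuation is kept by default
print(tokenizer.tokenize(text))
# ['Python', 'is', 'a', 'high-level', 'programming', 'language', '.']

# include_punc=False drops the trailing period
print(tokenizer.tokenize(text, include_punc=False))
# ['Python', 'is', 'a', 'high-level', 'programming', 'language']

# itokenize() is the lazy variant; it yields tokens one at a time
gen = tokenizer.itokenize(text)
print(next(gen), next(gen))  # Python is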
Example #2
class TestWordTokenizer(unittest.TestCase):

    def setUp(self):
        self.tokenizer = WordTokenizer()
        self.text = "Python is a high-level programming language."

    def tearDown(self):
        pass

    def test_tokenize(self):
        assert_equal(self.tokenizer.tokenize(self.text),
            ['Python', 'is', 'a', 'high-level', 'programming',
            'language', '.'])

    def test_exclude_punc(self):
        assert_equal(self.tokenizer.tokenize(self.text, include_punc=False),
            ['Python', 'is', 'a', 'high-level', 'programming',
            'language'])

    def test_itokenize(self):
        gen = self.tokenizer.itokenize(self.text)
        assert_equal(next(gen), "Python")
        assert_equal(next(gen), "is")

    def test_word_tokenize(self):
        tokens = word_tokenize(self.text)
        assert_true(is_generator(tokens))
        assert_equal(list(tokens), self.tokenizer.tokenize(self.text))
Example #3
class TestWordTokenizer(unittest.TestCase):

    def setUp(self):
        self.tokenizer = WordTokenizer()
        self.text = "Python is a high-level programming language."

    def tearDown(self):
        pass

    def test_tokenize(self):
        assert_equal(self.tokenizer.tokenize(self.text),
            ['Python', 'is', 'a', 'high-level', 'programming',
            'language', '.'])

    def test_exclude_punc(self):
        assert_equal(self.tokenizer.tokenize(self.text, include_punc=False),
            ['Python', 'is', 'a', 'high-level', 'programming',
            'language'])

    def test_itokenize(self):
        gen = self.tokenizer.itokenize(self.text)
        assert_equal(next(gen), "Python")
        assert_equal(next(gen), "is")

    def test_word_tokenize(self):
        tokens = word_tokenize(self.text)
        assert_true(is_generator(tokens))
        assert_equal(list(tokens), self.tokenizer.tokenize(self.text))
Example #4
 def words(self):
     '''Return a list of word tokens. This excludes punctuation characters.
     If you want to include punctuation characters, access the ``tokens``
     property.
     '''
     # NLTK's word tokenizer expects sentences as input, so tokenize the
     # blob into sentences before tokenizing to words
     tok = WordTokenizer()
     words = chain.from_iterable(tok.itokenize(sent.raw, include_punc=False)
                                 for sent in self.sentences)
     return WordList(words)
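The same sentence-then-word pipeline can be reproduced outside the class; a rough sketch (the WordList wrapper and the blob object are left out):

from itertools import chain
from textblob.tokenizers import SentenceTokenizer, WordTokenizer

text = "Python is great. It reads almost like pseudocode."
sent_tok = SentenceTokenizer()
word_tok = WordTokenizer()

# Split into sentences first, then flatten the per-sentence word streams
words = list(chain.from_iterable(
    word_tok.itokenize(sent, include_punc=False) for sent in sent_tok.tokenize(text)))
print(words)
# ['Python', 'is', 'great', 'It', 'reads', 'almost', 'like', 'pseudocode']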
Example #5
def contains_extractor(document):
    '''A basic document feature extractor that returns a dict of words that
    the document contains.
    '''
    tokenizer = WordTokenizer()
    if isinstance(document, basestring):
        tokens = set([strip_punc(w, all=False)
                    for w in tokenizer.itokenize(document, include_punc=False)])
    else:
        tokens = set((strip_punc(w, all=False) for w in document))
    features = dict((u'contains({0})'.format(w), True) for w in tokens)
    return features
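A hedged usage sketch: in current TextBlob releases this extractor is importable from textblob.classifiers, and for a plain string it yields a presence-only feature dict (key order is not guaranteed):

from textblob.classifiers import contains_extractor

features = contains_extractor("Python is a high-level language")
print(features)
# roughly: {'contains(Python)': True, 'contains(is)': True, 'contains(a)': True,
#           'contains(high-level)': True, 'contains(language)': True}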
Example #6
def contains_extractor(document):
    '''A basic document feature extractor that returns a dict of words that
    the document contains.
    '''
    tokenizer = WordTokenizer()
    if isinstance(document, basestring):
        tokens = set([strip_punc(w, all=False)
                    for w in tokenizer.itokenize(document, include_punc=False)])
    else:
        tokens = set((strip_punc(w, all=False) for w in document))
    features = dict((u'contains({0})'.format(w), True) for w in tokens)
    return features
Example #7
 def analyze(self, text):
     """Return the sentiment as a tuple of the form:
     ``(classification, pos_probability, neg_probability)``
     """
     # Lazily train the classifier
     super(NaiveBayesAnalyzer, self).analyze(text)
     tokenizer = WordTokenizer()
     tokens = tokenizer.tokenize(text, include_punc=False)
     filtered = [t.lower() for t in tokens if len(t) >= 3]
     feats = self._extract_feats(filtered)
     prob_dist = self._classifier.prob_classify(feats)
     # classification, p_pos, p_neg
     return prob_dist.max(), prob_dist.prob('pos'), prob_dist.prob("neg")
Example #8
from collections import Counter
from typing import List

from textblob.tokenizers import WordTokenizer


def find_ngrams(tweets: List[str], n: int, top: int):

    ngram_counter: Counter = Counter()

    for tweet in tweets:
        tokenizer = WordTokenizer()
        tokens = tokenizer.tokenize(tweet, include_punc=True)

        for i in range(len(tokens) - n + 1):  # +1 so the final n-gram is included
            subwords = ' '.join(tokens[i:i + n])
            ngram_counter[subwords] += 1

    print(ngram_counter.most_common(top))
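A quick way to exercise the helper above (assuming the definition and imports just shown; the counts are illustrative):

find_ngrams(
    tweets=["python is great", "python is fun"],
    n=2,
    top=3,
)
# prints something like [('python is', 2), ('is great', 1), ('is fun', 1)]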
Example #9
 def analyze(self, text):
     """Return the sentiment as a tuple of the form:
     ``(classification, pos_probability, neg_probability)``
     """
     # Lazily train the classifier
     super(NaiveBayesAnalyzer, self).analyze(text)
     tokenizer = WordTokenizer()
     tokens = tokenizer.tokenize(text, include_punc=False)
     filtered = [t.lower() for t in tokens if len(t) >= 3]
     feats = self._extract_feats(filtered)
     prob_dist = self._classifier.prob_classify(feats)
     # classification, p_pos, p_neg
     return prob_dist.max(), prob_dist.prob('pos'), prob_dist.prob("neg")
Example #10
 def analyze(self, text):
     """Return the sentiment as a named tuple of the form:
     ``Sentiment(classification, p_pos, p_neg)``
     """
     # Lazily train the classifier
     super(NaiveBayesAnalyzer, self).analyze(text)
     tokenizer = WordTokenizer()
     tokens = tokenizer.itokenize(text, include_punc=False)
     filtered = (t.lower() for t in tokens if len(t) >= 3)
     feats = self._extract_feats(filtered)
     prob_dist = self._classifier.prob_classify(feats)
     return self.RETURN_TYPE(
         classification=prob_dist.max(), p_pos=prob_dist.prob("pos"), p_neg=prob_dist.prob("neg")
     )
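In normal use this analyzer is not called directly; it is handed to a blob, roughly as below. The first call trains on NLTK's movie_reviews corpus, so it is slow and assumes that corpus has been downloaded.

from textblob import TextBlob
from textblob.sentiments import NaiveBayesAnalyzer

blob = TextBlob("I love this library", analyzer=NaiveBayesAnalyzer())
print(blob.sentiment)
# Sentiment(classification='pos', p_pos=..., p_neg=...)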
Example #11
def _get_words_from_dataset(dataset):
    '''Return a set of all words in a dataset.

    :param dataset: A list of tuples of the form ``(words, label)`` where
        ``words`` is either a string or a list of tokens.
    '''
    tokenizer = WordTokenizer()
    all_words = []
    for words, classification in dataset:
        # Words may either be a string or an iterable
        if isinstance(words, basestring):
            all_words.extend(tokenizer.itokenize(words, include_punc=False))
        else:
            all_words.extend(words)
    return set(all_words)
Example #12
 def words(self):
     '''Return a list of word tokens. This excludes punctuation characters.
     If you want to include punctuation characters, access the ``tokens``
     property.
     '''
     return WordList(WordTokenizer().itokenize(self.raw,
                                               include_punc=False))
Example #13
    def tag(self, corpus, tokenize=True):
        '''Tags a string `corpus`.'''
        # Assume untokenized corpus has \n between sentences and ' ' between words
        s_split = (SentenceTokenizer().tokenize if tokenize
                   else lambda t: t.split('\n'))
        w_split = WordTokenizer().tokenize if tokenize else lambda s: s.split()

        def split_sents(corpus):
            for s in s_split(corpus):
                yield w_split(s)

        prev, prev2 = self.START
        tokens = []
        for words in split_sents(corpus):
            context = self.START + [self._normalize(w)
                                    for w in words] + self.END
            for i, word in enumerate(words):
                tag = self.tagdict.get(word)
                if not tag:
                    features = self._get_features(i, word, context, prev,
                                                  prev2)
                    tag = self.model.predict(features)
                tokens.append((word, tag))
                prev2 = prev
                prev = tag
        return tokens
Example #14
    def tokenize_text(self, block):
        '''
        Runs the text string through TextBlob's tokenizer and lemmatizer.
        '''
        def lemmatize_word(word):
            w = Word(word)
            return w.lemmatize().lower()
        tokenizer = WordTokenizer()
        token = tokenizer.tokenize(block)

        filtered_words = [word.lower() for word in token if word not in ignoredwords]
        results = list(map(lemmatize_word, filtered_words))
        # pool = Pool(5)
        # results = pool.map(self.lemmatize_word, token)
        # pool.close()
        # pool.join()
        return results
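The lemmatization step above goes through TextBlob's Word wrapper; in isolation it behaves roughly like this (examples taken from the TextBlob quickstart):

from textblob import Word

print(Word("octopi").lemmatize())      # 'octopus' (nouns by default)
print(Word("running").lemmatize("v"))  # 'run' (with a verb POS hint)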
Example #15
    def correct(self):
        '''Attempt to correct the spelling of a blob.

        .. versionadded:: 0.6.0

        :rtype: BaseBlob
        '''
        tok = WordTokenizer()
        corrected = (Word(w).correct() for w in tok.tokenize(self.raw, include_punc=True))
        # Separate each token with a space unless the token is punctuation
        ret = ''
        for i, word in enumerate(corrected):
            # Avoid an extra space at the beginning
            if word in pystring.punctuation or i == 0:
                ret = ''.join([ret, word])
            else:
                ret = ' '.join([ret, word])
        return self.__class__(ret)
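End to end, the correction above is reached through a blob; a minimal sketch:

from textblob import TextBlob

blob = TextBlob("I havv goood speling!")
print(blob.correct())
# roughly: I have good spelling!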
Example #16
def basic_extractor(document, train_set):
    '''A basic document feature extractor that returns a dict indicating
    what words in ``train_set`` are contained in ``document``.

    :param document: The text to extract features from. Can be a string or an iterable.
    :param train_set: Training data set, a list of tuples of the form
        ``(words, label)``.
    '''
    tokenizer = WordTokenizer()
    word_features = _get_words_from_dataset(train_set)
    if isinstance(document, basestring):
        tokens = set([strip_punc(w, all=False)
                    for w in tokenizer.itokenize(document, include_punc=False)])
    else:
        tokens = set(strip_punc(w, all=False) for w in document)
    features = dict([(u'contains({0})'.format(word), (word in tokens))
                                            for word in word_features])
    return features
Example #17
def basic_extractor(document, train_set):
    '''A basic document feature extractor that returns a dict indicating
    what words in ``train_set`` are contained in ``document``.

    :param document: The text to extract features from. Can be a string or an iterable.
    :param train_set: Training data set, a list of tuples of the form
        ``(words, label)``.
    '''
    tokenizer = WordTokenizer()
    word_features = _get_words_from_dataset(train_set)
    if isinstance(document, basestring):
        tokens = set((strip_punc(w, all=False)
                    for w in tokenizer.itokenize(document, include_punc=False)))
    else:
        tokens = set(strip_punc(w, all=False) for w in document)
    features = dict(((u'contains({0})'.format(word), (word in tokens))
                                            for word in word_features))
    return features
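A hedged usage sketch, assuming basic_extractor is imported from textblob.classifiers: the resulting dict marks, for every word seen in the training set, whether it appears in the document (key order will vary):

from textblob.classifiers import basic_extractor

train = [("I love this sandwich", "pos"),
         ("I hate noise", "neg")]
print(basic_extractor("love this noise", train))
# roughly: {'contains(love)': True, 'contains(this)': True, 'contains(noise)': True,
#           'contains(I)': False, 'contains(sandwich)': False, 'contains(hate)': False}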
Example #18
    def correct(self):
        '''Attempt to correct the spelling of a blob.

        .. versionadded:: 0.6.0

        :rtype: BaseBlob
        '''
        tok = WordTokenizer()
        corrected = (Word(w).correct()
                     for w in tok.tokenize(self.raw, include_punc=True))
        # Separate each token with a space unless the token is punctuation
        ret = ''
        for i, word in enumerate(corrected):
            # Avoid an extra space at the beginning
            if word in pystring.punctuation or i == 0:
                ret = ''.join([ret, word])
            else:
                ret = ' '.join([ret, word])
        return self.__class__(ret)
Example #19
    def transform(self, texts):
        """ transform data

        :texts: The texts to count word lengths in
        :returns: list of counts for each text

        """
        mini, maxi = self.span
        num_counts = maxi - mini
        wt = WordTokenizer()
        tokens = [wt.tokenize(text) for text in texts]
        text_len_dist = []
        for line_tokens in tokens:
            counter = [0]*num_counts
            for word in line_tokens:
                word_len = len(word)
                if mini <= word_len <= maxi:
                    counter[word_len - 1] += 1
            text_len_dist.append(list(counter))
        return text_len_dist
Example #20
def clean_tweet(tweet: str,
                should_remove_stopwords: bool = False) -> CleanedTweet:
    # Extract tokens from each tweet
    tokenizer = WordTokenizer()
    tokens = tokenizer.tokenize(tweet, include_punc=True)

    cleaned_tokens: List[str] = []
    for token in tokens:
        t = SPECIAL_CHARS.sub('', token).lower()

        # Substitute the & symbol to standardize text
        if t == 'amp':
            t = 'and'

        # Keep the token unless it is a link, an empty string, or (optionally) a stopword
        if should_keep_token(t, should_remove_stopwords):
            cleaned_tokens.append(t)

    cleaned_tweet = ' '.join(cleaned_tokens)
    return CleanedTweet(text=cleaned_tweet, num_tokens=len(cleaned_tokens))
Example #21
    def transform(self, texts):
        """ transform data

        :texts: The texts to count word lengths in
        :returns: list of counts for each text

        """
        mini, maxi = self.span
        num_counts = maxi - mini
        wt = WordTokenizer()
        tokens = [wt.tokenize(text) for text in texts]
        text_len_dist = []
        for line_tokens in tokens:
            counter = [0] * num_counts
            for word in line_tokens:
                word_len = len(word)
                if mini <= word_len <= maxi:
                    counter[word_len - 1] += 1
            text_len_dist.append(list(counter))
        return text_len_dist
Example #22
 def words(self):
     '''Return a list of word tokens. This excludes punctuation characters.
     If you want to include punctuation characters, access the ``tokens``
     property.
     '''
     # NLTK's word tokenizer expects sentences as input, so tokenize the
     # blob into sentences before tokenizing to words
     words = []
     for sent in self.sentences:
         words.extend(WordTokenizer().tokenize(sent.raw,
                                               include_punc=False))
     return WordList(words)
Example #23
class TestWordTokenizer(unittest.TestCase):
    '''An example unit test case.'''
    def setUp(self):
        self.tokenizer = WordTokenizer()
        self.text = "Python is a high-level programming language."

    def tearDown(self):
        pass

    def test_tokenize(self):
        assert_equal(self.tokenizer.tokenize(self.text), [
            'Python', 'is', 'a', 'high-level', 'programming', 'language', '.'
        ])

    def test_exclude_punc(self):
        assert_equal(
            self.tokenizer.tokenize(self.text, include_punc=False),
            ['Python', 'is', 'a', 'high-level', 'programming', 'language'])

    def test_itokenize(self):
        gen = self.tokenizer.itokenize(self.text)
        assert_equal(next(gen), "Python")
        assert_equal(next(gen), "is")
Example #24
class CharacterSkipGramAnalyzer(object):

    def __init__(self):
        self.sentencer = SentenceTokenizer()
        self.worder = WordTokenizer()

    def __call__(self, doc):
        tokens = []
        for sent in self.sentencer.tokenize(doc.lower()):
            # Strip punctuation characters before word tokenization
            words = ''.join([ch for ch in sent if ch not in string.punctuation])
            words = self.worder.tokenize(words)

            for word in words:
                tokens.append(word.strip())
                # For longer words, also emit every single-character-deletion variant
                if len(word) > 2:
                    for j in range(0, len(word)):
                        term = word[:j] + word[j + 1:]
                        tokens.append(term.strip())
        return tokens
Example #25
import string

from FeatureExtraction.mainExtractor import CharacterAnalyzer
from textblob.tokenizers import SentenceTokenizer, WordTokenizer

sentencer = SentenceTokenizer()
worder = WordTokenizer()

sentences = ['How are you? I am fine!']

tokens = []
for sent in sentencer.tokenize(sentences[0].lower()):
    words = ''.join([ch for ch in sent if ch not in string.punctuation])
    words = worder.tokenize(words)

    for word in words:
        tokens.append(word.strip())
        if len(word) > 2:
            for j in range(0, len(word)):
                term = word[:j] + word[j + 1:]
                tokens.append(term.strip())

print(tokens)
Example #26
            cred_score -= 0.08
            output['team'] = -0.08

### PARTNERS/INVESTORS
    if 'partners' not in bodytext and 'investors' not in bodytext:
        cred_score -= 0.07
        output['partners'] = -0.07

### LANGUAGE
# root words only
    guarantee_words = ['guarantee', 'fixed', 'periodic', 'regular', 'permanent', 'steady', 'promise', 'assur', 'always']
    profit_words = ['profit', 'return', 'payout', 'earnings', 'income', 'interest', 'revenue', 'yield']
    hype_words = ['revolution', 'huge', 'incredible', 'unbelievable', 'safest', 'simplest', 'best', 'totally', 'perfect', 'immediate']
    danger_pairs = [('never', 'worry'), ('always', 'safe')]
    # should use machine learning to come up with a list of synonyms of these words
    blob = TextBlob(bodytext, tokenizer=WordTokenizer())
    words = blob.tokens
    check_pairs(itertools.product(guarantee_words, profit_words), words, 0.1)
    # a phrase along the lines of "guaranteed profits" is flagged
    check_pairs(danger_pairs, words, 0.03)
    # a phrase along the lines of "never worry" or "always safe" is flagged
    check_pairs(zip(hype_words, [''] * len(hype_words)), words, 0.05)
    # any of these hype words on its own is flagged

### LEGAL INFORMATION
    if not ('terms & conditions' in bodytext or 'terms and conditions' in bodytext or 'terms of use' in bodytext):
        cred_score -= 0.03
        output['terms'] = -0.03

### WHITEPAPER
    have_whitepaper = False
    have_roadmap = False
Example #27
 def setUp(self):
     self.tokenizer = WordTokenizer()
     self.text = "Python is a high-level programming language."
Example #28
class BaseBlob(StringlikeMixin, BlobComparableMixin):
    """An abstract base class that all textblob classes will inherit from.
    Includes words, POS tag, NP, and word count properties. Also includes
    basic dunder and string methods for making objects like Python strings.

    :param text: A string.
    :param tokenizer: (optional) A tokenizer instance. If ``None``,
        defaults to :class:`WordTokenizer() <textblob.tokenizers.WordTokenizer>`.
    :param np_extractor: (optional) An NPExtractor instance. If ``None``,
        defaults to :class:`FastNPExtractor() <textblob.en.np_extractors.FastNPExtractor>`.
    :param pos_tagger: (optional) A Tagger instance. If ``None``,
        defaults to :class:`NLTKTagger <textblob.en.taggers.NLTKTagger>`.
    :param analyzer: (optional) A sentiment analyzer. If ``None``,
        defaults to :class:`PatternAnalyzer <textblob.en.sentiments.PatternAnalyzer>`.
    :param parser: A parser. If ``None``, defaults to
        :class:`PatternParser <textblob.en.parsers.PatternParser>`.
    :param classifier: A classifier.

    .. versionchanged:: 0.6.0
        ``clean_html`` parameter deprecated, as it was in NLTK.
    """
    np_extractor = FastNPExtractor()
    pos_tagger = NLTKTagger()
    tokenizer = WordTokenizer()
    translator = Translator()
    analyzer = PatternAnalyzer()
    parser = PatternParser()

    def __init__(self,
                 text,
                 tokenizer=None,
                 pos_tagger=None,
                 np_extractor=None,
                 analyzer=None,
                 parser=None,
                 classifier=None,
                 clean_html=False):
        if not isinstance(text, basestring):
            raise TypeError('The `text` argument passed to `__init__(text)` '
                            'must be a string, not {0}'.format(type(text)))
        if clean_html:
            raise NotImplementedError(
                "clean_html has been deprecated. "
                "To remove HTML markup, use BeautifulSoup's "
                "get_text() function")
        self.raw = self.string = text
        self.stripped = lowerstrip(self.raw, all=True)
        _initialize_models(self, tokenizer, pos_tagger, np_extractor, analyzer,
                           parser, classifier)

    @cached_property
    def words(self):
        """Return a list of word tokens. This excludes punctuation characters.
        If you want to include punctuation characters, access the ``tokens``
        property.

        :returns: A :class:`WordList <WordList>` of word tokens.
        """
        return WordList(word_tokenize(self.raw, include_punc=False))

    @cached_property
    def tokens(self):
        """Return a list of tokens, using this blob's tokenizer object
        (defaults to :class:`WordTokenizer <textblob.tokenizers.WordTokenizer>`).
        """
        return WordList(self.tokenizer.tokenize(self.raw))

    def tokenize(self, tokenizer=None):
        """Return a list of tokens, using ``tokenizer``.

        :param tokenizer: (optional) A tokenizer object. If None, defaults to
            this blob's default tokenizer.
        """
        t = tokenizer if tokenizer is not None else self.tokenizer
        return WordList(t.tokenize(self.raw))

    def parse(self, parser=None):
        """Parse the text.

        :param parser: (optional) A parser instance. If ``None``, defaults to
            this blob's default parser.

        .. versionadded:: 0.6.0
        """
        p = parser if parser is not None else self.parser
        return p.parse(self.raw)

    def classify(self):
        """Classify the blob using the blob's ``classifier``."""
        if self.classifier is None:
            raise NameError("This blob has no classifier. Train one first!")
        return self.classifier.classify(self.raw)

    @cached_property
    def sentiment(self):
        """Return a tuple of form (polarity, subjectivity ) where polarity
        is a float within the range [-1.0, 1.0] and subjectivity is a float
        within the range [0.0, 1.0] where 0.0 is very objective and 1.0 is
        very subjective.

        :rtype: namedtuple of the form ``Sentiment(polarity, subjectivity)``
        """
        return self.analyzer.analyze(self.raw)

    @cached_property
    def sentiment_assessments(self):
        """Return a tuple of form (polarity, subjectivity, assessments ) where
        polarity is a float within the range [-1.0, 1.0], subjectivity is a
        float within the range [0.0, 1.0] where 0.0 is very objective and 1.0
        is very subjective, and assessments is a list of polarity and
        subjectivity scores for the assessed tokens.

        :rtype: namedtuple of the form ``Sentiment(polarity, subjectivity,
        assessments)``
        """
        return self.analyzer.analyze(self.raw, keep_assessments=True)

    @cached_property
    def polarity(self):
        """Return the polarity score as a float within the range [-1.0, 1.0]

        :rtype: float
        """
        return PatternAnalyzer().analyze(self.raw)[0]

    @cached_property
    def subjectivity(self):
        """Return the subjectivity score as a float within the range [0.0, 1.0]
        where 0.0 is very objective and 1.0 is very subjective.

        :rtype: float
        """
        return PatternAnalyzer().analyze(self.raw)[1]

    @cached_property
    def noun_phrases(self):
        """Returns a list of noun phrases for this blob."""
        return WordList([
            phrase.strip().lower()
            for phrase in self.np_extractor.extract(self.raw)
            if len(phrase) > 1
        ])

    @cached_property
    def pos_tags(self):
        """Returns an list of tuples of the form (word, POS tag).

        Example:
        ::

            [('At', 'IN'), ('eight', 'CD'), ("o'clock", 'JJ'), ('on', 'IN'),
                    ('Thursday', 'NNP'), ('morning', 'NN')]

        :rtype: list of tuples
        """
        if isinstance(self, TextBlob):
            return [
                val for sublist in [s.pos_tags for s in self.sentences]
                for val in sublist
            ]
        else:
            return [(Word(word, pos_tag=t), unicode(t))
                    for word, t in self.pos_tagger.tag(self)
                    if not PUNCTUATION_REGEX.match(unicode(t))]

    tags = pos_tags

    @cached_property
    def word_counts(self):
        """Dictionary of word frequencies in this text.
        """
        counts = defaultdict(int)
        stripped_words = [lowerstrip(word) for word in self.words]
        for word in stripped_words:
            counts[word] += 1
        return counts

    @cached_property
    def np_counts(self):
        """Dictionary of noun phrase frequencies in this text.
        """
        counts = defaultdict(int)
        for phrase in self.noun_phrases:
            counts[phrase] += 1
        return counts

    def ngrams(self, n=3):
        """Return a list of n-grams (tuples of n successive words) for this
        blob.

        :rtype: List of :class:`WordLists <WordList>`
        """
        if n <= 0:
            return []
        grams = [
            WordList(self.words[i:i + n])
            for i in range(len(self.words) - n + 1)
        ]
        return grams

    def translate(self, from_lang="auto", to="en"):
        """Translate the blob to another language.
        Uses the Google Translate API. Returns a new TextBlob.

        Requires an internet connection.

        Usage:
        ::

            >>> b = TextBlob("Simple is better than complex")
            >>> b.translate(to="es")
            TextBlob('Lo simple es mejor que complejo')

        Language code reference:
            https://developers.google.com/translate/v2/using_rest#language-params

        .. versionadded:: 0.5.0.

        :param str from_lang: Language to translate from. If ``None``, will attempt
            to detect the language.
        :param str to: Language to translate to.
        :rtype: :class:`BaseBlob <BaseBlob>`
        """
        return self.__class__(
            self.translator.translate(self.raw,
                                      from_lang=from_lang,
                                      to_lang=to))

    def detect_language(self):
        """Detect the blob's language using the Google Translate API.

        Requires an internet connection.

        Usage:
        ::

            >>> b = TextBlob("bonjour")
            >>> b.detect_language()
            u'fr'

        Language code reference:
            https://developers.google.com/translate/v2/using_rest#language-params

        .. versionadded:: 0.5.0

        :rtype: str
        """
        return self.translator.detect(self.raw)

    def correct(self):
        """Attempt to correct the spelling of a blob.

        .. versionadded:: 0.6.0

        :rtype: :class:`BaseBlob <BaseBlob>`
        """
        # regex matches: word or punctuation or whitespace
        tokens = nltk.tokenize.regexp_tokenize(self.raw, r"\w+|[^\w\s]|\s")
        corrected = (Word(w).correct() for w in tokens)
        ret = ''.join(corrected)
        return self.__class__(ret)

    def _cmpkey(self):
        """Key used by ComparableMixin to implement all rich comparison
        operators.
        """
        return self.raw

    def _strkey(self):
        """Key used by StringlikeMixin to implement string methods."""
        return self.raw

    def __hash__(self):
        return hash(self._cmpkey())

    def __add__(self, other):
        '''Concatenates two text objects the same way Python strings are
        concatenated.

        Arguments:
        - `other`: a string or a text object
        '''
        if isinstance(other, basestring):
            return self.__class__(self.raw + other)
        elif isinstance(other, BaseBlob):
            return self.__class__(self.raw + other.raw)
        else:
            raise TypeError(
                'Operands must be either strings or {0} objects'.format(
                    self.__class__.__name__))

    def split(self, sep=None, maxsplit=sys.maxsize):
        """Behaves like the built-in str.split() except returns a
        WordList.

        :rtype: :class:`WordList <WordList>`
        """
        return WordList(self._strkey().split(sep, maxsplit))
Example #29
import pandas as pd
import numpy as np

WineDataset = pd.read_csv("../WineDataset.csv", encoding="ISO-8859-1")

# The goal of using TextBlob here is to gather the Polarity and Subjectivity of each description.
# Polarity has a range of [-1, 1] and measures how positive or negative the description is.
# Subjectivity has a range of [0, 1] and measures how objective (0) or subjective (1) each description is.
#https://textblob.readthedocs.io/en/dev/quickstart.html#create-a-textblob
from textblob import TextBlob
from textblob.tokenizers import WordTokenizer
tokenizer = WordTokenizer()
WineDataset['Polarity'] = WineDataset["description"].apply(
    lambda text: TextBlob(text, tokenizer=tokenizer).polarity)
WineDataset['Subjectivity'] = WineDataset["description"].apply(
    lambda text: TextBlob(text, tokenizer=tokenizer).subjectivity)

# This model will contain each wine's Polarity, Subjectivity, Price, and Country of Origin; the target will be
# the number of points it received.
WineDataset_ = WineDataset.loc[:, [
    "Polarity", "Subjectivity", "price", "points", "country"
]]
columns = ["country"]
WineDataset_ = pd.get_dummies(WineDataset_, columns=columns)
WineDataset = WineDataset_.dropna()

X = WineDataset.drop(columns="points", axis=1)
Y = WineDataset["points"]

from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(X,
Example #30
def test_get_words_from_dataset():
    tok = WordTokenizer()
    all_words = []
    for words, _ in train_set:
        all_words.extend(tok.itokenize(words, include_punc=False))
    assert_equal(_get_words_from_dataset(train_set), set(all_words))
Example #31
import string

from FeatureExtraction.mainExtractor import CharacterAnalyzer
from textblob.tokenizers import SentenceTokenizer, WordTokenizer


sentencer = SentenceTokenizer()
worder = WordTokenizer()

sentences = ['How are you? I am fine!']

tokens = []
for sent in sentencer.tokenize(sentences[0].lower()):
    words = ''.join([ch for ch in sent if ch not in string.punctuation])
    words = worder.tokenize(words)

    for word in words:
        tokens.append(word.strip())
        if len(word) > 2:
            for j in range(0, len(word)):
                term = word[:j] + word[j + 1:]
                tokens.append(term.strip())

print(tokens)
Example #32
 def setUp(self):
     self.tokenizer = WordTokenizer()
     self.text = "Python is a high-level programming language."
Example #33
 def test_tokens_property(self):
     assert_equal(self.blob.tokens,
         tb.WordList(WordTokenizer().tokenize(self.text)))
Example #34
import helper
import json
import os 
import sqlite3
from textblob.en.taggers import PatternTagger
from textblob.tokenizers import WordTokenizer

tk = WordTokenizer()
tagger = PatternTagger()

# since lots of repeat words, we store an index to the actual token.
keys = []
def key_to_int(key):
  try:
    return keys.index(key) 
  except ValueError:
    keys.append(key)
    return len(keys) - 1

ntoken_freq = {}
npos_freq = {}

conn = sqlite3.connect("data.db")
c = conn.cursor()

USAGE_MINIMUM = 15
NTOKENS_PURGE_THRESHOLD = 5E6

# used to track progress
posts_processed = 0
Example #35
 def __init__(self):
     self.sentencer = SentenceTokenizer()
     self.worder = WordTokenizer()
Example #36
class Blobber(object):
    """A factory for TextBlobs that all share the same tagger,
    tokenizer, parser, classifier, and np_extractor.

    Usage:

        >>> from textblob import Blobber
        >>> from textblob.taggers import NLTKTagger
        >>> from textblob.tokenizers import SentenceTokenizer
        >>> tb = Blobber(pos_tagger=NLTKTagger(), tokenizer=SentenceTokenizer())
        >>> blob1 = tb("This is one blob.")
        >>> blob2 = tb("This blob has the same tagger and tokenizer.")
        >>> blob1.pos_tagger is blob2.pos_tagger
        True

    :param tokenizer: (optional) A tokenizer instance. If ``None``,
        defaults to :class:`WordTokenizer() <textblob.tokenizers.WordTokenizer>`.
    :param np_extractor: (optional) An NPExtractor instance. If ``None``,
        defaults to :class:`FastNPExtractor() <textblob.en.np_extractors.FastNPExtractor>`.
    :param pos_tagger: (optional) A Tagger instance. If ``None``,
        defaults to :class:`NLTKTagger <textblob.en.taggers.NLTKTagger>`.
    :param analyzer: (optional) A sentiment analyzer. If ``None``,
        defaults to :class:`PatternAnalyzer <textblob.en.sentiments.PatternAnalyzer>`.
    :param parser: A parser. If ``None``, defaults to
        :class:`PatternParser <textblob.en.parsers.PatternParser>`.
    :param classifier: A classifier.

    .. versionadded:: 0.4.0
    """

    np_extractor = FastNPExtractor()
    pos_tagger = NLTKTagger()
    tokenizer = WordTokenizer()
    analyzer = PatternAnalyzer()
    parser = PatternParser()

    def __init__(self,
                 tokenizer=None,
                 pos_tagger=None,
                 np_extractor=None,
                 analyzer=None,
                 parser=None,
                 classifier=None):
        _initialize_models(self, tokenizer, pos_tagger, np_extractor, analyzer,
                           parser, classifier)

    def __call__(self, text):
        """Return a new TextBlob object with this Blobber's ``np_extractor``,
        ``pos_tagger``, ``tokenizer``, ``analyzer``, and ``classifier``.

        :returns: A new :class:`TextBlob <TextBlob>`.
        """
        return TextBlob(text,
                        tokenizer=self.tokenizer,
                        pos_tagger=self.pos_tagger,
                        np_extractor=self.np_extractor,
                        analyzer=self.analyzer,
                        parser=self.parser,
                        classifier=self.classifier)

    def __repr__(self):
        classifier_name = self.classifier.__class__.__name__ + "()" if self.classifier else "None"
        return ("Blobber(tokenizer={0}(), pos_tagger={1}(), "
                    "np_extractor={2}(), analyzer={3}(), parser={4}(), classifier={5})")\
                    .format(self.tokenizer.__class__.__name__,
                            self.pos_tagger.__class__.__name__,
                            self.np_extractor.__class__.__name__,
                            self.analyzer.__class__.__name__,
                            self.parser.__class__.__name__,
                            classifier_name)

    __str__ = __repr__
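The docstring demonstrates the shared-resource behaviour with a SentenceTokenizer; the same holds when a WordTokenizer is passed explicitly, a small sketch:

from textblob import Blobber
from textblob.tokenizers import WordTokenizer

tb = Blobber(tokenizer=WordTokenizer())
blob1 = tb("This is one blob.")
blob2 = tb("This is another blob.")
print(blob1.tokenizer is blob2.tokenizer)  # True: both blobs share one tokenizer instance
print(blob1.tokens)  # word tokens, e.g. ['This', 'is', 'one', 'blob', '.']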
Example #37
flags.DEFINE_bool("build_features", False, "build column features")
flags.DEFINE_bool("build_fasttext", False, "build fasttext features")
flags.DEFINE_bool("build_tfrecord", False,
                  "build tensorflow record input files")
flags.DEFINE_integer("nrows", 100, "The TOP number of rows to query")

prog = re.compile("[\\W\\d]", re.UNICODE)
prog_with_digits = re.compile("[\\W]", re.UNICODE)

stemmer = SnowballStemmer("russian", ignore_stopwords=True)

float_prog = re.compile(r"[-+]?\d*\.\d+|\d+", re.UNICODE)
dot_prog = re.compile(r'[xх*]', re.UNICODE)

TransTable = str.maketrans(dict.fromkeys(r'~/-\[\]()|{}:^+', ' '))
wt = WordTokenizer()
trans = Transliterator.createInstance('Latin-Cyrillic')

unit_lookup = {
    'г': 'грамм',
    'грам': 'грамм',
    'гр': 'грамм',
    'грамм': 'грамм',
    'gr': 'грамм',
    'ml': 'мл',
    'милл': 'мл',
    'млитр': 'мл',
    'млтр': 'мл',
    'мл': 'мл',
    'ш': 'шт',
    'шт': 'шт',