Example #1
 def test_stem(self):  # only the default PorterStemmer is tested
     w = tb.Word("cars")
     assert_equal(w.stem(), "car")
     w = tb.Word("wolves")
     assert_equal(w.stem(), "wolv")
     w = tb.Word("went")
     assert_equal(w.stem(), "went")
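Most of the test_* snippets on this page are lifted from TextBlob's own test suite and share a setup the excerpts omit (the tb alias, nose-style assertion helpers, and a wordnet shortcut module providing wn.NOUN, wn.Synset, wn.Lemma; Example #26's bare Synset would come from the same place). A minimal sketch of what they assume; the suite's actual imports may differ in detail:

# Assumed shared test setup (a sketch, not the suite's verbatim header):
import textblob as tb
from textblob import wordnet as wn  # wn.NOUN, wn.Synset, wn.Lemma
from nose.tools import assert_equal, assert_true, assert_raises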
Example #2
 def test_pop(self):
     wl = tb.WordList(['cats', 'dogs'])
     assert_equal(wl.pop(), tb.Word('dogs'))
     assert_raises(IndexError, wl.__getitem__, 1)
     assert_equal(wl.pop(), tb.Word('cats'))
     assert_equal(len(wl), 0)
     assert_raises(IndexError, wl.pop)
Example #3
 def test_lemmatize(self):
     w = tb.Word("cars")
     assert_equal(w.lemmatize(), "car")
     w = tb.Word("wolves")
     assert_equal(w.lemmatize(), "wolf")
     w = tb.Word("went")
     assert_equal(w.lemmatize("v"), "go")
Example #4
 def test_lemmatize(self):
     w = tb.Word("cars")
     assert_equal(w.lemmatize(), "car")
     w = tb.Word("wolves")
     assert_equal(w.lemmatize(), "wolf")
     w = tb.Word("went")
     assert_equal(w.lemmatize("v"), "go") # wordnet tagset
     assert_equal(w.lemmatize("VBD"), "go") # penn treebank tagset
Example #5
def swap_paragraph(mapping, paragraph):
    b = textblob.TextBlob(paragraph.replace('--', u' – '))
    new_sentences = []

    for sentence in b.sentences:
        new_words = []
        lengths = []

        # Inlined to exclude 'if not PUNCTUATION_REGEX.match(unicode(t))]'
        sentence_pos_tags = [
            (textblob.Word(word, pos_tag=t), str(t))
            for word, t in sentence.pos_tagger.tag(sentence.raw)
        ]
        for (word, pos_tag), (next_word, next_pos_tag) in shift_zip(sentence_pos_tags):
            replacements = mapping.map(word, next_word)
            if replacements:
                new_word = Substitution(word, '|'.join(replacements))
            else:
                new_word = word

            new_words.append(new_word)
            lengths.append(max(len(x) for x in (word, pos_tag, new_word)))

            # TODO: remove whitespace around punctuation.

        #print(' '.join('%*s' % (l, x) for l, (x, t) in zip(lengths, sentence.pos_tags)))
        #print(' '.join('%*s' % (l, t) for l, (x, t) in zip(lengths, sentence.pos_tags)))
        #print(' '.join('%*s' % (l, y) for l, y in zip(lengths, new_words)))
        #print()
        new_sentences.append(new_words)

    return reassemble(new_sentences)
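shift_zip, Substitution, and reassemble are helpers from this example's own codebase and are not shown. Judging from the tuple unpacking in the loop, shift_zip pairs each tagged word with its successor; a plausible sketch, where the (None, None) sentinel for the final word is an assumption:

def shift_zip(items):
    # Pair each (word, tag) tuple with the one that follows it; the
    # last tuple is paired with (None, None) so the caller's unpacking
    # of the successor still works at the end of the sentence.
    return zip(items, items[1:] + [(None, None)])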
Example #6
    def map(self, original_word, successor):
        if original_word.pos_tag == PosTag.NNS:
            word = original_word.singularize()
            word.pos_tag = PosTag.NN
        else:
            word = original_word

        for rule in self.rules:
            if word.pos_tag in rule.pos and rule.matches(word):
                replacement = textblob.Word(rule.apply(word), pos_tag=word.pos_tag)
                if replacement == 'hress':
                    continue
                #if replacement.definitions == []:
                #    continue

                #if rule.if_followed_by is None or (
                #    successor is not None and
                #    successor.pos_tag in rule.if_followed_by)

                if original_word.pos_tag == PosTag.NNS:
                    return { replacement.pluralize() }
                else:
                    return { replacement }

        return set()
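PosTag is likewise not shown. The way tags are compared and assigned back onto Word objects suggests simple string constants for Penn Treebank tags; a minimal sketch covering only the tags these examples touch:

class PosTag:
    # Hypothetical constants inferred from usage; the real class
    # presumably covers the full Penn Treebank tagset.
    NN = 'NN'    # noun, singular
    NNS = 'NNS'  # noun, plural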
Example #7
def is_body_part(word):
    #print "Entered is_animal with ", word
    w = textblob.Word(word).synsets
    if len(w) == 0:
        return False
    ss = w[0]
    return traverse(ss, "body_part")
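traverse is not defined in the excerpt; it evidently walks a synset's hypernym chain looking for a named category. A plausible sketch against NLTK's WordNet API:

def traverse(synset, target):
    # True if this synset, or any synset above it in the hypernym
    # hierarchy, is named after the target category (e.g. "body_part").
    if synset.name().split('.')[0] == target:
        return True
    return any(traverse(h, target) for h in synset.hypernyms())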
Example #8
 def lemmatize(word):
     if word in ShannonEntropy.lemmas:
         lemma = ShannonEntropy.lemmas[word]
     else:
         lemma = textblob.Word(word).lemmatize()
         ShannonEntropy.lemmas[word] = lemma
     return lemma
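The ShannonEntropy.lemmas dict is a hand-rolled memo cache, presumably to avoid repeated WordNet lookups for the same word. The same effect can be had with functools.lru_cache, sketched here as an alternative:

import functools

import textblob

@functools.lru_cache(maxsize=None)
def lemmatize(word):
    # Each distinct word is lemmatized once; repeat calls hit the cache.
    return textblob.Word(word).lemmatize()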
Example #9
    def iter_filth(
            self,
            text,
            document_name: Optional[str] = None
    ) -> Generator[Filth, None, None]:
        """Yields discovered filth in the provided ``text``.

        :param text: The dirty text to clean.
        :type text: str
        :param document_name: The name of the document to clean.
        :type document_name: str, optional
        :return: An iterator to the discovered :class:`Filth`
        :rtype: Iterator[:class:`Filth`]
        """

        # find 'skype' in the text using a customized tokenizer. this makes
        # sure that all valid skype usernames are kept as tokens and not split
        # into different words
        tokenizer = nltk.tokenize.regexp.RegexpTokenizer(self.SKYPE_TOKEN)
        blob = textblob.TextBlob(text, tokenizer=tokenizer)
        skype_indices, tokens = [], []
        for i, token in enumerate(blob.tokens):
            tokens.append(token)
            if 'skype' in token.lower():
                skype_indices.append(i)

        # go through the words before and after skype words to identify
        # potential skype usernames.
        skype_usernames = []
        for i in skype_indices:
            jmin = max(i - self.word_radius, 0)
            jmax = min(i + self.word_radius + 1, len(tokens))
            for j in list(range(jmin, i)) + list(range(i + 1, jmax)):
                token = tokens[j]
                if self.SKYPE_USERNAME.match(token):

                    # this token is a valid skype username. Most skype
                    # usernames appear to be misspelled words. Word.spellcheck
                    # does not handle the situation of an all caps word very
                    # well, so we cast these to all lower case before checking
                    # whether the word is misspelled
                    if token.isupper():
                        token = token.lower()
                    word = textblob.Word(token)
                    suggestions = word.spellcheck()
                    corrected_word, score = suggestions[0]
                    if score < 0.5:
                        skype_usernames.append(token)

        # replace all skype usernames
        if skype_usernames:
            self.regex = re.compile('|'.join(skype_usernames))
            yield from super(SkypeDetector,
                             self).iter_filth(text,
                                              document_name=document_name)

        return
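The threshold logic above leans on Word.spellcheck() returning (word, confidence) pairs sorted by confidence (compare Examples #29 and #30). A quick illustration of the value the score < 0.5 test is applied to:

import textblob

suggestions = textblob.Word('speling').spellcheck()
# suggestions[0] is the best ('word', confidence) pair, e.g. ('spelling', ...);
# a token whose best suggestion scores below 0.5 is unlikely to be an
# ordinary misspelled word, so it is treated as a candidate username.
print(suggestions[0])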
Example #10
def is_motor_vehicle(word):
    #print "Entered is_animal with ", word
    w = textblob.Word(word).synsets
    if len(w) == 0:
        return False
    for ss in w:
        if traverse(ss, "motor_vehicle"):
            return True

    return False
Example #11
 def run(self, message):
     try:
         defs = textblob.Word(message).definitions
     except RuntimeError:
         # a RuntimeError from the lookup is handled by retrying once
         defs = textblob.Word(message).definitions
     s = ""
     if len(defs) > 0:
         for item in defs[0:4]:
             s += item.capitalize() + ".\n"
         self.manager.say(s)
     else:
         self.manager.say("No result found.")
Example #12
def patriarchy(word_ending_ess):
    w = textblob.Word(word_ending_ess)
    return {
        (lemma_name, ess_synset.lexname(),
         any(x in ess_synset.definition()
             for x in ('woman', 'girl', 'female')))
        for ess_synset in w.synsets if ess_synset.lexname() == u'noun.person'
        for ess_hyp_nym in ess_synset.hypernyms() + ess_synset.hyponyms()
        for lemma_name in ess_hyp_nym.lemma_names()
        if lemma_name != word_ending_ess
        if len(lemma_name) < len(word_ending_ess) if len(
            list(
                itertools.takewhile(lambda t: t[0] == t[1],
                                    zip(word_ending_ess, lemma_name)))) >= 3
    }
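The set comprehension is dense; the same logic unrolled into explicit loops (equivalent, just easier to follow) reads:

import itertools

import textblob

def patriarchy_expanded(word_ending_ess):
    w = textblob.Word(word_ending_ess)
    results = set()
    for ess_synset in w.synsets:
        if ess_synset.lexname() != 'noun.person':
            continue
        for ess_hyp_nym in ess_synset.hypernyms() + ess_synset.hyponyms():
            for lemma_name in ess_hyp_nym.lemma_names():
                # Require a strictly shorter lemma that shares a prefix
                # of at least three characters with the original word.
                common_prefix = list(itertools.takewhile(
                    lambda t: t[0] == t[1], zip(word_ending_ess, lemma_name)))
                if (lemma_name != word_ending_ess
                        and len(lemma_name) < len(word_ending_ess)
                        and len(common_prefix) >= 3):
                    results.add((lemma_name, ess_synset.lexname(),
                                 any(x in ess_synset.definition()
                                     for x in ('woman', 'girl', 'female'))))
    return results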
Example #13
def annotate(sent):
    global TAGGER
    ts = TAGGER.tag(sent)

    for raw, pos in ts:
        pos_kind = pos[0].lower()
        w = textblob.Word(raw.lower())
        root = str(w)

        if is_not_word(raw[0]) or (pos == "SYM"):
            pos = "."
        elif pos_kind in ["n", "v"]:
            root = w.lemmatize(pos_kind)

        yield WordNode(raw=raw, pos=pos, root=root)
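TAGGER (some POS tagger exposing .tag()), is_not_word, and WordNode belong to the surrounding module. A minimal sketch of plausible definitions for the latter two, inferred from how annotate() uses them:

import collections

# Hypothetical definitions, assumed from usage in annotate():
WordNode = collections.namedtuple('WordNode', ['raw', 'pos', 'root'])

def is_not_word(ch):
    # The caller passes the first character of a token; treat anything
    # non-alphabetic (punctuation, digits, symbols) as "not a word".
    return not ch.isalpha()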
Example #14
 def test_spellcheck_special_cases(self):
     # Punctuation
     assert_equal(tb.Word("!").spellcheck(), [("!", 1.0)])
     # Numbers
     assert_equal(tb.Word("42").spellcheck(), [("42", 1.0)])
     assert_equal(tb.Word("12.34").spellcheck(), [("12.34", 1.0)])
     # One-letter words
     assert_equal(tb.Word("I").spellcheck(), [("I", 1.0)])
     assert_equal(tb.Word("A").spellcheck(), [("A", 1.0)])
     assert_equal(tb.Word("a").spellcheck(), [("a", 1.0)])
Example #15
    def iter_filth(self, text):

        # find 'skype' in the text using a customized tokenizer. this makes
        # sure that all valid skype usernames are kept as tokens and not split
        # into different words
        tokenizer = nltk.tokenize.regexp.RegexpTokenizer(
            self.filth_cls.SKYPE_TOKEN
        )
        blob = textblob.TextBlob(text, tokenizer=tokenizer)
        skype_indices, tokens = [], []
        for i, token in enumerate(blob.tokens):
            tokens.append(token)
            if 'skype' in token.lower():
                skype_indices.append(i)

        # go through the words before and after skype words to identify
        # potential skype usernames.
        skype_usernames = []
        for i in skype_indices:
            jmin = max(i-self.word_radius, 0)
            jmax = min(i+self.word_radius+1, len(tokens))
            for j in list(range(jmin, i)) + list(range(i+1, jmax)):
                token = tokens[j]
                if self.filth_cls.SKYPE_USERNAME.match(token):

                    # this token is a valid skype username. Most skype
                    # usernames appear to be misspelled words. Word.spellcheck
                    # does not handle the situation of an all caps word very
                    # well, so we cast these to all lower case before checking
                    # whether the word is misspelled
                    if token.isupper():
                        token = token.lower()
                    word = textblob.Word(token)
                    suggestions = word.spellcheck()
                    corrected_word, score = suggestions[0]
                    if score < 0.5:
                        skype_usernames.append(token)

        # replace all skype usernames
        if skype_usernames:
            self.filth_cls.regex = re.compile('|'.join(skype_usernames))
        else:
            self.filth_cls.regex = None
        return super(SkypeDetector, self).iter_filth(text)
Example #16
def remove_noise(text, stop_words=()):
    cleaned_text = ''

    for token, tag in text.pos_tags:
        token = re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", token)
        #token = re.sub("(@[A-Za-z0-9_]+)","", token)

        if tag.startswith("NN"):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'

        token = tb.Word(token).lemmatize(pos)

        if len(token) > 0 and token not in string.punctuation and token.lower() not in stop_words:
            cleaned_text += token.lower().strip() + ' '
    return tb.TextBlob(cleaned_text.strip())
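A short usage sketch (the input blob and stop-word list are illustrative):

import textblob as tb

blob = tb.TextBlob("The cars were quickly driven to the garage")
print(remove_noise(blob, stop_words=('the', 'to', 'a')))
# tokens come back lemmatized, lower-cased, and stripped of mentions,
# punctuation, URLs, and stop words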
Example #17
def lookup_word(term):
    return term if isinstance(term, textblob.Word) else textblob.Word(term)
Example #18
 def test_translate(self, mock_translate):
     mock_translate.return_value = 'gato'
     assert_equal(tb.Word("cat").translate(to="es"), "gato")
Example #19
 def test_init(self):
     tb.Word("cat")
     assert_true(isinstance(self.cat, tb.Word))
     word = tb.Word('cat', 'NN')
     assert_equal(word.pos_tag, 'NN')
Example #20
 def test_lemma(self):
     lemma = wn.Lemma('eat.v.01.eat')
     word = tb.Word("eat")
     assert_equal(word.synsets[0].lemmas()[0], lemma)
Example #21
 def test_translate_without_from_lang(self, mock_translate):
     mock_translate.return_value = 'hi'
     assert_equal(tb.Word('hola').translate(), 'hi')
Example #22
 def test_define(self):
     w = tb.Word("hack")
     synsets = w.get_synsets(wn.NOUN)
     definitions = w.define(wn.NOUN)
     assert_equal(len(synsets), len(definitions))
Example #23
 def test_synset(self):
     syn = wn.Synset("dog.n.01")
     word = tb.Word("dog")
     assert_equal(word.synsets[0], syn)
Example #24
 def test_synsets_with_pos_argument(self):
     w = tb.Word("work")
     noun_syns = w.get_synsets(pos=wn.NOUN)
     for synset in noun_syns:
         assert_equal(synset.pos(), wn.NOUN)
Example #25
 def test_definitions(self):
     w = tb.Word("octopus")
     for definition in w.definitions:
         print(type(definition))
         assert_true(isinstance(definition, str))
Example #26
 def test_synsets(self):
     w = tb.Word("car")
     assert_true(isinstance(w.synsets, (list, tuple)))
     assert_true(isinstance(w.synsets[0], Synset))
Example #27
 def test_detect_language(self, mock_detect):
     mock_detect.return_value = 'fr'
     assert_equal(tb.Word("bonjour").detect_language(), 'fr')
Example #28
 def test_lemma(self):
     w = tb.Word("wolves")
     assert_equal(w.lemma, "wolf")
     w = tb.Word("went", "VBD");
     assert_equal(w.lemma, "go")
Example #29
 def test_spellcheck(self):
     blob = tb.Word("speling")
     suggestions = blob.spellcheck()
     assert_equal(suggestions[0][0], "spelling")
Example #30
 def test_correct(self):
     w = tb.Word('speling')
     correct = w.correct()
     assert_equal(correct, tb.Word('spelling'))
     assert_true(isinstance(correct, tb.Word))
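correct() is essentially a convenience wrapper that returns the top spellcheck() suggestion as a Word, so Examples #29 and #30 exercise the same machinery:

w = tb.Word('speling')
assert w.correct() == w.spellcheck()[0][0]  # both yield 'spelling'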