Example #1
def predict(self, text: Iterable[list[str]]) -> list[list[str]]:
    # WordList comes from polyglot.text; self.model is a polyglot
    # annotator (e.g. NEChunker) that yields (word, entity) pairs.
    preds = []
    for words in text:
        word_ents = list(
            self.model.annotate(WordList(words, language='da')))
        # Keep only the entity labels, discarding the words.
        preds.append([ent for word, ent in word_ents])
    return preds
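A minimal usage sketch, assuming the enclosing class wires a polyglot NEChunker into self.model; the class name, constructor, and example data below are assumptions, only predict() comes from the snippet above.

from typing import Iterable

from polyglot.tag import NEChunker
from polyglot.text import WordList


class PolyglotNerWrapper:
    # Hypothetical wrapper; the original snippet only shows predict().
    def __init__(self, lang: str = 'da'):
        self.model = NEChunker(lang=lang)

    def predict(self, text: Iterable[list[str]]) -> list[list[str]]:
        preds = []
        for words in text:
            word_ents = list(
                self.model.annotate(WordList(words, language='da')))
            preds.append([ent for word, ent in word_ents])
        return preds


ner = PolyglotNerWrapper()
print(ner.predict([['Jens', 'bor', 'i', 'København', '.']]))
# e.g. [['I-PER', 'O', 'O', 'I-LOC', 'O']] -- actual labels depend on the model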
Example #2
def benchmark_polyglot_mdl():
    """
    Running polyglot requires these packages:
    # Morfessor==2.0.6
    # PyICU==2.4.2
    # pycld2==0.41
    # polyglot
    """
    from polyglot.tag import NEChunker
    from polyglot.text import WordList

    start = time.time()

    predictions = []
    for tokens in sentences_tokens:
        word_list = WordList(tokens, language='da')
        ne_chunker = NEChunker(lang='da')
        word_ent_tuples = list(ne_chunker.annotate(word_list))

        predictions.append([entity for word, entity in word_ent_tuples])
    print('polyglot:')
    print_speed_performance(start, num_sentences, num_tokens)
    assert len(predictions) == len(sentences_entities)

    print(f1_report(sentences_entities, remove_miscs(predictions), bio=True))
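The benchmark relies on module-level fixtures from the surrounding script (sentences_tokens, sentences_entities, num_sentences, num_tokens, remove_miscs, plus the repo's own f1_report and print_speed_performance helpers). A rough sketch of their shapes, with hypothetical data and a guessed remove_miscs body:

import time

# Parallel lists: tokenized sentences and their gold BIO entity tags.
sentences_tokens = [['Jens', 'bor', 'i', 'København', '.']]   # hypothetical
sentences_entities = [['B-PER', 'O', 'O', 'B-LOC', 'O']]      # hypothetical
num_sentences = len(sentences_tokens)
num_tokens = sum(len(s) for s in sentences_tokens)


def remove_miscs(tag_lists):
    # Assumption: maps MISC entities to 'O' so the prediction tagset
    # matches the gold annotations before scoring.
    return [['O' if tag.endswith('MISC') else tag for tag in tags]
            for tags in tag_lists]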
Example #3
def benchmark_polyglot_mdl(corrected_output=False):
    """
    Running polyglot requires these packages:
    # Morfessor==2.0.6
    # PyICU==2.4.2
    # pycld2==0.41
    # polyglot

    """
    from polyglot.tag import POSTagger
    from polyglot.text import WordList

    def udify_tag(tag, word):
        # Map polyglot's tagset onto Universal Dependencies:
        # CONJ becomes CCONJ, and known auxiliary verbs become AUX.
        if tag == "CONJ":
            return "CCONJ"
        if tag == "VERB" and word in auxiliary_verbs:
            return "AUX"
        return tag

    start = time.time()

    tags_pred = []
    for tokens in sentences_tokens:
        word_list = WordList(tokens, language='da')
        tagger = POSTagger(lang='da')
        word_tag_tuples = list(tagger.annotate(word_list))
        tags_pred.append([
            udify_tag(tag, word) if corrected_output else tag
            for word, tag in word_tag_tuples
        ])
    print('**Polyglot model' +
          (' (corrected output)' if corrected_output else '') + '**')
    print_speed_performance(start, num_sentences, num_tokens)

    assert len(tags_pred) == num_sentences
    assert sum([len(s) for s in tags_pred]) == num_tokens

    print(accuracy_report(tags_true, tags_pred), end="\n\n")
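udify_tag also depends on a module-level auxiliary_verbs collection that is not shown. A hedged sketch of what it plausibly contains for Danish (the actual list in the source may differ):

# Hypothetical: inflected forms of Danish auxiliaries that UD tags as AUX
# but polyglot tags as VERB ('være', 'have', and the modal verbs).
auxiliary_verbs = {
    'er', 'var', 'være', 'været',          # være (to be)
    'har', 'havde', 'have', 'haft',        # have (to have)
    'kan', 'kunne', 'skal', 'skulle',      # modals
    'vil', 'ville', 'må', 'måtte', 'bør', 'burde',
}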
Example #4
def benchmark_polyglot_mdl():
    """
    Running polyglot requires these packages:
    # Morfessor==2.0.6
    # PyICU==2.4.2
    # pycld2==0.41
    # polyglot
    """
    from polyglot.tag import NEChunker
    from polyglot.text import WordList

    start = time.time()

    predictions = []
    for tokens in sentences_tokens:
        word_list = WordList(tokens, language='da')
        ne_chunker = NEChunker(lang='da')
        word_ent_tuples = list(ne_chunker.annotate(word_list))

        predictions.append([entity for word, entity in word_ent_tuples])

    print("Made predictions on {} sentences and {} tokens in {}s".format(
        num_sentences, num_tokens, time.time() - start))
    assert len(predictions) == len(sentences_entities)

    print(classification_report(sentences_entities, remove_miscs(predictions),
                                digits=4))
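Unlike Example #2, this variant scores with classification_report. Because the predictions are nested per-sentence BIO tag lists, the import is presumably seqeval's sequence-level report rather than sklearn's; a hedged guess at the assumed import:

# Assumption: seqeval scores lists of BIO-tagged sentences directly;
# sklearn's classification_report would require flat label arrays instead.
from seqeval.metrics import classification_report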
Example #5
def benchmark_polyglot_mdl():
    """
    Running ployglot requires these packages:
    # Morfessor==2.0.6
    # PyICU==2.4.2
    # pycld2==0.41
    # polyglot
    
    """

    from polyglot.tag import POSTagger
    from polyglot.text import WordList

    start = time.time()

    tags_pred = []
    for tokens in sentences_tokens:
        word_list = WordList(tokens, language='da')
        tagger = POSTagger(lang='da')
        word_tag_tuples = list(tagger.annotate(word_list))

        tags_pred.append([tag for word, tag in word_tag_tuples])
    print('**Polyglot model**')
    print("Made predictions on {} sentences and {} tokens in {}s".format(
        num_sentences, num_tokens,
        time.time() - start))

    assert len(tags_pred) == num_sentences
    assert sum([len(s) for s in tags_pred]) == num_tokens

    print(classification_report(tags_true, tags_pred, digits=4))
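One caveat: tags_pred is a list of per-sentence tag lists. If classification_report here is sklearn's, it expects flat 1-D label sequences, so a flattening step would be needed; seqeval's accepts the nested shape as-is. A hedged sketch of the flattening, in case the sklearn variant is intended:

from itertools import chain

# Only needed for sklearn.metrics.classification_report, which scores
# flat label arrays rather than nested sentences.
flat_true = list(chain.from_iterable(tags_true))
flat_pred = list(chain.from_iterable(tags_pred))
# print(classification_report(flat_true, flat_pred, digits=4))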
Example #6
File: text.py  Project: dari28/RebuildPR
    def tokens(self):
        """Return a list of tokens, using this blob's tokenizer object
        (defaults to :class:`WordTokenizer <textblob.tokenizers.WordTokenizer>`).
        """
        seq = self.word_tokenizer.transform(Sequence(self.raw))
        tokens = WordList(seq.tokens(),
                          parent=self,
                          language=self.language.code)

        fix_hyphen = []
        i = 0
        # SIDE DELETE
        # while i < len(tokens):
        #     hyphen_word = ''
        #     while i + 3 < len(tokens) and tokens[i+1] == '-' and tokens[i+2] not in string.punctuation:
        #         if tokens[i+3] == '-':
        #             hyphen_word += tokens[i] + tokens[i+1]
        #             i += 2
        #             if i + 2 < len(tokens):
        #                 if tokens[i+1] == '-' and tokens[i+2] not in string.punctuation:
        #                     hyphen_word += tokens[i] + tokens[i + 1] + tokens[i+2]
        #                     # i+=3  # SIDE delete error [list out bound]
        #                     i += 1  # SIDE ADD
        #                     if tokens[i] != '-':
        #                         break
        #         else:
        #             hyphen_word += tokens[i] + tokens[i + 1] + tokens[i + 2]
        #             i += 3
        #             if tokens[i] != '-':
        #                 break
        #     if hyphen_word:
        #         fix_hyphen.append(hyphen_word)
        #         continue
        #     else:
        #         if i + 2 < len(tokens):
        #             if tokens[i] not in string.punctuation and tokens[i+1] == '-' and tokens[i+2] not in string.punctuation:
        #                     fix_hyphen.append(tokens[i]+tokens[i+1]+tokens[i+2])
        #                     i += 3
        #                     continue
        #     fix_hyphen.append(tokens[i])
        #     i+=1

        # SIDE ADD
        # Re-join hyphenated words: when a '-' follows a kept token and the
        # next token is not punctuation, glue '-<next>' onto that token.
        while i < len(tokens):
            hyphen_word = ''
            if fix_hyphen and tokens[i] == '-' and i + 1 < len(
                    tokens) and tokens[i + 1] not in string.punctuation:
                hyphen_word = tokens[i] + tokens[i + 1]
            if hyphen_word:
                fix_hyphen[-1] = fix_hyphen[-1] + hyphen_word
                i += 1  # also consume the token after the hyphen
            else:
                fix_hyphen.append(tokens[i])
            i += 1

        if self.split_apostrophe:
            # Split tokens on apostrophes, keeping each apostrophe as its
            # own token between the pieces.
            fix_apostrophe = []
            for token in fix_hyphen:
                if '\'' in token:
                    split = token.split('\'')
                    for i, t in enumerate(split):
                        fix_apostrophe.append(t)
                        if i != len(split) - 1:
                            fix_apostrophe.append('\'')
                else:
                    fix_apostrophe.append(token)
            return WordList(fix_apostrophe,
                            parent=self,
                            language=self.language.code)
        else:
            return WordList(fix_hyphen,
                            parent=self,
                            language=self.language.code)
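A short illustration of the two passes, with hypothetical tokenizer output:

# Hyphen pass:     ['a', 'well', '-', 'known', 'fact'] -> ['a', 'well-known', 'fact']
# Apostrophe pass: ["don't"] -> ['don', "'", 't']   (when split_apostrophe is True)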