Пример #1
0
def benchmark_polyglot_mdl():
    """
    Benchmark the polyglot POS tagger (lang='da') on the evaluation sentences.

    Tags every token list in the module-level ``sentences_tokens``, prints
    the elapsed time, checks that the number of predicted sentences and tags
    matches ``num_sentences`` / ``num_tokens``, and prints a classification
    report of the predictions against ``tags_true``.

    The tagger is constructed once, after the timer has started, so the
    reported time covers one tagger construction plus the tagging of all
    sentences.

    Running polyglot requires these packages:
    # Morfessor==2.0.6
    # PyICU==2.4.2
    # pycld2==0.41
    # polyglot

    """

    start = time.time()

    # The tagger does not depend on the sentence being tagged, so build it
    # once instead of once per sentence.
    tagger = POSTagger(lang='da')

    tags_pred = []
    for tokens in sentences_tokens:
        word_list = WordList(tokens, language='da')
        # annotate() yields (word, tag) pairs; only the tags are scored.
        tags_pred.append([tag for _word, tag in tagger.annotate(word_list)])

    print('**Polyglot model**')
    print("Made predictions on {} sentences and {} tokens in {}s".format(
        num_sentences, num_tokens,
        time.time() - start))

    # Sanity checks: one prediction list per sentence, one tag per token.
    assert len(tags_pred) == num_sentences
    assert sum(len(s) for s in tags_pred) == num_tokens

    print(classification_report(tags_true, tags_pred, digits=4))
Пример #2
0
def benchmark_polyglot_mdl(corrected_output=False):
    """
    Benchmark the polyglot POS tagger (lang='da') on the evaluation sentences.

    Tags every token list in the module-level ``sentences_tokens``, prints
    the speed figures via ``print_speed_performance``, checks that the number
    of predicted sentences and tags matches ``num_sentences`` /
    ``num_tokens``, and prints an accuracy report of the predictions against
    ``tags_true``.

    The tagger is constructed once, after the timer has started, so the
    measured time covers one tagger construction plus the tagging of all
    sentences.

    :param corrected_output: when true, each predicted tag is passed through
        ``udify_tag`` before scoring; otherwise the raw tags are used.

    Running polyglot requires these packages:
    # Morfessor==2.0.6
    # PyICU==2.4.2
    # pycld2==0.41
    # polyglot

    """
    def udify_tag(tag, word):
        """Rewrite a predicted tag: CONJ -> CCONJ, and VERB -> AUX when the
        word is in ``auxiliary_verbs``; any other tag is returned unchanged.

        NOTE(review): presumably this aligns polyglot's tags with the
        Universal Dependencies tag set used by ``tags_true`` -- confirm.
        """
        if tag == "CONJ":
            return "CCONJ"
        if tag == "VERB" and word in auxiliary_verbs:
            return "AUX"
        return tag

    start = time.time()

    # The tagger does not depend on the sentence being tagged, so build it
    # once instead of once per sentence.
    tagger = POSTagger(lang='da')

    tags_pred = []
    for tokens in sentences_tokens:
        word_list = WordList(tokens, language='da')
        # annotate() yields (word, tag) pairs.
        tags_pred.append([
            udify_tag(tag, word) if corrected_output else tag
            for word, tag in tagger.annotate(word_list)
        ])
    print('**Polyglot model' +
          (' (corrected output) ' if corrected_output else '') + '**')
    print_speed_performance(start, num_sentences, num_tokens)

    # Sanity checks: one prediction list per sentence, one tag per token.
    assert len(tags_pred) == num_sentences
    assert sum(len(s) for s in tags_pred) == num_tokens

    print(accuracy_report(tags_true, tags_pred), end="\n\n")
Пример #3
0
def tagOnly(
    text: str,
    lng: str,
):
    """Tag *text* line by line and return a flat list of ``[word, tag]`` pairs.

    The text is split on newlines and each line is split on whitespace
    before being handed to a ``POSTagger`` built for language *lng*.
    A ``["\\n", "NL"]`` marker is inserted before a line's pairs whenever
    the result already holds at least one entry, so lines that come before
    the first tagged word leave no marker.

    The three inputs are also echoed to stdout.
    """
    print("TAG=====(tagOnly)")
    print("TEXT=" + text)
    print("LNG=" + lng)

    pos_tagger = POSTagger(lang=lng)
    result = []
    for raw_line in text.split("\n"):
        tokens = raw_line.strip().split()
        annotated = pos_tagger.annotate(tokens)
        # Only emit a line separator once something precedes it.
        if result:
            result.append(["\n", "NL"])
        for word, pos in annotated:
            result.append([word, pos])
    return result
Пример #4
0
def pos_tag(args):
    """Build a ``POSTagger`` for ``args.lang`` and pass it, with *args*, to ``tag``."""
    tag(POSTagger(lang=args.lang), args)
Пример #5
0
 def pos_tagger(self):
     """Return a ``POSTagger`` built for ``self.language.code``."""
     language_code = self.language.code
     return POSTagger(lang=language_code)