Пример #1
0
def repl(model, lm, emissions, cfd):
    """Read lines from standard input and print a tagging for each one.

    Each line is split into sentences and words with NLTK, POS-tagged with
    the tagger returned by ``stanford.get_tagger()``, lemmatized through
    ``learn.maybe_lemmatize`` and then handed to the search procedure
    selected by ``model``.  The loop ends on end-of-input or Ctrl-C.

    Args:
        model: one of ``"unigram"``, ``"bigram"`` or ``"trigram"``; selects
            ``skinnyhmm.mfs``, ``skinnyhmm.viterbi`` or ``searches.beam``
            respectively.
        lm: passed through to the bigram/trigram search (unused for
            ``"unigram"``).
        emissions: passed through to the bigram/trigram search (unused for
            ``"unigram"``).
        cfd: passed through to whichever search is selected.

    Raises:
        ValueError: if ``model`` is not one of the supported names.
    """
    # Reject unknown models before prompting; otherwise `tagged` would be
    # unbound and the failure would only surface as a NameError after the
    # first line had been processed.
    known_models = ("unigram", "bigram", "trigram")
    if model not in known_models:
        raise ValueError("unknown model {!r}; expected one of {}".format(
            model, ", ".join(known_models)))

    # The tagger does not depend on the input line, so build it once.
    tagger = stanford.get_tagger()

    while True:
        try:
            line = input('>')
        except (EOFError, KeyboardInterrupt):
            # End of input (or Ctrl-C) terminates the session quietly.
            break
        line = line.strip()

        # Flatten all sentences of the line into a single token sequence.
        tokenized = []
        for sent in nltk.sent_tokenize(line):
            tokenized.extend(nltk.word_tokenize(sent))

        postags = [tag.lower() for (_word, tag) in tagger.tag(tokenized)]

        # maybe_lemmatize takes a batch of sentences; we pass a batch of one
        # and take its single result.
        # NOTE(review): tt_home is a module-level name not visible here --
        # presumably the lemmatizer's install directory; confirm.
        lemmas = learn.maybe_lemmatize([tokenized], 'en', tt_home)[0]

        ss = [nltk.tag.tuple2str(pair) for pair in zip(lemmas, postags)]
        print(" ".join(ss))

        if model == "unigram":
            tagged = skinnyhmm.mfs(cfd, ss)
        elif model == "bigram":
            tagged = skinnyhmm.viterbi(lm, emissions, cfd, ss)
        else:  # "trigram" -- guaranteed by the validation above
            tagged = searches.beam(lm, emissions, cfd, ss, beamwidth=BEAMWIDTH)
        print(tagged)
Пример #2
0
def main():
    """Tag a whitespace-tokenized text file and write ``<file>.pretagged``.

    Command-line options (both required):
        --sourcetext  path of the input file, one sentence per line
        --taggerhome  value stored in ``stanford.taggerhome`` before the
                      tagger is created

    Every input line is stripped and split on whitespace, the resulting
    sentences are tagged in one batch, and each tagged sentence is written
    as a single space-joined line to ``<sourcetext>.pretagged``.
    """
    argparser = argparse.ArgumentParser(description="clwsd")
    for flag in ("--sourcetext", "--taggerhome"):
        argparser.add_argument(flag, type=str, required=True)
    options = argparser.parse_args()

    # The tagger location must be set before get_tagger() is called.
    stanford.taggerhome = options.taggerhome
    source_path = options.sourcetext
    pos_tagger = stanford.get_tagger()

    sents = []
    with open(source_path) as infile:
        for raw_line in infile:
            sents.append(raw_line.strip().split())

    tagged_sents = pos_tagger.batch_tag(sents)
    print("tagged.")

    with open(source_path + ".pretagged", "w") as outfile:
        for tagged_sent in tagged_sents:
            rendered = [nltk.tag.tuple2str(pair) for pair in tagged_sent]
            print(" ".join(rendered), file=outfile)
Пример #3
0
def extract_wsd_problems(fn):
    """Parse ``fn`` and return a list of POS-tagged ``WSDProblem`` objects.

    The file is parsed with a SAX parser driven by a ``SentenceExtractor``
    handler.  Each entry of ``handler.sentences`` is a 4-tuple
    ``(lexelt, head_count, context, inst)``; the head count is not used.
    One ``WSDProblem`` is built per entry (with ``testset=True``), all
    problems' ``tokenized`` sequences are tagged in a single batch, and the
    result is stored on each problem as ``problem.tagged``.

    Args:
        fn: the source handed to the SAX parser's ``parse`` method.

    Returns:
        The list of problems, in the order the handler collected them.
    """
    extractor = SentenceExtractor()
    sax_parser = make_parser()
    sax_parser.setContentHandler(extractor)
    sax_parser.parse(fn)

    problems = [
        WSDProblem(lexelt, context, instance_id=inst, testset=True)
        for (lexelt, _head_count, context, inst) in extractor.sentences
    ]

    token_lists = [problem.tokenized for problem in problems]
    pos_tagger = stanford.get_tagger()
    tagged_sents = pos_tagger.batch_tag(token_lists)

    # The tagger must return exactly one tagged sentence per problem.
    assert len(tagged_sents) == len(problems)
    for problem, tagged_sent in zip(problems, tagged_sents):
        problem.tagged = tagged_sent
    print("tagged.")
    return problems