예제 #1
0
def main(argv=None):
    """Train a MaxEnt classifier for one source word and report trial results.

    Parses the command line, builds training instances with
    ``get_training_data``, trains an NLTK ``MaxentClassifier`` using the
    megam backend, and then classifies every problem extracted from
    ``../trialdata/alltrials/<sourceword>.data``.  For each problem it prints
    the one-best output line, whether the prediction equals the gold label,
    and the probability the classifier assigns to every label.

    Args:
        argv: Optional list of command-line arguments.  ``None`` (the
            default) lets argparse fall back to ``sys.argv[1:]``, so existing
            ``main()`` callers behave as before.

    Raises:
        SystemExit: Raised by argparse when a required option is missing or
            ``--targetlang`` is not one of the supported languages.
    """
    all_target_languages = "de es fr it nl".split()

    parser = argparse.ArgumentParser(description='clwsd')
    parser.add_argument('--sourceword', type=str, required=True)
    ## Validate via argparse instead of ``assert``: asserts are stripped
    ## under ``python -O`` and produce no usage message for the user.
    parser.add_argument('--targetlang', type=str, required=True,
                        choices=all_target_languages)
    parser.add_argument('--taggerhome', type=str, required=True)
    args = parser.parse_args(argv)

    target = args.targetlang
    sourceword = args.sourceword
    ## The tagger location is communicated through a module attribute.
    stanford.taggerhome = args.taggerhome

    gold_answers = read_gold.get_gold_answers(sourceword, target)
    instances = get_training_data(sourceword, target)
    print("... training ...")
    nltk.classify.megam.config_megam(bin='/usr/local/bin/megam')
    classifier = MaxentClassifier.train(instances, trace=0, algorithm='megam')
    print("LABELS", classifier.labels())

    fn = "../trialdata/alltrials/{0}.data".format(sourceword)
    problems = extract_wsd_problems(fn)
    for problem in problems:
        featureset = features.extract(problem)
        answer = classifier.classify(featureset)
        print(output_one_best(problem, target, answer))
        ## Gold labels are keyed by the problem's instance id.
        label = gold_answers[problem.instance_id]
        print("CORRECT" if label == answer else "WRONG")
        print("distribution was...")
        dist = classifier.prob_classify(featureset)
        for key in dist.samples():
            print(" ", key, dist.prob(key))
예제 #2
0
def main(argv=None):
    """Classify the trial data for one source word and compare with gold.

    Parses the command line, obtains a classifier from
    ``get_maxent_classifier``, and classifies every problem extracted from
    ``../trialdata/alltrials/<sourceword>.data``.  For each problem it prints
    ``problem.tokenized``, the predicted answer, CORRECT/WRONG, and the gold
    label.

    Args:
        argv: Optional list of command-line arguments.  ``None`` (the
            default) lets argparse fall back to ``sys.argv[1:]``, so existing
            ``main()`` callers behave as before.

    Raises:
        SystemExit: Raised by argparse when a required option is missing or
            ``--targetlang`` is not one of the supported languages.
    """
    all_target_languages = "de es fr it nl".split()

    parser = argparse.ArgumentParser(description='clwsd')
    parser.add_argument('--sourceword', type=str, required=True)
    ## Validate via argparse instead of ``assert``: asserts are stripped
    ## under ``python -O`` and produce no usage message for the user.
    parser.add_argument('--targetlang', type=str, required=True,
                        choices=all_target_languages)
    ## Still accepted so existing command lines keep working; nothing in
    ## this function reads the value.
    parser.add_argument('--classifier', type=str, nargs=1, required=False)
    ## Formerly a hard-coded path; the default keeps the old behaviour.
    parser.add_argument(
        '--taggerhome', type=str,
        default="/home/alex/software/stanford-postagger-2012-11-11")
    args = parser.parse_args(argv)

    target = args.targetlang
    sourceword = args.sourceword
    nltk.classify.megam.config_megam(bin='/usr/local/bin/megam')
    classifier = get_maxent_classifier(sourceword, target)

    fn = "../trialdata/alltrials/{0}.data".format(sourceword)
    ## Assigned at the same point as before (after the classifier is
    ## obtained, before problems are extracted) to keep the original order.
    stanford.taggerhome = args.taggerhome
    problems = extract_wsd_problems(fn)
    gold_answers = read_gold.get_gold_answers(sourceword, target)
    for problem in problems:
        featureset = features.extract(problem)
        answer = classifier.classify(featureset)
        print(problem.tokenized)
        print(answer)
        ## Gold labels are keyed by the problem's instance id.
        label = gold_answers[problem.instance_id]
        print("CORRECT" if label == answer else "WRONG", end=" ")
        print("should be:", label)
예제 #3
0
def get_training_data(sourceword, target):
    """Build the training set as a list of (featureset, label) pairs.

    The gold answers for ``sourceword``/``target`` are fetched first, then
    the training problems for ``sourceword``.  Each problem contributes one
    pair: the features extracted from it and the gold label stored under its
    ``instance_id``.  The label lookup uses plain indexing, so an id missing
    from the gold answers propagates whatever error that mapping raises.
    """
    ## instance id -> gold label
    labels_by_id = read_gold.get_gold_answers(sourceword, target)

    def labelled_instance(problem):
        ## Pair one problem's features with its gold label.
        instance_id = problem.instance_id
        extracted = features.extract(problem)
        return (extracted, labels_by_id[instance_id])

    return [labelled_instance(problem)
            for problem in get_training_problems(sourceword)]