예제 #1
0
def main():
    """Classify the trial instances of one source word and compare with gold.

    Command-line flags:
      --sourceword  (required) word whose trial data file
                    ``../trialdata/alltrials/<sourceword>.data`` is read.
      --targetlang  (required) one of ``de``, ``es``, ``fr``, ``it``, ``nl``.
      --classifier  (optional) accepted, but not read by this function.
      --taggerhome  (optional) value assigned to ``stanford.taggerhome``;
                    defaults to the path that used to be hard-coded here.

    For every problem in the trial file this prints the tokenized text, the
    classifier's answer, and a CORRECT/WRONG verdict followed by the gold
    label.

    Raises:
        SystemExit: via argparse, when a required flag is missing or the
            target language is not one of the supported values.
        KeyError: when a problem's ``instance_id`` is absent from the gold
            answers.
    """
    all_target_languages = "de es fr it nl".split()

    parser = argparse.ArgumentParser(description='clwsd')
    parser.add_argument('--sourceword', type=str, required=True)
    # `choices` lets argparse reject an unsupported language with a usage
    # error; an `assert` would silently disappear under `python -O`.
    parser.add_argument('--targetlang', type=str, required=True,
                        choices=all_target_languages)
    # Kept so existing command lines still parse; nothing below reads it.
    parser.add_argument('--classifier', type=str, nargs=1, required=False)
    parser.add_argument(
        '--taggerhome', type=str,
        default="/home/alex/software/stanford-postagger-2012-11-11")
    args = parser.parse_args()

    target = args.targetlang
    sourceword = args.sourceword
    nltk.classify.megam.config_megam(bin='/usr/local/bin/megam')
    classifier = get_maxent_classifier(sourceword, target)

    fn = "../trialdata/alltrials/{0}.data".format(sourceword)
    # Set after the classifier is built, matching the original statement order.
    stanford.taggerhome = args.taggerhome
    problems = extract_wsd_problems(fn)
    gold_answers = read_gold.get_gold_answers(sourceword, target)
    for problem in problems:
        featureset = features.extract(problem)
        answer = classifier.classify(featureset)
        print(problem.tokenized)
        print(answer)
        label = gold_answers[problem.instance_id]
        # Verdict and gold label share one output line.
        print("CORRECT" if label == answer else "WRONG", end=" ")
        print("should be:", label)
예제 #2
0
def main():
    """Train a maxent classifier for one source word and score the trial data.

    Command-line flags (all required):
      --sourceword  word whose trial data file
                    ``../trialdata/alltrials/<sourceword>.data`` is read.
      --targetlang  one of ``de``, ``es``, ``fr``, ``it``, ``nl``.
      --taggerhome  value assigned to ``stanford.taggerhome``.

    The classifier is trained with ``MaxentClassifier.train`` using the
    ``megam`` algorithm on the instances from ``get_training_data``. For each
    trial problem this prints the result of ``output_one_best``, a
    CORRECT/WRONG verdict against the gold label, and the probability the
    classifier assigns to every sample of its distribution.

    Raises:
        SystemExit: via argparse, when a required flag is missing or the
            target language is not one of the supported values.
        KeyError: when a problem's ``instance_id`` is absent from the gold
            answers.
    """
    all_target_languages = "de es fr it nl".split()

    parser = argparse.ArgumentParser(description='clwsd')
    parser.add_argument('--sourceword', type=str, required=True)
    # `choices` lets argparse reject an unsupported language with a usage
    # error; an `assert` would silently disappear under `python -O`.
    parser.add_argument('--targetlang', type=str, required=True,
                        choices=all_target_languages)
    parser.add_argument('--taggerhome', type=str, required=True)
    args = parser.parse_args()

    target = args.targetlang
    sourceword = args.sourceword
    stanford.taggerhome = args.taggerhome

    gold_answers = read_gold.get_gold_answers(sourceword, target)
    instances = get_training_data(sourceword, target)
    print("... training ...")
    nltk.classify.megam.config_megam(bin='/usr/local/bin/megam')
    classifier = MaxentClassifier.train(instances, trace=0, algorithm='megam')
    print("LABELS", classifier.labels())

    # NOTE: results go to stdout only; writing them to
    # ../eval/<sourceword>.output was left disabled.
    fn = "../trialdata/alltrials/{0}.data".format(sourceword)
    problems = extract_wsd_problems(fn)
    for problem in problems:
        featureset = features.extract(problem)
        answer = classifier.classify(featureset)
        print(output_one_best(problem, target, answer))
        label = gold_answers[problem.instance_id]
        print("CORRECT" if label == answer else "WRONG")
        print("distribution was...")
        dist = classifier.prob_classify(featureset)
        for key in dist.samples():
            print(" ", key, dist.prob(key))
예제 #3
0
def main():
    """Run the feature extractor over the trial problems of one source word.

    Requires ``--sourceword`` and ``--taggerhome`` on the command line. For
    each problem in ``../trialdata/alltrials/<sourceword>.data`` it prints a
    banner, the untagged tokens joined by spaces, and the head indices, then
    calls ``extract`` on the problem.
    """
    cli = argparse.ArgumentParser(description="clwsd")
    for flag in ("--sourceword", "--taggerhome"):
        cli.add_argument(flag, type=str, required=True)
    opts = cli.parse_args()
    stanford.taggerhome = opts.taggerhome

    data_paths = ["../trialdata/alltrials/{0}.data".format(opts.sourceword)]

    for data_path in data_paths:
        for problem in extract_wsd_problems(data_path):
            print("**** PROBLEM ****")
            print(" ".join(nltk.tag.untag(problem.tagged)))
            print(problem.head_indices)
            # The returned features are not used any further in this demo.
            extract(problem)
예제 #4
0
def get_test_instances(trialdir, sourceword):
    """Load the WSD problems for `sourceword` from its data file in `trialdir`.

    The file read is ``<trialdir>/<sourceword>.data``. Parsing is delegated
    to ``extract_wsd_problems`` and its result is returned unchanged.
    """
    return extract_wsd_problems("{0}/{1}.data".format(trialdir, sourceword))
예제 #5
0
def get_training_problems(sourceword):
    """Return the WSD problems parsed from the data file for `sourceword`.

    The file read is ``../trialdata/alltrials/<sourceword>.data``, relative
    to the current working directory. Parsing is delegated to
    ``extract_wsd_problems``.
    """
    data_path = "../trialdata/alltrials/{0}.data".format(sourceword)
    return extract_wsd_problems(data_path)