def main():
    """Evaluate the trained maxent classifier on the trial data for one
    source word, printing each instance's answer against the gold label.

    Command-line flags:
        --sourceword  the ambiguous source word (required)
        --targetlang  target-language code, one of de/es/fr/it/nl (required)
        --classifier  accepted for compatibility; currently unused
    """
    parser = argparse.ArgumentParser(description='clwsd')
    # Plain str arguments (no nargs=1) so args.sourceword is a string rather
    # than a one-element list -- consistent with the other entry points in
    # this file.
    parser.add_argument('--sourceword', type=str, required=True)
    parser.add_argument('--targetlang', type=str, required=True)
    parser.add_argument('--classifier', type=str, required=False)
    args = parser.parse_args()

    all_target_languages = "de es fr it nl".split()
    if args.targetlang not in all_target_languages:
        # parser.error prints a usage message and exits; an assert would be
        # stripped under `python -O`.
        parser.error("targetlang must be one of: " +
                     " ".join(all_target_languages))
    target = args.targetlang
    sourceword = args.sourceword

    nltk.classify.megam.config_megam(bin='/usr/local/bin/megam')
    classifier = get_maxent_classifier(sourceword, target)

    fn = "../trialdata/alltrials/{0}.data".format(sourceword)
    ## XXX(alexr): hard-coded path; should come from a --taggerhome flag
    ## like the other entry points in this file.
    stanford.taggerhome = "/home/alex/software/stanford-postagger-2012-11-11"
    problems = extract_wsd_problems(fn)

    gold_answers = read_gold.get_gold_answers(sourceword, target)
    for problem in problems:
        featureset = features.extract(problem)
        answer = classifier.classify(featureset)
        print(problem.tokenized)
        print(answer)
        label = gold_answers[problem.instance_id]
        print("CORRECT" if label == answer else "WRONG", end=" ")
        print("should be:", label)
def main():
    """Train a megam maxent classifier for one source word, then score it
    on the trial data against the gold answers, printing per-instance
    verdicts and the classifier's full label distribution."""
    argparser = argparse.ArgumentParser(description='clwsd')
    # All three flags are required strings; register them in one pass.
    for flagname in ('--sourceword', '--targetlang', '--taggerhome'):
        argparser.add_argument(flagname, type=str, required=True)
    opts = argparser.parse_args()

    known_languages = "de es fr it nl".split()
    assert opts.targetlang in known_languages
    target = opts.targetlang
    sourceword = opts.sourceword
    stanford.taggerhome = opts.taggerhome

    gold_answers = read_gold.get_gold_answers(sourceword, target)
    instances = get_training_data(sourceword, target)

    print("... training ...")
    nltk.classify.megam.config_megam(bin='/usr/local/bin/megam')
    classifier = MaxentClassifier.train(instances, trace=0, algorithm='megam')
    print("LABELS", classifier.labels())

    fn = "../trialdata/alltrials/{0}.data".format(sourceword)
    for problem in extract_wsd_problems(fn):
        featureset = features.extract(problem)
        answer = classifier.classify(featureset)
        print(output_one_best(problem, target, answer))
        expected = gold_answers[problem.instance_id]
        print("CORRECT" if expected == answer else "WRONG")
        print("distribution was...")
        dist = classifier.prob_classify(featureset)
        for label in dist.samples():
            print("  ", label, dist.prob(label))
def main():
    """Quick demo for the feature extractor.

    Reads the trial data for --sourceword, then prints each problem's
    tokenized sentence and head indices and runs feature extraction on it.
    """
    parser = argparse.ArgumentParser(description="clwsd")
    parser.add_argument("--sourceword", type=str, required=True)
    parser.add_argument("--taggerhome", type=str, required=True)
    args = parser.parse_args()
    sourceword = args.sourceword
    stanford.taggerhome = args.taggerhome

    fns = ["../trialdata/alltrials/{0}.data".format(sourceword)]
    for fn in fns:
        problems = extract_wsd_problems(fn)
        for problem in problems:
            print("**** PROBLEM ****")
            tokenized = nltk.tag.untag(problem.tagged)
            print(" ".join(tokenized))
            print(problem.head_indices)
            # Renamed from `features`: that name shadowed the `features`
            # module used by the other entry points in this file.
            # The result is intentionally unused -- this is only a smoke
            # test of the extractor.
            feats = extract(problem)
def get_test_instances(trialdir, sourceword):
    """Load all WSD problems to solve for *sourceword*.

    Reads "<trialdir>/<sourceword>.data" and returns whatever
    extract_wsd_problems produces for that file.
    """
    datafile = "{0}/{1}.data".format(trialdir, sourceword)
    return extract_wsd_problems(datafile)
def get_training_problems(sourceword):
    """Return the WSD problems for *sourceword* from the trial-data dir."""
    return extract_wsd_problems(
        "../trialdata/alltrials/{0}.data".format(sourceword))