def training_job(corpus_fname, k, p, seed, static, dev_fname, model_out_prefix):
    """Train one easy-first model for a (k, p, seed) configuration, parse the
    dev set, write UAS/LAS scores, then delete the model files.

    k, p   -- ExplorePolicy parameters (ignored when static is True).
    static -- if True, train without error exploration (explore=None).
    Side effects: creates <prefix>.weights/.model/.dev.parsed/.dev.scores
    files and removes the weights/model files at the end.
    """
    # Keep only projective sentences -- presumably the trainer requires
    # projective trees; TODO confirm against train().
    sents = [s for s in io.conll_to_sents(file(corpus_fname)) if isprojective.is_projective(s)]
    print "training ", corpus_fname, k, p, seed, len(sents)
    explore = ExplorePolicy(k,p)
    TRAIN_OUT_FILE = "%s-ef.kps-k%s-p%s-seed%s" % (model_out_prefix, k, p, seed)
    if static:
        # Static training: different output name, and no exploration policy.
        TRAIN_OUT_FILE = "%s-ef.kps-static-seed%s" % (model_out_prefix, seed)
        explore = None
    model = Model("features/znp.py", "%s.weights" % TRAIN_OUT_FILE)
    model.save("%s.model" % TRAIN_OUT_FILE)
    # Seed before training so sentence shuffling is reproducible.
    random.seed(seed)
    train(sents, model, dev=None, ITERS=20, save_every=None, explore_policy=explore, shuffle_sents=True)
    print "training of", corpus_fname, k, p, seed, "done"
    print "parsing"
    parsed = parse_corpus(dev_fname, TRAIN_OUT_FILE + ".weights.FINAL", "features/znp.py")
    outf = file(TRAIN_OUT_FILE + ".dev.parsed", "w")
    for sent in parsed:
        io.out_conll(sent, outf, parent='pparent', prel='pprel')
    # eval here is a project scoring helper (not the builtin): returns
    # (unlabeled-acc, labeled-acc, complete-match).
    uas, las, complete = eval(parsed)
    puas, plas, complete = eval(parsed, ignore_punct=True)
    outf.close()
    outf = file(TRAIN_OUT_FILE + ".dev.scores", "w")
    print >> outf, "UAS:", uas, "LAS:", las, "NP_UAS:", puas, "NP_LAS:", plas
    outf.close()
    # Grid-search cleanup: only the score files are kept.
    print "deleting"
    os.unlink(TRAIN_OUT_FILE + ".weights.FINAL")
    os.unlink(TRAIN_OUT_FILE + ".model")
def training_job(corpus_fname, k, p, seed, static, dev_fname, model_out_prefix): from training import online_greedy_train sents = [ s for s in io.conll_to_sents(file(corpus_fname)) if isprojective.is_projective(s) ] print "training ", corpus_fname, k, p, seed, len(sents) labels = set() for sent in sents: for tok in sent: if tok['prel'] == '_': tok['prel'] = 'dep' #tok['prel'] = 'dep' labels.add(tok['prel']) oracle = ArcHybridStaticOracle() if static else ArcHybridDynamicOracle() explore = None if static else ExplorePolicy(k, p) print "start" feature_extractor = features.extractors.get("hybrid.1") action_map, params = online_greedy_train( sents, transition_system=ArcHybridState, oracle=oracle, feature_extractor=feature_extractor, labels=labels, iterations=15, explore_policy=ExplorePolicy(k, p), random_seed=seed, shuffle_corpus=True) print "end" params.finalize() TRAIN_OUT_FILE = "%s-hybrid-k%s-p%s-seed%s" % (model_out_prefix, k, p, seed) if static: TRAIN_OUT_FILE = "%s-hybrid-static-seed%s" % (model_out_prefix, seed) params.dump(file(TRAIN_OUT_FILE, "w"), sparse=True) pickle.dump(action_map, file(TRAIN_OUT_FILE + ".amap", "w")) print "training of", corpus_fname, k, p, seed, "done" print "parsing" parsed = parse_corpus(dev_fname, TRAIN_OUT_FILE, feature_extractor, ArcHybridState) print "writing" outf = file(TRAIN_OUT_FILE + ".dev.parsed", "w") for sent in parsed: io.out_conll(sent, outf, parent='pparent', prel='pprel') uas, las, complete = eval(parsed) puas, plas, complete = eval(parsed, ignore_punct=True) outf.close() outf = file(TRAIN_OUT_FILE + ".dev.scores", "w") print >> outf, "UAS:", uas, "LAS:", las, "NP_UAS:", puas, "NP_LAS:", plas print "UAS:", uas, "LAS:", las, "NP_UAS:", puas, "NP_LAS:", plas outf.close() print "deleting" os.unlink(TRAIN_OUT_FILE) os.unlink(TRAIN_OUT_FILE + ".amap")
def parse_corpus(corpus_fname, weights_fname, features_fname):
    """Parse every sentence of a CoNLL file with a trained model.

    Loads the feature extractor from features_fname and the weights from
    weights_fname, then returns the list of sentences annotated with
    predicted parents/relations.
    """
    extractor = moduleloader.load_module(features_fname).FeaturesExtractor()
    decoder = Parser(MulticlassModel(weights_fname), extractor, None)
    results = []
    for sentence in io.conll_to_sents(file(corpus_fname)):
        tree = decoder.parse(sentence)
        # annotate() returns the annotated sentence; collect that.
        results.append(tree.annotate(sentence))
    return results
def parse_corpus(corpus_fname, model_fname, feature_extractor, transition_system):
    """Parse every sentence of a CoNLL file with a trained transition-based model.

    Loads the sparse weight matrix from model_fname and the pickled action
    map from model_fname + ".amap"; returns the (in-place annotated)
    sentences.
    """
    weights = ml.SparseMulticlassModel(file(model_fname), sparse=True)
    decoder = TransitionBasedParser(weights, transition_system, feature_extractor)
    decoder.action_map = pickle.load(file(model_fname + ".amap"))
    annotated = []
    for sentence in io.conll_to_sents(file(corpus_fname)):
        # annotate() mutates the sentence in place; keep the sentence itself.
        decoder.parse(sentence).annotate(sentence)
        annotated.append(sentence)
    return annotated
def eval_labeler(fname, devfile, fext=None, guides=_dummyguides()): ''' just labeling accuracy ''' devsents = io.conll_to_sents(file(devfile)) labeler = SimpleSentenceLabeler(Labeler.load(fname), fext) good = 0.0 bad = 0.0 for i, (sent, sguide) in enumerate(zip(devsents, guides)): if i % 100 == 0: print i, labeler.label(sent, par='parent', prelout='pprel', sent_guides=sguide) for tok in sent: if tok['prel'] == tok['pprel']: good += 1 else: bad += 1 print print >> sys.stderr, "labaling acc:", good / (good + bad)
# NOTE(review): this chunk begins mid-call -- the ")" below closes an
# add_option(...) whose opening lies before this view.
                  type="int", default=20)
parser.add_option("--every", dest="save_every", action="store",
                  type="int", default=1)
opts, args = parser.parse_args()
# Require a training file plus at least one of the model/features options.
if len(args) < 1 or not (opts.model_file or opts.features_file):
    parser.print_usage()
    sys.exit(1)
TRAIN_FILE = args[0]
# The dev file is an optional second positional argument.
DEV_FILE = args[1] if len(args) > 1 else None
FEATURES = opts.features_file
MODEL = opts.model_file
# The model persists its weights and a model description side by side.
model = Model(FEATURES, "%s.weights" % MODEL)
model.save("%s.model" % MODEL)
dev = [s for s in io.conll_to_sents(file(DEV_FILE))] if DEV_FILE else []
train_sents = list(io.conll_to_sents(file(TRAIN_FILE)))
print len(train_sents)
# Drop non-projective sentences -- presumably required by train();
# the two prints show how many were pruned.
train_sents = [s for s in train_sents if isprojective.is_projective(s)]
print len(train_sents)
train(train_sents, model, dev, opts.iters, save_every=opts.save_every)
# NOTE(review): this chunk begins mid-call -- the fragment below finishes an
# add_option(..., "--eval", ...) whose opening lies before this view.
                  "--eval", action="store_true", dest="eval", default=False)
parser.add_option("--nopunct", action="store_true", dest="ignore_punc",
                  default=False)
opts, args = parser.parse_args()
# A model file and exactly one input file are mandatory.
if (not opts.model_file) or (len(args) != 1):
    parser.print_usage()
    sys.exit()
TEST_FILE = args[0]
# opts.iter selects which training-iteration weight dump to load.
model = Model.load("%s" % opts.model_file, opts.iter)
test_sents = [s for s in io.conll_to_sents(file(TEST_FILE))]
if opts.eval:
    # Score the parses against the gold annotations in the input.
    test(test_sents, model, opts.iter, quiet=False,
         ignore_punc=opts.ignore_punc, beam_width=int(opts.beam))
else:
    # Just emit parses.
    parse(test_sents, model, opts.iter, beam_width=opts.beam)
# Command-line driver: parse (or evaluate) a CoNLL input file with a
# trained easy-first model.
from easyfirst import test,parse,Model
from optparse import OptionParser

usage = """usage: %prog -m model [options] input_file """
parser = OptionParser(usage)
parser.add_option("-m", "--model", dest="model_file")
# Which training-iteration weight dump to load; "FINAL" is the last one.
parser.add_option("--iter", dest="iter", default="FINAL")
parser.add_option("-e", "--eval", action="store_true", dest="eval", default=False)
parser.add_option("--nopunct", action="store_true", dest="ignore_punc", default=False)
opts, args = parser.parse_args()
# A model file and exactly one input file are mandatory.
if (not opts.model_file) or (len(args) != 1):
    parser.print_usage()
    sys.exit()
TEST_FILE = args[0]
model = Model.load("%s" % opts.model_file, opts.iter)
test_sents = [s for s in io.conll_to_sents(file(TEST_FILE))]
if opts.eval:
    # Score the parses against the gold annotations in the input.
    test(test_sents, model, opts.iter, quiet=False, ignore_punc=opts.ignore_punc)
else:
    # Just emit parses.
    parse(test_sents, model, opts.iter)
# NOTE(review): `parser` is created before this view.
parser.add_option("-o", "--model", dest="model_file")
# NOTE(review): the default is the STRING "None", not the None object, so
# the truthiness check below never trips on a missing -f -- confirm intended.
parser.add_option("-f", "--features", dest="features_file", default="None")
parser.add_option("--iters", dest="iters", action="store", type="int", default=20)
parser.add_option("--every", dest="save_every", action="store", type="int", default=1)
opts, args = parser.parse_args()
# Require a training file plus at least one of the model/features options.
if len(args) < 1 or not (opts.model_file or opts.features_file):
    parser.print_usage()
    sys.exit(1)
TRAIN_FILE = args[0]
# The dev file is an optional second positional argument.
DEV_FILE = args[1] if len(args) > 1 else None
FEATURES = opts.features_file
MODEL = opts.model_file
# The model persists its weights and a model description side by side.
model = Model(FEATURES, "%s.weights" % MODEL)
model.save("%s.model" % MODEL)
dev = [s for s in io.conll_to_sents(file(DEV_FILE))] if DEV_FILE else []
train_sents = list(io.conll_to_sents(file(TRAIN_FILE)))
print len(train_sents)
# Drop non-projective sentences; the two prints show how many were pruned.
train_sents = [s for s in train_sents if isprojective.is_projective(s)]
print len(train_sents)
train(train_sents, model, dev, opts.iters, save_every=opts.save_every)
# and that it does not have a subject already (they allowed x to do y) found = False subj = False for child in childs[tok['id']]: if child['prel'] == 'aux' and child['form'] == 'to': found = True if 'subj' in child['prel']: subj = True break if found and not subj: parent = rsent[tok[par]] if no_adj_xsubj and parent['tag'][0] == 'J': continue # "they are *likely* to go" for child in childs[parent['id']]: if 'subj' in child['prel']: yield ("xsubj", tok, child) if __name__ == '__main__': import sys sys.path.append("..") from pio import io from common import ROOT for i, sent in enumerate(io.conll_to_sents(sys.stdin)): for extra in extra_deps(sent): #print extra[0],extra[1]['form'],extra[2]['form'] print i, "%s(%s-%s, %s-%s)" % (extra[0], extra[1]['form'], extra[1]['id'], extra[2]['form'], extra[2]['id'])
def load_sentences(filename, ONLY_PROJECTIVE=False):
    """Read CoNLL sentences from filename.

    When ONLY_PROJECTIVE is true, non-projective sentences are dropped.
    """
    kept = []
    for sentence in io.conll_to_sents(file(filename)):
        if ONLY_PROJECTIVE and not isprojective.is_projective(sentence):
            continue
        kept.append(sentence)
    return kept
            # NOTE(review): chunk begins mid-function -- the enclosing
            # training loops lie before this view; indentation reconstructed.
            # Apply the oracle-chosen transition to the parser state.
            state.do_action(action_map[gold_action])
    return action_map, params


if __name__ == '__main__':
    import sys
    import pickle
    import isprojective
    from pio import io
    from explore_policies import *
    from arceager import ArcEagerState
    from dynamicoracles import ArcEagerDynamicOracle_fixed
    import features.extractors

    # Demo driver: train an arc-eager model with a dynamic oracle on the
    # projective sentences of the CoNLL file given as argv[1].
    sents = [s for s in list(io.conll_to_sents(file(sys.argv[1])))
             if isprojective.is_projective(s)]
    # Collect the full dependency-label inventory from the training data.
    labels = set()
    for sent in sents:
        for tok in sent:
            labels.add(tok['prel'])
    action_map, params = online_greedy_train(
        sents,
        transition_system=ArcEagerState,
        oracle=ArcEagerDynamicOracle_fixed(),
        feature_extractor=features.extractors.get("eager.zn"),
        labels=labels,
        iterations=2,
        explore_policy=ExplorePolicy(2, 0.9),
        # NOTE(review): the call continues past the end of this chunk.
gflags.DEFINE_boolean("only_proj", True, "If true, prune non-projective sentences in training.") gflags.DEFINE_boolean("add_dep_label", True, "If true, replace the '_' label with 'dep'.") gflags.DEFINE_integer("random_seed", 0, "Random seed.") gflags.DEFINE_integer("save_every", 0, "Dump a model every k iterations.") args = FLAGS(sys.argv) print args DATA_FILE = args[1] featExt = extractors.get(FLAGS.feature_extractor) sents = list(io.conll_to_sents(file(DATA_FILE))) if FLAGS.train and (True or FLAGS.only_proj): import isprojective sents = [s for s in sents if isprojective.is_projective(s)] if FLAGS.add_dep_label: for sent in sents: for tok in sent: if tok['prel'] == '_': tok['prel'] = "dep" EXPLORE = 1 LABELED = True MODE = "train" if FLAGS.train else "test"
def load_sentences(filename, ONLY_PROJECTIVE=False):
    """Read CoNLL sentences from filename.

    When ONLY_PROJECTIVE is true, non-projective sentences are dropped.
    """
    stream = io.conll_to_sents(file(filename))
    if ONLY_PROJECTIVE:
        return [s for s in stream if isprojective.is_projective(s)]
    return list(stream)