def training_job(corpus_fname, k, p, seed, static, dev_fname, model_out_prefix):
   sents = [s for s in io.conll_to_sents(file(corpus_fname)) if isprojective.is_projective(s)]
   print "training ",corpus_fname,k,p,seed,len(sents)

   explore = ExplorePolicy(k,p)
   TRAIN_OUT_FILE = "%s-ef.kps-k%s-p%s-seed%s" % (model_out_prefix, k, p, seed)
   if static:
      TRAIN_OUT_FILE = "%s-ef.kps-static-seed%s" % (model_out_prefix, seed)
      explore=None

   model = Model("features/znp.py", "%s.weights" % TRAIN_OUT_FILE)
   model.save("%s.model" % TRAIN_OUT_FILE)
   random.seed(seed)
   train(sents, model, dev=None, ITERS=20,save_every=None,explore_policy=explore,shuffle_sents=True)
   print "training of",corpus_fname,k,p,seed,"done"
   print "parsing"

   parsed = parse_corpus(dev_fname, TRAIN_OUT_FILE + ".weights.FINAL", "features/znp.py")
   outf = file(TRAIN_OUT_FILE + ".dev.parsed","w")
   for sent in parsed:
      io.out_conll(sent, outf, parent='pparent',prel='pprel')
   uas,las,complete = eval(parsed)
   puas,plas,complete = eval(parsed,ignore_punct=True)
   outf.close()
   outf = file(TRAIN_OUT_FILE + ".dev.scores","w")
   print >> outf, "UAS:",uas,"LAS:",las,"NP_UAS:",puas,"NP_LAS:",plas
   outf.close()

   print "deleting"
   os.unlink(TRAIN_OUT_FILE + ".weights.FINAL")
   os.unlink(TRAIN_OUT_FILE + ".model")
示例#2
0
def training_job(corpus_fname, k, p, seed, static, dev_fname,
                 model_out_prefix):
    from training import online_greedy_train
    sents = [
        s for s in io.conll_to_sents(file(corpus_fname))
        if isprojective.is_projective(s)
    ]
    print "training ", corpus_fname, k, p, seed, len(sents)
    labels = set()
    for sent in sents:
        for tok in sent:
            if tok['prel'] == '_': tok['prel'] = 'dep'
            #tok['prel'] = 'dep'
            labels.add(tok['prel'])

    oracle = ArcHybridStaticOracle() if static else ArcHybridDynamicOracle()
    explore = None if static else ExplorePolicy(k, p)
    print "start"
    feature_extractor = features.extractors.get("hybrid.1")
    action_map, params = online_greedy_train(
        sents,
        transition_system=ArcHybridState,
        oracle=oracle,
        feature_extractor=feature_extractor,
        labels=labels,
        iterations=15,
        explore_policy=ExplorePolicy(k, p),
        random_seed=seed,
        shuffle_corpus=True)
    print "end"
    params.finalize()
    TRAIN_OUT_FILE = "%s-hybrid-k%s-p%s-seed%s" % (model_out_prefix, k, p,
                                                   seed)
    if static:
        TRAIN_OUT_FILE = "%s-hybrid-static-seed%s" % (model_out_prefix, seed)
    params.dump(file(TRAIN_OUT_FILE, "w"), sparse=True)
    pickle.dump(action_map, file(TRAIN_OUT_FILE + ".amap", "w"))
    print "training of", corpus_fname, k, p, seed, "done"

    print "parsing"
    parsed = parse_corpus(dev_fname, TRAIN_OUT_FILE, feature_extractor,
                          ArcHybridState)
    print "writing"
    outf = file(TRAIN_OUT_FILE + ".dev.parsed", "w")
    for sent in parsed:
        io.out_conll(sent, outf, parent='pparent', prel='pprel')
    uas, las, complete = eval(parsed)
    puas, plas, complete = eval(parsed, ignore_punct=True)
    outf.close()
    outf = file(TRAIN_OUT_FILE + ".dev.scores", "w")
    print >> outf, "UAS:", uas, "LAS:", las, "NP_UAS:", puas, "NP_LAS:", plas
    print "UAS:", uas, "LAS:", las, "NP_UAS:", puas, "NP_LAS:", plas
    outf.close()

    print "deleting"
    os.unlink(TRAIN_OUT_FILE)
    os.unlink(TRAIN_OUT_FILE + ".amap")
def parse_corpus(corpus_fname, weights_fname, features_fname):
    """Parse every sentence in corpus_fname with a weights-file model; return annotated sentences."""
    extractor = moduleloader.load_module(features_fname).FeaturesExtractor()
    model = MulticlassModel(weights_fname)
    corpus_parser = Parser(model, extractor, None)
    annotated = []
    for sentence in io.conll_to_sents(file(corpus_fname)):
        annotated.append(corpus_parser.parse(sentence).annotate(sentence))
    return annotated
示例#4
0
def parse_corpus(corpus_fname, model_fname, feature_extractor,
                 transition_system):
    """Load a trained transition-based model and parse corpus_fname.

    Returns the list of sentences, annotated in place by the parser.
    """
    params = ml.SparseMulticlassModel(file(model_fname), sparse=True)
    parser = TransitionBasedParser(params, transition_system,
                                   feature_extractor)
    parser.action_map = pickle.load(file(model_fname + ".amap"))
    annotated = []
    for sentence in io.conll_to_sents(file(corpus_fname)):
        # annotate() mutates the sentence; its return value is unused here.
        parser.parse(sentence).annotate(sentence)
        annotated.append(sentence)
    return annotated
def eval_labeler(fname, devfile, fext=None, guides=_dummyguides()):
    '''
   just labeling accuracy
   '''
    devsents = io.conll_to_sents(file(devfile))
    labeler = SimpleSentenceLabeler(Labeler.load(fname), fext)
    good = 0.0
    bad = 0.0
    for i, (sent, sguide) in enumerate(zip(devsents, guides)):
        if i % 100 == 0:
            print i,
        labeler.label(sent, par='parent', prelout='pprel', sent_guides=sguide)
        for tok in sent:
            if tok['prel'] == tok['pprel']: good += 1
            else: bad += 1
    print
    print >> sys.stderr, "labaling acc:", good / (good + bad)
                  type="int",
                  default=20)
parser.add_option("--every",
                  dest="save_every",
                  action="store",
                  type="int",
                  default=1)

opts, args = parser.parse_args()

if len(args) < 1 or not (opts.model_file or opts.features_file):
    parser.print_usage()
    sys.exit(1)

TRAIN_FILE = args[0]
DEV_FILE = args[1] if len(args) > 1 else None
FEATURES = opts.features_file
MODEL = opts.model_file

model = Model(FEATURES, "%s.weights" % MODEL)
model.save("%s.model" % MODEL)

dev = [s for s in io.conll_to_sents(file(DEV_FILE))] if DEV_FILE else []

train_sents = list(io.conll_to_sents(file(TRAIN_FILE)))
print len(train_sents)
train_sents = [s for s in train_sents if isprojective.is_projective(s)]
print len(train_sents)

train(train_sents, model, dev, opts.iters, save_every=opts.save_every)
示例#7
0
                  "--eval",
                  action="store_true",
                  dest="eval",
                  default=False)
# CLI tail of the parser tester: option parsing, model loading, test/parse.
parser.add_option("--nopunct", action="store_true", dest="ignore_punc", default=False)

opts, args = parser.parse_args()

# Require a model file and exactly one input corpus.
if (not opts.model_file) or (len(args) != 1):
    parser.print_usage()
    sys.exit()

TEST_FILE = args[0]

model = Model.load("%s" % opts.model_file, opts.iter)

test_sents = list(io.conll_to_sents(file(TEST_FILE)))

if opts.eval:
    # Evaluation mode: score against gold, honoring the punctuation switch.
    test(test_sents, model, opts.iter, quiet=False,
         ignore_punc=opts.ignore_punc, beam_width=int(opts.beam))
else:
    parse(test_sents, model, opts.iter, beam_width=opts.beam)
示例#8
0
# Parse (or evaluate on) a CoNLL corpus with a trained easy-first model.
import sys  # BUG FIX: was missing, but sys.exit() is called below

from easyfirst import test, parse, Model
# BUG FIX: io was never imported although io.conll_to_sents is used below;
# the file's other snippets import it from pio -- confirm package path.
from pio import io

from optparse import OptionParser

usage = """usage: %prog -m model [options] input_file """

parser = OptionParser(usage)
parser.add_option("-m", "--model", dest="model_file")
parser.add_option("--iter", dest="iter", default="FINAL")
parser.add_option("-e", "--eval", action="store_true", dest="eval", default=False)
parser.add_option("--nopunct", action="store_true", dest="ignore_punc", default=False)

opts, args = parser.parse_args()

# A model file and exactly one input corpus are required.
if (not opts.model_file) or (len(args) != 1):
    parser.print_usage()
    sys.exit()

TEST_FILE = args[0]

model = Model.load("%s" % opts.model_file, opts.iter)

test_sents = [s for s in io.conll_to_sents(file(TEST_FILE))]

if opts.eval:
    test(test_sents, model, opts.iter, quiet=False, ignore_punc=opts.ignore_punc)
else:
    parse(test_sents, model, opts.iter)
示例#9
0
# Option setup and training driver for the easy-first trainer (py2 optparse).
parser.add_option("-o","--model",dest="model_file")
# NOTE(review): the default is the STRING "None", not the value None; since a
# non-empty string is truthy, `not (opts.model_file or opts.features_file)`
# below can never fire for a missing -f. Confirm default=None was intended.
parser.add_option("-f","--features",dest="features_file",default="None")
parser.add_option("--iters",dest="iters",action="store",type="int",default=20)
parser.add_option("--every",dest="save_every",action="store",type="int",default=1)

opts, args = parser.parse_args()

# Require a training corpus argument and at least one of -o / -f.
if len(args)<1 or not (opts.model_file or opts.features_file):
   parser.print_usage()
   sys.exit(1)

TRAIN_FILE = args[0]
DEV_FILE   = args[1] if len(args)>1 else None
FEATURES   = opts.features_file
MODEL      = opts.model_file


# The weights file and model description are written next to MODEL's name.
model = Model(FEATURES, "%s.weights" % MODEL)
model.save("%s.model" % MODEL)


dev = [s for s in io.conll_to_sents(file(DEV_FILE))] if DEV_FILE else []

# Train only on projective sentences; print counts before and after pruning.
train_sents = list(io.conll_to_sents(file(TRAIN_FILE)))
print len(train_sents)
train_sents = [s for s in train_sents if isprojective.is_projective(s)]
print len(train_sents)

train(train_sents, model, dev, opts.iters,save_every=opts.save_every)

示例#10
0
            # and that it does not have a subject already (they allowed x to do y)
            found = False
            subj = False
            for child in childs[tok['id']]:
                if child['prel'] == 'aux' and child['form'] == 'to':
                    found = True
                if 'subj' in child['prel']:
                    subj = True
                    break
            if found and not subj:
                parent = rsent[tok[par]]
                if no_adj_xsubj and parent['tag'][0] == 'J':
                    continue  # "they are *likely* to go"
                for child in childs[parent['id']]:
                    if 'subj' in child['prel']:
                        yield ("xsubj", tok, child)


if __name__ == '__main__':
    import sys
    sys.path.append("..")
    from pio import io
    from common import ROOT

    for i, sent in enumerate(io.conll_to_sents(sys.stdin)):
        for extra in extra_deps(sent):
            #print extra[0],extra[1]['form'],extra[2]['form']
            print i, "%s(%s-%s, %s-%s)" % (extra[0], extra[1]['form'],
                                           extra[1]['id'], extra[2]['form'],
                                           extra[2]['id'])
示例#11
0
def load_sentences(filename, ONLY_PROJECTIVE=False):
    """Read CoNLL sentences from filename; optionally keep only projective ones."""
    sentences = []
    for sentence in io.conll_to_sents(file(filename)):
        if ONLY_PROJECTIVE and not isprojective.is_projective(sentence):
            continue
        sentences.append(sentence)
    return sentences
示例#12
0
                    state.do_action(action_map[gold_action])
    return action_map, params


if __name__ == '__main__':
    import sys
    import pickle
    import isprojective
    from pio import io
    from explore_policies import *
    from arceager import ArcEagerState
    from dynamicoracles import ArcEagerDynamicOracle_fixed
    import features.extractors

    sents = [
        s for s in list(io.conll_to_sents(file(sys.argv[1])))
        if isprojective.is_projective(s)
    ]
    labels = set()
    for sent in sents:
        for tok in sent:
            labels.add(tok['prel'])

    action_map, params = online_greedy_train(
        sents,
        transition_system=ArcEagerState,
        oracle=ArcEagerDynamicOracle_fixed(),
        feature_extractor=features.extractors.get("eager.zn"),
        labels=labels,
        iterations=2,
        explore_policy=ExplorePolicy(2, 0.9),
示例#13
0
gflags.DEFINE_boolean("only_proj", True,
                      "If true, prune non-projective sentences in training.")
gflags.DEFINE_boolean("add_dep_label", True,
                      "If true, replace the '_' label with 'dep'.")
gflags.DEFINE_integer("random_seed", 0, "Random seed.")

gflags.DEFINE_integer("save_every", 0, "Dump a model every k iterations.")

args = FLAGS(sys.argv)
print args

DATA_FILE = args[1]

featExt = extractors.get(FLAGS.feature_extractor)

sents = list(io.conll_to_sents(file(DATA_FILE)))

if FLAGS.train and (True or FLAGS.only_proj):
    import isprojective
    sents = [s for s in sents if isprojective.is_projective(s)]

if FLAGS.add_dep_label:
    for sent in sents:
        for tok in sent:
            if tok['prel'] == '_': tok['prel'] = "dep"

EXPLORE = 1

LABELED = True

MODE = "train" if FLAGS.train else "test"
示例#14
0
def load_sentences(filename, ONLY_PROJECTIVE=False):
    """Load CoNLL sentences; drop non-projective ones when ONLY_PROJECTIVE is set."""
    def _keep(sentence):
        # Everything passes unless projectivity filtering was requested.
        return (not ONLY_PROJECTIVE) or isprojective.is_projective(sentence)

    return [s for s in io.conll_to_sents(file(filename)) if _keep(s)]