Пример #1
0
def main():
  from ngram import Ngram
  from model import Model
  from forest import Forest
  
  flags.DEFINE_integer("beam", 100, "beam size", short_name="b")
  flags.DEFINE_integer("debuglevel", 0, "debug level")
  flags.DEFINE_boolean("mert", True, "output mert-friendly info (<hyp><cost>)")
  flags.DEFINE_boolean("cube", True, "using cube pruning to speedup")
  flags.DEFINE_integer("kbest", 1, "kbest output", short_name="k")
  flags.DEFINE_integer("ratio", 3, "the maximum items (pop from PQ): ratio*b", short_name="r")
  

  argv = FLAGS(sys.argv)

  weights = Model.cmdline_model()
  lm = Ngram.cmdline_ngram()
  
  false_decoder = CYKDecoder(weights, lm)
  
  def non_local_scorer(cedge, ders):
    (lmsc, alltrans, sig) = false_decoder.deltLMScore(cedge.lhsstr, ders)
    fv = Vector()
    fv["lm"] = lmsc
    return ((weights.dot(fv), fv), alltrans, sig)
  cube_prune = CubePruning(FeatureScorer(weights), non_local_scorer, FLAGS.k, FLAGS.ratio)

  for i, forest in enumerate(Forest.load("-", is_tforest=True, lm=lm), 1):
    a = false_decoder.beam_search(forest, b = FLAGS.beam)
    b = cube_prune.run(forest.root)

    assert a[0], b[0].score[0]
    assert a[1], b[0].score[1]
    print a
    print b[0]
Пример #2
0
def main():
  """Extract an oracle forest per input forest and log BLEU statistics.

  Exactly one positional argument is expected: the output file name, which
  is opened through utility.getfile.  For every forest yielded by
  Forest.load("-", ...), the forest returned by oracle_extracter is dumped
  to that output, and per-sentence as well as running BLEU figures for the
  old oracle (forest.compute_oracle) and the new oracle (first item
  returned by oracle_extracter) are written to stderr.
  """
  from ngram import Ngram
  from model import Model
  from forest import Forest

  flags.DEFINE_integer("beam", 100, "beam size", short_name="b")
  flags.DEFINE_integer("debuglevel", 0, "debug level")
  flags.DEFINE_boolean("mert", True, "output mert-friendly info (<hyp><cost>)")
  flags.DEFINE_boolean("cube", True, "using cube pruning to speedup")
  flags.DEFINE_integer("kbest", 1, "kbest output", short_name="k")
  flags.DEFINE_integer("ratio", 3, "the maximum items (pop from PQ): ratio*b", short_name="r")

  positional = FLAGS(sys.argv)[1:]
  # Unpacking fails unless exactly one positional argument was supplied.
  (outfile,) = positional

  weights = Model.cmdline_model()
  lm = Ngram.cmdline_ngram()

  false_decoder = CYKDecoder(weights, lm)
  out = utility.getfile(outfile, 1)

  # Corpus-level accumulators for the old and the new oracle.
  old_bleu = Bleu()
  new_bleu = Bleu()

  err = sys.stderr
  forests = Forest.load("-", is_tforest=True, lm=lm)
  for sent_no, forest in enumerate(forests, 1):
    oracle_forest, oracle_item = oracle_extracter(
        forest, weights, false_decoder, 100, 2, extract=100)
    print >>err, "processed sent %s " % sent_no
    oracle_forest.dump(out)

    # Old oracle: computed directly on the input forest.
    old_sent_bleu, old_hyp, _fv, _edgelist = forest.compute_oracle(weights, 0.0, 1)

    # forest.bleu is rescored in place before each accumulation.
    forest.bleu.rescore(old_hyp)
    old_bleu += forest.bleu
    forest.bleu.rescore(oracle_item[0].full_derivation)
    new_bleu += forest.bleu

    # Same oracle computation on the extracted forest, with -1 as last argument.
    worst_bleu, _, _, _ = oracle_forest.compute_oracle(weights, 0.0, -1)

    new_sent_bleu = forest.bleu.rescore(oracle_item[0].full_derivation)
    print >>err, "Oracle BLEU Score: %s" % new_sent_bleu
    print >>err, "Worst new Oracle BLEU Score: %s" % worst_bleu
    print >>err, "Old Oracle BLEU Score: %s" % old_sent_bleu

    running_new = new_bleu.compute_score()
    print >>err, "Running Oracle BLEU Score: %s" % running_new
    running_old = old_bleu.compute_score()
    print >>err, "Running Old Oracle BLEU Score: %s" % running_old
Пример #3
0
    # NOTE(review): this fragment is the interior of a function whose header
    # is not visible here, and the final `for` loop has no visible body.

    # Project-local imports, performed inside the function body.
    from ngram import Ngram
    from model import Model
    from forest import Forest

    # Register the command-line flags (looks like the gflags API -- confirm).
    flags.DEFINE_integer("beam", 100, "beam size", short_name="b")
    flags.DEFINE_integer("debuglevel", 0, "debug level")
    flags.DEFINE_boolean("mert", True, "output mert-friendly info (<hyp><cost>)")
    flags.DEFINE_boolean("cube", True, "using cube pruning to speedup")
    flags.DEFINE_integer("kbest", 1, "kbest output", short_name="k")
    flags.DEFINE_integer("ratio", 3, "the maximum items (pop from PQ): ratio*b", short_name="r")
  

    # Parse the command line; argv is not used again in the visible code.
    argv = FLAGS(sys.argv)

    # Model weights and language model are both built from the command line.
    weights = Model.cmdline_model()
    lm = Ngram.cmdline_ngram()

    decoder = CYKDecoder(weights, lm)

    # Accumulators initialised before the per-forest loop; their names
    # suggest corpus-level totals (BLEU, score, time, sizes) -- the code
    # that updates them is not visible here.
    tot_bleu = Bleu()
    tot_score = 0.
    tot_time = 0.
    tot_len = tot_fnodes = tot_fedges = 0

    tot_lmedges = 0
    tot_lmnodes = 0
    # `logs` is defined outside this fragment.
    if FLAGS.debuglevel > 0:
        print >>logs, "beam size = %d" % FLAGS.beam

    # Iterate over the loaded forests, numbering them from 1.
    for i, forest in enumerate(Forest.load("-", is_tforest=True, lm=lm), 1):
Пример #4
0
def main():
    gc.set_threshold(100000, 10, 10)
    flags.DEFINE_integer("beam", 100, "beam size", short_name="b")
    flags.DEFINE_integer("debuglevel", 0, "debug level")
    flags.DEFINE_boolean("mert", True, "output mert-friendly info (<hyp><cost>)")
    flags.DEFINE_boolean("cube", True, "using cube pruning to speedup")
    flags.DEFINE_integer("kbest", 1, "kbest output", short_name="k")
    flags.DEFINE_integer("ratio", 3, "the maximum items (pop from PQ): ratio*b", short_name="r")
    flags.DEFINE_boolean("dist", False, "ditributed (hadoop) training)")
    flags.DEFINE_string("prefix", "", "prefix for distributed training")
    flags.DEFINE_string("hadoop_weights", "", "hadoop weights (formatted specially)")
    flags.DEFINE_boolean("add_features", False, "add features to training data")
    flags.DEFINE_boolean("prune_train", False, "prune before decoding")
    flags.DEFINE_boolean("no_lm", False, "don't use the unigram language model")
    flags.DEFINE_boolean("pickleinput", False, "assumed input is pickled")
    flags.DEFINE_string("oracle_forests", None, "oracle forests", short_name="o")
    flags.DEFINE_string("feature_map_file", None, "file with the integer to feature mapping (for lbfgs)")
    flags.DEFINE_boolean("cache_input", False, "cache input sentences (only works for pruned input)")
    flags.DEFINE_string("rm_features", None, "list of features to remove")
    flags.DEFINE_boolean("just_basic", False, "remove all features but basic")

    argv = FLAGS(sys.argv)

    if FLAGS.weights:
        weights = Model.cmdline_model()
    else:
        vector = Vector()
        assert glob.glob(FLAGS.hadoop_weights)
        for file in glob.glob(FLAGS.hadoop_weights):
            for l in open(file):
                if not l.strip():
                    continue
                f, v = l.strip().split()
                vector[f] = float(v)
        weights = Model(vector)

    rm_features = set()
    if FLAGS.rm_features:
        for l in open(FLAGS.rm_features):
            rm_features.add(l.strip())

    lm = Ngram.cmdline_ngram()
    if FLAGS.no_lm:
        lm = None

    if argv[1] == "train":
        local_decode = ChiangPerceptronDecoder(weights, lm)
    elif argv[1] == "sgd" or argv[1] == "crf":
        local_decode = MarginalDecoder(weights, lm)
    else:
        local_decode = MarginalDecoder(weights, lm)

    if FLAGS.add_features:
        tdm = local_features.TargetDataManager()
        local_decode.feature_adder = FeatureAdder(tdm)
    local_decode.prune_train = FLAGS.prune_train
    local_decode.use_pickle = FLAGS.pickleinput
    local_decode.cache_input = FLAGS.cache_input
    print >> logs, "Cache input is %s" % FLAGS.cache_input
    if FLAGS.debuglevel > 0:
        print >> logs, "beam size = %d" % FLAGS.beam

    if argv[1] == "train":
        if not FLAGS.dist:
            perc = trainer.Perceptron.cmdline_perc(local_decode)
        else:
            train_files = [FLAGS.prefix + file.strip() for file in sys.stdin]
            perc = distributed_trainer.DistributedPerceptron.cmdline_perc(local_decode)
            perc.set_training(train_files)
        perc.train()
    elif argv[1] == "sgd":
        crf = sgd.BaseCRF.cmdline_crf(local_decode)
        crf.set_oracle_files([FLAGS.oracle_forests])
        crf.train()

    elif argv[1] == "crf":
        if not FLAGS.dist:
            crf = CRF.LBFGSCRF.cmdline_crf(local_decode)
            crf.set_oracle_files([FLAGS.oracle_forests])
            crf.set_feature_mappers(add_features.read_features(FLAGS.feature_map_file))
            crf.rm_features(rm_features)
            if FLAGS.just_basic:
                print "Enforcing Basic"
                crf.enforce_just_basic()
            crf.train()
        else:
            # train_files = [FLAGS.prefix+file.strip() for file in sys.stdin]
            # oracle_files = [file+".oracle" for file in train_files]
            print >> sys.stderr, "DistributedCRF"
            crf = distCRF.DistributedCRF.cmdline_distibuted_crf(local_decode)
            # os.system("~/.python/bin/dumbo rm train_input -hadoop /home/nlg-03/mt-apps/hadoop/0.20.1+169.89/")
            # os.system("~/.python/bin/dumbo put "+crf.trainfiles[0]+" train_input -hadoop /home/nlg-03/mt-apps/hadoop/0.20.1+169.89/")
            crf.set_feature_mappers(add_features.read_features(FLAGS.feature_map_file))
            crf.rm_features(rm_features)
            if FLAGS.just_basic:
                print "Enforcing Basic"
                crf.enforce_just_basic()

            # crf.set_oracle_files(oracle_files)
            crf.train()

    else:
        if not FLAGS.dist:
            print "Evaluating"
            eval = Evaluator(local_decode, [FLAGS.dev])
            eval.tune()
        else:
            dev_files = [FLAGS.prefix + file.strip() for file in sys.stdin]
            eval = Evaluator(local_decode, dev_files)
        print eval.eval(verbose=True).compute_score()