def tabulate():
    """Accumulate weighted rule counts from the input files into module-level tables.

    Every path in the module-level ``inputs`` is read; a directory contributes
    each entry that ``os.listdir`` returns for it.  The rule blocks yielded by
    ``read_rule_blocks`` are folded into these globals, which are reset on
    every call:

    * ``allsum`` -- total weight of all rules, c(*)
    * ``xsum``   -- weight per left-hand side, c(lhs)
    * ``fsum``   -- weight per French side ``r.f``, for rules accepted by
      ``ffilter`` (or all rules when ``ffilter`` is None)
    * ``esum``   -- whole-block weight keyed by ``rules[0].e.handle()``,
      recorded only for blocks with at least one accepted rule
    * ``gram``   -- the accepted rules; a rule seen again is merged with ``+=``

    The weight of a rule is ``r.scores[0]``.  The input files are closed
    before the function returns, including when reading fails.
    """
    global fsum, esum, allsum, xsum, gram

    if log.level >= 1:
        sys.stderr.write("(3) Tabulating filtered phrases\n")
    count = 1

    # Expand directories into the files they contain.
    paths = []
    for path in inputs:
        if os.path.isdir(path):
            paths.extend(os.path.join(path, name) for name in os.listdir(path))
        else:
            paths.append(path)

    handles = []
    try:
        for path in paths:
            handles.append(open(path))

        fsum = {}     # c(french); the key is r.f alone, not (lhs, french)
        esum = {}     # c(english) for blocks that passed the filter
        allsum = 0.0  # c(*)
        xsum = {}     # c(lhs)
        gram = {}

        # read in all rules with matching english sides at the same time.
        # this way, we can sum only those english sides that ever appeared
        # with a french side that passes the filter.
        for rules in read_rule_blocks(handles):
            accepted = False
            blocksum = 0.
            for r in rules:
                weight = r.scores[0]
                allsum += weight
                blocksum += weight
                xsum[r.lhs] = xsum.get(r.lhs, 0.0) + weight

                if ffilter is None or ffilter.match(r.f):
                    fsum[r.f] = fsum.get(r.f, 0.0) + weight
                    if r in gram:
                        gram[r] += r
                    else:
                        gram[r] = r
                    accepted = True

                if log.level >= 1 and count % interval == 0:
                    sys.stderr.write(
                        "time: %f, memory: %s, rules in: %d, rules counted: %d\n"
                        % (monitor.cpu(), monitor.memory(), count, len(gram)))
                count += 1

            if accepted:
                ewordsnorm = rules[0].e.handle()
                # A repeated English side means the blocks were not contiguous.
                if ewordsnorm in esum:
                    sys.stderr.write("warning: files not sorted properly\n")
                esum[ewordsnorm] = blocksum
    finally:
        for handle in handles:
            handle.close()
# Command-line flags understood by this script.
flags.DEFINE_boolean("uniqstat", False, "print uniq states stat info")
flags.DEFINE_boolean("seq", False, "print action sequence")
flags.DEFINE_string("sim", None, "simulate action sequences from FILE",
                    short_name="s")
flags.DEFINE_boolean("profile", False, "profile")
flags.DEFINE_boolean("output", True,
                     "output parsed results (turn it off for timing data)")
flags.DEFINE_boolean("early", False, "use early update")
flags.DEFINE_string("fakemem", None, "read in a file to occupy memory")

argv = FLAGS(sys.argv)

# Imported after flag parsing, exactly where the original script does it.
from monitor import memory, human
start_mem = memory()

if FLAGS.fakemem:
    # The same file is loaded into two Model objects purely to occupy memory;
    # both names are kept so neither object is released.
    s = Model(FLAGS.fakemem)
    t = Model(FLAGS.fakemem)
    print >> logs, "memory usage after read in fake: ", human(memory(start_mem))

# A weights file is mandatory unless we are only simulating action sequences.
if FLAGS.weights is not None:
    model = Model(FLAGS.weights)
elif FLAGS.sim:
    model = None  # can simulate w/o a model
else:
    print >> logs, "Error: must specify a weights file" + str(FLAGS)
    sys.exit(1)
"simulate action sequences from FILE", short_name="s") flags.DEFINE_boolean("profile", False, "profile") flags.DEFINE_boolean( "output", True, "output parsed results (turn it off for timing data)") flags.DEFINE_boolean("early", False, "use early update") flags.DEFINE_string("fakemem", None, "read in a file to occupy memory") argv = FLAGS(sys.argv) from monitor import memory, human start_mem = memory() if FLAGS.fakemem: s = Model(FLAGS.fakemem) t = Model(FLAGS.fakemem) print >> logs, "memory usage after read in fake: ", human( memory(start_mem)) if FLAGS.weights is None: if not FLAGS.sim: print >> logs, "Error: must specify a weights file" + str(FLAGS) sys.exit(1) else: model = None # can simulate w/o a model else: model = Model(FLAGS.weights) #FLAGS.model, FLAGS.weights)
def train(self):
    """Run ``self.iter`` perceptron iterations, evaluating on dev after each one.

    Per iteration:

    1. Optionally reshuffle the training data (``Perceptron.shuffle``).
    2. Apply ``self.train_worker`` to every chunk in ``self.trainchunks`` --
       with the builtin ``map`` when ``Perceptron.singletrain`` is set,
       otherwise through a fresh ``Pool`` of ``self.ncpus`` processes.
    3. In pool mode, fold each worker's weight delta into
       ``Perceptron.weights`` scaled by
       ``size / self.trainsize * Perceptron.learning_rate`` and advance
       ``Perceptron.c``.
    4. If ``self.avg``, switch the weights to their averaged form for the
       dev evaluation and switch back afterwards.
    5. On a new best dev score either dump the weights immediately or, with
       ``FLAGS.finaldump``, keep a copy that is dumped once after the loop.

    Each worker result is unpacked as
    ``(dtime, size, (num_updates, early_updates, steps, bad_updates), weights)``.

    Several timing/memory locals are only read by diagnostics that are
    currently commented out; they are kept so those lines can be re-enabled.
    """
    start_mem = memory()
    starttime = time.time()

    if FLAGS.finaldump:
        Perceptron.best_weights = self.decoder.model.new_weights()

    # Report the first and last clock-speed lines of /proc/cpuinfo.  The
    # pipeline runs once, its handle is closed, and a machine on which grep
    # prints nothing no longer raises IndexError.
    cpuinfo = os.popen("cat /proc/cpuinfo|grep [GM]Hz")
    try:
        speed_lines = cpuinfo.readlines()
    finally:
        cpuinfo.close()
    if speed_lines:
        first_speed = speed_lines[0].strip().split(":")[-1]
        last_speed = speed_lines[-1].strip().split(":")[-1]
    else:
        first_speed = last_speed = " unknown"
    print >> logs, "%d CPUs at %s %s" % (cpu_count(), first_speed, last_speed)

    print >> logs, "starting perceptron at", time.ctime()

    best_prec = 0
    best_it = None  # stays None until some iteration beats best_prec
    acc_steps = 0
    for it in xrange(1, self.iter+1):
        Perceptron.curr = it
        #print >> logs, "iteration %d starts..............%s" % (it, time.ctime())

        curr_mem = memory()  # outside of multi
        #print >> logs, "memory usage at iter %d before pool: %s" % (it, human(memory(start_mem)))

        iterstarttime = time.time()

        if Perceptron.shuffle:
            self.shuffle_train()

        if not Perceptron.singletrain:
            pool = Pool(processes=self.ncpus)
        pool_time = time.time() - iterstarttime

        num_updates, early_updates, total_steps, bad_updates = 0, 0, 0, 0
        #print >> logs, "memory usage at iter %d after pool: %s" % (it, human(memory(start_mem)))

        tt = time.time()
        #print >> logs, "before para time...", tt
        if Perceptron.singletrain:
            results = map(self.train_worker, self.trainchunks)
        else:
            results = pool.map(self.train_worker, self.trainchunks, chunksize=1)

        if FLAGS.mydouble:
            print >> logs, \
                  "|w|=", len(Perceptron.weights), "|avgw|=", len(Perceptron.weights) if FLAGS.avg else 0, \
                  "|dw|=", len(results[0][-1])

        print >> logs, "after para time...", time.time()
        compute_time = time.time() - tt

        copy_time = 0
        para_times = []
        for dtime, size, (_num_updates, _early_updates, _steps, _bad_updates), _weights in results:
            num_updates += _num_updates
            early_updates += _early_updates
            total_steps += _steps
            bad_updates += _bad_updates

            # NOTE(review): this needs true division to give a fractional
            # factor; confirm the module imports division from __future__
            # or that size/trainsize are floats.
            factor = size / self.trainsize * Perceptron.learning_rate

            tt = time.time()
            if not Perceptron.singletrain:  # singletrain: updated in place in one_pass_on_train()
                Perceptron.weights.iaddc(_weights, factor)

            del _weights  # drop the worker's delta as soon as it is merged

            copy_time += time.time() - tt
            para_times.append(dtime)

        del results

        if not Perceptron.singletrain:
            pool.close()
            pool.join()
        #print >> logs, "gc can't reach", gc.collect()
        #print >> logs, "pool_time= %.1f s, compute_walltime= %.1f s, compute_cputime= %.1f (%s), copy_time= %.1f s" \
        #    % (pool_time, compute_time, sum(para_times), " ".join("%.1f" % x for x in para_times), copy_time)
        #print >> logs, "memory usage at iter %d after fork: %s" % (it, human(memory(start_mem)))

        if not Perceptron.singletrain:  # N.B.: in non-multiproc mode, self.c is updated in place
            Perceptron.c += self.trainsize / self.ncpus
            #print >> logs, "self.c=", Perceptron.c

        ##print >> logs, "w =", Perceptron.weights
        iterendtime = time.time()
        #print >> logs, "memory usage at iter %d: extra %s, total %s" % (it,
        #                                                                 human(memory(curr_mem)),
        #                                                                 human(memory(start_mem)))
        #if FLAGS.debuglevel >= 1:
        #    print >> logs, "weights=", Perceptron.weights

        curr_mem = memory()
        #print >> logs, "iteration %d training finished at %s. now evaluating on dev..." % (it, time.ctime())

        if self.avg:
            # Switch to averaged weights for evaluation; undone further down.
            Perceptron.weights.set_avg(Perceptron.c)

        avgendtime = time.time()
        #print >> logs, "avg weights (trim) took %.1f seconds." % (avgendtime - iterendtime)

        self.decoder.model.weights = Perceptron.weights  # OK if noavg; see above
        Parser.State.model.weights = Perceptron.weights  # multiprocessing: State.model is static

        prec = self.eval_on_dev()
        # NOTE(review): under the Python 2 print statement this prints a
        # tuple to stdout rather than a line to `logs`; left as found.
        print ("Eval on dev without averaging weights, iteration:"+str(it)+" prec:",str(prec))
        #print >> logs, "eval on dev took %.1f seconds." % (time.time() - avgendtime)

        acc_steps += total_steps
        logs.flush()

        if prec > best_prec:
            best_prec = prec
            best_it = it
            best_wlen = len(Perceptron.weights)
            if not FLAGS.finaldump:
                #print >> logs, "new high at iteration {0}: {1}. Dumping Weights...".format(it, prec)
                self.dump(Perceptron.weights, it)
            else:
                Perceptron.best_weights = Perceptron.weights.copy()

        if self.avg:
            Perceptron.weights.reset_avg(Perceptron.c)  # restore non-avg

        print >> logs, "gc can't reach", gc.collect()
        logs.flush()  # for hpc

    #print >> logs, "peaked at iteration {0}: {1}, |bestw|= {2}.".format(best_it, best_prec, best_wlen)
    if FLAGS.finaldump:
        if best_it is None:
            # Previously an unbound `best_it` raised here.
            print >> logs, "no iteration improved on the initial dev score; nothing to dump"
        else:
            self.dump(Perceptron.best_weights, best_it)
if opts.french_parse_file: french_parse_file = open(makefilename(opts.french_parse_file), "w") else: french_parse_file = None if opts.english_parse_file: english_parse_file = open(makefilename(opts.english_parse_file), "w") else: english_parse_file = None if not opts.parallel or parallel.rank != parallel.master: thedecoder = make_decoder() if log.level >= 1: gc.collect() log.write("all structures loaded, memory=%s\n" % (monitor.memory(),)) def process(sent): goal = thedecoder.translate(sent) thedecoder.process_output(sent, goal) if goal is None: log.writeln("warning: parse failure") return None if opts.forest_dir: forest_file = gzip.open(os.path.join(opts.forest_dir, "forest.%s.gz" % sent.id), "w") forest_file.write( forest.forest_to_json( goal, fwords=sent.words, mode="english", models=thedecoder.models, weights=thedecoder.weights
def process(sent):
    """Decode one sentence, run a max-margin weight update on it, and return it.

    Steps, in order:

    1. Set ``sent.flen`` from ``sent.compute_distance()`` and hand the
       sentence to the oracle.
    2. Decode.  Any exception (including a ``None`` goal, reported as
       "parse failure") is logged; on the third consecutive counted failure
       it is re-raised, otherwise the function returns ``None``.  The counter
       is reset whenever ``translate``/``process_output`` return normally.
    3. Rescore the forest with the oracle models and log the 1-best.
    4. Build a quadratic program from this sentence's forest instance, any
       instances received from other nodes (parallel mode) and cached
       hypotheses, then optimize ``thedecoder.weights`` in place.
    5. Update the averaging accumulators, optional feature scales, send this
       instance's hypotheses to the other nodes, and refresh the cache.

    Returns ``sent`` with ``score_comps`` and ``ewords`` filled in.
    """
    global decoder_errors, nweights, sumweights_helper
    global sum_updates2, n_updates, requests

    # Need to add an flen attribute that gives the length of the input sentence.
    # In the lattice-decoding case, we have to make a guess.
    distance = sent.compute_distance()
    sent.flen = distance.get((0, sent.n - 1), None)  # could be missing if n == 0

    theoracle.input(sent)

    try:
        goal = thedecoder.translate(sent)
        thedecoder.process_output(sent, goal)
        decoder_errors = 0
        if goal is None:
            raise Exception("parse failure")
    except Exception:
        import traceback
        log.write("decoder raised exception: %s" % traceback.format_exc())
        decoder_errors += 1
        if decoder_errors >= 3:
            log.write("decoder failed too many times, passing exception through!\n")
            raise
        return

    goal.rescore(theoracle.models, thedecoder.weights, add=True)

    # Log the current 1-best with its model cost and oracle loss.
    best_vector, best = decoder.get_nbest(goal, 1)[0]
    best_mvector = theoracle.clean(best_vector)
    best_ovector = theoracle.finish(best_vector, best)
    best_loss = theoracle.make_weights(additive="sentence").dot(best_ovector)
    best_words = " ".join(sym.tostring(e) for e in best)
    model_cost = thedecoder.weights.dot(best_mvector)
    log.write("best hyp: %s %s cost=%s loss=%s\n" %
              (best_words, best_vector, model_cost, best_loss))

    # Set up quadratic program
    qp = maxmargin.QuadraticProgram()
    cur_instance = ForestInstance(sent.id, goal)
    qp.add_instance(cur_instance)

    if opts.parallel:
        # Drain every update that other nodes have already sent us.
        while MPI.COMM_WORLD.Iprobe(tag=1, source=MPI.ANY_SOURCE):
            log.writeln("received update...\n")
            recv_instance = MPI.COMM_WORLD.recv(tag=1, source=MPI.ANY_SOURCE)
            log.writeln("received update for %s" % (recv_instance.instance_id,))
            # need to check for duplicate instances?
            qp.add_instance(recv_instance)

    # Add cached hyps
    if cache_hyps:
        for instance in qp.instances:
            cached = hyp_cache[instance.instance_id]
            if len(cached) > 0:
                log.writeln("retrieved %d cached hyps for %s" %
                            (len(cached), instance.instance_id))
            for hyp in cached:
                instance.add_hyp(hyp)

    # Make oracle weight vector
    oweights = theoracle.make_weights(additive="sentence")
    oweights *= -1

    # Make vector of learning rates
    # We have to be careful to assign a learning rate to every possible feature
    # This is not very efficient
    feats = set()
    for item in goal.bottomup():
        for ded in item.deds:
            feats.update(ded.dcost)
    for instance in qp.instances:
        for hyp in instance.hyps:
            feats.update(hyp.mvector)
    learning_rates = svector.Vector()
    for feat in feats:
        learning_rates[feat] = compute_feature_learning_rate(feat)
    if log.level >= 3:
        log.writeln("learning rate vector: %s" % learning_rates)

    qp.optimize(thedecoder.weights, oweights, learning_rate=learning_rates)

    thedecoder.weights.compact()
    log.write("feature weights: %s\n" % (thedecoder.weights * watch_features))

    # update weight sum for averaging
    # sumweights_helper = \sum_{i=0}^n (i \Delta w_i)
    sumweights_helper += nweights * qp.delta_mweights()
    nweights += 1

    # update feature scales
    if update_feature_scales:
        for instance in qp.instances:
            # An earlier formulation, kept in the original as a bare string:
            #   u = svector.Vector(instance.hope.mvector)
            #   for hyp in instance.hyps:
            #       u -= hyp.alpha*hyp.mvector
            #   sum_updates2 += u*u
            for hyp in instance.hyps:
                if hyp is instance.hope:
                    continue  # hyp = instance.hope is a non-update
                u = instance.hope.mvector - hyp.mvector
                sum_updates2 += hyp.alpha * (u * u)
                n_updates += hyp.alpha

        #log.write("sum of squared updates: %s\n" % (" ".join("%s=%s" % (f,sum_updates2[f]) for f in watch_features)))
        rates = " ".join("%s=%s" % (f, compute_feature_learning_rate(f))
                         for f in watch_features)
        log.write("feature learning rates: %s\n" % (rates))

    if opts.parallel:
        # flush out filled requests
        requests = [pending for pending in requests if not pending.Test()]

        # transmit updates to other nodes
        # make a plain Instance (without forest)
        # we used to designate a hope translation,
        #send_instance = maxmargin.Instance(cur_instance.hyps, hope=cur_instance.hope, instance_id=cur_instance.sentid)
        # but now are letting the other node choose.
        send_instance = maxmargin.Instance(cur_instance.hyps,
                                           instance_id=cur_instance.sentid)
        for node in parallel.slaves:
            if node == parallel.rank:
                continue
            requests.append(MPI.COMM_WORLD.isend(send_instance, dest=node, tag=1))

    # save all hyps for next time
    if cache_hyps:
        epsilon = 0.01
        for instance in qp.instances:
            cached = hyp_cache[instance.instance_id]
            for hyp in instance.hyps:
                # A hyp is already cached when some cached hyp is within
                # epsilon of it in both model and oracle vectors.
                already_cached = any(
                    (hyp.mvector - hyp1.mvector).normsquared() <= epsilon and
                    (hyp.ovector - hyp1.ovector).normsquared() <= epsilon
                    for hyp1 in cached)
                if not already_cached:
                    if log.level >= 2:
                        log.writeln("add hyp to cache: %s" % hyp)
                    cached.append(hyp)

    theoracle.update(best_ovector)
    sent.score_comps = best_ovector

    if log.level >= 1:
        gc.collect()
        log.write("done updating, memory = %s\n" % monitor.memory())

    sent.ewords = [sym.tostring(e) for e in best]

    return sent
# Load the configuration by executing the file named in opts.config.
# NOTE(review): execfile runs arbitrary code from that file; it must only
# ever point at a trusted configuration.
if log.level >= 1:
    log.write("Reading configuration from %s\n" % opts.config)
execfile(opts.config)

# Positional arguments select input and output; a missing argument or "-"
# selects the corresponding standard stream.
use_stdin = len(args) < 1 or args[0] == "-"
input_file = sys.stdin if use_stdin else file(args[0], "r")

use_stdout = len(args) < 2 or args[1] == "-"
output_file = sys.stdout if use_stdout else file(args[1], "w")

gc.collect()

if log.level >= 1:
    log.write("all structures loaded, memory %s, time %s\n" %
              (monitor.memory(), monitor.cpu()))
    model_names = " ".join(str(x.name) for x in models)
    log.write("models: %s\n" % (model_names))

# Extract a grammar for every sentence; a leading <seg> mark is removed and
# its raw attributes are kept on the sentence as `meta`.
sents = sgml.read_raw(input_file)
for sent in sents:
    mark = sent.getmark()
    if mark is not None:
        tag, attrs = mark
        if tag == "seg":
            sent.unmark()
            dattrs = sgml.attrs_to_dict(attrs)
            sent.meta = attrs
    extract_grammar(sent)
# First pass: count identical lines.  Whenever the table reaches
# opts.max_types distinct lines it is written, sorted, to a temporary file
# and started afresh.
count = {}
files = []
for words in input:
    line = " ".join(words)
    key = " ".join(words[keystart:keystop]) if opts.key else line

    # In parallel mode only the keys hashing to this process's residue are kept.
    if opts.parallel and myhash.myhash(key, modulus) != residue:
        continue

    count[line] = count.get(line, 0) + 1
    if len(count) < opts.max_types:
        continue

    if opts.verbose:
        sys.stderr.write("writing counts to temporary file (memory=%s)\n" %
                         monitor.memory())
    f = tempfile.TemporaryFile()
    for key in sorted(count):
        f.write("%s\t%s\n" % (count[key], key))
    f.seek(0)  # rewind so the file can be read back later
    files.append(f)
    count = {}

fileinput.close()

sys.stderr.write("merging %d files to output (memory=%s)\n" %
                 (len(files), monitor.memory()))

heap = []
def process(sent):
    """Decode one sentence, run a max-margin weight update on it, and return it.

    Steps, in order:

    1. Set ``sent.flen`` from ``sent.compute_distance()`` and hand the
       sentence to the oracle.
    2. Decode.  Any exception (including a ``None`` goal, reported as
       "parse failure") is logged; on the third consecutive counted failure
       it is re-raised, otherwise the function returns ``None``.  The counter
       is reset whenever ``translate``/``process_output`` return normally.
    3. Rescore the forest with the oracle models and log the 1-best.
    4. Build a quadratic program from this sentence's forest instance, any
       instances received from other nodes (parallel mode) and cached
       hypotheses, then optimize ``thedecoder.weights`` in place.
    5. Update the averaging accumulators, optional feature scales, send this
       instance's hypotheses to the other nodes, and refresh the cache.

    Returns ``sent`` with ``score_comps`` and ``ewords`` filled in.
    """
    global decoder_errors, nweights, sumweights_helper
    global sum_updates2, n_updates, requests

    # Need to add an flen attribute that gives the length of the input sentence.
    # In the lattice-decoding case, we have to make a guess.
    distance = sent.compute_distance()
    sent.flen = distance.get((0, sent.n - 1), None)  # could be missing if n == 0

    theoracle.input(sent)

    try:
        goal = thedecoder.translate(sent)
        thedecoder.process_output(sent, goal)
        decoder_errors = 0
        if goal is None:
            raise Exception("parse failure")
    except Exception:
        import traceback
        log.write("decoder raised exception: %s" % traceback.format_exc())
        decoder_errors += 1
        if decoder_errors >= 3:
            log.write("decoder failed too many times, passing exception through!\n")
            raise
        return

    goal.rescore(theoracle.models, thedecoder.weights, add=True)

    # Log the current 1-best with its model cost and oracle loss.
    best_vector, best = decoder.get_nbest(goal, 1)[0]
    best_mvector = theoracle.clean(best_vector)
    best_ovector = theoracle.finish(best_vector, best)
    best_loss = theoracle.make_weights(additive="sentence").dot(best_ovector)
    best_words = " ".join(sym.tostring(e) for e in best)
    model_cost = thedecoder.weights.dot(best_mvector)
    log.write("best hyp: %s %s cost=%s loss=%s\n" %
              (best_words, best_vector, model_cost, best_loss))

    # Set up quadratic program
    qp = maxmargin.QuadraticProgram()
    cur_instance = ForestInstance(sent.id, goal)
    qp.add_instance(cur_instance)

    if opts.parallel:
        # Drain every update that other nodes have already sent us.
        while MPI.COMM_WORLD.Iprobe(tag=1, source=MPI.ANY_SOURCE):
            log.writeln("received update...\n")
            recv_instance = MPI.COMM_WORLD.recv(tag=1, source=MPI.ANY_SOURCE)
            log.writeln("received update for %s" % (recv_instance.instance_id,))
            # need to check for duplicate instances?
            qp.add_instance(recv_instance)

    # Add cached hyps
    if cache_hyps:
        for instance in qp.instances:
            cached = hyp_cache[instance.instance_id]
            if len(cached) > 0:
                log.writeln("retrieved %d cached hyps for %s" %
                            (len(cached), instance.instance_id))
            for hyp in cached:
                instance.add_hyp(hyp)

    # Make oracle weight vector
    oweights = theoracle.make_weights(additive="sentence")
    oweights *= -1

    # Make vector of learning rates
    # We have to be careful to assign a learning rate to every possible feature
    # This is not very efficient
    feats = set()
    for item in goal.bottomup():
        for ded in item.deds:
            feats.update(ded.dcost)
    for instance in qp.instances:
        for hyp in instance.hyps:
            feats.update(hyp.mvector)
    learning_rates = svector.Vector()
    for feat in feats:
        learning_rates[feat] = compute_feature_learning_rate(feat)
    if log.level >= 3:
        log.writeln("learning rate vector: %s" % learning_rates)

    qp.optimize(thedecoder.weights, oweights, learning_rate=learning_rates)

    thedecoder.weights.compact()
    log.write("feature weights: %s\n" % (thedecoder.weights * watch_features))

    # update weight sum for averaging
    # sumweights_helper = \sum_{i=0}^n (i \Delta w_i)
    sumweights_helper += nweights * qp.delta_mweights()
    nweights += 1

    # update feature scales
    if update_feature_scales:
        for instance in qp.instances:
            # An earlier formulation, kept in the original as a bare string:
            #   u = svector.Vector(instance.hope.mvector)
            #   for hyp in instance.hyps:
            #       u -= hyp.alpha*hyp.mvector
            #   sum_updates2 += u*u
            for hyp in instance.hyps:
                if hyp is instance.hope:
                    continue  # hyp = instance.hope is a non-update
                u = instance.hope.mvector - hyp.mvector
                sum_updates2 += hyp.alpha * (u * u)
                n_updates += hyp.alpha

        #log.write("sum of squared updates: %s\n" % (" ".join("%s=%s" % (f,sum_updates2[f]) for f in watch_features)))
        rates = " ".join("%s=%s" % (f, compute_feature_learning_rate(f))
                         for f in watch_features)
        log.write("feature learning rates: %s\n" % (rates))

    if opts.parallel:
        # flush out filled requests
        requests = [pending for pending in requests if not pending.Test()]

        # transmit updates to other nodes
        # make a plain Instance (without forest)
        # we used to designate a hope translation,
        #send_instance = maxmargin.Instance(cur_instance.hyps, hope=cur_instance.hope, instance_id=cur_instance.sentid)
        # but now are letting the other node choose.
        send_instance = maxmargin.Instance(cur_instance.hyps,
                                           instance_id=cur_instance.sentid)
        for node in parallel.slaves:
            if node == parallel.rank:
                continue
            requests.append(MPI.COMM_WORLD.isend(send_instance, dest=node, tag=1))

    # save all hyps for next time
    if cache_hyps:
        epsilon = 0.01
        for instance in qp.instances:
            cached = hyp_cache[instance.instance_id]
            for hyp in instance.hyps:
                # A hyp is already cached when some cached hyp is within
                # epsilon of it in both model and oracle vectors.
                already_cached = any(
                    (hyp.mvector - hyp1.mvector).normsquared() <= epsilon and
                    (hyp.ovector - hyp1.ovector).normsquared() <= epsilon
                    for hyp1 in cached)
                if not already_cached:
                    if log.level >= 2:
                        log.writeln("add hyp to cache: %s" % hyp)
                    cached.append(hyp)

    theoracle.update(best_ovector)
    sent.score_comps = best_ovector

    if log.level >= 1:
        gc.collect()
        log.write("done updating, memory = %s\n" % monitor.memory())

    sent.ewords = [sym.tostring(e) for e in best]

    return sent
if opts.french_parse_file: french_parse_file = open(makefilename(opts.french_parse_file), "w") else: french_parse_file = None if opts.english_parse_file: english_parse_file = open(makefilename(opts.english_parse_file), "w") else: english_parse_file = None if not opts.parallel or parallel.rank != parallel.master: thedecoder = make_decoder() if log.level >= 1: gc.collect() log.write("all structures loaded, memory=%s\n" % (monitor.memory(), )) def process(sent): goal = thedecoder.translate(sent) thedecoder.process_output(sent, goal) if goal is None: log.writeln("warning: parse failure") return None if opts.forest_dir: forest_file = gzip.open( os.path.join(opts.forest_dir, "forest.%s.gz" % sent.id), "w") forest_file.write( forest.forest_to_json(goal,
def train(self):
    """Run ``self.iter`` perceptron passes, evaluating on dev after each one.

    After every pass over the training data (``self.one_pass_on_train``) the
    weights used for evaluation are ``self.avg_weights()`` when ``self.avg``
    is set and ``self.weights`` otherwise.  They are installed on
    ``self.decoder.model`` for ``self.eval_on_dev`` and dumped whenever the
    dev score exceeds the best so far; the decoder is then pointed back at
    the non-averaged ``self.weights``.

    Progress, timing and memory figures are written to ``logs``.  The final
    summary is printed even when no iteration improved on the initial score
    or no iteration ran at all (it then reports iteration 0).
    """
    start_mem = memory()
    starttime = time.time()

    print >> logs, "starting perceptron at", time.ctime()

    best_prec = 0
    # Defaults so the summary below cannot hit an unbound local when dev
    # precision never rises above 0 or self.iter < 1.
    best_it = 0
    best_wlen = 0
    it = 0
    for it in xrange(1, self.iter + 1):

        print >> logs, "iteration %d starts..............%s" % (it, time.ctime())

        curr_mem = memory()
        iterstarttime = time.time()
        num_updates, early_updates = self.one_pass_on_train()
        iterendtime = time.time()

        print >> logs, "memory usage at iter %d: extra %s, total %s" % (
            it, human(memory(curr_mem)), human(memory(start_mem)))
        curr_mem = memory()

        print >> logs, "iteration %d training finished at %s. now evaluating on dev..." % (
            it, time.ctime())
        avgweights = self.avg_weights() if self.avg else self.weights
        avgendtime = time.time()
        print >> logs, "avg weights (trim) took %.1f seconds." % (avgendtime - iterendtime)
        if FLAGS.debuglevel >= 2:
            print >> logs, "avg w=", avgweights

        # Evaluate with the (possibly averaged) weights.
        self.decoder.model.weights = avgweights
        prec = self.eval_on_dev()
        print >> logs, "eval on dev took %.1f seconds." % (time.time() - avgendtime)

        print >> logs, "at iteration {0}, updates= {1} (early {4}), dev= {2}, |w|= {3}, time= {5:.3f}h acctime= {6:.3f}h"\
              .format(it, num_updates, prec, len(avgweights), early_updates, \
                      (time.time() - iterstarttime)/3600, (time.time() - starttime)/3600.)
        logs.flush()

        if prec > best_prec:
            best_prec = prec
            best_it = it
            best_wlen = len(avgweights)
            print >> logs, "new high at iteration {0}: {1}. Dumping Weights...".format(it, prec)
            self.dump(avgweights)

        self.decoder.model.weights = self.weights  # restore non-avg

        del avgweights
        gc.collect()

        if FLAGS.mydouble:
            from mydouble import counts
            print >> logs, "mydouble usage and freed: %d %d" % counts()

    print >> logs, "peaked at iteration {0}: {1}, |bestw|= {2}.".format(
        best_it, best_prec, best_wlen)
    print >> logs, "perceptron training of %d iterations finished on %s (took %.2f hours)" % \
          (it, time.ctime(), (time.time() - starttime)/3600.)
# the input. The output is sorted, just like # sort | uniq -c. count = {} files = [] for words in input: line = " ".join(words) if opts.key: key = " ".join(words[keystart:keystop]) else: key = line if opts.parallel and myhash.myhash(key,modulus) != residue: continue count[line] = count.get(line, 0)+1 if len(count) >= opts.max_types: if opts.verbose: sys.stderr.write("writing counts to temporary file (memory=%s)\n" % monitor.memory()) keys = count.keys() keys.sort() f = tempfile.TemporaryFile() for key in keys: f.write("%s\t%s\n" % (count[key], key)) f.seek(0) files.append(f) count = {} del keys fileinput.close() sys.stderr.write("merging %d files to output (memory=%s)\n" % (len(files), monitor.memory())) heap = [] for f in files:
def train(self):
    """Run ``self.iter`` averaged-perceptron passes, evaluating on dev after each.

    Per iteration: reset ``self.decoder.num_edges``, run
    ``self.one_pass_on_train()``, switch ``self.weights`` to its averaged
    form in place (``set_avg``) when ``self.avg`` is set, evaluate on dev,
    and switch back (``reset_avg``).  On a new best dev score the weights
    are dumped immediately, or -- with ``FLAGS.dump_last`` -- deep-copied to
    ``self.bestweights`` and dumped once after the last iteration.

    If no iteration ever beats the initial ``best_prec`` of 0, the summary
    says so and nothing is dumped, instead of failing on unbound locals,
    ``(0).details()`` or a missing ``self.bestweights``.
    """
    start_mem = memory()
    starttime = time.time()

    print >> logs, "starting perceptron at", time.ctime()

    best_prec = 0
    best_it = None  # stays None until some iteration beats best_prec
    it = 0          # keeps the closing report valid when self.iter < 1
    for it in xrange(1, self.iter + 1):

        print >> logs, "iteration %d starts..............%s" % (it, time.ctime())

        curr_mem = memory()
        iterstarttime = time.time()
        self.decoder.num_edges = 0
        num_updates, early_updates, num_steps = self.one_pass_on_train()
        iterendtime = time.time()

        print >> logs, "memory usage at iter %d: extra %s, total %s" % (
            it, human(memory(curr_mem)), human(memory(start_mem)))
        if FLAGS.debuglevel >= 1:
            print >> logs, "weights=", self.weights

        curr_mem = memory()

        print >> logs, "iteration %d training finished at %s. now evaluating on dev..." % (
            it, time.ctime())

        avgtime = 0
        timer = Mytime()
        if self.avg:
            # Averaging is done in place and undone after evaluation.
            self.weights.set_avg(self.c)
            avgtime += timer.gap()
            if FLAGS.debuglevel >= 1:
                print >> logs, "avgweights=", self.weights

        prec = self.eval_on_dev()
        print >> logs, "eval on dev took %.1f seconds." % timer.gap()

        # num_steps and num_edges are passed as {8} and {9} but the template
        # does not reference them; kept as in the original.
        print >> logs, "at iteration {0}, updates= {1} (early {4}), dev= {2}{7}, |w|= {3}, time= {5:.3f}h acctime= {6:.3f}h, root={10:.1%}"\
              .format(it, num_updates, prec, len(self.weights), early_updates,
                      (time.time() - iterstarttime)/3600,
                      (time.time() - starttime)/3600.,
                      "+" if prec > best_prec else "",
                      num_steps, self.decoder.num_edges,
                      prec.root())
        logs.flush()

        if prec > best_prec:
            best_prec = prec
            best_it = it
            best_wlen = len(self.weights)
            best_time = time.time() - starttime
            print >> logs, "new high at iteration {0}: {1}. Dumping Weights...".format(it, prec)
            if not FLAGS.dump_last:
                self.dump(self.weights)
            else:
                self.bestweights = self.weights.deepcopy()

        if self.avg:
            timer = Mytime()
            self.weights.reset_avg(self.c)  # restore weights
            t = timer.gap()
            print >> logs, "avg weights (set/reset) took %.1f+%.1f=%.1f seconds." % (
                avgtime, t, avgtime + t)

        gc.collect()

        if FLAGS.mydouble:
            from mydouble import counts
            print >> logs, "mydouble usage and freed: %d %d" % counts()

    if best_it is None:
        print >> logs, "dev score never improved on the initial best; no weights were dumped."
    else:
        print >> logs, "peaked at iteration {0}: {1} ({3:.1f}h), |bestw|= {2}.".format(
            best_it, best_prec, best_wlen, best_time / 3600)
        print >> logs, best_prec.details()

    print >> logs, "perceptron training of %d iterations finished on %s (took %.2f hours)" % \
          (it, time.ctime(), (time.time() - starttime)/3600.)

    # self.bestweights exists only if some iteration set a new high.
    if FLAGS.dump_last and best_it is not None:
        self.dump(self.bestweights)
def main():
    # Driver: read sentences from shell_input(), parse each one (or replay a
    # recorded action sequence when FLAGS.sim is given), print the result to
    # stdout, and write per-sentence and aggregate statistics to `logs`.
    # Relies on module-level `model`, `start_mem`, `logs`, `mytime` and FLAGS.

    if FLAGS.sim is not None:
        # One line of space-separated integer actions is read per sentence.
        # NOTE(review): this file is never closed explicitly.
        sequencefile = open(FLAGS.sim)

    parser = Parser(model, b=FLAGS.beam)

    print >> logs, "memory usage before parsing: ", human(memory(start_mem))

    # Running totals over all sentences.
    totalscore = 0
    totalstates = 0
    totaluniq = 0
    totaledges = 0
    totaltime = 0

    totalprec = DepVal()
    totaloracle = DepVal()

    print >> logs, "gc.collect unreachable: %d" % gc.collect()

    if FLAGS.manual_gc:
        # Automatic collection off; collected by hand every FLAGS.gc sentences.
        gc.disable()

    i = 0  # stays 0 if the input is empty (checked after the loop)
    gctime = 0
    for i, line in enumerate(shell_input(), 1):

        if FLAGS.manual_gc and i % FLAGS.gc == 0:
            print >> logs, "garbage collection...",
            tt = time.time()
            print >> logs, "gc.collect unreachable: %d" % gc.collect()
            tt = time.time() - tt
            print >> logs, "took %.1f seconds" % tt
            gctime += tt

        line = line.strip()
        # NOTE(review): a blank input line makes line[0] raise IndexError.
        if line[0]=="(":
            # input is a gold tree (so that we can evaluate)
            reftree = DepTree.parse(line)
            sentence = DepTree.sent # assigned in DepTree.parse()
        else:
            # input is word/tag list
            reftree = None
            sentence = [tuple(x.rsplit("/", 1)) for x in line.split()] # split by default returns list
            DepTree.sent = sentence

        if FLAGS.debuglevel >= 1:
            print >> logs, sentence
            print >> logs, reftree

        mytime.zero()  # restart the per-sentence timer read by mytime.period() below

        if FLAGS.sim is not None: # simulation, not parsing
            actions = map(int, sequencefile.readline().split())
            goal, feats = parser.simulate(actions, sentence) #if model is None score=0
            print >> logs, feats
            score, tree = goal.score, goal.top()
            (nstates, nedges, nuniq) = (0, 0, 0)
        else: # real parsing
            if True: #FLAGS.earlystop:
                refseq = reftree.seq() if reftree is not None else None
                tree, myseq, score, _ = parser.try_parse(sentence, refseq, update=False)
                if FLAGS.early:
                    print >> logs, "ref=", refseq
                    print >> logs, "myt=", myseq

                    # NOTE(review): refseq is None when the input had no gold
                    # tree, in which case this slice would fail -- confirm
                    # FLAGS.early is only used with gold-tree input.
                    refseq = refseq[:len(myseq)] # truncate
                    _, reffeats = parser.simulate(refseq, sentence)
                    _, myfeats = parser.simulate(myseq, sentence)
                    print >> logs, "+feats", reffeats
                    print >> logs, "-feats", myfeats

                nstates, nedges, nuniq = parser.stats()
            else:
                # Unreachable while the condition above is the literal True.
                goal = parser.parse(sentence)
                nstates, nedges, nuniq = parser.stats()

##        score, tree = goal.score, goal.top()
#        score, tree = mytree

        dtime = mytime.period()

        if not FLAGS.early and not FLAGS.profile:
            if FLAGS.forest:
                parser.dumpforest(i)
            elif FLAGS.output:
                if not FLAGS.kbest:
                    print tree
                else:
                    # k-best: header line, one "score<TAB>tree" line per
                    # state, then a blank line.
                    stuff = parser.beams[-1][:FLAGS.kbest]
                    print "sent.%d\t%d" % (i, len(stuff))
                    for state in stuff:
                        print "%.2f\t%s" % (state.score, state.tree())
                    print

            if FLAGS.oracle:
                oracle, oracletree = parser.forestoracle(reftree)
                totaloracle += oracle

        prec = DepTree.compare(tree, reftree) # OK if either is None

        searched = sum(x.derivation_count() for x in parser.beams[-1]) if FLAGS.forest else 0
        print >> logs, "sent {i:-4} (len {l}):\tmodelcost= {c:.2f}\tprec= {p:.2%}"\
              "\tstates= {ns} (uniq {uq})\tedges= {ne}\ttime= {t:.3f}\tsearched= {sp}" \
              .format(i=i, l=len(sentence), c=score, p=prec.prec(), \
                      ns=nstates, uq=nuniq, ne=nedges, t=dtime, sp=searched)
        if FLAGS.seq:
            # Replay the action sequence and check it reproduces the tree,
            # score and precision.
            # NOTE(review): `goal` is bound only in the simulation branch (and
            # in the unreachable else above), so FLAGS.seq without FLAGS.sim
            # looks like it would raise NameError -- confirm.
            actions = goal.all_actions()
            print >> logs, " ".join(actions)
            check = simulate(actions, sentence, model) #if model is None score=0
            checkscore = check.score
            checktree = check.top()
            print >> logs, checktree
            checkprec = checktree.evaluate(reftree)
            print >> logs, "verify: tree:%s\tscore:%s\tprec:%s" % (tree == checktree, score == checkscore, prec == checkprec)
            print >> logs, "sentence %-4d (len %d): modelcost= %.2lf\tprec= %.2lf\tstates= %d (uniq %d)\tedges= %d\ttime= %.3lf" % \
                  (i, len(sentence), checkscore, checkprec.prec100(), nstates, nuniq, nedges, dtime)

        totalscore += score
        totalstates += nstates
        totaledges += nedges
        totaluniq += nuniq
        totaltime += dtime

        totalprec += prec

    if i == 0:
        print >> logs, "Error: empty input."
        sys.exit(1)

    if FLAGS.featscache:
        print >> logs, "feature constructions: tot= %d shared= %d (%.2f%%)" % (State.tot, State.shared, State.shared / State.tot * 100)

    # Aggregate report: per-sentence averages over the i sentences read.
    print >> logs, "beam= {b}, avg {a} sents,\tmodelcost= {c:.2f}\tprec= {p:.2%}" \
          "\tstates= {ns:.1f} (uniq {uq:.1f})\tedges= {ne:.1f}\ttime= {t:.4f}\n{d:s}" \
          .format(b=FLAGS.b, a=i, c=totalscore/i, p=totalprec.prec(),
                  ns=totalstates/i, uq=totaluniq/i, ne=totaledges/i, t=totaltime/i,
                  d=totalprec.details())

    if FLAGS.uniqstat:
        # The loop reuses `i`; the sentence count is not needed past here.
        for i in sorted(uniqstats):
            print >> logs, "%d\t%.1lf\t%d\t%d" % \
                  (i, sum(uniqstats[i]) / len(uniqstats[i]), \
                   min(uniqstats[i]), max(uniqstats[i]))

    if FLAGS.oracle:
        print >> logs, "oracle= ", totaloracle

    if FLAGS.manual_gc:
        print >> logs, "garbage collection took %.1f seconds" % gctime

    print >> logs, "memory usage after parsing: ", human(memory(start_mem))
    if FLAGS.mydouble:
        from mydouble import counts
        print >> logs, "mydouble usage and freed: %d %d" % counts()
def main(argv=None):
    '''Call this from the command-line to create a pre-computed binary data array for later use.

    argv -- full argument vector with the program name at index 0; when None,
            sys.argv is used.

    Returns 1 if a -r keyword argument is malformed, unknown or repeated, and
    None after a completed read/write cycle.  Prints the usage text and exits
    with status 1 when more than one of -d/-s/-a/-p is given or when the
    number of positional arguments is not exactly two.
    '''
    if argv is None:
        argv = sys.argv

    parser = optparse.OptionParser(usage="Usage: %prog [-s|-d|-a|-p] <input file> <output file>"+
                                   "\n\nNote: -d,-s,-a, and -p are mutually exclusive")
    parser.add_option("-d", "--data-array",
                      action="store_true", default=False,
                      dest="da", help="Compile file into data array (default)")
    parser.add_option("-s", "--suffix-array",
                      action="store_true", default=False,
                      dest="sa", help="Compile file into suffix array")
    parser.add_option("-a", "--alignment",
                      action="store_true", default=False,
                      dest="a", help="Compile file into alignment")
    parser.add_option("-l", "--lexical",
                      action="store_true", default=False,
                      dest="l", help="Compile file into lex file")
    parser.add_option("-x", "--compute_lexical", action="store", nargs=2,
                      dest="lex_args", help="Compute lex file from data",
                      metavar="<f file> <e file>")
    parser.add_option("-p", "--parse",
                      action="store_true", default=False,
                      dest="p", help="Compile file into parse")
    parser.add_option("-b", "--binary-infile",
                      action="store_true", default=False,
                      dest="bin", help="Input file is binary (default: text)")
    parser.add_option("-t", "--text-outfile",
                      action="store_true", default=False,
                      dest="text", help="Output file is text (default: binary)")
    parser.add_option("-e", "--enhanced-outfile",
                      action="store_true", default=False,
                      dest="enhanced", help="Output file is enhanced text (default: binary)")
    parser.add_option("-r", action="store", nargs=7,
                      dest="precomp_args", help="Precompute collocations (Hiero only)",
                      metavar="max-len=<INT> max-nt=<INT> max-size=<INT> min-gap=<INT> rank1=<INT> rank2=<INT> sa=<FILE>")

    # Parse the vector we were handed; a bare parse_args() would silently
    # fall back to sys.argv and ignore the argv parameter.
    (options, args) = parser.parse_args(argv[1:])

    filetype_opts = [options.da, options.sa, options.a, options.p]
    # Counting with sum() works on both Python 2 and 3 (filter() has no len() on 3).
    if sum(1 for opt in filetype_opts if opt) > 1 or len(args) != 2:
        parser.print_help()
        sys.exit(1)

    (infilename, outfilename) = args
    # Only used to decorate the progress messages below.
    if options.bin:
        bin_label = " binary"
    else:
        bin_label = ""

    start_time = monitor.cpu()
    if options.precomp_args:
        if options.bin:
            obj = precomputation.Precomputation(infilename, from_binary=True)
        else:
            keys = set(["max-len", "max-nt", "max-size", "min-gap", "rank1", "rank2", "sa"])
            precomp_opts = {}
            sys.stderr.write("Precomputing statistics for list %s\n" % infilename)
            for pair in options.precomp_args:
                # Split on the first '=' only so that values (e.g. the sa
                # file name) may themselves contain '='.
                (key, sep, val) = pair.partition("=")
                if not sep:
                    sys.stderr.write("Malformed keyword arg %s for -r (expected key=value)\n" % pair)
                    return 1
                if key in keys:
                    # Each key is accepted once; removing it makes a repeat
                    # fall through to the error branch.
                    keys.remove(key)
                    if key != "sa":
                        val = int(val)
                    precomp_opts[key] = val
                else:
                    sys.stderr.write("Unknown or repeated keyword arg %s for -r (must be one of: max-len, max-nt, max-size, min-gap, rank1, rank2, sa)\n" % key)
                    return 1
            sa = csuf.SuffixArray(precomp_opts["sa"], True)
            obj = precomputation.Precomputation(infilename, sa,
                                                precompute_rank=precomp_opts["rank1"],
                                                precompute_secondary_rank=precomp_opts["rank2"],
                                                max_length=precomp_opts["max-len"],
                                                max_nonterminals=precomp_opts["max-nt"],
                                                train_max_initial_size=precomp_opts["max-size"],
                                                train_min_gap_size=precomp_opts["min-gap"])
    elif options.sa:
        sys.stderr.write("Reading %s as%s suffix array...\n" % (infilename, bin_label))
        obj = csuf.SuffixArray(infilename, options.bin)
    elif options.a:
        sys.stderr.write("Reading %s as%s alignment array...\n" % (infilename, bin_label))
        obj = calignment.Alignment(infilename, options.bin)
    elif options.p:
        sys.stderr.write("Reading %s as%s parse array...\n" % (infilename, bin_label))
        obj = parse.ParseArray(infilename, options.bin)
    elif options.l:
        sys.stderr.write("Reading %s as%s lex array...\n" % (infilename, bin_label))
        obj = clex.CLex(infilename, options.bin)
    elif options.lex_args:
        ffile = options.lex_args[0]
        efile = options.lex_args[1]
        sys.stderr.write("Computing lex array from:\n A=%s\n F=%s\n E=%s\n" % (infilename, ffile, efile))
        fsarray = csuf.SuffixArray(ffile, True)
        earray = cdat.DataArray(efile, True)
        aarray = calignment.Alignment(infilename, True)
        obj = clex.CLex(aarray, from_data=True, earray=earray, fsarray=fsarray)
    else:
        sys.stderr.write("Reading %s as%s data array...\n" % (infilename, bin_label))
        obj = cdat.DataArray(infilename, options.bin)

    sys.stderr.write(" Total time for read: %f\n" % (monitor.cpu() - start_time))
    start_time = monitor.cpu()
    if options.text:
        sys.stderr.write("Writing text file %s...\n" % outfilename)
        obj.write_text(outfilename)
    elif options.enhanced:
        sys.stderr.write("Writing enhanced text file %s...\n" % outfilename)
        obj.write_enhanced(outfilename)
    else:
        sys.stderr.write("Writing binary file %s...\n" % outfilename)
        obj.write_binary(outfilename)
    sys.stderr.write("Finished.\n")
    sys.stderr.write(" Total time for write: %f\n" % (monitor.cpu() - start_time))

    # Scale the raw figure by factors of 1000 until it drops to 1000 or
    # below, stopping at GB.
    mem_use = float(monitor.memory())
    metric = "B"
    for unit in ("KB", "MB", "GB"):
        if mem_use / 1000 > 1:
            mem_use /= 1000
            metric = unit
        else:
            break
    sys.stderr.write(" Memory usage: %.1f%s\n" % (mem_use, metric))
if opts.output_dir is not None and len(gram) >= opts.dump_size: if opts.parallel: name = "%04d.%04d" % (opts.parallel[0], n_dump) else: name = "%04d" % n_dump dump_rules(gram, opts.output_dir, name) dumped += len(gram) gram = {} n_dump += 1 if log.level >= 1 and count % slice == 0: sys.stderr.write("time: %f, sentences in: %d (%.1f/sec), " % (time.time() - start_time, count, slice / (time.time() - prev_time))) sys.stderr.write("rules out: %d+%d\n" % (dumped, len(gram))) sys.stderr.write("memory: %s\n" % monitor.memory()) prev_time = time.time() count += 1 if opts.output_dir is not None: if opts.parallel: name = "%04d.%04d" % (opts.parallel[0], n_dump) else: name = "%04d" % n_dump dump_rules(gram, opts.output_dir, name) else: dump_rules(gram, output_file) """if opts.output_forests: pickler.dump(sym.alphabet)"""
def process(sent):
    """Decode `sent`, run one cutting-plane update on the decoder weights, and return `sent`.

    On success `sent.score_comps` and `sent.ewords` are filled in and `sent`
    is returned.  If decoding raises (or yields no goal) the error is logged
    and None is returned; the 100th failure in a row re-raises instead.

    Module-level state touched here: updates, alphas, decoder_errors,
    thedecoder.weights, nweights, sumweights_helper (through apply_update),
    sum_updates2, n_updates, feature_scales and requests.
    """
    global alphas

    # In online mode each sentence starts from an empty update set.
    if online_learning:
        updates.clear()
        alphas.clear()

    theoracle.input(sent)

    log.write("done preparing\n")

    global decoder_errors
    try:
        goal = thedecoder.translate(sent)
        thedecoder.process_output(sent, goal)
        # A successful decode resets the failure streak.
        decoder_errors = 0
        if goal is None:
            raise Exception("parse failure")
    except Exception:
        import traceback

        log.writeln(
            "decoder raised exception: %s %s" % (sent, "".join(traceback.format_exception(*sys.exc_info())))
        )
        decoder_errors += 1
        if decoder_errors >= 100:
            log.write("decoder failed too many times, passing exception through!\n")
            raise
        else:
            # Skip this sentence; callers receive None.
            return

    goal.rescore(theoracle.models, thedecoder.weights, add=True)

    bestv, best = decoder.get_nbest(goal, 1)[0]
    log.write("done decoding\n")

    bestscore = get_score(bestv, best)
    log.write(
        "best hyp: %s %s cost=%s score=%s\n"
        % (" ".join(sym.tostring(e) for e in best), bestv, thedecoder.weights.dot(bestv), bestscore)
    )

    goldv, gold, goldscore = get_gold(sent, goal, thedecoder.weights)

    assert (
        sent.id not in updates
    )  # in batch learning, this can happen, and we would have to undo the update associated with this sentence

    # Seed this sentence with a zero update carrying the full learning rate.
    updates[sent.id] = [(svector.Vector(), 0.0)]
    alphas[sent.id] = [max_learning_rate]

    if opts.parallel:
        # Drain every pending tag-1 message before updating.
        while True:
            if mpi.world.iprobe(tag=1):
                (sentid, vscores) = mpi.world.recv(tag=1)
                log.write("received update for %s\n" % (sentid,))

                if sentid in updates:  # see comment above
                    log.write("ignoring update for %s\n" % (sentid,))
                    continue  # drop this update on the floor

                updates[sentid] = vscores
                alphas[sentid] = [max_learning_rate] + [0.0] * (len(vscores) - 1)
                # since the first update is zero, the alphas & updates
                # are still consistent with weights
            else:
                break

    def oracle(weights):
        # One (feature difference, score difference) pair per hypothesis
        # returned by get_hyps, each measured against the gold derivation.
        hyps = get_hyps(sent, goal, weights)
        return [(goldv - hypv, goldscore - hypscore) for (hypv, hyp, hypscore) in hyps]

    thedecoder.weights, alphas = cutting_plane(thedecoder.weights, updates, alphas, {sent.id: oracle})

    remove_zeros(thedecoder.weights)
    log.write("feature weights: %s\n" % (thedecoder.weights * watch_features))
    log.write("weight norm: %s\n" % (math.sqrt(thedecoder.weights.normsquared())))

    # update weight sum for averaging
    global nweights, sumweights_helper

    # sumweights_helper = \sum_{i=0}^n (i \Delta w_i)
    for sentid in updates:
        for (v, score), alpha in itertools.izip(updates[sentid], alphas[sentid]):
            apply_update(sumweights_helper, nweights * alpha * v)
    # NOTE(review): assumed to run once per call (outside both loops) -- confirm.
    nweights += 1

    # update feature scales
    if update_feature_scales:
        global sum_updates2, n_updates, feature_scales
        for sentid in updates:
            # u accumulates this sentence's updates, each scaled by alpha / max_learning_rate.
            u = svector.Vector()
            for (v, score), alpha in itertools.izip(updates[sentid], alphas[sentid]):
                u += alpha / max_learning_rate * v
            sum_updates2 += u * u
            # NOTE(review): assumed to count one per sentence id -- confirm nesting.
            n_updates += 1

        try:
            default_feature_scale = 1.0 / compute_variance(0, n_updates)
        except ZeroDivisionError:
            default_feature_scale = 0.0  # pseudoinverse
        # Features never seen in sum_updates2 fall back to the default scale.
        feature_scales = collections.defaultdict(lambda: default_feature_scale)
        for feat in sum_updates2:
            try:
                feature_scales[feat] = 1.0 / compute_variance(sum_updates2[feat], n_updates)
            except ZeroDivisionError:
                feature_scales[feat] = 0.0  # pseudoinverse

        log.write(
            "feature scales: %s\n"
            % (" ".join("%s=%s" % (f, feature_scales[f]) for f in watch_features if f in feature_scales))
        )

    if opts.parallel:
        # flush out filled requests
        global requests
        requests = [request for request in requests if not request.test()]

        # transmit updates to other nodes
        for node in parallel.slaves:
            if node != parallel.rank:
                requests.append(mpi.world.isend(value=(sent.id, updates[sent.id]), dest=node, tag=1))

    bestv = theoracle.finish(bestv, best)
    theoracle.update(bestv)
    sent.score_comps = bestv

    if log.level >= 1:
        gc.collect()
        log.write("done updating, memory = %s\n" % monitor.memory())

    sent.ewords = [sym.tostring(e) for e in best]

    return sent
def train(self):
    """Run self.iter perceptron iterations, evaluating on dev after each one.

    Every iteration maps self.train_worker over self.trainchunks (through a
    multiprocessing Pool unless FLAGS.singletrain is set), folds the returned
    weight deltas into Perceptron.weights / Perceptron.allweights, evaluates
    via self.eval_on_dev() and calls self.dump() whenever the dev score
    exceeds the best seen so far.  Progress is printed to `logs`.
    """
    start_mem = memory()
    starttime = time.time()

    ## model name : Intel(R) Xeon(R) CPU W3570 @ 3.20GHz
    print >> logs, "%d CPUs at %s %s" % (
        cpu_count(),
        os.popen("cat /proc/cpuinfo|grep GHz").readlines()[-1].strip().split(":")[-1],
        os.popen("cat /proc/cpuinfo|grep MHz").readlines()[-1].strip().split(":")[-1])

    print >> logs, "starting perceptron at", time.ctime()

    best_prec = 0
    # Pre-bind the names used by the summary prints after the loop; they were
    # otherwise undefined when no iteration beat best_prec (or self.iter < 1).
    best_it, best_wlen = 0, 0
    it = 0
    for it in xrange(1, self.iter + 1):

        print >> logs, "iteration %d starts..............%s" % (it, time.ctime())

        curr_mem = memory()  # outside of multi
        print >> logs, "memory usage at iter %d before pool: %s" % (it, human(memory(start_mem)))

        iterstarttime = time.time()

        if Perceptron.shuffle:
            self.shuffle_train()

        if not FLAGS.singletrain:
            pool = Pool(processes=self.ncpus)
        pool_time = time.time() - iterstarttime

        num_updates, early_updates = 0, 0
        ## new_allweights, new_weights = self.decoder.model.new_weights(), self.decoder.model.new_weights()

        print >> logs, "memory usage at iter %d after pool: %s" % (it, human(memory(start_mem)))

        tt = time.time()
        print >> logs, "before para time...", tt
        results = map(self.train_worker, self.trainchunks) if FLAGS.singletrain else \
                  pool.map(self.train_worker, self.trainchunks, chunksize=1)

        if FLAGS.mydouble:
            print >> logs, "mydouble usage and freed: %d %d" % counts(), \
                  "|w|=", len(Perceptron.weights), "|avgw|=", len(Perceptron.allweights) if FLAGS.avg else 0, \
                  "|dw|=", len(results[0][-1][0])

        print >> logs, "after para time...", time.time()
        compute_time = time.time() - tt

        copy_time = 0
        para_times = []
        for dtime, size, (_num_updates, _early_updates), (_weights, _allweights) in results:

            num_updates += _num_updates
            early_updates += _early_updates

            # NOTE(review): if size and self.trainsize are both ints this is
            # floor division under Python 2 unless true division is enabled
            # at file level -- confirm.
            factor = size / self.trainsize  # not exactly uniform (if not equal-size split)!

            tt = time.time()
            if not FLAGS.singletrain:
                Perceptron.weights.iaddc(_weights, factor)
                # print _weights
                # print new_weights
                # print
                # NOTE(review): nesting of the averaging update under the
                # multi-process branch is an assumption -- confirm.
                if self.avg:
                    # Both branches currently perform the same call.
                    if FLAGS.naiveavg:
                        Perceptron.allweights.iaddc(_allweights, factor)
                    else:
                        Perceptron.allweights.iaddc(_allweights, factor)

            del _weights, _allweights

            copy_time += time.time() - tt
            para_times.append(dtime)

        del results

        if not FLAGS.singletrain:
            pool.close()
            pool.join()
        ## else:
        ##     del self.delta_weights, self.delta_allweights # not in process

        print >> logs, "gc can't reach", gc.collect()

        print >> logs, "pool_time= %.1f s, compute_walltime= %.1f s, compute_cputime= %.1f (%s), copy_time= %.1f s" \
              % (pool_time, compute_time, sum(para_times), " ".join("%.1f" % x for x in para_times), copy_time)

        print >> logs, "memory usage at iter %d after fork: %s" % (it, human(memory(start_mem)))

        if not FLAGS.singletrain:  # N.B.: in non-multiproc mode, self.c is updated
            Perceptron.c += self.trainsize / self.ncpus
            print >> logs, "self.c=", Perceptron.c

        # if self.avg:
        #     Perceptron.allweights = new_allweights
        # Perceptron.weights, Decoder.model.weights = new_weights, new_weights

        ## num_updates, early_updates = self.one_pass_on_train() # old single-cpu
        iterendtime = time.time()

        print >> logs, "memory usage at iter %d: extra %s, total %s" % (
            it, human(memory(curr_mem)), human(memory(start_mem)))
        if FLAGS.debuglevel >= 1:
            print >> logs, "weights=", Perceptron.weights

        curr_mem = memory()

        print >> logs, "iteration %d training finished at %s. now evaluating on dev..." % (it, time.ctime())
        avgweights = self.avg_weights() if self.avg else Perceptron.weights
        if FLAGS.avg and FLAGS.debuglevel >= 1:
            print >> logs, "avgweights=", avgweights

        avgendtime = time.time()
        print >> logs, "avg weights (trim) took %.1f seconds." % (avgendtime - iterendtime)
        if FLAGS.debuglevel >= 2:
            print >> logs, "avg w=", avgweights

        ## avgweights = self.decoder.model.new_weights()

        # Evaluate on dev with the (possibly averaged) weights installed.
        self.decoder.model.weights = avgweights  # OK if noavg; see above
        Parser.State.model.weights = avgweights  # multiprocessing: State.model is static

        prec = self.eval_on_dev()
        print >> logs, "eval on dev took %.1f seconds." % (time.time() - avgendtime)

        print >> logs, "at iteration {0}, updates= {1} (early {4}), dev= {2}, |w|= {3}, time= {5:.3f}h acctime= {6:.3f}h"\
              .format(it, num_updates, prec, len(avgweights), early_updates, \
                      (time.time() - iterstarttime)/3600, (time.time() - starttime)/3600.)
        logs.flush()

        if prec > best_prec:
            best_prec = prec
            best_it = it
            best_wlen = len(avgweights)
            print >> logs, "new high at iteration {0}: {1}. Dumping Weights...".format(it, prec)
            self.dump(avgweights)

        self.decoder.model.weights = Perceptron.weights  # restore non-avg

        del avgweights
        print >> logs, "gc can't reach", gc.collect()

        if FLAGS.mydouble:
            print >> logs, "mydouble usage and freed: %d %d ------------------------" % counts()

        logs.flush()  # for hpc

    print >> logs, "peaked at iteration {0}: {1}, |bestw|= {2}.".format(best_it, best_prec, best_wlen)
    print >> logs, "perceptron training of %d iterations finished on %s (took %.2f hours)" % \
          (it, time.ctime(), (time.time() - starttime)/3600.)
pickler.clear_memo() if opts.output_dir is not None and len(gram) >= opts.dump_size: if opts.parallel: name = "%04d.%04d" % (opts.parallel[0], n_dump) else: name = "%04d" % n_dump dump_rules(gram, opts.output_dir, name) dumped += len(gram) gram = {} n_dump += 1 if log.level >= 1 and count%slice == 0: sys.stderr.write("time: %f, sentences in: %d (%.1f/sec), " % (time.time()-start_time, count, slice/(time.time()-prev_time))) sys.stderr.write("rules out: %d+%d\n" % (dumped, len(gram))) sys.stderr.write("memory: %s\n" % monitor.memory()) prev_time = time.time() count += 1 if opts.output_dir is not None: if opts.parallel: name = "%04d.%04d" % (opts.parallel[0], n_dump) else: name = "%04d" % n_dump dump_rules(gram, opts.output_dir, name) else: dump_rules(gram, output_file) """if opts.output_forests: pickler.dump(sym.alphabet)"""
def process(sent):
    """Translate `sent`, write any requested side outputs, and return `sent`.

    Returns None (after logging a warning) when the decoder produces no goal.
    Otherwise `sent.ewords` is set to the best output and `sent` is returned.
    Depending on the options and module-level file handles, this also writes
    a gzipped forest, rule posteriors plus the max-posterior derivation, the
    n-best list, and French/English parse trees.
    """
    goal = thedecoder.translate(sent)

    thedecoder.process_output(sent, goal)

    if goal is None:
        log.writeln("warning: parse failure")
        return None

    if opts.forest_dir:
        forest_file = gzip.open(os.path.join(opts.forest_dir, "forest.%s.gz" % sent.id), "w")
        # try/finally so the handle is released even if serialisation fails.
        try:
            forest_file.write(
                forest.forest_to_json(
                    goal, fwords=sent.words, mode="english", models=thedecoder.models, weights=thedecoder.weights
                )
            )
        finally:
            forest_file.close()

    if opts.rule_posterior_dir:
        rule_posterior_file = open(os.path.join(opts.rule_posterior_dir, "rule_posterior.%s" % sent.id), "w")
        try:
            beta = 1.0
            insides = goal.compute_inside(thedecoder.weights, beta=beta)
            outsides = goal.compute_outside(thedecoder.weights, insides, beta=beta)
            z = insides[id(goal)]
            for item in goal.bottomup():
                for ded in item.deds:
                    # outside(item) + deduction cost + inside of each antecedent,
                    # less the goal's inside value z.
                    c = outsides[id(item)]
                    c += thedecoder.weights.dot(ded.dcost)
                    c += sum(insides[id(ant)] for ant in ded.ants)
                    c -= z
                    rule_posterior_file.write(
                        "%s ||| span=%s posterior=%s\n" % (ded.rule, (item.i, item.j), cost.prob(c))
                    )
                    # Stored on the deduction so the reweight below can use it.
                    ded.dcost["posterior"] = c
        finally:
            rule_posterior_file.close()

        max_posterior_file = open(os.path.join(opts.rule_posterior_dir, "max_posterior.%s" % sent.id), "w")
        try:
            # Reweight so that the Viterbi derivation is ranked by the
            # "posterior" feature alone.
            goal.reweight(svector.Vector("posterior=1"))
            max_posterior = goal.viterbi_deriv()

            def show(ded, antvalues):
                # Bracket each deduction's yield with its posterior probability.
                if ded.rule:
                    value = rule.subst(ded.rule.erhs, antvalues)
                else:
                    value = antvalues[0]
                return ("[%.3f" % cost.prob(ded.dcost["posterior"]),) + value + ("]",)

            value = max_posterior.value(show)
            s = " ".join(value)
            max_posterior_file.write("%s\n" % s)
        finally:
            max_posterior_file.close()

    outputs = get_nbest(goal, n_best, ambiguity_limit)

    if n_best_file:
        for (v, e) in outputs:
            e = " ".join(e)
            # n_best_file.write("%s ||| %s ||| %s\n" % (sent.id, e, -thedecoder.weights.dot(v)))
            n_best_file.write("%s ||| %s ||| %s\n" % (sent.id, e, v))
        n_best_file.flush()

    (bestv, best) = outputs[0]

    if french_parse_file:
        french_parse_file.write("%s ||| %s\n" % (sent.id, goal.viterbi_deriv().french_tree()))
        french_parse_file.flush()
    if english_parse_file:
        english_parse_file.write("%s ||| %s\n" % (sent.id, goal.viterbi_deriv().english_tree()))
        english_parse_file.flush()

    if log.level >= 1:
        gc.collect()
        log.write(" done decoding, memory=%s\n" % monitor.memory())
        log.write(" features: %s; %s\n" % (bestv, thedecoder.weights.dot(bestv)))

    sent.ewords = best

    return sent
def process(sent):
    """Perform one training step on `sent` and return it annotated with the best hypothesis.

    Steps: feed the sentence to the oracle, decode, rescore the forest with
    the oracle models, obtain the gold derivation, run cutting_plane to
    update thedecoder.weights, then maintain the averaging and feature-scale
    accumulators.  With opts.parallel, updates are also exchanged over MPI
    using tag 1.

    Returns `sent` (with score_comps and ewords set), or None when decoding
    failed; after 100 failures in a row the exception is re-raised.
    """
    global alphas

    # Online learning keeps no updates from earlier sentences.
    if online_learning:
        updates.clear()
        alphas.clear()

    theoracle.input(sent)

    log.write("done preparing\n")

    global decoder_errors
    try:
        goal = thedecoder.translate(sent)
        thedecoder.process_output(sent, goal)
        # Reaching this line means the decoder did not raise: clear the streak.
        decoder_errors = 0
        if goal is None:
            raise Exception("parse failure")
    except Exception:
        import traceback
        log.writeln(
            "decoder raised exception: %s %s" %
            (sent, "".join(traceback.format_exception(*sys.exc_info()))))
        decoder_errors += 1
        if decoder_errors >= 100:
            log.write(
                "decoder failed too many times, passing exception through!\n"
            )
            raise
        else:
            # Give up on this sentence only; the caller gets None.
            return

    goal.rescore(theoracle.models, thedecoder.weights, add=True)

    bestv, best = decoder.get_nbest(goal, 1)[0]
    log.write("done decoding\n")

    bestscore = get_score(bestv, best)
    log.write("best hyp: %s %s cost=%s score=%s\n" % (" ".join(
        sym.tostring(e) for e in best), bestv, thedecoder.weights.dot(bestv),
        bestscore))

    goldv, gold, goldscore = get_gold(sent, goal, thedecoder.weights)

    assert (
        sent.id not in updates
    )  # in batch learning, this can happen, and we would have to undo the update associated with this sentence

    # Initial entry: a zero vector paired with the whole learning rate.
    updates[sent.id] = [(svector.Vector(), 0.)]
    alphas[sent.id] = [max_learning_rate]

    if opts.parallel:
        # Keep receiving until no tag-1 message is waiting.
        while True:
            if mpi.world.iprobe(tag=1):
                (sentid, vscores) = mpi.world.recv(tag=1)
                log.write("received update for %s\n" % (sentid, ))

                if sentid in updates:  # see comment above
                    log.write("ignoring update for %s\n" % (sentid, ))
                    continue  # drop this update on the floor

                updates[sentid] = vscores
                alphas[sentid] = [max_learning_rate
                                  ] + [0.] * (len(vscores) - 1)
                # since the first update is zero, the alphas & updates
                # are still consistent with weights
            else:
                break

    def oracle(weights):
        # Returns, for each hypothesis from get_hyps, its feature and score
        # difference from the gold derivation.
        hyps = get_hyps(sent, goal, weights)
        return [(goldv - hypv, goldscore - hypscore)
                for (hypv, hyp, hypscore) in hyps]

    thedecoder.weights, alphas = cutting_plane(thedecoder.weights, updates,
                                               alphas, {sent.id: oracle})

    remove_zeros(thedecoder.weights)
    log.write("feature weights: %s\n" %
              (thedecoder.weights * watch_features))
    log.write("weight norm: %s\n" %
              (math.sqrt(thedecoder.weights.normsquared())))

    # update weight sum for averaging
    global nweights, sumweights_helper

    # sumweights_helper = \sum_{i=0}^n (i \Delta w_i)
    for sentid in updates:
        for (v, score), alpha in itertools.izip(updates[sentid],
                                                alphas[sentid]):
            apply_update(sumweights_helper, nweights * alpha * v)
    # NOTE(review): placed after the loops (one increment per call) -- this
    # nesting is an assumption; confirm.
    nweights += 1

    # update feature scales
    if update_feature_scales:
        global sum_updates2, n_updates, feature_scales
        for sentid in updates:
            # Sum of this sentence's updates, each weighted by alpha / max_learning_rate.
            u = svector.Vector()
            for (v, score), alpha in itertools.izip(updates[sentid],
                                                    alphas[sentid]):
                u += alpha / max_learning_rate * v
            sum_updates2 += u * u
            # NOTE(review): assumed to increment once per sentence id -- confirm.
            n_updates += 1

        try:
            default_feature_scale = 1. / compute_variance(0, n_updates)
        except ZeroDivisionError:
            default_feature_scale = 0.  # pseudoinverse
        # Unlisted features resolve to default_feature_scale on lookup.
        feature_scales = collections.defaultdict(
            lambda: default_feature_scale)
        for feat in sum_updates2:
            try:
                feature_scales[feat] = 1. / compute_variance(
                    sum_updates2[feat], n_updates)
            except ZeroDivisionError:
                feature_scales[feat] = 0.  # pseudoinverse

        log.write(
            "feature scales: %s\n" %
            (" ".join("%s=%s" % (f, feature_scales[f])
                      for f in watch_features if f in feature_scales)))

    if opts.parallel:
        # flush out filled requests
        global requests
        requests = [request for request in requests if not request.test()]

        # transmit updates to other nodes
        for node in parallel.slaves:
            if node != parallel.rank:
                requests.append(
                    mpi.world.isend(value=(sent.id, updates[sent.id]),
                                    dest=node,
                                    tag=1))

    bestv = theoracle.finish(bestv, best)
    theoracle.update(bestv)
    sent.score_comps = bestv

    if log.level >= 1:
        gc.collect()
        log.write("done updating, memory = %s\n" % monitor.memory())

    sent.ewords = [sym.tostring(e) for e in best]

    return sent
execfile(configfilename) opts, args = optparser.parse_args(args=sys.argv[2:]) maxmargin.watch_features = watch_features theoracle = oracle.Oracle(order=4, variant=opts.bleuvariant, oracledoc_size=10) thedecoder = make_decoder() thelearner = Learner() weight_stack = [] if log.level >= 1: gc.collect() log.write("all structures loaded, memory=%s\n" % (monitor.memory())) comm = MPI.Comm.Get_parent() log.prefix = '[%s] ' % (comm.Get_rank(), ) instances = [] while True: msg = comm.recv() if msg[0] == 'train': sent = msg[1] goal = process(sent) instances.append(ForestInstance(sent.id, goal)) while comm.Iprobe(tag=1): msg = comm.recv(tag=1)
def process(sent):
    """Decode `sent`, emit the configured side outputs, and return `sent`.

    Returns None when the decoder yields no goal.  Otherwise `sent.ewords`
    receives the best output converted with sym.tostring and `sent` is
    returned.  Forest, rule-posterior, max-posterior, n-best and parse-tree
    outputs are written only when the corresponding option or module-level
    file handle is set.
    """
    goal = thedecoder.translate(sent)

    thedecoder.process_output(sent, goal)

    if goal is None:
        return None

    if opts.forest_dir:
        forest_file = gzip.open(os.path.join(opts.forest_dir, "forest.%s.gz" % sent.id), "w")
        # Close the handle even when building or writing the JSON raises.
        try:
            forest_file.write(forest.forest_to_json(goal, fwords=sent.fwords, mode='english',
                                                    models=thedecoder.models, weights=thedecoder.weights))
        finally:
            forest_file.close()

    if opts.rule_posterior_dir:
        rule_posterior_file = open(os.path.join(opts.rule_posterior_dir, "rule_posterior.%s" % sent.id), "w")
        try:
            beta = 1.
            insides = goal.compute_inside(thedecoder.weights, beta=beta)
            outsides = goal.compute_outside(thedecoder.weights, insides, beta=beta)
            z = insides[id(goal)]
            for item in goal.bottomup():
                for ded in item.deds:
                    # Combine the item's outside value, the deduction's own
                    # cost and its antecedents' inside values, then subtract
                    # the goal's inside value z.
                    c = outsides[id(item)]
                    c += thedecoder.weights.dot(ded.dcost)
                    c += sum(insides[id(ant)] for ant in ded.ants)
                    c -= z
                    rule_posterior_file.write("%s ||| span=%s posterior=%s\n" % (ded.rule, (item.i, item.j), cost.prob(c)))
                    # Kept on the deduction for the reweighting step below.
                    ded.dcost['posterior'] = c
        finally:
            rule_posterior_file.close()

        max_posterior_file = open(os.path.join(opts.rule_posterior_dir, "max_posterior.%s" % sent.id), "w")
        try:
            # After this reweight only the 'posterior' feature drives the
            # Viterbi derivation.
            goal.reweight(svector.Vector('posterior=1'))
            max_posterior = goal.viterbi_deriv()

            def show(ded, antvalues):
                # Wrap the deduction's yield in "[<posterior> ... ]".
                if ded.rule:
                    value = ded.rule.e.subst((), antvalues)
                else:
                    value = antvalues[0]
                return ("[%.3f" % cost.prob(ded.dcost['posterior']),) + value + ("]",)

            value = max_posterior.value(show)
            # Integer entries go through sym.tostring; the bracket strings
            # added by show() pass through unchanged.
            s = " ".join((sym.tostring(e) if type(e) is int else e) for e in value)
            max_posterior_file.write("%s\n" % s)
        finally:
            max_posterior_file.close()

    outputs = get_nbest(goal, n_best, ambiguity_limit)

    if n_best_file:
        for (v,e) in outputs:
            e = " ".join(sym.tostring(w) for w in e)
            #n_best_file.write("%s ||| %s ||| %s\n" % (sent.id, e, -thedecoder.weights.dot(v)))
            n_best_file.write("%s ||| %s ||| %s\n" % (sent.id, e, v))
        n_best_file.flush()

    (bestv,best) = outputs[0]

    if french_parse_file:
        french_parse_file.write("%s ||| %s\n" % (sent.id, goal.viterbi_deriv().french_tree()))
        french_parse_file.flush()
    if english_parse_file:
        english_parse_file.write("%s ||| %s\n" % (sent.id, goal.viterbi_deriv().english_tree()))
        english_parse_file.flush()

    if log.level >= 1:
        gc.collect()
        log.write(" done decoding, memory=%s\n" % monitor.memory())
        log.write(" features: %s; %s\n" % (bestv, thedecoder.weights.dot(bestv)))

    sent.ewords = [sym.tostring(e) for e in best]

    return sent
refreader(thereader(infile), [file(fn) for fn in reffilenames])) output_file = sys.stdout else: insents = [] # dummy oraclemodel = oracle.OracleModel(4, variant=opts.bleuvariant) if not opts.parallel or parallel.rank != parallel.master: thedecoder = make_decoder() oraclemodels = [oraclemodel, oracle.WordCounter()] if log.level >= 1: gc.collect() log.write("all structures loaded, memory=%s\n" % (monitor.memory())) updates = collections.defaultdict(list) decoder_errors = 0 def process(sent): oraclemodel.input(sent) log.write("done preparing\n") try: goal = thedecoder.translate(sent) except Exception: import traceback log.writeln("decoder raised exception: %s" % "".join(traceback.format_exception(*sys.exc_info()))) global decoder_errors decoder_errors += 1
SITE_COUNT = {} SITE_CHECK = 0 SITE_CHECK_SUCCESS = 0 if FLAGS.model == 'PY' and FLAGS.discount > 0: SAMPLER.update_rule_size_tables() for s in timed(samples): #print(s) s.sample() logger.writeln('iteration time: %s sec' % (time.time() - iter_start)) logger.writeln( '%s rules, %s rule types, loglikelihood: %s' % (SAMPLER.nsamples(), SAMPLER.ntypes(), SAMPLER.likelihood())) logger.writeln('memory: %s' % memory()) logger.writeln('resident memory: %s' % resident()) if FLAGS.type: logger.writeln( '%s sampling operations in total, distribution of number of sites: %s' % (sum(SITE_COUNT.values()), SITE_COUNT)) logger.writeln( '%s sites: %s singleton sites, %s (2-10) sites, %s (>10) sites' % (sum(k * v for k, v in SITE_COUNT.items()), sum(k * v for k, v in SITE_COUNT.items() if k == 1), sum(k * v for k, v in SITE_COUNT.items() if 2 <= k <= 10), sum(k * v for k, v in SITE_COUNT.items() if k > 10))) logger.writeln('site checks: %s, success: %s' % (SITE_CHECK, SITE_CHECK_SUCCESS))
def tabulate():
    """Read every rule block from `inputs` and rebuild the global count tables.

    Each entry of `inputs` is a file path or a directory; a directory
    contributes every entry returned by os.listdir.  The globals fsum, esum,
    allsum, xsum and gram are reset and then filled from the blocks yielded
    by read_rule_blocks.  All opened input files are closed before returning,
    including when reading raises.
    """
    global fsum, esum, allsum, xsum, gram

    if log.level >= 1:
        sys.stderr.write("(3) Tabulating filtered phrases\n")
    count = 1

    # Expand directories into the paths of the entries they contain.
    paths = []
    for source in inputs:
        if os.path.isdir(source):
            paths.extend(os.path.join(source, name) for name in os.listdir(source))
        else:
            paths.append(source)

    inputfiles = []
    try:
        # Open one at a time so that a failure part-way through still lets
        # the finally clause close the files opened so far.
        for path in paths:
            inputfiles.append(open(path))

        fsum = {}  # weight summed per french side r.f (the lhs-keyed variant is commented out below)
        esum = {}  # block weight per english handle, for blocks with a rule that passed the filter
        allsum = 0.0  # c(*)
        xsum = {}  # c(lhs)
        gram = {}

        # read in all rules with matching english sides at the same time.
        # this way, we can sum only those english sides that ever appeared
        # with a french side that passes the filter.

        for rules in read_rule_blocks(inputfiles):
            flag = False
            blocksum = 0.
            for r in rules:
                scores = r.scores
                weight = scores[0]
                allsum += weight
                blocksum += weight
                xsum[r.lhs] = xsum.get(r.lhs, 0.0) + weight
                if ffilter is None or ffilter.match(r.f):
                    # there used to be a shortcut here -- if fsum.has_key(r.f)
                    #fsum[(r.lhs,r.f)] = fsum.get((r.lhs,r.f), 0.0) + weight
                    fsum[r.f] = fsum.get(r.f, 0.0) + weight
                    # Merge into the rule already stored under an equal key,
                    # or store this rule as the first of its kind.
                    if r in gram:
                        gram[r] += r
                    else:
                        gram[r] = r
                    flag = True
                if log.level >= 1 and count%interval == 0:
                    sys.stderr.write("time: %f, memory: %s, rules in: %d, rules counted: %d\n" % (monitor.cpu(), monitor.memory(), count, len(gram)))
                count += 1
            # Record the block total only if a rule in it passed the filter.
            if flag:
                ewordsnorm = rules[0].e.handle()
                if ewordsnorm in esum:
                    sys.stderr.write("warning: files not sorted properly\n")
                esum[ewordsnorm] = blocksum
    finally:
        for inputfile in inputfiles:
            inputfile.close()
if len(rules) >= 100000: log.write("sentence %s has %s rules\n" % (li+1, len(rules))) log.write("input: %s\n" % line.rstrip()) if not combiner: # simple version for r in rules: print "%s\t%s" % (r, r.scores) else: # do some combining before writing out for r in rules: existing = gram.get(r, None) if existing is not None: existing.scores += r.scores else: del r.fpos del r.epos del r.span gram[r] = r if len(gram) >= 100000: log.write("dumping...\n") for r in gram: print "%s\t%s" % (r, r.scores) gram.clear() log.write("memory: %s\n" % monitor.memory()) if combiner: for r in gram: print "%s\t%s" % (r, r.scores)