def __call__(self, data):
    """Accumulate marginal differences over ``data`` and yield them per feature.

    Each value in ``data`` is expected to hold a sentence forest and an
    oracle forest in text form, separated by ``"****"``, with the line
    breaks of each forest encoded as three consecutive tabs.  Values that
    do not split into exactly two parts are reported on stderr and skipped.
    The keys of ``data`` are ignored.

    For every usable pair, the marginals returned by
    ``fast_inside_outside.collect_marginals`` under ``self.weights`` are
    combined as ``example - oracle`` into one running ``Vector``; the
    difference of the two partition values is added under the
    ``"log_likelihood"`` key, and ``self.processed`` is incremented.

    Yields:
        ``(feature, value)`` pairs of the accumulated vector, emitted only
        after all of ``data`` has been consumed.

    Raises:
        AssertionError: if either loaded forest is falsy.
    """
    vec = Vector()
    for _key, val in data:
        splits = val.split("****")
        if len(splits) != 2:
            sys.stderr.write("skipping sent\n")
            continue
        sent, oracle = splits

        # Forests are stored on a single line; restore their real line breaks.
        sent_text = sent.replace("\t\t\t", "\n")
        oracle_text = oracle.replace("\t\t\t", "\n")

        # Forest.load is iterated for its first item only.
        sent_forest = next(Forest.load(StringIO(sent_text), True, lm=None))
        oracle_forest = next(Forest.load(StringIO(oracle_text), True, lm=None))

        # Check BOTH forests.  The former ``assert a, b`` form only tested the
        # first one and used the second as the failure message.
        assert sent_forest and oracle_forest, "empty sentence or oracle forest"

        example_marg, example_partition = fast_inside_outside.collect_marginals(
            sent_forest, self.weights)
        oracle_marg, oracle_partition = fast_inside_outside.collect_marginals(
            oracle_forest, self.weights)

        vec += example_marg - oracle_marg
        vec["log_likelihood"] += example_partition - oracle_partition
        self.processed += 1

    for feat in vec:
        yield feat, vec[feat]
def compute_marginals(self, forest, oracle_forest):
    """Compute the marginals of a -lm forest and of its oracle forest.

    Both forests are passed to ``fast_inside_outside.collect_marginals``
    together with ``self.weights``.  As a side effect, ``forest.bleu`` is
    rescored with the subtree returned by
    ``oracle_forest.bestparse(self.weights, use_min=False)``.

    Args:
        forest: the example forest; must expose a ``bleu`` attribute with a
            ``rescore`` method.
        oracle_forest: the oracle forest; must expose ``bestparse``.

    Returns:
        A 3-tuple ``(example_marg, oracle_marg, log_div)`` where ``log_div``
        is the oracle partition value minus the example partition value.
    """
    example_marg, partition = fast_inside_outside.collect_marginals(
        forest, self.weights)
    oracle_marg, oracle_partition = fast_inside_outside.collect_marginals(
        oracle_forest, self.weights)

    # Only the subtree of the oracle's best parse is needed here.
    _, oracle_subtree, _ = oracle_forest.bestparse(self.weights, use_min=False)

    # Updates the state held by forest.bleu; the result is not used locally.
    forest.bleu.rescore(oracle_subtree)

    return example_marg, oracle_marg, oracle_partition - partition  # log div