def string_kld(target, hypothesis, samples=1000, verbose=False, max_length=math.inf, seed=None):
    """
    Monte Carlo estimate of the string KL divergence between target and hypothesis.

    Trees are sampled from ``target``.  For every sampled yield whose length
    is at most ``max_length`` the difference between the target and the
    hypothesis inside log probabilities of that string is accumulated; the
    mean of those differences over the accepted samples is returned.

    Args:
        target: grammar to sample from (passed to wcfg.Sampler).
        hypothesis: grammar whose string probabilities are compared.
        samples: number of trees to draw.
        verbose: print every accepted sample with both log probabilities.
        max_length: strings longer than this are ignored (inclusive bound).
        seed: anything numpy.random.RandomState accepts; None means unseeded.

    Raises:
        ZeroDivisionError: if no sampled string was within ``max_length``.
    """
    # RandomState(None) is the unseeded generator, so a single call covers
    # both cases.  The former `seed == None` test failed on array-like seeds,
    # for which the elementwise comparison has no single truth value.
    rng = numpy.random.RandomState(seed)
    inside_target = wcfg.InsideComputation(target)
    inside_hypothesis = wcfg.InsideComputation(hypothesis)
    sampler = wcfg.Sampler(target, random=rng)
    total = 0.0
    n = 0
    for i in range(samples):
        t = sampler.sample_tree()
        s = utility.collect_yield(t)
        if len(s) > max_length:
            continue
        n += 1
        lp = inside_target.inside_log_probability(s)
        lq = inside_hypothesis.inside_log_probability(s)
        if verbose:
            print("Sample %d %s, target %f, hypothesis %f" % (i, t, lp, lq))
        total += lp - lq
    return total / n
def do_lp_monte_carlo(target, hypotheses, samples=1000, max_length=30, seed=None, verbose=False):
    """
    Do all of the log prob evaluations in one pass over a sample from target.

    Returns a defaultdict(float) of running sums.  Target statistics are
    stored under "logprob:target:*"; statistics for the i-th hypothesis are
    stored under "logprob:hypothesis<i>:*", with a matching "*:failures"
    counter incremented whenever that evaluation raises
    utility.ParseFailureException.  "number_hypotheses" holds len(hypotheses).
    ``verbose`` is accepted but not used here.
    """
    target_inside = wcfg.InsideComputation(target)
    hypothesis_insides = [wcfg.InsideComputation(grammar) for grammar in hypotheses]
    scores = defaultdict(float)
    scores['number_hypotheses'] = len(hypotheses)

    def f(t, i):
        # Callback handed to monte_carlo: accumulate statistics for one tree.
        s = utility.collect_yield(t)
        scores['logprob:target:n'] += 1
        scores['logprob:target:length'] += len(s)
        scores["logprob:target:string"] += target_inside.inside_log_probability(s)
        scores["logprob:target:bracket"] += target_inside.inside_bracketed_log_probability(t)
        scores["logprob:target:tree"] += target.log_probability_derivation(t)
        for index, (grammar, inside) in enumerate(zip(hypotheses, hypothesis_insides)):
            label = "logprob:hypothesis%d:" % index
            # (key suffix, evaluator, argument) for the three measurements.
            measurements = (
                ("string", inside.inside_log_probability, s),
                ("bracket", inside.inside_bracketed_log_probability, t),
                ("tree", grammar.log_probability_derivation, t),
            )
            for suffix, measure, argument in measurements:
                try:
                    scores[label + suffix] += measure(argument)
                except utility.ParseFailureException:
                    scores[label + suffix + ":failures"] += 1

    monte_carlo(target, f, samples, max_length, seed)
    return scores
def string_density(self, length, samples):
    """
    Estimate the proportion of strings of the given length that are in the grammar.

    Derivations of that length are sampled via ``self.sample`` (assumed to be
    uniform over derivations, as this estimator requires), and each sampled
    string is weighted by the inverse of its number of parses.  When
    derivations are uniform, a string with k parses is drawn k times as often
    as an unambiguous one, so the mean of 1/k times the total number of
    derivations estimates the number of distinct strings; dividing by
    ``self.vocab ** length`` turns that into a proportion.

    Args:
        length: string length to consider.
        samples: number of derivations to sample.

    Raises:
        ValueError: if a sampled string has no parse under ``self.grammar``.
        ZeroDivisionError: if ``samples`` is 0.
    """
    derivations = self.get_total(length)
    strings = 1.0 * self.vocab**length
    parser = wcfg.InsideComputation(self.grammar)
    inverse = 0.0
    for _ in range(samples):
        tree = self.sample(length)
        w = collect_yield(tree)
        n = parser.count_parses(w)
        if n == 0:
            raise ValueError("Generated a string which cannot be parsed.")
        inverse += 1.0 / n
    imean = inverse / samples
    return (derivations / strings) * imean
def labeled_exact_match(target, hypothesis, samples=1000, max_length=30, viterbi=False, verbose=False, seed=None):
    """
    Proportion of sampled trees that the hypothesis' viterbi parse reproduces exactly.

    Trees are sampled from ``target``.  The reference tree is the sampled tree
    itself, or the target's viterbi parse of its yield when ``viterbi`` is
    true.  It is compared, labels included, with the hypothesis' viterbi
    parse of the same yield.  No relabeling is applied, so the two grammars
    must use identical nonterminals.  Target has to be a pcfg; hypothesis can
    be any WCFG.

    Args:
        target: grammar to sample from.
        hypothesis: grammar being evaluated.
        samples: number of trees to draw.
        max_length: strings longer than this are skipped (inclusive bound,
            matching the other evaluation functions in this file).
        viterbi: compare against the target's viterbi parse instead of the
            sampled tree.
        verbose: report every mismatching pair of trees.
        seed: anything numpy.random.RandomState accepts; None means unseeded.

    Raises:
        ZeroDivisionError: if no sampled string was within ``max_length``.
    """
    # RandomState(None) is the unseeded generator; this also avoids the
    # ambiguous truth value that `seed == None` produces for array-like seeds.
    rng = numpy.random.RandomState(seed)
    sampler = wcfg.Sampler(target, random=rng)
    if viterbi:
        inside_target = wcfg.InsideComputation(target)
    inside_hypothesis = wcfg.InsideComputation(hypothesis)
    total = 0.0
    n = 0
    for _ in range(samples):
        t = sampler.sample_tree()
        s = utility.collect_yield(t)
        # Was `>=`, which silently dropped strings of exactly max_length.
        if len(s) > max_length:
            continue
        n += 1
        if viterbi:
            t = inside_target.viterbi_parse(s)
        try:
            th = inside_hypothesis.viterbi_parse(s)
        except utility.ParseFailureException:
            # Treat this as a failure: it counts in n but not in total.
            # Wrap s in a 1-tuple: if s is itself a tuple, `"%s" % s` would
            # try to unpack it and raise TypeError.
            print("Parse failure of %s " % (s,))
        else:
            if th == t:
                total += 1
            elif verbose:
                logging.info("Mismatch in trees with parse of %s", s)
                print(th)
                print(t)
    return total / n
def __init__(self, target_pcfg):
    """
    Keep the target PCFG and precompute what is derived from it.

    Stores the set of terminals, the terminal and production expectations,
    a sampler and an inside-computation object for the grammar, and starts
    with an empty ``contexts_map``.
    """
    grammar = target_pcfg
    self.target_pcfg = grammar
    self.terminals = set(grammar.terminals)
    # Expectations as reported by the grammar itself.
    self.te = grammar.terminal_expectations()
    self.pe = grammar.production_expectations()
    # Helpers built on the same grammar.
    self.sampler = wcfg.Sampler(grammar)
    self.insider = wcfg.InsideComputation(grammar)
    self.contexts_map = dict()
def string_density_crude(self, length, samples):
    """
    Crude density estimate: the fraction of random strings that parse.

    Draws ``samples`` strings of ``length`` terminals, each terminal chosen
    independently with numpy.random.choice from the grammar's terminals, and
    returns the fraction that have at least one parse.
    """
    alphabet = list(self.grammar.terminals)
    parser = wcfg.InsideComputation(self.grammar)

    def random_string():
        # One terminal per position, drawn from numpy's global generator.
        return tuple([numpy.random.choice(alphabet) for _ in range(length)])

    hits = 0
    for _ in range(samples):
        if parser.count_parses(random_string()) > 0:
            hits += 1
    return hits / float(samples)
def conditional_kld(target, hypothesis, samples=1000, verbose=False, max_length=math.inf, seed=None):
    """
    Estimate the kld between the conditional probability distributions.

    For a given string $w$ this is D( P(tree|w) | Q(tree|w) ), i.e. the
    difference between the tree KLD and the string KLD.  Each accepted sample
    contributes (log p(t) - log q(t)) - (log p(w) - log q(w)), and the mean
    over accepted samples is returned.

    Target must be a pcfg; hypothesis can be an arbitrary wcfg, not even
    convergent, with the same nonterminals.

    Args:
        target: grammar to sample from.
        hypothesis: grammar being evaluated.
        samples: number of trees to draw.
        verbose: print skipped lengths and the four log probabilities of
            every accepted sample.
        max_length: strings longer than this are skipped (inclusive bound).
        seed: anything numpy.random.RandomState accepts; None means unseeded.

    Raises:
        ZeroDivisionError: if no sampled string was within ``max_length``.
    """
    # RandomState(None) is the unseeded generator; this also avoids the
    # ambiguous truth value that `seed == None` produces for array-like seeds.
    rng = numpy.random.RandomState(seed)
    inside_target = wcfg.InsideComputation(target)
    inside_hypothesis = wcfg.InsideComputation(hypothesis)
    sampler = wcfg.Sampler(target, random=rng)
    total = 0.0
    n = 0
    for _ in range(samples):
        t = sampler.sample_tree()
        s = utility.collect_yield(t)
        if len(s) > max_length:
            if verbose:
                print("Skipping", len(s))
            continue
        n += 1
        ptree = target.log_probability_derivation(t)
        pstring = inside_target.inside_log_probability(s)
        qtree = hypothesis.log_probability_derivation(t)
        qstring = inside_hypothesis.inside_log_probability(s)
        total += (ptree - qtree) - (pstring - qstring)
        if verbose:
            print("%s p(t) = %f, p(w) = %f, q(t) = %f, q(w) = %f" % (s, ptree, pstring, qtree, qstring))
    return total / n
def estimate_bijection(target_pcfg, hypothesis_pcfg, samples=1000, seed=None, max_length=math.inf, verbose=False):
    """
    Estimate a one-to-one map from target nonterminals to hypothesis nonterminals.

    Trees are sampled from the target; each one (if its yield is no longer
    than ``max_length``) is re-parsed by the hypothesis with the target's
    bracketing, and collect_nonterminal_pairs tallies the label pairs into a
    count table indexed as count[target_nt][hypothesis_nt].  The bijection
    returned is the minimum-cost assignment for cost = max_count - count,
    which favours pairs that co-occur often.

    Args:
        target_pcfg: grammar to sample from.
        hypothesis_pcfg: grammar whose nonterminals are to be matched; must
            have as many nonterminals as the target.
        samples: number of trees to draw.
        seed: anything numpy.random.RandomState accepts; None means unseeded.
        max_length: strings longer than this are ignored (inclusive bound).
        verbose: accepted but not used here.

    Returns:
        dict mapping each target nonterminal to a hypothesis nonterminal.

    Raises:
        ValueError: if no nonterminal pair was collected at all (every sample
            was too long or failed to parse).
    """
    ## Essential assumption
    assert len(target_pcfg.nonterminals) == len(hypothesis_pcfg.nonterminals), \
        "target and hypothesis must have the same number of nonterminals"
    # RandomState(None) is the unseeded generator; this also avoids the
    # ambiguous truth value that `seed == None` produces for array-like seeds.
    rng = numpy.random.RandomState(seed)
    sampler = wcfg.Sampler(target_pcfg, random=rng)
    insider = wcfg.InsideComputation(hypothesis_pcfg)
    n = len(target_pcfg.nonterminals)
    c = defaultdict(Counter)
    for _ in range(samples):
        tree = sampler.sample_tree()
        s = utility.collect_yield(tree)
        if len(s) <= max_length:
            try:
                learned = insider.bracketed_viterbi_parse(tree)
                collect_nonterminal_pairs(tree, learned, c)
            except utility.ParseFailureException:
                print("Failed", s)

    # Largest co-occurrence count; an empty table used to surface as an
    # opaque "max() arg is an empty sequence" error.
    maximum_value = max(
        (count for counter in c.values() for count in counter.values()),
        default=None,
    )
    if maximum_value is None:
        raise ValueError(
            "No nonterminal pairs were collected; cannot estimate a bijection.")

    ## Now use the Hungarian algorithm
    cost_matrix = np.zeros((n, n))
    target_list = list(target_pcfg.nonterminals)
    hypothesis_list = list(hypothesis_pcfg.nonterminals)
    for i, a in enumerate(target_list):
        for j, b in enumerate(hypothesis_list):
            # Frequent pairs get a low cost so the assignment prefers them.
            cost_matrix[i, j] = maximum_value - c[a][b]
    row_ind, col_ind = linear_sum_assignment(cost_matrix)
    return {target_list[i]: hypothesis_list[j] for i, j in zip(row_ind, col_ind)}
def bracketed_kld(target, hypothesis, samples=1000, max_length=30, seed=None, verbose=False):
    """
    Monte Carlo estimate of the KLD between bracketed-string distributions.

    monte_carlo drives the sampling from ``target`` and calls the inner
    callback for each tree it passes on; the callback accumulates the
    difference between the target and hypothesis bracketed inside log
    probabilities.  Returns the mean difference, and raises
    ZeroDivisionError if the callback was never invoked.
    """
    target_inside = wcfg.InsideComputation(target)
    hypothesis_inside = wcfg.InsideComputation(hypothesis)
    # Mutable tally shared with the callback.
    tally = {"count": 0, "sum": 0}

    def f(t, i):
        lp = target_inside.inside_bracketed_log_probability(t)
        lq = hypothesis_inside.inside_bracketed_log_probability(t)
        tally["count"] += 1
        tally["sum"] += lp - lq
        if verbose:
            print("Sample %d %s, target %f, hypothesis %f" % (i, t, lp, lq))

    monte_carlo(target, f, samples, max_length, seed)
    return tally["sum"] / tally["count"]
def __init__(self, target_pcfg):
    """
    Keep the target PCFG, derive helpers from it, and reset learner state.

    Derived from the grammar: its pi2xi-converted form, an inside-computation
    object, the set of terminals, the terminal expectations, and a maximum
    length of twice the number of nonterminals.  Everything else starts
    empty (or None) for later code to fill in.
    """
    grammar = target_pcfg

    # Derived from the target grammar.
    self.target_pcfg = grammar
    self.target_wcfg = grammar.convert_parameters_pi2xi()
    self.insider = wcfg.InsideComputation(grammar)
    self.terminals = set(grammar.terminals)
    self.te = grammar.terminal_expectations()
    self.max_length = len(grammar.nonterminals) * 2

    # State that starts empty.
    self.kernel = list()
    self.nonterminal_map = dict()
    self.parameters = dict()
    self.kernel_grammar = None
    self.sampler = None
    self.asymptotic_grammar = None
    self.prod2context = dict()
def do_parseval_monte_carlo(target, hypotheses, samples=1000, max_length=30, seed=None, verbose=False):
    """
    Do a bunch of parsing based evaluations on a sample from the target.

    Hypotheses is a list of pcfgs to be evaluated.  For every tree that
    monte_carlo passes to the callback, two reference trees are used (the
    sampled tree, labelled "original", and the target's viterbi parse,
    labelled "viterbi").  Each is compared with the viterbi parse of every
    hypothesis ("hypothesis<i>"), with the left-branching, right-branching
    and random baselines, and with the gold viterbi parse.

    Returns:
        defaultdict(int) of running sums.  The denominators are
        "trees_denominator", "labeled_denominator" and
        "unlabeled_denominator"; the scores are keyed
        "<reference>:<candidate>:<labeled|unlabeled>:<exact_match|microaveraged>".
        If a hypothesis fails to parse a string, the denominators still count
        it but none of the scores for that tree are updated.

    ``verbose`` is accepted but not used here.
    """
    inside_target = wcfg.InsideComputation(target)
    inside_hypotheses = [wcfg.InsideComputation(hypothesis) for hypothesis in hypotheses]
    baselines = Baselines(len(target.nonterminals))
    scores = defaultdict(int)

    def f(t, i):
        # Callback handed to monte_carlo: score one sampled tree.
        s = utility.collect_yield(t)
        scores['trees_denominator'] += 1
        scores['labeled_denominator'] += utility.count_labeled(t)
        scores['unlabeled_denominator'] += utility.count_unlabeled(t)
        gold_viterbi = inside_target.viterbi_parse(s)
        try:
            hypo_viterbis = [
                (inside_hypothesis.viterbi_parse(s), "hypothesis%d" % index)
                for index, inside_hypothesis in enumerate(inside_hypotheses)
            ]
            lb = baselines.make_left_branch(s)
            rb = baselines.make_right_branch(s)
            rand = baselines.make_random_labeled(s)
            # The candidates are the same for both reference trees, so build
            # the list once instead of on every outer iteration.
            candidates = hypo_viterbis + [
                (lb, "leftbranch"),
                (rb, "rightbranch"),
                (rand, "random"),
                (gold_viterbi, "gold viterbi"),
            ]
            for target_tree, label1 in [(t, "original"), (gold_viterbi, "viterbi")]:
                for eval_tree, label2 in candidates:
                    prefix = label1 + ":" + label2
                    scores[prefix + ":labeled:exact_match"] += 1 if target_tree == eval_tree else 0
                    scores[prefix + ":unlabeled:exact_match"] += 1 if utility.unlabeled_tree_equal(
                        target_tree, eval_tree) else 0
                    # Only the numerator is accumulated here; the second
                    # element of each pair is not used.
                    x, _ = utility.microaveraged_labeled(target_tree, eval_tree)
                    scores[prefix + ":labeled:microaveraged"] += x
                    x, _ = utility.microaveraged_unlabeled(target_tree, eval_tree)
                    scores[prefix + ":unlabeled:microaveraged"] += x
        except utility.ParseFailureException:
            # Wrap s in a 1-tuple: if s is itself a tuple, `"%s" % s` would
            # try to unpack it and raise TypeError.
            print("Parse failure of %s " % (s,))

    monte_carlo(target, f, samples, max_length, seed)
    return scores
action="store_true") ## Other options: control output format, what probs are calculated. args = parser.parse_args() mypcfg = wcfg.load_wcfg_from_file(args.inputfilename) if args.seed: print("Setting seed to ", args.seed) prng = RandomState(args.seed) else: prng = RandomState() mysampler = wcfg.Sampler(mypcfg, random=prng) insider = wcfg.InsideComputation(mypcfg) with open(args.outputfilename, 'w') as outf: i = 0 while i < args.n: tree = mysampler.sample_tree() # default is string. s = utility.collect_yield(tree) if not args.maxlength or len(s) <= args.maxlength: if not args.omitprobs: lps = insider.inside_log_probability(s) lpt = mypcfg.log_probability_derivation(tree) lpb = insider._bracketed_log_probability(tree)[mypcfg.start] outf.write("%0.12f %0.12f %0.12f " % (lpt, lpb, lps)) if args.yieldonly: outf.write(" ".join(s) + "\n")