def conditional_unlabeled_kld(target, hypothesis, samples=1000, verbose=False):
    """
    Monte Carlo estimate of the KLD between the conditional distributions
    over unlabeled trees given the string $w$:
    D( P(unlabeled tree|w) | Q(tree|w) ).

    Trees are drawn from ``target``. Each draw contributes the gap in
    bracketed (unlabeled tree) log probability minus the gap in string log
    probability, i.e. the difference between the tree KLD term and the
    string KLD term.

    Returns the average contribution over ``samples`` draws.
    """
    target_inside = inside.InsideComputation(target)
    hypothesis_inside = inside.InsideComputation(hypothesis)
    tree_sampler = pcfg.Sampler(target)

    running_sum = 0.0
    for _ in range(samples):
        tree = tree_sampler.sample_tree()
        words = utility.collect_yield(tree)

        # Target quantities first, then hypothesis ones.
        p_tree = target_inside.inside_bracketed_log_probability(tree)
        p_string = target_inside.inside_log_probability(words)
        q_tree = hypothesis_inside.inside_bracketed_log_probability(tree)
        q_string = hypothesis_inside.inside_log_probability(words)

        running_sum += (p_tree - q_tree) - (p_string - q_string)
        if verbose:
            logging.info(
                "%s p(t) = %f, p(w) = %f, q(t) = %f, q(w) = %f",
                words, p_tree, p_string, q_tree, q_string)

    return running_sum / samples
def nonterminal_contingency_table(target, hypothesis, samples=1000, robust=False):
    """
    Count co-occurrences of node labels between trees sampled from ``target``
    and the bracketed Viterbi parse of the same tree under ``hypothesis``.

    Returns a Counter keyed by (target label, hypothesis label) pairs, where
    a label is element 0 of a tree node. Nodes of length 3 are treated as
    having two children (elements 1 and 2), which are visited recursively.

    If the bracketed parse fails, the failure is logged and the sample is
    skipped when ``robust`` is true; otherwise the exception propagates.
    """
    pair_counts = Counter()
    hypothesis_inside = inside.InsideComputation(hypothesis)
    tree_sampler = pcfg.Sampler(target)

    def tally(gold, guess):
        # Both trees must have the same shape at every node.
        assert len(gold) == len(guess)
        pair_counts[(gold[0], guess[0])] += 1
        if len(gold) == 3:
            for child in (1, 2):
                tally(gold[child], guess[child])

    for _ in range(samples):
        gold_tree = tree_sampler.sample_tree()
        try:
            guess_tree = hypothesis_inside.bracketed_viterbi_parse(gold_tree)
            tally(gold_tree, guess_tree)
        except utility.ParseFailureException:
            if not robust:
                raise
            logging.info("Parse failure while doing the bracketed parse.")

    return pair_counts
def labeled_exact_match(target, hypothesis, samples=1000, test_viterbi=False, verbose=False):
    """
    Proportion of trees whose viterbi parse is the same up to a relabeling
    of the hypothesis tree.

    SLOW

    Trees are sampled from ``target``; the yield of each is parsed with
    ``hypothesis``, the resulting tree is relabeled with the nonterminal map
    from ``best_nonterminal_rmap(target, hypothesis, samples)`` and compared
    for equality with the reference tree.

    Args:
        target: grammar the trees are sampled from.
        hypothesis: grammar whose Viterbi parses are evaluated.
        samples: number of trees to sample (also the denominator).
        test_viterbi: if true, the reference tree is the Viterbi parse of the
            yield under ``target`` rather than the sampled tree itself.
        verbose: if true, log and print each mismatching pair of trees.

    Returns:
        Number of exact matches divided by ``samples``. Strings the
        hypothesis fails to parse are logged and count as mismatches.
    """
    if test_viterbi:
        inside_target = inside.InsideComputation(target)
    inside_hypothesis = inside.InsideComputation(hypothesis)
    sampler = pcfg.Sampler(target)
    total = 0.0
    ntmap = best_nonterminal_rmap(target, hypothesis, samples)
    for _ in range(samples):
        t = sampler.sample_tree()
        s = utility.collect_yield(t)
        if test_viterbi:
            t = inside_target.viterbi_parse(s)
        try:
            th = inside_hypothesis.viterbi_parse(s)
            relabeled_tree = utility.relabel_tree(th, ntmap)
            if relabeled_tree == t:
                total += 1
            elif verbose:
                logging.info("Mismatch in trees with parse of %s", s)
                print(relabeled_tree)
                print(t)
        except utility.ParseFailureException:
            # The message needs a placeholder for its argument; without one
            # logging fails to format the record and the warning is lost.
            logging.warning("Parse failure %s", s)
    return total / samples
def bracketed_match(target, hypothesis, test_viterbi=False, samples=1000, verbose=False, exact_match=True):
    """
    Proportion of trees whose viterbi parse has the same shape as the
    original.

    Args:
        target: grammar the trees are sampled from.
        hypothesis: grammar whose Viterbi parses are evaluated.
        test_viterbi: if true, compare against the Viterbi parse of the
            yield under ``target`` instead of the sampled tree.
        samples: number of trees to sample.
        verbose: if true, log every sample whose score is below its maximum.
        exact_match: if true, score each pair with
            ``utility.zero_one_unlabeled``; otherwise with
            ``utility.microaveraged_unlabeled``.

    Returns:
        Sum of the per-sample numerators divided by the sum of the
        per-sample denominators. Samples the hypothesis fails to parse are
        logged and contribute to neither sum.

    Raises:
        ZeroDivisionError: if the accumulated denominator is zero, e.g. when
            ``samples`` is 0 or every parse fails.
    """
    inside_target = inside.InsideComputation(target)
    inside_hypothesis = inside.InsideComputation(hypothesis)
    sampler = pcfg.Sampler(target)
    total = 0.0
    ttotal = 0.0
    for _ in range(samples):
        t = sampler.sample_tree()
        s = utility.collect_yield(t)
        if test_viterbi:
            t = inside_target.viterbi_parse(s)
        try:
            th = inside_hypothesis.viterbi_parse(s)
            if exact_match:
                num, denom = utility.zero_one_unlabeled(t, th)
            else:
                num, denom = utility.microaveraged_unlabeled(t, th)
            total += num
            ttotal += denom
            if verbose and num < denom:
                logging.info("Mismatch (%d / %d) with string %s", num, denom, s)
        except utility.ParseFailureException:
            # The message needs a placeholder for its argument; without one
            # logging fails to format the record and the warning is lost.
            logging.warning("Parse failure %s", s)
    return total / ttotal
def bracketed_kld(target, hypothesis, samples=1000, verbose=False):
    """
    Estimate the KLD between the bracketed-tree distributions of ``target``
    and ``hypothesis`` by sampling trees from ``target``. FAST

    Returns the mean over ``samples`` draws of the target bracketed log
    probability minus the hypothesis bracketed log probability.
    """
    target_inside = inside.InsideComputation(target)
    hypothesis_inside = inside.InsideComputation(hypothesis)
    tree_sampler = pcfg.Sampler(target)

    accumulated = 0.0
    for index in range(samples):
        tree = tree_sampler.sample_tree()
        log_p = target_inside.inside_bracketed_log_probability(tree)
        log_q = hypothesis_inside.inside_bracketed_log_probability(tree)
        if verbose:
            logging.info(
                "Sample %d %s, target %f, hypothesis %f",
                index, tree, log_p, log_q)
        accumulated += log_p - log_q

    return accumulated / samples
def preterminal_contingency_table(target, hypothesis, samples=1000):
    """
    Count co-occurrences of preterminals between trees sampled from
    ``target`` and the Viterbi parse of their yield under ``hypothesis``.

    Args:
        target: grammar the trees are sampled from.
        hypothesis: grammar used to parse each sampled yield.
        samples: number of trees to sample.

    Returns:
        A Counter keyed by (target preterminal, hypothesis preterminal),
        built by zipping the outputs of ``utility.tree_to_preterminals`` for
        the two trees. Samples the hypothesis fails to parse are logged and
        skipped.
    """
    counter = Counter()
    inside_hypothesis = inside.InsideComputation(hypothesis)
    sampler = pcfg.Sampler(target)
    for _ in range(samples):
        t = sampler.sample_tree()
        tut = utility.tree_to_preterminals(t)
        s = utility.collect_yield(t)
        try:
            th = inside_hypothesis.viterbi_parse(s)
        except utility.ParseFailureException:
            # The message needs a placeholder for its argument; without one
            # logging fails to format the record and the warning is lost.
            logging.warning("Parse failure %s", s)
            continue
        tpt = utility.tree_to_preterminals(th)
        for a, b in zip(tut, tpt):
            counter[(a, b)] += 1
    return counter
def test_coverage(target, hypothesis, samples=1000):
    """
    Sample n trees from target and measure how many the hypothesis can parse.

    Optimisation: the bracketed parse of the sampled tree is tried first;
    only if that fails is the plain yield parsed.

    Returns the fraction of the ``samples`` draws for which either parse
    succeeded.
    """
    hypothesis_inside = inside.InsideComputation(hypothesis)
    tree_sampler = pcfg.Sampler(target)

    covered = 0.0
    for _ in range(samples):
        tree = tree_sampler.sample_tree()
        try:
            hypothesis_inside.bracketed_viterbi_parse(tree)
        except utility.ParseFailureException:
            # Fall back to parsing the unbracketed string.
            try:
                words = utility.collect_yield(tree)
                hypothesis_inside.viterbi_parse(words)
            except utility.ParseFailureException:
                continue
        covered += 1

    return covered / samples
help="just output the yield", action="store_true") ## Other options: control output format, what probs are calculated. args = parser.parse_args() mypcfg = pcfg.load_pcfg_from_file(args.inputfilename) if args.seed: print("Setting seed to ", args.seed) prng = RandomState(args.seed) else: prng = RandomState() mysampler = pcfg.Sampler(mypcfg, random=prng) insider = inside.InsideComputation(mypcfg) with open(args.outputfilename, 'w') as outf: i = 0 while i < args.n: tree = mysampler.sample_tree() # defatul is string. s = utility.collect_yield(tree) if not args.maxlength or len(s) <= args.maxlength: if not args.omitprobs: lpt = mypcfg.log_probability_derivation(tree) lpb = insider._bracketed_log_probability(tree)[mypcfg.start] if args.omitinside: outf.write("%e %e " % (lpt, lpb))