def string_kld(target, hypothesis, samples=1000, verbose=False, max_length=math.inf, seed=None):
    ### Sample n trees from the target and average the log-ratio of string probabilities.
    if seed is None:
        rng = numpy.random.RandomState()
    else:
        rng = numpy.random.RandomState(seed)
    inside_target = wcfg.InsideComputation(target)
    inside_hypothesis = wcfg.InsideComputation(hypothesis)
    sampler = wcfg.Sampler(target, random=rng)
    total = 0.0
    n = 0
    for i in range(samples):
        t = sampler.sample_tree()
        s = utility.collect_yield(t)
        if len(s) <= max_length:
            n += 1
            lp = inside_target.inside_log_probability(s)
            lq = inside_hypothesis.inside_log_probability(s)
            if verbose:
                print("Sample %d %s, target %f, hypothesis %f" % (i, t, lp, lq))
            total += lp - lq
    return total / n
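# A minimal self-contained illustration (not part of the original module) of the
# estimator above: string_kld is plain Monte Carlo for
#   D(P || Q) = E_{w ~ P}[ log P(w) - log Q(w) ],
# shown here on a hypothetical two-outcome distribution instead of a grammar.
def _toy_string_kld_demo(samples=10000, seed=0):
    p = {"a": 0.9, "b": 0.1}
    q = {"a": 0.5, "b": 0.5}
    rng = numpy.random.RandomState(seed)
    outcomes = list(p)
    draws = rng.choice(outcomes, size=samples, p=[p[w] for w in outcomes])
    # Average log-ratio under samples from p; roughly 0.37 nats for these numbers.
    return sum(math.log(p[w]) - math.log(q[w]) for w in draws) / samples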
def __init__(self, target_pcfg):
    self.target_pcfg = target_pcfg
    self.terminals = set(target_pcfg.terminals)
    self.te = target_pcfg.terminal_expectations()
    self.pe = target_pcfg.production_expectations()
    self.sampler = wcfg.Sampler(self.target_pcfg)
    self.renyi = RENYI
def __init__(self, target_pcfg):
    self.target_pcfg = target_pcfg
    self.terminals = set(target_pcfg.terminals)
    self.te = target_pcfg.terminal_expectations()
    self.pe = target_pcfg.production_expectations()
    self.sampler = wcfg.Sampler(self.target_pcfg)
    self.insider = wcfg.InsideComputation(self.target_pcfg)
    self.contexts_map = {}
def monte_carlo(target, f, samples, max_length, seed=None):
    ## f is called on each sampled tree (with its sample index) whose yield is short enough.
    if seed is None:
        rng = numpy.random.RandomState()
    else:
        rng = numpy.random.RandomState(seed)
    sampler = wcfg.Sampler(target, random=rng)
    for i in range(samples):
        t = sampler.sample_tree()
        s = utility.collect_yield(t)
        if len(s) <= max_length:
            f(t, i)
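# Usage sketch for the generic driver above (hypothetical helper; `target` is a
# PCFG object as elsewhere in this module): accumulate the distribution of
# yield lengths over sampled trees via the callback.
def _yield_length_histogram(target, samples=1000, max_length=20, seed=1):
    lengths = Counter()
    monte_carlo(target, lambda t, i: lengths.update([len(utility.collect_yield(t))]),
                samples, max_length, seed=seed)
    return lengths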
def estimate_bijection(target_pcfg, hypothesis_pcfg, samples=1000, seed=None, max_length=math.inf, verbose=False):
    ## Essential assumption: both grammars have the same number of nonterminals.
    assert len(target_pcfg.nonterminals) == len(hypothesis_pcfg.nonterminals)
    if seed is None:
        rng = numpy.random.RandomState()
    else:
        rng = numpy.random.RandomState(seed)
    sampler = wcfg.Sampler(target_pcfg, random=rng)
    insider = wcfg.InsideComputation(hypothesis_pcfg)
    n = len(target_pcfg.nonterminals)
    c = defaultdict(Counter)
    for _ in range(samples):
        tree = sampler.sample_tree()
        s = utility.collect_yield(tree)
        if len(s) <= max_length:
            try:
                learned = insider.bracketed_viterbi_parse(tree)
                collect_nonterminal_pairs(tree, learned, c)
            except utility.ParseFailureException:
                print("Failed", s)
    maximum_value = max(max(c2.values()) for nt, c2 in c.items())
    ## Now use the Hungarian algorithm: frequent co-occurrence means low cost.
    cost_matrix = np.zeros((n, n))
    target_list = list(target_pcfg.nonterminals)
    hypothesis_list = list(hypothesis_pcfg.nonterminals)
    for i, a in enumerate(target_list):
        for j, b in enumerate(hypothesis_list):
            count = c[a][b]
            cost_matrix[i, j] = maximum_value - count
    ## Maybe normalise so as to maximize something else?
    row_ind, col_ind = linear_sum_assignment(cost_matrix)
    answer = {}
    for i, j in zip(row_ind, col_ind):
        answer[target_list[i]] = hypothesis_list[j]
    return answer
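# The assignment step of estimate_bijection in isolation, on hypothetical toy
# counts (not from this module): target nonterminal A is mostly paired with
# hypothesis nonterminal X, and B with Y, so the recovered bijection should be
# {"A": "X", "B": "Y"}.
def _toy_bijection_demo():
    counts = np.array([[9, 1],
                       [2, 8]])  # rows: target nts (A, B); cols: hypothesis nts (X, Y)
    cost = counts.max() - counts  # high count -> low cost
    row_ind, col_ind = linear_sum_assignment(cost)
    return {["A", "B"][i]: ["X", "Y"][j] for i, j in zip(row_ind, col_ind)}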
def labeled_exact_match(target, hypothesis, samples=1000, max_length=30, viterbi=False, verbose=False, seed=None):
    """
    Proportion of sampled trees whose hypothesis Viterbi parse is identical to the
    target tree, up to a relabeling of the hypothesis nonterminals.

    Target has to be a PCFG; hypothesis can be any WCFG with identical nonterminals.
    """
    if seed is None:
        rng = numpy.random.RandomState()
    else:
        rng = numpy.random.RandomState(seed)
    sampler = wcfg.Sampler(target, random=rng)
    if viterbi:
        inside_target = wcfg.InsideComputation(target)
    inside_hypothesis = wcfg.InsideComputation(hypothesis)
    total = 0.0
    n = 0
    for i in range(samples):
        t = sampler.sample_tree()
        s = utility.collect_yield(t)
        if len(s) >= max_length:
            continue
        n += 1
        if viterbi:
            t = inside_target.viterbi_parse(s)
        try:
            th = inside_hypothesis.viterbi_parse(s)
            # Nonterminals are assumed identical, so no relabeling map is applied here.
            # relabeled_tree = utility.relabel_tree(th, ntmap)
            relabeled_tree = th
            if relabeled_tree == t:
                total += 1
            elif verbose:
                logging.info("Mismatch in trees with parse of %s", s)
                print(relabeled_tree)
                print(t)
        except utility.ParseFailureException:
            # Treat a parse failure as a mismatch.
            print("Parse failure of %s " % s)
    return total / n
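# Usage sketch (hypothetical grammars): the returned score is in [0, 1]. With
# viterbi=True the sampled target tree is replaced by the target's own Viterbi
# parse, so both grammars are compared on their best parses of the same string.
#
#   score = labeled_exact_match(target, hypothesis, samples=500, viterbi=True, seed=7)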
def sample_contexts(self, nt, maxsamples=MAX_SAMPLES):
    # maxsamples bounds the number of strings drawn; it defaults to the
    # module-level MAX_SAMPLES constant used elsewhere in this class.
    a = self.kernel_map[nt]
    iwcfg = self.target_pcfg.intersect(a)
    iwcfg.renormalise()
    iwcfg.locally_normalise()
    asampler = wcfg.Sampler(iwcfg)
    contexts = []
    for _ in range(maxsamples):
        w = asampler.sample_string()
        positions = [i for i, b in enumerate(w) if b == a]
        assert len(positions) > 0
        position = numpy.random.choice(positions)
        left = w[:position]
        right = w[position + 1:]
        contexts.append((tuple(left), tuple(right)))
    return contexts
def test(self):
    """
    Test the target PCFG.

    Return a WCFG, or None if the grammar is not anchored.
    """
    ntmap = self.test_if_anchored()
    if len(ntmap) == 0:
        return None
    print(ntmap)
    self.nonterminal_map = ntmap
    self.kernel_map = {ntmap[a]: a for a in ntmap}
    self.kernel = ntmap.values()
    print(self.kernel)
    self.kernel_grammar = self.make_kernel_grammar()
    self.sampler = wcfg.Sampler(self.kernel_grammar)
    self.test_all(N_SAMPLES, MAX_SAMPLES)
    return self.make_grammar()
def conditional_kld(target, hypothesis, samples=1000, verbose=False, max_length=math.inf, seed=None):
    """
    Estimate the KL divergence between the conditional tree distributions:
    for a given string $w$, D( P(tree|w) || Q(tree|w) ),
    i.e. the tree KLD minus the string KLD.

    Target must be a PCFG; hypothesis can be an arbitrary WCFG, not even
    convergent, with the same nonterminals.
    """
    if seed is None:
        rng = numpy.random.RandomState()
    else:
        rng = numpy.random.RandomState(seed)
    inside_target = wcfg.InsideComputation(target)
    inside_hypothesis = wcfg.InsideComputation(hypothesis)
    sampler = wcfg.Sampler(target, random=rng)
    total = 0.0
    n = 0
    for i in range(samples):
        t = sampler.sample_tree()
        s = utility.collect_yield(t)
        if len(s) > max_length:
            if verbose:
                print("Skipping", len(s))
            continue
        n += 1
        ptree = target.log_probability_derivation(t)
        pstring = inside_target.inside_log_probability(s)
        qtree = hypothesis.log_probability_derivation(t)
        qstring = inside_hypothesis.inside_log_probability(s)
        total += (ptree - qtree) - (pstring - qstring)
        if verbose:
            print("%s p(t) = %f, p(w) = %f, q(t) = %f, q(w) = %f" % (s, ptree, pstring, qtree, qstring))
    return total / n
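# For reference, the quantity estimated above decomposes as follows: with P(t), Q(t)
# the tree probabilities and P(w), Q(w) the string (inside) probabilities,
#   E_{t~P}[ (log P(t) - log Q(t)) - (log P(w) - log Q(w)) ]
#     = D_tree(P || Q) - D_string(P || Q),
# since P(t|w) = P(t)/P(w) for a tree t with yield w (and similarly for Q).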
def sample_contexts_smart(self, a, nsamples, maxsamples):
    # Sample strings containing the anchor a and collect up to nsamples
    # distinct contexts of a, drawing at most maxsamples strings.
    iwcfg = self.target_pcfg.intersect(a)
    iwcfg.renormalise()
    iwcfg.locally_normalise()
    asampler = wcfg.Sampler(iwcfg)
    contexts = set()
    for _ in range(maxsamples):
        w = asampler.sample_string()
        for i, b in enumerate(w):
            if a == b:
                left = w[:i]
                right = w[i + 1:]
                contexts.add((tuple(left), tuple(right)))
        if len(contexts) >= nsamples:
            return contexts
    return contexts
help="just output the yield", action="store_true") ## Other options: control output format, what probs are calculated. args = parser.parse_args() mypcfg = wcfg.load_wcfg_from_file(args.inputfilename) if args.seed: print("Setting seed to ", args.seed) prng = RandomState(args.seed) else: prng = RandomState() mysampler = wcfg.Sampler(mypcfg, random=prng) insider = wcfg.InsideComputation(mypcfg) with open(args.outputfilename, 'w') as outf: i = 0 while i < args.n: tree = mysampler.sample_tree() # default is string. s = utility.collect_yield(tree) if not args.maxlength or len(s) <= args.maxlength: if not args.omitprobs: lps = insider.inside_log_probability(s) lpt = mypcfg.log_probability_derivation(tree) lpb = insider._bracketed_log_probability(tree)[mypcfg.start] outf.write("%0.12f %0.12f %0.12f " % (lpt, lpb, lps)) if args.yieldonly: