Example #1
def string_kld(target,
               hypothesis,
               samples=1000,
               verbose=False,
               max_length=math.inf,
               seed=None):
    # Sample trees from the target and average the per-string log-ratio.
    if seed is None:
        rng = numpy.random.RandomState()
    else:
        rng = numpy.random.RandomState(seed)
    inside_target = wcfg.InsideComputation(target)
    inside_hypothesis = wcfg.InsideComputation(hypothesis)
    sampler = wcfg.Sampler(target, random=rng)
    total = 0.0
    n = 0
    for i in range(samples):
        t = sampler.sample_tree()
        s = utility.collect_yield(t)
        if len(s) <= max_length:
            n += 1
            lp = inside_target.inside_log_probability(s)
            lq = inside_hypothesis.inside_log_probability(s)
            if verbose:
                print("Sample %d %s, target %f, hypothesis %f" %
                      (i, t, lp, lq))
            total += lp - lq
    return total / n
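A minimal usage sketch for string_kld, assuming grammar files readable by wcfg.load_wcfg_from_file (the loader used in Example #11); the file names are hypothetical.

import wcfg

# Hypothetical file names; load_wcfg_from_file appears in Example #11.
target = wcfg.load_wcfg_from_file("target.wcfg")
hypothesis = wcfg.load_wcfg_from_file("hypothesis.wcfg")

# A fixed seed makes the Monte Carlo estimate reproducible.
kld = string_kld(target, hypothesis, samples=5000, max_length=20, seed=42)
print("Estimated string KLD: %f" % kld)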
Example #2
    def __init__(self, target_pcfg):
        self.target_pcfg = target_pcfg
        self.terminals = set(target_pcfg.terminals)
        # Cache the expected terminal and production counts of the target.
        self.te = target_pcfg.terminal_expectations()
        self.pe = target_pcfg.production_expectations()
        self.sampler = wcfg.Sampler(self.target_pcfg)
        # RENYI is a module-level constant in the source file.
        self.renyi = RENYI
Example #3
    def __init__(self, target_pcfg):
        self.target_pcfg = target_pcfg
        self.terminals = set(target_pcfg.terminals)
        # Cache the expected terminal and production counts of the target.
        self.te = target_pcfg.terminal_expectations()
        self.pe = target_pcfg.production_expectations()
        self.sampler = wcfg.Sampler(self.target_pcfg)
        self.insider = wcfg.InsideComputation(self.target_pcfg)
        # Cache of sampled contexts per nonterminal.
        self.contexts_map = {}
Example #4
def monte_carlo(target, f, samples, max_length, seed=None):
    ## f is called on each sampled tree (and its index) whose yield
    ## is no longer than max_length.
    if seed is None:
        rng = numpy.random.RandomState()
    else:
        rng = numpy.random.RandomState(seed)
    sampler = wcfg.Sampler(target, random=rng)
    for i in range(samples):
        t = sampler.sample_tree()
        s = utility.collect_yield(t)
        if len(s) <= max_length:
            f(t, i)
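A sketch of driving monte_carlo with a simple callback; `target` is assumed to be a grammar loaded as in the sketch after Example #1.

import math
import utility

lengths = []

def record_length(tree, i):
    # collect_yield returns the terminal string of the sampled tree.
    lengths.append(len(utility.collect_yield(tree)))

monte_carlo(target, record_length, samples=1000, max_length=math.inf, seed=0)
print("Mean yield length: %f" % (sum(lengths) / len(lengths)))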
Example #5
def estimate_bijection(target_pcfg,
                       hypothesis_pcfg,
                       samples=1000,
                       seed=None,
                       max_length=math.inf,
                       verbose=False):
    ## Essential assumption: both grammars have the same number of nonterminals.
    assert len(target_pcfg.nonterminals) == len(hypothesis_pcfg.nonterminals)
    if seed is None:
        rng = numpy.random.RandomState()
    else:
        rng = numpy.random.RandomState(seed)
    sampler = wcfg.Sampler(target_pcfg, random=rng)
    insider = wcfg.InsideComputation(hypothesis_pcfg)
    n = len(target_pcfg.nonterminals)

    c = defaultdict(Counter)

    for _ in range(samples):
        tree = sampler.sample_tree()

        s = utility.collect_yield(tree)
        if len(s) <= max_length:
            try:
                learned = insider.bracketed_viterbi_parse(tree)
                collect_nonterminal_pairs(tree, learned, c)
            except utility.ParseFailureException:
                print("Failed", s)

    maximum_value = max(max(c2.values()) for nt, c2 in c.items())
    ## Now find a one-to-one matching with the Hungarian algorithm.

    cost_matrix = np.zeros((n, n))
    target_list = list(target_pcfg.nonterminals)
    hypothesis_list = list(hypothesis_pcfg.nonterminals)
    for i, a in enumerate(target_list):
        for j, b in enumerate(hypothesis_list):
            count = c[a][b]
            # Higher co-occurrence counts mean lower assignment cost.
            cost_matrix[i, j] = maximum_value - count
    row_ind, col_ind = linear_sum_assignment(cost_matrix)
    answer = {}
    for i, j in zip(row_ind, col_ind):
        answer[target_list[i]] = hypothesis_list[j]
    return answer
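A note on the cost matrix: scipy's linear_sum_assignment minimises total cost, so subtracting each count from the maximum count turns "maximise total co-occurrence" into an equivalent minimisation. A self-contained illustration with made-up counts:

import numpy as np
from scipy.optimize import linear_sum_assignment

# Rows index target nonterminals, columns hypothesis nonterminals.
counts = np.array([[9, 1, 0],
                   [2, 8, 1],
                   [0, 3, 7]])
cost = counts.max() - counts
rows, cols = linear_sum_assignment(cost)
for i, j in zip(rows, cols):
    print("target %d -> hypothesis %d" % (i, j))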
Example #6
def labeled_exact_match(target,
                        hypothesis,
                        samples=1000,
                        max_length=30,
                        viterbi=False,
                        verbose=False,
                        seed=None):
    """
	Proportion of trees whose viterbi parse is the same up to a relabeling of the hypothesis tree.
	Target has to be a pcfg; hypothesis can be any WCFG.

	Identical nonterminals
	"""
    if seed is None:
        rng = numpy.random.RandomState()
    else:
        rng = numpy.random.RandomState(seed)
    sampler = wcfg.Sampler(target, random=rng)
    if viterbi:
        inside_target = wcfg.InsideComputation(target)
    inside_hypothesis = wcfg.InsideComputation(hypothesis)

    total = 0.0
    n = 0
    for i in range(samples):
        t = sampler.sample_tree()
        s = utility.collect_yield(t)
        if len(s) > max_length:
            continue
        n += 1
        if viterbi:
            t = inside_target.viterbi_parse(s)
        try:
            th = inside_hypothesis.viterbi_parse(s)
            # Relabeling is disabled; nonterminal labels must match exactly.
            relabeled_tree = th
            if relabeled_tree == t:
                total += 1
            elif verbose:
                logging.info("Mismatch in trees with parse of %s", s)
                print(relabeled_tree)
                print(t)
        except utility.ParseFailureException:
            # A parse failure counts as a mismatch.
            print("Parse failure of %s " % s)
    return total / n
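A usage sketch with the same hypothetical `target` and `hypothesis` grammars as above; viterbi=True compares the hypothesis Viterbi parse against the target's Viterbi parse rather than against the sampled tree.

score = labeled_exact_match(target, hypothesis,
                            samples=2000, max_length=30,
                            viterbi=True, seed=7)
print("Labeled exact match: %.3f" % score)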
Example #7
    def sample_contexts(self, nt, maxsamples):
        a = self.kernel_map[nt]
        # Restrict the target grammar to strings containing the anchor a.
        iwcfg = self.target_pcfg.intersect(a)
        iwcfg.renormalise()
        iwcfg.locally_normalise()
        asampler = wcfg.Sampler(iwcfg)
        contexts = []

        for _ in range(maxsamples):
            w = asampler.sample_string()
            positions = [i for i, b in enumerate(w) if b == a]
            assert len(positions) > 0
            position = numpy.random.choice(positions)
            left = w[:position]
            right = w[position + 1:]
            contexts.append((tuple(left), tuple(right)))
        return contexts
Example #8
    def test(self):
        """
        Test the target PCFG.

        Return a WCFG, or None if the grammar is not anchored.
        """
        ntmap = self.test_if_anchored()
        if len(ntmap) == 0:
            return None
        print(ntmap)
        self.nonterminal_map = ntmap
        self.kernel_map = {ntmap[a]: a for a in ntmap}
        self.kernel = ntmap.values()
        print(self.kernel)
        self.kernel_grammar = self.make_kernel_grammar()
        self.sampler = wcfg.Sampler(self.kernel_grammar)
        self.test_all(N_SAMPLES, MAX_SAMPLES)
        return self.make_grammar()
Example #9
def conditional_kld(target,
                    hypothesis,
                    samples=1000,
                    verbose=False,
                    max_length=math.inf,
                    seed=None):
    """
	Estimate the kld between the conditional probability distributions.

	for a given string $w$ D( P(tree|w) | Q(tree|w)).

	difference between string KLD and tree KLD.
	
	Target must be a pcfg, hyppthesis can be arbitrary wcfg, not even convergent, with same nonterminals.
	"""
    if seed == None:
        rng = numpy.random.RandomState()
    else:
        rng = numpy.random.RandomState(seed)
    inside_target = wcfg.InsideComputation(target)
    inside_hypothesis = wcfg.InsideComputation(hypothesis)
    sampler = wcfg.Sampler(target, random=rng)
    total = 0.0
    n = 0
    for i in range(samples):
        t = sampler.sample_tree()
        s = utility.collect_yield(t)
        if len(s) > max_length:
            if verbose:
                print("Skipping", len(s))
            continue
        n += 1
        ptree = target.log_probability_derivation(t)
        pstring = inside_target.inside_log_probability(s)

        qtree = hypothesis.log_probability_derivation(t)
        qstring = inside_hypothesis.inside_log_probability(s)

        total += (ptree - qtree) - (pstring - qstring)
        if verbose:
            print("%s p(t) = %f, p(w) = %f, q(t) = %f, q(w) = %f" %
                  (s, ptree, pstring, qtree, qstring))
    return total / n
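Per sample this function accumulates (ptree - qtree) - (pstring - qstring), while string_kld from Example #1 accumulates (pstring - qstring); with the same seed both functions sample identical trees, so the two estimates sum to a tree-level KLD estimate. A consistency-check sketch with the same hypothetical grammars:

d_cond = conditional_kld(target, hypothesis, samples=2000, seed=1)
d_string = string_kld(target, hypothesis, samples=2000, seed=1)
# The per-sample terms sum to ptree - qtree, the tree-level log-ratio.
print("Tree KLD estimate: %f" % (d_cond + d_string))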
Example #10
    def sample_contexts_smart(self, a, nsamples, maxsamples):
        # Restrict the target grammar to strings containing the anchor a.
        iwcfg = self.target_pcfg.intersect(a)
        iwcfg.renormalise()
        iwcfg.locally_normalise()
        asampler = wcfg.Sampler(iwcfg)
        contexts = set()

        for _ in range(maxsamples):
            w = asampler.sample_string()
            for i, b in enumerate(w):
                if a == b:
                    left = w[:i]
                    right = w[i + 1:]
                    contexts.add((tuple(left), tuple(right)))
                    if len(contexts) >= nsamples:
                        return contexts

        return contexts
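The context extraction in the inner loop can be illustrated standalone: every occurrence of the anchor terminal `a` in a sampled string contributes one (left, right) pair.

w = ("the", "dog", "saw", "the", "cat")
a = "the"
contexts = {(w[:i], w[i + 1:]) for i, b in enumerate(w) if b == a}
for left, right in sorted(contexts):
    print(left, "_", right)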
Example #11
                    help="just output the yield",
                    action="store_true")

## Other options: control output format, what probs are calculated.

args = parser.parse_args()

mypcfg = wcfg.load_wcfg_from_file(args.inputfilename)

if args.seed is not None:
    print("Setting seed to ", args.seed)
    prng = RandomState(args.seed)
else:
    prng = RandomState()

mysampler = wcfg.Sampler(mypcfg, random=prng)
insider = wcfg.InsideComputation(mypcfg)

with open(args.outputfilename, 'w') as outf:
    i = 0
    while i < args.n:
        tree = mysampler.sample_tree()
        # The default output is the yield string.
        s = utility.collect_yield(tree)
        if not args.maxlength or len(s) <= args.maxlength:
            if not args.omitprobs:
                lps = insider.inside_log_probability(s)
                lpt = mypcfg.log_probability_derivation(tree)
                lpb = insider._bracketed_log_probability(tree)[mypcfg.start]
                outf.write("%0.12f %0.12f %0.12f " % (lpt, lpb, lps))
            if args.yieldonly: