Пример #1
0
def string_kld(target,
               hypothesis,
               samples=1000,
               verbose=False,
               max_length=math.inf,
               seed=None):
    """
    Monte Carlo estimate of the string KL divergence from target to hypothesis.

    Trees are sampled from ``target``. For every sampled tree whose yield has
    at most ``max_length`` symbols, the inside log probability of the yield is
    computed under both grammars and the difference (target minus hypothesis)
    is accumulated. The mean difference over the retained samples is returned.

    Args:
        target: grammar that trees are sampled from.
        hypothesis: grammar whose string probabilities are compared.
        samples: number of trees to draw (including ones later discarded).
        verbose: if true, print each retained sample with both log probabilities.
        max_length: longest yield (inclusive) that is kept.
        seed: seed for the ``numpy.random.RandomState`` driving the sampler;
            ``None`` gives a freshly, unpredictably seeded generator.

    Returns:
        The average of ``log p(s) - log q(s)`` over the retained samples.

    Raises:
        ZeroDivisionError: if no sampled yield is short enough to be retained.
    """
    # RandomState(None) is the same as RandomState(); passing the seed through
    # avoids ``seed == None``, which misbehaves for array-like seeds.
    rng = numpy.random.RandomState(seed)
    inside_target = wcfg.InsideComputation(target)
    inside_hypothesis = wcfg.InsideComputation(hypothesis)
    sampler = wcfg.Sampler(target, random=rng)
    total = 0.0
    n = 0
    for i in range(samples):
        t = sampler.sample_tree()
        s = utility.collect_yield(t)
        if len(s) > max_length:
            # Over-long samples are discarded and do not count towards n.
            continue
        n += 1
        lp = inside_target.inside_log_probability(s)
        lq = inside_hypothesis.inside_log_probability(s)
        if verbose:
            print("Sample %d %s, target %f, hypothesis %f" %
                  (i, t, lp, lq))
        total += lp - lq
    return total / n
Пример #2
0
def do_lp_monte_carlo(target,
                      hypotheses,
                      samples=1000,
                      max_length=30,
                      seed=None,
                      verbose=False):
    """
    Accumulate log-probability statistics for all hypotheses in a single pass.

    The callback below is handed to ``monte_carlo`` together with ``target``,
    ``samples``, ``max_length`` and ``seed``. For every tree it receives, it
    adds the string, bracketed and derivation log probabilities under the
    target and under each hypothesis to running sums.

    Returns a ``defaultdict(float)`` keyed by ``"logprob:<model>:<quantity>"``
    where ``<model>`` is ``target`` or ``hypothesis<k>``. Parse failures of a
    hypothesis are counted under ``"...:<quantity>:failures"`` instead of
    being added to the sum. ``"number_hypotheses"`` holds ``len(hypotheses)``.

    ``verbose`` is accepted but not used by this function.
    """
    target_inside = wcfg.InsideComputation(target)
    hypothesis_insides = [wcfg.InsideComputation(h) for h in hypotheses]

    scores = defaultdict(float)
    scores['number_hypotheses'] = len(hypotheses)

    def f(t, i):
        s = utility.collect_yield(t)

        # Statistics of the target grammar itself.
        scores['logprob:target:n'] += 1
        scores['logprob:target:length'] += len(s)
        scores["logprob:target:string"] += (
            target_inside.inside_log_probability(s))
        scores["logprob:target:bracket"] += (
            target_inside.inside_bracketed_log_probability(t))
        scores["logprob:target:tree"] += target.log_probability_derivation(t)

        # The same three quantities for every hypothesis, each guarded
        # separately so one failure does not suppress the other two.
        for index, (grammar, inside) in enumerate(
                zip(hypotheses, hypothesis_insides)):
            label = "logprob:hypothesis%d:" % index
            evaluations = (
                ("string", "string:failures",
                 inside.inside_log_probability, s),
                ("bracket", "bracket:failures",
                 inside.inside_bracketed_log_probability, t),
                ("tree", 'tree:failures',
                 grammar.log_probability_derivation, t),
            )
            for key, failure_key, log_prob, argument in evaluations:
                try:
                    scores[label + key] += log_prob(argument)
                except utility.ParseFailureException:
                    scores[label + failure_key] += 1

    monte_carlo(target, f, samples, max_length, seed)
    return scores
Пример #3
0
    def string_density(self, length, samples):
        """
        Estimate the proportion of strings of the given length that are in the grammar.

        ``samples`` trees of yield length ``length`` are drawn with
        ``self.sample`` (described by the original author as uniform sampling
        over derivations). For each sampled string the number of parses is
        counted, and the reciprocals of these counts are averaged. The
        estimate is

            (number of derivations / number of strings) * mean(1 / parses)

        where the number of derivations is ``self.get_total(length)`` and the
        number of strings is ``self.vocab ** length``.

        Raises:
            ValueError: if a sampled string has no parse under ``self.grammar``.
            ZeroDivisionError: if ``samples`` is 0.
        """
        derivations = self.get_total(length)
        strings = 1.0 * self.vocab**length
        parser = wcfg.InsideComputation(self.grammar)
        inverse = 0.0
        for _ in range(samples):
            tree = self.sample(length)
            w = collect_yield(tree)
            n = parser.count_parses(w)
            if n == 0:
                raise ValueError("Generated a string which cannot be parsed.")
            # Weight each string by 1/parses so that strings with many
            # derivations are not over-counted.
            inverse += 1.0 / n
        imean = inverse / samples
        return (derivations / strings) * imean
Пример #4
0
def labeled_exact_match(target,
                        hypothesis,
                        samples=1000,
                        max_length=30,
                        viterbi=False,
                        verbose=False,
                        seed=None):
    """
    Proportion of sampled strings whose hypothesis Viterbi parse equals the reference tree.

    Trees are sampled from ``target``. The reference tree is the sampled tree
    itself, or the target's Viterbi parse of its yield when ``viterbi`` is
    true. The hypothesis Viterbi parse is compared to the reference with
    ``==``; no relabeling is applied, so both grammars must use identical
    nonterminals.

    Args:
        target: grammar that trees are sampled from.
        hypothesis: grammar whose Viterbi parses are evaluated.
        samples: number of trees to draw (including ones later discarded).
        max_length: longest yield (inclusive) that is evaluated.
        viterbi: compare against the target's Viterbi parse instead of the
            sampled tree.
        verbose: if true, log and print every mismatching pair of trees.
        seed: seed for the ``numpy.random.RandomState`` driving the sampler.

    Returns:
        Number of exact matches divided by the number of evaluated samples.
        A hypothesis parse failure counts as an evaluated, non-matching sample.

    Raises:
        ZeroDivisionError: if no sampled yield is short enough to be evaluated.
    """
    # RandomState(None) is the same as RandomState(); no None test is needed.
    rng = numpy.random.RandomState(seed)
    sampler = wcfg.Sampler(target, random=rng)
    # The target parser is only required when comparing against its Viterbi parse.
    inside_target = wcfg.InsideComputation(target) if viterbi else None
    inside_hypothesis = wcfg.InsideComputation(hypothesis)

    total = 0.0
    n = 0
    for _ in range(samples):
        t = sampler.sample_tree()
        s = utility.collect_yield(t)
        # max_length is inclusive: only strictly longer strings are skipped.
        if len(s) > max_length:
            continue
        n += 1
        if viterbi:
            t = inside_target.viterbi_parse(s)
        try:
            th = inside_hypothesis.viterbi_parse(s)
        except utility.ParseFailureException:
            # Treat this as a failure: counted in n but not in total.
            print("Parse failure of %s " % s)
            continue
        if th == t:
            total += 1
        elif verbose:
            logging.info("Mismatch in trees with parse of %s", s)
            print(th)
            print(t)
    return total / n
Пример #5
0
 def __init__(self, target_pcfg):
     """
     Keep a reference to the target grammar and build the helpers derived from it.

     ``target_pcfg`` must provide ``terminals``, ``terminal_expectations()``
     and ``production_expectations()``.
     """
     self.target_pcfg = target_pcfg
     self.terminals = set(target_pcfg.terminals)

     # Expectation tables are computed once, at construction time.
     self.te = target_pcfg.terminal_expectations()
     self.pe = target_pcfg.production_expectations()

     # No random source is supplied to the sampler here.
     self.sampler = wcfg.Sampler(target_pcfg)
     self.insider = wcfg.InsideComputation(target_pcfg)

     # Starts out empty.
     self.contexts_map = dict()
Пример #6
0
 def string_density_crude(self, length, samples):
     """
     Estimate string density by rejection sampling.

     Draws ``samples`` random strings of ``length`` symbols, each symbol
     picked with ``numpy.random.choice`` from the grammar's terminals (using
     numpy's global random state), and returns the fraction of those strings
     that have at least one parse under ``self.grammar``.
     """
     alphabet = list(self.grammar.terminals)
     inside = wcfg.InsideComputation(self.grammar)

     def random_string():
         # One independent draw per position.
         return tuple(numpy.random.choice(alphabet) for _ in range(length))

     accepted = sum(1 for _ in range(samples)
                    if inside.count_parses(random_string()) > 0)
     return accepted / float(samples)
Пример #7
0
def conditional_kld(target,
                    hypothesis,
                    samples=1000,
                    verbose=False,
                    max_length=math.inf,
                    seed=None):
    """
    Estimate the KLD between the conditional distributions over trees.

    For a given string $w$ this is D( P(tree|w) || Q(tree|w) ), i.e. the
    difference between the tree KLD and the string KLD. It is estimated by
    sampling trees from ``target`` and averaging

        (log p(t) - log q(t)) - (log p(w) - log q(w))

    over the samples whose yield ``w`` has at most ``max_length`` symbols.

    Target must be a PCFG; hypothesis can be an arbitrary WCFG, not even
    convergent, with the same nonterminals.

    Args:
        target: grammar that trees are sampled from.
        hypothesis: grammar being compared against the target.
        samples: number of trees to draw (including ones later discarded).
        verbose: if true, print skipped lengths and per-sample log probabilities.
        max_length: longest yield (inclusive) that is kept.
        seed: seed for the ``numpy.random.RandomState`` driving the sampler.

    Returns:
        The mean of the per-sample differences described above.

    Raises:
        ZeroDivisionError: if no sampled yield is short enough to be retained.
    """
    # RandomState(None) is the same as RandomState(); passing the seed through
    # avoids ``seed == None``, which misbehaves for array-like seeds.
    rng = numpy.random.RandomState(seed)
    inside_target = wcfg.InsideComputation(target)
    inside_hypothesis = wcfg.InsideComputation(hypothesis)
    sampler = wcfg.Sampler(target, random=rng)
    total = 0.0
    n = 0
    for _ in range(samples):
        t = sampler.sample_tree()
        s = utility.collect_yield(t)
        if len(s) > max_length:
            if verbose:
                print("Skipping", len(s))
            continue
        n += 1
        # Derivation (tree) and string log probabilities under the target ...
        ptree = target.log_probability_derivation(t)
        pstring = inside_target.inside_log_probability(s)

        # ... and under the hypothesis.
        qtree = hypothesis.log_probability_derivation(t)
        qstring = inside_hypothesis.inside_log_probability(s)

        total += (ptree - qtree) - (pstring - qstring)
        if verbose:
            print("%s p(t) = %f, p(w) = %f, q(t) = %f, q(w) = %f" %
                  (s, ptree, pstring, qtree, qstring))
    return total / n
Пример #8
0
def estimate_bijection(target_pcfg,
                       hypothesis_pcfg,
                       samples=1000,
                       seed=None,
                       max_length=math.inf,
                       verbose=False):
    """
    Estimate a one-to-one map from target nonterminals to hypothesis nonterminals.

    Trees are sampled from ``target_pcfg``. Each sampled tree with a yield of
    at most ``max_length`` symbols is re-parsed by the hypothesis with
    ``bracketed_viterbi_parse``, and ``collect_nonterminal_pairs`` records
    co-occurrence counts of (target nonterminal, hypothesis nonterminal).
    The assignment maximising the total count is then found with
    ``linear_sum_assignment`` on the cost matrix ``max_count - count``.

    Args:
        target_pcfg: grammar that trees are sampled from.
        hypothesis_pcfg: grammar with the same number of nonterminals.
        samples: number of trees to draw.
        seed: seed for the ``numpy.random.RandomState`` driving the sampler.
        max_length: longest yield (inclusive) that is used.
        verbose: accepted but not used by this function.

    Returns:
        dict mapping every target nonterminal to a distinct hypothesis
        nonterminal.

    Raises:
        AssertionError: if the two grammars differ in number of nonterminals.
        ValueError: if no nonterminal pairs could be collected at all.
    """
    ## Essential assumption
    assert len(target_pcfg.nonterminals) == len(hypothesis_pcfg.nonterminals)
    # RandomState(None) is the same as RandomState(); no None test is needed.
    rng = numpy.random.RandomState(seed)
    sampler = wcfg.Sampler(target_pcfg, random=rng)
    insider = wcfg.InsideComputation(hypothesis_pcfg)
    n = len(target_pcfg.nonterminals)

    # pair_counts[target_nt][hypothesis_nt] -> co-occurrence count
    pair_counts = defaultdict(Counter)

    for _ in range(samples):
        tree = sampler.sample_tree()
        s = utility.collect_yield(tree)
        if len(s) > max_length:
            continue
        try:
            learned = insider.bracketed_viterbi_parse(tree)
            collect_nonterminal_pairs(tree, learned, pair_counts)
        except utility.ParseFailureException:
            print("Failed", s)

    if not pair_counts:
        # Same exception type the bare max() would raise, but with a message
        # that says what actually went wrong.
        raise ValueError(
            "No nonterminal pairs were collected; cannot estimate a bijection.")
    maximum_value = max(max(counts.values()) for counts in pair_counts.values())

    ## Now use the Hungarian algorithm.
    # linear_sum_assignment minimises cost, so counts are flipped into costs.
    cost_matrix = np.zeros((n, n))
    target_list = list(target_pcfg.nonterminals)
    hypothesis_list = list(hypothesis_pcfg.nonterminals)
    for i, a in enumerate(target_list):
        for j, b in enumerate(hypothesis_list):
            cost_matrix[i, j] = maximum_value - pair_counts[a][b]
    row_ind, col_ind = linear_sum_assignment(cost_matrix)
    return {target_list[i]: hypothesis_list[j] for i, j in zip(row_ind, col_ind)}
Пример #9
0
def bracketed_kld(target,
                  hypothesis,
                  samples=1000,
                  max_length=30,
                  seed=None,
                  verbose=False):
    """
    Estimate the KLD between the bracketed-tree distributions of two grammars.

    Sampling is delegated to ``monte_carlo``, which is given ``target``,
    ``samples``, ``max_length`` and ``seed``. For every tree passed to the
    callback, the bracketed inside log probability is computed under both
    grammars. The mean of (target minus hypothesis) is returned.

    Raises ZeroDivisionError if the callback is never invoked.
    """
    target_inside = wcfg.InsideComputation(target)
    hypothesis_inside = wcfg.InsideComputation(hypothesis)
    count = 0
    log_ratio_sum = 0

    def f(t, i):
        nonlocal count, log_ratio_sum
        lp = target_inside.inside_bracketed_log_probability(t)
        lq = hypothesis_inside.inside_bracketed_log_probability(t)
        # Only update the accumulators once both values are available.
        count += 1
        log_ratio_sum += lp - lq
        if verbose:
            print("Sample %d %s, target %f, hypothesis %f" % (i, t, lp, lq))

    monte_carlo(target, f, samples, max_length, seed)
    return log_ratio_sum / count
Пример #10
0
 def __init__(self, target_pcfg):
     """
     Set up the instance from a target PCFG.

     Derived helpers (the converted WCFG, an inside-computation object, the
     terminal set and terminal expectations) are built immediately. The
     remaining attributes start out empty or ``None``.
     """
     # Quantities derived directly from the target grammar.
     self.target_pcfg = target_pcfg
     self.target_wcfg = target_pcfg.convert_parameters_pi2xi()
     self.insider = wcfg.InsideComputation(target_pcfg)
     self.terminals = set(target_pcfg.terminals)
     self.te = target_pcfg.terminal_expectations()

     # Length bound: twice the number of nonterminals of the target.
     self.max_length = 2 * len(target_pcfg.nonterminals)

     # Empty containers and unset slots; presumably filled in by other
     # methods of the class (not visible here).
     self.kernel = list()
     self.nonterminal_map = dict()
     self.parameters = dict()
     self.kernel_grammar = None
     self.sampler = None
     self.asymptotic_grammar = None
     self.prod2context = dict()
Пример #11
0
def do_parseval_monte_carlo(target,
                            hypotheses,
                            samples=1000,
                            max_length=30,
                            seed=None,
                            verbose=False):
    """
    Run a batch of parsing-based evaluations on trees sampled from the target.

    ``hypotheses`` is a list of grammars to be evaluated. Sampling is
    delegated to ``monte_carlo``. For each tree it supplies, every candidate
    parse (one Viterbi parse per hypothesis, left-branching, right-branching
    and random baselines, and the target's own Viterbi parse) is compared to
    two references: the sampled tree ("original") and the target's Viterbi
    parse ("viterbi").

    Returns a ``defaultdict(int)`` with

    * ``trees_denominator``, ``labeled_denominator``, ``unlabeled_denominator``:
      totals accumulated over all supplied trees, and
    * ``"<reference>:<candidate>:<labeled|unlabeled>:<exact_match|microaveraged>"``:
      accumulated scores for each reference/candidate pair.

    If any hypothesis fails to parse a string, a message is printed and no
    pairwise scores are recorded for that string (the denominators still
    are). ``verbose`` is accepted but not used by this function.
    """
    target_inside = wcfg.InsideComputation(target)
    hypothesis_insides = [wcfg.InsideComputation(h) for h in hypotheses]
    baselines = Baselines(len(target.nonterminals))

    scores = defaultdict(int)

    def f(t, i):
        s = utility.collect_yield(t)
        scores['trees_denominator'] += 1
        scores['labeled_denominator'] += utility.count_labeled(t)
        scores['unlabeled_denominator'] += utility.count_unlabeled(t)

        gold_viterbi = target_inside.viterbi_parse(s)

        try:
            # Candidate parses: hypotheses first, then the baselines.
            candidates = [
                (inside.viterbi_parse(s), "hypothesis%d" % index)
                for index, inside in enumerate(hypothesis_insides)
            ]
            candidates.append((baselines.make_left_branch(s), "leftbranch"))
            candidates.append((baselines.make_right_branch(s), "rightbranch"))
            candidates.append((baselines.make_random_labeled(s), "random"))
            candidates.append((gold_viterbi, "gold viterbi"))

            references = ((t, "original"), (gold_viterbi, "viterbi"))
            for reference, reference_name in references:
                for candidate, candidate_name in candidates:
                    prefix = reference_name + ":" + candidate_name

                    # Exact match, with and without labels.
                    scores[prefix + ":labeled:exact_match"] += (
                        1 if reference == candidate else 0)
                    scores[prefix + ":unlabeled:exact_match"] += (
                        1 if utility.unlabeled_tree_equal(
                            reference, candidate) else 0)

                    # Microaveraged scores; only the first element of each
                    # returned pair is accumulated.
                    labeled_hits, _ = utility.microaveraged_labeled(
                        reference, candidate)
                    scores[prefix + ":labeled:microaveraged"] += labeled_hits
                    unlabeled_hits, _ = utility.microaveraged_unlabeled(
                        reference, candidate)
                    scores[prefix +
                           ":unlabeled:microaveraged"] += unlabeled_hits
        except utility.ParseFailureException:
            print("Parse failure of %s " % s)

    monte_carlo(target, f, samples, max_length, seed)
    return scores
Пример #12
0
                    action="store_true")

## Other options: control output format, what probs are calculated.

# Parse the command line using the argument parser configured above.
args = parser.parse_args()

# Load the grammar that trees will be sampled from.
mypcfg = wcfg.load_wcfg_from_file(args.inputfilename)

# Build the random number generator, seeded if a seed was given.
# NOTE(review): this is a truthiness test, so a seed of 0 is treated like
# "no seed" and yields an unseeded generator -- confirm whether
# `args.seed is not None` was intended (depends on the argparse default).
if args.seed:
    print("Setting seed to ", args.seed)
    prng = RandomState(args.seed)
else:
    prng = RandomState()

# The sampler draws trees using prng; the inside computation is built on the
# same grammar.
mysampler = wcfg.Sampler(mypcfg, random=prng)
insider = wcfg.InsideComputation(mypcfg)

with open(args.outputfilename, 'w') as outf:
    i = 0
    while i < args.n:
        tree = mysampler.sample_tree()
        # default is string.
        s = utility.collect_yield(tree)
        if not args.maxlength or len(s) <= args.maxlength:
            if not args.omitprobs:
                lps = insider.inside_log_probability(s)
                lpt = mypcfg.log_probability_derivation(tree)
                lpb = insider._bracketed_log_probability(tree)[mypcfg.start]
                outf.write("%0.12f %0.12f %0.12f " % (lpt, lpb, lps))
            if args.yieldonly:
                outf.write(" ".join(s) + "\n")