def test_uniformsampler2(self):
    grammar = cfg.load_from_file("../data/cfgs/count1.cfg")
    sampler = uniformsampler.UniformSampler(grammar, 10)
    self.assertEqual(sampler.get("S", 0), 1)
    self.assertEqual(sampler.get("S", 1), 2)
    self.assertEqual(sampler.get("S", 2), 1)
    self.assertEqual(sampler.get("S", 3), 0)
    self.assertEqual(sampler.get("S", 5), 0)
def test_smart_infix(self):
    grammar = cfg.load_from_file("../data/cfgs/abab2.cfg")
    # Sample from the strings containing w1 that don't use the nonterminal.
    w1 = ("a1", "b1")
    nonterminal = "O"
    ig = grammar.infix_grammar_without_nt(w1, nonterminal)
    self.assertTrue(len(ig.nonterminals) > 0)
    sampler = uniformsampler.UniformSampler(ig, 20)
    self.assertEqual(sampler.get_total(3), 2)
def test_fkp_strong_1nt(self):
    """ Test finding strings for a given NT. """
    grammar = cfg.load_from_file("../data/cfgs/abab2.cfg")
    parser = earleyparser.EarleyParser(grammar)
    sampler = uniformsampler.UniformSampler(grammar, 100)
    k = 2
    n = 5
    nonterminal = "O"
def test_uniformsampler(self):
    grammar = cfg.load_from_file("../data/cfgs/cfg1.cfg")
    sampler = uniformsampler.UniformSampler(grammar, 20)
    self.assertEqual(sampler.get("S", 0), 0)
    self.assertEqual(sampler.get("S", 1), 1)
    self.assertEqual(sampler.get("S", 2), 0)
    self.assertEqual(sampler.get("S", 3), 2)
    self.assertEqual(sampler.get("S", 5), 4)
    tree = sampler.sample(5)
    self.assertEqual(tree.width(), 5)
def test_intersection1(self):
    grammar = cfg.load_from_file("../data/cfgs/cfg1.cfg")
    prefix = ("ax",)
    pg = grammar.prefix_grammar(prefix)
    self.assertTrue("S" in pg.nonterminals)
    sampler = uniformsampler.UniformSampler(pg, 10)
    self.assertEqual(sampler.get("S", 0), 0)
def test_context_sampler1(self):
    grammar = cfg.load_from_file("../data/cfgs/cfg6.cfg")
    us = uniformsampler.UniformSampler(grammar, 5)
    cs = uniformsampler.ContextSampler(grammar, us, 5)
    self.assertEqual(cs.index["B"][0], 0)
    self.assertEqual(cs.index["S"][0], 1)
    self.assertEqual(cs.index["B"][2], 3)
    context0 = cs.sample_context("S", 0)
    self.assertEqual(context0, ((), ()))
    for i in xrange(100):
        l, r = cs.sample_context("B", 2)
        self.assertEqual(len(l) + len(r), 2)
def test_one_fkp_exact(grammar, nyields):
    """ See if this grammar has the exact 1-FKP. """
    sampler = uniformsampler.UniformSampler(grammar, max_substring_length)
    result = dict()
    for nt in grammar.nonterminals:
        w = test_one_fkp_nt_exact(grammar, sampler, nt, nyields)
        if w:
            result[nt] = w
        else:
            logging.info("failed on %s" % nt)
            return False
    return result
def test_strong_fkp_full(grammar, k):
    """
    Main entry point for the primal tester.
    Returns False on failure, otherwise a map from nonterminals
    to k-tuples of strings.
    """
    result = dict()
    parser = earleyparser.EarleyParser(grammar)
    sampler = uniformsampler.UniformSampler(grammar, max_substring_length)
    ncontexts = 25
    for nt in grammar.nonterminals:
        r = test_strong_fkp_nt(grammar, parser, sampler, nt, k, ncontexts, stop=True)
        if r:
            result[nt] = r
        else:
            return False
    return result
def count_one_fcp_exact(grammar, ncontexts):
    """
    See if this grammar has the exact 1-FCP, using ncontexts contexts
    per nonterminal. Returns a map from each nonterminal to a
    characterising context, where one was found.
    """
    sampler = uniformsampler.UniformSampler(grammar, max_substring_length)
    contextsampler = uniformsampler.ContextSampler(grammar, sampler, max_context_length)
    result = dict()
    for nt in grammar.nonterminals:
        if nt in grammar.start_set and len(grammar.start_set) == 1:
            # A unique start symbol is characterised by the empty context.
            result[nt] = ((), ())
        else:
            c = test_one_fcp_nt_exact(grammar, sampler, contextsampler, nt, ncontexts)
            if c:
                result[nt] = c
    return result
def test_strong_fcp_full(grammar, k):
    """
    Main entry point for the dual tester.
    Returns False on failure, otherwise a map from nonterminals
    to k-tuples of contexts.
    """
    result = dict()
    parser = earleyparser.EarleyParser(grammar)
    sampler = uniformsampler.UniformSampler(grammar, max_substring_length)
    contextsampler = uniformsampler.ContextSampler(grammar, sampler, max_context_length)
    ncontexts = 25
    for nt in grammar.nonterminals:
        r = test_strong_fcp_nt(grammar, parser, sampler, contextsampler, nt, k, ncontexts, stop=True)
        if r:
            print "nt", nt, r
            result[nt] = r
        else:
            print "Fail ", nt
            return False
    return result
def test_one_fkp_nt_string_inexact(grammar, nonterminal, w, ncontexts):
    """
    See if this nonterminal is characterised by this single substring,
    which is one of its yields. First construct the infix grammar that
    excludes the nonterminal; if that grammar is empty, the substring
    characterises the nonterminal trivially.
    """
    ig = grammar.infix_grammar_without_nt(w, nonterminal)
    if ig.is_empty():
        return True
    # It is nonempty, so sample strings containing w.
    sampler = uniformsampler.UniformSampler(ig, max_substring_length)
    samples = sampler.multiple_sample(ncontexts, ncontexts ** 2)
    contexts = []
    for lwr in set(samples):
        # Extract the contexts of w; each will then be tested
        # as a context of the nonterminal.
        for context in extract(lwr, w):
            contexts.append(context)
def pick_some_inside_strings(igrammar, context, nsamples):
    """
    Pick some strings generated by this grammar, which is the
    intersection with a context automaton.
    Method: construct a sampler, extend the length bound to account for
    the context, sample, and then snip the context off.
    """
    (left, right) = context
    ll = len(left)
    lr = len(right)
    l = ll + lr
    # The intersected grammar generates no strings shorter than the context itself.
    sampler = uniformsampler.UniformSampler(igrammar, l + max_substring_length)
    raw = list(set(sampler.multiple_sample(nsamples, nsamples * 2)))
    # Snip the left and right parts of the context off each sample.
    result = [x[ll: len(x) - lr] for x in raw]
    return result
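# A minimal illustration of the snipping step in pick_some_inside_strings,
# using hypothetical tokens (not taken from the test data): every raw sample
# of the intersected grammar starts with `left` and ends with `right`, so the
# slice x[ll : len(x) - lr] recovers the inside string.
def _snip_demo():
    left, right = ("a",), ("b", "b")
    x = left + ("w1", "w2") + right   # shape of one raw sampled yield
    inside = x[len(left): len(x) - len(right)]
    assert inside == ("w1", "w2")
    return inside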
def sample_from_substring(grammar, w, n):
    """
    Return a set of up to n short contexts of this substring.
    If we can't find enough, return what we have.
    """
    infixgrammar = grammar.infix_grammar(w)
    sampler = uniformsampler.UniformSampler(infixgrammar, len(w) + max_context_length)
    total = 0
    counts = []
    lengths = []
    for i in xrange(len(w), len(w) + max_context_length):
        c = sampler.get_total(i)
        counts.append(c)
        lengths.append(i)
        total += c
        if total > n * n:
            break
    else:
        # The loop ran to completion: the number of strings is low.
        logging.warning("Sparse contexts")
    max_attempts = n * n
    result = set()
    # Use float division: integer division would zero out the distribution.
    distribution = [float(x) / total for x in counts]
    clengths = numpy.random.choice(lengths, max_attempts, p=distribution)
    for l in clengths:
        lwr = sampler.sample(l).collectYield()
        for context in extract(lwr, w):
            result.add(context)
        if len(result) >= n:
            return result
    # Ran out of attempts before accumulating n contexts.
    logging.warning("Didn't find all of the contexts.")
    return result
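# A hedged sketch of the `extract` helper used above; its real definition
# lives elsewhere in the repo. For each occurrence of the substring w in a
# sampled yield lwr, it presumably emits the surrounding (left, right) context.
def extract_sketch(lwr, w):
    """Yield every context (l, r) such that l + w + r == lwr."""
    n = len(w)
    for i in xrange(len(lwr) - n + 1):
        if tuple(lwr[i:i + n]) == tuple(w):
            yield (tuple(lwr[:i]), tuple(lwr[i + n:]))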
# Imports assumed by this fragment.
import generatecfg
import uniformsampler
import matplotlib.pyplot as plt

factory = generatecfg.CnfFactory()
factory.number_nonterminals = 10
factory.number_terminals = 100
factory.number_binary_productions = 30
factory.number_lexical_productions = 100

number_grammars = 5
max_length = 20
samples_per_length = 10

for g in xrange(number_grammars):
    print "Grammar ", g
    grammar = factory.make_grammar()
    us = uniformsampler.UniformSampler(grammar, max_length)
    # Exact density at each length.
    x = []
    y = []
    for l in xrange(1, max_length + 1):
        density = us.density(l)
        x.append(l)
        y.append(density)
    plt.plot(x, y, "bx-")
    # Estimated string density at each length.
    x = []
    y = []
    for l in xrange(1, max_length + 1):
        density = us.string_density(l, samples_per_length)
        x.append(l)
        y.append(density)
    plt.plot(x, y, "ro-")

plt.xlabel('Length')
plt.ylabel('Density')
args = parser.parse_args()
if args.seed:
    random.seed(args.seed)
    numpy.random.seed(args.seed)

verbose = False
result_dict = {}
target_pcfg = wcfg.load_wcfg_from_file(args.input)
target_ambiguity = target_pcfg.estimate_ambiguity(samples=args.samples, maxlength=args.maxlength)
result_dict["ambiguity"] = target_ambiguity
print("Target grammar ambiguity H(tree | word): %e" % target_ambiguity)

## Now estimate string density using a sensible approach.
us = uniformsampler.UniformSampler(target_pcfg, args.length)
sd = us.string_density(args.length, args.samples)
print("String density: %e" % sd)
result_dict["string density"] = sd

naivesd = target_pcfg.estimate_string_density(args.length, args.samples)
print("Naive string density: %e" % naivesd)
result_dict["naive string density"] = naivesd

try:
    asymptotic_wcfg = wcfg.load_wcfg_from_file(args.output)
    ## Check to see if the kernel is in fact identifiable.
    ol = oracle_learner.OracleLearner(target_pcfg)
def test_density1(self):
    grammar = cfg.load_from_file("../data/cfgs/cfg1.cfg")
    sampler = uniformsampler.UniformSampler(grammar, 10)
    density = 4.0 / (5 ** 5)
    self.assertAlmostEqual(sampler.density(5), density)
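# A hedged reading of the expected value in test_density1, assuming density(l)
# is the number of strings of length l divided by alphabet_size ** l: the
# tests above show cfg1.cfg has 4 strings of length 5 (sampler.get("S", 5)
# == 4), which gives 4.0 / 5 ** 5 if the terminal alphabet has 5 symbols.
def density_sketch(num_strings, alphabet_size, length):
    # e.g. density_sketch(4, 5, 5) == 4.0 / (5 ** 5)
    return float(num_strings) / (alphabet_size ** length)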