# Chunk of an "unkify" script: decide the set of frequent terminals (kept as-is)
# and compute output filenames; everything else becomes UNK downstream.
parser.add_argument(
    '--outgrammars',
    type=str,
    help='output filenames of grammars to be unkified. (optional -- default just adds .unk to the end of the filename) ',
    nargs='*')
args = parser.parse_args()
unk = args.unk
if args.data:
    # Frequency counts come from a data file rather than from a grammar.
    print("Unkifying using file.")
    myunk = Unkifier(args.filename, args.unk, args.mincount)
    frequent = myunk.frequent
else:
    # Frequency information is read off the first input grammar.
    print("Using first pcfg, ", args.ingrammars[0])
    target = wcfg.load_wcfg_from_file(args.ingrammars[0])
    if args.minexpectation > 0:
        # Keep every terminal whose expected count exceeds the threshold.
        frequent = set(target.frequent_terminals(args.minexpectation))
    elif args.vocabsize > 0:
        # Keep the N most frequent terminals; report the rarest one kept.
        # presumably most_frequent_terminals returns a list sorted by
        # decreasing expectation — TODO confirm
        frequent = target.most_frequent_terminals(args.vocabsize)
        rarest = frequent[-1]
        print("rarest", rarest, "expectation", target.terminal_expectations()[rarest])
        frequent = set(frequent)
    else:
        raise ValueError("No option set")
# Output filenames: use --outgrammars if it matches the number of inputs,
# otherwise derive them by appending ".unk" to each input filename.
if args.outgrammars and len(args.outgrammars) == len(args.ingrammars):
    outg = args.outgrammars
else:
    outg = [f + ".unk" for f in args.ingrammars]
#convert_wcfg_to_pcfg.py
"""Convert a (possibly inconsistent) bottom-up WCFG into a PCFG that defines
the same conditional distribution of trees given strings."""
import utility
import wcfg
import argparse

parser = argparse.ArgumentParser(
    description='Convert Grammar from potentially inconsistent BUWCFG to a '
                'PCFG that defines the same conditional distribution of '
                'trees given strings.')
parser.add_argument('input', type=str, help='filename of input grammar')
parser.add_argument('output', type=str, help='filename of output grammar')
args = parser.parse_args()

mywcfg = wcfg.load_wcfg_from_file(args.input)
if not mywcfg.is_convergent():
    # A divergent WCFG has an unbounded partition function; repair it first,
    # then print diagnostics on the renormalised grammar.
    print("Renormalising divergent WCFG")
    mywcfg = mywcfg.renormalise_divergent_wcfg2()
    print(mywcfg.check_local_normalisation())
    print(mywcfg.compute_partition_function_fp())
    #print(mywcfg.compute_partition_function_fast())
assert mywcfg.is_convergent()
mywcfg.renormalise()
mywcfg.store(args.output)
# Chunk of the local-learner driver: final CLI options, learner configuration,
# and the kernel search whose result is dumped as JSON.
parser.add_argument(
    '--min_count_nmf',
    type=int,
    default=100,
    help="Minimum frequency of words that can be considered to be anchors "
         "for nonterminals. (default 100)")
parser.add_argument('--verbose', action="store_true",
                    help="Print out some useful information")
args = parser.parse_args()

ll = locallearner.LocalLearner(args.input)
if args.cheat:
    # "Cheat" mode: read the true number of nonterminals off a target grammar
    # instead of taking it from the command line.
    target_pcfg = wcfg.load_wcfg_from_file(args.cheat)
    n = len(target_pcfg.nonterminals)
    print(f"Number of nonterminals {n}")
    ll.nonterminals = n
else:
    ll.nonterminals = args.nonterminals
ll.seed = args.seed
ll.number_clusters = args.number_clusters
ll.min_count_nmf = args.min_count_nmf

kernels = ll.find_kernels(verbose=args.verbose)
with open(args.output, 'w') as outf:
    json.dump(kernels, outf)
# Chunk of a sampling script: seed an RNG, build a sampler over the grammar,
# and draw trees into an output file (loop body truncated in this view).
parser.add_argument("--seed", help="Choose random seed", type=int)
parser.add_argument("--maxlength", help="limit samples to this length", type=int)
parser.add_argument("--omitprobs", help="don't compute probabilities", action="store_true")
parser.add_argument("--yieldonly", help="just output the yield", action="store_true")
## Other options: control output format, what probs are calculated.
args = parser.parse_args()
mypcfg = wcfg.load_wcfg_from_file(args.inputfilename)
# NOTE(review): a seed of 0 is falsy and would be silently ignored here —
# confirm whether `args.seed is not None` was intended.
if args.seed:
    print("Setting seed to ", args.seed)
    prng = RandomState(args.seed)
else:
    prng = RandomState()
mysampler = wcfg.Sampler(mypcfg, random=prng)
insider = wcfg.InsideComputation(mypcfg)
with open(args.outputfilename, 'w') as outf:
    i = 0
    while i < args.n:
        tree = mysampler.sample_tree()
        # default is string.
# Chunk of a grammar-statistics script: estimate ambiguity and string density
# of a target PCFG (ends at a dangling `try:` — continuation not in view).
parser.add_argument('--json', type=str, help='location of the output json file if needed.')
parser.add_argument("--seed", help="Choose random seed", type=int)
parser.add_argument('--length', type=int, default=10, help='length to measure the string density at.')
parser.add_argument('--samples', type=int, default=1000, help='samples to measure the string density.')
parser.add_argument('--maxlength', type=int, default=20, help='limit on the length of strings when measuring ambiguity.')
args = parser.parse_args()
# NOTE(review): a seed of 0 is falsy and would be silently ignored here —
# confirm whether `args.seed is not None` was intended.
if args.seed:
    random.seed(args.seed)
    numpy.random.seed(args.seed)
verbose = False
result_dict = {}
target_pcfg = wcfg.load_wcfg_from_file(args.input)
# Ambiguity = conditional entropy H(tree | word), estimated by sampling.
target_ambiguity = target_pcfg.estimate_ambiguity(samples = args.samples, maxlength = args.maxlength)
result_dict["ambiguity"] = target_ambiguity
print("Target grammar ambiguity H( tree | word): %e" % target_ambiguity)
## Now try string density using a sensible approach.
us = uniformsampler.UniformSampler(target_pcfg, args.length)
sd = us.string_density(args.length, args.samples)
print("String density: %e" % sd)
result_dict["string density"] = sd
# Naive estimate kept alongside for comparison with the uniform sampler.
naivesd = target_pcfg.estimate_string_density(args.length, args.samples)
print("Naive String density: %e" % naivesd)
result_dict["naive string density"] = naivesd
try:
# Chunk of the oracle-learner driver: seed RNGs, run the oracle learner on a
# target PCFG, and store the learned WCFG (or an empty file on failure).
parser.add_argument("--seed", help="Choose random seed", type=int)
parser.add_argument("--verbose", help="Print useful information", action="store_true")
parser.add_argument('input', type=str, help='location of the target pcfg.')
parser.add_argument('output', type=str, help='location of the output wcfg.')
args = parser.parse_args()

# Fix: `if args.seed:` treated a seed of 0 as "no seed" (falsy) and silently
# ignored it; compare against None so every explicit seed is honoured.
if args.seed is not None:
    random.seed(args.seed)
    numpy.random.seed(args.seed)

oracle_learner.N_SAMPLES = args.nsamples
oracle_learner.MAX_SAMPLES = args.maxsamples

target_pcfg = wcfg.load_wcfg_from_file(args.input)
logging.info("Loaded")
output_wcfg = args.output
ol = oracle_learner.OracleLearner(target_pcfg)
og = ol.test()
if og:
    og.store(output_wcfg)
else:
    ## create an empty file so downstream tools still find output at the
    ## expected path even when the grammar is unanchored.
    logging.warning("Error: Unanchored, empty wcfg")
    open(output_wcfg, 'a').close()
# Chunk of an EM/SGD-style training driver: batch hyperparameters, then set up
# a temporary directory and MJIO-format filenames (script continues past view).
parser.add_argument("--batchsize", type=int, default=10000, help="Samples in batch size (default 10000)")
parser.add_argument("--maxbatches", type=float, default=math.inf, help="Number of batches (default is all of them)")
parser.add_argument("--alpha", type=float, default=0.75, help="Alpha parameter eta_k = (k+2)^{-alpha}, default 0.75")
import glob  # NOTE(review): glob appears unused in this visible chunk; also mid-file import — confirm
args = parser.parse_args()
bsz = args.batchsize
batches = args.maxbatches
alpha = args.alpha
epochs = args.epochs
## Maybe set the parameters intelligently wrt to the size of the data etc.
## Create temporary directory
mywcfg = wcfg.load_wcfg_from_file(args.grammar)
# NOTE(review): mkdtemp() is never cleaned up in this visible chunk (the
# TemporaryDirectory context-manager variant is commented out) — confirm.
tmpdir = mkdtemp()
#with TemporaryDirectory() as tmpdir:
print("Creating temp directory ", tmpdir)
## Convert file to MJIO format
mjio_filename1 = tmpdir + "/igrammar.mjio"
mjio_counts = tmpdir + "/ogrammar.counts"
"""Relabel a hypothesis grammar so its nonterminal names match a target
grammar, using a sampled estimate of the best label bijection."""
import wcfg
import argparse
import evaluation
import logging

parser = argparse.ArgumentParser(
    description='Map the labels of the first grammar onto the second grammar and save it')
parser.add_argument('target', type=str, help='filename of grammar with the right nonterminals')
parser.add_argument('hypothesis', type=str, help='filename of grammar to be relabeled.')
# Fix: help string had an unbalanced parenthesis.
parser.add_argument('output', type=str, help='filename of output grammar (isomorphic to hypothesis)')
parser.add_argument('--samples', type=int, default=1000, help='Number of samples to use (default 1000)')
parser.add_argument('--verbose', action="store_true", help='Print out relabelling')
args = parser.parse_args()

target = wcfg.load_wcfg_from_file(args.target)
hypothesis = wcfg.load_wcfg_from_file(args.hypothesis)

# Sanity check: if the rarest nonterminal is expected well under ~10 times in
# the sample, the bijection estimate may be unreliable.
minn = min(target.nonterminal_expectations().values())
if args.samples < 10 / minn:
    logging.warning(
        "May be too few samples to correctly estimate; since minimum nonterminal expectation is %f",
        minn)

mapping = evaluation.estimate_bijection(target, hypothesis, args.samples)
if args.verbose:
    for a, b in mapping.items():
        print(a, "->", b)

# estimate_bijection maps target labels to hypothesis labels; relabel needs
# the inverse (hypothesis label -> target label), hence the flipped dict.
output = hypothesis.relabel({a: b for b, a in mapping.items()})
output.store(args.output)
# Chunk of a kernel-evaluation script: score candidate kernel sets (one JSON
# list per line) against a gold grammar (loop body truncated in this view).
import argparse
import evaluation
import utility
import math
import sys
import json
from collections import defaultdict

parser = argparse.ArgumentParser(description='Evaluate kernels against a target pcfg')
parser.add_argument('target', type=str, help='filename of original (gold) grammar')
parser.add_argument('kernels', help='filename of kernels')
parser.add_argument('--json', help='filename of json file')
args = parser.parse_args()
scores = {}
# NOTE(review): `wcfg` is used here but not imported in this visible chunk —
# presumably imported elsewhere in the file; confirm.
target = wcfg.load_wcfg_from_file(args.target)
with open(args.kernels, 'r') as inf:
    for line in inf:
        #print(line)
        # Each line is a JSON-encoded list of kernel (anchor) terminals.
        kernels = json.loads(line)
        results = set()
        product = 1.0
        te = target.terminal_expectations()
        for a in kernels:
            if a == 'S':
                # The start symbol maps to itself and needs no anchor lookup.
                print("Skipping S")
                results.add('S')
            else:
                x = target.find_best_lhs(a)
# Chunk of a hyperparameter-estimation script: read expectations off a target
# grammar and start building a result dict (truncated in this view).
if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Estimate hyperparameters for a learner from a PCFG. Dont use this on a grammar you will learn. Of course.'
    )
    parser.add_argument('input', type=str, help='filename of input grammar')
    # parser.add_argument('output', type=str, help='filename of output json file')
    parser.add_argument("--posterior", help="Kernel posterior (default 0.9)", default=0.9, type=float)
    args = parser.parse_args()

    target = wcfg.load_wcfg_from_file(args.input)
    te = target.terminal_expectations()
    pe = target.production_expectations()
    ## Hyperparams
    threshold = args.posterior
    ## Samples needed
    ## Number of clusters
    ## Min count
    ## Number of clusters.
    # Heuristic: allow twice as many clusters as true nonterminals.
    number_clusters = 2 * len(target.nonterminals)
    result = {}
    result["kernel"] = True