def get_unigram_cnt(bi_tree, unigram):
    """Look up the unigram entry of every internal node of a serialized tree.

    A "unigram" here is the depth-one subtree made of a node and its direct
    children, rendered to a string with ``tree_to_string``.

    Args:
        bi_tree: String holding a Python literal; it is parsed with
            ``ast.literal_eval``.  The parsed value is indexed by integer
            node id (the walk starts at id 0) and every node is a mapping
            with a ``'type'`` entry and, for internal nodes, a
            ``'children'`` list of node ids.
        unigram: Mapping subscripted by the unigram string.  Presumably the
            values are counts (going by the function name) -- they are
            returned as-is.

    Returns:
        A list with one ``unigram[...]`` value per node that has a
        ``'children'`` entry, in breadth-first order from node 0.

    Note:
        The lookup is a plain subscript, so a unigram string missing from
        ``unigram`` propagates the mapping's own error (``KeyError`` for a
        dict).
    """
    # A deque is a plain FIFO; the thread-safe Queue.Queue used before only
    # added locking overhead to this single-threaded walk.  The import is
    # local so the function does not depend on file-level imports.
    from collections import deque

    nodes = ast.literal_eval(bi_tree)

    uni_cnt = []
    pending = deque([0])
    while pending:
        node = nodes[pending.popleft()]
        if 'children' not in node:
            # Leaves contribute no unigram of their own.
            continue
        children = node['children']
        pending.extend(children)

        # Depth-one tree: the node's type over the types of its children.
        tree = Tree(name=node['type'])
        for child in children:
            tree.add_child(Tree(name=nodes[child]['type']))
        uni_cnt.append(unigram[tree_to_string(tree)])
    return uni_cnt
def tokenize_unigram(unigram, target):
    """Turn a tree into a sequence of unigram tokens.

    Every node of ``target`` that has a ``'children'`` entry yields one
    token: the ``tree_to_string`` rendering of the depth-one subtree formed
    by the node and its direct children.  Renderings that are not contained
    in ``unigram`` are replaced by the module-level ``UNSEEN`` placeholder.

    Args:
        unigram: Container of known unigram strings (only ``in`` is used).
        target: Tree indexed by integer node id, starting at id 0.  Each
            node is a mapping with a ``'type'`` entry and, for internal
            nodes, a ``'children'`` list of node ids.

    Returns:
        The list of tokens in breadth-first order from node 0.
    """
    # A deque is a plain FIFO; the thread-safe Queue.Queue used before only
    # added locking overhead to this single-threaded walk.  The import is
    # local so the function does not depend on file-level imports.
    from collections import deque

    sentence = []
    pending = deque([0])
    while pending:
        node = target[pending.popleft()]
        if 'children' not in node:
            # Leaves produce no token.
            continue
        children = node['children']
        pending.extend(children)

        root = Tree(name=node['type'])
        for child in children:
            root.add_child(Tree(name=target[child]['type']))

        uni = tree_to_string(root)
        sentence.append(uni if uni in unigram else UNSEEN)
    return sentence
# Build the random source for the sampler.
# The test is "is not None" rather than truthiness so that an explicit seed
# of 0 is honoured instead of silently falling back to an unseeded state.
# NOTE(review): this assumes the parser leaves args.seed as None when no seed
# is given (argparse's behaviour without an explicit default); the parser
# definition is outside this fragment -- confirm.
if args.seed is not None:
    print("Setting seed to ", args.seed)
    prng = RandomState(args.seed)
else:
    prng = RandomState()

mysampler = pcfg.Sampler(mypcfg, random=prng)
insider = inside.InsideComputation(mypcfg)

# Write args.n sampled trees (or their yields), one per line, each optionally
# prefixed by its log probabilities.
with open(args.outputfilename, 'w') as outf:
    i = 0  # number of lines written so far
    while i < args.n:
        tree = mysampler.sample_tree()
        # Yield of the sampled tree; per the original note the default form
        # is string.  It is joined with spaces when only the yield is written.
        s = utility.collect_yield(tree)

        # A falsy maxlength means "no limit"; over-long samples are rejected
        # and do not count towards args.n.
        if args.maxlength and len(s) > args.maxlength:
            continue

        if not args.omitprobs:
            # Log probability of the derivation and of the bracketed string.
            lpt = mypcfg.log_probability_derivation(tree)
            lpb = insider._bracketed_log_probability(tree)[mypcfg.start]
            if args.omitinside:
                outf.write("%e %e " % (lpt, lpb))
            else:
                # The inside log probability of the yield is only computed
                # when it is going to be written.
                lps = insider.inside_log_probability(s)
                outf.write("%e %e %e " % (lpt, lpb, lps))

        if args.yieldonly:
            outf.write(" ".join(s) + "\n")
        else:
            outf.write(utility.tree_to_string(tree) + "\n")
        i += 1
def _bigram_local_tree(target, i):
    """Build the depth-one Tree of node ``i``: its type over its children's types."""
    local = Tree(name=target[i]['type'])
    for child in target[i]['children']:
        local.add_child(Tree(name=target[child]['type']))
    return local


def tokenize_bigram(bigram, bigram_score, unigram, target):
    """Turn a tree into a sequence of unigram / bigram tokens.

    The tree is walked depth-first from node 0 (children visited left to
    right).  For every internal node that is not already covered by an
    earlier bigram, one token is emitted:

    * ``UNSEEN`` if the node's depth-one subtree (its "unigram") is not in
      ``unigram``;
    * otherwise the best-scoring "bigram" -- the unigram with the children of
      some of its child nodes attached -- provided that bigram is in
      ``bigram`` and its score is greater than the module-level ``THRESH``;
    * otherwise the unigram string itself.

    Children expanded by a chosen bigram are marked as covered and emit no
    token of their own (their descendants are still walked).

    Candidate bigrams are enumerated from the largest set of expanded
    children down to a single child.  Among candidates that share the
    maximum score, the one enumerated last wins, which is also one with the
    fewest expanded children.
    NOTE(review): the original comment described this tie-break as
    "leftmost", but the selection loop has always kept the last-enumerated
    candidate; that behaviour is preserved here -- confirm the intent.

    Args:
        bigram: Container of known bigram strings (only ``in`` is used).
        bigram_score: Mapping from bigram string to its score.
        unigram: Container of known unigram strings (only ``in`` is used).
        target: Tree indexed by integer node id, starting at id 0.  Each
            node is a mapping with a ``'type'`` entry and, for internal
            nodes, a ``'children'`` list of node ids.

    Returns:
        The list of tokens in the order the nodes were visited.
    """
    sentence = []
    vis = [0] * len(target)  # 1 = node already covered by an emitted token
    stack = [0]
    while stack:
        i = stack.pop()
        if 'children' not in target[i]:
            vis[i] = 1
            continue
        children = target[i]['children']
        # Children are scheduled even when this node is already covered, so
        # the walk still reaches everything below an absorbed node.
        stack.extend(reversed(children))
        if vis[i] == 1:
            continue

        tree = _bigram_local_tree(target, i)
        str_tree = tree_to_string(tree)
        if str_tree not in unigram:
            sentence.append(UNSEEN)
            vis[i] = 1
            continue

        # Children that can be expanded: internal nodes whose own unigram
        # is known.
        to_traverse = [
            child for child in children
            if 'children' in target[child]
            and tree_to_string(_bigram_local_tree(target, child)) in unigram
        ]
        if not to_traverse:
            # No depth-two tree exists: fall back to the unigram.
            vis[i] = 1
            sentence.append(str_tree)
            continue

        # The score floor of 0 is kept from the original implementation:
        # only candidates scoring at least 0 can be selected.
        max_score = 0
        best_tree = None
        best_vis = None
        seen = set()  # bigram strings already scored
        for num_nodes in range(len(to_traverse), 0, -1):
            # The expanded tree depends only on WHICH children are chosen,
            # not on their order, so combinations enumerate every distinct
            # candidate in the same first-seen order as permutations did,
            # without rebuilding identical trees.
            for tup in itertools.combinations(to_traverse, num_nodes):
                tmp_tree = deepcopy(tree)
                for child in tup:
                    for idx, c in enumerate(children):
                        if c == child:
                            for grand in target[child]['children']:
                                tmp_tree.children[idx].add_child(
                                    Tree(name=target[grand]['type']))
                str_bi_tree = tree_to_string(tmp_tree)
                if str_bi_tree not in bigram or str_bi_tree in seen:
                    continue
                seen.add(str_bi_tree)

                score = bigram_score[str_bi_tree]
                # ">=" keeps the last candidate among equal maxima, matching
                # the original selection.
                if score >= max_score:
                    max_score = score
                    best_tree = str_bi_tree
                    best_vis = list(vis)
                    best_vis[i] = 1
                    for child in tup:
                        best_vis[child] = 1

        # best_tree stays None when no candidate reached the score floor;
        # the original then read an unset (or stale) index if THRESH was
        # negative.  Fall back to the unigram in that case.
        if best_tree is not None and max_score > THRESH:
            vis = best_vis
            sentence.append(best_tree)
        else:
            vis[i] = 1
            sentence.append(str_tree)
    return sentence
# Command-line script: read a treebank file, convert every tree with
# utility.convert_mjtree and write the converted trees one per line.
import argparse

import utility

parser = argparse.ArgumentParser(
    description='Convert treebank from MJIO to normal')
parser.add_argument('input', type=str, help='filename of input treebank')
parser.add_argument('output', type=str, help='filename of output treebank')
args = parser.parse_args()

with open(args.input) as inf, open(args.output, 'w') as outf:
    for line in inf:
        # Only lines that open with a bracket are parsed as trees; every
        # other line is dropped from the output.
        if not line.startswith("("):
            continue
        tree = utility.string_to_tree(line)
        ltree = utility.convert_mjtree(tree)
        outf.write(utility.tree_to_string(ltree) + "\n")