def get_unigram_cnt(bi_tree, unigram):
    """Look up the ``unigram`` entry of every internal node of a tree.

    The tree is walked breadth-first starting at node ``0``.  For each node
    that has a ``'children'`` entry, a depth-1 ``Tree`` is built from the
    node's ``'type'`` and the ``'type'`` of each of its children, rendered
    with ``tree_to_string`` and used as a key into ``unigram``.  Nodes
    without a ``'children'`` entry contribute nothing.

    Args:
        bi_tree: String containing a Python literal.  After
            ``ast.literal_eval`` it must be indexable by node id, each node
            being a mapping with a ``'type'`` key and, for internal nodes, a
            ``'children'`` key holding the ids of its children.
        unigram: Object indexed by depth-1 tree strings (presumably a
            mapping from tree string to count -- confirm with the caller).

    Returns:
        A list with one ``unigram[...]`` value per internal node, in
        breadth-first order.
    """
    # A plain deque is all a single-threaded BFS needs; Queue.Queue takes a
    # lock on every put/get and only exists under that name on Python 2.
    # Imported locally so the function does not rely on a file-level import.
    from collections import deque

    nodes = ast.literal_eval(bi_tree)

    uni_cnt = []
    pending = deque([0])
    while pending:
        node = nodes[pending.popleft()]
        if 'children' not in node:
            continue
        tree = Tree(name=node['type'])
        # One pass both schedules the child for a later visit and attaches
        # its type to the depth-1 tree of the current node.
        for child in node['children']:
            pending.append(child)
            tree.add_child(Tree(name=nodes[child]['type']))
        uni_cnt.append(unigram[tree_to_string(tree)])
    return uni_cnt
예제 #2
0
def tokenize_unigram(unigram, target):
    """Turn a tree into a breadth-first sequence of depth-1 tree tokens.

    Starting at node ``0``, every node that has a ``'children'`` entry is
    converted into a depth-1 ``Tree`` (its own ``'type'`` plus the
    ``'type'`` of each child) and rendered with ``tree_to_string``.  The
    rendered string is emitted when it is contained in ``unigram``;
    otherwise the module-level ``UNSEEN`` marker is emitted instead.

    Args:
        unigram: Container of known depth-1 tree strings (only membership
            is tested).
        target: Tree indexable by node id; each node is a mapping with a
            ``'type'`` key and, for internal nodes, a ``'children'`` key
            holding the ids of its children.

    Returns:
        A list of tokens, one per internal node, in breadth-first order.
    """
    # A plain deque is all a single-threaded BFS needs; Queue.Queue takes a
    # lock on every put/get and only exists under that name on Python 2.
    # Imported locally so the function does not rely on a file-level import.
    from collections import deque

    sentence = []
    pending = deque([0])
    while pending:
        node = target[pending.popleft()]
        if 'children' not in node:
            continue
        root = Tree(name=node['type'])
        # One pass both schedules the child for a later visit and attaches
        # its type to the depth-1 tree of the current node.
        for child in node['children']:
            pending.append(child)
            root.add_child(Tree(name=target[child]['type']))
        uni = tree_to_string(root)
        # UNSEEN is only read here, so no ``global`` declaration is needed.
        sentence.append(uni if uni in unigram else UNSEEN)
    return sentence
예제 #3
0
# Choose the random source: seeded only when args.seed is truthy.
# NOTE(review): a seed of 0 is falsy and therefore takes the unseeded
# branch -- confirm whether that is intended.
if not args.seed:
    prng = RandomState()
else:
    print("Setting seed to ", args.seed)
    prng = RandomState(args.seed)

mysampler = pcfg.Sampler(mypcfg, random=prng)
insider = inside.InsideComputation(mypcfg)

# Write one line per accepted sample until args.n lines have been written.
# Samples whose yield exceeds args.maxlength are rejected and do not count.
with open(args.outputfilename, 'w') as outf:
    i = 0
    while i < args.n:
        tree = mysampler.sample_tree()
        # Yield of the sampled tree; it is joined with spaces below, so it
        # is presumably a sequence of token strings.
        s = utility.collect_yield(tree)
        if args.maxlength and len(s) > args.maxlength:
            continue

        if not args.omitprobs:
            # Prefix the line with the derivation and bracketed log
            # probabilities, plus the inside log probability of the yield
            # unless args.omitinside is set.
            lpt = mypcfg.log_probability_derivation(tree)
            lpb = insider._bracketed_log_probability(tree)[mypcfg.start]
            if args.omitinside:
                outf.write("%e %e " % (lpt, lpb))
            else:
                lps = insider.inside_log_probability(s)
                outf.write("%e %e %e " % (lpt, lpb, lps))

        if args.yieldonly:
            outf.write(" ".join(s) + "\n")
        else:
            outf.write(utility.tree_to_string(tree) + "\n")
        i += 1
예제 #4
0
def tokenize_bigram(bigram, bigram_score, unigram, target):
    """Tokenize a tree into depth-1 (unigram) and depth-2 (bigram) fragments.

    The tree is walked depth-first from node ``0`` with children visited
    left to right.  For every internal node not yet marked in ``vis``:

    * its depth-1 tree string is built; if that string is not in
      ``unigram`` the module-level ``UNSEEN`` marker is emitted;
    * otherwise every non-empty subset of its expandable children (internal
      children whose own depth-1 tree string is in ``unigram``) is expanded
      by one level, and the resulting strings found in ``bigram`` become
      candidates;
    * if the best candidate score exceeds the module-level ``THRESH``, one
      candidate with that score and the fewest expanded children is
      emitted and the expanded children are marked visited, so they do not
      emit a token of their own; otherwise the plain depth-1 string is
      emitted.

    Args:
        bigram: Container of known depth-2 tree strings (membership only).
        bigram_score: Mapping from depth-2 tree string to a numeric score.
        unigram: Container of known depth-1 tree strings (membership only).
        target: Tree supporting ``len()`` and indexing by node id; each
            node is a mapping with a ``'type'`` key and, for internal
            nodes, a ``'children'`` key holding the ids of its children.

    Returns:
        The list of emitted tokens in traversal order.
    """
    # UNSEEN and THRESH are only read, so no ``global`` declaration is needed.
    sentence = []
    vis = [0] * len(target)
    stack = [0]
    while stack:
        i = stack.pop()
        if 'children' not in target[i]:
            vis[i] = 1
            continue
        children = target[i]['children']
        # Push in reverse so the leftmost child is popped first.  This is
        # done before the visited check on purpose: a node absorbed into
        # its parent's bigram must still have its descendants traversed.
        for child in reversed(children):
            stack.append(child)
        if vis[i] == 1:
            continue

        # Depth-1 tree rooted at node i.
        tree = Tree(name=target[i]['type'])
        for child in children:
            tree.add_child(Tree(name=target[child]['type']))
        str_tree = tree_to_string(tree)
        if str_tree not in unigram:
            sentence.append(UNSEEN)
            vis[i] = 1
            continue

        # Children that may be expanded one level: internal nodes whose
        # own depth-1 tree is a known unigram.
        to_traverse = []
        for child in children:
            if 'children' not in target[child]:
                continue
            subtree = Tree(name=target[child]['type'])
            for grand in target[child]['children']:
                subtree.add_child(Tree(name=target[grand]['type']))
            if tree_to_string(subtree) in unigram:
                to_traverse.append(child)

        if not to_traverse:
            # No depth-2 tree can be formed: emit the unigram.
            vis[i] = 1
            sentence.append(str_tree)
            continue

        # Collect candidate bigrams, largest subsets of expanded children
        # first.  The order in which a subset's children are expanded does
        # not affect the resulting tree, and repeated strings are discarded
        # below, so combinations yield exactly the candidates (in the same
        # order) that enumerating every permutation would -- without the
        # factorial blow-up.
        max_score = 0
        save_tree = []
        save_score = []
        save_vis = []
        save_child_num = []
        for num_nodes in range(len(to_traverse), 0, -1):
            for tup in itertools.combinations(to_traverse, num_nodes):
                tmp_vis = list(vis)
                tmp_tree = deepcopy(tree)
                tmp_vis[i] = 1
                for child in tup:
                    for idx, c in enumerate(children):
                        if c == child:
                            tmp_vis[c] = 1
                            for grand in target[child]['children']:
                                tmp_tree.children[idx].add_child(
                                    Tree(name=target[grand]['type']))
                str_bi_tree = tree_to_string(tmp_tree)

                if str_bi_tree not in bigram:
                    continue
                if str_bi_tree in save_tree:
                    continue
                score = bigram_score[str_bi_tree]
                save_tree.append(str_bi_tree)
                save_score.append(score)
                save_vis.append(tmp_vis)
                save_child_num.append(num_nodes)
                if score > max_score:
                    max_score = score

        # Candidates that reach the best score.  This list is empty when no
        # candidate was saved, or when every saved score is below the
        # initial max_score of 0; in both cases fall back to the unigram
        # instead of failing on an unassigned index.
        best = [ii for ii, score in enumerate(save_score)
                if score == max_score]
        if max_score > THRESH and best:
            mini = min(save_child_num[ii] for ii in best)
            # Among the best-scoring candidates with the fewest expanded
            # children, the last one saved wins.
            # NOTE(review): the original comment said "leftmost", but its
            # loop kept the last match; that behaviour is preserved here --
            # confirm which one is intended.
            real_target = [ii for ii in best
                           if save_child_num[ii] == mini][-1]
            vis = save_vis[real_target]
            sentence.append(save_tree[real_target])
        else:
            vis[i] = 1
            sentence.append(str_tree)
    return sentence
예제 #5
0
import utility
import argparse

# Command line: input and output treebank file names, both positional.
parser = argparse.ArgumentParser(
    description='Convert treebank from MJIO to normal')
parser.add_argument('input', type=str, help='filename of input treebank')
parser.add_argument('output', type=str, help='filename of output treebank')

args = parser.parse_args()

# Convert the treebank line by line; lines that do not start with "(" are
# skipped and produce no output.
with open(args.input) as inf, open(args.output, 'w') as outf:
    for line in inf:
        if not line.startswith("("):
            continue
        tree = utility.string_to_tree(line)
        ltree = utility.convert_mjtree(tree)
        output = utility.tree_to_string(ltree)
        outf.write(output + "\n")