Example #1
    def test_json_grammar_export(self):
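        # Export example graphs to JSON: an acyclic DOG, a DSG as a
        # bihypergraph, and a rule DOG with nonterminals.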
        dog = build_acyclic_dog()
        terminals = Enumerator()
        data = dog.export_graph_json(terminals)
        with open('/tmp/json_graph_1.json', 'w') as file:
            json.dump(data, file)

        dsg = build_dsg()
        data = dsg.export_bihypergraph_json(terminals)
        with open('/tmp/json_bigraph_1.json', 'w') as file:
            json.dump(data, file)

        rule_dog = dog_se()
        data2 = rule_dog.export_graph_json(terminals)
        with open('/tmp/json_nonterminal_graph_1.json', 'w') as file:
            json.dump(data2, file)

        terminals.print_index()

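        # Induce a grammar from the DSG via right-branching recursive
        # partitioning, then export grammar and corpus to JSON.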
        dsg = build_dsg()
        rec_part_strategy = the_recursive_partitioning_factory().get_partitioning('right-branching')[0]
        rec_part = rec_part_strategy(dsg)
        dcmp = compute_decomposition(dsg, rec_part)

        grammar = induce_grammar_from(dsg, rec_part, dcmp, labeling=simple_labeling, terminal_labeling=str)

        print(grammar)
        data = export_dog_grammar_to_json(grammar, terminals)
        with open('/tmp/json_grammar.json', 'w') as file:
            json.dump(data, file)

        with open('/tmp/json_corpus.json', 'w') as file:
            json.dump(export_corpus_to_json([dsg], terminals), file)
Example #2
    def test_json_export(self):
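        # JSON export of the example graphs only; no grammar induction here.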
        dog = build_acyclic_dog()
        terminals = Enumerator()
        data = dog.export_graph_json(terminals)
        with open('/tmp/json_graph_1.json', 'w') as file:
            json.dump(data, file)

        dsg = build_dsg()
        data = dsg.export_bihypergraph_json(terminals)
        with open('/tmp/json_bigraph_1.json', 'w') as file:
            json.dump(data, file)

        rule_dog = dog_se()
        data2 = rule_dog.export_graph_json(terminals)
        with open('/tmp/json_nonterminal_graph_1.json', 'w') as file:
            json.dump(data2, file)

        terminals.print_index()
Example #3
    def test_json_corpus_grammar_export(self):
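        # End-to-end test: induce a grammar from a TIGER corpus sample, export
        # grammar and corpus to JSON, compute reducts with the Schick parser,
        # and check every derivation against the gold DSG.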
        start = 1
        stop = 50
        # path = "res/tiger/tiger_release_aug07.corrected.16012013.utf8.xml"
        path = "res/tiger/tiger_8000.xml"
        exclude = []
        dsgs = sentence_names_to_deep_syntax_graphs(
            ['s' + str(i) for i in range(start, stop + 1) if i not in exclude],
            path,
            hold=False)

        rec_part_strategy = the_recursive_partitioning_factory().get_partitioning('cfg')[0]

        def label_edge(edge):
            if isinstance(edge.label, ConstituentTerminal):
                return edge.label.pos()
            else:
                return edge.label

        nonterminal_labeling = lambda nodes, dsg: simple_labeling(nodes, dsg, label_edge)

        term_labeling_token = PosTerminals()

        def term_labeling(token):
            if isinstance(token, ConstituentTerminal):
                return term_labeling_token.token_label(token)
            else:
                return token

        grammar = induction_on_a_corpus(dsgs, rec_part_strategy, nonterminal_labeling, term_labeling)
        grammar.make_proper()

        terminals = Enumerator()

        data = export_dog_grammar_to_json(grammar, terminals)
        grammar_path = '/tmp/json_grammar.json'
        with open(grammar_path, 'w') as file:
            json.dump(data, file)

        corpus_path = '/tmp/json_corpus.json'
        with open(corpus_path, 'w') as file:
            json.dump(export_corpus_to_json(dsgs, terminals, terminal_labeling=term_labeling), file)

        with open('/tmp/enumerator.enum', 'w') as file:
            terminals.print_index(file)

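        # Invoke the external Java reduct tool (dog-reduct) on the exported
        # grammar and corpus.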
        reduct_dir = '/tmp/reduct_grammars'
        if os.path.isdir(reduct_dir):
            shutil.rmtree(reduct_dir)
        os.makedirs(reduct_dir)
        p = subprocess.Popen([' '.join(
            ["java", "-jar", os.path.join("util", SCHICK_PARSER_JAR), 'dog-reduct', '-g', grammar_path, '-t',
             corpus_path, "-o", reduct_dir])], shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)

        print("stdout", p.stdout.name)

        while True:
            nextline = p.stdout.readline()
            if nextline == b'' and p.poll() is not None:
                break
            print(nextline.decode('unicode_escape'), end='')
            # sys.stdout.write(nextline)
            # sys.stdout.flush()

        p.wait()
        p.stdout.close()
        self.assertEqual(0, p.returncode)

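        # Read one reduct RTG per sentence and convert them to hypergraphs.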
        rtgs = []
        for i in range(1, len(dsgs) + 1):
            rtgs.append(read_rtg(os.path.join(reduct_dir, str(i) + '.gra')))

        derivation_manager = PyDerivationManager(grammar)
        derivation_manager.convert_rtgs_to_hypergraphs(rtgs)
        derivation_manager.serialize(bytes('/tmp/reduct_manager.trace', encoding='utf8'))

        f = lambda token: token.pos() if isinstance(token, ConstituentTerminal) else token

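        # Every derivation must reproduce the gold DSG: identical yield and a
        # graph isomorphism that is consistent with the synchronization lists.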
        for i, (rtg, dsg) in enumerate(zip(rtgs, dsgs)):
            derivations = [LCFRSDerivationWrapper(der) for der in derivation_manager.enumerate_derivations(i, grammar)]
            self.assertGreaterEqual(len(derivations), 1)
            if len(derivations) > 1:
                print("Sentence", i)
                for der in derivations:
                    print(der)

            for der in derivations:
                dog, sync = dog_evaluation(der)
                dsg2 = DeepSyntaxGraph(der.compute_yield(), dog, sync)
                dsg.dog.project_labels(f)
                dsg.sentence = list(map(f, dsg.sentence))
                self.assertEqual(dsg.sentence, dsg2.sentence)
                morphs = dsg.dog.compute_isomorphism(dsg2.dog)
                self.assertFalse(morphs is None)
                self.assertListEqual([[morphs[0].get(node, node) for node in syncs]
                                      for syncs in dsg.synchronization], dsg2.synchronization)
def linearize(grammar,
              nonterminal_labeling,
              terminal_labeling,
              file,
              delimiter='::',
              nonterminal_encoder=None):
    """
    :type grammar: LCFRS
    :param grammar: grammar to linearize
    :param nonterminal_labeling: labeling strategy for nonterminals (echoed into the output header)
    :param terminal_labeling: labeling strategy for terminals (echoed into the output header)
    :param file: file handle to write to
    :type delimiter: str
    :param delimiter: string used to join a terminal symbol with an edge label symbol
    :type nonterminal_encoder: Enumerator
    :param nonterminal_encoder: mapping that assigns a unique non-negative integer to each nonterminal
    """
    print("Nonterminal Labeling: ", nonterminal_labeling, file=file)
    print("Terminal Labeling: ", terminal_labeling, file=file)
    print(file=file)

    terminals = Enumerator(first_index=1)
    if nonterminal_encoder is None:
        nonterminals = Enumerator()
    else:
        nonterminals = nonterminal_encoder
    num_inherited_args = {}
    num_synthesized_args = {}

    for rule in grammar.rules():
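        # Per rule: emit the RTG production, its weight, one sDCP equation per
        # attribute, and one LCFRS equation per LHS argument.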
        rid = 'r%i' % (rule.get_idx() + 1)
        print(rid,
              'RTG   ',
              nonterminals.object_index(rule.lhs().nont()),
              '->',
              file=file,
              end=" ")
        print(list(
            map(lambda nont: nonterminals.object_index(nont), rule.rhs())),
              ';',
              file=file)

        print(rid, 'WEIGHT', rule.weight(), ';', file=file)

        sync_index = {}
        inh_args = defaultdict(lambda: 0)
        lhs_var_counter = CountLHSVars()
        synthesized_attributes = 0

        dcp_ordered = sorted(rule.dcp(),
                             key=lambda x: (x.lhs().mem(), x.lhs().arg()))

        for dcp in dcp_ordered:
            if dcp.lhs().mem() != -1:
                inh_args[dcp.lhs().mem()] += 1
            else:
                synthesized_attributes += 1
            lhs_var_counter.evaluate_list(dcp.rhs())
        num_inherited_args[nonterminals.object_index(
            rule.lhs().nont())] = inh_args[-1] = lhs_var_counter.get_number()
        num_synthesized_args[nonterminals.object_index(
            rule.lhs().nont())] = synthesized_attributes

        for dcp in dcp_ordered:
            printer = DcpPrinter(terminals.object_index,
                                 rule,
                                 sync_index,
                                 inh_args,
                                 delimiter=delimiter)
            printer.evaluate_list(dcp.rhs())
            var = dcp.lhs()
            if var.mem() == -1:
                var_string = 's<0,%i>' % (var.arg() + 1 - inh_args[-1])
            else:
                var_string = 's<%i,%i>' % (var.mem() + 1, var.arg() + 1)
            print('%s sDCP   %s == %s ;' % (rid, var_string, printer.string),
                  file=file)

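        # LCFRS component: one equation per LHS argument; terminal occurrences
        # that are synchronized with the sDCP part carry a ^{i} annotation.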
        s = 0
        for j, arg in enumerate(rule.lhs().args()):
            print(rid, 'LCFRS  s<0,%i> == [' % (j + 1), end=' ', file=file)
            first = True
            for a in arg:
                if not first:
                    print(",", end=' ', file=file)
                if isinstance(a, LCFRS_var):
                    print("x<%i,%i>" % (a.mem + 1, a.arg + 1),
                          end=' ',
                          file=file)
                else:
                    if s in sync_index:
                        print(str(terminals.object_index(a)) +
                              '^{%i}' % sync_index[s],
                              end=' ',
                              file=file)
                    else:
                        print(str(terminals.object_index(a)),
                              end=' ',
                              file=file)
                    s += 1
                first = False
            print('] ;', file=file)
        print(file=file)

    print("Terminals: ", file=file)
    terminals.print_index(to_file=file)
    print(file=file)

    print("Nonterminal ID, nonterminal name, fanout, #inh, #synth: ",
          file=file)
    max_fanout, max_inh, max_syn, max_args, fanouts, inherits, synths, args \
        = print_index_and_stats(nonterminals, grammar, num_inherited_args, num_synthesized_args, file=file)
    print(file=file)
    print("max fanout:", max_fanout, file=file)
    print("max inh:", max_inh, file=file)
    print("max synth:", max_syn, file=file)
    print("max args:", max_args, file=file)
    print(file=file)
    for s, d, m in [('fanout', fanouts, max_fanout),
                    ('inh', inherits, max_inh), ('syn', synths, max_syn),
                    ('args', args, max_args)]:
        for i in range(m + 1):
            print('# the number of nonterminals with %s = %i is %i' %
                  (s, i, d[i]),
                  file=file)
        print(file=file)
    print(file=file)

    print("Initial nonterminal: ",
          nonterminals.object_index(grammar.start()),
          file=file)
    print(file=file)
    return nonterminals, terminals
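
A minimal usage sketch for linearize (hedged: grammar is assumed to be an induced LCFRS as in the examples above; the labeling arguments are only echoed into the output header, so descriptive strings suffice, and the output path is illustrative):

# usage sketch, not part of the original module
with open('/tmp/grammar.lin', 'w') as out:
    nonterminals, terminals = linearize(grammar, 'simple_labeling', 'pos', out)
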
def run_experiment(rec_part_strategy,
                   nonterminal_labeling,
                   exp,
                   reorder_children,
                   binarize=True):
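    # Induce a grammar on a TIGER training split and evaluate parsing on a
    # held-out split; the tail of this function (reducts, EM/SM training) is
    # disabled by early returns below.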
    start = 1
    stop = 7000

    test_start = 7001
    test_stop = 7200

    # path = "res/tiger/tiger_release_aug07.corrected.16012013.utf8.xml"
    corpus_path = "res/tiger/tiger_8000.xml"
    exclude = []
    train_dsgs = sentence_names_to_deep_syntax_graphs(
        ['s' + str(i) for i in range(start, stop + 1) if i not in exclude],
        corpus_path,
        hold=False,
        reorder_children=reorder_children)
    test_dsgs = sentence_names_to_deep_syntax_graphs(
        [
            's' + str(i)
            for i in range(test_start, test_stop + 1) if i not in exclude
        ],
        corpus_path,
        hold=False,
        reorder_children=reorder_children)

    # Grammar induction
    term_labeling_token = PosTerminals()

    def term_labeling(token):
        if isinstance(token, ConstituentTerminal):
            return term_labeling_token.token_label(token)
        else:
            return token

    if binarize:
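        # Mark nodes introduced by binarization with a '-BAR' suffix so they
        # can be detected and removed again after parsing.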

        def modify_token(token):
            if isinstance(token, ConstituentCategory):
                token_new = deepcopy(token)
                token_new.set_category(token.category() + '-BAR')
                return token_new
            elif isinstance(token, str):
                return token + '-BAR'
            else:
                assert False

        train_dsgs = [
            dsg.binarize(bin_modifier=modify_token) for dsg in train_dsgs
        ]

        def is_bin(token):
            if isinstance(token, ConstituentCategory):
                if token.category().endswith('-BAR'):
                    return True
            elif isinstance(token, str):
                if token.endswith('-BAR'):
                    return True
            return False

        def debinarize(dsg):
            return dsg.debinarize(is_bin=is_bin)

    else:
        # identity function; the built-in id() returns object ids, not dsgs
        debinarize = lambda dsg: dsg

    grammar = induction_on_a_corpus(train_dsgs, rec_part_strategy,
                                    nonterminal_labeling, term_labeling)
    grammar.make_proper()

    print("Nonterminals", len(grammar.nonts()), "Rules", len(grammar.rules()))

    parser = GFParser_k_best(grammar, k=500)
    return do_parsing(parser,
                      test_dsgs,
                      term_labeling_token,
                      oracle=True,
                      debinarize=debinarize)

    # NOTE: the return above makes everything below unreachable; the reduct
    # computation and EM/SM training code is kept for reference.
    # Compute reducts, i.e., intersect grammar with each training dsg
    basedir = path.join('/tmp/dog_experiments', 'exp' + str(exp))
    reduct_dir = path.join(basedir, 'reduct_grammars')

    terminal_map = Enumerator()
    if not os.path.isdir(basedir):
        os.makedirs(basedir)
    data = export_dog_grammar_to_json(grammar, terminal_map)
    grammar_path = path.join(basedir, 'grammar.json')
    with open(grammar_path, 'w') as file:
        json.dump(data, file)

    corpus_path = path.join(basedir, 'corpus.json')
    with open(corpus_path, 'w') as file:
        json.dump(
            export_corpus_to_json(train_dsgs,
                                  terminal_map,
                                  terminal_labeling=term_labeling), file)

    with open(path.join(basedir, 'enumerator.enum'), 'w') as file:
        terminal_map.print_index(file)

    if os.path.isdir(reduct_dir):
        shutil.rmtree(reduct_dir)
    os.makedirs(reduct_dir)
    p = subprocess.Popen([
        ' '.join([
            "java", "-jar",
            os.path.join("util", SCHICK_PARSER_JAR), 'dog-reduct', '-g',
            grammar_path, '-t', corpus_path, "-o", reduct_dir
        ])
    ],
                         shell=True,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.STDOUT)

    while True:
        nextline = p.stdout.readline()
        # p.stdout is a binary pipe: compare against b'' and decode for printing
        if nextline == b'' and p.poll() is not None:
            break
        sys.stdout.write(nextline.decode('unicode_escape'))
        sys.stdout.flush()

    p.wait()
    p.stdout.close()

    rtgs = []
    for i in range(1, len(train_dsgs) + 1):
        rtgs.append(read_rtg(path.join(reduct_dir, str(i) + '.gra')))

    derivation_manager = PyDerivationManager(grammar)
    derivation_manager.convert_rtgs_to_hypergraphs(rtgs)
    derivation_manager.serialize(
        bytes(path.join(basedir, 'reduct_manager.trace'), encoding='utf8'))

    # Training
    ## prepare EM training
    em_epochs = 20
    seed = 0
    smoothing_factor = 0.01
    split_randomization = 0.01
    sm_cycles = 2
    merge_percentage = 50.0
    grammarInfo = PyGrammarInfo(grammar,
                                derivation_manager.get_nonterminal_map())
    storageManager = PyStorageManager()

    em_builder = PySplitMergeTrainerBuilder(derivation_manager, grammarInfo)
    em_builder.set_em_epochs(em_epochs)
    em_builder.set_simple_expector(threads=THREADS)
    emTrainer = em_builder.build()

    # randomize initial weights and do em training
    la_no_splits = build_PyLatentAnnotation_initial(grammar, grammarInfo,
                                                    storageManager)
    la_no_splits.add_random_noise(seed=seed)
    emTrainer.em_train(la_no_splits)
    la_no_splits.project_weights(grammar, grammarInfo)

    do_parsing(CFGParser(grammar), test_dsgs, term_labeling_token)
    return
    # NOTE: unreachable below due to the return above; kept for reference.
    ## prepare SM training
    builder = PySplitMergeTrainerBuilder(derivation_manager, grammarInfo)
    builder.set_em_epochs(em_epochs)
    builder.set_split_randomization(1.0, seed + 1)
    builder.set_simple_expector(threads=THREADS)
    builder.set_smoothing_factor(smoothingFactor=smoothing_factor)
    builder.set_split_randomization(percent=split_randomization)
    # builder.set_scc_merger(-0.2)
    builder.set_percent_merger(merge_percentage)
    splitMergeTrainer = builder.build()

    # splitMergeTrainer.setMaxDrops(validationDropIterations, mode="smoothing")
    splitMergeTrainer.setEMepochs(em_epochs, mode="smoothing")

    # set initial latent annotation
    latentAnnotation = [la_no_splits]

    # carry out split/merge training and do parsing
    parsing_method = "filter-ctf"
    # parsing_method = "single-best-annotation"
    k_best = 50
    for i in range(1, sm_cycles + 1):
        splitMergeTrainer.reset_random_seed(seed + i + 1)
        latentAnnotation.append(
            splitMergeTrainer.split_merge_cycle(latentAnnotation[-1]))
        print("Cycle: ", i)
        if parsing_method == "single-best-annotation":
            smGrammar = latentAnnotation[i].build_sm_grammar(
                grammar, grammarInfo, rule_pruning=0.0001, rule_smoothing=0.1)
            print("Rules in smoothed grammar: ", len(smGrammar.rules()))
            parser = GFParser(smGrammar)
        elif parsing_method == "filter-ctf":
            latentAnnotation[-1].project_weights(grammar, grammarInfo)
            parser = Coarse_to_fine_parser(
                grammar,
                latentAnnotation[-1],
                grammarInfo,
                derivation_manager.get_nonterminal_map(),
                base_parser_type=GFParser_k_best,
                k=k_best)
        else:
            raise Exception('unknown parsing method: %s' % parsing_method)
        do_parsing(parser, test_dsgs, term_labeling_token)
        del parser