def test_induction_2(self):
    def rec_part(tree):
        return left_branching_partitioning(len(tree.id_yield()))

    features = defaultdict(lambda: 0)
    grammar = fringe_extract_lcfrs(self.tree3, rec_part(self.tree3), naming="child",
                                   feature_logging=features, isolate_pos=True)
    grammar.make_proper()

    if False:
        for idx in range(0, len(grammar.rules())):
            print(grammar.rule_index(idx))
            for key in features:
                if key[0] == idx:
                    print(key, features[key])
            print()

        for key in features:
            if type(key[0]) == int:
                continue
            print(key, features[key])

    nont_splits, root_weights, rule_weights, _ = build_nont_splits_dict(
        grammar, features, nonterminals=Enumerator(), feat_function=pos_cat_feats, debug=True)

    print(nont_splits)
    print(root_weights)
    print(rule_weights)
def test_la_viterbi_parsing_2(self):
    grammar = self.build_paper_grammar()
    inp = ["a"] * 3
    nontMap = Enumerator()
    gi = PyGrammarInfo(grammar, nontMap)
    sm = PyStorageManager()
    print(nontMap.object_index("S"))
    print(nontMap.object_index("B"))

    la = build_PyLatentAnnotation(
        [2, 1], [1.0],
        [[0.25, 1.0], [1.0, 0.0], [0.0, 0.5, 0.25, 0.0, 0.0, 0.0, 0.0, 0.0]],
        gi, sm)

    self.assertTrue(la.is_proper())

    parser = DiscodopKbestParser(grammar, la=la, nontMap=nontMap, grammarInfo=gi, latent_viterbi_mode=True)
    parser.set_input(inp)
    parser.parse()
    self.assertTrue(parser.recognized())
    der = parser.latent_viterbi_derivation(True)
    print(der)

    ranges = {der.spanned_ranges(idx)[0] for idx in der.ids()}
    self.assertSetEqual({(0, 3), (0, 2), (0, 1), (1, 2), (2, 3)}, ranges)
def test_json_grammar_export(self):
    dog = build_acyclic_dog()
    terminals = Enumerator()
    data = dog.export_graph_json(terminals)
    with open('/tmp/json_graph_1.json', 'w') as file:
        json.dump(data, file)

    dsg = build_dsg()
    data = dsg.export_bihypergraph_json(terminals)
    with open('/tmp/json_bigraph_1.json', 'w') as file:
        json.dump(data, file)

    rule_dog = dog_se()
    data2 = rule_dog.export_graph_json(terminals)
    with open('/tmp/json_nonterminal_graph_1.json', 'w') as file:
        json.dump(data2, file)

    terminals.print_index()

    dsg = build_dsg()
    rec_part_strategy = the_recursive_partitioning_factory().get_partitioning('right-branching')[0]
    rec_part = rec_part_strategy(dsg)
    dcmp = compute_decomposition(dsg, rec_part)
    grammar = induce_grammar_from(dsg, rec_part, dcmp, labeling=simple_labeling, terminal_labeling=str)
    print(grammar)

    data = export_dog_grammar_to_json(grammar, terminals)
    with open('/tmp/json_grammar.json', 'w') as file:
        json.dump(data, file)

    with open('/tmp/json_corpus.json', 'w') as file:
        json.dump(export_corpus_to_json([dsg], terminals), file)
def test_something(self):
    grammar, r1, r2 = self.build_grammar()
    nont_map = Enumerator()
    grammarInfo = PyGrammarInfo(grammar, nont_map)

    def w(x):
        return "S", x

    rtg = RTG(w(3))
    rtg.construct_and_add_rule(w(3), r1, [w(1), w(2)])
    rtg.construct_and_add_rule(w(3), r1, [w(2), w(1)])
    rtg.construct_and_add_rule(w(2), r1, [w(1), w(1)])
    rtg.construct_and_add_rule(w(1), r2, [])

    rtg2 = RTG(("A", 3))

    rtg3 = RTG(w(3))
    rtg3.construct_and_add_rule(w(3), r1, [w(1), w(2)])
    rtg3.construct_and_add_rule(w(3), r1, [w(2), w(1)])
    rtg3.construct_and_add_rule(w(2), r2, [w(1), w(1)])
    rtg3.construct_and_add_rule(w(1), r2, [])

    traces = PyDerivationManager(grammar, nont_map)
    traces.convert_rtgs_to_hypergraphs([rtg, rtg2, rtg3])

    self.assertTrue(traces.is_consistent_with_grammar(grammarInfo, traceId=0))
    self.assertFalse(traces.is_consistent_with_grammar(grammarInfo, traceId=1))
    self.assertFalse(traces.is_consistent_with_grammar(grammarInfo, traceId=2))
def test_la_viterbi_parsing_3(self):
    grammar = LCFRS("S")

    # rule 0
    lhs = LCFRS_lhs("B")
    lhs.add_arg(["a"])
    grammar.add_rule(lhs, [], 0.25)

    # rule 1
    lhs = LCFRS_lhs("A")
    lhs.add_arg(["a"])
    grammar.add_rule(lhs, [], 0.5)

    # rule 2
    lhs = LCFRS_lhs("S")
    lhs.add_arg([LCFRS_var(0, 0)])
    grammar.add_rule(lhs, ["B"], 1.0)

    # rule 3
    lhs = LCFRS_lhs("A")
    lhs.add_arg([LCFRS_var(0, 0), LCFRS_var(1, 0)])
    grammar.add_rule(lhs, ["A", "B"], 0.5)

    # rule 4
    lhs = LCFRS_lhs("B")
    lhs.add_arg([LCFRS_var(0, 0), LCFRS_var(1, 0)])
    grammar.add_rule(lhs, ["A", "B"], 0.75)

    grammar.make_proper()

    inp = ["a"] * 3
    nontMap = Enumerator()
    gi = PyGrammarInfo(grammar, nontMap)
    sm = PyStorageManager()
    print(nontMap.object_index("S"))
    print(nontMap.object_index("B"))

    la = build_PyLatentAnnotation_initial(grammar, gi, sm)

    parser = DiscodopKbestParser(grammar, la=la, nontMap=nontMap, grammarInfo=gi, latent_viterbi_mode=True)
    parser.set_input(inp)
    parser.parse()
    self.assertTrue(parser.recognized())
    der = parser.latent_viterbi_derivation(True)
    print(der)

    der2 = None
    for w, der_ in parser.k_best_derivation_trees():
        if der2 is None:
            der2 = der_
        print(w, der_)

    print(der2)
def __test_projection(self, split_weights, goal_weights, merge_method=False):
    grammar = LCFRS("S")

    # rule 0
    lhs = LCFRS_lhs("S")
    lhs.add_arg([LCFRS_var(0, 0), LCFRS_var(1, 0)])
    grammar.add_rule(lhs, ["A", "A"])

    # rule 1
    lhs = LCFRS_lhs("A")
    lhs.add_arg(["a"])
    grammar.add_rule(lhs, [])

    lhs = LCFRS_lhs("A")
    lhs.add_arg(["b"])
    grammar.add_rule(lhs, [], weight=2.0)

    grammar.make_proper()
    # print(grammar)

    nonterminal_map = Enumerator()
    grammarInfo = PyGrammarInfo(grammar, nonterminal_map)
    storageManager = PyStorageManager()

    la = build_PyLatentAnnotation([1, 2], [1.0], split_weights, grammarInfo, storageManager)

    # parser = LCFRS_parser(grammar)
    # parser.set_input(["a", "b"])
    # parser.parse()
    # der = parser.best_derivation_tree()

    # print(la.serialize())
    if merge_method:
        la.project_weights(grammar, grammarInfo)
    else:
        splits, _, _ = la.serialize()
        merge_sources = [[[split for split in range(0, splits[nont_idx])]]
                         for nont_idx in range(0, nonterminal_map.get_counter())]

        # print("Projecting to fine grammar LA", file=self.logger)
        coarse_la = la.project_annotation_by_merging(grammarInfo, merge_sources, debug=False)
        coarse_la.project_weights(grammar, grammarInfo)

    # print(grammar)
    for i in range(3):
        self.assertAlmostEqual(grammar.rule_index(i).weight(), goal_weights[i])
def test_projection_based_parser_k_best_hack(self):
    grammar = LCFRS("S")

    # rule 0
    lhs = LCFRS_lhs("B")
    lhs.add_arg(["a"])
    grammar.add_rule(lhs, [], 0.25)

    # rule 1
    lhs = LCFRS_lhs("A")
    lhs.add_arg(["a"])
    grammar.add_rule(lhs, [], 0.5)

    # rule 2
    lhs = LCFRS_lhs("S")
    lhs.add_arg([LCFRS_var(0, 0)])
    grammar.add_rule(lhs, ["B"], 1.0)

    # rule 3
    lhs = LCFRS_lhs("A")
    lhs.add_arg([LCFRS_var(0, 0), LCFRS_var(1, 0)])
    grammar.add_rule(lhs, ["A", "B"], 0.5)

    # rule 4
    lhs = LCFRS_lhs("B")
    lhs.add_arg([LCFRS_var(0, 0), LCFRS_var(1, 0)])
    grammar.add_rule(lhs, ["A", "B"], 0.75)

    grammar.make_proper()

    inp = ["a"] * 3
    nontMap = Enumerator()
    gi = PyGrammarInfo(grammar, nontMap)
    sm = PyStorageManager()

    la = build_PyLatentAnnotation_initial(grammar, gi, sm)

    parser = Coarse_to_fine_parser(grammar, la, gi, nontMap, base_parser_type=GFParser_k_best)
    parser.set_input(inp)
    parser.parse()
    self.assertTrue(parser.recognized())

    der = parser.max_rule_product_derivation()
    print(der)

    der = parser.best_derivation_tree()
    print(der)

    for node in der.ids():
        print(der.getRule(node), der.spanned_ranges(node))
def test_la_viterbi_parsing(self):
    grammar = self.build_grammar()
    inp = ["a"] * 3
    nontMap = Enumerator()
    gi = PyGrammarInfo(grammar, nontMap)
    sm = PyStorageManager()

    la = build_PyLatentAnnotation_initial(grammar, gi, sm)

    parser = DiscodopKbestParser(grammar, la=la, nontMap=nontMap, grammarInfo=gi, latent_viterbi_mode=True)
    parser.set_input(inp)
    parser.parse()
    self.assertTrue(parser.recognized())
    der = parser.best_derivation_tree()
    print(der)

    for node in der.ids():
        print(node, der.getRule(node), der.spanned_ranges(node))
def test_json_export(self):
    dog = build_acyclic_dog()
    terminals = Enumerator()
    data = dog.export_graph_json(terminals)
    with open('/tmp/json_graph_1.json', 'w') as file:
        json.dump(data, file)

    dsg = build_dsg()
    data = dsg.export_bihypergraph_json(terminals)
    with open('/tmp/json_bigraph_1.json', 'w') as file:
        json.dump(data, file)

    rule_dog = dog_se()
    data2 = rule_dog.export_graph_json(terminals)
    with open('/tmp/json_nonterminal_graph_1.json', 'w') as file:
        json.dump(data2, file)

    terminals.print_index()
def test_projection_based_parser_k_best_hack(self):
    grammar = self.build_grammar()
    inp = ["a"] * 3
    nontMap = Enumerator()
    gi = PyGrammarInfo(grammar, nontMap)
    sm = PyStorageManager()

    la = build_PyLatentAnnotation_initial(grammar, gi, sm)

    parser = Coarse_to_fine_parser(grammar, la, gi, nontMap, base_parser_type=GFParser_k_best)
    parser.set_input(inp)
    parser.parse()
    self.assertTrue(parser.recognized())

    der = parser.max_rule_product_derivation()
    print(der)

    der = parser.best_derivation_tree()
    print(der)

    for node in der.ids():
        print(der.getRule(node), der.spanned_ranges(node))
def test_induction(self):
    naming = 'child'

    def rec_part(tree):
        return left_branching_partitioning(len(tree.id_yield()))
        # return fanout_k_left_to_right(tree, 1)

    tree = self.tree
    tree.add_to_root("VP1")

    feature_log1 = defaultdict(lambda: 0)
    grammar = fringe_extract_lcfrs(tree, rec_part(tree), feature_logging=feature_log1, naming=naming)
    for key in feature_log1:
        print(key, feature_log1[key])
    print(grammar)

    feats = defaultdict(lambda: 0)
    grammar_ = fringe_extract_lcfrs(tree, rec_part(tree), isolate_pos=True, feature_logging=feats, naming=naming)
    print(grammar_)
    for key in feats:
        print(key, feats[key])

    print("Adding 2nd grammar to first")
    grammar.add_gram(grammar_, feature_logging=(feature_log1, feats))
    for idx in range(0, len(grammar.rules())):
        print(idx, grammar.rule_index(idx))

    print("Adding 3rd grammar to first")
    feats3 = defaultdict(lambda: 0)
    grammar3 = fringe_extract_lcfrs(self.tree2, rec_part(self.tree2), isolate_pos=True,
                                    feature_logging=feats3, naming=naming)
    grammar.add_gram(grammar3, feature_logging=(feature_log1, feats3))

    print()
    for idx in range(0, len(grammar.rules())):
        print(idx, grammar.rule_index(idx))

    print()
    print("New feature log")
    print()
    for key in feature_log1:
        print(key, feature_log1[key])

    grammar.make_proper()
    build_nont_splits_dict(grammar, feature_log1, nonterminals=Enumerator())

    print(grammar.rule_index(0))
    print(grammar.rule_index(2))
def test_json_corpus_grammar_export(self):
    start = 1
    stop = 50
    # path = "res/tiger/tiger_release_aug07.corrected.16012013.utf8.xml"
    path = "res/tiger/tiger_8000.xml"
    exclude = []
    dsgs = sentence_names_to_deep_syntax_graphs(
        ['s' + str(i) for i in range(start, stop + 1) if i not in exclude],
        path,
        hold=False)

    rec_part_strategy = the_recursive_partitioning_factory().get_partitioning('cfg')[0]

    def label_edge(edge):
        if isinstance(edge.label, ConstituentTerminal):
            return edge.label.pos()
        else:
            return edge.label

    nonterminal_labeling = lambda nodes, dsg: simple_labeling(nodes, dsg, label_edge)

    term_labeling_token = PosTerminals()

    def term_labeling(token):
        if isinstance(token, ConstituentTerminal):
            return term_labeling_token.token_label(token)
        else:
            return token

    grammar = induction_on_a_corpus(dsgs, rec_part_strategy, nonterminal_labeling, term_labeling)
    grammar.make_proper()

    terminals = Enumerator()
    data = export_dog_grammar_to_json(grammar, terminals)
    grammar_path = '/tmp/json_grammar.json'
    with open(grammar_path, 'w') as file:
        json.dump(data, file)

    corpus_path = '/tmp/json_corpus.json'
    with open(corpus_path, 'w') as file:
        json.dump(export_corpus_to_json(dsgs, terminals, terminal_labeling=term_labeling), file)

    with open('/tmp/enumerator.enum', 'w') as file:
        terminals.print_index(file)

    reduct_dir = '/tmp/reduct_grammars'
    if os.path.isdir(reduct_dir):
        shutil.rmtree(reduct_dir)
    os.makedirs(reduct_dir)

    p = subprocess.Popen([' '.join(
        ["java", "-jar", os.path.join("util", SCHICK_PARSER_JAR), 'dog-reduct',
         '-g', grammar_path, '-t', corpus_path, "-o", reduct_dir])],
        shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    print("stdout", p.stdout.name)

    while True:
        nextline = p.stdout.readline()
        if nextline == b'' and p.poll() is not None:
            break
        print(nextline.decode('unicode_escape'), end='')
        # sys.stdout.write(nextline)
        # sys.stdout.flush()

    p.wait()
    p.stdout.close()
    self.assertEqual(0, p.returncode)

    rtgs = []
    for i in range(1, len(dsgs) + 1):
        rtgs.append(read_rtg('/tmp/reduct_grammars/' + str(i) + '.gra'))

    derivation_manager = PyDerivationManager(grammar)
    derivation_manager.convert_rtgs_to_hypergraphs(rtgs)
    derivation_manager.serialize(bytes('/tmp/reduct_manager.trace', encoding='utf8'))

    f = lambda token: token.pos() if isinstance(token, ConstituentTerminal) else token

    for i, (rtg, dsg) in enumerate(zip(rtgs, dsgs)):
        derivations = [LCFRSDerivationWrapper(der)
                       for der in derivation_manager.enumerate_derivations(i, grammar)]
        self.assertGreaterEqual(len(derivations), 1)
        if len(derivations) > 1:
            print("Sentence", i)
            for der in derivations:
                print(der)

        for der in derivations:
            dog, sync = dog_evaluation(der)
            dsg2 = DeepSyntaxGraph(der.compute_yield(), dog, sync)
            dsg.dog.project_labels(f)
            dsg.sentence = list(map(f, dsg.sentence))
            self.assertEqual(dsg.sentence, dsg2.sentence)
            morphs = dsg.dog.compute_isomorphism(dsg2.dog)
            self.assertFalse(morphs is None)
            self.assertListEqual([[morphs[0].get(node, node) for node in syncs]
                                  for syncs in dsg.synchronization],
                                 dsg2.synchronization)
def serialize_acyclic_dogs_to_negra(dsg, sec_edge_to_terminal=False):
    """
    Converts a single acyclic deep syntax graph to the negra export format.
    :type dsg: DeepSyntaxGraph
    :type sec_edge_to_terminal: bool
    :param sec_edge_to_terminal: if True, secondary edges whose target is a terminal are exported as well
    """
    assert not dsg.dog.cyclic()
    assert len(dsg.sentence) < 500
    enum = Enumerator(first_index=500)
    # NB: contrary to the export standard, we index words starting from 1 (and not starting from 0)
    # NB: because 0 also refers to the virtual root (important for sec_edge_to_terminal == True)
    # NB: see http://www.coli.uni-saarland.de/~thorsten/publications/Brants-CLAUS98.pdf
    # NB: only relevant for TiGer s22084, probably annotation error
    synced_idxs = {idx: i + 1 for i, l in enumerate(dsg.synchronization) for idx in l}

    def idNum(tree_idx):
        if tree_idx in synced_idxs:
            return str(synced_idxs[tree_idx])
        else:
            return str(enum.object_index(tree_idx))

    # NB: here we enforce the indices to be topologically ordered as required by the export standard
    for idx in dsg.dog.topological_order():
        if idx not in synced_idxs:
            idNum(idx)

    lines = []
    for idx, token in enumerate(dsg.sentence):
        assert isinstance(token, ConstituentTerminal)
        # if not isinstance(token.form(), str):
        #     print(token.form(), type(token.form()))
        # assert isinstance(token.form(), str)
        morph_order = ['person', 'case', 'number', 'tense', 'mood', 'gender', 'degree']
        morph = sorted(token.morph_feats(), key=lambda x: morph_order.index(x[0]))
        morph = '.'.join([str(x[1]) for x in morph if str(x[1]) != '--'])
        if morph == '':
            morph = u'--'
        line = [token.form(), token.pos(), morph]

        tree_idx = dsg.get_graph_position(idx)
        assert len(tree_idx) == 1
        tree_idx = tree_idx[0]

        parents = []
        if tree_idx in dsg.dog.outputs:
            parents.append(u'--')
            parents.append(u'0')
        for parent_idx in dsg.dog.parents:
            if not sec_edge_to_terminal and parent_idx in synced_idxs:
                continue
            edge = dsg.dog.incoming_edge(parent_idx)
            for j, child_idx in enumerate(edge.inputs):
                if child_idx == tree_idx:
                    if j in edge.primary_inputs:
                        parents = [edge.get_function(j), idNum(parent_idx)] + parents
                    else:
                        parents.append(edge.get_function(j))
                        parents.append(idNum(parent_idx))
        line += parents
        lines.append(u'\t'.join(line) + u'\n')

    category_lines = []
    for tree_idx in dsg.dog.nodes:
        token = dsg.dog.incoming_edge(tree_idx).label
        if isinstance(token, ConstituentTerminal):
            continue
        morph = u'--'
        line = ['#' + str(idNum(tree_idx)), token, morph]

        parents = []
        if tree_idx in dsg.dog.outputs:
            parents.append(u'--')
            parents.append(u'0')
        for parent_idx in dsg.dog.parents:
            if not sec_edge_to_terminal and parent_idx in synced_idxs:
                continue
            edge = dsg.dog.incoming_edge(parent_idx)
            for j, child_idx in enumerate(edge.inputs):
                if child_idx == tree_idx:
                    if j in edge.primary_inputs:
                        parents = [edge.get_function(j), idNum(parent_idx)] + parents
                    else:
                        parents.append(edge.get_function(j))
                        parents.append(idNum(parent_idx))
        line += parents
        category_lines.append(line)

    category_lines = sorted(category_lines, key=lambda x: x[0])
    for line in category_lines:
        lines.append(u'\t'.join(line) + u'\n')

    return lines
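# Usage sketch (illustrative, not part of the original module): writing a whole corpus of
# DeepSyntaxGraph objects to one file in negra export format by calling
# serialize_acyclic_dogs_to_negra per sentence. The helper name `write_negra_corpus` and the
# plain '#BOS i' / '#EOS i' sentence delimiters are assumptions made for this sketch.
def write_negra_corpus(dsgs, file_path, sec_edge_to_terminal=False):
    with open(file_path, 'w') as file:
        for sent_id, dsg in enumerate(dsgs, start=1):
            file.write(u'#BOS %d\n' % sent_id)
            for line in serialize_acyclic_dogs_to_negra(dsg, sec_edge_to_terminal=sec_edge_to_terminal):
                file.write(line)
            file.write(u'#EOS %d\n' % sent_id)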
def linearize(grammar, nonterminal_labeling, terminal_labeling, file, delimiter='::', nonterminal_encoder=None):
    """
    :type grammar: LCFRS
    :param nonterminal_labeling: nonterminal labeling strategy (only printed as metadata in the header)
    :param terminal_labeling: terminal labeling strategy (only printed as metadata in the header)
    :param file: file handle to write to
    :type delimiter: str
    :param delimiter: string used to join terminal symbol with edge label symbol
    :type nonterminal_encoder: Enumerator
    :param nonterminal_encoder: mapping that assigns a unique non-negative integer to each nonterminal
    """
    print("Nonterminal Labeling: ", nonterminal_labeling, file=file)
    print("Terminal Labeling: ", terminal_labeling, file=file)
    print(file=file)

    terminals = Enumerator(first_index=1)
    if nonterminal_encoder is None:
        nonterminals = Enumerator()
    else:
        nonterminals = nonterminal_encoder
    num_inherited_args = {}
    num_synthesized_args = {}

    for rule in grammar.rules():
        rid = 'r%i' % (rule.get_idx() + 1)
        print(rid, 'RTG ', nonterminals.object_index(rule.lhs().nont()), '->', file=file, end=" ")
        print(list(map(lambda nont: nonterminals.object_index(nont), rule.rhs())), ';', file=file)
        print(rid, 'WEIGHT', rule.weight(), ';', file=file)

        sync_index = {}
        inh_args = defaultdict(lambda: 0)
        lhs_var_counter = CountLHSVars()
        synthesized_attributes = 0

        dcp_ordered = sorted(rule.dcp(), key=lambda x: (x.lhs().mem(), x.lhs().arg()))

        for dcp in dcp_ordered:
            if dcp.lhs().mem() != -1:
                inh_args[dcp.lhs().mem()] += 1
            else:
                synthesized_attributes += 1
            lhs_var_counter.evaluate_list(dcp.rhs())
        num_inherited_args[nonterminals.object_index(rule.lhs().nont())] \
            = inh_args[-1] = lhs_var_counter.get_number()
        num_synthesized_args[nonterminals.object_index(rule.lhs().nont())] = synthesized_attributes

        for dcp in dcp_ordered:
            printer = DcpPrinter(terminals.object_index, rule, sync_index, inh_args, delimiter=delimiter)
            printer.evaluate_list(dcp.rhs())
            var = dcp.lhs()
            if var.mem() == -1:
                var_string = 's<0,%i>' % (var.arg() + 1 - inh_args[-1])
            else:
                var_string = 's<%i,%i>' % (var.mem() + 1, var.arg() + 1)
            print('%s sDCP %s == %s ;' % (rid, var_string, printer.string), file=file)

        s = 0
        for j, arg in enumerate(rule.lhs().args()):
            print(rid, 'LCFRS s<0,%i> == [' % (j + 1), end=' ', file=file)
            first = True
            for a in arg:
                if not first:
                    print(",", end=' ', file=file)
                if isinstance(a, LCFRS_var):
                    print("x<%i,%i>" % (a.mem + 1, a.arg + 1), end=' ', file=file)
                else:
                    if s in sync_index:
                        print(str(terminals.object_index(a)) + '^{%i}' % sync_index[s], end=' ', file=file)
                    else:
                        print(str(terminals.object_index(a)), end=' ', file=file)
                    s += 1
                first = False
            print('] ;', file=file)
        print(file=file)

    print("Terminals: ", file=file)
    terminals.print_index(to_file=file)
    print(file=file)

    print("Nonterminal ID, nonterminal name, fanout, #inh, #synth: ", file=file)
    max_fanout, max_inh, max_syn, max_args, fanouts, inherits, synths, args \
        = print_index_and_stats(nonterminals, grammar, num_inherited_args, num_synthesized_args, file=file)
    print(file=file)
    print("max fanout:", max_fanout, file=file)
    print("max inh:", max_inh, file=file)
    print("max synth:", max_syn, file=file)
    print("max args:", max_args, file=file)
    print(file=file)
    for s, d, m in [('fanout', fanouts, max_fanout), ('inh', inherits, max_inh),
                    ('syn', synths, max_syn), ('args', args, max_args)]:
        for i in range(m + 1):
            print('# the number of nonterminals with %s = %i is %i' % (s, i, d[i]), file=file)
        print(file=file)

    print(file=file)
    print("Initial nonterminal: ", nonterminals.object_index(grammar.start()), file=file)
    print(file=file)
    return nonterminals, terminals
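# Usage sketch (illustrative): linearize expects an LCFRS whose rules carry sDCP annotations
# (it iterates rule.dcp()), two labeling descriptions that are only printed as metadata, and a
# writable file handle. The helper name, the label strings, and the output path below are
# assumptions; `grammar` is assumed to be a hybrid grammar produced by one of the induction
# routines above.
def demo_linearize(grammar, out_path='/tmp/grammar_linearization.txt'):
    with open(out_path, 'w') as file:
        return linearize(grammar, "simple_labeling", "PosTerminals", file,
                         delimiter='::', nonterminal_encoder=Enumerator())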
def run_experiment(rec_part_strategy, nonterminal_labeling, exp, reorder_children, binarize=True):
    start = 1
    stop = 7000
    test_start = 7001
    test_stop = 7200
    # corpus_path = "res/tiger/tiger_release_aug07.corrected.16012013.utf8.xml"
    corpus_path = "res/tiger/tiger_8000.xml"
    exclude = []
    train_dsgs = sentence_names_to_deep_syntax_graphs(
        ['s' + str(i) for i in range(start, stop + 1) if i not in exclude],
        corpus_path,
        hold=False,
        reorder_children=reorder_children)
    test_dsgs = sentence_names_to_deep_syntax_graphs(
        ['s' + str(i) for i in range(test_start, test_stop + 1) if i not in exclude],
        corpus_path,
        hold=False,
        reorder_children=reorder_children)

    # Grammar induction
    term_labeling_token = PosTerminals()

    def term_labeling(token):
        if isinstance(token, ConstituentTerminal):
            return term_labeling_token.token_label(token)
        else:
            return token

    if binarize:
        def modify_token(token):
            if isinstance(token, ConstituentCategory):
                token_new = deepcopy(token)
                token_new.set_category(token.category() + '-BAR')
                return token_new
            elif isinstance(token, str):
                return token + '-BAR'
            else:
                assert False

        train_dsgs = [dsg.binarize(bin_modifier=modify_token) for dsg in train_dsgs]

        def is_bin(token):
            if isinstance(token, ConstituentCategory):
                if token.category().endswith('-BAR'):
                    return True
            elif isinstance(token, str):
                if token.endswith('-BAR'):
                    return True
            return False

        def debinarize(dsg):
            return dsg.debinarize(is_bin=is_bin)
    else:
        debinarize = id

    grammar = induction_on_a_corpus(train_dsgs, rec_part_strategy, nonterminal_labeling, term_labeling)
    grammar.make_proper()
    print("Nonterminals", len(grammar.nonts()), "Rules", len(grammar.rules()))

    parser = GFParser_k_best(grammar, k=500)
    return do_parsing(parser, test_dsgs, term_labeling_token, oracle=True, debinarize=debinarize)

    # NB: the code below is unreachable because of the early return above.
    # Compute reducts, i.e., intersect grammar with each training dsg
    basedir = path.join('/tmp/dog_experiments', 'exp' + str(exp))
    reduct_dir = path.join(basedir, 'reduct_grammars')

    terminal_map = Enumerator()
    if not os.path.isdir(basedir):
        os.makedirs(basedir)
    data = export_dog_grammar_to_json(grammar, terminal_map)
    grammar_path = path.join(basedir, 'grammar.json')
    with open(grammar_path, 'w') as file:
        json.dump(data, file)
    corpus_path = path.join(basedir, 'corpus.json')
    with open(corpus_path, 'w') as file:
        json.dump(export_corpus_to_json(train_dsgs, terminal_map, terminal_labeling=term_labeling), file)
    with open(path.join(basedir, 'enumerator.enum'), 'w') as file:
        terminal_map.print_index(file)

    if os.path.isdir(reduct_dir):
        shutil.rmtree(reduct_dir)
    os.makedirs(reduct_dir)
    p = subprocess.Popen([' '.join(
        ["java", "-jar", os.path.join("util", SCHICK_PARSER_JAR), 'dog-reduct',
         '-g', grammar_path, '-t', corpus_path, "-o", reduct_dir])],
        shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)

    while True:
        nextline = p.stdout.readline()
        if nextline == b'' and p.poll() is not None:
            break
        sys.stdout.write(nextline.decode('unicode_escape'))
        sys.stdout.flush()

    p.wait()
    p.stdout.close()

    rtgs = []
    for i in range(1, len(train_dsgs) + 1):
        rtgs.append(read_rtg(path.join(reduct_dir, str(i) + '.gra')))

    derivation_manager = PyDerivationManager(grammar)
    derivation_manager.convert_rtgs_to_hypergraphs(rtgs)
    derivation_manager.serialize(path.join(basedir, 'reduct_manager.trace'))

    # Training
    # prepare EM training
    em_epochs = 20
    seed = 0
    smoothing_factor = 0.01
    split_randomization = 0.01
    sm_cycles = 2
    merge_percentage = 50.0
    grammarInfo = PyGrammarInfo(grammar, derivation_manager.get_nonterminal_map())
    storageManager = PyStorageManager()

    em_builder = PySplitMergeTrainerBuilder(derivation_manager, grammarInfo)
    em_builder.set_em_epochs(em_epochs)
    em_builder.set_simple_expector(threads=THREADS)
    emTrainer = em_builder.build()

    # randomize initial weights and do em training
    la_no_splits = build_PyLatentAnnotation_initial(grammar, grammarInfo, storageManager)
    la_no_splits.add_random_noise(seed=seed)
    emTrainer.em_train(la_no_splits)
    la_no_splits.project_weights(grammar, grammarInfo)

    do_parsing(CFGParser(grammar), test_dsgs, term_labeling_token)
    return

    # prepare SM training
    builder = PySplitMergeTrainerBuilder(derivation_manager, grammarInfo)
    builder.set_em_epochs(em_epochs)
    builder.set_split_randomization(1.0, seed + 1)
    builder.set_simple_expector(threads=THREADS)
    builder.set_smoothing_factor(smoothingFactor=smoothing_factor)
    builder.set_split_randomization(percent=split_randomization)
    # builder.set_scc_merger(-0.2)
    builder.set_percent_merger(merge_percentage)
    splitMergeTrainer = builder.build()

    # splitMergeTrainer.setMaxDrops(validationDropIterations, mode="smoothing")
    splitMergeTrainer.setEMepochs(em_epochs, mode="smoothing")

    # set initial latent annotation
    latentAnnotation = [la_no_splits]

    # carry out split/merge training and do parsing
    parsing_method = "filter-ctf"
    # parsing_method = "single-best-annotation"
    k_best = 50
    for i in range(1, sm_cycles + 1):
        splitMergeTrainer.reset_random_seed(seed + i + 1)
        latentAnnotation.append(splitMergeTrainer.split_merge_cycle(latentAnnotation[-1]))
        print("Cycle: ", i)
        if parsing_method == "single-best-annotation":
            smGrammar = latentAnnotation[i].build_sm_grammar(grammar, grammarInfo,
                                                             rule_pruning=0.0001,
                                                             rule_smoothing=0.1)
            print("Rules in smoothed grammar: ", len(smGrammar.rules()))
            parser = GFParser(smGrammar)
        elif parsing_method == "filter-ctf":
            latentAnnotation[-1].project_weights(grammar, grammarInfo)
            parser = Coarse_to_fine_parser(grammar,
                                           latentAnnotation[-1],
                                           grammarInfo,
                                           derivation_manager.get_nonterminal_map(),
                                           base_parser_type=GFParser_k_best,
                                           k=k_best)
        else:
            raise Exception()
        do_parsing(parser, test_dsgs, term_labeling_token)
        del parser
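# Invocation sketch (illustrative): run_experiment expects a recursive-partitioning strategy and
# a nonterminal labeling function; the setup below reuses the same factory calls and labeling as
# test_json_corpus_grammar_export above. The helper name, the experiment number, and
# reorder_children=False are assumptions made for this sketch.
def demo_run_experiment():
    rec_part_strategy = the_recursive_partitioning_factory().get_partitioning('cfg')[0]

    def label_edge(edge):
        if isinstance(edge.label, ConstituentTerminal):
            return edge.label.pos()
        return edge.label

    nonterminal_labeling = lambda nodes, dsg: simple_labeling(nodes, dsg, label_edge)
    return run_experiment(rec_part_strategy, nonterminal_labeling, exp=1,
                          reorder_children=False, binarize=True)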