def __projection_based_derivation_tree(self, la, variational=False, op=prod):
    """Pick a derivation by Viterbi search over projected edge weights.

    The k-best derivations of ``self.base_parser`` are converted into a
    hypergraph, the edge weights of ``la`` are projected onto it and the
    Viterbi derivation w.r.t. these weights is returned.  If no Viterbi
    derivation is found, diagnostics are printed and the first entry of
    ``self.k_best_derivation_trees()`` is returned instead.

    Note that this method overwrites ``self.debug`` (with ``False``) and
    ``self.log_mode`` (with ``True``) on every call.

    :param la: latent annotation handed to ``py_edge_weight_projection``
    :param variational: forwarded to ``py_edge_weight_projection``
    :param op: forwarded to ``manager.viterbi_derivation``
    :return: the selected derivation
    """
    manager = PyDerivationManager(self.grammar, self.nontMap)
    k_best = [tree for _, tree in self.base_parser.k_best_derivation_trees()]
    manager.convert_derivations_to_hypergraph(k_best)
    manager.set_io_cycle_limit(200)
    manager.set_io_precision(0.000001)
    self.debug = False
    self.log_mode = True

    def project(debug):
        # single place for the (repeated) projection call
        return py_edge_weight_projection(la, manager,
                                         variational=variational,
                                         debug=debug,
                                         log_mode=self.log_mode)

    edge_weights = project(self.debug)
    der = manager.viterbi_derivation(0, edge_weights, self.grammar, op=op,
                                     log_mode=self.log_mode)
    if der is not None:
        return der

    # Diagnostics are emitted unconditionally on failure (the guard in front
    # of them has always been true).
    nans = sum(1 for weight in edge_weights if math.isnan(weight))
    infs = sum(1 for weight in edge_weights if math.isinf(weight))
    zeros = sum(1 for weight in edge_weights if weight == 0.0)
    print("[", len(edge_weights), nans, infs, zeros, "]")
    if len(edge_weights) < 200:
        print("orig:", edge_weights)
        # re-run the projection twice in debug mode and show the results
        edge_weights = project(True)
        print("1:", edge_weights)
        edge_weights = project(True)
        print("2:", edge_weights)

    print("p", end="")
    _, der = next(self.k_best_derivation_trees())
    return der
def build_score_validator(self, resource):
    """Create ``self.organizer.validator`` and feed it scored k-best parses.

    For every object of the corpus read from ``resource`` the k-best
    derivations are computed (optionally guarded by the parsing timeout),
    each derivation is post-processed and scored against the gold object,
    and the scores are registered with the validator together with a
    derivation manager holding the derivations.  Progress is written to
    ``self.logger``.

    :param resource: corpus resource passed to ``self.read_corpus``
    """
    validator = PyCandidateScoreValidator(self.organizer.grammarInfo,
                                          self.organizer.storageManager,
                                          self.score_name)
    self.organizer.validator = validator

    corpus_validation = self.read_corpus(resource)
    timeout = False
    if self.parsing_timeout:
        timeout_manager = multiprocessing.Manager()
        return_dict = timeout_manager.dict()

    for obj_count, gold in enumerate(corpus_validation, 1):
        self.parser.set_input(self.parsing_preprocess(gold))
        if self.parsing_timeout:
            timeout, weighted = \
                self._compute_derivations_with_timeout(return_dict)
        else:
            self.parser.parse()
            weighted = self.parser.k_best_derivation_trees()
        derivations = [entry[1] for entry in weighted]

        manager = PyDerivationManager(self.base_grammar,
                                      self.organizer.nonterminal_map)
        manager.convert_derivations_to_hypergraphs(derivations)

        scores = [
            self.score_object(
                self.parsing_postprocess(self.obtain_sentence(gold), der),
                gold)
            for der in derivations
        ]
        validator.add_scored_candidates(manager, scores, self.max_score)

        # progress marker: 't' = timeout, '.' = scored, '-' = no derivation
        if timeout:
            token = 't'
        elif scores:
            token = '.'
        else:
            token = '-'
        print(token, end='', file=self.logger)
        if scores:
            print(obj_count, 'max', max(scores), 'firsts', scores[0:10],
                  file=self.logger)
        else:
            print(obj_count, 'max 00.00', '[]', file=self.logger)
        self.parser.clear()
def __projection_based_derivation_tree(self, la, variational=False, op=prod):
    """Pick a derivation by Viterbi search over projected edge weights.

    The parse chart is converted into a hypergraph.  For every latent
    annotation in ``la`` the edge weights are projected; the weight vectors
    of several annotations are combined edge-wise (by addition in log mode,
    by ``op`` otherwise).  The Viterbi derivation w.r.t. the combined weights
    is returned.  If there is none, ``self.latent_viterbi_derivation`` and
    finally the first k-best derivation serve as fallbacks.

    :param la: a latent annotation or a list of latent annotations
    :param variational: forwarded to ``py_edge_weight_projection``
    :param op: combination operator outside log mode; also forwarded to
        ``manager.viterbi_derivation``
    :return: the selected derivation, or ``None`` if ``self.nontMap`` is
        not set
    """
    # Function-scope import: the NaN/inf diagnostics below need ``math``.
    import math

    if self.nontMap is None:
        print("A nonterminal map is required for weight projection based parsing!")
        return None

    manager = PyDerivationManager(self.grammar, self.nontMap)
    manager.convert_chart_to_hypergraph(self.chart, self.disco_grammar,
                                        debug=False)
    if self.grammarInfo is not None:
        assert manager.is_consistent_with_grammar(self.grammarInfo)
    manager.set_io_cycle_limit(200)
    manager.set_io_precision(0.000001)

    annotations = la if isinstance(la, list) else [la]

    edge_weights = None
    for annotation in annotations:
        projected = py_edge_weight_projection(annotation, manager,
                                              variational=variational,
                                              debug=self.debug,
                                              log_mode=self.log_mode)
        if edge_weights is None:
            edge_weights = projected
        elif self.log_mode:
            # log-weights are combined by addition
            edge_weights = [w1 + w2
                            for w1, w2 in zip(edge_weights, projected)]
        else:
            edge_weights = [op(w1, w2)
                            for w1, w2 in zip(edge_weights, projected)]

    if self.debug:
        # ``weight == float("nan")`` is never true (NaN compares unequal to
        # everything, itself included), so NaNs are detected via math.isnan.
        nans = sum(1 for weight in edge_weights if math.isnan(weight))
        infs = sum(1 for weight in edge_weights if math.isinf(weight))
        zeros = sum(1 for weight in edge_weights if weight == 0.0)
        print("[", len(edge_weights), nans, infs, zeros, "]")
        if len(edge_weights) < 100:
            print(edge_weights)

    der = manager.viterbi_derivation(0, edge_weights, self.grammar, op=op,
                                     log_mode=self.log_mode)
    if der is None:
        print("p", end="")
        der = self.latent_viterbi_derivation(debug=self.debug)
    if der is not None:
        der = LCFRSDerivationWrapper(der)
    if der is None:
        # last resort: best derivation of the k-best list
        _, der = next(self.k_best_derivation_trees())
    return der
def latent_viterbi_derivation(self, debug=False):
    """Compute the latent Viterbi derivation for the current chart.

    The chart is converted into a hypergraph and the derivation manager is
    asked for the latent Viterbi derivation w.r.t. ``self.la`` (its first
    element if ``self.la`` is a list).

    :param debug: if true, the hypergraph is serialized to
        ``/tmp/my_debug_hypergraph.hg`` and the flag is forwarded to the
        manager
    :return: the derivation wrapped in ``LCFRSDerivationWrapper``, or
        ``None`` if the manager returns no derivation
    """
    manager = PyDerivationManager(self.grammar, self.nontMap)
    manager.convert_chart_to_hypergraph(self.chart, self.disco_grammar,
                                        debug=False)
    if debug:
        manager.serialize(b'/tmp/my_debug_hypergraph.hg')

    la = self.la[0] if isinstance(self.la, list) else self.la
    vit_der = manager.latent_viterbi_derivation(0, la, self.grammar,
                                                debug=debug)
    if vit_der is None:
        return None
    return LCFRSDerivationWrapper(vit_der)
def test_something(self):
    """Check grammar consistency of hypergraphs converted from RTGs.

    Three RTGs are converted: the first is expected to be consistent with
    the grammar, the second (no rules, initial nonterminal ``("A", 3)``)
    and the third (rule ``r2`` used with two successors) are not.
    """
    grammar, r1, r2 = self.build_grammar()
    nont_map = Enumerator()
    grammarInfo = PyGrammarInfo(grammar, nont_map)

    def state(span):
        return "S", span

    def make_rtg(initial, rules):
        automaton = RTG(initial)
        for lhs, symbol, rhs in rules:
            automaton.construct_and_add_rule(lhs, symbol, rhs)
        return automaton

    rtg = make_rtg(state(3), [
        (state(3), r1, [state(1), state(2)]),
        (state(3), r1, [state(2), state(1)]),
        (state(2), r1, [state(1), state(1)]),
        (state(1), r2, []),
    ])
    rtg2 = make_rtg(("A", 3), [])
    rtg3 = make_rtg(state(3), [
        (state(3), r1, [state(1), state(2)]),
        (state(3), r1, [state(2), state(1)]),
        (state(2), r2, [state(1), state(1)]),
        (state(1), r2, []),
    ])

    traces = PyDerivationManager(grammar, nont_map)
    traces.convert_rtgs_to_hypergraphs([rtg, rtg2, rtg3])

    consistent = traces.is_consistent_with_grammar(grammarInfo, traceId=0)
    self.assertTrue(consistent)
    for trace_id in (1, 2):
        self.assertFalse(
            traces.is_consistent_with_grammar(grammarInfo, traceId=trace_id))
def main():
    """Induce a grammar, run split/merge training and optionally parse.

    A grammar is induced from the training corpus, derivations are computed
    and stored in a derivation manager, and a split/merge trainer is run for
    ``max_cycles + 1`` cycles.  After each cycle a split/merge grammar is
    built and, if ``parsing`` is set, the test corpus is parsed with it.
    """
    # induce grammar from a corpus
    trees = parse_conll_corpus(train, False, limit_train)
    labeling_factory = the_labeling_factory()
    nonterminal_labelling = labeling_factory.create_simple_labeling_strategy(
        "childtop", "deprel")
    term_labelling = the_terminal_labeling_factory().get_strategy('pos')
    start = 'START'
    recursive_partitioning = [cfg]
    _, grammar = induce_grammar(trees, nonterminal_labelling,
                                term_labelling.token_label,
                                recursive_partitioning, start)

    # compute some derivations
    derivations = obtain_derivations(grammar, term_labelling)

    # create derivation manager and add derivations
    manager = PyDerivationManager(grammar)
    manager.convert_derivations_to_hypergraphs(derivations)
    manager.serialize(b"/tmp/derivations.txt")

    # build and configure split/merge trainer and supplementary objects
    rule_to_nonterminals = []
    for rule_idx in range(len(grammar.rule_index())):
        rule = grammar.rule_index(rule_idx)
        lhs_index = manager.get_nonterminal_map().object_index(
            rule.lhs().nont())
        rhs_indices = [manager.get_nonterminal_map().object_index(nont)
                       for nont in rule.rhs()]
        rule_to_nonterminals.append([lhs_index] + rhs_indices)

    grammarInfo = PyGrammarInfo(grammar, manager.get_nonterminal_map())
    storageManager = PyStorageManager()

    builder = PySplitMergeTrainerBuilder(manager, grammarInfo)
    builder.set_em_epochs(20)
    builder.set_percent_merger(60.0)
    splitMergeTrainer = builder.build()

    initial_annotation = build_PyLatentAnnotation_initial(
        grammar, grammarInfo, storageManager)
    latentAnnotation = [initial_annotation]

    for i in range(max_cycles + 1):
        refined = splitMergeTrainer.split_merge_cycle(latentAnnotation[-1])
        latentAnnotation.append(refined)
        smGrammar = build_sm_grammar(latentAnnotation[i], grammar,
                                     grammarInfo, rule_pruning=0.0001,
                                     rule_smoothing=0.01)
        print("Cycle: ", i, "Rules: ", len(smGrammar.rules()))

        if not parsing:
            continue

        parser = GFParser(smGrammar)
        trees = parse_conll_corpus(test, False, limit_test)
        for tree in trees:
            parser_input = term_labelling.prepare_parser_input(
                tree.token_yield())
            parser.set_input(parser_input)
            parser.parse()
            if parser.recognized():
                hybrid_tree = derivation_to_hybrid_tree(
                    parser.best_derivation_tree(),
                    [token.pos() for token in tree.token_yield()],
                    [token.form() for token in tree.token_yield()],
                    construct_constituent_token)
                print(hybrid_tree)
def test_individual_parsing_stages(self):
    """Walk through the parsing pipeline stage by stage.

    The grammar is transformed into a disco-dop grammar, the input
    ``["a"] * 3`` is parsed, the filtered chart is printed item by item and
    converted into a hypergraph.  The projected edge weights of the initial
    latent annotation are then checked for the variational and the
    non-variational mode, and a Viterbi derivation is computed.
    """
    grammar = self.build_grammar()
    for r in transform_grammar(grammar):
        pprint(r)

    rule_list = list(transform_grammar(grammar))
    pprint(rule_list)
    disco_grammar = Grammar(rule_list, start=grammar.start())
    print(disco_grammar)

    inp = ["a"] * 3
    estimates = 'SXlrgaps', getestimates(disco_grammar, 40, grammar.start())
    print(type(estimates))
    chart, msg = parse(inp, disco_grammar, estimates=estimates)
    print(chart)
    print(msg)
    chart.filter()
    print("filtered chart")
    print(disco_grammar.nonterminals)
    print(type(disco_grammar.nonterminals))
    print(chart)

    root = chart.root()
    print("root", root, type(root))
    print(chart.indices(root))
    print(chart.itemstr(root))
    print(chart.stats())
    print("root label", chart.label(root))
    print(root, chart.itemid1(chart.label(root), chart.indices(root)))

    def item_str(item):
        # "<nonterminal>[<item number>]"
        return disco_grammar.nonterminalstr(chart.label(item)) \
            + "[" + str(item) + "]"

    for i in range(1, chart.numitems() + 1):
        print(i, chart.label(i), chart.indices(i), chart.numedges(i))
        for edge_num in range(chart.numedges(i)):
            edge = chart.getEdgeForItem(i, edge_num)
            if isinstance(edge, tuple):
                # item number 0 marks an absent successor
                print("\t", item_str(i), "->",
                      ' '.join([item_str(j)
                                for j in [edge[1], edge[2]] if j != 0]))
            else:
                # lexical edge: ``edge`` is the position of the input token
                print("\t", item_str(i), "->", inp[edge])
    print(chart.getEdgeForItem(root, 0))

    manager = PyDerivationManager(grammar)
    manager.convert_chart_to_hypergraph(chart, disco_grammar, debug=True)

    # tempfile.mktemp() is deprecated and race-prone; reserve the file
    # atomically and hand only its name to the serializer.
    with tempfile.NamedTemporaryFile(delete=False) as handle:
        file = handle.name
    print(file)
    manager.serialize(bytes(file, encoding="utf-8"))

    gi = PyGrammarInfo(grammar, manager.get_nonterminal_map())
    sm = PyStorageManager()
    la = build_PyLatentAnnotation_initial(grammar, gi, sm)

    vec = py_edge_weight_projection(la, manager, variational=True,
                                    debug=True, log_mode=False)
    print(vec)
    self.assertEqual(
        [1.0, 1.0, 1.0, 0.5, 0.5, 0.5, 0.5, 0.25, 0.25, 0.25, 0.25, 1.0],
        vec)

    vec = py_edge_weight_projection(la, manager, variational=False,
                                    debug=True, log_mode=False)
    print(vec)
    self.assertEqual(
        [1.0, 1.0, 1.0, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 1.0],
        vec)

    der = manager.viterbi_derivation(0, vec, grammar)
    print(der)

    # repeat the projection in log mode and print the result each time
    for _ in range(5):
        vec2 = py_edge_weight_projection(la, manager, debug=True,
                                         log_mode=True)
        print(vec2)
def test_negra_to_dag_parsing(self):
    """End-to-end test: TIGER sentence -> NEGRA export -> DAG -> grammar.

    A sentence is exported to NEGRA format, binarized with ``discodop``,
    re-read as hybrid DAGs, an LCFRS/DCP grammar is extracted from the
    binarized DAG, the sentence is parsed and the DCP evaluated.  Finally
    reducts are computed with the external Schick parser and the
    derivations enumerated from them are integrity-checked.
    """
    names = list(map(str, [26954]))

    # mkstemp returns an open descriptor; wrap it so that it gets closed.
    fd_, primary_file = tempfile.mkstemp(suffix='.export')
    with os.fdopen(fd_, mode='w') as pf:
        for s in names:
            dsg = tp.sentence_names_to_deep_syntax_graphs(
                ["s" + s],
                "res/tiger/tiger_s%s.xml" % s,
                hold=False,
                ignore_puntcuation=False)[0]
            dsg.set_label(dsg.label[1:])
            lines = np.serialize_hybrid_dag_to_negra(
                [dsg], 0, 500, use_sentence_names=True)
            print(''.join(lines), file=pf)

    # only the name is needed here; discodop writes the file itself
    fd_bin, binarized_file = tempfile.mkstemp(suffix='.export')
    os.close(fd_bin)
    subprocess.call([
        "discodop", "treetransforms", "--binarize", "-v", "1", "-h", "1",
        primary_file, binarized_file
    ])
    print(primary_file)
    print(binarized_file)

    corpus = np.sentence_names_to_hybridtrees(names, primary_file,
                                              secedge=True)
    corpus2 = np.sentence_names_to_hybridtrees(names, binarized_file,
                                               secedge=True)
    dag = corpus[0]
    print(dag)
    assert isinstance(dag, HybridDag)
    self.assertEqual(8, len(dag.token_yield()))
    for token in dag.token_yield():
        print(token.form() + '/' + token.pos(), end=' ')
    print()

    dag_bin = corpus2[0]
    print(dag_bin)
    for token in dag_bin.token_yield():
        print(token.form() + '/' + token.pos(), end=' ')
    print()
    self.assertEqual(8, len(dag_bin.token_yield()))

    for node, token in zip(
            dag_bin.nodes(),
            list(map(str, map(dag_bin.node_token, dag_bin.nodes())))):
        print(node, token)
    print()

    print(top(dag_bin, {'500', '101', '102'}))
    self.assertSetEqual({'101', '500'}, top(dag_bin, {'500', '101', '102'}))
    print(bottom(dag_bin, {'500', '101', '102'}))
    self.assertSetEqual({'502'}, bottom(dag_bin, {'500', '101', '102'}))

    nont_labeling = BasicNonterminalLabeling()
    term_labeling = FormTerminals()  # PosTerminals()
    grammar = direct_extract_lcfrs_from_prebinarized_corpus(
        dag_bin, term_labeling, nont_labeling)
    for rule in grammar.rules():
        print(rule.get_idx(), rule)

    print("Testing LCFRS parsing and DCP evaluation".center(80, '='))

    parser = LCFRS_parser(grammar)
    parser_input = term_labeling.prepare_parser_input(dag_bin.token_yield())
    print(parser_input)
    parser.set_input(parser_input)
    parser.parse()
    self.assertTrue(parser.recognized())

    der = parser.best_derivation_tree()
    print(der)
    dcp_term = DCP_evaluator(der).getEvaluation()
    print(dcp_term[0])

    dag_eval = HybridDag(dag_bin.sent_label())
    dcp_to_hybriddag(dag_eval, dcp_term,
                     copy.deepcopy(dag_bin.token_yield()), False,
                     construct_token=construct_constituent_token)
    print(dag_eval)
    for node in dag_eval.nodes():
        token = dag_eval.node_token(node)
        if token.type() == "CONSTITUENT-CATEGORY":
            label = token.category()
        elif token.type() == "CONSTITUENT-TERMINAL":
            label = token.form(), token.pos()
        else:
            # previously ``label`` was unbound (or stale) for other types
            label = None
        print(node, label, dag_eval.children(node),
              dag_eval.sec_children(node), dag_eval.sec_parents(node))

    lines = np.serialize_hybridtrees_to_negra([dag_eval], 1, 500,
                                              use_sentence_names=True)
    for line in lines:
        print(line, end='')
    print()

    with open(primary_file) as pcf:
        for line in pcf:
            print(line, end='')

    print('Testing reduct computation with Schick parser'.center(80, '='))

    grammar_path = '/tmp/lcfrs_dcp_grammar.gr'
    derivation_manager = PyDerivationManager(grammar)
    with open(grammar_path, 'w') as grammar_file:
        nonterminal_enc, terminal_enc = linearize(
            grammar, nont_labeling, term_labeling, grammar_file,
            delimiter=' : ',
            nonterminal_encoder=derivation_manager.get_nonterminal_map())

    print(np.negra_to_json(dag, terminal_enc, term_labeling))
    json_data = np.export_corpus_to_json([dag], terminal_enc, term_labeling)

    corpus_path = '/tmp/json_dags.json'
    with open(corpus_path, 'w') as data_file:
        json.dump(json_data, data_file)

    reduct_dir = '/tmp/schick_parser_reducts'
    if os.path.isdir(reduct_dir):
        shutil.rmtree(reduct_dir)
    os.makedirs(reduct_dir)

    # argument list instead of a joined string run through the shell
    p = subprocess.Popen(
        ["java", "-jar", os.path.join("util", SCHICK_PARSER_JAR),
         'reduct', '-g', grammar_path, '-t', corpus_path,
         "--input-format", "json", "-o", reduct_dir],
        stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    print("stdout", p.stdout.name)

    # relay the parser's output until the pipe is drained and it has exited
    while True:
        nextline = p.stdout.readline()
        if nextline == b'' and p.poll() is not None:
            break
        print(nextline.decode('unicode_escape'), end='')

    p.wait()
    p.stdout.close()
    self.assertEqual(0, p.returncode)

    rtgs = []

    def decode_nonterminals(s):
        return derivation_manager.get_nonterminal_map().index_object(int(s))

    for i in range(1, len(corpus) + 1):
        rtgs.append(
            read_rtg(os.path.join(reduct_dir, str(i) + '.gra'),
                     symbol_offset=-1,
                     rule_prefix='r',
                     process_nonterminal=decode_nonterminals))

    print("Reduct RTG")
    for rule in rtgs[0].rules:
        print(rule.lhs, "->", rule.symbol, rule.rhs)

    derivation_manager.get_nonterminal_map().print_index()
    derivation_manager.convert_rtgs_to_hypergraphs(rtgs)
    derivation_manager.serialize(
        bytes('/tmp/reduct_manager.trace', encoding='utf8'))

    derivations = [
        LCFRSDerivationWrapper(der)
        for der in derivation_manager.enumerate_derivations(0, grammar)
    ]
    self.assertGreaterEqual(len(derivations), 1)

    if len(derivations) >= 1:
        print("Sentence", i)
        for der in derivations:
            print(der)
            self.assertTrue(
                der.check_integrity_recursive(der.root_id(),
                                              grammar.start()))
def build_score_validator(baseline_grammar, grammarInfo, nont_map,
                          storageManager, term_labelling, parser,
                          corpus_validation, validationMethod):
    """Build a candidate score validator from labelled-span scores.

    Every tree of ``corpus_validation`` is parsed; each k-best derivation is
    turned into a constituent tree whose labelled spans are compared with
    those of the gold tree.  The resulting scores are registered with the
    validator together with a derivation manager holding the derivations.

    :param baseline_grammar: grammar the derivation managers are built for
    :param grammarInfo: passed to ``PyCandidateScoreValidator``
    :param nont_map: nonterminal map passed to ``PyDerivationManager``
    :param storageManager: passed to ``PyCandidateScoreValidator``
    :param term_labelling: provides ``prepare_parser_input``
    :param parser: parser providing ``k_best_derivation_trees``
    :param corpus_validation: iterable of gold trees
    :param validationMethod: ``"F1"``, ``"Precision"`` or ``"Recall"``
    :return: the populated validator
    :raises ValueError: if a derivation has to be scored with an unknown
        ``validationMethod``
    """
    validator = PyCandidateScoreValidator(grammarInfo, storageManager,
                                          validationMethod)

    tree_count = 0
    der_count = 0
    for gold_tree in corpus_validation:
        tree_count += 1
        parser.set_input(
            term_labelling.prepare_parser_input(gold_tree.token_yield()))
        parser.parse()
        derivations = [der for _, der in parser.k_best_derivation_trees()]

        manager = PyDerivationManager(baseline_grammar, nont_map)
        manager.convert_hypergraphs(derivations)

        scores = []
        relevant = {tuple(t) for t in gold_tree.labelled_spans()}

        for der in derivations:
            der_count += 1

            h_tree = ConstituentTree()
            cleaned_tokens = copy.deepcopy(gold_tree.full_token_yield())
            dcp = DCP_evaluator(der).getEvaluation()
            dcp_to_hybridtree(h_tree, dcp, cleaned_tokens, False,
                              construct_constituent_token)

            retrieved = {tuple(t) for t in h_tree.labelled_spans()}
            inters = retrieved & relevant

            # in case of parse failure there are two options here:
            #   - parse failure -> no spans at all, thus precision = 1
            #   - parse failure -> a dummy tree with all spans wrong,
            #     thus precision = 0
            # An empty span set is scored with precision 0 here.
            precision = 1.0 * len(inters) / len(retrieved) \
                if len(retrieved) > 0 else 0
            recall = 1.0 * len(inters) / len(relevant) \
                if len(relevant) > 0 else 0
            fmeasure = 2.0 * precision * recall / (precision + recall) \
                if precision + recall > 0 else 0

            if validationMethod == "F1":
                scores.append(fmeasure)
            elif validationMethod == "Precision":
                scores.append(precision)
            elif validationMethod == "Recall":
                scores.append(recall)
            else:
                # ``raise ()`` is itself a TypeError and hides the cause
                raise ValueError(
                    "unknown validation method: %r" % (validationMethod,))

        # the best achievable score is 0 if the gold tree has no spans
        validator.add_scored_candidates(manager, scores,
                                        1.0 if len(relevant) > 0 else 0.0)
        parser.clear()

    # an empty corpus must not end in a ZeroDivisionError
    average = der_count * 1.0 / tree_count if tree_count > 0 else 0.0
    print("trees used for validation ", tree_count, "with", average,
          "derivations on average")
    return validator
def build_score_validator(baseline_grammar, grammarInfo, nont_map,
                          storageManager, term_labelling, parser,
                          corpus_validation, validationMethod):
    """Build a candidate score validator from dependency attachment scores.

    Every tree of ``corpus_validation.get_trees()`` is parsed; each k-best
    derivation is turned into a hybrid tree and compared token by token with
    the gold tree.  The number of matching heads (UAS), labels (LAC) or both
    (LAS) is registered as the score of the derivation; the maximal score is
    the length of the gold tree's yield.

    :param baseline_grammar: grammar the derivation managers are built for
    :param grammarInfo: passed to ``PyCandidateScoreValidator``
    :param nont_map: nonterminal map passed to ``PyDerivationManager``
    :param storageManager: passed to ``PyCandidateScoreValidator``
    :param term_labelling: provides ``prepare_parser_input``
    :param parser: parser providing ``k_best_derivation_trees``
    :param corpus_validation: corpus object providing ``get_trees()``
    :param validationMethod: ``"LAS"``, ``"UAS"`` or ``"LAC"``; for any
        other value no scores are recorded
    :return: the populated validator
    """
    validator = PyCandidateScoreValidator(grammarInfo, storageManager,
                                          validationMethod)

    tree_count = 0
    der_count = 0
    for gold_tree in corpus_validation.get_trees():
        tree_count += 1
        parser.set_input(
            term_labelling.prepare_parser_input(gold_tree.token_yield()))
        parser.parse()

        # Materialise the k-best list once: it is needed both for the
        # hypergraph conversion and for scoring.
        derivations = [der for _, der in parser.k_best_derivation_trees()]
        manager = PyDerivationManager(baseline_grammar, nont_map)
        manager.convert_derivations_to_hypergraphs(derivations)

        scores = []

        # gold head (1-based position, 0 = root) and label per position
        gold_labels = {}
        gold_heads = {}
        gold_ids = gold_tree.id_yield()
        for position, node_id in enumerate(gold_ids):
            parent_id = gold_tree.parent(node_id)
            gold_labels[position] = gold_tree.node_token(node_id).deprel()
            if parent_id is None:
                assert node_id in gold_tree.root
                gold_heads[position] = 0
            else:
                gold_heads[position] = gold_ids.index(parent_id) + 1

        for der in derivations:
            der_count += 1

            h_tree = HybridTree()
            cleaned_tokens = copy.deepcopy(gold_tree.full_token_yield())
            dcp = DCP_evaluator(der).getEvaluation()
            dcp_to_hybridtree(h_tree, dcp, cleaned_tokens, False,
                              construct_conll_token)

            las, uas, lac = 0, 0, 0
            parsed_ids = h_tree.id_yield()
            for position, node_id in enumerate(parsed_ids):
                parent_id = h_tree.parent(node_id)
                if parent_id is None:
                    assert node_id in h_tree.root
                    head = 0
                else:
                    head = parsed_ids.index(parent_id) + 1
                label = h_tree.node_token(node_id).deprel()

                head_ok = gold_heads[position] == head
                label_ok = gold_labels[position] == label
                if head_ok:
                    uas += 1
                if label_ok:
                    lac += 1
                if head_ok and label_ok:
                    las += 1

            # NOTE(review): an unknown validationMethod silently records no
            # score at all; confirm whether that should be an error.
            if validationMethod == "LAS":
                scores.append(las)
            elif validationMethod == "UAS":
                scores.append(uas)
            elif validationMethod == "LAC":
                scores.append(lac)

        max_score = len(gold_ids)
        validator.add_scored_candidates(manager, scores, max_score)
        print(tree_count, max_score, scores)
        parser.clear()

    # an empty corpus must not end in a ZeroDivisionError
    average = der_count * 1.0 / tree_count if tree_count > 0 else 0.0
    print("trees used for validation ", tree_count, "with", average,
          "derivations on average")
    return validator
def test_json_corpus_grammar_export(self):
    """Round-trip test for the JSON export of a DOG grammar and corpus.

    A grammar is induced from TIGER sentences 1-50 and exported to JSON
    together with the corpus.  The external Schick parser computes a reduct
    per sentence; the reducts are read back as RTGs and every derivation
    enumerated from them must reproduce the (label-projected) sentence and
    graph of the corresponding deep syntax graph.
    """
    start = 1
    stop = 50
    # tiger_path = "res/tiger/tiger_release_aug07.corrected.16012013.utf8.xml"
    tiger_path = "res/tiger/tiger_8000.xml"
    exclude = []
    dsgs = sentence_names_to_deep_syntax_graphs(
        ['s' + str(i) for i in range(start, stop + 1) if i not in exclude],
        tiger_path,
        hold=False)

    rec_part_strategy = \
        the_recursive_partitioning_factory().get_partitioning('cfg')[0]

    def label_edge(edge):
        if isinstance(edge.label, ConstituentTerminal):
            return edge.label.pos()
        else:
            return edge.label

    def nonterminal_labeling(nodes, dsg):
        return simple_labeling(nodes, dsg, label_edge)

    term_labeling_token = PosTerminals()

    def term_labeling(token):
        if isinstance(token, ConstituentTerminal):
            return term_labeling_token.token_label(token)
        else:
            return token

    grammar = induction_on_a_corpus(dsgs, rec_part_strategy,
                                    nonterminal_labeling, term_labeling)
    grammar.make_proper()

    terminals = Enumerator()

    data = export_dog_grammar_to_json(grammar, terminals)
    grammar_path = '/tmp/json_grammar.json'
    with open(grammar_path, 'w') as file:
        json.dump(data, file)

    corpus_path = '/tmp/json_corpus.json'
    with open(corpus_path, 'w') as file:
        json.dump(
            export_corpus_to_json(dsgs, terminals,
                                  terminal_labeling=term_labeling),
            file)

    with open('/tmp/enumerator.enum', 'w') as file:
        terminals.print_index(file)

    reduct_dir = '/tmp/reduct_grammars'
    if os.path.isdir(reduct_dir):
        shutil.rmtree(reduct_dir)
    os.makedirs(reduct_dir)

    # argument list instead of a joined string run through the shell
    p = subprocess.Popen(
        ["java", "-jar", os.path.join("util", SCHICK_PARSER_JAR),
         'dog-reduct', '-g', grammar_path, '-t', corpus_path,
         "-o", reduct_dir],
        stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    print("stdout", p.stdout.name)

    # relay the parser's output until the pipe is drained and it has exited
    while True:
        nextline = p.stdout.readline()
        if nextline == b'' and p.poll() is not None:
            break
        print(nextline.decode('unicode_escape'), end='')

    p.wait()
    p.stdout.close()
    self.assertEqual(0, p.returncode)

    # reduct files are numbered from 1; build the paths from reduct_dir
    # rather than repeating the directory literal
    rtgs = [read_rtg(os.path.join(reduct_dir, str(i) + '.gra'))
            for i in range(1, len(dsgs) + 1)]

    derivation_manager = PyDerivationManager(grammar)
    derivation_manager.convert_rtgs_to_hypergraphs(rtgs)
    derivation_manager.serialize(
        bytes('/tmp/reduct_manager.trace', encoding='utf8'))

    def f(token):
        # project terminals to their POS tag, keep everything else
        return token.pos() if isinstance(token, ConstituentTerminal) \
            else token

    for i, (rtg, dsg) in enumerate(zip(rtgs, dsgs)):
        derivations = [
            LCFRSDerivationWrapper(der)
            for der in derivation_manager.enumerate_derivations(i, grammar)
        ]
        self.assertGreaterEqual(len(derivations), 1)
        if len(derivations) > 1:
            print("Sentence", i)
            for der in derivations:
                print(der)

        for der in derivations:
            dog, sync = dog_evaluation(der)
            dsg2 = DeepSyntaxGraph(der.compute_yield(), dog, sync)
            dsg.dog.project_labels(f)
            dsg.sentence = list(map(f, dsg.sentence))
            self.assertEqual(dsg.sentence, dsg2.sentence)
            morphs = dsg.dog.compute_isomorphism(dsg2.dog)
            self.assertFalse(morphs is None)
            self.assertListEqual(
                [[morphs[0].get(node, node) for node in syncs]
                 for syncs in dsg.synchronization],
                dsg2.synchronization)
def run_experiment(rec_part_strategy, nonterminal_labeling, exp,
                   reorder_children, binarize=True):
    """Induce a grammar from TIGER deep syntax graphs and parse a test set.

    Sentences 1-7000 are used for grammar induction (optionally binarized),
    sentences 7001-7200 for testing.  The function returns the result of
    ``do_parsing`` with a 500-best parser in oracle mode.

    NOTE: everything after that ``return`` (reduct computation, EM and
    split/merge training) is currently unreachable; it is kept as the
    remainder of the experiment pipeline.

    :param rec_part_strategy: recursive partitioning strategy for induction
    :param nonterminal_labeling: nonterminal labeling for induction
    :param exp: experiment id, used for the working directory name
    :param reorder_children: forwarded to the corpus reader
    :param binarize: whether to binarize the training graphs
    :return: the result of ``do_parsing``
    """
    start = 1
    stop = 7000
    test_start = 7001
    test_stop = 7200
    # path = "res/tiger/tiger_release_aug07.corrected.16012013.utf8.xml"
    corpus_path = "res/tiger/tiger_8000.xml"
    exclude = []
    train_dsgs = sentence_names_to_deep_syntax_graphs(
        ['s' + str(i) for i in range(start, stop + 1) if i not in exclude],
        corpus_path,
        hold=False,
        reorder_children=reorder_children)
    test_dsgs = sentence_names_to_deep_syntax_graphs(
        ['s' + str(i) for i in range(test_start, test_stop + 1)
         if i not in exclude],
        corpus_path,
        hold=False,
        reorder_children=reorder_children)

    # Grammar induction
    term_labeling_token = PosTerminals()

    def term_labeling(token):
        if isinstance(token, ConstituentTerminal):
            return term_labeling_token.token_label(token)
        else:
            return token

    if binarize:
        def modify_token(token):
            # mark labels introduced by binarization with a '-BAR' suffix
            if isinstance(token, ConstituentCategory):
                token_new = deepcopy(token)
                token_new.set_category(token.category() + '-BAR')
                return token_new
            elif isinstance(token, str):
                return token + '-BAR'
            else:
                assert False

        train_dsgs = [
            dsg.binarize(bin_modifier=modify_token) for dsg in train_dsgs
        ]

        def is_bin(token):
            if isinstance(token, ConstituentCategory):
                if token.category().endswith('-BAR'):
                    return True
            elif isinstance(token, str):
                if token.endswith('-BAR'):
                    return True
            return False

        def debinarize(dsg):
            return dsg.debinarize(is_bin=is_bin)
    else:
        # The builtin ``id`` returns an object's integer identity, not the
        # object itself, so it cannot serve as the identity function.
        def debinarize(dsg):
            return dsg

    grammar = induction_on_a_corpus(train_dsgs, rec_part_strategy,
                                    nonterminal_labeling, term_labeling)
    grammar.make_proper()

    print("Nonterminals", len(grammar.nonts()), "Rules", len(grammar.rules()))

    parser = GFParser_k_best(grammar, k=500)
    return do_parsing(parser, test_dsgs, term_labeling_token, oracle=True,
                      debinarize=debinarize)

    # ------------------------------------------------------------------
    # Unreachable from here on (see docstring).
    # ------------------------------------------------------------------

    # Compute reducts, i.e., intersect grammar with each training dsg
    basedir = path.join('/tmp/dog_experiments', 'exp' + str(exp))
    reduct_dir = path.join(basedir, 'reduct_grammars')

    terminal_map = Enumerator()
    if not os.path.isdir(basedir):
        os.makedirs(basedir)
    data = export_dog_grammar_to_json(grammar, terminal_map)
    grammar_path = path.join(basedir, 'grammar.json')
    with open(grammar_path, 'w') as file:
        json.dump(data, file)

    corpus_path = path.join(basedir, 'corpus.json')
    with open(corpus_path, 'w') as file:
        json.dump(
            export_corpus_to_json(train_dsgs, terminal_map,
                                  terminal_labeling=term_labeling),
            file)

    with open(path.join(basedir, 'enumerator.enum'), 'w') as file:
        terminal_map.print_index(file)

    if os.path.isdir(reduct_dir):
        shutil.rmtree(reduct_dir)
    os.makedirs(reduct_dir)
    p = subprocess.Popen([
        ' '.join([
            "java", "-jar", os.path.join("util", SCHICK_PARSER_JAR),
            'dog-reduct', '-g', grammar_path, '-t', corpus_path, "-o",
            reduct_dir
        ])
    ], shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)

    while True:
        nextline = p.stdout.readline()
        # the pipe yields bytes: compare against b'' and decode for stdout
        if nextline == b'' and p.poll() is not None:
            break
        sys.stdout.write(nextline.decode('unicode_escape'))
        sys.stdout.flush()

    p.wait()
    p.stdout.close()

    rtgs = []
    for i in range(1, len(train_dsgs) + 1):
        rtgs.append(read_rtg(path.join(reduct_dir, str(i) + '.gra')))

    derivation_manager = PyDerivationManager(grammar)
    derivation_manager.convert_rtgs_to_hypergraphs(rtgs)
    # NOTE(review): other call sites pass a bytes path to serialize();
    # confirm whether a str is accepted here.
    derivation_manager.serialize(path.join(basedir, 'reduct_manager.trace'))

    # Training
    ## prepare EM training
    em_epochs = 20
    seed = 0
    smoothing_factor = 0.01
    split_randomization = 0.01
    sm_cycles = 2
    merge_percentage = 50.0
    grammarInfo = PyGrammarInfo(grammar,
                                derivation_manager.get_nonterminal_map())
    storageManager = PyStorageManager()

    em_builder = PySplitMergeTrainerBuilder(derivation_manager, grammarInfo)
    em_builder.set_em_epochs(em_epochs)
    em_builder.set_simple_expector(threads=THREADS)
    emTrainer = em_builder.build()

    # randomize initial weights and do em training
    la_no_splits = build_PyLatentAnnotation_initial(grammar, grammarInfo,
                                                    storageManager)
    la_no_splits.add_random_noise(seed=seed)
    emTrainer.em_train(la_no_splits)
    la_no_splits.project_weights(grammar, grammarInfo)

    do_parsing(CFGParser(grammar), test_dsgs, term_labeling_token)
    return

    ## prepare SM training
    builder = PySplitMergeTrainerBuilder(derivation_manager, grammarInfo)
    builder.set_em_epochs(em_epochs)
    builder.set_split_randomization(1.0, seed + 1)
    builder.set_simple_expector(threads=THREADS)
    builder.set_smoothing_factor(smoothingFactor=smoothing_factor)
    builder.set_split_randomization(percent=split_randomization)
    # builder.set_scc_merger(-0.2)
    builder.set_percent_merger(merge_percentage)
    splitMergeTrainer = builder.build()

    # splitMergeTrainer.setMaxDrops(validationDropIterations, mode="smoothing")
    splitMergeTrainer.setEMepochs(em_epochs, mode="smoothing")

    # set initial latent annotation
    latentAnnotation = [la_no_splits]

    # carry out split/merge training and do parsing
    parsing_method = "filter-ctf"
    # parsing_method = "single-best-annotation"
    k_best = 50
    for i in range(1, sm_cycles + 1):
        splitMergeTrainer.reset_random_seed(seed + i + 1)
        latentAnnotation.append(
            splitMergeTrainer.split_merge_cycle(latentAnnotation[-1]))
        print("Cycle: ", i)
        if parsing_method == "single-best-annotation":
            smGrammar = latentAnnotation[i].build_sm_grammar(
                grammar, grammarInfo, rule_pruning=0.0001,
                rule_smoothing=0.1)
            print("Rules in smoothed grammar: ", len(smGrammar.rules()))
            parser = GFParser(smGrammar)
        elif parsing_method == "filter-ctf":
            latentAnnotation[-1].project_weights(grammar, grammarInfo)
            parser = Coarse_to_fine_parser(
                grammar, latentAnnotation[-1], grammarInfo,
                derivation_manager.get_nonterminal_map(),
                base_parser_type=GFParser_k_best, k=k_best)
        else:
            raise (Exception())
        do_parsing(parser, test_dsgs, term_labeling_token)
        del parser