def test_multiroot(self):
    tree = multi_dep_tree()
    term_pos = the_terminal_labeling_factory().get_strategy('pos').token_label
    fanout_1 = the_recursive_partitioning_factory().get_partitioning('fanout-1')
    for top_level_labeling_strategy in ['strict', 'child']:
        labeling_strategy = the_labeling_factory().create_simple_labeling_strategy(
            top_level_labeling_strategy, 'pos+deprel')
        for recursive_partitioning in [[direct_extraction], fanout_1, [left_branching]]:
            (_, grammar) = induce_grammar([tree], labeling_strategy, term_pos,
                                          recursive_partitioning, 'START')
            print(grammar)

            parser = LCFRS_parser(grammar, 'pA pB pC pD pE'.split(' '))
            print(parser.best_derivation_tree())

            cleaned_tokens = copy.deepcopy(tree.full_token_yield())
            for token in cleaned_tokens:
                token.set_edge_label('_')
            hybrid_tree = HybridTree()
            hybrid_tree = parser.dcp_hybrid_tree_best_derivation(
                hybrid_tree, cleaned_tokens, True, construct_conll_token)
            print(hybrid_tree)
            self.assertEqual(tree, hybrid_tree)

def test_conll_grammar_induction(self):
    ignore_punctuation = True
    trees = parse_conll_corpus(TEST_FILE, False)
    trees = disconnect_punctuation(trees)
    terminal_labeling = the_terminal_labeling_factory().get_strategy('pos')
    nonterminal_labeling = the_labeling_factory().create_simple_labeling_strategy('child', 'pos')
    (_, grammar) = d_i.induce_grammar(trees, nonterminal_labeling,
                                      terminal_labeling.token_label,
                                      [direct_extraction], 'START')

    trees2 = parse_conll_corpus(TEST_FILE_MODIFIED, False)
    trees2 = disconnect_punctuation(trees2)
    for tree in trees2:
        parser = LCFRS_parser(
            grammar, terminal_labeling.prepare_parser_input(tree.token_yield()))
        cleaned_tokens = copy.deepcopy(tree.full_token_yield())
        for token in cleaned_tokens:
            token.set_edge_label('_')
        h_tree = HybridTree()
        h_tree = parser.dcp_hybrid_tree_best_derivation(
            h_tree, cleaned_tokens, ignore_punctuation, construct_conll_token)
        # print(h_tree)
        print('input -> hybrid-tree -> output')
        print(tree_to_conll_str(tree))
        print('parsed tokens')
        print(list(map(str, h_tree.full_token_yield())))
        print('test_parser output')
        print(tree_to_conll_str(h_tree))

def test_induction_from_corpus_tree(self):
    dsg = sentence_names_to_deep_syntax_graphs(
        ["s26954"], "res/tiger/tiger_s26954.xml", hold=False)[0]

    def label_edge(edge):
        if isinstance(edge.label, ConstituentTerminal):
            return edge.label.pos()
        else:
            return edge.label

    labeling = lambda nodes, dsg: simple_labeling(nodes, dsg, label_edge)

    term_labeling_token = PosTerminals()

    def term_labeling(token):
        if isinstance(token, ConstituentTerminal):
            return term_labeling_token.token_label(token)
        else:
            return token

    rec_part_strategy = the_recursive_partitioning_factory().get_partitioning('cfg')[0]
    rec_part = rec_part_strategy(dsg)
    dcmp = compute_decomposition(dsg, rec_part)
    grammar = induce_grammar_from(dsg, rec_part, dcmp, labeling=labeling,
                                  terminal_labeling=term_labeling)
    print(grammar)

    parser = LCFRS_parser(grammar)
    parser.set_input(term_labeling_token.prepare_parser_input(dsg.sentence))
    parser.parse()
    self.assertTrue(parser.recognized())
    derivation = parser.best_derivation_tree()
    self.assertNotEqual(derivation, None)

def test_induction_with_labeling_strategies(self):
    dsg = build_dsg()
    rec_part_strategy = the_recursive_partitioning_factory().get_partitioning('right-branching')[0]
    rec_part = rec_part_strategy(dsg)
    dcmp = compute_decomposition(dsg, rec_part)
    grammar = induce_grammar_from(dsg, rec_part, dcmp, labeling=simple_labeling,
                                  terminal_labeling=str)
    print(grammar)

    parser = LCFRS_parser(grammar)
    # ["Sie", "entwickelt", "und", "druckt", "Verpackungen", "und", "Etiketten"]
    parser.set_input(dsg.sentence)
    parser.parse()
    self.assertTrue(parser.recognized())
    derivation = parser.best_derivation_tree()
    self.assertNotEqual(derivation, None)

    dog, sync_list = dog_evaluation(derivation)
    self.assertEqual(dog, dsg.dog)
    self.assertEqual(len(sync_list), len(dsg.sentence))
    # print(dog)
    # print(sync_list)

    morphism, _ = dsg.dog.compute_isomorphism(dog)
    for i in range(len(dsg.sentence)):
        self.assertListEqual(
            list(map(lambda x: morphism[x], dsg.get_graph_position(i))),
            sync_list[i])

def __process_single_dsg(self, i, dsg, rec_part_strat, terminal_labeling):
    if True or len(dsg.dog.outputs) > 1:
        print(i, dsg, dsg.label)
        # if i == 89:
        #     render_and_view_dog(dsg.dog, 'dm0', 'dm0')
        #     render_and_view_dog(corpus[1].dog, 'dm1', 'dm1')
        #     print(dsg.sentence, dsg.synchronization, dsg.label)
        # dog39 = dsg.dog.extract_dog([39], [], enforce_outputs=False)
        # render_and_view_dog(dog39, "dog39")

        rec_part = rec_part_strat(dsg)
        if False and i == 89:
            pretty_print_rec_partitioning(rec_part)

        decomp = compute_decomposition(dsg, rec_part)
        # print(decomp)

        grammar = induce_grammar_from(dsg, rec_part, decomp,
                                      terminal_labeling=terminal_labeling,
                                      enforce_outputs=False, normalize=True)
        if False and i == 89:
            print(grammar)

        parser = LCFRS_parser(grammar)
        parser.set_input(list(map(terminal_labeling, dsg.sentence)))
        print("parsing")
        parser.parse()
        self.assertTrue(parser.recognized())
        derivation = parser.best_derivation_tree()
        self.assertNotEqual(derivation, None)

        dog, sync_list = dog_evaluation(derivation)
        dsg.dog.project_labels(terminal_labeling)
        if False and i == 89:
            render_and_view_dog(dsg.dog, "corpus", "corpus_graph")
            render_and_view_dog(dog, "parse_result", "parse_result")
        print("comparing")
        self.assertEqual(dog, dsg.dog)

def test_dsg(self):
    dsg = build_dsg()
    rec_part = dsg.recursive_partitioning()
    self.assertEqual(
        rec_part,
        ({0, 1, 2, 3, 4, 5, 6},
         [({0, 1}, [({0}, []), ({1}, [])]),
          ({2}, []),
          ({3, 4, 5, 6},
           [({3}, []),
            ({4, 5, 6}, [({4}, []), ({5}, []), ({6}, [])])])]))
    dcmp = compute_decomposition(dsg, rec_part)
    self.assertEqual(
        dcmp,
        ([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
         [([1, 4, 5], [([4], []), ([5], [])]),
          ([2], []),
          ([3, 6, 7, 8, 9, 10],
           [([7], []),
            ([6, 8, 9, 10], [([8], []), ([9], []), ([10], [])])])]))
    self.__structurally_equal(rec_part, dcmp)

    grammar = induce_grammar_from(dsg, rec_part, dcmp, terminal_labeling=str)
    # print(grammar)

    for nont, label in zip(["[4]", "[5]", "[2]", "[7]", "[8]", "[9]", "[10]"],
                           ["Sie", "entwickelt", "und", "druckt",
                            "Verpackungen", "und", "Etiketten"]):
        for rule in grammar.lhs_nont_to_rules(nont):
            self.assertEqual(rule.dcp()[0], build_terminal_dog(label))

    for nont, graph in zip(
            ["[1, 4, 5]", "[6, 8, 9, 10]", "[3, 6, 7, 8, 9, 10]",
             "[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]"],
            [dog_s1(), dog_s13(), dog_s3(), dog_se()]):
        for rule in grammar.lhs_nont_to_rules(nont):
            self.assertEqual(rule.dcp()[0], graph)

    parser = LCFRS_parser(grammar)
    # ["Sie", "entwickelt", "und", "druckt", "Verpackungen", "und", "Etiketten"]
    parser.set_input(dsg.sentence)
    parser.parse()
    self.assertTrue(parser.recognized())
    derivation = parser.best_derivation_tree()
    self.assertNotEqual(derivation, None)

    dog, sync_list = dog_evaluation(derivation)
    self.assertEqual(dog, dsg.dog)
    self.assertEqual(len(sync_list), len(dsg.sentence))
    # print(dog)
    # print(sync_list)

    morphism, _ = dsg.dog.compute_isomorphism(dog)
    for i in range(len(dsg.sentence)):
        self.assertListEqual(
            list(map(lambda x: morphism[x], dsg.get_graph_position(i))),
            sync_list[i])

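# The expected values in test_dsg show that a recursive partitioning and its
# decomposition share one shape: nested (item collection, children) pairs. Below is
# a minimal sketch of the kind of shape check that __structurally_equal presumably
# performs -- an illustrative helper, not the repository's implementation:
def _structurally_equal_sketch(rec_part, decomp):
    # Both arguments are (items, children) pairs; only the nesting shape is compared.
    _, part_children = rec_part
    _, decomp_children = decomp
    if len(part_children) != len(decomp_children):
        return False
    return all(_structurally_equal_sketch(p, d)
               for p, d in zip(part_children, decomp_children))
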
def test_induction_and_parsing_with_pos_recovery(self):
    naming = 'child'

    def rec_part(tree):
        return left_branching_partitioning(len(tree.id_yield()))

    tree = self.tree
    tree.add_to_root("VP1")
    print(tree)

    grammar = fringe_extract_lcfrs(tree, rec_part(tree), naming=naming,
                                   isolate_pos=True, term_labeling=FormTerminals())
    print(grammar)

    parser = LCFRS_parser(grammar)
    parser.set_input([token.form() for token in tree.token_yield()])
    parser.parse()
    self.assertTrue(parser.recognized())
    derivation = parser.best_derivation_tree()

    e = DCP_evaluator(derivation)
    dcp_term = e.getEvaluation()
    print(str(dcp_term[0]))

    t = ConstituentTree()
    dcp_to_hybridtree(
        t, dcp_term,
        [construct_constituent_token(token.form(), '--', True)
         for token in tree.token_yield()],
        ignore_punctuation=False,
        construct_token=construct_constituent_token)
    print(t)

    self.assertEqual(len(tree.token_yield()), len(t.token_yield()))
    for tok1, tok2 in zip(tree.token_yield(), t.token_yield()):
        self.assertEqual(tok1.form(), tok2.form())
        self.assertEqual(tok1.pos(), tok2.pos())

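# rec_part above wraps left_branching_partitioning. Mirroring the nested
# (position set, children) format shown in test_dsg, a left-branching partitioning
# of n terminal positions nests all structure in the left child. The following is
# an illustrative reconstruction under that assumption, not the library function:
def _left_branching_sketch(n):
    if n == 1:
        return ({0}, [])
    prefix = _left_branching_sketch(n - 1)
    return (set(range(n)), [prefix, ({n - 1}, [])])

# e.g. _left_branching_sketch(3)
#   == ({0, 1, 2}, [({0, 1}, [({0}, []), ({1}, [])]), ({2}, [])])
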
def test_negra_to_dag_parsing(self):
    names = list(map(str, [26954]))
    fd_, primary_file = tempfile.mkstemp(suffix='.export')
    with open(primary_file, mode='w') as pf:
        for s in names:
            dsg = tp.sentence_names_to_deep_syntax_graphs(
                ["s" + s], "res/tiger/tiger_s%s.xml" % s,
                hold=False, ignore_puntcuation=False)[0]
            dsg.set_label(dsg.label[1:])
            lines = np.serialize_hybrid_dag_to_negra(
                [dsg], 0, 500, use_sentence_names=True)
            print(''.join(lines), file=pf)

    _, binarized_file = tempfile.mkstemp(suffix='.export')
    subprocess.call(["discodop", "treetransforms", "--binarize", "-v", "1",
                     "-h", "1", primary_file, binarized_file])
    print(primary_file)
    print(binarized_file)

    corpus = np.sentence_names_to_hybridtrees(names, primary_file, secedge=True)
    corpus2 = np.sentence_names_to_hybridtrees(names, binarized_file, secedge=True)

    dag = corpus[0]
    print(dag)
    assert isinstance(dag, HybridDag)
    self.assertEqual(8, len(dag.token_yield()))
    for token in dag.token_yield():
        print(token.form() + '/' + token.pos(), end=' ')
    print()

    dag_bin = corpus2[0]
    print(dag_bin)
    for token in dag_bin.token_yield():
        print(token.form() + '/' + token.pos(), end=' ')
    print()
    self.assertEqual(8, len(dag_bin.token_yield()))

    for node, token in zip(dag_bin.nodes(),
                           list(map(str, map(dag_bin.node_token, dag_bin.nodes())))):
        print(node, token)
    print()

    print(top(dag_bin, {'500', '101', '102'}))
    self.assertSetEqual({'101', '500'}, top(dag_bin, {'500', '101', '102'}))
    print(bottom(dag_bin, {'500', '101', '102'}))
    self.assertSetEqual({'502'}, bottom(dag_bin, {'500', '101', '102'}))

    grammar = direct_extract_lcfrs_from_prebinarized_corpus(dag_bin)
    print(grammar)

    parser = LCFRS_parser(grammar)
    poss = list(map(lambda x: x.pos(), dag_bin.token_yield()))
    print(poss)
    parser.set_input(poss)
    parser.parse()
    self.assertTrue(parser.recognized())
    der = parser.best_derivation_tree()
    print(der)

    dcp_term = DCP_evaluator(der).getEvaluation()
    print(dcp_term[0])
    dag_eval = HybridDag(dag_bin.sent_label())
    dcp_to_hybriddag(dag_eval, dcp_term, copy.deepcopy(dag_bin.token_yield()),
                     False, construct_token=construct_constituent_token)
    print(dag_eval)
    for node in dag_eval.nodes():
        token = dag_eval.node_token(node)
        if token.type() == "CONSTITUENT-CATEGORY":
            label = token.category()
        elif token.type() == "CONSTITUENT-TERMINAL":
            label = token.form(), token.pos()
        print(node, label, dag_eval.children(node),
              dag_eval.sec_children(node), dag_eval.sec_parents(node))

    lines = np.serialize_hybridtrees_to_negra([dag_eval], 1, 500,
                                              use_sentence_names=True)
    for line in lines:
        print(line, end='')
    print()
    with open(primary_file) as pcf:
        for line in pcf:
            print(line, end='')

def test_negra_to_dag_parsing_and_reducts(self):
    names = list(map(str, [26954]))
    fd_, primary_file = tempfile.mkstemp(suffix='.export')
    with open(primary_file, mode='w') as pf:
        for s in names:
            dsg = tp.sentence_names_to_deep_syntax_graphs(
                ["s" + s], "res/tiger/tiger_s%s.xml" % s,
                hold=False, ignore_puntcuation=False)[0]
            dsg.set_label(dsg.label[1:])
            lines = np.serialize_hybrid_dag_to_negra(
                [dsg], 0, 500, use_sentence_names=True)
            print(''.join(lines), file=pf)

    _, binarized_file = tempfile.mkstemp(suffix='.export')
    subprocess.call(["discodop", "treetransforms", "--binarize", "-v", "1",
                     "-h", "1", primary_file, binarized_file])
    print(primary_file)
    print(binarized_file)

    corpus = np.sentence_names_to_hybridtrees(names, primary_file, secedge=True)
    corpus2 = np.sentence_names_to_hybridtrees(names, binarized_file, secedge=True)

    dag = corpus[0]
    print(dag)
    assert isinstance(dag, HybridDag)
    self.assertEqual(8, len(dag.token_yield()))
    for token in dag.token_yield():
        print(token.form() + '/' + token.pos(), end=' ')
    print()

    dag_bin = corpus2[0]
    print(dag_bin)
    for token in dag_bin.token_yield():
        print(token.form() + '/' + token.pos(), end=' ')
    print()
    self.assertEqual(8, len(dag_bin.token_yield()))

    for node, token in zip(dag_bin.nodes(),
                           list(map(str, map(dag_bin.node_token, dag_bin.nodes())))):
        print(node, token)
    print()

    print(top(dag_bin, {'500', '101', '102'}))
    self.assertSetEqual({'101', '500'}, top(dag_bin, {'500', '101', '102'}))
    print(bottom(dag_bin, {'500', '101', '102'}))
    self.assertSetEqual({'502'}, bottom(dag_bin, {'500', '101', '102'}))

    nont_labeling = BasicNonterminalLabeling()
    term_labeling = FormTerminals()  # PosTerminals()

    grammar = direct_extract_lcfrs_from_prebinarized_corpus(
        dag_bin, term_labeling, nont_labeling)
    # print(grammar)
    for rule in grammar.rules():
        print(rule.get_idx(), rule)

    print("Testing LCFRS parsing and DCP evaluation".center(80, '='))

    parser = LCFRS_parser(grammar)
    parser_input = term_labeling.prepare_parser_input(dag_bin.token_yield())
    print(parser_input)
    parser.set_input(parser_input)
    parser.parse()
    self.assertTrue(parser.recognized())
    der = parser.best_derivation_tree()
    print(der)

    dcp_term = DCP_evaluator(der).getEvaluation()
    print(dcp_term[0])
    dag_eval = HybridDag(dag_bin.sent_label())
    dcp_to_hybriddag(dag_eval, dcp_term, copy.deepcopy(dag_bin.token_yield()),
                     False, construct_token=construct_constituent_token)
    print(dag_eval)
    for node in dag_eval.nodes():
        token = dag_eval.node_token(node)
        if token.type() == "CONSTITUENT-CATEGORY":
            label = token.category()
        elif token.type() == "CONSTITUENT-TERMINAL":
            label = token.form(), token.pos()
        print(node, label, dag_eval.children(node),
              dag_eval.sec_children(node), dag_eval.sec_parents(node))

    lines = np.serialize_hybridtrees_to_negra([dag_eval], 1, 500,
                                              use_sentence_names=True)
    for line in lines:
        print(line, end='')
    print()
    with open(primary_file) as pcf:
        for line in pcf:
            print(line, end='')

    print('Testing reduct computation with Schick parser'.center(80, '='))

    grammar_path = '/tmp/lcfrs_dcp_grammar.gr'
    derivation_manager = PyDerivationManager(grammar)
    with open(grammar_path, 'w') as grammar_file:
        nonterminal_enc, terminal_enc = linearize(
            grammar, nont_labeling, term_labeling, grammar_file,
            delimiter=' : ',
            nonterminal_encoder=derivation_manager.get_nonterminal_map())

    print(np.negra_to_json(dag, terminal_enc, term_labeling))
    json_data = np.export_corpus_to_json([dag], terminal_enc, term_labeling)

    corpus_path = '/tmp/json_dags.json'
    with open(corpus_path, 'w') as data_file:
        json.dump(json_data, data_file)

    reduct_dir = '/tmp/schick_parser_reducts'
    if os.path.isdir(reduct_dir):
        shutil.rmtree(reduct_dir)
    os.makedirs(reduct_dir)

    p = subprocess.Popen(
        [' '.join(["java", "-jar", os.path.join("util", SCHICK_PARSER_JAR),
                   'reduct', '-g', grammar_path, '-t', corpus_path,
                   "--input-format", "json", "-o", reduct_dir])],
        shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    print("stdout", p.stdout.name)
    while True:
        nextline = p.stdout.readline()
        if nextline == b'' and p.poll() is not None:
            break
        print(nextline.decode('unicode_escape'), end='')
        # sys.stdout.write(nextline)
        # sys.stdout.flush()
    p.wait()
    p.stdout.close()
    self.assertEqual(0, p.returncode)

    rtgs = []

    def decode_nonterminals(s):
        return derivation_manager.get_nonterminal_map().index_object(int(s))

    for i in range(1, len(corpus) + 1):
        rtgs.append(read_rtg(os.path.join(reduct_dir, str(i) + '.gra'),
                             symbol_offset=-1, rule_prefix='r',
                             process_nonterminal=decode_nonterminals))

    print("Reduct RTG")
    for rule in rtgs[0].rules:
        print(rule.lhs, "->", rule.symbol, rule.rhs)

    derivation_manager.get_nonterminal_map().print_index()
    derivation_manager.convert_rtgs_to_hypergraphs(rtgs)
    derivation_manager.serialize(bytes('/tmp/reduct_manager.trace', encoding='utf8'))

    derivations = [LCFRSDerivationWrapper(der)
                   for der in derivation_manager.enumerate_derivations(0, grammar)]
    self.assertGreaterEqual(len(derivations), 1)
    if len(derivations) >= 1:
        print("Sentence", i)
        for der in derivations:
            print(der)
            self.assertTrue(der.check_integrity_recursive(der.root_id(), grammar.start()))

def test_stanford_unking_scheme(self):
    naming = 'child'

    def rec_part(tree):
        return left_branching_partitioning(len(tree.id_yield()))

    tree = self.tree
    tree.add_to_root("VP1")
    print(tree)

    terminal_labeling = StanfordUNKing([tree])

    grammar = fringe_extract_lcfrs(tree, rec_part(tree), naming=naming,
                                   isolate_pos=True, term_labeling=terminal_labeling)
    print(grammar)

    parser = LCFRS_parser(grammar)
    parser.set_input([token.form() for token in tree.token_yield()])
    parser.parse()
    self.assertTrue(parser.recognized())
    derivation = parser.best_derivation_tree()

    e = DCP_evaluator(derivation)
    dcp_term = e.getEvaluation()
    print(str(dcp_term[0]))

    t = ConstituentTree()
    dcp_to_hybridtree(
        t, dcp_term,
        [construct_constituent_token(token.form(), '--', True)
         for token in tree.token_yield()],
        ignore_punctuation=False,
        construct_token=construct_constituent_token)
    print(t)

    self.assertEqual(len(tree.token_yield()), len(t.token_yield()))
    for tok1, tok2 in zip(tree.token_yield(), t.token_yield()):
        self.assertEqual(tok1.form(), tok2.form())
        self.assertEqual(tok1.pos(), tok2.pos())

    rules = terminal_labeling.create_smoothed_rules()
    print(rules)

    new_rules = {}
    for rule in grammar.rules():
        if rule.rhs() == []:
            assert len(rule.dcp()) == 1
            dcp = rule.dcp()[0]
            assert len(dcp.rhs()) == 1
            term = dcp.rhs()[0]
            head = term.head()
            pos = head.pos()

            for tag, form in rules:
                if tag == pos:
                    lhs = LCFRS_lhs(rule.lhs().nont())
                    lhs.add_arg([form])
                    new_rules[lhs, dcp] = rules[tag, form]

    for lhs, dcp in new_rules:
        print(str(lhs), str(dcp), new_rules[(lhs, dcp)])

    tokens = [construct_constituent_token('hat', '--', True),
              construct_constituent_token('HAT', '--', True)]
    self.assertEqual(terminal_labeling.token_label(tokens[0]), 'hat')
    self.assertEqual(terminal_labeling.token_label(tokens[1]), '_UNK')
    terminal_labeling.test_mode = True
    self.assertEqual(terminal_labeling.token_label(tokens[0]), 'hat')
    self.assertEqual(terminal_labeling.token_label(tokens[1]), 'hat')

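# The assertions above pin down the observable behaviour of StanfordUNKing: forms
# seen in training keep their own label, while unseen forms collapse to '_UNK'
# (until test_mode relaxes the lookup). A self-contained sketch of that vocabulary
# thresholding idea -- a hypothetical class, not the StanfordUNKing implementation:
class _UnkLabelingSketch:
    def __init__(self, training_forms, unk='_UNK'):
        self.known = set(training_forms)  # vocabulary observed during training
        self.unk = unk

    def token_label(self, form):
        # Forms outside the training vocabulary map to the single UNK terminal.
        return form if form in self.known else self.unk
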
def test_single_root_induction(self):
    tree = hybrid_tree_1()
    # print tree.children("v")
    # print tree
    #
    # for id_set in ['v v1 v2 v21'.split(' '), 'v1 v2'.split(' '),
    #                'v v21'.split(' '), ['v'], ['v1'], ['v2'], ['v21']]:
    #     print id_set, 'top:', top(tree, id_set), 'bottom:', bottom(tree, id_set)
    #     print id_set, 'top_max:', max(tree, top(tree, id_set)), 'bottom_max:', max(tree, bottom(tree, id_set))
    #
    # print "some rule"
    # for mem, arg in [(-1, 0), (0, 0), (1, 0)]:
    #     print create_DCP_rule(mem, arg, top_max(tree, ['v', 'v1', 'v2', 'v21']),
    #                           bottom_max(tree, ['v', 'v1', 'v2', 'v21']),
    #                           [(top_max(tree, l), bottom_max(tree, l))
    #                            for l in [['v1', 'v2'], ['v', 'v21']]])
    #
    # print "some other rule"
    # for mem, arg in [(-1, 1), (1, 0)]:
    #     print create_DCP_rule(mem, arg, top_max(tree, ['v1', 'v2']),
    #                           bottom_max(tree, ['v1', 'v2']),
    #                           [(top_max(tree, l), bottom_max(tree, l))
    #                            for l in [['v1'], ['v2']]])
    #
    # print 'strict:', strict_labeling(tree, top_max(tree, ['v', 'v21']), bottom_max(tree, ['v', 'v21']))
    # print 'child:', child_labeling(tree, top_max(tree, ['v', 'v21']), bottom_max(tree, ['v', 'v21']))
    # print '---'
    # print 'strict:', strict_labeling(tree, top_max(tree, ['v1', 'v21']), bottom_max(tree, ['v1', 'v21']))
    # print 'child:', child_labeling(tree, top_max(tree, ['v1', 'v21']), bottom_max(tree, ['v1', 'v21']))
    # print '---'
    # print 'strict:', strict_labeling(tree, top_max(tree, ['v', 'v1', 'v21']), bottom_max(tree, ['v', 'v1', 'v21']))
    # print 'child:', child_labeling(tree, top_max(tree, ['v', 'v1', 'v21']), bottom_max(tree, ['v', 'v1', 'v21']))

    tree2 = hybrid_tree_2()
    # print tree2.children("v")
    # print tree2
    #
    # print 'siblings v211', tree2.siblings('v211')
    # print top(tree2, ['v', 'v1', 'v211'])
    # print top_max(tree2, ['v', 'v1', 'v211'])
    #
    # print '---'
    # print 'strict:', strict_labeling(tree2, top_max(tree2, ['v', 'v1', 'v211']), bottom_max(tree2, ['v', 'v11', 'v211']))
    # print 'child:', child_labeling(tree2, top_max(tree2, ['v', 'v1', 'v211']), bottom_max(tree2, ['v', 'v11', 'v211']))

    # rec_par = ('v v1 v2 v21'.split(' '),
    #            [('v1 v2'.split(' '), [(['v1'], []), (['v2'], [])]),
    #             ('v v21'.split(' '), [(['v'], []), (['v21'], [])])
    #            ])
    #
    # grammar = LCFRS(nonterminal_str(tree, top_max(tree, rec_par[0]), bottom_max(tree, rec_par[0]), 'strict'))
    #
    # add_rules_to_grammar_rec(tree, rec_par, grammar, 'child')
    #
    # grammar.make_proper()
    # print grammar

    print(tree.recursive_partitioning())

    terminal_labeling = the_terminal_labeling_factory().get_strategy('pos')
    (_, grammar) = induce_grammar(
        [tree, tree2],
        the_labeling_factory().create_simple_labeling_strategy('empty', 'pos'),
        # the_labeling_factory().create_simple_labeling_strategy('child', 'pos+deprel'),
        terminal_labeling.token_label,
        [direct_extraction],
        'START')
    print(max([grammar.fanout(nont) for nont in grammar.nonts()]))
    print(grammar)

    parser = LCFRS_parser(grammar, 'NP N V V'.split(' '))
    print(parser.best_derivation_tree())

    tokens = [construct_conll_token(form, pos)
              for form, pos in zip('Piet Marie helpen lezen'.split(' '),
                                   'NP N V V'.split(' '))]
    hybrid_tree = HybridTree()
    hybrid_tree = parser.dcp_hybrid_tree_best_derivation(
        hybrid_tree, tokens, True, construct_conll_token)
    print(list(map(str, hybrid_tree.full_token_yield())))
    print(hybrid_tree)

    string = "foo"
    dcp_string = DCP_string(string)
    dcp_string.set_edge_label("bar")
    print(dcp_string, dcp_string.edge_label())

    linearize(
        grammar,
        the_labeling_factory().create_simple_labeling_strategy('child', 'pos+deprel'),
        the_terminal_labeling_factory().get_strategy('pos'),
        sys.stdout)