def test_corpus_split_merge_training(self):
    """Smoke-test split/merge training on a grammar induced from TIGER.

    A grammar is induced from the training corpus (limit 100) using the
    "childtop"/"deprel" nonterminal labelling, the 'pos' terminal
    labelling and the ``cfg`` recursive partitioning.  The corpus is then
    parsed again, passed to ``split_merge_training`` and the rules of
    every returned grammar are printed to stderr.  The test makes no
    assertions; it succeeds when none of the calls raises.
    """
    corpus_path = 'res/dependency_conll/german/tiger/train/german_tiger_train.conll'
    max_trees = 100

    induction_trees = parse_conll_corpus(corpus_path, False, max_trees)
    nont_strategy = the_labeling_factory().create_simple_labeling_strategy(
        "childtop", "deprel")
    term_labelling = the_terminal_labeling_factory().get_strategy('pos')
    _, grammar_prim = induce_grammar(
        induction_trees, nont_strategy, term_labelling.token_label, [cfg],
        'START')

    # The corpus is parsed a second time for training (presumably because
    # the first tree sequence has been consumed by induction).
    training_trees = parse_conll_corpus(corpus_path, False, max_trees)

    print("call S/M Training", file=stderr)
    trained_grammars = split_merge_training(
        grammar_prim, term_labelling, training_trees, 4, 10,
        tie_breaking=True, init="equal", sigma=0.05, seed=50,
        merge_threshold=0.1)
    print("finished S/M Training", file=stderr)

    for trained in trained_grammars:
        for rule_no, rule in enumerate(trained.rules()):
            print(rule_no, rule, file=stderr)
        print(file=stderr)
def test_corpus_em_training(self):
    """Smoke-test EM training on a grammar induced from TIGER.

    Induces a grammar from the training corpus (limit 200), computes
    reducts for the same corpus with ``compute_reducts`` and runs
    ``PyEMTrainer.em_training`` for 20 epochs.  Progress messages go to
    stderr; nothing is asserted.
    """
    corpus_path = 'res/dependency_conll/german/tiger/train/german_tiger_train.conll'
    max_trees = 200

    induction_trees = parse_conll_corpus(corpus_path, False, max_trees)
    nont_strategy = the_labeling_factory().create_simple_labeling_strategy(
        "childtop", "deprel")
    term_labelling = the_terminal_labeling_factory().get_strategy('pos')
    _, grammar_prim = induce_grammar(
        induction_trees, nont_strategy, term_labelling.token_label, [cfg],
        'START')

    # Second pass over the corpus: these trees feed the reduct computation.
    reduct_trees = parse_conll_corpus(corpus_path, False, max_trees)

    print("compute reducts", file=stderr)
    trace = compute_reducts(grammar_prim, reduct_trees, term_labelling)

    print("call em Training", file=stderr)
    trainer = PyEMTrainer(trace)
    trainer.em_training(grammar_prim, 20, tie_breaking=True, init="equal",
                        sigma=0.05, seed=50)
    print("finished em Training", file=stderr)
def test_minimum_risk_parsing(self):
    """Compare the minimum-risk tree with the first k-best tree per sentence.

    A grammar is induced from the training corpus (limit 20).  Sentences
    of the same corpus (limit 10) are parsed with ``GFParser_k_best``
    (k=50).  The test asserts that every sentence is recognized and that
    no derivation occurs twice in the k-best list.  Each derivation is
    turned into a hybrid tree; whenever ``compute_minimum_risk_tree``
    returns a tree that differs from the first hybrid tree, both are
    printed.
    """
    corpus_path = 'res/dependency_conll/german/tiger/train/german_tiger_train.conll'
    max_train, max_test = 20, 10
    parser_type = GFParser_k_best

    induction_trees = parse_conll_corpus(corpus_path, False, max_train)
    nont_strategy = the_labeling_factory().create_simple_labeling_strategy(
        "childtop", "deprel")
    term_labelling = the_terminal_labeling_factory().get_strategy('pos')
    _, grammar_prim = induce_grammar(
        induction_trees, nont_strategy, term_labelling.token_label, [cfg],
        'START')
    parser_type.preprocess_grammar(grammar_prim)

    to_parser_input = term_labelling.prepare_parser_input

    for sent_no, tree in enumerate(
            parse_conll_corpus(corpus_path, False, max_test)):
        print("Parsing sentence ", sent_no, file=stderr)

        parser = parser_type(grammar_prim,
                             to_parser_input(tree.token_yield()), k=50)
        self.assertTrue(parser.recognized())

        k_best = list(parser.k_best_derivation_trees())
        print("# derivations: ", len(k_best), file=stderr)

        hybrid_trees, weights, seen = [], [], []
        for weight, derivation in k_best:
            # every derivation may occur only once in the k-best list
            self.assertTrue(derivation not in seen)
            seen.append(derivation)

            evaluation = DCP_evaluator(derivation).getEvaluation()
            hybrid_tree = HybridTree()
            tokens = copy.deepcopy(tree.full_token_yield())
            dcp_to_hybridtree(hybrid_tree, evaluation, tokens, False,
                              construct_conll_token)
            hybrid_trees.append(hybrid_tree)
            weights.append(weight)

        min_risk_tree = compute_minimum_risk_tree(hybrid_trees, weights)
        first_tree = hybrid_trees[0]
        if not min_risk_tree.__eq__(first_tree):
            print(first_tree)
            print(min_risk_tree)
def dbtest():
    """Exercise the experiment database helpers end to end.

    Opens the database ``dbfile``, creates the experiment, tree and result
    tables, stores the trees of ``test_file`` as gold trees and the trees
    of ``test_file_modified`` as result trees, and prints the contents of
    the tables after each step.

    The elapsed time stored with each result tree is measured with
    ``time.perf_counter()``; the previously used ``time.clock()`` no
    longer exists since Python 3.8.  The connection is closed even when
    one of the steps raises.
    """
    connection = openDatabase(dbfile)
    try:
        connection.text_factory = str
        create_experiment_table(connection)

        training_corpus = test_file
        test_corpus = test_file

        experiment = add_experiment(connection, 'term_pos', 'child_pos',
                                    'direct_extraction', False,
                                    training_corpus, test_corpus, time.time(),
                                    None)

        c = connection.cursor()
        for row in c.execute('SELECT * FROM experiments'):
            print(row)

        create_tree_table(connection)
        create_tree_node_table(connection)
        for tree in conll_parse.parse_conll_corpus(test_corpus, False):
            add_tree(connection, tree, test_corpus)

        for row in c.execute('SELECT * FROM trees'):
            print(row)
        for row in c.execute('SELECT * FROM tree_nodes'):
            print(row)
        print()

        create_result_tree_table(connection)
        create_result_tree_node_table(connection)

        # The stored duration covers reading one tree from the corpus, i.e.
        # the time between two consecutive iterations.
        time_stamp = time.perf_counter()
        for tree in conll_parse.parse_conll_corpus(test_file_modified, False):
            add_result_tree(connection, tree, test_corpus, experiment, 1,
                            0.142, time.perf_counter() - time_stamp, "parse")
            time_stamp = time.perf_counter()

        for row in c.execute('SELECT * FROM result_tree_nodes'):
            print(row, type(row[0]).__name__)
        print()

        print(experiment, type(experiment).__name__)
        for row in c.execute(
                '''SELECT * FROM result_trees INNER JOIN result_tree_nodes ON result_trees.rt_id = result_tree_nodes.rt_id WHERE exp_id = ?''',
                (experiment, )):
            print(row)
    finally:
        connection.close()
def test_wsj(self):
    """Check that 1346 trees are read from the WSJ section 24 corpus.

    The corpus is parsed with a limit of 5000 and passed through
    ``disconnect_punctuation`` before the trees are counted.
    """
    trees = disconnect_punctuation(
        parse_conll_corpus("res/wsj_dependency/24.conll", False, 5000))
    tree_count = sum(1 for _ in trees)
    self.assertEqual(1346, tree_count)
def test_tiger(self):
    """Check that 357 trees are read from the TIGER test corpus.

    The corpus is parsed with a limit of 5000 and passed through
    ``disconnect_punctuation`` before the trees are counted.
    """
    trees = disconnect_punctuation(
        parse_conll_corpus(
            "res/dependency_conll/german/tiger/test/german_tiger_test.conll",
            False, 5000))
    tree_count = sum(1 for _ in trees)
    self.assertEqual(357, tree_count)
def get_trees(self):
    """Yield the trees of this corpus, caching them after the first full pass.

    On the first call the corpus at ``self._path`` is parsed with
    ``parse_conll_corpus`` (``limit=self._end``, ``start=self._start``),
    filtered through ``length_limit`` with ``self._max_length`` and the
    trees are yielded as they are read.  Once that pass has run to
    completion the trees are stored in ``self._trees`` and later calls
    replay the stored list.

    The cache is only published after the whole corpus has been read.
    Assigning it up front (as was done before) meant that a consumer which
    stopped iterating early left a truncated list behind, and every later
    call silently yielded only that prefix.
    """
    if self._trees is not None:
        for tree in self._trees:
            yield tree
        return

    loaded = []
    for tree in length_limit(
            parse_conll_corpus(self._path, False, limit=self._end,
                               start=self._start), self._max_length):
        loaded.append(tree)
        yield tree
    # Reached only when the generator was exhausted, so the cache is complete.
    self._trees = loaded
def obtain_derivations(grammar, term_labelling):
    """Yield k-best derivations for the sentences of the test corpus.

    A single ``GFParser_k_best`` (k=50) is built for ``grammar`` and
    reused for every tree read from ``test`` (limited by ``limit_test``;
    both names are defined outside this function).  For each sentence the
    k-best list is collected, the parser is cleared, and the element at
    index 1 of every k-best entry is yielded.

    :param grammar: grammar handed to ``GFParser_k_best``
    :param term_labelling: object whose ``prepare_parser_input`` converts a
        token yield into parser input
    """
    # build parser
    to_parser_input = term_labelling.prepare_parser_input
    k_best_parser = GFParser_k_best(grammar, k=50)

    # parse sentences
    for sent_no, tree in enumerate(parse_conll_corpus(test, False, limit_test)):
        print("Parsing sentence ", sent_no, file=stderr)

        k_best_parser.set_input(to_parser_input(tree.token_yield()))
        k_best_parser.parse()
        k_best = list(k_best_parser.k_best_derivation_trees())
        print("# derivations: ", len(k_best), file=stderr)
        # the list is materialised above, so the parser can be reset
        # before the entries are handed out
        k_best_parser.clear()

        for entry in k_best:
            yield entry[1]
def main(limit=100000, ignore_punctuation=False):
    """Run the cascaded-grammar parsing experiment and score it with eval.pl.

    Three grammars are induced from ``TRAIN`` using ``PRIMARY_LABELLING``,
    ``SECONDARY_LABELLING`` and ``TERNARY_LABELLING``.  Each sentence of
    ``TEST`` (at most 10000) is parsed with the primary grammar; if the
    parser does not recognize it, the secondary and then the ternary
    grammar are tried.  The resulting tree, or a left-branching fallback
    tree when no hybrid tree is obtained, is written to ``RESULT`` in
    CoNLL format.  Finally ``util/eval.pl`` is run twice, without and
    with the ``-p`` flag.

    Parse time is measured with ``time.perf_counter()``; the formerly
    used ``time.clock()`` was removed in Python 3.8.

    :param limit: limit passed to ``parse_conll_corpus`` for the training
        corpus
    :type limit: int
    :param ignore_punctuation: if true, ``disconnect_punctuation`` is
        applied to all trees and the flag is forwarded to the parser
    :type ignore_punctuation: bool
    """
    if PARSER_TYPE.__name__ != 'GFParser':
        print('GFParser not found, using', PARSER_TYPE.__name__, 'instead!')
        print('Please install grammatical framework to reproduce experiments.')

    test_limit = 10000

    def read_corpus(path, corpus_limit):
        # Read a corpus, optionally detaching punctuation from the trees.
        corpus_trees = parse_conll_corpus(path, False, corpus_limit)
        if ignore_punctuation:
            corpus_trees = disconnect_punctuation(corpus_trees)
        return corpus_trees

    def induce(labelling):
        # Induce and preprocess one grammar; the corpus is re-read per grammar.
        _, grammar = d_i.induce_grammar(read_corpus(TRAIN, limit), labelling,
                                        TERMINAL_LABELLING.token_label,
                                        RECURSIVE_PARTITIONING, START)
        PARSER_TYPE.preprocess_grammar(grammar)
        return grammar

    grammar_prim = induce(PRIMARY_LABELLING)
    grammar_second = induce(SECONDARY_LABELLING)
    grammar_tern = induce(TERNARY_LABELLING)

    trees = read_corpus(TEST, test_limit)

    total_time = 0.0
    failures = 0
    with open(RESULT, 'w') as result_file:
        for tree in trees:
            time_stamp = time.perf_counter()
            the_parser = PARSER_TYPE(grammar_prim,
                                     TREE_YIELD(tree.token_yield()))
            # fall back to the coarser grammars only while parsing fails
            for fallback_grammar in (grammar_second, grammar_tern):
                if the_parser.recognized():
                    break
                the_parser = PARSER_TYPE(fallback_grammar,
                                         TREE_YIELD(tree.token_yield()))
            total_time += time.perf_counter() - time_stamp

            # work on a copy so that the gold tokens keep their edge labels
            cleaned_tokens = copy.deepcopy(tree.full_token_yield())
            for token in cleaned_tokens:
                token.set_edge_label('_')

            h_tree = the_parser.dcp_hybrid_tree_best_derivation(
                HybridTree(tree.sent_label()), cleaned_tokens,
                ignore_punctuation, construct_conll_token)

            if h_tree:
                result_file.write(tree_to_conll_str(h_tree))
            else:
                failures += 1
                forms = [token.form() for token in tree.full_token_yield()]
                poss = [token.pos() for token in tree.full_token_yield()]
                result_file.write(
                    tree_to_conll_str(fall_back_left_branching(forms, poss)))
            result_file.write('\n\n')

    print("parse failures", failures)
    print("parse time", total_time)

    print("eval.pl", "no punctuation")
    p = subprocess.Popen(
        ["perl", "util/eval.pl", "-g", TEST, "-s", RESULT, "-q"])
    p.communicate()

    print("eval.pl", "punctuation")
    p = subprocess.Popen(
        ["perl", "util/eval.pl", "-g", TEST, "-s", RESULT, "-q", "-p"])
    p.communicate()
def main():
    """Induce a grammar, run split/merge cycles on it and optionally parse.

    The function reads several names that are defined outside of it:
    ``train``, ``limit_train``, ``test``, ``limit_test``, ``max_cycles``
    and ``parsing``.

    Steps, in order:

    1. induce a grammar from the training corpus,
    2. obtain derivations via ``obtain_derivations`` and load them into a
       ``PyDerivationManager``, which is serialized to
       ``/tmp/derivations.txt``,
    3. build a split/merge trainer (20 EM epochs, percent merger 60.0),
    4. run ``max_cycles + 1`` split/merge cycles; in iteration ``i`` the
       grammar is built from ``latentAnnotation[i]``, i.e. the annotation
       that was the input of the cycle just performed,
    5. if ``parsing`` is set, parse the test corpus with that grammar and
       print the hybrid tree of every recognized sentence.
    """
    # induce grammar from a corpus
    trees = parse_conll_corpus(train, False, limit_train)
    nonterminal_labelling = the_labeling_factory(
    ).create_simple_labeling_strategy("childtop", "deprel")
    term_labelling = the_terminal_labeling_factory().get_strategy('pos')
    start = 'START'
    recursive_partitioning = [cfg]
    _, grammar = induce_grammar(trees, nonterminal_labelling,
                                term_labelling.token_label,
                                recursive_partitioning, start)

    # compute some derivations
    derivations = obtain_derivations(grammar, term_labelling)

    # create derivation manager and add derivations
    manager = PyDerivationManager(grammar)
    manager.convert_derivations_to_hypergraphs(derivations)
    manager.serialize(b"/tmp/derivations.txt")

    # build and configure split/merge trainer and supplementary objects

    # For every rule, collect the indices of its left-hand side and
    # right-hand side nonterminals in the manager's nonterminal map.
    # NOTE(review): rule_to_nonterminals is not read again in this function;
    # it is unclear whether the object_index() calls are needed for their
    # side effects on the nonterminal map - confirm before removing.
    rule_to_nonterminals = []
    for i in range(0, len(grammar.rule_index())):
        rule = grammar.rule_index(i)
        nonts = [
            manager.get_nonterminal_map().object_index(rule.lhs().nont())
        ] + [
            manager.get_nonterminal_map().object_index(nont)
            for nont in rule.rhs()
        ]
        rule_to_nonterminals.append(nonts)

    grammarInfo = PyGrammarInfo(grammar, manager.get_nonterminal_map())
    storageManager = PyStorageManager()

    builder = PySplitMergeTrainerBuilder(manager, grammarInfo)
    builder.set_em_epochs(20)
    builder.set_percent_merger(60.0)

    splitMergeTrainer = builder.build()

    # The list starts with the initial annotation; every cycle appends one.
    latentAnnotation = [
        build_PyLatentAnnotation_initial(grammar, grammarInfo, storageManager)
    ]

    for i in range(max_cycles + 1):
        latentAnnotation.append(
            splitMergeTrainer.split_merge_cycle(latentAnnotation[-1]))
        # pickle.dump(map(lambda la: la.serialize(), latentAnnotation), open(sm_info_path, 'wb'))
        # latentAnnotation[i] is the annotation *before* the cycle that was
        # just appended (index i + 1 holds its result).
        smGrammar = build_sm_grammar(latentAnnotation[i],
                                     grammar,
                                     grammarInfo,
                                     rule_pruning=0.0001,
                                     rule_smoothing=0.01)
        print("Cycle: ", i, "Rules: ", len(smGrammar.rules()))

        if parsing:
            parser = GFParser(smGrammar)

            trees = parse_conll_corpus(test, False, limit_test)
            for tree in trees:
                parser.set_input(
                    term_labelling.prepare_parser_input(tree.token_yield()))
                parser.parse()
                # sentences that are not recognized are skipped silently
                if parser.recognized():
                    print(
                        derivation_to_hybrid_tree(
                            parser.best_derivation_tree(),
                            [token.pos() for token in tree.token_yield()],
                            [token.form() for token in tree.token_yield()],
                            construct_constituent_token))
def main(limit=300,
         ignore_punctuation=False,
         baseline_path=baseline_path,
         recompileGrammar=True,
         retrain=True,
         parsing=True,
         seed=1337):
    """Run the baseline / EM / split-merge training and parsing pipeline.

    Stages:

    1. Induce the baseline grammar from ``train`` (sentences limited to
       length 20) or load it from ``baseline_path``.
    2. Compute the reducts (trace) for the grammar or load them from
       ``reduct_path``.
    3. EM-train a second copy of the baseline grammar or load the trained
       grammar from the path given by ``em_trained_path(...)``.
    4. Run split/merge cycles, storing the serialized latent annotations
       in ``sm_info_path`` after every cycle.

    After stages 1, 3 and each newly trained cycle ``do_parsing`` is
    called when ``parsing`` is true.

    Compared to the earlier version, all pickle files are opened in binary
    mode and closed deterministically, and the latent annotations are kept
    in (and dumped as) real lists: under Python 3 a ``map`` object supports
    neither ``len()``, indexing or ``append`` nor pickling.

    NOTE: ``pickle.load`` executes arbitrary code from the file; the paths
    used here are expected to be caches written by this function itself.

    :param limit: limit for reading the training corpus
    :type limit: int
    :param ignore_punctuation: forwarded to ``do_parsing``
    :type ignore_punctuation: bool
    :param baseline_path: pickle file of the baseline grammar
    :type baseline_path: str
    :param recompileGrammar: redo grammar induction and reduct computation
        even if cached files exist
    :type recompileGrammar: bool
    :param retrain: redo EM and split/merge training even if cached
        results exist
    :type retrain: bool
    :param parsing: run ``do_parsing`` after the training stages
    :type parsing: bool
    :param seed: seed for EM training and for split randomization
    :type seed: int
    """
    def load_pickle(path):
        # pickles are binary data; text mode fails under Python 3
        with open(path, 'rb') as handle:
            return pickle.load(handle)

    def dump_pickle(obj, path):
        with open(path, 'wb') as handle:
            pickle.dump(obj, handle)

    def training_trees():
        return length_limit(parse_conll_corpus(train, False, limit),
                            max_length)

    max_length = 20
    trees = training_trees()

    if recompileGrammar or not os.path.isfile(baseline_path):
        (n_trees, baseline_grammar) = d_i.induce_grammar(
            trees, empty_labelling, term_labelling.token_label,
            recursive_partitioning, start)
        dump_pickle(baseline_grammar, baseline_path)
    else:
        baseline_grammar = load_pickle(baseline_path)

    test_limit = 10000
    print("Rules: ", len(baseline_grammar.rules()))

    if parsing:
        do_parsing(baseline_grammar, test_limit, ignore_punctuation,
                   recompileGrammar, [dir, "baseline_gf_grammar"])

    # A second, independent copy of the baseline grammar: EM training
    # modifies this object while baseline_grammar stays as induced.
    em_trained = load_pickle(baseline_path)

    if recompileGrammar or not os.path.isfile(reduct_path):
        trace = compute_reducts(em_trained, training_trees(), term_labelling)
        trace.serialize(reduct_path)
    else:
        print("loading trace")
        trace = PySDCPTraceManager(em_trained, term_labelling)
        trace.load_traces_from_file(reduct_path)

    discr = False
    if discr:
        if recompileGrammar or not os.path.isfile(reduct_path_discr):
            trace_discr = compute_LCFRS_reducts(
                em_trained,
                training_trees(),
                term_labelling,
                nonterminal_map=trace.get_nonterminal_map())
            trace_discr.serialize(reduct_path_discr)
        else:
            print("loading trace discriminative")
            trace_discr = PyLCFRSTraceManager(em_trained,
                                              trace.get_nonterminal_map())
            trace_discr.load_traces_from_file(reduct_path_discr)

    n_epochs = 20
    init = "rfe"
    tie_breaking = True
    em_trained_path_ = em_trained_path(n_epochs, init, tie_breaking)

    if recompileGrammar or retrain or not os.path.isfile(em_trained_path_):
        emTrainer = PyEMTrainer(trace)
        emTrainer.em_training(em_trained,
                              n_epochs=n_epochs,
                              init=init,
                              tie_breaking=tie_breaking,
                              seed=seed)
        dump_pickle(em_trained, em_trained_path_)
    else:
        em_trained = load_pickle(em_trained_path_)

    if parsing:
        do_parsing(em_trained, test_limit, ignore_punctuation,
                   recompileGrammar or retrain,
                   [dir, "em_trained_gf_grammar"])

    grammarInfo = PyGrammarInfo(baseline_grammar, trace.get_nonterminal_map())
    storageManager = PyStorageManager()

    builder = PySplitMergeTrainerBuilder(trace, grammarInfo)
    builder.set_em_epochs(n_epochs)
    builder.set_split_randomization(1.0, seed + 1)
    if discr:
        builder.set_discriminative_expector(trace_discr,
                                            maxScale=10,
                                            threads=1)
    else:
        builder.set_simple_expector(threads=1)
    splitMergeTrainer = builder.set_percent_merger(65.0).build()

    if (not recompileGrammar) and (
            not retrain) and os.path.isfile(sm_info_path):
        print("Loading splits and weights of LA rules")
        # must be a list: it is measured, indexed and extended below
        latentAnnotation = [
            build_PyLatentAnnotation(t[0], t[1], t[2], grammarInfo,
                                     storageManager)
            for t in load_pickle(sm_info_path)
        ]
    else:
        latentAnnotation = [
            build_PyLatentAnnotation_initial(em_trained, grammarInfo,
                                             storageManager)
        ]

    max_cycles = 4
    reparse = False
    # parsing = False
    for i in range(max_cycles + 1):
        if i < len(latentAnnotation):
            # annotation i already exists (initial or loaded from disk)
            if reparse:
                smGrammar = latentAnnotation[i].build_sm_grammar(
                    baseline_grammar,
                    grammarInfo,
                    rule_pruning=0.0001,
                    rule_smoothing=0.01)
                print("Cycle: ", i, "Rules: ", len(smGrammar.rules()))
                do_parsing(smGrammar, test_limit, ignore_punctuation,
                           recompileGrammar or retrain,
                           [dir, "sm_cycles" + str(i) + "_gf_grammar"])
        else:
            # setting the seed to achieve reproducibility in case of continued training
            splitMergeTrainer.reset_random_seed(seed + i + 1)
            latentAnnotation.append(
                splitMergeTrainer.split_merge_cycle(latentAnnotation[-1]))
            dump_pickle([la.serialize() for la in latentAnnotation],
                        sm_info_path)
            smGrammar = latentAnnotation[i].build_sm_grammar(
                baseline_grammar,
                grammarInfo,
                rule_pruning=0.0001,
                rule_smoothing=0.1)
            print("Cycle: ", i, "Rules: ", len(smGrammar.rules()))
            if parsing:
                do_parsing(smGrammar, test_limit, ignore_punctuation,
                           recompileGrammar or retrain,
                           [dir, "sm_cycles" + str(i) + "_gf_grammar"])
def test_k_best_parsing(self):
    """Check k-best parsing and show which derivations yield equal trees.

    A grammar is induced from the training corpus (limit 20) and
    sentences of the same corpus (limit 10) are parsed with
    ``GFParser_k_best`` (k=50).  The test asserts that every sentence is
    recognized and that the k-best list contains no derivation twice.
    For each sentence a matrix is printed to stderr: row and column
    correspond to derivations, an ``x`` marks pairs whose hybrid trees
    compare equal, and every row ends with the weight of its derivation.
    """
    corpus_path = 'res/dependency_conll/german/tiger/train/german_tiger_train.conll'
    max_train, max_test = 20, 10
    parser_type = GFParser_k_best

    induction_trees = parse_conll_corpus(corpus_path, False, max_train)
    nont_strategy = the_labeling_factory().create_simple_labeling_strategy(
        "childtop", "deprel")
    term_labelling = the_terminal_labeling_factory().get_strategy('pos')
    _, grammar_prim = induce_grammar(
        induction_trees, nont_strategy, term_labelling.token_label, [cfg],
        'START')
    parser_type.preprocess_grammar(grammar_prim)

    to_parser_input = term_labelling.prepare_parser_input

    for sent_no, tree in enumerate(
            parse_conll_corpus(corpus_path, False, max_test)):
        print("Parsing sentence ", sent_no, file=stderr)

        parser = parser_type(grammar_prim,
                             to_parser_input(tree.token_yield()), k=50)
        self.assertTrue(parser.recognized())

        k_best = list(parser.k_best_derivation_trees())
        print("# derivations: ", len(k_best), file=stderr)

        hybrid_trees, weights, seen = [], [], []
        for weight, derivation in k_best:
            self.assertTrue(derivation not in seen)
            seen.append(derivation)

            # TODO this should hold, but it looks like a GF bug!
            # (weights are expected to be non-decreasing along the list)

            evaluation = DCP_evaluator(derivation).getEvaluation()
            hybrid_tree = HybridTree()
            tokens = copy.deepcopy(tree.full_token_yield())
            dcp_to_hybridtree(hybrid_tree, evaluation, tokens, False,
                              construct_conll_token)
            hybrid_trees.append(hybrid_tree)
            weights.append(weight)

        # print a matrix indicating which derivations result
        # in the same hybrid tree
        for row_tree, row_weight in zip(hybrid_trees, weights):
            for column_tree in hybrid_trees:
                mark = "x" if row_tree == column_tree else ""
                print(mark, end=' ', file=stderr)
            print(row_weight, file=stderr)
        print(file=stderr)
def test_best_trees(self):
    """Compare the Viterbi tree with the best tree of the k-best list.

    A grammar is induced from the training corpus (limit 5000) with the
    "child"/"pos+deprel" labelling.  Sentences of the same corpus
    (limit 100) are parsed with ``GFParser_k_best`` (k=200) and every
    sentence must be recognized.  For each sentence the rank of the gold
    tree within ``parser.best_trees(...)`` is reported on stderr (if it
    occurs), and a message is printed when the Viterbi tree and the
    top-ranked tree differ both as trees and in weight.
    """
    corpus_path = 'res/dependency_conll/german/tiger/train/german_tiger_train.conll'
    max_train, max_test = 5000, 100
    parser_type = GFParser_k_best

    induction_trees = parse_conll_corpus(corpus_path, False, max_train)
    nont_strategy = the_labeling_factory().create_simple_labeling_strategy(
        "child", "pos+deprel")
    term_labelling = the_terminal_labeling_factory().get_strategy('pos')
    _, grammar_prim = induce_grammar(
        induction_trees, nont_strategy, term_labelling.token_label, [cfg],
        'START')
    parser_type.preprocess_grammar(grammar_prim)

    to_parser_input = term_labelling.prepare_parser_input

    for sent_no, tree in enumerate(
            parse_conll_corpus(corpus_path, False, max_test)):
        print("Parsing sentence ", sent_no, file=stderr)

        parser = parser_type(grammar_prim,
                             to_parser_input(tree.token_yield()), k=200)
        self.assertTrue(parser.recognized())

        viterbi_weight = parser.viterbi_weight()
        viterbi_deriv = parser.viterbi_derivation()

        def der_to_tree(der):
            # Evaluate a derivation into a hybrid tree over a fresh copy
            # of the tokens of the sentence currently being processed.
            return dcp_to_hybridtree(
                HybridTree(),
                DCP_evaluator(der).getEvaluation(),
                copy.deepcopy(tree.full_token_yield()), False,
                construct_conll_token)

        viterbi_tree = der_to_tree(viterbi_deriv)

        ranked = parser.best_trees(der_to_tree)
        best_tree, best_weight, best_witnesses = ranked[0]

        for rank, (candidate, _, _) in enumerate(ranked, start=1):
            if candidate.__eq__(tree):
                print("Gold tree is ", rank, " in best tree list",
                      file=stderr)
                break

        trees_agree = viterbi_tree.__eq__(best_tree)
        if not trees_agree and viterbi_weight != best_weight:
            print("viterbi and k-best tree differ", file=stderr)
            print("viterbi: ", viterbi_weight, file=stderr)
            print("k-best: ", best_weight, best_witnesses, file=stderr)
            # verbose dump of the involved trees, disabled by default
            if False:
                print(viterbi_tree, file=stderr)
                print(tree_to_conll_str(viterbi_tree), file=stderr)
                print(best_tree, file=stderr)
                print(tree_to_conll_str(best_tree), file=stderr)
                print("gold tree", file=stderr)
                print(tree, file=stderr)
                print(tree_to_conll_str(tree), file=stderr)
def induce_grammar_from_file(path,
                             connection,
                             nont_labelling,
                             term_labelling,
                             recursive_partitioning,
                             limit=sys.maxsize,
                             quiet=False,
                             start='START',
                             ignore_punctuation=True):
    """
    :param path: path to dependency corpus in CoNLL format
    :type path: str
    :param connection: database connection
    :type connection: Connection
    :param nont_labelling: nonterminal labeling strategy
    :type nont_labelling: AbstractLabeling
    :param term_labelling: terminal labeling strategy; its ``token_label``
                           is passed to the grammar induction
    :param recursive_partitioning: list of recursive partitioning
                                   strategies (their ``__name__`` is
                                   recorded with the experiment)
    :param limit: use only the first _limit_ trees for grammar induction
    :type limit: int
    :param quiet: suppress status output
    :type quiet: bool
    :param start: set start nonterminal for grammar
    :type start: str
    :param ignore_punctuation: if true, ``disconnect_punctuation`` is
                               applied to the trees before induction
    :type ignore_punctuation: bool
    :return: the induced grammar and the experiment identifier returned by
             ``experiment_database.add_experiment``
    :rtype: LCFRS, int

    Extract an LCFRS/sDCP-Hybrid Grammar from a dependency corpus in CoNLL
    format.

    The experiment and the grammar are registered in the database and the
    grammar is additionally linearized to
    ``.tmp/grammar-<experiment>.gra``.  Induction time is measured with
    ``time.perf_counter()`` (``time.clock()`` was removed in Python 3.8)
    and the grammar file is closed as soon as it has been written.
    """
    partitioning_names = ','.join(
        [rec_par.__name__ for rec_par in recursive_partitioning])

    experiment = experiment_database.add_experiment(
        connection, str(term_labelling), str(nont_labelling),
        partitioning_names, ignore_punctuation, path, '', time.time(), None)

    if not quiet:
        print('Inducing grammar')
        print('file: ' + path)
        print('Nonterminal labelling strategy: ', nont_labelling.__str__())
        print('Terminal labelling strategy:    ', str(term_labelling))
        print('Recursive partitioning strategy:', partitioning_names)
        print('limit:                          ', str(limit))
        print('Ignoring punctuation            ', ignore_punctuation)

    start_clock = time.perf_counter()

    trees = parse_conll_corpus(path, False, limit)
    trees = add_trees_to_db(path, connection, trees)
    if ignore_punctuation:
        trees = disconnect_punctuation(trees)
    (n_trees, grammar) = d_i.induce_grammar(trees, nont_labelling,
                                            term_labelling.token_label,
                                            recursive_partitioning, start)

    end_clock = time.perf_counter()

    if not quiet:
        print('Number of trees:                ', str(n_trees))
        print('Number of nonterminals:         ', len(grammar.nonts()))
        print('Number of rules:                ', len(grammar.rules()))
        print('Total size:                     ', grammar.size())
        print('Fanout:                         ',
              max(map(grammar.fanout, grammar.nonts())))
        print('Induction time:                 ', end_clock - start_clock,
              'seconds')

    print(experiment)
    experiment_database.add_grammar(connection, grammar, experiment)

    # "with" guarantees the linearized grammar is flushed and closed
    with open('.tmp/grammar-' + str(experiment) + '.gra',
              'w') as grammar_output:
        linearize(grammar, nont_labelling, term_labelling, grammar_output)

    assert grammar.ordered()

    return grammar, experiment
def parse_sentences_from_file(grammar,
                              parser_type,
                              experiment,
                              connection,
                              path,
                              tree_yield,
                              max_length=sys.maxsize,
                              limit=sys.maxsize,
                              quiet=False,
                              ignore_punctuation=True,
                              root_default_deprel=None,
                              disconnected_default_deprel=None):
    """
    :rtype: None
    :type grammar: LCFRS
    :param parser_type: parser class; it is used to preprocess the grammar
                        and is instantiated once per sentence
    :param experiment: experiment identifier under which results are stored
    :param connection: database connection
    :param path: file path for test corpus (dependency grammar in CoNLL format)
    :type path: str
    :param tree_yield: parse on words or POS or ..
    :type tree_yield: GeneralHybridTree -> list[str]
    :param max_length: don't parse sentences with yield > max_length
    :type max_length: int
    :param limit: only parse the limit first sentences of the corpus
    :type limit: int
    :param quiet: suppress status information
    :type quiet: bool
    :param ignore_punctuation: exclude punctuation from parsing
    :type ignore_punctuation: bool
    :param root_default_deprel: forwarded to
                                ``experiment_database.add_result_tree``
    :param disconnected_default_deprel: forwarded to
                                ``experiment_database.add_result_tree``

    Parse sentences from corpus and compare derived dependency structure
    with gold standard information.

    Every parsed tree (or a "no_parse" entry) is stored in the database.
    Times are measured with ``time.perf_counter()``; ``time.clock()`` was
    removed in Python 3.8.  The per-sentence time covers the construction
    of the parser object only.
    """
    if not quiet:
        print("Building lookahead tables for grammar")
    parser_type.preprocess_grammar(grammar)

    experiment_database.set_experiment_test_corpus(connection, experiment,
                                                   path)

    if not quiet:
        if max_length != sys.maxsize:
            s = ', ignoring sentences with length > ' + str(max_length)
        else:
            s = ''
        print('Start parsing sentences' + s)

    trees = parse_conll_corpus(path, False, limit)
    trees = add_trees_to_db(path, connection, trees)
    if ignore_punctuation:
        trees = disconnect_punctuation(trees)

    (UAS, LAS, UEM, LEM) = (0, 0, 0, 0)
    parse = 0
    no_parse = 0
    n_gaps_gold = 0
    n_gaps_test = 0
    skipped = 0

    start_at = time.perf_counter()
    for tree in trees:
        if len(tree.id_yield()) > max_length:
            skipped += 1
            continue

        time_stamp = time.perf_counter()
        parser = parser_type(grammar, tree_yield(tree.token_yield()))
        time_stamp = time.perf_counter() - time_stamp

        # work on a copy so that the gold tokens keep their edge labels
        cleaned_tokens = copy.deepcopy(tree.full_token_yield())
        for token in cleaned_tokens:
            token.set_edge_label('_')

        h_tree = parser.dcp_hybrid_tree_best_derivation(
            HybridTree(tree.sent_label()), cleaned_tokens,
            ignore_punctuation, construct_conll_token)

        if h_tree:
            experiment_database.add_result_tree(connection, h_tree, path,
                                                experiment, 1, parser.best(),
                                                time_stamp, 'parse',
                                                root_default_deprel,
                                                disconnected_default_deprel)
            n_gaps_gold += tree.n_gaps()
            n_gaps_test += h_tree.n_gaps()
            parse += 1
            (dUAS, dLAS, dUEM, dLEM) = score_cmp_dep_trees(tree, h_tree)
            UAS += dUAS
            LAS += dLAS
            UEM += dUEM
            LEM += dLEM
        else:
            experiment_database.no_parse_result(connection,
                                                tree.sent_label(), path,
                                                experiment, time_stamp,
                                                "no_parse")
            no_parse += 1

    end_at = time.perf_counter()
    total = parse + no_parse

    if not quiet:
        print('Parsed ' + str(parse) + ' out of ' + str(total) +
              ' (skipped ' + str(skipped) + ')')
        print('fail: ', no_parse)
        # averages are only defined when at least one sentence was parsed
        if parse > 0:
            print('UAS: ', UAS / parse)
            print('LAS: ', LAS / parse)
            print('UEM: ', UEM / parse)
            print('LEM: ', LEM / parse)
            print('n gaps (gold): ', n_gaps_gold * 1.0 / parse)
            print('n gaps (test): ', n_gaps_test * 1.0 / parse)
        print('parse time: ', end_at - start_at, 's')
        print()
def do_parsing(grammar_prim,
               limit,
               ignore_punctuation,
               recompile=True,
               preprocess_path=None):
    """Parse the test corpus with one grammar, write results and evaluate.

    A single parser of the externally defined ``parser_type`` is created
    for ``grammar_prim`` and reused for every sentence of ``test``.  The
    best tree per sentence (or a left-branching fallback tree if none is
    obtained) is written to ``result`` in CoNLL format, after which
    ``../util/eval.pl`` is run without and with the ``-p`` flag.

    Parse time is measured with ``time.perf_counter()``; ``time.clock()``
    was removed in Python 3.8.

    :param grammar_prim: grammar to parse with
    :param limit: limit for reading the test corpus; sentences whose
        ``id_yield`` is longer than this value are skipped as well
    :type limit: int
    :param ignore_punctuation: if true, ``disconnect_punctuation`` is
        applied to the trees and the flag is forwarded to the parser
    :type ignore_punctuation: bool
    :param recompile: if true, an existing preprocessed grammar is not
        loaded
    :type recompile: bool
    :param preprocess_path: passed to the parser as ``save_preprocess``
        and, when an existing file is to be reused, as ``load_preprocess``
    """
    trees = parse_conll_corpus(test, False, limit)
    if ignore_punctuation:
        trees = disconnect_punctuation(trees)

    total_time = 0.0

    # only load a preprocessed grammar if allowed and actually present
    load_preprocess = preprocess_path
    if recompile or (not os.path.isfile(
            parser_type.resolve_path(preprocess_path))):
        load_preprocess = None

    parser = parser_type(grammar_prim,
                         save_preprocess=preprocess_path,
                         load_preprocess=load_preprocess)

    # parser classes that provide dcp_hybrid_tree_best_derivation
    best_derivation_parsers = (CFGParser, GFParser, LeftBranchingFSTParser,
                               RightBranchingFSTParser)

    failures = 0
    with open(result, 'w') as result_file:
        for tree in trees:
            if len(tree.id_yield()) > limit:
                continue

            time_stamp = time.perf_counter()
            parser.set_input(tree_yield(tree.token_yield()))
            parser.parse()
            total_time += time.perf_counter() - time_stamp

            # work on a copy so that the gold tokens keep their edge labels
            cleaned_tokens = copy.deepcopy(tree.full_token_yield())
            for token in cleaned_tokens:
                token.set_edge_label('_')

            if parser_type == GFParser_k_best and parser.recognized():
                der_to_tree = lambda der: dcp_to_hybridtree(
                    HybridTree(),
                    DCP_evaluator(der).getEvaluation(),
                    copy.deepcopy(tree.full_token_yield()), False,
                    construct_conll_token)
                h_tree = parser.best_trees(der_to_tree)[0][0]
            elif parser_type in best_derivation_parsers:
                h_tree = parser.dcp_hybrid_tree_best_derivation(
                    HybridTree(tree.sent_label()), cleaned_tokens,
                    ignore_punctuation, construct_conll_token)
            else:
                h_tree = None

            if h_tree:
                result_file.write(tree_to_conll_str(h_tree))
            else:
                failures += 1
                forms = [token.form() for token in tree.full_token_yield()]
                poss = [token.pos() for token in tree.full_token_yield()]
                result_file.write(
                    tree_to_conll_str(fall_back_left_branching(forms, poss)))
            result_file.write('\n\n')

            # reset the shared parser before the next sentence
            parser.clear()

    print("parse failures", failures)
    print("parse time", total_time)

    print("eval.pl", "no punctuation")
    p = subprocess.Popen(
        ["perl", "../util/eval.pl", "-g", test, "-s", result, "-q"])
    p.communicate()

    print("eval.pl", "punctation")
    p = subprocess.Popen(
        ["perl", "../util/eval.pl", "-g", test, "-s", result, "-q", "-p"])
    p.communicate()
def generic_parsing_test(self, parser_type, limit_train, limit_test,
                         compare_order):
    """Parse gold trees with a tree parser and validate all derivations.

    A grammar is induced from the training corpus and preprocessed for
    ``parser_type``.  Each tree of the same corpus (up to ``limit_test``)
    is parsed; the test asserts that it is recognized, that every
    derivation passes ``check_integrity_recursive``, that the hybrid tree
    rebuilt from a derivation matches the gold tree according to
    ``self.compare_hybrid_trees``, and that all derivations of one tree
    are pairwise different.  The number of derivations per tree is
    printed at the end.

    :param parser_type: parser class, instantiated as
        ``parser_type(grammar, tree)``
    :param limit_train: limit for reading the corpus for induction
    :param limit_test: limit for reading the corpus for parsing
    :param compare_order: forwarded to ``self.compare_hybrid_trees``
    """
    # params
    corpus_path = 'res/dependency_conll/german/tiger/train/german_tiger_train.conll'
    start = 'START'

    induction_trees = parse_conll_corpus(corpus_path, False, limit_train)
    nont_strategy = the_labeling_factory().create_simple_labeling_strategy(
        "childtop", "deprel")
    term_labelling = the_terminal_labeling_factory().get_strategy('pos')
    _, grammar_prim = induce_grammar(
        induction_trees, nont_strategy, term_labelling.token_label, [cfg],
        start)
    parser_type.preprocess_grammar(grammar_prim, term_labelling)

    count_derivs = {}
    no_complete_match = 0

    for tree_no, tree in enumerate(
            parse_conll_corpus(corpus_path, False, limit_test)):
        print("Parsing tree for ", tree_no, file=stderr)
        print(tree, file=stderr)

        parser = parser_type(grammar_prim, tree)
        self.assertTrue(parser.recognized())

        count_derivs[tree_no] = 0
        print("Found derivations for ", tree_no, file=stderr)

        derivations = []
        for der in parser.all_derivation_trees():
            self.assertTrue(
                der.check_integrity_recursive(der.root_id(), start))
            print(count_derivs[tree_no], file=stderr)
            print(der, file=stderr)

            output_tree = HybridTree()
            # result unused; the call is kept so the call sequence is unchanged
            tree.token_yield()
            # rebuild tokens from the yield of the derivation
            rebuilt_tokens = [
                construct_conll_token('_', pos) for pos in der.compute_yield()
            ]
            dcp_to_hybridtree(output_tree,
                              DCP_evaluator(der).getEvaluation(),
                              rebuilt_tokens,
                              False,
                              construct_conll_token,
                              reorder=False)

            print(tree, file=stderr)
            print(output_tree, file=stderr)
            self.compare_hybrid_trees(tree, output_tree, compare_order)

            count_derivs[tree_no] += 1
            derivations.append(der)

        self.assertTrue(
            sDCPParserTest.pairwise_different(
                derivations, sDCPParserTest.compare_derivations))
        self.assertEqual(len(derivations), count_derivs[tree_no])

        if count_derivs[tree_no] == 0:
            no_complete_match += 1

    for tree_no, n_derivations in count_derivs.items():
        print(tree_no, n_derivations)
    print("# trees with no complete match:", no_complete_match)
def trainAndEval(strategy,
                 labelling1,
                 labelling2,
                 fanout,
                 parser_type,
                 train,
                 test,
                 cDT,
                 parseStrings,
                 ignore_punctuation=False):
    """Induce a grammar, optionally evaluate it, and append a report.

    The report is appended to ``results.txt`` and consists of a line
    describing the partitioning strategy and labelling, the grammar size,
    optionally derivation-tree statistics, and optionally attachment
    scores and parse time.

    Fixes over the earlier version: ``fanout`` may be an int (it is
    converted with ``str`` everywhere), the output of ``eval.pl`` is read
    as text so that it can be matched against a str pattern under
    Python 3, the decimal point in the score pattern is escaped in both
    groups, time is measured with ``time.perf_counter()`` (``time.clock()``
    was removed in Python 3.8), and ``results.txt`` is closed even when a
    step raises.

    :param strategy: suffix appended to ``'fanout-' + str(fanout)`` to
        select the recursive partitioning; also decoded for the report
    :type strategy: str
    :param labelling1: first argument of ``create_simple_labeling_strategy``
    :type labelling1: str
    :param labelling2: second argument of ``create_simple_labeling_strategy``
    :type labelling2: str
    :param fanout: maximal fanout
    :param parser_type: string parser class used when ``parseStrings`` is set
    :param train: path of the training corpus
    :param test: path of the test corpus
    :param cDT: if ``True``, count derivation trees per training tree with
        the externally defined ``tree_parser``
    :param parseStrings: if ``True``, parse the test corpus and run eval.pl
    :param ignore_punctuation: if true, ``disconnect_punctuation`` is
        applied to all corpora
    :raises ValueError: if the attachment scores cannot be found in the
        output of eval.pl
    """
    with open('results.txt', 'a') as results_file:
        term_labelling = the_terminal_labeling_factory().get_strategy('pos')
        recursive_partitioning = d_i.the_recursive_partitioning_factory(
        ).get_partitioning('fanout-' + str(fanout) + strategy)
        primary_labelling = d_l.the_labeling_factory(
        ).create_simple_labeling_strategy(labelling1, labelling2)

        trees = parse_conll_corpus(train, False, train_limit)
        if ignore_punctuation:
            trees = disconnect_punctuation(trees)
        (n_trees, grammar) = d_i.induce_grammar(trees, primary_labelling,
                                                term_labelling.token_label,
                                                recursive_partitioning, start)

        # write current transformation strategy and hyperparameters to results.txt
        results_file.write(
            _strategy_description(strategy, labelling1, labelling2, fanout))
        results_file.write('\n')

        results_file.write('#nonts:' + str(len(grammar.nonts())) +
                           ' #rules:' + str(len(grammar.rules())))

        # The following code is to count the number of derivations for a
        # hypergraph (tree parser required)
        if cDT == True:
            tree_parser.preprocess_grammar(grammar, term_labelling)

            trees = parse_conll_corpus(train, False, train_limit)
            if ignore_punctuation:
                trees = disconnect_punctuation(trees)

            derCount = 0
            derMax = 0
            for tree in trees:
                parser = tree_parser(grammar, tree)  # if tree parser is used
                der = parser.count_derivation_trees()
                if der > derMax:
                    derMax = der
                derCount += der

            results_file.write("\n#derivation trees:  average: " +
                               str(1.0 * derCount / n_trees) +
                               " maximal: " + str(derMax))

        res = ''
        total_time = 0.0

        # The following code works for string parsers for evaluating
        if parseStrings == True:
            parser_type.preprocess_grammar(grammar)

            trees = parse_conll_corpus(test, False, test_limit)
            if ignore_punctuation:
                trees = disconnect_punctuation(trees)

            with open(result, 'w') as result_file:
                for tree in trees:
                    time_stamp = time.perf_counter()
                    parser = parser_type(grammar,
                                         tree_yield(tree.token_yield()))
                    total_time += time.perf_counter() - time_stamp

                    # copy, so the gold tokens keep their edge labels
                    cleaned_tokens = copy.deepcopy(tree.full_token_yield())
                    for token in cleaned_tokens:
                        token.set_edge_label('_')

                    h_tree = parser.dcp_hybrid_tree_best_derivation(
                        HybridTree(tree.sent_label()), cleaned_tokens,
                        ignore_punctuation, construct_conll_token)

                    if h_tree:
                        result_file.write(tree_to_conll_str(h_tree))
                    else:
                        result_file.write(
                            tree_to_conll_str(
                                fall_back_left_branching_token(
                                    cleaned_tokens)))
                    result_file.write('\n\n')

            eval_command = [
                "perl", "../util/eval.pl", "-g", test, "-s", result, "-q"
            ]

            res += "\nattachment scores:\nno punctuation: "
            labelled, unlabelled = _attachment_scores(eval_command)
            res += ' labelled:' + labelled  # labeled attachment score
            res += ' unlabelled:' + unlabelled  # unlabeled attachment score

            res += "\npunctation: "
            labelled, unlabelled = _attachment_scores(eval_command + ["-p"])
            res += ' labelled:' + labelled
            res += ' unlabelled:' + unlabelled

            res += "\nparse time: " + str(total_time)

        results_file.write(res)
        results_file.write('\n\n\n')


def _strategy_description(strategy, labelling1, labelling2, fanout):
    """Return the report line naming the partitioning strategy and labelling."""
    suffix = (' ' + labelling1 + ' ' + labelling2 + ' maximal fanout:' +
              str(fanout))
    if strategy == '':
        return 'rtl' + suffix

    parts = strategy.split('-')
    kind = parts[1]
    if kind == 'left':
        return 'ltr' + suffix
    if kind == 'random':
        return 'random seed:' + parts[2] + suffix
    if kind == 'no':
        fallback = parts[4]
        if fallback == 'random':
            return 'nnont fallback:random seed:' + parts[5] + suffix
        if fallback in ('ltr', 'rtl'):
            return 'nnont fallback:' + fallback + suffix
        return 'nnont fallback:argmax' + suffix
    return 'argmax' + suffix


def _attachment_scores(command):
    """Run eval.pl and return its first two ``= <number>`` values as strings."""
    # universal_newlines=True makes check_output return str instead of bytes
    output = subprocess.check_output(command, universal_newlines=True)
    match = re.search(r'[^=]*= (\d+\.\d+)[^=]*= (\d+\.\d+).*', output)
    if match is None:
        raise ValueError('no attachment scores found in eval.pl output: ' +
                         repr(output))
    return match.group(1), match.group(2)