def initialize_parser(self):
    """Instantiate ``self.parser`` over ``self.base_grammar``.

    Depending on ``self.parsing_mode`` this is either a disco-dop based
    k-best parser (configured from ``self.disco_dop_params``) or a GF
    based k-best parser with preprocessing cached in ``self.directory``.
    """
    if "disco-dop" not in self.parsing_mode:
        # GF engine: cache preprocessing results under the experiment directory
        self.parser = GFParser_k_best(
            grammar=self.base_grammar,
            k=self.k_best,
            save_preprocessing=(self.directory, "gfgrammar"))
        return
    options = self.disco_dop_params
    self.parser = DiscodopKbestParser(
        grammar=self.base_grammar,
        k=self.k_best,
        beam_beta=options["beam_beta"],
        beam_delta=options["beam_delta"],
        pruning_k=options["pruning_k"],
        cfg_ctf=options["cfg_ctf"])
def initialize_parser(self):
    """Instantiate ``self.parser`` over ``self.base_grammar``.

    The GF engine caches preprocessing results under ``self.directory``
    and uses a reduced ``k`` of 1 unless both split/merge training is
    disabled and oracle parsing is off.
    """
    preprocess_cache = (self.directory, "mygrammar")
    # k = 1 while split/merge refinement is active (or oracle parsing),
    # otherwise parse with the full k-best list.
    if not self.organizer.disable_split_merge or self.oracle_parsing:
        gf_k = 1
    else:
        gf_k = self.k_best
    if "disco-dop" in self.parsing_mode:
        # NOTE(review): this branch uses self.k_best, not the reduced
        # gf_k computed above — confirm that is intended.
        options = self.disco_dop_params
        self.parser = DiscodopKbestParser(
            grammar=self.base_grammar,
            k=self.k_best,
            cfg_ctf=options["cfg_ctf"],
            pruning_k=options["pruning_k"],
            beam_beta=options["beam_beta"],
            beam_delta=options["beam_delta"])
    else:
        self.parser = GFParser_k_best(
            self.base_grammar,
            save_preprocessing=preprocess_cache,
            k=gf_k)
def obtain_derivations(grammar, term_labelling):
    """Yield derivation trees for the test corpus under *grammar*.

    Parses the (module-level) ``test`` corpus, limited to ``limit_test``
    trees, with a 50-best GF parser and yields ``der[1]`` for each of the
    k best derivations of every sentence.  Progress is logged to stderr.
    """
    # build parser
    prepare_input = term_labelling.prepare_parser_input
    parser = GFParser_k_best(grammar, k=50)
    # parse sentences
    corpus = parse_conll_corpus(test, False, limit_test)
    for index, tree in enumerate(corpus):
        print("Parsing sentence ", index, file=stderr)
        parser.set_input(prepare_input(tree.token_yield()))
        parser.parse()
        k_best_list = list(parser.k_best_derivation_trees())
        print("# derivations: ", len(k_best_list), file=stderr)
        parser.clear()
        for derivation in k_best_list:
            yield derivation[1]
def prepare_sm_parser(self):
    """Build ``self.parser`` for the latest split/merge annotation.

    Dispatches on ``self.parsing_mode``:
      - "discodop-multi-method": disco-dop latent-viterbi parser with
        secondary projection methods plus a coarse-to-fine k-best reranker.
      - "best-latent-derivation": GF 1-best parsing of the smoothed
        split/merge grammar.
      - "k-best-rerank"/"latent-viterbi" (optionally "-GF"/"-disco-dop"):
        a k-best engine, wrapped in a coarse-to-fine reranker unless
        latent-viterbi mode is requested.
      - "max-rule-prod"/"max-rule-sum"/"variational" (same engine
        suffixes): projection-based parsing.

    Raises:
        ValueError: if ``self.parsing_mode`` matches none of the above.
    """
    last_la = self.organizer.latent_annotations[
        self.organizer.last_sm_cycle]
    if self.parsing_mode == "discodop-multi-method":
        if self.organizer.project_weights_before_parsing:
            self.project_weights()
        self.parser = DiscodopKbestParser(
            self.base_grammar,
            k=self.k_best,
            la=last_la,
            nontMap=self.organizer.nonterminal_map,
            variational=False,
            sum_op=False,
            cfg_ctf=self.disco_dop_params["cfg_ctf"],
            beam_beta=self.disco_dop_params["beam_beta"],
            beam_delta=self.disco_dop_params["beam_delta"],
            pruning_k=self.disco_dop_params["pruning_k"],
            grammarInfo=self.organizer.grammarInfo,
            projection_mode=False,
            latent_viterbi_mode=True,
            secondaries=[
                "VARIATIONAL", "MAX-RULE-PRODUCT", "LATENT-RERANK"
            ])
        self.parser.k_best_reranker = Coarse_to_fine_parser(
            self.base_grammar,
            last_la,
            self.organizer.grammarInfo,
            self.organizer.nonterminal_map,
            base_parser=self.parser)
    elif self.parsing_mode == "best-latent-derivation":
        grammar = build_sm_grammar(last_la,
                                   self.base_grammar,
                                   self.organizer.grammarInfo,
                                   rule_pruning=0.0001,
                                   rule_smoothing=0.1)
        self.parser = GFParser_k_best(grammar=grammar,
                                      k=1,
                                      save_preprocessing=(self.directory,
                                                          "gfgrammar"))
    elif self.parsing_mode in {
            method + engine
            for method in {"k-best-rerank", "latent-viterbi"}
            for engine in {"-GF", "-disco-dop", ""}
    }:
        if self.organizer.project_weights_before_parsing:
            self.project_weights()
        if "disco-dop" in self.parsing_mode:
            engine = DiscodopKbestParser(
                grammar=self.base_grammar,
                k=self.k_best,
                la=last_la,
                nontMap=self.organizer.nonterminal_map,
                grammarInfo=self.organizer.grammarInfo,
                cfg_ctf=self.disco_dop_params["cfg_ctf"],
                beam_beta=self.disco_dop_params["beam_beta"],
                # BUGFIX: beam_delta was previously filled with
                # disco_dop_params["beam_beta"]; every other
                # DiscodopKbestParser construction uses "beam_delta".
                beam_delta=self.disco_dop_params["beam_delta"],
                pruning_k=self.disco_dop_params["pruning_k"],
                latent_viterbi_mode="latent-viterbi" in self.parsing_mode)
        else:
            engine = GFParser_k_best(grammar=self.base_grammar,
                                     k=self.k_best,
                                     heuristics=self.heuristics,
                                     save_preprocessing=(self.directory,
                                                         "gfgrammar"))
        if "latent-viterbi" in self.parsing_mode:
            self.parser = engine
        else:
            # k-best reranking: coarse-to-fine on top of the k-best engine
            self.parser = Coarse_to_fine_parser(
                self.base_grammar,
                last_la,
                self.organizer.grammarInfo,
                self.organizer.nonterminal_map,
                base_parser=engine)
    elif self.parsing_mode in {
            method + "%s" % engine
            for method in {"max-rule-prod", "max-rule-sum", "variational"}
            for engine in {"-GF", "-disco-dop", ""}
    }:
        if self.organizer.project_weights_before_parsing:
            self.project_weights()
        if "GF" in self.parsing_mode:
            self.parser = Coarse_to_fine_parser(
                self.base_grammar,
                last_la,
                self.organizer.grammarInfo,
                nontMap=self.organizer.nonterminal_map,
                base_parser_type=GFParser_k_best,
                k=self.k_best,
                heuristics=self.heuristics,
                save_preprocessing=(self.directory, "gfgrammar"),
                mode=self.parsing_mode,
                variational="variational" in self.parsing_mode,
                sum_op="sum" in self.parsing_mode)
        else:
            self.parser = DiscodopKbestParser(
                self.base_grammar,
                k=self.k_best,
                la=last_la,
                nontMap=self.organizer.nonterminal_map,
                variational="variational" in self.parsing_mode,
                sum_op="sum" in self.parsing_mode,
                cfg_ctf=self.disco_dop_params["cfg_ctf"],
                beam_beta=self.disco_dop_params["beam_beta"],
                beam_delta=self.disco_dop_params["beam_delta"],
                pruning_k=self.disco_dop_params["pruning_k"],
                grammarInfo=self.organizer.grammarInfo,
                projection_mode=True)
    else:
        raise ValueError("Unknown parsing mode %s" % self.parsing_mode)
def main():
    """Run the genetic split/merge training experiment.

    Pipeline: induce an LCFRS from ``train_corpus``, EM-train an initial
    latent annotation, create ``genetic_initial`` split/merge variants,
    then run ``genetic_cycles`` recombination rounds in which every
    surviving annotation is refined by another split/merge cycle.
    Candidates live in a min-heap keyed by their validation score, so
    ``min``/``nsmallest`` select the best ones.  Relies on module-level
    configuration (corpora, seeds, epoch counts, thread counts, ...).
    """
    # --- induce grammar from the training corpus --------------------------
    # (a pickle-based cache of the induced grammar and of the reducts
    # existed here but was disabled; every run now recomputes both)
    grammar = LCFRS('START')
    for tree in train_corpus:
        if not tree.complete() or tree.empty_fringe():
            continue  # skip trees the extraction cannot handle
        part = recursive_partitioning(tree)
        tree_grammar = fringe_extract_lcfrs(tree, part, naming='child',
                                            term_labeling=terminal_labeling)
        grammar.add_gram(tree_grammar)
    grammar.make_proper()

    # --- compute reducts (grammar/corpus intersections) --------------------
    traceTrain = compute_reducts(grammar, train_corpus, terminal_labeling)
    traceValidationGenetic = compute_reducts(grammar,
                                             validation_genetic_corpus,
                                             terminal_labeling)
    traceValidation = compute_reducts(grammar, validation_corpus,
                                      terminal_labeling)

    # --- prepare EM training ----------------------------------------------
    grammarInfo = PyGrammarInfo(grammar, traceTrain.get_nonterminal_map())
    if not grammarInfo.check_for_consistency():
        print("[Genetic] GrammarInfo is not consistent!")
    storageManager = PyStorageManager()
    em_builder = PySplitMergeTrainerBuilder(traceTrain, grammarInfo)
    em_builder.set_em_epochs(em_epochs)
    em_builder.set_simple_expector(threads=threads)
    emTrainer = em_builder.build()

    # randomize initial weights and do EM training
    la_no_splits = build_PyLatentAnnotation_initial(grammar, grammarInfo,
                                                    storageManager)
    la_no_splits.add_random_noise(seed=seed)
    emTrainer.em_train(la_no_splits)
    la_no_splits.project_weights(grammar, grammarInfo)

    # compute parses for validation set
    baseline_parser = GFParser_k_best(grammar, k=k_best)
    validator = build_score_validator(grammar, grammarInfo,
                                      traceTrain.get_nonterminal_map(),
                                      storageManager, terminal_labeling,
                                      baseline_parser, validation_corpus,
                                      validationMethod)
    del baseline_parser

    # --- prepare split/merge training --------------------------------------
    builder = PySplitMergeTrainerBuilder(traceTrain, grammarInfo)
    builder.set_em_epochs(em_epochs)
    builder.set_split_randomization(1.0, seed + 1)
    builder.set_simple_expector(threads=threads)
    builder.set_score_validator(validator, validationDropIterations)
    builder.set_smoothing_factor(smoothingFactor=smoothing_factor)
    builder.set_split_randomization(percent=split_randomization)
    splitMergeTrainer = builder.set_scc_merger(
        threshold=scc_merger_threshold, threads=threads).build()
    splitMergeTrainer.setMaxDrops(validationDropIterations, mode="smoothing")
    splitMergeTrainer.setEMepochs(em_epochs, mode="smoothing")

    # --- build the initial population --------------------------------------
    latentAnnotations = []
    for i in range(0, genetic_initial):
        splitMergeTrainer.reset_random_seed(seed + i + 1)
        la = splitMergeTrainer.split_merge_cycle(la_no_splits)
        if not la.check_for_validity():
            print('[Genetic] Initial LA', i,
                  'is not consistent! (See details before)')
        if not la.is_proper():
            print('[Genetic] Initial LA', i, 'is not proper!')
        heapq.heappush(
            latentAnnotations,
            (evaluate_la(grammar, grammarInfo, la, traceValidationGenetic,
                         validation_genetic_corpus), i, la))
        print('[Genetic] added initial LA', i)
    (fBest, idBest, laBest) = min(latentAnnotations)
    validation_score = evaluate_la(grammar, grammarInfo, laBest,
                                   traceValidation, test_corpus)
    print("[Genetic] Started with best F-Score (Test) of", validation_score,
          "from Annotation ", idBest)

    # --- recombination rounds ----------------------------------------------
    geneticCount = genetic_initial
    random.seed(seed)
    # renamed loop variable: `round` shadowed the builtin
    for cycle in range(1, genetic_cycles + 1):
        print("[Genetic] Starting Recombination Round ", cycle)
        newpopulation = []
        # Cross all candidates!
        for leftIndex in range(0, len(latentAnnotations)):
            (fLeft, idLeft, left) = latentAnnotations[leftIndex]
            # TODO: How to determine NTs to keep?
            # do SM-Training
            print("[Genetic] do SM-training on", idLeft, "and create LA",
                  geneticCount)
            # BUGFIX: refine the candidate selected above.  This previously
            # called split_merge_cycle(la), i.e. a stale annotation left
            # over from an earlier iteration, so `left` was never used.
            la = splitMergeTrainer.split_merge_cycle(left)
            if not la.check_for_validity():
                print(
                    '[Genetic] Split/Merge introduced invalid weights into LA',
                    geneticCount)
            if not la.is_proper():
                print(
                    '[Genetic] Split/Merge introduced problems with properness of LA',
                    geneticCount)
            fscore = evaluate_la(grammar, grammarInfo, la,
                                 traceValidationGenetic,
                                 validation_genetic_corpus)
            print("[Genetic] LA", geneticCount, "has F-score: ", fscore)
            heapq.heappush(newpopulation, (fscore, geneticCount, la))
            geneticCount += 1
        heapq.heapify(newpopulation)
        # keep only the genetic_population best annotations of old + new
        latentAnnotations = heapq.nsmallest(
            genetic_population,
            heapq.merge(latentAnnotations, newpopulation))
        heapq.heapify(latentAnnotations)
        (fBest, idBest, laBest) = min(latentAnnotations)
        validation_score = evaluate_la(grammar, grammarInfo, laBest,
                                       traceValidation, test_corpus)
        print("[Genetic] Best LA", idBest, "has F-Score (Test) of ",
              validation_score)
def run_experiment(rec_part_strategy, nonterminal_labeling, exp,
                   reorder_children, binarize=True):
    """Induce a grammar from a TIGER corpus slice and parse the test slice.

    Loads sentences s1..s7000 for training and s7001..s7200 for testing,
    optionally binarizes the training graphs (marking introduced nodes
    with a '-BAR' suffix), induces a grammar, and returns the result of
    k-best (k=500) oracle parsing of the test set.

    NOTE(review): the function RETURNS at the `return do_parsing(...)`
    below; everything after that point (reduct computation via an external
    Java tool, EM and split/merge training) is currently unreachable and
    appears to be kept for experimentation.
    """
    # corpus slice boundaries (sentence ids are 1-based)
    start = 1
    stop = 7000
    test_start = 7001
    test_stop = 7200
    # path = "res/tiger/tiger_release_aug07.corrected.16012013.utf8.xml"
    corpus_path = "res/tiger/tiger_8000.xml"
    exclude = []
    train_dsgs = sentence_names_to_deep_syntax_graphs(
        ['s' + str(i) for i in range(start, stop + 1) if i not in exclude],
        corpus_path,
        hold=False,
        reorder_children=reorder_children)
    test_dsgs = sentence_names_to_deep_syntax_graphs(
        [
            's' + str(i) for i in range(test_start, test_stop + 1)
            if i not in exclude
        ],
        corpus_path,
        hold=False,
        reorder_children=reorder_children)

    # Grammar induction
    term_labeling_token = PosTerminals()

    def term_labeling(token):
        # POS tag for terminal tokens, identity for everything else
        if isinstance(token, ConstituentTerminal):
            return term_labeling_token.token_label(token)
        else:
            return token

    if binarize:

        def modify_token(token):
            # mark binarization-introduced categories with a '-BAR' suffix
            if isinstance(token, ConstituentCategory):
                token_new = deepcopy(token)
                token_new.set_category(token.category() + '-BAR')
                return token_new
            elif isinstance(token, str):
                return token + '-BAR'
            else:
                assert False

        train_dsgs = [
            dsg.binarize(bin_modifier=modify_token) for dsg in train_dsgs
        ]

        def is_bin(token):
            # True iff the token was introduced by binarization ('-BAR')
            if isinstance(token, ConstituentCategory):
                if token.category().endswith('-BAR'):
                    return True
            elif isinstance(token, str):
                if token.endswith('-BAR'):
                    return True
            return False

        def debinarize(dsg):
            return dsg.debinarize(is_bin=is_bin)

    else:
        debinarize = id

    grammar = induction_on_a_corpus(train_dsgs, rec_part_strategy,
                                    nonterminal_labeling, term_labeling)
    grammar.make_proper()
    print("Nonterminals", len(grammar.nonts()), "Rules",
          len(grammar.rules()))
    parser = GFParser_k_best(grammar, k=500)
    # NOTE(review): early exit — the remainder of this function is dead code.
    return do_parsing(parser,
                      test_dsgs,
                      term_labeling_token,
                      oracle=True,
                      debinarize=debinarize)

    # ----- unreachable from here on ---------------------------------------
    # Compute reducts, i.e., intersect grammar with each training dsg
    basedir = path.join('/tmp/dog_experiments', 'exp' + str(exp))
    reduct_dir = path.join(basedir, 'reduct_grammars')
    terminal_map = Enumerator()
    if not os.path.isdir(basedir):
        os.makedirs(basedir)
    data = export_dog_grammar_to_json(grammar, terminal_map)
    grammar_path = path.join(basedir, 'grammar.json')
    with open(grammar_path, 'w') as file:
        json.dump(data, file)
    corpus_path = path.join(basedir, 'corpus.json')
    with open(corpus_path, 'w') as file:
        json.dump(
            export_corpus_to_json(train_dsgs,
                                  terminal_map,
                                  terminal_labeling=term_labeling), file)
    with open(path.join(basedir, 'enumerator.enum'), 'w') as file:
        terminal_map.print_index(file)
    if os.path.isdir(reduct_dir):
        shutil.rmtree(reduct_dir)
    os.makedirs(reduct_dir)
    # run the external Schick parser to compute per-sentence reducts
    p = subprocess.Popen([
        ' '.join([
            "java", "-jar",
            os.path.join("util", SCHICK_PARSER_JAR), 'dog-reduct', '-g',
            grammar_path, '-t', corpus_path, "-o", reduct_dir
        ])
    ],
                         shell=True,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.STDOUT)
    # stream subprocess output until it terminates
    while True:
        nextline = p.stdout.readline()
        if nextline == '' and p.poll() is not None:
            break
        sys.stdout.write(nextline)
        sys.stdout.flush()
    p.wait()
    p.stdout.close()
    rtgs = []
    # reduct files are named 1.gra .. N.gra, one per training graph
    for i in range(1, len(train_dsgs) + 1):
        rtgs.append(read_rtg(path.join(reduct_dir, str(i) + '.gra')))
    derivation_manager = PyDerivationManager(grammar)
    derivation_manager.convert_rtgs_to_hypergraphs(rtgs)
    derivation_manager.serialize(path.join(basedir, 'reduct_manager.trace'))

    # Training
    ## prepare EM training
    em_epochs = 20
    seed = 0
    smoothing_factor = 0.01
    split_randomization = 0.01
    sm_cycles = 2
    merge_percentage = 50.0
    grammarInfo = PyGrammarInfo(grammar,
                                derivation_manager.get_nonterminal_map())
    storageManager = PyStorageManager()
    em_builder = PySplitMergeTrainerBuilder(derivation_manager, grammarInfo)
    em_builder.set_em_epochs(em_epochs)
    em_builder.set_simple_expector(threads=THREADS)
    emTrainer = em_builder.build()

    # randomize initial weights and do em training
    la_no_splits = build_PyLatentAnnotation_initial(grammar, grammarInfo,
                                                    storageManager)
    la_no_splits.add_random_noise(seed=seed)
    emTrainer.em_train(la_no_splits)
    la_no_splits.project_weights(grammar, grammarInfo)

    do_parsing(CFGParser(grammar), test_dsgs, term_labeling_token)
    # NOTE(review): second early exit — split/merge training below is dead.
    return

    ## prepare SM training
    builder = PySplitMergeTrainerBuilder(derivation_manager, grammarInfo)
    builder.set_em_epochs(em_epochs)
    builder.set_split_randomization(1.0, seed + 1)
    builder.set_simple_expector(threads=THREADS)
    builder.set_smoothing_factor(smoothingFactor=smoothing_factor)
    builder.set_split_randomization(percent=split_randomization)
    # builder.set_scc_merger(-0.2)
    builder.set_percent_merger(merge_percentage)
    splitMergeTrainer = builder.build()
    # splitMergeTrainer.setMaxDrops(validationDropIterations, mode="smoothing")
    splitMergeTrainer.setEMepochs(em_epochs, mode="smoothing")

    # set initial latent annotation
    latentAnnotation = [la_no_splits]

    # carry out split/merge training and do parsing
    parsing_method = "filter-ctf"
    # parsing_method = "single-best-annotation"
    k_best = 50
    for i in range(1, sm_cycles + 1):
        splitMergeTrainer.reset_random_seed(seed + i + 1)
        latentAnnotation.append(
            splitMergeTrainer.split_merge_cycle(latentAnnotation[-1]))
        print("Cycle: ", i)
        if parsing_method == "single-best-annotation":
            smGrammar = latentAnnotation[i].build_sm_grammar(
                grammar, grammarInfo, rule_pruning=0.0001,
                rule_smoothing=0.1)
            print("Rules in smoothed grammar: ", len(smGrammar.rules()))
            parser = GFParser(smGrammar)
        elif parsing_method == "filter-ctf":
            latentAnnotation[-1].project_weights(grammar, grammarInfo)
            parser = Coarse_to_fine_parser(
                grammar,
                latentAnnotation[-1],
                grammarInfo,
                derivation_manager.get_nonterminal_map(),
                base_parser_type=GFParser_k_best,
                k=k_best)
        else:
            raise (Exception())
        do_parsing(parser, test_dsgs, term_labeling_token)
        del parser
def initialize_parser(self):
    """Set up a plain k-best GF parser over the base grammar."""
    grammar = self.base_grammar
    self.parser = GFParser_k_best(grammar, k=self.k_best)