def finalize(self):
    """
    Finalize the result file, the reference file and all secondary files.

    After finalizing, the path of each file is reported to ``self.logger``.
    """
    CorpusFile.finalize(self)
    self.reference.finalize()

    log = self.logger
    print('Wrote results to', self.path, file=log)
    print('Wrote reference to', self.reference.path, file=log)

    for index, secondary in enumerate(self.secondaries):
        secondary.finalize()
        print('Wrote sec %d to ' % index, secondary.path, file=log)
def __init__(self, experiment, path=None, directory=None, logger=None, secondary_scores=0):
    """
    Set up the result file, a reference file and optional secondary files.

    :param experiment: stored as ``self.experiment``
    :param path: path of the result file; if None, a fresh temporary file
                 is created inside ``directory``
    :type path: str
    :param directory: directory passed on to every created CorpusFile and
                      used for the temporary result file
    :param logger: file-like object used for status messages; defaults to
                   ``sys.stdout`` when None
    :param secondary_scores: number of additional secondary CorpusFile
                             objects to create
    :type secondary_scores: int
    """
    import os

    ConstituentScorer.__init__(self)

    # The former one-liner ``_, path = mkstemp(...) if path is None else path``
    # unpacked a caller-supplied path string as if it were a tuple.
    if path is None:
        handle, path = tempfile.mkstemp(dir=directory)
        # mkstemp returns an open OS-level descriptor; only the path is
        # needed here, so close the descriptor instead of leaking it.
        os.close(handle)

    CorpusFile.__init__(self, path=path, directory=directory, logger=logger)
    self.experiment = experiment
    self.reference = CorpusFile(directory=directory, logger=logger)
    self.logger = logger if logger is not None else sys.stdout
    self.secondaries = [CorpusFile(directory=directory, logger=logger)
                        for _ in range(secondary_scores)]
def main():
    """
    Configure and run an SDPExperiment on a size-limited portion of the
    corpus at ``../res/osdp-12/sdp/2015/en.dm.sdp``.
    """
    # path to corpus and ids of first/last sentences of sections
    train_dev_corpus_path = '../res/osdp-12/sdp/2015/en.dm.sdp'
    training_last = 21999042
    dev_start = 22000001

    # limit corpus sizes for testing purpose
    train_limit = 100
    dev_limit = 50

    induction_settings = InductionSettings()

    def terminal_labeling(x):
        return '_', '_', x[2], x[3]

    induction_settings.terminal_labeling = terminal_labeling

    def terminal_labeling_lcfrs(x):
        return x[2]

    induction_settings.terminal_labeling_lcfrs = terminal_labeling_lcfrs

    def rec_part_strat(graph):
        # alternatives that were tried:
        #   return left_branching_partitioning(len(graph.sentence))
        #   return direct
        direct = extract_recursive_partitioning(graph)
        return fanout_limited_partitioning_left_to_right(direct, 1)

    induction_settings.rec_part_strat = rec_part_strat

    def nt_sub_labeling(edge):
        return edge.label[2]

    induction_settings.nt_sub_labeling = nt_sub_labeling

    def nonterminal_labeling(x, graph):
        bot = graph.dog.bottom(x)
        top = graph.dog.top(x)
        fanout = consecutive_spans(graph.covered_sentence_positions(x))

        def joined_labels(nodes):
            # one sub-label per node, taken from the node's incoming edge
            return ','.join([
                induction_settings.nt_sub_labeling(
                    graph.dog.incoming_edge(node))
                for node in nodes
            ])

        pieces = ['[', joined_labels(bot), ';', joined_labels(top), ';',
                  str(fanout), ']']
        return ''.join(pieces)

    induction_settings.nonterminal_labeling = nonterminal_labeling

    experiment = SDPExperiment(induction_settings)
    experiment.resources['TRAIN'] = CorpusFile(
        train_dev_corpus_path, end=training_last, limit=train_limit)
    experiment.resources['TEST'] = CorpusFile(
        train_dev_corpus_path, start=dev_start, limit=dev_limit)
    experiment.oracle_parsing = True
    experiment.parsing_timeout = 150  # seconds
    experiment.run_experiment()
class ScorerAndWriter(ConstituentScorer, CorpusFile):
    """
    A resource to which parsing results can be written.
    Computes LF1 score (inhouse implementation) and writes resulting parse tree to a file.

    Besides the result file itself, a reference file (gold annotations) and
    an arbitrary number of secondary result files are maintained.
    """

    def __init__(self, experiment, path=None, directory=None, logger=None, secondary_scores=0):
        """
        :param experiment: object used to serialize results and to compute
                           fallbacks (see ``score`` and ``failure``)
        :param path: path of the result file; if None, a fresh temporary
                     file is created inside ``directory``
        :type path: str
        :param directory: directory passed on to every created CorpusFile
                          and used for the temporary result file
        :param logger: file-like object used for status messages; defaults
                       to ``sys.stdout`` when None
        :param secondary_scores: number of secondary result files
        :type secondary_scores: int
        """
        import os

        ConstituentScorer.__init__(self)

        # The former one-liner ``_, path = mkstemp(...) if path is None else
        # path`` unpacked a caller-supplied path string as if it were a tuple.
        if path is None:
            handle, path = tempfile.mkstemp(dir=directory)
            # mkstemp returns an open OS-level descriptor; only the path is
            # needed here, so close the descriptor instead of leaking it.
            os.close(handle)

        CorpusFile.__init__(self, path=path, directory=directory, logger=logger)
        self.experiment = experiment
        self.reference = CorpusFile(directory=directory, logger=logger)
        self.logger = logger if logger is not None else sys.stdout
        self.secondaries = [CorpusFile(directory=directory, logger=logger)
                            for _ in range(secondary_scores)]

    def init(self):
        """Initialize the result file, the reference file and all secondaries."""
        CorpusFile.init(self)
        self.reference.init()
        for sec in self.secondaries:
            sec.init()

    def finalize(self):
        """
        Finalize the result file, the reference file and all secondaries and
        report the path of each of them to the logger.
        """
        CorpusFile.finalize(self)
        self.reference.finalize()
        print('Wrote results to', self.path, file=self.logger)
        print('Wrote reference to', self.reference.path, file=self.logger)
        for i, sec in enumerate(self.secondaries):
            sec.finalize()
            print('Wrote sec %d to ' % i, sec.path, file=self.logger)

    def score(self, system, gold, secondaries=None):
        """
        Score a system result against gold and write both to their files.

        :param system: the primary system result
        :param gold: the gold annotation
        :param secondaries: optional secondary system results; they are
                            paired with the secondary files via ``zip``, so
                            surplus entries on either side are ignored
        """
        ConstituentScorer.score(self, system, gold)
        self.file.writelines(self.experiment.serialize(system))
        self.reference.file.writelines(self.experiment.serialize(gold))
        if secondaries:
            for system_sec, corpus in zip(secondaries, self.secondaries):
                corpus.file.writelines(self.experiment.serialize(system_sec))

    def failure(self, gold):
        """
        Record a failure for gold.

        A fallback is computed by the experiment from the sentence and label
        of gold; it is written to the result file and to every secondary
        file, while gold itself is written to the reference file.
        """
        ConstituentScorer.failure(self, gold)
        sentence = self.experiment.obtain_sentence(gold)
        label = self.experiment.obtain_label(gold)
        fallback = self.experiment.compute_fallback(sentence, label)
        self.file.writelines(self.experiment.serialize(fallback))
        self.reference.file.writelines(self.experiment.serialize(gold))
        for sec in self.secondaries:
            sec.file.writelines(self.experiment.serialize(fallback))

    def __str__(self):
        """Use the CorpusFile string representation."""
        return CorpusFile.__str__(self)
def run_discodop_binarization(self):
    """
    Binarize the training corpus using discodop.

    The resulting corpus is saved to the disco_binarized_corpus member
    variable. If that member is already set, nothing is done.

    :rtype: None
    """
    import os

    if self.disco_binarized_corpus is not None:
        return

    train_resource = self.resources[TRAINING]
    if self.induction_settings.normalize:
        train_normalized = self.normalize_corpus(train_resource.path,
                                                 src=train_resource.type.lower(),
                                                 renumber=False)
    else:
        train_normalized = train_resource.path

    handle, second_stage = tempfile.mkstemp(suffix=".export", dir=self.directory)
    # mkstemp hands back an open OS-level descriptor; discodop writes to the
    # file by path, so the descriptor is closed right away instead of leaked.
    os.close(handle)

    subprocess.call(["discodop", "treetransforms"]
                    + self.induction_settings.discodop_binarization_params
                    + ["--inputfmt=export", "--outputfmt=export",
                       train_normalized, second_stage])

    # The binarized corpus is read with the same selection parameters as the
    # original training resource.
    disco_resource = CorpusFile(path=second_stage,
                                start=train_resource.start,
                                end=train_resource.end,
                                limit=train_resource.limit,
                                filter=train_resource.filter,
                                exclude=train_resource.exclude,
                                type=train_resource.type
                                )

    self.disco_binarized_corpus = self.read_corpus_export(disco_resource,
                                                          mode="DISCO-DOP",
                                                          skip_normalization=True)
def evaluate(self, result_resource, gold_resource):
    """
    Run the external scorer script on the result and gold files.

    If the gold resource is restricted by ``end``, ``limit`` or
    ``length_limit``, the selected part of the gold corpus is read and
    exported to a fresh CorpusFile, which is then used as gold file.

    :param result_resource: resource whose ``path`` holds the results
    :param gold_resource: resource describing the gold corpus
    """
    is_restricted = any(
        getattr(gold_resource, attribute) is not None
        for attribute in ("end", "limit", "length_limit")
    )

    if is_restricted:
        selected_corpus = self.read_corpus(gold_resource)
        selection_file = CorpusFile()
        selection_file.init()
        selection_file.finalize()
        export_corpus(selected_corpus, selection_file.path)
        gold_resource = selection_file

    command = ["sh", "../util/semeval-run.sh", "Scorer"]
    command.append(gold_resource.path)
    command.append(result_resource.path)
    command.append("representation=DM")
    call(command)
def __init__(self, induction_settings):
    """
    :param induction_settings: settings object stored as
                               ``self.induction_settings``

    Registers a CorpusFile with the header "#SDP 2015" as RESULT resource.
    """
    Experiment.__init__(self)
    self.induction_settings = induction_settings
    result_file = CorpusFile(header="#SDP 2015\n")
    self.resources[RESULT] = result_file
def setup_corpus_resources(split, dev_mode=True, quick=False, test_pred=False, test_second_half=False):
    """
    Build the corpus resources for a particular corpus split.

    :param split: A string specifying a particular corpus and split from the literature.
                  Accepted are "SPMRL", "HN08" and any string containing "WSJ"
                  (if it also contains "km2003", the km2003 variant of the corpus is used).
    :type split: str
    :param dev_mode: If true, then the development set is used for testing.
    :type dev_mode: bool
    :param quick: If true, then a smaller version of the corpora are returned.
    :type quick: bool
    :param test_pred: If true, then predicted POS tags are used for testing
                      (only evaluated for the "HN08" split).
    :type test_pred: bool
    :param test_second_half: If true, the test resources start at a later sentence
                             (only evaluated for the "HN08" split).
    :type test_second_half: bool
    :return: A tuple with train/dev/test/test_input (in this order), each a CorpusFile
    :raises ValueError: if the split is unknown
    """
    if split == "SPMRL":
        # all files are from SPMRL shared task
        corpus_type = "TIGERXML"
        corpus_type_test = corpus_type

        # one list object shared by all four resources
        shared_exclude = [7561, 17632, 46234, 50224]
        train_exclude = shared_exclude
        validation_exclude = shared_exclude
        test_exclude = shared_exclude
        test_input_exclude = shared_exclude

        train_path = 'res/SPMRL_SHARED_2014_NO_ARABIC/GERMAN_SPMRL/gold/xml/train/train.German.gold.xml'
        train_start = 1
        train_limit = 40474
        train_filter = None

        validation_path = 'res/SPMRL_SHARED_2014_NO_ARABIC/GERMAN_SPMRL/gold/xml/dev/dev.German.gold.xml'
        validation_start = 40475
        validation_size = validation_start + 4999
        validation_filter = None

        if dev_mode:
            test_path = 'res/SPMRL_SHARED_2014_NO_ARABIC/GERMAN_SPMRL/gold/xml/dev/dev.German.gold.xml'
            test_start = validation_start
            test_limit = validation_size
        else:
            test_path = 'res/SPMRL_SHARED_2014_NO_ARABIC/GERMAN_SPMRL/gold/xml/test/test.German.gold.xml'
            test_start = 45475
            test_limit = test_start + 4999
        test_filter = None

        if quick:
            train_path = 'res/SPMRL_SHARED_2014_NO_ARABIC/GERMAN_SPMRL/gold/xml/train5k/train5k.German.gold.xml'
            train_limit = train_start + 2000
            validation_size = validation_start + 200
            test_limit = test_start + 200

        # the test input always mirrors the test resource for this split
        test_input_path = test_path
        test_input_start = test_start
        test_input_limit = test_limit
        test_input_filter = test_filter
    #
    elif split == "HN08":
        # files are based on the scripts in Coavoux's mind the gap 1.0
        # where we commented out `rm -r tiger21 tiger22 marmot_tags` in generate_tiger_data.sh
        corpus_type = "EXPORT"
        corpus_type_test = corpus_type
        base_path = "res/TIGER/tiger21"

        # one list object shared by all four resources
        shared_exclude = [7561, 17632, 46234, 50224]
        train_exclude = shared_exclude
        validation_exclude = shared_exclude
        test_exclude = shared_exclude
        test_input_exclude = shared_exclude

        train_start = 1
        train_limit = 50474
        train_path = os.path.join(base_path, "tigertraindev_root_attach.export")

        def train_filter(x):
            return x % 10 >= 2

        validation_start = 1
        validation_size = 50471
        validation_path = os.path.join(base_path, "tigerdev_root_attach.export")

        def validation_filter(sent_id):
            return sent_id % 10 == 1

        test_limit = test_input_limit = 50474

        if dev_mode:
            test_start = 25241 if test_second_half else 1
            test_path = validation_path
            test_filter = validation_filter
            # predicted by MATE trained on tigerHN08 train
            predicted_path = 'res/TIGER/tigerHN08-dev.train.pred_tags.raw'
            predicted_start = 2524 if test_second_half else 0
        else:
            test_start = 25240 if test_second_half else 1
            test_path = os.path.join(base_path, "tigertest_root_attach.export")

            def test_filter(sent_id):
                return sent_id % 10 == 0

            # predicted by MATE trained on tigerHN08 train + dev
            predicted_path = 'res/TIGER/tigerHN08-test.train+dev.pred_tags.raw'
            predicted_start = 2524 - 1 if test_second_half else 0

        if test_pred:
            corpus_type_test = "WORD/POS"
            test_input_start = predicted_start
            test_input_path = predicted_path
            test_input_filter = None
        else:
            test_input_start = test_start
            test_input_path = test_path
            test_input_filter = test_filter

        if quick:
            train_limit = 5000 * 5 // 4
            validation_size = 200 * 10 // 1
            quick_test_sentences = 200
            test_limit = test_input_limit = quick_test_sentences * 10 // 1
            if test_pred:
                test_input_limit = quick_test_sentences + 1
    #
    elif "WSJ" in split:
        # based on Kilian Evang's dptb.tar.bz2
        corpus_type = "EXPORT"
        corpus_type_test = corpus_type

        corpus_path_original = "res/WSJ/ptb-discontinuous/dptb7.export"
        corpus_path_km2003 = "res/WSJ/ptb-discontinuous/dptb7-km2003wsj.export"
        # obtain the km2003 version from by running
        # discodop treetransforms --transforms=km2003wsj corpus_path_original corpus_path_km2003
        corpus_path = corpus_path_km2003 if "km2003" in split else corpus_path_original

        train_path = corpus_path
        validation_path = corpus_path
        test_path = corpus_path
        test_input_path = corpus_path

        # one list object shared by all four resources
        shared_exclude = []
        train_exclude = shared_exclude
        validation_exclude = shared_exclude
        test_exclude = shared_exclude
        test_input_exclude = shared_exclude

        train_filter = None
        validation_filter = None
        test_filter = None
        test_input_filter = None

        # sections 2-21
        train_start = 3915
        train_limit = 43746

        # section 24
        validation_start = 47863
        validation_size = 49208

        if dev_mode:
            test_start = validation_start
            test_limit = validation_size
        else:
            # section 23
            test_start = 45447
            test_limit = 47862

        if quick:
            train_limit = train_start + 2000
            validation_size = validation_start + 200
            test_limit = test_start + 200

        # the test input always mirrors the test resource for this split
        test_input_start = test_start
        test_input_limit = test_limit
    else:
        raise ValueError("Unknown split: " + str(split))

    train = CorpusFile(path=train_path,
                       start=train_start,
                       end=train_limit,
                       exclude=train_exclude,
                       filter=train_filter,
                       type=corpus_type)
    dev = CorpusFile(path=validation_path,
                     start=validation_start,
                     end=validation_size,
                     exclude=validation_exclude,
                     filter=validation_filter,
                     type=corpus_type)
    test = CorpusFile(path=test_path,
                      start=test_start,
                      end=test_limit,
                      exclude=test_exclude,
                      filter=test_filter,
                      type=corpus_type)
    test_input = CorpusFile(path=test_input_path,
                            start=test_input_start,
                            end=test_input_limit,
                            exclude=test_input_exclude,
                            filter=test_input_filter,
                            type=corpus_type_test)

    return train, dev, test, test_input
def __str__(self):
    """Return the string representation provided by CorpusFile."""
    description = CorpusFile.__str__(self)
    return description
def init(self):
    """
    Initialize this file first, then the reference file, then every
    secondary file.
    """
    CorpusFile.init(self)
    for corpus in [self.reference] + list(self.secondaries):
        corpus.init()
def main2():
    """
    Configure and run a DOGExperiment on the corpus at
    ``res/tiger/tiger_8000.xml``.
    """
    induction_settings = InductionSettings()

    # terminal labeling
    induction_settings.terminal_labeling_token = PosTerminals()

    def term_labeling(token):
        # anything that is not a ConstituentTerminal is passed through as is
        if not isinstance(token, ConstituentTerminal):
            return token
        return induction_settings.terminal_labeling_token.token_label(token)

    induction_settings.terminal_labeling = term_labeling

    # recursive partitioning
    def rec_part_strategy(direction, subgrouping, fanout, binarize):
        if direction == "right-to-left":
            limit_fanout = fanout_limited_partitioning
        else:
            limit_fanout = fanout_limited_partitioning_left_to_right
        return lambda dsg: limit_fanout(
            dsg.recursive_partitioning(subgrouping, weak=binarize), fanout)

    induction_settings.binarize = True
    induction_settings.direction = "left-to-right"
    induction_settings.subgrouping = False
    induction_settings.fanout = 1
    induction_settings.rec_part_strategy = rec_part_strategy(
        induction_settings.direction,
        induction_settings.subgrouping,
        induction_settings.fanout,
        induction_settings.binarize)

    # Nonterminal Labeling
    induction_settings.start = "START"

    def label_edge(edge):
        label = edge.label
        return label.pos() if isinstance(label, ConstituentTerminal) else label

    def stupid_edge(edge):
        return "X"

    def label_child(edge, j):
        return edge.get_function(j)

    def simple_nonterminal_labeling(nodes, dsg):
        return simple_labeling(nodes, dsg, label_edge)

    # The two labelings below are defined but not selected in this function.
    def bot_stupid_nonterminal_labeling(nodes, dsg):
        return top_bot_labeling(nodes, dsg, label_edge, stupid_edge)

    def missing_child_nonterminal_labeling(nodes, dsg):
        return missing_child_labeling(nodes, dsg, label_edge, label_child)

    induction_settings.nonterminal_labeling = simple_nonterminal_labeling
    induction_settings.normalize = True

    experiment = DOGExperiment(induction_settings)

    # Corpora
    # path = "res/tiger/tiger_release_aug07.corrected.16012013.utf8.xml"
    corpus_path = "res/tiger/tiger_8000.xml"
    start, stop = 1, 2000
    test_start, test_stop = 7001, 7200
    exclude = []  # not used within this function

    experiment.resources[TRAINING] = CorpusFile(corpus_path, start, stop)
    experiment.resources[TESTING] = CorpusFile(corpus_path, test_start, test_stop)

    experiment.oracle_parsing = True
    experiment.purge_rule_freq = None  # 1.0
    experiment.k_best = 100
    experiment.run_experiment()