def test_advanced_corpus_parsing(self):
    """Parse a training and a development slice of the DM corpus and verify them.

    Both slices are read from the same file: the training slice is bounded
    by a last sentence id, the development slice by a first sentence id.
    For each slice the test asserts the number of graphs obtained and that
    none of them is reported as cyclic. Finally the development slice is
    exported to a file under /tmp.
    """
    corpus_path = 'res/osdp-12/sdp/2015/en.dm.sdp'

    def tally(corpus):
        # One pass over the corpus: one flag per graph, True iff it is cyclic.
        flags = [bool(graph.dog.cyclic()) for graph in corpus]
        return len(flags), sum(flags)

    # Both slices are parsed up front, before either one is inspected.
    last_training_id = 21999042
    max_training = 500
    training_corpus = parse_file(corpus_path,
                                 last_id=last_training_id,
                                 max_n=max_training)

    first_dev_id = 22000001
    max_dev = 100
    dev_corpus = parse_file(corpus_path,
                            start_id=first_dev_id,
                            max_n=max_dev)

    seen, with_cycle = tally(training_corpus)
    self.assertEqual(seen, 500)
    self.assertEqual(with_cycle, 0)

    # NOTE(review): the expected count (1692) is larger than the max_n (100)
    # passed for the development slice -- confirm how parse_file combines
    # max_n with start_id.
    seen, with_cycle = tally(dev_corpus)
    self.assertEqual(seen, 1692)
    self.assertEqual(with_cycle, 0)

    export_corpus(dev_corpus, '/tmp/dev_corpus_export.dm.sdp')
def __test_sdp_parsing_full(self):
    """Run the single-graph processing routine over the full DM corpus.

    The corpus is parsed once and its size is printed. Then, for every
    recursive partitioning strategy in ``self.rec_part_strategies``, each
    graph whose sentence has at most 50 elements is handed to
    ``__process_single_dsg``; longer ones are skipped.

    The double-underscore prefix keeps the name from matching the usual
    ``test*`` method pattern.
    """
    corpus = parse_file('res/osdp-12/sdp/2015/en.dm.sdp')
    print(len(corpus))

    def first_component(x):
        # Terminal labeling: use the first component of the given item.
        return x[0]

    max_sentence_length = 50
    for strategy in self.rec_part_strategies:
        for index, graph in enumerate(corpus):
            if len(graph.sentence) <= max_sentence_length:
                self.__process_single_dsg(index, graph, strategy,
                                          terminal_labeling=first_component)
def test_sdp_parsing(self):
    """Process every graph of the trial corpora for each annotation style.

    Iterates over all combinations of annotation style ('dm', 'pas', 'psd')
    and recursive partitioning strategy. For each combination the matching
    trial corpus is parsed, its size is printed, and every graph is passed
    to ``__process_single_dsg``.
    """
    annotation_styles = ['dm', 'pas', 'psd']

    def first_component(x):
        # Terminal labeling: use the first component of the given item.
        return x[0]

    for style, strategy in product(annotation_styles, self.rec_part_strategies):
        # The corpus is re-parsed for every (style, strategy) combination.
        corpus = parse_file('res/sdp/trial/{}.sdp'.format(style))
        print(len(corpus))
        for index, graph in enumerate(corpus):
            self.__process_single_dsg(index, graph, strategy,
                                      terminal_labeling=first_component)
def read_corpus(self, resource):
    """Parse the corpus described by *resource* and return the result.

    The resource's ``path`` is the file to parse; its ``start``, ``end`` and
    ``limit`` attributes are forwarded to ``parse_file`` as ``start_id``,
    ``last_id`` and ``max_n`` respectively.
    """
    selection = {
        'start_id': resource.start,
        'last_id': resource.end,
        'max_n': resource.limit,
    }
    return parse_file(resource.path, **selection)