def cut_sent(self, text: str, sid=None) -> List[Sentence]:
    """Split raw *text* into sentences at end-of-sentence characters.

    :param text: raw paragraph text.
    :param sid: unused sentence-id placeholder, kept for interface
        compatibility with callers.
    :return: list of ``Sentence`` objects that together cover all of *text*.
    """
    sentences = []
    last_cut = 0
    # Scan every character (including the last one) so a trailing EOS
    # character closes its own sentence.
    for i, ch in enumerate(text):
        if ch in self._eos:
            sentences.append(Sentence([TEXT(text[last_cut:i + 1])]))
            last_cut = i + 1
    # Bug fix: the original guard `last_cut < len(text) - 1` silently
    # dropped a single trailing character (and returned [] for 1-char
    # input); `< len(text)` keeps every remaining character.
    if last_cut < len(text):
        sentences.append(Sentence([TEXT(text[last_cut:])]))
    return sentences
def parse(self, para):
    """Parse *para* into a discourse tree using the wrapped parser.

    Detached copies of the paragraph's EDUs are handed to the parser so
    that parsing cannot mutate the original paragraph objects.
    """
    detached = []
    for source in para.edus():
        clone = EDU([TEXT(source.text)])
        clone.words = source.words
        clone.tags = source.tags
        detached.append(clone)
    return self.parser.parse(detached)
def cut_edu(self, sent: Sentence) -> List[EDU]:
    """Segment *sent* into EDUs at EOS characters and model-predicted
    boundaries.

    Uses the sentence's cached syntax parse when available, otherwise
    parses on the fly. Boundary candidates are confirmed by
    ``self.model.predict`` at the token's character offset.
    """
    if hasattr(sent, "parse"):
        tree = getattr(sent, "parse")
    else:
        print(sent.text)
        tree = self.parser.parse(sent.text)
    tree = ParentedTree.fromstring(tree.pformat())
    # height-2 subtrees are the (tag, word) pre-terminals; skip traces
    leaves = list(
        tree.subtrees(lambda t: t.height() == 2 and t.label() != '-NONE-'))

    segments = []
    words = []
    tags = []

    def flush():
        # Package the words/tags accumulated so far as one EDU.
        piece = EDU([TEXT("".join(words))])
        piece.words = words
        piece.tags = tags
        segments.append(piece)

    offset = 0
    for leaf in leaves:
        token = leaf[0]
        # Normalize PTB-escaped brackets back to literal parentheses
        # (mutating the tree leaf, as the original did).
        if token == '-LRB-':
            leaf[0] = token = '('
        if token == '-RRB-':
            leaf[0] = token = ')'
        words.append(token)
        tags.append(leaf.label())
        if token in self._eos or (token in self.candidate
                                  and self.model.predict(offset, tree)):
            flush()
            words = []
            tags = []
        # Offset advances after the boundary check, matching the
        # original increment order.
        offset += len(token)
    if words:
        flush()
    return segments
def parse_and_eval(dataset, model):
    """Parse every gold paragraph of *dataset* with *model* and score it.

    :param dataset: iterable of paragraph collections (flattened here).
    :param model: trained partition-pointer model; switched to eval mode.
    :return: tuple ``(num_instances, parse_eval_scores)``.
    """
    model.eval()
    parser = PartitionPtrParser(model)
    # Only paragraphs carrying a gold root relation are evaluable.
    golds = [para for para in chain(*dataset) if para.root_relation()]

    def detach(paragraph):
        # Copy the EDUs so parsing cannot mutate the gold annotations.
        copies = []
        for edu in paragraph.edus():
            clone = EDU([TEXT(edu.text)])
            clone.words = edu.words
            clone.tags = edu.tags
            copies.append(clone)
        return copies

    parses = [parser.parse(detach(para)) for para in golds]
    return len(golds), parse_eval(parses, golds)
def main():
    """Evaluate a saved partition-pointer tree builder on the CDTB test set.

    Loads the checkpoint on CPU, parses every test paragraph, logs strict
    (macro/micro), RST-style, nuclearity and relation scores, then draws
    each gold tree, the model's decision heatmap and the predicted tree.
    """
    logging.basicConfig(level=logging.INFO)
    # Checkpoint may have been trained on GPU; force everything onto CPU.
    with open("data/models/treebuilder.partptr.model", "rb") as model_fd:
        model = torch.load(model_fd, map_location="cpu")
    model.eval()
    model.use_gpu = False
    parser = PartitionPtrParser(model)
    cdtb = CDTB("data/CDTB", "TRAIN", "VALIDATE", "TEST",
                ctb_dir="data/CTB", preprocess=True, cache_dir="data/cache")
    # Only paragraphs with a gold root relation are evaluable.
    golds = [para for para in chain(*cdtb.test) if para.root_relation()]

    # Detach EDU copies so parsing cannot mutate the gold annotations.
    strips = []
    for paragraph in golds:
        detached = []
        for edu in paragraph.edus():
            clone = EDU([TEXT(edu.text)])
            clone.words = edu.words
            clone.tags = edu.tags
            detached.append(clone)
        strips.append(detached)

    parses = []
    parse_sessions = []
    for edus in strips:
        parse, session = parser.parse(edus, ret_session=True)
        parses.append(parse)
        parse_sessions.append(session)

    # macro cdtb scores
    cdtb_macro_scores = eval.parse_eval(parses, golds, average="macro")
    logging.info("CDTB macro (strict) scores:")
    logging.info(eval.gen_parse_report(*cdtb_macro_scores))
    # micro cdtb scores
    cdtb_micro_scores = eval.parse_eval(parses, golds, average="micro")
    logging.info("CDTB micro (strict) scores:")
    logging.info(eval.gen_parse_report(*cdtb_micro_scores))
    # micro rst scores
    rst_scores = eval.rst_parse_eval(parses, golds)
    logging.info("RST styled scores:")
    logging.info(eval.gen_parse_report(*rst_scores))
    # nuclear scores
    nuclear_scores = eval.nuclear_eval(parses, golds)
    logging.info("nuclear scores:")
    logging.info(eval.gen_category_report(nuclear_scores))
    # relation scores
    ctype_scores, ftype_scores = eval.relation_eval(parses, golds)
    logging.info("coarse relation scores:")
    logging.info(eval.gen_category_report(ctype_scores))
    logging.info("fine relation scores:")
    logging.info(eval.gen_category_report(ftype_scores))

    # draw gold and parse tree along with decision hotmap
    for gold, parse, session in zip(golds, parses, parse_sessions):
        gold.draw()
        session.draw_decision_hotmap()
        parse.draw()