def cal_class_distribution(data_dir, level):
    """Count how often each relation class occurs in the RST trees of ``data_dir``.

    :param data_dir: location handed to ``DataHelper.read_rst_trees``
    :param level: 0 for inner-sentence, 1 for inter-sentence but inner
        paragraph, 2 for inter-paragraph, 3 to break the counts down by
        node depth
    :return: for level 0, 1 or 2 a ``Counter`` mapping relation class to its
        frequency among the nodes of that level; for level 3 a dict mapping
        node depth to such a ``Counter``; ``None`` for any other level.
        Every class in ``class2rel`` appears in each counter (0 if unseen).
    """
    # Flatten all trees into one post-order list of nodes.
    nodes = []
    for tree in DataHelper.read_rst_trees(data_dir):
        nodes.extend(tree.postorder_DFT(tree.tree, []))

    if level in (0, 1, 2):
        counts = Counter()
        for node in nodes:
            # Only nodes of the requested level that carry a relation count.
            if node.level != level or node.child_relation is None:
                continue
            counts[RstTree.extract_relation(node.child_relation)] += 1
        # Make unseen classes explicit so every class has an entry.
        for rel_class in class2rel:
            counts.setdefault(rel_class, 0)
        return counts

    if level == 3:
        counts_by_depth = {}
        for node in nodes:
            # Nodes without any child are skipped.
            if node.lnode is None and node.rnode is None:
                continue
            rel_class = RstTree.extract_relation(node.child_relation)
            counts_by_depth.setdefault(node.depth, Counter())[rel_class] += 1
        # Make unseen classes explicit at every depth.
        for counts in counts_by_depth.values():
            for rel_class in class2rel:
                counts.setdefault(rel_class, 0)
        return counts_by_depth

    return None
def eval_parser(self, path='./examples', report=False, bcvocab=None, draw=True):
    """Parse every ``*.merge`` document in ``path`` and optionally score it.

    For each document the predicted tree's brackets are written to a sibling
    ``.brackets`` file. With ``draw`` the predicted tree is also drawn to a
    sibling ``.ps`` file. With ``report`` the prediction is evaluated against
    the gold tree built from the sibling ``.dis`` file, and the accumulated
    metrics are reported once all documents are processed.

    :param path: directory whose ``*.merge`` files are parsed
    :param report: evaluate against the gold trees and report the metrics
    :param bcvocab: passed through to ``self.parser.sr_parse``
    :param draw: draw each predicted tree to a ``.ps`` file
    :return: None
    """
    met = Metrics(levels=['span', 'nuclearity', 'relation'])

    # Read all files from the given path
    doclist = [
        os.path.join(path, fname)
        for fname in os.listdir(path)
        if fname.endswith('.merge')
    ]

    # NOTE(review): these accumulators are filled below but nothing visible
    # in this method reads them; kept for diagnostics -- confirm before removal.
    pred_forms = []
    gold_forms = []
    depth_per_relation = {}

    for fmerge in doclist:
        # Build sibling paths from the stem: only the trailing '.merge' may be
        # swapped, whereas str.replace would also rewrite a '.merge' that
        # occurs earlier in the path (e.g. inside a directory name).
        stem = os.path.splitext(fmerge)[0]

        # Read *.merge file
        doc = Doc()
        doc.read_from_fmerge(fmerge)

        # Parsing
        pred_rst = self.parser.sr_parse(doc, bcvocab)
        if draw:
            pred_rst.draw_rst(stem + '.ps')

        # Write the brackets of the predicted tree into file
        pred_brackets = pred_rst.bracketing()
        Evaluator.writebrackets(stem + '.brackets', pred_brackets)

        if not report:
            continue

        # Evaluate with gold RST tree
        gold_rst = RstTree(stem + '.dis', fmerge)
        gold_rst.build()
        met.eval(gold_rst, pred_rst)

        pred_forms.extend(
            node.form for node in pred_rst.postorder_DFT(pred_rst.tree, [])
        )
        # Traverse the gold tree once and reuse the nodes for both statistics.
        gold_nodes = list(gold_rst.postorder_DFT(gold_rst.tree, []))
        gold_forms.extend(node.form for node in gold_nodes)

        for node in gold_nodes:
            # Only nodes with both children carry a relation between them.
            if node.lnode is None or node.rnode is None:
                continue
            # For an 'NS' node the relation is read from the right child,
            # otherwise from the left child.
            relation = node.rnode.relation if node.form == 'NS' else node.lnode.relation
            rela_class = RstTree.extract_relation(relation)
            depth_per_relation.setdefault(rela_class, []).append(node.depth)

    if report:
        met.report()