def _build_discourse_units(self, text, tokens, numbers, start_id):
    """Build elementary DiscourseUnits from predicted *left* EDU boundaries.

    :param text: original text
    :param list tokens: isanlp.annotation.Token
    :param numbers: positions of tokens predicted as EDU left boundaries (beginners)
    :param start_id: id assigned to the first produced unit; subsequent units
        get consecutive ids
    :return: list of DiscourseUnit
    """
    edus = []
    if not numbers.shape[0]:
        return edus
    # Every boundary except the last opens an EDU that extends up to
    # (but not including) the next boundary token.
    for offset, (cur, nxt) in enumerate(zip(numbers[:-1], numbers[1:])):
        span_begin = tokens[cur].begin
        span_stop = tokens[nxt].begin
        edus.append(DiscourseUnit(start_id + offset,
                                  start=span_begin,
                                  end=span_stop - 1,
                                  text=text[span_begin:span_stop],
                                  relation='elementary',
                                  nuclearity='_'))
    # The last boundary opens an EDU running to the end of the last token.
    # Its id is start_id + len(numbers) - 1, i.e. the next consecutive id
    # (start_id itself when there is only one boundary).
    last_begin = tokens[numbers[-1]].begin
    edus.append(DiscourseUnit(start_id + len(numbers) - 1,
                              start=last_begin,
                              end=tokens[-1].end,
                              text=text[last_begin:tokens[-1].end],
                              relation='elementary',
                              nuclearity='_'))
    return edus
def _build_discourse_units(self, text, tokens, numbers):
    """Build elementary DiscourseUnits from predicted *right* EDU boundaries.

    :param text: original text
    :param list tokens: isanlp.annotation.Token
    :param numbers: positions of tokens predicted as EDU right boundaries
    :return: list of DiscourseUnit
    """
    # Fix: the original indexed numbers[0] unconditionally and raised
    # IndexError when no boundary was predicted; return no units instead
    # (consistent with the left-boundary variant of this method).
    if not len(numbers):
        return []
    edus = []
    # First EDU starts at the beginning of the text and ends at the first
    # predicted right boundary.
    first_end = tokens[numbers[0]].end
    edus.append(DiscourseUnit(0,
                              start=0,
                              end=first_end,
                              text=text[:first_end],
                              relation='elementary'))
    # Each following EDU spans from the previous boundary's end to its own.
    for i in range(1, len(numbers)):
        prev_end = tokens[numbers[i - 1]].end
        cur_end = tokens[numbers[i]].end
        edus.append(DiscourseUnit(i,
                                  start=prev_end,
                                  end=cur_end,
                                  text=text[prev_end:cur_end],
                                  relation='elementary'))
    return edus
def predict_labels(self, nodes, annot_text, annot_tokens, annot_sentences,
                   annot_lemma, annot_morph, annot_postag,
                   annot_syntax_dep_tree):
    """Predict rhetorical relation and nuclearity for each candidate node.

    For every node, builds lightweight left/right DiscourseUnits from the
    token spans, extracts pair features, and classifies the relation.
    On any classification failure the node's own labels are kept as a
    fallback.

    :param nodes: candidate Nodes with left/right token span indices
    :param str annot_text: original text
    :param list annot_tokens: isanlp.annotation.Token
    :param annot_sentences, annot_lemma, annot_morph, annot_postag,
        annot_syntax_dep_tree: isanlp annotations forwarded to the
        feature extractor
    :return: list of Node with relation/nuclearity fields filled in
    """
    result = []
    for node in nodes:
        # Placeholder ids 0/1: only the text matters for feature extraction.
        left_node = DiscourseUnit(
            id=0,
            text=' '.join([
                tok.text
                for tok in annot_tokens[node.left_id1:node.left_id2 + 1]
            ]))
        right_node = DiscourseUnit(
            id=1,
            text=' '.join([
                tok.text
                for tok in annot_tokens[node.right_id1:node.right_id2 + 1]
            ]))
        try:
            pair_feature = self.tree_predictor.extract_features(
                left_node, right_node, annot_text, annot_tokens,
                annot_sentences, annot_lemma, annot_morph, annot_postag,
                annot_syntax_dep_tree)
            relation = self._get_relation(pair_feature)
            # Labels come as "<relation>_<nuclearity>", e.g. "elaboration_NS".
            relation, nuclearity = relation.split('_')
            left_nuclearity = 'Satellite' if nuclearity == 'SN' else 'Nucleus'
            right_nuclearity = 'Satellite' if nuclearity == 'NS' else 'Nucleus'
            left_relation = relation
            right_relation = relation
            # RST convention: the nucleus of a mononuclear relation is
            # labelled 'span'.
            if left_nuclearity == 'Satellite':
                right_relation = 'span'
            if right_nuclearity == 'Satellite':
                left_relation = 'span'
        except Exception:
            # Fix: was a bare `except:`, which also swallowed SystemExit and
            # KeyboardInterrupt. Fall back to the node's existing labels.
            print('Unknown error occurred.')
            left_relation = node.left_rel
            left_nuclearity = node.left_nuc
            right_relation = node.right_rel
            right_nuclearity = node.right_nuc
        result.append(
            Node(left_id1=node.left_id1,
                 left_nuc=left_nuclearity,
                 left_rel=left_relation,
                 left_id2=node.left_id2,
                 right_id1=node.right_id1,
                 right_nuc=right_nuclearity,
                 right_rel=right_relation,
                 right_id2=node.right_id2))
    return result
def __call__(self, left_node, right_node, proba=1., text="", start=None,
             end=None, relation="", nuclearity=""):
    """Create a new DiscourseUnit joining two child units.

    Increments the running unit id counter and attaches the stored
    original text (``self.text``) as ``orig_text``.

    :return: the newly created DiscourseUnit
    """
    self.id += 1
    unit = DiscourseUnit(id=self.id,
                         left=left_node,
                         right=right_node,
                         start=start,
                         end=end,
                         proba=proba,
                         relation=relation,
                         nuclearity=nuclearity,
                         orig_text=self.text)
    return unit
def docs_structure_to_du(self, nodes, tokens, _tok_min, _tok_max):
    """Recursively convert flat split nodes into a DiscourseUnit tree.

    :param nodes: parser output nodes describing binary splits of token spans
    :param tokens: isanlp.annotation.Token list for the document
    :param _tok_min: index of the first token covered by this subtree
    :param _tok_max: index of the last token covered by this subtree
    :return: DiscourseUnit covering tokens[_tok_min:_tok_max + 1]
    """
    root = TopDownRSTParser.define_root(nodes, _tok_min, _tok_max)
    if not root:
        # No split covers this span: it is a single elementary unit.
        self._id += 1
        return DiscourseUnit(id=self._id,
                             start=tokens[_tok_min].begin,
                             end=tokens[_tok_max].end,
                             text=' '.join([
                                 tok.text
                                 for tok in tokens[_tok_min:_tok_max + 1]
                             ]),
                             relation='elementary')
    # The satellite side determines the relation label; NN keeps the left one.
    nuc = 'NN'
    rel = root.left_rel
    if root.left_nuc == 'Satellite':
        nuc = 'SN'
        rel = root.left_rel
    elif root.right_nuc == 'Satellite':
        nuc = 'NS'
        rel = root.right_rel
    # Left child: a leaf when the span is a single token index, else recurse.
    if root.left_id1 == root.left_id2:
        self._id += 1
        left = DiscourseUnit(id=self._id,
                             start=tokens[root.left_id1].begin,
                             end=tokens[root.left_id2].end,
                             text=' '.join([
                                 tok.text
                                 for tok in tokens[root.left_id1:root.left_id2 + 1]
                             ]),
                             relation='elementary')
    else:
        left = self.docs_structure_to_du(nodes, tokens, root.left_id1,
                                         root.left_id2)
    # Right child: same, with a bug fix.
    if root.right_id1 == root.right_id2:
        self._id += 1
        # Fix: start/end previously used left_id1/left_id2 (copy-paste), so
        # the right leaf carried the LEFT child's character offsets while its
        # text correctly spanned the right tokens.
        right = DiscourseUnit(id=self._id,
                              start=tokens[root.right_id1].begin,
                              end=tokens[root.right_id2].end,
                              text=' '.join([
                                  tok.text
                                  for tok in tokens[root.right_id1:root.right_id2 + 1]
                              ]),
                              relation='elementary')
    else:
        right = self.docs_structure_to_du(nodes, tokens, root.right_id1,
                                          root.right_id2)
    self._id += 1
    # Fix: the merged unit's end previously used left_id2, truncating the
    # character span to the left child although its text covers both children.
    return DiscourseUnit(id=self._id,
                         start=tokens[root.left_id1].begin,
                         end=tokens[root.right_id2].end,
                         relation=rel,
                         nuclearity=nuc,
                         text=' '.join([
                             tok.text
                             for tok in tokens[root.left_id1:root.right_id2 + 1]
                         ]),
                         left=left,
                         right=right)
def __call__(self, edus, annot_text, annot_tokens, annot_sentences,
             annot_lemma, annot_morph, annot_postag, annot_syntax_dep_tree,
             genre=None):
    """Greedily merge EDUs bottom-up into discourse trees.

    Repeatedly picks the adjacent pair with the highest merge score and
    joins it into a new DiscourseUnit until no pair's score exceeds
    ``self.confidence_threshold``.

    :param list edus: DiscourseUnit
    :param str annot_text: original text
    :param list annot_tokens: isanlp.annotation.Token
    :param list annot_sentences: isanlp.annotation.Sentence
    :param list annot_postag: lists of str for each sentence
    :param annot_lemma: lists of str for each sentence
    :param annot_syntax_dep_tree: list of isanlp.annotation.WordSynt for each sentence
    :param genre: optional genre hint forwarded to the feature extractor
    :return: list of DiscourseUnit containing each extracted tree
    """

    def to_merge(_scores):
        # Index of the adjacent pair with the highest merge score.
        return np.argmax(np.array(_scores))

    self.tree_predictor.genre = genre
    nodes = edus
    max_id = self._get_max_id(nodes)

    # Initialize scores: one feature row / score per ADJACENT pair, so
    # scores[i] rates merging nodes[i] with nodes[i + 1]
    # (len(scores) == len(nodes) - 1).
    features = self.tree_predictor.initialize_features(
        nodes, annot_text, annot_tokens, annot_sentences, annot_lemma,
        annot_morph, annot_postag, annot_syntax_dep_tree)
    scores = self._get_proba(features)

    while len(scores) > 1 and any(
        [score > self.confidence_threshold for score in scores]):
        # Select two nodes to merge.
        j = to_merge(scores)  # position of the pair in list

        # Make the new node by merging node[j] + node[j+1].
        # Predicted label is "<relation>_<nuclearity>", e.g. "joint_NN".
        relation = self._get_relation(features.iloc[j])
        relation, nuclearity = relation.split('_')
        temp = DiscourseUnit(
            id=max_id + 1,
            left=nodes[j],
            right=nodes[j + 1],
            relation=relation,
            nuclearity=nuclearity,
            proba=min(1., scores[j]),
            text=annot_text[nodes[j].start:nodes[j + 1].end].strip())
        max_id += 1

        # Modify the node list: nodes[j] and nodes[j+1] collapse into temp.
        nodes = nodes[:j] + [temp] + nodes[j + 2:]

        # Modify the scores list. After the merge, pairs (j-1, j) and
        # (j, j+1) involve temp and must be re-scored; all other pairs are
        # untouched, so their rows/scores are reused. NOTE(review): each
        # branch assumes _get_proba returns a plain list and that
        # initialize_features on k nodes yields k-1 pairwise rows — confirm
        # against the tree_predictor implementation.
        if j == 0:
            # temp is the leftmost node: only the pair (temp, right
            # neighbour) needs a fresh score.
            _features = self.tree_predictor.extract_features(
                nodes[j], nodes[j + 1], annot_text, annot_tokens,
                annot_sentences, annot_lemma, annot_morph, annot_postag,
                annot_syntax_dep_tree)
            _scores = self._get_proba(_features)
            scores = _scores + scores[j + 2:]
            features = pd.concat([_features, features.iloc[j + 2:]])
        elif j + 1 < len(nodes):
            # temp has neighbours on both sides: re-score the two pairs
            # (left neighbour, temp) and (temp, right neighbour) — three
            # nodes produce the two replacement rows.
            _features = self.tree_predictor.initialize_features(
                [nodes[j - 1], nodes[j], nodes[j + 1]], annot_text,
                annot_tokens, annot_sentences, annot_lemma, annot_morph,
                annot_postag, annot_syntax_dep_tree)
            _scores = self._get_proba(_features)
            features = pd.concat(
                [features.iloc[:j - 1], _features, features.iloc[j + 2:]])
            scores = scores[:j - 1] + _scores + scores[j + 2:]
        else:
            # temp is the rightmost node: only the pair (left neighbour,
            # temp) needs a fresh score.
            _features = self.tree_predictor.extract_features(
                nodes[j - 1], nodes[j], annot_text, annot_tokens,
                annot_sentences, annot_lemma, annot_morph, annot_postag,
                annot_syntax_dep_tree)
            _scores = self._get_proba(_features)
            scores = scores[:j - 1] + _scores
            features = pd.concat([features.iloc[:j - 1], _features])

    # One candidate pair left: if it is still confident enough, merge it
    # into a single root; otherwise return the remaining forest as-is.
    relation = self._get_relation(features.iloc[0])
    relation, nuclearity = relation.split('_')
    if len(scores) == 1 and scores[0] > self.confidence_threshold:
        root = DiscourseUnit(
            id=max_id + 1,
            left=nodes[0],
            right=nodes[1],
            relation=relation,
            nuclearity=nuclearity,
            proba=min(1., scores[0]),
            text=annot_text[nodes[0].start:nodes[1].end].strip())
        nodes = [root]
    return nodes