예제 #1
0
    def _build_discourse_units(self, text, tokens, numbers, start_id):
        """
        :param text: original text
        :param list tokens: isanlp.annotation.Token
        :param numbers: positions of tokens predicted as EDU left boundaries (beginners)
        :return: list of DiscourseUnit
        """

        edus = []

        if numbers.shape[0]:
            for i in range(0, len(numbers) - 1):
                new_edu = DiscourseUnit(start_id + i,
                                        start=tokens[numbers[i]].begin,
                                        end=tokens[numbers[i + 1]].begin - 1,
                                        text=text[tokens[numbers[i]].begin:tokens[numbers[i + 1]].begin],
                                        relation='elementary',
                                        nuclearity='_')
                edus.append(new_edu)

            if numbers.shape[0] == 1:
                i = -1

            new_edu = DiscourseUnit(start_id + i + 1,
                                    start=tokens[numbers[-1]].begin,
                                    end=tokens[-1].end,
                                    text=text[tokens[numbers[-1]].begin:tokens[-1].end],
                                    relation='elementary',
                                    nuclearity='_')
            edus.append(new_edu)

        return edus
예제 #2
0
    def _build_discourse_units(self, text, tokens, numbers):
        """
        :param text: original text
        :param list tokens: isanlp.annotation.Token
        :param numbers: positions of tokens predicted as EDU right boundaries
        :return: list of DiscourseUnit
        """
        edus = []

        new_edu = DiscourseUnit(0,
                                start=0,
                                end=tokens[numbers[0]].end,
                                text=text[:tokens[numbers[0]].end],
                                relation='elementary')
        edus.append(new_edu)

        for i in range(1, len(numbers)):
            new_edu = DiscourseUnit(
                i,
                start=tokens[numbers[i - 1]].end,
                end=tokens[numbers[i]].end,
                text=text[tokens[numbers[i - 1]].end:tokens[numbers[i]].end],
                relation='elementary')
            edus.append(new_edu)

        return edus
예제 #3
0
    def predict_labels(self, nodes, annot_text, annot_tokens, annot_sentences,
                       annot_lemma, annot_morph, annot_postag,
                       annot_syntax_dep_tree):
        """
        Re-predict relation and nuclearity labels for every node.

        For each node, the token spans of its left and right parts are turned
        into two temporary DiscourseUnit objects, pair features are extracted
        for them and a label of the form ``<relation>_<nuclearity>`` is
        predicted. If anything in that pipeline fails, the labels already
        stored on the node are kept.

        :param nodes: iterable of Node objects with ``left_id1``/``left_id2``/
            ``right_id1``/``right_id2`` token indices (inclusive) and
            ``left_rel``/``left_nuc``/``right_rel``/``right_nuc`` labels
        :param annot_text: forwarded to the feature extractor
        :param annot_tokens: token list; each token exposes ``.text``
        :param annot_sentences: forwarded to the feature extractor
        :param annot_lemma: forwarded to the feature extractor
        :param annot_morph: forwarded to the feature extractor
        :param annot_postag: forwarded to the feature extractor
        :param annot_syntax_dep_tree: forwarded to the feature extractor
        :return: list of new Node objects with the same spans and updated labels
        """
        result = []
        for node in nodes:
            left_node = DiscourseUnit(
                id=0,
                text=' '.join(
                    tok.text
                    for tok in annot_tokens[node.left_id1:node.left_id2 + 1]))
            right_node = DiscourseUnit(
                id=1,
                text=' '.join(
                    tok.text
                    for tok in annot_tokens[node.right_id1:node.right_id2 + 1]))

            try:
                pair_feature = self.tree_predictor.extract_features(
                    left_node, right_node, annot_text, annot_tokens,
                    annot_sentences, annot_lemma, annot_morph, annot_postag,
                    annot_syntax_dep_tree)
                relation = self._get_relation(pair_feature)
                # Raises ValueError (handled below) unless the label has
                # exactly one underscore.
                relation, nuclearity = relation.split('_')
            except Exception:
                # Best effort: keep the labels the node already carries.
                # `Exception` (not a bare except) so that KeyboardInterrupt and
                # SystemExit still propagate.
                print('Unknown error occured.')
                left_relation = node.left_rel
                left_nuclearity = node.left_nuc
                right_relation = node.right_rel
                right_nuclearity = node.right_nuc
            else:
                left_nuclearity = 'Satellite' if nuclearity == 'SN' else 'Nucleus'
                right_nuclearity = 'Satellite' if nuclearity == 'NS' else 'Nucleus'

                # The satellite side carries the relation name; the nucleus
                # opposite a satellite is labelled 'span'. With no satellite
                # both sides carry the relation.
                left_relation = 'span' if nuclearity == 'NS' else relation
                right_relation = 'span' if nuclearity == 'SN' else relation

            result.append(
                Node(left_id1=node.left_id1,
                     left_nuc=left_nuclearity,
                     left_rel=left_relation,
                     left_id2=node.left_id2,
                     right_id1=node.right_id1,
                     right_nuc=right_nuclearity,
                     right_rel=right_relation,
                     right_id2=node.right_id2))

        return result
예제 #4
0
 def __call__(self,
              left_node,
              right_node,
              proba=1.,
              text="",
              start=None,
              end=None,
              relation="",
              nuclearity=""):
     """
     Create a DiscourseUnit that joins two child nodes under a fresh id.

     ``self.id`` is incremented first and the new value becomes the id of the
     returned unit.

     :param left_node: stored as the ``left`` child
     :param right_node: stored as the ``right`` child
     :param proba: passed through as ``proba``
     :param text: accepted but not used here; the unit receives ``self.text``
         as its ``orig_text`` instead
     :param start: passed through as ``start``
     :param end: passed through as ``end``
     :param relation: passed through as ``relation``
     :param nuclearity: passed through as ``nuclearity``
     :return: the newly created DiscourseUnit
     """
     self.id += 1
     fresh_id = self.id

     return DiscourseUnit(
         id=fresh_id,
         left=left_node,
         right=right_node,
         relation=relation,
         nuclearity=nuclearity,
         proba=proba,
         start=start,
         end=end,
         orig_text=self.text,
     )
예제 #5
0
    def docs_structure_to_du(self, nodes, tokens, _tok_min, _tok_max):
        """
        Recursively convert a flat list of nodes into a DiscourseUnit tree.

        The node returned by ``TopDownRSTParser.define_root`` for the token
        range ``[_tok_min, _tok_max]`` becomes the root of the subtree. Each of
        its sides is either a single token (turned into an elementary unit) or
        a wider range that is converted recursively. When no root is found,
        the whole range becomes one elementary unit.

        ``self._id`` is incremented once for every DiscourseUnit created, so
        children always get smaller ids than their parent.

        :param nodes: nodes to search for the root of the current range
        :param tokens: token list; each token exposes ``.begin``, ``.end``
            and ``.text``
        :param _tok_min: index of the first token of the range (inclusive)
        :param _tok_max: index of the last token of the range (inclusive)
        :return: DiscourseUnit covering the range
        """
        def elementary_unit(first_tok, last_tok):
            # Leaf unit spanning tokens[first_tok..last_tok], both inclusive.
            self._id += 1
            return DiscourseUnit(
                id=self._id,
                start=tokens[first_tok].begin,
                end=tokens[last_tok].end,
                text=' '.join(
                    tok.text for tok in tokens[first_tok:last_tok + 1]),
                relation='elementary')

        root = TopDownRSTParser.define_root(nodes, _tok_min, _tok_max)
        if not root:
            return elementary_unit(_tok_min, _tok_max)

        # The relation name is read from the satellite side; with no satellite
        # it is read from the left side.
        if root.left_nuc == 'Satellite':
            nuc, rel = 'SN', root.left_rel
        elif root.right_nuc == 'Satellite':
            nuc, rel = 'NS', root.right_rel
        else:
            nuc, rel = 'NN', root.left_rel

        if root.left_id1 == root.left_id2:
            left = elementary_unit(root.left_id1, root.left_id2)
        else:
            left = self.docs_structure_to_du(nodes, tokens, root.left_id1,
                                             root.left_id2)

        if root.right_id1 == root.right_id2:
            # Offsets come from the right span (the original reused the left
            # span's indices here, giving the right leaf wrong start/end).
            right = elementary_unit(root.right_id1, root.right_id2)
        else:
            right = self.docs_structure_to_du(nodes, tokens, root.right_id1,
                                              root.right_id2)

        self._id += 1
        return DiscourseUnit(
            id=self._id,
            start=tokens[root.left_id1].begin,
            # The merged unit ends where its right part ends, matching the
            # token range used for `text` below.
            end=tokens[root.right_id2].end,
            relation=rel,
            nuclearity=nuc,
            text=' '.join(
                tok.text for tok in tokens[root.left_id1:root.right_id2 + 1]),
            left=left,
            right=right)
예제 #6
0
    def __call__(self,
                 edus,
                 annot_text,
                 annot_tokens,
                 annot_sentences,
                 annot_lemma,
                 annot_morph,
                 annot_postag,
                 annot_syntax_dep_tree,
                 genre=None):
        """
        Greedily merge adjacent discourse units bottom-up.

        ``scores[k]`` / ``features.iloc[k]`` describe the candidate pair
        ``(nodes[k], nodes[k + 1])``. While more than one pair is left and at
        least one score exceeds ``self.confidence_threshold``, the best-scoring
        pair is merged into a new unit and only the pairs touching that unit
        are re-scored. If exactly one pair remains and its score exceeds the
        threshold, it is merged into a single root.

        :param list edus: DiscourseUnit
        :param str annot_text: original text
        :param list annot_tokens: isanlp.annotation.Token
        :param list annot_sentences: isanlp.annotation.Sentence
        :param annot_lemma: lists of str for each sentence
        :param annot_morph: forwarded to the feature extractor
        :param list annot_postag: lists of str for each sentence
        :param annot_syntax_dep_tree: list of isanlp.annotation.WordSynt for each sentence
        :param genre: assigned to ``self.tree_predictor.genre`` before any features are built
        :return: list of DiscourseUnit containing each extracted tree
        """
        self.tree_predictor.genre = genre

        # Annotation arguments are always forwarded together, in this order.
        annotations = (annot_text, annot_tokens, annot_sentences, annot_lemma,
                       annot_morph, annot_postag, annot_syntax_dep_tree)

        nodes = edus
        max_id = self._get_max_id(nodes)

        # initialize scores
        features = self.tree_predictor.initialize_features(nodes, *annotations)
        # NOTE(review): `scores` is combined with `+` and slices below, so
        # `_get_proba` is assumed to return a plain list -- confirm.
        scores = self._get_proba(features)

        while len(scores) > 1 and any(
                score > self.confidence_threshold for score in scores):
            # select two nodes to merge: position of the best pair in the list
            j = np.argmax(np.array(scores))

            # make the new node by merging node[j] + node[j+1]
            relation, nuclearity = self._get_relation(
                features.iloc[j]).split('_')
            max_id += 1
            merged = DiscourseUnit(
                id=max_id,
                left=nodes[j],
                right=nodes[j + 1],
                relation=relation,
                nuclearity=nuclearity,
                proba=min(1., scores[j]),
                text=annot_text[nodes[j].start:nodes[j + 1].end].strip())

            # modify the node list: the merged unit now sits at position j
            nodes = nodes[:j] + [merged] + nodes[j + 2:]

            # modify the scores list: re-score only the pairs that involve the
            # merged unit and splice them over the stale entries
            if j == 0:
                # merged unit is first: only the pair (merged, right neighbour)
                _features = self.tree_predictor.extract_features(
                    nodes[j], nodes[j + 1], *annotations)
                _scores = self._get_proba(_features)
                scores = _scores + scores[j + 2:]
                features = pd.concat([_features, features.iloc[j + 2:]])

            elif j + 1 < len(nodes):
                # merged unit is in the middle: pairs with both neighbours
                _features = self.tree_predictor.initialize_features(
                    [nodes[j - 1], nodes[j], nodes[j + 1]], *annotations)
                _scores = self._get_proba(_features)
                features = pd.concat(
                    [features.iloc[:j - 1], _features, features.iloc[j + 2:]])
                scores = scores[:j - 1] + _scores + scores[j + 2:]

            else:
                # merged unit is last: only the pair (left neighbour, merged)
                _features = self.tree_predictor.extract_features(
                    nodes[j - 1], nodes[j], *annotations)
                _scores = self._get_proba(_features)
                scores = scores[:j - 1] + _scores
                features = pd.concat([features.iloc[:j - 1], _features])

        if len(scores) == 1 and scores[0] > self.confidence_threshold:
            # Predict the root label only when a root is actually built: with
            # no candidate pairs `features.iloc[0]` would raise IndexError.
            relation, nuclearity = self._get_relation(
                features.iloc[0]).split('_')
            root = DiscourseUnit(
                id=max_id + 1,
                left=nodes[0],
                right=nodes[1],
                relation=relation,
                nuclearity=nuclearity,
                proba=min(1., scores[0]),
                text=annot_text[nodes[0].start:nodes[1].end].strip())
            nodes = [root]

        return nodes