예제 #1
0
def optimal_tree_construction(span_to_label, sentence, span_to_on_score):
    """Build the best-scoring tree that respects the labeled spans.

    The tree is assembled recursively over ``(left, right)`` spans. A span
    found in ``span_to_label`` becomes an ``InternalParseNode`` and adds
    ``span_to_on_score[(left, right)]`` to the score; any other span simply
    contributes its children. Among the candidate split points of a span,
    the one with the highest total score wins (the first one on ties).

    Args:
        span_to_label: Mapping from ``(left, right)`` to a non-empty label.
            Must contain the root span ``(0, len(sentence))``.
        sentence: Sequence of ``(tag, word)`` pairs.
        span_to_on_score: Mapping from ``(left, right)`` to the score gained
            by turning that span on; read for every labeled span used.

    Returns:
        The single root node of the constructed tree.

    Raises:
        AssertionError: If the root span is not labeled, if a stored label
            equals ``()``, or if no split point can be found for a span.
    """
    sentence_length = len(sentence)
    # Fail fast: the recursion below asserts the very same condition on its
    # first call, so checking afterwards could never catch anything.
    assert (0, sentence_length) in span_to_label

    # Spans for which check_overlap reports a clash with some labeled span.
    conflicting = {
        span_a
        for span_a in span_to_label
        if any(check_overlap(span_a, span_b) for span_b in span_to_label)
    }
    cache = {}

    def helper(left, right):
        """Return ``(trees, score)`` for the span ``[left, right)``."""
        key = (left, right)
        if key in cache:
            return cache[key]

        if key in span_to_label:
            label = span_to_label[key]
            assert label != ()
        else:
            # Only the root span is required to carry a label.
            assert left != 0 or right != sentence_length
            label = ()

        if right - left == 1:
            tag, word = sentence[left]
            tree = LeafParseNode(left, tag, word)
            score = 0
            if label:
                tree = InternalParseNode(label, [tree])
                score += span_to_on_score[key]
            return [tree], score

        # Scan split points from the right. Every labeled left part is a
        # candidate; the scan stops at the first one that is not conflicting.
        # ``left + 1`` is always available as a fallback and, thanks to the
        # ``elif``, is recorded only once even when it is itself labeled.
        split_options = []
        for split in range(right - 1, left, -1):
            if (left, split) in span_to_label:
                split_options.append(split)
                if (left, split) not in conflicting:
                    break
            elif split == left + 1:
                split_options.append(split)
        assert len(split_options) > 0

        best_option_score = None
        best_option = None
        for split in split_options:
            left_trees, left_score = helper(left, split)
            right_trees, right_score = helper(split, right)
            children = left_trees + right_trees
            score = left_score + right_score
            if label:
                children = [InternalParseNode(label, children)]
                score += span_to_on_score[key]

            # Strict comparison: the earliest candidate wins ties.
            if best_option_score is None or score > best_option_score:
                best_option_score = score
                best_option = children

        response = best_option, best_option_score
        cache[key] = response
        return response

    trees, _ = helper(0, sentence_length)
    assert len(trees) == 1, len(trees)
    return trees[0]
예제 #2
0
    def aggressive_annotation(self,
                              sentence,
                              sentence_number,
                              span_to_gold_label,
                              low_conf_cutoff,
                              seen):
        """Pick the spans of one sentence that should be sent for annotation.

        Every span in ``span_to_gold_label`` is scored with the label
        network. A span is requested when its label entropy exceeds
        ``low_conf_cutoff`` while the empty-label probability is below 0.5,
        or when it is predicted "on" (some label beats the empty label) and
        ``check_overlap`` reports a clash with another predicted-on span.
        Clashing requests get their ``entropy`` overwritten with 10.

        Side effect: ``span_to_gold_label`` is pruned in place. Spans in
        ``seen`` are removed, and so are spans predicted empty with near
        certainty (entropy < 1e-5 and empty-label probability > 0.99).

        Args:
            sentence: Input passed to ``self._featurize_sentence``.
            sentence_number: Identifier copied into every request.
            span_to_gold_label: Mapping ``(start, end) -> gold label``;
                mutated as described above.
            low_conf_cutoff: Entropy threshold for a low-confidence request.
            seen: Container of ``(start, end)`` spans to skip.

        Returns:
            A list of request dicts with keys ``sentence_number``, ``left``,
            ``right``, ``entropy``, ``non_constituent_probability`` and
            ``label``. Each request appears at most once.
        """
        if not span_to_gold_label:
            return []

        lstm_outputs = self._featurize_sentence(sentence, is_train=False)
        # Snapshot the keys: entries are deleted from the dict further down.
        spans = list(span_to_gold_label)
        encodings = [self._get_span_encoding(start, end, lstm_outputs)
                     for (start, end) in spans]
        label_scores = self.f_label(dy.concatenate_to_batch(encodings))
        label_scores_reshaped = dy.reshape(
            label_scores, (self.label_vocab.size, len(encodings)))
        # Column ``index`` holds the label distribution of ``spans[index]``.
        label_probabilities_np = dy.softmax(label_scores_reshaped).npvalue()

        empty_index = self.empty_label_index
        low_confidence_labels = []
        on_labels = []
        for index, span in enumerate(spans):
            if span in seen:
                del span_to_gold_label[span]
                continue

            start, end = span
            distribution = label_probabilities_np[:, index]
            entropy = stats.entropy(distribution)
            # Read through empty_label_index everywhere rather than a
            # hard-coded 0, so all uses of the empty-label probability agree.
            empty_probability = distribution[empty_index]
            annotation_request = dict(
                sentence_number=sentence_number,
                left=start,
                right=end,
                entropy=entropy,
                non_constituent_probability=empty_probability,
                label=span_to_gold_label[span],
            )

            if low_conf_cutoff < entropy and empty_probability < 0.5:
                low_confidence_labels.append(annotation_request)
            elif entropy < 10 ** -5 and empty_probability > 0.99:
                del span_to_gold_label[span]
            if np.max(distribution) > empty_probability:
                on_labels.append(annotation_request)

        # Spans are dict keys, hence unique: use them to keep each request
        # from being queued more than once.
        queued = {(request['left'], request['right'])
                  for request in low_confidence_labels}

        def flag_conflict(request):
            # 10 is presumably meant to exceed any real entropy value so
            # that clashing spans rank first -- TODO confirm with consumer.
            request['entropy'] = 10
            span_key = (request['left'], request['right'])
            if span_key not in queued:
                queued.add(span_key)
                low_confidence_labels.append(request)

        for index, label_a in enumerate(on_labels):
            span_a = (label_a['left'], label_a['right'])
            for label_b in on_labels[index + 1:]:
                span_b = (label_b['left'], label_b['right'])
                if check_overlap(span_a, span_b):
                    flag_conflict(label_a)
                    flag_conflict(label_b)

        return low_confidence_labels
예제 #3
0
 def test_must_check_if_there_is_an_overlap(self):
     """check_overlap must report True for (1, 10) and (22, 7).

     The second pair is written with the larger endpoint first; presumably
     this exercises endpoint-order independence -- confirm against
     question_a.check_overlap.
     """
     first_line = (1, 10)
     second_line = (22, 7)
     self.assertTrue(question_a.check_overlap(first_line, second_line))
예제 #4
0
 def test_must_check_if_there_is_no_overlap(self):
     """check_overlap must report False for (1, 10) and (11, 22)."""
     first_line = (1, 10)
     second_line = (11, 22)
     self.assertFalse(question_a.check_overlap(first_line, second_line))