Example #1
from typing import Iterable, Union


def _collapse_entities_in_doc(doc, entities_to_collapse: Iterable[Entity],
                              entity_types_to_collapse: Union[set, frozenset]):
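    """Collapse ``entities_to_collapse`` in a copy of ``doc``.

    The original document is not modified: tokens, token features and the
    borders of sentences, entities and "ne" extras are copied and adjusted,
    and relations are remapped onto the new entities when present.  Returns
    the rebuilt document and a mapping from the original
    ``entities_to_collapse`` to their counterparts in it.
    """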

    if set(doc.extras.keys()).difference({"ne"}):
        raise Exception("Currently only 'ne' extras are supported")

    # copy tokens and features so the original document is not modified
    tokens_to_process = list(doc.tokens)
    token_features_to_process = {
        k: list(v)
        for k, v in doc.token_features.items()
    }

    borders_to_change = {
        'entities_to_collapse': build_borders_dict(entities_to_collapse),
        'sentences': build_borders_dict(doc.sentences)
    }
    try:
        borders_to_change["entities"] = build_borders_dict(doc.entities)
    except ValueError:
        pass

    if "ne" in doc.extras:
        borders_to_change["ne"] = build_borders_dict(doc.extras["ne"])

    _collapse_entities_and_correct_features(entities_to_collapse,
                                            tokens_to_process,
                                            token_features_to_process,
                                            entity_types_to_collapse,
                                            borders_to_change)

    sentences_mapping = create_objects_with_new_borders(
        doc.sentences, borders_to_change['sentences'])
    collapsed_entities_mapping = create_objects_with_new_borders(
        entities_to_collapse, borders_to_change['entities_to_collapse'])

    if 'entities' in borders_to_change:
        doc_entities_mapping = create_objects_with_new_borders(
            doc.entities, borders_to_change['entities'])
        doc_entities = doc_entities_mapping.values()
    else:
        doc_entities = None

    if "ne" in doc.extras:
        ne_mapping = create_objects_with_new_borders(doc.extras["ne"],
                                                     borders_to_change["ne"])
        extras = {"ne": SortedSpansSet(ne_mapping.values())}
    else:
        extras = None

    doc_to_process = Document(doc.name,
                              tokens_to_process,
                              sentences_mapping.values(),
                              doc.paragraphs,
                              doc_entities,
                              token_features=token_features_to_process,
                              extras=extras)

    try:
        relations = [
            Relation(doc_entities_mapping[r.first_entity],
                     doc_entities_mapping[r.second_entity], r.type)
            for r in doc.relations
        ]
        doc_to_process = doc_to_process.with_relations(relations)
    except ValueError:
        pass

    return doc_to_process, collapsed_entities_mapping
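
A minimal usage sketch (the entity type "t1" and the source Document `doc` are hypothetical; the helpers used above must be importable from the same module). The function leaves the input document untouched and returns the rebuilt document together with a mapping from the original entities to their collapsed counterparts:

# hypothetical call: collapse all entities of a previously built Document `doc`
collapsed_doc, mapping = _collapse_entities_in_doc(doc, doc.entities,
                                                   entity_types_to_collapse=frozenset({"t1"}))
new_first_entity = mapping[doc.entities[0]]  # collapsed counterpart of the first entity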
Example #2
def make_document_from_json_file(file_path):
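    """Build a Document from a JSON file.

    The JSON object should contain a ``tokens`` list and may contain
    ``sentences``, ``paragraphs``, ``entities`` and ``relations`` lists as
    well as per-token feature lists (``pos``, ``dt_labels``, ...).
    Entities are encoded as ``[id, start_token, end_token, type]``,
    sentences and paragraphs as ``[start, end]`` pairs, and relations as
    ``[first_entity_id, second_entity_id, type]``.
    """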
    d = load_json_file_as_dict(file_path)

    tokens = d.get('tokens', [])
    entities = d.get('entities', [])
    sentences = d.get('sentences', [])
    paragraphs = d.get('paragraphs', [])
    token_features = {}

    for feature in [
            'pos', 'entities_types', 'entities_depths', 'borders', 'dt_labels',
            'dt_head_distances', 'dt_depths', 'dt_deltas_forward',
            'dt_deltas_backward', 'dt_breakups_forward', 'dt_breakups_backward'
    ]:
        if feature in d:
            token_features[feature] = d[feature]

    relations = d.get('relations', [])

    doc_entities = []
    for ent in entities:
        id_, start_token, end_token, ent_type = ent
        doc_entities.append(Entity(id_, start_token, end_token, ent_type))

    doc_sentences = []

    for sent in sentences:
        start_token, end_token = sent
        doc_sentences.append(Sentence(start_token, end_token))

    doc_paragraphs = []

    for par in paragraphs:
        start_sentence, end_sentence = par
        doc_paragraphs.append(Paragraph(start_sentence, end_sentence))

    doc_relations = []

    # resolve relation argument ids to the corresponding Entity objects
    for rel in relations:
        e1 = None
        e2 = None
        e1_id, e2_id, rel_type = rel

        for entity in doc_entities:
            if entity.id == e1_id:
                e1 = entity
            if entity.id == e2_id:
                e2 = entity

            if e1 is not None and e2 is not None:
                break

        doc_relations.append(Relation(e1, e2, rel_type))

    doc = Document("",
                   tokens,
                   doc_sentences,
                   doc_paragraphs,
                   token_features=token_features)
    if 'entities' in d:
        doc = doc.with_entities(doc_entities)
    if 'relations' in d:
        doc = doc.with_relations(doc_relations)
    return doc
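
A minimal input sketch for the loader above (file name, tokens and label values are hypothetical; spans appear to be half-open index pairs, as in the fixture of Example #3):

import json

doc_json = {
    "tokens": ["John", "lives", "in", "Berlin", "."],
    "sentences": [[0, 5]],                                   # [start_token, end_token]
    "paragraphs": [[0, 1]],                                  # [start_sentence, end_sentence]
    "entities": [["e1", 0, 1, "PER"], ["e2", 3, 4, "LOC"]],  # [id, start_token, end_token, type]
    "relations": [["e1", "e2", "lives_in"]],                 # [first_entity_id, second_entity_id, type]
}

with open("example_doc.json", "w", encoding="utf-8") as f:
    json.dump(doc_json, f)

doc = make_document_from_json_file("example_doc.json")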
Example #3
class TestCandidatesExtraction(unittest.TestCase):
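    """Tests for the candidate filters and the pair/candidate extraction
    strategies on a small two-sentence fixture document."""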
    def setUp(self) -> None:
        tokens = [
            "I", "will", "do", "my", "homework", "today", ".", "It", "is",
            "very", "hard", "but", "i", "don't", "care", "."
        ]
        sentences = [Sentence(0, 7), Sentence(7, 16)]
        paragraphs = [Paragraph(0, 2)]
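        # entity spans are half-open token ranges; the tests below refer to them by index:
        #   0: [0, 1)   "I"            t1
        #   1: [3, 5)   "my homework"  t2
        #   2: [7, 8)   "It"           t1
        #   3: [9, 11)  "very hard"    t2
        #   4: [10, 11) "hard"         t4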
        entities = [
            Entity("_", 0, 1, "t1"),
            Entity("_", 3, 5, "t2"),
            Entity("_", 7, 8, "t1"),
            Entity("_", 9, 11, "t2"),
            Entity("_", 10, 11, "t4")
        ]

        self.doc = Document("_", tokens, sentences, paragraphs, entities)
        self.relations = {
            Relation(entities[2], entities[3], "t1"),
            Relation(entities[3], entities[4], "t2")
        }

    def test_DifferentEntitiesCandidateFilter(self):
        f = DifferentEntitiesCandidateFilter()
        self.assertTrue(
            f.apply(self.doc, self.doc.entities[0], self.doc.entities[1]))
        self.assertFalse(
            f.apply(self.doc, self.doc.entities[2], self.doc.entities[2]))

    def test_InSameSentenceCandidateFilter(self):
        f = InSameSentenceCandidateFilter()
        self.assertTrue(
            f.apply(self.doc, self.doc.entities[0], self.doc.entities[1]))
        self.assertTrue(
            f.apply(self.doc, self.doc.entities[2], self.doc.entities[3]))
        self.assertFalse(
            f.apply(self.doc, self.doc.entities[0], self.doc.entities[3]))
        self.assertFalse(
            f.apply(self.doc, self.doc.entities[2], self.doc.entities[1]))

    def test_MaxTokenDistanceCandidateFilter_intersecting_case(self):
        f = MaxTokenDistanceCandidateFilter(0)
        self.assertFalse(
            f.apply(self.doc, self.doc.entities[0], self.doc.entities[3]))
        self.assertFalse(
            f.apply(self.doc, self.doc.entities[2], self.doc.entities[1]))
        self.assertFalse(
            f.apply(self.doc, self.doc.entities[4], self.doc.entities[2]))
        self.assertTrue(
            f.apply(self.doc, self.doc.entities[3], self.doc.entities[4]))
        self.assertTrue(
            f.apply(self.doc, self.doc.entities[4], self.doc.entities[3]))

    def test_MaxTokenDistanceCandidateFilter_normal_case(self):
        f = MaxTokenDistanceCandidateFilter(3)
        self.assertFalse(
            f.apply(self.doc, self.doc.entities[0], self.doc.entities[3]))
        self.assertFalse(
            f.apply(self.doc, self.doc.entities[0], self.doc.entities[2]))
        self.assertTrue(
            f.apply(self.doc, self.doc.entities[1], self.doc.entities[2]))
        self.assertTrue(
            f.apply(self.doc, self.doc.entities[2], self.doc.entities[3]))

    def test_RelArgTypesCandidateFilter(self):
        valid_types = {("t1", "t1"), ("t2", "t4")}
        f = RelArgTypesCandidateFilter(valid_types)

        self.assertTrue(
            f.apply(self.doc, self.doc.entities[0], self.doc.entities[0]))
        self.assertTrue(
            f.apply(self.doc, self.doc.entities[0], self.doc.entities[2]))
        self.assertTrue(
            f.apply(self.doc, self.doc.entities[2], self.doc.entities[0]))
        self.assertTrue(
            f.apply(self.doc, self.doc.entities[1], self.doc.entities[4]))
        self.assertTrue(
            f.apply(self.doc, self.doc.entities[3], self.doc.entities[4]))
        self.assertFalse(
            f.apply(self.doc, self.doc.entities[4], self.doc.entities[1]))
        self.assertFalse(
            f.apply(self.doc, self.doc.entities[2], self.doc.entities[3]))
        self.assertFalse(
            f.apply(self.doc, self.doc.entities[3], self.doc.entities[3]))

    def test_IntersectingCandidateFilter(self):
        f = IntersectingCandidateFilter()
        self.assertTrue(
            f.apply(self.doc, self.doc.entities[0], self.doc.entities[2]))
        self.assertTrue(
            f.apply(self.doc, self.doc.entities[2], self.doc.entities[0]))
        self.assertTrue(
            f.apply(self.doc, self.doc.entities[1], self.doc.entities[4]))
        self.assertFalse(
            f.apply(self.doc, self.doc.entities[0], self.doc.entities[0]))
        self.assertFalse(
            f.apply(self.doc, self.doc.entities[3], self.doc.entities[4]))
        self.assertFalse(
            f.apply(self.doc, self.doc.entities[4], self.doc.entities[3]))

    def test_AndFilter(self):
        filts = [
            DifferentEntitiesCandidateFilter(),
            InSameSentenceCandidateFilter(),
            RelArgTypesCandidateFilter({("t1", "t1"), ("t2", "t4")})
        ]

        f = AndFilter(filts)

        self.assertFalse(
            f.apply(self.doc, self.doc.entities[0], self.doc.entities[0]))
        self.assertFalse(
            f.apply(self.doc, self.doc.entities[0], self.doc.entities[2]))

        self.assertFalse(
            f.apply(self.doc, self.doc.entities[1], self.doc.entities[4]))
        self.assertFalse(
            f.apply(self.doc, self.doc.entities[2], self.doc.entities[4]))

        self.assertTrue(
            f.apply(self.doc, self.doc.entities[3], self.doc.entities[4]))

    def test_DefaultPairExtractionStrategy_no_rels(self):
        filts = [
            DifferentEntitiesCandidateFilter(),
            InSameSentenceCandidateFilter(),
            RelArgTypesCandidateFilter({("t1", "t2"), ("t2", "t4")})
        ]

        expected_pairs = [(self.doc.entities[0], self.doc.entities[1]),
                          (self.doc.entities[2], self.doc.entities[3]),
                          (self.doc.entities[3], self.doc.entities[4])]

        strategy = DefaultPairExtractionStrategy(AndFilter(filts))
        actual_pairs = strategy.apply(self.doc, include_labels=False)

        self.assertEqual(actual_pairs, expected_pairs)

    def test_DefaultPairExtractionStrategy_with_rels(self):
        filts = [
            DifferentEntitiesCandidateFilter(),
            InSameSentenceCandidateFilter(),
            RelArgTypesCandidateFilter({("t1", "t2"), ("t2", "t4")})
        ]

        expected_pairs = [(self.doc.entities[0], self.doc.entities[1]),
                          (self.doc.entities[2], self.doc.entities[3]),
                          (self.doc.entities[3], self.doc.entities[4])]

        strategy = DefaultPairExtractionStrategy(AndFilter(filts))
        actual_pairs = strategy.apply(self.doc.with_relations(self.relations),
                                      include_labels=True)

        self.assertEqual(actual_pairs, expected_pairs)

    def test_DefaultCandidateExtractionStrategy_no_rels(self):
        filts = [
            DifferentEntitiesCandidateFilter(),
            InSameSentenceCandidateFilter(),
            RelArgTypesCandidateFilter({("t1", "t2"), ("t2", "t4")})
        ]

        expected_candidates = [
            (self.doc.entities[0], self.doc.entities[1], None),
            (self.doc.entities[2], self.doc.entities[3], None),
            (self.doc.entities[3], self.doc.entities[4], None)
        ]

        strategy = DefaultCandidateExtractionStrategy(
            DefaultPairExtractionStrategy(AndFilter(filts)))
        actual_candidates = strategy.apply(self.doc, include_labels=False)

        self.assertEqual(actual_candidates, expected_candidates)

    def test_DefaultCandidateExtractionStrategy_with_rels(self):
        filts = [
            DifferentEntitiesCandidateFilter(),
            InSameSentenceCandidateFilter(),
            RelArgTypesCandidateFilter({("t1", "t2"), ("t2", "t4")})
        ]

        expected_candidates = [
            (self.doc.entities[0], self.doc.entities[1], None),
            (self.doc.entities[2], self.doc.entities[3], "t1"),
            (self.doc.entities[3], self.doc.entities[4], "t2")
        ]

        strategy = DefaultCandidateExtractionStrategy(
            DefaultPairExtractionStrategy(AndFilter(filts)))
        actual_candidates = strategy.apply(self.doc.with_relations(
            self.relations),
                                           include_labels=True)

        self.assertEqual(actual_candidates, expected_candidates)