Exemplo n.º 1
0
    def test_entities_with_nesting_collapse(self):
        """Collapse ``Bacteria`` entities of ``self.doc`` and compare documents.

        The expected document uses a single ``$Bacteria$`` placeholder token
        for each collapsed span, with sentence and entity borders expressed
        in the coordinates of the shortened token sequence.
        """
        expected_tokens = (
            "Recurrence of $Bacteria$ in Sardinia . From "
            "Oct. 30 to Nov. 7 , 1979 , 10 people "
            "in the Sardinian province of Cagliari had "
            "onset of bacteriologically confirmed $Bacteria$ ."
        ).split()

        # (id, start token, end token, type) for every expected entity
        entity_specs = (
            ("T1", 2, 3, "Habitat"),
            ("T2", 2, 3, "Bacteria"),
            ("T3", 2, 3, "Bacteria"),
            ("T4", 4, 5, "Geographical"),
            ("T5", 16, 17, "Habitat"),
            ("T6", 16, 23, "Habitat"),
            ("T7", 19, 21, "Geographical"),
            ("T8", 22, 23, "Geographical"),
            ("T9", 28, 29, "Bacteria"),
        )
        expected_entities = [Entity(*spec) for spec in entity_specs]

        # relations are given as index pairs into expected_entities
        expected_relations = [
            Relation(expected_entities[src], expected_entities[dst], "Lives_in")
            for src, dst in ((0, 1), (8, 6))
        ]

        expected_doc = Document(
            "_",
            expected_tokens,
            [Sentence(0, 6), Sentence(6, 30)],
            [Paragraph(0, 1), Paragraph(1, 2)],
            expected_entities,
            expected_relations,
        )

        collapser = EntitiesCollapser({"Bacteria"})
        self.assertEqual(expected_doc, collapser.transform(self.doc))
Exemplo n.º 2
0
    def test_inner_entities_collapse(self):
        """Collapse ``Geographical`` entities of ``self.doc`` and compare documents.

        The expected document uses a single ``$Geographical$`` placeholder
        token for each collapsed span; borders of the remaining entities and
        sentences are given relative to the shortened token sequence.
        """
        expected_tokens = (
            "Recurrence of Pelecypod-associated cholera in "
            "$Geographical$ . From Oct. 30 to Nov. 7 "
            ", 1979 , 10 people in the $Geographical$ "
            "of $Geographical$ had onset of bacteriologically "
            "confirmed cholera ."
        ).split()

        # (id, start token, end token, type) for every expected entity
        entity_specs = (
            ("T1", 2, 3, "Habitat"),
            ("T2", 2, 4, "Bacteria"),
            ("T3", 3, 4, "Bacteria"),
            ("T4", 5, 6, "Geographical"),
            ("T5", 17, 18, "Habitat"),
            ("T6", 17, 23, "Habitat"),
            ("T7", 20, 21, "Geographical"),
            ("T8", 22, 23, "Geographical"),
            ("T9", 28, 29, "Bacteria"),
        )
        expected_entities = [Entity(*spec) for spec in entity_specs]

        # relations are given as index pairs into expected_entities
        expected_relations = [
            Relation(expected_entities[src], expected_entities[dst], "Lives_in")
            for src, dst in ((0, 1), (8, 6))
        ]

        expected_doc = Document(
            "_",
            expected_tokens,
            [Sentence(0, 7), Sentence(7, 30)],
            [Paragraph(0, 1), Paragraph(1, 2)],
            expected_entities,
            expected_relations,
        )

        collapser = EntitiesCollapser({"Geographical"})
        self.assertEqual(expected_doc, collapser.transform(self.doc))
Exemplo n.º 3
0
    def setUp(self):
        """Create ``self.doc``: a two-sentence document with nested entities.

        The fixture holds 31 tokens, nine entities (some sharing or nesting
        spans) and two ``Lives_in`` relations between them.
        """
        tokens = (
            "Recurrence of Pelecypod-associated cholera in "
            "Sardinia . From Oct. 30 to Nov. 7 , "
            "1979 , 10 people in the Sardinian province "
            "of Cagliari had onset of bacteriologically "
            "confirmed cholera ."
        ).split()

        # (id, start token, end token, type) for every entity
        entity_specs = (
            ("T1", 2, 3, "Habitat"),
            ("T2", 2, 4, "Bacteria"),
            ("T3", 3, 4, "Bacteria"),
            ("T4", 5, 6, "Geographical"),
            ("T5", 17, 18, "Habitat"),
            ("T6", 17, 24, "Habitat"),
            ("T7", 20, 22, "Geographical"),
            ("T8", 23, 24, "Geographical"),
            ("T9", 29, 30, "Bacteria"),
        )
        entities = [Entity(*spec) for spec in entity_specs]

        # relations are given as index pairs into entities
        relations = [
            Relation(entities[src], entities[dst], "Lives_in")
            for src, dst in ((0, 1), (8, 6))
        ]

        self.doc = Document(
            "_",
            tokens,
            [Sentence(0, 7), Sentence(7, 31)],
            [Paragraph(0, 1), Paragraph(1, 2)],
            entities,
            relations,
        )
Exemplo n.º 4
0
    def test_2_chains_2_pron(self):
        """Check ``get_pron_samples`` on a document with two pronouns.

        The document holds three nouns and two pronouns linked by three
        relations of type ``'1'``; the expected result lists four
        (entity, entity, label) samples in a fixed order.
        """
        def noun(start):
            # one-token entity of type 'noun' starting at `start`
            return Entity('_', start, start + 1, 'noun')

        def pron(start):
            # one-token entity of type 'pron' starting at `start`
            return Entity('_', start, start + 1, 'pron')

        entities = [noun(0), pron(1), pron(2), noun(3), noun(5)]
        rels = {
            Relation(noun(0), pron(2), '1'),
            Relation(pron(1), noun(3), '1'),
            Relation(noun(3), noun(5), '1'),
        }
        doc = Document('test', [], [Sentence(0, 10)], [Paragraph(0, 1)],
                       entities, rels)

        actual_samples = get_pron_samples(doc, 3, True)

        expected_samples = [
            (noun(0), pron(1), None),
            (pron(1), noun(3), '1'),
            (noun(0), pron(2), '1'),
            (pron(2), noun(3), None),
        ]
        self.assertEqual(actual_samples, expected_samples)
Exemplo n.º 5
0
def _get_relations(raw_relations: list, entities_dict: dict, symmetric_types: set):
    relations = set()
    for rel in raw_relations:
        e1 = entities_dict[rel['first']]
        e2 = entities_dict[rel['second']]
        rel_type = rel['type']

        relations.add(Relation(e1, e2, rel_type))

        if symmetric_types and rel_type in symmetric_types:
            relations.add(Relation(e2, e1, rel_type))

    return relations
Exemplo n.º 6
0
 def _get_relations(self, predictions: dict) -> set:
     rels = set()
     for (e1, e2), label in predictions.items():
         rel_type = self.extractor.get_type(label)
         if rel_type is not None:
             rels.add(Relation(e1, e2, rel_type))
     return rels
Exemplo n.º 7
0
    def setUp(self) -> None:
        """Create ``self.doc`` (without relations) and a separate ``self.relations`` set."""
        tokens = (
            "I will do my homework today . It is "
            "very hard but i don't care ."
        ).split()

        # (start token, end token, type) for every entity
        spans = ((0, 1, "t1"), (3, 5, "t2"), (7, 8, "t1"), (9, 11, "t2"),
                 (10, 11, "t4"))
        entities = [Entity("_", start, end, kind) for start, end, kind in spans]

        self.doc = Document(
            "_",
            tokens,
            [Sentence(0, 7), Sentence(7, 16)],
            [Paragraph(0, 2)],
            entities,
        )
        # relations are kept outside the document
        self.relations = {
            Relation(entities[src], entities[dst], kind)
            for src, dst, kind in ((2, 3, "t1"), (3, 4, "t2"))
        }
Exemplo n.º 8
0
def collapse_intersecting_entities(entities: List[Entity],
                                   relations: Set[Relation]):
    """Merge entities whose spans intersect and remap relations accordingly.

    The first unprocessed entity absorbs every later entity that starts
    before the current end of the merged span. The merged entity starts where
    the first entity starts, ends at the largest absorbed end, and takes the
    type of the last absorbed entity that is longer than the first one (or
    the first entity's type when there is none).

    :param entities: entities sorted by start token
    :param relations: relations between the given entities
    :return: (list of merged entities, set of relations over merged entities)
    """
    # assume entities list is sorted with start token
    pending = list(entities)
    old_to_new = {}
    merged_entities = []

    while pending:
        head = pending.pop(0)
        span_end = head.end_token
        merged_type = head.type
        absorbed = []

        for candidate in pending:
            if candidate.start_token < span_end:
                if head.type != candidate.type:
                    warn(f"Intersecting entities have different types: {head} absorbed {candidate}")
                    assert not head.coincides(candidate), \
                        "Two entities of different types on the same span"
                    assert head.contains(candidate) or candidate.contains(head) \
                        or not head.intersects(candidate), \
                        "Two entities of different types are not embedded, intersecting only"

                absorbed.append(candidate)
                span_end = max(span_end, candidate.end_token)

                # a longer absorbed entity dictates the resulting type
                if len(candidate) > len(head):
                    merged_type = candidate.type

        if absorbed:
            replacement = head.relocated(head.start_token, span_end)
            replacement = replacement.with_type(merged_type)
        else:
            replacement = head

        merged_entities.append(replacement)
        old_to_new[head] = replacement

        for candidate in absorbed:
            old_to_new[candidate] = replacement
            pending.remove(candidate)

    remapped_relations = set()
    for rel in relations:
        remapped_relations.add(
            Relation(old_to_new[rel.first_entity],
                     old_to_new[rel.second_entity], rel.type))

    # new entities list was constructed as sorted
    return merged_entities, remapped_relations
Exemplo n.º 9
0
def chain_similar_entities(
        doc: Document, entities: List[Entity],
        entity_comparator: Callable[[Document, Entity, Entity], bool] = compare_entities_by_tokens) \
            -> List[CoreferenceChain]:
    """Group entities into chains by pairwise comparison.

    Every entity is compared with all entities that precede it in the list;
    each pair accepted by ``entity_comparator`` yields a "match" relation, and
    the relations are passed to ``collect_chains`` together with the entities.

    :param doc: document handed to the comparator
    :param entities: entities to chain
    :param entity_comparator: predicate called as (doc, later, earlier)
    :return: result of ``collect_chains`` for the matched pairs
    """
    matches = {
        Relation(later, earlier, "match")
        for position, later in enumerate(entities)
        for earlier in entities[:position]
        if entity_comparator(doc, later, earlier)
    }

    return collect_chains(matches, entities)
Exemplo n.º 10
0
    def predict_doc(self, doc, include_probs=False):
        """Predict relations for a document.

        Model predictions are combined with the results of
        ``self.classifiers`` and, when available, with the relations already
        stored in the document.

        :param doc: document to process
        :param include_probs: when true, pair scores are returned as well
        :return: set of relations, or a (relations, scores) tuple when
            ``include_probs`` is set
        """
        doc = self.feature_computer.create_features_for_doc(doc)

        # parallel lists for segment features and segment entity pairs for all doc segments
        samples, entity_pairs = self.extractor.extract_features_from_doc(doc, use_filter=True)
        entity_pairs = sum(entity_pairs, [])

        outputs = ["predictions"]
        if include_probs:
            outputs.append("scores")

        batcher_factory = get_coref_batcher_factory(samples, 300, self.extractor, False, False)
        # labels, [scores]
        out = predict_for_samples(self.graph, self.session, outputs, batcher_factory)

        relations = self._get_relations(self._collect_pair_results(out[0], entity_pairs))

        classifier_scores = self.classifiers.apply(doc)

        # for every pair keep the best-scored label, unless that label is None
        best_labels = (
            (e1, e2, max(label_scores, key=label_scores.get))
            for (e1, e2), label_scores in classifier_scores.items()
        )
        relations |= {
            Relation(e1, e2, label)
            for e1, e2, label in best_labels
            if label is not None
        }

        # ValueError from doc.relations is ignored: known relations are optional
        try:
            relations |= doc.relations
        except ValueError:
            pass

        if not include_probs:
            return relations

        scores = self._get_scores(self._collect_pair_results(out[1], entity_pairs))
        scores = {**scores, **classifier_scores}
        try:
            scores = {**scores, **get_known_rel_scores(doc.relations)}
        except ValueError:
            pass

        return (relations, scores)
Exemplo n.º 11
0
    def test_2_entity_rel(self):
        """Check ``get_samples`` on two adjacent entities joined by one relation."""
        def entity(start):
            # one-token entity of type '1' starting at `start`
            return Entity('_', start, start + 1, '1')

        entities = [entity(0), entity(1)]
        rels = {Relation(entity(0), entity(1), '1')}
        doc = Document('test', [], [Sentence(0, 10)], [Paragraph(0, 1)],
                       entities, rels)

        actual_samples = get_samples(doc, 3, True)

        expected_samples = [(entity(0), entity(1), '1')]
        self.assertEqual(expected_samples, actual_samples)
Exemplo n.º 12
0
def _get_rank_rels(entities, pairs):
    relations = set()
    for entity in entities:
        best_score = 0
        best_candidate = None
        best_label = None
        for (e1, e2), scores in pairs.items():
            if entity != e2:
                continue
            max_scores = max(scores.values())
            label = max(scores, key=scores.get)
            if max_scores > best_score and label is not None:
                best_score = max_scores
                best_candidate = e1
                best_label = label
        if best_candidate is not None:
            relations.add(Relation(entity, best_candidate, best_label))
    return relations
Exemplo n.º 13
0
def collect_pron_vote_rank(pairs: Dict[tuple, dict], known_rels):
    """
    Select relations for pronouns by voting over chains of known relations.

    Chains are collected from the known relations over all non-pronoun
    mentions found in pairs. For every pronoun each non-empty chain is scored
    with the mean "COREF" score of the pairs linking the pronoun to chain
    members (0 when there are no such pairs); the pronoun is linked to the
    closest entity of the best-scored chain.
    :param pairs: scores of mention pairs
    :param known_rels: known relations
    :return: relations selected from pairs
    """
    mentions = [mention for pair in pairs for mention in (pair[0], pair[1])]
    nouns = {mention for mention in mentions if mention.type != 'pron'}
    chains = collect_chains(known_rels, list(nouns))

    pronouns = {mention for mention in mentions if mention.type == 'pron'}
    selected = set()
    for pronoun in pronouns:
        top_score = 0
        top_candidate = None
        for chain in chains:
            if not chain.entities:
                continue
            votes = []
            closest = get_closest_entity(chain, pronoun, False)
            for member in chain.entities:
                vote = None
                # when both directions are present, the (pronoun, member) score wins
                for key in ((member, pronoun), (pronoun, member)):
                    if key in pairs:
                        vote = pairs[key]["COREF"]
                if vote is not None:
                    votes.append(vote)
            mean_vote = np.mean(votes) if votes else 0
            if top_candidate is None or top_score < mean_vote:
                top_candidate = closest
                top_score = mean_vote
        if top_candidate is not None:
            selected.add(Relation(top_candidate, pronoun, "COREF"))

    return selected
Exemplo n.º 14
0
    def _fix_entity_types(docs):
        ret = []
        for doc in docs:
            new_entities = []
            entity_mapping = {}
            new_rels = []
            for entity in doc.entities:
                head = find_span_head_token(doc, entity)
                if doc.token_features['pos'][head] == 'PRON':
                    e_type = 'pron'
                else:
                    e_type = 'noun'
                new_entity = entity.with_type(e_type)
                entity_mapping[entity] = new_entity
                new_entities.append(new_entity)
            for rel in doc.relations:
                new_rels.append(
                    Relation(entity_mapping[rel.first_entity],
                             entity_mapping[rel.second_entity], rel.type))

            ret.append(
                doc.without_relations().without_entities().with_entities(
                    new_entities).with_relations(new_rels))
        return ret
Exemplo n.º 15
0
 def to_relations_set(self) -> List[Relation]:
     """Return a "COREF" relation for every ordered pair of entities.

     Each entity is paired with every entity that follows it in
     ``self.entities``; the result is a list despite the method name.
     """
     return [
         Relation(earlier, later, "COREF")
         for position, earlier in enumerate(self.entities)
         for later in self.entities[position + 1:]
     ]
Exemplo n.º 16
0
 def to_relations_chain(self) -> List[Relation]:
     """Return "COREF" relations linking each entity to the next one in ``self.entities``."""
     neighbours = zip(self.entities[:-1], self.entities[1:])
     return [Relation(earlier, later, "COREF") for earlier, later in neighbours]
Exemplo n.º 17
0
def _create_rel(idx1, idx2):
    """Build a relation of type '_' between entities created from two indices."""
    first = _create_entity(idx1)
    second = _create_entity(idx2)
    return Relation(first, second, '_')
Exemplo n.º 18
0
def _create_rel(e1, e2):
    """Build a relation of type "T1" from ``e1`` to ``e2``."""
    relation = Relation(e1, e2, "T1")
    return relation
Exemplo n.º 19
0
    def setUp(self) -> None:
        """Prepare two annotated documents, a hook and property dictionaries.

        Each document is built from tokens, sentences, paragraphs, 'pron' /
        'noun' entities, 'COREF' relations, a dict of per-token feature lists
        ('pos', 'dt_labels', 'dt_head_distances', 'lemmas', 'feats', 'said')
        and a dict with an 'ne' entry. The hook is created from copies of the
        documents whose relations are replaced by empty sets.
        """
        self.docs = [
            # first document: 47 tokens in three sentences
            Document('1',
                     ['Во', 'время', 'своих', 'прогулок', 'в', 'окрестностях', 'Симеиза', 'я', 'обратил', 'внимание',
                      'на', 'одинокую', 'дачу', ',', 'стоявшую', 'на', 'крутом', 'склоне', 'горы', '.',
                      'К', 'этой', 'даче', 'не', 'было', 'проведено', 'даже', 'дороги', '.',
                      'Кругом', 'она', 'была', 'обнесена', 'высоким', 'забором', ',', 'с', 'единственной', 'низкой',
                      'калиткой', ',', 'которая', 'всегда', 'была', 'плотно', 'прикрыта', '.'],
                     [Sentence(0, 20), Sentence(20, 29), Sentence(29, 47)],
                     [Paragraph(0, 3)],
                     [Entity('1', 2, 3, 'pron'),
                      Entity('1', 7, 8, 'pron'),
                      Entity('1', 11, 13, 'noun'),
                      Entity('1', 21, 23, 'noun'),
                      Entity('1', 30, 31, 'pron'),
                      Entity('1', 33, 35, 'noun'),
                      Entity('1', 37, 38, 'noun'),
                      Entity('1', 37, 40, 'noun'),
                      Entity('1', 41, 42, 'pron')],
                     {
                         Relation(Entity('1', 2, 3, 'pron'), Entity('1', 7, 8, 'pron'), 'COREF'),
                         Relation(Entity('1', 11, 13, 'noun'), Entity('1', 21, 23, 'noun'), 'COREF'),
                         Relation(Entity('1', 11, 13, 'noun'), Entity('1', 30, 31, 'pron'), 'COREF'),
                         Relation(Entity('1', 21, 23, 'noun'), Entity('1', 30, 31, 'pron'), 'COREF'),
                         Relation(Entity('1', 37, 40, 'noun'), Entity('1', 41, 42, 'pron'), 'COREF'),
                     },
                     {
                         'pos': ['ADP', 'NOUN', 'DET', 'NOUN', 'ADP', 'NOUN', 'PROPN', 'PRON', 'VERB', 'NOUN', 'ADP',
                                 'ADJ', 'NOUN',
                                 'PUNCT', 'VERB', 'ADP', 'ADJ', 'NOUN', 'NOUN', 'PUNCT', 'ADP', 'DET', 'NOUN', 'PART',
                                 'AUX', 'VERB',
                                 'PART', 'NOUN', 'PUNCT', 'ADV', 'PRON', 'AUX', 'VERB', 'ADJ', 'NOUN', 'PUNCT', 'ADP',
                                 'ADJ', 'ADJ',
                                 'NOUN', 'PUNCT', 'PRON', 'ADV', 'AUX', 'ADV', 'VERB', 'PUNCT'],
                         'dt_labels': ['case', 'fixed', 'amod', 'obl', 'case', 'nmod', 'nmod', 'nsubj', 'root', 'obj',
                                       'case',
                                       'amod', 'nmod', 'punct', 'amod', 'case', 'amod', 'obl', 'nmod', 'punct', 'case',
                                       'amod',
                                       'obl', 'advmod', 'aux:pass', 'root', 'advmod', 'nsubj', 'punct', 'advmod',
                                       'nsubj',
                                       'aux:pass', 'root', 'amod', 'obl', 'punct', 'case', 'amod', 'amod', 'conj',
                                       'punct', 'nsubj',
                                       'advmod', 'aux:pass', 'advmod', 'acl:relcl', 'punct'],
                         'dt_head_distances': [3, -1, 1, 5, 1, -2, -1, 1, 0, -1, 2, 1, -3, -1, -2, 2, 1, -3, -1, -1, 2,
                                               1, 3, 2, 1,
                                               0, 1, -2, -1, 3, 2, 1, 0, 1, -2, -1, 3, 2, 1, -5, -1, 4, 3, 2, 1, -11,
                                               -1],
                         'lemmas': ['во', 'время', 'свой', 'прогулка', 'в', 'окрестность', 'Симеиза', 'я', 'обращать',
                                    'внимание',
                                    'на', 'одинокий', 'дача', ',', 'стоять', 'на', 'крутой', 'склон', 'гора', '.', 'к',
                                    'этот',
                                    'дача', 'не', 'быть', 'проводить', 'даже', 'дорога', '.', 'кругом', 'она', 'быть',
                                    'обнесен',
                                    'высокий', 'забор', ',', 'с', 'единственный', 'низкий', 'калитка', ',', 'который',
                                    'всегда',
                                    'быть', 'плотно', 'прикрывать', '.'],
                         'feats': [{}, {'Case': 'Accusative', 'Animacy': 'Inanimated', 'Number': 'Singular',
                                        'Gender': 'Neuter'},
                                   {'Number': 'Plural', 'Pronoun': 'REFLEXIVE', 'Case': 'Genitive'},
                                   {'Case': 'Genitive', 'Animacy': 'Inanimated', 'Number': 'Plural',
                                    'Gender': 'Neuter'}, {},
                                   {'Case': 'Prepositional', 'Animacy': 'Inanimated', 'Number': 'Plural',
                                    'Gender': 'Masculine'},
                                   {'Case': 'Genitive', 'Animacy': 'Inanimated', 'Number': 'Singular',
                                    'Gender': 'Masculine'},
                                   {'Animacy': 'Animated', 'Gender': 'Masculine', 'Number': 'Singular',
                                    'Pronoun': 'DEICTIC',
                                    'Case': 'Nominative'},
                                   {'Number': 'Singular', 'Gender': 'Masculine', 'Tense': 'Past', 'Mode': 'Indicative'},
                                   {'Case': 'Accusative', 'Animacy': 'Inanimated', 'Number': 'Singular',
                                    'Gender': 'Neuter'}, {},
                                   {'Case': 'Accusative', 'Representation': 'Participle', 'Number': 'Singular',
                                    'Gender': 'Feminine',
                                    'Tense': 'NotPast'},
                                   {'Case': 'Accusative', 'Animacy': 'Inanimated', 'Number': 'Singular',
                                    'Gender': 'Feminine'}, {},
                                   {'Case': 'Accusative', 'Representation': 'Participle', 'Number': 'Singular',
                                    'Gender': 'Feminine',
                                    'Tense': 'Past'}, {},
                                   {'Case': 'Prepositional', 'Number': 'Singular', 'Gender': 'Masculine'},
                                   {'Case': 'Prepositional', 'Animacy': 'Inanimated', 'Number': 'Singular',
                                    'Gender': 'Masculine'},
                                   {'Case': 'Genitive', 'Animacy': 'Inanimated', 'Number': 'Singular',
                                    'Gender': 'Feminine'}, {}, {},
                                   {'Case': 'Dative', 'Number': 'Singular', 'Gender': 'Feminine'},
                                   {'Case': 'Dative', 'Animacy': 'Inanimated', 'Number': 'Singular',
                                    'Gender': 'Feminine'}, {},
                                   {'Number': 'Singular', 'Gender': 'Neuter', 'Tense': 'Past', 'Mode': 'Indicative'},
                                   {'Representation': 'Participle', 'Number': 'Singular', 'Gender': 'Neuter',
                                    'Shortness': 'Short',
                                    'Tense': 'Past', 'Voice': 'Passive'}, {},
                                   {'Case': 'Genitive', 'Animacy': 'Inanimated', 'Number': 'Singular',
                                    'Gender': 'Feminine'}, {}, {},
                                   {'Animacy': 'Animated', 'Gender': 'Feminine', 'Number': 'Singular',
                                    'Pronoun': 'PERSONAL',
                                    'Case': 'Nominative'},
                                   {'Number': 'Singular', 'Gender': 'Feminine', 'Tense': 'Past', 'Mode': 'Indicative'},
                                   {'Representation': 'Participle', 'Number': 'Singular', 'Gender': 'Feminine',
                                    'Shortness': 'Short',
                                    'Tense': 'Past', 'Voice': 'Passive'},
                                   {'Case': 'Instrumental', 'Number': 'Singular', 'Gender': 'Masculine'},
                                   {'Case': 'Instrumental', 'Animacy': 'Inanimated', 'Number': 'Singular',
                                    'Gender': 'Masculine'},
                                   {}, {}, {'Case': 'Instrumental', 'Number': 'Singular', 'Gender': 'Feminine'},
                                   {'Case': 'Instrumental', 'Number': 'Singular', 'Gender': 'Feminine'},
                                   {'Case': 'Instrumental', 'Animacy': 'Inanimated', 'Number': 'Singular',
                                    'Gender': 'Feminine'}, {},
                                   {'Case': 'Nominative', 'Number': 'Singular', 'Gender': 'Feminine'}, {},
                                   {'Number': 'Singular', 'Gender': 'Feminine', 'Tense': 'Past', 'Mode': 'Indicative'},
                                   {},
                                   {'Representation': 'Participle', 'Number': 'Singular', 'Gender': 'Feminine',
                                    'Shortness': 'Short',
                                    'Tense': 'Past', 'Voice': 'Passive'}, {}],
                         'said': ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O',
                                  'O', 'O', 'O',
                                  'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O',
                                  'O', 'O', 'O',
                                  'O', 'O', 'O', 'O', 'O', 'O', 'O'],
                     },
                     {
                         'ne': SortedSpansSet([Entity('1', 6, 7, 'GPE_CITY')])
                     }
                     ),
            # second document: 46 tokens in four sentences
            Document('1',
                     ['Когда', 'мы', 'шли', 'по', 'тропинке', ',', 'каждый', 'был', 'доволен', 'и', 'думал', ',', 'что',
                      'надул', 'другого', '.',
                      'Петька', 'изредка', 'посапывал', 'носом', '.',
                      'Давно', 'он', 'зарился', 'на', 'моих', 'голубей', ',', 'еще', 'с', 'прошлой', 'зимы', ',', 'а',
                      'теперь', 'вот', 'счастье', 'неожиданно', 'привалило', '.',
                      'А', 'у', 'меня', 'будет', 'пистолет', '.'],
                     [Sentence(0, 16), Sentence(16, 21), Sentence(21, 40), Sentence(40, 46)],
                     [Paragraph(0, 3)],
                     [
                         Entity('1', 1, 2, 'pron'),
                         Entity('1', 16, 17, 'noun'),
                         Entity('1', 22, 23, 'pron'),
                         Entity('1', 25, 26, 'pron'),
                         Entity('1', 25, 27, 'noun'),
                         Entity('1', 42, 43, 'pron'),
                         Entity('1', 44, 45, 'noun'),
                     ],
                     {
                         Relation(Entity('1', 16, 17, 'noun'), Entity('1', 22, 23, 'pron'), 'COREF'),
                         Relation(Entity('1', 25, 26, 'pron'), Entity('1', 42, 43, 'pron'), 'COREF'),
                     },
                     {
                         'pos': ['SCONJ', 'PRON', 'VERB', 'ADP', 'NOUN', 'PUNCT', 'ADJ', 'AUX', 'ADJ', 'CCONJ', 'VERB',
                                 'PUNCT',
                                 'SCONJ', 'VERB', 'ADJ', 'PUNCT', 'NOUN', 'ADV', 'VERB', 'NOUN', 'PUNCT', 'ADV', 'PRON',
                                 'VERB',
                                 'ADP', 'DET', 'NOUN', 'PUNCT', 'ADV', 'ADP', 'NOUN', 'NOUN', 'PUNCT', 'CCONJ', 'ADV',
                                 'PART',
                                 'NOUN', 'ADV', 'VERB', 'PUNCT', 'CCONJ', 'ADP', 'PRON', 'VERB', 'NOUN', 'PUNCT'],
                         'dt_labels': ['mark', 'nsubj', 'advcl', 'case', 'obl', 'punct', 'nsubj', 'cop', 'root', 'cc',
                                       'conj',
                                       'punct', 'mark', 'advcl', 'obj', 'punct', 'nsubj', 'advmod', 'root', 'obl',
                                       'punct', 'advmod',
                                       'nsubj', 'root', 'case', 'amod', 'obl', 'punct', 'advmod', 'case', 'obl', 'nmod',
                                       'punct',
                                       'cc', 'advmod', 'advmod', 'nsubj', 'advmod', 'conj', 'punct', 'cc', 'case',
                                       'root', 'cop',
                                       'nsubj', 'punct'],
                         'dt_head_distances': [8, 1, 6, 1, -2, -1, 2, 1, 0, 1, -2, -1, -2, -3, -1, -1, 2, 1, 0, -1, -1,
                                               2, 1, 0, 2,
                                               1, -3, -1, 2, 1, -7, -1, -1, 5, 4, 1, 2, 1, -15, -1, 2, 1, 0, -1, -2,
                                               -1],
                         'lemmas': ['когда', 'мы', 'идти', 'по', 'тропинка', ',', 'каждый', 'быть', 'довольный', 'и',
                                    'думать', ',',
                                    'что', 'надуть', 'другой', '.', 'Петька', 'изредка', 'посапывать', 'нос', '.',
                                    'давно', 'он',
                                    'зариться', 'на', 'мой', 'голубь', ',', 'еще', 'с', 'прошлый', 'зима', ',', 'а',
                                    'теперь', 'вот',
                                    'счастье', 'неожиданно', 'приваливать', '.', 'а', 'у', 'я', 'быть', 'пистолет',
                                    '.'],
                         'feats': [{}, {'Animacy': 'Animated', 'Number': 'Plural', 'Pronoun': 'DEICTIC',
                                        'Case': 'Nominative'},
                                   {'Number': 'Plural', 'Tense': 'Past', 'Mode': 'Indicative'}, {},
                                   {'Case': 'Dative', 'Animacy': 'Inanimated', 'Number': 'Singular',
                                    'Gender': 'Feminine'}, {},
                                   {'Case': 'Nominative', 'Number': 'Singular', 'Gender': 'Masculine'},
                                   {'Number': 'Singular', 'Gender': 'Masculine', 'Tense': 'Past', 'Mode': 'Indicative'},
                                   {'Number': 'Singular', 'Gender': 'Masculine', 'Shortness': 'Short'}, {},
                                   {'Number': 'Singular', 'Gender': 'Masculine', 'Tense': 'Past', 'Mode': 'Indicative'},
                                   {}, {},
                                   {'Number': 'Singular', 'Gender': 'Masculine', 'Tense': 'Past', 'Mode': 'Indicative'},
                                   {'Case': 'Accusative', 'Animacy': 'Animated', 'Number': 'Singular',
                                    'Gender': 'Masculine'}, {},
                                   {'Case': 'Nominative', 'Animacy': 'Animated', 'Number': 'Singular',
                                    'Gender': 'Masculine'}, {},
                                   {'Number': 'Singular', 'Gender': 'Masculine', 'Tense': 'Past', 'Mode': 'Indicative'},
                                   {'Case': 'Instrumental', 'Animacy': 'Inanimated', 'Number': 'Singular',
                                    'Gender': 'Masculine'},
                                   {}, {},
                                   {'Animacy': 'Animated', 'Gender': 'Masculine', 'Number': 'Singular',
                                    'Pronoun': 'PERSONAL',
                                    'Case': 'Nominative'},
                                   {'Number': 'Singular', 'Gender': 'Masculine', 'Tense': 'Past', 'Mode': 'Indicative'},
                                   {},
                                   {'Animacy': 'Animated', 'Number': 'Plural', 'Pronoun': 'POSSESSIVE',
                                    'Case': 'Accusative'},
                                   {'Case': 'Accusative', 'Animacy': 'Animated', 'Number': 'Plural',
                                    'Gender': 'Masculine'}, {}, {},
                                   {}, {'Case': 'Genitive', 'Number': 'Singular', 'Gender': 'Feminine'},
                                   {'Case': 'Genitive', 'Animacy': 'Inanimated', 'Number': 'Singular',
                                    'Gender': 'Feminine'}, {}, {},
                                   {}, {}, {'Case': 'Nominative', 'Animacy': 'Inanimated', 'Number': 'Singular',
                                            'Gender': 'Neuter'},
                                   {},
                                   {'Number': 'Singular', 'Gender': 'Neuter', 'Tense': 'Past', 'Mode': 'Indicative'},
                                   {}, {}, {},
                                   {'Animacy': 'Animated', 'Gender': 'Masculine', 'Number': 'Singular',
                                    'Pronoun': 'DEICTIC',
                                    'Case': 'Genitive'},
                                   {'Person': 'Third', 'Number': 'Singular', 'Tense': 'NotPast', 'Mode': 'Indicative'},
                                   {'Case': 'Nominative', 'Animacy': 'Inanimated', 'Number': 'Singular',
                                    'Gender': 'Masculine'}, {}],
                         'said': ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O',
                                  'O', 'O', 'O',
                                  'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O',
                                  'O', 'O', 'O',
                                  'O', 'O', 'O', 'O', 'O', 'O'],
                     },
                     {
                         'ne': SortedSpansSet([Entity('1', 16, 17, 'PERSON')])
                     }
                     )
        ]
        # empty sets are "known" rels
        self.hook = get_hook([doc.without_relations().with_relations(set()) for doc in self.docs])
        # properties shared by the tests of this class
        # NOTE(review): presumably model/training hyper-parameters - confirm against the consumer
        self.base_props = {
            "seed": 12345,

            "distance": 10,
            "max_distance": 10,
            "loss": "cross_entropy",
            "optimizer": "momentum",
            "lr_decay": 0.05,
            "momentum": 0.9,
            "dropout": 0.5,
            "internal_size": 10,
            "epoch": 1,
            "batch_size": 64,
            "learning_rate": 0.1,

            "clip_norm": 5,

            "max_candidate_distance": 50,
            "max_entity_distance": 50,
            "max_word_distance": 50,
            "max_sent_distance": 10,
            "max_dt_distance": 10,
            "dist_size": 50,

            "pos_emb_size": 0,
            "morph_feats_emb_size": 0,
            "entities_types_size": 20,

            "morph_feats_size": 0,
            "morph_feats_list": ["Gender", "Animacy", "Number"],

            "encoding_type": "lstm",
            "entity_encoding_size": 10,
            "encoding_size": 10,
            "classifiers": ["exact_match", "intersecting_mentions"],
            "use_filter": False,

            "max_sent_entities_distance": 10,
            "max_token_entities_distance": 20,

            "agreement_types": ["Gender", "Animacy", "Number"],
            "classifier_agreement_size": 0,

            "head_str_match_size": 0,
            "partial_str_match_size": 0,
            "ordered_partial_str_match_size": 0,

            "mention_interrelation_size": 0,
            "mention_distance_size": 0,
            "max_mention_distance": 50,
            "classifier_entity_distance_size": 0,
            "entities_types_in_classifier_size": 0,
            "head_ne_types_size": 0,
            "entities_token_distance_in_classifier_size": 0,
            "entities_sent_distance_in_classifier_size": 0,

            "encoder_entity_types_size": 0,
            "encoder_entity_ne_size": 0,

            "speech_types": ["said"],
            "speech_size": 0,

            "entity_encoding_type": "rnn",

            "classification_dense_size": 20,
        }
        # alternative values for the "sampling_strategy" property
        self.experiment_props = {
            "sampling_strategy": ["coref_noun", "coref_pron_cluster", 'coref_pron_cluster_strict', 'coref_pron']
        }
Exemplo n.º 20
0
def _collapse_entities_in_doc(doc, entities_to_collapse: Iterable[Entity],
                              entity_types_to_collapse: Union[set, frozenset]):

    if set(doc.extras.keys()).difference({"ne"}):
        raise Exception("Currently support only ne extras")

    # copy features not to affect default document
    tokens_to_process = list(doc.tokens)
    token_features_to_process = {
        k: list(v)
        for k, v in doc.token_features.items()
    }

    borders_to_change = {
        'entities_to_collapse': build_borders_dict(entities_to_collapse),
        'sentences': build_borders_dict(doc.sentences)
    }
    try:
        borders_to_change["entities"] = build_borders_dict(doc.entities)
    except ValueError:
        pass

    if "ne" in doc.extras:
        borders_to_change["ne"] = build_borders_dict(doc.extras["ne"])

    _collapse_entities_and_correct_features(entities_to_collapse,
                                            tokens_to_process,
                                            token_features_to_process,
                                            entity_types_to_collapse,
                                            borders_to_change)

    sentences_mapping = create_objects_with_new_borders(
        doc.sentences, borders_to_change['sentences'])
    collapsed_entities_mapping = create_objects_with_new_borders(
        entities_to_collapse, borders_to_change['entities_to_collapse'])

    if 'entities' in borders_to_change:
        doc_entities_mapping = create_objects_with_new_borders(
            doc.entities, borders_to_change['entities'])
        doc_entities = doc_entities_mapping.values()
    else:
        doc_entities = None

    if "ne" in doc.extras:
        ne_mapping = create_objects_with_new_borders(doc.extras["ne"],
                                                     borders_to_change["ne"])
        extras = {"ne": SortedSpansSet(ne_mapping.values())}
    else:
        extras = None

    doc_to_process = Document(doc.name,
                              tokens_to_process,
                              sentences_mapping.values(),
                              doc.paragraphs,
                              doc_entities,
                              token_features=token_features_to_process,
                              extras=extras)

    try:
        relations = [
            Relation(doc_entities_mapping[r.first_entity],
                     doc_entities_mapping[r.second_entity], r.type)
            for r in doc.relations
        ]
        doc_to_process = doc_to_process.with_relations(relations)
    except ValueError:
        pass

    return doc_to_process, collapsed_entities_mapping
Exemplo n.º 21
0
def make_document_from_json_file(file_path):
    """Load a document description from a JSON file and build a ``Document``.

    The loaded dict may contain ``tokens``, ``sentences`` (two-item
    ``start, end`` sequences), ``paragraphs`` (two-item ``start, end``
    sequences), ``entities`` (four-item ``id, start, end, type`` sequences),
    ``relations`` (three-item ``first_id, second_id, type`` sequences) and
    any of the per-token feature lists named in ``feature_names`` below.
    Missing keys are treated as empty.

    Entities and relations are attached to the document only when their key
    is present in the file.

    Relation endpoints are resolved through an id -> entity index built once,
    instead of rescanning the entity list for every relation. If several
    entities share an id, the first one is used. An id that matches no entity
    resolves to ``None``, as it did before.

    :param file_path: path handed to ``load_json_file_as_dict``
    :return: the constructed ``Document``; its name is the empty string
    """
    d = load_json_file_as_dict(file_path)

    tokens = d.get('tokens', [])

    feature_names = (
        'pos', 'entities_types', 'entities_depths', 'borders', 'dt_labels',
        'dt_head_distances', 'dt_depths', 'dt_deltas_forward',
        'dt_deltas_backward', 'dt_breakups_forward', 'dt_breakups_backward'
    )
    # only features that are actually present in the file are carried over
    token_features = {name: d[name] for name in feature_names if name in d}

    # tuple-unpacking in the loop targets keeps the arity check on each item
    doc_entities = [
        Entity(id_, start_token, end_token, ent_type)
        for id_, start_token, end_token, ent_type in d.get('entities', [])
    ]
    doc_sentences = [
        Sentence(start_token, end_token)
        for start_token, end_token in d.get('sentences', [])
    ]
    doc_paragraphs = [
        Paragraph(start_sentence, end_sentence)
        for start_sentence, end_sentence in d.get('paragraphs', [])
    ]

    # index entities by id once; setdefault keeps the first entity per id
    entities_by_id = {}
    for entity in doc_entities:
        entities_by_id.setdefault(entity.id, entity)

    doc_relations = [
        Relation(entities_by_id.get(e1_id), entities_by_id.get(e2_id),
                 rel_type)
        for e1_id, e2_id, rel_type in d.get('relations', [])
    ]

    doc = Document("",
                   tokens,
                   doc_sentences,
                   doc_paragraphs,
                   token_features=token_features)
    if 'entities' in d:
        doc = doc.with_entities(doc_entities)
    if 'relations' in d:
        doc = doc.with_relations(doc_relations)
    return doc
Exemplo n.º 22
0
    def setUp(self) -> None:
        """Prepare document fixtures and configuration dicts for the tests.

        Sets on ``self``:

        * ``docs`` -- two documents with entities, relations and the token
          features ``pos``, ``dt_labels`` and ``dt_head_distances``;
        * ``docs_no_rels`` -- the same documents passed through
          ``without_relations()``;
        * ``props`` -- a configuration dict;
        * ``unlabeled_docs`` -- one document built without entities or
          relations;
        * ``sdp_config`` and ``parser_config`` -- two further configuration
          dicts.

        The locals ``tokens``, ``sentences``, ``paragraphs``, ``pos``,
        ``dt_labels``, ``dt_head_distances`` and ``token_features`` are
        rebound for every document, so the order of statements matters.
        """
        self.docs = []

        # First labeled document.
        # BB-event-4329237
        tokens = [
            "The", "in", "vitro", "assay", "of", "tuberculin",
            "hypersensitivity", "in", "Macaca", "mulatta", "sensitized",
            "with", "bacille", "Calmette", "Guerin", "cell", "wall", "vaccine",
            "and-or", "infected", "with", "virulent", "Mycobacterium",
            "tuberculosis", "."
        ]
        sentences = [Sentence(0, 25)]
        paragraphs = [Paragraph(0, 1)]
        entities = [
            Entity("T2", 8, 18, "Habitat"),
            Entity("T3", 8, 24, "Habitat"),
            Entity("T4", 12, 18, "Habitat"),
            Entity("T5", 12, 15, "Bacteria"),
            Entity("T6", 22, 24, "Bacteria")
        ]
        # single relation: T6 (Bacteria) -> T3 (Habitat)
        relations = {Relation(entities[4], entities[1], "Lives_In")}

        # token features generated by UDPipe
        pos = [
            'DET', 'ADP', 'NOUN', 'NOUN', 'ADP', 'NOUN', 'NOUN', 'ADP',
            'PROPN', 'PROPN', 'VERB', 'ADP', 'PROPN', 'PROPN', 'PROPN', 'NOUN',
            'NOUN', 'NUM', 'NOUN', 'VERB', 'ADP', 'ADJ', 'PROPN', 'NOUN',
            'PUNCT'
        ]

        dt_labels = [
            'det', 'case', 'compound', 'nsubj', 'case', 'compound', 'nmod',
            'case', 'compound', 'nmod', 'root', 'case', 'compound', 'flat',
            'compound', 'compound', 'obl', 'nummod', 'appos', 'acl', 'case',
            'amod', 'compound', 'obl', 'punct'
        ]

        dt_head_distances = [
            3, 2, 1, 7, 2, 1, -3, 2, 1, -6, 0, 5, 2, -1, 2, 1, -6, 1, -2, -1,
            3, 2, 1, -4, -14
        ]

        token_features = {
            "pos": pos,
            "dt_labels": dt_labels,
            "dt_head_distances": dt_head_distances
        }
        self.docs.append(
            Document("_", tokens, sentences, paragraphs, entities, relations,
                     token_features))

        # Second labeled document.
        # BB-event-9564489
        tokens = [
            'Gingivomandibular', 'infection', 'due', 'to', 'Mycobacterium',
            'kansasii', 'in', 'a', 'patient', 'with', 'AIDS', '.'
        ]
        sentences = [Sentence(0, 12)]
        paragraphs = [Paragraph(0, 1)]
        entities = [
            Entity("T2", 0, 1, "Habitat"),
            Entity("T3", 4, 6, "Bacteria"),
            Entity("T4", 8, 11, "Habitat")
        ]
        # T3 (Bacteria) is related to both Habitat entities
        relations = {
            Relation(entities[1], entities[0], "Lives_In"),
            Relation(entities[1], entities[2], "Lives_In")
        }

        # token features generated by UDPipe
        pos = [
            'ADJ', 'NOUN', 'ADP', 'ADP', 'PROPN', 'PROPN', 'ADP', 'DET',
            'NOUN', 'ADP', 'NOUN', 'PUNCT'
        ]

        dt_labels = [
            'amod', 'root', 'case', 'fixed', 'compound', 'nmod', 'case', 'det',
            'nmod', 'case', 'nmod', 'punct'
        ]

        dt_head_distances = [1, 0, 3, -1, 1, -4, 2, 1, -7, 1, -2, -10]

        token_features = {
            "pos": pos,
            "dt_labels": dt_labels,
            "dt_head_distances": dt_head_distances
        }
        self.docs.append(
            Document("_", tokens, sentences, paragraphs, entities, relations,
                     token_features))
        # relation-free variants of the two documents above
        self.docs_no_rels = [doc.without_relations() for doc in self.docs]
        # NOTE(review): "add_we"/"add_shared" are the strings "true" here,
        # while parser_config below uses the boolean True for "add_shared"
        # -- confirm the consumer accepts both forms.
        # NOTE(review): "token_position_size" and "max_word_distance" appear
        # both under "shared" and at the top level (max_word_distance is 20
        # vs 10) -- presumably read by different components; verify.
        self.props = {
            "shared": {
                "internal_emb_size": 10,
                "token_position_size": 10,
                "max_word_distance": 20,
                "dt_distance_emb_size": 10,
                "max_dt_distance": 10,
                "dt_depth_emb_size": 10,
                "max_dt_depth": 10,
                "pos_emb_size": 10
            },
            "add_we": "true",
            "add_shared": "true",
            "optimizer": "adam",
            "learning_rate": 0.01,
            "epoch": 2,
            "loss": "cross_entropy",
            "l2": 0.0001,
            "lr_decay": 0.1,
            "dropout": 0.5,
            "clip_norm": 1,
            "max_candidate_distance": 20,
            "batcher": {
                "batch_size": 8
            },
            "token_position_size": 10,
            "max_word_distance": 10,
            "encoding_size": 10,
            "entities_types_emb_size": 20,
            "entities_depth_emb_size": 10,
            'max_entities_depth': 2,
            "specific_encoder_size": 10,
            "aggregation": {
                "attention": {},
                "max_pooling": {},
                "mean_pooling": {},
                "take_spans": {},
                "last_hiddens": {}
            },
            "seed": 100
        }

        # Document without entities or relations.
        # GENIA id=10022435
        tokens = [
            "Glucocorticoid", "resistance", "in", "the", "squirrel", "monkey",
            "is", "associated", "with", "overexpression", "of", "the",
            "immunophilin", "FKBP51", "."
        ]
        sentences = [Sentence(0, 15)]
        paragraphs = [Paragraph(0, 1)]

        pos = [
            "NN", "NN", "IN", "DT", "NN", "NN", "VBZ", "VBN", "IN", "NN", "IN",
            "DT", "NN", "NN", "PERIOD"
        ]

        dt_labels = [
            "compound", "nsubjpass", "case", "det", "compound", "nmod",
            "auxpass", "root", "case", "nmod", "case", "det", "compound",
            "nmod", "dep"
        ]

        dt_head_distances = [1, 6, 3, 2, 1, -4, 1, 0, 1, -2, 3, 2, 1, -4, -7]

        token_features = {
            "pos": pos,
            "dt_labels": dt_labels,
            "dt_head_distances": dt_head_distances
        }
        self.unlabeled_docs = [
            Document("_",
                     tokens,
                     sentences,
                     paragraphs,
                     token_features=token_features)
        ]

        # first of two further configuration dicts
        self.sdp_config = {
            "context_encoding_non_linearity_size": 10,
            "loss": "cross_entropy",
            "learning_rate": 0.02,
            "query_dense_size": 10,
            "clip_norm": 1,
            "batcher": {
                "batch_size": 1
            }
        }

        # second configuration dict; shares several keys with sdp_config
        self.parser_config = {
            "context_encoding_non_linearity_size": 10,
            "loss": "cross_entropy",
            "learning_rate": 0.02,
            "clip_norm": 1,
            "batcher": {
                "batch_size": 1
            },
            "add_shared": True,
            "specific_encoder_size": 10,
            "sampling_strategy": "pos_filtering",
            "arc_token_distance_in_classifier_size": 10,
            "arc_token_distance_in_attention_size": 10,
            "max_arc_token_distance": 10,
            "aggregation": {
                "attention": {
                    "type": "luong",
                    "normalise_coefficients": True
                },
                "take_spans": {}
            }
        }