Пример #1
0
    def test_ne_extras_collapse(self):
        """Collapsing entities must shift the spans stored in the "ne" extras
        so they keep pointing at the same tokens after collapsing."""
        nes = SortedSpansSet([
            Entity("_", 0, 1, "left"),
            Entity("_", 2, 4, "same"),
            Entity("_", 3, 4, "include"),
            Entity("_", 5, 6, "same"),
            Entity("_", 15, 19, "intersect"),
            Entity("_", 17, 20, "include"),
            Entity("_", 22, 25, "intersect")
        ])

        # the same logical spans after the collapsed regions have shrunk,
        # shifting everything to their right leftwards
        expected_nes = SortedSpansSet([
            Entity("_", 0, 1, "left"),
            Entity("_", 2, 3, "same"),
            Entity("_", 2, 3, "include"),
            Entity("_", 4, 5, "same"),
            Entity("_", 14, 17, "intersect"),
            Entity("_", 16, 17, "include"),
            Entity("_", 16, 18, "intersect")
        ])

        input_doc = self.doc.with_additional_extras({"ne": nes})
        actual_doc = EntitiesCollapser({"Habitat", "Bacteria",
                                        "Geographical"}).transform(input_doc)
        actual_extras = actual_doc.extras
        self.assertDictEqual(actual_extras, {"ne": expected_nes})
Пример #2
0
    def test_collapsing_with_ne(self):
        """Collapsing should replace each collapsed span with a single
        "$<type>$" token and shift tokens, sentences, entities and the "ne"
        extras to the new positions."""
        input_doc = self.doc.with_additional_extras({"ne": self.doc.entities})
        input_doc = input_doc.without_relations().without_entities()

        entities = SortedSpansSet([
            Entity("_", 0, 1, "left"),
            Entity("_", 2, 4, "same"),
            Entity("_", 3, 4, "include"),
            Entity("_", 5, 6, "same"),
            Entity("_", 15, 19, "intersect"),
            Entity("_", 17, 20, "include"),
            Entity("_", 22, 25, "intersect")
        ])

        input_doc = input_doc.with_entities(entities)

        # each collapsed region becomes one "$<type>$" placeholder token
        expected_tokens = [
            "Recurrence", "of", "$Bacteria$", "in", "$Geographical$", ".",
            "From", "Oct.", "30", "to", "Nov.", "7", ",", "1979", ",", "10",
            "$Habitat$", "had", "onset", "of", "bacteriologically",
            "confirmed", "$Bacteria$", "."
        ]
        expected_sentences = [Sentence(0, 6), Sentence(6, 24)]
        expected_paragraphs = [Paragraph(0, 1), Paragraph(1, 2)]

        expected_nes = SortedSpansSet([
            Entity("T1", 2, 3, "Habitat"),
            Entity("T2", 2, 3, "Bacteria"),
            Entity("T3", 2, 3, "Bacteria"),
            Entity("T4", 4, 5, "Geographical"),
            Entity("T5", 16, 17, "Habitat"),
            Entity("T6", 16, 17, "Habitat"),
            Entity("T7", 16, 17, "Geographical"),
            Entity("T8", 16, 17, "Geographical"),
            Entity("T9", 22, 23, "Bacteria")
        ])

        expected_entities = SortedSpansSet([
            Entity("_", 0, 1, "left"),
            Entity("_", 2, 3, "same"),
            Entity("_", 2, 3, "include"),
            Entity("_", 4, 5, "same"),
            Entity("_", 14, 17, "intersect"),
            Entity("_", 16, 17, "include"),
            Entity("_", 16, 18, "intersect")
        ])

        expected_doc = Document("_",
                                expected_tokens,
                                expected_sentences,
                                expected_paragraphs,
                                expected_entities,
                                extras={"ne": expected_nes})

        # NOTE(review): the second positional argument presumably enables
        # collapsing against the "ne" extras — confirm against the
        # EntitiesCollapser signature
        actual_doc = EntitiesCollapser({"Habitat", "Bacteria", "Geographical"},
                                       True).transform(input_doc)
        self.assertEqual(expected_doc, actual_doc)
Пример #3
0
class TestSpansStorage(unittest.TestCase):
    """Tests for SortedSpansSet lookup helpers (at_token / contained_in and
    their index-returning variants) on sentence and entity spans.

    The fixtures deliberately contain duplicates (Sentence(6, 9) and the
    unnamed entities); the indices asserted below show each distinct span is
    stored only once.
    """

    def setUp(self):
        self.sents = SortedSpansSet([
            Sentence(6, 9),
            Sentence(0, 10),
            Sentence(4, 12),
            Sentence(6, 9),
            Sentence(6, 12)
        ])
        self.ents = SortedSpansSet([
            Entity('', 0, 5, ''),
            Entity('', 0, 5, ''),
            Entity('1', 2, 6, ''),
            Entity('2', 2, 7, ''),
            Entity('', 7, 9, ''),
            Entity('', 7, 9, '')
        ])

    def test_sent_contains(self):
        self.assertTrue(Sentence(0, 10) in self.sents)

    def test_sent_at_token_no_values(self):
        # token 12 lies past every sentence: span ends are exclusive
        self.assertEqual([], self.sents.at_token(12))

    def test_sent_at_token_single_value(self):
        self.assertEqual([Sentence(0, 10)], self.sents.at_token(0))

    def test_sent_at_token_multiple_values(self):
        self.assertEqual([Sentence(0, 10), Sentence(4, 12)],
                         self.sents.at_token(4))

    def test_ent_indexed_at_token_middle(self):
        # indices reflect the deduplicated sorted order of self.ents
        self.assertEqual([(1, Entity('1', 2, 6, '')),
                          (2, Entity('2', 2, 7, ''))],
                         self.ents.indexed_at_token(5))

    def test_ent_indexed_at_token_end(self):
        self.assertEqual([(3, Entity('', 7, 9, ''))],
                         self.ents.indexed_at_token(8))

    def test_sent_contained_in_exact(self):
        self.assertEqual([Sentence(6, 9)],
                         self.sents.contained_in(Sentence(6, 9)))

    def test_sent_contained_in_intersect(self):
        # spans merely intersecting the query (Sentence(4, 12)) are excluded
        self.assertEqual([Sentence(0, 10), Sentence(6, 9)],
                         self.sents.contained_in(Sentence(0, 11)))

    def test_ent_indexed_contained_inexact(self):
        self.assertEqual([(1, Entity('1', 2, 6, '')),
                          (2, Entity('2', 2, 7, ''))],
                         self.ents.indexed_contained_in(Sentence(1, 8)))

    def test_ent_indexed_contained_exact(self):
        self.assertEqual([(1, Entity('1', 2, 6, '')),
                          (2, Entity('2', 2, 7, ''))],
                         self.ents.indexed_contained_in(Sentence(2, 7)))
Пример #4
0
 def setUp(self):
     """Create sentence and entity span sets containing duplicate entries
     for the SortedSpansSet lookup tests."""
     self.sents = SortedSpansSet([
         Sentence(6, 9),
         Sentence(0, 10),
         Sentence(4, 12),
         Sentence(6, 9),
         Sentence(6, 12)
     ])
     self.ents = SortedSpansSet([
         Entity('', 0, 5, ''),
         Entity('', 0, 5, ''),
         Entity('1', 2, 6, ''),
         Entity('2', 2, 7, ''),
         Entity('', 7, 9, ''),
         Entity('', 7, 9, '')
     ])
Пример #5
0
    def test_net_preprocessor(self):
        """NETPreprocessor drops filtered entity types and applies separate
        type-replacement maps for entities and "ne" extras; from_props must
        build an equivalent preprocessor from a config dict."""
        filter_types = {"TeamFilter"}
        ne_replacements = {"PlayerCoach1": "Coach", "PlayerCoach2": "Coach"}
        ent_replacements = {"PlayerCoach1": "PlayerCoach", "PlayerCoach2": "PlayerCoach"}
        preprocessor = NETPreprocessor(filter_types, ne_replacements, ent_replacements)

        # entities carry the ent_replacements types...
        expected_entities = [
            Entity("T1", 4, 5, "Team"),
            Entity("T2", 6, 8, "PlayerCoach"),
            Entity("T3", 23, 24, "PlayerCoach"),
            Entity("T5", 30, 31, "Team"),
            Entity("T6", 39, 41, "Coach"),
            Entity("T7", 42, 44, "Coach")
        ]
        # ...while the "ne" extras carry the ne_replacements types
        expected_nes = SortedSpansSet([
            Entity("T1", 4, 5, "Team"),
            Entity("T2", 6, 8, "Coach"),
            Entity("T3", 23, 24, "Coach"),
            Entity("T5", 30, 31, "Team"),
            Entity("T6", 39, 41, "Coach"),
            Entity("T7", 42, 44, "Coach")
        ])

        expected_doc = self.doc.without_entities().with_entities(expected_entities).\
            with_additional_extras({"ne": expected_nes})
        self.assertEqual(expected_doc, preprocessor.process_doc(self.doc))

        # the same configuration expressed as a props dict
        props = {
            "ent_types_to_filter": ["TeamFilter"],
            "ne_types_merge_pattern": {"Coach": ["PlayerCoach1", "PlayerCoach2"]},
            "ent_types_merge_pattern": {"PlayerCoach": ["PlayerCoach1", "PlayerCoach2"]}
        }

        preprocessor = NETPreprocessor.from_props(props)
        self.assertEqual(expected_doc, preprocessor.process_doc(self.doc))
Пример #6
0
    def setUp(self) -> None:
        """Build a one-sentence document where each PER/ORG entity has a
        matching PERORG span in the "ne" extras (plus one extra STUFF span)."""
        tokens = ['Planning', 'of', 'work', 'of', 'Elon', "by", "Elon", "in", "LA", "in", "USA", "."]
        sents = [Sentence(0, 12)]
        ents = [Entity("_", 4, 5, "PER"), Entity("_", 6, 7, "PER"), Entity("_", 8, 9, "ORG"), Entity("_", 10, 11, "ORG")]
        nes = SortedSpansSet([
                Entity("gen", 0, 1, "STUFF"),
                Entity("gen", 4, 5, "PERORG"), Entity("gen", 6, 7, "PERORG"),
                Entity("gen", 8, 9, "PERORG"), Entity("gen", 10, 11, "PERORG")
        ])

        self.doc = Document('', tokens, sents, [], ents, extras={"ne": nes})
Пример #7
0
 def test_ne_features(self):
     """The ne feature extractor built from "ne" extras should register one
     embedded feature and label tokens differently inside vs outside a
     named-entity span."""
     ents = [Entity("_", 4, 5, "PER"), Entity("_", 6, 7, "PER")]
     doc = Document('',
                    ['Planning', 'of', 'work', 'of', 'Elon', "by", "Elon"],
                    [Sentence(0, 7)], [],
                    extras={'ne': SortedSpansSet(ents)})
     fe, meta = ne_fe_factory([doc], {"ne_emb_size": 10})
     # features for the token window [3, 7): of, Elon, by, Elon
     features = fe.extract_features_from_doc(doc, 3, 7)['ne']
     self.assertEqual(len(meta.get_embedded_features()), 1)
     self.assertEqual(len(features), 4)
     self.assertEqual(features[0], features[2])  # O O
     self.assertEqual(features[1], features[3])  # I-PER I-PER
     self.assertNotEqual(features[0], features[1])  # O I-PER
Пример #8
0
    def process_doc(self, doc: Document) -> Document:
        """Drop entities with filtered-out types and rebuild the document with
        renamed entities plus a matching "ne" extras span set.

        The entity list gets the entity-replacement types while the "ne"
        extras get the ne-replacement types; unknown types pass through
        unchanged. Relations are dropped from the result.
        """
        kept_entities = []
        renamed_nes = []

        for entity in doc.entities:
            # filtered types disappear from both the entities and the extras
            if entity.type in self.__filter:
                continue

            kept_entities.append(
                entity.with_type(
                    self.__ents_replacements.get(entity.type, entity.type)))
            renamed_nes.append(
                entity.with_type(
                    self.__ne_replacements.get(entity.type, entity.type)))

        stripped_doc = doc.without_relations().without_entities()
        return stripped_doc.with_entities(kept_entities).with_additional_extras(
            {"ne": SortedSpansSet(renamed_nes)})
Пример #9
0
    def get_extras(self, tokens, sentences):
        """Run the external NER API over space-joined sentences and return the
        token-aligned entities under the "ne" key.

        Each raw NE tuple carries (start, end, ..., type) offsets relative to
        its sentence; offsets are shifted to document coordinates before
        alignment. Quote trimming is applied when self.remove_quotes is set.
        """
        sents, sent_starts, raw_tokens = _get_space_joined_sentences(
            tokens, sentences)
        ne_doc = list(self.api.named_entities(sents, language=self.lang))

        raw_entities = []
        next_id = 0
        for offset, sentence_nes in zip(sent_starts, ne_doc):
            for ne in sentence_nes:
                raw_entities.append({
                    'id': str(next_id),
                    'type': ne[-1],
                    'start': offset + ne[0],
                    'end': offset + ne[1]
                })
                next_id += 1

        entities = align_raw_entities(raw_entities, raw_tokens)
        if self.remove_quotes:
            entities = self.__remove_quotes(tokens, entities)

        return {'ne': SortedSpansSet(entities)}
Пример #10
0
    def test_fully_augmented(self):
        """With probability 1.0 and every entity type listed as augmentable,
        all quote tokens around entities are removed and every span shifts
        left accordingly."""
        tokens = [
            "Elon", "Musk", "must", "donate", "Tesla", "to", "our",
            "subscribers", ".", "It", "is", "important", "!"
        ]
        sentences = [Sentence(0, 9), Sentence(9, 13)]
        entities = [
            Entity("_", 0, 2, "CEO"),
            Entity("_", 3, 4, "donate"),
            Entity("_", 4, 5, "Tesla"),
            Entity("_", 7, 8, "subscribers"),
            Entity("_", 9, 10, "It"),
            Entity("_", 11, 12, "important")
        ]

        nes = SortedSpansSet(
            [Entity("_", 4, 5, "Tesla"),
             Entity("_", 11, 12, "important")])

        # token-level features must shrink in lockstep with the tokens
        token_features = {
            "tokens":
            list(tokens),
            "pos": [
                "NNP", "NNP", "VB", "VB", "NNP", "TO", "NNPS", "NNS", "DOT",
                "NNP", "VB", "RB", "DOT"
            ]
        }

        expected_doc = Document("_",
                                tokens,
                                sentences, [],
                                entities,
                                token_features=token_features,
                                extras={"ne": nes})
        # every entity type present in the fixture is eligible
        to_augment = [
            "CEO", "donate", "Tesla", "subscribers", "It", "important"
        ]
        # probability 1.0 makes the augmentation deterministic
        actual_doc = EntitiesUnquoteAugmentor(1.0,
                                              to_augment).transform(self.doc)

        self.assertEqual(expected_doc, actual_doc)
Пример #11
0
    def setUp(self) -> None:
        """Build a document whose entities are wrapped in mixed quote styles
        («», '', "") for the unquote-augmentation tests."""
        tokens = [
            "Elon", "Musk", "must", "«", "donate", "»", "\'", "Tesla", "\'",
            "to", "our", "\"", "subscribers", "\"", ".", "It", "is", "\"",
            "important", "\"", "!"
        ]
        sentences = [Sentence(0, 15), Sentence(15, 21)]
        entities = [
            Entity("_", 0, 2, "CEO"),
            Entity("_", 4, 5, "donate"),
            Entity("_", 7, 8, "Tesla"),
            Entity("_", 12, 13, "subscribers"),
            Entity("_", 15, 16, "It"),
            Entity("_", 18, 19, "important")
        ]

        # the "ne" spans cover the quote tokens on both sides of the entity
        nes = SortedSpansSet(
            [Entity("_", 6, 9, "Tesla"),
             Entity("_", 17, 20, "important")])

        token_features = {
            "tokens":
            list(tokens),
            "pos": [
                "NNP", "NNP", "VB", "QUOTE", "VB", "QUOTE", "QUOTE", "NNP",
                "QUOTE", "TO", "NNPS", "QUOTE", "NNS", "QUOTE", "DOT", "NNP",
                "VB", "QUOTE", "RB", "QUOTE", "DOT"
            ]
        }

        self.doc = Document("_",
                            tokens,
                            sentences, [],
                            entities,
                            token_features=token_features,
                            extras={"ne": nes})
Пример #12
0
    def setUp(self) -> None:
        """Create two Russian-language documents with gold entities and
        generated "ne" extras, plus shared training hyper-parameters."""
        self.docs = []

        tokens = [
            "Главный", "тренер", "римского", "«", "Лацио", "»", "Симоне", "Индзаги", "продолжит", "работу", "с",
            "командой", ",", "сообщает", "пресс-служба", "клуба", ".", "Ранее", "сообщалось", ",",  "что", "в",
            "услугах", "Индзаги", "заинтересованы", "«", "Милан", "»", "и", "«", "Ювентус", "»", ",", "которые",
            "пребывают", "без", "наставников", "после", "ухода", "Дженнаро", "Гаттузо", "и", "Массимилиано", "Аллегри",
            "."
        ]

        sentences = [Sentence(0, 17), Sentence(17, 45)]
        paragraphs = [Paragraph(0, 1)]
        # gold annotation spans
        entities = [
            Entity("T1", 4, 5, "Team"),
            Entity("T2", 6, 8, "Coach"),
            Entity("T3", 23, 24, "Coach"),
            Entity("T4", 26, 27, "Team"),
            Entity("T5", 30, 31, "Team"),
            Entity("T6", 39, 41, "Coach"),
            Entity("T7", 42, 44, "Coach")
        ]
        # generated NER output used as "ne" extras; the ORG spans include the
        # surrounding « » quote tokens, so they don't line up with the gold
        # Team spans exactly
        named_entities = [
            Entity("generated", 3, 6, "ORG"),
            Entity("generated", 6, 8, "PER"),
            Entity("generated", 23, 24, "PER"),
            Entity("generated", 25, 28, "ORG"),
            Entity("generated", 29, 32, "ORG"),
            Entity("generated", 39, 41, "PER"),
            Entity("generated", 42, 44, "PER")
        ]

        doc = Document("_", tokens, sentences, paragraphs, entities, extras={"ne": SortedSpansSet(named_entities)})
        self.docs.append(doc)

        tokens = [
            "Врачи", "сборной", "Бразилии", "подтвердили", "травму", "нападающего", "«", "Пари", "Сен-Жермен", "»",
            "Неймара", ",", "полученную", "во", "время", "товарищеского", "матча", "с", "Катаром", "."
        ]

        sentences = [Sentence(0, 20)]
        paragraphs = [Paragraph(0, 1)]
        entities = [
            Entity("T1", 1, 3, "Team"),
            Entity("T2", 7, 9, "Team"),
            Entity("T3", 10, 11, "Player"),
            Entity("T4", 18, 19, "Team")
        ]
        named_entities = [
            Entity("generated", 1, 3, "ORG"),
            Entity("generated", 6, 10, "ORG"),
            Entity("generated", 10, 11, "PER"),
            Entity("generated", 18, 19, "ORG")
        ]

        doc = Document("_", tokens, sentences, paragraphs, entities, extras={"ne": SortedSpansSet(named_entities)})
        self.docs.append(doc)

        # minimal hyper-parameters shared by the model tests
        self.common_props = {
            "seed": 1,
            "internal_emb_size": 10,
            "learning_rate": 0.005,
            "batcher": {
                "batch_size": 4,
            },
            "encoding_size": 1,
            "dropout": 0.5,
            "optimizer": "adam",
            "epoch": 2,
            "clip_norm": 5
        }

        self.docs_no_entities = [d.without_entities() for d in self.docs]
Пример #13
0
    def transform(self, doc: Document) -> Document:
        """Randomly strip the quote tokens around entities of the configured
        types and rebuild the document with corrected span borders.

        Each quoted entity of a type in self.__types is, with probability
        self.__prob, stripped of the token directly before it and the token
        directly after it; sentences, entities and the "ne" extras are then
        shifted to the surviving token positions.

        Raises:
            Exception: if doc carries any extras other than "ne".
        """
        # augmentation disabled — return the document untouched
        if self.__prob == 0.0 or not self.__types:
            return doc

        borders_dicts = {
            'entities': build_borders_dict(doc.entities),
            'sentences': build_borders_dict(doc.sentences),
        }
        if 'ne' in doc.extras:
            borders_dicts['ne'] = build_borders_dict(doc.extras["ne"])

        # only the "ne" extra can be shifted; reject anything else
        if set(doc.extras.keys()).difference({"ne"}):
            raise Exception("Can only work with ne extras")

        # token indices of the quote characters selected for removal
        quotes_idx = set()

        for ent in doc.entities:
            if ent.type not in self.__types or not self.__quoted(doc, ent):
                continue

            # keep the quotes with probability (1 - self.__prob)
            if random() < 1.0 - self.__prob:
                continue

            # the tokens directly before and after the entity are its quotes
            # (presumably guaranteed by the __quoted check — confirm)
            quotes_idx.add(ent.start_token - 1)
            quotes_idx.add(ent.end_token)

            # collapse each quote span to zero length in every borders dict;
            # later spans shift left as a side effect
            ent_shifted_start, ent_shifted_end = borders_dicts["entities"][ent]
            for key, val in borders_dicts.items():
                shift_borders_after_collapse(val,
                                             ent_shifted_start - 1,
                                             ent_shifted_start,
                                             new_length=0)
                # shift second quote span after first quote replacement
                shift_borders_after_collapse(val,
                                             ent_shifted_end - 1,
                                             ent_shifted_end,
                                             new_length=0)

        new_tokens = [
            tok for idx, tok in enumerate(doc.tokens) if idx not in quotes_idx
        ]
        new_sentences = create_objects_with_new_borders(
            doc.sentences, borders_dicts["sentences"])
        new_entities = create_objects_with_new_borders(
            doc.entities, borders_dicts["entities"])

        if "ne" in doc.extras:
            new_extras = {
                "ne":
                SortedSpansSet(
                    create_objects_with_new_borders(
                        doc.extras["ne"], borders_dicts["ne"]).values())
            }
        else:
            new_extras = None

        # token-level features must stay aligned with the surviving tokens
        new_token_features = {
            k: [v for idx, v in enumerate(val) if idx not in quotes_idx]
            for k, val in doc.token_features.items()
        }

        return Document(doc.name,
                        new_tokens,
                        new_sentences.values(),
                        doc.paragraphs,
                        new_entities.values(),
                        token_features=new_token_features,
                        extras=new_extras)
Пример #14
0
def _collapse_entities_in_doc(doc, entities_to_collapse: Iterable[Entity],
                              entity_types_to_collapse: Union[set, frozenset]):
    """Collapse spans of *entities_to_collapse* with matching types in *doc*.

    Returns a pair (new_doc, collapsed_entities_mapping) where the mapping
    takes each entity in *entities_to_collapse* to its counterpart with
    borders corrected for the collapsed tokens.

    Raises:
        Exception: if doc carries any extras other than "ne".
    """
    # only the "ne" extra can have its borders corrected
    if set(doc.extras.keys()).difference({"ne"}):
        raise Exception("Currently support only ne extras")

    # copy features not to affect default document
    tokens_to_process = list(doc.tokens)
    token_features_to_process = {
        k: list(v)
        for k, v in doc.token_features.items()
    }

    borders_to_change = {
        'entities_to_collapse': build_borders_dict(entities_to_collapse),
        'sentences': build_borders_dict(doc.sentences)
    }
    try:
        borders_to_change["entities"] = build_borders_dict(doc.entities)
    except ValueError:
        # NOTE(review): doc.entities appears to raise ValueError when the doc
        # has no entities — collapsing proceeds without them; confirm
        pass

    if "ne" in doc.extras:
        borders_to_change["ne"] = build_borders_dict(doc.extras["ne"])

    # mutates tokens_to_process / token_features_to_process and all the
    # border dicts in place
    _collapse_entities_and_correct_features(entities_to_collapse,
                                            tokens_to_process,
                                            token_features_to_process,
                                            entity_types_to_collapse,
                                            borders_to_change)

    sentences_mapping = create_objects_with_new_borders(
        doc.sentences, borders_to_change['sentences'])
    collapsed_entities_mapping = create_objects_with_new_borders(
        entities_to_collapse, borders_to_change['entities_to_collapse'])

    if 'entities' in borders_to_change:
        doc_entities_mapping = create_objects_with_new_borders(
            doc.entities, borders_to_change['entities'])
        doc_entities = doc_entities_mapping.values()
    else:
        doc_entities = None

    if "ne" in doc.extras:
        ne_mapping = create_objects_with_new_borders(doc.extras["ne"],
                                                     borders_to_change["ne"])
        extras = {"ne": SortedSpansSet(ne_mapping.values())}
    else:
        extras = None

    doc_to_process = Document(doc.name,
                              tokens_to_process,
                              sentences_mapping.values(),
                              doc.paragraphs,
                              doc_entities,
                              token_features=token_features_to_process,
                              extras=extras)

    # best effort: rebuild relations over the shifted entities; skipped when
    # the doc has no entities/relations (doc.relations raising ValueError, or
    # doc_entities_mapping being undefined is avoided by the same path)
    try:
        relations = [
            Relation(doc_entities_mapping[r.first_entity],
                     doc_entities_mapping[r.second_entity], r.type)
            for r in doc.relations
        ]
        doc_to_process = doc_to_process.with_relations(relations)
    except ValueError:
        pass

    return doc_to_process, collapsed_entities_mapping
Пример #15
0
    def setUp(self) -> None:
        self.docs = [
            Document('1',
                     ['Во', 'время', 'своих', 'прогулок', 'в', 'окрестностях', 'Симеиза', 'я', 'обратил', 'внимание',
                      'на', 'одинокую', 'дачу', ',', 'стоявшую', 'на', 'крутом', 'склоне', 'горы', '.',
                      'К', 'этой', 'даче', 'не', 'было', 'проведено', 'даже', 'дороги', '.',
                      'Кругом', 'она', 'была', 'обнесена', 'высоким', 'забором', ',', 'с', 'единственной', 'низкой',
                      'калиткой', ',', 'которая', 'всегда', 'была', 'плотно', 'прикрыта', '.'],
                     [Sentence(0, 20), Sentence(20, 29), Sentence(29, 47)],
                     [Paragraph(0, 3)],
                     [Entity('1', 2, 3, 'pron'),
                      Entity('1', 7, 8, 'pron'),
                      Entity('1', 11, 13, 'noun'),
                      Entity('1', 21, 23, 'noun'),
                      Entity('1', 30, 31, 'pron'),
                      Entity('1', 33, 35, 'noun'),
                      Entity('1', 37, 38, 'noun'),
                      Entity('1', 37, 40, 'noun'),
                      Entity('1', 41, 42, 'pron')],
                     {
                         Relation(Entity('1', 2, 3, 'pron'), Entity('1', 7, 8, 'pron'), 'COREF'),
                         Relation(Entity('1', 11, 13, 'noun'), Entity('1', 21, 23, 'noun'), 'COREF'),
                         Relation(Entity('1', 11, 13, 'noun'), Entity('1', 30, 31, 'pron'), 'COREF'),
                         Relation(Entity('1', 21, 23, 'noun'), Entity('1', 30, 31, 'pron'), 'COREF'),
                         Relation(Entity('1', 37, 40, 'noun'), Entity('1', 41, 42, 'pron'), 'COREF'),
                     },
                     {
                         'pos': ['ADP', 'NOUN', 'DET', 'NOUN', 'ADP', 'NOUN', 'PROPN', 'PRON', 'VERB', 'NOUN', 'ADP',
                                 'ADJ', 'NOUN',
                                 'PUNCT', 'VERB', 'ADP', 'ADJ', 'NOUN', 'NOUN', 'PUNCT', 'ADP', 'DET', 'NOUN', 'PART',
                                 'AUX', 'VERB',
                                 'PART', 'NOUN', 'PUNCT', 'ADV', 'PRON', 'AUX', 'VERB', 'ADJ', 'NOUN', 'PUNCT', 'ADP',
                                 'ADJ', 'ADJ',
                                 'NOUN', 'PUNCT', 'PRON', 'ADV', 'AUX', 'ADV', 'VERB', 'PUNCT'],
                         'dt_labels': ['case', 'fixed', 'amod', 'obl', 'case', 'nmod', 'nmod', 'nsubj', 'root', 'obj',
                                       'case',
                                       'amod', 'nmod', 'punct', 'amod', 'case', 'amod', 'obl', 'nmod', 'punct', 'case',
                                       'amod',
                                       'obl', 'advmod', 'aux:pass', 'root', 'advmod', 'nsubj', 'punct', 'advmod',
                                       'nsubj',
                                       'aux:pass', 'root', 'amod', 'obl', 'punct', 'case', 'amod', 'amod', 'conj',
                                       'punct', 'nsubj',
                                       'advmod', 'aux:pass', 'advmod', 'acl:relcl', 'punct'],
                         'dt_head_distances': [3, -1, 1, 5, 1, -2, -1, 1, 0, -1, 2, 1, -3, -1, -2, 2, 1, -3, -1, -1, 2,
                                               1, 3, 2, 1,
                                               0, 1, -2, -1, 3, 2, 1, 0, 1, -2, -1, 3, 2, 1, -5, -1, 4, 3, 2, 1, -11,
                                               -1],
                         'lemmas': ['во', 'время', 'свой', 'прогулка', 'в', 'окрестность', 'Симеиза', 'я', 'обращать',
                                    'внимание',
                                    'на', 'одинокий', 'дача', ',', 'стоять', 'на', 'крутой', 'склон', 'гора', '.', 'к',
                                    'этот',
                                    'дача', 'не', 'быть', 'проводить', 'даже', 'дорога', '.', 'кругом', 'она', 'быть',
                                    'обнесен',
                                    'высокий', 'забор', ',', 'с', 'единственный', 'низкий', 'калитка', ',', 'который',
                                    'всегда',
                                    'быть', 'плотно', 'прикрывать', '.'],
                         'feats': [{}, {'Case': 'Accusative', 'Animacy': 'Inanimated', 'Number': 'Singular',
                                        'Gender': 'Neuter'},
                                   {'Number': 'Plural', 'Pronoun': 'REFLEXIVE', 'Case': 'Genitive'},
                                   {'Case': 'Genitive', 'Animacy': 'Inanimated', 'Number': 'Plural',
                                    'Gender': 'Neuter'}, {},
                                   {'Case': 'Prepositional', 'Animacy': 'Inanimated', 'Number': 'Plural',
                                    'Gender': 'Masculine'},
                                   {'Case': 'Genitive', 'Animacy': 'Inanimated', 'Number': 'Singular',
                                    'Gender': 'Masculine'},
                                   {'Animacy': 'Animated', 'Gender': 'Masculine', 'Number': 'Singular',
                                    'Pronoun': 'DEICTIC',
                                    'Case': 'Nominative'},
                                   {'Number': 'Singular', 'Gender': 'Masculine', 'Tense': 'Past', 'Mode': 'Indicative'},
                                   {'Case': 'Accusative', 'Animacy': 'Inanimated', 'Number': 'Singular',
                                    'Gender': 'Neuter'}, {},
                                   {'Case': 'Accusative', 'Representation': 'Participle', 'Number': 'Singular',
                                    'Gender': 'Feminine',
                                    'Tense': 'NotPast'},
                                   {'Case': 'Accusative', 'Animacy': 'Inanimated', 'Number': 'Singular',
                                    'Gender': 'Feminine'}, {},
                                   {'Case': 'Accusative', 'Representation': 'Participle', 'Number': 'Singular',
                                    'Gender': 'Feminine',
                                    'Tense': 'Past'}, {},
                                   {'Case': 'Prepositional', 'Number': 'Singular', 'Gender': 'Masculine'},
                                   {'Case': 'Prepositional', 'Animacy': 'Inanimated', 'Number': 'Singular',
                                    'Gender': 'Masculine'},
                                   {'Case': 'Genitive', 'Animacy': 'Inanimated', 'Number': 'Singular',
                                    'Gender': 'Feminine'}, {}, {},
                                   {'Case': 'Dative', 'Number': 'Singular', 'Gender': 'Feminine'},
                                   {'Case': 'Dative', 'Animacy': 'Inanimated', 'Number': 'Singular',
                                    'Gender': 'Feminine'}, {},
                                   {'Number': 'Singular', 'Gender': 'Neuter', 'Tense': 'Past', 'Mode': 'Indicative'},
                                   {'Representation': 'Participle', 'Number': 'Singular', 'Gender': 'Neuter',
                                    'Shortness': 'Short',
                                    'Tense': 'Past', 'Voice': 'Passive'}, {},
                                   {'Case': 'Genitive', 'Animacy': 'Inanimated', 'Number': 'Singular',
                                    'Gender': 'Feminine'}, {}, {},
                                   {'Animacy': 'Animated', 'Gender': 'Feminine', 'Number': 'Singular',
                                    'Pronoun': 'PERSONAL',
                                    'Case': 'Nominative'},
                                   {'Number': 'Singular', 'Gender': 'Feminine', 'Tense': 'Past', 'Mode': 'Indicative'},
                                   {'Representation': 'Participle', 'Number': 'Singular', 'Gender': 'Feminine',
                                    'Shortness': 'Short',
                                    'Tense': 'Past', 'Voice': 'Passive'},
                                   {'Case': 'Instrumental', 'Number': 'Singular', 'Gender': 'Masculine'},
                                   {'Case': 'Instrumental', 'Animacy': 'Inanimated', 'Number': 'Singular',
                                    'Gender': 'Masculine'},
                                   {}, {}, {'Case': 'Instrumental', 'Number': 'Singular', 'Gender': 'Feminine'},
                                   {'Case': 'Instrumental', 'Number': 'Singular', 'Gender': 'Feminine'},
                                   {'Case': 'Instrumental', 'Animacy': 'Inanimated', 'Number': 'Singular',
                                    'Gender': 'Feminine'}, {},
                                   {'Case': 'Nominative', 'Number': 'Singular', 'Gender': 'Feminine'}, {},
                                   {'Number': 'Singular', 'Gender': 'Feminine', 'Tense': 'Past', 'Mode': 'Indicative'},
                                   {},
                                   {'Representation': 'Participle', 'Number': 'Singular', 'Gender': 'Feminine',
                                    'Shortness': 'Short',
                                    'Tense': 'Past', 'Voice': 'Passive'}, {}],
                         'said': ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O',
                                  'O', 'O', 'O',
                                  'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O',
                                  'O', 'O', 'O',
                                  'O', 'O', 'O', 'O', 'O', 'O', 'O'],
                     },
                     {
                         'ne': SortedSpansSet([Entity('1', 6, 7, 'GPE_CITY')])
                     }
                     ),
            Document('1',
                     ['Когда', 'мы', 'шли', 'по', 'тропинке', ',', 'каждый', 'был', 'доволен', 'и', 'думал', ',', 'что',
                      'надул', 'другого', '.',
                      'Петька', 'изредка', 'посапывал', 'носом', '.',
                      'Давно', 'он', 'зарился', 'на', 'моих', 'голубей', ',', 'еще', 'с', 'прошлой', 'зимы', ',', 'а',
                      'теперь', 'вот', 'счастье', 'неожиданно', 'привалило', '.',
                      'А', 'у', 'меня', 'будет', 'пистолет', '.'],
                     [Sentence(0, 16), Sentence(16, 21), Sentence(21, 40), Sentence(40, 46)],
                     [Paragraph(0, 3)],
                     [
                         Entity('1', 1, 2, 'pron'),
                         Entity('1', 16, 17, 'noun'),
                         Entity('1', 22, 23, 'pron'),
                         Entity('1', 25, 26, 'pron'),
                         Entity('1', 25, 27, 'noun'),
                         Entity('1', 42, 43, 'pron'),
                         Entity('1', 44, 45, 'noun'),
                     ],
                     {
                         Relation(Entity('1', 16, 17, 'noun'), Entity('1', 22, 23, 'pron'), 'COREF'),
                         Relation(Entity('1', 25, 26, 'pron'), Entity('1', 42, 43, 'pron'), 'COREF'),
                     },
                     {
                         'pos': ['SCONJ', 'PRON', 'VERB', 'ADP', 'NOUN', 'PUNCT', 'ADJ', 'AUX', 'ADJ', 'CCONJ', 'VERB',
                                 'PUNCT',
                                 'SCONJ', 'VERB', 'ADJ', 'PUNCT', 'NOUN', 'ADV', 'VERB', 'NOUN', 'PUNCT', 'ADV', 'PRON',
                                 'VERB',
                                 'ADP', 'DET', 'NOUN', 'PUNCT', 'ADV', 'ADP', 'NOUN', 'NOUN', 'PUNCT', 'CCONJ', 'ADV',
                                 'PART',
                                 'NOUN', 'ADV', 'VERB', 'PUNCT', 'CCONJ', 'ADP', 'PRON', 'VERB', 'NOUN', 'PUNCT'],
                         'dt_labels': ['mark', 'nsubj', 'advcl', 'case', 'obl', 'punct', 'nsubj', 'cop', 'root', 'cc',
                                       'conj',
                                       'punct', 'mark', 'advcl', 'obj', 'punct', 'nsubj', 'advmod', 'root', 'obl',
                                       'punct', 'advmod',
                                       'nsubj', 'root', 'case', 'amod', 'obl', 'punct', 'advmod', 'case', 'obl', 'nmod',
                                       'punct',
                                       'cc', 'advmod', 'advmod', 'nsubj', 'advmod', 'conj', 'punct', 'cc', 'case',
                                       'root', 'cop',
                                       'nsubj', 'punct'],
                         'dt_head_distances': [8, 1, 6, 1, -2, -1, 2, 1, 0, 1, -2, -1, -2, -3, -1, -1, 2, 1, 0, -1, -1,
                                               2, 1, 0, 2,
                                               1, -3, -1, 2, 1, -7, -1, -1, 5, 4, 1, 2, 1, -15, -1, 2, 1, 0, -1, -2,
                                               -1],
                         'lemmas': ['когда', 'мы', 'идти', 'по', 'тропинка', ',', 'каждый', 'быть', 'довольный', 'и',
                                    'думать', ',',
                                    'что', 'надуть', 'другой', '.', 'Петька', 'изредка', 'посапывать', 'нос', '.',
                                    'давно', 'он',
                                    'зариться', 'на', 'мой', 'голубь', ',', 'еще', 'с', 'прошлый', 'зима', ',', 'а',
                                    'теперь', 'вот',
                                    'счастье', 'неожиданно', 'приваливать', '.', 'а', 'у', 'я', 'быть', 'пистолет',
                                    '.'],
                         'feats': [{}, {'Animacy': 'Animated', 'Number': 'Plural', 'Pronoun': 'DEICTIC',
                                        'Case': 'Nominative'},
                                   {'Number': 'Plural', 'Tense': 'Past', 'Mode': 'Indicative'}, {},
                                   {'Case': 'Dative', 'Animacy': 'Inanimated', 'Number': 'Singular',
                                    'Gender': 'Feminine'}, {},
                                   {'Case': 'Nominative', 'Number': 'Singular', 'Gender': 'Masculine'},
                                   {'Number': 'Singular', 'Gender': 'Masculine', 'Tense': 'Past', 'Mode': 'Indicative'},
                                   {'Number': 'Singular', 'Gender': 'Masculine', 'Shortness': 'Short'}, {},
                                   {'Number': 'Singular', 'Gender': 'Masculine', 'Tense': 'Past', 'Mode': 'Indicative'},
                                   {}, {},
                                   {'Number': 'Singular', 'Gender': 'Masculine', 'Tense': 'Past', 'Mode': 'Indicative'},
                                   {'Case': 'Accusative', 'Animacy': 'Animated', 'Number': 'Singular',
                                    'Gender': 'Masculine'}, {},
                                   {'Case': 'Nominative', 'Animacy': 'Animated', 'Number': 'Singular',
                                    'Gender': 'Masculine'}, {},
                                   {'Number': 'Singular', 'Gender': 'Masculine', 'Tense': 'Past', 'Mode': 'Indicative'},
                                   {'Case': 'Instrumental', 'Animacy': 'Inanimated', 'Number': 'Singular',
                                    'Gender': 'Masculine'},
                                   {}, {},
                                   {'Animacy': 'Animated', 'Gender': 'Masculine', 'Number': 'Singular',
                                    'Pronoun': 'PERSONAL',
                                    'Case': 'Nominative'},
                                   {'Number': 'Singular', 'Gender': 'Masculine', 'Tense': 'Past', 'Mode': 'Indicative'},
                                   {},
                                   {'Animacy': 'Animated', 'Number': 'Plural', 'Pronoun': 'POSSESSIVE',
                                    'Case': 'Accusative'},
                                   {'Case': 'Accusative', 'Animacy': 'Animated', 'Number': 'Plural',
                                    'Gender': 'Masculine'}, {}, {},
                                   {}, {'Case': 'Genitive', 'Number': 'Singular', 'Gender': 'Feminine'},
                                   {'Case': 'Genitive', 'Animacy': 'Inanimated', 'Number': 'Singular',
                                    'Gender': 'Feminine'}, {}, {},
                                   {}, {}, {'Case': 'Nominative', 'Animacy': 'Inanimated', 'Number': 'Singular',
                                            'Gender': 'Neuter'},
                                   {},
                                   {'Number': 'Singular', 'Gender': 'Neuter', 'Tense': 'Past', 'Mode': 'Indicative'},
                                   {}, {}, {},
                                   {'Animacy': 'Animated', 'Gender': 'Masculine', 'Number': 'Singular',
                                    'Pronoun': 'DEICTIC',
                                    'Case': 'Genitive'},
                                   {'Person': 'Third', 'Number': 'Singular', 'Tense': 'NotPast', 'Mode': 'Indicative'},
                                   {'Case': 'Nominative', 'Animacy': 'Inanimated', 'Number': 'Singular',
                                    'Gender': 'Masculine'}, {}],
                         'said': ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O',
                                  'O', 'O', 'O',
                                  'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O',
                                  'O', 'O', 'O',
                                  'O', 'O', 'O', 'O', 'O', 'O'],
                     },
                     {
                         'ne': SortedSpansSet([Entity('1', 16, 17, 'PERSON')])
                     }
                     )
        ]
        # Strip gold relations and attach empty sets instead: the hook treats
        # these empty relation sets as the "known" relations for each document.
        self.hook = get_hook([doc.without_relations().with_relations(set()) for doc in self.docs])
        self.base_props = {
            "seed": 12345,

            "distance": 10,
            "max_distance": 10,
            "loss": "cross_entropy",
            "optimizer": "momentum",
            "lr_decay": 0.05,
            "momentum": 0.9,
            "dropout": 0.5,
            "internal_size": 10,
            "epoch": 1,
            "batch_size": 64,
            "learning_rate": 0.1,

            "clip_norm": 5,

            "max_candidate_distance": 50,
            "max_entity_distance": 50,
            "max_word_distance": 50,
            "max_sent_distance": 10,
            "max_dt_distance": 10,
            "dist_size": 50,

            "pos_emb_size": 0,
            "morph_feats_emb_size": 0,
            "entities_types_size": 20,

            "morph_feats_size": 0,
            "morph_feats_list": ["Gender", "Animacy", "Number"],

            "encoding_type": "lstm",
            "entity_encoding_size": 10,
            "encoding_size": 10,
            "classifiers": ["exact_match", "intersecting_mentions"],
            "use_filter": False,

            "max_sent_entities_distance": 10,
            "max_token_entities_distance": 20,

            "agreement_types": ["Gender", "Animacy", "Number"],
            "classifier_agreement_size": 0,

            "head_str_match_size": 0,
            "partial_str_match_size": 0,
            "ordered_partial_str_match_size": 0,

            "mention_interrelation_size": 0,
            "mention_distance_size": 0,
            "max_mention_distance": 50,
            "classifier_entity_distance_size": 0,
            "entities_types_in_classifier_size": 0,
            "head_ne_types_size": 0,
            "entities_token_distance_in_classifier_size": 0,
            "entities_sent_distance_in_classifier_size": 0,

            "encoder_entity_types_size": 0,
            "encoder_entity_ne_size": 0,

            "speech_types": ["said"],
            "speech_size": 0,

            "entity_encoding_type": "rnn",

            "classification_dense_size": 20,
        }
        self.experiment_props = {
            "sampling_strategy": ["coref_noun", "coref_pron_cluster", 'coref_pron_cluster_strict', 'coref_pron']
        }