예제 #1
0
    def test_without_labels(self):
        """Features extracted from an entity-free copy of the document carry no 'labels' key."""
        tokens = ['Planning', 'of', 'work', 'of', 'Elon', "by", "Elon"]
        persons = [Entity("_", 4, 5, "PER"), Entity("_", 6, 7, "PER")]
        labelled_doc = Document('', tokens, [Sentence(0, 7)], [], persons)

        # The extractor is built from the labelled document; the second value
        # returned by the factory is not needed in this test.
        ner_fe, _ = ner_fe_factory([labelled_doc], {"internal_emb_size": 10})

        # Exactly one feature dict is expected back for the single sentence.
        unlabelled_doc = labelled_doc.without_entities()
        [features] = ner_fe.extract_features_from_doc(unlabelled_doc)

        word_feats = features['words_0']
        self.assertEqual(features['seq_len'], 7)
        self.assertEqual(len(word_feats), 7)

        # Different tokens map to different values, equal tokens to equal values.
        self.assertNotEqual(word_feats[0], word_feats[1])  # Planning of
        self.assertEqual(word_feats[1], word_feats[3])  # of of

        # Without entities there must be nothing stored under 'labels'.
        with self.assertRaises(KeyError):
            features['labels']
예제 #2
0
class TestNETFeatureExtractorClass(unittest.TestCase):
    """Checks the per-entity samples produced by the extractor from ``net_fe_factory``.

    The fixture is a two-sentence, 20-token document whose typed entities are
    passed as the document entities and whose generic named entities are
    passed through the ``"ne"`` extras.
    """

    def setUp(self) -> None:
        """Build the fixture document and the feature extractor under test."""
        # Typed entities handed to Document as its entities.
        ents = [
            Entity("_", 4, 5, "Master"),
            Entity("_", 6, 7, "CEO"),
            Entity("_", 8, 9, "CITY")
        ]
        # Named entities stored in the "ne" extras; _test_samples expects the
        # extractor to return exactly these, in this order.
        self.doc_ne = [
            Entity("gen", 0, 1, "STUFF"),
            Entity("gen", 4, 5, "PER"),
            Entity("gen", 4, 5, "ELON"),
            Entity("gen", 6, 7, "PER"),
            Entity("gen", 6, 7, "RESTR"),
            Entity("gen", 8, 9, "LOC"),
            Entity("gen", 8, 9, "RESTR"),
            Entity("gen", 10, 11, "LOC"),
            Entity("gen", 12, 13, "PER"),
            Entity("gen", 18, 19, "LOC")
        ]

        extras = {"ne": self.doc_ne}

        # Every token must be followed by a comma: adjacent string literals are
        # silently concatenated, which would merge "." and 'Elon' into a single
        # token and leave only 19 tokens for Sentence(12, 20) and the entities
        # at 12-13 ('Elon') and 18-19 ('Mars').
        self.doc = Document('', [
            'Planning', 'of', 'work', 'of', 'Elon', "by", "Elon", "in", "LA",
            "in", "USA", ".",
            'Elon', 'is', 'going', 'to', 'land', 'on', 'Mars', ".",
        ], [Sentence(0, 12), Sentence(12, 20)], [],
                            ents,
                            extras=extras)

        self.net_fe, _, _ = net_fe_factory(
            [self.doc], {
                "internal_emb_size": 10,
                "ne_type_in_classifier_size": 10,
                "token_position_size": 10,
                "max_word_distance": 5,
                "restricted_ne_types": ["RESTR"]
            })

    def test_without_labels(self):
        """Samples extracted from an entity-free document must not contain 'labels'."""
        actual_entities, actual_samples = zip(
            *self.net_fe.extract_features_from_doc(
                self.doc.without_entities()))
        self._test_samples(actual_entities, actual_samples)

        # Only dict samples can carry labels; None / str samples are skipped.
        for sample in actual_samples:
            if isinstance(sample, dict):
                self.assertNotIn("labels", sample)

    def test_with_labels(self):
        """Samples extracted with ``include_labels=True`` must contain 'labels'."""
        actual_entities, actual_samples = zip(
            *self.net_fe.extract_features_from_doc(self.doc,
                                                   include_labels=True))
        self._test_samples(actual_entities, actual_samples)

        for sample in actual_samples:
            if isinstance(sample, dict):
                self.assertIn("labels", sample)

        # Entities with different gold types must get different labels.
        self.assertNotEqual(actual_samples[1]["labels"],
                            actual_samples[3]["labels"])  # MASTER CEO
        self.assertNotEqual(actual_samples[5]["labels"],
                            actual_samples[7]["labels"])  # CITY None

    def _test_samples(self, actual_entities, actual_samples):
        """Run the assertions shared by the labelled and unlabelled tests.

        ``actual_entities`` and ``actual_samples`` are the two parallel
        sequences obtained by unzipping the extractor output; position ``i``
        in both corresponds to ``self.doc_ne[i]``.
        """
        self.assertListEqual(list(actual_entities), self.doc_ne)

        # STUFF entities are not classified, RESTR entities are restricted
        for idx in [0, 4, 6]:
            self.assertIsNone(actual_samples[idx])

        # ELON entities always classified as Master
        self.assertEqual("Master", actual_samples[2])

        # Need to classify PER and LOC entities
        first_sent_samples_to_classify = [
            actual_samples[i] for i in [1, 3, 5, 7]
        ]

        for ent_sample in first_sent_samples_to_classify:
            self.assertIsInstance(ent_sample, dict)
            words = ent_sample['words_0']
            self.assertEqual(ent_sample['seq_len'], 12)
            self.assertEqual(len(words), 12)

            self.assertNotEqual(words[0], words[1])  # Planning of
            self.assertEqual(words[1], words[3])  # of of

        ent_1_sample, ent_3_sample, ent_5_sample, ent_7_sample = first_sent_samples_to_classify

        self.assertEqual(ent_5_sample["ne_type_in_classifier"],
                         ent_7_sample["ne_type_in_classifier"])  # LOC LOC
        self.assertEqual(ent_1_sample["ne_type_in_classifier"],
                         ent_3_sample["ne_type_in_classifier"])  # PER PER
        self.assertNotEqual(ent_1_sample["ne_type_in_classifier"],
                            ent_5_sample["ne_type_in_classifier"])  # PER LOC

        # Tokens at the same offset from their entity share a position value.
        self.assertEqual(ent_5_sample["token_position"][7],
                         ent_7_sample["token_position"][9])  # -1 -1
        self.assertEqual(ent_5_sample["token_position"][9],
                         ent_7_sample["token_position"][11])  # 1 1

        self.assertNotEqual(
            ent_3_sample["labels_mask"],
            ent_5_sample["labels_mask"])  # MASTER, CEO | CITY, None
        self.assertEqual(ent_5_sample["labels_mask"],
                         ent_7_sample["labels_mask"])
        self.assertEqual(ent_1_sample["labels_mask"],
                         ent_3_sample["labels_mask"])

        self.assertEqual(ent_1_sample["indices"], [[4, 5]])
        self.assertEqual(ent_3_sample["indices"], [[6, 7]])
        self.assertEqual(ent_5_sample["indices"], [[8, 9]])
        self.assertEqual(ent_7_sample["indices"], [[10, 11]])

        # Second-sentence entities (doc tokens 12-13 and 18-19): the expected
        # indices are offset by the sentence start, 12.
        ent_8_sample, ent_9_sample = actual_samples[8], actual_samples[9]
        self.assertEqual(ent_8_sample["indices"], [[0, 1]])
        self.assertEqual(ent_9_sample["indices"], [[6, 7]])
예제 #3
0
class NERCPreprocessorsTest(unittest.TestCase):
    """Runs NERPreprocessor and NETPreprocessor over one annotated two-sentence document."""

    @staticmethod
    def _make_entities(specs):
        """Turn (id, start, end, type) tuples into a list of Entity objects."""
        return [Entity(*spec) for spec in specs]

    def _assert_processed(self, preprocessor, expected_doc):
        """Assert that ``preprocessor`` maps the fixture document to ``expected_doc``."""
        self.assertEqual(expected_doc, preprocessor.process_doc(self.doc))

    def setUp(self) -> None:
        """Create the shared document: 45 tokens, two sentences, seven entities."""
        first_sentence = [
            "Главный", "тренер", "римского", "«", "Лацио", "»", "Симоне",
            "Индзаги", "продолжит", "работу", "с", "командой", ",",
            "сообщает", "пресс-служба", "клуба", ".",
        ]
        second_sentence = [
            "Ранее", "сообщалось", ",", "что", "в", "услугах", "Индзаги",
            "заинтересованы", "«", "Милан", "»", "и", "«", "Ювентус", "»",
            ",", "которые", "пребывают", "без", "наставников", "после",
            "ухода", "Дженнаро", "Гаттузо", "и", "Массимилиано", "Аллегри",
            ".",
        ]
        annotated = (
            ("T1", 4, 5, "Team"),
            ("T2", 6, 8, "PlayerCoach1"),
            ("T3", 23, 24, "PlayerCoach2"),
            ("T4", 26, 27, "TeamFilter"),
            ("T5", 30, 31, "Team"),
            ("T6", 39, 41, "Coach"),
            ("T7", 42, 44, "Coach"),
        )

        # NOTE(review): Paragraph(0, 1) sits next to two sentences; if its
        # bounds are sentence indices, confirm the second sentence is meant
        # to be left outside any paragraph.
        self.doc = Document(
            "_",
            first_sentence + second_sentence,
            [Sentence(0, 17), Sentence(17, 45)],
            [Paragraph(0, 1)],
            self._make_entities(annotated),
        )

    def test_ner_preprocessor(self):
        """NERPreprocessor drops filtered types and renames merged ones, however it is built."""
        direct = NERPreprocessor(
            {"TeamFilter"},
            {"PlayerCoach1": "Coach", "PlayerCoach2": "Coach"},
        )

        # T4 (TeamFilter) is gone; both PlayerCoach variants become Coach.
        surviving = self._make_entities((
            ("T1", 4, 5, "Team"),
            ("T2", 6, 8, "Coach"),
            ("T3", 23, 24, "Coach"),
            ("T5", 30, 31, "Team"),
            ("T6", 39, 41, "Coach"),
            ("T7", 42, 44, "Coach"),
        ))
        stripped_doc = self.doc.without_entities()
        expected_doc = stripped_doc.with_entities(surviving)
        self._assert_processed(direct, expected_doc)

        # The same configuration expressed as props must give the same result.
        configured = NERPreprocessor.from_props({
            "ent_types_to_filter": ["TeamFilter"],
            "ent_types_merge_pattern": {"Coach": ["PlayerCoach1", "PlayerCoach2"]},
        })
        self._assert_processed(configured, expected_doc)

    def test_net_preprocessor(self):
        """NETPreprocessor applies separate renamings to entities and to the 'ne' extras."""
        direct = NETPreprocessor(
            {"TeamFilter"},
            {"PlayerCoach1": "Coach", "PlayerCoach2": "Coach"},
            {"PlayerCoach1": "PlayerCoach", "PlayerCoach2": "PlayerCoach"},
        )

        # Document entities: PlayerCoach variants collapse to PlayerCoach.
        surviving_entities = self._make_entities((
            ("T1", 4, 5, "Team"),
            ("T2", 6, 8, "PlayerCoach"),
            ("T3", 23, 24, "PlayerCoach"),
            ("T5", 30, 31, "Team"),
            ("T6", 39, 41, "Coach"),
            ("T7", 42, 44, "Coach"),
        ))
        # "ne" extras: the same spans, with PlayerCoach variants turned into Coach.
        surviving_nes = SortedSpansSet(self._make_entities((
            ("T1", 4, 5, "Team"),
            ("T2", 6, 8, "Coach"),
            ("T3", 23, 24, "Coach"),
            ("T5", 30, 31, "Team"),
            ("T6", 39, 41, "Coach"),
            ("T7", 42, 44, "Coach"),
        )))

        stripped_doc = self.doc.without_entities()
        relabelled_doc = stripped_doc.with_entities(surviving_entities)
        expected_doc = relabelled_doc.with_additional_extras({"ne": surviving_nes})
        self._assert_processed(direct, expected_doc)

        # The same configuration expressed as props must give the same result.
        configured = NETPreprocessor.from_props({
            "ent_types_to_filter": ["TeamFilter"],
            "ne_types_merge_pattern": {"Coach": ["PlayerCoach1", "PlayerCoach2"]},
            "ent_types_merge_pattern": {"PlayerCoach": ["PlayerCoach1", "PlayerCoach2"]},
        })
        self._assert_processed(configured, expected_doc)