def test_without_labels(self):
    """Extract features from a document stripped of its entities.

    The extractor is built from the entity-bearing document and then
    applied to the result of ``without_entities()``. The single returned
    feature dict must describe all seven tokens and must not expose a
    ``labels`` key.
    """
    people = [Entity("_", 4, 5, "PER"), Entity("_", 6, 7, "PER")]
    tokens = ['Planning', 'of', 'work', 'of', 'Elon', "by", "Elon"]
    labelled_doc = Document('', tokens, [Sentence(0, 7)], [], people)
    ner_fe, _ = ner_fe_factory([labelled_doc], {"internal_emb_size": 10})

    unlabelled_doc = labelled_doc.without_entities()
    # Exactly one feature dict is expected; unpacking fails otherwise.
    (features,) = ner_fe.extract_features_from_doc(unlabelled_doc)
    words = features['words_0']

    expected_len = 7
    self.assertEqual(features['seq_len'], expected_len)
    self.assertEqual(len(words), expected_len)

    # Distinct tokens get distinct values, repeated tokens get equal ones.
    self.assertNotEqual(words[0], words[1])  # Planning of
    self.assertEqual(words[1], words[3])  # of of

    with self.assertRaises(KeyError):
        features['labels']
class TestNETFeatureExtractorClass(unittest.TestCase):
    """Tests for the feature extractor returned by ``net_fe_factory``.

    The fixture is a two-sentence, 20-token document with typed entities
    and a parallel list of named entities stored under the ``"ne"`` extra.
    """

    def setUp(self) -> None:
        """Build the shared document and the feature extractor under test."""
        ents = [
            Entity("_", 4, 5, "Master"),
            Entity("_", 6, 7, "CEO"),
            Entity("_", 8, 9, "CITY"),
        ]
        self.doc_ne = [
            Entity("gen", 0, 1, "STUFF"),
            Entity("gen", 4, 5, "PER"),
            Entity("gen", 4, 5, "ELON"),
            Entity("gen", 6, 7, "PER"),
            Entity("gen", 6, 7, "RESTR"),
            Entity("gen", 8, 9, "LOC"),
            Entity("gen", 8, 9, "RESTR"),
            Entity("gen", 10, 11, "LOC"),
            Entity("gen", 12, 13, "PER"),
            Entity("gen", 18, 19, "LOC"),
        ]
        extras = {"ne": self.doc_ne}

        # The comma after the first "." matters: without it Python joins the
        # adjacent literals into a single ".Elon" token, leaving 19 tokens
        # for sentences declared to span 20 and shifting every token of the
        # second sentence by one position.
        tokens = [
            'Planning', 'of', 'work', 'of', 'Elon', "by", "Elon", "in", "LA", "in", "USA", ".",
            'Elon', 'is', 'going', 'to', 'land', 'on', 'Mars', ".",
        ]
        self.doc = Document(
            '', tokens, [Sentence(0, 12), Sentence(12, 20)], [], ents, extras=extras)

        self.net_fe, _, _ = net_fe_factory(
            [self.doc], {
                "internal_emb_size": 10,
                "ne_type_in_classifier_size": 10,
                "token_position_size": 10,
                "max_word_distance": 5,
                "restricted_ne_types": ["RESTR"],
            })

    def test_without_labels(self):
        """Samples extracted from an entity-free document carry no labels."""
        actual_entities, actual_samples = zip(
            *self.net_fe.extract_features_from_doc(self.doc.without_entities()))
        self._test_samples(actual_entities, actual_samples)

        for sample in actual_samples:
            if isinstance(sample, dict):
                self.assertNotIn("labels", sample)

    def test_with_labels(self):
        """Samples extracted with ``include_labels=True`` carry labels."""
        actual_entities, actual_samples = zip(
            *self.net_fe.extract_features_from_doc(self.doc, include_labels=True))
        self._test_samples(actual_entities, actual_samples)

        for sample in actual_samples:
            if isinstance(sample, dict):
                self.assertIn("labels", sample)

        # Master vs CEO
        self.assertNotEqual(actual_samples[1]["labels"], actual_samples[3]["labels"])
        # CITY vs None
        self.assertNotEqual(actual_samples[5]["labels"], actual_samples[7]["labels"])

    def _test_samples(self, actual_entities, actual_samples):
        """Run the checks shared by the labelled and unlabelled tests.

        ``actual_entities`` and ``actual_samples`` are the two parallel
        sequences obtained by unzipping the extractor output; indices below
        refer to positions in ``self.doc_ne``.
        """
        self.assertListEqual(list(actual_entities), self.doc_ne)

        # STUFF entities are not classified, RESTR entities are restricted
        for idx in [0, 4, 6]:
            self.assertIsNone(actual_samples[idx])

        # ELON entities always classified as Master
        self.assertEqual("Master", actual_samples[2])

        # Need to classify PER and LOC entities
        first_sent_samples_to_classify = [actual_samples[i] for i in [1, 3, 5, 7]]

        for ent_sample in first_sent_samples_to_classify:
            self.assertIsInstance(ent_sample, dict)
            words = ent_sample['words_0']
            self.assertEqual(ent_sample['seq_len'], 12)
            self.assertEqual(len(words), 12)
            self.assertNotEqual(words[0], words[1])  # Planning of
            self.assertEqual(words[1], words[3])  # of of

        ent_1_sample, ent_3_sample, ent_5_sample, ent_7_sample = first_sent_samples_to_classify

        self.assertEqual(
            ent_5_sample["ne_type_in_classifier"],
            ent_7_sample["ne_type_in_classifier"])  # LOC LOC
        self.assertEqual(
            ent_1_sample["ne_type_in_classifier"],
            ent_3_sample["ne_type_in_classifier"])  # PER PER
        self.assertNotEqual(
            ent_1_sample["ne_type_in_classifier"],
            ent_5_sample["ne_type_in_classifier"])  # PER LOC

        self.assertEqual(
            ent_5_sample["token_position"][7], ent_7_sample["token_position"][9])  # -1 -1
        self.assertEqual(
            ent_5_sample["token_position"][9], ent_7_sample["token_position"][11])  # 1 1

        self.assertNotEqual(
            ent_3_sample["labels_mask"],
            ent_5_sample["labels_mask"])  # MASTER, CEO | CITY, None
        self.assertEqual(ent_5_sample["labels_mask"], ent_7_sample["labels_mask"])
        self.assertEqual(ent_1_sample["labels_mask"], ent_3_sample["labels_mask"])

        self.assertEqual(ent_1_sample["indices"], [[4, 5]])
        self.assertEqual(ent_3_sample["indices"], [[6, 7]])
        self.assertEqual(ent_5_sample["indices"], [[8, 9]])
        self.assertEqual(ent_7_sample["indices"], [[10, 11]])

        # Entities at document positions 12-13 and 18-19 belong to the second
        # sentence, which starts at token 12, hence the shifted indices.
        ent_8_sample, ent_9_sample = actual_samples[8], actual_samples[9]
        self.assertEqual(ent_8_sample["indices"], [[0, 1]])
        self.assertEqual(ent_9_sample["indices"], [[6, 7]])
class NERCPreprocessorsTest(unittest.TestCase):
    """Tests ``NERPreprocessor`` and ``NETPreprocessor`` on one shared document.

    Each test builds the preprocessor twice, once through its constructor
    and once through ``from_props``, and expects the same processed
    document from both.
    """

    @staticmethod
    def _entities(*specs):
        """Return one ``Entity`` per ``(id, start, end, type)`` tuple, in order."""
        return [Entity(*spec) for spec in specs]

    def _assert_processed(self, preprocessor, expected_doc):
        """Assert that ``preprocessor`` turns ``self.doc`` into ``expected_doc``."""
        self.assertEqual(expected_doc, preprocessor.process_doc(self.doc))

    def setUp(self) -> None:
        """Create a 45-token, two-sentence document with seven entities."""
        first_sentence = [
            "Главный", "тренер", "римского", "«", "Лацио", "»", "Симоне", "Индзаги",
            "продолжит", "работу", "с", "командой", ",", "сообщает", "пресс-служба",
            "клуба", ".",
        ]
        second_sentence = [
            "Ранее", "сообщалось", ",", "что", "в", "услугах", "Индзаги",
            "заинтересованы", "«", "Милан", "»", "и", "«", "Ювентус", "»", ",",
            "которые", "пребывают", "без", "наставников", "после", "ухода",
            "Дженнаро", "Гаттузо", "и", "Массимилиано", "Аллегри", ".",
        ]
        tokens = first_sentence + second_sentence
        sentences = [Sentence(0, 17), Sentence(17, 45)]
        # NOTE(review): two sentences are declared but the paragraph is (0, 1);
        # confirm whether Paragraph(0, 2) was intended.
        paragraphs = [Paragraph(0, 1)]
        entities = self._entities(
            ("T1", 4, 5, "Team"),
            ("T2", 6, 8, "PlayerCoach1"),
            ("T3", 23, 24, "PlayerCoach2"),
            ("T4", 26, 27, "TeamFilter"),
            ("T5", 30, 31, "Team"),
            ("T6", 39, 41, "Coach"),
            ("T7", 42, 44, "Coach"),
        )
        self.doc = Document("_", tokens, sentences, paragraphs, entities)

    def test_ner_preprocessor(self):
        """The filtered type is dropped and both PlayerCoach types become Coach."""
        direct = NERPreprocessor(
            {"TeamFilter"},
            {"PlayerCoach1": "Coach", "PlayerCoach2": "Coach"},
        )

        kept_entities = self._entities(
            ("T1", 4, 5, "Team"),
            ("T2", 6, 8, "Coach"),
            ("T3", 23, 24, "Coach"),
            ("T5", 30, 31, "Team"),
            ("T6", 39, 41, "Coach"),
            ("T7", 42, 44, "Coach"),
        )
        expected_doc = self.doc.without_entities().with_entities(kept_entities)
        self._assert_processed(direct, expected_doc)

        configured = NERPreprocessor.from_props({
            "ent_types_to_filter": ["TeamFilter"],
            "ent_types_merge_pattern": {"Coach": ["PlayerCoach1", "PlayerCoach2"]},
        })
        self._assert_processed(configured, expected_doc)

    def test_net_preprocessor(self):
        """Entities and the ``"ne"`` extra use separate type replacements."""
        ne_mapping = {"PlayerCoach1": "Coach", "PlayerCoach2": "Coach"}
        ent_mapping = {"PlayerCoach1": "PlayerCoach", "PlayerCoach2": "PlayerCoach"}
        direct = NETPreprocessor({"TeamFilter"}, ne_mapping, ent_mapping)

        kept_entities = self._entities(
            ("T1", 4, 5, "Team"),
            ("T2", 6, 8, "PlayerCoach"),
            ("T3", 23, 24, "PlayerCoach"),
            ("T5", 30, 31, "Team"),
            ("T6", 39, 41, "Coach"),
            ("T7", 42, 44, "Coach"),
        )
        expected_nes = SortedSpansSet(self._entities(
            ("T1", 4, 5, "Team"),
            ("T2", 6, 8, "Coach"),
            ("T3", 23, 24, "Coach"),
            ("T5", 30, 31, "Team"),
            ("T6", 39, 41, "Coach"),
            ("T7", 42, 44, "Coach"),
        ))
        stripped = self.doc.without_entities()
        with_entities = stripped.with_entities(kept_entities)
        expected_doc = with_entities.with_additional_extras({"ne": expected_nes})
        self._assert_processed(direct, expected_doc)

        configured = NETPreprocessor.from_props({
            "ent_types_to_filter": ["TeamFilter"],
            "ne_types_merge_pattern": {"Coach": ["PlayerCoach1", "PlayerCoach2"]},
            "ent_types_merge_pattern": {"PlayerCoach": ["PlayerCoach1", "PlayerCoach2"]},
        })
        self._assert_processed(configured, expected_doc)