def test_two_entities_separated(self):
    """Two non-adjacent entities trigger two independent sentence merges."""
    original = [Sentence(0, 2), Sentence(2, 3), Sentence(3, 5),
                Sentence(5, 8), Sentence(8, 10), Sentence(10, 15)]
    ents = [Entity('', 2, 4, 'test'), Entity('', 9, 11, 'test')]

    expected = [Sentence(0, 2), Sentence(2, 5), Sentence(5, 8), Sentence(8, 15)]
    self.assertEqual(expected, adjust_sentences(original, ents))
def test_bio2_strategy_encoding(self):
    """BIO2 encoding: a B- tag opens an entity only when it immediately
    follows another entity of the same type (see positions 114-118)."""
    strategy = BIO2LabellingStrategy()
    sent = Sentence(110, 120)
    ents = [
        Entity("_", 110, 112, "T1"),
        Entity("_", 112, 113, "T2"),
        Entity("_", 114, 115, "T3"),
        Entity("_", 115, 118, "T3"),
        Entity("_", 119, 120, "T1"),
    ]

    expected_categories = {
        "O", "I-T1", "I-T2", "I-T3", "B-T1", "B-T2", "B-T3"
    }
    self.assertEqual(expected_categories,
                     strategy.get_possible_categories(self.ent_types))

    expected_labels = [
        "I-T1", "I-T1", "I-T2", "O", "I-T3",
        "B-T3", "I-T3", "I-T3", "O", "I-T1",
    ]
    self.assertEqual(expected_labels, strategy.encode_labels(sent, ents))
def test_nouns(self):
    """A document containing only noun entities yields no pronoun samples."""
    sentences = [Sentence(0, 10)]
    paragraphs = [Paragraph(0, 1)]
    entities = [
        Entity('_', 0, 1, 'noun'),
        Entity('_', 1, 2, 'noun'),
        Entity('_', 2, 3, 'noun'),
        Entity('_', 3, 4, 'noun'),
    ]
    rels = {
        Relation(Entity('_', 0, 1, 'noun'), Entity('_', 1, 2, 'noun'), '1'),
        Relation(Entity('_', 0, 1, 'noun'), Entity('_', 2, 3, 'noun'), '1'),
        Relation(Entity('_', 2, 3, 'noun'), Entity('_', 3, 4, 'noun'), '1'),
    }
    doc = Document('test', [], sentences, paragraphs, entities, rels)

    self.assertEqual(get_pron_samples(doc, 3, False), [])
def test_intersecting(self):
    """Overlapping and nested entity spans are collapsed into single
    entities and relations are remapped onto the merged spans."""
    ents = _create_ents([(0, 3), (0, 4), (1, 2), (5, 7), (5, 7), (6, 7),
                         (8, 9), (10, 13), (12, 14), (13, 16)])
    rels = {
        _create_rel(ents[1], ents[2]),
        _create_rel(ents[0], ents[1]),
        _create_rel(ents[5], ents[2]),
        _create_rel(ents[3], ents[4]),
        _create_rel(ents[6], ents[7]),
        _create_rel(ents[8], ents[6]),
    }

    expected_ents = [
        Entity("0", 0, 4, "T1"),
        Entity("3", 5, 7, "T1"),
        Entity("6", 8, 9, "T1"),
        Entity("7", 10, 16, "T1"),
    ]
    # Relations between entities merged into the same span collapse to
    # self-relations.  (The original listed the first self-relation twice;
    # set literals deduplicate, so the duplicate was dead code.)
    expected_rels = {
        _create_rel(expected_ents[0], expected_ents[0]),
        _create_rel(expected_ents[1], expected_ents[0]),
        _create_rel(expected_ents[1], expected_ents[1]),
        _create_rel(expected_ents[2], expected_ents[3]),
        _create_rel(expected_ents[3], expected_ents[2]),
    }

    self.assertEqual((expected_ents, expected_rels),
                     collapse_intersecting_entities(ents, rels))
def test_contained_entities(self):
    """Several entities inside one merged region extend a single sentence."""
    original = [Sentence(0, 2), Sentence(2, 3), Sentence(3, 5),
                Sentence(5, 8), Sentence(8, 10), Sentence(10, 15)]
    ents = [Entity('', 2, 6, 'test'),
            Entity('', 6, 7, 'test'),
            Entity('', 7, 8, 'test')]

    expected = [Sentence(0, 2), Sentence(2, 8), Sentence(8, 10), Sentence(10, 15)]
    self.assertEqual(expected, adjust_sentences(original, ents))
def _get_direction_feature(e1: Entity, e2: Entity):
    """Categorical relation between two entity spans.

    Containment is checked before linear order, so nested pairs are
    reported as containment rather than by start position.
    """
    if e1.contains(e2):
        return "e2_in_e1"
    if e2.contains(e1):
        return "e1_in_e2"
    return "e1_e2" if e1.start_token < e2.start_token else "e2_e1"
def test_assert_equal_with_different_types(self):
    """Collapsing entities with identical spans but different types fails."""
    ents = [
        # Equal spans, different types.
        Entity("8", 200, 204, "T1"),
        Entity("9", 200, 204, "T2"),
    ]
    self.assertRaises(Exception,
                      collapse_intersecting_entities, ents, set())
def test_inner_entities_collapse(self):
    """Collapsing 'Geographical' entities replaces their tokens with a
    placeholder and shifts all downstream annotations accordingly."""
    expected_tokens = [
        "Recurrence", "of", "Pelecypod-associated", "cholera", "in",
        "$Geographical$", ".", "From", "Oct.", "30", "to", "Nov.", "7", ",",
        "1979", ",", "10", "people", "in", "the", "$Geographical$", "of",
        "$Geographical$", "had", "onset", "of", "bacteriologically",
        "confirmed", "cholera", "."
    ]
    expected_sentences = [Sentence(0, 7), Sentence(7, 30)]
    expected_entities = [
        Entity("T1", 2, 3, "Habitat"),
        Entity("T2", 2, 4, "Bacteria"),
        Entity("T3", 3, 4, "Bacteria"),
        Entity("T4", 5, 6, "Geographical"),
        Entity("T5", 17, 18, "Habitat"),
        Entity("T6", 17, 23, "Habitat"),
        Entity("T7", 20, 21, "Geographical"),
        Entity("T8", 22, 23, "Geographical"),
        Entity("T9", 28, 29, "Bacteria"),
    ]
    expected_paragraphs = [Paragraph(0, 1), Paragraph(1, 2)]
    expected_relations = [
        Relation(expected_entities[0], expected_entities[1], "Lives_in"),
        Relation(expected_entities[8], expected_entities[6], "Lives_in"),
    ]
    expected_doc = Document("_", expected_tokens, expected_sentences,
                            expected_paragraphs, expected_entities,
                            expected_relations)

    actual_doc = EntitiesCollapser({"Geographical"}).transform(self.doc)
    self.assertEqual(expected_doc, actual_doc)
def test_3_entity_paragraphs(self):
    """Three entities spread over two paragraphs produce all ordered pairs."""
    sentences = [Sentence(0, 5), Sentence(5, 10)]
    paragraphs = [Paragraph(0, 1), Paragraph(1, 2)]
    entities = [
        Entity('_', 0, 1, '1'),
        Entity('_', 1, 2, '1'),
        Entity('_', 5, 6, '2'),
    ]
    doc = Document('test', [], sentences, paragraphs, entities)

    expected = [
        (Entity('_', 0, 1, '1'), Entity('_', 1, 2, '1'), None),
        (Entity('_', 0, 1, '1'), Entity('_', 5, 6, '2'), None),
        (Entity('_', 1, 2, '1'), Entity('_', 5, 6, '2'), None),
    ]
    self.assertEqual(expected, get_samples(doc, 3, False))
def setUp(self):
    """Shared fixture: a two-sentence document with nested entities
    and two 'Lives_in' relations."""
    tokens = [
        "Recurrence", "of", "Pelecypod-associated", "cholera", "in",
        "Sardinia", ".", "From", "Oct.", "30", "to", "Nov.", "7", ",",
        "1979", ",", "10", "people", "in", "the", "Sardinian", "province",
        "of", "Cagliari", "had", "onset", "of", "bacteriologically",
        "confirmed", "cholera", "."
    ]
    sentences = [Sentence(0, 7), Sentence(7, 31)]
    entities = [
        Entity("T1", 2, 3, "Habitat"),
        Entity("T2", 2, 4, "Bacteria"),
        Entity("T3", 3, 4, "Bacteria"),
        Entity("T4", 5, 6, "Geographical"),
        Entity("T5", 17, 18, "Habitat"),
        Entity("T6", 17, 24, "Habitat"),
        Entity("T7", 20, 22, "Geographical"),
        Entity("T8", 23, 24, "Geographical"),
        Entity("T9", 29, 30, "Bacteria"),
    ]
    paragraphs = [Paragraph(0, 1), Paragraph(1, 2)]
    relations = [
        Relation(entities[0], entities[1], "Lives_in"),
        Relation(entities[8], entities[6], "Lives_in"),
    ]
    self.doc = Document("_", tokens, sentences, paragraphs,
                        entities, relations)
def test_eq(self):
    """Entity equality covers id, span, type, and rejects other span types."""
    base = Entity("_", 1, 2, "T")
    self.assertEqual(base, Entity("_", 1, 2, "T"))
    self.assertNotEqual(base, Entity("_", 1, 2, "P"))   # different type
    self.assertNotEqual(Entity("__", 1, 2, "T"), base)  # different id
    self.assertNotEqual(base, Entity("_", 1, 3, "T"))   # different span
    self.assertNotEqual(base, Sentence(1, 2))           # different class
def test_entities_with_nesting_collapse(self):
    """Collapsing 'Bacteria' shrinks nested spans onto the placeholder
    token and shifts every later annotation left."""
    expected_tokens = [
        "Recurrence", "of", "$Bacteria$", "in", "Sardinia", ".", "From",
        "Oct.", "30", "to", "Nov.", "7", ",", "1979", ",", "10", "people",
        "in", "the", "Sardinian", "province", "of", "Cagliari", "had",
        "onset", "of", "bacteriologically", "confirmed", "$Bacteria$", "."
    ]
    expected_sentences = [Sentence(0, 6), Sentence(6, 30)]
    expected_entities = [
        Entity("T1", 2, 3, "Habitat"),
        Entity("T2", 2, 3, "Bacteria"),
        Entity("T3", 2, 3, "Bacteria"),
        Entity("T4", 4, 5, "Geographical"),
        Entity("T5", 16, 17, "Habitat"),
        Entity("T6", 16, 23, "Habitat"),
        Entity("T7", 19, 21, "Geographical"),
        Entity("T8", 22, 23, "Geographical"),
        Entity("T9", 28, 29, "Bacteria"),
    ]
    expected_paragraphs = [Paragraph(0, 1), Paragraph(1, 2)]
    expected_relations = [
        Relation(expected_entities[0], expected_entities[1], "Lives_in"),
        Relation(expected_entities[8], expected_entities[6], "Lives_in"),
    ]
    expected_doc = Document("_", expected_tokens, expected_sentences,
                            expected_paragraphs, expected_entities,
                            expected_relations)

    actual_doc = EntitiesCollapser({"Bacteria"}).transform(self.doc)
    self.assertEqual(expected_doc, actual_doc)
def test_leq(self):
    """Entity ordering: span first, then type, then id; incomparable
    with plain token spans."""
    self.assertTrue(Entity("_", 2, 3, "A") < Entity("_", 2, 4, "A"))
    self.assertTrue(Entity("_", 2, 3, "A") < Entity("_", 2, 3, "B"))
    self.assertTrue(Entity("1", 2, 3, "B") < Entity("11", 2, 3, "B"))
    self.assertTrue(Entity("1", 0, 1, "B") < Entity("1", 2, 3, "B"))
    self.assertRaises(Exception,
                      lambda: Entity("_", 0, 1, "A") < TokenSpan(2, 3))
def test_ne_features(self):
    """NE extras yield one embedded feature; tokens covered by the same
    NE label get equal feature values."""
    ents = [Entity("_", 4, 5, "PER"), Entity("_", 6, 7, "PER")]
    doc = Document('',
                   ['Planning', 'of', 'work', 'of', 'Elon', "by", "Elon"],
                   [Sentence(0, 7)], [],
                   extras={'ne': SortedSpansSet(ents)})
    fe, meta = ne_fe_factory([doc], {"ne_emb_size": 10})

    features = fe.extract_features_from_doc(doc, 3, 7)['ne']
    self.assertEqual(len(meta.get_embedded_features()), 1)
    self.assertEqual(len(features), 4)
    self.assertEqual(features[0], features[2])     # O == O
    self.assertEqual(features[1], features[3])     # I-PER == I-PER
    self.assertNotEqual(features[0], features[1])  # O != I-PER
def test_collapsement_of_same_spans(self):
    """Entities sharing one span collapse together; the placeholder takes
    the type of the first collapsed entity on that span."""
    tokens = ["Elon", "Musk", "is", "CEO", "of", "Tesla", "."]
    entities = [
        Entity("_", 0, 2, "ELON"),
        Entity("_", 0, 2, "MUSK"),
        Entity("_", 5, 6, "COMP"),
        Entity("_", 5, 6, "ORG"),
    ]
    input_doc = Document("_", tokens, [Sentence(0, 7)], [], entities)

    expected_entities = [
        Entity("_", 0, 1, "ELON"),
        Entity("_", 0, 1, "MUSK"),
        Entity("_", 4, 5, "COMP"),
        Entity("_", 4, 5, "ORG"),
    ]
    expected_doc = Document("_",
                            ["$ELON$", "is", "CEO", "of", "$COMP$", "."],
                            [Sentence(0, 6)], [], expected_entities)

    actual_doc = EntitiesCollapser({"ELON", "COMP"}).transform(input_doc)
    self.assertEqual(expected_doc, actual_doc)
def test_multi_sentence(self):
    """A single entity spanning several sentences merges all of them."""
    original = [Sentence(0, 2), Sentence(2, 3), Sentence(3, 5),
                Sentence(5, 8), Sentence(8, 10), Sentence(10, 15)]
    ents = [Entity('', 2, 9, 'test')]

    expected = [Sentence(0, 2), Sentence(2, 10), Sentence(10, 15)]
    self.assertEqual(expected, adjust_sentences(original, ents))
def test_one_entity(self):
    """One entity crossing a sentence border merges the two sentences."""
    original = [Sentence(0, 2), Sentence(2, 3), Sentence(3, 5)]
    ents = [Entity('', 2, 4, 'test')]

    expected = [Sentence(0, 2), Sentence(2, 5)]
    self.assertEqual(expected, adjust_sentences(original, ents))
def decode_labels(self, sent: Sentence, sent_labels: List[str]) -> List[Entity]:
    """Turn per-token labels back into entities with document-absolute
    token offsets (the decoder yields sentence-relative spans)."""
    offset = sent.start_token
    entities = []
    for start, end, ent_type in self.decoder.decode(sent_labels):
        entities.append(
            Entity("generated", start + offset, end + offset, ent_type))
    return entities
def _get_token_distance(self, doc: Document, e1: Entity, e2: Entity):
    """Token-distance feature for the classifier, or an empty dict when
    no converter is configured for it."""
    key = 'entities_token_distance_in_classifier'
    if key in self.classifier_converters:
        distance = e1.token_distance_to(e2)
        return {key: self.classifier_converters[key][distance]}
    return {}
def test_io_strategy_decoding(self):
    """IO decoding: runs of identical I- labels become single entities,
    a type change or 'O' closes the current run."""
    strategy = IOLabellingStrategy()
    sent = Sentence(110, 120)
    labels = [
        "I-T1", "I-T1", "I-T2", "O", "I-T3",
        "I-T3", "I-T3", "I-T3", "O", "I-T1",
    ]

    expected = [
        Entity("generated", 110, 112, "T1"),
        Entity("generated", 112, 113, "T2"),
        Entity("generated", 114, 118, "T3"),
        Entity("generated", 119, 120, "T1"),
    ]
    self.assertEqual(expected, strategy.decode_labels(sent, labels))
def _map_ne(doc: Document, ne: Entity):
    """Type of the gold entity whose span exactly matches ``ne``,
    or None when no contained entity coincides with it."""
    for candidate in doc.entities.contained_in(ne):
        if ne.coincides(candidate):
            return candidate.type
    return None
def test_2_entity(self):
    """Two entities in one sentence yield exactly one unlabeled pair."""
    sentences = [Sentence(0, 10)]
    paragraphs = [Paragraph(0, 1)]
    entities = [Entity('_', 0, 1, '1'), Entity('_', 1, 2, '1')]
    doc = Document('test', [], sentences, paragraphs, entities)

    expected = [(Entity('_', 0, 1, '1'), Entity('_', 1, 2, '1'), None)]
    self.assertEqual(expected, get_samples(doc, 3, False))
def test_without_labels(self):
    """Extracting from a doc stripped of entities produces word features
    but no 'labels' key."""
    ents = [Entity("_", 4, 5, "PER"), Entity("_", 6, 7, "PER")]
    doc = Document('',
                   ['Planning', 'of', 'work', 'of', 'Elon', "by", "Elon"],
                   [Sentence(0, 7)], [], ents)
    ner_fe, token_meta = ner_fe_factory([doc], {"internal_emb_size": 10})

    doc = doc.without_entities()
    features, = ner_fe.extract_features_from_doc(doc)

    words = features['words_0']
    self.assertEqual(features['seq_len'], 7)
    self.assertEqual(len(words), 7)
    self.assertNotEqual(words[0], words[1])  # Planning != of
    self.assertEqual(words[1], words[3])     # of == of
    self.assertRaises(KeyError, lambda: features['labels'])
def _create_entities_from(self, span2position: dict, fre_objects: list) -> List[Entity]:
    """Build Entity objects from FactRuEval-style objects.

    Each object's spans are resolved through ``span2position`` and the
    entity covers the range from the earliest to the latest span.
    """
    entities = []
    for obj in fre_objects:
        positions = sorted((span2position[span] for span in obj["spans"]),
                           key=lambda pos: pos['start'])
        entities.append(Entity(obj["id"],
                               positions[0]["start"],
                               positions[-1]["end"],
                               self.convert_locorg(obj["type"])))
    return entities
def setUp(self) -> None:
    """Fixture: one sentence with gold entities and 'ne' extras that
    overlap the gold spans."""
    tokens = ['Planning', 'of', 'work', 'of', 'Elon', "by", "Elon",
              "in", "LA", "in", "USA", "."]
    sents = [Sentence(0, 12)]
    ents = [Entity("_", 4, 5, "PER"), Entity("_", 6, 7, "PER"),
            Entity("_", 8, 9, "ORG"), Entity("_", 10, 11, "ORG")]
    nes = SortedSpansSet([
        Entity("gen", 0, 1, "STUFF"),
        Entity("gen", 4, 5, "PERORG"),
        Entity("gen", 6, 7, "PERORG"),
        Entity("gen", 8, 9, "PERORG"),
        Entity("gen", 10, 11, "PERORG"),
    ])
    self.doc = Document('', tokens, sents, [], ents, extras={"ne": nes})
def setUp(self) -> None:
    """Fixture: two sentences in one paragraph, five entities, and two
    relations kept separately in ``self.relations``."""
    tokens = [
        "I", "will", "do", "my", "homework", "today", ".",
        "It", "is", "very", "hard", "but", "i", "don't", "care", ".",
    ]
    sentences = [Sentence(0, 7), Sentence(7, 16)]
    paragraphs = [Paragraph(0, 2)]
    entities = [
        Entity("_", 0, 1, "t1"),
        Entity("_", 3, 5, "t2"),
        Entity("_", 7, 8, "t1"),
        Entity("_", 9, 11, "t2"),
        Entity("_", 10, 11, "t4"),
    ]
    self.doc = Document("_", tokens, sentences, paragraphs, entities)
    self.relations = {
        Relation(entities[2], entities[3], "t1"),
        Relation(entities[3], entities[4], "t2"),
    }
def test_1_entity(self):
    """A single entity cannot form a pair, so no noun samples result."""
    sentences = [Sentence(0, 10)]
    paragraphs = [Paragraph(0, 1)]
    doc = Document('test', [], sentences, paragraphs,
                   [Entity('_', 0, 1, 'noun')])

    self.assertEqual([], get_noun_samples(doc, 3, False))
def test_unify_types(self):
    """Entities judged similar receive one unified type across the doc."""
    expected = [
        Entity("_", 0, 2, "League"),
        Entity("_", 4, 5, "Location1"),
        Entity("_", 6, 7, "Location1"),
        Entity("_", 8, 10, "League"),
        Entity("_", 11, 12, "League"),
        Entity("_", 17, 19, "Person"),
        Entity("_", 20, 21, "Organization"),
        Entity("_", 22, 23, "Person"),
    ]
    actual = unify_types_of_similar_entities(self.doc, self.doc.entities)
    self.assertListEqual(expected, actual)
def _get_relation_features(doc, e1: Entity, e2: Entity, converters, name_postfix):
    """Collect categorical features describing the (e1, e2) pair.

    Only features with a registered converter in ``converters`` are
    emitted; each raw value is mapped through its converter.  The raw
    value is computed lazily, only when its converter exists.
    """
    features = {}

    def add_feature(prefix, compute_value):
        # One feature: name pattern "<prefix>_in_<postfix>", guarded by
        # the presence of a converter.
        name = "{}_in_{}".format(prefix, name_postfix)
        converter = converters.get(name, None)
        if converter is not None:
            features[name] = converter[compute_value()]

    add_feature("rel_args", lambda: (e1.type, e2.type))
    add_feature("entities_token_distance",
                lambda: e1.token_distance_to(e2))
    # NOTE(review): the "log" feature feeds the same raw distance as the
    # plain one — presumably its converter applies the log binning; confirm.
    add_feature("entities_token_log_distance",
                lambda: e1.token_distance_to(e2))
    add_feature("entities_sent_distance",
                lambda: get_sentence_distance_between_entities(doc, e1, e2))
    add_feature("rel_dir",
                lambda: RelExtFeatureExtractor._get_direction_feature(e1, e2))

    # Per-argument entity types use a distinct "<...>_<postfix>_<num>" pattern.
    for ent_num, ent in enumerate((e1, e2)):
        name = "entities_types_in_{}_{}".format(name_postfix, ent_num)
        converter = converters.get(name, None)
        if converter is not None:
            features[name] = converter[ent.type]

    return features
def test_with_labels(self):
    """With include_labels=True the sample carries BIO labels aligned
    with the word features."""
    ents = [Entity("_", 4, 5, "PER"), Entity("_", 6, 7, "PER")]
    doc = Document('',
                   ['Planning', 'of', 'work', 'of', 'Elon', "by", "Elon"],
                   [Sentence(0, 7)], [], ents)
    ner_fe, token_meta = ner_fe_factory([doc], {"internal_emb_size": 10})

    # One sentence in the doc -> exactly one sample.
    features, = ner_fe.extract_features_from_doc(doc, include_labels=True)

    words = features['words_0']
    self.assertEqual(features['seq_len'], 7)
    self.assertEqual(len(words), 7)
    self.assertNotEqual(words[0], words[1])  # Planning != of
    self.assertEqual(words[1], words[3])     # of == of

    labels = features["labels"]
    self.assertEqual(len(labels), 7)
    self.assertEqual(labels[4], labels[6])     # B-PER == B-PER
    self.assertNotEqual(labels[3], labels[4])  # O != B-PER
    self.assertEqual(labels[0], labels[1])     # O == O