def test_collapse_of_same_spans(self):
    tokens = ["Elon", "Musk", "is", "CEO", "of", "Tesla", "."]
    sentences = [Sentence(0, 7)]
    entities = [
        Entity("_", 0, 2, "ELON"),
        Entity("_", 0, 2, "MUSK"),
        Entity("_", 5, 6, "COMP"),
        Entity("_", 5, 6, "ORG")
    ]
    input_doc = Document("_", tokens, sentences, [], entities)

    expected_tokens = ["$ELON$", "is", "CEO", "of", "$COMP$", "."]
    expected_sentences = [Sentence(0, 6)]
    expected_entities = [
        Entity("_", 0, 1, "ELON"),
        Entity("_", 0, 1, "MUSK"),
        Entity("_", 4, 5, "COMP"),
        Entity("_", 4, 5, "ORG")
    ]
    expected_doc = Document("_", expected_tokens, expected_sentences, [], expected_entities)

    actual_doc = EntitiesCollapser({"ELON", "COMP"}).transform(input_doc)
    self.assertEqual(expected_doc, actual_doc)
def setUp(self):
    tokens = [
        "Recurrence", "of", "Pelecypod-associated", "cholera", "in", "Sardinia", ".",
        "From", "Oct.", "30", "to", "Nov.", "7", ",", "1979", ",", "10", "people",
        "in", "the", "Sardinian", "province", "of", "Cagliari", "had", "onset", "of",
        "bacteriologically", "confirmed", "cholera", "."
    ]
    sentences = [Sentence(0, 7), Sentence(7, 31)]
    entities = [
        Entity("T1", 2, 3, "Habitat"),
        Entity("T2", 2, 4, "Bacteria"),
        Entity("T3", 3, 4, "Bacteria"),
        Entity("T4", 5, 6, "Geographical"),
        Entity("T5", 17, 18, "Habitat"),
        Entity("T6", 17, 24, "Habitat"),
        Entity("T7", 20, 22, "Geographical"),
        Entity("T8", 23, 24, "Geographical"),
        Entity("T9", 29, 30, "Bacteria")
    ]
    paragraphs = [Paragraph(0, 1), Paragraph(1, 2)]
    relations = [
        Relation(entities[0], entities[1], "Lives_in"),
        Relation(entities[8], entities[6], "Lives_in")
    ]
    self.doc = Document("_", tokens, sentences, paragraphs, entities, relations)
def setUp(self):
    sent_1_tokens = [
        "Human", "and", "tick", "spotted", "fever", "group", "Rickettsia",
        "isolates", "from", "Israel", ":", "a", "genotypic", "analysis", "."
    ]
    sent_1_head_distances = [3, -1, -2, 0, 2, 1, 1, 7, -1, -1, 4, 2, 1, 1, -11]
    self.doc_with_1_sent = Document(
        "", sent_1_tokens, [Sentence(0, len(sent_1_tokens))], [Paragraph(0, 1)],
        token_features={"dt_head_distances": sent_1_head_distances})

    sent_2_tokens = [
        "The", "precise", "mechanisms", "that", "initiate", "bacterial",
        "uptake", "have", "not", "yet", "been", "elucidated", "."
    ]
    sent_2_head_distances = [2, 1, 9, 1, -2, 1, -2, 4, 3, 2, 1, 0, -1]
    self.doc_with_2_sent = Document(
        "", sent_1_tokens + sent_2_tokens,
        [
            Sentence(0, len(sent_1_tokens)),
            Sentence(len(sent_1_tokens), len(sent_1_tokens) + len(sent_2_tokens))
        ],
        [Paragraph(0, 2)],
        token_features={
            "dt_head_distances": sent_1_head_distances + sent_2_head_distances
        })
def test_3_entity_paragraphs(self):
    sentences = [Sentence(0, 5), Sentence(5, 10)]
    paragraphs = [Paragraph(0, 1), Paragraph(1, 2)]
    entities = [
        Entity('_', 0, 1, '1'),
        Entity('_', 1, 2, '1'),
        Entity('_', 5, 6, '2'),
    ]
    doc = Document('test', [], sentences, paragraphs, entities)

    max_distance = 3
    actual_samples = get_samples(doc, max_distance, False)
    expected_samples = [
        (Entity('_', 0, 1, '1'), Entity('_', 1, 2, '1'), None),
        (Entity('_', 0, 1, '1'), Entity('_', 5, 6, '2'), None),
        (Entity('_', 1, 2, '1'), Entity('_', 5, 6, '2'), None),
    ]
    self.assertEqual(expected_samples, actual_samples)
def _get_doc_from_raw_text(self, raw_text, doc_name) -> Document:
    tokens, sentences, raw_tokens = self.segmenter.segment(raw_text)
    # here we assume the whole text to be one paragraph
    paragraphs = [Paragraph(0, len(sentences))]
    return Document(splitext(doc_name)[0], tokens, sentences, paragraphs,
                    token_features={"char_spans": raw_tokens})
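# A minimal usage sketch for the method above (hedged: `reader` is a
# hypothetical instance of the enclosing class; `segment` is only assumed to
# return (tokens, sentences, raw_tokens) as used above):
#
#   doc = reader._get_doc_from_raw_text("Elon Musk is CEO of Tesla.", "tesla.txt")
#   doc.name                           # "tesla" -- splitext drops the extension
#   len(doc.paragraphs)                # 1 -- the whole text is one paragraph
#   doc.token_features["char_spans"]   # raw character spans for each token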
def _create_doc(self, doc_raw_tokens: List[List[str]], doc_idx) -> Document:
    tokens, sentences, entities, pos_tags = [], [], [], []
    sent_tokens, sent_pos_tags, sent_entities_labels = [], [], []
    sent_start = 0

    for raw_token in doc_raw_tokens:
        # an empty row marks a sentence boundary
        if not raw_token:
            if sent_tokens:
                tokens.extend(sent_tokens)
                pos_tags.extend(sent_pos_tags)
                sentences.append(Sentence(sent_start, sent_start + len(sent_tokens)))
                sent_start += len(sent_tokens)
                entities.extend(self._decode_strategy.decode_labels(sentences[-1], sent_entities_labels))
                sent_tokens, sent_pos_tags, sent_entities_labels = [], [], []
            continue

        token, pos_tag, _, ent_label = raw_token
        sent_tokens.append(token)
        sent_entities_labels.append(ent_label)
        sent_pos_tags.append(pos_tag)

    # flush the last sentence if the input does not end with an empty row
    if sent_tokens:
        tokens.extend(sent_tokens)
        pos_tags.extend(sent_pos_tags)
        sentences.append(Sentence(sent_start, sent_start + len(sent_tokens)))
        entities.extend(self._decode_strategy.decode_labels(sentences[-1], sent_entities_labels))

    return Document(
        str(doc_idx), tokens, sentences, [Paragraph(0, len(sentences))], entities,
        token_features={"pos": pos_tags})
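# A sketch of the input shape `_create_doc` expects (hypothetical rows; the
# BIO labels below assume the configured `_decode_strategy` understands that
# scheme). Each row is (token, pos_tag, _, entity_label); an empty row marks
# a sentence boundary:
example_doc_raw_tokens = [
    ["Elon", "NNP", "_", "B-PER"],
    ["Musk", "NNP", "_", "I-PER"],
    ["smiles", "VBZ", "_", "O"],
    [],  # sentence break
    ["Tesla", "NNP", "_", "B-ORG"],
    ["wins", "VBZ", "_", "O"],
]
# reader._create_doc(example_doc_raw_tokens, 0) would then yield a two-sentence
# Document with a "pos" token feature and the decoded entities.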
def read(self, path: str) -> List[Document]:
    all_files = listdir(path)
    file_names = sorted(set(splitext(f)[0] for f in all_files))
    docs = []

    for f in file_names:
        if f"{f}.ann" not in all_files:
            warn(f"Skipping {f}.txt, no {f}.ann file found")
            continue

        with open(join(path, f"{f}.txt"), "r", encoding="utf-8") as g:
            raw_text = g.read()
        with open(join(path, f"{f}.ann"), "r", encoding="utf-8") as g:
            annotations = g.read()

        tokens, sentences, raw_tokens = self.segmenter.segment(raw_text)
        raw_entities, raw_relations = _read_brat_annotations(annotations)
        raw_entities = _expand_spans(raw_entities)
        sentences, _, entities, relations = _merge(raw_tokens, sentences, [], raw_entities, raw_relations)

        if self.collapse_intersecting:
            entities, relations = collapse_intersecting_entities(entities, relations)

        # here we assume the whole document to be one paragraph
        doc = Document(f, tokens, sentences, [Paragraph(0, len(sentences))], entities, relations)
        docs.append(doc)

    return docs
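# A sketch of the directory layout `read` expects (hypothetical names): each
# document is a <name>.txt / <name>.ann pair in BRAT standoff format, and a
# .txt without a matching .ann is skipped with a warning:
#
#   corpus/
#     doc1.txt  doc1.ann
#     doc2.txt  doc2.ann
#
#   docs = reader.read("corpus")   # one single-paragraph Document per pair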
def test_2_chains_2_pron(self):
    sentences = [Sentence(0, 10)]
    paragraphs = [Paragraph(0, 1)]
    entities = [
        Entity('_', 0, 1, 'noun'),
        Entity('_', 1, 2, 'pron'),
        Entity('_', 2, 3, 'pron'),
        Entity('_', 3, 4, 'noun'),
        Entity('_', 5, 6, 'noun'),
    ]
    rels = {
        Relation(Entity('_', 0, 1, 'noun'), Entity('_', 2, 3, 'pron'), '1'),
        Relation(Entity('_', 1, 2, 'pron'), Entity('_', 3, 4, 'noun'), '1'),
        Relation(Entity('_', 3, 4, 'noun'), Entity('_', 5, 6, 'noun'), '1'),
    }
    doc = Document('test', [], sentences, paragraphs, entities, rels)

    max_distance = 3
    actual_samples = get_pron_samples(doc, max_distance, True)
    expected_samples = [
        (Entity('_', 0, 1, 'noun'), Entity('_', 1, 2, 'pron'), None),
        (Entity('_', 1, 2, 'pron'), Entity('_', 3, 4, 'noun'), '1'),
        (Entity('_', 0, 1, 'noun'), Entity('_', 2, 3, 'pron'), '1'),
        (Entity('_', 2, 3, 'pron'), Entity('_', 3, 4, 'noun'), None),
    ]
    self.assertEqual(actual_samples, expected_samples)
def test_inner_entities_collapse(self):
    expected_tokens = [
        "Recurrence", "of", "Pelecypod-associated", "cholera", "in", "$Geographical$", ".",
        "From", "Oct.", "30", "to", "Nov.", "7", ",", "1979", ",", "10", "people",
        "in", "the", "$Geographical$", "of", "$Geographical$", "had", "onset", "of",
        "bacteriologically", "confirmed", "cholera", "."
    ]
    expected_sentences = [Sentence(0, 7), Sentence(7, 30)]
    expected_entities = [
        Entity("T1", 2, 3, "Habitat"),
        Entity("T2", 2, 4, "Bacteria"),
        Entity("T3", 3, 4, "Bacteria"),
        Entity("T4", 5, 6, "Geographical"),
        Entity("T5", 17, 18, "Habitat"),
        Entity("T6", 17, 23, "Habitat"),
        Entity("T7", 20, 21, "Geographical"),
        Entity("T8", 22, 23, "Geographical"),
        Entity("T9", 28, 29, "Bacteria")
    ]
    expected_paragraphs = [Paragraph(0, 1), Paragraph(1, 2)]
    expected_relations = [
        Relation(expected_entities[0], expected_entities[1], "Lives_in"),
        Relation(expected_entities[8], expected_entities[6], "Lives_in")
    ]
    expected_doc = Document("_", expected_tokens, expected_sentences, expected_paragraphs,
                            expected_entities, expected_relations)

    actual_doc = EntitiesCollapser({"Geographical"}).transform(self.doc)
    self.assertEqual(expected_doc, actual_doc)
def test_vectorized_features(self):
    doc = Document(
        '', ['Planning', 'of', 'work', 'by', 'Elon'], [], [],
        token_features={
            "vectors": [
                np.array([1, 2]),
                np.array([1, 1]),
                np.array([0, 0]),
                np.array([0, 1]),
                np.array([9, 10])
            ]
        })
    token_fe, token_meta = generate_token_feature_extractor([doc], {"vectors_keys": ["vectors"]})
    features = token_fe.extract_features_from_doc(doc, 1, 4)
    vectors = features['vectors']

    self.assertEqual(token_meta.get_precomputed_features(), [])
    self.assertEqual(len(token_meta.get_embedded_features()), 0)
    self.assertEqual(token_meta.get_one_hot_features(), [])
    self.assertEqual(token_meta.get_vectorized_features(), [{"name": "vectors", "size": 2}])
    self.assertEqual(token_meta.get_char_features(), [])

    self.assertEqual(features['seq_len'], 3)
    self.assertEqual(len(vectors), 3)
    self.assertEqual(doc.token_features["vectors"][1:4], vectors)
def _get_sentence_positions_to_span(doc: Document, start_token: int, end_token: int, wrt_span):
    _, _, sent_idx = wrt_span
    # signed distance (in sentences) from each token's sentence to the span's sentence
    return [sent_idx - doc.get_token_sent_idx(idx) for idx in range(start_token, end_token)]
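# A worked example of the arithmetic above. Suppose tokens 0-4 form sentence 0
# and tokens 5-9 form sentence 1, and wrt_span lies in sentence 1 (its third
# component, sent_idx, is 1). Tokens from sentence 0 then get distance
# 1 - 0 = 1 and tokens from sentence 1 get 1 - 1 = 0:
#
#   _get_sentence_positions_to_span(doc, 3, 7, (5, 6, 1))   # -> [1, 1, 0, 0]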
def test_entities_with_nesting_collapse(self):
    expected_tokens = [
        "Recurrence", "of", "$Bacteria$", "in", "Sardinia", ".",
        "From", "Oct.", "30", "to", "Nov.", "7", ",", "1979", ",", "10", "people",
        "in", "the", "Sardinian", "province", "of", "Cagliari", "had", "onset", "of",
        "bacteriologically", "confirmed", "$Bacteria$", "."
    ]
    expected_sentences = [Sentence(0, 6), Sentence(6, 30)]
    expected_entities = [
        Entity("T1", 2, 3, "Habitat"),
        Entity("T2", 2, 3, "Bacteria"),
        Entity("T3", 2, 3, "Bacteria"),
        Entity("T4", 4, 5, "Geographical"),
        Entity("T5", 16, 17, "Habitat"),
        Entity("T6", 16, 23, "Habitat"),
        Entity("T7", 19, 21, "Geographical"),
        Entity("T8", 22, 23, "Geographical"),
        Entity("T9", 28, 29, "Bacteria")
    ]
    expected_paragraphs = [Paragraph(0, 1), Paragraph(1, 2)]
    expected_relations = [
        Relation(expected_entities[0], expected_entities[1], "Lives_in"),
        Relation(expected_entities[8], expected_entities[6], "Lives_in")
    ]
    expected_doc = Document("_", expected_tokens, expected_sentences, expected_paragraphs,
                            expected_entities, expected_relations)

    actual_doc = EntitiesCollapser({"Bacteria"}).transform(self.doc)
    self.assertEqual(expected_doc, actual_doc)
def convert_from_digger_to_derek(diggerdoc: DiggerDoc, doc_name: str) -> Document:
    tokens = []
    token_features = {
        "pos": [],
        "dt_labels": [],
        "dt_head_distances": [],
        "lemmas": [],
        "feats": []
    }

    for i, token in enumerate(diggerdoc.tokens):
        tokens.append(token.doc_text)
        token_features["pos"].append(token.pos.upos)
        token_features["dt_labels"].append(token.deprel)
        # heads are stored as relative distances; the root (head_index == -1) maps to 0
        token_features["dt_head_distances"].append(
            token.head_index - i if token.head_index != -1 else 0)
        token_features["lemmas"].append(token.lemma)
        token_features["feats"].append(token.pos.feats)

    sentences = list(
        Sentence(sent.start, sent.end) for sent in diggerdoc.sentences_boundaries)
    # here we assume all document sentences to be in one paragraph
    paragraphs = [Paragraph(0, len(sentences))]

    return Document(doc_name, tokens, sentences, paragraphs, token_features=token_features)
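# A worked example of the head encoding above (hypothetical parse): a token at
# index i with absolute head index h is stored as the relative distance h - i,
# and the root (head_index == -1) is stored as 0.
#
#   tokens:              "Elon"   "Musk"   "smiles"
#   index i:              0        1        2
#   head_index h:         2        2       -1
#   dt_head_distances:    2-0=2    2-1=1    0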
def test_borders_extraction_3(self):
    tokens = ["bacteria", "spotted", ".", ".", "it's", "."]
    sentences = [Sentence(0, 3), Sentence(3, 4), Sentence(4, 6)]
    # deliberately inconsistent: three sentences, but the paragraph covers only two
    broken_doc = Document("", tokens, sentences, [Paragraph(0, 2)])
    borders = ["start", "in", "end", "start", "start", "end"]
    self.assertEqual(get_sentence_borders_feature(broken_doc), borders)
def _build_document(self, pmid, abstract, raw_entities, raw_relations) -> Document:
    tokens, sentences, raw_tokens = self.segmenter.segment(abstract['text'])
    raw_paragraphs = abstract['paragraphs']
    sentences, paragraphs, entities, relations = _merge(
        raw_tokens, sentences, raw_paragraphs, raw_entities, raw_relations)
    return Document(pmid, tokens, sentences, paragraphs, entities, relations)
def _extract_features(self, doc: Document, ent: Entity, include_labels=False):
    ent_sent_idx = doc.get_entity_sent_idx(ent)
    start_token = doc.sentences[ent_sent_idx].start_token
    end_token = doc.sentences[ent_sent_idx].end_token

    features = {
        **self.token_feature_extractor.extract_features_from_doc(doc, start_token, end_token),
        **self._get_attention_features(doc, ent, start_token, end_token),
        **self._get_classifier_features(doc, ent)
    }

    if include_labels:
        label = _map_ne(doc, ent)
        features["labels"] = self.label_converter[label]

        # the mask allows only the labels this entity type can be mapped to
        ent_mask = [0] * len(self.label_converter)
        for key in self.types_mapping[ent.type]:
            ent_mask[self.label_converter[key]] = 1
        features["labels_mask"] = ent_mask

    # entity token indices relative to its sentence
    features["indices"] = [[ent.start_token - start_token, ent.end_token - start_token]]
    return features
def test_gazetteer_features(self):
    doc = Document(
        '', ['Ваня', 'едет', 'в', 'Париж', 'из', 'Москвы', 'в', 'москву'], [], [],
        token_features={
            'lemmas': ['Ваня', 'ехать', 'в', 'Париж', 'из', 'Москва', 'в', 'москва']
        })
    token_fe, token_meta = generate_token_feature_extractor(
        [doc], {
            "gazetteers": [
                {"path": "tests/data/feature_extractor/gazetteer.txt", "lower": True, "lemmatize": True},
                {"path": "tests/data/feature_extractor/gazetteer.txt", "lower": False, "lemmatize": True},
                {"path": "tests/data/feature_extractor/gazetteer.txt", "lower": True, "lemmatize": False},
                {"path": "tests/data/feature_extractor/gazetteer.txt", "lower": False, "lemmatize": False}
            ]
        })
    features = token_fe.extract_features_from_doc(doc, 0, 8)

    self.assertEqual(features['gazetteer_0'], [1, 1, 1, 2, 1, 2, 1, 2])  # lower + lemmatize
    self.assertEqual(features['gazetteer_1'], [1, 1, 1, 2, 1, 2, 1, 1])  # lemmatize only
    self.assertEqual(features['gazetteer_2'], [1, 1, 1, 2, 1, 1, 1, 1])  # lower only
    self.assertEqual(features['gazetteer_3'], [1, 1, 1, 2, 1, 1, 1, 1])  # exact match
def test_gazetteer_features_assert(self):
    doc = Document('', ['Planning', 'of', 'work', 'by', 'Elon', ""], [], [],
                   token_features={})
    self.assertRaises(Exception, generate_token_feature_extractor, [doc],
                      {"gazetteers": [{"path": ""}]})
def test_no_features(self):
    doc = Document('', ['Go', 'to', 'shop'], [], [])
    tp_fe, tp_meta = generate_token_position_feature_extractor({})
    features = tp_fe.extract_features_from_doc(doc, 0, 3, (2, 3, 0))

    self.assertEqual(tp_meta.get_embedded_features(), [])
    self.assertEqual(tp_meta.get_one_hot_features(), [])
    self.assertEqual(tp_meta.get_vectorized_features(), [])
    self.assertEqual(features, {})
def test_collapsing_with_ne(self):
    input_doc = self.doc.with_additional_extras({"ne": self.doc.entities})
    input_doc = input_doc.without_relations().without_entities()
    entities = SortedSpansSet([
        Entity("_", 0, 1, "left"),
        Entity("_", 2, 4, "same"),
        Entity("_", 3, 4, "include"),
        Entity("_", 5, 6, "same"),
        Entity("_", 15, 19, "intersect"),
        Entity("_", 17, 20, "include"),
        Entity("_", 22, 25, "intersect")
    ])
    input_doc = input_doc.with_entities(entities)

    expected_tokens = [
        "Recurrence", "of", "$Bacteria$", "in", "$Geographical$", ".",
        "From", "Oct.", "30", "to", "Nov.", "7", ",", "1979", ",", "10",
        "$Habitat$", "had", "onset", "of", "bacteriologically", "confirmed",
        "$Bacteria$", "."
    ]
    expected_sentences = [Sentence(0, 6), Sentence(6, 24)]
    expected_paragraphs = [Paragraph(0, 1), Paragraph(1, 2)]
    expected_nes = SortedSpansSet([
        Entity("T1", 2, 3, "Habitat"),
        Entity("T2", 2, 3, "Bacteria"),
        Entity("T3", 2, 3, "Bacteria"),
        Entity("T4", 4, 5, "Geographical"),
        Entity("T5", 16, 17, "Habitat"),
        Entity("T6", 16, 17, "Habitat"),
        Entity("T7", 16, 17, "Geographical"),
        Entity("T8", 16, 17, "Geographical"),
        Entity("T9", 22, 23, "Bacteria")
    ])
    expected_entities = SortedSpansSet([
        Entity("_", 0, 1, "left"),
        Entity("_", 2, 3, "same"),
        Entity("_", 2, 3, "include"),
        Entity("_", 4, 5, "same"),
        Entity("_", 14, 17, "intersect"),
        Entity("_", 16, 17, "include"),
        Entity("_", 16, 18, "intersect")
    ])
    expected_doc = Document("_", expected_tokens, expected_sentences, expected_paragraphs,
                            expected_entities, extras={"ne": expected_nes})

    actual_doc = EntitiesCollapser({"Habitat", "Bacteria", "Geographical"}, True).transform(input_doc)
    self.assertEqual(expected_doc, actual_doc)
def test_without_labels(self):
    ents = [Entity("_", 4, 5, "PER"), Entity("_", 6, 7, "PER")]
    doc = Document('', ['Planning', 'of', 'work', 'of', 'Elon', "by", "Elon"],
                   [Sentence(0, 7)], [], ents)
    ner_fe, token_meta = ner_fe_factory([doc], {"internal_emb_size": 10})

    doc = doc.without_entities()
    features, = ner_fe.extract_features_from_doc(doc)
    words = features['words_0']

    self.assertEqual(features['seq_len'], 7)
    self.assertEqual(len(words), 7)
    self.assertNotEqual(words[0], words[1])  # Planning of
    self.assertEqual(words[1], words[3])  # of of
    self.assertRaises(KeyError, lambda: features['labels'])
def process_doc(self, doc: Document) -> Document:
    new_entities = []
    for ent in doc.entities:
        # drop entities of filtered types; retype the rest if a replacement is configured
        if ent.type in self.__filter:
            continue
        new_entities.append(ent.with_type(self.__replacements.get(ent.type, ent.type)))

    return doc.without_relations().without_entities().with_entities(new_entities)
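# A minimal usage sketch (hedged: `EntityTypesTransformer` and its constructor
# arguments are hypothetical names for the enclosing class; only `process_doc`
# is shown above):
#
#   transformer = EntityTypesTransformer(
#       filter={"Habitat"},                    # entity types to drop
#       replacements={"Geographical": "Geo"})  # entity types to rename
#   new_doc = transformer.process_doc(doc)
#   # "Habitat" entities are gone, "Geographical" became "Geo", and the
#   # returned document carries no relations (process_doc strips them).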
def _read_document(self, path: str, name: str):
    with open(join(path, name + ".txt"), encoding="utf-8") as f:
        raw_text = f.read()

    tokens, sentences, raw_tokens = self.segmenter.segment(raw_text)
    raw_entities, raw_paragraphs, raw_relations = BioNLPDataReader._read_annotations(path, name)
    sentences, paragraphs, entities, relations = _merge(
        raw_tokens, sentences, raw_paragraphs, raw_entities, raw_relations,
        symmetric_types=BioNLPDataReader.SYMMETRIC_RELATION_TYPES)

    return Document(name, tokens, sentences, paragraphs, entities, relations)
def setUp(self) -> None: tokens = [ "I", "will", "do", "my", "homework", "today", ".", "It", "is", "very", "hard", "but", "i", "don't", "care", "." ] sentences = [Sentence(0, 7), Sentence(7, 16)] paragraphs = [Paragraph(0, 2)] entities = [ Entity("_", 0, 1, "t1"), Entity("_", 3, 5, "t2"), Entity("_", 7, 8, "t1"), Entity("_", 9, 11, "t2"), Entity("_", 10, 11, "t4") ] self.doc = Document("_", tokens, sentences, paragraphs, entities) self.relations = { Relation(entities[2], entities[3], "t1"), Relation(entities[3], entities[4], "t2") }
def setUp(self) -> None:
    tokens = ['Planning', 'of', 'work', 'of', 'Elon', "by", "Elon",
              "in", "LA", "in", "USA", "."]
    sents = [Sentence(0, 12)]
    ents = [
        Entity("_", 4, 5, "PER"),
        Entity("_", 6, 7, "PER"),
        Entity("_", 8, 9, "ORG"),
        Entity("_", 10, 11, "ORG")
    ]
    nes = SortedSpansSet([
        Entity("gen", 0, 1, "STUFF"),
        Entity("gen", 4, 5, "PERORG"),
        Entity("gen", 6, 7, "PERORG"),
        Entity("gen", 8, 9, "PERORG"),
        Entity("gen", 10, 11, "PERORG")
    ])
    self.doc = Document('', tokens, sents, [], ents, extras={"ne": nes})
def test_1_entity(self):
    sentences = [Sentence(0, 10)]
    paragraphs = [Paragraph(0, 1)]
    entities = [Entity('_', 0, 1, '1')]
    doc = Document('test', [], sentences, paragraphs, entities)

    max_distance = 3
    actual_samples = get_samples(doc, max_distance, False)
    expected_samples = []
    self.assertEqual(expected_samples, actual_samples)
def test_no_entities(self):
    sentences = [Sentence(0, 10)]
    paragraphs = [Paragraph(0, 1)]
    entities = []
    doc = Document('test', [], sentences, paragraphs, entities)

    max_distance = 3
    actual_samples = get_noun_samples(doc, max_distance, False)
    expected_samples = []
    self.assertEqual(actual_samples, expected_samples)
def _conllu_text(text) -> str:
    tokens, sentences, _ = segmentor.segment(text)
    token_features = processor.get_token_features(tokens, sentences)
    # here we assume the whole document to be one paragraph
    paragraphs = [Paragraph(0, len(sentences))]
    doc = Document("", tokens, sentences, paragraphs, token_features=token_features)
    return writer.write_to_str(convert_from_derek_to_digger(doc))
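# A minimal usage sketch (hedged: `segmentor`, `processor` and `writer` are
# module-level collaborators assumed to be configured elsewhere in this file):
#
#   conllu = _conllu_text("Elon Musk is CEO of Tesla.")
#   print(conllu)   # CoNLL-U serialization of the single-paragraph document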
def test_no_features(self):
    doc = Document('', ['Go', 'to', 'shop'], [], [])
    token_fe, token_meta = generate_token_feature_extractor([doc], {})
    features = token_fe.extract_features_from_doc(doc, 0, 2)

    self.assertEqual(token_meta.get_precomputed_features(), [])
    self.assertEqual(token_meta.get_embedded_features(), [])
    self.assertEqual(token_meta.get_one_hot_features(), [])
    self.assertEqual(token_meta.get_vectorized_features(), [])
    self.assertEqual(token_meta.get_char_features(), [])
    self.assertEqual(features, {'seq_len': 2})
def test_ne_features(self):
    ents = [Entity("_", 4, 5, "PER"), Entity("_", 6, 7, "PER")]
    doc = Document('', ['Planning', 'of', 'work', 'of', 'Elon', "by", "Elon"],
                   [Sentence(0, 7)], [], extras={'ne': SortedSpansSet(ents)})
    fe, meta = ne_fe_factory([doc], {"ne_emb_size": 10})
    features = fe.extract_features_from_doc(doc, 3, 7)['ne']

    self.assertEqual(len(meta.get_embedded_features()), 1)
    self.assertEqual(len(features), 4)
    self.assertEqual(features[0], features[2])  # O O
    self.assertEqual(features[1], features[3])  # I-PER I-PER
    self.assertNotEqual(features[0], features[1])  # O I-PER