def setUp(self):
    sent_1_tokens = [
        "Human", "and", "tick", "spotted", "fever", "group", "Rickettsia",
        "isolates", "from", "Israel", ":", "a", "genotypic", "analysis", "."
    ]
    sent_1_head_distances = [3, -1, -2, 0, 2, 1, 1, 7, -1, -1, 4, 2, 1, 1, -11]

    self.doc_with_1_sent = Document(
        "", sent_1_tokens, [Sentence(0, len(sent_1_tokens))], [Paragraph(0, 1)],
        token_features={"dt_head_distances": sent_1_head_distances})

    sent_2_tokens = [
        "The", "precise", "mechanisms", "that", "initiate", "bacterial",
        "uptake", "have", "not", "yet", "been", "elucidated", "."
    ]
    sent_2_head_distances = [2, 1, 9, 1, -2, 1, -2, 4, 3, 2, 1, 0, -1]

    self.doc_with_2_sent = Document(
        "", sent_1_tokens + sent_2_tokens,
        [
            Sentence(0, len(sent_1_tokens)),
            Sentence(len(sent_1_tokens), len(sent_1_tokens) + len(sent_2_tokens))
        ],
        [Paragraph(0, 2)],
        token_features={
            "dt_head_distances": sent_1_head_distances + sent_2_head_distances
        })
def setUp(self):
    tokens = [
        "Recurrence", "of", "Pelecypod-associated", "cholera", "in", "Sardinia",
        ".", "From", "Oct.", "30", "to", "Nov.", "7", ",", "1979", ",", "10",
        "people", "in", "the", "Sardinian", "province", "of", "Cagliari", "had",
        "onset", "of", "bacteriologically", "confirmed", "cholera", "."
    ]
    sentences = [Sentence(0, 7), Sentence(7, 31)]
    entities = [
        Entity("T1", 2, 3, "Habitat"),
        Entity("T2", 2, 4, "Bacteria"),
        Entity("T3", 3, 4, "Bacteria"),
        Entity("T4", 5, 6, "Geographical"),
        Entity("T5", 17, 18, "Habitat"),
        Entity("T6", 17, 24, "Habitat"),
        Entity("T7", 20, 22, "Geographical"),
        Entity("T8", 23, 24, "Geographical"),
        Entity("T9", 29, 30, "Bacteria")
    ]
    paragraphs = [Paragraph(0, 1), Paragraph(1, 2)]
    relations = [
        Relation(entities[0], entities[1], "Lives_in"),
        Relation(entities[8], entities[6], "Lives_in")
    ]
    self.doc = Document("_", tokens, sentences, paragraphs, entities, relations)
def test_inner_entities_collapse(self):
    expected_tokens = [
        "Recurrence", "of", "Pelecypod-associated", "cholera", "in",
        "$Geographical$", ".", "From", "Oct.", "30", "to", "Nov.", "7", ",",
        "1979", ",", "10", "people", "in", "the", "$Geographical$", "of",
        "$Geographical$", "had", "onset", "of", "bacteriologically",
        "confirmed", "cholera", "."
    ]
    expected_sentences = [Sentence(0, 7), Sentence(7, 30)]
    expected_entities = [
        Entity("T1", 2, 3, "Habitat"),
        Entity("T2", 2, 4, "Bacteria"),
        Entity("T3", 3, 4, "Bacteria"),
        Entity("T4", 5, 6, "Geographical"),
        Entity("T5", 17, 18, "Habitat"),
        Entity("T6", 17, 23, "Habitat"),
        Entity("T7", 20, 21, "Geographical"),
        Entity("T8", 22, 23, "Geographical"),
        Entity("T9", 28, 29, "Bacteria")
    ]
    expected_paragraphs = [Paragraph(0, 1), Paragraph(1, 2)]
    expected_relations = [
        Relation(expected_entities[0], expected_entities[1], "Lives_in"),
        Relation(expected_entities[8], expected_entities[6], "Lives_in")
    ]
    expected_doc = Document("_", expected_tokens, expected_sentences,
                            expected_paragraphs, expected_entities,
                            expected_relations)

    actual_doc = EntitiesCollapser({"Geographical"}).transform(self.doc)
    self.assertEqual(expected_doc, actual_doc)
def test_3_entity_paragraphs(self):
    sentences = [
        Sentence(0, 5),
        Sentence(5, 10),
    ]
    paragraphs = [
        Paragraph(0, 1),
        Paragraph(1, 2),
    ]
    entities = [
        Entity('_', 0, 1, '1'),
        Entity('_', 1, 2, '1'),
        Entity('_', 5, 6, '2'),
    ]
    doc = Document('test', [], sentences, paragraphs, entities)
    max_distance = 3

    actual_samples = get_samples(doc, max_distance, False)
    expected_samples = [
        (Entity('_', 0, 1, '1'), Entity('_', 1, 2, '1'), None),
        (Entity('_', 0, 1, '1'), Entity('_', 5, 6, '2'), None),
        (Entity('_', 1, 2, '1'), Entity('_', 5, 6, '2'), None),
    ]
    self.assertEqual(expected_samples, actual_samples)
def test_entities_with_nesting_collapse(self):
    expected_tokens = [
        "Recurrence", "of", "$Bacteria$", "in", "Sardinia", ".", "From",
        "Oct.", "30", "to", "Nov.", "7", ",", "1979", ",", "10", "people",
        "in", "the", "Sardinian", "province", "of", "Cagliari", "had",
        "onset", "of", "bacteriologically", "confirmed", "$Bacteria$", "."
    ]
    expected_sentences = [Sentence(0, 6), Sentence(6, 30)]
    expected_entities = [
        Entity("T1", 2, 3, "Habitat"),
        Entity("T2", 2, 3, "Bacteria"),
        Entity("T3", 2, 3, "Bacteria"),
        Entity("T4", 4, 5, "Geographical"),
        Entity("T5", 16, 17, "Habitat"),
        Entity("T6", 16, 23, "Habitat"),
        Entity("T7", 19, 21, "Geographical"),
        Entity("T8", 22, 23, "Geographical"),
        Entity("T9", 28, 29, "Bacteria")
    ]
    expected_paragraphs = [Paragraph(0, 1), Paragraph(1, 2)]
    expected_relations = [
        Relation(expected_entities[0], expected_entities[1], "Lives_in"),
        Relation(expected_entities[8], expected_entities[6], "Lives_in")
    ]
    expected_doc = Document("_", expected_tokens, expected_sentences,
                            expected_paragraphs, expected_entities,
                            expected_relations)

    actual_doc = EntitiesCollapser({"Bacteria"}).transform(self.doc)
    self.assertEqual(expected_doc, actual_doc)
def test_collapsing_with_ne(self):
    input_doc = self.doc.with_additional_extras({"ne": self.doc.entities})
    input_doc = input_doc.without_relations().without_entities()
    entities = SortedSpansSet([
        Entity("_", 0, 1, "left"),
        Entity("_", 2, 4, "same"),
        Entity("_", 3, 4, "include"),
        Entity("_", 5, 6, "same"),
        Entity("_", 15, 19, "intersect"),
        Entity("_", 17, 20, "include"),
        Entity("_", 22, 25, "intersect")
    ])
    input_doc = input_doc.with_entities(entities)

    expected_tokens = [
        "Recurrence", "of", "$Bacteria$", "in", "$Geographical$", ".", "From",
        "Oct.", "30", "to", "Nov.", "7", ",", "1979", ",", "10", "$Habitat$",
        "had", "onset", "of", "bacteriologically", "confirmed", "$Bacteria$",
        "."
    ]
    expected_sentences = [Sentence(0, 6), Sentence(6, 24)]
    expected_paragraphs = [Paragraph(0, 1), Paragraph(1, 2)]
    expected_nes = SortedSpansSet([
        Entity("T1", 2, 3, "Habitat"),
        Entity("T2", 2, 3, "Bacteria"),
        Entity("T3", 2, 3, "Bacteria"),
        Entity("T4", 4, 5, "Geographical"),
        Entity("T5", 16, 17, "Habitat"),
        Entity("T6", 16, 17, "Habitat"),
        Entity("T7", 16, 17, "Geographical"),
        Entity("T8", 16, 17, "Geographical"),
        Entity("T9", 22, 23, "Bacteria")
    ])
    expected_entities = SortedSpansSet([
        Entity("_", 0, 1, "left"),
        Entity("_", 2, 3, "same"),
        Entity("_", 2, 3, "include"),
        Entity("_", 4, 5, "same"),
        Entity("_", 14, 17, "intersect"),
        Entity("_", 16, 17, "include"),
        Entity("_", 16, 18, "intersect")
    ])
    expected_doc = Document("_", expected_tokens, expected_sentences,
                            expected_paragraphs, expected_entities,
                            extras={"ne": expected_nes})

    actual_doc = EntitiesCollapser({"Habitat", "Bacteria", "Geographical"},
                                   True).transform(input_doc)
    self.assertEqual(expected_doc, actual_doc)
def convert_from_digger_to_derek(diggerdoc: DiggerDoc, doc_name: str) -> Document:
    tokens = []
    token_features = {
        "pos": [],
        "dt_labels": [],
        "dt_head_distances": [],
        "lemmas": [],
        "feats": []
    }
    for i, token in enumerate(diggerdoc.tokens):
        tokens.append(token.doc_text)
        token_features["pos"].append(token.pos.upos)
        token_features["dt_labels"].append(token.deprel)
        # heads are stored as relative distances; roots (head_index == -1) get 0
        token_features["dt_head_distances"].append(
            token.head_index - i if token.head_index != -1 else 0)
        token_features["lemmas"].append(token.lemma)
        token_features["feats"].append(token.pos.feats)

    sentences = list(
        Sentence(sent.start, sent.end) for sent in diggerdoc.sentences_boundaries)
    # here we assume all doc sentences are in one paragraph
    paragraphs = [Paragraph(0, len(sentences))]

    return Document(doc_name, tokens, sentences, paragraphs,
                    token_features=token_features)
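# A minimal usage sketch, assuming a parsed DiggerDoc instance is at hand
# (`digger_doc` is illustrative, not part of this module):
#
#     doc = convert_from_digger_to_derek(digger_doc, "doc_1")
#     assert len(doc.sentences) == len(digger_doc.sentences_boundaries)
#     # roots (head_index == -1) come out as head distance 0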
def test_2_chains_2_pron(self):
    sentences = [Sentence(0, 10)]
    paragraphs = [Paragraph(0, 1)]
    entities = [
        Entity('_', 0, 1, 'noun'),
        Entity('_', 1, 2, 'pron'),
        Entity('_', 2, 3, 'pron'),
        Entity('_', 3, 4, 'noun'),
        Entity('_', 5, 6, 'noun'),
    ]
    rels = {
        Relation(Entity('_', 0, 1, 'noun'), Entity('_', 2, 3, 'pron'), '1'),
        Relation(Entity('_', 1, 2, 'pron'), Entity('_', 3, 4, 'noun'), '1'),
        Relation(Entity('_', 3, 4, 'noun'), Entity('_', 5, 6, 'noun'), '1'),
    }
    doc = Document('test', [], sentences, paragraphs, entities, rels)
    max_distance = 3

    actual_samples = get_pron_samples(doc, max_distance, True)
    expected_samples = [
        (Entity('_', 0, 1, 'noun'), Entity('_', 1, 2, 'pron'), None),
        (Entity('_', 1, 2, 'pron'), Entity('_', 3, 4, 'noun'), '1'),
        (Entity('_', 0, 1, 'noun'), Entity('_', 2, 3, 'pron'), '1'),
        (Entity('_', 2, 3, 'pron'), Entity('_', 3, 4, 'noun'), None),
    ]
    self.assertEqual(expected_samples, actual_samples)
def _get_doc_from_raw_text(self, raw_text, doc_name) -> Document:
    tokens, sentences, raw_tokens = self.segmenter.segment(raw_text)
    # here we assume the whole text is one paragraph
    paragraphs = [Paragraph(0, len(sentences))]
    return Document(splitext(doc_name)[0], tokens, sentences, paragraphs,
                    token_features={"char_spans": raw_tokens})
def _create_doc(self, doc_raw_tokens: List[List[str]], doc_idx) -> Document:
    tokens, sentences, entities, pos_tags = [], [], [], []
    sent_tokens, sent_pos_tags, sent_entities_labels = [], [], []
    sent_start = 0

    for raw_token in doc_raw_tokens:
        # an empty raw token marks a sentence boundary
        if not raw_token:
            if sent_tokens:
                tokens.extend(sent_tokens)
                pos_tags.extend(sent_pos_tags)
                sentences.append(Sentence(sent_start, sent_start + len(sent_tokens)))
                sent_start += len(sent_tokens)
                entities.extend(self._decode_strategy.decode_labels(
                    sentences[-1], sent_entities_labels))
                sent_tokens, sent_pos_tags, sent_entities_labels = [], [], []
            continue

        token, pos_tag, _, ent_label = raw_token
        sent_tokens.append(token)
        sent_entities_labels.append(ent_label)
        sent_pos_tags.append(pos_tag)

    # flush the last sentence if the input does not end with a blank entry
    if sent_tokens:
        tokens.extend(sent_tokens)
        pos_tags.extend(sent_pos_tags)
        sentences.append(Sentence(sent_start, sent_start + len(sent_tokens)))
        entities.extend(self._decode_strategy.decode_labels(
            sentences[-1], sent_entities_labels))

    return Document(
        str(doc_idx), tokens, sentences, [Paragraph(0, len(sentences))],
        entities, token_features={"pos": pos_tags})
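# Illustrative shape of `doc_raw_tokens` (hypothetical values): each raw token
# is (token, pos_tag, _, entity_label) and an empty entry closes a sentence;
# the exact label scheme depends on self._decode_strategy:
#
#     [["Rickettsia", "NNP", "_", "B-Bacteria"],
#      ["spreads", "VBZ", "_", "O"],
#      [],
#      ["It", "PRP", "_", "O"]]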
def test_borders_extraction_3(self):
    tokens = ["bacteria", "spotted", ".", ".", "it's", "."]
    sentences = [Sentence(0, 3), Sentence(3, 4), Sentence(4, 6)]
    broken_doc = Document("", tokens, sentences, [Paragraph(0, 2)])
    borders = ["start", "in", "end", "start", "start", "end"]
    self.assertEqual(get_sentence_borders_feature(broken_doc), borders)
def _merge(raw_tokens: list, sentences: list, raw_paragraphs: list,
           raw_entities: list, raw_relations: list, *,
           symmetric_types: set = None) \
        -> Tuple[List[Sentence], List[Paragraph], List[Entity], Set[Relation]]:
    """
    :param raw_tokens: list of tuples: (start, end, text)
    :param sentences: list of Sentence objects
    :param raw_paragraphs: list of tuples: (start, end)
    :param raw_entities: list of dicts: {'id', 'type', 'start', 'end'}
    :param raw_relations: list of dicts: {'type', 'first', 'second'}
    :param symmetric_types: relation types to be treated as symmetric
    """
    paragraphs = []
    cur_par_idx = 0
    par_start = 0

    entities = sorted(align_raw_entities(raw_entities, raw_tokens))
    entities_dict = {ent.id: ent for ent in entities}
    sentences = adjust_sentences(sentences, entities)

    for i, sentence in enumerate(sentences):
        for token in raw_tokens[sentence.start_token: sentence.end_token]:
            if par_start != i + 1 and (
                    _end_of_text(sentences, raw_tokens, sentence, token, i)
                    or _end_of_paragraph(raw_paragraphs, cur_par_idx, token)):
                paragraphs.append(Paragraph(par_start, i + 1))
                par_start = i + 1
                cur_par_idx += 1

    return sentences, paragraphs, entities, \
        _get_relations(raw_relations, entities_dict, symmetric_types)
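# Illustrative input shapes, following the docstring above (hypothetical
# values; entity ids must match between raw_entities and raw_relations):
#
#     raw_tokens = [(0, 10, "Recurrence"), (11, 13, "of")]
#     raw_paragraphs = [(0, 13)]
#     raw_entities = [{"id": "T1", "type": "Bacteria", "start": 0, "end": 10}]
#     raw_relations = [{"type": "Lives_in", "first": "T1", "second": "T2"}]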
def read(self, path: str) -> List[Document]:
    all_files = listdir(path)
    file_names = sorted(set(splitext(f)[0] for f in all_files))
    docs = []

    for f in file_names:
        # skip documents without annotations before reading the text
        if f"{f}.ann" not in all_files:
            warn(f"Skipping {f}.txt, no {f}.ann file found")
            continue

        with open(join(path, f"{f}.txt"), "r", encoding="utf-8") as g:
            raw_text = g.read()
        with open(join(path, f"{f}.ann"), "r", encoding="utf-8") as g:
            annotations = g.read()

        tokens, sentences, raw_tokens = self.segmenter.segment(raw_text)
        raw_entities, raw_relations = _read_brat_annotations(annotations)
        raw_entities = _expand_spans(raw_entities)
        sentences, _, entities, relations = _merge(
            raw_tokens, sentences, [], raw_entities, raw_relations)

        if self.collapse_intersecting:
            entities, relations = collapse_intersecting_entities(entities, relations)

        # here we assume the whole doc is one paragraph
        doc = Document(f, tokens, sentences, [Paragraph(0, len(sentences))],
                       entities, relations)
        docs.append(doc)

    return docs
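# Expected directory layout (BRAT standoff format; file names are examples):
#
#     corpus/
#         doc1.txt    # raw document text
#         doc1.ann    # entity (T*) and relation (R*) annotations
#         doc2.txt    # skipped with a warning if no doc2.ann exists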
def test_1_entity(self):
    sentences = [Sentence(0, 10)]
    paragraphs = [Paragraph(0, 1)]
    entities = [Entity('_', 0, 1, '1')]
    doc = Document('test', [], sentences, paragraphs, entities)
    max_distance = 3

    actual_samples = get_samples(doc, max_distance, False)
    expected_samples = []
    self.assertEqual(expected_samples, actual_samples)
def test_no_entities(self):
    sentences = [Sentence(0, 10)]
    paragraphs = [Paragraph(0, 1)]
    entities = []
    doc = Document('test', [], sentences, paragraphs, entities)
    max_distance = 3

    actual_samples = get_noun_samples(doc, max_distance, False)
    expected_samples = []
    self.assertEqual(expected_samples, actual_samples)
def _conllu_text(text) -> str:
    tokens, sentences, _ = segmentor.segment(text)
    token_features = processor.get_token_features(tokens, sentences)
    # here we assume the whole doc is one paragraph
    paragraphs = [Paragraph(0, len(sentences))]
    doc = Document("", tokens, sentences, paragraphs, token_features=token_features)
    return writer.write_to_str(convert_from_derek_to_digger(doc))
def test_2_entity(self):
    sentences = [Sentence(0, 10)]
    paragraphs = [Paragraph(0, 1)]
    entities = [
        Entity('_', 0, 1, 'noun'),
        Entity('_', 1, 2, 'noun'),
    ]
    doc = Document('test', [], sentences, paragraphs, entities)
    max_distance = 3

    actual_samples = get_noun_samples(doc, max_distance, False)
    expected_samples = [(Entity('_', 0, 1, 'noun'), Entity('_', 1, 2, 'noun'), None)]
    self.assertEqual(expected_samples, actual_samples)
def test_2_entity_rel(self):
    sentences = [Sentence(0, 10)]
    paragraphs = [Paragraph(0, 1)]
    entities = [
        Entity('_', 0, 1, '1'),
        Entity('_', 1, 2, '1'),
    ]
    rels = {Relation(Entity('_', 0, 1, '1'), Entity('_', 1, 2, '1'), '1')}
    doc = Document('test', [], sentences, paragraphs, entities, rels)
    max_distance = 3

    actual_samples = get_samples(doc, max_distance, True)
    expected_samples = [(Entity('_', 0, 1, '1'), Entity('_', 1, 2, '1'), '1')]
    self.assertEqual(expected_samples, actual_samples)
def setUp(self):
    doc_tokens = [
        "Human", "and", "tick", "spotted", "fever", "group", "Rickettsia",
        "isolates", "from", "Israel", ":", "a", "genotypic", "analysis", "."
    ] + [
        "The", "precise", "mechanisms", "that", "initiate", "bacterial",
        "uptake", "have", "not", "yet", "been", "elucidated", "."
    ]
    doc_sentences = [Sentence(0, 15), Sentence(15, 28)]
    doc_paragraphs = [Paragraph(0, 2)]
    doc_head_distances = [
        3, -1, -2, 0, 2, 1, 1, 7, -1, -1, 4, 2, 1, 1, -11
    ] + [2, 1, 9, 1, -2, 1, -2, 4, 3, 2, 1, 0, -1]
    doc_dt_labels = ["test"] * len(doc_tokens)
    doc_token_features = {
        "dt_head_distances": doc_head_distances,
        "dt_labels": doc_dt_labels
    }

    # (start_token, end_token, sentence_idx) triples
    self.entity_with_one_token_no_root = (6, 7, 0)
    self.entity_with_several_tokens_no_root = (12, 14, 0)
    self.entity_with_one_token_root = (3, 4, 0)
    self.entity_with_several_tokens_root = (22, 27, 1)

    doc_entities = [
        self.entity_with_one_token_no_root,
        self.entity_with_several_tokens_no_root,
        self.entity_with_one_token_root,
        self.entity_with_several_tokens_root
    ]
    doc_entities = [Entity("", start, end, "") for start, end, _ in doc_entities]

    self.doc = Document("", doc_tokens, doc_sentences, doc_paragraphs,
                        doc_entities, token_features=doc_token_features)
def _get_docs(self, raw_docs: Dict[str, List[dict]],
              groups: Dict[str, list]) -> Dict[str, Document]:
    docs = {}
    for doc_id, raw_tokens in raw_docs.items():
        tokens = []
        token_features = {}
        sentences = []
        sent_start = 0
        shift2idx = {}

        for i, raw_token in enumerate(raw_tokens):
            tokens.append(raw_token['token'])
            token_features.setdefault('lemma', []).append(raw_token['lemma'])
            token_features.setdefault('gram', []).append(raw_token['gram'])
            if "speech" in raw_token:
                token_features.setdefault("speech", []).append(raw_token['speech'])
                token_features.setdefault("said", []).append(raw_token['said'])
                token_features.setdefault("author_comment", []).append(
                    raw_token['author_comment'])
                token_features.setdefault("speech_verb", []).append(
                    raw_token['speech_verb'])
            shift2idx[raw_token['shift']] = i
            if raw_token['gram'] == 'SENT':
                sentences.append(Sentence(sent_start, i + 1))
                sent_start = i + 1

        # flush the trailing sentence if the document does not end with SENT
        if not sentences or sentences[-1].end_token != len(tokens):
            sentences.append(Sentence(sent_start, len(tokens)))

        entities = self._get_entities(groups, shift2idx, doc_id)
        sentences = adjust_sentences(sentences, entities)
        doc = Document(doc_id, tokens, sentences,
                       [Paragraph(0, len(sentences))], entities,
                       token_features=token_features)
        docs[doc_id] = doc
    return docs
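# Illustrative raw_token dict (hypothetical values): 'shift' is the character
# offset used via shift2idx to map entity annotations to token indices, and
# gram == 'SENT' closes the current sentence:
#
#     {"token": "тренер", "lemma": "тренер", "gram": "NOUN", "shift": 8}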
def setUp(self) -> None: tokens = [ "I", "will", "do", "my", "homework", "today", ".", "It", "is", "very", "hard", "but", "i", "don't", "care", "." ] sentences = [Sentence(0, 7), Sentence(7, 16)] paragraphs = [Paragraph(0, 2)] entities = [ Entity("_", 0, 1, "t1"), Entity("_", 3, 5, "t2"), Entity("_", 7, 8, "t1"), Entity("_", 9, 11, "t2"), Entity("_", 10, 11, "t4") ] self.doc = Document("_", tokens, sentences, paragraphs, entities) self.relations = { Relation(entities[2], entities[3], "t1"), Relation(entities[3], entities[4], "t2") }
def setUp(self) -> None:
    tokens = [
        "Главный", "тренер", "римского", "«", "Лацио", "»", "Симоне",
        "Индзаги", "продолжит", "работу", "с", "командой", ",", "сообщает",
        "пресс-служба", "клуба", ".", "Ранее", "сообщалось", ",", "что", "в",
        "услугах", "Индзаги", "заинтересованы", "«", "Милан", "»", "и", "«",
        "Ювентус", "»", ",", "которые", "пребывают", "без", "наставников",
        "после", "ухода", "Дженнаро", "Гаттузо", "и", "Массимилиано",
        "Аллегри", "."
    ]
    sentences = [Sentence(0, 17), Sentence(17, 45)]
    paragraphs = [Paragraph(0, 1)]
    entities = [
        Entity("T1", 4, 5, "Team"),
        Entity("T2", 6, 8, "PlayerCoach1"),
        Entity("T3", 23, 24, "PlayerCoach2"),
        Entity("T4", 26, 27, "TeamFilter"),
        Entity("T5", 30, 31, "Team"),
        Entity("T6", 39, 41, "Coach"),
        Entity("T7", 42, 44, "Coach")
    ]
    self.doc = Document("_", tokens, sentences, paragraphs, entities)
def predict_doc(self, text, raw_entities, need_entities, need_relations):
    """
    :param raw_entities: list of {"id", "start", "end", "type"} dicts
    :return: (raw_entities, raw_relations) where:
        raw_entities is a list of {"id", "start", "end", "type"} dicts or None
        raw_relations is a list of {"first", "second", "type"} dicts or None
    """
    if self.ent_clf is None and raw_entities is None and (need_entities or need_relations):
        raise BadRequest("Server doesn't support entity recognition")
    if self.rel_clf is None and need_relations:
        raise BadRequest("Server doesn't support relation extraction")

    tokens, sentences, raw_tokens = self.segmenter.segment(text)
    doc = Document("_", tokens, sentences, [Paragraph(0, len(sentences))])
    doc = self.transformer.transform(doc)

    entities = None
    if raw_entities is not None:
        if need_relations:
            entities = align_raw_entities(raw_entities, raw_tokens)
        if not need_entities:
            raw_entities = None
    elif need_entities or need_relations:
        entities = self.ent_clf.predict_doc(doc)
        if need_entities:
            raw_entities = self._to_raw_entities(entities, raw_tokens)

    raw_relations = None
    if need_relations:
        doc = doc.with_entities(entities)
        relations = self.rel_clf.predict_doc(doc)
        raw_relations = self._to_raw_relations(relations)

    return raw_entities, raw_relations
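# Illustrative call (hypothetical handler instance; entities are predicted
# here because raw_entities is None):
#
#     raw_entities, raw_relations = handler.predict_doc(
#         "Rickettsia was isolated in Israel.", None, True, True)
#     # each raw relation references ids from raw_entities, e.g.
#     # {"first": "T1", "second": "T2", "type": "Lives_in"}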
def test_2_entity_long(self):
    sentences = [
        Sentence(0, 3),
        Sentence(3, 5),
        Sentence(5, 10),
    ]
    paragraphs = [Paragraph(0, 3)]
    entities = [
        Entity('_', 0, 1, '1'),
        Entity('_', 3, 4, '1'),
        Entity('_', 5, 6, '1'),
    ]
    doc = Document('test', [], sentences, paragraphs, entities)
    max_distance = 1

    actual_samples = get_samples(doc, max_distance, False)
    expected_samples = [
        (Entity('_', 0, 1, '1'), Entity('_', 3, 4, '1'), None),
        (Entity('_', 3, 4, '1'), Entity('_', 5, 6, '1'), None)
    ]
    self.assertEqual(expected_samples, actual_samples)
def _read_document(self, directory, name):
    path = join(directory, name)
    tokens, fre_id2token_id, sentences, char_spans = \
        self._get_tokens_and_sentences(path)
    paragraphs = [Paragraph(0, len(sentences))]
    entities = self._get_entities(path, fre_id2token_id)
    return Document(name, tokens, sentences, paragraphs, entities,
                    token_features={"char_spans": char_spans})
def setUp(self) -> None:
    self.docs = []

    tokens = [
        "Главный", "тренер", "римского", "«", "Лацио", "»", "Симоне",
        "Индзаги", "продолжит", "работу", "с", "командой", ",", "сообщает",
        "пресс-служба", "клуба", ".", "Ранее", "сообщалось", ",", "что", "в",
        "услугах", "Индзаги", "заинтересованы", "«", "Милан", "»", "и", "«",
        "Ювентус", "»", ",", "которые", "пребывают", "без", "наставников",
        "после", "ухода", "Дженнаро", "Гаттузо", "и", "Массимилиано",
        "Аллегри", "."
    ]
    sentences = [Sentence(0, 17), Sentence(17, 45)]
    paragraphs = [Paragraph(0, 1)]
    entities = [
        Entity("T1", 4, 5, "Team"),
        Entity("T2", 6, 8, "Coach"),
        Entity("T3", 23, 24, "Coach"),
        Entity("T4", 26, 27, "Team"),
        Entity("T5", 30, 31, "Team"),
        Entity("T6", 39, 41, "Coach"),
        Entity("T7", 42, 44, "Coach")
    ]
    named_entities = [
        Entity("generated", 3, 6, "ORG"),
        Entity("generated", 6, 8, "PER"),
        Entity("generated", 23, 24, "PER"),
        Entity("generated", 25, 28, "ORG"),
        Entity("generated", 29, 32, "ORG"),
        Entity("generated", 39, 41, "PER"),
        Entity("generated", 42, 44, "PER")
    ]
    doc = Document("_", tokens, sentences, paragraphs, entities,
                   extras={"ne": SortedSpansSet(named_entities)})
    self.docs.append(doc)

    tokens = [
        "Врачи", "сборной", "Бразилии", "подтвердили", "травму",
        "нападающего", "«", "Пари", "Сен-Жермен", "»", "Неймара", ",",
        "полученную", "во", "время", "товарищеского", "матча", "с",
        "Катаром", "."
    ]
    sentences = [Sentence(0, 20)]
    paragraphs = [Paragraph(0, 1)]
    entities = [
        Entity("T1", 1, 3, "Team"),
        Entity("T2", 7, 9, "Team"),
        Entity("T3", 10, 11, "Player"),
        Entity("T4", 18, 19, "Team")
    ]
    named_entities = [
        Entity("generated", 1, 3, "ORG"),
        Entity("generated", 6, 10, "ORG"),
        Entity("generated", 10, 11, "PER"),
        Entity("generated", 18, 19, "ORG")
    ]
    doc = Document("_", tokens, sentences, paragraphs, entities,
                   extras={"ne": SortedSpansSet(named_entities)})
    self.docs.append(doc)

    self.common_props = {
        "seed": 1,
        "internal_emb_size": 10,
        "learning_rate": 0.005,
        "batcher": {
            "batch_size": 4,
        },
        "encoding_size": 1,
        "dropout": 0.5,
        "optimizer": "adam",
        "epoch": 2,
        "clip_norm": 5
    }

    self.docs_no_entities = [d.without_entities() for d in self.docs]
def setUp(self) -> None:
    self.docs = [
        Document(
            '1',
            ['Во', 'время', 'своих', 'прогулок', 'в', 'окрестностях',
             'Симеиза', 'я', 'обратил', 'внимание', 'на', 'одинокую', 'дачу',
             ',', 'стоявшую', 'на', 'крутом', 'склоне', 'горы', '.', 'К',
             'этой', 'даче', 'не', 'было', 'проведено', 'даже', 'дороги', '.',
             'Кругом', 'она', 'была', 'обнесена', 'высоким', 'забором', ',',
             'с', 'единственной', 'низкой', 'калиткой', ',', 'которая',
             'всегда', 'была', 'плотно', 'прикрыта', '.'],
            [Sentence(0, 20), Sentence(20, 29), Sentence(29, 47)],
            [Paragraph(0, 3)],
            [Entity('1', 2, 3, 'pron'), Entity('1', 7, 8, 'pron'),
             Entity('1', 11, 13, 'noun'), Entity('1', 21, 23, 'noun'),
             Entity('1', 30, 31, 'pron'), Entity('1', 33, 35, 'noun'),
             Entity('1', 37, 38, 'noun'), Entity('1', 37, 40, 'noun'),
             Entity('1', 41, 42, 'pron')],
            {
                Relation(Entity('1', 2, 3, 'pron'), Entity('1', 7, 8, 'pron'), 'COREF'),
                Relation(Entity('1', 11, 13, 'noun'), Entity('1', 21, 23, 'noun'), 'COREF'),
                Relation(Entity('1', 11, 13, 'noun'), Entity('1', 30, 31, 'pron'), 'COREF'),
                Relation(Entity('1', 21, 23, 'noun'), Entity('1', 30, 31, 'pron'), 'COREF'),
                Relation(Entity('1', 37, 40, 'noun'), Entity('1', 41, 42, 'pron'), 'COREF'),
            },
            {
                'pos': ['ADP', 'NOUN', 'DET', 'NOUN', 'ADP', 'NOUN', 'PROPN',
                        'PRON', 'VERB', 'NOUN', 'ADP', 'ADJ', 'NOUN', 'PUNCT',
                        'VERB', 'ADP', 'ADJ', 'NOUN', 'NOUN', 'PUNCT', 'ADP',
                        'DET', 'NOUN', 'PART', 'AUX', 'VERB', 'PART', 'NOUN',
                        'PUNCT', 'ADV', 'PRON', 'AUX', 'VERB', 'ADJ', 'NOUN',
                        'PUNCT', 'ADP', 'ADJ', 'ADJ', 'NOUN', 'PUNCT', 'PRON',
                        'ADV', 'AUX', 'ADV', 'VERB', 'PUNCT'],
                'dt_labels': ['case', 'fixed', 'amod', 'obl', 'case', 'nmod',
                              'nmod', 'nsubj', 'root', 'obj', 'case', 'amod',
                              'nmod', 'punct', 'amod', 'case', 'amod', 'obl',
                              'nmod', 'punct', 'case', 'amod', 'obl', 'advmod',
                              'aux:pass', 'root', 'advmod', 'nsubj', 'punct',
                              'advmod', 'nsubj', 'aux:pass', 'root', 'amod',
                              'obl', 'punct', 'case', 'amod', 'amod', 'conj',
                              'punct', 'nsubj', 'advmod', 'aux:pass', 'advmod',
                              'acl:relcl', 'punct'],
                'dt_head_distances': [3, -1, 1, 5, 1, -2, -1, 1, 0, -1, 2, 1,
                                      -3, -1, -2, 2, 1, -3, -1, -1, 2, 1, 3, 2,
                                      1, 0, 1, -2, -1, 3, 2, 1, 0, 1, -2, -1,
                                      3, 2, 1, -5, -1, 4, 3, 2, 1, -11, -1],
                'lemmas': ['во', 'время', 'свой', 'прогулка', 'в',
                           'окрестность', 'Симеиза', 'я', 'обращать',
                           'внимание', 'на', 'одинокий', 'дача', ',', 'стоять',
                           'на', 'крутой', 'склон', 'гора', '.', 'к', 'этот',
                           'дача', 'не', 'быть', 'проводить', 'даже', 'дорога',
                           '.', 'кругом', 'она', 'быть', 'обнесен', 'высокий',
                           'забор', ',', 'с', 'единственный', 'низкий',
                           'калитка', ',', 'который', 'всегда', 'быть',
                           'плотно', 'прикрывать', '.'],
                'feats': [
                    {},
                    {'Case': 'Accusative', 'Animacy': 'Inanimated', 'Number': 'Singular', 'Gender': 'Neuter'},
                    {'Number': 'Plural', 'Pronoun': 'REFLEXIVE', 'Case': 'Genitive'},
                    {'Case': 'Genitive', 'Animacy': 'Inanimated', 'Number': 'Plural', 'Gender': 'Neuter'},
                    {},
                    {'Case': 'Prepositional', 'Animacy': 'Inanimated', 'Number': 'Plural', 'Gender': 'Masculine'},
                    {'Case': 'Genitive', 'Animacy': 'Inanimated', 'Number': 'Singular', 'Gender': 'Masculine'},
                    {'Animacy': 'Animated', 'Gender': 'Masculine', 'Number': 'Singular', 'Pronoun': 'DEICTIC', 'Case': 'Nominative'},
                    {'Number': 'Singular', 'Gender': 'Masculine', 'Tense': 'Past', 'Mode': 'Indicative'},
                    {'Case': 'Accusative', 'Animacy': 'Inanimated', 'Number': 'Singular', 'Gender': 'Neuter'},
                    {},
                    {'Case': 'Accusative', 'Representation': 'Participle', 'Number': 'Singular', 'Gender': 'Feminine', 'Tense': 'NotPast'},
                    {'Case': 'Accusative', 'Animacy': 'Inanimated', 'Number': 'Singular', 'Gender': 'Feminine'},
                    {},
                    {'Case': 'Accusative', 'Representation': 'Participle', 'Number': 'Singular', 'Gender': 'Feminine', 'Tense': 'Past'},
                    {},
                    {'Case': 'Prepositional', 'Number': 'Singular', 'Gender': 'Masculine'},
                    {'Case': 'Prepositional', 'Animacy': 'Inanimated', 'Number': 'Singular', 'Gender': 'Masculine'},
                    {'Case': 'Genitive', 'Animacy': 'Inanimated', 'Number': 'Singular', 'Gender': 'Feminine'},
                    {},
                    {},
                    {'Case': 'Dative', 'Number': 'Singular', 'Gender': 'Feminine'},
                    {'Case': 'Dative', 'Animacy': 'Inanimated', 'Number': 'Singular', 'Gender': 'Feminine'},
                    {},
                    {'Number': 'Singular', 'Gender': 'Neuter', 'Tense': 'Past', 'Mode': 'Indicative'},
                    {'Representation': 'Participle', 'Number': 'Singular', 'Gender': 'Neuter', 'Shortness': 'Short', 'Tense': 'Past', 'Voice': 'Passive'},
                    {},
                    {'Case': 'Genitive', 'Animacy': 'Inanimated', 'Number': 'Singular', 'Gender': 'Feminine'},
                    {},
                    {},
                    {'Animacy': 'Animated', 'Gender': 'Feminine', 'Number': 'Singular', 'Pronoun': 'PERSONAL', 'Case': 'Nominative'},
                    {'Number': 'Singular', 'Gender': 'Feminine', 'Tense': 'Past', 'Mode': 'Indicative'},
                    {'Representation': 'Participle', 'Number': 'Singular', 'Gender': 'Feminine', 'Shortness': 'Short', 'Tense': 'Past', 'Voice': 'Passive'},
                    {'Case': 'Instrumental', 'Number': 'Singular', 'Gender': 'Masculine'},
                    {'Case': 'Instrumental', 'Animacy': 'Inanimated', 'Number': 'Singular', 'Gender': 'Masculine'},
                    {},
                    {},
                    {'Case': 'Instrumental', 'Number': 'Singular', 'Gender': 'Feminine'},
                    {'Case': 'Instrumental', 'Number': 'Singular', 'Gender': 'Feminine'},
                    {'Case': 'Instrumental', 'Animacy': 'Inanimated', 'Number': 'Singular', 'Gender': 'Feminine'},
                    {},
                    {'Case': 'Nominative', 'Number': 'Singular', 'Gender': 'Feminine'},
                    {},
                    {'Number': 'Singular', 'Gender': 'Feminine', 'Tense': 'Past', 'Mode': 'Indicative'},
                    {},
                    {'Representation': 'Participle', 'Number': 'Singular', 'Gender': 'Feminine', 'Shortness': 'Short', 'Tense': 'Past', 'Voice': 'Passive'},
                    {}
                ],
                'said': ['O'] * 47,
            },
            {
                'ne': SortedSpansSet([Entity('1', 6, 7, 'GPE_CITY')])
            }
        ),
        Document(
            '1',
            ['Когда', 'мы', 'шли', 'по', 'тропинке', ',', 'каждый', 'был',
             'доволен', 'и', 'думал', ',', 'что', 'надул', 'другого', '.',
             'Петька', 'изредка', 'посапывал', 'носом', '.', 'Давно', 'он',
             'зарился', 'на', 'моих', 'голубей', ',', 'еще', 'с', 'прошлой',
             'зимы', ',', 'а', 'теперь', 'вот', 'счастье', 'неожиданно',
             'привалило', '.', 'А', 'у', 'меня', 'будет', 'пистолет', '.'],
            [Sentence(0, 16), Sentence(16, 21), Sentence(21, 40),
             Sentence(40, 46)],
            [Paragraph(0, 3)],
            [
                Entity('1', 1, 2, 'pron'),
                Entity('1', 16, 17, 'noun'),
                Entity('1', 22, 23, 'pron'),
                Entity('1', 25, 26, 'pron'),
                Entity('1', 25, 27, 'noun'),
                Entity('1', 42, 43, 'pron'),
                Entity('1', 44, 45, 'noun'),
            ],
            {
                Relation(Entity('1', 16, 17, 'noun'), Entity('1', 22, 23, 'pron'), 'COREF'),
                Relation(Entity('1', 25, 26, 'pron'), Entity('1', 42, 43, 'pron'), 'COREF'),
            },
            {
                'pos': ['SCONJ', 'PRON', 'VERB', 'ADP', 'NOUN', 'PUNCT', 'ADJ',
                        'AUX', 'ADJ', 'CCONJ', 'VERB', 'PUNCT', 'SCONJ',
                        'VERB', 'ADJ', 'PUNCT', 'NOUN', 'ADV', 'VERB', 'NOUN',
                        'PUNCT', 'ADV', 'PRON', 'VERB', 'ADP', 'DET', 'NOUN',
                        'PUNCT', 'ADV', 'ADP', 'NOUN', 'NOUN', 'PUNCT',
                        'CCONJ', 'ADV', 'PART', 'NOUN', 'ADV', 'VERB', 'PUNCT',
                        'CCONJ', 'ADP', 'PRON', 'VERB', 'NOUN', 'PUNCT'],
                'dt_labels': ['mark', 'nsubj', 'advcl', 'case', 'obl', 'punct',
                              'nsubj', 'cop', 'root', 'cc', 'conj', 'punct',
                              'mark', 'advcl', 'obj', 'punct', 'nsubj',
                              'advmod', 'root', 'obl', 'punct', 'advmod',
                              'nsubj', 'root', 'case', 'amod', 'obl', 'punct',
                              'advmod', 'case', 'obl', 'nmod', 'punct', 'cc',
                              'advmod', 'advmod', 'nsubj', 'advmod', 'conj',
                              'punct', 'cc', 'case', 'root', 'cop', 'nsubj',
                              'punct'],
                'dt_head_distances': [8, 1, 6, 1, -2, -1, 2, 1, 0, 1, -2, -1,
                                      -2, -3, -1, -1, 2, 1, 0, -1, -1, 2, 1, 0,
                                      2, 1, -3, -1, 2, 1, -7, -1, -1, 5, 4, 1,
                                      2, 1, -15, -1, 2, 1, 0, -1, -2, -1],
                'lemmas': ['когда', 'мы', 'идти', 'по', 'тропинка', ',',
                           'каждый', 'быть', 'довольный', 'и', 'думать', ',',
                           'что', 'надуть', 'другой', '.', 'Петька', 'изредка',
                           'посапывать', 'нос', '.', 'давно', 'он', 'зариться',
                           'на', 'мой', 'голубь', ',', 'еще', 'с', 'прошлый',
                           'зима', ',', 'а', 'теперь', 'вот', 'счастье',
                           'неожиданно', 'приваливать', '.', 'а', 'у', 'я',
                           'быть', 'пистолет', '.'],
                'feats': [
                    {},
                    {'Animacy': 'Animated', 'Number': 'Plural', 'Pronoun': 'DEICTIC', 'Case': 'Nominative'},
                    {'Number': 'Plural', 'Tense': 'Past', 'Mode': 'Indicative'},
                    {},
                    {'Case': 'Dative', 'Animacy': 'Inanimated', 'Number': 'Singular', 'Gender': 'Feminine'},
                    {},
                    {'Case': 'Nominative', 'Number': 'Singular', 'Gender': 'Masculine'},
                    {'Number': 'Singular', 'Gender': 'Masculine', 'Tense': 'Past', 'Mode': 'Indicative'},
                    {'Number': 'Singular', 'Gender': 'Masculine', 'Shortness': 'Short'},
                    {},
                    {'Number': 'Singular', 'Gender': 'Masculine', 'Tense': 'Past', 'Mode': 'Indicative'},
                    {},
                    {},
                    {'Number': 'Singular', 'Gender': 'Masculine', 'Tense': 'Past', 'Mode': 'Indicative'},
                    {'Case': 'Accusative', 'Animacy': 'Animated', 'Number': 'Singular', 'Gender': 'Masculine'},
                    {},
                    {'Case': 'Nominative', 'Animacy': 'Animated', 'Number': 'Singular', 'Gender': 'Masculine'},
                    {},
                    {'Number': 'Singular', 'Gender': 'Masculine', 'Tense': 'Past', 'Mode': 'Indicative'},
                    {'Case': 'Instrumental', 'Animacy': 'Inanimated', 'Number': 'Singular', 'Gender': 'Masculine'},
                    {},
                    {},
                    {'Animacy': 'Animated', 'Gender': 'Masculine', 'Number': 'Singular', 'Pronoun': 'PERSONAL', 'Case': 'Nominative'},
                    {'Number': 'Singular', 'Gender': 'Masculine', 'Tense': 'Past', 'Mode': 'Indicative'},
                    {},
                    {'Animacy': 'Animated', 'Number': 'Plural', 'Pronoun': 'POSSESSIVE', 'Case': 'Accusative'},
                    {'Case': 'Accusative', 'Animacy': 'Animated', 'Number': 'Plural', 'Gender': 'Masculine'},
                    {},
                    {},
                    {},
                    {'Case': 'Genitive', 'Number': 'Singular', 'Gender': 'Feminine'},
                    {'Case': 'Genitive', 'Animacy': 'Inanimated', 'Number': 'Singular', 'Gender': 'Feminine'},
                    {},
                    {},
                    {},
                    {},
                    {'Case': 'Nominative', 'Animacy': 'Inanimated', 'Number': 'Singular', 'Gender': 'Neuter'},
                    {},
                    {'Number': 'Singular', 'Gender': 'Neuter', 'Tense': 'Past', 'Mode': 'Indicative'},
                    {},
                    {},
                    {},
                    {'Animacy': 'Animated', 'Gender': 'Masculine', 'Number': 'Singular', 'Pronoun': 'DEICTIC', 'Case': 'Genitive'},
                    {'Person': 'Third', 'Number': 'Singular', 'Tense': 'NotPast', 'Mode': 'Indicative'},
                    {'Case': 'Nominative', 'Animacy': 'Inanimated', 'Number': 'Singular', 'Gender': 'Masculine'},
                    {}
                ],
                'said': ['O'] * 46,
            },
            {
                'ne': SortedSpansSet([Entity('1', 16, 17, 'PERSON')])
            }
        )
    ]

    # empty sets are "known" rels
    self.hook = get_hook([doc.without_relations().with_relations(set())
                          for doc in self.docs])

    self.base_props = {
        "seed": 12345,
        "distance": 10,
        "max_distance": 10,
        "loss": "cross_entropy",
        "optimizer": "momentum",
        "lr_decay": 0.05,
        "momentum": 0.9,
        "dropout": 0.5,
        "internal_size": 10,
        "epoch": 1,
        "batch_size": 64,
        "learning_rate": 0.1,
        "clip_norm": 5,
        "max_candidate_distance": 50,
        "max_entity_distance": 50,
        "max_word_distance": 50,
        "max_sent_distance": 10,
        "max_dt_distance": 10,
        "dist_size": 50,
        "pos_emb_size": 0,
        "morph_feats_emb_size": 0,
        "entities_types_size": 20,
        "morph_feats_size": 0,
        "morph_feats_list": ["Gender", "Animacy", "Number"],
        "encoding_type": "lstm",
        "entity_encoding_size": 10,
        "encoding_size": 10,
        "classifiers": ["exact_match", "intersecting_mentions"],
        "use_filter": False,
        "max_sent_entities_distance": 10,
        "max_token_entities_distance": 20,
        "agreement_types": ["Gender", "Animacy", "Number"],
        "classifier_agreement_size": 0,
        "head_str_match_size": 0,
        "partial_str_match_size": 0,
        "ordered_partial_str_match_size": 0,
        "mention_interrelation_size": 0,
        "mention_distance_size": 0,
        "max_mention_distance": 50,
        "classifier_entity_distance_size": 0,
        "entities_types_in_classifier_size": 0,
        "head_ne_types_size": 0,
        "entities_token_distance_in_classifier_size": 0,
        "entities_sent_distance_in_classifier_size": 0,
        "encoder_entity_types_size": 0,
        "encoder_entity_ne_size": 0,
        "speech_types": ["said"],
        "speech_size": 0,
        "entity_encoding_type": "rnn",
        "classification_dense_size": 20,
    }

    self.experiment_props = {
        "sampling_strategy": ["coref_noun", "coref_pron_cluster",
                              "coref_pron_cluster_strict", "coref_pron"]
    }
def setUp(self) -> None:
    self.docs = []

    # BB-event-4329237
    tokens = [
        "The", "in", "vitro", "assay", "of", "tuberculin", "hypersensitivity",
        "in", "Macaca", "mulatta", "sensitized", "with", "bacille", "Calmette",
        "Guerin", "cell", "wall", "vaccine", "and-or", "infected", "with",
        "virulent", "Mycobacterium", "tuberculosis", "."
    ]
    sentences = [Sentence(0, 25)]
    paragraphs = [Paragraph(0, 1)]
    entities = [
        Entity("T2", 8, 18, "Habitat"),
        Entity("T3", 8, 24, "Habitat"),
        Entity("T4", 12, 18, "Habitat"),
        Entity("T5", 12, 15, "Bacteria"),
        Entity("T6", 22, 24, "Bacteria")
    ]
    relations = {Relation(entities[4], entities[1], "Lives_In")}

    # token features generated by UDPipe
    pos = [
        'DET', 'ADP', 'NOUN', 'NOUN', 'ADP', 'NOUN', 'NOUN', 'ADP', 'PROPN',
        'PROPN', 'VERB', 'ADP', 'PROPN', 'PROPN', 'PROPN', 'NOUN', 'NOUN',
        'NUM', 'NOUN', 'VERB', 'ADP', 'ADJ', 'PROPN', 'NOUN', 'PUNCT'
    ]
    dt_labels = [
        'det', 'case', 'compound', 'nsubj', 'case', 'compound', 'nmod',
        'case', 'compound', 'nmod', 'root', 'case', 'compound', 'flat',
        'compound', 'compound', 'obl', 'nummod', 'appos', 'acl', 'case',
        'amod', 'compound', 'obl', 'punct'
    ]
    dt_head_distances = [
        3, 2, 1, 7, 2, 1, -3, 2, 1, -6, 0, 5, 2, -1, 2, 1, -6, 1, -2, -1, 3,
        2, 1, -4, -14
    ]
    token_features = {
        "pos": pos,
        "dt_labels": dt_labels,
        "dt_head_distances": dt_head_distances
    }
    self.docs.append(
        Document("_", tokens, sentences, paragraphs, entities, relations,
                 token_features))

    # BB-event-9564489
    tokens = [
        'Gingivomandibular', 'infection', 'due', 'to', 'Mycobacterium',
        'kansasii', 'in', 'a', 'patient', 'with', 'AIDS', '.'
    ]
    sentences = [Sentence(0, 12)]
    paragraphs = [Paragraph(0, 1)]
    entities = [
        Entity("T2", 0, 1, "Habitat"),
        Entity("T3", 4, 6, "Bacteria"),
        Entity("T4", 8, 11, "Habitat")
    ]
    relations = {
        Relation(entities[1], entities[0], "Lives_In"),
        Relation(entities[1], entities[2], "Lives_In")
    }

    # token features generated by UDPipe
    pos = [
        'ADJ', 'NOUN', 'ADP', 'ADP', 'PROPN', 'PROPN', 'ADP', 'DET', 'NOUN',
        'ADP', 'NOUN', 'PUNCT'
    ]
    dt_labels = [
        'amod', 'root', 'case', 'fixed', 'compound', 'nmod', 'case', 'det',
        'nmod', 'case', 'nmod', 'punct'
    ]
    dt_head_distances = [1, 0, 3, -1, 1, -4, 2, 1, -7, 1, -2, -10]
    token_features = {
        "pos": pos,
        "dt_labels": dt_labels,
        "dt_head_distances": dt_head_distances
    }
    self.docs.append(
        Document("_", tokens, sentences, paragraphs, entities, relations,
                 token_features))

    self.docs_no_rels = [doc.without_relations() for doc in self.docs]

    self.props = {
        "shared": {
            "internal_emb_size": 10,
            "token_position_size": 10,
            "max_word_distance": 20,
            "dt_distance_emb_size": 10,
            "max_dt_distance": 10,
            "dt_depth_emb_size": 10,
            "max_dt_depth": 10,
            "pos_emb_size": 10
        },
        "add_we": "true",
        "add_shared": "true",
        "optimizer": "adam",
        "learning_rate": 0.01,
        "epoch": 2,
        "loss": "cross_entropy",
        "l2": 0.0001,
        "lr_decay": 0.1,
        "dropout": 0.5,
        "clip_norm": 1,
        "max_candidate_distance": 20,
        "batcher": {
            "batch_size": 8
        },
        "token_position_size": 10,
        "max_word_distance": 10,
        "encoding_size": 10,
        "entities_types_emb_size": 20,
        "entities_depth_emb_size": 10,
        "max_entities_depth": 2,
        "specific_encoder_size": 10,
        "aggregation": {
            "attention": {},
            "max_pooling": {},
            "mean_pooling": {},
            "take_spans": {},
            "last_hiddens": {}
        },
        "seed": 100
    }

    # GENIA id=10022435
    tokens = [
        "Glucocorticoid", "resistance", "in", "the", "squirrel", "monkey",
        "is", "associated", "with", "overexpression", "of", "the",
        "immunophilin", "FKBP51", "."
    ]
    sentences = [Sentence(0, 15)]
    paragraphs = [Paragraph(0, 1)]
    pos = [
        "NN", "NN", "IN", "DT", "NN", "NN", "VBZ", "VBN", "IN", "NN", "IN",
        "DT", "NN", "NN", "PERIOD"
    ]
    dt_labels = [
        "compound", "nsubjpass", "case", "det", "compound", "nmod", "auxpass",
        "root", "case", "nmod", "case", "det", "compound", "nmod", "dep"
    ]
    dt_head_distances = [1, 6, 3, 2, 1, -4, 1, 0, 1, -2, 3, 2, 1, -4, -7]
    token_features = {
        "pos": pos,
        "dt_labels": dt_labels,
        "dt_head_distances": dt_head_distances
    }
    self.unlabeled_docs = [
        Document("_", tokens, sentences, paragraphs,
                 token_features=token_features)
    ]

    self.sdp_config = {
        "context_encoding_non_linearity_size": 10,
        "loss": "cross_entropy",
        "learning_rate": 0.02,
        "query_dense_size": 10,
        "clip_norm": 1,
        "batcher": {
            "batch_size": 1
        }
    }
    self.parser_config = {
        "context_encoding_non_linearity_size": 10,
        "loss": "cross_entropy",
        "learning_rate": 0.02,
        "clip_norm": 1,
        "batcher": {
            "batch_size": 1
        },
        "add_shared": True,
        "specific_encoder_size": 10,
        "sampling_strategy": "pos_filtering",
        "arc_token_distance_in_classifier_size": 10,
        "arc_token_distance_in_attention_size": 10,
        "max_arc_token_distance": 10,
        "aggregation": {
            "attention": {
                "type": "luong",
                "normalise_coefficients": True
            },
            "take_spans": {}
        }
    }
def _get_lemma(token, transformer):
    doc = Document("", [token], [Sentence(0, 1)], [Paragraph(0, 1)])
    featured_doc = transformer.transform(doc)
    return featured_doc.token_features['lemmas'][0]
def make_document_from_json_file(file_path):
    d = load_json_file_as_dict(file_path)

    tokens = d.get('tokens', [])
    entities = d.get('entities', [])
    sentences = d.get('sentences', [])
    paragraphs = d.get('paragraphs', [])

    token_features = {}
    for feature in [
            'pos', 'entities_types', 'entities_depths', 'borders',
            'dt_labels', 'dt_head_distances', 'dt_depths',
            'dt_deltas_forward', 'dt_deltas_backward',
            'dt_breakups_forward', 'dt_breakups_backward'
    ]:
        if feature in d:
            token_features[feature] = d[feature]

    relations = d.get('relations', [])

    doc_entities = []
    for ent in entities:
        id_, start_token, end_token, ent_type = tuple(ent)
        doc_entities.append(Entity(id_, start_token, end_token, ent_type))

    doc_sentences = []
    for sent in sentences:
        start_token, end_token = tuple(sent)
        doc_sentences.append(Sentence(start_token, end_token))

    doc_paragraphs = []
    for par in paragraphs:
        start_sentence, end_sentence = tuple(par)
        doc_paragraphs.append(Paragraph(start_sentence, end_sentence))

    doc_relations = []
    for rel in relations:
        e1 = None
        e2 = None
        e1_id, e2_id, rel_type = tuple(rel)
        # resolve entity ids to the Entity objects created above
        for entity in doc_entities:
            if entity.id == e1_id:
                e1 = entity
            if entity.id == e2_id:
                e2 = entity
            if e1 is not None and e2 is not None:
                break
        doc_relations.append(Relation(e1, e2, rel_type))

    doc = Document("", tokens, doc_sentences, doc_paragraphs,
                   token_features=token_features)
    if 'entities' in d:
        doc = doc.with_entities(doc_entities)
    if 'relations' in d:
        doc = doc.with_relations(doc_relations)
    return doc
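# Illustrative JSON layout this loader accepts (hypothetical values; any of
# the listed token feature keys may also appear at the top level):
#
#     {
#         "tokens": ["Rickettsia", "was", "found", "."],
#         "sentences": [[0, 4]],
#         "paragraphs": [[0, 1]],
#         "entities": [["T1", 0, 1, "Bacteria"], ["T2", 2, 3, "Habitat"]],
#         "relations": [["T1", "T2", "Lives_in"]],
#         "pos": ["PROPN", "AUX", "VERB", "PUNCT"]
#     }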