def __merge(self, dataset, annotators):
    """
    Merge the entities of several annotators into `dataset`, in place.

    For every document the `*<doc_id>*.ann.json` files found under
    `<self.directory>/<annotator>/` are loaded; a file is taken into account
    when it is marked `anncomplete` or when `self.delete_incomplete_docs` is
    false. The per-annotator entity lists are then reduced to a single list
    with `__merge_priority` (when `self.entity_strategy == 'priority'`) or
    `__merge_pair`, and the result is appended to the parts' annotations
    (or predicted annotations when `self.is_predicted`).

    Documents are deleted from the dataset when they fail the IAA filter or
    when no usable ann.json was read for them. Parts that no ann.json lists
    as annotatable are deleted from merged documents.

    Side effect: sets the class attribute `Entity.equality_operator` to
    `'exact_or_overlapping'`.

    :type dataset: nalaf.structures.data.Dataset
    :param annotators: names of the annotator sub-directories to read
    """
    # iterate over a copy of the keys because documents are deleted on the way
    for doc_id in list(dataset.documents):
        doc = dataset.documents[doc_id]
        annotator_entities = {}

        # find the annotations that are marked complete by any annotator
        filenames = []
        doc_is_read = False
        annotatable_parts = set()
        for annotator in annotators:
            pattern = os.path.join(self.directory, annotator, '*{}*.ann.json'.format(doc_id))
            # either once or zero times
            for filename in glob.glob(pattern):
                with open(filename, 'r', encoding='utf-8') as file:
                    ann_json = json.load(file)
                if ann_json['anncomplete'] or not self.delete_incomplete_docs:
                    doc_is_read = True
                    filenames.append(filename)
                    annotatable_parts |= set(ann_json['annotatable']['parts'])
                    annotator_entities[annotator] = ann_json['entities']

        if self.filter_below_iaa_threshold and not self.__is_acceptable(doc_id, doc, filenames):
            del dataset.documents[doc_id]
            continue

        # if there is at least one set of annotations
        if annotator_entities:
            Entity.equality_operator = 'exact_or_overlapping'
            if self.entity_strategy == 'priority':
                ranked = [annotator_entities[x] for x in self.priority if x in annotator_entities]
                # reduce() raises TypeError on an empty sequence: when none of
                # the prioritized annotators annotated this document there is
                # simply nothing to merge.
                merged = reduce(self.__merge_priority, ranked) if ranked else []
            else:
                merged = reduce(self.__merge_pair, annotator_entities.values())

            for entity in merged:
                try:
                    part = doc.parts[entity['part']]
                except KeyError:
                    # TODO: Remove once the tagtog bug is fixed
                    # (note: this stops processing the remaining entities of the document)
                    break
                if not self.read_only_class_id or entity['classId'] == self.read_only_class_id:
                    annotation = Entity(
                        entity['classId'],
                        entity['offsets'][0]['start'],
                        entity['offsets'][0]['text'])
                    if self.is_predicted:
                        part.predicted_annotations.append(annotation)
                    else:
                        part.annotations.append(annotation)

            # delete parts that are not annotatable
            part_ids_to_del = [part_id for part_id in doc.parts if part_id not in annotatable_parts]
            for part_id in part_ids_to_del:
                del doc.parts[part_id]

        # delete docs with no ann.jsons
        elif not doc_is_read:
            del dataset.documents[doc_id]
        # otherwise: keep the document untouched
def __merge_priority(self, entities_x, entities_y):
    """
    Merge two lists of ann.json entities, preferring `entities_x`.

    Two entities are paired when they belong to the same part and their
    `Entity` representations compare equal (which, depending on
    `Entity.equality_operator`, may include overlaps). Each entity takes part
    in at most one pair, and the pair is represented in the result by the
    entity from `entities_x`, since that list has the higher priority.
    `__append_union` is called on the result before returning.

    :param entities_x: ann.json entity dicts of the higher-priority annotator
    :param entities_y: ann.json entity dicts of the lower-priority annotator
    :return: list of merged ann.json entity dicts
    """
    merged = []
    # sets instead of lists: membership is tested inside the nested loop
    matched_x = set()
    matched_y = set()

    for index_x, entity_x in enumerate(entities_x):
        for index_y, entity_y in enumerate(entities_y):
            if entity_x['part'] != entity_y['part']:
                continue

            ann_x = Entity(entity_x['classId'], entity_x['offsets'][0]['start'], entity_x['offsets'][0]['text'])
            ann_y = Entity(entity_y['classId'], entity_y['offsets'][0]['start'], entity_y['offsets'][0]['text'])

            # if they are the same or overlap
            # use the first one since that one has higher priority
            if ann_x == ann_y and index_x not in matched_x and index_y not in matched_y:
                matched_x.add(index_x)
                matched_y.add(index_y)
                merged.append(entity_x)

    self.__append_union(merged, entities_x, entities_y)
    return merged
def test_DocumentLevelRelationEvaluator_arbitrary_relation_accept_fun_order_does_not_matter(self):
    """
    Every entity is mapped to the constant "SAME" and relations are accepted
    on plain equality, so r1 and r2 (same entities, swapped texts) must be
    accepted in either order, and evaluating r1 against itself yields one TP.
    """
    def entity_map_fun(e):
        return "SAME"

    def relation_accept_fun(gold, pred):
        print('gold:', gold, ' <---> ', 'pred:', pred)
        return gold == pred

    r1 = Relation(STUB_R_ID_1, Entity(STUB_E_ID_1, 0, "yin"), Entity(STUB_E_ID_2, 0, "yan"))
    r2 = Relation(STUB_R_ID_1, Entity(STUB_E_ID_1, 0, "yan"), Entity(STUB_E_ID_2, 0, "yin"))

    for gold_rel, pred_rel in ((r1, r1), (r1, r2), (r2, r1)):
        accepted = relation_accept_fun(gold_rel.map(entity_map_fun), pred_rel.map(entity_map_fun))
        self.assertTrue(accepted)

    evaluator = DocumentLevelRelationEvaluator(STUB_R_ID_1, entity_map_fun, relation_accept_fun)

    dataset, part = self._create_basic_dataset()

    # -

    part.relations = [r1]
    part.predicted_relations = [r1]

    evals = evaluator.evaluate(dataset)
    evaluation = evals(STUB_R_ID_1)
    print(evaluation)
    for counter, expected in (('tp', 1), ('fn', 0), ('fp', 0)):
        self.assertEqual(getattr(evaluation, counter), expected)

    computation = evals(STUB_R_ID_1).compute(strictness="exact")
    self.assertEqual(computation.f_measure, 1.0)
def test_DocumentLevelRelationEvaluator_parts_irrelevant(self):
    """
    The gold relation lives in one part and the (reversed) predicted relation
    in another part of the same document; the document-level evaluator is
    expected to count them as one true positive.
    """
    evaluator = DocumentLevelRelationEvaluator(rel_type=STUB_R_ID_1)

    # one document holding two parts
    dataset = Dataset()
    doc_1 = Document()
    part_1 = Part('_irrelevant_ PART *1*')
    part_2 = Part('_irrelevant_ PART *2*')
    dataset.documents['doc_1'] = doc_1
    doc_1.parts['part_1'] = part_1
    doc_1.parts['part_2'] = part_2

    gold = Relation(STUB_R_ID_1, Entity(STUB_E_ID_1, 0, "TOOL"), Entity(STUB_E_ID_2, 0, "maynard"))
    part_1.relations = [gold]

    # -

    predicted = Relation(STUB_R_ID_1, Entity(STUB_E_ID_2, 0, "maynard"), Entity(STUB_E_ID_1, 0, "TOOL"))
    part_2.predicted_relations = [predicted]

    self._apply_pipeline(dataset)

    # ---

    evals = evaluator.evaluate(dataset)
    evaluation = evals(STUB_R_ID_1)
    for counter, expected in (('tp', 1), ('fn', 0), ('fp', 0)):
        self.assertEqual(getattr(evaluation, counter), expected)

    computation = evals(STUB_R_ID_1).compute(strictness="exact")
    self.assertEqual(computation.f_measure, 1.0)
def annotate(self, dataset):
    """
    Add the annotations of every `*.ann` file in `self.directory` to `dataset`.

    The file's base name (without `.ann`) is used as the document id. Only
    rows whose first column starts with `T` are read. Offsets in the files
    span title and abstract together, so an annotation that does not fit
    inside the title is assigned to the abstract and shifted by
    `len(title) + 1`.

    `SNP` and `RS` rows become `self.mut_class_id` entities; `Gene` rows
    become `self.gene_class_id` entities when that class id is not None.

    :type dataset: nalaf.structures.data.Dataset
    :raises KeyError: if a file's document id is not in the dataset
    """
    for filename in glob.glob(str(self.directory + "/*.ann")):
        with open(filename, 'r', encoding='utf-8') as file:
            reader = csv.reader(file, delimiter='\t')
            pmid = os.path.basename(filename).replace('.ann', '')
            document = dataset.documents[pmid]

            for row in reader:
                # csv.reader yields [] for blank lines
                if not row or not row[0].startswith('T'):
                    continue

                entity_type, start, end = row[1].split()
                start = int(start)
                end = int(end)

                title_len = len(document.parts['title'].text)
                if 0 <= start < end <= title_len:
                    part = document.parts['title']
                else:
                    part = document.parts['abstract']
                    # +1 for the separator between title and abstract
                    start -= title_len + 1
                    end -= title_len + 1

                if entity_type in ('SNP', 'RS'):
                    part.annotations.append(Entity(self.mut_class_id, start, row[2]))
                elif self.gene_class_id is not None and entity_type == 'Gene':
                    # was `self.gene_clas_id` (typo), which raised AttributeError
                    part.annotations.append(Entity(self.gene_class_id, start, row[2]))
def setUpClass(cls):
    """
    Build a one-part dataset of four sentences with five entities (three in
    sentence 1, one each in sentences 2 and 3, none in sentence 4), then
    split and tokenize it.
    """
    cls.dataset = Dataset()
    cls.doc = Document()
    cls.dataset.documents['testid'] = cls.doc

    part1 = Part('Sentence 1: e_1_yolo may be related to e_2_tool plus hey, e_2_coco. Sentence 2: e_1_nin. Sentence 3: e_2_musk. Sentence 4: nothing')

    # (class id, offset, text)
    entity_specs = (
        # Sent 1
        (STUB_ENTITY_CLASS_ID_1, 12, 'e_1_yolo'),
        (STUB_ENTITY_CLASS_ID_2, 39, 'e_2_tool'),
        (STUB_ENTITY_CLASS_ID_2, 58, 'e_2_coco'),
        # Sent 2
        (STUB_ENTITY_CLASS_ID_1, 80, 'e_1_nin'),
        # Sent 3
        (STUB_ENTITY_CLASS_ID_2, 101, 'e_2_musk'),
        # Sent 4: no entities
    )
    entities = [
        Entity(class_id=class_id, offset=offset, text=text, confidence=0)
        for class_id, offset, text in entity_specs
    ]
    for entity in entities:
        part1.annotations.append(entity)

    cls.doc.parts['s1h1'] = part1

    cls.splitter = NLTKSplitter()
    cls.tokenizer = NLTK_TOKENIZER

    cls.splitter.split(cls.dataset)
    cls.tokenizer.tokenize(cls.dataset)

    sentences = list(cls.dataset.sentences())
    assert 4 == len(sentences), str(sentences)
def __append_union(self, merged, entities_x, entities_y):
    """
    When `self.strategy` is `'union'`, extend `merged` in place with every
    entity of `entities_x` and `entities_y` whose `Entity` representation is
    not contained in the entities that were in `merged` on entry.

    For any other strategy `merged` is left untouched.
    """
    if self.strategy != 'union':
        return

    def to_entity(raw):
        first_offset = raw['offsets'][0]
        return Entity(raw['classId'], first_offset['start'], first_offset['text'])

    # snapshot of what was merged so far; it is not updated while appending
    already_merged = [to_entity(raw) for raw in merged]

    for raw in chain(entities_x, entities_y):
        if to_entity(raw) not in already_merged:
            merged.append(raw)
def setUp(self):
    """
    Read a short text containing two mutation mentions, split and tokenize
    it, and annotate both mentions on the first part.
    """
    reader = StringReader('some text ... (c.2708_2711delTTAG, p.V903GfsX905) ... text')
    self.dataset = reader.read()

    NLTKSplitter().split(self.dataset)
    TmVarTokenizer().tokenize(self.dataset)

    first_part = list(self.dataset.parts())[0]
    for offset, mention in ((15, 'c.2708_2711delTTAG'), (35, 'p.V903GfsX905')):
        first_part.annotations.append(Entity(STUB_ENTITY_CLASS_ID, offset, mention))
def annotate(self, dataset):
    """
    Add the `mutation` annotations of every `*.ann` file in `self.directory`
    to `dataset`.

    File names have the form `<docid>-<partid>.ann`; the annotations go to
    that part of that document. Only rows whose first column starts with `T`
    and whose type is `mutation` are used. Entities are appended to
    `predicted_annotations` when `self.is_predicted`, otherwise to
    `annotations`.

    :type dataset: nalaf.structures.data.Dataset
    """
    for filename in glob.glob(str(self.directory + "/*.ann")):
        with open(filename, 'r', encoding='utf-8') as file:
            reader = csv.reader(file, delimiter='\t')
            docid, partid = os.path.basename(filename).replace('.ann', '').split('-', 1)

            for row in reader:
                # csv.reader yields [] for blank lines
                if not row or not row[0].startswith('T'):
                    continue

                # the end offset is not needed: Entity takes start + text
                entity_type, start, _ = row[1].split()
                if entity_type != 'mutation':
                    continue

                ann = Entity(self.entity_class_id, int(start), row[2])
                part = dataset.documents[docid].parts[partid]
                if self.is_predicted:
                    part.predicted_annotations.append(ann)
                else:
                    part.annotations.append(ann)
def read(self):
    """
    Build a dataset from the tab-separated `self.corpus_file`.

    Columns 0-3 are read as document id, entity type, start and end, and
    column 7 as the entity text. Rows of type `Mutation` or
    `AminoacidResidue` each produce one part (keyed `<type>|<start>|<end>`)
    whose text is the entity text and which carries one entity; rows of any
    other type are ignored.

    :returns: nalaf.structures.data.Dataset
    """
    dataset = Dataset()

    with open(self.corpus_file, encoding='utf-8') as file:
        for row in file:
            columns = row.split("\t")
            docid, typ = columns[0], columns[1]
            start, end = columns[2], columns[3]
            entity_text = columns[7]

            if typ == 'Mutation':
                class_id = self.mut_class_id
            elif typ == 'AminoacidResidue':
                class_id = self.residue_class_id
            else:
                class_id = None

            if not class_id:
                continue

            document = dataset.documents.get(docid, Document())

            part = Part(entity_text)
            document.parts['|'.join((typ, start, end))] = part
            part.annotations.append(Entity(class_id, int(start), entity_text))

            dataset.documents[docid] = document

    return dataset
def _parse_pubtator(doc_id, doc, response_text):
    """
    Parse a PubTator-format response and add the predictions to `doc`.

    Nothing is done unless the response has at least two lines and the
    document has exactly two parts, which are taken, in order, as title and
    abstract. The first two lines must be the `<doc_id>|t|...` and
    `<doc_id>|a|...` lines; every following line must consist of six
    tab-separated fields, of which fields 2 and 3 are the start and end
    offsets. Offsets that do not fit inside the title are assigned to the
    abstract and shifted by `len(title) + 1`. They are then mapped onto the
    part's own text with `TmVarTagger._adjust_offsets`.

    :param doc_id: id of the document, as it appears in the response
    :param doc: document whose parts receive the predicted annotations
    :param response_text: raw response in PubTator format
    """
    lines = response_text.strip().splitlines()
    if len(lines) < 2 or len(doc.parts) != 2:
        return

    # raw strings: '\|' is not a valid string escape; the id is escaped so it
    # is always matched literally
    doc_id_pattern = re.escape(str(doc_id))
    tm_var_title = re.search(r'{}\|t\|(.*)'.format(doc_id_pattern), lines[0]).group(1)
    tm_var_abstract = re.search(r'{}\|a\|(.*)'.format(doc_id_pattern), lines[1]).group(1)

    parts = iter(doc.parts.values())
    title = next(parts)
    abstract = next(parts)

    for line in lines[2:]:
        _, start, end, _, _, _ = line.split('\t')
        start = int(start)
        end = int(end)

        if 0 <= start < end <= len(tm_var_title):
            part = title
            tm_part = tm_var_title
        else:
            part = abstract
            tm_part = tm_var_abstract
            # offsets span title and abstract together (+1 for the separator)
            start -= len(tm_var_title) + 1
            end -= len(tm_var_title) + 1

        start, end = TmVarTagger._adjust_offsets(part.text, tm_part, start, end)
        part.predicted_annotations.append(Entity(MUT_CLASS_ID, start, part.text[start:end]))
def setUpClass(cls):
    """
    Build a dataset with one document and one part holding a single entity
    ('random sentence' at offset 10), run it through splitting, tokenizing
    and spacy parsing, and keep the first sentence for the tests.
    """
    cls.dataset = Dataset()
    cls.doc = Document()
    cls.dataset.documents['testid'] = cls.doc

    cls.part = Part('Here is a random sentence for the benefit of your mamma')
    cls.entity = Entity(
        class_id=STUB_ENTITY_CLASS_ID,
        offset=10,
        text='random sentence',
        confidence=0,
    )
    cls.part.annotations.append(cls.entity)
    cls.doc.parts['s1h1'] = cls.part

    # Apply through pipeline
    splitter = NLTKSplitter()
    splitter.split(cls.dataset)
    NLTK_TOKENIZER.tokenize(cls.dataset)

    cls.parser = SpacyParser(get_spacy_nlp_english(load_parser=True))
    cls.parser.parse(cls.dataset)
    # cls.part.percolate_tokens_to_entities()

    cls.sentence = cls.part.sentences[0]
def process(self, dataset, class_id=MUT_CLASS_ID):
    """
    Post-process the predicted annotations of every part in `dataset`.

    Per part:

    1. `__fix_issues` is applied.
    2. Every match of every regex in `self.patterns` is added as a predicted
       annotation unless an equal-or-overlapping one exists already; an
       overlapping annotation with a shorter text is replaced by the match.
    3. Annotations are dropped when their text matches one of
       `self.negative_patterns`, when they are silent (unless
       `self.keep_silent`), or when they are not numbered (unless
       `self.keep_unnumbered`).

    Finally every predicted annotation is checked against the part text at
    its offset and surrounding spaces are trimmed from it.

    Side effect: leaves the class attribute `Entity.equality_operator` set
    to `'overlapping'` whenever at least one pattern matched.

    :type dataset: nalaf.structures.data.Dataset
    :param class_id: class id given to the annotations created from matches
    """
    for _, doc in dataset.documents.items():
        for _, part in doc.parts.items():
            self.__fix_issues(part)

            for regex in self.patterns:
                for match in regex.finditer(part.text):
                    start = match.start()
                    matched_text = part.text[start:match.end()]
                    ann = Entity(class_id, start, matched_text)

                    Entity.equality_operator = 'exact_or_overlapping'
                    if ann not in part.predicted_annotations:
                        part.predicted_annotations.append(Entity(class_id, start, matched_text))

                    Entity.equality_operator = 'overlapping'
                    if ann in part.predicted_annotations:
                        for index, ann_b in enumerate(part.predicted_annotations):
                            # a longer match wins over the overlapping shorter annotation
                            if ann == ann_b and len(matched_text) > len(ann_b.text):
                                part.predicted_annotations[index] = ann

            # single pass instead of collecting indices in a list and testing
            # `index not in to_delete` for every annotation
            kept = []
            for ann in part.predicted_annotations:
                discard = (
                    any(r.search(ann.text) for r in self.negative_patterns)
                    or (not self.keep_silent and self.__is_silent(ann))
                    or (not self.keep_unnumbered and not self._is_numbered(ann))
                )
                if not discard:
                    kept.append(ann)
            part.predicted_annotations = kept

    # sanity check, make sure annotations match their offset
    for part in dataset.parts():
        for ann in part.predicted_annotations:
            assert ann.text == part.text[ann.offset:ann.offset + len(ann.text)]
            # the emptiness guards keep a text made only of spaces from
            # raising IndexError once it has been trimmed away
            while ann.text and ann.text[0] == ' ':
                ann.offset += 1
                ann.text = ann.text[1:]
            while ann.text and ann.text[-1] == ' ':
                ann.text = ann.text[:-1]
def __merge_pair(self, entities_x, entities_y):
    """
    Merge two lists of ann.json entities pairwise.

    Entities of the same part whose `Entity` representations compare equal
    are paired. Of each pair, the entity from `entities_x` is preferred when
    `self.operator(len(text_x), len(text_y))` is true, otherwise the one
    from `entities_y`. When one side of a pair was matched before, the
    preferred entity replaces the previously merged one if `self.operator`
    favours its text length over the stored one. `__append_union` is called
    on the result before returning.

    :return: list of merged ann.json entity dicts
    """
    merged = []
    # index -> (1-based position in `merged`, Entity chosen at first match)
    seen_x = {}
    seen_y = {}

    for pos_x, raw_x in enumerate(entities_x):
        for pos_y, raw_y in enumerate(entities_y):
            # only entities of the same part can be paired
            if not raw_x['part'] == raw_y['part']:
                continue

            ann_x = Entity(raw_x['classId'], raw_x['offsets'][0]['start'], raw_x['offsets'][0]['text'])
            ann_y = Entity(raw_y['classId'], raw_y['offsets'][0]['start'], raw_y['offsets'][0]['text'])

            # only the same or overlapping entities are of interest
            if not ann_x == ann_y:
                continue

            if self.operator(len(ann_x.text), len(ann_y.text)):
                ann_preferred, raw_preferred = ann_x, raw_x
            else:
                ann_preferred, raw_preferred = ann_y, raw_y

            if pos_x not in seen_x and pos_y not in seen_y:
                # neither of them has been matched before
                merged.append(raw_preferred)
                slot = (len(merged), ann_preferred)
                seen_x[pos_x] = slot
                seen_y[pos_y] = slot
            else:
                # one of them was matched before: see whether this match suits better
                if pos_x in seen_x:
                    position, ann_existing = seen_x[pos_x]
                else:
                    position, ann_existing = seen_y[pos_y]

                if self.operator(len(ann_preferred.text), len(ann_existing.text)):
                    merged[position - 1] = raw_preferred

    self.__append_union(merged, entities_x, entities_y)
    return merged
def generate_abstracts(self, list_of_pmids):
    """
    Generates list of documents using pmids and the restapi interface from tmtools.
    Source: "http://www.ncbi.nlm.nih.gov/CBBresearch/Lu/Demo/tmTools/"

    Responses are cached in `cache.json` in the current working directory:
    the cache is loaded if the file exists, only pmids missing from it are
    requested, and the file is rewritten afterwards. Pmids whose response is
    not valid JSON are skipped and produce no document.

    Each document gets a single part `'abstract'` whose annotations are the
    tmVar denotations, with class id `self.mut_class_id`.

    :param list_of_pmids: strings
    :return nalaf.structures.Dataset: dataset
    """
    url_tmvar = 'http://www.ncbi.nlm.nih.gov/CBBresearch/Lu/Demo/RESTful/tmTool.cgi/Mutation/{0}/JSON/'

    # load cache.json if exists
    if os.path.exists('cache.json'):
        with open('cache.json', 'r', encoding='utf-8') as f:
            tm_var = json.load(f)
    else:
        tm_var = {}

    for pmid in list_of_pmids:
        if pmid not in tm_var:  # if pmid was not already downloaded from tmTools
            req = requests.get(url_tmvar.format(pmid))
            try:
                tm_var[pmid] = req.json()
            except ValueError:
                # best effort: a response that is not JSON leaves the pmid out
                pass

    # cache the tmVar annotations so we don't pull them every time
    # (same encoding as used for reading the cache above)
    with open('cache.json', 'w', encoding='utf-8') as file:
        json.dump(tm_var, file, indent=4)

    dataset = Dataset()
    for doc_id in list_of_pmids:
        if doc_id not in tm_var:
            continue

        doc = Document()
        text = tm_var[doc_id]['text']
        part = Part(text)

        # note should the annotations from tmvar go to predicted_annotations or annotations?
        part.annotations = [
            Entity(
                class_id=self.mut_class_id,
                offset=int(deno['span']['begin']),
                text=text[deno['span']['begin']:deno['span']['end']])
            for deno in tm_var[doc_id]['denotations']
        ]

        doc.parts['abstract'] = part
        dataset.documents[doc_id] = doc

    return dataset
def setUpClass(cls):
    """
    Build two datasets for the tests.

    `cls.dataset` has one document ('testid') with two parts:
    's1h1' = '123' with an entity '2' at offset 1 (subclass 0), and
    's2p1' = '45678' with an entity '567' at offset 1 (subclass 2).

    `cls.dataset2` has an empty document 'testid' and a document 'newid'
    with a single part 'someid'.
    """
    cls.dataset = Dataset()
    cls.doc = Document()
    cls.dataset.documents['testid'] = cls.doc

    part1 = Part('123')
    part2 = Part('45678')
    ann1 = Entity(class_id=STUB_ENTITY_CLASS_ID, offset=1, text='2', confidence=0)
    ann2 = Entity(class_id=STUB_ENTITY_CLASS_ID, offset=1, text='567', confidence=1)
    ann1.subclass = 0
    ann2.subclass = 2
    part1.annotations.append(ann1)
    part2.annotations.append(ann2)
    cls.doc.parts['s1h1'] = part1
    cls.doc.parts['s2p1'] = part2

    doc2 = Document()
    # The former chained assignment
    # `doc3 = Document().parts['someid'] = Part(...)` bound doc3 to the Part
    # (and threw the Document away), so a Part was stored as a document.
    doc3 = Document()
    doc3.parts['someid'] = Part('marmor stein und eisen')

    cls.dataset2 = Dataset()
    cls.dataset2.documents['newid'] = doc3
    cls.dataset2.documents['testid'] = doc2
def annotate(self, dataset):
    """
    Add the annotations of every `*.ann` file in `self.directory` to the
    `'abstract'` part of the matching document in `dataset`.

    The file's base name (without `.ann`) is used as the document id. Only
    rows whose first column starts with `T` are read: `SNP` and `RS` rows
    become `MUT_CLASS_ID` entities, `Gene` rows become `self.gene_class_id`
    entities.

    :type dataset: nalaf.structures.data.Dataset
    :raises KeyError: if a file's document id is not in the dataset
    """
    for filename in glob.glob(str(self.directory + "/*.ann")):
        with open(filename, 'r', encoding='utf-8') as file:
            reader = csv.reader(file, delimiter='\t')
            pmid = os.path.basename(filename).replace('.ann', '')
            document = dataset.documents[pmid]

            for row in reader:
                # csv.reader yields [] for blank lines
                if not row or not row[0].startswith('T'):
                    continue

                entity_type, start, _ = row[1].split()
                # the offset was previously passed on as the raw string
                start = int(start)

                if entity_type in ('SNP', 'RS'):
                    ann = Entity(MUT_CLASS_ID, start, row[2])
                    document.parts['abstract'].annotations.append(ann)
                elif entity_type == 'Gene':
                    ann = Entity(self.gene_class_id, start, row[2])
                    document.parts['abstract'].annotations.append(ann)
def test_DocumentLevelRelationEvaluator_default_entities_case_irrelevant(self):
    """
    With the default evaluator settings: no prediction gives no TP, the
    identical prediction gives one TP, and so does the same prediction with
    the entity texts in a different letter case.
    """
    evaluator = DocumentLevelRelationEvaluator(rel_type=STUB_R_ID_1)

    dataset = Dataset()
    doc_1 = Document()
    part_1 = Part('_irrelevant_')
    dataset.documents['doc_1'] = doc_1
    doc_1.parts['part_1'] = part_1

    part_1.relations = [
        Relation(STUB_R_ID_1, Entity(STUB_E_ID_1, 0, "TOOL"), Entity(STUB_E_ID_2, 0, "maynard")),
    ]

    def check(expected_tp, expected_f_measure):
        evals = evaluator.evaluate(dataset)
        evaluation = evals(STUB_R_ID_1)
        self.assertEqual(evaluation.tp, expected_tp)
        computation = evals(STUB_R_ID_1).compute(strictness="exact")
        self.assertEqual(computation.f_measure, expected_f_measure)

    # - nothing predicted

    part_1.predicted_relations = []
    self._apply_pipeline(dataset)
    check(0, 0.0)

    # --- same texts as the gold relation

    part_1.predicted_relations = [
        Relation(STUB_R_ID_1, Entity(STUB_E_ID_1, 0, "TOOL"), Entity(STUB_E_ID_2, 0, "maynard")),
    ]
    check(1, 1.0)

    # - same texts, different case

    part_1.predicted_relations = [
        Relation(STUB_R_ID_1, Entity(STUB_E_ID_1, 0, "tool"), Entity(STUB_E_ID_2, 0, "MAYNARD")),
    ]
    check(1, 1.0)
def test_DocumentLevelRelationEvaluator_arbitrary_relation_accept_fun_ignore_some_predictions(self):
    """
    The accept function returns True (accept), None (ignore) or False
    (reject). Ignored predictions must count neither as TP nor as FP: with
    one accepted, one rejected and five ignored predictions the expected
    counts are tp=1, fn=0, fp=1.
    """
    entity_map_fun = (lambda e: e.text)

    def relation_accept_fun(gold, pred):
        # Sanity check only: both mapped relations must end in a digit
        # (raises ValueError otherwise). The comparison below is done on the
        # whole mapped values.
        int(gold[-1])
        int(pred[-1])

        print('gold:', gold, ' <---> ', 'pred:', pred,)

        if gold == pred:
            # 1 == 1
            return True
        elif gold < pred:
            # 1 < 2
            return None
        else:
            # 1 !<= 0
            return False

    r1 = Relation(STUB_R_ID_1, Entity(STUB_E_ID_1, 0, "xxx"), Entity(STUB_E_ID_2, 0, "1"))

    r2 = Relation(STUB_R_ID_1, Entity(STUB_E_ID_1, 0, "xxx"), Entity(STUB_E_ID_2, 0, "1"))  # Accept
    r3 = Relation(STUB_R_ID_1, Entity(STUB_E_ID_1, 0, "xxx"), Entity(STUB_E_ID_2, 0, "2"))  # Ignore
    r4 = Relation(STUB_R_ID_1, Entity(STUB_E_ID_1, 0, "xxx"), Entity(STUB_E_ID_2, 0, "0"))  # Reject

    self.assertEqual(True, relation_accept_fun(r1.map(entity_map_fun), r2.map(entity_map_fun)))
    self.assertEqual(None, relation_accept_fun(r1.map(entity_map_fun), r3.map(entity_map_fun)))
    self.assertEqual(False, relation_accept_fun(r1.map(entity_map_fun), r4.map(entity_map_fun)))

    evaluator = DocumentLevelRelationEvaluator(STUB_R_ID_1, entity_map_fun, relation_accept_fun)

    (dataset, part) = self._create_basic_dataset()

    # -

    part.relations = [r1]
    part.predicted_relations = [r2, r4] + [r3, r3, r3, r3, r3]  # All the r3's should be ignored

    evals = evaluator.evaluate(dataset)
    evaluation = evals(STUB_R_ID_1)
    print(evaluation)
    self.assertEqual(evaluation.tp, 1)
    self.assertEqual(evaluation.fn, 0)
    self.assertEqual(evaluation.fp, 1)
    computation = evals(STUB_R_ID_1).compute(strictness="exact")
    self.assertEqual(computation.f_measure, 0.6666666666666666)
def test_overlapping(self):
    """
    Under the 'exact_or_overlapping' equality operator, two entities of the
    same class that start at the same offset compare equal even though one
    text is a prefix of the other.
    """
    longer = Entity(class_id="e_x", offset=987, text="PKB/Akt")
    shorter = Entity(class_id="e_x", offset=987, text="PKB")

    Entity.equality_operator = 'exact_or_overlapping'

    for entity in (longer, shorter):
        print(entity.offset, entity.end_offset())

    self.assertEqual(longer, shorter)
def _parse_json(doc_id, doc, response_text):
    """
    Parse a JSON response and add its denotations to `doc` as predicted
    annotations.

    The response is a list of objects, each with a `sourceid` (the part id),
    the `text` the offsets refer to, and `denotations` with `span.begin` /
    `span.end`. Offsets are mapped onto the part's own text with
    `TmVarTagger._adjust_offsets`.

    Any exception is reported on stdout together with the response text and
    then re-raised.
    """
    try:
        predicted_parts = json.loads(response_text, strict=False)
        for predicted_part in predicted_parts:
            part = doc.parts[predicted_part['sourceid']]

            for denotation in predicted_part['denotations']:
                begin = denotation['span']['begin']
                finish = denotation['span']['end']
                begin, finish = TmVarTagger._adjust_offsets(
                    part.text, predicted_part['text'], begin, finish)
                part.predicted_annotations.append(
                    Entity(MUT_CLASS_ID, begin, part.text[begin:finish]))
    except Exception:
        print("ERROR PARSING JSON", response_text)
        raise
def read(self):
    """
    Build a dataset from `self.corpus_file`, a corpus in PubTator format.

    Documents are separated by blank lines. The first two lines of a
    document are `<id>|t|<title>` and `<id>|a|<abstract>`; every following
    line must consist of six tab-separated fields, of which fields 2 and 3
    are the start and end offsets of an annotation. Offsets span title and
    abstract together, so an annotation that does not fit inside the title
    is assigned to the abstract and shifted by `len(title) + 1`.

    Each document gets the parts `'title'` and `'abstract'`; annotations get
    the class id `self.mut_class_id`.

    :returns: nalaf.structures.data.Dataset
    """
    dataset = Dataset()

    with open(self.corpus_file, encoding='utf-8') as file:
        documents = file.read().strip().split('\n\n')

    for document_text in documents:
        lines = document_text.strip().splitlines()
        if not lines:
            # empty corpus file or stray blank block
            continue

        # raw strings: '\d' and '\|' are not valid string escapes
        first_line = re.search(r'(\d+)\|t\|(.*)', lines[0])
        doc_id = first_line.group(1)
        tmvar_title = first_line.group(2)
        tmvar_abstract = re.search(r'(\d+)\|a\|(.*)', lines[1]).group(2)

        document = Document()
        title = Part(tmvar_title)
        abstract = Part(tmvar_abstract)
        document.parts['title'] = title
        document.parts['abstract'] = abstract

        for line in lines[2:]:
            _, start, end, _, _, _ = line.split('\t')
            start = int(start)
            end = int(end)

            if 0 <= start < end <= len(tmvar_title):
                part = title
            else:
                part = abstract
                # +1 for the separator between title and abstract
                start -= len(tmvar_title) + 1
                end -= len(tmvar_title) + 1

            part.annotations.append(Entity(self.mut_class_id, start, part.text[start:end]))

        dataset.documents[doc_id] = document

    return dataset
def _get_test_data(self, entity_sentence, assumed_tokens_words=None):
    """
    Create a parsed one-sentence dataset in which the whole sentence is a
    single entity.

    Asserts that the text is split into exactly one sentence and that its
    token words equal `assumed_tokens_words` (default: the sentence split on
    single spaces). Token depths are computed, every sentence root is
    checked to have depth 0, and entity head tokens are set.

    :return: tuple (dataset, sentence, entity, roots)
    """
    expected_words = assumed_tokens_words
    if expected_words is None:
        expected_words = entity_sentence.split(' ')

    # Create dataset
    dataset = StringReader(entity_sentence).read()
    part = next(dataset.parts())
    entity = Entity(class_id=STUB_ENTITY_CLASS_ID, offset=0, text=entity_sentence)
    part.annotations.append(entity)

    # Apply through pipeline
    splitter = NLTKSplitter()
    splitter.split(dataset)
    NLTK_TOKENIZER.tokenize(dataset)
    self.parser.parse(dataset)

    # Rest
    assert len(part.sentences) == 1
    sentence = part.sentences[0]

    assert len(expected_words) == len(sentence)
    for expected_word, token in zip(expected_words, sentence):
        assert expected_word == token.word

    part.compute_tokens_depth()
    roots = Part.get_sentence_roots(sentence)
    for root in roots:
        self._assert_depth_eq(root, 0)

    part.set_entities_head_tokens()

    return (dataset, sentence, entity, roots)
def read(self):
    """
    Read every `<docid>-<partid_prefix>.txt` file in `self.directory`
    together with its `.ann` file and return the resulting dataset.

    Note that the text files may contain multiple paragraphs (separated by
    blank lines). The reader converts these paragraphs into different parts,
    named `<partid_prefix>-p<n>`. Because of necessary offset corrections,
    the reader reads at the same time both the content and the annotations:
    annotation offsets refer to the raw file, so the start offset of every
    paragraph is tracked (including the `** IGNORE LINE **` lines that are
    removed from the text) and subtracted from the annotation offsets.

    Parts are flagged `is_abstract` when the part id prefix contains
    'Abstract'. `mutation` and `gene` annotations become entities with
    `self.mut_class_id` / `self.gene_class_id`. A mismatch between the
    annotated text and the text found at the computed offsets is printed.

    :returns structures.data.Dataset
    """
    dataset = Dataset()
    file_list = glob.glob(str(self.directory + "/*.txt"))

    for file_path in file_list:
        file_name = os.path.basename(file_path)

        # partid_prefix is not complete due to multiple part creation for a single .txt file
        docid, partid_prefix = file_name.replace('.txt', '').split('-', 1)
        is_abstract = 'Abstract' in partid_prefix

        with open(file_path, encoding='utf-8') as file:
            text_raw = file.read()

        text = text_raw.replace('** IGNORE LINE **\n', '')
        paragraph_list = text.split('\n\n')

        # initial offset for raw_text: 18 == len('** IGNORE LINE **\n')
        tot_offset = text_raw.count('** IGNORE LINE **\n') * 18
        offsets = [tot_offset]

        for i, text_part in enumerate(paragraph_list):
            # skip empty text (usually the last one, due to splitting on "\n\n")
            if text_part != "":
                partid = "{}-p{}".format(partid_prefix, i + 1)

                if docid in dataset:
                    dataset.documents[docid].parts[partid] = Part(text_part, is_abstract=is_abstract)
                else:
                    document = Document()
                    document.parts[partid] = Part(text_part, is_abstract=is_abstract)
                    dataset.documents[docid] = document

                # add offset for next paragraph (+2 for the "\n\n" separator)
                tot_offset += len(text_part) + 2
                offsets.append(tot_offset)

        # delete the last element: it is the offset after the last paragraph
        del offsets[-1]

        # annotations
        with open(file_path.replace('.txt', '.ann'), encoding='utf-8') as f:
            reader = csv.reader(f, delimiter='\t')
            for row in reader:
                # csv.reader yields [] for blank lines
                if not row or not row[0].startswith('T'):
                    continue

                entity_type, start, end = row[1].split()
                start = int(start)
                end = int(end)
                text = row[2]

                # first paragraph whose successor starts after `start`,
                # otherwise the last paragraph
                part_index = next(
                    (i for i in range(len(offsets) - 1) if offsets[i + 1] > start),
                    len(offsets) - 1)

                partid = "{}-p{}".format(partid_prefix, part_index + 1)
                real_start = start - offsets[part_index]
                real_end = end - offsets[part_index]

                # Look the part up through the dataset: the local `document`
                # is only (re)bound when a new document is created, so it is
                # unbound or stale when `docid` was already in the dataset.
                part = dataset.documents[docid].parts[partid]

                calc_ann_text = part.text[real_start:real_end]
                if calc_ann_text != text:
                    print(" ERROR", docid, part_index, partid, start, offsets, real_start, "\n\t", text, "\n\t", calc_ann_text, "\n\t", part.text)

                if entity_type == 'mutation':
                    part.annotations.append(Entity(self.mut_class_id, real_start, text))
                elif entity_type == 'gene':
                    part.annotations.append(Entity(self.gene_class_id, real_start, text))

    return dataset
def tag(self, dataset, annotated=False, uniprot=False, process_only_abstract=True):
    """
    Tag the genes of every document with GNormPlus.

    The first two parts of each document are taken as title and abstract.
    GNormPlus offsets span title and abstract together, so annotations that
    do not fit inside the title go to the abstract, shifted by
    `len(title) + 1`; offsets are further corrected where the GNormPlus text
    differs from the part text. Each annotation gets the class id
    `self.predicts_classes[0]`, confidence 0.5, and as norms the gene id
    (plus the Uniprot id when `uniprot` is set and a mapping exists).

    Nothing is done for a document when `process_only_abstract` is false.

    :type dataset: nalaf.structures.data.Dataset
    :param annotated: if True then saved into annotations otherwise into predicted_annotations
    :param uniprot: if True, map the gene ids to Uniprot ids as well
    """
    with GNormPlus() as gnorm:
        for doc_id, doc in dataset.documents.items():
            if not process_only_abstract:
                # todo this is not used for now anywhere, might need to be re-worked or excluded
                # genes = gnorm.get_genes_for_text(part.text)
                continue

            genes, gnorm_title, gnorm_abstract = gnorm.get_genes_for_pmid(doc_id, postproc=True)

            genes_mapping = {}
            if uniprot:
                with Uniprot() as uprot:
                    list_of_ids = gnorm.uniquify_genes(genes)
                    genes_mapping = uprot.get_uniprotid_for_entrez_geneid(list_of_ids)

            # find the title and the abstract
            doc_parts = iter(doc.parts.values())
            title = next(doc_parts)
            abstract = next(doc_parts)

            adjustment_offsets = []
            if title.text != gnorm_title:
                adjustment_offsets += self.__find_offset_adjustments(title.text, gnorm_title, 0)
            if abstract.text != gnorm_abstract:
                adjustment_offsets += self.__find_offset_adjustments(
                    abstract.text, gnorm_abstract, len(gnorm_title))

            for start, end, text, gene_id in genes:
                in_title = 0 <= start < end <= len(title.text)
                if in_title:
                    part = title
                else:
                    part = abstract
                    # we have to readjust the offset since GnormPlus provides
                    # offsets for title and abstract together
                    shift = len(title.text) + 1
                    start -= shift
                    end -= shift

                for adjustment_offset, adjustment in adjustment_offsets:
                    if start > adjustment_offset:
                        start -= adjustment

                # discussion which confidence value for gnormplus because there is no value supplied
                ann = Entity(
                    class_id=self.predicts_classes[0],
                    offset=start,
                    text=text,
                    confidence=0.5)

                try:
                    norm_dict = {
                        self.predicts_classes[1]: gene_id,
                        self.predicts_classes[2]: genes_mapping[gene_id],
                    }
                except KeyError:
                    # no Uniprot mapping for this gene id
                    norm_dict = {self.predicts_classes[1]: gene_id}

                ann.norms = norm_dict
                ann.normalized_text = ''  # todo normalized_text (stemming ... ?)

                target = part.annotations if annotated else part.predicted_annotations
                target.append(ann)
def read(self):
    """
    Build a dataset from the `*.txt` files in `self.path` and their
    `<file>.txt.ann` XML annotation files.

    Line 0 of a text file is the pmid (files whose first line is not an
    integer are skipped), line 2 the title and line 4 the body. Annotation
    spans (`start..end`) refer to the whole file, so they are shifted by the
    start offset of the title or of the body. Annotations of type `ge` are
    skipped. Every other annotation becomes an entity with class id
    `self.mut_class_id` on the part `'title'` or `'abstract'`.

    An annotation whose end is directly followed (`end + 1`) by the start of
    the next one is chained with it.

    :returns: nalaf.structures.data.Dataset
    """
    dataset = Dataset()

    for filename in glob.glob(self.path + '/*.txt'):
        with open(filename, 'r') as f:
            data = f.read()
        content = data.split("\n")

        try:
            pmid = int(content[0])
        except ValueError:
            continue

        doc = Document()

        title = content[2]
        part_title = Part(title, is_abstract=True)
        body = content[4]
        part_abstract = Part(body, is_abstract=True)

        title_offset = len(str(pmid)) + 2  # +2 for twice newline
        body_offset = title_offset + len(title) + 2  # +2 for twice newline

        def add_entity(offset, text, in_body):
            # append a mutation entity to the abstract (body) or to the title
            target = part_abstract if in_body else part_title
            target.annotations.append(Entity(self.mut_class_id, offset, text))

        # chain under construction: [norm_start, norm_end, text, in_body]
        current_annotation = []
        # every element is processed one iteration late, once its successor is known
        last_element = None

        with open(filename + '.ann', 'r') as fa:
            tree = ET.parse(fa)

        for element in tree.iterfind('Annotation/Annotation[@type]'):
            # if gene annotation skip
            if element.attrib['type'] == 'ge':
                continue

            # first annotation of the document: nothing to process yet
            if last_element is None:
                last_element = element
                continue

            span = last_element.attrib['span'].split('..')
            start = int(span[0])
            end = int(span[1])
            text = data[start:end]

            in_body = start >= body_offset
            shift = body_offset if in_body else title_offset
            norm_start = start - shift
            norm_end = end - shift

            next_start = int(element.attrib['span'].split('..')[0])
            # todo bugfix still mistake if space is in between the whole annotation case: "#1632 T"
            if end + 1 == next_start:
                if not current_annotation:
                    # no series of annotations linked yet
                    current_annotation = [norm_start, norm_end, text, in_body]
                else:
                    current_annotation[1] = norm_end
                    current_annotation[2] += text
            else:
                if current_annotation:
                    # NOTE(review): the chain is flushed without `text` of the
                    # element that closes it, which is then added on its own;
                    # the end-of-file flush below does include it. Confirm
                    # which of the two is intended.
                    add_entity(current_annotation[0], current_annotation[2], current_annotation[3])
                    current_annotation = []
                add_entity(norm_start, text, in_body)

            last_element = element

        # the last element has no successor and is handled here; a file
        # without any non-gene annotation has none (this used to crash with
        # AttributeError on None)
        if last_element is not None:
            span = last_element.attrib['span'].split('..')
            start = int(span[0])
            end = int(span[1])
            text = data[start:end]

            if not current_annotation:
                in_body = start >= body_offset
                norm_start = start - (body_offset if in_body else title_offset)
                add_entity(norm_start, text, in_body)
            else:
                current_annotation[2] += text
                add_entity(current_annotation[0], current_annotation[2], current_annotation[3])

        doc.parts['title'] = part_title
        doc.parts['abstract'] = part_abstract
        dataset.documents[pmid] = doc

    return dataset
def test_DocumentLevelRelationEvaluator_arbitrary_relation_accept_fun_order_matters(self):
    """
    With an accept function that is a strict `gold < pred` comparison, a
    prediction is accepted only when the gold relation sorts before it:
    r1 vs r1 and r2 vs r1 are misses, r1 vs r2 is a hit.
    """
    def entity_map_fun(e):
        return e.text

    def relation_accept_fun(gold, pred):
        print('gold:', gold, ' <---> ', 'pred:', pred)
        return gold < pred

    r1 = Relation(STUB_R_ID_1, Entity(STUB_E_ID_1, 0, "1"), Entity(STUB_E_ID_2, 0, "2"))
    r2 = Relation(STUB_R_ID_1, Entity(STUB_E_ID_1, 0, "2"), Entity(STUB_E_ID_2, 0, "1"))

    def accepts(gold_rel, pred_rel):
        return relation_accept_fun(gold_rel.map(entity_map_fun), pred_rel.map(entity_map_fun))

    # r1 not equiv r1 because this IS NOT equals (r1 not < r1)
    self.assertFalse(accepts(r1, r1))
    # r1 < r2
    self.assertTrue(accepts(r1, r2))
    # r2 not < r1
    self.assertFalse(accepts(r2, r1))

    evaluator = DocumentLevelRelationEvaluator(STUB_R_ID_1, entity_map_fun, relation_accept_fun)

    dataset, part = self._create_basic_dataset()

    def check(gold_rel, pred_rel, tp, fn, fp, f_measure, show):
        part.relations = [gold_rel]
        part.predicted_relations = [pred_rel]

        evals = evaluator.evaluate(dataset)
        evaluation = evals(STUB_R_ID_1)
        if show:
            print(evaluation)
        self.assertEqual(evaluation.tp, tp)
        self.assertEqual(evaluation.fn, fn)
        self.assertEqual(evaluation.fp, fp)
        computation = evals(STUB_R_ID_1).compute(strictness="exact")
        self.assertEqual(computation.f_measure, f_measure)

    # -
    check(r1, r1, tp=0, fn=1, fp=1, f_measure=0.0, show=True)

    # -
    check(r1, r2, tp=1, fn=0, fp=0, f_measure=1.0, show=True)

    # -
    check(r2, r1, tp=0, fn=1, fp=1, f_measure=0.0, show=False)
def read(self):
    """
    Parse the XML corpus at ``self.path`` into a dataset.

    Every ``Article`` element yielded by ``iterfind`` becomes one document, stored
    under the text of its first child (the pmid). Its second and third children
    are flattened into the 'title' and 'abstract' parts. Each nested ``variant``
    element is recorded as an entity of class ``self.mut_class_id`` positioned at
    the offset where its text starts in the flattened part text.

    :returns: nalaf.structures.data.Dataset
    """

    def parse_part(node):
        # Flatten `node` (its leading text plus every child's text and tail) into
        # a Part, annotating each <variant> child at its offset in that text.
        text = node.text or ""
        annotations = []
        for child in node:
            if child.tag == 'variant':
                # The entity starts where the text accumulated so far ends.
                annotations.append(Entity(self.mut_class_id, len(text), child.text))
            # child.text / child.tail are None (not "") when empty
            text += child.text or ""
            text += child.tail or ""

        part = Part(text)
        part.annotations.extend(annotations)
        return part

    dataset = Dataset()

    # Hand the path itself to ElementTree: it then reads the file as bytes and
    # honours the encoding given in the XML declaration, instead of relying on
    # the locale's default text encoding.
    tree = ET.parse(self.path)

    # level document
    for element in tree.iterfind('Article'):
        doc = Document()

        # children are read by position: pmid, title, abstract
        pmid = element[0].text
        doc.parts['title'] = parse_part(element[1])
        doc.parts['abstract'] = parse_part(element[2])

        # save document to dataset
        dataset.documents[pmid] = doc

    return dataset
def __read_annjson(self, reader, filename, dataset):
    """
    Load one ann.json file and attach its annotations to the matching document in `dataset`.

    The document id is the basename of `filename` without the '.ann.json' / '.json'
    suffix; unless ``self.whole_basename_as_docid`` is set, only the text after the
    last '-' is kept.

    If the annotations are not marked complete (and are not predictions) while
    ``self.delete_incomplete_docs`` is set, the document is removed from the dataset.
    Otherwise entities (optionally filtered by ``self.read_only_class_id``) and, when
    ``self.read_relations`` is set, relations are added to the document's parts, and
    parts not listed as annotatable are dropped from the document.

    :param reader: open file-like object holding the ann.json content
    :param filename: path of the ann.json file; only used to derive the document id
    :param dataset: dataset whose documents are modified in place
    :returns: the document id, or None if reading failed and
        ``self.raise_exception_on_incosistencies`` is false
    :raises Exception: any error hit while reading, re-raised only when
        ``self.raise_exception_on_incosistencies`` is true
    """
    try:
        doc_id = os.path.basename(filename).replace('.ann.json', '').replace('.json', '')
        if not self.whole_basename_as_docid and '-' in doc_id:
            doc_id = doc_id.split('-')[-1]

        ann_json = json.load(reader)

        try:
            document = dataset.documents[doc_id]
        except KeyError:
            print_warning("The annjson with docid={} was not in the whole plain dataset.".format(doc_id))
            return doc_id

        if not (ann_json['anncomplete'] or self.is_predicted) and self.delete_incomplete_docs:
            del dataset.documents[doc_id]
            return doc_id

        for e in ann_json['entities']:
            if self.read_only_class_id is None or e['classId'] in self.read_only_class_id:
                part = document.parts[e['part']]

                try:
                    normalizations = {key: obj['source']['id'] for key, obj in e['normalizations'].items()}
                except KeyError:
                    # Use .get() here: the KeyError may be caused by 'normalizations'
                    # itself being absent, and indexing it again would raise from
                    # inside this handler instead of falling back to None.
                    print_warning("The normalization is badly formatted: (docid={}) {}".format(doc_id, str(e.get('normalizations'))))
                    normalizations = None

                entity = Entity(
                    e['classId'],
                    e['offsets'][0]['start'],
                    e['offsets'][0]['text'],
                    e['confidence']['prob'],
                    norms=normalizations)

                if self.is_predicted:
                    part.predicted_annotations.append(entity)
                else:
                    part.annotations.append(entity)

        if self.read_relations:
            for relation in ann_json['relations']:
                # Note: no distinction with predicted_relations yet

                # Each entity reference is split on '|': the first field selects the
                # part, the second field starts with the entity's start offset
                # (the text before the first ',').
                first_ref = relation['entities'][0].split('|')
                part = document.parts[first_ref[0]]

                e1_start = int(first_ref[1].split(',')[0])
                e2_start = int(relation['entities'][1].split('|')[1].split(',')[0])

                rel_id = relation['classId']

                e1 = part.get_entity(e1_start, use_pred=False, raise_exception_on_incosistencies=self.raise_exception_on_incosistencies)
                e2 = part.get_entity(e2_start, use_pred=False, raise_exception_on_incosistencies=self.raise_exception_on_incosistencies)

                # In tolerant mode, skip relations whose entities could not be found
                if not self.raise_exception_on_incosistencies and (e1 is None or e2 is None):
                    continue

                part.relations.append(Relation(rel_id, e1, e2))

        # delete parts that are not annotatable
        annotatable_parts = set(ann_json['annotatable']['parts'])
        # Collect the ids first: the mapping must not change size while iterating it
        part_ids_to_del = [part_id for part_id in document.parts if part_id not in annotatable_parts]
        for part_id in part_ids_to_del:
            del document.parts[part_id]

        return doc_id

    except Exception as err:
        if self.raise_exception_on_incosistencies:
            # bare raise keeps the original traceback untouched
            raise
        # Tolerant mode: give up on this file, but leave a trace of the failure
        print_warning("The annjson file {} could not be read and is ignored: {!r}".format(filename, err))
        return None
def test_DocumentLevelRelationEvaluator_arbitrary_relation_accept_fun_dont_count_multiple_same_hits(self):
    # With the accept function below a single gold relation can accept several
    # different predictions. The expected counts at the end check how the
    # evaluator counts such repeated hits.

    entity_map_fun = (lambda e: e.text)

    def relation_accept_fun(gold, pred):
        print('gold:', gold, ' <---> ', 'pred:', pred)
        # Only the last character of each mapped relation is compared, as a number
        gold_num = int(gold[-1])
        pred_num = int(pred[-1])
        # Accept predictions at most 2 above gold, e.g. gold 1 accepts 1, 2, 3
        return gold_num <= pred_num and (pred_num - gold_num) < 3

    # Gold relations
    r1 = Relation(STUB_R_ID_1, Entity(STUB_E_ID_1, 0, "xxx"), Entity(STUB_E_ID_2, 0, "1"))
    r5 = Relation(STUB_R_ID_1, Entity(STUB_E_ID_1, 0, "xxx"), Entity(STUB_E_ID_2, 0, "9"))  # Missing == fn
    r6 = Relation(STUB_R_ID_1, Entity(STUB_E_ID_1, 0, "xxx"), Entity(STUB_E_ID_2, 0, "5"))
    r8 = Relation(STUB_R_ID_1, Entity(STUB_E_ID_1, 0, "xxx"), Entity(STUB_E_ID_2, 0, "2"))  # (maps to 1) Own repetition in gold, so 1 should be counted twice

    # Predicted relations
    r2 = Relation(STUB_R_ID_1, Entity(STUB_E_ID_1, 0, "xxx"), Entity(STUB_E_ID_2, 0, "1"))  # Accept 1 --> do count == tp
    r3 = Relation(STUB_R_ID_1, Entity(STUB_E_ID_1, 0, "xxx"), Entity(STUB_E_ID_2, 0, "2"))  # repeated Accept 1,2 --> do count because of own repetition in gold == tp
    r4 = Relation(STUB_R_ID_1, Entity(STUB_E_ID_1, 0, "xxx"), Entity(STUB_E_ID_2, 0, "3"))  # repeated Accept 1,2 --> do not count because it's over repetition
    r7 = Relation(STUB_R_ID_1, Entity(STUB_E_ID_1, 0, "xxx"), Entity(STUB_E_ID_2, 0, "6"))  # Accept 5 --> do count == tp
    r9 = Relation(STUB_R_ID_1, Entity(STUB_E_ID_1, 0, "xxx"), Entity(STUB_E_ID_2, 0, "5"))  # Accept 5 --> do not count because it's over repetition

    # Sanity checks on the accept function itself: (gold, pred, expected result)
    expected_acceptance = [
        (r1, r2, True),
        (r1, r3, True),
        (r1, r4, True),
        (r1, r7, False),

        (r5, r2, False),
        (r5, r3, False),
        (r5, r4, False),
        (r5, r7, False),

        (r6, r7, True),

        (r8, r2, False),
        (r8, r3, True),
        (r8, r4, True),
        (r8, r7, False),
    ]
    for gold_relation, pred_relation, expected in expected_acceptance:
        gold_mapped = gold_relation.map(entity_map_fun)
        pred_mapped = pred_relation.map(entity_map_fun)
        self.assertEqual(
            expected,
            relation_accept_fun(gold_mapped, pred_mapped),
            "gold: {} <---> pred: {}".format(gold_mapped, pred_mapped))

    evaluator = DocumentLevelRelationEvaluator(STUB_R_ID_1, entity_map_fun, relation_accept_fun)

    (dataset, part) = self._create_basic_dataset()

    # -

    part.relations = [r1, r5, r6, r8]
    part.predicted_relations = [r2, r3, r4, r7, r9]  # Only one should be accepted

    evals = evaluator.evaluate(dataset)
    evaluation = evals(STUB_R_ID_1)
    print(evaluation)
    self.assertEqual(evaluation.tp, 3, evaluation)
    self.assertEqual(evaluation.fn, 1)
    self.assertEqual(evaluation.fp, 0)
    computation = evals(STUB_R_ID_1).compute(strictness="exact")
    self.assertEqual(computation.f_measure, 0.8571428571428571)