def get_ner_predictions(ehr_record: str, model_name: str = "biobert",
                        record_id: str = "1") -> HealthRecord:
    """
    Get predictions for NER using either BioBERT or BiLSTM.

    Parameters
    --------------
    ehr_record : str
        An EHR record in text format.

    model_name : str
        The model to use for prediction, matched case-insensitively
        against "biobert" and "bilstm". Default is biobert.

    record_id : str
        The record id of the returned object. Default is 1.

    Returns
    -----------
    A HealthRecord object with entities set.

    Raises
    -----------
    AttributeError
        If model_name is neither "biobert" nor "bilstm".
    """
    model = model_name.lower()

    if model == "biobert":
        test_ehr = HealthRecord(record_id=record_id,
                                text=ehr_record,
                                tokenizer=biobert_ner_tokenizer.tokenize,
                                is_bert_tokenizer=True,
                                is_training=False)
        predictions = get_biobert_ner_predictions(test_ehr)

    elif model == "bilstm":
        # record_id is forwarded here as well, so the returned record
        # carries the requested id whichever model is selected.
        test_ehr = HealthRecord(record_id=record_id,
                                text=ehr_record,
                                tokenizer=scispacy_plus_tokenizer,
                                is_bert_tokenizer=False,
                                is_training=False)
        predictions = get_bilstm_ner_predictions(test_ehr)

    else:
        raise AttributeError("Accepted model names include 'biobert' "
                             "and 'bilstm'.")

    ent_preds = []
    # NOTE(review): each prediction is indexed as pred[0] (a key of
    # label_ent_map), pred[1] and pred[2] (the entity range) - confirm
    # against the predictor functions.
    for i, pred in enumerate(predictions):
        ent = Entity("T%d" % i, label_ent_map[pred[0]], [pred[1], pred[2]])
        ent_text = test_ehr.text[ent[0]:ent[1]]

        # Skip spans without a single alphanumeric character. The id is
        # taken from the prediction index, so ids of the kept entities
        # may be non-contiguous.
        if not any(letter.isalnum() for letter in ent_text):
            continue

        ent.set_text(ent_text)
        ent_preds.append(ent)

    test_ehr.entities = ent_preds
    return test_ehr
def _extract_annotations(path: str) \
        -> Tuple[Dict[str, Entity], Dict[str, Relation]]:
    """
    Internal function that extracts entities and relations
    as a dictionary from an annotation file.

    Lines are tab-separated. Lines starting with '#' and empty lines
    are skipped. A line whose first field starts with 'T' is read as an
    entity, one starting with 'R' as a relation; anything else triggers
    a warning and is ignored.

    Parameters
    ----------
    path : str
        Path for the ann file.

    Returns
    -------
    Tuple[Dict[str, Entity], Dict[str, Relation]]
        Entities and relations, each keyed by annotation id.

    Raises
    ------
    AssertionError
        If an entity line does not have exactly 3 non-empty fields or a
        relation line does not have exactly 2.

    KeyError
        If a relation refers to an entity id that never appears in the
        file.
    """
    # The context manager closes the handle even if reading fails.
    with open(path) as f:
        raw_data = f.read().split('\n')

    entities = {}
    relations = {}

    # Relations with entities that haven't been processed yet
    relation_backlog = []

    for raw_line in raw_data:
        if raw_line.startswith('#'):
            continue

        # Split on tabs and remove empty strings from the list
        fields = list(filter(None, raw_line.split('\t')))
        if not fields:
            continue

        ann_id = fields[0]

        if ann_id[0] == 'T':
            assert len(fields) == 3, \
                "Entity annotation must have 3 fields: " + str(fields)

            # The first word is the entity type, the rest holds the
            # character ranges
            entity_type, _, range_text = fields[1].partition(' ')

            # Get all character ranges, separated by ;
            char_ranges = [r.split() for r in range_text.split(';')]

            # Create an Entity object
            ent = Entity(entity_id=ann_id, entity_type=entity_type)

            # A multi-fragment entity is collapsed into one range: start
            # of the first fragment to end of the last fragment.
            r = [char_ranges[0][0], char_ranges[-1][1]]
            ent.set_range(list(map(int, r)))

            ent.set_text(fields[2])
            entities[ann_id] = ent

        elif ann_id[0] == 'R':
            assert len(fields) == 2, \
                "Relation annotation must have 2 fields: " + str(fields)

            # Space-separated: relation type, then two arguments whose
            # entity id is the part after the last ':'
            rel_details = fields[1].split(' ')
            relation_type = rel_details[0]
            entity1 = rel_details[1].split(':')[-1]
            entity2 = rel_details[2].split(':')[-1]

            if entity1 in entities and entity2 in entities:
                relations[ann_id] = Relation(relation_id=ann_id,
                                             relation_type=relation_type,
                                             arg1=entities[entity1],
                                             arg2=entities[entity2])
            else:
                # If the entities aren't processed yet,
                # add them to backlog to process later
                relation_backlog.append(
                    [ann_id, relation_type, entity1, entity2])

        else:
            # If the annotation is not a relation or entity, warn user
            warnings.warn("Invalid annotation encountered: " + str(fields))

    # All entities are known by now; resolve the deferred relations.
    for rel_id, relation_type, entity1, entity2 in relation_backlog:
        relations[rel_id] = Relation(relation_id=rel_id,
                                     relation_type=relation_type,
                                     arg1=entities[entity1],
                                     arg2=entities[entity2])

    return entities, relations
def process_ade_files(ade_data: List[dict]) -> List[dict]:
    """
    Extracts tokens and creates Entity and Relation objects
    from raw json data.

    Entities are numbered by position as 'T1', 'T2', ... and relations
    as 'R1', 'R2', ...; every relation gets the type 'ADE-Drug'. The
    entity type 'Adverse-Effect' is stored as 'ADE'. The input
    dictionaries are not modified.

    Parameters
    ----------
    ade_data : List[dict]
        Raw json data. Each item is read through its 'tokens',
        'entities' (items with 'type', 'start', 'end') and 'relations'
        (items with 'head', 'tail') keys.

    Returns
    -------
    List[dict]
        One dict per input item with the keys "tokens", "entities" and
        "relations"; the latter two are dicts keyed by the generated
        ids.

    Warns
    -----
    UserWarning
        If a relation refers to an entity position that does not exist;
        that relation is left out of the result.
    """
    # Local import so this function does not depend on what the module
    # imports at the top.
    import warnings

    ade_records = []

    for ade in ade_data:
        # Tokens
        tokens = ade['tokens']

        # Entities
        entities = {}
        for e_num, ent in enumerate(ade['entities'], start=1):
            ent_id = 'T' + "%s" % e_num

            # Map the type into a local variable instead of overwriting
            # ent['type'], so the caller's data stays untouched.
            ent_type = ent['type']
            if ent_type == 'Adverse-Effect':
                ent_type = 'ADE'

            ent_obj = Entity(entity_id=ent_id, entity_type=ent_type)

            # 'start'/'end' are used as a token slice below; the stored
            # range is [start, end - 1].
            r = [ent['start'], ent['end'] - 1]
            ent_obj.set_range(list(map(int, r)))

            # Every token is followed by a space, so the text keeps a
            # trailing space exactly as before.
            text = ''.join(
                token + ' ' for token in tokens[ent['start']:ent['end']])
            ent_obj.set_text(text)

            entities[ent_id] = ent_obj

        # Relations
        relations = {}
        rel_details = 'ADE-Drug'
        for r_num, relation in enumerate(ade['relations'], start=1):
            rel_id = 'R' + "%s" % r_num

            # 'head'/'tail' are zero-based entity positions, entity ids
            # are one-based.
            entity1 = "T" + str(relation['head'] + 1)
            entity2 = "T" + str(relation['tail'] + 1)

            if entity1 in entities and entity2 in entities:
                relations[rel_id] = Relation(relation_id=rel_id,
                                             relation_type=rel_details,
                                             arg1=entities[entity1],
                                             arg2=entities[entity2])
            else:
                # All entities of this record are already built, so a
                # missing id can never be resolved later. Report it
                # instead of dropping the relation silently.
                warnings.warn(
                    "Relation %s skipped, unknown entity in (%s, %s)"
                    % (rel_id, entity1, entity2))

        ade_records.append({"tokens": tokens,
                            "entities": entities,
                            "relations": relations})

    return ade_records