def get_bilstm_ner_predictions(test_ehr: HealthRecord) -> List[Tuple[str, int, int]]:
    """
    Get predictions for a single EHR record using BiLSTM.

    Parameters
    ----------
    test_ehr : HealthRecord
        The EHR record; this object should have a tokenizer set.

    Returns
    -------
    pred_entities : List[Tuple[str, int, int]]
        List of predicted entities, each with the format
        ("entity", start_idx, end_idx).
    """
    split_points = test_ehr.get_split_points(max_len=BILSTM_NER_SEQ_LEN)
    examples = []

    # Split the record into chunks of at most BILSTM_NER_SEQ_LEN tokens
    for idx in range(len(split_points) - 1):
        words = test_ehr.tokens[split_points[idx]:split_points[idx + 1]]
        examples.append(words)

    predictions = bilstm_learn.predict(examples)

    # Convert token-level tag sequences into character-level entity spans
    pred_entities = []
    for idx in range(len(split_points) - 1):
        chunk_pred = get_chunks(predictions[idx])
        for ent in chunk_pred:
            pred_entities.append(
                (ent[0],
                 test_ehr.get_char_idx(split_points[idx] + ent[1])[0],
                 test_ehr.get_char_idx(split_points[idx] + ent[2])[1]))

    return pred_entities
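# The prediction functions in this module rely on a get_chunks() helper that
# is defined elsewhere in the project. As a rough, illustrative sketch of the
# behaviour assumed here (not the project's actual implementation), the
# stand-in below converts a BIO tag sequence such as ["B-Drug", "I-Drug", "O"]
# into (entity_type, start_token, end_token) tuples with an inclusive end
# index, which is consistent with how ent[1] and ent[2] are passed to
# get_char_idx() above.
def bio_tags_to_chunks(tags: List[str]) -> List[Tuple[str, int, int]]:
    chunks = []
    chunk_type, chunk_start = None, None

    for i, tag in enumerate(tags):
        if tag == "O" or tag.startswith("B-"):
            # Close any chunk that ended at the previous token
            if chunk_type is not None:
                chunks.append((chunk_type, chunk_start, i - 1))
                chunk_type, chunk_start = None, None
            if tag.startswith("B-"):
                chunk_type, chunk_start = tag.split("-")[-1], i
        elif tag.startswith("I-") and chunk_type is None:
            # Tolerate an I- tag with no preceding B- by opening a chunk
            chunk_type, chunk_start = tag.split("-")[-1], i

    # Close a chunk that runs to the end of the sequence
    if chunk_type is not None:
        chunks.append((chunk_type, chunk_start, len(tags) - 1))

    return chunks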
def get_biobert_ner_predictions(test_ehr: HealthRecord) -> List[Tuple[str, int, int]]:
    """
    Get predictions for a single EHR record using BioBERT.

    Parameters
    ----------
    test_ehr : HealthRecord
        The EHR record; this object should have a tokenizer set.

    Returns
    -------
    pred_entities : List[Tuple[str, int, int]]
        List of predicted entities, each with the format
        ("entity", start_idx, end_idx).
    """
    # Split the record into chunks that fit the model, leaving room
    # for the [CLS] and [SEP] special tokens
    split_points = test_ehr.get_split_points(max_len=BIOBERT_NER_SEQ_LEN - 2)
    examples = []

    for idx in range(len(split_points) - 1):
        words = test_ehr.tokens[split_points[idx]:split_points[idx + 1]]
        examples.append(
            NerExample(guid=str(split_points[idx]),
                       words=words,
                       labels=["O"] * len(words)))

    input_features = convert_examples_to_features(
        examples, biobert_ner_labels,
        max_seq_length=BIOBERT_NER_SEQ_LEN,
        tokenizer=biobert_ner_tokenizer,
        cls_token_at_end=False,
        cls_token=biobert_ner_tokenizer.cls_token,
        cls_token_segment_id=0,
        sep_token=biobert_ner_tokenizer.sep_token,
        sep_token_extra=False,
        pad_on_left=bool(biobert_ner_tokenizer.padding_side == "left"),
        pad_token=biobert_ner_tokenizer.pad_token_id,
        pad_token_segment_id=biobert_ner_tokenizer.pad_token_type_id,
        pad_token_label_id=nn.CrossEntropyLoss().ignore_index,
        verbose=0)

    test_dataset = NerTestDataset(input_features)

    predictions, label_ids, _ = biobert_ner_trainer.predict(test_dataset)
    predictions = align_predictions(predictions, label_ids)

    # Flatten the prediction list
    predictions = [p for ex in predictions for p in ex]

    input_tokens = test_ehr.get_tokens()
    prev_pred = ""
    final_predictions = []
    idx = 0

    # Propagate predictions over WordPiece sub-tokens: a "##" continuation
    # token inherits its head token's label (converted from B- to I-)
    for token in input_tokens:
        if token.startswith("##"):
            if prev_pred == "O":
                final_predictions.append(prev_pred)
            else:
                pred_typ = prev_pred.split("-")[-1]
                final_predictions.append("I-" + pred_typ)
        else:
            prev_pred = predictions[idx]
            final_predictions.append(prev_pred)
            idx += 1

    # Convert the merged tag sequence into character-level entity spans
    pred_entities = []
    chunk_pred = get_chunks(final_predictions)
    for ent in chunk_pred:
        pred_entities.append((ent[0],
                              test_ehr.get_char_idx(ent[1])[0],
                              test_ehr.get_char_idx(ent[2])[1]))

    return pred_entities
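# Hypothetical call site for the function above, for context only. The way the
# HealthRecord is constructed here (the input file name and the text/tokenizer
# keyword arguments) is an assumption for illustration and may not match the
# real class signature.
if __name__ == "__main__":
    with open("sample_ehr.txt") as f:  # hypothetical input file
        record_text = f.read()

    # Assumed constructor arguments -- adjust to the actual HealthRecord API
    sample_ehr = HealthRecord(text=record_text,
                              tokenizer=biobert_ner_tokenizer.tokenize)

    for ent_type, start_idx, end_idx in get_biobert_ner_predictions(sample_ehr):
        print(f"{ent_type}: chars {start_idx}-{end_idx}")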