from typing import List, Tuple

from torch import nn

# Helpers and globals referenced below (HealthRecord, NerExample, get_chunks,
# convert_examples_to_features, align_predictions, NerTestDataset, bilstm_learn,
# the biobert_ner_* objects and the *_SEQ_LEN constants) are module-level
# definitions in the source project and are assumed to be in scope here.


def get_bilstm_ner_predictions(
        test_ehr: HealthRecord) -> List[Tuple[str, int, int]]:
    """
    Get predictions for a single EHR record using BiLSTM

    Parameters
    ----------
    test_ehr : HealthRecord
        The EHR record. This object should have a tokenizer set.

    Returns
    -------
    pred_entities : List[Tuple[str, int, int]]
        List of predicted entities, each in the format
        ("entity", start_idx, end_idx).

    """
    split_points = test_ehr.get_split_points(max_len=BILSTM_NER_SEQ_LEN)
    examples = []

    for idx in range(len(split_points) - 1):
        words = test_ehr.tokens[split_points[idx]:split_points[idx + 1]]
        examples.append(words)

    predictions = bilstm_learn.predict(examples)

    pred_entities = []
    for idx in range(len(split_points) - 1):
        # Convert each chunk's predicted tag sequence into entity spans, then map
        # chunk-relative token indices back to character offsets in the record
        chunk_pred = get_chunks(predictions[idx])
        for ent in chunk_pred:
            pred_entities.append(
                (ent[0], test_ehr.get_char_idx(split_points[idx] + ent[1])[0],
                 test_ehr.get_char_idx(split_points[idx] + ent[2])[1]))

    return pred_entities
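

# `get_chunks` is used above but not shown in this listing. A helper with that
# role typically collapses an IOB tag sequence into (entity_type, start, end)
# chunks. The sketch below is an illustrative assumption (inclusive end index,
# simple B-/I-/O handling), not the project's actual implementation.
def get_chunks_sketch(tags: List[str]) -> List[Tuple[str, int, int]]:
    chunks = []
    start, ent_type = None, None
    for i, tag in enumerate(tags):
        if tag == "O" or tag.startswith("B-"):
            # An "O" tag or a new "B-" tag closes any open chunk
            if ent_type is not None:
                chunks.append((ent_type, start, i - 1))
                start, ent_type = None, None
            if tag.startswith("B-"):
                start, ent_type = i, tag.split("-", 1)[1]
        elif tag.startswith("I-") and ent_type is None:
            # Tolerate a chunk that starts with "I-" (common in noisy predictions)
            start, ent_type = i, tag.split("-", 1)[1]
    if ent_type is not None:
        chunks.append((ent_type, start, len(tags) - 1))
    return chunks


# Example:
#   get_chunks_sketch(["B-Drug", "I-Drug", "O", "B-ADE"])
#   -> [("Drug", 0, 1), ("ADE", 3, 3)]
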
def get_biobert_ner_predictions(test_ehr: HealthRecord) -> List[Tuple[str, int, int]]:
    """
    Get predictions for a single EHR record using BioBERT

    Parameters
    ----------
    test_ehr : HealthRecord
        The EHR record. This object should have a tokenizer set.

    Returns
    -------
    pred_entities : List[Tuple[str, int, int]]
        List of predicted entities, each in the format
        ("entity", start_idx, end_idx).

    """
    split_points = test_ehr.get_split_points(max_len=BIOBERT_SEQ_LEN - 2)
    examples = []

    for idx in range(len(split_points) - 1):
        words = test_ehr.tokens[split_points[idx]:split_points[idx + 1]]
        # Labels are dummy "O" tags: only the words matter at prediction time
        examples.append(NerExample(guid=str(split_points[idx]),
                                   words=words,
                                   labels=["O"] * len(words)))

    input_features = convert_examples_to_features(
        examples,
        biobert_ner_labels,
        max_seq_length=BIOBERT_SEQ_LEN,
        tokenizer=biobert_ner_tokenizer,
        cls_token_at_end=False,
        cls_token=biobert_ner_tokenizer.cls_token,
        cls_token_segment_id=0,
        sep_token=biobert_ner_tokenizer.sep_token,
        sep_token_extra=False,
        pad_on_left=bool(biobert_ner_tokenizer.padding_side == "left"),
        pad_token=biobert_ner_tokenizer.pad_token_id,
        pad_token_segment_id=biobert_ner_tokenizer.pad_token_type_id,
        pad_token_label_id=nn.CrossEntropyLoss().ignore_index)

    predictions, _, _ = biobert_ner_trainer.predict(input_features)
    predictions = align_predictions(predictions)

    pred_entities = []
    for idx in range(len(split_points) - 1):
        chunk_pred = get_chunks(predictions[idx])
        for ent in chunk_pred:
            pred_entities.append((ent[0],
                                  test_ehr.get_char_idx(split_points[idx] + ent[1] - 1)[0],
                                  test_ehr.get_char_idx(split_points[idx] + ent[2] - 1)[1]))

    return pred_entities
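

# `align_predictions` is referenced above but not shown in this listing. Judging
# by its use, it is assumed to turn the raw logits returned by the trainer
# (num_examples x seq_len x num_labels) into per-example lists of label strings,
# dropping padded/special positions whose label id equals the ignore index. The
# sketch below follows the token-classification utilities shipped with older
# transformers examples; the two call sites in this listing pass one or two
# arguments, so the project's own helper likely differs in detail.
import numpy as np


def align_predictions_sketch(predictions: np.ndarray,
                             label_ids: np.ndarray,
                             label_map: dict,
                             ignore_index: int = -100) -> List[List[str]]:
    preds = np.argmax(predictions, axis=2)
    batch_size, seq_len = preds.shape
    preds_list: List[List[str]] = [[] for _ in range(batch_size)]
    for i in range(batch_size):
        for j in range(seq_len):
            # Keep only positions that carried a real (non-ignored) label
            if label_ids[i, j] != ignore_index:
                preds_list[i].append(label_map[preds[i, j]])
    return preds_list
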
def get_biobert_ner_predictions(
        test_ehr: HealthRecord) -> List[Tuple[str, int, int]]:
    """
    Get predictions for a single EHR record using BioBERT

    Parameters
    ----------
    test_ehr : HealthRecord
        The EHR record. This object should have a tokenizer set.

    Returns
    -------
    pred_entities : List[Tuple[str, int, int]]
        List of predicted entities, each in the format
        ("entity", start_idx, end_idx).

    """
    split_points = test_ehr.get_split_points(max_len=BIOBERT_NER_SEQ_LEN - 2)
    examples = []

    for idx in range(len(split_points) - 1):
        words = test_ehr.tokens[split_points[idx]:split_points[idx + 1]]
        examples.append(
            NerExample(guid=str(split_points[idx]),
                       words=words,
                       labels=["O"] * len(words)))

    input_features = convert_examples_to_features(
        examples,
        biobert_ner_labels,
        max_seq_length=BIOBERT_NER_SEQ_LEN,
        tokenizer=biobert_ner_tokenizer,
        cls_token_at_end=False,
        cls_token=biobert_ner_tokenizer.cls_token,
        cls_token_segment_id=0,
        sep_token=biobert_ner_tokenizer.sep_token,
        sep_token_extra=False,
        pad_on_left=bool(biobert_ner_tokenizer.padding_side == "left"),
        pad_token=biobert_ner_tokenizer.pad_token_id,
        pad_token_segment_id=biobert_ner_tokenizer.pad_token_type_id,
        pad_token_label_id=nn.CrossEntropyLoss().ignore_index,
        verbose=0)

    test_dataset = NerTestDataset(input_features)

    predictions, label_ids, _ = biobert_ner_trainer.predict(test_dataset)
    predictions = align_predictions(predictions, label_ids)

    # Flatten the prediction list
    predictions = [p for ex in predictions for p in ex]

    input_tokens = test_ehr.get_tokens()
    prev_pred = ""
    final_predictions = []
    idx = 0

    # Expand word-level predictions to WordPiece sub-tokens ("##..."), so that
    # indices line up with test_ehr's token list; sub-tokens inside an entity
    # continue it with an "I-" tag
    for token in input_tokens:
        if token.startswith("##"):
            if prev_pred == "O":
                final_predictions.append(prev_pred)
            else:
                pred_typ = prev_pred.split("-")[-1]
                final_predictions.append("I-" + pred_typ)
        else:
            prev_pred = predictions[idx]
            final_predictions.append(prev_pred)
            idx += 1

    pred_entities = []
    chunk_pred = get_chunks(final_predictions)
    for ent in chunk_pred:
        pred_entities.append((ent[0], test_ehr.get_char_idx(ent[1])[0],
                              test_ehr.get_char_idx(ent[2])[1]))

    return pred_entities
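

# `NerTestDataset` (used in the variant above) is not shown in this listing. It
# is assumed to be a thin torch Dataset wrapping the InputFeatures produced by
# convert_examples_to_features so the transformers Trainer can batch them at
# prediction time. The class below is an illustrative stand-in, not the
# project's actual implementation.
from torch.utils.data import Dataset


class NerTestDatasetSketch(Dataset):
    def __init__(self, features):
        self.features = features

    def __len__(self):
        return len(self.features)

    def __getitem__(self, i):
        # Trainer.predict expects one InputFeatures-like item per index
        return self.features[i]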