def get_re_predictions(test_ehr: HealthRecord) -> HealthRecord:
    """
    Get predictions for Relation Extraction.

    Parameters
    -----------
    test_ehr : HealthRecord
        A HealthRecord object with entities set.

    Returns
    --------
    HealthRecord
        The original object with relations set.
    """
    test_dataset = RETestDataset(test_ehr, biobert_ner_tokenizer,
                                 BIOBERT_RE_SEQ_LEN, re_label_list)

    if len(test_dataset) == 0:
        test_ehr.relations = []
        return test_ehr

    re_predictions = biobert_re_trainer.predict(
        test_dataset=test_dataset).predictions
    re_predictions = np.argmax(re_predictions, axis=1)

    idx = 1
    rel_preds = []
    for relation, pred in zip(test_dataset.relation_list, re_predictions):
        if pred == 1:
            relation.ann_id = "R%d" % idx
            idx += 1
            rel_preds.append(relation)

    test_ehr.relations = rel_preds
    return test_ehr
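
A minimal usage sketch (variable names are illustrative): the record is assumed to already have entities set, e.g. by get_ner_predictions below, and the module-level biobert_re_trainer is assumed to be loaded.

# Sketch only: `record` is a hypothetical HealthRecord with entities already set.
record = get_re_predictions(record)
print("Predicted relations:", len(record.relations))
for rel in record.relations:
    print(rel.ann_id, rel)  # relation ids R1, R2, ... assigned in prediction order
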
def get_bilstm_ner_predictions(
        test_ehr: HealthRecord) -> List[Tuple[str, int, int]]:
    """
    Get predictions for a single EHR record using BiLSTM

    Parameters
    ----------
    test_ehr : HealthRecord
        The EHR record; this object should have a tokenizer set.

    Returns
    -------
    pred_entities : List[Tuple[str, int, int]]
        List of predicted Entities each with the format
        ("entity", start_idx, end_idx).

    """
    split_points = test_ehr.get_split_points(max_len=BILSTM_NER_SEQ_LEN)
    examples = []

    for idx in range(len(split_points) - 1):
        words = test_ehr.tokens[split_points[idx]:split_points[idx + 1]]
        examples.append(words)

    predictions = bilstm_learn.predict(examples)

    pred_entities = []
    for idx in range(len(split_points) - 1):
        chunk_pred = get_chunks(predictions[idx])
        for ent in chunk_pred:
            pred_entities.append(
                (ent[0], test_ehr.get_char_idx(split_points[idx] + ent[1])[0],
                 test_ehr.get_char_idx(split_points[idx] + ent[2])[1]))

    return pred_entities
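
A minimal usage sketch for the BiLSTM path, assuming scispacy_plus_tokenizer and bilstm_learn are already initialised; sample_text is a hypothetical raw EHR string.

# Sketch only: build a HealthRecord with a non-BERT tokenizer and run BiLSTM NER.
bilstm_record = HealthRecord(text=sample_text,
                             tokenizer=scispacy_plus_tokenizer,
                             is_bert_tokenizer=False,
                             is_training=False)
for label, start, end in get_bilstm_ner_predictions(bilstm_record):
    print(label, bilstm_record.text[start:end])
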
def get_ner_predictions(ehr_record: str,
                        model_name: str = "biobert",
                        record_id: str = "1") -> HealthRecord:
    """
    Get predictions for NER using either BioBERT or BiLSTM

    Parameters
    --------------
    ehr_record : str
        An EHR record in text format.

    model_name : str
        The model to use for prediction. The default is 'biobert'.

    record_id : str
        The record id of the returned object. The default is '1'.

    Returns
    -----------
    HealthRecord
        A HealthRecord object with entities set.
    """
    if model_name.lower() == "biobert":
        test_ehr = HealthRecord(record_id=record_id,
                                text=ehr_record,
                                tokenizer=biobert_ner_tokenizer.tokenize,
                                is_bert_tokenizer=True,
                                is_training=False)

        predictions = get_biobert_ner_predictions(test_ehr)

    elif model_name.lower() == "bilstm":
        test_ehr = HealthRecord(record_id=record_id,
                                text=ehr_record,
                                tokenizer=scispacy_plus_tokenizer,
                                is_bert_tokenizer=False,
                                is_training=False)
        predictions = get_bilstm_ner_predictions(test_ehr)

    else:
        raise AttributeError("Accepted model names include 'biobert' "
                             "and 'bilstm'.")

    ent_preds = []
    for i, pred in enumerate(predictions):
        ent = Entity("T%d" % i, label_ent_map[pred[0]], [pred[1], pred[2]])
        ent_text = test_ehr.text[ent[0]:ent[1]]

        if not any(letter.isalnum() for letter in ent_text):
            continue

        ent.set_text(ent_text)
        ent_preds.append(ent)

    test_ehr.entities = ent_preds
    return test_ehr
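
A hedged end-to-end sketch chaining the NER and RE steps on a raw EHR text; the file name and record id are illustrative, not from the original code.

# Sketch only: run NER first so entities are set, then predict relations.
with open("sample_ehr.txt") as f:  # hypothetical input file
    raw_text = f.read()

ehr = get_ner_predictions(raw_text, model_name="biobert", record_id="101")
ehr = get_re_predictions(ehr)

for ent in ehr.entities:
    print(ent)  # predicted entities T0, T1, ...
for rel in ehr.relations:
    print(rel)  # predicted relations R1, R2, ...
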
Example #4
def get_biobert_ner_predictions(test_ehr: HealthRecord) -> List[Tuple[str, int, int]]:
    """
    Get predictions for a single EHR record using BioBERT

    Parameters
    ----------
    test_ehr : HealthRecord
        The EHR record; this object should have a tokenizer set.

    Returns
    -------
    pred_entities : List[Tuple[str, int, int]]
        List of predicted Entities each with the format
        ("entity", start_idx, end_idx).

    """
    split_points = test_ehr.get_split_points(max_len=BIOBERT_SEQ_LEN - 2)
    examples = []

    for idx in range(len(split_points) - 1):
        words = test_ehr.tokens[split_points[idx]:split_points[idx + 1]]
        examples.append(NerExample(guid=str(split_points[idx]),
                                   words=words,
                                   labels=["O"] * len(words)))

    input_features = convert_examples_to_features(
        examples,
        biobert_ner_labels,
        max_seq_length=BIOBERT_SEQ_LEN,
        tokenizer=biobert_ner_tokenizer,
        cls_token_at_end=False,
        cls_token=biobert_ner_tokenizer.cls_token,
        cls_token_segment_id=0,
        sep_token=biobert_ner_tokenizer.sep_token,
        sep_token_extra=False,
        pad_on_left=bool(biobert_ner_tokenizer.padding_side == "left"),
        pad_token=biobert_ner_tokenizer.pad_token_id,
        pad_token_segment_id=biobert_ner_tokenizer.pad_token_type_id,
        pad_token_label_id=nn.CrossEntropyLoss().ignore_index)

    predictions, _, _ = biobert_ner_trainer.predict(input_features)
    predictions = align_predictions(predictions)

    pred_entities = []
    for idx in range(len(split_points) - 1):
        chunk_pred = get_chunks(predictions[idx])
        for ent in chunk_pred:
            pred_entities.append((ent[0],
                                  test_ehr.get_char_idx(split_points[idx] + ent[1] - 1)[0],
                                  test_ehr.get_char_idx(split_points[idx] + ent[2] - 1)[1]))

    return pred_entities
def get_biobert_ner_predictions(
        test_ehr: HealthRecord) -> List[Tuple[str, int, int]]:
    """
    Get predictions for a single EHR record using BioBERT

    Parameters
    ----------
    test_ehr : HealthRecord
        The EHR record; this object should have a tokenizer set.

    Returns
    -------
    pred_entities : List[Tuple[str, int, int]]
        List of predicted Entities each with the format
        ("entity", start_idx, end_idx).

    """
    split_points = test_ehr.get_split_points(max_len=BIOBERT_NER_SEQ_LEN - 2)
    examples = []

    for idx in range(len(split_points) - 1):
        words = test_ehr.tokens[split_points[idx]:split_points[idx + 1]]
        examples.append(
            NerExample(guid=str(split_points[idx]),
                       words=words,
                       labels=["O"] * len(words)))

    input_features = convert_examples_to_features(
        examples,
        biobert_ner_labels,
        max_seq_length=BIOBERT_NER_SEQ_LEN,
        tokenizer=biobert_ner_tokenizer,
        cls_token_at_end=False,
        cls_token=biobert_ner_tokenizer.cls_token,
        cls_token_segment_id=0,
        sep_token=biobert_ner_tokenizer.sep_token,
        sep_token_extra=False,
        pad_on_left=bool(biobert_ner_tokenizer.padding_side == "left"),
        pad_token=biobert_ner_tokenizer.pad_token_id,
        pad_token_segment_id=biobert_ner_tokenizer.pad_token_type_id,
        pad_token_label_id=nn.CrossEntropyLoss().ignore_index,
        verbose=0)

    test_dataset = NerTestDataset(input_features)

    predictions, label_ids, _ = biobert_ner_trainer.predict(test_dataset)
    predictions = align_predictions(predictions, label_ids)

    # Flatten the prediction list
    predictions = [p for ex in predictions for p in ex]

    input_tokens = test_ehr.get_tokens()
    prev_pred = ""
    final_predictions = []
    idx = 0

    for token in input_tokens:
        if token.startswith("##"):
            if prev_pred == "O":
                final_predictions.append(prev_pred)
            else:
                pred_typ = prev_pred.split("-")[-1]
                final_predictions.append("I-" + pred_typ)
        else:
            prev_pred = predictions[idx]
            final_predictions.append(prev_pred)
            idx += 1

    pred_entities = []
    chunk_pred = get_chunks(final_predictions)
    for ent in chunk_pred:
        pred_entities.append((ent[0], test_ehr.get_char_idx(ent[1])[0],
                              test_ehr.get_char_idx(ent[2])[1]))

    return pred_entities
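
The WordPiece re-alignment loop above can be illustrated with a toy, self-contained sketch of the same logic; the tokens and tags are made up.

# Toy sketch of the subword re-alignment: word-level predictions are copied onto
# "##" continuation pieces, with B- switched to I- so chunks stay contiguous.
tokens = ["war", "##farin", "daily"]     # WordPiece tokens
word_preds = ["B-Drug", "O"]             # one prediction per non-"##" token
aligned, prev, i = [], "", 0
for tok in tokens:
    if tok.startswith("##"):
        aligned.append(prev if prev == "O" else "I-" + prev.split("-")[-1])
    else:
        prev = word_preds[i]
        aligned.append(prev)
        i += 1
print(aligned)  # ['B-Drug', 'I-Drug', 'O']
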
Example #6
def read_data(data_dir: str = 'data/',
              tokenizer: Callable[[str], List[str]] = None,
              is_bert_tokenizer: bool = True,
              verbose: int = 0) -> Tuple[List[HealthRecord], List[HealthRecord]]:
    """
    Reads train and test data

    Parameters
    ----------
    data_dir : str, optional
        Directory where the data is located.
        It should have directories named 'train' and 'test'.
        The default is 'data/'.

    tokenizer : Callable[[str], List[str]], optional
        The tokenizer function to use. The default is None.

    is_bert_tokenizer : bool
        If the tokenizer is a BERT-based WordPiece tokenizer

    verbose : int, optional
        1 to print reading progress, 0 otherwise. The default is 0.

    Returns
    -------
    Tuple[List[HealthRecord], List[HealthRecord]]
        Train data, Test data.

    """
    train_path = os.path.join(data_dir, "train")
    test_path = os.path.join(data_dir, "test")

    # Get all IDs for train and test data
    train_ids = list(set(['.'.join(fname.split('.')[:-1]) \
                          for fname in os.listdir(train_path) \
                          if not fname.startswith('.')]))

    test_ids = list(set(['.'.join(fname.split('.')[:-1]) \
                         for fname in os.listdir(test_path) \
                         if not fname.startswith('.')]))

    if verbose == 1:
        print("Train data:")

    train_data = []
    for idx, fid in enumerate(train_ids):
        record = HealthRecord(fid, text_path=os.path.join(train_path, fid + '.txt'),
                              ann_path=os.path.join(train_path, fid + '.ann'),
                              tokenizer=tokenizer,
                              is_bert_tokenizer=is_bert_tokenizer)
        train_data.append(record)
        if verbose == 1:
            draw_progress_bar(idx + 1, len(train_ids))

    if verbose == 1:
        print('\n\nTest Data:')

    test_data = []
    for idx, fid in enumerate(test_ids):
        record = HealthRecord(fid, text_path=os.path.join(test_path, fid + '.txt'),
                              ann_path=os.path.join(test_path, fid + '.ann'),
                              tokenizer=tokenizer,
                              is_bert_tokenizer=is_bert_tokenizer)
        test_data.append(record)
        if verbose == 1:
            draw_progress_bar(idx + 1, len(test_ids))

    return train_data, test_data
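
A minimal usage sketch, assuming biobert_ner_tokenizer is a loaded WordPiece tokenizer and the data directory contains 'train' and 'test' sub-directories as described above.

# Sketch only: read pre-split train/test records with a BERT tokenizer.
train_data, test_data = read_data(data_dir="data/",
                                  tokenizer=biobert_ner_tokenizer.tokenize,
                                  is_bert_tokenizer=True,
                                  verbose=1)
print(len(train_data), "train records,", len(test_data), "test records")
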
Example #7
def generate_re_test_file(
        ehr_record: HealthRecord,
        max_len: int = 128) -> Tuple[List[str], List[Relation]]:
    """
    Generates test file for Relation Extraction.

    Parameters
    -----------
    ehr_record : HealthRecord
        The EHR record with entities set.

    max_len : int
        The maximum length of sequence.

    Returns
    --------
    Tuple[List[str], List[Relation]]
        List of sequences with each entity replaced by its tag,
        and a list of Relation objects representing the relations
        in those sequences.
    """
    random.seed(0)

    re_text_list = []
    relation_list = []

    text = ehr_record.text
    entities = ehr_record.get_entities()
    if isinstance(entities, dict):
        entities = list(entities.values())

    # get character split points
    char_split_points = get_char_split_points(ehr_record, max_len)

    start = 0
    end = char_split_points[0]

    for i in range(len(char_split_points)):
        # Obtain only entities within the split text
        range_entities = [ent for ent in entities
                          if int(ent[0]) >= start and int(ent[1]) <= end]

        # Get all possible relations within the split text
        possible_relations = utils.map_entities(range_entities)

        for rel, label in possible_relations:
            split_text = text[start:end]
            split_offset = start

            ent1 = rel.get_entities()[0]
            ent2 = rel.get_entities()[1]

            # Check if both entities are within split text
            if ent1[0] >= start and ent1[1] < end and \
                    ent2[0] >= start and ent2[1] < end:

                modified_text = replace_entity_text(split_text, ent1, ent2,
                                                    split_offset)

                # Replace un-required characters with space
                final_text = modified_text.replace('\n',
                                                   ' ').replace('\t', ' ')

                re_text_list.append(final_text)
                relation_list.append(rel)

        start = end
        if i != len(char_split_points) - 1:
            end = char_split_points[i + 1]
        else:
            end = len(text) + 1

    assert len(re_text_list) == len(relation_list)

    return re_text_list, relation_list
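
A minimal usage sketch (ehr is a hypothetical HealthRecord with entities set); the parallel lists returned here are the kind of input the RETestDataset used at the top of this page is assumed to consume.

# Sketch only: generate candidate-relation sequences for the RE classifier.
texts, candidate_relations = generate_re_test_file(ehr, max_len=128)
assert len(texts) == len(candidate_relations)
for seq, rel in zip(texts[:3], candidate_relations[:3]):
    print(rel, "->", seq[:80])
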
Example #8
def read_data(data_dir: str = 'data/',
              train_ratio: float = 0.8,
              tokenizer: Callable[[str], List[str]] = None,
              verbose: int = 0) -> Tuple[List[HealthRecord], List[HealthRecord]]:
    """
    Reads train and test data

    Parameters
    ----------
    data_dir : str, optional
        Directory where the data is located. The default is 'data/'.

    train_ratio : float, optional
        Fraction of the data to use for training. The default is 0.8.

    tokenizer : Callable[[str], List[str]], optional
        The tokenizer function to use. The default is None.

    verbose : int, optional
        1 to print reading progress, 0 otherwise. The default is 0.

    Returns
    -------
    Tuple[List[HealthRecord], List[HealthRecord]]
        Train data, Test data.

    """
    # Get all the IDs of data
    file_ids = sorted(list(set(['.'.join(fname.split('.')[:-1]) \
                                for fname in os.listdir(data_dir) \
                                if not fname.startswith('.')])))

    # Splitting IDs into random training and test data
    random.seed(0)
    random.shuffle(file_ids)

    split_idx = int(train_ratio * len(file_ids))
    train_ids = file_ids[:split_idx]
    test_ids = file_ids[split_idx:]

    if verbose == 1:
        print("Train data:")

    train_data = []
    for idx, fid in enumerate(train_ids):
        record = HealthRecord(fid, text_path=os.path.join(data_dir, fid + '.txt'),
                              ann_path=os.path.join(data_dir, fid + '.ann'),
                              tokenizer=tokenizer)
        train_data.append(record)
        if verbose == 1:
            draw_progress_bar(idx + 1, split_idx)

    if verbose == 1:
        print('\n\nTest Data:')

    test_data = []
    for idx, fid in enumerate(test_ids):
        record = HealthRecord(fid, text_path=os.path.join(data_dir, fid + '.txt'),
                              ann_path=os.path.join(data_dir, fid + '.ann'),
                              tokenizer=tokenizer)
        test_data.append(record)
        if verbose == 1:
            draw_progress_bar(idx + 1, len(file_ids) - split_idx)

    return train_data, test_data
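
A minimal usage sketch for this variant, which splits a single directory into train and test sets by ratio; scispacy_plus_tokenizer is assumed to be available.

# Sketch only: 80/20 random split of the records in data/.
train_data, test_data = read_data(data_dir="data/",
                                  train_ratio=0.8,
                                  tokenizer=scispacy_plus_tokenizer,
                                  verbose=1)
print(len(train_data), "train records,", len(test_data), "test records")
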