# NOTE: These snippets come from a larger NER project; helpers such as
# text_to_tokens, text_to_sentences, transform_annotated_document_to_bio_format,
# BratInput/BratOutput, CVSplit and the model classes used below are assumed
# to be importable from that project.
import argparse
import copy

import numpy as np
import pandas as pd
import torch
from allennlp.modules.elmo import batch_to_ids
from sklearn.metrics import classification_report, precision_recall_fscore_support


def annotated_docs_to_tokens(docs, sentence_pad=False):
    r"""Align sentence-level tokenization with each document's BIO-format
    token list, optionally wrapping every sentence in <s> ... <\s> padding
    markers. Returns (text_list, label_list, tokens_list), one entry per doc.
    """
    text_list = []
    label_list = []
    tokens_list = []
    for doc in docs:
        if sentence_pad:
            text = [[r'<s>'] + text_to_tokens(sent) + [r'<\s>']
                    for sent in text_to_sentences(doc.plain_text_)[0]
                    if len(sent.split()) > 0]
        else:
            text = [
                text_to_tokens(sent)
                for sent in text_to_sentences(doc.plain_text_)[0]
                if len(sent.split()) > 0
            ]

        text_list.append(text)

        # Record the flat positions of the <s>/<\s> padding markers so that
        # matching filler labels/tokens can be inserted at the same offsets.
        count = 0
        pad_index = []
        for line in text:
            for idx, word in enumerate(line):
                if word == r'<s>' or word == r'<\s>':
                    pad_index.append(count + idx)
            count += len(line)

        tokens, labels = transform_annotated_document_to_bio_format(doc)

        # Re-split words whose tokenization disagrees with the BIO tokens:
        # when a word covers several BIO tokens, replace it with the first
        # one and insert the rest, so both sequences stay index-aligned.
        count = 0
        for i, line in enumerate(text_list[-1]):
            for j, word in enumerate(line):
                if word not in [r'<s>', r'<\s>'] and word != tokens[count]:
                    k = 0
                    if tokens[count] in word:
                        text_list[-1][i][j] = tokens[count + k]
                        k += 1
                    while count + k < len(tokens) and tokens[count + k] in word:
                        # The inserted tokens are revisited by this loop and
                        # consumed one by one via the elif branch below.
                        text_list[-1][i].insert(j + k, tokens[count + k])
                        k += 1
                    count += 1
                elif word not in [r'<s>', r'<\s>']:
                    count += 1

        for i in pad_index:
            labels.insert(i, 'O')
            tokens.insert(i, r'<s>')

        label_list.append(labels)
        tokens_list.append(tokens)

    return text_list, label_list, tokens_list
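
# Hypothetical usage sketch for annotated_docs_to_tokens (assumes `docs` is a
# list of annotated documents exposing `.plain_text_`, as elsewhere in these
# snippets):
#
#     texts, labels, tokens = annotated_docs_to_tokens(docs, sentence_pad=True)
#     texts[0]   # [['<s>', 'First', 'sentence', '.', '<\s>'], ...]
#     labels[0]  # flat BIO labels with 'O' inserted at the padding positions
#     tokens[0]  # flat BIO tokens with '<s>' inserted at the padding positions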


# ---- Example 2 ----
def annotated_docs_to_tokens(docs):
    r"""Tokenize each document into sentences padded with <s> ... <\s>."""
    text_list = []
    for doc in docs:
        text = [[r'<s>'] + text_to_tokens(sent) + [r'<\s>']
                for sent in text_to_sentences(doc.plain_text_)[0]
                if len(sent.split()) > 0]
        text_list.append(text)

    return text_list


def is_well_formed_sentence(line):
    """Heuristically decide whether `line` reads as a well-formed sentence."""
    text = line.strip()
    if not text:
        return False
    if text[0].islower():
        return False
    if not text[0].isalpha():
        return False
    if text[-1].isdigit():
        return False
    if text.startswith("Notes:"):
        return False
    if text[-1] == "." or text[-1] == ":":
        return True
    # Otherwise accept only if a sentence boundary (a period followed by a
    # word starting with an uppercase letter) appears inside the line.
    tokens = text_to_tokens(line)
    for i in range(len(tokens) - 1):
        if tokens[i] == "." and tokens[i + 1][0].isupper():
            return True
    return False
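
# A few illustrative calls (hypothetical inputs; behaviour follows the
# heuristics above):
#
#     is_well_formed_sentence("This is fine.")      # True  (ends with '.')
#     is_well_formed_sentence("lowercase start.")   # False (starts lowercase)
#     is_well_formed_sentence("Notes: see table")   # False ('Notes:' prefix)
#     is_well_formed_sentence("Ends with 2021")     # False (ends in a digit)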


# ---- Example 4 ----
def construct_data(data,
                   annotated_docs,
                   predictions,
                   scope_note,
                   id_dict,
                   ctd_file,
                   c2m_file,
                   use_ELMO=True,
                   elmo_model=None,
                   elmo_dim=1024,
                   device=torch.device('cpu')):
    """ re-format the data in easily trainable format using pytorch generators
    """
    text = []  # sentence
    text_emb = []
    scope = []  # scope note
    m_id = []  # mesh ID
    mask_list = []  # mask list
    label = []  # labels for positive and vegative examples

    toD = Convert2D(ctd_file, c2m_file)
    skipped_id = []
    for idx, pred_doc in enumerate(annotated_docs):
        tags = predictions[idx]

        o_doc = data[idx]

        tokens, bio_labels = transform_annotated_document_to_bio_format(o_doc)

        new_tags = check_tags(bio_labels, tags)
        entity_list = get_normalizations(o_doc, copy.deepcopy(pred_doc))

        masks = get_masks(new_tags, len(entity_list))

        for i in range(len(entity_list)):
            # Keep only the first ID of composite ('+' or '|') annotations,
            # then map any ID missing from id_dict to a D-prefixed MeSH ID.
            if '+' in entity_list[i]:
                entity_list[i] = entity_list[i].split('+')[0]
            elif '|' in entity_list[i]:
                entity_list[i] = entity_list[i].split('|')[0]
            if entity_list[i] not in id_dict:
                item = toD.transform(entity_list[i])
                if item is not None:
                    if item not in id_dict:
                        print(
                            f"D MeSH {item} not found in Disease list. Skipping this normalization..."
                        )
                        skipped_id.append(item)
                        continue
                    entity_list[i] = item
                else:
                    print(
                        f"D MeSH equivalent of {entity_list[i]} not found. Skipping this normalization..."
                    )
                    skipped_id.append(entity_list[i])
                    continue
            note = []
            # Collect: text, scope note, MeSH ID, mask, positive label
            if use_ELMO:
                t = [[r'<s>'] + text_to_tokens(sent) + [r'<\s>']
                     for sent in text_to_sentences(pred_doc.plain_text_)[0]
                     if len(sent.split()) > 0]

                char_id = batch_to_ids(t).to(device)
                with torch.no_grad():
                    elmo_emb = elmo_model(char_id)
                t_emb = elmo_emb['elmo_representations'][0].view(
                    -1, elmo_dim).detach().cpu()
                # Drop all-zero (padding) rows from the flattened embeddings.
                t_emb = torch.stack([
                    tensor for tensor in t_emb
                    if len(np.nonzero(tensor.numpy())[0]) != 0
                ], dim=0)
                text_emb.append(t_emb)
                # append (not extend) so that text/text_emb/scope/mask stay
                # index-aligned when `sample` is assembled below.
                text.append(t)

                note = scope_note[id_dict[entity_list[i]]]
                note = batch_to_ids(note).to(device)
                with torch.no_grad():
                    elmo_emb = elmo_model(note)
                note = elmo_emb['elmo_representations'][0].view(
                    -1, elmo_dim).detach().cpu()
                scope.append(note)
                mask = masks[i].tolist()
                mask = adjust_mask(mask, t, tokens)
                mask_list.append(torch.tensor(mask))

            else:
                t = text_to_tokens(pred_doc.plain_text_)
                text.append(t)
                # No ELMo embeddings in this branch; keep a placeholder so the
                # parallel lists used to build `sample` stay the same length.
                text_emb.append(torch.empty(0))
                # Flatten the scope-note lines, stripping the first and last
                # (padding) tokens of each line.
                for line in scope_note[id_dict[entity_list[i]]]:
                    if len(line) > 1:
                        note.extend(line[1:-1])
                scope.append(note)
                mask = masks[i].tolist()
                mask = adjust_mask(mask, [t], tokens)
                mask_list.append(torch.tensor(mask))

                assert len(t) == len(mask), \
                    'Length of mask is not equal to length of sentence.'

            m_id.append(entity_list[i])
            label.append(1)

    print('Total skipped: ', len(skipped_id), ' unique skips: ',
          len(set(skipped_id)))
    sample = []
    for i in range(len(text)):
        sample.append(
            (text[i], text_emb[i], scope[i], m_id[i], mask_list[i], label[i]))

    return sample, text
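
# Minimal usage sketch for construct_data (assumptions: `elmo_model` is an
# allennlp Elmo instance and the remaining arguments come from the larger
# project; the tuple order follows the `sample` construction above):
#
#     from torch.utils.data import DataLoader
#     sample, sentences = construct_data(data, annotated_docs, predictions,
#                                        scope_note, id_dict, ctd_file,
#                                        c2m_file, elmo_model=elmo)
#     loader = DataLoader(sample, batch_size=1, collate_fn=lambda b: b)
#     for batch in loader:
#         tokens, emb, note_emb, mesh_id, mask, y = batch[0]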
def main():
    parser = argparse.ArgumentParser(description='traditional_models.py')
    parser.add_argument('--path', dest='path', default=None, type=str)
    parser.add_argument('--label', dest='label', default=None, type=str)
    parser.add_argument('--output_dir', dest='output_dir', default=None, type=str)
    parser.add_argument('--model', dest='model', default=None, type=str)
    parser.add_argument('--tokenlevel_file_name',
                        dest='tokenlevel_file_name',
                        default=None,
                        type=str)
    parser.add_argument('--entitylevel_file_name',
                        dest='entitylevel_file_name',
                        default=None,
                        type=str)
    args = parser.parse_args()

    # Import data
    data_path = args.path
    label = args.label
    ann_docs = BratInput(data_path).transform()
    data = retain_annotations(ann_docs, label)
    clean_data = clean_annotated_documents(data)
    non_overlap_data = resolve_overlaps(clean_data)

    # Split all documents collection into sentences
    sent_docs = split_annotated_documents(non_overlap_data)

    # Keep only sentences with fewer than 130 tokens
    short_sentences = []
    for doc in sent_docs:
        tokens = text_to_tokens(doc.plain_text_)
        if len(tokens) < 130:
            short_sentences.append(doc)

    ### Models ###

    # Cross-validation split
    splitter_2 = CVSplit(strategy="random", n_folds=5)
    splits = splitter_2.make_cv_folds(short_sentences)

    train_1 = splits[1] + splits[2] + splits[3] + splits[4]
    test_1 = splits[0]

    train_2 = splits[0] + splits[2] + splits[3] + splits[4]
    test_2 = splits[1]

    train_3 = splits[0] + splits[1] + splits[3] + splits[4]
    test_3 = splits[2]

    train_4 = splits[0] + splits[1] + splits[2] + splits[4]
    test_4 = splits[3]

    train_5 = splits[0] + splits[1] + splits[2] + splits[3]
    test_5 = splits[4]

    ### Save the different splits

    BratOutput("output_dir").transform(train_1)
    BratOutput("output_dir").transform(test_1)

    BratOutput("output_dir").transform(train_2)
    BratOutput("output_dir").transform(test_2)

    BratOutput("output_dir").transform(train_3)
    BratOutput("output_dir").transform(test_3)

    BratOutput("output_dir").transform(train_4)
    BratOutput("output_dir").transform(test_4)

    BratOutput("output_dir").transform(train_5)
    BratOutput("output_dir").transform(test_5)


    def pandas_classification_report(y_true, y_pred):
        metrics_summary = precision_recall_fscore_support(y_true=y_true,
                                                          y_pred=y_pred)

        avg = list(
            precision_recall_fscore_support(y_true=y_true,
                                            y_pred=y_pred,
                                            average='weighted'))

        metrics_sum_index = ['precision', 'recall', 'f1-score', 'support']
        class_report_df = pd.DataFrame(list(metrics_summary),
                                       index=metrics_sum_index)

        support = class_report_df.loc['support']
        total = support.sum()
        avg[-1] = total

        class_report_df['avg / total'] = avg

        return class_report_df.T
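
    # Illustrative call (hypothetical labels): the transposed frame has one
    # row per class plus an 'avg / total' row, with columns
    # precision / recall / f1-score / support:
    #
    #     pandas_classification_report(['B_X', 'O', 'O'], ['B_X', 'B_X', 'O'])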

    ### RUN the models ###
    entity_level_results = []
    token_level_df = pd.DataFrame()

    for idx, split in enumerate(splits):
        test = split
        train_splits = splits[:idx] + splits[idx + 1:]
        train = [item for sublist in train_splits for item in sublist]

        # Train (assumption: the model classes below are importable from the
        # same project and match the names passed via --model)
        if args.model == 'ExactMatchDictionaryNER':
            model = ExactMatchDictionaryNER(entity_labels=label)
            model.fit(train)
        elif args.model == 'BidirectionalLSTM':
            model = BidirectionalLSTM(entity_labels=label)
            model.fit(train)
        elif args.model == 'CRF':
            model = CRF(entity_labels=label)
            model.fit(train, max_iterations=100)
        pred_docs = model.transform(test)

        #Evaluate and store (entity-level evaluation)
        metrics_1fold = []
        p, r, f = annotation_precision_recall_f1score(pred_docs,
                                                      test,
                                                      ann_label=label)
        print(p, r, f)
        metrics_1fold.append(p)
        metrics_1fold.append(r)
        metrics_1fold.append(f)
        entity_level_results.append(metrics_1fold)

        # Convert to X_test, y_test, X_pred, y_pred
        X_test, y_test = transform_annotated_documents_to_bio_format(
            test, entity_labels=label)
        X_pred, y_pred = transform_annotated_documents_to_bio_format(
            pred_docs, entity_labels=label)

        # Truncate each predicted sequence to its gold sequence's length
        label_pred = []
        for i in range(len(y_pred)):
            label_pred.append(y_pred[i][:len(y_test[i])])

        # Flatten the nested lists
        flat_y_test = [item for sublist in y_test for item in sublist]
        flat_y_pred = [item for sublist in label_pred for item in sublist]

        # Print separate reports for B and I tags (token-level evaluation)
        classes = [f'B_{label}', f'I_{label}']
        print(
            classification_report(flat_y_test,
                                  flat_y_pred,
                                  target_names=classes,
                                  digits=4))
        df_class_report = pandas_classification_report(y_true=flat_y_test,
                                                       y_pred=flat_y_pred)
        token_level_df = pd.concat([token_level_df, df_class_report])

    # Save the token-level evaluation report to a CSV file
    token_level_df.to_csv(f'{args.tokenlevel_file_name}.csv', sep=',')
    df = pd.DataFrame(entity_level_results,
                      columns=["Precision", "Recall", "F1 measure"])
    # Save the entity-level evaluation report to a CSV file
    df.to_csv(f'{args.entitylevel_file_name}.csv')
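

# Hypothetical entry point (assumption: the snippet is run as a script).
if __name__ == '__main__':
    main()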