Example #1
def evaluate(documents, dictionary, dictionary_reverse, model):
    model.eval()

    ct_predicted = 0
    ct_gold = 0
    ct_correct = 0

    for document in documents:

        # copy entities from gold entities
        pred_entities = []
        for gold in document.entities:
            pred = Entity()
            pred.id = gold.id
            pred.type = gold.type
            pred.spans = gold.spans
            pred.section = gold.section
            pred.name = gold.name
            pred_entities.append(pred)

        model.process_one_doc(document, pred_entities, dictionary,
                              dictionary_reverse)

        p1, p2, p3 = evaluate_for_ehr(document.entities, pred_entities,
                                      dictionary)

        ct_gold += p1
        ct_predicted += p2
        ct_correct += p3

    if ct_gold == 0:
        precision = 0
        recall = 0
    else:
        precision = ct_correct * 1.0 / ct_predicted
        recall = ct_correct * 1.0 / ct_gold

    if precision + recall == 0:
        f_measure = 0
    else:
        f_measure = 2 * precision * recall / (precision + recall)

    return precision, recall, f_measure
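
Example #1 guards against an empty gold set but still divides by ct_predicted, and Examples #2 and #5 below compute the same metrics with no guard at all. A minimal, self-contained sketch of the same precision/recall/F1 computation with both guards; the helper name prf1 and the sample counts are ours, not from the source:

def prf1(ct_gold, ct_predicted, ct_correct):
    # Precision/recall/F1 with guards for both empty prediction and empty gold sets.
    precision = ct_correct / ct_predicted if ct_predicted > 0 else 0.0
    recall = ct_correct / ct_gold if ct_gold > 0 else 0.0
    if precision + recall == 0:
        return precision, recall, 0.0
    return precision, recall, 2 * precision * recall / (precision + recall)

# Example: 80 gold entities, 90 predicted, 70 correct -> (0.778, 0.875, 0.824)
print(prf1(80, 90, 70))
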
Example #2
    def metamap_ner_my_norm(d):
        print("load umls ...")

        UMLS_dict, UMLS_dict_reverse = umls.load_umls_MRCONSO(d.config['norm_dict'])

        predict_dir = "/Users/feili/Desktop/umass/CancerADE_SnoM_30Oct2017_test/metamap"
        annotation_dir = os.path.join(opt.test_file, 'bioc')
        corpus_dir = os.path.join(opt.test_file, 'txt')
        annotation_files = [f for f in os.listdir(annotation_dir) if os.path.isfile(os.path.join(annotation_dir, f))]
        if opt.test_in_cpu:
            model = torch.load(os.path.join(opt.output, 'norm_neural.pkl'), map_location='cpu')
        else:
            model = torch.load(os.path.join(opt.output, 'norm_neural.pkl'))
        model.eval()


        ct_norm_predict = 0
        ct_norm_gold = 0
        ct_norm_correct = 0

        correct_counter = Counter()
        wrong_counter = Counter()

        for gold_file_name in annotation_files:
            print("# begin {}".format(gold_file_name))
            gold_document = parse_one_gold_file(annotation_dir, corpus_dir, gold_file_name)

            predict_document = metamap.load_metamap_result_from_file(
                os.path.join(predict_dir, gold_file_name[:gold_file_name.find('.')] + ".field.txt"))

            # copy entities from metamap entities
            pred_entities = []
            for gold in predict_document.entities:
                pred = Entity()
                pred.id = gold.id
                pred.type = gold.type
                pred.spans = gold.spans
                pred.section = gold.section
                pred.name = gold.name
                pred_entities.append(pred)


            model.process_one_doc(gold_document, pred_entities, UMLS_dict, UMLS_dict_reverse)


            p1, p2, p3 = evaluate_for_ehr(gold_document.entities, pred_entities, UMLS_dict,
                                          predict_document.entities, correct_counter, wrong_counter)

            ct_norm_gold += p1
            ct_norm_predict += p2
            ct_norm_correct += p3
        sorted_correct_entities = OrderedDict(correct_counter.most_common())
        sorted_correct_entities = json.dumps(sorted_correct_entities, indent=4)
        with codecs.open("sorted_correct_entities.txt", 'w', 'UTF-8') as fp:
            fp.write(sorted_correct_entities)

        sorted_wrong_entities = OrderedDict(wrong_counter.most_common())
        sorted_wrong_entities = json.dumps(sorted_wrong_entities, indent=4)
        with codecs.open("sorted_wrong_entities.txt", 'w', 'UTF-8') as fp:
            fp.write(sorted_wrong_entities)

        p = ct_norm_correct * 1.0 / ct_norm_predict
        r = ct_norm_correct * 1.0 / ct_norm_gold
        f1 = 2.0 * p * r / (p + r)
        print("NORM p: %.4f | r: %.4f | f1: %.4f" % (p, r, f1))
Example #3
        # skip the first annotation file, then process the remaining ones
        if file_count < 1:
            file_count += 1
            continue

        file_count += 1

        gold_document = parse_one_gold_file(annotation_dir, corpus_dir, gold_file_name)

        pred_entities = []
        for gold in gold_document.entities:
            pred = Entity()
            pred.id = gold.id
            pred.type = gold.type
            pred.spans = gold.spans
            pred.section = gold.section
            pred.name = gold.name
            pred_entities.append(pred)

        Xs, Ys = generate_instances_ehr(pred_entities, model.dict_alphabet, UMLS_dict_reverse)

        data_loader = DataLoader(MyDataset(Xs, Ys), opt.batch_size, shuffle=False, collate_fn=my_collate)
        data_iter = iter(data_loader)
        num_iter = len(data_loader)

        entity_start = 0

        for i in range(num_iter):

            x, mask, sentences, _, lengths = next(data_iter)
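
The fragment above walks a PyTorch DataLoader manually via iter/next. A minimal, runnable sketch of that iteration pattern with toy tensors standing in for the (Xs, Ys) instances; everything here is illustrative, not from the source:

import torch
from torch.utils.data import DataLoader, TensorDataset

# Toy data standing in for the generated instances.
xs = torch.arange(10, dtype=torch.float32).unsqueeze(1)
ys = torch.zeros(10, dtype=torch.long)
loader = DataLoader(TensorDataset(xs, ys), batch_size=4, shuffle=False)

data_iter = iter(loader)
for _ in range(len(loader)):
    x_batch, y_batch = next(data_iter)
    # ... run the model on x_batch and map predictions back onto pred_entities ...
    print(x_batch.shape, y_batch.shape)
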
Example #4
def evaluate(documents, dictionary, dictionary_reverse, vsm_model,
             neural_model, ensemble_model, d, isMeddra_dict):
    if vsm_model is not None:
        vsm_model.eval()

    if neural_model is not None:
        neural_model.eval()

    if ensemble_model is not None:
        ensemble_model.eval()

    ct_predicted = 0
    ct_gold = 0
    ct_correct = 0

    # if opt.norm_rule and opt.norm_vsm and opt.norm_neural:
    #     ct_correct_rule = 0
    #     ct_correct_vsm = 0
    #     ct_correct_neural = 0
    #     ct_correct_all = 0
    #     ct_correct_rule_vsm = 0
    #     ct_correct_rule_neural = 0
    #     ct_correct_vsm_neural = 0

    for document in documents:

        # copy entities from gold entities
        pred_entities = []
        for gold in document.entities:
            pred = Entity()
            pred.id = gold.id
            pred.type = gold.type
            pred.spans = gold.spans
            pred.section = gold.section
            pred.name = gold.name
            pred_entities.append(pred)

        if opt.norm_rule and opt.norm_vsm and opt.norm_neural:
            if opt.ensemble == 'learn':
                ensemble_model.process_one_doc(document, pred_entities,
                                               dictionary, dictionary_reverse,
                                               isMeddra_dict)
            else:
                pred_entities2 = copy.deepcopy(pred_entities)
                pred_entities3 = copy.deepcopy(pred_entities)
                merge_entities = copy.deepcopy(pred_entities)
                multi_sieve.runMultiPassSieve(document, pred_entities,
                                              dictionary, isMeddra_dict)
                vsm_model.process_one_doc(document, pred_entities2, dictionary,
                                          dictionary_reverse, isMeddra_dict)
                neural_model.process_one_doc(document, pred_entities3,
                                             dictionary, dictionary_reverse,
                                             isMeddra_dict)
        elif opt.norm_rule:
            multi_sieve.runMultiPassSieve(document, pred_entities, dictionary,
                                          isMeddra_dict)
        elif opt.norm_vsm:
            vsm_model.process_one_doc(document, pred_entities, dictionary,
                                      dictionary_reverse, isMeddra_dict)
        elif opt.norm_neural:
            neural_model.process_one_doc(document, pred_entities, dictionary,
                                         dictionary_reverse, isMeddra_dict)
        else:
            raise RuntimeError("wrong configuration")

        if opt.norm_rule and opt.norm_vsm and opt.norm_neural:

            # ct_gold += len(document.entities)
            # ct_predicted += len(pred_entities)
            # up bound of ensemble, if at least one system makes a correct prediction, we count it as correct.
            # for idx, gold in enumerate(document.entities):
            # if (pred_entities[idx].rule_id is not None and pred_entities[idx].rule_id in gold.norm_ids)\
            #     and (pred_entities2[idx].vsm_id is not None and pred_entities2[idx].vsm_id in gold.norm_ids) \
            #         and (pred_entities3[idx].neural_id is not None and pred_entities3[idx].neural_id in gold.norm_ids):
            #     ct_correct_all += 1
            #     ct_correct += 1
            #
            # if (pred_entities[idx].rule_id is not None and pred_entities[idx].rule_id in gold.norm_ids)\
            #     and (pred_entities2[idx].vsm_id is None or pred_entities2[idx].vsm_id not in gold.norm_ids) \
            #         and (pred_entities3[idx].neural_id is None or pred_entities3[idx].neural_id not in gold.norm_ids):
            #     ct_correct_rule += 1
            #     ct_correct += 1
            #
            # if (pred_entities[idx].rule_id is None or pred_entities[idx].rule_id not in gold.norm_ids)\
            #     and (pred_entities2[idx].vsm_id is not None and pred_entities2[idx].vsm_id in gold.norm_ids) \
            #         and (pred_entities3[idx].neural_id is None or pred_entities3[idx].neural_id not in gold.norm_ids):
            #     ct_correct_vsm += 1
            #     ct_correct += 1
            #
            # if (pred_entities[idx].rule_id is None or pred_entities[idx].rule_id not in gold.norm_ids)\
            #     and (pred_entities2[idx].vsm_id is None or pred_entities2[idx].vsm_id not in gold.norm_ids) \
            #         and (pred_entities3[idx].neural_id is not None and pred_entities3[idx].neural_id in gold.norm_ids):
            #     ct_correct_neural += 1
            #     ct_correct += 1
            #
            # if (pred_entities[idx].rule_id is not None and pred_entities[idx].rule_id in gold.norm_ids)\
            #     and (pred_entities2[idx].vsm_id is not None and pred_entities2[idx].vsm_id in gold.norm_ids) \
            #         and (pred_entities3[idx].neural_id is None or pred_entities3[idx].neural_id not in gold.norm_ids):
            #     ct_correct_rule_vsm += 1
            #     ct_correct += 1
            #
            # if (pred_entities[idx].rule_id is not None and pred_entities[idx].rule_id in gold.norm_ids)\
            #     and (pred_entities2[idx].vsm_id is None or pred_entities2[idx].vsm_id not in gold.norm_ids) \
            #         and (pred_entities3[idx].neural_id is not None and pred_entities3[idx].neural_id in gold.norm_ids):
            #     ct_correct_rule_neural += 1
            #     ct_correct += 1
            #
            # if (pred_entities[idx].rule_id is None or pred_entities[idx].rule_id not in gold.norm_ids)\
            #     and (pred_entities2[idx].vsm_id is not None and pred_entities2[idx].vsm_id in gold.norm_ids) \
            #         and (pred_entities3[idx].neural_id is not None and pred_entities3[idx].neural_id in gold.norm_ids):
            #     ct_correct_vsm_neural += 1
            #     ct_correct += 1

            if opt.ensemble == 'learn':

                if isMeddra_dict:
                    p1, p2, p3 = evaluate_for_fda(document.entities,
                                                  pred_entities)
                else:
                    p1, p2, p3 = evaluate_for_ehr(document.entities,
                                                  pred_entities, dictionary)

                ct_gold += p1
                ct_predicted += p2
                ct_correct += p3

            else:
                ensemble.merge_result(pred_entities, pred_entities2,
                                      pred_entities3, merge_entities,
                                      dictionary, isMeddra_dict,
                                      vsm_model.dict_alphabet, d)

                if isMeddra_dict:
                    p1, p2, p3 = evaluate_for_fda(document.entities,
                                                  merge_entities)
                else:
                    p1, p2, p3 = evaluate_for_ehr(document.entities,
                                                  merge_entities, dictionary)

                ct_gold += p1
                ct_predicted += p2
                ct_correct += p3

        else:

            if isMeddra_dict:
                p1, p2, p3 = evaluate_for_fda(document.entities, pred_entities)
            else:
                p1, p2, p3 = evaluate_for_ehr(document.entities, pred_entities,
                                              dictionary)

            ct_gold += p1
            ct_predicted += p2
            ct_correct += p3

    # if opt.norm_rule and opt.norm_vsm and opt.norm_neural:
    #     logging.info("ensemble correct. all:{} rule:{} vsm:{} neural:{} rule_vsm:{} rule_neural:{} vsm_neural:{}"
    #                  .format(ct_correct_all, ct_correct_rule, ct_correct_vsm, ct_correct_neural, ct_correct_rule_vsm,
    #                          ct_correct_rule_neural, ct_correct_vsm_neural))
    #
    # logging.info("gold:{} pred:{} correct:{}".format(ct_gold, ct_predicted, ct_correct))

    if ct_gold == 0:
        precision = 0
        recall = 0
    else:
        precision = ct_correct * 1.0 / ct_predicted
        recall = ct_correct * 1.0 / ct_gold

    if precision + recall == 0:
        f_measure = 0
    else:
        f_measure = 2 * precision * recall / (precision + recall)

    return precision, recall, f_measure
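
The branching in Example #4 is driven entirely by the opt flags: all three normalizers plus an ensemble when every flag is set, a single normalizer otherwise, and a RuntimeError when none is set. A small sketch of the flag combinations it accepts; the flag names come from the example, while the argparse-style Namespace construction is only illustrative:

from argparse import Namespace

# All three normalizers plus a learned ensemble:
opt = Namespace(norm_rule=True, norm_vsm=True, norm_neural=True, ensemble='learn')

# A single normalizer, e.g. only the neural one:
opt_neural_only = Namespace(norm_rule=False, norm_vsm=False, norm_neural=True, ensemble=None)

# Setting none of the three flags falls into the RuntimeError("wrong configuration") branch.
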
Example #5
def metamap_ner_my_norm(d):
    print("load umls ...")

    UMLS_dict, UMLS_dict_reverse = umls.load_umls_MRCONSO(
        d.config['norm_dict'])

    predict_dir = "/Users/feili/Desktop/umass/CancerADE_SnoM_30Oct2017_test/metamap"
    annotation_dir = os.path.join(opt.test_file, 'bioc')
    corpus_dir = os.path.join(opt.test_file, 'txt')
    annotation_files = [
        f for f in listdir(annotation_dir) if isfile(join(annotation_dir, f))
    ]

    if opt.norm_rule:
        multi_sieve.init(opt, None, d, UMLS_dict, UMLS_dict_reverse, False)
    elif opt.norm_neural:
        logging.info("use neural-based normer")
        if opt.test_in_cpu:
            neural_model = torch.load(os.path.join(opt.output,
                                                   'norm_neural.pkl'),
                                      map_location='cpu')
        else:
            neural_model = torch.load(
                os.path.join(opt.output, 'norm_neural.pkl'))
        neural_model.eval()
    elif opt.norm_vsm:
        logging.info("use vsm-based normer")
        if opt.test_in_cpu:
            vsm_model = torch.load(os.path.join(opt.output, 'vsm.pkl'),
                                   map_location='cpu')
        else:
            vsm_model = torch.load(os.path.join(opt.output, 'vsm.pkl'))
        vsm_model.eval()

    ct_norm_predict = 0
    ct_norm_gold = 0
    ct_norm_correct = 0

    for gold_file_name in annotation_files:
        print("# begin {}".format(gold_file_name))
        gold_document = parse_one_gold_file(annotation_dir, corpus_dir,
                                            gold_file_name)

        predict_document = metamap.load_metamap_result_from_file(
            join(predict_dir,
                 gold_file_name[:gold_file_name.find('.')] + ".field.txt"))

        # copy entities from metamap entities
        pred_entities = []
        for gold in predict_document.entities:
            pred = Entity()
            pred.id = gold.id
            pred.type = gold.type
            pred.spans = gold.spans
            pred.section = gold.section
            pred.name = gold.name
            pred_entities.append(pred)

        if opt.norm_rule:
            multi_sieve.runMultiPassSieve(gold_document, pred_entities,
                                          UMLS_dict, False)
        elif opt.norm_neural:
            neural_model.process_one_doc(gold_document, pred_entities,
                                         UMLS_dict, UMLS_dict_reverse, False)
        elif opt.norm_vsm:
            vsm_model.process_one_doc(gold_document, pred_entities, UMLS_dict,
                                      UMLS_dict_reverse, False)
        else:
            raise RuntimeError("wrong configuration")

        p1, p2, p3 = evaluate_for_ehr(gold_document.entities, pred_entities,
                                      UMLS_dict)

        ct_norm_gold += p1
        ct_norm_predict += p2
        ct_norm_correct += p3

    p = ct_norm_correct * 1.0 / ct_norm_predict
    r = ct_norm_correct * 1.0 / ct_norm_gold
    f1 = 2.0 * p * r / (p + r)
    print("NORM p: %.4f | r: %.4f | f1: %.4f" % (p, r, f1))
Example #6
def generate_instances(document, word_alphabet, dict_alphabet, dictionary,
                       dictionary_reverse, isMeddra_dict):
    Xs = []
    Ys = []

    # copy entities from gold entities
    pred_entities = []
    for gold in document.entities:
        pred = Entity()
        pred.id = gold.id
        pred.type = gold.type
        pred.spans = gold.spans
        pred.section = gold.section
        pred.name = gold.name
        pred_entities.append(pred)

    multi_sieve.runMultiPassSieve(document, pred_entities, dictionary,
                                  isMeddra_dict)

    for idx, entity in enumerate(document.entities):

        if isMeddra_dict:
            if len(entity.norm_ids) > 0:
                Y = norm_utils.get_dict_index(dict_alphabet,
                                              entity.norm_ids[0])
                if Y >= 0 and Y < norm_utils.get_dict_size(dict_alphabet):
                    Ys.append(Y)
                else:
                    continue
            else:
                Ys.append(0)
        else:
            if len(entity.norm_ids) > 0:
                if entity.norm_ids[0] in dictionary_reverse:
                    cui_list = dictionary_reverse[entity.norm_ids[0]]
                    Y = norm_utils.get_dict_index(
                        dict_alphabet,
                        cui_list[0])  # use the first id to generate instance
                    if Y >= 0 and Y < norm_utils.get_dict_size(dict_alphabet):
                        Ys.append(Y)
                    else:
                        raise RuntimeError(
                            "entity {}, {}, cui not in dict_alphabet".format(
                                entity.id, entity.name))
                else:
                    logging.info(
                        "entity {}, {}, can't map to umls, ignored".format(
                            entity.id, entity.name))
                    continue
            else:
                Ys.append(0)

        X = dict()

        tokens = my_tokenize(entity.name)
        word_ids = []
        for token in tokens:
            token = norm_utils.word_preprocess(token)
            word_id = word_alphabet.get_index(token)
            word_ids.append(word_id)
        X['word'] = word_ids

        if pred_entities[idx].rule_id is None:
            X['rule'] = [0] * norm_utils.get_dict_size(dict_alphabet)
        else:
            X['rule'] = [0] * norm_utils.get_dict_size(dict_alphabet)
            X['rule'][norm_utils.get_dict_index(
                dict_alphabet, pred_entities[idx].rule_id)] = 1

        Xs.append(X)

    return Xs, Ys
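
The last block of Example #6 encodes the rule-based sieve's prediction as a one-hot vector over the dictionary alphabet. A self-contained sketch of that encoding, using a plain ordered list in place of norm_utils.get_dict_index / get_dict_size; the helper name and the made-up concept ids are ours:

def one_hot_rule_feature(rule_id, dict_ids):
    # dict_ids plays the role of dict_alphabet: an ordered list of normalization ids.
    feature = [0] * len(dict_ids)
    if rule_id is not None and rule_id in dict_ids:
        feature[dict_ids.index(rule_id)] = 1
    return feature

# Example: the sieve predicted the second of three candidate concept ids.
print(one_hot_rule_feature('C0027497', ['C0011991', 'C0027497', 'C0042963']))  # -> [0, 1, 0]
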