Exemplo n.º 1
0
def _save_normer(model, output_dir, stem, fold_idx=None):
    """Pickle *model* into *output_dir* as ``<stem>.pkl`` or ``<stem>_<k>.pkl``.

    When *fold_idx* is not None the file name carries the 1-based fold
    number (``fold_idx + 1``).
    """
    if fold_idx is None:
        file_name = "{}.pkl".format(stem)
    else:
        file_name = "{}_{}.pkl".format(stem, fold_idx + 1)
    torch.save(model, os.path.join(output_dir, file_name))


def _clip_and_step(model, optimizer, gradient_clip):
    """Optionally clip gradients, apply one optimizer step, then clear grads."""
    if gradient_clip > 0:
        torch.nn.utils.clip_grad_norm_(model.parameters(), gradient_clip)
    optimizer.step()
    model.zero_grad()


def train(train_data, dev_data, test_data, d, dictionary, dictionary_reverse,
          opt, fold_idx, isMeddra_dict):
    """Train the normalization models and return the best dev scores.

    Depending on ``opt.ensemble`` either a single learned ``Ensemble`` model
    is trained (``'learn'``), or a VSM normer and a neural normer are trained
    one after the other in every epoch. The rule-based multi-sieve normer is
    initialized before training and finalized afterwards in both modes.

    Args:
        train_data: list of training documents. NOTE: it is extended in place
            with the external corpus configured under
            ``d.config['norm_ext_corpus']``, so the caller's list grows.
        dev_data: development documents, evaluated after every epoch when
            ``opt.dev_file`` is set.
        test_data: test documents; only used to build the word alphabet when
            ``opt.test_file`` is set.
        d: object exposing ``config`` (a dict) and ``word_emb_dim``.
        dictionary: concept dictionary handed to the normers.
        dictionary_reverse: reverse dictionary handed to the normers.
        opt: run options (``ensemble``, ``dev_file``, ``test_file``,
            ``batch_size``, ``lr``, ``l2``, ``tune_wordemb``,
            ``gradient_clip``, ``iter``, ``patience``, ``output``,
            ``cross_validation``, ``word_emb_dim``).
        fold_idx: 0-based cross-validation fold index, or None when not
            cross-validating. It selects the checkpoint file names.
        isMeddra_dict: flag forwarded to the helpers; in the non-ensemble
            branch it also selects ``generate_instances`` versus
            ``generate_instances_ehr``.

    Returns:
        Tuple ``(best_dev_p, best_dev_r, best_dev_f)``. All three stay at
        ``-10`` when no dev evaluation ever improved (for example when no
        dev file is configured).

    Raises:
        RuntimeError: if ``d.config['norm_ext_corpus']`` contains a key other
            than ``'tac'``.
    """
    logging.info("train the ensemble normalization model ...")

    ext_corpus = d.config.get('norm_ext_corpus')
    if ext_corpus is not None:
        external_train_data = []
        for corpus_name, corpus_cfg in ext_corpus.items():
            if corpus_name != 'tac':
                raise RuntimeError("not support external corpus")
            external_train_data.extend(
                load_data_fda(corpus_cfg['path'], True,
                              corpus_cfg.get('types'),
                              corpus_cfg.get('types'), False, True))
        # In-place on purpose: keeps the original side effect on the caller.
        train_data.extend(external_train_data)

    logging.info("build alphabet ...")
    word_alphabet = Alphabet('word')
    norm_utils.build_alphabet_from_dict(word_alphabet, dictionary,
                                        isMeddra_dict)
    norm_utils.build_alphabet(word_alphabet, train_data)
    if opt.dev_file:
        norm_utils.build_alphabet(word_alphabet, dev_data)
    if opt.test_file:
        norm_utils.build_alphabet(word_alphabet, test_data)
    norm_utils.fix_alphabet(word_alphabet)

    if d.config.get('norm_emb') is not None:
        logging.info("load pretrained word embedding ...")
        # NOTE(review): this branch reads opt.word_emb_dim while the random
        # branch below reads d.word_emb_dim -- confirm both are intended.
        pretrain_word_embedding, word_emb_dim = build_pretrain_embedding(
            d.config.get('norm_emb'), word_alphabet, opt.word_emb_dim, False)
        word_embedding = nn.Embedding(word_alphabet.size(),
                                      word_emb_dim,
                                      padding_idx=0)
        word_embedding.weight.data.copy_(
            torch.from_numpy(pretrain_word_embedding))
        embedding_dim = word_emb_dim
    else:
        logging.info("randomly initialize word embedding ...")
        word_embedding = nn.Embedding(word_alphabet.size(),
                                      d.word_emb_dim,
                                      padding_idx=0)
        word_embedding.weight.data.copy_(
            torch.from_numpy(
                random_embedding(word_alphabet.size(), d.word_emb_dim)))
        embedding_dim = d.word_emb_dim

    dict_alphabet = Alphabet('dict')
    norm_utils.init_dict_alphabet(dict_alphabet, dictionary)
    norm_utils.fix_alphabet(dict_alphabet)

    # rule
    logging.info("init rule-based normer")
    multi_sieve.init(opt, train_data, d, dictionary, dictionary_reverse,
                     isMeddra_dict)

    learn_ensemble = opt.ensemble == 'learn'

    if learn_ensemble:
        logging.info("init ensemble normer")
        poses = vsm.init_vector_for_dict(word_alphabet, dict_alphabet,
                                         dictionary, isMeddra_dict)
        ensemble_model = Ensemble(word_alphabet, word_embedding, embedding_dim,
                                  dict_alphabet, poses)
        # NOTE(review): pretrain_neural_model / pretrain_vsm_model are not
        # parameters or locals of this function; they must exist at module
        # level, otherwise this branch raises NameError -- confirm.
        if pretrain_neural_model is not None:
            ensemble_model.neural_linear.weight.data.copy_(
                pretrain_neural_model.linear.weight.data)
        if pretrain_vsm_model is not None:
            ensemble_model.vsm_linear.weight.data.copy_(
                pretrain_vsm_model.linear.weight.data)

        ensemble_train_X = []
        ensemble_train_Y = []
        for doc in train_data:
            temp_X, temp_Y = generate_instances(doc, word_alphabet,
                                                dict_alphabet, dictionary,
                                                dictionary_reverse,
                                                isMeddra_dict)
            ensemble_train_X.extend(temp_X)
            ensemble_train_Y.extend(temp_Y)

        ensemble_train_loader = DataLoader(MyDataset(ensemble_train_X,
                                                     ensemble_train_Y),
                                           opt.batch_size,
                                           shuffle=True,
                                           collate_fn=my_collate)
        ensemble_optimizer = optim.Adam(ensemble_model.parameters(),
                                        lr=opt.lr,
                                        weight_decay=opt.l2)
        # Explicit comparison kept: a value of None must not freeze.
        if opt.tune_wordemb == False:
            freeze_net(ensemble_model.word_embedding)
    else:
        # vsm
        logging.info("init vsm-based normer")
        poses = vsm.init_vector_for_dict(word_alphabet, dict_alphabet,
                                         dictionary, isMeddra_dict)
        # alphabet can share between vsm and neural since they don't change
        # but word_embedding cannot
        vsm_model = vsm.VsmNormer(word_alphabet, copy.deepcopy(word_embedding),
                                  embedding_dim, dict_alphabet, poses)
        vsm_train_X = []
        vsm_train_Y = []
        for doc in train_data:
            if isMeddra_dict:
                temp_X, temp_Y = vsm.generate_instances(
                    doc.entities, word_alphabet, dict_alphabet)
            else:
                temp_X, temp_Y = vsm.generate_instances_ehr(
                    doc.entities, word_alphabet, dict_alphabet,
                    dictionary_reverse)
            vsm_train_X.extend(temp_X)
            vsm_train_Y.extend(temp_Y)

        vsm_train_loader = DataLoader(vsm.MyDataset(vsm_train_X, vsm_train_Y),
                                      opt.batch_size,
                                      shuffle=True,
                                      collate_fn=vsm.my_collate)
        vsm_optimizer = optim.Adam(vsm_model.parameters(),
                                   lr=opt.lr,
                                   weight_decay=opt.l2)
        if opt.tune_wordemb == False:
            freeze_net(vsm_model.word_embedding)

        if d.config['norm_vsm_pretrain'] == '1':
            vsm.dict_pretrain(dictionary, dictionary_reverse, d, True,
                              vsm_optimizer, vsm_model)

        # neural
        logging.info("init neural-based normer")
        neural_model = norm_neural.NeuralNormer(word_alphabet,
                                                copy.deepcopy(word_embedding),
                                                embedding_dim, dict_alphabet)

        neural_train_X = []
        neural_train_Y = []
        for doc in train_data:
            if isMeddra_dict:
                temp_X, temp_Y = norm_neural.generate_instances(
                    doc.entities, word_alphabet, dict_alphabet)
            else:
                temp_X, temp_Y = norm_neural.generate_instances_ehr(
                    doc.entities, word_alphabet, dict_alphabet,
                    dictionary_reverse)
            neural_train_X.extend(temp_X)
            neural_train_Y.extend(temp_Y)

        neural_train_loader = DataLoader(
            norm_neural.MyDataset(neural_train_X, neural_train_Y),
            opt.batch_size,
            shuffle=True,
            collate_fn=norm_neural.my_collate)
        neural_optimizer = optim.Adam(neural_model.parameters(),
                                      lr=opt.lr,
                                      weight_decay=opt.l2)
        if opt.tune_wordemb == False:
            freeze_net(neural_model.word_embedding)

        if d.config['norm_neural_pretrain'] == '1':
            # NOTE(review): the vsm counterpart above is a module-level
            # function (vsm.dict_pretrain); here it is invoked as a method on
            # the model and the model is passed again as the last argument --
            # confirm this is the intended call.
            neural_model.dict_pretrain(dictionary, dictionary_reverse, d, True,
                                       neural_optimizer, neural_model)

    best_dev_f = -10
    best_dev_p = -10
    best_dev_r = -10

    bad_counter = 0

    logging.info("start training ...")
    for idx in range(opt.iter):
        epoch_start = time.time()

        if learn_ensemble:
            ensemble_model.train()
            for x, rules, lengths, y in ensemble_train_loader:
                y_pred = ensemble_model.forward(x, rules, lengths)
                loss = ensemble_model.loss(y_pred, y)
                loss.backward()
                _clip_and_step(ensemble_model, ensemble_optimizer,
                               opt.gradient_clip)
        else:
            vsm_model.train()
            for x, lengths, y in vsm_train_loader:
                loss, _ = vsm_model.forward_train(x, lengths, y)
                loss.backward()
                _clip_and_step(vsm_model, vsm_optimizer, opt.gradient_clip)

            neural_model.train()
            for x, lengths, y in neural_train_loader:
                y_pred = neural_model.forward(x, lengths)
                loss = neural_model.loss(y_pred, y)
                loss.backward()
                _clip_and_step(neural_model, neural_optimizer,
                               opt.gradient_clip)

        epoch_finish = time.time()
        logging.info("epoch: %s training finished. Time: %.2fs",
                     idx, epoch_finish - epoch_start)

        if opt.dev_file:
            if learn_ensemble:
                p, r, f = norm_utils.evaluate(dev_data, dictionary,
                                              dictionary_reverse, None, None,
                                              ensemble_model, d, isMeddra_dict)
            else:
                p, r, f = norm_utils.evaluate(dev_data, dictionary,
                                              dictionary_reverse, vsm_model,
                                              neural_model, None, d,
                                              isMeddra_dict)
            logging.info("Dev: p: %.4f, r: %.4f, f: %.4f", p, r, f)
        else:
            # Without a dev set the score never improves, so the checkpoint
            # branch below is skipped and the models are saved after the loop.
            f = best_dev_f

        if f > best_dev_f:
            logging.info("Exceed previous best f score on dev: %.4f",
                         best_dev_f)

            if learn_ensemble:
                _save_normer(ensemble_model, opt.output, "ensemble", fold_idx)
            else:
                _save_normer(vsm_model, opt.output, "vsm", fold_idx)
                _save_normer(neural_model, opt.output, "norm_neural",
                             fold_idx)

            best_dev_f = f
            best_dev_p = p
            best_dev_r = r

            bad_counter = 0
        else:
            bad_counter += 1

        # Truthiness instead of len(): consistent with the checks above and
        # does not raise TypeError when opt.dev_file is None.
        if opt.dev_file and bad_counter >= opt.patience:
            logging.info('Early Stop!')
            break

    logging.info("train finished")

    # The flag is True when not cross-validating or on the last fold.
    is_last_run = fold_idx is None or fold_idx == opt.cross_validation - 1
    multi_sieve.finalize(is_last_run)

    if not opt.dev_file:
        # No dev set: persist the final-epoch models, always without a fold
        # suffix (as the original code did).
        if learn_ensemble:
            _save_normer(ensemble_model, opt.output, "ensemble")
        else:
            _save_normer(vsm_model, opt.output, "vsm")
            _save_normer(neural_model, opt.output, "norm_neural")

    return best_dev_p, best_dev_r, best_dev_f
Exemplo n.º 2
0
def test(data, opt):
    """Run entity recognition and normalization over a directory of XML files.

    Every file in ``opt.test_file`` whose name contains ``.xml`` is processed
    section by section: entities are predicted with the sequence model loaded
    from ``opt.output/model.pkl``, filtered against the file's ignore regions,
    normalized with the normer(s) selected by ``opt.norm_rule`` /
    ``opt.norm_vsm`` / ``opt.norm_neural``, and the entities that received at
    least one norm id are written with ``dump_results``. A failure in one
    file is logged and counted; processing continues with the next file.

    Args:
        data: project data object (alphabets, test buffers) used by the
            sequence model; ``test_texts`` and ``test_Ids`` are reset for
            every section.
        opt: run options (``test_file``, ``nlp_tool``, ``output``,
            ``test_in_cpu``, ``norm_rule``, ``norm_vsm``, ``norm_neural``,
            ``ensemble``, ``predict``, ``types``, ``type_filter``, ``nbest``).

    Raises:
        RuntimeError: if ``opt.nlp_tool`` is not ``"nltk"``.
    """
    corpus_dir = opt.test_file

    if opt.nlp_tool == "nltk":
        nlp_tool = nltk.data.load('tokenizers/punkt/english.pickle')
    else:
        raise RuntimeError("invalid nlp tool")

    corpus_files = [f for f in os.listdir(corpus_dir) if '.xml' in f]

    # map_location=None is torch.load's default, so one call covers both the
    # CPU-only and the default device case.
    map_location = 'cpu' if opt.test_in_cpu else None

    def load_from_output(file_name):
        # NOTE: torch.load unpickles arbitrary objects; only point opt.output
        # at checkpoints produced by a trusted training run.
        return torch.load(os.path.join(opt.output, file_name),
                          map_location=map_location)

    model = SeqModel(data, opt)
    model.load_state_dict(load_from_output('model.pkl'))

    meddra_dict = load_meddra_dict(data)

    use_ensemble = opt.norm_rule and opt.norm_vsm and opt.norm_neural

    # initialize norm models
    if use_ensemble:
        logging.info("use ensemble normer")
        multi_sieve.init(opt, None, data, meddra_dict, None, True)
        if opt.ensemble == 'learn':
            ensemble_model = load_from_output('ensemble.pkl')
            ensemble_model.eval()
        else:
            vsm_model = load_from_output('vsm.pkl')
            neural_model = load_from_output('norm_neural.pkl')
            vsm_model.eval()
            neural_model.eval()

    elif opt.norm_rule:
        logging.info("use rule-based normer")
        # Same six-argument form as the ensemble branch above; the original
        # passed only four arguments here.
        multi_sieve.init(opt, None, data, meddra_dict, None, True)

    elif opt.norm_vsm:
        logging.info("use vsm-based normer")
        vsm_model = load_from_output('vsm.pkl')
        vsm_model.eval()

    elif opt.norm_neural:
        logging.info("use neural-based normer")
        neural_model = load_from_output('norm_neural.pkl')
        neural_model.eval()
    else:
        logging.info("no normalization is performed.")

    makedir_and_clear(opt.predict)

    ct_success = 0
    ct_error = 0

    for fileName in corpus_files:
        try:
            start = time.time()
            document, annotation_file = processOneFile_fda(
                fileName, corpus_dir, nlp_tool, False, opt.types,
                opt.type_filter, True, False)
            pred_entities = []

            for section in document:

                data.test_texts = []
                data.test_Ids = []
                read_instance_from_one_document(
                    section, data.word_alphabet, data.char_alphabet,
                    data.label_alphabet, data.test_texts, data.test_Ids, data)

                _, _, _, _, _, pred_results, _ = evaluate(
                    data, opt, model, 'test', False, opt.nbest)

                entities = translateResultsintoEntities(section.sentences,
                                                        pred_results)

                # remove the entity in the ignore_region and fill section_id
                section_id = section.name[section.name.rfind('_') + 1:]
                entities = remove_entity_in_the_ignore_region(
                    annotation_file.ignore_regions, entities, section_id)

                if use_ensemble:
                    if opt.ensemble == 'learn':
                        ensemble_model.process_one_doc(
                            section, entities, meddra_dict, None, True)
                    else:
                        # Each normer works on its own copy; the results are
                        # merged back into `entities` afterwards.
                        pred_entities1 = copy.deepcopy(entities)
                        pred_entities2 = copy.deepcopy(entities)
                        pred_entities3 = copy.deepcopy(entities)
                        multi_sieve.runMultiPassSieve(
                            section, pred_entities1, meddra_dict, True)
                        vsm_model.process_one_doc(
                            section, pred_entities2, meddra_dict, None, True)
                        neural_model.process_one_doc(
                            section, pred_entities3, meddra_dict, None, True)

                        # merge pred_entities1, pred_entities2, pred_entities3 into entities
                        ensemble.merge_result(
                            pred_entities1, pred_entities2, pred_entities3,
                            entities, meddra_dict, True,
                            vsm_model.dict_alphabet, data)

                elif opt.norm_rule:
                    multi_sieve.runMultiPassSieve(section, entities,
                                                  meddra_dict, True)
                elif opt.norm_vsm:
                    vsm_model.process_one_doc(section, entities, meddra_dict,
                                              None, True)
                elif opt.norm_neural:
                    neural_model.process_one_doc(section, entities,
                                                 meddra_dict, None, True)

                # if a mention can't be normed, not output it
                pred_entities.extend(
                    entity for entity in entities
                    if len(entity.norm_ids) != 0)

            dump_results(fileName, pred_entities, opt, annotation_file)

            end = time.time()
            logging.info("process %s complete with %.2fs",
                         fileName, end - start)

            ct_success += 1
        except Exception as e:
            # Best-effort per file: keep going, but record the traceback so
            # the failure can be diagnosed.
            logging.exception("process file %s error: %s", fileName, e)
            ct_error += 1

    if opt.norm_rule:
        multi_sieve.finalize(True)

    logging.info("test finished, total %s, error %s",
                 ct_success + ct_error, ct_error)
Exemplo n.º 3
0
                            if gold_entity.norm_ids[0] in concept.codes:
                                ct_norm_correct += 1

                        else:  # if there are multiple answers, we use multi-sieve to disambiguate

                            copy_entity = copy.deepcopy(predict_entity)
                            copy_entity.norm_ids = []

                            multi_sieve.runMultiPassSieve_oneentity(
                                gold_document, copy_entity)

                            concept = dictionary[copy_entity.norm_ids[0]]

                            if gold_entity.norm_ids[0] in concept.codes:
                                ct_norm_correct += 1

                        break

            ct_norm_gold += len(gold_document.entities)
            ct_norm_predict += len(pred_entities)

        multi_sieve.finalize(True)

        p = ct_norm_correct * 1.0 / ct_norm_predict
        r = ct_norm_correct * 1.0 / ct_norm_gold
        f1 = 2.0 * p * r / (p + r)
        print("NORM p: %.4f | r: %.4f | f1: %.4f" % (p, r, f1))

    else:
        logging.info("wrong whattodo")