Exemplo n.º 1
0
def load_model_decode(data, name):
    """Restore a trained SeqModel from ``data.load_model_dir`` and decode the
    ``name`` data split, printing timing and accuracy metrics.

    Returns the predicted label sequences and their nbest scores.
    """
    print("Load Model from file: ", data.model_dir)
    model = SeqModel(data)
    # NOTE(review): a checkpoint trained on GPU may need map_location when
    # loaded on a CPU-only host — confirm before deploying there.
    model.load_state_dict(torch.load(data.load_model_dir))

    print("Decode %s data, nbest: %s ..." % (name, data.nbest))
    decode_begin = time.time()
    speed, acc, p, r, f, pred_results, pred_scores = evaluate(
        data, model, name, data.nbest)
    elapsed = time.time() - decode_begin
    if data.seg:
        # Segmentation-style tasks additionally report precision/recall/F1.
        print(
            "%s: time:%.2fs, speed:%.2fst/s; acc: %.4f, p: %.4f, r: %.4f, f: %.4f"
            % (name, elapsed, speed, acc, p, r, f))
    else:
        print("%s: time:%.2fs, speed:%.2fst/s; acc: %.4f" %
              (name, elapsed, speed, acc))
    return pred_results, pred_scores
Exemplo n.º 2
0
def load_model_decode(data, name):
    """Load a SeqModel checkpoint (GPU- or CPU-mapped) and evaluate ``name``.

    Returns predictions, nbest scores, probabilities, and the accuracy
    instance/speed counters produced by ``evaluate``.
    """
    print("Load Model from file: ", data.model_dir)
    model = SeqModel(data)
    if data.HP_gpu:
        state = torch.load(data.load_model_dir)
    else:
        # Remap GPU-trained tensors onto the CPU when no GPU is available.
        state = torch.load(data.load_model_dir,
                           map_location=lambda storage, loc: storage.cpu())
    model.load_state_dict(state)

    speed, acc, p, r, f, pred_results, pred_scores, probs, acc_instances, acc_speed = evaluate(
        data, model, name, data.nbest)

    return pred_results, pred_scores, probs, acc_instances, acc_speed
Exemplo n.º 3
0
def load_model_decode(data, name, label_flag=True):
    """Restore model weights and decode the ``name`` split.

    F1/precision/recall are reported only for segmentation-style tasks
    (``data.seg``); otherwise accuracy alone is printed.
    """
    print("Load Model from file", data.model_dir)
    model = SeqModel(data)
    # Choose a map_location so GPU-trained checkpoints also load on CPU hosts.
    if data.HP_gpu:
        map_location = None
    else:
        map_location = lambda storage, loc: storage
    model.load_state_dict(
        torch.load(data.load_model_dir, map_location=map_location))
    decode_start = time.time()
    speed, acc, p, r, f, pred_results, pred_scores = evaluate(
        data, model, name, nbest=None, label_flag=label_flag)
    time_cost = time.time() - decode_start
    # Segmentation tasks (word seg, NER, chunking) need f1; POS/CCG do not.
    if data.seg:
        print(
            "{}: time{:.2f}s, speed: {:.2f}st/s; acc: {:.4f}, p: {:.4f}, r: {:.4f}, f: {:.4f}"
            .format(name, time_cost, speed, acc, p, r, f))
    else:
        print("{}: time{:.2f}s, speed: {:.2f}st/s; acc: {:.4f}".format(
            name, time_cost, speed, acc))
    # pred_scores stays empty because nbest is None here.
    return pred_results, pred_scores
Exemplo n.º 4
0
def load_model_decode(data, name):
    """Load a multi-task SeqModel checkpoint and decode ``name`` for every
    main task.

    Returns two parallel lists (one entry per main task): predicted label
    sequences and their nbest scores.
    """
    print("Load Model from file: ", data.model_dir)
    model = SeqModel(data)
    model.load_state_dict(torch.load(data.load_model_dir))

    print("Decode %s data, nbest: %s ..." % (name, data.nbest))
    start_time = time.time()

    summary = evaluate(data, model, name, True, data.nbest)
    pred_results_tasks = []
    pred_scores_tasks = []
    range_tasks = len(data.index_of_main_tasks)

    for idtask in range(range_tasks):
        speed, acc, p, r, f, pred_results, pred_scores = summary[idtask]
        pred_results_tasks.append(pred_results)
        pred_scores_tasks.append(pred_scores)
    end_time = time.time()
    time_cost = end_time - start_time
    # BUG FIX: the original tested `if data:`, which is always truthy for a
    # data object, so the accuracy-only branch was unreachable; sibling
    # implementations in this file branch on `data.seg`.
    if data.seg:
        # NOTE(review): the metrics printed here belong to the LAST task of
        # the loop above — confirm that is the intended summary line.
        print(
            "%s: time:%.2fs, speed:%.2fst/s; acc: %.4f, p: %.4f, r: %.4f, f: %.4f"
            % (name, time_cost, speed, acc, p, r, f))
    else:
        print("%s: time:%.2fs, speed:%.2fst/s; acc: %.4f" %
              (name, time_cost, speed, acc))

    return pred_results_tasks, pred_scores_tasks
Exemplo n.º 5
0
Arquivo: main.py Projeto: ml-lab/UANet
def load_model_decode(data):
    """Rebuild the model from ``data``, restore the best checkpoint saved in
    ``data.model_dir``, and decode the raw text split."""
    print("Load Model from dir: ", data.model_dir)
    model = SeqModel(data)
    checkpoint_path = data.model_dir + "/best_model.ckpt"
    state_dict = torch.load(checkpoint_path)
    model.load_state_dict(state_dict)

    evaluate(data, model, "raw")
Exemplo n.º 6
0
def load_model_decode(data, name):
    """Decode ``name`` with a restored model.

    ``evaluate`` returns two-element metric lists (index 0 and 1); both are
    printed, and the second element's predictions/scores are returned.
    """
    print("Load Model from file: ", data.model_dir)
    model = SeqModel(data)
    model.load_state_dict(torch.load(data.load_model_dir))

    print("Decode %s data, nbest: %s ..." % (name, data.nbest))
    begin = time.time()
    speed, acc, p, r, f, pred_results, pred_scores = evaluate(data, model, name, data.nbest)
    time_cost = time.time() - begin
    for level in (0, 1):
        print("%s: time:%.2fs, speed:%.2fst/s; acc: %.4f, p: %.4f, r: %.4f, f: %.4f" % (
            name, time_cost, speed, acc[level], p[level], r[level], f[level]))
    return pred_results[1], pred_scores[1]
Exemplo n.º 7
0
def build_model(data):
    """For deployment: instantiate a SeqModel from the architecture spec in
    ``data`` and load its trained weights from ``data.load_model_dir``.

    Returns the model with weights loaded (train/eval mode is left unchanged;
    callers should call ``model.eval()`` before inference).
    """
    print("Load Model weights from file", data.load_model_dir)
    model = SeqModel(data)
    ## handle GPU/non GPU issues: remap GPU-saved tensors onto CPU when needed
    map_location = lambda storage, loc: storage
    if data.HP_gpu:
        map_location = None
    # loading the weights of the model from load_model_dir
    # (removed dead code: start/end timing was computed but never used)
    model.load_state_dict(torch.load(data.load_model_dir, map_location=map_location))
    return model
Exemplo n.º 8
0
def load_model_decode(data, name):
    """Restore the saved SeqModel and decode the requested data split,
    printing timing plus accuracy (and p/r/f1 for segmentation tasks)."""
    print("Load Model from file: ", data.model_dir)
    model = SeqModel(data)
    # NOTE(review): GPU-trained checkpoints may need a map_location on CPU —
    # confirm the deployment target before relying on this plain load.
    model.load_state_dict(torch.load(data.load_model_dir))

    print("Decode %s data, nbest: %s ..."%(name, data.nbest))
    tic = time.time()
    speed, acc, p, r, f, pred_results, pred_scores = evaluate(data, model, name, data.nbest)
    toc = time.time()
    duration = toc - tic
    if not data.seg:
        print("%s: time:%.2fs, speed:%.2fst/s; acc: %.4f"%(name, duration, speed, acc))
    else:
        print("%s: time:%.2fs, speed:%.2fst/s; acc: %.4f, p: %.4f, r: %.4f, f: %.4f"%(name, duration, speed, acc, p, r, f))
    return pred_results, pred_scores
Exemplo n.º 9
0
def train(data):
    """Train a (possibly multi-task, multi-treebank) SeqModel.  [Python 2]

    Saves the data settings beside the model, optionally warm-starts selected
    layers from ``data.pretrained_model``, then runs ``data.HP_iteration``
    epochs of mini-batch training.  After each epoch the model is evaluated on
    dev; model selection uses a dependency LAS score obtained by decoding the
    predicted sequence labels back into trees.  The best model is written to
    ``data.model_dir + ".model"``.
    """
    print "Training model..."
    data.show_data_summary()
    save_data_name = data.model_dir +".dset"
    data.save(save_data_name)
    model = SeqModel(data)

    if data.pretrained_model is not None:
        model_dict = model.state_dict()

        #We load the weights for the layers that we have pretrained (e.g. for language modeling)
        pretrained_dict = torch.load(data.pretrained_model)
        # Keep everything for PRETRAINED_ALL; for PRETRAINED_LSTMS drop the
        # per-task output layers ("hidden2tagList").
        pretrained_dict = {k: v for k, v in pretrained_dict.items() 
                           if data.pretrained_part == data.PRETRAINED_ALL or 
                           (data.pretrained_part == data.PRETRAINED_LSTMS and "hidden2tagList" not in k)}

        # We overwrite entries in the existing state dict
        model_dict.update(pretrained_dict) 
        # We load the new state dict
        model.load_state_dict(model_dict)



    loss_function = nn.NLLLoss()
    if data.optimizer.lower() == "sgd":
        #optimizer = optim.SGD(model.parameters(), lr=data.HP_lr, momentum=data.HP_momentum)
        optimizer = optim.SGD(model.parameters(), lr=data.HP_lr, momentum=data.HP_momentum,weight_decay=data.HP_l2)
    elif data.optimizer.lower() == "adagrad":
        optimizer = optim.Adagrad(model.parameters(), lr=data.HP_lr, weight_decay=data.HP_l2)
    elif data.optimizer.lower() == "adadelta":
        optimizer = optim.Adadelta(model.parameters(), lr=data.HP_lr, weight_decay=data.HP_l2)
    elif data.optimizer.lower() == "rmsprop":
        optimizer = optim.RMSprop(model.parameters(), lr=data.HP_lr, weight_decay=data.HP_l2)
    elif data.optimizer.lower() == "adam":
        optimizer = optim.Adam(model.parameters(), lr=data.HP_lr, weight_decay=data.HP_l2)
    else:
        print("Optimizer illegal: %s"%(data.optimizer))
        exit(0)
    best_dev = -10
    range_valid_tasks = range(data.HP_tasks)
    # --- main epoch loop ---
    for idx in range(data.HP_iteration):
        epoch_start = time.time()
        temp_start = epoch_start
        print("Epoch: %s/%s" %(idx,data.HP_iteration))
        # NOTE(review): decay check compares against "SGD" exactly, while the
        # selection above lower-cases — a lowercase "sgd" config skips decay.
        if data.optimizer == "SGD":
            optimizer = lr_decay(optimizer, idx, data.HP_lr_decay, data.HP_lr)
        instance_count = 0
        sample_id = 0
        sample_loss = 0
        total_loss = 0

        # Per-task running loss and token-accuracy counters (rebinds
        # sample_loss from the scalar above to a dict).
        sample_loss = {idtask: 0 for idtask in range(data.HP_tasks)}
        right_token = {idtask: 0 for idtask in range(data.HP_tasks)}
        whole_token = {idtask: 0 for idtask in range(data.HP_tasks)}
        random.shuffle(data.train_Ids)

        #We get the indexes where are the samples of each (shuffled) treebank
        if data.disjoint:
            treebank_indexes = {}
            for idxsample, sample in enumerate(data.train_Ids):
                if sample[-1] not in treebank_indexes:
                    treebank_indexes[sample[-1]] = []   
                treebank_indexes[sample[-1]].append(idxsample)

        ## set model in train model
        model.train()
        model.zero_grad()
        batch_size = data.HP_batch_size
        batch_id = 0
        train_num = len(data.train_Ids)
        total_batch = train_num//batch_size+1

        if data.disjoint:
            # Per-treebank (start, end) cursor into treebank_indexes.
            tb_idxs = {tb:(0,batch_size) for tb in treebank_indexes}

        for batch_id in range(total_batch):
            start = batch_id*batch_size
            end = (batch_id+1)*batch_size 
            if end >train_num:
                end = train_num
            if data.disjoint:
                # Sample a batch from one randomly chosen treebank that still
                # has data and is not past its ignore_after_epoch cutoff.
                eligible_treebanks = [t for t in treebank_indexes
                                      if tb_idxs[t][0] < len(treebank_indexes[t]) and idx < data.ignore_after_epoch[t] ]
                if eligible_treebanks == []: break

                tb = random.choice(eligible_treebanks)
                range_valid_tasks = data.dataset_ids[tb]

                idx_init, idx_end = tb_idxs[tb]
                train_idxs = treebank_indexes[tb][idx_init:idx_end]
                instance = [data.train_Ids[idx_ins] for idx_ins in train_idxs] #data.train_Ids[train_idxs]
                tb_idxs[tb] = (idx_end, idx_end+batch_size)
            else:  
                instance = data.train_Ids[start:end]

            if not instance:
                continue

            batch_word, batch_features, batch_wordlen, batch_wordrecover, batch_char, batch_charlen, batch_charrecover, batch_label, mask  = batchify_with_label(instance, data.HP_gpu, inference=False)
            instance_count += 1

            loss, losses, tag_seq = model.neg_log_likelihood_loss(batch_word,batch_features, 
                                                                  batch_wordlen, batch_char, 
                                                                  batch_charlen, batch_charrecover, 
                                                                  batch_label, mask, range_valid_tasks, 
                                                                  inference=False)

            # NOTE(review): `log` is reset to True every batch, so the
            # `and log` guard below never suppresses anything across batches.
            log=True
            for idtask in range_valid_tasks:
                right, whole = predict_check(tag_seq[idtask], batch_label[idtask], mask)
                # NOTE(review): `.data[0]` is pre-0.4 PyTorch tensor indexing
                # (`.item()` on modern versions) — confirm the pinned version.
                sample_loss[idtask]+= losses[idtask].data[0]
                right_token[idtask]+=right
                whole_token[idtask]+=whole

            # Periodic progress report (every 500 processed instances).
            if end%500 == 0 and log:
                temp_time = time.time()
                temp_cost = temp_time - temp_start
                temp_start = temp_time
                log = False
                if sample_loss[idtask] > 1e8 or str(sample_loss) == "nan":
                    print "ERROR: LOSS EXPLOSION (>1e8) ! PLEASE SET PROPER PARAMETERS AND STRUCTURE! EXIT...."
                    exit(0)
                sys.stdout.flush()

                for aux_idtask in range(data.HP_tasks):

                    if whole_token[aux_idtask] == 0:
                        print ("Task %d (no samples found)"%(aux_idtask))
                    else:
                        # NOTE(review): eligible_treebanks is only bound when
                        # data.disjoint — this would raise otherwise; verify.
                        if data.inv_dataset_ids[aux_idtask] in eligible_treebanks:
                            print("Task %d %s Instance: %s; Time: %.2fs; loss: %.4f; acc: %s/%s=%.4f"%(aux_idtask,data.inv_dataset_ids[aux_idtask],end, temp_cost, sample_loss[aux_idtask], right_token[aux_idtask], whole_token[aux_idtask],(right_token[aux_idtask]+0.)/whole_token[aux_idtask]))
                        else:
                            print("Task %d %s does not contain more samples; loss: %4f"%(aux_idtask,data.inv_dataset_ids[aux_idtask], 
                                                                                      losses[aux_idtask].data[0]))
                    sample_loss[aux_idtask] = 0                 

            total_loss += loss.data[0]
            loss.backward()
            optimizer.step()
            model.zero_grad()

        temp_time = time.time()
        temp_cost = temp_time - temp_start

        # End-of-epoch per-task summary.
        for aux_idtask in range(data.HP_tasks):

            if whole_token[aux_idtask] == 0:
                print ("Task %d (no samples found)"%(aux_idtask))
            else:
                name_tb = data.inv_dataset_ids[aux_idtask]
                print("Task %d %s Instance: %s; Time: %.2fs; loss: %.4f; acc: %s/%s=%.4f"%(aux_idtask,name_tb,len(treebank_indexes[name_tb]), temp_cost, sample_loss[aux_idtask], right_token[aux_idtask], whole_token[aux_idtask],(right_token[aux_idtask]+0.)/whole_token[aux_idtask]))

            sample_loss[aux_idtask] = 0
        epoch_finish = time.time()
        epoch_cost = epoch_finish - epoch_start
        print("Epoch: %s training finished. Time: %.2fs, speed: %.2fst/s,  total loss: %s"%(idx,epoch_cost, train_num/epoch_cost, total_loss))

        if total_loss > 1e8 or str(total_loss) == "nan":
            print "ERROR: LOSS EXPLOSION (>1e8) ! PLEASE SET PROPER PARAMETERS AND STRUCTURE! EXIT...."
            exit(0)

        # --- dev evaluation ---
        summary = evaluate(data,model, "dev", False, False)
        dev_finish = time.time()
        dev_cost = dev_finish - epoch_finish
        current_scores = []
        for idtask in xrange(0, data.HP_tasks):
            speed,acc,p,r,f,pred_labels,_,valid_indexes = summary[idtask]
            if data.seg:
                current_score = f
                current_scores.append(f)
                print("Task %d Dev: time: %.2fs, speed: %.2fst/s; acc: %.4f, p: %.4f, r: %.4f, f: %.4f"%(idtask, dev_cost, speed, acc, p, r, f))
            else:
                current_score = acc
                current_scores.append(acc)
                print("Task %d Dev: time: %.2fs speed: %.2fst/s; acc: %.4f"%(idtask, dev_cost, speed, acc))

        pred_results_tasks = []
        pred_scores_tasks = []

        pred_las_tasks = []
        valid_indexes = None
        for idtask in xrange(data.HP_tasks):
            speed, acc, p, r, f, pred_results, pred_scores, pred_indexes = summary[idtask]
            pred_results_tasks.append(pred_results)
            # NOTE(review): this appends the list to itself — was
            # `pred_scores` intended here?  pred_scores_tasks is unused below.
            pred_scores_tasks.append(pred_scores_tasks)

            if idtask in data.task_metric and data.task_metric[idtask] == "LAS":
                pred_las_tasks.append(pred_results)
                valid_indexes = pred_indexes

        # Decode the LAS-metric task outputs into dependency trees and score
        # them; the resulting LAS drives model selection.
        with tempfile.NamedTemporaryFile() as f_decode_mt:
            with tempfile.NamedTemporaryFile() as f_decode_st:

                # If we are learning multiple task we move it as a sequence labeling
                if data.HP_main_tasks > 1:
                    data.decode_dir = f_decode_mt.name
                    decoded_st_dir = f_decode_st.name
                    data.write_decoded_results(pred_las_tasks, 'dev', indexes=valid_indexes)
                    split_char = "{}"
                else:
                    if data.decode_dir is None:
                        data.decode_dir = f_decode_st.name
                        decoded_st_dir =  f_decode_st.name
                    data.write_decoded_results(pred_las_tasks, 'dev', indexes=valid_indexes)
                    split_char = "@"
                output_nn = open(data.decode_dir)
                tmp = tempfile.NamedTemporaryFile().name

                if data.offset:
                    decode_dependencies.decode_combined_tasks(output_nn, tmp, split_char)
                else:
                    print("decoding single task")
                    decode_dependencies.decode(output_nn, tmp, split_char)
                current_score = decode_dependencies.evaluate_dependencies(data.gold_dev_dep, tmp)
                print "Current Score (from LAS)", current_score, "Previous best dev (from LAS)", best_dev

        # Checkpoint whenever the dev LAS improves.
        if current_score > best_dev:
            if data.seg:
                print "Exceed previous best f score:", best_dev
            else:
                print "Exceed previous best acc score:", best_dev
            model_name = data.model_dir +".model"
            print "Overwritting model to", model_name
            torch.save(model.state_dict(), model_name)
            best_dev = current_score
        else:
            print("sofar the best "+repr(best_dev))

        # Optional linear decay of per-task loss weights, clamped at 0.
        if data.HP_tasks_weight_decays:
            print "Updating the weights using linear weight decay. ",
            print "The old weights were", data.HP_tasks_weights,
            data.HP_tasks_weights =[max(weight-decay,0) 
                                    for weight,decay in zip(data.HP_tasks_weights, data.HP_tasks_weight_decays)]
            print ". The new weights are", data.HP_tasks_weights
            model.set_tasks_weights(data.HP_tasks_weights)
        gc.collect()
Exemplo n.º 10
0
def load_model_decode(data):
    """Restore the saved SeqModel weights and evaluate on the NER test split."""
    print("Load Model from file: ", data.model_dir)
    model = SeqModel(data)
    state_dict = torch.load(data.load_model_dir)
    model.load_state_dict(state_dict)
    evaluate(data.ner_2_test_idx, data, model)
Exemplo n.º 11
0
            config_dic.get("glove_path"),
            emb_dim=config_dic.get("word_emb_dim"))
        pretrain_embeddings = build_pretrain_embeddings(
            word2vec, word_dic, emb_dim=config_dic.get("word_emb_dim"))
    else:
        pretrain_embeddings = None

    # initialize Model
    seq_model = SeqModel(
        config_dic, len(word_dic.token2id), len(char_dic.token2id),
        [len(sw_dic.token2id) for sw_dic in sw_dicts.values()],
        len(label_dic.token2id), pretrain_embeddings)

    # load Model
    print(f"Load model {args.model} !")
    seq_model.load_state_dict(torch.load(args.model))

    word_dic.id2token = {v: k for k, v in word_dic.token2id.items()}
    char_dic.id2token = {v: k for k, v in char_dic.token2id.items()}
    for sp_key, sp in sps.items():
        sw_dicts[sp_key].id2token = {
            v: k
            for k, v in sw_dicts[sp_key].token2id.items()
        }

    ################ test predict check #####################
    print("============== Predict Check==========")
    true_seqs, pred_seqs, word_seqs, char_seqs = [], [], [], []
    right_token, total_token = 0, 0
    batch_size = config_dic.get("ner_batch_size")
    batch_steps = len(test_word_documents) // batch_size + 1
Exemplo n.º 12
0
    data.build_pretrain_emb()
    train(data)

elif opt.whattodo == 3:
    # step 3, evaluate on test data and output results in bioc format, one doc one file
    data = Data()
    data.read_config(opt.config)
    status = data.status.lower()
    data.HP_gpu = torch.cuda.is_available()
    data.load(data.dset_dir)
    data.read_config(opt.config)

    data.show_data_summary()
    data.fix_alphabet()
    model = SeqModel(data)
    model.load_state_dict(torch.load(data.load_model_dir))

    ner_output_dir = os.path.join(opt.testdata, "ner")
    if os.path.exists(ner_output_dir):
        shutil.rmtree(ner_output_dir)
        os.makedirs(ner_output_dir)
    else:
        os.makedirs(ner_output_dir)

    test_token, test_entity, _, test_name = preprocess.loadPreprocessData(
        opt.testdata)

    for i in tqdm(range(len(test_name))):
        doc_name = test_name[i]
        doc_token = test_token[i]
        doc_entity = test_entity[i]
Exemplo n.º 13
0
def error_pipeline(data, opt):
    """Run the NER + relation-extraction pipeline over the test set and write
    a per-document error report.

    For every test document it collects false negatives / false positives for
    entities and for relations, rendering each error with its sentence context
    ('[' and ']' bracket the entity spans), and writes the four error lists to
    ``error/<doc_name>.txt``.
    """
    test_token, test_entity, test_relation, test_name = preprocess.loadPreprocessData(
        data.test_dir)

    # evaluate on test data and output results in bioc format, one doc one file

    data.load(opt.data_file)
    # -1 disables sentence-length truncation for evaluation.
    data.MAX_SENTENCE_LENGTH = -1
    data.show_data_summary()

    data.fix_alphabet()
    # NER model + its word-sequence encoder, restored from opt.ner_dir.
    seq_model = SeqModel(data)
    seq_model.load_state_dict(
        torch.load(os.path.join(opt.ner_dir, 'model.pkl')))
    seq_wordseq = WordSequence(data, False, True, True, data.use_char)
    seq_wordseq.load_state_dict(
        torch.load(os.path.join(opt.ner_dir, 'wordseq.pkl')))

    # Relation classifier + its encoder, restored from opt.re_dir.
    classify_model = ClassifyModel(data)
    if torch.cuda.is_available():
        classify_model = classify_model.cuda(data.HP_gpu)
    classify_model.load_state_dict(
        torch.load(os.path.join(opt.re_dir, 'model.pkl')))
    classify_wordseq = WordSequence(data, True, False, True, False)
    classify_wordseq.load_state_dict(
        torch.load(os.path.join(opt.re_dir, 'wordseq.pkl')))

    error_dir = "error"
    if not os.path.exists(error_dir):
        os.makedirs(error_dir)

    for i in tqdm(range(len(test_name))):
        doc_name = test_name[i]
        doc_token = test_token[i]
        doc_entity = test_entity[i]
        doc_relation = test_relation[i]

        listEntityFP = []
        listEntityFN = []

        # Decode entities for this document with the NER model.
        ncrf_data = ner.generateDataForOneDoc(doc_token, doc_entity)

        data.raw_texts, data.raw_Ids = ner.read_instanceFromBuffer(
            ncrf_data, data.word_alphabet, data.char_alphabet,
            data.feature_alphabets, data.label_alphabet,
            data.number_normalized, data.MAX_SENTENCE_LENGTH)

        decode_results = ner.evaluateWhenTest(data, seq_wordseq, seq_model)

        entities = ner.translateNCRFPPintoEntities(doc_token, decode_results,
                                                   doc_name)

        # entity fn: gold entities with no prediction matching type+span
        for _, gold in doc_entity.iterrows():
            find = False
            for predict in entities:
                if gold['type'] == predict.type and gold[
                        'start'] == predict.start and gold[
                            'end'] == predict.end:
                    find = True
                    break
            if not find:
                context_token = doc_token[(
                    doc_token['sent_idx'] == gold['sent_idx'])]
                sequence = ""
                for _, token in context_token.iterrows():
                    if token['start'] == gold['start']:
                        sequence += "["
                    sequence += token['text']
                    if token['end'] == gold['end']:
                        sequence += "]"
                    sequence += " "
                listEntityFN.append("{} | {}\n{}\n".format(
                    gold['text'], gold['type'], sequence))

        # entity fp: predictions with no gold entity matching type+span
        for predict in entities:
            find = False
            for _, gold in doc_entity.iterrows():
                if gold['type'] == predict.type and gold[
                        'start'] == predict.start and gold[
                            'end'] == predict.end:
                    find = True
                    break
            if not find:
                context_token = doc_token[(
                    doc_token['sent_idx'] == predict.sent_idx)]
                sequence = ""
                for _, token in context_token.iterrows():
                    if token['start'] == predict.start:
                        sequence += "["
                    sequence += token['text']
                    if token['end'] == predict.end:
                        sequence += "]"
                    sequence += " "
                listEntityFP.append("{} | {}\n{}\n".format(
                    predict.text, predict.type, sequence))

        # Predict relations over the (predicted) entities of this document.
        test_X, test_other = relation_extraction.getRelationInstanceForOneDoc(
            doc_token, entities, doc_name, data)

        relations = relation_extraction.evaluateWhenTest(
            classify_wordseq, classify_model, test_X, data, test_other,
            data.re_feature_alphabets[data.re_feature_name2id['[RELATION]']])

        listRelationFP = []
        listRelationFN = []
        # relation fn: gold relations not predicted (argument order-agnostic)
        for _, gold in doc_relation.iterrows():
            find = False
            gold_entity1 = doc_entity[(
                doc_entity['id'] == gold['entity1_id'])].iloc[0]
            gold_entity2 = doc_entity[(
                doc_entity['id'] == gold['entity2_id'])].iloc[0]

            for predict in relations:
                predict_entity1 = predict.node1
                predict_entity2 = predict.node2

                # Match if relation type and both argument entities agree,
                # in either argument order.
                if gold['type'] == predict.type \
                    and gold_entity1['type']==predict_entity1.type and gold_entity1['start']==predict_entity1.start and gold_entity1['end']==predict_entity1.end \
                    and gold_entity2['type']==predict_entity2.type and gold_entity2['start']==predict_entity2.start and gold_entity2['end']==predict_entity2.end:
                    find = True
                    break
                elif gold['type'] == predict.type \
                    and gold_entity1['type']==predict_entity2.type and gold_entity1['start']==predict_entity2.start and gold_entity1['end']==predict_entity2.end \
                    and gold_entity2['type']==predict_entity1.type and gold_entity2['start']==predict_entity1.start and gold_entity2['end']==predict_entity1.end:
                    find = True
                    break

            if not find:
                # Render the context from the earlier entity's sentence to
                # the later entity's sentence, bracketing both spans.
                former = gold_entity1 if gold_entity1['start'] < gold_entity2[
                    'start'] else gold_entity2
                latter = gold_entity2 if gold_entity1['start'] < gold_entity2[
                    'start'] else gold_entity1
                context_token = doc_token[
                    (doc_token['sent_idx'] >= former['sent_idx'])
                    & (doc_token['sent_idx'] <= latter['sent_idx'])]

                # print("{}: {} | {}: {}".format(former['id'], former['text'], latter['id'], latter['text']))
                sequence = ""
                for _, token in context_token.iterrows():
                    if token['start'] == former['start'] or token[
                            'start'] == latter['start']:
                        sequence += "["
                    sequence += token['text']
                    if token['end'] == former['end'] or token['end'] == latter[
                            'end']:
                        sequence += "]"
                    sequence += " "

                listRelationFN.append("{} | {} | {}\n{}\n".format(
                    former['text'], latter['text'], gold['type'], sequence))

        # relation fp: predicted relations with no matching gold relation
        for predict in relations:
            predict_entity1 = predict.node1
            predict_entity2 = predict.node2
            find = False

            for _, gold in doc_relation.iterrows():

                gold_entity1 = doc_entity[(
                    doc_entity['id'] == gold['entity1_id'])].iloc[0]
                gold_entity2 = doc_entity[(
                    doc_entity['id'] == gold['entity2_id'])].iloc[0]

                if gold['type'] == predict.type \
                    and gold_entity1['type']==predict_entity1.type and gold_entity1['start']==predict_entity1.start and gold_entity1['end']==predict_entity1.end \
                    and gold_entity2['type']==predict_entity2.type and gold_entity2['start']==predict_entity2.start and gold_entity2['end']==predict_entity2.end:
                    find = True
                    break
                elif gold['type'] == predict.type \
                    and gold_entity1['type']==predict_entity2.type and gold_entity1['start']==predict_entity2.start and gold_entity1['end']==predict_entity2.end \
                    and gold_entity2['type']==predict_entity1.type and gold_entity2['start']==predict_entity1.start and gold_entity2['end']==predict_entity1.end:
                    find = True
                    break

            if not find:
                former = predict_entity1 if predict_entity1.start < predict_entity2.start else predict_entity2
                latter = predict_entity2 if predict_entity1.start < predict_entity2.start else predict_entity1
                context_token = doc_token[
                    (doc_token['sent_idx'] >= former.sent_idx)
                    & (doc_token['sent_idx'] <= latter.sent_idx)]

                sequence = ""
                for _, token in context_token.iterrows():
                    if token['start'] == former.start or token[
                            'start'] == latter.start:
                        sequence += "["
                    sequence += token['text']
                    if token['end'] == former.end or token['end'] == latter.end:
                        sequence += "]"
                    sequence += " "

                listRelationFP.append("{} | {} | {}\n{}\n".format(
                    former.text, latter.text, predict.type, sequence))

        # Write the four error lists for this document.
        with open(os.path.join(error_dir, doc_name + ".txt"), 'w') as fp:
            fp.write("######## ENTITY FN ERROR ##########\n\n")
            for item in listEntityFN:
                fp.write(item)
                fp.write('\n')

            fp.write("######## ENTITY FP ERROR ##########\n\n")
            for item in listEntityFP:
                fp.write(item)
                fp.write('\n')

            fp.write("######## RELATION FN ERROR ##########\n\n")
            for item in listRelationFN:
                fp.write(item)
                fp.write('\n')

            fp.write("######## RELATION FP ERROR ##########\n\n")
            for item in listRelationFP:
                fp.write(item)
                fp.write('\n')
Exemplo n.º 14
0
def train(data):
    """Train a SeqModel on ``data.train_Ids`` with per-epoch dev/test evaluation.

    Persists the data configuration to ``<model_dir>.dset``, optionally
    warm-starts from a pretrained snapshot, then runs up to
    ``data.HP_iteration`` epochs. The checkpoint with the best dev score is
    saved to disk together with a result summary file; training stops early
    after 10 epochs without dev improvement.

    Args:
        data: project configuration/corpus object carrying hyperparameters
            (``HP_*``), alphabets, train/dev/test Ids and model paths.

    Side effects: writes ``<model_dir>.dset``, a result text file, and the
    best model's ``state_dict``; prints progress to stdout. Exits the
    process on an illegal optimizer name or on loss explosion/NaN.
    """
    print("Training model...")
    data.show_data_summary()
    save_data_name = data.model_dir + ".dset"
    # Persist the data/alphabet configuration so decoding can reload it later.
    data.save(save_data_name)
    model = SeqModel(data)
    # Optionally warm-start from a pretrained snapshot; CRF and non-CRF
    # variants are stored under different file names.
    if data.use_crf:
        pretrain_model_path = os.path.join('model_snapshot', 'lan_crf.model')
    else:
        pretrain_model_path = os.path.join('model_snapshot', 'lan.model')
    if data.use_pre_trained_model and os.path.exists(pretrain_model_path):
        model.load_state_dict(torch.load(pretrain_model_path))
        print("load pretrained model success:%s" % pretrain_model_path)
    pytorch_total_params = sum(p.numel() for p in model.parameters())
    print("--------pytorch total params--------")
    print(pytorch_total_params)
    optimizer = None
    if data.optimizer.lower() == "sgd":
        # Only SGD filters for requires_grad (e.g. frozen embeddings).
        optimizer = optim.SGD(filter(lambda p: p.requires_grad, model.parameters()), lr=data.HP_lr,
                              momentum=data.HP_momentum, weight_decay=data.HP_l2)
    elif data.optimizer.lower() == "adagrad":
        optimizer = optim.Adagrad(model.parameters(), lr=data.HP_lr, weight_decay=data.HP_l2)
    elif data.optimizer.lower() == "adadelta":
        optimizer = optim.Adadelta(model.parameters(), lr=data.HP_lr, weight_decay=data.HP_l2)
    elif data.optimizer.lower() == "rmsprop":
        optimizer = optim.RMSprop(model.parameters(), lr=data.HP_lr, weight_decay=data.HP_l2)
    elif data.optimizer.lower() == "adam":
        optimizer = optim.Adam(model.parameters(), lr=data.HP_lr, weight_decay=data.HP_l2)
    else:
        print("Optimizer illegal: %s" % (data.optimizer))
        exit(1)
    best_dev = -10
    best_test = -10
    no_imprv_epoch = 0  # epochs since last dev improvement (early stopping)
    ## start training
    for idx in range(data.HP_iteration):
        epoch_start = time.time()
        temp_start = epoch_start
        print("Epoch: %s/%s" % (idx, data.HP_iteration))  # print (self.train_Ids)
        # Decay learning rate every 5 epochs.
        if idx % 5 == 0:
            optimizer = lr_decay(optimizer, idx, data.HP_lr_decay, data.HP_lr)
        instance_count = 0
        total_loss = 0
        ## set model in train mode
        model.train()
        model.zero_grad()
        # Build the list of (start, end) batch slices, including a final
        # partial batch when len(train_Ids) is not a multiple of batch size.
        start = 0
        end = start + data.HP_batch_size
        train_epochs = []
        while end <= len(data.train_Ids):
            train_epochs.append((start, end))
            start = end
            end = end + data.HP_batch_size
        if end > len(data.train_Ids) > start:
            train_epochs.append((start, len(data.train_Ids)))
        for sample_id, (start, end) in enumerate(train_epochs):
            instance = data.train_Ids[start: end]
            sample_loss = 0
            batch_word, batch_word_len, _, batch_word_recover, batch_label, mask, input_label_seq_tensor = batchify_with_label(
                instance, data.HP_gpu, data)
            instance_count += 1
            loss, tag_seq = model.neg_log_likelihood_loss(
                batch_word, batch_word_len, batch_label, mask, input_label_seq_tensor)
            sample_loss += loss.item()
            total_loss += loss.item()
            print("Epoch:%s,no_imprv_epoch:%s,Instance: %s" % (
                idx, no_imprv_epoch, sample_id))
            right, whole = predict_check(tag_seq, batch_label, mask, data.use_crf)
            # NOTE(review): assumes `whole` > 0 (non-empty mask) — verify upstream.
            print("               loss: %.4f, acc: %s/%s=%.4f" % (
                loss.item(), right, whole, (right + 0.) / whole * 100))

            # `x != x` is the canonical NaN test for floats.
            if sample_loss > 1e8 or sample_loss != sample_loss:
                print("ERROR: LOSS EXPLOSION (>1e8) ! PLEASE SET PROPER PARAMETERS AND STRUCTURE! EXIT....")
                exit(1)
            sys.stdout.flush()
            loss.backward()
            if data.whether_clip_grad:
                nn.utils.clip_grad_norm_(model.parameters(), data.clip_grad)
            optimizer.step()
            model.zero_grad()
            # break
        epoch_finish = time.time()
        if total_loss > 1e8 or total_loss != total_loss:
            print("ERROR: LOSS EXPLOSION (>1e8) ! PLEASE SET PROPER PARAMETERS AND STRUCTURE! EXIT....")
            exit(1)
        ## decode dev
        speed, acc, report, f_value, \
        ner_acc, ner_p, ner_r, ner_f = evaluate(data, model, "dev")
        dev_finish = time.time()
        dev_cost = dev_finish - epoch_finish

        if data.seg:
            current_score = f_value
            # current_score = sent_f1
            print("Dev: time: %.2fs, speed: %.2fst/s;\n"
                  "acc: %.4f, f_value: %.4f\n"
                  "ner_acc: %.4f, ner_p: %.4f, ner_r: %.4f, ner_f: %.4f\n"
                  "current f1:%.4f" % (
                      dev_cost, speed, acc, f_value,
                      ner_acc, ner_p, ner_r, ner_f, current_score
                  ))
        else:
            current_score = acc
            print("Dev: time: %.2fs speed: %.2fst/s; acc: %.4f" % (
                dev_cost, speed, acc))

        # ## decode test
        speed, acc, report, f_value, \
        ner_acc, ner_p, ner_r, ner_f = evaluate(data, model, "test")
        test_finish = time.time()
        # BUG FIX: was measured from epoch_finish (and stored in dev_cost),
        # which folded the dev evaluation time into the reported test time.
        test_cost = test_finish - dev_finish
        if data.seg:
            print("Test: time: %.2fs, speed: %.2fst/s;\n"
                  "acc: %.4f, f_value: %.4f\n"
                  "ner_acc: %.4f, ner_p: %.4f, ner_r: %.4f, ner_f: %.4f\n"
                  "current f1:%.4f" % (
                      test_cost, speed, acc, f_value,
                      ner_acc, ner_p, ner_r, ner_f, current_score
                  ))
        else:
            print("Test: time: %.2fs speed: %.2fst/s; acc: %.4f" % (
                test_cost, speed, acc))

        if current_score > best_dev:
            # New best dev score: record the test score at this epoch and
            # checkpoint the model (best_test is "test at best-dev epoch").
            if data.seg:
                best_test = f_value
                # best_test = sent_f1
                print("Exceed previous best avg f score:", best_dev)
            else:
                best_test = acc
                print("Exceed previous best acc score:", best_dev)
            if data.use_crf:
                result_file = "result_crf.txt"
                model_name = data.model_dir + "_crf.model"
            else:
                result_file = "result.txt"
                model_name = data.model_dir + ".model"
            with open(result_file, 'w', encoding='utf-8') as w:
                w.write(
                    "Save current best model in file:%s, iteration:%s/%s, best_test_f_score:%.5f\n"
                    "ner:\n"
                    "   precision:%.5f, recall:%.5f, f1_score:%.5f\n"
                    "%s\n\n" % (
                        model_name, idx, data.HP_iteration, best_test,
                        ner_p, ner_r, ner_f,
                        report))
            print("Save current best model in file:", model_name)
            torch.save(model.state_dict(), model_name)
            best_dev = current_score
            no_imprv_epoch = 0
        else:
            # early stop after 10 epochs without dev improvement
            no_imprv_epoch += 1
            if no_imprv_epoch >= 10:
                print("early stop")
                print("Current best f score in dev", best_dev)
                print("Current best f score in test", best_test)
                break

        if data.seg:
            print("Current best f score in dev", best_dev)
            print("Current best f score in test", best_test)
        else:
            print("Current best acc score in dev", best_dev)
            print("Current best acc score in test", best_test)
        # Release per-epoch tensors/graphs before the next iteration.
        gc.collect()