Example #1
    def save_checkpoint(self, model, optimizer, 
                        save_dir='/home/workspace/ImageClassifier', 
                        checkpoint_file='checkpoint.pth'):
        """
        Saves the neural network to a checkpoint file so it can be
        reloaded again without the need to re-train the network.
        INPUTS:
            1. Network model:           <model object>
            2. Gradient descent def:    <optimizer object>
            3. Checkpoint directory path:    <str>
            4. Checkpoint file name:         <str>
        RETURNS:
            None
        """
        # define the checkpoint dict for saving, loading and inference later
        checkpoint = {'arch' : self.arch,
                      'input_size' : self.input_size,
                      'hidden_size' : self.hidden_size,
                      'output_size' : self.output_size,
                      'classifier' : model.classifier,
                      'learning_rate' : self.learning_rate,
                      'epochs' : self.epochs,
                      'loss' : self.training_loss,
                      'class_to_idx' : self.class_to_idx,
                      'model_state_dict': model.state_dict(),
                      'optimizer_state_dict': optimizer.state_dict()}

        # save the model to the specified folder and file name
        checkpoint_path = save_dir + "/" + checkpoint_file
        try:
            torch.save(checkpoint, checkpoint_path)
        except Exception as error:
            print("The following error: {} occurred while saving the checkpoint file to: {}".format(error, checkpoint_path))
        else:
            print("Trained model saved to: {}".format(checkpoint_path))
Example #2
def saveModel(state, epoch, loss_epoch, diff_epoch, is_best, epoch_len):
    # note: is_best is accepted but unused in this variant (compare Example #3)
    torch.save({
        "epoch": epoch,
        "epoch_len": epoch_len,
        "state_dict": state,
        "epoch_avg_loss": float(loss_epoch) / epoch_len,
        "epoch_avg_diff": float(diff_epoch) / epoch_len
    }, MODEL_PATH)
Example #3
def saveModel(state, epoch, epoch_loss, epoch_diff, is_best):
    torch.save({
        "epoch": epoch,
        "state_dict": state,
        "epoch_avg_loss": epoch_loss,
        "epoch_avg_diff": epoch_diff
    }, MODEL_PATH)
    if is_best:
        shutil.copyfile(MODEL_PATH, MODEL_PATH_BEST)
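The is_best flag implements the usual best-model convention: every epoch overwrites MODEL_PATH, and the file is mirrored to MODEL_PATH_BEST only when validation improves. A minimal sketch of a calling loop, where train_one_epoch and validate are hypothetical helpers:

best_loss = float('inf')
for epoch in range(EPOCHS):
    epoch_loss = train_one_epoch(model, optimizer)  # hypothetical helper
    val_loss = validate(model)                      # hypothetical helper
    is_best = val_loss < best_loss
    best_loss = min(best_loss, val_loss)
    saveModel(model.state_dict(), epoch, epoch_loss, val_loss, is_best)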
Example #4
def saveModel(state, epoch, loss_epoch, diff_epoch, is_best, episode_idx):
    torch.save({
        "epoch": epoch,
        "episodes": episode_idx + 1,
        "state_dict": state,
        "epoch_avg_loss": float(loss_epoch) / (episode_idx + 1),
        "epoch_avg_diff": float(diff_epoch) / (episode_idx + 1)
    }, MODEL_PATH)
    if is_best:
        shutil.copyfile(MODEL_PATH, MODEL_PATH_BEST)
Example #5
def saveModel(state, epoch, loss_epoch, valid_epoch, is_best, episode_idx):
    torch.save({
        "epoch": epoch,
        "episodes": episode_idx + 1,
        "state_dict": state,
        "epoch_avg_loss": round(loss_epoch, 10),
        "epoch_avg_valid": round(valid_epoch, 10)
    }, MODEL_PATH)
    if is_best:
        shutil.copyfile(MODEL_PATH, MODEL_PATH_BEST)
Example #6
def saveModel(state, epoch, loss_epoch, diff_epoch, is_best, epoch_len):
    print("saving...")
    torch.save(
        {
            "epoch": epoch,
            "epoch_len": epoch_len,
            "state_dict": state,
            "epoch_avg_loss": float(loss_epoch) / epoch_len,
            "epoch_avg_diff": float(diff_epoch) / epoch_len
        }, MODEL_PATH)
    if is_best:
        shutil.copyfile(MODEL_PATH, MODEL_PATH_BEST)
    print("saved.")
Example #7
def train():
    train_dataset, test_dataset = load_normalized_datasets()

    net = SimpleConvNet()  # optionally: SimpleConvNet(debug=True)

    train_dataset_loader = DataLoader(train_dataset,
                                      batch_size=BATCH_SIZE,
                                      shuffle=True)
    test_dataset_loader = DataLoader(test_dataset, batch_size=TEST_BATCH_SIZE)

    for epoch in range(0, EPOCHS):
        train_step(net, train_dataset_loader, epoch)
        test_training_accuracy(net, test_dataset_loader, epoch)
        # pass the path so torch.save opens and closes the file itself
        torch.save(net.state_dict(), MODEL_FILE)
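A state dict saved this way is restored by building the same architecture and loading the weights in place; a minimal sketch, assuming the same SimpleConvNet and MODEL_FILE:

net = SimpleConvNet()
net.load_state_dict(torch.load(MODEL_FILE, map_location='cpu'))
net.eval()  # switch to inference mode before evaluating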
Example #8
    def train_epoch(self, save_model=False):
        self.model.train()
        loss_list = []
        with tqdm(enumerate(self.train_loader),
                  total=len(self.train_loader),
                  desc='train epochs') as progress_bar:
            for i_batch, batch in progress_bar:
                data = batch['data'].to(self.device)
                labels = batch['label'].to(self.device)

                output = self.model(data)
                acc = self.compute_acc(labels=labels, output=output)

                # zero out the parameter gradients
                self.optimizer.zero_grad()

                loss = self.loss_fcn(output, labels)
                loss.backward()
                self.optimizer.step()

                # append the scalar value so the autograd graph is not retained
                loss_list.append(loss.item())
                progress_bar.set_postfix(loss=loss.item(), acc=acc.item())

        if save_model:
            torch.save(self.model.state_dict(), self.model.path)

        return loss_list
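Since train_epoch returns the per-batch losses, a caller can track convergence across epochs; a minimal driving loop, where trainer and N_EPOCHS are assumed names:

for epoch in range(N_EPOCHS):
    losses = trainer.train_epoch(save_model=(epoch == N_EPOCHS - 1))
    print('epoch %d mean loss: %.4f' % (epoch, sum(losses) / len(losses)))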
Example #9
def save_to(model: nn.Module, path: str, ep: int):
    '''Save a model checkpoint to the given directory.

    Args:
        model(nn.Module): the model to save
        path(str): directory for checkpoint files
        ep(int): the current epoch
    '''
    os.makedirs(path, exist_ok=True)  # also creates missing parent directories
    ckpt_path = os.path.join(path, 'ep-%d.pth' % ep)
    torch.save({
        'epoch': ep,
        'model_state_dict': model.state_dict()
    }, ckpt_path)
    print('Model trained after %d epochs has been saved to: %s.' %
          (ep, ckpt_path))
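A matching loader for these per-epoch files might look like this minimal sketch (load_from is a hypothetical name, not from the original source):

def load_from(model: nn.Module, path: str, ep: int) -> int:
    '''Restore a checkpoint written by save_to; returns the stored epoch.'''
    ckpt_path = os.path.join(path, 'ep-%d.pth' % ep)
    checkpoint = torch.load(ckpt_path, map_location='cpu')
    model.load_state_dict(checkpoint['model_state_dict'])
    return checkpoint['epoch']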
Example #10
def main():
    model = MODEL_DISPATCHER[BASE_MODEL](pretrained=True)
    model.to(DEVICE)

    train_dataset = BengaliDatasetTrain(folds=TRAINING_FOLDS,
                                        img_height=IMG_HEIGHT,
                                        img_width=IMG_WIDTH,
                                        mean=MODEL_MEAN,
                                        std=MODEL_STD)

    train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                               batch_size=TRAIN_BATCH_SIZE,
                                               shuffle=True,
                                               num_workers=4)

    valid_dataset = BengaliDatasetTrain(folds=VALIDATION_FOLDS,
                                        img_height=IMG_HEIGHT,
                                        img_width=IMG_WIDTH,
                                        mean=MODEL_MEAN,
                                        std=MODEL_STD)

    valid_loader = torch.utils.data.DataLoader(dataset=valid_dataset,
                                               batch_size=TEST_BATCH_SIZE,
                                               shuffle=False,
                                               num_workers=4)

    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                           mode="max",
                                                           patience=5,
                                                           factor=0.3,
                                                           verbose=True)

    if torch.cuda.device_count() > 1:
        # note: DataParallel prefixes every state_dict key with 'module.'
        # (see the loading sketch after this example)
        model = nn.DataParallel(model)

    for epoch in range(0, EPOCHS):
        train(train_dataset, train_loader, model, optimizer)
        with torch.no_grad():
            val_score = evaluate(valid_dataset, valid_loader, model)
        scheduler.step(val_score)
        torch.save(model.state_dict(),
                   f"{BASE_MODEL}_fold{VALIDATION_FOLDS[0]}.bin")
Example #11
    def merge_checkpoints(checkpoint_paths, output_path):
        if not checkpoint_paths:
            raise ValueError(
                'Need to specify at least one checkpoint, none provided.')

        if len(checkpoint_paths) < 2:
            shutil.copyfile(checkpoint_paths[0], output_path)
            return

        def __sum(source, destination):
            for key, value in source.items():
                if isinstance(value, dict):
                    node = destination.setdefault(key, {})
                    __sum(value, node)
                else:
                    # sum only floating-point tensors; the old
                    # torch.add(input, alpha, other) form is deprecated
                    if isinstance(value, torch.Tensor) and value.is_floating_point():
                        destination[key] = destination[key] + value

            return destination

        def __divide(source, denominator):
            for key, value in source.items():
                if isinstance(value, dict):
                    node = source.setdefault(key, {})
                    __divide(node, denominator)
                else:
                    if isinstance(value, torch.Tensor) and value.is_floating_point():
                        source[key] = torch.div(value, denominator)

            return source

        output_checkpoint = torch.load(checkpoint_paths[0])

        for checkpoint_path in checkpoint_paths[1:]:
            checkpoint = torch.load(checkpoint_path)
            output_checkpoint = __sum(checkpoint, output_checkpoint)

        output_checkpoint = __divide(output_checkpoint, len(checkpoint_paths))

        torch.save(output_checkpoint, output_path)
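Averaging like this is typically applied to the last few checkpoints of a run. A minimal usage sketch, assuming the method is reachable as a plain function (it takes no self), that each file is a raw state_dict, and that the paths are hypothetical:

paths = ['ckpts/epoch8.pt', 'ckpts/epoch9.pt', 'ckpts/epoch10.pt']
merge_checkpoints(paths, 'ckpts/averaged.pt')
model.load_state_dict(torch.load('ckpts/averaged.pt', map_location='cpu'))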
Example #12
def build_model(train_dataset, dev_dataset, test_dataset, collate_fn, tag_idx,
                is_oov, embedding_matrix, model_save_path, plot_save_path):
    # init model
    model = BiLSTM_CRF(embedding_matrix, tag_idx)

    # Turn on cuda
    model = model.cuda()

    # verify model
    print(model)

    # optimize only the parameters that have requires_grad == True

    optimizer = optim.Adadelta(filter(lambda p: p.requires_grad,
                                      model.parameters()),
                               lr=cfg.LEARNING_RATE)
    # optimizer = optim.SGD(model.parameters(), lr=cfg.LEARNING_RATE, momentum=0.9)
    optimizer.zero_grad()
    model.zero_grad()

    # track the best validation score across epochs
    best_res_val_0 = 0.0
    best_epoch = 0
    dev_eval_history = []
    test_eval_history = []
    for epoch in range(cfg.MAX_EPOCH):
        print('-' * 40)
        print("EPOCH = {0}".format(epoch))
        print('-' * 40)

        random.seed(epoch)
        train_loader = DataLoader(train_dataset,
                                  batch_size=cfg.BATCH_SIZE,
                                  shuffle=cfg.RANDOM_TRAIN,
                                  num_workers=28,
                                  collate_fn=collate_fn)

        train_eval, model = train_a_epoch(name="train",
                                          data=train_loader,
                                          tag_idx=tag_idx,
                                          model=model,
                                          optimizer=optimizer)

        dev_loader = DataLoader(dev_dataset,
                                batch_size=cfg.BATCH_SIZE,
                                num_workers=28,
                                collate_fn=collate_fn)
        test_loader = DataLoader(test_dataset,
                                 batch_size=cfg.BATCH_SIZE,
                                 num_workers=28,
                                 collate_fn=collate_fn)

        dev_eval, _, _ = test("dev", dev_loader, tag_idx, model)
        test_eval, _, _ = test("test", test_loader, tag_idx, model)

        dev_eval.verify_results()
        test_eval.verify_results()
        dev_eval_history.append(dev_eval.results[cfg.BEST_MODEL_SELECTOR[0]])
        test_eval_history.append(test_eval.results['test_conll_f'])
        plot_curve(epoch, dev_eval_history, test_eval_history, "epochs",
                   "fscore", "epoch learning curve", plot_save_path)
        with open("plot_data.p", "wb") as plot_file:
            pickle.dump((dev_eval_history, test_eval_history), plot_file)
        # pick the best epoch
        if epoch < cfg.MIN_EPOCH_IMP or (
                dev_eval.results[cfg.BEST_MODEL_SELECTOR[0]] > best_res_val_0):
            best_epoch = epoch
            best_res_val_0 = dev_eval.results[cfg.BEST_MODEL_SELECTOR[0]]

            torch.save(model, model_save_path)

        print("current dev micro_score: {0}".format(
            dev_eval.results[cfg.BEST_MODEL_SELECTOR[0]]))
        print("current dev macro_score: {0}".format(
            dev_eval.results[cfg.BEST_MODEL_SELECTOR[1]]))
        print("best dev micro_score: {0}".format(best_res_val_0))
        print("best_epoch: {0}".format(str(best_epoch)))

        # stop early if there has been no improvement for MAX_EPOCH_IMP epochs
        if 0 < cfg.MAX_EPOCH_IMP <= (epoch - best_epoch):
            break
    print("Loading Best Model ...")

    model = torch.load(model_save_path)
    return model
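Note that torch.save(model, ...) pickles the whole model object, which ties the checkpoint to the exact class definition and import path. A minimal sketch of the more portable state_dict variant, assuming the BiLSTM_CRF constructor arguments are still available at load time:

# save only the weights
torch.save(model.state_dict(), model_save_path)

# later: rebuild the architecture, then restore the weights
model = BiLSTM_CRF(embedding_matrix, tag_idx).cuda()
model.load_state_dict(torch.load(model_save_path))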
Example #13
    def train_model(self, train_data, valid_data=None, save_path=None, save_epochs=5):
        multi_gpu = self._gpu_ids is not None and len(self._gpu_ids) > 1

        # set the mask to None; required when the same model is trained after a translation
        if multi_gpu:
            decoder = self._model.module.decoder
        else:
            decoder = self._model.decoder
        decoder.attn.applyMask(None)
        self._model.train()

        # define criterion of each GPU
        criterion = self._new_nmt_criterion(self._trg_dict.size())

        perplexity_history = []
        checkpoint_files = []
        valid_acc, valid_ppl = None, None

        try:
            for epoch in range(self.start_epoch, self.max_epochs + 1):
                self._logger.log(self._log_level, 'Training epoch %g... START' % epoch)
                start_time_epoch = time.time()

                #  (1) train for one epoch on the training set
                train_loss, train_acc = self._train_epoch(epoch, train_data, self._model, criterion, self._optim)
                train_ppl = math.exp(min(train_loss, 100))
                self._logger.log(self._log_level, 'trainEpoch Epoch %g Train loss: %g perplexity: %g accuracy: %g' % (
                    epoch, train_loss, train_ppl, (float(train_acc) * 100)))

                force_termination = False

                if self.min_perplexity_decrement > 0.:
                    perplexity_history.append(train_ppl)
                    force_termination = self._should_terminate(perplexity_history)

                if valid_data:
                    #  (2) evaluate on the validation set
                    valid_loss, valid_acc = self._evaluate(criterion, valid_data)
                    valid_ppl = math.exp(min(valid_loss, 100))
                    self._logger.log(self._log_level,
                                     'trainModel Epoch %g Validation loss: %g perplexity: %g accuracy: %g' % (
                                         epoch, valid_loss, valid_ppl, (float(valid_acc) * 100)))

                    # (3) update the learning rate
                    self._optim.updateLearningRate(valid_loss, epoch)

                    self._logger.log(self._log_level,
                                     "trainModel Epoch %g Decaying learning rate to %g" % (epoch, self._optim.lr))

                if save_path is not None and save_epochs > 0:
                    # keep at most save_epochs checkpoint files on disk
                    if len(checkpoint_files) > save_epochs - 1:
                        os.remove(checkpoint_files.pop(0))

                    model_state_dict = self._model.module.state_dict() if multi_gpu else self._model.state_dict()
                    model_state_dict = {k: v for k, v in model_state_dict.items() if 'generator' not in k}
                    generator_state_dict = self._model.generator.module.state_dict() if multi_gpu \
                        else self._model.generator.state_dict()

                    #  (4) drop a checkpoint
                    checkpoint = {
                        'model': model_state_dict,
                        'generator': generator_state_dict,
                        'dicts': {'src': self._src_dict, 'tgt': self._trg_dict},
                        'opt': copy.deepcopy(self._model_params.__dict__),
                        'epoch': epoch,
                        'optim': self._optim
                    }

                    if valid_acc is not None:
                        checkpoint_file = \
                            '%s_acc_%.2f_ppl_%.2f_e%d.pt' % (save_path, 100 * valid_acc, valid_ppl, epoch)
                    else:
                        checkpoint_file = '%s_acc_NA_ppl_NA_e%d.pt' % (save_path, epoch)

                    torch.save(checkpoint, checkpoint_file)
                    checkpoint_files.append(checkpoint_file)
                    self._logger.log(self._log_level,
                                     "Checkpoint for epoch %d saved to file %s" % (epoch, checkpoint_file))

                if force_termination:
                    break

                self._logger.log(self._log_level,
                                 'Training epoch %g... END %.2fs' % (epoch, time.time() - start_time_epoch))
        except KeyboardInterrupt:
            raise TrainingInterrupt(checkpoint=checkpoint_files[-1] if len(checkpoint_files) > 0 else None)

        return checkpoint_files[-1] if len(checkpoint_files) > 0 else None
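A checkpoint in this layout can be used to resume training or to run inference; a rough sketch, assuming model and model.generator are instances of the same classes used in training and checkpoint_file is one of the paths returned by train_model:

checkpoint = torch.load(checkpoint_file, map_location='cpu')
# generator keys were stripped from the model state dict before saving,
# so load non-strictly and restore the generator separately
model.load_state_dict(checkpoint['model'], strict=False)
model.generator.load_state_dict(checkpoint['generator'])
src_dict, trg_dict = checkpoint['dicts']['src'], checkpoint['dicts']['tgt']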
Example #14
def train_model():
    num_epochs = 200
    learning_rate = 0.000001
    batch_size = 10
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(device)
    model = ConvNet().to(device)
    """
    dataxy=get_data()
    with open("psd_score.txt","wb") as f:
        pickle.dump(dataxy,f)
    """
    dataxy = []
    with open('psd_score.txt', 'rb') as file:
        dataxy = pickle.load(file)

    x = np.array(dataxy[0])
    y = np.array(dataxy[1])
    train_x_origin, test_x_origin, train_y_origin, test_y_origin = train_test_split(
        x, y)

    traindataset = ds(train_x_origin, train_y_origin, len(train_x_origin))
    testdataset = ds(test_x_origin, test_y_origin, len(test_x_origin))
    loss = nn.MSELoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

    train_loader = DataLoader(dataset=traindataset,
                              batch_size=batch_size,
                              shuffle=True,
                              num_workers=0)
    test_loader = DataLoader(dataset=testdataset,
                             batch_size=batch_size,
                             shuffle=True,
                             num_workers=0)

    train_loss = []
    test_loss = []

    for epoch in tqdm(range(num_epochs)):
        cur_train_loss = []
        cur_test_loss = []
        for i, (data, labels) in enumerate(train_loader):
            # cast once, then move to the target device (avoids a CPU round-trip)
            data = data.reshape(-1, 1, 5, 31).float().to(device)
            labels = labels.float().to(device)
            y_pred = model(data)
            l = loss(y_pred, labels)
            l.backward()
            optimizer.step()
            optimizer.zero_grad()
            cur_train_loss.append(l.item())
        train_loss.append(np.array(cur_train_loss).mean())
        print("train loss ", np.array(cur_train_loss).mean())

        with torch.no_grad():  # no gradients needed for evaluation
            for i, (test_x, test_y) in enumerate(test_loader):
                test_x = test_x.reshape(-1, 1, 5, 31).float().to(device)
                test_y = test_y.float().to(device)
                out = model(test_x)
                l = loss(out, test_y)
                cur_test_loss.append(l.item())
        test_loss.append(np.array(cur_test_loss).mean())
        print("test loss ", np.array(cur_test_loss).mean())

    plt.plot(train_loss, label="train_loss")
    plt.plot(test_loss, label="test_loss")
    plt.legend()
    plt.show()
    predict = []
    torch.save(model.state_dict(), "model1.pt")
    model.eval()  # switch to inference mode for the final predictions
    with torch.no_grad():
        for i in range(len(test_x_origin)):
            sample = torch.from_numpy(
                test_x_origin[i]).reshape(-1, 1, 5, 31).float().to(device)
            predict.append(model(sample).item())
    return test_y_origin, predict
Example #15
                            EPOCHS_a, lr_a, clip)
# Printing Learning Curves
learn_curves(valid_losses, train_losses, "AudioRNN_Loss")

# save model metadata
audio_rnn_metadata = {"accuracy": audio_accuracies,
                      "valid_loss": valid_losses,
                      "train_loss": train_losses}

# save metadata dictionaries
pickle_save("audio_rnn.p", audio_rnn_metadata)
'''
####################################################################
# Save/Load models
####################################################################
'''
# SAVING MODE
# save model dictionary to PATH
rnn_path = os.path.abspath("rnn_metadata")
TEXT_RNN_PATH = os.path.join(rnn_path, "text_rnn_model.pt")
AUDIO_RNN_PATH = os.path.join(rnn_path, "audio_rnn_model.pt")


# always transfer to CPU so the checkpoint loads on machines without a GPU
model = text_rnn.to("cpu")
torch.save(model.state_dict(), TEXT_RNN_PATH)

model = audio_rnn.to("cpu")
torch.save(model.state_dict(), AUDIO_RNN_PATH)
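Moving both models to the CPU before saving keeps the checkpoints loadable anywhere; the same portability can be had at load time with map_location, as in this minimal sketch (assuming a compatible text_rnn instance already exists):

state = torch.load(TEXT_RNN_PATH, map_location='cpu')  # works even if saved on a GPU
text_rnn.load_state_dict(state)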

Example #16
        "accuracy": text_accuracies,
        "valid_loss": valid_losses,
        "train_loss": train_losses
    }

    # save metadata dict
    pickle_save(file_prefix + "text_rnn.p", text_rnn_metadata)

    # SAVING MODE
    # save model dictionary to PATH
    rnn_path = os.path.abspath("pretrained_models")
    TEXT_RNN_PATH = os.path.join(rnn_path, file_prefix + "text_rnn_model.pt")

    # always transfer to CPU so the checkpoint loads on machines without a GPU
    model = text_rnn.to("cpu")
    torch.save(model.state_dict(), TEXT_RNN_PATH)
elif len(sys.argv) > 2 and sys.argv[2] == '-pre_audio':
    print(' ----- Pretrain Audio classifier ----- ')
    ####################################################################
    # Training Audio RNN Model
    ####################################################################
    EPOCHS_a = 150
    lr_a = 0.0001
    clip = 5.0
    data_loaders = (train_loader, valid_loader, test_loader)

    audio_rnn, audio_accuracies, valid_losses, train_losses\
        = audio_rnn_pretraining(data_loaders,
                                audio_hyperparameters,
                                EPOCHS_a, lr_a, clip)
    # Printing Learning Curves
Example #17
def train_model():
    num_epochs = 100
    learning_rate = 0.00001
    batch_size = 5
    channel_size = 3
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(device)
    model = ConvNet().to(device)
    #model=AlexNet().to(device)
    """
    dataxy=get_data()
    with open("psd_score.txt","wb") as f:
        pickle.dump(dataxy,f)
    """
    dataxy = []
    with open(
            "D:/code/code/eegemotion/git/model/corr_classify/corr_score_classify.txt",
            "rb") as f:
        dataxy = pickle.load(f)
    x = np.array(dataxy[0])
    y = np.array(dataxy[1])
    train_x_origin, test_x_origin, train_y_origin, test_y_origin = train_test_split(
        x, y)

    traindataset = ds(train_x_origin, train_y_origin, len(train_x_origin))
    testdataset = ds(test_x_origin, test_y_origin, len(test_x_origin))
    loss = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

    train_loader = DataLoader(dataset=traindataset,
                              batch_size=batch_size,
                              shuffle=True,
                              num_workers=0)
    test_loader = DataLoader(dataset=testdataset,
                             batch_size=batch_size,
                             shuffle=True,
                             num_workers=0)

    train_loss = []
    test_loss = []
    train_accuracy = []
    test_accuracy = []
    for epoch in tqdm(range(num_epochs)):
        cur_train_loss = []
        cur_test_loss = []
        right, total = 0, 0
        for i, (data, labels) in enumerate(train_loader):
            # cast once, then move to the target device (avoids a CPU round-trip)
            data = data.reshape(-1, channel_size, 31, 31).float().to(device)
            y_pred = model(data)
            labels = labels.to(device).long()
            l = loss(y_pred, labels)
            l.backward()
            optimizer.step()
            optimizer.zero_grad()
            cur_train_loss.append(l.item())
            right += (torch.argmax(y_pred, dim=1) == labels).sum().item()
            total += labels.size(0)  # the last batch may be smaller than batch_size

        train_loss.append(np.array(cur_train_loss).mean())
        train_accuracy.append(right / total)
        print("train loss ",
              np.array(cur_train_loss).mean(), " accuracy: ", right / total)

        right, total = 0, 0
        with torch.no_grad():  # no gradients needed for evaluation
            for i, (test_x, test_y) in enumerate(test_loader):
                test_x = test_x.reshape(-1, channel_size, 31,
                                        31).float().to(device)
                test_y = test_y.to(device).long()
                out = model(test_x)
                l = loss(out, test_y)
                cur_test_loss.append(l.item())

                right += (torch.argmax(out, dim=1) == test_y).sum().item()
                total += test_y.size(0)  # the last batch may be smaller

        test_loss.append(np.array(cur_test_loss).mean())
        test_accuracy.append(right / total)
        print("test loss ",
              np.array(cur_test_loss).mean(), " accuracy: ", right / total)

    plt.plot(train_loss, label="train_loss")
    plt.plot(test_loss, label="test_loss")
    plt.legend()
    plt.show()

    plt.plot(train_accuracy, label="train_accuracy")
    plt.plot(test_accuracy, label="test_accuracy")
    plt.legend()
    plt.show()

    torch.save(model.state_dict(),
               "D:/code/code/eegemotion/git/model/corr_classify/model.pt")
Example #18
def build_model(train_dataset, dev_dataset, test_dataset, collate_fn, tag_idx,
                is_oov, embedding_matrix, model_save_path, plot_save_path):
    # init model
    model = MultiBatchSeqNet(embedding_matrix,
                             batch_size=cfg.BATCH_SIZE,
                             isCrossEnt=False,
                             char_level=cfg.CHAR_LEVEL,
                             pos_feat=cfg.POS_FEATURE,
                             dep_rel_feat=cfg.DEP_LABEL_FEATURE,
                             dep_word_feat=cfg.DEP_WORD_FEATURE)

    # Turn on cuda
    model = model.cuda()

    # verify model
    print(model)

    # optimize only the parameters that have requires_grad == True

    optimizer = optim.Adadelta(filter(lambda p: p.requires_grad,
                                      model.parameters()),
                               lr=cfg.LEARNING_RATE)
    # optimizer = optim.SGD(model.parameters(), lr=cfg.LEARNING_RATE, momentum=0.9)
    optimizer.zero_grad()
    model.zero_grad()

    # init loss criteria (size_average was deprecated; reduction='sum' is equivalent)
    seq_criterion = nn.NLLLoss(reduction='sum')
    lm_f_criterion = nn.NLLLoss(reduction='sum')
    lm_b_criterion = nn.NLLLoss(reduction='sum')
    att_loss = nn.CosineEmbeddingLoss(margin=1)
    best_res_val_0 = 0.0
    best_res_val_1 = 0.0
    best_epoch = 0
    dev_eval_history = []
    test_eval_history = []
    for epoch in range(cfg.MAX_EPOCH):
        print('-' * 40)
        print("EPOCH = {0}".format(epoch))
        print('-' * 40)

        random.seed(epoch)
        train_loader = DataLoader(train_dataset,
                                  batch_size=cfg.BATCH_SIZE,
                                  shuffle=cfg.RANDOM_TRAIN,
                                  num_workers=28,
                                  collate_fn=collate_fn)

        train_eval, model = train_a_epoch(name="train",
                                          data=train_loader,
                                          tag_idx=tag_idx,
                                          is_oov=is_oov,
                                          model=model,
                                          optimizer=optimizer,
                                          seq_criterion=seq_criterion,
                                          lm_f_criterion=lm_f_criterion,
                                          lm_b_criterion=lm_b_criterion,
                                          att_loss=att_loss,
                                          gamma=cfg.LM_GAMMA)

        dev_loader = DataLoader(dev_dataset,
                                batch_size=cfg.BATCH_SIZE,
                                num_workers=28,
                                collate_fn=collate_fn)
        test_loader = DataLoader(test_dataset,
                                 batch_size=cfg.BATCH_SIZE,
                                 num_workers=28,
                                 collate_fn=collate_fn)

        dev_eval, _, _, _ = test("dev", dev_loader, tag_idx, model)
        test_eval, _, _, _ = test("test", test_loader, tag_idx, model)

        dev_eval.verify_results()
        test_eval.verify_results()
        dev_eval_history.append(dev_eval.results[cfg.BEST_MODEL_SELECTOR[0]])
        test_eval_history.append(test_eval.results['test_conll_f'])
        plot_curve(epoch, dev_eval_history, test_eval_history, "epochs",
                   "fscore", "epoch learning curve", plot_save_path)
        with open("plot_data.p", "wb") as plot_file:
            pickle.dump((dev_eval_history, test_eval_history), plot_file)
        # pick the best epoch
        if epoch < cfg.MIN_EPOCH_IMP or (
                dev_eval.results[cfg.BEST_MODEL_SELECTOR[0]] > best_res_val_0):
            best_epoch = epoch
            best_res_val_0 = dev_eval.results[cfg.BEST_MODEL_SELECTOR[0]]

            torch.save(model, model_save_path)

        print("current dev micro_score: {0}".format(
            dev_eval.results[cfg.BEST_MODEL_SELECTOR[0]]))
        print("current dev macro_score: {0}".format(
            dev_eval.results[cfg.BEST_MODEL_SELECTOR[1]]))
        print("best dev micro_score: {0}".format(best_res_val_0))
        print("best_epoch: {0}".format(str(best_epoch)))

        # stop early if there has been no improvement for MAX_EPOCH_IMP epochs
        if 0 < cfg.MAX_EPOCH_IMP <= (epoch - best_epoch):
            break
    print("Loading Best Model ...")

    model = torch.load(model_save_path)
    return model