def fit(x,y,z,dev_x,dev_y,dev_z,lr,decay_weight,n_epochs=n_epochs):
        train_K = np.load(ROOT_PATH+'/mendelian_precomp/{}_train_K.npy'.format(sname))
        dev_K = np.load(ROOT_PATH+'/mendelian_precomp/{}_dev_K.npy'.format(sname))
        train_K = torch.from_numpy(train_K).float()
        dev_K = torch.from_numpy(dev_K).float()

        n_data = x.shape[0]
        net = Net(x.shape[1])
        es = EarlyStopping(patience=5)
        optimizer = optim.Adam(list(net.parameters()), lr=lr, weight_decay=decay_weight)

        for epoch in range(n_epochs):
            permutation = torch.randperm(n_data)

            for i in range(0, n_data, batch_size):
                indices = permutation[i:i+batch_size]
                batch_x, batch_y = x[indices], y[indices]

                # training loop
                def closure():
                    optimizer.zero_grad()
                    pred_y = net(batch_x)
                    loss = my_loss(pred_y, batch_y, indices, train_K)
                    loss.backward()
                    return loss

                optimizer.step(closure)  # Does the update
            if epoch % 5 == 0 and epoch >= 5 and dev_x is not None: # 5, 10 for small # 5,50 for large 
                g_pred = net(test.x.float())
                test_err = ((g_pred-test.g.float())**2).mean()
                dev_err = my_loss(net(dev_x), dev_y, None, dev_K)
                print('test',test_err,'dev',dev_err)
                if es.step(dev_err):
                    break
        return es.best, epoch, net
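Note: the EarlyStopping helper is not shown in these examples. A minimal sketch that is consistent with how it is called here (es.step(metric) returns True once the metric has stopped improving for `patience` checks, and es.best keeps the best value seen) might look like the following; the real class may differ.

class EarlyStopping:
    """Minimal early-stopping tracker (illustrative sketch, not the original implementation)."""

    def __init__(self, patience=5, mode='min'):
        self.patience = patience
        self.mode = mode
        self.best = None
        self.num_bad_steps = 0

    def step(self, metric):
        metric = float(metric)
        if self.best is None:
            self.best = metric
            return False
        improved = metric < self.best if self.mode == 'min' else metric > self.best
        if improved:
            self.best = metric
            self.num_bad_steps = 0
        else:
            self.num_bad_steps += 1
        # Tell the caller to stop once patience is exhausted.
        return self.num_bad_steps > self.patience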
Example #2
    def fit(x,y,z,dev_x,dev_y,dev_z,a,lr,decay_weight, ax, y_axz, w_samples, n_epochs=n_epochs):
        if 'mnist' in sname:
            train_K = torch.eye(x.shape[0])
        else:
            train_K = (kernel(z, None, a, 1)+kernel(z, None, a/10, 1)+kernel(z, None, a*10, 1))/3
        if dev_z is not None:
            if 'mnist' in sname:
                dev_K = torch.eye(dev_x.shape[0])  # identity kernel sized to the dev set
            else:
                dev_K = (kernel(dev_z, None, a, 1)+kernel(dev_z, None, a/10, 1)+kernel(dev_z, None, a*10, 1))/3
        n_data = x.shape[0]
        net = FCNN(x.shape[1]) if sname not in ['mnist_x', 'mnist_xz'] else CNN()
        es = EarlyStopping(patience=10)  # 10 for small
        optimizer = optim.Adam(list(net.parameters()), lr=lr, weight_decay=decay_weight)

        test_errs, dev_errs, exp_errs, mse_s = [], [], [], []

        for epoch in range(n_epochs):
            permutation = torch.randperm(n_data)

            for i in range(0, n_data, batch_size):
                indices = permutation[i:i+batch_size]
                batch_x, batch_y = x[indices], y[indices]

                # training loop
                def closure():
                    optimizer.zero_grad()
                    pred_y = net(batch_x)
                    loss = my_loss(pred_y, batch_y, indices, train_K)
                    loss.backward()
                    return loss

                optimizer.step(closure)  # Does the update
            if epoch % 5 == 0 and epoch >= 50 and dev_x is not None:  # 5, 10 for small # 5,50 for large
                g_pred = net(test_X)  # evaluate on the held-out test inputs
                test_err = ((g_pred-test_Y)**2).mean()  # plain (unweighted) MSE: measures agreement between predictions and labels
                if epoch == 50 and 'mnist' in sname:
                    if z.shape[1] > 100:
                        train_K = np.load(ROOT_PATH+'/mnist_precomp/{}_train_K0.npy'.format(sname))
                        train_K = torch.from_numpy(train_K).float()  # convert the precomputed distances to a tensor
                        train_K = (torch.exp(-train_K/a**2/2)+torch.exp(-train_K/a**2*50)+torch.exp(-train_K/a**2/200))/3
                        dev_K = np.load(ROOT_PATH+'/mnist_precomp/{}_dev_K0.npy'.format(sname))
                        dev_K = torch.from_numpy(dev_K).float()
                        dev_K = (torch.exp(-dev_K/a**2/2)+torch.exp(-dev_K/a**2*50)+torch.exp(-dev_K/a**2/200))/3
                    else:
                        train_K = (kernel(z, None, a, 1)+kernel(z, None, a/10, 1)+kernel(z, None, a*10, 1))/3
                        dev_K = (kernel(dev_z, None, a, 1)+kernel(dev_z, None, a/10, 1)+kernel(dev_z, None, a*10, 1))/3

                dev_err = my_loss(net(dev_x), dev_y, None, dev_K)
                err_in_expectation, mse = conditional_expected_loss(net=net, ax=ax, w_samples=w_samples, y_samples=y_samples, y_axz=y_axz, x_on=False)
                print('test', test_err, 'dev', dev_err, 'err_in_expectation', err_in_expectation, 'mse: ', mse)
                test_errs.append(test_err)
                dev_errs.append(dev_err)
                exp_errs.append(err_in_expectation)
                mse_s.append(mse)

                if es.step(dev_err):
                    break
            losses = {'test': test_errs, 'dev': dev_errs, 'exp': exp_errs, 'mse_': mse_s}
        return es.best, epoch, net, losses
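Note: my_loss is defined outside these snippets. Judging only from its call pattern my_loss(pred_y, batch_y, indices, train_K), a plausible, purely illustrative form is a kernel-reweighted squared error in which the residuals are contracted with the rows and columns of K selected by the batch indices (indices=None meaning the full matrix, as in the dev evaluation):

import torch

def my_loss(pred_y, batch_y, indices, K):
    # Hypothetical kernel-weighted objective: r^T K r / n^2 over the batch residuals.
    residual = pred_y.reshape(-1) - batch_y.reshape(-1)
    if indices is not None:
        K = K[indices][:, indices]
    n = residual.shape[0]
    return residual @ K @ residual / (n ** 2)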
Example #3
def train(opt):
    if torch.cuda.is_available():
        logger.info("%s", torch.cuda.get_device_name(0))

    # set etc
    torch.autograd.set_detect_anomaly(True)

    # set config
    config = load_config(opt)
    config['opt'] = opt
    logger.info("%s", config)
 
    # set path
    set_path(config)
  
    # prepare train, valid dataset
    train_loader, valid_loader = prepare_datasets(config)

    with temp_seed(opt.seed):
        # prepare model
        model = prepare_model(config)

        # create optimizer, scheduler, summary writer, scaler
        optimizer, scheduler, writer, scaler = prepare_osws(config, model, train_loader)
        config['optimizer'] = optimizer
        config['scheduler'] = scheduler
        config['writer'] = writer
        config['scaler'] = scaler

        # training
        early_stopping = EarlyStopping(logger, patience=opt.patience, measure='f1', verbose=1)
        local_worse_steps = 0
        prev_eval_f1 = -float('inf')
        best_eval_f1 = -float('inf')
        for epoch_i in range(opt.epoch):
            epoch_st_time = time.time()
            eval_loss, eval_f1 = train_epoch(model, config, train_loader, valid_loader, epoch_i)
            # early stopping
            if early_stopping.validate(eval_f1, measure='f1'): break
            if eval_f1 > best_eval_f1:
                best_eval_f1 = eval_f1
                if opt.save_path:
                    logger.info("[Best model saved] : {:10.6f}".format(best_eval_f1))
                    save_model(config, model)
                    # save finetuned bert model/config/tokenizer
                    if config['emb_class'] in ['bert', 'distilbert', 'albert', 'roberta', 'bart', 'electra']:
                        if not os.path.exists(opt.bert_output_dir):
                            os.makedirs(opt.bert_output_dir)
                        model.bert_tokenizer.save_pretrained(opt.bert_output_dir)
                        model.bert_model.save_pretrained(opt.bert_output_dir)
                early_stopping.reset(best_eval_f1)
            early_stopping.status()
            # begin: scheduling, apply learning-rate decay when the measure (e.g. loss/F1) keeps getting worse for the configured number of decay epoch steps.
            if prev_eval_f1 >= eval_f1:
                local_worse_steps += 1
            else:
                local_worse_steps = 0
            logger.info('Scheduler: local_worse_steps / opt.lr_decay_steps = %d / %d' % (local_worse_steps, opt.lr_decay_steps))
            if not opt.use_transformers_optimizer and \
               epoch_i > opt.warmup_epoch and \
               (local_worse_steps >= opt.lr_decay_steps or early_stopping.step() > opt.lr_decay_steps):
                scheduler.step()
                local_worse_steps = 0
            prev_eval_f1 = eval_f1
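Note: the EarlyStopping used by this train() loop has a different interface from the one in the fit() examples: validate() signals when to stop, reset() re-arms it on a new best score, status() logs progress, and step() exposes how many evaluations have passed since the best score (compared against opt.lr_decay_steps above). A rough sketch of such a class, written purely as an assumption about the surrounding code:

class EarlyStopping:
    """Illustrative measure-based early stopping matching the calls in train()."""

    def __init__(self, logger, patience=7, measure='f1', verbose=0):
        self.logger = logger
        self.patience = patience
        self.measure = measure
        self.verbose = verbose
        self.best = -float('inf')
        self.bad_steps = 0

    def validate(self, value, measure='f1'):
        # Return True when the measure has not improved for `patience` evaluations.
        if value > self.best:
            self.best = value
            self.bad_steps = 0
            return False
        self.bad_steps += 1
        return self.bad_steps > self.patience

    def reset(self, best):
        # Re-arm the counter after the caller has saved a new best model.
        self.best = best
        self.bad_steps = 0

    def step(self):
        # Number of evaluations since the last improvement.
        return self.bad_steps

    def status(self):
        if self.verbose:
            self.logger.info("early stopping: best=%s, bad steps=%d", self.best, self.bad_steps)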
Example #4
def train_pytorch(**kwargs):
    CHECKPOINT_PATH.mkdir(parents=True, exist_ok=True)

    # Calling logging.basicConfig attaches a root logger to the process, so that log records from loggers in
    # other modules show up on the console (child loggers propagate to the root logger, which writes them out
    # through its own StreamHandler). Without logging.basicConfig, every child logger would need its own
    # StreamHandler, which is tedious.
    logging.basicConfig(
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        level=logging.INFO)
    formater = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    # Print logs to the terminal.
    # stream_handler = logging.StreamHandler()
    # stream_handler.setFormatter(formater)
    # # Save logs to file.
    log_path = CHECKPOINT_PATH / 'train.log'
    file_handler = logging.FileHandler(filename=log_path,
                                       mode='w',
                                       encoding='utf-8')
    file_handler.setFormatter(formater)

    # logger.addHandler(stream_handler)
    logger.addHandler(file_handler)

    inputs = kwargs['inputs']
    outputs = kwargs['outputs']
    # test_inputs = kwargs['test_inputs']

    gkf = GroupKFold(n_splits=kwargs['n_splits']).split(X=df_train.q2,
                                                        groups=df_train.id)

    # sss = StratifiedShuffleSplit(n_splits=kwargs['n_splits'], test_size=0.2, random_state=RANDOM_SEED).split(X=df_train.q2,
    # y=df_train.label)
    # skf = StratifiedKFold(n_splits=kwargs['n_splits'], shuffle=True, random_state=RANDOM_SEED).split(X=df_train.q2, y=outputs)

    # oof = np.zeros((len(df_train),1))
    # all_pred = np.zeros(shape=(len(df_train), 2))     # classification task
    all_pred = np.zeros(shape=(len(df_train)))  # regression task
    all_true = np.zeros(shape=(len(df_train)))
    for fold, (train_idx, valid_idx) in enumerate(gkf):
        # for fold, (train_idx, valid_idx) in enumerate(skf):
        logger.info(f'Fold No. {fold}')
        train_inputs = [inputs[i][train_idx] for i in range(len(inputs))]
        train_outputs = outputs[train_idx]

        train_qa_id = df_train[['id', 'id_sub', 'label']].iloc[train_idx]

        # ===============================================================
        # Sample augmentation via back-translation (augment positive samples only)
        # Get the (id, id_sub) pairs of the training-set samples
        # train_id_set = set([f'{x[0]},{x[1]}' for x in df_train.iloc[train_idx][['id', 'id_sub']].to_numpy()])
        # # From the augmented samples, pick out the ones that appear in the training set
        # mask = df_train_ex[['id', 'id_sub']].apply(lambda x: f'{x["id"]},{x["id_sub"]}' in train_id_set, axis=1)
        # df_train_fold = df_train_ex[mask]

        # Get the (id, id_sub) pairs of the training-set samples
        # train_id_set = set([f'{x[0]},{x[1]}' for x in df_train.iloc[train_idx][['id', 'id_sub']].to_numpy()])
        # # From the augmented samples, pick out the ones that appear in the training set
        # mask = df_train_aug[['id', 'id_sub']].apply(lambda x: f'{x["id"]},{x["id_sub"]}' in train_id_set, axis=1)
        # df_train_fold = df_train_aug[mask]
        # train_inputs, train_inputs_overlap = compute_input_arrays(df_train_fold, input_categories, tokenizer, MAX_SEQUENCE_LENGTH)
        # train_outputs = compute_output_arrays(df_train_fold, output_categories)

        # df_train_fold = df_train.iloc[train_idx]
        # train_q_aug = []
        # for x in tqdm(df_train_fold['q1']):
        # train_q_aug.append(eda_one(x))
        # train_a_aug = []
        # for x in tqdm(df_train_fold['q2']):
        # train_a_aug.append(eda_one(x))
        # df_train_fold = pd.DataFrame(data={'q1': train_q_aug, 'q2': train_a_aug})

        # train_inputs, train_inputs_overlap = compute_input_arrays(df_train_fold, input_categories, tokenizer, MAX_SEQUENCE_LENGTH)
        # train_outputs = compute_output_arrays(df_train_fold, output_categories)

        # Add the Anjuke data to the training set
        # train_inputs = [np.concatenate([train_inputs[i], anjuke_inputs[i]], axis=0) for i in range(len(inputs))]
        # train_outputs = np.concatenate([train_outputs, anjuke_outputs], axis=0)
        # ================================================================

        valid_inputs = [inputs[i][valid_idx] for i in range(len(inputs))]
        valid_outputs = outputs[valid_idx]
        valid_qa_id = df_train[['id', 'id_sub', 'label']].iloc[valid_idx]

        train_set = HouseDataset(train_inputs, train_outputs, train_qa_id)
        valid_set = HouseDataset(valid_inputs, valid_outputs, valid_qa_id)
        # test_set = HouseDataset(test_inputs, np.zeros_like(test_inputs[0]))  # the test set has no labels

        logger.info('Train set size: {}, valid set size {}'.format(
            len(train_set), len(valid_set)))

        train_loader = DataLoader(
            train_set,
            batch_size=kwargs['batch_size'],
            #   shuffle=True  # set to True when training as classification
        )

        valid_loader = DataLoader(valid_set,
                                  batch_size=kwargs['valid_batch_size'])

        # test_loader = DataLoader(test_set,
        # batch_size=512)

        device = torch.device(f"cuda:{kwargs['device']}")
        # model = BertForHouseQA().cuda(device)
        model = torch.nn.DataParallel(BertForHouseQA(),
                                      device_ids=[1, 2, 3]).cuda(device)

        # Find the checkpoint file with the highest score and load it
        # best_score_ = max([float(x.name[len(MODEL_NAME)+1:-3]) for x in CHECKPOINT_PATH.iterdir() if x.is_file()])
        # best_ckpt_path = CHECKPOINT_PATH/f'{MODEL_NAME}_{best_score_}.pt'
        # ckpt = torch.load(best_ckpt_path)
        # model.load_state_dict(ckpt['model_state_dict'])

        # Load the point-wise model and continue training it pair-wise,
        # or load the Anjuke model.
        # =====================================================
        # org_model = BertForHouseQA().cuda(device)
        # time_str = '2020-11-18-12:49:44'
        # org_ckpt_path = DATA_PATH / f"model_record/{MODEL_NAME}/{time_str}"
        # org_ckpt_path = DATA_PATH / f'anjuke/model_record/{MODEL_NAME}/{time_str}'
        # org_ckpt_paths = [x for x in org_ckpt_path.iterdir() if x.is_file() and x.suffix == '.pt']
        # prefix = f'{MODEL_NAME}_'
        # best_ckpt_path = [x for x in org_ckpt_paths if str(x.name).startswith(prefix)][0]
        # ckpt = torch.load(best_ckpt_path)
        # org_model.load_state_dict(ckpt['model_state_dict'])

        # model = BertClsToReg(org_model).cuda(device)
        # model = BertClsToCls(org_model).cuda(device)
        # =====================================================

        # List all modules inside the model.
        logger.info('Model modules:')
        for i, m in enumerate(model.named_children()):
            logger.info('{} -> {}'.format(i, m))

        # # Get the number of total parameters.
        # total_params = sum(p.numel() for p in model.parameters())
        # trainable_params = sum(p.numel()
        #                     for p in model.parameters() if p.requires_grad)

        # logger.info("Total params: {:,}".format(total_params))
        # logger.info("Trainable params: {:,}".format(trainable_params))

        # Use hinge loss
        criterion = torch.nn.MarginRankingLoss(margin=1.0)
        # criterion = torch.nn.MSELoss()
        # criterion = torch.nn.CrossEntropyLoss()
        # criterion_scl = SupConLoss(temperature=0.1, device=device)

        # optimizer = torch.optim.Adam(
        # model.parameters(), lr=kwargs['lr'], weight_decay=kwargs['weight_decay'])
        optimizer = transformers.AdamW(model.parameters(),
                                       lr=kwargs['lr'],
                                       weight_decay=kwargs['weight_decay'])
        logger.info('Optimizer:')
        logger.info(optimizer)
        # scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
        #    mode='min',
        #    patience=int(kwargs['patience']/2),
        #    verbose=True
        #    )
        scheduler = transformers.get_cosine_schedule_with_warmup(
            optimizer, num_warmup_steps=4, num_training_steps=kwargs['epoch'])
        # best_score = 0.0
        stopper = EarlyStopping(patience=kwargs['patience'], mode='max')
        ckpt_path = None
        for epoch in range(kwargs['epoch']):
            # =======================Training===========================
            # Set model to train mode.
            model.train()
            steps = int(np.ceil(len(train_set) / kwargs['batch_size']))
            pbar = tqdm(desc='Epoch {}, loss {}'.format(epoch, 'NAN'),
                        total=steps)
            for i, sample in enumerate(train_loader):
                x, y = sample[0].cuda(device).long(), sample[1].cuda(
                    device).long()
                optimizer.zero_grad()

                feat, model_outputs = model(x)  # [batch_size, 2]
                # CrossEntropy
                # loss = criterion(model_outputs, y)
                # MSE
                # loss = criterion(model_outputs, y.float().unsqueeze(-1))

                # Use hinge loss
                train_qa_id_sub = sample[2].cpu().detach().numpy()
                loss = get_hinge_loss(model_outputs, train_qa_id_sub,
                                      criterion)

                # Use SCL (supervised contrastive loss)
                # feat = F.normalize(feat, dim=-1).unsqueeze(1)
                # scl = criterion_scl(feat, y)
                # scl_weight = 0.3
                # loss = (1-scl_weight)*loss + scl_weight*scl
                # loss += scl

                loss.backward()
                optimizer.step()
                pbar.set_description('Epoch {}, train loss {:.4f}'.format(
                    epoch, loss.item()))
                pbar.update()
            pbar.close()
            # =========================================================
            # =======================Validation========================
            # Set model to evaluation mode.
            model.eval()
            with torch.no_grad():
                # Validation step
                valid_loss = []
                valid_pred = []
                valid_true = []
                steps = int(
                    np.ceil(len(valid_set) / kwargs['valid_batch_size']))
                pbar = tqdm(desc='Validating', total=steps)
                for i, sample in enumerate(valid_loader):
                    y_true_local = sample[1].numpy()
                    x, y_true = sample[0].cuda(device).long(), sample[1].cuda(
                        device).long()

                    feat, model_outputs = model(x)
                    # MSELoss
                    # loss = criterion(model_outputs, y_true.float().unsqueeze(-1)).cpu().detach().item()
                    # HingeLoss
                    valid_qa_id_sub = sample[2].cpu().detach().numpy()
                    loss = get_hinge_loss(model_outputs, valid_qa_id_sub,
                                          criterion).cpu().detach().item()
                    y_pred = model_outputs.cpu().detach().squeeze(-1).numpy()
                    # CrossEntropy
                    # loss = criterion(
                    # model_outputs, y_true).cpu().detach().item()
                    # y_pred = F.softmax(
                    # model_outputs.cpu().detach(), dim=1).numpy()

                    valid_loss.append(loss)
                    valid_pred.append(y_pred)
                    valid_true.append(y_true_local)
                    pbar.update()
            pbar.close()
            valid_loss = np.asarray(valid_loss).mean()
            valid_pred = np.concatenate(valid_pred, axis=0)
            valid_true = np.concatenate(valid_true, axis=0)

            # If using a regression model
            valid_f1, thr = search_f1(valid_true, valid_pred)
            logger.info("Epoch {}, valid loss {:.5f}, valid f1 {:.4f}".format(
                epoch, valid_loss, valid_f1))

            # If using a classification model
            # valid_pred_label = np.argmax(valid_pred, axis=1)
            # valid_auc = roc_auc_score(valid_true, valid_pred_label)
            # valid_p, valid_r, valid_f1, _ = precision_recall_fscore_support(
            # valid_true, valid_pred_label, average='binary')

            # logger.info(
            # "Epoch {}, valid loss {:.5f}, valid P {:.4f}, valid R {:.4f}, valid f1 {:.4f}, valid auc {:.4f}".format(
            # epoch, valid_loss, valid_p, valid_r, valid_f1, valid_auc)
            # )
            # logger.info('Confusion Matrix: ')
            # logger.info(confusion_matrix(y_true=valid_true,
            # y_pred=valid_pred_label, normalize='all'))

            # Step the cosine warmup schedule once per epoch (the ReduceLROnPlateau variant above,
            # which would take a metric, is commented out).
            scheduler.step()
            stop_flag, best_flag = stopper.step(valid_f1)
            if best_flag:
                # Delete the previously saved model
                if ckpt_path is not None:
                    ckpt_path.unlink()
                ckpt_path = CHECKPOINT_PATH / \
                    f"{MODEL_NAME}_{fold}_{epoch}_{stopper.best_score}.pt"
                # Save the best model so far
                torch.save(
                    {
                        "model_name": "BertForHouseQA",
                        "epoch": epoch,
                        "valid_loss": valid_loss,
                        "valid_f1": valid_f1,
                        "model_state_dict": model.state_dict(),
                        "train_idx": train_idx,
                        "valid_idx": valid_idx,
                        "fold": fold,
                        # "optimizer_state_dict": optimizer.state_dict(),
                        "thr": thr
                        # 'scheduler_state_dict': scheduler.state_dict()
                    },
                    f=ckpt_path,
                )
                logger.info("A best score! Saved to checkpoints.")
                # Save this validation fold's predictions for the final f1 evaluation over the whole training set
                all_pred[valid_idx] = valid_pred
                all_true[valid_idx] = valid_true
            if stop_flag:
                logger.info("Stop training due to early stopping.")
                # stop training
                break
            # Save this validation fold's predictions for the final f1 evaluation over the whole training set
            # oof[valid_idx] = valid_pred
            # valid_f1, _ = search_f1(valid_outputs, valid_pred)  # find the best classification threshold and f1 score
            # print('Valid f1 score = ', valid_f1)
            # ==========================================================

    # After training, evaluate over the whole training set
    # CrossEntropy
    # all_pred = np.argmax(all_pred, axis=1)
    # all_auc = roc_auc_score(all_true, all_pred)
    # all_p, all_r, all_f1, _ = precision_recall_fscore_support(
    # all_true, all_pred, average='binary')
    # logger.info(
    # "all P {:.4f}, all R {:.4f}, all f1 {:.4f}, all auc {:.4f}".format(
    # all_p, all_r, all_f1, all_auc)
    # )
    # logger.info('Confusion Matrix: ')
    # logger.info(confusion_matrix(y_true=all_true,
    #  y_pred=all_pred, normalize='all'))
    # MSELoss
    all_f1, all_thr = search_f1(all_true, all_pred)
    logger.info("All f1 {:.4f}, all thr {:.4f}".format(all_f1, all_thr))
    return all_f1, CHECKPOINT_PATH
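Note: search_f1 and get_hinge_loss are project helpers that are not included in this example. The sketches below are assumptions inferred only from how they are called: search_f1(y_true, y_score) sweeps a decision threshold over regression scores and returns the best F1 with its threshold, and get_hinge_loss pairs positive and negative answers that share the same question id and feeds them to the MarginRankingLoss criterion. The real helpers may differ.

import numpy as np
import torch
from sklearn.metrics import f1_score

def search_f1(y_true, y_score):
    # Hypothetical threshold search: try score quantiles as thresholds, keep the best F1.
    best_f1, best_thr = 0.0, 0.0
    for thr in np.unique(np.quantile(y_score, np.linspace(0.05, 0.95, 91))):
        f1 = f1_score(y_true, (y_score > thr).astype(int))
        if f1 > best_f1:
            best_f1, best_thr = f1, float(thr)
    return best_f1, best_thr

def get_hinge_loss(model_outputs, qa_id, criterion):
    # Hypothetical pairwise ranking loss: within each question id, every positive
    # answer should be ranked above every negative answer by the criterion's margin.
    scores = model_outputs.view(-1)
    ids, labels = qa_id[:, 0], qa_id[:, 2]
    pos_scores, neg_scores = [], []
    for qid in np.unique(ids):
        in_q = ids == qid
        for p in np.where(in_q & (labels == 1))[0]:
            for n in np.where(in_q & (labels == 0))[0]:
                pos_scores.append(scores[p])
                neg_scores.append(scores[n])
    if not pos_scores:                     # no usable pair in this batch
        return scores.sum() * 0.0          # zero loss that keeps the autograd graph alive
    pos = torch.stack(pos_scores)
    neg = torch.stack(neg_scores)
    return criterion(pos, neg, torch.ones_like(pos))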
Example #5
    def fit(x,
            y,
            z,
            dev_x,
            dev_y,
            dev_z,
            a,
            lr,
            decay_weight,
            n_epochs=n_epochs):
        if 'mnist' in sname:
            train_K = torch.eye(x.shape[0])
        else:
            train_K = (kernel(z, None, a, 1) + kernel(z, None, a / 10, 1) +
                       kernel(z, None, a * 10, 1)) / 3
        if dev_z is not None:
            if 'mnist' in sname:
                dev_K = torch.eye(dev_x.shape[0])  # identity kernel sized to the dev set
            else:
                dev_K = (kernel(dev_z, None, a, 1) +
                         kernel(dev_z, None, a / 10, 1) +
                         kernel(dev_z, None, a * 10, 1)) / 3
        n_data = x.shape[0]
        net = FCNN(x.shape[1]) if sname not in ['mnist_x', 'mnist_xz'] else CNN()
        es = EarlyStopping(patience=5)  # 10 for small
        optimizer = optim.Adam(list(net.parameters()),
                               lr=lr,
                               weight_decay=decay_weight)
        # optimizer = optim.SGD(list(net.parameters()),lr=1e-1, momentum=0.9)
        # optimizer = optim.Adadelta(list(net.parameters()))

        for epoch in range(n_epochs):
            permutation = torch.randperm(n_data)

            for i in range(0, n_data, batch_size):
                indices = permutation[i:i + batch_size]
                batch_x, batch_y = x[indices], y[indices]

                # training loop
                def closure():
                    optimizer.zero_grad()
                    pred_y = net(batch_x)
                    loss = my_loss(pred_y, batch_y, indices, train_K)
                    loss.backward()
                    return loss

                optimizer.step(closure)  # Does the update
            if epoch % 5 == 0 and epoch >= 50 and dev_x is not None:  # 5, 10 for small # 5,50 for large
                g_pred = net(test_X)
                test_err = ((g_pred - test_G)**2).mean()
                if epoch == 50 and 'mnist' in sname:
                    if z.shape[1] > 100:
                        train_K = np.load(
                            ROOT_PATH +
                            '/mnist_precomp/{}_train_K0.npy'.format(sname))
                        train_K = torch.from_numpy(train_K).float()  # convert the precomputed distances to a tensor
                        train_K = (torch.exp(-train_K / a**2 / 2) +
                                   torch.exp(-train_K / a**2 * 50) +
                                   torch.exp(-train_K / a**2 / 200)) / 3
                        dev_K = np.load(
                            ROOT_PATH +
                            '/mnist_precomp/{}_dev_K0.npy'.format(sname))
                        dev_K = torch.from_numpy(dev_K).float()
                        dev_K = (torch.exp(-dev_K / a**2 / 2) +
                                 torch.exp(-dev_K / a**2 * 50) +
                                 torch.exp(-dev_K / a**2 / 200)) / 3
                    else:
                        train_K = (kernel(z, None, a, 1) +
                                   kernel(z, None, a / 10, 1) +
                                   kernel(z, None, a * 10, 1)) / 3
                        dev_K = (kernel(dev_z, None, a, 1) +
                                 kernel(dev_z, None, a / 10, 1) +
                                 kernel(dev_z, None, a * 10, 1)) / 3

                dev_err = my_loss(net(dev_x), dev_y, None, dev_K)
                print('test', test_err, 'dev', dev_err)
                if es.step(dev_err):
                    break
        return es.best, epoch, net
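Note: kernel(z, None, a, 1) is not defined in these snippets. The way the precomputed distance matrices are turned into kernels above, exp(-K0 / a**2 / 2) mixed with the bandwidths a/10 and 10*a, suggests a Gaussian (RBF) kernel with bandwidth a evaluated on z against itself. A sketch under that assumption:

import torch

def kernel(x, y, a, b):
    # Assumed Gaussian kernel: k(x, y) = b * exp(-||x - y||^2 / (2 * a^2));
    # y=None means "x against itself", matching the calls kernel(z, None, a, 1).
    y = x if y is None else y
    sq_dist = torch.cdist(x, y) ** 2
    return b * torch.exp(-sq_dist / (2 * a ** 2))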
Example #6
def train_pytorch(**kwargs):
    CHECKPOINT_PATH.mkdir(parents=True, exist_ok=True)

    # Calling logging.basicConfig attaches a root logger to the process, so that log records from loggers in
    # other modules show up on the console (child loggers propagate to the root logger, which writes them out
    # through its own StreamHandler). Without logging.basicConfig, every child logger would need its own
    # StreamHandler, which is tedious.
    logging.basicConfig(format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', level=logging.INFO)
    formater = logging.Formatter(
    '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    # Print logs to the terminal.
    # stream_handler = logging.StreamHandler()
    # stream_handler.setFormatter(formater)
    # # Save logs to file.
    log_path = CHECKPOINT_PATH / 'train.log'
    file_handler = logging.FileHandler(filename=log_path, mode='w', encoding='utf-8')
    file_handler.setFormatter(formater)
    
    # logger.addHandler(stream_handler)
    logger.addHandler(file_handler)
    
    inputs = kwargs['inputs']
    outputs = kwargs['outputs']
    # test_inputs = kwargs['test_inputs']
    
    # gkf = GroupKFold(n_splits=kwargs['n_splits']).split(X=df_train.q2, groups=df_train.id)

    sss = StratifiedShuffleSplit(n_splits=kwargs['n_splits'], test_size=0.2).split(X=df_train.q2, 
            y=df_train.label)
    # skf = StratifiedKFold(n_splits=kwargs['n_splits'], shuffle=True, random_state=RANDOM_SEED).split(X=df_train.q2, y=outputs)

    # oof = np.zeros((len(df_train),1))
    all_pred = np.zeros(shape=(len(df_train), 2))     # classification task
    # all_pred = np.zeros(shape=(len(df_train)))  # regression task
    all_true = np.zeros(shape=(len(df_train)))
    for fold, (train_idx, valid_idx) in enumerate(sss):
    # for fold, (train_idx, valid_idx) in enumerate(skf):
        logger.info(f'Fold No. {fold}')
        train_inputs = [inputs[i][train_idx] for i in range(len(inputs))]
        train_outputs = outputs[train_idx]

        train_qa_id = df_train[['id', 'id_sub', 'label']].iloc[train_idx]

        # Sample augmentation via back-translation (augment positive samples only)
        # Get the (id, id_sub) pairs of the training-set samples
        # train_id_set = set([f'{x[0]},{x[1]}' for x in df_train.iloc[train_idx][['id', 'id_sub']].to_numpy()])   
        # # From the augmented samples, pick out the ones that appear in the training set
        # mask = df_train_ex[['id', 'id_sub']].apply(lambda x: f'{x["id"]},{x["id_sub"]}' in train_id_set, axis=1)    
        # df_train_fold = df_train_ex[mask]

        # Get the (id, id_sub) pairs of the training-set samples
        # train_id_set = set([f'{x[0]},{x[1]}' for x in df_train.iloc[train_idx][['id', 'id_sub']].to_numpy()])   
        # # From the augmented samples, pick out the ones that appear in the training set
        # mask = df_train_aug[['id', 'id_sub']].apply(lambda x: f'{x["id"]},{x["id_sub"]}' in train_id_set, axis=1)    
        # df_train_fold = df_train_aug[mask]

        # train_inputs = compute_input_arrays(df_train_fold, input_categories, tokenizer, MAX_SEQUENCE_LENGTH)
        # train_outputs = compute_output_arrays(df_train_fold, output_categories)

        valid_inputs = [inputs[i][valid_idx] for i in range(len(inputs))]
        valid_outputs = outputs[valid_idx]
        valid_qa_id = df_train[['id', 'id_sub', 'label']].iloc[valid_idx]

        train_set = HouseDataset(train_inputs, train_outputs, train_qa_id)
        valid_set = HouseDataset(valid_inputs, valid_outputs, valid_qa_id)
        # test_set = HouseDataset(test_inputs, np.zeros_like(test_inputs[0]))  # the test set has no labels

        logger.info('Train set size: {}, valid set size {}'.format(
            len(train_set), len(valid_set)))

        train_loader = DataLoader(train_set,
                                batch_size=kwargs['batch_size'],
                                shuffle=True  # recommended when training as classification
                                )

        valid_loader = DataLoader(valid_set,
                                batch_size=kwargs['valid_batch_size'])

        # test_loader = DataLoader(test_set,
                                # batch_size=512)

        device = torch.device(f"cuda:{kwargs['device']}")
        model = BertForHouseQA().cuda(device)

        # List all modules inside the model.
        logger.info('Model modules:')
        for i, m in enumerate(model.named_children()):
            logger.info('{} -> {}'.format(i, m))

        # # Get the number of total parameters.
        # total_params = sum(p.numel() for p in model.parameters())
        # trainable_params = sum(p.numel()
        #                     for p in model.parameters() if p.requires_grad)

        # logger.info("Total params: {:,}".format(total_params))
        # logger.info("Trainable params: {:,}".format(trainable_params))

        # Use hinge loss
        # criterion = torch.nn.MarginRankingLoss(margin=1.0)
        # criterion = torch.nn.MSELoss()
        criterion = torch.nn.CrossEntropyLoss()
        optimizer = torch.optim.Adam(model.parameters(), lr=kwargs['lr'], weight_decay=kwargs['weight_decay'])
        logger.info('Optimizer:')
        logger.info(optimizer)
        # scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
        #                                                        mode='min',
        #                                                        patience=8,
        #                                                        verbose=True
        #                                                        )
        # best_score = 0.0
        stopper = EarlyStopping(patience=kwargs['patience'], mode='max')
        ckpt_path = None
        for epoch in range(kwargs['epoch']):
            # =======================Training===========================
            # Set model to train mode.
            model.train()
            steps = int(np.ceil(len(train_set) / kwargs['batch_size']))
            pbar = tqdm(desc='Epoch {}, loss {}'.format(epoch, 'NAN'),
                        total=steps)
            for i, sample in enumerate(train_loader):
                x, y = sample[0].cuda(device).long(), sample[1].cuda(device).long()
                optimizer.zero_grad()

                model_outputs = model(x)    # [batch_size, 2]
                # CrossEntropy
                loss = criterion(model_outputs, y)
                # MSE
                # loss = criterion(model_outputs, y.float().unsqueeze(-1))

                # Use hinge loss
                # train_qa_id_sub = sample[2].numpy()
                # loss = get_hinge_loss(model_outputs, train_qa_id_sub, criterion)

                # Use SCL (supervised contrastive loss)
                # inners = torch.do
                
                loss.backward()
                optimizer.step()
                pbar.set_description(
                    'Epoch {}, train loss {:.4f}'.format(epoch, loss.item()))
                pbar.update()
            pbar.close()
            # =========================================================
            # =======================Validation========================
            # Set model to evaluation mode.
            model.eval()
            with torch.no_grad():
                # Validation step
                valid_loss = []
                valid_pred = []
                valid_true = []
                steps = int(np.ceil(len(valid_set) / kwargs['valid_batch_size']))
                pbar = tqdm(desc='Validating', total=steps)
                for i, sample in enumerate(valid_loader):
                    y_true_local = sample[1].numpy()
                    x, y_true = sample[0].cuda(
                        device).long(), sample[1].cuda(device).long()

                    model_outputs = model(x)
                    # MSELoss
                    # loss = criterion(model_outputs, y_true.float().unsqueeze(-1)).cpu().detach().item()
                    # HingeLoss
                    # valid_qa_id_sub = sample[2].numpy()
                    # loss = get_hinge_loss(model_outputs, valid_qa_id_sub, criterion)
                    # y_pred = model_outputs.cpu().detach().squeeze(-1).numpy()
                    # CrossEntropy
                    loss = criterion(model_outputs, y_true).cpu().detach().item()
                    y_pred = F.softmax(model_outputs.cpu().detach(), dim=1).numpy()
                    
                    valid_loss.append(loss)
                    valid_pred.append(y_pred)
                    valid_true.append(y_true_local)
                    pbar.update()
            pbar.close()
            valid_loss = np.asarray(valid_loss).mean()
            valid_pred = np.concatenate(valid_pred, axis=0)
            valid_true = np.concatenate(valid_true, axis=0)

            # If using a regression model
            # valid_f1, thr = search_f1(valid_true, valid_pred)
            # logger.info("Epoch {}, valid loss {:.5f}, valid f1 {:.4f}".format(epoch, valid_loss, valid_f1)))

            # If using a classification model
            valid_pred_label = np.argmax(valid_pred, axis=1)
            valid_auc = roc_auc_score(valid_true, valid_pred_label)
            valid_p, valid_r, valid_f1, _ = precision_recall_fscore_support(valid_true, valid_pred_label, average='binary')

            # Apply ReduceLROnPlateau to the lr.
            # scheduler.step(valid_loss)

            logger.info(
            "Epoch {}, valid loss {:.5f}, valid P {:.4f}, valid R {:.4f}, valid f1 {:.4f}, valid auc {:.4f}".format(
                epoch, valid_loss, valid_p, valid_r, valid_f1, valid_auc)   
            )
            logger.info('Confusion Matrix: ')
            logger.info(confusion_matrix(y_true=valid_true, y_pred=valid_pred_label, normalize='all'))
            stop_flag, best_flag = stopper.step(valid_f1)
            if best_flag:
                # Delete the previously saved model
                if ckpt_path is not None:
                    ckpt_path.unlink()
                ckpt_path = CHECKPOINT_PATH / f"{MODEL_NAME}_{fold}_{epoch}_{stopper.best_score}.pt"
                # Save the best model so far
                torch.save(
                    {
                        "model_name": "BertForHouseQA",
                        "epoch": epoch,
                        "valid_loss": valid_loss,
                        "valid_f1": valid_f1,
                        "model_state_dict": model.state_dict(),
                        # "optimizer_state_dict": optimizer.state_dict(),
                        # "thr": thr
                        # 'scheduler_state_dict': scheduler.state_dict()
                    },
                    f=ckpt_path,
                )
                logger.info("A best score! Saved to checkpoints.")
                # Save this validation fold's predictions for the final f1 evaluation over the whole training set
                all_pred[valid_idx] = valid_pred
                all_true[valid_idx] = valid_true
            if stop_flag:
                logger.info("Stop training due to early stopping.")
                # stop training
                break
            # Save this validation fold's predictions for the final f1 evaluation over the whole training set
            # oof[valid_idx] = valid_pred
            # valid_f1, _ = search_f1(valid_outputs, valid_pred)  # find the best classification threshold and f1 score
            # print('Valid f1 score = ', valid_f1)
            # ==========================================================

    # After training, evaluate over the whole training set
    # CrossEntropy
    all_pred = np.argmax(all_pred, axis=1)
    all_auc = roc_auc_score(all_true, all_pred)
    all_p, all_r, all_f1, _ = precision_recall_fscore_support(all_true, all_pred, average='binary')
    logger.info(
        "all P {:.4f}, all R {:.4f}, all f1 {:.4f}, all auc {:.4f}".format(
            all_p, all_r, all_f1, all_auc)
        )
    logger.info('Confusion Matrix: ')
    logger.info(confusion_matrix(y_true=all_true, y_pred=all_pred, normalize='all'))
    # MSELoss
    # all_f1, all_thr = search_f1(all_true, all_pred)
    # logger.info("All f1 {:.4f}, all thr {:.4f}".format(all_f1, all_thr))
    return all_f1, CHECKPOINT_PATH
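Note: both training functions persist the best checkpoint as a dict via torch.save. A short usage sketch for loading one back for inference; the dict keys come from the torch.save calls above, while the path and the threshold default are assumptions, and the 'module.' stripping is only needed when the model was wrapped in torch.nn.DataParallel as in Example #4:

import torch

ckpt = torch.load('checkpoints/best_fold0.pt', map_location='cpu')   # illustrative path
state_dict = ckpt['model_state_dict']
# Drop the 'module.' prefix that DataParallel adds to parameter names, if present.
state_dict = {k[len('module.'):] if k.startswith('module.') else k: v for k, v in state_dict.items()}
model = BertForHouseQA()
model.load_state_dict(state_dict)
model.eval()
thr = ckpt.get('thr', 0.5)   # Example #4 also stores the tuned decision threshold for the regression head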
Example #7
def main(args):

    ts = time.strftime('%Y-%b-%d-%H:%M:%S', time.gmtime())

    splits = ['train', 'valid'] + (['test'] if args.test else [])

    datasets = OrderedDict()
    for split in splits:
        datasets[split] = PTB(data_dir=args.data_dir,
                              split=split,
                              create_data=args.create_data,
                              max_sequence_length=args.max_sequence_length,
                              min_occ=args.min_occ)

    model = SentenceVAE(vocab_size=datasets['train'].vocab_size,
                        sos_idx=datasets['train'].sos_idx,
                        eos_idx=datasets['train'].eos_idx,
                        pad_idx=datasets['train'].pad_idx,
                        unk_idx=datasets['train'].unk_idx,
                        max_sequence_length=args.max_sequence_length,
                        embedding_size=args.embedding_size,
                        rnn_type=args.rnn_type,
                        hidden_size=args.hidden_size,
                        word_dropout=args.word_dropout,
                        embedding_dropout=args.embedding_dropout,
                        latent_size=args.latent_size,
                        num_layers=args.num_layers,
                        bidirectional=args.bidirectional)

    if torch.cuda.is_available():
        model = model.cuda()

    print(model)

    if args.tensorboard_logging:
        writer = SummaryWriter(
            os.path.join(args.logdir, expierment_name(args, ts)))
        writer.add_text("model", str(model))
        writer.add_text("args", str(args))
        writer.add_text("ts", ts)

    save_model_path = os.path.join(args.save_model_path, ts)
    os.makedirs(save_model_path)

    def kl_anneal_function(anneal_function, step, k, x0):
        if anneal_function == 'logistic':
            return float(1 / (1 + np.exp(-k * (step - x0))))
        elif anneal_function == 'linear':
            return min(1, step / x0)
        else:
            return 1.0

    NLL = torch.nn.NLLLoss(size_average=False,
                           ignore_index=datasets['train'].pad_idx)

    def loss_fn(logp, target, length, mean, logv, anneal_function, step, k,
                x0):

        # cut-off unnecessary padding from target, and flatten
        target = target[:, :torch.max(length).data[0]].contiguous().view(-1)
        logp = logp.view(-1, logp.size(2))

        # Negative Log Likelihood
        NLL_loss = NLL(logp, target)

        # KL Divergence
        KL_loss = -0.5 * torch.sum(1 + logv - mean.pow(2) - logv.exp())
        KL_weight = kl_anneal_function(anneal_function, step, k, x0)

        return NLL_loss, KL_loss, KL_weight

    optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate)

    tensor = torch.cuda.FloatTensor if torch.cuda.is_available(
    ) else torch.Tensor
    step = 0
    early_stop = EarlyStopping(min_delta=0.001, patience=5)
    for epoch in range(args.epochs):

        for split in splits:

            data_loader = DataLoader(dataset=datasets[split],
                                     batch_size=args.batch_size,
                                     shuffle=split == 'train',
                                     num_workers=cpu_count(),
                                     pin_memory=torch.cuda.is_available())

            tracker = defaultdict(tensor)

            # Enable/Disable Dropout
            if split == 'train':
                model.train()
            else:
                model.eval()

            for iteration, batch in enumerate(data_loader):

                batch_size = batch['input'].size(0)

                for k, v in batch.items():
                    if torch.is_tensor(v):
                        batch[k] = to_var(v)

                # Forward pass
                logp, mean, logv, z = model(batch['input'], batch['length'])

                # loss calculation
                NLL_loss, KL_loss, KL_weight = loss_fn(logp, batch['target'],
                                                       batch['length'], mean,
                                                       logv,
                                                       args.anneal_function,
                                                       step, args.k, args.x0)

                if split != 'train':
                    KL_weight = 1.0

                loss = (NLL_loss + KL_weight * KL_loss) / batch_size

                # backward + optimization
                if split == 'train':
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()
                    step += 1

                # bookkeeping
                tracker['ELBO'] = torch.cat((tracker['ELBO'], loss.data))

                if args.tensorboard_logging:
                    writer.add_scalar("%s/ELBO" % split.upper(), loss.data[0],
                                      epoch * len(data_loader) + iteration)
                    writer.add_scalar("%s/NLL Loss" % split.upper(),
                                      NLL_loss.data[0] / batch_size,
                                      epoch * len(data_loader) + iteration)
                    writer.add_scalar("%s/KL Loss" % split.upper(),
                                      KL_loss.data[0] / batch_size,
                                      epoch * len(data_loader) + iteration)
                    writer.add_scalar("%s/KL Weight" % split.upper(),
                                      KL_weight,
                                      epoch * len(data_loader) + iteration)

                if iteration % args.print_every == 0 or iteration + 1 == len(
                        data_loader):
                    print(
                        "%s Batch %04d/%i, Loss %9.4f, NLL-Loss %9.4f, KL-Loss %9.4f, KL-Weight %6.3f"
                        % (split.upper(), iteration, len(data_loader) - 1,
                           loss.data[0], NLL_loss.data[0] / batch_size,
                           KL_loss.data[0] / batch_size, KL_weight))

                if split == 'valid':
                    if 'target_sents' not in tracker:
                        tracker['target_sents'] = list()
                    tracker['target_sents'] += idx2word(
                        batch['target'].data,
                        i2w=datasets['train'].get_i2w(),
                        pad_idx=datasets['train'].pad_idx)
                    tracker['z'] = torch.cat((tracker['z'], z.data), dim=0)

            print("%s Epoch %02d/%i, Mean ELBO %9.4f" %
                  (split.upper(), epoch, args.epochs,
                   torch.mean(tracker['ELBO'])))

            if args.tensorboard_logging:
                writer.add_scalar("%s-Epoch/ELBO" % split.upper(),
                                  torch.mean(tracker['ELBO']), epoch)

            # save a dump of all sentences and the encoded latent space
            if split == 'valid':
                dump = {
                    'target_sents': tracker['target_sents'],
                    'z': tracker['z'].tolist()
                }
                if not os.path.exists(os.path.join('dumps', ts)):
                    os.makedirs('dumps/' + ts)
                with open(
                        os.path.join('dumps/' + ts +
                                     '/valid_E%i.json' % epoch),
                        'w') as dump_file:
                    json.dump(dump, dump_file)

            # save checkpoint
            if split == 'train':
                checkpoint_path = os.path.join(save_model_path,
                                               "E%i.pytorch" % (epoch))
                torch.save(model.state_dict(), checkpoint_path)
                print("Model saved at %s" % checkpoint_path)

            if split == 'valid' and early_stop.step(torch.mean(
                    tracker['ELBO'])):
                print("Early Stopping after {}".format(epoch))
                exit(0)
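Note: the KL weight in this example follows kl_anneal_function. A quick standalone check of how the logistic schedule ramps from roughly 0 to 1 around step x0 (the k and x0 values below are only illustrative; the real ones come from args.k and args.x0):

import numpy as np

def kl_anneal_function(anneal_function, step, k, x0):
    if anneal_function == 'logistic':
        return float(1 / (1 + np.exp(-k * (step - x0))))
    elif anneal_function == 'linear':
        return min(1, step / x0)
    else:
        return 1.0

for step in (0, 1250, 2500, 3750, 5000):
    # rises from about 0.002 at step 0 to 0.5 at x0 and about 0.998 at step 5000
    print(step, round(kl_anneal_function('logistic', step, k=0.0025, x0=2500), 4))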
Example #8
    ),
                                               batch_size=1)

    val_loader = torch.utils.data.DataLoader(dataset.listDataset(
        val_list,
        shuffle=False,
        transform=tf,
    ),
                                             batch_size=1)
    train_loss_values = []
    for epoch in range(num_of_epochs):
        print(' --- teacher training: epoch {}'.format(epoch + 1))
        train_loss = train(model, optimizer, train_loader)
        #evaluate for one epoch on validation set
        val = evaluate(model, val_loader)
        train_loss_values.append(train_loss)  # track the training loss per epoch

        # if val_metric is best, add checkpoint (best-only saving is disabled here: a checkpoint is written every epoch)
        if True:
            #print("New Best!")
            #best = val.item()
            torch.save(model.state_dict(),
                       'checkpoints/CP_{}.pth'.format(epoch))
            print("Checkpoint {} saved!".format(epoch + 1))

        if es.step(val):
            plt.plot(train_loss_values)
            print("Early Stopping . . .")
            break

        scheduler.step()
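Note: the commented-out lines above (#print("New Best!"), #best = val.item()) suggest the checkpoint was meant to be written only when the validation metric improves, while the if True: branch currently saves every epoch. A small sketch of the best-only variant, assuming a metric where lower is better:

import torch

def maybe_save_checkpoint(model, val, best, epoch, ckpt_dir='checkpoints'):
    # Save a checkpoint only when `val` improves on `best`; return the updated best value.
    val = val.item() if torch.is_tensor(val) else float(val)
    if val < best:
        torch.save(model.state_dict(), '{}/CP_{}.pth'.format(ckpt_dir, epoch))
        print("New best ({:.4f})! Checkpoint {} saved.".format(val, epoch + 1))
        return val
    return best

Inside the loop above it would replace the if True: block, with best initialised to float('inf') before the epoch loop.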
Example #9
    def fit(self, input_xy,
            lr=0.01,
            weight=None,
            epoch='auto',
            print_batch_num=10,
            batch_size=128,
            tot_size=np.inf,
            max_epoch=20,
            test_input=None,
            target_names=None,
            enable_early_stopping=False,
            **kwargs):
        optimizer = torch.optim.SGD(self.nn_module.parameters(), lr=lr)
        criterion = nn.CrossEntropyLoss(weight=torch.tensor(weight,
                                                            dtype=torch.float32,
                                                            device=self.device
                                                            ) if weight is not None else None)

        if test_input is not None:
            # use test returned f1 score if validation set specified
            early_stopping = EarlyStopping(mode='max', patience=5, percentage=True)
            test_input = tee(test_input, max_epoch)  # one independent copy of the test iterator per epoch
        else:
            # else use average loss
            early_stopping = EarlyStopping(mode='min', patience=5, percentage=False)


        logging.info('Train starting...')

        for epoch_idx, epoch_input in enumerate(tee(input_xy, max_epoch)):
            tot_loss = []
            for batch_cnt, batch_data in enumerate(zip(*([iter(epoch_input)] * batch_size))):
                outputs = []
                trues = []

                optimizer.zero_grad()
                for x, y in batch_data:
                    output = self.nn_module(x).view(-1)
                    outputs.append(output)
                    trues.append(y.argmax())
                outputs = torch.stack(outputs)
                trues = torch.tensor(trues, dtype=torch.long, device=self.device)
                loss = criterion(outputs, trues)

                if (batch_cnt+1) % print_batch_num == 0:
                    logging.debug('Epoch %5.3f, Current loss: %10.8f' %
                                  (epoch_idx+batch_cnt*batch_size/tot_size, loss))


                tot_loss.append(loss.item())
                loss.backward()
                optimizer.step()

            if test_input is not None:
                score = self.test(test_input[epoch_idx], target_names=target_names)
                logging.info('Epoch %d, test score %4.2f' % (epoch_idx, score))
            else:
                score = sum(tot_loss) / len(tot_loss)
                logging.info('Epoch %d, avg loss %4.2f' % (epoch_idx, score))

            if enable_early_stopping and early_stopping.step(score):
                logging.info('Early stopped.')
                break

        logging.info('Train end.')
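Note: the inner loop above batches a plain iterable with the zip(*([iter(epoch_input)] * batch_size)) idiom. A tiny standalone illustration of what that does (it groups consecutive items into fixed-size tuples and silently drops the trailing incomplete batch):

items = list(range(7))
batches = list(zip(*([iter(items)] * 3)))
print(batches)   # [(0, 1, 2), (3, 4, 5)] -- the leftover element 6 is dropped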
Example #10
File: main.py  Project: arpit9295/ce7455
def init_model_and_train(
        label='',
        crf=parameters['crf'],
        char_mode=parameters['char_mode'],
        encoder_mode=parameters['encoder_mode'],
        use_gpu=parameters['use_gpu'],
        eval_every=parameters[
            'eval_every'],  # Calculate F-1 Score after this many iterations
        plot_every=parameters[
            'plot_every'],  # Store loss after this many iterations
        gradient_clip=parameters['gradient_clip'],
        total_epochs=parameters['epochs'] + 1,
        output_dir=parameters['output_dir'],
        embedding_dim=parameters['word_dim'],
        hidden_dim=parameters['word_lstm_dim']):
    # Create model
    model = BiLSTM_CRF(vocab_size=len(word_to_id),
                       tag_to_ix=tag_to_id,
                       embedding_dim=embedding_dim,
                       hidden_dim=hidden_dim,
                       use_gpu=use_gpu,
                       char_to_ix=char_to_id,
                       pre_word_embeds=word_embeds,
                       use_crf=crf,
                       char_mode=char_mode,
                       encoder_mode=encoder_mode)

    # Enable GPU
    if use_gpu:
        model.cuda()

    print(f"Char mode: {char_mode}, Encoder mode: {encoder_mode}")

    # Training parameters
    learning_rate = 0.015
    momentum = 0.9
    decay_rate = 0.05
    optimizer = torch.optim.SGD(model.parameters(),
                                lr=learning_rate,
                                momentum=momentum)

    # Variables which will be used in the training process
    losses = []  # list to store all losses
    loss = 0.0  # Loss initialization
    best_dev_F = -1.0  # Current best F-1 Score on Dev Set
    best_test_F = -1.0  # Current best F-1 Score on Test Set
    best_train_F = -1.0  # Current best F-1 Score on Train Set
    all_F = [[0, 0, 0]]  # List storing all the F-1 Scores
    all_acc = [[0, 0, 0]]  # List storing all the Accuracy Scores
    count = 0  # Counts the number of iterations
    train_length = len(train_data)

    # Define early stopping
    es = EarlyStopping(patience=3, mode='max')

    # eval_every = 1

    tr = time.time()
    model.train(True)
    for epoch in range(1, total_epochs):
        print(f'Epoch {epoch}:')
        for i, index in enumerate(np.random.permutation(train_length)):
            # for i, index in enumerate(np.random.permutation(eval_every)):
            count += 1
            data = train_data[index]

            # gradient updates for each data entry
            model.zero_grad()

            sentence_in = data['words']
            sentence_in = Variable(torch.LongTensor(sentence_in))
            tags = data['tags']
            chars2 = data['chars']

            if char_mode == 'LSTM':
                chars2_sorted = sorted(chars2,
                                       key=lambda p: len(p),
                                       reverse=True)
                d = {}
                for i, ci in enumerate(chars2):
                    for j, cj in enumerate(chars2_sorted):
                        if ci == cj and j not in d and i not in d.values():
                            # map the position in the sorted list back to the original position
                            d[j] = i
                            break
                chars2_length = [len(c) for c in chars2_sorted]
                char_maxl = max(chars2_length)
                chars2_mask = np.zeros((len(chars2_sorted), char_maxl),
                                       dtype='int')
                for i, c in enumerate(chars2_sorted):
                    chars2_mask[i, :chars2_length[i]] = c
                chars2_mask = Variable(torch.LongTensor(chars2_mask))

            if char_mode == 'CNN':

                d = {}

                # Pad each word to the maximum word length in this sentence
                chars2_length = [len(c) for c in chars2]
                char_maxl = max(chars2_length)
                chars2_mask = np.zeros((len(chars2_length), char_maxl),
                                       dtype='int')
                for i, c in enumerate(chars2):
                    chars2_mask[i, :chars2_length[i]] = c
                chars2_mask = Variable(torch.LongTensor(chars2_mask))

            targets = torch.LongTensor(tags)

            # we calculate the negative log-likelihood for the predicted tags using the predefined function
            if use_gpu:
                neg_log_likelihood = model.get_neg_log_likelihood(
                    sentence_in.cuda(), targets.cuda(), chars2_mask.cuda(),
                    chars2_length, d)
            else:
                neg_log_likelihood = model.get_neg_log_likelihood(
                    sentence_in, targets, chars2_mask, chars2_length, d)

            loss += neg_log_likelihood.item() / len(data['words'])
            neg_log_likelihood.backward()

            # we use gradient clipping to avoid exploding gradients
            torch.nn.utils.clip_grad_norm_(model.parameters(), gradient_clip)
            optimizer.step()

            # Storing loss
            if count % plot_every == 0:
                loss /= plot_every
                print(count, ': ', loss)
                if losses == []:
                    losses.append(loss)
                losses.append(loss)
                loss = 0.0

        # Evaluating on Train, Test, Dev Sets
        if (epoch > 20) or (epoch % eval_every == 0):
            print(f'Evaluating on Train, Test, Dev Sets at count={count}')
            model.train(False)
            best_train_F, new_train_F, new_train_acc, _ = evaluating(
                model,
                train_data,
                best_train_F,
                "Train",
                char_mode=char_mode,
                use_gpu=use_gpu)
            best_dev_F, new_dev_F, new_dev_acc, save = evaluating(
                model,
                dev_data,
                best_dev_F,
                "Dev",
                char_mode=char_mode,
                use_gpu=use_gpu)
            if save:
                print("Saving Model to ", model_name)
                torch.save(model.state_dict(), model_name)
            best_test_F, new_test_F, new_test_acc, _ = evaluating(
                model,
                test_data,
                best_test_F,
                "Test",
                char_mode=char_mode,
                use_gpu=use_gpu)

            all_F.append([new_train_F, new_dev_F, new_test_F])
            all_acc.append([new_train_acc, new_dev_acc, new_test_acc])

            model.train(True)

        if (epoch > 20 or epoch % eval_every == 0) and es.step(all_F[-1][1]):
            print(
                f'Early stopping: epoch={epoch}, count={count}, new_acc_F={all_acc[-1][1]}'
            )
            break  # early stopping criterion is met, we can stop now

        # Performing decay on the learning rate
        adjust_learning_rate(optimizer,
                             lr=learning_rate /
                             (1 + decay_rate * count / len(train_data)))

    print(f'{(time.time() - tr) / 60} minutes')

    torch.save(model, output_dir + '/' + label + '.model')

    plt.figure(0)
    plt.plot(losses)
    plt.savefig(output_dir + '/' + label + '_appended.png', transparent=True)

    plt.figure(1)
    plt.clf()
    plt.plot(losses)
    plt.savefig(output_dir + '/' + label + '.png', transparent=True)

    return all_F
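Note: adjust_learning_rate is not shown in this example. Given how it is called (an optimizer plus the decayed learning rate), a plausible sketch is simply overwriting the lr of every parameter group; the real helper may differ:

def adjust_learning_rate(optimizer, lr):
    # Hypothetical helper: apply the decayed learning rate to all parameter groups.
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr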