Example #1
    def fit(x,y,z,dev_x,dev_y,dev_z,lr,decay_weight,n_epochs=n_epochs):
        train_K = np.load(ROOT_PATH+'/mendelian_precomp/{}_train_K.npy'.format(sname))
        dev_K = np.load(ROOT_PATH+'/mendelian_precomp/{}_dev_K.npy'.format(sname))
        train_K = torch.from_numpy(train_K).float()
        dev_K = torch.from_numpy(dev_K).float()

        n_data = x.shape[0]
        net = Net(x.shape[1])
        es = EarlyStopping(patience=5)
        optimizer = optim.Adam(list(net.parameters()), lr=lr, weight_decay=decay_weight)

        for epoch in range(n_epochs):
            permutation = torch.randperm(n_data)

            for i in range(0, n_data, batch_size):
                indices = permutation[i:i+batch_size]
                batch_x, batch_y = x[indices], y[indices]

                # training loop
                def closure():
                    optimizer.zero_grad()
                    pred_y = net(batch_x)
                    loss = my_loss(pred_y, batch_y, indices, train_K)
                    loss.backward()
                    return loss

                optimizer.step(closure)  # Does the update
            if epoch % 5 == 0 and epoch >= 5 and dev_x is not None: # 5, 10 for small # 5,50 for large 
                g_pred = net(test.x.float())
                test_err = ((g_pred-test.g.float())**2).mean()
                dev_err = my_loss(net(dev_x), dev_y, None, dev_K)
                print('test',test_err,'dev',dev_err)
                if es.step(dev_err):
                    break
        return es.best, epoch, net
Example #2
    def fit(x,y,z,dev_x,dev_y,dev_z,a,lr,decay_weight, ax, y_axz, w_samples, n_epochs=n_epochs):
        if 'mnist' in sname:
            train_K = torch.eye(x.shape[0])
        else:
            train_K = (kernel(z, None, a, 1)+kernel(z, None, a/10, 1)+kernel(z, None, a*10, 1))/3
        if dev_z is not None:
            if 'mnist' in sname:
                dev_K = torch.eye(dev_x.shape[0])
            else:
                dev_K = (kernel(dev_z, None, a, 1)+kernel(dev_z, None, a/10, 1)+kernel(dev_z, None, a*10, 1))/3
        n_data = x.shape[0]
        net = FCNN(x.shape[1]) if sname not in ['mnist_x', 'mnist_xz'] else CNN()
        es = EarlyStopping(patience=10)  # 10 for small
        optimizer = optim.Adam(list(net.parameters()), lr=lr, weight_decay=decay_weight)

        test_errs, dev_errs, exp_errs, mse_s = [], [], [], []

        for epoch in range(n_epochs):
            permutation = torch.randperm(n_data)

            for i in range(0, n_data, batch_size):
                indices = permutation[i:i+batch_size]
                batch_x, batch_y = x[indices], y[indices]

                # training loop
                def closure():
                    optimizer.zero_grad()
                    pred_y = net(batch_x)
                    loss = my_loss(pred_y, batch_y, indices, train_K)
                    loss.backward()
                    return loss

                optimizer.step(closure)  # Does the update
            if epoch % 5 == 0 and epoch >= 50 and dev_x is not None:  # 5, 10 for small # 5,50 for large
                g_pred = net(test_X)  # evaluate on the held-out test inputs
                test_err = ((g_pred-test_Y)**2).mean()  # plain (unweighted) MSE: measures agreement between predictions and labels
                if epoch == 50 and 'mnist' in sname:
                    if z.shape[1] > 100:
                        train_K = np.load(ROOT_PATH+'/mnist_precomp/{}_train_K0.npy'.format(sname))
                        train_K = torch.from_numpy(train_K).float()  # convert to a tensor before torch.exp
                        train_K = (torch.exp(-train_K/a**2/2)+torch.exp(-train_K/a**2*50)+torch.exp(-train_K/a**2/200))/3
                        dev_K = np.load(ROOT_PATH+'/mnist_precomp/{}_dev_K0.npy'.format(sname))
                        dev_K = torch.from_numpy(dev_K).float()  # convert to a tensor before torch.exp
                        dev_K = (torch.exp(-dev_K/a**2/2)+torch.exp(-dev_K/a**2*50)+torch.exp(-dev_K/a**2/200))/3
                    else:
                        train_K = (kernel(z, None, a, 1)+kernel(z, None, a/10, 1)+kernel(z, None, a*10, 1))/3
                        dev_K = (kernel(dev_z, None, a, 1)+kernel(dev_z, None, a/10, 1)+kernel(dev_z, None, a*10, 1))/3

                dev_err = my_loss(net(dev_x), dev_y, None, dev_K)
                err_in_expectation, mse = conditional_expected_loss(net=net, ax=ax, w_samples=w_samples, y_samples=y_samples, y_axz=y_axz, x_on=False)
                print('test', test_err, 'dev', dev_err, 'err_in_expectation', err_in_expectation, 'mse: ', mse)
                test_errs.append(test_err)
                dev_errs.append(dev_err)
                exp_errs.append(err_in_expectation)
                mse_s.append(mse)

                if es.step(dev_err):
                    break
        losses = {'test': test_errs, 'dev': dev_errs, 'exp': exp_errs, 'mse_': mse_s}
        return es.best, epoch, net, losses
Example #3
    def make_lda_model(self, sentences: list, threshold_remove_doc_freq_rate_over_this: float, rate_of_valid: float,
                       num_topics=20, passes=200, patience=5, must_move_this_rate=0.03, round_check_convergence=3) -> bool:
        """
        LDAモデルを作成するラッパー
        LDA関連一式はjoblibを使って自動セーブ

        passesは学習のiteration回数で、少ないと精度がとても悪い
        精度の指標値を数値で表示するので、収束してなさそうならpassesを大きくして再実行

        reference:
        [数式多めでさらっと](http://acro-engineer.hatenablog.com/entry/2017/12/11/120000)
        [ガチ勢のため](http://www.jmlr.org/papers/volume3/blei03a/blei03a.pdf)
        :param sentences: list of str, 文書群, 文のlist
        :param threshold_remove_doc_freq_rate_over_this: float(0-1), この割合以上の出現率の語をstop word入り
        :param rate_of_valid: float, greater than 0 and less than 1, 収束判定でデータをvalidationに回す割合
        :param num_topics: int, 想定する話題の数, 意味はreference参照
        :param passes: int, LDAの学習回数, 多くすると基本的には良いことが多い
        :param patience: int, 収束判定でpatience回動かなかったら打ち切る
        :param must_move_this_rate: float, greater than 0 and less than 1, 収束判定で動いていないと見なす割合
        :param round_check_convergence: int, 収束判定のskip回数。重い処理なのでround_check_convergence回に1回判定
        :return:
        """
        # documents -> lists of tokens; overly high-frequency tokens are removed
        self.make_tokens_list(sentences=sentences,
                              threshold_remove_doc_freq_rate_over_this=threshold_remove_doc_freq_rate_over_this)
        # gensim dictionary; prune_at is set to the full vocabulary size so no words are dropped. * Without it, the dictionary would presumably be truncated at around 10,000 words
        self.dictionary = Dictionary(self.tokens_list, prune_at=self.num_tokens_variation)
        # tokens -> corpus
        corpus = [self.dictionary.doc2bow(tokens) for tokens in self.tokens_list]

        # prepare LDA
        # make LDA
        # ## for early stopping
        train, valid = split_train_valid(corpus=corpus, rate_of_valid=rate_of_valid)
        early_stopping = EarlyStopping(patience=patience, must_move_this_rate=must_move_this_rate)
        # ## make
        model_lda = LdaMulticore(corpus=train, num_topics=num_topics, id2word=self.dictionary,
                                 workers=self.num_process, passes=1, eval_every=round_check_convergence)
        _ = early_stopping.is_converged(model=model_lda, valid_corpus=valid)
        # ## train model
        for i_loop in tqdm(range(1, passes), desc="lda learning @ lda"):
            model_lda.update(train)
            if i_loop % round_check_convergence == 0:
                if early_stopping.is_converged(model=model_lda, valid_corpus=valid):
                    break

        # show convergence
        self.log = early_stopping.log
        # save
        joblib.dump(self.dict_is_high_freq_token, f"{self.path_to_save}dict_is_stops.joblib")
        joblib.dump(model_lda, f"{self.path_to_save}model_LDA.joblib")
        joblib.dump(self.dictionary, f"{self.path_to_save}dictionary_LDA.joblib")

        return True
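The `EarlyStopping` used above is a project-specific helper, not a library class. As a rough illustration only, a minimal patience-based checker exposing the same `is_converged(model=..., valid_corpus=...)` interface and a `log` attribute might look like the sketch below; scoring the model with gensim's `log_perplexity` on the held-out corpus is an assumption here, not necessarily what the original project does.

class EarlyStopping:
    """Minimal sketch (assumed behavior): report convergence when the validation
    score stops moving by at least `must_move_this_rate` for `patience` checks."""

    def __init__(self, patience=5, must_move_this_rate=0.03):
        self.patience = patience
        self.must_move_this_rate = must_move_this_rate
        self.best_score = None
        self.rounds_without_move = 0
        self.log = []  # history of validation scores

    def is_converged(self, model, valid_corpus) -> bool:
        # gensim's log_perplexity returns a per-word likelihood bound (higher is better)
        score = model.log_perplexity(valid_corpus)
        self.log.append(score)
        if self.best_score is None:
            self.best_score = score
            return False
        # relative change against the best score seen so far
        moved = abs(score - self.best_score) / max(abs(self.best_score), 1e-12)
        if score > self.best_score and moved >= self.must_move_this_rate:
            self.best_score = score
            self.rounds_without_move = 0
        else:
            self.rounds_without_move += 1
        return self.rounds_without_move >= self.patience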
Example #4
    def run(self, num_epochs, patience):
        early_stopping = (patience >= 1)
        if early_stopping:
            from early_stopping import EarlyStopping
            self.stopper = EarlyStopping(patience=patience)
        self.initClassifier()
        self.dataset.train()
        self.train_loader = DataLoader(self.dataset,
                                       self.batch_size,
                                       shuffle=True,
                                       num_workers=0)
        self.optimizer = optim.Adam(self.classifier.parameters(),
                                    lr=1e-3,
                                    weight_decay=1e-1,
                                    amsgrad=False)
        self.scheduler = optim.lr_scheduler.ReduceLROnPlateau(self.optimizer,
                                                              'min',
                                                              patience=2,
                                                              cooldown=3,
                                                              factor=0.5)
        self.log_loss = torch.nn.BCEWithLogitsLoss()
        self.pbar = progressbar(range(num_epochs))
        for ep in self.pbar:
            if early_stopping:
                with torch.no_grad():
                    shouldStop = self.test()
                    if shouldStop:
                        self.pbar.close()
                        break
            self.train()

        return self.classifier
Example #5
File: train.py Project: pvk444/etagger
def do_train(model, config, train_data, dev_data):
    early_stopping = EarlyStopping(patience=10, measure='f1', verbose=1)
    maximum = 0
    session_conf = tf.ConfigProto(allow_soft_placement=True,
                                  log_device_placement=False)
    session_conf.gpu_options.allow_growth = True
    sess = tf.Session(config=session_conf)
    feed_dict = {model.wrd_embeddings_init: config.embvec.wrd_embeddings}
    sess.run(tf.global_variables_initializer(),
             feed_dict=feed_dict)  # feed large embedding data
    saver = tf.train.Saver()
    if config.restore is not None:
        saver.restore(sess, config.restore)
        print('model restored')

    # summary setting
    loss_summary = tf.summary.scalar('loss', model.loss)
    acc_summary = tf.summary.scalar('accuracy', model.accuracy)
    train_summary_op = tf.summary.merge([loss_summary, acc_summary])
    train_summary_dir = os.path.join(config.summary_dir, 'summaries', 'train')
    train_summary_writer = tf.summary.FileWriter(train_summary_dir, sess.graph)
    dev_summary_dir = os.path.join(config.summary_dir, 'summaries', 'dev')
    dev_summary_writer = tf.summary.FileWriter(dev_summary_dir, sess.graph)
    for e in range(config.epoch):
        train_step(sess, model, config, train_data, train_summary_op,
                   train_summary_writer)
        m = dev_step(sess, model, config, dev_data, dev_summary_writer, e)
        # early stopping
        if early_stopping.validate(m, measure='f1'): break
        if m > maximum:
            print('new best f1 score! : %s' % m)
            maximum = m
            # save best model
            save_path = saver.save(sess,
                                   config.checkpoint_dir + '/' + 'ner_model')
            print('max model saved in file: %s' % save_path)
            tf.train.write_graph(sess.graph,
                                 '.',
                                 config.checkpoint_dir + '/' + 'graph.pb',
                                 as_text=False)
            tf.train.write_graph(sess.graph,
                                 '.',
                                 config.checkpoint_dir + '/' + 'graph.pb_txt',
                                 as_text=True)
    sess.close()
Example #6
def fit(model, train_data, dev_data):
    """Do actual training. 
    """
    def get_summary_setting(model):
        config = model.config
        sess = model.sess
        loss_summary = tf.summary.scalar('loss', model.loss)
        acc_summary = tf.summary.scalar('accuracy', model.accuracy)
        f1_summary = tf.summary.scalar('f1', model.f1)
        lr_summary = tf.summary.scalar('learning_rate', model.learning_rate)
        train_summary_op = tf.summary.merge(
            [loss_summary, acc_summary, f1_summary, lr_summary])
        train_summary_dir = os.path.join(config.summary_dir, 'summaries',
                                         'train')
        train_summary_writer = tf.summary.FileWriter(train_summary_dir,
                                                     sess.graph)
        dev_summary_dir = os.path.join(config.summary_dir, 'summaries', 'dev')
        dev_summary_writer = tf.summary.FileWriter(dev_summary_dir, sess.graph)
        return train_summary_op, train_summary_writer, dev_summary_writer

    config = model.config
    sess = model.sess

    # restore previous model if provided
    saver = tf.train.Saver()
    if config.restore is not None:
        saver.restore(sess, config.restore)
        tf.logging.debug('model restored')

    # summary setting
    train_summary_op, train_summary_writer, dev_summary_writer = get_summary_setting(
        model)

    # train and evaluate
    early_stopping = EarlyStopping(patience=10, measure='f1', verbose=1)
    max_seqeval_f1 = 0
    for e in range(config.epoch):
        train_step(model, train_data, train_summary_op, train_summary_writer)
        seqeval_f1, avg_f1 = dev_step(model, dev_data, dev_summary_writer, e)
        # early stopping
        if early_stopping.validate(seqeval_f1, measure='f1'): break
        if seqeval_f1 > max_seqeval_f1:
            tf.logging.debug('new best f1 score! : %s' % seqeval_f1)
            max_seqeval_f1 = seqeval_f1
            # save best model
            save_path = saver.save(sess,
                                   config.checkpoint_dir + '/' + 'ner_model')
            tf.logging.debug('max model saved in file: %s' % save_path)
            tf.train.write_graph(sess.graph,
                                 '.',
                                 config.checkpoint_dir + '/' + 'graph.pb',
                                 as_text=False)
            tf.train.write_graph(sess.graph,
                                 '.',
                                 config.checkpoint_dir + '/' + 'graph.pb_txt',
                                 as_text=True)
            early_stopping.reset(max_seqeval_f1)
        early_stopping.status()
    sess.close()
Example #7
def train(model,
          train_iterator,
          valid_iterator,
          test_iterator,
          optimizer,
          criterion,
          clip=1,
          short_train=True,
          n_epochs=10,
          teacher_force=0.5,
          eval_words=None,
          patience=3):
    early_stopping = EarlyStopping(patience=patience,
                                   verbose=False,
                                   filename='cache/checkpoint.pt')
    for epoch in range(n_epochs):
        start_time = time.time()
        train_loss = train_epoch(model,
                                 train_iterator,
                                 optimizer,
                                 criterion,
                                 clip,
                                 short_train,
                                 teacher_force=teacher_force)
        valid_loss, valid_accuracy = evaluate(model,
                                              valid_iterator,
                                              criterion,
                                              eval_words=eval_words)
        end_time = time.time()
        epoch_mins, epoch_secs = epoch_time(start_time, end_time)
        print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
        print(
            f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3E}'
        )
        print(
            f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3E}'
        )
        print(f'\t Val. Accuracy: {valid_accuracy:.3f}')
        early_stopping(valid_loss, model)
        if early_stopping.early_stop:
            print("Early stopping, reloading checkpoint model")
            model.load_state_dict(torch.load('cache/checkpoint.pt'))
            break

    test_loss, test_accuracy = evaluate(model,
                                        test_iterator,
                                        criterion,
                                        eval_words=eval_words)
    print(
        f'| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3E} |'
    )
    print(f'| Test Accuracy: {test_accuracy:.3f}')
Example #8
def hp_search(trial: optuna.Trial):
    if torch.cuda.is_available():
        logger.info("%s", torch.cuda.get_device_name(0))

    global gopt
    opt = gopt
    # set config
    config = load_config(opt)
    config['opt'] = opt
    logger.info("%s", config)

    # set path
    set_path(config)

    # set search spaces
    lr = trial.suggest_loguniform('lr', 1e-6, 1e-3) # .suggest_float('lr', 1e-6, 1e-3, log=True)
    bsz = trial.suggest_categorical('batch_size', [32, 64, 128])
    seed = trial.suggest_int('seed', 17, 42)
    epochs = trial.suggest_int('epochs', 1, opt.epoch)

    # prepare train, valid dataset
    train_loader, valid_loader = prepare_datasets(config, hp_search_bsz=bsz)

    with temp_seed(seed):
        # prepare model
        model = prepare_model(config)
        # create optimizer, scheduler, summary writer, scaler
        optimizer, scheduler, writer, scaler = prepare_osws(config, model, train_loader, hp_search_lr=lr)
        config['optimizer'] = optimizer
        config['scheduler'] = scheduler
        config['writer'] = writer
        config['scaler'] = scaler

        early_stopping = EarlyStopping(logger, patience=opt.patience, measure=opt.measure, verbose=1)
        best_eval_measure = float('inf') if opt.measure == 'loss' else -float('inf')
        for epoch in range(epochs):
            eval_loss, eval_acc = train_epoch(model, config, train_loader, valid_loader, epoch)

            if opt.measure == 'loss': eval_measure = eval_loss 
            else: eval_measure = eval_acc
            # early stopping
            if early_stopping.validate(eval_measure, measure=opt.measure): break
            if opt.measure == 'loss': is_best = eval_measure < best_eval_measure
            else: is_best = eval_measure > best_eval_measure
            if is_best:
                best_eval_measure = eval_measure
                early_stopping.reset(best_eval_measure)
            early_stopping.status()

            trial.report(eval_acc, epoch)
            if trial.should_prune():
                raise optuna.TrialPruned()
        return eval_acc
Example #9
def main():
    # is_training = True
    model = MODEL_DISPATCHER[BASE_MODEL](training=True)
    print(model)
    model = model.to(DEVICE)
    EarlyStoppingObject = EarlyStopping()

    Training_Dataset = BengaliAiDataset(
        folds=TRAINING_FOLDS,
        img_height=IMG_HEIGHT,
        img_width=IMG_WIDTH,
        mean=MODEL_MEAN,
        std=MODEL_STD)

    Train_DataLoader = torch.utils.data.DataLoader(dataset=Training_Dataset,
                                                   batch_size=TRAIN_BATCH_SIZE,
                                                   shuffle=True,
                                                   num_workers=4)

    Validation_Dataset = BengaliAiDataset(
        folds=VALIDATION_FOLDS,
        img_height=IMG_HEIGHT,
        img_width=IMG_WIDTH,
        mean=MODEL_MEAN,
        std=MODEL_STD)

    Validation_DataLoader = torch.utils.data.DataLoader(
        dataset=Validation_Dataset,
        batch_size=TEST_BATCH_SIZE,
        shuffle=False,
        num_workers=4)

    optimiser = torch.optim.Adam(model.parameters(), lr=1e-4)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimiser, mode="min",
                                                           patience=5, factor=0.3,
                                                           verbose=True)

    for epoch in range(EPOCHS):
        train(Training_Dataset, Train_DataLoader, model, optimiser)
        validationScore = evaluate(Validation_Dataset, Validation_DataLoader,
                                   model, optimiser)
        scheduler.step(validationScore)
        print(f"EPOCH : {epoch} VALIDATION SCORE : {validationScore}")
        # torch.save(model.state_dict(), f"../input/output_models/{BASE_MODEL}_fold{VALIDATION_FOLDS[0]}.bin")
        EarlyStoppingObject(
            validationScore, model,
            f"../input/output_models/{BASE_MODEL}_fold{VALIDATION_FOLDS[0]}.bin"
        )
Example #10
def main():
    # Load experiment configuration
    config = load_config()
    manual_seed = config.get('manual_seed', None)
    if manual_seed is not None:
        torch.manual_seed(manual_seed)
        # see https://pytorch.org/docs/stable/notes/randomness.html
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

    # Create the model
    device = config['training']['device']
    model = get_model(config)

    learning_rate = config['training']['learning_rate']
    # momentum = config['training']['momentum']
    wd = config['training']['wd']
    optimizer = optim.Adam(model.parameters(),
                           lr=learning_rate,
                           weight_decay=wd)
    # betas=(0.9, 0.999), eps=1e-08, amsgrad=False
    step_size = config['training']['step_size']
    gamma = config['training']['gamma']
    scheduler = optim.lr_scheduler.StepLR(optimizer,
                                          step_size=step_size,
                                          gamma=gamma,
                                          last_epoch=-1)

    patience = config['training']['patience']
    delta = config['training']['delta']
    early_stopping = EarlyStopping(patience=patience,
                                   verbose=True,
                                   delta=delta,
                                   checkpoint_path=None)
    # Create loss criterion
    loss_type = config['loss']['loss_type']
    w0, w1 = config['loss']['w0'], config['loss']['w1']
    ce_weights = [w0, w1]
    dce_w = config['loss']['dce_w']
    nll = config['loss']['nll']
    criterion = DiceCrossEntropyLoss(loss=loss_type, logging_name=None, ce_weights = ce_weights, \
                                         dce_weight=dce_w, nll=nll)

    # Start training
    train(model, config, optimizer, scheduler, criterion, early_stopping=early_stopping)
Example #11
File: train.py Project: microvnn/ntagger
def train(opt):
    if torch.cuda.is_available():
        logger.info("%s", torch.cuda.get_device_name(0))

    # set etc
    torch.autograd.set_detect_anomaly(True)

    # set config
    config = load_config(opt)
    config['opt'] = opt
    logger.info("%s", config)

    # set path
    set_path(config)

    # prepare train, valid dataset
    train_loader, valid_loader = prepare_datasets(config)

    with temp_seed(opt.seed):
        # prepare model
        model = prepare_model(config)

        # create optimizer, scheduler, summary writer, scaler
        optimizer, scheduler, writer, scaler = prepare_osws(
            config, model, train_loader)
        config['optimizer'] = optimizer
        config['scheduler'] = scheduler
        config['writer'] = writer
        config['scaler'] = scaler

        # training
        early_stopping = EarlyStopping(logger,
                                       patience=opt.patience,
                                       measure='f1',
                                       verbose=1)
        local_worse_epoch = 0
        best_eval_f1 = -float('inf')
        for epoch_i in range(opt.epoch):
            epoch_st_time = time.time()
            eval_loss, eval_f1, best_eval_f1 = train_epoch(
                model, config, train_loader, valid_loader, epoch_i,
                best_eval_f1)
            # early stopping
            if early_stopping.validate(eval_f1, measure='f1'): break
            if eval_f1 == best_eval_f1:
                early_stopping.reset(best_eval_f1)
            early_stopping.status()
Example #12
def main():
    parser = define_args()
    args = parser.parse_args()

    num_classes = 17
    cudnn.benchmark = True

    print('Training model(s) for folds: {}'.format(args.folds))

    for fold in args.folds:

        model, bootstrap_params, full_params = create_model(
            num_classes,
            args.lr,
        )
        criterion = torch.nn.MultiLabelSoftMarginLoss()

        if torch.cuda.is_available():
            model = model.cuda()
            criterion = criterion.cuda()

        bootstrap_optimizer = torch.optim.Adam(bootstrap_params, args.lr)
        optimizer = torch.optim.Adam(full_params, args.lr)

        train_loader, val_loader = create_data_pipeline(fold, args)

        tuner = Tuner(model,
                      criterion,
                      bootstrap_optimizer,
                      optimizer,
                      tag='fold_{}'.format(fold),
                      early_stopping=EarlyStopping(
                          mode='max',
                          threshold_mode='abs',
                          patience=7,
                      ))

        if args.resume:
            if os.path.isfile(args.resume):
                tuner.restore_checkpoint(args.resume)

        tuner.run(train_loader, val_loader)
Example #13
def train(model, train_iterator, valid_iterator, test_iterator, optimizer, criterion, model_checkpoint, device,
          clip=1, short_train=True, n_epochs=50, patience=3):
    
    early_stopping = EarlyStopping(patience=patience, verbose=False, filename=model_checkpoint)
    for epoch in range(n_epochs):
        start_time = time.time()
        train_loss = train_epoch(model, train_iterator, optimizer, criterion, clip, device, short_train)
        valid_loss = evaluate(model, valid_iterator, criterion)
        end_time = time.time()
        epoch_mins, epoch_secs = epoch_time(start_time, end_time)
        
        print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
        print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3E}')
        print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3E}')
        
        early_stopping(valid_loss, model)
        if early_stopping.early_stop:
            print("Early stopping, reloading checkpoint model")
            model.load_state_dict(torch.load(model_checkpoint))
            break

    test_loss = evaluate(model, test_iterator, criterion)
    print(f'| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3E} |')
Example #14
def train(model, train_iter, val_iter, test_iter, optimizer, criterion,
          n_epochs, short_train, checkpoint_name, patience):
    early_stopping = EarlyStopping(filename=checkpoint_name, patience=patience)
    for epoch in range(n_epochs):
        start_time = time.time()
        train_loss = train_epoch(model, train_iter, optimizer, criterion,
                                 short_train)
        val_loss, val_acc = evaluate(model, val_iter, criterion)
        end_time = time.time()

        epoch_min, epoch_sec = epoch_time(start_time, end_time)
        print(f'Epoch: {epoch + 1:02} | Time: {epoch_min}m {epoch_sec}s')
        print(f'\tTrain Loss: {train_loss:.3f}')
        print(f'\t Val. Loss: {val_loss:.3f} | Val. Accuracy {val_acc:.3f}')

        early_stopping(val_loss, model)
        if early_stopping.early_stop:
            print("Early stopping, reloading checkpoint model")
            model.load_state_dict(torch.load(checkpoint_name))
            break

    test_loss, test_acc = evaluate(model, test_iter, criterion)
    print(f'Test Loss: {test_loss:.3f} | Test Accuracy: {test_acc:.3f}')
Example #15
    def __init__(self,
                 trainLoader,
                 testLoader,
                 model,
                 epoch=100,
                 eps=1e-3,
                 savePath="./"):
        self.trainLoader = trainLoader
        self.testLoader = testLoader
        self.model = model
        # self.optimizer=torch.optim.SGD(self.model.parameters(),lr=0.01, momentum=0.9)
        self.optimizer = torch.optim.Adam(self.model.parameters())
        self.scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            self.optimizer, mode="min", patience=3, threshold=0.0001)
        self.start_index = 1
        self.epoch = epoch
        self.eps = eps
        self.vis = visdom.Visdom(env="imageNet")
        self.interval = 1
        self.checker = EarlyStopping(delta=self.eps)
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        self.currentId = self.start_index
        self.savePath = savePath
Example #16
def train_model(train_iterator, val_iterator, test_iterator):
    hidden_size = 8
    vocab_size = len(train_iterator.word2index)
    n_extra_feat = 10
    output_size = 2
    n_layers = 1
    dropout = 0.5
    learning_rate = 0.001
    epochs = 40
    spatial_dropout = True
    bidirectional = True

    # Load the weights matrix
    weights = np.load('glove/weights-biGRU-glove.npy')

    # Check whether system supports CUDA
    CUDA = torch.cuda.is_available()

    model = BiGRU(hidden_size, vocab_size, n_extra_feat, weights, output_size,
                  n_layers, dropout, spatial_dropout, bidirectional)

    # Move the model to GPU if possible
    if CUDA:
        model.cuda()

    model.add_loss_fn(nn.NLLLoss())

    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    model.add_optimizer(optimizer)

    device = torch.device('cuda' if CUDA else 'cpu')

    model.add_device(device)

    # Instantiate the EarlyStopping
    early_stop = EarlyStopping(wait_epochs=2)

    train_losses_list, train_avg_loss_list, train_accuracy_list = [], [], []
    eval_avg_loss_list, eval_accuracy_list, conf_matrix_list = [], [], []

    for epoch in range(epochs):

        print('\nStart epoch [{}/{}]'.format(epoch + 1, epochs))

        train_losses, train_avg_loss, train_accuracy = model.train_model(
            train_iterator)

        train_losses_list.append(train_losses)
        train_avg_loss_list.append(train_avg_loss)
        train_accuracy_list.append(train_accuracy)

        _, eval_avg_loss, eval_accuracy, conf_matrix = model.evaluate_model(
            val_iterator)

        eval_avg_loss_list.append(eval_avg_loss)
        eval_accuracy_list.append(eval_accuracy)
        conf_matrix_list.append(conf_matrix)

        print(
            '\nEpoch [{}/{}]: Train accuracy: {:.3f}. Train loss: {:.4f}. Evaluation accuracy: {:.3f}. Evaluation loss: {:.4f}' \
            .format(epoch + 1, epochs, train_accuracy, train_avg_loss, eval_accuracy, eval_avg_loss))

        if early_stop.stop(eval_avg_loss, model, delta=0.003):
            break

    _, test_avg_loss, test_accuracy, test_conf_matrix = model.evaluate_model(
        test_iterator)
    print('Test accuracy: {:.3f}. Test error: {:.3f}'.format(
        test_accuracy, test_avg_loss))
Example #17
    #                      lr=params.lr,
    #                      warmup=warmup_proportion,
    #                      t_total=num_train_optimization_steps)

    optimizer = AdamW(optimizer_grouped_parameters, lr = params.lr, correct_bias=True )
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps = int(warmup_proportion*num_train_optimization_steps), num_training_steps = num_train_optimization_steps )

    criterion = nn.CrossEntropyLoss(ignore_index=0)
    binary_criterion = nn.BCEWithLogitsLoss(
        pos_weight=torch.Tensor([3932/14263]).to(dev))

    avg_train_losses = []
    avg_valid_losses = []

    # initialize the early_stopping object
    early_stopping = EarlyStopping(patience=params.patience, verbose=True)

    for epoch in range(1, params.n_epochs+1):
        # print("For epoch {} cached is {}\n allocated is {}".format(epoch, 
        #     torch.cuda.memory_cached(0), torch.cuda.memory_allocated(0)))

        print("=========eval at epoch={epoch}=========")
        if not os.path.exists('checkpoints'):
            os.makedirs('checkpoints')
        if not os.path.exists('results'):
            os.makedirs('results')
        fname = os.path.join('checkpoints','epoch_{}_'.format(epoch)+params.run)
        spath = os.path.join('checkpoints','epoch_{}_'.format(epoch)+params.run+".pt")

        # print("For epoch {} cached is {}\n allocated is {}".format(epoch, torch.cuda.memory_cached(0), torch.cuda.memory_allocated(0)))
Example #18
    device = torch.device("cuda:0")
    unet = UnetResnet34().to(device)

    criterion = BinaryFocalLoss2d()

    optimizer = torch.optim.SGD(unet.parameters(),
                                lr=args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)
    scheduler = ReduceLROnPlateau(optimizer,
                                  'max',
                                  factor=args.rop_reduce_factor,
                                  patience=args.rop_patience,
                                  verbose=True)
    early_stopping = EarlyStopping(args.early_stopping_patience, mode='max')

    for epoch in range(args.epochs):
        unet.train()
        train_loss = []
        for images, masks in tqdm(train_loader):
            optimizer.zero_grad()

            images, masks = images.to(device), masks.to(device)
            prediction = unet(images)

            predicted_mask = prediction.squeeze(1)
            masks = masks.squeeze(1)
            loss = criterion(predicted_mask, masks)
            train_loss.append(loss.item())
Example #19
def run(fold, args):
    if args.sz:
        print(f"Images will be resized to {args.sz}")
        args.sz = int(args.sz)

    # get training and valid data
    df = pd.read_csv(args.training_folds_csv)
    if args.loss == 'crossentropy' and not args.isic2019:
        diag_to_ix = {
            v: i
            for i, v in enumerate(sorted(list(set(df.diagnosis))))
        }
        ix_to_diag = {v: i for i, v in diag_to_ix.items()}

    if args.external_csv_path:
        df_external = pd.read_csv(args.external_csv_path)
    df_train = df.query(f"kfold != {fold}").reset_index(drop=True)
    df_valid = df.query(f"kfold == {fold}").reset_index(drop=True)
    print(
        f"Running for K-Fold {fold}; train_df: {df_train.shape}, valid_df: {df_valid.shape}"
    )

    # calculate weights for NN loss
    weights = len(df) / df.target.value_counts().values
    class_weights = torch.FloatTensor(weights)
    if args.loss == 'weighted_bce':
        print(f"assigning weights {weights} to loss fn.")
    if args.loss == 'focal_loss':
        print("Focal loss will be used for training.")
    if args.loss == 'weighted_cross_entropy':
        print(f"assigning weights {weights} to loss fn.")

    # create model
    if 'efficient_net' in args.model_name:
        model = MODEL_DISPATCHER[args.model_name](
            pretrained=args.pretrained,
            arch_name=args.arch_name,
            ce=(args.loss == 'crossentropy'
                or args.loss == 'weighted_cross_entropy'
                or args.load_pretrained_2019))
    else:
        model = MODEL_DISPATCHER[args.model_name](pretrained=args.pretrained)

    if args.model_path is not None:
        print(
            f"Loading pretrained model and updating final layer from {args.model_path}"
        )
        model.load_state_dict(torch.load(args.model_path))
        nftrs = model.base_model._fc.in_features
        model.base_model._fc = nn.Linear(nftrs, 1)

    meta_array = None
    if args.use_metadata:
        # create meta array
        sex_dummy_train = pd.get_dummies(df_train['sex'])[['male', 'female']]
        site_dummy_train = pd.get_dummies(
            df_train['anatom_site_general_challenge'])[[
                'head/neck', 'lower extremity', 'oral/genital', 'palms/soles',
                'torso', 'upper extremity'
            ]]
        assert max(df_train.age_approx) < 100
        age_train = df_train.age_approx.fillna(-5) / 100
        meta_array = pd.concat([sex_dummy_train, site_dummy_train, age_train],
                               axis=1).values
        # modify model forward
        if args.freeze_cnn:
            model.load_state_dict(torch.load(args.model_path))

        # update the forward pass
        model = modify_model(model, args)

        # freeze cnn
        if args.freeze_cnn:
            print("\nFreezing CNN layers!\n")
            for param in model.base_model.parameters():
                param.requires_grad = False

        # add external meta to meta array
        if args.external_csv_path:
            sex_dummy_ext = pd.get_dummies(
                df_external['sex'])[['male', 'female']]
            df_external[
                'anatom_site_general'] = df_external.anatom_site_general.replace(
                    {
                        'anterior torso': 'torso',
                        'lateral torso': 'torso',
                        'posterior torso': 'torso'
                    })
            site_dummy_ext = pd.get_dummies(
                df_external['anatom_site_general'])[[
                    'head/neck', 'lower extremity', 'oral/genital',
                    'palms/soles', 'torso', 'upper extremity'
                ]]
            assert max(df_external.age_approx) < 100
            age_ext = df_external.age_approx.fillna(-5) / 100
            meta_array = np.concatenate([
                meta_array,
                pd.concat([sex_dummy_ext, site_dummy_ext, age_ext],
                          axis=1).values
            ])

        assert meta_array.shape[1] == 9

    model = model.to(args.device)

    train_aug = albumentations.Compose([
        albumentations.RandomScale(0.07),
        albumentations.Rotate(50),
        albumentations.RandomBrightnessContrast(0.15, 0.1),
        albumentations.Flip(p=0.5),
        albumentations.IAAAffine(shear=0.1),
        albumentations.RandomCrop(args.sz, args.sz)
        if args.sz else albumentations.NoOp(),
        albumentations.OneOf([
            albumentations.Cutout(random.randint(1, 8), 16, 16),
            albumentations.CoarseDropout(random.randint(1, 8), 16, 16)
        ]),
        albumentations.Normalize(always_apply=True)
    ])

    valid_aug = albumentations.Compose([
        albumentations.CenterCrop(args.sz, args.sz)
        if args.sz else albumentations.NoOp(),
        albumentations.Normalize(always_apply=True),
    ])

    print(f"\nUsing train augmentations: {train_aug}\n")

    # get train and valid images & targets and add external data if required (external data only contains melonama data)
    train_images = df_train.image_name.tolist()
    if args.external_csv_path:
        external_images = df_external.image.tolist()
        if args.exclude_outliers_2019:
            # from EDA notebook
            external_images = np.load(
                f'/home/ubuntu/repos/kaggle/melonama/data/external/clean_external_2019_{args.sz}.npy'
            ).tolist()
        print(
            f"\n\n{len(external_images)} external images will be added to each training fold."
        )
        train_images = train_images + external_images
    if args.use_pseudo_labels:
        test_df = pd.read_csv(
            '/home/ubuntu/repos/kaggle/melonama/data/test.csv')
        test_images = test_df.image_name.tolist()

        if args.pseudo_images_path:
            test_images = list(
                np.load(args.pseudo_images_path, allow_pickle=True))

        print(
            f"\n\n{len(test_images)} test images will be added to each training fold."
        )
        train_images = train_images + test_images

    train_image_paths = [
        os.path.join(args.train_data_dir, image_name + '.jpg')
        for image_name in train_images
    ]
    train_targets = df_train.target if not args.external_csv_path else np.concatenate(
        [df_train.target.values,
         np.ones(len(external_images))])

    if args.use_pseudo_labels:
        train_targets = np.concatenate([
            train_targets,
            np.load(args.pseudo_labels_path, allow_pickle=True)
        ])

    if args.loss == 'crossentropy':
        df_train['diagnosis'] = df_train.diagnosis.map(diag_to_ix)
        train_targets = df_train.diagnosis.values

    assert len(train_image_paths) == len(
        train_targets
    ), "Length of train images {} doesnt match length of targets {}".format(
        len(train_images), len(train_targets))

    # same for valid dataframe
    valid_images = df_valid.image_name.tolist()
    valid_image_paths = [
        os.path.join(args.train_data_dir, image_name + '.jpg')
        for image_name in valid_images
    ]
    valid_targets = df_valid.target
    if args.loss == 'crossentropy':
        df_valid['diagnosis'] = df_valid.diagnosis.map(diag_to_ix)
        valid_targets = df_valid.diagnosis.values

    print(
        f"\n\n Total Train images: {len(train_image_paths)}, Total val: {len(valid_image_paths)}\n\n"
    )
    # create train and valid dataset, dont use color constancy as already preprocessed in directory
    train_dataset = MelonamaDataset(train_image_paths,
                                    train_targets,
                                    train_aug,
                                    cc=args.cc,
                                    meta_array=meta_array)
    valid_dataset = MelonamaDataset(valid_image_paths,
                                    valid_targets,
                                    valid_aug,
                                    cc=args.cc,
                                    meta_array=meta_array)

    # create dataloaders
    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=args.train_batch_size,
        shuffle=True,
        num_workers=4)
    valid_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=args.valid_batch_size,
        shuffle=False,
        num_workers=4)

    # create optimizer and scheduler for training
    optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate)
    scheduler = torch.optim.lr_scheduler.MultiStepLR(
        optimizer, milestones=[3, 5, 6, 7, 8, 9, 10, 11, 13, 15], gamma=0.5)

    es = EarlyStopping(patience=3,
                       mode='min' if args.metric == 'valid_loss' else 'max')

    for epoch in range(args.epochs):
        train_loss = train_one_epoch(
            args,
            train_loader,
            model,
            optimizer,
            weights=None
            if not args.loss.startswith('weighted') else class_weights)
        preds, valid_loss = evaluate(args, valid_loader, model)
        predictions = np.vstack(preds).ravel()

        if args.loss == 'crossentropy' or args.loss == 'weighted_cross_entropy':
            accuracy = metrics.accuracy_score(valid_targets, predictions)
        else:
            auc = metrics.roc_auc_score(valid_targets, predictions)

        preds_df = pd.DataFrame({
            'predictions': predictions,
            'targets': valid_targets,
            'valid_image_paths': valid_image_paths
        })
        print(
            f"Epoch: {epoch}, Train loss: {train_loss}, Valid loss: {valid_loss}, Valid Score: {locals()[f'{args.metric}']}"
        )

        scheduler.step()
        for param_group in optimizer.param_groups:
            print(f"Current Learning Rate: {param_group['lr']}")
        es(locals()[f"{args.metric}"],
           model,
           model_path=
           f"/home/ubuntu/repos/kaggle/melonama/models/{syd_now.strftime(r'%d%m%y')}/{args.arch_name}_fold_{fold}_{args.sz}_{locals()[f'{args.metric}']}.bin",
           preds_df=preds_df,
           df_path=
           f"/home/ubuntu/repos/kaggle/melonama/valid_preds/{syd_now.strftime(r'%d%m%y')}/{args.arch_name}_fold_{fold}_{args.sz}_{locals()[f'{args.metric}']}.bin",
           args=args)
        if es.early_stop:
            return preds_df
Example #20
def train(fold):
    training_data_path = "/home/dragoshh1984/repos/kaggle/datasets/melanomia_classification/512x512-dataset-melanoma/512x512-dataset-melanoma"
    model_path = "/home/dragoshh1984/repos/kaggle/melanomia-classification"
    df = pd.read_csv(
        "/home/dragoshh1984/repos/kaggle/datasets/melanomia_classification/new_train.csv"
    )

    # defines
    device = "cuda"
    epochs = 20
    train_bs = 16
    valid_bs = 16

    # for this model
    mean = (0.485, 0.456, 0.406)
    std = (0.229, 0.224, 0.225)

    # data for training
    df_train = df[df.fold != fold].reset_index(drop=True)
    df_valid = df[df.fold == fold].reset_index(drop=True)

    # augmentations
    train_aug = albumentations.Compose([
        albumentations.RandomResizedCrop(224, 224, (0.7, 1.0)),
        albumentations.HorizontalFlip(),
        albumentations.VerticalFlip(),
        albumentations.Cutout(),
        albumentations.RandomBrightness(),
        albumentations.RandomContrast(),
        albumentations.Rotate(),
        albumentations.RandomScale(),
        albumentations.PadIfNeeded(300, 300),
        albumentations.Normalize(mean,
                                 std,
                                 max_pixel_value=255.0,
                                 always_apply=True),
    ])

    valid_aug = albumentations.Compose([
        albumentations.RandomResizedCrop(224, 224, (0.7, 1.0)),
        albumentations.HorizontalFlip(),
        albumentations.VerticalFlip(),
        albumentations.Cutout(),
        albumentations.RandomBrightness(),
        albumentations.RandomContrast(),
        albumentations.Rotate(),
        albumentations.RandomScale(),
        albumentations.PadIfNeeded(300, 300),
        albumentations.Normalize(mean,
                                 std,
                                 max_pixel_value=255.0,
                                 always_apply=True),
    ])

    train_images = df_train.image_id.values.tolist()
    train_images = [
        os.path.join(training_data_path, i + ".jpg") for i in train_images
    ]
    train_metada = df_train.drop([
        "fold", "target", "image_id", "patient_id", "source", "stratify_group"
    ],
                                 axis=1).values.tolist()
    train_targets = df_train.target.values

    valid_images = df_valid.image_id.values.tolist()
    valid_images = [
        os.path.join(training_data_path, i + ".jpg") for i in valid_images
    ]
    valid_metadata = df_valid.drop([
        "fold", "target", "image_id", "patient_id", "source", "stratify_group"
    ],
                                   axis=1).values.tolist()
    valid_targets = df_valid.target.values

    # datasets
    training_dataset = ClassificationLoader(image_paths=train_images,
                                            metadata=train_metada,
                                            targets=train_targets,
                                            resize=None,
                                            augmentations=train_aug)

    # loaders
    train_loader = torch.utils.data.DataLoader(training_dataset,
                                               batch_size=train_bs,
                                               shuffle=True,
                                               num_workers=4)

    valid_dataset = ClassificationLoader(image_paths=valid_images,
                                         metadata=valid_metadata,
                                         targets=valid_targets,
                                         resize=None,
                                         augmentations=valid_aug)

    valid_loader = torch.utils.data.DataLoader(valid_dataset,
                                               batch_size=valid_bs,
                                               shuffle=False,
                                               num_workers=4)

    model = EfficientNet_tabular(pretrained="imagenet")
    model.to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

    # max for auc metric
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                           patience=3,
                                                           mode="max")

    # early stopping
    es = EarlyStopping(patience=3, mode="max")
    # import pdb; pdb.set_trace()
    for epoch in range(epochs):
        training_loss = Engine.train(train_loader, model, optimizer, device)
        predictions, valid_loss = Engine.evaluate(valid_loader, model, device)
        # import pdb; pdb.set_trace()
        predictions = np.vstack((predictions)).ravel()
        auc = metrics.roc_auc_score(valid_targets, predictions)
        scheduler.step(auc)

        print(f"epoch={epoch}, auc={auc}")
        es(auc, model, os.path.join(model_path, f"model{fold}.bin"))

        if es.early_stop:
            print("early stopping")
            break
Example #21
File: train.py Project: xiaoanshi/ntagger
def train(opt):
    if torch.cuda.is_available():
        logger.info("%s", torch.cuda.get_device_name(0))

    # set etc
    torch.autograd.set_detect_anomaly(True)

    # set config
    config = load_config(opt)
    config['opt'] = opt
    logger.info("%s", config)
 
    # set path
    set_path(config)
  
    # prepare train, valid dataset
    train_loader, valid_loader = prepare_datasets(config)

    with temp_seed(opt.seed):
        # prepare model
        model = prepare_model(config)

        # create optimizer, scheduler, summary writer, scaler
        optimizer, scheduler, writer, scaler = prepare_osws(config, model, train_loader)
        config['optimizer'] = optimizer
        config['scheduler'] = scheduler
        config['writer'] = writer
        config['scaler'] = scaler

        # training
        early_stopping = EarlyStopping(logger, patience=opt.patience, measure='f1', verbose=1)
        local_worse_steps = 0
        prev_eval_f1 = -float('inf')
        best_eval_f1 = -float('inf')
        for epoch_i in range(opt.epoch):
            epoch_st_time = time.time()
            eval_loss, eval_f1 = train_epoch(model, config, train_loader, valid_loader, epoch_i)
            # early stopping
            if early_stopping.validate(eval_f1, measure='f1'): break
            if eval_f1 > best_eval_f1:
                best_eval_f1 = eval_f1
                if opt.save_path:
                    logger.info("[Best model saved] : {:10.6f}".format(best_eval_f1))
                    save_model(config, model)
                    # save finetuned bert model/config/tokenizer
                    if config['emb_class'] in ['bert', 'distilbert', 'albert', 'roberta', 'bart', 'electra']:
                        if not os.path.exists(opt.bert_output_dir):
                            os.makedirs(opt.bert_output_dir)
                        model.bert_tokenizer.save_pretrained(opt.bert_output_dir)
                        model.bert_model.save_pretrained(opt.bert_output_dir)
                early_stopping.reset(best_eval_f1)
            early_stopping.status()
            # scheduling: apply learning-rate decay when the measure (e.g. loss) has been getting worse for lr_decay_steps epochs.
            if prev_eval_f1 >= eval_f1:
                local_worse_steps += 1
            else:
                local_worse_steps = 0
            logger.info('Scheduler: local_worse_steps / opt.lr_decay_steps = %d / %d' % (local_worse_steps, opt.lr_decay_steps))
            if not opt.use_transformers_optimizer and \
               epoch_i > opt.warmup_epoch and \
               (local_worse_steps >= opt.lr_decay_steps or early_stopping.step() > opt.lr_decay_steps):
                scheduler.step()
                local_worse_steps = 0
            prev_eval_f1 = eval_f1
Example #22
    def train(self):
        train_sampler = RandomSampler(self.train_dataset)
        train_dataloader = DataLoader(self.train_dataset, sampler=train_sampler, batch_size=self.args.train_batch_size)
        writer = SummaryWriter(log_dir = self.args.model_dir)
        if self.args.max_steps > 0:
            t_total = self.args.max_steps
            self.args.num_train_epochs = self.args.max_steps // (len(train_dataloader) // self.args.gradient_accumulation_steps) + 1
        else:
            t_total = len(train_dataloader) // self.args.gradient_accumulation_steps * self.args.num_train_epochs

        # Prepare optimizer and schedule (linear warmup and decay)
        no_decay = ['bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in self.model.named_parameters() if not any(nd in n for nd in no_decay)],
             'weight_decay': self.args.weight_decay},
            {'params': [p for n, p in self.model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]
        optimizer = AdamW(optimizer_grouped_parameters, lr=self.args.learning_rate, eps=self.args.adam_epsilon)
        scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=self.args.warmup_steps, num_training_steps=t_total)

        # Train!
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(self.train_dataset))
        logger.info("  Num Epochs = %d", self.args.num_train_epochs)
        logger.info("  Total train batch size = %d", self.args.train_batch_size)
        logger.info("  Gradient Accumulation steps = %d", self.args.gradient_accumulation_steps)
        logger.info("  Total optimization steps = %d", t_total)
        logger.info("  Logging steps = %d", self.args.logging_steps)
        logger.info("  Save steps = %d", self.args.save_steps)

        global_step = 0
        tr_loss = 0.0
        self.model.zero_grad()

        train_iterator = trange(int(self.args.num_train_epochs), desc="Epoch")
        early_stopping = EarlyStopping(patience = self.args.early_stopping, verbose = True)


        for _ in train_iterator:
            epoch_iterator = tqdm(train_dataloader, desc="Iteration", position=0, leave=True)
            print("\nEpoch", _)
            for step, batch in enumerate(epoch_iterator):
                self.model.train()
                batch = tuple(t.to(self.device) for t in batch)  # GPU or CPU

                inputs = {'input_ids': batch[0],
                          'attention_mask': batch[1],
                          'intent_label_ids': batch[3],
                          'slot_labels_ids': batch[4]}
                if self.args.model_type != 'distilbert':
                    inputs['token_type_ids'] = batch[2]
                outputs = self.model(**inputs)
                loss = outputs[0]
                
                if self.args.gradient_accumulation_steps > 1:
                    loss = loss / self.args.gradient_accumulation_steps

                loss.backward()

                tr_loss += loss.item()
                if (step + 1) % self.args.gradient_accumulation_steps == 0:
                    torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.args.max_grad_norm)

                    optimizer.step()
                    scheduler.step()  # Update learning rate schedule
                    self.model.zero_grad()
                    global_step += 1

                    if self.args.logging_steps > 0 and global_step % self.args.logging_steps == 0:
                        print('\nTuning metrics:', self.args.tuning_metric)
                        results = self.evaluate("dev")
                        writer.add_scalar("Loss/validation", results['loss'], _)
                        writer.add_scalar("Intent Accuracy/validation", results['intent_acc'], _)
                        writer.add_scalar("Slot F1/validation", results['slot_f1'], _)
                        writer.add_scalar("Mean Intent Slot", results['mean_intent_slot'], _)
                        writer.add_scalar("Sentence Accuracy/validation", results['semantic_frame_acc'], _)
                        early_stopping(results[self.args.tuning_metric], self.model, self.args)
                        if early_stopping.early_stop:
                            print("Early stopping")
                            break


                    # if self.args.save_steps > 0 and global_step % self.args.save_steps == 0:
                    #     self.save_model()

                if 0 < self.args.max_steps < global_step:
                    epoch_iterator.close()
                    break

            if 0 < self.args.max_steps < global_step or early_stopping.early_stop:
                train_iterator.close()
                break
            writer.add_scalar("Loss/train", tr_loss / global_step, _)

        return global_step, tr_loss / global_step
Example #23
File: model.py Project: jianshu93/PEPMAN
def train():
    lr = 0.002
    lambd = 0.001
    MAX_EPOCHS = 35
    bs = 32
    fp = open('../../../../4.training_data_all_4_1/data.pkl', 'rb')
    data = pickle.load(fp)
    X_train_pos = data['X_train_pos']
    X_train_neg_all = data['X_train_neg']
    X_valid_pos = data['X_valid_pos']
    X_valid_neg_all = data['X_valid_neg']
    x_pos_train = X_train_pos
    train_pos_num = len(x_pos_train)
    train_neg_num = multi * train_pos_num
    train_num = train_pos_num + train_neg_num
    Y_pos_train = np.ones((train_pos_num, 1), dtype=np.float32)
    Y_neg_train = np.zeros((train_neg_num, 1), dtype=np.float32)
    Y_train = np.concatenate((Y_pos_train, Y_neg_train))
    x_pos_valid = X_valid_pos
    valid_pos_num = len(x_pos_valid)
    valid_neg_num = multi * valid_pos_num
    valid_num = valid_pos_num + valid_neg_num
    Y_pos_valid = np.ones((valid_pos_num, 1), dtype=np.float32)
    Y_neg_valid = np.zeros((valid_neg_num, 1), dtype=np.float32)
    Y_valid = np.concatenate((Y_pos_valid, Y_neg_valid))
    print('train_sequence:', train_pos_num + train_neg_num)
    print('valid_sequence:', valid_neg_num + valid_pos_num)

    for m in range(10):
        print('model {}'.format(m))
        savedir = './model_file/model_' + str(m)
        if not os.path.exists(savedir):
            os.makedirs(savedir)

        np.random.shuffle(X_train_neg_all)
        np.random.shuffle(X_valid_neg_all)
        x_neg_valid = X_valid_neg_all[:valid_neg_num]
        x_valid = np.vstack((x_pos_valid, x_neg_valid))

        net = Net()
        label_loss = nn.BCELoss(reduction='none')

        if USE_CUDA:
            net = net.cuda()
        optimizer = torch.optim.Adam(net.parameters(), lr=lr)
        scheduler = optim.lr_scheduler.StepLR(optimizer,
                                              step_size=5,
                                              gamma=0.8)
        earlystop = EarlyStopping(patience=10)

        valid_dataset = TensorDataset(
            torch.from_numpy(x_valid).cuda(),
            torch.from_numpy(Y_valid).cuda())
        valid_loader = DataLoader(dataset=valid_dataset,
                                  batch_size=bs,
                                  shuffle=True)
        for epochs in range(MAX_EPOCHS):
            np.random.shuffle(X_train_neg_all)
            x_neg_train = X_train_neg_all[:train_neg_num]
            x_train = np.vstack((x_pos_train, x_neg_train))
            train_dataset = TensorDataset(
                torch.from_numpy(x_train).cuda(),
                torch.from_numpy(Y_train).cuda())
            train_loader = DataLoader(dataset=train_dataset,
                                      batch_size=bs,
                                      shuffle=True)
            Loss = 0
            net.train()
            for i, (x, y) in enumerate(train_loader):
                x = Variable(x)
                y = Variable(y)
                bss = x.size(0)
                output = net(x)
                #loss=torch.mean(label_loss(output,y)+lambd*pterm)
                loss = torch.mean(label_loss(output, y))
                if USE_CUDA:
                    Loss += loss.cpu().data.numpy() * bss
                else:
                    Loss += loss.data.numpy() * bss  # weight by batch size, matching the CUDA branch
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                if i % 20 == 0 and i != 0:
                    pass
                    #print('\rEpoch {}, process {}, loss {}, loss1{}, loss2{}'.format(epochs,i*bs/x_train.shape[0],Loss/i,Loss1/i,Loss2/i),end='')
            Loss /= train_num
            prob = []
            Y = []
            validloss = 0
            net.eval()
            for i, (x, y) in enumerate(valid_loader):
                Y.append(y.cpu().data.numpy())
                x = Variable(x)
                y = Variable(y)
                bss = x.size(0)
                with torch.no_grad():
                    output = net(x)
                    prob.append(output.cpu().data.numpy())
                    #loss=torch.mean(label_loss(output,y)+lambd*pterm)
                    loss = torch.mean(label_loss(output, y))

                    validloss += loss.cpu().data.numpy() * bss
            validloss /= valid_num
            prob = np.concatenate(prob)
            Y = np.concatenate(Y)
            vfpr, vtpr, vthresholds = metrics.roc_curve(Y, prob, pos_label=1)
            vauc = metrics.auc(vfpr, vtpr)
            print('Epoch {}, trainloss {}, validloss {}, vauc: {}'.format(
                epochs, Loss, validloss, vauc))
            #print(' vauc: {}'.format(vauc))
            earlystop(validloss, net, savedir)
            if earlystop.early_stop:
                print('early_stopping at {}'.format(epochs))
                break
            scheduler.step()
예제 #24
0
                 conv3_filter_size=(2, 2),
                 pool3_pool_size=(2, 2),
                 dropout3_p=0.3,
                 hidden4_num_units=1000,
                 dropout4_p=0.5,
                 hidden5_num_units=1000,
                 output_num_units=30,
                 output_nonlinearity=None,
                 update_learning_rate=theano.shared(float32(0.03)),
                 update_momentum=theano.shared(float32(0.9)),
                 regression=True,
                 batch_iterator_train=FlipBatchIterator(batch_size=128),
                 on_epoch_finished=[
                     AdjustVariable('update_learning_rate',
                                    start=0.03,
                                    stop=0.0001),
                     AdjustVariable('update_momentum', start=0.9, stop=0.999),
                     EarlyStopping(patience=200)
                 ],
                 max_epochs=10000,
                 verbose=1)

X, y = load2d()  # load 2D data
net8.fit(X, y)

# Training for 10000 epochs will take a while. We'll pickle the
# trained model so that we can load it back later:

from pickle import dump
with open("net8_{0}.pickle".format(sys.argv[1]), 'wb') as f:
    dump(net8, f, -1)
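
To reuse the pickled network later, it can be loaded back as the mirror image of the save call above (a minimal sketch; the filename pattern and the sys.argv argument are taken directly from that call):

import sys
from pickle import load

with open("net8_{0}.pickle".format(sys.argv[1]), 'rb') as f:
    net8 = load(f)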
예제 #25
0
    def train(self, train_param, verbose=False):
        # unpack
        model = train_param['model']
        data = train_param['data']
        loss_param = train_param['loss']
        loss_param['train_loss'] = train_param['train_loss']
        loss_param['test_loss'] = train_param['test_loss']

        # write loss file
        loss_file = open(self.path_log + 'loss_' + self.current_hash + '.txt',
                         "a")
        loss_file.write(
            '\n==========================================================================\n'
        )

        # get training data loader
        train_batch_size = math.ceil(
            data.y_train_tensor.size(0) / train_param['num_batch'])
        data_train_loader = DataLoader(list(
            zip(data.X_train_tensor, data.y_train_tensor,
                data.X_train_origin)),
                                       batch_size=train_batch_size,
                                       shuffle=True)

        # get validation data loader
        val_batch_size = math.ceil(
            data.y_val_tensor.size(0) / train_param['num_batch'])
        data_val_loader = DataLoader(list(
            zip(data.X_val_tensor, data.y_val_tensor, data.X_val_origin)),
                                     batch_size=val_batch_size,
                                     shuffle=True)

        # get test data loader
        test_batch_size = math.ceil(
            data.y_test_tensor.size(0) / train_param['num_batch'])
        data_test_loader = DataLoader(list(
            zip(data.X_test_tensor, data.y_test_tensor, data.X_test_origin)),
                                      batch_size=test_batch_size,
                                      shuffle=True)

        # Optimizer
        optimizer = train_param['optimizer'](
            model.parameters(), weight_decay=train_param['L2_reg'])

        # cyclical scheduler
        if train_param['cyclical']:
            scheduler = torch.optim.lr_scheduler.CyclicLR(
                optimizer, **train_param['cyclical'])

        # MSE Loss
        criterion = torch.nn.MSELoss()

        # path to save model
        path_to_model = self.path_model + '%s.pt' % self.current_hash

        # for early stopping
        if train_param['early_stopping']:
            train_param['early_stopping']['saved_model'] = path_to_model
            early_stopping = EarlyStopping(**train_param['early_stopping'])

        # For Recording Losses
        NUMEPOCHS = train_param['epochs']
        train_losses = np.zeros(NUMEPOCHS)
        val_losses = np.zeros(NUMEPOCHS)
        test_losses = np.zeros(NUMEPOCHS)
        train_phy_losses = np.zeros(NUMEPOCHS)
        val_phy_losses = np.zeros(NUMEPOCHS)
        test_phy_losses = np.zeros(NUMEPOCHS)
        train_norm_phy_losses = np.zeros(NUMEPOCHS)
        val_norm_phy_losses = np.zeros(NUMEPOCHS)
        test_norm_phy_losses = np.zeros(NUMEPOCHS)
        train_e_losses = np.zeros(NUMEPOCHS)
        val_e_losses = np.zeros(NUMEPOCHS)
        test_e_losses = np.zeros(NUMEPOCHS)
        train_all_losses = np.zeros(NUMEPOCHS)
        val_all_losses = np.zeros(NUMEPOCHS)
        test_all_losses = np.zeros(NUMEPOCHS)
        lambda_s_train = np.zeros(NUMEPOCHS)
        lambda_e_train = np.zeros(NUMEPOCHS)
        lambda_s_test = np.zeros(NUMEPOCHS)
        lambda_e_test = np.zeros(NUMEPOCHS)

        # write log file
        task_timestamp = self.str_now()
        statistics = \
        """
==========================================================================
Action: training model.
Time: %s
Task Id: %s

Number of Epochs:   %d
Train Batch Size:   %d
Test Batch Size:    %d
Optimizer:          %s

Training Loss:      %s
Test Loss:          %s

--------------------------------------------------------------------------
            
        """ % \
        (
            task_timestamp,
            self.current_hash,
            NUMEPOCHS,
            train_batch_size,
            test_batch_size,
            optimizer,
            train_param['train_loss'],
            train_param['test_loss'],
        )

        if verbose:
            print(statistics)

        # write log file
        log_file = open(self.path_log + 'log_' + self.current_hash + '.txt',
                        "a")
        log_file.write(statistics)
        log_file.close()
        loss_file.write('Time: %s\n' % task_timestamp)
        loss_file.write('Task Id: %s\n' % self.current_hash)

        # training
        loss_file.write(
            'Epoch \t Training \t Test \t\t Loss-Phy \t Loss-E \t Annealing Factor\n'
        )

        e_coff = loss_param['lambda_e0']
        s_coff = loss_param['lambda_s']

        if loss_param['noise']:
            noise_param = loss_param['noise']
            mode = noise_param['mode']
            mean = noise_param['mean']
            var = noise_param['var']
            noise_decay = noise_param['decay']

        if loss_param['cyclical']:
            cyclic_param = loss_param['cyclical']
            cyclic_mode = cyclic_param['mode']
            cyclic_mean = cyclic_param['mean']
            amp = cyclic_param['amp']
            period = cyclic_param['period']
            cyclic_decay = cyclic_param['decay']

        # the progress bar
        if self.master_bar is not None:
            child_bar = progress_bar(range(NUMEPOCHS), parent=self.master_bar)
            self.master_bar.names = ['train', 'val', 'test']
        else:
            child_bar = range(NUMEPOCHS)

        # record when training started
        train_time = 0

        if 'break_loop_early' in train_param:
            if train_param['break_loop_early'] == False:
                stopped_epoch = NUMEPOCHS - 1

        # save initial state of the model
        torch.save(model.state_dict(), self.path_state + 'state_0.pt')

        for epoch in child_bar:
            model.train()
            start_time = time.time()  # start recording time
            if train_param['train_loss'] != []:
                for batchX, batchY, batchH in data_train_loader:
                    # Forward pass
                    outputs = model(batchX)

                    # add noise
                    if loss_param['noise']:
                        s_coff = loss_param['lambda_s']
                        noise = self.generate_noise(mode=mode,
                                                    mean=mean,
                                                    var=var)
                        s_coff += noise
                        s_coff = max(0, s_coff)

                    # add cyclical perturbation to lambda_s
                    if loss_param['cyclical']:
                        s_coff = loss_param['lambda_s']
                        cyclic = self.cyclical(mode=cyclic_mode,
                                               epoch=epoch,
                                               mean=cyclic_mean,
                                               amp=amp,
                                               period=period)
                        s_coff += cyclic
                        s_coff = max(0, s_coff)

                    lambda_s_train[epoch] = s_coff
                    lambda_e_train[epoch] = e_coff

                    # calculate gradients and save
                    (loss_all, loss_mse, loss_phy, loss_phy_norm, loss_e,
                     loss_se) = self.loss_for_grad(data,
                                                   train_param['train_loss'],
                                                   outputs=outputs,
                                                   e_coff=e_coff,
                                                   s_coff=s_coff,
                                                   batchX=batchX,
                                                   batchY=batchY,
                                                   batchH=batchH,
                                                   norm=loss_param['norm_wf'])
                    grad_all = self.calc_gradient(
                        loss=loss_all,
                        model=model,
                        save_name='train_all_%d.pkl' % (epoch + 1))
                    grad_mse = self.calc_gradient(
                        loss=loss_mse,
                        model=model,
                        save_name='train_mse_%d.pkl' % (epoch + 1))
                    grad_phy = self.calc_gradient(loss=loss_phy,
                                                  model=model,
                                                  save_name='train_s_%d.pkl' %
                                                  (epoch + 1))
                    grad_phy_norm = self.calc_gradient(
                        loss=loss_phy_norm,
                        model=model,
                        save_name='train_train_s_norm_%d.pkl' % (epoch + 1))
                    grad_e = self.calc_gradient(loss=loss_e,
                                                model=model,
                                                save_name='train_e_%d.pkl' %
                                                (epoch + 1))
                    grad_se = self.calc_gradient(loss=loss_se,
                                                 model=model,
                                                 save_name='train_se_%d.pkl' %
                                                 (epoch + 1))

                    # Backward and optimize
                    optimizer.zero_grad()
                    loss = self.loss_func(data,
                                          train_param['train_loss'],
                                          outputs=outputs,
                                          e_coff=e_coff,
                                          s_coff=s_coff,
                                          batchX=batchX,
                                          batchY=batchY,
                                          batchH=batchH,
                                          norm=loss_param['norm_wf'])[0]

                    loss.backward()
                    if train_param['cyclical']:
                        scheduler.step()
                    else:
                        optimizer.step()

            if train_param['test_loss'] != []:
                for batchX, batchY, batchH in data_test_loader:
                    # Forward pass
                    outputs = model(batchX)

                    # add noise
                    if loss_param['noise']:
                        s_coff = loss_param['lambda_s']
                        noise = self.generate_noise(mode=mode,
                                                    mean=mean,
                                                    var=var)
                        s_coff += noise
                        s_coff = max(0, s_coff)

                    # add cyclical perturbation to lambda_s
                    if loss_param['cyclical']:
                        s_coff = loss_param['lambda_s']
                        cyclic = self.cyclical(mode=cyclic_mode,
                                               epoch=epoch,
                                               mean=cyclic_mean,
                                               amp=amp,
                                               period=period)
                        s_coff += cyclic
                        s_coff = max(0, s_coff)

                    lambda_s_test[epoch] = s_coff
                    lambda_e_test[epoch] = e_coff

                    # calculate gradients and save
                    (loss_all, loss_mse, loss_phy, loss_phy_norm, loss_e,
                     loss_se) = self.loss_for_grad(
                         data,
                         train_param['test_loss'],
                         outputs=outputs,
                         e_coff=e_coff,
                         s_coff=s_coff,
                         batchX=batchX,
                         batchY=None,
                         batchH=batchH,
                         norm=loss_param['norm_wf'],
                     )

                    grad_all = self.calc_gradient(loss=loss_all,
                                                  model=model,
                                                  save_name='test_all_%d.pkl' %
                                                  (epoch + 1))
                    grad_mse = self.calc_gradient(loss=loss_mse,
                                                  model=model,
                                                  save_name='test_mse_%d.pkl' %
                                                  (epoch + 1))
                    grad_phy = self.calc_gradient(loss=loss_phy,
                                                  model=model,
                                                  save_name='test_s_%d.pkl' %
                                                  (epoch + 1))
                    grad_phy_norm = self.calc_gradient(
                        loss=loss_phy_norm,
                        model=model,
                        save_name='test_s_norm_%d.pkl' % (epoch + 1))
                    grad_e = self.calc_gradient(loss=loss_e,
                                                model=model,
                                                save_name='test_e_%d.pkl' %
                                                (epoch + 1))
                    grad_se = self.calc_gradient(loss=loss_se,
                                                 model=model,
                                                 save_name='test_se_%d.pkl' %
                                                 (epoch + 1))

                    loss = self.loss_func(
                        data,
                        train_param['test_loss'],
                        outputs=outputs,
                        e_coff=e_coff,
                        s_coff=s_coff,
                        batchX=batchX,
                        batchY=None,
                        batchH=batchH,
                        norm=loss_param['norm_wf'],
                    )[0]

                    # Backward and optimize
                    optimizer.zero_grad()
                    loss.backward()
                    if train_param['cyclical']:
                        scheduler.step()
                    else:
                        optimizer.step()

            end_time = time.time()  # end recording time
            train_time += end_time - start_time  # accumulate training time

            # record the loss history
            model.eval()

            # save the model state after this epoch
            torch.save(model.state_dict(),
                       self.path_state + 'state_%d.pt' % (epoch + 1))

            train_losses[epoch] = criterion(model(data.X_train_tensor),
                                            data.y_train_tensor).item()
            val_losses[epoch] = criterion(model(data.X_val_tensor),
                                          data.y_val_tensor).item()
            test_losses[epoch] = criterion(model(data.X_test_tensor),
                                           data.y_test_tensor).item()

            s_coff = loss_param['lambda_s']
            # train losses
            (loss, train_phy_losses[epoch], train_norm_phy_losses[epoch],
             train_e_losses[epoch]) = self.loss_func(
                 data,
                 train_param['train_loss'],
                 outputs=model(data.X_train_tensor),
                 e_coff=e_coff,
                 s_coff=s_coff,
                 batchX=data.X_train_tensor,
                 batchY=data.y_train_tensor,
                 batchH=data.X_train_origin,
                 norm=loss_param['norm_wf'])
            train_all_losses[epoch] = float(loss)

            # val losses
            (loss, val_phy_losses[epoch], val_norm_phy_losses[epoch],
             val_e_losses[epoch]) = self.loss_func(data,
                                                   train_param['test_loss'],
                                                   outputs=model(
                                                       data.X_val_tensor),
                                                   e_coff=e_coff,
                                                   s_coff=s_coff,
                                                   batchX=data.X_val_tensor,
                                                   batchY=data.y_val_tensor,
                                                   batchH=data.X_val_origin,
                                                   norm=loss_param['norm_wf'])
            val_all_losses[epoch] = float(loss)

            # test losses
            (loss, test_phy_losses[epoch], test_norm_phy_losses[epoch],
             test_e_losses[epoch]) = self.loss_func(data,
                                                    train_param['test_loss'],
                                                    outputs=model(
                                                        data.X_test_tensor),
                                                    e_coff=e_coff,
                                                    s_coff=s_coff,
                                                    batchX=data.X_test_tensor,
                                                    batchY=data.y_test_tensor,
                                                    batchH=data.X_test_origin,
                                                    norm=loss_param['norm_wf'])
            test_all_losses[epoch] = float(loss)

            if epoch % loss_param['anneal_interval'] == 0:
                e_coff *= loss_param['anneal_factor']
                if loss_param['noise']:
                    var *= noise_decay
                if loss_param['cyclical']:
                    amp *= cyclic_decay

            if epoch % train_param['print_interval'] == 0:
                loss_file.write(
                    '%d \t %.8f \t %.8f \t %.8f \t %.8f \t %.8f\n' %
                    (epoch, train_losses[epoch], test_losses[epoch],
                     val_phy_losses[epoch], val_e_losses[epoch], e_coff))

            # plot loss curve
            if epoch % 1 == 0 and self.master_bar is not None and self.plot_flag:
                y_upper_bound = max(train_losses.max(), val_losses.max(),
                                    test_losses.max())
                x_axis = np.arange(epoch + 1) + 1
                graphs = [[x_axis, train_losses[:epoch + 1]],
                          [x_axis, val_losses[:epoch + 1]],
                          [x_axis, test_losses[:epoch + 1]]]
                x_bounds = [0, NUMEPOCHS]
                y_bounds = [0.0, y_upper_bound]
                self.master_bar.update_graph(graphs, x_bounds, y_bounds)

            # early stopping
            if train_param['early_stopping']:
                early_stopping(val_losses[epoch], model)
                if early_stopping.early_stop:
                    if 'break_loop_early' in train_param:
                        if train_param['break_loop_early'] == True:
                            break
                        else:
                            stopped_epoch = min(epoch, stopped_epoch)
                    else:
                        break

        # record when training stopped and calculate time
        time_per_epoch = train_time / (epoch + 1)

        if 'break_loop_early' in train_param:
            if train_param['break_loop_early'] == False:
                epoch = stopped_epoch

        # print loss in log files
        if verbose and self.master_bar is not None:
            self.master_bar.write('Training stopped at %d/%d.' %
                                  (epoch, NUMEPOCHS))
        loss_file.write('Training stopped at %d/%d.' % (epoch, NUMEPOCHS))
        loss_file.write('Training time: %f seconds.' % train_time)
        loss_file.write('\nTraining Complete')
        loss_file.write(
            '\n--------------------------------------------------------------------------\n'
        )
        loss_file.close()

        # data frame for losses
        df_loss = pd.DataFrame({
            'train_mse': train_losses,
            'val_mse': val_losses,
            'test_mse': test_losses,
            'train_phy': train_phy_losses,
            'val_phy': val_phy_losses,
            'test_phy': test_phy_losses,
            'train_norm_phy': train_norm_phy_losses,
            'train_e': train_e_losses,
            'val_e': val_e_losses,
            'test_e': test_e_losses,
            'val_norm_phy': val_norm_phy_losses,
            'test_norm_phy': test_norm_phy_losses,
            'train_all': train_all_losses,
            'val_all': val_all_losses,
            'test_all': test_all_losses,
            'lambda_s_train': lambda_s_train,
            'lambda_s_test': lambda_s_test,
            'lambda_e_train': lambda_e_train,
            'lambda_e_test': lambda_e_test
        })
        df_loss.to_csv(self.path_out + "losses_" + self.current_hash + ".csv",
                       index=False)

        # training statistics to return
        train_stats = {
            'epoch': epoch,
            'train_time': train_time,
            'time_per_epoch': time_per_epoch
        }

        # save or load model
        if train_param['early_stopping']:
            model.load_state_dict(torch.load(path_to_model))
        else:
            torch.save(model.state_dict(), path_to_model)
        return model, train_stats
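
The train() method above is driven entirely by the train_param dictionary. The sketch below lists every key the method actually reads; the concrete values are illustrative placeholders, and `model`/`data` are assumed to already exist (a torch.nn.Module and the data container whose X_*/y_* tensors the method uses).

import torch

train_param = {
    'model': model,                       # torch.nn.Module to train (assumed defined)
    'data': data,                         # data container used above (assumed defined)
    'num_batch': 10,                      # mini-batches per split
    'optimizer': torch.optim.Adam,        # optimizer class, instantiated inside train()
    'L2_reg': 1e-4,                       # weight decay passed to the optimizer
    'cyclical': {},                       # falsy disables CyclicLR; else kwargs for it
    'early_stopping': {'patience': 20},   # falsy disables early stopping
    'epochs': 500,
    'print_interval': 10,
    'break_loop_early': True,             # optional key
    'train_loss': ['mse'],                # project-specific loss tags for training
    'test_loss': ['mse'],                 # project-specific loss tags for the test loader
    'loss': {                             # unpacked as loss_param inside train()
        'lambda_e0': 1.0,
        'lambda_s': 1.0,
        'anneal_interval': 50,
        'anneal_factor': 0.9,
        'norm_wf': False,
        'noise': {},                      # or {'mode': ..., 'mean': ..., 'var': ..., 'decay': ...}
        'cyclical': {},                   # or {'mode': ..., 'mean': ..., 'amp': ..., 'period': ..., 'decay': ...}
    },
}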
예제 #26
0
File: train.py  Project: hungryjireh/prta
    }]

    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=hp.lr,
                         warmup=warmup_proportion,
                         t_total=num_train_optimization_steps)

    criterion = nn.CrossEntropyLoss(ignore_index=0)
    binary_criterion = nn.BCEWithLogitsLoss(
        pos_weight=torch.Tensor([3932 / 14263]))

    avg_train_losses = []
    avg_valid_losses = []

    # initialize the early_stopping object
    early_stopping = EarlyStopping(patience=hp.patience, verbose=True)

    for epoch in range(1, hp.n_epochs + 1):
        print("=========eval at epoch={epoch}=========")
        if not os.path.exists('checkpoints'): os.makedirs('checkpoints')
        if not os.path.exists('results'): os.makedirs('results')
        fname = os.path.join('checkpoints', timestr)
        spath = os.path.join('checkpoints', timestr + ".pt")

        train_loss = train(model, train_iter, optimizer, criterion,
                           binary_criterion)
        avg_train_losses.append(train_loss.item())

        precision, recall, f1, valid_loss = eval(model, eval_iter, fname,
                                                 criterion, binary_criterion)
        avg_valid_losses.append(valid_loss.item())
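
        # --- The original example is truncated here. A typical continuation ---
        # --- (a sketch, not the original code) would feed the validation    ---
        # --- loss to the early_stopping object created above:               ---
        early_stopping(valid_loss, model)
        if early_stopping.early_stop:
            print("Early stopping")
            break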
예제 #27
0
def fit(path, epochs=30):
    """
    Args:
        epochs: Number of training epochs. The BERT authors recommend between 2 and 4;
                the default here is higher, and early stopping (below) is used to stop
                training before the model over-fits.
    """

    train_dataloader, validation_dataloader = dataloader(path)
    model, optimizer = get_model()

    early_stopping = EarlyStopping()
    # Tell pytorch to run this model on the GPU.
    # model.cuda()
    model.cpu()
    # device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    device = torch.device("cpu")
    seed_val = 42

    random.seed(seed_val)
    np.random.seed(seed_val)
    torch.manual_seed(seed_val)
    torch.cuda.manual_seed_all(seed_val)

    # Measure the total training time for the whole run.
    total_t0 = time.time()

    # For each epoch...
    for epoch_i in range(0, epochs):

        # ========================================
        #               Training
        # ========================================

        # Perform one full pass over the training set.

        print("")
        print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
        print('Training...')

        # Measure how long the training epoch takes.
        t0 = time.time()

        # Reset the total loss for this epoch.
        total_train_loss = 0

        # Put the model into training mode. Don't be misled--the call to
        # `train` just changes the *mode*, it doesn't *perform* the training.
        # `dropout` and `batchnorm` layers behave differently during training
        # vs. test (source: https://stackoverflow.com/questions/51433378/what-does-model-train-do-in-pytorch)
        model.train()

        # For each batch of training data...
        for step, batch in enumerate(tqdm(train_dataloader)):

            # Progress update every 40 batches.
            if step % 40 == 0 and not step == 0:
                # Calculate elapsed time in minutes.
                elapsed = format_time(time.time() - t0)

                # Report progress.
                print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(
                    step, len(train_dataloader), elapsed))

            # Unpack this training batch from our dataloader.
            #
            # As we unpack the batch, we'll also copy each tensor to the target
            # device using the `to` method.
            #
            # `batch` contains three pytorch tensors:
            #   [0]: input ids
            #   [1]: attention masks
            #   [2]: labels
            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)

            # Always clear any previously calculated gradients before performing a
            # backward pass. PyTorch doesn't do this automatically because
            # accumulating the gradients is "convenient while training RNNs".
            # (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch)
            model.zero_grad()

            # Perform a forward pass (evaluate the model on this training batch).
            # The documentation for this `model` function is here:
            # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
            # It returns different numbers of values depending on what arguments
            # are given and what flags are set. For our usage here, it returns
            # the loss (because we provided labels) and the "logits"--the model
            # outputs prior to activation.
            loss, logits = model(b_input_ids,
                                 token_type_ids=None,
                                 attention_mask=b_input_mask,
                                 labels=b_labels)

            # Accumulate the training loss over all of the batches so that we can
            # calculate the average loss at the end. `loss` is a Tensor containing a
            # single value; the `.item()` function just returns the Python value
            # from the tensor.
            total_train_loss += loss.item()

            # Perform a backward pass to calculate the gradients.
            loss.backward()

            # Clip the norm of the gradients to 1.0.
            # This is to help prevent the "exploding gradients" problem.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            # Update parameters and take a step using the computed gradient.
            # The optimizer dictates the "update rule"--how the parameters are
            # modified based on their gradients, the learning rate, etc.
            optimizer.step()

        # Calculate the average loss over all of the batches.
        avg_train_loss = total_train_loss / len(train_dataloader)

        # Measure how long this epoch took.
        training_time = format_time(time.time() - t0)

        print("")
        print("  Average training loss: {0:.2f}".format(avg_train_loss))
        print("  Training epcoh took: {:}".format(training_time))

        # ========================================
        #               Validation
        # ========================================
        # After the completion of each training epoch, measure our performance on
        # our validation set.

        print("")
        print("Running Validation...")

        t0 = time.time()

        # Put the model in evaluation mode--the dropout layers behave differently
        # during evaluation.
        model.eval()

        # Tracking variables
        total_eval_accuracy = 0
        total_eval_loss = 0

        # Evaluate data for one epoch
        for batch in validation_dataloader:

            # Unpack this training batch from our dataloader.
            #
            # As we unpack the batch, we'll also copy each tensor to the target
            # device using the `to` method.
            #
            # `batch` contains three pytorch tensors:
            #   [0]: input ids
            #   [1]: attention masks
            #   [2]: labels
            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)

            # Tell pytorch not to bother with constructing the compute graph during
            # the forward pass, since this is only needed for backprop (training).
            with torch.no_grad():

                # Forward pass, calculate logit predictions.
                # token_type_ids is the same as the "segment ids", which
                # differentiates sentence 1 and 2 in 2-sentence tasks.
                # The documentation for this `model` function is here:
                # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
                # Get the "logits" output by the model. The "logits" are the output
                # values prior to applying an activation function like the softmax.
                (loss, logits) = model(b_input_ids,
                                       token_type_ids=None,
                                       attention_mask=b_input_mask,
                                       labels=b_labels)

            # Accumulate the validation loss.
            total_eval_loss += loss.item()

            # Move logits and labels to CPU
            logits = logits.detach().cpu().numpy()
            label_ids = b_labels.to('cpu').numpy()

            # Calculate the Jaccard score for this batch of validation sentences,
            # and accumulate it over all batches.
            total_eval_accuracy += jaccard(logits, label_ids)

        # Report the final accuracy for this validation run.
        avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
        print("  Accuracy: {0:.2f}".format(avg_val_accuracy))

        # Calculate the average loss over all of the batches.
        avg_val_loss = total_eval_loss / len(validation_dataloader)

        # Measure how long the validation run took.
        validation_time = format_time(time.time() - t0)

        print("  Validation Loss: {0:.2f}".format(avg_val_loss))
        print("  Validation took: {:}".format(validation_time))

        # Add early stopping
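        # NOTE: avg_val_accuracy is a score to maximize; if the EarlyStopping
        # implementation in use tracks a quantity to minimize (as loss-based
        # ones typically do), avg_val_loss or -avg_val_accuracy should be
        # passed here instead.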
        early_stopping(avg_val_accuracy, model)
        if early_stopping.early_stop:
            print("Early stopping")
            break

    print("")
    print("Training complete!")

    print("Total training took {:} (h:mm:ss)".format(
        format_time(time.time() - total_t0)))
예제 #28
0
# gpu_used = int(get_free_gpu())
model = torchfcn.models.AutoEncoderConv3().cuda()
model.apply(weight_init)

summary(model, input_size=(1, 256, 256))
model = nn.DataParallel(model)
if args.loss == "MSE":
    criterion = nn.MSELoss()
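# NOTE: `criterion` is only defined for the "MSE" case above; any other value of
# args.loss would need its own branch before training starts.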

if args.optimiser == "Adam":
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
else:
    optimizer = torch.optim.RMSprop(model.parameters(), lr=args.lr)

early_stopping = EarlyStopping(patience=4)


def train(epoch):
    model.train()
    train_loss = 0
    count = 0
    for i, (_, img) in tqdm.tqdm(enumerate(train_loader),
                                 total=len(train_loader),
                                 desc='Train epoch=%d' % epoch,
                                 ncols=80,
                                 leave=False):

        img = img.float()
        img = img[:, np.newaxis, :, :]
        img = Variable(img.cuda())
예제 #29
0
        print("Using", torch.cuda.device_count(), "NVIDIA 1080TI GPUs!")

if GPU_SELECT == 1:
    device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")
    print("Using one (the second) NVIDIA 1080TI GPU!")

if GPU_SELECT == 0:
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print("Using one (the first) NVIDIA 1080TI GPU!")

# In[2]:

from early_stopping import EarlyStopping
from dataset3 import dataset

early_stopping = EarlyStopping(
    patience=patience, verbose=True)  # initialize the early_stopping object

# Counter for the execution time
start = torch.cuda.Event(enable_timing=True)
end = torch.cuda.Event(enable_timing=True)

start.record()

if OPTIMIZATION_PLUGIN == 'Bayesian':
    from bayes_opt import BayesianOptimization

    #def black_box_function(x, y):
    def objective(SCI_RELU, SCI_BIAS, SCI_loss_type, SCI_optimizer, SCI_LR,
                  SCI_MM, SCI_REGULARIZATION, SCI_EPOCHS, SCI_BATCH_SIZE,
                  SCI_DROPOUT, SCI_L_SECOND, SCI_BN_MOMENTUM, SCI_SGD_MOMENTUM,
                  SCI_BN_EPS, SCI_BN_STATS, SCI_LAST_LAYER, SCI_ACT_LAYER):
예제 #30
0
def train_pytorch(**kwargs):
    CHECKPOINT_PATH.mkdir(parents=True, exist_ok=True)

    # Calling logging.basicConfig attaches a root logger to the process, so that
    # log records from loggers in other modules also show up on the console
    # (child loggers propagate to the root logger, which prints via its built-in
    # StreamHandler). Without logging.basicConfig, every child logger would need
    # its own StreamHandler, which is tedious.
    logging.basicConfig(
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        level=logging.INFO)
    formater = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    # Print logs to the terminal.
    # stream_handler = logging.StreamHandler()
    # stream_handler.setFormatter(formater)
    # # Save logs to file.
    log_path = CHECKPOINT_PATH / 'train.log'
    file_handler = logging.FileHandler(filename=log_path,
                                       mode='w',
                                       encoding='utf-8')
    file_handler.setFormatter(formater)

    # logger.addHandler(stream_handler)
    logger.addHandler(file_handler)

    inputs = kwargs['inputs']
    outputs = kwargs['outputs']
    # test_inputs = kwargs['test_inputs']

    gkf = GroupKFold(n_splits=kwargs['n_splits']).split(X=df_train.q2,
                                                        groups=df_train.id)

    # sss = StratifiedShuffleSplit(n_splits=kwargs['n_splits'], test_size=0.2, random_state=RANDOM_SEED).split(X=df_train.q2,
    # y=df_train.label)
    # skf = StratifiedKFold(n_splits=kwargs['n_splits'], shuffle=True, random_state=RANDOM_SEED).split(X=df_train.q2, y=outputs)

    # oof = np.zeros((len(df_train),1))
    # all_pred = np.zeros(shape=(len(df_train), 2))     # classification task
    all_pred = np.zeros(shape=(len(df_train)))  # regression task
    all_true = np.zeros(shape=(len(df_train)))
    for fold, (train_idx, valid_idx) in enumerate(gkf):
        # for fold, (train_idx, valid_idx) in enumerate(skf):
        logger.info(f'Fold No. {fold}')
        train_inputs = [inputs[i][train_idx] for i in range(len(inputs))]
        train_outputs = outputs[train_idx]

        train_qa_id = df_train[['id', 'id_sub', 'label']].iloc[train_idx]

        # ===============================================================
        # Data augmentation via back-translation (positive samples only)
        # Get the (id, id_sub) pairs of the training-set samples
        # train_id_set = set([f'{x[0]},{x[1]}' for x in df_train.iloc[train_idx][['id', 'id_sub']].to_numpy()])
        # # Pick out the augmented samples that also appear in the training set
        # mask = df_train_ex[['id', 'id_sub']].apply(lambda x: f'{x["id"]},{x["id_sub"]}' in train_id_set, axis=1)
        # df_train_fold = df_train_ex[mask]

        # Get the (id, id_sub) pairs of the training-set samples
        # train_id_set = set([f'{x[0]},{x[1]}' for x in df_train.iloc[train_idx][['id', 'id_sub']].to_numpy()])
        # # Pick out the augmented samples that also appear in the training set
        # mask = df_train_aug[['id', 'id_sub']].apply(lambda x: f'{x["id"]},{x["id_sub"]}' in train_id_set, axis=1)
        # df_train_fold = df_train_aug[mask]
        # train_inputs, train_inputs_overlap = compute_input_arrays(df_train_fold, input_categories, tokenizer, MAX_SEQUENCE_LENGTH)
        # train_outputs = compute_output_arrays(df_train_fold, output_categories)

        # df_train_fold = df_train.iloc[train_idx]
        # train_q_aug = []
        # for x in tqdm(df_train_fold['q1']):
        # train_q_aug.append(eda_one(x))
        # train_a_aug = []
        # for x in tqdm(df_train_fold['q2']):
        # train_a_aug.append(eda_one(x))
        # df_train_fold = pd.DataFrame(data={'q1': train_q_aug, 'q2': train_a_aug})

        # train_inputs, train_inputs_overlap = compute_input_arrays(df_train_fold, input_categories, tokenizer, MAX_SEQUENCE_LENGTH)
        # train_outputs = compute_output_arrays(df_train_fold, output_categories)

        # Add the Anjuke data to the training set
        # train_inputs = [np.concatenate([train_inputs[i], anjuke_inputs[i]], axis=0) for i in range(len(inputs))]
        # train_outputs = np.concatenate([train_outputs, anjuke_outputs], axis=0)
        # ================================================================

        valid_inputs = [inputs[i][valid_idx] for i in range(len(inputs))]
        valid_outputs = outputs[valid_idx]
        valid_qa_id = df_train[['id', 'id_sub', 'label']].iloc[valid_idx]

        train_set = HouseDataset(train_inputs, train_outputs, train_qa_id)
        valid_set = HouseDataset(valid_inputs, valid_outputs, valid_qa_id)
        # test_set = HouseDataset(test_inputs, np.zeros_like(test_inputs[0]))  # the test set has no labels

        logger.info('Train set size: {}, valid set size {}'.format(
            len(train_set), len(valid_set)))

        train_loader = DataLoader(
            train_set,
            batch_size=kwargs['batch_size'],
            #   shuffle=True  # set to True when training as a classification task
        )

        valid_loader = DataLoader(valid_set,
                                  batch_size=kwargs['valid_batch_size'])

        # test_loader = DataLoader(test_set,
        # batch_size=512)

        device = torch.device(f"cuda:{kwargs['device']}")
        # model = BertForHouseQA().cuda(device)
        model = torch.nn.DataParallel(BertForHouseQA(),
                                      device_ids=[1, 2, 3]).cuda(device)

        # Find the checkpoint file with the highest score and load it
        # best_score_ = max([float(x.name[len(MODEL_NAME)+1:-3]) for x in CHECKPOINT_PATH.iterdir() if x.is_file()])
        # best_ckpt_path = CHECKPOINT_PATH/f'{MODEL_NAME}_{best_score_}.pt'
        # ckpt = torch.load(best_ckpt_path)
        # model.load_state_dict(ckpt['model_state_dict'])

        # Load a point-wise model and continue training it pair-wise,
        # or load the Anjuke model
        # =====================================================
        # org_model = BertForHouseQA().cuda(device)
        # time_str = '2020-11-18-12:49:44'
        # org_ckpt_path = DATA_PATH / f"model_record/{MODEL_NAME}/{time_str}"
        # org_ckpt_path = DATA_PATH / f'anjuke/model_record/{MODEL_NAME}/{time_str}'
        # org_ckpt_paths = [x for x in org_ckpt_path.iterdir() if x.is_file() and x.suffix == '.pt']
        # prefix = f'{MODEL_NAME}_'
        # best_ckpt_path = [x for x in org_ckpt_paths if str(x.name).startswith(prefix)][0]
        # ckpt = torch.load(best_ckpt_path)
        # org_model.load_state_dict(ckpt['model_state_dict'])

        # model = BertClsToReg(org_model).cuda(device)
        # model = BertClsToCls(org_model).cuda(device)
        # =====================================================

        # List all modules inside the model.
        logger.info('Model modules:')
        for i, m in enumerate(model.named_children()):
            logger.info('{} -> {}'.format(i, m))

        # # Get the number of total parameters.
        # total_params = sum(p.numel() for p in model.parameters())
        # trainable_params = sum(p.numel()
        #                     for p in model.parameters() if p.requires_grad)

        # logger.info("Total params: {:,}".format(total_params))
        # logger.info("Trainable params: {:,}".format(trainable_params))

        # Use hinge loss (MarginRankingLoss)
        criterion = torch.nn.MarginRankingLoss(margin=1.0)
        # criterion = torch.nn.MSELoss()
        # criterion = torch.nn.CrossEntropyLoss()
        # criterion_scl = SupConLoss(temperature=0.1, device=device)

        # optimizer = torch.optim.Adam(
        # model.parameters(), lr=kwargs['lr'], weight_decay=kwargs['weight_decay'])
        optimizer = transformers.AdamW(model.parameters(),
                                       lr=kwargs['lr'],
                                       weight_decay=kwargs['weight_decay'])
        logger.info('Optimizer:')
        logger.info(optimizer)
        # scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
        #    mode='min',
        #    patience=int(kwargs['patience']/2),
        #    verbose=True
        #    )
        scheduler = transformers.get_cosine_schedule_with_warmup(
            optimizer, num_warmup_steps=4, num_training_steps=kwargs['epoch'])
        # best_score = 0.0
        stopper = EarlyStopping(patience=kwargs['patience'], mode='max')
        ckpt_path = None
        for epoch in range(kwargs['epoch']):
            # =======================Training===========================
            # Set model to train mode.
            model.train()
            steps = int(np.ceil(len(train_set) / kwargs['batch_size']))
            pbar = tqdm(desc='Epoch {}, loss {}'.format(epoch, 'NAN'),
                        total=steps)
            for i, sample in enumerate(train_loader):
                x, y = sample[0].cuda(device).long(), sample[1].cuda(
                    device).long()
                optimizer.zero_grad()

                feat, model_outputs = model(x)  # [batch_size, 2]
                # CrossEntropy
                # loss = criterion(model_outputs, y)
                # MSE
                # loss = criterion(model_outputs, y.float().unsqueeze(-1))

                # Use hinge loss
                train_qa_id_sub = sample[2].cpu().detach().numpy()
                loss = get_hinge_loss(model_outputs, train_qa_id_sub,
                                      criterion)

                # Use SCL (supervised contrastive loss)
                # feat = F.normalize(feat, dim=-1).unsqueeze(1)
                # scl = criterion_scl(feat, y)
                # scl_weight = 0.3
                # loss = (1-scl_weight)*loss + scl_weight*scl
                # loss += scl

                loss.backward()
                optimizer.step()
                pbar.set_description('Epoch {}, train loss {:.4f}'.format(
                    epoch, loss.item()))
                pbar.update()
            pbar.close()
            # =========================================================
            # =======================Validation========================
            # Set model to evaluation mode.
            model.eval()
            with torch.no_grad():
                # Validation step
                valid_loss = []
                valid_pred = []
                valid_true = []
                steps = int(
                    np.ceil(len(valid_set) / kwargs['valid_batch_size']))
                pbar = tqdm(desc='Validating', total=steps)
                for i, sample in enumerate(valid_loader):
                    y_true_local = sample[1].numpy()
                    x, y_true = sample[0].cuda(device).long(), sample[1].cuda(
                        device).long()

                    feat, model_outputs = model(x)
                    # MSELoss
                    # loss = criterion(model_outputs, y_true.float().unsqueeze(-1)).cpu().detach().item()
                    # HingeLoss
                    valid_qa_id_sub = sample[2].cpu().detach().numpy()
                    loss = get_hinge_loss(model_outputs, valid_qa_id_sub,
                                          criterion).cpu().detach().item()
                    y_pred = model_outputs.cpu().detach().squeeze(-1).numpy()
                    # CrossEntropy
                    # loss = criterion(
                    # model_outputs, y_true).cpu().detach().item()
                    # y_pred = F.softmax(
                    # model_outputs.cpu().detach(), dim=1).numpy()

                    valid_loss.append(loss)
                    valid_pred.append(y_pred)
                    valid_true.append(y_true_local)
                    pbar.update()
            pbar.close()
            valid_loss = np.asarray(valid_loss).mean()
            valid_pred = np.concatenate(valid_pred, axis=0)
            valid_true = np.concatenate(valid_true, axis=0)

            # If using the regression formulation
            valid_f1, thr = search_f1(valid_true, valid_pred)
            logger.info("Epoch {}, valid loss {:.5f}, valid f1 {:.4f}".format(
                epoch, valid_loss, valid_f1))

            # If using the classification formulation
            # valid_pred_label = np.argmax(valid_pred, axis=1)
            # valid_auc = roc_auc_score(valid_true, valid_pred_label)
            # valid_p, valid_r, valid_f1, _ = precision_recall_fscore_support(
            # valid_true, valid_pred_label, average='binary')

            # logger.info(
            # "Epoch {}, valid loss {:.5f}, valid P {:.4f}, valid R {:.4f}, valid f1 {:.4f}, valid auc {:.4f}".format(
            # epoch, valid_loss, valid_p, valid_r, valid_f1, valid_auc)
            # )
            # logger.info('Confusion Matrix: ')
            # logger.info(confusion_matrix(y_true=valid_true,
            # y_pred=valid_pred_label, normalize='all'))

            # Step the cosine warm-up LR schedule. (The commented-out
            # ReduceLROnPlateau scheduler above is the variant that would take
            # valid_f1 as an argument.)
            scheduler.step()
            stop_flag, best_flag = stopper.step(valid_f1)
            if best_flag:
                # Remove the previously saved checkpoint
                if ckpt_path is not None:
                    ckpt_path.unlink()
                ckpt_path = CHECKPOINT_PATH / \
                    f"{MODEL_NAME}_{fold}_{epoch}_{stopper.best_score}.pt"
                # Save the best model so far
                torch.save(
                    {
                        "model_name": "BertForHouseQA",
                        "epoch": epoch,
                        "valid_loss": valid_loss,
                        "valid_f1": valid_f1,
                        "model_state_dict": model.state_dict(),
                        "train_idx": train_idx,
                        "valid_idx": valid_idx,
                        "fold": fold,
                        # "optimizer_state_dict": optimizer.state_dict(),
                        "thr": thr
                        # 'scheduler_state_dict': scheduler.state_dict()
                    },
                    f=ckpt_path,
                )
                logger.info("A best score! Saved to checkpoints.")
                # Save this fold's validation predictions for the final F1 evaluation over the whole training set
                all_pred[valid_idx] = valid_pred
                all_true[valid_idx] = valid_true
            if stop_flag:
                logger.info("Stop training due to early stopping.")
                # Stop training
                break
            # Save this fold's validation predictions for the final F1 evaluation over the whole training set
            # oof[valid_idx] = valid_pred
            # valid_f1, _ = search_f1(valid_outputs, valid_pred)  # find the best classification threshold and F1 score
            # print('Valid f1 score = ', valid_f1)
            # ==========================================================

    # After all folds, evaluate over the whole training set
    # CrossEntropy
    # all_pred = np.argmax(all_pred, axis=1)
    # all_auc = roc_auc_score(all_true, all_pred)
    # all_p, all_r, all_f1, _ = precision_recall_fscore_support(
    # all_true, all_pred, average='binary')
    # logger.info(
    # "all P {:.4f}, all R {:.4f}, all f1 {:.4f}, all auc {:.4f}".format(
    # all_p, all_r, all_f1, all_auc)
    # )
    # logger.info('Confusion Matrix: ')
    # logger.info(confusion_matrix(y_true=all_true,
    #  y_pred=all_pred, normalize='all'))
    # MSELoss
    all_f1, all_thr = search_f1(all_true, all_pred)
    logger.info("All f1 {:.4f}, all thr {:.4f}".format(all_f1, all_thr))
    return all_f1, CHECKPOINT_PATH
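
The stopper in this last example uses a step()-style interface (stopper.step(score) returns (stop_flag, best_flag), and the object exposes best_score), rather than the callable style seen in the earlier examples. A minimal sketch of a compatible class, with all names and defaults assumed rather than taken from the project:

class EarlyStopping:
    """Step-style early stopping: step() returns (stop_flag, best_flag)."""

    def __init__(self, patience=5, mode='max'):
        self.patience = patience
        self.mode = mode
        self.best_score = None
        self.counter = 0

    def step(self, score):
        # An improvement resets the patience counter and reports best_flag=True.
        improved = (
            self.best_score is None
            or (self.mode == 'max' and score > self.best_score)
            or (self.mode == 'min' and score < self.best_score)
        )
        if improved:
            self.best_score = score
            self.counter = 0
            return False, True
        self.counter += 1
        return self.counter >= self.patience, False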