Example #1
def train(loader, model, epochs=5, batch_size=2, show_loss=False, augmenter=None, lr=None, init_lr=2e-4,
          saver=None, variables_to_optimize=None, evaluation=True, name_best_model='/root/Ev-SegNet-old/weights/model/best',
          preprocess_mode=None):
    training_samples = len(loader.image_train_list)
    steps_per_epoch = int((training_samples / batch_size) + 1)
    best_miou = 0

    for epoch in range(epochs):  # for each epoch
        lr_decay(lr, init_lr, 1e-9, epoch, epochs - 1)  # compute the new lr
        print('epoch: ' + str(epoch) + '. Learning rate: ' + str(lr.numpy()))
        for step in range(steps_per_epoch):  # for every batch
            with tf.GradientTape() as g:
                # get batch
                # print("process:%.2f"%(step/steps_per_epoch), "\t", step, "/", steps_per_epoch)
                x, y, mask = loader.get_batch(size=batch_size, train=True, augmenter=augmenter)

                x = preprocess(x, mode=preprocess_mode)
                [x, y, mask] = convert_to_tensors([x, y, mask])

                y_, aux_y_ = model(x, training=True, aux_loss=True)  # get output of the model

                loss = tf.losses.softmax_cross_entropy(y, y_, weights=mask)  # main segmentation loss
                loss_aux = tf.losses.softmax_cross_entropy(y, aux_y_, weights=mask)  # auxiliary (deep-supervision) loss
                loss = loss + 0.8 * loss_aux  # weighted sum of main and auxiliary losses
                if show_loss: print('Training loss: ' + str(loss.numpy()))

            # Gets gradients and applies them
            grads = g.gradient(loss, variables_to_optimize)
            optimizer.apply_gradients(zip(grads, variables_to_optimize))

        if evaluation:
            # get metrics
            # train_acc, train_miou = get_metrics(loader, model, loader.n_classes, train=True, preprocess_mode=preprocess_mode)
            test_acc, test_miou = get_metrics(loader, model, loader.n_classes, train=False, flip_inference=False,
                                              scales=[1], preprocess_mode=preprocess_mode)

            # print('Train accuracy: ' + str(train_acc.numpy()))
            # print('Train miou: ' + str(train_miou))
            print('Test accuracy: ' + str(test_acc.numpy()))
            print('Test miou: ' + str(test_miou))
            print('Best miou: ' + str(best_miou))
            print('')

            # save model if best
            if test_miou > best_miou:
                best_miou = test_miou
                saver.save(name_best_model)
        else:
            saver.save(name_best_model)

        loader.suffle_segmentation()  # shuffle the training set
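
The loop above relies on a module-level `lr_decay` helper that writes a new value into the `lr` variable each epoch, and on a module-level `optimizer` that reads it. The helper is not shown here; a minimal sketch, assuming polynomial decay from `init_lr` down to `end_lr` over the run (the `power=0.9` exponent is an assumption):

def lr_decay(lr, init_lr, end_lr, epoch, total_epochs, power=0.9):
    # Polynomially anneal from init_lr to end_lr and write the result
    # into the tf.Variable `lr` that the optimizer reads every step.
    new_lr = (init_lr - end_lr) * (1 - epoch / total_epochs) ** power + end_lr
    lr.assign(new_lr)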
Example #2
def train(data_loader,
          model_pos,
          criterion,
          optimizer,
          device,
          lr_init,
          lr_now,
          step,
          decay,
          gamma,
          max_norm=True):
    batch_time = AverageMeter()
    data_time = AverageMeter()
    epoch_loss_3d_pos = AverageMeter()

    # Switch to train mode
    torch.set_grad_enabled(True)
    model_pos.train()
    end = time.time()

    bar = Bar('Train', max=len(data_loader))
    for i, (targets_3d, inputs_2d, _, _) in enumerate(data_loader):
        # Measure data loading time
        data_time.update(time.time() - end)
        num_poses = targets_3d.size(0)

        step += 1
        if step % decay == 0 or step == 1:
            lr_now = lr_decay(optimizer, step, lr_init, decay, gamma)

        targets_3d, inputs_2d = targets_3d.to(device), inputs_2d.to(device)
        targets_3d = targets_3d[:, :, :] - targets_3d[:, :1, :]  # make outputs relative to the root (0th) joint

        outputs_3d = model_pos(inputs_2d)

        optimizer.zero_grad()
        loss_3d_pos = criterion(outputs_3d, targets_3d)
        loss_3d_pos.backward()
        if max_norm:
            nn.utils.clip_grad_norm_(model_pos.parameters(), max_norm=1)
        optimizer.step()

        epoch_loss_3d_pos.update(loss_3d_pos.item(), num_poses)

        # Measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        bar.suffix = '({batch}/{size}) Data: {data:.6f}s | Batch: {bt:.3f}s | Total: {ttl:} | ETA: {eta:} ' \
                     '| Loss: {loss: .4f}' \
            .format(batch=i + 1, size=len(data_loader), data=data_time.avg, bt=batch_time.avg,
                    ttl=bar.elapsed_td, eta=bar.eta_td, loss=epoch_loss_3d_pos.avg)
        bar.next()

    bar.finish()
    return epoch_loss_3d_pos.avg, lr_now, step
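
This loop decays the learning rate on a global-step schedule through `lr_decay(optimizer, step, lr_init, decay, gamma)`. The helper is defined elsewhere; a minimal sketch, assuming exponential decay keyed on the step count:

def lr_decay(optimizer, step, lr_init, decay_step, gamma):
    # Exponential schedule: lr = lr_init * gamma ** (step / decay_step).
    # Push the new rate into every parameter group and return it so the
    # caller can log it.
    lr_now = lr_init * gamma ** (step / decay_step)
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr_now
    return lr_now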
Example #3
def main(opt):
    start_epoch = 0
    err_best = 10000
    lr_now = opt.lr
    is_cuda = torch.cuda.is_available()

    script_name = os.path.basename(__file__).split('.')[0]
    script_name = script_name + '_in{:d}_out{:d}_dctn_{:d}'.format(
        opt.input_n, opt.output_n, opt.dct_n)

    # create model
    print(">>> creating model")
    input_n = opt.input_n
    output_n = opt.output_n
    dct_n = opt.dct_n

    model = nnmodel.GCN(input_feature=dct_n,
                        hidden_feature=opt.linear_size,
                        p_dropout=opt.dropout,
                        num_stage=opt.num_stage,
                        node_n=69)

    if is_cuda:
        model.cuda()

    print(">>> total params: {:.2f}M".format(
        sum(p.numel() for p in model.parameters()) / 1000000.0))
    optimizer = torch.optim.Adam(model.parameters(), lr=opt.lr)
    if opt.is_load:
        model_path_len = 'checkpoint/test/ckpt_main_last.pth.tar'
        print(">>> loading ckpt len from '{}'".format(model_path_len))
        if is_cuda:
            ckpt = torch.load(model_path_len)
        else:
            ckpt = torch.load(model_path_len, map_location='cpu')
        start_epoch = ckpt['epoch']
        err_best = ckpt['err']
        lr_now = ckpt['lr']
        model.load_state_dict(ckpt['state_dict'])
        optimizer.load_state_dict(ckpt['optimizer'])
        print(">>> ckpt len loaded (epoch: {} | err: {})".format(
            start_epoch, err_best))

    # data loading
    print(">>> loading data")
    train_dataset = Pose3dPW(path_to_data=opt.data_dir_3dpw,
                             input_n=input_n,
                             output_n=output_n,
                             dct_n=dct_n,
                             split=0)
    dim_used = train_dataset.dim_used
    test_dataset = Pose3dPW(path_to_data=opt.data_dir_3dpw,
                            input_n=input_n,
                            output_n=output_n,
                            dct_n=dct_n,
                            split=1)
    val_dataset = Pose3dPW(path_to_data=opt.data_dir_3dpw,
                           input_n=input_n,
                           output_n=output_n,
                           dct_n=dct_n,
                           split=2)

    # load datasets for training
    train_loader = DataLoader(dataset=train_dataset,
                              batch_size=opt.train_batch,
                              shuffle=True,
                              num_workers=opt.job,
                              pin_memory=True)
    test_loader = DataLoader(dataset=test_dataset,
                             batch_size=opt.test_batch,
                             shuffle=False,
                             num_workers=opt.job,
                             pin_memory=True)
    val_loader = DataLoader(dataset=val_dataset,
                            batch_size=opt.test_batch,
                            shuffle=False,
                            num_workers=opt.job,
                            pin_memory=True)
    print(">>> data loaded !")
    print(">>> train data {}".format(train_dataset.__len__()))
    print(">>> test data {}".format(test_dataset.__len__()))
    print(">>> validation data {}".format(val_dataset.__len__()))

    for epoch in range(start_epoch, opt.epochs):

        if (epoch + 1) % opt.lr_decay == 0:
            lr_now = utils.lr_decay(optimizer, lr_now, opt.lr_gamma)
        print('==========================')
        print('>>> epoch: {} | lr: {:.5f}'.format(epoch + 1, lr_now))
        ret_log = np.array([epoch + 1])
        head = np.array(['epoch'])
        # per epoch
        lr_now, t_l, t_err = train(train_loader,
                                   model,
                                   optimizer,
                                   input_n=input_n,
                                   dct_n=dct_n,
                                   dim_used=dim_used,
                                   lr_now=lr_now,
                                   max_norm=opt.max_norm,
                                   is_cuda=is_cuda)
        ret_log = np.append(ret_log, [lr_now, t_l, t_err])
        head = np.append(head, ['lr', 't_l', 't_err'])

        v_err = val(val_loader,
                    model,
                    input_n=input_n,
                    dct_n=dct_n,
                    dim_used=dim_used,
                    is_cuda=is_cuda)

        ret_log = np.append(ret_log, v_err)
        head = np.append(head, ['v_err'])

        test_3d = test(test_loader,
                       model,
                       input_n=input_n,
                       output_n=output_n,
                       dct_n=dct_n,
                       dim_used=dim_used,
                       is_cuda=is_cuda)
        # ret_log = np.append(ret_log, test_l)
        ret_log = np.append(ret_log, test_3d)
        if output_n == 15:
            head = np.append(head,
                             ['1003d', '2003d', '3003d', '4003d', '5003d'])
        elif output_n == 30:
            head = np.append(head, [
                '1003d', '2003d', '3003d', '4003d', '5003d', '6003d', '7003d',
                '8003d', '9003d', '10003d'
            ])

        # update log file
        df = pd.DataFrame(np.expand_dims(ret_log, axis=0))
        if epoch == start_epoch:
            df.to_csv(opt.ckpt + '/' + script_name + '.csv',
                      header=head,
                      index=False)
        else:
            with open(opt.ckpt + '/' + script_name + '.csv', 'a') as f:
                df.to_csv(f, header=False, index=False)
        # save ckpt
        is_best = v_err < err_best
        err_best = min(v_err, err_best)
        file_name = [
            'ckpt_' + script_name + '_best.pth.tar',
            'ckpt_' + script_name + '_last.pth.tar'
        ]
        utils.save_ckpt(
            {
                'epoch': epoch + 1,
                'lr': lr_now,
                'err': test_3d[0],
                'state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict()
            },
            ckpt_path=opt.ckpt,
            is_best=is_best,
            file_name=file_name)
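
Here `utils.lr_decay(optimizer, lr_now, opt.lr_gamma)` runs once every `opt.lr_decay` epochs rather than per step. A minimal sketch of that epoch-based variant, assuming a plain multiplicative decay:

def lr_decay(optimizer, lr_now, gamma):
    # Multiply the current learning rate by gamma and push it into
    # every parameter group.
    lr_now *= gamma
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr_now
    return lr_now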
Example #4
def main(opt):
    start_epoch = 0
    err_best = 10000
    lr_now = opt.lr
    is_cuda = torch.cuda.is_available()

    # save option in log
    script_name = os.path.basename(__file__).split('.')[0]
    script_name = script_name + '_3D_in{:d}_out{:d}_dct_n_{:d}'.format(
        opt.input_n, opt.output_n, opt.dct_n)

    # create model
    print(">>> creating model")
    input_n = opt.input_n
    output_n = opt.output_n
    dct_n = opt.dct_n
    sample_rate = opt.sample_rate

    model = nnmodel.GCN(input_feature=dct_n,
                        hidden_feature=opt.linear_size,
                        p_dropout=opt.dropout,
                        num_stage=opt.num_stage,
                        node_n=66)

    if is_cuda:
        model.cuda()

    print(">>> total params: {:.2f}M".format(
        sum(p.numel() for p in model.parameters()) / 1000000.0))
    optimizer = torch.optim.Adam(model.parameters(), lr=opt.lr)
    if opt.is_load:
        model_path_len = 'checkpoint/test/' + 'ckpt_' + script_name + '_last.pth.tar'
        print(">>> loading ckpt len from '{}'".format(model_path_len))
        if is_cuda:
            ckpt = torch.load(model_path_len)
        else:
            ckpt = torch.load(model_path_len, map_location='cpu')
        start_epoch = ckpt['epoch']
        err_best = ckpt['err']
        lr_now = ckpt['lr']
        model.load_state_dict(ckpt['state_dict'])
        optimizer.load_state_dict(ckpt['optimizer'])
        print(">>> ckpt len loaded (epoch: {} | err: {})".format(
            start_epoch, err_best))

    # data loading
    print(">>> loading data")
    train_dataset = H36motion3D(path_to_data=opt.data_dir,
                                actions='all',
                                input_n=input_n,
                                output_n=output_n,
                                split=0,
                                dct_used=dct_n,
                                sample_rate=sample_rate)

    acts = data_utils.define_actions('all')
    test_data = dict()
    for act in acts:
        test_dataset = H36motion3D(path_to_data=opt.data_dir,
                                   actions=act,
                                   input_n=input_n,
                                   output_n=output_n,
                                   split=1,
                                   sample_rate=sample_rate,
                                   dct_used=dct_n)
        test_data[act] = DataLoader(dataset=test_dataset,
                                    batch_size=opt.test_batch,
                                    shuffle=False,
                                    num_workers=opt.job,
                                    pin_memory=True)
    val_dataset = H36motion3D(path_to_data=opt.data_dir,
                              actions='all',
                              input_n=input_n,
                              output_n=output_n,
                              split=2,
                              dct_used=dct_n,
                              sample_rate=sample_rate)

    # load datasets for training
    train_loader = DataLoader(dataset=train_dataset,
                              batch_size=opt.train_batch,
                              shuffle=True,
                              num_workers=opt.job,
                              pin_memory=True)
    val_loader = DataLoader(dataset=val_dataset,
                            batch_size=opt.test_batch,
                            shuffle=False,
                            num_workers=opt.job,
                            pin_memory=True)
    print(">>> data loaded !")
    print(">>> train data {}".format(train_dataset.__len__()))
    print(">>> test data {}".format(test_dataset.__len__()))
    print(">>> validation data {}".format(val_dataset.__len__()))

    for epoch in range(start_epoch, opt.epochs):

        if (epoch + 1) % opt.lr_decay == 0:
            lr_now = utils.lr_decay(optimizer, lr_now, opt.lr_gamma)

        print('==========================')
        print('>>> epoch: {} | lr: {:.5f}'.format(epoch + 1, lr_now))
        ret_log = np.array([epoch + 1])
        head = np.array(['epoch'])
        # per epoch
        lr_now, t_l = train(train_loader,
                            model,
                            optimizer,
                            lr_now=lr_now,
                            max_norm=opt.max_norm,
                            is_cuda=is_cuda,
                            dim_used=train_dataset.dim_used,
                            dct_n=dct_n)
        ret_log = np.append(ret_log, [lr_now, t_l])
        head = np.append(head, ['lr', 't_l'])

        v_3d = val(val_loader,
                   model,
                   is_cuda=is_cuda,
                   dim_used=train_dataset.dim_used,
                   dct_n=dct_n)

        ret_log = np.append(ret_log, [v_3d])
        head = np.append(head, ['v_3d'])

        test_3d_temp = np.array([])
        test_3d_head = np.array([])
        for act in acts:
            test_l, test_3d = test(test_data[act],
                                   model,
                                   input_n=input_n,
                                   output_n=output_n,
                                   is_cuda=is_cuda,
                                   dim_used=train_dataset.dim_used,
                                   dct_n=dct_n)
            # ret_log = np.append(ret_log, test_l)
            ret_log = np.append(ret_log, test_3d)
            head = np.append(
                head,
                [act + '3d80', act + '3d160', act + '3d320', act + '3d400'])
            if output_n > 10:
                head = np.append(head, [act + '3d560', act + '3d1000'])
        ret_log = np.append(ret_log, test_3d_temp)
        head = np.append(head, test_3d_head)

        # update log file and save checkpoint
        df = pd.DataFrame(np.expand_dims(ret_log, axis=0))
        if epoch == start_epoch:
            df.to_csv(opt.ckpt + '/' + script_name + '.csv',
                      header=head,
                      index=False)
        else:
            with open(opt.ckpt + '/' + script_name + '.csv', 'a') as f:
                df.to_csv(f, header=False, index=False)
        if not np.isnan(v_3d):
            is_best = v_3d < err_best
            err_best = min(v_3d, err_best)
        else:
            is_best = False
        file_name = [
            'ckpt_' + script_name + '_best.pth.tar',
            'ckpt_' + script_name + '_last.pth.tar'
        ]
        utils.save_ckpt(
            {
                'epoch': epoch + 1,
                'lr': lr_now,
                'err': test_3d[0],
                'state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict()
            },
            ckpt_path=opt.ckpt,
            is_best=is_best,
            file_name=file_name)
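
`utils.save_ckpt` receives the training state plus a `[best, last]` pair of file names. Its body is not shown; a minimal sketch, assuming it always refreshes the "last" checkpoint and duplicates it as "best" when `is_best` is set:

import os
import torch

def save_ckpt(state, ckpt_path, is_best=True,
              file_name=('ckpt_best.pth.tar', 'ckpt_last.pth.tar')):
    # Always write the "last" checkpoint; additionally write the "best"
    # checkpoint when the validation error improved this epoch.
    torch.save(state, os.path.join(ckpt_path, file_name[1]))
    if is_best:
        torch.save(state, os.path.join(ckpt_path, file_name[0]))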
Example #5
def main(opt):
    start_epoch = 0
    err_best = 10000
    lr_now = opt.lr
    is_cuda = torch.cuda.is_available()

    print(">>> loading data")
    input_n = opt.input_n
    output_n = opt.output_n
    dct_n = opt.dct_n
    sample_rate = opt.sample_rate

    #####################################################
    # Load data
    #####################################################
    data = DATA(opt.dataset, opt.data_dir)
    out_of_distribution = data.get_dct_and_sequences(input_n, output_n,
                                                     sample_rate, dct_n,
                                                     opt.out_of_distribution)
    train_loader, val_loader, OoD_val_loader, test_loaders = data.get_dataloaders(
        opt.train_batch, opt.test_batch, opt.job)
    print(">>> data loaded !")
    print(">>> train data {}".format(data.train_dataset.__len__()))
    if opt.dataset == 'h3.6m':
        print(">>> validation data {}".format(data.val_dataset.__len__()))

    #####################################################
    # Define script name
    #####################################################
    script_name = os.path.basename(__file__).split('.')[0]
    script_name = script_name + "_{}_in{:d}_out{:d}_dctn{:d}_dropout_{}".format(
        str(opt.dataset), opt.input_n, opt.output_n, opt.dct_n, str(
            opt.dropout))
    if out_of_distribution:
        script_name = script_name + "_OoD_{}_".format(
            str(opt.out_of_distribution))
    if opt.variational:
        script_name = script_name + "_var_lambda_{}_nz_{}_lr_{}_n_layers_{}".format(
            str(opt.lambda_), str(opt.n_z), str(opt.lr),
            str(opt.num_decoder_stage))

    ##################################################################
    # Instantiate model, and methods used for training and validation
    ##################################################################
    print(">>> creating model")
    model = nnmodel.GCN(input_feature=dct_n,
                        hidden_feature=opt.linear_size,
                        p_dropout=opt.dropout,
                        num_stage=opt.num_stage,
                        node_n=data.node_n,
                        variational=opt.variational,
                        n_z=opt.n_z,
                        num_decoder_stage=opt.num_decoder_stage)
    methods = MODEL_METHODS(model, is_cuda)
    if opt.is_load:
        start_epoch, err_best, lr_now = methods.load_weights(opt.load_path)
    print(">>> total params: {:.2f}M".format(
        sum(p.numel() for p in model.parameters()) / 1000000.0))
    methods.optimizer = torch.optim.Adam(model.parameters(), lr=opt.lr)

    for epoch in range(start_epoch, opt.epochs):
        #####################################################
        # Training step
        #####################################################
        if (epoch + 1) % opt.lr_decay == 0:
            lr_now = utils.lr_decay(methods.optimizer, lr_now, opt.lr_gamma)
        print('==========================')
        print('>>> epoch: {} | lr: {:.5f}'.format(epoch + 1, lr_now))
        ret_log = np.array([epoch + 1])
        head = np.array(['epoch'])
        # per epoch
        lr_now, t_l, t_l_joint, t_l_vlb, t_l_latent, t_e, t_3d = methods.train(
            train_loader,
            dataset=opt.dataset,
            input_n=input_n,
            lr_now=lr_now,
            cartesian=data.cartesian,
            lambda_=opt.lambda_,
            max_norm=opt.max_norm,
            dim_used=data.train_dataset.dim_used,
            dct_n=dct_n)
        ret_log = np.append(
            ret_log, [lr_now, t_l, t_l_joint, t_l_vlb, t_l_latent, t_e, t_3d])
        head = np.append(
            head,
            ['lr', 't_l', 't_l_joint', 't_l_vlb', 't_l_latent', 't_e', 't_3d'])

        #####################################################
        # Evaluate on the validation set; keep track of the best model
        # via the val set, the OoD val set (for OoD runs), or the train
        # set in the case of the CMU dataset
        #####################################################
        if opt.dataset == 'h3.6m':
            v_e, v_3d = methods.val(val_loader,
                                    input_n=input_n,
                                    dim_used=data.train_dataset.dim_used,
                                    dct_n=dct_n)
            ret_log = np.append(ret_log, [v_e, v_3d])
            head = np.append(head, ['v_e', 'v_3d'])

            is_best, err_best = utils.check_is_best(v_e, err_best)
            if out_of_distribution:
                OoD_v_e, OoD_v_3d = methods.val(
                    OoD_val_loader,
                    input_n=input_n,
                    dim_used=data.train_dataset.dim_used,
                    dct_n=dct_n)
                ret_log = np.append(ret_log, [OoD_v_e, OoD_v_3d])
                head = np.append(head, ['OoD_v_e', 'OoD_v_3d'])
        else:
            is_best, err_best = utils.check_is_best(t_e, err_best)

        #####################################################
        # Evaluate on test set
        #####################################################
        test_3d_temp = np.array([])
        test_3d_head = np.array([])
        for act in data.acts_test:
            test_e, test_3d = methods.test(
                test_loaders[act],
                dataset=opt.dataset,
                input_n=input_n,
                output_n=output_n,
                cartesian=data.cartesian,
                dim_used=data.train_dataset.dim_used,
                dct_n=dct_n)
            ret_log = np.append(ret_log, test_e)
            test_3d_temp = np.append(test_3d_temp, test_3d)
            test_3d_head = np.append(
                test_3d_head,
                [act + '3d80', act + '3d160', act + '3d320', act + '3d400'])
            head = np.append(
                head, [act + '80', act + '160', act + '320', act + '400'])
            if output_n > 10:
                head = np.append(head, [act + '560', act + '1000'])
                test_3d_head = np.append(test_3d_head,
                                         [act + '3d560', act + '3d1000'])
        ret_log = np.append(ret_log, test_3d_temp)
        head = np.append(head, test_3d_head)

        #####################################################
        # Update log file and save checkpoint
        #####################################################
        df = pd.DataFrame(np.expand_dims(ret_log, axis=0))
        if epoch == start_epoch:
            df.to_csv(opt.ckpt + '/' + script_name + '.csv',
                      header=head,
                      index=False)
        else:
            with open(opt.ckpt + '/' + script_name + '.csv', 'a') as f:
                df.to_csv(f, header=False, index=False)
        file_name = [
            'ckpt_' + script_name + '_best.pth.tar',
            'ckpt_' + script_name + '_last.pth.tar'
        ]
        utils.save_ckpt(
            {
                'epoch': epoch + 1,
                'lr': lr_now,
                'err': test_e[0],
                'state_dict': model.state_dict(),
                'optimizer': methods.optimizer.state_dict()
            },
            ckpt_path=opt.ckpt,
            is_best=is_best,
            file_name=file_name)
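
`utils.check_is_best` folds the best-error bookkeeping that earlier examples wrote inline (`is_best = v_err < err_best; err_best = min(v_err, err_best)`) into one call. A minimal sketch consistent with how its two return values are used above:

def check_is_best(err_now, err_best):
    # Lower error is better: return the improvement flag together with
    # the updated running best.
    is_best = err_now < err_best
    return is_best, min(err_now, err_best)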
Example #6
def main():
    cmd_ls = sys.argv[1:]
    cmd = generate_cmd(cmd_ls)
    if "--freeze_bn False" in cmd:
        opt.freeze_bn = False
    if "--addDPG False" in cmd:
        opt.addDPG = False

    print(
        "----------------------------------------------------------------------------------------------------"
    )
    print("This is the model with id {}".format(save_ID))
    print(opt)
    print("Training backbone is: {}".format(opt.backbone))
    dataset_str = ""
    for k, v in config.train_info.items():
        dataset_str += k
        dataset_str += ","
    print("Training data is: {}".format(dataset_str[:-1]))
    print("Warm up end at {}".format(warm_up_epoch))
    for k, v in config.bad_epochs.items():
        if v > 1:
            raise ValueError("Wrong stopping accuracy!")
    print(
        "----------------------------------------------------------------------------------------------------"
    )

    exp_dir = os.path.join("exp/{}/{}".format(folder, save_ID))
    log_dir = os.path.join(exp_dir, "{}".format(save_ID))
    os.makedirs(log_dir, exist_ok=True)
    log_name = os.path.join(log_dir, "{}.txt".format(save_ID))
    train_log_name = os.path.join(log_dir, "{}_train.xlsx".format(save_ID))
    bn_file = os.path.join(log_dir, "{}_bn.txt".format(save_ID))
    # Prepare Dataset

    # Model Initialize
    if device != "cpu":
        m = createModel(cfg=model_cfg).cuda()
    else:
        m = createModel(cfg=model_cfg).cpu()
    with open("model.txt", "w") as f:
        print(m, file=f)

    begin_epoch = 0
    pre_train_model = opt.loadModel
    flops = print_model_param_flops(m)
    print("FLOPs of current model is {}".format(flops))
    params = print_model_param_nums(m)
    print("Parameters of current model is {}".format(params))
    inf_time = get_inference_time(m,
                                  height=opt.outputResH,
                                  width=opt.outputResW)
    print("Inference time is {}".format(inf_time))
    print(
        "----------------------------------------------------------------------------------------------------"
    )

    if opt.freeze > 0 or opt.freeze_bn:
        if opt.backbone == "mobilenet":
            feature_layer_num = 155
            feature_layer_name = "features"
        elif opt.backbone == "seresnet101":
            feature_layer_num = 327
            feature_layer_name = "preact"
        elif opt.backbone == "seresnet18":
            feature_layer_num = 75
            feature_layer_name = "seresnet18"
        elif opt.backbone == "shufflenet":
            feature_layer_num = 167
            feature_layer_name = "shuffle"
        else:
            raise ValueError("Not a correct name")

        feature_num = int(opt.freeze * feature_layer_num)

        for idx, (n, p) in enumerate(m.named_parameters()):
            if len(p.shape) == 1 and opt.freeze_bn:
                p.requires_grad = False
            elif feature_layer_name in n and idx < feature_num:
                p.requires_grad = False
            else:
                p.requires_grad = True

    writer = SummaryWriter('exp/{}/{}'.format(folder, save_ID), comment=cmd)

    if device != "cpu":
        # rnd_inps = Variable(torch.rand(3, 3, 224, 224), requires_grad=True).cuda()
        rnd_inps = torch.rand(3, 3, 224, 224).cuda()
    else:
        rnd_inps = torch.rand(3, 3, 224, 224)
        # rnd_inps = Variable(torch.rand(3, 3, 224, 224), requires_grad=True)
    try:
        writer.add_graph(m, (rnd_inps, ))
    except Exception:
        # graph tracing can fail for some model structures; skip it
        pass

    shuffle_dataset = False
    for k, v in config.train_info.items():
        if k not in open_source_dataset:
            shuffle_dataset = True

    train_dataset = MyDataset(config.train_info, train=True)
    val_dataset = MyDataset(config.train_info, train=False)
    if shuffle_dataset:
        val_dataset.img_val, val_dataset.bbox_val, val_dataset.part_val = \
            train_dataset.img_val, train_dataset.bbox_val, train_dataset.part_val

    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=opt.trainBatch,
                                               shuffle=True,
                                               num_workers=opt.trainNW,
                                               pin_memory=True)
    val_loader = torch.utils.data.DataLoader(val_dataset,
                                             batch_size=opt.validBatch,
                                             shuffle=True,
                                             num_workers=opt.valNW,
                                             pin_memory=True)

    # for k, v in config.train_info.items():
    #     train_dataset = Mscoco([v[0], v[1]], train=True, val_img_num=v[2])
    #     val_dataset = Mscoco([v[0], v[1]], train=False, val_img_num=v[2])
    #
    # train_loaders[k] = torch.utils.data.DataLoader(
    #     train_dataset, batch_size=config.train_batch, shuffle=True, num_workers=config.train_mum_worker,
    #     pin_memory=True)
    #
    # val_loaders[k] = torch.utils.data.DataLoader(
    #     val_dataset, batch_size=config.val_batch, shuffle=False, num_workers=config.val_num_worker, pin_memory=True)
    #
    # train_loader = torch.utils.data.DataLoader(
    #         train_dataset, batch_size=config.train_batch, shuffle=True, num_workers=config.train_mum_worker,
    #         pin_memory=True)
    # val_loader = torch.utils.data.DataLoader(
    #         val_dataset, batch_size=config.val_batch, shuffle=False, num_workers=config.val_num_worker, pin_memory=True)

    # assert train_loaders != {}, "Your training data has not been specific! "

    os.makedirs("exp/{}/{}".format(folder, save_ID), exist_ok=True)
    if pre_train_model:
        if "duc_se.pth" not in pre_train_model:
            if "pretrain" not in pre_train_model:
                try:
                    info_path = os.path.join("exp", folder, save_ID,
                                             "option.pkl")
                    info = torch.load(info_path)
                    opt.trainIters = info.trainIters
                    opt.valIters = info.valIters
                    begin_epoch = int(pre_train_model.split("_")[-1][:-4]) + 1
                except Exception:
                    # begin_epoch = int(pre_train_model.split("_")[-1][:-4]) + 1
                    with open(log_name, "a+") as f:
                        f.write(cmd)

            print('Loading Model from {}'.format(pre_train_model))
            m.load_state_dict(torch.load(pre_train_model))
        else:
            with open(log_name, "a+") as f:
                f.write(cmd)
            print('Loading Model from {}'.format(pre_train_model))
            m.load_state_dict(torch.load(pre_train_model))
            m.conv_out = nn.Conv2d(m.DIM,
                                   opt.kps,
                                   kernel_size=3,
                                   stride=1,
                                   padding=1)
            if device != "cpu":
                m.conv_out.cuda()
            os.makedirs("exp/{}/{}".format(folder, save_ID), exist_ok=True)
    else:
        print('Create new model')
        with open(log_name, "a+") as f:
            f.write(cmd)
            print(opt, file=f)
            f.write("FLOPs of current model is {}\n".format(flops))
            f.write("Parameters of current model is {}\n".format(params))

    with open(os.path.join(log_dir, "tb.py"), "w") as pyfile:
        pyfile.write("import os\n")
        pyfile.write("os.system('conda init bash')\n")
        pyfile.write("os.system('conda activate py36')\n")
        pyfile.write(
            "os.system('tensorboard --logdir=../../../../exp/{}/{}')".format(
                folder, save_ID))

    params_to_update, layers = [], 0
    for name, param in m.named_parameters():
        layers += 1
        if param.requires_grad:
            params_to_update.append(param)
    print("Training {} layers out of {}".format(len(params_to_update), layers))

    if optimize == 'rmsprop':
        optimizer = torch.optim.RMSprop(params_to_update,
                                        lr=opt.LR,
                                        momentum=opt.momentum,
                                        weight_decay=opt.weightDecay)
    elif optimize == 'adam':
        optimizer = torch.optim.Adam(params_to_update,
                                     lr=opt.LR,
                                     weight_decay=opt.weightDecay)
    elif optimize == 'sgd':
        optimizer = torch.optim.SGD(params_to_update,
                                    lr=opt.LR,
                                    momentum=opt.momentum,
                                    weight_decay=opt.weightDecay)
    else:
        raise Exception

    if mix_precision:
        m, optimizer = amp.initialize(m, optimizer, opt_level="O1")

    # Model Transfer
    if device != "cpu":
        m = torch.nn.DataParallel(m).cuda()
        criterion = torch.nn.MSELoss().cuda()
    else:
        m = torch.nn.DataParallel(m)
        criterion = torch.nn.MSELoss()

    # loss, acc = valid(val_loader, m, criterion, optimizer, writer)
    # print('Valid:-{idx:d} epoch | loss:{loss:.8f} | acc:{acc:.4f}'.format(
    #     idx=-1,
    #     loss=loss,
    #     acc=acc
    # ))

    early_stopping = EarlyStopping(patience=opt.patience, verbose=True)
    train_acc, val_acc, train_loss, val_loss, best_epoch, train_dist, val_dist, train_auc, val_auc, train_PR, val_PR = \
        0, 0, float("inf"), float("inf"), 0, float("inf"), float("inf"), 0, 0, 0, 0
    train_acc_ls, val_acc_ls, train_loss_ls, val_loss_ls, train_dist_ls, val_dist_ls, train_auc_ls, val_auc_ls, \
        train_pr_ls, val_pr_ls, epoch_ls, lr_ls = [], [], [], [], [], [], [], [], [], [], [], []
    decay, decay_epoch, lr, i = 0, [], opt.LR, begin_epoch
    stop = False
    m_best = m

    train_log = open(train_log_name, "w", newline="")
    bn_log = open(bn_file, "w")
    csv_writer = csv.writer(train_log)
    csv_writer.writerow(write_csv_title())
    begin_time = time.time()

    os.makedirs("result", exist_ok=True)
    result = os.path.join(
        "result", "{}_result_{}.csv".format(opt.expFolder, config.computer))
    exist = os.path.exists(result)

    # Start Training
    try:
        for i in range(begin_epoch, opt.nEpochs):

            opt.epoch = i
            epoch_ls.append(i)
            train_log_tmp = [save_ID, i, lr]

            log = open(log_name, "a+")
            print('############# Starting Epoch {} #############'.format(i))
            log.write(
                '############# Starting Epoch {} #############\n'.format(i))

            # optimizer, lr = adjust_lr(optimizer, i, config.lr_decay, opt.nEpochs)
            # writer.add_scalar("lr", lr, i)
            # print("epoch {}: lr {}".format(i, lr))

            loss, acc, dist, auc, pr, pt_acc, pt_dist, pt_auc, pt_pr = \
                train(train_loader, m, criterion, optimizer, writer)
            train_log_tmp.append(" ")
            train_log_tmp.append(loss)
            train_log_tmp.append(acc.tolist())
            train_log_tmp.append(dist.tolist())
            train_log_tmp.append(auc)
            train_log_tmp.append(pr)
            for a in pt_acc:
                train_log_tmp.append(a.tolist())
            train_log_tmp.append(" ")
            for d in pt_dist:
                train_log_tmp.append(d.tolist())
            train_log_tmp.append(" ")
            for ac in pt_auc:
                train_log_tmp.append(ac)
            train_log_tmp.append(" ")
            for p in pt_pr:
                train_log_tmp.append(p)
            train_log_tmp.append(" ")

            train_acc_ls.append(acc)
            train_loss_ls.append(loss)
            train_dist_ls.append(dist)
            train_auc_ls.append(auc)
            train_pr_ls.append(pr)
            train_acc = acc if acc > train_acc else train_acc
            train_loss = loss if loss < train_loss else train_loss
            train_dist = dist if dist < train_dist else train_dist
            train_auc = auc if auc > train_auc else train_auc
            train_PR = pr if pr > train_PR else train_PR

            log.write(
                'Train:{idx:d} epoch | loss:{loss:.8f} | acc:{acc:.4f} | dist:{dist:.4f} | AUC: {AUC:.4f} | PR: {PR:.4f}\n'
                .format(
                    idx=i,
                    loss=loss,
                    acc=acc,
                    dist=dist,
                    AUC=auc,
                    PR=pr,
                ))

            opt.acc = acc
            opt.loss = loss
            m_dev = m.module

            loss, acc, dist, auc, pr, pt_acc, pt_dist, pt_auc, pt_pr = valid(
                val_loader, m, criterion, writer)
            train_log_tmp.insert(9, loss)
            train_log_tmp.insert(10, acc.tolist())
            train_log_tmp.insert(11, dist.tolist())
            train_log_tmp.insert(12, auc)
            train_log_tmp.insert(13, pr)
            train_log_tmp.insert(14, " ")
            for a in pt_acc:
                train_log_tmp.append(a.tolist())
            train_log_tmp.append(" ")
            for d in pt_dist:
                train_log_tmp.append(d.tolist())
            train_log_tmp.append(" ")
            for ac in pt_auc:
                train_log_tmp.append(ac)
            train_log_tmp.append(" ")
            for p in pt_pr:
                train_log_tmp.append(p)
            train_log_tmp.append(" ")

            val_acc_ls.append(acc)
            val_loss_ls.append(loss)
            val_dist_ls.append(dist)
            val_auc_ls.append(auc)
            val_pr_ls.append(pr)
            if acc > val_acc:
                best_epoch = i
                val_acc = acc
                torch.save(
                    m_dev.state_dict(),
                    'exp/{0}/{1}/{1}_best_acc.pkl'.format(folder, save_ID))
                m_best = copy.deepcopy(m)
            val_loss = loss if loss < val_loss else val_loss
            if dist < val_dist:
                val_dist = dist
                torch.save(
                    m_dev.state_dict(),
                    'exp/{0}/{1}/{1}_best_dist.pkl'.format(folder, save_ID))
            if auc > val_auc:
                val_auc = auc
                torch.save(
                    m_dev.state_dict(),
                    'exp/{0}/{1}/{1}_best_auc.pkl'.format(folder, save_ID))
            if pr > val_PR:
                val_PR = pr
                torch.save(
                    m_dev.state_dict(),
                    'exp/{0}/{1}/{1}_best_pr.pkl'.format(folder, save_ID))

            log.write(
                'Valid:{idx:d} epoch | loss:{loss:.8f} | acc:{acc:.4f} | dist:{dist:.4f} | AUC: {AUC:.4f} | PR: {PR:.4f}\n'
                .format(
                    idx=i,
                    loss=loss,
                    acc=acc,
                    dist=dist,
                    AUC=auc,
                    PR=pr,
                ))

            bn_sum, bn_num = 0, 0
            for mod in m.modules():
                if isinstance(mod, nn.BatchNorm2d):
                    bn_num += mod.num_features
                    bn_sum += torch.sum(abs(mod.weight))
                    writer.add_histogram("bn_weight",
                                         mod.weight.data.cpu().numpy(), i)

            bn_ave = bn_sum / bn_num
            bn_log.write("{} --> {}".format(i, bn_ave))
            print("Current bn : {} --> {}".format(i, bn_ave))
            bn_log.write("\n")
            log.close()
            csv_writer.writerow(train_log_tmp)

            writer.add_scalar("lr", lr, i)
            print("epoch {}: lr {}".format(i, lr))
            lr_ls.append(lr)

            torch.save(opt, 'exp/{}/{}/option.pkl'.format(folder, save_ID))
            if i % opt.save_interval == 0 and i != 0:
                torch.save(
                    m_dev.state_dict(),
                    'exp/{0}/{1}/{1}_{2}.pkl'.format(folder, save_ID, i))
                # torch.save(
                #     optimizer, 'exp/{}/{}/optimizer.pkl'.format(dataset, save_folder))

            if i < warm_up_epoch:
                optimizer, lr = warm_up_lr(optimizer, i)
            elif i == warm_up_epoch:
                lr = opt.LR
                early_stopping(acc)
            else:
                early_stopping(acc)
                if early_stopping.early_stop:
                    optimizer, lr = lr_decay(optimizer, lr)
                    decay += 1
                    # if decay == 2:
                    #     draw_pred_img = False
                    if decay > opt.lr_decay_time:
                        stop = True
                    else:
                        decay_epoch.append(i)
                        early_stopping.reset(
                            int(opt.patience * patience_decay[decay]))
                        # torch.save(m_dev.state_dict(), 'exp/{0}/{1}/{1}_decay{2}.pkl'.format(folder, save_ID, decay))
                        m = m_best

            for epo, ac in config.bad_epochs.items():
                if i == epo and val_acc < ac:
                    stop = True
            if stop:
                print("Training finished at epoch {}".format(i))
                break

        training_time = time.time() - begin_time
        writer.close()
        train_log.close()

        # draw_graph(epoch_ls, train_loss_ls, val_loss_ls, train_acc_ls, val_acc_ls, train_dist_ls, val_dist_ls, log_dir)
        draw_graph(epoch_ls, train_loss_ls, val_loss_ls, "loss", log_dir)
        draw_graph(epoch_ls, train_acc_ls, val_acc_ls, "acc", log_dir)
        draw_graph(epoch_ls, train_auc_ls, val_auc_ls, "AUC", log_dir)
        draw_graph(epoch_ls, train_dist_ls, val_dist_ls, "dist", log_dir)
        draw_graph(epoch_ls, train_pr_ls, val_pr_ls, "PR", log_dir)

        with open(result, "a+") as f:
            if not exist:
                title_str = "id,backbone,structure,DUC,params,flops,time,loss_param,addDPG,kps,batch_size,optimizer," \
                            "freeze_bn,freeze,sparse,sparse_decay,epoch_num,LR,Gaussian,thresh,weightDecay,loadModel," \
                            "model_location, ,folder_name,training_time,train_acc,train_loss,train_dist,train_AUC," \
                            "train_PR,val_acc,val_loss,val_dist,val_AUC,val_PR,best_epoch,final_epoch"
                title_str = write_decay_title(len(decay_epoch), title_str)
                f.write(title_str)
            info_str = "{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{}, ,{},{},{},{},{},{},{},{},{},{},{},{},{},{}\n".\
                format(save_ID, opt.backbone, opt.struct, opt.DUC, params, flops, inf_time, opt.loss_allocate, opt.addDPG,
                       opt.kps, opt.trainBatch, opt.optMethod, opt.freeze_bn, opt.freeze, opt.sparse_s, opt.sparse_decay,
                       opt.nEpochs, opt.LR, opt.hmGauss, opt.ratio, opt.weightDecay, opt.loadModel, config.computer,
                       os.path.join(folder, save_ID), training_time, train_acc, train_loss, train_dist, train_auc,
                       train_PR, val_acc, val_loss, val_dist, val_auc, val_PR, best_epoch, i)
            info_str = write_decay_info(decay_epoch, info_str)
            f.write(info_str)
    # except IOError:
    #     with open(result, "a+") as f:
    #         training_time = time.time() - begin_time
    #         writer.close()
    #         train_log.close()
    #         info_str = "{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{}, ,{},{},{}\n". \
    #             format(save_ID, opt.backbone, opt.struct, opt.DUC, params, flops, inf_time, opt.loss_allocate, opt.addDPG,
    #                    opt.kps, opt.trainBatch, opt.optMethod, opt.freeze_bn, opt.freeze, opt.sparse_s, opt.sparse_decay,
    #                    opt.nEpochs, opt.LR, opt.hmGauss, opt.ratio, opt.weightDecay, opt.loadModel, config.computer,
    #                    os.path.join(folder, save_ID), training_time, "Some file is closed")
    #         f.write(info_str)
    except ZeroDivisionError:
        with open(result, "a+") as f:
            training_time = time.time() - begin_time
            writer.close()
            train_log.close()
            info_str = "{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{}, ,{},{},{}\n". \
                format(save_ID, opt.backbone, opt.struct, opt.DUC, params, flops, inf_time, opt.loss_allocate, opt.addDPG,
                       opt.kps, opt.trainBatch, opt.optMethod, opt.freeze_bn, opt.freeze, opt.sparse_s, opt.sparse_decay,
                       opt.nEpochs, opt.LR, opt.hmGauss, opt.ratio, opt.weightDecay, opt.loadModel, config.computer,
                       os.path.join(folder, save_ID), training_time, "Gradient flow")
            f.write(info_str)
    except KeyboardInterrupt:
        with open(result, "a+") as f:
            training_time = time.time() - begin_time
            writer.close()
            train_log.close()
            info_str = "{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{}, ,{},{},{}\n". \
                format(save_ID, opt.backbone, opt.struct, opt.DUC, params, flops, inf_time, opt.loss_allocate, opt.addDPG,
                       opt.kps, opt.trainBatch, opt.optMethod, opt.freeze_bn, opt.freeze, opt.sparse_s, opt.sparse_decay,
                       opt.nEpochs, opt.LR, opt.hmGauss, opt.ratio, opt.weightDecay, opt.loadModel, config.computer,
                       os.path.join(folder, save_ID), training_time, "Be killed by someone")
            f.write(info_str)

    print("Model {} training finished".format(save_ID))
    print(
        "----------------------------------------------------------------------------------------------------"
    )
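
The loop above depends on an `EarlyStopping` object with a callable interface, an `early_stop` flag, and a `reset(patience)` method. The class is defined elsewhere; a minimal sketch, assuming it watches a metric that should increase (validation accuracy here) and trips after `patience` epochs without improvement:

class EarlyStopping:
    def __init__(self, patience=10, verbose=False):
        self.patience = patience
        self.verbose = verbose
        self.best = None
        self.counter = 0
        self.early_stop = False

    def __call__(self, metric):
        # Reset the counter on improvement; otherwise spend one epoch of
        # the patience budget and raise the flag when it runs out.
        if self.best is None or metric > self.best:
            self.best = metric
            self.counter = 0
        else:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True

    def reset(self, patience):
        # Restart with a new (typically shrunken) patience after an lr
        # decay, as the loop above does via patience_decay.
        self.patience = patience
        self.counter = 0
        self.early_stop = False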
Example #7
def _train(loader,
           optimizer,
           loss_function,
           model,
           config=None,
           lr=None,
           evaluation=True,
           name_best_model='weights/best',
           preprocess_mode=None):
    # Parameters for training
    training_samples = len(loader.image_train_list)
    steps_per_epoch = int(training_samples / config['batch_size']) + 1
    best_miou = 0
    log_freq = min(50, int(steps_per_epoch / 5))
    avg_loss = tf.keras.metrics.Mean(name='loss', dtype=tf.float32)
    train_summary_writer = tf.summary.create_file_writer(
        '/tmp/summaries/train')  # tensorboard
    test_summary_writer = tf.summary.create_file_writer(
        '/tmp/summaries/test')  # tensorboard
    print('Please enter in terminal: tensorboard --logdir /tmp/summaries')

    for epoch in range(config['epochs']):  # for each epoch
        start_time_epoch = time.time()
        lr_decay(lr, config['init_lr'], 1e-9, epoch,
                 config['epochs'] - 1)  # compute the new lr
        print('epoch: ' + str(epoch + 1) + '. Learning rate: ' +
              str(lr.numpy()))

        for step in range(steps_per_epoch):  # for every batch

            # get batch
            x, y, mask = loader.get_batch(size=config['batch_size'],
                                          train=True)

            x = preprocess(x, mode=preprocess_mode)

            with train_summary_writer.as_default():
                loss = train_step(
                    model, x, y, mask, loss_function, optimizer,
                    (config['height_train'], config['width_train']),
                    config['zoom_augmentation'])
                # tensorboard
                avg_loss.update_state(loss)
                if tf.equal(optimizer.iterations % log_freq, 0):
                    tf.summary.scalar('loss',
                                      avg_loss.result(),
                                      step=optimizer.iterations)
                    avg_loss.reset_states()

        if evaluation:
            # get metrics

            # with train_summary_writer.as_default():
            #     train_acc, train_miou = get_metrics(loader, model, loader.n_classes, train=True, flip_inference=False, preprocess_mode=preprocess_mode, optimizer=optimizer)

            with test_summary_writer.as_default():
                test_acc, test_miou = get_metrics(
                    loader,
                    model,
                    loader.n_classes,
                    train=False,
                    flip_inference=False,
                    preprocess_mode=preprocess_mode,
                    optimizer=optimizer,
                    scales=[1])

            # print('Train accuracy: ' + str(train_acc.numpy()))
            # print('Train miou: ' + str(train_miou.numpy()))
            print('Test accuracy: ' + str(test_acc.numpy()))
            print('Test miou: ' + str(test_miou.numpy()))

            # save model if best model
            if test_miou.numpy() > best_miou:
                best_miou = test_miou.numpy()
                model.save_weights(name_best_model)

            print('Current Best model miou: ' + str(best_miou))
            print('')

        else:
            model.save_weights(name_best_model)

        loader.suffle_segmentation()  # shuffle the training set every epoch
        print('Epoch time seconds: ' + str(time.time() - start_time_epoch))
示例#8
0
def train(loader,
          optimizer,
          loss_function,
          model,
          size_input,
          epochs=5,
          batch_size=2,
          lr=None,
          init_lr=2e-4,
          evaluation=True,
          name_best_model='weights/best',
          preprocess_mode=None,
          labels_resize_factor=1):
    # Parameters for training
    training_samples = len(loader.image_train_list)
    steps_per_epoch = int(training_samples / batch_size) + 1
    best_miou = 0
    log_freq = min(50, int(steps_per_epoch / 5))
    avg_loss = tf.keras.metrics.Mean(name='loss', dtype=tf.float32)
    train_summary_writer = tf.summary.create_file_writer(
        '/tmp/summaries/train')  # tensorboard
    test_summary_writer = tf.summary.create_file_writer(
        '/tmp/summaries/test')  # tensorboard
    print('Please enter in terminal: tensorboard --logdir /tmp/summaries')

    for epoch in range(epochs):  # for each epoch
        lr_decay(lr, init_lr, 1e-9, epoch, epochs - 1)  # compute the new lr
        print('epoch: ' + str(epoch + 1) + '. Learning rate: ' +
              str(lr.numpy()))
        for step in range(steps_per_epoch):  # for every batch
            # get batch
            x, y, mask = loader.get_batch(size=batch_size, train=True)
            x = preprocess(x, mode=preprocess_mode)

            with train_summary_writer.as_default():
                loss = train_step(model, x, y, mask, loss_function, optimizer,
                                  labels_resize_factor, size_input)
                # tensorboard
                avg_loss.update_state(loss)
                if tf.equal(optimizer.iterations % log_freq, 0):
                    tf.summary.scalar('loss',
                                      avg_loss.result(),
                                      step=optimizer.iterations)
                    avg_loss.reset_states()

        if evaluation:
            # get metrics
            with train_summary_writer.as_default():
                train_acc, train_miou = get_metrics(
                    loader,
                    model,
                    loader.n_classes,
                    train=True,
                    preprocess_mode=preprocess_mode,
                    labels_resize_factor=labels_resize_factor,
                    optimizer=optimizer)

            with test_summary_writer.as_default():
                test_acc, test_miou = get_metrics(
                    loader,
                    model,
                    loader.n_classes,
                    train=False,
                    flip_inference=False,
                    scales=[1],
                    preprocess_mode=preprocess_mode,
                    labels_resize_factor=labels_resize_factor,
                    optimizer=optimizer)

            print('Train accuracy: ' + str(train_acc.numpy()))
            print('Train miou: ' + str(train_miou.numpy()))
            print('Test accuracy: ' + str(test_acc.numpy()))
            print('Test miou: ' + str(test_miou.numpy()))
            print('')

            # save model if best
            if test_miou > best_miou:
                best_miou = test_miou
                model.save_weights(name_best_model)
        else:
            model.save_weights(name_best_model)

        loader.suffle_segmentation()  # shuffle the training set
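
A hypothetical invocation of this `train` function, showing the pattern the TensorFlow examples share: `lr` is a non-trainable `tf.Variable` that the optimizer reads and `lr_decay` rewrites each epoch. `loader`, `loss_function`, `model`, and the input size are placeholders assumed to be built elsewhere:

import tensorflow as tf

lr = tf.Variable(2e-4, trainable=False)
optimizer = tf.keras.optimizers.Adam(learning_rate=lr)
train(loader, optimizer, loss_function, model,
      size_input=(512, 512), epochs=5, batch_size=2,
      lr=lr, init_lr=2e-4, preprocess_mode=None)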
Example #9
def main(opt):
    start_epoch = 0
    err_best = 10000
    lr_now = opt.lr
    is_cuda = torch.cuda.is_available()

    # define log csv file
    script_name = os.path.basename(__file__).split('.')[0]
    script_name = script_name + "_in{:d}_out{:d}_dctn{:d}".format(
        opt.input_n, opt.output_n, opt.dct_n)

    # create model
    print(">>> creating model")
    input_n = opt.input_n
    output_n = opt.output_n
    dct_n = opt.dct_n
    sample_rate = opt.sample_rate

    # 48 nodes for angle prediction
    model = nnmodel.GCN(input_feature=dct_n,
                        hidden_feature=opt.linear_size,
                        p_dropout=opt.dropout,
                        num_stage=opt.num_stage,
                        node_n=48)

    if is_cuda:
        model.cuda()

    print(">>> total params: {:.2f}M".format(
        sum(p.numel() for p in model.parameters()) / 1000000.0))
    optimizer = torch.optim.Adam(model.parameters(), lr=opt.lr)

    # continue from checkpoint
    if opt.is_load:
        model_path_len = 'checkpoint/test/ckpt_main_gcn_muti_att_best.pth.tar'
        print(">>> loading ckpt len from '{}'".format(model_path_len))
        if is_cuda:
            ckpt = torch.load(model_path_len)
        else:
            ckpt = torch.load(model_path_len, map_location='cpu')
        start_epoch = ckpt['epoch']
        err_best = ckpt['err']
        lr_now = ckpt['lr']
        model.load_state_dict(ckpt['state_dict'])
        optimizer.load_state_dict(ckpt['optimizer'])
        print(">>> ckpt len loaded (epoch: {} | err: {})".format(
            start_epoch, err_best))

    # data loading
    print(">>> loading data")
    train_dataset = H36motion(path_to_data=opt.data_dir,
                              actions='all',
                              input_n=input_n,
                              output_n=output_n,
                              split=0,
                              sample_rate=sample_rate,
                              dct_n=dct_n)
    data_std = train_dataset.data_std
    data_mean = train_dataset.data_mean

    val_dataset = H36motion(path_to_data=opt.data_dir,
                            actions='all',
                            input_n=input_n,
                            output_n=output_n,
                            split=2,
                            sample_rate=sample_rate,
                            data_mean=data_mean,
                            data_std=data_std,
                            dct_n=dct_n)

    # load datasets for training
    train_loader = DataLoader(dataset=train_dataset,
                              batch_size=opt.train_batch,
                              shuffle=True,
                              num_workers=opt.job,
                              pin_memory=True)
    val_loader = DataLoader(dataset=val_dataset,
                            batch_size=opt.test_batch,
                            shuffle=False,
                            num_workers=opt.job,
                            pin_memory=True)

    acts = data_utils.define_actions('all')
    test_data = dict()
    for act in acts:
        test_dataset = H36motion(path_to_data=opt.data_dir,
                                 actions=act,
                                 input_n=input_n,
                                 output_n=output_n,
                                 split=1,
                                 sample_rate=sample_rate,
                                 data_mean=data_mean,
                                 data_std=data_std,
                                 dct_n=dct_n)
        test_data[act] = DataLoader(dataset=test_dataset,
                                    batch_size=opt.test_batch,
                                    shuffle=False,
                                    num_workers=opt.job,
                                    pin_memory=True)
    print(">>> data loaded !")
    print(">>> train data {}".format(train_dataset.__len__()))
    print(">>> validation data {}".format(val_dataset.__len__()))

    for epoch in range(start_epoch, opt.epochs):

        if (epoch + 1) % opt.lr_decay == 0:
            lr_now = utils.lr_decay(optimizer, lr_now, opt.lr_gamma)
        print('==========================')
        print('>>> epoch: {} | lr: {:.5f}'.format(epoch + 1, lr_now))
        ret_log = np.array([epoch + 1])
        head = np.array(['epoch'])
        # per epoch
        a = train_dataset.dim_used