Example #1
def main(args):
  """Run training."""
  val_perf = []  # (training loss, validation performance) pairs recorded during training

  train_data = utils.read_data(args, "train")
  val_data = utils.read_data(args, "val")

  args.train_num_examples = train_data.num_examples

  # construct the model on the specified GPU
  model = models.get_model(args, gpuid=args.gpuid)

  trainer = models.Trainer(model, args)
  tester = models.Tester(model, args)
  saver = tf.train.Saver(max_to_keep=5)
  bestsaver = tf.train.Saver(max_to_keep=5)

  save_period = args.save_period  # also the eval period

  # start training!
  tfconfig = tf.ConfigProto(allow_soft_placement=True)
  tfconfig.gpu_options.allow_growth = True
  tfconfig.gpu_options.visible_device_list = "%s" % (
      ",".join(["%s" % i for i in [args.gpuid]]))
  with tf.Session(config=tfconfig) as sess:

    utils.initialize(
        load=args.load, load_best=args.load_best, args=args, sess=sess)

    # total number of steps (iterations) the model will run:
    # ceil(num_examples / batch_size) * num_epochs
    num_steps = int(math.ceil(train_data.num_examples /
                              float(args.batch_size)))*args.num_epochs
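    # e.g. with 1,000 training examples, batch_size 64 and 10 epochs:
    # ceil(1000 / 64) = 16 steps per epoch, 160 steps in total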
    # get_batches is a generator, run on the fly

    print(" batch_size:%s, epoch:%s, %s step every epoch, total step:%s,"
          " eval/save every %s steps" % (args.batch_size,
                                         args.num_epochs,
                                         math.ceil(train_data.num_examples/
                                                   float(args.batch_size)),
                                         num_steps,
                                         args.save_period))

    metric = "ade"  # average displacement error # smaller better
    # remember the best eval acc during training
    best = {metric: 999999, "step": -1}

    finalperf = None
    is_start = True
    loss = -1
    grid_loss = -1
    xyloss = -1
    act_loss = -1
    traj_class_loss = -1

    for batch in tqdm(train_data.get_batches(args.batch_size,
                                             num_steps=num_steps),
                      total=num_steps, ascii=True):

      global_step = sess.run(model.global_step) + 1  # start from 0

      # save and evaluate periodically, and right away when resuming from an existing model
      if (global_step % save_period == 0) or \
         (args.load_best and is_start) or \
         (args.load and is_start and (args.ignore_vars is None)):

        tqdm.write("\tsaving model %s..." % global_step)
        saver.save(sess, args.save_dir_model, global_step=global_step)
        tqdm.write("\tdone")

        evalperf = utils.evaluate(val_data, args, sess, tester)

        tqdm.write(("\tlast loss:%.5f, xyloss:%.5f, traj_class_loss:%.5f,"
                    " grid_loss:%s, act_loss:%.5f, eval on validation:%s,"
                    " (best %s:%s at step %s) ") % (
                        loss, xyloss, traj_class_loss, grid_loss, act_loss,
                        ["%s: %s" % (k, evalperf[k])
                         for k in sorted(evalperf.keys())], metric,
                        best[metric], best["step"]))

        # remember the best metric
        if evalperf[metric] < best[metric]:
          best[metric] = evalperf[metric]
          best["step"] = global_step
          # save the best model
          tqdm.write("\t saving best model...")
          bestsaver.save(sess, args.save_dir_best_model,
                         global_step=global_step)
          tqdm.write("\t done.")

          finalperf = evalperf
          val_perf.append((loss, evalperf))
        is_start = False

      loss, _, xyloss, act_loss, traj_class_loss, grid_loss = \
          trainer.step(sess, batch)
      if math.isnan(loss):
        print("nan loss.")
        print(grid_loss)
        sys.exit()

    if global_step % save_period != 0:
      saver.save(sess, args.save_dir_model, global_step=global_step)

    print("best eval on val %s: %s at %s step, final step %s %s is %s" % (
        metric, best[metric], best["step"], global_step, metric,
        finalperf[metric]))
Example #2
def train():
    # Setup dataloaders: training and validation data; these determine, e.g., the segmentation classes
    train_dataset = data.UAVDataClassSeg(
        txt_path=
        '/home/mlxuan/project/DeepLearning/FCN/fcn_mlx/data/data/train.txt')
    trainloader = DataLoader(train_dataset,
                             batch_size=12,
                             shuffle=True,
                             drop_last=True,
                             num_workers=24,
                             pin_memory=True)
    val_dataset = data.UAVDataClassSeg(
        '/home/mlxuan/project/DeepLearning/FCN/fcn_mlx/data/data/valid/valid.txt',
        train=False)
    valloader = DataLoader(val_dataset, batch_size=1, shuffle=False)

    # Setup device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Setup model
    model = models.segnet(n_classes=len(val_dataset.class_names))
    # Initialize the model parameters with a pretrained VGG16 network
    model.init_vgg16_params(torchvision.models.vgg16(pretrained=True))

    # Setup optimizer, lr_scheduler and loss function

    def cross_entropy2d(input, target, weight=None, size_average=True):
        # input: (n, c, h, w), target: (n, h, w)
        n, c, h, w = input.size()
        # log_p: (n, c, h, w)
        if LooseVersion(torch.__version__) < LooseVersion(
                '0.3'):  # simple version check on torch.__version__
            # ==0.2.X
            log_p = F.log_softmax(input)
        else:
            # >=0.3
            log_p = F.log_softmax(input, dim=1)
        # log_p: (n*h*w, c). log_p is the log_softmax of the input, i.e. per-class
        # log-probabilities; tensor.transpose swaps the given tensor dimensions.
        log_p = log_p.transpose(1, 2).transpose(2, 3).contiguous()
        log_p = log_p[target.view(n, h, w, 1).repeat(1, 1, 1, c) >= 0]
        log_p = log_p.view(-1, c)
        # target: (n*h*w,)
        mask = target >= 0
        target = target[mask]
        loss = F.nll_loss(log_p, target, weight=weight)
        if size_average:
            loss /= mask.data.sum()
        return loss
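
    # Note: pixels with a negative label (e.g. an "ignore" index) are excluded by the
    # mask above before the NLL loss is computed.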

    lossFun = cross_entropy2d

    optim = torch.optim.Adam(params=model.parameters(),
                             lr=1.0e-5,
                             weight_decay=0.0005)
    # Define the learning-rate schedule
    scheduler = lr_scheduler.ReduceLROnPlateau(
        optim, mode='min', patience=1, min_lr=10e-10,
        eps=10e-9)  # 'min': act when the monitored metric stops decreasing; patience is the number of scheduler steps to tolerate

    # utils.ModelLoad('/home/mlxuan/project/DeepLearning/FCN/fcn_mlx/output_segnet/bestModel/1.4000*3000_trainModel.tar',
    #                  model)
    now = datetime.datetime.now()
    logFile = utils.Log(
        osp.join(
            '/home/mlxuan/project/DeepLearning/FCN/fcn_mlx/output_segnet/visualization_viz/',
            now.strftime('%Y%m%d_%H%M%S.%f') + 'log.csv'), [
                'iteration', 'train/loss', 'train/mean_iu', 'valid/loss',
                'valid/mean_iu', 'lr'
            ])
    trainer = models.Trainer(cuda=True,
                             model=model,
                             optimizer=optim,
                             loss_fcn=lossFun,
                             train_loader=trainloader,
                             val_loader=valloader,
                             out='./output_segnet/',
                             max_iter=100000,
                             scheduler=scheduler,
                             interval_validate=800,
                             logFile=logFile)
    trainer.train()  # start training
Example #3
def train():
    # Setup device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Setup model
    model = models.UNet(n_channels=3, n_classes=5)
    # Initialize the model parameters with a pretrained VGG16 network
    vgg16 = models.VGG16(pretrained=True)
    model.copy_params_from_vgg16(vgg16)

    # Setup dataloaders: training and validation data
    """data.picFulPath('/home/mlxuan/project/DeepLearning/data/benchmark/benchmark_RELEASE/dataset/train.txt',
               '/home/mlxuan/project/DeepLearning/data/benchmark/benchmark_RELEASE/dataset/img/',
               '/home/mlxuan/project/DeepLearning/data/benchmark/benchmark_RELEASE/dataset/cls/')
    train_dataset = data.SBDClassSeg('/home/mlxuan/project/DeepLearning/FCN/fcn_mlx/data/ImagAndLal.txt')
    trainloader = DataLoader(train_dataset, batch_size=4, shuffle=False, drop_last=True)

    data.picFulPath('/home/mlxuan/project/DeepLearning/data/VOCtrainval_11-May-2012/VOCdevkit/VOC2012/ImageSets/Segmentation/val.txt',
                    '/home/mlxuan/project/DeepLearning/data/VOCtrainval_11-May-2012/VOCdevkit/VOC2012/JPEGImages/',
                    '/home/mlxuan/project/DeepLearning/data/VOCtrainval_11-May-2012/VOCdevkit/VOC2012/SegmentationClass/',
                    destPath='/home/mlxuan/project/DeepLearning/FCN/fcn_mlx/data/ValImagAndLal.txt',
                    ImgFix='.jpg',lblFix='.png')

    val_dataset = data.VOCClassSeg('/home/mlxuan/project/DeepLearning/FCN/fcn_mlx/data/ValImagAndLal.txt',train=False)
    valloader = DataLoader(val_dataset,batch_size=1,shuffle=False)"""

    train_dataset = data.RSDataClassSeg('/home/mlxuan/project/DeepLearning/FCN/fcn_mlx/Data/trainFullPath.txt')
    trainloader = DataLoader(train_dataset, batch_size=4, shuffle=False, drop_last=True)
    val_dataset = data.RSDataClassSeg('/home/mlxuan/project/DeepLearning/FCN/fcn_mlx/Data/validFullPath.txt',train=False)
    valloader = DataLoader(val_dataset, batch_size=1, shuffle=False)




    # Setup optimizer, lr_scheduler and loss function

    def cross_entropy2d(input, target, weight=None, size_average=True):
        # input: (n, c, h, w), target: (n, h, w)
        n, c, h, w = input.size()
        # log_p: (n, c, h, w)
        if LooseVersion(torch.__version__) < LooseVersion('0.3'):  # simple version check on torch.__version__
            # ==0.2.X
            log_p = F.log_softmax(input)
        else:
            # >=0.3
            log_p = F.log_softmax(input, dim=1)
        # log_p: (n*h*w, c). log_p is the log_softmax of the input, i.e. per-class
        # log-probabilities; tensor.transpose swaps the given tensor dimensions.
        log_p = log_p.transpose(1, 2).transpose(2, 3).contiguous()
        log_p = log_p[target.view(n, h, w, 1).repeat(1, 1, 1, c) >= 0]
        log_p = log_p.view(-1, c)
        # target: (n*h*w,)
        mask = target >= 0
        target = target[mask]
        loss = F.nll_loss(log_p, target, weight=weight)
        if size_average:
            loss /= mask.data.sum()
        return loss


    lossFun = cross_entropy2d

    def get_parameters(model, bias=False):
        import torch.nn as nn
        modules_skipped = (
            nn.ReLU,
            nn.MaxPool2d,
            nn.Dropout2d,
            nn.Sequential,
            models.FCN32s,
        )
        for m in model.modules():
            if isinstance(m, nn.Conv2d):
                if bias:
                    yield m.bias
                else:
                    yield m.weight
            elif isinstance(m, nn.ConvTranspose2d):
                # weight is frozen because it is just a bilinear upsampling
                if bias:
                    assert m.bias is None
            elif isinstance(m, modules_skipped):
                continue
            else:
                raise ValueError('Unexpected module: %s' % str(m))


    optim = torch.optim.SGD(
        [
            {'params': get_parameters(model, bias=False)},
            {'params': get_parameters(model, bias=True),
             'lr':  1.0e-5* 2, 'weight_decay': 0},
        ],
        lr=1.0e-5,
        momentum=0.99,
        weight_decay=0.0005)
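    # Note: bias parameters above get twice the base learning rate and no weight
    # decay, following the original FCN training recipe.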
    # Define the learning-rate schedule
    scheduler = lr_scheduler.ReduceLROnPlateau(
        optim, mode='min', patience=0, min_lr=10e-10,
        eps=10e-8)  # 'min': act when the monitored metric stops decreasing; patience is the number of scheduler steps to tolerate

    utils.ModelLoad('/home/mlxuan/project/DeepLearning/FCN/fcn_mlx/output/Model.path/20181227_220035.852449model_best.pth.tar',model,optim)

    trainer = models.Trainer(
        cuda=True,
        model=model,
        optimizer=optim,
        loss_fcn=lossFun,
        train_loader=trainloader,
        val_loader=valloader,
        out='./output/',
        max_iter=40000,
        scheduler=scheduler,
        interval_validate=2000
    )
    trainer.train()  # start training
Example #4
epochs = args.epochs
quiet = bool(args.quiet)
verbose = not quiet
latent_dim = 256
mixed_probability = 0.9
discriminator_filters = 8
generator_filters = 8
pl_beta = 0.99
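# mixed_probability and pl_beta presumably follow StyleGAN2 conventions: the per-batch
# probability of applying style-mixing regularization and the EMA decay for the
# path-length penalty target, respectively.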

if __name__ == '__main__':
    device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
    print("Using device: ", device)
    #model = StyleGan2Model()
    dataLoader = utils.getDataLoader(batch_size, image_size)
    # print(len(dataLoader) / 10)
    Trainer = models.Trainer(batch_size, image_size, latent_dim, epochs, discriminator_filters, generator_filters, device, mixed_probability, pl_beta)
    # print(Trainer.StyleGan)
    # print(Trainer.StyleGan.generator.state_dict())
    # print(sum(p.numel() for p in Trainer.StyleGan.parameters()))
    # print(Trainer.StyleGan.discriminator.state_dict())

    print(Trainer.StyleGan.generator.state_dict()['generatorBlocks.2.style_to_input_channels.weight'][0][0].item())
    # print(Trainer.StyleGan.discriminator.state_dict()[])
    print("Apex available: ", Trainer.apex_available)
    # Trainer.resetSaves()
    # x, y = next(enumerate(dataLoader))
    x, y = next(enumerate(dataLoader))
    # print(y[0])
    # utils.showImage(y[0][0].expand(3, -1, -1))
    # print(y[0].size())
    # for x in range(10):
Example #5
def train(args):
    '''\
  Training function.

  Args:
    args: namespace of arguments. Run 'artRecycle train --help' for info.
  '''

    # Model name and paths
    model_name = '{}|{}'.format(*args.datasets)
    model_path, log_path, logs_path = _prepare_directories(model_name,
                                                           resume=args.cont)

    model_json = os.path.join(model_path, 'keras.json')
    model_checkpoint = os.path.join(model_path, 'model')

    # Summary writers
    train_summary_writer = tf.summary.create_file_writer(
        os.path.join(log_path, 'train'))

    # Define datasets
    image_shape = (300, 300, 3)
    train_dataset, train_size = data.load_pair(*args.datasets,
                                               'all',
                                               shape=image_shape,
                                               batch=args.batch)

    train_dataset_it = iter(train_dataset)
    few_samples = [data.load_few(name, 'all', image_shape, 1) \
        for name in args.datasets]

    # Define keras model
    keras_model, model_layer = models.define_model(image_shape)

    # Save keras model
    keras_json = keras_model.to_json()
    keras_json = json.dumps(json.loads(keras_json), indent=2)
    with open(model_json, 'w') as f:
        f.write(keras_json)

    # Save TensorBoard graph
    @tf.function
    def tracing_model_ops(inputs):
        return model_layer(inputs)

    tf.summary.trace_on()
    tracing_model_ops(next(train_dataset_it))
    with train_summary_writer.as_default():
        tf.summary.trace_export('Model', step=0)

    # Resuming
    if args.cont:
        keras_model.load_weights(model_checkpoint)
        print('> Weights loaded')

    # Training steps
    step_saver = CountersSaver(log_dir=logs_path, log_every=args.logs)

    steps_per_epoch = int(train_size/args.batch) \
        if not args.epoch_steps else args.epoch_steps
    epochs = range(step_saver.epoch, args.epochs)

    # Training tools
    make_optimizer = lambda: tf.optimizers.Adam(args.rate)
    trainer = models.Trainer(keras_model, make_optimizer, train_dataset_it)
    tester = models.Tester(keras_model, train_dataset_it)
    saver = CheckpointSaver(keras_model, model_checkpoint)

    # Print job
    print('> Training.  Epochs:', epochs)

    # Training loop
    for epoch in epochs:
        print('> Epoch', step_saver.epoch)

        for epoch_step in range(steps_per_epoch):
            print('> Step', step_saver.step, end='\r')

            # Train step
            output = trainer.step()

            # Validation and log
            if step_saver.step % args.logs == 0 or epoch_step == steps_per_epoch - 1:
                print('\n> Validation')

                # Evaluation
                for i in range(args.val_steps):
                    tester.step()
                train_metrics = tester.result()

                # Log in console
                print('  Train metrics:', train_metrics)

                # Log in TensorBoard
                with train_summary_writer.as_default():
                    for metric in train_metrics:
                        tf.summary.scalar(metric,
                                          train_metrics[metric],
                                          step=step_saver.step)

                # Save weights
                loss = 0
                for m in train_metrics.values():
                    loss += m
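                # saver presumably keeps the checkpoint with the best (highest) score,
                # hence the negated total loss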
                saved = saver.save(score=-loss)
                if saved:
                    print('Weights saved')

                # Transform images for visualization
                if args.images:
                    fake_A, fake_B, *_ = keras_model(few_samples)
                    fake_A_viz = image_unnormalize(fake_A)
                    fake_B_viz = image_unnormalize(fake_B)

                    # Log images
                    with train_summary_writer.as_default():
                        tf.summary.image('fake_A',
                                         fake_A_viz,
                                         step=step_saver.step)
                        tf.summary.image('fake_B',
                                         fake_B_viz,
                                         step=step_saver.step)

            # End step
            step_saver.new_step()

        # End epoch
        step_saver.new_epoch()
                    default=dt.now().strftime("%Y-%m-%d_%H-%M"),
                    type=str,
                    help='Path to output directory')
    ap.add_argument(
        '-l',
        '--load',
        type=str,
        help='Path to directory from which best_model.ckpt should be loaded')
    args = vars(ap.parse_args())

    loss_list, mape_list = [], []

    # Initialize model
    for i in range(args['num_runs']):
        print(f"\n[INFO] NOW STARTING RUN {i+1}\n")
        trainer = models.Trainer(args)
        best_loss, best_mape = trainer.train(run_id=i)
        loss_list.append(best_loss)
        mape_list.append(best_mape)

    with open(os.path.join('outputs', args['task'], 'mse_run_stats.txt'),
              'w') as f:
        f.write(" ".join([str(l) for l in loss_list]))
        f.write(f"\nMean: {np.mean(loss_list)}")
        f.write(f"\nStdev: {np.std(loss_list)}")

    with open(os.path.join('outputs', args['task'], 'mape_run_stats.txt'),
              'w') as f:
        f.write(" ".join([str(l) for l in mape_list]))
        f.write(f"\nMean: {np.mean(mape_list)}")
        f.write(f"\nStdev: {np.std(mape_list)}")