Example #1
def model_fn(features, labels, mode, params):
    """Model function used in the estimator.

    Args:
        features (Tensor):  Input features to the model.
        labels (Tensor):    Labels tensor for training and evaluation.
        mode (ModeKeys):    Specifies if training, evaluation or prediction.
        params (dict):      Dictionary of Hyper-parameters.

    Returns:
        (EstimatorSpec):    Model to be run by Estimator.
    """

    # Define model's architecture
    if params['model'] == 'DNN':
        y_prob, logit, l1_reg, l2_reg = DNN.model(features, params, mode)
    else:
        tf.compat.v1.logging.fatal(
            'Params model setting wrong with {0}'.format(params['model']))
        sys.exit(1)

    export_outputs = None
    y_prediction = tf.reshape(y_prob, [-1])
    predictions_dict = {'predicted': y_prediction}

    if mode in (TRAIN, EVAL):

        labels_flatten = tf.reshape(tf.cast(labels, dtype=tf.int32), [-1])

        auc = tf.compat.v1.metrics.auc(labels_flatten, y_prediction)
        apk = tf.compat.v1.metrics.average_precision_at_k(
            tf.cast(labels_flatten, dtype=tf.int64),
            y_prediction,
            k=params['batch_size'])
        precision = tf.compat.v1.metrics.precision(labels_flatten,
                                                   y_prediction)
        recall = tf.compat.v1.metrics.recall(labels_flatten, y_prediction)

        metrics = {
            'eval_auc': auc,
            'eval_avgpk': apk,
            'eval_precision': precision,
            'eval_recall': recall
        }
        tf.compat.v1.summary.scalar('train_auc', auc[1])
        tf.compat.v1.summary.scalar('train_avgpk', apk[1])
        tf.compat.v1.summary.scalar('train_precision', precision[1])
        tf.compat.v1.summary.scalar('train_recall', recall[1])

        loss = Loss.loss_func(labels, y_prob, logit, l1_reg, l2_reg, params)

        if mode == TRAIN:
            update_ops = tf.compat.v1.get_collection(
                tf.compat.v1.GraphKeys.UPDATE_OPS)
            with tf.control_dependencies(update_ops):
                global_step = tf.compat.v1.train.get_global_step()
                learning_rate = tf.compat.v1.train.exponential_decay(
                    learning_rate=params['learning_rate'],
                    global_step=global_step,
                    decay_steps=params['decay_step'],
                    decay_rate=params['decay_rate'])

                optimizer = tf.compat.v1.train.AdamOptimizer(
                    learning_rate=learning_rate, epsilon=0.1)
                trainable_variables = tf.compat.v1.trainable_variables()
                gradients = tf.gradients(loss, trainable_variables)
                clipped_gradients, _ = tf.clip_by_global_norm(gradients, 5.0)
                # apply_gradients already increments global_step once per call,
                # so no separate assign op is needed afterwards.
                train_op = optimizer.apply_gradients(
                    zip(clipped_gradients, trainable_variables),
                    global_step=global_step)

            return tf.estimator.EstimatorSpec(mode=mode,
                                              predictions=predictions_dict,
                                              loss=loss,
                                              train_op=train_op,
                                              export_outputs=export_outputs)

        else:
            return tf.estimator.EstimatorSpec(mode=mode,
                                              predictions=predictions_dict,
                                              loss=loss,
                                              eval_metric_ops=metrics,
                                              export_outputs=export_outputs)

    elif mode == PREDICT:
        return tf.estimator.EstimatorSpec(mode=mode,
                                          predictions=predictions_dict)

    else:
        tf.compat.v1.logging.fatal(
            'Training MODE setting is wrong: {0}'.format(mode))
        sys.exit(1)
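
For context, a minimal sketch of how a model_fn like the one above is typically wired into the Estimator API; the model_dir, hyper-parameter values and input_fn names are illustrative assumptions, not values from the original project.

import tensorflow as tf

params = {
    'model': 'DNN',            # selects the DNN branch inside model_fn
    'batch_size': 512,
    'learning_rate': 1e-3,
    'decay_step': 10000,
    'decay_rate': 0.96,
}

estimator = tf.estimator.Estimator(model_fn=model_fn,
                                   model_dir='./model_dir',
                                   params=params)

# estimator.train(input_fn=train_input_fn, max_steps=100000)
# estimator.evaluate(input_fn=eval_input_fn)
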
Example #2
def train(train_loop_func, logger, args):
    if args.amp:
        amp_handle = amp.init(enabled=args.fp16)
    # Check that GPUs are actually available
    use_cuda = not args.no_cuda

    # Setup multi-GPU if necessary
    args.distributed = False
    if 'WORLD_SIZE' in os.environ:
        args.distributed = int(os.environ['WORLD_SIZE']) > 1

    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend='nccl',
                                             init_method='env://')
        args.N_gpu = torch.distributed.get_world_size()
    else:
        args.N_gpu = 1

    if args.seed is None:
        args.seed = np.random.randint(1e4)

    if args.distributed:
        args.seed = (args.seed + torch.distributed.get_rank()) % 2**32
    print("Using seed = {}".format(args.seed))
    torch.manual_seed(args.seed)
    np.random.seed(seed=args.seed)

    # Setup data, defaults
    dboxes = dboxes300_coco()
    encoder = Encoder(dboxes)
    cocoGt = get_coco_ground_truth(args)

    train_loader = get_train_loader(args, args.seed - 2**31)

    val_dataset = get_val_dataset(args)
    val_dataloader = get_val_dataloader(val_dataset, args)

    ssd300 = SSD300(backbone=args.backbone)
    args.learning_rate = args.learning_rate * args.N_gpu * (args.batch_size /
                                                            32)
    start_epoch = 0
    iteration = 0
    loss_func = Loss(dboxes)

    if use_cuda:
        ssd300.cuda()
        loss_func.cuda()

    if args.fp16 and not args.amp:
        ssd300 = network_to_half(ssd300)

    if args.distributed:
        ssd300 = DDP(ssd300)

    optimizer = torch.optim.SGD(tencent_trick(ssd300),
                                lr=args.learning_rate,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)
    scheduler = MultiStepLR(optimizer=optimizer,
                            milestones=args.multistep,
                            gamma=0.1)
    if args.fp16:
        if args.amp:
            optimizer = amp_handle.wrap_optimizer(optimizer)
        else:
            optimizer = FP16_Optimizer(optimizer, static_loss_scale=128.)
    if args.checkpoint is not None:
        if os.path.isfile(args.checkpoint):
            load_checkpoint(ssd300, args.checkpoint)
            checkpoint = torch.load(args.checkpoint,
                                    map_location=lambda storage, loc: storage.
                                    cuda(torch.cuda.current_device()))
            start_epoch = checkpoint['epoch']
            iteration = checkpoint['iteration']
            scheduler.load_state_dict(checkpoint['scheduler'])
            ssd300.load_state_dict(checkpoint['model'])
            optimizer.load_state_dict(checkpoint['optimizer'])
        else:
            print('Provided checkpoint is not a path to a file')
            return

    inv_map = {v: k for k, v in val_dataset.label_map.items()}

    total_time = 0

    if args.mode == 'evaluation':
        acc = evaluate(ssd300, val_dataloader, cocoGt, encoder, inv_map, args)
        if args.local_rank == 0:
            print('Model precision {} mAP'.format(acc))

        return
    mean, std = generate_mean_std(args)

    for epoch in range(start_epoch, args.epochs):
        start_epoch_time = time.time()
        scheduler.step()
        iteration = train_loop_func(ssd300, loss_func, epoch, optimizer,
                                    train_loader, val_dataloader, encoder,
                                    iteration, logger, args, mean, std)
        end_epoch_time = time.time() - start_epoch_time
        total_time += end_epoch_time

        if args.local_rank == 0:
            logger.update_epoch_time(epoch, end_epoch_time)

        if epoch in args.evaluation:
            acc = evaluate(ssd300, val_dataloader, cocoGt, encoder, inv_map,
                           args)

            if args.local_rank == 0:
                logger.update_epoch(epoch, acc)

        if args.save and args.local_rank == 0:
            print("saving model...")
            obj = {
                'epoch': epoch + 1,
                'iteration': iteration,
                'optimizer': optimizer.state_dict(),
                'scheduler': scheduler.state_dict(),
                'label_map': val_dataset.label_info
            }
            if args.distributed:
                obj['model'] = ssd300.module.state_dict()
            else:
                obj['model'] = ssd300.state_dict()
            torch.save(obj, './models/epoch_{}.pt'.format(epoch))
        train_loader.reset()
    print('total training time: {}'.format(total_time))
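
All of these train() functions read their configuration from an argparse-style args namespace and pick up WORLD_SIZE from the launcher (torch.distributed.launch or torchrun), which is why they check os.environ instead of a flag. A minimal sketch of the flags they rely on; the defaults are illustrative assumptions, not the projects' real values.

import argparse

parser = argparse.ArgumentParser(description='SSD300 training')
parser.add_argument('--no-cuda', action='store_true')        # -> args.no_cuda
parser.add_argument('--mode', default='training',
                    choices=['training', 'evaluation'])
parser.add_argument('--epochs', type=int, default=65)
parser.add_argument('--batch-size', type=int, default=32)
parser.add_argument('--learning-rate', type=float, default=2.6e-3)
parser.add_argument('--momentum', type=float, default=0.9)
parser.add_argument('--weight-decay', type=float, default=5e-4)
parser.add_argument('--multistep', nargs='*', type=int, default=[43, 54])
parser.add_argument('--evaluation', nargs='*', type=int, default=[])
parser.add_argument('--seed', type=int, default=None)
parser.add_argument('--checkpoint', default=None)
parser.add_argument('--save', default=None)
parser.add_argument('--backbone', default='resnet50')
parser.add_argument('--local_rank', type=int, default=0)
parser.add_argument('--amp', action='store_true')
parser.add_argument('--fp16', action='store_true')
args = parser.parse_args()
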
Example #3
def train(train_loop_func, logger, args):
    # Check that GPUs are actually available
    use_cuda = not args.no_cuda
    train_samples = 118287

    # Setup multi-GPU if necessary
    args.distributed = False
    if 'WORLD_SIZE' in os.environ:
        args.distributed = int(os.environ['WORLD_SIZE']) > 1

    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend='smddp', init_method='env://')
        args.N_gpu = torch.distributed.get_world_size()
    else:
        args.N_gpu = 1

    if args.seed is None:
        args.seed = np.random.randint(1e4)

    if args.distributed:
        args.seed = (args.seed + torch.distributed.get_rank()) % 2**32
    print("Using seed = {}".format(args.seed))
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)
    np.random.seed(seed=args.seed)


    # Setup data, defaults
    dboxes = dboxes300_coco()
    encoder = Encoder(dboxes)
    cocoGt = get_coco_ground_truth(args)

    train_loader = get_train_loader(args, args.seed - 2**31)

    val_dataset = get_val_dataset(args)
    val_dataloader = get_val_dataloader(val_dataset, args)

    ssd300 = SSD300(backbone=ResNet(args.backbone, args.backbone_path))
    args.learning_rate = args.learning_rate * args.N_gpu * (args.batch_size / 32)
    start_epoch = 0
    iteration = 0
    loss_func = Loss(dboxes)

    if use_cuda:
        ssd300.cuda()
        loss_func.cuda()

    optimizer = torch.optim.SGD(tencent_trick(ssd300), lr=args.learning_rate,
                                    momentum=args.momentum, weight_decay=args.weight_decay)
    scheduler = MultiStepLR(optimizer=optimizer, milestones=args.multistep, gamma=0.1)
    if args.amp:
        ssd300, optimizer = amp.initialize(ssd300, optimizer, opt_level='O2')

    if args.distributed:
        ssd300 = DDP(ssd300)

    if args.checkpoint is not None:
        if os.path.isfile(args.checkpoint):
            load_checkpoint(ssd300.module if args.distributed else ssd300, args.checkpoint)
            checkpoint = torch.load(args.checkpoint,
                                    map_location=lambda storage, loc: storage.cuda(torch.cuda.current_device()))
            start_epoch = checkpoint['epoch']
            iteration = checkpoint['iteration']
            scheduler.load_state_dict(checkpoint['scheduler'])
            optimizer.load_state_dict(checkpoint['optimizer'])
        else:
            print('Provided checkpoint is not a path to a file')
            return

    inv_map = {v: k for k, v in val_dataset.label_map.items()}

    total_time = 0

    if args.mode == 'evaluation':
        acc = evaluate(ssd300, val_dataloader, cocoGt, encoder, inv_map, args)
        if args.local_rank == 0:
            print('Model precision {} mAP'.format(acc))

        return
    mean, std = generate_mean_std(args)

    for epoch in range(start_epoch, args.epochs):
        start_epoch_time = time.time()
        scheduler.step()
        iteration = train_loop_func(ssd300, loss_func, epoch, optimizer, train_loader, val_dataloader, encoder, iteration,
                                    logger, args, mean, std)
        end_epoch_time = time.time() - start_epoch_time
        total_time += end_epoch_time

        if not args.distributed or torch.distributed.get_rank() == 0:
            throughput = train_samples / end_epoch_time
            logger.update_epoch_time(epoch, end_epoch_time)
            logger.update_throughput_speed(epoch, throughput)

        if epoch in args.evaluation:
            acc = evaluate(ssd300, val_dataloader, cocoGt, encoder, inv_map, args)

        if args.save and args.local_rank == 0:
            print("saving model...")
            obj = {'epoch': epoch + 1,
                   'iteration': iteration,
                   'optimizer': optimizer.state_dict(),
                   'scheduler': scheduler.state_dict(),
                   'label_map': val_dataset.label_info}
            if args.distributed:
                obj['model'] = ssd300.module.state_dict()
            else:
                obj['model'] = ssd300.state_dict()
            save_path = os.path.join(args.save, f'epoch_{epoch}.pt')
            torch.save(obj, save_path)
            logger.log('model path', save_path)
        train_loader.reset()

    if not args.distributed or torch.distributed.get_rank() == 0:
        DLLogger.log((), { 'Total training time': '%.2f' % total_time + ' secs' })
        logger.log_summary()
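
Example #3 differs from the others mainly in its process-group backend: 'smddp' comes from SageMaker's distributed data parallel library rather than NCCL, and the backend is only available after that library has been imported. A minimal sketch of the registration step (import path as documented by AWS for recent library versions; treat it as an assumption):

# Importing this module registers the 'smddp' backend with torch.distributed.
import smdistributed.dataparallel.torch.torch_smddp  # noqa: F401
import torch.distributed as dist

dist.init_process_group(backend='smddp')
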
Example #4
File: main.py  Project: alpop/DALI
def train(args):
    if args.amp:
        amp_handle = amp.init(enabled=args.fp16)

    args.distributed = False
    if 'WORLD_SIZE' in os.environ:
        args.distributed = int(os.environ['WORLD_SIZE']) > 1

    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend='nccl', init_method='env://')
        args.N_gpu = torch.distributed.get_world_size()
    else:
        args.N_gpu = 1

    dboxes = dboxes300_coco()
    encoder = Encoder(dboxes)
    cocoGt = get_coco_ground_truth(args)

    ssd300 = model(args)
    args.learning_rate = args.learning_rate * args.N_gpu * (args.batch_size / 32)
    iteration = 0
    loss_func = Loss(dboxes)

    loss_func.cuda()

    optimizer = torch.optim.SGD(
        tencent_trick(ssd300), 
        lr=args.learning_rate,
        momentum=args.momentum,
        weight_decay=args.weight_decay)

    scheduler = MultiStepLR(
        optimizer=optimizer, 
        milestones=args.multistep, 
        gamma=0.1)

    if args.fp16:
        if args.amp:
            optimizer = amp_handle.wrap_optimizer(optimizer)
        else:
            optimizer = FP16_Optimizer(optimizer, static_loss_scale=128.)

    val_dataloader, inv_map = get_val_dataloader(args)
    train_loader = get_train_loader(args, dboxes)

    acc = 0
    logger = Logger(args.batch_size, args.local_rank)
    
    for epoch in range(0, args.epochs):
        logger.start_epoch()
        scheduler.step()

        iteration = train_loop(
            ssd300, loss_func, epoch, optimizer, 
            train_loader, iteration, logger, args)

        logger.end_epoch()

        if epoch in args.evaluation:
            acc = evaluate(ssd300, val_dataloader, cocoGt, encoder, inv_map, args)
            if args.local_rank == 0:
                print('Epoch {:2d}, Accuracy: {:.4f} mAP'.format(epoch, acc))

        if args.data_pipeline == 'dali':
            train_loader.reset()

    return acc, logger.average_speed()
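
tencent_trick(), used in most of these examples to build the SGD/AdamW parameter groups, is not shown in the snippets. A typical implementation of the trick it refers to, disabling weight decay for biases and 1-D (normalization) parameters, looks roughly like this; treat it as an assumed sketch rather than the projects' exact code.

def tencent_trick(model):
    """Return two parameter groups: biases and 1-D (norm) parameters with
    weight_decay=0, and everything else with the optimizer's default."""
    decay, no_decay = [], []
    for name, param in model.named_parameters():
        if not param.requires_grad:
            continue
        if len(param.shape) == 1 or name.endswith('.bias'):
            no_decay.append(param)
        else:
            decay.append(param)
    return [{'params': no_decay, 'weight_decay': 0.0},
            {'params': decay}]
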
Example #5
def train(train_loop_func, logger, args):
    # Check that GPUs are actually available
    use_cuda = not args.no_cuda

    # Setup multi-GPU if necessary
    args.distributed = False
    if 'WORLD_SIZE' in os.environ:
        args.distributed = int(os.environ['WORLD_SIZE']) > 1

    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend='nccl',
                                             init_method='env://')
        args.N_gpu = torch.distributed.get_world_size()
    else:
        args.N_gpu = 1

    if args.seed is None:
        args.seed = np.random.randint(1e4)

    if args.distributed:
        args.seed = (args.seed + torch.distributed.get_rank()) % 2**32
    print("Using seed = {}".format(args.seed))
    torch.manual_seed(args.seed)
    np.random.seed(seed=args.seed)

    # Setup data, defaults
    dboxes = dboxes300_coco()
    encoder = Encoder(dboxes)
    cocoGt = get_coco_ground_truth(args)

    train_loader = get_train_loader(args, args.seed - 2**31)

    val_dataset = get_val_dataset(args)
    val_dataloader = get_val_dataloader(val_dataset, args)

    ssd300 = SSD300(backbone=ResNet(args.backbone, args.backbone_path))
    # args.learning_rate = args.learning_rate * args.N_gpu * (args.batch_size / 32)

    print(f"Actual starting LR: {args.learning_rate}")

    start_epoch = 0
    iteration = 0
    loss_func = Loss(dboxes)

    if use_cuda:
        ssd300.cuda()
        loss_func.cuda()

    # optimizer = torch.optim.SGD(tencent_trick(ssd300), lr=args.learning_rate,
    #                                 momentum=args.momentum, weight_decay=args.weight_decay, nesterov=True)
    optimizer = torch.optim.AdamW(tencent_trick(ssd300),
                                  lr=args.learning_rate,
                                  betas=(0.8, 0.999),
                                  eps=1e-08,
                                  weight_decay=0.01,
                                  amsgrad=True)

    # scheduler = MultiStepLR(optimizer=optimizer, milestones=args.multistep, gamma=0.1)
    # scheduler = CosineAnnealingWarmRestarts(optimizer=optimizer, T_0=20, T_mult=1, eta_min=1e-6)
    scheduler = CosineAnnealingLR(optimizer=optimizer,
                                  T_max=args.epochs,
                                  eta_min=1e-6)

    # scheduler = OneCycleLR(optimizer, max_lr=0.003, epochs=41, steps_per_epoch=173)
    # scheduler = CyclicLR(optimizer, base_lr=args.learning_rate, max_lr=2*args.learning_rate,
    #                      step_size_up=173*3, step_size_down=173*10)

    if args.amp:
        ssd300, optimizer = amp.initialize(ssd300, optimizer, opt_level='O2')

    if args.distributed:
        ssd300 = DDP(ssd300)

    if args.checkpoint is not None:
        if os.path.isfile(args.checkpoint):
            load_checkpoint(ssd300.module if args.distributed else ssd300,
                            args.checkpoint)
            checkpoint = torch.load(args.checkpoint,
                                    map_location=lambda storage, loc: storage.
                                    cuda(torch.cuda.current_device()))
            start_epoch = checkpoint['epoch']
            iteration = checkpoint['iteration']
            scheduler.load_state_dict(checkpoint['scheduler'])
            optimizer.load_state_dict(checkpoint['optimizer'])
        else:
            print('Provided checkpoint is not a path to a file')
            return

    inv_map = {v: k for k, v in val_dataset.label_map.items()}

    total_time = 0

    if args.mode == 'evaluation':
        acc = evaluate(ssd300, val_dataloader, cocoGt, encoder, inv_map, args)
        if args.local_rank == 0:
            print('Model precision {} mAP'.format(acc))

        return
    mean, std = generate_mean_std(args)

    for epoch in range(start_epoch, args.epochs):
        start_epoch_time = time.time()
        # scheduler.step()
        iteration = train_loop_func(ssd300, loss_func, epoch, optimizer,
                                    scheduler, train_loader, val_dataloader,
                                    encoder, iteration, logger, args, mean,
                                    std)
        end_epoch_time = time.time() - start_epoch_time
        total_time += end_epoch_time

        # https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
        scheduler.step()

        if args.local_rank == 0:
            logger.update_epoch_time(epoch, end_epoch_time)

        if epoch in args.evaluation:
            acc = evaluate(ssd300, val_dataloader, cocoGt, encoder, inv_map,
                           args)

            if args.local_rank == 0:
                logger.update_epoch(epoch, acc)

        if args.save and args.local_rank == 0:
            print("saving model...")
            obj = {
                'epoch': epoch + 1,
                'iteration': iteration,
                'optimizer': optimizer.state_dict(),
                'scheduler': scheduler.state_dict(),
                'label_map': val_dataset.label_info
            }
            if args.distributed:
                obj['model'] = ssd300.module.state_dict()
            else:
                obj['model'] = ssd300.state_dict()
            torch.save(obj, './models/epoch_{}.pt'.format(epoch))
        train_loader.reset()
    print('total training time: {}'.format(total_time))
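
Note that Example #5 calls scheduler.step() after the epoch's training loop instead of before it, matching the ordering PyTorch has recommended since 1.1.0 (optimizer steps first, then the scheduler, see the linked docs). A self-contained sketch of that pattern with the same CosineAnnealingLR setup; the model, data and loss below are dummies for illustration only.

import torch
from torch.optim.lr_scheduler import CosineAnnealingLR

model = torch.nn.Linear(4, 1)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
scheduler = CosineAnnealingLR(optimizer, T_max=10, eta_min=1e-6)

for epoch in range(10):
    for _ in range(5):                                  # stand-in for the train loader
        optimizer.zero_grad()
        loss = model(torch.randn(8, 4)).pow(2).mean()   # dummy loss
        loss.backward()
        optimizer.step()
    scheduler.step()   # once per epoch, after the optimizer updates
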
Example #6
File: train.py  Project: turgunyusuf/DALI
def train(args):
    args.distributed = False
    if 'WORLD_SIZE' in os.environ:
        args.distributed = int(os.environ['WORLD_SIZE']) > 1

    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend='nccl',
                                             init_method='env://')
        args.N_gpu = torch.distributed.get_world_size()
    else:
        args.N_gpu = 1

    dboxes = dboxes300_coco()
    encoder = Encoder(dboxes)
    cocoGt = get_coco_ground_truth(args)

    val_dataset = get_val_dataset(args)
    val_dataloader = get_val_dataloader(val_dataset, args)

    ssd300 = SSD300(len(cocoGt.cats) + 1)
    args.learning_rate = args.learning_rate * \
        args.N_gpu * (args.batch_size / 32)
    iteration = 0
    loss_func = Loss(dboxes)

    ssd300.cuda()
    loss_func.cuda()

    if args.fp16:
        ssd300 = network_to_half(ssd300)

    if args.distributed:
        ssd300 = DDP(ssd300)

    optimizer = torch.optim.SGD(tencent_trick(ssd300),
                                lr=args.learning_rate,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    scheduler = MultiStepLR(optimizer=optimizer,
                            milestones=args.multistep,
                            gamma=0.1)

    if args.fp16:
        optimizer = FP16_Optimizer(optimizer, static_loss_scale=128.)

    inv_map = {v: k for k, v in val_dataset.label_map.items()}

    avg_loss = 0.0
    acc = 0
    batch_perf = AverageMeter()
    end = time.time()
    train_start = end

    args.train_annotate = os.path.join(args.data,
                                       "annotations/instances_train2017.json")
    args.train_coco_root = os.path.join(args.data, "train2017")
    local_seed = set_seeds(args)

    if args.data_pipeline == 'no_dali':
        train_trans = SSDTransformer(dboxes, args, (300, 300), val=False)
        train_dataset = get_train_dataset(args, train_trans)
        train_loader = get_train_loader(train_dataset, args, args.num_workers)
    elif args.data_pipeline == 'dali':
        train_loader = get_train_dali_loader(args, dboxes, local_seed)

    for epoch in range(args.epochs):
        start_epoch_time = time.time()
        scheduler.step()

        epoch_loop(train_loader, args, ssd300, time.time(), loss_func,
                   optimizer, iteration, avg_loss, batch_perf, epoch)
        torch.cuda.synchronize()

        if epoch in args.evaluation:
            acc = evaluate(ssd300, val_dataloader, cocoGt, encoder, inv_map,
                           args)

        try:
            train_loader.reset()
        except AttributeError:
            pass

    if args.local_rank == 0:
        print(
            "Training end: Average speed: {:3f} img/sec, Total time: {:3f} sec, Final accuracy: {:3f} mAP"
            .format(args.N_gpu * args.batch_size / batch_perf.avg,
                    time.time() - train_start, acc))
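
AverageMeter, used above to track per-batch timing (batch_perf.avg feeds the final throughput print), is not defined in the snippet. A typical implementation, assumed here to match how it is used:

class AverageMeter:
    """Keeps a running sum and count and exposes their mean via .avg."""

    def __init__(self):
        self.sum = 0.0
        self.count = 0
        self.avg = 0.0

    def update(self, val, n=1):
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count
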
Example #7
    all_time = 0
    for epoch in range(opt.start_epochs, opt.epochs):
        start = time.time()

        model.train()
        for batch_i, datas in enumerate(train_loader):
            # for batch_i, (imgNames, imgs, targets) in enumerate(train_loader):

            for data in datas:
                imgs = data[0][0].cuda()
                targets = data[1][0].cuda()
                label_id = data[2][0].cuda()
                targets = torch.cat([label_id, targets], dim=1)

                _, outputs = model(imgs)
                loss = Loss(outputs, targets)

                optimizer.zero_grad()

                if opt.fp16:
                    if opt.amp:
                        with amp.scale_loss(loss, optimizer) as scale_loss:
                            scale_loss.backward()
                    else:
                        # optimizer.backward(loss)
                        loss.backward()
                else:
                    loss.backward()

                optimizer.step()
                progress_bar(