Example #1
# standard-library and PyTorch imports assumed by this example; the project-specific
# helpers (load_data, get_model, ParameterServer, Statistics, train, validate,
# save_checkpoint) come from the surrounding repository and are not shown here
import os
import time
import logging

import torch
import torch.nn as nn
import torch.backends.cudnn as cudnn
from progress.bar import IncrementalBar


def main(args):
    time_stamp = '{:.0f}'.format(time.time() % 100000)
    if torch.cuda.is_available():
        logging.info('Utilizing GPU', extra=args.client)
        print('Utilizing GPU')

    train_loader, val_loader = load_data(args)
    model = get_model(args.model, args)

    # accumulate gradients over 256-sample micro-batches when the requested batch
    # size exceeds 256 (only for ResNet on ImageNet)
    if args.batch_size > 256 and args.dataset == 'imagenet' and args.model == 'resnet':
        batch_accumulate_num = args.batch_size // 256
    else:
        batch_accumulate_num = 1
    # compute iterations per epoch and validation length
    if args.dataset == 'imagenet':
        if batch_accumulate_num > 1:
            args.iterations_per_epoch = len(train_loader.dataset.imgs) // 256
        else:
            args.iterations_per_epoch = len(
                train_loader.dataset.imgs) // args.batch_size
        val_len = len(val_loader.dataset.imgs) // 1024
    else:
        args.iterations_per_epoch = len(
            train_loader.dataset.train_labels) // args.batch_size
        val_len = len(val_loader.dataset.test_labels) // 1024

    # get the number of model parameters
    log_str = 'Number of model parameters: {}'.format(
        sum([p.data.nelement() for p in model.parameters()]))
    logging.info(log_str, extra=args.client)
    print(log_str)
    # for training on multiple GPUs.
    model = torch.nn.DataParallel(model)
    model = model.cuda()
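    # the ParameterServer wraps the chosen optimizer and tracks per-worker weight
    # statistics (see the get_workers_* calls in the epoch loop below)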
    server = ParameterServer.get_server(args.optimizer, model, args)
    val_statistics = Statistics.get_statistics('image_classification', args)
    train_statistics = Statistics.get_statistics('image_classification', args)
    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume + '/checkpoint.pth.tar'):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume + '/checkpoint.pth.tar')
            args.start_epoch = checkpoint['epoch']
            server = checkpoint['server']
            val_statistics = checkpoint['val_stats']
            train_statistics = checkpoint['train_stats']
            model.load_state_dict(checkpoint['state_dict'])
            print('=> loaded checkpoint {} (epoch {})'.format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    # # Synchronous to Asynchronous Adjustments
    # print('Resetting Parameter Server to Asynchronous Mode')
    # logging.info('Resetting Parameter Server to Asynchronous Mode', extra=args.client)
    # server._shards_weights = list()
    # weights = server._get_model_weights()
    # for i in range(0, args.workers_num):
    #     server._shards_weights.append(deepcopy(weights))
    # server._workers_num = args.workers_num
    # # learning rate initialization
    # batch_baseline = args.baseline
    # server._lr = args.lr * np.sqrt((args.workers_num * args.batch_size) // batch_baseline) / (args.workers_num)
    # server._fast_im = args.fast_im
    # server._lr_warm_up = args.lr_warm_up
    # server._current_lr = args.lr
    # server._m_off = args.m_off
    # server._current_momentum = args.momentum
    # server._iterations_per_epoch = args.iterations_per_epoch
    # server._momentum = args.momentum
    # server._client = args.client
    # if args.fast_im is True:
    #     end_lr = args.lr * ((args.workers_num * args.batch_size) // batch_baseline) / np.sqrt(args.workers_num)
    #     start_lr = args.lr / (args.workers_num)
    #     server._lr = end_lr
    #     server._start_lr = start_lr
    #     server._lr_increment_const = (end_lr - start_lr) / (args.iterations_per_epoch * 5)
    #     log_str = 'Fast ImageNet Mode - Warm Up [{:.5f}]->[{:.5f}] In 5 Epochs'.format(start_lr, end_lr)
    #     logging.info(log_str, extra=args.client)
    #     print(log_str)
    # else:
    #     server._start_lr = 0
    #     server._lr_increment_const = 0
    # for param_group in server._optimizer.param_groups:
    #     param_group['lr'] = start_lr
    #     param_group['momentum'] = server._momentum
    # # Synchronous to Asynchronous Adjustments - End

    # let cuDNN auto-tune convolution algorithms (input sizes are fixed)
    cudnn.benchmark = True
    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda()
    if args.bar:
        train_bar = IncrementalBar('Training  ',
                                   max=args.iterations_per_epoch,
                                   suffix='%(percent)d%%')
        val_bar = IncrementalBar('Evaluating',
                                 max=val_len,
                                 suffix='%(percent)d%%')
    else:
        train_bar = None
        val_bar = None
    log_str = '{}: Training neural network for {} epochs with {} workers'.format(
        args.id, args.epochs, args.workers_num)
    logging.info(log_str, extra=args.client)
    print(log_str)
    train_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        # train for one epoch
        train_loss, train_error = train(train_loader, model, criterion, server,
                                        epoch, args.workers_num,
                                        args.grad_clip, batch_accumulate_num,
                                        train_bar, train_statistics,
                                        args.client)

        train_time = time.time() - train_time
        if args.bar:
            train_bar.finish()
            train_bar.index = 0

        # evaluate on validation set
        val_time = time.time()
        with torch.no_grad():
            val_loss, val_error = validate(val_loader, model, criterion,
                                           server, val_statistics, val_bar)
        train_statistics.save_loss(train_loss)
        train_statistics.save_error(train_error)
        train_statistics.save_weight_mean_dist(
            server.get_workers_mean_statistics())
        train_statistics.save_weight_master_dist(
            server.get_workers_master_statistics())
        train_statistics.save_mean_master_dist(server.get_mean_master_dist())
        train_statistics.save_weight_norm(server.get_server_weights())
        train_statistics.save_gradient_norm(server.get_server_gradients())
        val_time = time.time() - val_time
        if args.bar:
            val_bar.finish()
            val_bar.index = 0

        log_str = 'Epoch [{0:1d}]: Train: Time [{1:.2f}], Loss [{2:.3f}], Error[{3:.3f}] | ' \
                  'Test: Time [{4:.2f}], Loss [{5:.3f}], Error[{6:.3f}]'.format(epoch + 1, train_time, train_loss,
                                                                                train_error, val_time, val_loss,
                                                                                val_error)
        logging.info(log_str, extra=args.client)
        print(log_str)
        if epoch % args.save == 0 and epoch > 0:
            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'state_dict': model.state_dict(),
                    'val_stats': val_statistics,
                    'train_stats': train_statistics,
                    'server': server
                },
                sim_name=(args.name + time_stamp + '_' + str(epoch)))
        train_time = time.time()

    return train_statistics, val_statistics
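
The train() helper referenced above is not part of this listing, so the effect of batch_accumulate_num has to be inferred: the requested batch is split into 256-sample micro-batches whose gradients are summed before a single optimizer step. A minimal sketch of that accumulation pattern, assuming a standard PyTorch optimizer and a loader that already yields 256-sample micro-batches (the function name below is hypothetical, not the repository's API):

def accumulate_step(model, criterion, optimizer, micro_batches):
    """Sum gradients over several micro-batches, then take one optimizer step."""
    optimizer.zero_grad()
    total_loss = 0.0
    for inputs, targets in micro_batches:          # e.g. batch_accumulate_num chunks of 256
        loss = criterion(model(inputs), targets) / len(micro_batches)
        loss.backward()                            # gradients accumulate in .grad
        total_loss += loss.item()
    optimizer.step()
    return total_loss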
Example #2
# assumes the same imports as Example #1, plus the project's CrossEntropyLoss
# variant that accepts a smooth_eps label-smoothing argument
def main(args):
    time_stamp = '{:.0f}'.format(time.time() % 100000)
    if torch.cuda.is_available():
        logging.info('Utilizing GPU', extra=args.client)
        print('Utilizing GPU')

    train_loader, val_loader = load_data(args)
    model = get_model(args.model, args)

    # accumulate gradients over 256-sample micro-batches when the requested batch size exceeds 256
    if args.batch_size > 256:  # and args.dataset == 'imagenet' and args.model == 'resnet':
        batch_accumulate_num = args.batch_size // 256
    else:
        batch_accumulate_num = 1
    # compute iterations per epoch and validation length
    if args.dataset == 'imagenet':
        if batch_accumulate_num > 1:
            args.iterations_per_epoch = len(train_loader.dataset.imgs) // 256
        else:
            args.iterations_per_epoch = len(train_loader.dataset.imgs) // args.batch_size
        val_len = len(val_loader.dataset.imgs) // 1024
    else:
        if batch_accumulate_num > 1:
            args.iterations_per_epoch = len(train_loader.dataset.train_labels) // 256
        else:
            args.iterations_per_epoch = len(train_loader.dataset.train_labels) // args.batch_size

        val_len = len(val_loader.dataset.test_labels) // 1024

    # get the number of model parameters
    log_str = 'Number of model parameters: {}'.format(sum([p.data.nelement() for p in model.parameters()]))
    logging.info(log_str, extra=args.client)
    print(log_str)
    # for training on multiple GPUs.
    model = torch.nn.DataParallel(model)
    model = model.cuda()
    server = ParameterServer.get_server(args.optimizer, model, args)
    val_statistics = Statistics.get_statistics('image_classification', args)
    train_statistics = Statistics.get_statistics('image_classification', args)
    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume + '/checkpoint.pth.tar'):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume + '/checkpoint.pth.tar')
            args.start_epoch = checkpoint['epoch']
            server = checkpoint['server']
            val_statistics = checkpoint['val_stats']
            train_statistics = checkpoint['train_stats']
            model.load_state_dict(checkpoint['state_dict'])
            print('=> loaded checkpoint {} (epoch {})'.format(args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True
    # define loss function (criterion) and optimizer
    loss_params = {}
    if args.label_smoothing > 0:
        loss_params['smooth_eps'] = args.label_smoothing
    # use the model's own criterion if it defines one, otherwise fall back to CrossEntropyLoss
    criterion = getattr(model, 'criterion', CrossEntropyLoss)(**loss_params).cuda()
    # criterion = nn.CrossEntropyLoss().cuda()
    if args.bar:
        train_bar = IncrementalBar('Training  ', max=args.iterations_per_epoch, suffix='%(percent)d%%')
        val_bar = IncrementalBar('Evaluating', max=val_len, suffix='%(percent)d%%')
    else:
        train_bar = None
        val_bar = None
    log_str = '{}: Training neural network for {} epochs with {} workers'.format(args.id, args.epochs,
                                                                                 args.workers_num)
    logging.info(log_str, extra=args.client)
    print(log_str)
    train_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        # train for one epoch
        train_loss, train_error = train(train_loader, model, criterion, server, epoch, args.workers_num, args.grad_clip,
                                        batch_accumulate_num, train_bar, train_statistics, args.client)

        train_time = time.time() - train_time
        if args.bar:
            train_bar.finish()
            train_bar.index = 0

        # evaluate on validation set
        val_time = time.time()
        with torch.no_grad():
            val_loss, val_error = validate(val_loader, model, criterion, server, val_statistics, val_bar)
        train_statistics.save_loss(train_loss)
        train_statistics.save_error(train_error)
        train_statistics.save_weight_mean_dist(server.get_workers_mean_statistics())
        train_statistics.save_weight_master_dist(server.get_workers_master_statistics())
        train_statistics.save_mean_master_dist(server.get_mean_master_dist())
        train_statistics.save_weight_norm(server.get_server_weights())
        train_statistics.save_gradient_norm(server.get_server_gradients())
        val_time = time.time() - val_time
        if args.bar:
            val_bar.finish()
            val_bar.index = 0

        log_str = 'Epoch [{0:1d}]: Train: Time [{1:.2f}], Loss [{2:.3f}], Error[{3:.3f}] | ' \
                  'Test: Time [{4:.2f}], Loss [{5:.3f}], Error[{6:.3f}]'.format(epoch + 1, train_time, train_loss,
                                                                                train_error, val_time, val_loss,
                                                                                val_error)
        logging.info(log_str, extra=args.client)
        print(log_str)
        if epoch % args.save == 0 and epoch > 0:
            save_checkpoint({'epoch': epoch + 1,
                             'state_dict': model.state_dict(),
                             'val_stats': val_statistics,
                             'train_stats': train_statistics,
                             'server': server}, sim_name=(args.name + time_stamp + '_' + str(epoch)))
        train_time = time.time()

    return train_statistics, val_statistics
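
Example #2 swaps the plain nn.CrossEntropyLoss for a project-specific CrossEntropyLoss that accepts a smooth_eps keyword. That implementation is not shown here; the sketch below only illustrates what a smooth_eps-style label-smoothing criterion typically computes and is not the repository's code:

import torch.nn as nn
import torch.nn.functional as F

class SmoothedCrossEntropyLoss(nn.Module):
    """Cross-entropy with uniform label smoothing; smooth_eps=0 gives the standard loss."""

    def __init__(self, smooth_eps=0.0):
        super().__init__()
        self.smooth_eps = smooth_eps

    def forward(self, logits, target):
        log_probs = F.log_softmax(logits, dim=-1)
        nll = F.nll_loss(log_probs, target)
        if self.smooth_eps == 0:
            return nll
        # mix the one-hot target with a uniform distribution over classes
        uniform = -log_probs.mean(dim=-1).mean()
        return (1.0 - self.smooth_eps) * nll + self.smooth_eps * uniform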
Example #3
# assumes the same imports as Example #1, plus the repository's alexnet and
# WideResNet model definitions
def main(args):
    if torch.cuda.is_available():
        print('Utilizing GPU')
        # torch.cuda.set_device(args.gpu_num)
    train_loader, val_loader = load_data(args)
    # create model
    if args.dataset == 'image_net':
        model = alexnet()
        top_k = (1, 5)
        val_len = len(val_loader.dataset.imgs)
    else:
        model = WideResNet(args.layers, 10 if args.dataset == 'cifar10' else 100,
                           args.widen_factor, dropRate=args.droprate)
        top_k = (1,)
        val_len = len(val_loader.dataset.test_labels)

    # get the number of model parameters
    print('Number of model parameters: {}'.format(
        sum([p.data.nelement() for p in model.parameters()])))
    # for training on multiple GPUs.
    model = torch.nn.DataParallel(model).cuda()

    server = ParameterServer.get_server(args.optimizer, model, args)
    val_statistics = Statistics.get_statistics('image_classification', args)
    train_statistics = Statistics.get_statistics('image_classification', args)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_prec1 = checkpoint['best_prec1']
            model.load_state_dict(checkpoint['state_dict'])
            print("=> loaded checkpoint '{}' (epoch {})"
                  .format(args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda()
    # ghost batch normalization (128 as baseline)
    repeat = args.batch_size // 128 if args.gbn == 1 else 1
    total_iterations = args.iterations_per_epoch + val_len // args.batch_size
    if args.bar:
        train_bar = IncrementalBar('Training  ', max=args.iterations_per_epoch, suffix='%(percent)d%%')
        val_bar = IncrementalBar('Evaluating', max=total_iterations, suffix='%(percent)d%%')
    else:
        train_bar = None
        val_bar = None

    print('{}: Training neural network for {} epochs with {} workers'.format(
        args.sim_num, args.epochs, args.workers_num))
    train_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        # train for one epoch
        train(train_loader, model, criterion, server, epoch, args.workers_num, args.grad_clip, repeat, train_bar)
        train_time = time.time() - train_time
        if args.bar:
            train_bar.finish()
            train_bar.index = 0

        # evaluate on validation set
        val_time = time.time()
        val_loss, val_error = validate(val_loader, model, criterion, server, val_statistics, top_k, val_bar)
        train_loss, train_error = validate(train_loader, model, criterion, server, train_statistics, top_k, val_bar,
                                           save_norm=True)
        val_time = time.time() - val_time
        if args.bar:
            val_bar.finish()
            val_bar.index = 0
        print('Epoch [{0:1d}]: Train: Time [{1:.2f}], Loss [{2:.3f}], Error[{3:.3f}] |'
              ' Test: Time [{4:.2f}], Loss [{5:.3f}], Error[{6:.3f}]'
              .format(epoch, train_time, train_loss, train_error, val_time, val_loss, val_error))
        train_time = time.time()

    return train_statistics, val_statistics
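
Example #3 derives a repeat count for ghost batch normalization (batch-norm statistics computed over 128-sample "ghost" batches), but the actual handling lives inside train(), which is not shown. As a rough illustration of the technique only, and not the repository's implementation, a ghost-batch variant of BatchNorm2d could look like this:

import torch
import torch.nn as nn

class GhostBatchNorm2d(nn.BatchNorm2d):
    """BatchNorm2d whose statistics are computed over virtual batches of
    `virtual_batch_size` samples during training (hypothetical sketch)."""

    def __init__(self, num_features, virtual_batch_size=128, **kwargs):
        super().__init__(num_features, **kwargs)
        self.virtual_batch_size = virtual_batch_size

    def forward(self, x):
        if not self.training or x.size(0) <= self.virtual_batch_size:
            return super().forward(x)
        # normalize each 128-sample chunk with its own batch statistics
        chunks = x.split(self.virtual_batch_size, dim=0)
        return torch.cat([super(GhostBatchNorm2d, self).forward(c) for c in chunks], dim=0)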