Example #1
import numpy as np
import ray


# ParameterServer, ReplayBuffer, Learner, Actor and get_env_configs are
# defined elsewhere in the project this snippet is taken from.
def main(config, max_samples):
    get_env_configs(config)
    ray.init()

    parameter_server = ParameterServer.remote(config)
    replay_buffer = ReplayBuffer.remote(config)
    learner = Learner.remote(config, replay_buffer, parameter_server)

    train_actor_ids = []
    eval_actor_ids = []

    learner.start_learning.remote()

    #   start train actors
    for i in range(config["num_workers"]):
        epsilon = config["max_eps"] * i / config["num_workers"]
        training_actor = Actor.remote("train-" + str(i), replay_buffer,
                                      parameter_server, config, epsilon)
        training_actor.sample.remote()
        train_actor_ids.append(training_actor)

    #   start eval actors
    for i in range(config["eval_num_workers"]):
        epsilon = 0
        eval_actor = Actor.remote("eval-" + str(i),
                                  replay_buffer,
                                  parameter_server,
                                  config,
                                  epsilon,
                                  eval=True)
        eval_actor_ids.append(eval_actor)

    #   fetch samples in loop and sync actor weights
    total_samples = 0
    best_eval_mean_reward = -np.inf  # np.NINF was removed in NumPy 2.0
    eval_mean_rewards = []
    while total_samples < max_samples:
        total_env_samples_id = replay_buffer.get_total_env_samples.remote()
        new_total_samples = ray.get(total_env_samples_id)
        num_new_samples = new_total_samples - total_samples
        if num_new_samples >= config["timesteps_per_iteration"]:
            total_samples = new_total_samples
            print("Total samples:", total_samples)
            parameter_server.set_eval_weights.remote()
            eval_sampling_ids = [
                eval_actor.sample.remote() for eval_actor in eval_actor_ids
            ]
            eval_rewards = ray.get(eval_sampling_ids)
            print("Evaluation rewards: {}".format(eval_rewards))
            eval_mean_reward = np.mean(eval_rewards)
            eval_mean_rewards.append(eval_mean_reward)
            print("Mean evaluation reward: {}".format(eval_mean_reward))
            if eval_mean_reward > best_eval_mean_reward:
                print("Model has improved! Saving the model!")
                best_eval_mean_reward = eval_mean_reward
                parameter_server.save_eval_weights.remote()

    print("Finishing the training.\n\n\n\n\n\n")
    [actor.stop.remote() for actor in train_actor_ids]
    learner.stop.remote()
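The ParameterServer methods the driver above calls (set_eval_weights, save_eval_weights) are defined elsewhere in the project. A minimal sketch of what such an actor could look like, assuming plain NumPy weights and a hypothetical save path:

import numpy as np
import ray


@ray.remote
class MinimalParameterServer:
    """Sketch of the actor interface the driver above relies on."""

    def __init__(self, config):
        self.weights = None       # latest weights pushed by the learner
        self.eval_weights = None  # frozen copy used by eval actors

    def update_weights(self, new_weights):
        self.weights = new_weights

    def get_weights(self):
        return self.weights

    def set_eval_weights(self):
        # freeze the current weights for a round of evaluation
        self.eval_weights = self.weights
        return self.eval_weights

    def get_eval_weights(self):
        return self.eval_weights

    def save_eval_weights(self, path='best_weights.npy'):
        # 'path' and the .npy format are placeholders, not source behavior
        np.save(path, self.eval_weights, allow_pickle=True)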
Example #2
import multiprocessing as mp  # assumption: the source may use torch.multiprocessing


# ParameterServer and ArgsProvider come from the surrounding project.
class EvaluationProcess(mp.Process):
    def __init__(self):
        super(EvaluationProcess, self).__init__()
        self.server = ParameterServer(2)
        self.args = ArgsProvider(call_from=self,
                                 define_params=[
                                     ("eval_freq", 10),
                                     ("eval_gpu", 1),
                                 ])
        self.count = 0

    def set_model(self, mi):
        self.server.server_send_model(mi)

    def update_model(self, key, mi, immediate=False):
        if (self.count % self.args.eval_freq == 0) or immediate:
            self.server.server_update_model(key, mi, noblock=True)

        self.count += 1

    def set(self, evaluator, args):
        self.evaluator = evaluator
        self.args = args

    def run(self):
        ''' Run the model '''
        self.server.client_receive_model()
        self.evaluator.setup(self.args)
        k = 0
        while True:
            mi = self.server.client_refresh_model(gpu=self.evaluator.gpu)
            print("Eval: Get refreshed model")

            # Do your evaluation.
            self.evaluator.step(k, mi)
            k += 1

    def run_same_process(self, mi):
        self.evaluator.setup(self.args)
        # Do your evaluation.
        self.evaluator.step(0, mi)
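A hedged wiring sketch for EvaluationProcess; MyEvaluator, eval_args and model_interface are hypothetical stand-ins for the project objects the class expects:

# Hypothetical usage; none of these names come from the example itself.
if __name__ == '__main__':
    proc = EvaluationProcess()
    proc.set(MyEvaluator(), eval_args)  # attach the evaluator before starting
    proc.set_model(model_interface)     # seed the server with initial weights
    proc.start()                        # mp.Process.start() eventually calls run()
    # A training loop elsewhere would then push fresh weights periodically:
    # proc.update_model('actor', model_interface)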
Example #3
import time

import ray


# ParameterServer is defined elsewhere in the project this snippet is taken from.
def run(args):
    ray.init(
        address='auto',
        ignore_reinit_error=True,
        webui_host='0.0.0.0',
        redis_password='******'
    )
    try:
        ps = ParameterServer.remote(args)
        # https://docs.ray.io/en/releases-0.8.6/auto_examples/plot_parameter_server.html
        if args.sync_param_server:
            # synchronous parameter server:
            val = ps.run.remote()
        else:
            # asynchronous parameter server:
            val = ps.run_async.remote()
        print(ray.get(val))
    finally:
        print('waiting 10s to allow logs to flush')
        time.sleep(10)
        ray.shutdown()
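The driver above only selects between run and run_async; the pair it chooses from typically follows the pattern in the Ray docs it links: a synchronous server blocks on a gradient from every worker each step, while an asynchronous one applies whichever gradient arrives first. A compressed sketch with the gradient math reduced to scalars; compute_grad and add_worker are assumptions, not source API:

import ray


@ray.remote
class SketchParameterServer:
    def __init__(self):
        self.weights = 0.0  # placeholder for real model parameters
        self.workers = []   # worker actor handles, registered via add_worker

    def add_worker(self, worker):
        self.workers.append(worker)

    def run(self, iterations=100):
        # Synchronous: wait for a gradient from *every* worker, then update.
        for _ in range(iterations):
            grads = ray.get([w.compute_grad.remote(self.weights)
                             for w in self.workers])
            self.weights -= 0.1 * sum(grads) / len(grads)
        return self.weights

    def run_async(self, updates=100):
        # Asynchronous: apply whichever gradient arrives first, then
        # immediately hand that worker the new weights.
        pending = {w.compute_grad.remote(self.weights): w
                   for w in self.workers}
        for _ in range(updates):
            [ready], _ = ray.wait(list(pending))
            worker = pending.pop(ready)
            self.weights -= 0.1 * ray.get(ready)
            pending[worker.compute_grad.remote(self.weights)] = worker
        return self.weights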
Example #4
import logging
import os
import time

import torch
import torch.backends.cudnn as cudnn
from progress.bar import IncrementalBar


# CrossEntropyLoss, ParameterServer, Statistics, load_data, get_model, train,
# validate and save_checkpoint come from the surrounding project.
def main(args):
    time_stamp = '{:.0f}'.format(time.time() % 100000)
    if torch.cuda.is_available():
        logging.info('Utilizing GPU', extra=args.client)
        print('Utilizing GPU')

    train_loader, val_loader = load_data(args)
    model = get_model(args.model, args)

    if args.batch_size > 256:  # and args.dataset == 'imagenet' and args.model == 'resnet':
        batch_accumulate_num = args.batch_size // 256
    else:
        batch_accumulate_num = 1
    # create model
    if args.dataset == 'imagenet':
        if batch_accumulate_num > 1:
            args.iterations_per_epoch = len(train_loader.dataset.imgs) // 256
        else:
            args.iterations_per_epoch = len(train_loader.dataset.imgs) // args.batch_size
        val_len = len(val_loader.dataset.imgs) // 1024
    else:
        if batch_accumulate_num > 1:
            args.iterations_per_epoch = len(train_loader.dataset.train_labels) // 256
        else:
            args.iterations_per_epoch = len(train_loader.dataset.train_labels) // args.batch_size

        val_len = len(val_loader.dataset.test_labels) // 1024

    # get the number of model parameters
    log_str = 'Number of model parameters: {}'.format(sum([p.data.nelement() for p in model.parameters()]))
    logging.info(log_str, extra=args.client)
    print(log_str)
    # for training on multiple GPUs.
    model = torch.nn.DataParallel(model)
    model = model.cuda()
    server = ParameterServer.get_server(args.optimizer, model, args)
    val_statistics = Statistics.get_statistics('image_classification', args)
    train_statistics = Statistics.get_statistics('image_classification', args)
    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume + '/checkpoint.pth.tar'):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume + '/checkpoint.pth.tar')
            args.start_epoch = checkpoint['epoch']
            server = checkpoint['server']
            val_statistics = checkpoint['val_stats']
            train_statistics = checkpoint['train_stats']
            model.load_state_dict(checkpoint['state_dict'])
            print('=> loaded checkpoint {} (epoch {})'.format(args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True
    # define loss function (criterion) and optimizer
    loss_params = {}
    if args.label_smoothing > 0:
        loss_params['smooth_eps'] = args.label_smoothing
    criterion = getattr(model, 'criterion', CrossEntropyLoss)(**loss_params).cuda()
    # criterion = nn.CrossEntropyLoss().cuda()
    if args.bar:
        train_bar = IncrementalBar('Training  ', max=args.iterations_per_epoch, suffix='%(percent)d%%')
        val_bar = IncrementalBar('Evaluating', max=val_len, suffix='%(percent)d%%')
    else:
        train_bar = None
        val_bar = None
    log_str = '{}: Training neural network for {} epochs with {} workers'.format(args.id, args.epochs,
                                                                                 args.workers_num)
    logging.info(log_str, extra=args.client)
    print(log_str)
    train_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        # train for one epoch
        train_loss, train_error = train(train_loader, model, criterion, server, epoch, args.workers_num, args.grad_clip,
                                        batch_accumulate_num, train_bar, train_statistics, args.client)

        train_time = time.time() - train_time
        if args.bar:
            train_bar.finish()
            train_bar.index = 0

        # evaluate on validation set
        val_time = time.time()
        with torch.no_grad():
            val_loss, val_error = validate(val_loader, model, criterion, server, val_statistics, val_bar)
        train_statistics.save_loss(train_loss)
        train_statistics.save_error(train_error)
        train_statistics.save_weight_mean_dist(server.get_workers_mean_statistics())
        train_statistics.save_weight_master_dist(server.get_workers_master_statistics())
        train_statistics.save_mean_master_dist(server.get_mean_master_dist())
        train_statistics.save_weight_norm(server.get_server_weights())
        train_statistics.save_gradient_norm(server.get_server_gradients())
        val_time = time.time() - val_time
        if args.bar:
            val_bar.finish()
            val_bar.index = 0

        log_str = 'Epoch [{0:1d}]: Train: Time [{1:.2f}], Loss [{2:.3f}], Error[{3:.3f}] | ' \
                  'Test: Time [{4:.2f}], Loss [{5:.3f}], Error[{6:.3f}]'.format(epoch + 1, train_time, train_loss,
                                                                                train_error, val_time, val_loss,
                                                                                val_error)
        logging.info(log_str, extra=args.client)
        print(log_str)
        if epoch % args.save == 0 and epoch > 0:
            save_checkpoint({'epoch': epoch + 1,
                             'state_dict': model.state_dict(),
                             'val_stats': val_statistics,
                             'train_stats': train_statistics,
                             'server': server}, sim_name=(args.name + time_stamp + '_' + str(epoch)))
        train_time = time.time()

    return train_statistics, val_statistics
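save_checkpoint is not shown above; a minimal sketch of what it plausibly does, assuming torch serialization and a hypothetical checkpoints directory (the file name matches the checkpoint.pth.tar the resume branch reads):

import os

import torch


def save_checkpoint(state, sim_name, root='checkpoints'):
    # 'root' and the directory layout are assumptions, not source behavior.
    path = os.path.join(root, sim_name)
    os.makedirs(path, exist_ok=True)
    torch.save(state, os.path.join(path, 'checkpoint.pth.tar'))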
Example #5
import torch


# data_set, net_model, resnet, alexnet, ParameterServer and Statistics come
# from the surrounding project.
def initialization(params):
    print('-----------------------')
    print('Initializing...', end='')

    batch_size = params.batch_size
    learning_rate = params.learning_rate
    momentum = params.momentum
    rho = params.rho
    tau = params.tau
    workers_number = params.workers_number
    optimizer = params.optimizer
    permute = params.permute
    gpu_num = params.gpu_number
    gradient_clipping = params.gradient_clipping
    lr_batch_adjustment = params.lr_batch_adjustment

    if torch.cuda.is_available():
        print('Utilizing GPU')
        torch.cuda.set_device(gpu_num)
        dtype = torch.cuda.FloatTensor
    else:
        dtype = torch.FloatTensor

    if params.data_set == 'cifar10':
        dataset = data_set.DataSetCifar10(batch_size, permute)
        model = resnet(num_classes=10, depth=56, wide_factor=1)
    elif params.data_set == 'image_net':
        dataset = data_set.DataSetImageNet(batch_size, permute)
        model = alexnet()
    elif params.data_set == 'cifar100':
        dataset = data_set.DataSetCifar100(batch_size, permute)
        model = resnet(num_classes=100, depth=56, wide_factor=1)
    else:
        raise ValueError('unknown data_set: {}'.format(params.data_set))

    train_set = dataset.get_train()
    test_set = dataset.get_test()

    if torch.cuda.is_available():
        model.cuda()
        # model = torch.nn.DataParallel(model)  # Run on multiple GPUs

    parameters = net_model.get_model_parameters(model, dtype)
    gradients = net_model.get_model_parameters(model, dtype)
    loss_fn = torch.nn.CrossEntropyLoss()
    if optimizer == 'synchronous':
        effective_batch_size = batch_size * workers_number
    else:
        effective_batch_size = batch_size
    server = ParameterServer.get_server(
        optimizer,
        learning_rate=learning_rate,
        momentum=momentum,
        parameters=parameters,
        gradients=gradients,
        workers_number=workers_number,
        rho=rho,
        tau=tau,
        effective_batch_size=effective_batch_size,
        gradient_clipping=gradient_clipping,
        lr_batch_adjustment=lr_batch_adjustment)
    stats_train = Statistics.get_statistics('image_classification', params)
    stats_test = Statistics.get_statistics('image_classification', params)
    print('Done')

    return server, loss_fn, stats_train, stats_test, train_set, test_set, model, dtype
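A hedged usage example for initialization; the Namespace fields mirror the attributes read above, and all values are illustrative, not source defaults:

from argparse import Namespace

params = Namespace(batch_size=128, learning_rate=0.1, momentum=0.9,
                   rho=0.01, tau=32, workers_number=4,
                   optimizer='synchronous', permute=False, gpu_number=0,
                   gradient_clipping=None, lr_batch_adjustment=False,
                   data_set='cifar10')
(server, loss_fn, stats_train, stats_test,
 train_set, test_set, model, dtype) = initialization(params)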
Example #6
import argparse

import torch
import torch.distributed as dist
from torchvision import datasets, transforms


# Net, AEASGD, ParameterServer, MessageCode, train and test come from the
# surrounding project.
def main():
    # Training settings
    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
    parser.add_argument('--batch-size',
                        type=int,
                        default=128,
                        metavar='N',
                        help='input batch size for training (default: 128)')
    parser.add_argument('--test-batch-size',
                        type=int,
                        default=1000,
                        metavar='N',
                        help='input batch size for testing (default: 1000)')
    parser.add_argument('--epochs',
                        type=int,
                        default=5,
                        metavar='N',
                        help='number of epochs to train (default: 5)')
    parser.add_argument('--lr',
                        type=float,
                        default=0.01,
                        metavar='LR',
                        help='learning rate (default: 0.01)')
    parser.add_argument('--momentum',
                        type=float,
                        default=0.5,
                        metavar='M',
                        help='SGD momentum (default: 0.5)')
    parser.add_argument('--no-cuda',
                        action='store_true',
                        default=False,
                        help='disables CUDA training')
    parser.add_argument('--seed',
                        type=int,
                        default=1,
                        metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument(
        '--log-interval',
        type=int,
        default=10,
        metavar='N',
        help='how many batches to wait before logging training status')

    parser.add_argument('--save-model',
                        action='store_true',
                        default=False,
                        help='For Saving the current Model')
    # yuanfang added
    parser.add_argument('--quantize-nbits',
                        default=0,
                        type=int,
                        help='number of bits used for quantization (0: disabled)')
    parser.add_argument('--tau',
                        default=32,
                        type=int,
                        help='hyperparameter used in AEASGD')
    parser.add_argument('--rho',
                        default=0.01,
                        type=float,
                        help='hyperparameter used in AEASGD')
    parser.add_argument('--world-size',
                        default=-1,
                        type=int,
                        help='number of nodes for distributed training')
    parser.add_argument('--rank',
                        default=-1,
                        type=int,
                        help='node rank for distributed training')
    parser.add_argument('--dist-url',
                        default='does not work',
                        type=str,
                        help='url used to set up distributed training')
    parser.add_argument('--dist-backend',
                        default='gloo',
                        type=str,
                        help='distributed backend')
    ps_flag_parser = parser.add_mutually_exclusive_group(required=False)
    ps_flag_parser.add_argument('--flag', dest='ps_flag', action='store_true')
    ps_flag_parser.add_argument('--no-flag',
                                dest='ps_flag',
                                action='store_false')
    parser.set_defaults(ps_flag=False)

    args = parser.parse_args()
    use_cuda = not args.no_cuda and torch.cuda.is_available()  # honor --no-cuda

    torch.manual_seed(args.seed)

    device = torch.device("cuda" if use_cuda else "cpu")
    model = Net().to(device)
    if args.ps_flag:
        print("before init process group")
        # start a parameter server
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=args.world_size,
                                rank=args.rank)
        print("after init process group")
        ps = ParameterServer(model,
                             args.world_size,
                             quantize_num_bits=args.quantize_nbits)
        print("starting parameter server....")
        ps.start()
    else:
        print("before init process group")
        # start a worker
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=args.world_size,
                                rank=args.rank)

        print("after init process group")
        kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}
        train_loader = torch.utils.data.DataLoader(
            datasets.MNIST('./data%d' % (args.rank),
                           train=True,
                           download=True,
                           transform=transforms.Compose([
                               transforms.ToTensor(),
                               transforms.Normalize((0.1307, ), (0.3081, ))
                           ])),
            batch_size=args.batch_size,
            shuffle=True,
            **kwargs)
        test_loader = torch.utils.data.DataLoader(
            datasets.MNIST('./data%d' % (args.rank),
                           train=False,
                           transform=transforms.Compose([
                               transforms.ToTensor(),
                               transforms.Normalize((0.1307, ), (0.3081, ))
                           ])),
            batch_size=args.test_batch_size,
            shuffle=True,
            **kwargs)

        optimizer = AEASGD(model.parameters(),
                           lr=args.lr,
                           tau=args.tau,
                           rho=args.rho,
                           model=model,
                           quantize_num_bits=args.quantize_nbits)

        for epoch in range(1, args.epochs + 1):
            train(args, model, device, train_loader, optimizer, epoch)
            test(args, model, device, test_loader)

        optimizer.send_message(
            MessageCode.WorkerTerminate,
            torch.randn(optimizer.squash_model(optimizer.model).numel()))

        if args.save_model:
            torch.save(model.state_dict(), "mnist_cnn.pt")
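The example above expects one process per rank: rank 0 runs as the parameter server (--flag), the remaining ranks as workers (--no-flag). A hedged launch sketch; the script name train_mnist.py and the rendezvous address are assumptions:

import subprocess
import sys

WORLD_SIZE = 3  # 1 parameter server + 2 workers
DIST_URL = 'tcp://127.0.0.1:23456'

procs = []
for rank in range(WORLD_SIZE):
    role = '--flag' if rank == 0 else '--no-flag'
    procs.append(subprocess.Popen([
        sys.executable, 'train_mnist.py', role,
        '--dist-backend', 'gloo',
        '--dist-url', DIST_URL,
        '--world-size', str(WORLD_SIZE),
        '--rank', str(rank),
    ]))
for p in procs:
    p.wait()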
Example #7
import logging
import os
import time

import torch
import torch.backends.cudnn as cudnn
import torch.nn as nn
from progress.bar import IncrementalBar


# ParameterServer, Statistics, load_data, get_model, train, validate and
# save_checkpoint come from the surrounding project.
def main(args):
    time_stamp = '{:.0f}'.format(time.time() % 100000)
    if torch.cuda.is_available():
        logging.info('Utilizing GPU', extra=args.client)
        print('Utilizing GPU')

    train_loader, val_loader = load_data(args)
    model = get_model(args.model, args)

    if args.batch_size > 256 and args.dataset == 'imagenet' and args.model == 'resnet':
        batch_accumulate_num = args.batch_size // 256
    else:
        batch_accumulate_num = 1
    # create model
    if args.dataset == 'imagenet':
        if batch_accumulate_num > 1:
            args.iterations_per_epoch = len(train_loader.dataset.imgs) // 256
        else:
            args.iterations_per_epoch = len(
                train_loader.dataset.imgs) // args.batch_size
        val_len = len(val_loader.dataset.imgs) // 1024
    else:
        args.iterations_per_epoch = len(
            train_loader.dataset.train_labels) // args.batch_size
        val_len = len(val_loader.dataset.test_labels) // 1024

    # get the number of model parameters
    log_str = 'Number of model parameters: {}'.format(
        sum([p.data.nelement() for p in model.parameters()]))
    logging.info(log_str, extra=args.client)
    print(log_str)
    # for training on multiple GPUs.
    model = torch.nn.DataParallel(model)
    model = model.cuda()
    server = ParameterServer.get_server(args.optimizer, model, args)
    val_statistics = Statistics.get_statistics('image_classification', args)
    train_statistics = Statistics.get_statistics('image_classification', args)
    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume + '/checkpoint.pth.tar'):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume + '/checkpoint.pth.tar')
            args.start_epoch = checkpoint['epoch']
            server = checkpoint['server']
            val_statistics = checkpoint['val_stats']
            train_statistics = checkpoint['train_stats']
            model.load_state_dict(checkpoint['state_dict'])
            print('=> loaded checkpoint {} (epoch {})'.format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    # # Synchronous to Asynchronous Adjustments
    # print('Resetting Parameter Server to Asynchronous Mode')
    # logging.info('Resetting Parameter Server to Asynchronous Mode', extra=args.client)
    # server._shards_weights = list()
    # weights = server._get_model_weights()
    # for i in range(0, args.workers_num):
    #     server._shards_weights.append(deepcopy(weights))
    # server._workers_num = args.workers_num
    # # learning rate initialization
    # batch_baseline = args.baseline
    # server._lr = args.lr * np.sqrt((args.workers_num * args.batch_size) // batch_baseline) / (args.workers_num)
    # server._fast_im = args.fast_im
    # server._lr_warm_up = args.lr_warm_up
    # server._current_lr = args.lr
    # server._m_off = args.m_off
    # server._current_momentum = args.momentum
    # server._iterations_per_epoch = args.iterations_per_epoch
    # server._momentum = args.momentum
    # server._client = args.client
    # if args.fast_im is True:
    #     end_lr = args.lr * ((args.workers_num * args.batch_size) // batch_baseline) / np.sqrt(args.workers_num)
    #     start_lr = args.lr / (args.workers_num)
    #     server._lr = end_lr
    #     server._start_lr = start_lr
    #     server._lr_increment_const = (end_lr - start_lr) / (args.iterations_per_epoch * 5)
    #     log_str = 'Fast ImageNet Mode - Warm Up [{:.5f}]->[{:.5f}] In 5 Epochs'.format(start_lr, end_lr)
    #     logging.info(log_str, extra=args.client)
    #     print(log_str)
    # else:
    #     server._start_lr = 0
    #     server._lr_increment_const = 0
    # for param_group in server._optimizer.param_groups:
    #     param_group['lr'] = start_lr
    #     param_group['momentum'] = server._momentum
    # # Synchronous to Asynchronous Adjustments - End

    cudnn.benchmark = True
    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda()
    if args.bar:
        train_bar = IncrementalBar('Training  ',
                                   max=args.iterations_per_epoch,
                                   suffix='%(percent)d%%')
        val_bar = IncrementalBar('Evaluating',
                                 max=val_len,
                                 suffix='%(percent)d%%')
    else:
        train_bar = None
        val_bar = None
    log_str = '{}: Training neural network for {} epochs with {} workers'.format(
        args.id, args.epochs, args.workers_num)
    logging.info(log_str, extra=args.client)
    print(log_str)
    train_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        # train for one epoch
        train_loss, train_error = train(train_loader, model, criterion, server,
                                        epoch, args.workers_num,
                                        args.grad_clip, batch_accumulate_num,
                                        train_bar, train_statistics,
                                        args.client)

        train_time = time.time() - train_time
        if args.bar:
            train_bar.finish()
            train_bar.index = 0

        # evaluate on validation set
        val_time = time.time()
        with torch.no_grad():
            val_loss, val_error = validate(val_loader, model, criterion,
                                           server, val_statistics, val_bar)
        train_statistics.save_loss(train_loss)
        train_statistics.save_error(train_error)
        train_statistics.save_weight_mean_dist(
            server.get_workers_mean_statistics())
        train_statistics.save_weight_master_dist(
            server.get_workers_master_statistics())
        train_statistics.save_mean_master_dist(server.get_mean_master_dist())
        train_statistics.save_weight_norm(server.get_server_weights())
        train_statistics.save_gradient_norm(server.get_server_gradients())
        val_time = time.time() - val_time
        if args.bar:
            val_bar.finish()
            val_bar.index = 0

        log_str = 'Epoch [{0:1d}]: Train: Time [{1:.2f}], Loss [{2:.3f}], Error[{3:.3f}] | ' \
                  'Test: Time [{4:.2f}], Loss [{5:.3f}], Error[{6:.3f}]'.format(epoch + 1, train_time, train_loss,
                                                                                train_error, val_time, val_loss,
                                                                                val_error)
        logging.info(log_str, extra=args.client)
        print(log_str)
        if epoch % args.save == 0 and epoch > 0:
            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'state_dict': model.state_dict(),
                    'val_stats': val_statistics,
                    'train_stats': train_statistics,
                    'server': server
                },
                sim_name=(args.name + time_stamp + '_' + str(epoch)))
        train_time = time.time()

    return train_statistics, val_statistics
Example #8
import os
import time

import torch
import torch.backends.cudnn as cudnn
import torch.nn as nn
from progress.bar import IncrementalBar


# alexnet, WideResNet, ParameterServer, Statistics, load_data, train and
# validate come from the surrounding project.
def main(args):
    if torch.cuda.is_available():
        print('Utilizing GPU')
        # torch.cuda.set_device(args.gpu_num)
    train_loader, val_loader = load_data(args)
    # create model
    if args.dataset == 'image_net':
        model = alexnet()
        top_k = (1, 5)
        val_len = len(val_loader.dataset.imgs)
    else:
        model = WideResNet(args.layers, 10 if args.dataset == 'cifar10' else 100,
                           args.widen_factor, dropRate=args.droprate)
        top_k = (1,)
        val_len = len(val_loader.dataset.test_labels)

    # get the number of model parameters
    print('Number of model parameters: {}'.format(
        sum([p.data.nelement() for p in model.parameters()])))
    # for training on multiple GPUs.
    model = torch.nn.DataParallel(model).cuda()

    server = ParameterServer.get_server(args.optimizer, model, args)
    val_statistics = Statistics.get_statistics('image_classification', args)
    train_statistics = Statistics.get_statistics('image_classification', args)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_prec1 = checkpoint['best_prec1']
            model.load_state_dict(checkpoint['state_dict'])
            print("=> loaded checkpoint '{}' (epoch {})"
                  .format(args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda()
    # ghost batch normalization (128 as baseline)
    repeat = args.batch_size // 128 if args.gbn == 1 else 1
    total_iterations = args.iterations_per_epoch + val_len // args.batch_size
    if args.bar:
        train_bar = IncrementalBar('Training  ', max=args.iterations_per_epoch, suffix='%(percent)d%%')
        val_bar = IncrementalBar('Evaluating', max=total_iterations, suffix='%(percent)d%%')
    else:
        train_bar = None
        val_bar = None

    print(
        '{}: Training neural network for {} epochs with {} workers'.format(args.sim_num, args.epochs, args.workers_num))
    train_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        # train for one epoch
        train(train_loader, model, criterion, server, epoch, args.workers_num, args.grad_clip, repeat, train_bar)
        train_time = time.time() - train_time
        if args.bar:
            train_bar.finish()
            train_bar.index = 0

        # evaluate on validation set
        val_time = time.time()
        val_loss, val_error = validate(val_loader, model, criterion, server, val_statistics, top_k, val_bar)
        train_loss, train_error = validate(train_loader, model, criterion, server, train_statistics, top_k, val_bar,
                                           save_norm=True)
        val_time = time.time() - val_time
        if args.bar:
            val_bar.finish()
            val_bar.index = 0
        print('Epoch [{0:1d}]: Train: Time [{1:.2f}], Loss [{2:.3f}], Error[{3:.3f}] |'
              ' Test: Time [{4:.2f}], Loss [{5:.3f}], Error[{6:.3f}]'
              .format(epoch, train_time, train_loss, train_error, val_time, val_loss, val_error))
        train_time = time.time()

    return train_statistics, val_statistics
Example #9
import logging
import os
from ast import literal_eval
from datetime import datetime

import torch
import torch.backends.cudnn as cudnn
import torch.distributed as dist


# parser, models, torch_dtypes, set_global_seeds, setup_logging, ResultsLog,
# CrossEntropyLoss, ParameterServer, Trainer, DataRegime, dict_to_table,
# save_checkpoint and tb come from the surrounding project.
def main():
    global args, best_prec1, dtype
    best_prec1 = 0
    args = parser.parse_args()
    dtype = torch_dtypes.get(args.dtype)
    torch.manual_seed(args.seed)
    time_stamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    if args.evaluate:
        args.results_dir = '/tmp'
    if args.save == '':
        args.save = time_stamp
    save_path = os.path.join(args.results_dir, args.save)
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    else:
        print('***************************************\n'
              'Warning: PATH exists - override warning\n'
              '***************************************')

    args.distributed = args.local_rank >= 0 or args.world_size > 1
    setup_logging(os.path.join(save_path, 'log.txt'),
                  resume=args.resume != '',
                  dummy=args.distributed and args.local_rank > 0)

    if args.deterministic:
        logging.info('Deterministic Run Set')
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

    results_path = os.path.join(save_path, 'results')
    results = ResultsLog(results_path,
                         title='Training Results - %s' % args.save)

    if args.distributed:
        args.device_ids = [args.local_rank]
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_init,
                                world_size=args.world_size,
                                rank=args.local_rank)

    logging.info("saving to %s", save_path)
    logging.debug("run arguments: %s", args)
    logging.info("creating model %s", args.model)

    if 'cuda' in args.device and torch.cuda.is_available():
        torch.cuda.manual_seed_all(args.seed)
        torch.cuda.set_device(args.device_ids[0])
        cudnn.benchmark = True
    else:
        args.device_ids = None

    # create model
    set_global_seeds(args.seed)
    model = models.__dict__[args.model]
    model_config = {'dataset': args.dataset}

    if args.model_config != '':
        model_config = dict(model_config, **literal_eval(args.model_config))

    model = model(**model_config)
    logging.info("created model with configuration: %s", model_config)
    num_parameters = sum([l.nelement() for l in model.parameters()])
    logging.info("number of parameters: %d", num_parameters)

    # optionally resume from a checkpoint
    shards = None
    x = None
    checkpoint = None
    if args.evaluate:
        if not os.path.isfile(args.evaluate):
            parser.error('invalid checkpoint: {}'.format(args.evaluate))
        checkpoint = torch.load(args.evaluate)
        x = dict()
        for name, val in checkpoint['server_state_dict'].items():
            # strip the 'module.' prefix that DataParallel adds to key names
            x[name[7:]] = val
        model.load_state_dict(x)
        shards = checkpoint['server_weight_shards']
        logging.info("loaded checkpoint '%s' (epoch %s)", args.evaluate,
                     checkpoint['epoch'])
    elif args.resume:
        checkpoint_file = args.resume
        if os.path.isdir(checkpoint_file):
            results.load(os.path.join(checkpoint_file, 'results.csv'))
            checkpoint_file = os.path.join(checkpoint_file,
                                           'model_best.pth.tar')
        if os.path.isfile(checkpoint_file):
            logging.info("loading checkpoint '%s'", args.resume)
            checkpoint = torch.load(checkpoint_file,
                                    map_location=torch.device('cpu'))
            args.start_epoch = checkpoint['epoch'] - 1
            best_prec1 = checkpoint['best_prec1']
            # model_dict = {'.'.join(k.split('.')[1:]): v for k, v in checkpoint['server_state_dict'].items()}
            # model.load_state_dict(model_dict)
            model.load_state_dict(checkpoint['server_state_dict'])
            logging.info("loaded checkpoint '%s' (epoch %s)", checkpoint_file,
                         checkpoint['epoch'])
            shards = checkpoint['server_weight_shards']
        else:
            logging.error("no checkpoint found at '%s'", args.resume)

    # define loss function (criterion) and optimizer
    loss_params = {}
    if args.label_smoothing > 0:
        loss_params['smooth_eps'] = args.label_smoothing
    criterion = getattr(model, 'criterion', CrossEntropyLoss)(**loss_params)
    criterion.to(args.device, dtype)
    model.to(args.device, dtype)

    # optimizer configuration
    optim_regime = getattr(model, 'regime', [{
        'epoch': 0,
        'optimizer': args.optimizer,
        'lr': args.lr,
        'momentum': args.momentum,
        'weight_decay': args.weight_decay
    }])
    cpu_store = (args.dataset == 'imagenet' and args.workers_num > 32)
    args.server = args.server if args.delay > 0 else 'ssgd'
    server = ParameterServer.get_server(args.server,
                                        args.delay,
                                        model=model,
                                        shards=shards,
                                        optimizer_regime=optim_regime,
                                        device_ids=args.device_ids,
                                        device=args.device,
                                        dtype=dtype,
                                        distributed=args.distributed,
                                        local_rank=args.local_rank,
                                        grad_clip=args.grad_clip,
                                        workers_num=args.workers_num,
                                        cpu_store=cpu_store)
    del shards, x, checkpoint
    torch.cuda.empty_cache()

    trainer = Trainer(model,
                      server,
                      criterion,
                      device_ids=args.device_ids,
                      device=args.device,
                      dtype=dtype,
                      distributed=args.distributed,
                      local_rank=args.local_rank,
                      workers_number=args.workers_num,
                      grad_clip=args.grad_clip,
                      print_freq=args.print_freq,
                      schedule=args.schedule)

    # Evaluation Data loading code
    args.eval_batch_size = args.eval_batch_size if args.eval_batch_size > 0 else args.batch_size
    val_data = DataRegime(getattr(model, 'data_eval_regime', None),
                          defaults={
                              'datasets_path': args.datasets_dir,
                              'name': args.dataset,
                              'split': 'val',
                              'augment': False,
                              'input_size': args.input_size,
                              'batch_size': args.eval_batch_size,
                              'shuffle': False,
                              'num_workers': args.workers,
                              'pin_memory': True,
                              'drop_last': True
                          })

    # Training Data loading code
    train_data = DataRegime(getattr(model, 'data_regime', None),
                            defaults={
                                'datasets_path': args.datasets_dir,
                                'name': args.dataset,
                                'split': 'train',
                                'augment': args.augment,
                                'input_size': args.input_size,
                                'batch_size': args.batch_size,
                                'shuffle': True,
                                'num_workers': args.workers,
                                'pin_memory': True,
                                'drop_last': True,
                                'distributed': args.distributed,
                                'duplicates': args.duplicates,
                                'cutout': {
                                    'holes': 1,
                                    'length': 16
                                } if args.cutout else None
                            })

    if args.evaluate:
        trainer.forward_pass(train_data.get_loader(),
                             duplicates=args.duplicates)
        results = trainer.validate(val_data.get_loader())
        logging.info(results)
        return

    logging.info('optimization regime: %s', optim_regime)
    trainer.training_steps = args.start_epoch * len(train_data)
    args.iterations_steps = trainer.training_steps

    with open(os.path.join(save_path, 'args.txt'), 'w') as file:
        file.write(dict_to_table(vars(args)))
    tb.init(path=save_path,
            title='Training Results',
            params=args,
            res_iterations=args.resolution)

    for epoch in range(args.start_epoch, args.epochs):
        trainer.epoch = epoch
        train_data.set_epoch(epoch)
        val_data.set_epoch(epoch)
        logging.info('\nStarting Epoch: {0}\n'.format(epoch + 1))

        # train for one epoch
        train_results = trainer.train(train_data.get_loader(),
                                      duplicates=args.duplicates)
        # evaluate on validation set
        val_results = trainer.validate(val_data.get_loader())
        if args.distributed and args.local_rank > 0:
            continue

        # remember best prec@1 and save checkpoint
        is_best = val_results['prec1'] > best_prec1
        best_prec1 = max(val_results['prec1'], best_prec1)
        if (epoch + 1) % args.save_freq == 0:
            tb.tboard.set_resume_step(epoch)
            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'model': args.model,
                    'server_state_dict': server._model.state_dict(),
                    'server_weight_shards': server._shards_weights,
                    'config': args.model_config,
                    'state_dict': model.state_dict(),
                    'best_prec1': best_prec1,
                },
                is_best,
                path=save_path)
        errors = {
            'error1_train': 100 - train_results['prec1'],
            'error5_train': 100 - train_results['prec5'],
            'error1_val': 100 - val_results['prec1'],
            'error5_val': 100 - val_results['prec5'],
            'epochs': epoch
        }
        logging.info('\nResults - Epoch: {0}\n'
                     'Training Loss {train[loss]:.4f} \t'
                     'Training Error@1 {errors[error1_train]:.3f} \t'
                     'Training Error@5 {errors[error5_train]:.3f} \t'
                     'Validation Loss {val[loss]:.4f} \t'
                     'Validation Error@1 {errors[error1_val]:.3f} \t'
                     'Validation Error@5 {errors[error5_val]:.3f} \t\n'.format(
                         epoch + 1,
                         train=train_results,
                         val=val_results,
                         errors=errors))

        values = dict(epoch=epoch + 1, steps=trainer.training_steps)
        values.update({'training ' + k: v for k, v in train_results.items()})
        values.update({'validation ' + k: v for k, v in val_results.items()})
        tb.tboard.log_results(epoch, **values)
        tb.tboard.log_model(server, epoch)
        if args.delay > 0:
            tb.tboard.log_delay(trainer.delay_hist, epoch)

    tb.tboard.close()
    return errors, args
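The regime list the example above falls back to has a single phase, but a model class in this style of codebase can expose a multi-phase schedule through its regime attribute. An illustrative (not source-derived) example:

# Hypothetical multi-phase regime; the keys mirror the fallback dict above.
optim_regime = [
    {'epoch': 0, 'optimizer': 'SGD', 'lr': 0.1,
     'momentum': 0.9, 'weight_decay': 1e-4},
    {'epoch': 30, 'lr': 0.01},   # first decay step
    {'epoch': 60, 'lr': 0.001},  # second decay step
]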
Example #10
import datetime

import numpy as np
import ray
import tensorflow as tf


# ParameterServer, ReplayBuffer, Learner, Actor and get_env_parameters are
# defined elsewhere in the project this snippet is taken from.
def main(config, max_samples):
    get_env_parameters(config)
    log_dir = "logs/scalars/" + datetime.datetime.now().strftime(
        "%Y%m%d-%H%M%S")
    file_writer = tf.summary.create_file_writer(log_dir + "/metrics")
    file_writer.set_as_default()
    config['log_dir'] = log_dir
    ray.init()
    parameter_server = ParameterServer.remote(config)
    replay_buffer = ReplayBuffer.remote(config)
    learner = Learner.remote(config, replay_buffer, parameter_server)
    training_actor_ids = []
    eval_actor_ids = []

    learner.start_learning.remote()

    # Create training actors
    for i in range(config["num_workers"]):
        eps = config["max_eps"] * i / config["num_workers"]
        actor = Actor.remote("train-" + str(i), replay_buffer,
                             parameter_server, config, eps)
        actor.sample.remote()
        training_actor_ids.append(actor)

    # Create eval actors
    for i in range(config["eval_num_workers"]):
        eps = 0
        actor = Actor.remote("eval-" + str(i), replay_buffer, parameter_server,
                             config, eps, True)
        eval_actor_ids.append(actor)

    total_samples = 0
    best_eval_mean_reward = -np.inf  # np.NINF was removed in NumPy 2.0
    eval_mean_rewards = []
    while total_samples < max_samples:
        tsid = replay_buffer.get_total_env_samples.remote()
        new_total_samples = ray.get(tsid)
        if (new_total_samples - total_samples >=
                config["timesteps_per_iteration"]):
            total_samples = new_total_samples
            print("Total samples:", total_samples)
            parameter_server.set_eval_weights.remote()
            eval_sampling_ids = []
            for eval_actor in eval_actor_ids:
                sid = eval_actor.sample.remote()
                eval_sampling_ids.append(sid)
            eval_rewards = ray.get(eval_sampling_ids)
            print("Evaluation rewards: {}".format(eval_rewards))
            eval_mean_reward = np.mean(eval_rewards)
            eval_mean_rewards.append(eval_mean_reward)
            print("Mean evaluation reward: {}".format(eval_mean_reward))
            tf.summary.scalar('Mean evaluation reward',
                              data=eval_mean_reward,
                              step=total_samples)
            if eval_mean_reward > best_eval_mean_reward:
                print("Model has improved! Saving the model!")
                best_eval_mean_reward = eval_mean_reward
                parameter_server.save_eval_weights.remote()

    print("Finishing the training.")
    for actor in training_actor_ids:
        actor.stop.remote()
    learner.stop.remote()