Example #1
def prepare(args, e_ix_ln, r_ix_ln, t_ix_ln):
    mdl = _model(args, e_ix_ln, r_ix_ln, t_ix_ln)

    # Learning-rate multiplier: Adasum with NCCL scales by local_size; plain
    # allreduce scales by the world size.
    if not args.tpu and args.adasum:
        lr_ml = hvd.local_size() if hvd.nccl_built() else 1
    else:
        lr_ml = _size(args)
    opt = torch.optim.Adam(mdl.parameters(),
                           lr=lr_ml * args.learning_rate,
                           weight_decay=args.weight_decay)

    st_e, bst_ls = _resume(args, mdl, opt) if args.resume != '' else (1, None)

    if not args.tpu:
        opt = hvd.DistributedOptimizer(
            opt,
            named_parameters=mdl.named_parameters(),
            compression=hvd.Compression.fp16
            if args.fp16 else hvd.Compression.none,
            op=hvd.Adasum if args.adasum else hvd.Average)
        hvd.broadcast_parameters(mdl.state_dict(), root_rank=0)

    lr_sc = torch.optim.lr_scheduler.StepLR(opt,
                                            step_size=args.learning_rate_step,
                                            gamma=args.learning_rate_gamma)
    if not args.tpu:
        hvd.broadcast_optimizer_state(opt, root_rank=0)

    ls_f = _loss_f(args).to(args.dvc)

    return mdl, opt, lr_sc, ls_f, st_e, bst_ls
Example #2
def verify_communication(use_horovod, world_size):
    """Verifies that the communication between workers works as expected
    It reduces a tensor of [1], and verifies that the reduced tensor is the same as the world size
    Args:
        use_horovod (bool): Use horovod for communication
        world_size (int): Distributed world size

    Raises:
        AssertionError: if the communication doesn't work as expected
    """
    if use_horovod:
        hvd.init()
        logger.info("Using horovod, rank = {}".format(hvd.rank()))
        tensor = torch.tensor(
            [1],
            device=torch.device("cuda" if dist.get_backend() ==
                                dist.Backend.NCCL else "cpu"),
        )
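        # hvd.allreduce defaults to averaging, so op=hvd.Sum is needed for the
        # result to equal the world size (each worker contributes 1).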
        res = hvd.allreduce(tensor, op=hvd.Sum)
        assert res[0] == world_size, "Communication is not working"
    else:
        logger.info("Using torch, rank={}".format(dist.get_rank()))
        tensor = torch.tensor(
            [1],
            device=torch.device("cuda" if dist.get_backend() ==
                                dist.Backend.NCCL else "cpu"),
        )
        dist.all_reduce(tensor, op=dist.ReduceOp.SUM)
        assert tensor[0] == world_size, "Communication is not working"
    if hvd:
        logger.info("NCCL Built={}, MPI Built={} , GLOO Built={}".format(
            hvd.nccl_built(), hvd.mpi_built(), hvd.gloo_built()))
Example #3
    def test_orthogonal(self):
        hvd.init()
        # TODO support non-MPI Adasum operation
        # Only do this test if there are GPUs available.
        if not hvd.mpi_enabled() or not torch.cuda.is_available():
            self.skipTest("MPI not enabled or no GPUs available")

        device = torch.device('cuda:{}'.format(hvd.local_rank()))
        np.random.seed(2)
        torch.manual_seed(2)
        size = hvd.size()
        local_size = hvd.local_size()
        rank = hvd.rank()

        for data_type in self.data_types:
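            # With NCCL, Adasum first averages within each node (hence the
            # division by local_size below); with MPI-only builds no extra
            # division is needed.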
            denominator = local_size if hvd.nccl_built() else 1
            all_Ns = [size * 20 - 17, size * 2 + 1, size + 2, 2**19]
            tensors = []
            all_qs = []
            for N in all_Ns:
                a = np.random.normal(0, 1, (N, size)).astype(np.float64)
                q, r = np.linalg.qr(a)
                q = q.astype(data_type)
                all_qs.append(q.astype(np.float64))
                tensors.append(q[:, hvd.rank()])

            tensors = list(
                map(lambda x: torch.from_numpy(x).to(device), tensors))

            handles = [
                hvd.allreduce_async(tensor, op=hvd.Adasum)
                for tensor in tensors
            ]

            reduced_tensors = [synchronize(h) for h in handles]

            expected = [np.sum(q, axis=1) / denominator for q in all_qs]
            all_comp = [
                self.are_close(data_type, e,
                               rt.cpu().numpy())
                for e, rt in zip(expected, reduced_tensors)
            ]
            if np.all(all_comp):
                print('Orthogonal test passed')
            else:
                for c, e, rt in zip(all_comp, expected, reduced_tensors):
                    if not c:
                        print('computed: ', rt)
                        print('expected: ', e)
                        print('off by: ', self.diff_ratio(e, rt.cpu().numpy()))
            assert np.all(all_comp)
Example #4
val_loader = DataLoader(val_dataset,
                        batch_size=val_batch_size,
                        collate_fn=val_dataset.collate,
                        sampler=val_sampler,
                        **kwargs)

# Horovod: print logs on the first worker.
verbose = 1 if hvd.rank() == 0 else 0

# ------------ preparation ------------
net = SCNN(resize_shape, pretrained=True)
lr_scaler = 1
if torch.cuda.is_available():
    net.cuda()
    # Horovod: Scale learning rate as per number of devices
    if hvd.nccl_built():
        lr_scaler = hvd.local_size()

net = torch.nn.DataParallel(net)
lr = exp_cfg['optim']['lr']
momentum = exp_cfg['optim']['momentum']
weight_decay = exp_cfg['optim']['weight_decay']
nesterov = exp_cfg['optim']['nesterov']

# Horovod: scale learning rate by lr_scaler.
optimizer = optim.SGD(net.parameters(),
                      lr=lr * lr_scaler,
                      momentum=momentum,
                      weight_decay=weight_decay,
                      nesterov=nesterov)
Example #5
    torch.backends.cudnn.deterministic = True

    torch.cuda.set_device(hvd.local_rank())

    config.rank = hvd.rank()
    config.world = hvd.size()

    if hvd.local_rank() == 0:
        utils.download_model(config)
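    # The broadcast doubles as a simple barrier so workers do not build the
    # model before the download has finished.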
    hvd.broadcast_object(0, root_rank=0)
    model = x.Model(config)

    start_time = time.time()
    print('Loading dataset')
    train_data, dev_data, test_data = utils.build_dataset(config)

    train_iter = utils.build_dataloader(train_data, config)
    dev_iter = utils.build_dataloader(dev_data, config)
    test_iter = utils.build_dataloader(test_data, config)

    time_dif = utils.get_time_dif(start_time)
    print("Prepare data time: ", time_dif)

    # Train, eval, test
    model = model.to(config.device)

    if not hvd.nccl_built():
        raise Exception("NCCL was not compiled in Horovod!")

    train.train(config, model, train_iter, dev_iter, test_iter)
Example #6
def main():
    args = parse_args()
    args.cuda = not args.no_cuda and torch.cuda.is_available()

    # Horovod: initialize library.
    hvd.init()
    torch.manual_seed(args.seed)
    local_rank = hvd.local_rank()
    world_size = hvd.size()

    if args.cuda:
        device = torch.device(f'cuda:{local_rank}')
        # Horovod: pin GPU to local rank.
        torch.cuda.set_device(device)
        torch.cuda.manual_seed(args.seed)

    # Horovod: limit # of CPU threads to be used per worker.
    torch.set_num_threads(1)

    kwargs = {'num_workers': 1, 'pin_memory': True} if args.cuda else {}
    # When supported, use 'forkserver' to spawn dataloader workers instead of 'fork' to prevent
    # issues with Infiniband implementations that are not fork-safe
    if (kwargs.get('num_workers', 0) > 0 and hasattr(mp, '_supports_context')
            and mp._supports_context
            and 'forkserver' in mp.get_all_start_methods()):
        kwargs['multiprocessing_context'] = 'forkserver'

    # Horovod: use DistributedSampler to partition the training data.
    data = prepare_datasets(args,
                            rank=local_rank,
                            num_workers=world_size,
                            data='mnist')
    model = Net()

    # By default, Adasum doesn't need scaling up learning rate.
    lr_scaler = hvd.size() if not args.use_adasum else 1

    if args.cuda:
        # Move model to GPU.
        model.cuda()
        # If using GPU Adasum allreduce, scale learning rate by local_size.
        if args.use_adasum and hvd.nccl_built():
            lr_scaler = hvd.local_size()

    # Horovod: scale learning rate by lr_scaler.
    optimizer = optim.SGD(model.parameters(),
                          lr=args.lr * lr_scaler,
                          momentum=args.momentum)

    # Horovod: (optional) compression algorithm.
    compression = (hvd.Compression.fp16
                   if args.fp16_allreduce else hvd.Compression.none)

    # Horovod: wrap optimizer with DistributedOptimizer.
    optimizer = hvd.DistributedOptimizer(
        optimizer,
        named_parameters=model.named_parameters(),
        compression=compression,
        op=hvd.Adasum if args.use_adasum else hvd.Average,
        gradient_predivide_factor=args.gradient_predivide_factor)

    # Horovod: broadcast parameters & optimizer state.
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(optimizer, root_rank=0)

    loss_fn = nn.CrossEntropyLoss()
    epoch_times = []

    for epoch in range(1, args.epochs + 1):
        t0 = time.time()
        train(epoch,
              data['training'],
              rank=local_rank,
              model=model,
              loss_fn=loss_fn,
              optimizer=optimizer,
              args=args,
              scaler=None)

        if epoch > 2:
            epoch_times.append(time.time() - t0)

        if epoch % 10 == 0:
            if hvd.local_rank() == 0:
                accuracy = evaluate(model=model,
                                    test_loader=data['testing'].loader)
                logger.log('-' * 75)
                logger.log(f'Epoch: {epoch}, Accuracy: {accuracy}')
                logger.log('-' * 75)

    if local_rank == 0:
        epoch_times_str = ', '.join(str(x) for x in epoch_times)
        logger.log('Epoch times:')
        logger.log(epoch_times_str)

        outdir = os.path.join(os.getcwd(), 'results_mnist',
                              f'size{world_size}')
        if not os.path.isdir(outdir):
            os.makedirs(outdir)

        modeldir = os.path.join(outdir, 'saved_models')
        modelfile = os.path.join(modeldir, 'hvd_model_mnist.pth')
        if not os.path.isdir(modeldir):
            os.makedirs(modeldir)

        logger.log(f'Saving model to: {modelfile}')
        torch.save(model.state_dict(), modelfile)

        args_file = os.path.join(outdir, f'args_size{world_size}.json')
        logger.log(f'Saving args to: {args_file}.')

        with open(args_file, 'at') as f:
            json.dump(args.__dict__, f, indent=4)

        times_file = os.path.join(outdir, f'epoch_times_size{world_size}.csv')
        logger.log(f'Saving epoch times to: {times_file}')
        with open(times_file, 'a') as f:
            f.write(epoch_times_str + '\n')
Example #7
def main_worker(args_):

    args_.cuda = not args_.no_cuda and torch.cuda.is_available()

    allreduce_batch_size = args_.batch_size * args_.batches_per_allreduce

    hvd.init()
    # NOTE: assumes env:// rendezvous (MASTER_ADDR / MASTER_PORT set by the launcher).
    torch.distributed.init_process_group('nccl',
                                         rank=hvd.rank(),
                                         world_size=hvd.size())

    if args_.cuda:
        # Horovod: pin GPU to local rank.
        torch.cuda.set_device(hvd.local_rank())
        print(f"this process's hvd rank = {hvd.local_rank()}")
        # torch.cuda.manual_seed(args_.seed)

    # cudnn.benchmark = True

    # # If set > 0, will resume training from a given checkpoint.
    # resume_from_epoch = 0
    # for try_epoch in range(args_.epochs, 0, -1):
    #     if os.path.exists(args_.checkpoint_format.format(epoch=try_epoch)):
    #         resume_from_epoch = try_epoch
    #         break
    #
    # # Horovod: broadcast resume_from_epoch from rank 0 (which will have
    # # checkpoints) to other ranks.
    # resume_from_epoch = hvd.broadcast(torch.tensor(resume_from_epoch), root_rank=0,
    #                                   name='resume_from_epoch').item()

    # # Horovod: print logs on the first worker.
    # verbose = 1 if hvd.rank() == 0 else 0
    #
    # # Horovod: write TensorBoard logs on first worker.
    # try:
    #     if LooseVersion(torch.__version__) >= LooseVersion('1.2.0'):
    #         from torch.utils.tensorboard import SummaryWriter
    #     else:
    #         from tensorboardX import SummaryWriter
    #     os.makedirs(os.path.join(args_.model_output_dir, 'logs'), exist_ok=True)
    #     log_writer = SummaryWriter(os.path.join(args_.model_output_dir, 'logs')) if hvd.rank() == 0 else None
    # except ImportError:
    #     log_writer = None

    ### MODEL CREATION ###

    # create model
    model1 = VQ_VAE(num_inputs=1, weight_matching=0., channel_var=np.ones((1,)))
    model2 = VQ_VAE(num_inputs=1, weight_matching=0.0005, channel_var=np.ones((1,)))

    model1.cuda()
    model2.cuda()

    model1 = torch.nn.parallel.DistributedDataParallel(model1)
    model2 = torch.nn.parallel.DistributedDataParallel(model2)
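    # Note: both models are wrapped in DistributedDataParallel, and the
    # optimizers below are additionally wrapped with hvd.DistributedOptimizer.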

    # By default, Adasum doesn't need scaling up learning rate.
    # For sum/average with gradient Accumulation: scale learning rate by batches_per_allreduce
    if args_.cuda and args_.use_adasum and hvd.nccl_built():
        # If using GPU Adasum allreduce, scale learning rate by local_size.
        lr_scaler = args_.batches_per_allreduce * hvd.local_size()
    elif not args_.use_adasum:
        lr_scaler = args_.batches_per_allreduce * hvd.size()
    else:
        lr_scaler = 1

    # Horovod: scale learning rate by the number of GPUs.
    optimizer1 = t.optim.Adam(model1.parameters(),
                              lr=(args_.base_lr * lr_scaler),
                              betas=(.9, .999))
    optimizer2 = t.optim.Adam(model2.parameters(),
                              lr=(args_.base_lr * lr_scaler),
                              betas=(.9, .999))

    # Horovod: (optional) compression algorithm.
    compression = hvd.Compression.fp16 if args_.fp16_allreduce else hvd.Compression.none

    # Horovod: wrap optimizer with DistributedOptimizer.
    optimizer1 = hvd.DistributedOptimizer(
        optimizer1, named_parameters=model1.named_parameters(),
        compression=compression,
        backward_passes_per_step=args_.batches_per_allreduce,
        op=hvd.Adasum if args_.use_adasum else hvd.Average)

    optimizer2 = hvd.DistributedOptimizer(
        optimizer2, named_parameters=model2.named_parameters(),
        compression=compression,
        backward_passes_per_step=args_.batches_per_allreduce,
        op=hvd.Adasum if args_.use_adasum else hvd.Average)

    # # Restore from a previous checkpoint, if initial_epoch is specified.
    # # Horovod: restore on the first worker which will broadcast weights to other workers.
    # if resume_from_epoch > 0 and hvd.rank() == 0:
    #     filepath = args.checkpoint_format.format(epoch=resume_from_epoch)
    #     checkpoint = torch.load(filepath)
    #     model.load_state_dict(checkpoint['model'])
    #     optimizer.load_state_dict(checkpoint['optimizer'])

    ### Settings ###
    model_output_dir = args_.model_output_dir
    project_dir = args_.project_dir

    ### Prepare Data ###
    log.info("LOADING FILES")

    # ======= load data using pytorch systems ========
    torch.set_num_threads(4)
    dataset = DatasetFolderWithPaths(
        root=project_dir+"/JUNE"+"/raw_patches",
        loader=npy_loader,
        extensions='.npy'
    )

    dataset_mask = DatasetFolderWithPaths(
        root=project_dir+"/JUNE"+"/raw_masks",
        loader=npy_loader,
        extensions='.npy'
    )

    relation_mat = np.load(os.path.join(project_dir, "JUNE", "raw_patches", "relation_mat.npy"), allow_pickle=True)

    # Horovod: use DistributedSampler to partition data among workers. Manually specify
    # `num_replicas=hvd.size()` and `rank=hvd.rank()`.
    train_sampler = torch.utils.data.distributed.DistributedSampler(
        dataset, num_replicas=hvd.size(), rank=hvd.rank())
    train_sampler_mask = torch.utils.data.distributed.DistributedSampler(
        dataset_mask, num_replicas=hvd.size(), rank=hvd.rank())

    os.makedirs(os.path.join(model_output_dir, "stage1"), exist_ok=True)
    os.makedirs(os.path.join(model_output_dir, "stage2"), exist_ok=True)

    # =========================================================
    # =========================================================
    log.info("TRAINING: STARTING STAGE 1")

    kwargs = {'num_workers': 4, 'pin_memory': True} if args_.cuda else {}
    train_loader = torch.utils.data.DataLoader(
        dataset, batch_size=allreduce_batch_size,
        sampler=train_sampler, **kwargs)
    train_mask_loader = torch.utils.data.DataLoader(
        dataset_mask, batch_size=allreduce_batch_size,
        sampler=train_sampler_mask, **kwargs)

    # Horovod: broadcast parameters & optimizer state.
    hvd.broadcast_parameters(model1.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(optimizer1, root_rank=0)

    output_dir = os.path.join(model_output_dir, "stage1")
    writer = SummaryWriter(output_dir)
    log.info(f"\ttensorboard logs written to {output_dir}")

    for epoch in range(args_.stage1_epochs):
        model1.train()
        train_sampler.set_epoch(epoch)

        mean_loss = train(model1,
                          train_loader,
                          optimizer1,
                          # relation_mat=relation_mat,
                          mask_loader=train_mask_loader,
                          args_=args_
                          )

        for key, loss in mean_loss.items():
            mean_loss[key] = sum(loss) / len(loss) if len(loss) > 0 else -1.
            writer.add_scalar('Loss/' + key, mean_loss[key], epoch)
        writer.flush()
        log.info('\tepoch %d' % epoch)
        log.info('\t'.join(['{}:{:0.4f}  '.format(key, loss) for key, loss in mean_loss.items()]))

        # only master process should save checkpoints.
        if torch.distributed.get_rank() == 0:
            log.info(f'\t saving epoch {epoch}')
            t.save(model1.state_dict(), os.path.join(output_dir, 'model_epoch%d.pt' % epoch))

    writer.close()

    # =========================================================
    # =========================================================
    log.info("TRAINING: STARTING STAGE 2")

    # get the last saved epoch.  on IBM, use max(). on OSX use min()
    # s1_epochs = glob.glob(os.path.join(model_output_dir, "stage1", "/*"))
    s1_epochs = glob.glob(os.path.join(model_output_dir, "stage1") + '/*.pt')
    last_epoch = max(s1_epochs, key=os.path.getctime)
    log.info(f"\tloading last epoch = {last_epoch}")

    train_loader = torch.utils.data.DataLoader(dataset,
                                               batch_size=allreduce_batch_size,
                                               sampler=train_sampler)

    train_mask_loader = torch.utils.data.DataLoader(dataset_mask,
                                                    batch_size=allreduce_batch_size,
                                                    sampler=train_sampler_mask)

    # Horovod: broadcast parameters & optimizer state.
    hvd.broadcast_parameters(model2.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(optimizer2, root_rank=0)

    output_dir = os.path.join(model_output_dir, "stage2")
    writer = SummaryWriter(output_dir)
    log.info(f"\ttensorboard logs written to {output_dir}")

    model2.load_state_dict(t.load(last_epoch))
    for epoch in range(args_.stage2_epochs):
        model2.train()
        train_sampler.set_epoch(epoch)

        mean_loss = train(model2,
                          train_loader,
                          optimizer2,
                          # relation_mat=relation_mat,
                          mask_loader=train_mask_loader
                          )

        # shuffle samples ids at the end of the epoch
        # if shuffle_data:
        #     np.random.shuffle(sample_ids)
        for key, loss in mean_loss.items():
            mean_loss[key] = sum(loss) / len(loss) if len(loss) > 0 else -1.
            writer.add_scalar('Loss/' + key, mean_loss[key], epoch)
        writer.flush()
        log.info('\tepoch %d' % epoch)
        log.info('\t'.join(['{}:{:0.4f}  '.format(key, loss) for key, loss in mean_loss.items()]))

        if torch.distributed.get_rank() == 0:
            log.info(f'\t saving epoch {epoch}')
            t.save(model2.state_dict(), os.path.join(output_dir, 'model_epoch%d.pt' % epoch))
    writer.close()
Example #8
def main():
    global args, best_prec1, best_prec5
    args = parser.parse_args()
    args.cuda = not args.no_cuda and torch.cuda.is_available()

    #horovod initialize
    hvd.init()

    log = None

    if hvd.rank() == 0:
        log = SummaryWriter(log_dir=args.log_dir)
        print('The Training Model is %s' % args.arch)
    # Check the save_dir exists or not
    if not os.path.exists(args.save_dir):
        os.makedirs(args.save_dir)

    if args.cuda:
        torch.cuda.set_device(hvd.local_rank())

    normalize = transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))

    # Horovod: limit # of CPU threads to be used per worker.
    torch.set_num_threads(1)

    kwargs = {'num_workers': 1, 'pin_memory': True} if args.cuda else {}
    # When supported, use 'forkserver' to spawn dataloader workers instead of 'fork' to prevent
    # issues with Infiniband implementations that are not fork-safe
    if (kwargs.get('num_workers', 0) > 0 and hasattr(mp, '_supports_context') and
            mp._supports_context and 'forkserver' in mp.get_all_start_methods()):
        kwargs['multiprocessing_context'] = 'forkserver'

    train_dataset = datasets.CIFAR10('data-%d'%hvd.local_rank(), train=True, transform=transforms.Compose([
                transforms.RandomCrop(32, padding=4),
                transforms.RandomHorizontalFlip(),
                transforms.ToTensor(),
                normalize,
            ]), download=True)

    val_dataset = datasets.CIFAR10('data-%d'%hvd.local_rank(), train=False,transform=transforms.Compose([
            transforms.ToTensor(),
            normalize,
            ]))

    #Horovod Partition the training data
    train_sampler = torch.utils.data.distributed.DistributedSampler(
        train_dataset, num_replicas=hvd.size(), rank=hvd.rank())

    val_sampler = torch.utils.data.distributed.DistributedSampler(
        val_dataset, num_replicas=hvd.size(), rank=hvd.rank())

    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.batch_size, sampler=train_sampler, **kwargs)

    val_loader = torch.utils.data.DataLoader(
        val_dataset, batch_size=args.batch_size, sampler=val_sampler, **kwargs)

    # model = torch.nn.DataParallel(resnet.__dict__[args.arch]())
    if args.arch in resnet.__dict__:
        model = resnet.__dict__[args.arch]()
    elif args.arch == 'alexnet':
        model = models.AlexNet()
    elif args.arch == 'vgg16':
        model = models.VGG16()
    else:
        raise ValueError("Unknown architecture: '{}'".format(args.arch))


    if hvd.rank() == 0:
        numel = sum(p.numel() for p in model.parameters())
        print('Total params: {:d}'.format(numel))

    lr_scaler = hvd.size()

    if args.cuda:
        model.cuda()
        if args.use_adasum and hvd.nccl_built():
            lr_scaler = hvd.local_size()


    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_prec1 = checkpoint['best_prec1']
            model.load_state_dict(checkpoint['state_dict'])
            print("=> loaded checkpoint '{}' (epoch {})"
                  .format(args.evaluate, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda()

    if args.half:
        model.half()
        criterion.half()

    base_optimizer = torch.optim.SGD(model.parameters(), args.lr * lr_scaler,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)


    # lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(base_optimizer,
    #                                 milestones=[100, 150], last_epoch=args.start_epoch - 1)

    # Horovod: broadcast parameters & optimizer state.
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(base_optimizer, root_rank=0)

    #Compression
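    # The Allreduce / Allgather / *Compressor / *Memory wrappers used here appear
    # to come from an external gradient-compression library rather than from the
    # hvd.Compression helpers used in the other examples.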
    # compression = Allgather(MGCCompressor(0.05), ResidualMemory(), hvd.size())
    # compression = Allgather(TernGradCompressor(), ResidualMemory(), hvd.size())
    compression = Allreduce(NoneCompressor(), NoneMemory())
    # compression = Allgather(DgcCompressor(0.01), ResidualMemory(), hvd.size())
    # compression = Allgather(LowQSGDCompressor(), ResidualMemory(), hvd.size())

    # Horovod: wrap optimizer with DistributedOptimizer.
    optimizer = hvd.DistributedOptimizer(base_optimizer, compression, named_parameters=model.named_parameters())

    if hvd.rank() == 0:
        log.add_scalar('train/accuracy', 0., 0)
        log.add_scalar('test/accuracy', 0., 0)

    for epoch in range(args.start_epoch + 1, args.epochs + 1):

        adjust_learning_rate(optimizer, epoch, size=lr_scaler)

        if hvd.rank() == 0:
            print('current lr {:.5e}'.format(optimizer.param_groups[0]['lr']))

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch, log=log)

        # evaluate on validation set
        prec1, prec5 = validate(val_loader, model, criterion, epoch, log=log)

        # remember best prec@1 and save checkpoint
        best_prec1 = max(prec1, best_prec1)
        best_prec5 = max(prec5, best_prec5)

        if hvd.rank() == 0:
            print('Best Prec@1:{:.2f}%, Prec@5:{:.2f}%\n'.format(best_prec1, best_prec5))

        # if epoch > 0 and epoch % args.save_every == 0:
        #     save_checkpoint({
        #         'epoch': epoch + 1,
        #         'state_dict': model.state_dict(),
        #         'best_prec1': best_prec1,
        #     }, is_best, filename=os.path.join(args.save_dir, 'checkpoint.th'))
        #
        # save_checkpoint({
        #     'state_dict': model.state_dict(),
        #     'best_prec1': best_prec1,
        # }, is_best, filename=os.path.join(args.save_dir, 'model.th'))

    if hvd.rank() == 0:
        log.close()
Example #9
    val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=args.val_batch_size,
                                             sampler=val_sampler, **kwargs)


    # Set up standard VGG16 model.
    model = models.vgg16()

    # By default, Adasum doesn't need scaling up learning rate.
    # For sum/average with gradient Accumulation: scale learning rate by batches_per_allreduce
    lr_scaler = args.batches_per_allreduce * hvd.size() if not args.use_adasum else 1

    if args.cuda:
        # Move model to GPU.
        model.cuda()
        # If using GPU Adasum allreduce, scale learning rate by local_size.
        if args.use_adasum and hvd.nccl_built():
            lr_scaler = args.batches_per_allreduce * hvd.local_size()

    # Horovod: scale learning rate by the number of GPUs.
    optimizer = optim.SGD(model.parameters(),
                          lr=(args.base_lr *
                              lr_scaler),
                          momentum=args.momentum, weight_decay=args.wd)

    # Horovod: (optional) compression algorithm.
    compression = hvd.Compression.fp16 if args.fp16_allreduce else hvd.Compression.none

    # Horovod: wrap optimizer with DistributedOptimizer.
    optimizer = hvd.DistributedOptimizer(
        optimizer, named_parameters=model.named_parameters(),
        compression=compression,
        # The source snippet is cut off here; the remaining arguments are
        # assumed, following the pattern used in the other examples.
        backward_passes_per_step=args.batches_per_allreduce,
        op=hvd.Adasum if args.use_adasum else hvd.Average)
Example #10
    # The source snippet begins mid-statement; this opening line and the
    # test_loader / test_dataset names are assumed for readability.
    test_loader = torch.utils.data.DataLoader(test_dataset,
                                              batch_size=test_batch_size,
                                              sampler=test_sampler,
                                              **kwargs)

    model = Net()

    ##### HOROVOD #####
    # By default, Adasum doesn't need scaling up learning rate.
    lr_scaler = hvd.size() if not use_adasum else 1

    if cuda:
        # Move model to GPU.
        model.cuda()
        # If using GPU Adasum allreduce, scale learning rate by local_size.
        ##### TODO:Need argument #####
        if use_adasum and hvd.nccl_built():
            lr_scaler = hvd.local_size()

    # Horovod: scale learning rate by lr_scaler.
    ##### TODO:Need argument #####
    '''
    optimizer = optim.SGD(model.parameters(), lr=args.lr * lr_scaler,
                          momentum=args.momentum)'''
    optimizer = optim.SGD(model.parameters(),
                          lr=lr * lr_scaler,
                          momentum=momentum)

    ##### HOROVOD #####
    # Horovod: broadcast parameters & optimizer state.
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(optimizer, root_rank=0)
Example #11
def train(args):
    hvd.init()

    print("Hello from local_rank {}/{}, rank {}/{}".format(
        hvd.local_rank(), hvd.local_size(), hvd.rank(), hvd.size()))

    verbose = hvd.rank() == 0

    if verbose:
        print('Using PyTorch version:', torch.__version__)
        print('Horovod version: {}, CUDA: {}, ROCM: {}, NCCL: {}, MPI: {}'.format(
            hvd_version,
            hvd.cuda_built(),
            hvd.rocm_built(),
            hvd.nccl_built(),
            hvd.mpi_built()))
        print(torch.__config__.show())

    cudnn.benchmark = True

    torch.cuda.set_device(hvd.local_rank())
    world_size = hvd.size()

    # Set up standard model.
    if verbose:
        print('Using {} model'.format(args.model))
    model = getattr(models, args.model)()
    model = model.cuda()

    # import torch.multiprocessing as mp
    # # # assert "forkserver" in mp.get_all_start_methods()
    # mp.set_start_method("forkserver")

    lr_scaler = hvd.size()

    criterion = nn.CrossEntropyLoss().cuda()
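    # Horovod: scale the base learning rate (1e-4) by the number of workers.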
    optimizer = torch.optim.SGD(model.parameters(), 1e-4 * lr_scaler)

    optimizer = hvd.DistributedOptimizer(optimizer,
                                         named_parameters=model.named_parameters())
    train_dataset = dataset_from_datadir(args.datadir, verbose)
    train_sampler = DistributedSampler(train_dataset,
                                       num_replicas=hvd.size(),
                                       rank=hvd.rank())
    train_loader = DataLoader(dataset=train_dataset, batch_size=args.batchsize,
                              shuffle=False, num_workers=args.workers,
                              pin_memory=False, sampler=train_sampler,
                              multiprocessing_context='forkserver')

    hvd.broadcast_parameters(model.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(optimizer, root_rank=0)

    total_step = args.steps if args.steps is not None else len(train_loader)

    # For each block of printed steps
    last_start = datetime.now()
    last_images = 0

    # For final average
    avg_images = 0
    avg_start = None
    tot_steps = 0

    for epoch in range(args.epochs):
        for i, (images, labels) in enumerate(train_loader):
            images = images.cuda(non_blocking=True)
            labels = labels.cuda(non_blocking=True)

            outputs = model(images)
            loss = criterion(outputs, labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            li = len(images)
            last_images += li

            tot_steps += 1
            if tot_steps == args.warmup_steps:
                avg_start = datetime.now()
            elif tot_steps > args.warmup_steps:
                avg_images += li

            if (i + 1) % args.print_steps == 0 and verbose:
                now = datetime.now()
                last_secs = (now-last_start).total_seconds()

                print(f'Epoch [{epoch+1}/{args.epochs}], Step [{i+1}/{total_step}], '
                      f'Loss: {loss.item():.4f}, '
                      f'Images/sec: {last_images*world_size/last_secs:.2f} '
                      f'(last {args.print_steps} steps)')

                last_start = now
                last_images = 0

            if args.steps is not None and i >= args.steps:
                break
    if verbose:
        dur = datetime.now() - avg_start
        print(f"Training completed in: {dur}")
        print(f"Images/sec: {avg_images*world_size/dur.total_seconds():.2f} "
              f"(average, skipping {args.warmup_steps} warmup steps)")
Example #12
    def fit(self, input_data=None, input_labels=None, loss="", opt=""):
        if self.use_model: # use_model
            # Check Input Data
            if input_data is None or input_labels is None:
                return
            if self.model_onnx:
                print("Cannot use onnx type to fit model")
                return
            # Make TensorDataset and DataLoader for PyTorch
            train_dataset = TensorDataset(input_data, input_labels)
            # Handling Input of Loss Function
            loss_func = F.nll_loss
            if loss == "nll_loss":
                loss_func = F.nll_loss
            elif loss == "mse_loss":
                loss_func = F.mse_loss
            elif loss == "cross_entropy":
                loss_func = F.cross_entropy
            elif loss == "l1_loss":
                loss_func = F.l1_loss
            if self.cuda:
                ##### HOROVOD #####
                train_sampler = torch.utils.data.distributed.DistributedSampler(
                               train_dataset, num_replicas=hvd.size(), rank=hvd.rank())
                kwargs = {'num_workers': 1, 'pin_memory': True}
                # When supported, use 'forkserver' to spawn dataloader workers instead of 'fork' to prevent
                # issues with Infiniband implementations that are not fork-safe
                if (kwargs.get('num_workers', 0) > 0 and hasattr(mp, '_supports_context') and
                        mp._supports_context and 'forkserver' in mp.get_all_start_methods()):
                    kwargs['multiprocessing_context'] = 'forkserver'
                train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=self.batch_size,
                                                           sampler=train_sampler, **kwargs)
                # Set Optimizer
                if self.use_optimizer:
                    optimizer = self.optimizer
                else:
                    # By default, Adasum doesn't need scaling up the learning rate;
                    # otherwise scale it by the number of workers.
                    lr_scaler = hvd.size() if not args.use_adasum else 1
                    # If using GPU Adasum allreduce, scale learning rate by local_size.
                    if args.use_adasum and hvd.nccl_built():
                        lr_scaler = hvd.local_size()
                    if opt == "SGD":
                        optimizer = optim.SGD(self.model.parameters(), lr=self.lr * lr_scaler,
                                              momentum=self.momentum)
                    else:
                        optimizer = optim.SGD(self.model.parameters(), lr=self.lr * lr_scaler,
                                              momentum=self.momentum)
                
                # Horovod: broadcast parameters & optimizer state.
                hvd.broadcast_parameters(self.model.state_dict(), root_rank=0)
                hvd.broadcast_optimizer_state(optimizer, root_rank=0)
                
                # Horovod: (optional) compression algorithm.
                #compression = hvd.Compression.fp16 if args.fp16_allreduce else hvd.Compression.none
                compression = hvd.Compression.none
                # Horovod: wrap optimizer with DistributedOptimizer.
                optimizer = hvd.DistributedOptimizer(optimizer,
                                                     named_parameters=self.model.named_parameters(),
                                                     compression=compression,
                                                     op=hvd.Average)
                                                     #op=hvd.Adasum if args.use_adasum else hvd.Average)
            else:                
                train_loader = DataLoader(train_dataset, batch_size=self.batch_size)
                if self.use_optimizer:
                    optimizer = self.optimizer
                else:
                    if opt == "SGD":
                        optimizer = optim.SGD(self.model.parameters(), lr=self.lr,
                                              momentum=self.momentum)
                    else:
                        optimizer = optim.SGD(self.model.parameters(), lr=self.lr,
                                              momentum=self.momentum)
            
            if self.debug:
                # Print model's state_dict
                print("Model's state_dict:")
                for param_tensor in self.model.state_dict():
                    print(param_tensor, "\t", self.model.state_dict()[param_tensor].size())

                # Print optimizer's state_dict
                print("Optimizer's state_dict:")
                for var_name in optimizer.state_dict():
                    print(var_name, "\t", optimizer.state_dict()[var_name])
                
            losses = []
            nums = []
            accs = []
            for epoch in range(self.epochs):
                self.model.train()
                # Horovod: set epoch to sampler for shuffling.
                if self.cuda:
                    train_sampler.set_epoch(epoch)
                
                for batch_idx, (data, target) in enumerate(train_loader):
                    if self.cuda:
                        data, target = data.cuda(), target.cuda()                    
                    optimizer.zero_grad()
                    output = self.model(data)
                    loss = loss_func(output, target)                    
                    acc = self.accuracy(output,target)
                    loss.backward()
                    optimizer.step()
                    if batch_idx % self.log_interval == 0:
                        if self.cuda:
                            if hvd.rank() == 0:
                                print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}\tAccuracy: {}'.format(
                                      epoch+1, batch_idx * len(data), len(train_sampler),
                                      100. * batch_idx / len(train_loader), loss.item(), acc*100))
                        else:
                            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}\tAccuracy: {}'.format(
                                  epoch+1, batch_idx * len(data), len(train_loader.dataset),
                                  100. * batch_idx / len(train_loader), loss.item(), acc*100))                                  
Example #13
def pretrain(
    run_name: str,
    #
    # Data
    train_filepath: str = DEFAULT_CSNJS_TRAIN_FILEPATH,
    spm_filepath: str = DEFAULT_SPM_UNIGRAM_FILEPATH,
    num_workers=1,
    limit_dataset_size=-1,
    max_length=1024,
    subword_regularization_alpha: float = 0,
    program_mode="contrastive",
    loss_mode="infonce",  # infonce, mlm, or hybrid
    min_alternatives=1,
    #
    # Model
    resume_path: str = "",
    encoder_type: str = "transformer",
    lstm_project_mode: str = "hidden",
    n_encoder_layers: int = 6,
    d_model: int = 512,
    n_head: int = 8,
    #
    # Optimization
    num_epochs: int = 100,
    save_every: int = 1,
    batch_size: int = 256,
    lr: float = 8e-4,
    weight_decay: float = 0,
    adam_betas=(0.9, 0.98),
    warmup_steps: int = 5000,
    num_steps: int = 600000,
    #
    # Horovod
    use_adasum: bool = False,
    fp16_allreduce: bool = False,
    gradient_predivide_factor: float = 1.0,
    #
    # Computational
    use_cuda: bool = True,
    seed: int = 0,
):
    hvd.init()

    logger.info("L:", n_encoder_layers, type(n_encoder_layers))
    logger.info("H:", d_model, type(d_model))
    logger.info("A:", n_head, type(n_head))
    run_name = str(run_name)  # support numerical run ids
    slurm_job_id = os.environ.get("SLURM_JOB_ID")
    slurm_job_hostname = os.environ.get("SLURM_JOB_NODELIST")
    config = locals()
    logger.info(f"Config = \n{config}")
    logger.info("Training configuration: {}".format(config))
    logger.info(
        f"CUDA_VISIBLE_DEVICES = '{os.environ.get('CUDA_VISIBLE_DEVICES')}'")
    logger.info(f"CUDA_DEVICE_ORDER = '{os.environ.get('CUDA_DEVICE_ORDER')}'")

    assert program_mode in ["contrastive", "identity", "augmentation"]
    assert loss_mode == "infonce" or loss_mode == "mlm" or loss_mode == "hybrid"
    assert not (program_mode == "contrastive" and loss_mode == "mlm")
    assert not (program_mode != "contrastive" and
                (loss_mode == "hybrid" or loss_mode == "infonce"))
    assert not use_cuda or torch.cuda.is_available()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    run_dir = RUN_DIR / "{}_{}".format(run_name, int(time.time()))
    run_dir.mkdir(exist_ok=True, parents=True)
    config["run_dir"] = str(run_dir.resolve())
    logger.add(str((run_dir / "train.log").resolve()))
    logger.info(f"Saving logs, model checkpoints to {run_dir}")

    # Create training dataset and dataloader
    assert train_filepath.endswith(".pickle") or train_filepath.endswith(".gz")

    # Setup distributed
    gpu = hvd.local_rank()
    ngpus_per_node = 1
    chief_node = gpu == 0
    assert gpu is not None

    if chief_node:
        if config["loss_mode"] == "mlm":
            project = "bert-pretrain"
        elif config["loss_mode"] == "infonce":
            project = "moco-pretrain"
        elif config["loss_mode"] == "hybrid":
            project = "hybrid"
        wandb.init(name=config["run_name"],
                   config=config,
                   job_type="training",
                   project=project,
                   entity="ml4code")

    logger.info("Use GPU: {} for training".format(gpu))
    torch.cuda.set_device(gpu)
    # Horovod: limit # of CPU threads to be used per worker.
    torch.set_num_threads(1)

    kwargs = {"num_workers": 1, "pin_memory": True}
    # When supported, use 'forkserver' to spawn dataloader workers instead of 'fork' to prevent
    # issues with Infiniband implementations that are not fork-safe
    if (kwargs.get("num_workers", 0) > 0 and hasattr(mp, "_supports_context")
            and mp._supports_context
            and "forkserver" in mp.get_all_start_methods()):
        kwargs["multiprocessing_context"] = "forkserver"

    sp = spm.SentencePieceProcessor()
    sp.Load(config["spm_filepath"])
    pad_id = sp.PieceToId("[PAD]")
    logger.info("pad_id {}", pad_id)
    assert pad_id == 0  # hard coded in pad_collate
    mask_id = sp.PieceToId("[MASK]")

    # Create model
    if config["loss_mode"] == "infonce":
        # TODO(ajay): Support n_head argument, check how d_model is being used (why not in encoder config dict?)
        model = CodeMoCo(
            sp.GetPieceSize(),
            pad_id=pad_id,
            d_model=config["d_model"],
            encoder_config=dict(
                encoder_type=config["encoder_type"],
                lstm_project_mode=config["lstm_project_mode"],
                n_encoder_layers=config["n_encoder_layers"],
            ),
        )
        logger.info(
            f"Created CodeMoCo model with {count_parameters(model)} params")
    elif config["loss_mode"] == "mlm":
        model = CodeMLM(
            sp.GetPieceSize(),
            pad_id=pad_id,
            encoder_type=config["encoder_type"],
            n_encoder_layers=config["n_encoder_layers"],
            d_model=config["d_model"],
            n_head=config["n_head"],
            d_ff=4 * config["d_model"],
        )
        logger.info(
            f"Created CodeMLM model with {count_parameters(model)} params")
    elif config["loss_mode"] == "hybrid":
        model = CodeContrastiveMLM(
            sp.GetPieceSize(),
            pad_id=pad_id,
            n_encoder_layers=config["n_encoder_layers"],
            d_model=config["d_model"],
            n_head=config["n_head"],
            d_ff=4 * config["d_model"],
            use_horovod=True,
        )
        logger.info(
            f"Created CodeContrastiveMLM model with {count_parameters(model)} params"
        )
    else:
        raise ValueError(f"Bad loss mode {config['loss_mode']}")

    assert config["use_cuda"]
    model.cuda()
    # When using a single GPU per process and per
    # DistributedDataParallel, we need to divide the batch size
    # ourselves based on the total number of GPUs we have
    # config["batch_size"] = int(config["batch_size"] / ngpus_per_node)
    # config["num_workers"] = int((config["num_workers"] + ngpus_per_node - 1) / ngpus_per_node)
    # model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu])

    # define optimizer
    # By default, Adasum doesn't need scaling up learning rate.
    lr_scaler = hvd.size() if not config["use_adasum"] else 1
    # If using GPU Adasum allreduce, scale learning rate by local_size.
    if config["use_adasum"] and hvd.nccl_built():
        lr_scaler = hvd.local_size()
    # Horovod: scale learning rate by lr_scaler.
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=config["lr"] * lr_scaler,
                                 betas=config["adam_betas"],
                                 eps=1e-6,
                                 weight_decay=config["weight_decay"])
    sched = get_linear_schedule_with_warmup(optimizer, config["warmup_steps"],
                                            config["num_steps"])

    # Horovod: broadcast parameters & optimizer state.
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(optimizer, root_rank=0)

    # Horovod: (optional) compression algorithm.
    compression = hvd.Compression.fp16 if config[
        "fp16_allreduce"] else hvd.Compression.none

    # Horovod: wrap optimizer with DistributedOptimizer.
    optimizer = hvd.DistributedOptimizer(
        optimizer,
        named_parameters=model.named_parameters(),
        compression=compression,
        op=hvd.Adasum if config["use_adasum"] else hvd.Average,
        gradient_predivide_factor=config["gradient_predivide_factor"],
    )

    # Load checkpoint
    if config["resume_path"]:
        logger.info(f"Loading parameters from {config['resume_path']}")
        # configure map_location properly
        map_location = {"cuda:%d" % 0: "cuda:%d" % hvd.rank()}
        checkpoint = torch.load(config["resume_path"],
                                map_location=map_location)
        model.load_state_dict(checkpoint["model_state_dict"])
        optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
        start_epoch = checkpoint["epoch"] + 1
        start_global_step = checkpoint["global_step"]
    else:
        start_epoch = 1
        start_global_step = 0

    # Setup data
    train_dataset = PrecomputedDataset(
        config["train_filepath"],
        min_alternatives=config["min_alternatives"],
        program_mode=config["program_mode"],
        limit_size=config["limit_dataset_size"],
        sp=sp,
        subword_regularization_alpha=config["subword_regularization_alpha"],
        max_length=config["max_length"],
    )
    train_sampler = torch.utils.data.distributed.DistributedSampler(
        train_dataset, num_replicas=hvd.size(), rank=hvd.rank())
    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config["batch_size"],
        shuffle=False,
        collate_fn=pad_collate_contrastive
        if config["program_mode"] == "contrastive" else pad_collate,
        drop_last=True,
        sampler=train_sampler,
        **kwargs,
    )

    # Train
    global_step = 0
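    # When resuming, fast-forward the LR schedule to the saved global step.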
    while global_step < start_global_step:
        sched.step()
        global_step += 1
    for epoch in tqdm.trange(start_epoch,
                             config["num_epochs"] + 1,
                             desc="training",
                             unit="epoch",
                             leave=False):
        logger.info(f"Starting epoch {epoch}\n")
        train_sampler.set_epoch(epoch)
        model.train()
        pbar = tqdm.tqdm(train_loader, desc=f"epoch {epoch}")
        for batch in pbar:
            optimizer.zero_grad()
            if config["loss_mode"] == "infonce":
                train_metrics = training_step(model,
                                              batch,
                                              use_cuda=config["use_cuda"])
            elif config["loss_mode"] == "mlm":
                # randomly replace tokens with tokens drawn from the vocabulary (ids 8..7999)
                train_metrics = training_step_mlm(sp,
                                                  model,
                                                  batch,
                                                  pad_id=pad_id,
                                                  mask_id=mask_id,
                                                  vocab_start_idx=8,
                                                  vocab_end_idx=7999,
                                                  use_cuda=config["use_cuda"])
            elif config["loss_mode"] == "hybrid":
                train_metrics = training_step_hybrid(
                    sp,
                    model,
                    batch,
                    mask_id=mask_id,
                    pad_id=pad_id,
                    vocab_start_idx=0,
                    vocab_end_idx=7999,
                    use_cuda=config["use_cuda"])
            else:
                raise ValueError("Bad loss type")
            loss = train_metrics["loss"]
            loss.backward()
            optimizer.step()
            sched.step()

            global_step += 1
            pbar.set_description(
                f"epoch {epoch} gpu {gpu} step {global_step} loss {loss.item():.4f}"
            )

            if chief_node:
                wandb.log(dict(lr=sched.get_last_lr()[0]))
                wandb.log(dict(epoch=epoch, **train_metrics["log"]),
                          step=global_step)

                # Save checkpoint
                if config["save_every"] and global_step % config[
                        "save_every"] == 0:
                    checkpoint = {
                        "model_state_dict": model.state_dict(),
                        "optimizer_state_dict": optimizer.state_dict(),
                        "epoch": epoch,
                        "global_step": global_step,
                        "config": config,
                    }
                    model_file = os.path.join(
                        config["run_dir"],
                        f"ckpt_pretrain_ep{epoch:04d}_step{global_step:07d}.pth"
                    )
                    logger.info(f"Saving checkpoint to {model_file}...")
                    torch.save(checkpoint, model_file)
                    wandb.save(str(model_file))
                    logger.info("Done.")
Example #14
    def hvd_param_scaling(self):
        if hvd.nccl_built():
            self.lr_scaler = hvd.local_size()
            print('Rescale lr = {} * lr'.format(self.lr_scaler))
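
The learning-rate scaling logic repeated across these examples (e.g. Examples #6, #7, #9 and #13) follows a single pattern. Below is a minimal consolidated sketch, assuming standard Horovod for PyTorch; the function name lr_scale_factor and the use_adasum, cuda and batches_per_allreduce arguments are illustrative, not taken from any one example.

import horovod.torch as hvd


def lr_scale_factor(use_adasum, cuda, batches_per_allreduce=1):
    """Factor by which to multiply the base learning rate (hvd.init() must
    already have been called).

    - Plain allreduce (average/sum): scale by the world size, times the number
      of gradient-accumulation batches per allreduce.
    - Adasum on CPU / MPI: no scaling needed.
    - Adasum with NCCL on GPU: scale by local_size, since Horovod averages
      within each node before applying Adasum across nodes.
    """
    if not use_adasum:
        return batches_per_allreduce * hvd.size()
    if cuda and hvd.nccl_built():
        return batches_per_allreduce * hvd.local_size()
    return 1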
Example #15
def main(_):
    """ Basic Configurations """
    ssl_set_unverified_context()
    FLAGS.CUDA = FLAGS.CUDA and torch.cuda.is_available()
    allreduce_batch_size = FLAGS.BATCH_SIZE * FLAGS.BATCHES_PER_ALLREDUCE
    hvd.init()
    np.random.seed(FLAGS.SEED)
    torch.manual_seed(FLAGS.SEED)

    if FLAGS.CUDA:
        # Horovod: pin GPU to local rank.
        torch.cuda.set_device(hvd.local_rank())
        torch.cuda.manual_seed(FLAGS.SEED)

    cudnn.benchmark = True

    # Horovod: print logs on the first worker.
    verbose = 1 if hvd.rank() == 0 else 0

    # Select subdirectory as datetime if flagfile is not specified
    subdir = datetime.datetime.now().strftime('%Y%m%d-%H%M%S')
    # If sys.argv has flagfile argument, set subdir as filename of flagfile
    parser = argparse.ArgumentParser()
    parser.add_argument('--flagfile')
    for flag in FLAGS.flag_values_dict().keys():
        if flag.isupper():
            parser.add_argument('--' + flag)
    args = parser.parse_args()
    if args.flagfile is not None:
        flagfile = args.flagfile
        subdir = os.path.splitext(os.path.basename(flagfile))[0]
        subdir = os.path.join(subdir, '-'.join(FLAGS.BLOCK_ARGS))
        script_name = [os.path.splitext(os.path.basename(arg))[0]
                       for arg in sys.argv if arg.endswith('.py')]
        if len(script_name) > 0:
            subdir = subdir.replace('train', script_name[0])

    # Horovod: write TensorBoard logs on first worker.
    if hvd.rank() == 0:
        fileroot = get_real_path(FLAGS.TENSORBOARD_DIR)
        train_tensorboard_dir = os.path.join(fileroot, subdir, 'train')
        valid_tensorboard_dir = os.path.join(fileroot, subdir, 'valid')
        train_summary_writer = tensorboard.SummaryWriter(train_tensorboard_dir)
        valid_summary_writer = tensorboard.SummaryWriter(valid_tensorboard_dir)


    """ Prepare Dataset """
    # Horovod: limit # of CPU threads to be used per worker.
    torch.set_num_threads(FLAGS.NUM_THREADS)

    kwargs = {'num_workers': FLAGS.NUM_WORKERS, 'pin_memory': True} if FLAGS.CUDA else {}

    dataset_module = 'lib.data.datasets.' + FLAGS.DATASET_NAME.lower()
    dataset = importlib.import_module(dataset_module).__getattribute__(FLAGS.DATASET_NAME)
    train_dataset = dataset('train', data_dir=FLAGS.DATASET_DIR,
                            mean=FLAGS.DATA_MEAN, std=FLAGS.DATA_STD)
    valid_dataset = dataset('valid', data_dir=FLAGS.DATASET_DIR,
                            mean=FLAGS.DATA_MEAN, std=FLAGS.DATA_STD)

    # Horovod: use DistributedSampler to partition data among workers. Manually specify
    # `num_replicas=hvd.size()` and `rank=hvd.rank()`.
    train_sampler = distributed.DistributedSampler(train_dataset,
                                                   num_replicas=hvd.size(),
                                                   rank=hvd.rank())
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=allreduce_batch_size,
                                               sampler=train_sampler, **kwargs)

    valid_sampler = distributed.DistributedSampler(valid_dataset,
                                                   num_replicas=hvd.size(),
                                                   rank=hvd.rank())
    valid_loader = torch.utils.data.DataLoader(valid_dataset,
                                               batch_size=FLAGS.VALID_BATCH_SIZE,
                                               sampler=valid_sampler, **kwargs)


    """ Build Model """
    # Set up a model.
    model_module = 'models.' + FLAGS.MODEL_NAME.lower()
    net = importlib.import_module(model_module).__getattribute__(FLAGS.MODEL_NAME)
    model = net(num_classes=len(train_dataset.classes), block_args=FLAGS.BLOCK_ARGS)

    # By default, Adasum doesn't need scaling up learning rate.
    # For sum/average with gradient Accumulation: scale learning rate by batches_per_allreduce
    lr_scaler = FLAGS.BATCHES_PER_ALLREDUCE * hvd.size() if not FLAGS.USE_ADASUM else 1

    if FLAGS.CUDA:
        # Move model to GPU.
        model.cuda()
        # If using GPU Adasum allreduce, scale learning rate by local_size.
        if FLAGS.USE_ADASUM and hvd.nccl_built():
            lr_scaler = FLAGS.BATCHES_PER_ALLREDUCE * hvd.local_size()


    """ Optimizer """
    # Horovod: scale learning rate by the number of GPUs.
    optimizer = optim.SGD(model.parameters(), lr=(FLAGS.BASE_LR * lr_scaler),
                          momentum=FLAGS.MOMENTUM, weight_decay=FLAGS.WEIGHT_DECAY,
                          nesterov=FLAGS.NESTEROV)

    # Horovod: (optional) compression algorithm.
    compression = hvd.Compression.fp16 if FLAGS.FP16_ALLREDUCE else hvd.Compression.none

    # Horovod: wrap optimizer with DistributedOptimizer.
    optimizer = hvd.DistributedOptimizer(
        optimizer, named_parameters=model.named_parameters(),
        compression=compression,
        backward_passes_per_step=FLAGS.BATCHES_PER_ALLREDUCE)
        # TODO: hvd.Adasum is not supported yet(0.18.2)
        #backward_passes_per_step = FLAGS.BATCHES_PER_ALLREDUCE,
        #op = hvd.Adasum if FLAGS.USE_ADASUM else hvd.Average)


    """ Restore & Broadcast """
    # If set > 0, will resume training from a given checkpoint.
    resume_from_epoch = 0
    fileroot = get_real_path(FLAGS.CHECKPOINT_DIR)
    for try_epoch in range(FLAGS.EPOCHS, 0, -1):
        filename = FLAGS.CHECKPOINT_FORMAT.format(epoch=try_epoch)
        filepath = os.path.join(fileroot, subdir, filename)
        if os.path.exists(filepath):
            resume_from_epoch = try_epoch
            break

    # Horovod: broadcast resume_from_epoch from rank 0 (which will have
    # checkpoints) to other ranks.
    resume_from_epoch = hvd.broadcast(torch.tensor(resume_from_epoch), root_rank=0,
                                      name='resume_from_epoch').item()

    # Restore from a previous checkpoint, if initial_epoch is specified.
    # Horovod: restore on the first worker which will broadcast weights to other workers.
    if resume_from_epoch > 0 and hvd.rank() == 0:
        fileroot = get_real_path(FLAGS.CHECKPOINT_DIR)
        filename = FLAGS.CHECKPOINT_FORMAT.format(epoch=resume_from_epoch)
        filepath = os.path.join(fileroot, subdir, filename)
        checkpoint = torch.load(filepath)
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])

    # Horovod: broadcast parameters & optimizer state.
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(optimizer, root_rank=0)


    """ Training Operations """
    def train(epoch):
        model.train()
        lr = adjust_learning_rate(FLAGS, optimizer, epoch)
        train_sampler.set_epoch(epoch)
        train_loss = Metric('train_loss')
        train_accuracy = Metric('train_accuracy')

        with tqdm.tqdm(total=len(train_loader),
                       desc='Train Epoch     #{}'.format(epoch + 1),
                       disable=not verbose) as t:
            for batch_idx, (data, target) in enumerate(train_loader):
                if FLAGS.CUDA:
                    data, target = data.cuda(), target.cuda()
                optimizer.zero_grad()
                # Split data into sub-batches of size batch_size
                for i in range(0, len(data), FLAGS.BATCH_SIZE):
                    data_batch = data[i:i + FLAGS.BATCH_SIZE]
                    target_batch = target[i:i + FLAGS.BATCH_SIZE]
                    output = model(data_batch)
                    train_accuracy.update(accuracy(output, target_batch))
                    loss = F.cross_entropy(output, target_batch)
                    train_loss.update(loss)
                    # Average gradients among sub-batches
                    loss.div_(math.ceil(float(len(data)) / FLAGS.BATCH_SIZE))
                    loss.backward()
                    if i == 0 and hvd.rank() == 0:
                        train_summary_writer.add_image("input",
                                                       transforms.denormalize(data[0],
                                                                              mean=FLAGS.DATA_MEAN,
                                                                              std=FLAGS.DATA_STD),
                                                       epoch)
                # Gradient is applied across all ranks
                optimizer.step()
                t.set_postfix({'loss': train_loss.avg.item(),
                               'accuracy': 100. * train_accuracy.avg.item(),
                               'lr': lr})
                t.update(1)

        if hvd.rank() == 0:
            train_summary_writer.add_scalar('info/lr', lr, epoch)
            train_summary_writer.add_scalar('info/loss', train_loss.avg, epoch)
            train_summary_writer.add_scalar('metric/accuracy', train_accuracy.avg, epoch)

    def validate(epoch):
        model.eval()
        valid_loss = Metric('valid_loss')
        valid_accuracy = Metric('valid_accuracy')

        with tqdm.tqdm(total=len(valid_loader),
                       desc='Validate Epoch  #{}'.format(epoch + 1),
                       disable=not verbose) as t:
            with torch.no_grad():
                for data, target in valid_loader:
                    if FLAGS.CUDA:
                        data, target = data.cuda(), target.cuda()
                    output = model(data)

                    valid_loss.update(F.cross_entropy(output, target))
                    valid_accuracy.update(accuracy(output, target))
                    t.set_postfix({'loss': valid_loss.avg.item(),
                                   'accuracy': 100. * valid_accuracy.avg.item()})
                    t.update(1)

        if hvd.rank() == 0:
            valid_summary_writer.add_scalar('info/loss', valid_loss.avg, epoch)
            valid_summary_writer.add_scalar('metric/accuracy', valid_accuracy.avg, epoch)

    def save_checkpoint(epoch):
        if hvd.rank() == 0:
            fileroot = get_real_path(FLAGS.CHECKPOINT_DIR)
            filename = FLAGS.CHECKPOINT_FORMAT.format(epoch=epoch + 1)
            filepath = os.path.join(fileroot, subdir, filename)
            if not os.path.exists(os.path.dirname(filepath)):
                os.makedirs(os.path.dirname(filepath))
            state = {
                'model': model.state_dict(),
                'optimizer': optimizer.state_dict(),
            }
            torch.save(state, filepath)


    """ Training Loop """
    if hvd.rank() == 0:
        print(model)
        print(flags_to_string(FLAGS))
    for epoch in range(resume_from_epoch, FLAGS.EPOCHS):
        train(epoch)
        validate(epoch)
        save_checkpoint(epoch)
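Note: the `Metric`, `accuracy`, and `adjust_learning_rate` helpers used above are not shown in this snippet. A minimal sketch of the Horovod-averaged `Metric` (an assumption, modeled on the pattern used in the common Horovod ImageNet example) could look like:

import torch
import horovod.torch as hvd

class Metric(object):
    """Sketch only: accumulates values and averages them across workers."""
    def __init__(self, name):
        self.name = name
        self.sum = torch.tensor(0.)
        self.n = torch.tensor(0.)

    def update(self, val):
        # Average the latest value over all Horovod ranks, then accumulate locally.
        self.sum += hvd.allreduce(val.detach().cpu(), name=self.name)
        self.n += 1

    @property
    def avg(self):
        return self.sum / self.n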
Example #16
0
def train_fn(args):
    # Horovod: initialize library.
    hvd.init()
    torch.manual_seed(args.seed)
    args.cuda = not args.no_cuda and torch.cuda.is_available()

    print("hvd rank:", hvd.rank(), " hvd local rank:", hvd.local_rank(),
          " using cuda: ", args.cuda)

    if args.cuda:
        # Horovod: pin GPU to local rank.
        torch.cuda.set_device(hvd.local_rank())
        torch.cuda.manual_seed(args.seed)

    # Horovod: limit # of CPU threads to be used per worker.
    torch.set_num_threads(1)

    kwargs = {'num_workers': 1, 'pin_memory': True} if args.cuda else {}
    data_dir = args.data_dir or './data'
    with FileLock(os.path.expanduser("~/.horovod_lock")):
        train_dataset = \
            datasets.MNIST(data_dir, train=True, download=True,
                           transform=transforms.Compose([
                               transforms.ToTensor(),
                               transforms.Normalize((0.1307,), (0.3081,))
                           ]))
    # Horovod: use DistributedSampler to partition the training data.
    train_sampler = torch.utils.data.distributed.DistributedSampler(
        train_dataset, num_replicas=hvd.size(), rank=hvd.rank())
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               sampler=train_sampler,
                                               **kwargs)
    transformations = transforms.Compose(
        [transforms.ToTensor(),
         transforms.Normalize((0.1307, ), (0.3081, ))])

    test_dataset = datasets.MNIST(data_dir,
                                  train=False,
                                  transform=transformations)
    # Horovod: use DistributedSampler to partition the test data.
    test_sampler = torch.utils.data.distributed.DistributedSampler(
        test_dataset, num_replicas=hvd.size(), rank=hvd.rank())
    test_loader = torch.utils.data.DataLoader(test_dataset,
                                              batch_size=args.test_batch_size,
                                              sampler=test_sampler,
                                              **kwargs)

    model = Net()

    # By default, Adasum doesn't need scaling up learning rate.
    lr_scaler = hvd.size() if not args.use_adasum else 1

    if args.cuda:
        # Move model to GPU.
        model.cuda()
        # If using GPU Adasum allreduce, scale learning rate by local_size.
        if args.use_adasum and hvd.nccl_built():
            lr_scaler = hvd.local_size()

    # Horovod: scale learning rate by lr_scaler.
    optimizer = optim.SGD(model.parameters(),
                          lr=args.lr * lr_scaler,
                          momentum=args.momentum)

    # Horovod: (optional) compression algorithm.
    compression = (hvd.Compression.fp16
                   if args.fp16_allreduce else hvd.Compression.none)

    def train(epoch):
        model.train()
        train_sampler.set_epoch(epoch)
        for batch, (data, target) in enumerate(train_loader):
            if args.cuda:
                data, target = data.cuda(), target.cuda()
            optimizer.zero_grad()
            output = model(data)
            loss = F.nll_loss(output, target)
            loss.backward()
            optimizer.step()
            if batch % args.log_interval == 0:
                # Horovod: use train_sampler to determine
                # the number of examples in this worker's partition.
                print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                    epoch, batch * len(data), len(train_sampler),
                    100.0 * batch / len(train_loader), loss.item()))

    def test():
        model.eval()
        test_loss = 0.
        test_accuracy = 0.
        for data, target in test_loader:
            if args.cuda:
                data, target = data.cuda(), target.cuda()
            output = model(data)
            # sum up batch loss
            test_loss += F.nll_loss(output, target, size_average=False).item()
            # get the index of the max log-probability
            pred = output.data.max(1, keepdim=True)[1]
            test_accuracy += pred.eq(
                target.data.view_as(pred)).cpu().float().sum()

        # Horovod: use test_sampler to determine the number of examples in
        # this worker's partition.
        test_loss /= len(test_sampler)
        test_accuracy /= len(test_sampler)

        # Horovod: average metric values across workers.
        test_loss = metric_average(test_loss, 'avg_loss')
        test_accuracy = metric_average(test_accuracy, 'avg_accuracy')

        # Horovod: print output only on first rank.
        if hvd.rank() == 0:
            print(
                '\nTest set: Average loss: {:.4f}, Accuracy: {:.2f}%\n'.format(
                    test_loss, 100. * test_accuracy))

    # Horovod: wrap optimizer with DistributedOptimizer.
    optimizer = hvd.DistributedOptimizer(
        optimizer,
        named_parameters=model.named_parameters(),
        compression=compression,
        op=hvd.Adasum if args.use_adasum else hvd.Average)

    for epoch in range(1, args.epochs + 1):
        train(epoch)
        test()
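Example #16 calls `metric_average()` and `Net()` without defining them; `Net` is the small MNIST convolutional network shown in Example #18 below, and `metric_average` is the usual Horovod allreduce helper, along the lines of:

import torch
import horovod.torch as hvd

def metric_average(val, name):
    # Average a Python scalar across all Horovod workers.
    tensor = torch.tensor(val)
    avg_tensor = hvd.allreduce(tensor, name=name)
    return avg_tensor.item()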
Example #17
0
    def hvd_param_scaling(self):
        if hvd.nccl_built():
            # self.batch_size = int(self.batch_size/hvd.local_size())
            self.lr_scaler = 1.0
Example #18
0
def pytorch_mnist_example():
    class Net(torch.nn.Module):
        def __init__(self):
            super(Net, self).__init__()
            self.conv1 = torch.nn.Conv2d(1, 10, kernel_size=5)
            self.conv2 = torch.nn.Conv2d(10, 20, kernel_size=5)
            self.conv2_drop = torch.nn.Dropout2d()
            self.fc1 = torch.nn.Linear(320, 50)
            self.fc2 = torch.nn.Linear(50, 10)

        def forward(self, x):
            x = torch.nn.functional.relu(
                torch.nn.functional.max_pool2d(self.conv1(x), 2))
            x = torch.nn.functional.relu(
                torch.nn.functional.max_pool2d(self.conv2_drop(self.conv2(x)),
                                               2))
            x = x.view(-1, 320)
            x = torch.nn.functional.relu(self.fc1(x))
            x = torch.nn.functional.dropout(x, training=self.training)
            x = self.fc2(x)
            return torch.nn.functional.log_softmax(x, dim=1)

    def train(epoch, is_cuda, log_interval):
        model.train()
        # Horovod: set epoch to sampler for shuffling.
        train_sampler.set_epoch(epoch)
        for batch_idx, (data, target) in enumerate(train_loader):
            if is_cuda:
                data, target = data.cuda(), target.cuda()
            optimizer.zero_grad()
            output = model(data)
            loss = torch.nn.functional.nll_loss(output, target)
            loss.backward()
            optimizer.step()
            if batch_idx % log_interval == 0:
                # Horovod: use train_sampler to determine the number of examples in
                # this worker's partition.
                print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                    epoch, batch_idx * len(data), len(train_sampler),
                    100. * batch_idx / len(train_loader), loss.item()))

    def metric_average(val, name):
        tensor = torch.tensor(val)
        avg_tensor = hvd.allreduce(tensor, name=name)
        return avg_tensor.item()

    def test(is_cuda):
        model.eval()
        test_loss = 0.
        test_accuracy = 0.
        for data, target in test_loader:
            if is_cuda:
                data, target = data.cuda(), target.cuda()
            output = model(data)
            # sum up batch loss
            test_loss += torch.nn.functional.nll_loss(
                output, target, size_average=False).item()
            # get the index of the max log-probability
            pred = output.data.max(1, keepdim=True)[1]
            test_accuracy += pred.eq(
                target.data.view_as(pred)).cpu().float().sum()

        # Horovod: use test_sampler to determine the number of examples in
        # this worker's partition.
        test_loss /= len(test_sampler)
        test_accuracy /= len(test_sampler)

        # Horovod: average metric values across workers.
        test_loss = metric_average(test_loss, 'avg_loss')
        test_accuracy = metric_average(test_accuracy, 'avg_accuracy')

        # Horovod: print output only on first rank.
        if hvd.rank() == 0:
            print(
                '\nTest set: Average loss: {:.4f}, Accuracy: {:.2f}%\n'.format(
                    test_loss, 100. * test_accuracy))

    batch_size = 64
    test_batch_size = 1000
    epochs = 10
    lr = 0.01
    momentum = 0.5
    random_seed = 42
    log_interval = 10
    fp16_allreduce = False
    use_adasum = False
    gradient_predivide_factor = 1.0
    data_dir = './data'
    is_cuda = torch.cuda.is_available()

    # Horovod: initialize library.
    hvd.init()
    torch.manual_seed(random_seed)

    if is_cuda:
        # Horovod: pin GPU to local rank.
        torch.cuda.set_device(hvd.local_rank())
        torch.cuda.manual_seed(random_seed)

    # Horovod: limit # of CPU threads to be used per worker.
    torch.set_num_threads(1)

    kwargs = {'num_workers': 1, 'pin_memory': True} if is_cuda else {}
    # When supported, use 'forkserver' to spawn dataloader workers instead of 'fork' to prevent issues with Infiniband implementations that are not fork-safe.
    if (kwargs.get('num_workers', 0) > 0
            and hasattr(torch.multiprocessing, '_supports_context')
            and torch.multiprocessing._supports_context
            and 'forkserver' in torch.multiprocessing.get_all_start_methods()):
        kwargs['multiprocessing_context'] = 'forkserver'

    with FileLock(os.path.expanduser('~/.horovod_lock')):
        train_dataset = torchvision.datasets.MNIST(
            data_dir,
            train=True,
            download=True,
            transform=torchvision.transforms.Compose([
                torchvision.transforms.ToTensor(),
                torchvision.transforms.Normalize((0.1307, ), (0.3081, ))
            ]))
    # Horovod: use DistributedSampler to partition the training data.
    train_sampler = torch.utils.data.distributed.DistributedSampler(
        train_dataset, num_replicas=hvd.size(), rank=hvd.rank())
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=batch_size,
                                               sampler=train_sampler,
                                               **kwargs)

    test_dataset = torchvision.datasets.MNIST(
        data_dir,
        train=False,
        download=True,
        transform=torchvision.transforms.Compose([
            torchvision.transforms.ToTensor(),
            torchvision.transforms.Normalize((0.1307, ), (0.3081, ))
        ]))
    # Horovod: use DistributedSampler to partition the test data.
    test_sampler = torch.utils.data.distributed.DistributedSampler(
        test_dataset, num_replicas=hvd.size(), rank=hvd.rank())
    test_loader = torch.utils.data.DataLoader(test_dataset,
                                              batch_size=test_batch_size,
                                              sampler=test_sampler,
                                              **kwargs)

    model = Net()

    # By default, Adasum doesn't need scaling up learning rate.
    lr_scaler = hvd.size() if not use_adasum else 1

    if is_cuda:
        # Move model to GPU.
        model.cuda()
        # If using GPU Adasum allreduce, scale learning rate by local_size.
        if use_adasum and hvd.nccl_built():
            lr_scaler = hvd.local_size()

    # Horovod: scale learning rate by lr_scaler.
    optimizer = torch.optim.SGD(model.parameters(),
                                lr=lr * lr_scaler,
                                momentum=momentum)

    # Horovod: broadcast parameters & optimizer state.
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(optimizer, root_rank=0)

    # Horovod: (optional) compression algorithm.
    compression = hvd.Compression.fp16 if fp16_allreduce else hvd.Compression.none

    # Horovod: wrap optimizer with DistributedOptimizer.
    optimizer = hvd.DistributedOptimizer(
        optimizer,
        named_parameters=model.named_parameters(),
        compression=compression,
        op=hvd.Adasum if use_adasum else hvd.Average,
        gradient_predivide_factor=gradient_predivide_factor)

    for epoch in range(1, epochs + 1):
        train(epoch, is_cuda, log_interval)
        test(is_cuda)
Example #19
0
def main(args):
    def train_mixed_precision(epoch, scaler):
        model.train()
        # Horovod: set epoch to sampler for shuffling.
        train_sampler.set_epoch(epoch)
        for batch_idx, (data, target) in enumerate(train_loader):
            if args.cuda:
                data, target = data.cuda(), target.cuda()
            optimizer.zero_grad()
            with torch.cuda.amp.autocast():
                output = model(data)
                loss = F.nll_loss(output, target)

            scaler.scale(loss).backward()
            # Make sure all async allreduces are done
            optimizer.synchronize()
            # In-place unscaling of all gradients before weights update
            scaler.unscale_(optimizer)
            with optimizer.skip_synchronize():
                scaler.step(optimizer)
            # Update scaler in case of overflow/underflow
            scaler.update()

            if batch_idx % args.log_interval == 0:
                # Horovod: use train_sampler to determine the number of examples in
                # this worker's partition.
                print(
                    'Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}\tLoss Scale: {}'
                    .format(epoch, batch_idx * len(data), len(train_sampler),
                            100. * batch_idx / len(train_loader), loss.item(),
                            scaler.get_scale()))

    def train_epoch(epoch):
        model.train()
        # Horovod: set epoch to sampler for shuffling.
        train_sampler.set_epoch(epoch)
        for batch_idx, (data, target) in enumerate(train_loader):
            if args.cuda:
                data, target = data.cuda(), target.cuda()
            optimizer.zero_grad()
            output = model(data)
            loss = F.nll_loss(output, target)
            loss.backward()
            optimizer.step()
            if batch_idx % args.log_interval == 0:
                # Horovod: use train_sampler to determine the number of examples in
                # this worker's partition.
                print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                    epoch, batch_idx * len(data), len(train_sampler),
                    100. * batch_idx / len(train_loader), loss.item()))

    def metric_average(val, name):
        tensor = torch.tensor(val)
        avg_tensor = hvd.allreduce(tensor, name=name)
        return avg_tensor.item()

    def test():
        model.eval()
        test_loss = 0.
        test_accuracy = 0.
        for data, target in test_loader:
            if args.cuda:
                data, target = data.cuda(), target.cuda()
            output = model(data)
            # sum up batch loss
            test_loss += F.nll_loss(output, target, size_average=False).item()
            # get the index of the max log-probability
            pred = output.data.max(1, keepdim=True)[1]
            test_accuracy += pred.eq(
                target.data.view_as(pred)).cpu().float().sum()

        # Horovod: use test_sampler to determine the number of examples in
        # this worker's partition.
        test_loss /= len(test_sampler)
        test_accuracy /= len(test_sampler)

        # Horovod: average metric values across workers.
        test_loss = metric_average(test_loss, 'avg_loss')
        test_accuracy = metric_average(test_accuracy, 'avg_accuracy')

        # Horovod: print output only on first rank.
        if hvd.rank() == 0:
            print(
                '\nTest set: Average loss: {:.4f}, Accuracy: {:.2f}%\n'.format(
                    test_loss, 100. * test_accuracy))

    # Horovod: initialize library.
    hvd.init()
    torch.manual_seed(args.seed)

    if args.cuda:
        # Horovod: pin GPU to local rank.
        torch.cuda.set_device(hvd.local_rank())
        torch.cuda.manual_seed(args.seed)
    else:
        if args.use_mixed_precision:
            raise ValueError(
                "Mixed precision is only supported with cuda enabled.")

    if (args.use_mixed_precision
            and LooseVersion(torch.__version__) < LooseVersion('1.6.0')):
        raise ValueError("""Mixed precision is using torch.cuda.amp.autocast(),
                            which requires torch >= 1.6.0""")

    # Horovod: limit # of CPU threads to be used per worker.
    torch.set_num_threads(1)

    kwargs = {'num_workers': 1, 'pin_memory': True} if args.cuda else {}
    # When supported, use 'forkserver' to spawn dataloader workers instead of 'fork' to prevent
    # issues with Infiniband implementations that are not fork-safe
    if (kwargs.get('num_workers', 0) > 0 and hasattr(mp, '_supports_context')
            and mp._supports_context
            and 'forkserver' in mp.get_all_start_methods()):
        kwargs['multiprocessing_context'] = 'forkserver'

    data_dir = args.data_dir or './data'
    with FileLock(os.path.expanduser("~/.horovod_lock")):
        train_dataset = \
            datasets.MNIST(data_dir, train=True, download=True,
                           transform=transforms.Compose([
                               transforms.ToTensor(),
                               transforms.Normalize((0.1307,), (0.3081,))
                           ]))

    # Horovod: use DistributedSampler to partition the training data.
    train_sampler = torch.utils.data.distributed.DistributedSampler(
        train_dataset, num_replicas=hvd.size(), rank=hvd.rank())
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               sampler=train_sampler,
                                               **kwargs)

    test_dataset = \
        datasets.MNIST(data_dir, train=False, transform=transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.1307,), (0.3081,))
        ]))
    # Horovod: use DistributedSampler to partition the test data.
    test_sampler = torch.utils.data.distributed.DistributedSampler(
        test_dataset, num_replicas=hvd.size(), rank=hvd.rank())
    test_loader = torch.utils.data.DataLoader(test_dataset,
                                              batch_size=args.test_batch_size,
                                              sampler=test_sampler,
                                              **kwargs)

    model = Net()

    # By default, Adasum doesn't need scaling up learning rate.
    lr_scaler = hvd.size() if not args.use_adasum else 1

    if args.cuda:
        # Move model to GPU.
        model.cuda()
        # If using GPU Adasum allreduce, scale learning rate by local_size.
        if args.use_adasum and hvd.nccl_built():
            lr_scaler = hvd.local_size()

    # Horovod: scale learning rate by lr_scaler.
    optimizer = optim.SGD(model.parameters(),
                          lr=args.lr * lr_scaler,
                          momentum=args.momentum)

    # Horovod: broadcast parameters & optimizer state.
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(optimizer, root_rank=0)

    # Horovod: (optional) compression algorithm.
    compression = hvd.Compression.fp16 if args.fp16_allreduce else hvd.Compression.none

    # Horovod: wrap optimizer with DistributedOptimizer.
    optimizer = hvd.DistributedOptimizer(
        optimizer,
        named_parameters=model.named_parameters(),
        compression=compression,
        op=hvd.Adasum if args.use_adasum else hvd.Average,
        gradient_predivide_factor=args.gradient_predivide_factor)

    if args.use_mixed_precision:
        # Initialize the gradient scaler once before the training loop.
        scaler = torch.cuda.amp.GradScaler()

    for epoch in range(1, args.epochs + 1):
        if args.use_mixed_precision:
            train_mixed_precision(epoch, scaler)
        else:
            train_epoch(epoch)
        # Keep test in full precision since computation is relatively light.
        test()
Example #20
0
def train_main(args, filenames):
    # Horovod: initialize library.
    hvd.init()
    torch.manual_seed(args.seed)

    if torch.cuda.is_available() and not args.no_cuda:
        # Horovod: pin GPU to local rank.
        torch.cuda.set_device(hvd.local_rank())
        torch.cuda.manual_seed(args.seed)

    # Horovod: limit # of CPU threads to be used per worker.
    torch.set_num_threads(1)
    rank = hvd.rank()
    train_dataset = create_dataset(
        filenames,
        batch_size=args.batch_size,
        rank=rank,
        num_epochs=args.epochs,
        world_size=hvd.size(),
        num_reducers=args.num_reducers,
        max_concurrent_epochs=args.max_concurrent_epochs)
    model = Net()
    # By default, Adasum doesn't need scaling up learning rate.
    lr_scaler = hvd.size() if not args.use_adasum else 1

    if torch.cuda.is_available() and not args.no_cuda:
        # Move model to GPU.
        model.cuda()
        # If using GPU Adasum allreduce, scale learning rate by local_size.
        if args.use_adasum and hvd.nccl_built():
            lr_scaler = hvd.local_size()

    # Horovod: scale learning rate by lr_scaler.
    optimizer = optim.SGD(model.parameters(),
                          lr=args.lr * lr_scaler,
                          momentum=args.momentum)

    # Horovod: broadcast parameters & optimizer state.
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(optimizer, root_rank=0)

    # Horovod: (optional) compression algorithm.
    compression = (hvd.Compression.fp16
                   if args.fp16_allreduce else hvd.Compression.none)

    # Horovod: wrap optimizer with DistributedOptimizer.
    optimizer = hvd.DistributedOptimizer(
        optimizer,
        named_parameters=model.named_parameters(),
        compression=compression,
        op=hvd.Adasum if args.use_adasum else hvd.Average,
        gradient_predivide_factor=args.gradient_predivide_factor)

    def _train(epoch):
        model.train()
        # Horovod: set epoch to sampler for shuffling.
        train_dataset.set_epoch(epoch)
        start_epoch = timeit.default_timer()
        last_batch_time = start_epoch
        batch_wait_times = []
        for batch_idx, (data, target) in enumerate(train_dataset):
            batch_wait_times.append(timeit.default_timer() - last_batch_time)
            if torch.cuda.is_available() and not args.no_cuda:
                if isinstance(data, list):
                    data = [t.cuda() for t in data]
                target = target.cuda()
            optimizer.zero_grad()
            # output = model(data)
            if batch_idx % args.log_interval == 0:
                print(
                    f"Processing batch {batch_idx} in epoch {epoch} on worker "
                    f"{rank}.")
            time.sleep(args.mock_train_step_time)
            # TODO(Clark): Add worker synchronization barrier here.
            # loss = F.nll_loss(output, target)
            # loss.backward()
            # optimizer.step()
            last_batch_time = timeit.default_timer()
        epoch_duration = timeit.default_timer() - start_epoch
        avg_batch_wait_time = np.mean(batch_wait_times)
        std_batch_wait_time = np.std(batch_wait_times)
        max_batch_wait_time = np.max(batch_wait_times)
        min_batch_wait_time = np.min(batch_wait_times)
        print(f"\nEpoch {epoch}, worker {rank} stats over "
              f"{len(batch_wait_times)} steps: {epoch_duration:.3f}")
        print(f"Mean batch wait time: {avg_batch_wait_time:.3f}s +- "
              f"{std_batch_wait_time}")
        print(f"Max batch wait time: {max_batch_wait_time:.3f}s")
        print(f"Min batch wait time: {min_batch_wait_time:.3f}s")
        return batch_wait_times

    print(f"Starting training on worker {rank}.")
    batch_wait_times = []
    for epoch in range(args.epochs):
        # TODO(Clark): Don't include stats from first epoch since we already
        # expect that epoch to be cold?
        batch_wait_times.extend(_train(epoch))
    print(f"Done training on worker {rank}.")
    avg_batch_wait_time = np.mean(batch_wait_times)
    std_batch_wait_time = np.std(batch_wait_times)
    max_batch_wait_time = np.max(batch_wait_times)
    min_batch_wait_time = np.min(batch_wait_times)
    print(f"\nWorker {rank} training stats over {args.epochs} epochs:")
    print(f"Mean batch wait time: {avg_batch_wait_time:.3f}s +- "
          f"{std_batch_wait_time}")
    print(f"Max batch wait time: {max_batch_wait_time:.3f}s")
    print(f"Min batch wait time: {min_batch_wait_time:.3f}s")
    # TODO(Clark): Add logic to the dataset abstraction so we don't have to do
    # this.
    if rank == 0:
        print("Waiting in rank 0 worker to let other workers consume queue...")
        time.sleep(10)
        print("Done waiting in rank 0 worker.")
Example #21
0
def main():
    args = parser.parse_args()

    # Set-up tensorboard

    # Horovod: initialize library.
    seed = 42
    hvd.init()
    torch.manual_seed(seed)

    # Horovod: pin GPU to local rank.
    torch.cuda.set_device(hvd.local_rank())
    torch.cuda.manual_seed(seed)

    # Horovod: limit # of CPU threads to be used per worker.
    torch.set_num_threads(1)

    kwargs = {'num_workers': 1, 'pin_memory': True}
    # When supported, use 'forkserver' to spawn dataloader workers instead of 'fork' to prevent
    # issues with Infiniband implementations that are not fork-safe
    if (hasattr(mp, '_supports_context') and mp._supports_context
            and 'forkserver' in mp.get_all_start_methods()):
        kwargs['multiprocessing_context'] = 'forkserver'

    data_dir = args.data_dir
    with FileLock(os.path.expanduser("~/.horovod_lock")):
        train_dataset = \
            datasets.MNIST(data_dir, train=True, download=True,
                           transform=transforms.Compose([
                               transforms.ToTensor(),
                               transforms.Normalize((0.1307,), (0.3081,))
                           ]))

    # Horovod: use DistributedSampler to partition the training data.
    train_sampler = torch.utils.data.distributed.DistributedSampler(
        train_dataset, num_replicas=hvd.size(), rank=hvd.rank())
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               sampler=train_sampler,
                                               **kwargs)

    test_dataset = \
        datasets.MNIST(data_dir, train=False, transform=transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.1307,), (0.3081,))
        ]))
    # Horovod: use DistributedSampler to partition the test data.
    test_sampler = torch.utils.data.distributed.DistributedSampler(
        test_dataset, num_replicas=hvd.size(), rank=hvd.rank())
    test_loader = torch.utils.data.DataLoader(test_dataset,
                                              batch_size=args.test_batch_size,
                                              sampler=test_sampler,
                                              **kwargs)

    model = Net()
    loss_function = nn.CrossEntropyLoss()
    running_loss = 0.0

    # By default, Adasum doesn't need scaling up learning rate.
    lr_scaler = hvd.size() if not args.use_adasum else 1

    # Move model to GPU.
    model.cuda()
    # If using GPU Adasum allreduce, scale learning rate by local_size.
    if args.use_adasum and hvd.nccl_built():
        lr_scaler = hvd.local_size()

    # Horovod: scale learning rate by lr_scaler.
    optimizer = optim.Adam(model.parameters(), lr=args.base_lr * lr_scaler)

    # Horovod: broadcast parameters & optimizer state.
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(optimizer, root_rank=0)

    # Horovod: (optional) compression algorithm.
    compression = hvd.Compression.none

    # Horovod: wrap optimizer with DistributedOptimizer.
    optimizer = hvd.DistributedOptimizer(
        optimizer,
        named_parameters=model.named_parameters(),
        compression=compression,
        op=hvd.Adasum if args.use_adasum else hvd.Average)

    # Profile training
    logs = "logs/pytorch-" + datetime.now().strftime("%Y%m%d-%H%M%S")
    writer = SummaryWriter(log_dir=logs)

    for epoch in range(1, args.epochs + 1):
        train(epoch, model, train_sampler, train_loader, optimizer,
              loss_function, args)
        test_loss, test_accuracy = test(model, test_loader, test_sampler)

        if hvd.rank() == 0:
            writer.add_scalars("Test", {
                "loss": test_loss,
                "acc.": test_accuracy
            })

    writer.close()
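The `train()` and `test()` functions called in the loop above are defined elsewhere; a sketch of a `test()` helper with the signature used here (an assumption, reusing `metric_average` and `F.cross_entropy` as in the neighbouring examples) might be:

import torch
import torch.nn.functional as F

def test(model, test_loader, test_sampler):
    # Sketch only: evaluate locally, then average loss/accuracy across Horovod workers.
    model.eval()
    test_loss, test_accuracy = 0., 0.
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.cuda(), target.cuda()
            output = model(data)
            test_loss += F.cross_entropy(output, target, reduction='sum').item()
            pred = output.argmax(dim=1, keepdim=True)
            test_accuracy += pred.eq(target.view_as(pred)).float().sum().item()
    test_loss /= len(test_sampler)
    test_accuracy /= len(test_sampler)
    return metric_average(test_loss, 'avg_loss'), metric_average(test_accuracy, 'avg_accuracy')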
Example #22
0
    def __init__(self, opt):
        """Initialize the pix2pix class.

        Parameters:
            opt (Option class)-- stores all the experiment flags; needs to be a subclass of BaseOptions
        """
        BaseModel.__init__(self, opt)
        # specify the training losses you want to print out. The training/test scripts will call <BaseModel.get_current_losses>
        self.loss_names = ['G_GAN', 'G_L1', 'D_real', 'D_fake']
        # specify the images you want to save/display. The training/test scripts will call <BaseModel.get_current_visuals>
        self.visual_names = ['real_A', 'fake_B', 'real_B']
        # specify the models you want to save to the disk. The training/test scripts will call <BaseModel.save_networks> and <BaseModel.load_networks>
        if self.isTrain:
            self.model_names = ['G', 'D']
        else:  # during test time, only load G
            self.model_names = ['G']
        # define networks (both generator and discriminator)
        self.netG = networks.define_G(opt.input_nc, opt.output_nc, opt.ngf,
                                      opt.netG, opt.norm, not opt.no_dropout,
                                      opt.init_type, opt.init_gain,
                                      self.gpu_ids)

        # Horovod
        hvd.broadcast_parameters(self.netG.state_dict(), root_rank=0)

        if self.isTrain:  # define a discriminator; conditional GANs need to take both input and output images; Therefore, #channels for D is input_nc + output_nc
            self.netD = networks.define_D(opt.input_nc + opt.output_nc,
                                          opt.ndf, opt.netD, opt.n_layers_D,
                                          opt.norm, opt.init_type,
                                          opt.init_gain, self.gpu_ids)
            # Horovod
            hvd.broadcast_parameters(self.netD.state_dict(), root_rank=0)

        if self.isTrain:
            # Horovod
            compression = hvd.Compression.fp16 if opt.fp16_allreduce else hvd.Compression.none
            lr_scaler = hvd.size() if not opt.use_adasum else 1

            if opt.use_adasum and hvd.nccl_built():
                lr_scaler = hvd.local_size()

            # define loss functions
            self.criterionGAN = networks.GANLoss(opt.gan_mode).to(self.device)
            self.criterionL1 = torch.nn.L1Loss()
            # initialize optimizers; schedulers will be automatically created by function <BaseModel.setup>.
            optimizer_G = torch.optim.Adam(self.netG.parameters(),
                                           lr=opt.lr * lr_scaler,
                                           betas=(opt.beta1, 0.999))
            # Horovod
            hvd.broadcast_optimizer_state(optimizer_G, root_rank=0)
            self.optimizer_G = hvd.DistributedOptimizer(
                optimizer_G, named_parameters=self.netG.named_parameters())

            optimizer_D = torch.optim.Adam(self.netD.parameters(),
                                           lr=opt.lr * lr_scaler,
                                           betas=(opt.beta1, 0.999))
            # Horovod
            hvd.broadcast_optimizer_state(optimizer_D, root_rank=0)
            self.optimizer_D = hvd.DistributedOptimizer(
                optimizer_D, named_parameters=self.netD.named_parameters())

            self.optimizers.append(self.optimizer_G)
            self.optimizers.append(self.optimizer_D)
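This class only covers the networks and optimizers; on the training-script side, the data would still need to be partitioned across Horovod workers as in the other examples here. A sketch, with `dataset` and `opt.batch_size` as assumed names:

import torch
import horovod.torch as hvd

# Sketch only: shard the pix2pix training data across Horovod ranks.
train_sampler = torch.utils.data.distributed.DistributedSampler(
    dataset, num_replicas=hvd.size(), rank=hvd.rank())
train_loader = torch.utils.data.DataLoader(
    dataset, batch_size=opt.batch_size, sampler=train_sampler)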
Example #23
0
    def hvd_param_scaling(self):
        if hvd.nccl_built():
            # Scale the per-worker batch size and iteration count down by the
            # number of local GPUs when NCCL-based allreduce is available.
            self.batch_size = int(self.batch_size / hvd.local_size())
            self.iters_per_epoch = int(self.max_iterations / self.epochs /
                                       hvd.local_size())
Example #24
0
def main(args):
    hvd.init()

    args.cuda = not args.no_cuda and torch.cuda.is_available()
    if args.cuda:
        device = torch.device('cuda')
        # Horovod: pin GPU to local rank.
        torch.cuda.set_device(hvd.local_rank())

    device = 'GPU' if args.cuda else 'CPU'
    if hvd.rank() == 0:
        log('Using PyTorch version: %s, Device: %s' %
            (torch.__version__, device))
        log('Horovod version: %s, CUDA: %s, ROCM: %s, NCCL: %s, MPI: %s' %
            (horovod.__version__, hvd.cuda_built(), hvd.rocm_built(),
             hvd.nccl_built(), hvd.mpi_built()))
        log(torch.__config__.show())

    cudnn.benchmark = True

    # Set up standard model.
    log('Initializing %s model...' % args.model)
    model = getattr(models, args.model)()

    # By default, Adasum doesn't need scaling up learning rate.
    lr_scaler = hvd.size() if not args.use_adasum else 1

    if args.cuda:
        # Move model to GPU.
        model.cuda()
        # If using GPU Adasum allreduce, scale learning rate by local_size.
        if args.use_adasum and hvd.nccl_built():
            lr_scaler = hvd.local_size()

    optimizer = optim.SGD(model.parameters(), lr=0.01 * lr_scaler)

    # Horovod: (optional) compression algorithm.
    compression = hvd.Compression.fp16 if args.fp16_allreduce else hvd.Compression.none

    # Horovod: wrap optimizer with DistributedOptimizer.
    optimizer = hvd.DistributedOptimizer(
        optimizer,
        named_parameters=model.named_parameters(),
        compression=compression,
        op=hvd.Adasum if args.use_adasum else hvd.Average)

    # Horovod: broadcast parameters & optimizer state.
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(optimizer, root_rank=0)

    if args.fixed_data:
        data, target = generate_data(args)

    def benchmark_step():
        nonlocal data, target

        if not args.fixed_data:
            data, target = generate_data(args)

        optimizer.zero_grad()
        output = model(data)
        loss = F.cross_entropy(output, target)
        loss.backward()
        optimizer.step()

    log('Model: %s' % args.model)
    log('Batch size: %d' % args.batch_size)
    log('Number of %ss: %d' % (device, hvd.size()))

    # Warm-up
    log('Running warmup...')
    timeit.timeit(benchmark_step, number=args.num_warmup_batches)

    # Benchmark
    log('Running benchmark...')
    img_secs = []
    for x in range(args.num_iters):
        time = timeit.timeit(benchmark_step, number=args.num_batches_per_iter)
        img_sec = args.batch_size * args.num_batches_per_iter / time
        log('Iter #%d: %.1f img/sec per %s' % (x, img_sec, device))
        img_secs.append(img_sec)

    # Results
    img_sec_mean = np.mean(img_secs)
    img_sec_conf = 1.96 * np.std(img_secs)
    log('Img/sec per %s: %.1f +-%.1f' % (device, img_sec_mean, img_sec_conf))
    log('Total img/sec on %d %s(s): %.1f +-%.1f' %
        (hvd.size(), device, hvd.size() * img_sec_mean,
         hvd.size() * img_sec_conf))
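The benchmark above relies on `generate_data()` and `log()` helpers that are not shown; plausible sketches (assumptions, modeled on the standard Horovod synthetic benchmark, with ImageNet-sized inputs and 1000 classes) are:

import torch
import horovod.torch as hvd

def generate_data(args):
    # Random ImageNet-shaped batch; move to GPU when CUDA is in use.
    data = torch.randn(args.batch_size, 3, 224, 224)
    target = torch.randint(0, 1000, (args.batch_size,))
    if args.cuda:
        data, target = data.cuda(), target.cuda()
    return data, target

def log(s, nl=True):
    # Print only from rank 0 so output is not duplicated across workers.
    if hvd.rank() != 0:
        return
    print(s, end='\n' if nl else '')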
Example #25
0
def train_fn():
    # Horovod: initialize library.
    hvd.init()
    torch.manual_seed(args.seed)
    args.cuda = not args.no_cuda and torch.cuda.is_available()

    if args.cuda:
        # Horovod: pin GPU to local rank.
        torch.cuda.set_device(hvd.local_rank())
        torch.cuda.manual_seed(args.seed)

    # Horovod: limit # of CPU threads to be used per worker.
    torch.set_num_threads(1)

    kwargs = {'num_workers': 1, 'pin_memory': True} if args.cuda else {}
    train_dataset = \
        datasets.MNIST('data-%d' % hvd.rank(), train=True, download=True,
                       transform=transforms.Compose([
                           transforms.ToTensor(),
                           transforms.Normalize((0.1307,), (0.3081,))
                       ]))
    # Horovod: use DistributedSampler to partition the training data.
    train_sampler = torch.utils.data.distributed.DistributedSampler(
        train_dataset, num_replicas=hvd.size(), rank=hvd.rank())
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               sampler=train_sampler,
                                               **kwargs)
    transformations = transforms.Compose(
        [transforms.ToTensor(),
         transforms.Normalize((0.1307, ), (0.3081, ))])

    test_dataset = datasets.MNIST('data-%d' % hvd.rank(),
                                  train=False,
                                  transform=transformations)
    # Horovod: use DistributedSampler to partition the test data.
    test_sampler = torch.utils.data.distributed.DistributedSampler(
        test_dataset, num_replicas=hvd.size(), rank=hvd.rank())
    test_loader = torch.utils.data.DataLoader(test_dataset,
                                              batch_size=args.test_batch_size,
                                              sampler=test_sampler,
                                              **kwargs)

    model = Net()

    # By default, Adasum doesn't need scaling up learning rate.
    lr_scaler = hvd.size() if not args.use_adasum else 1

    if args.cuda:
        # Move model to GPU.
        model.cuda()
        # If using GPU Adasum allreduce, scale learning rate by local_size.
        if args.use_adasum and hvd.nccl_built():
            lr_scaler = hvd.local_size()

    # Horovod: scale learning rate by lr_scaler.
    optimizer = optim.SGD(model.parameters(),
                          lr=args.lr * lr_scaler,
                          momentum=args.momentum)

    # Horovod: (optional) compression algorithm.
    compression = (hvd.Compression.fp16
                   if args.fp16_allreduce else hvd.Compression.none)

    @hvd.elastic.run
    def train(state):
        # post synchronization event (worker added, worker removed) init ...
        for state.epoch in range(state.epoch, args.epochs + 1):
            state.model.train()

            train_sampler.set_epoch(state.epoch)
            steps_remaining = len(train_loader) - state.batch

            for state.batch, (data, target) in enumerate(train_loader):
                if state.batch >= steps_remaining:
                    break

                if args.cuda:
                    data, target = data.cuda(), target.cuda()
                state.optimizer.zero_grad()
                output = state.model(data)
                loss = F.nll_loss(output, target)
                loss.backward()
                state.optimizer.step()
                if state.batch % args.log_interval == 0:
                    # Horovod: use train_sampler to determine
                    # the number of examples in this worker's partition.
                    print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.
                          format(state.epoch, state.batch * len(data),
                                 len(train_sampler),
                                 100.0 * state.batch / len(train_loader),
                                 loss.item()))
                if (state.batch + 1) % args.num_batches_per_commit == 0:
                    state.commit()
            state.batch = 0

    def test():
        model.eval()
        test_loss = 0.
        test_accuracy = 0.
        for data, target in test_loader:
            if args.cuda:
                data, target = data.cuda(), target.cuda()
            output = model(data)
            # sum up batch loss
            test_loss += F.nll_loss(output, target, size_average=False).item()
            # get the index of the max log-probability
            pred = output.data.max(1, keepdim=True)[1]
            test_accuracy += pred.eq(
                target.data.view_as(pred)).cpu().float().sum()

        # Horovod: use test_sampler to determine the number of examples in
        # this worker's partition.
        test_loss /= len(test_sampler)
        test_accuracy /= len(test_sampler)

        # Horovod: average metric values across workers.
        test_loss = metric_average(test_loss, 'avg_loss')
        test_accuracy = metric_average(test_accuracy, 'avg_accuracy')

        # Horovod: print output only on first rank.
        if hvd.rank() == 0:
            print(
                '\nTest set: Average loss: {:.4f}, Accuracy: {:.2f}%\n'.format(
                    test_loss, 100. * test_accuracy))

    # Horovod: wrap optimizer with DistributedOptimizer.
    optimizer = hvd.DistributedOptimizer(
        optimizer,
        named_parameters=model.named_parameters(),
        compression=compression,
        op=hvd.Adasum if args.use_adasum else hvd.Average)

    # adjust learning rate on reset
    def on_state_reset():
        for param_group in optimizer.param_groups:
            param_group['lr'] = args.lr * hvd.size()

    state = hvd.elastic.TorchState(model, optimizer, epoch=1, batch=0)
    state.register_reset_callbacks([on_state_reset])
    train(state)
    test()
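The reset callback above always rescales the learning rate by hvd.size(); a variant consistent with the Adasum branch used when the optimizer was built (a sketch, not part of the original example) would recompute the scaler after each elastic rescale:

def on_state_reset():
    # Sketch only: recompute lr_scaler with the new world size, mirroring the
    # start-up logic (Adasum over NCCL scales by local_size, otherwise no scaling).
    lr_scaler = hvd.size() if not args.use_adasum else 1
    if args.use_adasum and args.cuda and hvd.nccl_built():
        lr_scaler = hvd.local_size()
    for param_group in optimizer.param_groups:
        param_group['lr'] = args.lr * lr_scaler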
Example #26
0
def train_func(config):
    data_dir = config.get("data_dir", None)
    seed = config.get("seed", 42)
    use_cuda = config.get("use_cuda", False)
    batch_size = config.get("batch_size", 64)
    use_adasum = config.get("use_adasum", False)
    lr = config.get("lr", 0.01)
    momentum = config.get("momentum", 0.5)
    num_epochs = config.get("num_epochs", 10)
    log_interval = config.get("log_interval", 10)

    # Horovod: initialize library.
    hvd.init()
    torch.manual_seed(seed)

    if use_cuda:
        # Horovod: pin GPU to local rank.
        torch.cuda.set_device(hvd.local_rank())
        torch.cuda.manual_seed(seed)

    # Horovod: limit # of CPU threads to be used per worker.
    torch.set_num_threads(1)

    kwargs = {"num_workers": 1, "pin_memory": True} if use_cuda else {}
    data_dir = data_dir or "~/data"
    with FileLock(os.path.expanduser("~/.horovod_lock")):
        train_dataset = \
            datasets.MNIST(data_dir, train=True, download=True,
                           transform=transforms.Compose([
                               transforms.ToTensor(),
                               transforms.Normalize((0.1307,), (0.3081,))
                           ]))
    # Horovod: use DistributedSampler to partition the training data.
    train_sampler = torch.utils.data.distributed.DistributedSampler(
        train_dataset, num_replicas=hvd.size(), rank=hvd.rank())
    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=batch_size, sampler=train_sampler, **kwargs)

    model = Net()

    # By default, Adasum doesn't need scaling up learning rate.
    lr_scaler = hvd.size() if not use_adasum else 1

    if use_cuda:
        # Move model to GPU.
        model.cuda()
        # If using GPU Adasum allreduce, scale learning rate by local_size.
        if use_adasum and hvd.nccl_built():
            lr_scaler = hvd.local_size()

    # Horovod: scale learning rate by lr_scaler.
    optimizer = optim.SGD(
        model.parameters(), lr=lr * lr_scaler, momentum=momentum)

    # Horovod: wrap optimizer with DistributedOptimizer.
    optimizer = hvd.DistributedOptimizer(
        optimizer,
        named_parameters=model.named_parameters(),
        op=hvd.Adasum if use_adasum else hvd.Average)

    results = []
    for epoch in range(1, num_epochs + 1):
        model.train()
        # Horovod: set epoch to sampler for shuffling.
        train_sampler.set_epoch(epoch)
        num_batches = len(train_loader)
        for batch_idx, (data, target) in enumerate(train_loader):
            if use_cuda:
                data, target = data.cuda(), target.cuda()
            optimizer.zero_grad()
            output = model(data)
            loss = F.nll_loss(output, target)
            loss.backward()
            optimizer.step()
            if batch_idx % log_interval == 0:
                # Horovod: use train_sampler to determine the number of
                # examples in this worker's partition.
                print("Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}".format(
                    epoch, batch_idx * len(data), len(train_sampler),
                    100. * batch_idx / len(train_loader), loss.item()))
            if batch_idx == num_batches - 1:
                results.append(loss.item())
    return results
Example #27
0
def setup(config):
    data_dir = config.get("data_dir", None)
    seed = config.get("seed", 42)
    batch_size = config.get("batch_size", 64)
    use_adasum = config.get("use_adasum", False)
    lr = config.get("lr", 0.01)
    momentum = config.get("momentum", 0.5)
    use_cuda = config.get("use_cuda", False)

    # Horovod: initialize library.
    hvd.init()
    torch.manual_seed(seed)

    if use_cuda:
        # Horovod: pin GPU to local rank.
        torch.cuda.set_device(hvd.local_rank())
        torch.cuda.manual_seed(seed)

    # Horovod: limit # of CPU threads to be used per worker.
    torch.set_num_threads(1)

    kwargs = {"num_workers": 1, "pin_memory": True} if use_cuda else {}
    data_dir = data_dir or "~/data"
    with FileLock(os.path.expanduser("~/.horovod_lock")):
        train_dataset = datasets.MNIST(
            data_dir,
            train=True,
            download=True,
            transform=transforms.Compose([
                transforms.ToTensor(),
                transforms.Normalize((0.1307, ), (0.3081, ))
            ]),
        )
    # Horovod: use DistributedSampler to partition the training data.
    train_sampler = torch.utils.data.distributed.DistributedSampler(
        train_dataset, num_replicas=hvd.size(), rank=hvd.rank())
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=batch_size,
                                               sampler=train_sampler,
                                               **kwargs)

    model = Net()

    # By default, Adasum doesn't need scaling up learning rate.
    lr_scaler = hvd.size() if not use_adasum else 1

    if use_cuda:
        # Move model to GPU.
        model.cuda()
        # If using GPU Adasum allreduce, scale learning rate by local_size.
        if use_adasum and hvd.nccl_built():
            lr_scaler = hvd.local_size()

    # Horovod: scale learning rate by lr_scaler.
    optimizer = optim.SGD(model.parameters(),
                          lr=lr * lr_scaler,
                          momentum=momentum)

    # Horovod: wrap optimizer with DistributedOptimizer.
    optimizer = hvd.DistributedOptimizer(
        optimizer,
        named_parameters=model.named_parameters(),
        op=hvd.Adasum if use_adasum else hvd.Average,
    )

    return model, optimizer, train_loader, train_sampler
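A caller of setup() would typically run the same Horovod training loop used in the neighbouring examples; a minimal sketch (assuming a `config` dict and the usual NLL loss):

import torch.nn.functional as F

model, optimizer, train_loader, train_sampler = setup(config)
for epoch in range(1, config.get("num_epochs", 10) + 1):
    model.train()
    # Horovod: set epoch to sampler for shuffling.
    train_sampler.set_epoch(epoch)
    for data, target in train_loader:
        optimizer.zero_grad()
        loss = F.nll_loss(model(data), target)
        loss.backward()
        optimizer.step()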
Example #28
0
def run():
    args.cuda = not args.no_cuda and torch.cuda.is_available()
    # Horovod: initialize library.
    hvd.init()
    torch.manual_seed(args.seed)
    if args.cuda:
        # Horovod: pin GPU to local rank.
        torch.cuda.set_device(hvd.local_rank())
        torch.cuda.manual_seed(args.seed)

    # Horovod: limit # of CPU threads to be used per worker.
    torch.set_num_threads(1)

    """model_init"""
    model = FFN_no_norm(in_channels=4, out_channels=1, input_size=args.input_size, delta=args.delta, depth=args.depth)

    # Horovod distributed training setup
    # By default, Adasum doesn't need scaling up learning rate.
    lr_scaler = hvd.size() if not args.use_adasum else 1

    if args.cuda:
        # Move model to GPU.
        model.cuda()
        # If using GPU Adasum allreduce, scale learning rate by local_size.
        if args.use_adasum and hvd.nccl_built():
            lr_scaler = hvd.local_size()

    # Horovod: scale learning rate by lr_scaler.
    optimizer = optim.SGD(model.parameters(), lr=args.lr * lr_scaler)
  

    # Horovod: broadcast parameters & optimizer state.
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(optimizer, root_rank=0)

    # Horovod: (optional) compression algorithm.
    compression = hvd.Compression.fp16 if args.fp16_allreduce else hvd.Compression.none

    # Horovod: wrap optimizer with DistributedOptimizer.
    optimizer = hvd.DistributedOptimizer(optimizer,
                                         named_parameters=model.named_parameters(),
                                         compression=compression,
                                         op=hvd.Adasum if args.use_adasum else hvd.Average)

    """resume"""
    if args.resume is not None:
        model.load_state_dict(torch.load(args.resume))
    
    if os.path.exists(args.save_path + 'resume_step.pkl'):
        resume = load_obj(args.save_path + 'resume_step.pkl')
    else:
        resume = {'resume_step': args.resume_step}
    args.resume_step = resume['resume_step']
    print('resume_step', args.resume_step)

    if args.tb is None:
        tb = SummaryWriter('./tensorboard/'+args.tag+'tb_train_log_fov:{}_delta:{}_depth:{}.pth'
                       .format(list(args.input_size)[0], list(args.delta)[0], args.depth))
    else:
        tb = SummaryWriter(args.tb)

    """data_load"""
    train_dataset = BatchCreator(args.train_data_dir, args.input_size, delta=args.delta, train=True)

    kwargs = {'num_workers': 1, 'pin_memory': True} if args.cuda else {}
    # When supported, use 'forkserver' to spawn dataloader workers instead of 'fork' to prevent
    # issues with Infiniband implementations that are not fork-safe
    if (kwargs.get('num_workers', 0) > 0 and hasattr(mp, '_supports_context') and
            mp._supports_context and 'forkserver' in mp.get_all_start_methods()):
        kwargs['multiprocessing_context'] = 'forkserver'

    # Horovod: use DistributedSampler to partition the training data.
    train_sampler = torch.utils.data.distributed.DistributedSampler(
        train_dataset, num_replicas=hvd.size(), rank=hvd.rank())
    train_loader = torch.utils.data.DataLoader(
        train_dataset, sampler=train_sampler, **kwargs)

    batch_it = get_batch(train_loader, args.batch_size, args.input_size,
                                     partial(fixed_offsets, fov_moves=train_dataset.shifts))

    """
    
    for index in range(files_total):
        input_h5data_dict = [(abs_path_training_data + sorted_files_train_data)]
        print(input_h5data_dict)
        train_dataset_dict = BatchCreator(input_h5data_dict, args.input_size, delta=args.delta, train=True)
        train_sampler_dict = torch.utils.data.distributed.DistributedSampler(train_dataset_dict, num_replicas=world_size, rank=rank, shuffle=True)
        train_loader_dict = DataLoader(train_dataset_dict, num_workers=0, sampler=train_sampler_dict , pin_memory=True)
        batch_it_dict = get_batch(train_loader_dict, args.batch_size, args.input_size,
                               partial(fixed_offsets, fov_moves=train_dataset_dict.shifts))
    """

    
    
    
    
    
    
    
    """optimizer"""
    """
    if args.opt == 'Adam':
        optimizer = optim.Adam(model.parameters(), lr=args.lr)
    else:
        optimizer = optim.SGD(model.parameters(), lr=1e-3)
    """
    # optimizer = adabound.AdaBound(model.parameters(), lr=1e-3, final_lr=0.1)
    # scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.step, gamma=args.gamma, last_epoch=-1)
    
    """train_loop"""
    t_last = time.time()
    cnt = 0
    tp = fp = tn = fn = 0
    best_loss = np.inf
    
    model.train()

    while cnt < args.iter:
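        # One training iteration: advance the sampler shuffle seed, fetch the next FOV batch,
        # run a forward/backward pass, and accumulate precision/recall statistics.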
        cnt += 1
        
        # Periodically persist the global step so a restarted run can resume its curves.
        if cnt % 1000 == 0:
            resume['resume_step'] = cnt + args.resume_step
            pickle_obj(resume, 'resume_step', args.save_path)
            
        """
        index_batch = (cnt % train_num)
        train_sampler_dict[index_batch].set_epoch(cnt)
        seeds, images, labels, offsets = next(batch_it_dict[index_batch])
        print(input_h5data_dict[index_batch])
        """
        

        train_sampler.set_epoch(cnt)
        seeds, images, labels, offsets = next(batch_it)

        
        
        
        # Train on one step: concatenate the image with the current seed and predict an update.
        t_curr = time.time()
        labels = labels.cuda()
        torch_seed = torch.from_numpy(seeds)
        input_data = torch.cat([images, torch_seed], dim=1).cuda()

        logits = model(input_data)
        # Additive update in logit space: the network output is added to the current seed.
        updated = torch_seed.cuda() + logits

        optimizer.zero_grad()
        loss = F.binary_cross_entropy_with_logits(updated, labels)
        loss.backward()

        # torch.nn.utils.clip_grad_value_(model.parameters(), args.clip_grad_thr)
        optimizer.step()

        # Write the updated seed logits back so the next move starts from them.
        seeds[...] = updated.detach().cpu().numpy()

        # tp/fp/fn/tn accumulate across iterations; rank 0 resets them at each save interval.
        pred_mask = (updated >= logit(0.8)).detach().cpu().numpy()
        true_mask = (labels > 0.5).cpu().numpy()
        true_bg = np.logical_not(true_mask)
        pred_bg = np.logical_not(pred_mask)
        tp += (true_mask & pred_mask).sum()
        fp += (true_bg & pred_mask).sum()
        fn += (true_mask & pred_bg).sum()
        tn += (true_bg & pred_bg).sum()
        precision = 1.0 * tp / max(tp + fp, 1)
        recall = 1.0 * tp / max(tp + fn, 1)
        accuracy = 1.0 * (tp + tn) / (tp + tn + fp + fn)
        print('[rank_{}, Iter_{}, loss: {:.4f}, Precision: {:.2f}%, Recall: {:.2f}%, Accuracy: {:.2f}%]'.format(
            hvd.rank(), cnt, loss.item(), precision * 100, recall * 100, accuracy * 100))

        # scheduler.step()

        """model_saving_(iter)"""
        
        if (cnt % args.save_interval) == 0 and hvd.rank() == 0:
            tp = fp = tn = fn = 0
            # t_last = t_curr
            # best_loss = loss.item()
            input_size_r = list(args.input_size)
            delta_r = list(args.delta)
            # Save the latest checkpoint and a copy tagged with the current recall.
            torch.save(model.state_dict(), os.path.join(args.save_path,
                       str(args.tag) + 'ffn_model_fov:{}_delta:{}_depth:{}.pth'.format(
                           input_size_r[0], delta_r[0], args.depth)))
            torch.save(model.state_dict(), os.path.join(args.save_path,
                       str(args.tag) + 'ffn_model_fov:{}_delta:{}_depth:{}_recall{}_.pth'.format(
                           input_size_r[0], delta_r[0], args.depth, recall * 100)))

            print('Precision: {:.2f}%, Recall: {:.2f}%, Accuracy: {:.2f}%, Model saved!'.format(
                precision * 100, recall * 100, accuracy * 100))

            # Skip the first buffer_step iterations and shift the global step so that curves
            # from resumed runs line up in TensorBoard.
            buffer_step = 3000
            resume_step = args.resume_step - buffer_step
            if cnt > buffer_step:
                tb.add_scalar("Loss", loss.item(), cnt + resume_step)
                tb.add_scalar("Precision", precision * 100, cnt + resume_step)
                tb.add_scalar("Recall", recall * 100, cnt + resume_step)
                tb.add_scalar("Accuracy", accuracy * 100, cnt + resume_step)