Example #1
    def _set_horovod_backend(self):
        self.check_horovod()
        self._distrib_type = DistributedType.HOROVOD

        # Initialize Horovod to get rank / size info
        hvd.init()
        if self.on_gpu:
            # Horovod assigns one local GPU per process
            self.parallel_device_ids = list(range(hvd.local_size()))
        else:
            self.num_processes = hvd.local_size()
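
Example #1 uses hvd.local_size() to build the per-node device list for the trainer. Below is a minimal standalone sketch of the same pattern, assuming only that Horovod and PyTorch are installed (it is not taken from the example above):

import horovod.torch as hvd
import torch

hvd.init()
if torch.cuda.is_available():
    # One process per GPU: local_rank() indexes this node's GPUs and
    # local_size() says how many of them the node is running.
    torch.cuda.set_device(hvd.local_rank())
    parallel_device_ids = list(range(hvd.local_size()))
else:
    num_processes = hvd.local_size()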
Example #2
    def local_size(cls, *args):
        """Get the number of workers at the current node."""

        try:
            return mgw.local_size(*args)
        except NameError:
            raise NameError('module <mgw> not imported')
Example #3
def _test__hvd_dist_model_create_from_backend_dist(backend, true_device):

    model = _HorovodDistModel.create_from_backend(backend=backend)

    assert hvd.rank() > -1

    with pytest.raises(
            RuntimeError,
            match=r"Can not re-initialize Horovod if it is already initialized"
    ):
        _HorovodDistModel.create_from_backend(backend=backend)

    _assert_model(
        model,
        {
            "device": true_device,
            "local_rank": hvd.local_rank(),
            "rank": hvd.rank(),
            "world_size": hvd.size(),
            "node_index": 0,
            "nnodes": 1,
            "nproc_per_node": hvd.local_size(),
        },
    )

    model.finalize()
Example #4
def prepare(args, e_ix_ln, r_ix_ln, t_ix_ln):
    mdl = _model(args, e_ix_ln, r_ix_ln, t_ix_ln)

    lr_ml = (hvd.local_size() if hvd.nccl_built() else
             1) if not args.tpu and args.adasum else _size(args)
    opt = torch.optim.Adam(mdl.parameters(),
                           lr=lr_ml * args.learning_rate,
                           weight_decay=args.weight_decay)

    st_e, bst_ls = _resume(args, mdl, opt) if args.resume != '' else (1, None)

    if not args.tpu:
        opt = hvd.DistributedOptimizer(
            opt,
            named_parameters=mdl.named_parameters(),
            compression=hvd.Compression.fp16
            if args.fp16 else hvd.Compression.none,
            op=hvd.Adasum if args.adasum else hvd.Average)
        hvd.broadcast_parameters(mdl.state_dict(), root_rank=0)

    lr_sc = torch.optim.lr_scheduler.StepLR(opt,
                                            step_size=args.learning_rate_step,
                                            gamma=args.learning_rate_gamma)
    if not args.tpu:
        hvd.broadcast_optimizer_state(opt, root_rank=0)

    ls_f = _loss_f(args).to(args.dvc)

    return mdl, opt, lr_sc, ls_f, st_e, bst_ls
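
The one-liner that picks lr_ml in Example #4 packs three cases into a single expression. Here is a hedged restatement as a plain function, assuming _size(args) returns the global world size (that helper is not shown above):

def lr_multiplier(tpu: bool, adasum: bool, nccl_built: bool,
                  world_size: int, local_size: int) -> int:
    if not tpu and adasum:
        # Adasum over NCCL scales the learning rate by the processes per
        # node; otherwise Adasum needs no scaling at all.
        return local_size if nccl_built else 1
    # Plain averaging (or TPU): scale by the global number of processes.
    return world_size

assert lr_multiplier(tpu=False, adasum=True, nccl_built=True,
                     world_size=16, local_size=4) == 4
assert lr_multiplier(tpu=False, adasum=False, nccl_built=True,
                     world_size=16, local_size=4) == 16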
Example #5
    def calculate_shuffle_buffer_size(hvd, avg_row_size, train_row_count_per_worker):
        """
        Determines the shuffle buffer size so that each worker gets at most 1 GB for its shuffle
        buffer, and so that on any single machine the workers on that machine together allocate at
        most memory_cap_gb GB for shuffle buffers. It also ensures that the buffer size is
        identical across all workers.

        example 1:
        memory_cap_gb = 4
        machine1: 8 workers
        machine2: 3 workers
        shuffle_buffer_size = 0.5 GB

        example 2:
        memory_cap_gb = 4
            machine1: 2 workers
            machine2: 3 workers
        shuffle_buffer_size = 1 GB

        example 3:
        memory_cap_gb = 4
            machine1: 2 workers
            machine2: 8 workers
            machine3: 5 workers
        shuffle_buffer_size = 0.5 GB
        """
        local_size = hvd.local_size()
        local_sizes = hvd.allgather(torch.tensor([local_size]))
        max_local_size = torch.max(local_sizes).item()

        if max_local_size > TOTAL_BUFFER_MEMORY_CAP_GIB:
            shuffle_buffer_size = TOTAL_BUFFER_MEMORY_CAP_GIB * BYTES_PER_GIB / avg_row_size / max_local_size
        else:
            shuffle_buffer_size = BYTES_PER_GIB / avg_row_size
        return int(min(shuffle_buffer_size, train_row_count_per_worker))
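
The sizing rule in Example #5's docstring can be checked with plain arithmetic. A small self-contained sketch, assuming the usual constants of a 4 GiB per-node cap and 2**30 bytes per GiB (the real values live in the surrounding module):

TOTAL_BUFFER_MEMORY_CAP_GIB = 4
BYTES_PER_GIB = 2 ** 30

def buffer_rows(avg_row_size, max_local_size, train_rows_per_worker):
    if max_local_size > TOTAL_BUFFER_MEMORY_CAP_GIB:
        rows = TOTAL_BUFFER_MEMORY_CAP_GIB * BYTES_PER_GIB / avg_row_size / max_local_size
    else:
        rows = BYTES_PER_GIB / avg_row_size
    return int(min(rows, train_rows_per_worker))

# Docstring example 1: the busiest node runs 8 workers, so each gets
# 4 GiB / 8 = 0.5 GiB of buffer; with 1 KiB rows that is 524288 rows.
assert buffer_rows(1024, 8, 10**9) == (4 * BYTES_PER_GIB) // (1024 * 8)
# Docstring example 2: at most 3 workers per node, so the 1 GiB
# per-worker cap applies instead.
assert buffer_rows(1024, 3, 10**9) == BYTES_PER_GIB // 1024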
Example #6
def _check_distributed():
    try:
        dist = hvd.size() != hvd.local_size()
    except ValueError:
        # not using horovod
        dist = False
    return dist
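
Example #6 treats hvd.size() != hvd.local_size() as "running on more than one node". A hedged companion sketch, assuming Horovod is already initialized and every node runs the same number of processes:

import horovod.torch as hvd

def num_nodes() -> int:
    # size() counts all processes, local_size() counts this node's share,
    # so the ratio is the node count for a homogeneous placement.
    return hvd.size() // hvd.local_size()

def is_multi_node() -> bool:
    return num_nodes() > 1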
Example #7
  def test_stability(self):
    hvd.init()
    # TODO support non-MPI Adasum operation
    if not hvd.mpi_enabled():
      self.skipTest("MPI not enabled")

    device = torch.device('cuda:{}'.format(hvd.local_rank())) if torch.cuda.is_available() else torch.device('cpu')
    np.random.seed(2)
    torch.manual_seed(2)
    size = hvd.size()
    local_size = hvd.local_size()
    rank = hvd.rank()

    for data_type in self.data_types:
      N = 1024
      a = np.random.normal(0, np.finfo(data_type).tiny, (N, 1)).astype(np.float64)
      r = np.random.normal(0, 1, (size, 1)).astype(np.float64)
      q = np.dot(a,r.T).astype(data_type).astype(np.float64)
      tensor = np.zeros(N,dtype=data_type)
      tensor[:] = q[:,hvd.rank()]

      tensor = torch.from_numpy(tensor).to(device)

      hvd.allreduce_(tensor, op=hvd.Adasum)

      expected = np.sum(q,axis=1) / size
      comp = self.are_close(data_type, expected, tensor.cpu().numpy()) 
      if comp:
        print('Stability test passed')
      else:
        print('computed: ', tensor)
        print('expected: ', expected)
        print('off by: ', self.diff_ratio(expected,tensor.cpu().numpy()))
      assert comp
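
Why the test above expects the plain average: every rank's tensor is a column of the rank-one matrix q, i.e. a scalar multiple of the same vector, and for collinear inputs Adasum reduces to averaging. A pure-NumPy check of the two-tensor case, using the pairwise combination rule as documented for Adasum (treat the exact formula here as an assumption, not Horovod's internal code):

import numpy as np

def adasum_pair(a, b):
    return ((1 - a.dot(b) / (2 * a.dot(a))) * a
            + (1 - a.dot(b) / (2 * b.dot(b))) * b)

rng = np.random.default_rng(0)
a = rng.normal(size=16)
for c in (0.3, -2.0, 5.0):        # any nonzero scalar multiple
    b = c * a
    assert np.allclose(adasum_pair(a, b), (a + b) / 2)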
Example #8
    def _whoami(self, verbose):
        hw_cfg = {
            'n_cpus': mp.cpu_count(),
            'n_gpus': torch.cuda.device_count() if torch.cuda.is_available() else 0
        }
        sw_cfg = {
            'global_size': hvd.size(),
            'global_rank': hvd.rank(),
            'local_size':  hvd.local_size(),
            'local_rank':  hvd.local_rank(),
            'master_rank': __MASTER_PROC_RANK__
        }
        assert sw_cfg['local_size'] <= hw_cfg['n_cpus']  # maximum one process per core
        assert hw_cfg['n_gpus'] <= hw_cfg['n_cpus']        # maximum one GPU per core
        if hw_cfg['n_gpus'] > 0:
            assert sw_cfg['local_size'] <= hw_cfg['n_gpus']  # maximum one process per GPU
            torch.cuda.set_device(sw_cfg['local_rank'])      # if node is equipped with GPUs, each process should be pinned to one
            device = torch.cuda.current_device()
        else:
            device = torch.device('cpu')
        hw_cfg['device'] = device

        self.hw_cfg = hw_cfg
        self.sw_cfg = sw_cfg
        self.is_master = self.sw_cfg['global_rank'] == self.sw_cfg['master_rank']
        self.verbose = verbose and self.is_master
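
The assertions in Example #8 boil down to "never more local processes than CPU cores, and never more than GPUs when GPUs exist". A standalone restatement, assuming Horovod has been initialized before the function is called:

import multiprocessing as mp

import horovod.torch as hvd
import torch

def check_local_oversubscription() -> None:
    n_cpus = mp.cpu_count()
    n_gpus = torch.cuda.device_count() if torch.cuda.is_available() else 0
    assert hvd.local_size() <= n_cpus, "more processes than CPU cores on this node"
    if n_gpus > 0:
        assert hvd.local_size() <= n_gpus, "more processes than GPUs on this node"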
Example #9
def fn(magic_number):
    import horovod.torch as hvd
    hvd.init()
    print(
        'Hello, rank = %d, local_rank = %d, size = %d, local_size = %d, magic_number = %d'
        % (hvd.rank(), hvd.local_rank(), hvd.size(), hvd.local_size(),
           magic_number))
    return hvd.rank()
Example #10
    def _handle_horovod(self) -> None:
        if self._num_nodes_flag > 1:
            raise MisconfigurationException(
                "Horovod does not support setting num_nodes / num_gpus explicitly. Use "
                "horovodrun / mpirun to configure the number of processes.")

        if not _HOROVOD_AVAILABLE:
            raise MisconfigurationException(
                'Requested `accelerator="horovod"`, but Horovod is not installed.'
                "Install with \n $HOROVOD_WITH_PYTORCH=1 pip install horovod[pytorch]"
            )

        hvd.init()
        if isinstance(self.accelerator, GPUAccelerator):
            # Horovod assigns one local GPU per process
            self._parallel_devices = list(range(hvd.local_size()))
        else:
            self._parallel_devices = [torch.device("cpu")] * hvd.local_size()
Example #11
    def test_parallel(self):
        hvd.init()
        # TODO support non-MPI Adasum operation
        # Only do this test if there are GPUs available.
        if not hvd.mpi_enabled() or not torch.cuda.is_available():
            self.skipTest("No GPUs available")

        device = torch.device('cuda:{}'.format(hvd.local_rank()))
        np.random.seed(2)
        torch.manual_seed(2)
        size = hvd.size()
        local_size = hvd.local_size()
        rank = hvd.rank()

        for data_type in self.data_types:
            all_Ns = [size * 20 - 13, size * 2 + 1, size + 2, 2**19]
            tensors = []
            all_qs = []
            for N in all_Ns:
                a = np.random.normal(0, 1, (N, 1)).astype(np.float64)
                r = np.random.normal(0, 1, (size, 1)).astype(np.float64)
                q = np.dot(a, r.T)
                q = q.astype(data_type)
                all_qs.append(q.astype(np.float64))
                tensors.append(q[:, hvd.rank()])

            tensors = list(
                map(lambda x: torch.from_numpy(x).to(device), tensors))

            handles = [
                hvd.allreduce_async(tensor, op=hvd.Adasum)
                for tensor in tensors
            ]

            reduced_tensors = [synchronize(h) for h in handles]

            expected = [np.sum(q, axis=1) / size for q in all_qs]
            all_comp = [
                self.are_close(data_type, e,
                               rt.cpu().numpy())
                for e, rt in zip(expected, reduced_tensors)
            ]
            if np.alltrue(all_comp):
                print('Parallel test passed')
            else:
                for c, e, rt in zip(all_comp, expected, reduced_tensors):
                    if not c:
                        print('computed: ', rt)
                        print('expected: ', e)
                        print('off by: ', self.diff_ratio(e, rt.cpu().numpy()))
            assert np.alltrue(all_comp)
Example #12
    def calculate_shuffle_buffer_size():
        """
        Determines the shuffle buffer size so that each worker gets at most 1 GB for its shuffle
        buffer, and so that on any single machine the workers on that machine together allocate at
        most memory_cap_gb GB for shuffle buffers. It also ensures that the buffer size is
        identical across all workers.

        example 1:
        memory_cap_gb = 4
        machine1: 8 workers
        machine2: 3 workers
        shuffle_buffer_size = 0.5 GB

        example 2:
        memory_cap_gb = 4
            machine1: 2 workers
            machine2: 3 workers
        shuffle_buffer_size = 1 GB

        example 3:
        memory_cap_gb = 4
            machine1: 2 workers
            machine2: 8 workers
            machine3: 5 workers
        shuffle_buffer_size = 0.5 GB
        """
        import horovod.torch as hvd

        # If user specifies any user_shuffle_buffer_size (even 0), we should honor it.
        if user_shuffle_buffer_size is not None:
            if user_shuffle_buffer_size < 0:
                raise ValueError(
                    "user_shuffle_buffer_size cannot be negative!")
            return user_shuffle_buffer_size

        local_size = hvd.local_size()
        local_sizes = hvd.allgather(torch.tensor([local_size]))
        max_local_size = torch.max(local_sizes).item()

        if max_local_size > TOTAL_BUFFER_MEMORY_CAP_GIB:
            shuffle_buffer_size = TOTAL_BUFFER_MEMORY_CAP_GIB * BYTES_PER_GIB / avg_row_size / max_local_size
        else:
            shuffle_buffer_size = BYTES_PER_GIB / avg_row_size
        return int(min(shuffle_buffer_size, train_rows / hvd.size()))
Example #13
    def test_stability_2(self):
        hvd.init()
        # TODO support non-MPI Adasum operation
        if not hvd.mpi_enabled():
            return
        device = torch.device('cuda:{}'.format(hvd.local_rank(
        ))) if torch.cuda.is_available() else torch.device('cpu')
        np.random.seed(2)
        torch.manual_seed(2)
        size = hvd.size()
        local_size = hvd.local_size()
        rank = hvd.rank()

        for data_type in self.data_types:
            N = 1024
            dt_min = np.finfo(data_type).tiny.astype(np.float64)
            dt_max = math.sqrt(np.finfo(data_type).max.astype(np.float64))
            a = np.random.normal(0, 1, (N, 1)).astype(np.float64)
            r = np.array([
                dt_max**(float(i + 1) / float(size)) *
                dt_min**(float(size - i - 1) / float(size))
                for i in range(size)
            ]).reshape(size, 1).astype(np.float64)
            np.random.shuffle(r)
            q = np.dot(a, r.T).astype(data_type).astype(np.float64)
            tensor = np.zeros(N, dtype=data_type)
            tensor[:] = q[:, hvd.rank()]

            tensor = torch.from_numpy(tensor).to(device)

            hvd.allreduce_(tensor, op=hvd.Adasum)

            expected = np.sum(q, axis=1) / size
            comp = self.are_close(data_type, expected, tensor.cpu().numpy())
            if comp:
                print('Stability 2 test passed')
            else:
                print('computed: ', tensor)
                print('expected: ', expected)
                print('off by: ',
                      self.diff_ratio(expected,
                                      tensor.cpu().numpy()))
            assert comp
Example #14
def _test__hvd_dist_model_create_from_context_dist(true_backend, true_device):

    assert _HorovodDistModel.create_from_context() is None

    hvd.init()

    true_conf = {
        "device": true_device,
        "local_rank": hvd.local_rank(),
        "rank": hvd.rank(),
        "world_size": hvd.size(),
        "node_index": 0,
        "nnodes": 1,
        "nproc_per_node": hvd.local_size(),
    }

    model = _HorovodDistModel.create_from_context()
    _assert_model(model, true_conf)

    hvd.shutdown()
Example #15
def _test__hvd_dist_model_create_from_context_dist(true_backend, true_device):

    assert _HorovodDistModel.create_from_context() is None

    hvd.init()
    lrank = hvd.local_rank()
    if torch.cuda.is_available():
        torch.cuda.set_device(lrank)

    true_conf = {
        "device": true_device,
        "local_rank": lrank,
        "rank": hvd.rank(),
        "world_size": hvd.size(),
        "node_index": 0,
        "nnodes": 1,
        "nproc_per_node": hvd.local_size(),
    }

    model = _HorovodDistModel.create_from_context()
    assert model.backend() == true_backend
    _assert_model(model, true_conf)

    hvd.shutdown()
Example #16
    def _compute_nproc_per_node(self) -> int:
        return hvd.local_size()
Example #17
def setup(config):
    data_dir = config.get("data_dir", None)
    seed = config.get("seed", 42)
    batch_size = config.get("batch_size", 64)
    use_adasum = config.get("use_adasum", False)
    lr = config.get("lr", 0.01)
    momentum = config.get("momentum", 0.5)
    use_cuda = config.get("use_cuda", False)

    # Horovod: initialize library.
    hvd.init()
    torch.manual_seed(seed)

    if use_cuda:
        # Horovod: pin GPU to local rank.
        torch.cuda.set_device(hvd.local_rank())
        torch.cuda.manual_seed(seed)

    # Horovod: limit # of CPU threads to be used per worker.
    torch.set_num_threads(1)

    kwargs = {"num_workers": 1, "pin_memory": True} if use_cuda else {}
    data_dir = data_dir or "~/data"
    with FileLock(os.path.expanduser("~/.horovod_lock")):
        train_dataset = datasets.MNIST(
            data_dir,
            train=True,
            download=True,
            transform=transforms.Compose([
                transforms.ToTensor(),
                transforms.Normalize((0.1307, ), (0.3081, ))
            ]),
        )
    # Horovod: use DistributedSampler to partition the training data.
    train_sampler = torch.utils.data.distributed.DistributedSampler(
        train_dataset, num_replicas=hvd.size(), rank=hvd.rank())
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=batch_size,
                                               sampler=train_sampler,
                                               **kwargs)

    model = Net()

    # By default, Adasum doesn't need scaling up learning rate.
    lr_scaler = hvd.size() if not use_adasum else 1

    if use_cuda:
        # Move model to GPU.
        model.cuda()
        # If using GPU Adasum allreduce, scale learning rate by local_size.
        if use_adasum and hvd.nccl_built():
            lr_scaler = hvd.local_size()

    # Horovod: scale learning rate by lr_scaler.
    optimizer = optim.SGD(model.parameters(),
                          lr=lr * lr_scaler,
                          momentum=momentum)

    # Horovod: wrap optimizer with DistributedOptimizer.
    optimizer = hvd.DistributedOptimizer(
        optimizer,
        named_parameters=model.named_parameters(),
        op=hvd.Adasum if use_adasum else hvd.Average,
    )

    return model, optimizer, train_loader, train_sampler
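
Example #17 hands num_replicas=hvd.size() and rank=hvd.rank() to DistributedSampler, which splits the dataset into equally sized, disjoint shards. A quick check of that split with plain PyTorch (no Horovod needed; the sizes here are made up):

import math

import torch
from torch.utils.data import TensorDataset
from torch.utils.data.distributed import DistributedSampler

dataset = TensorDataset(torch.arange(10))
world_size = 4
samplers = [DistributedSampler(dataset, num_replicas=world_size, rank=r)
            for r in range(world_size)]
# Each "rank" sees ceil(N / num_replicas) samples (padded when N is not divisible).
assert all(len(s) == math.ceil(len(dataset) / world_size) for s in samplers)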
Example #18
                                             sampler=val_sampler, **kwargs)


    # Set up standard VGG16 model.
    model = models.vgg16()

    # By default, Adasum doesn't need scaling up learning rate.
    # For sum/average with gradient Accumulation: scale learning rate by batches_per_allreduce
    lr_scaler = args.batches_per_allreduce * hvd.size() if not args.use_adasum else 1

    if args.cuda:
        # Move model to GPU.
        model.cuda()
        # If using GPU Adasum allreduce, scale learning rate by local_size.
        if args.use_adasum and hvd.nccl_built():
            lr_scaler = args.batches_per_allreduce * hvd.local_size()

    # Horovod: scale learning rate by the number of GPUs.
    optimizer = optim.SGD(model.parameters(),
                          lr=(args.base_lr *
                              lr_scaler),
                          momentum=args.momentum, weight_decay=args.wd)

    # Horovod: (optional) compression algorithm.
    compression = hvd.Compression.fp16 if args.fp16_allreduce else hvd.Compression.none

    # Horovod: wrap optimizer with DistributedOptimizer.
    optimizer = hvd.DistributedOptimizer(
        optimizer, named_parameters=model.named_parameters(),
        compression=compression,
        backward_passes_per_step=args.batches_per_allreduce,)
Example #19
    test_loader = torch.utils.data.DataLoader(test_dataset,
                                              batch_size=args.test_batch_size,
                                              sampler=test_sampler,
                                              **kwargs)

    model = Net()

    # By default, Adasum doesn't need scaling up learning rate.
    lr_scaler = hvd.size() if not args.use_adasum else 1

    if args.cuda:
        # Move model to GPU.
        model.cuda()
        # If using GPU Adasum allreduce, scale learning rate by local_size.
        if args.use_adasum and hvd.nccl_built():
            lr_scaler = hvd.local_size()

    # Horovod: scale learning rate by lr_scaler.
    optimizer = optim.SGD(model.parameters(),
                          lr=args.lr * lr_scaler,
                          momentum=args.momentum)

    # Horovod: broadcast parameters & optimizer state.
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(optimizer, root_rank=0)

    # Horovod: (optional) compression algorithm.
    compression = hvd.Compression.fp16 if args.fp16_allreduce else hvd.Compression.none

    # Horovod: wrap optimizer with DistributedOptimizer.
    optimizer = hvd.DistributedOptimizer(
Example #20
def train_func(config):
    data_dir = config.get("data_dir", None)
    seed = config.get("seed", 42)
    use_cuda = config.get("use_cuda", False)
    batch_size = config.get("batch_size", 64)
    use_adasum = config.get("use_adasum", False)
    lr = config.get("lr", 0.01)
    momentum = config.get("momentum", 0.5)
    num_epochs = config.get("num_epochs", 10)
    log_interval = config.get("log_interval", 10)

    # Horovod: initialize library.
    hvd.init()
    torch.manual_seed(seed)

    if use_cuda:
        # Horovod: pin GPU to local rank.
        torch.cuda.set_device(hvd.local_rank())
        torch.cuda.manual_seed(seed)

    # Horovod: limit # of CPU threads to be used per worker.
    torch.set_num_threads(1)

    kwargs = {"num_workers": 1, "pin_memory": True} if use_cuda else {}
    data_dir = data_dir or "~/data"
    with FileLock(os.path.expanduser("~/.horovod_lock")):
        train_dataset = \
            datasets.MNIST(data_dir, train=True, download=True,
                           transform=transforms.Compose([
                               transforms.ToTensor(),
                               transforms.Normalize((0.1307,), (0.3081,))
                           ]))
    # Horovod: use DistributedSampler to partition the training data.
    train_sampler = torch.utils.data.distributed.DistributedSampler(
        train_dataset, num_replicas=hvd.size(), rank=hvd.rank())
    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=batch_size, sampler=train_sampler, **kwargs)

    model = Net()

    # By default, Adasum doesn't need scaling up learning rate.
    lr_scaler = hvd.size() if not use_adasum else 1

    if use_cuda:
        # Move model to GPU.
        model.cuda()
        # If using GPU Adasum allreduce, scale learning rate by local_size.
        if use_adasum and hvd.nccl_built():
            lr_scaler = hvd.local_size()

    # Horovod: scale learning rate by lr_scaler.
    optimizer = optim.SGD(
        model.parameters(), lr=lr * lr_scaler, momentum=momentum)

    # Horovod: wrap optimizer with DistributedOptimizer.
    optimizer = hvd.DistributedOptimizer(
        optimizer,
        named_parameters=model.named_parameters(),
        op=hvd.Adasum if use_adasum else hvd.Average)

    results = []
    for epoch in range(1, num_epochs + 1):
        model.train()
        # Horovod: set epoch to sampler for shuffling.
        train_sampler.set_epoch(epoch)
        num_batches = len(train_loader)
        for batch_idx, (data, target) in enumerate(train_loader):
            if use_cuda:
                data, target = data.cuda(), target.cuda()
            optimizer.zero_grad()
            output = model(data)
            loss = F.nll_loss(output, target)
            loss.backward()
            optimizer.step()
            if batch_idx % log_interval == 0:
                # Horovod: use train_sampler to determine the number of
                # examples in this worker's partition.
                print("Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}".format(
                    epoch, batch_idx * len(data), len(train_sampler),
                    100. * batch_idx / len(train_loader), loss.item()))
            if batch_idx == num_batches - 1:
                results.append(loss.item())
    return results
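
The train_sampler.set_epoch(epoch) call in the loop above is what makes the sampler reshuffle differently every epoch. A pure-PyTorch check of that behaviour (the sizes are arbitrary; no Horovod needed):

import torch
from torch.utils.data import TensorDataset
from torch.utils.data.distributed import DistributedSampler

ds = TensorDataset(torch.arange(100))
sampler = DistributedSampler(ds, num_replicas=2, rank=0, shuffle=True)

sampler.set_epoch(0)
order_epoch0 = list(sampler)
sampler.set_epoch(1)
assert list(sampler) != order_epoch0   # a new shuffle each epoch
sampler.set_epoch(0)
assert list(sampler) == order_epoch0   # and deterministic for a given epoch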
Example #21
    def __init__(
        self,
        env,
        env_params: dict,
        log_dir: str,
        ac_kwargs: dict = {},
        seed: int = 0,
        steps_per_epoch: int = 4000,
        epochs: int = 50,
        gamma: float = 0.99,
        clip_ratio: float = 0.2,
        pi_lr: float = 3e-4,
        vf_lr: float = 1e-3,
        train_iters: int = 100,
        entropy_coeff: float = 1e-2,
        lam: float = 0.97,
        target_kl: float = 0.01,
        save_freq: int = 10,
        load_path=None,
        render_train: bool = False,
        wandb_id: Optional[str] = None,
        **kwargs,
    ):
        self.log_dir = log_dir
        self.render_dir = os.path.join(log_dir, "renders")
        self.ckpt_dir = os.path.join(log_dir, "checkpoints")
        if hvd.rank() == 0:
            os.makedirs(self.log_dir, exist_ok=True)
            os.makedirs(self.render_dir, exist_ok=True)
            os.makedirs(self.ckpt_dir, exist_ok=True)
        self.softlink = os.path.abspath(
            os.path.join(self.ckpt_dir, f"ckpt_latest.pth"))
        self.ac_params_file = os.path.join(log_dir, "ac_params.json")
        hparams = convert_json(locals())
        self.logger = EpochLogger(output_dir=self.log_dir, exp_name=wandb_id)

        if torch.cuda.is_available():
            # Horovod: pin GPU to local rank.
            dev_id = int(torch.cuda.device_count() * hvd.local_rank() /
                         hvd.local_size())
            torch.cuda.set_device(dev_id)
            device = torch.device(f"cuda:{dev_id}")
            torch.cuda.manual_seed(seed)
        else:
            device = torch.device("cpu")

        #         env_params.update({"device": device})
        self.env = env(**env_params)
        self.ac_params = {k: v for k, v in ac_kwargs.items()}
        self.ac_params.update({
            "observation_space": self.env.observation_space,
            "action_space": self.env.action_space,
            "nagents": self.env.nagents,
        })

        self.entropy_coeff = entropy_coeff
        self.entropy_coeff_decay = entropy_coeff / epochs

        # Horovod: limit # of CPU threads to be used per worker.
        torch.set_num_threads(1)

        torch.save(self.ac_params, self.ac_params_file)

        if os.path.isfile(self.softlink):
            self.logger.log("Restarting from latest checkpoint", color="red")
            load_path = self.softlink

        # Random seed
        seed += 10000 * hvd.rank()
        torch.manual_seed(seed)
        np.random.seed(seed)

        self.nagents = self.env.nagents
        self.ac = PPOLidarActorCritic(
            self.env.observation_space,
            self.env.action_space,
            nagents=self.nagents,
            centralized=True,
            **ac_kwargs,
        )

        self.device = device

        self.pi_lr = pi_lr
        self.vf_lr = vf_lr

        self.load_path = load_path
        if load_path is not None:
            self.load_model(load_path)
        else:
            self.pi_optimizer = Adam(trainable_parameters(self.ac.pi),
                                     lr=self.pi_lr,
                                     eps=1e-8)
            self.vf_optimizer = Adam(trainable_parameters(self.ac.v),
                                     lr=self.vf_lr,
                                     eps=1e-8)

        # Sync params across processes
        hvd.broadcast_parameters(self.ac.state_dict(), root_rank=0)
        hvd.broadcast_optimizer_state(self.pi_optimizer, root_rank=0)
        hvd.broadcast_optimizer_state(self.vf_optimizer, root_rank=0)
        self.ac = self.ac.to(device)
        self.move_optimizer_to_device(self.pi_optimizer)
        self.move_optimizer_to_device(self.vf_optimizer)

        if hvd.rank() == 0:
            if wandb_id is None:
                eid = (log_dir.split("/")[-2]
                       if load_path is None else load_path.split("/")[-4])
            else:
                eid = wandb_id
            wandb.init(
                name=eid,
                id=eid,
                project="Social Driving",
                resume=load_path is not None,
            )
            wandb.watch_called = False

            if "self" in hparams:
                del hparams["self"]
            wandb.config.update(hparams, allow_val_change=True)

            wandb.watch(self.ac.pi, log="all")
            wandb.watch(self.ac.v, log="all")

        # Count variables
        var_counts = tuple(
            count_vars(module) for module in [self.ac.pi, self.ac.v])
        self.logger.log(
            "\nNumber of parameters: \t pi: %d, \t v: %d\n" % var_counts,
            color="green",
        )

        # Set up experience buffer
        self.steps_per_epoch = steps_per_epoch
        self.local_steps_per_epoch = int(steps_per_epoch / hvd.size())
        self.buf = CentralizedPPOBuffer(
            self.env.observation_space[0].shape,
            self.env.observation_space[1].shape,
            self.env.action_space.shape,
            self.local_steps_per_epoch,
            gamma,
            lam,
            self.env.nagents,
            device=self.device,
        )

        self.gamma = gamma
        self.clip_ratio = clip_ratio
        self.train_iters = train_iters
        self.target_kl = target_kl
        self.epochs = epochs
        self.save_freq = save_freq
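
Example #21 pins each process with int(torch.cuda.device_count() * hvd.local_rank() / hvd.local_size()), which spreads local ranks evenly over the node's GPUs and also covers having more processes than GPUs. A pure-Python look at the mapping (the 8-process / 4-GPU shape is only an illustration):

def pinned_gpu(local_rank: int, local_size: int, n_gpus: int) -> int:
    return int(n_gpus * local_rank / local_size)

# Two processes share each of the four GPUs.
assert [pinned_gpu(r, 8, 4) for r in range(8)] == [0, 0, 1, 1, 2, 2, 3, 3]
# One process per GPU degenerates to the usual local_rank pinning.
assert [pinned_gpu(r, 4, 4) for r in range(4)] == [0, 1, 2, 3]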
Example #22
def main_worker(args_):

    args_.cuda = not args_.no_cuda and torch.cuda.is_available()

    allreduce_batch_size = args_.batch_size * args_.batches_per_allreduce

    hvd.init()
    torch.distributed.init_process_group('nccl', rank=4)

    if args_.cuda:
        # Horovod: pin GPU to local rank.
        torch.cuda.set_device(hvd.local_rank())
        print(f"this process's hvd rank = {hvd.local_rank()}")
        # torch.cuda.manual_seed(args_.seed)

    # cudnn.benchmark = True

    # # If set > 0, will resume training from a given checkpoint.
    # resume_from_epoch = 0
    # for try_epoch in range(args_.epochs, 0, -1):
    #     if os.path.exists(args_.checkpoint_format.format(epoch=try_epoch)):
    #         resume_from_epoch = try_epoch
    #         break
    #
    # # Horovod: broadcast resume_from_epoch from rank 0 (which will have
    # # checkpoints) to other ranks.
    # resume_from_epoch = hvd.broadcast(torch.tensor(resume_from_epoch), root_rank=0,
    #                                   name='resume_from_epoch').item()

    # # Horovod: print logs on the first worker.
    # verbose = 1 if hvd.rank() == 0 else 0
    #
    # # Horovod: write TensorBoard logs on first worker.
    # try:
    #     if LooseVersion(torch.__version__) >= LooseVersion('1.2.0'):
    #         from torch.utils.tensorboard import SummaryWriter
    #     else:
    #         from tensorboardX import SummaryWriter
    #     os.makedirs(os.path.join(args_.model_output_dir, 'logs'), exist_ok=True)
    #     log_writer = SummaryWriter(os.path.join(args_.model_output_dir, 'logs')) if hvd.rank() == 0 else None
    # except ImportError:
    #     log_writer = None

    ### MODEL CREATION ###

    # create model
    model1 = VQ_VAE(num_inputs=1, weight_matching=0., channel_var=np.ones((1,)))
    model2 = VQ_VAE(num_inputs=1, weight_matching=0.0005, channel_var=np.ones((1,)))

    model1.cuda()
    model2.cuda()

    model1 = torch.nn.parallel.DistributedDataParallel(model1)
    model2 = torch.nn.parallel.DistributedDataParallel(model2)

    # By default, Adasum doesn't need scaling up learning rate.
    # For sum/average with gradient Accumulation: scale learning rate by batches_per_allreduce
    if args_.cuda and args_.use_adasum and hvd.nccl_built():
        # If using GPU Adasum allreduce, scale learning rate by local_size.
        lr_scaler = args_.batches_per_allreduce * hvd.local_size()
    elif not args_.use_adasum:
        lr_scaler = args_.batches_per_allreduce * hvd.size()
    else:
        lr_scaler = 1

    # Horovod: scale learning rate by the number of GPUs.
    optimizer1 = t.optim.Adam(model1.parameters(),
                              lr=(args_.base_lr * lr_scaler),
                              betas=(.9, .999))
    optimizer2 = t.optim.Adam(model2.parameters(),
                              lr=(args_.base_lr * lr_scaler),
                              betas=(.9, .999))

    # Horovod: (optional) compression algorithm.
    compression = hvd.Compression.fp16 if args_.fp16_allreduce else hvd.Compression.none

    # Horovod: wrap optimizer with DistributedOptimizer.
    optimizer1 = hvd.DistributedOptimizer(
        optimizer1, named_parameters=model1.named_parameters(),
        compression=compression,
        backward_passes_per_step=args_.batches_per_allreduce,
        op=hvd.Adasum if args_.use_adasum else hvd.Average)

    optimizer2 = hvd.DistributedOptimizer(
        optimizer2, named_parameters=model2.named_parameters(),
        compression=compression,
        backward_passes_per_step=args_.batches_per_allreduce,
        op=hvd.Adasum if args_.use_adasum else hvd.Average)

    # # Restore from a previous checkpoint, if initial_epoch is specified.
    # # Horovod: restore on the first worker which will broadcast weights to other workers.
    # if resume_from_epoch > 0 and hvd.rank() == 0:
    #     filepath = args.checkpoint_format.format(epoch=resume_from_epoch)
    #     checkpoint = torch.load(filepath)
    #     model.load_state_dict(checkpoint['model'])
    #     optimizer.load_state_dict(checkpoint['optimizer'])

    ### Settings ###
    model_output_dir = args_.model_output_dir
    project_dir = args_.project_dir

    ### Prepare Data ###
    log.info("LOADING FILES")

    # ======= load data using pytorch systems ========
    torch.set_num_threads(4)
    dataset = DatasetFolderWithPaths(
        root=project_dir+"/JUNE"+"/raw_patches",
        loader=npy_loader,
        extensions='.npy'
    )

    dataset_mask = DatasetFolderWithPaths(
        root=project_dir+"/JUNE"+"/raw_masks",
        loader=npy_loader,
        extensions='.npy'
    )

    relation_mat = np.load(os.path.join(project_dir, "JUNE", "raw_patches", "relation_mat.npy"), allow_pickle=True)

    # Horovod: use DistributedSampler to partition data among workers. Manually specify
    # `num_replicas=hvd.size()` and `rank=hvd.rank()`.
    train_sampler = torch.utils.data.distributed.DistributedSampler(
        dataset, num_replicas=hvd.size(), rank=hvd.rank())
    train_sampler_mask = torch.utils.data.distributed.DistributedSampler(
        dataset_mask, num_replicas=hvd.size(), rank=hvd.rank())

    os.makedirs(os.path.join(model_output_dir, "stage1"), exist_ok=True)
    os.makedirs(os.path.join(model_output_dir, "stage2"), exist_ok=True)

    # =========================================================
    # =========================================================
    log.info("TRAINING: STARTING STAGE 1")

    kwargs = {'num_workers': 4, 'pin_memory': True} if args_.cuda else {}
    train_loader = torch.utils.data.DataLoader(
        dataset, batch_size=allreduce_batch_size,
        sampler=train_sampler, **kwargs)
    train_mask_loader = torch.utils.data.DataLoader(
        dataset_mask, batch_size=allreduce_batch_size,
        sampler=train_sampler_mask, **kwargs)

    # Horovod: broadcast parameters & optimizer state.
    hvd.broadcast_parameters(model1.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(optimizer1, root_rank=0)

    output_dir = os.path.join(model_output_dir, "stage1")
    writer = SummaryWriter(output_dir)
    log.info(f"\ttensorboard logs written to {output_dir}")

    for epoch in range(args_.stage1_epochs):
        model1.train()
        train_sampler.set_epoch(epoch)

        mean_loss = train(model1,
                          train_loader,
                          optimizer1,
                          # relation_mat=relation_mat,
                          mask_loader=train_mask_loader,
                          args_=args_
                          )

        for key, loss in mean_loss.items():
            mean_loss[key] = sum(loss) / len(loss) if len(loss) > 0 else -1.
            writer.add_scalar('Loss/' + key, mean_loss[key], epoch)
        writer.flush()
        log.info('\tepoch %d' % epoch)
        log.info('\t'.join(['{}:{:0.4f}  '.format(key, loss) for key, loss in mean_loss.items()]))

        # only master process should save checkpoints.
        if torch.distributed.get_rank() == 0:
            log.info(f'\t saving epoch {epoch}')
            t.save(model1.state_dict(), os.path.join(output_dir, 'model_epoch%d.pt' % epoch))

    writer.close()

    # =========================================================
    # =========================================================
    log.info("TRAINING: STARTING STAGE 2")

    # get the last saved epoch.  on IBM, use max(). on OSX use min()
    # s1_epochs = glob.glob(os.path.join(model_output_dir, "stage1", "/*"))
    s1_epochs = glob.glob(os.path.join(model_output_dir, "stage1") + '/*.pt')
    last_epoch = max(s1_epochs, key=os.path.getctime)
    log.info(f"\tloading last epoch = {last_epoch}")

    train_loader = torch.utils.data.DataLoader(dataset,
                                               batch_size=allreduce_batch_size,
                                               sampler=train_sampler)

    train_mask_loader = torch.utils.data.DataLoader(dataset_mask,
                                                    batch_size=allreduce_batch_size,
                                                    sampler=train_sampler_mask)

    # Horovod: broadcast parameters & optimizer state.
    hvd.broadcast_parameters(model2.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(optimizer2, root_rank=0)

    output_dir = os.path.join(model_output_dir, "stage2")
    writer = SummaryWriter(output_dir)
    log.info(f"\ttensorboard logs written to {output_dir}")

    model2.load_state_dict(t.load(last_epoch))
    for epoch in range(args_.stage2_epochs):
        model2.train()
        train_sampler.set_epoch(epoch)

        mean_loss = train(model2,
                          train_loader,
                          optimizer2,
                          # relation_mat=relation_mat,
                          mask_loader=train_mask_loader
                          )

        # shuffle samples ids at the end of the epoch
        # if shuffle_data:
        #     np.random.shuffle(sample_ids)
        for key, loss in mean_loss.items():
            mean_loss[key] = sum(loss) / len(loss) if len(loss) > 0 else -1.
            writer.add_scalar('Loss/' + key, mean_loss[key], epoch)
        writer.flush()
        log.info('\tepoch %d' % epoch)
        log.info('\t'.join(['{}:{:0.4f}  '.format(key, loss) for key, loss in mean_loss.items()]))

        if torch.distributed.get_rank() == 0:
            log.info(f'\t saving epoch {epoch}')
            t.save(model2.state_dict(), os.path.join(output_dir, 'model_epoch%d.pt' % epoch))
    writer.close()
Example #23
def train_fn():
    # Horovod: initialize library.
    hvd.init()
    torch.manual_seed(args.seed)
    args.cuda = not args.no_cuda and torch.cuda.is_available()

    if args.cuda:
        # Horovod: pin GPU to local rank.
        torch.cuda.set_device(hvd.local_rank())
        torch.cuda.manual_seed(args.seed)

    # Horovod: limit # of CPU threads to be used per worker.
    torch.set_num_threads(1)

    kwargs = {'num_workers': 1, 'pin_memory': True} if args.cuda else {}
    train_dataset = \
        datasets.MNIST('data-%d' % hvd.rank(), train=True, download=True,
                       transform=transforms.Compose([
                           transforms.ToTensor(),
                           transforms.Normalize((0.1307,), (0.3081,))
                       ]))
    # Horovod: use DistributedSampler to partition the training data.
    train_sampler = torch.utils.data.distributed.DistributedSampler(
        train_dataset, num_replicas=hvd.size(), rank=hvd.rank())
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               sampler=train_sampler,
                                               **kwargs)
    transformations = transforms.Compose(
        [transforms.ToTensor(),
         transforms.Normalize((0.1307, ), (0.3081, ))])

    test_dataset = datasets.MNIST('data-%d' % hvd.rank(),
                                  train=False,
                                  transform=transformations)
    # Horovod: use DistributedSampler to partition the test data.
    test_sampler = torch.utils.data.distributed.DistributedSampler(
        test_dataset, num_replicas=hvd.size(), rank=hvd.rank())
    test_loader = torch.utils.data.DataLoader(test_dataset,
                                              batch_size=args.test_batch_size,
                                              sampler=test_sampler,
                                              **kwargs)

    model = Net()

    # By default, Adasum doesn't need scaling up learning rate.
    lr_scaler = hvd.size() if not args.use_adasum else 1

    if args.cuda:
        # Move model to GPU.
        model.cuda()
        # If using GPU Adasum allreduce, scale learning rate by local_size.
        if args.use_adasum and hvd.nccl_built():
            lr_scaler = hvd.local_size()

    # Horovod: scale learning rate by lr_scaler.
    optimizer = optim.SGD(model.parameters(),
                          lr=args.lr * lr_scaler,
                          momentum=args.momentum)

    # Horovod: (optional) compression algorithm.
    compression = (hvd.Compression.fp16
                   if args.fp16_allreduce else hvd.Compression.none)

    @hvd.elastic.run
    def train(state):
        # post synchronization event (worker added, worker removed) init ...
        for state.epoch in range(state.epoch, args.epochs + 1):
            state.model.train()

            train_sampler.set_epoch(state.epoch)
            steps_remaining = len(train_loader) - state.batch

            for state.batch, (data, target) in enumerate(train_loader):
                if state.batch >= steps_remaining:
                    break

                if args.cuda:
                    data, target = data.cuda(), target.cuda()
                state.optimizer.zero_grad()
                output = state.model(data)
                loss = F.nll_loss(output, target)
                loss.backward()
                state.optimizer.step()
                if state.batch % args.log_interval == 0:
                    # Horovod: use train_sampler to determine
                    # the number of examples in this worker's partition.
                    print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.
                          format(state.epoch, state.batch * len(data),
                                 len(train_sampler),
                                 100.0 * state.batch / len(train_loader),
                                 loss.item()))
                if (state.batch + 1) % args.num_batches_per_commit == 0:
                    state.commit()
            state.batch = 0

    def test():
        model.eval()
        test_loss = 0.
        test_accuracy = 0.
        for data, target in test_loader:
            if args.cuda:
                data, target = data.cuda(), target.cuda()
            output = model(data)
            # sum up batch loss
            test_loss += F.nll_loss(output, target, size_average=False).item()
            # get the index of the max log-probability
            pred = output.data.max(1, keepdim=True)[1]
            test_accuracy += pred.eq(
                target.data.view_as(pred)).cpu().float().sum()

        # Horovod: use test_sampler to determine the number of examples in
        # this worker's partition.
        test_loss /= len(test_sampler)
        test_accuracy /= len(test_sampler)

        # Horovod: average metric values across workers.
        test_loss = metric_average(test_loss, 'avg_loss')
        test_accuracy = metric_average(test_accuracy, 'avg_accuracy')

        # Horovod: print output only on first rank.
        if hvd.rank() == 0:
            print(
                '\nTest set: Average loss: {:.4f}, Accuracy: {:.2f}%\n'.format(
                    test_loss, 100. * test_accuracy))

    # Horovod: wrap optimizer with DistributedOptimizer.
    optimizer = hvd.DistributedOptimizer(
        optimizer,
        named_parameters=model.named_parameters(),
        compression=compression,
        op=hvd.Adasum if args.use_adasum else hvd.Average)

    # adjust learning rate on reset
    def on_state_reset():
        for param_group in optimizer.param_groups:
            param_group['lr'] = args.lr * hvd.size()

    state = hvd.elastic.TorchState(model, optimizer, epoch=1, batch=0)
    state.register_reset_callbacks([on_state_reset])
    train(state)
    test()
Example #24
def main(args):
    def train_mixed_precision(epoch, scaler):
        model.train()
        # Horovod: set epoch to sampler for shuffling.
        train_sampler.set_epoch(epoch)
        for batch_idx, (data, target) in enumerate(train_loader):
            if args.cuda:
                data, target = data.cuda(), target.cuda()
            optimizer.zero_grad()
            with torch.cuda.amp.autocast():
                output = model(data)
                loss = F.nll_loss(output, target)

            scaler.scale(loss).backward()
            # Make sure all async allreduces are done
            optimizer.synchronize()
            # In-place unscaling of all gradients before weights update
            scaler.unscale_(optimizer)
            with optimizer.skip_synchronize():
                scaler.step(optimizer)
            # Update scaler in case of overflow/underflow
            scaler.update()

            if batch_idx % args.log_interval == 0:
                # Horovod: use train_sampler to determine the number of examples in
                # this worker's partition.
                print(
                    'Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}\tLoss Scale: {}'
                    .format(epoch, batch_idx * len(data), len(train_sampler),
                            100. * batch_idx / len(train_loader), loss.item(),
                            scaler.get_scale()))

    def train_epoch(epoch):
        model.train()
        # Horovod: set epoch to sampler for shuffling.
        train_sampler.set_epoch(epoch)
        for batch_idx, (data, target) in enumerate(train_loader):
            if args.cuda:
                data, target = data.cuda(), target.cuda()
            optimizer.zero_grad()
            output = model(data)
            loss = F.nll_loss(output, target)
            loss.backward()
            optimizer.step()
            if batch_idx % args.log_interval == 0:
                # Horovod: use train_sampler to determine the number of examples in
                # this worker's partition.
                print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                    epoch, batch_idx * len(data), len(train_sampler),
                    100. * batch_idx / len(train_loader), loss.item()))

    def metric_average(val, name):
        tensor = torch.tensor(val)
        avg_tensor = hvd.allreduce(tensor, name=name)
        return avg_tensor.item()

    def test():
        model.eval()
        test_loss = 0.
        test_accuracy = 0.
        for data, target in test_loader:
            if args.cuda:
                data, target = data.cuda(), target.cuda()
            output = model(data)
            # sum up batch loss
            test_loss += F.nll_loss(output, target, size_average=False).item()
            # get the index of the max log-probability
            pred = output.data.max(1, keepdim=True)[1]
            test_accuracy += pred.eq(
                target.data.view_as(pred)).cpu().float().sum()

        # Horovod: use test_sampler to determine the number of examples in
        # this worker's partition.
        test_loss /= len(test_sampler)
        test_accuracy /= len(test_sampler)

        # Horovod: average metric values across workers.
        test_loss = metric_average(test_loss, 'avg_loss')
        test_accuracy = metric_average(test_accuracy, 'avg_accuracy')

        # Horovod: print output only on first rank.
        if hvd.rank() == 0:
            print(
                '\nTest set: Average loss: {:.4f}, Accuracy: {:.2f}%\n'.format(
                    test_loss, 100. * test_accuracy))

    # Horovod: initialize library.
    hvd.init()
    torch.manual_seed(args.seed)

    if args.cuda:
        # Horovod: pin GPU to local rank.
        torch.cuda.set_device(hvd.local_rank())
        torch.cuda.manual_seed(args.seed)
    else:
        if args.use_mixed_precision:
            raise ValueError(
                "Mixed precision is only supported with cuda enabled.")

    if (args.use_mixed_precision
            and LooseVersion(torch.__version__) < LooseVersion('1.6.0')):
        raise ValueError("""Mixed precision is using torch.cuda.amp.autocast(),
                            which requires torch >= 1.6.0""")

    # Horovod: limit # of CPU threads to be used per worker.
    torch.set_num_threads(1)

    kwargs = {'num_workers': 1, 'pin_memory': True} if args.cuda else {}
    # When supported, use 'forkserver' to spawn dataloader workers instead of 'fork' to prevent
    # issues with Infiniband implementations that are not fork-safe
    if (kwargs.get('num_workers', 0) > 0 and hasattr(mp, '_supports_context')
            and mp._supports_context
            and 'forkserver' in mp.get_all_start_methods()):
        kwargs['multiprocessing_context'] = 'forkserver'

    data_dir = args.data_dir or './data'
    with FileLock(os.path.expanduser("~/.horovod_lock")):
        train_dataset = \
            datasets.MNIST(data_dir, train=True, download=True,
                           transform=transforms.Compose([
                               transforms.ToTensor(),
                               transforms.Normalize((0.1307,), (0.3081,))
                           ]))

    # Horovod: use DistributedSampler to partition the training data.
    train_sampler = torch.utils.data.distributed.DistributedSampler(
        train_dataset, num_replicas=hvd.size(), rank=hvd.rank())
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               sampler=train_sampler,
                                               **kwargs)

    test_dataset = \
        datasets.MNIST(data_dir, train=False, transform=transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.1307,), (0.3081,))
        ]))
    # Horovod: use DistributedSampler to partition the test data.
    test_sampler = torch.utils.data.distributed.DistributedSampler(
        test_dataset, num_replicas=hvd.size(), rank=hvd.rank())
    test_loader = torch.utils.data.DataLoader(test_dataset,
                                              batch_size=args.test_batch_size,
                                              sampler=test_sampler,
                                              **kwargs)

    model = Net()

    # By default, Adasum doesn't need scaling up learning rate.
    lr_scaler = hvd.size() if not args.use_adasum else 1

    if args.cuda:
        # Move model to GPU.
        model.cuda()
        # If using GPU Adasum allreduce, scale learning rate by local_size.
        if args.use_adasum and hvd.nccl_built():
            lr_scaler = hvd.local_size()

    # Horovod: scale learning rate by lr_scaler.
    optimizer = optim.SGD(model.parameters(),
                          lr=args.lr * lr_scaler,
                          momentum=args.momentum)

    # Horovod: broadcast parameters & optimizer state.
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(optimizer, root_rank=0)

    # Horovod: (optional) compression algorithm.
    compression = hvd.Compression.fp16 if args.fp16_allreduce else hvd.Compression.none

    # Horovod: wrap optimizer with DistributedOptimizer.
    optimizer = hvd.DistributedOptimizer(
        optimizer,
        named_parameters=model.named_parameters(),
        compression=compression,
        op=hvd.Adasum if args.use_adasum else hvd.Average,
        gradient_predivide_factor=args.gradient_predivide_factor)

    if args.use_mixed_precision:
        # Initialize scaler in global scale
        scaler = torch.cuda.amp.GradScaler()

    for epoch in range(1, args.epochs + 1):
        if args.use_mixed_precision:
            train_mixed_precision(epoch, scaler)
        else:
            train_epoch(epoch)
        # Keep test in full precision since computation is relatively light.
        test()
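
The mixed-precision path in Example #24 has a specific ordering: finish Horovod's asynchronous allreduces, unscale in place, then step inside skip_synchronize() so the optimizer does not reduce twice. A condensed sketch of that ordering as one function (nothing runs at import time; the model, optimizer, and scaler are assumed to be a module, a hvd.DistributedOptimizer, and a torch.cuda.amp.GradScaler created elsewhere):

import torch
import torch.nn.functional as F

def amp_train_step(model, optimizer, scaler, data, target):
    optimizer.zero_grad()
    with torch.cuda.amp.autocast():
        loss = F.nll_loss(model(data), target)
    scaler.scale(loss).backward()
    optimizer.synchronize()          # 1) wait for the async allreduces
    scaler.unscale_(optimizer)       # 2) unscale gradients in place
    with optimizer.skip_synchronize():
        scaler.step(optimizer)       # 3) step without a second reduce
    scaler.update()
    return loss.detach()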
Example #25
        return
    with open('/var/scratch/sdhar/logs/pytorch_synthetic.csv', 'a', newline='') as f:
        csvwriter = csv.writer(f, lineterminator="\n")
        csvwriter.writerow([
            model,
            batch_size,
            device,
            num_devices,
            num_devices_per_node,
            disable_ib,
            disable_nccl_p2p,
            img_sec_mean,
            img_sec_conf,
            total_img_sec_mean,
            total_img_sec_conf])

log_csv(
    args.model,
    str(args.batch_size),
    device,
    str(hvd.size()),
    str(hvd.local_size()),
    #Disable infiniband
    str(args.disable_ib),
    #Disable NCCL P2P Communication
    str(args.disable_p2p), 
    str(img_sec_mean),
    str(img_sec_conf),
    str(hvd.size() * img_sec_mean),
    str(hvd.size() * img_sec_conf))
Example #26
def train(args):
    hvd.init()

    print("Hello from local_rank {}/{}, rank {}/{}".format(
        hvd.local_rank(), hvd.local_size(), hvd.rank(), hvd.size()))

    verbose = hvd.rank() == 0

    if verbose:
        print('Using PyTorch version:', torch.__version__)
        print('Horovod version: {}, CUDA: {}, ROCM: {}, NCCL: {}, MPI: {}'.format(
            hvd_version,
            hvd.cuda_built(),
            hvd.rocm_built(),
            hvd.nccl_built(),
            hvd.mpi_built()))
        print(torch.__config__.show())

    cudnn.benchmark = True

    torch.cuda.set_device(hvd.local_rank())
    world_size = hvd.size()

    # Set up standard model.
    if verbose:
        print('Using {} model'.format(args.model))
    model = getattr(models, args.model)()
    model = model.cuda()

    # import torch.multiprocessing as mp
    # # # assert "forkserver" in mp.get_all_start_methods()
    # mp.set_start_method("forkserver")

    lr_scaler = hvd.size()

    criterion = nn.CrossEntropyLoss().cuda()
    optimizer = torch.optim.SGD(model.parameters(), 1e-4 * lr_scaler)

    optimizer = hvd.DistributedOptimizer(optimizer,
                                         named_parameters=model.named_parameters())
    train_dataset = dataset_from_datadir(args.datadir, verbose)
    train_sampler = DistributedSampler(train_dataset,
                                       num_replicas=hvd.size(),
                                       rank=hvd.rank())
    train_loader = DataLoader(dataset=train_dataset, batch_size=args.batchsize,
                              shuffle=False, num_workers=args.workers,
                              pin_memory=False, sampler=train_sampler,
                              multiprocessing_context='forkserver')

    hvd.broadcast_parameters(model.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(optimizer, root_rank=0)

    total_step = args.steps if args.steps is not None else len(train_loader)

    # For each block of printed steps
    last_start = datetime.now()
    last_images = 0

    # For final average
    avg_images = 0
    avg_start = None
    tot_steps = 0

    for epoch in range(args.epochs):
        for i, (images, labels) in enumerate(train_loader):
            images = images.cuda(non_blocking=True)
            labels = labels.cuda(non_blocking=True)

            outputs = model(images)
            loss = criterion(outputs, labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            li = len(images)
            last_images += li

            tot_steps += 1
            if tot_steps == args.warmup_steps:
                avg_start = datetime.now()
            elif tot_steps > args.warmup_steps:
                avg_images += li

            if (i + 1) % args.print_steps == 0 and verbose:
                now = datetime.now()
                last_secs = (now-last_start).total_seconds()

                print(f'Epoch [{epoch+1}/{args.epochs}], Step [{i+1}/{total_step}], '
                      f'Loss: {loss.item():.4f}, '
                      f'Images/sec: {last_images*world_size/last_secs:.2f} '
                      f'(last {args.print_steps} steps)')

                last_start = now
                last_images = 0

            if args.steps is not None and i >= args.steps:
                break
    if verbose:
        dur = datetime.now() - avg_start
        print(f"Training completed in: {dur}")
        print(f"Images/sec: {avg_images*world_size/dur.total_seconds():.2f} "
              f"(average, skipping {args.warmup_steps} warmup steps)")
Example #27
    def hvd_param_scaling(self):
        if hvd.nccl_built():
            self.batch_size = int(self.batch_size / hvd.local_size())
            self.iters_per_epoch = int(self.max_iterations / self.epochs /
                                       hvd.local_size())
Example #28
def get_local_size():
    global _USE_HVD
    if _USE_HVD:
        return hvd.local_size()
    return comm.get_local_size()
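
Example #28 picks between Horovod and a local comm helper for the per-node process count. A hedged variant that also covers the single-process case (the comm object here mirrors the example and is an assumption, not a Horovod API):

def get_local_size_or_default(use_hvd: bool, comm=None) -> int:
    if use_hvd:
        import horovod.torch as hvd
        return hvd.local_size()
    if comm is not None:
        return comm.get_local_size()
    return 1   # fall back to a single local process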
Example #29
def main():
    global args, best_prec1, best_prec5
    args = parser.parse_args()
    args.cuda = not args.no_cuda and torch.cuda.is_available()

    #horovod initialize
    hvd.init()

    log = None

    if hvd.rank() == 0:
        log = SummaryWriter(log_dir=args.log_dir)
        print('The Training Model is %s' % args.arch)
    # Check the save_dir exists or not
    if not os.path.exists(args.save_dir):
        os.makedirs(args.save_dir)

    if args.cuda:
        torch.cuda.set_device(hvd.local_rank())

    normalize = transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))

    # Horovod: limit # of CPU threads to be used per worker.
    torch.set_num_threads(1)

    kwargs = {'num_workers': 1, 'pin_memory': True} if args.cuda else {}
    # When supported, use 'forkserver' to spawn dataloader workers instead of 'fork' to prevent
    # issues with Infiniband implementations that are not fork-safe
    if (kwargs.get('num_workers', 0) > 0 and hasattr(mp, '_supports_context') and
            mp._supports_context and 'forkserver' in mp.get_all_start_methods()):
        kwargs['multiprocessing_context'] = 'forkserver'

    train_dataset = datasets.CIFAR10('data-%d'%hvd.local_rank(), train=True, transform=transforms.Compose([
                transforms.RandomCrop(32, padding=4),
                transforms.RandomHorizontalFlip(),
                transforms.ToTensor(),
                normalize,
            ]), download=True)

    val_dataset = datasets.CIFAR10('data-%d'%hvd.local_rank(), train=False, transform=transforms.Compose([
            transforms.ToTensor(),
            normalize,
            ]))

    # Horovod: partition the training data
    train_sampler = torch.utils.data.distributed.DistributedSampler(
        train_dataset, num_replicas=hvd.size(), rank=hvd.rank())

    val_sampler = torch.utils.data.distributed.DistributedSampler(
        val_dataset, num_replicas=hvd.size(), rank=hvd.rank())

    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.batch_size, sampler=train_sampler, **kwargs)

    val_loader = torch.utils.data.DataLoader(
        val_dataset, batch_size=args.batch_size, sampler=val_sampler, **kwargs)

    # model = torch.nn.DataParallel(resnet.__dict__[args.arch]())
    if args.arch in resnet.__dict__:
        model = resnet.__dict__[args.arch]()
    elif args.arch == 'alexnet':
        model = models.AlexNet()
    elif args.arch == 'vgg16':
        model = models.VGG16()
    else:
        raise ValueError('unknown architecture: {}'.format(args.arch))


    if hvd.rank() == 0:
        numel = sum(p.numel() for p in model.parameters())
        print('Total params: {:d}'.format(numel))

    lr_scaler = hvd.size()

    if args.cuda:
        model.cuda()
        if args.use_adasum and hvd.nccl_built():
            lr_scaler = hvd.local_size()


    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_prec1 = checkpoint['best_prec1']
            model.load_state_dict(checkpoint['state_dict'])
            print("=> loaded checkpoint '{}' (epoch {})"
                  .format(args.evaluate, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda()

    if args.half:
        model.half()
        criterion.half()

    base_optimizer = torch.optim.SGD(model.parameters(), args.lr * lr_scaler,
                                     momentum=args.momentum,
                                     weight_decay=args.weight_decay)


    # lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(base_optimizer,
    #                                 milestones=[100, 150], last_epoch=args.start_epoch - 1)

    # Horovod: broadcast parameters & optimizer state.
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(base_optimizer, root_rank=0)

    #Compression
    # compression = Allgather(MGCCompressor(0.05), ResidualMemory(), hvd.size())
    # compression = Allgather(TernGradCompressor(), ResidualMemory(), hvd.size())
    compression = Allreduce(NoneCompressor(), NoneMemory())
    # compression = Allgather(DgcCompressor(0.01), ResidualMemory(), hvd.size())
    # compression = Allgather(LowQSGDCompressor(), ResidualMemory(), hvd.size())

    # Horovod: wrap optimizer with DistributedOptimizer.
    optimizer = hvd.DistributedOptimizer(base_optimizer, compression, named_parameters=model.named_parameters())

    if hvd.rank() == 0:
        log.add_scalar('train/accuracy', 0., 0)
        log.add_scalar('test/accuracy', 0., 0)

    for epoch in range(args.start_epoch + 1, args.epochs + 1):

        adjust_learning_rate(optimizer, epoch, size=lr_scaler)

        if hvd.rank() == 0:
            print('current lr {:.5e}'.format(optimizer.param_groups[0]['lr']))

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch, log=log)

        # evaluate on validation set
        prec1, prec5 = validate(val_loader, model, criterion, epoch, log=log)

        # remember best prec@1 and save checkpoint
        best_prec1 = max(prec1, best_prec1)
        best_prec5 = max(prec5, best_prec5)

        if hvd.rank() == 0:
            print('Best Prec@1:{:.2f}%, Prec@5:{:.2f}%\n'.format(best_prec1, best_prec5))

        # if epoch > 0 and epoch % args.save_every == 0:
        #     save_checkpoint({
        #         'epoch': epoch + 1,
        #         'state_dict': model.state_dict(),
        #         'best_prec1': best_prec1,
        #     }, is_best, filename=os.path.join(args.save_dir, 'checkpoint.th'))
        #
        # save_checkpoint({
        #     'state_dict': model.state_dict(),
        #     'best_prec1': best_prec1,
        # }, is_best, filename=os.path.join(args.save_dir, 'model.th'))

    if hvd.rank() == 0:
        log.close()
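
validate() above returns per-worker top-1/top-5 accuracy; whether those values are averaged across ranks is not shown. A minimal sketch of the common Horovod pattern for averaging such metrics, with metric_average as an illustrative helper name:

import torch
import horovod.torch as hvd

def metric_average(value, name):
    # hvd.allreduce averages across all workers by default.
    tensor = torch.tensor(float(value))
    return hvd.allreduce(tensor, name=name).item()

# e.g. prec1 = metric_average(prec1, 'val_prec1')
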
Exemplo n.º 30
0
def main():
    ''' simple starter program that can be copied for use when starting a new script. '''

    parser = argparse.ArgumentParser(description='')
    parser.add_argument('-c',
                        '--config_file',
                        help='configuration file in json format',
                        required=True)
    parser.add_argument(
        '--num_files',
        '-n',
        default=-1,
        type=int,
        help='limit the number of files to process. default is all')
    parser.add_argument(
        '--model_save',
        help='base name of saved model parameters for later loading')
    parser.add_argument('--nsave',
                        default=100,
                        type=int,
                        help='frequency in batch number to save model')

    parser.add_argument(
        '--nval',
        default=100,
        type=int,
        help='frequency to evaluate validation sample in batch numbers')
    parser.add_argument('--nval_tests',
                        default=-1,
                        type=int,
                        help='number batches to test per validation run')

    parser.add_argument('--status',
                        default=20,
                        type=int,
                        help='frequency to print loss status in batch numbers')

    parser.add_argument('--batch',
                        default=-1,
                        type=int,
                        help='set batch size, overrides file config')

    parser.add_argument('--random_seed',
                        default=0,
                        type=int,
                        help='numpy random seed')

    parser.add_argument(
        '--valid_only',
        default=False,
        action='store_true',
        help='flag that triggers validation run. prints confusion matrix.')

    parser.add_argument(
        '--batch_limiter',
        help=
        'if set to an integer, will limit the number of batches during training. Use this to create short training runs for profiling.',
        type=int)

    parser.add_argument(
        '-i',
        '--input_model_pars',
        help=
        'if provided, the file will be used to fill the models state dict from a previous run.'
    )
    parser.add_argument('-e',
                        '--epochs',
                        type=int,
                        default=-1,
                        help='number of epochs')
    parser.add_argument('-l',
                        '--logdir',
                        help='log directory for tensorboardx')

    parser.add_argument('--horovod',
                        default=False,
                        action='store_true',
                        help="Setup for distributed training")

    parser.add_argument('--cpu-only',
                        default=False,
                        action='store_true',
                        help='set to force CPU only running')

    parser.add_argument('--debug',
                        dest='debug',
                        default=False,
                        action='store_true',
                        help="Set Logger to DEBUG")
    parser.add_argument('--error',
                        dest='error',
                        default=False,
                        action='store_true',
                        help="Set Logger to ERROR")
    parser.add_argument('--warning',
                        dest='warning',
                        default=False,
                        action='store_true',
                        help="Set Logger to WARNING")
    parser.add_argument('--logfilename',
                        dest='logfilename',
                        default=None,
                        help='if set, logging information will go to file')
    args = parser.parse_args()

    logging_format = '%(asctime)s %(levelname)s:%(name)s:%(process)s:%(thread)s:%(message)s'
    logging_datefmt = '%Y-%m-%d %H:%M:%S'
    log_level = logging.INFO

    if args.debug and not args.error and not args.warning:
        log_level = logging.DEBUG
    elif not args.debug and args.error and not args.warning:
        log_level = logging.ERROR
    elif not args.debug and not args.error and args.warning:
        log_level = logging.WARNING

    rank = 0
    nranks = 1
    local_rank = 0
    local_size = 1
    hvd = None
    if args.horovod:
        print('importing horovod')
        import horovod.torch as hvd
        print('imported horovod')
        hvd.init()
        rank = hvd.rank()
        nranks = hvd.size()
        local_rank = hvd.local_rank()
        local_size = hvd.local_size()
        logging_format = '%(asctime)s %(levelname)s:' + '{:05d}'.format(
            rank) + ':%(name)s:%(process)s:%(thread)s:%(message)s'

    if rank > 0 and log_level == logging.INFO:
        log_level = logging.WARNING

    logging.basicConfig(level=log_level,
                        format=logging_format,
                        datefmt=logging_datefmt,
                        filename=args.logfilename)

    device = torch.device('cpu')
    if torch.cuda.is_available() and not args.cpu_only:
        device = torch.device('cuda:%d' % local_rank)
        torch.cuda.set_device(device)

    model_save = args.model_save
    if model_save is None:
        model_save = os.path.join(args.logdir, 'model')

    logger.warning('rank %6s of %6s    local rank %6s of %6s', rank, nranks,
                   local_rank, local_size)
    logger.info('hostname:           %s', socket.gethostname())
    logger.info('python version:     %s', sys.version)
    logger.info('num_threads:        %s', torch.get_num_threads())
    logger.info('torch version:      %s', torch.__version__)
    logger.info('torch file:         %s', torch.__file__)

    logger.info('config file:        %s', args.config_file)
    logger.info('num files:          %s', args.num_files)
    logger.info('model_save:         %s', model_save)
    logger.info('random_seed:        %s', args.random_seed)
    logger.info('valid_only:         %s', args.valid_only)
    logger.info('nsave:              %s', args.nsave)
    logger.info('nval:               %s', args.nval)
    logger.info('nval_tests:         %s', args.nval_tests)
    logger.info('status:             %s', args.status)
    logger.info('input_model_pars:   %s', args.input_model_pars)
    logger.info('epochs:             %s', args.epochs)
    logger.info('horovod:            %s', args.horovod)
    logger.info('cpu_only:           %s', args.cpu_only)
    logger.info('logdir:             %s', args.logdir)

    np.random.seed(args.random_seed)

    config_file = json.load(open(args.config_file))
    config_file['rank'] = rank
    config_file['nranks'] = nranks
    config_file['input_model_pars'] = args.input_model_pars
    config_file['horovod'] = args.horovod
    config_file['status'] = args.status
    config_file['nval'] = args.nval
    config_file['nval_tests'] = args.nval_tests
    config_file['nsave'] = args.nsave
    config_file['model_save'] = model_save
    config_file['valid_only'] = args.valid_only
    config_file['batch_limiter'] = args.batch_limiter
    config_file['cpu_only'] = args.cpu_only

    if args.valid_only and not args.input_model_pars:
        logger.error('if valid_only set, must provide input model')
        return

    if args.batch > 0:
        logger.info('setting batch size from command line: %s', args.batch)
        config_file['training']['batch_size'] = args.batch
    if args.epochs > 0:
        logger.info('setting epochs from command line: %s', args.epochs)
        config_file['training']['epochs'] = args.epochs

    logger.info('configuration = \n%s',
                json.dumps(config_file, indent=4, sort_keys=True))
    config_file['hvd'] = hvd

    # get datasets for training and validation
    trainds, testds = data_handler.get_datasets(config_file)

    # setup tensorboard
    writer = None
    if args.logdir and rank == 0:
        if not os.path.exists(args.logdir):
            os.makedirs(args.logdir)
        writer = tensorboardX.SummaryWriter(args.logdir)

    logger.info('building model')
    torch.manual_seed(args.random_seed)

    net = model.get_model(config_file)

    logger.info('model = \n %s', net)

    total_params = sum(p.numel() for p in net.parameters())
    logger.info('trainable parameters: %s', total_params)

    if args.valid_only:
        # validation-only mode runs on the held-out dataset returned above
        valid_model(net, testds, config_file)
    else:
        train_model(net, trainds, testds, config_file, device, writer)
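
The script above keeps Horovod optional behind the --horovod flag and falls back to single-process defaults. The same pattern can be factored into a small helper; init_distributed is an illustrative name, not part of the script above.

def init_distributed(use_horovod):
    # Returns (hvd module or None, rank, nranks, local_rank, local_size).
    if not use_horovod:
        return None, 0, 1, 0, 1
    import horovod.torch as hvd
    hvd.init()
    return hvd, hvd.rank(), hvd.size(), hvd.local_rank(), hvd.local_size()
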