Example #1
    def __init__(self, model, byteps_opt, num_steps=10**6):
        """Construct a new ScheduledOptimizer, which uses byteps optimizer under the hood for averaging gradients
         across all workers.
        Args:
            model: The training model. ByteScheduler uses the model object to register hooks.
            byteps_opt: Optimizer to use for averaging gradients and applying updates.
            num_steps: The maximum number of training steps. ByteScheduler needs to know when to stop cross-iteration
            scheduling.
        """
        self._model = model
        self._opt = byteps_opt
        self._logger = logging.getLogger("ByteScheduler")
        self._logger.debug("byteps size {}, rank {}".format(size(), rank()))
        self._desc = "rank {}".format(rank())

        # Track training steps
        self._step = 0
        self._final_step = num_steps

        # Use a per-parameter lock to block the forward propagation of each parameter.
        self._locks = {}
        for param_group in self.param_groups:
            for p in param_group['params']:
                self._locks[p] = threading.Lock()

        if size() > 1:
            self._register_forward_hooks()
            self._register_hooks()

        # Poll whether the tensor push-pull is finished.
        self._event_queue = queue.Queue()
        self._poller = threading.Thread(target=self._poll, args=())
        self._poller.start()
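
For context, a minimal usage sketch of the constructor above. The `ScheduledOptimizer` name comes from the docstring; the `byteps.torch` import path, `bps.init()`, and the `bps.DistributedOptimizer(...)` call are assumptions about the surrounding setup rather than confirmed usage:

    import torch
    import byteps.torch as bps

    bps.init()
    model = torch.nn.Linear(10, 1)
    base_opt = torch.optim.SGD(model.parameters(), lr=0.01)
    # Hypothetical: let BytePS average gradients, then wrap the BytePS optimizer
    # so ByteScheduler can schedule the underlying push-pull operations.
    byteps_opt = bps.DistributedOptimizer(base_opt, named_parameters=model.named_parameters())
    optimizer = ScheduledOptimizer(model, byteps_opt, num_steps=10000)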
Example #2
    def step(self, closure=None):
        """Override the default step function."""
        self._logger.debug("{} calls step() {}".format(self._desc, self._step))

        # Step 0 is called for parameter initialization after parameter broadcast
        if size() > 1 and self._step > 0:
            self._synchronize()
            # if it is the final training step, wait for the completion of all tensors
            if self._step == self._final_step:
                self._logger.debug(
                    "final step {}, waiting for push-pull completion.".format(
                        self._final_step))
                while not self._event_queue.empty():
                    time.sleep(0.001)
                self._event_queue.put((None, None, None))
                self._poller.join()
                self._logger.info("training finished!")
            loss = None
            if closure is not None:
                loss = closure()
            self._step += 1
            return loss
        else:
            # Optimizer.step() will be triggered when the user calls byteps.broadcast_optimizer_state()
            super(self._opt.__class__, self._opt).step()
            self._step += 1
Example #3
    def zero_grad(self):
        """Override the default zero_grad function.

        Clears the gradients of all optimized tensors.
        """
        self._logger.debug("{} calls zero_grad() of step {}".format(self._desc, self._step))
        if size() > 1 and self._step > 0:
            return
        else:
            self._opt.zero_grad()
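
The overrides in Examples #2 and #3 are meant to be transparent to an ordinary PyTorch training loop. A hedged sketch, where `optimizer` is the wrapper constructed earlier and `model`, `loss_fn`, and `data_loader` are illustrative user-provided names:

    for inputs, targets in data_loader:
        optimizer.zero_grad()   # returns early after step 0 on multi-worker runs (Example #3)
        loss = loss_fn(model(inputs), targets)
        loss.backward()         # backward hooks registered in __init__ enqueue push-pull work
        optimizer.step()        # synchronizes the scheduled gradients before applying updates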
Example #4
    def step(self, closure=None, wait_for_finish=True):
        """Override the default step function."""
        if size() > 1:
            self._sync_missing_gradients()
            if wait_for_finish:
                self._wait_for_all()
            loss = None
            if closure is not None:
                loss = closure()
            return loss
        else:
            super(self.__class__, self).step()
Example #5
    def _try_to_synchronize(self, p):
        handle, ctx = self._handles[p]
        if poll(handle):
            # The asynchronous push-pull for this parameter has finished.
            output = synchronize(handle)
            self._push_pull_delay[p] = self.backward_passes_per_step
            # Look up the fp16 copy of the fp32 master parameter.
            if self._is_tensor_instance:
                fp16_p = self._fp32_to_fp16_map.get(p.__hash__())
            else:
                fp16_p = self._fp32_to_fp16_map.get(p)
            # Decompress the aggregated gradient into the fp16 gradient buffer,
            # copy it to the fp32 master gradient, then undo loss scaling and
            # average across workers.
            fp16_p.grad.set_(self._compression.decompress(output, ctx))
            p.grad.data.copy_(fp16_p.grad.data)
            p.grad.data = p.grad.data / (self.loss_scale * size())
            # Apply the update to the fp32 master weight and copy it back to fp16.
            self._step_one_param(p)
            fp16_p.data.copy_(p.data)
            self._handles.pop(p)
            return True
        else:
            return False
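
Example #4 also relies on a `_wait_for_all` helper that is not shown in these snippets. A hypothetical sketch of how such a wait loop could be built on top of `_try_to_synchronize`, assuming `self._handles` holds the outstanding push-pull handles and the `time` module is imported:

    def _wait_for_all(self):
        # Hypothetical: poll every outstanding handle until all gradients
        # have been synchronized and applied by _try_to_synchronize.
        while self._handles:
            for p in list(self._handles.keys()):
                self._try_to_synchronize(p)
            time.sleep(0.001)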
Example #6
    def __init__(self,
                 params,
                 named_parameters,
                 compression,
                 backward_passes_per_step=1):
        super(self.__class__, self).__init__(params)
        self._compression = compression

        if named_parameters is not None:
            named_parameters = list(named_parameters)
        else:
            named_parameters = []

        # make sure that named_parameters are tuples
        if any([not isinstance(p, tuple) for p in named_parameters]):
            raise ValueError('named_parameters should be a sequence of '
                             'tuples (name, parameter), usually produced by '
                             'model.named_parameters().')

        dups = _DistributedOptimizer.find_duplicates(
            [k for k, _ in named_parameters])
        if len(dups) > 0:
            raise ValueError(
                'Parameter names in named_parameters must be unique. '
                'Found duplicates: %s' % ', '.join(dups))

        if len(named_parameters) > 0:
            self._parameter_names = {v: k for k, v in sorted(named_parameters)}
        else:
            self._parameter_names = {
                v: 'push_pull.noname.%s' % i
                for param_group in self.param_groups
                for i, v in enumerate(param_group['params'])
            }
        self.backward_passes_per_step = backward_passes_per_step
        self._push_pull_delay = {
            v: self.backward_passes_per_step
            for _, v in sorted(named_parameters)
        }
        self._handles = {}
        self._grad_accs = []
        self._requires_update = set()
        if size() > 1:
            self._register_hooks()
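
This constructor is normally not invoked directly; in the usual BytePS pattern the public `DistributedOptimizer` factory builds a subclass that mixes this class into the user's optimizer class and forwards these arguments. A sketch under that assumption (the model and SGD settings are illustrative):

    import torch
    import byteps.torch as bps

    model = torch.nn.Linear(10, 1)
    opt = torch.optim.SGD(model.parameters(), lr=0.01)
    # The factory forwards named_parameters, compression and
    # backward_passes_per_step to the __init__ shown above.
    opt = bps.DistributedOptimizer(opt,
                                   named_parameters=model.named_parameters(),
                                   compression=bps.Compression.none,
                                   backward_passes_per_step=1)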
Example #7
    def __init__(self,
                 params,
                 named_parameters,
                 compression,
                 backward_passes_per_step=1):
        super(self.__class__, self).__init__(params)
        self._compression = compression

        if named_parameters is not None:
            named_parameters = list(named_parameters)
        else:
            named_parameters = []

        # make sure that named_parameters are tuples
        if any([not isinstance(p, tuple) for p in named_parameters]):
            raise ValueError('named_parameters should be a sequence of '
                             'tuples (name, parameter), usually produced by '
                             'model.named_parameters().')

        dups = _DistributedOptimizer.find_duplicates(
            [k for k, _ in named_parameters])
        if len(dups) > 0:
            raise ValueError(
                'Parameter names in named_parameters must be unique. '
                'Found duplicates: %s' % ', '.join(dups))

        if len(named_parameters) > 0:
            if isinstance(named_parameters[0][1], torch.Tensor):
                if any([
                        not isinstance(p, torch.Tensor)
                        for name, p in named_parameters
                ]):
                    raise ValueError(
                        'named_parameters should consistently be a sequence of '
                        'tuples (name, torch.Tensor)')
                self._is_tensor_instance = True
                # there is an issue when using torch.Tensor as key, so use its hash instead
                # https://github.com/pytorch/pytorch/issues/7733
                self._parameter_names = {
                    v.__hash__(): k
                    for k, v in sorted(named_parameters)
                }
                self._tensor_list = [
                    tensor for name, tensor in named_parameters
                ]
            else:
                self._is_tensor_instance = False
                self._parameter_names = {
                    v: k
                    for k, v in sorted(named_parameters)
                }
        else:
            self._is_tensor_instance = False
            self._parameter_names = {
                v: 'push_pull.noname.%s' % i
                for param_group in self.param_groups
                for i, v in enumerate(param_group['params'])
            }
        self.backward_passes_per_step = backward_passes_per_step
        self._push_pull_delay = {
            v: self.backward_passes_per_step
            for _, v in sorted(named_parameters)
        }
        self._handles = {}
        self._grad_accs = []
        self._requires_update = set()
        if size() > 1:
            self._register_hooks()

        # declare tensors
        for name in sorted(self._parameter_names.values()):
            declare("Gradient." + name)
        # We use two loops for load-balancing
        for name in sorted(self._parameter_names.values()):
            declare("Parameter." + name)
Example #8
    def __init__(self, module, device_ids=None,
            broadcast_buffers=True,
            compression=Compression.none
            ):
        super(DistributedDataParallel, self).__init__()

        assert device_ids and len(device_ids) == 1, (
                "DistributedDataParallel device_ids must contain exactly one entry,"
                " but got {}.").format(device_ids)
        self.device_ids = list(map(lambda x: _get_device_index(x, True), device_ids))
        self.module = module
        self.broadcast_buffers = broadcast_buffers
        self.require_forward_param_sync = broadcast_buffers
        self._handles = {}
        self._grad_accs = []
        self._requires_update = set()
        self._num_grads = 1

        self.modules_buffers = [list(self.module.buffers())]
        self._compression = compression
        self._enable_async = False
        self._require_backward_grad_sync = True
        named_parameters = self.module.named_parameters()
        named_parameters = list(named_parameters)
        if len(named_parameters) > 0:
            if isinstance(named_parameters[0][1], torch.Tensor):
                if any([not isinstance(p, torch.Tensor) for name, p in named_parameters]):
                    raise ValueError('named_parameters should consistently be a sequence of '
                                     'tuples (name, torch.Tensor)')
                self._is_tensor_instance = True
                # there is an issue when using torch.Tensor as key, so use its hash instead
                # https://github.com/pytorch/pytorch/issues/7733
                self._parameter_names = {v.__hash__(): k for k, v
                                         in sorted(named_parameters)}
                self._tensor_list = [tensor for name, tensor in named_parameters]
            else:
                self._is_tensor_instance = False
                self._parameter_names = {v: k for k, v
                                         in sorted(named_parameters)}
        else:
            self._is_tensor_instance = False
            self._parameter_names = {v: 'push_pull.noname.%s' % i
                                     for param_group in self.param_groups
                                     for i, v in enumerate(param_group['params'])}
        if size() > 1:
            self._register_hooks()
            named_params = self.module.named_parameters()
            self._num_grads = sum(p.requires_grad for _, p in named_params)
            byteps_torch_set_num_grads(self._num_grads)

        # declare tensors
        for name in sorted(self._parameter_names.values()):
            declare("Gradient."+name)
        # We use two loops for load-balancing
        for name in sorted(self._parameter_names.values()):
            declare("Parameter."+name)

        # broadcast model state
        module_states = list(self.module.state_dict().values())
        if len(module_states) > 0:
            bps.torch.broadcast_parameters(self.module.state_dict(), root_rank=0)
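
A hedged usage sketch of the module wrapper above, assuming one GPU per process as the assertion requires (the `DistributedDataParallel` name is taken from the constructor; everything else is illustrative):

    import torch
    import byteps.torch as bps

    bps.init()
    torch.cuda.set_device(bps.local_rank())
    model = torch.nn.Linear(10, 1).cuda()
    # Exactly one device id per process, as asserted in __init__ above.
    ddp_model = DistributedDataParallel(model, device_ids=[bps.local_rank()])

The second constructor below belongs to the distributed optimizer rather than the module wrapper; it shares the same parameter bookkeeping but additionally reads the BYTEPS_FUSION_THRESHOLD and BYTEPS_ENABLE_ASYNC environment variables to enable tensor fusion and asynchronous training.
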
    def __init__(self, params, named_parameters, compression,
                 backward_passes_per_step=1):
        super(self.__class__, self).__init__(params)
        self._compression = compression

        if named_parameters is not None:
            named_parameters = list(named_parameters)
        else:
            named_parameters = []
        self._sequential_keys = [k for k, v in named_parameters]
        self._named_parameters = {k: v for k, v
                                in named_parameters}

        self._tensor_fusion_threshold = int(os.environ.get('BYTEPS_FUSION_THRESHOLD', '0')) # in bytes
        self._enable_async = (int(os.getenv('BYTEPS_ENABLE_ASYNC', 0)) != 0)
        if self._enable_async:
            assert int(os.getenv('DMLC_NUM_WORKER')) > 1, \
                "Async is only valid for distributed training"
            print('BytePS: enable asynchronous training')

        # make sure that named_parameters are tuples
        if any([not isinstance(p, tuple) for p in named_parameters]):
            raise ValueError('named_parameters should be a sequence of '
                             'tuples (name, parameter), usually produced by '
                             'model.named_parameters().')

        dups = _DistributedOptimizer.find_duplicates([k for k, _ in named_parameters])
        if len(dups) > 0:
            raise ValueError('Parameter names in named_parameters must be unique. '
                             'Found duplicates: %s' % ', '.join(dups))

        if len(named_parameters) > 0:
            if isinstance(named_parameters[0][1], torch.Tensor):
                if any([not isinstance(p, torch.Tensor) for name, p in named_parameters]):
                    raise ValueError('named_parameters should consistently be a sequence of '
                                     'tuples (name, torch.Tensor)')
                self._is_tensor_instance = True
                # there is an issue when using torch.Tensor as key, so use its hash instead
                # https://github.com/pytorch/pytorch/issues/7733
                self._parameter_names = {v.__hash__(): k for k, v
                                         in sorted(named_parameters)}
                self._tensor_list = [tensor for name, tensor in named_parameters]
            else:
                self._is_tensor_instance = False
                self._parameter_names = {v: k for k, v
                                         in sorted(named_parameters)}
        else:
            self._is_tensor_instance = False
            self._parameter_names = {v: 'push_pull.noname.%s' % i
                                     for param_group in self.param_groups
                                     for i, v in enumerate(param_group['params'])}
        self.backward_passes_per_step = backward_passes_per_step
        self._push_pull_delay = {v: self.backward_passes_per_step
                                 for _, v in sorted(named_parameters)}
        self._handles = {}
        self._grad_accs = []
        self._requires_update = set()
        self._should_sync = True
        if self._tensor_fusion_threshold > 0:
            self._generate_merged_parameters()
        else:
            self._groups = []
        if size() > 1:
            self._register_hooks()

        # declare tensors
        for name in sorted(self._parameter_names.values()):
            declare("Gradient."+name)
        # We use two loops for load-balancing
        for name in sorted(self._parameter_names.values()):
            declare("Parameter."+name)