Example #1
    def step(self, closure=None, **kargs):
        # apply local gradient.
        with kargs["timer"]("grad.apply_grad", epoch=self.conf.epoch_):
            self._apply_gradient()

        # Get the current params and grads (and the grad shapes).
        with kargs["timer"]("grad.recover_hat_params", epoch=self.conf.epoch_):
            params, _ = get_data(self.param_groups, self.param_names, is_get_grad=False)
            grads, shapes = get_data(
                self.param_groups, self.param_names, is_get_grad=True
            )

        # compress.
        with kargs["timer"]("grad.compress", epoch=self.conf.epoch_):
            selected_values, selected_indices, n_bits = self._compress(grads)

        # sync.
        with kargs["timer"]("grad.sync", epoch=self.conf.epoch_):
            synced_message, message_size = self._sync(selected_values, selected_indices)

        # recover and update the neighbor hat params.
        with kargs["timer"]("grad.recover_info", epoch=self.conf.epoch_):
            updated_flatten_params = self._recover_info(
                flatten(params),
                synced_message,
                message_size,
                self.selected_shapes,
                shapes,
            )

        with kargs["timer"]("grad.update_model", epoch=self.conf.epoch_):
            # finally unflatten.
            unflatten(params, updated_flatten_params, shapes)
        return n_bits
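
All of the `step` variants in these examples expect a `timer` factory in `kargs`: calling `kargs["timer"]("label", epoch=...)` must return a context manager that records how long the block took. The actual tracker class is not shown here; a minimal sketch of a compatible object, with hypothetical names, could look like this:

import time
from contextlib import contextmanager


class Timer:
    """Hypothetical tracker compatible with the kargs["timer"](...) calls above."""

    def __init__(self):
        self.records = []

    @contextmanager
    def __call__(self, label, epoch=None):
        start = time.time()
        try:
            yield
        finally:
            # store (label, epoch, elapsed seconds) for later reporting.
            self.records.append((label, epoch, time.time() - start))


# e.g. optimizer.step(timer=Timer()) lets blocks such as
# `with kargs["timer"]("grad.sync", epoch=...)` record their wall-clock time.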
Example #2
    def step(self, closure=None, **kargs):
        # Apply the gradients with the weight decay and momentum.
        with kargs["timer"]("grad.apply_grad", epoch=self.conf.epoch_):
            utils.apply_gradient(
                self.param_groups, self.state, apply_grad_to_model=False
            )

        # get flattened params.
        with kargs["timer"]("grad.get_params", epoch=self.conf.epoch_):
            params, _ = comm.get_data(
                self.param_groups, self.param_names, is_get_grad=False
            )
            flatten_params = TensorBuffer(params)

            grads, _ = comm.get_data(
                self.param_groups, self.param_names, is_get_grad=True
            )
            flatten_grads = TensorBuffer(grads)

        with kargs["timer"]("grad.get_extrapolated_model", epoch=self.conf.epoch_):
            flatten_updated_params = deepcopy(flatten_params)

            # get weighted hat params.
            flatten_updated_params.buffer = sum(
                [
                    _hat_params.buffer * self.neighbors_info[_rank]
                    for _rank, _hat_params in self.neighbor_hat_params.items()
                ]
            )

        # get updated local model (flatten params).
        with kargs["timer"]("grad.unflatten_to_update", epoch=self.conf.epoch_):
            flatten_updated_params.buffer.add_(
                flatten_grads.buffer, alpha=-self.param_groups[0]["lr"]
            )
            flatten_updated_params.unpack(params)

            # get extrapolated model.
            flatten_updated_params.buffer = (
                (1 - 0.5 * self.conf.local_index) * flatten_params.buffer
                + 0.5 * self.conf.local_index * flatten_updated_params.buffer
            )

        # compress the model difference and sync.
        with kargs["timer"]("grad.compress", epoch=self.conf.epoch_):
            sync_buffer = {
                "original_shapes": self.shapes,
                "flatten_updated_params": flatten_updated_params,
            }
            self.compressor.compress(sync_buffer)

        with kargs["timer"]("grad.sync", epoch=self.conf.epoch_):
            self.compressor.sync(sync_buffer)

        with kargs["timer"]("grad.unflatten_to_update", epoch=self.conf.epoch_):
            self.compressor.uncompress(
                sync_buffer, self.neighbor_hat_params, self.conf.local_index
            )
        return sync_buffer["n_bits"]
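
The examples rely on a `TensorBuffer` helper that flattens a list of tensors into a single contiguous `buffer` and can `unpack` that buffer back into the original tensors. Its real implementation lives elsewhere in the codebase; a minimal sketch consistent with how it is used here (an assumption, not the actual class) is:

import torch


class TensorBuffer:
    """Minimal sketch: pack a list of tensors into one flat buffer."""

    def __init__(self, tensors):
        self._shapes = [t.shape for t in tensors]
        self._numels = [t.numel() for t in tensors]
        self.buffer = torch.cat([t.data.reshape(-1) for t in tensors])

    def __iter__(self):
        # iterate over views of the flat buffer, one per original tensor.
        offset = 0
        for shape, numel in zip(self._shapes, self._numels):
            yield self.buffer[offset:offset + numel].view(shape)
            offset += numel

    def unpack(self, tensors):
        # copy the (possibly updated) buffer back into the original tensors.
        for tensor, view in zip(tensors, self):
            tensor.data.copy_(view)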
Example #3
    def step(self, closure=None, **kargs):
        if self.conf.is_centralized:
            with kargs["timer"]("sync/get_data", epoch=self.conf.epoch_):
                # Get data.
                grads, _ = comm.get_data(self.param_groups,
                                         self.param_names,
                                         is_get_grad=True)
                flatten_grads = TensorBuffer(grads)

            with kargs["timer"]("sync/sync", epoch=self.conf.epoch_):
                # Aggregate the gradients.
                flatten_grads.buffer = self.world_aggregator._agg(
                    flatten_grads.buffer,
                    op="avg",
                    distributed=self.conf.distributed)

            with kargs["timer"]("sync/unflatten_grad", epoch=self.conf.epoch_):
                # unflatten grads.
                flatten_grads.unpack(grads)

            with kargs["timer"]("sync/apply_grad", epoch=self.conf.epoch_):
                utils.apply_gradient(self.param_groups,
                                     self.state,
                                     apply_grad_to_model=True)

            # Get n_bits to transmit.
            n_bits = get_n_bits(flatten_grads.buffer)
        else:
            with kargs["timer"]("sync/apply_grad", epoch=self.conf.epoch_):
                utils.apply_gradient(self.param_groups,
                                     self.state,
                                     apply_grad_to_model=True)

            with kargs["timer"]("sync/get_data", epoch=self.conf.epoch_):
                # first get and flatten all params.
                params, _ = comm.get_data(self.param_groups,
                                          self.param_names,
                                          is_get_grad=False)
                flatten_params = TensorBuffer(params)

            with kargs["timer"]("sync/sync", epoch=self.conf.epoch_):
                # prepare the sync.
                if self.conf.comm_device == "cpu":
                    flatten_params.buffer = flatten_params.buffer.cpu().detach()

                # then sync.
                flatten_params.buffer = self.decentralized_aggregator._agg(
                    flatten_params.buffer, op="weighted")

            with kargs["timer"]("sync/update_model", epoch=self.conf.epoch_):
                # finally unflatten.
                flatten_params.unpack(params)

            # Get n_bits to transmit.
            n_bits = get_n_bits(flatten_params.buffer)
        return n_bits
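
`get_n_bits` is used throughout to report the communication volume of one step. A plausible definition matching that usage (an assumption about the helper, not its verbatim source) is:

def get_n_bits(tensor):
    # transmitted bits = number of elements * bytes per element * 8.
    return 8 * tensor.nelement() * tensor.element_size()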
Example #4
    def step(self, closure=None, **kargs):
        # Apply the gradients with the weight decay and momentum.
        with kargs["timer"]("grad.apply_grad", epoch=self.conf.epoch_):
            utils.apply_gradient(
                self.param_groups, self.state, apply_grad_to_model=True
            )

        with kargs["timer"]("grad.get_params", epoch=self.conf.epoch_):
            params, _ = comm.get_data(
                self.param_groups, self.param_names, is_get_grad=False
            )
            params_tb = TensorBuffer(params)

        with kargs["timer"]("grad.error_compensate", epoch=self.conf.epoch_):
            self.memory.buffer += params_tb.buffer

        with kargs["timer"]("grad.compress", epoch=self.conf.epoch_):
            sync_buffer = {"original_shapes": self.shapes, "params_tb": self.memory}
            local_compressed_params_tb = self.compressor.compress(sync_buffer)

        with kargs["timer"]("grad.update_memory", epoch=self.conf.epoch_):
            self.memory.buffer = self.memory.buffer - local_compressed_params_tb.buffer

        with kargs["timer"]("grad.sync", epoch=self.conf.epoch_):
            self.compressor.sync(sync_buffer)

        # update local model.
        with kargs["timer"]("grad.decompress", epoch=self.conf.epoch_):
            aggregated_info_tb = self.compressor.uncompress(
                sync_buffer, self.neighbors_info
            )
            params_tb.buffer += aggregated_info_tb.buffer
            params_tb.unpack(params)
        return sync_buffer["n_bits"]
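
Examples #2, #4, #7, #8, and #11 all drive `self.compressor` through the same three calls (with the name of the last step varying between `uncompress` and `decompress`). The concrete compressors are defined elsewhere; the interface they appear to follow, sketched here with hypothetical names, is roughly:

class CompressorInterface:
    """Assumed interface of the compressor objects used in the steps above."""

    def compress(self, sync_buffer_or_tb):
        # quantize/sparsify the local tensors and stage them (plus metadata
        # such as selected indices and "n_bits") for communication.
        raise NotImplementedError

    def sync(self, sync_buffer):
        # exchange the compressed payload with the neighbors / the whole world.
        raise NotImplementedError

    def uncompress(self, sync_buffer, *args):
        # reconstruct dense tensors from the received payload, e.g. by updating
        # neighbor_hat_params or returning an aggregated TensorBuffer.
        raise NotImplementedError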
Example #5
    def step(self, closure=None, **kargs):
        with kargs['timer']('sync', epoch=self.conf.epoch_):
            # do the local update steps.
            with kargs["timer"]("local_update", epoch=self.conf.epoch_):
                utils.apply_gradient(self.param_groups,
                                     self.state,
                                     apply_grad_to_model=True)

            # enter the global sync if it satisfies the condition.
            if (self.conf.epoch_ < self.turn_on_local_step_from_epoch
                    or self.conf.local_index % self.local_step == 0):
                with kargs["timer"]("get_params", epoch=self.conf.epoch_):
                    # get params.
                    params, _ = comm.get_data(self.param_groups,
                                              self.param_names,
                                              is_get_grad=False)
                    params_tb = TensorBuffer(params)
                with kargs['timer']('memory_and_compress',
                                    epoch=self.conf.epoch_):
                    # get the params difference w.r.t. previous synced model.
                    local_scale, local_sign = [], []
                    for consensus_param, param, memory in zip(
                            self.consensus_params_tb, params_tb,
                            self.memory_tb):
                        memory.data.copy_(consensus_param - param + memory)
                        # compress.
                with kargs["timer"]("directions", epoch=self.conf.epoch_):
                    direction = exchange(self.memory_tb.buffer)  # signum
                with kargs['timer']('memory_and_compress',
                                    epoch=self.conf.epoch_):
                    for consensus_param, param, memory in zip(
                            self.consensus_params_tb, params_tb,
                            self.memory_tb):
                        _local_scale, _local_sign = scaled_sign(memory)
                        local_scale.append(_local_scale)
                        local_sign.append(_local_sign)
                        memory.data.copy_(memory - _local_scale * _local_sign)
                with kargs["timer"]("directions", epoch=self.conf.epoch_):
                    global_direction = TB(self.memory_tb, direction)
                with kargs["timer"]("magnitudes", epoch=self.conf.epoch_):
                    magnitudes_tb = TensorBuffer(local_scale)
                    magnitudes_tb.buffer = self.world_aggregator._agg(
                        magnitudes_tb.buffer,
                        "avg",
                        distributed=self.conf.distributed)
                # unpack the synced info and update the consensus params.
                with kargs["timer"]("update_consensus",
                                    epoch=self.conf.epoch_):
                    for update_magnitude, update_direction, consensus_param in zip(
                            magnitudes_tb, global_direction,
                            self.consensus_params_tb):
                        consensus_param.add_(
                            update_direction.mul(update_magnitude), alpha=-1.0)

                # keep the local models consistent by assigning the consensus params.
                self.consensus_params_tb.unpack(params)
                n_bits = get_n_bits(magnitudes_tb.buffer)
            else:
                n_bits = 0
            return n_bits
Example #6
    def step(self, closure=None, **kargs):
        with kargs["timer"]("sync.local_update", epoch=self.conf.epoch_):
            utils.apply_gradient(self.param_groups,
                                 self.state,
                                 apply_grad_to_model=True)

        with kargs["timer"]("sync.sync_and_update", epoch=self.conf.epoch_):
            # enter the global sync if it satisfies the condition.
            if (self.conf.epoch_ < self.turn_on_local_step_from_epoch
                    or self.conf.local_index % self.local_step == 0):
                # get params.
                params, _ = comm.get_data(self.param_groups,
                                          self.param_names,
                                          is_get_grad=False)
                params_tb = TensorBuffer(params)

                # get params_diff.
                param_diff = self.consensus_params_tb.buffer - params_tb.buffer
                # sync the directions.
                param_diff = self.world_aggregator._agg(
                    param_diff, "avg", distributed=self.conf.distributed)

                # unpack the synced info and update the consensus params.
                self.consensus_params_tb.buffer.add_(param_diff, alpha=-1.0)

                # keep the local models consistent by assigning the consensus params.
                self.consensus_params_tb.unpack(params)

                # Get n_bits to transmit.
                n_bits = get_n_bits(param_diff)
            else:
                n_bits = 0
        return n_bits
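
Note that the update in Example #6 is plain averaging written in consensus form: subtracting the averaged `consensus - param` difference from `consensus` leaves exactly the average of the local params. A tiny numeric check (illustration only, with the aggregation done locally instead of via world_aggregator._agg(..., "avg")):

import torch

consensus = torch.tensor([1.0, 1.0])
local_params = [torch.tensor([0.0, 2.0]), torch.tensor([4.0, 0.0])]

# average of the per-worker differences, as _agg(..., "avg") would produce.
param_diff = sum(consensus - p for p in local_params) / len(local_params)
consensus = consensus - param_diff

assert torch.allclose(consensus, sum(local_params) / len(local_params))  # [2.0, 1.0]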
Example #7
    def step(self, closure=None, **kargs):
        # Apply the gradients with the weight decay and momentum.
        with kargs["timer"]("grad.apply_grad", epoch=self.conf.epoch_):
            utils.apply_gradient(self.param_groups,
                                 self.state,
                                 apply_grad_to_model=False)

        with kargs["timer"]("grad.get_grads", epoch=self.conf.epoch_):
            params, _ = comm.get_data(self.param_groups,
                                      self.param_names,
                                      is_get_grad=False)
            flatten_params = TensorBuffer(params)

            grads, _ = comm.get_data(self.param_groups,
                                     self.param_names,
                                     is_get_grad=True)
            flatten_grads = TensorBuffer(grads)

        # Get weighted hat params and apply the local gradient.
        with kargs["timer"]("grad.apply_local_gradient",
                            epoch=self.conf.epoch_):
            flatten_half_params = deepcopy(flatten_params)
            flatten_half_params.buffer = (sum([
                _hat_params.buffer * self.neighbors_info[_rank]
                for _rank, _hat_params in self.neighbor_hat_params.items()
            ]) - self.param_groups[0]["lr"] * flatten_grads.buffer)

        # compress the model difference and sync.
        with kargs["timer"]("grad.compress", epoch=self.conf.epoch_):
            sync_buffer = {
                "original_shapes": self.shapes,
                "flatten_half_params": flatten_half_params,
                "flatten_params": flatten_params,
            }
            self.compressor.compress(sync_buffer)

        with kargs["timer"]("grad.sync", epoch=self.conf.epoch_):
            self.compressor.sync(sync_buffer)

        # finally unflatten and update local model.
        with kargs["timer"]("grad.unflatten_to_update",
                            epoch=self.conf.epoch_):
            self.compressor.uncompress(sync_buffer, self.neighbor_hat_params)
            flatten_params.buffer = self.neighbor_hat_params[
                self.rank].buffer.clone()
            flatten_params.unpack(params)
        return sync_buffer["n_bits"]
Example #8
    def step(self, closure=None, **kargs):
        with kargs["timer"]("sync.apply_grad", epoch=self.conf.epoch_):
            utils.apply_gradient(self.param_groups,
                                 self.state,
                                 apply_grad_to_model=False)

        with kargs["timer"]("sync.get_data", epoch=self.conf.epoch_):
            # Get data.
            grads, _ = comm.get_data(self.param_groups,
                                     self.param_names,
                                     is_get_grad=True)
            grads_tb = TensorBuffer(grads)

        with kargs["timer"]("sync.use_memory", epoch=self.conf.epoch_):
            # use memory.
            grads_tb.buffer.add_(self.memory_tb.buffer)

        with kargs["timer"]("sync.compress", epoch=self.conf.epoch_):
            # compress.
            sync_buffer = self.compressor.compress(grads_tb)

        with kargs["timer"]("sync.sync", epoch=self.conf.epoch_):
            self.compressor.sync(sync_buffer)

        with kargs["timer"]("sync.update_memory", epoch=self.conf.epoch_):
            # update memory.
            self.memory_tb.buffer = (grads_tb.buffer -
                                     sync_buffer["synced_grads_tb"].buffer)

        with kargs["timer"]("sync.decompress", epoch=self.conf.epoch_):
            sync_grads_tb = self.compressor.decompress(sync_buffer)

        with kargs["timer"]("sync.apply_grad", epoch=self.conf.epoch_):
            # get the params and apply the synced gradient.
            params, _ = comm.get_data(self.param_groups,
                                      self.param_names,
                                      is_get_grad=False)
            params_tb = TensorBuffer(params)

            # apply the gradient.
            params_tb.buffer.add_(-self.param_groups[0]["lr"] *
                                  sync_grads_tb.buffer)

            # unpack.
            params_tb.unpack(params)
        return sync_buffer["n_bits"]
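
Example #8 is the classic error-feedback pattern: the memory stores whatever the compressor dropped this round and re-injects it into the next round's gradient. A toy illustration with a hypothetical top-1 compressor standing in for the real one:

import torch


def top1(x):
    # keep only the largest-magnitude entry (stand-in for the real compressor).
    out = torch.zeros_like(x)
    idx = x.abs().argmax()
    out[idx] = x[idx]
    return out


memory = torch.zeros(3)
for grad in [torch.tensor([0.5, 0.1, 0.2]), torch.tensor([0.1, 0.4, 0.2])]:
    corrected = grad + memory   # grads_tb.buffer.add_(self.memory_tb.buffer)
    sent = top1(corrected)      # compress + sync
    memory = corrected - sent   # update memory with the compression error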
Example #9
    def init_neighbor_hat_params(self):
        params, self.shapes = comm.get_data(self.param_groups,
                                            self.param_names,
                                            is_get_grad=False)
        flatten_params = TensorBuffer(params)

        # init the neighbor_params.
        self.neighbor_hat_params = dict()
        for rank, _ in self.neighbors_info.items():
            self.neighbor_hat_params[rank] = deepcopy(flatten_params)
Example #10
    def init_neighbor_hat_params(self):
        params, self.shapes = comm.get_data(self.param_groups,
                                            self.param_names,
                                            is_get_grad=False)
        flatten_params = TensorBuffer(params)
        flatten_params.buffer = torch.zeros_like(flatten_params.buffer)

        # init the neighbor_params.
        self.neighbor_hat_params = {
            self.rank: deepcopy(flatten_params),
            "memory": deepcopy(flatten_params),
        }
Example #11
    def step(self, closure=None, **kargs):
        # do the local update steps.
        with kargs["timer"]("sync/get_data", epoch=self.conf.epoch_):
            # get params.
            params, _ = comm.get_data(self.param_groups,
                                      self.param_names,
                                      is_get_grad=False)
            params_tb = TensorBuffer(params)

        with kargs["timer"]("sync/apply_grad", epoch=self.conf.epoch_):
            # prepare the gradient (sign)
            utils.apply_gradient(self.param_groups,
                                 self.state,
                                 apply_grad_to_model=False)
            # get grads.
            grads, _ = comm.get_data(self.param_groups,
                                     self.param_names,
                                     is_get_grad=True)
            grads_tb = TensorBuffer(grads)

        # enter the global sync if it satisfies the condition.
        # get the params difference w.r.t. previous synced model.
        with kargs["timer"]("sync/compress", epoch=self.conf.epoch_):
            sync_buffer = self.compressor.compress(grads_tb)

        # sync and decompress.
        with kargs["timer"]("sync/sync_and_decompress",
                            epoch=self.conf.epoch_):
            self.compressor.sync(sync_buffer)
            synced_updates_tb = self.compressor.decompress(sync_buffer)

        # unpack the synced info and update the consensus params.
        with kargs["timer"]("sync/apply_grad", epoch=self.conf.epoch_):
            params_tb.buffer -= self.param_groups[0][
                "lr"] * synced_updates_tb.buffer
            params_tb.unpack(params)
        return sync_buffer["n_bits"]
Example #12
        def _init_neighbor_hat_params(conf, param_groups, param_names):
            params, params_shapes = comm.get_data(param_groups,
                                                  param_names,
                                                  is_get_grad=False)
            flatten_params = TensorBuffer(params)
            flatten_params.buffer = torch.zeros_like(flatten_params.buffer)

            # init the neighbor_params.
            return (
                {
                    conf.graph.rank: deepcopy(flatten_params),
                    "memory": deepcopy(flatten_params),
                },
                params_shapes,
            )
Example #13
def recover_params(param_groups,
                   param_names,
                   rank=None,
                   neighbor_hat_params=None,
                   get_hat_params=True):
    # get flattened params.
    params, _ = comm.get_data(param_groups, param_names, is_get_grad=False)
    flatten_params = TensorBuffer(params)

    if get_hat_params:
        assert neighbor_hat_params is not None and rank is not None
        # recover the hat_params.
        flatten_hat_params = TensorBuffer(params)
        flatten_hat_params.buffer.data[:] = neighbor_hat_params[rank].buffer
        return params, flatten_params, flatten_hat_params
    else:
        return params, flatten_params
Example #14
    def init_memory(self):
        params, self.shapes = comm.get_data(
            self.param_groups, self.param_names, is_get_grad=False
        )
        self.memory_tb = TensorBuffer(params)
        self.memory_tb.buffer = torch.zeros_like(self.memory_tb.buffer)
Example #15
    def step(self, closure=None, **kargs):
        # do the local update steps.
        with kargs["timer"]("sync.local_update", epoch=self.conf.epoch_):
            for group in self.param_groups:
                weight_decay = group["weight_decay"]
                momentum = group["momentum"]
                dampening = group["dampening"]
                nesterov = group["nesterov"]

                for p in group["params"]:
                    # get param_state
                    param_state = self.state[p]

                    # get the gradient
                    if p.grad is None:
                        continue
                    d_p = p.grad.data

                    # add the weight decay.
                    if weight_decay != 0:
                        d_p.add_(p.data, alpha=weight_decay)
                    # apply the momentum.
                    if momentum != 0:
                        if "momentum_buffer" not in param_state:
                            buf = param_state["momentum_buffer"] = torch.zeros_like(
                                p.data
                            )
                            buf.mul_(momentum).add_(d_p)
                        else:
                            buf = param_state["momentum_buffer"]
                            buf.mul_(momentum).add_(d_p, alpha=1 - dampening)
                        if nesterov:
                            d_p = d_p.add(buf, alpha=momentum)
                        else:
                            d_p = buf

                    # get the local sign and apply to the local model.
                    p.data.add_(torch.sign(d_p), alpha=-group["lr"])

        # enter the global sync if it satisfies the condition.
        if (
            self.conf.epoch_ < self.turn_on_local_step_from_epoch
            or self.conf.local_index % self.local_step == 0
        ):
            with kargs["timer"]("sync.get_params", epoch=self.conf.epoch_):
                # get params.
                params, _ = comm.get_data(
                    self.param_groups, self.param_names, is_get_grad=False
                )
                params_tb = TensorBuffer(params)

            # get the params difference w.r.t. previous synced model.
            local_scale, local_sign = [], []
            for consensus_param, param in zip(self.consensus_params_tb, params_tb):
                _local_scale, _local_sign = scaled_sign(consensus_param - param)
                local_scale.append(_local_scale)
                local_sign.append(_local_sign)

            # concat the update magnitude and directions.
            magnitudes_tb = TensorBuffer(local_scale)
            directions_tb = TensorBuffer(local_sign)

            # sync and decompress.
            with kargs["timer"]("sync.sync_and_decompress", epoch=self.conf.epoch_):
                # sync the directions.
                directions_tb.buffer = self.world_aggregator._agg(
                    directions_tb.buffer, "avg", distributed=self.conf.distributed
                )
                magnitudes_tb.buffer = self.world_aggregator._agg(
                    magnitudes_tb.buffer, "avg", distributed=self.conf.distributed
                )

            # unpack the synced info and update the consensus params.
            with kargs["timer"]("sync.update_consensus", epoch=self.conf.epoch_):
                for update_magnitude, update_direction, consensus_param in zip(
                    magnitudes_tb, directions_tb, self.consensus_params_tb
                ):
                    consensus_param.add_(update_direction.mul(update_magnitude), alpha=-1.0)

            # keep the local models consistent by assigning the consensus params.
            self.consensus_params_tb.unpack(params)
            n_bits = get_n_bits(directions_tb.buffer) + get_n_bits(magnitudes_tb.buffer)
        else:
            n_bits = 0
        return n_bits
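
`scaled_sign` in Examples #5 and #15 decomposes a tensor into a scalar magnitude and a sign tensor so that `scale * sign` approximates the original tensor. A common choice that is consistent with how the return values are used (an assumption, not the verbatim helper):

import torch


def scaled_sign(x):
    # scale is the mean absolute value, so scale * sign(x)
    # roughly preserves the l1 norm of x.
    scale = x.norm(p=1) / x.nelement()
    return scale, torch.sign(x)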
Example #16
    def _init_consensus(self):
        params, _ = comm.get_data(
            self.param_groups, self.param_names, is_get_grad=False
        )
        self.consensus_params_tb = deepcopy(TensorBuffer(params))