Example #1
    def step(self, closure=None, **kargs):
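        # A step of decentralized SGD with compressed parameter exchange and
        # error compensation: apply the local gradient, accumulate the flattened
        # params into the error memory, compress and gossip the memory, keep the
        # compression residual as the new memory, then add the aggregated
        # decompressed neighbor information to the local params.
        # Returns the number of bits transmitted.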
        # Apply the gradients with the weight decay and momentum.
        with kargs["timer"]("grad.apply_grad", epoch=self.conf.epoch_):
            utils.apply_gradient(
                self.param_groups, self.state, apply_grad_to_model=True
            )

        with kargs["timer"]("grad.get_params", epoch=self.conf.epoch_):
            params, _ = comm.get_data(
                self.param_groups, self.param_names, is_get_grad=False
            )
            params_tb = TensorBuffer(params)

        with kargs["timer"]("grad.error_compensate", epoch=self.conf.epoch_):
            self.memory.buffer += params_tb.buffer

        with kargs["timer"]("grad.compress", epoch=self.conf.epoch_):
            sync_buffer = {"original_shapes": self.shapes, "params_tb": self.memory}
            local_compressed_params_tb = self.compressor.compress(sync_buffer)

        with kargs["timer"]("grad.update_memory", epoch=self.conf.epoch_):
            self.memory.buffer = self.memory.buffer - local_compressed_params_tb.buffer

        with kargs["timer"]("grad.sync", epoch=self.conf.epoch_):
            self.compressor.sync(sync_buffer)

        # update local model.
        with kargs["timer"]("grad.decompress", epoch=self.conf.epoch_):
            aggregated_info_tb = self.compressor.uncompress(
                sync_buffer, self.neighbors_info
            )
            params_tb.buffer += aggregated_info_tb.buffer
            params_tb.unpack(params)
        return sync_buffer["n_bits"]
Example #2
    def step(self, closure=None, **kargs):
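        # A step of local SGD with periodic sign-compressed synchronization:
        # always take the local update; when the sync condition holds, store the
        # difference to the consensus model in the memory, exchange sign
        # directions, average the per-tensor scales, move the consensus params
        # along the aggregated scaled-sign update, and assign them back to the
        # local model.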
        with kargs['timer']('sync', epoch=self.conf.epoch_):
            # do the local update steps.
            with kargs["timer"]("local_update", epoch=self.conf.epoch_):
                utils.apply_gradient(self.param_groups,
                                     self.state,
                                     apply_grad_to_model=True)

            # enter the global sync if it satisfies the condition.
            if (self.conf.epoch_ < self.turn_on_local_step_from_epoch
                    or self.conf.local_index % self.local_step == 0):
                with kargs["timer"]("get_params", epoch=self.conf.epoch_):
                    # get params.
                    params, _ = comm.get_data(self.param_groups,
                                              self.param_names,
                                              is_get_grad=False)
                    params_tb = TensorBuffer(params)
                with kargs['timer']('memory_and_compress',
                                    epoch=self.conf.epoch_):
                    # get the params difference w.r.t. previous synced model.
                    local_scale, local_sign = [], []
                    for consensus_param, param, memory in zip(
                            self.consensus_params_tb, params_tb,
                            self.memory_tb):
                        memory.data.copy_(consensus_param - param + memory)
                with kargs["timer"]("directions", epoch=self.conf.epoch_):
                    direction = exchange(self.memory_tb.buffer)  # signum
                with kargs['timer']('memory_and_compress',
                                    epoch=self.conf.epoch_):
                    for consensus_param, param, memory in zip(
                            self.consensus_params_tb, params_tb,
                            self.memory_tb):
                        _local_scale, _local_sign = scaled_sign(memory)
                        local_scale.append(_local_scale)
                        local_sign.append(_local_sign)
                        memory.data.copy_(memory - _local_scale * _local_sign)
                with kargs["timer"]("directions", epoch=self.conf.epoch_):
                    global_direction = TB(self.memory_tb, direction)
                with kargs["timer"]("magnitudes", epoch=self.conf.epoch_):
                    magnitudes_tb = TensorBuffer(local_scale)
                    magnitudes_tb.buffer = self.world_aggregator._agg(
                        magnitudes_tb.buffer,
                        "avg",
                        distributed=self.conf.distributed)
                # unpack the synced info and update the consensus params.
                with kargs["timer"]("update_consensus",
                                    epoch=self.conf.epoch_):
                    for update_magnitude, update_direction, consensus_param in zip(
                            magnitudes_tb, global_direction,
                            self.consensus_params_tb):
                        consensus_param.add_(
                            update_direction.mul(update_magnitude), alpha=-1.0)

                # make the local models consistent by assigning the consensus params.
                self.consensus_params_tb.unpack(params)
                n_bits = get_n_bits(magnitudes_tb.buffer)
            else:
                n_bits = 0
            return n_bits
Example #3
    def step(self, closure=None, **kargs):
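        # A step of local SGD without compression: always apply the local
        # gradient; on sync rounds, average the difference between the consensus
        # model and the local model across workers, move the consensus model by
        # that average, and assign it back to the local params.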
        with kargs["timer"]("sync.local_update", epoch=self.conf.epoch_):
            utils.apply_gradient(self.param_groups,
                                 self.state,
                                 apply_grad_to_model=True)

        with kargs["timer"]("sync.sync_and_update", epoch=self.conf.epoch_):
            # enter the global sync if it satisfies the condition.
            if (self.conf.epoch_ < self.turn_on_local_step_from_epoch
                    or self.conf.local_index % self.local_step == 0):
                # get params.
                params, _ = comm.get_data(self.param_groups,
                                          self.param_names,
                                          is_get_grad=False)
                params_tb = TensorBuffer(params)

                # get params_diff.
                param_diff = self.consensus_params_tb.buffer - params_tb.buffer
                # sync the directions.
                param_diff = self.world_aggregator._agg(
                    param_diff, "avg", distributed=self.conf.distributed)

                # unpack the synced info and update the consensus params.
                self.consensus_params_tb.buffer.add_(param_diff, alpha=-1.0)

                # make the local models consistent by assigning the consensus params.
                self.consensus_params_tb.unpack(params)

                # Get n_bits to transmit.
                n_bits = get_n_bits(param_diff)
            else:
                n_bits = 0
        return n_bits
Example #4
    def step(self, closure=None, **kargs):
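        # A step of compressed decentralized SGD with model extrapolation (the
        # structure resembles extrapolation-based schemes such as ECD-PSGD):
        # prepare the gradients, take an SGD step on the weighted average of the
        # neighbors' "hat" models and install it as the new local model, then
        # build an extrapolated model via `local_index`, compress and gossip it,
        # and uncompress the result into the neighbor hat params.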
        # Prepare the gradients with weight decay and momentum (not applied to the model yet).
        with kargs["timer"]("grad.apply_grad", epoch=self.conf.epoch_):
            utils.apply_gradient(
                self.param_groups, self.state, apply_grad_to_model=False
            )

        # get flattened params.
        with kargs["timer"]("grad.get_params", epoch=self.conf.epoch_):
            params, _ = comm.get_data(
                self.param_groups, self.param_names, is_get_grad=False
            )
            flatten_params = TensorBuffer(params)

            grads, _ = comm.get_data(
                self.param_groups, self.param_names, is_get_grad=True
            )
            flatten_grads = TensorBuffer(grads)

        with kargs["timer"]("grad.get_extrapolated_model", epoch=self.conf.epoch_):
            flatten_updated_params = deepcopy(flatten_params)

            # get weighted hat params.
            flatten_updated_params.buffer = sum(
                [
                    _hat_params.buffer * self.neighbors_info[_rank]
                    for _rank, _hat_params in self.neighbor_hat_params.items()
                ]
            )

        # get updated local model (flatten params).
        with kargs["timer"]("grad.unflatten_to_update", epoch=self.conf.epoch_):
            flatten_updated_params.buffer.add_(
                flatten_grads.buffer, alpha=-self.param_groups[0]["lr"]
            )
            flatten_updated_params.unpack(params)

            # get extrapolated model.
            flatten_updated_params.buffer = (
                (1 - 0.5 * self.conf.local_index) * flatten_params.buffer
                + 0.5 * self.conf.local_index * flatten_updated_params.buffer
            )

        # compress the model difference and sync.
        with kargs["timer"]("grad.compress", epoch=self.conf.epoch_):
            sync_buffer = {
                "original_shapes": self.shapes,
                "flatten_updated_params": flatten_updated_params,
            }
            self.compressor.compress(sync_buffer)

        with kargs["timer"]("grad.sync", epoch=self.conf.epoch_):
            self.compressor.sync(sync_buffer)

        with kargs["timer"]("grad.unflatten_to_update", epoch=self.conf.epoch_):
            self.compressor.uncompress(
                sync_buffer, self.neighbor_hat_params, self.conf.local_index
            )
        return sync_buffer["n_bits"]
Example #5
    def step(self, closure=None, **kargs):
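        # A step that switches on `conf.is_centralized`: either average the raw
        # gradients across all workers and apply them (centralized SGD), or
        # apply the local gradient and then gossip the model parameters through
        # a weighted decentralized aggregator.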
        if self.conf.is_centralized:
            with kargs["timer"]("sync/get_data", epoch=self.conf.epoch_):
                # Get data.
                grads, _ = comm.get_data(self.param_groups,
                                         self.param_names,
                                         is_get_grad=True)
                flatten_grads = TensorBuffer(grads)

            with kargs["timer"]("sync/sync", epoch=self.conf.epoch_):
                # Aggregate the gradients.
                flatten_grads.buffer = self.world_aggregator._agg(
                    flatten_grads.buffer,
                    op="avg",
                    distributed=self.conf.distributed)

            with kargs["timer"]("sync/unflatten_grad", epoch=self.conf.epoch_):
                # unflatten grads.
                flatten_grads.unpack(grads)

            with kargs["timer"]("sync/apply_grad", epoch=self.conf.epoch_):
                utils.apply_gradient(self.param_groups,
                                     self.state,
                                     apply_grad_to_model=True)

            # Get n_bits to transmit.
            n_bits = get_n_bits(flatten_grads.buffer)
        else:
            with kargs["timer"]("sync/apply_grad", epoch=self.conf.epoch_):
                utils.apply_gradient(self.param_groups,
                                     self.state,
                                     apply_grad_to_model=True)

            with kargs["timer"]("sync/get_data", epoch=self.conf.epoch_):
                # first get and flatten all params.
                params, _ = comm.get_data(self.param_groups,
                                          self.param_names,
                                          is_get_grad=False)
                flatten_params = TensorBuffer(params)

            with kargs["timer"]("sync/sync", epoch=self.conf.epoch_):
                # prepare the sync.
                if self.conf.comm_device == "cpu":
                    flatten_params.buffer = flatten_params.buffer.cpu().detach_()

                # then sync.
                flatten_params.buffer = self.decentralized_aggregator._agg(
                    flatten_params.buffer, op="weighted")

            with kargs["timer"]("sync/update_model", epoch=self.conf.epoch_):
                # finally unflatten.
                flatten_params.unpack(params)

            # Get n_bits to transmit.
            n_bits = get_n_bits(flatten_params.buffer)
        return n_bits
Example #6
    def step(self, closure=None, **kargs):
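        # A step of compressed decentralized SGD with an asynchronous pipeline
        # (the neighbor hat params and consensus step size are characteristic of
        # CHOCO-SGD-like methods): apply the local gradient, join the previous
        # compression/gossip thread, correct the local model towards the
        # neighbors' hat params, then launch a new helper thread that compresses
        # and gossips the updated model.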
        # Apply the gradients with the weight decay and momentum.
        with kargs["timer"]("sync.apply_grad", epoch=self.conf.epoch_):
            utils.apply_gradient(self.param_groups,
                                 self.state,
                                 apply_grad_to_model=True)

        with kargs["timer"]("sync.finish_sync", epoch=self.conf.epoch_):
            utils.join_thread(self.helper_thread)
            self.n_bits = self.sync_buffer.get("n_bits", 0)

        # recover current params and hat_params
        with kargs["timer"]("sync.recover_hat_params", epoch=self.conf.epoch_):
            params, flatten_params, flatten_hat_params = utils.recover_params(
                param_groups=self.param_groups,
                param_names=self.param_names,
                rank=self.rank,
                neighbor_hat_params=self.neighbor_hat_params,
                get_hat_params=True,
            )
        # get updated flatten params.
        with kargs["timer"]("sync.update_flatten_params",
                            epoch=self.conf.epoch_):
            utils.update_params_from_neighbor(
                neighbor_hat_params=self.neighbor_hat_params,
                flatten_params=flatten_params,
                consenus_stepsize=self.consenus_stepsize,
                self_rank=self.rank,
            )
        # update the local model.
        with kargs["timer"]("sync.update_local_model", epoch=self.conf.epoch_):
            flatten_params.unpack(params)

        # start compress/sync.
        with kargs["timer"]("sync.start_sync", epoch=self.conf.epoch_):
            self.sync_buffer = {
                "original_shapes": self.shapes,
                "flatten_params": flatten_params,
                "flatten_hat_params": flatten_hat_params,
            }

            self.helper_thread = utils.HelperThread(
                name=f"_thread_at_epoch_{self.conf.epoch_}.compress",
                func=self.compressor.pipeline,
                # the arguments below will be fed into `func`.
                sync_buffer=self.sync_buffer,
                neighbor_hat_params=self.neighbor_hat_params,
                neighbors_info=self.neighbors_info,
            )
            self.helper_thread.start()
            # note: `epoch_ % 1 == 0` always holds, so the helper thread is
            # joined immediately and the pipeline effectively runs synchronously.
            if self.conf.epoch_ % 1 == 0:
                utils.join_thread(self.helper_thread)
        return self.n_bits
Example #7
    def step(self, closure=None, **kargs):
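        # A step that delegates communication to a background gossip thread:
        # apply the local gradient, signal that the local model was updated,
        # then wait until the gossip of the model has completed before returning
        # the number of transmitted bits.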
        with kargs["timer"]("sync.apply_grad", epoch=self.conf.epoch_):
            # Apply the gradients with the weight decay and momentum.
            utils.apply_gradient(self.param_groups,
                                 self.state,
                                 apply_grad_to_model=True)
            self.updated_local_model_flag.set()

        with kargs["timer"]("sync.gossip_model", epoch=self.conf.epoch_):
            # block until the background thread has gossiped the model
            # (assuming the flag is an Event-like object with `wait`/`clear`).
            self.gossiped_dist_model_flag.wait()
            self.gossiped_dist_model_flag.clear()
        return self.n_bits.item()
Example #8
    def step(self, closure=None, **kargs):
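        # A step of compressed decentralized SGD on the neighbors' "hat" models:
        # prepare the gradients, take an SGD step on the weighted average of the
        # neighbor hat params, compress and gossip the result, uncompress it
        # into the hat params, and adopt the own updated hat params as the new
        # local model.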
        # Prepare the gradients with weight decay and momentum (not applied to the model yet).
        with kargs["timer"]("grad.apply_grad", epoch=self.conf.epoch_):
            utils.apply_gradient(self.param_groups,
                                 self.state,
                                 apply_grad_to_model=False)

        with kargs["timer"]("grad.get_grads", epoch=self.conf.epoch_):
            params, _ = comm.get_data(self.param_groups,
                                      self.param_names,
                                      is_get_grad=False)
            flatten_params = TensorBuffer(params)

            grads, _ = comm.get_data(self.param_groups,
                                     self.param_names,
                                     is_get_grad=True)
            flatten_grads = TensorBuffer(grads)

        # Get weighted hat params and apply the local gradient.
        with kargs["timer"]("grad.apply_local_gradient",
                            epoch=self.conf.epoch_):
            flatten_half_params = deepcopy(flatten_params)
            flatten_half_params.buffer = (sum([
                _hat_params.buffer * self.neighbors_info[_rank]
                for _rank, _hat_params in self.neighbor_hat_params.items()
            ]) - self.param_groups[0]["lr"] * flatten_grads.buffer)

        # compress the model difference and sync.
        with kargs["timer"]("grad.compress", epoch=self.conf.epoch_):
            sync_buffer = {
                "original_shapes": self.shapes,
                "flatten_half_params": flatten_half_params,
                "flatten_params": flatten_params,
            }
            self.compressor.compress(sync_buffer)

        with kargs["timer"]("grad.sync", epoch=self.conf.epoch_):
            self.compressor.sync(sync_buffer)

        # finally unflatten and update local model.
        with kargs["timer"]("grad.unflatten_to_update",
                            epoch=self.conf.epoch_):
            self.compressor.uncompress(sync_buffer, self.neighbor_hat_params)
            flatten_params.buffer = self.neighbor_hat_params[
                self.rank].buffer.clone()
            flatten_params.unpack(params)
        return sync_buffer["n_bits"]
Example #9
    def step(self, closure=None, **kargs):
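        # A step of compressed SGD with error feedback: prepare (but do not
        # apply) the gradients, add the residual memory, compress and sync the
        # result, keep the new compression residual in the memory, then apply
        # the decompressed synced gradient to the params with the learning rate.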
        with kargs["timer"]("sync.apply_grad", epoch=self.conf.epoch_):
            utils.apply_gradient(self.param_groups,
                                 self.state,
                                 apply_grad_to_model=False)

        with kargs["timer"]("sync.get_data", epoch=self.conf.epoch_):
            # Get data.
            grads, _ = comm.get_data(self.param_groups,
                                     self.param_names,
                                     is_get_grad=True)
            grads_tb = TensorBuffer(grads)

        with kargs["timer"]("sync.use_memory", epoch=self.conf.epoch_):
            # use memory.
            grads_tb.buffer.add_(self.memory_tb.buffer)

        with kargs["timer"]("sync.compress", epoch=self.conf.epoch_):
            # compress.
            sync_buffer = self.compressor.compress(grads_tb)

        with kargs["timer"]("sync.sync", epoch=self.conf.epoch_):
            self.compressor.sync(sync_buffer)

        with kargs["timer"]("sync.update_memory", epoch=self.conf.epoch_):
            # update memory.
            self.memory_tb.buffer = (grads_tb.buffer -
                                     sync_buffer["synced_grads_tb"].buffer)

        with kargs["timer"]("sync.decompress", epoch=self.conf.epoch_):
            sync_grads_tb = self.compressor.decompress(sync_buffer)

        with kargs["timer"]("sync.apply_grad", epoch=self.conf.epoch_):
            # apply the synced gradient to the params (a plain SGD update with the already-prepared gradient).
            params, _ = comm.get_data(self.param_groups,
                                      self.param_names,
                                      is_get_grad=False)
            params_tb = TensorBuffer(params)

            # apply the gradient.
            params_tb.buffer.add_(-self.param_groups[0]["lr"] *
                                  sync_grads_tb.buffer)

            # unpack.
            params_tb.unpack(params)
        return sync_buffer["n_bits"]
Example #10
    def step(self, closure=None, **kargs):
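        # A step of compressed gradient SGD without error feedback: prepare the
        # gradients, compress them, sync and decompress the aggregated update,
        # and apply it to the params scaled by the learning rate.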
        with kargs["timer"]("sync/get_data", epoch=self.conf.epoch_):
            # get params.
            params, _ = comm.get_data(self.param_groups,
                                      self.param_names,
                                      is_get_grad=False)
            params_tb = TensorBuffer(params)

        with kargs["timer"]("sync/apply_grad", epoch=self.conf.epoch_):
            # prepare the gradient (sign)
            utils.apply_gradient(self.param_groups,
                                 self.state,
                                 apply_grad_to_model=False)
            # get grads.
            grads, _ = comm.get_data(self.param_groups,
                                     self.param_names,
                                     is_get_grad=True)
            grads_tb = TensorBuffer(grads)

        # compress the (flattened) gradients.
        with kargs["timer"]("sync/compress", epoch=self.conf.epoch_):
            sync_buffer = self.compressor.compress(grads_tb)

        # sync and decompress.
        with kargs["timer"]("sync/sync_and_decompress",
                            epoch=self.conf.epoch_):
            self.compressor.sync(sync_buffer)
            synced_updates_tb = self.compressor.decompress(sync_buffer)

        # apply the synced update to the params and unpack.
        with kargs["timer"]("sync/apply_grad", epoch=self.conf.epoch_):
            params_tb.buffer -= (self.param_groups[0]["lr"] *
                                 synced_updates_tb.buffer)
            params_tb.unpack(params)
        return sync_buffer["n_bits"]