Example #1
    def _pillar_centers_from_index(self, xy_index: torch.tensor):
        """
            converts the pillar bounds into centers.  Pillars center shape must be (pillar_nbr, 2] with the
            last dimension being [x_min, y_min] for each pillar
        """
        logger.info("Calculating pillar_centers_from_index.")
        logger.debug(f"xy_index: {xy_index}{xy_index.shape}")

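        # Grid origin (x_min, y_min), cell size (x_step, y_step) and the z extent are
        # read from the pillar configuration.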
        min = torch.cuda.FloatTensor([
            self.pillars_cfg.getfloat('x_min'),
            self.pillars_cfg.getfloat('y_min')
        ])
        step = torch.cuda.FloatTensor([
            self.pillars_cfg.getfloat('x_step'),
            self.pillars_cfg.getfloat('y_step')
        ])
        z_center = torch.cuda.FloatTensor([
            (self.pillars_cfg.getfloat('z_max') -
             self.pillars_cfg.getfloat('z_min')) / 2.0
        ])

        # bring z_center to the same shape as xy_index for concatenation
        z_center = z_center.unsqueeze(0).expand(xy_index.shape[0], -1)

        # The actual pillar boundaries (min has to be added back); adding half a step
        # then moves each boundary to the pillar center
        xy_index = xy_index * step + min
        xy_index.add_(0.5 * step)
        xy_index = torch.cat((xy_index, z_center), dim=1)

        logger.debug(f"Center calculation complete.\n"
                     f"xy_center: {xy_index}{xy_index.shape},\n"
                     f"z_center: {z_center}{z_center.shape}")

        return xy_index
Example #2
    def _add_intrinsic_reward(self, batch: dict, actions: torch.tensor,
                              rewards: torch.tensor, masks: torch.tensor):
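        # When intrinsic rewards are enabled, query the intrinsic-reward policy and
        # add its output to the extrinsic rewards in place.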
        if self.r_enabled:
            intrinsic_r = self.r_policy.act(batch, actions, rewards, masks)
            rewards.add_(intrinsic_r)

        return rewards
Example #3
    def Compressed_Allreduce(self, buffer_m: torch.tensor, worker_error,
                             server_error, rank, world_size, comm, local_rank):

        all_start_time = time.time()
        original_size = buffer_m.numel()
        cupy.cuda.Device(local_rank).use()

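        # Pad buffer_m with zeros so its length matches the persistent worker_error buffer.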
        if torch.numel(buffer_m) != torch.numel(worker_error):
            empty_tensor = torch.zeros(torch.numel(worker_error) -
                                       torch.numel(buffer_m),
                                       device=buffer_m.device)
            buffer_m = torch.cat([buffer_m, empty_tensor])

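        # Error feedback: add the residual kept from the previous step, compute the
        # scale as the RMS of the compensated buffer, and store the new compression
        # error in worker_error.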
        buffer_m.add_(worker_error)
        worker_scale = torch.norm(buffer_m) / np.sqrt(torch.numel(buffer_m))
        sign_buffer_m = buffer_m.sign().add_(1).bool()
        sign_buffer_m = sign_buffer_m.float()
        sign_buffer_m.add_(-0.5).mul_(2.0)
        worker_error.set_((buffer_m - worker_scale * sign_buffer_m))
        sign_buffer_m = None

        compensated_buffer_m = buffer_m
        compensated_buffer_m.sign_()
        compensated_buffer_m = compensated_buffer_m.add_(1).bool()
        cupy_worker_scale = self.torch2cupy(worker_scale)
        cupy_compensated_buffer_m = self.torch2cupy(compensated_buffer_m)
        compensated_buffer_m = None

        cupy_sign_list_packed = self.compress_by_chunk(
            cupy_compensated_buffer_m, world_size)
        cupy_compensated_buffer_m = None

        cupy_recvbuf_sign = cupy.zeros(
            [world_size, cupy_sign_list_packed[rank].size],
            dtype=cupy_sign_list_packed[0].dtype)
        cupy_recvbuf_scale = cupy.zeros([world_size, 1],
                                        dtype=cupy_worker_scale.dtype)

        # Communication Phase 1
        gather_start = time.time()
        if self.cuda_aware:
            gather_cuda(rank, world_size, comm, cupy_sign_list_packed,
                        cupy_recvbuf_sign, cupy_worker_scale,
                        cupy_recvbuf_scale)
        else:
            cupy_sign_list_packed, cupy_recvbuf_sign, cupy_worker_scale, cupy_recvbuf_scale = gather_host(
                rank, world_size, comm, cupy_sign_list_packed,
                cupy_recvbuf_sign, cupy_worker_scale, cupy_recvbuf_scale)
        gather_end = time.time()

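        # Unpack the gathered sign bits back to +/-1 values, weight each worker's
        # signs by its scale (averaged over world_size), and sum over workers to
        # form the server-side estimate.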
        cupy_unpacked_sign = (cupy.unpackbits(
            cupy_recvbuf_sign.flatten())).reshape(world_size, -1)
        cupy_recvbuf_sign = None
        unpacked_sign = self.cupy2torch(cupy_unpacked_sign).float()
        cupy_unpacked_sign = None
        unpacked_sign = unpacked_sign.add_(-0.5).mul_(2.0)
        worker_scale = self.cupy2torch(cupy_recvbuf_scale).mul_(1 / world_size)
        compensated_server_m = unpacked_sign.mul_(worker_scale).sum(0)
        unpacked_sign = None

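        # Server-side error feedback: compensate with the stored server_error,
        # recompute the scale, and keep the new residual in server_error.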
        compensated_server_m.add_(server_error)
        server_scale = torch.norm(compensated_server_m) / np.sqrt(
            compensated_server_m.numel())
        sign_server_m = compensated_server_m.sign().add_(1).bool()
        sign_server_m = sign_server_m.float()
        sign_server_m.add_(-0.5).mul_(2.0)
        server_error.set_(compensated_server_m - server_scale * sign_server_m)
        sign_server_m = None

        compensated_server_m.sign_()
        compensated_server_m = compensated_server_m.add_(1).bool()
        cupy_server_scale = self.torch2cupy(server_scale)
        cupy_compensated_server_m = self.torch2cupy(compensated_server_m)
        compensated_server_m = None

        cupy_server_sign_packed = self.compress_by_chunk(
            cupy_compensated_server_m, 1)

        cupy_recvbuf_sign_server = cupy.zeros(
            [world_size, cupy_server_sign_packed[0].size],
            dtype=cupy_sign_list_packed[0].dtype)
        cupy_recvbuf_scale_server = cupy.zeros([world_size, 1],
                                               dtype=cupy_worker_scale.dtype)

        # Communication Phase 2
        if self.cuda_aware:
            allgather_cuda(comm, cupy_server_sign_packed[0],
                           cupy_recvbuf_sign_server, cupy_server_scale,
                           cupy_recvbuf_scale_server)
        else:
            (cupy_server_sign_packed[0], cupy_recvbuf_sign_server, cupy_server_scale,
             cupy_recvbuf_scale_server) = allgather_host(
                 comm, cupy_server_sign_packed[0], cupy_recvbuf_sign_server,
                 cupy_server_scale, cupy_recvbuf_scale_server)

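        # Unpack the all-gathered server sign chunks, apply each chunk's server
        # scale, and truncate the flattened result back to the original buffer size.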
        cupy_server_unpacked_sign = (cupy.unpackbits(
            cupy_recvbuf_sign_server.flatten())).reshape(world_size, -1)
        cupy_recvbuf_sign_server = None

        server_unpacked_sign = self.cupy2torch(cupy_server_unpacked_sign)
        cupy_server_unpacked_sign = None

        server_unpacked_sign = server_unpacked_sign.float().add_(-0.5).mul_(
            2.0)
        server_scale = self.cupy2torch(cupy_recvbuf_scale_server)
        buffer_m = server_unpacked_sign.mul_(
            server_scale).flatten()[0:original_size]

        return buffer_m
Example #4
    def compressed_allreduce(self,
                             buffer_m: torch.tensor,
                             worker_error,
                             server_error,
                             local_rank):

        # all_start_time = time.time()
        original_shape = buffer_m.size()
        if len(original_shape) > 1:
            buffer_m = torch.flatten(buffer_m)
        original_size = buffer_m.numel()
        worker_error_size = worker_error.numel()
        cupy.cuda.Device(local_rank).use()

        if original_size != worker_error_size:
            empty_tensor = torch.zeros(worker_error_size - original_size,
                                       device=buffer_m.device)
            buffer_m = torch.cat([buffer_m, empty_tensor])

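        # Error feedback: compensate with the residual from the previous step, compute
        # the scale as the RMS of the compensated buffer, and update worker_error.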
        buffer_m.add_(worker_error)
        worker_scale = torch.norm(buffer_m) / np.sqrt(torch.numel(buffer_m))
        worker_error.set_(buffer_m - worker_scale *
                          buffer_m.sign().add_(1).bool().float().add_(-0.5).mul_(2.0))

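        # Pack the worker's 1-bit signs into chunks; cast to uint8 first when bool
        # tensors are not supported (self.bool_not_supported).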
        if self.bool_not_supported:
            cupy_sign_list_packed = self.compression_backend.compress_by_chunk(
                self.compression_backend.torch2cupy(
                    buffer_m.sign_().add_(1).bool().to(dtype=torch.uint8)),
                self.size)
        else:
            cupy_sign_list_packed = self.compression_backend.compress_by_chunk(
                self.compression_backend.torch2cupy(buffer_m.sign_().add_(1).bool()),
                self.size)
        cupy_worker_scale = self.compression_backend.torch2cupy(worker_scale)

        cupy_recvbuf_sign = cupy.zeros(
            [self.size,
             cupy_sign_list_packed[self.rank].size],
            dtype=cupy_sign_list_packed[0].dtype)
        # cupy_recvbuf_scale = cupy.zeros([self.size, 1], dtype=cupy_worker_scale.dtype)

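        # Convert the packed cupy chunks and receive buffers to torch tensors so they
        # can be exchanged with torch.distributed collectives.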
        sign_list_packed = [
            self.compression_backend.cupy2torch(cupy_sign_list_packed[idx])
            for idx in range(self.size)
        ]

        # worker_scale = self.compression_backend.cupy2torch(cupy_worker_scale)
        recvbuf_sign = self.compression_backend.cupy2torch(cupy_recvbuf_sign)
        #recvbuf_scale = self.compression_backend.cupy2torch(cupy_recvbuf_scale)
        recvbuf_scale = [
            torch.zeros(1,
                        dtype=worker_scale.dtype,
                        device=torch.device(local_rank)) for i in range(self.size)
        ]

        # communication phase 1
        # gather_start = time.time()
        # Alltoall for sign
        dist.all_to_all_single(recvbuf_sign,
                               torch.stack(sign_list_packed),
                               group=self.world_group)
        # Allgather for scale
        dist.all_gather(recvbuf_scale, worker_scale, group=self.world_group)

        # gather_end = time.time()

        # cupy_sign_list_packed, sign_list_packed, cupy_worker_scale, worker_scale = None, None, None, None
        cupy_sign_list_packed = None

        cupy_recvbuf_sign = self.compression_backend.torch2cupy(recvbuf_sign)
        #cupy_recvbuf_scale = self.compression_backend.torch2cupy(torch.stack(recvbuf_scale))

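        # Unpack the exchanged sign bits to +/-1, weight each rank's signs by its
        # averaged scale, and sum over ranks to form the compensated server momentum.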
        compensated_server_m = self.compression_backend.cupy2torch(
            (cupy.unpackbits(cupy_recvbuf_sign.flatten())).reshape(
                self.size,
                -1)).float().add_(-0.5).mul_(2.0).mul_(
                    torch.stack(recvbuf_scale).mul_(1 / self.size)).sum(0)
        compensated_server_m.add_(server_error)
        server_scale = torch.norm(compensated_server_m) / np.sqrt(
            compensated_server_m.numel())
        server_error.set_(
            compensated_server_m - server_scale *
            compensated_server_m.sign().add_(1).bool().float().add_(-0.5).mul_(2.0))

        # cupy_server_scale = self.compression_backend.torch2cupy(server_scale)

        if self.bool_not_supported:
            cupy_server_sign_packed = self.compression_backend.compress_by_chunk(
                self.compression_backend.torch2cupy(
                    compensated_server_m.sign_().add_(1).bool().to(dtype=torch.uint8)),
                1)
        else:
            cupy_server_sign_packed = self.compression_backend.compress_by_chunk(
                self.compression_backend.torch2cupy(
                    compensated_server_m.sign_().add_(1).bool()),
                1)
        compensated_server_m = None

        cupy_recvbuf_sign_server = cupy.zeros(
            [self.size,
             cupy_server_sign_packed[0].size],
            dtype=cupy_recvbuf_sign.dtype)
        # cupy_recvbuf_sign, recvbuf_sign = None, None
        cupy_recvbuf_sign = None

        server_sign_packed = [
            self.compression_backend.cupy2torch(cupy_server_sign_packed[0])
        ]
        recvbuf_sign_server = [
            self.compression_backend.cupy2torch(cupy_recvbuf_sign_server[idx])
            for idx in range(self.size)
        ]

        # server_scale = self.compression_backend.cupy2torch(cupy_server_scale)
        cupy_recvbuf_scale_server = cupy.zeros([self.size,
                                                1],
                                               dtype=cupy_worker_scale.dtype)
        # cupy_recvbuf_scale, recvbuf_scale = None, None

        recvbuf_scale_server = [
            self.compression_backend.cupy2torch(cupy_recvbuf_scale_server[idx])
            for idx in range(self.size)
        ]

        # Communication Phase 2
        dist.all_gather(recvbuf_sign_server,
                        server_sign_packed[0],
                        group=self.world_group)
        dist.all_gather(recvbuf_scale_server, server_scale, group=self.world_group)

        cupy_server_sign_packed = None

        # need to convert from a tensor list to a single tensor
        # dist.all_gather only provides a tensor list as the recv/output buffer
        recvbuf_sign_server = torch.stack(recvbuf_sign_server)

        cupy_recvbuf_sign_server = self.compression_backend.torch2cupy(
            recvbuf_sign_server)

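        # Decompress the final result: unpack the server sign chunks, apply each
        # chunk's scale, and copy the flattened values back into buffer_m.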
        buffer_m.data.copy_(
            self.compression_backend.cupy2torch(
                (cupy.unpackbits(cupy_recvbuf_sign_server.flatten())).reshape(
                    self.size,
                    -1)).float().add_(-0.5).mul_(2.0).mul_(
                        self.compression_backend.cupy2torch(
                            cupy_recvbuf_scale_server)).flatten().data)
        if original_size != worker_error_size:
            buffer_m = buffer_m[0:original_size]
        if len(original_shape) > 1:
            buffer_m = buffer_m.reshape(original_shape)

        return buffer_m
Example #5
    def compressed_allreduce(self, buffer_m: torch.tensor, worker_error,
                             server_error, local_rank):

        all_start_time = time.time()
        original_shape = buffer_m.size()
        if len(original_shape) > 1:
            buffer_m = torch.flatten(buffer_m)
        original_size = buffer_m.numel()
        worker_error_size = worker_error.numel()
        cupy.cuda.Device(local_rank).use()

        if original_size != worker_error_size:
            empty_tensor = torch.zeros(worker_error_size - original_size,
                                       device=buffer_m.device)
            buffer_m = torch.cat([buffer_m, empty_tensor])

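        # Error feedback: add the residual from the previous step, compute the scale
        # as the RMS of the compensated buffer, and store the new error in worker_error.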
        buffer_m.add_(worker_error)
        worker_scale = torch.norm(buffer_m) / np.sqrt(torch.numel(buffer_m))
        worker_error.set_(
            buffer_m - worker_scale *
            buffer_m.sign().add_(1).bool().float().add_(-0.5).mul_(2.0))

        cupy_sign_list_packed = self.compression_backend.compress_by_chunk(
            self.compression_backend.torch2cupy(
                buffer_m.sign_().add_(1).bool()), self.size)
        cupy_worker_scale = self.compression_backend.torch2cupy(worker_scale)

        cupy_recvbuf_sign = cupy.zeros(
            [self.size, cupy_sign_list_packed[self.rank].size],
            dtype=cupy_sign_list_packed[0].dtype)
        cupy_recvbuf_scale = cupy.zeros([self.size, 1],
                                        dtype=cupy_worker_scale.dtype)

        # Communication Phase 1
        gather_start = time.time()
        if self.cuda_aware:
            self.gather_cuda(self.rank, self.size, self.comm,
                             cupy_sign_list_packed, cupy_recvbuf_sign,
                             cupy_worker_scale, cupy_recvbuf_scale)
        else:
            _, cupy_recvbuf_sign, _, cupy_recvbuf_scale = self.gather_host(
                self.rank, self.size, self.comm, cupy_sign_list_packed,
                cupy_recvbuf_sign, cupy_worker_scale, cupy_recvbuf_scale)
        gather_end = time.time()

        # cupy_sign_list_packed, cupy_worker_scale, worker_scale = None, None, None
        cupy_sign_list_packed = None

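        # Unpack the gathered sign bits to +/-1, weight by the averaged per-worker
        # scales, and sum over ranks to form the compensated server momentum.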
        compensated_server_m = self.compression_backend.cupy2torch(
            (cupy.unpackbits(cupy_recvbuf_sign.flatten())
             ).reshape(self.size, -1)).float().add_(-0.5).mul_(2.0).mul_(
                 self.compression_backend.cupy2torch(cupy_recvbuf_scale).mul_(
                     1 / self.size)).sum(0)
        compensated_server_m.add_(server_error)
        server_scale = torch.norm(compensated_server_m) / np.sqrt(
            compensated_server_m.numel())
        server_error.set_(compensated_server_m -
                          server_scale * compensated_server_m.sign().add_(
                              1).bool().float().add_(-0.5).mul_(2.0))

        cupy_server_scale = self.compression_backend.torch2cupy(server_scale)

        cupy_server_sign_packed = self.compression_backend.compress_by_chunk(
            self.compression_backend.torch2cupy(
                compensated_server_m.sign_().add_(1).bool()), 1)
        compensated_server_m = None

        cupy_recvbuf_sign_server = cupy.zeros(
            [self.size, cupy_server_sign_packed[0].size],
            dtype=cupy_recvbuf_sign.dtype)
        cupy_recvbuf_scale_server = cupy.zeros([self.size, 1],
                                               dtype=cupy_recvbuf_scale.dtype)
        # cupy_recvbuf_sign, cupy_recvbuf_scale = None, None
        cupy_recvbuf_sign = None

        # Communication Phase 2
        if self.cuda_aware:
            self.allgather_cuda(self.comm, cupy_server_sign_packed[0],
                                cupy_recvbuf_sign_server, cupy_server_scale,
                                cupy_recvbuf_scale_server)
        else:
            _, cupy_recvbuf_sign_server, _, cupy_recvbuf_scale_server = self.allgather_host(
                self.comm, cupy_server_sign_packed[0],
                cupy_recvbuf_sign_server, cupy_server_scale,
                cupy_recvbuf_scale_server)

        # cupy_server_sign_packed, cupy_server_scale, server_scale = None, None, None
        cupy_server_sign_packed = None

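        # Decompress the final result: unpack the server sign chunks, apply each
        # chunk's scale, and copy the flattened values back into buffer_m.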
        buffer_m.data.copy_(
            self.compression_backend.cupy2torch(
                (cupy.unpackbits(cupy_recvbuf_sign_server.flatten())).reshape(
                    self.size, -1)).float().add_(-0.5).mul_(2.0).mul_(
                        self.compression_backend.cupy2torch(
                            cupy_recvbuf_scale_server)).flatten().data)
        if original_size != worker_error_size:
            buffer_m = buffer_m[0:original_size]
        if len(original_shape) > 1:
            buffer_m = buffer_m.reshape(original_shape)

        # cupy_recvbuf_sign_server, cupy_recvbuf_scale_server = None, None

        return buffer_m