def _pillar_centers_from_index(self, xy_index: torch.tensor):
    """Convert integer pillar grid indices into 3-D pillar center coordinates.

    Args:
        xy_index: tensor of shape (pillar_nbr, 2) holding the integer
            (x, y) grid index of each pillar.

    Returns:
        Tensor of shape (pillar_nbr, 3): the [x, y, z] center of each pillar
        in metric coordinates (x/y from the grid index, one shared z value).
    """
    logger.info("Calculating pillar_centers_from_index.")
    logger.debug(f"xy_index: {xy_index}{xy_index.shape}")
    # Grid origin and cell size, read from the pillar configuration.
    # (renamed from `min`/`step`: `min` shadowed the builtin)
    xy_min = torch.cuda.FloatTensor([
        self.pillars_cfg.getfloat('x_min'),
        self.pillars_cfg.getfloat('y_min')
    ])
    xy_step = torch.cuda.FloatTensor([
        self.pillars_cfg.getfloat('x_step'),
        self.pillars_cfg.getfloat('y_step')
    ])
    # NOTE(review): this is the half-extent of the z range (center offset
    # from z_min), not the absolute midpoint (z_min + z_max) / 2 — TODO
    # confirm downstream consumers expect this convention.
    z_center = torch.cuda.FloatTensor([
        (self.pillars_cfg.getfloat('z_max') -
         self.pillars_cfg.getfloat('z_min')) / 2.0
    ])
    # Broadcast the single z value to one row per pillar for concatenation.
    z_center = z_center.unsqueeze(0).expand(xy_index.shape[0], -1)
    # Grid index -> metric lower-left corner of each pillar (origin re-added).
    xy_index = xy_index * xy_step + xy_min
    # Shift from the pillar corner to its center.
    xy_index.add_(0.5 * xy_step)
    xy_index = torch.cat((xy_index, z_center), dim=1)
    logger.debug(f"Center calculation complete.\n"
                 f"xy_center: {xy_index}{xy_index.shape},\n"
                 f"z_center: {z_center}{z_center.shape}")
    return xy_index
def _add_intrinsic_reward(self, batch: dict, actions: torch.tensor, rewards: torch.tensor, masks: torch.tensor): if self.r_enabled: intrinsic_r = self.r_policy.act(batch, actions, rewards, masks) rewards.add_(intrinsic_r) return rewards
def Compressed_Allreduce(self, buffer_m: torch.tensor, worker_error, server_error, rank, world_size, comm, local_rank):
    """Error-feedback sign-compressed allreduce of ``buffer_m``.

    Each worker sends only the signs of its (error-compensated) gradient
    plus one scalar scale; a second round allgathers the server-side
    sign/scale. ``worker_error`` and ``server_error`` are updated IN PLACE
    (via ``set_``) with the compression residuals for the next step.
    Returns the reconstructed, averaged buffer (first ``original_size``
    elements; the buffer may have been zero-padded to the error size).
    """
    all_start_time = time.time()
    original_size = buffer_m.numel()
    cupy.cuda.Device(local_rank).use()
    # Pad buffer to the worker-error size so shapes match for compensation.
    if torch.numel(buffer_m) != torch.numel(worker_error):
        empty_tensor = torch.zeros(torch.numel(worker_error) - torch.numel(buffer_m),
                                   device=buffer_m.device)
        buffer_m = torch.cat([buffer_m, empty_tensor])
    # Error feedback: fold in last round's compression residual.
    buffer_m.add_(worker_error)
    # Per-worker scale so that scale * sign approximates the buffer in L2 norm.
    worker_scale = torch.norm(buffer_m) / np.sqrt(torch.numel(buffer_m))
    # sign -> {-1, +1} float mapping: sign().add_(1).bool() -> {0,1},
    # then .float().add_(-0.5).mul_(2.0) -> {-1.0, +1.0} (zeros become +1).
    sign_buffer_m = buffer_m.sign().add_(1).bool()
    sign_buffer_m = sign_buffer_m.float()
    sign_buffer_m.add_(-0.5).mul_(2.0)
    # New worker residual = what the sign+scale representation lost.
    worker_error.set_((buffer_m - worker_scale * sign_buffer_m))
    sign_buffer_m = None
    # Re-derive the {0,1} sign bits (in place on buffer_m) for bit packing.
    compensated_buffer_m = buffer_m
    compensated_buffer_m.sign_()
    compensated_buffer_m = compensated_buffer_m.add_(1).bool()
    cupy_worker_scale = self.torch2cupy(worker_scale)
    cupy_compensated_buffer_m = self.torch2cupy(compensated_buffer_m)
    compensated_buffer_m = None
    # Pack sign bits into bytes, chunked one piece per worker
    # (compress_by_chunk presumably wraps cupy.packbits — confirm in backend).
    cupy_sign_list_packed = self.compress_by_chunk(
        cupy_compensated_buffer_m, world_size)
    cupy_compensated_buffer_m = None
    cupy_recvbuf_sign = cupy.zeros(
        [world_size, cupy_sign_list_packed[rank].size],
        dtype=cupy_sign_list_packed[0].dtype)
    cupy_recvbuf_scale = cupy.zeros([world_size, 1],
                                    dtype=cupy_worker_scale.dtype)
    # Communication Phase 1: gather every worker's sign chunk and scale.
    gather_start = time.time()
    if self.cuda_aware:
        gather_cuda(rank, world_size, comm, cupy_sign_list_packed,
                    cupy_recvbuf_sign, cupy_worker_scale, cupy_recvbuf_scale)
    else:
        # Host-staged path: buffers are round-tripped through host memory.
        cupy_sign_list_packed, cupy_recvbuf_sign, cupy_worker_scale, cupy_recvbuf_scale = gather_host(
            rank, world_size, comm, cupy_sign_list_packed, cupy_recvbuf_sign,
            cupy_worker_scale, cupy_recvbuf_scale)
    gather_end = time.time()
    # Unpack received bits back to one row of {0,1} values per worker.
    cupy_unpacked_sign = (cupy.unpackbits(
        cupy_recvbuf_sign.flatten())).reshape(world_size, -1)
    cupy_recvbuf_sign = None
    unpacked_sign = self.cupy2torch(cupy_unpacked_sign).float()
    cupy_unpacked_sign = None
    # {0,1} -> {-1,+1}
    unpacked_sign = unpacked_sign.add_(-0.5).mul_(2.0)
    # Average of worker scales (divide by world_size once, then sum below).
    worker_scale = self.cupy2torch(cupy_recvbuf_scale).mul_(1 / world_size)
    # Server-side reduction: scaled-sign contributions summed over workers.
    compensated_server_m = unpacked_sign.mul_(worker_scale).sum(0)
    unpacked_sign = None
    # Error feedback on the server side, then compress again (same scheme).
    compensated_server_m.add_(server_error)
    server_scale = torch.norm(compensated_server_m) / np.sqrt(
        compensated_server_m.numel())
    sign_server_m = compensated_server_m.sign().add_(1).bool()
    sign_server_m = sign_server_m.float()
    sign_server_m.add_(-0.5).mul_(2.0)
    server_error.set_(compensated_server_m - server_scale * sign_server_m)
    sign_server_m = None
    compensated_server_m.sign_()
    compensated_server_m = compensated_server_m.add_(1).bool()
    cupy_server_scale = self.torch2cupy(server_scale)
    cupy_compensated_server_m = self.torch2cupy(compensated_server_m)
    compensated_server_m = None
    # Single chunk this time: every rank broadcasts its server share.
    cupy_server_sign_packed = self.compress_by_chunk(
        cupy_compensated_server_m, 1)
    cupy_recvbuf_sign_server = cupy.zeros(
        [world_size, cupy_server_sign_packed[0].size],
        dtype=cupy_sign_list_packed[0].dtype)
    cupy_recvbuf_scale_server = cupy.zeros([world_size, 1],
                                           dtype=cupy_worker_scale.dtype)
    # Communication Phase 2: allgather server sign chunks and scales.
    if self.cuda_aware:
        allgather_cuda(comm, cupy_server_sign_packed[0],
                       cupy_recvbuf_sign_server, cupy_server_scale,
                       cupy_recvbuf_scale_server)
    else:
        cupy_server_sign_packed[
            0], cupy_recvbuf_sign_server, cupy_server_scale, cupy_recvbuf_scale_server = allgather_host(
                comm, cupy_server_sign_packed[0], cupy_recvbuf_sign_server,
                cupy_server_scale, cupy_recvbuf_scale_server)
    # Decode the final result: unpack bits, map to {-1,+1}, apply per-rank
    # server scales, and trim any padding added at the top of the function.
    cupy_server_unpacked_sign = (cupy.unpackbits(
        cupy_recvbuf_sign_server.flatten())).reshape(world_size, -1)
    cupy_recvbuf_sign_server = None
    server_unpacked_sign = self.cupy2torch(cupy_server_unpacked_sign)
    cupy_server_unpacked_sign = None
    server_unpacked_sign = server_unpacked_sign.float().add_(-0.5).mul_(
        2.0)
    server_scale = self.cupy2torch(cupy_recvbuf_scale_server)
    buffer_m = server_unpacked_sign.mul_(
        server_scale).flatten()[0:original_size]
    return buffer_m
def compressed_allreduce(self, buffer_m: torch.tensor, worker_error, server_error, local_rank):
    """Error-feedback sign-compressed allreduce over ``torch.distributed``.

    Same two-phase scheme as the NCCL/MPI variants: phase 1 all-to-alls
    each worker's packed sign chunk and allgathers its scalar scale;
    phase 2 allgathers the server-side compressed result. ``worker_error``
    and ``server_error`` hold the compression residuals and are updated
    IN PLACE via ``set_``. ``buffer_m`` is flattened/padded as needed and
    its data overwritten with the decoded result before being restored to
    the original shape.
    """
    # all_start_time = time.time()
    original_shape = buffer_m.size()
    if len(original_shape) > 1:
        buffer_m = torch.flatten(buffer_m)
    original_size = buffer_m.numel()
    worker_error_size = worker_error.numel()
    cupy.cuda.Device(local_rank).use()
    # Pad so buffer and residual line up elementwise.
    if original_size != worker_error_size:
        empty_tensor = torch.zeros(worker_error_size - original_size,
                                   device=buffer_m.device)
        buffer_m = torch.cat([buffer_m, empty_tensor])
    # Error feedback: re-inject last round's compression residual.
    buffer_m.add_(worker_error)
    worker_scale = torch.norm(buffer_m) / np.sqrt(torch.numel(buffer_m))
    # New residual = buffer minus its sign+scale approximation. The chain
    # sign().add_(1).bool().float().add_(-0.5).mul_(2.0) maps to {-1,+1}.
    worker_error.set_(buffer_m - worker_scale *
                      buffer_m.sign().add_(1).bool().float().add_(-0.5).mul_(2.0))
    # Pack {0,1} sign bits, one chunk per rank. uint8 round-trip for
    # backends where bool tensors are not supported.
    if self.bool_not_supported:
        cupy_sign_list_packed = self.compression_backend.compress_by_chunk(
            self.compression_backend.torch2cupy(
                buffer_m.sign_().add_(1).bool().to(dtype=torch.uint8)),
            self.size)
    else:
        cupy_sign_list_packed = self.compression_backend.compress_by_chunk(
            self.compression_backend.torch2cupy(buffer_m.sign_().add_(1).bool()),
            self.size)
    cupy_worker_scale = self.compression_backend.torch2cupy(worker_scale)
    cupy_recvbuf_sign = cupy.zeros(
        [self.size, cupy_sign_list_packed[self.rank].size],
        dtype=cupy_sign_list_packed[0].dtype)
    # cupy_recvbuf_scale = cupy.zeros([self.size, 1], dtype=cupy_worker_scale.dtype)
    # torch.distributed collectives want torch tensors, so the packed cupy
    # buffers are viewed as torch tensors (zero-copy via the backend).
    sign_list_packed = [
        self.compression_backend.cupy2torch(cupy_sign_list_packed[idx])
        for idx in range(self.size)
    ]
    # worker_scale = self.compression_backend.cupy2torch(cupy_worker_scale)
    recvbuf_sign = self.compression_backend.cupy2torch(cupy_recvbuf_sign)
    #recvbuf_scale = self.compression_backend.cupy2torch(cupy_recvbuf_scale)
    recvbuf_scale = [
        torch.zeros(1, dtype=worker_scale.dtype,
                    device=torch.device(local_rank)) for i in range(self.size)
    ]
    # communication phase 1
    # gather_start = time.time()
    # Alltoall for sign
    dist.all_to_all_single(recvbuf_sign,
                           torch.stack(sign_list_packed),
                           group=self.world_group)
    # Allgather for scale
    dist.all_gather(recvbuf_scale, worker_scale, group=self.world_group)
    # gather_end = time.time()
    # cupy_sign_list_packed, sign_list_packed, cupy_worker_scale, worker_scale = None, None, None, None
    cupy_sign_list_packed = None
    cupy_recvbuf_sign = self.compression_backend.torch2cupy(recvbuf_sign)
    #cupy_recvbuf_scale = self.compression_backend.torch2cupy(torch.stack(recvbuf_scale))
    # Decode: unpack bits -> {-1,+1} rows, multiply by averaged per-worker
    # scales, and sum over workers to form the server-side estimate.
    compensated_server_m = self.compression_backend.cupy2torch(
        (cupy.unpackbits(cupy_recvbuf_sign.flatten())).reshape(
            self.size, -1)).float().add_(-0.5).mul_(2.0).mul_(
                torch.stack(recvbuf_scale).mul_(1 / self.size)).sum(0)
    # Server-side error feedback, then compress again (single chunk).
    compensated_server_m.add_(server_error)
    server_scale = torch.norm(compensated_server_m) / np.sqrt(
        compensated_server_m.numel())
    server_error.set_(
        compensated_server_m - server_scale *
        compensated_server_m.sign().add_(1).bool().float().add_(-0.5).mul_(2.0))
    # cupy_server_scale = self.compression_backend.torch2cupy(server_scale)
    if self.bool_not_supported:
        cupy_server_sign_packed = self.compression_backend.compress_by_chunk(
            self.compression_backend.torch2cupy(
                compensated_server_m.sign_().add_(1).bool().to(
                    dtype=torch.uint8)), 1)
    else:
        cupy_server_sign_packed = self.compression_backend.compress_by_chunk(
            self.compression_backend.torch2cupy(
                compensated_server_m.sign_().add_(1).bool()), 1)
    compensated_server_m = None
    cupy_recvbuf_sign_server = cupy.zeros(
        [self.size, cupy_server_sign_packed[0].size],
        dtype=cupy_recvbuf_sign.dtype)
    # cupy_recvbuf_sign, recvbuf_sign = None, None
    cupy_recvbuf_sign = None
    server_sign_packed = [
        self.compression_backend.cupy2torch(cupy_server_sign_packed[0])
    ]
    recvbuf_sign_server = [
        self.compression_backend.cupy2torch(cupy_recvbuf_sign_server[idx])
        for idx in range(self.size)
    ]
    # server_scale = self.compression_backend.cupy2torch(cupy_server_scale)
    cupy_recvbuf_scale_server = cupy.zeros([self.size, 1],
                                           dtype=cupy_worker_scale.dtype)
    # cupy_recvbuf_scale, recvbuf_scale = None, None
    recvbuf_scale_server = [
        self.compression_backend.cupy2torch(cupy_recvbuf_scale_server[idx])
        for idx in range(self.size)
    ]
    # Communication Phase 2
    dist.all_gather(recvbuf_sign_server,
                    server_sign_packed[0],
                    group=self.world_group)
    dist.all_gather(recvbuf_scale_server,
                    server_scale,
                    group=self.world_group)
    cupy_server_sign_packed = None
    # need to convert from a tensor list to a single tensor
    # dist.all_gather only provides a tensor list as the recv/output buffer
    recvbuf_sign_server = torch.stack(recvbuf_sign_server)
    cupy_recvbuf_sign_server = self.compression_backend.torch2cupy(
        recvbuf_sign_server)
    # Final decode written straight into buffer_m's storage.
    buffer_m.data.copy_(
        self.compression_backend.cupy2torch(
            (cupy.unpackbits(cupy_recvbuf_sign_server.flatten())).reshape(
                self.size, -1)).float().add_(-0.5).mul_(2.0).mul_(
                    self.compression_backend.cupy2torch(
                        cupy_recvbuf_scale_server)).flatten().data)
    # Strip padding and restore the caller's original shape.
    if original_size != worker_error_size:
        buffer_m = buffer_m[0:original_size]
    if len(original_shape) > 1:
        buffer_m = buffer_m.reshape(original_shape)
    return buffer_m
def compressed_allreduce(self, buffer_m: torch.tensor, worker_error, server_error, local_rank):
    """Error-feedback sign-compressed allreduce over ``self.comm``.

    Variant of the same two-phase scheme that communicates through the
    instance's ``gather_cuda``/``gather_host`` and
    ``allgather_cuda``/``allgather_host`` helpers (CUDA-aware vs.
    host-staged, selected by ``self.cuda_aware``). ``worker_error`` and
    ``server_error`` are the compression residuals, updated IN PLACE via
    ``set_``; ``buffer_m``'s data is overwritten with the decoded result
    and returned in its original shape.
    """
    all_start_time = time.time()
    original_shape = buffer_m.size()
    if len(original_shape) > 1:
        buffer_m = torch.flatten(buffer_m)
    original_size = buffer_m.numel()
    worker_error_size = worker_error.numel()
    cupy.cuda.Device(local_rank).use()
    # Pad so buffer and residual line up elementwise.
    if original_size != worker_error_size:
        empty_tensor = torch.zeros(worker_error_size - original_size,
                                   device=buffer_m.device)
        buffer_m = torch.cat([buffer_m, empty_tensor])
    # Error feedback: re-inject last round's compression residual.
    buffer_m.add_(worker_error)
    worker_scale = torch.norm(buffer_m) / np.sqrt(torch.numel(buffer_m))
    # New residual = buffer minus its sign+scale approximation; the chain
    # sign().add_(1).bool().float().add_(-0.5).mul_(2.0) maps to {-1,+1}.
    worker_error.set_(
        buffer_m - worker_scale *
        buffer_m.sign().add_(1).bool().float().add_(-0.5).mul_(2.0))
    # Pack {0,1} sign bits into bytes, one chunk per rank.
    cupy_sign_list_packed = self.compression_backend.compress_by_chunk(
        self.compression_backend.torch2cupy(
            buffer_m.sign_().add_(1).bool()), self.size)
    cupy_worker_scale = self.compression_backend.torch2cupy(worker_scale)
    cupy_recvbuf_sign = cupy.zeros(
        [self.size, cupy_sign_list_packed[self.rank].size],
        dtype=cupy_sign_list_packed[0].dtype)
    cupy_recvbuf_scale = cupy.zeros([self.size, 1],
                                    dtype=cupy_worker_scale.dtype)
    # Communication Phase 1
    gather_start = time.time()
    if self.cuda_aware:
        self.gather_cuda(self.rank, self.size, self.comm,
                         cupy_sign_list_packed, cupy_recvbuf_sign,
                         cupy_worker_scale, cupy_recvbuf_scale)
    else:
        # Host-staged gather; only the receive buffers are kept.
        _, cupy_recvbuf_sign, _, cupy_recvbuf_scale = self.gather_host(
            self.rank, self.size, self.comm, cupy_sign_list_packed,
            cupy_recvbuf_sign, cupy_worker_scale, cupy_recvbuf_scale)
    gather_end = time.time()
    # cupy_sign_list_packed, cupy_worker_scale, worker_scale = None, None, None
    cupy_sign_list_packed = None
    # Decode: unpack bits -> {-1,+1} rows, multiply by averaged per-worker
    # scales, and sum over workers to form the server-side estimate.
    compensated_server_m = self.compression_backend.cupy2torch(
        (cupy.unpackbits(cupy_recvbuf_sign.flatten())
         ).reshape(self.size, -1)).float().add_(-0.5).mul_(2.0).mul_(
             self.compression_backend.cupy2torch(cupy_recvbuf_scale).mul_(
                 1 / self.size)).sum(0)
    # Server-side error feedback, then compress again (single chunk).
    compensated_server_m.add_(server_error)
    server_scale = torch.norm(compensated_server_m) / np.sqrt(
        compensated_server_m.numel())
    server_error.set_(compensated_server_m -
                      server_scale * compensated_server_m.sign().add_(
                          1).bool().float().add_(-0.5).mul_(2.0))
    cupy_server_scale = self.compression_backend.torch2cupy(server_scale)
    cupy_server_sign_packed = self.compression_backend.compress_by_chunk(
        self.compression_backend.torch2cupy(
            compensated_server_m.sign_().add_(1).bool()), 1)
    compensated_server_m = None
    cupy_recvbuf_sign_server = cupy.zeros(
        [self.size, cupy_server_sign_packed[0].size],
        dtype=cupy_recvbuf_sign.dtype)
    cupy_recvbuf_scale_server = cupy.zeros([self.size, 1],
                                           dtype=cupy_recvbuf_scale.dtype)
    # cupy_recvbuf_sign, cupy_recvbuf_scale = None, None
    cupy_recvbuf_sign = None
    # Communication Phase 2
    if self.cuda_aware:
        self.allgather_cuda(self.comm, cupy_server_sign_packed[0],
                            cupy_recvbuf_sign_server, cupy_server_scale,
                            cupy_recvbuf_scale_server)
    else:
        _, cupy_recvbuf_sign_server, _, cupy_recvbuf_scale_server = self.allgather_host(
            self.comm, cupy_server_sign_packed[0], cupy_recvbuf_sign_server,
            cupy_server_scale, cupy_recvbuf_scale_server)
    # cupy_server_sign_packed, cupy_server_scale, server_scale = None, None, None
    cupy_server_sign_packed = None
    # Final decode written straight into buffer_m's storage.
    buffer_m.data.copy_(
        self.compression_backend.cupy2torch(
            (cupy.unpackbits(cupy_recvbuf_sign_server.flatten())).reshape(
                self.size, -1)).float().add_(-0.5).mul_(2.0).mul_(
                    self.compression_backend.cupy2torch(
                        cupy_recvbuf_scale_server)).flatten().data)
    # Strip padding and restore the caller's original shape.
    if original_size != worker_error_size:
        buffer_m = buffer_m[0:original_size]
    if len(original_shape) > 1:
        buffer_m = buffer_m.reshape(original_shape)
    # cupy_recvbuf_sign_server, cupy_recvbuf_scale_server = None, None
    return buffer_m