def compute_ae_bottleneck_size(self, x, print_info=False):
    """Measure the smallest intermediate output produced inside the autoencoder.

    ``x`` is passed through ``self.head_model`` and then, one by one, through
    the modules that ``module_util.extract_decomposable_modules`` collects
    from ``self.autoencoder``. After every module the ratio of output element
    count to input element count is computed, and the output with the
    smallest ratio is treated as the bottleneck.

    Args:
        x: Input batch fed to ``self.head_model``.
        print_info: When True, print the three returned values.

    Returns:
        A tuple ``(min_rate, output_data_size, quantized_output_data_size)``:
        the smallest output/input element ratio, and ``sys.getsizeof`` of the
        pickled bottleneck output and of its quantized form, each divided by
        1024.

    Note:
        If no module is collected, ``min_rate`` stays ``None`` and both sizes
        are derived from ``sys.getsizeof(None)``; ``print_info=True`` then
        fails on ``min_rate * 100.0``.
    """
    features = self.head_model(x)
    stages = []
    # The helper fills ``stages`` in place.
    module_util.extract_decomposable_modules(self.autoencoder, features, stages)
    stages = [stage.to(x.device) for stage in stages]
    input_numel = np.prod(x.size())

    min_rate = None
    raw_blob = None
    quantized_blob = None
    for stage in stages:
        if isinstance(stage, nn.Linear):
            # Flatten everything but the first dimension before a linear layer.
            features = features.view(features.size(0), -1)

        features = stage(features)
        rate = np.prod(features.size()) / input_numel
        if min_rate is not None and not rate < min_rate:
            continue

        # New smallest output so far: remember its serialized forms.
        min_rate = rate
        raw_blob = pickle.dumps(features)
        quantized_blob = pickle.dumps(tensor_util.quantize_tensor(features))

    output_data_size = sys.getsizeof(raw_blob) / 1024
    quantized_output_data_size = sys.getsizeof(quantized_blob) / 1024
    if print_info:
        template = ('[Autoencoder bottleneck]\tScaled output size: {} [%]\tOutput data size: {} [KB]'
                    '\tQuantized output data size: {} [KB]')
        print(template.format(min_rate * 100.0, output_data_size, quantized_output_data_size))

    # Scaled bottleneck size, bottleneck data size [KB], Quantized bottleneck data size [KB]
    return min_rate, output_data_size, quantized_output_data_size
def __call__(self, z, target):
    """Record size statistics for ``z`` and return the inputs unchanged.

    One entry is appended to each of ``self.data_size_list``,
    ``self.fp16_data_size_list``, ``self.quantized_data_size_list`` and
    ``self.tensor_shape_list`` per call.

    * ``z is None``: all three sizes are ``0.0`` and the shape is ``[0, 0, 0]``.
    * ``z`` is a ``torch.Tensor``: sizes of ``z``, of ``z.half()`` and of
      ``tensor_util.quantize_tensor(z, num_bits=self.num_bits4quant)`` as
      reported by ``file_util.get_binary_object_size``.
    * any other object: only the raw size is measured; the fp16 and
      quantized sizes are recorded as ``None``.

    The recorded shape is ``z.shape[1:4]`` as a list (dimensions 1-3), or
    ``None`` when ``z`` exposes no ``shape`` attribute.

    Args:
        z: Intermediate output to measure; may be ``None``.
        target: Passed through untouched.

    Returns:
        The tuple ``(z, target)``.
    """
    if z is None:
        data_size = 0.0
        fp16_data_size = 0.0
        quantized_data_size = 0.0
        tensor_shape = [0, 0, 0]
    else:
        data_size = file_util.get_binary_object_size(z)
        if isinstance(z, torch.Tensor):
            # half() is the float16 cast; short() would cast to int16 and
            # truncate the values instead of producing an fp16 tensor.
            fp16_data_size = file_util.get_binary_object_size(z.half())
            quantized_data_size = file_util.get_binary_object_size(
                tensor_util.quantize_tensor(z, num_bits=self.num_bits4quant))
        else:
            fp16_data_size = None
            quantized_data_size = None

        # Slicing (rather than indexing 1, 2, 3) avoids raising for inputs
        # with fewer than four dimensions; objects without a shape get None,
        # mirroring the None sizes recorded for non-tensors above.
        shape = getattr(z, 'shape', None)
        tensor_shape = None if shape is None else list(shape[1:4])

    self.data_size_list.append(data_size)
    self.fp16_data_size_list.append(fp16_data_size)
    self.quantized_data_size_list.append(quantized_data_size)
    self.tensor_shape_list.append(tensor_shape)
    return z, target
def test_split_model(model, head_network, tail_network, sensor_device, edge_device, spbit, config):
    """Compare the original model with its head/tail split on the test set.

    For every test batch the input is run through ``head_network`` on
    ``sensor_device``; the intermediate output is optionally compressed
    according to ``spbit``, measured with
    ``file_util.get_binary_object_size``, restored, and fed to
    ``tail_network`` on ``edge_device``. The same batch is also run through
    the unsplit ``model``. Loss/accuracy for both paths, the payload size at
    the split point and the head/tail processing times are printed.

    Args:
        model: Unsplit reference model.
        head_network: First part of the split model (runs on ``sensor_device``).
        tail_network: Second part of the split model (runs on ``edge_device``).
        sensor_device: Device used for the head network and the inputs.
        edge_device: Device used for the tail network and the targets.
        spbit: ``'8bits'`` quantizes/dequantizes the intermediate output,
            ``'16bits'`` casts it to half precision and back; any other value
            sends it unchanged.
        config: Configuration dict; the keys ``dataset``, ``train``
            (``batch_size``, ``rough_size``), ``test`` (``batch_size``) and
            ``input_shape`` are read.

    Raises:
        ZeroDivisionError: If the test loader yields no samples.
    """
    dataset_config = config['dataset']
    _, _, test_loader = dataset_util.get_data_loaders(
        dataset_config, batch_size=config['train']['batch_size'],
        rough_size=config['train']['rough_size'],
        reshape_size=tuple(config['input_shape'][1:3]),
        test_batch_size=config['test']['batch_size'], jpeg_quality=-1)
    print('Testing..')
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    if device.type == 'cuda':
        cudnn.benchmark = True

    head_network = module_util.use_multiple_gpus_if_available(head_network, sensor_device)
    tail_network = module_util.use_multiple_gpus_if_available(tail_network, edge_device)
    model.to(device)
    head_network.to(sensor_device)
    tail_network.to(edge_device)
    head_network.eval()
    tail_network.eval()
    model.eval()

    split_correct_count = 0
    split_test_loss = 0
    org_correct_count = 0
    org_test_loss = 0
    total = 0
    file_size_list = list()
    head_proc_time_list = list()
    tail_proc_time_list = list()
    # NOTE(review): time.time() around CUDA calls may not reflect kernel
    # completion unless something synchronizes the device - confirm.
    with torch.no_grad():
        for inputs, targets in test_loader:
            total += targets.size(0)
            inputs, targets = inputs.to(sensor_device), targets.to(edge_device)

            # --- sensor side: head network plus optional compression ---
            head_start_time = time.time()
            zs = head_network(inputs)
            if spbit == '8bits':
                # Quantization and dequantization
                qzs = tensor_util.quantize_tensor(zs)
                head_end_time = time.time()
                file_size_list.append(file_util.get_binary_object_size(qzs))
                tail_start_time = time.time()
                zs = tensor_util.dequantize_tensor(qzs)
            elif spbit == '16bits':
                # Casting and recasting
                zs = zs.half()
                head_end_time = time.time()
                file_size_list.append(file_util.get_binary_object_size(zs))
                tail_start_time = time.time()
                zs = zs.float()
            else:
                head_end_time = time.time()
                file_size_list.append(file_util.get_binary_object_size(zs))
                tail_start_time = time.time()

            # --- edge side: restore (above) and run the tail network ---
            # The clock starts at tail_start_time, i.e. after the payload
            # size measurement, so that instrumentation overhead is not
            # counted as edge processing time.
            preds = tail_network(zs.to(edge_device))
            tail_end_time = time.time()
            sub_correct_count, sub_test_loss = predict(preds, targets)
            split_correct_count += sub_correct_count
            split_test_loss += sub_test_loss

            # --- reference: the unsplit model on the same batch ---
            inputs, targets = inputs.to(device), targets.to(device)
            preds = model(inputs)
            sub_correct_count, sub_test_loss = predict(preds, targets)
            org_correct_count += sub_correct_count
            org_test_loss += sub_test_loss

            head_proc_time_list.append(head_end_time - head_start_time)
            tail_proc_time_list.append(tail_end_time - tail_start_time)

    org_acc = 100.0 * org_correct_count / total
    print('[Before splitting]\tAverage Loss: {:.4f}, Accuracy: {}/{} [{:.4f}%]\n'.format(
        org_test_loss / total, org_correct_count, total, org_acc))
    split_acc = 100.0 * split_correct_count / total
    print('[After splitting]\tAverage Loss: {:.4f}, Accuracy: {}/{} [{:.4f}%]\n'.format(
        split_test_loss / total, split_correct_count, total, split_acc))
    print('Output file size at splitting point [KB]: {} +- {}'.format(
        np.average(file_size_list), np.std(file_size_list)))
    print('Local processing time [sec]: {} +- {}'.format(
        np.average(head_proc_time_list), np.std(head_proc_time_list)))
    print('Edge processing time [sec]: {} +- {}'.format(
        np.average(tail_proc_time_list), np.std(tail_proc_time_list)))
def save_image(self, z, output_file_path):
    """Quantize ``z`` and write it to ``output_file_path`` as a JPEG image.

    The quantized tensor's axes are reordered with ``permute(1, 2, 0)``
    (first axis moved last - presumably channels-first to channels-last,
    TODO confirm), copied to the CPU and converted to a NumPy array before
    being handed to ``Image.fromarray``.

    Args:
        z: Tensor to quantize and save.
        output_file_path: Destination passed to ``Image.save``.

    Returns:
        The object returned by ``tensor_util.quantize_tensor(z)``.
    """
    quantized = tensor_util.quantize_tensor(z)
    pixel_array = quantized.tensor.permute(1, 2, 0).cpu().numpy()
    image = Image.fromarray(pixel_array)
    image.save(output_file_path, format='jpeg', quality=self.jpeg_quality)
    return quantized
def __call__(self, z, target):
    """Reduce the precision of ``z`` and return it together with ``target``.

    When ``self.num_bits`` is 16 the tensor is cast with ``z.half()``;
    for any other value it is passed to ``tensor_util.quantize_tensor``
    with ``num_bits=self.num_bits``.

    Args:
        z: Tensor to compress.
        target: Passed through untouched.

    Returns:
        The tuple ``(compressed_z, target)``.
    """
    use_half_precision = self.num_bits == 16
    compressed = z.half() if use_half_precision \
        else tensor_util.quantize_tensor(z, num_bits=self.num_bits)
    return compressed, target