import numpy as np

import nnabla as nn
import nnabla.functions as F
import nnabla.solvers as S
from nnabla.functions import clip_by_value


def network_size_weights():
    """
    Return the total number of weights and the network size
    (for weights) in KBytes.
    """
    kbytes = None
    num_params = None

    # get all parameters
    ps = nn.get_parameters()
    for p in ps:
        if (p.endswith("quantized_conv/W") or p.endswith("quantized_conv/b")
                or p.endswith("quantized_affine/W")
                or p.endswith("quantized_affine/b")):
            _num_params = np.prod(ps[p].shape)
            print(f"{p}\t{ps[p].shape}\t{_num_params}")

            if cfg.w_quantize is not None:
                if cfg.w_quantize in ['parametric_fp_b_xmax',
                                      'parametric_fp_d_b',
                                      'parametric_pow2_b_xmax',
                                      'parametric_pow2_b_xmin']:
                    # parametric quantization: `n` is a learned parameter
                    n_p = p + "quant/" + cfg.w_quantize + "/n"
                    n = F.round(clip_scalar(ps[n_p], cfg.w_bitwidth_min,
                                            cfg.w_bitwidth_max))
                elif cfg.w_quantize == 'parametric_fp_d_xmax':
                    # this quantization method does not have `n`,
                    # so we need to compute it
                    d = ps[p + "quant/" + cfg.w_quantize + "/d"]
                    xmax = ps[p + "quant/" + cfg.w_quantize + "/xmax"]

                    # ensure that the stepsize is in the specified range
                    # and a power of two
                    d_q = quantize_pow2(
                        clip_scalar(d, cfg.w_stepsize_min, cfg.w_stepsize_max))

                    # ensure that the dynamic range is in the specified range
                    xmax = clip_scalar(xmax, cfg.w_xmax_min, cfg.w_xmax_max)

                    # compute the real `xmax`
                    xmax = F.round(xmax / d_q) * d_q

                    # we do not clip to `cfg.w_bitwidth_max` as xmax/d_q
                    # could correspond to more than 8 bits
                    n = F.maximum_scalar(
                        F.ceil(log2(xmax / d_q + 1.0) + 1.0),
                        cfg.w_bitwidth_min)
                elif cfg.w_quantize == 'parametric_pow2_xmin_xmax':
                    # this quantization method does not have `n`,
                    # so we need to compute it
                    xmin = ps[p + "quant/" + cfg.w_quantize + "/xmin"]
                    xmax = ps[p + "quant/" + cfg.w_quantize + "/xmax"]

                    # ensure that the minimum dynamic range is in the
                    # specified range and a power of two
                    xmin = quantize_pow2(
                        clip_scalar(xmin, cfg.w_xmin_min, cfg.w_xmin_max))

                    # ensure that the maximum dynamic range is in the
                    # specified range and a power of two
                    xmax = quantize_pow2(
                        clip_scalar(xmax, cfg.w_xmax_min, cfg.w_xmax_max))

                    # use ceil to determine the bitwidth
                    n = F.maximum_scalar(
                        F.ceil(log2(log2(xmax / xmin) + 1.0) + 1.0),
                        cfg.w_bitwidth_min)
                elif cfg.w_quantize == 'fp' or cfg.w_quantize == 'pow2':
                    # fixed quantization
                    n = nn.Variable((), need_grad=False)
                    n.d = cfg.w_bitwidth
                else:
                    raise ValueError(
                        f'Unknown quantization method {cfg.w_quantize}')
            else:
                # float precision
                n = nn.Variable((), need_grad=False)
                n.d = 32.

            if kbytes is None:
                kbytes = n * _num_params / 8. / 1024.
                num_params = _num_params
            else:
                kbytes += n * _num_params / 8. / 1024.
                num_params += _num_params

    return num_params, kbytes
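
# The size functions above and below rely on `cfg` (the experiment
# configuration object) and on the helpers `clip_scalar`, `quantize_pow2`,
# and `log2`, which are defined elsewhere in this codebase. A minimal sketch
# of what these helpers could look like, assuming standard NNabla ops (the
# exact original definitions are not shown here):

def clip_scalar(v, min_value, max_value):
    # element-wise clipping of a variable into [min_value, max_value]
    return F.minimum_scalar(F.maximum_scalar(v, min_value), max_value)


def quantize_pow2(v):
    # round a positive value to the closest power of two:
    # 2 ** round(log2(v)), built from r_pow_scalar (val ** x)
    return F.r_pow_scalar(F.round(F.log(v) / np.log(2.)), 2.)


def log2(x):
    # base-2 logarithm built from the natural logarithm
    return F.log(x) / np.log(2.)
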
def network_size_activations():
    """
    Return the total number of activations and the network size
    (for activations) in KBytes as an NNabla variable
    (reduced with the `max` or `sum` operator).
    """
    kbytes = []
    num_activations = 0

    # get all parameters
    ps = nn.get_parameters(grad_only=False)

    for p in ps:
        if "Asize" in p:
            print(f"{p}\t{ps[p].d}")
            num_activations += ps[p].d

            if cfg.a_quantize is not None:
                if cfg.a_quantize in ['fp_relu', 'pow2_relu']:
                    # fixed quantization
                    n = nn.Variable((), need_grad=False)
                    n.d = cfg.a_bitwidth
                elif cfg.a_quantize in ['parametric_fp_relu',
                                        'parametric_fp_b_xmax_relu',
                                        'parametric_fp_d_b_relu',
                                        'parametric_pow2_b_xmax_relu',
                                        'parametric_pow2_b_xmin_relu']:
                    # parametric quantization: `n` is a learned parameter
                    s = p.replace(
                        "/Asize",
                        "/Aquant/" + cfg.a_quantize.replace("_relu", "") + "/n")
                    n = F.round(clip_scalar(ps[s], cfg.a_bitwidth_min,
                                            cfg.a_bitwidth_max))
                elif cfg.a_quantize in ['parametric_fp_d_xmax_relu']:
                    # this quantization method does not have `n`,
                    # so we need to compute it
                    d = ps[p.replace(
                        "/Asize",
                        "/Aquant/" + cfg.a_quantize.replace("_relu", "") + "/d")]
                    xmax = ps[p.replace(
                        "/Asize",
                        "/Aquant/" + cfg.a_quantize.replace("_relu", "") + "/xmax")]

                    # ensure that the stepsize is in the specified range
                    # and a power of two
                    d_q = quantize_pow2(
                        clip_scalar(d, cfg.a_stepsize_min, cfg.a_stepsize_max))

                    # ensure that the dynamic range is in the specified range
                    xmax = clip_scalar(xmax, cfg.a_xmax_min, cfg.a_xmax_max)

                    # compute the real `xmax`
                    xmax = F.round(xmax / d_q) * d_q

                    n = F.maximum_scalar(F.ceil(log2(xmax / d_q + 1.0)),
                                         cfg.a_bitwidth_min)
                elif cfg.a_quantize in ['parametric_pow2_xmin_xmax_relu']:
                    # this quantization method does not have `n`,
                    # so we need to compute it
                    xmin = ps[p.replace(
                        "/Asize",
                        "/Aquant/" + cfg.a_quantize.replace("_relu", "") + "/xmin")]
                    xmax = ps[p.replace(
                        "/Asize",
                        "/Aquant/" + cfg.a_quantize.replace("_relu", "") + "/xmax")]

                    # ensure that the dynamic ranges are in the specified
                    # range and a power of two
                    xmin = quantize_pow2(
                        clip_scalar(xmin, cfg.a_xmin_min, cfg.a_xmin_max))
                    xmax = quantize_pow2(
                        clip_scalar(xmax, cfg.a_xmax_min, cfg.a_xmax_max))

                    # use ceil rounding to determine the bitwidth
                    n = F.maximum_scalar(
                        F.ceil(log2(log2(xmax / xmin) + 1.0) + 1.0),
                        cfg.a_bitwidth_min)
                else:
                    raise ValueError(
                        f'Unknown quantization method {cfg.a_quantize}')
            else:
                # float precision
                n = nn.Variable((), need_grad=False)
                n.d = 32.

            kbytes.append(
                F.reshape(n * ps[p].d / 8. / 1024., (1,), inplace=False))

    if cfg.target_activation_type == 'max':
        _kbytes = F.max(F.concatenate(*kbytes))
    elif cfg.target_activation_type == 'sum':
        _kbytes = F.sum(F.concatenate(*kbytes))
    else:
        raise ValueError(
            f'Unknown target activation type {cfg.target_activation_type}')

    return num_activations, _kbytes
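
# Minimal usage sketch (hypothetical): once the forward graph has been built
# (so the quantization parameters exist in the parameter scope), the KByte
# outputs are NNabla variables, and the parametric variants are
# differentiable, so they can be folded into the objective as size penalties.
# `task_loss`, `lambda_w`, and `lambda_a` are assumed names, not part of the
# original code.

def build_total_loss(task_loss, lambda_w=1e-3, lambda_a=1e-3):
    _, weight_kbytes = network_size_weights()
    _, act_kbytes = network_size_activations()
    # gradients of the penalties flow into the parametric quantization
    # parameters (n, d, xmin, xmax)
    return task_loss + lambda_w * weight_kbytes + lambda_a * act_kbytes
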
def _build(self):
    # inference input variable
    self.infer_obs_t = infer_obs_t = nn.Variable((1, 4, 84, 84))

    # inference output
    self.infer_q_t, self.infer_probs_t, _ = self.q_function(
        infer_obs_t, self.num_actions, self.min_v, self.max_v,
        self.num_bins, 'q_func')
    self.infer_t = F.sink(self.infer_q_t, self.infer_probs_t)

    # training input variables
    self.obss_t = nn.Variable((self.batch_size, 4, 84, 84))
    self.acts_t = nn.Variable((self.batch_size, 1))
    self.rews_tp1 = nn.Variable((self.batch_size, 1))
    self.obss_tp1 = nn.Variable((self.batch_size, 4, 84, 84))
    self.ters_tp1 = nn.Variable((self.batch_size, 1))

    # training output
    q_t, probs_t, dists = self.q_function(self.obss_t, self.num_actions,
                                          self.min_v, self.max_v,
                                          self.num_bins, 'q_func')
    q_tp1, probs_tp1, _ = self.q_function(self.obss_tp1, self.num_actions,
                                          self.min_v, self.max_v,
                                          self.num_bins, 'target_q_func')

    expand_last = lambda x: F.reshape(x, x.shape + (1,))
    flat = lambda x: F.reshape(x, (-1, 1))

    # extract the distribution of the selected action
    a_t_one_hot = expand_last(F.one_hot(self.acts_t, (self.num_actions,)))
    probs_t_selected = F.max(probs_t * a_t_one_hot, axis=1)

    # extract the distribution of the greedy action at t+1
    _, indices = F.max(q_tp1, axis=1, keepdims=True, with_index=True)
    a_tp1_one_hot = expand_last(F.one_hot(indices, (self.num_actions,)))
    probs_tp1_best = F.max(probs_tp1 * a_tp1_one_hot, axis=1)

    # clip rewards into [-1.0, 1.0]
    clipped_rews_tp1 = clip_by_value(self.rews_tp1, -1.0, 1.0)

    # compute the projected target support; terminal transitions zero out
    # the discounted part
    disc_q_tp1 = F.reshape(dists, (1, -1)) * (1.0 - self.ters_tp1)
    t_z = clip_by_value(clipped_rews_tp1 + self.gamma * disc_q_tp1,
                        self.min_v, self.max_v)

    # update indices
    b = (t_z - self.min_v) / ((self.max_v - self.min_v) / (self.num_bins - 1))
    l = F.floor(b)
    l_mask = F.reshape(F.one_hot(flat(l), (self.num_bins,)),
                       (-1, self.num_bins, self.num_bins))
    u = F.ceil(b)
    u_mask = F.reshape(F.one_hot(flat(u), (self.num_bins,)),
                       (-1, self.num_bins, self.num_bins))

    # distribute the probability mass to the lower and upper neighbor bins
    m_l = expand_last(probs_tp1_best * (1 - (b - l)))
    m_u = expand_last(probs_tp1_best * (b - l))
    m = F.sum(m_l * l_mask + m_u * u_mask, axis=1)
    m.need_grad = False

    # cross-entropy loss between the projected target distribution and the
    # predicted distribution of the selected action
    self.loss = -F.mean(F.sum(m * F.log(probs_t_selected + 1e-10), axis=1))

    # optimizer
    self.solver = S.RMSprop(self.lr, 0.95, 1e-2)

    # weights and biases
    with nn.parameter_scope('q_func'):
        self.params = nn.get_parameters()
    with nn.parameter_scope('target_q_func'):
        self.target_params = nn.get_parameters()

    # set the q function parameters to the solver
    self.solver.set_parameters(self.params)
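
# `_build` creates separate 'q_func' and 'target_q_func' parameter scopes but
# does not show how the target network gets refreshed. A minimal sketch of a
# hard update, assuming it lives in the same class (the method name
# `update_target` is an assumption; the parameter names match across the two
# scopes, so values can be copied key by key):

def update_target(self):
    for name, param in self.params.items():
        # overwrite the target parameter data with the online network's data
        self.target_params[name].d = param.d.copy()
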