def test_max_with_index(seed, ctx, func_name, inshape, axis, keepdims):
    """Check the index outputs of ``F.max`` against a NumPy argmax reference.

    Two modes are exercised: ``with_index=True`` (values and indices) and
    ``only_index=True`` (indices only). ``func_name`` is accepted but not
    used in the body.
    """
    rng = np.random.RandomState(seed)
    x = nn.Variable.from_numpy_array(rng.randn(*inshape).astype(np.float32))

    def reference_index(index_shape):
        # Keep the sizes of the non-reduced dimensions and fold the rest
        # into one trailing axis, so a single argmax covers all reduced axes.
        # NOTE(review): the membership test needs `axis` to be a container,
        # and the reshape presumably assumes the reduced axes are the
        # trailing ones - confirm against the test parametrization.
        kept_sizes = [size for dim, size in enumerate(x.d.shape)
                      if dim not in axis]
        folded = x.d.reshape(*(kept_sizes + [-1]))
        return folded.argmax(-1).reshape(index_shape)

    # Values together with indices.
    with nn.context_scope(ctx), nn.auto_forward(True):
        val, idx = F.max(x, axis, keepdims, with_index=True)
    assert_allclose(val.d, np.amax(x.d, axis, keepdims=keepdims))
    assert np.all(idx.d == reference_index(idx.d.shape))

    # Indices only.
    with nn.context_scope(ctx), nn.auto_forward(True):
        idx = F.max(x, axis, keepdims, only_index=True)
    assert np.all(idx.d == reference_index(idx.d.shape))
def chamfer_hausdorff_oneside_dists(X0, X1):
    """Compute one-sided Chamfer and Hausdorff distances from X0 to X1.

    For every row of ``X0`` the distance to its nearest row in ``X1`` is
    found block by block (blocks of ``sub_batch_size`` rows, a name taken
    from the enclosing scope). The mean of those nearest distances is the
    one-sided Chamfer distance; their maximum is the one-sided Hausdorff
    distance.

    Args:
        X0: array indexed as ``X0[i:j]`` with ``X0.shape[0]`` rows.
        X1: array indexed as ``X1[i:j]`` with ``X1.shape[0]`` rows.

    Returns:
        Tuple ``(ocd, ohd)``: ``sum_.data / n`` and ``max_.data``.
    """
    b0 = X0.shape[0]
    b1 = X1.shape[0]

    sum_ = 0
    max_ = nn.NdArray.from_numpy_array(np.array(-np.inf))
    n = 0
    for i in tqdm.tqdm(range(0, b0, sub_batch_size), desc="cdist-outer-loop"):
        x0 = nn.NdArray.from_numpy_array(X0[i:i + sub_batch_size])
        norm_x0 = F.sum(x0**2.0, axis=1, keepdims=True)
        min_ = nn.NdArray.from_numpy_array(np.ones(x0.shape[0]) * np.inf)
        for j in tqdm.tqdm(range(0, b1, sub_batch_size), desc="cdist-inner-loop"):
            x1 = nn.NdArray.from_numpy_array(X1[j:j + sub_batch_size])
            # block pairwise distances via |a|^2 + |b|^2 - 2 a.b
            norm_x1 = F.transpose(F.sum(x1**2.0, axis=1, keepdims=True), (1, 0))
            x1_T = F.transpose(x1, (1, 0))
            x01 = F.affine(x0, x1_T)
            # Rounding can push the squared distance of (near-)identical
            # points slightly below zero; clamp so the square root does not
            # produce NaN.
            sq_dist = F.maximum_scalar(norm_x0 + norm_x1 - 2.0 * x01, val=0.0)
            bpwd = sq_dist**0.5
            # block min
            min_ = F.minimum2(min_, F.min(bpwd, axis=1))
        # sum/max over cols
        sum_ += F.sum(min_)
        # Count rows from x0 itself: it has the same row count as the
        # distance block, and stays defined when the inner loop never runs.
        n += x0.shape[0]
        max_ = F.maximum2(max_, F.max(min_))
    ocd = sum_.data / n
    ohd = max_.data
    return ocd, ohd
def build_train_graph(self, batch):
    """Build the Q-learning training graph and store its variables on self.

    ``batch`` is unpacked as ``(obs, action, reward, terminal, newobs)``;
    only the ``.shape`` of each entry is used, to size the input variables.
    The resulting variables are stored in ``self.v`` (a ``Variables``
    namedtuple), the solver in ``self.solver``, and ``self.built`` is set.
    """
    self.solver = S.Adam(self.learning_rate)

    obs, action, reward, terminal, newobs = batch
    # One input placeholder per batch entry, shaped like that entry.
    s, a, r, t, snext = [
        nn.Variable(entry.shape)
        for entry in (obs, action, reward, terminal, newobs)
    ]

    # Online network; its parameters are the ones the solver updates.
    with nn.parameter_scope(self.name_q):
        q = self.q_builder(s, self.num_actions, test=False)
        self.solver.set_parameters(nn.get_parameters())

    # Network evaluated on the next observation; excluded from gradients.
    with nn.parameter_scope(self.name_qnext):
        qnext = self.q_builder(snext, self.num_actions, test=True)
    qnext.need_grad = False

    # Clip the reward into [-clip_reward, clip_reward].
    floor_clipped = F.maximum_scalar(r, -self.clip_reward)
    clipped_r = F.minimum_scalar(floor_clipped, self.clip_reward)

    # Pick Q(s, a) with a one-hot mask over the action dimension.
    action_column = F.reshape(a, (-1, 1), inplace=False)
    action_mask = F.one_hot(action_column, (q.shape[1],))
    q_a = F.sum(q * action_mask, axis=1)

    target = clipped_r + self.gamma * (1 - t) * F.max(qnext, axis=1)
    loss = F.mean(F.huber_loss(q_a, target))

    Variables = namedtuple(
        'Variables', ['s', 'a', 'r', 't', 'snext', 'q', 'loss'])
    self.v = Variables(s, a, r, t, snext, q, loss)
    self.sync_models()
    self.built = True
def random_generate(self, num_images, path):
    """Generate images from two priors and save them under ``path``.

    First, codebook indices are drawn with ``F.randint`` and decoded by
    ``self.base_model``. Second, indices are sampled position by position
    from ``self.prior`` conditioned on random one-hot labels, then decoded
    the same way. The two results are written to ``generate_uniform.png``
    and ``generate_pixelcnn.png`` via ``self.save_image``.

    NOTE(review): the body reads ``.d`` right after building functions, so
    it presumably runs with auto-forward enabled - confirm at the call site.
    NOTE(review): statement nesting was reconstructed from a
    whitespace-collapsed source; in particular ``print(indices.d)`` is
    placed after the sampling loops - confirm against the original layout.
    """
    # Generate from the uniform prior of the base model
    indices = F.randint(low=0, high=self.num_embedding, shape=[num_images] + self.latent_shape)
    indices = F.reshape(indices, (-1, ), inplace=True)
    quantized = F.embed(indices, self.base_model.vq.embedding_weight)
    # Restore the latent grid layout and move the embedding axis to dim 1.
    quantized = F.transpose(
        quantized.reshape([num_images] + self.latent_shape + [quantized.shape[-1]]), (0, 3, 1, 2))
    img_gen_uniform_prior = self.base_model(quantized, quantized_as_input=True, test=True)

    # Generate images using pixelcnn prior
    # Start from an all-zero index grid that is filled in below.
    indices = nn.Variable.from_numpy_array(
        np.zeros(shape=[num_images] + self.latent_shape))
    labels = F.randint(low=0, high=self.num_classes, shape=(num_images, 1))
    labels = F.one_hot(labels, shape=(self.num_classes, ))

    # Sample from pixelcnn - pixel by pixel
    import torch  # Numpy behavior is different and not giving correct output
    for i in range(self.latent_shape[0]):
        for j in range(self.latent_shape[1]):
            # Re-embed the current (partially filled) index grid.
            quantized = F.embed(indices.reshape((-1, )), self.base_model.vq.embedding_weight)
            quantized = F.transpose(
                quantized.reshape([num_images] + self.latent_shape + [quantized.shape[-1]]), (0, 3, 1, 2))
            indices_sample = self.prior(quantized, labels)
            # Keep only the prior output at position (i, j).
            indices_prob = F.reshape(indices_sample, indices.shape + (indices_sample.shape[-1], ), inplace=True)[:, i, j]
            indices_prob = F.softmax(indices_prob)
            # Draw one index per image from the categorical distribution.
            indices_prob_tensor = torch.from_numpy(indices_prob.d)
            sample = indices_prob_tensor.multinomial(1).squeeze().numpy()
            indices[:, i, j] = sample
    print(indices.d)
    quantized = F.embed(indices.reshape((-1, )), self.base_model.vq.embedding_weight)
    quantized = F.transpose(
        quantized.reshape([num_images] + self.latent_shape + [quantized.shape[-1]]), (0, 3, 1, 2))
    img_gen_pixelcnn_prior = self.base_model(quantized, quantized_as_input=True, test=True)

    self.save_image(img_gen_uniform_prior, os.path.join(path, 'generate_uniform.png'))
    self.save_image(img_gen_pixelcnn_prior, os.path.join(path, 'generate_pixelcnn.png'))

    # Report the class index of each one-hot label row.
    print('Random labels generated for pixelcnn prior:', list(F.max(labels, axis=1, only_index=True).d))
def _build(self):
    """Build the multi-head inference/training graphs, loss and solver.

    Creates the input placeholders, evaluates ``self.q_function`` under the
    ``'q_func'`` and ``'target'`` scopes, builds a per-head Huber loss
    weighted by ``self.weights``, and collects the parameter dictionaries
    (``params``, ``head_params``, ``shared_params``, ``target_params``).

    NOTE(review): scope nesting in the final section was reconstructed from
    a whitespace-collapsed source (``head%d`` and ``shared`` are placed
    inside ``q_func``) - confirm against the original layout.
    """
    # infer variable
    self.infer_obs_t = nn.Variable((1, 4, 84, 84))
    # inference output
    self.infer_qs_t = self.q_function(self.infer_obs_t, self.num_actions,
                                      self.num_heads, 'q_func')
    self.infer_all = F.sink(*self.infer_qs_t)

    # train variables
    self.obss_t = nn.Variable((self.batch_size, 4, 84, 84))
    self.acts_t = nn.Variable((self.batch_size, 1))
    self.rews_tp1 = nn.Variable((self.batch_size, 1))
    self.obss_tp1 = nn.Variable((self.batch_size, 4, 84, 84))
    self.ters_tp1 = nn.Variable((self.batch_size, 1))
    self.weights = nn.Variable((self.batch_size, self.num_heads))

    # training output
    qs_t = self.q_function(self.obss_t, self.num_actions, self.num_heads,
                           'q_func')
    # The target network goes through the same builder as the online
    # network; the bare `q_function` name used here before was inconsistent
    # with the two calls above.
    qs_tp1 = self.q_function(self.obss_tp1, self.num_actions,
                             self.num_heads, 'target')
    stacked_qs_t = F.transpose(F.stack(*qs_t), [1, 0, 2])
    stacked_qs_tp1 = F.transpose(F.stack(*qs_tp1), [1, 0, 2])

    # select one dimension
    a_one_hot = F.reshape(F.one_hot(self.acts_t, (self.num_actions, )),
                          (-1, 1, self.num_actions))
    # mask output
    q_t_selected = F.sum(stacked_qs_t * a_one_hot, axis=2)
    q_tp1_best = F.max(stacked_qs_tp1, axis=2)
    q_tp1_best.need_grad = False

    # reward clipping
    clipped_rews_tp1 = clip_by_value(self.rews_tp1, -1.0, 1.0)

    # loss calculation
    y = clipped_rews_tp1 + self.gamma * q_tp1_best * (1.0 - self.ters_tp1)
    td = F.huber_loss(q_t_selected, y)
    self.loss = F.mean(F.sum(td * self.weights, axis=1))

    # optimizer
    self.solver = S.RMSprop(self.lr, 0.95, 1e-2)

    # weights and biases
    with nn.parameter_scope('q_func'):
        self.params = nn.get_parameters()
        self.head_params = []
        for i in range(self.num_heads):
            with nn.parameter_scope('head%d' % i):
                self.head_params.append(nn.get_parameters())
        with nn.parameter_scope('shared'):
            self.shared_params = nn.get_parameters()
    with nn.parameter_scope('target'):
        self.target_params = nn.get_parameters()

    # set q function parameters to solver
    self.solver.set_parameters(self.params)
def forward_impl(self, inputs, outputs):
    """Copy the input to the output and update a moving average of its max.

    ``inputs[0]`` is copied into ``outputs[0]`` unchanged. When
    ``self.training`` is true, ``inputs[1]`` is overwritten in place with
    ``decay * M + (1 - decay) * max(x)``.
    """
    src = inputs[0].data
    running_max = inputs[1].data
    dst = outputs[0].data
    dst.copy_from(src)

    if self.training:
        batch_max = F.max(src, keepdims=True)
        blended = self.decay * running_max + (1 - self.decay) * batch_max
        # Write the blended statistic back into the running buffer.
        F.identity(blended, outputs=[running_max])
def forward_impl(self, inputs, outputs):
    """Copy the input to the output and track the running maximum.

    ``inputs[0]`` is copied into ``outputs[0]`` unchanged. When
    ``self.training`` is true, ``inputs[1]`` is overwritten in place with
    the element-wise maximum of itself and ``max(x)``.
    """
    src = inputs[0].data
    running_max = inputs[1].data
    dst = outputs[0].data
    dst.copy_from(src)

    if self.training:
        batch_max = F.max(src, keepdims=True)
        # Keep whichever is larger: the stored maximum or this batch's.
        F.maximum2(running_max, batch_max, outputs=[running_max])
def encode_text(text):
    """Encode token ids into a projected text feature.

    All sizes (embedding dim, context length, vocabulary size, transformer
    width and depth) are read from parameters already registered in nnabla.
    The function embeds ``text``, adds positional embeddings, runs
    ``transformer``, applies the final layer normalisation, picks one
    position per row and multiplies by ``text_projection``.

    Returns:
        A variable reshaped to ``(-1, embed_dim)``.
    """
    params = nn.get_parameters()
    embed_dim = params['text_projection'].shape[1]
    context_length = params['positional_embedding'].shape[0]
    vocab_size = params['token_embedding/W'].shape[0]
    width = params['ln_final/W'].shape[0]
    heads = width // 64
    # Depth = number of distinct third path components under the
    # transformer/resblocks prefix.
    block_ids = {
        key.split('/')[2]
        for key in params
        if key.startswith('transformer/resblocks')
    }
    layers = len(block_ids)

    def fetch(name, shape):
        return nn.parameter.get_parameter_or_create(name=name, shape=shape)

    token_table = fetch('token_embedding/W', (vocab_size, width))
    hidden = F.embed(text, token_table)  # [batch_size, n_ctx, d_model]

    positions = fetch('positional_embedding', (context_length, width))
    hidden = hidden + positions.reshape((1, context_length, width))

    hidden = F.transpose(hidden, (1, 0, 2))  # NLD -> LND
    hidden = transformer(hidden, width, layers, heads,
                         attn_mask=build_attn_mask(context_length))
    hidden = F.transpose(hidden, (1, 0, 2))  # LND -> NLD

    ln_scale = fetch('ln_final/W', (width, )).reshape((1, 1, width))
    ln_shift = fetch('ln_final/b', (width, )).reshape((1, 1, width))
    hidden = F.layer_normalization(hidden, ln_shift, ln_scale,
                                   batch_axis=(0, 1))

    # For every row, take the feature at the position of the largest token id.
    idx = F.max(text, axis=-1, only_index=True)
    idx.forward()
    row_ids = list(range(hidden.shape[0]))
    pooled = hidden[row_ids, idx.d].reshape((1, hidden.shape[0], -1))

    projection = fetch('text_projection', (width, embed_dim)).reshape(
        (1, width, embed_dim))
    projected = F.batch_matmul(pooled, projection)
    return projected.reshape((-1, embed_dim))
def ray_march(self, camloc, raydir, t0, t1, N, n_chunks, t_argmin=False):
    """Sample ``N`` points per ray between ``t0`` and ``t1`` and query the SDF.

    With ``t_argmin`` true, returns the sample parameter ``t`` at which the
    SDF is smallest on each ray, reshaped to ``(BR, 1)``. Otherwise returns
    ``(x_hit_rm0, x_hit_rm1, mask_hit)``: the point selected by the sign
    change test, that point advanced by one step along the ray, and a
    ``(BR, 1)`` mask that is non-zero where any sign change was found.
    """
    # Points computation
    n_rays, _ = t0.shape
    n_points = n_rays * N
    near = F.reshape(t0, (n_rays, 1, 1))
    far = F.reshape(t1, (n_rays, 1, 1))
    origin = F.reshape(camloc, (n_rays, 1, 3))
    direction = F.reshape(raydir, (n_rays, 1, 3))
    step = (far - near) / (N - 1)
    sample_ids = F.reshape(F.arange(0, N), (1, N, 1))
    ts = near + step * sample_ids
    flat_points = F.reshape(origin + ts * direction, (n_points, 3))

    # SDF computation, evaluated chunk by chunk
    chunk = n_points // n_chunks
    sdf_chunks = [
        self.sdf(flat_points[start:start + chunk, :])
        for start in range(0, n_points, chunk)
    ]
    if n_chunks == 1:
        sdf_flat = sdf_chunks[0]
    else:
        sdf_flat = F.concatenate(*sdf_chunks, axis=0)
    sdf_values = F.reshape(sdf_flat, (n_rays, N, 1))

    # t_argmin computation
    if t_argmin:
        argmin_ids = F.min(sdf_values, axis=1, keepdims=True, only_index=True)
        closest_t = F.gather(ts, argmin_ids, axis=1, batch_dims=1)
        return F.reshape(closest_t, (n_rays, 1))

    # Intersection check: a hit is a sample with sdf >= 0 followed by one
    # with sdf <= 0.
    ray_points = F.reshape(flat_points, (n_rays, N, 3))
    non_negative = F.greater_equal_scalar(sdf_values[:, :-1, :], 0)
    non_positive_next = F.less_equal_scalar(sdf_values[:, 1:, :], 0)
    crossing = non_negative * non_positive_next
    # Decreasing weights make the earliest crossing the maximum.
    weights = F.reshape(F.arange(N, 1, -1), (1, N - 1, 1))
    hit_ids = F.max(crossing * weights, axis=1, only_index=True)
    candidates = ray_points[:, :-1, :]
    x_hit_rm0 = F.reshape(
        F.gather(candidates, hit_ids, axis=1, batch_dims=1), (n_rays, 3))

    any_crossing = F.greater_scalar(F.sum(crossing, axis=1), 0)
    mask_hit = F.reshape(any_crossing, (n_rays, 1))

    step_per_ray = F.reshape(step, (n_rays, 1))
    direction_per_ray = F.reshape(direction, (n_rays, 3))
    x_hit_rm1 = x_hit_rm0 + step_per_ray * direction_per_ray
    return x_hit_rm0, x_hit_rm1, mask_hit
def visualize_discrete_image(self, var, filename):
    """Save a tiled picture of codebook indices to ``filename``.

    ``var`` must have fewer than three dimensions. A 2-D input with more
    than one column is first reduced to the arg-max index along axis 1.
    Indices are divided by ``self.num_embedding`` before plotting.
    """
    assert var.ndim < 3, 'The discrete image should only consist of indices of the codebook vectors'

    needs_argmax = var.ndim == 2 and var.shape[1] > 1
    indices = F.max(var, axis=1, only_index=True) if needs_argmax else var

    grid_shape = [-1, 1] + self.latent_shape
    indices = F.reshape(indices, grid_shape, inplace=True)
    scaled = indices / self.num_embedding

    tiled = nn.monitor.tile_images(scaled.d)
    plt.imshow(tiled, cmap='magma')
    plt.axis('off')
    plt.savefig(filename, bbox_inches='tight')
    plt.close()
    print('Reconstruction saved at {}'.format(filename))
def pointer_net(query_embed, query_embed_mask, decoder_states, hidden_dim):
    """
    Score every query position for every decoder step (masked softmax).

    query_embed: (batch_size, max_query_length, E1)
    decoder_states: (batch_size, max_action_length, E2)

    Returns scores reshaped to
    (batch_size, max_action_length, max_query_length), normalised over the
    last axis after multiplication by ``query_embed_mask``.
    """
    def linear(value):
        # `dense` is asked for no non-linearity.
        return value

    with nn.parameter_scope("pointer_net"):
        batch_size, max_query_length, _ = query_embed.shape
        _, max_action_length, _ = decoder_states.shape
        pair_shape = (batch_size, max_action_length, max_query_length)
        pair_hidden_shape = pair_shape + (hidden_dim, )

        with nn.parameter_scope("layer1_input"):
            query_proj = dense(query_embed, hidden_dim, base_axis=2,
                               activation=linear)
        with nn.parameter_scope("layer1_h"):
            state_proj = dense(decoder_states, hidden_dim, base_axis=2,
                               activation=linear)

        # Broadcast both projections to (batch, action, query, hidden).
        query_proj = F.reshape(
            query_proj, (batch_size, 1, max_query_length, hidden_dim))
        query_proj = F.broadcast(query_proj, pair_hidden_shape)
        state_proj = F.reshape(
            state_proj, (batch_size, max_action_length, 1, hidden_dim))
        state_proj = F.broadcast(state_proj, pair_hidden_shape)
        combined = F.tanh(query_proj + state_proj)

        with nn.parameter_scope("layer2"):
            # scores: (batch_size, max_action_length, max_query_length, 1)
            raw_scores = dense(combined, 1, base_axis=3, activation=linear)
        # scores: (batch_size, max_action_length, max_query_length)
        raw_scores = F.reshape(raw_scores, pair_shape)

        # Subtract the row maximum before exponentiating.
        exp_scores = F.exp(raw_scores - F.max(raw_scores, axis=2, keepdims=True))
        query_mask = F.reshape(query_embed_mask,
                               (batch_size, 1, max_query_length))
        query_mask = F.broadcast(query_mask, pair_shape)
        masked = exp_scores * query_mask
        return masked / F.sum(masked, axis=2, keepdims=True)
def get_preds_fromhm(hm, center=None, scale=None):
    """Obtain (x,y) coordinates given a set of N heatmaps. If the center
    and the scale is provided the function will return the points also in
    the original coordinate frame.

    Arguments:
        hm {nnabla variable} -- the predicted heatmaps, 4-D; the last two
            axes are flattened together to locate each maximum

    Keyword Arguments:
        center {numpy.array} -- the center of the bounding box (default: {None})
        scale {float} -- face scale (default: {None})

    Returns:
        (preds, preds_orig): ``preds`` holds the sub-pixel (x, y) location
        of each heatmap maximum; ``preds_orig`` holds the output of
        ``transform`` for each point when both ``center`` and ``scale`` are
        given, and is otherwise left as created by ``F.constant``.

    NOTE(review): ``.d`` is read right after building functions, so this
    presumably runs with auto-forward enabled - confirm at the call site.
    """
    n_batch, n_points, n_rows, n_cols = hm.shape

    idx = F.max(F.reshape(hm, (n_batch, n_points, n_rows * n_cols)),
                axis=2, only_index=True)
    idx.d += 1  # switch to 1-based flat indices
    # Shape follows the heatmaps instead of a fixed (1, 68, 1).
    idx = F.reshape(idx, (n_batch, n_points, 1))
    preds = F.concatenate(idx, idx, axis=2)

    # Row-major flattening: flat = row * n_cols + col. Both coordinates are
    # recovered from the same 0-based index (1-based index minus one) and
    # then made 1-based again. The row used to be computed from
    # `index + 1`, which put maxima in the last two columns one row too low.
    preds.d[..., 0] = (preds.d[..., 0] - 1) % n_cols + 1
    preds.d[..., 1] = (preds.d[..., 1] - 1) // n_cols + 1

    for i in range(preds.shape[0]):
        for j in range(preds.shape[1]):
            heat = hm.d[i, j]
            pX = int(preds.d[i, j, 0]) - 1
            pY = int(preds.d[i, j, 1]) - 1
            # Only interior maxima have both neighbours available.
            if 0 < pX < n_cols - 1 and 0 < pY < n_rows - 1:
                # Nudge a quarter pixel towards the larger neighbour.
                preds.d[i, j] += (
                    np.sign(heat[pY, pX + 1] - heat[pY, pX - 1]) * .25,
                    np.sign(heat[pY + 1, pX] - heat[pY - 1, pX]) * .25,
                )

    preds.d -= .5

    preds_orig = F.constant(shape=preds.shape)
    if center is not None and scale is not None:
        for i in range(n_batch):
            for j in range(n_points):
                d = transform(list(preds.d[i][j]), center, scale,
                              hm.shape[2], True)
                preds_orig.d[i, j] = d[0], d[1]

    return preds, preds_orig
def _build(self):
    """Build the inference graph, the training graph, the loss and the solver.

    Stores the placeholders, the TD error (``self.td``), the weighted Huber
    loss (``self.loss``), a sink over both (``self.loss_sink``), the
    parameter dictionaries of the ``'q_func'`` and ``'target_q_func'``
    scopes, and an RMSprop solver bound to the ``'q_func'`` parameters.
    """
    frame_shape = (4, 84, 84)
    n_batch = self.batch_size

    # Single-observation graph.
    self.infer_obs_t = nn.Variable((1, ) + frame_shape)
    self.infer_q_t = self.q_function(self.infer_obs_t, self.num_actions,
                                     scope='q_func')

    # Mini-batch placeholders.
    self.obss_t = nn.Variable((n_batch, ) + frame_shape)
    self.acts_t = nn.Variable((n_batch, 1))
    self.rews_tp1 = nn.Variable((n_batch, 1))
    self.obss_tp1 = nn.Variable((n_batch, ) + frame_shape)
    self.ters_tp1 = nn.Variable((n_batch, 1))
    self.weights = nn.Variable((n_batch, 1))

    # Online and target value estimates.
    online_q = self.q_function(self.obss_t, self.num_actions,
                               scope='q_func')
    target_q = self.q_function(self.obss_tp1, self.num_actions,
                               scope='target_q_func')

    # Q-value of the taken action and best next Q-value.
    action_mask = F.one_hot(self.acts_t, (self.num_actions, ))
    q_t_selected = F.sum(online_q * action_mask, axis=1, keepdims=True)
    q_tp1_best = F.max(target_q, axis=1, keepdims=True)

    # Target; the bootstrap term is zeroed where ters_tp1 is 1.
    y = self.rews_tp1 + self.gamma * q_tp1_best * (1.0 - self.ters_tp1)
    self.td = q_t_selected - y
    weighted_error = F.huber_loss(q_t_selected, y) * self.weights
    self.loss = F.sum(weighted_error)
    self.loss_sink = F.sink(self.td, self.loss)

    self.solver = S.RMSprop(self.lr, 0.95, 1e-2)

    # Collect parameters per scope.
    for scope_name, attr_name in (('q_func', 'params'),
                                  ('target_q_func', 'target_params')):
        with nn.parameter_scope(scope_name):
            setattr(self, attr_name, nn.get_parameters())

    # Only the online network is optimised.
    self.solver.set_parameters(self.params)
def network_size_activations():
    """
    Returns total number of activations
    and size in KBytes (NNabla variable using `max` or `sum` operator)

    Every registered parameter whose name contains "Asize" contributes its
    value to the activation count, and ``bitwidth * size / 8 / 1024`` to
    the size list. The bitwidth comes from ``cfg.a_quantize``: fixed from
    ``cfg.a_bitwidth``, read from the quantizer's ``n`` parameter, derived
    from its ``d``/``xmax`` or ``xmin``/``xmax`` parameters, or 32 when no
    quantization is configured.

    Raises:
        ValueError: for an unknown ``cfg.a_quantize`` or an unknown
            ``cfg.target_activation_type``.
    """
    kbytes = []
    num_activations = 0

    # get all parameters
    ps = nn.get_parameters(grad_only=False)
    for p in ps:
        if "Asize" not in p:
            continue

        print(f"{p}\t{ps[p].d}")
        num_activations += ps[p].d

        if cfg.a_quantize is None:
            # float precision
            n = nn.Variable((), need_grad=False)
            n.d = 32.
        else:
            # Path prefix of this activation's quantizer parameters.
            quant_scope = ("/Aquant/" + cfg.a_quantize.replace("_relu", "")
                           + "/")

            if cfg.a_quantize in ['fp_relu', 'pow2_relu']:
                # fixed quantization
                n = nn.Variable((), need_grad=False)
                n.d = cfg.a_bitwidth
            elif cfg.a_quantize in [
                    'parametric_fp_relu', 'parametric_fp_b_xmax_relu',
                    'parametric_fp_d_b_relu', 'parametric_pow2_b_xmax_relu',
                    'parametric_pow2_b_xmin_relu'
            ]:
                # parametric quantization
                s = p.replace("/Asize", quant_scope + "n")
                n = F.round(
                    clip_scalar(ps[s], cfg.a_bitwidth_min,
                                cfg.a_bitwidth_max))
            elif cfg.a_quantize in ['parametric_fp_d_xmax_relu']:
                # these quantization methods do not have n, so we need to
                # compute it!
                # parametric quantization
                d = ps[p.replace("/Asize", quant_scope + "d")]
                xmax = ps[p.replace("/Asize", quant_scope + "xmax")]

                # ensure that stepsize is in specified range and a power
                # of two
                d_q = quantize_pow2(
                    clip_scalar(d, cfg.a_stepsize_min, cfg.a_stepsize_max))

                # ensure that dynamic range is in specified range
                xmax = clip_scalar(xmax, cfg.a_xmax_min, cfg.a_xmax_max)

                # compute real `xmax`
                xmax = F.round(xmax / d_q) * d_q

                n = F.maximum_scalar(F.ceil(log2(xmax / d_q + 1.0)),
                                     cfg.a_bitwidth_min)
            elif cfg.a_quantize in ['parametric_pow2_xmin_xmax_relu']:
                # these quantization methods do not have n, so we need to
                # compute it!
                # parametric quantization
                xmin = ps[p.replace("/Asize", quant_scope + "xmin")]
                xmax = ps[p.replace("/Asize", quant_scope + "xmax")]

                # ensure that dynamic ranges are in specified range and a
                # power-of-two
                xmin = quantize_pow2(
                    clip_scalar(xmin, cfg.a_xmin_min, cfg.a_xmin_max))
                xmax = quantize_pow2(
                    clip_scalar(xmax, cfg.a_xmax_min, cfg.a_xmax_max))

                # use ceil rounding
                n = F.maximum_scalar(
                    F.ceil(log2(log2(xmax / xmin) + 1.) + 1.),
                    cfg.a_bitwidth_min)
            else:
                raise ValueError("Unknown quantization method {}".format(
                    cfg.a_quantize))

        kbytes.append(
            F.reshape(n * ps[p].d / 8. / 1024., (1, ), inplace=False))

    if cfg.target_activation_type == 'max':
        _kbytes = F.max(F.concatenate(*kbytes))
    elif cfg.target_activation_type == 'sum':
        _kbytes = F.sum(F.concatenate(*kbytes))
    else:
        # This case used to fall through to an UnboundLocalError on return.
        raise ValueError("Unknown target activation type {}".format(
            cfg.target_activation_type))
    return num_activations, _kbytes
def nonlocal_net(B_lab_map, relu_layers, temperature=0.001 * 5, detach_flag=False, WTA_scale_weight=1, feature_noise=0):
    """Warp the reference colour map onto the target via feature correlation.

    Features for "A" are built from ``relu_layers[0:4]`` and for "B" from
    ``relu_layers[4:8]``. Both are projected by 1x1 convolutions, centred
    and L2-normalised, and correlated with a batched matrix product. The
    correlation, divided by ``temperature`` and soft-maxed, is used to mix
    the 4x average-pooled ``B_lab_map``.

    Returns:
        (y, similarity_map): the mixed colour map and the per-position
        maximum correlation, both upsampled 4x with nearest interpolation.

    Note: ``detach_flag``, ``WTA_scale_weight``, ``feature_noise`` and the
    local ``in_channels`` are not used in the body.
    """
    batch_size = B_lab_map.shape[0]
    channel = B_lab_map.shape[1]
    image_height = B_lab_map.shape[2]
    image_width = B_lab_map.shape[3]
    # The feature grid is a quarter of the image size in each direction.
    feature_height = int(image_height / 4)
    feature_width = int(image_width / 4)

    feature_channel = 64
    in_channels = feature_channel * 4
    inter_channels = 256

    # layer2_1
    A_feature2_1 = layer2_1(relu_layers[0])
    B_feature2_1 = layer2_1(relu_layers[4])

    # layer3_1
    A_feature3_1 = layer3_1(relu_layers[1])
    B_feature3_1 = layer3_1(relu_layers[5])

    # layer4_1
    A_feature4_1 = layer4_1(relu_layers[2])
    B_feature4_1 = layer4_1(relu_layers[6])

    # layer5_1
    A_feature5_1 = layer5_1(relu_layers[3])
    B_feature5_1 = layer5_1(relu_layers[7])

    # Pad the deepest features when their spatial size differs from the
    # shallowest ones, so that all four can be concatenated.
    if A_feature5_1.shape[2] != A_feature2_1.shape[2] or A_feature5_1.shape[3] != A_feature2_1.shape[3]:
        A_feature5_1 = pad_replicate(A_feature5_1)
        B_feature5_1 = pad_replicate(B_feature5_1)

    A_features = layer(
        F.concatenate(
            A_feature2_1,
            A_feature3_1,
            A_feature4_1,
            A_feature5_1,
            axis=1),
        feature_channel * 4)
    B_features = layer(
        F.concatenate(
            B_feature2_1,
            B_feature3_1,
            B_feature4_1,
            B_feature5_1,
            axis=1),
        feature_channel * 4)

    # pairwise cosine similarity
    theta = PF.convolution(
        A_features, inter_channels, kernel=(
            1, 1), stride=(
            1, 1), name='theta')
    theta_re = F.reshape(theta, (batch_size, inter_channels, -1))
    theta_re = theta_re - F.mean(theta_re, axis=2,
                                 keepdims=True)  # center the feature
    # The epsilon keeps the division below away from zero.
    theta_norm = F.norm(
        theta_re,
        p=2,
        axis=1,
        keepdims=True) + sys.float_info.epsilon
    theta_re = F.div2(theta_re, theta_norm)

    # 2*(feature_height*feature_width)*256
    theta_permute = F.transpose(theta_re, (0, 2, 1))
    phi = PF.convolution(
        B_features, inter_channels, kernel=(
            1, 1), stride=(
            1, 1), name='phi')
    phi_re = F.reshape(phi, (batch_size, inter_channels, -1))
    # center the feature
    phi_re = phi_re - F.mean(phi_re, axis=2, keepdims=True)
    phi_norm = F.norm(phi_re, p=2, axis=1, keepdims=True) + \
        sys.float_info.epsilon
    phi_re = F.div2(phi_re, phi_norm)

    # 2*(feature_height*feature_width)*(feature_height*feature_width)
    f = F.batch_matmul(theta_permute, phi_re)

    # A leading axis of size 1 is added; the second reshape targets the
    # shape `f` already has at that point.
    f_shape = f.shape
    f = F.reshape(f, (1,) + f_shape)
    f_similarity = F.reshape(f, (1,) + f_shape)
    similarity_map = F.max(f_similarity, axis=3, keepdims=True)
    similarity_map = F.reshape(
        similarity_map, (batch_size, 1, feature_height, feature_width))

    # f can be negative
    # if WTA_scale_weight == 1:
    f_WTA = f

    f_WTA = f_WTA / temperature

    # Drop the leading size-1 axis again.
    f_WTA_sp = f_WTA.shape
    f_WTA = F.reshape(f_WTA, (f_WTA_sp[1], f_WTA_sp[2], f_WTA_sp[3]))
    # 2*1936*1936; softmax along the horizontal line (dim=-1)
    f_div_C = F.softmax(f_WTA, axis=2)

    # downsample the reference color
    B_lab = F.average_pooling(B_lab_map, (4, 4))
    B_lab = F.reshape(B_lab, (batch_size, channel, -1))
    B_lab = F.transpose(B_lab, (0, 2, 1))  # 2*1936*channel

    # multiply the corr map with color
    y = F.batch_matmul(f_div_C, B_lab)  # 2*1936*channel
    y = F.transpose(y, (0, 2, 1))
    y = F.reshape(
        y,
        (batch_size,
         channel,
         feature_height,
         feature_width))  # 2*3*44*44
    y = F.interpolate(y, scale=(4, 4), mode='nearest', align_corners=False)
    similarity_map = F.interpolate(
        similarity_map, scale=(
            4, 4), mode='nearest', align_corners=False)

    return y, similarity_map
def cond_att_lstm(x, parent_index, mask, context, context_mask, state_size,
                  att_hidden_size, initial_state=None, initial_cell=None,
                  hist=None, dropout=0, train=True, w_init=None,
                  inner_w_init=None, b_init=I.ConstantInitializer(0),
                  forget_bias_init=I.ConstantInitializer(1)):
    """
    LSTM over ``x`` whose gates also see an attention summary of
    ``context`` and the hidden state selected by ``parent_index``.

    x: (batch_size, length, input_size)
    parent_index: (batch_size, length)
    mask: (batch_size, length)
    context: (batch_size, context_length, context_size)
    context_mask: (batch_size, context_length)
    hist: (batch_size, l, state_size)

    Returns a 4-tuple: hidden states, cell states and context vectors
    concatenated along axis 1, and the updated ``hist``.

    NOTE(review): block nesting was reconstructed from a whitespace-collapsed
    source; everything from the input projections to the end of the time
    loop is placed inside the "cond_att_lstm" parameter scope - confirm
    against the original layout, since it determines parameter names.
    """
    batch_size, length, input_size = x.shape
    _, context_length, context_size = context.shape

    if w_init is None:
        w_init = I.UniformInitializer(
            I.calc_uniform_lim_glorot(input_size, state_size))
    if inner_w_init is None:
        inner_w_init = orthogonal

    retain_prob = 1.0 - dropout
    z_w = nn.Variable((batch_size, 4, input_size), need_grad=False)
    z_w.d = 1
    z_u = nn.Variable((batch_size, 4, state_size), need_grad=False)
    z_u.d = 1

    if dropout > 0:
        if train:
            # nnabla's `p` is the probability of dropping a unit, and the
            # kept units are scaled by 1 / (1 - p). Passing `dropout` here
            # (the keep probability was passed before) makes the
            # multiplication below cancel that scaling exactly, leaving a
            # 0/1 mask that keeps units with probability `retain_prob`.
            z_w = F.dropout(z_w, p=dropout)
            z_u = F.dropout(z_u, p=dropout)
        z_w *= retain_prob
        z_u *= retain_prob

    z_w = F.reshape(z_w, (batch_size, 4, 1, input_size))
    z_w = F.broadcast(z_w, (batch_size, 4, length, input_size))
    z_w = F.split(z_w, axis=1)
    z_u = F.split(z_u, axis=1)
    xi = z_w[0] * x
    xf = z_w[1] * x
    xc = z_w[2] * x
    xo = z_w[3] * x

    def recurrent_term(gate, z_gate, h_prev, ctx_vec, par_h):
        # Sum of the hidden-state ("U"), context ("C") and parent ("P")
        # projections for one gate; parameter names are "U<gate>" etc.
        term = PF.affine(z_gate * h_prev, state_size,
                         w_init=inner_w_init(state_size, state_size),
                         with_bias=False, name="U" + gate)
        term += PF.affine(ctx_vec, state_size,
                          w_init=inner_w_init(context_size, state_size),
                          with_bias=False, name="C" + gate)
        term += PF.affine(par_h, state_size,
                          w_init=inner_w_init(state_size, state_size),
                          with_bias=False, name="P" + gate)
        return term

    with nn.parameter_scope("cond_att_lstm"):
        # (batch_size, length, state_size)
        with nn.parameter_scope("lstm"):
            xi = PF.affine(xi, state_size, base_axis=2, w_init=w_init,
                           b_init=b_init, name="Wi")
            xf = PF.affine(xf, state_size, base_axis=2, w_init=w_init,
                           b_init=forget_bias_init, name="Wf")
            xc = PF.affine(xc, state_size, base_axis=2, w_init=w_init,
                           b_init=b_init, name="Wc")
            xo = PF.affine(xo, state_size, base_axis=2, w_init=w_init,
                           b_init=b_init, name="Wo")

        with nn.parameter_scope("context"):
            # context_att_trans: (batch_size, context_size, att_hidden_size)
            context_att_trans = PF.affine(context, att_hidden_size,
                                          base_axis=2, w_init=w_init,
                                          b_init=b_init, name="layer1_c")

        # Zero-initialise any state that was not supplied.
        if initial_state is None:
            h = nn.Variable((batch_size, state_size), need_grad=False)
            h.data.zero()
        else:
            h = initial_state

        if initial_cell is None:
            c = nn.Variable((batch_size, state_size), need_grad=False)
            c.data.zero()
        else:
            c = initial_cell

        if hist is None:
            hist = nn.Variable((batch_size, 1, state_size), need_grad=False)
            hist.data.zero()

        # (batch_size, state_size)
        xi = split(xi, axis=1)
        xf = split(xf, axis=1)
        xc = split(xc, axis=1)
        xo = split(xo, axis=1)
        mask = F.reshape(mask, [batch_size, length, 1])  # (batch_size, length, 1)
        mask = F.broadcast(mask, [batch_size, length, state_size])
        # (batch_size, state_size)
        mask = split(mask, axis=1)
        # (batch_size, max_action_length)
        parent_index = parent_index + 1  # index == 0 means that parent is root
        # (batch_size)
        parent_index = split(parent_index, axis=1)

        hs = []
        cs = []
        ctx = []

        for i, f, c2, o, m, p in zip(xi, xf, xc, xo, mask, parent_index):
            h_num = hist.shape[1]
            with nn.parameter_scope("context"):
                h_att_trans = PF.affine(h, att_hidden_size, with_bias=False,
                                        w_init=w_init, name="layer1_h")
                # (batch_size, att_hidden_size)
                h_att_trans = F.reshape(h_att_trans,
                                        (batch_size, 1, att_hidden_size))
                h_att_trans = F.broadcast(
                    h_att_trans,
                    (batch_size, context_length, att_hidden_size))
                att_hidden = F.tanh(context_att_trans + h_att_trans)
                att_raw = PF.affine(att_hidden, 1, base_axis=2,
                                    w_init=w_init, b_init=b_init)
                # (batch_size, context_length, 1)
                att_raw = F.reshape(att_raw, (batch_size, context_length))
                # Masked softmax over the context positions.
                ctx_att = F.exp(att_raw - F.max(att_raw, axis=1, keepdims=True))
                ctx_att = ctx_att * context_mask
                ctx_att = ctx_att / F.sum(ctx_att, axis=1, keepdims=True)
                ctx_att = F.reshape(ctx_att, (batch_size, context_length, 1))
                ctx_att = F.broadcast(
                    ctx_att, (batch_size, context_length, context_size))
                ctx_vec = F.sum(context * ctx_att,
                                axis=1)  # (batch_size, context_size)

            # parent_history: one-hot select the parent's state from hist
            p = F.reshape(p, (batch_size, 1))
            p = F.one_hot(p, (h_num, ))
            p = F.reshape(p, (batch_size, 1, h_num))
            par_h = F.batch_matmul(p, hist)  # [batch_size, 1, state_size]
            par_h = F.reshape(par_h, (batch_size, state_size))

            with nn.parameter_scope("lstm"):
                i_t = F.sigmoid(
                    i + recurrent_term("i", z_u[0], h, ctx_vec, par_h))
                f_t = F.sigmoid(
                    f + recurrent_term("f", z_u[1], h, ctx_vec, par_h))
                c_t = recurrent_term("c", z_u[2], h, ctx_vec, par_h)
                c_t = f_t * c + i_t * F.tanh(c2 + c_t)
                o_t = F.sigmoid(
                    o + recurrent_term("o", z_u[3], h, ctx_vec, par_h))
                h_t = o_t * F.tanh(c_t)

            # Where the mask is 0 the previous state is carried over.
            h_t = (1 - m) * h + m * h_t
            c_t = (1 - m) * c + m * c_t
            h = h_t
            c = c_t
            h_t = F.reshape(h_t, (batch_size, 1, state_size), inplace=False)
            c_t = F.reshape(c_t, (batch_size, 1, state_size), inplace=False)
            ctx_vec = F.reshape(ctx_vec, (batch_size, 1, context_size),
                                inplace=False)
            hs.append(h_t)
            cs.append(c_t)
            ctx.append(ctx_vec)

            hist = F.concatenate(hist, h_t,
                                 axis=1)  # (batch_size, h_num + 1, state_size)

    return concatenate(
        *hs, axis=1), concatenate(
        *cs, axis=1), concatenate(
        *ctx, axis=1), hist
def _build(self):
    """Build the distributional inference/training graphs, loss and solver.

    ``self.q_function`` is evaluated under the ``'q_func'`` and
    ``'target_q_func'`` scopes and returns a triple; the second element is
    used as per-action bin probabilities and the third as the bin values.
    The loss is the cross entropy between the projected target
    distribution ``m`` and the probabilities of the taken action.
    """
    def expand_last(var):
        # Append a trailing axis of size 1.
        return F.reshape(var, var.shape + (1, ))

    def flat(var):
        # Collapse to a single column.
        return F.reshape(var, (-1, 1))

    frame_shape = (4, 84, 84)
    n_batch = self.batch_size
    n_bins = self.num_bins

    # infer variable
    self.infer_obs_t = infer_obs_t = nn.Variable((1, ) + frame_shape)
    # inference output
    self.infer_q_t, self.infer_probs_t, _ = self.q_function(
        infer_obs_t, self.num_actions, self.min_v, self.max_v, n_bins,
        'q_func')
    self.infer_t = F.sink(self.infer_q_t, self.infer_probs_t)

    # train variables
    self.obss_t = nn.Variable((n_batch, ) + frame_shape)
    self.acts_t = nn.Variable((n_batch, 1))
    self.rews_tp1 = nn.Variable((n_batch, 1))
    self.obss_tp1 = nn.Variable((n_batch, ) + frame_shape)
    self.ters_tp1 = nn.Variable((n_batch, 1))

    # training output
    q_t, probs_t, dists = self.q_function(
        self.obss_t, self.num_actions, self.min_v, self.max_v, n_bins,
        'q_func')
    q_tp1, probs_tp1, _ = self.q_function(
        self.obss_tp1, self.num_actions, self.min_v, self.max_v, n_bins,
        'target_q_func')

    # Distribution of the action that was taken.
    taken_mask = expand_last(F.one_hot(self.acts_t, (self.num_actions, )))
    probs_t_selected = F.max(probs_t * taken_mask, axis=1)

    # Distribution of the greedy next action.
    _, greedy_ids = F.max(q_tp1, axis=1, keepdims=True, with_index=True)
    greedy_mask = expand_last(F.one_hot(greedy_ids, (self.num_actions, )))
    probs_tp1_best = F.max(probs_tp1 * greedy_mask, axis=1)

    # Clipped reward plus discounted bin values, zeroed on termination,
    # clipped to the supported value range.
    clipped_rews_tp1 = clip_by_value(self.rews_tp1, -1.0, 1.0)
    disc_q_tp1 = F.reshape(dists, (1, -1)) * (1.0 - self.ters_tp1)
    t_z = clip_by_value(clipped_rews_tp1 + self.gamma * disc_q_tp1,
                        self.min_v, self.max_v)

    # Fractional bin position of every projected value.
    bin_width = (self.max_v - self.min_v) / (n_bins - 1)
    b = (t_z - self.min_v) / bin_width
    lower = F.floor(b)
    lower_mask = F.reshape(F.one_hot(flat(lower), (n_bins, )),
                           (-1, n_bins, n_bins))
    upper = F.ceil(b)
    upper_mask = F.reshape(F.one_hot(flat(upper), (n_bins, )),
                           (-1, n_bins, n_bins))

    # Split each probability mass between the two neighbouring bins.
    mass_lower = expand_last(probs_tp1_best * (1 - (b - lower)))
    mass_upper = expand_last(probs_tp1_best * (b - lower))
    m = F.sum(mass_lower * lower_mask + mass_upper * upper_mask, axis=1)
    m.need_grad = False

    self.loss = -F.mean(F.sum(m * F.log(probs_t_selected + 1e-10), axis=1))

    # optimizer
    self.solver = S.RMSprop(self.lr, 0.95, 1e-2)

    # weights and biases
    for scope_name, attr_name in (('q_func', 'params'),
                                  ('target_q_func', 'target_params')):
        with nn.parameter_scope(scope_name):
            setattr(self, attr_name, nn.get_parameters())

    # set q function parameters to solver
    self.solver.set_parameters(self.params)
def generate_attribute_direction(args, attribute_prediction_model):
    """Estimate a latent direction separating two predicted classes.

    Loads generator weights (downloading them into ``args.weights_path``
    when ``gen_params.h5`` is missing) and the classifier weights, generates
    ``args.num_images`` images in batches, and accumulates the latent codes
    ``w[0]`` of samples predicted as class 0 or class 1 with confidence
    above 0.65. The difference of the two class means is saved as
    ``direction.npy`` in the first path component of
    ``args.classifier_weight_path``.
    """
    gen_weights = os.path.join(args.weights_path, 'gen_params.h5')
    if not os.path.isfile(gen_weights):
        os.makedirs(args.weights_path, exist_ok=True)
        print(
            "Downloading the pretrained tf-converted weights. Please wait...")
        url = "https://nnabla.org/pretrained-models/nnabla-examples/GANs/stylegan2/styleGAN2_G_params.h5"
        from nnabla.utils.data_source_loader import download
        download(url, gen_weights, False)
    nn.load_parameters(gen_weights)
    print('Loaded pretrained weights from tensorflow!')

    nn.load_parameters(args.classifier_weight_path)
    print(f'Loaded {args.classifier_weight_path}')

    # Full batches, plus one smaller batch for any remainder.
    n_full, remainder = divmod(args.num_images, args.batch_size)
    batches = [args.batch_size] * n_full
    if remainder != 0:
        batches.append(remainder)

    def pixel_norm(latent):
        # Divide every row by the root of its mean square (plus 1e-8).
        mean_sq = F.mean(latent**2., axis=1, keepdims=True)
        return F.div2(
            latent,
            F.pow_scalar(F.add_scalar(mean_sq, 1e-8), 0.5, inplace=True))

    w_plus, w_minus = 0.0, 0.0
    w_plus_count, w_minus_count = 0.0, 0.0
    pbar = trange(len(batches))
    for step in pbar:
        batch_size = batches[step]
        noise = F.randn(shape=(batch_size, 512)).data
        # The same noise sample feeds both latent slots.
        z = [pixel_norm(source) for source in (noise, noise)]

        # get latent code
        w = [mapping_network(z_k, outmaps=512, num_layers=8) for z_k in z]

        # truncation trick
        dlatent_avg = nn.parameter.get_parameter_or_create(name="dlatent_avg",
                                                           shape=(1, 512))
        w = [lerp(dlatent_avg, latent, 0.7) for latent in w]

        constant_bc = nn.parameter.get_parameter_or_create(
            name="G_synthesis/4x4/Const/const", shape=(1, 512, 4, 4))
        constant_bc = F.broadcast(constant_bc,
                                  (batch_size, ) + constant_bc.shape[1:])

        gen = synthesis(w, constant_bc, noise_seed=100, mix_after=7)

        classifier_score = F.softmax(attribute_prediction_model(gen, True))
        confidence, class_pred = F.max(classifier_score,
                                       axis=1,
                                       with_index=True,
                                       keepdims=True)

        is_class0 = class_pred.data == 0
        is_class1 = class_pred.data == 1
        is_confident = confidence.data > 0.65

        w_plus += np.sum(w[0].data * is_class0 * is_confident,
                         axis=0, keepdims=True)
        w_minus += np.sum(w[0].data * is_class1 * is_confident,
                          axis=0, keepdims=True)
        w_plus_count += np.sum(is_class0 * is_confident)
        w_minus_count += np.sum(is_class1 * is_confident)

        pbar.set_description(f'{w_plus_count} {w_minus_count}')

    # save attribute direction
    attribute_variation_direction = (w_plus / w_plus_count) - (w_minus / w_minus_count)
    print(w_plus_count, w_minus_count)
    np.save(f'{args.classifier_weight_path.split("/")[0]}/direction.npy',
            attribute_variation_direction)