def sr_loss_with_uncertainty(ctx, pred0, pred1, log_var):
    """Uncertainty-weighted squared error between two softmax outputs.

    Computes ``mean(squared_error(softmax(pred0), softmax(pred1)) *
    exp(-log_var)) + mean(log_var)`` under the given nnabla context.

    Args:
        ctx: Context passed to ``nn.context_scope``.
        pred0: First prediction (softmax is applied here).
        pred1: Second prediction (softmax is applied here).
        log_var: Log-variance term used to scale the error.

    Returns:
        The loss variable.
    """
    # TODO: squared error/absolute error
    with nn.context_scope(ctx):
        prob0 = F.softmax(pred0)
        prob1 = F.softmax(pred1)
        weighted_error = F.squared_error(prob0, prob1) * F.exp(-log_var)
        data_term = F.mean(weighted_error)
        # mean(log_var) is added so the loss does not shrink just by growing log_var.
        loss_sr = data_term + F.mean(log_var)
    return loss_sr
def pred(decoder_hidden_states, ctx_vectors, query_embed, query_embed_mask,
         rule_num, token_num, embedding_size, hidden_size):
    """Build the four prediction heads of the decoder.

    Args:
        decoder_hidden_states: (batch_size, max_action_length, decoder_state_size)
        ctx_vectors: (batch_size, max_action_length, encoder_state_size)
        query_embed: Query embedding forwarded to ``pointer_net``.
        query_embed_mask: Mask forwarded to ``pointer_net``.
        rule_num: Number of rules (size of the rule softmax).
        token_num: Number of tokens (size of the token softmax).
        embedding_size: Output size of the two dense projections.
        hidden_size: Hidden size forwarded to ``pointer_net``.

    Returns:
        Tuple ``(rule_predict, terminal_gen_action_prob, token_predict,
        copy_prob)``.
    """
    batch_size, max_action_length, _ = decoder_hidden_states.shape
    # (batch_size, max_action_length, decoder_state_size + encoder_state_size)
    state_with_ctx = concatenate(decoder_hidden_states, ctx_vectors, axis=2)

    with nn.parameter_scope("decoder_state_rule"):
        # (batch_size, max_action_length, embedding_size)
        rule_features = dense(decoder_hidden_states, embedding_size,
                              base_axis=2)

    with nn.parameter_scope("decoder_state_token"):
        # (batch_size, max_action_length, embedding_size)
        token_features = dense(state_with_ctx, embedding_size, base_axis=2)

    with nn.parameter_scope("rule_embedding"):
        # (batch_size, max_action_length, rule_num)
        rule_logits = embed_inverse(rule_features, rule_num, embedding_size,
                                    base_axis=2)
        rule_bias = nn.parameter.get_parameter_or_create(
            "embed/b", (rule_num, ), need_grad=True)
        # The bias data is zeroed every time this graph is built.
        rule_bias.data.zero()
        rule_bias = F.reshape(rule_bias, (1, 1, rule_num), inplace=False)
        rule_bias = F.broadcast(
            rule_bias, (batch_size, max_action_length, rule_num))
        rule_predict = F.softmax(rule_logits + rule_bias)

    with nn.parameter_scope("gen_action"):
        terminal_gen_action_prob = dense(decoder_hidden_states, 2,
                                         base_axis=2, activation=F.softmax)

    with nn.parameter_scope("token_embedding"):
        # (batch_size, max_action_length, token_num)
        token_logits = embed_inverse(token_features, token_num,
                                     embedding_size, base_axis=2)
        token_bias = nn.parameter.get_parameter_or_create(
            "embed/b", (token_num, ), need_grad=True)
        token_bias.data.zero()
        token_bias = F.reshape(token_bias, (1, 1, token_num), inplace=False)
        token_bias = F.broadcast(
            token_bias, (batch_size, max_action_length, token_num))
        token_predict = F.softmax(token_logits + token_bias)

    with nn.parameter_scope("copy_token"):
        # (batch_size, max_action_length, max_query_length)
        copy_prob = pointer_net(query_embed, query_embed_mask, state_with_ctx,
                                hidden_size)

    return rule_predict, terminal_gen_action_prob, token_predict, copy_prob
def sr_loss_with_uncertainty(ctx, pred0, pred1, log_var0, log_var1):
    """Squared error between two softmax outputs weighted by two variances.

    With ``s0 = exp(log_var0)`` and ``s1 = exp(log_var1)`` the result is
    ``0.5 * mean(sq_err * (1/s0 + 1/s1) + (s0/s1 + s1/s0))``.

    Args:
        ctx: Context passed to ``nn.context_scope``.
        pred0: First prediction (softmax is applied here).
        pred1: Second prediction (softmax is applied here).
        log_var0: Log-variance associated with ``pred0``.
        log_var1: Log-variance associated with ``pred1``.

    Returns:
        The loss variable.
    """
    # TODO: squared error/absolute error
    var0 = F.exp(log_var0)
    var1 = F.exp(log_var1)
    sq_err = F.squared_error(F.softmax(pred0), F.softmax(pred1))
    # Only the final combination is built inside the context scope.
    with nn.context_scope(ctx):
        inverse_var_sum = 1 / var0 + 1 / var1
        weighted_error = sq_err * inverse_var_sum
        var_ratio_sum = var0 / var1 + var1 / var0
        loss_sr = F.mean(weighted_error + var_ratio_sum) * 0.5
    return loss_sr
def sr_loss_with_uncertainty(ctx, pred0, pred1, log_var0, log_var1):
    """Consistency loss between two predictions with per-branch variances.

    Evaluates ``0.5 * mean(e * (1/s0 + 1/s1) + (s0/s1 + s1/s0))`` where
    ``e`` is the squared error of the two softmax outputs and
    ``s_i = exp(log_var_i)``.

    Args:
        ctx: Context passed to ``nn.context_scope``.
        pred0: First prediction (softmax is applied here).
        pred1: Second prediction (softmax is applied here).
        log_var0: Log-variance associated with ``pred0``.
        log_var1: Log-variance associated with ``pred1``.

    Returns:
        The loss variable.
    """
    # TODO: squared error/absolute error
    sigma0 = F.exp(log_var0)
    sigma1 = F.exp(log_var1)
    prob0 = F.softmax(pred0)
    prob1 = F.softmax(pred1)
    error = F.squared_error(prob0, prob1)
    with nn.context_scope(ctx):
        error_term = error * (1 / sigma0 + 1 / sigma1)
        ratio_term = sigma0 / sigma1 + sigma1 / sigma0
        loss_sr = F.mean(error_term + ratio_term) * 0.5
    return loss_sr
def er_loss(ctx, pred):
    """Entropy of ``softmax(pred)`` averaged over every element of ``pred``.

    Computes ``-sum(p * log(p)) / (batch_size * prod(other dims))`` with
    ``p = softmax(pred)``.

    Args:
        ctx: Context passed to ``nn.context_scope``.
        pred: Prediction variable; its first axis is treated as the batch.

    Returns:
        The loss variable.
    """
    with nn.context_scope(ctx):
        bs = pred.shape[0]
        d = np.prod(pred.shape[1:])
        denominator = bs * d
        pred_normalized = F.softmax(pred)
        # Reuse the softmax node instead of building a second identical one.
        pred_log_normalized = F.log(pred_normalized)
        loss_er = -F.sum(pred_normalized * pred_log_normalized) / denominator
    return loss_er
def er_loss(ctx, pred):
    """Entropy regularization term computed from ``softmax(pred)``.

    The summed entropy ``-sum(p * log(p))`` is divided by the total number
    of elements in ``pred`` (batch size times the product of the remaining
    dimensions).

    Args:
        ctx: Context passed to ``nn.context_scope``.
        pred: Prediction variable; its first axis is treated as the batch.

    Returns:
        The loss variable.
    """
    with nn.context_scope(ctx):
        bs = pred.shape[0]
        d = np.prod(pred.shape[1:])
        denominator = bs * d
        pred_normalized = F.softmax(pred)
        # The log is taken of the softmax already built above; the original
        # constructed the same softmax a second time.
        pred_log_normalized = F.log(pred_normalized)
        loss_er = -F.sum(pred_normalized * pred_log_normalized) / denominator
    return loss_er
def psm_net(left, right, maxdisp, training):
    """Build the stereo matching graph and return disparity prediction(s).

    Args:
        left: Reference image variable.
        right: Target image variable.
        maxdisp: Maximum disparity forwarded to the cost volume and the
            disparity regression.
        training: When truthy, batch statistics are used and all three
            intermediate predictions are returned.

    Returns:
        ``(pred1, pred2, pred3)`` when ``training`` is truthy, otherwise
        only ``pred3``.
    """
    print(training)
    batch_stat = bool(training)

    # feature extraction
    ref_features = feature_extraction(left, batch_stat, training)
    target_features = feature_extraction(right, batch_stat, training)

    # matching
    volume = build_cost_volume(ref_features, target_features, maxdisp)
    cost0 = dres0(volume, batch_stat)
    cost0 = dres1(cost0, batch_stat) + cost0

    # Three stacked hourglasses, each with a residual connection to cost0.
    out1, pre1, post1 = hourglass(cost0, None, None, batch_stat)
    out1 = out1 + cost0
    out2, pre2, post2 = hourglass(out1, pre1, post1, batch_stat)
    out2 = out2 + cost0
    out3, pre3, post3 = hourglass(out2, pre1, post2, batch_stat)
    out3 = out3 + cost0

    # Each classifier output accumulates the previous one.
    cost1 = classif1(out1, batch_stat)
    cost2 = classif2(out2, batch_stat) + cost1
    cost3 = classif3(out3, batch_stat) + cost2

    if training:
        with nn.parameter_scope('cost1_upsample'):
            prob1 = upsample(cost1, 4, True)
            prob1 = F.softmax(prob1, axis=2)
            pred1 = disparityregression(prob1, maxdisp)
        with nn.parameter_scope('cost2_upsample'):
            prob2 = upsample(cost2, 4, True)
            prob2 = F.softmax(prob2, axis=2)
            pred2 = disparityregression(prob2, maxdisp)

    with nn.parameter_scope('cost3_upsample'):
        prob3 = upsample(cost3, 4, True)
        prob3 = F.softmax(prob3, axis=2)
        pred3 = disparityregression(prob3, maxdisp)

    if not training:
        return pred3
    return pred1, pred2, pred3
def __init__(self, parents):
    """Assemble a small graph: two inputs, one convolution and a join.

    The join parameter has ``len(parents) + 3`` entries; its softmax
    supplies the evaluation probabilities of the appended modules.

    Args:
        parents: Parent modules of this graph; they are also parents of
            the final join.
    """
    smo.Graph.__init__(self, parents=parents)

    join_param = Parameter(shape=(len(parents) + 3, ))
    join_prob = F.softmax(join_param)

    first_input = smo.Input(name='input_1',
                            value=nn.Variable((10, 20, 32, 32)),
                            eval_prob=join_prob[0] + join_prob[1])
    self.append(first_input)

    # The convolution takes the most recently appended module as parent.
    conv = smo.Conv(name='conv',
                    parents=[self[-1]],
                    in_channels=20,
                    out_channels=20,
                    kernel=(3, 3),
                    pad=(1, 1),
                    eval_prob=join_prob[0])
    self.append(conv)

    second_input = smo.Input(name='input_2',
                             value=nn.Variable((10, 20, 32, 32)),
                             eval_prob=join_prob[2])
    self.append(second_input)

    # Join the external parents with every module appended so far.
    join_parents = parents + [module for module in self]
    self.append(
        smo.Join(name='join',
                 parents=join_parents,
                 mode='linear',
                 join_parameters=join_param))
def __call__(self, features):
    """Fuse multi-scale features and produce segmentation output.

    Every feature map is resized to the spatial size of ``features[0]``,
    concatenated along axis 1, passed through a 1x1 conv + BN + ReLU and a
    1x1 classification conv, then resized to ``self.output_size``.

    Args:
        features: Sequence of feature maps.

    Returns:
        Softmax over axis 1 when ``self.test`` is truthy, otherwise the
        raw resized output.
    """
    upsampled_inputs = []
    for feature in features:
        resized = F.interpolate(feature,
                                output_size=features[0].shape[2:],
                                mode='linear',
                                align_corners=False,
                                half_pixel=True)
        upsampled_inputs.append(resized)

    fused = F.concatenate(*upsampled_inputs, axis=1)

    hidden = self.conv2d(fused, self.hparams['channels'], kernel_size=1,
                         stride=1, bias=False, name='convs/0/conv')
    hidden = self.batch_norm(hidden, name='convs/0/bn')
    hidden = F.relu(hidden)

    logits = self.conv2d(hidden, self.hparams['num_classes'], kernel_size=1,
                         stride=1, bias=True, name='conv_seg')
    logits = F.interpolate(logits,
                           output_size=self.output_size,
                           mode='linear',
                           align_corners=False,
                           half_pixel=True)

    if not self.test:
        return logits
    return F.softmax(logits, axis=1)
def attn_block(x, name, num_heads=4, fix_parameters=False):
    """Multihead attention block with a residual connection.

    Args:
        x: Input variable unpacked as ``(B, C, H, W)``.
        name: Parameter scope name.
        num_heads: Number of heads; folded into the batch axis.
        fix_parameters: Accepted but not used in this function.

    Returns:
        ``proj_out(attention(x)) + x``, same shape as ``x``.
    """
    batch, channels, height, width = x.shape
    num_positions = height * width
    # Heads are folded into the leading axis; -1 absorbs channels per head.
    head_shape = (batch * num_heads, -1, num_positions)

    with nn.parameter_scope(name):
        # Get query, key, value
        normed = normalize(x, name="norm")
        # nin(3 * C) -> split is faster?
        query = nin(normed, channels, name="q")
        key = nin(normed, channels, name="k")
        value = nin(normed, channels, name="v")

        # Attention
        scores = F.batch_matmul(F.reshape(query, head_shape),
                                F.reshape(key, head_shape),
                                transpose_a=True)
        scores = F.mul_scalar(scores, int(channels)**(-0.5), inplace=True)
        assert scores.shape == (batch * num_heads, num_positions,
                                num_positions)
        weights = F.softmax(scores, axis=-1)

        attended = F.reshape(value, head_shape)
        attended = F.batch_matmul(attended, weights)
        attended = F.reshape(attended, (batch, channels, height, width))

        # output projection
        projected = nin(attended, channels, name='proj_out', zeroing_w=True)

    assert projected.shape == x.shape
    return F.add2(projected, x, inplace=True)
def kl_divergence(ctx, pred, label, log_var):
    """Cross-entropy-style loss against temperature-softened labels.

    The temperature is ``sqrt(exp(log_var))``. The result is
    ``-mean(sum(softmax(label / t) * log(softmax(pred)), axis=1))``.

    Args:
        ctx: Context passed to ``nn.context_scope``.
        pred: Prediction logits (softmax over axis 1 is applied here).
        label: Target logits, softened via ``softmax_with_temperature``.
        log_var: Log-variance from which the temperature is derived.

    Returns:
        The loss variable.
    """
    with nn.context_scope(ctx):
        temperature = F.pow_scalar(F.exp(log_var), 0.5)
        soft_target = softmax_with_temperature(ctx, label, temperature)
        log_prob = F.log(F.softmax(pred, axis=1))
        per_sample = F.sum(soft_target * log_prob, axis=1)
        loss = -F.mean(per_sample)
    return loss
def kl_divergence(ctx, pred, label, log_var):
    """Loss between ``pred`` and ``label`` softened by a learned temperature.

    Evaluates ``-mean(sum(q * log(p), axis=1))`` where
    ``p = softmax(pred, axis=1)`` and ``q`` is ``label`` passed through
    ``softmax_with_temperature`` with temperature ``exp(log_var) ** 0.5``.

    Args:
        ctx: Context passed to ``nn.context_scope``.
        pred: Prediction logits.
        label: Target logits.
        log_var: Log-variance from which the temperature is derived.

    Returns:
        The loss variable.
    """
    with nn.context_scope(ctx):
        std = F.pow_scalar(F.exp(log_var), 0.5)
        target_prob = softmax_with_temperature(ctx, label, std)
        pred_log_prob = F.log(F.softmax(pred, axis=1))
        weighted = target_prob * pred_log_prob
        loss = -F.mean(F.sum(weighted, axis=1))
    return loss
def random_generate(self, num_images, path):
    """Generate images from a uniform prior and from the pixelcnn prior.

    Two images are written under ``path``: ``generate_uniform.png`` and
    ``generate_pixelcnn.png``. The sampled index grid and the randomly
    drawn class labels are printed.

    Args:
        num_images: Number of images to generate.
        path: Directory in which the two image files are saved.
    """
    # Generate from the uniform prior of the base model
    indices = F.randint(low=0, high=self.num_embedding,
                        shape=[num_images] + self.latent_shape)
    indices = F.reshape(indices, (-1, ), inplace=True)
    quantized = F.embed(indices, self.base_model.vq.embedding_weight)
    # Restore the latent grid and move the embedding axis to position 1.
    quantized = F.transpose(
        quantized.reshape([num_images] + self.latent_shape +
                          [quantized.shape[-1]]), (0, 3, 1, 2))

    img_gen_uniform_prior = self.base_model(quantized,
                                            quantized_as_input=True,
                                            test=True)

    # Generate images using pixelcnn prior
    # Start from an all-zero index grid that is filled position by position.
    indices = nn.Variable.from_numpy_array(
        np.zeros(shape=[num_images] + self.latent_shape))
    labels = F.randint(low=0, high=self.num_classes, shape=(num_images, 1))
    labels = F.one_hot(labels, shape=(self.num_classes, ))

    # Sample from pixelcnn - pixel by pixel
    import torch  # Numpy behavior is different and not giving correct output
    for i in range(self.latent_shape[0]):
        for j in range(self.latent_shape[1]):
            # Re-embed the current (partially filled) index grid.
            quantized = F.embed(indices.reshape((-1, )),
                                self.base_model.vq.embedding_weight)
            quantized = F.transpose(
                quantized.reshape([num_images] + self.latent_shape +
                                  [quantized.shape[-1]]), (0, 3, 1, 2))
            indices_sample = self.prior(quantized, labels)
            # Keep only the prior output at position (i, j).
            indices_prob = F.reshape(
                indices_sample,
                indices.shape + (indices_sample.shape[-1], ),
                inplace=True)[:, i, j]
            indices_prob = F.softmax(indices_prob)

            # Draw one index per image from the categorical distribution.
            indices_prob_tensor = torch.from_numpy(indices_prob.d)
            sample = indices_prob_tensor.multinomial(1).squeeze().numpy()
            indices[:, i, j] = sample

    print(indices.d)
    quantized = F.embed(indices.reshape((-1, )),
                        self.base_model.vq.embedding_weight)
    quantized = F.transpose(
        quantized.reshape([num_images] + self.latent_shape +
                          [quantized.shape[-1]]), (0, 3, 1, 2))

    img_gen_pixelcnn_prior = self.base_model(quantized,
                                             quantized_as_input=True,
                                             test=True)

    self.save_image(img_gen_uniform_prior,
                    os.path.join(path, 'generate_uniform.png'))
    self.save_image(img_gen_pixelcnn_prior,
                    os.path.join(path, 'generate_pixelcnn.png'))

    # The class index is recovered from the one-hot labels via arg-max.
    print('Random labels generated for pixelcnn prior:',
          list(F.max(labels, axis=1, only_index=True).d))
def network(x, test=False):
    """Two-class CNN classifier; returns softmax probabilities.

    Args:
        x: Input variable (per the layer comments: 1,128,128 per sample).
        test: Accepted but not used in this function.

    Returns:
        Softmax output of the final 2-unit affine layer.
    """
    # Input:x -> 1,128,128
    # ImageAugmentation
    augmented = F.image_augmentation(x, (1,128,128), (0,0), 1, 1, 0, 1, 0,
                                     False, False, 0, False, 1, 0.5, False, 0)
    # Convolution -> 16,124,124
    conv1 = PF.convolution(augmented, 16, (5,5), (0,0), name='Convolution')
    # ReLU
    conv1 = F.relu(conv1, True)
    # MaxPooling -> 16,62,62
    pool1 = F.max_pooling(conv1, (2,2), (2,2))
    # Convolution_2 -> 30,60,60
    conv2 = PF.convolution(pool1, 30, (3,3), (0,0), name='Convolution_2')
    # MaxPooling_2 -> 30,30,30
    pool2 = F.max_pooling(conv2, (2,2), (2,2))
    # Tanh_2
    activated = F.tanh(pool2)
    # Affine -> 150
    hidden = PF.affine(activated, (150,), name='Affine')
    # ReLU_2
    hidden = F.relu(hidden, True)
    # Affine_2 -> 2
    logits = PF.affine(hidden, (2,), name='Affine_2')
    # Softmax
    return F.softmax(logits)
def compute_context(prev_state):
    """Compute the attention context vector for a decoder state.

    Relies on ``hs``, ``attention_units``, ``sentence_length_source`` and
    ``time_distributed`` from the enclosing scope.

    Args:
        prev_state: Previous decoder state; first axis is the batch.

    Returns:
        Context variable reshaped to ``(batch_size, attention_units)``.
    """
    batch_size = prev_state.shape[0]

    # -> (batch_size, attention_units)
    projected = PF.affine(prev_state, attention_units, with_bias=False,
                          name='Waht')
    # -> (batch_size, 1, attention_units)
    projected = F.reshape(projected, (batch_size, 1, attention_units))
    # -> (batch_size, sentence_length_source, attention_units)
    projected = F.broadcast(
        projected, (batch_size, sentence_length_source, attention_units))

    # -> (batch_size, sentence_length_source, attention_units)
    energy = F.tanh(hs + projected)
    # -> (batch_size, sentence_length_source, 1)
    scores = time_distributed(PF.affine)(energy, 1, with_bias=False,
                                         name='attention')
    # Normalize over the source positions.
    # -> (batch_size, sentence_length_source, 1)
    weights = F.softmax(scores, axis=1)

    context = F.batch_matmul(hs, weights, transpose_a=True)
    return F.reshape(context, (batch_size, attention_units))
def attnblock(h, r=8, fix_parameters=False, sn=True, test=False):
    """Attention block with a learnable, zero-initialized residual gain.

    Args:
        h: Input variable unpacked as ``(b, c, s0, s1)``.
        r: Channel reduction ratio for the ``f`` and ``g`` projections.
        fix_parameters: When true, ``gamma`` is created without gradient.
        sn: Forwarded to ``convolution``.
        test: Forwarded to ``convolution``.

    Returns:
        ``gamma * attention(h) + h``.
    """
    shortcut = h

    # 1x1 convolutions
    b, c, s0, s1 = h.shape
    reduced = c // r
    assert reduced > 0
    conv_kwargs = dict(kernel=(1, 1), pad=(0, 0), stride=(1, 1),
                       with_bias=False, sn=sn, test=test)
    f_x = convolution(h, reduced, name="f", **conv_kwargs)
    g_x = convolution(h, reduced, name="g", **conv_kwargs)
    h_x = convolution(h, c, name="h", **conv_kwargs)

    # Attend
    flat_f = f_x.reshape([b, reduced, -1])
    flat_g = g_x.reshape([b, reduced, -1])
    attn = F.batch_matmul(flat_f, flat_g, transpose_a=True)
    attn = F.softmax(attn, 1)
    flat_h = h_x.reshape([b, c, -1])
    attended = F.batch_matmul(flat_h, attn)
    attended = F.reshape(attended, [b, c, s0, s1])

    # Shortcut
    gamma = get_parameter_or_create(
        "gamma", [1, 1, 1, 1], ConstantInitializer(0.), not fix_parameters)
    return gamma * attended + shortcut
def softmax_cross_entropy_backward(inputs, axis=None):
    """Gradient of softmax cross entropy with respect to the logits.

    Computes ``dy * (softmax(x0, axis) - one_hot(t0))``.

    Args:
      inputs (list of nn.Variable): Incoming grads/inputs to/of the forward
        function, read as ``[dy, x0, t0]``.
      axis (int): Class axis of ``x0``; normalized via ``positive_axis``.

    Return:
      list of Variable: Return the gradients wrt inputs of the corresponding
        function; the entry for the labels is ``None``.
    """
    dy = inputs[0]
    x0 = inputs[1]
    t0 = inputs[2]

    D = len(x0.shape)
    axis = positive_axis(axis, D)
    c0 = x0.shape[axis]  # number of classes
    # NOTE(review): this drops every size-1 axis of the labels, not only the
    # class axis — confirm this is intended when another axis has size 1.
    t0_shape = [s for s in t0.shape if s != 1]
    u0 = F.reshape(t0, (-1, 1), inplace=False)
    u1 = F.one_hot(u0, (c0, ))
    # One-hot labels with the class axis last.
    to = F.reshape(u1, t0_shape + [
        c0,
    ])
    # NOTE(review): the result is bound to t0, which is not read afterwards;
    # verify whether `to` was meant to receive the no_grad result.
    t0 = no_grad(to)
    if axis != len(to.shape) - 1:
        # Move the trailing class axis back to position `axis`.
        oaxes = [i for i in range(len(t0_shape))]
        taxes = oaxes[:axis] + [to.ndim - 1] + oaxes[axis:]
        to = F.transpose(to, taxes)
    dx0 = dy * (F.softmax(x0, axis=axis) - to)
    return dx0, None
def Bahdanau_attention(query, values, out_features, scope):
    r"""Return the Bahdanau attention mechanism.

    Args:
        query (nn.Variable): A query of size (B, 1, C).
        values (nn.Variable): Values of size (B, T, C).
        out_features (int): The projected dimensionality.
        scope (str): Parameter scope.

    Returns:
        nn.Variable: The context vector.
        nn.Variable: The attention weight vector.
    """
    with nn.parameter_scope(scope):
        projected_query = PF.affine(query, out_features, base_axis=2,
                                    with_bias=False, name='query')
        projected_values = PF.affine(values, out_features, base_axis=2,
                                     with_bias=False, name='values')
        # scores of shape (B, T, 1)
        energy = F.tanh(projected_query + projected_values)
        scores = PF.affine(energy, 1, base_axis=2, with_bias=False,
                           name='scores')
        # attention_weights of shape (B, 1, T)
        normalized = F.softmax(scores, axis=1)
        attention_weights = normalized.reshape((query.shape[0], 1, -1))
        # context_vector shape after sum == (B, 1, C)
        context_vector = F.batch_matmul(attention_weights, values)

    return context_vector, attention_weights
def yolov2_activate(x, anchors, biases):
    """Apply the YOLOv2 output activations and flatten the predictions.

    The channel axis is split into ``anchors`` groups. Within each group,
    entries 0-1 (xy) and 4 (objectness) get a sigmoid, entries 2-3 (wh) get
    an exponential, and the remaining entries get a softmax. xy/wh are then
    mapped through ``yolov2_image_coordinate``.

    Args:
        x: Raw network output; ``x.shape[1]`` must be divisible by
            ``anchors``.
        anchors: Number of anchors (an int).
        biases: Forwarded to ``yolov2_image_coordinate``.

    Returns:
        Variable reshaped to ``(x.shape[0], -1, x.shape[1] // anchors)``.
    """
    shape = x.shape
    y = F.reshape(x, (
        shape[0],
        anchors,
        -1,
    ) + shape[2:])
    # `stop` is reused for every slice; only its axis-2 bound changes.
    stop = list(y.shape)
    stop[2] = 2
    t_xy = F.slice(y, (0, 0, 0, 0, 0), stop)
    stop[2] = 4
    t_wh = F.slice(y, (0, 0, 2, 0, 0), stop)
    stop[2] = 5
    t_o = F.slice(y, (0, 0, 4, 0, 0), stop)
    stop[2] = y.shape[2]
    t_p = F.slice(y, (0, 0, 5, 0, 0), stop)
    t_xy = F.sigmoid(t_xy)
    t_wh = F.exp(t_wh)
    t_o = F.sigmoid(t_o)
    t_p = F.softmax(t_p, axis=2)
    t_x, t_y, t_wh = yolov2_image_coordinate(t_xy, t_wh, biases)
    y = F.concatenate(t_x, t_y, t_wh, t_o, t_p, axis=2)
    # Floor division keeps the reshape dimension an int; true division
    # produces a float under Python 3, which is not a valid dimension.
    y = F.transpose(y, (0, 1, 3, 4, 2)).reshape(
        (shape[0], -1, shape[1] // anchors))
    return y
def network(x, y, test=False):
    """Two-class CNN classifier; returns softmax probabilities.

    Args:
        x: Input variable (per the layer comments: 3,64,64 per sample).
        y: Accepted but not used in this function.
        test: When true, batch normalization does not use batch statistics.

    Returns:
        Softmax output of the final 2-unit affine layer.
    """
    # Input:x -> 3,64,64
    # AveragePooling -> 3,12,21
    pooled = F.average_pooling(x, (5, 3), (5, 3))
    # LeakyReLU_2
    pooled = F.leaky_relu(pooled, 0.1, True)
    # Convolution_2 -> 20,13,21
    conv = PF.convolution(pooled, 20, (2, 3), (1, 1), name='Convolution_2')
    # BatchNormalization
    batch_stat = not test
    conv = PF.batch_normalization(conv, (1, ), 0.9, 0.0001, batch_stat,
                                  name='BatchNormalization')
    # ReLU
    conv = F.relu(conv, True)
    # DepthwiseConvolution
    depthwise = PF.depthwise_convolution(conv, (5, 5), (2, 2),
                                         name='DepthwiseConvolution')
    # MaxPooling_2 -> 20,6,7
    depthwise = F.max_pooling(depthwise, (2, 3), (2, 3))
    # LeakyReLU
    depthwise = F.leaky_relu(depthwise, 0.1, True)
    # Affine -> 2
    logits = PF.affine(depthwise, (2, ), name='Affine')
    # Softmax
    return F.softmax(logits)
def convolution(x):
    """Three conv layers followed by three affine layers and a softmax.

    The input is reshaped to ``[BATCH_SIZE, IMAGE_DEPTH, IMAGE_HEIGHT,
    IMAGE_WIDTH]`` (module-level constants) before the first convolution.

    Args:
        x: Input variable holding a batch of images.

    Returns:
        Softmax over the 10-unit output layer.
    """
    feature = x.reshape([BATCH_SIZE, IMAGE_DEPTH, IMAGE_HEIGHT, IMAGE_WIDTH])

    with nn.parameter_scope("conv1"):
        feature = PF.convolution(feature, 16, (5, 5), stride=(2, 2),
                                 pad=(1, 1))
        feature = F.relu(feature)

    with nn.parameter_scope("conv2"):
        feature = PF.convolution(feature, 32, (3, 3), stride=(1, 1),
                                 pad=(1, 1))
        feature = F.relu(feature)

    with nn.parameter_scope("conv3"):
        feature = PF.convolution(feature, 64, (3, 3), stride=(1, 1),
                                 pad=(1, 1))
        feature = F.relu(feature)

    # Flatten everything except the batch axis.
    per_sample = int(feature.size / BATCH_SIZE)
    flat = feature.reshape([BATCH_SIZE, per_sample])

    with nn.parameter_scope("fc1"):
        flat = PF.affine(flat, 1024)
        flat = F.relu(flat)

    with nn.parameter_scope("fc2"):
        flat = PF.affine(flat, 256)
        flat = F.relu(flat)

    with nn.parameter_scope("softmax"):
        logits = PF.affine(flat, 10)
        output = F.softmax(logits)

    return output
def mlp(image, test=False):
    """Small conv net returning 10-way softmax probabilities.

    Three 3x3 convolutions with ReLU are followed by a 512-unit affine
    layer (``fc3``) and a 10-unit affine layer (``fc4``).

    Args:
        image: Input image variable; it is rescaled with ``/= 255.0``.
        test: Accepted but not used in this function.

    Returns:
        Softmax over the ``fc4`` output.
    """
    image /= 255.0
    c1 = F.relu(PF.convolution(image, 32, (3, 3), name='conv1'), inplace=True)
    c2 = F.relu(PF.convolution(c1, 128, (3, 3), name='conv2'), inplace=True)
    c3 = F.relu(PF.convolution(c2, 256, (3, 3), name='conv3'), inplace=True)
    c4 = F.relu(PF.affine(c3, 512, name='fc3'), inplace=True)
    # fc4 consumes the fc3 output. It was previously fed from c3, which left
    # fc3 unused. NOTE: this changes the fc4 weight shape, so parameters
    # saved from the old graph will not load into this one.
    c5 = PF.affine(c4, 10, name='fc4')
    return F.softmax(c5)
def detect_keypoint(x, block_expansion, num_kp, num_channels, max_features,
                    num_blocks, temperature, estimate_jacobian=False,
                    scale_factor=1, single_jacobian_map=False, pad=0,
                    test=False, comm=None):
    """Predict keypoints and, optionally, a 2x2 jacobian per keypoint.

    Args:
        x: Input image variable.
        block_expansion: Forwarded to ``hourglass``.
        num_kp: Number of keypoints (output maps of the detector conv).
        num_channels: Forwarded to ``anti_alias_interpolate``.
        max_features: Forwarded to ``hourglass``.
        num_blocks: Forwarded to ``hourglass``.
        temperature: Divides the heatmap logits before the softmax.
        estimate_jacobian: When true, ``out['jacobian']`` is added.
        scale_factor: Input is resized when this differs from 1.
        single_jacobian_map: Use one shared jacobian map instead of one per
            keypoint.
        pad: Padding of the 7x7 convolutions.
        test: Forwarded to ``hourglass``.
        comm: Forwarded to ``hourglass``.

    Returns:
        The dictionary returned by ``gaussian2kp``, plus a ``'jacobian'``
        entry when ``estimate_jacobian`` is true.
    """
    if scale_factor != 1:
        x = anti_alias_interpolate(x, num_channels, scale_factor)

    with nn.parameter_scope("hourglass"):
        feature_map = hourglass(x, block_expansion, num_blocks=num_blocks,
                                max_features=max_features, test=test,
                                comm=comm)

    with nn.parameter_scope("keypoint_detector"):
        inmaps, outmaps = feature_map.shape[1], num_kp
        sqrt_two = np.sqrt(2.)
        k_w = I.calc_normal_std_he_forward(
            inmaps, outmaps, kernel=(7, 7)) / sqrt_two
        k_b = I.calc_normal_std_he_forward(inmaps, outmaps) / sqrt_two
        prediction = PF.convolution(feature_map, outmaps=num_kp,
                                    kernel=(7, 7), pad=(pad, pad),
                                    w_init=I.UniformInitializer((-k_w, k_w)),
                                    b_init=I.UniformInitializer((-k_b, k_b)))

    final_shape = prediction.shape

    # Softmax over the flattened spatial positions of each keypoint map.
    flat_heatmap = F.reshape(prediction,
                             (final_shape[0], final_shape[1], -1))
    flat_heatmap = F.softmax(flat_heatmap / temperature, axis=2)
    heatmap = F.reshape(flat_heatmap, final_shape, inplace=False)

    out = gaussian2kp(heatmap)  # {"value": value}, keypoint positions.

    if not estimate_jacobian:
        return out

    num_jacobian_maps = 1 if single_jacobian_map else num_kp

    with nn.parameter_scope("jacobian_estimator"):
        # Zero weights with bias [1, 0, 0, 1] per map.
        jacobian_map = PF.convolution(
            feature_map,
            outmaps=4*num_jacobian_maps,
            kernel=(7, 7),
            pad=(pad, pad),
            w_init=I.ConstantInitializer(0),
            b_init=np.array([1, 0, 0, 1]*num_jacobian_maps))

    jacobian_map = F.reshape(
        jacobian_map,
        (final_shape[0], num_jacobian_maps, 4, final_shape[2],
         final_shape[3]))
    # Insert a singleton axis so the heatmap broadcasts over the 4 entries.
    heatmap = F.reshape(
        heatmap, heatmap.shape[:2] + (1,) + heatmap.shape[2:], inplace=False)

    # Heatmap-weighted sum of the jacobian maps over the spatial axes.
    jacobian = F.sum(heatmap * jacobian_map, axis=(3, 4))
    jacobian = F.reshape(
        jacobian, (jacobian.shape[0], jacobian.shape[1], 2, 2), inplace=False)
    out['jacobian'] = jacobian  # jacobian near each keypoint.

    # out is a dictionary containing {"value": value, "jacobian": jacobian}
    return out
def backward_impl(self, inputs, outputs, prop_down, accum):
    """Backward of the softmax-cross-entropy backward (double backward).

    Writes into ``inputs[0].grad`` (logits) and ``inputs[2].grad``
    (incoming gradient ``dz``) when the matching ``prop_down`` flag is set;
    the matching ``accum`` flag selects accumulate versus overwrite. No
    gradient is computed for the labels.
    """
    # inputs: [inputs_fwd_graph] + [inputs_bwd_graph] or
    # [inputs_fwd_graph] + [outputs_fwd_graph] + [inputs_bwd_graph]

    # Args
    axis = self.forward_func.info.args["axis"]

    # Inputs
    x0 = inputs[0].data  # logits
    t0 = inputs[1].data  # labels
    dz = inputs[2].data  # grad_input

    # Outputs
    dx0 = outputs[0].data
    dt0 = outputs[1].data

    # Grads of inputs
    g_x0 = inputs[0].grad
    g_t0 = inputs[1].grad
    g_dz = inputs[2].grad

    # Grads of outputs
    g_dx0 = outputs[0].grad
    g_dt0 = outputs[1].grad

    # Computation
    ## w.r.t. x0
    if prop_down[0]:
        # gradient is the backward of softmax with (g_dx0 * dz) as in-coming gradient
        si = nn.Variable(x0.shape).apply(data=x0, need_grad=True)
        # Clear the buffer so si.grad holds only this backward pass.
        si.grad.fill(0.0)
        so = F.softmax(si, axis)
        # Without auto-forward the softmax must be executed explicitly.
        if not nn.get_auto_forward():
            so.forward()
        so.backward(g_dx0 * dz, clear_buffer=False)
        g_x0_ = si.grad
        if accum[0]:
            g_x0 += g_x0_
        else:
            g_x0.copy_from(g_x0_)

    ## w.r.t. t0 is not required

    ## w.r.t. dz
    if prop_down[2]:
        # Unstable alternative (divides by dz), kept for reference:
        ## g_dz_ = g_dx0 * dx0 / dz
        ## g_dz_ = F.sum(g_dz_, axis)

        # Fall back to shape [1] when dz has an empty shape.
        shape = dz.shape if dz.shape != [] else [1]
        si = nn.Variable(x0.shape).apply(data=x0, need_grad=True)
        ti = nn.Variable(t0.shape).apply(data=t0)
        o = nn.Variable(shape)
        # Run the forward function's backward with a unit output gradient.
        o.grad.fill(1.0)
        self.forward_func.backward([si, ti], [o], [False, False])

        # Sum g_dx0_i * (y_hat_i - y_i) over i
        g_dz_ = F.sum(g_dx0 * si.grad, axis)
        if accum[2]:
            g_dz += g_dz_
        else:
            g_dz.copy_from(g_dz_)
def net(n_class, xs, xq, init_type='nnabla', embedding='conv4',
        net_type='prototypical', distance='euclid', test=False):
    '''
    Similarity net function
        This function implements the network with settings as specified.

        Args:
            n_class (int): number of classes. Typical setting is 5 or 20.
            xs (~nnabla.Variable): support images.
            xq (~nnabla.Variable): query images.
            init_type (str, optional): initialization type for weights and bias parameters. See conv_initializer function.
            embedding (str, optional): embedding network. Only 'conv4' is handled here.
            net_type (str, optional): 'matching' or 'prototypical'.
            distance (str, optional): similarity metric to use. See similarity function.
            test (bool, optional): switch flag for training dataset and test dataset

        Returns:
            h (~nnabla.Variable): output variable indicating similarity between support and query.
    '''

    # feature embedding for supports and queries
    # Floor division keeps n_shot an int: it is used below as a reshape
    # dimension, and `/` yields a float under Python 3.
    n_shot = xs.shape[0] // n_class

    if embedding == 'conv4':
        fs = conv4(xs, test, init_type)  # tensor of (n_support, fdim)
        fq = conv4(xq, test, init_type)  # tensor of (n_query, fdim)

    if net_type == 'matching':
        # This example does not include the full-context-embedding of matching networks.
        fs = F.reshape(fs, (1, ) + fs.shape)  # (1, n_way, fdim)
        # (n_way*n_query, 1, fdim)
        fq = F.reshape(fq, (fq.shape[0], 1) + fq.shape[1:])
        h = similarity(fq, fs, distance)
        h = h - F.mean(h, axis=1, keepdims=True)
        if 1 < n_shot:
            # Clamp before the softmax, then average the per-shot
            # probabilities of each class.
            h = F.minimum_scalar(F.maximum_scalar(h, -35), 35)
            h = F.softmax(h)
            h = F.reshape(h, (h.shape[0], n_class, n_shot))
            h = F.mean(h, axis=2)
            # Reverse to logit to use same softmax cross entropy
            h = F.log(h)
    elif net_type == 'prototypical':
        if 1 < n_shot:
            # Average the shots of each class into one prototype.
            fs = F.reshape(fs, (n_class, n_shot) + fs.shape[1:])
            fs = F.mean(fs, axis=1)
        fs = F.reshape(fs, (1, ) + fs.shape)  # (1, n_way, fdim)
        # (n_way*n_query, 1, fdim)
        fq = F.reshape(fq, (fq.shape[0], 1) + fq.shape[1:])
        h = similarity(fq, fs, distance)
        h = h - F.mean(h, axis=1, keepdims=True)

    return h
def call(self, input):
    """Evaluate the candidate operations on ``input``.

    In ``'full'`` mode every operation is applied and the results are
    combined with softmax(``self._alpha``) weights. In any other mode the
    active index is refreshed and only that operation is applied.

    Args:
        input: Input forwarded to the operation(s).

    Returns:
        The weighted sum of all outputs, or the active operation's output.
    """
    if self._mode != 'full':
        # update active index
        self._update_active_index()
        active_op = self._ops[self._active]
        return active_op(input)

    candidates = [op(input) for op in self._ops]
    stacked = F.stack(*candidates, axis=0)
    weights = F.softmax(self._alpha, axis=0)
    weighted = F.mul2(stacked, weights)
    return F.sum(weighted, axis=0)
def network01E(x, y, test=False):
    """Binary-connect CNN returning 10-way softmax probabilities.

    Args:
        x: Input variable (per the layer comments: 1,64,48 per sample).
        y: Accepted but not used; the cross-entropy call is commented out.
        test: When true, batch normalization does not use batch statistics.

    Returns:
        Softmax output of the last batch-normalized layer.
    """
    batch_stat = not test

    # Input:x -> 1,64,48
    # BinaryConnectConvolution -> 64,60,44
    conv1 = PF.binary_connect_convolution(x, 64, (5, 5), (0, 0),
                                          name='BinaryConnectConvolution')
    # MaxPooling -> 64,30,22
    conv1 = F.max_pooling(conv1, (2, 2), (2, 2))
    # BatchNormalization
    conv1 = PF.batch_normalization(conv1, (1, ), 0.5, 0.01, batch_stat,
                                   name='BatchNormalization')
    # BinarySigmoid
    conv1 = F.binary_sigmoid(conv1)

    # BinaryConnectConvolution_2 -> 64,26,18
    conv2 = PF.binary_connect_convolution(conv1, 64, (5, 5), (0, 0),
                                          name='BinaryConnectConvolution_2')
    # MaxPooling_2 -> 64,13,9
    conv2 = F.max_pooling(conv2, (2, 2), (2, 2))
    # BatchNormalization_2
    conv2 = PF.batch_normalization(conv2, (1, ), 0.5, 0.01, batch_stat,
                                   name='BatchNormalization_2')
    # BinarySigmoid_2
    conv2 = F.binary_sigmoid(conv2)

    # BinaryConnectAffine -> 512
    hidden = PF.binary_connect_affine(conv2, (512, ),
                                      name='BinaryConnectAffine')
    # BatchNormalization_3
    hidden = PF.batch_normalization(hidden, (1, ), 0.5, 0.01, batch_stat,
                                    name='BatchNormalization_3')
    # BinarySigmoid_3
    hidden = F.binary_sigmoid(hidden)

    # BinaryConnectAffine_2 -> 10
    logits = PF.binary_connect_affine(hidden, (10, ),
                                      name='BinaryConnectAffine_2')
    # BatchNormalization_4
    logits = PF.batch_normalization(logits, (1, ), 0.5, 0.01, batch_stat,
                                    name='BatchNormalization_4')
    # Softmax
    probs = F.softmax(logits)
    # CategoricalCrossEntropy -> 1
    # h = F.categorical_cross_entropy(h, y)
    return probs
def _scaled_dot_product_attention(q, k, v, attn_mask, dropout):
    """Scaled dot-product attention.

    Args:
        q: Query of shape (B, Nt, E); it is scaled by ``E ** -0.5`` using
            ``*=``.
        k: Key of shape (B, Ns, E).
        v: Value of shape (B, Ns, E).
        attn_mask: Optional additive mask applied to the scores, or None.
        dropout: Dropout probability applied to the attention weights;
            skipped when not greater than 0.

    Returns:
        Tuple ``(attn_output, attn_output_weights)``: the attended values
        of shape (B, Nt, E) and the weights of shape (B, Nt, Ns) that were
        used to produce them.
    """
    E = q.shape[2]
    q *= float(E)**-0.5
    # (B, Nt, E) x (B, E, Ns) -> (B, Nt, Ns)
    attn = F.batch_matmul(q, k, transpose_b=True)
    if attn_mask is not None:
        attn += attn_mask
    attn_output_weights = F.softmax(attn, axis=len(attn.shape) - 1)
    if dropout > 0.0:
        # Dropout belongs on the normalized weights. It was previously
        # applied to the pre-softmax scores after the softmax had already
        # been taken, so it had no effect on the output.
        attn_output_weights = F.dropout(attn_output_weights, p=dropout)
    # (B, Nt, Ns) x (B, Ns, E) -> (B, Nt, E)
    attn_output = F.batch_matmul(attn_output_weights, v)
    return attn_output, attn_output_weights
def cnn_network(obs, num_actions, scope):
    """Shared conv trunk with a softmax policy head and a value head.

    Args:
        obs: Observation variable.
        num_actions: Number of units of the policy head.
        scope: Parameter scope name.

    Returns:
        Tuple ``(policy, value)``.
    """
    with nn.parameter_scope(scope):
        feature = F.relu(
            PF.convolution(obs, 32, (8, 8), stride=(4, 4), name='conv1'))
        feature = F.relu(
            PF.convolution(feature, 64, (4, 4), stride=(2, 2), name='conv2'))
        feature = F.relu(
            PF.convolution(feature, 64, (3, 3), stride=(1, 1), name='conv3'))
        feature = F.relu(PF.affine(feature, 512, name='fc1'))

        # Both heads read the same 512-unit feature.
        policy_logits = PF.affine(feature, num_actions, name='policy')
        policy = F.softmax(policy_logits)
        value = PF.affine(feature, 1, name='value')
    return policy, value
def backward_impl(self, inputs, outputs, prop_down, accum):
    """Backward of the log-softmax backward (double backward).

    Writes into ``inputs[0].grad`` (x0) and ``inputs[2].grad`` (dy) when
    the matching ``prop_down`` flag is set; the matching ``accum`` flag
    selects accumulate versus overwrite.
    """
    # inputs: [inputs_fwd_graph] + [inputs_bwd_graph] or
    # [inputs_fwd_graph] + [outputs_fwd_graph] + [inputs_bwd_graph]

    # Args
    axis = self.forward_func.info.args["axis"]
    # To deal with double_backward index error for cuda in windows
    if axis < 0:
        axis += inputs[0].ndim

    # Inputs
    x0 = inputs[0].data
    y0 = inputs[1].data
    dy = inputs[2].data
    # Outputs
    dx0 = outputs[0].data
    # Grads of inputs
    g_x0 = inputs[0].grad
    g_y0 = inputs[1].grad
    g_dy = inputs[2].grad
    # Grads of outputs
    g_dx0 = outputs[0].grad

    # w.r.t. x0
    if prop_down[0]:
        # gradient is the backward of softmax with (g_dx0 * -sum_i dy_i) as in-coming gradient
        neg_sum_dy = -F.sum(dy, axis, True)
        si = nn.Variable(x0.shape).apply(data=x0, need_grad=True)
        # Clear the buffer so si.grad holds only this backward pass.
        si.grad.fill(0.0)
        so = F.softmax(si, axis)
        # Without auto-forward the softmax must be executed explicitly.
        if not nn.get_auto_forward():
            so.forward()
        so.backward(g_dx0 * neg_sum_dy, clear_buffer=False)
        g_x0_ = si.grad
        if accum[0]:
            g_x0 += g_x0_
        else:
            g_x0.copy_from(g_x0_)

    # w.r.t. y0 is the grad-depends

    # w.r.t. dy
    if prop_down[2]:
        # gradient is the backward of log_softmax with g_dx0 as in-coming gradient
        # lsi's grad buffer is g_dy, so the result is written there directly.
        lsi = nn.Variable(x0.shape).apply(data=x0, grad=g_dy, need_grad=True)
        lso = nn.Variable(x0.shape).apply(data=y0, grad=g_dx0)
        self.forward_func.backward([lsi], [lso], accum=[accum[2]])
def get_conditional_dist(fake_images):
    """Get the prediction score using Inception v3.

    Args:
        fake_images (nn.NdArray): NdArrays representing images.
                                  Shape must be (B, 3, 299, 299).
                                  Must be pre-normalized, i.e. its values must lie in [-1., +1.]

    Returns:
        py_given_x (nn.NdArray): Class probabilities of given images. (B, 1008)

    """
    features = construct_inceptionv3(fake_images)
    # strangely, 1008 is correct, and no bias.
    logits = PF.affine(features, 1008, name="Affine", with_bias=False)
    py_given_x = F.softmax(logits)
    return py_given_x
def log_softmax_backward(inputs, axis=None):
    """Gradient of log-softmax with respect to its input.

    Computes ``dy - softmax(x0) * sum(dy, axis, keepdims=True)``.

    Args:
      inputs (list of nn.Variable): Incoming grads/inputs to/of the forward
        function, read as ``[dy, x0]``.
      axis (int): Axis of the log-softmax; normalized via ``positive_axis``
        before the reduction.

    Return:
      nn.Variable: The gradient with respect to ``x0``.
    """
    dy, x0 = inputs[0], inputs[1]
    # The softmax receives the axis as given; only the sum below needs the
    # normalized value.
    softmax_out = F.softmax(x0, axis=axis)
    axis = positive_axis(axis, len(x0.shape))
    dy_total = F.sum(dy, axis=axis, keepdims=True)
    return dy - softmax_out * dy_total
def attention(k, q, v, div_dim=True, softmax=True):
    """Attention over flattened key/query features.

    ``k`` and ``q`` are flattened to 2-D, their affinity matrix is
    optionally scaled and softmax-normalized, and the result is applied to
    the flattened query.

    Args:
        k: Key variable; flattened to ``(k.shape[0], -1)``.
        q: Query variable; flattened to ``(q.shape[0], -1)``.
        v: Value variable. Only its shape is used: it fixes the output shape
            and the scaling factor, and the name is rebound to the
            flattened ``q``.
        div_dim: When true, divide the affinity by
            ``sqrt(prod(v.shape[1:]))``.
        softmax: When true, apply a softmax to the affinity.

    Returns:
        Variable reshaped to the original shape of ``v``.
    """
    v_shape = v.shape
    # Copy through identity before reshaping (see the in-place note below).
    k = F.identity(k)
    q = F.identity(q)
    k = F.reshape(k, (k.shape[0], np.prod(k.shape[1:])))
    q = F.reshape(q, (q.shape[0], np.prod(q.shape[1:])))
    # NOTE(review): the values are taken from q rather than v — confirm
    # this is intended.
    v = q  # F.reshape is inplace
    cf = F.affine(q, F.transpose(k, (1, 0)))
    if div_dim:
        dim = np.prod(v_shape[1:])
        cf /= np.sqrt(dim)
    h = cf
    if softmax:
        h = F.softmax(h)
    # A stray `x` token used to follow this call and broke parsing.
    h = F.affine(h, v)
    h = F.reshape(h, v_shape)
    return h
def softmax_with_temperature(ctx, x, t):
    """Softmax over axis 1 of ``x / t``.

    Args:
        ctx: Context passed to ``nn.context_scope``.
        x: Input logits.
        t: Temperature dividing the logits.

    Returns:
        The tempered softmax variable.
    """
    with nn.context_scope(ctx):
        scaled = x / t
        probs = F.softmax(scaled, axis=1)
    return probs
def kl_divergence(ctx, pred, label):
    """Cross-entropy-style loss between two softmax distributions.

    Evaluates ``-mean(sum(softmax(label) * log(softmax(pred)), axis=1))``
    with both softmaxes taken over axis 1.

    Args:
        ctx: Context passed to ``nn.context_scope``.
        pred: Prediction logits.
        label: Target logits.

    Returns:
        The loss variable.
    """
    with nn.context_scope(ctx):
        target_prob = F.softmax(label, axis=1)
        pred_log_prob = F.log(F.softmax(pred, axis=1))
        per_sample = F.sum(target_prob * pred_log_prob, axis=1)
        loss = -F.mean(per_sample)
    return loss
def distance(y0, y1):
    """
    Distance function is Kullback-Leibler Divergence for categorical distribution
    """
    # Both inputs are logits; each is normalized before the divergence.
    p = F.softmax(y0)
    q = F.softmax(y1)
    return F.kl_multinomial(p, q)
def sr_loss(ctx, pred0, pred1):
    """Mean squared error between the softmax outputs of two predictions.

    Args:
        ctx: Context passed to ``nn.context_scope``.
        pred0: First prediction logits.
        pred1: Second prediction logits.

    Returns:
        The loss variable.
    """
    with nn.context_scope(ctx):
        prob0 = F.softmax(pred0)
        prob1 = F.softmax(pred1)
        element_error = F.squared_error(prob0, prob1)
        loss_sr = F.mean(element_error)
    return loss_sr
def ce_loss_soft(ctx, pred, target):
    """Cross entropy of ``softmax(pred)`` against soft targets.

    Evaluates ``-mean(sum(softmax(target) * log(softmax(pred)), axis=1))``.

    Args:
        ctx: Context passed to ``nn.context_scope``.
        pred: Prediction logits.
        target: Target logits (softmax is applied here).

    Returns:
        The loss variable.
    """
    with nn.context_scope(ctx):
        # TODO: divide or not
        target_prob = F.softmax(target)
        pred_log_prob = F.log(F.softmax(pred))
        per_sample = F.sum(target_prob * pred_log_prob, axis=1)
        loss = - F.mean(per_sample)
    return loss