def zero(x, output_filter, scope, input_node_id, is_reduced, test, is_search):
    """
    Zero operation, i.e. all elements become 0.
    """
    if is_reduced and input_node_id < 2:
        h = F.max_pooling(x, kernel=(1, 1), stride=(2, 2))  # downsampling
        h = F.mul_scalar(h, 0)
    else:
        h = F.mul_scalar(x, 0)
    return h
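# Usage sketch (illustrative, not part of the original sources): in the
# reduced cell, `zero` halves the spatial resolution before zeroing, so its
# output shape matches the other candidate operations. Assumes the `zero`
# function above is in scope and nnabla is installed.
import numpy as np
import nnabla as nn

nn.set_auto_forward(True)
x = nn.Variable.from_numpy_array(np.ones((2, 8, 16, 16), dtype=np.float32))
h = zero(x, output_filter=8, scope='', input_node_id=0,
         is_reduced=True, test=False, is_search=False)
print(h.shape)           # (2, 8, 8, 8): downsampled by stride 2
print(float(h.d.sum()))  # 0.0: every element zeroed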
def __call__(self, gen_rgb_out):
    out = conv_layer(gen_rgb_out, inmaps=3, outmaps=self.channels[0],
                     kernel_size=1, name_scope='Discriminator/Convinitial')
    inmaps = self.channels[0]
    for i in range(1, len(self.resolutions)):
        res = out.shape[2]
        outmaps = self.channels[i]
        out = res_block(out, res=res, outmaps=outmaps, inmaps=inmaps)
        inmaps = outmaps

    N, C, H, W = out.shape
    group = min(N, self.stddev_group)
    stddev_mean = F.reshape(
        out, (group, -1, self.stddev_feat, C // self.stddev_feat, H, W),
        inplace=False)
    # mean = F.mean(stddev_mean, axis=0, keepdims=True)
    mean = F.mul_scalar(F.sum(stddev_mean, axis=0, keepdims=True),
                        1.0 / stddev_mean.shape[0], inplace=False)
    stddev_mean = F.mean(F.pow_scalar(F.sub2(stddev_mean, F.broadcast(
        mean, stddev_mean.shape)), 2.), axis=0, keepdims=False)
    stddev_mean = F.pow_scalar(F.add_scalar(
        stddev_mean, 1e-8, inplace=False), 0.5, inplace=False)
    stddev_mean = F.mean(stddev_mean, axis=[2, 3, 4], keepdims=True)
    stddev_mean = F.reshape(
        stddev_mean, stddev_mean.shape[:2] + stddev_mean.shape[3:],
        inplace=False)
    out = F.concatenate(out, F.tile(stddev_mean, (group, 1, H, W)), axis=1)

    out = conv_layer(out, inmaps=out.shape[1], outmaps=self.channels[-1],
                     kernel_size=3, name_scope='Discriminator/Convfinal')
    out = F.reshape(out, (N, -1), inplace=False)

    # Linear Layers
    lrmul = 1
    scale = 1 / (out.shape[1] ** 0.5) * lrmul
    W, bias = weight_init_fn(
        (out.shape[-1], self.channels[-1]),
        weight_var='Discriminator/final_linear_1/affine')
    out = F.affine(out, W * scale, bias * lrmul)
    out = F.mul_scalar(F.leaky_relu(out, alpha=0.2, inplace=False),
                       np.sqrt(2), inplace=False)

    scale = 1 / (out.shape[1] ** 0.5) * lrmul
    W, bias = weight_init_fn(
        (out.shape[-1], 1),
        weight_var='Discriminator/final_linear_2/affine')
    out = F.affine(out, W * scale, bias * lrmul)
    return out
def get_sample_and_feedback(args, data_dict):
    """
    Let the controller predict one architecture and test its performance
    to get feedback. Here the feedback is validation accuracy and will be
    reused to train the controller.
    """
    skip_weight = args.skip_weight
    entropy_weight = args.entropy_weight
    bl_dec = args.baseline_decay

    arc_seq, log_probs, entropys, skip_penaltys = sample_from_controller(args)

    sample_arch = list()
    for arc in arc_seq:
        sample_arch.extend(arc.tolist())
    show_arch(sample_arch)

    sample_entropy = entropys
    sample_log_prob = log_probs

    nn.set_auto_forward(False)
    val_acc = CNN_run(args, sample_arch, data_dict)  # execute evaluation only
    nn.set_auto_forward(True)

    print("Accuracy on Validation: {:.2f} %\n".format(100 * val_acc))

    reward = val_acc  # use validation accuracy as reward
    if entropy_weight is not None:
        reward = F.add_scalar(F.mul_scalar(sample_entropy, entropy_weight),
                              reward).d

    sample_log_prob = F.mul_scalar(sample_log_prob, (1 / args.num_candidate))

    if args.use_variance_reduction:
        baseline = 0.0
        # variance reduction
        baseline = baseline - ((1 - bl_dec) * (baseline - reward))
        reward = reward - baseline

    loss = F.mul_scalar(sample_log_prob, (-1) * reward)
    if skip_weight is not None:
        adding_penalty = F.mul_scalar(skip_penaltys, skip_weight)
        loss = F.add2(loss, adding_penalty)

    return loss, val_acc, sample_arch
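# Hedged sketch of the REINFORCE-style objective built above: the controller
# loss is -log_prob * reward, so minimizing it raises the log-probability of
# architectures that achieved a high validation accuracy. Toy numbers only;
# nothing here comes from the original training setup.
import numpy as np
import nnabla as nn
import nnabla.functions as F

nn.set_auto_forward(True)
sample_log_prob = nn.Variable.from_numpy_array(
    np.array([[-1.2]], dtype=np.float32))
reward = 0.73  # e.g. a validation accuracy
loss = F.mul_scalar(sample_log_prob, (-1) * reward)
print(loss.d)  # [[0.876]]: descending on this loss pushes log_prob upward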
def test_graph_connection_with_setitem(indices):
    import nnabla.functions as F
    x = np.arange(8 * 7).reshape((8, 7))
    x = nn.Variable.from_numpy_array(x, need_grad=True)
    u = np.arange(-1, -7, -1).reshape(3, 2)
    u = nn.Variable.from_numpy_array(u, need_grad=True)
    y = F.mul_scalar(x, 1)
    y[indices] = u
    z = F.add_scalar(y, 0)
    z.forward()
    # '+' signs only to persist visual alignment through autopep8
    assert_allclose(z.d, np.array([[+0, +1, +2, +3, +4, +5, +6],
                                   [+7, +8, +9, 10, 11, 12, 13],
                                   [14, 15, 16, -1, -2, 19, 20],
                                   [21, 22, 23, -3, -4, 26, 27],
                                   [28, 29, 30, -5, -6, 33, 34],
                                   [35, 36, 37, 38, 39, 40, 41],
                                   [42, 43, 44, 45, 46, 47, 48],
                                   [49, 50, 51, 52, 53, 54, 55]]))
    x.grad.zero()
    u.grad.zero()
    z.backward(np.arange(1, 1 + 8 * 7).reshape(8, 7))
    assert_allclose(x.g, np.array([[+1, +2, +3, +4, +5, +6, +7],
                                   [+8, +9, 10, 11, 12, 13, 14],
                                   [15, 16, 17, +0, +0, 20, 21],
                                   [22, 23, 24, +0, +0, 27, 28],
                                   [29, 30, 31, +0, +0, 34, 35],
                                   [36, 37, 38, 39, 40, 41, 42],
                                   [43, 44, 45, 46, 47, 48, 49],
                                   [50, 51, 52, 53, 54, 55, 56]]))
    assert_allclose(u.g, np.array([[18, 19], [25, 26], [32, 33]]))
def attn_block(x, name, num_heads=4, fix_parameters=False):
    """Multihead attention block"""
    B, C, H, W = x.shape

    with nn.parameter_scope(name):
        # Get query, key, value
        h = normalize(x, name="norm")
        # nin(3 * C) -> split is faster?
        q = nin(h, C, name="q")
        k = nin(h, C, name="k")
        v = nin(h, C, name="v")

        # Attention
        w = F.batch_matmul(F.reshape(q, (B * num_heads, -1, H * W)),
                           F.reshape(k, (B * num_heads, -1, H * W)),
                           transpose_a=True)
        w = F.mul_scalar(w, int(C) ** (-0.5), inplace=True)
        assert w.shape == (B * num_heads, H * W, H * W)
        w = F.softmax(w, axis=-1)

        h = F.reshape(v, (B * num_heads, -1, H * W))
        h = F.batch_matmul(h, w)
        h = F.reshape(h, (B, C, H, W))

        # output projection
        h = nin(h, C, name='proj_out', zeroing_w=True)

    assert h.shape == x.shape
    return F.add2(h, x, inplace=True)
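# Shape sketch (hedged, standalone): the core of attn_block is scaled
# dot-product attention over flattened spatial positions. This reproduces
# only the batch_matmul / softmax arithmetic with made-up sizes; `normalize`
# and `nin` from the original block are omitted.
import numpy as np
import nnabla as nn
import nnabla.functions as F

nn.set_auto_forward(True)
B, C, H, W, heads = 2, 8, 4, 4, 4
shape = (B * heads, C // heads, H * W)
q = nn.Variable.from_numpy_array(np.random.randn(*shape).astype(np.float32))
k = nn.Variable.from_numpy_array(np.random.randn(*shape).astype(np.float32))
v = nn.Variable.from_numpy_array(np.random.randn(*shape).astype(np.float32))

w = F.batch_matmul(q, k, transpose_a=True)  # (B*heads, H*W, H*W)
w = F.mul_scalar(w, int(C) ** (-0.5))       # same scaling as attn_block
w = F.softmax(w, axis=-1)
h = F.batch_matmul(v, w)                    # (B*heads, C//heads, H*W)
print(w.shape, h.shape)                     # (8, 16, 16) (8, 2, 16)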
def res_block(res_input, res, inmaps, outmaps, block_scope='res_block'):
    """
    Residual block for Discriminator
    """
    name_scope = f'Discriminator/{block_scope}_{res}x{res}'
    out = conv_layer(res_input, inmaps, inmaps, kernel_size=3,
                     name_scope=f'{name_scope}/Conv1')
    out = conv_layer(out, inmaps, outmaps, kernel_size=3, downsample=True,
                     name_scope=f'{name_scope}/Conv2')
    skip = conv_layer(res_input, inmaps, outmaps, kernel_size=1,
                      downsample=True, bias=False, act=F.identity,
                      name_scope=f'{name_scope}/ConvSkip')
    out = F.mul_scalar(F.add2(out, skip), 1 / np.sqrt(2).astype(np.float32),
                       inplace=False)
    return out
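# Numeric sketch (hedged): the 1/sqrt(2) factor in res_block keeps the
# variance of the residual sum roughly constant. Adding two independent
# unit-variance signals doubles the variance; scaling by 1/sqrt(2) restores
# it. Standalone illustration with random data.
import numpy as np
import nnabla as nn
import nnabla.functions as F

nn.set_auto_forward(True)
a = nn.Variable.from_numpy_array(np.random.randn(100000).astype(np.float32))
b = nn.Variable.from_numpy_array(np.random.randn(100000).astype(np.float32))
out = F.mul_scalar(F.add2(a, b), 1 / np.sqrt(2))
print(float(np.var(out.d)))  # close to 1.0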
def call(self, input):
    if self._drop_prob == 0:
        return input
    mask = F.rand(shape=(input.shape[0], 1, 1, 1))
    mask = F.greater_equal_scalar(mask, self._drop_prob)
    out = F.mul_scalar(input, 1. / (1 - self._drop_prob))
    out = F.mul2(out, mask)
    return out
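# Hedged sketch of the drop-path trick used in `call` above: each sample
# survives with probability (1 - drop_prob) and survivors are scaled by
# 1 / (1 - drop_prob), so the expected activation is unchanged. Standalone
# illustration, not the original module.
import numpy as np
import nnabla as nn
import nnabla.functions as F

nn.set_auto_forward(True)
drop_prob = 0.2
x = nn.Variable.from_numpy_array(np.ones((10000, 1, 1, 1), dtype=np.float32))
mask = F.greater_equal_scalar(F.rand(shape=(x.shape[0], 1, 1, 1)), drop_prob)
out = F.mul2(F.mul_scalar(x, 1. / (1 - drop_prob)), mask)
print(float(out.d.mean()))  # close to 1.0 in expectation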
def smooth_L1(__pred_locs, __label_locs):
    # input
    #   __pred_locs  : type=nn.Variable,
    #   __label_locs : type=nn.Variable,
    # output
    #   _loss : type=nn.Variable, loss of location.
    return F.mul_scalar(F.huber_loss(__pred_locs, __label_locs), 0.5)
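# Numeric check (hedged): with nnabla's default delta=1.0, F.huber_loss
# returns d**2 inside the quadratic region and 2*|d| - 1 outside, so halving
# it yields the usual smooth-L1 loss: 0.5*d**2 for |d| <= 1, |d| - 0.5
# otherwise. Standalone illustration.
import numpy as np
import nnabla as nn
import nnabla.functions as F

nn.set_auto_forward(True)
pred = nn.Variable.from_numpy_array(np.array([0.5, 3.0], dtype=np.float32))
target = nn.Variable.from_numpy_array(np.zeros(2, dtype=np.float32))
loss = F.mul_scalar(F.huber_loss(pred, target), 0.5)
print(loss.d)  # [0.125, 2.5] == [0.5 * 0.5**2, 3.0 - 0.5]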
def __neg__(self):
    """
    Element-wise negation.
    Implements the unary negation expression ``-A`` .

    Returns: :class:`nnabla.Variable`
    """
    import nnabla.functions as F
    return F.mul_scalar(self, -1)
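# Hedged usage sketch: with the overload above, unary minus on a Variable
# builds a MulScalar(-1) node in the graph.
import numpy as np
import nnabla as nn

nn.set_auto_forward(True)
x = nn.Variable.from_numpy_array(np.array([1.0, -2.0], dtype=np.float32))
y = -x
print(y.d)                      # [-1.  2.]
print(y.parent.info.type_name)  # MulScalar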
def conv_layer(conv_input, inmaps, outmaps, kernel_size, downsample=False,
               bias=True, act=F.leaky_relu, name_scope='Conv'):
    """
    Conv layer for the residual block of the discriminator
    """
    if downsample:
        k = [1, 3, 3, 1]
        out = downsample_2d(conv_input, k, factor=2, gain=1,
                            kernel_size=kernel_size)
        stride = 2
        pad = 0
    else:
        stride = 1
        pad = kernel_size // 2
        out = conv_input

    init_function = weight_init_fn(
        shape=(outmaps, inmaps, kernel_size, kernel_size), return_init=True)
    scale = 1 / np.sqrt(inmaps * kernel_size ** 2)
    conv_weight = nn.parameter.get_parameter_or_create(
        name=f'{name_scope}/W', initializer=init_function,
        shape=(outmaps, inmaps, kernel_size, kernel_size))
    if bias:
        conv_bias = nn.parameter.get_parameter_or_create(
            name=f'{name_scope}/b', shape=(outmaps, ))
    else:
        conv_bias = None

    out = F.convolution(out, conv_weight * scale, bias=conv_bias,
                        stride=(stride, stride), pad=(pad, pad))

    if act == F.leaky_relu:
        out = F.mul_scalar(F.leaky_relu(out, alpha=0.2, inplace=False),
                           np.sqrt(2), inplace=False)
    else:
        out = act(out)

    return out
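# Numeric sketch (hedged): the activation used above is leaky_relu followed
# by multiplication with sqrt(2), a variance-preserving trick common in
# StyleGAN-style networks. Standalone illustration.
import numpy as np
import nnabla as nn
import nnabla.functions as F

nn.set_auto_forward(True)
x = nn.Variable.from_numpy_array(np.array([-1.0, 2.0], dtype=np.float32))
y = F.mul_scalar(F.leaky_relu(x, alpha=0.2), np.sqrt(2))
print(y.d)  # approx. [-0.283, 2.828] == [-0.2*sqrt(2), 2*sqrt(2)]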
def call(self, *inputs):
    # We overload the Zero implementation from the dynamic modules for the
    # following reason: in the static modules, we need to create a Variable
    # filled with zeros and of the size of the parent. We then multiply that
    # variable by the scalar 0.0, solely so that an NNabla graph is created
    # for this F.mul_scalar() operation, which is later visible in the
    # exported NNabla graph (.nnp) and also in the converted ONNX file (.onnx).
    self._value = nn.Variable.from_numpy_array(
        np.zeros(self._parents[0].shape))
    return F.mul_scalar(self._value, 0.0, inplace=True)
def loss_function(pred, label, aux_logits=None, aux_weights=1.0):
    """
    Compute loss.
    """
    if aux_logits is None:
        loss = F.mean(F.softmax_cross_entropy(pred, label))
    else:
        loss = F.softmax_cross_entropy(pred, label)
        loss_from_aux = F.mul_scalar(
            F.softmax_cross_entropy(aux_logits, label), aux_weights)
        loss = F.mean(F.add2(loss, loss_from_aux))
    return loss
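# Hedged usage sketch for loss_function: with auxiliary logits the two
# cross-entropies are summed before taking the mean. Shapes and the 0.4
# weight are made up for the example; assumes `loss_function` above is in
# scope.
import numpy as np
import nnabla as nn

nn.set_auto_forward(True)
pred = nn.Variable.from_numpy_array(np.random.randn(4, 10).astype(np.float32))
aux = nn.Variable.from_numpy_array(np.random.randn(4, 10).astype(np.float32))
label = nn.Variable.from_numpy_array(np.random.randint(0, 10, (4, 1)))
loss = loss_function(pred, label, aux_logits=aux, aux_weights=0.4)
print(loss.shape, float(loss.d))  # () and a scalar loss value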
def get_sample_and_feedback(args, data_dict):
    """
    Let the controller predict one architecture and test its performance
    to get feedback. Here the feedback is validation accuracy and will be
    reused to train the controller.
    """
    entropy_weight = args.entropy_weight
    bl_dec = args.baseline_decay

    both_archs, log_probs, entropys = sample_from_controller(args)

    sample_entropy = entropys
    sample_log_prob = log_probs

    show_arch(both_archs)

    nn.set_auto_forward(False)
    val_acc = CNN_run(args, both_archs, data_dict)
    nn.set_auto_forward(True)

    print("Accuracy on Validation: {:.2f} %\n".format(100 * val_acc))

    reward = val_acc
    if entropy_weight is not None:
        reward = F.add_scalar(F.mul_scalar(sample_entropy, entropy_weight),
                              reward).d

    sample_log_prob = F.mul_scalar(sample_log_prob, (1 / args.num_candidate))

    if args.use_variance_reduction:
        baseline = 0.0
        # variance reduction
        baseline = baseline - ((1 - bl_dec) * (baseline - reward))
        reward = reward - baseline

    loss = F.mul_scalar(sample_log_prob, (-1) * reward)

    return loss, val_acc, both_archs
def mapping_network(z, outmaps=512, num_layers=8, net_scope='G_mapping/Dense'):
    lrmul = 0.01
    # 0.00044194172 ~= lrmul / sqrt(512), presumably the equalized
    # learning-rate runtime coefficient.
    runtime_coef = 0.00044194172
    out = z
    for i in range(num_layers):
        with nn.parameter_scope(f'{net_scope}{i}'):
            W, bias = weight_init_fn(shape=(out.shape[1], outmaps),
                                     lrmul=lrmul)
            out = F.affine(out, W * runtime_coef, bias * lrmul)
            out = F.mul_scalar(F.leaky_relu(out, alpha=0.2, inplace=False),
                               np.sqrt(2), inplace=False)
    return out
def vgg16(x):
    # Input: x -> 3,300,300

    # VGG11/MulScalar
    h = F.mul_scalar(x, 0.01735)
    # VGG11/AddScalar
    h = F.add_scalar(h, -1.99)
    # VGG11/Convolution -> 64,300,300
    h = PF.convolution(h, 64, (3, 3), (1, 1), name='Convolution')
    # VGG11/ReLU
    h = F.relu(h, True)
    # VGG11/MaxPooling -> 64,150,150
    h = F.max_pooling(h, (2, 2), (2, 2))
    # VGG11/Convolution_3 -> 128,150,150
    h = PF.convolution(h, 128, (3, 3), (1, 1), name='Convolution_3')
    # VGG11/ReLU_3
    h = F.relu(h, True)
    # VGG11/MaxPooling_2 -> 128,75,75
    h = F.max_pooling(h, (2, 2), (2, 2))
    # VGG11/Convolution_5 -> 256,75,75
    h = PF.convolution(h, 256, (3, 3), (1, 1), name='Convolution_5')
    # VGG11/ReLU_5
    h = F.relu(h, True)
    # VGG11/Convolution_6
    h = PF.convolution(h, 256, (3, 3), (1, 1), name='Convolution_6')
    # VGG11/ReLU_6
    h = F.relu(h, True)
    # VGG11/MaxPooling_3 -> 256,38,38
    h = F.max_pooling(h, (2, 2), (2, 2), True, (1, 1))
    # VGG11/Convolution_8 -> 512,38,38
    h = PF.convolution(h, 512, (3, 3), (1, 1), name='Convolution_8')
    # VGG11/ReLU_8
    h = F.relu(h, True)
    # VGG11/Convolution_9
    h = PF.convolution(h, 512, (3, 3), (1, 1), name='Convolution_9')
    # VGG11/ReLU_9
    h = F.relu(h, True)
    # # VGG11/MaxPooling_4 -> 512,19,19
    # h = F.max_pooling(h, (2, 2), (2, 2))
    # # VGG11/Convolution_11
    # h = PF.convolution(h, 512, (3, 3), (1, 1), name='Convolution_11')
    # # VGG11/ReLU_11
    # h = F.relu(h, True)
    # # VGG11/Convolution_12
    # h = PF.convolution(h, 512, (3, 3), (1, 1), name='Convolution_12')
    # # VGG11/ReLU_12
    # h = F.relu(h, True)
    return h
def mapping_network(noise, outmaps=512, num_layers=8,
                    net_scope='G_mapping/Dense'):
    """
    A mapping network that embeds the input noise into a vector in the
    latent space. The activation includes a multiplication by np.sqrt(2).
    """
    lrmul = 0.01
    # 0.00044194172 ~= lrmul / sqrt(512), presumably the equalized
    # learning-rate runtime coefficient.
    runtime_coef = 0.00044194172
    out = noise
    for i in range(num_layers):
        with nn.parameter_scope(f'{net_scope}{i}'):
            W, bias = weight_init_fn(shape=(out.shape[1], outmaps),
                                     lrmul=lrmul)
            out = F.affine(out, W * runtime_coef, bias * lrmul)
            out = F.mul_scalar(F.leaky_relu(out, alpha=0.2), np.sqrt(2))
    return out
def __mul__(self, other):
    """
    Element-wise multiplication.
    Implements the multiplication operator expression ``A * B``,
    together with :func:`~nnabla.variable.__rmul__` .
    When a scalar is specified for ``other``, this function performs an
    element-wise operation for all elements in ``self``.

    Args:
        other (float or ~nnabla.Variable): Internally calling
            :func:`~nnabla.functions.mul2` or
            :func:`~nnabla.functions.mul_scalar` according to the type.

    Returns: :class:`nnabla.Variable`
    """
    import nnabla.functions as F
    if isinstance(other, Variable):
        return F.mul2(self, other)
    return F.mul_scalar(self, other)
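# Hedged usage sketch: `A * B` dispatches to F.mul2 for two Variables and to
# F.mul_scalar when the right-hand side is a Python scalar.
import numpy as np
import nnabla as nn

nn.set_auto_forward(True)
a = nn.Variable.from_numpy_array(np.array([1.0, 2.0], dtype=np.float32))
b = nn.Variable.from_numpy_array(np.array([3.0, 4.0], dtype=np.float32))
print((a * b).d)    # [3. 8.]  -> F.mul2
print((a * 2.0).d)  # [2. 4.]  -> F.mul_scalar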
def main():
    args = get_args()

    from nnabla.ext_utils import get_extension_context
    ctx = get_extension_context(args.context)
    nn.set_default_context(ctx)

    nn.load_parameters(args.weights)
    x = nn.Variable((1, 3, args.size, args.size))
    y = darknet19.darknet19_classification(x / 255, test=True)

    label_names = np.loadtxt('imagenet.shortnames.list', dtype=str,
                             delimiter=',')[:1000]

    img = imread(args.input)
    img = imresize(img, (args.size, args.size))
    x.d = img.transpose(2, 0, 1).reshape(1, 3, args.size, args.size)
    y.forward(clear_buffer=True)

    # softmax
    p = F.reshape(F.mul_scalar(F.softmax(y.data), 100), (y.size, ))

    # Show top-5 prediction
    inds = np.argsort(y.d.flatten())[::-1][:5]
    for i in inds:
        print('{}: {:.1f}%'.format(label_names[i], p.data[i]))

    s = time.time()
    n_time = 10
    for i in range(n_time):
        y.forward(clear_buffer=True)
    # Invoking device-to-host copy to synchronize the device (if CUDA).
    _ = y.d
    print("Processing time: {:.1f} [ms/image]".format(
        (time.time() - s) / n_time * 1000))
def sample_from_controller(args):
    """
    2-layer RNN(LSTM) based controller which outputs an architecture of CNN,
    represented as a sequence of integers and its list.
    Given the number of layers, for each layer it executes 2 types of
    computation: one samples the operation at that layer, the other samples
    the skip connection patterns.
    """
    entropys = nn.Variable([1, 1], need_grad=True)
    log_probs = nn.Variable([1, 1], need_grad=True)
    skip_penaltys = nn.Variable([1, 1], need_grad=True)
    entropys.d = log_probs.d = skip_penaltys.d = 0.0  # initialize them all

    num_layers = args.num_layers
    lstm_size = args.lstm_size
    state_size = args.state_size
    lstm_num_layers = args.lstm_layers
    skip_target = args.skip_prob
    temperature = args.temperature
    tanh_constant = args.tanh_constant
    num_branch = args.num_ops

    arc_seq = []
    initializer = I.UniformInitializer((-0.1, 0.1))

    prev_h = [nn.Variable([1, lstm_size], need_grad=True)
              for _ in range(lstm_num_layers)]
    prev_c = [nn.Variable([1, lstm_size], need_grad=True)
              for _ in range(lstm_num_layers)]
    for i in range(len(prev_h)):
        prev_h[i].d = 0  # initialize variables in lstm layers.
        prev_c[i].d = 0

    inputs = nn.Variable([1, lstm_size])
    inputs.d = np.random.normal(0, 0.5, [1, lstm_size])

    g_emb = nn.Variable([1, lstm_size])
    g_emb.d = np.random.normal(0, 0.5, [1, lstm_size])

    skip_targets = nn.Variable([1, 2])
    skip_targets.d = np.array([[1.0 - skip_target, skip_target]])

    for layer_id in range(num_layers):
        # One-step stacked LSTM.
        with nn.parameter_scope("controller_lstm"):
            next_h, next_c = stack_lstm(inputs, prev_h, prev_c, state_size)
        prev_h, prev_c = next_h, next_c  # shape: (1, lstm_size)

        # Compute for operation.
        with nn.parameter_scope("ops"):
            logit = PF.affine(next_h[-1], num_branch,
                              w_init=initializer, with_bias=False)

        if temperature is not None:
            logit = F.mul_scalar(logit, (1 / temperature))

        if tanh_constant is not None:
            logit = F.mul_scalar(F.tanh(logit),
                                 tanh_constant)  # (1, num_branch)

        # normalizing logits.
        normed_logit = np.e ** logit.d
        normed_logit = normed_logit / np.sum(normed_logit)

        # Sampling operation id from multinomial distribution.
        ops_id = np.random.multinomial(1, normed_logit[0], 1).nonzero()[1]
        ops_id = nn.Variable.from_numpy_array(ops_id)  # (1, )
        arc_seq.append(ops_id.d)

        # log policy for operation.
        log_prob = F.softmax_cross_entropy(
            logit, F.reshape(ops_id, shape=(1, 1)))  # (1, )
        # accumulate log policy as log probs
        log_probs = F.add2(log_probs, log_prob)

        entropy = log_prob * F.exp(-log_prob)
        entropys = F.add2(entropys, entropy)  # accumulate entropy as entropys.

        w_emb = nn.parameter.get_parameter_or_create(
            "w_emb", [num_branch, lstm_size], initializer, need_grad=False)
        inputs = F.reshape(w_emb[int(ops_id.d)],
                           (1, w_emb.shape[1]))  # (1, lstm_size)

        with nn.parameter_scope("controller_lstm"):
            next_h, next_c = stack_lstm(inputs, prev_h, prev_c, lstm_size)
        prev_h, prev_c = next_h, next_c  # (1, lstm_size)

        with nn.parameter_scope("skip_affine_3"):
            adding_w_1 = PF.affine(next_h[-1], lstm_size, w_init=initializer,
                                   with_bias=False)  # (1, lstm_size)

        if layer_id == 0:
            inputs = g_emb  # (1, lstm_size)
            anchors = next_h[-1]  # (1, lstm_size)
            anchors_w_1 = adding_w_1
            # then goes back to the entry point of the loop
        else:
            # (layer_id, lstm_size) during the process
            query = anchors_w_1

            with nn.parameter_scope("skip_affine_1"):
                query = F.tanh(
                    F.add2(query,
                           PF.affine(next_h[-1], lstm_size,
                                     w_init=initializer, with_bias=False)))
                # (layer_id, lstm_size) + (1, lstm_size)
                # broadcast occurs here;
                # resulting shape is (layer_id, lstm_size)

            with nn.parameter_scope("skip_affine_2"):
                query = PF.affine(query, 1, w_init=initializer,
                                  with_bias=False)  # (layer_id, 1)
            # note that each weight for skip_affine_X is shared across
            # all steps of LSTM.

            # re-define logits; now its shape is (layer_id, 2)
            logit = F.concatenate(-query, query, axis=1)

            if temperature is not None:
                logit = F.mul_scalar(logit, (1 / temperature))

            if tanh_constant is not None:
                logit = F.mul_scalar(F.tanh(logit), tanh_constant)

            skip_prob_unnormalized = F.exp(logit)  # (layer_id, 2)

            # normalizing skip_prob_unnormalized.
            summed = F.sum(skip_prob_unnormalized, axis=1,
                           keepdims=True).apply(need_grad=False)
            summed = F.concatenate(summed, summed, axis=1)

            skip_prob_normalized = F.div2(
                skip_prob_unnormalized, summed)  # (layer_id, 2)

            # Sampling skip_pattern from multinomial distribution.
            skip_pattern = np.random.multinomial(
                1, skip_prob_normalized.d[0],
                layer_id).nonzero()[1]  # (layer_id, 1)
            arc_seq.append(skip_pattern)
            skip = nn.Variable.from_numpy_array(skip_pattern)

            # compute skip penalty.
            # (layer_id, 2) broadcast occurs here too
            kl = F.mul2(skip_prob_normalized,
                        F.log(F.div2(skip_prob_normalized, skip_targets)))
            kl = F.sum(kl, keepdims=True)
            # get the mean value here in advance.
            kl = kl * (1.0 / (num_layers - 1))

            # accumulate kl divergence as skip penalty.
            skip_penaltys = F.add2(skip_penaltys, kl)

            # log policy for connection.
            log_prob = F.softmax_cross_entropy(
                logit, F.reshape(skip, shape=(skip.shape[0], 1)))
            log_probs = F.add2(log_probs, F.sum(log_prob, keepdims=True))

            entropy = F.sum(log_prob * F.exp(-log_prob), keepdims=True)
            # accumulate entropy as entropys.
            entropys = F.add2(entropys, entropy)

            skip = F.reshape(skip, (1, layer_id))
            inputs = F.affine(skip, anchors).apply(
                need_grad=False)  # (1, lstm_size)
            inputs = F.mul_scalar(inputs, (1.0 / (1.0 + (np.sum(skip.d)))))

            # add new row for the next computation
            # (layer_id + 1, lstm_size)
            anchors = F.concatenate(anchors, next_h[-1], axis=0)
            # (layer_id + 1, lstm_size)
            anchors_w_1 = F.concatenate(anchors_w_1, adding_w_1, axis=0)

    return arc_seq, log_probs, entropys, skip_penaltys
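# Hedged sketch of the sampling step above: logits are exponentiated and
# normalized in numpy (a softmax), then an operation id is drawn from the
# resulting multinomial distribution. Toy logits only.
import numpy as np

logit_d = np.array([[0.2, 1.5, -0.3]])  # stand-in for logit.d
normed_logit = np.e ** logit_d
normed_logit = normed_logit / np.sum(normed_logit)
ops_id = np.random.multinomial(1, normed_logit[0], 1).nonzero()[1]
print(normed_logit, ops_id)  # probabilities and the sampled branch index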
def styled_conv_block(conv_input, w, noise=None, res=4, inmaps=512,
                      outmaps=512, kernel_size=3, pad_size=1, demodulate=True,
                      namescope="Conv", up=False, act=F.leaky_relu):
    """
    Conv block with skip connection for Generator
    """
    batch_size = conv_input.shape[0]

    with nn.parameter_scope(f'G_synthesis/{res}x{res}/{namescope}'):
        W, bias = weight_init_fn(shape=(w.shape[1], inmaps))
        runtime_coef = (1. / np.sqrt(512)).astype(np.float32)
        style = F.affine(w, W * runtime_coef, bias) + 1.0

    runtime_coef_for_conv = (
        1 / np.sqrt(np.prod([inmaps, kernel_size,
                             kernel_size]))).astype(np.float32)

    if up:
        init_function = weight_init_fn(
            shape=(inmaps, outmaps, kernel_size, kernel_size),
            return_init=True)
        conv_weight = nn.parameter.get_parameter_or_create(
            name=f'G_synthesis/{res}x{res}/{namescope}/conv/W',
            shape=(inmaps, outmaps, kernel_size, kernel_size),
            initializer=init_function)
    else:
        init_function = weight_init_fn(
            shape=(outmaps, inmaps, kernel_size, kernel_size),
            return_init=True)
        conv_weight = nn.parameter.get_parameter_or_create(
            name=f'G_synthesis/{res}x{res}/{namescope}/conv/W',
            shape=(outmaps, inmaps, kernel_size, kernel_size),
            initializer=init_function)
    conv_weight = F.mul_scalar(conv_weight, runtime_coef_for_conv)

    if up:
        scale = F.reshape(style, (style.shape[0], style.shape[1], 1, 1, 1),
                          inplace=False)
    else:
        scale = F.reshape(style, (style.shape[0], 1, style.shape[1], 1, 1),
                          inplace=False)

    mod_w = F.mul2(
        F.reshape(conv_weight, (1, ) + conv_weight.shape, inplace=False),
        scale)

    if demodulate:
        if up:
            denom_w = F.pow_scalar(
                F.sum(F.pow_scalar(mod_w, 2.), axis=[1, 3, 4],
                      keepdims=True) + 1e-8, 0.5)
        else:
            denom_w = F.pow_scalar(
                F.sum(F.pow_scalar(mod_w, 2.), axis=[2, 3, 4],
                      keepdims=True) + 1e-8, 0.5)
        demod_w = F.div2(mod_w, denom_w)
    else:
        demod_w = mod_w

    conv_input = F.reshape(
        conv_input, (1, -1, conv_input.shape[2], conv_input.shape[3]),
        inplace=False)
    demod_w = F.reshape(
        demod_w, (-1, demod_w.shape[2], demod_w.shape[3], demod_w.shape[4]),
        inplace=False)

    if up:
        k = [1, 3, 3, 1]
        conv_out = upsample_conv_2d(conv_input, demod_w, k, factor=2, gain=1,
                                    group=batch_size)
    else:
        conv_out = F.convolution(conv_input, demod_w,
                                 pad=(pad_size, pad_size), group=batch_size)

    conv_out = F.reshape(
        conv_out, (batch_size, -1, conv_out.shape[2], conv_out.shape[3]),
        inplace=False)

    if noise is not None:
        noise_coeff = nn.parameter.get_parameter_or_create(
            name=f'G_synthesis/{res}x{res}/{namescope}/noise_strength',
            shape=())
        conv_out = F.add2(conv_out,
                          noise * F.reshape(noise_coeff, (1, 1, 1, 1)))

    bias = nn.parameter.get_parameter_or_create(
        name=f'G_synthesis/{res}x{res}/{namescope}/conv/b',
        shape=(outmaps, ),
        initializer=np.random.randn(outmaps, ).astype(np.float32))
    conv_out = F.add2(conv_out,
                      F.reshape(bias, (1, outmaps, 1, 1), inplace=False))

    if act == F.leaky_relu:
        conv_out = F.mul_scalar(
            F.leaky_relu(conv_out, alpha=0.2, inplace=False),
            np.sqrt(2), inplace=False)
    else:
        conv_out = act(conv_out)

    return conv_out
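# Hedged sketch of the (de)modulation arithmetic in styled_conv_block:
# per-sample styles scale the shared conv weight, and demodulation divides
# each modulated filter by its L2 norm so the output magnitude does not
# depend on the style. Toy shapes only; mirrors the non-upsampling branch.
import numpy as np
import nnabla as nn
import nnabla.functions as F

nn.set_auto_forward(True)
B, outmaps, inmaps, k = 2, 3, 4, 3
w = nn.Variable.from_numpy_array(
    np.random.randn(1, outmaps, inmaps, k, k).astype(np.float32))
style = nn.Variable.from_numpy_array(
    np.random.rand(B, 1, inmaps, 1, 1).astype(np.float32))
mod_w = F.mul2(w, style)  # broadcast to (B, outmaps, inmaps, k, k)
denom_w = F.pow_scalar(
    F.sum(F.pow_scalar(mod_w, 2.), axis=[2, 3, 4], keepdims=True) + 1e-8,
    0.5)
demod_w = F.div2(mod_w, denom_w)
print(demod_w.shape)  # (2, 3, 4, 3, 3): one filter bank per sample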
def dot(a, b, out=None):
    '''
    A compatible operation with ``numpy.dot``.

    Note:
        Any operation between nnabla's Variable/NdArray and numpy array is
        not supported.

        If both arguments are 1-D, it is the inner product of vectors.
        If both arguments are 2-D, it is matrix multiplication.
        If either a or b is 0-D (scalar), it is equivalent to multiply.
        If b is a 1-D array, it is a sum product over the last axis of a
        and b.
        If b is an M-D array (M >= 2), it is a sum product over the last
        axis of a and the second-to-last axis of b.

    Args:
        a (Variable, NdArray or scalar): Left input array.
        b (Variable, NdArray or scalar): Right input array.
        out: Output argument. This must have the same shape, dtype, and type
            as the result that would be returned for F.dot(a, b).

    Returns:
        ~nnabla.Variable or ~nnabla.NdArray

    Examples:

    .. code-block:: python

        import numpy as np
        import nnabla as nn
        import nnabla.functions as F

        # 2-D matrix * 2-D matrix
        arr1 = np.arange(5*6).reshape(5, 6)
        arr2 = np.arange(6*8).reshape(6, 8)
        nd1 = nn.NdArray.from_numpy_array(arr1)
        nd2 = nn.NdArray.from_numpy_array(arr2)
        ans1 = F.dot(nd1, nd2)
        print(ans1.shape)  # (5, 8)

        var1 = nn.Variable.from_numpy_array(arr1)
        var2 = nn.Variable.from_numpy_array(arr2)
        ans2 = F.dot(var1, var2)
        ans2.forward()
        print(ans2.shape)  # (5, 8)

        out1 = nn.NdArray((5, 8))
        out1.cast(np.float32)
        F.dot(nd1, nd2, out1)
        print(out1.shape)  # (5, 8)

        out2 = nn.Variable((5, 8))
        out2.data.cast(np.float32)
        F.dot(var1, var2, out2)
        out2.forward()
        print(out2.shape)  # (5, 8)

        # N-D matrix * M-D matrix (M >= 2)
        arr1 = np.arange(5*6*7*8).reshape(5, 6, 7, 8)
        arr2 = np.arange(2*3*8*6).reshape(2, 3, 8, 6)
        nd1 = nn.NdArray.from_numpy_array(arr1)
        nd2 = nn.NdArray.from_numpy_array(arr2)
        ans1 = F.dot(nd1, nd2)
        print(ans1.shape)  # (5, 6, 7, 2, 3, 6)

        var1 = nn.Variable.from_numpy_array(arr1)
        var2 = nn.Variable.from_numpy_array(arr2)
        ans2 = F.dot(var1, var2)
        ans2.forward()
        print(ans2.shape)  # (5, 6, 7, 2, 3, 6)

        out1 = nn.NdArray((5, 6, 7, 2, 3, 6))
        out1.cast(np.float32)
        F.dot(nd1, nd2, out1)
        print(out1.shape)  # (5, 6, 7, 2, 3, 6)

        out2 = nn.Variable((5, 6, 7, 2, 3, 6))
        out2.data.cast(np.float32)
        F.dot(var1, var2, out2)
        out2.forward()
        print(out2.shape)  # (5, 6, 7, 2, 3, 6)
    '''
    import nnabla as nn
    import nnabla.functions as F

    def _chk(x, mark=0):
        if isinstance(x, nn.NdArray):
            return x.data, 1
        elif isinstance(x, nn.Variable):
            return x.d, 1
        else:
            return x, mark

    m, mark1 = _chk(a)
    n, mark2 = _chk(b)

    if mark1 and mark2:
        if a.ndim == 1 and b.ndim == 1:
            result = F.sum(a * b)
        elif a.ndim == 2 and b.ndim == 2:
            result = F.affine(a, b)
        elif a.ndim == 0 or b.ndim == 0:
            if a.ndim == 0:
                result = F.mul_scalar(b, m)
                if isinstance(a, nn.NdArray) and isinstance(b, nn.Variable):
                    result.forward()
                    result = result.data
            else:
                result = F.mul_scalar(a, n)
                if isinstance(a, nn.Variable) and isinstance(b, nn.NdArray):
                    result.forward()
                    result = result.data
        elif b.ndim == 1:
            h = F.affine(a, F.reshape(b, (-1, 1)), base_axis=a.ndim - 1)
            result = F.reshape(h, h.shape[:-1])
        elif b.ndim >= 2:
            index = [*range(0, b.ndim)]
            index.insert(0, index.pop(b.ndim - 2))
            b = F.transpose(b, index)
            h = F.affine(a, b, base_axis=a.ndim - 1)
            result = h
    else:
        result = np.dot(a, b)

    if out is not None:
        out_, _ = _chk(out)
        result_, _ = _chk(result)
        if type(out) == type(result) and out_.shape == result_.shape \
                and out_.dtype == result_.dtype:
            if isinstance(out, nn.NdArray):
                out.cast(result.data.dtype)[...] = result.data
            elif isinstance(out, nn.Variable):
                out.rewire_on(result)
            else:
                out = result
        else:
            raise ValueError(
                "Output argument must have the same shape, type and dtype "
                "as the result that would be returned for F.dot(a, b).")
    else:
        return result
def CNN_run(args, ops, arch_dict):
    """
    Based on the given model architecture, construct a CNN and execute
    training.

    input:
        args: arguments set by user.
        ops: operations used in the network.
        arch_dict: a dictionary containing architecture information.
    """
    data_iterator = data_iterator_cifar10
    tdata = data_iterator(args.batch_size, True)
    vdata = data_iterator(args.batch_size, False)

    # CIFAR10 statistics, mean and variance
    CIFAR_MEAN = np.reshape([0.49139968, 0.48215827, 0.44653124],
                            (1, 3, 1, 1))
    CIFAR_STD = np.reshape([0.24703233, 0.24348505, 0.26158768],
                           (1, 3, 1, 1))

    channels, image_height, image_width = 3, 32, 32
    batch_size = args.batch_size
    initial_model_lr = args.model_lr

    one_epoch = tdata.size // batch_size
    max_iter = args.epoch * one_epoch
    val_iter = 10000 // batch_size

    # Create monitor.
    monitor = Monitor(args.monitor_path)
    monitor_loss = MonitorSeries("Training loss", monitor, interval=100)
    monitor_err = MonitorSeries("Training error", monitor, interval=100)
    monitor_vloss = MonitorSeries("Test loss", monitor, interval=100)
    monitor_verr = MonitorSeries("Test error", monitor, interval=100)

    # prepare variables and graph used for test
    image_valid = nn.Variable(
        (batch_size, channels, image_height, image_width))
    label_valid = nn.Variable((batch_size, 1))
    input_image_valid = {"image": image_valid, "label": label_valid}

    pred_valid, _ = construct_networks(args, ops, arch_dict, image_valid,
                                       test=True)
    loss_valid = loss_function(pred_valid, label_valid)

    # set dropout rate in advance
    nn.parameter.get_parameter_or_create("drop_rate", shape=(1, 1, 1, 1),
                                         need_grad=False)
    initial_drop_rate = nn.Variable((1, 1, 1, 1)).apply(d=args.dropout_rate)
    nn.parameter.set_parameter("drop_rate", initial_drop_rate)

    # prepare variables and graph used for training
    image_train = nn.Variable(
        (batch_size, channels, image_height, image_width))
    label_train = nn.Variable((batch_size, 1))
    input_image_train = {"image": image_train, "label": label_train}

    pred_train, aux_logits = construct_networks(args, ops, arch_dict,
                                                image_train, test=False)
    loss_train = loss_function(pred_train, label_train, aux_logits,
                               args.auxiliary_weight)

    # prepare solvers
    model_params_dict = nn.get_parameters()
    solver_model = S.Momentum(initial_model_lr)
    solver_model.set_parameters(model_params_dict, reset=False,
                                retain_state=True)

    # Training-loop
    for curr_epoch in range(args.epoch):
        print("epoch {}".format(curr_epoch))

        curr_dropout_rate = F.add_scalar(
            F.mul_scalar(initial_drop_rate, (curr_epoch / args.epoch)), 1e-8)
        nn.parameter.set_parameter("drop_rate", curr_dropout_rate)

        for i in range(one_epoch):
            image, label = tdata.next()
            image = image / 255.0
            image = (image - CIFAR_MEAN) / CIFAR_STD
            if args.cutout:
                image = cutout(image, args)
            input_image_train["image"].d = image
            input_image_train["label"].d = label

            loss_train.forward(clear_no_need_grad=True)

            e = categorical_error(pred_train.d, input_image_train["label"].d)
            monitor_loss.add(one_epoch * curr_epoch + i, loss_train.d.copy())
            monitor_err.add(one_epoch * curr_epoch + i, e)

            if args.lr_control_model:
                new_lr = learning_rate_scheduler(
                    one_epoch * curr_epoch + i, max_iter,
                    initial_model_lr, 0)
                solver_model.set_learning_rate(new_lr)

            solver_model.zero_grad()
            loss_train.backward(clear_buffer=True)

            if args.with_grad_clip_model:
                for k, v in model_params_dict.items():
                    v.grad.copy_from(
                        F.clip_by_norm(v.grad, args.grad_clip_value_model))

            # update parameters
            solver_model.weight_decay(args.weight_decay_model)
            solver_model.update()

            if (one_epoch * curr_epoch + i) % args.model_save_interval == 0:
                nn.save_parameters(
                    os.path.join(
                        args.model_save_path,
                        'params_{}.h5'.format(one_epoch * curr_epoch + i)))

        # Validation during training.
        ve = 0.
        vloss = 0.
        for j in range(val_iter):
            image, label = vdata.next()
            image = image / 255.0
            image = (image - CIFAR_MEAN) / CIFAR_STD
            input_image_valid["image"].d = image
            input_image_valid["label"].d = label
            loss_valid.forward(clear_no_need_grad=True)
            vloss += loss_valid.d.copy()
            ve += categorical_error(pred_valid.d.copy(), label)
        ve /= val_iter
        vloss /= val_iter
        monitor_vloss.add(one_epoch * curr_epoch + i, vloss)
        monitor_verr.add(one_epoch * curr_epoch + i, ve)

    return
def ssd_loss(_ssd_confs, _ssd_locs, _label, _alpha=1):
    # input
    #   _ssd_confs : type=nn.Variable, prediction of class.
    #                shape=(batch_size, default boxes, class num + 1)
    #   _ssd_locs  : type=nn.Variable, prediction of location.
    #                shape=(batch_size, default boxes, 4)
    #   _label     : type=nn.Variable,
    #                shape=(batch_size, default boxes, class num + 1 + 4)
    #   _alpha     : type=float, hyperparameter. this is the weight of
    #                loc_loss.
    # output
    #   loss : type=nn.Variable

    def smooth_L1(__pred_locs, __label_locs):
        # input
        #   __pred_locs  : type=nn.Variable,
        #   __label_locs : type=nn.Variable,
        # output
        #   _loss : type=nn.Variable, loss of location.
        return F.mul_scalar(F.huber_loss(__pred_locs, __label_locs), 0.5)

    # label_conf : type=nn.Variable, label of class.
    #              shape=(batch_size, default boxes, class num + 1)
    #              (after one_hot)
    # label_loc  : type=nn.Variable, label of location.
    #              shape=(batch_size, default boxes, 4)
    label_conf = F.slice(_label, start=(0, 0, 4), stop=_label.shape,
                         step=(1, 1, 1))
    label_loc = F.slice(_label, start=(0, 0, 0),
                        stop=(_label.shape[0], _label.shape[1], 4),
                        step=(1, 1, 1))

    # conf
    ssd_pos_conf, ssd_neg_conf = ssd_separate_conf_pos_neg(_ssd_confs)
    label_conf_pos, _ = ssd_separate_conf_pos_neg(label_conf)
    # pos
    pos_loss = F.sum(F.mul2(F.softmax(ssd_pos_conf, axis=2),
                            label_conf_pos), axis=2)
    # neg
    neg_loss = F.sum(F.log(ssd_neg_conf), axis=2)
    conf_loss = F.sum(F.sub2(pos_loss, neg_loss), axis=1)

    # loc
    # pos_label: =1 (if there is something), =0 (if there is nothing)
    pos_label = F.sum(label_conf_pos, axis=2)
    loc_loss = F.sum(F.mul2(F.sum(smooth_L1(_ssd_locs, label_loc), axis=2),
                            pos_label), axis=1)

    # [2019/07/18]
    label_match_default_box_num = F.slice(
        _label, start=(0, 0, _label.shape[2] - 1), stop=_label.shape,
        step=(1, 1, 1))
    label_match_default_box_num = F.sum(label_match_default_box_num, axis=1)
    label_match_default_box_num = F.r_sub_scalar(label_match_default_box_num,
                                                 _label.shape[1])
    label_match_default_box_num = F.reshape(
        label_match_default_box_num,
        (label_match_default_box_num.shape[0], ), inplace=False)
    # label_match_default_box_num : type=nn.Variable,
    #     inverse number of default boxes that match with pos.

    # loss
    loss = F.mul2(F.add2(conf_loss, F.mul_scalar(loc_loss, _alpha)),
                  label_match_default_box_num)
    loss = F.mean(loss)
    return loss
def call(self, input):
    if self._stride[0] > 1:
        input = F.max_pooling(input, kernel=(1, 1), stride=self._stride)
    return F.mul_scalar(input, 0.0)
def __call__(self, gen_rgb_out, patch_switch=False, index=0):
    out = conv_layer(gen_rgb_out, inmaps=3, outmaps=self.channels[0],
                     kernel_size=1, name_scope='Discriminator/Convinitial')
    inmaps = self.channels[0]
    out_list = [out]
    for i in range(1, len(self.resolutions)):
        res = out.shape[2]
        outmaps = self.channels[i]
        out = res_block(out, res=res, outmaps=outmaps, inmaps=inmaps)
        inmaps = outmaps
        out_list.append(out)

    if patch_switch:
        GV_class = GetVariablesOnGraph(out)
        GF_class = GetFunctionFromInput(out, func_type_list=['LeakyReLU'])
        feature_dict = OrderedDict()
        for key in GV_class.coef_dict_on_graph:
            if ('res_block' in key and '/W' in key) and \
                    ('Conv1' in key or 'Conv2' in key):
                feature_var = GF_class.functions[key][0].outputs[
                    0].function_references[0].outputs[0]
                if feature_var.shape[2:] in ((32, 32), (16, 16)):
                    feature_dict[key] = GF_class.functions[key][0].outputs[
                        0].function_references[0].outputs[0]

    N, C, H, W = out.shape
    group = min(N, self.stddev_group)
    stddev_mean = F.reshape(
        out, (group, -1, self.stddev_feat, C // self.stddev_feat, H, W),
        inplace=False)
    mean = F.mul_scalar(F.sum(stddev_mean, axis=0, keepdims=True),
                        1.0 / stddev_mean.shape[0], inplace=False)
    stddev_mean = F.mean(
        F.pow_scalar(F.sub2(stddev_mean,
                            F.broadcast(mean, stddev_mean.shape)), 2.),
        axis=0, keepdims=False)
    stddev_mean = F.pow_scalar(
        F.add_scalar(stddev_mean, 1e-8, inplace=False), 0.5, inplace=False)
    stddev_mean = F.mean(stddev_mean, axis=[2, 3, 4], keepdims=True)
    stddev_mean = F.reshape(
        stddev_mean, stddev_mean.shape[:2] + stddev_mean.shape[3:],
        inplace=False)
    out = F.concatenate(out, F.tile(stddev_mean, (group, 1, H, W)), axis=1)

    out = conv_layer(out, inmaps=out.shape[1], outmaps=self.channels[-1],
                     kernel_size=3, name_scope='Discriminator/Convfinal')
    out = F.reshape(out, (N, -1), inplace=False)

    # Linear Layers
    lrmul = 1
    scale = 1 / (out.shape[1] ** 0.5) * lrmul
    W, bias = weight_init_fn(
        (out.shape[-1], self.channels[-1]),
        weight_var='Discriminator/final_linear_1/affine')
    out = F.affine(out, W * scale, bias * lrmul)
    out = F.mul_scalar(F.leaky_relu(out, alpha=0.2, inplace=False),
                       np.sqrt(2), inplace=False)

    scale = 1 / (out.shape[1] ** 0.5) * lrmul
    W, bias = weight_init_fn(
        (out.shape[-1], 1),
        weight_var='Discriminator/final_linear_2/affine')
    out = F.affine(out, W * scale, bias * lrmul)

    if patch_switch:
        return out, list(feature_dict.values())[index]
    else:
        return out
def main():
    args = get_args()

    # Get context
    from nnabla.ext_utils import get_extension_context
    logger.info("Running in %s" % args.context)
    ctx = get_extension_context(args.context, device_id=args.device_id,
                                type_config=args.type_config)
    nn.set_default_context(ctx)
    nn.set_auto_forward(True)

    image = io.imread(args.test_image)
    if image.ndim == 2:
        image = color.gray2rgb(image)
    elif image.shape[-1] == 4:
        image = image[..., :3]

    if args.context == 'cudnn':
        if not os.path.isfile(args.cnn_face_detction_model):
            # The block below downloads the CNN-based face-detection model
            # file provided by dlib and saves it in the directory where this
            # script is executed.
            print("Downloading the face detection CNN. Please wait...")
            url = "http://dlib.net/files/mmod_human_face_detector.dat.bz2"
            from nnabla.utils.data_source_loader import download
            download(url, url.split('/')[-1], False)
            # get the decompressed data.
            data = bz2.BZ2File(url.split('/')[-1]).read()
            # write to dat file.
            open(url.split('/')[-1][:-4], 'wb').write(data)
        face_detector = dlib.cnn_face_detection_model_v1(
            args.cnn_face_detction_model)
        detected_faces = face_detector(
            cv2.cvtColor(image[..., ::-1].copy(), cv2.COLOR_BGR2GRAY))
        detected_faces = [[d.rect.left(), d.rect.top(), d.rect.right(),
                           d.rect.bottom()] for d in detected_faces]
    else:
        face_detector = dlib.get_frontal_face_detector()
        detected_faces = face_detector(
            cv2.cvtColor(image[..., ::-1].copy(), cv2.COLOR_BGR2GRAY))
        detected_faces = [[d.left(), d.top(), d.right(), d.bottom()]
                          for d in detected_faces]

    if len(detected_faces) == 0:
        print("Warning: No faces were detected.")
        return None

    # Load FAN weights
    with nn.parameter_scope("FAN"):
        print("Loading FAN weights...")
        nn.load_parameters(args.model)

    # Load ResNetDepth weights
    if args.landmarks_type_3D:
        with nn.parameter_scope("ResNetDepth"):
            print("Loading ResNetDepth weights...")
            nn.load_parameters(args.resnet_depth_model)

    landmarks = []
    for i, d in enumerate(detected_faces):
        center = [d[2] - (d[2] - d[0]) / 2.0, d[3] - (d[3] - d[1]) / 2.0]
        center[1] = center[1] - (d[3] - d[1]) * 0.12
        scale = (d[2] - d[0] + d[3] - d[1]) / args.reference_scale

        inp = crop(image, center, scale)
        inp = nn.Variable.from_numpy_array(inp.transpose((2, 0, 1)))
        inp = F.reshape(F.mul_scalar(inp, 1 / 255.0), (1, ) + inp.shape)

        with nn.parameter_scope("FAN"):
            out = fan(inp, args.network_size)[-1]

        pts, pts_img = get_preds_fromhm(out, center, scale)
        pts, pts_img = F.reshape(pts, (68, 2)) * 4, \
            F.reshape(pts_img, (68, 2))

        if args.landmarks_type_3D:
            heatmaps = np.zeros((68, 256, 256), dtype=np.float32)
            for i in range(68):
                if pts.d[i, 0] > 0:
                    heatmaps[i] = draw_gaussian(heatmaps[i], pts.d[i], 2)
            heatmaps = nn.Variable.from_numpy_array(heatmaps)
            heatmaps = F.reshape(heatmaps, (1, ) + heatmaps.shape)
            with nn.parameter_scope("ResNetDepth"):
                depth_pred = F.reshape(
                    resnet_depth(F.concatenate(inp, heatmaps, axis=1)),
                    (68, 1))
            pts_img = F.concatenate(
                pts_img, depth_pred * (1.0 / (256.0 / (200.0 * scale))),
                axis=1)

        landmarks.append(pts_img.d)

    visualize(landmarks, image, args.output)
def sample_from_controller(args):
    """
    2-layer RNN(LSTM) based controller which outputs an architecture of CNN,
    represented as a sequence of integers and its list.
    Given the number of layers, for each layer it executes 2 types of
    computation: one samples the operation at that layer, the other samples
    the skip connection patterns.
    """
    entropys = nn.Variable([1, 1], need_grad=True)
    log_probs = nn.Variable([1, 1], need_grad=True)
    entropys.d = log_probs.d = 0.0  # initialize them all

    num_cells = args.num_cells
    num_nodes = args.num_nodes
    lstm_size = args.lstm_size
    state_size = args.state_size
    lstm_num_layers = args.lstm_layers
    temperature = args.temperature
    tanh_constant = args.tanh_constant
    op_tanh_reduce = args.op_tanh_reduce
    num_branch = args.num_ops

    both_archs = [list(), list()]
    initializer = I.UniformInitializer((-0.1, 0.1))

    prev_h = [nn.Variable([1, lstm_size], need_grad=True)
              for _ in range(lstm_num_layers)]
    prev_c = [nn.Variable([1, lstm_size], need_grad=True)
              for _ in range(lstm_num_layers)]
    for i in range(len(prev_h)):
        prev_h[i].d = 0  # initialize.
        prev_c[i].d = 0

    inputs = nn.Variable([1, lstm_size])
    inputs.d = np.random.normal(0, 0.5, [1, lstm_size])

    g_emb = nn.Variable([1, lstm_size])
    g_emb.d = np.random.normal(0, 0.5, [1, lstm_size])

    for ind in range(2):
        # first create the conv cell, then the reduction cell.
        idx_seq = list()
        ops_seq = list()

        for node_id in range(num_nodes):
            if node_id == 0:
                anchors = nn.parameter.get_parameter_or_create(
                    "anchors", [2, lstm_size], initializer, need_grad=False)
                anchors_w_1 = nn.parameter.get_parameter_or_create(
                    "anchors_w_1", [2, lstm_size], initializer,
                    need_grad=False)
            else:
                assert anchors.shape[0] == node_id + 2, \
                    "Something wrong with anchors."
                assert anchors_w_1.shape[0] == node_id + 2, \
                    "Something wrong with anchors_w_1."

            # for each node, get the index used as inputs
            for i in range(2):
                # One-step stacked LSTM.
                with nn.parameter_scope("controller_lstm"):
                    next_h, next_c = stack_lstm(inputs, prev_h, prev_c,
                                                state_size)
                prev_h, prev_c = next_h, next_c  # shape: (1, lstm_size)

                query = anchors_w_1
                with nn.parameter_scope("skip_affine_1"):
                    query = F.tanh(
                        F.add2(query,
                               PF.affine(next_h[-1], lstm_size,
                                         w_init=initializer,
                                         with_bias=False)))
                    # (node_id + 2, lstm_size) + (1, lstm_size)
                    # broadcast occurs here;
                    # resulting shape is (node_id + 2, lstm_size)

                with nn.parameter_scope("skip_affine_2"):
                    # (node_id + 2, 1)
                    logit = PF.affine(query, 1, w_init=initializer,
                                      with_bias=False)

                if temperature is not None:
                    logit = F.mul_scalar(logit, (1 / temperature))

                if tanh_constant is not None:
                    logit = F.mul_scalar(F.tanh(logit), tanh_constant)

                index = F.exp(logit)
                index = F.mul_scalar(index, (1 / index.d.sum()))

                # Sampling input indices from multinomial distribution.
                index = np.random.multinomial(
                    1, np.reshape(index.d, (1, index.d.size))[0], 1)
                idx_seq.append(index.nonzero()[1])

                label = nn.Variable.from_numpy_array(
                    index.transpose())  # (node_id + 2, 1)
                log_prob = F.softmax_cross_entropy(logit, label)
                log_probs = F.add2(log_probs, F.sum(log_prob, keepdims=True))

                curr_ent = F.softmax_cross_entropy(logit, F.softmax(logit))
                entropy = F.sum(curr_ent, keepdims=True)
                entropys = F.add2(entropys, entropy)

                taking_ind = int(index.nonzero()[1][0])
                # (1, lstm_size)
                inputs = F.reshape(anchors[taking_ind],
                                   (1, anchors.shape[1]))

            # ops
            for j in range(2):
                with nn.parameter_scope("controller_lstm"):
                    next_h, next_c = stack_lstm(inputs, prev_h, prev_c,
                                                state_size)
                prev_h, prev_c = next_h, next_c  # shape: (1, lstm_size)

                # Compute for operation.
                with nn.parameter_scope("ops"):
                    logit = PF.affine(next_h[-1], num_branch,
                                      w_init=initializer, with_bias=False)
                    # shape of logit: (1, num_branch)

                if temperature is not None:
                    logit = F.mul_scalar(logit, (1 / temperature))

                if tanh_constant is not None:
                    op_tanh = tanh_constant / op_tanh_reduce
                    logit = F.mul_scalar(F.tanh(logit), op_tanh)

                # normalizing logits.
                normed_logit = np.e ** logit.d
                normed_logit = normed_logit / np.sum(normed_logit)

                # Sampling operation id from multinomial distribution.
                branch_id = np.random.multinomial(1, normed_logit[0],
                                                  1).nonzero()[1]
                branch_id = nn.Variable.from_numpy_array(branch_id)
                ops_seq.append(branch_id.d)

                # log policy for operation.
                log_prob = F.softmax_cross_entropy(
                    logit, F.reshape(branch_id, shape=(1, 1)))
                # accumulate log policy as log probs
                log_probs = F.add2(log_probs, log_prob)

                logit = F.transpose(logit, axes=(1, 0))
                curr_ent = F.softmax_cross_entropy(logit, F.softmax(logit))
                entropy = F.sum(curr_ent, keepdims=True)
                entropys = F.add2(entropys, entropy)

                w_emb = nn.parameter.get_parameter_or_create(
                    "w_emb", [num_branch, lstm_size], initializer,
                    need_grad=False)
                # (1, lstm_size)
                inputs = F.reshape(w_emb[int(branch_id.d)],
                                   (1, w_emb.shape[1]))

            with nn.parameter_scope("controller_lstm"):
                next_h, next_c = stack_lstm(inputs, prev_h, prev_c, lstm_size)
            prev_h, prev_c = next_h, next_c

            with nn.parameter_scope("skip_affine_3"):
                adding_w_1 = PF.affine(next_h[-1], lstm_size,
                                       w_init=initializer, with_bias=False)

            # (node_id + 2 + 1, lstm_size)
            anchors = F.concatenate(anchors, next_h[-1], axis=0)
            # (node_id + 2 + 1, lstm_size)
            anchors_w_1 = F.concatenate(anchors_w_1, adding_w_1, axis=0)

        for idx, ops in zip(idx_seq, ops_seq):
            both_archs[ind].extend([int(idx), int(ops)])

    return both_archs, log_probs, entropys
def __call__(self, input):
    out = F.mul_scalar(input, self._scale)
    out = F.sub2(out, self._mean)
    out = F.div2(out, self._std)
    return out
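# Hedged sketch of the preprocessing in __call__ above: scale the input,
# then subtract the mean and divide by the std. Values are illustrative
# (mapping uint8 pixels to roughly zero mean, unit variance); the real
# module's _scale, _mean, and _std are configured elsewhere.
import numpy as np
import nnabla as nn
import nnabla.functions as F

nn.set_auto_forward(True)
img = nn.Variable.from_numpy_array(
    np.full((1, 3, 2, 2), 255.0, dtype=np.float32))
mean = nn.Variable.from_numpy_array(
    np.full((1, 3, 2, 2), 0.5, dtype=np.float32))
std = nn.Variable.from_numpy_array(
    np.full((1, 3, 2, 2), 0.25, dtype=np.float32))
out = F.div2(F.sub2(F.mul_scalar(img, 1.0 / 255.0), mean), std)
print(out.d[0, 0, 0, 0])  # (1.0 - 0.5) / 0.25 = 2.0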
def test_prohibit_clear_data():
    import nnabla.functions as F
    nn.prefer_cached_array(False)
    shape = (2, 3, 4)
    var_np = np.random.rand(*shape)

    # the case of root variable
    x1 = nn.Variable.from_numpy_array(var_np)
    y1 = F.reshape(x1, (-1, ), inplace=True)
    y1 = F.reshape(y1, shape, inplace=True) * 2

    x2 = nn.Variable.from_numpy_array(var_np)
    y2 = F.reshape(x2, (-1, ), inplace=False)
    y2 = F.reshape(y2, shape, inplace=False) * 2

    nn.forward_all([y1, y2], clear_buffer=True)
    assert_allclose(x1.d, x2.d)
    assert_allclose(y1.d, y2.d)

    # the case of persistent variable
    x1 = nn.Variable.from_numpy_array(var_np)
    p_y1 = F.mul_scalar(x1, 2).apply(persistent=True)
    y1 = F.reshape(p_y1, (-1, ), inplace=True)
    y1 = F.reshape(y1, shape, inplace=True) * 2

    x2 = nn.Variable.from_numpy_array(var_np)
    p_y2 = F.mul_scalar(x2, 2).apply(persistent=True)
    y2 = F.reshape(p_y2, (-1, ), inplace=False)
    y2 = F.reshape(y2, shape, inplace=False) * 2

    nn.forward_all([y1, y2], clear_buffer=True)
    assert_allclose(p_y1.d, p_y2.d)
    assert_allclose(y1.d, y2.d)

    # the case of rewire_on root variable
    # graph A: x11 -> f_inplace -> y11
    x11 = nn.Variable.from_numpy_array(var_np)
    y11 = F.reshape(x11, (-1, ), inplace=True)

    # graph B: x12 -> f_inplace -> mul_scalar -> y12
    x12 = nn.Variable(shape=y11.shape)
    y12 = F.reshape(x12, shape, inplace=True) * 2

    # graph A->B: x11 -> f_inplace -> f_inplace -> mul_scalar -> y12
    x12.rewire_on(y11)

    x2 = nn.Variable.from_numpy_array(var_np)
    y2 = F.reshape(x2, (-1, ), inplace=False)
    y2 = F.reshape(y2, shape, inplace=False) * 2

    nn.forward_all([y12, y2], clear_buffer=True)
    assert_allclose(x11.d, x2.d)
    assert_allclose(y12.d, y2.d)

    # the case of rewire_on persistent variable
    # graph A: x11 -> mul_scalar -> p_x11 -> f_inplace -> y11
    x11 = nn.Variable.from_numpy_array(var_np)
    p_x11 = F.mul_scalar(x11, 2).apply(persistent=True)
    y11 = F.reshape(p_x11, (-1, ), inplace=True)

    # graph B: x12 -> f_inplace -> mul_scalar -> y12
    x12 = nn.Variable(shape=y11.shape)
    y12 = F.reshape(x12, shape, inplace=True) * 2

    # graph A->B: ... -> p_x11 -> f_inplace -> f_inplace -> mul_scalar -> y12
    x12.rewire_on(y11)

    x2 = nn.Variable.from_numpy_array(var_np)
    p_x2 = F.mul_scalar(x2, 2).apply(persistent=True)
    y2 = F.reshape(p_x2, (-1, ), inplace=False)
    y2 = F.reshape(y2, shape, inplace=False) * 2

    nn.forward_all([y12, y2], clear_buffer=True)
    assert_allclose(p_x11.d, p_x2.d)
    assert_allclose(y12.d, y2.d)