def test_clear_input_if_no_need_grad_inplace1(self):
    x1 = nn.Variable([1, 5], need_grad=True)

    xx1 = F.identity(x1)
    y1 = F.add_scalar(xx1, inplace=True)
    y2 = F.add_scalar(y1)

    answer = []
    answer.append([False])
    answer.append([False])
    answer.append([False])

    y2.forward(clear_no_need_grad=True)
    self.check_input_data_clear_called_flags(answer)
def weight_normalization_backward(inputs, dim=0, eps=1e-12):
    """
    Args:
        inputs (list of nn.Variable): Incoming grads/inputs to/of the forward function.
        kwargs (dict of arguments): Dictionary of the corresponding function arguments.

    Return:
        list of Variable: Return the gradients wrt inputs of the corresponding function.
    """
    dy = inputs[0]
    w = inputs[1]
    g = inputs[2]
    g_shape = g.shape
    dim += w.ndim * (dim < 0)

    # Create inverted norm of w
    sum_axes = list(filter(lambda x: x != dim, range(w.ndim)))
    w_pow = F.pow_scalar(w, 2.0)
    w_sum = F.sum(w_pow, sum_axes, True)
    w_add = F.add_scalar(w_sum, eps)
    w_norm_inv = F.pow_scalar(w_add, -0.5)

    dyw_sum = F.sum(dy * w, sum_axes, True)

    # w.r.t. dw
    g = g.reshape([s if i == dim else 1 for i, s in enumerate(w.shape)])
    dw = (dy - dyw_sum * (w_norm_inv ** 2) * w) * g * w_norm_inv

    # w.r.t. dg
    dg = dyw_sum * w_norm_inv
    dg = dg.reshape(g_shape)

    return dw, dg
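# For reference, a minimal sketch of the forward computation the backward above
# corresponds to (assuming weight normalization w_wn = g * w / ||w||, with the
# L2 norm taken over every axis except `dim`; illustrative only, the real
# forward lives in the library):
def weight_normalization_forward_sketch(w, g, dim=0, eps=1e-12):
    sum_axes = list(filter(lambda x: x != dim, range(w.ndim)))
    # Inverted L2 norm of w over all axes except `dim`
    w_norm_inv = F.pow_scalar(
        F.add_scalar(F.sum(F.pow_scalar(w, 2.0), sum_axes, True), eps), -0.5)
    # Broadcast g along `dim` and rescale w
    g = g.reshape([s if i == dim else 1 for i, s in enumerate(w.shape)])
    return g * w * w_norm_inv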
def test_graph_connection_with_setitem(indices):
    import nnabla.functions as F
    x = np.arange(8 * 7).reshape((8, 7))
    x = nn.Variable.from_numpy_array(x, need_grad=True)
    u = np.arange(-1, -7, -1).reshape(3, 2)
    u = nn.Variable.from_numpy_array(u, need_grad=True)
    y = F.mul_scalar(x, 1)
    y[indices] = u
    z = F.add_scalar(y, 0)
    z.forward()
    # '+' signs only to persist visual alignment through autopep8
    assert_allclose(z.d, np.array([[+0, +1, +2, +3, +4, +5, +6],
                                   [+7, +8, +9, 10, 11, 12, 13],
                                   [14, 15, 16, -1, -2, 19, 20],
                                   [21, 22, 23, -3, -4, 26, 27],
                                   [28, 29, 30, -5, -6, 33, 34],
                                   [35, 36, 37, 38, 39, 40, 41],
                                   [42, 43, 44, 45, 46, 47, 48],
                                   [49, 50, 51, 52, 53, 54, 55]]))
    x.grad.zero()
    u.grad.zero()
    z.backward(np.arange(1, 1 + 8 * 7).reshape(8, 7))
    assert_allclose(x.g, np.array([[+1, +2, +3, +4, +5, +6, +7],
                                   [+8, +9, 10, 11, 12, 13, 14],
                                   [15, 16, 17, +0, +0, 20, 21],
                                   [22, 23, 24, +0, +0, 27, 28],
                                   [29, 30, 31, +0, +0, 34, 35],
                                   [36, 37, 38, 39, 40, 41, 42],
                                   [43, 44, 45, 46, 47, 48, 49],
                                   [50, 51, 52, 53, 54, 55, 56]]))
    assert_allclose(u.g, np.array([[18, 19],
                                   [25, 26],
                                   [32, 33]]))
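# A usage sketch: judging from the expected arrays above (rows 2-4, columns
# 3-4 replaced by the 3x2 array u), one parametrization that produces them
# would be the following. This is an inference; the actual pytest
# parametrize fixture lives outside this snippet:
#
#     test_graph_connection_with_setitem((slice(2, 5), slice(3, 5)))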
def test_clear_input_if_no_need_grad_branch0(self):
    x1 = nn.Variable([1, 5], need_grad=True)
    x2 = nn.Variable([1, 5], need_grad=True)

    xx1 = F.identity(x1)
    y1 = F.add_scalar(xx1)  # (1)
    y2 = F.add_scalar(xx1)  # (2)
    y3 = F.add2(y1, y2)  # (3)

    answer = []
    answer.append([False])
    answer.append([False])  # (1) does not clear xx1
    answer.append([True])  # (2) clears xx1
    answer.append([True, True])

    y3.forward(clear_no_need_grad=True)
    self.check_input_data_clear_called_flags(answer)
def test_clear_output_grad_inplace(self):
    x1 = nn.Variable([1], need_grad=True)
    xx1 = F.identity(x1)
    y1 = F.add_scalar(xx1, inplace=True)
    y2 = F.add_scalar(y1)

    answer_grad = []
    answer_grad.append([True])
    answer_grad.append([True])
    answer_grad.append([True])

    y2.forward(clear_no_need_grad=True)
    clear_called_flag_recorder.deactivate_clear_called_flag_recorder()
    clear_called_flag_recorder.activate_clear_called_flag_recorder()
    y2.backward(clear_buffer=True)
    self.check_grad_cleared_flags(answer_grad)
def test_clear_input_if_no_need_grad_branch2(self):
    x1 = nn.Variable([1, 5], need_grad=True)

    xx1 = F.identity(x1)
    y1 = F.add_scalar(xx1)
    y2 = F.add_scalar(y1, inplace=True)
    z1 = F.add_scalar(xx1)
    z2 = F.add_scalar(z1)
    y3 = F.add2(y2, z2)

    answer = []
    answer.append([False])
    answer.append([False])
    answer.append([False])
    answer.append([True])
    answer.append([True])
    answer.append([False, True])

    y3.forward(clear_no_need_grad=True)
    self.check_input_data_clear_called_flags(answer)
def test_clear_output_grad_prohibit_clear_input(self):
    x1 = nn.Variable([1], need_grad=True)
    xx1 = F.identity(x1)
    y1 = F.add_scalar(xx1)
    y2 = F.add_scalar(xx1)
    y3 = F.sink(y1, y2)

    answer_grad = []
    answer_grad.append([True])  # y3
    answer_grad.append([False])  # y2
    answer_grad.append([False])  # y1
    answer_grad.append([True])  # xx1

    y3.forward(clear_no_need_grad=True)
    clear_called_flag_recorder.deactivate_clear_called_flag_recorder()
    clear_called_flag_recorder.activate_clear_called_flag_recorder()
    y3.backward(clear_buffer=True)
    self.check_grad_cleared_flags(answer_grad)
def test_clear_output_grad_persistent(self):
    x1 = nn.Variable([1], need_grad=True)
    xx1 = F.identity(x1)
    y1 = F.add_scalar(xx1)
    y2 = F.add_scalar(y1)
    xx1.persistent = True
    y2.persistent = True

    answer_grad = []
    answer_grad.append([False])  # y2
    answer_grad.append([True])  # y1
    answer_grad.append([False])  # xx1

    y2.forward(clear_no_need_grad=True)
    clear_called_flag_recorder.deactivate_clear_called_flag_recorder()
    clear_called_flag_recorder.activate_clear_called_flag_recorder()
    y2.backward(clear_buffer=True)
    self.check_grad_cleared_flags(answer_grad)
def __call__(self, gen_rgb_out):
    out = conv_layer(gen_rgb_out, inmaps=3, outmaps=self.channels[0],
                     kernel_size=1, name_scope='Discriminator/Convinitial')
    inmaps = self.channels[0]
    for i in range(1, len(self.resolutions)):
        res = out.shape[2]
        outmaps = self.channels[i]
        out = res_block(out, res=res, outmaps=outmaps, inmaps=inmaps)
        inmaps = outmaps

    N, C, H, W = out.shape
    group = min(N, self.stddev_group)
    stddev_mean = F.reshape(
        out, (group, -1, self.stddev_feat, C // self.stddev_feat, H, W),
        inplace=False)
    # mean = F.mean(stddev_mean, axis=0, keepdims=True)
    mean = F.mul_scalar(F.sum(stddev_mean, axis=0, keepdims=True),
                        1.0 / stddev_mean.shape[0], inplace=False)
    stddev_mean = F.mean(F.pow_scalar(F.sub2(stddev_mean, F.broadcast(
        mean, stddev_mean.shape)), 2.), axis=0, keepdims=False)
    stddev_mean = F.pow_scalar(F.add_scalar(
        stddev_mean, 1e-8, inplace=False), 0.5, inplace=False)
    stddev_mean = F.mean(stddev_mean, axis=[2, 3, 4], keepdims=True)
    stddev_mean = F.reshape(
        stddev_mean, stddev_mean.shape[:2] + stddev_mean.shape[3:],
        inplace=False)
    out = F.concatenate(out, F.tile(stddev_mean, (group, 1, H, W)), axis=1)
    out = conv_layer(out, inmaps=out.shape[1], outmaps=self.channels[-1],
                     kernel_size=3, name_scope='Discriminator/Convfinal')
    out = F.reshape(out, (N, -1), inplace=False)

    # Linear Layers
    lrmul = 1
    scale = 1 / (out.shape[1]**0.5) * lrmul
    W, bias = weight_init_fn(
        (out.shape[-1], self.channels[-1]),
        weight_var='Discriminator/final_linear_1/affine')
    out = F.affine(out, W * scale, bias * lrmul)
    out = F.mul_scalar(F.leaky_relu(
        out, alpha=0.2, inplace=False), np.sqrt(2), inplace=False)

    scale = 1 / (out.shape[1]**0.5) * lrmul
    W, bias = weight_init_fn(
        (out.shape[-1], 1),
        weight_var='Discriminator/final_linear_2/affine')
    out = F.affine(out, W * scale, bias * lrmul)
    return out
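# The middle block above is the minibatch-stddev trick used in StyleGAN2-style
# discriminators: the per-group standard deviation of features is averaged and
# tiled back as extra channels. A numpy sketch of the same statistic, mirroring
# the reshape/var/mean/tile sequence (illustrative only; names are ours):
import numpy as np

def minibatch_stddev_sketch(x, group, stddev_feat=1, eps=1e-8):
    # x: (N, C, H, W) feature map as a numpy array
    N, C, H, W = x.shape
    y = x.reshape(group, -1, stddev_feat, C // stddev_feat, H, W)
    y = np.sqrt(y.var(axis=0) + eps)   # stddev across the group axis
    y = y.mean(axis=(2, 3, 4))         # -> (N // group, stddev_feat)
    y = y.reshape(y.shape[0], y.shape[1], 1, 1)
    y = np.tile(y, (group, 1, H, W))   # -> (N, stddev_feat, H, W)
    return np.concatenate([x, y], axis=1)  # append the stddev channel(s)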
def get_sample_and_feedback(args, data_dict):
    """
    Let the controller predict one architecture and test its performance
    to get feedback. Here the feedback is validation accuracy and will be
    reused to train the controller.
    """
    skip_weight = args.skip_weight
    entropy_weight = args.entropy_weight
    bl_dec = args.baseline_decay

    arc_seq, log_probs, entropys, skip_penaltys = sample_from_controller(args)

    sample_arch = list()
    for arc in arc_seq:
        sample_arch.extend(arc.tolist())

    show_arch(sample_arch)

    sample_entropy = entropys
    sample_log_prob = log_probs

    nn.set_auto_forward(False)
    val_acc = CNN_run(args, sample_arch, data_dict)  # Execute Evaluation Only
    nn.set_auto_forward(True)

    print("Accuracy on Validation: {:.2f} %\n".format(100 * val_acc))

    reward = val_acc  # use validation accuracy as reward

    if entropy_weight is not None:
        reward = F.add_scalar(F.mul_scalar(
            sample_entropy, entropy_weight), reward).d

    sample_log_prob = F.mul_scalar(sample_log_prob, (1 / args.num_candidate))

    if args.use_variance_reduction:
        baseline = 0.0
        # variance reduction
        baseline = baseline - ((1 - bl_dec) * (baseline - reward))
        reward = reward - baseline

    loss = F.mul_scalar(sample_log_prob, (-1) * reward)

    if skip_weight is not None:
        adding_penalty = F.mul_scalar(skip_penaltys, skip_weight)
        loss = F.add2(loss, adding_penalty)

    return loss, val_acc, sample_arch
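# The loss above is a REINFORCE estimator for the controller. A scalar numeric
# sketch of the same reward shaping, with every name illustrative (plain
# Python, no nnabla):
def reinforce_loss_sketch(log_prob, entropy, val_acc, skip_penalty=0.0,
                          entropy_weight=1e-4, skip_weight=None,
                          bl_dec=0.99, use_variance_reduction=False):
    reward = val_acc
    if entropy_weight is not None:
        reward += entropy_weight * entropy  # entropy bonus encourages exploration
    if use_variance_reduction:
        baseline = 0.0
        baseline -= (1 - bl_dec) * (baseline - reward)
        reward -= baseline  # subtract a baseline to reduce gradient variance
    loss = -log_prob * reward  # maximize expected reward
    if skip_weight is not None:
        loss += skip_weight * skip_penalty  # discourage dense skip connections
    return loss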
def vgg16(x):
    # Input: x -> 3,300,300

    # VGG11/MulScalar
    h = F.mul_scalar(x, 0.01735)
    # VGG11/AddScalar
    h = F.add_scalar(h, -1.99)
    # VGG11/Convolution -> 64,300,300
    h = PF.convolution(h, 64, (3, 3), (1, 1), name='Convolution')
    # VGG11/ReLU
    h = F.relu(h, True)
    # VGG11/MaxPooling -> 64,150,150
    h = F.max_pooling(h, (2, 2), (2, 2))
    # VGG11/Convolution_3 -> 128,150,150
    h = PF.convolution(h, 128, (3, 3), (1, 1), name='Convolution_3')
    # VGG11/ReLU_3
    h = F.relu(h, True)
    # VGG11/MaxPooling_2 -> 128,75,75
    h = F.max_pooling(h, (2, 2), (2, 2))
    # VGG11/Convolution_5 -> 256,75,75
    h = PF.convolution(h, 256, (3, 3), (1, 1), name='Convolution_5')
    # VGG11/ReLU_5
    h = F.relu(h, True)
    # VGG11/Convolution_6
    h = PF.convolution(h, 256, (3, 3), (1, 1), name='Convolution_6')
    # VGG11/ReLU_6
    h = F.relu(h, True)
    # VGG11/MaxPooling_3 -> 256,38,38
    h = F.max_pooling(h, (2, 2), (2, 2), True, (1, 1))
    # VGG11/Convolution_8 -> 512,38,38
    h = PF.convolution(h, 512, (3, 3), (1, 1), name='Convolution_8')
    # VGG11/ReLU_8
    h = F.relu(h, True)
    # VGG11/Convolution_9
    h = PF.convolution(h, 512, (3, 3), (1, 1), name='Convolution_9')
    # VGG11/ReLU_9
    h = F.relu(h, True)
    # # VGG11/MaxPooling_4 -> 512,19,19
    # h = F.max_pooling(h, (2, 2), (2, 2))
    # # VGG11/Convolution_11
    # h = PF.convolution(h, 512, (3, 3), (1, 1), name='Convolution_11')
    # # VGG11/ReLU_11
    # h = F.relu(h, True)
    # # VGG11/Convolution_12
    # h = PF.convolution(h, 512, (3, 3), (1, 1), name='Convolution_12')
    # # VGG11/ReLU_12
    # h = F.relu(h, True)
    return h
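# A usage sketch for the backbone above (assuming `nn`, `F`, `PF` are imported
# as in the surrounding code): the inline shape comments imply that a
# 3x300x300 input yields a 512x38x38 feature map.
def vgg16_shape_check():
    x = nn.Variable((1, 3, 300, 300))
    features = vgg16(x)
    assert features.shape == (1, 512, 38, 38)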
def test_clear_input_if_no_need_grad2(self):
    x1 = nn.Variable([1, 5], need_grad=True)

    xx1 = F.identity(x1)  # (1)
    y1 = F.tanh(xx1)  # (2)
    y2 = F.add_scalar(y1)  # (3)

    answer = []
    answer.append([False])
    answer.append([True])
    # y1 must not be cleared after (3) because y1 is required for the
    # backward of (2).
    answer.append([False])

    y2.forward(clear_no_need_grad=True)
    self.check_input_data_clear_called_flags(answer)
def test_clear_output_grad_argument(self, grad):
    x1 = nn.Variable([1], need_grad=True)
    xx1 = F.identity(x1)
    y1 = F.add_scalar(xx1)

    answer_grad = []
    if grad is None or isinstance(grad, nn.NdArray):
        answer_grad.append([False])  # y1
    else:
        answer_grad.append([True])  # y1
    answer_grad.append([True])  # xx1

    y1.forward(clear_no_need_grad=True)
    clear_called_flag_recorder.deactivate_clear_called_flag_recorder()
    clear_called_flag_recorder.activate_clear_called_flag_recorder()
    y1.backward(clear_buffer=True, grad=grad)
    self.check_grad_cleared_flags(answer_grad)
    assert y1.grad.clear_called == False
def __sub__(self, other):
    """
    Element-wise subtraction.

    Implements the subtraction operator expression ``A - B``, together with
    :func:`~nnabla.variable.__rsub__` .
    When a scalar is specified for ``other``, this function performs an
    element-wise operation for all elements in ``self``.

    Args:
        other (float or ~nnabla.Variable): Internally calling
            :func:`~nnabla.functions.sub2` or
            :func:`~nnabla.functions.add_scalar` according to the type.

    Returns: :class:`nnabla.Variable`

    """
    import nnabla.functions as F
    if isinstance(other, Variable):
        return F.sub2(self, other)
    return F.add_scalar(self, -other)
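# A minimal usage sketch of the operator above (assuming standard nnabla
# semantics: Variable graphs are lazy, so forward() is needed before reading
# values):
import numpy as np
import nnabla as nn
import nnabla.functions as F

a = nn.Variable.from_numpy_array(np.ones((2, 3)))
b = nn.Variable.from_numpy_array(np.full((2, 3), 0.5))
c = a - b    # Variable - Variable dispatches to F.sub2(a, b)
d = a - 2.0  # Variable - scalar dispatches to F.add_scalar(a, -2.0)
F.sink(c, d).forward()
assert np.allclose(c.d, 0.5) and np.allclose(d.d, -1.0)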
def get_sample_and_feedback(args, data_dict):
    """
    Let the controller predict one architecture and test its performance
    to get feedback. Here the feedback is validation accuracy and will be
    reused to train the controller.
    """
    entropy_weight = args.entropy_weight
    bl_dec = args.baseline_decay

    both_archs, log_probs, entropys = sample_from_controller(args)

    sample_entropy = entropys
    sample_log_prob = log_probs

    show_arch(both_archs)

    nn.set_auto_forward(False)
    val_acc = CNN_run(args, both_archs, data_dict)
    nn.set_auto_forward(True)

    print("Accuracy on Validation: {:.2f} %\n".format(100 * val_acc))

    reward = val_acc

    if entropy_weight is not None:
        reward = F.add_scalar(F.mul_scalar(
            sample_entropy, entropy_weight), reward).d

    sample_log_prob = F.mul_scalar(sample_log_prob, (1 / args.num_candidate))

    if args.use_variance_reduction:
        baseline = 0.0
        # variance reduction
        baseline = baseline - ((1 - bl_dec) * (baseline - reward))
        reward = reward - baseline

    loss = F.mul_scalar(sample_log_prob, (-1) * reward)

    return loss, val_acc, both_archs
def test_imperative_i1_o1():
    import nnabla.functions as F
    x = nn.NdArray([2, 3, 4])
    x.fill(1)
    x1 = F.add_scalar(x, 1)
    assert np.allclose(x1.data, 2)
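# A companion sketch contrasting the two execution modes exercised above
# (assuming standard nnabla semantics: NdArray inputs run eagerly, Variable
# inputs build a graph that must be evaluated with forward()):
def imperative_vs_graph_sketch():
    import nnabla.functions as F
    a = nn.NdArray([2, 3, 4])
    a.fill(1)
    eager = F.add_scalar(a, 1)  # NdArray in -> NdArray out, computed now

    v = nn.Variable([2, 3, 4])
    v.data.fill(1)
    lazy = F.add_scalar(v, 1)   # Variable in -> graph node, not computed yet
    lazy.forward()              # now lazy.d == 2 everywhere
    return eager.data, lazy.d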
def __call__(self, batch_size, style_noises, truncation_psi=1.0,
             return_latent=False, mixing_layer_index=None,
             dlatent_avg_beta=0.995):
    with nn.parameter_scope(self.global_scope):
        # normalize noise inputs
        for i in range(len(style_noises)):
            style_noises[i] = F.div2(
                style_noises[i],
                F.pow_scalar(F.add_scalar(
                    F.mean(style_noises[i] ** 2., axis=1, keepdims=True),
                    1e-8, inplace=False), 0.5, inplace=False))

        # get latent code
        w = [mapping_network(style_noises[0],
                             outmaps=self.mapping_network_dim,
                             num_layers=self.mapping_network_num_layers)]
        w += [mapping_network(style_noises[1],
                              outmaps=self.mapping_network_dim,
                              num_layers=self.mapping_network_num_layers)]

        dlatent_avg = nn.parameter.get_parameter_or_create(
            name="dlatent_avg", shape=(1, 512))

        # Moving average update of dlatent_avg
        batch_avg = F.mean((w[0] + w[1]) * 0.5, axis=0, keepdims=True)
        update_op = F.assign(
            dlatent_avg, lerp(batch_avg, dlatent_avg, dlatent_avg_beta))
        update_op.name = 'dlatent_avg_update'
        dlatent_avg = F.identity(dlatent_avg) + 0 * update_op

        # truncation trick
        w = [lerp(dlatent_avg, _, truncation_psi) for _ in w]

        # generate output from generator
        constant_bc = nn.parameter.get_parameter_or_create(
            name="G_synthesis/4x4/Const/const", shape=(1, 512, 4, 4),
            initializer=np.random.randn(1, 512, 4, 4).astype(np.float32))
        constant_bc = F.broadcast(
            constant_bc, (batch_size,) + constant_bc.shape[1:])

        if mixing_layer_index is None:
            mixing_layer_index_var = F.randint(
                1, len(self.resolutions) * 2, (1,))
        else:
            mixing_layer_index_var = F.constant(
                val=mixing_layer_index, shape=(1,))
        mixing_switch_var = F.clip_by_value(
            F.arange(0, len(self.resolutions) * 2) - mixing_layer_index_var,
            0, 1)
        mixing_switch_var_re = F.reshape(
            mixing_switch_var, (1, mixing_switch_var.shape[0], 1),
            inplace=False)
        w0 = F.reshape(w[0], (batch_size, 1, w[0].shape[1]), inplace=False)
        w1 = F.reshape(w[1], (batch_size, 1, w[0].shape[1]), inplace=False)
        w_mixed = w0 * mixing_switch_var_re + \
            w1 * (1 - mixing_switch_var_re)

        rgb_output = self.synthesis(w_mixed, constant_bc)

        if return_latent:
            return rgb_output, w_mixed
        else:
            return rgb_output
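# `lerp` is used above both for the moving average of dlatent_avg and for the
# truncation trick. A sketch consistent with both call sites (an assumption
# about the helper, which is defined elsewhere in the repository):
def lerp(a, b, t):
    # t=0 returns a, t=1 returns b. With truncation_psi < 1, the latent w is
    # pulled back toward dlatent_avg; with t=dlatent_avg_beta close to 1, the
    # moving average keeps most of its previous value.
    return a + (b - a) * t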
def generate_attribute_direction(args, attribute_prediction_model):
    if not os.path.isfile(os.path.join(args.weights_path, 'gen_params.h5')):
        os.makedirs(args.weights_path, exist_ok=True)
        print("Downloading the pretrained tf-converted weights. Please wait...")
        url = "https://nnabla.org/pretrained-models/nnabla-examples/GANs/stylegan2/styleGAN2_G_params.h5"
        from nnabla.utils.data_source_loader import download
        download(url, os.path.join(args.weights_path, 'gen_params.h5'), False)

    nn.load_parameters(os.path.join(args.weights_path, 'gen_params.h5'))
    print('Loaded pretrained weights from tensorflow!')

    nn.load_parameters(args.classifier_weight_path)
    print(f'Loaded {args.classifier_weight_path}')

    batches = [args.batch_size
               for _ in range(args.num_images // args.batch_size)]
    if args.num_images % args.batch_size != 0:
        batches.append(
            args.num_images - (args.num_images // args.batch_size) * args.batch_size)

    w_plus, w_minus = 0.0, 0.0
    w_plus_count, w_minus_count = 0.0, 0.0
    pbar = trange(len(batches))
    for i in pbar:
        batch_size = batches[i]
        z = [F.randn(shape=(batch_size, 512)).data]
        z = [z[0], z[0]]

        # renamed loop variable from `i` to `j` to avoid shadowing the batch index
        for j in range(len(z)):
            z[j] = F.div2(
                z[j],
                F.pow_scalar(F.add_scalar(
                    F.mean(z[j] ** 2., axis=1, keepdims=True), 1e-8),
                    0.5, inplace=True))

        # get latent code
        w = [mapping_network(z[0], outmaps=512, num_layers=8)]
        w += [mapping_network(z[1], outmaps=512, num_layers=8)]

        # truncation trick
        dlatent_avg = nn.parameter.get_parameter_or_create(
            name="dlatent_avg", shape=(1, 512))
        w = [lerp(dlatent_avg, _, 0.7) for _ in w]

        constant_bc = nn.parameter.get_parameter_or_create(
            name="G_synthesis/4x4/Const/const", shape=(1, 512, 4, 4))
        constant_bc = F.broadcast(
            constant_bc, (batch_size,) + constant_bc.shape[1:])

        gen = synthesis(w, constant_bc, noise_seed=100, mix_after=7)

        classifier_score = F.softmax(attribute_prediction_model(gen, True))
        confidence, class_pred = F.max(
            classifier_score, axis=1, with_index=True, keepdims=True)

        w_plus += np.sum(w[0].data * (class_pred.data == 0) *
                         (confidence.data > 0.65), axis=0, keepdims=True)
        w_minus += np.sum(w[0].data * (class_pred.data == 1) *
                          (confidence.data > 0.65), axis=0, keepdims=True)

        w_plus_count += np.sum(
            (class_pred.data == 0) * (confidence.data > 0.65))
        w_minus_count += np.sum(
            (class_pred.data == 1) * (confidence.data > 0.65))

        pbar.set_description(f'{w_plus_count} {w_minus_count}')

    # save attribute direction
    attribute_variation_direction = (
        w_plus / w_plus_count) - (w_minus / w_minus_count)
    print(w_plus_count, w_minus_count)
    np.save(f'{args.classifier_weight_path.split("/")[0]}/direction.npy',
            attribute_variation_direction)
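# In short, the saved attribute direction is the difference of mean latent
# codes between confident positive and negative classifier predictions:
#
#     direction = mean(w | pred == 0, conf > 0.65)
#               - mean(w | pred == 1, conf > 0.65)
#
# Adding a multiple of this vector to a latent w then shifts the generated
# image along the attribute, as done in generate_data below.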
def generate_data(args):
    if not os.path.isfile(os.path.join(args.weights_path, 'gen_params.h5')):
        os.makedirs(args.weights_path, exist_ok=True)
        print("Downloading the pretrained tf-converted weights. Please wait...")
        url = "https://nnabla.org/pretrained-models/nnabla-examples/GANs/stylegan2/styleGAN2_G_params.h5"
        from nnabla.utils.data_source_loader import download
        download(url, os.path.join(args.weights_path, 'gen_params.h5'), False)

    nn.load_parameters(os.path.join(args.weights_path, 'gen_params.h5'))
    print('Loaded pretrained weights from tensorflow!')

    os.makedirs(args.save_image_path, exist_ok=True)

    batches = [args.batch_size
               for _ in range(args.num_images // args.batch_size)]
    if args.num_images % args.batch_size != 0:
        batches.append(
            args.num_images - (args.num_images // args.batch_size) * args.batch_size)

    for idx, batch_size in enumerate(batches):
        z = [F.randn(shape=(batch_size, 512)).data,
             F.randn(shape=(batch_size, 512)).data]

        for i in range(len(z)):
            z[i] = F.div2(
                z[i],
                F.pow_scalar(F.add_scalar(
                    F.mean(z[i] ** 2., axis=1, keepdims=True), 1e-8),
                    0.5, inplace=True))

        # get latent code
        w = [mapping_network(z[0], outmaps=512, num_layers=8)]
        w += [mapping_network(z[1], outmaps=512, num_layers=8)]

        # truncation trick
        dlatent_avg = nn.parameter.get_parameter_or_create(
            name="dlatent_avg", shape=(1, 512))
        w = [lerp(dlatent_avg, _, 0.7) for _ in w]

        # Load direction
        if not args.face_morph:
            attr_delta = nn.NdArray.from_numpy_array(
                np.load(args.attr_delta_path))
            attr_delta = F.reshape(attr_delta[0], (1, -1))
            w_plus = [w[0] + args.coeff * attr_delta, w[1]]
            w_minus = [w[0] - args.coeff * attr_delta, w[1]]
        else:
            w_plus = [w[0], w[0]]  # content
            w_minus = [w[1], w[1]]  # style

        constant_bc = nn.parameter.get_parameter_or_create(
            name="G_synthesis/4x4/Const/const", shape=(1, 512, 4, 4))
        constant_bc = F.broadcast(
            constant_bc, (batch_size,) + constant_bc.shape[1:])

        gen_plus = synthesis(w_plus, constant_bc, noise_seed=100, mix_after=8)
        gen_minus = synthesis(w_minus, constant_bc, noise_seed=100, mix_after=8)
        gen = synthesis(w, constant_bc, noise_seed=100, mix_after=8)

        image_plus = convert_images_to_uint8(gen_plus, drange=[-1, 1])
        image_minus = convert_images_to_uint8(gen_minus, drange=[-1, 1])
        image = convert_images_to_uint8(gen, drange=[-1, 1])

        for j in range(batch_size):
            filepath = os.path.join(
                args.save_image_path, f'image_{idx * batch_size + j}')
            imsave(f'{filepath}_o.png', image_plus[j], channel_first=True)
            imsave(f'{filepath}_y.png', image_minus[j], channel_first=True)
            imsave(f'{filepath}.png', image[j], channel_first=True)
            print(f"Generated. Saved {filepath}")
def CNN_run(args, ops, arch_dict):
    """
    Based on the given model architecture, construct a CNN and execute training.

    Args:
        args: arguments set by the user.
        ops: operations used in the network.
        arch_dict: a dictionary containing architecture information.
    """

    data_iterator = data_iterator_cifar10
    tdata = data_iterator(args.batch_size, True)
    vdata = data_iterator(args.batch_size, False)

    # CIFAR10 statistics, mean and variance
    CIFAR_MEAN = np.reshape([0.49139968, 0.48215827, 0.44653124], (1, 3, 1, 1))
    CIFAR_STD = np.reshape([0.24703233, 0.24348505, 0.26158768], (1, 3, 1, 1))

    channels, image_height, image_width = 3, 32, 32
    batch_size = args.batch_size
    initial_model_lr = args.model_lr

    one_epoch = tdata.size // batch_size
    max_iter = args.epoch * one_epoch
    val_iter = 10000 // batch_size

    # Create monitor.
    monitor = Monitor(args.monitor_path)
    monitor_loss = MonitorSeries("Training loss", monitor, interval=100)
    monitor_err = MonitorSeries("Training error", monitor, interval=100)
    monitor_vloss = MonitorSeries("Test loss", monitor, interval=100)
    monitor_verr = MonitorSeries("Test error", monitor, interval=100)

    # prepare variables and graph used for test
    image_valid = nn.Variable(
        (batch_size, channels, image_height, image_width))
    label_valid = nn.Variable((batch_size, 1))
    input_image_valid = {"image": image_valid, "label": label_valid}
    pred_valid, _ = construct_networks(
        args, ops, arch_dict, image_valid, test=True)
    loss_valid = loss_function(pred_valid, label_valid)

    # set dropout rate in advance
    nn.parameter.get_parameter_or_create(
        "drop_rate", shape=(1, 1, 1, 1), need_grad=False)
    initial_drop_rate = nn.Variable((1, 1, 1, 1)).apply(d=args.dropout_rate)
    nn.parameter.set_parameter("drop_rate", initial_drop_rate)

    # prepare variables and graph used for training
    image_train = nn.Variable(
        (batch_size, channels, image_height, image_width))
    label_train = nn.Variable((batch_size, 1))
    input_image_train = {"image": image_train, "label": label_train}
    pred_train, aux_logits = construct_networks(
        args, ops, arch_dict, image_train, test=False)
    loss_train = loss_function(
        pred_train, label_train, aux_logits, args.auxiliary_weight)

    # prepare solvers
    model_params_dict = nn.get_parameters()
    solver_model = S.Momentum(initial_model_lr)
    solver_model.set_parameters(
        model_params_dict, reset=False, retain_state=True)

    # Training-loop
    for curr_epoch in range(args.epoch):
        print("epoch {}".format(curr_epoch))
        curr_dropout_rate = F.add_scalar(
            F.mul_scalar(initial_drop_rate, (curr_epoch / args.epoch)), 1e-8)
        nn.parameter.set_parameter("drop_rate", curr_dropout_rate)

        for i in range(one_epoch):
            image, label = tdata.next()
            image = image / 255.0
            image = (image - CIFAR_MEAN) / CIFAR_STD
            if args.cutout:
                image = cutout(image, args)
            input_image_train["image"].d = image
            input_image_train["label"].d = label

            loss_train.forward(clear_no_need_grad=True)

            e = categorical_error(pred_train.d, input_image_train["label"].d)
            monitor_loss.add(one_epoch * curr_epoch + i, loss_train.d.copy())
            monitor_err.add(one_epoch * curr_epoch + i, e)

            if args.lr_control_model:
                new_lr = learning_rate_scheduler(
                    one_epoch * curr_epoch + i, max_iter, initial_model_lr, 0)
                solver_model.set_learning_rate(new_lr)

            solver_model.zero_grad()
            loss_train.backward(clear_buffer=True)

            if args.with_grad_clip_model:
                for k, v in model_params_dict.items():
                    v.grad.copy_from(
                        F.clip_by_norm(v.grad, args.grad_clip_value_model))

            # update parameters
            solver_model.weight_decay(args.weight_decay_model)
            solver_model.update()

            if (one_epoch * curr_epoch + i) % args.model_save_interval == 0:
                nn.save_parameters(os.path.join(
                    args.model_save_path,
                    'params_{}.h5'.format(one_epoch * curr_epoch + i)))

        # Validation during training.
        ve = 0.
        vloss = 0.
        for j in range(val_iter):
            image, label = vdata.next()
            image = image / 255.0
            image = (image - CIFAR_MEAN) / CIFAR_STD
            input_image_valid["image"].d = image
            input_image_valid["label"].d = label
            loss_valid.forward(clear_no_need_grad=True)
            vloss += loss_valid.d.copy()
            ve += categorical_error(pred_valid.d.copy(), label)
        ve /= val_iter
        vloss /= val_iter
        monitor_vloss.add(one_epoch * curr_epoch + i, vloss)
        monitor_verr.add(one_epoch * curr_epoch + i, ve)

    return
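# `learning_rate_scheduler` is defined elsewhere; a cosine-annealing schedule
# is a common choice consistent with its (iter, max_iter, base_lr, end_lr)
# call signature above. Purely an illustrative assumption, not the repository's
# actual implementation:
import math

def cosine_lr_sketch(curr_iter, max_iter, base_lr, end_lr=0.0):
    # Decay smoothly from base_lr (iter 0) to end_lr (iter max_iter).
    cos_factor = 0.5 * (1.0 + math.cos(math.pi * curr_iter / max_iter))
    return end_lr + (base_lr - end_lr) * cos_factor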
def __call__(self, gen_rgb_out, patch_switch=False, index=0):
    out = conv_layer(gen_rgb_out, inmaps=3, outmaps=self.channels[0],
                     kernel_size=1, name_scope='Discriminator/Convinitial')
    inmaps = self.channels[0]
    out_list = [out]
    for i in range(1, len(self.resolutions)):
        res = out.shape[2]
        outmaps = self.channels[i]
        out = res_block(out, res=res, outmaps=outmaps, inmaps=inmaps)
        inmaps = outmaps
        out_list.append(out)

    if patch_switch:
        GV_class = GetVariablesOnGraph(out)
        GF_class = GetFunctionFromInput(out, func_type_list=['LeakyReLU'])
        feature_dict = OrderedDict()
        for key in GV_class.coef_dict_on_graph:
            if ('res_block' in key and '/W' in key) and ('Conv1' in key or 'Conv2' in key):
                feature_var = GF_class.functions[key][0].outputs[
                    0].function_references[0].outputs[0]
                if feature_var.shape[2:] in ((32, 32), (16, 16)):
                    feature_dict[key] = feature_var

    N, C, H, W = out.shape
    group = min(N, self.stddev_group)
    stddev_mean = F.reshape(
        out, (group, -1, self.stddev_feat, C // self.stddev_feat, H, W),
        inplace=False)
    mean = F.mul_scalar(F.sum(stddev_mean, axis=0, keepdims=True),
                        1.0 / stddev_mean.shape[0], inplace=False)
    stddev_mean = F.mean(F.pow_scalar(
        F.sub2(stddev_mean, F.broadcast(mean, stddev_mean.shape)), 2.),
        axis=0, keepdims=False)
    stddev_mean = F.pow_scalar(F.add_scalar(
        stddev_mean, 1e-8, inplace=False), 0.5, inplace=False)
    stddev_mean = F.mean(stddev_mean, axis=[2, 3, 4], keepdims=True)
    stddev_mean = F.reshape(
        stddev_mean, stddev_mean.shape[:2] + stddev_mean.shape[3:],
        inplace=False)
    out = F.concatenate(out, F.tile(stddev_mean, (group, 1, H, W)), axis=1)
    out = conv_layer(out, inmaps=out.shape[1], outmaps=self.channels[-1],
                     kernel_size=3, name_scope='Discriminator/Convfinal')
    out = F.reshape(out, (N, -1), inplace=False)

    # Linear Layers
    lrmul = 1
    scale = 1 / (out.shape[1]**0.5) * lrmul
    W, bias = weight_init_fn(
        (out.shape[-1], self.channels[-1]),
        weight_var='Discriminator/final_linear_1/affine')
    out = F.affine(out, W * scale, bias * lrmul)
    out = F.mul_scalar(F.leaky_relu(out, alpha=0.2, inplace=False),
                       np.sqrt(2), inplace=False)

    scale = 1 / (out.shape[1]**0.5) * lrmul
    W, bias = weight_init_fn(
        (out.shape[-1], 1),
        weight_var='Discriminator/final_linear_2/affine')
    out = F.affine(out, W * scale, bias * lrmul)

    if patch_switch:
        return out, list(feature_dict.values())[index]
    else:
        return out