def __init__(self, nf, nout, kwargs):
    for pname in DNN.param_names:
        setattr(self, pname, kwargs[pname])

    if self.activation == "elu":
        nonlin = lambda x: T.switch(x >= 0, x, T.exp(x) - 1)
    else:
        self.activation = "rectify" if self.activation == "relu" else self.activation
        nonlin = getattr(lasagne.nonlinearities, self.activation)
    self.opt = getattr(lasagne.updates, self.opt)

    l_in = lasagne.layers.InputLayer(shape=(None, nf))
    cur_layer = batch_norm(l_in) if self.bnorm else l_in
    cur_layer = lasagne.layers.DropoutLayer(cur_layer, p=self.drates[0]) if self.drates[0] > 0 else cur_layer
    self.layers = [cur_layer]
    for n_hidden, drate in zip(self.n_hidden, self.drates[1:]):
        l_betw = lasagne.layers.DenseLayer(self.layers[-1], num_units=n_hidden, nonlinearity=nonlin)
        cur_layer = batch_norm(l_betw) if self.bnorm else l_betw
        cur_layer = lasagne.layers.DropoutLayer(cur_layer, p=drate) if drate > 0 else cur_layer
        self.layers.append(cur_layer)
    l_out = lasagne.layers.DenseLayer(self.layers[-1], num_units=nout, nonlinearity=None)

    target_output = T.matrix("target_output")
    # Mean over the batch of half the per-sample summed squared error.
    # cost_train = T.mean(lasagne.objectives.squared_error(lasagne.layers.get_output(l_out, deterministic=False), target_output))
    cost_train = T.mean(
        T.sum(
            lasagne.objectives.squared_error(
                lasagne.layers.get_output(l_out, deterministic=False), target_output),
            axis=1,
        ) / 2
    )
    cost_eval = T.mean(
        T.sum(
            lasagne.objectives.squared_error(
                lasagne.layers.get_output(l_out, deterministic=True), target_output),
            axis=1,
        ) / 2
    )
    # cost_eval = T.mean((lasagne.layers.get_output(l_out, deterministic=True)-target_output)**2)

    all_params = lasagne.layers.get_all_params(l_out, trainable=True)
    all_grads = T.grad(cost_train, all_params)
    all_grads, total_norm = lasagne.updates.total_norm_constraint(all_grads, self.norm, return_norm=True)
    # all_grads = [T.switch(T.or_(T.isnan(total_norm), T.isinf(total_norm)), p*0.01, g) for g, p in zip(all_grads, all_params)]
    updates = self.opt(all_grads, all_params, self.lr)

    self.train_model = theano.function(
        inputs=[l_in.input_var, target_output],
        outputs=cost_train,
        updates=updates,
        allow_input_downcast=True,
    )
    self.predict_model = theano.function(
        inputs=[l_in.input_var, target_output],
        outputs=[cost_eval, lasagne.layers.get_output(l_out, deterministic=True)],
        allow_input_downcast=True,
    )
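# A hedged usage sketch for the constructor above. The keys mirror the attribute
# names the code reads (activation, opt, lr, norm, drates, n_hidden, bnorm); the
# concrete values, data arrays, and the assumption that DNN.param_names lists
# exactly these keys are illustrative only.
params = {
    "activation": "relu",       # mapped to lasagne.nonlinearities.rectify above
    "opt": "adam",              # looked up in lasagne.updates
    "lr": 1e-3,                 # passed as the learning rate to the update rule
    "norm": 5.0,                # gradient-norm clipping threshold
    "drates": [0.2, 0.5, 0.5],  # input dropout rate + one rate per hidden layer
    "n_hidden": [512, 512],
    "bnorm": True,
}
dnn = DNN(100, 10, params)
train_cost = dnn.train_model(X_batch, Y_batch)           # X_batch: (n, 100), Y_batch: (n, 10), hypothetical arrays
eval_cost, Y_pred = dnn.predict_model(X_valid, Y_valid)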
def net_multi_base_named_dilated(X, nfilt, doBatchNorm, trainPhase, pool_stride, pool_size, conf):
    inDim = X.get_shape()[3]
    with tf.variable_scope('layer1'):
        conv1 = conv_relu(X, [5, 5, inDim, 48], 0.01, 0, doBatchNorm, trainPhase)
        norm1 = norm('norm1', conv1, lsize=2)
    with tf.variable_scope('layer2'):
        weights = tf.get_variable(
            "weights", [3, 3, 48, nfilt],
            initializer=tf.contrib.layers.xavier_initializer())
        biases = tf.get_variable("biases", nfilt, initializer=tf.constant_initializer(1))
        conv2 = tf.nn.convolution(norm1, weights, strides=[1, 1], padding='SAME', dilation_rate=[4, 4])
        if doBatchNorm:
            conv2 = batch_norm(conv2, trainPhase)
        conv2 = tf.nn.relu(conv2 + biases)
        norm2 = norm('norm2', conv2, lsize=4)
    with tf.variable_scope('layer3'):
        weights = tf.get_variable(
            "weights", [3, 3, nfilt, nfilt],
            initializer=tf.contrib.layers.xavier_initializer())
        biases = tf.get_variable("biases", nfilt, initializer=tf.constant_initializer(1))
        conv3 = tf.nn.convolution(norm2, weights, strides=[1, 1], padding='SAME', dilation_rate=[2, 2])
        if doBatchNorm:
            conv3 = batch_norm(conv3, trainPhase)
        conv3 = tf.nn.relu(conv3 + biases)
    with tf.variable_scope('layer4'):
        conv4 = conv_relu(conv3, [3, 3, nfilt, nfilt], 0.01, 1, doBatchNorm, trainPhase)
    with tf.variable_scope('layer5'):
        conv5 = conv_relu(conv4, [3, 3, nfilt, nfilt], 0.01, 1, doBatchNorm, trainPhase)
    out_dict = {
        'conv1': conv1, 'conv2': conv2, 'conv3': conv3,
        'conv4': conv4, 'conv5': conv5,
        'norm1': norm1, 'norm2': norm2,
    }
    return conv5, out_dict
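# `conv_relu` and `batch_norm` are defined elsewhere in this section; `norm` is not
# shown. A minimal sketch of what it presumably is (AlexNet-style local response
# normalization), assuming `lsize` maps to the depth radius:
import tensorflow as tf

def norm(name, inp, lsize=4):
    # Normalize each activation by the energy of `lsize` neighboring channels;
    # bias/alpha/beta below are the usual AlexNet defaults, assumed here.
    return tf.nn.lrn(inp, lsize, bias=1.0, alpha=0.001 / 9.0, beta=0.75, name=name)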
def ModelSimple(X, is_training):
    h_1 = lrelu(batch_norm(conv2d(X, 32, name='conv1'), is_training, scope='bn1'), name='lrelu1')
    h_2 = lrelu(batch_norm(conv2d(h_1, 64, name='conv2'), is_training, scope='bn2'), name='lrelu2')
    h_3 = lrelu(batch_norm(conv2d(h_2, 64, name='conv3'), is_training, scope='bn3'), name='lrelu3')
    h_3_flat = tf.reshape(h_3, [-1, 64 * 4 * 4])
    return linear(h_3_flat, 10)
def ConvBNRelu(input, kernelSize, outputSize, is_training):
    inputSize = input.get_shape()[3].value
    weights.append(CreateWeight(kernelSize, inputSize, outputSize))
    conv = tf.nn.conv2d(input, weights[-1], strides=[1, 1, 1, 1], padding='SAME')
    # conv = tf.nn.batch_normalization(conv, 0.001, 1.0, 0, 1, 0.0001)
    conv = batch_norm(conv, is_training)
    return tf.nn.relu(conv)
def bn_relu_dropout_conv(input_layer, filter_shape, strides, bn_param, keep_prob, device):
    layer = [batch_norm(input_layer, bn_param, device=device)]
    layer.append(tf.nn.relu(layer[-1]))
    if FLAGS.keep_prob is not None:
        layer.append(tf.nn.dropout(layer[-1], keep_prob[0]))
    layer.append(convolution_layer(layer[-1], shape=filter_shape, strides=strides, bias=False,
                                   layer_name='conv', device=device))
    return layer[-1]
def conv_relu(X, kernel_shape, conv_std, bias_val, doBatchNorm, trainPhase):
    weights = tf.get_variable("weights", kernel_shape,
                              initializer=tf.random_normal_initializer(stddev=conv_std))
    biases = tf.get_variable("biases", kernel_shape[-1],
                             initializer=tf.constant_initializer(bias_val))
    conv = tf.nn.conv2d(X, weights, strides=[1, 1, 1, 1], padding='SAME')
    if doBatchNorm:
        conv = batch_norm(conv, trainPhase)
    return tf.nn.relu(conv + biases)
def _act(x, name="bn_act"): """Batch-normalized activation function. Args: x: Input tensor. name: Name for the output tensor. Returns: normed: Output tensor. """ n_out = x.get_shape()[-1] with tf.variable_scope("bn_params"): if affine: beta = nn.weight_variable([n_out], init_method="constant", dtype=dtype, init_param={"val": 0.0}, name="beta") gamma = nn.weight_variable([n_out], init_method="constant", dtype=dtype, init_param={"val": 1.0}, name="gamma") else: beta = None gamma = None if learn_sigma: sigma = nn.weight_variable([1], init_method="constant", dtype=dtype, init_param={"val": sigma_init}, name="sigma") else: sigma = sigma_init eps = sigma**2 x_normed, x_mean = batch_norm(x, n_out, is_training, gamma=gamma, beta=beta, eps=eps, axes=axes, scope=scope, name=name, return_mean=True) if l1_reg > 0.0: l1_collection.append(l1_loss(x, x_mean=x_mean, alpha=l1_reg)) return act(x_normed)
def inference(input_tensor_batch, bn_param, keep_prob, n, k, num_classes, device):
    layers = []
    with tf.variable_scope('group1'):
        conv0 = convolution_layer(input_tensor_batch, shape=[3, 3, 3, 16], strides=[1, 1, 1, 1],
                                  bias=False, layer_name='conv0', device=device)
        layers.append(conv0)
    for i in range(n):
        with tf.variable_scope('group2_block%d' % i):
            if i == 0 and k != 1:
                conv1 = first_residual_block(layers[-1], 16 * k, bn_param, keep_prob,
                                             down_sample=False, device=device)
            else:
                conv1 = residual_block(layers[-1], 16 * k, bn_param, keep_prob, device=device)
            layers.append(conv1)
    for i in range(n):
        with tf.variable_scope('group3_block%d' % i):
            if i == 0:
                conv2 = first_residual_block(layers[-1], 32 * k, bn_param, keep_prob,
                                             down_sample=True, device=device)
            else:
                conv2 = residual_block(layers[-1], 32 * k, bn_param, keep_prob, device=device)
            layers.append(conv2)
    for i in range(n):
        with tf.variable_scope('group4_block%d' % i):
            if i == 0:
                conv3 = first_residual_block(layers[-1], 64 * k, bn_param, keep_prob,
                                             down_sample=True, device=device)
            else:
                conv3 = residual_block(layers[-1], 64 * k, bn_param, keep_prob, device=device)
            layers.append(conv3)
    assert conv3.get_shape().as_list()[1:] == [8, 8, 64 * k]

    with tf.variable_scope('fc'):
        bn_layer = batch_norm(layers[-1], bn_param, device=device)
        relu_layer = tf.nn.relu(bn_layer)
        global_pool = tf.reduce_mean(relu_layer, [1, 2])
        assert global_pool.get_shape().as_list()[-1:] == [64 * k]
        shape = [global_pool.get_shape().as_list()[-1], num_classes]
        output = full_connection_layer(global_pool, shape=shape, bias=True,
                                       layer_name='output', device=device)
        layers.append(output)
    return layers[-1]
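# `first_residual_block` and `residual_block` are referenced above but not shown.
# A plausible sketch of the plain block, built from the pre-activation helpers
# `bn_relu_conv` and `bn_relu_dropout_conv` defined elsewhere in this section;
# the signature and scope names are assumptions, not the original code.
def residual_block(input_layer, output_channel, bn_param, keep_prob, device):
    input_channel = input_layer.get_shape().as_list()[-1]
    assert input_channel == output_channel  # plain blocks keep the width fixed
    with tf.variable_scope('conv1_in_block'):
        conv1, _ = bn_relu_conv(input_layer, [3, 3, input_channel, output_channel],
                                [1, 1, 1, 1], bn_param, device)
    with tf.variable_scope('conv2_in_block'):
        conv2 = bn_relu_dropout_conv(conv1, [3, 3, output_channel, output_channel],
                                     [1, 1, 1, 1], bn_param, keep_prob, device)
    # Identity shortcut: two BN-ReLU-conv stages plus the block input (wide-ResNet style).
    return conv2 + input_layer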
def net_multi_conv(X0, X1, X2, _dropout, conf, doBatchNorm, trainPhase):
    imsz = conf.imsz
    rescale = conf.rescale
    pool_scale = conf.pool_scale
    nfilt = conf.nfilt
    pool_stride = conf.pool_stride
    pool_size = conf.pool_size

    if conf.dilation_rate == 4:
        net_to_use = net_multi_base_named_dilated
    else:
        net_to_use = net_multi_base_named
    with tf.variable_scope('scale0'):
        conv5_0, base_dict_0 = net_to_use(X0, nfilt, doBatchNorm, trainPhase, pool_stride, pool_size, conf)
    with tf.variable_scope('scale1'):
        conv5_1, base_dict_1 = net_to_use(X1, nfilt, doBatchNorm, trainPhase, pool_stride, pool_size, conf)
    with tf.variable_scope('scale2'):
        conv5_2, base_dict_2 = net_to_use(X2, nfilt, doBatchNorm, trainPhase, pool_stride, pool_size, conf)

    sz0 = int(math.ceil(float(imsz[0]) / pool_scale / rescale))
    sz1 = int(math.ceil(float(imsz[1]) / pool_scale / rescale))
    conv5_1_up = upscale('5_1', conv5_1, [sz0, sz1])
    conv5_2_up = upscale('5_2', conv5_2, [sz0, sz1])

    # Crop the upscaled lower-resolution layers to match the higher-resolution size.
    conv5_0_sz = tf.Tensor.get_shape(conv5_0).as_list()
    conv5_1_sz = tf.Tensor.get_shape(conv5_1_up).as_list()
    crop_0 = int(old_div((sz0 - conv5_0_sz[1]), 2))
    crop_1 = int(old_div((sz1 - conv5_0_sz[2]), 2))
    curloc = [0, crop_0, crop_1, 0]
    patchsz = tf.to_int32([-1, conv5_0_sz[1], conv5_0_sz[2], -1])
    conv5_1_up = tf.slice(conv5_1_up, curloc, patchsz)
    conv5_2_up = tf.slice(conv5_2_up, curloc, patchsz)
    conv5_1_final_sz = tf.Tensor.get_shape(conv5_1_up).as_list()

    # Concatenate the three scales along the channel axis.
    conv5_cat = tf.concat([conv5_0, conv5_1_up, conv5_2_up], 3)

    with tf.variable_scope('layer6'):
        if hasattr(conf, 'dilation_rate'):
            dilation_rate = [conf.dilation_rate, conf.dilation_rate]
        else:
            dilation_rate = [1, 1]
        weights = tf.get_variable(
            "weights", [conf.psz, conf.psz, conf.numscale * nfilt, conf.nfcfilt],
            initializer=tf.contrib.layers.xavier_initializer())
        biases = tf.get_variable("biases", conf.nfcfilt, initializer=tf.constant_initializer(1))
        conv6 = tf.nn.convolution(conv5_cat, weights, strides=[1, 1], padding='SAME',
                                  dilation_rate=dilation_rate)
        if doBatchNorm:
            conv6 = batch_norm(conv6, trainPhase)
        conv6 = tf.nn.relu(conv6 + biases)
        conv6 = tf.nn.dropout(conv6, _dropout, [conf.batch_size, 1, 1, conf.nfcfilt])

    with tf.variable_scope('layer7'):
        conv7 = conv_relu(conv6, [1, 1, conf.nfcfilt, conf.nfcfilt], 0.005, 1, doBatchNorm, trainPhase)
        # if not doBatchNorm:
        conv7 = tf.nn.dropout(conv7, _dropout, [conf.batch_size, 1, 1, conf.nfcfilt])

    with tf.variable_scope('layer8'):
        l8_weights = tf.get_variable("weights", [1, 1, conf.nfcfilt, conf.n_classes],
                                     initializer=tf.random_normal_initializer(stddev=0.01))
        l8_biases = tf.get_variable("biases", conf.n_classes, initializer=tf.constant_initializer(0))
        # No batch norm for the output layer.
        out = tf.nn.conv2d(conv7, l8_weights, strides=[1, 1, 1, 1], padding='SAME') + l8_biases

    out_dict = {
        'base_dict_0': base_dict_0,
        'base_dict_1': base_dict_1,
        'base_dict_2': base_dict_2,
        'conv6': conv6,
        'conv7': conv7,
    }
    return out, out_dict
def bn_relu_conv(input_layer, filter_shape, strides, bn_param, device):
    bn = batch_norm(input_layer, bn_param, device=device)
    relu = tf.nn.relu(bn)
    conv = convolution_layer(relu, shape=filter_shape, strides=strides, bias=False,
                             layer_name='conv', device=device)
    return conv, relu
def inference(input_tensor, train, regularizer):
    # Layer 1: convolution.
    with tf.variable_scope('layer1-conv1'):
        conv1_weights_g = tf.get_variable(
            "weight_g", shape=[CONV1_SIZE, CONV1_SIZE, NUM_CHANNELS, CONV1_DEEP],
            initializer=tf.truncated_normal_initializer(stddev=0.1))
        conv1_biases_g = tf.get_variable(
            "biases_g", shape=[CONV1_DEEP], initializer=tf.constant_initializer(0.0))
        conv1_weights_o = tf.get_variable(
            "weight_o", shape=[CONV1_SIZE, CONV1_SIZE, NUM_CHANNELS, CONV1_DEEP],
            initializer=tf.truncated_normal_initializer(stddev=0.1))
        #tf.assign(conv1_weights_o, gutils.unit(conv1_weights_o))
        conv1_biases_o = tf.get_variable(
            "biases_o", shape=[CONV1_DEEP], initializer=tf.constant_initializer(0.0))
        conv1_weights_g_tmp = tf.get_variable(
            "weight_g_tmp", shape=[CONV1_SIZE, CONV1_SIZE, NUM_CHANNELS, CONV1_DEEP],
            initializer=tf.truncated_normal_initializer(stddev=1))
        conv1_weights_o_tmp = tf.get_variable(
            "weight_o_tmp", shape=[CONV1_SIZE, CONV1_SIZE, NUM_CHANNELS, CONV1_DEEP],
            initializer=tf.truncated_normal_initializer(stddev=1))
        conv1_biases_g_tmp = tf.get_variable(
            "biases_g_tmp", shape=[CONV1_DEEP], initializer=tf.constant_initializer(0.0))
        conv1_biases_o_tmp = tf.get_variable(
            "biases_o_tmp", shape=[CONV1_DEEP], initializer=tf.constant_initializer(0.0))

        # Convolution forward pass: stride 1 with zero padding, so the output is
        # 28*28*32; the filter size is given by the weight tensor's shape.
        conv1_g = tf.nn.conv2d(input_tensor, conv1_weights_g, strides=[1, 1, 1, 1], padding='SAME')
        conv1_batch_g = batch_norm.batch_norm(conv1_g, scale=None)
        relu1_g = tf.nn.relu(tf.nn.bias_add(conv1_batch_g, conv1_biases_g))
        conv1_o = tf.nn.conv2d(input_tensor, conv1_weights_o, strides=[1, 1, 1, 1], padding='SAME')
        conv1_batch_o = batch_norm.batch_norm(conv1_o, scale=None)
        relu1_o = tf.nn.relu(tf.nn.bias_add(conv1_batch_o, conv1_biases_o))

    # Layer 2: max pooling, 2x2 window, stride 2, zero padding.
    with tf.name_scope('layer2-pool1'):
        pool1_g = tf.nn.max_pool(relu1_g, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')
        pool1_batch_g = batch_norm.batch_norm(pool1_g, scale=None)
        pool1_o = tf.nn.max_pool(relu1_o, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')
        pool1_batch_o = batch_norm.batch_norm(pool1_o, scale=None)
    # Output is 14*14*32; pooling does not change the channel count.

    # Layer 3: convolution, 5x5 filters, stride 1, depth 64, zero padding; output is 14*14*64.
    with tf.variable_scope('layer3-conv2'):
        conv2_weights_g = tf.get_variable(
            'weight_g', shape=[CONV2_SIZE, CONV2_SIZE, CONV1_DEEP, CONV2_DEEP],
            initializer=tf.truncated_normal_initializer(stddev=0.1))
        conv2_biases_g = tf.get_variable(
            'biases_g', shape=[CONV2_DEEP], initializer=tf.constant_initializer(0.0))
        conv2_weights_o = tf.get_variable(
            'weight_o', shape=[CONV2_SIZE, CONV2_SIZE, CONV1_DEEP, CONV2_DEEP],
            initializer=tf.truncated_normal_initializer(stddev=0.1))
        #tf.assign(conv2_weights_o, gutils.unit(conv2_weights_o))
        conv2_biases_o = tf.get_variable(
            'biases_o', shape=[CONV2_DEEP], initializer=tf.constant_initializer(0.0))
        conv2_weights_g_tmp = tf.get_variable(
            "weight_g_tmp", shape=[CONV2_SIZE, CONV2_SIZE, CONV1_DEEP, CONV2_DEEP],
            initializer=tf.truncated_normal_initializer(stddev=1))
        conv2_weights_o_tmp = tf.get_variable(
            "weight_o_tmp", shape=[CONV2_SIZE, CONV2_SIZE, CONV1_DEEP, CONV2_DEEP],
            initializer=tf.truncated_normal_initializer(stddev=1))
        conv2_biases_g_tmp = tf.get_variable(
            'biases_g_tmp', shape=[CONV2_DEEP], initializer=tf.constant_initializer(0.0))
        conv2_biases_o_tmp = tf.get_variable(
            'biases_o_tmp', shape=[CONV2_DEEP], initializer=tf.constant_initializer(0.0))

        # Convolution forward pass.
        conv2_g = tf.nn.conv2d(pool1_batch_g, conv2_weights_g, strides=[1, 1, 1, 1], padding='SAME')
        conv2_batch_g = batch_norm.batch_norm(conv2_g, scale=None)
        relu2_g = tf.nn.relu(tf.nn.bias_add(conv2_batch_g, conv2_biases_g))
        conv2_o = tf.nn.conv2d(pool1_batch_o, conv2_weights_o, strides=[1, 1, 1, 1], padding='SAME')
        conv2_batch_o = batch_norm.batch_norm(conv2_o, scale=None)
        relu2_o = tf.nn.relu(tf.nn.bias_add(conv2_batch_o, conv2_biases_o))

    # Layer 4: pooling, same structure as layer 2.
    with tf.name_scope('layer4-pool2'):
        pool2_g = tf.nn.max_pool(relu2_g, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')
        pool2_batch_g = batch_norm.batch_norm(pool2_g, scale=None)
        pool2_o = tf.nn.max_pool(relu2_o, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')
        pool2_batch_o = batch_norm.batch_norm(pool2_o, scale=None)

    # Layer 5: fully connected layer with dropout. The pooled output is 7*7*64;
    # this layer produces 512 units.
    with tf.variable_scope('layer5-fc1'):
        pool_shape = pool2_batch_g.get_shape().as_list()
        # pool_shape[0] is the batch size.
        nodes = pool_shape[1] * pool_shape[2] * pool_shape[3]
        # Flatten the pooled feature maps into vectors for the fully connected layer.
        reshaped_g = tf.reshape(pool2_batch_g, [-1, nodes])
        reshaped_o = tf.reshape(pool2_batch_o, [-1, nodes])
        fc1_weights_g = tf.get_variable(
            'weight_g', shape=[nodes, FC_SIZE],
            initializer=tf.truncated_normal_initializer(stddev=0.1))
        fc1_weights_o = tf.get_variable(
            'weight_o', shape=[nodes, FC_SIZE],
            initializer=tf.truncated_normal_initializer(stddev=0.1))
        #tf.assign(fc1_weights_o, gutils.unit(fc1_weights_o))
        fc1_weights_g_tmp = tf.get_variable(
            'weight_g_tmp', shape=[nodes, FC_SIZE],
            initializer=tf.truncated_normal_initializer(stddev=0.1))
        fc1_weights_o_tmp = tf.get_variable(
            'weight_o_tmp', shape=[nodes, FC_SIZE],
            initializer=tf.truncated_normal_initializer(stddev=0.1))
        # Only the fully connected weights are regularized.
        if regularizer is not None:
            tf.add_to_collection('losses_g', regularizer(fc1_weights_g))
            tf.add_to_collection('losses_o', regularizer(fc1_weights_o))
        fc1_biases_g = tf.get_variable(
            'biases_g', shape=[FC_SIZE], initializer=tf.constant_initializer(0.0))
        fc1_biases_o = tf.get_variable(
            'biases_o', shape=[FC_SIZE], initializer=tf.constant_initializer(0.0))
        fc1_biases_g_tmp = tf.get_variable(
            'biases_g_tmp', shape=[FC_SIZE], initializer=tf.constant_initializer(0.0))
        fc1_biases_o_tmp = tf.get_variable(
            'biases_o_tmp', shape=[FC_SIZE], initializer=tf.constant_initializer(0.0))
        fc1_g = tf.nn.relu(tf.matmul(reshaped_g, fc1_weights_g) + fc1_biases_g)
        fc1_o = tf.nn.relu(tf.matmul(reshaped_o, fc1_weights_o) + fc1_biases_o)
        if train:
            fc1_g = tf.nn.dropout(fc1_g, 0.5)
            fc1_o = tf.nn.dropout(fc1_o, 0.5)

    # Layer 6: output layer.
    with tf.variable_scope('layer6-fc2'):
        fc2_weights_g = tf.get_variable(
            'weight_g', shape=[FC_SIZE, NUM_LABELS],
            initializer=tf.truncated_normal_initializer(stddev=0.1))
        fc2_weights_o = tf.get_variable(
            'weight_o', shape=[FC_SIZE, NUM_LABELS],
            initializer=tf.truncated_normal_initializer(stddev=0.1))
        #tf.assign(fc2_weights_o, gutils.unit(fc2_weights_o))
        fc2_weights_g_tmp = tf.get_variable(
            'weight_g_tmp', shape=[FC_SIZE, NUM_LABELS],
            initializer=tf.truncated_normal_initializer(stddev=0.1))
        fc2_weights_o_tmp = tf.get_variable(
            'weight_o_tmp', shape=[FC_SIZE, NUM_LABELS],
            initializer=tf.truncated_normal_initializer(stddev=0.1))
        if regularizer is not None:
            tf.add_to_collection('losses_g', regularizer(fc2_weights_g))
            tf.add_to_collection('losses_o', regularizer(fc2_weights_o))
        fc2_biases_g = tf.get_variable(
            'biases_g', shape=[NUM_LABELS], initializer=tf.constant_initializer(0.0))
        fc2_biases_o = tf.get_variable(
            'biases_o', shape=[NUM_LABELS], initializer=tf.constant_initializer(0.0))
        fc2_biases_g_tmp = tf.get_variable(
            'biases_g_tmp', shape=[NUM_LABELS], initializer=tf.constant_initializer(0.0))
        fc2_biases_o_tmp = tf.get_variable(
            'biases_o_tmp', shape=[NUM_LABELS], initializer=tf.constant_initializer(0.0))
        logit_g = tf.matmul(fc1_g, fc2_weights_g) + fc2_biases_g
        logit_o = tf.matmul(fc1_o, fc2_weights_o) + fc2_biases_o
    return logit_g, logit_o
import tensorflow as tf

from batch_norm import batch_norm
from activations import lrelu
from connections import conv2d, linear
from datasets import MNIST

# %% Setup input to the network and true output label. These are
# simply placeholders which we'll fill in later.
mnist = MNIST()
x = tf.placeholder(tf.float32, [None, 784])
y = tf.placeholder(tf.float32, [None, 10])
x_tensor = tf.reshape(x, [-1, 28, 28, 1])

# %% Define the network:
bn1 = batch_norm(-1, name='bn1')
bn2 = batch_norm(-1, name='bn2')
bn3 = batch_norm(-1, name='bn3')
h_1 = lrelu(bn1(conv2d(x_tensor, 32, name='conv1')), name='lrelu1')
h_2 = lrelu(bn2(conv2d(h_1, 64, name='conv2')), name='lrelu2')
h_3 = lrelu(bn3(conv2d(h_2, 64, name='conv3')), name='lrelu3')
h_3_flat = tf.reshape(h_3, [-1, 64 * 4 * 4])
h_4 = linear(h_3_flat, 10)
y_pred = tf.nn.softmax(h_4)

# %% Define loss/eval/training functions
cross_entropy = -tf.reduce_sum(y * tf.log(y_pred))
train_step = tf.train.AdamOptimizer().minimize(cross_entropy)

correct_prediction = tf.equal(tf.argmax(y_pred, 1), tf.argmax(y, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, 'float'))
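# %% A minimal training loop for the graph above (a sketch only): the MNIST
# wrapper's train/validation splits and next_batch method are assumed to mirror
# the standard TensorFlow MNIST helper.
n_epochs = 10
batch_size = 100
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for epoch_i in range(n_epochs):
        for batch_i in range(mnist.train.num_examples // batch_size):
            batch_xs, batch_ys = mnist.train.next_batch(batch_size)
            sess.run(train_step, feed_dict={x: batch_xs, y: batch_ys})
        # Report held-out accuracy once per epoch.
        print(sess.run(accuracy, feed_dict={x: mnist.validation.images,
                                            y: mnist.validation.labels}))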
def __init__(self, sess, height, width, phi_length, n_actions, name,
             optimizer='RMS', learning_rate=0.00025, epsilon=0.01, decay=0.95,
             momentum=0., slow=False, tau=0.001, verbose=False, path='',
             folder='_networks', l2_decay=0.001):
    """ Initialize network """
    super(DqnNetClass, self).__init__(sess, name=name)
    self.slow = slow
    self.tau = tau
    self.name = name
    self.sess = sess
    self.path = path
    self.folder = folder

    self.observation = tf.placeholder(tf.float32, [None, height, width, phi_length],
                                      name=self.name + '_observation')
    self.actions = tf.placeholder(tf.float32, shape=[None, n_actions],
                                  name=self.name + "_actions")
    self.is_training = tf.placeholder(tf.bool, [])

    with tf.name_scope("Conv1") as scope:
        kernel_shape = [8, 8, phi_length, 32]
        self.W_conv1 = self.weight_variable(kernel_shape, 'conv1')
        #self.b_conv1 = self.bias_variable(kernel_shape, 'conv1')
        self.h_conv1_bn = batch_norm(self.conv2d(self.observation, self.W_conv1, 4), 32,
                                     self.is_training, self.sess, slow=self.slow, tau=self.tau)
        self.h_conv1 = tf.nn.relu(self.h_conv1_bn.bnorm, name=self.name + '_conv1_activations')
        tf.add_to_collection('conv_weights', self.W_conv1)
        tf.add_to_collection('conv_output', self.h_conv1)
        tf.add_to_collection('transfer_params', self.W_conv1)
        tf.add_to_collection('transfer_params', self.h_conv1_bn.scale)
        tf.add_to_collection('transfer_params', self.h_conv1_bn.beta)
        tf.add_to_collection('transfer_params', self.h_conv1_bn.pop_mean)
        tf.add_to_collection('transfer_params', self.h_conv1_bn.pop_var)

    with tf.name_scope("Conv2") as scope:
        kernel_shape = [4, 4, 32, 64]
        self.W_conv2 = self.weight_variable(kernel_shape, 'conv2')
        #self.b_conv2 = self.bias_variable(kernel_shape, 'conv2')
        self.h_conv2_bn = batch_norm(self.conv2d(self.h_conv1, self.W_conv2, 2), 64,
                                     self.is_training, self.sess, slow=self.slow, tau=self.tau)
        self.h_conv2 = tf.nn.relu(self.h_conv2_bn.bnorm, name=self.name + '_conv2_activations')
        tf.add_to_collection('conv_weights', self.W_conv2)
        tf.add_to_collection('conv_output', self.h_conv2)
        tf.add_to_collection('transfer_params', self.W_conv2)
        tf.add_to_collection('transfer_params', self.h_conv2_bn.scale)
        tf.add_to_collection('transfer_params', self.h_conv2_bn.beta)
        tf.add_to_collection('transfer_params', self.h_conv2_bn.pop_mean)
        tf.add_to_collection('transfer_params', self.h_conv2_bn.pop_var)

    with tf.name_scope("Conv3") as scope:
        kernel_shape = [3, 3, 64, 64]
        self.W_conv3 = self.weight_variable(kernel_shape, 'conv3')
        #self.b_conv3 = self.bias_variable(kernel_shape, 'conv3')
        self.h_conv3_bn = batch_norm(self.conv2d(self.h_conv2, self.W_conv3, 1), 64,
                                     self.is_training, self.sess, slow=self.slow, tau=self.tau)
        self.h_conv3 = tf.nn.relu(self.h_conv3_bn.bnorm, name=self.name + '_conv3_activations')
        tf.add_to_collection('conv_weights', self.W_conv3)
        tf.add_to_collection('conv_output', self.h_conv3)
        tf.add_to_collection('transfer_params', self.W_conv3)
        tf.add_to_collection('transfer_params', self.h_conv3_bn.scale)
        tf.add_to_collection('transfer_params', self.h_conv3_bn.beta)
        tf.add_to_collection('transfer_params', self.h_conv3_bn.pop_mean)
        tf.add_to_collection('transfer_params', self.h_conv3_bn.pop_var)

    self.h_conv3_flat = tf.reshape(self.h_conv3, [-1, 3136])

    with tf.name_scope("FullyConnected1") as scope:
        kernel_shape = [3136, 512]
        self.W_fc1 = self.weight_variable(kernel_shape, 'fc1')
        #self.b_fc1 = self.bias_variable(kernel_shape, 'fc1')
        self.h_fc1_bn = batch_norm(tf.matmul(self.h_conv3_flat, self.W_fc1), 512,
                                   self.is_training, self.sess, slow=self.slow, tau=self.tau,
                                   linear=True)
        self.h_fc1 = tf.nn.relu(self.h_fc1_bn.bnorm, name=self.name + '_fc1_activations')
        tf.add_to_collection('transfer_params', self.W_fc1)
        tf.add_to_collection('transfer_params', self.h_fc1_bn.scale)
        tf.add_to_collection('transfer_params', self.h_fc1_bn.beta)
        tf.add_to_collection('transfer_params', self.h_fc1_bn.pop_mean)
        tf.add_to_collection('transfer_params', self.h_fc1_bn.pop_var)

    with tf.name_scope("FullyConnected2") as scope:
        kernel_shape = [512, n_actions]
        self.W_fc2 = self.weight_variable_last_layer(kernel_shape, 'fc2')
        self.b_fc2 = self.bias_variable_last_layer(kernel_shape, 'fc2')
        self.action_output = tf.add(tf.matmul(self.h_fc1, self.W_fc2), self.b_fc2,
                                    name=self.name + '_fc1_outputs')
        tf.add_to_collection('transfer_params', self.W_fc2)
        tf.add_to_collection('transfer_params', self.b_fc2)

    if verbose:
        self.init_verbosity()

    # Cost of the classification network.
    with tf.name_scope("Entropy") as scope:
        self.cross_entropy = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(
                _sentinel=None, labels=self.actions, logits=self.action_output))
        ce_summ = tf.summary.scalar("cross_entropy", self.cross_entropy)

    with tf.name_scope("Train") as scope:
        if optimizer == "Adam":
            self.opt = tf.train.AdamOptimizer(learning_rate=learning_rate, epsilon=epsilon)
        else:
            self.opt = tf.train.RMSPropOptimizer(learning_rate, decay=decay,
                                                 momentum=momentum, epsilon=epsilon)
        self.grads_vars = self.opt.compute_gradients(self.cross_entropy)
        grads = []
        params = []
        for p in self.grads_vars:
            if p[0] is None:
                continue
            grads.append(p[0])
            params.append(p[1])
        grads = tf.clip_by_global_norm(grads, 1)[0]
        self.grads_vars_updates = zip(grads, params)
        self.train_step = self.opt.apply_gradients(self.grads_vars_updates)

    with tf.name_scope("Evaluating") as scope:
        correct_prediction = tf.equal(tf.argmax(self.action_output, 1), tf.argmax(self.actions, 1))
        self.accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
        accuracy_summary = tf.summary.scalar("accuracy", self.accuracy)

    # Initialize all tensor variable parameters.
    self.sess.run(tf.global_variables_initializer())
    self.saver = tf.train.Saver()
    self.merged = tf.summary.merge_all()
    self.writer = tf.summary.FileWriter(self.path + self.folder + '/log_tb', self.sess.graph)
def inference(input_tensor, train, regularizer):
    with tf.variable_scope('layer1-conv1_grassmann'):
        conv1_weights_g = tf.get_variable(
            "weight_g", shape=[CONV1_SIZE, CONV1_SIZE, NUM_CHANNELS, CONV1_DEEP],
            initializer=tf.truncated_normal_initializer(stddev=0.1, seed=1))
        conv1_biases_g = tf.get_variable(
            "biases_g", shape=[CONV1_DEEP], initializer=tf.constant_initializer(0))
        conv1_weights_g_tmp = tf.get_variable(
            "weight_g_tmp", shape=[CONV1_SIZE, CONV1_SIZE, NUM_CHANNELS, CONV1_DEEP],
            initializer=tf.truncated_normal_initializer(stddev=1))
        conv1_biases_g_tmp = tf.get_variable(
            "biases_g_tmp", shape=[CONV1_DEEP], initializer=tf.constant_initializer(0.0))
        # Convolution forward pass: stride 1 with zero padding, so the output is
        # 28*28*32; the filter size is given by the weight tensor's shape.
        conv1_g = tf.nn.conv2d(input_tensor, conv1_weights_g, strides=[1, 1, 1, 1], padding='SAME')
        conv1_batch_g = batch_norm.batch_norm(conv1_g, scale=None)
        relu1_g_grassmann = tf.nn.relu(tf.nn.bias_add(conv1_batch_g, conv1_biases_g))

    with tf.name_scope('layer2-pool1_grassmann'):
        # Max pooling: 2x2 window (ksize), stride 2.
        pool1_g = tf.nn.max_pool(relu1_g_grassmann, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1],
                                 padding='SAME')
        pool1_batch_g_grassmann = batch_norm.batch_norm(pool1_g, scale=None)

    with tf.variable_scope('layer3-conv2_grassmann'):
        conv2_weights_g = tf.get_variable(
            'weight_g', shape=[CONV2_SIZE, CONV2_SIZE, CONV1_DEEP, CONV2_DEEP],
            initializer=tf.truncated_normal_initializer(stddev=0.1, seed=3))
        conv2_biases_g = tf.get_variable(
            'biases_g', shape=[CONV2_DEEP], initializer=tf.constant_initializer(0.0))
        conv2_weights_g_tmp = tf.get_variable(
            "weight_g_tmp", shape=[CONV2_SIZE, CONV2_SIZE, CONV1_DEEP, CONV2_DEEP],
            initializer=tf.truncated_normal_initializer(stddev=1))
        conv2_biases_g_tmp = tf.get_variable(
            "biases_g_tmp", shape=[CONV2_DEEP], initializer=tf.constant_initializer(0.0))
        # Convolution forward pass.
        conv2_g = tf.nn.conv2d(pool1_batch_g_grassmann, conv2_weights_g, strides=[1, 1, 1, 1],
                               padding='SAME')
        conv2_batch_g = batch_norm.batch_norm(conv2_g, scale=None)
        relu2_g_grassmann = tf.nn.relu(tf.nn.bias_add(conv2_batch_g, conv2_biases_g))

    with tf.name_scope('layer4-pool2_grassmann'):
        pool2_g = tf.nn.max_pool(relu2_g_grassmann, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1],
                                 padding='SAME')
        pool2_batch_g_grassmann = batch_norm.batch_norm(pool2_g, scale=None)

    with tf.variable_scope('layer5-fc1_grassmann'):
        pool_shape = pool2_batch_g_grassmann.get_shape().as_list()
        # pool_shape[0] is the batch size.
        nodes = pool_shape[1] * pool_shape[2] * pool_shape[3]
        # Flatten the pooled feature maps into vectors for the fully connected layer.
        reshaped_g = tf.reshape(pool2_batch_g_grassmann, [-1, nodes])
        fc1_weights_g = tf.get_variable(
            'weight_g', shape=[nodes, FC_SIZE],
            initializer=tf.truncated_normal_initializer(stddev=0.1, seed=5))
        fc1_weights_g_tmp = tf.get_variable(
            'weight_g_tmp', shape=[nodes, FC_SIZE],
            initializer=tf.truncated_normal_initializer(stddev=0.1))
        # Only the fully connected weights are regularized.
        if regularizer is not None:
            tf.add_to_collection('losses_g_grassmann', regularizer(fc1_weights_g))
        fc1_biases_g = tf.get_variable(
            'biases_g', shape=[FC_SIZE], initializer=tf.constant_initializer(0.0))
        fc1_biases_g_tmp = tf.get_variable(
            "biases_g_tmp", shape=[FC_SIZE], initializer=tf.constant_initializer(0.0))
        fc1_g_grassmann = tf.nn.relu(tf.matmul(reshaped_g, fc1_weights_g) + fc1_biases_g)
        if train:
            fc1_g_grassmann = tf.nn.dropout(fc1_g_grassmann, 0.5)

    with tf.variable_scope('layer6-fc2_grassmann'):
        fc2_weights_g = tf.get_variable(
            'weight_g', shape=[FC_SIZE, NUM_LABELS],
            initializer=tf.truncated_normal_initializer(stddev=0.1, seed=5))
        fc2_weights_g_tmp = tf.get_variable(
            'weight_g_tmp', shape=[FC_SIZE, NUM_LABELS],
            initializer=tf.truncated_normal_initializer(stddev=0.1))
        if regularizer is not None:
            tf.add_to_collection('losses_g_grassmann', regularizer(fc2_weights_g))
        fc2_biases_g = tf.get_variable(
            'biases_g', shape=[NUM_LABELS], initializer=tf.constant_initializer(0.0))
        fc2_biases_g_tmp = tf.get_variable(
            "biases_g_tmp", shape=[NUM_LABELS], initializer=tf.constant_initializer(0.0))
        logit_g_grassmann = tf.matmul(fc1_g_grassmann, fc2_weights_g) + fc2_biases_g
    return logit_g_grassmann
def fc_bn_relu(inputTensor, shape, layer_name, bn_param, device):
    fc = full_connection_layer(inputTensor, shape, bias=False, layer_name=layer_name, device=device)
    bn = batch_norm(fc, bn_param=bn_param, scale=False, name=layer_name, device=device)
    return tf.nn.relu(bn)
def __init__(self, sess, height, width, phi_length, n_actions, name, gamma=0.99,
             copy_interval=4, optimizer='RMS', learning_rate=0.00025, epsilon=0.01,
             decay=0.95, momentum=0., l2_decay=0.0001, error_clip=1.0, slow=False,
             tau=0.01, verbose=False, path='', folder='_networks',
             decay_learning_rate=False, transfer=False):
    """ Initialize network """
    Network.__init__(self, sess, name=name)
    self.gamma = gamma
    self.slow = slow
    self.tau = tau
    self.name = name
    self.sess = sess
    self.path = path
    self.folder = folder
    self.copy_interval = copy_interval
    self.update_counter = 0
    self.decay_learning_rate = decay_learning_rate

    self.observation = tf.placeholder(tf.float32, [None, height, width, phi_length],
                                      name=self.name + '_observation')
    self.actions = tf.placeholder(tf.float32, shape=[None, n_actions],
                                  name=self.name + "_actions")  # one-hot matrix
    self.next_observation = tf.placeholder(tf.float32, [None, height, width, phi_length],
                                           name=self.name + '_t_next_observation')
    self.rewards = tf.placeholder(tf.float32, shape=[None], name=self.name + "_rewards")
    self.terminals = tf.placeholder(tf.float32, shape=[None], name=self.name + "_terminals")

    self.slow_learnrate_vars = []
    self.fast_learnrate_vars = []

    self.observation_n = tf.div(self.observation, 255.)
    self.next_observation_n = tf.div(self.next_observation, 255.)

    # Q network model:
    self.is_training = tf.placeholder(tf.bool, [])
    with tf.name_scope("Conv1") as scope:
        kernel_shape = [8, 8, phi_length, 32]
        self.W_conv1 = self.weight_variable(phi_length, kernel_shape, 'conv1')
        self.h_conv1_bn = batch_norm(self.conv2d(self.observation_n, self.W_conv1, 4), 32,
                                     self.is_training, self.sess, slow=self.slow, tau=self.tau)
        self.h_conv1 = tf.nn.relu(self.h_conv1_bn.bnorm, name=self.name + '_conv1_activations')
        tf.add_to_collection('conv_weights', self.W_conv1)
        tf.add_to_collection('conv_output', self.h_conv1)
        if transfer:
            self.slow_learnrate_vars.append(self.W_conv1)
            self.slow_learnrate_vars.append(self.h_conv1_bn.scale)
            self.slow_learnrate_vars.append(self.h_conv1_bn.beta)

    with tf.name_scope("Conv2") as scope:
        kernel_shape = [4, 4, 32, 64]
        self.W_conv2 = self.weight_variable(32, kernel_shape, 'conv2')
        self.h_conv2_bn = batch_norm(self.conv2d(self.h_conv1, self.W_conv2, 2), 64,
                                     self.is_training, self.sess, slow=self.slow, tau=self.tau)
        self.h_conv2 = tf.nn.relu(self.h_conv2_bn.bnorm, name=self.name + '_conv2_activations')
        tf.add_to_collection('conv_weights', self.W_conv2)
        tf.add_to_collection('conv_output', self.h_conv2)
        if transfer:
            self.slow_learnrate_vars.append(self.W_conv2)
            self.slow_learnrate_vars.append(self.h_conv2_bn.scale)
            self.slow_learnrate_vars.append(self.h_conv2_bn.beta)

    with tf.name_scope("Conv3") as scope:
        kernel_shape = [3, 3, 64, 64]
        self.W_conv3 = self.weight_variable(64, kernel_shape, 'conv3')
        self.h_conv3_bn = batch_norm(self.conv2d(self.h_conv2, self.W_conv3, 1), 64,
                                     self.is_training, self.sess, slow=self.slow, tau=self.tau)
        self.h_conv3 = tf.nn.relu(self.h_conv3_bn.bnorm, name=self.name + '_conv3_activations')
        tf.add_to_collection('conv_weights', self.W_conv3)
        tf.add_to_collection('conv_output', self.h_conv3)
        if transfer:
            self.slow_learnrate_vars.append(self.W_conv3)
            self.slow_learnrate_vars.append(self.h_conv3_bn.scale)
            self.slow_learnrate_vars.append(self.h_conv3_bn.beta)

    self.h_conv3_flat = tf.reshape(self.h_conv3, [-1, 3136])

    with tf.name_scope("FullyConnected1") as scope:
        kernel_shape = [3136, 512]
        self.W_fc1 = self.weight_variable_linear(kernel_shape, 'fc1')
        self.h_fc1_bn = batch_norm(tf.matmul(self.h_conv3_flat, self.W_fc1), 512,
                                   self.is_training, self.sess, slow=self.slow, tau=self.tau,
                                   linear=True)
        self.h_fc1 = tf.nn.relu(self.h_fc1_bn.bnorm, name=self.name + '_fc1_activations')
        if transfer:
            self.fast_learnrate_vars.append(self.W_fc1)
            self.fast_learnrate_vars.append(self.h_fc1_bn.scale)
            self.fast_learnrate_vars.append(self.h_fc1_bn.beta)

    with tf.name_scope("FullyConnected2") as scope:
        kernel_shape = [512, n_actions]
        self.W_fc2 = self.weight_variable_linear(kernel_shape, 'fc2')
        self.b_fc2 = self.bias_variable_linear(kernel_shape, 'fc2')
        self.q_value = tf.add(tf.matmul(self.h_fc1, self.W_fc2), self.b_fc2,
                              name=self.name + '_fc1_outputs')
        if transfer:
            self.fast_learnrate_vars.append(self.W_fc2)
            self.fast_learnrate_vars.append(self.b_fc2)

    if transfer:
        self.load_transfer_model(optimizer=optimizer.lower())
        # Scale down the last layer.
        W_fc2_scaled = tf.scalar_mul(0.01, self.W_fc2)
        b_fc2_scaled = tf.scalar_mul(0.01, self.b_fc2)
        self.sess.run([self.W_fc2.assign(W_fc2_scaled), self.b_fc2.assign(b_fc2_scaled)])

    if verbose:
        self.init_verbosity()

    # Target Q network model:
    self.t_is_training = tf.placeholder(tf.bool, [])
    with tf.name_scope("TConv1") as scope:
        kernel_shape = [8, 8, phi_length, 32]
        self.t_W_conv1 = self.weight_variable(phi_length, kernel_shape, 't_conv1')
        self.t_h_conv1_bn = batch_norm(self.conv2d(self.next_observation_n, self.t_W_conv1, 4), 32,
                                       self.t_is_training, self.sess, parForTarget=self.h_conv1_bn,
                                       slow=self.slow, tau=self.tau)
        self.t_h_conv1 = tf.nn.relu(self.t_h_conv1_bn.bnorm, name=self.name + '_t_conv1_activations')

    with tf.name_scope("TConv2") as scope:
        kernel_shape = [4, 4, 32, 64]
        self.t_W_conv2 = self.weight_variable(32, kernel_shape, 't_conv2')
        self.t_h_conv2_bn = batch_norm(self.conv2d(self.t_h_conv1, self.t_W_conv2, 2), 64,
                                       self.t_is_training, self.sess, parForTarget=self.h_conv2_bn,
                                       slow=self.slow, tau=self.tau)
        self.t_h_conv2 = tf.nn.relu(self.t_h_conv2_bn.bnorm, name=self.name + '_t_conv2_activations')

    with tf.name_scope("TConv3") as scope:
        kernel_shape = [3, 3, 64, 64]
        self.t_W_conv3 = self.weight_variable(64, kernel_shape, 't_conv3')
        self.t_h_conv3_bn = batch_norm(self.conv2d(self.t_h_conv2, self.t_W_conv3, 1), 64,
                                       self.t_is_training, self.sess, parForTarget=self.h_conv3_bn,
                                       slow=self.slow, tau=self.tau)
        self.t_h_conv3 = tf.nn.relu(self.t_h_conv3_bn.bnorm, name=self.name + '_t_conv3_activations')

    self.t_h_conv3_flat = tf.reshape(self.t_h_conv3, [-1, 3136])

    with tf.name_scope("TFullyConnected1") as scope:
        kernel_shape = [3136, 512]
        self.t_W_fc1 = self.weight_variable_linear(kernel_shape, 't_fc1')
        self.t_h_fc1_bn = batch_norm(tf.matmul(self.t_h_conv3_flat, self.t_W_fc1), 512,
                                     self.t_is_training, self.sess, parForTarget=self.h_fc1_bn,
                                     slow=self.slow, tau=self.tau, linear=True)
        self.t_h_fc1 = tf.nn.relu(self.t_h_fc1_bn.bnorm, name=self.name + '_t_fc1_activations')

    with tf.name_scope("TFullyConnected2") as scope:
        kernel_shape = [512, n_actions]
        self.t_W_fc2 = self.weight_variable_linear(kernel_shape, 't_fc2')
        self.t_b_fc2 = self.bias_variable_linear(kernel_shape, 't_fc2')
        self.t_q_value = tf.add(tf.matmul(self.t_h_fc1, self.t_W_fc2), self.t_b_fc2,
                                name=self.name + '_t_fc1_outputs')

    if transfer:
        # Only initialize tensor variables that are not loaded from the transfer model.
        self._global_vars_temp = set(tf.global_variables())

    # Cost of the Q network.
    self.cost = self.build_loss(error_clip, n_actions)

    with tf.name_scope("Train") as scope:
        if optimizer == "Graves":
            # RMSProp variant described in the DQN Nature paper.
            self.train_step, self.grads_vars = graves_rmsprop_optimizer(
                self.cost, learning_rate, decay, epsilon, 1)
        else:
            if optimizer == "Adam":
                self.opt = tf.train.AdamOptimizer(learning_rate=learning_rate, epsilon=epsilon)
            elif optimizer == "RMS":
                # TensorFlow's RMSProp.
                self.opt = tf.train.RMSPropOptimizer(learning_rate, decay=decay,
                                                     momentum=momentum, epsilon=epsilon)
            else:
                print(colored("Unknown Optimizer!", "red"))
                sys.exit()
            self.grads_vars = self.opt.compute_gradients(self.cost)
            grads = []
            params = []
            for p in self.grads_vars:
                if p[0] is None:
                    continue
                grads.append(p[0])
                params.append(p[1])
            #grads = tf.clip_by_global_norm(grads, 1)[0]
            self.grads_vars_updates = zip(grads, params)
            self.train_step = self.opt.apply_gradients(self.grads_vars_updates)

    if transfer:
        vars_diff = set(tf.global_variables()) - self._global_vars_temp
        self.sess.run(tf.variables_initializer(vars_diff))
        self.sess.run(tf.variables_initializer([
            self.t_h_conv1_bn.pop_mean, self.t_h_conv1_bn.pop_var,
            self.t_h_conv2_bn.pop_mean, self.t_h_conv2_bn.pop_var,
            self.t_h_conv3_bn.pop_mean, self.t_h_conv3_bn.pop_var,
            self.t_h_fc1_bn.pop_mean, self.t_h_fc1_bn.pop_var
        ]))
    else:
        # Initialize all tensor variable parameters.
        self.sess.run(tf.global_variables_initializer())

    # Make sure the Q and target networks start from the same parameters (copy them).
    self.sess.run([
        self.t_W_conv1.assign(self.W_conv1),
        self.t_W_conv2.assign(self.W_conv2),
        self.t_W_conv3.assign(self.W_conv3),
        self.t_W_fc1.assign(self.W_fc1),
        self.t_W_fc2.assign(self.W_fc2),
        self.t_b_fc2.assign(self.b_fc2),
        self.t_h_conv1_bn.scale.assign(self.h_conv1_bn.scale),
        self.t_h_conv1_bn.beta.assign(self.h_conv1_bn.beta),
        self.t_h_conv2_bn.scale.assign(self.h_conv2_bn.scale),
        self.t_h_conv2_bn.beta.assign(self.h_conv2_bn.beta),
        self.t_h_conv3_bn.scale.assign(self.h_conv3_bn.scale),
        self.t_h_conv3_bn.beta.assign(self.h_conv3_bn.beta),
        self.t_h_fc1_bn.scale.assign(self.h_fc1_bn.scale),
        self.t_h_fc1_bn.beta.assign(self.h_fc1_bn.beta)
    ])

    if self.slow:
        # Soft (Polyak-averaged) target network update.
        self.update_target_op = [
            self.t_W_conv1.assign(self.tau * self.W_conv1 + (1 - self.tau) * self.t_W_conv1),
            self.t_W_conv2.assign(self.tau * self.W_conv2 + (1 - self.tau) * self.t_W_conv2),
            self.t_W_conv3.assign(self.tau * self.W_conv3 + (1 - self.tau) * self.t_W_conv3),
            self.t_W_fc1.assign(self.tau * self.W_fc1 + (1 - self.tau) * self.t_W_fc1),
            self.t_W_fc2.assign(self.tau * self.W_fc2 + (1 - self.tau) * self.t_W_fc2),
            self.t_b_fc2.assign(self.tau * self.b_fc2 + (1 - self.tau) * self.t_b_fc2),
            self.t_h_conv1_bn.updateTarget,
            self.t_h_conv2_bn.updateTarget,
            self.t_h_conv3_bn.updateTarget,
            self.t_h_fc1_bn.updateTarget
        ]
    else:
        # Hard copy of the online network into the target network.
        self.update_target_op = [
            self.t_W_conv1.assign(self.W_conv1),
            self.t_W_conv2.assign(self.W_conv2),
            self.t_W_conv3.assign(self.W_conv3),
            self.t_W_fc1.assign(self.W_fc1),
            self.t_W_fc2.assign(self.W_fc2),
            self.t_b_fc2.assign(self.b_fc2),
            self.t_h_conv1_bn.updateTarget,
            self.t_h_conv2_bn.updateTarget,
            self.t_h_conv3_bn.updateTarget,
            self.t_h_fc1_bn.updateTarget
        ]

    self.saver = tf.train.Saver()
    self.merged = tf.summary.merge_all()
    self.writer = tf.summary.FileWriter(self.path + self.folder + '/log_tb', self.sess.graph)
def conv_bn_relu(inputTensor, shape, bn_param, device):
    conv = convolution_layer(inputTensor, shape, strides=[1, 1, 1, 1], bias=False,
                             layer_name='conv', device=device)
    bn = batch_norm(conv, bn_param=bn_param, scale=False, device=device)
    return tf.nn.relu(bn)
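# How the two helpers above are typically chained; a sketch only. The layer
# shapes, scope names, and device string here are illustrative, not from the
# original source, and `convolution_layer`/`full_connection_layer` are assumed
# to come from the same module as the helpers.
def tiny_head(images, bn_param, num_classes, device='/gpu:0'):
    with tf.variable_scope('block1'):
        net = conv_bn_relu(images, [3, 3, 3, 32], bn_param, device)   # conv + BN + ReLU
    net = tf.reduce_mean(net, [1, 2])                                  # global average pooling
    with tf.variable_scope('fc1'):
        net = fc_bn_relu(net, [32, num_classes], 'fc1', bn_param, device)
    return net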
def build_model(input_var):
    network = lasagne.layers.InputLayer(shape=(None, 3, 32, 32), input_var=input_var)

    network = batch_norm(lasagne.layers.Conv2DLayer(
        network, num_filters=96, filter_size=(3, 3),
        nonlinearity=lasagne.nonlinearities.rectify,
        W=lasagne.init.HeNormal(), pad='same'))
    #network = lasagne.layers.dropout(network, p=0.4)
    network = batch_norm(lasagne.layers.Conv2DLayer(
        network, num_filters=96, filter_size=(3, 3),
        nonlinearity=lasagne.nonlinearities.rectify,
        W=lasagne.init.HeNormal(), pad='same'))
    network = lasagne.layers.MaxPool2DLayer(network, pool_size=(2, 2))

    network = batch_norm(lasagne.layers.Conv2DLayer(
        network, num_filters=192, filter_size=(3, 3),
        nonlinearity=lasagne.nonlinearities.rectify,
        W=lasagne.init.HeNormal(), pad='same'))
    #network = lasagne.layers.dropout(network, p=0.4)
    network = batch_norm(lasagne.layers.Conv2DLayer(
        network, num_filters=192, filter_size=(3, 3),
        nonlinearity=lasagne.nonlinearities.rectify,
        W=lasagne.init.HeNormal(), pad='same'))
    network = lasagne.layers.MaxPool2DLayer(network, pool_size=(2, 2))

    network = batch_norm(lasagne.layers.Conv2DLayer(
        network, num_filters=256, filter_size=(3, 3),
        nonlinearity=lasagne.nonlinearities.rectify,
        W=lasagne.init.HeNormal(), pad='same'))
    #network = lasagne.layers.dropout(network, p=0.4)
    network = batch_norm(lasagne.layers.Conv2DLayer(
        network, num_filters=256, filter_size=(3, 3),
        nonlinearity=lasagne.nonlinearities.rectify,
        W=lasagne.init.HeNormal(), pad='same'))
    #network = lasagne.layers.dropout(network, p=0.4)
    network = lasagne.layers.MaxPool2DLayer(network, pool_size=(2, 2))

    network = batch_norm(lasagne.layers.Conv2DLayer(
        network, num_filters=512, filter_size=(3, 3),
        nonlinearity=lasagne.nonlinearities.rectify,
        W=lasagne.init.HeNormal(), pad='same'))
    #network = lasagne.layers.dropout(network, p=0.4)
    network = batch_norm(lasagne.layers.Conv2DLayer(
        network, num_filters=512, filter_size=(3, 3),
        nonlinearity=lasagne.nonlinearities.rectify,
        W=lasagne.init.HeNormal(), pad='same'))
    #network = lasagne.layers.dropout(network, p=0.4)
    network = lasagne.layers.MaxPool2DLayer(network, pool_size=(2, 2))

    network = lasagne.layers.DenseLayer(network, num_units=200,
                                        W=lasagne.init.HeNormal(),
                                        nonlinearity=lasagne.nonlinearities.rectify)
    network = lasagne.layers.DenseLayer(network, num_units=10,
                                        W=lasagne.init.HeNormal(),
                                        nonlinearity=None)
    return network
deconv32 = tf.nn.conv2d_transpose(
    value=net['relu7'],
    filter=tf.Variable(tf.truncated_normal(shape=(32, 32, 1, 512), mean=1.0)),
    output_shape=tf.pack((tf.shape(net['relu7'])[0], tf.shape(input_image)[1],
                          tf.shape(input_image)[2], 1)),
    strides=(1, 32, 32, 1),
    padding='SAME') + tf.Variable(tf.truncated_normal(shape=(1,), stddev=0.1), dtype=tf.float32)

# Concatenate them, one deconvolution per channel
# (tf.pack and the axis-first tf.concat follow the pre-1.0 TensorFlow API).
deconvs = tf.concat(3, (deconv8, deconv16, deconv32))

# One last convolution to rule them all
conv = tf.nn.conv2d(deconvs,
                    tf.Variable(tf.truncated_normal(shape=(1, 1, 3, 21), mean=1.0), dtype=tf.float32),
                    strides=(1, 1, 1, 1),
                    padding="SAME") + tf.Variable(tf.truncated_normal(shape=(21,), stddev=0.1),
                                                  dtype=tf.float32)

# Batch normalization
from batch_norm import batch_norm
bn = batch_norm(conv, scale=True, is_training=True)

# Network estimate: per-pixel softmax over the 21 classes.
exp = tf.exp(bn)
norm = tf.reduce_sum(exp, reduction_indices=3, keep_dims=True)
y_hat = tf.div(exp, norm)

##########################################
####### TRAIN DECONVOLUTION LAYERS #######
##########################################
# Test data
indices = tf.placeholder(tf.int64, shape=(None, None, None))
targets = tf.one_hot(indices=indices, depth=21, on_value=1.0, off_value=0.0, axis=-1)

# Loss function (cross-entropy)
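# A hedged sketch of the loss the comment above announces (assumed, not from the
# original source): per-pixel cross-entropy between the one-hot `targets` and the
# softmax estimate `y_hat`, with Adam as an illustrative optimizer choice.
eps = 1e-8  # avoid log(0)
cross_entropy = -tf.reduce_mean(
    tf.reduce_sum(targets * tf.log(y_hat + eps), reduction_indices=3))
train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy)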
def model18(w=32, h=32, c=1,
            nb_filters=64, size_filters_enc=5, size_filters_dec=5,
            nb_hidden=100, sparsity=True, use_batch_norm=False,
            nb_filters_mul=2, nonlin=rectify, stride=2,
            nb_layers_enc=2, nb_layers_dec=2):
    """
    Standard convolutional autoencoder; spatial/channel WTA sparsity is applied
    only when `sparsity` is True.
    """
    s = size_filters_enc
    l_in = layers.InputLayer((None, c, w, h), name="input")
    l_conv = l_in
    l_convs = []
    for i in range(nb_layers_enc):
        l_conv = layers.Conv2DLayer(
            l_conv,
            num_filters=nb_filters * (nb_filters_mul**i),
            filter_size=(s, s),
            nonlinearity=nonlin,
            W=init.GlorotUniform(),
            stride=stride,
            name="conv{}".format(i))
        if use_batch_norm:
            l_conv = batch_norm(l_conv)
        print(l_conv.output_shape)
        l_convs.append(l_conv)

    lastconv_num_units = np.prod(l_conv.output_shape[1:])
    lastconv_shape = l_conv.output_shape[1:]
    z_mean = layers.DenseLayer(l_conv, num_units=nb_hidden, nonlinearity=linear, name="z_mean")
    z_log_sigma = layers.DenseLayer(l_conv, num_units=nb_hidden, nonlinearity=linear, name="z_log_sigma")
    encoder = [l_in] + l_convs + [z_mean, z_log_sigma]

    z_in = layers.InputLayer((None, nb_hidden), name="input")
    l_unconv = layers.DenseLayer(z_in, num_units=lastconv_num_units, name="unconv0")
    l_unconvs = [l_unconv]
    l_unconv = layers.ReshapeLayer(l_unconv, ([0],) + lastconv_shape, name="unconv0")
    s = size_filters_dec
    for i in range(nb_layers_dec):
        print(l_unconv.output_shape)
        if i == nb_layers_dec - 1:
            if sparsity:
                l_unconv = layers.NonlinearityLayer(l_unconv, wta_spatial, name="wta_spatial")
                l_unconv = layers.NonlinearityLayer(l_unconv, wta_channel, name="wta_channel")
            nonlin_cur = linear
            nb = c
            name = "output"
        else:
            nonlin_cur = nonlin
            nb = nb_filters * 2**(nb_layers_dec - i - 1)
            name = "unconv{}".format(i + 1)
        if stride == 1:
            l_unconv = layers.Conv2DLayer(
                l_unconv, num_filters=nb, filter_size=(s, s), nonlinearity=nonlin_cur,
                W=init.GlorotUniform(), pad='full', name=name)
        else:
            l_unconv = Deconv2DLayer(
                l_unconv, num_filters=nb, filter_size=(s, s), nonlinearity=nonlin_cur,
                W=init.GlorotUniform(), stride=stride, name=name)
        if use_batch_norm:
            l_unconv = batch_norm(l_unconv)
        print(l_unconv.output_shape)
        l_unconvs.append(l_unconv)
    decoder = [z_in] + l_unconvs
    return encoder, decoder
def build_model(d_params, g_params, s_params, options):
    trng = RandomStreams(SEED)
    x = tensor.matrix('x', dtype='int32')  # n_sample * n_word (token indices)
    if options['debug']:
        x.tag.test_value = np.random.randint(2, size=(64, 40)).astype('int32')  # batchsize * sent_len(n_word), item: 0-voc_size

    # Used for dropout.
    use_noise = theano.shared(numpy_floatX(0.))

    # generative model part
    z = tensor.matrix('z', dtype='float32')  # n_batch * n_feature
    n_z = z.shape[0]
    n_samples = options['batch_size']
    n_words = options['n_words']
    n_x = d_params['Wemb'].shape[1]  # embedding dim

    if options['shareLSTM']:
        h_decoder = decoder_g(g_params, z, options,
                              max_step=options['max_step'], prefix='decoder_0')
    else:
        z_code = tensor.cast(z[:, 0], dtype='int32')
        h_decoder = tensor.zeros([options['max_step'], n_samples, options['n_h']])
        h_temp = []
        for idx in range(options['n_codes']):
            temp_idx = tensor.eq(z_code, idx).nonzero()[0]
            if options['sharedEmb']:
                h_decoder_temp = decoder_emb_from_d(g_params, d_params, z[:, 1:], options,
                                                    max_step=options['max_step'],
                                                    prefix=_p('decoder', idx))
            else:
                h_decoder_temp = decoder_g(g_params, z[:, 1:], options,
                                           max_step=options['max_step'],
                                           prefix=_p('decoder', idx))
            h_temp.append(h_decoder_temp)
            h_decoder = tensor.inc_subtensor(h_decoder[:, temp_idx, :],
                                             h_temp[idx][:, temp_idx, :])

    # h_decoder = dropout(h_decoder, trng, use_noise)

    # reconstruct the original sentence
    shape_w = h_decoder.shape  # n_step, n_sample, n_h
    h_decoder = h_decoder.reshape((shape_w[0] * shape_w[1], shape_w[2]))

    # pred_w: (n_steps * n_samples) * n_words
    if options['sharedEmb']:
        Vhid = tensor.dot(g_params['Vhid'], d_params['Wemb'].T)
    else:
        Vhid = tensor.dot(g_params['Vhid'], g_params['Wemb'].T)
    pred_w = tensor.dot(h_decoder, Vhid) + g_params['bhid']
    n_steps = shape_w[0]

    # nondifferentiable
    if options['delta'] > 1e-10:
        pred_w = tensor.switch(tensor.ge(pred_w, options['delta']), pred_w, 0)

    # pred_w = tensor.nnet.softmax(pred_w*options['L'])
    max_w = tensor.max(pred_w, axis=1, keepdims=True)
    e0 = tensor.exp((pred_w - max_w) * options['L'])
    pred_w = e0 / tensor.sum(e0, axis=1, keepdims=True)

    max_print = tensor.max(pred_w, axis=1)
    max_print = max_print.reshape((n_steps, n_samples)).dimshuffle(1, 0)

    pred_w = pred_w.reshape((n_steps, n_samples, n_words)).dimshuffle(1, 0, 2)  # reshape needs parentheses

    if options['force_cut'] == 'cut':
        rng_temp = tensor.minimum(
            -tensor.sum(tensor.log(trng.uniform((n_samples, 6))), axis=1) * 3.3,
            options['max_step'] - 5)
        rng_length = tensor.floor(rng_temp).astype('int32')  # gamma(6, 3.3)
        # pred_mask = tensor.zeros(pred_w.shape)
        period = options['period']
        # should use set values
        for i in xrange(n_samples):
            pred_w = tensor.set_subtensor(pred_w[i, rng_length[i]:, :], 0)
            pred_w = tensor.set_subtensor(pred_w[i, rng_length[i], period], 1)
            pred_w = tensor.set_subtensor(pred_w[i, (rng_length[i] + 1):, 0], 1)
    elif options['force_cut'] == 'strip':
        for i in xrange(n_samples):
            pred_w = tensor.set_subtensor(pred_w[i, options['max_step'] - 1, 0], 1)
            idx_end = theano.tensor.eq(tensor.argmax(pred_w[i, :, :], axis=1), 0).nonzero()[0][0]
            pred_w = tensor.set_subtensor(pred_w[i, (idx_end + 1):, 0], 1)
            pred_w = tensor.set_subtensor(pred_w[i, (idx_end + 1):, 1:], 0)

    pad = max(options['filter_hs']) - 1
    end_mat = tensor.concatenate(
        [tensor.ones([n_samples, pad, 1]),
         tensor.zeros([n_samples, pad, n_words - 1])],
        axis=2)
    pred_w = tensor.concatenate([end_mat, pred_w, end_mat], axis=1)
    n_steps = n_steps + 2 * pad
    pred_w = pred_w.reshape((n_steps * n_samples, n_words))

    # should be d's embedding
    fake_input = tensor.dot(pred_w, d_params['Wemb'])  # real [64 1 68 300], fake [64 1 41 300]
    fake_input = fake_input.reshape(
        (n_samples, 1, n_steps, d_params['Wemb'].shape[1]))  # (64, 1, ...)

    use_noise2 = theano.shared(numpy_floatX(0.))
    fake_input = dropout(fake_input, trng, use_noise2)

    # fake feature output
    fake_outputs1 = []
    for i in xrange(len(options['filter_hs'])):
        filter_shape = options['filter_shapes'][i]
        pool_size = options['pool_sizes'][i]
        conv_layer = encoder(d_params, fake_input, filter_shape, pool_size,
                             options, prefix=_p('cnn_d', i))
        fake_output1 = conv_layer
        fake_outputs1.append(fake_output1)
    fake_output1 = tensor.concatenate(fake_outputs1, 1)  # should be 64*900

    if options['batch_norm']:
        fake_output1 = batch_norm(d_params, fake_output1, options, prefix='fake')
    if options['cnn_activation'] == 'tanh':
        fake_pred = mlp_layer_linear(d_params, fake_output1, prefix='dis_d')
    elif options['cnn_activation'] == 'linear':
        fake_pred = mlp_layer_linear(d_params, tensor.tanh(fake_output1), prefix='dis_d')

    if not options['wgan']:
        fake_pred = tensor.nnet.sigmoid(fake_pred) * (1 - 2 * options['label_smoothing']) + options['label_smoothing']

    # for reverse model
    # if options['reverse']:
    fake_recon = mlp_layer_tanh(d_params, fake_output1, prefix='recon')
    r_t = fake_recon / 2.0 + .5
    z_t = z / 2.0 + .5
    r_cost = (-z_t * tensor.log(r_t + 0.0001) -
              (1. - z_t) * tensor.log(1.0001 - r_t)).sum() / n_samples / n_z

    # Proposal nets (for infogan)
    fake_outputs2 = []
    for i in xrange(len(options['filter_hs'])):
        filter_shape = options['filter_shapes'][i]
        pool_size = options['pool_sizes'][i]
        conv_layer = encoder(g_params, fake_input, filter_shape, pool_size,
                             options, prefix=_p('cnn_d', i))
        fake_output2 = conv_layer
        fake_outputs2.append(fake_output2)
    fake_output2 = tensor.concatenate(fake_outputs2, 1)  # should be 64*900  # why is it 64*0???

    # check whether to use softmax or tanh
    fake_propose = mlp_layer_tanh(g_params, fake_output2, prefix='dis_q')
    fake_propose = (fake_propose + 1) / 2
    fake_propose = tensor.log(fake_propose)
    z_code = tensor.cast(z[:, 0], dtype='int32')
    z_index = tensor.arange(n_z)
    fake_logent = fake_propose[z_index, z_code]
    l_I = tensor.sum(fake_logent)

    # Wemb: voc_size(n_words) * n_emb; 64 * 1 * 40 * 48
    real_input = d_params['Wemb'][tensor.cast(x.flatten(), dtype='int32')].reshape(
        (x.shape[0], 1, x.shape[1], d_params['Wemb'].shape[1]))  # n_sample, 1, n_length, n_emb
    real_input = dropout(real_input, trng, use_noise2)

    real_outputs = []
    for i in xrange(len(options['filter_hs'])):
        filter_shape = options['filter_shapes'][i]
        pool_size = options['pool_sizes'][i]
        conv_layer2 = encoder(d_params, real_input, filter_shape, pool_size,
                              options, prefix=_p('cnn_d', i))
        real_output = conv_layer2
        real_outputs.append(real_output)
    real_output = tensor.concatenate(real_outputs, 1)

    if options['batch_norm']:
        real_output = batch_norm(d_params, real_output, options, prefix='real')
    if options['cnn_activation'] == 'tanh':
        real_pred = mlp_layer_linear(d_params, real_output, prefix='dis_d')
    elif options['cnn_activation'] == 'linear':
        real_pred = mlp_layer_linear(d_params, tensor.tanh(real_output), prefix='dis_d')
    if not options['wgan']:
        real_pred = tensor.nnet.sigmoid(real_pred) * (1 - 2 * options['label_smoothing']) + options['label_smoothing']

    # Compute for KDE
    mu = real_output
    X = fake_output1
    KDE = cal_nkde(X, mu, options['kde_sigma'])

    # calculate KDE on real_input and fake_input
    X_i = fake_input.reshape((n_samples, n_steps * d_params['Wemb'].shape[1]))
    mu_i = real_input.reshape((n_samples, n_steps * d_params['Wemb'].shape[1]))
    KDE_input = cal_nkde(X_i, mu_i, options['kde_sigma'])

    # sufficient statistics
    cur_size = s_params['seen_size'] * 1.0
    identity = tensor.eye(options['n_z']) * options['diag']
    fake_mean = tensor.mean(fake_output1, axis=0)
    real_mean = tensor.mean(real_output, axis=0)
    fake_xx = tensor.dot(fake_output1.T, fake_output1)
    real_xx = tensor.dot(real_output.T, real_output)
    acc_fake_xx = (s_params['acc_fake_xx'] * cur_size + fake_xx) / (cur_size + n_samples)
    acc_real_xx = (s_params['acc_real_xx'] * cur_size + real_xx) / (cur_size + n_samples)
    acc_fake_mean = (s_params['acc_fake_mean'] * cur_size + fake_mean * n_samples) / (cur_size + n_samples)
    acc_real_mean = (s_params['acc_real_mean'] * cur_size + real_mean * n_samples) / (cur_size + n_samples)

    cov_fake = acc_fake_xx - tensor.dot(acc_fake_mean.dimshuffle(0, 'x'),
                                        acc_fake_mean.dimshuffle(0, 'x').T) + identity
    cov_real = acc_real_xx - tensor.dot(acc_real_mean.dimshuffle(0, 'x'),
                                        acc_real_mean.dimshuffle(0, 'x').T) + identity
    cov_fake_inv = tensor.nlinalg.matrix_inverse(cov_fake)
    cov_real_inv = tensor.nlinalg.matrix_inverse(cov_real)

    if options['feature_match'] == 'moment':
        temp1 = ((fake_mean - real_mean)**2).sum()
        fake_obj = temp1
    elif options['feature_match'] == 'JSD_acc':
        temp1 = tensor.nlinalg.trace(
            tensor.dot(cov_fake_inv, cov_real) + tensor.dot(cov_real_inv, cov_fake))
        temp2 = tensor.dot(
            tensor.dot((acc_fake_mean - acc_real_mean), (cov_fake_inv + cov_real_inv)),
            (acc_fake_mean - acc_real_mean).T)
        fake_obj = temp1 + temp2
    elif options['feature_match'] == 'mmd':
        #### too many nodes, use scan ####
        kxx, kxy, kyy = 0, 0, 0
        dividend = 1
        dist_x, dist_y = fake_output1 / dividend, real_output / dividend
        x_sq = tensor.sum(dist_x**2, axis=1).dimshuffle(0, 'x')  # 64*1
        y_sq = tensor.sum(dist_y**2, axis=1).dimshuffle(0, 'x')  # 64*1
        tempxx = -2 * tensor.dot(dist_x, dist_x.T) + x_sq + x_sq.T  # (xi - xj)**2
        tempxy = -2 * tensor.dot(dist_x, dist_y.T) + x_sq + y_sq.T  # (xi - yj)**2
        tempyy = -2 * tensor.dot(dist_y, dist_y.T) + y_sq + y_sq.T  # (yi - yj)**2
        for sigma in options['sigma_range']:
            kxx += tensor.mean(tensor.exp(-tempxx / 2 / (sigma**2)))
            kxy += tensor.mean(tensor.exp(-tempxy / 2 / (sigma**2)))
            kyy += tensor.mean(tensor.exp(-tempyy / 2 / (sigma**2)))
        fake_obj = tensor.sqrt(kxx + kyy - 2 * kxy)
    elif options['feature_match'] == 'mmd_cov':
        kxx, kxy, kyy = 0, 0, 0
        cov_sum = (cov_fake + cov_real) / 2
        cov_sum_inv = tensor.nlinalg.matrix_inverse(cov_sum)
        dividend = 1
        dist_x, dist_y = fake_output1 / dividend, real_output / dividend
        cov_inv_mat = cov_sum_inv
        x_sq = tensor.sum(tensor.dot(dist_x, cov_inv_mat) * dist_x, axis=1).dimshuffle(0, 'x')
        y_sq = tensor.sum(tensor.dot(dist_y, cov_inv_mat) * dist_y, axis=1).dimshuffle(0, 'x')
        tempxx = -2 * tensor.dot(tensor.dot(dist_x, cov_inv_mat), dist_x.T) + x_sq + x_sq.T  # (xi - xj)**2
        tempxy = -2 * tensor.dot(tensor.dot(dist_x, cov_inv_mat), dist_y.T) + x_sq + y_sq.T  # (xi - yj)**2
        tempyy = -2 * tensor.dot(tensor.dot(dist_y, cov_inv_mat), dist_y.T) + y_sq + y_sq.T  # (yi - yj)**2
        for sigma in options['sigma_range']:
            kxx += tensor.mean(tensor.exp(-tempxx / 2 / (sigma**2)))
            kxy += tensor.mean(tensor.exp(-tempxy / 2 / (sigma**2)))
            kyy += tensor.mean(tensor.exp(-tempyy / 2 / (sigma**2)))
        fake_obj = tensor.sqrt(kxx + kyy - 2 * kxy)
    elif options['feature_match'] == 'mmd_ld':
        kxx, kxy, kyy = 0, 0, 0
        real_mmd = mlp_layer_tanh(d_params, real_output, prefix='dis_mmd')
        fake_mmd = mlp_layer_tanh(d_params, fake_output1, prefix='dis_mmd')
        dividend = options['dim_mmd']  # for numerical stability & scale
        dist_x, dist_y = fake_mmd / dividend, real_mmd / dividend
        x_sq = tensor.sum(dist_x**2, axis=1).dimshuffle(0, 'x')  # 64*1
        y_sq = tensor.sum(dist_y**2, axis=1).dimshuffle(0, 'x')  # 64*1
        tempxx = -2 * tensor.dot(dist_x, dist_x.T) + x_sq + x_sq.T  # (xi - xj)**2
        tempxy = -2 * tensor.dot(dist_x, dist_y.T) + x_sq + y_sq.T  # (xi - yj)**2
        tempyy = -2 * tensor.dot(dist_y, dist_y.T) + y_sq + y_sq.T  # (yi - yj)**2
        for sigma in options['sigma_range']:
            kxx += tensor.exp(-tempxx / 2 / sigma).sum()
            kxy += tensor.exp(-tempxy / 2 / sigma).sum()
            kyy += tensor.exp(-tempyy / 2 / sigma).sum()
        fake_obj = tensor.sqrt(kxx + kyy - 2 * kxy)
    elif options['feature_match'] == 'mmd_h':
        #### too many nodes, use scan ####
        kxx, kxy, kyy = 0, 0, 0
        if options['cnn_activation'] == 'tanh':
            fake_mmd = middle_layer(d_params, fake_output1, prefix='dis_d')
        elif options['cnn_activation'] == 'linear':
            fake_mmd = middle_layer(d_params, tensor.tanh(fake_output1), prefix='dis_d')
        if options['cnn_activation'] == 'tanh':
            real_mmd = middle_layer(d_params, real_output, prefix='dis_d')
        elif options['cnn_activation'] == 'linear':
            real_mmd = middle_layer(d_params, tensor.tanh(real_output), prefix='dis_d')
        dividend = 1
        dist_x, dist_y = fake_mmd / dividend, real_mmd / dividend
        x_sq = tensor.sum(dist_x**2, axis=1).dimshuffle(0, 'x')  # 64*1
        y_sq = tensor.sum(dist_y**2, axis=1).dimshuffle(0, 'x')  # 64*1
        tempxx = -2 * tensor.dot(dist_x, dist_x.T) + x_sq + x_sq.T  # (xi - xj)**2
        tempxy = -2 * tensor.dot(dist_x, dist_y.T) + x_sq + y_sq.T  # (xi - yj)**2
        tempyy = -2 * tensor.dot(dist_y, dist_y.T) + y_sq + y_sq.T  # (yi - yj)**2
        for sigma in options['sigma_range']:
            kxx += tensor.mean(tensor.exp(-tempxx / 2 / (sigma**2)))
            kxy += tensor.mean(tensor.exp(-tempxy / 2 / (sigma**2)))
            kyy += tensor.mean(tensor.exp(-tempyy / 2 / (sigma**2)))
        fake_obj = tensor.sqrt(kxx + kyy - 2 * kxy)
    else:
        fake_obj = -tensor.log(fake_pred + 1e-6).sum() / n_z

    if options['wgan']:
        gan_cost_d = fake_pred.sum() / n_z - real_pred.sum() / n_samples
        gan_cost_g = -fake_pred.sum() / n_z + 0 * ((fake_mean - acc_real_mean)**2).sum()
    else:
        gan_cost_d = -tensor.log(1 - fake_pred + 1e-6).sum() / n_z - tensor.log(real_pred + 1e-6).sum() / n_samples
        gan_cost_g = fake_obj
        # result4 = fake_obj

    d_cost = gan_cost_d - options['lambda_fm'] * fake_obj + options['lambda_recon'] * r_cost + options['lambda_q'] * l_I / n_z
    g_cost = gan_cost_g - options['lambda_q'] * l_I / n_z

    # result1, result2, result4, result5, result6 = x_sq, y_sq, tempxx, tempxy, tempyy
    result1 = tensor.mean(real_pred)  # goes to nan
    result2 = tensor.mean(fake_pred)  # goes to nan
    result3 = tensor.argmax(pred_w, axis=1).reshape([n_samples, n_steps])
    result4 = tensor.nlinalg.trace(
        tensor.dot(cov_fake_inv, cov_real) + tensor.dot(cov_real_inv, cov_fake))
    result5 = max_print[0]  # mu  # tensor.dot(tensor.dot((acc_fake_mean - acc_real_mean), (cov_fake_inv + cov_real_inv)), (acc_fake_mean - acc_real_mean).T)
    result6 = ((fake_mean - real_mean)**2).sum()

    return (use_noise, use_noise2, x, z, d_cost, g_cost, r_cost, fake_recon,
            acc_fake_xx, acc_real_xx, acc_fake_mean, acc_real_mean,
            result1, result2, result3, result4, result5, result6, KDE, KDE_input)
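The 'mmd' feature-matching branch above estimates a multi-kernel RBF maximum mean discrepancy between fake and real discriminator features, expanding pairwise squared distances as ||x||^2 - 2<x, y> + ||y||^2. A minimal NumPy sketch of that estimator (not the author's Theano graph; array shapes and the sigma range are placeholders):

import numpy as np

def mmd_rbf(fake, real, sigma_range=(1.0, 2.0, 4.0)):
    # pairwise squared distances via the ||x||^2 - 2<x,y> + ||y||^2 expansion
    x_sq = np.sum(fake ** 2, axis=1, keepdims=True)          # (n, 1)
    y_sq = np.sum(real ** 2, axis=1, keepdims=True)          # (m, 1)
    dxx = -2 * fake @ fake.T + x_sq + x_sq.T                 # (xi - xj)**2
    dxy = -2 * fake @ real.T + x_sq + y_sq.T                 # (xi - yj)**2
    dyy = -2 * real @ real.T + y_sq + y_sq.T                 # (yi - yj)**2
    kxx = kxy = kyy = 0.0
    for sigma in sigma_range:                                 # sum of RBF kernels
        kxx += np.mean(np.exp(-dxx / 2 / sigma ** 2))
        kxy += np.mean(np.exp(-dxy / 2 / sigma ** 2))
        kyy += np.mean(np.exp(-dyy / 2 / sigma ** 2))
    return np.sqrt(kxx + kyy - 2 * kxy)

print(mmd_rbf(np.random.randn(64, 900), np.random.randn(64, 900)))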
def VAE(input_shape=[None, 784],
        n_filters=[64, 64, 64],
        filter_sizes=[4, 4, 4],
        n_hidden=32,
        n_code=2,
        activation=tf.nn.tanh,
        dropout=False,
        denoising=False,
        convolutional=False,
        variational=False,
        on_cloud=0):
    """(Variational) (Convolutional) (Denoising) Autoencoder.

    Uses tied weights.

    Parameters
    ----------
    input_shape : list, optional
        Shape of the input to the network. e.g. for MNIST: [None, 784].
    n_filters : list, optional
        Number of filters for each layer.
        If convolutional=True, this refers to the total number of output
        filters to create for each layer, with each layer's number of output
        filters as a list.
        If convolutional=False, then this refers to the total number of neurons
        for each layer in a fully connected network.
    filter_sizes : list, optional
        Only applied when convolutional=True. This refers to the ksize (height
        and width) of each convolutional layer.
    n_hidden : int, optional
        Only applied when variational=True. This refers to the first fully
        connected layer prior to the variational embedding, directly after
        the encoding. After the variational embedding, another fully connected
        layer is created with the same size prior to decoding. Set to 0 to
        not use an additional hidden layer.
    n_code : int, optional
        Only applied when variational=True. This refers to the number of
        latent Gaussians to sample for creating the innermost encoding.
    activation : function, optional
        Activation function to apply to each layer, e.g. tf.nn.relu
    dropout : bool, optional
        Whether or not to apply dropout. If using dropout, you must feed a
        value for 'keep_prob', as returned in the dictionary. 1.0 means no
        dropout is used. 0.0 means every connection is dropped. Sensible
        values are between 0.5-0.8.
    denoising : bool, optional
        Whether or not to apply denoising. If using denoising, you must feed
        a value for 'corrupt_prob', as returned in the dictionary. 1.0 means
        no corruption is used. 0.0 means every feature is corrupted. Sensible
        values are between 0.5-0.8.
    convolutional : bool, optional
        Whether or not to use a convolutional network, or else a fully
        connected network will be created. This affects the n_filters
        parameter's meaning.
    variational : bool, optional
        Whether or not to create a variational embedding layer. This will
        create a fully connected layer after the encoding, if `n_hidden` is
        greater than 0, then will create a multivariate gaussian sampling
        layer, then another fully connected layer. The size of the fully
        connected layers are determined by `n_hidden`, and the size of the
        sampling layer is determined by `n_code`.

    Returns
    -------
    model : dict
        {
            'cost': Tensor to optimize.
            'Ws': All weights of the encoder.
            'x': Input Placeholder
            'z': Innermost encoding Tensor (latent features)
            'y': Reconstruction of the Decoder
            'keep_prob': Amount to keep when using Dropout
            'corrupt_prob': Amount to corrupt when using Denoising
            'train': Set to True when training/Applies to Batch Normalization.
        }
    """
    # network input / placeholders for train (bn) and dropout
    x = tf.placeholder(tf.float32, input_shape, 'x')
    phase_train = tf.placeholder(tf.bool, name='phase_train')
    keep_prob = tf.placeholder(tf.float32, name='keep_prob')
    corrupt_prob = tf.placeholder(tf.float32, [1])

    # apply noise if denoising
    x_ = (utils.corrupt(x) * corrupt_prob + x * (1 - corrupt_prob)) if denoising else x

    # 2d -> 4d if convolution
    x_tensor = utils.to_tensor(x_) if convolutional else x_
    current_input = x_tensor

    Ws = []
    shapes = []

    # Build the encoder
    for layer_i, n_output in enumerate(n_filters):
        with tf.variable_scope('encoder/{}'.format(layer_i)):
            shapes.append(current_input.get_shape().as_list())
            if convolutional:
                h, W = utils.conv2d(x=current_input,
                                    n_output=n_output,
                                    k_h=filter_sizes[layer_i],
                                    k_w=filter_sizes[layer_i])
            else:
                h, W = utils.linear(x=current_input, n_output=n_output)
            h = activation(batch_norm(h, phase_train, 'bn' + str(layer_i)))
            if dropout:
                h = tf.nn.dropout(h, keep_prob)
            Ws.append(W)
            current_input = h

    shapes.append(current_input.get_shape().as_list())

    with tf.variable_scope('variational'):
        if variational:
            dims = current_input.get_shape().as_list()
            flattened = utils.flatten(current_input)

            if n_hidden:
                h = utils.linear(flattened, n_hidden, name='W_fc')[0]
                h = activation(batch_norm(h, phase_train, 'fc/bn'))
                if dropout:
                    h = tf.nn.dropout(h, keep_prob)
            else:
                h = flattened

            z_mu = utils.linear(h, n_code, name='mu')[0]
            z_log_sigma = 0.5 * utils.linear(h, n_code, name='log_sigma')[0]

            # Sample from noise distribution p(eps) ~ N(0, 1)
            epsilon = tf.random_normal(tf.stack([tf.shape(x)[0], n_code]))

            # Sample from posterior
            z = z_mu + tf.multiply(epsilon, tf.exp(z_log_sigma))

            if n_hidden:
                h = utils.linear(z, n_hidden, name='fc_t')[0]
                h = activation(batch_norm(h, phase_train, 'fc_t/bn'))
                if dropout:
                    h = tf.nn.dropout(h, keep_prob)
            else:
                h = z

            size = dims[1] * dims[2] * dims[3] if convolutional else dims[1]
            h = utils.linear(h, size, name='fc_t2')[0]
            current_input = activation(batch_norm(h, phase_train, 'fc_t2/bn'))
            if dropout:
                current_input = tf.nn.dropout(current_input, keep_prob)

            if convolutional:
                current_input = tf.reshape(
                    current_input,
                    tf.stack([tf.shape(current_input)[0], dims[1], dims[2], dims[3]]))
        else:
            z = current_input

    shapes.reverse()
    n_filters.reverse()
    Ws.reverse()
    n_filters += [input_shape[-1]]

    # %%
    # Decoding layers
    for layer_i, n_output in enumerate(n_filters[1:]):
        with tf.variable_scope('decoder/{}'.format(layer_i)):
            shape = shapes[layer_i + 1]
            if convolutional:
                h, W = utils.deconv2d(x=current_input,
                                      n_output_h=shape[1],
                                      n_output_w=shape[2],
                                      n_output_ch=shape[3],
                                      n_input_ch=shapes[layer_i][3],
                                      k_h=filter_sizes[layer_i],
                                      k_w=filter_sizes[layer_i])
            else:
                h, W = utils.linear(x=current_input, n_output=n_output)
            h = activation(batch_norm(h, phase_train, 'dec/bn' + str(layer_i)))
            if dropout:
                h = tf.nn.dropout(h, keep_prob)
            current_input = h

    y = current_input
    x_flat = utils.flatten(x)
    y_flat = utils.flatten(y)

    # l2 loss
    loss_x = tf.reduce_sum(tf.squared_difference(x_flat, y_flat), 1)

    if variational:
        # variational lower bound, kl-divergence
        loss_z = -0.5 * tf.reduce_sum(
            1.0 + 2.0 * z_log_sigma - tf.square(z_mu) - tf.exp(2.0 * z_log_sigma), 1)

        # add l2 loss
        cost = tf.reduce_mean(loss_x + loss_z)
    else:
        # just optimize l2 loss
        cost = tf.reduce_mean(loss_x)

    return {
        'cost': cost,
        'Ws': Ws,
        'x': x,
        'z': z,
        'y': y,
        'keep_prob': keep_prob,
        'corrupt_prob': corrupt_prob,
        'train': phase_train
    }
def __call__(self, inputs, state, scope=None, is_training=True, reuse=None, reuse_bn=None):
    self.unroll_count += 1
    with tf.variable_scope(scope or type(self).__name__):
        if self._state_is_tuple:
            c, h = state
        else:
            c, h = nn.split(state, 2, 1)
        with tf.variable_scope("LSTM_weights", reuse=reuse):
            print("reuse is ", reuse)
            i2h = _linear([inputs], 4 * self._num_units, True,
                          scope="LinearI", init_scale=self.init_scale)
            h2h = _linear([h], 4 * self._num_units, True,
                          scope="LinearH", init_scale=self.init_scale)
            beta_i = nn.weight_variable([4 * self._num_units],
                                        init_method="constant",
                                        init_param={"val": 0.0},
                                        name="beta_i")
            gamma_i = nn.weight_variable([4 * self._num_units],
                                         init_method="constant",
                                         init_param={"val": 0.1},
                                         name="gamma_i")
            beta_h = nn.weight_variable([4 * self._num_units],
                                        init_method="constant",
                                        init_param={"val": 0.0},
                                        name="beta_h")
            gamma_h = nn.weight_variable([4 * self._num_units],
                                         init_method="constant",
                                         init_param={"val": 0.1},
                                         name="gamma_h")
            beta_c = nn.weight_variable([self._num_units],
                                        init_method="constant",
                                        init_param={"val": 0.0},
                                        name="beta_c")
            gamma_c = nn.weight_variable([self._num_units],
                                         init_method="constant",
                                         init_param={"val": 0.1},
                                         name="gamma_c")

            # Separate batch norm for the input-to-hidden and hidden-to-hidden
            # projections, with per-time-step statistics (unroll_count in the scope).
            i2h_norm, mean_i = batch_norm(i2h,
                                          self._num_units * 4,
                                          is_training,
                                          reuse=reuse_bn,
                                          gamma=gamma_i,
                                          beta=beta_i,
                                          axes=[0],
                                          eps=self.eps,
                                          scope="bn_i_{}".format(self.unroll_count),
                                          return_mean=True)
            # if self.l1_reg > 0.0:
            #     tf.add_to_collection(L1_REG_KEY,
            #                          self.l1_reg * tf.reduce_mean(tf.abs(i2h - mean_i)))
            h2h_norm, mean_h = batch_norm(h2h,
                                          self._num_units * 4,
                                          is_training,
                                          reuse=reuse_bn,
                                          gamma=gamma_h,
                                          beta=beta_h,
                                          axes=[0],
                                          eps=self.eps,
                                          scope="bn_h_{}".format(self.unroll_count),
                                          return_mean=True)
            # if self.l1_reg > 0.0:
            #     tf.add_to_collection(L1_REG_KEY,
            #                          self.l1_reg * tf.reduce_mean(tf.abs(h2h - mean_h)))

        i, j, f, o = nn.split(i2h_norm + h2h_norm, 4, 1)
        new_c = (c * self.gate_activation(f + self._forget_bias) +
                 self.gate_activation(i) * self.state_activation(j))
        new_c_norm, mean_c = batch_norm(new_c,
                                        self._num_units,
                                        is_training,
                                        reuse=reuse_bn,
                                        gamma=gamma_c,
                                        beta=beta_c,
                                        axes=[0],
                                        eps=self.eps,
                                        scope="bn_c_{}".format(self.unroll_count),
                                        return_mean=True)
        # if self.l1_reg > 0.0:
        #     tf.add_to_collection(L1_REG_KEY,
        #                          self.l1_reg * tf.reduce_mean(tf.abs(new_c - mean_c)))
        new_h = self.state_activation(new_c_norm) * self.gate_activation(o)
        if self._state_is_tuple:
            new_state = LSTMStateTuple(new_c_norm, new_h)
        else:
            new_state = nn.concat([new_c_norm, new_h], 1)
        return new_h, new_state
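A small NumPy illustration (separate from the cell above) of what each batch_norm call computes: per-feature statistics over the batch axis (axes=[0]), with the recurrent-batch-norm convention of initializing gamma to 0.1. Shapes and eps are placeholder values.

import numpy as np

def bn(acts, gamma=0.1, beta=0.0, eps=1e-3):
    # normalize each feature over the batch dimension, then scale and shift
    mean = acts.mean(axis=0, keepdims=True)
    var = acts.var(axis=0, keepdims=True)
    return gamma * (acts - mean) / np.sqrt(var + eps) + beta

i2h = np.random.randn(32, 4 * 128)      # batch of input-to-hidden pre-activations
print(bn(i2h).std(axis=0).mean())       # ~0.1, i.e. the gamma init sets the post-BN scale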