def BatchnormB(name, axes, inputs, is_training=None, stats_iter=None,
               update_moving_stats=True, fused=True, labels=None,
               n_labels=None, n_start_labels=None):
    """Based on conditional batchnorm (Dumoulin et al. 2016) for BCHW conv filtermaps."""
    if axes != [0, 2, 3]:
        raise Exception('unsupported')
    mean, var = tf.nn.moments(inputs, axes, keep_dims=True)
    shape = mean.get_shape().as_list()  # shape is [1,n,1,1]
    # Only the first n_start_labels rows start with scale 1; the rest start at 0.
    init_scale = np.zeros([n_labels, shape[1]], dtype='float32')
    init_scale[:n_start_labels] = 1.0
    offset_m = lib.param(name + '.offset',
                         np.zeros([n_labels, shape[1]], dtype='float32'))
    scale_m = lib.param(name + '.scale', init_scale)
    # Soft (matmul) conditioning: labels is a dense (batch, n_labels) matrix,
    # so rows can mix several label embeddings rather than hard-select one.
    offset = tf.matmul(labels, offset_m)
    scale = tf.matmul(labels, scale_m)
    result = tf.nn.batch_normalization(inputs, mean, var,
                                       offset[:, :, None, None],
                                       scale[:, :, None, None], 1e-5)
    return result, offset_m, scale_m
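# Usage sketch for BatchnormB (shapes and names below are illustrative
# assumptions, not from the original code):
#
#     images = tf.placeholder(tf.float32, [None, 64, 32, 32])   # BCHW
#     labels = tf.placeholder(tf.float32, [None, 10])           # one-hot or soft
#     normed, offset_m, scale_m = BatchnormB(
#         'G.BN1', [0, 2, 3], images,
#         labels=labels, n_labels=10, n_start_labels=5)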
def BiLSTM(name, inputs, n_in, n_hid, h0_1=None, h0_2=None):
    """
    Compute recurrent memory states using Bidirectional Long Short-Term Memory units

    :parameters:
        n_in : int ; Dimensionality of input
        n_hid : int ; Dimensionality of hidden state / memory state
        h0_1 : vector ; Initial hidden state of forward LSTM
        h0_2 : vector ; Initial hidden state of backward LSTM
    """
    batch_size = tf.shape(inputs)[0]
    if h0_1 is None:
        h0_1 = tflib.param(name + '.init.h0_1', np.zeros(2 * n_hid, dtype='float32'))
        h0_1 = tf.reshape(tf.tile(h0_1, tf.stack([batch_size])),
                          tf.stack([batch_size, 2 * n_hid]))
    if h0_2 is None:
        h0_2 = tflib.param(name + '.init.h0_2', np.zeros(2 * n_hid, dtype='float32'))
        h0_2 = tf.reshape(tf.tile(h0_2, tf.stack([batch_size])),
                          tf.stack([batch_size, 2 * n_hid]))

    cell1 = LSTMCell(name + '_fw', n_in, n_hid)
    cell2 = LSTMCell(name + '_bw', n_in, n_hid)

    seq_len = tf.tile(tf.expand_dims(tf.shape(inputs)[1], 0), [batch_size])
    outputs = tf.nn.bidirectional_dynamic_rnn(cell1, cell2, inputs,
                                              sequence_length=seq_len,
                                              initial_state_fw=h0_1,
                                              initial_state_bw=h0_2,
                                              swap_memory=True)
    return tf.concat(axis=2, values=[outputs[0][0], outputs[0][1]])
def Layernorm(name, norm_axes, inputs, labels=None, n_labels=None):
    """labels and n_labels implement 'conditional batchnorm' (Dumoulin et al. 2016)"""
    mean, var = tf.nn.moments(inputs, norm_axes, keep_dims=True)

    # Assume the 'neurons' axis is the first of norm_axes. This is the case for
    # fully-connected and BCHW conv layers.
    n_neurons = inputs.get_shape().as_list()[norm_axes[0]]

    if labels is None:
        offset = lib.param(name + '.b', np.zeros(n_neurons, dtype='float32'))
        scale = lib.param(name + '.scale', np.ones(n_neurons, dtype='float32'))
        # Add broadcasting dims to offset and scale (e.g. BCHW conv data)
        offset = tf.reshape(offset, [-1] + [1 for i in range(len(norm_axes) - 1)])
        scale = tf.reshape(scale, [-1] + [1 for i in range(len(norm_axes) - 1)])
    else:
        offset_m = lib.param(name + '.b',
                             np.zeros([n_labels, n_neurons], dtype='float32'))
        scale_m = lib.param(name + '.scale',
                            np.ones([n_labels, n_neurons], dtype='float32'))
        offset = tf.nn.embedding_lookup(offset_m, labels)
        scale = tf.nn.embedding_lookup(scale_m, labels)
        # Add H and W broadcasting dims
        if norm_axes != [1, 2, 3]:
            raise Exception('unsupported')
        offset = offset[:, :, None, None]
        scale = scale[:, :, None, None]

    result = tf.nn.batch_normalization(inputs, mean, var, offset, scale, 1e-5)
    return result
def MiniBatchLayer(name, num_inputs, num_kernels, dim_per_kernel, inputs):
    """Minibatch discrimination (Salimans et al. 2016): appends to each sample a
    feature summarizing its L1 distances to the other samples in the batch."""
    with tf.name_scope(name) as scope:
        def uniform(stdev, size):
            if _weights_stdev is not None:
                stdev = _weights_stdev
            return np.random.uniform(
                low=-stdev * np.sqrt(3),
                high=stdev * np.sqrt(3),
                size=size
            ).astype('float32')

        weight_values = uniform(np.sqrt(2. / num_inputs),
                                (num_inputs, num_kernels, dim_per_kernel))
        weight = lib.param(name + '.W', weight_values)
        bias = lib.param(name + '.b', np.zeros((num_kernels,), dtype='float32'))

        # (batch, num_kernels, dim_per_kernel)
        activation = tf.tensordot(inputs, weight, [[1], [0]])
        # Pairwise L1 distances between samples; the 1e6 * eye term masks out
        # each sample's distance to itself.
        abs_dif = (tf.reduce_sum(
            tf.abs(tf.expand_dims(activation, axis=-1) -
                   tf.expand_dims(tf.transpose(activation, perm=[1, 2, 0]), axis=0)),
            axis=2)
            + 1e6 * tf.expand_dims(tf.eye(tf.shape(inputs)[0]), axis=1))
        f = tf.reduce_sum(tf.exp(-abs_dif), axis=2)
        f += tf.expand_dims(bias, axis=0)
        return tf.concat([inputs, f], axis=1)
def conv2d(name, input, kernel, stride, depth, num_filters,
           init='GlorotUniform', pad='SAME', bias=True, weightnorm=False,
           batchnorm=False, is_training=True, **kwargs):
    """
    Performs 2D convolution on input in NHWC data format

    :parameters:
        input - input to be convolved
        kernel - int; size of convolutional kernel
        stride - int; horizontal / vertical stride to be used
        depth - int; no. of channels of input
        num_filters - int; no. of output channels required
        batchnorm - flag that denotes whether batch normalization should be applied
        is_training - flag that denotes batch normalization mode
    """
    with tf.name_scope(name) as scope:
        filter_values = initializer(init, (kernel, kernel, depth, num_filters),
                                    gain='relu', **kwargs)
        filters = tflib.param(name + '.W', filter_values)

        if weightnorm:
            norm_values = np.sqrt(np.sum(np.square(filter_values), axis=(0, 1, 2)))
            target_norms = tflib.param(name + '.g', norm_values)
            with tf.name_scope('weightnorm') as scope:
                norms = tf.sqrt(tf.reduce_sum(tf.square(filters),
                                              reduction_indices=[0, 1, 2]))
                filters = filters * (target_norms / norms)

        out = tf.nn.conv2d(input, filters, strides=[1, stride, stride, 1],
                           padding=pad, data_format='NHWC')

        if bias:
            b = tflib.param(name + '.b', np.zeros(num_filters, dtype=np.float32))
            out = tf.nn.bias_add(out, b, data_format='NHWC')

        if batchnorm:
            out = tf.contrib.layers.batch_norm(inputs=out, scope=scope,
                                               is_training=is_training,
                                               data_format='NHWC')
        return out
def Batchnorm(name, axes, inputs, is_training=None, stats_iter=None,
              update_moving_stats=True, fused=True, labels=None, n_labels=None):
    """conditional batchnorm (Dumoulin et al. 2016) for BCHW conv filtermaps"""
    if axes != [0, 2, 3]:
        raise Exception('unsupported')
    mean, var = tf.nn.moments(inputs, axes, keep_dims=True)
    shape = mean.get_shape().as_list()  # shape is [1,n,1,1]
    offset_m = lib.param(name + '.offset',
                         np.zeros([n_labels, shape[1]], dtype='float32'))
    scale_m = lib.param(name + '.scale',
                        np.ones([n_labels, shape[1]], dtype='float32'))
    # Moving statistics are allocated per label but not consumed in this version.
    moving_mean_m = lib.param(name + '.moving_mean',
                              np.zeros([n_labels, shape[1]], dtype='float32'),
                              trainable=False)
    moving_variance_m = lib.param(name + '.moving_variance',
                                  np.ones([n_labels, shape[1]], dtype='float32'),
                                  trainable=False)
    offset = tf.nn.embedding_lookup(offset_m, labels)
    scale = tf.nn.embedding_lookup(scale_m, labels)
    result = tf.nn.batch_normalization(inputs, mean, var,
                                       offset[:, :, None, None],
                                       scale[:, :, None, None], 1e-5)
    return result
def im2latexAttention(name, inputs, ctx, input_dim, ENC_DIM, DEC_DIM, D, H, W):
    """
    Function that encodes the feature grid extracted from CNN using BiLSTM encoder
    and decodes target sequences using an attentional decoder mechanism

    PS: Feature grid can be of variable size (as long as size is within 'H' and 'W')

    :parameters:
        ctx - (N,C,H,W) format ; feature grid extracted from CNN
        input_dim - int ; Dimensionality of input sequences (Usually, Embedding Dimension)
        ENC_DIM - int; Dimensionality of BiLSTM Encoder
        DEC_DIM - int; Dimensionality of Attentional Decoder
        D - int; No. of channels in feature grid
        H - int; Maximum height of feature grid
        W - int; Maximum width of feature grid
    """
    V = tf.transpose(ctx, [0, 2, 3, 1])  # (B, H, W, D)
    batch_size = tf.shape(ctx)[0]

    h0_i_1 = tf.tile(
        tflib.param(name + '.Enc_.init.h0_1',
                    np.zeros((1, H, 2 * ENC_DIM)).astype('float32')),
        [batch_size, 1, 1])
    h0_i_2 = tf.tile(
        tflib.param(name + '.Enc_init.h0_2',
                    np.zeros((1, H, 2 * ENC_DIM)).astype('float32')),
        [batch_size, 1, 1])

    def fn(prev_out, i):
        # Encode row i of the feature grid with the shared BiLSTM encoder.
        return tflib.ops.BiLSTM(name + '.BiLSTMEncoder', V[:, i], D, ENC_DIM,
                                h0_i_1[:, i], h0_i_2[:, i])

    V_cap = tf.scan(fn, tf.range(tf.shape(V)[1]),
                    initializer=tf.placeholder(shape=(None, None, 2 * ENC_DIM),
                                               dtype=tf.float32))
    V_t = tf.reshape(tf.transpose(V_cap, [1, 0, 2, 3]),
                     [tf.shape(inputs)[0], -1, ENC_DIM * 2])  # (B, L, 2*ENC_DIM)

    h0_dec = tf.tile(
        tflib.param(name + '.Decoder.init.h0',
                    np.zeros((1, 3 * DEC_DIM)).astype('float32')),
        [batch_size, 1])
    cell = tflib.ops.im2latexAttentionCell(name + '.AttentionCell', input_dim,
                                           DEC_DIM, H * W, 2 * ENC_DIM, V_t)
    seq_len = tf.tile(tf.expand_dims(tf.shape(inputs)[1], 0), [batch_size])
    out = tf.nn.dynamic_rnn(cell, inputs, initial_state=h0_dec,
                            sequence_length=seq_len, swap_memory=True)
    return out
def BiLSTM(name, inputs, n_in, n_hid, h0_1=None, h0_2=None):
    """
    Compute recurrent memory states using Bidirectional Long Short-Term Memory units

    :parameters:
        n_in : int ; Dimensionality of input
        n_hid : int ; Dimensionality of hidden state / memory state
        h0_1 : vector ; Initial hidden state of forward LSTM
        h0_2 : vector ; Initial hidden state of backward LSTM
    """
    batch_size = tf.shape(inputs)[0]
    if h0_1 is None:
        h0_1 = tflib.param(name + '.init.h0_1', np.zeros(2 * n_hid, dtype='float32'))
        h0_1 = tf.reshape(tf.tile(h0_1, tf.stack([batch_size])),
                          tf.stack([batch_size, 2 * n_hid]))
    if h0_2 is None:
        h0_2 = tflib.param(name + '.init.h0_2', np.zeros(2 * n_hid, dtype='float32'))
        h0_2 = tf.reshape(tf.tile(h0_2, tf.stack([batch_size])),
                          tf.stack([batch_size, 2 * n_hid]))

    cell_fw = LSTMCell(name + '_fw', n_in, n_hid)
    cell_bw = LSTMCell(name + '_bw', n_in, n_hid)

    seq_len = tf.tile(tf.expand_dims(tf.shape(inputs)[1], 0), [batch_size])
    outputs = tf.nn.bidirectional_dynamic_rnn(
        cell_fw, cell_bw, inputs,
        sequence_length=seq_len,
        initial_state_fw=h0_1,
        initial_state_bw=h0_2,
        swap_memory=True)
    return tf.concat(axis=2, values=[outputs[0][0], outputs[0][1]])
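# Usage sketch for BiLSTM (illustrative shapes; assumes the custom LSTMCell
# class used above is in scope). The output concatenates forward and backward
# states, so its last dimension is 2*n_hid:
#
#     seqs = tf.placeholder(tf.float32, [None, None, 128])   # (batch, time, n_in)
#     hidden = BiLSTM('encoder', seqs, n_in=128, n_hid=256)  # (batch, time, 512)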
def Linear(name, inputs, input_dim, output_dim, activation='linear', bias=True,
           init=None, weightnorm=False, **kwargs):
    """
    Compute a linear transform of one or more inputs, optionally with a bias.
    Supports more than 2 dimensions (in which case the last axis is considered
    the dimension to be transformed).

    :parameters:
        input_dim: tuple of ints, or int; dimensionality of the input
        output_dim: int; dimensionality of output
        activation: 'linear', 'sigmoid', etc.; used as gain parameter for weight
            initialization; DOES NOT APPLY THE ACTIVATION MENTIONED IN THIS PARAMETER
        bias: flag that denotes whether bias should be applied
        init: name of weight initializer to be used
        weightnorm: flag that denotes whether weight normalization should be applied
    """
    with tf.name_scope(name) as scope:
        weight_values = initializer(init, (input_dim, output_dim),
                                    gain=activation, **kwargs)
        weight = tflib.param(name + '.W', weight_values)

        if weightnorm:
            norm_values = np.sqrt(np.sum(np.square(weight_values), axis=0))
            # norm_values = np.linalg.norm(weight_values, axis=0)
            target_norms = tflib.param(name + '.g', norm_values)
            with tf.name_scope('weightnorm') as scope:
                norms = tf.sqrt(tf.reduce_sum(tf.square(weight),
                                              reduction_indices=[0]))
                weight = weight * (target_norms / norms)

        if inputs.get_shape().ndims == 2:
            result = tf.matmul(inputs, weight)
        else:
            reshaped_inputs = tf.reshape(inputs, [-1, input_dim])
            result = tf.matmul(reshaped_inputs, weight)
            result = tf.reshape(
                result,
                tf.stack(tf.unstack(tf.shape(inputs))[:-1] + [output_dim]))

        if bias:
            b = tflib.param(name + '.b', np.zeros((output_dim,), dtype='float32'))
            result = tf.nn.bias_add(result, b)
        return result
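# Usage sketch for Linear on a 3-D input (illustrative shapes; assumes the
# module-level initializer() helper referenced above). Only the last axis is
# transformed; leading axes are flattened and restored internally:
#
#     x = tf.placeholder(tf.float32, [None, None, 128])  # (batch, time, 128)
#     y = Linear('proj', x, 128, 256, activation='relu',
#                weightnorm=True)                         # (batch, time, 256)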
def Conv1D(name, input_dim, output_dim, filter_size, inputs, he_init=True,
           stride=1, save_filter=False):
    """
    inputs: tensor of shape (batch size, num channels, width)
    returns: tensor of shape (batch size, num channels, width)
    """
    with tf.name_scope(name):
        def uniform(stdev, size):
            return np.random.uniform(low=-stdev * np.sqrt(3),
                                     high=stdev * np.sqrt(3),
                                     size=size).astype('float32')

        if _weights_stdev is not None:
            filter_values = uniform(_weights_stdev,
                                    (filter_size, input_dim, output_dim))
        else:
            fan_in = input_dim * filter_size
            fan_out = output_dim * filter_size / stride
            if he_init:
                filters_stdev = np.sqrt(4. / (fan_in + fan_out))
            else:  # Normalized init (Glorot & Bengio)
                filters_stdev = np.sqrt(2. / (fan_in + fan_out))
            filter_values = uniform(filters_stdev,
                                    (filter_size, input_dim, output_dim))

        filters = lib.param(name + '.Filters', filter_values)

        inputs = tf.transpose(inputs, [0, 2, 1])  # NCW to NWC
        result = tf.nn.conv1d(value=inputs, filters=filters, stride=stride,
                              padding='SAME', data_format='NWC')
        _biases = lib.param(name + '.Biases',
                            np.zeros([output_dim], dtype='float32'))
        result = tf.nn.bias_add(result, _biases, data_format='NHWC')
        result = tf.transpose(result, [0, 2, 1])  # NWC to NCW
        # Note: squeeze removes ALL size-1 dims, including a batch of size 1.
        result = tf.squeeze(result)

        if save_filter:
            return result, filters
        else:
            return result
def Deconv2D(name, input_dim, output_dim, filter_size1, filter_size2, inputs,
             he_init=True):
    """
    inputs: tensor of shape (batch size, input_dim, height, width)
    returns: tensor of shape (batch size, output_dim, 2*height, 2*width)
    """
    with tf.name_scope(name):
        def uniform(stdev, size):
            return np.random.uniform(low=-stdev * np.sqrt(3),
                                     high=stdev * np.sqrt(3),
                                     size=size).astype('float32')

        stride = 2
        fan_in = input_dim * filter_size1 * filter_size2 / stride
        fan_out = output_dim * filter_size1 * filter_size2

        if _weights_stdev is not None:
            filter_values = uniform(
                _weights_stdev,
                (filter_size1, filter_size2, input_dim, output_dim))
        else:
            if he_init:
                filters_stdev = np.sqrt(4. / (fan_in + fan_out))
            else:  # Normalized init (Glorot & Bengio)
                filters_stdev = np.sqrt(2. / (fan_in + fan_out))
            filter_values = uniform(
                filters_stdev,
                (filter_size1, filter_size2, input_dim, output_dim))

        filters = lib.param(name + '.Filters', filter_values)

        inputs = tf.transpose(inputs, [0, 2, 3, 1], name='NCHW_to_NHWC')
        input_shape = tf.shape(inputs)
        # Nearest-neighbour upsample by 2x, then an ordinary stride-1 convolution.
        output_shape = tf.stack([2 * input_shape[1], 2 * input_shape[2]])
        resized_image = tf.image.resize_images(
            images=inputs, size=output_shape,
            method=tf.image.ResizeMethod.NEAREST_NEIGHBOR)
        result = tf.nn.conv2d(input=resized_image, filter=filters,
                              strides=[1, 1, 1, 1], padding='SAME')
        _biases = lib.param(name + '.Biases',
                            np.zeros(output_dim, dtype='float32'))
        result = tf.nn.bias_add(result, _biases)
        result = tf.transpose(result, [0, 3, 1, 2], name='NHWC_to_NCHW')
        return result
def Conv2D(name, input_dim, output_dim, filter_size1, filter_size2, inputs,
           he_init=True, stride=1, save_filter=False):
    """
    inputs: tensor of shape (batch size, num channels, height, width)
    returns: tensor of shape (batch size, num channels, height, width)
    """
    with tf.name_scope(name):
        def uniform(stdev, size):
            return np.random.uniform(low=-stdev * np.sqrt(3),
                                     high=stdev * np.sqrt(3),
                                     size=size).astype('float32')

        if _weights_stdev is not None:
            filter_values = uniform(
                _weights_stdev,
                (filter_size1, filter_size2, input_dim, output_dim))
        else:
            fan_in = input_dim * filter_size1 * filter_size2
            fan_out = output_dim * filter_size1 * filter_size2 / stride
            if he_init:
                filters_stdev = np.sqrt(4. / (fan_in + fan_out))
            else:  # Normalized init (Glorot & Bengio)
                filters_stdev = np.sqrt(2. / (fan_in + fan_out))
            filter_values = uniform(
                filters_stdev,
                (filter_size1, filter_size2, input_dim, output_dim))

        filters = lib.param(name + '.Filters', filter_values)
        result = tf.nn.conv2d(input=inputs, filter=filters,
                              strides=[1, 1, stride, stride],
                              padding='VALID', data_format='NCHW')
        _biases = lib.param(name + '.Biases',
                            np.zeros(output_dim, dtype='float32'))
        result = tf.nn.bias_add(result, _biases, data_format='NCHW')

        if save_filter:
            return result, filters
        else:
            return result
def Ladder(inputs, input_dim, name):
    """Ladder-network style denoising combinator: estimates the clean activation
    z_est from the lateral (noisy) signal z_lat and the top-down signal u."""
    with tf.name_scope(name) as scope:
        zs = np.zeros(input_dim).astype('float32')
        os = np.ones(input_dim).astype('float32')
        a1 = lib.param(name + '.a1', zs)
        a2 = lib.param(name + '.a2', os)
        a3 = lib.param(name + '.a3', zs)
        a4 = lib.param(name + '.a4', zs)
        c1 = lib.param(name + '.c1', zs)
        c2 = lib.param(name + '.c2', os)
        c3 = lib.param(name + '.c3', zs)
        c4 = lib.param(name + '.c4', zs)
        b1 = lib.param(name + '.b1', zs)

        z_lat, u = inputs
        sigval = c1 + c2 * z_lat
        sigval += c3 * u + c4 * z_lat * u
        sigval = tf.nn.sigmoid(sigval)
        z_est = a1 + a2 * z_lat + b1 * sigval
        z_est += a3 * u + a4 * z_lat * u
        return z_est
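# Usage sketch for Ladder (illustrative shapes). `z_lat` is the lateral noisy
# encoder activation and `u` the top-down decoder signal, as in ladder-network
# denoising combinators (Rasmus et al. 2015):
#
#     z_lat = tf.placeholder(tf.float32, [None, 500])
#     u = tf.placeholder(tf.float32, [None, 500])
#     z_est = Ladder((z_lat, u), 500, 'Decoder.Combinator1')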
def make_adam_vars(name, params):
    """Allocate Adam optimizer slots: a shared timestep t plus first- and
    second-moment accumulators (m, v) for every parameter."""
    t = lib.param(name + '.t', np.float32(0.))
    ms, vs = [], []
    for i, param in enumerate(params):
        ms.append(lib.param(name + '.m_{}'.format(i),
                            np.zeros(param.get_shape(), dtype='float32')))
        vs.append(lib.param(name + '.v_{}'.format(i),
                            np.zeros(param.get_shape(), dtype='float32')))
    return (t, ms, vs)
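# Sketch of how the slots from make_adam_vars could drive a hand-rolled Adam
# step (the standard update of Kingma & Ba 2014; `params`, `grads`, LR, BETA1,
# BETA2, EPS are assumptions, not part of the original code):
#
#     t, ms, vs = make_adam_vars('adam', params)
#     new_t = tf.assign(t, t + 1)
#     lr_t = LR * tf.sqrt(1. - BETA2 ** new_t) / (1. - BETA1 ** new_t)
#     updates = []
#     for param, grad, m, v in zip(params, grads, ms, vs):
#         new_m = tf.assign(m, BETA1 * m + (1. - BETA1) * grad)
#         new_v = tf.assign(v, BETA2 * v + (1. - BETA2) * tf.square(grad))
#         updates.append(tf.assign_sub(param, lr_t * new_m / (tf.sqrt(new_v) + EPS)))
#     train_op = tf.group(*updates)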
def Batchnorm(name, axes, inputs, is_training=None, stats_iter=None,
              update_moving_stats=True, fused=True, labels=None, n_labels=None):
    """conditional batchnorm (Dumoulin et al. 2016) for BCHW conv filtermaps"""
    if axes != [0, 2, 3]:
        raise Exception('unsupported')
    mean, var = tf.nn.moments(inputs, axes, keep_dims=True)
    shape = mean.get_shape().as_list()  # shape is [1,n,1,1]
    offset_m = lib.param(name + '.offset',
                         np.zeros([n_labels, shape[1]], dtype='float32'))
    scale_m = lib.param(name + '.scale',
                        np.ones([n_labels, shape[1]], dtype='float32'))
    offset = tf.nn.embedding_lookup(offset_m, labels)
    scale = tf.nn.embedding_lookup(scale_m, labels)
    result = tf.nn.batch_normalization(inputs, mean, var,
                                       offset[:, :, None, None],
                                       scale[:, :, None, None], 1e-5)
    return result
def Conv2D(name, input, depth, n_filters, kernel, stride, **kwargs):
    with tf.name_scope(name) as scope:
        filter_values = weight_initializer(
            kwargs.get('init', 'GlorotUniform'),
            (kernel, kernel, depth, n_filters),
            gain=kwargs.get('activation', 'relu'))
        filters = lib.param(name + '.W', filter_values)

        if _WEIGHTNORM:
            norm_values = np.sqrt(np.sum(np.square(filter_values), axis=(0, 1, 2)))
            target_norms = lib.param(name + '.g', norm_values)
            with tf.name_scope('weightnorm'):
                norms = tf.sqrt(tf.reduce_sum(tf.square(filters),
                                              reduction_indices=[0, 1, 2]))
                filters = filters * (target_norms / norms)

        out = tf.nn.conv2d(input, filters, strides=[1, 1, stride, stride],
                           padding=kwargs.get('padding', 'SAME'),
                           data_format='NCHW')

        if kwargs.get('bias', True):
            b = lib.param(name + '.b',
                          weight_initializer('Constant', n_filters, val=0.))
            out = tf.nn.bias_add(out, b, data_format='NCHW')

        if kwargs.get('batchnorm', False):
            # Note: when training, the moving_mean and moving_variance need to
            # be updated. By default the update ops are placed in
            # tf.GraphKeys.UPDATE_OPS, so they need to be added as a dependency
            # to the train_op. For example:
            #   update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
            #   with tf.control_dependencies(update_ops):
            #       train_op = optimizer.minimize(loss)
            out = tf.layers.batch_normalization(
                inputs=out, axis=1,
                training=kwargs.get('training_mode', True))
        return out
def Conv3D(name, filter_len, input_dim, output_dim, filter_size, inputs,
           he_init=True, stride=1, stride_len=1, biases=True):
    """
    inputs: tensor of shape (N, L, H, W, C)
    returns: tensor of shape (N, L, H, W, C)
    """
    with tf.name_scope(name) as scope:
        def uniform(stdev, size):
            return np.random.uniform(low=-stdev * np.sqrt(3),
                                     high=stdev * np.sqrt(3),
                                     size=size).astype('float32')

        fan_in = input_dim * filter_size**2 * filter_len
        fan_out = output_dim * filter_size**2 / (stride**2) * filter_len / stride_len

        if he_init:
            filters_stdev = np.sqrt(4. / (fan_in + fan_out))
        else:  # Normalized init (Glorot & Bengio)
            filters_stdev = np.sqrt(2. / (fan_in + fan_out))

        filter_values = uniform(
            filters_stdev,
            (filter_len, filter_size, filter_size, input_dim, output_dim))
        filters = lib.param(name + '.Filters', filter_values)
        result = tf.nn.conv3d(input=inputs, filter=filters,
                              strides=[1, stride_len, stride, stride, 1],
                              padding='SAME', data_format='NDHWC')

        if biases:
            _biases = lib.param(
                name + '.Biases',
                np.zeros((1, 1, 1, 1, output_dim), dtype='float32'))
            result = tf.add(result, _biases)
        return result
def Linear(name, inputs, input_dim, output_dim, **kwargs):
    with tf.name_scope(name):
        weight_values = weight_initializer(
            kwargs.get('init', 'HeNormal'),
            (input_dim, output_dim),
            gain=kwargs.get('activation', 'linear'))
        weight = lib.param(name + '.W', weight_values)

        if _WEIGHTNORM:
            norm_values = np.sqrt(np.sum(np.square(weight_values), axis=0))
            # norm_values = np.linalg.norm(weight_values, axis=0)
            target_norms = lib.param(name + '.g', norm_values)
            with tf.name_scope('weightnorm'):
                norms = tf.sqrt(tf.reduce_sum(tf.square(weight),
                                              reduction_indices=[0]))
                weight = weight * (target_norms / norms)

        if inputs.get_shape().ndims == 2:
            result = tf.matmul(inputs, weight)
        else:
            reshaped_inputs = tf.reshape(inputs, [-1, input_dim])
            result = tf.matmul(reshaped_inputs, weight)
            result = tf.reshape(
                result,
                tf.stack(tf.unstack(tf.shape(inputs))[:-1] + [output_dim]))

        if kwargs.get('bias', True):
            b = lib.param(name + '.b',
                          weight_initializer('Constant', output_dim, val=0.))
            result = tf.nn.bias_add(result, b)

        if kwargs.get('batchnorm', False):
            result = tf.layers.batch_normalization(
                inputs=result, axis=-1,
                training=kwargs.get('training_mode', True))
        return result
def Layernorm(name, norm_axes, inputs):
    mean, var = tf.nn.moments(inputs, norm_axes, keep_dims=True)
    n_neurons = inputs.get_shape().as_list()[norm_axes[0]]
    offset = lib.param(name + '.offset', np.zeros(n_neurons, dtype='float32'))
    scale = lib.param(name + '.scale', np.ones(n_neurons, dtype='float32'))
    offset = tf.reshape(offset, [-1] + [1 for i in range(len(norm_axes) - 1)])
    scale = tf.reshape(scale, [-1] + [1 for i in range(len(norm_axes) - 1)])
    result = tf.nn.batch_normalization(inputs, mean, var, offset, scale, 1e-5)
    return result
def AttentionRNN(type, name, ctx, ctx_mask, input_dim, hidden_dim, ctx_dim,
                 inputs=None, state0=None, mask=None, n_layers=1,
                 position_gap=1., return_cell_state=False, closed_loop=False,
                 seq_len=1000):
    size = 2 * hidden_dim if type == 'LSTM' else hidden_dim
    if closed_loop:
        batch_size = tf.shape(ctx)[0]
    else:
        batch_size, seq_len, _ = tf.unstack(tf.shape(inputs))

    if state0 is None:
        h0 = tf.tile(lib.param(
            name + '.h0',
            weight_initializer('Constant', (1, n_layers * size), val=0.)
        ), [batch_size, 1])
        w0 = tf.tile(lib.param(
            name + '.w0',
            weight_initializer('Constant', (1, ctx_dim), val=0.)
        ), [batch_size, 1])
        k0 = tf.tile(lib.param(
            name + '.k0',
            weight_initializer('Constant', (1, 1), val=0.)
        ), [batch_size, 1])
        if closed_loop:
            # Hard-coded feature dims used by the closed-loop (sampling) mode.
            x0 = tf.zeros((batch_size, 34), dtype=tf.float32)
            state0 = (h0, k0, w0, x0)
            inputs = tf.zeros((batch_size, seq_len, 20), dtype=tf.float32)
        else:
            state0 = (h0, k0, w0)

    if mask is None:
        sequence_length = tf.tile(tf.expand_dims(seq_len, 0), [batch_size])
    else:
        sequence_length = tf.reduce_sum(mask, axis=-1)

    states, _ = tf.nn.dynamic_rnn(
        AttentionRNNCell(type, name, ctx, ctx_mask, input_dim, hidden_dim,
                         ctx_dim, n_layers, position_gap, closed_loop),
        inputs,
        sequence_length=sequence_length,
        initial_state=state0)
    return states
def spectral_norm(w, name, iteration=1):
    w_shape = w.shape.as_list()
    w = tf.reshape(w, [-1, w_shape[-1]])

    u = lib.param(
        name.replace('Discriminator.', 'D.')
            .replace('Generator.', 'G.')
            .replace('Classifier.', 'C.') + ".u",
        np.random.normal(size=(1, w_shape[-1])).astype('float32'),
        trainable=False)

    u_hat = tf.identity(u)
    v_hat = None
    # Power iteration; usually iteration = 1 is enough.
    for i in range(iteration):
        v_ = tf.matmul(u_hat, tf.transpose(w))
        v_hat = l2_norm(v_)
        u_ = tf.matmul(v_hat, w)
        u_hat = l2_norm(u_)

    u_final = tf.stop_gradient(tf.identity(u_hat))
    v_final = tf.stop_gradient(tf.identity(v_hat))

    sigma = tf.matmul(tf.matmul(v_final, w), tf.transpose(u_final))

    assign_u = tf.compat.v1.assign(u, u_final)
    with tf.control_dependencies([assign_u]):
        sigma = tf.identity(sigma)

    w_norm = tf.identity(w / sigma)
    w_norm = tf.reshape(w_norm, w_shape)
    return w_norm
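# Usage sketch for spectral_norm (illustrative names): normalize a weight
# before using it so the layer's spectral norm is approximately 1. The
# `filter_values` initializer and layer name are assumptions:
#
#     filters = lib.param('Discriminator.Conv1.Filters', filter_values)
#     filters_sn = spectral_norm(filters, 'Discriminator.Conv1.Filters')
#     out = tf.nn.conv2d(x, filters_sn, strides=[1, 1, 1, 1], padding='SAME')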
def HyperGenerator(hyper_k, hyper_noise):
    com_mu = lib.param(
        'Generator.Hyper.Mu',
        np.random.normal(size=(N_COMS, DIM_LATENT)).astype('float32'))
    # Shift the noise by the selected mixture component's mean.
    noise = tf.add(tf.matmul(tf.cast(hyper_k, tf.float32), com_mu), hyper_noise)
    return noise
def Layernorm(name, norm_axes, inputs):
    mean, var = tf.nn.moments(inputs, norm_axes, keep_dims=True)

    # Assume the 'neurons' axis is the first of norm_axes. This is the case for
    # fully-connected and BCHW conv layers.
    n_neurons = inputs.get_shape().as_list()[norm_axes[0]]

    offset = lib.param(name + '.offset', np.zeros(n_neurons, dtype='float32'))
    scale = lib.param(name + '.scale', np.ones(n_neurons, dtype='float32'))

    # Add broadcasting dims to offset and scale (e.g. BCHW conv data)
    offset = tf.reshape(offset, [-1] + [1 for i in range(len(norm_axes) - 1)])
    scale = tf.reshape(scale, [-1] + [1 for i in range(len(norm_axes) - 1)])

    result = tf.nn.batch_normalization(inputs, mean, var, offset, scale, 1e-5)
    return result
def HyperExtractor(latent_z):
    com_mu = lib.param(
        'Generator.Hyper.Mu',
        np.random.normal(size=(N_COMS, DIM_LATENT)).astype('float32'))
    # Log-posterior over mixture components (up to a constant).
    com_logits = -.5 * tf.reduce_sum(
        tf.pow(tf.expand_dims(latent_z, axis=1) - tf.expand_dims(com_mu, axis=0), 2),
        axis=-1) + tf.expand_dims(tf.log(PI), axis=0)

    if MODE_K == 'REINFORCE':
        k = tf.one_hot(indices=tf.argmax(com_logits, axis=-1), depth=N_COMS)
    elif MODE_K == 'CONCRETE':
        k = tf.nn.softmax((com_logits + sample_gumbel(tf.shape(com_logits))) / TEMP)
    elif MODE_K == 'STRAIGHT_THROUGHT_CONCRETE':
        k = tf.nn.softmax((com_logits + sample_gumbel(tf.shape(com_logits))) / TEMP)
        k_hard = tf.one_hot(indices=tf.argmax(k, axis=-1), depth=N_COMS)
        # Straight-through estimator: forward pass uses k_hard, gradients flow to k.
        k = tf.stop_gradient(k_hard - k) + k
    elif MODE_K == 'STRAIGHT_THROUGHT':
        k_hard = tf.one_hot(indices=tf.argmax(com_logits, axis=-1), depth=N_COMS)
        k = tf.stop_gradient(k_hard - com_logits) + com_logits
    return com_logits, k
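# `sample_gumbel` is referenced above but not defined in this file. A standard
# implementation (an assumption, following the usual Gumbel-softmax / Concrete
# trick of Jang et al. 2016 and Maddison et al. 2016) would be:
def sample_gumbel(shape, eps=1e-20):
    # Sample Gumbel(0, 1) noise by inverse-transforming uniform noise.
    u = tf.random_uniform(shape, minval=0., maxval=1.)
    return -tf.log(-tf.log(u + eps) + eps)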
def GRU(name, n_in, n_hid, inputs, h0=None):
    if h0 is None:
        batch_size = tf.shape(inputs)[0]
        h0 = lib.param(name + '.h0', np.zeros(n_hid, dtype='float32'))
        # h0 = tf.reshape(tf.tile(h0, tf.pack([batch_size])), tf.pack([batch_size, n_hid]))
        h0 = tf.reshape(tf.tile(h0, tf.stack([batch_size])),
                        tf.stack([batch_size, n_hid]))
    return tf.nn.dynamic_rnn(GRUCell(name, n_in, n_hid), inputs,
                             initial_state=h0, swap_memory=True)[0]

# Legacy implementation, kept for reference:
#
# class GRUCell(tf.nn.rnn_cell.RNNCell):
#     def __init__(self, name, n_in, n_hid):
#         self._n_in = n_in
#         self._n_hid = n_hid
#         self._name = name
#
#     @property
#     def state_size(self):
#         return self._n_hid
#
#     @property
#     def output_size(self):
#         return self._n_hid
#
#     def __call__(self, processed_inputs, state, scope=None):
#         # pi_update, pi_reset, pi_candidate = tf.split(1, 3, processed_inputs)
#         gates = tf.nn.sigmoid(
#             lib.ops.linear.Linear(
#                 self._name + '.Gates_R',
#                 self._n_hid,
#                 2 * self._n_hid,
#                 state,
#                 biases=False
#             ) + processed_inputs[:, :2*self._n_hid]
#         )
#         update, reset = tf.split(1, 2, gates)
#         scaled_state = reset * state
#         candidate = tf.tanh(
#             lib.ops.linear.Linear(
#                 self._name + '.Candidate_R',
#                 self._n_hid,
#                 self._n_hid,
#                 scaled_state
#                 # tf.concat(1, [inputs, scaled_state])
#             ) + processed_inputs[:, 2*self._n_hid:]
#         )
#         output = (update * candidate) + ((1 - update) * state)
#         return output, output
#
# def GRU(name, n_in, n_hid, inputs):
#     processed_inputs = lib.ops.linear.Linear(name + '.Inputs', n_in, 3*n_hid, inputs)
#     h0 = lib.param(name + '.h0', np.zeros(n_hid, dtype='float32'))
#     batch_size = tf.shape(inputs)[0]
#     h0 = tf.reshape(tf.tile(h0, tf.pack([batch_size])), tf.pack([batch_size, n_hid]))
#     return tf.nn.dynamic_rnn(GRUCell(name, n_in, n_hid), processed_inputs, initial_state=h0, swap_memory=True)[0]
def Embedding(name, n_symbols, emb_dim, indices):
    with tf.name_scope(name):
        emb = lib.param(
            name,
            weight_initializer('Normal', [n_symbols, emb_dim],
                               std=1.0 / np.sqrt(n_symbols)))
        return tf.nn.embedding_lookup(emb, indices)
def RNN(name, n_in, n_hid, inputs):
    h0 = lib.param(name + '.h0', np.zeros(n_hid, dtype='float32'))
    batch_size = tf.shape(inputs)[0]
    h0 = tf.reshape(tf.tile(h0, tf.stack([batch_size])),
                    tf.stack([batch_size, n_hid]))
    return tf.nn.dynamic_rnn(RNNCell(name, n_in, n_hid), inputs,
                             initial_state=h0, swap_memory=True)[0]
def Embedding(name, inputs, vocab_size, hidden_size, embed=None):
    """
    inputs: integer indices of shape (...); values in [0, vocab_size)
    """
    with tf.name_scope(name):
        embed_values = np.random.uniform(
            size=(vocab_size, hidden_size)).astype('float32')
        if embed is None:
            embed = lib.param(name, embed_values)
        return tf.nn.embedding_lookup(embed, inputs)
def RNN(type, name, inputs, input_dim, hidden_dim, h0=None, mask=None,
        n_layers=1, bidirectional=False, return_cell_state=False):
    '''
    inputs: (BATCH_SIZE, N_STEPS, INPUT_DIM)
    h0: (N_DIRECTIONS, N_LAYERS * HIDDEN_DIM)
    outputs: (BATCH_SIZE, N_STEPS, N_DIRECTIONS, N_LAYERS, HIDDEN_DIM)
    '''
    size = 2 * hidden_dim if type == 'LSTM' else hidden_dim
    batch_size, seq_len, _ = tf.unstack(tf.shape(inputs))
    n_dir = 2 if bidirectional else 1

    if h0 is None:
        h0 = tf.tile(lib.param(
            name + '.h0',
            weight_initializer('Constant', (1, n_dir, n_layers * size), val=0.)
        ), [batch_size, 1, 1])

    if mask is None:
        sequence_length = tf.tile(tf.expand_dims(seq_len, 0), [batch_size])
    else:
        sequence_length = tf.reduce_sum(mask, axis=-1)

    if bidirectional:
        states, _ = tf.nn.bidirectional_dynamic_rnn(
            RNNCell(type, name + '.Forward', input_dim, hidden_dim, n_layers),
            RNNCell(type, name + '.Backward', input_dim, hidden_dim, n_layers),
            inputs,
            sequence_length=sequence_length,
            initial_state_fw=h0[:, 0],
            initial_state_bw=h0[:, 1])
        states = tf.stack(states, axis=2)
    else:
        states, _ = tf.nn.dynamic_rnn(
            RNNCell(type, name, input_dim, hidden_dim, n_layers),
            inputs,
            sequence_length=sequence_length,
            initial_state=h0[:, 0])
        states = tf.expand_dims(states, axis=2)

    states = tf.stack(tf.split(states, n_layers, axis=-1), axis=3)
    if return_cell_state:
        return states
    else:
        return states[:, :, :, :, :hidden_dim]
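# Usage sketch for this RNN wrapper (illustrative shapes; assumes the custom
# RNNCell class used above). It returns every layer and direction stacked, so
# selecting the top layer of a 2-layer bidirectional LSTM looks like:
#
#     x = tf.placeholder(tf.float32, [None, None, 80])
#     states = RNN('LSTM', 'enc', x, 80, 256, n_layers=2, bidirectional=True)
#     top = states[:, :, :, -1, :]   # (batch, time, 2 directions, hidden_dim)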
def Batchnorm(name, axes, inputs, labels=None, n_labels=None):
    """conditional batchnorm (Dumoulin et al. 2016) for BCHW conv filtermaps"""
    if axes != [0, 2, 3]:
        raise Exception('unsupported')
    mean, var = tf.nn.moments(inputs, axes, keep_dims=True)
    shape = mean.get_shape().as_list()  # shape is [1,n,1,1]
    offset_m = lib.param(name + '.offset',
                         np.zeros([n_labels, shape[1]], dtype='float32'))
    scale_m = lib.param(name + '.scale',
                        np.ones([n_labels, shape[1]], dtype='float32'))
    offset = tf.nn.embedding_lookup(offset_m, labels)
    scale = tf.nn.embedding_lookup(scale_m, labels)
    result = tf.nn.batch_normalization(inputs, mean, var,
                                       offset[:, :, None, None],
                                       scale[:, :, None, None], 1e-5)
    return result
def Batchnorm(name, axes, inputs):
    if axes == [0, 2, 3]:
        # Move to NHWC, normalize over batch/height/width, move back to NCHW.
        inputs = tf.transpose(inputs, [0, 2, 3, 1])
        mean, var = tf.nn.moments(inputs, [0, 1, 2], keep_dims=False)
        offset = lib.param(name + '.offset',
                           np.zeros(mean.get_shape()[-1], dtype='float32'))
        scale = lib.param(name + '.scale',
                          np.ones(var.get_shape()[-1], dtype='float32'))
        result = tf.nn.batch_normalization(inputs, mean, var, offset, scale, 1e-4)
        return tf.transpose(result, [0, 3, 1, 2])
    else:
        mean, var = tf.nn.moments(inputs, axes, keep_dims=True)
        offset = lib.param(name + '.offset',
                           np.zeros(mean.get_shape(), dtype='float32'))
        scale = lib.param(name + '.scale',
                          np.ones(var.get_shape(), dtype='float32'))
        result = tf.nn.batch_normalization(inputs, mean, var, offset, scale, 1e-4)
        # lib.debug.print_stats(name, result)
        return result
def Conv2D(name, input_dim, output_dim, filter_size, inputs, he_init=True,
           mask_type=None, stride=1, weightnorm=None, biases=True, gain=1.):
    """
    inputs: tensor of shape (batch size, num channels, height, width)
    mask_type: one of None, ('a', n_channels), ('b', n_channels)
    returns: tensor of shape (batch size, num channels, height, width)
    """
    with tf.name_scope(name) as scope:
        if mask_type is not None:
            mask_type, mask_n_channels = mask_type
            mask = np.ones(
                (filter_size, filter_size, input_dim, output_dim),
                dtype='float32')
            center = filter_size // 2

            # Mask out future locations
            # filter shape is (height, width, input channels, output channels)
            mask[center + 1:, :, :, :] = 0.
            mask[center, center + 1:, :, :] = 0.

            # Mask out future channels
            for i in range(mask_n_channels):
                for j in range(mask_n_channels):
                    if (mask_type == 'a' and i >= j) or (mask_type == 'b' and i > j):
                        mask[center, center, i::mask_n_channels, j::mask_n_channels] = 0.

        def uniform(stdev, size):
            return np.random.uniform(
                low=-stdev * np.sqrt(3),
                high=stdev * np.sqrt(3),
                size=size
            ).astype('float32')

        fan_in = input_dim * filter_size**2
        fan_out = output_dim * filter_size**2 / (stride**2)

        if mask_type is not None:  # only approximately correct
            fan_in /= 2.
            fan_out /= 2.

        if he_init:
            filters_stdev = np.sqrt(4. / (fan_in + fan_out))
        else:  # Normalized init (Glorot & Bengio)
            filters_stdev = np.sqrt(2. / (fan_in + fan_out))

        if _weights_stdev is not None:
            filter_values = uniform(
                _weights_stdev,
                (filter_size, filter_size, input_dim, output_dim))
        else:
            filter_values = uniform(
                filters_stdev,
                (filter_size, filter_size, input_dim, output_dim))

        filter_values *= gain
        filters = lib.param(name + '.Filters', filter_values)

        if weightnorm is None:
            weightnorm = _default_weightnorm
        if weightnorm:
            norm_values = np.sqrt(np.sum(np.square(filter_values), axis=(0, 1, 2)))
            target_norms = lib.param(name + '.g', norm_values)
            with tf.name_scope('weightnorm') as scope:
                norms = tf.sqrt(tf.reduce_sum(tf.square(filters),
                                              reduction_indices=[0, 1, 2]))
                filters = filters * (target_norms / norms)

        if mask_type is not None:
            with tf.name_scope('filter_mask'):
                filters = filters * mask

        result = tf.nn.conv2d(
            input=inputs,
            filter=filters,
            strides=[1, 1, stride, stride],
            padding='SAME',
            data_format='NCHW')

        if biases:
            _biases = lib.param(name + '.Biases',
                                np.zeros(output_dim, dtype='float32'))
            result = tf.nn.bias_add(result, _biases, data_format='NCHW')

        return result
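# Usage sketch for masked Conv2D (illustrative shapes), PixelCNN-style: mask
# type 'a' for the first layer (excludes the current pixel), 'b' thereafter:
#
#     x = tf.placeholder(tf.float32, [None, 3, 32, 32])    # NCHW
#     h = Conv2D('pixelcnn.1', 3, 128, 7, x, mask_type=('a', 3))
#     h = Conv2D('pixelcnn.2', 128, 128, 3, h, mask_type=('b', 3))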
def SeparableConv2D(name, input_dim, output_dim, filter_size, inputs,
                    he_init=True, stride=1, weightnorm=None, biases=True,
                    gain=1., mask_type=None):
    """
    inputs: tensor of shape (batch size, num channels, height, width)
    returns: tensor of shape (batch size, num channels, height, width)
    """
    if mask_type is not None:
        raise Exception('unsupported')
    with tf.name_scope(name) as scope:
        def uniform(stdev, size):
            return np.random.uniform(
                low=-stdev * np.sqrt(3),
                high=stdev * np.sqrt(3),
                size=size
            ).astype('float32')

        spatial_fan_in = filter_size**2
        spatial_fan_out = filter_size**2 / (stride**2)
        pointwise_fan_in = input_dim
        pointwise_fan_out = output_dim

        if he_init:
            spatial_filters_stdev = np.sqrt(4. / (spatial_fan_in + spatial_fan_out))
        else:  # Normalized init (Glorot & Bengio)
            spatial_filters_stdev = np.sqrt(2. / (spatial_fan_in + spatial_fan_out))
        pointwise_filters_stdev = np.sqrt(2. / (pointwise_fan_in + pointwise_fan_out))

        spatial_filter_values = uniform(
            spatial_filters_stdev,
            (filter_size, filter_size, input_dim, 1))
        pointwise_filter_values = uniform(
            pointwise_filters_stdev,
            (1, 1, input_dim, output_dim))

        spatial_filter_values *= gain

        spatial_filters = lib.param(name + '.SpatialFilters',
                                    spatial_filter_values)
        pointwise_filters = lib.param(name + '.PointwiseFilters',
                                      pointwise_filter_values)

        if weightnorm is None:
            weightnorm = _default_weightnorm
        if weightnorm:
            spatial_norm_values = np.sqrt(
                np.sum(np.square(spatial_filter_values), axis=(0, 1)))
            spatial_target_norms = lib.param(name + '.gSpatial',
                                             spatial_norm_values)
            pointwise_norm_values = np.sqrt(
                np.sum(np.square(pointwise_filter_values), axis=(0, 1, 2)))
            pointwise_target_norms = lib.param(name + '.gPointwise',
                                               pointwise_norm_values)
            with tf.name_scope('weightnorm') as scope:
                spatial_norms = tf.sqrt(tf.reduce_sum(
                    tf.square(spatial_filters), reduction_indices=[0, 1]))
                spatial_filters = spatial_filters * \
                    (spatial_target_norms / spatial_norms)
                pointwise_norms = tf.sqrt(tf.reduce_sum(
                    tf.square(pointwise_filters), reduction_indices=[0, 1, 2]))
                pointwise_filters = pointwise_filters * \
                    (pointwise_target_norms / pointwise_norms)

        result = tf.transpose(inputs, [0, 2, 3, 1])
        result = tf.nn.separable_conv2d(
            input=result,
            depthwise_filter=spatial_filters,
            pointwise_filter=pointwise_filters,
            strides=[1, stride, stride, 1],
            padding='SAME')
        if biases:
            _biases = lib.param(name + '.Biases',
                                np.zeros(output_dim, dtype='float32'))
            result = tf.nn.bias_add(result, _biases)
        result = tf.transpose(result, [0, 3, 1, 2])
        # lib.debug.print_stats(name, result)
        return result
def Deconv2D(name, input_dim, output_dim, filter_size, inputs, he_init=True,
             weightnorm=None, biases=True, gain=1., mask_type=None):
    """
    inputs: tensor of shape (batch size, input_dim, height, width)
    returns: tensor of shape (batch size, output_dim, 2*height, 2*width)
    """
    with tf.name_scope(name) as scope:
        if mask_type is not None:
            raise Exception('Unsupported configuration')

        def uniform(stdev, size):
            return np.random.uniform(
                low=-stdev * np.sqrt(3),
                high=stdev * np.sqrt(3),
                size=size
            ).astype('float32')

        stride = 2
        fan_in = input_dim * filter_size**2 / (stride**2)
        fan_out = output_dim * filter_size**2

        if he_init:
            filters_stdev = np.sqrt(4. / (fan_in + fan_out))
        else:  # Normalized init (Glorot & Bengio)
            filters_stdev = np.sqrt(2. / (fan_in + fan_out))

        if _weights_stdev is not None:
            filter_values = uniform(
                _weights_stdev,
                (filter_size, filter_size, output_dim, input_dim))
        else:
            filter_values = uniform(
                filters_stdev,
                (filter_size, filter_size, output_dim, input_dim))

        filter_values *= gain
        filters = lib.param(name + '.Filters', filter_values)

        if weightnorm is None:
            weightnorm = _default_weightnorm
        if weightnorm:
            norm_values = np.sqrt(np.sum(np.square(filter_values), axis=(0, 1, 3)))
            target_norms = lib.param(name + '.g', norm_values)
            with tf.name_scope('weightnorm') as scope:
                norms = tf.sqrt(tf.reduce_sum(tf.square(filters),
                                              reduction_indices=[0, 1, 3]))
                filters = filters * tf.expand_dims(target_norms / norms, 1)

        inputs = tf.transpose(inputs, [0, 2, 3, 1], name='NCHW_to_NHWC')

        input_shape = tf.shape(inputs)
        try:  # tf pre-1.0 (top) vs 1.0 (bottom)
            output_shape = tf.pack([input_shape[0], 2 * input_shape[1],
                                    2 * input_shape[2], output_dim])
        except Exception as e:
            output_shape = tf.stack([input_shape[0], 2 * input_shape[1],
                                     2 * input_shape[2], output_dim])

        result = tf.nn.conv2d_transpose(
            value=inputs,
            filter=filters,
            output_shape=output_shape,
            strides=[1, 2, 2, 1],
            padding='SAME')

        if biases:
            _biases = lib.param(name + '.Biases',
                                np.zeros(output_dim, dtype='float32'))
            result = tf.nn.bias_add(result, _biases)

        result = tf.transpose(result, [0, 3, 1, 2], name='NHWC_to_NCHW')
        return result
def Linear(name, input_dim, output_dim, inputs, biases=True,
           initialization=None, weightnorm=None, gain=1.):
    """
    initialization: None, `lecun`, `glorot`, `he`, `glorot_he`, `orthogonal`,
    `("uniform", range)`
    """
    with tf.name_scope(name) as scope:
        def uniform(stdev, size):
            return np.random.uniform(
                low=-stdev * np.sqrt(3),
                high=stdev * np.sqrt(3),
                size=size
            ).astype('float32')

        if initialization == 'lecun' or \
                (initialization is None and input_dim != output_dim):
            weight_values = uniform(np.sqrt(1. / input_dim),
                                    (input_dim, output_dim))
        elif initialization == 'glorot':
            weight_values = uniform(np.sqrt(2. / (input_dim + output_dim)),
                                    (input_dim, output_dim))
        elif initialization == 'he':
            weight_values = uniform(np.sqrt(2. / input_dim),
                                    (input_dim, output_dim))
        elif initialization == 'glorot_he':
            weight_values = uniform(np.sqrt(4. / (input_dim + output_dim)),
                                    (input_dim, output_dim))
        elif initialization == 'orthogonal' or \
                (initialization is None and input_dim == output_dim):
            # From lasagne
            def sample(shape):
                if len(shape) < 2:
                    raise RuntimeError("Only shapes of length 2 or more are "
                                       "supported.")
                flat_shape = (shape[0], np.prod(shape[1:]))
                # TODO: why normal and not uniform?
                a = np.random.normal(0.0, 1.0, flat_shape)
                u, _, v = np.linalg.svd(a, full_matrices=False)
                # pick the one with the correct shape
                q = u if u.shape == flat_shape else v
                q = q.reshape(shape)
                return q.astype('float32')
            weight_values = sample((input_dim, output_dim))
        elif initialization[0] == 'uniform':
            weight_values = np.random.uniform(
                low=-initialization[1],
                high=initialization[1],
                size=(input_dim, output_dim)
            ).astype('float32')
        else:
            raise Exception('Invalid initialization!')

        weight_values *= gain
        weight = lib.param(name + '.W', weight_values)

        if weightnorm is None:
            weightnorm = _default_weightnorm
        if weightnorm:
            norm_values = np.sqrt(np.sum(np.square(weight_values), axis=0))
            # norm_values = np.linalg.norm(weight_values, axis=0)
            target_norms = lib.param(name + '.g', norm_values)
            with tf.name_scope('weightnorm') as scope:
                norms = tf.sqrt(tf.reduce_sum(tf.square(weight),
                                              reduction_indices=[0]))
                weight = weight * (target_norms / norms)

        result = tf.matmul(inputs, weight)

        if biases:
            result = tf.nn.bias_add(
                result,
                lib.param(name + '.b', np.zeros((output_dim,), dtype='float32')))
        return result
def Embedding(name, vocab_size, dim, indices):
    embeddings = lib.param(
        name + '.EmbeddingMatrix',
        np.random.normal(size=(vocab_size, dim)).astype('float32'))
    return tf.gather(embeddings, indices)
def Batchnorm(name, axes, inputs, is_training=None, stats_iter=None,
              update_moving_stats=True, fused=True):
    if ((axes == [0, 2, 3]) or (axes == [0, 2])) and fused == True:
        if axes == [0, 2]:
            inputs = tf.expand_dims(inputs, 3)

        # Old (working but pretty slow) implementation:
        ##########
        # inputs = tf.transpose(inputs, [0,2,3,1])
        # mean, var = tf.nn.moments(inputs, [0,1,2], keep_dims=False)
        # offset = lib.param(name+'.offset', np.zeros(mean.get_shape()[-1], dtype='float32'))
        # scale = lib.param(name+'.scale', np.ones(var.get_shape()[-1], dtype='float32'))
        # result = tf.nn.batch_normalization(inputs, mean, var, offset, scale, 1e-4)
        # return tf.transpose(result, [0,3,1,2])

        # New (super fast but untested) implementation:
        offset = lib.param(name + '.offset',
                           np.zeros(inputs.get_shape()[1], dtype='float32'))
        scale = lib.param(name + '.scale',
                          np.ones(inputs.get_shape()[1], dtype='float32'))

        moving_mean = lib.param(name + '.moving_mean',
                                np.zeros(inputs.get_shape()[1], dtype='float32'),
                                trainable=False)
        moving_variance = lib.param(name + '.moving_variance',
                                    np.ones(inputs.get_shape()[1], dtype='float32'),
                                    trainable=False)

        def _fused_batch_norm_training():
            return tf.nn.fused_batch_norm(inputs, scale, offset, epsilon=1e-5,
                                          data_format='NCHW')

        def _fused_batch_norm_inference():
            # Version which blends in the current item's statistics
            batch_size = tf.cast(tf.shape(inputs)[0], 'float32')
            mean, var = tf.nn.moments(inputs, [2, 3], keep_dims=True)
            mean = ((1. / batch_size) * mean) + \
                   (((batch_size - 1.) / batch_size) * moving_mean)[None, :, None, None]
            var = ((1. / batch_size) * var) + \
                  (((batch_size - 1.) / batch_size) * moving_variance)[None, :, None, None]
            return tf.nn.batch_normalization(inputs, mean, var,
                                             offset[None, :, None, None],
                                             scale[None, :, None, None],
                                             1e-5), mean, var

            # Standard version:
            # return tf.nn.fused_batch_norm(
            #     inputs,
            #     scale,
            #     offset,
            #     epsilon=1e-2,
            #     mean=moving_mean,
            #     variance=moving_variance,
            #     is_training=False,
            #     data_format='NCHW'
            # )

        if is_training is None:
            outputs, batch_mean, batch_var = _fused_batch_norm_training()
        else:
            outputs, batch_mean, batch_var = tf.cond(is_training,
                                                     _fused_batch_norm_training,
                                                     _fused_batch_norm_inference)
            if update_moving_stats:
                no_updates = lambda: outputs

                def _force_updates():
                    """Internal function forces updates moving_vars if is_training."""
                    float_stats_iter = tf.cast(stats_iter, tf.float32)
                    update_moving_mean = tf.assign(
                        moving_mean,
                        ((float_stats_iter / (float_stats_iter + 1)) * moving_mean) +
                        ((1 / (float_stats_iter + 1)) * batch_mean))
                    update_moving_variance = tf.assign(
                        moving_variance,
                        ((float_stats_iter / (float_stats_iter + 1)) * moving_variance) +
                        ((1 / (float_stats_iter + 1)) * batch_var))
                    with tf.control_dependencies([update_moving_mean,
                                                  update_moving_variance]):
                        return tf.identity(outputs)

                outputs = tf.cond(is_training, _force_updates, no_updates)

        if axes == [0, 2]:
            return outputs[:, :, :, 0]  # collapse last dim
        else:
            return outputs
    else:
        # raise Exception('old BN')
        # TODO we can probably use nn.fused_batch_norm here too for speedup
        mean, var = tf.nn.moments(inputs, axes, keep_dims=True)
        shape = mean.get_shape().as_list()
        if 0 not in axes:
            print("WARNING ({}): didn't find 0 in axes, but not using separate "
                  "BN params for each item in batch".format(name))
            shape[0] = 1
        offset = lib.param(name + '.offset', np.zeros(shape, dtype='float32'))
        scale = lib.param(name + '.scale', np.ones(shape, dtype='float32'))
        result = tf.nn.batch_normalization(inputs, mean, var, offset, scale, 1e-5)
        return result
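# Usage sketch for the fused Batchnorm (illustrative names): `is_training`
# gates the train/inference branch and `stats_iter` drives the running-average
# updates of the moving statistics:
#
#     is_training = tf.placeholder(tf.bool, [])
#     stats_iter = tf.Variable(0, trainable=False)
#     h = Batchnorm('D.BN1', [0, 2, 3], h,
#                   is_training=is_training, stats_iter=stats_iter)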