def _resnet_block_mode2(x, hidden_units, dropouts, cardinality=1, dense_shortcut=False, training=False, seed=0):
    """Sum a shortcut branch with `cardinality` residual branches.

    # Arguments
        x: input tensor.
        hidden_units: sequence of exactly three layer widths; only the last
            one is used here (as the width of the dense shortcut), the whole
            sequence is forwarded to `_resnet_branch_mode2`.
        dropouts: sequence of exactly three dropout rates, forwarded
            unchanged to `_resnet_branch_mode2`.
        cardinality: number of residual branches added to the shortcut.
        dense_shortcut: when true the shortcut is a `Dense` projection of
            `x`; otherwise `x` itself is the shortcut.
        training: forwarded to every residual branch.
        seed: seed of the shortcut kernel initializer; also forwarded to
            every residual branch.

    # Returns
        `tf.add_n` over the shortcut and all residual branches.
    """
    # Unpacking enforces the three-element contract even for values unused here.
    _, _, shortcut_units = hidden_units
    _, _, _ = dropouts

    # branch 0: the shortcut
    if dense_shortcut:
        shortcut = tf.layers.Dense(
            shortcut_units,
            kernel_initializer=tf.glorot_uniform_initializer(seed * 1),
            dtype=tf.float32,
            bias_initializer=tf.zeros_initializer())(x)
    else:
        shortcut = x

    # branch 1 ~ cardinality
    # NOTE(review): every branch receives the same `seed` -- confirm that
    # identical seeding across branches is intended.
    branches = [
        _resnet_branch_mode2(x, hidden_units, dropouts, training, seed)
        for _ in range(cardinality)
    ]
    return tf.add_n([shortcut] + branches)
def inference(self):
    """Resolve the string-valued settings on `self` into TensorFlow objects.

    Reads `regularizer_rate`, `initializer`, `activation_func`, `loss_func`,
    `optim` and `lr` from the instance and overwrites `regularizer`,
    `initializer`, `activation_func`, `loss_func` and `optim` in place.

    `initializer` always ends up as an initializer object (any unrecognised
    value falls back to Glorot uniform). The other three settings are left
    untouched when their current value is not one of the recognised names.
    """
    self.regularizer = tf.contrib.layers.l2_regularizer(self.regularizer_rate)

    if self.initializer == 'Normal':
        self.initializer = tf.truncated_normal_initializer(stddev=0.01)
    elif self.initializer == 'Xavier_Normal':
        # xavier_initializer() defaults to uniform=True, which would make this
        # branch identical to the Glorot-uniform fallback below; request the
        # normal variant explicitly.
        self.initializer = tf.contrib.layers.xavier_initializer(uniform=False)
    else:
        self.initializer = tf.glorot_uniform_initializer()

    if self.activation_func == 'ReLU':
        self.activation_func = tf.nn.relu
    elif self.activation_func == 'Leaky_ReLU':
        self.activation_func = tf.nn.leaky_relu
    elif self.activation_func == 'ELU':
        self.activation_func = tf.nn.elu

    if self.loss_func == 'cross_entropy':
        self.loss_func = tf.nn.sigmoid_cross_entropy_with_logits

    if self.optim == 'SGD':
        self.optim = tf.train.GradientDescentOptimizer(self.lr, name='SGD')
    elif self.optim == 'RMSProp':
        self.optim = tf.train.RMSPropOptimizer(
            self.lr, decay=0.9, momentum=0.0, name='RMSProp')
    elif self.optim == 'Adam':
        self.optim = tf.train.AdamOptimizer(self.lr, name='Adam')
def attention(x, feature_dim, sequence_length, mask_zero=False, maxlen=None, epsilon=1e-8, seed=0):
    """Compute normalised attention weights over the steps of `x`.

    Each step is scored by a single-unit tanh `Dense` layer, the scores are
    exponentiated, optionally masked, and normalised along axis 1.

    :param x: input tensor; assumed to be (batch, step, feature) -- the code
        takes `tf.shape(x)[1]` as the step count and reshapes to
        `[-1, feature_dim]`.
    :param feature_dim: size of the last dimension of `x`, supplied
        explicitly (see the note in the body).
    :param sequence_length: lengths passed to `tf.sequence_mask` when
        `mask_zero` is true.
    :param mask_zero: whether to zero the weights of masked steps.
    :param maxlen: `maxlen` argument of `tf.sequence_mask`.
    :param epsilon: added to the normaliser to avoid division by ~0.
    :param seed: seed of the scoring layer's kernel initializer.
    :return: weights reshaped to `[-1, step_dim, 1]`.
    """
    step_dim = tf.shape(x)[1]

    # `tf.layers.Dense` requires the last input dimension to be statically
    # defined ("The last dimension of the inputs to `Dense` should be defined.
    # Found `None`."), so `feature_dim` cannot be read from tf.shape(x)[2].
    # See: https://github.com/tensorflow/tensorflow/issues/13348
    # Workaround: the caller passes `feature_dim` explicitly.
    flat = tf.reshape(x, [-1, feature_dim])

    scorer = tf.layers.Dense(
        1,
        activation=tf.nn.tanh,
        kernel_initializer=tf.glorot_uniform_initializer(seed=seed),
        dtype=tf.float32,
        bias_initializer=tf.zeros_initializer())
    scores = tf.reshape(scorer(flat), [-1, step_dim])
    weights = tf.exp(scores)

    # The mask is applied after the exp; the weights are re-normalised next.
    if mask_zero:
        # None * step_dim
        step_mask = tf.cast(tf.sequence_mask(sequence_length, maxlen), tf.float32)
        weights = weights * step_mask

    # Early in training the sum may be almost zero, hence the epsilon.
    normaliser = tf.cast(
        tf.reduce_sum(weights, axis=1, keep_dims=True) + epsilon, tf.float32)
    weights = weights / normaliser
    return tf.expand_dims(weights, axis=-1)
def __call__(self, shape, dtype, partition_info=None):
    """Produce an initial value and pass it through the parameter encoding.

    When no base initializer is configured, the value is Glorot-uniform for
    floating dtypes and zeros otherwise (mimicking the default of
    `tf.get_variable()`). The value is encoded with zero noise and cast back
    to `dtype`.
    """
    base = self._base_initializer
    if base is not None:
        initial = base(shape, dtype, partition_info=partition_info)
    elif dtype.is_floating:
        initial = tf.glorot_uniform_initializer()(shape, dtype)
    else:
        initial = tf.zeros(shape, dtype)

    # No random noise is injected by the initializer.
    encoded = self._parameter_encoding.encode(initial, 0.0)
    return tf.cast(encoded, dtype)
def _resnet_branch_mode1(x, hidden_units, dropouts, training, seed=0):
    """Build one residual branch: (Dense -> BN -> ReLU -> Dropout) x 2, then Dense -> BN.

    :param x: input tensor.
    :param hidden_units: exactly three layer widths `(h1, h2, h3)`.
    :param dropouts: exactly three dropout rates; only the first two are
        applied (the third layer has no dropout).
    :param training: forwarded to the `Dropout` layers.
    :param seed: base seed; initializers use `seed * 2/3/4`, dropouts use
        `seed * 1/2`.
    :return: output of the last batch-normalisation layer.
    """
    h1, h2, h3 = hidden_units
    dr1, dr2, _ = dropouts

    def dense_bn(inputs, units, seed_factor):
        projected = tf.layers.Dense(
            units,
            kernel_initializer=tf.glorot_uniform_initializer(seed=seed * seed_factor),
            dtype=tf.float32,
            bias_initializer=tf.zeros_initializer())(inputs)
        # NOTE(review): `training` is not passed to BatchNormalization, so the
        # layer is called with its default mode -- confirm this is intended.
        return tf.layers.BatchNormalization()(projected)

    def relu_dropout(inputs, rate, seed_factor):
        activated = tf.nn.relu(inputs)
        if rate > 0:
            activated = tf.layers.Dropout(rate, seed=seed * seed_factor)(
                activated, training=training)
        return activated

    # branch 2
    branch = relu_dropout(dense_bn(x, h1, 2), dr1, 1)
    branch = relu_dropout(dense_bn(branch, h2, 3), dr2, 2)
    return dense_bn(branch, h3, 4)
def _conv2d(self, x, name, filter_size, in_channels, out_channels, strides):
    """Apply a square 'SAME'-padded 2-D convolution followed by a bias add.

    Variables `W` (Glorot-uniform) and `b` (default constant initializer) are
    created under the variable scope `name`. The same `strides` value is used
    for both spatial dimensions.
    """
    kernel_shape = [filter_size, filter_size, in_channels, out_channels]
    stride_spec = [1, strides, strides, 1]

    with tf.variable_scope(name):
        # tf.glorot_normal_initializer would be the normal-distributed alternative.
        weights = tf.get_variable(
            name='W',
            shape=kernel_shape,
            dtype=tf.float32,
            initializer=tf.glorot_uniform_initializer())
        bias = tf.get_variable(
            name='b',
            shape=[out_channels],
            dtype=tf.float32,
            initializer=tf.constant_initializer())
        convolved = tf.nn.conv2d(x, weights, stride_spec, padding='SAME')
        return tf.nn.bias_add(convolved, bias)
def _dense_block_mode2(x, hidden_units, dropouts, densenet=False, training=False, seed=0, bn=False, name="dense_block"): """ :param x: :param hidden_units: :param dropouts: :param densenet: enable densenet :return: Ref: https://github.com/titu1994/DenseNet """ for i, (h, d) in enumerate(zip(hidden_units, dropouts)): if bn: z = batch_normalization(x, training=training, name=name + "-" + str(i)) z = tf.nn.relu(z) z = tf.layers.Dropout(d, seed=seed * i)(z, training=training) if d > 0 else z z = tf.layers.Dense(h, kernel_initializer=tf.glorot_uniform_initializer(seed=seed * i), dtype=tf.float32, bias_initializer=tf.zeros_initializer())(z) if densenet: x = tf.concat([x, z], axis=-1) else: x = z return x
def rectified_conv2d(X,name,filter_shape,output_channel,
                    stride,padding_type,is_training,dropout_rate=0.0,
                    apply_batchnorm=True,weight_decay=None,apply_relu=True,
                    initializer=tf.glorot_uniform_initializer()):
    '''
    DESCRIPTION:
        Apply a "rectified convolution" block to the input:
            conv2d --> batch_norm (or bias add) --> relu + dropout (optional)
        When batch norm is disabled a zero-initialized bias is added instead.
        Dropout is only applied together with the relu.
    USAGE:
        INPUT:
            X               : the input 'image' to this layer. A 4D tensor of
                                shape [batch,input_height,input_width,input_channel]
            name            : the name of this convolution layer, used as the
                                variable scope grouping its components.
            filter_shape    : a tuple of form (filter_height,filter_width)
            output_channel  : the total number of output channels in the
                                feature 'image/activation' of this layer
            stride          : a tuple giving (stride_height,stride_width)
            padding_type    : string, either 'SAME' or 'VALID'
            is_training     : (used with batchnorm and dropout) a boolean to
                                specify whether we are in training or
                                inference mode.
            dropout_rate    : the fraction of the final activation to drop.
                                It acts as a regularizer.
            apply_batchnorm : a boolean to specify whether to use batch norm.
                                Defaults to True.
            weight_decay    : the l2-regularization hyperparameter for the
                                filter weights. Default: no regularization.
            apply_relu      : set to False to skip the relu (and dropout) so
                                another activation can be applied by the caller.
            initializer     : the initializer for the filter variables
        OUTPUT:
            A               : the output feature 'image' of this layer
        RAISES:
            AssertionError  : if padding_type is neither 'SAME' nor 'VALID'.
    '''
    #Validating the arguments before touching the graph, so that a bad call
    #does not leave a half-built scope/variable behind.
    if padding_type not in ('SAME','VALID'):
        raise AssertionError('Please use SAME/VALID string for padding')
    fh,fw=filter_shape
    sh,sw=stride

    with tf.variable_scope(name):
        #Creating the filter weights
        input_channel=X.get_shape().as_list()[3]
        net_filter_shape=(fh,fw,input_channel,output_channel)
        filters=get_variable_on_cpu('W',net_filter_shape,initializer,weight_decay)

        #Now applying the convolution
        net_stride=(1,sh,sw,1)
        Z_conv=tf.nn.conv2d(X,filters,net_stride,padding_type,name='conv2d')

        if apply_batchnorm:
            Z=_batch_normalization2d(Z_conv,is_training)
        else:
            #Bias is only created when batch norm is not used
            net_bias_shape=(1,1,1,output_channel)
            bias_initializer=tf.zeros_initializer()
            biases=get_variable_on_cpu('b',net_bias_shape,bias_initializer)
            Z=tf.add(Z_conv,biases,name='bias_add')

        #Finally applying the 'relu' activation
        if apply_relu:
            with tf.variable_scope('rl_dp'):
                A=tf.nn.relu(Z,name='relu')
                #Adding the dropout to the last layer
                A=tf.layers.dropout(A,rate=dropout_rate,training=is_training,
                                    name='dropout')
        else:
            A=Z #when we want to apply another activation outside in model.

    return A
def convolutional_residual_block(X,name,num_channels,
                            first_filter_stride,mid_filter_shape,is_training,
                            dropout_rate=0.0,apply_batchnorm=True,weight_decay=None,
                            initializer=tf.glorot_uniform_initializer()):
    '''
    DESCRIPTION:
        Residual block whose main branch changes the spatial shape (through
        the stride of its first 1x1 convolution), so the shortcut branch also
        uses a strided 1x1 convolution to match shape and channel count.
        Main branch : 1x1 conv -> mid_filter_shape conv -> 1x1 conv (no relu)
        Shortcut    : 1x1 conv (no relu)
        Merge       : add -> relu -> dropout
    USAGE:
        INPUT:
            num_channels        : output channels of the three main-branch
                                    convolutions; the last one is also used
                                    for the shortcut.
            first_filter_stride : (sh,sw) stride of the first main-branch
                                    filter and of the shortcut filter
            The remaining arguments are forwarded to rectified_conv2d.
        OUTPUT:
            A   : the final output/feature map of this residual block
    '''
    #Arguments shared by all four convolutions of the block
    shared=dict(is_training=is_training,
                apply_batchnorm=apply_batchnorm,
                weight_decay=weight_decay,
                initializer=initializer)

    with tf.variable_scope(name):
        #Main branch: first one-one convolution (carries the stride)
        main=rectified_conv2d(X,name='branch_2a',
                              filter_shape=(1,1),
                              output_channel=num_channels[0],
                              stride=first_filter_stride,
                              padding_type="VALID",
                              dropout_rate=dropout_rate,
                              **shared)

        #Main branch: filtering in the mid sub-layer
        main=rectified_conv2d(main,name='branch_2b',
                              filter_shape=mid_filter_shape,
                              output_channel=num_channels[1],
                              stride=(1,1),
                              padding_type="SAME",
                              dropout_rate=dropout_rate,
                              **shared)

        #Main branch: last one-one convolution. Its channel count need not
        #match the input. No relu here since the addition comes first.
        main=rectified_conv2d(main,name='branch_2c',
                              filter_shape=(1,1),
                              output_channel=num_channels[2],
                              stride=(1,1),
                              padding_type="VALID",
                              dropout_rate=0.0,
                              apply_relu=False,
                              **shared)

        #Shortcut branch: brought to the same shape and number of channels
        shortcut=rectified_conv2d(X,name='branch_1',
                              filter_shape=(1,1),
                              output_channel=num_channels[2],
                              stride=first_filter_stride,
                              padding_type="VALID",
                              dropout_rate=0.0,
                              apply_relu=False,
                              **shared)

        #Merging the two branches element wise, then relu and dropout
        with tf.variable_scope('skip_conn'):
            merged=tf.add(main,shortcut)
            activated=tf.nn.relu(merged,name='relu')
            return tf.layers.dropout(activated,rate=dropout_rate,
                                     training=is_training,name='dropout')
def esmm_model_fn(features, labels, mode, params):
    """Estimator `model_fn` with a CTR tower, a CVR tower and a CTCVR head.

    Args:
        features: feature dict passed by the Estimator.
        labels: dict with the keys 'ctr' and 'ctcvr' (unused in PREDICT mode).
        mode: a `tf.estimator.ModeKeys` value.
        params: dict; this function reads 'model' ('linear' or 'dnn'),
            'weight_columns', 'hidden_units', 'learning_rate' and, depending
            on the model, 'linear_columns' or 'ctr_reg' / 'cvr_reg'.

    Returns:
        A `tf.estimator.EstimatorSpec`. In PREDICT mode the 'prob' output is
        the concatenation `[ctcvr_preds, ctr_preds]` along axis 1.

    Raises:
        ValueError: if `params['model']` is neither 'linear' nor 'dnn'.
    """
    model_type = params['model']
    if model_type not in ('linear', 'dnn'):
        # Previously an unknown model type surfaced later as an unbound
        # `ctr_logits`; fail early with an explicit message instead.
        raise ValueError(
            "params['model'] must be 'linear' or 'dnn', got %r" % (model_type,))

    batch_weight = tf.feature_column.input_layer(features, params['weight_columns'])
    inputs, _ = build_input(features, params)
    hidden_units = params['hidden_units']
    linear_parent_scope = 'linear'
    dnn_parent_scope = 'dnn'

    if model_type == 'linear':
        with tf.variable_scope(linear_parent_scope,
                               values=tuple(six.itervalues(features)),
                               reuse=tf.AUTO_REUSE):
            with tf.variable_scope('linear_ctr'):
                ctr_logit_fn = linear._linear_logit_fn_builder(1, params['linear_columns'])
                ctr_logits = ctr_logit_fn(features=features)
            with tf.variable_scope('linear_cvr'):
                cvr_logit_fn = linear._linear_logit_fn_builder(1, params['linear_columns'])
                cvr_logits = cvr_logit_fn(features=features)

    if model_type == 'dnn':
        with tf.variable_scope(dnn_parent_scope):
            # Layer shared by both towers; its kernel is what grad_norm
            # receives below.
            common_inputs = tf.layers.dense(
                inputs=inputs,
                units=256,
                activation=tf.nn.relu,
                use_bias=True,
                kernel_initializer=tf.glorot_uniform_initializer(),
                name="SharedLayer")
            with tf.variable_scope('dnn_ctr'):
                ctr_logits = build_deep_layers(common_inputs, hidden_units, mode, params['ctr_reg'])
            with tf.variable_scope('dnn_cvr'):
                cvr_logits = build_deep_layers(common_inputs, hidden_units, mode, params['cvr_reg'])

    ctr_preds = tf.nn.sigmoid(ctr_logits)
    cvr_preds = tf.nn.sigmoid(cvr_logits)
    # The CTCVR loss does not back-propagate into the CTR tower.
    ctcvr_preds = tf.stop_gradient(ctr_preds) * cvr_preds

    tf.summary.histogram("esmm/ctr_preds", ctr_preds)
    tf.summary.histogram("esmm/ctcvr_preds", ctcvr_preds)

    if mode == tf.estimator.ModeKeys.PREDICT:
        predictions = {
            'prob': tf.concat([ctcvr_preds, ctr_preds], 1)
        }
        export_outputs = {
            # Required for online serving.
            tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY:
                tf.estimator.export.PredictOutput(predictions)
        }
        return tf.estimator.EstimatorSpec(mode, predictions=predictions, export_outputs=export_outputs)

    # NOTE(review): 'SharedLayer' is only created for the 'dnn' model, so with
    # params['model'] == 'linear' this lookup presumably returns an empty list
    # and `[0]` raises IndexError -- confirm whether linear training is supported.
    shared_weights = tf.trainable_variables(dnn_parent_scope + '/SharedLayer/kernel')[0]

    ctr_labels = labels['ctr']
    ctcvr_labels = labels['ctcvr']

    linear_optimizer = tf.train.FtrlOptimizer(
        0.01, l1_regularization_strength=0.01, l2_regularization_strength=0.001)
    dnn_optimizer = optimizers.get_optimizer_instance('Adam', params['learning_rate'])
    loss_optimizer = optimizers.get_optimizer_instance('Adam', 0.001)

    ctr_auc = tf.metrics.auc(labels=ctr_labels, predictions=ctr_preds, weights=batch_weight)
    ctcvr_auc = tf.metrics.auc(labels=ctcvr_labels, predictions=ctcvr_preds, weights=batch_weight)
    ctr_precision, ctr_precision_update_op = tf.metrics.precision(
        labels=ctr_labels, predictions=ctr_preds, weights=batch_weight)
    ctr_recall, ctr_recall_update_op = tf.metrics.recall(
        labels=ctr_labels, predictions=ctr_preds, weights=batch_weight)

    ctr_loss = tf.losses.log_loss(
        ctr_labels, ctr_preds, weights=batch_weight,
        reduction=tf.losses.Reduction.SUM_OVER_BATCH_SIZE)
    ctcvr_loss = tf.losses.log_loss(
        ctcvr_labels, ctcvr_preds, weights=batch_weight,
        reduction=tf.losses.Reduction.SUM_OVER_BATCH_SIZE)
    reg_loss = tf.reduce_sum(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))

    # grad_norm is defined elsewhere; from its use here it returns the weighted
    # task losses (a list), an update op/list, the loss-weight variables and
    # the loss minimised w.r.t. those weights.
    weight_loss, update_list, w_list, loss_gradnorm = grad_norm(
        [ctr_loss, ctcvr_loss], shared_weights)
    loss = tf.add_n(weight_loss + [reg_loss])

    tf.summary.scalar("esmm/ctr_loss", tf.reduce_sum(ctr_loss))
    tf.summary.scalar("esmm/ctcvr_loss", tf.reduce_sum(ctcvr_loss))
    tf.summary.scalar("esmm/loss", tf.reduce_sum(loss))

    def _train_op_fn(loss):
        """Group the model, loss-weight and grad-norm updates, then bump the global step."""
        train_ops = []
        global_step = tf.train.get_global_step()
        # `x in ('dnn')` is a substring test on the string 'dnn', not tuple
        # membership; compare for equality instead.
        if model_type == 'dnn':
            var_list = (
                tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='fm')
                + tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=dnn_parent_scope))
            train_ops.append(dnn_optimizer.minimize(loss, var_list=var_list))
        if model_type == 'linear':
            train_ops.append(
                linear_optimizer.minimize(
                    loss,
                    var_list=tf.get_collection(
                        tf.GraphKeys.TRAINABLE_VARIABLES,
                        scope=linear_parent_scope)))
        train_ops.append(loss_optimizer.minimize(loss_gradnorm, var_list=w_list))
        train_ops.append(update_list)
        train_op = tf.group(*train_ops)
        with tf.control_dependencies([train_op]):
            return distribute_lib.increment_var(global_step)

    metrics = {
        'ctr_auc': ctr_auc,
        'ctcvr_auc': ctcvr_auc,
        'ctr_precision': (ctr_precision, ctr_precision_update_op),
        'ctr_recall': (ctr_recall, ctr_recall_update_op),
    }

    train_op = _train_op_fn(loss)
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    if update_ops:
        train_op = tf.group(train_op, *update_ops)
    return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op, eval_metric_ops=metrics)
def construct(self, args):
    """Build the GAN graph (generator, discriminator, losses, training ops, summaries).

    Reads `args.z_dim`, `args.recodex` and `args.logdir`, plus `self.HEIGHT`,
    `self.WIDTH` and `self.session`. Sets the placeholders `self.images`,
    `self.z` and `self.generated_image_data`, the tensors
    `self.generated_images`, `self.discriminator_loss` and
    `self.generator_loss`, the ops `self.discriminator_training` and
    `self.generator_training`, and the summary ops. Finally initializes all
    global variables and the summary writer in `self.session`.
    """
    self.z_dim = args.z_dim

    with self.session.graph.as_default():
        if args.recodex:
            tf.get_variable_scope().set_initializer(
                tf.glorot_uniform_initializer(seed=42))

        # Inputs
        self.images = tf.placeholder(tf.float32, [None, self.HEIGHT, self.WIDTH, 1])
        self.z = tf.placeholder(tf.float32, [None, self.z_dim])

        # Batch normalization is applied to bias-free layer outputs
        # (`use_bias=False`), the activation comes after it, and
        # `training=True` is used all the time (the saved moment estimates
        # are never used).
        def batch_norm_relu(tensor):
            return tf.nn.relu(tf.layers.batch_normalization(tensor, training=True))

        # Generator
        def generator(z):
            # - batch normalized dense layer with 1024 neurons and ReLU activation
            hidden = batch_norm_relu(
                tf.layers.dense(z, 1024, activation=None, use_bias=False))
            # - batch normalized dense layer with 7 * 7 * 64 neurons and ReLU activation
            hidden = batch_norm_relu(
                tf.layers.dense(hidden, 7 * 7 * 64, activation=None, use_bias=False))
            # - change shape to a batch of images with size 7 x 7 and 64 channels
            feature_maps = tf.reshape(hidden, [-1, 7, 7, 64])
            # - batch normalized conv2d_transpose with 32 output channels, kernel size 5,
            #   stride 2, "same" padding and ReLU activation
            upsampled = batch_norm_relu(
                tf.layers.conv2d_transpose(feature_maps,
                                           filters=32,
                                           kernel_size=(5, 5),
                                           strides=2,
                                           padding="same",
                                           activation=None,
                                           use_bias=False))
            # - (non-normalized) conv2d_transpose with 1 output channel, kernel size 5,
            #   stride 2, "same" padding and sigmoid activation
            return tf.layers.conv2d_transpose(upsampled,
                                              filters=1,
                                              kernel_size=(5, 5),
                                              strides=2,
                                              padding="same",
                                              activation=tf.nn.sigmoid)

        with tf.variable_scope("generator"):
            self.generated_images = generator(self.z)

        # Discriminator
        def discriminator(image):
            # - batch normalized conv2d with 32 output channels, kernel size 5,
            #   "same" padding and ReLU activation
            features = batch_norm_relu(
                tf.layers.conv2d(image,
                                 filters=32,
                                 kernel_size=(5, 5),
                                 padding="same",
                                 activation=None,
                                 use_bias=False))
            # - max pooling layer with kernel size 2 and stride 2
            features = tf.layers.max_pooling2d(features, pool_size=2, strides=2)
            # - batch normalized conv2d with 64 output channels, kernel size 5,
            #   "same" padding and ReLU activation
            features = batch_norm_relu(
                tf.layers.conv2d(features,
                                 filters=64,
                                 kernel_size=(5, 5),
                                 padding="same",
                                 activation=None,
                                 use_bias=False))
            # - max pooling layer with kernel size 2 and stride 2
            features = tf.layers.max_pooling2d(features, pool_size=2, strides=2)
            # - flattening layer
            flattened = tf.layers.flatten(features, name="flatten")
            # - batch normalized dense layer with 1024 neurons and ReLU activation
            hidden = batch_norm_relu(
                tf.layers.dense(flattened, 1024, activation=None, use_bias=False))
            # - (non-normalized) dense layer with 1 neuron without activation.
            logit = tf.layers.dense(hidden, 1, activation=None)
            # The output is the logit of whether the input image comes from
            # real data, reshaped from [batch_size, 1] to [batch_size].
            return tf.reshape(logit, [-1])

        with tf.variable_scope("discriminator"):
            discriminator_logit_real = discriminator(self.images)

        # Same variable scope as above: `reuse=True` makes the second call
        # share the variables created by the first one.
        with tf.variable_scope("discriminator", reuse=True):
            discriminator_logit_fake = discriminator(self.generated_images)

        # Losses
        # Discriminator: real images labelled 1.0, generated images labelled 0.0.
        self.discriminator_loss = tf.losses.sigmoid_cross_entropy(
            tf.ones(tf.shape(discriminator_logit_real)), discriminator_logit_real) \
            + tf.losses.sigmoid_cross_entropy(
                tf.zeros(tf.shape(discriminator_logit_fake)), discriminator_logit_fake)

        # Generator: generated images should be classified as real (1.0).
        self.generator_loss = tf.losses.sigmoid_cross_entropy(
            tf.ones(tf.shape(discriminator_logit_fake)), discriminator_logit_fake)

        # Training
        global_step = tf.train.create_global_step()

        # The discriminator step deliberately does *not* advance global_step.
        self.discriminator_training = tf.train.AdamOptimizer().minimize(
            self.discriminator_loss,
            var_list=tf.global_variables("discriminator"),
            name="training_d")

        # The generator step *does* advance global_step. Without it the step
        # stays at 0 and `record_summaries_every_n_global_steps` below never
        # sees the step change.
        self.generator_training = tf.train.AdamOptimizer().minimize(
            self.generator_loss,
            global_step=global_step,
            var_list=tf.global_variables("generator"),
            name="training_g")

        # Summaries
        discriminator_accuracy = tf.reduce_mean(
            tf.to_float(
                tf.concat([
                    tf.greater(discriminator_logit_real, 0),
                    tf.less(discriminator_logit_fake, 0)
                ],
                          axis=0)))
        summary_writer = tf.contrib.summary.create_file_writer(
            args.logdir, flush_millis=10 * 1000)
        with summary_writer.as_default(), \
                tf.contrib.summary.record_summaries_every_n_global_steps(100):
            self.discriminator_summary = [
                tf.contrib.summary.scalar("gan/discriminator_loss",
                                          self.discriminator_loss),
                tf.contrib.summary.scalar("gan/discriminator_accuracy",
                                          discriminator_accuracy)
            ]
            self.generator_summary = tf.contrib.summary.scalar(
                "gan/generator_loss", self.generator_loss)

        self.generated_image_data = tf.placeholder(tf.float32, [None, None, 1])
        with summary_writer.as_default(), \
                tf.contrib.summary.always_record_summaries():
            self.generated_image_summary = tf.contrib.summary.image(
                "gan/generated_image",
                tf.expand_dims(self.generated_image_data, axis=0))

        # Initialize variables
        self.session.run(tf.global_variables_initializer())
        with summary_writer.as_default():
            tf.contrib.summary.initialize(session=self.session,
                                          graph=self.session.graph)
def parameter_network(family, arch_dict, param_net_hps, param_net_input):
    """Instantiate and connect layers of the parameter network.

    The parameter network (parameterized by phi in EFN paper) maps natural
    parameters eta to the parameters of the density network theta.

    Args:
        family (obj): Instance of tf_util.families.Family.
        arch_dict (dict): Specifies structure of approximating density network.
            Read keys: 'flow_type', 'repeats', 'post_affine'.
        param_net_hps (dict): Parameter network hyperparameters. Read keys:
            'L' (number of hidden layers) and 'upl' (units per layer).
        param_net_input (tf.placeholder): Usually this is a (K X |eta|) tensor
            holding eta for each of the K distributions. Sometimes we provide
            hints for the parameter network in addition to eta, which are
            concatenated onto the end.

    Returns:
        theta (list of tf.Tensor): One readout per flow repeat, followed --
            when `arch_dict['post_affine']` is set -- by the element-wise
            multiplier and the shift readouts.
    """
    L_theta = param_net_hps["L"]
    upl_theta = param_net_hps["upl"]

    h = param_net_input
    for i in range(L_theta):
        with tf.variable_scope("ParamNetLayer%d" % (i + 1)):
            h = tf.layers.dense(h, upl_theta[i], activation=tf.nn.tanh)

    out_dim = h.shape[1]

    def affine_readout(prefix, num_params):
        """Return `h @ A + b` with fresh float64 variables `<prefix>_A` / `<prefix>_b`."""
        A = tf.get_variable(
            prefix + "_A",
            shape=(out_dim, num_params),
            dtype=tf.float64,
            initializer=tf.glorot_uniform_initializer(),
        )
        b = tf.get_variable(
            prefix + "_b",
            shape=(1, num_params),
            dtype=tf.float64,
            initializer=tf.glorot_uniform_initializer(),
        )
        return tf.matmul(h, A) + b

    theta = []
    flow_class = get_flow_class(arch_dict["flow_type"])
    num_params_i = get_num_flow_params(flow_class, family.D_Z)

    with tf.variable_scope("ParamNetReadout"):
        for i in range(arch_dict['repeats']):
            theta.append(affine_readout("layer%d" % (i + 1), num_params_i))

        if arch_dict['post_affine']:
            # The names used to be written as `"em_A" % (i + 1)` etc.; with no
            # format specifier that raises TypeError, so this branch could
            # never run. The variables are created once, under plain names.
            # elem mult
            theta.append(affine_readout("em", family.D_Z))
            # shift
            theta.append(affine_readout("s", family.D_Z))

    return theta
def __init__(self, inputLen, hiddenLayers=None):
    """Build a fully-connected binary classifier graph and start a session.

    Resets the default graph, then builds: dropout on the inputs, one
    ReLU + dropout layer per entry of `hiddenLayers`, and a single sigmoid
    output unit. The optimised cost is the log loss plus an L2 penalty
    `regularization / (2 * batchSize) * sum(w ** 2)` over all weight matrices.

    :param inputLen: number of input features.
    :param hiddenLayers: widths of the hidden layers; `None` (the default)
        means no hidden layer.
    """
    # A mutable default list would be shared by every instance through
    # `self.hiddenLayers`; create a fresh one per call instead.
    if hiddenLayers is None:
        hiddenLayers = []

    tf.reset_default_graph()
    self.winit = tf.glorot_uniform_initializer()
    self.binit = tf.constant_initializer(0.0)
    self.hiddenLayers = hiddenLayers

    self.inputs = tf.placeholder(tf.float64, [None, inputLen])
    self.labels = tf.placeholder(tf.float64, [None, 1])
    self.batchSize = tf.placeholder(tf.float64, [])
    self.learningRate = tf.placeholder_with_default(np.float64(0.01), [])
    self.regularization = tf.placeholder_with_default(np.float64(0.1), [])
    self.keepProb = tf.placeholder_with_default(np.float64(1.0), [])

    lastOutputs = tf.nn.dropout(self.inputs, self.keepProb)
    lastSize = inputLen
    weights = []

    for i, layerSize in enumerate(hiddenLayers):
        with tf.variable_scope("hidden_" + str(i), reuse=tf.AUTO_REUSE):
            w = tf.get_variable(name="w",
                                shape=[lastSize, layerSize],
                                dtype=tf.float64,
                                initializer=self.winit)
            b = tf.get_variable(name="b",
                                shape=[layerSize],
                                dtype=tf.float64,
                                initializer=self.binit)
            lastOutputs = tf.nn.relu(tf.matmul(lastOutputs, w) + b)
            lastOutputs = tf.nn.dropout(lastOutputs, self.keepProb)
        lastSize = layerSize
        weights.append(w)

    with tf.variable_scope("output", reuse=tf.AUTO_REUSE):
        w = tf.get_variable(name="w",
                            shape=[lastSize, 1],
                            dtype=tf.float64,
                            initializer=self.winit)
        b = tf.get_variable(name="b",
                            shape=[1],
                            dtype=tf.float64,
                            initializer=self.binit)
    weights.append(w)

    self.predictions = tf.sigmoid(tf.matmul(lastOutputs, w) + b)
    self.cost = tf.reduce_mean(
        tf.losses.log_loss(self.labels, self.predictions))
    self.squaredWeigth = tf.add_n(
        [tf.reduce_sum(tf.square(weight)) for weight in weights])
    # The cost is cast to float64 so it can be combined with the float64
    # regularisation term.
    self.ncost = tf.cast(self.cost, tf.float64) + (
        self.regularization / (self.batchSize * 2.0)) * self.squaredWeigth
    self.optim = tf.train.AdamOptimizer(self.learningRate).minimize(
        self.ncost)

    self.session = tf.Session()
    self.session.run(tf.global_variables_initializer())
    self.session.run(tf.local_variables_initializer())
# Load the space-delimited data files. np.loadtxt is given the path directly
# so that it opens *and closes* each file itself; the previous
# `np.loadtxt(open(...))` form left four file handles open.
y_var = np.loadtxt("y_var", delimiter=" ")
y_test = np.loadtxt("y_test", delimiter=" ")
l_data = np.loadtxt("l_data", delimiter=" ")
l_test = np.loadtxt("l_test", delimiter=" ")

# Graph inputs: 135 input features, 15 targets.
x = tf.placeholder(tf.float32, [None, 135])
y = tf.placeholder(tf.float32, [None, 15])
pp = tf.placeholder(tf.float32)
aa = tf.placeholder(tf.float32)
train = tf.placeholder(tf.bool)

# Weight initializer shared by all layers.
# Alternative that was tried: tf.keras.initializers.lecun_normal()
wwtf = tf.glorot_uniform_initializer()

# Layer widths nu, nu2 .. nu7 are defined earlier in the script.
W1 = tf.get_variable('W1', shape=[135, nu], initializer=wwtf)
W2 = tf.get_variable('W2', shape=[nu, nu2], initializer=wwtf)
W3 = tf.get_variable('W3', shape=[nu2, nu3], initializer=wwtf)
W4 = tf.get_variable('W4', shape=[nu3, nu4], initializer=wwtf)
W5 = tf.get_variable('W5', shape=[nu4, nu5], initializer=wwtf)
W6 = tf.get_variable('W6', shape=[nu5, nu6], initializer=wwtf)
W7 = tf.get_variable('W7', shape=[nu6, nu7], initializer=wwtf)

# Zero-initialized biases, one per layer.
b1 = tf.Variable(tf.zeros(shape=[nu]))
b2 = tf.Variable(tf.zeros(shape=[nu2]))
b3 = tf.Variable(tf.zeros(shape=[nu3]))
b4 = tf.Variable(tf.zeros(shape=[nu4]))
b5 = tf.Variable(tf.zeros(shape=[nu5]))
b6 = tf.Variable(tf.zeros(shape=[nu6]))
b7 = tf.Variable(tf.zeros(shape=[nu7]))
def logits_fn(logits_dim):
    """Project `last_layer` to `logits_dim` units with a seeded Glorot-uniform kernel.

    `last_layer` and `self` are taken from the enclosing scope.
    """
    seeded_init = tf.glorot_uniform_initializer(seed=self._seed)
    logits = tf.layers.dense(
        last_layer, units=logits_dim, kernel_initializer=seeded_init)
    return logits
def modulated_convolution2d(
        inputs,
        num_outputs,
        kernel_size,
        stride=1,
        padding='SAME',
        gamma=None,
        demodulation=True,
        epsilon=1e-8,
        data_format=None,
        rate=1,
        activation_fn=None,
        normalizer_fn=None,
        normalizer_params=None,
        weights_initializer=tf.glorot_uniform_initializer(),
        weights_regularizer=None,
        biases_initializer=tf.zeros_initializer(),
        biases_regularizer=None,
        reuse=None,
        trainable=True,
        scope=None):
    """2-D convolution whose kernel is passed through `weight_modulation`.

    Creates a `weights` variable, lets `weight_modulation` transform it using
    `gamma` / `demodulation` / `epsilon`, and convolves `inputs` with the
    result. When `gamma` is given, the batch is folded into the channel axis
    before the convolution and unfolded afterwards. Then either
    `normalizer_fn` is applied or, if there is none and `biases_initializer`
    is not None, a `biases` variable is added. `activation_fn` is applied
    last when given.

    Scalar `kernel_size`, `stride` and `rate` are repeated once per spatial
    dimension.

    Raises:
        ValueError: if `data_format` is neither None, channels-last (ends
            with 'C') nor channels-first (starts with 'NC').
    """
    # only for 2d convolution
    with tf.variable_scope(scope, 'modulated_convolution2d', reuse=reuse):
        conv_dims = inputs.shape.rank - 2

        def per_dim(value):
            # Sequences are used as given, scalars are repeated per dimension.
            if isinstance(value, (list, tuple)):
                return value
            return [value] * conv_dims

        kernel_size = per_dim(kernel_size)
        stride = per_dim(stride)
        rate = per_dim(rate)

        if data_format is None or data_format.endswith('C'):
            num_inputs = inputs.shape[-1]
        elif data_format.startswith('NC'):
            num_inputs = inputs.shape[1]
        else:
            raise ValueError('Invalid data_format')

        kernel_shape = list(kernel_size) + [num_inputs, num_outputs]
        weights = tf.get_variable('weights',
                                  shape=kernel_shape,
                                  initializer=weights_initializer,
                                  regularizer=weights_regularizer,
                                  trainable=trainable)
        weights = weight_modulation(weights,
                                    gamma=gamma,
                                    demodulation=demodulation,
                                    epsilon=epsilon,
                                    trainable=trainable)

        modulated = gamma is not None
        if modulated:
            batch_last = tf.transpose(inputs, [1, 2, 0, 3])  # (H, W, N, CI)
            inputs = tf.reshape(
                batch_last,
                [1, batch_last.shape[0], batch_last.shape[1], -1])  # (1, H, W, N * CI)

        outputs = tf.nn.convolution(input=inputs,
                                    filter=weights,
                                    dilation_rate=rate,
                                    strides=stride,
                                    padding=padding,
                                    data_format=data_format)

        if modulated:
            unfolded = tf.reshape(
                outputs,
                [outputs.shape[1], outputs.shape[2], -1, num_outputs])  # (H, W, N, CO)
            outputs = tf.transpose(unfolded, [2, 0, 1, 3])

        if normalizer_fn is not None:
            normalizer_params = normalizer_params or {}
            outputs = normalizer_fn(outputs, **normalizer_params)
        elif biases_initializer is not None:
            biases = tf.get_variable('biases',
                                     shape=[num_outputs],
                                     initializer=biases_initializer,
                                     regularizer=biases_regularizer,
                                     trainable=trainable)
            outputs = tf.nn.bias_add(outputs, biases, data_format=data_format)

        if activation_fn is not None:
            outputs = activation_fn(outputs)
        return outputs
def inference(input_tensor, n_classes, TRAIN_FLAG):
    """Build the conv -> LSTM -> conv/FCN graph and return both outputs.

    The input is reshaped into patches, passed through two 5x5 convolutions,
    cut into blocks that feed a multi-layer LSTM, and the LSTM outputs are
    turned back into a map that goes through conv3, a max-pool and two
    convolutional layers (fcn4, fcn5). Intermediate shapes are printed.

    Args:
        input_tensor: tensor reshaped to
            [-1, PATCH_SIZE, PATCH_SIZE, INCHANNEL] (values from `ifd_train`).
        n_classes: width of the patch-label logits.
        TRAIN_FLAG: passed as `trainable` to every variable created here.

    Returns:
        Tuple `(logits, fcn5)`: the patch-label logits computed from the last
        LSTM step, and the output of the `fcn5` layer.
    """
    batch_size = ifd_train.BATCH_SIZE
    patch = ifd_train.PATCH_SIZE
    images = tf.reshape(input_tensor,
                        [-1, patch, patch, ifd_train.INCHANNEL])
    print('inputshape: {}'.format(images.shape.as_list()))

    # Options shared by the three tf.layers convolutions below.
    conv_opts = dict(kernel_size=[5, 5],
                     trainable=TRAIN_FLAG,
                     padding='SAME',
                     activation=tf.nn.relu)

    with tf.variable_scope('conv1'):
        conv1 = tf.layers.conv2d(inputs=images, filters=16, name='conv1',
                                 **conv_opts)
        print('conv1 output shape: {}'.format(conv1.shape.as_list()))

    with tf.variable_scope('conv2'):
        conv2 = tf.layers.conv2d(inputs=conv1, filters=1, name='conv2',
                                 **conv_opts)
        print('conv2 output shape: {}'.format(conv2.shape.as_list()))

    with tf.variable_scope('lstm'):
        blocks = cut_to_blocks(conv2, ifd_train.BATCH_SIZE, patch,
                               ifd_train.BLOCK_SIZE)
        print('_fc2 shape: {}'.format(blocks.shape.as_list()))

        # Fully connected projection so every block has lstm_hidden_size features.
        proj_w = tf.Variable(
            tf.random_normal([lstm_block_size, lstm_hidden_size]),
            name='w_fc2', trainable=TRAIN_FLAG)
        proj_b = tf.Variable(tf.constant(0.1, shape=[lstm_hidden_size]),
                             name='b_fc2', trainable=TRAIN_FLAG)
        flat_blocks = tf.reshape(blocks, [-1, lstm_block_size])
        fc2 = tf.matmul(flat_blocks, proj_w) + proj_b
        fc2 = tf.reshape(fc2, [batch_size, lstm_time_step, lstm_hidden_size])
        print('fc2 shape: {}'.format(fc2.shape.as_list()))

        # Stacked LSTM starting from a zero state.
        cells = []
        for _ in range(lstm_layer_num):
            cells.append(lstm_cell(lstm_hidden_size, keep_prob))
        stacked_cell = rnn.MultiRNNCell(cells, state_is_tuple=True)
        zero_state = stacked_cell.zero_state(batch_size, dtype=tf.float32)
        lstm_ouput, state = tf.nn.dynamic_rnn(stacked_cell,
                                              inputs=fc2,
                                              initial_state=zero_state,
                                              time_major=False)
        print('lstm_ouput shape: {}'.format(lstm_ouput.shape.as_list()))

        # Time-major list: [time_step, [batch_size, output_each_cell]].
        time_major = tf.transpose(lstm_ouput, [1, 0, 2])
        lstm_ouput = tf.unstack(time_major)

        # Patch label from the last time step.
        label_w = tf.get_variable(name='W_label_out',
                                  shape=[lstm_hidden_size, n_classes],
                                  dtype=tf.float32,
                                  trainable=TRAIN_FLAG,
                                  initializer=tf.glorot_uniform_initializer())
        label_b = tf.get_variable(name='b_label_out',
                                  shape=[n_classes],
                                  dtype=tf.float32,
                                  trainable=TRAIN_FLAG,
                                  initializer=tf.constant_initializer())
        logits = tf.matmul(lstm_ouput[-1], label_w) + label_b
        print('logits output shape: {}'.format(logits.shape.as_list()))

        # Re-assemble the per-step outputs into a map.
        f_lstm = cat_to_map(lstm_ouput, ifd_train.BATCH_SIZE)
        print('f_lstm shape: {}'.format(f_lstm.shape.as_list()))

    with tf.variable_scope('conv3'):
        dim0, dim1, dim2 = f_lstm.shape[0], f_lstm.shape[1], f_lstm.shape[2]
        f_lstm = tf.reshape(f_lstm, [dim0, dim1, dim2, 1])
        conv3 = tf.layers.conv2d(inputs=f_lstm, filters=32, name='conv3',
                                 **conv_opts)
        print('conv3 output shape: {}'.format(conv3.shape.as_list()))

    with tf.variable_scope('pool1'):
        pool1 = tf.nn.max_pool(value=conv3,
                               ksize=[1, 2, 2, 1],
                               strides=[1, 2, 2, 1],
                               padding='SAME')
        print('pool1 output shape: {}'.format(pool1.shape.as_list()))

    unit_strides = [1, 1, 1, 1]

    with tf.variable_scope('fcn4'):
        w4 = tf.get_variable(name='w4',
                             shape=[64, 64, 32, 4096],
                             initializer=tf.glorot_uniform_initializer(),
                             dtype=tf.float32,
                             trainable=TRAIN_FLAG)
        b4 = tf.get_variable(name='b4',
                             shape=[4096],
                             trainable=TRAIN_FLAG,
                             initializer=tf.constant_initializer())
        fcn4 = tf.nn.conv2d(input=pool1, filter=w4, strides=unit_strides,
                            padding='SAME', name='fcn4')
        fcn4 = tf.nn.bias_add(fcn4, b4)
        print('fcn4 output shape: {}'.format(fcn4.shape.as_list()))

    with tf.variable_scope('fcn5'):
        w5 = tf.get_variable(name='w5',
                             shape=[64, 64, 4096, ifd_train.N_CLASSES],
                             initializer=tf.glorot_uniform_initializer(),
                             dtype=tf.float32,
                             trainable=TRAIN_FLAG)
        b5 = tf.get_variable(name='b5',
                             shape=[ifd_train.N_CLASSES],
                             trainable=TRAIN_FLAG,
                             initializer=tf.constant_initializer())
        fcn5 = tf.nn.conv2d(input=fcn4, filter=w5, strides=unit_strides,
                            padding='SAME', name='fcn5')
        fcn5 = tf.nn.bias_add(fcn5, b5)
        print('fcn5 output shape: {}'.format(fcn5.shape.as_list()))

    return logits, fcn5
def _build(self, convnet_pars):
    """Create the convolutional Q-network, its Huber loss and its train op.

    Three convolutions and a 512-unit dense layer feed a linear layer with
    one output per action. The loss compares the Q-value of the taken action
    with the `target_q` placeholder. Variables of this scope are initialized
    in `self._session` before returning.

    Args:
        convnet_pars: dict read for 'input_shape', 'output_shape' and
            'optimizer' (itself a dict with 'name', 'lr' and, for the
            RMSProp variants, 'decay' and 'epsilon').

    Raises:
        ValueError: if the optimizer name is not one of 'rmspropcentered',
            'rmsprop', 'adam' or 'adadelta'.
    """
    # (layer name, filters, kernel size, stride) of the convolutional stack.
    conv_specs = (
        ('hidden_1', 32, 8, 4),
        ('hidden_2', 64, 4, 2),
        ('hidden_3', 64, 3, 1),
    )

    with tf.variable_scope(None, default_name=self._name):
        self._scope_name = tf.get_default_graph().get_name_scope() + '/'

        input_shape = [None] + list(convnet_pars['input_shape'])
        self._x = tf.placeholder(tf.float32, shape=input_shape, name='input')

        # Inputs are divided by 255 before entering the network.
        net = self._x / 255.
        for layer_name, n_filters, kernel, stride in conv_specs:
            net = tf.layers.conv2d(
                net, n_filters, kernel, stride,
                activation=tf.nn.relu,
                kernel_initializer=tf.glorot_uniform_initializer(),
                bias_initializer=tf.glorot_uniform_initializer(),
                name=layer_name)

        flatten = tf.reshape(net, [-1, 7 * 7 * 64], name='flatten')
        self._features = tf.layers.dense(
            flatten, 512,
            activation=tf.nn.relu,
            kernel_initializer=tf.glorot_uniform_initializer(),
            bias_initializer=tf.glorot_uniform_initializer(),
            name='_features')

        n_actions = convnet_pars['output_shape'][0]
        self.q = tf.layers.dense(
            self._features, n_actions,
            kernel_initializer=tf.glorot_uniform_initializer(),
            bias_initializer=tf.glorot_uniform_initializer(),
            name='q')

        self._target_q = tf.placeholder('float32', [None], name='target_q')
        self._action = tf.placeholder('uint8', [None], name='action')

        # Select the Q-value of the action that was actually taken.
        action_one_hot = tf.one_hot(self._action, n_actions,
                                    name='action_one_hot')
        self._q_acted = tf.reduce_sum(self.q * action_one_hot,
                                      axis=1, name='q_acted')

        loss = tf.losses.huber_loss(self._target_q, self._q_acted)
        tf.summary.scalar('huber_loss', loss)
        tf.summary.scalar('average_q', tf.reduce_mean(self.q))
        scope_summaries = tf.get_collection(tf.GraphKeys.SUMMARIES,
                                            scope=self._scope_name)
        self._merged = tf.summary.merge(scope_summaries)

        opt_pars = convnet_pars['optimizer']
        opt_name = opt_pars['name']
        if opt_name == 'rmspropcentered' or opt_name == 'rmsprop':
            rms_kwargs = dict(learning_rate=opt_pars['lr'],
                              decay=opt_pars['decay'],
                              epsilon=opt_pars['epsilon'])
            if opt_name == 'rmspropcentered':
                rms_kwargs['centered'] = True
            opt = tf.train.RMSPropOptimizer(**rms_kwargs)
        elif opt_name == 'adam':
            opt = tf.train.AdamOptimizer(learning_rate=opt_pars['lr'])
        elif opt_name == 'adadelta':
            opt = tf.train.AdadeltaOptimizer(learning_rate=opt_pars['lr'])
        else:
            raise ValueError('Unavailable optimizer selected.')

        self._train_step = opt.minimize(loss=loss)

        scope_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                            scope=self._scope_name)
        initializer = tf.variables_initializer(scope_variables)

    self._session.run(initializer)

    if self._folder_name is not None:
        log_dir = self._folder_name + '/' + self._scope_name[:-1]
        self._train_writer = tf.summary.FileWriter(
            log_dir, graph=tf.get_default_graph())

    self._train_count = 0

    self._add_collection()
def deform_conv_2d(inputs,
                   num_outputs,
                   kernel_size=3,
                   stride=1,
                   dilate_rate=1,
                   deformable_group=1,
                   data_format='channels_first',
                   no_bias=True,
                   name=None):
    """Deformable 2-D convolution built on `deform_conv_op`.

    The offsets are predicted from `inputs` by a regular convolution and fed,
    together with a learned `kernel`, to `deform_conv_op`. All internal work
    is done in channels-first layout; channels-last inputs are transposed on
    the way in and the result is transposed back on the way out.

    Args:
        inputs: 4-D input tensor laid out according to `data_format`.
        num_outputs: number of output channels.
        kernel_size: side length of the square kernel.
        stride: stride used for both spatial dimensions.
        dilate_rate: dilation rate used for both spatial dimensions.
        deformable_group: number of deformable groups; sizes the offset map.
        data_format: 'channels_first' or 'channels_last'.
        no_bias: when False, a zero-initialized per-channel bias is added.
        name: optional variable scope name (default 'deform_conv').

    Returns:
        The output tensor, in the same layout as `inputs`.
    """
    channels_last = 'channels_last' == data_format

    with tf.variable_scope(name, 'deform_conv'):
        if channels_last:
            inputs = tf.transpose(inputs, [0, 3, 1, 2], name='trans')

        # Two offset coordinates per kernel tap and deformable group.
        offset = tf.layers.conv2d(inputs,
                                  2 * deformable_group * kernel_size**2,
                                  kernel_size,
                                  padding='SAME',
                                  dilation_rate=(dilate_rate, dilate_rate),
                                  strides=(stride, stride),
                                  data_format='channels_first')

        in_channels = inputs.get_shape().as_list()[1]
        kernel = tf.get_variable(
            name='kernel',
            shape=(num_outputs, in_channels, kernel_size, kernel_size),
            initializer=tf.glorot_uniform_initializer())

        res = deform_conv_op(inputs,
                             filter=kernel,
                             offset=offset,
                             rates=[1, 1, dilate_rate, dilate_rate],
                             padding='SAME',
                             strides=[1, 1, stride, stride],
                             num_groups=1,
                             deformable_group=deformable_group)

        if not no_bias:
            # The bias has channels-first shape (1, C, 1, 1), so it must be
            # added while `res` is still channels-first. Adding it after the
            # inverse transpose would broadcast it along the wrong axis.
            bias_var = tf.get_variable(name='bias',
                                       shape=(1, num_outputs, 1, 1),
                                       initializer=tf.zeros_initializer())
            res = res + bias_var

        if channels_last:
            res = tf.transpose(res, [0, 2, 3, 1], name='trans_inv')

        return res
import tensorflow as tf
import numpy as np
from utils import AugmentedObject
from metrics import EM_metrics, F1_metrics
from layers import WordEmbedding, CharEmbedding, Highway, ContextQueryAttention, EmbeddingEncoder, Layer

# Module-level weight initializer: a Glorot-uniform sample of the requested
# shape, scaled by 0.5.
initializer = lambda shape: 0.5 * tf.glorot_uniform_initializer()(shape)


def QANet_fn(features, labels, mode, params):
    """Read the hyperparameters and input tensors of the QANet model function.

    Args:
        features: mapping read for the keys "paragraph", "query", "par_char"
            and "query_char".
        labels: not used by the code visible here.
        mode: compared against `tf.estimator.ModeKeys.TRAIN` to set `training`.
        params: mapping read for "lr", "keep_prob", "embedding", "nb_char",
            "clip", "keep_layer", "norm_linear" and "norm_trainable".

    NOTE(review): the body visible in this file stops after `par_len`; no
    model is built and nothing is returned. Presumably the remainder of the
    function is missing -- confirm against the original source.
    """
    # Reset the scope registry kept on the Layer class.
    Layer.scope_manager = {}

    # Hyperparameters.
    lr = params["lr"]
    keep_prob = params["keep_prob"]
    embedding = params["embedding"]
    nb_char = params["nb_char"]
    clip = params["clip"]
    keep_layer = params["keep_layer"]
    norm_linear = params["norm_linear"]
    norm_trainable = params["norm_trainable"]

    training = (mode == tf.estimator.ModeKeys.TRAIN)

    # Word-level and character-level inputs.
    parags = features["paragraph"]
    queries = features["query"]
    parags_char = features["par_char"]
    queries_char = features["query_char"]

    # Hard-coded sizes.
    batch_size = 32
    char_emb_size = 200
    par_len = parags.shape[1]
def simple_fully_connected(X, name, output_dim, is_training, dropout_rate=0.0,
                           apply_batchnorm=True, weight_decay=None,
                           flatten_first=False, apply_relu=True,
                           initializer=tf.glorot_uniform_initializer()):
    """Fully connected layer: linear map, then batch norm or bias, then ReLU.

    The activation `X` of the previous layer is multiplied by a weight
    matrix. Depending on `apply_batchnorm` the result is either batch
    normalized or shifted by a bias. When `apply_relu` is set, ReLU followed
    by dropout is applied; otherwise the pre-activation is returned as is
    (dropout is not applied in that case).

    Args:
        X: activation of the previous layer / input layer.
        name: name of the layer, used as variable scope.
        output_dim: number of output units.
        is_training: training/inference switch used by batch norm and dropout.
        dropout_rate: fraction of activations dropped, between 0 and 1.
        apply_batchnorm: whether to apply batch normalization; when False a
            zero-initialized bias is added instead.
        weight_decay: forwarded to `get_variable_on_cpu` for the weight
            matrix (L2 regularization strength); the bias is not regularized.
        flatten_first: whether to first flatten the input to
            [batch_size, all_activations].
        apply_relu: whether to apply ReLU (and dropout) at the end.
        initializer: initializer used for the weight matrix.

    Returns:
        The activation of this layer.

    Raises:
        AssertionError: if the (optionally flattened) input is not rank 2.
    """
    with tf.variable_scope(name):
        # Flatten the input if requested.
        if flatten_first:
            X = tf.contrib.layers.flatten(X)

        # Validate the rank before indexing the shape, so that a rank-1
        # input fails with the intended message instead of an IndexError.
        x_shape = X.get_shape().as_list()
        if len(x_shape) != 2:
            raise AssertionError('The X should be of shape: (batch,all_nodes)')
        input_dim = x_shape[1]

        shape_W = (input_dim, output_dim)
        shape_b = (1, output_dim)
        W = get_variable_on_cpu('W', shape_W, initializer, weight_decay)

        # Linear transformation.
        Z = tf.matmul(X, W, name='linear_transform')

        if apply_batchnorm:
            with tf.variable_scope('batch_norm'):
                # The features live on axis 1.
                Z_tilda = tf.layers.batch_normalization(Z, axis=1,
                                                        training=is_training)
        else:
            # The bias is created without weight decay.
            bias_initializer = tf.zeros_initializer()
            b = get_variable_on_cpu('b', shape_b, bias_initializer)
            Z_tilda = tf.add(Z, b, name='bias_add')

        if apply_relu:
            with tf.variable_scope('rl_dp'):
                A = tf.nn.relu(Z_tilda, name='relu')
                # Dropout on the rectified activation.
                A = tf.layers.dropout(A, rate=dropout_rate,
                                      training=is_training, name='dropout')
        else:
            A = Z_tilda

    return A
def _wide_deep_combined_model_fn(features, labels, mode, head, model_type,
                                 with_cnn=False, cnn_optimizer='Adagrad',
                                 linear_feature_columns=None,
                                 linear_optimizer='Ftrl',
                                 dnn_feature_columns=None,
                                 dnn_optimizer='Adagrad',
                                 dnn_hidden_units=None,
                                 dnn_connected_mode=None,
                                 input_layer_partitioner=None, config=None):
    """Wide and Deep combined model_fn (DNN, CNN, Linear).

    Up to three logit branches are built -- DNN, linear and CNN -- and summed.
    Each branch is trained by its own optimizer on the variables of its own
    scope, with a learning rate that decays exponentially with the global
    step. The global step is incremented exactly once per training step.

    Args:
        features: dict of `Tensor`. When `with_cnn` is set, the 'image' entry
            is popped from this dict and fed to the CNN branch.
        labels: `Tensor` of shape [batch_size, 1] or [batch_size] labels of
            dtype `int32` or `int64` in the range `[0, n_classes)`.
        mode: Defines whether this is training, evaluation or prediction.
            See `ModeKeys`.
        head: A `Head` instance.
        model_type: one of `wide`, `deep`, `wide_deep`.
        with_cnn: Bool, set True to combine image input features using a CNN.
        cnn_optimizer: String, `Optimizer` object, or callable that defines
            the optimizer to use for training the CNN model.
        linear_feature_columns: An iterable containing all the feature
            columns used by the Linear model.
        linear_optimizer: String, `Optimizer` object, or callable that
            defines the optimizer to use for training the Linear model.
        dnn_feature_columns: An iterable containing all the feature columns
            used by the DNN model.
        dnn_optimizer: String, `Optimizer` object, or callable that defines
            the optimizer to use for training the DNN model.
        dnn_hidden_units: List of hidden units per DNN layer.
        dnn_connected_mode: List of connected mode.
        input_layer_partitioner: Partitioner for input layer.
        config: `RunConfig` object to configure the runtime settings.

    Returns:
        The estimator spec produced by `head.create_estimator_spec`.

    Raises:
        ValueError: If `features` is not a dict, if `with_cnn` is set but
            `features` has no 'image' entry, or if no logits branch is
            enabled.
    """
    if not isinstance(features, dict):
        raise ValueError('features should be a dictionary of `Tensor`s. '
                         'Given type: {}'.format(type(features)))
    if with_cnn:
        try:
            # Separate the image feature from the other input_fn features.
            cnn_features = features.pop('image')
        except KeyError:
            raise ValueError(
                'No input image features, must provide image features if use cnn.'
            )

    num_ps_replicas = config.num_ps_replicas if config else 0
    input_layer_partitioner = input_layer_partitioner or (
        tf.min_max_variable_partitioner(max_partitions=num_ps_replicas,
                                        min_slice_size=64 << 20))

    # Learning-rate schedules. They must follow the real training global
    # step: a freshly created counter variable is never incremented, so the
    # rates would stay at their initial values forever.
    global_step = tf.train.get_or_create_global_step()
    _LINEAR_LEARNING_RATE = tf.train.exponential_decay(
        _linear_init_learning_rate,
        global_step=global_step,
        decay_steps=decay_steps,
        decay_rate=_linear_decay_rate,
        staircase=False)
    _DNN_LEARNING_RATE = tf.train.exponential_decay(
        _dnn_init_learning_rate,
        global_step=global_step,
        decay_steps=decay_steps,
        decay_rate=_dnn_decay_rate,
        staircase=False)
    _CNN_LEARNING_RATE = tf.train.exponential_decay(
        _cnn_init_learning_rate,
        global_step=global_step,
        decay_steps=decay_steps,
        decay_rate=_cnn_decay_rate,
        staircase=False)

    # Build DNN Logits.
    dnn_parent_scope = 'dnn'
    if model_type == 'wide' or not dnn_feature_columns:
        dnn_logits = None
    else:
        dnn_optimizer = get_optimizer_instance(
            dnn_optimizer, learning_rate=_DNN_LEARNING_RATE)
        if model_type == 'wide_deep':
            check_no_sync_replicas_optimizer(dnn_optimizer)
        dnn_partitioner = tf.min_max_variable_partitioner(
            max_partitions=num_ps_replicas)
        with tf.variable_scope(dnn_parent_scope,
                               values=tuple(six.itervalues(features)),
                               partitioner=dnn_partitioner):
            dnn_logit_fn = multidnn_logit_fn_builder(
                units=head.logits_dimension,
                hidden_units_list=dnn_hidden_units,
                connected_mode_list=dnn_connected_mode,
                feature_columns=dnn_feature_columns,
                input_layer_partitioner=input_layer_partitioner)
            dnn_logits = dnn_logit_fn(features=features, mode=mode)

    # Build Linear Logits.
    linear_parent_scope = 'linear'
    if model_type == 'deep' or not linear_feature_columns:
        linear_logits = None
    else:
        linear_optimizer = get_optimizer_instance(
            linear_optimizer, learning_rate=_LINEAR_LEARNING_RATE)
        check_no_sync_replicas_optimizer(linear_optimizer)
        with tf.variable_scope(linear_parent_scope,
                               values=tuple(six.itervalues(features)),
                               partitioner=input_layer_partitioner) as scope:
            logit_fn = linear_logit_fn_builder(
                units=head.logits_dimension,
                feature_columns=linear_feature_columns)
            linear_logits = logit_fn(features=features)
            add_layer_summary(linear_logits, scope.name)

    # Build CNN Logits.
    cnn_parent_scope = 'cnn'
    if not with_cnn:
        cnn_logits = None
    else:
        cnn_optimizer = get_optimizer_instance(
            cnn_optimizer, learning_rate=_CNN_LEARNING_RATE)
        with tf.variable_scope(cnn_parent_scope,
                               values=tuple([cnn_features]),
                               partitioner=input_layer_partitioner) as scope:
            img_vec = Vgg16().build(cnn_features)
            cnn_logits = tf.layers.dense(
                img_vec,
                units=head.logits_dimension,
                kernel_initializer=tf.glorot_uniform_initializer(),
                name=scope)
            add_layer_summary(cnn_logits, scope.name)

    # Combine logits and build the full model. Every branch yields logits of
    # shape [batch_size, logits_dimension]; the enabled ones are summed.
    logits_combine = [
        branch_logits
        for branch_logits in (dnn_logits, linear_logits, cnn_logits)
        if branch_logits is not None
    ]
    if not logits_combine:
        raise ValueError(
            'No logits were built: enable at least one of the DNN, linear '
            'or CNN branches.')
    logits = tf.add_n(logits_combine)

    def _train_op_fn(loss):
        """Returns the op to optimize the loss."""
        train_ops = []
        global_step = tf.train.get_global_step()
        branches = (
            (dnn_logits, dnn_optimizer, dnn_parent_scope),
            (linear_logits, linear_optimizer, linear_parent_scope),
            (cnn_logits, cnn_optimizer, cnn_parent_scope),
        )
        # With batch norm, moving_mean and moving_variance must be updated
        # while training. Those update ops live in tf.GraphKeys.UPDATE_OPS
        # and are made a dependency of every branch's train op.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):
            for branch_logits, optimizer, scope_name in branches:
                if branch_logits is None:
                    continue
                # No `global_step` is passed to `minimize`: the step is
                # incremented once below, not once per optimizer.
                train_ops.append(
                    optimizer.minimize(
                        loss,
                        var_list=tf.get_collection(
                            tf.GraphKeys.TRAINABLE_VARIABLES,
                            scope=scope_name)))
        # Group the per-branch ops; the group finishes when all of them have.
        train_op = tf.group(*train_ops)
        with tf.control_dependencies([train_op]):
            with tf.colocate_with(global_step):
                return tf.assign_add(global_step, 1)

    return head.create_estimator_spec(features=features,
                                      mode=mode,
                                      labels=labels,
                                      train_op_fn=_train_op_fn,
                                      logits=logits)
def create(cls, embeddings, labels, **kwargs):
    """The main method for creating all :class:`WordBasedModel` types.

    This method instantiates a model with pooling and optional stacking
    layers. Many of the arguments provided are reused by each implementation,
    but some sub-classes need more information in order to properly
    initialize. For this reason, the full list of keyword args is passed to
    the :method:`pool` and :method:`stacked` methods.

    The session and the `lengths` / `y` placeholders are only created when
    the caller did not pass them in `kwargs`.

    :param embeddings: This is a dictionary of embeddings, mapped to their
        numerical indices in the lookup table
    :param labels: This is a list of the `str` labels
    :param kwargs: There are sub-graph specific Keyword Args allowed for e.g.
        embeddings. See below for known args:

    :Keyword Arguments:
    * *gpus* -- (``int``) How many GPUs to split training across. If greater
        than 1, this function delegates to `ClassifyParallelModel`, which is
        handed this method to build its sub-models. ``-1`` means "count the
        devices listed in ``CUDA_VISIBLE_DEVICES`` (or ``NV_GPU``)".
    * *model_type* -- The string name for the model (defaults to `default`)
    * *sess* -- An optional tensorflow session. If not passed, a new session
        is created
    * *lengths_key* -- (``str``) Specifies which `batch_dict` property should
        be used to determine the temporal length. When it is not set, the
        model is created without a `lengths` tensor
    * *lengths* -- An optional tensor holding the temporal lengths
    * *y* -- An optional tensor holding the one-hot labels
    * *finetune* -- Are we doing fine-tuning of word embeddings (defaults to
        `True`)
    * *mxlen* -- The maximum signal (`x` tensor temporal) length (defaults to
        `100`)
    * *dropout* -- This indicates how much dropout should be applied to the
        model when training.
    * *filtsz* -- This is actually a top-level param due to an unfortunate
        coupling between the pooling layer and the input, which, for
        convolution, requires input padding.

    :return: A fully-initialized tensorflow classifier
    """
    TRAIN_FLAG()

    gpus = kwargs.get('gpus', 1)
    if gpus == -1:
        visible = os.getenv('CUDA_VISIBLE_DEVICES', os.getenv('NV_GPU', '0'))
        gpus = len(visible.split(','))
        kwargs['gpus'] = gpus
    if gpus > 1:
        return ClassifyParallelModel(cls.create, embeddings, labels, **kwargs)

    # `kwargs.get(key, default)` evaluates `default` eagerly, which would
    # open a new session and add unused placeholders to the graph even when
    # the caller supplied its own. Build the defaults only when needed.
    sess = kwargs['sess'] if 'sess' in kwargs else tf.Session()

    model = cls()
    model.embeddings = embeddings
    model._record_state(**kwargs)

    model.lengths_key = kwargs.get('lengths_key')
    if model.lengths_key is None:
        model.lengths = None
    elif 'lengths' in kwargs:
        model.lengths = kwargs['lengths']
    else:
        model.lengths = tf.placeholder(tf.int32, [None], name="lengths")

    model.labels = labels
    nc = len(labels)
    if 'y' in kwargs:
        model.y = kwargs['y']
    else:
        model.y = tf.placeholder(tf.int32, [None, nc], name="y")

    # This only exists to make exporting easier
    model.pdrop_value = kwargs.get('dropout', 0.5)

    with tf.variable_scope(tf.get_variable_scope(), reuse=tf.AUTO_REUSE):
        # One random seed shared by the uniform and the glorot initializer.
        seed = np.random.randint(10e8)
        init = tf.random_uniform_initializer(-0.05, 0.05,
                                             dtype=tf.float32, seed=seed)
        word_embeddings = model.embed(**kwargs)
        input_sz = word_embeddings.shape[-1]
        pooled = model.pool(word_embeddings, input_sz, init, **kwargs)
        stacked = model.stacked(pooled, init, **kwargs)

        # For fully connected layers, use xavier (glorot) transform
        with tf.variable_scope("output"):
            dense_out = tf.layers.dense(
                stacked, nc,
                activation=None,
                kernel_initializer=tf.glorot_uniform_initializer(seed))
            model.logits = tf.identity(dense_out, name="logits")
            model.best = tf.argmax(model.logits, 1, name="best")
            model.probs = tf.nn.softmax(model.logits, name="probs")

    model.sess = sess
    return model
def _build(self, convnet_pars):
    """Create a multi-head Q-network, its masked MSE loss and its train op.

    One head is built per approximator; each head is two ReLU dense layers
    followed by a linear layer with one output per action. The loss sums,
    over the heads, the mean squared error between the masked target and the
    masked Q-value of the taken action. Variables of this scope are
    initialized in `self._session` before returning.

    Args:
        convnet_pars: dict read for 'input_shape', 'output_shape',
            'n_approximators', 'n_states', 'n_features' and 'optimizer'
            (itself a dict with 'name', 'lr' and, for the RMSProp variants,
            'decay' and 'epsilon').

    Raises:
        ValueError: if the optimizer name is not one of 'rmspropcentered',
            'rmsprop', 'adam' or 'adadelta'.
    """

    def _make_optimizer(spec):
        # Map the optimizer description onto a tf.train optimizer instance.
        kind = spec['name']
        if kind == 'rmspropcentered':
            return tf.train.RMSPropOptimizer(learning_rate=spec['lr'],
                                             decay=spec['decay'],
                                             epsilon=spec['epsilon'],
                                             centered=True)
        if kind == 'rmsprop':
            return tf.train.RMSPropOptimizer(learning_rate=spec['lr'],
                                             decay=spec['decay'],
                                             epsilon=spec['epsilon'])
        if kind == 'adam':
            return tf.train.AdamOptimizer(learning_rate=spec['lr'])
        if kind == 'adadelta':
            return tf.train.AdadeltaOptimizer(learning_rate=spec['lr'])
        raise ValueError('Unavailable optimizer selected.')

    with tf.variable_scope(None, default_name=self._name):
        self._scope_name = tf.get_default_graph().get_name_scope() + '/'

        with tf.variable_scope('State'):
            state_shape = [None] + list(convnet_pars['input_shape'])
            self._x = tf.placeholder(tf.float32, shape=state_shape,
                                     name='input')

        with tf.variable_scope('Action'):
            self._action = tf.placeholder('uint8', [None], name='action')
            action_one_hot = tf.one_hot(self._action,
                                        convnet_pars['output_shape'][0],
                                        name='action_one_hot')

        with tf.variable_scope('Mask'):
            n_heads = convnet_pars['n_approximators']
            self._mask = tf.placeholder(tf.float32, shape=[None, n_heads])

        # With a known number of states the state index is one-hot encoded;
        # otherwise the state is used directly.
        if convnet_pars['n_states'] is None:
            x = self._x[..., 0]
        else:
            state_index = tf.cast(self._x[..., 0, 0], tf.int32)
            x = tf.one_hot(state_index, convnet_pars['n_states'])

        self._features = list()
        self._features2 = list()
        self._q = list()
        self._q_acted = list()
        for i in range(n_heads):
            suffix = str(i)
            with tf.variable_scope('head_' + suffix):
                first = tf.layers.dense(
                    x, convnet_pars['n_features'],
                    activation=tf.nn.relu,
                    kernel_initializer=tf.glorot_uniform_initializer(),
                    name='features_' + suffix)
                self._features.append(first)

                second = tf.layers.dense(
                    self._features[i], convnet_pars['n_features'],
                    activation=tf.nn.relu,
                    kernel_initializer=tf.glorot_uniform_initializer(),
                    name='features2_' + suffix)
                self._features2.append(second)

                q_head = tf.layers.dense(
                    self._features2[i], convnet_pars['output_shape'][0],
                    kernel_initializer=tf.glorot_uniform_initializer(),
                    name='q_' + suffix)
                self._q.append(q_head)

                # Q-value of the action that was actually taken.
                acted = tf.reduce_sum(self._q[i] * action_one_hot,
                                      axis=1, name='q_acted_' + suffix)
                self._q_acted.append(acted)

        self._target_q = tf.placeholder('float32', [None, n_heads],
                                        name='target_q')

        loss = 0.
        for i in range(n_heads):
            head_mask = self._mask[:, i]
            masked_target = head_mask * self._target_q[:, i]
            masked_prediction = self._mask[:, i] * self._q_acted[i]
            loss += tf.losses.mean_squared_error(masked_target,
                                                 masked_prediction)

        tf.summary.scalar('mse', loss)
        tf.summary.scalar('average_q', tf.reduce_mean(self._q))
        scope_summaries = tf.get_collection(tf.GraphKeys.SUMMARIES,
                                            scope=self._scope_name)
        self._merged = tf.summary.merge(scope_summaries)

        opt = _make_optimizer(convnet_pars['optimizer'])
        self._train_step = opt.minimize(loss=loss)

        scope_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                            scope=self._scope_name)
        initializer = tf.variables_initializer(scope_variables)

    self._session.run(initializer)

    if self._folder_name is not None:
        log_dir = self._folder_name + '/' + self._scope_name[:-1]
        self._train_writer = tf.summary.FileWriter(
            log_dir, graph=tf.get_default_graph())

    self._train_count = 0

    self._add_collection()
def inception3d_block(X, name, final_channel_list, compress_channel_list,
                      is_training, dropout_rate=0.0,
                      apply_batchnorm=False, weight_decay=None,
                      initializer=tf.glorot_uniform_initializer()):
    """3-D counterpart of the 2-D inception block.

    Four routes are computed from `X` and concatenated along the last axis:
    a 1x1x1 convolution, a 1x1x1 compression followed by a 3x3x3 convolution,
    a 1x1x1 compression followed by a 5x5x5 convolution, and a 3x3x3 max-pool
    followed by a 1x1x1 convolution. The two compression layers always run
    with a dropout rate of zero.

    Args:
        X: input activation.
        name: variable scope of the block.
        final_channel_list: output channels of each route, in the form
            [#1x1x1 channels, #3x3x3 channels, #5x5x5 channels,
             #compressed max-pool channels].
        compress_channel_list: channels of the 1x1x1 compressions placed in
            front of the expensive convolutions, in the form
            [#compressed channels for 3x3x3, #compressed channels for 5x5x5].
        is_training: forwarded to every `rectified_conv3d` call.
        dropout_rate: dropout rate of the non-compression convolutions.
        apply_batchnorm: forwarded to every `rectified_conv3d` call.
        weight_decay: forwarded to every `rectified_conv3d` call.
        initializer: weight initializer of every convolution.

    Returns:
        The concatenation of the four route activations.
    """
    unit_stride = (1, 1, 1)

    def _conv(inputs, layer_name, filter_shape, channels, padding, rate):
        # All convolutions of the block share everything but these arguments.
        return rectified_conv3d(inputs,
                                name=layer_name,
                                filter_shape=filter_shape,
                                output_channel=channels,
                                stride=unit_stride,
                                padding_type=padding,
                                is_training=is_training,
                                dropout_rate=rate,
                                apply_batchnorm=apply_batchnorm,
                                weight_decay=weight_decay,
                                apply_relu=True,
                                initializer=initializer)

    with tf.variable_scope(name):
        # Route 1: plain 1x1x1 convolution.
        A1 = _conv(X, '1x1x1', (1, 1, 1),
                   final_channel_list[0], 'VALID', dropout_rate)

        # Route 2: compress with 1x1x1 (no dropout), then 3x3x3.
        C3 = _conv(X, 'compress_3x3x3', (1, 1, 1),
                   compress_channel_list[0], 'VALID', 0.0)
        A3 = _conv(C3, '3x3x3', (3, 3, 3),
                   final_channel_list[1], 'SAME', dropout_rate)

        # Route 3: compress with 1x1x1 (no dropout), then 5x5x5.
        C5 = _conv(X, 'compress_5x5x5', (1, 1, 1),
                   compress_channel_list[1], 'VALID', 0.0)
        A5 = _conv(C5, '5x5x5', (5, 5, 5),
                   final_channel_list[2], 'SAME', dropout_rate)

        # Route 4: max-pool the input, then reduce the channels with 1x1x1.
        CMp = max_pooling3d(X,
                            name='maxpool',
                            filter_shape=(3, 3, 3),
                            stride=unit_stride,
                            padding_type='SAME')
        AMp = _conv(CMp, 'compress_maxpool', (1, 1, 1),
                    final_channel_list[3], 'VALID', dropout_rate)

        # Concatenate all routes along the last (channel) axis.
        A = tf.concat([A1, A3, A5, AMp], axis=-1, name='concat')

    return A
def build_model(mode, inputs, params, is_training):
    """Compute logits of the model (output distribution)

    For `params.model_version == 'lstm'` the graph is: embedding lookup, a
    dense projection, a bidirectional LSTM encoder, an attention layer over
    the encoder outputs, a second bidirectional LSTM, max/average pooling
    over time and a small dense classifier with two outputs.

    The attention scores are computed for every time step of the encoder
    output, so the padded sentence length is not fixed by this function.

    Args:
        mode: (string) 'train', 'eval', etc. Not used by the graph itself.
        inputs: (dict) contains the inputs of the graph (features, labels...)
            this can be `tf.placeholder` or outputs of `tf.data`. Read for
            'sentence' and 'sentence_lengths'.
        params: (Params) contains hyperparameters of the model; read for
            `model_version`, `vocab_size`, `embedding_size`,
            `lstm_num_units`, `attention_units` and `dropout_rate`.
        is_training: (bool) dropout is only active when this is true.

    Returns:
        output: (tf.Tensor) output of the model

    Raises:
        NotImplementedError: if `params.model_version` is not 'lstm'.
    """
    sentence = inputs['sentence']

    if params.model_version != 'lstm':
        raise NotImplementedError(
            "Unknown model version: {}".format(params.model_version))

    # Get word embeddings for each token in the sentence
    embeddings = tf.get_variable(
        name="embeddings", dtype=tf.float32,
        shape=[params.vocab_size, params.embedding_size])
    sentence = tf.nn.embedding_lookup(embeddings, sentence)

    # Self Attentive Classification Network implementation
    sentence = tf.layers.dense(sentence, 10, activation=tf.nn.relu)

    # Apply a bidirectional LSTM over the embeddings.
    # NOTE(review): the same cell object is passed for both directions;
    # confirm that this is intended.
    lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(params.lstm_num_units)
    outputs, _ = tf.nn.bidirectional_dynamic_rnn(
        lstm_cell, lstm_cell, sentence,
        dtype=tf.float32,
        sequence_length=inputs['sentence_lengths'],
        scope='Encoder')
    output = tf.concat(outputs, 2)

    W_1 = tf.get_variable(
        'W_1',
        shape=[2 * params.lstm_num_units, params.attention_units],
        initializer=tf.glorot_uniform_initializer())
    v = tf.get_variable(
        'v',
        shape=[params.attention_units, 2 * params.lstm_num_units],
        initializer=tf.glorot_uniform_initializer())

    # Score all time steps at once. `tensordot` contracts the feature axis,
    # giving the same values as one matmul per step without tying the graph
    # to a fixed number of steps.
    hidden = tf.tanh(tf.tensordot(output, W_1, axes=1))
    scores = tf.tensordot(hidden, v, axes=1)

    # Softmax over the time axis. Subtracting the per-feature maximum leaves
    # the result unchanged but keeps `exp` from overflowing.
    shifted = scores - tf.reduce_max(scores, axis=1, keepdims=True)
    exp_scores = tf.exp(shifted)
    attention_scores = exp_scores / tf.reduce_sum(
        exp_scores, axis=1, keepdims=True)  # (m, Tx, emb)

    context = tf.multiply(output, attention_scores)
    attention_output = tf.concat([output, context], axis=2)

    integration_units = attention_output.get_shape().as_list()[2]
    integration_cell = tf.nn.rnn_cell.BasicLSTMCell(integration_units)
    attention_layer_outputs, _ = tf.nn.bidirectional_dynamic_rnn(
        integration_cell, integration_cell, attention_output,
        dtype=tf.float32,
        sequence_length=inputs['sentence_lengths'],
        scope='Integration_Layer')
    attention_layer_output = tf.concat(attention_layer_outputs, 2)

    # Max and average pooling over windows of five steps, then the mean over
    # time.
    pool_output5M = tf.nn.pool(input=attention_layer_output,
                               window_shape=[5],
                               pooling_type="MAX",
                               padding="SAME")
    pool_output5A = tf.nn.pool(input=attention_layer_output,
                               window_shape=[5],
                               pooling_type="AVG",
                               padding="SAME")
    pool_output = tf.concat([pool_output5M, pool_output5A], 2)
    mean_output = tf.reduce_mean(pool_output, axis=1)

    keep_rate = 1.0 - params.dropout_rate if is_training else 1.0
    activation_fn = tf.nn.relu

    # Compute logits from the pooled output
    layer1_output = tf.layers.dense(mean_output, 10, activation=activation_fn)
    layer1_output = tf.nn.dropout(layer1_output, keep_rate)
    layer2_output = tf.layers.dense(layer1_output, 5, activation=activation_fn)
    layer2_output = tf.nn.dropout(layer2_output, keep_rate)
    logits = tf.layers.dense(layer2_output, 2)

    return logits
def construct(self, args, num_words, num_chars, num_tags):
    """Build the tagger graph, its metrics and summaries, then initialise it.

    Args:
        args: options object; `recodex`, `rnn_cell`, `we_dim`, `cle_dim`,
            `rnn_cell_dim` and `logdir` are read.
        num_words: number of rows of the word-embedding matrix.
        num_chars: number of rows of the character-embedding matrix.
        num_tags: number of output classes of the final dense layer.

    Raises:
        ValueError: if `args.rnn_cell` is neither "LSTM" nor "GRU".
    """
    with self.session.graph.as_default():
        if args.recodex:
            tf.get_variable_scope().set_initializer(
                tf.glorot_uniform_initializer(seed=42))

        # Inputs
        self.sentence_lens = tf.placeholder(tf.int32, [None], name="sentence_lens")
        self.word_ids = tf.placeholder(tf.int32, [None, None], name="word_ids")
        self.charseqs = tf.placeholder(tf.int32, [None, None], name="charseqs")
        self.charseq_lens = tf.placeholder(tf.int32, [None], name="charseq_lens")
        self.charseq_ids = tf.placeholder(tf.int32, [None, None], name="charseq_ids")
        self.tags = tf.placeholder(tf.int32, [None, None], name="tags")

        # Sentence-level RNN cell class, selected by name.
        if args.rnn_cell not in ("LSTM", "GRU"):
            raise ValueError("Unknown rnn_cell {}".format(args.rnn_cell))
        if args.rnn_cell == "LSTM":
            sentence_cell_cls = tf.nn.rnn_cell.BasicLSTMCell
        else:
            sentence_cell_cls = tf.nn.rnn_cell.GRUCell

        # Word embeddings of dimensionality args.we_dim, looked up per word id.
        word_table = tf.get_variable(
            "word_embeddings", shape=[num_words, args.we_dim], dtype=tf.float32)
        embedded_words = tf.nn.embedding_lookup(word_table, self.word_ids)

        # Character-level embeddings (CLE): embed the character sequences and
        # run a bidirectional GRU of size args.cle_dim over them.
        char_table = tf.get_variable(
            "character_embeddings", shape=[num_chars, args.cle_dim],
            dtype=tf.float32)
        embedded_chars = tf.nn.embedding_lookup(char_table, self.charseqs)
        char_fwd_cell = tf.nn.rnn_cell.GRUCell(args.cle_dim)
        char_bwd_cell = tf.nn.rnn_cell.GRUCell(args.cle_dim)
        _, char_final_states = tf.nn.bidirectional_dynamic_rnn(
            char_fwd_cell, char_bwd_cell, embedded_chars,
            sequence_length=self.charseq_lens, dtype=tf.float32)
        final_fwd, final_bwd = char_final_states

        # The CLE of a character sequence is the sum of both final states;
        # self.charseq_ids indexes those rows for every word position.
        cle = final_fwd + final_bwd
        embedded_cle = tf.nn.embedding_lookup(cle, self.charseq_ids)

        # Word embedding first, CLE second, joined on the feature axis.
        inputs = tf.concat([embedded_words, embedded_cle], axis=2)

        # Sentence-level bidirectional RNN with a separate cell per direction.
        sentence_outputs, _ = tf.nn.bidirectional_dynamic_rnn(
            sentence_cell_cls(args.rnn_cell_dim),
            sentence_cell_cls(args.rnn_cell_dim),
            inputs, sequence_length=self.sentence_lens, dtype=tf.float32)
        hidden_fwd, hidden_bwd = sentence_outputs
        hidden_layer = tf.concat([hidden_fwd, hidden_bwd], axis=2)

        # Dense layer without activation producing num_tags scores per word.
        output_layer = tf.layers.dense(hidden_layer, num_tags)
        self.predictions = tf.argmax(output_layer, axis=2)

        # 1./0. mask separating real words from padding.
        word_mask = tf.sequence_mask(self.sentence_lens, dtype=tf.float32)

        # Training
        loss = tf.losses.sparse_softmax_cross_entropy(
            self.tags, output_layer, weights=word_mask)
        global_step = tf.train.create_global_step()
        self.training = tf.train.AdamOptimizer().minimize(
            loss, global_step=global_step, name="training")

        # Summaries
        self.current_accuracy, self.update_accuracy = tf.metrics.accuracy(
            self.tags, self.predictions, weights=word_mask)
        self.current_loss, self.update_loss = tf.metrics.mean(
            loss, weights=tf.reduce_sum(word_mask))
        self.reset_metrics = tf.variables_initializer(
            tf.get_collection(tf.GraphKeys.METRIC_VARIABLES))

        summary_writer = tf.contrib.summary.create_file_writer(
            args.logdir, flush_millis=10 * 1000)
        self.summaries = {}
        with summary_writer.as_default(), \
                tf.contrib.summary.record_summaries_every_n_global_steps(10):
            train_loss = tf.contrib.summary.scalar("train/loss", self.update_loss)
            train_accuracy = tf.contrib.summary.scalar(
                "train/accuracy", self.update_accuracy)
            self.summaries["train"] = [train_loss, train_accuracy]
        with summary_writer.as_default(), \
                tf.contrib.summary.always_record_summaries():
            for dataset in ["dev", "test"]:
                dataset_loss = tf.contrib.summary.scalar(
                    dataset + "/loss", self.current_loss)
                dataset_accuracy = tf.contrib.summary.scalar(
                    dataset + "/accuracy", self.current_accuracy)
                self.summaries[dataset] = [dataset_loss, dataset_accuracy]

        # Initialize variables
        self.session.run(tf.global_variables_initializer())
        with summary_writer.as_default():
            tf.contrib.summary.initialize(
                session=self.session, graph=self.session.graph)
def embedding(inputs, vocab_size, num_units, zero_pad=True, lookup_table=None,
              scale=True, scope="embedding", reuse=None):
    '''Embeds a given tensor.

    Args:
      inputs: A `Tensor` with type `int32` or `int64` containing the ids
        to be looked up in `lookup table`.
      vocab_size: An int. Vocabulary size.
      num_units: An int. Number of embedding hidden units.
      zero_pad: A boolean. If True, all the values of the first row (id 0)
        should be constant zeros.
      lookup_table: Optional existing embedding table to look ids up in.
        When None, a `[vocab_size, num_units]` variable named 'lookup_table'
        is created with a Glorot-uniform initializer. For
        `scope == 'encoder_embed'` it is created in the caller's variable
        scope, otherwise inside `scope`.
      scale: A boolean. If True. the outputs is multiplied by sqrt num_units.
      scope: Optional scope for `variable_scope`.
      reuse: Boolean, whether to reuse the weights of a previous layer
        by the same name.

    Returns:
      A `Tensor` with one more rank than inputs's. The last dimensionality
        should be `num_units`.

    For example,

    ```
    import tensorflow as tf

    inputs = tf.to_int32(tf.reshape(tf.range(2*3), (2, 3)))
    outputs = embedding(inputs, 6, 2, zero_pad=True)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        print(sess.run(outputs))
    >>
    [[[ 0.          0.        ]
      [ 0.09754146  0.67385566]
      [ 0.37864095 -0.35689294]]

     [[-1.01329422 -1.09939694]
      [ 0.7521342   0.38203377]
      [-0.04973143 -0.06210355]]]
    ```

    ```
    import tensorflow as tf

    inputs = tf.to_int32(tf.reshape(tf.range(2*3), (2, 3)))
    outputs = embedding(inputs, 6, 2, zero_pad=False)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        print(sess.run(outputs))
    >>
    [[[-0.19172323 -0.39159766]
      [-0.43212751 -0.66207761]
      [ 1.03452027 -0.26704335]]

     [[-0.11634696 -0.35983452]
      [ 0.50208133  0.53509563]
      [ 1.22204471 -0.96587461]]]
    ```
    '''
    # `is None` is an identity test; `== None` would be evaluated elementwise
    # for array-like tables and cannot be used as a condition.
    if lookup_table is None and scope == 'encoder_embed':
        # The main (encoder) table lives outside `scope`.
        lookup_table = tf.get_variable(
            'lookup_table', dtype=tf.float32, shape=[vocab_size, num_units],
            initializer=tf.glorot_uniform_initializer())

    with tf.variable_scope(scope, reuse=reuse):
        if lookup_table is None:
            # Table for any other purpose lives inside `scope`.
            lookup_table = tf.get_variable(
                'lookup_table', dtype=tf.float32,
                shape=[vocab_size, num_units],
                initializer=tf.glorot_uniform_initializer())

        if zero_pad:
            # Zero row for <PAD>. Done once: padding an already padded table
            # yields the same values, so a second concat would only add an op.
            lookup_table = tf.concat(
                (tf.zeros(shape=[1, num_units]), lookup_table[1:, :]),
                0, name='concated_lookup_table')

        outputs = tf.nn.embedding_lookup(lookup_table, inputs)

        if scale:
            outputs = outputs * (num_units ** 0.5)

    return outputs
def mnist_lenet(inputs, training=False, reuse=tf.AUTO_REUSE, name="lenet"):
    """Convolutional feature extractor used for MNIST.

    The input is reshaped to [-1, 28, 28, 1] and passed through three stages
    of 3x3 convolution (32, 64 and 128 filters, ReLU), 2x2 max pooling with
    stride 2 and dropout at rate 0.5, then flattened.

    Args:
        inputs: tensor that can be reshaped to [-1, 28, 28, 1].
        training: forwarded to the dropout layers.
        reuse: reuse flag of the enclosing variable scope.
        name: name of the enclosing variable scope.

    Returns:
        The flattened output of the third stage.
    """
    with tf.variable_scope(name, reuse=reuse, dtype=tf.float32):
        net = tf.reshape(inputs, shape=[-1, 28, 28, 1])

        # Stages are numbered from 1 so the layer names stay
        # conv1/max_pool1/dropout1, conv2/..., conv3/...
        for stage, num_filters in enumerate((32, 64, 128), start=1):
            net = tf.layers.conv2d(
                net,
                filters=num_filters,
                kernel_size=(3, 3),
                padding="same",
                activation=tf.nn.relu,
                use_bias=True,
                kernel_initializer=tf.glorot_uniform_initializer(),
                name="conv%d" % stage,
                reuse=tf.AUTO_REUSE,
                bias_initializer=tf.constant_initializer(0.05))
            net = tf.layers.max_pooling2d(
                net, pool_size=(2, 2), strides=(2, 2), padding="same",
                name="max_pool%d" % stage)
            net = tf.layers.dropout(
                net, rate=0.5, training=training, name="dropout%d" % stage)

        # flatten
        return tf.layers.flatten(net, name="flatten")
def _build_model(self):
    """Build the CNN + stacked-LSTM graph and store time-major logits.

    Reads `self.inputs` and `self.mode`; writes `self.seq_len` and
    `self.logits`. Hyperparameters come from the module-level `FLAGS` and
    the class count from the module-level `num_classes`.

    Raises:
        AssertionError: if `FLAGS.cnn_count` exceeds the number of times the
            smaller image side can be halved (rounding up) before reaching 1.
    """
    filters = [1, 64, 128, 128, FLAGS.out_channels]
    conv_stride, pool_stride = 1, 2

    # Fallback feature-map size; replaced after every CNN unit below.
    feature_h = FLAGS.image_height
    feature_w = FLAGS.image_width

    # Count how often the smaller side can be halved before it reaches 1.
    count_ = 0
    remaining = min(FLAGS.image_height, FLAGS.image_width)
    while remaining > 1:
        remaining = (remaining + 1) // 2
        count_ += 1
    assert FLAGS.cnn_count <= count_, \
        "FLAGS.cnn_count should be <= {}!".format(count_)

    # CNN part
    with tf.variable_scope('cnn'):
        x = self.inputs
        for unit in range(1, FLAGS.cnn_count + 1):
            with tf.variable_scope('unit-%d' % unit):
                x = self._conv2d(x, 'cnn-%d' % unit, 3,
                                 filters[unit - 1], filters[unit], conv_stride)
                x = self._batch_norm('bn%d' % unit, x)
                x = self._leaky_relu(x, FLAGS.leakiness)
                x = self._max_pool(x, 2, pool_stride)

                static_shape = x.get_shape().as_list()
                _, feature_h, feature_w, _ = static_shape
        print('\nfeature_h: {}, feature_w: {}'.format(feature_h, feature_w))

    # LSTM part
    with tf.variable_scope('lstm'):
        # [batch_size, feature_w, feature_h, FLAGS.out_channels]
        x = tf.transpose(x, [0, 2, 1, 3])
        # treat `feature_w` as max_timestep in lstm.
        lstm_input_shape = [FLAGS.batch_size, feature_w,
                            feature_h * FLAGS.out_channels]
        x = tf.reshape(x, lstm_input_shape)
        print('lstm input shape: {}'.format(x.get_shape().as_list()))
        self.seq_len = tf.fill([x.get_shape().as_list()[0]], feature_w)

        # Two LSTM layers; each gets output dropout in 'train' mode only.
        layers = []
        for _ in range(2):
            layer = tf.nn.rnn_cell.LSTMCell(FLAGS.num_hidden,
                                            state_is_tuple=True)
            if self.mode == 'train':
                layer = tf.nn.rnn_cell.DropoutWrapper(
                    cell=layer, output_keep_prob=FLAGS.output_keep_prob)
            layers.append(layer)

        # Stacking rnn cells
        stack = tf.nn.rnn_cell.MultiRNNCell(layers, state_is_tuple=True)
        initial_state = stack.zero_state(FLAGS.batch_size, dtype=tf.float32)

        # The final state returned alongside the outputs is not needed.
        # outputs: [batch_size, max_stepsize, FLAGS.num_hidden]
        outputs, _ = tf.nn.dynamic_rnn(
            cell=stack,
            inputs=x,
            sequence_length=self.seq_len,
            initial_state=initial_state,
            dtype=tf.float32,
            time_major=False)

        # Fold batch and time together so one projection serves every step:
        # [batch_size * max_stepsize, FLAGS.num_hidden]
        outputs = tf.reshape(outputs, [-1, FLAGS.num_hidden])

        W = tf.get_variable(name='W_out',
                            shape=[FLAGS.num_hidden, num_classes],
                            dtype=tf.float32,
                            initializer=tf.glorot_uniform_initializer())
        b = tf.get_variable(name='b_out',
                            shape=[num_classes],
                            dtype=tf.float32,
                            initializer=tf.constant_initializer())

        projected = tf.matmul(outputs, W) + b
        # Reshaping back to the original shape
        batch_dim = tf.shape(x)[0]
        batch_major = tf.reshape(projected, [batch_dim, -1, num_classes])
        # Time major
        self.logits = tf.transpose(batch_major, (1, 0, 2))
def simple_vector_RNN_block(X_img,
                            is_training,
                            conv2d_function_handle,
                            sequence_model_type,
                            num_of_sequence_layers,
                            hidden_state_dim_list,
                            output_dimension_list,
                            output_type,
                            output_norm_list,
                            num_detector_layers=40,
                            weight_decay=None,
                            initializer=tf.glorot_uniform_initializer()):
    '''
    DESCRIPTION:
        Runs `conv2d_function_handle` as the body of a `tf.while_loop`
        inside the 'shared_conv_layers' variable scope, collecting its
        results in a TensorArray. The stacked results are split into a
        list of `num_detector_layers` slices, which is then fed through
        `num_of_sequence_layers` stacked RNN or LSTM layers.
    USAGE:
        INPUT:
            X_img                   : the 3d input image with the hits of all
                                        the detector-layers, given in
                                        [batch,x,y,z] format, where the last
                                        axis must equal num_detector_layers.
            is_training             : whether we are in training or testing
                                        mode. It is passed on as one of the
                                        loop variables.
            conv2d_function_handle  : the loop body handed to tf.while_loop.
                                        It receives the loop variables
                                        [X_img,is_training,iter_i,iter_end,
                                        conv_reg_loss,tensor_array].
            sequence_model_type     : whether we want to use the 'LSTM'
                                        layers or RNN layers. ['LSTM'/'RNN']
            num_of_sequence_layers  : the number of RNN cells stacked on top
                                        of each other.
            hidden_state_dim_list   : the list containing the dimension of
                                        the hidden state of the RNN layers.
            output_dimension_list   : the list containing the output
                                        dimension of each RNN layer.
            output_type             : whether we want to return a sequence or
                                        vector as output of this block.
                                        string: 'sequence'/'vector'
            output_norm_list        : the name of the normalization to be
                                        applied to the output of each layer.
            num_detector_layers     : the total number of layers in detector
                                        hit-image, default to 40.
            weight_decay            : forwarded unchanged to every sequence
                                        layer.
            initializer             : forwarded unchanged to every sequence
                                        layer.
        OUTPUT:
            output_sequence         : whatever the last sequence layer
                                        returns (the layer is asked for 'one'
                                        output source when output_type is
                                        'vector', otherwise for 'all').
        RAISES:
            AssertionError          : on a wrong input rank, a detector-layer
                                        count mismatch, or an unsupported
                                        output_type / sequence_model_type.
    '''
    # Validate every argument before any graph construction happens.
    layer_dim=3
    input_shape=X_img.get_shape().as_list()
    # The rank must be exactly 4; checking only that the shape list is
    # non-empty would let any rank >= 1 through.
    assert len(input_shape)==4,'give in: [batch,x,y,z] format'
    assert input_shape[layer_dim]==num_detector_layers,\
                                        'detector layer number mismatch'
    assert output_type=='sequence' or output_type=='vector',\
                                        'Give correct argument'
    assert sequence_model_type=='RNN' or sequence_model_type=='LSTM',\
                                        'Give correct argument'

    #Running the convolution on the same variable scope for each detector layer
    # NOTE: `dtype` and `_tfwhile_cond` are not defined in this function;
    # they are looked up in the enclosing module.
    with tf.variable_scope('shared_conv_layers'):
        #initializing the iter variable
        iter_i=tf.constant(0,dtype=tf.int32,name='iter_i')
        iter_end=tf.constant(num_detector_layers,dtype=tf.int32,name='iter_end')

        #Initializing the TensorArray for holding all the layer activation
        tensor_array=tf.TensorArray(dtype=dtype,
                                    size=num_detector_layers,
                                    clear_after_read=True,#no read many
                                    infer_shape=True)
        #Initializing the constant to hold the regularization loss
        conv_reg_loss=tf.constant(0.0,dtype=dtype,name='reg_loss_value')

        #Now running the tf.while loop and the final tensor array as the output
        _,_,_,_,conv_reg_loss,tensor_array=tf.while_loop(
                                    _tfwhile_cond,
                                    conv2d_function_handle,
                                    loop_vars=[X_img,is_training,iter_i,
                                               iter_end,conv_reg_loss,
                                               tensor_array],
                                    #none of them will be shape invariant,
                                    swap_memory=True,
                                    parallel_iterations=16)

        #Now adding this regularization of this conv_layer to one collection
        tf.add_to_collection('reg_losses',conv_reg_loss)
        tf.summary.scalar('l2_reg_loss_conv',conv_reg_loss)

    #Retrieving the sequence tensor (vector-encoding) from the tensor array
    cnn_output_vectors=tensor_array.stack()
    input_sequence=[cnn_output_vectors[i,:,:]
                        for i in range(num_detector_layers)]

    #The layer constructor does not change between iterations, so pick it once
    if sequence_model_type=='RNN':
        seq_layer_function_handle=_simple_vector_RNN_layer
    else:
        seq_layer_function_handle=_simple_vector_LSTM_layer

    #Writing in a separate name scope since variable scope are taken care inside
    with tf.name_scope('seq_RNN_layers') as rnn_block_scope:
        #Stacking up the RNN seq-layer on top on one another.
        for i in range(num_of_sequence_layers):
            #Deciding the unique layer name for unique variable scope
            layer_name='layer{}'.format(i+1)

            #Only the last layer collapses to one output, and only when a
            #vector was requested
            num_output_source='all'
            if i==num_of_sequence_layers-1 and output_type=='vector':
                num_output_source='one'

            #The output of this layer will be the input sequence to next layer
            input_sequence=seq_layer_function_handle(
                                input_sequence=input_sequence,
                                name=layer_name,
                                hidden_state_length=hidden_state_dim_list[i],
                                num_output_source=num_output_source,
                                output_dimension=output_dimension_list[i],
                                output_norm=output_norm_list[i],
                                weight_decay=weight_decay,
                                initializer=initializer)

        #Finally returning the output sequence be it a list of one vector or all
        output_sequence=input_sequence

        #Summing whatever this name scope added to the 'all_losses' collection
        reg_loss_list_rnn=tf.get_collection('all_losses',scope=rnn_block_scope)
        l2_reg_loss_rnn=0.0
        if reg_loss_list_rnn:
            l2_reg_loss_rnn=tf.add_n(reg_loss_list_rnn,name='l2_reg_loss_rnn')

        #Adding this regularization loss to the reg_losses collection
        tf.add_to_collection('reg_losses',l2_reg_loss_rnn)
        tf.summary.scalar('l2_reg_loss_rnn',l2_reg_loss_rnn)

    #This output could be used for further fully connected layer/
    #aggregation (if its a sequence) or input to other sequence layer
    #or directly as the unnormalized output of the whole model.
    return output_sequence
def createCNN(self, Input, groundTruth, stage):
    """Build the two-stage landmark regression graph (scopes "S1" and "S2").

    Each stage runs four groups of [conv 3x3 + batch norm] x 2 followed by
    2x2 max pooling (64, 128, 256 and 512 filters), then flatten, dropout,
    a 256-unit dense layer, batch norm and a linear output layer. Stage 2
    consumes the input image warped by `AffineTransformLayer`, a landmark
    image and a feature map derived from the stage-1 dense layer.

    Args:
        Input: image tensor fed to stage 1 and to the affine warp of stage 2.
        groundTruth: target landmarks, passed to `self.normRmse_S2`.
        stage: 1 marks the stage-1 layers trainable and enables stage-1
            dropout; 2 does the same for stage 2; any other value leaves
            both flags False.

    Returns:
        dict with the keys 's1_landmarks', 's1_cost', 's1_optimizer',
        'S2_Ret', 'S2_Cost' and 'S2_Optimizer'.
    """
    trainable_s1 = bool(stage == 1)
    trainable_s2 = bool(stage == 2)

    print("s1 dropout:")
    print(trainable_s1)
    print("s2 dropout:")
    print(trainable_s2)

    def batch_norm(x, trainable):
        # Batch normalisation configured identically everywhere in the net.
        # NOTE(review): no `training=` argument is passed, so the layer's
        # default mode applies even though UPDATE_OPS are collected below --
        # confirm that this is intended.
        return tf.layers.batch_normalization(
            x, trainable=trainable, axis=-1, scale=True,
            momentum=_BATCH_NORM_DECAY, epsilon=_BATCH_NORM_EPSILON,
            center=True, fused=True)

    def conv_bn(x, filters, trainable):
        # 3x3 stride-1 'same' ReLU convolution followed by batch norm.
        x = tf.layers.conv2d(
            x, filters, 3, 1, padding='same', activation=tf.nn.relu,
            trainable=trainable,
            kernel_initializer=tf.glorot_uniform_initializer())
        return batch_norm(x, trainable)

    def conv_tower(x, trainable, pool_padding):
        # Four groups of two conv_bn layers, each closed by 2x2 max pooling.
        for filters in (64, 128, 256, 512):
            x = conv_bn(x, filters, trainable)
            x = conv_bn(x, filters, trainable)
            x = tf.layers.max_pooling2d(x, 2, 2, padding=pool_padding)
        return x

    Ret_dict = {}

    with tf.variable_scope("S1"):
        s1_pool4 = conv_tower(Input, trainable_s1, pool_padding='same')
        s1_pool4_flat = tf.contrib.layers.flatten(s1_pool4)
        s1_dropout = tf.layers.dropout(s1_pool4_flat, 0.5,
                                       training=trainable_s1)
        s1_fc1 = tf.layers.dense(
            s1_dropout, 256, activation=tf.nn.relu,
            kernel_initializer=tf.glorot_uniform_initializer(),
            trainable=trainable_s1)
        s1_fc1 = batch_norm(s1_fc1, trainable_s1)

        s1_output = tf.layers.dense(s1_fc1, 136, activation=None)
        # The network predicts an offset from the initial landmarks.
        s1_landmarks = s1_output + self.initLandmarks
        s1_cost = tf.reduce_mean(
            self.normRmse_S2(groundTruth, s1_landmarks))

        with tf.control_dependencies(
                tf.get_collection(tf.GraphKeys.UPDATE_OPS, 'S1')):
            s1_optimizer = tf.train.AdamOptimizer(0.001).minimize(
                s1_cost,
                var_list=tf.get_collection(
                    tf.GraphKeys.TRAINABLE_VARIABLES, "S1"))

    Ret_dict['s1_landmarks'] = s1_landmarks
    Ret_dict['s1_cost'] = s1_cost
    Ret_dict['s1_optimizer'] = s1_optimizer

    with tf.variable_scope("S2"):
        s1_landmarks = tf.reshape(s1_landmarks, [-1, 68, 2])
        r, t = TransformParamsLayer(s1_landmarks, self.initLandmarks)
        S2_img_output = AffineTransformLayer(Input, r, t)
        S2_landmarks_affine = LandmarkTransformLayer(s1_landmarks, r, t)
        S2_img_landmarks = LandmarkImageLayer(S2_landmarks_affine)

        # Feature image computed from the (batch-normalised) stage-1 dense
        # layer and resized to the input resolution.
        S2_img_feature = tf.layers.dense(
            s1_fc1, 56 * 56, activation=tf.nn.relu,
            kernel_initializer=tf.glorot_uniform_initializer())
        S2_img_feature = tf.reshape(S2_img_feature, [-1, 56, 56, 1])
        S2_img_feature = tf.image.resize_images(S2_img_feature,
                                                [IMGSIZE, IMGSIZE])

        S2_inputs = tf.concat(
            [S2_img_output, S2_img_landmarks, S2_img_feature], axis=3)
        S2_inputs = batch_norm(S2_inputs, trainable_s2)

        # Stage 2 pools with 'valid' padding (the layer default), unlike
        # stage 1 which uses 'same'.
        s2_pool4 = conv_tower(S2_inputs, trainable_s2, pool_padding='valid')
        s2_pool4_flat = tf.contrib.layers.flatten(s2_pool4)
        s2_dropout = tf.layers.dropout(s2_pool4_flat, 0.5,
                                       training=trainable_s2)
        # Feed the dropout output into the dense layer, as stage 1 does;
        # previously the dropout tensor was built but never consumed.
        s2_fc1 = tf.layers.dense(
            s2_dropout, 256, activation=tf.nn.relu,
            kernel_initializer=tf.glorot_uniform_initializer())
        s2_fc1_bn = batch_norm(s2_fc1, trainable_s2)

        S2_Fc2 = tf.layers.dense(s2_fc1_bn, N_LANDMARK * 2, activation=None)
        # Predicted offset is added in the warped frame, then mapped back
        # with the inverse transform.
        S2_Fc2 = tf.reshape(S2_Fc2, [-1, 68, 2]) + S2_landmarks_affine
        S2_Ret = LandmarkTransformLayer(S2_Fc2, r, t, Inverse=True)
        S2_Ret = tf.reshape(S2_Ret, [-1, 136])
        S2_Cost = tf.reduce_mean(self.normRmse_S2(groundTruth, S2_Ret))

        with tf.control_dependencies(
                tf.get_collection(tf.GraphKeys.UPDATE_OPS, 'S2')):
            S2_Optimizer = tf.train.AdamOptimizer(0.001).minimize(
                S2_Cost,
                var_list=tf.get_collection(
                    tf.GraphKeys.TRAINABLE_VARIABLES, 'S2'))

    Ret_dict['S2_Ret'] = S2_Ret
    Ret_dict['S2_Cost'] = S2_Cost
    Ret_dict['S2_Optimizer'] = S2_Optimizer
    return Ret_dict
def _build_model(self):
    """Build the CNN feature extractor and the two-layer LSTM head.

    Reads `self.inputs` and `self.mode`; writes `self.seq_len` and the
    time-major `self.logits`. Hyperparameters are taken from the
    module-level `FLAGS`, the number of classes from the module-level
    `num_classes`.

    Raises:
        AssertionError: if `FLAGS.cnn_count` is larger than the number of
            times the smaller image side can be halved (rounding up) before
            it reaches 1.
    """
    filters = [1, 64, 128, 128, FLAGS.out_channels]
    strides = [1, 2]

    feature_h = FLAGS.image_height
    feature_w = FLAGS.image_width

    count_ = 0
    min_size = min(FLAGS.image_height, FLAGS.image_width)
    while min_size > 1:
        min_size = (min_size + 1) // 2
        count_ += 1
    # No parentheses around (condition, message): a parenthesised pair is a
    # non-empty tuple, which is always truthy, so the check could never fail.
    assert FLAGS.cnn_count <= count_, \
        "FLAGS.cnn_count should be <= {}!".format(count_)

    # CNN part
    with tf.variable_scope('cnn'):
        x = self.inputs
        for i in range(FLAGS.cnn_count):
            with tf.variable_scope('unit-%d' % (i + 1)):
                x = self._conv2d(x, 'cnn-%d' % (i + 1), 3,
                                 filters[i], filters[i + 1], strides[0])
                x = self._batch_norm('bn%d' % (i + 1), x)
                x = self._leaky_relu(x, FLAGS.leakiness)
                x = self._max_pool(x, 2, strides[1])

                # Track the static spatial size after every unit.
                _, feature_h, feature_w, _ = x.get_shape().as_list()
        print('\nfeature_h: {}, feature_w: {}'.format(feature_h, feature_w))

    # LSTM part
    with tf.variable_scope('lstm'):
        # [batch_size, feature_w, feature_h, FLAGS.out_channels]
        x = tf.transpose(x, [0, 2, 1, 3])
        # treat `feature_w` as max_timestep in lstm.
        x = tf.reshape(x, [FLAGS.batch_size, feature_w,
                           feature_h * FLAGS.out_channels])
        print('lstm input shape: {}'.format(x.get_shape().as_list()))
        # Every sequence in the batch is given the full length feature_w.
        self.seq_len = tf.fill([x.get_shape().as_list()[0]], feature_w)

        # Output dropout is only applied when self.mode == 'train'.
        cell = tf.nn.rnn_cell.LSTMCell(FLAGS.num_hidden, state_is_tuple=True)
        if self.mode == 'train':
            cell = tf.nn.rnn_cell.DropoutWrapper(
                cell=cell, output_keep_prob=FLAGS.output_keep_prob)

        cell1 = tf.nn.rnn_cell.LSTMCell(FLAGS.num_hidden, state_is_tuple=True)
        if self.mode == 'train':
            cell1 = tf.nn.rnn_cell.DropoutWrapper(
                cell=cell1, output_keep_prob=FLAGS.output_keep_prob)

        # Stacking rnn cells
        stack = tf.nn.rnn_cell.MultiRNNCell([cell, cell1],
                                            state_is_tuple=True)
        initial_state = stack.zero_state(FLAGS.batch_size, dtype=tf.float32)

        # The second output is the last state and we will not use that
        outputs, _ = tf.nn.dynamic_rnn(
            cell=stack,
            inputs=x,
            sequence_length=self.seq_len,
            initial_state=initial_state,
            dtype=tf.float32,
            time_major=False
        )  # [batch_size, max_stepsize, FLAGS.num_hidden]

        # Reshaping to apply the same weights over the timesteps:
        # [batch_size * max_stepsize, FLAGS.num_hidden]
        outputs = tf.reshape(outputs, [-1, FLAGS.num_hidden])

        W = tf.get_variable(name='W_out',
                            shape=[FLAGS.num_hidden, num_classes],
                            dtype=tf.float32,
                            initializer=tf.glorot_uniform_initializer())
        b = tf.get_variable(name='b_out',
                            shape=[num_classes],
                            dtype=tf.float32,
                            initializer=tf.constant_initializer())

        self.logits = tf.matmul(outputs, W) + b
        # Reshaping back to the original shape
        shape = tf.shape(x)
        self.logits = tf.reshape(self.logits, [shape[0], -1, num_classes])
        # Time major
        self.logits = tf.transpose(self.logits, (1, 0, 2))
def _create_weights(self):
    """Create every trainable variable of the model via ``tf.get_variable``.

    Weight matrices use a fresh Glorot-uniform initializer, biases a fresh
    zeros initializer. The variables are stored as attributes on ``self``;
    the per-one-hot-feature variables are stored as lists with one entry
    per element of ``self.one_hots_dims``. The "deep" embeddings are only
    created when ``self.use_deep`` is truthy.

    Variables are created in the same order and under the same names as
    before, so existing checkpoints keep matching.
    """

    def glorot(name, shape):
        # float32 variable with its own Glorot-uniform initializer instance.
        return tf.get_variable(shape=shape, initializer=tf.glorot_uniform_initializer(),
                               dtype=tf.float32, name=name)

    def zeros(name, shape):
        # float32 variable initialized to zeros.
        return tf.get_variable(shape=shape, initializer=tf.zeros_initializer(),
                               dtype=tf.float32, name=name)

    # Embedding
    self.Wu_Emb = glorot('user_embedding', [self.num_user, self.dim_k])
    self.W_usr_feat_emb = glorot('user_feat_embedding', [self.dim_usr_cf_emb, self.dim_k])
    self.Wwords_Emb = glorot('words_embedding', [self.num_words, self.dim_k])
    self.W_LDA_emb = glorot('LDA_embedding', [self.dim_lda, self.dim_k])
    self.W_one_hots = [
        glorot('one_hot_{}'.format(i), [dim, self.dim_k])
        for i, dim in enumerate(self.one_hots_dims)
    ]
    self.W_Ctx = glorot('context_embedding', [self.dim_num_feat, self.dim_k])

    # Item one-hot features attention
    self.Wu_oh_Att = glorot('user_attention', [self.dim_k, self.att_dim_k])
    self.Wctx_Att = glorot('context_attention', [self.dim_k, self.att_dim_k])
    self.Woh_Att = [
        glorot('oh_attention_{}'.format(i), [self.dim_k, self.att_dim_k])
        for i in range(len(self.one_hots_dims))
    ]
    self.WW_Att = glorot('words_attention', [self.dim_k, self.att_dim_k])
    self.W_visual_Att = glorot('visual_attention', [self.dim_k, self.att_dim_k])
    self.W_LDA_Att = glorot('LDA_attention', [self.dim_k, self.att_dim_k])
    self.b_oh_Att = zeros('bias_attention', [self.att_dim_k])
    self.w_oh_Att = glorot('weight_attention', [self.att_dim_k, 1])
    self.c_oh_Att = zeros('bias2_attention', [1])

    self.bias = zeros('bias', [1])
    self.bias_u = zeros('bias_u', [self.num_user, 1])

    # Dot-product attention
    self.W_in_prd_att = glorot('inner_product_attention_weight', [self.dim_k, self.att_dim_k])
    # Previously the bare class `tf.zeros_initializer` was passed here; an
    # instance is used now, consistent with every other bias in this method.
    self.b_in_prd_att = zeros('inner_product_att_bias', [self.att_dim_k])
    self.w_in_prd_att = glorot('inner_product_w', [self.att_dim_k, 1])

    # Parameters of the deep part
    if self.use_deep:
        self.Wu_deep_emb = glorot('user_deep_emb', [self.num_user, self.dim_k])
        self.Wwords_deep_emb = glorot('words_deep_emb', [self.num_words, self.dim_k])
        self.W_deep_one_hots = [
            glorot('deep_one_hot_{}'.format(i), [dim, self.dim_k])
            for i, dim in enumerate(self.one_hots_dims)
        ]
def identity_residual_block(X,name,num_channels,mid_filter_shape,is_training,
                            dropout_rate=0.0,apply_batchnorm=True,weight_decay=None,
                            initializer=tf.glorot_uniform_initializer()):
    '''
    DESCRIPTION:
        Bottleneck residual block whose shortcut is the untouched input:
        the input X is added directly to the output of the main branch,
        without any projection on the skip path.

        Main branch, all convolutions requested with stride (1,1):
            1x1 conv (F1 channels) -> (fh,fw) conv (F2 channels)
                                   -> 1x1 conv (F3 channels, no ReLU)
        followed by add-with-input, ReLU and dropout.
    USAGE:
        INPUT:
            X               : input tensor; its channel count is read from
                              axis 3 of the static shape
            name            : variable-scope name of this block
            num_channels    : (F1,F2,F3), output channels of the three
                              sub-layers; F3 must equal X's channel count
            mid_filter_shape: (fh,fw), filter shape of the middle conv
            is_training     : forwarded to the conv helper and to dropout
            dropout_rate    : dropout rate of the first two convs and of
                              the final dropout (the last conv uses 0.0)
            apply_batchnorm,
            weight_decay,
            initializer     : forwarded unchanged to rectified_conv2d
        OUTPUT:
            A               : the output tensor of the block
        RAISES:
            AssertionError  : when F3 differs from X's channel count
    '''
    # Arguments that are identical for all three convolutions.
    shared_args = dict(stride=(1, 1),
                       is_training=is_training,
                       apply_batchnorm=apply_batchnorm,
                       weight_decay=weight_decay,
                       initializer=initializer)

    with tf.variable_scope(name):
        # First 1x1 convolution.
        squeezed = rectified_conv2d(X,
                                    name='branch_2a',
                                    filter_shape=(1, 1),
                                    output_channel=num_channels[0],
                                    padding_type="VALID",
                                    dropout_rate=dropout_rate,
                                    **shared_args)

        # Middle convolution with the caller-supplied filter shape.
        filtered = rectified_conv2d(squeezed,
                                    name='branch_2b',
                                    filter_shape=mid_filter_shape,
                                    output_channel=num_channels[1],
                                    padding_type="SAME",
                                    dropout_rate=dropout_rate,
                                    **shared_args)

        # The element-wise add below needs matching channel counts.
        if X.get_shape().as_list()[3] != num_channels[2]:
            raise AssertionError('Identity Block: last sub-layer channels should match input')

        # Last 1x1 convolution; ReLU is deferred until after the addition.
        expanded = rectified_conv2d(filtered,
                                    name='branch_2c',
                                    filter_shape=(1, 1),
                                    output_channel=num_channels[2],
                                    padding_type="VALID",
                                    dropout_rate=0.0,
                                    apply_relu=False,
                                    **shared_args)

        # Shortcut / skip connection followed by the activation.
        with tf.variable_scope('skip_conn'):
            merged = tf.add(expanded, X)
            activated = tf.nn.relu(merged, name='relu')

        # Dropout on the block output.
        return tf.layers.dropout(activated, rate=dropout_rate,
                                 training=is_training, name='dropout')
def construct(self, args, num_words, num_chars, num_tags):
    """Build the tagger graph, its training op, metrics and summaries.

    Each word is represented by the concatenation of a character-level CNN
    embedding and a character-level bidirectional-GRU embedding (CLE).
    These go through a bidirectional RNN and a dense layer producing
    ``num_tags`` logits per word.

    Args:
        args: options object; the attributes read here are ``recodex``,
            ``rnn_cell`` ('lstm' or 'gru', case-insensitive),
            ``rnn_cell_dim``, ``we_dim``, ``cle_dim``, ``cnne_max``,
            ``cnne_filters`` and ``logdir``.
        num_words: number of rows of the word-embedding matrix.
        num_chars: number of rows of the character-embedding matrix.
        num_tags: number of output classes.

    Raises:
        KeyError: if ``args.rnn_cell`` is neither 'lstm' nor 'gru'.
    """
    with self.session.graph.as_default():
        if args.recodex:
            tf.get_variable_scope().set_initializer(tf.glorot_uniform_initializer(seed=42))

        # Inputs
        self.sentence_lens = tf.placeholder(tf.int32, [None], name="sentence_lens")
        self.word_ids = tf.placeholder(tf.int32, [None, None], name="word_ids")
        # Words in the batch; they are indexed by charseq_ids.
        self.charseqs = tf.placeholder(tf.int32, [None, None], name="charseqs")
        self.charseq_lens = tf.placeholder(tf.int32, [None], name="charseq_lens")
        # Ids of the words, relative to the batch.
        self.charseq_ids = tf.placeholder(tf.int32, [None, None], name="charseq_ids")
        self.tags = tf.placeholder(tf.int32, [None, None], name="tags")

        # Sentence-level RNN cells (forward and backward). Only the requested
        # cell type is instantiated; previously both an LSTM and a GRU cell
        # were built for each direction and one of them was thrown away.
        num_units = args.rnn_cell_dim
        cell_factories = {
            'lstm': lambda: tf.nn.rnn_cell.BasicLSTMCell(num_units, name="lstm_cell"),
            'gru': lambda: tf.nn.rnn_cell.GRUCell(num_units, name="gru_cell"),
        }
        make_cell = cell_factories[args.rnn_cell.lower()]
        cells = [make_cell() for _ in range(2)]

        # Word embeddings.
        # NOTE(review): `embedded_word_ids` is not part of the features fed to
        # the RNN below (the concat uses CNN + CLE embeddings only). The
        # variable and lookup are kept so the set of graph variables is
        # unchanged -- confirm whether word embeddings should be used.
        word_embeddings = tf.get_variable(name="word_embeddings", shape=[num_words, args.we_dim])
        embedded_word_ids = tf.nn.embedding_lookup(word_embeddings, self.word_ids)

        # Character-level word embeddings (CLE)
        char_embeddings = tf.get_variable(name="char_embeddings", shape=[num_chars, args.cle_dim])
        # Embedded characters of the unique words in the batch.
        embedded_char_batch = tf.nn.embedding_lookup(char_embeddings, self.charseqs)

        cell_fw = tf.nn.rnn_cell.GRUCell(args.cle_dim, name="gru_cle_cell_fw")
        cell_bw = tf.nn.rnn_cell.GRUCell(args.cle_dim, name="gru_cle_cell_bw")
        (o_fw, o_bw), (s_fw, s_bw) = tf.nn.bidirectional_dynamic_rnn(
            cell_fw, cell_bw, embedded_char_batch,
            sequence_length=self.charseq_lens, dtype=tf.float32)

        # CLE of each unique word: sum of the final forward and backward states.
        cle = s_fw + s_bw
        # CLE of every word position in the batch.
        cle_ids = tf.nn.embedding_lookup(cle, self.charseq_ids)

        # Character-level CNN embeddings, sharing the character embeddings
        # (and their lookup) with the CLE branch.
        max_pools = []
        for kernel_size in range(2, args.cnne_max + 1):
            conv_layer = tf.layers.conv1d(embedded_char_batch,
                                          filters=args.cnne_filters,
                                          kernel_size=kernel_size)
            # Global max over the character axis. The previous fixed
            # 100-wide pooling window followed by `squeeze` produced the same
            # values but failed for conv outputs longer than 100 steps.
            max_pools.append(tf.reduce_max(conv_layer, axis=1))
        cnne = tf.concat(max_pools, axis=-1)
        cnne_ids = tf.nn.embedding_lookup(cnne, self.charseq_ids)

        # Per-word features: CNN embedding followed by CLE.
        concatenated_embeddings = tf.concat((cnne_ids, cle_ids), axis=2)

        # Sentence-level bidirectional RNN.
        (rnn_fw, rnn_bw), _ = tf.nn.bidirectional_dynamic_rnn(
            cells[0], cells[1], concatenated_embeddings,
            sequence_length=self.sentence_lens, dtype=tf.float32)
        rnn = tf.concat((rnn_fw, rnn_bw), axis=2)

        # Dense layer (no activation) into num_tags classes.
        output_layer = tf.layers.dense(rnn, num_tags, name="output_layer")
        self.predictions = tf.argmax(output_layer, axis=2)

        # 1./0. mask of valid/invalid word positions.
        weights = tf.sequence_mask(self.sentence_lens,
                                   maxlen=tf.reduce_max(self.sentence_lens),
                                   dtype=tf.float32)

        # Training
        loss = tf.losses.sparse_softmax_cross_entropy(self.tags, output_layer, weights=weights)
        global_step = tf.train.create_global_step()
        self.training = tf.train.AdamOptimizer().minimize(loss, global_step=global_step, name="training")

        # Summaries
        self.current_accuracy, self.update_accuracy = tf.metrics.accuracy(
            self.tags, self.predictions, weights=weights)
        self.current_loss, self.update_loss = tf.metrics.mean(loss, weights=tf.reduce_sum(weights))
        self.reset_metrics = tf.variables_initializer(tf.get_collection(tf.GraphKeys.METRIC_VARIABLES))

        summary_writer = tf.contrib.summary.create_file_writer(args.logdir, flush_millis=10 * 1000)
        self.summaries = {}
        with summary_writer.as_default(), tf.contrib.summary.record_summaries_every_n_global_steps(10):
            self.summaries["train"] = [
                tf.contrib.summary.scalar("train/loss", self.update_loss),
                tf.contrib.summary.scalar("train/accuracy", self.update_accuracy),
            ]
        with summary_writer.as_default(), tf.contrib.summary.always_record_summaries():
            for dataset in ["dev", "test"]:
                self.summaries[dataset] = [
                    tf.contrib.summary.scalar(dataset + "/loss", self.current_loss),
                    tf.contrib.summary.scalar(dataset + "/accuracy", self.current_accuracy),
                ]

        # Initialize variables
        self.session.run(tf.global_variables_initializer())
        with summary_writer.as_default():
            tf.contrib.summary.initialize(session=self.session, graph=self.session.graph)
def inception_block(X,name,final_channel_list,compress_channel_list,
                    is_training,dropout_rate=0.0,
                    apply_batchnorm=True,weight_decay=None,
                    initializer=tf.glorot_uniform_initializer()):
    '''
    DESCRIPTION:
        Inception-style block: the same input is processed by four parallel
        branches whose outputs are concatenated along the last axis:
            1. 1x1 conv
            2. 1x1 "compress" conv -> 3x3 conv
            3. 1x1 "compress" conv -> 5x5 conv
            4. 3x3 max-pool (stride 1) -> 1x1 conv
        The two compress convolutions are called with dropout rate 0.0;
        every other convolution uses `dropout_rate`.
    USAGE:
        INPUT:
            X                    : the input tensor
            name                 : variable-scope name of the whole block
            final_channel_list   : output channels of the four branches
                                   [#1x1, #3x3, #5x5, #compressed maxpool]
            compress_channel_list: channels of the 1x1 compression placed
                                   before the larger filters
                                   [#compress for 3x3, #compress for 5x5]
            is_training,
            apply_batchnorm,
            weight_decay,
            initializer          : forwarded unchanged to rectified_conv2d
        OUTPUT:
            A                    : concatenation of the four branch outputs
    '''
    def conv(inputs, layer_name, kernel, channels, padding, rate):
        # One stride-1 ReLU convolution with the block-wide settings.
        return rectified_conv2d(inputs,
                                name=layer_name,
                                filter_shape=kernel,
                                output_channel=channels,
                                stride=(1, 1),
                                padding_type=padding,
                                is_training=is_training,
                                dropout_rate=rate,
                                apply_batchnorm=apply_batchnorm,
                                weight_decay=weight_decay,
                                apply_relu=True,
                                initializer=initializer)

    with tf.variable_scope(name):
        # Branch 1: direct 1x1 convolution.
        branch_1x1 = conv(X, '1x1', (1, 1), final_channel_list[0], 'VALID', dropout_rate)

        # Branch 2: compress with 1x1, then convolve with 3x3.
        squeezed_3x3 = conv(X, 'compress_3x3', (1, 1), compress_channel_list[0], 'VALID', 0.0)
        branch_3x3 = conv(squeezed_3x3, '3x3', (3, 3), final_channel_list[1], 'SAME', dropout_rate)

        # Branch 3: compress with 1x1, then convolve with 5x5.
        squeezed_5x5 = conv(X, 'compress_5x5', (1, 1), compress_channel_list[1], 'VALID', 0.0)
        branch_5x5 = conv(squeezed_5x5, '5x5', (5, 5), final_channel_list[2], 'SAME', dropout_rate)

        # Branch 4: 3x3 max-pooling, then a 1x1 convolution to set the channels.
        pooled = max_pooling2d(X,
                               name='maxpool',
                               filter_shape=(3, 3),
                               stride=(1, 1),
                               padding_type='SAME')
        branch_pool = conv(pooled, 'compress_maxpool', (1, 1), final_channel_list[3], 'VALID', dropout_rate)

        # Stack the branches along the last (channel) axis.
        return tf.concat([branch_1x1, branch_3x3, branch_5x5, branch_pool],
                         axis=-1, name='concat')
def construct(self, args):
    """Build the variational auto-encoder graph, training op and summaries.

    The previous version was an unfilled skeleton: the encoder, decoder,
    sampling and loss were described only in TODO comments, so the function
    could not run. They are implemented here exactly as those comments
    specified.

    Args:
        args: options object; the attributes read here are ``z_dim``,
            ``recodex`` and ``logdir``.

    Sets (among others) ``self.images``, ``self.z``,
    ``self.generated_images``, ``self.loss``, ``self.training``,
    ``self.summaries``, ``self.generated_image_data`` and
    ``self.generated_image_summary``.
    """
    self.z_dim = args.z_dim

    with self.session.graph.as_default():
        if args.recodex:
            tf.get_variable_scope().set_initializer(tf.glorot_uniform_initializer(seed=42))

        # Inputs
        self.images = tf.placeholder(tf.float32, [None, self.HEIGHT, self.WIDTH, 1])
        num_pixels = self.HEIGHT * self.WIDTH

        # Encoder
        def encoder(image):
            # Flatten -> two 500-unit ReLU layers -> two linear heads of size
            # z_dim (mean and log-variance of the approximate posterior).
            hidden = tf.layers.flatten(image)
            hidden = tf.layers.dense(hidden, 500, activation=tf.nn.relu)
            hidden = tf.layers.dense(hidden, 500, activation=tf.nn.relu)
            z_mean = tf.layers.dense(hidden, self.z_dim)
            z_log_variance = tf.layers.dense(hidden, self.z_dim)
            return z_mean, z_log_variance

        z_mean, z_log_variance = encoder(self.images)

        # Standard deviation: exp(log_variance / 2).
        z_sd = tf.exp(z_log_variance / 2)
        # Reparametrization: z = mean + sd * epsilon with epsilon ~ N(0, 1).
        epsilon = tf.random_normal(tf.shape(z_mean))
        self.z = z_mean + z_sd * epsilon

        # Decoder
        def decoder(z):
            # Two 500-unit ReLU layers -> one logit per pixel, reshaped into
            # a single-channel image.
            hidden = tf.layers.dense(z, 500, activation=tf.nn.relu)
            hidden = tf.layers.dense(hidden, 500, activation=tf.nn.relu)
            logits = tf.layers.dense(hidden, num_pixels)
            return tf.reshape(logits, [-1, self.HEIGHT, self.WIDTH, 1])

        generated_logits = decoder(self.z)
        self.generated_images = tf.nn.sigmoid(generated_logits)

        # Loss and training
        reconstruction_loss = tf.losses.sigmoid_cross_entropy(self.images, generated_logits)
        # Mean KL divergence between N(z_mean, z_sd) and N(0, 1).
        latent_loss = tf.reduce_mean(tf.distributions.kl_divergence(
            tf.distributions.Normal(z_mean, z_sd),
            tf.distributions.Normal(0.0, 1.0)))
        # Both terms are means, so they are re-weighted by the number of
        # pixels and by the latent dimensionality respectively.
        self.loss = reconstruction_loss * num_pixels + latent_loss * self.z_dim

        global_step = tf.train.create_global_step()
        self.training = tf.train.AdamOptimizer().minimize(self.loss, global_step=global_step, name="training")

        # Summaries
        summary_writer = tf.contrib.summary.create_file_writer(args.logdir, flush_millis=10 * 1000)
        with summary_writer.as_default(), tf.contrib.summary.record_summaries_every_n_global_steps(100):
            self.summaries = [tf.contrib.summary.scalar("vae/loss", self.loss),
                              tf.contrib.summary.scalar("vae/reconstruction_loss", reconstruction_loss),
                              tf.contrib.summary.scalar("vae/latent_loss", latent_loss)]

        self.generated_image_data = tf.placeholder(tf.float32, [None, None, 1])
        with summary_writer.as_default(), tf.contrib.summary.always_record_summaries():
            self.generated_image_summary = tf.contrib.summary.image(
                "vae/generated_image", tf.expand_dims(self.generated_image_data, axis=0))

        # Initialize variables
        self.session.run(tf.global_variables_initializer())
        with summary_writer.as_default():
            tf.contrib.summary.initialize(session=self.session, graph=self.session.graph)


def train(self, images):
    """Run one training step on `images` and return the resulting loss value."""
    return self.session.run([self.training, self.summaries, self.loss], {self.images: images})[-1]


def generate(self):
    """Render a GRID x GRID sheet of samples plus a sheet of interpolations.

    The two sheets are stacked vertically with one blank image row between
    them and written through ``self.generated_image_summary``. Nothing is
    returned.
    """
    GRID = 20

    def sample_z(batch_size):
        return np.random.normal(size=[batch_size, self.z_dim])

    # Generate GRIDxGRID images
    random_images = self.session.run(self.generated_images, {self.z: sample_z(GRID * GRID)})

    # Generate GRIDxGRID interpolated images
    if self.z_dim == 2:
        # Use 2D grid for sampled Z
        starts = np.stack([-2 * np.ones(GRID), np.linspace(-2, 2, GRID)], -1)
        ends = np.stack([2 * np.ones(GRID), np.linspace(-2, 2, GRID)], -1)
    else:
        # Generate random Z
        starts, ends = sample_z(GRID), sample_z(GRID)

    # Interpolation coefficients as a column, so they broadcast over z.
    steps = np.expand_dims(np.linspace(0, 1, GRID), -1)
    interpolated_z = []
    for start, end in zip(starts, ends):
        interpolated_z.extend(start + (end - start) * steps)
    interpolated_images = self.session.run(self.generated_images, {self.z: interpolated_z})

    # Stack the random images, then an empty row, and finally interpolated images
    image = np.concatenate(
        [np.concatenate(list(images), axis=1) for images in np.split(random_images, GRID)] +
        [np.zeros([self.HEIGHT, self.WIDTH * GRID, 1])] +
        [np.concatenate(list(images), axis=1) for images in np.split(interpolated_images, GRID)], axis=0)
    self.session.run(self.generated_image_summary, {self.generated_image_data: image})
def _init_graph(self): self.graph = tf.Graph() with self.graph.as_default(): tf.set_random_seed(self.params["random_seed"]) #### input self.training = tf.placeholder(tf.bool, shape=[], name="training") # seq self.seq_name = tf.placeholder(tf.int32, shape=[None, None], name="seq_name") self.seq_item_desc = tf.placeholder(tf.int32, shape=[None, None], name="seq_item_desc") self.seq_category_name = tf.placeholder(tf.int32, shape=[None, None], name="seq_category_name") if self.params["use_bigram"]: self.seq_bigram_item_desc = tf.placeholder(tf.int32, shape=[None, None], name="seq_bigram_item_desc") if self.params["use_trigram"]: self.seq_trigram_item_desc = tf.placeholder(tf.int32, shape=[None, None], name="seq_trigram_item_desc") if self.params["use_subword"]: self.seq_subword_item_desc = tf.placeholder(tf.int32, shape=[None, None], name="seq_subword_item_desc") # placeholder for length self.sequence_length_name = tf.placeholder(tf.int32, shape=[None], name="sequence_length_name") self.sequence_length_item_desc = tf.placeholder(tf.int32, shape=[None], name="sequence_length_item_desc") self.sequence_length_category_name = tf.placeholder(tf.int32, shape=[None], name="sequence_length_category_name") self.sequence_length_item_desc_subword = tf.placeholder(tf.int32, shape=[None], name="sequence_length_item_desc_subword") self.word_length = tf.placeholder(tf.int32, shape=[None, None], name="word_length") # other context self.brand_name = tf.placeholder(tf.int32, shape=[None, 1], name="brand_name") # self.category_name = tf.placeholder(tf.int32, shape=[None, 1], name="category_name") self.category_name1 = tf.placeholder(tf.int32, shape=[None, 1], name="category_name1") self.category_name2 = tf.placeholder(tf.int32, shape=[None, 1], name="category_name2") self.category_name3 = tf.placeholder(tf.int32, shape=[None, 1], name="category_name3") self.item_condition_id = tf.placeholder(tf.int32, shape=[None, 1], name="item_condition_id") self.item_condition = 
tf.placeholder(tf.float32, shape=[None, self.params["MAX_NUM_CONDITIONS"]], name="item_condition") self.shipping = tf.placeholder(tf.int32, shape=[None, 1], name="shipping") self.num_vars = tf.placeholder(tf.float32, shape=[None, self.params["NUM_VARS_DIM"]], name="num_vars") # target self.target = tf.placeholder(tf.float32, shape=[None, 1], name="target") #### embed # embed seq # std = np.sqrt(2 / self.params["embedding_dim"]) std = 0.001 minval = -std maxval = std emb_word = tf.Variable( tf.random_uniform([self.params["MAX_NUM_WORDS"] + 1, self.params["embedding_dim"]], minval, maxval, seed=self.params["random_seed"], dtype=tf.float32)) # emb_word2 = tf.Variable(tf.random_uniform([self.params["MAX_NUM_WORDS"] + 1, self.params["embedding_dim"]], minval, maxval, # seed=self.params["random_seed"], # dtype=tf.float32)) emb_seq_name = tf.nn.embedding_lookup(emb_word, self.seq_name) if self.params["embedding_dropout"] > 0.: emb_seq_name = word_dropout(emb_seq_name, training=self.training, dropout=self.params["embedding_dropout"], seed=self.params["random_seed"]) emb_seq_item_desc = tf.nn.embedding_lookup(emb_word, self.seq_item_desc) if self.params["embedding_dropout"] > 0.: emb_seq_item_desc = word_dropout(emb_seq_item_desc, training=self.training, dropout=self.params["embedding_dropout"], seed=self.params["random_seed"]) # emb_seq_category_name = tf.nn.embedding_lookup(emb_word, self.seq_category_name) # if self.params["embedding_dropout"] > 0.: # emb_seq_category_name = word_dropout(emb_seq_category_name, training=self.training, # dropout=self.params["embedding_dropout"], # seed=self.params["random_seed"]) if self.params["use_bigram"]: emb_seq_bigram_item_desc = embed(self.seq_bigram_item_desc, self.params["MAX_NUM_BIGRAMS"] + 1, self.params["embedding_dim"], seed=self.params["random_seed"]) if self.params["embedding_dropout"] > 0.: emb_seq_bigram_item_desc = word_dropout(emb_seq_bigram_item_desc, training=self.training, dropout=self.params["embedding_dropout"], 
seed=self.params["random_seed"]) if self.params["use_trigram"]: emb_seq_trigram_item_desc = embed(self.seq_trigram_item_desc, self.params["MAX_NUM_TRIGRAMS"] + 1, self.params["embedding_dim"], seed=self.params["random_seed"]) if self.params["embedding_dropout"] > 0.: emb_seq_trigram_item_desc = word_dropout(emb_seq_trigram_item_desc, training=self.training, dropout=self.params["embedding_dropout"], seed=self.params["random_seed"]) if self.params["use_subword"]: emb_seq_subword_item_desc = embed(self.seq_subword_item_desc, self.params["MAX_NUM_SUBWORDS"] + 1, self.params["embedding_dim"], seed=self.params["random_seed"]) if self.params["embedding_dropout"] > 0.: emb_seq_subword_item_desc = word_dropout(emb_seq_subword_item_desc, training=self.training, dropout=self.params["embedding_dropout"], seed=self.params["random_seed"]) # embed other context std = 0.001 minval = -std maxval = std emb_brand = tf.Variable( tf.random_uniform([self.params["MAX_NUM_BRANDS"], self.params["embedding_dim"]], minval, maxval, seed=self.params["random_seed"], dtype=tf.float32)) emb_brand_name = tf.nn.embedding_lookup(emb_brand, self.brand_name) # emb_brand_name = embed(self.brand_name, self.params["MAX_NUM_BRANDS"], self.params["embedding_dim"], # flatten=False, seed=self.params["random_seed"]) # emb_category_name = embed(self.category_name, MAX_NUM_CATEGORIES, self.params["embedding_dim"], # flatten=False) emb_category_name1 = embed(self.category_name1, self.params["MAX_NUM_CATEGORIES_LST"][0], self.params["embedding_dim"], flatten=False, seed=self.params["random_seed"]) emb_category_name2 = embed(self.category_name2, self.params["MAX_NUM_CATEGORIES_LST"][1], self.params["embedding_dim"], flatten=False, seed=self.params["random_seed"]) emb_category_name3 = embed(self.category_name3, self.params["MAX_NUM_CATEGORIES_LST"][2], self.params["embedding_dim"], flatten=False, seed=self.params["random_seed"]) emb_item_condition = embed(self.item_condition_id, self.params["MAX_NUM_CONDITIONS"] + 
1, self.params["embedding_dim"], flatten=False, seed=self.params["random_seed"]) emb_shipping = embed(self.shipping, self.params["MAX_NUM_SHIPPINGS"], self.params["embedding_dim"], flatten=False, seed=self.params["random_seed"]) #### encode enc_seq_name = encode(emb_seq_name, method=self.params["encode_method"], params=self.params, sequence_length=self.sequence_length_name, mask_zero=self.params["embedding_mask_zero"], scope="enc_seq_name") enc_seq_item_desc = encode(emb_seq_item_desc, method=self.params["encode_method"], params=self.params, sequence_length=self.sequence_length_item_desc, mask_zero=self.params["embedding_mask_zero"], scope="enc_seq_item_desc") # enc_seq_category_name = encode(emb_seq_category_name, method=self.params["encode_method"], # params=self.params, sequence_length=self.sequence_length_category_name, # mask_zero=self.params["embedding_mask_zero"], # scope="enc_seq_category_name") if self.params["use_bigram"]: enc_seq_bigram_item_desc = encode(emb_seq_bigram_item_desc, method="fasttext", params=self.params, sequence_length=self.sequence_length_item_desc, mask_zero=self.params["embedding_mask_zero"], scope="enc_seq_bigram_item_desc") if self.params["use_trigram"]: enc_seq_trigram_item_desc = encode(emb_seq_trigram_item_desc, method="fasttext", params=self.params, sequence_length=self.sequence_length_item_desc, mask_zero=self.params["embedding_mask_zero"], scope="enc_seq_trigram_item_desc") # use fasttext encode method for the following if self.params["use_subword"]: enc_seq_subword_item_desc = encode(emb_seq_subword_item_desc, method="fasttext", params=self.params, sequence_length=self.sequence_length_item_desc_subword, mask_zero=self.params["embedding_mask_zero"], scope="enc_seq_subword_item_desc") context = tf.concat([ # att_seq_category_name, tf.layers.flatten(emb_brand_name), # tf.layers.flatten(emb_category_name), tf.layers.flatten(emb_category_name1), tf.layers.flatten(emb_category_name2), tf.layers.flatten(emb_category_name3), 
self.item_condition, tf.cast(self.shipping, tf.float32), self.num_vars], axis=-1, name="context") context_size = self.params["encode_text_dim"] * 0 + \ self.params["embedding_dim"] * 4 + \ self.params["item_condition_size"] + \ self.params["shipping_size"] + \ self.params["num_vars_size"] feature_dim = context_size + self.params["encode_text_dim"] # context = None feature_dim = self.params["encode_text_dim"] att_seq_name = attend(enc_seq_name, method=self.params["attend_method"], context=None, feature_dim=feature_dim, sequence_length=self.sequence_length_name, maxlen=self.params["max_sequence_length_name"], mask_zero=self.params["embedding_mask_zero"], training=self.training, seed=self.params["random_seed"], name="att_seq_name_attend") att_seq_item_desc = attend(enc_seq_item_desc, method=self.params["attend_method"], context=None, feature_dim=feature_dim, sequence_length=self.sequence_length_item_desc, maxlen=self.params["max_sequence_length_item_desc"], mask_zero=self.params["embedding_mask_zero"], training=self.training, seed=self.params["random_seed"], name="att_seq_item_desc_attend") if self.params["encode_text_dim"] != self.params["embedding_dim"]: att_seq_name = tf.layers.Dense(self.params["embedding_dim"])(att_seq_name) att_seq_item_desc = tf.layers.Dense(self.params["embedding_dim"])(att_seq_item_desc) # since the following use fasttext encode, the `encode_text_dim` is embedding_dim feature_dim = context_size + self.params["embedding_dim"] feature_dim = self.params["embedding_dim"] if self.params["use_bigram"]: att_seq_bigram_item_desc = attend(enc_seq_bigram_item_desc, method=self.params["attend_method"], context=None, feature_dim=feature_dim, sequence_length=self.sequence_length_item_desc, maxlen=self.params["max_sequence_length_item_desc"], mask_zero=self.params["embedding_mask_zero"], training=self.training, seed=self.params["random_seed"], name="att_seq_bigram_item_desc_attend") # reshape if self.params["encode_text_dim"] != 
self.params["embedding_dim"]: att_seq_bigram_item_desc = tf.layers.Dense(self.params["embedding_dim"], kernel_initializer=tf.glorot_uniform_initializer(), dtype=tf.float32, bias_initializer=tf.zeros_initializer())(att_seq_bigram_item_desc) if self.params["use_trigram"]: att_seq_trigram_item_desc = attend(enc_seq_trigram_item_desc, method=self.params["attend_method"], context=None, feature_dim=feature_dim, sequence_length=self.sequence_length_item_desc, maxlen=self.params["max_sequence_length_item_desc"], mask_zero=self.params["embedding_mask_zero"], training=self.training, seed=self.params["random_seed"], name="att_seq_trigram_item_desc_attend") # reshape if self.params["encode_text_dim"] != self.params["embedding_dim"]: att_seq_trigram_item_desc = tf.layers.Dense(self.params["embedding_dim"], kernel_initializer=tf.glorot_uniform_initializer(), dtype=tf.float32, bias_initializer=tf.zeros_initializer())(att_seq_trigram_item_desc) feature_dim = context_size + self.params["embedding_dim"] if self.params["use_subword"]: att_seq_subword_item_desc = attend(enc_seq_subword_item_desc, method="ave", context=None, feature_dim=feature_dim, sequence_length=self.sequence_length_item_desc_subword, maxlen=self.params["max_sequence_length_item_desc_subword"], mask_zero=self.params["embedding_mask_zero"], training=self.training, seed=self.params["random_seed"], name="att_seq_subword_item_desc_attend") # reshape if self.params["encode_text_dim"] != self.params["embedding_dim"]: att_seq_subword_item_desc = tf.layers.Dense(self.params["embedding_dim"], kernel_initializer=tf.glorot_uniform_initializer(), dtype=tf.float32, bias_initializer=tf.zeros_initializer())(att_seq_subword_item_desc) deep_list = [] if self.params["enable_deep"]: # common common_list = [ # emb_seq_category_name, emb_brand_name, # emb_category_name, emb_category_name1, emb_category_name2, emb_category_name3, emb_item_condition, emb_shipping ] tmp_common = tf.concat(common_list, axis=1) # word level fm for seq_name 
and others tmp_name = tf.concat([emb_seq_name, tmp_common], axis=1) sum_squared_name = tf.square(tf.reduce_sum(tmp_name, axis=1)) squared_sum_name = tf.reduce_sum(tf.square(tmp_name), axis=1) fm_name = 0.5 * (sum_squared_name - squared_sum_name) # word level fm for seq_item_desc and others tmp_item_desc = tf.concat([emb_seq_item_desc, tmp_common], axis=1) sum_squared_item_desc = tf.square(tf.reduce_sum(tmp_item_desc, axis=1)) squared_sum_item_desc = tf.reduce_sum(tf.square(tmp_item_desc), axis=1) fm_item_desc = 0.5 * (sum_squared_item_desc - squared_sum_item_desc) #### predict # concat deep_list += [ att_seq_name, att_seq_item_desc, context, fm_name, fm_item_desc, ] # if self.params["use_bigram"]: # deep_list += [att_seq_bigram_item_desc] # # if self.params["use_trigram"]: # # deep_list += [att_seq_trigram_item_desc] # if self.params["use_subword"]: # deep_list += [att_seq_subword_item_desc] # fm layer fm_list = [] if self.params["enable_fm_first_order"]: bias_seq_name = embed(self.seq_name, self.params["MAX_NUM_WORDS"] + 1, 1, reduce_sum=True, seed=self.params["random_seed"]) bias_seq_item_desc = embed(self.seq_item_desc, self.params["MAX_NUM_WORDS"] + 1, 1, reduce_sum=True, seed=self.params["random_seed"]) # bias_seq_category_name = embed(self.seq_category_name, self.params["MAX_NUM_WORDS"] + 1, 1, reduce_sum=True, # seed=self.params["random_seed"]) if self.params["use_bigram"]: bias_seq_bigram_item_desc = embed(self.seq_bigram_item_desc, self.params["MAX_NUM_BIGRAMS"] + 1, 1, reduce_sum=True, seed=self.params["random_seed"]) if self.params["use_trigram"]: bias_seq_trigram_item_desc = embed(self.seq_trigram_item_desc, self.params["MAX_NUM_TRIGRAMS"] + 1, 1, reduce_sum=True, seed=self.params["random_seed"]) if self.params["use_subword"]: bias_seq_subword_item_desc = embed(self.seq_subword_item_desc, self.params["MAX_NUM_SUBWORDS"] + 1, 1, reduce_sum=True, seed=self.params["random_seed"]) bias_brand_name = embed(self.brand_name, self.params["MAX_NUM_BRANDS"], 1, 
flatten=True, seed=self.params["random_seed"]) # bias_category_name = embed(self.category_name, MAX_NUM_CATEGORIES, 1, flatten=True) bias_category_name1 = embed(self.category_name1, self.params["MAX_NUM_CATEGORIES_LST"][0], 1, flatten=True, seed=self.params["random_seed"]) bias_category_name2 = embed(self.category_name2, self.params["MAX_NUM_CATEGORIES_LST"][1], 1, flatten=True, seed=self.params["random_seed"]) bias_category_name3 = embed(self.category_name3, self.params["MAX_NUM_CATEGORIES_LST"][2], 1, flatten=True, seed=self.params["random_seed"]) bias_item_condition = embed(self.item_condition_id, self.params["MAX_NUM_CONDITIONS"] + 1, 1, flatten=True, seed=self.params["random_seed"]) bias_shipping = embed(self.shipping, self.params["MAX_NUM_SHIPPINGS"], 1, flatten=True, seed=self.params["random_seed"]) fm_first_order_list = [ bias_seq_name, bias_seq_item_desc, # bias_seq_category_name, bias_brand_name, # bias_category_name, bias_category_name1, bias_category_name2, bias_category_name3, bias_item_condition, bias_shipping, ] if self.params["use_bigram"]: fm_first_order_list += [bias_seq_bigram_item_desc] if self.params["use_trigram"]: fm_first_order_list += [bias_seq_trigram_item_desc] # if self.params["use_subword"]: # fm_first_order_list += [bias_seq_subword_item_desc] tmp_first_order = tf.concat(fm_first_order_list, axis=1) fm_list.append(tmp_first_order) if self.params["enable_fm_second_order"]: # second order emb_list = [ tf.expand_dims(att_seq_name, axis=1), tf.expand_dims(att_seq_item_desc, axis=1), # tf.expand_dims(att_seq_category_name, axis=1), emb_brand_name, # emb_category_name, emb_category_name1, emb_category_name2, emb_category_name3, emb_item_condition, emb_shipping, ] if self.params["use_bigram"]: emb_list += [tf.expand_dims(att_seq_bigram_item_desc, axis=1)] # if self.params["use_trigram"]: # emb_list += [tf.expand_dims(att_seq_trigram_item_desc, axis=1)] if self.params["use_subword"]: emb_list += [tf.expand_dims(att_seq_subword_item_desc, 
axis=1)] emb_concat = tf.concat(emb_list, axis=1) emb_sum_squared = tf.square(tf.reduce_sum(emb_concat, axis=1)) emb_squared_sum = tf.reduce_sum(tf.square(emb_concat), axis=1) fm_second_order = 0.5 * (emb_sum_squared - emb_squared_sum) fm_list.extend([emb_sum_squared, emb_squared_sum]) if self.params["enable_fm_second_order"] and self.params["enable_fm_higher_order"]: fm_higher_order = dense_block(fm_second_order, hidden_units=[self.params["embedding_dim"]] * 2, dropouts=[0.] * 2, densenet=False, training=self.training, seed=self.params["random_seed"]) fm_list.append(fm_higher_order) if self.params["enable_deep"]: deep_list.extend(fm_list) deep_in = tf.concat(deep_list, axis=-1, name="concat") # dense hidden_units = [self.params["fc_dim"]*4, self.params["fc_dim"]*2, self.params["fc_dim"]] dropouts = [self.params["fc_dropout"]] * len(hidden_units) if self.params["fc_type"] == "fc": deep_out = dense_block(deep_in, hidden_units=hidden_units, dropouts=dropouts, densenet=False, training=self.training, seed=self.params["random_seed"]) elif self.params["fc_type"] == "resnet": deep_out = resnet_block(deep_in, hidden_units=hidden_units, dropouts=dropouts, cardinality=1, dense_shortcut=True, training=self.training, seed=self.params["random_seed"]) elif self.params["fc_type"] == "densenet": deep_out = dense_block(deep_in, hidden_units=hidden_units, dropouts=dropouts, densenet=True, training=self.training, seed=self.params["random_seed"]) fm_list.append(deep_out) fm_list.append(self.num_vars) fm_list.append(self.item_condition) fm_list.append(tf.cast(self.shipping, tf.float32)) out = tf.concat(fm_list, axis=-1) self.pred = tf.layers.Dense(1, kernel_initializer=tf.glorot_uniform_initializer(self.params["random_seed"]), dtype=tf.float32, bias_initializer=tf.zeros_initializer())(out) # intermediate meta self.meta = out #### loss self.rmse = tf.sqrt(tf.losses.mean_squared_error(self.target, self.pred)) # target is normalized, so std is 1 # we apply 3 sigma principle std = 1. 
self.loss = tf.losses.huber_loss(self.target, self.pred, delta=1. * std) # self.loss = self.rmse #### optimizer self.learning_rate = tf.placeholder(tf.float32, shape=[], name="learning_rate") if self.params["optimizer_type"] == "nadam": self.optimizer = LazyNadamOptimizer(learning_rate=self.learning_rate, beta1=self.params["beta1"], beta2=self.params["beta2"], epsilon=1e-8, schedule_decay=self.params["schedule_decay"]) elif self.params["optimizer_type"] == "adam": self.optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate, beta1=self.params["beta1"], beta2=self.params["beta2"], epsilon=1e-8) elif self.params["optimizer_type"] == "lazyadam": self.optimizer = tf.contrib.opt.LazyAdamOptimizer(learning_rate=self.learning_rate, beta1=self.params["beta1"], beta2=self.params["beta2"], epsilon=1e-8) elif self.params["optimizer_type"] == "adagrad": self.optimizer = tf.train.AdagradOptimizer(learning_rate=self.learning_rate, initial_accumulator_value=1e-7) elif self.params["optimizer_type"] == "gd": self.optimizer = tf.train.GradientDescentOptimizer(learning_rate=self.learning_rate) elif self.params["optimizer_type"] == "momentum": self.optimizer = tf.train.MomentumOptimizer(learning_rate=self.learning_rate, momentum=0.95) elif self.params["optimizer_type"] == "rmsprop": self.optimizer = tf.train.RMSPropOptimizer(learning_rate=self.learning_rate, decay=0.9, momentum=0.9, epsilon=1e-8) elif self.params["optimizer_type"] == "lazypowersign": self.optimizer = LazyPowerSignOptimizer(learning_rate=self.learning_rate) elif self.params["optimizer_type"] == "lazyaddsign": self.optimizer = LazyAddSignOptimizer(learning_rate=self.learning_rate) elif self.params["optimizer_type"] == "lazyamsgrad": self.optimizer = LazyAMSGradOptimizer(learning_rate=self.learning_rate, beta1=self.params["beta1"], beta2=self.params["beta2"], epsilon=1e-8) #### training op """ https://stackoverflow.com/questions/35803425/update-only-part-of-the-word-embedding-matrix-in-tensorflow TL;DR: The 
default implementation of opt.minimize(loss), TensorFlow will generate a sparse update for word_emb that modifies only the rows of word_emb that participated in the forward pass. The gradient of the tf.gather(word_emb, indices) op with respect to word_emb is a tf.IndexedSlices object (see the implementation for more details). This object represents a sparse tensor that is zero everywhere, except for the rows selected by indices. A call to opt.minimize(loss) calls AdamOptimizer._apply_sparse(word_emb_grad, word_emb), which makes a call to tf.scatter_sub(word_emb, ...)* that updates only the rows of word_emb that were selected by indices. If on the other hand you want to modify the tf.IndexedSlices that is returned by opt.compute_gradients(loss, word_emb), you can perform arbitrary TensorFlow operations on its indices and values properties, and create a new tf.IndexedSlices that can be passed to opt.apply_gradients([(word_emb, ...)]). For example, you could cap the gradients using MyCapper() (as in the example) using the following calls: grad, = opt.compute_gradients(loss, word_emb) train_op = opt.apply_gradients( [tf.IndexedSlices(MyCapper(grad.values), grad.indices)]) Similarly, you could change the set of indices that will be modified by creating a new tf.IndexedSlices with a different indices. * In general, if you want to update only part of a variable in TensorFlow, you can use the tf.scatter_update(), tf.scatter_add(), or tf.scatter_sub() operators, which respectively set, add to (+=) or subtract from (-=) the value previously stored in a variable. 
""" # # it's slow # grads = self.optimizer.compute_gradients(self.loss) # for i, (g, v) in enumerate(grads): # if g is not None: # if isinstance(g, tf.IndexedSlices): # grads[i] = (tf.IndexedSlices(tf.clip_by_norm(g.values, self.params["optimizer_clipnorm"]), g.indices), v) # else: # grads[i] = (tf.clip_by_norm(g, self.params["optimizer_clipnorm"]), v) # self.train_op = self.optimizer.apply_gradients(grads, global_step=self.global_step) update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) with tf.control_dependencies(update_ops): self.train_op = self.optimizer.minimize(self.loss)#, global_step=self.global_step) #### init self.sess, self.saver = self._init_session() # save model state to memory # https://stackoverflow.com/questions/46393983/how-can-i-restore-tensors-to-a-past-value-without-saving-the-value-to-disk/46511601 # https://stackoverflow.com/questions/33759623/tensorflow-how-to-save-restore-a-model/43333803#43333803 # Extract the global varibles from the graph. self.gvars = self.graph.get_collection(tf.GraphKeys.GLOBAL_VARIABLES) # Exract the Assign operations for later use. self.assign_ops = [self.graph.get_operation_by_name(v.op.name + "/Assign") for v in self.gvars] # Extract the initial value ops from each Assign op for later use. self.init_values = [assign_op.inputs[1] for assign_op in self.assign_ops]
def model(x, is_training, dropout_pro, num, weight_decay):
    """Build a five-convolution, three-dense network and return its logits.

    Args:
        x: Input tensor; it is reshaped to ``[-1, 32, 32, 10]`` before the
            first convolution.
        is_training: Forwarded as ``training`` to both dropout layers.
        dropout_pro: Rate passed to both dropout layers.
        num: Number of units of the final dense layer.
        weight_decay: Scale given to ``tf.contrib.layers.l2_regularizer`` for
            every convolution kernel and for the ``dense1``/``dense2``
            kernels. The output layer ``y`` carries no regularizer.

    Returns:
        Output of the ``y`` dense layer (no activation applied).
    """
    net = tf.reshape(x, [-1, 32, 32, 10])

    # (layer name, number of filters) for the 3x3 "same"-padded conv stack.
    conv_plan = (
        ("conv1", 64),
        ("conv2", 64),
        ("conv3", 128),
        ("conv4", 128),
        ("conv5", 256),
    )
    for layer_name, n_filters in conv_plan:
        net = tf.layers.conv2d(
            inputs=net,
            filters=n_filters,
            kernel_size=[3, 3],
            padding="same",
            kernel_initializer=tf.glorot_uniform_initializer(),
            activation=tf.nn.relu,
            kernel_regularizer=tf.contrib.layers.l2_regularizer(weight_decay),
            name=layer_name)
        if layer_name == "conv3":
            # Single pooling step of the network: 2x2 window, stride 2.
            net = tf.layers.max_pooling2d(
                inputs=net, pool_size=[2, 2], strides=2, name="pool1")

    # Flatten everything except the leading (batch) dimension.
    dims = net.get_shape()
    flat_width = dims[1].value * dims[2].value * dims[3].value
    net = tf.reshape(net, [-1, flat_width])

    # (dense name, dropout name, units) for the two hidden dense layers.
    dense_plan = (
        ("dense1", "dropout1", 1024),
        ("dense2", "dropout2", 2048),
    )
    for dense_name, dropout_name, n_units in dense_plan:
        net = tf.layers.dense(
            inputs=net,
            units=n_units,
            activation=tf.nn.relu,
            kernel_initializer=tf.glorot_uniform_initializer(),
            kernel_regularizer=tf.contrib.layers.l2_regularizer(weight_decay),
            name=dense_name)
        net = tf.layers.dropout(
            inputs=net, rate=dropout_pro, training=is_training,
            name=dropout_name)

    return tf.layers.dense(
        inputs=net,
        units=num,
        activation=None,
        kernel_initializer=tf.glorot_uniform_initializer(),
        name="y")
def _forward_pass(self, ):
    """Build the forward graph up to the prediction ``self.y_ui_a``.

    The method creates, in order: the user representation, the context
    representation, the attention-weighted item representation, the
    optional deep tower (when ``self.use_deep`` is truthy) and the output
    head. All intermediate tensors are stored as attributes on ``self``.

    Every ``tf.layers.dropout`` call passes ``training=self.train_phase``.
    ``tf.layers.dropout`` defaults to ``training=False``, in which case it
    returns its input unchanged, so without that argument none of the
    dropout layers below would ever drop anything.

    NOTE(review): assumes ``self.train_phase`` is a Python bool or a boolean
    tensor (it is the same value handed to ``_batch_norm_layer``) and that
    ``self.dropout_emb`` / ``self.dropout_deep`` are drop rates rather than
    keep probabilities -- confirm against the placeholder definitions.
    """
    # User representation.
    with tf.name_scope('user_express'):
        # Latent user vector.
        self.Usr_Emb = tf.nn.embedding_lookup(
            self.Wu_Emb, tf.cast(self.user_indices, tf.int32))  # [batch_size, dim_k]
        self.Usr_Feat = tf.nn.embedding_lookup(
            self.user_emb_feat, tf.cast(self.user_indices, tf.int32))  # [batch_size, dim_cf_emb]
        self.bias_usr_feat_emb = tf.get_variable(
            shape=[self.dim_k], initializer=tf.zeros_initializer(),
            dtype=tf.float32, name='bias_usr_feat_emb')
        self.Usr_Feat_Emb = tf.matmul(
            self.Usr_Feat, self.W_usr_feat_emb) + self.bias_usr_feat_emb  # [batch_size, dim_k]
        self.Usr_Feat_Emb = tf.nn.relu(self.Usr_Feat_Emb)
        self.Usr_Expr_a = self.Usr_Emb + self.Usr_Feat_Emb  # [batch_size, dim_k]

    # Context representation.
    with tf.name_scope('context_express'):
        self.bias_ctx_emb = tf.get_variable(
            shape=[self.dim_k], initializer=tf.zeros_initializer(),
            dtype=tf.float32, name='bias_ctx_emb')
        self.Ctx_Emb = tf.matmul(
            self.num_features, self.W_Ctx) + self.bias_ctx_emb  # [batch_size, dim_k]
        self.Ctx_Emb = self._batch_norm_layer(self.Ctx_Emb, self.train_phase, 'ctx_bn')
        self.Ctx_Emb = tf.nn.relu(self.Ctx_Emb)

    # Item representation.
    with tf.name_scope('item_express'):
        self.I_Wds_a = tf.SparseTensor(
            indices=tf.cast(self.item_words_indices_a, dtype=np.int64),
            values=self.item_words_values_a,
            dense_shape=[tf.cast(self.batch_size, dtype=np.int64), self.num_words])
        # User and context terms shared by every attention score below.
        self.att_u_a = tf.matmul(self.Usr_Expr_a, self.Wu_oh_Att)  # [batch_size, dim_att]
        self.att_ctx = tf.matmul(self.Ctx_Emb, self.Wctx_Att)
        self.att_oh = []

        # Word embedding and its attention score.
        self.bias_wds_emb = tf.get_variable(
            shape=[self.dim_k], initializer=tf.zeros_initializer(),
            dtype=tf.float32, name='bias_wds_emb')
        self.I_Wds_Emb_a = tf.sparse_tensor_dense_matmul(
            self.I_Wds_a, self.Wwords_Emb) + self.bias_wds_emb  # [batch_size, dim_k]
        self.I_Wds_Emb_a = tf.nn.relu(self.I_Wds_Emb_a)
        self.att_I_Wds = tf.matmul(self.I_Wds_Emb_a, self.WW_Att)
        self.att_I_Wds = tf.nn.relu(
            self.att_u_a + self.att_ctx + self.att_I_Wds + self.b_oh_Att)
        self.att_I_Wds = tf.matmul(self.att_I_Wds, self.w_oh_Att) + self.c_oh_Att
        self.att_oh.append(self.att_I_Wds)

        # Visual embedding (from the VAE encoder) and its attention score.
        vae_encoder = VAE(input_dim=2048, hidden_encoder_dim=1024,
                          latent_dim=self.dim_k, lam=0.001, kld_loss=0.001)
        self.I_visual_Emb, self.vae_loss = vae_encoder.get_vae_embbeding(self.visual_emb_feat)
        # TODO
        self.I_visual_Emb = self._batch_norm_layer(self.I_visual_Emb, self.train_phase, 'vis_bn')
        self.att_I_visual = tf.matmul(self.I_visual_Emb, self.W_visual_Att)
        self.att_I_visual = tf.nn.relu(
            self.att_u_a + self.att_ctx + self.att_I_visual + self.b_oh_Att)
        self.att_I_visual = tf.matmul(self.att_I_visual, self.w_oh_Att) + self.c_oh_Att
        self.att_oh.append(self.att_I_visual)

        # LDA embedding and its attention score.
        self.bias_lda_emb = tf.get_variable(
            shape=[self.dim_k], initializer=tf.zeros_initializer(),
            dtype=tf.float32, name='bias_lda_emb')
        self.I_LDA_Emb = tf.matmul(
            self.words_lda, self.W_LDA_emb) + self.bias_lda_emb  # [batch_size, dim_k]
        self.I_LDA_Emb = tf.nn.relu(self.I_LDA_Emb)
        self.att_I_LDA = tf.matmul(self.I_LDA_Emb, self.W_LDA_Att)
        self.att_I_LDA = tf.nn.relu(
            self.att_u_a + self.att_ctx + self.att_I_LDA + self.b_oh_Att)
        self.att_I_LDA = tf.matmul(self.att_I_LDA, self.w_oh_Att) + self.c_oh_Att
        self.att_oh.append(self.att_I_LDA)

        # One embedding and one attention score per one-hot field.
        self.I_One_hot_a = []
        for i in range(len(self.one_hots_dims)):
            I_Emb_temp_a = tf.nn.embedding_lookup(
                self.W_one_hots[i], tf.cast(self.one_hots_a[:, i], tf.int32))  # [batch_size, dim_k]
            att_oh_temp = tf.matmul(I_Emb_temp_a, self.Woh_Att[i])  # [batch_size, att_dim_k]
            att_temp = tf.nn.relu(
                self.att_u_a + self.att_ctx + att_oh_temp + self.b_oh_Att)  # [batch_size, att_dim_k]
            att_temp = tf.matmul(att_temp, self.w_oh_Att) + self.c_oh_Att  # [batch_size, 1]
            self.att_oh.append(att_temp)
            self.I_One_hot_a.append(I_Emb_temp_a)

        # Softmax across all scores. Columns 0, 1 and 2 weight the word,
        # visual and LDA embeddings; the remaining columns weight the
        # one-hot embeddings in order.
        self.att_oh = tf.nn.softmax(tf.concat(self.att_oh, axis=1))  # [batch_size, oh_dim]
        self.I_Wds_Emb_a = self.I_Wds_Emb_a * self.att_oh[:, 0:1]
        self.I_visual_Emb = self.I_visual_Emb * self.att_oh[:, 1:2]
        self.I_LDA_Emb = self.I_LDA_Emb * self.att_oh[:, 2:3]
        for i in range(3, len(self.one_hots_dims) + 3):
            self.I_One_hot_a[i - 3] = self.I_One_hot_a[i - 3] * self.att_oh[:, i:i + 1]
        self.Item_Expr_a = tf.add_n(
            self.I_One_hot_a + [self.I_visual_Emb, self.I_Wds_Emb_a, self.I_LDA_Emb])

    with tf.name_scope('deep'):
        if self.use_deep:
            self.Usr_emb_deep = tf.nn.embedding_lookup(
                self.Wu_deep_emb, tf.cast(self.user_indices, tf.int32))  # [batch_size, dim_k]
            self.I_Wds_emb_deep = tf.sparse_tensor_dense_matmul(
                self.I_Wds_a, self.Wwords_deep_emb)  # [batch_size, dim_k]
            self.I_one_hot_deep = []
            for i in range(len(self.one_hots_dims)):
                I_Emb_temp_a = tf.nn.embedding_lookup(
                    self.W_deep_one_hots[i], tf.cast(self.one_hots_a[:, i], tf.int32))  # [batch_size, dim_k]
                self.I_one_hot_deep.append(I_Emb_temp_a)

            self.deep_input = tf.concat(
                [self.num_features, self.visual_emb_feat] + self.I_one_hot_deep, axis=1)
            for i, deep_dim in enumerate(self.deep_dims):
                if i == 0:
                    self.deep_input = tf.layers.dense(
                        inputs=self.deep_input,
                        kernel_initializer=tf.glorot_uniform_initializer(),
                        units=deep_dim,
                        activation=tf.nn.relu, )
                    self.deep_input = self._batch_norm_layer(
                        self.deep_input, self.train_phase, 'deep_bn_{}'.format(i))
                else:
                    self.deep_input = self._hightway_layer(
                        self.deep_input, deep_dim, 'deep_highway_{}'.format(i))
                # Dropout after each deep layer; only active while training.
                self.deep_input = tf.layers.dropout(
                    inputs=self.deep_input, rate=self.dropout_deep,
                    training=self.train_phase)
            self.deep_output = tf.layers.dense(self.deep_input, 1, activation=None)
            self.deep_output = tf.reshape(self.deep_output, [-1])  # [batch_size]

    with tf.name_scope('output'):
        self.ctx_item_out = tf.layers.dropout(
            self.Ctx_Emb * self.Item_Expr_a, rate=self.dropout_emb,
            training=self.train_phase)
        # Item x user and context x user interactions, once for each of the
        # two user vectors.
        self.cf_outs = []
        for usr_expr in [self.Usr_Emb, self.Usr_Feat_Emb]:
            cf_out = tf.layers.dropout(
                self.Item_Expr_a * usr_expr, rate=self.dropout_emb,
                training=self.train_phase)
            ctx_usr_out = tf.layers.dropout(
                self.Ctx_Emb * usr_expr, rate=self.dropout_emb,
                training=self.train_phase)
            self.cf_outs.extend([cf_out, ctx_usr_out])

        if self.use_deep:
            self.concated = tf.concat(
                self.cf_outs + [self.ctx_item_out, self.deep_input], axis=1)
        else:
            self.concated = tf.concat(self.cf_outs + [self.ctx_item_out], axis=1)

        self.hidden = self.concated
        for i, dim in enumerate(self.dim_hidden_out):
            if i == 0:
                self.hidden = tf.layers.dense(
                    inputs=self.hidden,
                    kernel_initializer=tf.glorot_uniform_initializer(),
                    units=dim,
                    activation=tf.nn.relu)
            else:
                self.hidden = self._hightway_layer(
                    self.hidden, dim, 'out_highway_{}'.format(i))
            self.hidden = tf.layers.dropout(
                inputs=self.hidden, rate=self.dropout_deep,
                training=self.train_phase)

        # Per-user bias added to the final linear score before the sigmoid.
        self.bu = tf.nn.embedding_lookup(
            self.bias_u, tf.cast(self.user_indices, dtype=tf.int32))
        self.y_ui_a = tf.layers.dense(self.hidden, 1, activation=None) + self.bu
        self.y_ui_a = tf.reshape(tf.nn.sigmoid(self.y_ui_a), [-1])
def model_fn(features, labels, mode, params):
    """Estimator model function joining first-order, FM, CIN and DNN parts.

    Args:
        features: Passed through to ``embed`` together with the weights.
        labels: Targets; reshaped to ``[-1, 1]`` in TRAIN and EVAL modes.
        mode: One of the ``tf.estimator.ModeKeys`` values.
        params: Dict of hyper-parameters. Required keys: ``embedding_size``,
            ``learning_rate``, ``feature_field_size``, ``hidden_units``.
            Optional keys: ``use_dnn`` (default True), ``use_cin`` (default
            True), ``use_fm`` (default False), ``dropout_rate`` (default
            0.5), ``cin_layer_size`` (default ``[10, 10, 10]``).

    Returns:
        A ``tf.estimator.EstimatorSpec``: loss and train op for TRAIN,
        ``probabilities``/``logits`` predictions for PREDICT, loss and an
        ``auc`` metric for EVAL. Any other mode falls through and yields
        ``None``.
    """
    # Hyper-parameters.
    embedding_size = params['embedding_size']  # size of one field embedding
    learning_rate = params['learning_rate']
    field_size = params['feature_field_size'] + 10  # number of fields
    hidden_units = params['hidden_units']  # units of each hidden DNN layer
    use_dnn = params.get('use_dnn', True)
    use_cin = params.get('use_cin', True)
    use_fm = params.get('use_fm', False)
    dropout_rate = params.get('dropout_rate', 0.5)

    training = mode == tf.estimator.ModeKeys.TRAIN
    tf.logging.info(params)
    tf.logging.info('is_training: ' + str(training))

    weights = init_weights(params)
    first_order_outputs, embeddings = embed(features, weights, params)
    branch_outputs = [first_order_outputs]

    # Factorization-machine second-order term:
    # 0.5 * ((sum of embeddings)^2 - sum of (embeddings^2)) along axis 1.
    if use_fm:
        square_of_sum = tf.square(tf.reduce_sum(embeddings, axis=1))
        sum_of_squares = tf.reduce_sum(tf.square(embeddings), axis=1)
        branch_outputs.append(0.5 * tf.subtract(square_of_sum, sum_of_squares))

    # Compressed Interaction Network.
    if use_cin:
        cin_layer_size = params.get('cin_layer_size', [10, 10, 10])
        unit_splits = [1] * embedding_size
        base_slices = tf.split(embeddings, unit_splits, 2)
        previous_slices = base_slices
        previous_fields = field_size
        feature_maps = []
        for idx, layer_size in enumerate(cin_layer_size):
            conv_len = field_size * previous_fields
            # Cross the input slices with the previous layer's slices.
            crossed = tf.matmul(base_slices, previous_slices, transpose_b=True)
            crossed = tf.reshape(crossed, shape=[embedding_size, -1, conv_len])
            crossed = tf.transpose(crossed, perm=[1, 0, 2])

            kernel = tf.get_variable(name='filter_%d' % idx,
                                     shape=[1, conv_len, layer_size],
                                     initializer=tf.glorot_uniform_initializer(),
                                     dtype=tf.float32)
            bias = tf.get_variable('cin_b_%d' % idx,
                                   shape=[layer_size],
                                   initializer=tf.zeros_initializer(),
                                   dtype=tf.float32)
            compressed = tf.nn.conv1d(crossed, kernel, stride=1, padding='VALID')
            compressed = tf.nn.relu(tf.nn.bias_add(compressed, bias))
            compressed = tf.transpose(compressed, perm=[0, 2, 1])

            previous_slices = tf.split(compressed, unit_splits, 2)
            feature_maps.append(compressed)
            previous_fields = layer_size

        # Sum each feature map over its last axis, then project to 40 units.
        cin_features = tf.reduce_sum(tf.concat(feature_maps, axis=1), axis=2)
        cin_weights = tf.get_variable('cin_weights', dtype=tf.float32,
                                      shape=[cin_features.shape[1], 40],
                                      initializer=tf.glorot_normal_initializer())
        cin_bias = tf.get_variable('cin_bias', dtype=tf.float32, shape=[40],
                                   initializer=tf.zeros_initializer())
        cin_features = tf.nn.xw_plus_b(cin_features, cin_weights, cin_bias)
        cin_features = tf.layers.dropout(cin_features, rate=dropout_rate,
                                         training=training)
        branch_outputs.append(cin_features)

    # Plain feed-forward tower on the flattened embeddings.
    if use_dnn:
        tower = tf.reshape(embeddings, shape=[-1, field_size * embedding_size])
        for units in hidden_units:
            tower = tf.layers.dense(tower, units, activation=tf.nn.relu)
            tower = tf.layers.dropout(tower, rate=dropout_rate, training=training)
        branch_outputs.append(tower)

    # Output head.
    merged = tf.concat(branch_outputs, axis=1)
    merged = tf.layers.dense(merged, 25, activation=tf.nn.relu)
    merged = tf.layers.dropout(merged, rate=dropout_rate, training=training)
    logits = tf.layers.dense(merged, 1, activation=None)

    def _mean_sigmoid_loss(targets):
        # Mean sigmoid cross-entropy between the logits and the targets.
        return tf.reduce_mean(
            tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=targets))

    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(
            mode=mode,
            predictions={
                'probabilities': tf.sigmoid(logits),
                'logits': logits,
            })

    if mode == tf.estimator.ModeKeys.TRAIN:
        labels = tf.reshape(labels, shape=[-1, 1])
        loss = _mean_sigmoid_loss(labels)
        optimizer = tf.train.AdagradOptimizer(learning_rate=learning_rate,
                                              initial_accumulator_value=1e-8)
        train_op = optimizer.minimize(loss, global_step=tf.train.get_global_step())
        return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)

    if mode == tf.estimator.ModeKeys.EVAL:
        labels = tf.reshape(labels, shape=[-1, 1])
        eval_metric_ops = {"auc": tf.metrics.auc(labels, tf.sigmoid(logits))}
        loss = _mean_sigmoid_loss(labels)
        return tf.estimator.EstimatorSpec(
            mode=mode, loss=loss, eval_metric_ops=eval_metric_ops)
def get_instance(args):  # pylint: disable=unused-argument
    """Return a Glorot-uniform initializer seeded with the module's ``SEED``.

    ``args`` is accepted but never read.
    """
    initializer = tf.glorot_uniform_initializer(seed=SEED)
    return initializer
def identity3d_residual_block(X,name,num_channels,mid_filter_shape,is_training,
                              dropout_rate=0.0,apply_batchnorm=False,weight_decay=None,
                              initializer=tf.glorot_uniform_initializer()):
    '''
    DESCRIPTION:
        3D identity residual block with a bottleneck main branch: a 1x1x1
        convolution, a convolution with `mid_filter_shape`, and a second
        1x1x1 convolution whose output is added to the unchanged input.
        The sum is rectified and then passed through dropout.
    USAGE:
        INPUT:
            X               : the input tensor to this layer; its channel
                              count is read from index 4 of its static shape
            name            : name of the variable scope wrapping the block
            num_channels    : output channels of the three main-branch
                              sub-layers, indexed as [F1,F2,F3]; F3 must
                              equal the channel count of X
            mid_filter_shape: filter shape handed to the middle sub-layer
            is_training     : forwarded to every sub-layer and to the final
                              dropout
            dropout_rate    : forwarded to the first sub-layer and used as
                              the rate of the final dropout
            apply_batchnorm, weight_decay, initializer:
                              forwarded unchanged to every sub-layer
        OUTPUT:
            A               : the output activation of this block
        RAISES:
            AssertionError  : when num_channels[2] differs from the channel
                              count of X
    '''
    # Keyword arguments that all three main-branch convolutions share.
    shared_kwargs=dict(is_training=is_training,
                       apply_batchnorm=apply_batchnorm,
                       weight_decay=weight_decay,
                       initializer=initializer)
    unit_shape=(1,1,1)

    with tf.variable_scope(name):
        # Main branch, step 1: 1x1x1 convolution to F1 channels.
        squeezed=rectified_conv3d(X,name='branch_2a',
                                  filter_shape=unit_shape,
                                  output_channel=num_channels[0],
                                  stride=unit_shape,
                                  padding_type='VALID',
                                  dropout_rate=dropout_rate,
                                  **shared_kwargs)

        # Main branch, step 2: the mid-sized filter, 'SAME' padded.
        mixed=rectified_conv3d(squeezed,name='branch_2b',
                               filter_shape=mid_filter_shape,
                               output_channel=num_channels[1],
                               stride=unit_shape,
                               padding_type='SAME',
                               **shared_kwargs)

        # The shortcut is an identity, so the last sub-layer has to restore
        # exactly the input's channel count.
        input_channels=X.get_shape().as_list()[4]
        if input_channels!=num_channels[2]:
            raise AssertionError('IdentityBlock: last sub-layer channel size should be same to input')

        # Main branch, step 3: 1x1x1 convolution back to F3 channels. No
        # ReLU and no dropout here; both are applied after the shortcut sum.
        restored=rectified_conv3d(mixed,name='branch_2c',
                                  filter_shape=unit_shape,
                                  output_channel=num_channels[2],
                                  stride=unit_shape,
                                  padding_type='VALID',
                                  dropout_rate=0.0,
                                  apply_relu=False,
                                  **shared_kwargs)

        # Skip connection: element-wise sum with the input, then ReLU and
        # dropout on the result.
        with tf.variable_scope('skip_conn'):
            summed=tf.add(restored,X)
            activated=tf.nn.relu(summed,name='relu')
            activated=tf.layers.dropout(activated,rate=dropout_rate,
                                        training=is_training,name='dropout')

    return activated
def construct_network(self): self.word_ids = tf.placeholder(tf.int32, [None, None], name="word_ids") self.char_ids = tf.placeholder(tf.int32, [None, None, None], name="char_ids") self.sentence_lengths = tf.placeholder(tf.int32, [None], name="sentence_lengths") self.word_lengths = tf.placeholder(tf.int32, [None, None], name="word_lengths") self.label_ids = tf.placeholder(tf.int32, [None, None], name="label_ids") self.learningrate = tf.placeholder(tf.float32, name="learningrate") self.is_training = tf.placeholder(tf.int32, name="is_training") self.context_emb = tf.placeholder( tf.float32, [None, None, self.config['bert_emb_dim']], name="context_emb") self.loss = 0.0 input_tensor = None input_vector_size = 0 self.initializer = None if self.config["initializer"] == "normal": self.initializer = tf.random_normal_initializer(mean=0.0, stddev=0.1) elif self.config["initializer"] == "glorot": self.initializer = tf.glorot_uniform_initializer() elif self.config["initializer"] == "xavier": self.initializer = tf.glorot_normal_initializer() else: raise ValueError("Unknown initializer") self.word_embeddings = tf.get_variable( "word_embeddings", shape=[len(self.word2id), self.config["word_embedding_size"]], initializer=(tf.zeros_initializer() if self.config["emb_initial_zero"] == True else self.initializer), trainable=(True if self.config["train_embeddings"] == True else False)) input_tensor = tf.nn.embedding_lookup(self.word_embeddings, self.word_ids) input_vector_size = self.config["word_embedding_size"] print(input_tensor.get_shape()) input_tensor = tf.concat([input_tensor, self.context_emb], axis=2) print(input_tensor.get_shape()) if self.config["char_embedding_size"] > 0 and self.config[ "char_recurrent_size"] > 0: with tf.variable_scope("chars"), tf.control_dependencies([ tf.assert_equal(tf.shape(self.char_ids)[2], tf.reduce_max(self.word_lengths), message="Char dimensions don't match") ]): self.char_embeddings = tf.get_variable( "char_embeddings", shape=[ len(self.char2id), 
self.config["char_embedding_size"] ], initializer=self.initializer, trainable=True) char_input_tensor = tf.nn.embedding_lookup( self.char_embeddings, self.char_ids) s = tf.shape(char_input_tensor) char_input_tensor = tf.reshape( char_input_tensor, shape=[ s[0] * s[1], s[2], self.config["char_embedding_size"] ]) _word_lengths = tf.reshape(self.word_lengths, shape=[s[0] * s[1]]) char_lstm_cell_fw = tf.nn.rnn_cell.LSTMCell( self.config["char_recurrent_size"], use_peepholes=self.config["lstm_use_peepholes"], state_is_tuple=True, initializer=self.initializer, reuse=False) char_lstm_cell_bw = tf.nn.rnn_cell.LSTMCell( self.config["char_recurrent_size"], use_peepholes=self.config["lstm_use_peepholes"], state_is_tuple=True, initializer=self.initializer, reuse=False) char_lstm_outputs = tf.nn.bidirectional_dynamic_rnn( char_lstm_cell_fw, char_lstm_cell_bw, char_input_tensor, sequence_length=_word_lengths, dtype=tf.float32, time_major=False) _, ((_, char_output_fw), (_, char_output_bw)) = char_lstm_outputs char_output_tensor = tf.concat( [char_output_fw, char_output_bw], axis=-1) char_output_tensor = tf.reshape( char_output_tensor, shape=[s[0], s[1], 2 * self.config["char_recurrent_size"]]) char_output_vector_size = 2 * self.config["char_recurrent_size"] if self.config["lmcost_char_gamma"] > 0.0: self.loss += self.config[ "lmcost_char_gamma"] * self.construct_lmcost( char_output_tensor, char_output_tensor, self.sentence_lengths, self.word_ids, "separate", "lmcost_char_separate") if self.config["lmcost_joint_char_gamma"] > 0.0: self.loss += self.config[ "lmcost_joint_char_gamma"] * self.construct_lmcost( char_output_tensor, char_output_tensor, self.sentence_lengths, self.word_ids, "joint", "lmcost_char_joint") if self.config["char_hidden_layer_size"] > 0: char_hidden_layer_size = self.config[ "word_embedding_size"] if self.config[ "char_integration_method"] == "attention" else self.config[ "char_hidden_layer_size"] char_output_tensor = tf.layers.dense( char_output_tensor, 
char_hidden_layer_size, activation=tf.tanh, kernel_initializer=self.initializer) char_output_vector_size = char_hidden_layer_size if self.config["char_integration_method"] == "concat": input_tensor = tf.concat( [input_tensor, char_output_tensor], axis=-1) input_vector_size += char_output_vector_size elif self.config["char_integration_method"] == "attention": assert ( char_output_vector_size == self.config["word_embedding_size"] ), "This method requires the char representation to have the same size as word embeddings" static_input_tensor = tf.stop_gradient(input_tensor) is_unk = tf.equal(self.word_ids, self.word2id[self.UNK]) char_output_tensor_normalised = tf.nn.l2_normalize( char_output_tensor, 2) static_input_tensor_normalised = tf.nn.l2_normalize( static_input_tensor, 2) cosine_cost = 1.0 - tf.reduce_sum(tf.multiply( char_output_tensor_normalised, static_input_tensor_normalised), axis=2) is_padding = tf.logical_not( tf.sequence_mask(self.sentence_lengths, maxlen=tf.shape(self.word_ids)[1])) cosine_cost_unk = tf.where(tf.logical_or( is_unk, is_padding), x=tf.zeros_like(cosine_cost), y=cosine_cost) self.loss += self.config[ "char_attention_cosine_cost"] * tf.reduce_sum( cosine_cost_unk) attention_evidence_tensor = tf.concat( [input_tensor, char_output_tensor], axis=2) attention_output = tf.layers.dense( attention_evidence_tensor, self.config["word_embedding_size"], activation=tf.tanh, kernel_initializer=self.initializer) attention_output = tf.layers.dense( attention_output, self.config["word_embedding_size"], activation=tf.sigmoid, kernel_initializer=self.initializer) input_tensor = tf.multiply( input_tensor, attention_output) + tf.multiply( char_output_tensor, (1.0 - attention_output)) elif self.config["char_integration_method"] == "none": input_tensor = input_tensor else: raise ValueError("Unknown char integration method") dropout_input = self.config["dropout_input"] * tf.cast( self.is_training, tf.float32) + ( 1.0 - tf.cast(self.is_training, tf.float32)) 
input_tensor = tf.nn.dropout(input_tensor, dropout_input, name="dropout_word") word_lstm_cell_fw = tf.nn.rnn_cell.LSTMCell( self.config["word_recurrent_size"], use_peepholes=self.config["lstm_use_peepholes"], state_is_tuple=True, initializer=self.initializer, reuse=False) word_lstm_cell_bw = tf.nn.rnn_cell.LSTMCell( self.config["word_recurrent_size"], use_peepholes=self.config["lstm_use_peepholes"], state_is_tuple=True, initializer=self.initializer, reuse=False) with tf.control_dependencies([ tf.assert_equal(tf.shape(self.word_ids)[1], tf.reduce_max(self.sentence_lengths), message="Sentence dimensions don't match") ]): (lstm_outputs_fw, lstm_outputs_bw), _ = tf.nn.bidirectional_dynamic_rnn( word_lstm_cell_fw, word_lstm_cell_bw, input_tensor, sequence_length=self.sentence_lengths, dtype=tf.float32, time_major=False) dropout_word_lstm = self.config["dropout_word_lstm"] * tf.cast( self.is_training, tf.float32) + ( 1.0 - tf.cast(self.is_training, tf.float32)) lstm_outputs_fw = tf.nn.dropout(lstm_outputs_fw, dropout_word_lstm) lstm_outputs_bw = tf.nn.dropout(lstm_outputs_bw, dropout_word_lstm) if self.config["lmcost_lstm_gamma"] > 0.0: self.loss += self.config[ "lmcost_lstm_gamma"] * self.construct_lmcost( lstm_outputs_fw, lstm_outputs_bw, self.sentence_lengths, self.word_ids, "separate", "lmcost_lstm_separate") if self.config["lmcost_joint_lstm_gamma"] > 0.0: self.loss += self.config[ "lmcost_joint_lstm_gamma"] * self.construct_lmcost( lstm_outputs_fw, lstm_outputs_bw, self.sentence_lengths, self.word_ids, "joint", "lmcost_lstm_joint") processed_tensor = tf.concat([lstm_outputs_fw, lstm_outputs_bw], 2) processed_tensor_size = self.config["word_recurrent_size"] * 2 if self.config["hidden_layer_size"] > 0: processed_tensor = tf.layers.dense( processed_tensor, self.config["hidden_layer_size"], activation=tf.tanh, kernel_initializer=self.initializer) processed_tensor_size = self.config["hidden_layer_size"] self.scores = tf.layers.dense(processed_tensor, len(self.label2id), 
activation=None, kernel_initializer=self.initializer, name="output_ff") if self.config["crf_on_top"] == True: crf_num_tags = self.scores.get_shape()[2].value self.crf_transition_params = tf.get_variable( "output_crf_transitions", [crf_num_tags, crf_num_tags], initializer=self.initializer) log_likelihood, self.crf_transition_params = tf.contrib.crf.crf_log_likelihood( self.scores, self.label_ids, self.sentence_lengths, transition_params=self.crf_transition_params) self.loss += self.config["main_cost"] * tf.reduce_sum( -log_likelihood) else: self.probabilities = tf.nn.softmax(self.scores) self.predictions = tf.argmax(self.probabilities, 2) loss_ = tf.nn.sparse_softmax_cross_entropy_with_logits( logits=self.scores, labels=self.label_ids) mask = tf.sequence_mask(self.sentence_lengths, maxlen=tf.shape(self.word_ids)[1]) loss_ = tf.boolean_mask(loss_, mask) self.loss += self.config["main_cost"] * tf.reduce_sum(loss_) self.train_op = self.construct_optimizer(self.config["opt_strategy"], self.loss, self.learningrate, self.config["clip"])
def model_fn(features, labels, mode, params, config):
    """Build the detection graph and return the ``EstimatorSpec`` for ``mode``.

    The graph is assembled in stages: shared backbone, feature pyramid, RPN,
    reference (support) image features, Fast R-CNN head, and -- outside of
    PREDICT mode -- the losses, optimizer and summaries.

    NOTE(review): this body was rebuilt from a whitespace-flattened source, so
    the nesting of the ``tf.variable_scope`` blocks is a best guess. Scope
    nesting affects variable names; confirm it against existing checkpoints.

    Args:
        features: mapping providing ``"image"`` and ``"image_window"``.
        labels: mapping providing ``"gt_box_labels"``, ``"minibatch_indices"``,
            ``"minibatch_encode_gtboxes"`` and ``"minibatch_objects_one_hot"``.
            May be ``None`` in PREDICT mode.
        mode: one of ``tf.estimator.ModeKeys``.
        params: mapping whose ``"net_config"`` entry is the network
            configuration object.
        config: unused; present to satisfy the Estimator ``model_fn``
            signature.

    Returns:
        ``tf.estimator.EstimatorSpec``: with loss, train op and a summary hook
        for TRAIN; loss and predictions for EVAL; predictions only for PREDICT.

    Raises:
        ValueError: if ``mode`` is not TRAIN, EVAL or PREDICT.
    """
    mode_keys = tf.estimator.ModeKeys
    net_config = params["net_config"]
    is_training = mode == mode_keys.TRAIN
    is_predict = mode == mode_keys.PREDICT

    # ------------------------------------------------------------------
    # Shared backbone
    # ------------------------------------------------------------------
    origin_image_batch = features["image"]
    image_window = features["image_window"]
    image_batch = origin_image_batch - net_config.PIXEL_MEANS
    # The backbone is always built with is_training=False, whatever the mode.
    _, share_net = get_network_byname(inputs=image_batch,
                                      config=net_config,
                                      is_training=False,
                                      reuse=tf.AUTO_REUSE)

    # ------------------------------------------------------------------
    # FPN
    # ------------------------------------------------------------------
    feature_pyramid = build_fpn.build_feature_pyramid(share_net, net_config)

    # ------------------------------------------------------------------
    # RPN
    # ------------------------------------------------------------------
    # Estimator.predict() invokes model_fn with labels=None, so every access
    # to `labels` has to tolerate that.
    gtboxes_and_label_batch = (labels.get("gt_box_labels")
                               if labels is not None else None)
    rpn = build_rpn.RPN(feature_pyramid=feature_pyramid,
                        image_window=image_window,
                        config=net_config)
    rpn_proposals_boxes, rpn_proposals_scores = rpn.rpn_proposals(is_training)
    if not is_predict:
        rpn_location_loss, rpn_classification_loss = rpn.rpn_losses(
            labels["minibatch_indices"],
            labels["minibatch_encode_gtboxes"],
            labels["minibatch_objects_one_hot"])
        rpn_total_loss = rpn_classification_loss + rpn_location_loss

    # ------------------------------------------------------------------
    # Reference (support) images
    # ------------------------------------------------------------------
    reference_image = load_reference_image()
    reference_image = tf.cast(reference_image, tf.float32)
    reference_image = reference_image - net_config.PIXEL_MEANS
    _, reference_share_net = get_network_byname(inputs=reference_image,
                                                config=net_config,
                                                is_training=False,
                                                reuse=tf.AUTO_REUSE)
    reference_feature_pyramid = build_fpn.build_feature_pyramid(
        reference_share_net, net_config)

    roi_shape = (net_config.ROI_SIZE, net_config.ROI_SIZE)
    with tf.variable_scope('reference_feature_origision'):
        # Resize every pyramid level to ROI_SIZE x ROI_SIZE, then average over
        # the support-image axis introduced by the reshape below.
        for key in reference_feature_pyramid:
            resized = tf.image.resize_bilinear(reference_feature_pyramid[key],
                                               roi_shape)
            reference_feature_pyramid[key] = tf.reduce_mean(
                tf.reshape(resized,
                           (net_config.NUM_CLASS - 1,
                            net_config.NUM_SUPPROTS,
                            net_config.ROI_SIZE,
                            net_config.ROI_SIZE,
                            256)),
                axis=1)
        # Average across pyramid levels.
        reference_fpn_features = tf.reduce_mean(
            tf.stack(list(reference_feature_pyramid.values()), axis=0),
            axis=0)

    # Derive one extra "negative" feature map from all class features.
    with tf.variable_scope("reference_negative"):
        with slim.arg_scope(
                [slim.conv2d],
                padding="SAME",
                weights_initializer=tf.glorot_uniform_initializer(),
                weights_regularizer=slim.l2_regularizer(
                    net_config.WEIGHT_DECAY)):
            # Fold the leading (class) axis into the channel axis.
            positive_features = tf.reshape(
                tf.transpose(reference_fpn_features, (1, 2, 0, 3)),
                (1, net_config.ROI_SIZE, net_config.ROI_SIZE,
                 (net_config.NUM_CLASS - 1) * 256))
            negative_feature = slim.conv2d(positive_features,
                                           num_outputs=256,
                                           kernel_size=[3, 3],
                                           stride=1)
            total_refernece_feature = tf.concat(
                [negative_feature, reference_fpn_features], axis=0)

    # ------------------------------------------------------------------
    # Fast R-CNN
    # ------------------------------------------------------------------
    # NOTE(review): in PREDICT mode gtboxes_and_label is None; this assumes
    # FastRCNN only consumes it inside fast_rcnn_loss() -- confirm.
    fast_rcnn = build_fast_rcnn.FastRCNN(
        feature_pyramid=feature_pyramid,
        rpn_proposals_boxes=rpn_proposals_boxes,
        origin_image=origin_image_batch,
        gtboxes_and_label=gtboxes_and_label_batch,
        reference_feature=total_refernece_feature,
        config=net_config,
        is_training=False,
        image_window=image_window)
    detections = fast_rcnn.fast_rcnn_detection()

    if DEBUG:
        rpn_proposals_vision = draw_boxes_with_scores(
            origin_image_batch[0, :, :, :],
            rpn_proposals_boxes[0, :50, :],
            rpn_proposals_scores[0, :50])
        fast_rcnn_vision = draw_boxes_with_categories_and_scores(
            origin_image_batch[0, :, :, :],
            detections[0, :, :4],
            detections[0, :, 4],
            detections[0, :, 5])
        tf.summary.image("rpn_proposals_vision", rpn_proposals_vision)
        tf.summary.image("fast_rcnn_vision", fast_rcnn_vision)

    def build_predictions():
        # Built lazily so the TRAIN graph gains no extra slicing ops.
        return {
            "predict_bbox": detections[:, :, :4],
            "predict_class_id": detections[:, :, 5],
            "predict_scores": detections[:, :, 4]
        }

    if is_predict:
        # No labels are available, so no loss or optimizer can be built.
        return tf.estimator.EstimatorSpec(mode,
                                          predictions=build_predictions())

    fast_rcnn_location_loss, fast_rcnn_classification_loss = \
        fast_rcnn.fast_rcnn_loss()
    fast_rcnn_total_loss = (5.0 * fast_rcnn_classification_loss
                            + fast_rcnn_location_loss)

    # ------------------------------------------------------------------
    # Losses and optimizer
    # ------------------------------------------------------------------
    with tf.variable_scope("regularization_losses"):
        # Size-normalised L2 penalty on every trainable variable whose name
        # contains neither 'gamma' nor 'beta'.
        regularization_list = [
            tf.nn.l2_loss(w.read_value()) * net_config.WEIGHT_DECAY /
            tf.cast(tf.size(w.read_value()), tf.float32)
            for w in tf.trainable_variables()
            if 'gamma' not in w.name and 'beta' not in w.name
        ]
        regularization_losses = tf.add_n(regularization_list)

    total_loss = regularization_losses + fast_rcnn_total_loss + rpn_total_loss
    global_step = slim.get_or_create_global_step()
    tf.train.init_from_checkpoint(
        net_config.CHECKPOINT_DIR,
        {net_config.NET_NAME + "/": net_config.NET_NAME + "/"})

    with tf.variable_scope("optimizer"):
        lr = tf.train.piecewise_constant(
            global_step,
            boundaries=[
                np.int64(net_config.BOUNDARY[0]),
                np.int64(net_config.BOUNDARY[1])
            ],
            values=[
                net_config.LEARNING_RATE,
                net_config.LEARNING_RATE / 10,
                net_config.LEARNING_RATE / 100
            ])
        optimizer = tf.train.MomentumOptimizer(lr,
                                               momentum=net_config.MOMENTUM)
        optimizer = tf.contrib.estimator.TowerOptimizer(optimizer)
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        # Run the collected update ops before gradients are computed.
        with tf.control_dependencies([tf.group(*update_ops)]):
            grads = optimizer.compute_gradients(total_loss)
            # Clip each gradient to a norm of 5.0; variables without a
            # gradient are left untouched.
            for i, (g, v) in enumerate(grads):
                if g is not None:
                    grads[i] = (tf.clip_by_norm(g, 5.0), v)
            train_op = optimizer.apply_gradients(grads, global_step)

    # ------------------------------------------------------------------
    # Summaries
    # ------------------------------------------------------------------
    tf.summary.scalar('rpn/rpn_location_loss', rpn_location_loss)
    tf.summary.scalar('rpn/rpn_classification_loss', rpn_classification_loss)
    tf.summary.scalar('rpn/rpn_total_loss', rpn_total_loss)
    tf.summary.scalar('fast_rcnn/fast_rcnn_location_loss',
                      fast_rcnn_location_loss)
    tf.summary.scalar('fast_rcnn/fast_rcnn_classification_loss',
                      fast_rcnn_classification_loss)
    tf.summary.scalar('fast_rcnn/fast_rcnn_total_loss', fast_rcnn_total_loss)
    tf.summary.scalar('learning_rate', lr)
    tf.summary.scalar('total_loss', total_loss)
    summary_hook = tf.train.SummarySaverHook(
        save_steps=net_config.SAVE_EVERY_N_STEP,
        output_dir=net_config.MODLE_DIR,
        summary_op=tf.summary.merge_all())

    if mode == mode_keys.TRAIN:
        return tf.estimator.EstimatorSpec(mode,
                                          loss=total_loss,
                                          train_op=train_op,
                                          training_hooks=[summary_hook])
    if mode == mode_keys.EVAL:
        return tf.estimator.EstimatorSpec(mode,
                                          loss=total_loss,
                                          predictions=build_predictions())
    raise ValueError("Unsupported mode: {!r}".format(mode))
def rectified_conv3d(X, name, filter_shape, output_channel,
                     stride, padding_type, is_training, dropout_rate=0.0,
                     apply_batchnorm=False, weight_decay=None, apply_relu=True,
                     initializer=tf.glorot_uniform_initializer()):
    '''
    DESCRIPTION:
        Apply a 3D convolution to a 5-D input, followed by either batch
        normalization or a bias add, and optionally by ReLU and dropout.
        The input is expected in TensorFlow's default NDHWC layout:
            [batch_size, depth, height, width, channels]
        so, unlike the 2D version, the filter also slides along the depth
        axis.
    USAGE:
        INPUT:
            X               : the input (tensor) to this layer; its channel
                                count is read from axis 4
            name            : unique name (string) for this layer, used as
                                its variable scope
            filter_shape    : (tuple) of form
                                (filter_depth,filter_height,filter_width)
            output_channel  : (int) the number of output channels
            stride          : (tuple) of form
                                (stride_depth,stride_height,stride_width)
            padding_type    : (string) 'SAME' or 'VALID'
            is_training     : training-mode flag, forwarded to batch norm
                                and dropout
            dropout_rate    : (float) fraction of the ReLU output that is
                                dropped; only used when apply_relu is set
            apply_batchnorm : (boolean) use batch normalization instead of a
                                bias term. Default False
            weight_decay    : (float or None) forwarded to
                                get_variable_on_cpu when creating the filter;
                                it is not passed for the bias
            apply_relu      : (boolean) apply ReLU (and dropout) before
                                returning. Disable it when the activation is
                                applied by the caller, e.g. after a skip
                                connection or before a softmax
            initializer     : initializer for the filter variable
        OUTPUT:
            A               : the activation of this layer (the
                                pre-activation when apply_relu is falsy)
        RAISES:
            AssertionError  : if padding_type is neither 'SAME' nor 'VALID'
    '''
    # Validate before touching the graph so that a bad call does not leave a
    # half-built layer (a created 'W' variable) behind.
    if padding_type not in ('SAME', 'VALID'):
        raise AssertionError('Please use SAME/VALID string for padding_type')

    with tf.variable_scope(name):
        # Filter layout: [depth, height, width, in_channels, out_channels].
        input_channel = X.get_shape().as_list()[4]
        fd, fh, fw = filter_shape
        net_filter_shape = (fd, fh, fw, input_channel, output_channel)
        filters = get_variable_on_cpu('W', net_filter_shape, initializer,
                                      weight_decay)

        # The batch and channel strides must stay 1.
        sd, sh, sw = stride
        net_stride = (1, sd, sh, sw, 1)

        Z_conv = tf.nn.conv3d(X, filters, net_stride, padding_type,
                              name='conv3d')

        if apply_batchnorm:
            Z = _batch_normalization3d(Z_conv, is_training)
        else:
            # A bias is only added when batch normalization is not used.
            net_bias_shape = (1, 1, 1, 1, output_channel)
            bias_initializer = tf.zeros_initializer()
            # No weight decay is requested for the bias.
            biases = get_variable_on_cpu('b', net_bias_shape,
                                         bias_initializer)
            Z = tf.add(Z_conv, biases, name='bias_add')

        if apply_relu:
            with tf.variable_scope('rl_dp'):
                A = tf.nn.relu(Z, name='relu')
                # Dropout is applied only after the rectification.
                A = tf.layers.dropout(A, rate=dropout_rate,
                                      training=is_training, name='dropout')
        else:
            # The caller applies its own activation.
            A = Z

    return A
def call(self, inputs, training=True):
    """Run the embedding -> LSTM -> CRF -> dense pipeline on one batch.

    NOTE(review): this body was rebuilt from a whitespace-flattened source;
    in particular the residual projection is assumed to sit after the LSTM
    layer loop rather than inside it -- confirm.

    :param inputs: a 3-element sequence ``(inputs_seq, masks, length)``.
        With ``params['elmo']`` set, ``inputs_seq`` must be an iterable of
        id sequences that ``self.dic`` can translate.
    :param training: unused by this method.
    :return: output of the final dense layer, which has
        ``params['n_classes']`` units and a softmax activation.
    :raises ValueError: if neither ``params['elmo']`` nor
        ``params['word_embedding']`` is enabled.
    """
    inputs_seq, masks, length = inputs
    # NOTE(review): tf.squeeze without an axis drops every size-1 dimension,
    # so a batch of one would collapse to a scalar -- confirm length's shape.
    length = tf.squeeze(length)

    # Build the input embedding for the next layer.
    with tf.variable_scope(name_or_scope='input_embedding_scope',
                           reuse=tf.AUTO_REUSE):
        input_embedding = None

        # ELMo as the input embedding.
        if self.params.get('elmo'):
            elmo = hub.Module("https://tfhub.dev/google/elmo/2",
                              trainable=False)
            # ELMo's 'tokens' signature needs words, so map every sequence
            # back through self.dic. This iterates over inputs_seq itself
            # (not the 3-tuple `inputs`) and leaves the caller's list intact.
            tokens = [[self.dic[v] for v in seq] for seq in inputs_seq]
            input_embedding = elmo(inputs={
                'tokens': tokens,
                'sequence_len': length
            },
                                   signature='tokens',
                                   as_dict=True)['elmo']

        # BERT as the input embedding.
        if self.params.get('bert'):
            # TODO embed bert model here.
            pass

        # GloVe/word2vec lookup; if ELMo is also enabled this one wins.
        if self.params.get('word_embedding'):
            assert self.embedding is not None
            input_embedding = tf.nn.embedding_lookup(self.embedding,
                                                     inputs_seq,
                                                     name='input_embedding')

        # Char embedding as a supplementary embedding.
        if self.params.get('char_embedding'):
            # TODO embed char embedding here, need to think about how to
            # store the instance.
            pass

        if input_embedding is None:
            raise ValueError(
                "No input embedding configured: enable params['elmo'] or "
                "params['word_embedding']")

        mask_embedding = tf.nn.embedding_lookup(self._mask_embedding,
                                                masks,
                                                name='mask_embedding')
        # Concatenate the input and mask embeddings along the last axis.
        input_embedding = tf.concat([input_embedding, mask_embedding],
                                    axis=-1)

    with tf.variable_scope('lstm_part', reuse=tf.AUTO_REUSE):
        lstm_output = input_embedding
        for layer_index in range(self.params.get('layer_num')):
            lstm_output = self.add_lstm_layer(inputs=lstm_output,
                                              length=length,
                                              layer_name=layer_index)
        if self.params.get('if_residual'):
            # Project to word_dimension + mask_dim units before adding the
            # LSTM output back onto the input embedding.
            lstm_output = input_embedding + tf.layers.dense(
                inputs=lstm_output,
                units=self.params.get('word_dimension') +
                self.params.get('mask_dim'))

    # CRF layer
    with tf.variable_scope('crf_layer', reuse=tf.AUTO_REUSE):
        crf_input = tf.layers.dense(
            lstm_output,
            units=2,
            bias_initializer=tf.glorot_uniform_initializer())
        crf_layer_ = crf_layer(inputs=crf_input,
                               sequence_lengths=length,
                               transition_prob=self.transition)
        # Keep only the last entry of the final axis.
        # presumably batch_size * seq_len -- verify against crf_layer.
        crf_output = crf_layer_.crf_output_prob()[:, :, -1]
        # Insert an axis at position 1 so the matmul below is a batched
        # (1 x seq_len) @ (seq_len x dim) product.
        crf_output = tf.expand_dims(crf_output, axis=1)
        sentiment_vector = tf.matmul(crf_output, lstm_output)

    # Output layer
    with tf.variable_scope('logits', reuse=tf.AUTO_REUSE):
        logits = tf.layers.dense(
            inputs=sentiment_vector,
            units=self.params.get('n_classes'),
            activation='softmax',
            bias_initializer=tf.glorot_uniform_initializer())
    return logits
def convolutional3d_residual_block(X, name, num_channels,
                                   first_filter_stride, mid_filter_shape,
                                   is_training,
                                   dropout_rate=0.0, apply_batchnorm=False,
                                   weight_decay=None,
                                   initializer=tf.glorot_uniform_initializer()):
    '''
    DESCRIPTION:
        Residual block whose shortcut is itself a convolution, so the block
        may change both the spatial size and the channel count of its input.
        Main path: 1x1x1 conv -> mid conv -> 1x1x1 conv (no activation);
        shortcut path: 1x1x1 conv (no activation). The two are summed,
        rectified and passed through dropout.
    USAGE:
        INPUT:
            X                   : input tensor of the block
            name                : variable scope name of the block
            num_channels        : indexable with the output channels of the
                                    three main-path convolutions; entry 2 is
                                    also used by the shortcut
            first_filter_stride : (tuple) of form
                                    (stride_depth,stride_height,stride_width)
                                    used by the first main-path convolution
                                    and by the shortcut
            mid_filter_shape    : filter shape of the middle convolution
            is_training         : training-mode flag forwarded to every
                                    sub-layer and to the final dropout
            dropout_rate        : dropout rate of the first sub-layer and of
                                    the final dropout
            apply_batchnorm,
            weight_decay,
            initializer         : forwarded unchanged to every sub-layer
        OUTPUT:
            A                   : the final output of this block
    '''
    pointwise = (1, 1, 1)
    # Settings that every sub-layer receives unchanged.
    shared = dict(is_training=is_training,
                  apply_batchnorm=apply_batchnorm,
                  weight_decay=weight_decay,
                  initializer=initializer)

    with tf.variable_scope(name):
        # ---- main branch ----
        # Strided one-by-one convolution.
        main = rectified_conv3d(X,
                                name='branch_2a',
                                filter_shape=pointwise,
                                output_channel=num_channels[0],
                                stride=first_filter_stride,
                                padding_type='VALID',
                                dropout_rate=dropout_rate,
                                **shared)
        # Middle convolution; it relies on the callee's default dropout rate.
        main = rectified_conv3d(main,
                                name='branch_2b',
                                filter_shape=mid_filter_shape,
                                output_channel=num_channels[1],
                                stride=pointwise,
                                padding_type='SAME',
                                **shared)
        # Final one-by-one convolution. Activation and dropout are deferred
        # until after the shortcut has been added.
        main_pre_activation = rectified_conv3d(main,
                                               name='branch_2c',
                                               filter_shape=pointwise,
                                               output_channel=num_channels[2],
                                               stride=pointwise,
                                               padding_type='VALID',
                                               dropout_rate=0.0,
                                               apply_relu=False,
                                               **shared)

        # ---- shortcut branch ----
        # Same stride and channel count as the main branch, so the two
        # results can be added element-wise.
        shortcut_pre_activation = rectified_conv3d(
            X,
            name='branch_1',
            filter_shape=pointwise,
            output_channel=num_channels[2],
            stride=first_filter_stride,
            padding_type="VALID",
            dropout_rate=0.0,
            apply_relu=False,
            **shared)

        # ---- merge ----
        with tf.variable_scope('skip_conn'):
            merged = tf.add(main_pre_activation, shortcut_pre_activation)
            activated = tf.nn.relu(merged, name='relu')
            # The dropout deferred from the last sub-layer is applied here.
            activated = tf.layers.dropout(activated,
                                          rate=dropout_rate,
                                          training=is_training,
                                          name='dropout')

    return activated
is_loop=is_loop, batch_size=batch_size, ans_max_len=ans_max_len, que_max_len=que_max_len, use_bert=use_bert, bert_encoder=bert_encoder, is_sort=False) # get match answer # question2id, question2answer, question2label = \ # gather_question_template(choose_template, mode='choose') with tf.Graph().as_default(): with tf.variable_scope("Model", reuse=None, initializer=tf.glorot_uniform_initializer()): answer_understander_dev = AnswerUnderstander( use_bert=use_bert, use_w2v=use_w2v, rnn_unit='lstm', dropout_rate=dropout_rate, optimizer=optimizer, learning_rate=learning_rate, grad_clipper=grad_clipper, global_step=None, attention_dim=attention_dim, nb_hops=nb_hops, rnn_dim=rnn_dim, lambda_l2=lambda_l2, is_training=False, sentiment_polarity_multiple=sentiment_polarity_multiple,
def inception_global_filter_layer(X, name,
                                  first_filter_shape, first_filter_stride,
                                  second_filter_shape, second_filter_stride,
                                  final_channel_list,
                                  is_training,
                                  dropout_rate=0.0,
                                  apply_batchnorm=False,
                                  weight_decay=None,
                                  initializer=tf.glorot_uniform_initializer()):
    '''
    DESCRIPTION:
        Inception-like layer: two convolutions with different filter shapes
        are applied to the same input and their outputs are concatenated
        along the last (channel) axis.
        Before each convolution the input is zero-padded on axes 1 and 2 by
        (f-1)//2 on each side, where f is the matching entry of the first two
        filter dimensions; the third filter dimension gets no padding and the
        convolution itself runs with 'VALID' padding.
    USAGE:
        X                   : the 5-D input to this layer
        name                : unique variable-scope name for this layer
        first_filter_shape  : shape of the first filter; its first two
                                entries drive the padding
        first_filter_stride : stride of the first convolution
        second_filter_shape : shape of the second filter
        second_filter_stride: stride of the second convolution
        final_channel_list  : output channels of both filters
                                [channel output filter 1,
                                 channel output filter 2]
        is_training         : training-mode flag forwarded to both branches
        dropout_rate        : dropout rate forwarded to both branches
        apply_batchnorm     : whether to apply batch norm (default False)
        weight_decay        : forwarded to both branches
        initializer         : the initializer for the filter variables
    NOTE(review): with an even filter size the symmetric (f-1)//2 padding is
    one element short of 'SAME' behaviour -- presumably only odd sizes are
    used; confirm with callers.
    '''
    # (name scope, conv name, filter shape, stride, output channels)
    branch_specs = (
        ('first_global_filter', 'conv3d1',
         first_filter_shape, first_filter_stride, final_channel_list[0]),
        ('second_global_filter', 'conv3d2',
         second_filter_shape, second_filter_stride, final_channel_list[1]),
    )

    with tf.variable_scope(name):
        branch_outputs = []
        for scope_name, conv_name, filter_shape, stride, channels in \
                branch_specs:
            with tf.name_scope(scope_name):
                fx, fy, _ = filter_shape
                # Floor division keeps the paddings integral on Python 3;
                # tf.pad rejects the floats that `/` would produce.
                px = (fx - 1) // 2
                py = (fy - 1) // 2
                # Axes 0, 3 and 4 are left unpadded.
                padding = [[0, 0], [px, px], [py, py], [0, 0], [0, 0]]
                padded_X = tf.pad(X, paddings=padding, mode='CONSTANT',
                                  name='spatial_same_pad', constant_values=0)
                branch_outputs.append(
                    rectified_conv3d(padded_X,
                                     name=conv_name,
                                     filter_shape=filter_shape,
                                     output_channel=channels,
                                     stride=stride,
                                     padding_type='VALID',
                                     is_training=is_training,
                                     dropout_rate=dropout_rate,
                                     apply_batchnorm=apply_batchnorm,
                                     weight_decay=weight_decay,
                                     apply_relu=True,
                                     initializer=initializer))

        # Merge the two branches along the last axis.
        A = tf.concat(branch_outputs, axis=-1, name='concat')

    return A
def test():
    """Evaluate the saved "choose" model on the dev set and return its accuracy.

    Reads ``config_choose.json``, loads the dev samples and vocabularies,
    builds the data stream and an ``AnswerUnderstander`` in inference mode,
    restores the weights stored at ``BEST_PATH`` and runs ``evaluation``.

    Returns:
        The value returned by ``evaluation`` (also printed).
    """
    with open('config_choose.json', encoding='utf-8') as infile:
        config = json.load(infile)
    train_cfg = config["train_config"]
    w2v_cfg = config["w2v_config"]
    bert_cfg = config["bert_config"]

    # Flags are stored as 0/1 in the config; any other value is a KeyError.
    int2bool = {1: True, 0: False}

    sentiment_words_path = train_cfg["SENTIMENT_WORDS_PATH"]
    batch_size = train_cfg["BATCH_SIZE"]
    is_loop = int2bool[train_cfg["Is_LOOP"]]
    is_sort = int2bool[train_cfg["IS_SORT"]]
    dropout_rate = train_cfg["DROPOUT_RATE"]
    nb_classes = train_cfg["NB_CLASSES"]
    attention_dim = train_cfg["ATTENTION_DIM"]
    nb_hops = train_cfg["NB_HOPS"]
    use_bert = int2bool[train_cfg["USE_BERT"]]
    optimizer = train_cfg["OPTIMIZER"]
    learning_rate = train_cfg["LEARNING_RATE"]
    grad_clipper = train_cfg["GRAD_CLIPPER"]
    drop_choose_dev_path = train_cfg["DROP_choose_DEV_PATH"]
    best_path = train_cfg["BEST_PATH"]
    # NOTE(review): this path is read but never used, while
    # `question2targets` below is not defined in this function -- presumably
    # it should be loaded from this path (or is a module global); verify.
    question2targets_path = train_cfg["QUESTION2TARGETS_PATH"]
    use_extra_feature = train_cfg["USE_EXTRA_FEATURE"]
    ner_dict_size = train_cfg["NER_DICT_SIZE"]
    pos_dict_size = train_cfg["POS_DICT_SIZE"]
    extra_feature_dim = train_cfg["EXTRA_FEATURE_DIM"]
    ner_dict_path = train_cfg["NER_DICT_PATH"]
    pos_dict_path = train_cfg["POS_DICT_PATH"]
    rnn_dim = train_cfg["RNN_DIM"]
    lambda_l2 = train_cfg["LAMBDA_L2"]
    ans_max_len = train_cfg["ANS_MAX_LEN"]
    que_max_len = train_cfg["QUE_MAX_LEN"]
    sentiment_polarity_multiple = train_cfg["POLARITY_MULTIPLE"]
    # word2vec embeddings are used exactly when BERT is not.
    use_w2v = not use_bert

    char_voc_path = w2v_cfg["CHAR_VOC_PATH"]
    char_embedding_matrix_path = w2v_cfg["CHAR_EMBEDDING_MATRIX_PATH"]
    word_voc_path = w2v_cfg["WORD_VOC_PATH"]
    word_embedding_matrix_path = w2v_cfg["WORD_EMBEDDING_MATRIX_PATH"]

    bert_model_path = bert_cfg["BERT_MODEL_PFTH"]
    bert_config_file = bert_cfg["CONFIG_FILE"]
    bert_checkpoint_path = bert_cfg["INIT_CHECKPOINT"]
    bert_voc_path = bert_cfg["VOC_FILE"]
    sen2id_path = bert_cfg["SEN2ID_PATH"]

    choose_samples, _, _ = read_file(drop_choose_dev_path)
    # Longest question or answer among the dev samples.
    max_sequence_len = max(
        max(len(sample['question']) for sample in choose_samples),
        max(len(sample['answer']) for sample in choose_samples))

    # pickle executes arbitrary code on load: these vocabulary files must
    # come from a trusted source.
    with open(char_voc_path, 'rb') as infile:
        char_voc = pickle.load(infile)
    with open(word_voc_path, 'rb') as infile:
        word_voc = pickle.load(infile)

    bert_encoder = BertEncoder(model_root=bert_model_path,
                               bert_config_file=bert_config_file,
                               init_checkpoint=bert_checkpoint_path,
                               vocab_file=bert_voc_path,
                               max_sequence_len=max_sequence_len,
                               embedding_batch=3,
                               embedding_matrix_path=None,
                               sen2id_path=sen2id_path,
                               vec_dim=768)
    instances_choose_dev = make_instances(choose_samples,
                                          char_voc,
                                          word_voc,
                                          sentiment_words_path,
                                          ner_dict_path=ner_dict_path,
                                          pos_dict_path=pos_dict_path,
                                          use_extra_feature=use_extra_feature,
                                          question2targets=question2targets,
                                          is_training=False,
                                          need_augment=False)
    data_stream_choose_dev = DataStream(instances=instances_choose_dev,
                                        is_shuffle=False,
                                        is_loop=is_loop,
                                        batch_size=batch_size,
                                        ans_max_len=ans_max_len,
                                        que_max_len=que_max_len,
                                        use_bert=use_bert,
                                        bert_encoder=bert_encoder,
                                        is_sort=is_sort)

    with tf.Graph().as_default():
        with tf.variable_scope("Model",
                               reuse=False,
                               initializer=tf.glorot_uniform_initializer()):
            answer_understander_dev = AnswerUnderstander(
                use_bert=use_bert,
                use_w2v=use_w2v,
                rnn_unit='lstm',
                dropout_rate=dropout_rate,
                optimizer=optimizer,
                learning_rate=learning_rate,
                grad_clipper=grad_clipper,
                global_step=None,
                attention_dim=attention_dim,
                nb_hops=nb_hops,
                rnn_dim=rnn_dim,
                lambda_l2=lambda_l2,
                is_training=False,
                sentiment_polarity_multiple=sentiment_polarity_multiple,
                nb_classes=nb_classes,
                use_extra_feature=use_extra_feature,
                ner_dict_size=ner_dict_size,
                pos_dict_size=pos_dict_size,
                extra_feature_dim=extra_feature_dim,
                ans_max_len=ans_max_len,
                que_max_len=que_max_len,
                char_w2v_embedding_matrix_path=char_embedding_matrix_path,
                word_w2v_embedding_matrix_path=word_embedding_matrix_path)
        saver = tf.train.Saver()
        # The context manager closes the session even if evaluation raises.
        with tf.Session() as sess:
            initializer = tf.global_variables_initializer()
            sess.run(initializer)
            # Overwrite the fresh initial values with the saved weights.
            saver.restore(sess, best_path)
            # NOTE(review): `loop_index` is not defined in this function --
            # presumably a module-level global; verify.
            choose_acc = evaluation(sess, answer_understander_dev,
                                    data_stream_choose_dev,
                                    'result_{}.txt'.format(loop_index))
        print("the final choose accuracy:{}".format(choose_acc))
    return choose_acc