def content_extractor(self, images, reuse=False):
    """Build the content-extractor network in the 'content_extractor' scope.

    Args:
        images: image tensor, (batch, 32, 32, 3) or (batch, 32, 32, 1).
        reuse: forwarded to tf.variable_scope to reuse existing variables.

    Returns:
        The tanh-activated output of 'bn4'. When self.mode == 'pretrain',
        the flattened output of the extra 10-channel 'out' convolution.
    """
    # Single-channel input (e.g. the mnist dataset): replicate the gray
    # scale image 3 times so the first convolution always sees 3 channels.
    if images.get_shape()[3] == 1:
        images = tf.image.grayscale_to_rgb(images)

    with tf.variable_scope('content_extractor', reuse=reuse):
        with slim.arg_scope([slim.conv2d],
                            padding='SAME',
                            activation_fn=None,
                            stride=2,
                            weights_initializer=tf.contrib.layers.xavier_initializer()), \
                slim.arg_scope([slim.batch_norm],
                               decay=0.95,
                               center=True,
                               scale=True,
                               activation_fn=tf.nn.relu,
                               is_training=self.mode in ('train', 'pretrain')):
            net = images
            # conv1/bn1 -> (batch_size, 16, 16, 64)
            # conv2/bn2 -> (batch_size, 8, 8, 128)
            # conv3/bn3 -> (batch_size, 4, 4, 256)
            for index, depth in enumerate((64, 128, 256), 1):
                net = slim.conv2d(net, depth, [3, 3], scope='conv%d' % index)
                net = slim.batch_norm(net, scope='bn%d' % index)

            # (batch_size, 1, 1, 128)
            net = slim.conv2d(net, 128, [4, 4], padding='VALID', scope='conv4')
            net = slim.batch_norm(net, activation_fn=tf.nn.tanh, scope='bn4')

            if self.mode == 'pretrain':
                # Extra 10-channel output layer used only while pretraining.
                net = slim.conv2d(net, 10, [1, 1], padding='VALID', scope='out')
                net = slim.flatten(net)
            return net
def LResnet50E_IR(images, keep_probability, phase_train=True,
                  bottleneck_layer_size=512, weight_decay=0.0, reuse=None):
    '''
    Build the LResnet50E-IR embedding network from resface_block units.

    conv name
    conv[conv_layer]_[block_index]_[block_layer_index]

    for resnet50 n_units=[3,4,14,3], consider one unit is dim_reduction_layer
    repeat n_units=[2,3,13,2]

    weight_decay and reuse are accepted but not used in this function.
    Returns a tuple (net, '').
    '''
    # (scope name, output channels, number of repeated dim-matching units)
    stage_specs = (
        ('Conv2', 64, 2),
        ('Conv3', 128, 3),
        ('Conv4', 256, 13),
        ('Conv5', 512, 2),
    )

    with tf.variable_scope('Conv1'):
        net = slim.conv2d(images, 64, scope='Conv1_pre')
        net = slim.batch_norm(net, scope='Conv1_bn')

    for stage_name, channels, repeats in stage_specs:
        with tf.variable_scope(stage_name):
            # One stride-2 unit with a projection shortcut, then `repeats`
            # stride-1 units with identity shortcuts.
            net = resface_block(net, channels, stride=2, dim_match=False,
                                scope=stage_name + '_pre')
            net = slim.repeat(net, repeats, resface_block, channels, 1, True,
                              scope=stage_name + '_main')

    with tf.variable_scope('Logits'):
        net = slim.batch_norm(net, activation_fn=None, scope='bn1')
        net = slim.dropout(net, keep_probability, is_training=phase_train,
                           scope='Dropout')
        net = slim.flatten(net)

    # NOTE(review): the source this was reconstructed from had lost its
    # indentation; 'fc1' and 'Bottleneck' are assumed to sit outside the
    # 'Logits' scope -- confirm against existing checkpoints' variable names.
    net = slim.fully_connected(
        net, bottleneck_layer_size,
        biases_initializer=tf.contrib.layers.xavier_initializer(),
        scope='fc1')
    net = slim.batch_norm(net, activation_fn=None, scope='Bottleneck')
    return net, ''
def _depthwise_separable_conv(inputs, num_pwc_filters, width_multiplier, sc,
                              downsample=False):
    """Helper function to build the depth-wise separable convolution layer.

    Args:
        inputs: input tensor.
        num_pwc_filters: base number of pointwise (1x1) filters.
        width_multiplier: scale applied to num_pwc_filters (result is rounded).
        sc: scope prefix for the four layers created here.
        downsample: when true the depthwise convolution uses stride 2,
            otherwise stride 1.

    Returns:
        The batch-normalised output of the pointwise convolution.
    """
    pointwise_depth = round(num_pwc_filters * width_multiplier)
    if downsample:
        depthwise_stride = 2
    else:
        depthwise_stride = 1

    # num_outputs=None makes separable_convolution2d skip its pointwise stage.
    net = slim.separable_convolution2d(inputs,
                                       num_outputs=None,
                                       stride=depthwise_stride,
                                       depth_multiplier=1,
                                       kernel_size=[3, 3],
                                       scope=sc + '/depthwise_conv')
    net = slim.batch_norm(net, scope=sc + '/dw_batch_norm')

    net = slim.convolution2d(net,
                             pointwise_depth,
                             kernel_size=[1, 1],
                             scope=sc + '/pointwise_conv')
    return slim.batch_norm(net, scope=sc + '/pw_batch_norm')
def resface_block(lower_input, output_channels, stride, dim_match=True, scope=None):
    """Build one residual unit: BN -> conv -> BN -> conv(stride) -> BN + shortcut.

    Args:
        lower_input: input tensor of the unit.
        output_channels: number of output channels of both convolutions.
        stride: stride of the second convolution (and of the projection
            shortcut when dim_match is false).
        dim_match: when true the input is used directly as the shortcut;
            otherwise a 1x1 convolution + batch norm projects it.
        scope: variable scope name for the unit.

    Returns:
        The sum of the shortcut and the residual branch.
    """
    with tf.variable_scope(scope):
        # conv2d is called without kernel_size here; presumably an enclosing
        # arg_scope supplies it -- verify against the caller.
        net = slim.batch_norm(lower_input, activation_fn=None, scope='bn1')
        net = slim.conv2d(net, output_channels)
        net = slim.batch_norm(net, scope='bn2')
        net = slim.conv2d(net, output_channels, stride=stride)
        net = slim.batch_norm(net, activation_fn=None, scope='bn3')

        if dim_match:
            short_cut = lower_input
        else:
            # The projection must downsample by the same factor as the
            # residual branch, otherwise the two tensors cannot be added.
            # (Previously hard-coded to 2, which only worked for stride=2.)
            short_cut = slim.conv2d(lower_input, output_channels,
                                    stride=stride, kernel_size=1)
            short_cut = slim.batch_norm(short_cut, activation_fn=None,
                                        scope='shortcut_bn')
        return short_cut + net
def generator(self, inputs, reuse=False):
    """Build the transposed-convolution generator in the 'generator' scope.

    Args:
        inputs: feature tensor of shape (batch, 1, 1, 128).
        reuse: forwarded to tf.variable_scope to reuse existing variables.

    Returns:
        The tanh-activated output of 'conv_transpose4', (batch_size, 32, 32, 1).
    """
    with tf.variable_scope('generator', reuse=reuse):
        with slim.arg_scope([slim.conv2d_transpose],
                            padding='SAME',
                            activation_fn=None,
                            stride=2,
                            weights_initializer=tf.contrib.layers.xavier_initializer()), \
                slim.arg_scope([slim.batch_norm],
                               decay=0.95,
                               center=True,
                               scale=True,
                               activation_fn=tf.nn.relu,
                               is_training=(self.mode == 'train')):
            # (batch_size, 4, 4, 512)
            net = slim.conv2d_transpose(inputs, 512, [4, 4], padding='VALID',
                                        scope='conv_transpose1')
            net = slim.batch_norm(net, scope='bn1')

            # conv_transpose2/bn2 -> (batch_size, 8, 8, 256)
            # conv_transpose3/bn3 -> (batch_size, 16, 16, 128)
            for index, depth in ((2, 256), (3, 128)):
                net = slim.conv2d_transpose(net, depth, [3, 3],
                                            scope='conv_transpose%d' % index)
                net = slim.batch_norm(net, scope='bn%d' % index)

            # (batch_size, 32, 32, 1)
            net = slim.conv2d_transpose(net, 1, [3, 3], activation_fn=tf.nn.tanh,
                                        scope='conv_transpose4')
            return net
def generator(tensor):
    """Build the fully-connected + transposed-convolution generator.

    Variables live in the 'generator' scope; they are reused automatically
    when any global variable whose name starts with 'generator' already
    exists.

    Args:
        tensor: input tensor fed to the first fully connected layer.

    Returns:
        The sigmoid-activated output of the last transposed convolution.
    """
    reuse = any(v.name.startswith('generator') for v in tf.global_variables())

    # Debug output. Written as function calls with a single argument so the
    # text is identical on Python 2 and Python 3 (the former print statements
    # were a SyntaxError on Python 3).
    print(tensor.get_shape())
    with variable_scope.variable_scope('generator', reuse=reuse):
        tensor = slim.fully_connected(tensor, 1024)
        print(tensor)
        tensor = slim.batch_norm(tensor, activation_fn=tf.nn.relu)

        tensor = slim.fully_connected(tensor, 7 * 7 * 128)
        tensor = slim.batch_norm(tensor, activation_fn=tf.nn.relu)

        # Reinterpret the flat activations as a 7x7 map with 128 channels.
        tensor = tf.reshape(tensor, [-1, 7, 7, 128])

        tensor = slim.conv2d_transpose(tensor, 64, kernel_size=[4, 4], stride=2,
                                       activation_fn=None)
        print('gen %s' % (tensor.get_shape(),))
        tensor = slim.batch_norm(tensor, activation_fn=tf.nn.relu)

        tensor = slim.conv2d_transpose(tensor, 1, kernel_size=[4, 4], stride=2,
                                       activation_fn=tf.nn.sigmoid)
    return tensor
def discriminator(self, images, reuse=False):
    """Build the convolutional discriminator in the 'discriminator' scope.

    Args:
        images: image tensor of shape (batch, 32, 32, 1).
        reuse: forwarded to tf.variable_scope to reuse existing variables.

    Returns:
        The flattened output of 'conv4' (one value per image, no activation).
    """
    with tf.variable_scope('discriminator', reuse=reuse):
        with slim.arg_scope([slim.conv2d],
                            padding='SAME',
                            activation_fn=None,
                            stride=2,
                            weights_initializer=tf.contrib.layers.xavier_initializer()), \
                slim.arg_scope([slim.batch_norm],
                               decay=0.95,
                               center=True,
                               scale=True,
                               activation_fn=tf.nn.relu,
                               is_training=(self.mode == 'train')):
            # (batch_size, 16, 16, 128); the only conv with its own relu.
            net = slim.conv2d(images, 128, [3, 3], activation_fn=tf.nn.relu,
                              scope='conv1')
            net = slim.batch_norm(net, scope='bn1')

            # conv2/bn2 -> (batch_size, 8, 8, 256)
            # conv3/bn3 -> (batch_size, 4, 4, 512)
            for index, depth in ((2, 256), (3, 512)):
                net = slim.conv2d(net, depth, [3, 3], scope='conv%d' % index)
                net = slim.batch_norm(net, scope='bn%d' % index)

            # (batch_size, 1, 1, 1)
            net = slim.conv2d(net, 1, [4, 4], padding='VALID', scope='conv4')
            net = slim.flatten(net)
            return net
def forward(self, reshaped_input):
    """Forward pass of a Soft-DBoW block.

    Args:
        reshaped_input: If your input is in that form:
            'batch_size' x 'max_samples' x 'feature_size'
            It should be reshaped in the following form:
            'batch_size*max_samples' x 'feature_size'
            by performing:
            reshaped_input = tf.reshape(input, [-1, features_size])

    Returns:
        bof: the pooled vector of size: 'batch_size' x 'output_dim'
    """
    feature_stddev = 1 / math.sqrt(self.feature_size)

    cluster_weights = tf.get_variable(
        "cluster_weights",
        [self.feature_size, self.cluster_size],
        initializer=tf.random_normal_initializer(stddev=feature_stddev))
    assignment = tf.matmul(reshaped_input, cluster_weights)

    if self.add_batch_norm:
        assignment = slim.batch_norm(
            assignment,
            center=True,
            scale=True,
            is_training=self.is_training,
            scope="cluster_bn")
    else:
        cluster_biases = tf.get_variable(
            "cluster_biases",
            [self.cluster_size],
            initializer=tf.random_normal_initializer(stddev=feature_stddev))
        assignment += cluster_biases

    # Soft assignment of every sample to the clusters.
    assignment = tf.nn.softmax(assignment)
    assignment = tf.reshape(assignment,
                            [-1, self.max_samples, self.cluster_size])

    # Sum the assignments over the samples axis, then L2-normalise.
    pooled = tf.reduce_sum(assignment, 1)
    pooled = tf.nn.l2_normalize(pooled, 1)

    hidden1_weights = tf.get_variable(
        "hidden1_weights",
        [self.cluster_size, self.output_dim],
        initializer=tf.random_normal_initializer(
            stddev=1 / math.sqrt(self.cluster_size)))
    pooled = tf.matmul(pooled, hidden1_weights)

    if self.gating:
        pooled = super(self.__class__, self).context_gating(pooled)
    return pooled
def batch_norm(x, train, data_format='NHWC', name=None, act=lrelu, epsilon=1e-5, momentum=0.9):
    """Thin wrapper around slim.batch_norm with this project's defaults.

    Args:
        x: input tensor.
        train: passed as is_training.
        data_format: passed through to slim.batch_norm.
        name: used as the layer scope.
        act: activation function applied after normalisation.
        epsilon: passed through to slim.batch_norm.
        momentum: passed as the moving-average decay.

    Returns:
        The result of slim.batch_norm (scaled, fused, with
        updates_collections=None).
    """
    bn_options = dict(
        decay=momentum,
        updates_collections=None,
        epsilon=epsilon,
        scale=True,
        fused=True,
        is_training=train,
        activation_fn=act,
        data_format=data_format,
        scope=name,
    )
    return slim.batch_norm(x, **bn_options)
def forward(self, reshaped_input):
    """Forward pass of a NetVLAD block.

    Args:
        reshaped_input: tensor that is multiplied with a
            'feature_size' x 'cluster_size' matrix and later reshaped to
            [-1, max_frames, feature_size].

    Returns:
        vlad: L2-normalised descriptor reshaped to
            [-1, cluster_size * feature_size].
    """
    feature_stddev = 1 / math.sqrt(self.feature_size)

    cluster_weights = tf.get_variable(
        "cluster_weights",
        [self.feature_size, self.cluster_size],
        initializer=tf.random_normal_initializer(stddev=feature_stddev))
    tf.summary.histogram("cluster_weights", cluster_weights)
    activation = tf.matmul(reshaped_input, cluster_weights)

    if self.add_batch_norm:
        activation = slim.batch_norm(
            activation,
            center=True,
            scale=True,
            is_training=self.is_training,
            scope="cluster_bn")
    else:
        # Fixed: the shape used the undefined name `cluster_size`, which
        # raised NameError whenever batch norm was disabled.
        cluster_biases = tf.get_variable(
            "cluster_biases",
            [self.cluster_size],
            initializer=tf.random_normal_initializer(stddev=feature_stddev))
        tf.summary.histogram("cluster_biases", cluster_biases)
        activation += cluster_biases

    activation = tf.nn.softmax(activation)
    tf.summary.histogram("cluster_output", activation)

    activation = tf.reshape(activation,
                            [-1, self.max_frames, self.cluster_size])

    # Per-cluster sum of assignments, scaled by a second set of cluster
    # weights; subtracted from the aggregated features below.
    a_sum = tf.reduce_sum(activation, -2, keep_dims=True)
    cluster_weights2 = tf.get_variable(
        "cluster_weights2",
        [1, self.feature_size, self.cluster_size],
        initializer=tf.random_normal_initializer(stddev=feature_stddev))
    a = tf.multiply(a_sum, cluster_weights2)

    activation = tf.transpose(activation, perm=[0, 2, 1])
    reshaped_input = tf.reshape(reshaped_input,
                                [-1, self.max_frames, self.feature_size])

    vlad = tf.matmul(activation, reshaped_input)
    vlad = tf.transpose(vlad, perm=[0, 2, 1])
    vlad = tf.subtract(vlad, a)

    # Normalise along axis 1 first, then once more after flattening.
    vlad = tf.nn.l2_normalize(vlad, 1)
    vlad = tf.reshape(vlad, [-1, self.cluster_size * self.feature_size])
    vlad = tf.nn.l2_normalize(vlad, 1)
    return vlad
def _depthwise_separable_conv(inputs, num_pwc_filters, sc, kernel_size, stride):
    """Helper function to build the depth-wise separable convolution layer.

    Args:
        inputs: input tensor.
        num_pwc_filters: number of pointwise (1x1) filters.
        sc: scope prefix for the four layers created here.
        kernel_size: kernel size of the depthwise convolution.
        stride: stride of the depthwise convolution.

    Returns:
        The batch-normalised output of the pointwise convolution.
    """
    # num_outputs=None makes separable_convolution2d skip its pointwise stage.
    net = slim.separable_convolution2d(inputs,
                                       num_outputs=None,
                                       stride=stride,
                                       depth_multiplier=1,
                                       kernel_size=kernel_size,
                                       scope=sc + '/depthwise_conv')
    net = slim.batch_norm(net, scope=sc + '/dw_batch_norm')

    net = slim.convolution2d(net,
                             num_pwc_filters,
                             kernel_size=[1, 1],
                             scope=sc + '/pointwise_conv')
    return slim.batch_norm(net, scope=sc + '/pw_batch_norm')
def forward(self, reshaped_input):
    """Project frames onto clusters and pool them with average and max.

    Args:
        reshaped_input: tensor that is multiplied with a
            'feature_size' x 'cluster_size' matrix.

    Returns:
        Concatenation (axis 1) of the L2-normalised average-pooled and
        max-pooled cluster activations.
    """
    feature_size = self.feature_size
    cluster_size = self.cluster_size
    add_batch_norm = self.add_batch_norm
    max_frames = self.max_frames
    is_training = self.is_training

    cluster_weights = tf.get_variable(
        "cluster_weights",
        [feature_size, cluster_size],
        initializer=tf.random_normal_initializer(
            stddev=1 / math.sqrt(feature_size)))
    tf.summary.histogram("cluster_weights", cluster_weights)
    activation = tf.matmul(reshaped_input, cluster_weights)

    if add_batch_norm:
        activation = slim.batch_norm(
            activation,
            center=True,
            scale=True,
            is_training=is_training,
            scope="cluster_bn")
    else:
        # Fixed: tf.random_normal(stddev=...) is an op that requires a shape
        # and is not an initializer; tf.get_variable needs
        # tf.random_normal_initializer here.
        cluster_biases = tf.get_variable(
            "cluster_biases",
            [cluster_size],
            initializer=tf.random_normal_initializer(
                stddev=1 / math.sqrt(feature_size)))
        tf.summary.histogram("cluster_biases", cluster_biases)
        activation += cluster_biases

    # NOTE(review): `activation` is a tensor at this point, so comparing it
    # with a string does not select an activation function; this looks like
    # it was meant to test a configured activation name. Left unchanged
    # because the intended attribute is not visible here -- confirm.
    if activation == 'glu':
        space_ind = range(cluster_size // 2)
        gate_ind = range(cluster_size // 2, cluster_size)

        gates = tf.sigmoid(activation[:, gate_ind])
        activation = tf.multiply(activation[:, space_ind], gates)
    elif activation == 'relu':
        activation = tf.nn.relu6(activation)

    tf.summary.histogram("cluster_output", activation)

    activation = tf.reshape(activation, [-1, max_frames, cluster_size])

    avg_activation = utils.FramePooling(activation, 'average')
    avg_activation = tf.nn.l2_normalize(avg_activation, 1)

    max_activation = utils.FramePooling(activation, 'max')
    max_activation = tf.nn.l2_normalize(max_activation, 1)

    return tf.concat([avg_activation, max_activation], 1)
def batchnorm(self, layer, inp):
    """Apply batch normalisation for `layer` to `inp`.

    When self.var is falsy the stored statistics in layer.w
    ('moving_mean', 'moving_variance', 'gamma') are applied directly;
    otherwise slim.batch_norm is used with layer.w as param_initializers.

    Args:
        layer: object exposing the weight dict `w` and the dict `h`
            (only h['is_training'] is read).
        inp: input to normalise.

    Returns:
        The normalised input.
    """
    if self.var:
        bn_kwargs = {
            'center': False,
            'scale': True,
            'epsilon': 1e-5,
            'scope': self.scope,
            'updates_collections': None,
            'is_training': layer.h['is_training'],
            'param_initializers': layer.w,
        }
        return slim.batch_norm(inp, **bn_kwargs)

    # Frozen path: (inp - mean) / (sqrt(variance) + 1e-5) * gamma.
    # Augmented assignments are kept so the arithmetic matches step by step.
    params = layer.w
    normalized = inp - params['moving_mean']
    normalized /= np.sqrt(params['moving_variance']) + 1e-5
    normalized *= params['gamma']
    return normalized
def forward(self, reshaped_input):
    """Forward pass of a gated VLAD (NetVLAGD) block.

    Args:
        reshaped_input: tensor that is multiplied with a
            'feature_size' x 'cluster_size' matrix and later reshaped to
            [-1, max_frames, feature_size].

    Returns:
        vlagd: L2-normalised descriptor reshaped to
            [-1, cluster_size * feature_size].
    """
    feature_stddev = 1 / math.sqrt(self.feature_size)

    cluster_weights = tf.get_variable(
        "cluster_weights",
        [self.feature_size, self.cluster_size],
        initializer=tf.random_normal_initializer(stddev=feature_stddev))
    activation = tf.matmul(reshaped_input, cluster_weights)

    if self.add_batch_norm:
        activation = slim.batch_norm(
            activation,
            center=True,
            scale=True,
            is_training=self.is_training,
            scope="cluster_bn")
    else:
        # Fixed: the shape used the undefined name `cluster_size`
        # (NameError), and the biases were created but never applied.
        cluster_biases = tf.get_variable(
            "cluster_biases",
            [self.cluster_size],
            initializer=tf.random_normal_initializer(stddev=feature_stddev))
        activation += cluster_biases

    activation = tf.nn.softmax(activation)
    activation = tf.reshape(activation,
                            [-1, self.max_frames, self.cluster_size])

    # Learned gates, squashed to (0, 1), one per (cluster, feature) pair.
    gate_weights = tf.get_variable(
        "gate_weights",
        [1, self.cluster_size, self.feature_size],
        initializer=tf.random_normal_initializer(stddev=feature_stddev))
    gate_weights = tf.sigmoid(gate_weights)

    activation = tf.transpose(activation, perm=[0, 2, 1])
    reshaped_input = tf.reshape(reshaped_input,
                                [-1, self.max_frames, self.feature_size])

    vlagd = tf.matmul(activation, reshaped_input)
    vlagd = tf.multiply(vlagd, gate_weights)
    vlagd = tf.transpose(vlagd, perm=[0, 2, 1])

    # Normalise along axis 1 first, then once more after flattening.
    vlagd = tf.nn.l2_normalize(vlagd, 1)
    vlagd = tf.reshape(vlagd, [-1, self.cluster_size * self.feature_size])
    vlagd = tf.nn.l2_normalize(vlagd, 1)
    return vlagd
def forward(self, reshaped_input):
    """Forward pass of a soft bag-of-words block with optional max pooling.

    Args:
        reshaped_input: tensor that is multiplied with a
            'feature_size' x 'cluster_size' matrix.

    Returns:
        The L2-normalised sum of the soft assignments over axis 1; when
        self.max_pool is set, concatenated (axis 1) with the L2-normalised
        maximum over the same axis.
    """
    feature_size = self.feature_size
    cluster_size = self.cluster_size
    add_batch_norm = self.add_batch_norm
    max_frames = self.max_frames
    is_training = self.is_training
    max_pool = self.max_pool

    cluster_weights = tf.get_variable(
        "cluster_weights",
        [feature_size, cluster_size],
        initializer=tf.random_normal_initializer(
            stddev=1 / math.sqrt(feature_size)))
    tf.summary.histogram("cluster_weights", cluster_weights)
    activation = tf.matmul(reshaped_input, cluster_weights)

    if add_batch_norm:
        activation = slim.batch_norm(
            activation,
            center=True,
            scale=True,
            is_training=is_training,
            scope="cluster_bn")
    else:
        # Fixed: tf.random_normal(stddev=...) is an op that requires a shape
        # and is not an initializer; tf.get_variable needs
        # tf.random_normal_initializer here.
        cluster_biases = tf.get_variable(
            "cluster_biases",
            [cluster_size],
            initializer=tf.random_normal_initializer(
                stddev=1 / math.sqrt(feature_size)))
        tf.summary.histogram("cluster_biases", cluster_biases)
        activation += cluster_biases

    activation = tf.nn.softmax(activation)
    activation = tf.reshape(activation, [-1, max_frames, cluster_size])

    activation_sum = tf.reduce_sum(activation, 1)
    activation_sum = tf.nn.l2_normalize(activation_sum, 1)

    if max_pool:
        activation_max = tf.reduce_max(activation, 1)
        activation_max = tf.nn.l2_normalize(activation_max, 1)
        activation = tf.concat([activation_sum, activation_max], 1)
    else:
        activation = activation_sum

    return activation
def custom_residual_block(x, neurons, kernel_size, stride, name, is_training,
                          wt_decay=0.0001, use_residual=True,
                          residual_stride_conv=True, conv_fn=slim.conv2d,
                          batch_norm_param=None):
    """Build a pre-activation residual block of two convolutions.

    Args:
        x: input tensor.
        neurons: number of output channels of every convolution.
        kernel_size: kernel size of the two main convolutions.
        stride: stride of the first convolution (and of the shortcut).
        name: prefix for all scopes/op names created here.
        is_training: used for batch norm when batch_norm_param is None.
        wt_decay: L2 regularisation weight for the convolutions.
        use_residual: when false, the shortcut addition is skipped.
        residual_stride_conv: when true the 1x1 shortcut convolution is
            strided; otherwise it uses stride 1 and is followed by a strided
            average pool.
        conv_fn: convolution layer function.
        batch_norm_param: keyword arguments for slim.batch_norm; defaults to
            centre-only batch norm followed by relu.

    Returns:
        The block output tensor.
    """
    # Stddev of the weight initialiser, derived from kernel area and width.
    init_var = np.sqrt(2.0 / (kernel_size ** 2) / neurons)

    with arg_scope([conv_fn],
                   weights_regularizer=slim.l2_regularizer(wt_decay),
                   weights_initializer=tf.random_normal_initializer(stddev=init_var),
                   biases_initializer=tf.zeros_initializer()):
        if batch_norm_param is None:
            batch_norm_param = {
                'center': True,
                'scale': False,
                'activation_fn': tf.nn.relu,
                'is_training': is_training,
            }

        # batch norm x and relu
        y = slim.batch_norm(x, scope=name + '_bn', **batch_norm_param)

        y = conv_fn(y, num_outputs=neurons, kernel_size=kernel_size,
                    stride=stride, activation_fn=None, scope=name + '_1',
                    normalizer_fn=slim.batch_norm,
                    normalizer_params=batch_norm_param)

        y = conv_fn(y, num_outputs=neurons, kernel_size=kernel_size,
                    stride=1, activation_fn=None, scope=name + '_2')

        if not use_residual:
            return y

        needs_projection = (stride != 1 or
                            x.get_shape().as_list()[-1] != neurons)
        if needs_projection:
            # Same batch-norm settings as the main path, minus the activation.
            shortcut_bn_param = dict(batch_norm_param)
            shortcut_bn_param['activation_fn'] = None
            shortcut_stride = stride if residual_stride_conv else 1
            x = conv_fn(x, num_outputs=neurons, kernel_size=1,
                        stride=shortcut_stride,
                        activation_fn=None, scope=name + '_0_1x1',
                        normalizer_fn=slim.batch_norm,
                        normalizer_params=shortcut_bn_param)
            if not residual_stride_conv:
                x = slim.avg_pool2d(x, 1, stride=stride, scope=name + '_0_avg')

        y = tf.add(x, y, name=name + '_add')
        return y
def batchnorm(bottom, is_train, num_reference, epsilon=1e-3, decay=0.999, name=None):
    """ virtual batch normalization (poor man's version)

    the first half is the true batch, the second half is the reference batch.
    When num_reference = 0, it is just typical batch normalization.
    To use virtual batch normalization in test phase, "update_popmean.py"
    needed to be executed first (in order to store the mean and variance of
    the reference batch into pop_mean and pop_variance of batchnorm.)

    The first (batch_size - num_reference) entries and the remaining entries
    of the batch receive different weights, which are passed to
    slim.batch_norm as batch_weights. The epsilon argument is accepted but
    not forwarded.
    """
    total = bottom.get_shape().as_list()[0]
    num_instances = total - num_reference
    weights = np.ones([total])

    if num_instances > 0:
        ref_weight = 1.0 - (1.0 / (num_reference + 1.0))
        weights[0:num_instances] = 1.0 - ref_weight
        weights[num_instances:] = ref_weight
    else:
        # No true instances in the batch: disable the moving-average decay.
        decay = 0.0

    return slim.batch_norm(bottom,
                           activation_fn=None,
                           is_training=is_train,
                           decay=decay,
                           scale=True,
                           scope=name,
                           batch_weights=weights)
def context_gating(self, input_layer):
    """Context Gating

    Args:
        input_layer: Input layer in the following shape:
            'batch_size' x 'number_of_activation'

    Returns:
        activation: gated layer in the following shape:
            'batch_size' x 'number_of_activation'
    """
    width = input_layer.get_shape().as_list()[1]
    init_stddev = 1 / math.sqrt(width)

    gating_weights = tf.get_variable(
        "gating_weights",
        [width, width],
        initializer=tf.random_normal_initializer(stddev=init_stddev))
    gate_logits = tf.matmul(input_layer, gating_weights)

    if self.add_batch_norm:
        gate_logits = slim.batch_norm(
            gate_logits,
            center=True,
            scale=True,
            is_training=self.is_training,
            scope="gating_bn")
    else:
        gating_biases = tf.get_variable(
            "gating_biases",
            [width],
            initializer=tf.random_normal_initializer(stddev=init_stddev))
        gate_logits += gating_biases

    # Element-wise gate in (0, 1) applied to the input itself.
    return tf.multiply(input_layer, tf.sigmoid(gate_logits))
def forward(self, reshaped_input):
    """Forward pass of a NetRVLAD block.

    Args:
        reshaped_input: If your input is in that form:
            'batch_size' x 'max_samples' x 'feature_size'
            It should be reshaped in the following form:
            'batch_size*max_samples' x 'feature_size'
            by performing:
            reshaped_input = tf.reshape(input, [-1, features_size])

    Returns:
        vlad: the pooled vector of size: 'batch_size' x 'output_dim'
    """
    feature_stddev = 1 / math.sqrt(self.feature_size)

    cluster_weights = tf.get_variable(
        "cluster_weights",
        [self.feature_size, self.cluster_size],
        initializer=tf.random_normal_initializer(stddev=feature_stddev))
    assignment = tf.matmul(reshaped_input, cluster_weights)

    if self.add_batch_norm:
        assignment = slim.batch_norm(
            assignment,
            center=True,
            scale=True,
            is_training=self.is_training,
            scope="cluster_bn")
    else:
        cluster_biases = tf.get_variable(
            "cluster_biases",
            [self.cluster_size],
            initializer=tf.random_normal_initializer(stddev=feature_stddev))
        tf.summary.histogram("cluster_biases", cluster_biases)
        assignment += cluster_biases

    assignment = tf.nn.softmax(assignment)
    assignment = tf.reshape(assignment,
                            [-1, self.max_samples, self.cluster_size])
    assignment = tf.transpose(assignment, perm=[0, 2, 1])

    samples = tf.reshape(reshaped_input,
                         [-1, self.max_samples, self.feature_size])

    # Aggregate the features per cluster, weighted by the soft assignments.
    descriptor = tf.matmul(assignment, samples)
    descriptor = tf.transpose(descriptor, perm=[0, 2, 1])

    # Normalise along axis 1 first, then once more after flattening.
    descriptor = tf.nn.l2_normalize(descriptor, 1)
    descriptor = tf.reshape(descriptor,
                            [-1, self.cluster_size * self.feature_size])
    descriptor = tf.nn.l2_normalize(descriptor, 1)

    hidden1_weights = tf.get_variable(
        "hidden1_weights",
        [self.cluster_size * self.feature_size, self.output_dim],
        initializer=tf.random_normal_initializer(
            stddev=1 / math.sqrt(self.cluster_size)))
    descriptor = tf.matmul(descriptor, hidden1_weights)

    if self.gating:
        descriptor = super(self.__class__, self).context_gating(descriptor)
    return descriptor
def forward(self, reshaped_input):
    """Forward pass summing a NetVLAD branch and a light (residual-free) branch.

    Two independent soft-assignment branches are built from the same input;
    only the VLAD branch subtracts the weighted cluster term. Their sum is
    L2-normalised, flattened and normalised again.

    Args:
        reshaped_input: tensor that is multiplied with
            'feature_size' x 'cluster_size' matrices and later reshaped to
            [-1, max_frames, feature_size].

    Returns:
        Tensor reshaped to [-1, cluster_size * feature_size].
    """
    init_stddev = 1 / math.sqrt(self.feature_size)
    frames_by_clusters = [-1, self.max_frames, self.cluster_size]

    light_weights = tf.get_variable(
        "cluster_weights",
        [self.feature_size, self.cluster_size],
        initializer=tf.random_normal_initializer(stddev=init_stddev))
    vlad_weights = tf.get_variable(
        "cluster_weights_vlad",
        [self.feature_size, self.cluster_size],
        initializer=tf.random_normal_initializer(stddev=init_stddev))

    # None (None * max_frames) x cluster_size
    light_assign = tf.matmul(reshaped_input, light_weights)
    vlad_assign = tf.matmul(reshaped_input, vlad_weights)

    light_assign = slim.batch_norm(light_assign,
                                   center=True,
                                   scale=True,
                                   is_training=self.is_training,
                                   scope="cluster_bn")
    vlad_assign = slim.batch_norm(vlad_assign,
                                  center=True,
                                  scale=True,
                                  is_training=self.is_training,
                                  scope="cluster_bn_vlad")

    light_assign = tf.nn.softmax(light_assign)
    vlad_assign = tf.nn.softmax(vlad_assign)

    # None x max_frames x cluster_size
    light_assign = tf.reshape(light_assign, frames_by_clusters)
    vlad_assign = tf.reshape(vlad_assign, frames_by_clusters)

    ### only to vlad ###
    # None x 1 x cluster_size
    assign_sum = tf.reduce_sum(vlad_assign, -2, keep_dims=True)
    # 1 x feature_size x cluster_size
    center_weights = tf.get_variable(
        "cluster_weights_vlad_2",
        [1, self.feature_size, self.cluster_size],
        initializer=tf.random_normal_initializer(stddev=init_stddev))
    # None x feature_size x cluster_size
    center_term = tf.multiply(assign_sum, center_weights)
    ### only to vlad ###

    # None x cluster_size x max_frame
    light_assign = tf.transpose(light_assign, perm=[0, 2, 1])
    vlad_assign = tf.transpose(vlad_assign, perm=[0, 2, 1])

    # None x max_frame x feature_size
    frames = tf.reshape(reshaped_input,
                        [-1, self.max_frames, self.feature_size])

    ### only to light ###
    # None x cluster_size x feature_size
    light_desc = tf.matmul(light_assign, frames)
    # None x feature_size x cluster_size
    light_desc = tf.transpose(light_desc, perm=[0, 2, 1])
    ### only to light ###

    ### only to vlad ###
    # None x cluster_size x feature_size
    vlad_desc = tf.matmul(vlad_assign, frames)
    # None x feature_size x cluster_size
    vlad_desc = tf.transpose(vlad_desc, perm=[0, 2, 1])
    vlad_desc = tf.subtract(vlad_desc, center_term)
    ### only to vlad ###

    combined = vlad_desc + light_desc
    combined = tf.nn.l2_normalize(combined, 1)
    combined = tf.reshape(combined,
                          [-1, self.cluster_size * self.feature_size])
    combined = tf.nn.l2_normalize(combined, 1)
    return combined
def create_model(self, model_input, vocab_size, num_frames, iterations=None,
                 add_batch_norm=None, sample_random_frames=None,
                 cluster_size=None, hidden_size=None, is_training=True,
                 **unused_params):
    """Build the NetVLAD-family frame-level model.

    Frames are sampled, optionally batch-normalised, split into a video part
    (first 1024 features) and an audio part (the rest), pooled by the
    selected VLAD variant, projected to a hidden layer, optionally gated, and
    handed to the configured video-level classifier.

    Args:
        model_input: frame features; reshaped to [-1, feature_size].
        vocab_size: forwarded to the video-level classifier.
        num_frames: number of valid frames per example.
        iterations: number of frames to sample; defaults to FLAGS.iterations.
        add_batch_norm: defaults to FLAGS.netvlad_add_batch_norm.
        sample_random_frames: defaults to FLAGS.sample_random_frames.
        cluster_size: defaults to FLAGS.netvlad_cluster_size.
        hidden_size: defaults to FLAGS.netvlad_hidden_size.
        is_training: forwarded to batch norm and to the classifier.
        **unused_params: forwarded to the video-level classifier.

    Returns:
        Whatever the video-level classifier's create_model returns.
    """
    # NOTE: `x or FLAGS.y` means a falsy explicit argument (False, 0) is
    # replaced by the flag value; kept as-is for backward compatibility.
    iterations = iterations or FLAGS.iterations
    add_batch_norm = add_batch_norm or FLAGS.netvlad_add_batch_norm
    random_frames = sample_random_frames or FLAGS.sample_random_frames
    cluster_size = cluster_size or FLAGS.netvlad_cluster_size
    hidden1_size = hidden_size or FLAGS.netvlad_hidden_size
    relu = FLAGS.netvlad_relu
    gating = FLAGS.gating
    remove_diag = FLAGS.gating_remove_diag
    lightvlad = FLAGS.lightvlad
    vlagd = FLAGS.vlagd

    num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
    if random_frames:
        model_input = utils.SampleRandomFrames(model_input, num_frames,
                                               iterations)
    else:
        model_input = utils.SampleRandomSequence(model_input, num_frames,
                                                 iterations)

    max_frames = model_input.get_shape().as_list()[1]
    feature_size = model_input.get_shape().as_list()[2]
    reshaped_input = tf.reshape(model_input, [-1, feature_size])

    # Pick the pooling block; all variants share the same constructor.
    if lightvlad:
        vlad_cls = LightVLAD
    elif vlagd:
        vlad_cls = NetVLAGD
    else:
        vlad_cls = NetVLAD
    # Floor division keeps the audio cluster count an int on Python 3 as
    # well (plain `/` produced a float there, which is not a valid shape).
    video_NetVLAD = vlad_cls(1024, max_frames, cluster_size,
                             add_batch_norm, is_training)
    audio_NetVLAD = vlad_cls(128, max_frames, cluster_size // 2,
                             add_batch_norm, is_training)

    if add_batch_norm:  # and not lightvlad:
        reshaped_input = slim.batch_norm(
            reshaped_input,
            center=True,
            scale=True,
            is_training=is_training,
            scope="input_bn")

    with tf.variable_scope("video_VLAD"):
        vlad_video = video_NetVLAD.forward(reshaped_input[:, 0:1024])

    with tf.variable_scope("audio_VLAD"):
        vlad_audio = audio_NetVLAD.forward(reshaped_input[:, 1024:])

    vlad = tf.concat([vlad_video, vlad_audio], 1)

    vlad_dim = vlad.get_shape().as_list()[1]
    hidden1_weights = tf.get_variable(
        "hidden1_weights",
        [vlad_dim, hidden1_size],
        initializer=tf.random_normal_initializer(
            stddev=1 / math.sqrt(cluster_size)))

    activation = tf.matmul(vlad, hidden1_weights)

    if add_batch_norm and relu:
        activation = slim.batch_norm(
            activation,
            center=True,
            scale=True,
            is_training=is_training,
            scope="hidden1_bn")
    else:
        hidden1_biases = tf.get_variable(
            "hidden1_biases",
            [hidden1_size],
            initializer=tf.random_normal_initializer(stddev=0.01))
        tf.summary.histogram("hidden1_biases", hidden1_biases)
        activation += hidden1_biases

    if relu:
        activation = tf.nn.relu6(activation)

    if gating:
        gating_weights = tf.get_variable(
            "gating_weights_2",
            [hidden1_size, hidden1_size],
            initializer=tf.random_normal_initializer(
                stddev=1 / math.sqrt(hidden1_size)))

        gates = tf.matmul(activation, gating_weights)

        if remove_diag:
            # removes diagonals coefficients
            diagonals = tf.matrix_diag_part(gating_weights)
            gates = gates - tf.multiply(diagonals, activation)

        if add_batch_norm:
            gates = slim.batch_norm(
                gates,
                center=True,
                scale=True,
                is_training=is_training,
                scope="gating_bn")
        else:
            # Fixed: (1) tf.random_normal(stddev=...) is not an initializer
            # and fails without a shape; (2) the bias must match the gate
            # width hidden1_size, not cluster_size.
            gating_biases = tf.get_variable(
                "gating_biases",
                [hidden1_size],
                initializer=tf.random_normal_initializer(
                    stddev=1 / math.sqrt(feature_size)))
            gates += gating_biases

        gates = tf.sigmoid(gates)
        activation = tf.multiply(activation, gates)

    aggregated_model = getattr(video_level_models,
                               FLAGS.video_level_classifier_model)

    return aggregated_model().create_model(
        model_input=activation,
        vocab_size=vocab_size,
        is_training=is_training,
        **unused_params)
def create_model(self, model_input, vocab_size, num_frames, iterations=None,
                 add_batch_norm=None, sample_random_frames=None,
                 cluster_size=None, hidden_size=None, is_training=True,
                 **unused_params):
    """Build the NetFV frame-level model.

    Frames are sampled, optionally batch-normalised, split into a video part
    (first 1024 features) and an audio part (the rest), pooled by NetFV
    blocks, projected to a hidden layer, optionally gated, and handed to the
    configured video-level classifier.

    Args:
        model_input: frame features; reshaped to [-1, feature_size].
        vocab_size: forwarded to the video-level classifier.
        num_frames: number of valid frames per example.
        iterations: number of frames to sample; defaults to FLAGS.iterations.
        add_batch_norm: defaults to FLAGS.netvlad_add_batch_norm.
        sample_random_frames: defaults to FLAGS.sample_random_frames.
        cluster_size: defaults to FLAGS.fv_cluster_size.
        hidden_size: defaults to FLAGS.fv_hidden_size.
        is_training: forwarded to batch norm and to the classifier.
        **unused_params: forwarded to the video-level classifier.

    Returns:
        Whatever the video-level classifier's create_model returns.
    """
    # NOTE: `x or FLAGS.y` means a falsy explicit argument (False, 0) is
    # replaced by the flag value; kept as-is for backward compatibility.
    iterations = iterations or FLAGS.iterations
    add_batch_norm = add_batch_norm or FLAGS.netvlad_add_batch_norm
    random_frames = sample_random_frames or FLAGS.sample_random_frames
    cluster_size = cluster_size or FLAGS.fv_cluster_size
    hidden1_size = hidden_size or FLAGS.fv_hidden_size
    relu = FLAGS.fv_relu
    gating = FLAGS.gating

    num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
    if random_frames:
        model_input = utils.SampleRandomFrames(model_input, num_frames,
                                               iterations)
    else:
        model_input = utils.SampleRandomSequence(model_input, num_frames,
                                                 iterations)

    max_frames = model_input.get_shape().as_list()[1]
    feature_size = model_input.get_shape().as_list()[2]
    reshaped_input = tf.reshape(model_input, [-1, feature_size])
    tf.summary.histogram("input_hist", reshaped_input)

    # Floor division keeps the audio cluster count an int on Python 3 as
    # well (plain `/` produced a float there, which is not a valid shape).
    video_NetFV = NetFV(1024, max_frames, cluster_size,
                        add_batch_norm, is_training)
    audio_NetFV = NetFV(128, max_frames, cluster_size // 2,
                        add_batch_norm, is_training)

    if add_batch_norm:
        reshaped_input = slim.batch_norm(
            reshaped_input,
            center=True,
            scale=True,
            is_training=is_training,
            scope="input_bn")

    with tf.variable_scope("video_FV"):
        fv_video = video_NetFV.forward(reshaped_input[:, 0:1024])

    with tf.variable_scope("audio_FV"):
        fv_audio = audio_NetFV.forward(reshaped_input[:, 1024:])

    fv = tf.concat([fv_video, fv_audio], 1)

    fv_dim = fv.get_shape().as_list()[1]
    hidden1_weights = tf.get_variable(
        "hidden1_weights",
        [fv_dim, hidden1_size],
        initializer=tf.random_normal_initializer(
            stddev=1 / math.sqrt(cluster_size)))

    activation = tf.matmul(fv, hidden1_weights)

    if add_batch_norm and relu:
        activation = slim.batch_norm(
            activation,
            center=True,
            scale=True,
            is_training=is_training,
            scope="hidden1_bn")
    else:
        hidden1_biases = tf.get_variable(
            "hidden1_biases",
            [hidden1_size],
            initializer=tf.random_normal_initializer(stddev=0.01))
        tf.summary.histogram("hidden1_biases", hidden1_biases)
        activation += hidden1_biases

    if relu:
        activation = tf.nn.relu6(activation)

    if gating:
        gating_weights = tf.get_variable(
            "gating_weights_2",
            [hidden1_size, hidden1_size],
            initializer=tf.random_normal_initializer(
                stddev=1 / math.sqrt(hidden1_size)))

        gates = tf.matmul(activation, gating_weights)

        if add_batch_norm:
            gates = slim.batch_norm(
                gates,
                center=True,
                scale=True,
                is_training=is_training,
                scope="gating_bn")
        else:
            # Fixed: (1) tf.random_normal(stddev=...) is not an initializer
            # and fails without a shape; (2) the bias must match the gate
            # width hidden1_size, not cluster_size.
            gating_biases = tf.get_variable(
                "gating_biases",
                [hidden1_size],
                initializer=tf.random_normal_initializer(
                    stddev=1 / math.sqrt(feature_size)))
            gates += gating_biases

        gates = tf.sigmoid(gates)
        activation = tf.multiply(activation, gates)

    aggregated_model = getattr(video_level_models,
                               FLAGS.video_level_classifier_model)

    return aggregated_model().create_model(
        model_input=activation,
        vocab_size=vocab_size,
        is_training=is_training,
        **unused_params)
def bottleneck(inputs, depth, depth_bottleneck, stride, rate=1,
               residual_mask=None, scope=None):
    """Pre-activation bottleneck residual unit that also counts FLOPs.

    Args:
        inputs: input tensor (rank of at least 4 is required by
            slim.utils.last_dimension).
        depth: number of output channels of the unit.
        depth_bottleneck: number of channels of the two inner convolutions.
        stride: stride of the unit (applied in 'conv2' and the shortcut).
        rate: rate passed to the 3x3 convolution.
        residual_mask: optional mask; when given, it is passed to the
            convolutions as output_mask and multiplied into the residual.
            Requires stride == 1.
        scope: variable scope name (default name 'bottleneck_v2').

    Returns:
        A tuple (outputs, flops): the unit output and the accumulated FLOPs
        reported by the flopsometer convolutions.
    """
    with tf.variable_scope(scope, 'bottleneck_v2', [inputs]):
        total_flops = 0

        input_depth = slim.utils.last_dimension(inputs.get_shape(), min_rank=4)
        preact = slim.batch_norm(inputs, activation_fn=tf.nn.relu,
                                 scope='preact')

        if depth == input_depth:
            shortcut = resnet_utils.subsample(inputs, stride, 'shortcut')
        else:
            shortcut, layer_flops = flopsometer.conv2d(
                preact,
                depth, [1, 1],
                stride=stride,
                normalizer_fn=None,
                activation_fn=None,
                scope='shortcut')
            total_flops += layer_flops

        if residual_mask is None:
            dilated_mask = None
        else:
            # Max-pooling trick only works correctly when stride is 1.
            # We assume that stride=2 happens in the first layer where
            # residual_mask is None.
            assert stride == 1
            dilated_mask = slim.max_pool2d(
                residual_mask, [3, 3], stride=1, padding='SAME')

        residual, layer_flops = flopsometer.conv2d(
            preact,
            depth_bottleneck, [1, 1],
            stride=1,
            output_mask=dilated_mask,
            scope='conv1')
        total_flops += layer_flops

        residual, layer_flops = flopsometer.conv2d_same(
            residual,
            depth_bottleneck,
            3,
            stride,
            rate=rate,
            output_mask=residual_mask,
            scope='conv2')
        total_flops += layer_flops

        residual, layer_flops = flopsometer.conv2d(
            residual,
            depth, [1, 1],
            stride=1,
            normalizer_fn=None,
            activation_fn=None,
            output_mask=residual_mask,
            scope='conv3')
        total_flops += layer_flops

        if residual_mask is not None:
            residual *= residual_mask

        outputs = shortcut + residual
        return outputs, total_flops
def create_model(self, model_input, vocab_size, num_frames, iterations=None,
                 add_batch_norm=None, sample_random_frames=None,
                 cluster_size=None, hidden_size=None, is_training=True,
                 **unused_params):
    """Build a Deep-Bag-of-Frames style frame-level model.

    Frames are sampled, projected to `cluster_size` activations, pooled over
    the frame axis, projected to `hidden_size`, and finally handed to the
    video-level classifier named by
    `DBoFConfig.video_level_classifier_model`.

    Args:
        model_input: Frame features; the code reads dimensions 1 and 2 of
            the sampled tensor as (frames, feature size).
        vocab_size: Number of classes, forwarded to the classifier.
        num_frames: Per-example frame counts; expanded on axis 1 and cast to
            float32 before sampling.
        iterations: Number of frames to sample. Falls back to
            `DBoFConfig.train_iterations` / `eval_iterations` depending on
            `is_training`.
        add_batch_norm: Use batch norm instead of biases. `None` means "use
            `DBoFConfig.dbof_add_batch_norm`"; an explicit `False` is
            honoured.
        sample_random_frames: Sample random frames instead of a random
            sequence. `None` means "use `DBoFConfig.sample_random_frames`";
            an explicit `False` is honoured.
        cluster_size: Width of the cluster projection (config fallback).
        hidden_size: Width of the hidden projection (config fallback).
        is_training: Forwarded to the batch-norm layers and used to pick the
            default for `iterations`.
        **unused_params: Forwarded untouched to the video-level classifier.

    Returns:
        Whatever the selected video-level classifier's `create_model`
        returns.
    """
    if is_training:
        iterations = iterations or DBoFConfig.train_iterations
    else:
        iterations = iterations or DBoFConfig.eval_iterations

    # Boolean switches must be compared against None: `flag or default`
    # silently turns an explicit False back into the configured default.
    if add_batch_norm is None:
        add_batch_norm = DBoFConfig.dbof_add_batch_norm
    if sample_random_frames is None:
        random_frames = DBoFConfig.sample_random_frames
    else:
        random_frames = sample_random_frames

    # For the sizes a falsy value (None or 0) is never meaningful, so the
    # original `or` fallback is kept for backward compatibility.
    cluster_size = cluster_size or DBoFConfig.dbof_cluster_size
    hidden1_size = hidden_size or DBoFConfig.dbof_hidden_size

    num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
    if random_frames:
        model_input = utils.SampleRandomFrames(model_input, num_frames,
                                               iterations)
    else:
        model_input = utils.SampleRandomSequence(model_input, num_frames,
                                                 iterations)
    max_frames = model_input.get_shape().as_list()[1]
    feature_size = model_input.get_shape().as_list()[2]

    # Flatten (batch, frames) so every frame goes through the same projection.
    reshaped_input = tf.reshape(model_input, [-1, feature_size])
    tf.summary.histogram("input_hist", reshaped_input)

    if add_batch_norm:
        reshaped_input = slim.batch_norm(reshaped_input,
                                         center=True,
                                         scale=True,
                                         is_training=is_training,
                                         scope="input_bn")

    cluster_weights = tf.Variable(
        tf.random_normal([feature_size, cluster_size],
                         stddev=1 / math.sqrt(feature_size)))
    tf.summary.histogram("cluster_weights", cluster_weights)
    activation = tf.matmul(reshaped_input, cluster_weights)
    if add_batch_norm:
        activation = slim.batch_norm(activation,
                                     center=True,
                                     scale=True,
                                     is_training=is_training,
                                     scope="cluster_bn")
    else:
        cluster_biases = tf.Variable(
            tf.random_normal([cluster_size],
                             stddev=1 / math.sqrt(feature_size)))
        tf.summary.histogram("cluster_biases", cluster_biases)
        activation += cluster_biases
    activation = tf.nn.relu6(activation)
    tf.summary.histogram("cluster_output", activation)

    # Restore the frame axis and pool over it.
    activation = tf.reshape(activation, [-1, max_frames, cluster_size])
    activation = utils.FramePooling(activation,
                                    DBoFConfig.dbof_pooling_method)

    hidden1_weights = tf.Variable(
        tf.random_normal([cluster_size, hidden1_size],
                         stddev=1 / math.sqrt(cluster_size)))
    tf.summary.histogram("hidden1_weights", hidden1_weights)
    activation = tf.matmul(activation, hidden1_weights)
    if add_batch_norm:
        activation = slim.batch_norm(activation,
                                     center=True,
                                     scale=True,
                                     is_training=is_training,
                                     scope="hidden1_bn")
    else:
        hidden1_biases = tf.Variable(
            tf.random_normal([hidden1_size], stddev=0.01))
        tf.summary.histogram("hidden1_biases", hidden1_biases)
        activation += hidden1_biases
    activation = tf.nn.relu6(activation)
    tf.summary.histogram("hidden1_output", activation)

    aggregated_model = getattr(video_level_models,
                               DBoFConfig.video_level_classifier_model)
    return aggregated_model().create_model(model_input=activation,
                                           vocab_size=vocab_size,
                                           **unused_params)
def create_ds_cnn_model(fingerprint_input, model_settings, model_size_info,
                        is_training):
    """Builds a model with depthwise separable convolutional neural network

    Model definition is based on https://arxiv.org/abs/1704.04861 and
    Tensorflow implementation: https://github.com/Zehaos/MobileNet

    Args:
        fingerprint_input: Input tensor; it is reshaped to
            [-1, spectrogram_length, dct_coefficient_count, 1].
        model_settings: Dict read for 'label_count', 'dct_coefficient_count'
            and 'spectrogram_length'.
        model_size_info: Defines number of layers, followed by the DS-Conv
            layer parameters in the order {number of conv features, conv
            filter height, width and stride in y,x dir.} for each of the
            layers. Note that first layer is always regular convolution, but
            the remaining layers are all depthwise separable convolutions.
        is_training: Forwarded to batch norm; also decides the return shape.

    Returns:
        `(logits, dropout_prob)` when `is_training` is truthy, otherwise just
        `logits`. `dropout_prob` is a float32 placeholder that this function
        creates and returns but does not connect to any layer.
    """

    def _depthwise_separable_conv(inputs, num_pwc_filters, sc, kernel_size,
                                  stride):
        """Helper function to build the depth-wise separable convolution layer."""
        # skip pointwise by setting num_outputs=None
        depthwise_conv = slim.separable_convolution2d(
            inputs,
            num_outputs=None,
            stride=stride,
            depth_multiplier=1,
            kernel_size=kernel_size,
            scope=sc + '/depthwise_conv')
        bn = slim.batch_norm(depthwise_conv, scope=sc + '/dw_batch_norm')
        pointwise_conv = slim.convolution2d(bn,
                                            num_pwc_filters,
                                            kernel_size=[1, 1],
                                            scope=sc + '/pointwise_conv')
        bn = slim.batch_norm(pointwise_conv, scope=sc + '/pw_batch_norm')
        return bn

    if is_training:
        dropout_prob = tf.placeholder(tf.float32, name='dropout_prob')

    label_count = model_settings['label_count']
    input_frequency_size = model_settings['dct_coefficient_count']
    input_time_size = model_settings['spectrogram_length']
    fingerprint_4d = tf.reshape(
        fingerprint_input, [-1, input_time_size, input_frequency_size, 1])

    t_dim = input_time_size
    f_dim = input_frequency_size

    # Extract model dimensions from model_size_info: after the leading layer
    # count, every layer owns five consecutive entries.
    num_layers = model_size_info[0]
    conv_feat, conv_kt, conv_kf, conv_st, conv_sf = [], [], [], [], []
    for layer_no in range(num_layers):
        base = 1 + 5 * layer_no
        conv_feat.append(model_size_info[base])
        conv_kt.append(model_size_info[base + 1])
        conv_kf.append(model_size_info[base + 2])
        conv_st.append(model_size_info[base + 3])
        conv_sf.append(model_size_info[base + 4])

    scope = 'DS-CNN'
    with tf.variable_scope(scope) as sc:
        end_points_collection = sc.name + '_end_points'
        with slim.arg_scope(
                [slim.convolution2d, slim.separable_convolution2d],
                activation_fn=None,
                weights_initializer=slim.initializers.xavier_initializer(),
                biases_initializer=slim.init_ops.zeros_initializer(),
                outputs_collections=[end_points_collection]):
            with slim.arg_scope([slim.batch_norm],
                                is_training=is_training,
                                decay=0.96,
                                updates_collections=None,
                                activation_fn=tf.nn.relu):
                for layer_no in range(num_layers):
                    if layer_no == 0:
                        net = slim.convolution2d(
                            fingerprint_4d,
                            conv_feat[layer_no],
                            [conv_kt[layer_no], conv_kf[layer_no]],
                            stride=[conv_st[layer_no], conv_sf[layer_no]],
                            padding='SAME',
                            scope='conv_1')
                        net = slim.batch_norm(net, scope='conv_1/batch_norm')
                    else:
                        net = _depthwise_separable_conv(
                            net,
                            conv_feat[layer_no],
                            kernel_size=[conv_kt[layer_no],
                                         conv_kf[layer_no]],
                            stride=[conv_st[layer_no], conv_sf[layer_no]],
                            sc='conv_ds_' + str(layer_no))
                    # Track the spatial size so the final average pool can
                    # cover the whole map. math.ceil returns a float on
                    # Python 2, hence the explicit int().
                    t_dim = int(math.ceil(t_dim / float(conv_st[layer_no])))
                    f_dim = int(math.ceil(f_dim / float(conv_sf[layer_no])))

                net = slim.avg_pool2d(net, [t_dim, f_dim], scope='avg_pool')

        net = tf.squeeze(net, [1, 2], name='SpatialSqueeze')
        logits = slim.fully_connected(net,
                                      label_count,
                                      activation_fn=None,
                                      scope='fc1')

    if is_training:
        return logits, dropout_prob
    return logits
def forward(self, reshaped_input):
    """Forward pass of a NetVLAD block.

    Args:
        reshaped_input: If your input is in that form:
            'batch_size' x 'max_samples' x 'feature_size'
            it should be reshaped in the following form:
            'batch_size*max_samples' x 'feature_size'
            by performing:
            reshaped_input = tf.reshape(input, [-1, features_size])

    Returns:
        vlad: the pooled vector of size: 'batch_size' x 'output_dim'
    """
    feat_dim = self.feature_size
    n_clusters = self.cluster_size
    feat_stddev = 1 / math.sqrt(feat_dim)

    # Soft-assignment logits of every sample to every cluster.
    assign_kernel = tf.get_variable(
        "cluster_weights", [feat_dim, n_clusters],
        initializer=tf.random_normal_initializer(stddev=feat_stddev))
    assignment = tf.matmul(reshaped_input, assign_kernel)

    if not self.add_batch_norm:
        assign_bias = tf.get_variable(
            "cluster_biases", [n_clusters],
            initializer=tf.random_normal_initializer(stddev=feat_stddev))
        assignment += assign_bias
    else:
        assignment = slim.batch_norm(assignment,
                                     center=True,
                                     scale=True,
                                     is_training=self.is_training,
                                     scope="cluster_bn",
                                     fused=False)

    assignment = tf.nn.softmax(assignment)
    assignment = tf.reshape(assignment, [-1, self.max_samples, n_clusters])

    # Total assignment mass per cluster, scaled by the learned centres.
    assignment_mass = tf.reduce_sum(assignment, -2, keep_dims=True)
    centres = tf.get_variable(
        "cluster_weights2", [1, feat_dim, n_clusters],
        initializer=tf.random_normal_initializer(stddev=feat_stddev))
    weighted_centres = tf.multiply(assignment_mass, centres)

    assignment = tf.transpose(assignment, perm=[0, 2, 1])
    samples = tf.reshape(reshaped_input, [-1, self.max_samples, feat_dim])

    # Assignment-weighted sum of samples minus the weighted centres.
    vlad = tf.matmul(assignment, samples)
    vlad = tf.transpose(vlad, perm=[0, 2, 1])
    vlad = tf.subtract(vlad, weighted_centres)

    # Normalize along axis 1, flatten, then normalize the flat vector.
    vlad = tf.nn.l2_normalize(vlad, 1)
    vlad = tf.reshape(vlad, [-1, n_clusters * feat_dim])
    vlad = tf.nn.l2_normalize(vlad, 1)

    projection = tf.get_variable(
        "hidden1_weights", [n_clusters * feat_dim, self.output_dim],
        initializer=tf.random_normal_initializer(
            stddev=1 / math.sqrt(n_clusters)))
    vlad = tf.matmul(vlad, projection)

    # Batch norm on the projected descriptor.
    vlad = tf.contrib.layers.batch_norm(vlad,
                                        center=True,
                                        scale=True,
                                        is_training=self.is_training,
                                        scope='bn')

    if self.gating:
        # NOTE(review): super(self.__class__, self) recurses forever if this
        # class is ever subclassed; naming the class explicitly would be
        # safer. Left unchanged because the class name is not visible here.
        vlad = super(self.__class__, self).context_gating(vlad)

    return vlad
def create_model(self, model_input, vocab_size, num_frames, iterations=None,
                 add_batch_norm=None, sample_random_frames=None,
                 cluster_size=None, hidden_size=None, is_training=True,
                 **unused_params):
    """Build a two-stream (video + audio) DBoF model with a MoE classifier.

    The first 1024 feature columns go through a `GatedDBoF` block and the
    remaining columns through a `SoftDBoF` block; both outputs are
    concatenated, reduced to 1024 units, and passed to
    `video_level_models.willow_MoeModel_moe4_noGP`.

    Args:
        model_input: Frame features; dimensions 1 and 2 of the sampled
            tensor are read as (frames, feature size).
        vocab_size: Number of classes, forwarded to the classifier.
        num_frames: Per-example frame counts; expanded on axis 1 and cast to
            float32 before sampling.
        iterations, add_batch_norm, sample_random_frames, cluster_size,
        hidden_size: Accepted for interface compatibility only. Every one of
            them is overridden by the hard-coded configuration below.
        is_training: Forwarded to the pooling blocks, batch norm, and the
            classifier.
        **unused_params: Forwarded untouched to the classifier.

    Returns:
        Whatever the classifier's `create_model` returns.
    """
    # Hard-coded configuration; the keyword arguments of the same names are
    # deliberately ignored.
    iterations = 300
    add_batch_norm = True
    random_frames = True
    cluster_size = 2048
    hidden1_size = 1024
    fc_dimred = True
    relu = False
    max_pool = False

    num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
    if random_frames:
        model_input = utils.SampleRandomFrames(model_input, num_frames,
                                               iterations)
    else:
        model_input = utils.SampleRandomSequence(model_input, num_frames,
                                                 iterations)

    max_frames = model_input.get_shape().as_list()[1]
    feature_size = model_input.get_shape().as_list()[2]
    reshaped_input = tf.reshape(model_input, [-1, feature_size])
    tf.summary.histogram("input_hist", reshaped_input)

    video_Dbof = GatedDBoF(1024, max_frames, cluster_size, max_pool,
                           add_batch_norm, is_training)
    # Floor division keeps the audio cluster count an int on Python 3, where
    # `cluster_size / 8` would be the float 256.0.
    audio_Dbof = SoftDBoF(128, max_frames, cluster_size // 8, max_pool,
                          add_batch_norm, is_training)

    if add_batch_norm:
        reshaped_input = slim.batch_norm(reshaped_input,
                                         center=True,
                                         scale=True,
                                         is_training=is_training,
                                         scope="input_bn")

    # Columns [0, 1024) feed the video block, the rest feed the audio block.
    with tf.variable_scope("video_DBOF"):
        dbof_video = video_Dbof.forward(reshaped_input[:, 0:1024])

    with tf.variable_scope("audio_DBOF"):
        dbof_audio = audio_Dbof.forward(reshaped_input[:, 1024:])

    dbof = tf.concat([dbof_video, dbof_audio], 1)
    dbof_dim = dbof.get_shape().as_list()[1]

    if fc_dimred:
        hidden1_weights = tf.get_variable(
            "hidden1_weights", [dbof_dim, hidden1_size],
            initializer=tf.random_normal_initializer(
                stddev=1 / math.sqrt(cluster_size)))
        tf.summary.histogram("hidden1_weights", hidden1_weights)
        activation = tf.matmul(dbof, hidden1_weights)

        # Batch norm is only applied together with the ReLU; otherwise a
        # plain bias is added.
        if add_batch_norm and relu:
            activation = slim.batch_norm(activation,
                                         center=True,
                                         scale=True,
                                         is_training=is_training,
                                         scope="hidden1_bn")
        else:
            hidden1_biases = tf.get_variable(
                "hidden1_biases", [hidden1_size],
                initializer=tf.random_normal_initializer(stddev=0.01))
            tf.summary.histogram("hidden1_biases", hidden1_biases)
            activation += hidden1_biases

        if relu:
            activation = tf.nn.relu6(activation)
        tf.summary.histogram("hidden1_output", activation)
    else:
        activation = dbof

    aggregated_model = getattr(video_level_models,
                               'willow_MoeModel_moe4_noGP')
    return aggregated_model().create_model(model_input=activation,
                                           vocab_size=vocab_size,
                                           is_training=is_training,
                                           **unused_params)
def data_bn_layer(self, x, in_channels):
    """Batch-normalize `x` when `self.data_bn` is exactly `True`.

    Any other value of `self.data_bn` (including truthy non-bool values)
    returns `x` unchanged. `in_channels` is accepted but not used.
    """
    if self.data_bn is not True:
        return x
    return slim.batch_norm(x)
def nm(x):
    """Blend `x` with its batch-normalized version via two scalar variables.

    Creates `w0` (initial value 1.0) and `w1` (initial value 0.0) and returns
    `w0 * x + w1 * slim.batch_norm(x)`.
    """
    identity_gain = tf.Variable(1.0, name='w0')
    norm_gain = tf.Variable(0.0, name='w1')
    passthrough = identity_gain * x
    normalized = slim.batch_norm(x)
    return passthrough + norm_gain * normalized
def create_model(self, model_input, vocab_size, is_training, num_mixtures=None,
                 l2_penalty=1e-8, **unused_params):
    """Creates a Mixture of (Logistic) Experts model with optional gating.

    Per class, a softmax gate over `num_mixtures + 1` slots weights
    `num_mixtures` sigmoid experts; the extra gate slot has no expert
    attached to it. Optionally the resulting probabilities are multiplied by
    a learned sigmoid gate.

    Args:
        model_input: 'batch_size' x 'num_features' matrix of input features.
        vocab_size: The number of classes in the dataset.
        is_training: Forwarded to the batch norm of the probability gate.
        num_mixtures: The number of mixtures (excluding the extra gate
            slot). Falls back to `FLAGS.moe_num_mixtures` when falsy.
        l2_penalty: Accepted for interface compatibility; the value actually
            used for regularization is always `FLAGS.moe_l2`.

    Returns:
        A dictionary with a tensor containing the probability predictions of
        the model in the 'predictions' key. The dimensions of the tensor are
        batch_size x num_classes.
    """
    n_experts = num_mixtures or FLAGS.moe_num_mixtures
    gate_rank = FLAGS.moe_low_rank_gating
    weight_penalty = FLAGS.moe_l2
    use_prob_gating = FLAGS.moe_prob_gating
    prob_gating_source = FLAGS.moe_prob_gating_input
    input_size = model_input.get_shape().as_list()[1]
    drop_diagonal = FLAGS.gating_remove_diag

    def _linear(inputs, width, scope, **overrides):
        # Linear fully-connected layer with the shared L2 regularizer.
        return slim.fully_connected(
            inputs,
            width,
            activation_fn=None,
            weights_regularizer=slim.l2_regularizer(weight_penalty),
            scope=scope,
            **overrides)

    gate_width = vocab_size * (n_experts + 1)
    if gate_rank != -1:
        # Low-rank gate: factor the projection through `gate_rank` units.
        gate_bottleneck = _linear(model_input, gate_rank, "gates1",
                                  biases_initializer=None)
        gate_logits = _linear(gate_bottleneck, gate_width, "gates2",
                              biases_initializer=None)
    else:
        gate_logits = _linear(model_input, gate_width, "gates",
                              biases_initializer=None)

    expert_logits = _linear(model_input, vocab_size * n_experts, "experts")

    # (Batch * #Labels) x (num_mixtures + 1)
    gate_probs = tf.nn.softmax(tf.reshape(gate_logits, [-1, n_experts + 1]))
    # (Batch * #Labels) x num_mixtures
    expert_probs = tf.nn.sigmoid(tf.reshape(expert_logits, [-1, n_experts]))

    flat_probabilities = tf.reduce_sum(
        gate_probs[:, :n_experts] * expert_probs, 1)
    probabilities = tf.reshape(flat_probabilities, [-1, vocab_size])

    if not use_prob_gating:
        return {"predictions": probabilities}

    gate_init = tf.random_normal_initializer(
        stddev=1 / math.sqrt(vocab_size))
    if prob_gating_source == 'prob':
        gating_weights = tf.get_variable("gating_prob_weights",
                                         [vocab_size, vocab_size],
                                         initializer=gate_init)
        gates = tf.matmul(probabilities, gating_weights)
    else:
        gating_weights = tf.get_variable("gating_prob_weights",
                                         [input_size, vocab_size],
                                         initializer=gate_init)
        gates = tf.matmul(model_input, gating_weights)

    if drop_diagonal:
        # Removes the contribution of the diagonal coefficients.
        diagonals = tf.matrix_diag_part(gating_weights)
        gates = gates - tf.multiply(diagonals, probabilities)

    gates = slim.batch_norm(gates,
                            center=True,
                            scale=True,
                            is_training=is_training,
                            scope="gating_prob_bn")
    gates = tf.sigmoid(gates)

    return {"predictions": tf.multiply(probabilities, gates)}
def main(args):
    """Build a multi-GPU training graph and run the training loop.

    Creates timestamped log/model directories, loads the dataset listing,
    builds a `tf.data` input pipeline, replicates the selected network on
    `args.num_gpus` towers, and repeatedly calls `train(...)` followed by
    `save_variables_and_metagraph(...)` until `args.max_nrof_epochs` is
    reached.

    Args:
        args: Parsed command-line namespace; the attributes read are visible
            in the body (directories, seed, batch geometry, optimizer,
            network and loss selection, ...).

    Returns:
        The model directory that checkpoints are written to.
    """
    #network = importlib.import_module(args.model_def)

    # One timestamped sub-directory shared by logs and models.
    subdir = datetime.strftime(datetime.now(), '%Y%m%d-%H%M%S')
    log_dir = os.path.join(os.path.expanduser(args.logs_base_dir), subdir)
    if not os.path.isdir(
            log_dir):  # Create the log directory if it doesn't exist
        os.makedirs(log_dir)
    model_dir = os.path.join(os.path.expanduser(args.models_base_dir), subdir)
    if not os.path.isdir(
            model_dir):  # Create the model directory if it doesn't exist
        os.makedirs(model_dir)

    # Store some git revision info in a text file in the log directory
    src_path, _ = os.path.split(os.path.realpath(__file__))
    utils.store_revision_info(src_path, log_dir, ' '.join(sys.argv))

    np.random.seed(seed=args.seed)

    train_set = utils.get_dataset(args.data_dir)
    #train_set = utils.dataset_from_list(args.data_dir,args.list_file)
    nrof_classes = len(train_set)
    print('nrof_classes: ', nrof_classes)
    image_list, label_list = utils.get_image_paths_and_labels(train_set)
    print('total images: ', len(image_list))
    image_list = np.array(image_list)
    label_list = np.array(label_list, dtype=np.int32)

    dataset_size = len(image_list)
    # Per-GPU batch size.
    single_batch_size = args.people_per_batch * args.images_per_person
    indices = list(range(dataset_size))
    np.random.shuffle(indices)

    def _sample_people_softmax(x):
        # Returns the next slice of (paths, labels) from the shuffled index
        # list, reshuffling and restarting once the list is exhausted. The
        # argument `x` is not used.
        # NOTE(review): `softmax_ind` is a module-level cursor that must be
        # defined elsewhere in this file - it is not initialised here.
        global softmax_ind
        if softmax_ind >= dataset_size:
            np.random.shuffle(indices)
            softmax_ind = 0
        # The last slice of a pass may be shorter than a full batch.
        true_num_batch = min(single_batch_size, dataset_size - softmax_ind)

        sample_paths = image_list[indices[softmax_ind:softmax_ind +
                                          true_num_batch]]
        sample_labels = label_list[indices[softmax_ind:softmax_ind +
                                           true_num_batch]]

        softmax_ind += true_num_batch

        return (np.array(sample_paths), np.array(sample_labels,
                                                 dtype=np.int32))

    def _sample_people(x):
        '''We sample people based on tf.data, where we can use transform and prefetch.

        Labels are the positions of the classes in the sampled batch.
        '''
        # NOTE(review): this helper is not referenced anywhere else inside
        # main(); only _sample_people_softmax feeds the dataset below.
        image_paths, num_per_class = sample_people(
            train_set, args.people_per_batch * (args.num_gpus - 1),
            args.images_per_person)
        labels = []
        for i in range(len(num_per_class)):
            labels.extend([i] * num_per_class[i])
        return (np.array(image_paths), np.array(labels, dtype=np.int32))

    def _parse_function(filename, label):
        # Reads and decodes one image file, crops or resizes it to
        # (image_height, image_width), optionally flips it, and scales the
        # pixel values unless `debug` is set.
        file_contents = tf.read_file(filename)
        image = tf.image.decode_image(file_contents, channels=3)
        #image = tf.image.decode_jpeg(file_contents, channels=3)
        print(image.shape)

        if args.random_crop:
            print('use random crop')
            image = tf.random_crop(image,
                                   [args.image_height, args.image_width, 3])
        else:
            print('Not use random crop')
            #image.set_shape((args.image_size, args.image_size, 3))
            # decode_image leaves the static shape unknown; resize_images
            # needs at least the rank and channel count.
            image.set_shape((None, None, 3))
            image = tf.image.resize_images(image,
                                           size=(args.image_height,
                                                 args.image_width))
            #print(image.shape)
        if args.random_flip:
            image = tf.image.random_flip_left_right(image)

        #pylint: disable=no-member
        #image.set_shape((args.image_size, args.image_size, 3))
        image.set_shape((args.image_height, args.image_width, 3))
        # NOTE(review): `debug` is not defined in this function; presumably a
        # module-level flag - confirm.
        if debug:
            image = tf.cast(image, tf.float32)
        else:
            image = tf.cast(image, tf.float32)
            image = tf.subtract(image, 127.5)
            image = tf.div(image, 128.)
            #image = tf.image.per_image_standardization(image)
        return image, label

    print('Model directory: %s' % model_dir)
    print('Log directory: %s' % log_dir)
    if args.pretrained_model:
        print('Pre-trained model: %s' %
              os.path.expanduser(args.pretrained_model))

    with tf.Graph().as_default():
        tf.set_random_seed(args.seed)
        global_step = tf.Variable(0, trainable=False, name='global_step')

        # Placeholder for the learning rate
        learning_rate_placeholder = tf.placeholder(tf.float32,
                                                   name='learning_rate')

        phase_train_placeholder = tf.placeholder(tf.bool, name='phase_train')

        #the image is generated by sequence
        with tf.device("/cpu:0"):
            # Each dataset element triggers one call to
            # _sample_people_softmax; the slices are flattened to single
            # (path, label) pairs, decoded, and re-batched for all GPUs.
            softmax_dataset = tf_data.Dataset.range(args.epoch_size *
                                                    args.max_nrof_epochs * 100)
            softmax_dataset = softmax_dataset.map(lambda x: tf.py_func(
                _sample_people_softmax, [x], [tf.string, tf.int32]))
            # `_from_tensor_slices` is defined outside this function.
            softmax_dataset = softmax_dataset.flat_map(_from_tensor_slices)
            softmax_dataset = softmax_dataset.map(_parse_function,
                                                  num_parallel_calls=8)
            softmax_dataset = softmax_dataset.batch(args.num_gpus *
                                                    single_batch_size)
            softmax_iterator = softmax_dataset.make_initializable_iterator()
            softmax_next_element = softmax_iterator.get_next()
            softmax_next_element[0].set_shape(
                (args.num_gpus * single_batch_size, args.image_height,
                 args.image_width, 3))
            softmax_next_element[1].set_shape(args.num_gpus *
                                              single_batch_size)
            # One equally sized shard per GPU tower.
            batch_image_split = tf.split(softmax_next_element[0],
                                         args.num_gpus)
            batch_label_split = tf.split(softmax_next_element[1],
                                         args.num_gpus)

        learning_rate = tf.train.exponential_decay(
            learning_rate_placeholder,
            global_step,
            args.learning_rate_decay_epochs * args.epoch_size,
            args.learning_rate_decay_factor,
            staircase=True)
        tf.summary.scalar('learning_rate', learning_rate)

        print('Using optimizer: {}'.format(args.optimizer))
        if args.optimizer == 'ADAGRAD':
            opt = tf.train.AdagradOptimizer(learning_rate)
        elif args.optimizer == 'MOM':
            opt = tf.train.MomentumOptimizer(learning_rate, 0.9)
        elif args.optimizer == 'ADAM':
            opt = tf.train.AdamOptimizer(learning_rate,
                                         beta1=0.9,
                                         beta2=0.999,
                                         epsilon=0.1)
        else:
            raise Exception("Not supported optimizer: {}".format(
                args.optimizer))

        tower_losses = []
        tower_cross = []
        tower_dist = []
        tower_reg = []
        for i in range(args.num_gpus):
            with tf.device("/gpu:" + str(i)):
                with tf.name_scope("tower_" + str(i)) as scope:
                    # Variables created through slim are pinned to the CPU so
                    # that all towers share them.
                    with slim.arg_scope([slim.model_variable, slim.variable],
                                        device="/cpu:0"):
                        with tf.variable_scope(
                                tf.get_variable_scope()) as var_scope:
                            # Only the first tower creates variables; the
                            # others reuse them.
                            reuse = False if i == 0 else True
                            #with slim.arg_scope(resnet_v2.resnet_arg_scope(args.weight_decay)):
                            #prelogits, end_points = resnet_v2.resnet_v2_50(batch_image_split[i],is_training=True,
                            #        output_stride=16,num_classes=args.embedding_size,reuse=reuse)
                            #prelogits, end_points = network.inference(batch_image_split[i], args.keep_probability,
                            #    phase_train=phase_train_placeholder, bottleneck_layer_size=args.embedding_size,
                            #    weight_decay=args.weight_decay, reuse=reuse)
                            if args.network == 'sphere_network':
                                prelogits = network.infer(
                                    batch_image_split[i], args.embedding_size)
                                print(prelogits)
                            elif args.network == 'resface':
                                prelogits, _ = resface.inference(
                                    batch_image_split[i],
                                    1.0,
                                    bottleneck_layer_size=args.embedding_size,
                                    weight_decay=args.weight_decay,
                                    reuse=reuse)
                            elif args.network == 'inception_net':
                                prelogits, endpoints = inception_net.inference(
                                    batch_image_split[i],
                                    1,
                                    phase_train=True,
                                    bottleneck_layer_size=args.embedding_size,
                                    weight_decay=args.weight_decay,
                                    reuse=reuse)
                                print(prelogits)

                            elif args.network == 'resnet_v2':
                                with slim.arg_scope(
                                        resnet_v2.resnet_arg_scope(
                                            args.weight_decay)):
                                    prelogits, end_points = resnet_v2.resnet_v2_50(
                                        batch_image_split[i],
                                        is_training=True,
                                        output_stride=16,
                                        num_classes=args.embedding_size,
                                        reuse=reuse)
                                # Drop the two singleton spatial axes.
                                prelogits = tf.squeeze(prelogits, axis=[1, 2])
                            elif args.network == 'mobilenet':
                                prelogits, net_points = mobilenet.inference(
                                    batch_image_split[i],
                                    bottleneck_layer_size=args.embedding_size,
                                    phase_train=True,
                                    weight_decay=args.weight_decay,
                                    reuse=reuse)

                            else:
                                raise Exception(
                                    "Not supported network: {}".format(
                                        args.network))
                            if args.fc_bn:
                                prelogits = slim.batch_norm(
                                    prelogits,
                                    is_training=True,
                                    decay=0.997,
                                    epsilon=1e-5,
                                    scale=True,
                                    updates_collections=tf.GraphKeys.
                                    UPDATE_OPS,
                                    reuse=reuse,
                                    scope='softmax_bn')
                            if args.loss_type == 'softmax':
                                cross_entropy_mean = utils.softmax_loss(
                                    prelogits, batch_label_split[i],
                                    len(train_set), args.weight_decay, reuse)
                                regularization_losses = tf.get_collection(
                                    tf.GraphKeys.REGULARIZATION_LOSSES)
                                tower_cross.append(cross_entropy_mean)
                                #loss = cross_entropy_mean + args.weight_decay*tf.add_n(regularization_losses)
                                loss = cross_entropy_mean + tf.add_n(
                                    regularization_losses)
                                #tower_dist.append(0)
                                #tower_cross.append(cross_entropy_mean)
                                #tower_th.append(0)
                                tower_losses.append(loss)
                                # NOTE(review): this appends the list of
                                # regularization tensors, not their sum.
                                tower_reg.append(regularization_losses)
                            elif args.loss_type == 'cosface':
                                label_reshape = tf.reshape(
                                    batch_label_split[i], [single_batch_size])
                                label_reshape = tf.cast(
                                    label_reshape, tf.int64)
                                coco_loss = utils.cos_loss(prelogits,
                                                           label_reshape,
                                                           len(train_set),
                                                           reuse,
                                                           alpha=args.alpha,
                                                           scale=args.scale)
                                #scatter_loss, _ = facenet.coco_loss(prelogits,label_reshape, len(train_set),reuse,alpha=args.alpha,scale=args.scale)
                                #coco_loss = scatter_loss['loss_total']
                                regularization_losses = tf.get_collection(
                                    tf.GraphKeys.REGULARIZATION_LOSSES)
                                if args.network == 'sphere_network':
                                    print(
                                        'reg loss using weight_decay * tf.add_n'
                                    )
                                    reg_loss = args.weight_decay * tf.add_n(
                                        regularization_losses)
                                else:
                                    print('reg loss using tf.add_n')
                                    # NOTE(review): despite the message above,
                                    # no tf.add_n is applied here - reg_loss
                                    # is the raw collection list, so the
                                    # addition below relies on list-to-tensor
                                    # conversion. Looks unintended; confirm.
                                    reg_loss = tf.get_collection(
                                        tf.GraphKeys.REGULARIZATION_LOSSES)
                                loss = coco_loss + reg_loss

                                tower_losses.append(loss)
                                tower_reg.append(reg_loss)

                            #loss = tf.add_n([cross_entropy_mean] + regularization_losses, name='total_loss')
                            tf.get_variable_scope().reuse_variables()
        # Average the per-tower losses.
        total_loss = tf.reduce_mean(tower_losses)
        total_reg = tf.reduce_mean(tower_reg)
        losses = {}
        losses['total_loss'] = total_loss
        losses['total_reg'] = total_reg

        grads = opt.compute_gradients(total_loss,
                                      tf.trainable_variables(),
                                      colocate_gradients_with_ops=True)
        apply_gradient_op = opt.apply_gradients(grads,
                                                global_step=global_step)

        # Make the train op depend on the ops collected in UPDATE_OPS
        # (the 'softmax_bn' layer above registers its updates there).
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):
            train_op = tf.group(apply_gradient_op)

        # Checkpoints skip variables whose name contains 'Adagrad' or
        # 'global_step'.
        # NOTE(review): slot variables of the other optimizers ('Momentum',
        # 'Adam') are not filtered - confirm that is intended.
        save_vars = [
            var for var in tf.global_variables()
            if 'Adagrad' not in var.name and 'global_step' not in var.name
        ]

        #saver = tf.train.Saver(tf.trainable_variables(), max_to_keep=3)
        saver = tf.train.Saver(save_vars, max_to_keep=3)

        # Build the summary operation based on the TF collection of Summaries.
        summary_op = tf.summary.merge_all()

        # Start running operations on the Graph.
        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=args.gpu_memory_fraction)
        sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options,
                                                allow_soft_placement=True))

        # Initialize variables
        sess.run(tf.global_variables_initializer(),
                 feed_dict={phase_train_placeholder: True})
        sess.run(tf.local_variables_initializer(),
                 feed_dict={phase_train_placeholder: True})

        #sess.run(iterator.initializer)
        sess.run(softmax_iterator.initializer)

        summary_writer = tf.summary.FileWriter(log_dir, sess.graph)
        coord = tf.train.Coordinator()
        tf.train.start_queue_runners(coord=coord, sess=sess)

        with sess.as_default():
            #pdb.set_trace()

            if args.pretrained_model:
                print('Restoring pretrained model: %s' %
                      args.pretrained_model)
                saver.restore(sess, os.path.expanduser(args.pretrained_model))

            # Training and validation loop
            epoch = 0
            while epoch < args.max_nrof_epochs:
                # The epoch counter is derived from the global step.
                step = sess.run(global_step, feed_dict=None)
                epoch = step // args.epoch_size
                if debug:
                    # NOTE(review): most of these arguments
                    # (image_batch_gather, enqueue_op, gpus, ...) are not
                    # defined in main(); this call can only work if they
                    # exist at module level - confirm.
                    debug_train(args, sess, train_set, epoch,
                                image_batch_gather, enqueue_op,
                                batch_size_placeholder, image_batch_split,
                                image_paths_split, num_per_class_split,
                                image_paths_placeholder,
                                image_paths_split_placeholder,
                                labels_placeholder, labels_batch,
                                num_per_class_placeholder,
                                num_per_class_split_placeholder, len(gpus))
                # Train for one epoch
                train(args, sess, epoch, learning_rate_placeholder,
                      phase_train_placeholder, global_step, losses, train_op,
                      summary_op, summary_writer,
                      args.learning_rate_schedule_file)

                # Save variables and the metagraph if it doesn't exist already
                save_variables_and_metagraph(sess, saver, summary_writer,
                                             model_dir, subdir, step)

    return model_dir
def inference(inputs, num_classes, n):
    """Build a pre-activation residual network with 6n+2 layers.

    Three stages of `n` residual units each (16, 32 and 64 filters). The
    first unit of stages 2 and 3 halves the spatial size; its shortcut is
    average-pooled and zero-padded on the channel axis to match.

    :param inputs: input tensor; the final feature map is asserted to have
        shape [8, 8, 64] after the three stages
    :param num_classes: width of the final fully connected layer
    :param n: number of residual units per stage
    :return: the output of the final fully connected layer
    """

    def _residual_branch(features, filters, downsample):
        # BN -> 3x3 conv -> BN -> 3x3 conv; only the first conv may stride.
        first_conv_kwargs = {'stride': 2} if downsample else {}
        out = slim.batch_norm(features)
        out = slim.conv2d(out, filters, [3, 3], **first_conv_kwargs)
        out = slim.batch_norm(out)
        return slim.conv2d(out, filters, [3, 3])

    def _shrunk_shortcut(shortcut, channel_pad):
        # Halve the spatial size and zero-pad both ends of the channel axis.
        pooled = slim.avg_pool2d(shortcut, [2, 2])
        return tf.pad(pooled, [[0, 0], [0, 0], [0, 0],
                               [channel_pad, channel_pad]])

    with slim.arg_scope(arg_scope()):
        net = slim.batch_norm(inputs)
        net = slim.conv2d(net, 16, [3, 3])

        with tf.variable_scope('residual_bolck1'):
            for unit in range(n):
                with tf.variable_scope('residual_bolck1_%d' % unit):
                    shortcut = net
                    net = _residual_branch(net, 16, downsample=False)
                    net = net + shortcut

        with tf.variable_scope('residual_bolck2'):
            for unit in range(n):
                # NOTE(review): this stage uses tf.name_scope while stages 1
                # and 3 use tf.variable_scope, so its variables are named
                # differently. Kept as is - changing it would rename
                # variables.
                with tf.name_scope('residual_bolck2_%d' % unit):
                    shortcut = net
                    is_first = unit == 0
                    net = _residual_branch(net, 32, downsample=is_first)
                    if is_first:
                        net = net + _shrunk_shortcut(shortcut, 8)
                    else:
                        net = net + shortcut

        with tf.variable_scope('residual_bolck3'):
            for unit in range(n):
                with tf.variable_scope('residual_bolck3_%d' % unit):
                    shortcut = net
                    is_first = unit == 0
                    net = _residual_branch(net, 64, downsample=is_first)
                    if is_first:
                        net = net + _shrunk_shortcut(shortcut, 16)
                    else:
                        net = net + shortcut

        net = slim.batch_norm(net)
        assert net.get_shape().as_list()[1:] == [8, 8, 64]

        with tf.variable_scope('fully_connected'):
            # Global average pooling over the two spatial axes.
            net = tf.reduce_mean(net, [1, 2])
            net = slim.flatten(net)
            logits = slim.fully_connected(net, num_classes)

    return logits
def forward(self, reshaped_input):
    """Forward pass of a NetFV block.

    Args:
        reshaped_input: If your input is in that form:
            'batch_size' x 'max_samples' x 'feature_size'
            it should be reshaped in the following form:
            'batch_size*max_samples' x 'feature_size'
            by performing:
            reshaped_input = tf.reshape(input, [-1, features_size])

    Returns:
        fv: the pooled vector of size: 'batch_size' x 'output_dim'
    """
    feat_dim = self.feature_size
    n_clusters = self.cluster_size
    flat_dim = n_clusters * feat_dim
    feat_stddev = 1 / math.sqrt(feat_dim)

    assign_kernel = tf.get_variable(
        "cluster_weights", [feat_dim, n_clusters],
        initializer=tf.random_normal_initializer(stddev=feat_stddev))

    # Per-cluster scale: squared and offset by a small epsilon so that it is
    # strictly positive before being used as a divisor.
    sigma = tf.get_variable(
        "covar_weights", [feat_dim, n_clusters],
        initializer=tf.random_normal_initializer(mean=1.0,
                                                 stddev=feat_stddev))
    sigma = tf.square(sigma)
    epsilon = tf.constant([1e-6])
    sigma = tf.add(sigma, epsilon)

    assignment = tf.matmul(reshaped_input, assign_kernel)
    if not self.add_batch_norm:
        assign_bias = tf.get_variable(
            "cluster_biases", [n_clusters],
            initializer=tf.random_normal_initializer(stddev=feat_stddev))
        assignment += assign_bias
    else:
        assignment = slim.batch_norm(assignment,
                                     center=True,
                                     scale=True,
                                     is_training=self.is_training,
                                     scope="cluster_bn")

    assignment = tf.nn.softmax(assignment)
    assignment = tf.reshape(assignment, [-1, self.max_samples, n_clusters])

    assignment_mass = tf.reduce_sum(assignment, -2, keep_dims=True)

    centres = tf.get_variable(
        "cluster_weights2", [1, feat_dim, n_clusters],
        initializer=tf.random_normal_initializer(stddev=feat_stddev))
    weighted_centres = tf.multiply(assignment_mass, centres)

    assignment = tf.transpose(assignment, perm=[0, 2, 1])
    samples = tf.reshape(reshaped_input, [-1, self.max_samples, feat_dim])

    # First-order statistics (completed further below).
    fv1 = tf.matmul(assignment, samples)
    fv1 = tf.transpose(fv1, perm=[0, 2, 1])

    # computing second order FV
    centres_sq = tf.square(centres)
    a2 = tf.multiply(assignment_mass, centres_sq)
    b2 = tf.multiply(fv1, centres)
    samples_sq = tf.square(samples)
    fv2 = tf.matmul(assignment, samples_sq)
    fv2 = tf.transpose(fv2, perm=[0, 2, 1])
    cross_term = tf.scalar_mul(-2, b2)
    fv2 = tf.add_n([a2, fv2, cross_term])
    fv2 = tf.divide(fv2, tf.square(sigma))
    fv2 = tf.subtract(fv2, assignment_mass)

    # NOTE(review): the original applies reshape + l2_normalize to fv2 twice
    # in a row; reproduced here unchanged.
    for _ in range(2):
        fv2 = tf.reshape(fv2, [-1, flat_dim])
        fv2 = tf.nn.l2_normalize(fv2, 1)

    fv1 = tf.subtract(fv1, weighted_centres)
    fv1 = tf.divide(fv1, sigma)
    fv1 = tf.nn.l2_normalize(fv1, 1)
    fv1 = tf.reshape(fv1, [-1, flat_dim])
    fv1 = tf.nn.l2_normalize(fv1, 1)

    fv = tf.concat([fv1, fv2], 1)

    projection = tf.get_variable(
        "hidden1_weights", [2 * flat_dim, self.output_dim],
        initializer=tf.random_normal_initializer(
            stddev=1 / math.sqrt(n_clusters)))
    fv = tf.matmul(fv, projection)

    if self.gating:
        # NOTE(review): super(self.__class__, self) recurses forever if this
        # class is ever subclassed; left unchanged because the class name is
        # not visible here.
        fv = super(self.__class__, self).context_gating(fv)

    return fv
def batch_norm_fn(x):
    """Batch-normalize `x` in a scope named '<current variable scope>/bn'."""
    bn_scope = tf.get_variable_scope().name + "/bn"
    return slim.batch_norm(x, scope=bn_scope)
def create_model(self, model_input, vocab_size, num_frames, is_training=True,
                 l2_penalty=1e-8, **unused_params):
    """Build a stacked CNN + RNN frame-level model followed by a gated projection.

    Each of the three layers batch-normalises its input (from the second
    layer on, after running it through self.cnn), applies tanh, summarises
    the sequence with self.rnn, and max-pools the tanh output as the input of
    the next layer. The L2-normalised per-layer RNN memories are
    concatenated, projected, gated, and handed to the video-level classifier
    named by FLAGS.video_level_classifier_model.

    Args:
      model_input: frame-level input tensor; its static shape is read via
        get_shape().
      vocab_size: forwarded to the video-level model.
      num_frames: per-example frame counts, forwarded to self.rnn and divided
        by the pooling factor after every layer.
      is_training: stored on self and used by every batch-norm layer.
      l2_penalty: accepted for interface compatibility; not used here.
      **unused_params: forwarded to the video-level model.

    Returns:
      Whatever the selected video-level model's create_model returns.
    """
    num_layers = 3
    lstm_size = 900
    activation_proj_dim = int(lstm_size * 1.18)
    pool_size = 2
    num_filters = [128, 128]
    filter_sizes = [1, 3]
    # features_size and cnn_max_frames are computed but never read below.
    features_size = int(sum(num_filters))

    self.is_training = is_training

    stage_input = model_input
    cnn_max_frames = model_input.get_shape().as_list()[1]

    lstm_memories = []
    for layer in range(num_layers):
        if layer > 0:
            # Only layers after the first run the convolution stack.
            pre_bn = self.cnn(stage_input,
                              num_filters=num_filters,
                              filter_sizes=filter_sizes,
                              sub_scope="cnn%d" % (layer + 1))
            tf.summary.histogram("cnn_output_{}".format(layer), pre_bn)
        else:
            pre_bn = stage_input

        cnn_output = slim.batch_norm(
            pre_bn,
            center=True,
            scale=True,
            is_training=self.is_training,
            scope="cnn_output_bn_layer_" + str(layer))
        tf.summary.histogram(
            "cnn_output_after_bn_before_tanh_{}".format(layer), cnn_output)

        cnn_output_tanh = tf.nn.tanh(cnn_output)
        tf.summary.histogram(
            "cnn_output_after_bn_after_tanh_{}".format(layer),
            cnn_output_tanh)

        lstm_memory = self.rnn(cnn_output_tanh,
                               lstm_size,
                               num_frames,
                               sub_scope="rnn%d" % (layer + 1))  # None x lstm_size
        tf.summary.histogram("lstm_memory_{}".format(layer), lstm_memory)
        lstm_memory = tf.nn.l2_normalize(lstm_memory, 1)
        tf.summary.histogram(
            "lstm_memory_after_l2Norm_{}".format(layer), lstm_memory)
        lstm_memories.append(lstm_memory)

        # Down-sample in time; this becomes the input of the next layer.
        stage_input = tf.layers.max_pooling1d(
            cnn_output_tanh, pool_size=3, strides=2, padding='same')
        num_frames = tf.maximum(num_frames / pool_size, 1)

    concat_lstm_memory = tf.concat(lstm_memories, 1)
    concat_lstm_memory = tf.nn.l2_normalize(concat_lstm_memory, 1)
    print("\n\n\nconcat_lstm_memory size: {} \n\n\n".format(
        concat_lstm_memory.get_shape()))

    # Affine projection of the concatenated memories.
    vlad_dim = concat_lstm_memory.get_shape().as_list()[1]
    concat_lstm_memory_weights = tf.get_variable(
        "concat_lstm_memory_weights",
        [vlad_dim, activation_proj_dim],
        initializer=tf.glorot_uniform_initializer())
    activation = tf.matmul(concat_lstm_memory, concat_lstm_memory_weights)
    concat_lstm_memory_biases = tf.get_variable(
        "concat_lstm_memory_biases",
        [activation_proj_dim],
        initializer=tf.random_normal_initializer(stddev=0.01))
    activation += concat_lstm_memory_biases

    # Gating: scale the projection element-wise by a sigmoid gate computed
    # from the projection itself.
    gating_weights = tf.get_variable(
        "gating_weights_2",
        [activation_proj_dim, activation_proj_dim],
        initializer=tf.random_normal_initializer(
            stddev=1 / math.sqrt(activation_proj_dim)))
    gates = tf.matmul(activation, gating_weights)
    gates = slim.batch_norm(
        gates,
        center=True,
        scale=True,
        is_training=self.is_training,
        scope="activation_gating_bn")
    gates = tf.sigmoid(gates)
    activation = tf.multiply(activation, gates)
    tf.summary.histogram("activation_before_video_model", activation)

    aggregated_model = getattr(video_level_models,
                               FLAGS.video_level_classifier_model)
    return aggregated_model().create_model(
        model_input=activation,
        vocab_size=vocab_size,
        is_training=self.is_training,
        **unused_params)
def main(args):
    """Train the CNN baseline on the dataset named by ``args[1]``.

    Builds the input queue, model, loss and Adam training op in a fresh
    graph, then runs ``cfg.epoch * num_batches_per_epoch + 1`` steps, writing
    summaries every 5 steps and a checkpoint once per epoch.

    Args:
      args: argv-style list of exactly two items; ``args[1]`` is the dataset
        name.
    """
    assert len(args) == 2 and isinstance(args[1], str)
    dataset_name = args[1]
    logger.info('Using dataset: {}'.format(dataset_name))

    coord_add = get_coord_add(dataset_name)
    dataset_size = get_dataset_size_train(dataset_name)
    num_classes = get_num_classes(dataset_name)
    create_inputs = get_create_inputs(dataset_name, is_train=True,
                                      epochs=cfg.epoch)

    # NOTE(review): the device/arg-scope nesting below was reconstructed from
    # a whitespace-collapsed source -- confirm against the intended layout.
    with tf.Graph().as_default(), tf.device('/cpu:0'):
        # Set the reproducible random seed. tf.set_random_seed applies to the
        # *current default graph*, so it has to be called after the new graph
        # has been made the default; calling it earlier seeds a graph that is
        # never used.
        tf.set_random_seed(1234)

        # Global step.
        global_step = tf.get_variable(
            'global_step', [], initializer=tf.constant_initializer(0),
            trainable=False)

        # Batches per epoch.
        num_batches_per_epoch = int(dataset_size / cfg.batch_size)

        # Exponentially decayed learning rate. It is only logged: the
        # optimizer below is built with its default learning rate.
        lrn_rate = tf.maximum(tf.train.exponential_decay(
            1e-3, global_step, num_batches_per_epoch, 0.8), 1e-5)
        tf.summary.scalar('learning_rate', lrn_rate)
        opt = tf.train.AdamOptimizer()  # lrn_rate

        # Batch from the data queue.
        batch_x, batch_labels = create_inputs()
        # batch_y = tf.one_hot(batch_labels, depth=10, axis=1, dtype=tf.float32)

        # Dataflow graph.
        with tf.device('/gpu:0'):
            with slim.arg_scope([slim.variable], device='/cpu:0'):
                batch_x_squash = tf.divide(batch_x, 255.)
                batch_x = slim.batch_norm(batch_x, center=False,
                                          is_training=True, trainable=True)
                output = net.build_arch_baseline(batch_x, is_train=True,
                                                 num_classes=num_classes)
                loss, recon_loss, _ = net.cross_ent_loss(
                    output, batch_x_squash, batch_labels)
                acc = net.test_accuracy(output, batch_labels)
                tf.summary.scalar('train_acc', acc)
                tf.summary.scalar('recon_loss', recon_loss)
                tf.summary.scalar('all_loss', loss)

            # Gradients, each guarded by a NaN/Inf check.
            grad = opt.compute_gradients(loss)
            # See: https://stackoverflow.com/questions/40701712/how-to-check-nan-in-gradients-in-tensorflow-when-updating
            grad_check = [tf.check_numerics(g, message='Gradient NaN Found!')
                          for g, _ in grad if g is not None] \
                + [tf.check_numerics(loss, message='Loss NaN Found')]

        # Apply gradients only after the numeric checks and the pending
        # UPDATE_OPS (e.g. batch-norm moving averages) have run.
        with tf.control_dependencies(grad_check):
            update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
            with tf.control_dependencies(update_ops):
                train_op = opt.apply_gradients(grad, global_step=global_step)

        # Session.
        sess = tf.Session(config=tf.ConfigProto(
            allow_soft_placement=True, log_device_placement=False))
        sess.run(tf.local_variables_initializer())
        sess.run(tf.global_variables_initializer())

        # Saver: skip the optimizer's own 'Adam' slot variables.
        var_to_save = [v for v in tf.global_variables()
                       if 'Adam' not in v.name]
        saver = tf.train.Saver(var_list=var_to_save, max_to_keep=cfg.epoch)

        # Parameter counts.
        total_p = np.sum([np.prod(v.get_shape().as_list())
                          for v in var_to_save]).astype(np.int32)
        train_p = np.sum([np.prod(v.get_shape().as_list())
                          for v in tf.trainable_variables()]).astype(np.int32)
        logger.info('Total Parameters: {}'.format(total_p))
        logger.info('Trainable Parameters: {}'.format(train_p))

        # read snapshot
        # latest = os.path.join(cfg.logdir, 'model.ckpt-4680')
        # saver.restore(sess, latest)

        summary_op = tf.summary.merge_all()

        # Start the input queue runners.
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)

        # Summary writer.
        train_log_dir = cfg.logdir + \
            '/cnn_baseline/{}/train_log/'.format(dataset_name)
        if not os.path.exists(train_log_dir):
            os.makedirs(train_log_dir)
        summary_writer = tf.summary.FileWriter(train_log_dir,
                                               graph=sess.graph)

        # Main loop.
        for step in range(cfg.epoch * num_batches_per_epoch + 1):
            tic = time.time()
            # The TF queue pops batches until the files are exhausted.
            try:
                _, loss_value, summary_str = sess.run(
                    [train_op, loss, summary_op])
                logger.info('%d iteration finishs in ' % step + '%f second' %
                            (time.time() - tic) + ' loss=%f' % loss_value)
            except KeyboardInterrupt:
                sess.close()
                sys.exit()
            except tf.errors.InvalidArgumentError:
                # Raised by the check_numerics guards above; skip this batch.
                logger.warning(
                    '%d iteration contains NaN gradients. Discard.' % step)
                continue
            else:
                # Write summaries every 5 steps.
                if step % 5 == 0:
                    summary_writer.add_summary(summary_str, step)

                # Save a checkpoint once per epoch; the file name carries the
                # loss of the step that triggered the save.
                if (step % num_batches_per_epoch) == 0:
                    ckpt_path = os.path.join(
                        cfg.logdir + '/cnn_baseline/{}'.format(dataset_name),
                        'model-{:.4f}.ckpt'.format(loss_value))
                    saver.save(sess, ckpt_path, global_step=step)

        # Ask the queue-runner threads to stop before waiting for them;
        # joining without a stop request can block on threads that are still
        # feeding the queue. Then release the session.
        coord.request_stop()
        coord.join(threads)
        sess.close()
def forward(self, reshaped_input):
    """Build a gated cluster-pooling forward pass.

    Frames are soft-assigned to clusters; the per-cluster assignment sum is
    then scaled by a sigmoid gate computed from the L2-normalised per-cluster
    assignment maximum, and the result is L2-normalised.

    Args:
      reshaped_input: tensor that can be matrix-multiplied with a
        'feature_size' x 'cluster_size' weight, i.e.
        'batch_size*max_frames' x 'feature_size'.

    Returns:
      Tensor of size 'batch_size' x 'cluster_size'.
    """
    feature_size = self.feature_size
    cluster_size = self.cluster_size
    add_batch_norm = self.add_batch_norm
    max_frames = self.max_frames
    is_training = self.is_training

    stddev = 1 / math.sqrt(feature_size)

    cluster_weights = tf.get_variable(
        "cluster_weights",
        [feature_size, cluster_size],
        initializer=tf.random_normal_initializer(stddev=stddev))
    tf.summary.histogram("cluster_weights", cluster_weights)

    activation = tf.matmul(reshaped_input, cluster_weights)
    if add_batch_norm:
        activation = slim.batch_norm(
            activation,
            center=True,
            scale=True,
            is_training=is_training,
            scope="cluster_bn")
    else:
        # An initializer object is required here: tf.random_normal(stddev=...)
        # is a sampling op that needs a `shape` argument and raised a
        # TypeError as soon as this branch was taken.
        cluster_biases = tf.get_variable(
            "cluster_biases",
            [cluster_size],
            initializer=tf.random_normal_initializer(stddev=stddev))
        tf.summary.histogram("cluster_biases", cluster_biases)
        activation += cluster_biases

    activation = tf.nn.softmax(activation)
    activation = tf.reshape(activation, [-1, max_frames, cluster_size])

    # Per-cluster statistics over the frames axis.
    activation_sum = tf.reduce_sum(activation, 1)
    activation_max = tf.reduce_max(activation, 1)
    activation_max = tf.nn.l2_normalize(activation_max, 1)

    dim_red = tf.get_variable(
        "dim_red",
        [cluster_size, feature_size],
        initializer=tf.random_normal_initializer(stddev=stddev))

    cluster_weights_2 = tf.get_variable(
        "cluster_weights_2",
        [feature_size, cluster_size],
        initializer=tf.random_normal_initializer(stddev=stddev))
    tf.summary.histogram("cluster_weights_2", cluster_weights_2)

    # Gate: cluster_size -> feature_size -> cluster_size.
    activation = tf.matmul(activation_max, dim_red)
    activation = tf.matmul(activation, cluster_weights_2)

    if add_batch_norm:
        activation = slim.batch_norm(
            activation,
            center=True,
            scale=True,
            is_training=is_training,
            scope="cluster_bn_2")
    else:
        # Same initializer fix as for "cluster_biases" above.
        cluster_biases = tf.get_variable(
            "cluster_biases_2",
            [cluster_size],
            initializer=tf.random_normal_initializer(stddev=stddev))
        tf.summary.histogram("cluster_biases_2", cluster_biases)
        activation += cluster_biases

    activation = tf.sigmoid(activation)

    activation = tf.multiply(activation, activation_sum)
    activation = tf.nn.l2_normalize(activation, 1)

    return activation
def create_ds_cnn_model(fingerprint_input, model_settings, model_size_info,
                        is_training):
    """Builds a model with depthwise separable convolutional neural network

    Model definition is based on https://arxiv.org/abs/1704.04861 and
    Tensorflow implementation: https://github.com/Zehaos/MobileNet

    Args:
      fingerprint_input: input tensor; reshaped here to
        [-1, spectrogram_length, dct_coefficient_count, 1].
      model_settings: dict read for 'label_count', 'dct_coefficient_count'
        and 'spectrogram_length'.
      model_size_info: defines number of layers, followed by the DS-Conv
        layer parameters in the order {number of conv features, conv filter
        height, width and stride in y,x dir.} for each of the layers.
        Note that first layer is always regular convolution, but the
        remaining layers are all depthwise separable convolutions.
      is_training: forwarded to batch norm; when true a 'dropout_prob'
        placeholder is also created and returned.

    Returns:
      `(logits, dropout_prob)` when `is_training` is true, otherwise
      `logits`. The placeholder is not consumed by any layer built here.

    Raises:
      IndexError: if `model_size_info` holds fewer than five values for one
        of the declared layers.
    """

    def ds_cnn_arg_scope(weight_decay=0):
        """Defines the default ds_cnn argument scope.

        Not called anywhere in this function; kept as-is.

        Args:
          weight_decay: The weight decay to use for regularizing the model.
        Returns:
          An `arg_scope` to use for the DS-CNN model.
        """
        with slim.arg_scope(
                [slim.convolution2d, slim.separable_convolution2d],
                weights_initializer=slim.initializers.xavier_initializer(),
                biases_initializer=slim.init_ops.zeros_initializer(),
                weights_regularizer=slim.l2_regularizer(weight_decay)) as sc:
            return sc

    def _depthwise_separable_conv(inputs, num_pwc_filters, sc, kernel_size,
                                  stride):
        """Helper function to build the depth-wise separable convolution layer."""
        # skip pointwise by setting num_outputs=None
        depthwise_conv = slim.separable_convolution2d(
            inputs,
            num_outputs=None,
            stride=stride,
            depth_multiplier=1,
            kernel_size=kernel_size,
            scope=sc + '/depthwise_conv')
        bn = slim.batch_norm(depthwise_conv, scope=sc + '/dw_batch_norm')
        pointwise_conv = slim.convolution2d(
            bn,
            num_pwc_filters,
            kernel_size=[1, 1],
            scope=sc + '/pointwise_conv')
        bn = slim.batch_norm(pointwise_conv, scope=sc + '/pw_batch_norm')
        return bn

    if is_training:
        dropout_prob = tf.placeholder(tf.float32, name='dropout_prob')

    label_count = model_settings['label_count']
    input_frequency_size = model_settings['dct_coefficient_count']
    input_time_size = model_settings['spectrogram_length']
    fingerprint_4d = tf.reshape(
        fingerprint_input, [-1, input_time_size, input_frequency_size, 1])

    t_dim = input_time_size
    f_dim = input_frequency_size

    # Extract model dimensions from model_size_info: one
    # (features, kernel_t, kernel_f, stride_t, stride_f) tuple per layer.
    num_layers = model_size_info[0]
    layer_specs = [
        tuple(model_size_info[1 + 5 * layer_no + k] for k in range(5))
        for layer_no in range(num_layers)
    ]

    # NOTE(review): the scope nesting below (avg_pool inside the arg scopes,
    # squeeze and fc1 inside the 'DS-CNN' variable scope) was reconstructed
    # from a whitespace-collapsed source -- confirm against existing
    # checkpoints' variable names.
    scope = 'DS-CNN'
    with tf.variable_scope(scope) as sc:
        end_points_collection = sc.name + '_end_points'
        with slim.arg_scope(
                [slim.convolution2d, slim.separable_convolution2d],
                activation_fn=None,
                weights_initializer=slim.initializers.xavier_initializer(),
                biases_initializer=slim.init_ops.zeros_initializer(),
                outputs_collections=[end_points_collection]):
            with slim.arg_scope([slim.batch_norm],
                                is_training=is_training,
                                decay=0.96,
                                updates_collections=None,
                                activation_fn=tf.nn.relu):
                for layer_no, spec in enumerate(layer_specs):
                    n_feat, k_t, k_f, s_t, s_f = spec
                    if layer_no == 0:
                        net = slim.convolution2d(
                            fingerprint_4d,
                            n_feat,
                            [k_t, k_f],
                            stride=[s_t, s_f],
                            padding='SAME',
                            scope='conv_1')
                        net = slim.batch_norm(net, scope='conv_1/batch_norm')
                    else:
                        net = _depthwise_separable_conv(
                            net,
                            n_feat,
                            kernel_size=[k_t, k_f],
                            stride=[s_t, s_f],
                            sc='conv_ds_' + str(layer_no))
                    # Track the spatial size after each strided layer.
                    # math.ceil returns a float on Python 2, so cast to int:
                    # these values become the pooling kernel size below.
                    t_dim = int(math.ceil(t_dim / float(s_t)))
                    f_dim = int(math.ceil(f_dim / float(s_f)))

                # Pool over the whole remaining time/frequency extent.
                net = slim.avg_pool2d(net, [t_dim, f_dim], scope='avg_pool')

        net = tf.squeeze(net, [1, 2], name='SpatialSqueeze')
        logits = slim.fully_connected(net, label_count, activation_fn=None,
                                      scope='fc1')

    if is_training:
        return logits, dropout_prob
    else:
        return logits
def forward(self, reshaped_input):
    """Build a NetFV (Fisher Vector pooling) forward pass.

    Args:
      reshaped_input: the input flattened to
        'batch_size*max_frames' x 'feature_size'.

    Returns:
      The concatenation, along axis 1, of the flattened and L2-normalised
      first- and second-order statistics; each half has
      'cluster_size*feature_size' columns.
    """
    stddev = 1 / math.sqrt(self.feature_size)

    cluster_weights = tf.get_variable(
        "cluster_weights",
        [self.feature_size, self.cluster_size],
        initializer=tf.random_normal_initializer(stddev=stddev))

    covar_weights = tf.get_variable(
        "covar_weights",
        [self.feature_size, self.cluster_size],
        initializer=tf.random_normal_initializer(mean=1.0, stddev=stddev))

    # Squared and shifted by a small constant: these values are only ever
    # used as divisors further down.
    covar_weights = tf.square(covar_weights)
    eps = tf.constant([1e-6])
    covar_weights = tf.add(covar_weights, eps)

    tf.summary.histogram("cluster_weights", cluster_weights)
    activation = tf.matmul(reshaped_input, cluster_weights)
    if self.add_batch_norm:
        activation = slim.batch_norm(
            activation,
            center=True,
            scale=True,
            is_training=self.is_training,
            scope="cluster_bn")
    else:
        # An initializer object is required here: tf.random_normal(stddev=...)
        # is a sampling op that needs a `shape` argument and raised a
        # TypeError as soon as this branch was taken.
        cluster_biases = tf.get_variable(
            "cluster_biases",
            [self.cluster_size],
            initializer=tf.random_normal_initializer(stddev=stddev))
        tf.summary.histogram("cluster_biases", cluster_biases)
        activation += cluster_biases

    activation = tf.nn.softmax(activation)
    tf.summary.histogram("cluster_output", activation)

    activation = tf.reshape(
        activation, [-1, self.max_frames, self.cluster_size])

    # Total assignment mass per cluster, summed over the frames axis.
    a_sum = tf.reduce_sum(activation, -2, keep_dims=True)

    if not FLAGS.fv_couple_weights:
        cluster_weights2 = tf.get_variable(
            "cluster_weights2",
            [1, self.feature_size, self.cluster_size],
            initializer=tf.random_normal_initializer(stddev=stddev))
    else:
        # Coupled mode: reuse the assignment weights, scaled by a flag.
        cluster_weights2 = tf.scalar_mul(FLAGS.fv_coupling_factor,
                                         cluster_weights)

    a = tf.multiply(a_sum, cluster_weights2)

    activation = tf.transpose(activation, perm=[0, 2, 1])

    reshaped_input = tf.reshape(
        reshaped_input, [-1, self.max_frames, self.feature_size])

    # First-order statistics: assignment-weighted sum of the frames.
    fv1 = tf.matmul(activation, reshaped_input)
    fv1 = tf.transpose(fv1, perm=[0, 2, 1])

    # computing second order FV
    a2 = tf.multiply(a_sum, tf.square(cluster_weights2))
    b2 = tf.multiply(fv1, cluster_weights2)
    fv2 = tf.matmul(activation, tf.square(reshaped_input))
    fv2 = tf.transpose(fv2, perm=[0, 2, 1])
    fv2 = tf.add_n([a2, fv2, tf.scalar_mul(-2, b2)])
    fv2 = tf.divide(fv2, tf.square(covar_weights))
    fv2 = tf.subtract(fv2, a_sum)

    # The flatten + normalise pair is applied twice to fv2, unchanged from
    # the existing behaviour.
    fv2 = tf.reshape(fv2, [-1, self.cluster_size * self.feature_size])
    fv2 = tf.nn.l2_normalize(fv2, 1)
    fv2 = tf.reshape(fv2, [-1, self.cluster_size * self.feature_size])
    fv2 = tf.nn.l2_normalize(fv2, 1)

    fv1 = tf.subtract(fv1, a)
    fv1 = tf.divide(fv1, covar_weights)
    fv1 = tf.nn.l2_normalize(fv1, 1)
    fv1 = tf.reshape(fv1, [-1, self.cluster_size * self.feature_size])
    fv1 = tf.nn.l2_normalize(fv1, 1)

    return tf.concat([fv1, fv2], 1)
def _build_planner(self, scaled_beliefs, m=None):
    """Build the hierarchical value-iteration planner on top of the beliefs.

    Each belief is batch-normalised, the beliefs are stacked along axis 1 and
    fed as a sequence to an RNN cell that upscales the previous value map,
    fuses it with the current belief into a reward map and runs
    `self._num_iterations` convolutional value-iteration steps. The final
    value map is flattened and mapped to action logits by two fully
    connected layers.

    Args:
      scaled_beliefs: sequence of belief tensors; each is expected to have
        three channels on axis 3, of which the first and third are used.
      m: optional dict that receives the intermediate value maps under
        'value_map'. A fresh dict is used when omitted, so graph tensors are
        no longer accumulated in a default dict shared between calls.

    Returns:
      Tensor of action logits with `self._num_actions` outputs.
    """
    if m is None:
        m = {}

    is_training = self._is_training
    batch_size = tf.shape(scaled_beliefs[0])[0]
    image_scaler = self._upscale_image
    estimate_size = self._estimate_size
    value_map_size = (estimate_size, estimate_size, 1)
    num_actions = self._num_actions
    num_iterations = self._num_iterations

    def _fuse_belief(belief):
        """Collapse the belief channels into a single-channel reward map."""
        with slim.arg_scope(
                [slim.conv2d],
                activation_fn=tf.nn.elu,
                weights_initializer=tf.truncated_normal_initializer(stddev=1),
                biases_initializer=tf.constant_initializer(0),
                stride=1,
                padding='SAME',
                reuse=tf.AUTO_REUSE):
            net = slim.conv2d(belief, 1, [1, 1], scope='fuser_combine')
            return net

    class HierarchicalVINCell(tf.nn.rnn_cell.RNNCell):
        """RNN cell whose state and output are both the current value map."""

        @property
        def state_size(self):
            return tf.TensorShape(value_map_size)

        @property
        def output_size(self):
            return self.state_size

        def __call__(self, inputs, state, scope=None):
            # Upscale previous value map
            state = image_scaler(state)

            # The middle channel of the belief is not used by the planner.
            estimate, _, values = [
                tf.expand_dims(layer, axis=3)
                for layer in tf.unstack(inputs, axis=3)
            ]

            with slim.arg_scope([slim.conv2d], reuse=tf.AUTO_REUSE):
                rewards_map = _fuse_belief(
                    tf.concat([estimate, values, state], axis=3))
                actions_map = slim.conv2d(
                    rewards_map,
                    num_actions, [3, 3],
                    weights_initializer=tf.truncated_normal_initializer(
                        stddev=0.42),
                    biases_initializer=tf.constant_initializer(0),
                    scope='VIN_actions_initial')
                values_map = tf.reduce_max(actions_map, axis=3,
                                           keep_dims=True)

            with slim.arg_scope([slim.conv2d], reuse=tf.AUTO_REUSE):
                # range (not xrange) so the loop runs on Python 2 and 3.
                for _ in range(num_iterations - 1):
                    rv = tf.concat([rewards_map, values_map], axis=3)
                    actions_map = slim.conv2d(
                        rv,
                        num_actions, [3, 3],
                        weights_initializer=tf.truncated_normal_initializer(
                            stddev=0.42),
                        biases_initializer=tf.constant_initializer(0),
                        scope='VIN_actions')
                    values_map = tf.reduce_max(actions_map, axis=3,
                                               keep_dims=True)

            return values_map, values_map

    beliefs = tf.stack([
        slim.batch_norm(belief, is_training=is_training)
        for belief in scaled_beliefs
    ], axis=1)

    vin_cell = HierarchicalVINCell()
    interm_values_map, final_values_map = tf.nn.dynamic_rnn(
        vin_cell,
        beliefs,
        initial_state=vin_cell.zero_state(batch_size, tf.float32),
        swap_memory=True)
    m['value_map'] = interm_values_map

    values_features = slim.flatten(final_values_map)
    actions_logit = slim.fully_connected(
        values_features,
        num_actions**2,
        weights_initializer=tf.truncated_normal_initializer(stddev=0.03),
        biases_initializer=tf.constant_initializer(0),
        activation_fn=tf.nn.elu,
        scope='logit_output_1')
    actions_logit = slim.fully_connected(
        actions_logit,
        num_actions,
        weights_initializer=tf.truncated_normal_initializer(stddev=0.5),
        biases_initializer=tf.constant_initializer(1.0 / num_actions),
        scope='logit_output_2')

    return actions_logit
def _build_mapper(self, m=None, estimator=None):
    """Build the recurrent mapper that fuses per-step estimates into beliefs.

    The visual input is batch-normalised and fed, together with egomotion
    and reward, through an RNN cell. At every step the cell estimates
    multi-scale (estimate, confidence) maps from the image, appends a reward
    channel, warps the previous beliefs by the egomotion and merges old and
    new beliefs.

    Args:
      m: optional dict that receives the intermediate beliefs under
        'estimate_map_list'. A fresh dict is used when omitted, so graph
        tensors are no longer accumulated in a default dict shared between
        calls.
      estimator: optional callable `image -> list of belief tensors` used
        instead of the built-in convolutional estimator.

    Returns:
      The final RNN state (the list of per-scale beliefs).
    """
    if m is None:
        m = {}

    is_training = self._is_training
    sequence_length = self._sequence_length
    visual_input = self._visual_input
    egomotion = self._egomotion
    reward = self._reward
    estimate_map = self._estimate_map_list
    estimate_scale = self._estimate_scale
    estimate_shape = self._estimate_shape

    def _estimate(image):
        """Encode the image and decode one belief map per scale."""

        def _xavier_init(num_in, num_out):
            stddev = np.sqrt(4. / (num_in + num_out))
            return tf.truncated_normal_initializer(stddev=stddev)

        def _constrain_confidence(belief):
            # Squash the confidence channel through a sigmoid.
            estimate, confidence = tf.unstack(belief, axis=3)
            return tf.stack([estimate, tf.nn.sigmoid(confidence)], axis=3)

        beliefs = []
        net = image

        # NOTE(review): the arg-scope nesting in this helper was
        # reconstructed from a whitespace-collapsed source -- confirm.
        with slim.arg_scope(
                [slim.conv2d, slim.fully_connected, slim.conv2d_transpose],
                activation_fn=tf.nn.elu,
                biases_initializer=tf.constant_initializer(0),
                reuse=tf.AUTO_REUSE):
            last_output_channels = 3

            with slim.arg_scope([slim.conv2d], stride=1, padding='VALID'):
                for index, output in enumerate([(32, [7, 7]), (48, [7, 7]),
                                                (64, [5, 5]), (64, [5, 5])]):
                    channels, filter_size = output
                    net = slim.conv2d(
                        net,
                        channels,
                        filter_size,
                        scope='mapper_conv_{}'.format(index),
                        weights_initializer=_xavier_init(
                            np.prod(filter_size) * last_output_channels,
                            channels))
                    last_output_channels = channels

            net = slim.fully_connected(
                net,
                200,
                scope='mapper_fc',
                weights_initializer=_xavier_init(last_output_channels, 200))
            last_output_channels = 200

            with slim.arg_scope([slim.conv2d_transpose],
                                stride=1,
                                padding='SAME'):
                for index, output in enumerate((64, 32, 2)):
                    net = slim.conv2d_transpose(
                        net,
                        output, [7, 7],
                        scope='mapper_deconv_{}'.format(index),
                        weights_initializer=_xavier_init(
                            7 * 7 * last_output_channels, output))
                    last_output_channels = output

                beliefs.append(net)
                # range (not xrange) so the loop runs on Python 2 and 3.
                for i in range(estimate_scale - 1):
                    net = slim.conv2d_transpose(
                        net,
                        2, [6, 6],
                        weights_initializer=_xavier_init(
                            6 * 6 * last_output_channels, 2),
                        scope='mapper_upscale_{}'.format(i))
                    last_output_channels = 2
                    beliefs.append(self._upscale_image(net))

        return [_constrain_confidence(belief) for belief in beliefs]

    def _apply_egomotion(tensor, scale_index, ego):
        """Warp `tensor` by the (translation, rotation) held in `ego`."""
        translation, rotation = tf.unstack(ego, axis=1)

        cos_rot = tf.cos(rotation)
        sin_rot = tf.sin(rotation)
        zero = tf.zeros_like(rotation)
        scale = tf.constant(
            (2**scale_index) / (300. / self._estimate_size),
            dtype=tf.float32)

        # Eight-parameter projective transform, one row per batch element.
        transform = tf.stack([
            cos_rot, sin_rot,
            tf.multiply(tf.negative(translation), scale),
            tf.negative(sin_rot), cos_rot, zero, zero, zero
        ], axis=1)
        return tf.contrib.image.transform(tensor, transform,
                                          interpolation='BILINEAR')

    def _delta_reward_map(reward):
        """Zero-pad the scalar reward to a map of the estimate's height/width."""
        h, w, _ = estimate_shape
        m_h, m_w = int((h - 1) / 2), int((w - 1) / 2)

        # The height axis must be padded up to h (it used to be padded with
        # `w - m_h`, which only yields the right size for square maps).
        return tf.pad(
            tf.expand_dims(reward, axis=2),
            tf.constant([[0, 0], [m_h - 1, h - m_h], [m_w - 1, w - m_w]]))

    def _warp(temp_belief, prev_belief):
        """Merge the new belief with the egomotion-warped previous belief."""
        temp_estimate, temp_confidence, temp_rewards = tf.unstack(
            temp_belief, axis=3)
        prev_estimate, prev_confidence, prev_rewards = tf.unstack(
            prev_belief, axis=3)

        # Confidences add up; the estimate is their weighted average.
        current_confidence = temp_confidence + prev_confidence
        current_estimate = tf.divide(
            tf.multiply(temp_estimate, temp_confidence) +
            tf.multiply(prev_estimate, prev_confidence),
            current_confidence)
        current_rewards = temp_rewards + prev_rewards
        current_belief = tf.stack(
            [current_estimate, current_confidence, current_rewards], axis=3)
        return current_belief

    class BiLinearSamplingCell(tf.nn.rnn_cell.RNNCell):
        """RNN cell whose state and output are the per-scale belief maps."""

        @property
        def state_size(self):
            return [tf.TensorShape(estimate_shape)] * estimate_scale

        @property
        def output_size(self):
            return self.state_size

        def __call__(self, inputs, state, scope=None):
            image, ego, re = inputs

            delta_reward_map = tf.expand_dims(_delta_reward_map(re), axis=3)

            current_scaled_estimates = _estimate(
                image) if estimator is None else estimator(image)
            current_scaled_estimates = [
                tf.concat([estimate, delta_reward_map], axis=3)
                for estimate in current_scaled_estimates
            ]
            previous_scaled_estimates = [
                _apply_egomotion(belief, scale_index, ego)
                for scale_index, belief in enumerate(state)
            ]
            outputs = [
                _warp(c, p)
                for c, p in zip(current_scaled_estimates,
                                previous_scaled_estimates)
            ]

            return outputs, outputs

    normalized_input = slim.batch_norm(visual_input, is_training=is_training)
    bilinear_cell = BiLinearSamplingCell()
    interm_beliefs, final_belief = tf.nn.dynamic_rnn(
        bilinear_cell,
        (normalized_input, egomotion, tf.expand_dims(reward, axis=2)),
        sequence_length=sequence_length,
        initial_state=estimate_map,
        swap_memory=True)
    m['estimate_map_list'] = interm_beliefs

    return final_belief
def _batch_norm_fn(x, scope=None):
    """Batch-normalise `x` with slim.

    Args:
      x: tensor to normalise.
      scope: scope passed to slim.batch_norm; when None, the current variable
        scope's name with "/bn" appended is used.

    Returns:
      The output of slim.batch_norm.
    """
    bn_scope = (scope if scope is not None
                else tf.get_variable_scope().name + "/bn")
    return slim.batch_norm(x, scope=bn_scope)
def build_frrn(inputs, num_classes, preset_model='FRRN-A'):
    """
    Builds the Full Resolution Residual Network model.

    Both presets share the same layout: an initial stage, a downsampling
    path of FullResolutionResidualUnit stages separated by 2x2 max pooling,
    an upsampling path of stages each preceded by a 2x unpooling, and a final
    stage that merges the pooling and residual streams. The presets differ
    only in the stage tables below.

    Arguments:
      inputs: The input tensor
      preset_model: Which model you want to use. Select FRRN-A or FRRN-B
      num_classes: Number of classes

    Returns:
      FRRN model

    Raises:
      ValueError: if `preset_model` is neither 'FRRN-A' nor 'FRRN-B'.
    """
    # Each stage is (number of units, n_filters_3, pool_scale).
    if preset_model == 'FRRN-A':
        down_stages = [(3, 96, 2), (4, 192, 4), (2, 384, 8), (2, 384, 16)]
        up_stages = [(2, 192, 8), (2, 192, 4), (2, 96, 2)]
    elif preset_model == 'FRRN-B':
        down_stages = [(3, 96, 2), (4, 192, 4), (2, 384, 8), (2, 384, 16),
                       (2, 384, 32)]
        # The first unit of the first upsampling stage used pool_scale=17;
        # after unpooling from scale 32 the pooling stream is at scale 16, so
        # both units of that stage now use 16.
        up_stages = [(2, 192, 16), (2, 192, 8), (2, 192, 4), (2, 96, 2)]
    else:
        raise ValueError("Unsupported FRRN model '%s'. This function only supports FRRN-A and FRRN-B" % (preset_model))

    #####################
    # Initial Stage
    #####################
    net = slim.conv2d(inputs, 48, kernel_size=5, activation_fn=None)
    net = slim.batch_norm(net)
    net = tf.nn.relu(net)

    for _ in range(3):
        net = ResidualUnit(net, n_filters=48, filter_size=3)

    #####################
    # Downsampling Path
    #####################
    pool_stream = slim.pool(net, [2, 2], stride=[2, 2], pooling_type='MAX')
    res_stream = slim.conv2d(net, 32, kernel_size=1, activation_fn=None)

    for stage_index, (n_units, n_filters_3, pool_scale) in enumerate(
            down_stages):
        # The first stage runs on the pooling already applied above; every
        # later stage halves the pooling stream once more first.
        if stage_index > 0:
            pool_stream = slim.pool(pool_stream, [2, 2], stride=[2, 2],
                                    pooling_type='MAX')
        for _ in range(n_units):
            pool_stream, res_stream = FullResolutionResidualUnit(
                pool_stream=pool_stream,
                res_stream=res_stream,
                n_filters_3=n_filters_3,
                n_filters_1=32,
                pool_scale=pool_scale)

    #####################
    # Upsampling Path
    #####################
    for n_units, n_filters_3, pool_scale in up_stages:
        pool_stream = Unpooling(pool_stream, 2)
        for _ in range(n_units):
            pool_stream, res_stream = FullResolutionResidualUnit(
                pool_stream=pool_stream,
                res_stream=res_stream,
                n_filters_3=n_filters_3,
                n_filters_1=32,
                pool_scale=pool_scale)

    # Undo the very first pooling so both streams can be concatenated.
    pool_stream = Unpooling(pool_stream, 2)

    #####################
    # Final Stage
    #####################
    net = tf.concat([pool_stream, res_stream], axis=-1)
    for _ in range(3):
        net = ResidualUnit(net, n_filters=48, filter_size=3)

    net = slim.conv2d(net, num_classes, [1, 1], activation_fn=None,
                      scope='logits')
    return net
def batch_norm(net):
    """Batch-normalize `net`, adding a plain learned bias when centering is off.

    `center` and `training` are not parameters of this function; they are
    read from the enclosing scope.

    Args:
        net: input tensor. When `center` is falsy its last dimension must be
            statically known, because it sizes the bias variable.

    Returns:
        The normalized tensor, with a zero-initialized bias added when
        `center` is falsy.
    """
    net = slim.batch_norm(net, center=center, scale=True, epsilon=1e-5,
                          is_training=training)
    if not center:
        # A variable's shape has to be made of static Python ints.
        # tf.shape() returns a runtime tensor, which cannot size a variable,
        # so read the static channel count instead.
        num_channels = net.get_shape().as_list()[-1]
        biases = slim.variable('biases', shape=[num_channels],
                               initializer=tf.zeros_initializer())
        net = tf.nn.bias_add(net, biases)
    return net
def main(args):
    """Evaluate a trained model on test batches and log reconstructions.

    Builds the evaluation graph, restores the checkpoint written at the end
    of epoch 14 from ``cfg.logdir/<model_name>/<dataset_name>/``, runs two
    test batches, writes summaries under
    ``cfg.test_logdir/<model_name>/<dataset_name>/`` and plots the original
    and reconstructed images of every batch.

    Args:
        args: argv-style sequence ``[program, dataset_name, model_name]``.

    Raises:
        AssertionError: if `args` does not hold exactly three entries with
            string dataset and model names.
    """
    assert len(args) == 3 and isinstance(args[1], str) and isinstance(args[2], str)
    dataset_name = args[1]
    model_name = args[2]

    coord_add = get_coord_add(dataset_name)
    dataset_size_train = get_dataset_size_train(dataset_name)
    dataset_size_test = get_dataset_size_test(dataset_name)
    num_classes = get_num_classes(dataset_name)
    create_inputs = get_create_inputs(
        dataset_name, is_train=False, epochs=cfg.epoch)

    train_dir = cfg.logdir + '/{}/{}/'.format(model_name, dataset_name)
    test_dir = cfg.test_logdir + '/{}/{}/'.format(model_name, dataset_name)

    with tf.Graph().as_default():
        # Set reproducible random seed. tf.set_random_seed only affects the
        # current default graph, so it must be called inside the graph that
        # is actually built and run.
        tf.set_random_seed(1234)

        num_batches_per_epoch_train = int(dataset_size_train / cfg.batch_size)
        num_batches_test = 2  # int(dataset_size_test / cfg.batch_size * 0.1)

        batch_x, batch_labels = create_inputs()
        # Reconstruction target: raw input divided by 255.
        batch_squash = tf.divide(batch_x, 255.)
        batch_x_norm = slim.batch_norm(batch_x, center=False,
                                       is_training=False, trainable=False)
        output, pose_out = net.build_arch(batch_x_norm, coord_add,
                                          is_train=False,
                                          num_classes=num_classes)
        tf.logging.debug(pose_out.get_shape())

        batch_acc = net.test_accuracy(output, batch_labels)
        m_op = tf.constant(0.9)
        loss, spread_loss, mse, recon_img_squash = net.spread_loss(
            output, pose_out, batch_squash, batch_labels, m_op)
        tf.summary.scalar('spread_loss', spread_loss)
        tf.summary.scalar('reconstruction_loss', mse)
        tf.summary.scalar('all_loss', loss)

        data_size = int(batch_x.get_shape()[1])
        # Undo the /255 scaling so the reconstruction is comparable with the
        # original image.
        recon_img = tf.multiply(
            tf.reshape(recon_img_squash,
                       shape=[cfg.batch_size, data_size, data_size, 1]),
            255.)
        orig_img = tf.reshape(
            batch_x, shape=[cfg.batch_size, data_size, data_size, 1])
        tf.summary.image('orig_image', orig_img)
        tf.summary.image('recon_image', recon_img)

        saver = tf.train.Saver()
        step = 0
        tf.summary.scalar('accuracy', batch_acc)
        summary_op = tf.summary.merge_all()

        with tf.Session(config=tf.ConfigProto(
                allow_soft_placement=True,
                log_device_placement=False)) as sess:
            sess.run(tf.local_variables_initializer())
            sess.run(tf.global_variables_initializer())

            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(sess=sess, coord=coord)
            try:
                if not os.path.exists(test_dir):
                    os.makedirs(test_dir)
                summary_writer = tf.summary.FileWriter(
                    test_dir, graph=sess.graph)  # graph=sess.graph, huge!

                files = os.listdir(train_dir)
                for epoch in range(14, 15):
                    # requires a regex to adapt the loss value in the file
                    # name here
                    ckpt_re = ".ckpt-%d" % (num_batches_per_epoch_train * epoch)
                    matches = [name for name in files
                               if name.endswith(ckpt_re + ".index")]
                    if not matches:
                        # Without this guard `ckpt` would be unbound (or a
                        # stale value from a previous epoch).
                        print('no checkpoint found for epoch %d, skipping.'
                              % epoch)
                        continue
                    # The last matching file wins; strip the ".index" suffix
                    # to get the checkpoint prefix.
                    ckpt = os.path.join(train_dir, matches[-1][:-6])
                    saver.restore(sess, ckpt)

                    accuracy_sum = 0
                    for i in range(num_batches_test):
                        batch_acc_v, summary_str, orig_image, recon_image = sess.run(
                            [batch_acc, summary_op, orig_img, recon_img])
                        print('%d batches are tested.' % step)
                        summary_writer.add_summary(summary_str, step)
                        accuracy_sum += batch_acc_v
                        step += 1
                        # display original/reconstructed images in matplotlib
                        plot_imgs(orig_image, i, 'ori')
                        plot_imgs(recon_image, i, 'rec')

                    ave_acc = accuracy_sum / num_batches_test
                    print('the average accuracy is %f' % ave_acc)
                summary_writer.close()
            finally:
                # Shut the input queue threads down cleanly even on error.
                coord.request_stop()
                coord.join(threads)
def create_model(self, model_input, vocab_size, num_frames, iterations=None,
                 add_batch_norm=None, sample_random_frames=None,
                 cluster_size=None, hidden_size=None, is_training=True,
                 **unused_params):
    """Build a soft-DBoF frame-level model feeding a video-level classifier.

    Frames are sampled, flattened to one row per frame, optionally
    batch-normalized, split into the first 1024 feature columns ("video")
    and the remaining columns ("audio"), and each part is encoded by its own
    `SoftDBoF`. The concatenated encoding is optionally reduced by a fully
    connected layer (``FLAGS.fc_dimred``) before being handed to the
    classifier named by ``FLAGS.video_level_classifier_model``.

    Args:
        model_input: frame-level features; indexed as
            ``[batch, frames, features]`` after sampling.
        vocab_size: forwarded to the video-level classifier.
        num_frames: per-example frame counts, used by the frame samplers.
        iterations: number of frames to sample; falsy -> ``FLAGS.iterations``.
        add_batch_norm: falsy -> ``FLAGS.dbof_add_batch_norm``.
        sample_random_frames: falsy -> ``FLAGS.sample_random_frames``.
        cluster_size: falsy -> ``FLAGS.dbof_cluster_size``.
        hidden_size: falsy -> ``FLAGS.dbof_hidden_size``.
        is_training: forwarded to batch norm, `SoftDBoF` and the classifier.
        **unused_params: forwarded untouched to the classifier.

    Returns:
        Whatever the selected video-level model's ``create_model`` returns.
    """
    # NOTE: `x or FLAG` means an explicit falsy argument (False, 0) also
    # falls back to the flag value.
    iterations = iterations or FLAGS.iterations
    add_batch_norm = add_batch_norm or FLAGS.dbof_add_batch_norm
    random_frames = sample_random_frames or FLAGS.sample_random_frames
    cluster_size = cluster_size or FLAGS.dbof_cluster_size
    hidden1_size = hidden_size or FLAGS.dbof_hidden_size
    fc_dimred = FLAGS.fc_dimred
    relu = FLAGS.dbof_relu
    max_pool = FLAGS.softdbof_maxpool

    num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
    if random_frames:
        model_input = utils.SampleRandomFrames(model_input, num_frames,
                                               iterations)
    else:
        model_input = utils.SampleRandomSequence(model_input, num_frames,
                                                 iterations)

    max_frames = model_input.get_shape().as_list()[1]
    feature_size = model_input.get_shape().as_list()[2]
    # One row per frame.
    reshaped_input = tf.reshape(model_input, [-1, feature_size])
    tf.summary.histogram("input_hist", reshaped_input)

    video_Dbof = SoftDBoF(1024, max_frames, cluster_size, max_pool,
                          add_batch_norm, is_training)
    # Floor division keeps the cluster count an int on Python 3 as well;
    # a float here would end up in variable/reshape shapes.
    audio_Dbof = SoftDBoF(128, max_frames, cluster_size // 8, max_pool,
                          add_batch_norm, is_training)

    if add_batch_norm:
        reshaped_input = slim.batch_norm(
            reshaped_input,
            center=True,
            scale=True,
            is_training=is_training,
            scope="input_bn")

    with tf.variable_scope("video_DBOF"):
        dbof_video = video_Dbof.forward(reshaped_input[:, 0:1024])

    with tf.variable_scope("audio_DBOF"):
        dbof_audio = audio_Dbof.forward(reshaped_input[:, 1024:])

    dbof = tf.concat([dbof_video, dbof_audio], 1)
    dbof_dim = dbof.get_shape().as_list()[1]

    if fc_dimred:
        hidden1_weights = tf.get_variable(
            "hidden1_weights",
            [dbof_dim, hidden1_size],
            initializer=tf.random_normal_initializer(
                stddev=1 / math.sqrt(cluster_size)))
        tf.summary.histogram("hidden1_weights", hidden1_weights)
        activation = tf.matmul(dbof, hidden1_weights)

        if add_batch_norm and relu:
            activation = slim.batch_norm(
                activation,
                center=True,
                scale=True,
                is_training=is_training,
                scope="hidden1_bn")
        else:
            hidden1_biases = tf.get_variable(
                "hidden1_biases",
                [hidden1_size],
                initializer=tf.random_normal_initializer(stddev=0.01))
            tf.summary.histogram("hidden1_biases", hidden1_biases)
            activation += hidden1_biases

        if relu:
            activation = tf.nn.relu6(activation)
        tf.summary.histogram("hidden1_output", activation)
    else:
        # No dimensionality reduction: classify the raw encoding.
        activation = dbof

    aggregated_model = getattr(video_level_models,
                               FLAGS.video_level_classifier_model)
    return aggregated_model().create_model(
        model_input=activation,
        vocab_size=vocab_size,
        is_training=is_training,
        **unused_params)
def create_model(self, model_input, vocab_size, num_frames, iterations=None,
                 add_batch_norm=None, sample_random_frames=None,
                 cluster_size=None, hidden_size=None, is_training=True,
                 **unused_params):
    """Build a Deep Bag-of-Frames model feeding a video-level classifier.

    Sampled frames are projected to `cluster_size` activations per frame,
    pooled over frames with ``FLAGS.dbof_pooling_method``, projected to
    `hidden_size` and passed to the classifier named by
    ``FLAGS.video_level_classifier_model``.

    Args:
        model_input: frame-level features; indexed as
            ``[batch, frames, features]`` after sampling.
        vocab_size: forwarded to the video-level classifier.
        num_frames: per-example frame counts, used by the frame samplers.
        iterations: number of frames to sample; falsy -> ``FLAGS.iterations``.
        add_batch_norm: falsy -> ``FLAGS.dbof_add_batch_norm``.
        sample_random_frames: falsy -> ``FLAGS.sample_random_frames``.
        cluster_size: falsy -> ``FLAGS.dbof_cluster_size``.
        hidden_size: falsy -> ``FLAGS.dbof_hidden_size``.
        is_training: forwarded to the batch-norm layers.
        **unused_params: forwarded untouched to the classifier.

    Returns:
        Whatever the selected video-level model's ``create_model`` returns.
    """
    # NOTE: `x or FLAG` means an explicit falsy argument (False, 0) also
    # falls back to the flag value.
    iterations = iterations or FLAGS.iterations
    add_batch_norm = add_batch_norm or FLAGS.dbof_add_batch_norm
    random_frames = sample_random_frames or FLAGS.sample_random_frames
    cluster_size = cluster_size or FLAGS.dbof_cluster_size
    hidden1_size = hidden_size or FLAGS.dbof_hidden_size

    num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
    if random_frames:
        model_input = utils.SampleRandomFrames(model_input, num_frames,
                                               iterations)
    else:
        model_input = utils.SampleRandomSequence(model_input, num_frames,
                                                 iterations)

    max_frames = model_input.get_shape().as_list()[1]
    feature_size = model_input.get_shape().as_list()[2]
    # One row per frame.
    reshaped_input = tf.reshape(model_input, [-1, feature_size])
    tf.summary.histogram("input_hist", reshaped_input)

    if add_batch_norm:
        reshaped_input = slim.batch_norm(
            reshaped_input,
            center=True,
            scale=True,
            is_training=is_training,
            scope="input_bn")

    cluster_weights = tf.get_variable(
        "cluster_weights",
        [feature_size, cluster_size],
        initializer=tf.random_normal_initializer(
            stddev=1 / math.sqrt(feature_size)))
    tf.summary.histogram("cluster_weights", cluster_weights)
    activation = tf.matmul(reshaped_input, cluster_weights)

    if add_batch_norm:
        activation = slim.batch_norm(
            activation,
            center=True,
            scale=True,
            is_training=is_training,
            scope="cluster_bn")
    else:
        # Must be an *initializer*: tf.random_normal is the sampling op and
        # requires a shape argument, so calling it with only `stddev` fails.
        cluster_biases = tf.get_variable(
            "cluster_biases",
            [cluster_size],
            initializer=tf.random_normal_initializer(
                stddev=1 / math.sqrt(feature_size)))
        tf.summary.histogram("cluster_biases", cluster_biases)
        activation += cluster_biases

    activation = tf.nn.relu6(activation)
    tf.summary.histogram("cluster_output", activation)

    # Back to [batch, frames, clusters] so frames can be pooled per video.
    activation = tf.reshape(activation, [-1, max_frames, cluster_size])
    activation = utils.FramePooling(activation, FLAGS.dbof_pooling_method)

    hidden1_weights = tf.get_variable(
        "hidden1_weights",
        [cluster_size, hidden1_size],
        initializer=tf.random_normal_initializer(
            stddev=1 / math.sqrt(cluster_size)))
    tf.summary.histogram("hidden1_weights", hidden1_weights)
    activation = tf.matmul(activation, hidden1_weights)

    if add_batch_norm:
        activation = slim.batch_norm(
            activation,
            center=True,
            scale=True,
            is_training=is_training,
            scope="hidden1_bn")
    else:
        hidden1_biases = tf.get_variable(
            "hidden1_biases",
            [hidden1_size],
            initializer=tf.random_normal_initializer(stddev=0.01))
        tf.summary.histogram("hidden1_biases", hidden1_biases)
        activation += hidden1_biases

    activation = tf.nn.relu6(activation)
    tf.summary.histogram("hidden1_output", activation)

    aggregated_model = getattr(video_level_models,
                               FLAGS.video_level_classifier_model)
    return aggregated_model().create_model(
        model_input=activation,
        vocab_size=vocab_size,
        **unused_params)
def create_model(self, model_input, vocab_size, num_frames, iterations=None,
                 add_batch_norm=None, sample_random_frames=None,
                 cluster_size=None, hidden_size=None, is_training=True,
                 **unused_params):
    """Build a LightVLAD model with a two-step projection and a gated head.

    Sampled frames are flattened to one row per frame, optionally
    batch-normalized, split into the first 1024 feature columns ("video")
    and the rest ("audio") and encoded by two `LightVLAD` instances. The
    concatenated encoding goes through two stacked linear maps
    (``vlad_dim -> FLAGS.SVD_dim -> 2 * hidden_size``), batch norm, and is
    then used twice: once to compute sigmoid gates and once for a
    tanh / linear / batch-norm / tanh hidden layer. The gated hidden layer
    is passed to the classifier named by
    ``FLAGS.video_level_classifier_model``.

    Args:
        model_input: frame-level features; indexed as
            ``[batch, frames, features]`` after sampling.
        vocab_size: forwarded to the video-level classifier.
        num_frames: per-example frame counts, used by the frame samplers.
        iterations: number of frames to sample; falsy -> ``FLAGS.iterations``.
        add_batch_norm: falsy -> ``FLAGS.netvlad_add_batch_norm``.
        sample_random_frames: falsy -> ``FLAGS.sample_random_frames``.
        cluster_size: falsy -> ``FLAGS.netvlad_cluster_size``.
        hidden_size: falsy -> ``FLAGS.netvlad_hidden_size``.
        is_training: forwarded to batch norm, `LightVLAD` and the classifier.
        **unused_params: forwarded untouched to the classifier.

    Returns:
        Whatever the selected video-level model's ``create_model`` returns.
    """
    # Explicit (truthy) arguments win; anything falsy falls back to a flag.
    iterations = iterations or FLAGS.iterations
    add_batch_norm = add_batch_norm or FLAGS.netvlad_add_batch_norm
    random_frames = sample_random_frames or FLAGS.sample_random_frames
    cluster_size = cluster_size or FLAGS.netvlad_cluster_size
    hidden1_size = hidden_size or FLAGS.netvlad_hidden_size

    # These flags are read here but not consulted by this variant.
    relu = FLAGS.netvlad_relu
    dimred = FLAGS.netvlad_dimred
    gating = FLAGS.gating
    remove_diag = FLAGS.gating_remove_diag
    lightvlad = FLAGS.lightvlad
    vlagd = FLAGS.vlagd

    svd_dim = FLAGS.SVD_dim
    wide_size = int(hidden1_size * 2)

    # ---- frame sampling and flattening --------------------------------
    num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
    sample_frames = (utils.SampleRandomFrames if random_frames
                     else utils.SampleRandomSequence)
    model_input = sample_frames(model_input, num_frames, iterations)

    input_dims = model_input.get_shape().as_list()
    max_frames = input_dims[1]
    feature_size = input_dims[2]
    frame_rows = tf.reshape(model_input, [-1, feature_size])

    # ---- per-modality LightVLAD encoders -------------------------------
    video_encoder = LightVLAD(1024, max_frames, int(cluster_size),
                              add_batch_norm, is_training)
    audio_encoder = LightVLAD(128, max_frames, int(cluster_size / 2),
                              add_batch_norm, is_training)

    if add_batch_norm:  # and not lightvlad:
        frame_rows = slim.batch_norm(frame_rows,
                                     center=True,
                                     scale=True,
                                     is_training=is_training,
                                     scope="input_bn")

    with tf.variable_scope("video_VLAD"):
        video_code = video_encoder.forward(frame_rows[:, 0:1024])
    with tf.variable_scope("audio_VLAD"):
        audio_code = audio_encoder.forward(frame_rows[:, 1024:])

    vlad = tf.concat([video_code, audio_code], 1)  # None x vlad_dim
    vlad_dim = vlad.get_shape().as_list()[1]

    # ---- two stacked linear maps (the "simpler SVD") -------------------
    down_weights = tf.get_variable(
        "hidden1_weights", [vlad_dim, svd_dim],
        initializer=tf.glorot_uniform_initializer())
    up_weights = tf.get_variable(
        "hidden2_weights", [svd_dim, wide_size],
        initializer=tf.glorot_uniform_initializer())
    down_biases = tf.get_variable(
        "SVD_mat1_biases", [svd_dim],
        initializer=tf.random_normal_initializer(stddev=0.01))
    up_biases = tf.get_variable(
        "SVD_mat2_biases", [wide_size],
        initializer=tf.random_normal_initializer(stddev=0.01))

    projected = tf.matmul(vlad, down_weights)        # None x SVD_dim
    projected = projected + down_biases
    projected = tf.matmul(projected, up_weights)     # None x 2*hidden1_size
    projected = projected + up_biases
    tf.summary.histogram("activation_before_bn", projected)
    projected = slim.batch_norm(projected,
                                center=True,
                                scale=True,
                                is_training=is_training,
                                scope="hidden1_bn")
    tf.summary.histogram("activation_after_bn", projected)

    # ---- gates ---------------------------------------------------------
    gate_weights = tf.get_variable(
        "gating_weights_2", [wide_size, hidden1_size],
        initializer=tf.random_normal_initializer(
            stddev=1 / math.sqrt(hidden1_size)))
    gates = tf.matmul(projected, gate_weights)
    gates = slim.batch_norm(gates,
                            center=True,
                            scale=True,
                            is_training=is_training,
                            scope="gating_bn")
    gates = tf.sigmoid(gates)
    tf.summary.histogram("gates_layer", gates)

    # ---- hidden layer --------------------------------------------------
    hidden = tf.nn.tanh(projected)
    tf.summary.histogram("activation_after_bn_after_1_tanh", hidden)
    hidden_weights = tf.get_variable(
        "activation_hidden_weights", [wide_size, hidden1_size],
        initializer=tf.glorot_uniform_initializer())
    hidden = tf.matmul(hidden, hidden_weights)
    hidden = slim.batch_norm(hidden,
                             center=True,
                             scale=True,
                             is_training=is_training,
                             scope="hidden_layer_bn")
    tf.summary.histogram("activation_after_bn_after_1_tanh_after_bn", hidden)
    hidden = tf.nn.tanh(hidden)
    tf.summary.histogram(
        "activation_after_bn_after_1_tanh_after_bn_after_2_tanh", hidden)

    gated = tf.multiply(hidden, gates)
    tf.summary.histogram("activation_right_before_video", gated)

    classifier_cls = getattr(video_level_models,
                             FLAGS.video_level_classifier_model)
    return classifier_cls().create_model(model_input=gated,
                                         vocab_size=vocab_size,
                                         is_training=is_training,
                                         **unused_params)
def build_CVAE_v2(self):
    """Build the convolutional VAE graph: encoder, latent sampling, decoder.

    Reads ``self.model_input`` and ``self.reuse_variables``; sets
    ``self.z_mu``, ``self.z_log_var``, ``self.z_sample``, ``self.de_net``
    and ``self.logits``.
    """
    with tf.variable_scope('Encoder', reuse=self.reuse_variables):
        # Q(Z|X) (approximate posterior distribution)
        # encoder : Multivariate Gaussian

        # Initial conv block, H/2. Batch norm always runs in inference mode.
        stem = slim.conv2d(inputs=self.model_input,
                           num_outputs=32,
                           kernel_size=7,
                           stride=2,
                           scope='conv1')
        stem = slim.batch_norm(stem, is_training=False)
        stem = tf.nn.relu(stem)
        features = slim.max_pool2d(stem, 3, stride=2, padding='SAME')

        # mean / variation: local 3x3 statistics of the feature map.
        # `z_log_var` is computed as E[x^2] - E[x]^2 over the window.
        self.z_mu = slim.avg_pool2d(features,
                                    kernel_size=3,
                                    stride=1,
                                    padding='SAME')
        second_moment = slim.avg_pool2d(features ** 2,
                                        kernel_size=3,
                                        stride=1,
                                        padding='SAME')
        self.z_log_var = second_moment - self.z_mu ** 2

    with tf.variable_scope('Sampling_z'):
        # sampling z using reparameterization trick
        noise = tf.random_normal(shape=tf.shape(self.z_mu), dtype=tf.float32)
        self.z_sample = self.z_mu + tf.exp(self.z_log_var / 2.) * noise

    with tf.variable_scope('Decoder', reuse=self.reuse_variables):
        # P(X|Z) (likelihood): four stride-2 transposed convolutions ...
        self.de_net = self.z_sample
        for width, layer_name in ((256, 'deconv1'), (128, 'deconv2'),
                                  (32, 'deconv3'), (16, 'deconv4')):
            self.de_net = slim.conv2d_transpose(inputs=self.de_net,
                                                num_outputs=width,
                                                kernel_size=3,
                                                stride=2,
                                                scope=layer_name)
        # ... followed by a fifth one producing the 3-channel output.
        self.logits = slim.conv2d_transpose(inputs=self.de_net,
                                            num_outputs=3,
                                            kernel_size=3,
                                            stride=2,
                                            activation_fn=tf.nn.elu)
def build_AE(self):
    """Build the auto-encoder graph.

    Reads ``self.model_input`` and ``self.reuse_variables``; sets
    ``self.de_net``, ``self.de_net2``, ``self.de_net3`` and ``self.logits``.
    """
    with tf.variable_scope('Encoder', reuse=self.reuse_variables):
        # Initial conv block, H/2
        encoded = slim.conv2d(inputs=self.model_input,
                              num_outputs=32,
                              kernel_size=7,
                              stride=2,
                              scope='conv1')
        encoded = tf.nn.relu(slim.batch_norm(encoded))

        # H/4
        encoded = slim.conv2d(inputs=encoded,
                              num_outputs=32,
                              kernel_size=3,
                              stride=2)

        # H/8
        encoded = slim.max_pool2d(encoded,
                                  kernel_size=3,
                                  stride=2,
                                  padding='SAME')
        trunk = slim.conv2d(inputs=encoded,
                            num_outputs=64,
                            kernel_size=3,
                            stride=1,
                            scope='conv2')

        # denseblock series, H/8: two parallel dense blocks on the same
        # input, concatenated along the channel axis.
        conv_branch = self.conv_denseblock(trunk, num_outputs=128,
                                           kernel_size=3)
        pool_branch = self.max_denseblock(trunk, num_outputs=128,
                                          kernel_size=3)
        merged = tf.concat([conv_branch, pool_branch], axis=3)

        # dilated dense convolution
        encoded = self.denseconv(merged, num_outputs=512, kernel_size=3,
                                 dilation_rate=3)
        encoded = self.denseconv(encoded, num_outputs=512, kernel_size=3,
                                 dilation_rate=6)

    with tf.variable_scope('Decoder', reuse=self.reuse_variables):
        # H/4
        self.de_net = slim.conv2d_transpose(inputs=encoded,
                                            num_outputs=256,
                                            kernel_size=3,
                                            stride=2,
                                            scope='deconv1')
        refined = slim.conv2d(inputs=self.de_net,
                              num_outputs=128,
                              kernel_size=3)

        # H/2
        self.de_net2 = slim.conv2d_transpose(inputs=refined,
                                             num_outputs=64,
                                             kernel_size=3,
                                             stride=2,
                                             scope='deconv2')
        refined = slim.conv2d(inputs=self.de_net2,
                              num_outputs=32,
                              kernel_size=3)

        # H
        self.de_net3 = slim.conv2d_transpose(inputs=refined,
                                             num_outputs=16,
                                             kernel_size=3,
                                             stride=2,
                                             scope='deconv3')
        self.logits = slim.conv2d(inputs=self.de_net3,
                                  num_outputs=1,
                                  kernel_size=3)
def get_embd(inputs, is_training_dropout, is_training_bn, config, reuse=False,
             scope='embd_extractor'):
    """Run the configured backbone and (optionally) the embedding head.

    Args:
        inputs: input tensor fed to the backbone.
        is_training_dropout: `is_training` flag for the dropout layer of the
            embedding head.
        is_training_bn: `is_training` flag for the backbone and for the
            batch-norm layers of the embedding head.
        config: mapping that provides ``'backbone_type'`` and ``'out_type'``;
            ResNet backbones also read ``'weight_decay'`` and ``'bn_decay'``;
            the ``'E'`` head also reads ``'keep_prob'`` and ``'embd_size'``.
        reuse: forwarded to the enclosing ``tf.variable_scope``.
        scope: name of the enclosing variable scope.

    Returns:
        ``(net, end_points)``; ``end_points['embds']`` holds the same tensor
        as ``net``.

    Raises:
        ValueError: for an unknown ``backbone_type`` or ``out_type``.
    """
    with tf.variable_scope(scope, reuse=reuse):
        net = inputs
        end_points = {}
        # Only the ResNet backbones define an arg scope. Keep an explicit
        # None so the embedding head below never hits an unbound name.
        arg_sc = None
        backbone_type = config['backbone_type']

        if backbone_type.startswith('resnet_v2_m'):
            arg_sc = modifiedResNet_v2.resnet_arg_scope(
                weight_decay=config['weight_decay'],
                batch_norm_decay=config['bn_decay'])
            with slim.arg_scope(arg_sc):
                if backbone_type == 'resnet_v2_m_50':
                    net, end_points = modifiedResNet_v2.resnet_v2_m_50(
                        net, is_training=is_training_bn, return_raw=True)
                elif backbone_type == 'resnet_v2_m_101':
                    net, end_points = modifiedResNet_v2.resnet_v2_m_101(
                        net, is_training=is_training_bn, return_raw=True)
                elif backbone_type == 'resnet_v2_m_152':
                    net, end_points = modifiedResNet_v2.resnet_v2_m_152(
                        net, is_training=is_training_bn, return_raw=True)
                elif backbone_type == 'resnet_v2_m_200':
                    net, end_points = modifiedResNet_v2.resnet_v2_m_200(
                        net, is_training=is_training_bn, return_raw=True)
                else:
                    raise ValueError('Invalid backbone type.')
        elif backbone_type == 'mbv3':
            net = mobilenetv3(
                net,
                mode='face.large',
                is_train=is_training_bn,
            )
        elif backbone_type.startswith('resnet_v2'):
            arg_sc = ResNet_v2.resnet_arg_scope(
                weight_decay=config['weight_decay'],
                batch_norm_decay=config['bn_decay'])
            with slim.arg_scope(arg_sc):
                if backbone_type == 'resnet_v2_50':
                    net, end_points = ResNet_v2.resnet_v2_50(
                        net, is_training=is_training_bn, return_raw=True)
                elif backbone_type == 'resnet_v2_101':
                    net, end_points = ResNet_v2.resnet_v2_101(
                        net, is_training=is_training_bn, return_raw=True)
                elif backbone_type == 'resnet_v2_152':
                    net, end_points = ResNet_v2.resnet_v2_152(
                        net, is_training=is_training_bn, return_raw=True)
                elif backbone_type == 'resnet_v2_200':
                    net, end_points = ResNet_v2.resnet_v2_200(
                        net, is_training=is_training_bn, return_raw=True)
                else:
                    raise ValueError('Invalid backbone type.')
        else:
            # Previously an unknown backbone silently passed `inputs` through.
            raise ValueError('Invalid backbone type.')

        def _embedding_head(features):
            """BN -> dropout -> flatten -> FC -> BN (no scale)."""
            features = slim.batch_norm(features, activation_fn=None,
                                       is_training=is_training_bn)
            features = slim.dropout(features, keep_prob=config['keep_prob'],
                                    is_training=is_training_dropout)
            features = slim.flatten(features)
            features = slim.fully_connected(features, config['embd_size'],
                                            normalizer_fn=None,
                                            activation_fn=None)
            features = slim.batch_norm(features, scale=False,
                                       activation_fn=None,
                                       is_training=is_training_bn)
            return features

        if config['out_type'] == 'E':
            if arg_sc is None:
                # NOTE(review): the 'mbv3' backbone defines no arg scope, so
                # the head is built with slim's defaults here - confirm this
                # is the intended configuration for that backbone.
                net = _embedding_head(net)
            else:
                with slim.arg_scope(arg_sc):
                    net = _embedding_head(net)
            end_points['embds'] = net
        elif config['out_type'] == 'N':
            end_points['embds'] = net
        else:
            raise ValueError('Invalid out type.')

        return net, end_points
def mobilenet(inputs, num_classes=1000, is_training=True, width_multiplier=3,
              scope='MobileNet'):
    """ MobileNet
    More detail, please refer to Google's paper(https://arxiv.org/abs/1704.04861).

    Args:
      inputs: a tensor of size [batch_size, height, width, channels].
      num_classes: number of predicted classes.
      is_training: whether or not the model is being trained.
      width_multiplier: factor applied (with rounding) to the filter count of
        every convolution; 1 keeps the counts written in the code below.
      scope: Optional scope for the variables.
    Returns:
      logits: the pre-softmax activations, a tensor of size
        [batch_size, `num_classes`]
      end_points: a dictionary from components of the network to the
        corresponding activation.
    """

    def _depthwise_separable_conv(inputs, num_pwc_filters, width_multiplier,
                                  sc, downsample=False):
        """ Helper function to build the depth-wise separable convolution layer.
        """
        num_pwc_filters = round(num_pwc_filters * width_multiplier)
        _stride = 2 if downsample else 1

        # skip pointwise by setting num_outputs=None
        depthwise_conv = slim.separable_convolution2d(
            inputs,
            num_outputs=None,
            stride=_stride,
            depth_multiplier=1,
            kernel_size=[3, 3],
            scope=sc + '/depthwise_conv')
        bn = slim.batch_norm(depthwise_conv, scope=sc + '/dw_batch_norm')
        pointwise_conv = slim.convolution2d(bn,
                                            num_pwc_filters,
                                            kernel_size=[1, 1],
                                            scope=sc + '/pointwise_conv')
        bn = slim.batch_norm(pointwise_conv, scope=sc + '/pw_batch_norm')
        return bn

    with tf.variable_scope(scope) as sc:
        end_points_collection = sc.name + '_end_points'
        with slim.arg_scope([slim.convolution2d, slim.separable_convolution2d],
                            activation_fn=None,
                            outputs_collections=[end_points_collection]):
            with slim.arg_scope([slim.batch_norm],
                                is_training=is_training,
                                activation_fn=tf.nn.relu,
                                fused=True):
                net = slim.convolution2d(inputs,
                                         round(32 * width_multiplier), [3, 3],
                                         stride=2,
                                         padding='SAME',
                                         scope='conv_1')
                net = slim.batch_norm(net, scope='conv_1/batch_norm')
                net = _depthwise_separable_conv(net, 64, width_multiplier, sc='conv_ds_2')
                net = _depthwise_separable_conv(net, 128, width_multiplier, downsample=True, sc='conv_ds_3')
                net = _depthwise_separable_conv(net, 128, width_multiplier, sc='conv_ds_4')
                net = _depthwise_separable_conv(net, 256, width_multiplier, downsample=True, sc='conv_ds_5')
                net = _depthwise_separable_conv(net, 256, width_multiplier, sc='conv_ds_6')
                net = _depthwise_separable_conv(net, 512, width_multiplier, downsample=True, sc='conv_ds_7')

                net = _depthwise_separable_conv(net, 512, width_multiplier, sc='conv_ds_8')
                net = _depthwise_separable_conv(net, 512, width_multiplier, sc='conv_ds_9')
                net = _depthwise_separable_conv(net, 512, width_multiplier, sc='conv_ds_10')
                net = _depthwise_separable_conv(net, 512, width_multiplier, sc='conv_ds_11')
                net = _depthwise_separable_conv(net, 512, width_multiplier, sc='conv_ds_12')

                net = _depthwise_separable_conv(net, 1024, width_multiplier, downsample=True, sc='conv_ds_13')
                net = _depthwise_separable_conv(net, 1024, width_multiplier, sc='conv_ds_14')
                net = slim.avg_pool2d(net, [7, 7], scope='avg_pool_15')

        end_points = slim.utils.convert_collection_to_dict(
            end_points_collection)
        # Drop the two spatial axes left over from pooling, not the batch
        # axis, so the logits come out as [batch_size, num_classes].
        net = tf.squeeze(net, [1, 2], name='SpatialSqueeze')
        end_points['squeeze'] = net
        logits = slim.fully_connected(net, num_classes, activation_fn=None,
                                      scope='fc_16')
        predictions = slim.softmax(logits, scope='Predictions')

        end_points['Logits'] = logits
        end_points['Predictions'] = predictions

    return logits, end_points
def D(self, inputs, reuse=False, training=True):
    """
    Feed-forward pass of the discriminator: one shared convolution followed
    by a classification branch and a real/fake branch.

    :param inputs: input batch; per the original author's note its shape is
        [batch_size, time_step, channel]. It is reshaped to
        [-1, FEATURE_LEN, 1, 1] before the first convolution.
    :param reuse: forwarded to the outer ``tf.variable_scope("D", ...)``.
    :param training: when truthy, dropout keeps units with probability 0.5;
        otherwise the keep probability is 1.0. The batch-norm mode comes from
        ``self.bn_train``, not from this flag.
    :return: tuple ``(logits, real_fake_logits)``; the second element has
        already been passed through a sigmoid.
    """
    # Dropout keep probability: 0.5 while training, 1.0 otherwise.
    keep_prob = 1.0
    if training:
        keep_prob = 0.5
    # `scale` argument shared by the first two batch-norm layers.
    norm_scale = False
    with tf.variable_scope("D", reuse=reuse):
        with tf.name_scope("Reshaping_data") as scope:
            x_image = tf.reshape(inputs, [-1, self.config.FEATURE_LEN, 1, 1])
        # ---- shared trunk: conv -> batch norm -> relu ----
        with tf.name_scope("Conv1") as scope:
            a_conv1 = slim.conv2d(x_image, num_outputs=self.config.num_filt_1, kernel_size=[5, 1], scope='conv1')
        with tf.name_scope('Batch_norm_conv1') as scope:
            # updates_collections=None: moving statistics are updated in place.
            a_conv1 = slim.batch_norm(a_conv1, is_training=self.bn_train, scale=norm_scale, updates_collections=None)
            h_conv1 = tf.nn.relu(a_conv1)
            # h_conv1 = slim.avg_pool2d(h_conv1, kernel_size=2, stride=2, padding='SAME')
        # ---- branch 1: class logits ----
        with tf.variable_scope('classification_branch'):
            with tf.name_scope("Conv2") as scope:
                W_conv2 = tf.get_variable("Conv_Layer_2", shape=[4, 1, self.config.num_filt_1, self.config.num_filt_2], initializer=initializer)
                b_conv2 = bias_variable([self.config.num_filt_2], 'bias_for_Conv_Layer_2')
                a_conv2 = conv2d(h_conv1, W_conv2) + b_conv2
            with tf.name_scope('Batch_norm_conv2') as scope:
                a_conv2 = slim.batch_norm(a_conv2, is_training=self.bn_train, scale=norm_scale, updates_collections=None)
                h_conv2 = tf.nn.relu(a_conv2)
                # h_conv2 = slim.max_pool2d(h_conv2, kernel_size=2, stride=2, padding="SAME")
            # A third convolution is added for every dataset except 'Adiac'.
            # It re-uses the local names W_conv2/b_conv2/a_conv2/h_conv2, so
            # the fully connected layer below sees whichever block ran last.
            if self.config.DATASET_NAME != 'Adiac':
                with tf.name_scope("Conv3") as scope:
                    W_conv2 = tf.get_variable("Conv_Layer_3", shape=[4, 1, self.config.num_filt_2, self.config.num_filt_3], initializer=initializer)
                    b_conv2 = bias_variable([self.config.num_filt_3], 'bias_for_Conv_Layer_3')
                    # NOTE(review): `kernel=2` is interpreted by the external
                    # conv2d helper - confirm its meaning there.
                    a_conv2 = conv2d(h_conv2, W_conv2, kernel=2) + b_conv2
                with tf.name_scope('Batch_norm_conv3') as scope:
                    # Unlike the earlier layers this one uses scale=True.
                    a_conv2 = slim.batch_norm(a_conv2, is_training=self.bn_train, scale=True, updates_collections=None)
                    h_conv2 = tf.nn.relu(a_conv2)
            with tf.name_scope("Fully_Connected1") as scope:
                # Input width is taken from the static shape of h_conv2, so it
                # is correct with or without the optional third convolution.
                W_fc1 = tf.get_variable("Fully_Connected_layer_1", shape=[np.prod(h_conv2.get_shape().as_list()[1:]), self.config.num_fc_1], initializer=initializer)
                b_fc1 = bias_variable([self.config.num_fc_1], 'bias_for_Fully_Connected_Layer_1')
                h_conv3_flat = tf.reshape(h_conv2, [-1, np.prod(h_conv2.get_shape().as_list()[1:])])
                h_fc1 = tf.nn.relu(tf.matmul(h_conv3_flat, W_fc1) + b_fc1)
            with tf.name_scope("Fully_Connected2") as scope:
                h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob)
                W_fc2 = tf.get_variable("W_fc2", shape=[self.config.num_fc_1, self.config.NUM_CLASSES], initializer=initializer)
                b_fc2 = tf.Variable(tf.constant(0.1, shape=[self.config.NUM_CLASSES]), name='b_fc2')
                logits = tf.matmul(h_fc1_drop, W_fc2) + b_fc2
        # ---- branch 2: real/fake output, fed from the same h_conv1 ----
        with tf.variable_scope('real_fake_branch'):
            with tf.name_scope("Conv2") as scope:
                W_conv2 = tf.get_variable("Conv_Layer_2", shape=[4, 1, self.config.num_filt_1, self.config.num_filt_2], initializer=initializer)
                b_conv2 = bias_variable([self.config.num_filt_2], 'bias_for_Conv_Layer_2')
                a_conv2 = conv2d(h_conv1, W_conv2) + b_conv2
            with tf.name_scope('Batch_norm_conv2') as scope:
                # No explicit `scale` here, so slim's default applies.
                a_conv2 = slim.batch_norm(a_conv2, is_training=self.bn_train, updates_collections=None)
                h_conv2 = tf.nn.relu(a_conv2)
            with tf.name_scope("Fully_Connected1") as scope:
                # Input width is hard-coded as FEATURE_LEN * num_filt_2, i.e.
                # it assumes the convolution keeps the length at FEATURE_LEN
                # - TODO confirm against the conv2d helper.
                W_fc1 = tf.get_variable("Fully_Connected_layer_1", shape=[self.config.FEATURE_LEN * self.config.num_filt_2, self.config.num_fc_1], initializer=initializer)
                b_fc1 = bias_variable([self.config.num_fc_1], 'bias_for_Fully_Connected_Layer_1')
                h_conv3_flat = tf.reshape(h_conv2, [-1, self.config.FEATURE_LEN * self.config.num_filt_2])
                h_fc1 = tf.nn.relu(tf.matmul(h_conv3_flat, W_fc1) + b_fc1)
            with tf.name_scope("Fully_Connected2") as scope:
                h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob)
                # NOTE(review): the weight has NUM_CLASSES output columns
                # while the bias has a single element (broadcast on add) -
                # confirm the intended width of the real/fake output.
                W_fc2 = tf.get_variable("W_fc2", shape=[self.config.num_fc_1, self.config.NUM_CLASSES], initializer=initializer)
                b_fc2 = tf.Variable(tf.constant(0.1, shape=[1]), name='b_fc2')
                real_fake_logits = tf.matmul(h_fc1_drop, W_fc2) + b_fc2
                # Despite the name, the returned value is post-sigmoid.
                real_fake_logits = tf.sigmoid(real_fake_logits)
        return logits, real_fake_logits
def forward(self, reshaped_input):
    """Encode frames into a VLAD descriptor refined by a relation block.

    Reads ``self.feature_size``, ``self.cluster_size``, ``self.max_frames``,
    ``self.add_batch_norm`` and ``self.is_training``.

    Args:
        reshaped_input: 2-D tensor with one row per frame,
            ``[batch * max_frames, feature_size]``.

    Returns:
        L2-normalized descriptor of shape
        ``[batch, cluster_size * feature_size]``.
    """
    # ---- soft assignment of every frame to the clusters ----------------
    cluster_weights = tf.get_variable(
        "cluster_weights", [self.feature_size, self.cluster_size],
        initializer=tf.random_normal_initializer(
            stddev=1 / math.sqrt(self.feature_size)))
    tf.summary.histogram("cluster_weights", cluster_weights)
    activation = tf.matmul(reshaped_input, cluster_weights)

    if self.add_batch_norm:
        activation = slim.batch_norm(activation,
                                     center=True,
                                     scale=True,
                                     is_training=self.is_training,
                                     scope="cluster_bn")
    else:
        # The size lives on the instance; a bare `cluster_size` is not
        # defined in this method and raised NameError on this branch.
        cluster_biases = tf.get_variable(
            "cluster_biases", [self.cluster_size],
            initializer=tf.random_normal_initializer(
                stddev=1 / math.sqrt(self.feature_size)))
        tf.summary.histogram("cluster_biases", cluster_biases)
        activation += cluster_biases

    activation = tf.nn.softmax(activation)
    tf.summary.histogram("cluster_output", activation)

    # [batch, frames, clusters]
    activation = tf.reshape(activation,
                            [-1, self.max_frames, self.cluster_size])

    # ---- VLAD residuals -------------------------------------------------
    # Total assignment mass per cluster: [batch, 1, clusters].
    a_sum = tf.reduce_sum(activation, -2, keep_dims=True)
    cluster_weights2 = tf.get_variable(
        "cluster_weights2", [1, self.feature_size, self.cluster_size],
        initializer=tf.random_normal_initializer(
            stddev=1 / math.sqrt(self.feature_size)))
    # Mass-weighted cluster centres: [batch, features, clusters].
    a = tf.multiply(a_sum, cluster_weights2)

    activation = tf.transpose(activation, perm=[0, 2, 1])
    reshaped_input = tf.reshape(reshaped_input,
                                [-1, self.max_frames, self.feature_size])
    vlad = tf.matmul(activation, reshaped_input)   # [b, clusters, features]
    vlad = tf.transpose(vlad, perm=[0, 2, 1])      # [b, features, clusters]
    vlad = tf.subtract(vlad, a)
    vlad = tf.transpose(vlad, perm=[0, 2, 1])      # [b, clusters, features]
    vlad = tf.reshape(vlad, [-1, self.feature_size])

    # ---- relation (non-local) block over the cluster descriptors --------
    vlad_softmax = self.embedgaussian_relation(vlad, 1 / float(64))
    nonlocal_g = tf.get_variable(
        "nonlocal_g", [self.feature_size, self.cluster_size],
        initializer=tf.random_normal_initializer(
            stddev=1 / math.sqrt(self.feature_size)))
    nonlocal_out = tf.get_variable(
        "nonlocal_out", [self.cluster_size, self.feature_size],
        initializer=tf.random_normal_initializer(
            stddev=1 / math.sqrt(self.cluster_size)))

    vlad_g = tf.matmul(vlad, nonlocal_g)
    vlad_g = tf.reshape(vlad_g, [-1, self.cluster_size, self.cluster_size])
    vlad_g = tf.matmul(vlad_softmax, vlad_g)
    vlad_g = tf.reshape(vlad_g, [-1, self.cluster_size])
    vlad_g = tf.matmul(vlad_g, nonlocal_out)
    vlad_g = tf.reshape(vlad_g, [-1, self.cluster_size, self.feature_size])

    # Residual connection around the relation block.
    vlad = tf.reshape(vlad, [-1, self.cluster_size, self.feature_size])
    vlad = vlad + vlad_g

    # ---- normalization: along axis 1, then the flattened vector ---------
    vlad = tf.transpose(vlad, perm=[0, 2, 1])
    vlad = tf.nn.l2_normalize(vlad, 1)  # [b,f,c]
    vlad = tf.reshape(vlad, [-1, self.cluster_size * self.feature_size])
    vlad = tf.nn.l2_normalize(vlad, 1)
    return vlad
def main(args):
    """Evaluate the per-epoch checkpoints of a trained model.

    For every epoch in `range(1, cfg.epoch)` the checkpoint whose global
    step equals `num_batches_per_epoch_train * epoch` is restored from
    `cfg.logdir/<model_name>/<dataset_name>/` and evaluated on
    `num_batches_test` batches produced by the input pipeline built with
    `is_train=False`. Accuracy summaries are written under
    `cfg.test_logdir/<model_name>/<dataset_name>/`.

    Args:
        args: Three-element sequence `[program, dataset_name, model_name]`.
            `model_name` must be 'caps' or 'cnn_baseline'.

    Raises:
        ValueError: If `model_name` is not a supported model.
    """
    assert len(args) == 3 and isinstance(args[1], str) and isinstance(args[2], str)
    dataset_name = args[1]
    model_name = args[2]

    # Dataset hyperparameters.
    coord_add = get_coord_add(dataset_name)
    dataset_size_train = get_dataset_size_train(dataset_name)
    dataset_size_test = get_dataset_size_test(dataset_name)
    num_classes = get_num_classes(dataset_name)
    create_inputs = get_create_inputs(
        dataset_name, is_train=False, epochs=cfg.epoch)

    # Both directories share the same per-run suffix; build them once.
    run_subdir = '/{}/{}/'.format(model_name, dataset_name)
    train_logdir = cfg.logdir + run_subdir
    test_logdir = cfg.test_logdir + run_subdir

    with tf.Graph().as_default():
        # Set the reproducible random seed. The graph-level seed applies to
        # the current default graph, so it has to be set inside this
        # context to affect the graph that is actually evaluated.
        tf.set_random_seed(1234)

        num_batches_per_epoch_train = int(dataset_size_train / cfg.batch_size)
        # Only 10% of the available test batches are evaluated per checkpoint.
        num_batches_test = int(dataset_size_test / cfg.batch_size * 0.1)

        batch_x, batch_labels = create_inputs()
        batch_x = slim.batch_norm(
            batch_x, center=False, is_training=False, trainable=False)

        if model_name == "caps":
            output, _ = net.build_arch(
                batch_x, coord_add, is_train=False, num_classes=num_classes)
        elif model_name == "cnn_baseline":
            output = net.build_arch_baseline(
                batch_x, is_train=False, num_classes=num_classes)
        else:
            # Raising a bare string is itself a TypeError in Python 3.
            raise ValueError(
                "Please select model from 'caps' or 'cnn_baseline' as the secondary argument of eval.py!")

        batch_acc = net.test_accuracy(output, batch_labels)
        saver = tf.train.Saver()

        step = 0

        summaries = [tf.summary.scalar('accuracy', batch_acc)]
        summary_op = tf.summary.merge(summaries)

        session_config = tf.ConfigProto(
            allow_soft_placement=True, log_device_placement=False)
        with tf.Session(config=session_config) as sess:
            sess.run(tf.local_variables_initializer())
            sess.run(tf.global_variables_initializer())

            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(sess=sess, coord=coord)

            if not os.path.exists(test_logdir):
                os.makedirs(test_logdir)
            summary_writer = tf.summary.FileWriter(
                test_logdir, graph=sess.graph)  # graph=sess.graph, huge!

            files = os.listdir(train_logdir)
            for epoch in range(1, cfg.epoch):
                # requires a regex to adapt the loss value in the file name here
                global_step = num_batches_per_epoch_train * epoch
                index_suffix = (".ckpt-%d" % global_step) + ".index"

                # Strip the trailing ".index" to obtain the checkpoint prefix.
                ckpt = None
                for file_name in files:
                    if file_name.endswith(index_suffix):
                        ckpt = os.path.join(train_logdir, file_name[:-6])
                if ckpt is None:
                    # No checkpoint was saved for this step; skip the epoch
                    # instead of failing or re-evaluating a stale checkpoint.
                    print('no checkpoint found for step %d, skipping.' % global_step)
                    continue

                saver.restore(sess, ckpt)

                accuracy_sum = 0
                for _ in range(num_batches_test):
                    batch_acc_v, summary_str = sess.run([batch_acc, summary_op])
                    print('%d batches are tested.' % step)
                    summary_writer.add_summary(summary_str, step)
                    accuracy_sum += batch_acc_v
                    step += 1

                ave_acc = accuracy_sum / num_batches_test
                print('the average accuracy is %f' % ave_acc)

            # Ask the queue-runner threads to stop before waiting on them;
            # otherwise join() keeps blocking while they are still running.
            coord.request_stop()
            coord.join(threads)
def create_model(self,
                 model_input,
                 vocab_size,
                 num_frames,
                 iterations=None,
                 add_batch_norm=None,
                 sample_random_frames=None,
                 cluster_size=None,
                 hidden_size=None,
                 is_training=True,
                 **unused_params):
    """Build a two-stream (video + audio) DBoF frame-level model.

    Frames are sampled from `model_input`, the first 1024 feature columns
    are pooled by a video `DBoF` module and the remaining columns by an
    audio `DBoF` module, and the concatenated result is projected to a
    hidden layer that feeds the video-level classifier named by
    `FLAGS.video_level_classifier_model`.

    Args:
        model_input: 3-D tensor of frame features; dimensions 1 and 2 are
            read as the frame count and the feature size.
        vocab_size: Forwarded to the video-level classifier.
        num_frames: Per-example frame counts; expanded on axis 1 and cast
            to float32 before sampling.
        iterations: Number of frames to sample; falls back to
            `FLAGS.iterations` when falsy.
        add_batch_norm: Whether to apply batch normalization; falls back to
            `FLAGS.dbof_add_batch_norm` when `None`.
        sample_random_frames: If true, use `utils.SampleRandomFrames`,
            otherwise `utils.SampleRandomSequence`; falls back to
            `FLAGS.sample_random_frames` when `None`.
        cluster_size: Video cluster count; falls back to
            `FLAGS.dbof_cluster_size` when falsy. The audio stream uses one
            eighth of it.
        hidden_size: Hidden layer width; falls back to
            `FLAGS.dbof_hidden_size` when falsy.
        is_training: Forwarded to batch normalization and the DBoF modules.
        **unused_params: Forwarded to the video-level classifier.

    Returns:
        Whatever the selected video-level model's `create_model` returns.
    """
    iterations = iterations or FLAGS.iterations
    # Use explicit None checks for the boolean options so that a caller
    # passing False is not silently overridden by the flag value.
    if add_batch_norm is None:
        add_batch_norm = FLAGS.dbof_add_batch_norm
    random_frames = sample_random_frames
    if random_frames is None:
        random_frames = FLAGS.sample_random_frames
    cluster_size = cluster_size or FLAGS.dbof_cluster_size
    hidden1_size = hidden_size or FLAGS.dbof_hidden_size
    relu = FLAGS.dbof_relu
    cluster_activation = FLAGS.dbof_activation

    num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
    if random_frames:
        model_input = utils.SampleRandomFrames(
            model_input, num_frames, iterations)
    else:
        model_input = utils.SampleRandomSequence(
            model_input, num_frames, iterations)

    input_shape = model_input.get_shape().as_list()
    max_frames = input_shape[1]
    feature_size = input_shape[2]
    reshaped_input = tf.reshape(model_input, [-1, feature_size])
    tf.summary.histogram("input_hist", reshaped_input)

    if cluster_activation == 'glu':
        cluster_size = 2 * cluster_size

    video_Dbof = DBoF(1024, max_frames, cluster_size,
                      cluster_activation, add_batch_norm, is_training)
    # Floor division keeps the audio cluster count an int; `/` yields a
    # float on Python 3, which is not a valid variable dimension.
    audio_Dbof = DBoF(128, max_frames, cluster_size // 8,
                      cluster_activation, add_batch_norm, is_training)

    if add_batch_norm:
        reshaped_input = slim.batch_norm(
            reshaped_input,
            center=True,
            scale=True,
            is_training=is_training,
            scope="input_bn")

    # Columns [0, 1024) go to the video stream, the rest to the audio stream.
    with tf.variable_scope("video_DBOF"):
        dbof_video = video_Dbof.forward(reshaped_input[:, 0:1024])

    with tf.variable_scope("audio_DBOF"):
        dbof_audio = audio_Dbof.forward(reshaped_input[:, 1024:])

    dbof = tf.concat([dbof_video, dbof_audio], 1)
    dbof_dim = dbof.get_shape().as_list()[1]

    hidden1_weights = tf.get_variable(
        "hidden1_weights",
        [dbof_dim, hidden1_size],
        initializer=tf.random_normal_initializer(
            stddev=1 / math.sqrt(cluster_size)))
    tf.summary.histogram("hidden1_weights", hidden1_weights)
    activation = tf.matmul(dbof, hidden1_weights)

    if add_batch_norm and relu:
        activation = slim.batch_norm(
            activation,
            center=True,
            scale=True,
            is_training=is_training,
            scope="hidden1_bn")
    else:
        hidden1_biases = tf.get_variable(
            "hidden1_biases",
            [hidden1_size],
            initializer=tf.random_normal_initializer(stddev=0.01))
        tf.summary.histogram("hidden1_biases", hidden1_biases)
        activation += hidden1_biases

    if relu:
        activation = tf.nn.relu6(activation)
    tf.summary.histogram("hidden1_output", activation)

    aggregated_model = getattr(video_level_models,
                               FLAGS.video_level_classifier_model)

    return aggregated_model().create_model(
        model_input=activation,
        vocab_size=vocab_size,
        **unused_params)
def network_model__creation(incident, reuse=None, weight_decay=1e-8):
    """Build the convolutional feature-embedding network.

    Two 3x3 convolutions and a max-pool are followed by six residual
    blocks, dropout, and a fully connected layer; the output is
    batch-normalized and scaled to unit L2 norm per row.

    Args:
        incident: Input tensor fed to the first convolution.
        reuse: Passed as `reuse` to the final batch-norm layer ("ball").
        weight_decay: L2 regularization strength for the convolution and
            fully connected weights.

    Returns:
        A tuple `(model_features, None)` where `model_features` holds the
        L2-normalized feature rows.
    """
    init_fc_weight = tf.truncated_normal_initializer(stddev=1e-3)
    init_fc_bias = tf.zeros_initializer()
    regularizer_fc = slim.l2_regularizer(weight_decay)
    non_Linearity = tf.nn.elu
    conv_weight_init = tf.truncated_normal_initializer(stddev=1e-3)
    conv_bias_init = tf.zeros_initializer()
    conv_regularizer = slim.l2_regularizer(weight_decay)

    def batch_norm_fn(x):
        # Name the batch-norm scope after the enclosing layer's scope.
        return slim.batch_norm(x, scope=tf.get_variable_scope().name + "/bn")

    # Settings shared by both stem convolutions. The keyword is `stride`:
    # slim.conv2d has no `stride_new` parameter, so the previous spelling
    # raised TypeError as soon as the graph was built.
    conv_kwargs = dict(
        stride=1,
        activation_fn=non_Linearity,
        padding="SAME",
        normalizer_fn=batch_norm_fn,
        weights_initializer=conv_weight_init,
        biases_initializer=conv_bias_init,
        weights_regularizer=conv_regularizer)

    network_model_ = incident
    network_model_ = slim.conv2d(
        network_model_, 32, [3, 3], scope="conv1_1", **conv_kwargs)
    network_model_ = slim.conv2d(
        network_model_, 32, [3, 3], scope="conv1_2", **conv_kwargs)

    # NOTE(nwojke): This is missing a padding="SAME" to match the CNN
    # architecture in Table 1 of the paper. Information on how this affects
    # performance on MOT 16 training sequences can be found in
    # issue 10 https://github.com/nwojke/deep_sort/issues/10
    network_model_ = slim.max_pool2d(
        network_model_, [3, 3], [2, 2], scope="pool1")

    # NOTE(review): `if_first` must match the keyword accepted by
    # residual_block, which is defined outside this block — confirm.
    network_model_ = residual_block(
        network_model_, "conv2_1", non_Linearity, conv_weight_init,
        conv_bias_init, conv_regularizer, increase_dim=False, if_first=True)
    network_model_ = residual_block(
        network_model_, "conv2_3", non_Linearity, conv_weight_init,
        conv_bias_init, conv_regularizer, increase_dim=False)

    network_model_ = residual_block(
        network_model_, "conv3_1", non_Linearity, conv_weight_init,
        conv_bias_init, conv_regularizer, increase_dim=True)
    network_model_ = residual_block(
        network_model_, "conv3_3", non_Linearity, conv_weight_init,
        conv_bias_init, conv_regularizer, increase_dim=False)

    network_model_ = residual_block(
        network_model_, "conv4_1", non_Linearity, conv_weight_init,
        conv_bias_init, conv_regularizer, increase_dim=True)
    network_model_ = residual_block(
        network_model_, "conv4_3", non_Linearity, conv_weight_init,
        conv_bias_init, conv_regularizer, increase_dim=False)

    # Read the last-axis size before flattening; it sets the FC width.
    feature_dim = network_model_.get_shape().as_list()[-1]
    network_model_ = slim.flatten(network_model_)

    network_model_ = slim.dropout(network_model_, keep_prob=0.6)
    network_model_ = slim.fully_connected(
        network_model_, feature_dim, activation_fn=non_Linearity,
        normalizer_fn=batch_norm_fn, weights_regularizer=regularizer_fc,
        scope="fc1", weights_initializer=init_fc_weight,
        biases_initializer=init_fc_bias)

    model_features = network_model_

    # Features in rows, normalize axis 1.
    model_features = slim.batch_norm(model_features, scope="ball", reuse=reuse)
    # The small constant keeps the norm strictly positive for all-zero rows.
    feature_norm = tf.sqrt(
        tf.constant(1e-8, tf.float32) +
        tf.reduce_sum(tf.square(model_features), [1], keepdims=True))
    model_features = model_features / feature_norm
    return model_features, None