def _build_net(self, class_n):
    """Build the network.

    :param class_n: the number of label classes.
    """
    # 1st layer: FC (w/ ReLU) -> BatchNorm -> Dropout (optional)
    fc1 = fc(self._features, self.fc1_n, name="fc1")
    norm1 = tf.layers.batch_normalization(fc1, training=self._norm_mode,
                                          name="norm1")
    if self.do_dropout1:
        norm1 = tf.nn.dropout(norm1, self._keep_prob)

    # 2nd layer: FC (w/ ReLU) -> BatchNorm -> Dropout (optional)
    fc2 = fc(norm1, self.fc2_n, name="fc2")
    norm2 = tf.layers.batch_normalization(fc2, training=self._norm_mode,
                                          name="norm2")
    if self.do_dropout2:
        norm2 = tf.nn.dropout(norm2, self._keep_prob)

    # 3rd layer: FC (w/ ReLU) -> BatchNorm -> Dropout (optional)
    fc3 = fc(norm2, self.fc3_n, name="fc3")
    norm3 = tf.layers.batch_normalization(fc3, training=self._norm_mode,
                                          name="norm3")
    if self.do_dropout3:
        norm3 = tf.nn.dropout(norm3, self._keep_prob)

    # 4th layer: FC (no ReLU), fed by the 3rd layer's output
    fc4 = fc(norm3, class_n, relu=False, name="fc4")
    self._softmax = tf.nn.softmax(fc4, name="softmax")
    self._pred = tf.argmax(self._softmax, axis=1)
    self._acc = tf.reduce_mean(
        tf.cast(tf.equal(self._pred, self._labels), tf.float32))
    self._interlayers = [fc1, fc2, fc3, fc4]
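# Note: the following is an illustrative sketch, not part of the original
# model. tf.layers.batch_normalization only updates its moving mean/variance
# through ops registered in tf.GraphKeys.UPDATE_OPS, so the train op must
# depend on them; `loss` and the optimizer here are assumed stand-ins.
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
with tf.control_dependencies(update_ops):
    train_op = tf.train.AdamOptimizer(1e-3).minimize(loss)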
def resnet_v2(self, input):
    strides = [1, 2, 2, 2]
    blocks = [3, 4, 6, 3]
    num_conv = [64, 128, 256, 512]

    input = self.pad(input, 7)
    res = util.conv2d(input, [7, 7, 64], stride=[1, 2, 2, 1],
                      padding='VALID', name="conv_pre", use_bias=False)
    res = tf.nn.max_pool(res, ksize=[1, 3, 3, 1], strides=[1, 2, 2, 1],
                         padding='SAME')

    for j, b in enumerate(blocks):
        block_stride = [1, strides[j], strides[j], 1]
        res = self.resnet_v2_bottleneck_block(res, num_conv=num_conv[j],
                                              strides=block_stride,
                                              name="block" + str(j + 1) + "-1",
                                              projection_shortcut=True)
        for i in range(1, b):
            res = self.resnet_v2_bottleneck_block(
                res, num_conv=num_conv[j], strides=[1, 1, 1, 1],
                name="block" + str(j + 1) + "-" + str(i + 1))

    res = util.batch_norm(res, "post_bn", self.phase)
    res = tf.nn.relu(res)
    self.spatial = res

    # Average pooling over both spatial dimensions
    res = tf.reduce_mean(res, axis=[1, 2])

    # With ImageNet classifier
    if self.with_classifier:
        res = util.fc(res, 1001, "imagenet_dense")
    return res
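# The `pad` helper is not shown here. A minimal sketch of what it likely
# does, following the fixed-padding convention of the official ResNet code
# (an assumption, not the original implementation): pad explicitly so that
# the subsequent conv can use padding='VALID' independent of input size.
def pad(self, inputs, kernel_size):
    pad_total = kernel_size - 1
    pad_beg = pad_total // 2
    pad_end = pad_total - pad_beg
    # NHWC layout: pad only the two spatial dimensions
    return tf.pad(inputs, [[0, 0], [pad_beg, pad_end],
                           [pad_beg, pad_end], [0, 0]])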
def generate_attention_maps(self, state, feature):
    h, c = state
    DIM = self.DIM_ATT

    # There are 5 body parts. `tmp` is shared for each joint within a body
    # part. In other words, we need 5 `tmp` terms, or equivalently, one
    # `tmp` term with 5*DIM channels.

    # Compute map (Eq. 2)
    Ac = util.conv2d(feature, [1, 1, 5 * DIM], "att_pose_c", use_bias=False)
    Ah = util.fc(h, 5 * DIM, "att_pose_h", use_bias=False)
    bias = tf.get_variable("bias", shape=[5 * DIM],
                           initializer=tf.zeros_initializer())

    # A_c: Bx7x7x(5*DIM); A_h: Bx(5*DIM).
    # Add A_h to A_c by broadcasting
    tmp = tf.nn.tanh(tf.reshape(Ah, [self.BATCH, 1, 1, 5 * DIM]) + Ac + bias)
    tmp = tf.split(tmp, 5, axis=3)  # Split into 5 groups

    joint_maps = []
    joint_tens = []
    for i in range(5):
        # v is just a 1x1 convolution.
        # NOTE: From the paper, it is not entirely clear whether v is shared
        # between body parts. We assume this is NOT the case.
        res = util.conv2d(tmp[i], [1, 1, self.J], "att_map_bp" + str(i))
        res = tf.reshape(res, [self.BATCH, 7, 7, self.J])

        # Normalization (Eq. 3)
        t_res = tf.nn.softmax(res, 3)
        l_res = tf.split(t_res, self.J, axis=3)

        joint_maps.append(l_res)  # For use in assemble_parts
        joint_tens.append(tf.expand_dims(t_res, axis=1))  # For convenient loss computation

    joint_tens = tf.concat(joint_tens, axis=1)  # Resulting shape: BATCH x 5 x 7 x 7 x J
    return joint_maps, joint_tens
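# Illustrative sketch of how `assemble_parts` (referenced above and used in
# build_graph) could combine these maps with the features. This is an
# assumption inferred from the surrounding comments and shapes, not the
# original implementation: each body part pools the feature map weighted by
# the attention of its joints.
def assemble_parts_sketch(joint_maps, feature):
    # joint_maps: list of 5 lists of Bx7x7x1 maps; feature: Bx7x7x2048
    parts = []
    for part in joint_maps:
        # Sum of attention-weighted features over the joints of this part
        weighted = tf.add_n([m * feature for m in part])  # F_t^P, Bx7x7x2048
        parts.append(tf.expand_dims(weighted, axis=1))
    return tf.concat(parts, axis=1)  # B x 5 x 7 x 7 x 2048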
def generate_attention_maps( self, state, feature ):
    h, c = state
    DIM = self.DIM_ATT

    # Compute map (Eq. 2)
    Ac = util.conv2d( feature, [1, 1, DIM], name="att_pose_c" )
    Ah = util.fc( h, DIM, "att_pose_h" )

    # A_c: Bx7x7x32; A_h: Bx32.
    # Add A_h to A_c by broadcasting
    tmp = tf.nn.tanh( tf.reshape( Ah, [self.BATCH, 1, 1, DIM] ) + Ac )

    # v
    res = util.conv2d( tmp, [1, 1, self.J], name="att_map" )
    res = tf.reshape( res, [self.BATCH, 7, 7, self.J] )

    # Normalization (Eq. 3)
    # t_res = tf.nn.softmax( res, axis=3 )  # TensorFlow 1.6 and higher
    t_res = tf.nn.softmax( res, dim=3 )  # Deprecated in TensorFlow 1.8, but still works
    l_res = tf.split( t_res, self.J, axis=3 )
    return l_res, t_res
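# A small compatibility helper (illustrative, not in the source): use the
# `axis` keyword where available and fall back to the deprecated `dim` on
# older TensorFlow 1.x releases, so the snippet above runs on both.
def softmax_axis3( x ):
    try:
        return tf.nn.softmax( x, axis=3 )
    except TypeError:
        return tf.nn.softmax( x, dim=3 )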
def build_graph( self ):
    # Extract DCN features (here: ResNet v2, 50 layers)
    X = tf.reshape( self.X, [self.BT, 224, 224, 3] )
    _ = self.net.resnet_v2( X )
    features = tf.reshape( self.net.spatial, [self.BATCH, self.T, 7, 7, 2048] )
    self.features = features

    # Encoder
    with tf.variable_scope( self.scope ):
        with tf.variable_scope( "LSTM2" ) as scope:
            lstm = tf.contrib.rnn.LSTMCell( self.DIM_LSTM,
                                            initializer=tf.contrib.layers.xavier_initializer() )
            state = lstm.zero_state( self.BATCH, tf.float32 )
            feat_T = tf.split( features, self.T, axis=1 )

            outputs = []
            joint_maps = []
            for t in range( self.T ):
                # TODO: Each body part has its own variables
                if t > 0:
                    scope.reuse_variables()

                # Generate attention map for each joint and normalize
                h_rgb = tf.reshape( feat_T[t], [self.BATCH, 7, 7, 2048] )
                jm_list, jm_tensor = self.generate_attention_maps( state, h_rgb )
                joint_maps.append( tf.expand_dims( jm_tensor, axis=2 ) )  # B x 5 x T x 7 x 7 x J after concat over t

                # Assemble parts
                body_parts = self.assemble_parts( jm_list, h_rgb )  # F_t^P
                body_pooled = tf.reduce_max( body_parts, axis=1 )  # S_t
                # body_pooled = tf.reshape( body_pooled, [self.BATCH, 7*7*2048] )

                # Global pooling to save resources
                body_pooled = tf.reduce_mean( body_pooled, axis=[1, 2] )

                feat_out, state = lstm( body_pooled, state )
                outputs.append( tf.expand_dims( feat_out, axis=1 ) )

        h_lstm = tf.concat( outputs, axis=1 )
        h_lstm = tf.reshape( h_lstm, [self.BT, self.DIM_LSTM] )
        h_pred = util.fc( h_lstm, self.C, "classifier_pose" )
        h_pred = tf.reshape( h_pred, [self.BATCH, self.T, self.C] )

    # Loss computation
    var_list = tf.get_collection( tf.GraphKeys.TRAINABLE_VARIABLES, scope=self.scope )
    reg_loss = tf.get_collection( tf.GraphKeys.REGULARIZATION_LOSSES, scope=self.scope )

    # Main losses: softmax classification loss
    loss_pose_pre = tf.nn.sparse_softmax_cross_entropy_with_logits( logits=h_pred, labels=self.Y )
    loss_pose_T = loss_pose_pre
    loss_pose_cls = tf.reduce_sum( loss_pose_pre, axis=1 )

    # Main losses: joint map L2 regression loss
    joint_maps = tf.concat( joint_maps, axis=2 )
    loss_pose_l2 = 0
    # Note: we get 5 sets of attention maps, each with its own L2 loss.
    for i in range( 5 ):
        diff = tf.reshape( joint_maps[:, i] - self.P, [self.BATCH, self.T, 7 * 7 * self.J] )
        loss_pose_l2 += 0.5 * tf.reduce_sum( diff ** 2, axis=2 )

    # Total loss
    loss = tf.reduce_mean( self.l_action * loss_pose_pre + self.l_pose * loss_pose_l2 )
    reg_loss = self.lambda_l2 * tf.reduce_sum( reg_loss )  # Note: this is L2 regularization (see util.py)
    total = reg_loss + loss

    # Optimizer + batch gradient accumulation
    # opt = tf.train.RMSPropOptimizer( learning_rate=self.LR )
    opt = tf.train.AdamOptimizer( learning_rate=self.LR )
    accum_vars = [tf.Variable( tf.zeros_like( tv.initialized_value() ), trainable=False )
                  for tv in var_list]
    zero_ops = [tv.assign( tf.zeros_like( tv ) ) for tv in accum_vars]
    gvs = opt.compute_gradients( total, var_list )
    accum_ops = [accum_vars[i].assign_add( gv[0] ) for i, gv in enumerate( gvs )]
    op = opt.apply_gradients( [(accum_vars[i], gv[1]) for i, gv in enumerate( gvs )] )

    # Exposing variables
    self.joint_maps = joint_maps
    self.reg_loss = reg_loss
    self.loss_main_T = loss_pose_T
    self.loss_rpan = loss_pose_cls
    self.loss_pose = loss_pose_l2
    self.zero_ops = zero_ops
    self.accum_ops = accum_ops
    self.accum_vars = accum_vars
    self.result = tf.nn.softmax( h_pred )
    self.op = op
    self.total_loss = total
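# Illustrative use of the accumulation ops exposed above (a sketch; the
# instance name `model`, `sess`, `feed`, and ACCUM_STEPS are assumed names,
# not from the source):
sess.run( model.zero_ops )                        # reset the accumulators
for _ in range( ACCUM_STEPS ):                    # sum gradients over several mini-batches
    sess.run( model.accum_ops, feed_dict=feed )
sess.run( model.op )                              # apply the accumulated gradients once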
    pos = pos.astype(numpy.int32)
    for j, t in enumerate(pos):
        t = min(t, h5data.shape[0] - 1)
        data[i * sample_size + j] = h5data[t]
        for c in cids:
            labels[i * sample_size + j, label_dict[c]] = 1.
    return data, labels

X = tf.placeholder("float", [None, 4096])
# Labels are one-hot, so the placeholder must be float for
# softmax_cross_entropy_with_logits.
Y = tf.placeholder("float", [None, C])

fc1 = util.fc(X, C, "fc1")
pre = tf.nn.softmax_cross_entropy_with_logits(logits=fc1, labels=Y)
loss = tf.reduce_mean(pre)

optimizer = tf.train.AdamOptimizer(learning_rate=LR, epsilon=1e-8)
gradvars = optimizer.compute_gradients(loss)
capped = [(tf.clip_by_value(grad, -5, 5), var) for grad, var in gradvars]
train_op = optimizer.apply_gradients(capped)

conf = tf.ConfigProto(gpu_options=tf.GPUOptions(allow_growth=True),
                      device_count={'GPU': 1})

train_files = [l.strip() for l in open("picked_train.txt")]
test_files = [l.strip() for l in open("picked_test.txt")]
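# Defensive variant of the clipping above (illustrative, not in the source):
# compute_gradients returns None for variables with no path to the loss, and
# tf.clip_by_value fails on None, so such pairs are filtered out first.
capped = [(tf.clip_by_value(g, -5, 5), v) for g, v in gradvars if g is not None]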
def generator_caffenet_fc6(input_feat, reuse=False, trainable=False):
    with tf.variable_scope('generator', reuse=reuse) as vs:
        assert input_feat.get_shape().as_list()[-1] == 4096
        # input_feat = tf.placeholder(tf.float32, shape=(None, 4096), name='feat')
        relu_defc7 = leaky_relu(
            fc(input_feat, 4096, name='defc7', trainable=trainable))
        relu_defc6 = leaky_relu(
            fc(relu_defc7, 4096, name='defc6', trainable=trainable))
        relu_defc5 = leaky_relu(
            fc(relu_defc6, 4096, name='defc5', trainable=trainable))
        reshaped_defc5 = tf.reshape(relu_defc5, [-1, 256, 4, 4])
        relu_deconv5 = leaky_relu(
            upconv(tf.transpose(reshaped_defc5, perm=[0, 2, 3, 1]), 256, 4, 2,
                   'deconv5', biased=True, trainable=trainable))
        relu_conv5_1 = leaky_relu(
            upconv(relu_deconv5, 512, 3, 1, 'conv5_1', biased=True,
                   trainable=trainable))
        relu_deconv4 = leaky_relu(
            upconv(relu_conv5_1, 256, 4, 2, 'deconv4', biased=True,
                   trainable=trainable))
        relu_conv4_1 = leaky_relu(
            upconv(relu_deconv4, 256, 3, 1, 'conv4_1', biased=True,
                   trainable=trainable))
        relu_deconv3 = leaky_relu(
            upconv(relu_conv4_1, 128, 4, 2, 'deconv3', biased=True,
                   trainable=trainable))
        relu_conv3_1 = leaky_relu(
            upconv(relu_deconv3, 128, 3, 1, 'conv3_1', biased=True,
                   trainable=trainable))
        deconv2 = leaky_relu(
            upconv(relu_conv3_1, 64, 4, 2, 'deconv2', biased=True,
                   trainable=trainable))
        deconv1 = leaky_relu(
            upconv(deconv2, 32, 4, 2, 'deconv1', biased=True,
                   trainable=trainable))
        deconv0 = upconv(deconv1, 3, 4, 2, 'deconv0', biased=True,
                         trainable=trainable)
    variables = tf.contrib.framework.get_variables(vs)
    return deconv0, variables, [
        relu_defc7, relu_defc6, relu_defc5, reshaped_defc5, relu_deconv5,
        relu_conv5_1, relu_deconv4, relu_conv4_1, relu_deconv3, relu_conv3_1,
        deconv2, deconv1, deconv0
    ]
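# Minimal usage sketch (illustrative; assumes the helpers above are in scope):
feat = tf.placeholder(tf.float32, shape=(None, 4096), name='feat')
img, gen_vars, activations = generator_caffenet_fc6(feat)
# A second call with reuse=True shares the same generator weights:
img2, _, _ = generator_caffenet_fc6(feat, reuse=True)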
"c032": 15 } # In[16]: #fps_dict = { l.strip().split(' ')[0] : float( l.strip().split(' ')[1] ) for l in open( "video_fps.txt" ) } # In[17]: #X = tf.placeholder( "float", [None, crop_size*crop_size*3] ) Y = tf.placeholder("float", [None, C]) vgg16 = vgg.VGG() vgg16.build() fc1 = util.fc(vgg16.pool5, C, "fc1") pre = tf.nn.softmax_cross_entropy_with_logits(logits=fc1, labels=Y) loss = tf.reduce_mean(pre) # In[19]: train_op = tf.train.AdamOptimizer(learning_rate=LR, epsilon=1e-8).minimize(loss) # In[20]: conf = tf.ConfigProto(gpu_options=tf.GPUOptions(allow_growth=True), device_count={'GPU': 1}) # In[21]: