def test_convert_collection_to_dict_clear_collection(self):
    """Converting with `clear_collection=True` must empty the collection."""
    collection = 'end_points'
    first = constant_op.constant(1.0, name='t1')
    second = constant_op.constant(2.0, name='t2')
    # Register three aliases; the last two point at the same tensor.
    for alias, tensor in (('a1', first), ('a21', second), ('a22', second)):
        utils.collect_named_outputs(collection, alias, tensor)
    utils.convert_collection_to_dict(collection, clear_collection=True)
    self.assertEqual(ops.get_collection(collection), [])
def vgg_a(inputs,
          num_classes=1000,
          is_training=True,
          dropout_keep_prob=0.5,
          spatial_squeeze=True,
          scope='vgg_a'):
    """Build the 11-layer "A" configuration of the Oxford VGG network.

    The fully connected layers are expressed as conv2d layers (a 7x7 VALID
    convolution followed by 1x1 convolutions). To use the network in
    classification mode, resize the input to 224x224.

    Args:
      inputs: a tensor of size [batch_size, height, width, channels].
      num_classes: number of predicted classes (depth of the fc8 layer).
      is_training: whether the model is being trained; forwarded to dropout.
      dropout_keep_prob: probability that activations are kept in the
        dropout layers during training.
      spatial_squeeze: if true, squeeze axes 1 and 2 of the fc8 output.
      scope: optional scope for the variables.

    Returns:
      A `(net, end_points)` tuple: the fc8 output (squeezed when
      `spatial_squeeze` is set) and the dict of collected layer outputs.
    """
    # (repeat count, depth) of the 3x3 conv stacks conv1..conv5; every stack
    # is followed by a 2x2 max-pool.
    conv_stages = ((1, 64), (1, 128), (2, 256), (2, 512), (2, 512))

    with variable_scope.variable_scope(scope, 'vgg_a', [inputs]) as var_scope:
        collection_name = var_scope.original_name_scope + '_end_points'
        # Outputs of conv2d and max_pool2d are gathered in the collection.
        with arg_scope([layers.conv2d, layers_lib.max_pool2d],
                       outputs_collections=collection_name):
            net = inputs
            for stage, (repeats, depth) in enumerate(conv_stages, start=1):
                net = layers_lib.repeat(
                    net, repeats, layers.conv2d, depth, [3, 3],
                    scope='conv%d' % stage)
                net = layers_lib.max_pool2d(
                    net, [2, 2], scope='pool%d' % stage)

            # Convolutional stand-ins for the fully connected layers.
            net = layers.conv2d(
                net, 4096, [7, 7], padding='VALID', scope='fc6')
            net = layers_lib.dropout(
                net, dropout_keep_prob, is_training=is_training,
                scope='dropout6')
            net = layers.conv2d(net, 4096, [1, 1], scope='fc7')
            net = layers_lib.dropout(
                net, dropout_keep_prob, is_training=is_training,
                scope='dropout7')
            net = layers.conv2d(
                net,
                num_classes, [1, 1],
                activation_fn=None,
                normalizer_fn=None,
                scope='fc8')

            # Turn the collected outputs into an alias -> tensor dict.
            end_points = utils.convert_collection_to_dict(collection_name)
            if spatial_squeeze:
                net = array_ops.squeeze(net, [1, 2], name='fc8/squeezed')
                # Overwrite the fc8 entry with the squeezed tensor.
                end_points[var_scope.name + '/fc8'] = net
            return net, end_points
def _resnet_plain(self, inputs, blocks, output_stride=None, scope=None):
    """Stack the given ResNet blocks with no layers before or after them.

    Args:
      inputs: input tensor handed to `stack_blocks_dense`.
      blocks: block descriptions handed to `stack_blocks_dense`.
      output_stride: optional output stride handed to `stack_blocks_dense`.
      scope: variable scope to build the blocks in.

    Returns:
      A `(net, end_points)` tuple: the stacked-block output and the dict
      built from the 'end_points' collection.
    """
    collection = 'end_points'
    with variable_scope.variable_scope(scope, values=[inputs]):
        # conv2d outputs are gathered in the shared 'end_points' collection.
        with arg_scope([layers.conv2d], outputs_collections=collection):
            net = resnet_utils.stack_blocks_dense(
                inputs, blocks, output_stride)
            return net, utils.convert_collection_to_dict(collection)
def test_convert_collection_to_dict(self):
    """Each collected alias must map back to the tensor it was stored with."""
    collection = 'end_points'
    first = constant_op.constant(1.0, name='t1')
    second = constant_op.constant(2.0, name='t2')
    # Two aliases deliberately share the same tensor.
    expected = (('a1', first), ('a21', second), ('a22', second))
    for alias, tensor in expected:
        utils.collect_named_outputs(collection, alias, tensor)
    end_points = utils.convert_collection_to_dict(collection)
    for alias, tensor in expected:
        self.assertEqual(end_points[alias], tensor)
def body(self, features):
    """Encode image and question, attend over the image, and run the MLP.

    Norms (over the last axis) of the intermediate tensors are gathered in
    the "norms" collection and summarized under the "norms/" tag.

    Args:
      features: dict providing at least the "inputs" and "question" entries.

    Returns:
      The MLP output with singleton axes inserted at positions 1 and 2.
    """
    hp = self.hparams

    def record_norm(alias, tensor):
        # Store the last-axis norm of `tensor` under `alias`.
        utils.collect_named_outputs("norms", alias, tf.norm(tensor, axis=-1))

    if hp.image_input_type == "image":
        # NOTE: eval() resolves the model function from an hparams string,
        # so hparams must come from a trusted source.
        # pylint: disable=eval-used
        image_feat = vqa_layers.image_embedding(
            features["inputs"],
            model_fn=eval(hp.image_model_fn),
            trainable=hp.train_resnet,
            is_training=hp.mode == tf.estimator.ModeKeys.TRAIN)
    else:
        image_feat = features["inputs"]

    image_feat = common_layers.flatten4d3d(image_feat)

    # Image branch: dropout, encoder, then l2 normalization.
    image_feat = tf.nn.dropout(image_feat, keep_prob=1.-hp.dropout)
    image_feat = image_encoder(image_feat, hp)
    record_norm("image_feat_encoded", image_feat)
    image_feat = common_layers.l2_norm(image_feat)
    record_norm("image_feat_encoded_l2", image_feat)

    # Question branch.
    query = question_encoder(features["question"], hp)
    record_norm("query", query)

    # Attend over the image with the question, then fuse both.
    image_ave = attn(image_feat, query, hp)
    record_norm("image_ave", image_ave)

    image_question = tf.concat([image_ave, query], axis=1)
    record_norm("image_question", image_question)

    image_question = tf.nn.dropout(image_question, 1. - hp.dropout)

    output = mlp(image_question, hp)
    record_norm("output", output)

    vqa_layers.summarize_tensors(
        utils.convert_collection_to_dict("norms"), tag="norms/")

    # Insert singleton axes at positions 1 and 2.
    expanded = tf.expand_dims(output, axis=1)
    return tf.expand_dims(expanded, axis=2)
def body(self, features):
    """Encode image and question jointly, then decode a learned query.

    Norms (over the last axis) of the intermediate tensors are gathered in
    the "norms" collection and summarized under the "norms/" tag.

    Args:
      features: dict providing at least the "inputs" and "question" entries.

    Returns:
      The decoder output with a singleton axis inserted at position 1.
    """
    hp = self.hparams

    def record_norm(alias, tensor):
        # Store the last-axis norm of `tensor` under `alias`.
        utils.collect_named_outputs("norms", alias, tf.norm(tensor, axis=-1))

    if hp.image_input_type == "image":
        # NOTE: eval() resolves the model function from an hparams string,
        # so hparams must come from a trusted source.
        # pylint: disable=eval-used
        image_feat = vqa_layers.image_embedding(
            features["inputs"],
            model_fn=eval(hp.image_model_fn),
            trainable=hp.train_resnet,
            is_training=hp.mode == tf.estimator.ModeKeys.TRAIN)
    else:
        image_feat = features["inputs"]

    # Flatten the image features and project them to hidden_size.
    image_feat = common_layers.flatten4d3d(image_feat)
    image_feat = common_layers.dense(image_feat, hp.hidden_size)
    record_norm("image_feat_after_proj", image_feat)

    question = common_layers.flatten4d3d(features["question"])
    record_norm("question_embedding", question)

    prepared = prepare_image_question_encoder(image_feat, question, hp)
    (encoder_input, encoder_self_attention_bias,
     encoder_decoder_attention_bias) = prepared

    encoder_input = tf.nn.dropout(
        encoder_input, keep_prob=1.-hp.layer_prepostprocess_dropout)

    encoder_output, _ = recurrent_transformer_decoder(
        encoder_input, None, encoder_self_attention_bias, None,
        hp, name="encoder")
    record_norm("encoder_output", encoder_output)

    # Learned query vector, scaled by sqrt(hidden_size) and tiled so that
    # there is one copy per batch element.
    query = tf.get_variable("query", [hp.hidden_size]) * hp.hidden_size ** 0.5
    query = tf.expand_dims(tf.expand_dims(query, axis=0), axis=0)
    batch_size = common_layers.shape_list(encoder_input)[0]
    query = tf.tile(query, [batch_size, 1, 1])
    query = tf.nn.dropout(
        query, keep_prob=1.-hp.layer_prepostprocess_dropout)

    decoder_output, _ = recurrent_transformer_decoder(
        query, encoder_output, None, encoder_decoder_attention_bias,
        hp, name="decoder")
    record_norm("decoder_output", decoder_output)

    vqa_layers.summarize_tensors(
        utils.convert_collection_to_dict("norms"), tag="norms/")

    # Only one axis is added here (at position 1).
    return tf.expand_dims(decoder_output, axis=1)
def vgg_16_tcomb(inputs,
                 num_classes=1000,
                 is_training=True,
                 dropout_keep_prob=0.5,
                 spatial_squeeze=True,
                 scope='vgg_16_tcomb'):
    """Build a VGG-16 style network from the `*_conv_comb` helpers.

    Args:
      inputs: input tensor handed to the first conv stage.
      num_classes: depth of the final fc8 layer.
      is_training: forwarded to the dropout layers.
      dropout_keep_prob: keep probability of the dropout layers.
      spatial_squeeze: if true, squeeze axes 1 and 2 of the fc8 output.
      scope: optional scope for the variables.

    Returns:
      A `(net, end_points)` tuple: the fc8 output (squeezed when
      `spatial_squeeze` is set) and the dict of collected layer outputs.
    """
    # Arguments of repeat_conv_comb for conv2..conv5.
    # Presumably (repeats, input depth, output depth) -- confirm against
    # the helper's definition.
    later_stages = ((2, 64, 128), (3, 128, 256), (3, 256, 512), (3, 512, 512))

    with variable_scope.variable_scope(
            scope, 'vgg_16_tcomb', [inputs]) as var_scope:
        collection_name = var_scope.original_name_scope + '_end_points'
        collected_ops = [
            layers.conv2d, layers_lib.fully_connected, layers_lib.max_pool2d
        ]
        with arg_scope(collected_ops, outputs_collections=collection_name):
            net = init_conv_comb(inputs, 2, 64, [3, 3], 'conv1')
            net = layers_lib.max_pool2d(net, [2, 2], scope='pool1')
            for stage, stage_args in enumerate(later_stages, start=2):
                repeats, depth_in, depth_out = stage_args
                net = repeat_conv_comb(
                    net, repeats, depth_in, depth_out, [3, 3],
                    'conv%d' % stage)
                net = layers_lib.max_pool2d(
                    net, [2, 2], scope='pool%d' % stage)

            # Convolutional stand-ins for the fully connected layers.
            net = layers.conv2d(
                net, 4096, [7, 7], padding='VALID', scope='fc6')
            net = layers_lib.dropout(
                net, dropout_keep_prob, is_training=is_training,
                scope='dropout6')
            net = layers.conv2d(net, 4096, [1, 1], scope='fc7')
            net = layers_lib.dropout(
                net, dropout_keep_prob, is_training=is_training,
                scope='dropout7')
            net = layers.conv2d(
                net,
                num_classes, [1, 1],
                activation_fn=None,
                normalizer_fn=None,
                scope='fc8')

            # Turn the collected outputs into an alias -> tensor dict.
            end_points = utils.convert_collection_to_dict(collection_name)
            if spatial_squeeze:
                net = array_ops.squeeze(net, [1, 2], name='fc8/squeezed')
                # Overwrite the fc8 entry with the squeezed tensor.
                end_points[var_scope.name + '/fc8'] = net
            return net, end_points
def squeezenet(images, num_classes=1000, is_training=False,
               scope='squeezenet'):
    """Original squeezenet architecture for 227x227 images.

    Args:
      images: input image tensor.
      num_classes: depth of the final `conv10` layer.
      is_training: forwarded to the dropout layer.
      scope: variable scope name. Previously this argument was ignored and
        the scope was always 'squeezenet'; the default keeps that name.

    Returns:
      A `(logits, end_points)` tuple: the `conv10` output after average
      pooling with axes 1 and 2 squeezed, and the dict of collected layer
      outputs (including the logits under `<scope name>/logits`).
    """
    # Honour the caller-supplied scope; 'squeezenet' remains the fallback
    # name when `scope` is None.
    with tf.variable_scope(scope, 'squeezenet', values=[images]) as sc:
        end_point_collection = sc.original_name_scope + '_end_points'
        with slim.arg_scope(
                [fire_module, myconv2d, slim.max_pool2d, slim.avg_pool2d],
                outputs_collections=[end_point_collection]):
            net = myconv2d(images, 64, [3, 3], stride=2, padding='VALID',
                           scope='conv1')
            net = slim.max_pool2d(net, [3, 3], stride=2, scope='maxpool1')
            net = fire_module(net, 16, 64, scope='fire2')
            net = fire_module(net, 16, 64, scope='fire3')
            net = slim.max_pool2d(net, [3, 3], stride=2, scope='maxpool3')
            net = fire_module(net, 32, 128, scope='fire4')
            net = fire_module(net, 32, 128, scope='fire5')
            net = slim.max_pool2d(net, [3, 3], stride=2, scope='maxpool5')
            net = fire_module(net, 48, 192, scope='fire6')
            net = fire_module(net, 48, 192, scope='fire7')
            net = fire_module(net, 64, 256, scope='fire8')
            net = fire_module(net, 64, 256, scope='fire9')
            net = slim.dropout(net, keep_prob=0.5, is_training=is_training,
                               scope='drop9')
            net = myconv2d(net, num_classes, [1, 1], stride=1,
                           padding='VALID', scope='conv10')
            net = slim.avg_pool2d(net, [13, 13], stride=1, padding='VALID',
                                  scope='avgpool10')
            logits = tf.squeeze(net, [1, 2], name='logits')
            # Register the logits so they appear in the end-points dict.
            logits = utils.collect_named_outputs(
                end_point_collection, sc.name + '/logits', logits)
        end_points = utils.convert_collection_to_dict(end_point_collection)
        return logits, end_points
def body(self, features):
    """Attend over image features with the encoded question and run the MLP.

    Norms (over the last axis) of the intermediate tensors are gathered in
    the "norms" collection and summarized under the "norms/" tag.

    Args:
      features: dict providing at least the "inputs" and "question" entries.

    Returns:
      The MLP output with singleton axes inserted at positions 1 and 2.
    """
    hp = self.hparams

    def record_norm(alias, tensor):
        # Store the last-axis norm of `tensor` under `alias`.
        utils.collect_named_outputs("norms", alias, tf.norm(tensor, axis=-1))

    if hp.image_input_type == "image":
        # NOTE: eval() resolves the model function from an hparams string,
        # so hparams must come from a trusted source.
        # pylint: disable=eval-used
        image_feat = vqa_layers.image_embedding(
            features["inputs"],
            model_fn=eval(hp.image_model_fn),
            trainable=hp.train_resnet,
            is_training=hp.mode == tf.estimator.ModeKeys.TRAIN)
    else:
        image_feat = features["inputs"]

    # Optional projection of the image features.
    if hp.image_feat_size:
        image_feat = common_layers.dense(image_feat, hp.image_feat_size)

    # l2-normalize the image features, then apply dropout.
    record_norm("image_feat_before_l2", image_feat)
    image_feat = common_layers.l2_norm(image_feat)
    record_norm("image_feat_after_l2", image_feat)
    image_feat = tf.nn.dropout(image_feat, keep_prob=1.-hp.dropout)

    query = question_encoder(features["question"], hp)
    record_norm("query", query)

    image_ave = attn(image_feat, query, hp)
    record_norm("image_ave", image_ave)

    image_question = tf.concat([image_ave, query], axis=1)
    record_norm("image_question", image_question)

    image_question = tf.nn.dropout(image_question, 1. - hp.dropout)

    output = mlp(image_question, hp)
    record_norm("output", output)

    vqa_layers.summarize_tensors(
        utils.convert_collection_to_dict("norms"), tag="norms/")

    # Insert singleton axes at positions 1 and 2.
    expanded = tf.expand_dims(output, axis=1)
    return tf.expand_dims(expanded, axis=2)
def build_net(self, inputs, is_training):
    """Build the convolutional feature extractor and store it on `self`.

    The CRNN paper lists feature maps [64, 128, 256, 256, 512, 512, 512];
    this implementation instead uses 1x1 convolutions followed by
    `dwise_conv` for the first four stages, with widths 32, 64, 128, 128,
    and three 256-wide convolutions afterwards.

    Args:
      inputs: input tensor fed to the first convolution.
      is_training: forwarded to batch norm via `normalizer_params`.

    Side effects:
      Sets `self.end_points` (dict of collected outputs) and `self.net`
      (output of the last convolution).
    """
    norm_params = {
        'is_training': is_training,
        'decay': 0.9,
        'epsilon': 1e-05
    }
    # (conv depth, pool scope or None, pool stride) for stages 1..4.
    front_stages = (
        (32, 'pool1', 2),
        (64, 'pool2', 2),
        (128, None, None),
        (128, 'pool3', [2, 1]),
    )

    with tf.variable_scope(self._scope, self._scope, [inputs]) as var_scope:
        collection_name = var_scope.name + '_end_points'
        with slim.arg_scope(
                [slim.conv2d, slim.max_pool2d, slim.batch_norm],
                outputs_collections=collection_name):
            net = inputs
            for idx, stage in enumerate(front_stages, start=1):
                depth, pool_scope, pool_stride = stage
                net = slim.conv2d(net, depth, 1, 1, scope='conv%d' % idx)
                net = dwise_conv(net, k_h=3, k_w=3, padding='SAME',
                                 name='dwise%d' % idx)
                if pool_scope is not None:
                    net = slim.max_pool2d(
                        net, 2, pool_stride, scope=pool_scope)

            # Two batch-normalized 3x3 convolutions.
            for conv_scope in ('conv5', 'conv6'):
                net = slim.conv2d(
                    net, 256, 3,
                    normalizer_fn=slim.batch_norm,
                    normalizer_params=norm_params,
                    scope=conv_scope)
            net = slim.max_pool2d(net, 2, [2, 1], scope='pool4')
            net = slim.conv2d(net, 256, 2, padding='VALID', scope='conv7')

        self.end_points = utils.convert_collection_to_dict(collection_name)
        self.net = net
def resnet_v2(inputs,
              blocks,
              num_classes=None,
              global_pool=True,
              include_root_block=True,
              reuse=True,
              scope=None):
    """Assemble a ResNet v2 network from a list of blocks.

    Args:
      inputs: input tensor.
      blocks: block descriptions handed to `stack_blocks_dense`.
      num_classes: if not None, a 1x1 'logits' convolution with this many
        outputs is appended and a softmax is stored under 'predictions'.
      global_pool: if true, average over axes 1 and 2 (keeping them as
        singleton axes) after the final batch norm.
      include_root_block: if true, prepend a 7x7 stride-2 convolution and a
        3x3 stride-2 max-pool.
      reuse: forwarded to `tf.variable_scope`.
        NOTE(review): the default here is True, unlike the usual None --
        confirm this is intended for first-time graph construction.
      scope: optional variable scope; 'resnet_v2' is the default name.

    Returns:
      A `(net, end_points)` tuple.
    """
    with tf.variable_scope(
            scope, 'resnet_v2', [inputs], reuse=reuse) as var_scope:
        # Name of the collection that gathers the layer outputs.
        collection_name = var_scope.original_name_scope + '_end_points'
        collected_ops = [slim.conv2d, bottleneck, stack_blocks_dense]
        with slim.arg_scope(collected_ops,
                            outputs_collections=collection_name):
            net = inputs
            if include_root_block:
                # The root convolution has no activation and no normalizer.
                with slim.arg_scope([slim.conv2d],
                                    activation_fn=None,
                                    normalizer_fn=None):
                    net = conv2d_same(net, 64, 7, stride=2, scope='conv1')
                net = slim.max_pool2d(net, [3, 3], stride=2, scope='pool1')

            net = stack_blocks_dense(net, blocks)
            net = slim.batch_norm(
                net, activation_fn=tf.nn.relu, scope='postnorm')

            if global_pool:
                # Global average pooling expressed as a mean over axes 1, 2.
                net = tf.reduce_mean(
                    net, [1, 2], name='pool5', keep_dims=True)

            has_classifier = num_classes is not None
            if has_classifier:
                # A 1x1 convolution takes the place of a dense layer.
                net = slim.conv2d(
                    net, num_classes, [1, 1],
                    activation_fn=None,
                    normalizer_fn=None,
                    scope='logits')

            end_points = utils.convert_collection_to_dict(collection_name)
            if has_classifier:
                end_points['predictions'] = slim.softmax(
                    net, scope='predictions')
            return net, end_points
def pixelwise_predictor(feat,
                        nc=3,
                        n_layers=1,
                        n_layerwise_steps=0,
                        skip_feat=None,
                        reuse=False,
                        is_training=True):
    """Predict one `nc`-channel map per layer from shared features.

    Args:
      feat: B X H X W X C feature vectors.
      nc: number of output channels of each prediction.
      n_layers: number of per-layer predictions to produce (denoted L).
      n_layerwise_steps: passed to `decoder_simple` as `nconv`.
      skip_feat: skip-connection features passed to `decoder_simple`.
      reuse: whether to reuse weights from an already defined net.
      is_training: forwarded to `decoder_simple`.

    Returns:
      A `(preds, end_points)` tuple: the per-layer predictions stacked along
      a new leading axis, and the dict of collected conv2d outputs.
    """
    with tf.variable_scope('pixelwise_pred', reuse=reuse) as var_scope:
        collection_name = var_scope.original_name_scope + '_end_points'
        with slim.arg_scope(
                [slim.conv2d],
                normalizer_fn=None,
                weights_regularizer=slim.l2_regularizer(0.05),
                activation_fn=tf.nn.sigmoid,
                outputs_collections=collection_name):
            layer_preds = []
            for layer_idx in range(n_layers):
                suffix = str(layer_idx)
                # Each layer gets its own decoder under 'upsample_<idx>'.
                with tf.variable_scope('upsample_' + suffix, reuse=reuse):
                    layer_feat, _ = decoder_simple(
                        feat,
                        nconv=n_layerwise_steps,
                        skip_feat=skip_feat,
                        reuse=reuse,
                        is_training=is_training)
                layer_preds.append(
                    slim.conv2d(layer_feat, nc, [3, 3], stride=1,
                                scope='pred_' + suffix))

            end_points = utils.convert_collection_to_dict(collection_name)
            preds = tf.stack(layer_preds, axis=0)
            return preds, end_points
def resnet_v2(inputs,
              blocks,
              num_classes=None,
              global_pool=True,
              include_root_block=True,
              reuse=None,
              scope=None):
    """Assemble a ResNet v2 network from a list of blocks.

    Args:
      inputs: input tensor.
      blocks: block descriptions handed to `stack_blocks_dense`.
      num_classes: if not None, a 1x1 'logits' convolution with this many
        outputs is appended and a softmax is stored under 'predictions'.
      global_pool: if true, average over axes 1 and 2 (keeping them as
        singleton axes) after the final batch norm.
      include_root_block: if true, prepend a 7x7 stride-2 convolution with
        64 outputs followed by a 3x3 stride-2 max-pool.
      reuse: forwarded to `tf.variable_scope`.
      scope: optional variable scope; 'resnet_v2' is the default name.

    Returns:
      A `(net, end_points)` tuple.
    """
    wants_logits = num_classes is not None
    with tf.variable_scope(
            scope, 'resnet_v2', [inputs], reuse=reuse) as var_scope:
        collection_name = var_scope.original_name_scope + '_end_points'
        with slim.arg_scope(
                [slim.conv2d, bottleneck, stack_blocks_dense],
                outputs_collections=collection_name):
            net = inputs
            if include_root_block:
                # Root block: two stride-2 layers, so the spatial size is
                # reduced to a quarter of the input's.
                with slim.arg_scope([slim.conv2d],
                                    activation_fn=None,
                                    normalizer_fn=None):
                    net = conv2d_same(net, 64, 7, stride=2, scope='conv1')
                net = slim.max_pool2d(net, [3, 3], stride=2, scope='pool1')

            # Build the groups of residual units.
            net = stack_blocks_dense(net, blocks)
            net = slim.batch_norm(
                net, activation_fn=tf.nn.relu, scope='postnorm')

            if global_pool:
                # Global average pooling via a mean over axes 1 and 2.
                net = tf.reduce_mean(
                    net, [1, 2], name='pool5', keep_dims=True)

            if wants_logits:
                # 1x1 convolution with num_classes outputs, without
                # activation or normalizer.
                net = slim.conv2d(
                    net,
                    num_classes, [1, 1],
                    activation_fn=None,
                    normalizer_fn=None,
                    scope='logits')

            # Turn the collection into a plain Python dict.
            end_points = utils.convert_collection_to_dict(collection_name)
            if wants_logits:
                end_points['predictions'] = slim.softmax(
                    net, scope='predictions')
            return net, end_points
def get_outputs(self, blobs, output_layers, sess, collect_metadata=True):
    """Run the session and fetch the tensors matching `output_layers`.

    Every graph collection whose key contains `self._resnet_scope` is
    converted to an alias -> tensor dict; a tensor is fetched for an output
    layer when that layer's `net_layer(...)` string occurs in the alias
    (after `remove_net_suffix`). Later matches overwrite earlier ones.

    Args:
      blobs: dict with 'data' and 'im_info' entries used as feeds.
      output_layers: iterable of objects exposing `net_layer(scope)`; they
        are used as keys of the fetch dict.
      sess: session used to run the graph.
      collect_metadata: if true, run with full tracing and return the
        resulting run metadata.

    Returns:
      A `(outputs, run_metadata)` tuple; `run_metadata` is None when
      `collect_metadata` is false.
    """
    feed_dict = {
        self._image: blobs['data'],
        self._im_info: blobs['im_info'],
        # Ground-truth boxes are fed as an all-zero array.
        self._gt_boxes: np.zeros((10, 5))
    }

    fetches = {}
    for collection_name in ops.get_all_collection_keys():
        if self._resnet_scope not in collection_name:
            continue
        aliased = utils.convert_collection_to_dict(collection_name)
        for raw_alias, tensor in aliased.items():
            alias = remove_net_suffix(raw_alias, self._resnet_scope)
            for output_layer in output_layers:
                if output_layer.net_layer(self._resnet_scope) in alias:
                    fetches[output_layer] = tensor

    if not collect_metadata:
        return sess.run(fetches, feed_dict=feed_dict), None

    # Run the graph with the full trace option.
    run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
    run_metadata = tf.RunMetadata()
    outputs = sess.run(
        fetches,
        feed_dict=feed_dict,
        options=run_options,
        run_metadata=run_metadata)
    return outputs, run_metadata
def resnet_v2(inputs,
              blocks,
              num_classes=None,
              global_pool=True,
              include_root_block=True,
              reuse=None,
              scope=None):
    """Assemble a ResNet v2 network from a list of blocks.

    Args:
      inputs: input tensor.
      blocks: block descriptions handed to `stack_block_dense`.
      num_classes: if not None, a 1x1 'logits' convolution with this many
        outputs is appended and a softmax is stored under 'prediction'.
      global_pool: if true, average over axes 1 and 2 (keeping them as
        singleton axes) after the final batch norm.
      include_root_block: if true, prepend a 7x7 stride-2 convolution and a
        3x3 stride-2 max-pool.
      reuse: forwarded to `tf.variable_scope`.
      scope: optional variable scope; 'resnet_v2' is the default name.

    Returns:
      A `(net, end_points)` tuple.
    """
    # NOTE(review): the helper is referenced as `stack_block_dense`
    # (singular) -- confirm that this matches its definition.
    with tf.variable_scope(
            scope, 'resnet_v2', [inputs], reuse=reuse) as var_scope:
        collection_name = var_scope.original_name_scope + '_end_points'
        # Route the outputs of conv2d, bottleneck and the block stacker into
        # the end-points collection.
        collected_ops = [slim.conv2d, bottleneck, stack_block_dense]
        with slim.arg_scope(collected_ops,
                            outputs_collections=collection_name):
            net = inputs
            if include_root_block:
                # First layers of the network; the convolution has neither
                # activation nor normalizer.
                with slim.arg_scope([slim.conv2d],
                                    activation_fn=None,
                                    normalizer_fn=None):
                    net = conv2d_same(net, 64, 7, stride=2, scope='conv1')
                net = slim.max_pool2d(net, [3, 3], stride=2, scope='pool1')

            # Build the residual units.
            net = stack_block_dense(net, blocks)
            net = slim.batch_norm(
                net, activation_fn=tf.nn.relu, scope='postnorm')

            if global_pool:
                # Average pooling over axes 1 and 2.
                net = tf.reduce_mean(
                    net, [1, 2], name='pool5', keep_dims=True)

            classify = num_classes is not None
            if classify:
                # 1x1 convolution with num_classes output channels.
                net = slim.conv2d(
                    net, num_classes, [1, 1],
                    activation_fn=None,
                    normalizer_fn=None,
                    scope='logits')

            # Turn the collection into a dict.
            end_points = utils.convert_collection_to_dict(collection_name)
            if classify:
                # Softmax output layer.
                end_points['prediction'] = slim.softmax(
                    net, scope='prediction')
            return net, end_points
def encoder_simple(inp_img, nz=1000, is_training=True, reuse=False):
    """Create a simple encoder CNN.

    Seven pairs of convolutions (a stride-2 layer followed by a stride-1
    layer of the same width) are followed by a flatten and three fully
    connected layers of sizes 2*nz, nz and nz.

    Args:
      inp_img: TensorFlow node for input with size B X H X W X C.
      nz: number of units in the last layer.
      is_training: whether batch_norm should be in train mode.
      reuse: whether to reuse weights from an already defined net.

    Returns:
      A `(enc, end_points)` tuple: the final representation and the dict of
      collected layer outputs.
    """
    # (depth, square kernel size) for the pairs cnv1/cnv1b .. cnv7/cnv7b.
    conv_plan = (
        (32, 7), (64, 5), (128, 3), (256, 3), (512, 3), (512, 3), (512, 3),
    )
    with tf.variable_scope('encoder', reuse=reuse) as var_scope:
        collection_name = var_scope.original_name_scope + '_end_points'
        with slim.arg_scope(
                [slim.conv2d, slim.fully_connected],
                normalizer_fn=slim.batch_norm,
                normalizer_params={'is_training': is_training},
                weights_regularizer=slim.l2_regularizer(0.05),
                activation_fn=tf.nn.relu,
                outputs_collections=collection_name):
            net = inp_img
            for idx, (depth, ksize) in enumerate(conv_plan, start=1):
                net = slim.conv2d(net, depth, [ksize, ksize], stride=2,
                                  scope='cnv%d' % idx)
                net = slim.conv2d(net, depth, [ksize, ksize], stride=1,
                                  scope='cnv%db' % idx)

            flat = slim.flatten(net, scope='cnv7b_flat')
            enc = slim.stack(
                flat, slim.fully_connected, [2 * nz, nz, nz], scope='fc')

            end_points = utils.convert_collection_to_dict(collection_name)
            return enc, end_points
def vgg16(inputs,
          num_classes=1000,
          is_training=True,
          dropout_keep_prob=0.5,
          spatial_squeeze=True,
          reuse=tf.AUTO_REUSE,
          scope='vgg_16'):
    """Build the first four conv/pool stages of a VGG-16 style network.

    Only conv1..conv4 and pool1..pool4 are built; `num_classes`,
    `is_training`, `dropout_keep_prob` and `spatial_squeeze` are accepted
    but not referenced in this function.

    Args:
      inputs: input tensor.
      reuse: forwarded to the variable scope.
      scope: optional variable scope; 'vgg_16' is the default name.

    Returns:
      A `(net, end_points)` tuple: the pool4 output and the dict of
      collected layer outputs.
    """
    # (repeat count, depth) of the 3x3 conv stacks conv1..conv4.
    conv_stages = ((2, 64), (2, 128), (3, 256), (3, 512))

    with variable_scope.variable_scope(
            scope, 'vgg_16', [inputs], reuse=reuse) as var_scope:
        collection_name = var_scope.original_name_scope + '_end_points'
        collected_ops = [
            layers.conv2d, layers_lib.fully_connected, layers_lib.max_pool2d
        ]
        # Outputs of the listed ops are gathered in the collection.
        with arg_scope(collected_ops, outputs_collections=collection_name):
            net = inputs
            for stage, (repeats, depth) in enumerate(conv_stages, start=1):
                net = layers_lib.repeat(
                    net, repeats, layers.conv2d, depth, [3, 3],
                    scope='conv%d' % stage)
                net = layers_lib.max_pool2d(
                    net, [2, 2], scope='pool%d' % stage)

            # Turn the collected outputs into an alias -> tensor dict.
            return net, utils.convert_collection_to_dict(collection_name)
def build_net(self, inputs):
    """Build a squeezenet-style feature extractor and store it on `self`.

    Args:
      inputs: input tensor fed to the first convolution.

    Side effects:
      Sets `self.end_points` (dict of collected outputs) and `self.net`
      (output of `fire9`).
    """
    # (index, second arg, third arg) of each fire module; the two numbers
    # are presumably the squeeze and expand depths -- confirm against
    # `fire_module`.
    fire_plan = (
        (2, 16, 64), (3, 16, 64), (4, 32, 128), (5, 32, 128),
        (6, 48, 192), (7, 48, 192), (8, 64, 256), (9, 64, 256),
    )
    # Max-pool stride applied right after the fire module with that index.
    pool_after = {4: 2, 8: [2, 1]}

    with tf.variable_scope(self._scope, self._scope, [inputs]) as var_scope:
        collection_name = var_scope.name + '_end_points'
        with slim.arg_scope(
                [slim.conv2d, slim.max_pool2d, slim.batch_norm],
                outputs_collections=collection_name):
            net = slim.conv2d(inputs, 96, [3, 3], scope='conv1')
            net = slim.max_pool2d(net, [2, 2], stride=2, scope='maxpool1')
            for idx, depth_a, depth_b in fire_plan:
                net = self.fire_module(
                    net, depth_a, depth_b, scope='fire%d' % idx)
                if idx in pool_after:
                    net = slim.max_pool2d(
                        net, [2, 2], stride=pool_after[idx],
                        scope='maxpool%d' % idx)

        self.end_points = utils.convert_collection_to_dict(collection_name)
        self.net = net
def inference(inputs, batch_size, num_classes, training=True):
    """Build a small CNN: two conv/conv/max-pool stages and two FC layers.

    Args:
      inputs: input tensor fed to the first convolution.
      batch_size: not referenced anywhere in this function.
      num_classes: number of units of the final fully connected layer.
      training: passed as `trainable` to the conv and FC layers and as
        `is_training` to dropout.

    Returns:
      A tuple `(fc_2, end_points_collection)`: the output of the last fully
      connected layer and the *name* of the end-points collection (a
      string), not the end-points dict.
    """
    with tf.variable_scope('inference') as sc:
        end_points_collection = sc.original_name_scope + '_end_points'
        # These defaults apply to slim.conv2d only; the pooling, flatten,
        # dropout and fully connected layers below are not listed here.
        with slim.arg_scope([slim.conv2d],
                            normalizer_fn=None,
                            weights_regularizer=slim.l2_regularizer(0.05),
                            activation_fn=tf.nn.relu,
                            trainable=training,
                            outputs_collections=end_points_collection):
            cnv1 = slim.conv2d(inputs, 16, [3, 3], stride=1, scope='cnv1')
            cnv2 = slim.conv2d(cnv1, 32, [1, 1], stride=1, scope='cnv2')
            max_pool1 = slim.max_pool2d(cnv2, [3, 3], stride=2, scope='maxpool1')
            cnv3 = slim.conv2d(max_pool1, 32, [3, 3], stride=1, scope='cnv3')
            cnv4 = slim.conv2d(cnv3, 64, [1, 1], stride=1, scope='cnv4')
            max_pool2 = slim.max_pool2d(cnv4, [3, 3], stride=2, scope='maxpool2')
            flat = slim.flatten(max_pool2, scope='flatten')
            fc_1 = slim.fully_connected(flat, 128, scope='fc_1', trainable=training)
            drop1 = slim.dropout(fc_1, scope='drop1', is_training=training)
            fc_2 = slim.fully_connected(drop1, num_classes, scope='fc_2', trainable=training)
            # NOTE(review): this dict is built but never used; the function
            # returns the collection name instead. Confirm whether callers
            # expect the dict before changing the return value.
            end_points = utils.convert_collection_to_dict(end_points_collection)
            return fc_2, end_points_collection
def resnet_v1(inputs,
              blocks,
              num_classes=None,
              is_training=True,
              global_pool=True,
              output_stride=None,
              include_root_block=True,
              reuse=None,
              scope=None):
    """Assemble a ResNet v1 network from a list of blocks.

    Args:
      inputs: input tensor.
      blocks: block descriptions handed to `stack_blocks_dense`.
      num_classes: if not None, a 1x1 'logits' convolution with this many
        outputs is appended and a softmax is stored under 'predictions'.
      is_training: forwarded to batch norm.
      global_pool: if true, average over axes 1 and 2 (keeping them as
        singleton axes) after the blocks.
      output_stride: optional requested output stride. When the root block
        is included it must be a multiple of 4 and is divided by 4 before
        being handed to `stack_blocks_dense`.
      include_root_block: if true, prepend a 3x3 stride-1 convolution with
        16 outputs.
      reuse: forwarded to the variable scope.
      scope: optional variable scope; 'resnet_v1' is the default name.

    Returns:
      A `(net, end_points)` tuple.

    Raises:
      ValueError: if the root block is included and `output_stride` is not
        a multiple of 4.
    """
    with variable_scope.variable_scope(
            scope, 'resnet_v1', [inputs], reuse=reuse) as sc:
        end_points_collection = sc.original_name_scope + '_end_points'
        with arg_scope(
                [layers.conv2d, not_bottleneck,
                 resnet_utils.stack_blocks_dense],
                outputs_collections=end_points_collection):
            with arg_scope([layers.batch_norm], is_training=is_training):
                net = inputs
                if include_root_block:
                    if output_stride is not None:
                        if output_stride % 4 != 0:
                            raise ValueError(
                                'The output_stride needs to be a multiple '
                                'of 4.')
                        # Floor division keeps the stride an integer under
                        # Python 3; it is exact because divisibility by 4
                        # was checked above.
                        # NOTE(review): the root convolution below uses
                        # stride 1, so dividing by 4 may not match the
                        # actual root stride -- confirm the intent.
                        output_stride //= 4
                    net = resnet_utils.conv2d_same(
                        net, 16, 3, stride=1, scope='conv1')
                net = resnet_utils.stack_blocks_dense(
                    net, blocks, output_stride)
                if global_pool:
                    # Global average pooling.
                    net = math_ops.reduce_mean(
                        net, [1, 2], name='pool5', keepdims=True)
                if num_classes is not None:
                    net = layers.conv2d(
                        net,
                        num_classes, [1, 1],
                        activation_fn=None,
                        normalizer_fn=None,
                        scope='logits')
                # Convert end_points_collection into a dictionary of
                # end_points.
                end_points = utils.convert_collection_to_dict(
                    end_points_collection)
                if num_classes is not None:
                    end_points['predictions'] = layers_lib.softmax(
                        net, scope='predictions')
                return net, end_points
def squeezenet(images, is_training=True, batch_norm_decay=0.999,
               num_classes=1000):
    """Original squeezenet architecture for 224x224 images.

    Args:
      images: input image tensor.
      is_training: forwarded to `squeezenet_arg_scope` and to dropout.
      batch_norm_decay: forwarded to `squeezenet_arg_scope`.
      num_classes: depth of the final `conv10` layer.

    Returns:
      A `(logits, end_points)` tuple: the average-pooled `conv10` output
      with axes 1 and 2 squeezed, and the dict of collected layer outputs
      (including the logits under 'squeezenet/logits'-style alias).
    """
    # (index, second arg, third arg) of each fire module.
    fire_plan = (
        (2, 16, 64), (3, 16, 64), (4, 32, 128), (5, 32, 128),
        (6, 48, 192), (7, 48, 192), (8, 64, 256), (9, 64, 256),
    )
    # A 3x3 stride-2 max-pool follows the fire modules with these indices.
    pooled_after = (4, 8)

    with slim.arg_scope(squeezenet_arg_scope(is_training, batch_norm_decay)):
        with tf.variable_scope('squeezenet', values=[images]) as var_scope:
            collection_name = var_scope.original_name_scope + '_end_points'
            with slim.arg_scope(
                    [fire_module, slim.conv2d, slim.max_pool2d,
                     slim.avg_pool2d],
                    outputs_collections=[collection_name]):
                net = slim.conv2d(images, 96, [7, 7], stride=2, scope='conv1')
                net = slim.max_pool2d(net, [3, 3], stride=2, scope='maxpool1')
                for idx, depth_a, depth_b in fire_plan:
                    net = fire_module(
                        net, depth_a, depth_b, scope='fire%d' % idx)
                    if idx in pooled_after:
                        net = slim.max_pool2d(
                            net, [3, 3], stride=2, scope='maxpool%d' % idx)
                net = slim.dropout(
                    net, is_training=is_training, scope='drop9')
                net = slim.conv2d(
                    net, num_classes, [1, 1], stride=1, scope='conv10')
                net = slim.avg_pool2d(
                    net, [13, 13], stride=1, scope='avgpool10')
                logits = tf.squeeze(net, [1, 2], name='logits')
                # Register the logits so they appear in the end points.
                logits = utils.collect_named_outputs(
                    collection_name, var_scope.name + '/logits', logits)
            end_points = utils.convert_collection_to_dict(collection_name)
            return logits, end_points
def pose_net_fb(tgt_image, src_image_stack, is_training=True, reuse=False):
    """Predict forward and backward 6-value poses for each source image.

    Args:
      tgt_image: target image tensor; concatenated with the source stack
        along axis 3.
      src_image_stack: source images stacked along axis 3. The number of
        sources is taken as that axis' static size divided by 3 (presumably
        3-channel images -- confirm with callers).
      is_training: not referenced in this function.
      reuse: if true, variables of the 'pose_net' scope are reused.

    Returns:
      A `(pose_final, end_points)` tuple. `pose_final` has shape
      [-1, num_source, 12]; per the original comments the first half of the
      last axis is target->source and the second half source->target.
      `end_points` is the dict of collected layer outputs.
    """
    inputs = tf.concat([tgt_image, src_image_stack], axis=3)
    num_source = int(src_image_stack.get_shape()[3].value // 3)
    # (depth, square kernel size) of the stride-2 convolutions cnv1..cnv7.
    conv_plan = (
        (16, 7), (32, 5), (64, 3), (128, 3), (256, 3), (256, 3), (256, 3),
    )

    with tf.variable_scope('pose_net') as sc:
        if reuse:
            sc.reuse_variables()
        end_points_collection = sc.original_name_scope + '_end_points'
        with slim.arg_scope(
                [slim.conv2d, slim.conv2d_transpose],
                normalizer_fn=None,
                weights_regularizer=slim.l2_regularizer(0.05),
                activation_fn=tf.nn.relu,
                outputs_collections=end_points_collection):
            net = inputs
            for idx, (depth, ksize) in enumerate(conv_plan, start=1):
                net = slim.conv2d(net, depth, [ksize, ksize], stride=2,
                                  scope='cnv%d' % idx)

            # Twice the usual channel count: one pose per direction.
            pose_pred = slim.conv2d(
                net, 6 * num_source * 2, [1, 1], scope='pred', stride=1,
                normalizer_fn=None, activation_fn=None)
            pose_avg = tf.reduce_mean(pose_pred, [1, 2])
            # Scaling by a small constant was found empirically to help
            # training (per the original author's note).
            pose_final = 0.01 * tf.reshape(
                pose_avg, [-1, num_source, 6 * 2])

            end_points = utils.convert_collection_to_dict(
                end_points_collection)
            return pose_final, end_points
def cifar_squeezenet(images, is_training=True, batch_norm_decay=0.999,
                     num_classes=10):
    """Modified version of squeezenet for CIFAR images.

    Args:
      images: input image tensor.
      is_training: forwarded to `squeezenet_arg_scope`.
      batch_norm_decay: forwarded to `squeezenet_arg_scope`.
      num_classes: depth of the final `conv10` layer.

    Returns:
      A `(logits, end_points)` tuple: the `conv10` output with axes 1 and 2
      squeezed, and the dict of collected layer outputs (including the
      logits).
    """
    # (index, second arg, third arg) of each fire module.
    fire_plan = (
        (2, 16, 64), (3, 16, 64), (4, 32, 128), (5, 32, 128),
        (6, 48, 192), (7, 48, 192), (8, 64, 256), (9, 64, 256),
    )
    # A 2x2 max-pool follows the fire modules with these indices.
    pooled_after = (4, 8)

    with slim.arg_scope(squeezenet_arg_scope(is_training, batch_norm_decay)):
        with tf.variable_scope('squeezenet', values=[images]) as var_scope:
            collection_name = var_scope.original_name_scope + '_end_points'
            with slim.arg_scope(
                    [fire_module, slim.conv2d, slim.max_pool2d,
                     slim.avg_pool2d],
                    outputs_collections=[collection_name]):
                net = slim.conv2d(images, 96, [2, 2], scope='conv1')
                net = slim.max_pool2d(net, [2, 2], scope='maxpool1')
                for idx, depth_a, depth_b in fire_plan:
                    net = fire_module(
                        net, depth_a, depth_b, scope='fire%d' % idx)
                    if idx in pooled_after:
                        net = slim.max_pool2d(
                            net, [2, 2], scope='maxpool%d' % idx)
                # Average pooling before the classifier, following
                # 'Network in Network' as cited by the original author.
                net = slim.avg_pool2d(net, [4, 4], scope='avgpool10')
                net = slim.conv2d(
                    net, num_classes, [1, 1],
                    activation_fn=None,
                    normalizer_fn=None,
                    scope='conv10')
                logits = tf.squeeze(net, [1, 2], name='logits')
                # Register the logits so they appear in the end points.
                logits = utils.collect_named_outputs(
                    collection_name, var_scope.name + '/logits', logits)
            end_points = utils.convert_collection_to_dict(collection_name)
            return logits, end_points
def get_slim_arch_bn(inputs, isTrainTensor, num_classes=1000, scope='vgg_16'):
    """Build a 64-filter conv/batch-norm network with residual adds and a CAM.

    Five `repeat`-ed convolution stages are built. The first four are each
    followed by batch norm and a 2x2 max pool, and stages 2-4 add the stage
    input back onto the batch-normed output. The last stage is averaged over
    axes 1 and 2, multiplied by an overridable mask and fed to a bias-free
    fully connected layer. A class-activation tensor named 'cam_out' is also
    added to the graph (inside the 'raw_CAM' scope) but is not returned.

    Args:
        inputs: input tensor (presumably NHWC -- confirm against caller).
        isTrainTensor: passed as `is_training` to every batch-norm layer.
        num_classes: number of units of the 'softmax_logits' layer.
        scope: optional variable scope; defaults to 'vgg_16'.

    Returns:
        net: output of the 'softmax_logits' fully connected layer.
        end_points: dict of the layer outputs collected in this scope.
    """
    with variable_scope.variable_scope(scope, 'vgg_16', [inputs]) as sc:
        end_points_collection = sc.original_name_scope + '_end_points'
        # Collect outputs for conv2d, fully_connected and max_pool2d.
        filters = 64
        # Arg scope set default parameters for a list of ops
        with arg_scope(
                [layers.conv2d, layers_lib.fully_connected,
                 layers_lib.max_pool2d],
                outputs_collections=end_points_collection):
            net = layers_lib.repeat(
                inputs, 2, layers.conv2d, filters, [3, 3], scope='conv1',
                weights_regularizer=slim.l2_regularizer(0.01))
            bn_0 = tf.contrib.layers.batch_norm(
                net, center=True, scale=True, is_training=isTrainTensor,
                scope='bn1', decay=0.9)
            p_0 = layers_lib.max_pool2d(bn_0, [2, 2], scope='pool1')

            net = layers_lib.repeat(
                p_0, 2, layers.conv2d, filters, [3, 3], scope='conv2',
                weights_regularizer=slim.l2_regularizer(0.01))
            bn_1 = tf.contrib.layers.batch_norm(
                net, center=True, scale=True, is_training=isTrainTensor,
                scope='bn2', decay=0.9)
            res_1 = p_0 + bn_1
            p_1 = layers_lib.max_pool2d(res_1, [2, 2], scope='pool2')

            net = layers_lib.repeat(
                p_1, 3, layers.conv2d, filters, [4, 4], scope='conv3',
                weights_regularizer=slim.l2_regularizer(0.01))
            bn_2 = tf.contrib.layers.batch_norm(
                net, center=True, scale=True, is_training=isTrainTensor,
                scope='bn3', decay=0.9)
            res_2 = p_1 + bn_2
            p_2 = layers_lib.max_pool2d(res_2, [2, 2], scope='pool3')

            net = layers_lib.repeat(
                p_2, 3, layers.conv2d, filters, [5, 5], scope='conv4',
                weights_regularizer=slim.l2_regularizer(0.01))
            bn_3 = tf.contrib.layers.batch_norm(
                net, center=True, scale=True, is_training=isTrainTensor,
                scope='bn4', decay=0.9)
            res_3 = p_2 + bn_3
            p_3 = layers_lib.max_pool2d(res_3, [2, 2], scope='pool4')

            last_conv = net = layers_lib.repeat(
                p_3, 3, layers.conv2d, filters, [5, 5], scope='conv5',
                weights_regularizer=slim.l2_regularizer(0.01))
            # Original note: "Here we have 14x14 filters" (presumably the
            # feature-map size for the intended input size -- confirm).
            net = tf.reduce_mean(net, [1, 2])  # Global average pooling

            # Float mask with the same shape as the global-average-pooling
            # output. It defaults to ones; feed the placeholder to override.
            mask = tf.placeholder_with_default(
                tf.ones_like(net), shape=net.shape, name='gap_mask')
            net = tf.multiply(net, mask)
            net = layers_lib.fully_connected(
                net, num_classes, activation_fn=None,
                biases_initializer=None, scope='softmax_logits')

            with tf.variable_scope("raw_CAM"):
                # Build the weight name from the enclosing scope instead of
                # hard-coding "vgg_16", so a custom `scope` or an outer
                # variable scope still resolves to the right variable.
                w_tensor_name = sc.name + "/softmax_logits/weights:0"
                s_w = tf.get_default_graph().get_tensor_by_name(w_tensor_name)
                # Reshape to match 1x1xFxC.
                softmax_weights = tf.expand_dims(tf.expand_dims(s_w, 0), 0)
                # Tensor mult from (N x lh x lw x F), (1 x 1 x F x C).
                # The result is only reachable by its graph name 'cam_out'.
                cam = tf.tensordot(last_conv, softmax_weights, [[3], [2]],
                                   name='cam_out')

            # Convert end_points_collection into a end_point dict.
            end_points = utils.convert_collection_to_dict(
                end_points_collection)
            return net, end_points
def resnet_v2(inputs,
              blocks,
              num_classes=None,
              is_training=True,
              global_pool=True,
              output_stride=None,
              include_root_block=True,
              centered_stride=False,
              reuse=None,
              scope=None):
    """Generator for v2 (preactivation) ResNet models.

    This function generates a family of ResNet v2 models. See the
    resnet_v2_*() methods for specific model instantiations, obtained by
    selecting different block instantiations that produce ResNets of various
    depths.

    Training for image classification on Imagenet is usually done with
    [224, 224] inputs, resulting in [7, 7] feature maps at the output of the
    last ResNet block for the ResNets defined in [1] that have nominal stride
    equal to 32. However, for dense prediction tasks we advise that one uses
    inputs with spatial dimensions that are multiples of 32 plus 1, e.g.,
    [321, 321]. In this case the feature maps at the ResNet output will have
    spatial shape
    [(height - 1) / output_stride + 1, (width - 1) / output_stride + 1]
    and corners exactly aligned with the input image corners, which greatly
    facilitates alignment of the features to the image. Using as input
    [225, 225] images results in [8, 8] feature maps at the output of the
    last ResNet block.

    For dense prediction tasks, the ResNet needs to run in
    fully-convolutional (FCN) mode and global_pool needs to be set to False.
    The ResNets in [1, 2] all have nominal stride equal to 32 and a good
    choice in FCN mode is to use output_stride=16 in order to increase the
    density of the computed features at small computational and memory
    overhead, cf. http://arxiv.org/abs/1606.00915.

    Args:
      inputs: A tensor of size [batch, height_in, width_in, channels].
      blocks: A list of length equal to the number of ResNet blocks. Each
        element is a resnet_utils.Block object describing the units in the
        block.
      num_classes: Number of predicted classes for classification tasks. If
        None we return the features before the logit layer.
      is_training: whether batch_norm layers are in training mode.
      global_pool: If True, we perform global average pooling before
        computing the logits. Set to True for image classification, False
        for dense prediction.
      output_stride: If None, then the output will be computed at the
        nominal network stride. If output_stride is not None, it specifies
        the requested ratio of input to output spatial resolution.
      include_root_block: If True, include the initial convolution followed
        by max-pooling, if False excludes it. If excluded, `inputs` should
        be the results of an activation-less convolution.
      centered_stride: Forwarded to `resnet_utils.max_pool2d_same` for the
        root max-pool, but only when the requested `output_stride` is 16
        (i.e. 4 once the root block's own stride of 4 is divided out). Has
        no effect when `include_root_block` is False.
      reuse: whether or not the network and its variables should be reused.
        To be able to reuse 'scope' must be given.
      scope: Optional variable_scope.

    Returns:
      net: A rank-4 tensor of size
        [batch, height_out, width_out, channels_out]. If global_pool is
        False, then height_out and width_out are reduced by a factor of
        output_stride compared to the respective height_in and width_in,
        else both height_out and width_out equal one. If num_classes is
        None, then net is the output of the last ResNet block, potentially
        after global average pooling. If num_classes is not None, net
        contains the pre-softmax activations.
      end_points: A dictionary from components of the network to the
        corresponding activation.

    Raises:
      ValueError: If the target output_stride is not valid.
    """
    with variable_scope.variable_scope(
            scope, 'resnet_v2', [inputs], reuse=reuse) as sc:
        end_points_collection = sc.original_name_scope + '_end_points'
        with arg_scope(
                [layers_lib.conv2d, bottleneck,
                 resnet_utils.stack_blocks_dense],
                outputs_collections=end_points_collection):
            with arg_scope([layers.batch_norm], is_training=is_training):
                net = inputs
                if include_root_block:
                    if output_stride is not None:
                        if output_stride % 4 != 0:
                            raise ValueError(
                                'The output_stride needs to be a multiple '
                                'of 4.')
                        # Integer division keeps the stride an int on
                        # Python 3. Divisibility was checked just above, so
                        # the value is the same as with true division.
                        output_stride //= 4
                    # We do not include batch normalization or activation
                    # functions in conv1 because the first ResNet unit will
                    # perform these. Cf. Appendix of [2].
                    with arg_scope([layers_lib.conv2d],
                                   activation_fn=None, normalizer_fn=None):
                        net = resnet_utils.conv2d_same(
                            net, 64, 7, stride=2, scope='conv1')
                    net = resnet_utils.max_pool2d_same(
                        net, 3, stride=2, scope='pool1',
                        centered_stride=(
                            centered_stride and output_stride == 4))
                net = resnet_utils.stack_blocks_dense(
                    net, blocks, output_stride)
                # This is needed because the pre-activation variant does not
                # have batch normalization or activation functions in the
                # residual unit output. See Appendix of [2].
                # NOTE(review): the arg_scope above targets
                # `layers.batch_norm`; confirm `slim.batch_norm` is the same
                # function so that `is_training` applies here.
                net = slim.batch_norm(
                    net, activation_fn=nn_ops.relu, scope='postnorm')
                if global_pool:
                    # Global average pooling.
                    net = math_ops.reduce_mean(
                        net, tfu.image_axes(), name='pool5', keepdims=True)
                if num_classes is not None:
                    net = layers_lib.conv2d(
                        net,
                        num_classes, [1, 1],
                        activation_fn=None,
                        normalizer_fn=None,
                        scope='logits')
                # Convert end_points_collection into a dictionary of
                # end_points.
                end_points = utils.convert_collection_to_dict(
                    end_points_collection)
                if num_classes is not None:
                    end_points['predictions'] = layers.softmax(
                        net, scope='predictions')
                return net, end_points
def vgg_16(inputs,
           num_classes=1000,
           is_training=True,
           dropout_keep_prob=0.5,
           scope='vgg_16'):
    """Truncated Oxford Net VGG 16-Layers (version D) feature extractor.

    Only the convolutional trunk (conv1 to pool5) is built. The fc6-fc8
    classification head of the reference model is intentionally not
    created. The input is shifted by a fixed per-channel constant and
    divided by 255 before entering the network. conv1 and conv2 are created
    with `trainable=False`.

    Args:
      inputs: a tensor of size [batch_size, height, width, channels]. The
        three-element offset implies 3 channels (presumably RGB -- confirm).
      num_classes: unused; kept for interface compatibility.
      is_training: unused; kept for interface compatibility.
      dropout_keep_prob: unused; kept for interface compatibility.
      scope: Optional scope for the variables.

    Returns:
      The tensor collected under ``<scope name>/conv5/conv5_3`` (with the
      default scope: "vgg_16/conv5/conv5_3"). The pool5 output is created
      in the graph but not returned.
    """
    # Kept function-local as in the original, since the module-level
    # imports are not visible from this block.
    import tensorflow as tf

    # Rebind instead of using `-=` / `/=` so an array-like argument owned by
    # the caller is never modified in place.
    inputs = inputs - tf.constant([123.68, 116.779, 103.939])
    inputs = inputs / 255
    with variable_scope.variable_scope(scope, 'vgg_16', [inputs]) as sc:
        end_points_collection = sc.original_name_scope + '_end_points'
        # Collect outputs for conv2d, fully_connected and max_pool2d.
        with arg_scope(
                [layers.conv2d, layers_lib.fully_connected,
                 layers_lib.max_pool2d],
                outputs_collections=end_points_collection):
            net = layers_lib.repeat(
                inputs, 2, layers.conv2d, 64, [3, 3], scope='conv1',
                trainable=False)
            net = layers_lib.max_pool2d(
                net, [2, 2], scope='pool1', padding="SAME")
            net = layers_lib.repeat(
                net, 2, layers.conv2d, 128, [3, 3], scope='conv2',
                trainable=False)
            net = layers_lib.max_pool2d(
                net, [2, 2], scope='pool2', padding="SAME")
            net = layers_lib.repeat(
                net, 3, layers.conv2d, 256, [3, 3], scope='conv3')
            net = layers_lib.max_pool2d(
                net, [2, 2], scope='pool3', padding="SAME")
            net = layers_lib.repeat(
                net, 3, layers.conv2d, 512, [3, 3], scope='conv4')
            net = layers_lib.max_pool2d(
                net, [2, 2], scope='pool4', padding="SAME")
            net = layers_lib.repeat(
                net, 3, layers.conv2d, 512, [3, 3], scope='conv5')
            net = layers_lib.max_pool2d(net, [2, 2], scope='pool5')
            # Convert end_points_collection into a end_point dict.
            end_points = utils.convert_collection_to_dict(
                end_points_collection)
            # Key on the actual scope name rather than a hard-coded
            # "vgg_16" so a custom or nested scope still works.
            return end_points[sc.name + "/conv5/conv5_3"]
def vgg_a(inputs,
          num_classes=1000,
          is_training=True,
          dropout_keep_prob=0.5,
          spatial_squeeze=True,
          scope='vgg_a'):
    """Oxford Net VGG 11-Layers version A Example.

    Note: All the fully_connected layers have been transformed to conv2d
    layers. To use in classification mode, resize input to 224x224. The
    conv1 and conv2 stages are created with `trainable=False`.

    Args:
      inputs: a tensor of size [batch_size, height, width, channels].
      num_classes: number of predicted classes.
      is_training: whether or not the model is being trained.
      dropout_keep_prob: the probability that activations are kept in the
        dropout layers during training.
      spatial_squeeze: whether or not should squeeze the spatial dimensions
        of the outputs. Useful to remove unnecessary dimensions for
        classification.
      scope: Optional scope for the variables.

    Returns:
      The 'fc8' output (squeezed over axes 1 and 2 when `spatial_squeeze`
      is set) and the end_points dict.
    """
    with variable_scope.variable_scope(scope, 'vgg_a', [inputs]) as sc:
        collection_name = sc.original_name_scope + '_end_points'
        # Collect outputs for conv2d, fully_connected and max_pool2d.
        collected_ops = [layers.conv2d, layers_lib.max_pool2d]
        with arg_scope(collected_ops, outputs_collections=collection_name):
            # Convolutional trunk; the first two stages are frozen.
            x = layers_lib.repeat(
                inputs, 1, layers.conv2d, 64, [3, 3], scope='conv1',
                trainable=False)
            x = layers_lib.max_pool2d(x, [2, 2], scope='pool1')
            x = layers_lib.repeat(
                x, 1, layers.conv2d, 128, [3, 3], scope='conv2',
                trainable=False)
            x = layers_lib.max_pool2d(x, [2, 2], scope='pool2')
            x = layers_lib.repeat(
                x, 2, layers.conv2d, 256, [3, 3], scope='conv3')
            x = layers_lib.max_pool2d(x, [2, 2], scope='pool3')
            x = layers_lib.repeat(
                x, 2, layers.conv2d, 512, [3, 3], scope='conv4')
            x = layers_lib.max_pool2d(x, [2, 2], scope='pool4')
            x = layers_lib.repeat(
                x, 2, layers.conv2d, 512, [3, 3], scope='conv5')
            x = layers_lib.max_pool2d(x, [2, 2], scope='pool5')

            # Use conv2d instead of fully_connected layers.
            x = layers.conv2d(x, 4096, [7, 7], padding='VALID', scope='fc6')
            x = layers_lib.dropout(
                x, dropout_keep_prob, is_training=is_training,
                scope='dropout6')
            x = layers.conv2d(x, 4096, [1, 1], scope='fc7')
            x = layers_lib.dropout(
                x, dropout_keep_prob, is_training=is_training,
                scope='dropout7')
            x = layers.conv2d(
                x,
                num_classes, [1, 1],
                activation_fn=None,
                normalizer_fn=None,
                scope='fc8')

            # Convert end_points_collection into a end_point dict.
            end_points = utils.convert_collection_to_dict(collection_name)
            if not spatial_squeeze:
                return x, end_points
            x = array_ops.squeeze(x, [1, 2], name='fc8/squeezed')
            end_points[sc.name + '/fc8'] = x
            return x, end_points
def _build(self, inputs, is_training=True):
    """Build the SSD feature maps on top of the base network.

    When `self.truncated_vgg_16_type` is set, the conv4_3 end point of the
    base network is L2-normalized over the channel axis and scaled by a
    learned 'gamma' variable (initialised to 20), then the extra SSD layers
    conv6 to conv11_2 are stacked on top of conv5_3. Each resulting feature
    map is added to the "FEATURE_MAPS" graph collection.

    Args:
        inputs: A Tensor of shape `(batch_size, height, width, channels)`.
        is_training: forwarded to the parent class' `_build`.

    Returns:
        A dict of feature maps to be consumed by an SSD network, built from
        the "FEATURE_MAPS" collection.
    """
    # TODO: Is there a better way to manage scoping in these cases?
    scope = self.module_name
    if self.parent_name:
        scope = self.parent_name + "/" + scope

    base_net_endpoints = super(SSDFeatureExtractor, self)._build(
        inputs, is_training=is_training)["end_points"]

    if self.truncated_vgg_16_type:
        # As it is pointed out in SSD and ParseNet papers, `conv4_3` has a
        # different features scale compared to other layers, to adjust it
        # we need to add a spatial normalization before adding the
        # predictors.
        vgg_conv4_3 = base_net_endpoints[scope + "/vgg_16/conv4/conv4_3"]
        tf.summary.histogram("conv4_3_hist", vgg_conv4_3)
        with tf.variable_scope("conv_4_3_norm"):
            # Normalize through channels dimension (dim=3)
            vgg_conv4_3_norm = tf.nn.l2_normalize(
                vgg_conv4_3, 3, epsilon=1e-12)
            # Scale. They initialize to 20.0 in paper.
            scale_initializer = (
                tf.ones([1, 1, 1, vgg_conv4_3.shape[3]]) * 20.0)
            scale = tf.get_variable(
                "gamma",
                dtype=vgg_conv4_3.dtype.base_dtype,
                initializer=scale_initializer,
            )
            vgg_conv4_3_norm = tf.multiply(vgg_conv4_3_norm, scale)
            # Log the normalized tensor. The original logged the raw
            # conv4_3 activations here, duplicating "conv4_3_hist".
            tf.summary.histogram(
                "conv4_3_normalized_hist", vgg_conv4_3_norm)
        tf.add_to_collection("FEATURE_MAPS", vgg_conv4_3_norm)

        # The original SSD paper uses a modified version of the vgg16
        # network, which we'll modify here
        vgg_network_truncation_endpoint = base_net_endpoints[
            scope + "/vgg_16/conv5/conv5_3"]
        tf.summary.histogram(
            "conv5_3_hist", vgg_network_truncation_endpoint)

        # Extra layers for vgg16 as detailed in paper
        with tf.variable_scope("extra_feature_layers"):
            self._init_vgg16_extra_layers()
            net = tf.nn.max_pool(
                vgg_network_truncation_endpoint,
                [1, 3, 3, 1],
                padding="SAME",
                strides=[1, 1, 1, 1],
                name="pool5",
            )
            net = self.conv6(net)
            net = self.activation_fn(net)
            net = self.conv7(net)
            net = self.activation_fn(net)
            tf.summary.histogram("conv7_hist", net)
            tf.add_to_collection("FEATURE_MAPS", net)

            net = self.conv8_1(net)
            net = self.activation_fn(net)
            net = self.conv8_2(net)
            net = self.activation_fn(net)
            tf.summary.histogram("conv8_hist", net)
            tf.add_to_collection("FEATURE_MAPS", net)

            net = self.conv9_1(net)
            net = self.activation_fn(net)
            net = self.conv9_2(net)
            net = self.activation_fn(net)
            tf.summary.histogram("conv9_hist", net)
            tf.add_to_collection("FEATURE_MAPS", net)

            net = self.conv10_1(net)
            net = self.activation_fn(net)
            net = self.conv10_2(net)
            net = self.activation_fn(net)
            tf.summary.histogram("conv10_hist", net)
            tf.add_to_collection("FEATURE_MAPS", net)

            net = self.conv11_1(net)
            net = self.activation_fn(net)
            net = self.conv11_2(net)
            net = self.activation_fn(net)
            tf.summary.histogram("conv11_hist", net)
            tf.add_to_collection("FEATURE_MAPS", net)

        # This parameter determines onto which variables we try to load the
        # pretrained weights
        self.pretrained_weights_scope = scope + "/vgg_16"

    # It's actually an ordered dict
    return utils.convert_collection_to_dict("FEATURE_MAPS")
def body(self, features):
    """Build the attention-based VQA model body.

    Image features (embedded from raw images or taken as given) and the
    encoded question are combined through attention and an MLP. The norm of
    each intermediate tensor is collected in the "norms" collection and
    summarized.

    Args:
        features: dict with at least the keys "inputs" (image or image
            features) and "question".

    Returns:
        The MLP output with singleton axes inserted at positions 1 and 2.

    Raises:
        ValueError: if `hparams.multimodal_combine` is not one of "concat",
            "sum" or "product".
    """
    hp = self.hparams
    # NOTE(review): `eval` runs whatever string is stored in
    # hp.image_model_fn, so hparams must come from a trusted source.
    # pylint: disable=eval-used
    if hp.image_input_type == "image":
        image_feat = vqa_layers.image_embedding(
            features["inputs"],
            model_fn=eval(hp.image_model_fn),
            trainable=hp.train_resnet,
            is_training=hp.mode == tf.estimator.ModeKeys.TRAIN)
    else:
        image_feat = features["inputs"]

    image_feat = common_layers.flatten4d3d(image_feat)
    image_hidden_size = hp.image_hidden_size or hp.hidden_size
    if hp.image_feat_preprocess_proj:
        image_feat = common_layers.dense(image_feat, image_hidden_size)
        utils.collect_named_outputs("norms", "image_feat_after_proj",
                                    tf.norm(image_feat, axis=-1))
    else:
        # Without the projection the hidden size is required to be 2048.
        assert image_hidden_size == 2048

    image_feat = tf.nn.dropout(
        image_feat, keep_prob=1.-hp.layer_prepostprocess_dropout)

    if hp.image_feat_encode:
        image_feat = image_encoder(image_feat, hp)
        utils.collect_named_outputs("norms", "image_feat_encoded",
                                    tf.norm(image_feat, axis=-1))
    else:
        image_feat = common_layers.layer_norm(image_feat)
        utils.collect_named_outputs("norms", "image_feat_after_layer",
                                    tf.norm(image_feat, axis=-1))

    question = common_layers.flatten4d3d(features["question"])
    utils.collect_named_outputs("norms", "question_embedding",
                                tf.norm(question, axis=-1))
    question, question_self_attention_bias = prepare_question_encoder(
        question, hp)
    question = tf.nn.dropout(
        question, keep_prob=1.-hp.layer_prepostprocess_dropout)
    query = question_encoder(question, question_self_attention_bias, hp)
    utils.collect_named_outputs(
        "norms", "query_encode", tf.norm(query, axis=-1))
    # Add the attention bias before the max-pool over axis 1 (presumably so
    # padded positions cannot win the max -- confirm against
    # prepare_question_encoder).
    query = (query + tf.expand_dims(
        tf.squeeze(question_self_attention_bias, [1, 2]), axis=2))
    query = tf.reduce_max(query, axis=1)
    utils.collect_named_outputs(
        "norms", "query_maxpool", tf.norm(query, axis=-1))

    image_ave = attn(image_feat, query, hp)
    utils.collect_named_outputs("norms", "image_ave",
                                tf.norm(image_ave, axis=-1))

    if hp.multimodal_combine == "concat":
        image_question = tf.concat([image_ave, query], axis=1)
    elif hp.multimodal_combine == "sum":
        image_question = image_ave + query
    elif hp.multimodal_combine == "product":
        image_question = image_ave * query
    else:
        # Previously this fell through and failed later with an
        # UnboundLocalError on `image_question`.
        raise ValueError(
            "Unsupported multimodal_combine: %r (expected 'concat', 'sum' "
            "or 'product')" % (hp.multimodal_combine,))

    utils.collect_named_outputs("norms", "image_question",
                                tf.norm(image_question, axis=-1))

    image_question = tf.nn.dropout(image_question, 1. - hp.dropout)

    output = mlp(image_question, hp)
    utils.collect_named_outputs("norms", "output",
                                tf.norm(output, axis=-1))

    norm_tensors = utils.convert_collection_to_dict("norms")
    vqa_layers.summarize_tensors(norm_tensors, tag="norms/")

    # Expand dimension 1 and 2
    return tf.expand_dims(tf.expand_dims(output, axis=1), axis=2)
def resnet_v1(inputs,
              blocks,
              num_classes=None,
              is_training=True,
              global_pool=True,
              include_root_block=True,
              reuse=None,
              scope=None,
              normalize_inside=True):
    """ResNet v1 generator without output_stride; uses pre-defined rates.

    Args:
      inputs: input image tensor.
      blocks: block descriptions handed to `stack_blocks_dense`.
      num_classes: if not None, a 1x1 'logits' convolution with this many
        outputs is appended and a 'predictions' softmax end point is added.
      is_training: set as `is_training` for `batch_norm` through arg_scope.
      global_pool: if True, average over axes 1 and 2 (keeping dims) after
        the last block.
      include_root_block: if True, start with the 7x7 stride-2 'conv1'
        followed by the 3x3 stride-2 'pool1'.
      reuse: forwarded to the variable scope.
      scope: optional variable scope; defaults to 'resnet_v1'.
      normalize_inside: if True, `_detectron_img_preprocess` is applied to
        `inputs` first (use when no normalization happens outside).

    Returns:
      net: A rank-4 tensor of size
        [batch, height_out, width_out, channels_out]. If global_pool is
        True both height_out and width_out equal one. If num_classes is
        None, then net is the output of the last ResNet block, potentially
        after global average pooling. If num_classes is not None, net
        contains the pre-softmax activations.
      end_points: A dictionary from components of the network to the
        corresponding activation.
    """
    if normalize_inside:
        # if no normalization is used outside, use detectron style
        # normalization
        inputs = _detectron_img_preprocess(inputs)

    with variable_scope.variable_scope(
            scope, 'resnet_v1', [inputs], reuse=reuse) as sc:
        collection_name = sc.original_name_scope + '_end_points'
        collected_ops = [conv2d, bottleneck, stack_blocks_dense, max_pool2d]
        with arg_scope(collected_ops, outputs_collections=collection_name):
            with arg_scope([batch_norm], is_training=is_training):
                features = inputs
                if include_root_block:
                    features = conv2d(features, 64, 7, 2, scope='conv1')
                    features = max_pool2d(features, 3, 2, scope='pool1')
                features = stack_blocks_dense(features, blocks)

                if global_pool:
                    # Global average pooling, also exposed as the 'gap'
                    # end point.
                    features = math_ops.reduce_mean(
                        features, [1, 2], name='pool5', keepdims=True)
                    features = utils.collect_named_outputs(
                        collection_name, sc.name + '/gap', features)

                wants_logits = num_classes is not None
                if wants_logits:
                    features = conv2d(
                        features,
                        num_classes,
                        1,
                        activation_fn=None,
                        normalizer_fn=None,
                        scope='logits')

                # Convert end_points_collection into a dictionary of
                # end_points.
                end_points = utils.convert_collection_to_dict(
                    collection_name)
                if wants_logits:
                    end_points['predictions'] = layers_lib.softmax(
                        features, scope='predictions')
                return features, end_points
def disp_net(tgt_image, is_training=True):
    """Build the encoder-decoder disparity network with four output scales.

    Seven stride-2 encoder stages (each followed by a stride-1 convolution)
    are mirrored by transposed-convolution decoder stages with skip
    connections. Disparity is predicted at four decoder levels as
    ``DISP_SCALING * sigmoid(conv) + MIN_DISP``.

    Args:
        tgt_image: input image tensor. Its static sizes on axes 1 and 2 are
            read as height and width and must be known, because they are
            used to size the bilinear up-sampling of the coarser
            disparities.
        is_training: not used by this function; kept for interface
            compatibility.

    Returns:
        A list ``[disp1, disp2, disp3, disp4]`` (finest first) and the
        end_points dict of collected layer outputs.
    """
    H = tgt_image.get_shape()[1].value
    W = tgt_image.get_shape()[2].value
    with tf.variable_scope('depth_net') as sc:
        end_points_collection = sc.original_name_scope + '_end_points'
        with slim.arg_scope([slim.conv2d, slim.conv2d_transpose],
                            normalizer_fn=None,
                            weights_regularizer=slim.l2_regularizer(0.05),
                            activation_fn=tf.nn.relu,
                            outputs_collections=end_points_collection):
            # Encoder.
            cnv1 = slim.conv2d(tgt_image, 32, [7, 7], stride=2, scope='cnv1')
            cnv1b = slim.conv2d(cnv1, 32, [7, 7], stride=1, scope='cnv1b')
            cnv2 = slim.conv2d(cnv1b, 64, [5, 5], stride=2, scope='cnv2')
            cnv2b = slim.conv2d(cnv2, 64, [5, 5], stride=1, scope='cnv2b')
            cnv3 = slim.conv2d(cnv2b, 128, [3, 3], stride=2, scope='cnv3')
            cnv3b = slim.conv2d(cnv3, 128, [3, 3], stride=1, scope='cnv3b')
            cnv4 = slim.conv2d(cnv3b, 256, [3, 3], stride=2, scope='cnv4')
            cnv4b = slim.conv2d(cnv4, 256, [3, 3], stride=1, scope='cnv4b')
            cnv5 = slim.conv2d(cnv4b, 512, [3, 3], stride=2, scope='cnv5')
            cnv5b = slim.conv2d(cnv5, 512, [3, 3], stride=1, scope='cnv5b')
            cnv6 = slim.conv2d(cnv5b, 512, [3, 3], stride=2, scope='cnv6')
            cnv6b = slim.conv2d(cnv6, 512, [3, 3], stride=1, scope='cnv6b')
            cnv7 = slim.conv2d(cnv6b, 512, [3, 3], stride=2, scope='cnv7')
            cnv7b = slim.conv2d(cnv7, 512, [3, 3], stride=1, scope='cnv7b')

            # Decoder with skip connections.
            upcnv7 = slim.conv2d_transpose(
                cnv7b, 512, [3, 3], stride=2, scope='upcnv7')
            # There might be dimension mismatch due to uneven
            # down/up-sampling
            upcnv7 = resize_like(upcnv7, cnv6b)
            i7_in = tf.concat([upcnv7, cnv6b], axis=3)
            icnv7 = slim.conv2d(i7_in, 512, [3, 3], stride=1, scope='icnv7')

            upcnv6 = slim.conv2d_transpose(
                icnv7, 512, [3, 3], stride=2, scope='upcnv6')
            upcnv6 = resize_like(upcnv6, cnv5b)
            i6_in = tf.concat([upcnv6, cnv5b], axis=3)
            icnv6 = slim.conv2d(i6_in, 512, [3, 3], stride=1, scope='icnv6')

            upcnv5 = slim.conv2d_transpose(
                icnv6, 256, [3, 3], stride=2, scope='upcnv5')
            upcnv5 = resize_like(upcnv5, cnv4b)
            i5_in = tf.concat([upcnv5, cnv4b], axis=3)
            icnv5 = slim.conv2d(i5_in, 256, [3, 3], stride=1, scope='icnv5')

            upcnv4 = slim.conv2d_transpose(
                icnv5, 128, [3, 3], stride=2, scope='upcnv4')
            upcnv4 = resize_like(upcnv4, cnv3b)
            i4_in = tf.concat([upcnv4, cnv3b], axis=3)
            icnv4 = slim.conv2d(i4_in, 128, [3, 3], stride=1, scope='icnv4')
            disp4 = DISP_SCALING * slim.conv2d(
                icnv4, 1, [3, 3], stride=1, activation_fn=tf.sigmoid,
                normalizer_fn=None, scope='disp4') + MIN_DISP
            # `np.int` was removed in NumPy 1.24. Floor division gives the
            # same value as the former `np.int(H/4)` for the non-negative
            # static sizes used here.
            disp4_up = tf.image.resize_bilinear(disp4, [H // 4, W // 4])

            upcnv3 = slim.conv2d_transpose(
                icnv4, 64, [3, 3], stride=2, scope='upcnv3')
            i3_in = tf.concat([upcnv3, cnv2b, disp4_up], axis=3)
            icnv3 = slim.conv2d(i3_in, 64, [3, 3], stride=1, scope='icnv3')
            disp3 = DISP_SCALING * slim.conv2d(
                icnv3, 1, [3, 3], stride=1, activation_fn=tf.sigmoid,
                normalizer_fn=None, scope='disp3') + MIN_DISP
            disp3_up = tf.image.resize_bilinear(disp3, [H // 2, W // 2])

            upcnv2 = slim.conv2d_transpose(
                icnv3, 32, [3, 3], stride=2, scope='upcnv2')
            i2_in = tf.concat([upcnv2, cnv1b, disp3_up], axis=3)
            icnv2 = slim.conv2d(i2_in, 32, [3, 3], stride=1, scope='icnv2')
            disp2 = DISP_SCALING * slim.conv2d(
                icnv2, 1, [3, 3], stride=1, activation_fn=tf.sigmoid,
                normalizer_fn=None, scope='disp2') + MIN_DISP
            disp2_up = tf.image.resize_bilinear(disp2, [H, W])

            upcnv1 = slim.conv2d_transpose(
                icnv2, 16, [3, 3], stride=2, scope='upcnv1')
            i1_in = tf.concat([upcnv1, disp2_up], axis=3)
            icnv1 = slim.conv2d(i1_in, 16, [3, 3], stride=1, scope='icnv1')
            disp1 = DISP_SCALING * slim.conv2d(
                icnv1, 1, [3, 3], stride=1, activation_fn=tf.sigmoid,
                normalizer_fn=None, scope='disp1') + MIN_DISP

            end_points = utils.convert_collection_to_dict(
                end_points_collection)
            return [disp1, disp2, disp3, disp4], end_points
def get_inference(self, inputs, num_classes, for_training=False,
                  restore_logits=True, scope=None):
    """Build the segmentation network and the decision network on top of it.

    Args:
        inputs: input image tensor fed to the first convolution.
        num_classes: number of classes (not used inside this method).
        for_training: not used inside this method. The batch-norm
            `is_training` setting is deliberately left disabled to get
            feature normalization, which requires batch size 1.
        restore_logits: not used inside this method.
        scope: optional variable scope; defaults to 'SegDecNet'.

    Returns:
        net_prob_mat: output of the single-channel 1x1 'conv5' layer,
            without activation.
        decision_net: result of `self.decision_net_fn` applied to the
            'conv4' features and the ReLU of `net_prob_mat`.
        endpoints: dict of the collected layer outputs.
    """
    def _normal_init(shape, dtype=tf.float32, partition_info=None):
        # Zero-mean normal initializer with a 0.01 standard deviation.
        return tf.random_normal(shape, mean=0, stddev=0.01, dtype=dtype)

    batch_norm_params = {
        'center': True,
        'scale': True,
        # 'is_training': for_training,  # we disable this to do feature
        # normalization (but requires batch size=1)
        # Decay for the moving averages.
        'decay': self.BATCHNORM_MOVING_AVERAGE_DECAY,
        # epsilon to prevent 0s in variance.
        'epsilon': 0.001,
    }

    with variable_scope.variable_scope(scope, 'SegDecNet', [inputs]) as sc:
        collection_name = sc.original_name_scope + '_end_points'
        # Collect outputs for conv2d, max_pool2d
        collected_ops = [
            layers.conv2d, layers.fully_connected, layers_lib.max_pool2d,
            layers.batch_norm
        ]
        with arg_scope(collected_ops, outputs_collections=collection_name):
            # Apply specific parameters to all conv2d layers (to use batch
            # norm and relu - relu is by default)
            with arg_scope([layers.conv2d, layers.fully_connected],
                           weights_initializer=_normal_init,
                           biases_initializer=None,
                           normalizer_fn=layers.batch_norm,
                           normalizer_params=batch_norm_params):
                feats = layers_lib.repeat(
                    inputs, 2, layers.conv2d, 32, [5, 5], scope='conv1')
                feats = layers_lib.max_pool2d(feats, [2, 2], scope='pool1')
                feats = layers_lib.repeat(
                    feats, 3, layers.conv2d, 64, [5, 5], scope='conv2')
                feats = layers_lib.max_pool2d(feats, [2, 2], scope='pool2')
                feats = layers_lib.repeat(
                    feats, 4, layers.conv2d, 64, [5, 5], scope='conv3')
                feats = layers_lib.max_pool2d(feats, [2, 2], scope='pool3')
                feats = layers.conv2d(
                    feats, 1024, [15, 15], padding='SAME', scope='conv4')
                net_prob_mat = layers.conv2d(
                    feats, 1, [1, 1], scope='conv5', activation_fn=None)
                rectified_prob = tf.nn.relu(net_prob_mat)
                decision_net = self.decision_net_fn(feats, rectified_prob)
                # Convert end_points_collection into a end_point dict.
                endpoints = utils.convert_collection_to_dict(
                    collection_name)

    # Add summaries for viewing model statistics on TensorBoard.
    self._activation_summaries(endpoints)
    return net_prob_mat, decision_net, endpoints
def alexnet_v2(inputs,
               num_classes=1000,
               is_training=True,
               dropout_keep_prob=0.5,
               spatial_squeeze=True,
               scope='alexnet_v2'):
    """AlexNet version 2.

    Described in: http://arxiv.org/pdf/1404.5997v2.pdf
    Parameters from:
    github.com/akrizhevsky/cuda-convnet2/blob/master/layers/
    layers-imagenet-1gpu.cfg

    Note: All the fully_connected layers have been transformed to conv2d
    layers. To use in classification mode, resize input to 224x224. To use
    in fully convolutional mode, set spatial_squeeze to false. The LRN
    layers have been removed and the initializers changed from
    random_normal_initializer to xavier_initializer.

    Args:
      inputs: a tensor of size [batch_size, height, width, channels].
      num_classes: number of predicted classes.
      is_training: whether or not the model is being trained.
      dropout_keep_prob: the probability that activations are kept in the
        dropout layers during training.
      spatial_squeeze: whether or not should squeeze the spatial dimensions
        of the outputs. Useful to remove unnecessary dimensions for
        classification.
      scope: Optional scope for the variables.

    Returns:
      The 'fc8' output (squeezed over axes 1 and 2 when `spatial_squeeze`
      is set) and the end_points dict.
    """
    with variable_scope.variable_scope(scope, 'alexnet_v2', [inputs]) as sc:
        collection_name = sc.original_name_scope + '_end_points'
        # Collect outputs for conv2d, fully_connected and max_pool2d.
        collected_ops = [
            layers.conv2d, layers_lib.fully_connected, layers_lib.max_pool2d
        ]
        with arg_scope(collected_ops,
                       outputs_collections=[collection_name]):
            # Convolutional trunk.
            x = layers.conv2d(
                inputs, 64, [11, 11], 4, padding='VALID', scope='conv1')
            x = layers_lib.max_pool2d(x, [3, 3], 2, scope='pool1')
            x = layers.conv2d(x, 192, [5, 5], scope='conv2')
            x = layers_lib.max_pool2d(x, [3, 3], 2, scope='pool2')
            x = layers.conv2d(x, 384, [3, 3], scope='conv3')
            x = layers.conv2d(x, 384, [3, 3], scope='conv4')
            x = layers.conv2d(x, 256, [3, 3], scope='conv5')
            x = layers_lib.max_pool2d(x, [3, 3], 2, scope='pool5')

            # Use conv2d instead of fully_connected layers.
            with arg_scope(
                    [layers.conv2d],
                    weights_initializer=trunc_normal(0.005),
                    biases_initializer=init_ops.constant_initializer(0.1)):
                x = layers.conv2d(
                    x, 4096, [5, 5], padding='VALID', scope='fc6')
                x = layers_lib.dropout(
                    x, dropout_keep_prob, is_training=is_training,
                    scope='dropout6')
                x = layers.conv2d(x, 4096, [1, 1], scope='fc7')
                x = layers_lib.dropout(
                    x, dropout_keep_prob, is_training=is_training,
                    scope='dropout7')
                x = layers.conv2d(
                    x,
                    num_classes, [1, 1],
                    activation_fn=None,
                    normalizer_fn=None,
                    biases_initializer=init_ops.zeros_initializer(),
                    scope='fc8')

            # Convert end_points_collection into a end_point dict.
            end_points = utils.convert_collection_to_dict(collection_name)
            if not spatial_squeeze:
                return x, end_points
            x = array_ops.squeeze(x, [1, 2], name='fc8/squeezed')
            end_points[sc.name + '/fc8'] = x
            return x, end_points
def body(self, features):
    """Build the recurrent-transformer VQA model body.

    Image features and the question are jointly encoded. A single learned
    query vector, tiled over the batch, is then decoded against the encoder
    output. The norms of the intermediate tensors are collected in the
    "norms" collection and summarized.

    Args:
        features: dict with at least the keys "inputs" (image or image
            features) and "question".

    Returns:
        The decoder output with a singleton axis inserted at position 1.
    """
    hp = self.hparams
    # NOTE(review): `eval` runs whatever string is stored in
    # hp.image_model_fn, so hparams must come from a trusted source.
    # pylint: disable=eval-used
    if hp.image_input_type == "image":
        in_training = hp.mode == tf.estimator.ModeKeys.TRAIN
        image_feat = vqa_layers.image_embedding(
            features["inputs"],
            model_fn=eval(hp.image_model_fn),
            trainable=hp.train_resnet,
            is_training=in_training)
    else:
        image_feat = features["inputs"]

    image_feat = common_layers.flatten4d3d(image_feat)
    image_feat = common_layers.dense(image_feat, hp.hidden_size)
    utils.collect_named_outputs(
        "norms", "image_feat_after_proj", tf.norm(image_feat, axis=-1))

    question = common_layers.flatten4d3d(features["question"])
    utils.collect_named_outputs(
        "norms", "question_embedding", tf.norm(question, axis=-1))

    prepared = prepare_image_question_encoder(image_feat, question, hp)
    (encoder_input, encoder_self_attention_bias,
     encoder_decoder_attention_bias) = prepared
    encoder_input = tf.nn.dropout(
        encoder_input, keep_prob=1. - hp.layer_prepostprocess_dropout)

    encoder_output, _ = recurrent_transformer_decoder(
        encoder_input, None, encoder_self_attention_bias, None,
        hp, name="encoder")
    utils.collect_named_outputs(
        "norms", "encoder_output", tf.norm(encoder_output, axis=-1))

    # scale query by sqrt(hidden_size)
    query = tf.get_variable("query", [hp.hidden_size]) * hp.hidden_size**0.5
    # Add two leading singleton axes, then repeat over the batch.
    query = tf.expand_dims(query, axis=0)
    query = tf.expand_dims(query, axis=0)
    batch_size = common_layers.shape_list(encoder_input)[0]
    query = tf.tile(query, [batch_size, 1, 1])
    query = tf.nn.dropout(
        query, keep_prob=1. - hp.layer_prepostprocess_dropout)

    decoder_output, _ = recurrent_transformer_decoder(
        query, encoder_output, None, encoder_decoder_attention_bias,
        hp, name="decoder")
    utils.collect_named_outputs(
        "norms", "decoder_output", tf.norm(decoder_output, axis=-1))

    collected_norms = utils.convert_collection_to_dict("norms")
    vqa_layers.summarize_tensors(collected_norms, tag="norms/")

    # Insert one singleton axis at position 1 of the decoder output.
    return tf.expand_dims(decoder_output, axis=1)
def resnet_v2(inputs,
              blocks,
              num_classes=None,
              is_training=None,
              global_pool=True,
              output_stride=None,
              include_root_block=True,
              reuse=None,
              scope=None):
    """Generator for v2 (preactivation) ResNet models.

    This function generates a family of ResNet v2 models. See the
    resnet_v2_*() methods for specific model instantiations, obtained by
    selecting different block instantiations that produce ResNets of various
    depths.

    Training for image classification on Imagenet is usually done with
    [224, 224] inputs, resulting in [7, 7] feature maps at the output of the
    last ResNet block for the ResNets defined in [1] that have nominal stride
    equal to 32. However, for dense prediction tasks we advise that one uses
    inputs with spatial dimensions that are multiples of 32 plus 1, e.g.,
    [321, 321]. In this case the feature maps at the ResNet output will have
    spatial shape [(height - 1) / output_stride + 1,
    (width - 1) / output_stride + 1] and corners exactly aligned with the
    input image corners, which greatly facilitates alignment of the features
    to the image. Using as input [225, 225] images results in [8, 8] feature
    maps at the output of the last ResNet block.

    For dense prediction tasks, the ResNet needs to run in
    fully-convolutional (FCN) mode and global_pool needs to be set to False.
    The ResNets in [1, 2] all have nominal stride equal to 32 and a good
    choice in FCN mode is to use output_stride=16 in order to increase the
    density of the computed features at small computational and memory
    overhead, cf. http://arxiv.org/abs/1606.00915.

    Args:
        inputs: A tensor of size [batch, height_in, width_in, channels].
        blocks: A list of length equal to the number of ResNet blocks. Each
            element is a resnet_utils.Block object describing the units in
            the block.
        num_classes: Number of predicted classes for classification tasks.
            If None we return the features before the logit layer.
        is_training: Whether the batch_norm layers run in training mode. If
            None, no override is installed and the value inherited from the
            enclosing arg_scope is used. Specifying None is deprecated.
        global_pool: If True, we perform global average pooling before
            computing the logits. Set to True for image classification,
            False for dense prediction.
        output_stride: If None, then the output will be computed at the
            nominal network stride. If output_stride is not None, it
            specifies the requested ratio of input to output spatial
            resolution.
        include_root_block: If True, include the initial convolution
            followed by max-pooling, if False excludes it. If excluded,
            `inputs` should be the results of an activation-less
            convolution.
        reuse: Whether or not the network and its variables should be
            reused. To be able to reuse, 'scope' must be given.
        scope: Optional variable_scope.

    Returns:
        net: A rank-4 tensor of size [batch, height_out, width_out,
            channels_out]. If global_pool is False, then height_out and
            width_out are reduced by a factor of output_stride compared to
            the respective height_in and width_in, else both height_out and
            width_out equal one. If num_classes is None, then net is the
            output of the last ResNet block, potentially after global
            average pooling. If num_classes is not None, net contains the
            pre-softmax activations.
        end_points: A dictionary from components of the network to the
            corresponding activation. When num_classes is not None it also
            holds the softmax output under the key 'predictions'.

    Raises:
        ValueError: If the target output_stride is not valid; in particular
            when the root block is included and output_stride is not a
            multiple of 4.
    """
    with variable_scope.variable_scope(
            scope, 'resnet_v2', [inputs], reuse=reuse) as sc:
        end_points_collection = sc.original_name_scope + '_end_points'
        with arg_scope(
                [layers_lib.conv2d, bottleneck,
                 resnet_utils.stack_blocks_dense],
                outputs_collections=end_points_collection):
            # Only override batch_norm's is_training when the caller gave an
            # explicit value; otherwise use an empty arg_scope as a no-op.
            if is_training is not None:
                bn_scope = arg_scope(
                    [layers.batch_norm], is_training=is_training)
            else:
                bn_scope = arg_scope([])
            with bn_scope:
                net = inputs
                if include_root_block:
                    if output_stride is not None:
                        if output_stride % 4 != 0:
                            raise ValueError(
                                'The output_stride needs to be a multiple '
                                'of 4.')
                        # The root block (stride-2 conv + stride-2 pool)
                        # already accounts for a factor of 4. Floor division
                        # is exact after the check above and keeps the
                        # stride an integer instead of turning it into a
                        # float.
                        output_stride //= 4
                    # We do not include batch normalization or activation
                    # functions in conv1 because the first ResNet unit will
                    # perform these. Cf. Appendix of [2].
                    with arg_scope(
                            [layers_lib.conv2d],
                            activation_fn=None,
                            normalizer_fn=None):
                        net = resnet_utils.conv2d_same(
                            net, 64, 7, stride=2, scope='conv1')
                    net = layers.max_pool2d(
                        net, [3, 3], stride=2, scope='pool1')
                net = resnet_utils.stack_blocks_dense(
                    net, blocks, output_stride)
                # This is needed because the pre-activation variant does not
                # have batch normalization or activation functions in the
                # residual unit output. See Appendix of [2].
                net = layers.batch_norm(
                    net, activation_fn=nn_ops.relu, scope='postnorm')
                if global_pool:
                    # Global average pooling.
                    net = math_ops.reduce_mean(
                        net, [1, 2], name='pool5', keep_dims=True)
                if num_classes is not None:
                    net = layers_lib.conv2d(
                        net,
                        num_classes, [1, 1],
                        activation_fn=None,
                        normalizer_fn=None,
                        scope='logits')
                # Convert end_points_collection into a dictionary of
                # end_points.
                end_points = utils.convert_collection_to_dict(
                    end_points_collection)
                if num_classes is not None:
                    end_points['predictions'] = layers.softmax(
                        net, scope='predictions')
                return net, end_points
def alexnet_v2(inputs,
               num_classes=1000,
               is_training=True,
               dropout_keep_prob=0.5,
               spatial_squeeze=True,
               scope='alexnet_v2'):
    """AlexNet version 2.

    Described in: http://arxiv.org/pdf/1404.5997v2.pdf
    Parameters from:
    github.com/akrizhevsky/cuda-convnet2/blob/master/layers/
    layers-imagenet-1gpu.cfg

    Note: All the fully_connected layers have been transformed to conv2d
    layers. To use in classification mode, resize input to 224x224. To use
    in fully convolutional mode, set spatial_squeeze to false.
    The LRN layers have been removed and the initializers changed from
    random_normal_initializer to xavier_initializer.

    Args:
        inputs: a tensor of size [batch_size, height, width, channels].
        num_classes: number of predicted classes.
        is_training: whether or not the model is being trained.
        dropout_keep_prob: the probability that activations are kept in the
            dropout layers during training.
        spatial_squeeze: whether or not should squeeze the spatial
            dimensions of the outputs. Useful to remove unnecessary
            dimensions for classification.
        scope: Optional scope for the variables.

    Returns:
        net: the output of the final 'logits' convolution (built without an
            activation function), with axes 1 and 2 squeezed when
            spatial_squeeze is set.
        end_points: the dict built from the end-points collection; when
            spatial_squeeze is set the squeezed output is also stored under
            `<scope name>/fc8`.
    """
    # `sc` has to be a variable-scope object: the body reads both
    # sc.original_name_scope and sc.name, so the scope is opened with
    # variable_scope.variable_scope rather than a plain name scope.
    with variable_scope.variable_scope(scope, 'alexnet_v2', [inputs]) as sc:
        end_points_collection = sc.original_name_scope + '_end_points'
        # Collect outputs for conv2d, fully_connected and max_pool2d.
        with arg_scope(
                [layers.conv2d, layers_lib.fully_connected,
                 layers_lib.max_pool2d],
                outputs_collections=[end_points_collection]):
            net = layers.conv2d(
                inputs, 64, [11, 11], 4, padding='VALID', scope='conv1')
            net = layers_lib.max_pool2d(net, [3, 3], 2, scope='pool1')
            net = layers.conv2d(net, 192, [5, 5], scope='conv2')
            net = layers_lib.max_pool2d(net, [3, 3], 2, scope='pool2')
            net = layers.conv2d(net, 384, [3, 3], scope='conv3')
            net = layers.conv2d(net, 384, [3, 3], scope='conv4')
            net = layers.conv2d(net, 256, [3, 3], scope='conv5')
            net = layers_lib.max_pool2d(net, [3, 3], 2, scope='pool5')

            # Use conv2d instead of fully_connected layers.
            with arg_scope(
                    [layers.conv2d],
                    weights_initializer=trunc_normal(0.005),
                    biases_initializer=init_ops.constant_initializer(0.1)):
                # NOTE(review): a [2, 1] VALID kernel only collapses the
                # feature map to 1x1 when pool5 emits a 2x1 map; the
                # 224x224 classification setup mentioned in the docstring
                # presumably needs a [5, 5] kernel here - confirm the
                # intended input size before relying on spatial_squeeze.
                net = layers.conv2d(
                    net, 4096, [2, 1], padding='VALID', scope='fc6')
                net = layers_lib.dropout(
                    net, dropout_keep_prob, is_training=is_training,
                    scope='dropout6')
                net = layers.conv2d(net, 4096, [1, 1], scope='fc7')
                net = layers_lib.dropout(
                    net, dropout_keep_prob, is_training=is_training,
                    scope='dropout7')
                net = layers.conv2d(
                    net,
                    num_classes, [1, 1],
                    activation_fn=None,
                    normalizer_fn=None,
                    biases_initializer=init_ops.zeros_initializer(),
                    scope='logits')

            # Convert end_points_collection into a end_point dict.
            end_points = utils.convert_collection_to_dict(
                end_points_collection)
            if spatial_squeeze:
                net = array_ops.squeeze(net, [1, 2], name='fc8/squeezed')
                end_points[sc.name + '/fc8'] = net
            return net, end_points
def decouple_net_v0_dilation(tgt_image,
                             src0_image,
                             src1_image,
                             dropout=False,
                             is_training=True,
                             se_attention=False,
                             batch_norm=False,
                             cnv6_num_outputs=128):
    """Pose network with a shared dilated trunk and two separate heads.

    The three images are concatenated along axis 3 and passed through
    cnv1..cnv5. Two heads with identical structure ('rotation' and
    'translation') are then built on top of cnv5, each under its own
    variable scope.

    Args:
        tgt_image: target image tensor.
        src0_image: first source image tensor.
        src1_image: second source image tensor. All three are presumably
            [batch, height, width, channels] - TODO confirm with callers.
        dropout: if truthy, apply slim.dropout with keep probability 0.7 to
            cnv5 before the heads.
        is_training: forwarded to slim.dropout; only read when `dropout` is
            set.
        se_attention: selects how cnv6 is built in each head:
            True          -> se_block on cnv5, then the dilated cnv6 conv;
            'se_skipadd'  -> relu(cnv5 + se_block(dilated cnv6 conv));
            'se_replace'  -> se_block on cnv5 takes the place of cnv6;
            anything else -> plain dilated cnv6 conv.
        batch_norm: if truthy, slim.batch_norm is the normalizer of the
            conv layers; otherwise no normalizer is used.
        cnv6_num_outputs: number of output channels of the cnv6 conv.

    Returns:
        pose_final: 0.01 * concat([rotation, translation], axis=-1), where
            each part is reshaped to [-1, 2, 3]. Per the original author's
            note the last axis is ordered [rz, ry, rx, tx, ty, tz] - TODO
            confirm the per-axis order.
        A tuple (rotation cnv6, translation cnv6) with each head's cnv6
        tensor.
    """
    num_source = 2
    normalizer_fn = slim.batch_norm if batch_norm else None
    inputs = tf.concat([tgt_image, src0_image, src1_image], axis=3)
    print(">>> [PoseNN] inputs : ", inputs)
    print(">>> [PoseNN] use batch_norm : ", normalizer_fn)
    print(">>> [PoseNN] cnv6_num_outputs = ", cnv6_num_outputs)
    with tf.variable_scope('pose_exp_net') as sc:
        end_points_collection = sc.original_name_scope + '_end_points'
        with slim.arg_scope(
                [slim.conv2d, slim.conv2d_transpose],
                normalizer_fn=normalizer_fn,
                weights_regularizer=slim.l2_regularizer(1e-4),
                activation_fn=tf.nn.relu,
                outputs_collections=end_points_collection):
            # Shared trunk: two strided convs followed by three dilated
            # convs.
            cnv1 = slim.conv2d(inputs, 16, [7, 7], stride=2, scope='cnv1')
            cnv2 = slim.conv2d(cnv1, 32, [5, 5], stride=2, scope='cnv2')
            cnv3 = slim.conv2d(cnv2, 64, [3, 3], rate=2, scope='cnv3')
            cnv4 = slim.conv2d(cnv3, 128, [3, 3], rate=4, scope='cnv4')
            cnv5 = slim.conv2d(cnv4, 256, [3, 3], rate=8, scope='cnv5')
            if dropout:
                cnv5 = slim.dropout(cnv5, 0.7, is_training=is_training)

            # Pose specific layers
            cnv6s = {}
            poses_avg = {}
            with tf.variable_scope('pose'):
                for name in ['rotation', 'translation']:
                    with tf.variable_scope(name):
                        if se_attention is True:  # mode1
                            print(
                                ">>> [PoseNN][%s] use se_attention (insert se_block between cnv5 & cnv6)"
                                % name)
                            # Keep the SE output in a head-local name.
                            # Rebinding cnv5 here would hand the rotation
                            # head's SE output to the translation head on
                            # the next loop iteration instead of the shared
                            # trunk feature.
                            head_input = se_block(
                                cnv5, 'cnv5_se_attention', ratio=8)
                            cnv6 = slim.conv2d(
                                head_input, cnv6_num_outputs, [3, 3],
                                rate=2, scope='cnv6')
                        elif se_attention == 'se_skipadd':  # mode3
                            print(
                                ">>> [PoseNN][%s] use cnv5 + dilated_cnv6_se_attention"
                                % name)
                            cnv6 = slim.conv2d(
                                cnv5, cnv6_num_outputs, [3, 3], rate=2,
                                scope='cnv6')
                            se_cnv6 = se_block(
                                cnv6, 'cnv6_se_attention', ratio=8)
                            # NOTE(review): the add needs se_cnv6 to carry
                            # cnv5's 256 channels; presumably this mode
                            # expects cnv6_num_outputs=256 - confirm.
                            cnv6 = tf.nn.relu(
                                cnv5 + se_cnv6,
                                name="cnv6_se_attention/add_cnv5/relu")
                        elif se_attention == 'se_replace':  # mode2
                            print(
                                ">>> [PoseNN][%s] use se_attention replace with cnv6"
                                % name)
                            cnv6 = se_block(
                                cnv5, 'cnv6_se_attention', ratio=8)
                        else:
                            cnv6 = slim.conv2d(
                                cnv5, cnv6_num_outputs, [3, 3], rate=2,
                                scope='cnv6')

                        cnv7 = slim.conv2d(
                            cnv6, 256, [3, 3], stride=2, scope='cnv7')
                        # Linear 1x1 prediction: 3 values per source image.
                        pred = slim.conv2d(
                            cnv7, 3 * num_source, [1, 1], scope='pred',
                            stride=1, normalizer_fn=None,
                            activation_fn=None)
                        # Average the prediction map over axes 1 and 2.
                        poses_avg[name] = tf.reduce_mean(pred, [1, 2])
                        cnv6s[name] = cnv6

            rot_final = tf.reshape(
                poses_avg['rotation'], [-1, num_source, 3])
            trans_final = tf.reshape(
                poses_avg['translation'], [-1, num_source, 3])
            # Empirically we found that scaling by a small constant
            # facilitates training.
            pose_final = 0.01 * tf.concat([rot_final, trans_final], axis=-1)

            return pose_final, (cnv6s['rotation'], cnv6s['translation'])
def vgg_a(inputs,
          num_classes=1000,
          is_training=True,
          dropout_keep_prob=0.5,
          spatial_squeeze=True,
          scope='vgg_a',
          fc_conv_padding='VALID',
          global_pool=False):
    """Oxford Net VGG 11-Layers version A Example.

    Note: All the fully_connected layers have been transformed to conv2d
    layers. To use in classification mode, resize input to 224x224.

    Args:
        inputs: a tensor of size [batch_size, height, width, channels].
        num_classes: number of predicted classes. If 0 or None, the logits
            layer is omitted and the input features to the logits layer are
            returned instead.
        is_training: whether or not the model is being trained.
        dropout_keep_prob: the probability that activations are kept in the
            dropout layers during training.
        spatial_squeeze: whether or not should squeeze the spatial
            dimensions of the outputs. Useful to remove unnecessary
            dimensions for classification.
        scope: Optional scope for the variables.
        fc_conv_padding: the type of padding to use for the fully connected
            layer that is implemented as a convolutional layer. Use 'SAME'
            padding if you are applying the network in a fully
            convolutional manner and want to get a prediction map
            downsampled by a factor of 32 as an output. Otherwise, the
            output prediction map will be (input / 32) - 6 in case of
            'VALID' padding.
        global_pool: Optional boolean flag. If True, the input to the
            classification layer is avgpooled to size 1x1, for any input
            size. (This is not part of the original VGG architecture.)

    Returns:
        net: the output of the logits layer (if num_classes is a non-zero
            integer), or the input to the logits layer (if num_classes is 0
            or None).
        end_points: a dict of tensors with intermediate activations.
    """
    # One row per conv group: (conv scope, pool scope, repeats, depth).
    # Every group is a run of 3x3 convs followed by a 2x2 max pool.
    conv_groups = (
        ('conv1', 'pool1', 1, 64),
        ('conv2', 'pool2', 1, 128),
        ('conv3', 'pool3', 2, 256),
        ('conv4', 'pool4', 2, 512),
        ('conv5', 'pool5', 2, 512),
    )
    with tf.variable_scope(scope, 'vgg_a', [inputs]) as sc:
        end_points_collection = sc.original_name_scope + '_end_points'
        # Collect outputs for conv2d, fully_connected and max_pool2d.
        with slim.arg_scope(
                [slim.conv2d, slim.max_pool2d],
                outputs_collections=end_points_collection):
            net = inputs
            for conv_scope, pool_scope, repeats, depth in conv_groups:
                net = slim.repeat(
                    net, repeats, slim.conv2d, depth, [3, 3],
                    scope=conv_scope)
                net = slim.max_pool2d(net, [2, 2], scope=pool_scope)

            # Use conv2d instead of fully_connected layers.
            net = slim.conv2d(
                net, 4096, [7, 7], padding=fc_conv_padding, scope='fc6')
            net = slim.dropout(
                net, dropout_keep_prob, is_training=is_training,
                scope='dropout6')
            net = slim.conv2d(net, 4096, [1, 1], scope='fc7')

            # Convert end_points_collection into a end_point dict.
            end_points = utils.convert_collection_to_dict(
                end_points_collection)
            if global_pool:
                net = tf.reduce_mean(
                    net, [1, 2], keep_dims=True, name='global_pool')
                end_points['global_pool'] = net

            # Without classes there is no logits layer: hand back the
            # features computed so far.
            if not num_classes:
                return net, end_points

            net = slim.dropout(
                net, dropout_keep_prob, is_training=is_training,
                scope='dropout7')
            net = slim.conv2d(
                net, num_classes, [1, 1],
                activation_fn=None,
                normalizer_fn=None,
                scope='fc8')
            if spatial_squeeze:
                net = tf.squeeze(net, [1, 2], name='fc8/squeezed')
            end_points[sc.name + '/fc8'] = net
            return net, end_points
def pose_exp_net(tgt_image, src_image_stack, do_exp=True, is_training=True):
    """Build the joint pose / explainability-mask network.

    The target image and the stacked source images are concatenated along
    axis 3 and passed through a shared encoder (cnv1..cnv5). A pose head is
    always built; a mask decoder is built only when `do_exp` is set.

    Args:
        tgt_image: target image tensor.
        src_image_stack: source images stacked along axis 3. The number of
            sources is taken as its static size on axis 3 divided by 3, so
            each source presumably has 3 channels - TODO confirm.
        do_exp: if truthy, build the 'exp' decoder and return its four mask
            tensors; otherwise every mask is None.
        is_training: not read by this function; kept so existing call sites
            keep working.

    Returns:
        pose_final: the averaged pose prediction reshaped to
            [-1, num_source, 6] and scaled by 0.01.
        masks: the list [mask1, mask2, mask3, mask4]; each entry is a conv
            output with num_source * 2 channels, or None when `do_exp` is
            falsy.
        end_points: dict built from this scope's end-points collection.
    """
    inputs = tf.concat([tgt_image, src_image_stack], axis=3)
    num_source = int(src_image_stack.get_shape()[3].value // 3)
    with tf.variable_scope('pose_exp_net') as sc:
        end_points_collection = sc.original_name_scope + '_end_points'
        with slim.arg_scope(
                [slim.conv2d, slim.conv2d_transpose],
                normalizer_fn=None,
                weights_regularizer=slim.l2_regularizer(0.05),
                activation_fn=tf.nn.leaky_relu,
                outputs_collections=end_points_collection):
            # cnv1 to cnv5 are shared between pose and explainability
            # prediction.
            cnv1 = slim.conv2d(inputs, 16, [7, 7], stride=2, scope='cnv1')
            cnv2 = slim.conv2d(cnv1, 32, [5, 5], stride=2, scope='cnv2')
            cnv3 = slim.conv2d(cnv2, 64, [3, 3], stride=2, scope='cnv3')
            cnv4 = slim.conv2d(cnv3, 128, [3, 3], stride=2, scope='cnv4')
            cnv5 = slim.conv2d(cnv4, 256, [3, 3], stride=2, scope='cnv5')

            # Pose specific layers
            with tf.variable_scope('pose'):
                cnv6 = slim.conv2d(cnv5, 256, [3, 3], stride=2, scope='cnv6')
                cnv7 = slim.conv2d(cnv6, 256, [3, 3], stride=2, scope='cnv7')
                # Linear 1x1 prediction: 6 values per source image.
                pose_pred = slim.conv2d(
                    cnv7, 6 * num_source, [1, 1], scope='pred', stride=1,
                    normalizer_fn=None, activation_fn=None)
                pose_avg = tf.reduce_mean(pose_pred, [1, 2])
                # Empirically we found that scaling by a small constant
                # facilitates training.
                pose_final = 0.01 * tf.reshape(pose_avg, [-1, num_source, 6])

            # Exp mask specific layers
            mask1 = mask2 = mask3 = mask4 = None
            if do_exp:
                with tf.variable_scope('exp'):
                    upcnv5 = slim.conv2d_transpose(
                        cnv5, 256, [3, 3], stride=2, scope='upcnv5')

                    upcnv4 = slim.conv2d_transpose(
                        upcnv5, 128, [3, 3], stride=2, scope='upcnv4')
                    mask4 = slim.conv2d(
                        upcnv4, num_source * 2, [3, 3], stride=1,
                        scope='mask4', normalizer_fn=None,
                        activation_fn=None)

                    upcnv3 = slim.conv2d_transpose(
                        upcnv4, 64, [3, 3], stride=2, scope='upcnv3')
                    mask3 = slim.conv2d(
                        upcnv3, num_source * 2, [3, 3], stride=1,
                        scope='mask3', normalizer_fn=None,
                        activation_fn=None)

                    upcnv2 = slim.conv2d_transpose(
                        upcnv3, 32, [5, 5], stride=2, scope='upcnv2')
                    mask2 = slim.conv2d(
                        upcnv2, num_source * 2, [5, 5], stride=1,
                        scope='mask2', normalizer_fn=None,
                        activation_fn=None)

                    upcnv1 = slim.conv2d_transpose(
                        upcnv2, 16, [7, 7], stride=2, scope='upcnv1')
                    mask1 = slim.conv2d(
                        upcnv1, num_source * 2, [7, 7], stride=1,
                        scope='mask1', normalizer_fn=None,
                        activation_fn=None)

            end_points = utils.convert_collection_to_dict(
                end_points_collection)
            return pose_final, [mask1, mask2, mask3, mask4], end_points
def overfeat(inputs,
             num_classes=1000,
             is_training=True,
             dropout_keep_prob=0.5,
             spatial_squeeze=True,
             scope='overfeat'):
    """Contains the model definition for the OverFeat network.

    The definition for the network was obtained from:
        OverFeat: Integrated Recognition, Localization and Detection using
        Convolutional Networks
        Pierre Sermanet, David Eigen, Xiang Zhang, Michael Mathieu, Rob
        Fergus and Yann LeCun, 2014
        http://arxiv.org/abs/1312.6229

    Note: All the fully_connected layers have been transformed to conv2d
    layers. To use in classification mode, resize input to 231x231. To use
    in fully convolutional mode, set spatial_squeeze to false.

    Args:
        inputs: a tensor of size [batch_size, height, width, channels].
        num_classes: number of predicted classes.
        is_training: whether or not the model is being trained.
        dropout_keep_prob: the probability that activations are kept in the
            dropout layers during training.
        spatial_squeeze: whether or not should squeeze the spatial
            dimensions of the outputs. Useful to remove unnecessary
            dimensions for classification.
        scope: Optional scope for the variables.

    Returns:
        net: the output of the final 'fc8' convolution (built without an
            activation function), with axes 1 and 2 squeezed when
            spatial_squeeze is set.
        end_points: the dict built from the end-points collection; when
            spatial_squeeze is set the squeezed output is also stored under
            `<scope name>/fc8`.
    """
    with variable_scope.variable_scope(scope, 'overfeat', [inputs]) as sc:
        # Key the collection on the scope's original name scope, as the
        # other network builders in this file do, rather than on sc.name.
        # Presumably this gives a second instantiation that shares the same
        # variable scope its own collection - confirm against the TF docs.
        end_points_collection = sc.original_name_scope + '_end_points'
        # Collect outputs for conv2d, fully_connected and max_pool2d
        with arg_scope(
                [layers.conv2d, layers_lib.fully_connected,
                 layers_lib.max_pool2d],
                outputs_collections=end_points_collection):
            net = layers.conv2d(
                inputs, 64, [11, 11], 4, padding='VALID', scope='conv1')
            net = layers_lib.max_pool2d(net, [2, 2], scope='pool1')
            net = layers.conv2d(
                net, 256, [5, 5], padding='VALID', scope='conv2')
            net = layers_lib.max_pool2d(net, [2, 2], scope='pool2')
            net = layers.conv2d(net, 512, [3, 3], scope='conv3')
            net = layers.conv2d(net, 1024, [3, 3], scope='conv4')
            net = layers.conv2d(net, 1024, [3, 3], scope='conv5')
            net = layers_lib.max_pool2d(net, [2, 2], scope='pool5')
            with arg_scope(
                    [layers.conv2d],
                    weights_initializer=trunc_normal(0.005),
                    biases_initializer=init_ops.constant_initializer(0.1)):
                # Use conv2d instead of fully_connected layers.
                net = layers.conv2d(
                    net, 3072, [6, 6], padding='VALID', scope='fc6')
                net = layers_lib.dropout(
                    net, dropout_keep_prob, is_training=is_training,
                    scope='dropout6')
                net = layers.conv2d(net, 4096, [1, 1], scope='fc7')
                net = layers_lib.dropout(
                    net, dropout_keep_prob, is_training=is_training,
                    scope='dropout7')
                net = layers.conv2d(
                    net,
                    num_classes, [1, 1],
                    activation_fn=None,
                    normalizer_fn=None,
                    biases_initializer=init_ops.zeros_initializer(),
                    scope='fc8')

            # Convert end_points_collection into a end_point dict.
            end_points = utils.convert_collection_to_dict(
                end_points_collection)
            if spatial_squeeze:
                net = array_ops.squeeze(net, [1, 2], name='fc8/squeezed')
                end_points[sc.name + '/fc8'] = net
            return net, end_points