def xdet_model_fn(features, labels, mode, params):
    """Our model_fn for ResNet to be used with our Estimator."""
    num_anchors_list = labels['num_anchors_list']
    num_feature_layers = len(num_anchors_list)

    shape = labels['targets'][-1]

    glabels = labels['targets'][:num_feature_layers][0]
    gtargets = labels['targets'][num_feature_layers:2 * num_feature_layers][0]
    #gtargets = tf.Print(gtargets, [gtargets], message='gtargets:', summarize=100)
    gscores = labels['targets'][2 * num_feature_layers:3 * num_feature_layers][0]

    with tf.variable_scope(params['model_scope'], default_name=None, values=[features], reuse=tf.AUTO_REUSE):
        backbone = xdet_body.xdet_resnet_v2(params['resnet_size'], params['data_format'])
        multi_merged_feature = backbone(
            inputs=features,
            is_training=(mode == tf.estimator.ModeKeys.TRAIN))

        cls_pred, location_pred = xdet_body.xdet_head(
            multi_merged_feature, params['num_classes'], num_anchors_list[0],
            (mode == tf.estimator.ModeKeys.TRAIN),
            data_format=params['data_format'])

    if params['data_format'] == 'channels_first':
        cls_pred = tf.transpose(cls_pred, [0, 2, 3, 1])
        location_pred = tf.transpose(location_pred, [0, 2, 3, 1])

    bboxes_pred = labels['decode_fn'](location_pred)
    #(tf.reshape(location_pred, tf.shape(location_pred).as_list()[0:-1] + [-1, 4]))

    # flatten the per-anchor predictions and targets
    cls_pred = tf.reshape(cls_pred, [-1, params['num_classes']])
    location_pred = tf.reshape(location_pred, [-1, 4])
    glabels = tf.reshape(glabels, [-1])
    gscores = tf.reshape(gscores, [-1])
    gtargets = tf.reshape(gtargets, [-1, 4])

    # raw mask: positives have overlap > 0.5, negatives have overlap < 0.3
    # each positive example has exactly one label
    positive_mask = glabels > 0  #tf.logical_and(glabels > 0, gscores > params['match_threshold'])
    fpositive_mask = tf.cast(positive_mask, tf.float32)
    n_positives = tf.reduce_sum(fpositive_mask)

    batch_glabels = tf.reshape(glabels, [tf.shape(features)[0], -1])
    batch_n_positives = tf.count_nonzero(batch_glabels, -1)

    batch_negtive_mask = tf.equal(batch_glabels, 0)
    batch_n_negtives = tf.count_nonzero(batch_negtive_mask, -1)

    # select at most negative_ratio * n_positives negatives per image
    batch_n_neg_select = tf.cast(params['negative_ratio'] * tf.cast(batch_n_positives, tf.float32), tf.int32)
    batch_n_neg_select = tf.minimum(batch_n_neg_select, tf.cast(batch_n_negtives, tf.int32))

    # hard negative mining for classification
    predictions_for_bg = tf.nn.softmax(tf.reshape(cls_pred, [tf.shape(features)[0], -1, params['num_classes']]))[:, :, 0]
    # negate the background probability so that top_k picks the hardest negatives
    # (those with the lowest background confidence); positives are pushed to -1
    prob_for_negtives = tf.where(batch_negtive_mask,
                                 0. - predictions_for_bg,
                                 # ignore all the positives
                                 0. - tf.ones_like(predictions_for_bg))
    topk_prob_for_bg, _ = tf.nn.top_k(prob_for_negtives, k=tf.shape(prob_for_negtives)[1])
    # per-image threshold: the k-th highest score among the negatives
    score_at_k = tf.gather_nd(topk_prob_for_bg,
                              tf.stack([tf.range(tf.shape(features)[0]), batch_n_neg_select - 1], axis=-1))

    selected_neg_mask = prob_for_negtives >= tf.expand_dims(score_at_k, axis=-1)

    negtive_mask = tf.reshape(tf.logical_and(batch_negtive_mask, selected_neg_mask), [-1])
    #negtive_mask = tf.logical_and(tf.equal(glabels, 0), gscores > 0.)
    #negtive_mask = tf.logical_and(tf.logical_and(tf.logical_not(positive_mask), gscores < params['neg_threshold']), gscores > 0.)
    #negtive_mask = tf.logical_and(gscores < params['neg_threshold'], tf.logical_not(positive_mask))

    # # randomly select negative examples for classification
    # selected_neg_mask = tf.random_uniform(tf.shape(gscores), minval=0, maxval=1.) < tf.where(
    #     tf.greater(n_negtives, 0),
    #     tf.divide(tf.cast(n_neg_to_select, tf.float32), n_negtives),
    #     tf.zeros_like(tf.cast(n_neg_to_select, tf.float32)),
    #     name='rand_select_negtive')

    # include both the selected negatives and all positive examples
    final_mask = tf.stop_gradient(tf.logical_or(negtive_mask, positive_mask))
    total_examples = tf.reduce_sum(tf.cast(final_mask, tf.float32))

    # add mask for glabels and cls_pred here
    glabels = tf.boolean_mask(tf.clip_by_value(glabels, 0, FLAGS.num_classes), tf.stop_gradient(final_mask))
    cls_pred = tf.boolean_mask(cls_pred, tf.stop_gradient(final_mask))
    location_pred = tf.boolean_mask(location_pred, tf.stop_gradient(positive_mask))
    gtargets = tf.boolean_mask(gtargets, tf.stop_gradient(positive_mask))

    predictions = {
        'classes': tf.argmax(cls_pred, axis=-1),
        'probabilities': tf.reduce_max(tf.nn.softmax(cls_pred, name='softmax_tensor'), axis=-1),
        'bboxes_predict': tf.reshape(bboxes_pred, [-1, 4])}

    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)

    # Calculate loss, which includes softmax cross entropy and L2 regularization.
    cross_entropy = tf.cond(n_positives > 0.,
                            lambda: tf.losses.sparse_softmax_cross_entropy(labels=glabels, logits=cls_pred),
                            lambda: 0.)
    #cross_entropy = tf.losses.sparse_softmax_cross_entropy(labels=glabels, logits=cls_pred)

    # Create a tensor named cross_entropy for logging purposes.
    tf.identity(cross_entropy, name='cross_entropy_loss')
    tf.summary.scalar('cross_entropy_loss', cross_entropy)

    loc_loss = tf.cond(n_positives > 0.,
                       lambda: modified_smooth_l1(location_pred, tf.stop_gradient(gtargets), sigma=1.),
                       lambda: tf.zeros_like(location_pred))
    #loc_loss = modified_smooth_l1(location_pred, tf.stop_gradient(gtargets))

    loc_loss = tf.reduce_mean(tf.reduce_sum(loc_loss, axis=-1))
    loc_loss = tf.identity(loc_loss, name='location_loss')
    tf.summary.scalar('location_loss', loc_loss)
    tf.losses.add_loss(loc_loss)

    # Add weight decay to the loss. We exclude the batch norm variables because
    # doing so leads to a small improvement in accuracy.
    loss = 1.2 * (cross_entropy + loc_loss) + params['weight_decay'] * tf.add_n(
        [tf.nn.l2_loss(v) for v in tf.trainable_variables() if 'batch_normalization' not in v.name])
    total_loss = tf.identity(loss, name='total_loss')

    if mode == tf.estimator.ModeKeys.TRAIN:
        global_step = tf.train.get_or_create_global_step()

        lr_values = [params['learning_rate'] * decay for decay in params['lr_decay_factors']]
        learning_rate = tf.train.piecewise_constant(tf.cast(global_step, tf.int32),
                                                    [int(_) for _ in params['decay_boundaries']],
                                                    lr_values)
        truncated_learning_rate = tf.maximum(learning_rate,
                                             tf.constant(params['end_learning_rate'], dtype=learning_rate.dtype))
        # Create a tensor named learning_rate for logging purposes.
        tf.identity(truncated_learning_rate, name='learning_rate')
        tf.summary.scalar('learning_rate', truncated_learning_rate)

        optimizer = tf.train.MomentumOptimizer(learning_rate=truncated_learning_rate,
                                               momentum=params['momentum'])

        # Batch norm requires update_ops to be added as a train_op dependency.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):
            train_op = optimizer.minimize(loss, global_step)
    else:
        train_op = None

    cls_accuracy = tf.metrics.accuracy(glabels, predictions['classes'])
    metrics = {'cls_accuracy': cls_accuracy}

    # Create a tensor named cls_accuracy for logging purposes.
    tf.identity(cls_accuracy[1], name='cls_accuracy')
    tf.summary.scalar('cls_accuracy', cls_accuracy[1])

    return tf.estimator.EstimatorSpec(
        mode=mode,
        predictions=predictions,
        loss=loss,
        train_op=train_op,
        eval_metric_ops=metrics,
        scaffold=tf.train.Scaffold(init_fn=train_helper.get_init_fn_for_scaffold(FLAGS)))
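# Note: `modified_smooth_l1` is defined elsewhere in this repo and is not shown in this
# section. The sketch below is only an assumption of what it computes, following the
# standard Fast R-CNN smooth L1 with a `sigma` scaling term; the repo's actual
# implementation (signature and weighting) may differ.
def modified_smooth_l1(bbox_pred, bbox_targets,
                       bbox_inside_weights=1., bbox_outside_weights=1., sigma=1.):
    """Element-wise smooth L1 (hypothetical sketch):
       0.5 * (sigma * x)^2        if |x| < 1 / sigma^2
       |x| - 0.5 / sigma^2        otherwise
    """
    sigma2 = sigma * sigma
    diff = tf.multiply(bbox_inside_weights, tf.subtract(bbox_pred, bbox_targets))

    # 1.0 where the quadratic branch applies, 0.0 where the linear branch applies
    smooth_l1_sign = tf.cast(tf.less(tf.abs(diff), 1.0 / sigma2), tf.float32)
    smooth_l1_option1 = tf.multiply(tf.multiply(diff, diff), 0.5 * sigma2)
    smooth_l1_option2 = tf.subtract(tf.abs(diff), 0.5 / sigma2)
    smooth_l1_result = tf.add(tf.multiply(smooth_l1_option1, smooth_l1_sign),
                              tf.multiply(smooth_l1_option2, 1.0 - smooth_l1_sign))

    return tf.multiply(bbox_outside_weights, smooth_l1_result)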
def xdet_model_fn(features, labels, mode, params):
    """Our model_fn for ResNet to be used with our Estimator."""
    num_anchors_list = labels['num_anchors_list']
    num_feature_layers = len(num_anchors_list)

    shape = labels['targets'][-1]
    if mode != tf.estimator.ModeKeys.TRAIN:
        org_image = labels['targets'][-2]
        isdifficult = labels['targets'][-3]
        bbox_img = labels['targets'][-4]
        gbboxes_raw = labels['targets'][-5]
        glabels_raw = labels['targets'][-6]

    glabels = labels['targets'][:num_feature_layers][0]
    gtargets = labels['targets'][num_feature_layers:2 * num_feature_layers][0]
    gscores = labels['targets'][2 * num_feature_layers:3 * num_feature_layers][0]

    with tf.variable_scope(params['model_scope'], default_name=None, values=[features], reuse=tf.AUTO_REUSE):
        backbone = xdet_body.xdet_resnet_v2(params['resnet_size'], params['data_format'])
        multi_merged_feature = backbone(
            inputs=features,
            is_training=(mode == tf.estimator.ModeKeys.TRAIN))

        cls_pred, location_pred = xdet_body.xdet_head(
            multi_merged_feature, params['num_classes'], num_anchors_list[0],
            (mode == tf.estimator.ModeKeys.TRAIN),
            data_format=params['data_format'])

    if params['data_format'] == 'channels_first':
        cls_pred = tf.transpose(cls_pred, [0, 2, 3, 1])
        location_pred = tf.transpose(location_pred, [0, 2, 3, 1])
        #org_image = tf.transpose(org_image, [0, 2, 3, 1])

    # batch size is 1
    shape = tf.squeeze(shape, axis=0)
    glabels = tf.squeeze(glabels, axis=0)
    gtargets = tf.squeeze(gtargets, axis=0)
    gscores = tf.squeeze(gscores, axis=0)
    cls_pred = tf.squeeze(cls_pred, axis=0)
    location_pred = tf.squeeze(location_pred, axis=0)
    if mode != tf.estimator.ModeKeys.TRAIN:
        org_image = tf.squeeze(org_image, axis=0)
        isdifficult = tf.squeeze(isdifficult, axis=0)
        gbboxes_raw = tf.squeeze(gbboxes_raw, axis=0)
        glabels_raw = tf.squeeze(glabels_raw, axis=0)
        bbox_img = tf.squeeze(bbox_img, axis=0)

    bboxes_pred = labels['decode_fn'](location_pred)
    #(tf.reshape(location_pred, location_pred.get_shape().as_list()[:-1] + [-1, 4]))

    eval_ops, save_image_op = bboxes_eval(org_image, shape, bbox_img, cls_pred, bboxes_pred,
                                          glabels_raw, gbboxes_raw, isdifficult, params['num_classes'])
    _ = tf.identity(save_image_op, name='save_image_with_bboxes_op')

    # flatten the per-anchor predictions and targets
    cls_pred = tf.reshape(cls_pred, [-1, params['num_classes']])
    location_pred = tf.reshape(location_pred, [-1, 4])
    glabels = tf.reshape(glabels, [-1])
    gscores = tf.reshape(gscores, [-1])
    gtargets = tf.reshape(gtargets, [-1, 4])

    # raw mask: positives have overlap > 0.5, negatives have overlap < 0.3
    # each positive example has exactly one label
    positive_mask = glabels > 0  #tf.logical_and(glabels > 0, gscores > 0.)
    fpositive_mask = tf.cast(positive_mask, tf.float32)
    n_positives = tf.reduce_sum(fpositive_mask)

    # negative examples are those whose max overlap is still lower than neg_threshold;
    # note that some positives may also have a lower jaccard overlap.
    # anchors with a gscore of 0 were either ignored during anchor encoding or have
    # no overlap with any ground truth box.
    #negtive_mask = tf.logical_and(tf.logical_and(tf.logical_not(tf.logical_or(positive_mask, glabels < 0)), gscores < params['neg_threshold']), gscores > 0.)
    negtive_mask = tf.logical_and(tf.equal(glabels, 0), gscores > 0.)
    #negtive_mask = tf.logical_and(tf.logical_and(tf.logical_not(positive_mask), gscores < params['neg_threshold']), gscores > 0.)
    #negtive_mask = tf.logical_and(gscores < params['neg_threshold'], tf.logical_not(positive_mask))
    fnegtive_mask = tf.cast(negtive_mask, tf.float32)
    n_negtives = tf.reduce_sum(fnegtive_mask)

    # select at most negative_ratio * n_positives negatives
    n_neg_to_select = tf.cast(params['negative_ratio'] * n_positives, tf.int32)
    n_neg_to_select = tf.minimum(n_neg_to_select, tf.cast(n_negtives, tf.int32))

    # hard negative mining for classification
    predictions_for_bg = tf.nn.softmax(cls_pred)[:, 0]
    #negtive_mask = tf.Print(negtive_mask, [n_positives])
    # negate the background probability so that top_k picks the hardest negatives
    # (those with the lowest background confidence); positives are pushed to -1
    prob_for_negtives = tf.where(negtive_mask,
                                 0. - predictions_for_bg,
                                 # ignore all the positives
                                 0. - tf.ones_like(predictions_for_bg))
    topk_prob_for_bg, _ = tf.nn.top_k(prob_for_negtives, k=n_neg_to_select)
    selected_neg_mask = prob_for_negtives > topk_prob_for_bg[-1]

    # # randomly select negative examples for classification
    # selected_neg_mask = tf.random_uniform(tf.shape(gscores), minval=0, maxval=1.) < tf.where(
    #     tf.greater(n_negtives, 0),
    #     tf.divide(tf.cast(n_neg_to_select, tf.float32), n_negtives),
    #     tf.zeros_like(tf.cast(n_neg_to_select, tf.float32)),
    #     name='rand_select_negtive')

    # include both the selected negatives and all positive examples
    final_mask = tf.stop_gradient(
        tf.logical_or(tf.logical_and(negtive_mask, selected_neg_mask), positive_mask))
    total_examples = tf.reduce_sum(tf.cast(final_mask, tf.float32))

    # add mask for glabels and cls_pred here
    glabels = tf.boolean_mask(tf.clip_by_value(glabels, 0, FLAGS.num_classes), tf.stop_gradient(final_mask))
    cls_pred = tf.boolean_mask(cls_pred, tf.stop_gradient(final_mask))
    location_pred = tf.boolean_mask(location_pred, tf.stop_gradient(positive_mask))
    gtargets = tf.boolean_mask(gtargets, tf.stop_gradient(positive_mask))

    # Calculate loss, which includes softmax cross entropy and L2 regularization.
    cross_entropy = tf.cond(n_positives > 0.,
                            lambda: tf.losses.sparse_softmax_cross_entropy(labels=glabels, logits=cls_pred),
                            lambda: 0.)
    #cross_entropy = tf.losses.sparse_softmax_cross_entropy(labels=glabels, logits=cls_pred)

    # Create a tensor named cross_entropy for logging purposes.
    tf.identity(cross_entropy, name='cross_entropy_loss')
    tf.summary.scalar('cross_entropy_loss', cross_entropy)

    loc_loss = tf.cond(n_positives > 0.,
                       lambda: modified_smooth_l1(location_pred, tf.stop_gradient(gtargets), sigma=1.),
                       lambda: tf.zeros_like(location_pred))
    #loc_loss = modified_smooth_l1(location_pred, tf.stop_gradient(gtargets))

    loc_loss = tf.reduce_mean(tf.reduce_sum(loc_loss, axis=-1))
    loc_loss = tf.identity(loc_loss, name='location_loss')
    tf.summary.scalar('location_loss', loc_loss)
    tf.losses.add_loss(loc_loss)

    with tf.control_dependencies([save_image_op]):
        # Add weight decay to the loss. We exclude the batch norm variables because
        # doing so leads to a small improvement in accuracy.
        loss = 1.2 * (cross_entropy + loc_loss) + params['weight_decay'] * tf.add_n(
            [tf.nn.l2_loss(v) for v in tf.trainable_variables() if 'batch_normalization' not in v.name])
        total_loss = tf.identity(loss, name='total_loss')

    predictions = {
        'classes': tf.argmax(cls_pred, axis=-1),
        'probabilities': tf.reduce_max(tf.nn.softmax(cls_pred, name='softmax_tensor'), axis=-1),
        'bboxes_predict': tf.reshape(bboxes_pred, [-1, 4])}

    summary_hook = tf.train.SummarySaverHook(save_secs=FLAGS.save_summary_steps,
                                             output_dir=FLAGS.model_dir,
                                             summary_op=tf.summary.merge_all())
    if mode == tf.estimator.ModeKeys.EVAL:
        return tf.estimator.EstimatorSpec(
            mode=mode,
            predictions=predictions,
            evaluation_hooks=[summary_hook],
            loss=loss,
            eval_metric_ops=eval_ops)
    else:
        raise ValueError('This script only supports eval mode!')
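# A minimal sketch of how an eval-only model_fn like the one above might be wired into
# tf.estimator.Estimator. The `eval_input_pipeline` function and the parameter values
# below are hypothetical placeholders for illustration only; the repo's actual flags,
# params, and input_fn may differ.
def main(_):
    run_config = tf.estimator.RunConfig(model_dir=FLAGS.model_dir)
    xdetector = tf.estimator.Estimator(
        model_fn=xdet_model_fn,
        config=run_config,
        params={
            'resnet_size': 50,               # hypothetical values for illustration only
            'data_format': 'channels_last',
            'model_scope': 'xdet_resnet',
            'num_classes': 21,
            'negative_ratio': 3.,
            'neg_threshold': 0.4,
            'weight_decay': 5e-4,
        })
    # `eval_input_pipeline` (hypothetical) must yield (features, labels) in the layout
    # this model_fn expects, including labels['targets'], labels['num_anchors_list'],
    # and labels['decode_fn'], with batch size 1.
    eval_results = xdetector.evaluate(
        input_fn=lambda: eval_input_pipeline(mode=tf.estimator.ModeKeys.EVAL))
    print(eval_results)


if __name__ == '__main__':
    tf.logging.set_verbosity(tf.logging.INFO)
    tf.app.run()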