def safe_log(tensor, eps=1e-16):
    """Element-wise logarithm that never evaluates log on tiny entries.

    Args:
        tensor: input tensor.
        eps: entries strictly smaller than this value are treated as zero.

    Returns:
        A tensor equal to -1e8 where `tensor < eps` and to `tf.log(tensor)`
        everywhere else.
    """
    below_eps = tf.less(tensor, eps)
    # tf.where evaluates both branches, so replace the masked entries by 1
    # before taking the log to keep -inf/NaN out of the graph.
    guarded = tf.where(below_eps, tf.ones_like(tensor), tensor)
    floor_value = tf.zeros_like(guarded) - 1e8
    return tf.where(below_eps, floor_value, tf.log(guarded))
def apply_gradients(self, grads_and_vars, global_step=None, name=None):
    """Build the op that applies momentum updates with layer-wise scaling.

    For every (gradient, variable) pair a non-trainable "<var>/Momentum"
    slot is created. Weight decay is added to the gradient when
    `self._use_weight_decay(name)` says so, and the step is scaled by a
    trust ratio `eeta * ||w|| / ||direction||` when
    `self._do_layer_adaptation(name)` says so.

    Args:
        grads_and_vars: iterable of (gradient, variable) pairs. Pairs in
            which either element is None are skipped.
        global_step: variable to increment; defaults to the graph's global
            step (created if missing).
        name: optional name for the returned grouped op.

    Returns:
        A grouped op that updates every variable, its momentum slot and the
        global step. When no pair is usable the group is empty and the
        global step is left untouched.
    """
    if global_step is None:
        global_step = tf.train.get_or_create_global_step()
    new_global_step = global_step + 1

    def _trust_ratio(param, direction, param_name):
        """Return eeta*||param||/||direction||, or 1.0 when not applicable."""
        if not self._do_layer_adaptation(param_name):
            return 1.0
        w_norm = tf.norm(param, ord=2)
        d_norm = tf.norm(direction, ord=2)
        # Fall back to 1.0 whenever either norm is zero to avoid 0/0.
        return tf.where(
            tf.greater(w_norm, 0),
            tf.where(tf.greater(d_norm, 0),
                     (self.eeta * w_norm / d_norm), 1.0),
            1.0)

    assignments = []
    for (grad, param) in grads_and_vars:
        if grad is None or param is None:
            continue

        param_name = param.op.name
        v = tf.get_variable(
            name=param_name + "/Momentum",
            shape=param.shape.as_list(),
            dtype=tf.float32,
            trainable=False,
            initializer=tf.zeros_initializer())

        if self._use_weight_decay(param_name):
            grad += self.weight_decay * param

        if self.classic_momentum:
            # Classic momentum: the learning rate (times the trust ratio
            # of the raw gradient) is folded into the velocity.
            scaled_lr = self.learning_rate * _trust_ratio(
                param, grad, param_name)
            next_v = tf.multiply(self.momentum, v) + scaled_lr * grad
            if self.use_nesterov:
                update = tf.multiply(self.momentum, next_v) + scaled_lr * grad
            else:
                update = next_v
            next_param = param - update
        else:
            # Otherwise the velocity accumulates raw gradients and the
            # trust ratio is computed from the final update direction.
            next_v = tf.multiply(self.momentum, v) + grad
            if self.use_nesterov:
                update = tf.multiply(self.momentum, next_v) + grad
            else:
                update = next_v
            scaled_lr = _trust_ratio(
                param, update, param_name) * self.learning_rate
            next_param = param - scaled_lr * update

        assignments.extend([param.assign(next_param), v.assign(next_v)])

    # One global-step assignment is enough: every per-variable assignment
    # would write the same `new_global_step` value. It is only added when
    # at least one variable is updated, so an empty input stays a no-op.
    if assignments:
        assignments.append(global_step.assign(new_global_step))
    return tf.group(*assignments, name=name)
def body(self,
         features,
         decode_step=None,
         cache=None,
         decoding_stats=None,
         add_summary=True):
    """Run the optional encoder and the decoder, then build the loss.

    Args:
        features: dict holding "targets" and optionally "inputs",
            "token_bias_inputs" and "token_bias_targets".
        decode_step: forwarded to the decoder layers; reset to None unless
            `hparams.fast_decode` is set.
        cache: optional dict forwarded to the decoder layers. Key -1 stores
            the encoder output so the encoder is built only once.
        decoding_stats: forwarded to the decoder layers.
        add_summary: whether scalar summaries of the loss terms are emitted.

    Returns:
        The logits alone when `self.is_decode` is true. Otherwise a tuple
        `(logits, loss_dict)`: the logits get two singleton axes inserted
        before their last axis, and `loss_dict` holds "training" plus
        "extra_loss" when the layers reported additional losses.
    """
    hparams = self.hparams
    encoder_output = None
    aux_losses = []
    padding_bias = None
    if not hparams.fast_decode:
        decode_step = None

    if "inputs" in features:
        source = features["inputs"]
        # Keep the first two axes and collapse the rest into hidden_size.
        source = tf.reshape(
            source, utils.shape_list(source)[:2] + [self.hidden_size])
        # The padding bias is only used by seq2seq models.
        padding_bias = utils.embedding_to_padding(source)
        source_shape = utils.shape_list(source)
        if hparams.input_dropout:
            # Zero out randomly chosen positions of the input.
            dropped = tf.random.uniform(source_shape) < hparams.input_dropout
            source = tf.where(dropped, tf.zeros_like(source), source)
        if hparams.add_timing_signal:
            source += utils.get_timing_signal_1d(hparams.max_length,
                                                 self.hidden_size)
        if cache is not None and -1 in cache:
            # Reuse the encoder output computed on an earlier call.
            encoder_output = cache[-1]
        else:
            encoder_output = utils.transformer_encoder_layers(
                inputs=source,
                num_layers=self.num_encoder_layers,
                hparams=hparams,
                losses=aux_losses,
                name="encoder",
                token_bias=features.get("token_bias_inputs"),
                padding_bias=padding_bias)
            if cache is not None:
                cache[-1] = encoder_output

    targets = tf.to_int32(features["targets"])
    # Keep only the first two axes of the targets.
    targets = tf.reshape(targets, utils.shape_list(targets)[:2])
    # Clamp targets to max_target_length.
    targets = targets[:, :hparams.max_target_length]
    if self.is_decode:
        targets = self.process_partial_targets_decoding(targets)

    decoder_output = utils.transformer_decoder_layers(
        inputs=self.prepare_decoder(targets),
        num_layers=self.num_decoder_layers,
        hparams=hparams,
        encoder_output=encoder_output,
        decode_step=decode_step,
        losses=aux_losses,
        cache=cache,
        name="decoder",
        decoding_stats=decoding_stats,
        token_bias_inputs=features.get("token_bias_inputs"),
        token_bias_targets=features.get("token_bias_targets"),
        padding_bias=padding_bias)
    logits = self.produce_output(decoder_output)

    # Decoding only needs the raw logits.
    if self.is_decode:
        return logits

    # Cross entropy averaged over the non-padding (non-zero) targets.
    one_hot_targets = tf.one_hot(
        tf.cast(targets, dtype=tf.int32), self.vocab_size)
    x_entropy = tf.nn.softmax_cross_entropy_with_logits_v2(
        labels=one_hot_targets, logits=logits)
    weights = tf.to_float(tf.not_equal(targets, 0))
    loss = tf.reduce_sum(x_entropy * weights) / tf.reduce_sum(weights)
    if add_summary:
        tf.summary.scalar("losses/weight", tf.reduce_sum(weights))
        tf.summary.scalar("losses/x_entropy",
                          tf.reduce_sum(x_entropy * weights))

    loss_dict = {"training": loss}
    if aux_losses:
        loss_dict["extra_loss"] = tf.add_n(aux_losses)

    # Hack for T2T metrics: insert two singleton axes before the last one.
    logits = tf.reshape(
        logits,
        utils.shape_list(logits)[:2] + [1, 1] + utils.shape_list(logits)[-1:])
    return logits, loss_dict
def logmarglike_threetransfergaussians(
    ells,  # (nobj, )
    y,  # (nobj, dy)
    yinvvar,  # (nobj, dy)
    M_T,  # (nobj, dt, dy)
    z,  # (nobj, dz)
    zinvvar,  # (nobj, dz)
    R_T,  # (nobj, dt, dz)
    mu,  # (..., dt)
    muinvvar,  # (..., dt)
):
    """
    Fit a linear model to three Gaussian data sets.

    The three data sets are `y` (design matrix `M_T`), `z` (design matrix
    `R_T`, scaled by `ells`) and `mu` (identity design matrix). Entries
    whose inverse variance is not positive are excluded from the
    normalisation terms.

    Parameters
    ----------
    ells : ndarray (nobj, )
        scaling applied to the `z` design matrix
    y, yinvvar : ndarray (nobj, n_pix_y)
        data and data inverse variances
    M_T : ndarray (nobj, n_components, n_pix_y)
        design matrix of linear model for y
    z, zinvvar : ndarray (nobj, n_pix_z)
        data and data inverse variances for z
    R_T : ndarray (nobj, n_components, n_pix_z)
        design matrix of linear model for z
    mu, muinvvar : ndarray (..., n_components)
        means and inverse variances of the Gaussian term acting directly
        on the parameters

    Returns
    -------
    logfml : ndarray (nobj, )
        log likelihood values with parameters marginalised and at best fit
    theta_map : ndarray (nobj, ndim)
        Best fit MAP parameters
    theta_cov : ndarray (nobj, ndim, ndim)
        Parameter covariance
    """
    log2pi = tf.cast(tf.math.log(2.0 * np.pi), T)
    # Keep the integer sizes for shape arguments; tf.eye / tf.ones expect
    # integer dimensions, the float copy is only used in arithmetic.
    num_components = tf.shape(M_T)[-2]
    nt = tf.cast(num_components, T)
    nobj = tf.shape(y)[0]
    # Number of valid (positive inverse variance) data points per object.
    # Counting on the boolean mask directly gives the true count;
    # count_nonzero(tf.where(mask)) would count non-zero index coordinates.
    ny = tf.cast(tf.math.count_nonzero(yinvvar > 0, axis=-1), T)
    nz = tf.cast(tf.math.count_nonzero(zinvvar > 0, axis=-1), T)
    nm = tf.cast(tf.math.count_nonzero(muinvvar > 0, axis=-1), T)
    M = tf.transpose(M_T, [0, 2, 1])  # (nobj, dy, dt)
    R = tf.transpose(R_T, [0, 2, 1])  # (nobj, dz, dt)
    Hbar = (
        ells[:, None, None] ** 2 * tf.matmul(R_T, R * zinvvar[..., :, None])
        + tf.matmul(M_T, M * yinvvar[..., :, None])
        # diag(muinvvar), broadcast over objects.
        + tf.eye(num_components, dtype=T)[None, :, :]
        * tf.ones((nobj, 1, 1), dtype=T)
        * muinvvar[..., :, None]
    )  # (nobj, dt, dt)
    etabar = (
        ells[:, None] * tf.reduce_sum(R_T * (z * zinvvar)[..., None, :], axis=-1)
        + tf.reduce_sum(M_T * (y * yinvvar)[..., None, :], axis=-1)
        # The design matrix of the mu term is the identity (it matches the
        # diag(muinvvar) term in Hbar), so its contribution is element-wise
        # and must not be summed over components.
        + mu * muinvvar
    )  # (nobj, dt)
    theta_map = tf.linalg.solve(Hbar, etabar[..., None])[..., 0]  # (nobj, dt)
    theta_cov = tf.linalg.inv(Hbar)
    # Sum of log inverse variances over the valid entries only.
    logdetH = (
        tf.reduce_sum(
            tf.where(zinvvar > 0, tf.math.log(zinvvar), zinvvar * 0), axis=-1
        )
        + tf.reduce_sum(
            tf.where(yinvvar > 0, tf.math.log(yinvvar), yinvvar * 0), axis=-1
        )
        + tf.reduce_sum(
            tf.where(muinvvar > 0, tf.math.log(muinvvar), muinvvar * 0), axis=-1
        )
    )
    xi1 = -0.5 * (
        (ny + nz + nm) * log2pi
        - logdetH
        + tf.reduce_sum(y * y * yinvvar, axis=-1)
        + tf.reduce_sum(z * z * zinvvar, axis=-1)
        + tf.reduce_sum(mu * mu * muinvvar, axis=-1)
    )
    logdetHbar = tf.linalg.logdet(Hbar)
    xi2 = -0.5 * (
        nt * log2pi - logdetHbar + tf.reduce_sum(etabar * theta_map, axis=-1)
    )
    logfml = xi1 - xi2
    return logfml, theta_map, theta_cov
def solarize(image, threshold=128):
    """Invert every pixel that is not below `threshold`.

    Pixels with a value below `threshold` are kept unchanged; every other
    pixel `p` is replaced by `255 - p`.
    """
    keep_mask = image < threshold
    inverted = 255 - image
    return tf.where(keep_mask, image, inverted)
def compute_loss(self, y_true, y_pred):
    """Compute multibox loss.

    # Arguments
        y_true: Ground truth targets,
            tensor of shape (?, num_boxes, 4 + num_classes + 8),
            priors in ground truth are fictitious,
            y_true[:, :, -8] has 1 if prior should be penalized
            or in other words is assigned to some ground truth box,
            y_true[:, :, -7:] are all 0.
        y_pred: Predicted logits,
            tensor of shape (?, num_boxes, 4 + num_classes + 8).

    # Returns
        loss: Loss for prediction, tensor of shape (?,).
    """
    batch_size = tf.shape(y_true)[0]
    num_boxes = tf.to_float(tf.shape(y_true)[1])
    # 1 for priors assigned to a ground truth box, 0 otherwise.
    positive_mask = y_true[:, :, -8]

    # Per-prior confidence and localisation losses.
    conf_loss = self._softmax_loss(y_true[:, :, 4:-8], y_pred[:, :, 4:-8])
    loc_loss = self._l1_smooth_loss(y_true[:, :, :4], y_pred[:, :, :4])

    # Positive priors contribute both loss terms.
    num_pos = tf.reduce_sum(positive_mask, axis=-1)
    pos_loc_loss = tf.reduce_sum(loc_loss * positive_mask, axis=1)
    pos_conf_loss = tf.reduce_sum(conf_loss * positive_mask, axis=1)

    # Negatives are penalised on confidence only. The number of hard
    # negatives is shared across the batch: the smallest positive
    # per-image budget, or `negatives_for_hard` when no image has one.
    neg_budget = tf.minimum(self.neg_pos_ratio * num_pos,
                            num_boxes - num_pos)
    any_budget = tf.to_float(tf.reduce_any(tf.greater(neg_budget, 0)))
    fallback = [(1 - any_budget) * self.negatives_for_hard]
    candidates = tf.concat(axis=0, values=[neg_budget, fallback])
    num_neg_batch = tf.to_int32(
        tf.reduce_min(
            tf.boolean_mask(candidates, tf.greater(candidates, 0))))

    # Rank non-positive priors by their highest non-background confidence.
    first_class = 4 + self.background_label_id + 1
    last_class = first_class + self.num_classes - 1
    max_confs = tf.reduce_max(y_pred[:, :, first_class:last_class], axis=2)
    _, hard_idx = tf.nn.top_k(max_confs * (1 - positive_mask),
                              k=num_neg_batch)

    # Turn (image, prior) pairs into indices of the flattened loss tensor.
    row_idx = tf.tile(
        tf.expand_dims(tf.range(0, batch_size), 1), (1, num_neg_batch))
    flat_idx = (tf.reshape(row_idx, [-1]) * tf.to_int32(num_boxes) +
                tf.reshape(hard_idx, [-1]))
    neg_conf_loss = tf.gather(tf.reshape(conf_loss, [-1]), flat_idx)
    neg_conf_loss = tf.reduce_sum(
        tf.reshape(neg_conf_loss, [batch_size, num_neg_batch]), axis=1)

    # Confidence loss is normalised by positives + negatives; the
    # localisation loss by positives only (at least 1 to avoid 0/0).
    total_loss = ((pos_conf_loss + neg_conf_loss) /
                  (num_pos + tf.to_float(num_neg_batch)))
    safe_num_pos = tf.where(
        tf.not_equal(num_pos, 0), num_pos, tf.ones_like(num_pos))
    return total_loss + (self.alpha * pos_loc_loss) / safe_num_pos
def decode(self, serialized_example):
    """Decode the serialized example.

    Args:
        serialized_example: a single serialized tf.Example string.

    Returns:
        decoded_tensors: a dictionary of tensors with the following fields:
            - image: a uint8 tensor of shape [None, None, 3].
            - source_id: a string scalar tensor.
            - height: an integer scalar tensor.
            - width: an integer scalar tensor.
            - groundtruth_classes: a int64 tensor of shape [None].
            - groundtruth_is_crowd: a bool tensor of shape [None].
            - groundtruth_area: a float32 tensor of shape [None].
            - groundtruth_boxes: a float32 tensor of shape [None, 4].
            - groundtruth_instance_masks: a float32 tensor of shape
                [None, None, None] (only when masks are included).
            - groundtruth_instance_masks_png: a string tensor of shape
                [None] (only when masks are included).
    """
    parsed_tensors = tf.io.parse_single_example(serialized_example,
                                                self._keys_to_features)
    # Densify sparse features; strings are filled with '', the rest with 0.
    for key, value in list(parsed_tensors.items()):
        if not isinstance(value, tf.SparseTensor):
            continue
        fill = '' if value.dtype == tf.string else 0
        parsed_tensors[key] = tf.sparse_tensor_to_dense(
            value, default_value=fill)

    image = self._decode_image(parsed_tensors)
    boxes = self._decode_boxes(parsed_tensors)
    areas = self._decode_areas(parsed_tensors)

    # A stored height or width of -1 means "unknown": fall back to the
    # shape of the decoded image for both.
    stored_height = parsed_tensors['image/height']
    stored_width = parsed_tensors['image/width']
    size_missing = tf.logical_or(
        tf.equal(stored_height, -1), tf.equal(stored_width, -1))
    image_shape = tf.cast(tf.shape(image), dtype=tf.int64)
    parsed_tensors['image/height'] = tf.where(
        size_missing, image_shape[0], stored_height)
    parsed_tensors['image/width'] = tf.where(
        size_missing, image_shape[1], stored_width)

    # Without crowd annotations every object is treated as non-crowd.
    has_crowd_flags = tf.greater(
        tf.shape(parsed_tensors['image/object/is_crowd'])[0], 0)
    is_crowds = tf.cond(
        has_crowd_flags,
        lambda: tf.cast(parsed_tensors['image/object/is_crowd'],
                        dtype=tf.bool),
        lambda: tf.zeros_like(parsed_tensors['image/object/class/label'],
                              dtype=tf.bool))

    if self._regenerate_source_id:
        source_id = _get_source_id_from_encoded_image(parsed_tensors)
    else:
        # Only derive an id from the encoded image when none was stored.
        has_source_id = tf.greater(
            tf.strings.length(parsed_tensors['image/source_id']), 0)
        source_id = tf.cond(
            has_source_id,
            lambda: parsed_tensors['image/source_id'],
            lambda: _get_source_id_from_encoded_image(parsed_tensors))

    decoded_tensors = {
        'image': image,
        'source_id': source_id,
        'height': parsed_tensors['image/height'],
        'width': parsed_tensors['image/width'],
        'groundtruth_classes': parsed_tensors['image/object/class/label'],
        'groundtruth_is_crowd': is_crowds,
        'groundtruth_area': areas,
        'groundtruth_boxes': boxes,
    }
    if self._include_mask:
        decoded_tensors['groundtruth_instance_masks'] = self._decode_masks(
            parsed_tensors)
        decoded_tensors['groundtruth_instance_masks_png'] = (
            parsed_tensors['image/object/mask'])
    return decoded_tensors
def inception_model_fn(features, labels, mode, params):
    """Inception v4 model using Estimator API.

    Args:
        features: input images, or a dict holding them under 'feature'.
        labels: integer class labels (unused in PREDICT mode).
        mode: one of the `tf.estimator.ModeKeys`.
        params: dict; 'model_transpose_dims' is forwarded to
            `tensor_transform_fn`.

    Returns:
        A `tf.estimator.EstimatorSpec` in PREDICT mode, otherwise a
        `TPUEstimatorSpec` carrying the loss and, depending on the mode, the
        train op with its host call or the evaluation metrics.

    Raises:
        ValueError: if `FLAGS.precision` is neither 'bfloat16' nor 'float32'.
    """
    num_classes = FLAGS.num_classes
    is_training = (mode == tf.estimator.ModeKeys.TRAIN)
    is_eval = (mode == tf.estimator.ModeKeys.EVAL)

    if isinstance(features, dict):
        features = features['feature']

    features = tensor_transform_fn(features, params['model_transpose_dims'])

    # This nested function allows us to avoid duplicating the logic which
    # builds the network, for different values of --precision.
    def build_network():
        if FLAGS.precision == 'bfloat16':
            with contrib_tpu.bfloat16_scope():
                logits, end_points = inception.inception_v4(
                    features, num_classes, is_training=is_training)
            logits = tf.cast(logits, tf.float32)
        elif FLAGS.precision == 'float32':
            logits, end_points = inception.inception_v4(
                features, num_classes, is_training=is_training)
        else:
            # Fail with a clear message instead of an UnboundLocalError on
            # the return statement below.
            raise ValueError('Unknown precision: %s' % FLAGS.precision)
        return logits, end_points

    if FLAGS.clear_update_collections:
        with arg_scope(
                inception.inception_v4_arg_scope(
                    weight_decay=0.0,
                    batch_norm_decay=BATCH_NORM_DECAY,
                    batch_norm_epsilon=BATCH_NORM_EPSILON,
                    updates_collections=None)):
            logits, end_points = build_network()
    else:
        with arg_scope(
                inception.inception_v4_arg_scope(
                    batch_norm_decay=BATCH_NORM_DECAY,
                    batch_norm_epsilon=BATCH_NORM_EPSILON)):
            logits, end_points = build_network()

    predictions = {
        'classes': tf.argmax(input=logits, axis=1),
        'probabilities': tf.nn.softmax(logits, name='softmax_tensor')
    }

    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(
            mode=mode,
            predictions=predictions,
            export_outputs={
                'classify': tf.estimator.export.PredictOutput(predictions)
            })

    if mode == tf.estimator.ModeKeys.EVAL and FLAGS.display_tensors and (
            not FLAGS.use_tpu):
        # Print predictions and labels side by side while evaluating.
        with tf.control_dependencies([
                tf.Print(predictions['classes'], [predictions['classes']],
                         summarize=FLAGS.eval_batch_size,
                         message='prediction: ')
        ]):
            labels = tf.Print(labels, [labels],
                              summarize=FLAGS.eval_batch_size,
                              message='label: ')

    one_hot_labels = tf.one_hot(labels, FLAGS.num_classes, dtype=tf.int32)

    if 'AuxLogits' in end_points:
        tf.losses.softmax_cross_entropy(
            onehot_labels=one_hot_labels,
            logits=tf.cast(end_points['AuxLogits'], tf.float32),
            weights=0.4,
            label_smoothing=0.1,
            scope='aux_loss')

    tf.losses.softmax_cross_entropy(
        onehot_labels=one_hot_labels,
        logits=logits,
        weights=1.0,
        label_smoothing=0.1)

    # Both cross-entropy calls above register themselves in the losses
    # collection; L2 regularisation is added by hand for non-BatchNorm
    # weights.
    losses = tf.add_n(tf.losses.get_losses())
    l2_loss = []
    for v in tf.trainable_variables():
        tf.logging.info(v.name)
        if 'BatchNorm' not in v.name and 'weights' in v.name:
            l2_loss.append(tf.nn.l2_loss(v))
    tf.logging.info(len(l2_loss))
    loss = losses + WEIGHT_DECAY * tf.add_n(l2_loss)

    initial_learning_rate = FLAGS.learning_rate * FLAGS.train_batch_size / 256
    # Adjust the initial learning rate for warmup
    initial_learning_rate /= (
        FLAGS.learning_rate_decay**(
            (FLAGS.warmup_epochs + FLAGS.cold_epochs) /
            FLAGS.learning_rate_decay_epochs))
    final_learning_rate = 0.0001 * initial_learning_rate

    host_call = None
    train_op = None
    if is_training:
        batches_per_epoch = _NUM_TRAIN_IMAGES / FLAGS.train_batch_size
        global_step = tf.train.get_or_create_global_step()
        current_epoch = tf.cast(
            (tf.cast(global_step, tf.float32) / batches_per_epoch), tf.int32)

        # Schedule: constant "cold" rate, then linear warmup, then
        # staircase exponential decay.
        clr = FLAGS.cold_learning_rate
        wlr = initial_learning_rate / (
            FLAGS.warmup_epochs + FLAGS.cold_epochs)
        learning_rate = tf.where(
            tf.greater_equal(current_epoch, FLAGS.cold_epochs),
            (tf.where(
                tf.greater_equal(current_epoch,
                                 FLAGS.warmup_epochs + FLAGS.cold_epochs),
                tf.train.exponential_decay(
                    learning_rate=initial_learning_rate,
                    global_step=global_step,
                    decay_steps=int(
                        FLAGS.learning_rate_decay_epochs * batches_per_epoch),
                    decay_rate=FLAGS.learning_rate_decay,
                    staircase=True),
                tf.multiply(tf.cast(current_epoch, tf.float32), wlr))),
            clr)

        # Set a minimum boundary for the learning rate.
        learning_rate = tf.maximum(learning_rate, final_learning_rate,
                                   name='learning_rate')

        if FLAGS.optimizer == 'sgd':
            tf.logging.info('Using SGD optimizer')
            optimizer = tf.train.GradientDescentOptimizer(
                learning_rate=learning_rate)
        elif FLAGS.optimizer == 'momentum':
            tf.logging.info('Using Momentum optimizer')
            optimizer = tf.train.MomentumOptimizer(
                learning_rate=learning_rate, momentum=0.9)
        elif FLAGS.optimizer == 'RMS':
            tf.logging.info('Using RMS optimizer')
            optimizer = tf.train.RMSPropOptimizer(
                learning_rate,
                RMSPROP_DECAY,
                momentum=RMSPROP_MOMENTUM,
                epsilon=RMSPROP_EPSILON)
        else:
            # The message needs a %s placeholder: logging formats it as
            # `msg % args`, which fails when an argument has no placeholder.
            tf.logging.fatal('Unknown optimizer: %s', FLAGS.optimizer)

        if FLAGS.use_tpu:
            optimizer = contrib_tpu.CrossShardOptimizer(optimizer)

        # Run the collected update ops (e.g. batch norm statistics) before
        # each training step.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):
            train_op = optimizer.minimize(loss, global_step=global_step)
        if FLAGS.moving_average:
            ema = tf.train.ExponentialMovingAverage(
                decay=MOVING_AVERAGE_DECAY, num_updates=global_step)
            variables_to_average = (tf.trainable_variables() +
                                    tf.moving_average_variables())
            with tf.control_dependencies([train_op]), tf.name_scope(
                    'moving_average'):
                train_op = ema.apply(variables_to_average)

        # To log the loss, current learning rate, and epoch for Tensorboard,
        # the summary op needs to be run on the host CPU via host_call.
        # host_call expects [batch_size, ...] Tensors, thus reshape to
        # introduce a batch dimension. These Tensors are implicitly
        # concatenated to [params['batch_size']].
        gs_t = tf.reshape(global_step, [1])
        loss_t = tf.reshape(loss, [1])
        lr_t = tf.reshape(learning_rate, [1])
        ce_t = tf.reshape(current_epoch, [1])

        if not FLAGS.skip_host_call:

            def host_call_fn(gs, loss, lr, ce):
                """Training host call. Creates scalar summaries for training metrics.

                This function is executed on the CPU and should not directly
                reference any Tensors in the rest of the `model_fn`. To pass
                Tensors from the model to the `metric_fn`, provide as part of
                the `host_call`. See
                https://www.tensorflow.org/api_docs/python/tf/contrib/tpu/TPUEstimatorSpec
                for more information.

                Arguments should match the list of `Tensor` objects passed as
                the second element in the tuple passed to `host_call`.

                Args:
                    gs: `Tensor` with shape `[batch]` for the global_step.
                    loss: `Tensor` with shape `[batch]` for the training loss.
                    lr: `Tensor` with shape `[batch]` for the learning_rate.
                    ce: `Tensor` with shape `[batch]` for the current_epoch.

                Returns:
                    List of summary ops to run on the CPU host.
                """
                gs = gs[0]
                with summary.create_file_writer(
                        FLAGS.model_dir).as_default():
                    with summary.always_record_summaries():
                        summary.scalar('loss', tf.reduce_mean(loss), step=gs)
                        summary.scalar('learning_rate', tf.reduce_mean(lr),
                                       step=gs)
                        summary.scalar('current_epoch', tf.reduce_mean(ce),
                                       step=gs)
                        return summary.all_summary_ops()

            host_call = (host_call_fn, [gs_t, loss_t, lr_t, ce_t])

    eval_metrics = None
    if is_eval:

        def metric_fn(labels, logits):
            """Evaluation metric function. Evaluates accuracy.

            This function is executed on the CPU and should not directly
            reference any Tensors in the rest of the `model_fn`. To pass
            Tensors from the model to the `metric_fn`, provide as part of the
            `eval_metrics`. See
            https://www.tensorflow.org/api_docs/python/tf/contrib/tpu/TPUEstimatorSpec
            for more information.

            Arguments should match the list of `Tensor` objects passed as the
            second element in the tuple passed to `eval_metrics`.

            Args:
                labels: `Tensor` with shape `[batch, ]`.
                logits: `Tensor` with shape `[batch, num_classes]`.

            Returns:
                A dict of the metrics to return from evaluation.
            """
            predictions = tf.argmax(logits, axis=1)
            top_1_accuracy = tf.metrics.accuracy(labels, predictions)
            in_top_5 = tf.cast(tf.nn.in_top_k(logits, labels, 5), tf.float32)
            top_5_accuracy = tf.metrics.mean(in_top_5)

            return {
                'accuracy': top_1_accuracy,
                'accuracy@5': top_5_accuracy,
            }

        eval_metrics = (metric_fn, [labels, logits])

    return contrib_tpu.TPUEstimatorSpec(
        mode=mode,
        loss=loss,
        train_op=train_op,
        host_call=host_call,
        eval_metrics=eval_metrics)
def _parse_train_data(self, data):
    """Parses data for training and samples objects for pasting.

    Args:
      data: the decoded tensor dictionary from TfExampleDecoder.

    Returns:
      labels: a dictionary of tensors used for training. The following
        describes {key: value} pairs in the dictionary.
        image: image tensor after normalization, optional random horizontal
          flip, and resize/crop to `self._output_size` (padded by
          `resize_and_crop_image`).
        image_info: the second value returned by
          `input_utils.resize_and_crop_image`. This method reads row 1 (passed
          to `resize_and_crop_boxes`), row 2 (image scale) and row 3 (offset).
        num_groundtrtuhs: number of objects kept after sampling.
        boxes: Groundtruth bounding box annotations of the sampled objects,
          w.r.t. the scaled image. No padding is applied in this method.
        classes: Groundtruth classes of the sampled objects. No padding is
          applied in this method.
        masks: groundtruth masks cropped by the bounding box and resized to
          [self._mask_crop_size, self._mask_crop_size].
        pasted_objects_mask: a boolean tensor computed as the union (maximum
          over objects) of the sampled objects' resized, uncropped masks.
    """
    classes = data['groundtruth_classes']
    boxes = data['groundtruth_boxes']
    if self._include_mask:
        masks = data['groundtruth_instance_masks']

    is_crowds = data['groundtruth_is_crowd']
    # Skips annotations with `is_crowd` = True.
    if self._skip_crowd_during_training:
        num_groundtrtuhs = tf.shape(classes)[0]
        with tf.control_dependencies([num_groundtrtuhs, is_crowds]):
            # When no crowd flags are present, keep every object.
            indices = tf.cond(
                tf.greater(tf.size(is_crowds), 0),
                lambda: tf.where(tf.logical_not(is_crowds))[:, 0],
                lambda: tf.cast(tf.range(num_groundtrtuhs), tf.int64))
        classes = tf.gather(classes, indices)
        boxes = tf.gather(boxes, indices)
        if self._include_mask:
            masks = tf.gather(masks, indices)

    # Gets original image and its size.
    image = data['image']
    image_shape = tf.shape(image)[0:2]

    # Normalizes image with mean and std pixel values.
    image = input_utils.normalize_image(image)

    # Flips image randomly during training.
    if self._aug_rand_hflip:
        if self._include_mask:
            image, boxes, masks = input_utils.random_horizontal_flip(
                image, boxes, masks)
        else:
            image, boxes = input_utils.random_horizontal_flip(image, boxes)

    # Converts boxes from normalized coordinates to pixel coordinates.
    # Now the coordinates of boxes are w.r.t. the original image.
    boxes = box_utils.denormalize_boxes(boxes, image_shape)

    # Resizes and crops image.
    image, image_info = input_utils.resize_and_crop_image(
        image,
        self._output_size,
        padded_size=input_utils.compute_padded_size(
            self._output_size, 2**self._max_level),
        aug_scale_min=self._aug_scale_min,
        aug_scale_max=self._aug_scale_max)

    # Resizes and crops boxes.
    # Now the coordinates of boxes are w.r.t the scaled image.
    image_scale = image_info[2, :]
    offset = image_info[3, :]
    boxes = input_utils.resize_and_crop_boxes(boxes, image_scale,
                                              image_info[1, :], offset)

    # Filters out ground truth boxes that are all zeros.
    indices = box_utils.get_non_empty_box_indices(boxes)
    boxes = tf.gather(boxes, indices)
    classes = tf.gather(classes, indices)
    if self._include_mask:
        masks = tf.gather(masks, indices)

        # Full-size masks, resized and cropped like the image; they are
        # used further down to build the union mask of the pasted objects.
        uncropped_masks = tf.cast(masks, tf.int8)
        uncropped_masks = tf.expand_dims(uncropped_masks, axis=3)
        uncropped_masks = input_utils.resize_and_crop_masks(
            uncropped_masks, image_scale, self._output_size, offset)
        # Transfer boxes to the original image space and do normalization.
        cropped_boxes = boxes + tf.tile(tf.expand_dims(offset, axis=0), [1, 2])
        cropped_boxes /= tf.tile(tf.expand_dims(image_scale, axis=0), [1, 2])
        cropped_boxes = box_utils.normalize_boxes(cropped_boxes, image_shape)
        # Crop each mask by its own box (box i is applied to mask i).
        num_masks = tf.shape(masks)[0]
        masks = tf.image.crop_and_resize(
            tf.expand_dims(masks, axis=-1),
            cropped_boxes,
            box_indices=tf.range(num_masks, dtype=tf.int32),
            crop_size=[self._mask_crop_size, self._mask_crop_size],
            method='bilinear')
        masks = tf.squeeze(masks, axis=-1)

    # NOTE(review): `masks` and `uncropped_masks` are only bound when
    # `self._include_mask` is true, yet everything below uses them
    # unconditionally -- presumably this parser is only used with masks
    # enabled; confirm with the caller.
    indices = tf.range(start=0, limit=tf.shape(classes)[0], dtype=tf.int32)

    # Samples the numbers of masks for pasting.
    m = tf.random.uniform(shape=[],
                          maxval=tf.shape(classes)[0] + 1,
                          dtype=tf.int32)
    m = tf.math.minimum(m, tf.shape(classes)[0])

    # Shuffles the indices of objects and keep the first m objects for pasting.
    shuffled_indices = tf.random.shuffle(indices)
    shuffled_indices = tf.slice(shuffled_indices, [0], [m])

    boxes = tf.gather(boxes, shuffled_indices)
    masks = tf.gather(masks, shuffled_indices)
    classes = tf.gather(classes, shuffled_indices)
    uncropped_masks = tf.gather(uncropped_masks, shuffled_indices)
    # Union of the sampled objects' masks (maximum over the object axis).
    pasted_objects_mask = tf.reduce_max(uncropped_masks, 0)
    pasted_objects_mask = tf.cast(pasted_objects_mask, tf.bool)

    labels = {
        'image': image,
        'image_info': image_info,
        'num_groundtrtuhs': tf.shape(classes)[0],
        'boxes': boxes,
        'masks': masks,
        'classes': classes,
        'pasted_objects_mask': pasted_objects_mask,
    }
    return labels
def _build(self, probs, all_anchors, gt_boxes):
    """Assign a class label and a box regression target to every anchor.

    Args:
        probs: Per-anchor class scores. Only `probs[:, 1:]` is read, to
            rank background candidates by their highest score -- presumably
            column 0 is the background class; confirm with the caller.
        all_anchors: A Tensor with anchors for all of SSD's features.
            The shape of the Tensor is (num_anchors, 4).
        gt_boxes: A Tensor with the ground truth boxes for the image.
            The shape of the Tensor is (num_gt, 5), having the truth label
            as the last value for each box.

    Returns:
        class_targets: Either a truth value of the anchor (a value
            between 0 and num_classes, with 0 being background), or -1 when
            the anchor is to be ignored in the minibatch.
            The Tensor is rank 1 with one entry per anchor.
        bbox_offsets_targets: A bounding box regression target for each of
            the anchors that have a greater than zero label. For every
            other anchors we return zeros.
            The shape of the Tensor is (num_anchors, 4).
    """
    all_anchors = tf.cast(all_anchors, tf.float32)
    gt_boxes = tf.cast(gt_boxes, tf.float32)

    # We are going to label each anchor based on the IoU with
    # `gt_boxes`. Start by filling the labels with -1, marking them as
    # unknown.
    anchors_label_shape = tf.gather(tf.shape(all_anchors), [0])
    anchors_label = tf.fill(dims=anchors_label_shape, value=-1.)

    overlaps = bbox_overlap_tf(all_anchors, gt_boxes[:, :4])
    max_overlaps = tf.reduce_max(overlaps, axis=1)

    # Get the index of the best gt_box for each anchor.
    best_gtbox_for_anchors_idx = tf.argmax(overlaps, axis=1)

    # Having the index of the gt bbox with the best label we need to get
    # the label for each gt box and sum 1 to it because 0 is used for
    # background.
    best_fg_labels_for_anchors = tf.add(
        tf.gather(gt_boxes[:, 4], best_gtbox_for_anchors_idx), 1.)
    iou_is_fg = tf.greater_equal(max_overlaps, self._foreground_threshold)
    # We update anchors_label with the value in
    # best_fg_labels_for_anchors only when the box is foreground.
    # TODO: Replace with a sparse_to_dense with -1 default_value
    anchors_label = tf.where(condition=iou_is_fg,
                             x=best_fg_labels_for_anchors,
                             y=anchors_label)

    # For every gt box, the anchor that overlaps it the most; these anchors
    # are marked in a boolean mask over all anchors.
    best_anchor_idxs = tf.argmax(overlaps, axis=0)
    is_best_box = tf.sparse_to_dense(sparse_indices=best_anchor_idxs,
                                     sparse_values=True,
                                     default_value=False,
                                     output_shape=tf.cast(
                                         anchors_label_shape, tf.int64),
                                     validate_indices=False)

    # Now we need to find the anchors that are the best for each of the
    # gt_boxes. We overwrite the previous anchors_label with this
    # because setting the best anchor for each gt_box has priority.
    best_anchors_gt_labels = tf.sparse_to_dense(
        sparse_indices=best_anchor_idxs,
        sparse_values=gt_boxes[:, 4] + 1,
        default_value=-1,
        output_shape=tf.cast(anchors_label_shape, tf.int64),
        validate_indices=False,
        name="get_right_labels_for_bestboxes")
    anchors_label = tf.where(condition=is_best_box,
                             x=best_anchors_gt_labels,
                             y=anchors_label,
                             name="update_labels_for_bestbox_anchors")

    # Use the worst backgrounds (the bgs whose probability of being fg is
    # the greatest).
    cls_probs = probs[:, 1:]
    max_cls_probs = tf.reduce_max(cls_probs, axis=1)

    # Exclude boxes with IOU > `background_threshold_high` with any GT.
    iou_less_than_bg_tresh_high_filter = tf.less_equal(
        max_overlaps, self._background_threshold_high)
    bg_anchors = tf.less_equal(anchors_label, 0)
    bg_overlaps_filter = tf.logical_and(iou_less_than_bg_tresh_high_filter,
                                        bg_anchors)

    # Anchors that are not background candidates get a score of -1 so they
    # rank last in the top-k selection below.
    max_cls_probs = tf.where(
        condition=bg_overlaps_filter,
        x=max_cls_probs,
        y=tf.fill(dims=anchors_label_shape, value=-1.),
    )

    # We calculate up to how many backgrounds we desire based on the
    # final number of foregrounds and the hard mining ratio.
    num_fg_mask = tf.greater(anchors_label, 0.0)
    num_fg = tf.cast(tf.count_nonzero(num_fg_mask), tf.float32)

    num_bg = tf.cast(num_fg * self._hard_negative_ratio, tf.int32)
    # NOTE(review): top_k requires k <= num_anchors; nothing here caps
    # `num_bg` -- confirm the ratio cannot exceed the anchor count.
    top_k_bg = tf.nn.top_k(max_cls_probs, k=num_bg)

    # Label the selected hard negatives as background (0).
    set_bg = tf.sparse_to_dense(sparse_indices=top_k_bg.indices,
                                sparse_values=True,
                                default_value=False,
                                output_shape=anchors_label_shape,
                                validate_indices=False)

    anchors_label = tf.where(condition=set_bg,
                             x=tf.fill(dims=anchors_label_shape, value=0.),
                             y=anchors_label)

    # Next step is to calculate the proper bbox targets for the labeled
    # anchors based on the values of the ground-truth boxes.
    # We have to use only the anchors labeled >= 1, each matching with
    # the proper gt_boxes

    # Get the ids of the anchors that matter for bbox_target comparison.
    is_anchor_with_target = tf.greater(anchors_label, 0)
    anchors_with_target_idx = tf.where(condition=is_anchor_with_target)
    # Get the corresponding ground truth box only for the anchors with
    # target.
    gt_boxes_idxs = tf.gather(best_gtbox_for_anchors_idx,
                              anchors_with_target_idx)
    # Get the values of the ground truth boxes.
    anchors_gt_boxes = tf.gather_nd(gt_boxes[:, :4], gt_boxes_idxs)
    # We create the same array but with the anchors
    anchors_with_target = tf.gather_nd(all_anchors, anchors_with_target_idx)
    # We create our targets with bbox_transform
    bbox_targets = encode(anchors_with_target,
                          anchors_gt_boxes,
                          variances=self._variances)

    # We unmap targets to anchor_labels (containing the length of
    # anchors); anchors without a target keep an all-zero row.
    bbox_targets = tf.scatter_nd(indices=anchors_with_target_idx,
                                 updates=bbox_targets,
                                 shape=tf.cast(tf.shape(all_anchors),
                                               tf.int64))

    return anchors_label, bbox_targets
def focal_loss(logits, targets, alpha, gamma, normalizer):
    """Compute the focal loss between `logits` and the golden `targets` values.

    Focal loss = -(1 - p_t)^gamma * log(p_t), where p_t is the probability of
    being classified to the true class.

    Args:
      logits: A float32 tensor of size
        [batch, height_in, width_in, num_predictions].
      targets: A float32 tensor of size
        [batch, height_in, width_in, num_predictions].
      alpha: A float32 scalar multiplying alpha to the loss from positive
        examples and (1 - alpha) to the loss from negative examples.
      gamma: A float32 scalar modulating loss from hard and easy examples.
      normalizer: A float32 scalar that normalizes the total loss from all
        examples.

    Returns:
      loss: A float32 tensor of size
        [batch, height_in, width_in, num_predictions] representing the
        normalized loss on the prediction map.
    """
    with tf.name_scope('focal_loss'):
        positive_label_mask = tf.equal(targets, 1.0)
        cross_entropy = tf.nn.sigmoid_cross_entropy_with_logits(
            labels=targets, logits=logits)

        # Derivation of the modulator. For brevity, let x = logits,
        # z = targets, r = gamma, and p_t = sigmoid(x) for positive samples
        # and 1 - sigmoid(x) for negative samples.
        #
        # The modulator, defined as (1 - p_t)^r, is a critical part of the
        # focal loss. For r > 0 it puts more weight on hard examples and less
        # on easy ones. Computing (1 - p_t)^r directly is not stable under
        # back-propagation when r < 1, so it is evaluated in log space.
        #
        # For positive samples (labels being 1),
        #    (1 - p_t)^r
        #  = (1 - sigmoid(x))^r
        #  = (1 - (1 / (1 + exp(-x))))^r
        #  = (exp(-x) / (1 + exp(-x)))^r
        #  = exp(log((exp(-x) / (1 + exp(-x)))^r))
        #  = exp(r * log(exp(-x)) - r * log(1 + exp(-x)))
        #  = exp(-r * x - r * log(1 + exp(-x)))
        #
        # For negative samples (labels being 0),
        #    (1 - p_t)^r
        #  = (sigmoid(x))^r
        #  = (1 / (1 + exp(-x)))^r
        #  = exp(log((1 / (1 + exp(-x)))^r))
        #  = exp(-r * log(1 + exp(-x)))
        #
        # Therefore one unified form for positive (z = 1) and negative (z = 0)
        # samples is:
        #    (1 - p_t)^r = exp(-r * z * x - r * log(1 + exp(-x))).
        neg_logits = -1.0 * logits

        # log(1 + exp(-x)) is softplus(-x). Spelling it as
        # log1p(exp(-x)) overflows to inf in float32 once -x exceeds ~88,
        # which drives the modulator of a badly misclassified positive to 0
        # (it should be ~1) and yields NaN gradients. softplus evaluates the
        # same quantity without the intermediate overflow.
        modulator = tf.exp(
            gamma * targets * neg_logits - gamma * tf.nn.softplus(neg_logits))

        loss = modulator * cross_entropy
        weighted_loss = tf.where(positive_label_mask, alpha * loss,
                                 (1.0 - alpha) * loss)
        weighted_loss /= normalizer
    return weighted_loss
def box_matching(boxes, gt_boxes, gt_classes):
    """Match each box to the groundtruth box it overlaps most.

    The match is the argmax over the IoU between every input box and every
    groundtruth box. Boxes whose best IoU is not strictly positive are treated
    as background.

    Args:
      boxes: a tensor of shape [batch_size, N, 4] holding the coordinates of
        the boxes to be matched.
      gt_boxes: a tensor of shape [batch_size, MAX_INSTANCES, 4] holding the
        groundtruth box coordinates, padded with -1s for invalid boxes.
      gt_classes: a tensor of shape [batch_size, MAX_INSTANCES] holding the
        groundtruth classes, padded with -1s for invalid classes.

    Returns:
      matched_gt_boxes: a tensor of shape [batch_size, N, 4] with the matched
        groundtruth coordinates; all 0s for background boxes.
      matched_gt_classes: a tensor of shape [batch_size, N] with the matched
        groundtruth classes; 0 (the background class) for background boxes.
      matched_gt_indices: a tensor of shape [batch_size, N] with the index of
        the matched box inside `gt_boxes`; -1 for background boxes.
      matched_iou: a tensor of shape [batch_size, N] with the maximum IoU of
        each box over all groundtruth boxes.
      iou: a tensor of shape [batch_size, N, K] with the full IoU matrix. The
        IoU against an invalid groundtruth box ([-1, -1, -1, -1]) is -1.
    """
    # Pairwise overlaps, [batch_size, N, K].
    iou = box_utils.bbox_overlap(boxes, gt_boxes)

    # Best overlap per box, [batch_size, N]. A value of 0.0 means the box
    # touches no groundtruth; -1.0 means it only "matched" padding.
    matched_iou = tf.reduce_max(iou, axis=-1)
    is_background = tf.less_equal(matched_iou, 0.0)

    # Build [batch, gt] index pairs so gather_nd can pick one groundtruth
    # entry per box.
    best_gt = tf.argmax(iou, axis=-1, output_type=tf.int32)
    best_gt_shape = tf.shape(best_gt)
    batch_column = tf.expand_dims(tf.range(best_gt_shape[0]), axis=-1)
    batch_ids = batch_column * tf.ones([1, best_gt_shape[-1]], dtype=tf.int32)
    lookup = tf.stack([batch_ids, best_gt], axis=-1)

    # Matched coordinates, zeroed out for background boxes.
    gathered_boxes = tf.gather_nd(gt_boxes, lookup)
    background_coords = tf.tile(
        tf.expand_dims(is_background, axis=-1), [1, 1, 4])
    matched_gt_boxes = tf.where(
        background_coords,
        tf.zeros_like(gathered_boxes, dtype=tf.float32),
        gathered_boxes)

    # Matched classes, reset to the background class where needed.
    gathered_classes = tf.gather_nd(gt_classes, lookup)
    matched_gt_classes = tf.where(
        is_background, tf.zeros_like(gathered_classes), gathered_classes)

    # Matched indices, -1 where there is no real match.
    matched_gt_indices = tf.where(
        is_background, -tf.ones_like(best_gt), best_gt)

    return (matched_gt_boxes, matched_gt_classes, matched_gt_indices,
            matched_iou, iou)
def assign_and_sample_proposals(proposed_boxes,
                                gt_boxes,
                                gt_classes,
                                num_samples_per_image=512,
                                mix_gt_boxes=True,
                                fg_fraction=0.25,
                                fg_iou_thresh=0.5,
                                bg_iou_thresh_hi=0.5,
                                bg_iou_thresh_lo=0.0):
    """Assigns the proposals with groundtruth classes and performs subsampling.

    Given `proposed_boxes`, `gt_boxes`, and `gt_classes`, the function uses the
    following algorithm to generate the final `num_samples_per_image` RoIs.
      1. Calculates the IoU between each proposal box and each gt_boxes.
      2. Assigns each proposed box with a groundtruth class and box by
         choosing the largest IoU overlap.
      3. Samples `num_samples_per_image` boxes from all proposed boxes, and
         returns box_targets, class_targets, and RoIs.

    Args:
      proposed_boxes: a tensor of shape of [batch_size, N, 4]. N is the number
        of proposals before groundtruth assignment. The last dimension is the
        box coordinates w.r.t. the scaled images in [ymin, xmin, ymax, xmax]
        format.
      gt_boxes: a tensor of shape of [batch_size, MAX_NUM_INSTANCES, 4].
        The coordinates of gt_boxes are in the pixel coordinates of the scaled
        image. This tensor might have padding of values -1 indicating the
        invalid box coordinates.
      gt_classes: a tensor with a shape of [batch_size, MAX_NUM_INSTANCES].
        This tensor might have paddings with values of -1 indicating the
        invalid classes.
      num_samples_per_image: an integer, the RoI minibatch size per image.
      mix_gt_boxes: a bool indicating whether to mix the groundtruth boxes
        into the proposals before sampling.
      fg_fraction: a float, the target fraction of the RoI minibatch that is
        labeled foreground (i.e., class > 0).
      fg_iou_thresh: a float, the IoU overlap threshold for an RoI to be
        considered foreground (if >= fg_iou_thresh).
      bg_iou_thresh_hi: a float, the upper IoU bound for an RoI to be
        considered background (class = 0 if overlap in [LO, HI)).
      bg_iou_thresh_lo: a float, the lower IoU bound for an RoI to be
        considered background (class = 0 if overlap in [LO, HI)).

    Returns:
      sampled_rois: a tensor of shape of [batch_size, K, 4], representing the
        coordinates of the sampled RoIs, where K is the number of the sampled
        RoIs, i.e. K = num_samples_per_image.
      sampled_gt_boxes: a tensor of shape of [batch_size, K, 4], storing the
        box coordinates of the matched groundtruth boxes of the sampled RoIs.
      sampled_gt_classes: a tensor of shape of [batch_size, K], storing the
        classes of the matched groundtruth boxes of the sampled RoIs.
      sampled_gt_indices: a tensor of shape of [batch_size, K], storing the
        indices of the sampled groundtruth boxes in the original `gt_boxes`
        tensor. Background RoIs carry index 0 rather than a real match.
    """
    with tf.name_scope('sample_proposals'):
        if mix_gt_boxes:
            boxes = tf.concat([proposed_boxes, gt_boxes], axis=1)
        else:
            boxes = proposed_boxes

        (matched_gt_boxes, matched_gt_classes, matched_gt_indices,
         matched_iou, _) = box_matching(boxes, gt_boxes, gt_classes)

        # Foreground is the closed interval [fg_iou_thresh, 1], as documented.
        # A strict comparison here would leave an IoU exactly equal to the
        # threshold in neither class when fg_iou_thresh == bg_iou_thresh_hi
        # (the defaults), silently dropping that box from the candidates.
        positive_match = tf.greater_equal(matched_iou, fg_iou_thresh)
        negative_match = tf.logical_and(
            tf.greater_equal(matched_iou, bg_iou_thresh_lo),
            tf.less(matched_iou, bg_iou_thresh_hi))
        # Negative IoU marks a box that only overlapped padded groundtruth.
        ignored_match = tf.less(matched_iou, 0.0)

        # Re-assign negatively matched boxes to the background class.
        matched_gt_classes = tf.where(negative_match,
                                      tf.zeros_like(matched_gt_classes),
                                      matched_gt_classes)
        matched_gt_indices = tf.where(negative_match,
                                      tf.zeros_like(matched_gt_indices),
                                      matched_gt_indices)

        sample_candidates = tf.logical_and(
            tf.logical_or(positive_match, negative_match),
            tf.logical_not(ignored_match))

        sampler = (
            balanced_positive_negative_sampler.BalancedPositiveNegativeSampler(
                positive_fraction=fg_fraction, is_static=True))

        # The sampler works on one image at a time, so the batch dimension
        # must be statically known to unroll over it.
        batch_size, _ = sample_candidates.get_shape().as_list()
        sampled_indicators = tf.stack([
            sampler.subsample(sample_candidates[i], num_samples_per_image,
                              positive_match[i])
            for i in range(batch_size)
        ])

        # top_k over the 0/1 indicators yields the positions of the sampled
        # boxes as a dense [batch_size, K] index tensor.
        _, sampled_indices = tf.nn.top_k(
            tf.cast(sampled_indicators, dtype=tf.int32),
            k=num_samples_per_image,
            sorted=True)

        sampled_indices_shape = tf.shape(sampled_indices)
        batch_indices = (
            tf.expand_dims(tf.range(sampled_indices_shape[0]), axis=-1) *
            tf.ones([1, sampled_indices_shape[-1]], dtype=tf.int32))
        gather_nd_indices = tf.stack([batch_indices, sampled_indices],
                                     axis=-1)

        sampled_rois = tf.gather_nd(boxes, gather_nd_indices)
        sampled_gt_boxes = tf.gather_nd(matched_gt_boxes, gather_nd_indices)
        sampled_gt_classes = tf.gather_nd(matched_gt_classes,
                                          gather_nd_indices)
        sampled_gt_indices = tf.gather_nd(matched_gt_indices,
                                          gather_nd_indices)

        return (sampled_rois, sampled_gt_boxes, sampled_gt_classes,
                sampled_gt_indices)
def build_train_graph(self,
                      inputs,
                      min_depth,
                      max_depth,
                      num_mpi_planes,
                      learning_rate=0.0002,
                      beta1=0.9,
                      vgg_model_file=None,
                      global_step=0):
    """Assemble the training graph and return its update op.

    Args:
      inputs: dictionary of tensors needed for training. The keys read here
        are "tgt_image", "ref_image", "src_images", "tgt_pose", "ref_pose",
        "src_poses" and "intrinsics". The "intrinsics" entry is overwritten
        with the intrinsics rescaled for the randomly chosen image size.
      min_depth: minimum depth for the PSV and MPI planes.
      max_depth: maximum depth for the PSV and MPI planes.
      num_mpi_planes: number of MPI planes to infer. The passed value is
        replaced by the plane count drawn during input size randomization.
      learning_rate: learning rate handed to Adam.
      beta1: beta1 hyperparameter handed to Adam.
      vgg_model_file: path to VGG weights, forwarded to `build_vgg19`.
      global_step: current optimization step (not read in this function).

    Returns:
      A one-element list holding the op that applies the Adam update.
    """
    print("starting to build graph")

    with tf.name_scope("input_size_randomization"):
        # Each row is [spatial divisor, number of MPI planes]; one row is
        # drawn at random per graph execution.
        dim_choices = tf.constant(
            [[1, 16], [2, 32], [4, 32], [4, 64], [4, 128], [8, 32], [8, 64],
             [8, 128]],
            dtype=tf.int32)
        chosen = tf.random_shuffle(dim_choices)[0, :]
        height_div = chosen[0]
        width_div = chosen[0]
        num_mpi_planes = chosen[1]
        tf.summary.scalar("num_mpi_planes", num_mpi_planes)

    with tf.name_scope("setup"):
        mpi_planes = self.inv_depths(min_depth, max_depth, num_mpi_planes)

    with tf.name_scope("input_data"):
        raw_tgt_image = inputs["tgt_image"]
        raw_ref_image = inputs["ref_image"]
        raw_src_images = inputs["src_images"]

        # Target resolution: static input size divided by the drawn divisor.
        static_shape = raw_src_images.get_shape().as_list()
        _, img_height, img_width, _ = static_shape
        img_height = img_height // height_div
        img_width = img_width // width_div
        new_size = [img_height, img_width]

        raw_tgt_image = tf.image.convert_image_dtype(
            raw_tgt_image, dtype=tf.float32)
        raw_ref_image = tf.image.convert_image_dtype(
            raw_ref_image, dtype=tf.float32)
        raw_src_images = tf.image.convert_image_dtype(
            raw_src_images, dtype=tf.float32)

        raw_tgt_image = tf.image.resize_area(raw_tgt_image, new_size)
        raw_ref_image = tf.image.resize_area(raw_ref_image, new_size)
        raw_src_images = tf.image.resize_area(raw_src_images, new_size)

        tgt_pose = inputs["tgt_pose"]
        ref_pose = inputs["ref_pose"]
        src_poses = inputs["src_poses"]
        intrinsics = inputs["intrinsics"]

        # Scale intrinsics based on size randomization: the first row is
        # divided by the width divisor, the second by the height divisor,
        # the third is left untouched.
        x_row = intrinsics[:, 0:1, :] / tf.to_float(width_div)
        y_row = intrinsics[:, 1:2, :] / tf.to_float(height_div)
        last_row = intrinsics[:, 2:3, :]
        intrinsics = tf.concat([x_row, y_row, last_row], axis=1)
        inputs["intrinsics"] = intrinsics

        _, num_source, _, _ = src_poses.get_shape().as_list()

    with tf.name_scope("inference"):
        print("setting up MPI inference")
        num_mpi_planes = tf.shape(mpi_planes)[0]
        pred = self.infer_mpi(raw_src_images, raw_ref_image, ref_pose,
                              src_poses, intrinsics, num_mpi_planes,
                              mpi_planes)
        rgba_layers = pred["rgba_layers"]
        rgba_layers_refine = pred["rgba_layers_refine"]
        stuff_behind = pred["stuff_behind"]
        refine_input_mpi = pred["refine_input_mpi"]
        psv = pred["psv"]

    with tf.name_scope("synthesis"):
        print("setting up rendering")
        rel_pose = tf.matmul(tgt_pose, tf.matrix_inverse(ref_pose))
        output_image, output_layers = self.mpi_render_view(
            rgba_layers, rel_pose, mpi_planes, intrinsics)
        output_alpha = output_layers[Ellipsis, -1]
        output_image_refine, _ = self.mpi_render_view(
            rgba_layers_refine, rel_pose, mpi_planes, intrinsics)

    with tf.name_scope("loss"):
        print("computing losses")

        # Mask loss for pixels outside reference frustum: a pixel is masked
        # out when at least one rendered layer sums to exactly zero there.
        layer_sums = tf.reduce_sum(output_layers, axis=-1)
        min_abs_sum = tf.reduce_min(
            tf.abs(layer_sums), axis=3, keep_dims=True)
        is_outside = tf.equal(min_abs_sum, 0.0)
        mask_off = tf.zeros_like(output_alpha[:, :, :, 0:1])
        mask_on = tf.ones_like(output_alpha[:, :, :, 0:1])
        loss_mask = tf.where(is_outside, mask_off, mask_on)
        loss_mask = tf.stop_gradient(loss_mask)
        tf.summary.image("loss_mask", loss_mask)

        # Helper functions for loss
        def compute_error(real, fake, mask):
            """Masked mean absolute difference."""
            return tf.reduce_mean(mask * tf.abs(fake - real))

        # Normalized VGG loss (from
        # https://github.com/CQFIO/PhotographicImageSynthesis)
        def downsample(tensor, ds):
            """Average-pool `tensor` by a factor of `ds` per spatial axis."""
            window = [1, ds, ds, 1]
            return tf.nn.avg_pool(tensor, window, [1, ds, ds, 1], "SAME")

        def vgg_loss(target, rendered, mask):
            """Compute VGG loss."""
            vgg_real = build_vgg19(target * 255.0, vgg_model_file)
            # The target is fed as-is times 255 while the rendering is first
            # shifted from what looks like a [-1, 1] range -- TODO confirm.
            rescaled = (rendered + 1.) / 2. * 255.0
            vgg_fake = build_vgg19(rescaled, vgg_model_file, reuse=True)
            p0 = compute_error(vgg_real["input"], vgg_fake["input"], mask)
            p1 = compute_error(vgg_real["conv1_2"], vgg_fake["conv1_2"],
                               mask) / 2.6
            p2 = compute_error(vgg_real["conv2_2"], vgg_fake["conv2_2"],
                               downsample(mask, 2)) / 4.8
            p3 = compute_error(vgg_real["conv3_2"], vgg_fake["conv3_2"],
                               downsample(mask, 4)) / 3.7
            p4 = compute_error(vgg_real["conv4_2"], vgg_fake["conv4_2"],
                               downsample(mask, 8)) / 5.6
            p5 = compute_error(vgg_real["conv5_2"], vgg_fake["conv5_2"],
                               downsample(mask, 16)) * 10 / 1.5
            combined = p0 + p1 + p2 + p3 + p4 + p5
            return combined, vgg_real, vgg_fake

        vgg_loss_initial, _, _ = vgg_loss(raw_tgt_image, output_image,
                                          loss_mask)
        tf.summary.scalar("vgg_loss_initial", vgg_loss_initial)
        total_loss = vgg_loss_initial

        vgg_loss_refine, _, _ = vgg_loss(raw_tgt_image, output_image_refine,
                                         loss_mask)
        tf.summary.scalar("vgg_loss_refine", vgg_loss_refine)
        total_loss += vgg_loss_refine

    with tf.name_scope("train_op"):
        print("setting up train op")
        train_vars = list(tf.trainable_variables())
        optim = tf.train.AdamOptimizer(learning_rate, beta1)
        grads_and_vars = optim.compute_gradients(
            total_loss, var_list=train_vars)
        train_op = [optim.apply_gradients(grads_and_vars)]

    # Summaries
    tf.summary.scalar("total_loss", total_loss)

    # Source images: every consecutive group of 3 channels is one image.
    for src_idx in range(num_source):
        first_channel = src_idx * 3
        last_channel = (src_idx + 1) * 3
        src_image = raw_src_images[:, :, :, first_channel:last_channel]
        tf.summary.image("src_image_%d" % src_idx, src_image)

    # Output image
    tf.summary.image("output_image", self.deprocess_image(output_image))
    # Refined output image
    tf.summary.image("output_image_refine",
                     self.deprocess_image(output_image_refine))
    # Target image
    tf.summary.image("tgt_image", raw_tgt_image)
    # Ref image
    tf.summary.image("ref_image", raw_ref_image)

    # Predicted color and alpha layers, and PSV
    num_summ = 16  # Number of plane summaries to show in tensorboard
    for i in range(num_summ):
        # Evenly spaced plane index across the (dynamic) plane count.
        plane = tf.to_int32(i * num_mpi_planes / num_summ)
        rgb = rgba_layers[:, :, :, plane, :3]
        alpha = rgba_layers[:, :, :, plane, -1:]
        ref_plane = psv[:, :, :, plane, 3:6]
        source_plane = psv[:, :, :, plane, :3]
        output_rgb = output_layers[:, :, :, plane, :3]
        tf.summary.image("rgb_layer_%d" % i, self.deprocess_image(rgb))
        tf.summary.image("alpha_layer_%d" % i, alpha)
        tf.summary.image("rgba_layer_%d" % i,
                         self.deprocess_image(rgb * alpha))
        tf.summary.image(
            "psv_avg_%d" % i,
            (self.deprocess_image(0.5 * ref_plane + 0.5 * source_plane)))
        tf.summary.image("output_rgb_%d" % i,
                         self.deprocess_image(output_rgb))
        tf.summary.image("psv_ref_%d" % i, self.deprocess_image(ref_plane))
        tf.summary.image("psv_source_%d" % i,
                         self.deprocess_image(source_plane))

    # Cumulative rendered images and refined MPI
    for i in range(num_summ):
        plane = tf.to_int32(i * num_mpi_planes / num_summ)
        rgb = rgba_layers_refine[:, :, :, plane, :3]
        alpha = rgba_layers_refine[:, :, :, plane, 3:]
        render = stuff_behind[:, :, :, plane, :3]
        input_colors = refine_input_mpi[:, :, :, plane, :3]
        tf.summary.image("rgb_layer_refine_%d" % i,
                         self.deprocess_image(rgb))
        tf.summary.image("alpha_layer_refine_%d" % i, alpha)
        tf.summary.image("rgba_layer_refine_%d" % i,
                         self.deprocess_image(rgb * alpha))
        tf.summary.image("cumulative_render_%d" % i,
                         self.deprocess_image(render))
        tf.summary.image("input_colors_refine_%d" % i,
                         self.deprocess_image(input_colors))

    return train_op
def model_fn(features, labels, mode, params):
    """The `model_fn` for TPUEstimator.

    Builds the table model, optionally restores weights from
    `config.init_checkpoint`, and returns the estimator spec for `mode`.
    `config` and `custom_prediction_keys` are read from the enclosing scope.

    Args:
      features: dict of input tensors keyed by feature name.
      labels: unused; labels are read from `features`.
      mode: one of the `tf.estimator.ModeKeys` values.
      params: dict of extra parameters. The keys read here are
        "fail_if_missing_variables_in_checkpoint" and
        "gradient_accumulation_steps".

    Returns:
      A `tf.estimator.tpu.TPUEstimatorSpec` for training, evaluation or
      prediction.
    """
    del labels  # Unused.

    tf.logging.info("*** Features ***")
    for name in sorted(features):
        tf.logging.info(" name = %s, shape = %s", name, features[name].shape)

    label_ids = features["label_ids"]
    input_mask = features["input_mask"]
    row_ids = features["row_ids"]
    column_ids = features["column_ids"]

    # Table cells only, without question tokens and table headers.
    table_mask = tf.where(row_ids > 0, tf.ones_like(row_ids),
                          tf.zeros_like(row_ids))

    # The aggregation and classification heads are optional; their label
    # tensors are only read when the corresponding head is enabled.
    do_model_aggregation = config.num_aggregation_labels > 0
    aggregation_function_id = None
    if do_model_aggregation:
        aggregation_function_id = tf.squeeze(
            features["aggregation_function_id"], axis=[1])

    do_model_classification = config.num_classification_labels > 0
    classification_class_index = None
    if do_model_classification:
        classification_class_index = tf.squeeze(
            features["classification_class_index"], axis=[1])

    is_training = (mode == tf.estimator.ModeKeys.TRAIN)
    model = table_bert.create_model(
        features=features,
        mode=mode,
        bert_config=config.bert_config,
        disabled_features=config.disabled_features,
        disable_position_embeddings=config.disable_position_embeddings,
        reset_position_index_per_cell=config.reset_position_index_per_cell,
        proj_value_length=config.proj_value_length,
    )

    answer, numeric_values, numeric_values_scale = (
        utils.extract_answer_from_features(
            features=features,
            use_answer_as_supervision=config.use_answer_as_supervision))

    outputs = _get_classification_outputs(
        config=config,
        output_layer=model.get_sequence_output(),
        output_layer_aggregation=model.get_pooled_output(),
        label_ids=label_ids,
        input_mask=input_mask,
        table_mask=table_mask,
        aggregation_function_id=aggregation_function_id,
        answer=answer,
        numeric_values=numeric_values,
        numeric_values_scale=numeric_values_scale,
        is_training=is_training,
        row_ids=row_ids,
        column_ids=column_ids,
        classification_class_index=classification_class_index)
    total_loss = outputs.total_loss

    tvars = tf.trainable_variables()
    if config.reset_output_cls:
        # Leave the classification output layer out of checkpoint
        # restoration (and out of the variable log below).
        tvars = [
            tvar for tvar in tvars
            if not ("output_weights_cls" in tvar.name or
                    "output_bias_cls" in tvar.name)
        ]

    initialized_variable_names = set()
    scaffold_fn = None
    init_from_checkpoints = []

    def add_init_checkpoint(init_checkpoint, scope=None):
        """Queue `init_checkpoint` for restoration, if one is given."""
        if not init_checkpoint:
            return
        assignment_map, initialized_variables = (
            modeling.get_assignment_map_from_checkpoint(
                tvars, init_checkpoint, scope=scope))
        initialized_variable_names.update(initialized_variables.keys())
        init_from_checkpoints.append((init_checkpoint, assignment_map))

    add_init_checkpoint(config.init_checkpoint)

    if init_from_checkpoints:
        if config.use_tpu:
            # On TPU the restore is deferred into the scaffold function.
            def tpu_scaffold():
                for ckpt, ckpt_map in init_from_checkpoints:
                    tf.train.init_from_checkpoint(ckpt, ckpt_map)
                return tf.train.Scaffold()

            scaffold_fn = tpu_scaffold
        else:
            for ckpt, ckpt_map in init_from_checkpoints:
                tf.train.init_from_checkpoint(ckpt, ckpt_map)

    fail_if_missing = init_from_checkpoints and params.get(
        "fail_if_missing_variables_in_checkpoint", False)

    tf.logging.info("**** Trainable Variables ****")
    for var in tvars:
        init_string = ""
        if var.name in initialized_variable_names:
            init_string = ", *INIT_FROM_CKPT*"
        elif (fail_if_missing and "layer_norm" not in var.name and
              "LayerNorm" not in var.name):
            # Layer-norm variables are allowed to be absent.
            tf.logging.fatal("Variable not found in checkpoint: %s",
                             var.name)
        tf.logging.info(" name = %s, shape = %s%s", var.name, var.shape,
                        init_string)

    if mode == tf.estimator.ModeKeys.TRAIN:
        train_op = optimization.create_optimizer(
            total_loss,
            config.learning_rate,
            config.num_train_steps,
            config.num_warmup_steps,
            config.use_tpu,
            gradient_accumulation_steps=params.get(
                "gradient_accumulation_steps", 1),
            grad_clipping=config.grad_clipping)
        return tf.estimator.tpu.TPUEstimatorSpec(
            mode=mode,
            loss=total_loss,
            train_op=train_op,
            scaffold_fn=scaffold_fn)

    if mode == tf.estimator.ModeKeys.EVAL:
        metric_inputs = [
            total_loss,
            label_ids,
            outputs.logits,
            input_mask,
            aggregation_function_id,
            outputs.logits_aggregation,
            classification_class_index,
            outputs.logits_cls,
        ]
        eval_metrics = (_calculate_eval_metrics_fn, metric_inputs)
        return tf.estimator.tpu.TPUEstimatorSpec(
            mode=mode,
            loss=total_loss,
            eval_metrics=eval_metrics,
            scaffold_fn=scaffold_fn)

    # Any other mode is handled as prediction.
    predictions = {
        "probabilities": outputs.probs,
        "input_ids": features["input_ids"],
        "column_ids": features["column_ids"],
        "row_ids": features["row_ids"],
        "segment_ids": features["segment_ids"],
        "question_id_ints": features["question_id_ints"],
    }
    if "question_id" in features:
        # Only available when predicting on GPU.
        predictions["question_id"] = features["question_id"]
        del predictions["question_id_ints"]

    if do_model_aggregation:
        predictions["gold_aggr"] = features["aggregation_function_id"]
        predictions["pred_aggr"] = tf.argmax(
            outputs.logits_aggregation,
            axis=-1,
            output_type=tf.int32,
        )

    if do_model_classification:
        predictions["gold_cls"] = features["classification_class_index"]
        predictions["pred_cls"] = tf.argmax(
            outputs.logits_cls,
            axis=-1,
            output_type=tf.int32,
        )
        if config.num_classification_labels == 2:
            # Binary case: export the logit margin of class 1 over class 0.
            predictions["logits_cls"] = (
                outputs.logits_cls[:, 1] - outputs.logits_cls[:, 0])
        else:
            predictions["logits_cls"] = outputs.logits_cls

    if outputs.span_indexes is not None and outputs.span_logits is not None:
        predictions["span_indexes"] = outputs.span_indexes
        predictions["span_logits"] = outputs.span_logits

    if custom_prediction_keys:
        predictions = {
            key: predictions[key] for key in custom_prediction_keys
        }

    return tf.estimator.tpu.TPUEstimatorSpec(
        mode=mode, predictions=predictions, scaffold_fn=scaffold_fn)
def meta_optimize(self):
    """Meta optimization step.

    Takes one virtual gradient step on the training batch, evaluates the
    resulting weights on the probe batch, and differentiates the probe loss
    with respect to the per-example weight and re-labeling coefficients.

    Returns:
      A tuple `(weight, new_eps, meta_loss, meta_acc)`: the clipped and
      normalized per-example weights and the 0/1 re-labeling coefficients
      (both with gradients stopped), the probe loss, and the probe accuracy
      value tensor from `tf.metrics.accuracy`.
    """
    probe_images, probe_labels = self.probe_images, self.probe_labels
    labels = self.labels
    net = self.net
    logits = self.logits
    gate_gradients = 1

    # Per-replica batch size; every example starts with weight 1 / batch.
    batch_size = int(self.batch_size / self.strategy.num_replicas_in_sync)
    init_eps_val = float(1) / batch_size

    meta_net = networks.MetaImage(self.net, name='meta_model')

    if FLAGS.meta_momentum and not self.optimizer.variables():
        # Initializing momentum state of optimizer for meta momentum update.
        # It is a hacky implementation
        logging.info('Pre-initialize optimizer momentum states.')
        idle_net_cost = tf.losses.sparse_softmax_cross_entropy(
            self.labels, logits)
        tmp_var_grads = self.optimizer.compute_gradients(
            tf.reduce_mean(idle_net_cost), net.trainable_variables)
        self.optimizer.apply_gradients(tmp_var_grads)

    with tf.name_scope('coefficient'):
        # Data weight coefficient
        target = tf.constant(
            [init_eps_val] * batch_size,
            shape=(batch_size,),
            dtype=np.float32,
            name='weight')
        # Data re-labeling coefficient
        eps = tf.constant(
            [FLAGS.grad_eps_init] * batch_size,
            shape=(batch_size,),
            dtype=tf.float32,
            name='eps')

    onehot_labels = tf.one_hot(labels, self.dataset.num_classes)
    onehot_labels = tf.cast(onehot_labels, tf.float32)
    eps_k = tf.reshape(eps, [batch_size, 1])

    # Blend the given labels with the guessed labels, example by example.
    mixed_labels = eps_k * onehot_labels + (1 - eps_k) * self.guessed_label
    # raw softmax loss
    log_softmax = tf.nn.log_softmax(logits)
    net_cost = -tf.reduce_sum(mixed_labels * log_softmax, 1)

    lookahead_loss = tf.reduce_sum(tf.multiply(target, net_cost))
    lookahead_loss = lookahead_loss + net.regularization_loss

    with tf.control_dependencies([lookahead_loss]):
        train_vars = net.trainable_variables
        var_grads = tf.gradients(
            lookahead_loss, train_vars, gate_gradients=gate_gradients)

        # One virtual step per variable; the stepped tensors stand in for
        # the variables inside the meta network.
        static_vars = []
        for train_var, var_grad in zip(train_vars, var_grads):
            if FLAGS.meta_momentum > 0:
                step_grad = self.meta_momentum_update(
                    var_grad, train_var.name, self.optimizer)
            else:
                step_grad = var_grad
            stepped_var = tf.math.subtract(
                train_var, FLAGS.meta_stepsize * step_grad)
            static_vars.append(stepped_var)
            # new style
            meta_net.add_variable_alias(stepped_var, var_name=train_var.name)

        for uv in net.updates_variables:
            meta_net.add_variable_alias(
                uv, var_name=uv.name, var_type='updates_variables')
        meta_net.verbose()

    with tf.control_dependencies(static_vars):
        g_logits = meta_net(
            probe_images, name='meta_model', reuse=True, training=True)

        desired_y = tf.one_hot(probe_labels, self.dataset.num_classes)
        meta_loss = tf.nn.softmax_cross_entropy_with_logits_v2(
            desired_y, g_logits)
        meta_loss = tf.reduce_mean(meta_loss, name='meta_loss')
        meta_loss = meta_loss + meta_net.get_regularization_loss(net.wd)
        meta_acc, meta_acc_op = tf.metrics.accuracy(
            probe_labels, tf.argmax(g_logits, axis=1))

    with tf.control_dependencies([meta_loss] + [meta_acc_op]):
        meta_train_vars = meta_net.trainable_variables
        grad_meta_vars = tf.gradients(
            meta_loss, meta_train_vars, gate_gradients=gate_gradients)
        # Chain rule through the virtual step: push the probe-loss gradients
        # back onto the two coefficient vectors.
        grad_target, grad_eps = tf.gradients(
            static_vars, [target, eps],
            grad_ys=grad_meta_vars,
            gate_gradients=gate_gradients)

    # updates weight
    raw_weight = target - grad_target
    raw_weight = raw_weight - init_eps_val
    unorm_weight = tf.clip_by_value(
        raw_weight, clip_value_min=0, clip_value_max=float('inf'))
    norm_c = tf.reduce_sum(unorm_weight)
    weight = tf.divide(unorm_weight, norm_c + 0.00001)

    # gets new lambda by the sign of gradient
    new_eps = tf.where(
        grad_eps < 0, x=tf.ones_like(eps), y=tf.zeros_like(eps))

    return (tf.stop_gradient(weight), tf.stop_gradient(new_eps), meta_loss,
            meta_acc)
def too_close_condition(trip, depth_threshold=0.1):
    """Check that the nearest non-zero depth lies beyond `depth_threshold`.

    Only `trip.depth[:3, :, :, 0]` is inspected. Entries equal to 0.0 are
    replaced by the maximum depth before taking the minimum, so they never
    decide the outcome (presumably 0.0 marks missing depth -- TODO confirm).

    Args:
      trip: object exposing a `depth` tensor indexable as [:3, :, :, 0].
      depth_threshold: smallest acceptable depth value.

    Returns:
      A scalar bool tensor, True when the minimum remaining depth is strictly
      greater than `depth_threshold`.
    """
    depths = trip.depth[:3, :, :, 0]
    farthest = tf.reduce_max(depths)
    is_zero = tf.equal(depths, 0.0)
    filled = tf.where(is_zero, farthest * tf.ones_like(depths), depths)
    nearest = tf.reduce_min(filled)
    return tf.greater(nearest, depth_threshold)
def train_step(self):
    """Build one distributed training step and its summaries.

    Runs `step_fn` on every replica through `self.strategy.run`, then
    aggregates the per-replica results and registers TensorBoard summaries.

    Returns:
      A list `[net_loss, meta_loss, xe_loss, cs_loss, mean_acc,
      mean_metaacc, summary, weights]`, where the scalars are means across
      replicas, `summary` is the merged summary op and `weights` is the
      concatenation of the per-replica example weights.
    """

    def step_fn(inputs):
        """Step functon.

        Args:
          inputs: inputs from data iterator

        Returns:
          a set of variables want to observe in Tensorboard
        """
        net = self.net
        (all_images, labels), (self.probe_images, self.probe_labels) = inputs
        assert len(all_images.shape) == 5
        # Axis 1 holds two versions of every image; index 0 is fed to the
        # network here, index 1 is kept as `self.aug_images`.
        images, self.aug_images = all_images[:, 0], all_images[:, 1]

        self.images, self.labels = images, labels
        batch_size = int(self.batch_size / self.strategy.num_replicas_in_sync)

        logits = net(images, name='model', reuse=tf.AUTO_REUSE, training=True)
        self.logits = logits

        # other losses
        # initialized first to use self.guessed_label for meta step
        xe_loss, cs_loss = self.unsupervised_loss()

        # meta optimization
        weight, eps, meta_loss, meta_acc = self.meta_optimize()

        ## losses w.r.t new weight and loss
        onehot_labels = tf.one_hot(labels, self.dataset.num_classes)
        onehot_labels = tf.cast(onehot_labels, tf.float32)
        eps_k = tf.reshape(eps, [batch_size, 1])

        mixed_labels = tf.math.add(
            eps_k * onehot_labels, (1 - eps_k) * self.guessed_label,
            name='mixed_labels')
        net_cost = tf.losses.softmax_cross_entropy(
            mixed_labels, logits, reduction=tf.losses.Reduction.NONE)
        # loss with initial weight
        net_loss1 = tf.reduce_mean(net_cost)

        # loss with initial eps
        init_eps = tf.constant(
            [FLAGS.grad_eps_init] * batch_size, dtype=tf.float32)
        init_eps = tf.reshape(init_eps, (-1, 1))
        init_mixed_labels = tf.math.add(
            init_eps * onehot_labels, (1 - init_eps) * self.guessed_label,
            name='init_mixed_labels')

        net_cost2 = tf.losses.softmax_cross_entropy(
            init_mixed_labels, logits, reduction=tf.losses.Reduction.NONE)
        net_loss2 = tf.reduce_sum(tf.math.multiply(net_cost2, weight))

        net_loss = (net_loss1 + net_loss2) / 2

        net_loss = net_loss + tf.add_n([xe_loss, cs_loss])
        net_loss += net.regularization_loss
        # rescale by gpus
        net_loss /= self.strategy.num_replicas_in_sync

        with tf.control_dependencies(net.updates):
            net_grads = tf.gradients(net_loss, net.trainable_variables)
            minimizer_op = self.optimizer.apply_gradients(
                zip(net_grads, net.trainable_variables),
                global_step=self.global_step)

        # The moving average is refreshed only after the weights moved.
        with tf.control_dependencies([minimizer_op]):
            train_op = self.ema.apply(net.trainable_variables)

        acc_op, acc_update_op = self.acc_func(labels,
                                              tf.argmax(logits, axis=1))

        # Fetching any returned tensor forces the update ops to run.
        with tf.control_dependencies([train_op, acc_update_op]):
            observed = (tf.identity(net_loss), tf.identity(xe_loss),
                        tf.identity(cs_loss), tf.identity(meta_loss),
                        tf.identity(meta_acc), tf.identity(acc_op),
                        tf.identity(weight), tf.identity(labels))
            return observed

    # end of parallel
    batch = (next(self.train_input_iterator), next(self.probe_input_iterator))
    (pr_net_loss, pr_xe_loss, pr_cs_loss, pr_metaloss, pr_metaacc, pr_acc,
     pr_weight, pr_labels) = self.strategy.run(step_fn, args=(batch,))

    def replica_mean(per_replica):
        """Average a per-replica value across replicas."""
        return self.strategy.reduce(tf.distribute.ReduceOp.MEAN, per_replica)

    # collect device variables
    weights = self.strategy.unwrap(pr_weight)
    weights = tf.concat(weights, axis=0)
    labels = self.strategy.unwrap(pr_labels)
    labels = tf.concat(labels, axis=0)

    mean_acc = replica_mean(pr_acc)
    mean_metaacc = replica_mean(pr_metaacc)
    net_loss = replica_mean(pr_net_loss)
    xe_loss = replica_mean(pr_xe_loss)
    cs_loss = replica_mean(pr_cs_loss)
    meta_loss = replica_mean(pr_metaloss)

    # The following add variables for tensorboard visualization
    merges = [
        tf.summary.scalar('acc/train', mean_acc),
        tf.summary.scalar('loss/xemin', xe_loss),
        tf.summary.scalar('loss/consistency', cs_loss),
        tf.summary.scalar('loss/net', net_loss),
        tf.summary.scalar('loss/meta', meta_loss),
        tf.summary.scalar('acc/meta', mean_metaacc),
    ]
    if hasattr(self, 'eval_acc_on_train'):
        merges.append(
            tf.summary.scalar('acc/eval_on_train', self.eval_acc_on_train[0]))
        merges.append(
            tf.summary.scalar('acc/eval_on_train_top5',
                              self.eval_acc_on_train[1]))
        merges.append(
            tf.summary.scalar('acc/num_eval', self.eval_acc_on_train[2]))

    # Fraction of examples whose weight was clipped down to zero.
    zw_inds = tf.squeeze(
        tf.where(tf.less_equal(weights, 0), name='zero_weight_index'))
    zero_count = tf.cast(tf.size(zw_inds), tf.float32)
    total_count = tf.cast(tf.size(weights), tf.float32)
    merges.append(
        tf.summary.scalar('weights/zeroratio',
                          tf.math.divide(zero_count, total_count)))

    self.epoch_var = tf.cast(self.global_step / self.iter_epoch, tf.float32)
    merges.append(tf.summary.scalar('epoch', self.epoch_var))
    merges.append(tf.summary.scalar('learningrate', self.learning_rate))
    summary = tf.summary.merge(merges)

    return [
        net_loss, meta_loss, xe_loss, cs_loss, mean_acc, mean_metaacc,
        summary, weights
    ]
def decode(self, tf_example_string_tensor):
    """Parses one serialized tf.Example into a dictionary of tensors.

    Args:
      tf_example_string_tensor: a string tensor holding a serialized
        tensorflow example proto.

    Returns:
      A dictionary of the following tensors.
      fields.InputDataFields.image - 3D uint8 tensor of shape
        [None, None, 3] containing image.
      fields.InputDataFields.original_image_spatial_shape - 1D int32 tensor
        of shape [2] containing shape of the image.
      fields.InputDataFields.source_id - string tensor containing original
        image id.
      fields.InputDataFields.key - string tensor with unique sha256 hash key.
      fields.InputDataFields.filename - string tensor with original dataset
        filename.
      fields.InputDataFields.groundtruth_boxes - 2D float32 tensor of shape
        [None, 4] containing box corners.
      fields.InputDataFields.groundtruth_classes - 1D int64 tensor of shape
        [None] containing classes for the boxes.
      fields.InputDataFields.groundtruth_weights - 1D float32 tensor of
        shape [None] indicating the weights of groundtruth boxes.
      fields.InputDataFields.groundtruth_area - 1D float32 tensor of shape
        [None] containing object mask area in pixel squared.
      fields.InputDataFields.groundtruth_is_crowd - 1D bool tensor of shape
        [None] indicating if the boxes enclose a crowd.

      Optional:
      fields.InputDataFields.groundtruth_image_confidences - 1D float tensor
        of shape [None] indicating if a class is present in the image (1.0)
        or a class is not present in the image (0.0).
      fields.InputDataFields.image_additional_channels - 3D uint8 tensor of
        shape [None, None, num_additional_channels]. 1st dim is height; 2nd
        dim is width; 3rd dim is the number of additional channels.
      fields.InputDataFields.groundtruth_difficult - 1D bool tensor of shape
        [None] indicating if the boxes represent `difficult` instances.
      fields.InputDataFields.groundtruth_group_of - 1D bool tensor of shape
        [None] indicating if the boxes represent `group_of` instances.
      fields.InputDataFields.groundtruth_keypoints - 3D float32 tensor of
        shape [None, num_keypoints, 2] containing keypoints, where the
        coordinates of the keypoints are ordered (y, x).
      fields.InputDataFields.groundtruth_keypoint_visibilities - 2D bool
        tensor of shape [None, num_keypoints] containing keypoint
        visibilities.
      fields.InputDataFields.groundtruth_instance_masks - 3D float32 tensor
        of shape [None, None, None] containing instance masks.
      fields.InputDataFields.groundtruth_image_classes - 1D int64 of shape
        [None] containing classes for the boxes.
      fields.InputDataFields.multiclass_scores - 1D float32 tensor of shape
        [None * num_classes] containing flattened multiclass scores for
        groundtruth boxes.
      fields.InputDataFields.context_features - 1D float32 tensor of shape
        [context_feature_length * num_context_features]
      fields.InputDataFields.context_feature_length - int32 tensor
        specifying the length of each feature in context_features
    """
    input_fields = fields.InputDataFields

    # Decode every item the configured handlers know about.
    serialized = tf.reshape(tf_example_string_tensor, shape=[])
    example_decoder = slim_example_decoder.TFExampleDecoder(
        self.keys_to_features, self.items_to_handlers)
    item_names = example_decoder.list_items()
    decoded_items = example_decoder.decode(serialized, items=item_names)
    tensor_dict = dict(zip(item_names, decoded_items))

    crowd_key = input_fields.groundtruth_is_crowd
    tensor_dict[crowd_key] = tf.cast(tensor_dict[crowd_key], dtype=tf.bool)

    image_key = input_fields.image
    tensor_dict[image_key].set_shape([None, None, 3])
    tensor_dict[input_fields.original_image_spatial_shape] = tf.shape(
        tensor_dict[image_key])[:2]

    # Additional channels: drop axis 3, then move the leading axis last.
    extra_channels_key = input_fields.image_additional_channels
    if extra_channels_key in tensor_dict:
        squeezed = tf.squeeze(tensor_dict[extra_channels_key], axis=3)
        tensor_dict[extra_channels_key] = tf.transpose(
            squeezed, perm=[1, 2, 0])

    # Fall back to a weight of 1.0 per box when no weights were decoded.
    weights_key = input_fields.groundtruth_weights
    decoded_weights = tensor_dict[weights_key]

    def unit_weights():
        num_boxes = tf.shape(tensor_dict[input_fields.groundtruth_boxes])[0]
        return tf.ones([num_boxes], dtype=tf.float32)

    tensor_dict[weights_key] = tf.cond(
        tf.greater(tf.shape(decoded_weights)[0], 0),
        lambda: decoded_weights,
        unit_weights)

    keypoints_key = input_fields.groundtruth_keypoints
    if keypoints_key in tensor_dict:
        # Set all keypoints that are not labeled to NaN.
        visibility_key = input_fields.groundtruth_keypoint_visibilities
        keypoints = tensor_dict[keypoints_key]
        visible = tf.tile(
            tf.expand_dims(tensor_dict[visibility_key], -1), [1, 1, 2])
        tensor_dict[keypoints_key] = tf.where(
            visible, keypoints, np.nan * tf.ones_like(keypoints))

    if self._expand_hierarchy_labels:
        classes_key = input_fields.groundtruth_classes
        image_classes_key = input_fields.groundtruth_image_classes
        image_confidences_key = input_fields.groundtruth_image_confidences

        expanded_classes, expanded_confidences = (
            self._expand_image_label_hierarchy(
                tensor_dict[image_classes_key],
                tensor_dict[image_confidences_key]))
        tensor_dict[image_classes_key] = expanded_classes
        tensor_dict[image_confidences_key] = expanded_confidences

        per_box_keys = (
            input_fields.groundtruth_group_of,
            input_fields.groundtruth_is_crowd,
            input_fields.groundtruth_difficult,
            input_fields.groundtruth_area,
            input_fields.groundtruth_boxes,
            input_fields.groundtruth_weights,
        )
        # Per-box fields are expanded using the not-yet-expanded classes;
        # the classes themselves are expanded only after this loop.
        for box_key in per_box_keys:
            if box_key not in tensor_dict:
                continue
            box_values = tensor_dict[box_key]
            tensor_dict[box_key] = tf.cond(
                tf.size(box_values) > 0,
                lambda values=box_values: self._expansion_box_field_labels(
                    tensor_dict[classes_key], values),
                lambda values=box_values: values)

        tensor_dict[classes_key] = self._expansion_box_field_labels(
            tensor_dict[classes_key], tensor_dict[classes_key], True)

    group_of_key = input_fields.groundtruth_group_of
    if group_of_key in tensor_dict:
        tensor_dict[group_of_key] = tf.cast(
            tensor_dict[group_of_key], dtype=tf.bool)

    dp_num_points_key = input_fields.groundtruth_dp_num_points
    if dp_num_points_key in tensor_dict:
        dp_part_ids_key = input_fields.groundtruth_dp_part_ids
        tensor_dict[dp_num_points_key] = tf.cast(
            tensor_dict[dp_num_points_key], dtype=tf.int32)
        tensor_dict[dp_part_ids_key] = tf.cast(
            tensor_dict[dp_part_ids_key], dtype=tf.int32)

    return tensor_dict
def resize_and_crop_image_v2(image,
                             short_side,
                             long_side,
                             padded_size,
                             aug_scale_min=1.0,
                             aug_scale_max=1.0,
                             seed=1,
                             method=tf.image.ResizeMethod.BILINEAR):
    """Resizes the input image to output size (Faster R-CNN style).

    Resize and pad images given the specified short / long side length and
    the stride size.

    Here are the preprocessing steps.
    1. For a given image, keep its aspect ratio and first try to rescale the
       short side of the original image to `short_side`.
    2. If the scaled image after 1 has a long side that exceeds `long_side`,
       keep the aspect ratio and rescale the long side of the image to
       `long_side`.
    3. Pad the rescaled image to the padded_size.

    Args:
      image: a `Tensor` of shape [height, width, 3] representing an image.
      short_side: a scalar `Tensor` or `int` representing the desired short
        side to be rescaled to.
      long_side: a scalar `Tensor` or `int` representing the desired long
        side to be rescaled to.
      padded_size: a `Tensor` or `int` list/tuple of two elements
        representing [height, width] of the padded output image size.
        Padding will be applied after scaling the image to the desired_size.
      aug_scale_min: a `float` with range between [0, 1.0] representing
        minimum random scale applied to desired_size for training scale
        jittering.
      aug_scale_max: a `float` with range between [1.0, inf] representing
        maximum random scale applied to desired_size for training scale
        jittering.
      seed: seed for random scale jittering.
      method: function to resize input image to scaled image.

    Returns:
      output_image: `Tensor` of shape [height, width, 3] where
        [height, width] equals to `padded_size`.
      image_info: a 2D `Tensor` that encodes the information of the image
        and the applied preprocessing. It is in the format of
        [[original_height, original_width],
         [desired_height, desired_width],
         [y_scale, x_scale],
         [y_offset, x_offset]],
        where [desired_height, desired_width] is the actual scaled image
        size, and [y_scale, x_scale] is the scaling factor, which is the
        ratio of scaled dimension / original dimension.
    """
    with tf.name_scope('resize_and_crop_image_v2'):
        image_size = tf.cast(tf.shape(image)[0:2], tf.float32)

        scale_using_short_side = (
            short_side / tf.minimum(image_size[0], image_size[1]))
        scale_using_long_side = (
            long_side / tf.maximum(image_size[0], image_size[1]))

        # Prefer the short-side scale; fall back to the long-side scale when
        # the short-side scale would make the long side exceed `long_side`.
        scaled_size = tf.round(image_size * scale_using_short_side)
        scaled_size = tf.where(
            tf.greater(
                tf.maximum(scaled_size[0], scaled_size[1]), long_side),
            tf.round(image_size * scale_using_long_side),
            scaled_size)
        desired_size = scaled_size

        random_jittering = (aug_scale_min != 1.0 or aug_scale_max != 1.0)

        if random_jittering:
            random_scale = tf.random_uniform(
                [], aug_scale_min, aug_scale_max, seed=seed)
            scaled_size = tf.round(random_scale * scaled_size)

        # Computes 2D image_scale.
        image_scale = scaled_size / image_size

        # Selects non-zero random offset (x, y) if scaled image is larger
        # than desired_size.
        if random_jittering:
            max_offset = scaled_size - desired_size
            max_offset = tf.where(
                tf.less(max_offset, 0),
                tf.zeros_like(max_offset),
                max_offset)
            offset = max_offset * tf.random_uniform([2], 0, 1, seed=seed)
            offset = tf.cast(offset, tf.int32)
        else:
            offset = tf.zeros((2,), tf.int32)

        scaled_image = tf.image.resize_images(
            image, tf.cast(scaled_size, tf.int32), method=method)

        if random_jittering:
            # `desired_size` is float32 while `offset` is int32. Slice
            # bounds must be integers and the two operands of `+` must share
            # a dtype, so cast the crop extent before using it.
            crop_size = tf.cast(desired_size, tf.int32)
            scaled_image = scaled_image[
                offset[0]:offset[0] + crop_size[0],
                offset[1]:offset[1] + crop_size[1], :]

        output_image = tf.image.pad_to_bounding_box(
            scaled_image, 0, 0, padded_size[0], padded_size[1])

        image_info = tf.stack([
            image_size,
            tf.cast(desired_size, dtype=tf.float32),
            image_scale,
            tf.cast(offset, tf.float32),
        ])
        return output_image, image_info
def logmarglike_twotransfergaussians(
    ells,
    y,  # (..., dy)
    yinvvar,  # (..., dy)
    M_T,  # (..., dt, dy),
    z,  # (..., dz)
    zinvvar,  # (..., dz)
    R_T,  # (..., dt, dz),
    perm=[0, 2, 1],
):
    """
    Fit linear model to two Gaussian data sets

    Parameters
    ----------
    ells : ndarray (nobj, )
        scaling between the data: y = ell * z
    y, yinvvar : ndarray (nobj, ..., n_pix_y)
        data and data inverse variances
    M_T : ndarray (..., n_components, n_pix_y)
        design matrix of linear model
    z, zinvvar : ndarray (nobj, ..., n_pix_z)
        data and data inverse variances for z
    R_T : ndarray (..., n_components, n_pix_z)
        design matrix of linear model for z
    perm : list
        permutation to get M and R from R_T and M_T.
        The default is never mutated here.

    Returns
    -------
    logfml : ndarray (nobj, )
        log likelihood values with parameters marginalised and at best fit
    theta_map : ndarray (nobj, ndim)
        Best fit MAP parameters
    theta_cov : ndarray (nobj, ndim, ndim)
        Parameter covariance

    Notes
    -----
    Entries with non-positive inverse variance are treated as missing: they
    are excluded from the data count and from the log-determinant term.
    """
    log2pi = tf.cast(tf.math.log(2.0 * np.pi), T)
    nt = tf.cast(tf.shape(M_T)[-2], T)

    # Number of valid (positive inverse variance) data points per object.
    # Count the mask itself along the pixel axis: counting the non-zero
    # entries of `tf.where(mask)` would count index coordinates instead.
    ny = tf.cast(tf.math.count_nonzero(yinvvar > 0, axis=-1), T)
    nz = tf.cast(tf.math.count_nonzero(zinvvar > 0, axis=-1), T)

    M = tf.transpose(M_T, perm)  # tf.einsum("...ij->...ji", M_T)
    R = tf.transpose(R_T, perm)  # tf.einsum("...ij->...ji", R_T)

    Hbar = ells[..., None, None] ** 2 * tf.matmul(
        R_T, R * zinvvar[..., :, None]
    ) + tf.matmul(
        M_T, M * yinvvar[..., :, None]
    )  # (..., dt, dt)
    etabar = ells[..., None] * tf.reduce_sum(
        R_T * (z * zinvvar)[..., None, :], axis=-1
    ) + tf.reduce_sum(
        M_T * (y * yinvvar)[..., None, :], axis=-1
    )  # (..., dt)

    theta_map = tf.linalg.solve(Hbar, etabar[..., None])[..., 0]  # (..., dt)
    theta_cov = tf.linalg.inv(Hbar)

    # Sum of log inverse variances over valid entries only.
    logdetH = tf.reduce_sum(
        tf.where(zinvvar > 0, tf.math.log(zinvvar), zinvvar * 0), axis=-1
    ) + tf.reduce_sum(
        tf.where(yinvvar > 0, tf.math.log(yinvvar), yinvvar * 0), axis=-1
    )
    xi1 = -0.5 * (
        (ny + nz) * log2pi
        - logdetH
        + tf.reduce_sum(y * y * yinvvar, axis=-1)
        + tf.reduce_sum(z * z * zinvvar, axis=-1)
    )
    logdetHbar = tf.linalg.logdet(Hbar)
    xi2 = -0.5 * (
        nt * log2pi - logdetHbar + tf.reduce_sum(etabar * theta_map, axis=-1)
    )
    logfml = xi1 - xi2
    return logfml, theta_map, theta_cov
def _calculate_expected_result(dist_per_cell, numeric_values,
                               numeric_values_scale, input_mask_float,
                               logits_aggregation, config):
    """Calculate the expected result given cell and aggregation probabilities.

    Args:
      dist_per_cell: Distribution over cell selection; its probabilities (or
        a relaxed-Bernoulli sample when `config.use_gumbel_for_cells` is set)
        are used as soft selection weights.
      numeric_values: Numeric value per position; NaN entries are treated as
        non-numeric and masked to zero.
      numeric_values_scale: Divisor applied to the selection probabilities.
      input_mask_float: Float mask multiplied into the selection
        probabilities.
      logits_aggregation: Aggregation logits; column 0 is dropped and the
        remaining columns weight the sum, average and count results.
      config: Model configuration. Reads `use_gumbel_for_cells`,
        `temperature`, `average_approximation_function`,
        `use_gumbel_for_agg` and `agg_temperature`.

    Returns:
      The expected result per example, i.e. the sum / average / count
      results weighted by the aggregation probabilities and summed over
      axis 1.

    Raises:
      ValueError: If `config.average_approximation_function` is not one of
        the supported `AverageApproximationFunction` values.
    """
    if config.use_gumbel_for_cells:
        gumbel_dist = tfp.distributions.RelaxedBernoulli(
            # The token logits were already divided by the temperature and
            # used for computing cell selection errors so we need to
            # multiply it again here.
            config.temperature,
            logits=dist_per_cell.logits_parameter() * config.temperature)
        scaled_probability_per_cell = gumbel_dist.sample()
    else:
        scaled_probability_per_cell = _get_probs(dist_per_cell)

    # <float32>[batch_size, seq_length]
    scaled_probability_per_cell = (
        scaled_probability_per_cell / numeric_values_scale) * input_mask_float
    count_result = tf.reduce_sum(scaled_probability_per_cell, axis=1)
    # Mask non-numeric table values to zero.
    numeric_values_masked = tf.where(
        tf.is_nan(numeric_values),
        tf.zeros_like(numeric_values),
        numeric_values)
    sum_result = tf.reduce_sum(
        scaled_probability_per_cell * numeric_values_masked, axis=1)

    avg_approximation = config.average_approximation_function
    if avg_approximation == AverageApproximationFunction.RATIO:
        average_result = sum_result / (count_result + _EPSILON_ZERO_DIVISION)
    elif avg_approximation == AverageApproximationFunction.FIRST_ORDER:
        # The sum of all probabilities except that correspond to other cells
        ex = (
            tf.reduce_sum(scaled_probability_per_cell, axis=1, keepdims=True)
            - scaled_probability_per_cell + 1)
        average_result = tf.reduce_sum(
            numeric_values_masked * scaled_probability_per_cell / ex, axis=1)
    elif avg_approximation == AverageApproximationFunction.SECOND_ORDER:
        # The sum of all probabilities except that correspond to other cells
        ex = (
            tf.reduce_sum(scaled_probability_per_cell, axis=1, keepdims=True)
            - scaled_probability_per_cell + 1)
        pointwise_var = (
            scaled_probability_per_cell * (1 - scaled_probability_per_cell))
        var = tf.reduce_sum(pointwise_var, axis=1, keepdims=True) - pointwise_var
        multiplier = (var / tf.math.square(ex) + 1) / ex
        average_result = tf.reduce_sum(
            numeric_values_masked * scaled_probability_per_cell * multiplier,
            axis=1)
    else:
        # Previously this branch only logged and then fell through, which
        # surfaced later as an UnboundLocalError on `average_result`.
        tf.logging.error("Invalid average_approximation_function: %s",
                         config.average_approximation_function)
        raise ValueError("Invalid average_approximation_function: %s" %
                         config.average_approximation_function)

    if config.use_gumbel_for_agg:
        gumbel_dist = tfp.distributions.RelaxedOneHotCategorical(
            config.agg_temperature, logits=logits_aggregation[:, 1:])
        # <float32>[batch_size, num_aggregation_labels - 1]
        aggregation_op_only_probs = gumbel_dist.sample()
    else:
        # <float32>[batch_size, num_aggregation_labels - 1]
        aggregation_op_only_probs = tf.nn.softmax(
            logits_aggregation[:, 1:] / config.agg_temperature, axis=-1)

    all_results = tf.concat([
        tf.expand_dims(sum_result, axis=1),
        tf.expand_dims(average_result, axis=1),
        tf.expand_dims(count_result, axis=1),
    ], axis=1)
    expected_result = tf.reduce_sum(
        all_results * aggregation_op_only_probs, axis=1)
    return expected_result
def huber_loss(x, delta=1.0):
    """Element-wise Huber loss of `x` with threshold `delta`.

    Quadratic (0.5 * x^2) where |x| < delta, linear
    (delta * (|x| - 0.5 * delta)) elsewhere.

    Reference: https://en.wikipedia.org/wiki/Huber_loss
    """
    is_small = tf.abs(x) < delta
    quadratic_branch = tf.square(x) * 0.5
    linear_branch = delta * (tf.abs(x) - 0.5 * delta)
    return tf.where(is_small, quadratic_branch, linear_branch)
def _single_column_cell_selection_loss(token_logits, column_logits, label_ids,
                                       cell_index, col_index, cell_mask):
    """Computes the loss for cell selection constrained to a single column.

    The loss is a hierarchical log-likelihood. The model first predicts a
    column and then selects cells within that column (conditioned on the
    column). Cells outside the selected column are never selected.

    Args:
      token_logits: <float>[batch_size, seq_length] Logits per token.
      column_logits: <float>[batch_size, max_num_cols] Logits per column.
      label_ids: <int32>[batch_size, seq_length] Labels per token.
      cell_index: segmented_tensor.IndexMap [batch_size, seq_length] Index
        that groups tokens into cells.
      col_index: segmented_tensor.IndexMap [batch_size, seq_length] Index
        that groups tokens into columns.
      cell_mask: <float>[batch_size, max_num_rows * max_num_cols] Input mask
        per cell, 1 for cells that exists in the example and 0 for padding.

    Returns:
      selection_loss_per_example: <float>[batch_size] Loss for each example.
      logits: <float>[batch_size, seq_length] New logits which are only
        allowed to select cells in a single column. Logits outside of the
        most likely column according to `column_logits` will be set to a
        very low value (such that the probabilities are 0).
    """
    # --- Column-level target and loss -----------------------------------
    # The target column is the one holding the most selected tokens.
    label_count_per_column, _ = segmented_tensor.reduce_sum(
        tf.cast(label_ids, tf.float32), col_index)
    target_column = tf.argmax(
        label_count_per_column, axis=-1, output_type=tf.int32)
    # With no selected cell at all the target is the special column id 0,
    # which means "select nothing".
    no_cell_selected = tf.equal(
        tf.reduce_max(label_count_per_column, axis=-1), 0)
    target_column = tf.where(
        no_cell_selected, tf.zeros_like(target_column), target_column)
    column_loss_per_example = -tfp.distributions.Categorical(
        logits=column_logits).log_prob(target_column)

    # --- Cell-level loss, restricted to the target column ---------------
    # Reduce the labels and logits to per-cell from per-token.
    logits_per_cell, _ = segmented_tensor.reduce_mean(token_logits, cell_index)
    labels_per_cell, labels_index = segmented_tensor.reduce_max(
        tf.cast(label_ids, tf.int32), cell_index)

    column_id_for_cells = cell_index.project_inner(labels_index).indices
    in_target_column = tf.cast(
        tf.equal(column_id_for_cells, tf.expand_dims(target_column, axis=1)),
        tf.float32)

    cell_log_prob = tfp.distributions.Bernoulli(
        logits=logits_per_cell).log_prob(labels_per_cell)
    cell_loss = -tf.reduce_sum(
        cell_log_prob * in_target_column * cell_mask, axis=1)
    # Normalize by the number of cells in the column.
    cell_loss = cell_loss / (
        tf.reduce_sum(in_target_column * cell_mask, axis=1)
        + _EPSILON_ZERO_DIVISION)

    # Examples without any selected cell contribute only the column loss.
    selection_loss_per_example = column_loss_per_example + tf.where(
        no_cell_selected,
        tf.zeros_like(column_loss_per_example),
        cell_loss)

    # --- Output logits, restricted to the model's chosen column ---------
    # Set the probs outside the selected column (selected by the *model*)
    # to 0. This ensures backwards compatibility with models that select
    # cells from multiple columns.
    predicted_column = tf.argmax(
        column_logits, axis=-1, output_type=tf.int32)
    in_predicted_column = tf.cast(
        tf.equal(column_id_for_cells,
                 tf.expand_dims(predicted_column, axis=-1)),
        tf.float32)
    # Never select cells with the special column id 0.
    in_predicted_column = tf.where(
        tf.equal(column_id_for_cells, 0),
        tf.zeros_like(in_predicted_column),
        in_predicted_column)
    masked_logits_per_cell = logits_per_cell + _CLOSE_ENOUGH_TO_LOG_ZERO * (
        1.0 - cell_mask * in_predicted_column)
    logits = segmented_tensor.gather(masked_logits_per_cell, cell_index)

    return selection_loss_per_example, logits
def _prepare_groundtruth_for_eval(detection_model, class_agnostic,
                                  max_number_of_boxes):
    """Extracts groundtruth data from detection_model and prepares it for eval.

    Each per-image groundtruth list held by the model is stacked into a
    single batched tensor. Optional fields are included only when the model
    reports having them.

    Args:
      detection_model: A `DetectionModel` object.
      class_agnostic: Whether the detections are class_agnostic.
      max_number_of_boxes: Max number of groundtruth boxes.

    Returns:
      groundtruth: Dictionary with the following fields:
        'groundtruth_boxes': [batch_size, num_boxes, 4] float32 tensor of
          boxes, in normalized coordinates.
        'groundtruth_classes': [batch_size, num_boxes] int64 tensor of
          1-indexed classes.
        'groundtruth_masks': 4D float32 tensor of instance masks (if
          provided in groundtruth)
        'groundtruth_is_crowd': [batch_size, num_boxes] bool tensor
          indicating is_crowd annotations (if provided in groundtruth).
        'groundtruth_area': [batch_size, num_boxes] float32 tensor
          indicating the area (in the original absolute coordinates) of
          annotations (if provided in groundtruth).
        'num_groundtruth_boxes': [batch_size] tensor in which every entry is
          `max_number_of_boxes`.
        'groundtruth_keypoints': [batch_size, num_boxes, num_keypoints, 2]
          float32 tensor of keypoints (if provided in groundtruth).
        'groundtruth_dp_num_points_list': [batch_size, num_boxes] int32
          tensor with the number of DensePose points for each instance (if
          provided in groundtruth).
        'groundtruth_dp_part_ids_list': [batch_size, num_boxes,
          max_sampled_points] int32 tensor with the part ids for each
          DensePose sampled point (if provided in groundtruth).
        'groundtruth_dp_surface_coords_list': [batch_size, num_boxes,
          max_sampled_points, 4] containing the DensePose surface
          coordinates for each sampled point (if provided in groundtruth).
        'groundtruth_group_of': [batch_size, num_boxes] bool tensor
          indicating group_of annotations (if provided in groundtruth).
        'groundtruth_labeled_classes': [batch_size, num_classes] int64
          tensor of 1-indexed classes.
      Stacked y rotation angles and keypoint visibilities are also added
      when the model provides them.
    """
    input_data_fields = fields.InputDataFields()
    has_field = detection_model.groundtruth_has_field

    def stacked(field_name):
        """Stacks the model's per-image groundtruth list for `field_name`."""
        return tf.stack(detection_model.groundtruth_lists(field_name))

    groundtruth_boxes = stacked(fields.BoxListFields.boxes)
    groundtruth_boxes_shape = tf.shape(groundtruth_boxes)

    # For class-agnostic models, groundtruth one-hot encodings collapse to
    # all ones.
    if class_agnostic:
        groundtruth_classes_one_hot = tf.ones(
            [groundtruth_boxes_shape[0], groundtruth_boxes_shape[1], 1])
    else:
        groundtruth_classes_one_hot = stacked(fields.BoxListFields.classes)

    label_id_offset = 1  # Applying label id offset (b/63711816)
    groundtruth_classes = (
        tf.argmax(groundtruth_classes_one_hot, axis=2) + label_id_offset)

    groundtruth = {
        input_data_fields.groundtruth_boxes: groundtruth_boxes,
        input_data_fields.groundtruth_classes: groundtruth_classes,
    }

    if has_field(additional_fields.InputDataFields.y_rotation_angle):
        groundtruth[
            additional_fields.GroundtruthResultFields.y_rotation_angles
        ] = stacked(additional_fields.InputDataFields.y_rotation_angle)

    if has_field(fields.BoxListFields.masks):
        groundtruth[input_data_fields.groundtruth_instance_masks] = stacked(
            fields.BoxListFields.masks)

    if has_field(fields.BoxListFields.is_crowd):
        groundtruth[input_data_fields.groundtruth_is_crowd] = stacked(
            fields.BoxListFields.is_crowd)

    if has_field(input_data_fields.groundtruth_area):
        groundtruth[input_data_fields.groundtruth_area] = stacked(
            input_data_fields.groundtruth_area)

    if has_field(fields.BoxListFields.keypoints):
        groundtruth[input_data_fields.groundtruth_keypoints] = stacked(
            fields.BoxListFields.keypoints)

    if has_field(fields.BoxListFields.keypoint_visibilities):
        groundtruth[
            input_data_fields.groundtruth_keypoint_visibilities
        ] = stacked(fields.BoxListFields.keypoint_visibilities)

    if has_field(fields.BoxListFields.group_of):
        groundtruth[input_data_fields.groundtruth_group_of] = stacked(
            fields.BoxListFields.group_of)

    if has_field(fields.InputDataFields.groundtruth_labeled_classes):
        labeled_classes_list = detection_model.groundtruth_lists(
            fields.InputDataFields.groundtruth_labeled_classes)
        # Turn each indicator vector into its 1-indexed class ids.
        labeled_classes = [
            tf.where(indicator)[:, 0] + label_id_offset
            for indicator in labeled_classes_list
        ]
        if len(labeled_classes) > 1:
            # Id lists differ in length per image; right-pad each one up to
            # num_classes so they can be stacked.
            num_classes = labeled_classes_list[0].shape[0]
            to_stack = [
                tf.pad(ids, [[0, num_classes - tf.shape(ids)[0]]])
                for ids in labeled_classes
            ]
        else:
            to_stack = labeled_classes
        groundtruth[input_data_fields.groundtruth_labeled_classes] = tf.stack(
            to_stack)

    if has_field(fields.BoxListFields.densepose_num_points):
        groundtruth[input_data_fields.groundtruth_dp_num_points] = stacked(
            fields.BoxListFields.densepose_num_points)

    if has_field(fields.BoxListFields.densepose_part_ids):
        groundtruth[input_data_fields.groundtruth_dp_part_ids] = stacked(
            fields.BoxListFields.densepose_part_ids)

    if has_field(fields.BoxListFields.densepose_surface_coords):
        groundtruth[
            input_data_fields.groundtruth_dp_surface_coords
        ] = stacked(fields.BoxListFields.densepose_surface_coords)

    groundtruth[input_data_fields.num_groundtruth_boxes] = tf.tile(
        [max_number_of_boxes], multiples=[groundtruth_boxes_shape[0]])
    return groundtruth
def _get_classification_outputs(
    config,
    is_training,
    output_layer,
    output_layer_aggregation,
    label_ids,
    input_mask,
    table_mask,
    aggregation_function_id,
    answer,
    numeric_values,
    numeric_values_scale,
    row_ids,
    column_ids,
    classification_class_index,
):
    """Creates a classification model.

    Builds the per-token cell-selection logits, the optional per-column,
    aggregation and classification heads, and accumulates the losses that
    the configuration enables into a single total loss.

    Args:
      config: Configuration for Tapas model.
      is_training: Whether the model is training.
      output_layer: <float32>[batch_size, seq_length, hidden_size]
      output_layer_aggregation: <float32>[batch_size, hidden_size]
      label_ids: <int32>[batch_size, seq_length]
      input_mask: <int32>[batch_size, seq_length]
      table_mask: <int32>[batch_size, seq_length]
      aggregation_function_id: <int32>[batch_size]
      answer: <float32>[batch_size]
      numeric_values: <float32>[batch_size, seq_length]
      numeric_values_scale: <float32>[batch_size, seq_length]
      row_ids: <int32>[batch_size, seq_length]
      column_ids: <int32>[batch_size, seq_length]
      classification_class_index: <int32>[batch]

    Returns:
      Outputs
    """
    if is_training:
        # I.e., 0.1 dropout
        output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)

    # Construct indices for the table. Ids are clamped so that they always
    # fall inside the configured number of segments.
    row_index = segmented_tensor.IndexMap(
        indices=tf.minimum(row_ids, config.max_num_rows - 1),
        num_segments=config.max_num_rows,
        batch_dims=1)
    col_index = segmented_tensor.IndexMap(
        indices=tf.minimum(column_ids, config.max_num_columns - 1),
        num_segments=config.max_num_columns,
        batch_dims=1)
    cell_index = segmented_tensor.ProductIndexMap(row_index, col_index)

    # Masks.
    # <float32>[batch_size, seq_length]
    input_mask_float = tf.cast(input_mask, tf.float32)
    table_mask_float = tf.cast(table_mask, tf.float32)
    # Mask for cells that exist in the table (i.e. that are not padding).
    cell_mask, _ = segmented_tensor.reduce_mean(input_mask_float, cell_index)

    # Compute logits per token. These are used to select individual cells.
    logits = utils.compute_token_logits(
        output_layer=output_layer,
        temperature=config.temperature,
        init_cell_selection_weights_to_zero=(
            config.init_cell_selection_weights_to_zero))

    # Compute logits per column. These are used to select a column.
    if config.select_one_column:
        column_logits = utils.compute_column_logits(
            output_layer=output_layer,
            cell_index=cell_index,
            cell_mask=cell_mask,
            init_cell_selection_weights_to_zero=(
                config.init_cell_selection_weights_to_zero),
            allow_empty_column_selection=config.allow_empty_column_selection)

    # TODO(pawelnow): Extract this into a function.
    # Compute aggregation function logits.
    do_model_aggregation = config.num_aggregation_labels > 0
    if do_model_aggregation:
        hidden_size_agg = output_layer_aggregation.shape[-1].value
        output_weights_agg = tf.get_variable(
            "output_weights_agg",
            shape=[config.num_aggregation_labels, hidden_size_agg],
            initializer=_classification_initializer())
        output_bias_agg = tf.get_variable(
            "output_bias_agg",
            shape=[config.num_aggregation_labels],
            initializer=tf.zeros_initializer())

    do_model_classification = config.num_classification_labels > 0
    logits_cls = None
    if do_model_classification:
        logits_cls = compute_classification_logits(
            config.num_classification_labels, output_layer_aggregation)

    with tf.variable_scope("loss"):
        total_loss = 0.0
        is_supervised = (
            not do_model_aggregation or not config.use_answer_as_supervision)

        ### Semi-supervised cell selection in case of no aggregation
        #############################################################

        # If the answer (the denotation) appears directly in the table we
        # might select the answer without applying any aggregation function.
        # There are some ambiguous cases, see _calculate_aggregate_mask for
        # more info.
        # `aggregate_mask` is 1 for examples where we chose to aggregate and
        # 0 for examples where we chose to select the answer directly.
        # `label_ids` encodes the positions of the answer appearing in the
        # table.
        if is_supervised:
            aggregate_mask = None
        else:
            # <float32>[batch_size]
            aggregate_mask = _calculate_aggregate_mask(
                answer=answer,
                output_layer_aggregation=output_layer_aggregation,
                output_bias_agg=output_bias_agg,
                output_weights_agg=output_weights_agg,
                cell_select_pref=config.cell_select_pref,
                label_ids=label_ids)

        ### Cell selection log-likelihood
        ###################################

        if config.average_logits_per_cell:
            logits_per_cell, _ = segmented_tensor.reduce_mean(
                logits, cell_index)
            logits = segmented_tensor.gather(logits_per_cell, cell_index)
        dist_per_token = tfp.distributions.Bernoulli(logits=logits)

        selection_loss_per_example = None
        if config.select_one_column:
            selection_loss_per_example, logits = (
                _single_column_cell_selection_loss(
                    token_logits=logits,
                    column_logits=column_logits,
                    label_ids=label_ids,
                    cell_index=cell_index,
                    col_index=col_index,
                    cell_mask=cell_mask))
            dist_per_token = tfp.distributions.Bernoulli(logits=logits)
        else:
            # Use tf.equal rather than `label_ids == 0`: without Tensor
            # equality overloading, `==` is a Python identity check that
            # yields a constant False, which would give every token
            # `positive_weight`.
            weight = tf.where(
                tf.equal(label_ids, 0),
                tf.ones_like(label_ids, dtype=tf.float32),
                config.positive_weight *
                tf.ones_like(label_ids, dtype=tf.float32))
            selection_loss_per_token = (
                -dist_per_token.log_prob(label_ids) * weight)
            selection_loss_per_example = (
                tf.reduce_sum(
                    selection_loss_per_token * input_mask_float, axis=1) /
                (tf.reduce_sum(input_mask_float, axis=1) +
                 _EPSILON_ZERO_DIVISION))

        ### Logits for the aggregation function
        #########################################

        logits_aggregation = None
        if do_model_aggregation:
            logits_aggregation = _calculate_aggregation_logits(
                output_layer_aggregation, output_weights_agg, output_bias_agg)

        ### Classification loss
        ###############################
        if do_model_classification:
            one_hot_labels = tf.one_hot(
                classification_class_index,
                depth=config.num_classification_labels,
                dtype=tf.float32)
            if config.classification_label_weight:
                # Classes missing from the weight mapping default to 1.0.
                label_weights = [
                    config.classification_label_weight.get(i, 1.0)
                    for i in range(config.num_classification_labels)
                ]
                one_hot_labels *= tf.constant(label_weights, dtype=tf.float32)
            log_probs = tf.nn.log_softmax(logits_cls, axis=-1)
            # <float32>[batch_size]
            per_example_classification_intermediate = -tf.reduce_sum(
                one_hot_labels * log_probs, axis=-1)

            cls_loss = tf.reduce_mean(per_example_classification_intermediate)
            total_loss += cls_loss

        ### Supervised cell selection
        ###############################

        span_indexes = None
        span_logits = None
        if config.span_prediction != SpanPredictionMode.NONE:
            (
                span_indexes,
                span_logits,
                span_loss,
            ) = span_prediction_utils.get_span_logits_by_mode(
                config.span_prediction,
                output_layer,
                label_ids,
                column_ids,
                row_ids,
                max_span_length=10,
            )
            total_loss += span_loss
        elif config.disable_per_token_loss:
            pass
        elif config.mask_examples_without_labels:
            total_loss += tf.reduce_mean(
                span_prediction_utils.compute_masked_example_loss(
                    label_ids,
                    selection_loss_per_example,
                ))
        elif is_supervised:
            total_loss += tf.reduce_mean(selection_loss_per_example)
        else:
            # For the not supervised case, do not assign loss for cell
            # selection on examples where we chose to aggregate.
            total_loss += tf.reduce_mean(
                selection_loss_per_example * (1.0 - aggregate_mask))

        ### Semi-supervised regression loss and supervised loss for
        ### aggregations
        #########################################################################

        if do_model_aggregation:
            # Note that `aggregate_mask` is None if the setting is supervised.
            per_example_additional_loss = _calculate_aggregation_loss(
                logits_aggregation, aggregate_mask, aggregation_function_id,
                config)

            if config.use_answer_as_supervision:
                # Add regression loss for numeric answers which require
                # aggregation.
                answer_loss, large_answer_loss_mask = (
                    _calculate_regression_loss(
                        answer, aggregate_mask, dist_per_token,
                        numeric_values, numeric_values_scale,
                        table_mask_float, logits_aggregation, config))
                per_example_additional_loss += answer_loss
                # Zero loss for examples with answer_loss > cutoff.
                per_example_additional_loss *= large_answer_loss_mask

            total_loss += tf.reduce_mean(per_example_additional_loss)

        return Outputs(
            total_loss=total_loss,
            logits=logits,
            probs=_get_probs(dist_per_token) * input_mask_float,
            logits_aggregation=logits_aggregation,
            logits_cls=logits_cls,
            span_indexes=span_indexes,
            span_logits=span_logits,
        )
def elements_model(elements_texts_enc, feature_map, output_size, elements_mask,
                   ref_enc, flags):
  """The part of the model that processes the elements text and boxes.

  This assumes that the text has already been preprocessed with the text_model.
  Even if you are only using the elements and not the referring expression, you
  should probably use the ref_elements_model since that also handles
  preprocessing with the text_model.

  The function runs in three stages, all under the `elements_model` variable
  scope:
    1. Concatenate the available per-element inputs (text encoding, boxes,
       optional tiled referring expression, optional ref-match encoding,
       optional neighbor embedding) and pass them through dense layers.
    2. If `flags.merge_ref_elements_method` contains 'Atten' and `ref_enc` is
       given, multiply the encoding by an attention mask.
    3. Turn the per-element encoding into a per-example encoding, either by
       projecting onto box regions (`flags.elements_proj_mode != 'tile'`) or by
       averaging over elements (and optionally tiling to an image), followed
       by an optional two-layer CNN.

  Args:
    elements_texts_enc: The elements text encoded by the text_model. Size:
      [batch_size * elements_per_query, text_embed_size]. May be None, in which
      case the attention branch substitutes zeros for it.
    feature_map: Features used by the model. Read with the keys
      ELEMENTS_BOX_ID, ELEMENTS_REF_MATCH_ID and ELEMENTS_NEIGHBORS_ID,
      depending on which flags are enabled.
    output_size: Desired output size of the encoding. Format: [length, width,
      depth]. It is concatenated with a Python list in the memop projection
      branch, so it has to be a list there.
    elements_mask: Mask for what elements items exist in the input.
    ref_enc: The referring expression encoded by the text_model. [batch_size,
      text_embed_size]. May be None, which disables the 'combine' and
      attention paths.
    flags: The input Flags.

  Returns:
    A tuple `(elements_enc, elements_enc_pre_atten)`: the final encoding of the
    elements data, and the encoding as it was right after the dense stack,
    before any attention weighting or projection was applied.
  """
  with tf.variable_scope('elements_model'):
    # Width of the per-element embedding; taken from the depth of the output.
    elements_item_size = output_size[2]

    if flags.use_elements_boxes:
      elements_boxes = tf.identity(feature_map[ELEMENTS_BOX_ID],
                                   ELEMENTS_BOX_ID)
      # Keep only the boxes of elements that exist according to the mask.
      flat_elements_boxes = tf.boolean_mask(elements_boxes, elements_mask)
    else:
      elements_boxes = None
      flat_elements_boxes = None

    # `ref_enc_tile` is only defined when `ref_enc` is given; its single use
    # below is guarded by the same `ref_enc is not None` condition.
    if ref_enc is not None:
      ref_enc_tile = tile_ref_enc_to_elements(ref_enc, elements_mask)

    elements_ref_match_enc = None
    if flags.use_elements_ref_match:
      elements_ref_match = tf.identity(
          feature_map[ELEMENTS_REF_MATCH_ID], ELEMENTS_REF_MATCH_ID)
      tf.summary.text('elements_ref_match', elements_ref_match)
      flat_elements_ref_match = tf.boolean_mask(elements_ref_match,
                                                elements_mask)
      elements_ref_match_enc = text_model(
          flat_elements_ref_match, flags.pretrained_elements_ref_match_model)

    # For combining the element with the referring expression.
    # Inputs that are None are dropped by filter_none before concatenation.
    if flags.merge_ref_elements_method == 'combine' and (ref_enc is not None):
      elements_enc = tf.concat(
          filter_none([
              elements_texts_enc, flat_elements_boxes, ref_enc_tile,
              elements_ref_match_enc
          ]), 1)
      elements_enc = tf.layers.dense(elements_enc, elements_item_size * 2,
                                     tf.nn.relu)
    else:
      # Paper results
      elements_enc = tf.concat(
          filter_none([
              elements_texts_enc, flat_elements_boxes, elements_ref_match_enc
          ]), 1)
      elements_enc = tf.layers.dense(elements_enc, elements_item_size,
                                     tf.nn.relu)

    neighbor_embed = None
    if flags.use_elements_neighbors:
      neighbor_embed = calc_neighbor_embed(
          feature_map[ELEMENTS_NEIGHBORS_ID], elements_enc, elements_mask)

    # NOTE(review): the layers in this function are unnamed, so their variable
    # names presumably depend on creation order -- keep the order stable to
    # stay compatible with existing checkpoints; confirm before reordering.
    elements_enc = tf.concat(filter_none([elements_enc, neighbor_embed]), 1)
    elements_enc = tf.layers.dense(elements_enc, elements_item_size,
                                   tf.nn.relu)

    # Snapshot handed to the attention helper below, taken before the "DNN".
    attend_in = elements_enc

    # "DNN"
    elements_enc = tf.nn.dropout(elements_enc, flags.elements_keep_prob)
    elements_enc = tf.layers.dense(elements_enc, elements_item_size,
                                   tf.nn.relu)
    elements_enc = tf.nn.dropout(elements_enc, flags.elements_keep_prob)
    elements_enc = tf.layers.dense(elements_enc, elements_item_size)

    # Second return value: the encoding before attention and projection.
    elements_enc_pre_atten = elements_enc

    if 'Atten' in flags.merge_ref_elements_method and (ref_enc is not None):
      with tf.variable_scope('attention'):
        if elements_texts_enc is None:
          # Prepad with 0s so the box embedding won't overlap with the ref_enc.
          single_dot_concat = tf.zeros([
              tf.shape(flat_elements_boxes)[0],
              ref_enc.get_shape().as_list()[1]
          ])
        else:
          single_dot_concat = elements_texts_enc
        single_dot_in = tf.concat(
            filter_none([
                single_dot_concat,
                flat_elements_boxes,
                neighbor_embed,
                elements_ref_match_enc,
            ]), 1)
        # Append a constant column of ones to every row.
        single_dot_in = tf.concat(
            [single_dot_in, tf.ones([tf.shape(single_dot_in)[0], 1])], 1)

        attention_mask = attention(ref_enc, attend_in, single_dot_in,
                                   elements_mask, True,
                                   flags.merge_ref_elements_method, flags)
        # Add a trailing axis so the mask broadcasts over the embedding width.
        attention_mask = tf.expand_dims(attention_mask, 1)

        elements_enc *= attention_mask

    # Projects the element embeddings into a 2d feature map.
    if flags.elements_proj_mode != 'tile':
      with tf.variable_scope('elements_proj'):
        # Projects the elements text onto the image feature map
        # on the corresponding bounding boxes.
        assert_op = tf.Assert(tf.equal(
            output_size[0], output_size[1]), [
                'Assumes height and width are the same.',
                feature_map[ELEMENTS_BOX_ID]
            ])
        with tf.control_dependencies([assert_op]):
          if flags.proj_elements_memop:
            # Iterate through all bounding boxes and embeddings to create
            # embedded bounding boxes and sum to the result vector iteratively.
            elements_enc = undo_mask(elements_enc, elements_mask)

            # Concatenate embeddings with boxes on axis 2 and swap the first
            # two axes so that tf.foldl iterates over what was axis 1.
            fold_elms = tf.transpose(
                tf.concat([elements_enc, elements_boxes], 2), [1, 0, 2])

            initializer = tf.zeros([tf.shape(elements_mask)[0]] + output_size)

            def fold_fn(total, fold_elm):
              # Split the concatenated slice back into [embedding, box].
              # `elements_enc` is read from the enclosing scope, which is
              # rebound right after the fold. NOTE(review): this relies on
              # tf.foldl invoking fold_fn before that rebinding -- confirm.
              elements_enc_boxes = tf.split(
                  fold_elm, [
                      tf.shape(elements_enc)[2],
                      tf.shape(elements_boxes)[2]
                  ], 1)
              return total + get_filled_rect(
                  elements_enc_boxes[1], elements_enc_boxes[0],
                  output_size[0], flags.elements_proj_mode)

            elements_enc = tf.foldl(fold_fn,
                                    fold_elms,
                                    initializer=initializer,
                                    swap_memory=True,
                                    parallel_iterations=2)
          else:
            # Create embedding of all bb then reduce sum
            elements_enc = get_filled_rect(
                flat_elements_boxes, elements_enc, output_size[0],
                flags.elements_proj_mode)

            elements_enc = undo_mask(elements_enc, elements_mask)

            elements_enc = tf.reduce_sum(elements_enc, axis=1)

          # Turn sum into average.
          mask_sum = tf.cast(
              tf.reduce_sum(tf.cast(elements_mask, tf.uint8), 1), tf.float32)
          mask_sum = tf.reshape(mask_sum, [-1, 1, 1, 1])
          # Replace zero counts by one to avoid dividing by zero.
          mask_sum = tf.where(tf.equal(mask_sum, 0),
                              tf.ones_like(mask_sum), mask_sum)
          elements_enc /= mask_sum
          tf.summary.histogram('elements_enc', elements_enc)

          # Average over axis 3 to get a single-channel image for the summary.
          elements_enc_for_disp = tf.reduce_mean(elements_enc, 3, keepdims=True)
          tf.summary.image('elements_enc_for_disp', elements_enc_for_disp, 4)
    else:
      # Undo the mask for feature mapping
      sequence_elements_enc = undo_mask(elements_enc, elements_mask)

      elements_enc = tf.reduce_mean(sequence_elements_enc, axis=1)
      tf.summary.histogram('elements_enc', elements_enc)
      if flags.elements_3d_output:
        elements_enc = tile_to_image(elements_enc, output_size)

    if flags.elements_3d_output:
      # Pin the static shape; only the leading dimension is left unknown.
      elements_enc.set_shape(
          [None, output_size[0], output_size[1], elements_item_size])

    # Last CNN layer of elements model
    if flags.elements_3d_output and flags.elements_cnn:
      elements_enc = tf.layers.conv2d(elements_enc,
                                      elements_enc.shape[3],
                                      3,
                                      padding='SAME',
                                      activation=tf.nn.relu,
                                      strides=1)
      elements_enc = tf.nn.dropout(elements_enc, flags.elements_keep_prob)
      elements_enc = tf.layers.conv2d(elements_enc,
                                      elements_enc.shape[3],
                                      3,
                                      padding='SAME',
                                      activation=None,
                                      strides=1)
  return elements_enc, elements_enc_pre_atten
def _calculate_eval_metrics_fn(
    loss,
    label_ids,
    logits,
    input_mask,
    aggregation_function_id,
    logits_aggregation,
    classification_class_index,
    logits_cls,
):
  """Builds the eval metrics for cell selection and the optional extra heads.

  A token counts as predicted-positive when its logit is >= 0. A sequence is
  counted as correct only if every token that is not masked out by
  `input_mask` matches its label.

  Args:
    loss: Loss value(s) to average into "eval_loss".
    label_ids: Rank-2 token labels, compared against the 0/1 predictions.
    logits: Rank-2 per-token logits.
    input_mask: Mask with the same layout as `label_ids`; positions where it
      is false/zero are ignored for accuracy and for the label mean.
    aggregation_function_id: Labels for the aggregation head.
    logits_aggregation: Logits of the aggregation head, or None to skip the
      aggregation and joint metrics.
    classification_class_index: Labels for the classification head.
    logits_cls: Logits of the classification head, or None to skip the
      classification metrics.

  Returns:
    A dict from metric name to whatever the corresponding `tf.metrics.*`
    function returns.
  """
  logits.shape.assert_has_rank(2)
  label_ids.shape.assert_has_rank(2)

  # <int32>[batch size, seq_length]
  positive = tf.ones_like(logits, dtype=tf.int32)
  negative = tf.zeros_like(logits, dtype=tf.int32)
  predictions = tf.where(logits >= 0, positive, negative)

  metrics = {}
  metrics["eval_loss"] = tf.metrics.mean(values=loss)

  # <bool>[batch size, seq_length]
  label_matches = tf.equal(label_ids, predictions)
  is_masked_out = tf.logical_not(tf.cast(input_mask, tf.bool))
  token_correct = tf.logical_or(label_matches, is_masked_out)

  # <bool>[batch size]
  per_sequence_accuracy = tf.reduce_all(token_correct, axis=1)
  metrics["eval_sequence_accuracy"] = tf.metrics.mean(
      values=per_sequence_accuracy)

  # Mean label value over the positions selected by the input mask.
  metrics["eval_mean_label"] = tf.metrics.mean(
      values=tf.cast(label_ids, tf.float32),
      weights=tf.cast(input_mask, tf.float32))

  if logits_cls is not None:
    # <int32>[batch size]
    predictions_cls = tf.argmax(logits_cls, axis=-1, output_type=tf.int32)
    metrics["eval_classification_accuracy"] = tf.metrics.accuracy(
        labels=classification_class_index, predictions=predictions_cls)
    metrics["eval_mean_per_class_classification_accuracy"] = (
        tf.metrics.mean_per_class_accuracy(
            labels=classification_class_index,
            predictions=predictions_cls,
            num_classes=logits_cls.shape[-1].value))

  if logits_aggregation is not None:
    # <int32>[batch size]
    predictions_agg = tf.argmax(logits_aggregation,
                                axis=-1,
                                output_type=tf.int32)
    metrics["eval_aggregation_accuracy"] = tf.metrics.accuracy(
        labels=aggregation_function_id, predictions=predictions_agg)

    # <bool>[batch size]
    agg_correct = tf.equal(aggregation_function_id, predictions_agg)
    # Whether cells and aggregation function predictions are both correct.
    joint_correct = tf.logical_and(agg_correct, per_sequence_accuracy)
    metrics["eval_joint_accuracy"] = tf.metrics.mean(values=joint_correct)

  return metrics
def blackout_pixel_weights_by_box_regions(height, width, boxes, blackout,
                                          weights=None):
  """Builds a per-pixel weight map from weighted and blacked-out boxes.

  The resulting mask (usually in the output image dimension) is meant for
  ignoring or re-weighting regions when computing a loss. Rules:
    - A pixel covered by no box gets weight 1.0.
    - A pixel covered by boxes gets the maximum of those boxes' weights.
    - A pixel covered by any box with blackout=True gets weight 0.0, no
      matter which other boxes or weights also cover it.

  A pixel is inside a box when `min <= coordinate < max` on both axes.

  Example:
    height = 4
    width = 4
    boxes = [[0., 0., 2., 2.],
             [0., 0., 4., 2.],
             [3., 0., 4., 4.]]
    blackout = [False, False, True]
    weights = [4.0, 3.0, 2.0]
    blackout_pixel_weights_by_box_regions(height, width, boxes, blackout,
                                          weights)
    >> [[4.0, 4.0, 1.0, 1.0],
        [4.0, 4.0, 1.0, 1.0],
        [3.0, 3.0, 1.0, 1.0],
        [0.0, 0.0, 0.0, 0.0]]

  Args:
    height: int, height of the (output) image.
    width: int, width of the (output) image.
    boxes: A float tensor with shape [num_instances, 4]; the columns are read
      as y_min, x_min, y_max, x_max.
    blackout: A boolean tensor with shape [num_instances] indicating whether to
      blackout (zero-out) the weights within the box regions.
    weights: An optional float32 tensor with shape [num_instances] giving the
      value to apply in each box region. If None, all weights are 1.

  Returns:
    A float tensor with shape [height, width] holding the weight of each pixel.
  """
  num_instances, _ = shape_utils.combined_static_and_dynamic_shape(boxes)
  # With no annotation instance at all, return all ones (instead of
  # unexpected values) to avoid a NaN loss value.
  if num_instances == 0:
    return tf.ones([height, width], dtype=tf.float32)

  def _spread_over_image(per_instance):
    """Repeats a [num_instances] vector over a [height, width] grid."""
    return tf.tile(per_instance[:, tf.newaxis, tf.newaxis],
                   [1, height, width])

  grid_y, grid_x = image_shape_to_grids(height, width)
  grid_y = tf.expand_dims(grid_y, axis=0)
  grid_x = tf.expand_dims(grid_x, axis=0)

  top = tf.expand_dims(boxes[:, 0:1], axis=-1)
  left = tf.expand_dims(boxes[:, 1:2], axis=-1)
  bottom = tf.expand_dims(boxes[:, 2:3], axis=-1)
  right = tf.expand_dims(boxes[:, 3:], axis=-1)

  # Membership of every pixel in every box.
  # Shape: [num_instances, height, width]
  rows_inside = tf.math.logical_and(grid_y >= top, grid_y < bottom)
  cols_inside = tf.math.logical_and(grid_x >= left, grid_x < right)
  in_boxes = tf.math.logical_and(rows_inside, cols_inside)

  if weights is None:
    weights = tf.ones_like(blackout, dtype=tf.float32)

  # Per pixel: the largest weight among the covering boxes, 0.0 if uncovered.
  tiled_weights = _spread_over_image(weights)
  boxed_weights = tf.where(in_boxes, tiled_weights,
                           tf.zeros_like(tiled_weights))
  max_weight = tf.math.maximum(tf.math.reduce_max(boxed_weights, axis=0), 0.0)

  # Pixels outside every box fall back to the default weight 1.0.
  covered = tf.math.reduce_any(in_boxes, axis=0)
  pixel_weights = tf.where(covered, max_weight, tf.ones_like(max_weight))

  # Per pixel: 0.0 if any covering box is a blackout box, otherwise 1.0.
  keep_flag = tf.cast(tf.math.logical_not(blackout), tf.float32)
  tiled_keep = _spread_over_image(keep_flag)
  boxed_keep = tf.where(in_boxes, tiled_keep, tf.ones_like(tiled_keep))
  keep_mask = tf.math.reduce_min(boxed_keep, axis=0)

  return pixel_weights * keep_mask
def _stitch(features):
  """Stitch features on the first dimension.

  Joins the per-step entries of every sequence into one sequence: the step
  token lists in `features['task']` are joined with randomly chosen connector
  tokens and compacted to the left, reference spans in the `*_refs` features
  are shifted to point into the joined token sequence, and an eos marker is
  appended to 'task', 'verbs' and 'verb_refs'.

  Token ids greater than 1 are treated as real tokens throughout; id 1 is what
  gets written as eos (presumably 0 is padding -- confirm against the
  vocabulary).

  NOTE: this function modifies `features` in place: 'task' is overwritten and
  several other entries are replaced by their squeezed version.

  Args:
    features: dict of tensors. 'task', 'verbs' and 'verb_refs' are read
      unconditionally; 'raw_task' is optional. 'task' is indexed as
      [num_sequences, num_steps, tokens] and the `*_refs` entries as
      [num_sequences, num_steps, 1, 2].

  Returns:
    A new dict with the stitched version of each feature.
  """
  # True for real tokens; a step counts as present if it has any real token.
  full_mask = tf.greater(features['task'], 1)
  step_mask = tf.reduce_any(full_mask, axis=-1)
  # Shift the step mask left by one: a step is flagged when the *next* step
  # exists, so the last existing step gets no connector after it.
  step_mask_exclude_last = tf.pad(step_mask,
                                  [[0, 0], [0, 1]],
                                  constant_values=False)[:, 1:]
  num_sequences = common_layers.shape_list(features['task'])[0]
  num_steps = common_layers.shape_list(features['task'])[1]
  connectors = tf.constant(PADDED_CONCATENATORS)
  # Select connectors
  connector_indices = tf.random.uniform([num_sequences * num_steps],
                                        minval=0,
                                        maxval=len(PADDED_CONCATENATORS),
                                        dtype=tf.int32)
  selected_connectors = tf.reshape(
      tf.gather(connectors, connector_indices),
      [num_sequences, num_steps, len(PADDED_CONCATENATORS[0])])
  # Zero out the connector wherever no following step exists.
  selected_connectors = tf.multiply(selected_connectors,
                                    tf.expand_dims(
                                        tf.to_int32(step_mask_exclude_last), 2),
                                    name='connector_mask')
  # Append each step's connector tokens after that step's own tokens.
  features['task'] = tf.concat([features['task'], selected_connectors],
                               axis=-1)
  # For every step: number of real tokens (connectors included) in all
  # preceding steps, i.e. where the step starts in the compacted sequence.
  ref_offsets = tf.expand_dims(
      tf.cumsum(tf.reduce_sum(tf.to_int32(tf.greater(features['task'], 1)), -1),
                exclusive=True,
                axis=-1), 2)
  # Flatten the steps of each sequence into one row.
  features['task'] = tf.reshape(features['task'], [num_sequences, -1])
  full_mask = tf.greater(features['task'], 1)
  full_mask_int = tf.to_int32(full_mask)
  # Left-compact each row: the real tokens (in order) are written to the first
  # `count` positions of their row through a sparse tensor.
  indices = tf.where(
      tf.sequence_mask(lengths=tf.reduce_sum(full_mask_int, -1)))
  values = tf.boolean_mask(tf.reshape(features['task'], [-1]),
                           tf.reshape(full_mask, [-1]))
  sparse_task = tf.sparse.SparseTensor(indices=indices,
                                       values=values,
                                       dense_shape=tf.to_int64(
                                           tf.shape(features['task'])))
  # Stitch task and raw_task
  stitched_features = {}
  stitched_features['task'] = tf.sparse_tensor_to_dense(sparse_task)
  # Trim trailing columns that hold no real token in any row.
  max_len = tf.reduce_max(
      tf.reduce_sum(tf.to_int32(tf.greater(stitched_features['task'], 1)), -1))
  stitched_features['task'] = stitched_features['task'][:, :max_len]
  if 'raw_task' in features:
    # Reuse the same random indices so the string connectors are picked with
    # the same index as the token connectors.
    connector_strs = tf.reshape(
        tf.gather(tf.constant(CONCATENATORS_STR), connector_indices),
        [num_sequences, num_steps])
    masked_connector_strs = tf.where(step_mask_exclude_last, connector_strs,
                                     tf.fill(tf.shape(connector_strs), ''))
    # Join every step string with its connector, then join all steps.
    stitched_features['raw_task'] = tf.strings.reduce_join(
        tf.strings.reduce_join(tf.concat([
            tf.expand_dims(features['raw_task'], 2),
            tf.expand_dims(masked_connector_strs, 2)
        ],
                                         axis=2),
                               axis=-1), -1)
  # Stitch screen sequences
  # Count the steps whose verb reference has end > start. This must run before
  # the loop below, which squeezes 'verb_refs' from rank 4 to rank 3.
  action_lengths = tf.reduce_sum(
      tf.to_int32(
          tf.greater(features['verb_refs'][:, :, 0, 1],
                     features['verb_refs'][:, :, 0, 0])), -1)
  max_action_length = tf.reduce_max(action_lengths)

  def _pad(tensor, padding_value=0):
    """Crops axis 1 to `max_action_length` and appends one padded slot."""
    shape_list = common_layers.shape_list(tensor)
    assert len(shape_list) >= 2
    padding_list = [[0, 0], [0, 1]] + [[0, 0]] * (len(shape_list) - 2)
    return tf.pad(tensor[:, :max_action_length],
                  padding_list,
                  constant_values=padding_value)

  for key in features.keys():
    if key.endswith('_refs'):
      features[key] = tf.squeeze(features[key], 2)
      # Only spans whose start differs from their end are shifted and kept;
      # all others are zeroed.
      ref_mask = tf.expand_dims(
          tf.to_int32(
              tf.not_equal(features[key][:, :, 0], features[key][:, :, 1])), 2)
      stitched_features[key] = tf.multiply((features[key] + ref_offsets),
                                           ref_mask,
                                           name='ref_mask')
      stitched_features[key] = _pad(stitched_features[key])
    # NOTE: the `*_refs` names in this list can never reach this branch because
    # the branch above already handles every key ending in '_refs'.
    elif key in [
        'verbs', 'objects', 'consumed', 'obj_dom_pos', 'obj_text', 'obj_type',
        'obj_clickable', 'obj_screen_pos', 'verb_refs', 'obj_refs',
        'input_refs', 'obj_dom_dist'
    ]:
      features[key] = tf.squeeze(features[key], 2)
      stitched_features[key] = features[key]
      stitched_features[key] = _pad(
          stitched_features[key],
          padding_value=-1 if key == 'obj_type' else 0)
    elif key not in ['task', 'raw_task']:
      # Any other feature: keep only index 0 along axis 1.
      stitched_features[key] = features[key][:, 0]
  # Append eos to 'task'
  stitched_features['task'] = tf.pad(stitched_features['task'],
                                     [[0, 0], [0, 1]])
  task_mask = tf.to_int32(tf.greater(stitched_features['task'], 1))
  # Shifting the mask right by one and subtracting the mask leaves a single 1
  # at the first position after the real tokens; adding it writes the eos id.
  task_eos_mask = tf.pad(task_mask, [[0, 0], [1, 0]],
                         constant_values=1)[:, :-1]
  stitched_features['task'] = stitched_features['task'] + (task_eos_mask -
                                                           task_mask)
  # Append eos
  # Same shifted-mask trick as for 'task', applied to 'verbs'.
  verb_mask = tf.to_int32(tf.greater(stitched_features['verbs'], 1))
  verb_eos_mask = tf.pad(verb_mask, [[0, 0], [1, 0]],
                         constant_values=1)[:, :-1]
  verb_eos = verb_eos_mask - verb_mask
  stitched_features['verbs'] = stitched_features['verbs'] + verb_eos
  # Append last step refs to 'verb_refs'
  # Column index of the eos token (value 1) in 'task'; the eos span is
  # [position, position + 1].
  task_lengths = tf.where(tf.equal(stitched_features['task'], 1))[:, 1]
  eos_pos = tf.to_int32(tf.stack([task_lengths, task_lengths + 1], axis=1))
  action_mask = tf.to_int32(
      tf.sequence_mask(action_lengths, max_action_length + 1))
  action_and_eos_mask = tf.pad(action_mask, [[0, 0], [1, 0]],
                               constant_values=1)[:, :-1]
  # 1 only at the slot right after the last action of each sequence.
  verb_ref_eos = action_and_eos_mask - action_mask
  eos_refs = tf.multiply(tf.tile(tf.expand_dims(eos_pos, 1),
                                 [1, max_action_length + 1, 1]),
                         tf.expand_dims(verb_ref_eos, 2),
                         name='verb_ref_eos')
  stitched_features['verb_refs'] += eos_refs
  return stitched_features