def body(index, summation):
  processed = tf.slice(stacked_tensor, [0, index - 1, 0], [-1, -1, -1])
  summand = tf.reduce_mean(processed, 1)
  return tf.subtract(index, 1), tf.add(summation, summand)
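# A minimal sketch of the slice in `body` above, with an illustrative
# `stacked_tensor` of shape [batch, time, features]: starting at
# [0, index - 1, 0] with sizes [-1, -1, -1] keeps every frame from index - 1
# onward, so each loop iteration averages a shorter suffix of the sequence.
import tensorflow as tf

stacked_tensor = tf.zeros([8, 10, 32])
index = tf.constant(4)
suffix = tf.slice(stacked_tensor, [0, index - 1, 0], [-1, -1, -1])  # evaluates to [8, 7, 32]
suffix_mean = tf.reduce_mean(suffix, 1)                             # [8, 32]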
def get_baseline_batch(self, hparams):
  """Get the Tensor expressions from the reader.

  Args:
    hparams: Hyperparameters object with specgram parameters.

  Returns:
    A dict of key:tensor pairs, including "pitch", "velocity", "audio",
    "instrument_source", "instrument_family", "qualities", and "spectrogram".
  """
  example = self.get_example(hparams.batch_size)
  audio = tf.slice(example["audio"], [0], [64000])
  audio = tf.reshape(audio, [1, 64000])
  pitch = tf.slice(example["pitch"], [0], [1])
  velocity = tf.slice(example["velocity"], [0], [1])
  instrument_source = tf.slice(example["instrument_source"], [0], [1])
  instrument_family = tf.slice(example["instrument_family"], [0], [1])
  qualities = tf.slice(example["qualities"], [0], [10])
  qualities = tf.reshape(qualities, [1, 10])

  # Get specgrams.
  hop_length = hparams.hop_length
  n_fft = hparams.n_fft
  if hop_length and n_fft:
    specgram = utils.tf_specgram(
        audio,
        n_fft=n_fft,
        hop_length=hop_length,
        mask=hparams.mask,
        log_mag=hparams.log_mag,
        re_im=hparams.re_im,
        dphase=hparams.dphase,
        mag_only=hparams.mag_only)
    shape = [1] + SPECGRAM_REGISTRY[(n_fft, hop_length)]
    if hparams.mag_only:
      shape[-1] = 1
    specgram = tf.reshape(specgram, shape)
    tf.logging.info("SPECGRAM BEFORE PADDING: %s", specgram)

    if hparams.pad:
      # Pad and crop specgram to 256x256.
      num_padding = 2**int(np.ceil(np.log(shape[2]) / np.log(2))) - shape[2]
      tf.logging.info("num_padding: %d", num_padding)
      specgram = tf.reshape(specgram, shape)
      specgram = tf.pad(specgram, [[0, 0], [0, 0], [0, num_padding], [0, 0]])
      specgram = tf.slice(specgram, [0, 0, 0, 0], [-1, shape[1] - 1, -1, -1])
      tf.logging.info("SPECGRAM AFTER PADDING: %s", specgram)

  # Form a batch.
  if self.is_training:
    (audio, velocity, pitch, specgram, instrument_source, instrument_family,
     qualities) = tf.train.shuffle_batch(
         [
             audio, velocity, pitch, specgram, instrument_source,
             instrument_family, qualities
         ],
         batch_size=hparams.batch_size,
         capacity=20 * hparams.batch_size,
         min_after_dequeue=10 * hparams.batch_size,
         enqueue_many=True)
  elif hparams.batch_size > 1:
    (audio, velocity, pitch, specgram, instrument_source, instrument_family,
     qualities) = tf.train.batch(
         [
             audio, velocity, pitch, specgram, instrument_source,
             instrument_family, qualities
         ],
         batch_size=hparams.batch_size,
         capacity=10 * hparams.batch_size,
         enqueue_many=True)

  audio.set_shape([hparams.batch_size, 64000])

  batch = dict(
      pitch=pitch,
      velocity=velocity,
      audio=audio,
      instrument_source=instrument_source,
      instrument_family=instrument_family,
      qualities=qualities,
      spectrogram=specgram)
  return batch
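# A minimal, self-contained sketch of the pad-and-crop step above, assuming
# SPECGRAM_REGISTRY yields a specgram shape of [1, 257, 251, 2] (values are
# illustrative): pad the time axis up to the next power of two, then drop the
# last frequency bin so both spatial dimensions become 256.
import numpy as np
import tensorflow as tf

shape = [1, 257, 251, 2]
specgram = tf.zeros(shape)
num_padding = 2**int(np.ceil(np.log(shape[2]) / np.log(2))) - shape[2]  # 5
specgram = tf.pad(specgram, [[0, 0], [0, 0], [0, num_padding], [0, 0]])
specgram = tf.slice(specgram, [0, 0, 0, 0], [-1, shape[1] - 1, -1, -1])
# specgram now has shape [1, 256, 256, 2]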
def main(_): tf.logging.set_verbosity(tf.logging.INFO) with tf.Graph().as_default(): # Loads content images. eval_content_inputs_, _ = image_utils.imagenet_inputs( FLAGS.batch_size, FLAGS.image_size) # Process style and content weight flags. content_weights = ast.literal_eval(FLAGS.content_weights) style_weights = ast.literal_eval(FLAGS.style_weights) # Loads evaluation style images. eval_style_inputs_, _, _ = image_utils.arbitrary_style_image_inputs( FLAGS.eval_style_dataset_file, batch_size=FLAGS.batch_size, image_size=FLAGS.image_size, center_crop=True, shuffle=True, augment_style_images=False, random_style_image_size=False) # Computes stylized noise. stylized_noise, _, _, _ = build_model.build_model( tf.random_uniform( [min(4, FLAGS.batch_size), FLAGS.image_size, FLAGS.image_size, 3]), tf.slice(eval_style_inputs_, [0, 0, 0, 0], [min(4, FLAGS.batch_size), -1, -1, -1]), trainable=False, is_training=False, reuse=None, inception_end_point='Mixed_6e', style_prediction_bottleneck=100, adds_losses=False) # Computes stylized images. stylized_images, _, loss_dict, _ = build_model.build_model( eval_content_inputs_, eval_style_inputs_, trainable=False, is_training=False, reuse=True, inception_end_point='Mixed_6e', style_prediction_bottleneck=100, adds_losses=True, content_weights=content_weights, style_weights=style_weights, total_variation_weight=FLAGS.total_variation_weight) # Adds Image summaries to the tensorboard. tf.summary.image('image/{}/0_eval_content_inputs'.format(FLAGS.eval_name), eval_content_inputs_, 3) tf.summary.image('image/{}/1_eval_style_inputs'.format(FLAGS.eval_name), eval_style_inputs_, 3) tf.summary.image('image/{}/2_eval_stylized_images'.format(FLAGS.eval_name), stylized_images, 3) tf.summary.image('image/{}/3_stylized_noise'.format(FLAGS.eval_name), stylized_noise, 3) metrics = {} for key, value in loss_dict.items(): metrics[key] = tf.metrics.mean(value) names_values, names_updates = slim.metrics.aggregate_metric_map(metrics) for name, value in names_values.items(): slim.summaries.add_scalar_summary(value, name, print_summary=True) eval_op = list(names_updates.values()) num_evals = FLAGS.num_evaluation_styles / FLAGS.batch_size slim.evaluation.evaluation_loop( master=FLAGS.master, checkpoint_dir=FLAGS.checkpoint_dir, logdir=FLAGS.eval_dir, eval_op=eval_op, num_evals=num_evals, eval_interval_secs=FLAGS.eval_interval_secs)
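# A minimal sketch of the style-input slice above (shapes are illustrative):
# only the first min(4, batch_size) style images are kept so the
# stylized-noise branch and its image summary stay small.
import tensorflow as tf

batch_size, image_size = 16, 256
style_inputs = tf.zeros([batch_size, image_size, image_size, 3])
few_styles = tf.slice(style_inputs, [0, 0, 0, 0],
                      [min(4, batch_size), -1, -1, -1])
# few_styles has shape [4, 256, 256, 3]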
def body(self, features): hparams = self.hparams input_shape = common_layers.shape_list(features['inputs']) batch_size, _, frame_width, frame_height, frame_channels = input_shape # pylint: disable=unused-variable # Swap time and batch axes. input_frames = common_video.swap_time_and_batch_axes( tf.to_float(features['inputs'])) target_frames = common_video.swap_time_and_batch_axes( features['targets']) # Get actions if exist otherwise use zeros input_actions = self.get_input_if_exists( features, 'input_action', batch_size, hparams.video_num_input_frames) target_actions = self.get_input_if_exists( features, 'target_action', batch_size, hparams.video_num_target_frames) # Get rewards if exist otherwise use zeros # TODO(blazej) enable rewards. # input_rewards = self.get_input_if_exists( # features, 'input_reward', batch_size, hparams.video_num_input_frames) # target_rewards = self.get_input_if_exists( # features, 'target_reward', batch_size,hparams.video_num_target_frames) # all_rewards = tf.concat([input_rewards, target_rewards], axis=0) all_actions = tf.concat([input_actions, target_actions], axis=0) # flatten actions tensor to have the shape: framesXbatch_sizeXaction_dims. actions_shape = common_layers.shape_list(all_actions) all_actions = tf.reshape(all_actions, [ actions_shape[0], -1, reduce(lambda x, y: x * y, actions_shape[2:]) ]) all_frames = tf.concat([input_frames, target_frames], axis=0) all_frames = tf.unstack(all_frames, axis=0) all_actions = tf.unstack(all_actions, axis=0) # TODO(blazej) - most likely this downsize is too strong. all_frames = [ tf.image.resize_images(image, (IMG_HEIGHT, IMG_WIDTH), method=tf.image.ResizeMethod.BICUBIC) for image in all_frames ] enc_out_all, pred_out_all, _, van_on_enc_all = construct_model( all_frames, all_actions, context_frames=hparams.context_frames, hparams=hparams, is_training=self.is_training) enc_pred_loss, _ = calc_loss_psnr( enc_out_all[1:], pred_out_all, 'enc_pred_loss', hparams=hparams, use_l1_loss=hparams.enc_pred_use_l1_loss) van_on_enc_loss, _ = calc_loss_psnr(van_on_enc_all, all_frames[1:], 'van_on_enc_loss', hparams=hparams) enc_pred_loss_scale_delay = max(hparams.enc_pred_loss_scale_delay, 1) enc_pred_loss_scale = tf.nn.sigmoid( (tf.to_float(tf.train.get_or_create_global_step()) - enc_pred_loss_scale_delay) / (enc_pred_loss_scale_delay * .1)) * hparams.enc_pred_loss_scale tf.summary.scalar('enc_pred_loss_scale', enc_pred_loss_scale) epva_loss = enc_pred_loss * enc_pred_loss_scale + van_on_enc_loss tf.summary.scalar('epva_loss', epva_loss) predictions = tf.stack(van_on_enc_all) if hparams.clip_pixel_values: predictions = tf.clip_by_value(predictions, 0.0, 1.0) # TODO(mbz): clean this up! def fix_video_dims_and_concat_on_x_axis(x): x = tf.transpose(x, [1, 3, 4, 0, 2]) x = tf.reshape(x, [batch_size, frame_height, frame_channels, -1]) x = tf.transpose(x, [0, 3, 1, 2]) return x frames_gd = fix_video_dims_and_concat_on_x_axis(target_frames) frames_pd = fix_video_dims_and_concat_on_x_axis(predictions) side_by_side_video = tf.concat([frames_gd, frames_pd], axis=1) tf.summary.image('full_video', side_by_side_video) predictions = tf.unstack(predictions) predictions = [ tf.image.resize_images(image, (frame_width, frame_height), method=tf.image.ResizeMethod.BICUBIC) for image in predictions ] predictions = tf.stack(predictions) predictions = common_video.swap_time_and_batch_axes(predictions) predictions = tf.slice( predictions, [0, hparams.video_num_input_frames - 1, 0, 0, 0], [-1] * 5) return predictions, {'extra': epva_loss}
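# A minimal sketch of the final slice in `body` above, with illustrative frame
# counts: dropping the first video_num_input_frames - 1 steps keeps the last
# input frame plus all predicted target frames along the time axis.
import tensorflow as tf

num_input_frames, num_target_frames = 4, 6
predictions = tf.zeros([2, num_input_frames + num_target_frames, 64, 64, 3])
kept = tf.slice(predictions, [0, num_input_frames - 1, 0, 0, 0], [-1] * 5)
# kept has shape [2, 7, 64, 64, 3]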
def random_image_crop(image, boxes, min_object_covered=0.9, aspect_ratio_range=(0.75, 1.33), area_range=(0.5, 1.0), overlap_threshold=0.3): """ Performs random crop. Given the input image and its bounding boxes, this op randomly crops a subimage. Given a user-provided set of input constraints, the crop window is resampled until it satisfies these constraints. If within 100 trials it is unable to find a valid crop, the original image is returned. Both input boxes and returned boxes are in normalized form (e.g., lie in the unit square [0, 1]). Arguments: image: a float tensor with shape [height, width, 3]. boxes: a float tensor containing bounding boxes. It has shape [num_boxes, 4]. Boxes are in normalized form, meaning their coordinates vary between [0, 1]. Each row is in the form of [ymin, xmin, ymax, xmax]. min_object_covered: the cropped image must cover at least this fraction of at least one of the input bounding boxes. aspect_ratio_range: allowed range for aspect ratio of cropped image. area_range: allowed range for area ratio between cropped image and the original image. overlap_threshold: minimum overlap thresh with new cropped image to keep the box. Returns: image: cropped image, a float tensor with shape [None, None, 3]. boxes: a float tensor with shape [num_remaining, 4], remaining boxes. Where 0 <= num_remaining <= num_boxes. window: a float tensor with shape [4], in normalized coordinates. keep_indices: an int tensor with shape [num_remaining], indices of remaining boxes in input boxes tensor. """ sample_distorted_bounding_box = tf.image.sample_distorted_bounding_box( tf.shape(image), bounding_boxes=tf.expand_dims(boxes, 0), min_object_covered=min_object_covered, aspect_ratio_range=aspect_ratio_range, area_range=area_range, max_attempts=100, use_image_if_no_bounding_boxes=True) begin, size, window = sample_distorted_bounding_box image = tf.slice(image, begin, size) image.set_shape([None, None, 3]) window = tf.squeeze(window, axis=[0, 1]) # remove boxes that are completely outside the cropped image boxes, inside_window_ids = prune_completely_outside_window(boxes, window) # why do i need this function? i believe the one below is enough # remove boxes that are too much outside the cropped image boxes, keep_indices = prune_non_overlapping_boxes( boxes, tf.expand_dims(window, 0), min_overlap=overlap_threshold) # change coordinates of the remaining boxes boxes = change_coordinate_frame(boxes, window) keep_indices = tf.gather(inside_window_ids, keep_indices) return image, boxes, window, keep_indices
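# A minimal usage sketch of random_image_crop above, with a dummy image and a
# single box covering the center; the helpers it relies on
# (prune_completely_outside_window, prune_non_overlapping_boxes,
# change_coordinate_frame) are assumed to be importable from the same module.
import tensorflow as tf

image = tf.random_uniform([480, 640, 3])
boxes = tf.constant([[0.25, 0.25, 0.75, 0.75]], dtype=tf.float32)
cropped, cropped_boxes, window, kept = random_image_crop(
    image, boxes, min_object_covered=0.9, overlap_threshold=0.3)
# `cropped_boxes` are re-expressed in the coordinate frame of `window`, and
# `kept` indexes the surviving boxes in the original `boxes` tensor.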
def call(self, inputs, prev_state): """Evaluates one timestep of the current neural stack cell. See section 3.4 of Grefenstette et al., 2015. Args: inputs: The inputs to the neural stack cell should be a tf.float32 tensor with shape [batch_size, embedding_size] prev_state: The NeuralStackState from the previous timestep. Returns: A tuple of the output of the stack as well as the new NeuralStackState. """ batch_size = tf.shape(inputs)[0] # Call the controller and get controller interface values. with tf.control_dependencies([prev_state.read_strengths]): controller_output = self.call_controller( inputs, prev_state.read_values, prev_state.controller_state, batch_size) # Always write input values to memory regardless of push strength. # See Equation-1 in Grefenstette et al., 2015. new_memory_values = prev_state.memory_values + tf.reduce_sum( tf.expand_dims(controller_output.write_values, axis=2) * prev_state.write_strengths, axis=1) # Attenuate the read strengths of existing memory values depending on the # current pop strength. # See Equation-2 in Grefenstette et al., 2015. new_read_strengths = prev_state.read_strengths for h in range(self._num_read_heads - 1, -1, -1): new_read_strengths = tf.nn.relu(new_read_strengths - tf.nn.relu( tf.slice(controller_output.pop_strengths, [0, h, 0, 0], [-1, 1, -1, -1]) - tf.expand_dims(tf.reduce_sum( new_read_strengths * self.get_read_mask(h), axis=2), axis=3))) # Combine all write heads and their associated push values into a single set # of read weights. new_read_strengths += tf.reduce_sum(controller_output.push_strengths * prev_state.write_strengths, axis=1, keep_dims=True) # Calculate the "top" value of the stack by looking at read strengths. # See Equation-3 in Grefenstette et al., 2015. new_read_values = tf.reduce_sum( tf.minimum( new_read_strengths, tf.nn.relu(1 - tf.expand_dims(tf.reduce_sum( new_read_strengths * tf.concat([ self.get_read_mask(h) for h in range(self._num_read_heads) ], axis=1), axis=2), axis=3))) * tf.expand_dims(new_memory_values, axis=1), axis=2) # Temporarily split write strengths apart so they can be shifted in # different directions. write_strengths_by_head = tf.split(prev_state.write_strengths, self._num_write_heads, axis=1) # Shift the write strengths for each write head in the direction indicated # by get_write_head_offset(). new_write_strengths = tf.concat([ tf.roll( write_strength, shift=self.get_write_head_offset(h), axis=2) for h, write_strength in enumerate(write_strengths_by_head) ], axis=1) return (controller_output.outputs, NeuralStackState(controller_state=controller_output.state, read_values=new_read_values, memory_values=new_memory_values, read_strengths=new_read_strengths, write_strengths=new_write_strengths))
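# A minimal sketch of the per-head slice inside the attenuation loop above:
# pull read head h out of a pop-strengths tensor of shape
# [batch_size, num_read_heads, 1, 1] while keeping every other dimension.
import tensorflow as tf

pop_strengths = tf.zeros([8, 2, 1, 1])
h = 1
head_h = tf.slice(pop_strengths, [0, h, 0, 0], [-1, 1, -1, -1])
# head_h has shape [8, 1, 1, 1]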
def metric_fn(**kwargs):
  """Returns a dictionary that has the evaluation metrics."""
  if params['nms_configs'].get('pyfunc', True):
    detections_bs = []
    nms_configs = params['nms_configs']
    for index in range(kwargs['boxes'].shape[0]):
      detections = tf.numpy_function(
          functools.partial(nms_np.per_class_nms, nms_configs=nms_configs),
          [
              kwargs['boxes'][index],
              kwargs['scores'][index],
              kwargs['classes'][index],
              tf.slice(kwargs['image_ids'], [index], [1]),
              tf.slice(kwargs['image_scales'], [index], [1]),
              params['num_classes'],
              nms_configs['max_output_size'],
          ], tf.float32)
      detections_bs.append(detections)
    detections_bs = postprocess.transform_detections(tf.stack(detections_bs))
  else:
    # These two branches should be equivalent, but currently they are not.
    # TODO(tanmingxing): enable the non_pyfunc path after the bug is fixed.
    nms_boxes, nms_scores, nms_classes, _ = postprocess.per_class_nms(
        params, kwargs['boxes'], kwargs['scores'], kwargs['classes'],
        kwargs['image_scales'])
    img_ids = tf.cast(
        tf.expand_dims(kwargs['image_ids'], -1), nms_scores.dtype)
    detections_bs = [
        img_ids * tf.ones_like(nms_scores),
        nms_boxes[:, :, 1],
        nms_boxes[:, :, 0],
        nms_boxes[:, :, 3] - nms_boxes[:, :, 1],
        nms_boxes[:, :, 2] - nms_boxes[:, :, 0],
        nms_scores,
        nms_classes,
    ]
    detections_bs = tf.stack(detections_bs, axis=-1, name='detections')

  if params.get('testdev_dir', None):
    logging.info('Eval testdev_dir %s', params['testdev_dir'])
    eval_metric = coco_metric.EvaluationMetric(
        testdev_dir=params['testdev_dir'])
    coco_metrics = eval_metric.estimator_metric_fn(detections_bs,
                                                   tf.zeros([1]))
  else:
    logging.info('Eval val with groundtruths %s.', params['val_json_file'])
    eval_metric = coco_metric.EvaluationMetric(
        filename=params['val_json_file'], label_map=params['label_map'])
    coco_metrics = eval_metric.estimator_metric_fn(
        detections_bs, kwargs['groundtruth_data'])

  # Add metrics to output.
  cls_loss = tf.metrics.mean(kwargs['cls_loss_repeat'])
  box_loss = tf.metrics.mean(kwargs['box_loss_repeat'])
  output_metrics = {
      'cls_loss': cls_loss,
      'box_loss': box_loss,
  }
  output_metrics.update(coco_metrics)
  return output_metrics
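# A minimal sketch of the per-example slices passed to the NMS numpy_function
# above: element `index` of the batch-level id and scale vectors is extracted
# as a length-1 tensor (values are illustrative).
import tensorflow as tf

image_ids = tf.constant([101, 102, 103])
image_scales = tf.constant([1.0, 0.8, 1.2])
index = 1
image_id = tf.slice(image_ids, [index], [1])        # [102]
image_scale = tf.slice(image_scales, [index], [1])  # [0.8]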
def model_fn(features, labels, mode, params=None): """Constructs the object detection model. Args: features: Dictionary of feature tensors, returned from `input_fn`. labels: Dictionary of groundtruth tensors if mode is TRAIN or EVAL, otherwise None. mode: Mode key from tf.estimator.ModeKeys. params: Parameter dictionary passed from the estimator. Returns: An `EstimatorSpec` that encapsulates the model and its serving configurations. """ params = params or {} total_loss, train_op, detections, export_outputs = None, None, None, None is_training = mode == tf.estimator.ModeKeys.TRAIN # Make sure to set the Keras learning phase. True during training, # False for inference. tf.keras.backend.set_learning_phase(is_training) # Set policy for mixed-precision training with Keras-based models. if use_tpu and train_config.use_bfloat16: from tensorflow.python.keras.engine import base_layer_utils # pylint: disable=g-import-not-at-top # Enable v2 behavior, as `mixed_bfloat16` is only supported in TF 2.0. base_layer_utils.enable_v2_dtype_behavior() tf.compat.v2.keras.mixed_precision.experimental.set_policy( 'mixed_bfloat16') detection_model = detection_model_fn(is_training=is_training, add_summaries=(not use_tpu)) scaffold_fn = None if mode == tf.estimator.ModeKeys.TRAIN: labels = unstack_batch(labels, unpad_groundtruth_tensors=train_config. unpad_groundtruth_tensors) elif mode == tf.estimator.ModeKeys.EVAL: # For evaling on train data, it is necessary to check whether groundtruth # must be unpadded. boxes_shape = (labels[fields.InputDataFields.groundtruth_boxes]. get_shape().as_list()) unpad_groundtruth_tensors = boxes_shape[ 1] is not None and not use_tpu labels = unstack_batch( labels, unpad_groundtruth_tensors=unpad_groundtruth_tensors) if mode in (tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL): provide_groundtruth(detection_model, labels) preprocessed_images = features[fields.InputDataFields.image] side_inputs = detection_model.get_side_inputs(features) if use_tpu and train_config.use_bfloat16: with contrib_tpu.bfloat16_scope(): prediction_dict = detection_model.predict( preprocessed_images, features[fields.InputDataFields.true_image_shape], **side_inputs) prediction_dict = ops.bfloat16_to_float32_nested( prediction_dict) else: prediction_dict = detection_model.predict( preprocessed_images, features[fields.InputDataFields.true_image_shape], **side_inputs) def postprocess_wrapper(args): return detection_model.postprocess(args[0], args[1]) if mode in (tf.estimator.ModeKeys.EVAL, tf.estimator.ModeKeys.PREDICT): if use_tpu and postprocess_on_cpu: detections = contrib_tpu.outside_compilation( postprocess_wrapper, (prediction_dict, features[fields.InputDataFields.true_image_shape])) else: detections = postprocess_wrapper( (prediction_dict, features[fields.InputDataFields.true_image_shape])) if mode == tf.estimator.ModeKeys.TRAIN: load_pretrained = hparams.load_pretrained if hparams else False if train_config.fine_tune_checkpoint and load_pretrained: if not train_config.fine_tune_checkpoint_type: # train_config.from_detection_checkpoint field is deprecated. For # backward compatibility, set train_config.fine_tune_checkpoint_type # based on train_config.from_detection_checkpoint. if train_config.from_detection_checkpoint: train_config.fine_tune_checkpoint_type = 'detection' else: train_config.fine_tune_checkpoint_type = 'classification' asg_map = detection_model.restore_map( fine_tune_checkpoint_type=train_config. 
fine_tune_checkpoint_type, load_all_detection_checkpoint_vars=( train_config.load_all_detection_checkpoint_vars)) available_var_map = ( variables_helper.get_variables_available_in_checkpoint( asg_map, train_config.fine_tune_checkpoint, include_global_step=False)) if use_tpu: def tpu_scaffold(): tf.train.init_from_checkpoint( train_config.fine_tune_checkpoint, available_var_map) return tf.train.Scaffold() scaffold_fn = tpu_scaffold else: tf.train.init_from_checkpoint( train_config.fine_tune_checkpoint, available_var_map) if mode in (tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL): if (mode == tf.estimator.ModeKeys.EVAL and eval_config.use_dummy_loss_in_eval): total_loss = tf.constant(1.0) losses_dict = {'Loss/total_loss': total_loss} else: losses_dict = detection_model.loss( prediction_dict, features[fields.InputDataFields.true_image_shape]) losses = [loss_tensor for loss_tensor in losses_dict.values()] if train_config.add_regularization_loss: regularization_losses = detection_model.regularization_losses( ) if use_tpu and train_config.use_bfloat16: regularization_losses = ops.bfloat16_to_float32_nested( regularization_losses) if regularization_losses: regularization_loss = tf.add_n( regularization_losses, name='regularization_loss') losses.append(regularization_loss) losses_dict[ 'Loss/regularization_loss'] = regularization_loss total_loss = tf.add_n(losses, name='total_loss') losses_dict['Loss/total_loss'] = total_loss if 'graph_rewriter_config' in configs: graph_rewriter_fn = graph_rewriter_builder.build( configs['graph_rewriter_config'], is_training=is_training) graph_rewriter_fn() # TODO(rathodv): Stop creating optimizer summary vars in EVAL mode once we # can write learning rate summaries on TPU without host calls. global_step = tf.train.get_or_create_global_step() training_optimizer, optimizer_summary_vars = optimizer_builder.build( train_config.optimizer) if mode == tf.estimator.ModeKeys.TRAIN: if use_tpu: training_optimizer = contrib_tpu.CrossShardOptimizer( training_optimizer) # Optionally freeze some layers by setting their gradients to be zero. trainable_variables = None include_variables = (train_config.update_trainable_variables if train_config.update_trainable_variables else None) exclude_variables = (train_config.freeze_variables if train_config.freeze_variables else None) trainable_variables = slim.filter_variables( tf.trainable_variables(), include_patterns=include_variables, exclude_patterns=exclude_variables) clip_gradients_value = None if train_config.gradient_clipping_by_norm > 0: clip_gradients_value = train_config.gradient_clipping_by_norm if not use_tpu: for var in optimizer_summary_vars: tf.summary.scalar(var.op.name, var) summaries = [] if use_tpu else None if train_config.summarize_gradients: summaries = [ 'gradients', 'gradient_norm', 'global_gradient_norm' ] train_op = slim.optimizers.optimize_loss( loss=total_loss, global_step=global_step, learning_rate=None, clip_gradients=clip_gradients_value, optimizer=training_optimizer, update_ops=detection_model.updates(), variables=trainable_variables, summaries=summaries, name='') # Preventing scope prefix on all variables. 
if mode == tf.estimator.ModeKeys.PREDICT: exported_output = exporter_lib.add_output_tensor_nodes(detections) export_outputs = { tf.saved_model.signature_constants.PREDICT_METHOD_NAME: tf.estimator.export.PredictOutput(exported_output) } eval_metric_ops = None scaffold = None if mode == tf.estimator.ModeKeys.EVAL: class_agnostic = (fields.DetectionResultFields.detection_classes not in detections) groundtruth = _prepare_groundtruth_for_eval( detection_model, class_agnostic, eval_input_config.max_number_of_boxes) use_original_images = fields.InputDataFields.original_image in features if use_original_images: eval_images = features[fields.InputDataFields.original_image] true_image_shapes = tf.slice( features[fields.InputDataFields.true_image_shape], [0, 0], [-1, 3]) original_image_spatial_shapes = features[ fields.InputDataFields.original_image_spatial_shape] else: eval_images = features[fields.InputDataFields.image] true_image_shapes = None original_image_spatial_shapes = None eval_dict = eval_util.result_dict_for_batched_example( eval_images, features[inputs.HASH_KEY], detections, groundtruth, class_agnostic=class_agnostic, scale_to_absolute=True, original_image_spatial_shapes=original_image_spatial_shapes, true_image_shapes=true_image_shapes) if fields.InputDataFields.image_additional_channels in features: eval_dict[fields.InputDataFields. image_additional_channels] = features[ fields.InputDataFields.image_additional_channels] if class_agnostic: category_index = label_map_util.create_class_agnostic_category_index( ) else: category_index = label_map_util.create_category_index_from_labelmap( eval_input_config.label_map_path) vis_metric_ops = None if not use_tpu and use_original_images: keypoint_edges = [(kp.start, kp.end) for kp in eval_config.keypoint_edge] eval_metric_op_vis = vis_utils.VisualizeSingleFrameDetections( category_index, max_examples_to_draw=eval_config.num_visualizations, max_boxes_to_draw=eval_config.max_num_boxes_to_visualize, min_score_thresh=eval_config.min_score_threshold, use_normalized_coordinates=False, keypoint_edges=keypoint_edges or None) vis_metric_ops = eval_metric_op_vis.get_estimator_eval_metric_ops( eval_dict) # Eval metrics on a single example. eval_metric_ops = eval_util.get_eval_metric_ops_for_evaluators( eval_config, list(category_index.values()), eval_dict) for loss_key, loss_tensor in iter(losses_dict.items()): eval_metric_ops[loss_key] = tf.metrics.mean(loss_tensor) for var in optimizer_summary_vars: eval_metric_ops[var.op.name] = (var, tf.no_op()) if vis_metric_ops is not None: eval_metric_ops.update(vis_metric_ops) eval_metric_ops = {str(k): v for k, v in eval_metric_ops.items()} if eval_config.use_moving_averages: variable_averages = tf.train.ExponentialMovingAverage(0.0) variables_to_restore = variable_averages.variables_to_restore() keep_checkpoint_every_n_hours = ( train_config.keep_checkpoint_every_n_hours) saver = tf.train.Saver( variables_to_restore, keep_checkpoint_every_n_hours=keep_checkpoint_every_n_hours ) scaffold = tf.train.Scaffold(saver=saver) # EVAL executes on CPU, so use regular non-TPU EstimatorSpec. 
if use_tpu and mode != tf.estimator.ModeKeys.EVAL: return contrib_tpu.TPUEstimatorSpec(mode=mode, scaffold_fn=scaffold_fn, predictions=detections, loss=total_loss, train_op=train_op, eval_metrics=eval_metric_ops, export_outputs=export_outputs) else: if scaffold is None: keep_checkpoint_every_n_hours = ( train_config.keep_checkpoint_every_n_hours) saver = tf.train.Saver( sharded=True, keep_checkpoint_every_n_hours=keep_checkpoint_every_n_hours, save_relative_paths=True) tf.add_to_collection(tf.GraphKeys.SAVERS, saver) scaffold = tf.train.Scaffold(saver=saver) return tf.estimator.EstimatorSpec(mode=mode, predictions=detections, loss=total_loss, train_op=train_op, eval_metric_ops=eval_metric_ops, export_outputs=export_outputs, scaffold=scaffold)
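# A minimal sketch of the true-image-shape slice in the EVAL branch of
# model_fn above: keep the [height, width, channels] triple for every image in
# the batch, whatever the batch size (values are illustrative).
import tensorflow as tf

true_image_shape = tf.constant([[480, 640, 3],
                                [600, 800, 3]])
true_image_shapes = tf.slice(true_image_shape, [0, 0], [-1, 3])
# true_image_shapes has shape [2, 3]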
def get(self): """ Provides input data to the graph. """ # calculate size of each record (this lists what is contained in the db and how many bytes are occupied) record_bytes = 2 encoding_bytes = 4 kp_xyz_entries = 3 * self.num_kp record_bytes += encoding_bytes * kp_xyz_entries encoding_bytes = 4 kp_uv_entries = 2 * self.num_kp record_bytes += encoding_bytes * kp_uv_entries cam_matrix_entries = 9 record_bytes += encoding_bytes * cam_matrix_entries image_bytes = self.image_size[0] * self.image_size[1] * 3 record_bytes += image_bytes hand_parts_bytes = self.image_size[0] * self.image_size[1] record_bytes += hand_parts_bytes kp_vis_bytes = self.num_kp record_bytes += kp_vis_bytes """ READ DATA ITEMS""" # Start reader reader = tf.FixedLengthRecordReader(header_bytes=0, record_bytes=record_bytes) _, value = reader.read( tf.train.string_input_producer([self.path_to_db])) # decode to floats bytes_read = 0 data_dict = dict() record_bytes_float32 = tf.decode_raw(value, tf.float32) # 1. Read keypoint xyz keypoint_xyz = tf.reshape( tf.slice(record_bytes_float32, [bytes_read // 4], [kp_xyz_entries]), [self.num_kp, 3]) bytes_read += encoding_bytes * kp_xyz_entries # calculate palm coord if not self.use_wrist_coord: palm_coord_l = tf.expand_dims( 0.5 * (keypoint_xyz[0, :] + keypoint_xyz[12, :]), 0) palm_coord_r = tf.expand_dims( 0.5 * (keypoint_xyz[21, :] + keypoint_xyz[33, :]), 0) keypoint_xyz = tf.concat([ palm_coord_l, keypoint_xyz[1:21, :], palm_coord_r, keypoint_xyz[-20:, :] ], 0) data_dict['keypoint_xyz'] = keypoint_xyz # 2. Read keypoint uv keypoint_uv = tf.cast( tf.reshape( tf.slice(record_bytes_float32, [bytes_read // 4], [kp_uv_entries]), [self.num_kp, 2]), tf.int32) bytes_read += encoding_bytes * kp_uv_entries keypoint_uv = tf.cast(keypoint_uv, tf.float32) # calculate palm coord if not self.use_wrist_coord: palm_coord_uv_l = tf.expand_dims( 0.5 * (keypoint_uv[0, :] + keypoint_uv[12, :]), 0) palm_coord_uv_r = tf.expand_dims( 0.5 * (keypoint_uv[21, :] + keypoint_uv[33, :]), 0) keypoint_uv = tf.concat([ palm_coord_uv_l, keypoint_uv[1:21, :], palm_coord_uv_r, keypoint_uv[-20:, :] ], 0) if self.coord_uv_noise: noise = tf.truncated_normal([42, 2], mean=0.0, stddev=self.coord_uv_noise_sigma) keypoint_uv += noise data_dict['keypoint_uv'] = keypoint_uv # 3. Camera intrinsics cam_mat = tf.reshape( tf.slice(record_bytes_float32, [bytes_read // 4], [cam_matrix_entries]), [3, 3]) bytes_read += encoding_bytes * cam_matrix_entries data_dict['cam_mat'] = cam_mat # decode to uint8 bytes_read += 2 record_bytes_uint8 = tf.decode_raw(value, tf.uint8) # 4. Read image image = tf.reshape( tf.slice(record_bytes_uint8, [bytes_read], [image_bytes]), [self.image_size[0], self.image_size[1], 3]) image = tf.cast(image, tf.float32) bytes_read += image_bytes # subtract mean image = image / 255.0 - 0.5 if self.hue_aug: image = tf.image.random_hue(image, self.hue_aug_max) data_dict['image'] = image # 5. Read mask hand_parts_mask = tf.reshape( tf.slice(record_bytes_uint8, [bytes_read], [hand_parts_bytes]), [self.image_size[0], self.image_size[1]]) hand_parts_mask = tf.cast(hand_parts_mask, tf.int32) bytes_read += hand_parts_bytes data_dict['hand_parts'] = hand_parts_mask hand_mask = tf.greater(hand_parts_mask, 1) bg_mask = tf.logical_not(hand_mask) data_dict['hand_mask'] = tf.cast(tf.stack([bg_mask, hand_mask], 2), tf.int32) # 6. 
Read visibilty keypoint_vis = tf.reshape( tf.slice(record_bytes_uint8, [bytes_read], [kp_vis_bytes]), [self.num_kp]) keypoint_vis = tf.cast(keypoint_vis, tf.bool) bytes_read += kp_vis_bytes # calculate palm visibility if not self.use_wrist_coord: palm_vis_l = tf.expand_dims( tf.logical_or(keypoint_vis[0], keypoint_vis[12]), 0) palm_vis_r = tf.expand_dims( tf.logical_or(keypoint_vis[21], keypoint_vis[33]), 0) keypoint_vis = tf.concat([ palm_vis_l, keypoint_vis[1:21], palm_vis_r, keypoint_vis[-20:] ], 0) data_dict['keypoint_vis'] = keypoint_vis assert bytes_read == record_bytes, "Doesnt add up." """ DEPENDENT DATA ITEMS: SUBSET of 21 keypoints""" # figure out dominant hand by analysis of the segmentation mask one_map, zero_map = tf.ones_like(hand_parts_mask), tf.zeros_like( hand_parts_mask) cond_l = tf.logical_and(tf.greater(hand_parts_mask, one_map), tf.less(hand_parts_mask, one_map * 18)) cond_r = tf.greater(hand_parts_mask, one_map * 17) hand_map_l = tf.where(cond_l, one_map, zero_map) hand_map_r = tf.where(cond_r, one_map, zero_map) num_px_left_hand = tf.reduce_sum(hand_map_l) num_px_right_hand = tf.reduce_sum(hand_map_r) # PRODUCE the 21 subset using the segmentation masks # We only deal with the more prominent hand for each frame and discard the second set of keypoints kp_coord_xyz_left = keypoint_xyz[:21, :] kp_coord_xyz_right = keypoint_xyz[-21:, :] cond_left = tf.logical_and( tf.cast(tf.ones_like(kp_coord_xyz_left), tf.bool), tf.greater(num_px_left_hand, num_px_right_hand)) kp_coord_xyz21 = tf.where(cond_left, kp_coord_xyz_left, kp_coord_xyz_right) hand_side = tf.where( tf.greater(num_px_left_hand, num_px_right_hand), tf.constant(0, dtype=tf.int32), tf.constant(1, dtype=tf.int32)) # left hand = 0; right hand = 1 data_dict['hand_side'] = tf.one_hot(hand_side, depth=2, on_value=1.0, off_value=0.0, dtype=tf.float32) data_dict['keypoint_xyz21'] = kp_coord_xyz21 # make coords relative to root joint kp_coord_xyz_root = kp_coord_xyz21[0, :] # this is the palm coord kp_coord_xyz21_rel = kp_coord_xyz21 - kp_coord_xyz_root # relative coords in metric coords index_root_bone_length = tf.sqrt( tf.reduce_sum( tf.square(kp_coord_xyz21_rel[12, :] - kp_coord_xyz21_rel[11, :]))) data_dict['keypoint_scale'] = index_root_bone_length data_dict[ 'keypoint_xyz21_normed'] = kp_coord_xyz21_rel / index_root_bone_length # normalized by length of 12->11 # calculate local coordinates kp_coord_xyz21_local = bone_rel_trafo( data_dict['keypoint_xyz21_normed']) kp_coord_xyz21_local = tf.squeeze(kp_coord_xyz21_local) data_dict['keypoint_xyz21_local'] = kp_coord_xyz21_local # calculate viewpoint and coords in canonical coordinates kp_coord_xyz21_rel_can, rot_mat = canonical_trafo( data_dict['keypoint_xyz21_normed']) kp_coord_xyz21_rel_can, rot_mat = tf.squeeze( kp_coord_xyz21_rel_can), tf.squeeze(rot_mat) kp_coord_xyz21_rel_can = flip_right_hand(kp_coord_xyz21_rel_can, tf.logical_not(cond_left)) data_dict['keypoint_xyz21_can'] = kp_coord_xyz21_rel_can data_dict['rot_mat'] = tf.matrix_inverse(rot_mat) # Set of 21 for visibility keypoint_vis_left = keypoint_vis[:21] keypoint_vis_right = keypoint_vis[-21:] keypoint_vis21 = tf.where(cond_left[:, 0], keypoint_vis_left, keypoint_vis_right) data_dict['keypoint_vis21'] = keypoint_vis21 # Set of 21 for UV coordinates keypoint_uv_left = keypoint_uv[:21, :] keypoint_uv_right = keypoint_uv[-21:, :] keypoint_uv21 = tf.where(cond_left[:, :2], keypoint_uv_left, keypoint_uv_right) data_dict['keypoint_uv21'] = keypoint_uv21 """ DEPENDENT DATA ITEMS: HAND CROP """ if self.hand_crop: 
crop_center = keypoint_uv21[12, ::-1] # catch problem, when no valid kp available (happens almost never) crop_center = tf.cond(tf.reduce_all(tf.is_finite(crop_center)), lambda: crop_center, lambda: tf.constant([0.0, 0.0])) crop_center.set_shape([ 2, ]) if self.crop_center_noise: noise = tf.truncated_normal( [2], mean=0.0, stddev=self.crop_center_noise_sigma) crop_center += noise crop_scale_noise = tf.constant(1.0) if self.crop_scale_noise: crop_scale_noise = tf.squeeze( tf.random_uniform([1], minval=1.0, maxval=1.2)) # select visible coords only kp_coord_h = tf.boolean_mask(keypoint_uv21[:, 1], keypoint_vis21) kp_coord_w = tf.boolean_mask(keypoint_uv21[:, 0], keypoint_vis21) kp_coord_hw = tf.stack([kp_coord_h, kp_coord_w], 1) # determine size of crop (measure spatial extend of hw coords first) min_coord = tf.maximum(tf.reduce_min(kp_coord_hw, 0), 0.0) max_coord = tf.minimum(tf.reduce_max(kp_coord_hw, 0), self.image_size) # find out larger distance wrt the center of crop crop_size_best = 2 * tf.maximum(max_coord - crop_center, crop_center - min_coord) crop_size_best = tf.reduce_max(crop_size_best) crop_size_best = tf.minimum(tf.maximum(crop_size_best, 50.0), 500.0) # catch problem, when no valid kp available crop_size_best = tf.cond( tf.reduce_all(tf.is_finite(crop_size_best)), lambda: crop_size_best, lambda: tf.constant(200.0)) crop_size_best.set_shape([]) # calculate necessary scaling scale = tf.cast(self.crop_size, tf.float32) / crop_size_best scale = tf.minimum(tf.maximum(scale, 1.0), 10.0) scale *= crop_scale_noise data_dict['crop_scale'] = scale if self.crop_offset_noise: noise = tf.truncated_normal( [2], mean=0.0, stddev=self.crop_offset_noise_sigma) crop_center += noise # Crop image img_crop = crop_image_from_xy(tf.expand_dims(image, 0), crop_center, self.crop_size, scale) data_dict['image_crop'] = tf.squeeze(img_crop) # Modify uv21 coordinates crop_center_float = tf.cast(crop_center, tf.float32) keypoint_uv21_u = (keypoint_uv21[:, 0] - crop_center_float[1] ) * scale + self.crop_size // 2 keypoint_uv21_v = (keypoint_uv21[:, 1] - crop_center_float[0] ) * scale + self.crop_size // 2 keypoint_uv21 = tf.stack([keypoint_uv21_u, keypoint_uv21_v], 1) data_dict['keypoint_uv21'] = keypoint_uv21 # Modify camera intrinsics scale = tf.reshape(scale, [ 1, ]) scale_matrix = tf.dynamic_stitch([ [0], [1], [2], [3], [4], [5], [6], [7], [8] ], [scale, [0.0], [0.0], [0.0], scale, [0.0], [0.0], [0.0], [1.0]]) scale_matrix = tf.reshape(scale_matrix, [3, 3]) crop_center_float = tf.cast(crop_center, tf.float32) trans1 = crop_center_float[0] * scale - self.crop_size // 2 trans2 = crop_center_float[1] * scale - self.crop_size // 2 trans1 = tf.reshape(trans1, [ 1, ]) trans2 = tf.reshape(trans2, [ 1, ]) trans_matrix = tf.dynamic_stitch( [[0], [1], [2], [3], [4], [5], [6], [7], [8]], [[1.0], [0.0], -trans2, [0.0], [1.0], -trans1, [0.0], [0.0], [1.0]]) trans_matrix = tf.reshape(trans_matrix, [3, 3]) data_dict['cam_mat'] = tf.matmul(trans_matrix, tf.matmul(scale_matrix, cam_mat)) """ DEPENDENT DATA ITEMS: Scoremap from the SUBSET of 21 keypoints""" # create scoremaps from the subset of 2D annoataion keypoint_hw21 = tf.stack([keypoint_uv21[:, 1], keypoint_uv21[:, 0]], -1) scoremap_size = self.image_size if self.hand_crop: scoremap_size = (self.crop_size, self.crop_size) scoremap = self.create_multiple_gaussian_map(keypoint_hw21, scoremap_size, self.sigma, valid_vec=keypoint_vis21) if self.scoremap_dropout: scoremap = tf.nn.dropout(scoremap, self.scoremap_dropout_prob, noise_shape=[1, 1, 21]) scoremap *= 
self.scoremap_dropout_prob data_dict['scoremap'] = scoremap if self.scale_to_size: image, keypoint_uv21, keypoint_vis21 = data_dict[ 'image'], data_dict['keypoint_uv21'], data_dict[ 'keypoint_vis21'] s = image.get_shape().as_list() image = tf.image.resize_images(image, self.scale_target_size) scale = (self.scale_target_size[0] / float(s[0]), self.scale_target_size[1] / float(s[1])) keypoint_uv21 = tf.stack([ keypoint_uv21[:, 0] * scale[1], keypoint_uv21[:, 1] * scale[0] ], 1) data_dict = dict( ) # delete everything else because the scaling makes the data invalid anyway data_dict['image'] = image data_dict['keypoint_uv21'] = keypoint_uv21 data_dict['keypoint_vis21'] = keypoint_vis21 elif self.random_crop_to_size: tensor_stack = tf.concat([ data_dict['image'], tf.expand_dims(tf.cast(data_dict['hand_parts'], tf.float32), -1), tf.cast(data_dict['hand_mask'], tf.float32) ], 2) s = tensor_stack.get_shape().as_list() tensor_stack_cropped = tf.random_crop( tensor_stack, [self.random_crop_size, self.random_crop_size, s[2]]) data_dict = dict( ) # delete everything else because the random cropping makes the data invalid anyway data_dict['image'], data_dict['hand_parts'], data_dict['hand_mask'] = tensor_stack_cropped[:, :, :3],\ tf.cast(tensor_stack_cropped[:, :, 3], tf.int32),\ tf.cast(tensor_stack_cropped[:, :, 4:], tf.int32) names, tensors = zip(*data_dict.items()) if self.shuffle: tensors = tf.train.shuffle_batch_join([tensors], batch_size=self.batch_size, capacity=100, min_after_dequeue=50, enqueue_many=False) else: tensors = tf.train.batch_join([tensors], batch_size=self.batch_size, capacity=100, enqueue_many=False) return dict(zip(names, tensors))
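# A minimal, self-contained sketch of the fixed-length-record pattern used in
# `get` above: decode a raw byte string into float32 values and slice
# consecutive fields out of it while tracking a running byte offset (field
# sizes here are illustrative).
import numpy as np
import tensorflow as tf

num_kp = 21
raw = tf.constant(np.zeros(num_kp * 3 + 9, dtype=np.float32).tobytes())
record_float32 = tf.decode_raw(raw, tf.float32)

bytes_read = 0
keypoint_xyz = tf.reshape(
    tf.slice(record_float32, [bytes_read // 4], [num_kp * 3]), [num_kp, 3])
bytes_read += 4 * num_kp * 3
cam_mat = tf.reshape(
    tf.slice(record_float32, [bytes_read // 4], [9]), [3, 3])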
def _crop_to_square(image, decode_image=False, side_length=IMAGE_SIZE, crop_padding=CROP_PADDING, area_range=(0.08, 1.0), is_training=True, resize_only=False, eval_crop_method=enums.EvalCropMethod.RESIZE_THEN_CROP): """Produces a (possibly distorted) square crop of an image. Given an input image, either as an encoded bytes string or a decoded image Tensor, produces a square version of it with the desired side length, using a combination of cropping and resizing. If `resize_only` is True, simply resize the image to be `side_length`x`side_length`, possibly distorting it if the original image is not square. If `is_training` is True, then sample a random box to crop from the image and then resize the result to be `side_length`x`side_length`. If `is_training` is False then we follow `eval_crop_method` to determine the strategy of cropping and resizing. Generally the approach is to end up with a center crop of size `side_length`x`side_length` taken from the image resized to have a minimum dimension of `side_length` + `crop_padding`. By setting eval_crop_method appropriately, this can be accomplished by first resizing and then cropping, first cropping and then resizing, or a less common approach of cropping the central `side_length`/(`side_length`+`crop_padding`) pixels in each dimension followed by resizing (and distorting) to `side_length`x`side_length`. If `decode_image` is True (i.e., `image` is an encoded jpeg image string), when possible we crop before decoding, which can provide substantial speedups. Args: image: An image represented either as a 3D Tensor with any numeric DType or else as an encoded jpeg image string. decode_image: Whether `image` is an encoded jpeg image string or not. side_length: The side length, in both spatial dimentions, of the output image. crop_padding: When `is_training` is False, this determines how much padding to apply around the central square crop. area_range: List of floats. The cropped area of the image must contain a fraction of the supplied image within this range. Only relevant when `is_training` is True and `resize_only` is False. is_training: Whether this should operate in training (non-deterministic random crop window) or eval (deterministic central crop window) mode. resize_only: Whether to just resize the image to the target `side_length` without performing any cropping. This is likely to distort the image. eval_crop_method: The strategy for obtaining the desired square crop in eval mode. See EvalCropMethod for valid values. Returns: An image Tensor of shape [`side_length`, `side_length`, 3]. If `image` was provided then the output has the same dtype as `image`. If `image_bytes` was provided then the output dtype is tf.uint8. Raises: ValueError: If both or neither of `image` and `image_bytes` was passed. """ with tf.name_scope('crop_to_square'): if not decode_image: image = _validate_image_dimensions(image) if resize_only: if decode_image: image = _decode_and_maybe_crop_image(image) resized = _resize_image(image, (side_length, side_length)) return tf.ensure_shape(resized, [side_length, side_length, 3]) image_shape = (tf.shape(image) if not decode_image else tf.image.extract_jpeg_shape(image)) if is_training: # During training, always crop then resize. 
crop_window = _distorted_crop_window(image_shape, area_range=area_range) if decode_image: cropped = _decode_and_maybe_crop_image( image, _convert_3d_crop_window_to_2d(crop_window)) else: cropped = tf.slice(image, crop_window[:3], crop_window[3:]) resized = _resize_image(cropped, [side_length, side_length]) return tf.ensure_shape(resized, [side_length, side_length, 3]) else: # For eval, the ordering depends on eval_crop_method. crop_frac = (side_length / (side_length + crop_padding)) if eval_crop_method == enums.EvalCropMethod.RESIZE_THEN_CROP: if decode_image: image = _decode_and_maybe_crop_image(image) resize_dim = side_length + crop_padding resized = _resize_to_min_dim(image, resize_dim) crop_window = _center_crop_window(tf.shape(resized), crop_dim=side_length) cropped = tf.slice(resized, crop_window[:3], crop_window[3:]) return tf.ensure_shape(cropped, [side_length, side_length, 3]) elif eval_crop_method == enums.EvalCropMethod.CROP_THEN_RESIZE: crop_window = _center_crop_window(image_shape, crop_frac=crop_frac) if decode_image: cropped = _decode_and_maybe_crop_image( image, _convert_3d_crop_window_to_2d(crop_window)) else: cropped = tf.slice(image, crop_window[:3], crop_window[3:]) resized = _resize_image(cropped, [side_length, side_length]) return tf.ensure_shape(resized, [side_length, side_length, 3]) elif eval_crop_method == enums.EvalCropMethod.CROP_THEN_DISTORT: if decode_image: image = _decode_and_maybe_crop_image(image) # Note that tf.image.central_crop does not produce a square crop. It # preserves the input aspect ratio. cropped = tf.image.central_crop(image, central_fraction=crop_frac) resized = _resize_image(cropped, [side_length, side_length]) return tf.ensure_shape(resized, [side_length, side_length, 3]) elif eval_crop_method == enums.EvalCropMethod.IDENTITY: if decode_image: image = _decode_and_maybe_crop_image(image) return tf.ensure_shape(image, [side_length, side_length, 3])
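# A minimal, self-contained sketch of the "crop window + tf.slice" pattern
# used throughout _crop_to_square above; the real code builds the window with
# helpers (_center_crop_window, _distorted_crop_window) not shown here.
import tensorflow as tf

image = tf.zeros([300, 400, 3])
side_length = 224
offset_h = (300 - side_length) // 2
offset_w = (400 - side_length) // 2
cropped = tf.slice(image, [offset_h, offset_w, 0],
                   [side_length, side_length, -1])
# cropped has shape [224, 224, 3]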
def unstack_batch(tensor_dict, unpad_groundtruth_tensors=True): """Unstacks all tensors in `tensor_dict` along 0th dimension. Unstacks tensor from the tensor dict along 0th dimension and returns a tensor_dict containing values that are lists of unstacked, unpadded tensors. Tensors in the `tensor_dict` are expected to be of one of the three shapes: 1. [batch_size] 2. [batch_size, height, width, channels] 3. [batch_size, num_boxes, d1, d2, ... dn] When unpad_groundtruth_tensors is set to true, unstacked tensors of form 3 above are sliced along the `num_boxes` dimension using the value in tensor field.InputDataFields.num_groundtruth_boxes. Note that this function has a static list of input data fields and has to be kept in sync with the InputDataFields defined in core/standard_fields.py Args: tensor_dict: A dictionary of batched groundtruth tensors. unpad_groundtruth_tensors: Whether to remove padding along `num_boxes` dimension of the groundtruth tensors. Returns: A dictionary where the keys are from fields.InputDataFields and values are a list of unstacked (optionally unpadded) tensors. Raises: ValueError: If unpad_tensors is True and `tensor_dict` does not contain `num_groundtruth_boxes` tensor. """ unbatched_tensor_dict = { key: tf.unstack(tensor) for key, tensor in tensor_dict.items() } if unpad_groundtruth_tensors: if (fields.InputDataFields.num_groundtruth_boxes not in unbatched_tensor_dict): raise ValueError( '`num_groundtruth_boxes` not found in tensor_dict. ' 'Keys available: {}'.format(unbatched_tensor_dict.keys())) unbatched_unpadded_tensor_dict = {} unpad_keys = set([ # List of input data fields that are padded along the num_boxes # dimension. This list has to be kept in sync with InputDataFields in # standard_fields.py. fields.InputDataFields.groundtruth_instance_masks, fields.InputDataFields.groundtruth_classes, fields.InputDataFields.groundtruth_boxes, fields.InputDataFields.groundtruth_keypoints, fields.InputDataFields.groundtruth_keypoint_visibilities, fields.InputDataFields.groundtruth_group_of, fields.InputDataFields.groundtruth_difficult, fields.InputDataFields.groundtruth_is_crowd, fields.InputDataFields.groundtruth_area, fields.InputDataFields.groundtruth_weights ]).intersection(set(unbatched_tensor_dict.keys())) for key in unpad_keys: unpadded_tensor_list = [] for num_gt, padded_tensor in zip( unbatched_tensor_dict[ fields.InputDataFields.num_groundtruth_boxes], unbatched_tensor_dict[key]): tensor_shape = shape_utils.combined_static_and_dynamic_shape( padded_tensor) slice_begin = tf.zeros([len(tensor_shape)], dtype=tf.int32) slice_size = tf.stack( [num_gt] + [-1 if dim is None else dim for dim in tensor_shape[1:]]) unpadded_tensor = tf.slice(padded_tensor, slice_begin, slice_size) unpadded_tensor_list.append(unpadded_tensor) unbatched_unpadded_tensor_dict[key] = unpadded_tensor_list unbatched_tensor_dict.update(unbatched_unpadded_tensor_dict) return unbatched_tensor_dict
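# A minimal, self-contained sketch of the unpadding slice above: take the
# first `num_gt` rows of a padded [max_boxes, 4] groundtruth tensor, keeping
# the full second dimension via -1.
import tensorflow as tf

padded_boxes = tf.zeros([100, 4])   # padded to max_boxes = 100
num_gt = tf.constant(7)             # actual number of boxes in this example
unpadded_boxes = tf.slice(padded_boxes,
                          tf.zeros([2], dtype=tf.int32),
                          tf.stack([num_gt, -1]))
# unpadded_boxes evaluates to a [7, 4] tensor.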
def _grow_alive_seq(self, state): """Grow alive sequences by one token, and collect top 2*beam_size sequences. 2*beam_size sequences are collected because some sequences may have reached the EOS token. 2*beam_size ensures that at least beam_size sequences are still alive. Args: state: A dictionary with the current loop state. Returns: Tuple of (Top 2*beam_size sequences [batch_size, 2 * beam_size, cur_index + 1], Scores of returned sequences [batch_size, 2 * beam_size], New alive cache, for each of the 2 * beam_size sequences) """ i = state[_StateKeys.CUR_INDEX] alive_seq = state[_StateKeys.ALIVE_SEQ] alive_log_probs = state[_StateKeys.ALIVE_LOG_PROBS] alive_cache = state[_StateKeys.ALIVE_CACHE] beams_to_keep = 2 * self.beam_size # Get logits for the next candidate IDs for the alive sequences. Get the new # cache values at the same time. if self.padded_decode: flat_ids = tf.reshape( tf.slice(alive_seq, [0, 0, i], [self.batch_size, self.beam_size, 1]), [self.batch_size * self.beam_size, -1]) else: flat_ids = _flatten_beam_dim(alive_seq) # [batch_size * beam_size] flat_cache = tf.nest.map_structure(_flatten_beam_dim, alive_cache) flat_logits, flat_cache = self.symbols_to_logits_fn( flat_ids, i, flat_cache) # Unflatten logits to shape [batch_size, beam_size, vocab_size] logits = _unflatten_beam_dim(flat_logits, self.batch_size, self.beam_size) new_cache = tf.nest.map_structure( lambda t: _unflatten_beam_dim(t, self.batch_size, self.beam_size), flat_cache) # Convert logits to normalized log probs candidate_log_probs = _log_prob_from_logits(logits) # Calculate new log probabilities if each of the alive sequences were # extended # by the the candidate IDs. # Shape [batch_size, beam_size, vocab_size] log_probs = candidate_log_probs + tf.expand_dims(alive_log_probs, axis=2) # Each batch item has beam_size * vocab_size candidate sequences. For each # batch item, get the k candidates with the highest log probabilities. flat_log_probs = tf.reshape(log_probs, [-1, self.beam_size * self.vocab_size]) topk_log_probs, topk_indices = tf.nn.top_k(flat_log_probs, k=beams_to_keep) # Extract the alive sequences that generate the highest log probabilities # after being extended. topk_beam_indices = topk_indices // self.vocab_size topk_seq, new_cache = _gather_beams([alive_seq, new_cache], topk_beam_indices, self.batch_size, beams_to_keep) # Append the most probable IDs to the topk sequences topk_ids = topk_indices % self.vocab_size if self.padded_decode: topk_seq = tf.transpose(topk_seq, perm=[2, 0, 1]) topk_seq = tf.tensor_scatter_nd_update(topk_seq, [i + 1], topk_ids) topk_seq = tf.transpose(topk_seq, perm=[1, 2, 0]) else: topk_ids = tf.expand_dims(topk_ids, axis=2) topk_seq = tf.concat([topk_seq, topk_ids], axis=2) return topk_seq, topk_log_probs, new_cache
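# A minimal sketch of the padded-decode slice above, with illustrative sizes:
# from a fixed-length alive_seq of shape [batch, beam, max_len], take only the
# token at position i for every beam and flatten to [batch * beam, 1].
import tensorflow as tf

batch_size, beam_size, max_decode_length = 2, 3, 10
alive_seq = tf.zeros([batch_size, beam_size, max_decode_length], dtype=tf.int32)
i = 4
flat_ids = tf.reshape(
    tf.slice(alive_seq, [0, 0, i], [batch_size, beam_size, 1]),
    [batch_size * beam_size, -1])
# flat_ids has shape [6, 1]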
def slice_layer(x, offsets, lengths):
  y = []
  for offset, length in zip(offsets, lengths):
    y.append(tf.slice(x, [0, offset], [-1, length]))
  return y
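# A minimal usage sketch of slice_layer above: split the feature axis of a
# [batch, 10] tensor into two contiguous chunks of widths 4 and 6 (widths are
# illustrative).
import tensorflow as tf

x = tf.reshape(tf.range(20, dtype=tf.float32), [2, 10])
chunks = slice_layer(x, offsets=[0, 4], lengths=[4, 6])
# chunks[0] has shape [2, 4] and chunks[1] has shape [2, 6]; the -1 in the
# batch position of `size` keeps the full batch dimension.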
def filter(xt, k):
  xt = tf.transpose(xt)                   # N x M
  xt = tf.reshape(xt, [-1, 1])            # NM x 1
  w = tf.slice(W, [k, 0], [1, -1])        # 1 x F
  y = tf.matmul(xt, w)                    # NM x F
  return tf.reshape(y, [-1, M, self.F])   # N x M x F
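# A minimal sketch of the row slice inside `filter` above, assuming W is a
# [K, F] weight matrix from the enclosing scope: tf.slice(W, [k, 0], [1, -1])
# extracts filter k as a [1, F] row, which tf.matmul expands against the
# flattened [N*M, 1] inputs.
import tensorflow as tf

W = tf.reshape(tf.range(12, dtype=tf.float32), [3, 4])  # K=3, F=4
k = 1
w_k = tf.slice(W, [k, 0], [1, -1])   # shape [1, 4]
xt = tf.ones([5, 1])                 # N*M = 5 flattened inputs
y = tf.matmul(xt, w_k)               # shape [5, 4]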
def prepare_processing_graph(self, model_settings, summaries_dir): """Builds a TensorFlow graph to apply the input distortions. Creates a graph that loads a WAVE file, decodes it, scales the volume, shifts it in time, adds in background noise, calculates a spectrogram, and then builds an MFCC fingerprint from that. This must be called with an active TensorFlow session running, and it creates multiple placeholder inputs, and one output: - wav_filename_placeholder_: Filename of the WAV to load. - foreground_volume_placeholder_: How loud the main clip should be. - time_shift_padding_placeholder_: Where to pad the clip. - time_shift_offset_placeholder_: How much to move the clip in time. - background_data_placeholder_: PCM sample data for background noise. - background_volume_placeholder_: Loudness of mixed-in background. - output_: Output 2D fingerprint of processed audio. Args: model_settings: Information about the current model being trained. summaries_dir: Path to save training summary information to. Raises: ValueError: If the preprocessing mode isn't recognized. Exception: If the preprocessor wasn't compiled in. """ with tf.get_default_graph().name_scope('data'): desired_samples = model_settings['desired_samples'] self.wav_filename_placeholder_ = tf.placeholder( tf.string, [], name='wav_filename') wav_loader = io_ops.read_file(self.wav_filename_placeholder_) wav_decoder = audio.decode_wav(wav_loader, desired_channels=1, desired_samples=desired_samples) # Allow the audio sample's volume to be adjusted. self.foreground_volume_placeholder_ = tf.placeholder( tf.float32, [], name='foreground_volume') scaled_foreground = tf.multiply( wav_decoder.audio, self.foreground_volume_placeholder_) # Shift the sample's start position, and pad any gaps with zeros. self.time_shift_padding_placeholder_ = tf.placeholder( tf.int32, [2, 2], name='time_shift_padding') self.time_shift_offset_placeholder_ = tf.placeholder( tf.int32, [2], name='time_shift_offset') padded_foreground = tf.pad( tensor=scaled_foreground, paddings=self.time_shift_padding_placeholder_, mode='CONSTANT') sliced_foreground = tf.slice(padded_foreground, self.time_shift_offset_placeholder_, [desired_samples, -1]) # Mix in background noise. self.background_data_placeholder_ = tf.placeholder( tf.float32, [desired_samples, 1], name='background_data') self.background_volume_placeholder_ = tf.placeholder( tf.float32, [], name='background_volume') background_mul = tf.multiply(self.background_data_placeholder_, self.background_volume_placeholder_) background_add = tf.add(background_mul, sliced_foreground) background_clamp = tf.clip_by_value(background_add, -1.0, 1.0) # Run the spectrogram and MFCC ops to get a 2D 'fingerprint' of the audio. spectrogram = audio_ops.audio_spectrogram( background_clamp, window_size=model_settings['window_size_samples'], stride=model_settings['window_stride_samples'], magnitude_squared=True) tf.summary.image('spectrogram', tf.expand_dims(spectrogram, -1), max_outputs=1) # The number of buckets in each FFT row in the spectrogram will depend on # how many input samples there are in each window. This can be quite # large, with a 160 sample window producing 127 buckets for example. We # don't need this level of detail for classification, so we often want to # shrink them down to produce a smaller result. That's what this section # implements. One method is to use average pooling to merge adjacent # buckets, but a more sophisticated approach is to apply the MFCC # algorithm to shrink the representation. 
if model_settings['preprocess'] == 'average': self.output_ = tf.nn.pool( input=tf.expand_dims(spectrogram, -1), window_shape=[1, model_settings['average_window_width']], strides=[1, model_settings['average_window_width']], pooling_type='AVG', padding='SAME') tf.summary.image('shrunk_spectrogram', self.output_, max_outputs=1) elif model_settings['preprocess'] == 'mfcc': self.output_ = audio_ops.mfcc( spectrogram, wav_decoder.sample_rate, dct_coefficient_count=model_settings['fingerprint_width']) tf.summary.image('mfcc', tf.expand_dims(self.output_, -1), max_outputs=1) elif model_settings['preprocess'] == 'micro': if not frontend_op: raise Exception( 'Micro frontend op is currently not available when running' ' TensorFlow directly from Python, you need to build and run' ' through Bazel') sample_rate = model_settings['sample_rate'] window_size_ms = (model_settings['window_size_samples'] * 1000) / sample_rate window_step_ms = (model_settings['window_stride_samples'] * 1000) / sample_rate int16_input = tf.cast(tf.multiply(background_clamp, 32768), tf.int16) micro_frontend = frontend_op.audio_microfrontend( int16_input, sample_rate=sample_rate, window_size=window_size_ms, window_step=window_step_ms, num_channels=model_settings['fingerprint_width'], out_scale=1, out_type=tf.float32) self.output_ = tf.multiply(micro_frontend, (10.0 / 256.0)) tf.summary.image('micro', tf.expand_dims( tf.expand_dims(self.output_, -1), 0), max_outputs=1) else: raise ValueError( 'Unknown preprocess mode "%s" (should be "mfcc", ' ' "average", or "micro")' % (model_settings['preprocess'])) # Merge all the summaries and write them out to /tmp/retrain_logs (by # default) self.merged_summaries_ = tf.summary.merge_all(scope='data') if summaries_dir: self.summary_writer_ = tf.summary.FileWriter( summaries_dir + '/data', tf.get_default_graph())
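# A minimal sketch of the time-shift slice feeding `sliced_foreground` above:
# pad the clip at the front, then slice a window of the original length, which
# shifts the audio later in time (the 100-sample shift here is illustrative).
import tensorflow as tf

desired_samples = 16000
clip = tf.random_uniform([desired_samples, 1], -1.0, 1.0)
padded = tf.pad(clip, [[100, 0], [0, 0]], mode='CONSTANT')
shifted = tf.slice(padded, [0, 0], [desired_samples, -1])  # [16000, 1]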
def train(imPath, logPath, modelPath, pmPath, nTrain, nValid, nTest, restoreVariables, nSteps, gpuIndex, testPMIndex): os.environ['CUDA_VISIBLE_DEVICES'] = '%d' % gpuIndex outLogPath = logPath trainWriterPath = pathjoin(logPath, 'Train') validWriterPath = pathjoin(logPath, 'Valid') outModelPath = pathjoin(modelPath, 'model.ckpt') outPMPath = pmPath batchSize = UNet2D.hp['batchSize'] imSize = UNet2D.hp['imSize'] nChannels = UNet2D.hp['nChannels'] nClasses = UNet2D.hp['nClasses'] # -------------------------------------------------- # data # -------------------------------------------------- Train = np.zeros((nTrain, imSize, imSize, nChannels)) Valid = np.zeros((nValid, imSize, imSize, nChannels)) Test = np.zeros((nTest, imSize, imSize, nChannels)) LTrain = np.zeros((nTrain, imSize, imSize, nClasses)) LValid = np.zeros((nValid, imSize, imSize, nClasses)) LTest = np.zeros((nTest, imSize, imSize, nClasses)) print('loading data, computing mean / st dev') if not os.path.exists(modelPath): os.makedirs(modelPath) if restoreVariables: datasetMean = loadData(pathjoin(modelPath, 'datasetMean.data')) datasetStDev = loadData(pathjoin(modelPath, 'datasetStDev.data')) else: datasetMean = 0 datasetStDev = 0 for iSample in range(nTrain + nValid + nTest): I = im2double(tifread('%s/I%05d_Img.tif' % (imPath, iSample))) datasetMean += np.mean(I) datasetStDev += np.std(I) datasetMean /= (nTrain + nValid + nTest) datasetStDev /= (nTrain + nValid + nTest) saveData(datasetMean, pathjoin(modelPath, 'datasetMean.data')) saveData(datasetStDev, pathjoin(modelPath, 'datasetStDev.data')) perm = np.arange(nTrain + nValid + nTest) np.random.shuffle(perm) for iSample in range(0, nTrain): path = '%s/I%05d_Img.tif' % (imPath, perm[iSample]) im = im2double(tifread(path)) Train[iSample, :, :, 0] = (im - datasetMean) / datasetStDev path = '%s/I%05d_Ant.tif' % (imPath, perm[iSample]) im = tifread(path) for i in range(nClasses): LTrain[iSample, :, :, i] = (im == i + 1) for iSample in range(0, nValid): path = '%s/I%05d_Img.tif' % (imPath, perm[nTrain + iSample]) im = im2double(tifread(path)) Valid[iSample, :, :, 0] = (im - datasetMean) / datasetStDev path = '%s/I%05d_Ant.tif' % (imPath, perm[nTrain + iSample]) im = tifread(path) for i in range(nClasses): LValid[iSample, :, :, i] = (im == i + 1) for iSample in range(0, nTest): path = '%s/I%05d_Img.tif' % (imPath, perm[nTrain + nValid + iSample]) im = im2double(tifread(path)) Test[iSample, :, :, 0] = (im - datasetMean) / datasetStDev path = '%s/I%05d_Ant.tif' % (imPath, perm[nTrain + nValid + iSample]) im = tifread(path) for i in range(nClasses): LTest[iSample, :, :, i] = (im == i + 1) # -------------------------------------------------- # optimization # -------------------------------------------------- tfLabels = tf.placeholder("float", shape=[None, imSize, imSize, nClasses], name='labels') globalStep = tf.Variable(0, trainable=False) learningRate0 = 0.01 decaySteps = 1000 decayRate = 0.95 learningRate = tf.train.exponential_decay(learningRate0, globalStep, decaySteps, decayRate, staircase=True) with tf.name_scope('optim'): loss = tf.reduce_mean( -tf.reduce_sum(tf.multiply(tfLabels, tf.log(UNet2D.nn)), 3)) updateOps = tf.get_collection(tf.GraphKeys.UPDATE_OPS) # optimizer = tf.train.MomentumOptimizer(1e-3,0.9) optimizer = tf.train.MomentumOptimizer(learningRate, 0.9) # optimizer = tf.train.GradientDescentOptimizer(learningRate) with tf.control_dependencies(updateOps): optOp = optimizer.minimize(loss, global_step=globalStep) with tf.name_scope('eval'): error = [] for iClass in 
range(nClasses): labels0 = tf.reshape( tf.to_int32( tf.slice(tfLabels, [0, 0, 0, iClass], [-1, -1, -1, 1])), [batchSize, imSize, imSize]) predict0 = tf.reshape( tf.to_int32(tf.equal(tf.argmax(UNet2D.nn, 3), iClass)), [batchSize, imSize, imSize]) correct = tf.multiply(labels0, predict0) nCorrect0 = tf.reduce_sum(correct) nLabels0 = tf.reduce_sum(labels0) error.append(1 - tf.to_float(nCorrect0) / tf.to_float(nLabels0)) errors = tf.tuple(error) # -------------------------------------------------- # inspection # -------------------------------------------------- with tf.name_scope('scalars'): tf.summary.scalar('avg_cross_entropy', loss) for iClass in range(nClasses): tf.summary.scalar('avg_pixel_error_%d' % iClass, error[iClass]) tf.summary.scalar('learning_rate', learningRate) with tf.name_scope('images'): split0 = tf.slice(UNet2D.nn, [0, 0, 0, 0], [-1, -1, -1, 1]) split1 = tf.slice(UNet2D.nn, [0, 0, 0, 1], [-1, -1, -1, 1]) if nClasses > 2: split2 = tf.slice(UNet2D.nn, [0, 0, 0, 2], [-1, -1, -1, 1]) tf.summary.image('pm0', split0) tf.summary.image('pm1', split1) if nClasses > 2: tf.summary.image('pm2', split2) merged = tf.summary.merge_all() # -------------------------------------------------- # session # -------------------------------------------------- saver = tf.train.Saver() sess = tf.Session( config=tf.ConfigProto(allow_soft_placement=True) ) # config parameter needed to save variables when using GPU if os.path.exists(outLogPath): shutil.rmtree(outLogPath) trainWriter = tf.summary.FileWriter(trainWriterPath, sess.graph) validWriter = tf.summary.FileWriter(validWriterPath, sess.graph) if restoreVariables: saver.restore(sess, outModelPath) print("Model restored.") else: sess.run(tf.global_variables_initializer()) # -------------------------------------------------- # train # -------------------------------------------------- batchData = np.zeros((batchSize, imSize, imSize, nChannels)) batchLabels = np.zeros((batchSize, imSize, imSize, nClasses)) for i in range(nSteps): # train perm = np.arange(nTrain) np.random.shuffle(perm) for j in range(batchSize): batchData[j, :, :, :] = Train[perm[j], :, :, :] batchLabels[j, :, :, :] = LTrain[perm[j], :, :, :] summary, _ = sess.run( [merged, optOp], feed_dict={ UNet2D.tfData: batchData, tfLabels: batchLabels, UNet2D.tfTraining: 1 }) trainWriter.add_summary(summary, i) # validation perm = np.arange(nValid) np.random.shuffle(perm) for j in range(batchSize): batchData[j, :, :, :] = Valid[perm[j], :, :, :] batchLabels[j, :, :, :] = LValid[perm[j], :, :, :] summary, es = sess.run( [merged, errors], feed_dict={ UNet2D.tfData: batchData, tfLabels: batchLabels, UNet2D.tfTraining: 0 }) validWriter.add_summary(summary, i) e = np.mean(es) print('step %05d, e: %f' % (i, e)) if i == 0: if restoreVariables: lowestError = e else: lowestError = np.inf if np.mod(i, 100) == 0 and e < lowestError: lowestError = e print("Model saved in file: %s" % saver.save(sess, outModelPath)) # -------------------------------------------------- # test # -------------------------------------------------- if not os.path.exists(outPMPath): os.makedirs(outPMPath) for i in range(nTest): j = np.mod(i, batchSize) batchData[j, :, :, :] = Test[i, :, :, :] batchLabels[j, :, :, :] = LTest[i, :, :, :] if j == batchSize - 1 or i == nTest - 1: output = sess.run(UNet2D.nn, feed_dict={ UNet2D.tfData: batchData, tfLabels: batchLabels, UNet2D.tfTraining: 0 }) for k in range(j + 1): pm = output[k, :, :, testPMIndex] gt = batchLabels[k, :, :, testPMIndex] im = np.sqrt(normalize(batchData[k, :, :, 0])) 
imwrite( np.uint8(255 * np.concatenate( (im, np.concatenate((pm, gt), axis=1)), axis=1)), '%s/I%05d.png' % (outPMPath, i - j + k + 1)) # -------------------------------------------------- # save hyper-parameters, clean-up # -------------------------------------------------- saveData(UNet2D.hp, pathjoin(modelPath, 'hp.data')) trainWriter.close() validWriter.close() sess.close()
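# --- Illustrative sketch (not part of the original training script) ---
# The per-class pixel error in the 'eval' scope above is
# 1 - (#pixels correctly predicted as class c) / (#pixels labelled as class c).
# A small NumPy version with a toy 4x4 image and 3 classes (made-up data), and
# a guard against empty classes that the graph code does not have:
import numpy as np

def per_class_pixel_error(one_hot_labels, class_probs):
    """one_hot_labels: [H, W, C]; class_probs: [H, W, C] network output."""
    predicted = np.argmax(class_probs, axis=-1)
    errors = []
    for c in range(one_hot_labels.shape[-1]):
        labels_c = one_hot_labels[..., c].astype(np.int32)
        predict_c = (predicted == c).astype(np.int32)
        n_correct = np.sum(labels_c * predict_c)
        n_labels = np.sum(labels_c)
        errors.append(1.0 - n_correct / max(n_labels, 1))
    return errors

labels = np.eye(3)[np.random.randint(0, 3, size=(4, 4))]   # [4, 4, 3] one-hot
probs = np.random.rand(4, 4, 3)
print(per_class_pixel_error(labels, probs))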
def prepare_processing_graph(self, flags): """Builds a TensorFlow graph to apply the input distortions. Creates a graph that loads a WAVE file, decodes it, scales the volume, shifts it in time, adds in background noise, calculates a spectrogram, and then builds an MFCC fingerprint from that. This must be called with an active TensorFlow session running, and it creates multiple placeholder inputs, and one output: - wav_filename_placeholder_: Filename of the WAV to load. - foreground_volume_placeholder_: How loud the main clip should be. - foreground_resampling_placeholder_: Controls signal stretching/squeezing - time_shift_padding_placeholder_: Where to pad the clip. - time_shift_offset_placeholder_: How much to move the clip in time. - background_data_placeholder_: PCM sample data for background noise. - background_volume_placeholder_: Loudness of mixed-in background. - output_: Output 2D fingerprint of processed audio or raw audio. Args: flags: data and model parameters, described at model_train.py Raises: ValueError: If the preprocessing mode isn't recognized. Exception: If the preprocessor wasn't compiled in. """ with tf.get_default_graph().name_scope('data'): desired_samples = flags.desired_samples self.wav_filename_placeholder_ = tf.placeholder( tf.string, [], name='wav_filename') wav_loader = io_ops.read_file(self.wav_filename_placeholder_) wav_decoder = tf.audio.decode_wav(wav_loader, desired_channels=1, desired_samples=desired_samples) # Allow the audio sample's volume to be adjusted. self.foreground_volume_placeholder_ = tf.placeholder( tf.float32, [], name='foreground_volume') # signal resampling to generate more training data # it will stretch or squeeze input signal proportinally to: self.foreground_resampling_placeholder_ = tf.placeholder( tf.float32, []) if self.foreground_resampling_placeholder_ != 1.0: image = tf.expand_dims(wav_decoder.audio, 0) image = tf.expand_dims(image, 2) shape = tf.shape(wav_decoder.audio) image_resized = tf.image.resize( images=image, size=(tf.cast((tf.cast(shape[0], tf.float32) * self.foreground_resampling_placeholder_), tf.int32), 1), preserve_aspect_ratio=False) image_resized_cropped = tf.image.resize_with_crop_or_pad( image_resized, target_height=desired_samples, target_width=1, ) image_resized_cropped = tf.squeeze(image_resized_cropped, axis=[0, 3]) scaled_foreground = tf.multiply( image_resized_cropped, self.foreground_volume_placeholder_) else: scaled_foreground = tf.multiply( wav_decoder.audio, self.foreground_volume_placeholder_) # Shift the sample's start position, and pad any gaps with zeros. self.time_shift_padding_placeholder_ = tf.placeholder( tf.int32, [2, 2], name='time_shift_padding') self.time_shift_offset_placeholder_ = tf.placeholder( tf.int32, [2], name='time_shift_offset') padded_foreground = tf.pad( tensor=scaled_foreground, paddings=self.time_shift_padding_placeholder_, mode='CONSTANT') sliced_foreground = tf.slice(padded_foreground, self.time_shift_offset_placeholder_, [desired_samples, -1]) # Mix in background noise. 
self.background_data_placeholder_ = tf.placeholder( tf.float32, [desired_samples, 1], name='background_data') self.background_volume_placeholder_ = tf.placeholder( tf.float32, [], name='background_volume') background_mul = tf.multiply(self.background_data_placeholder_, self.background_volume_placeholder_) background_add = tf.add(background_mul, sliced_foreground) background_clamp = tf.clip_by_value(background_add, -1.0, 1.0) if flags.preprocess == 'raw': # background_clamp dims: [time, channels] # remove channel dim self.output_ = tf.squeeze(background_clamp, axis=1) # below options are for backward compatibility with previous # version of hotword detection on microcontrollers # in this case audio feature extraction is done separately from # neural net and user will have to manage it. elif flags.preprocess == 'mfcc': # Run the spectrogram and MFCC ops to get a 2D audio: Short-time FFTs # background_clamp dims: [time, channels] spectrogram = audio_ops.audio_spectrogram( background_clamp, window_size=flags.window_size_samples, stride=flags.window_stride_samples, magnitude_squared=flags.fft_magnitude_squared) # spectrogram: [channels/batch, frames, fft_feature] # extract mfcc features from spectrogram by audio_ops.mfcc: # 1 Input is spectrogram frames. # 2 Weighted spectrogram into bands using a triangular mel filterbank # 3 Logarithmic scaling # 4 Discrete cosine transform (DCT), return lowest dct_coefficient_count mfcc = audio_ops.mfcc( spectrogram=spectrogram, sample_rate=flags.sample_rate, upper_frequency_limit=flags.mel_upper_edge_hertz, lower_frequency_limit=flags.mel_lower_edge_hertz, filterbank_channel_count=flags.mel_num_bins, dct_coefficient_count=flags.dct_num_features) # mfcc: [channels/batch, frames, dct_coefficient_count] # remove channel dim self.output_ = tf.squeeze(mfcc, axis=0) elif flags.preprocess == 'micro': if not frontend_op: raise Exception( 'Micro frontend op is currently not available when running' ' TensorFlow directly from Python, you need to build and run' ' through Bazel') int16_input = tf.cast( tf.multiply(background_clamp, MAX_ABS_INT16), tf.int16) # audio_microfrontend does: # 1. A slicing window function of raw audio # 2. Short-time FFTs # 3. Filterbank calculations # 4. Noise reduction # 5. PCAN Auto Gain Control # 6. Logarithmic scaling # int16_input dims: [time, channels] micro_frontend = frontend_op.audio_microfrontend( int16_input, sample_rate=flags.sample_rate, window_size=flags.window_size_ms, window_step=flags.window_stride_ms, num_channels=flags.mel_num_bins, upper_band_limit=flags.mel_upper_edge_hertz, lower_band_limit=flags.mel_lower_edge_hertz, out_scale=1, out_type=tf.float32) # int16_input dims: [frames, num_channels] self.output_ = tf.multiply(micro_frontend, (10.0 / 256.0)) else: raise ValueError( 'Unknown preprocess mode "%s" (should be "raw", ' ' "mfcc", or "micro")' % (flags.preprocess))
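# --- Illustrative sketch (not part of the original class) ---
# The time shift above is implemented as "pad, then slice a fixed-length
# window": padding on the front delays the clip, padding on the back plus a
# nonzero offset advances it.  A NumPy version with made-up sizes:
import numpy as np

def time_shift(clip, desired_samples, shift):
    """clip: [samples, 1]; positive shift delays the audio, negative advances it."""
    if shift >= 0:
        padding = [(shift, 0), (0, 0)]
        offset = 0
    else:
        padding = [(0, -shift), (0, 0)]
        offset = -shift
    padded = np.pad(clip, padding, mode='constant')
    return padded[offset:offset + desired_samples, :]

clip = np.random.randn(16000, 1).astype(np.float32)   # hypothetical 1 s at 16 kHz
print(time_shift(clip, 16000, 100).shape)    # (16000, 1)
print(time_shift(clip, 16000, -100).shape)   # (16000, 1)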
def CustomCropImages(images, input_shape, target_shape, target_locations): """Crop a list of images at with a custom crop location and size. Args: images: List of tensors of shape [batch_size, h, w, c]. input_shape: Shape [h, w, c] of the input images. target_shape: Shape [h, w] of the cropped output. target_locations: List of crop center coordinates tensors of shape [b, 2]. Returns: crops: List of cropped tensors of shape [batch_size] + target_shape + [3]. """ if len(input_shape) != 3: raise ValueError( 'The input shape has to be of the form (height, width, channels) ' 'but has len {}'.format(len(input_shape))) if len(target_shape) != 2: raise ValueError('The target shape has to be of the form (height, width) ' 'but has len {}'.format(len(target_shape))) if len(images) != len(target_locations): raise ValueError('There should be one target location per image. Found {} ' 'images for {} locations'.format(len(images), len(target_locations))) if input_shape[0] == target_shape[0] and input_shape[1] == target_shape[1]: return [image for image in images] if input_shape[0] < target_shape[0] or input_shape[1] < target_shape[1]: raise ValueError('The target shape {} is larger than the input image size ' '{}'.format(target_shape, input_shape[:2])) assert_ops = [] for image, target_location in zip(images, target_locations): # Assert all images have the same shape. assert_ops.append( tf.assert_equal( input_shape[:2], tf.shape(image)[1:3], message=('All images must have same width and height' 'for CenterCropImages.'))) with tf.control_dependencies(assert_ops): crops = [] for image, target_location in zip(images, target_locations): # If bounding box is outside of image boundaries, move it x_coordinates = tf.slice( target_location, [0, 1], [tf.shape(target_location)[0], 1]) y_coordinates = tf.slice( target_location, [0, 0], [tf.shape(target_location)[0], 1]) x_coordinates = tf.math.maximum( tf.cast(x_coordinates, tf.float32), tf.cast(target_shape[1] // 2, tf.float32)) y_coordinates = tf.math.maximum( tf.cast(y_coordinates, tf.float32), tf.cast(target_shape[0] // 2, tf.float32)) x_coordinates = tf.math.minimum( tf.cast(x_coordinates, tf.float32), tf.cast(tf.shape(image)[2] - target_shape[1] // 2, tf.float32)) y_coordinates = tf.math.minimum( tf.cast(y_coordinates, tf.float32), tf.cast(tf.shape(image)[1] - target_shape[0] // 2, tf.float32) ) target_location = tf.concat([x_coordinates, y_coordinates], 1) crops.append( tf.image.extract_glimpse(image, target_shape, tf.cast( target_location, tf.float32), centered=False, normalized=False)) return crops
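# --- Illustrative sketch (not part of the original function) ---
# The tf.math.maximum / tf.math.minimum calls above clamp each requested crop
# centre so a target_shape-sized window never leaves the image.  The same
# clamping in NumPy, with made-up image and crop sizes:
import numpy as np

def clamp_crop_centers(centers_yx, image_hw, target_hw):
    """centers_yx: [batch, 2] (y, x); returns centres valid for target_hw crops."""
    half_h, half_w = target_hw[0] // 2, target_hw[1] // 2
    y = np.clip(centers_yx[:, 0], half_h, image_hw[0] - half_h)
    x = np.clip(centers_yx[:, 1], half_w, image_hw[1] - half_w)
    return np.stack([y, x], axis=1)

centers = np.array([[5, 5], [250, 300]], dtype=np.float32)
print(clamp_crop_centers(centers, image_hw=(256, 256), target_hw=(64, 64)))
# [[ 32.  32.]
#  [224. 224.]]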
def main(_): # We want to see all the logging messages for this tutorial. tf.logging.set_verbosity(tf.logging.INFO) np.set_printoptions(threshold=np.inf, linewidth=10000) flags = vars(FLAGS) for key in sorted(flags.keys()): tf.logging.info('%s = %s', key, flags[key]) if FLAGS.random_seed_weights != -1: tf.random.set_random_seed(FLAGS.random_seed_weights) # Start a new TensorFlow session. config = tf.ConfigProto() config.gpu_options.allow_growth = True config.allow_soft_placement = True #config.log_device_placement = False sess = tf.InteractiveSession(config=config) # Begin by making sure we have the training data we need. If you already have # training data of your own, use `--data_url= ` on the command line to avoid # downloading. label_count = len( input_data.prepare_words_list(FLAGS.wanted_words.split(','), FLAGS.silence_percentage, FLAGS.unknown_percentage)) model_settings = models.prepare_model_settings( label_count, FLAGS.sample_rate, FLAGS.nchannels, FLAGS.clip_duration_ms, FLAGS.representation, FLAGS.window_size_ms, FLAGS.window_stride_ms, 1, FLAGS.dct_coefficient_count, FLAGS.filterbank_channel_count, [int(x) for x in FLAGS.filter_counts.split(',')], [int(x) for x in FLAGS.filter_sizes.split(',')], FLAGS.final_filter_len, FLAGS.dropout_prob, FLAGS.batch_size, FLAGS.dilate_after_layer, FLAGS.stride_after_layer, FLAGS.connection_type) fingerprint_size = model_settings['fingerprint_size'] time_shift_samples = int((FLAGS.time_shift_ms * FLAGS.sample_rate) / 1000) # Figure out the learning rates for each training phase. Since it's often # effective to have high learning rates at the start of training, followed by # lower levels towards the end, the number of steps and learning rates can be # specified as comma-separated lists to define the rate at each stage. For # example --how_many_training_steps=10000,3000 --learning_rate=0.001,0.0001 # will run 13,000 training loops in total, with a rate of 0.001 for the first # 10,000, and 0.0001 for the final 3,000. training_steps_list = list( map(int, FLAGS.how_many_training_steps.split(','))) learning_rates_list = list(map(float, FLAGS.learning_rate.split(','))) if len(training_steps_list) != len(learning_rates_list): raise Exception( '--how_many_training_steps and --learning_rate must be equal length ' 'lists, but are %d and %d long instead' % (len(training_steps_list), len(learning_rates_list))) actual_batch_size = tf.placeholder(tf.int32, [1]) fingerprint_input = tf.placeholder(tf.float32, [None, fingerprint_size], name='fingerprint_input') hidden, logits, dropout_prob = models.create_model( fingerprint_input, model_settings, FLAGS.model_architecture, is_training=True) # Define loss and optimizer ground_truth_input = tf.placeholder(tf.int64, [None], name='groundtruth_input') # Optionally we can add runtime checks to spot when NaNs or other symptoms of # numerical errors start occurring during training. control_dependencies = [] if FLAGS.check_nans: checks = tf.add_check_numerics_ops() control_dependencies = [checks] # Create the back propagation and training evaluation machinery in the graph. 
with tf.name_scope('cross_entropy'): cross_entropy_mean = tf.losses.sparse_softmax_cross_entropy( labels=tf.slice(ground_truth_input, [0], actual_batch_size), logits=tf.slice(logits, [0, 0], tf.concat([actual_batch_size, [-1]], 0))) tf.summary.scalar('cross_entropy', cross_entropy_mean) with tf.name_scope('train'), tf.control_dependencies(control_dependencies): learning_rate_input = tf.placeholder(tf.float32, [], name='learning_rate_input') if FLAGS.optimizer == 'sgd': train_step = tf.train.GradientDescentOptimizer( learning_rate_input).minimize(cross_entropy_mean) elif FLAGS.optimizer == 'adam': train_step = tf.train.AdamOptimizer(learning_rate_input).minimize( cross_entropy_mean) elif FLAGS.optimizer == 'adagrad': train_step = tf.train.AdagradOptimizer( learning_rate_input).minimize(cross_entropy_mean) elif FLAGS.optimizer == 'rmsprop': train_step = tf.train.RMSPropOptimizer( learning_rate_input).minimize(cross_entropy_mean) predicted_indices = tf.argmax(logits, 1) correct_prediction = tf.equal(predicted_indices, ground_truth_input) confusion_matrix = tf.confusion_matrix(tf.slice(ground_truth_input, [0], actual_batch_size), tf.slice(predicted_indices, [0], actual_batch_size), num_classes=label_count) evaluation_step = tf.reduce_mean( tf.cast(tf.slice(correct_prediction, [0], actual_batch_size), tf.float32)) tf.summary.scalar('accuracy', evaluation_step) global_step = tf.train.get_or_create_global_step() increment_global_step = tf.assign(global_step, global_step + 1) saver = tf.train.Saver(tf.global_variables(), max_to_keep=0) # Merge all the summaries and write them out to /tmp/retrain_logs (by default) merged_summaries = tf.summary.merge_all() train_writer = tf.summary.FileWriter(FLAGS.summaries_dir + '/train', sess.graph) validation_writer = tf.summary.FileWriter(FLAGS.summaries_dir + '/validation') tf.global_variables_initializer().run() start_step = 1 if FLAGS.start_checkpoint: models.load_variables_from_checkpoint(sess, FLAGS.start_checkpoint) start_step = 1 + global_step.eval(session=sess) t0 = dt.datetime.now() tf.logging.info('Training from time %s, step: %d ', t0.isoformat(), start_step) # Save graph.pbtxt. tf.train.write_graph(sess.graph_def, FLAGS.train_dir, FLAGS.model_architecture + '.pbtxt') # Save list of words. 
if FLAGS.start_checkpoint == '': with gfile.GFile(os.path.join(FLAGS.train_dir, \ FLAGS.model_architecture + '_labels.txt'), 'w') as f: f.write(FLAGS.wanted_words.replace(',', '\n')) # log complexity of model total_parameters = 0 for variable in tf.trainable_variables(): shape = variable.get_shape() variable_parameters = 1 for dim in shape: variable_parameters *= int(dim) total_parameters += variable_parameters tf.logging.info('number of trainable parameters: %d', total_parameters) checkpoint_path = os.path.join(FLAGS.train_dir, FLAGS.model_architecture + '.ckpt') if FLAGS.start_checkpoint == '': tf.logging.info('Saving to "%s-%d"', checkpoint_path, 0) saver.save(sess, checkpoint_path, global_step=0) audio_processor = input_data.AudioProcessor( FLAGS.data_url, FLAGS.data_dir, FLAGS.silence_percentage, FLAGS.unknown_percentage, FLAGS.wanted_words.split(','), FLAGS.labels_touse.split(','), FLAGS.validation_percentage, FLAGS.validation_offset_percentage, FLAGS.validation_files.split(','), FLAGS.testing_percentage, FLAGS.testing_files.split(','), FLAGS.subsample_skip, FLAGS.subsample_word, FLAGS.partition_word, FLAGS.partition_n, FLAGS.partition_training_files.split(','), FLAGS.partition_validation_files.split(','), FLAGS.random_seed_batch, FLAGS.testing_equalize_ratio, FLAGS.testing_max_samples, model_settings) # exit if how_many_training_steps==0 if FLAGS.how_many_training_steps == '0': # pre-process a batch of data to make sure settings are valid train_fingerprints, train_ground_truth, _ = audio_processor.get_data( FLAGS.batch_size, 0, model_settings, FLAGS.background_frequency, FLAGS.background_volume, time_shift_samples, FLAGS.time_shift_random, 'training', sess) sess.run( [evaluation_step], feed_dict={ fingerprint_input: train_fingerprints, ground_truth_input: train_ground_truth, learning_rate_input: learning_rates_list[0], actual_batch_size: [FLAGS.batch_size], dropout_prob: model_settings['dropout_prob'] }) return training_set_size = audio_processor.set_size('training') testing_set_size = audio_processor.set_size('testing') validation_set_size = audio_processor.set_size('validation') # Training loop. training_steps_max = np.sum(training_steps_list) for training_step in xrange(start_step, training_steps_max + 1): if training_set_size > 0 and FLAGS.save_step_interval > 0: # Figure out what the current learning rate is. training_steps_sum = 0 for i in range(len(training_steps_list)): training_steps_sum += training_steps_list[i] if training_step <= training_steps_sum: learning_rate_value = learning_rates_list[i] break # Pull the audio samples we'll use for training. train_fingerprints, train_ground_truth, _ = audio_processor.get_data( FLAGS.batch_size, 0, model_settings, FLAGS.background_frequency, FLAGS.background_volume, time_shift_samples, FLAGS.time_shift_random, 'training', sess) # Run the graph with this batch of training data. 
train_summary, train_accuracy, cross_entropy_value, _, _ = sess.run( [ merged_summaries, evaluation_step, cross_entropy_mean, train_step, increment_global_step ], feed_dict={ fingerprint_input: train_fingerprints, ground_truth_input: train_ground_truth, learning_rate_input: learning_rate_value, actual_batch_size: [FLAGS.batch_size], dropout_prob: model_settings['dropout_prob'] }) train_writer.add_summary(train_summary, training_step) t1 = dt.datetime.now() - t0 tf.logging.info( 'Elapsed %f, Step #%d: rate %f, accuracy %.1f%%, cross entropy %f' % (t1.total_seconds(), training_step, learning_rate_value, train_accuracy * 100, cross_entropy_value)) # Save the model checkpoint periodically. if (training_step % FLAGS.save_step_interval == 0 or training_step == training_steps_max): tf.logging.info('Saving to "%s-%d"', checkpoint_path, training_step) saver.save(sess, checkpoint_path, global_step=training_step) is_last_step = (training_step == training_steps_max) if validation_set_size > 0 and (is_last_step or (training_step % FLAGS.eval_step_interval) == 0): validate_and_test('validation', validation_set_size, model_settings, \ time_shift_samples, sess, merged_summaries, evaluation_step, \ confusion_matrix, logits, hidden, validation_writer, \ audio_processor, is_last_step, fingerprint_input, \ ground_truth_input, actual_batch_size, dropout_prob, \ training_step, t0) if testing_set_size > 0: validate_and_test('testing', testing_set_size, model_settings, time_shift_samples, \ sess, merged_summaries, evaluation_step, confusion_matrix, \ logits, hidden, validation_writer, audio_processor, \ True, fingerprint_input, ground_truth_input, \ actual_batch_size, dropout_prob, training_steps_max, t0)
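# --- Illustrative sketch (not part of the original script) ---
# The piecewise learning-rate schedule above walks the comma-separated step
# and rate lists until the cumulative step count covers the current step.
# A tiny standalone version (example values, not the real flags):

def learning_rate_for_step(step, steps_list, rates_list):
    """steps_list and rates_list must be the same length, as checked above."""
    steps_sum = 0
    for n_steps, rate in zip(steps_list, rates_list):
        steps_sum += n_steps
        if step <= steps_sum:
            return rate
    return rates_list[-1]

# --how_many_training_steps=10000,3000 --learning_rate=0.001,0.0001
print(learning_rate_for_step(500, [10000, 3000], [0.001, 0.0001]))     # 0.001
print(learning_rate_for_step(12000, [10000, 3000], [0.001, 0.0001]))   # 0.0001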
def evolved_transformer_decoder(decoder_input, encoder_output, decoder_self_attention_bias, encoder_decoder_attention_bias, hparams, cache=None, decode_loop_step=None, name="decoder", nonpadding=None, save_weights_to=None, make_image_summary=True, losses=None): """Evolved Transformer decoder. See arxiv.org/abs/1901.11117 for more details. Args: decoder_input: a Tensor. encoder_output: a Tensor. decoder_self_attention_bias: bias Tensor for self-attention (see common_attention.attention_bias()). encoder_decoder_attention_bias: bias Tensor for encoder-decoder attention (see common_attention.attention_bias()). hparams: hyperparameters for model. cache: dict, containing tensors which are the results of previous layers, used for fast decoding. decode_loop_step: An integer, step number of the decoding loop. Only used for inference on TPU. name: a string. nonpadding: optional Tensor with shape [batch_size, encoder_length] indicating what positions are not padding. This is used to mask out padding in convolutional layers. We generally only need this mask for "packed" datasets, because for ordinary datasets, no padding is ever followed by nonpadding. save_weights_to: an optional dictionary to capture attention weights for visualization; the weights tensor will be appended there under a string key created from the variable scope (including name). make_image_summary: Whether to make an attention image summary. losses: Not supported. Returns: Decoder output tensor. """ del losses num_trainable_top_decoder_layers = hparams.get( "num_trainable_top_decoder_layers", -1) # -1 means train all weights. if num_trainable_top_decoder_layers >= 0: encoder_output = tf.stop_gradient(encoder_output) attention_dropout_broadcast_dims = ( common_layers.comma_separated_string_to_integer_list( getattr(hparams, "attention_dropout_broadcast_dims", ""))) with tf.variable_scope(name): hidden_state = decoder_input num_layers = hparams.num_decoder_layers or hparams.num_hidden_layers for layer in range(num_layers): if num_trainable_top_decoder_layers == num_layers - layer: hidden_state = tf.stop_gradient(hidden_state) layer_name = "layer_%d" % layer layer_cache = cache[layer_name] if cache is not None else None with tf.variable_scope(layer_name): with tf.variable_scope(_SIXTEEN_HEAD_ATTENTION_NAME): residual_state = hidden_state hidden_state = common_layers.layer_preprocess( hidden_state, hparams) attention_cache = layer_cache[ _SIXTEEN_HEAD_ATTENTION_NAME] if layer_cache is not None else None left_state = common_attention.multihead_attention( hidden_state, None, decoder_self_attention_bias, hparams.attention_key_channels or hparams.hidden_size, hparams.attention_value_channels or hparams.hidden_size, hparams.hidden_size, _capped_double_heads(hparams.num_heads), hparams.attention_dropout, attention_type=hparams.self_attention_type, max_relative_position=hparams.max_relative_position, heads_share_relative_embedding=( hparams.heads_share_relative_embedding), add_relative_to_values=hparams.add_relative_to_values, save_weights_to=save_weights_to, cache=attention_cache, make_image_summary=make_image_summary, dropout_broadcast_dims=attention_dropout_broadcast_dims, max_length=hparams.get("max_length"), decode_loop_step=decode_loop_step, vars_3d=hparams.get("attention_variables_3d"), activation_dtype=hparams.get("activation_dtype", "float32"), weight_dtype=hparams.get("weight_dtype", "float32")) if encoder_output is not None: with tf.variable_scope(_FIRST_ATTEND_TO_ENCODER_NAME): attention_cache = ( 
layer_cache[_FIRST_ATTEND_TO_ENCODER_NAME] if layer_cache is not None else None) right_state = common_attention.multihead_attention( hidden_state, encoder_output, encoder_decoder_attention_bias, hparams.attention_key_channels or hparams.hidden_size, hparams.attention_value_channels or hparams.hidden_size, hparams.hidden_size, hparams.num_heads, hparams.attention_dropout, max_relative_position=hparams. max_relative_position, heads_share_relative_embedding=( hparams.heads_share_relative_embedding), add_relative_to_values=hparams. add_relative_to_values, save_weights_to=save_weights_to, cache=attention_cache, make_image_summary=make_image_summary, dropout_broadcast_dims= attention_dropout_broadcast_dims, max_length=hparams.get("max_length"), vars_3d=hparams.get("attention_variables_3d"), activation_dtype=hparams.get( "activation_dtype", "float32"), weight_dtype=hparams.get("weight_dtype", "float32")) left_state = tf.nn.dropout( left_state, 1 - hparams.layer_prepostprocess_dropout) right_state = tf.nn.dropout( right_state, 1 - hparams.layer_prepostprocess_dropout) hidden_state = residual_state + left_state + right_state else: hidden_state = common_layers.layer_postprocess( residual_state, left_state, hparams) with tf.variable_scope(_CONV_BRANCHES_NAME): residual_state = hidden_state hidden_state = common_layers.layer_preprocess( hidden_state, hparams) if nonpadding is not None: # Mask padding from conv layers. mask = tf.tile(tf.expand_dims(nonpadding, 2), [1, 1, hparams.hidden_size]) hidden_state *= mask if layer_cache: if decode_loop_step is None: hidden_state = layer_cache[ _CONV_BRANCHES_FIRST_LAYER_NAME] = tf.concat( [ layer_cache[ _CONV_BRANCHES_FIRST_LAYER_NAME], hidden_state ], axis=1)[:, -1 * _DECODER_LEFT_CONV_PADDING - 1:, :] left_state = hidden_state right_state = hidden_state[:, _DECODER_LEFT_CONV_PADDING - _DECODER_RIGHT_CONV_PADDING:, :] else: # Inplace update is required for inference on TPU. # Inplace_ops only supports inplace_update on the first dimension. tmp = tf.transpose( layer_cache[_CONV_BRANCHES_FIRST_LAYER_NAME], perm=[1, 0, 2]) tmp = tf.expand_dims(tmp, axis=1) tmp = inplace_ops.alias_inplace_update( tmp, decode_loop_step * tf.shape(hidden_state)[1] + _DECODER_LEFT_CONV_PADDING, tf.transpose(hidden_state, perm=[1, 0, 2])) tmp = tf.squeeze(tmp, axis=1) hidden_state = layer_cache[ _CONV_BRANCHES_FIRST_LAYER_NAME] = tf.transpose( tmp, perm=[1, 0, 2]) batch_size = hidden_state.shape.as_list()[0] left_state = tf.slice( hidden_state, [0, decode_loop_step, 0], [ batch_size, _DECODER_LEFT_CONV_PADDING + 1, hparams.hidden_size ]) right_state = tf.slice(hidden_state, [ 0, decode_loop_step + _DECODER_LEFT_CONV_PADDING - _DECODER_RIGHT_CONV_PADDING, 0 ], [ batch_size, _DECODER_RIGHT_CONV_PADDING + 1, hparams.hidden_size ]) else: # No caching. 
left_state = tf.pad( hidden_state, paddings=[[0, 0], [_DECODER_LEFT_CONV_PADDING, 0], [0, 0]]) right_state = tf.pad( hidden_state, paddings=[[0, 0], [_DECODER_RIGHT_CONV_PADDING, 0], [0, 0]]) left_output_dim = int(hparams.hidden_size * 2) separable_conv_11x1 = tf.layers.SeparableConv1D( left_output_dim, 11, padding="VALID", name="separable_conv11x1", activation=tf.nn.relu) left_state = separable_conv_11x1.apply(left_state) left_state = tf.nn.dropout( left_state, 1 - hparams.layer_prepostprocess_dropout) right_output_dim = int(hparams.hidden_size / 2) separable_conv_7x1_1 = tf.layers.SeparableConv1D( right_output_dim, 7, padding="VALID", name="separable_conv_7x1_1") right_state = separable_conv_7x1_1.apply(right_state) right_state = tf.nn.dropout( right_state, 1 - hparams.layer_prepostprocess_dropout) right_state = tf.pad( right_state, [[0, 0], [0, 0], [0, left_output_dim - right_output_dim]], constant_values=0) hidden_state = left_state + right_state hidden_state = common_layers.layer_preprocess( hidden_state, hparams) if nonpadding is not None: # Mask padding from conv layers. mask = tf.tile(tf.expand_dims(nonpadding, 2), [1, 1, hparams.hidden_size * 2]) hidden_state *= mask if layer_cache: if decode_loop_step is None: hidden_state = layer_cache[ _CONV_BRANCHES_SECOND_LAYER_NAME] = tf.concat( [ layer_cache[ _CONV_BRANCHES_SECOND_LAYER_NAME], hidden_state ], axis=1)[:, -1 * _DECODER_FINAL_CONV_PADDING - 1:, :] else: # Inplace update is required for inference on TPU. # Inplace_ops only supports inplace_update on the first dimension. tmp = tf.transpose( layer_cache[_CONV_BRANCHES_SECOND_LAYER_NAME], perm=[1, 0, 2]) tmp = tf.expand_dims(tmp, axis=1) tmp = inplace_ops.alias_inplace_update( tmp, (decode_loop_step + _DECODER_FINAL_CONV_PADDING) * tf.shape(hidden_state)[1], tf.transpose(hidden_state, perm=[1, 0, 2])) tmp = tf.squeeze(tmp, axis=1) hidden_state = layer_cache[ _CONV_BRANCHES_SECOND_LAYER_NAME] = tf.transpose( tmp, perm=[1, 0, 2]) batch_size = hidden_state.shape.as_list()[0] hidden_state = tf.slice( hidden_state, [0, decode_loop_step, 0], [ batch_size, _DECODER_FINAL_CONV_PADDING + 1, hparams.hidden_size * 2 ]) else: hidden_state = tf.pad( hidden_state, paddings=[[0, 0], [_DECODER_FINAL_CONV_PADDING, 0], [0, 0]]) separable_conv_7x1_2 = tf.layers.SeparableConv1D( hparams.hidden_size, 7, padding="VALID", name="separable_conv_7x1_2") hidden_state = separable_conv_7x1_2.apply(hidden_state) hidden_state = common_layers.layer_postprocess( residual_state, hidden_state, hparams) with tf.variable_scope(_VANILLA_ATTENTION_NAME): residual_state = hidden_state hidden_state = common_layers.layer_preprocess( hidden_state, hparams) attention_cache = layer_cache[ _VANILLA_ATTENTION_NAME] if layer_cache is not None else None hidden_state = common_attention.multihead_attention( hidden_state, None, decoder_self_attention_bias, hparams.attention_key_channels or hparams.hidden_size, hparams.attention_value_channels or hparams.hidden_size, hparams.hidden_size, hparams.num_heads, hparams.attention_dropout, attention_type=hparams.self_attention_type, max_relative_position=hparams.max_relative_position, heads_share_relative_embedding=( hparams.heads_share_relative_embedding), add_relative_to_values=hparams.add_relative_to_values, save_weights_to=save_weights_to, cache=attention_cache, make_image_summary=make_image_summary, dropout_broadcast_dims=attention_dropout_broadcast_dims, max_length=hparams.get("max_length"), decode_loop_step=decode_loop_step, vars_3d=hparams.get("attention_variables_3d"), 
activation_dtype=hparams.get("activation_dtype", "float32"), weight_dtype=hparams.get("weight_dtype", "float32")) hidden_state = common_layers.layer_postprocess( residual_state, hidden_state, hparams) if encoder_output is not None: with tf.variable_scope(_SECOND_ATTEND_TO_ENCODER_NAME): residual_state = hidden_state hidden_state = common_layers.layer_preprocess( hidden_state, hparams) attention_cache = ( layer_cache[_SECOND_ATTEND_TO_ENCODER_NAME] if layer_cache is not None else None) hidden_state = common_attention.multihead_attention( hidden_state, encoder_output, encoder_decoder_attention_bias, hparams.attention_key_channels or hparams.hidden_size, hparams.attention_value_channels or hparams.hidden_size, hparams.hidden_size, hparams.num_heads, hparams.attention_dropout, max_relative_position=hparams. max_relative_position, heads_share_relative_embedding=( hparams.heads_share_relative_embedding), add_relative_to_values=hparams. add_relative_to_values, save_weights_to=save_weights_to, cache=attention_cache, make_image_summary=make_image_summary, dropout_broadcast_dims= attention_dropout_broadcast_dims, max_length=hparams.get("max_length"), vars_3d=hparams.get("attention_variables_3d"), activation_dtype=hparams.get( "activation_dtype", "float32"), weight_dtype=hparams.get("weight_dtype", "float32")) hidden_state = common_layers.layer_postprocess( residual_state, hidden_state, hparams) with tf.variable_scope("dense_layers"): residual_state = hidden_state hidden_state = common_layers.layer_preprocess( hidden_state, hparams) hidden_state = tf.layers.dense(hidden_state, int(hparams.hidden_size * 4), activation=tf.nn.swish) hidden_state = tf.nn.dropout( hidden_state, 1 - hparams.layer_prepostprocess_dropout) hidden_state = common_layers.layer_preprocess( hidden_state, hparams) hidden_state = tf.layers.dense(hidden_state, hparams.hidden_size) hidden_state = common_layers.layer_postprocess( residual_state, hidden_state, hparams) decoder_output = common_layers.layer_preprocess(hidden_state, hparams) if num_trainable_top_decoder_layers == 0: decoder_output = tf.stop_gradient(decoder_output) return decoder_output
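# --- Illustrative sketch (not part of the decoder above) ---
# In the conv branches, the right branch produces hidden_size/2 channels while
# the left branch produces 2*hidden_size; before they are summed, the right
# branch is zero-padded along its channel axis up to the left branch's width.
# A NumPy sketch of that merge, with a made-up hidden size of 8:
import numpy as np

hidden_size = 8
left_state = np.random.randn(2, 10, hidden_size * 2)     # [batch, length, 2*h]
right_state = np.random.randn(2, 10, hidden_size // 2)   # [batch, length, h/2]

pad_width = left_state.shape[-1] - right_state.shape[-1]
right_padded = np.pad(right_state, [(0, 0), (0, 0), (0, pad_width)],
                      mode='constant')
merged = left_state + right_padded
print(merged.shape)   # (2, 10, 16)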
def prepare_processing_graph(self, flags): """Builds a TensorFlow graph to apply the input distortions. Creates a graph that loads a WAVE file, decodes it, scales the volume, shifts it in time, adds in background noise, calculates a spectrogram, and then builds an MFCC fingerprint from that. This must be called with an active TensorFlow session running, and it creates multiple placeholder inputs, and one output: - wav_filename_placeholder_: Filename of the WAV to load. - foreground_volume_placeholder_: How loud the main clip should be. - foreground_resampling_placeholder_: Controls signal stretching/squeezing - time_shift_padding_placeholder_: Where to pad the clip. - time_shift_offset_placeholder_: How much to move the clip in time. - background_data_placeholder_: PCM sample data for background noise. - background_volume_placeholder_: Loudness of mixed-in background. - output_: Output 2D fingerprint of processed audio or raw audio. Args: flags: data and model parameters, described at model_train.py Raises: ValueError: If the preprocessing mode isn't recognized. Exception: If the preprocessor wasn't compiled in. """ with tf.compat.v1.get_default_graph().name_scope('data'): desired_samples = flags.desired_samples self.wav_filename_placeholder_ = tf.compat.v1.placeholder( tf.string, [], name='wav_filename') wav_loader = io_ops.read_file(self.wav_filename_placeholder_) wav_decoder = tf.audio.decode_wav( wav_loader, desired_channels=1, desired_samples=desired_samples) # Allow the audio sample's volume to be adjusted. self.foreground_volume_placeholder_ = tf.compat.v1.placeholder( tf.float32, [], name='foreground_volume') # signal resampling to generate more training data # it will stretch or squeeze input signal proportinally to: self.foreground_resampling_placeholder_ = tf.placeholder(tf.float32, []) if self.foreground_resampling_placeholder_ != 1.0: image = tf.expand_dims(wav_decoder.audio, 0) image = tf.expand_dims(image, 2) shape = tf.shape(wav_decoder.audio) image_resized = tf.image.resize( images=image, size=(tf.cast((tf.cast(shape[0], tf.float32) * self.foreground_resampling_placeholder_), tf.int32), 1), preserve_aspect_ratio=False) image_resized_cropped = tf.image.resize_with_crop_or_pad( image_resized, target_height=desired_samples, target_width=1, ) image_resized_cropped = tf.squeeze(image_resized_cropped, axis=[0, 3]) scaled_foreground = tf.multiply(image_resized_cropped, self.foreground_volume_placeholder_) else: scaled_foreground = tf.multiply(wav_decoder.audio, self.foreground_volume_placeholder_) # Shift the sample's start position, and pad any gaps with zeros. self.time_shift_padding_placeholder_ = tf.compat.v1.placeholder( tf.int32, [2, 2], name='time_shift_padding') self.time_shift_offset_placeholder_ = tf.compat.v1.placeholder( tf.int32, [2], name='time_shift_offset') padded_foreground = tf.pad( tensor=scaled_foreground, paddings=self.time_shift_padding_placeholder_, mode='CONSTANT') sliced_foreground = tf.slice(padded_foreground, self.time_shift_offset_placeholder_, [desired_samples, -1]) # Mix in background noise. 
self.background_data_placeholder_ = tf.compat.v1.placeholder( tf.float32, [desired_samples, 1], name='background_data') self.background_volume_placeholder_ = tf.compat.v1.placeholder( tf.float32, [], name='background_volume') background_mul = tf.multiply(self.background_data_placeholder_, self.background_volume_placeholder_) background_add = tf.add(background_mul, sliced_foreground) background_clamp = tf.clip_by_value(background_add, -1.0, 1.0) if flags.preprocess == 'raw': # return raw audio self.output_ = background_clamp tf.compat.v1.summary.image( 'input_audio', tf.expand_dims(tf.expand_dims(background_clamp, -1), -1), max_outputs=1) else: # Run the spectrogram and MFCC ops to get a 2D audio 'fingerprint' spectrogram = audio_ops.audio_spectrogram( background_clamp, window_size=flags.window_size_samples, stride=flags.window_stride_samples, magnitude_squared=True) tf.compat.v1.summary.image( 'spectrogram', tf.expand_dims(spectrogram, -1), max_outputs=1) # The number of buckets in each FFT row in the spectrogram will depend # on how many input samples there are in each window. This can be quite # large, with a 160 sample window producing 127 buckets for example. We # don't need this level of detail for classification, so we often want # to shrink them down to produce a smaller result. That's what this # section implements. One method is to use average pooling to merge # adjacent buckets, but a more sophisticated approach is to apply the # MFCC algorithm to shrink the representation. if flags.preprocess == 'average': self.output_ = tf.nn.pool( input=tf.expand_dims(spectrogram, -1), window_shape=[1, flags.average_window_width], strides=[1, flags.average_window_width], pooling_type='AVG', padding='SAME') tf.compat.v1.summary.image('shrunk_spectrogram', self.output_, max_outputs=1) elif flags.preprocess == 'mfcc': self.output_ = audio_ops.mfcc( spectrogram, wav_decoder.sample_rate, dct_coefficient_count=flags.fingerprint_width) tf.compat.v1.summary.image( 'mfcc', tf.expand_dims(self.output_, -1), max_outputs=1) elif flags.preprocess == 'micro': if not frontend_op: raise Exception( 'Micro frontend op is currently not available when running' ' TensorFlow directly from Python, you need to build and run' ' through Bazel') sample_rate = flags.sample_rate window_size_ms = (flags.window_size_samples * 1000) / sample_rate window_step_ms = (flags.window_stride_samples * 1000) / sample_rate int16_input = tf.cast(tf.multiply(background_clamp, 32768), tf.int16) micro_frontend = frontend_op.audio_microfrontend( int16_input, sample_rate=sample_rate, window_size=window_size_ms, window_step=window_step_ms, num_channels=flags.fingerprint_width, out_scale=1, out_type=tf.float32) self.output_ = tf.multiply(micro_frontend, (10.0 / 256.0)) tf.compat.v1.summary.image( 'micro', tf.expand_dims(tf.expand_dims(self.output_, -1), 0), max_outputs=1) else: raise ValueError('Unknown preprocess mode "%s" (should be "mfcc", ' ' "average", or "micro")' % (flags.preprocess)) # Merge all the summaries and write them out to /tmp/retrain_logs (by # default) self.merged_summaries_ = tf.compat.v1.summary.merge_all(scope='data') if flags.summaries_dir: self.summary_writer_ = tf.compat.v1.summary.FileWriter( flags.summaries_dir + '/data', tf.compat.v1.get_default_graph())
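# --- Illustrative sketch (not part of the original class) ---
# The 'micro' branch feeds the frontend int16 PCM: the clamped float audio in
# [-1, 1] is scaled by 32768 and cast, and the frontend output is rescaled by
# the same 10/256 constant used above.  A NumPy sketch of those two
# conversions; the clip to the int16 range is added here so that exactly 1.0
# does not overflow (the graph above simply casts), and all values are made up.
import numpy as np

MAX_ABS_INT16 = 32768

clamped = np.clip(np.random.randn(16000).astype(np.float32), -1.0, 1.0)
int16_input = np.clip(clamped * MAX_ABS_INT16, -32768, 32767).astype(np.int16)

fake_frontend_output = np.random.randint(0, 256, size=(49, 40)).astype(np.float32)
scaled_features = fake_frontend_output * (10.0 / 256.0)
print(int16_input.dtype, scaled_features.min(), scaled_features.max())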
print('chunk data and apply model')
# if i == 1:
#     import code
#     code.interact(local=locals())
for f in range(len(score_list)):
    stage = math.floor(f / 2)
    IS_EVEN = (f % 2) == 0  # even are big chunks, odd small chunks
    IS_LAST = f == len(score_list) - 1  # last is remainder and not abbreviated
    if IS_EVEN and not IS_LAST:
        chunk = tf.expand_dims(
            tf.slice(waveform, [stage * max_samples, 0], [max_samples, 1]), 0)
        # eliminate last dets later
        scores = score_fn(
            waveform=chunk,
            context_step_samples=context_step_samples)['scores']  # ["scores"].numpy().ravel()
    elif IS_EVEN and IS_LAST:
        pos = stage * max_samples
        if pos != 0:
            rem = samples_all % pos
        else:  # only should occur in single file case
            rem = samples_all
        chunk = tf.expand_dims(tf.slice(waveform, [pos, 0], [rem, 1]), 0)
        scores = score_fn(
            waveform=chunk,
def _update_block_mask(self, weights, threshold, mask): """Performs block-granular masking of the weights. Block pruning occurs only if the block_height or block_width is > 1 and if the weight tensor, when squeezed, has ndims = 2. Otherwise, elementwise pruning occurs. Args: weights: The weight tensor that needs to be masked. threshold: The current threshold value. The function will compute a new threshold and return the exponential moving average using the current value of threshold mask: The mask from the previous pruning update. Returns: new_threshold: The new value of the threshold based on weights, and sparsity at the current global_step new_mask: A numpy array of the same size and shape as weights containing 0 or 1 to indicate which of the values in weights falls below the threshold Raises: ValueError: if block pooling function is not AVG or MAX """ squeezed_weights = tf.squeeze(weights) if squeezed_weights.get_shape().ndims != 2 or self._block_dim == [ 1, 1 ]: if self._pruning_method == 'threshold': return self._update_mask(weights, threshold) # random_cumulative removes at random taking into account previous # random modification. random_indepent simply removes at random. elif self._pruning_method in [ 'random_independent', 'random_cumulative' ]: return self._update_random_mask(weights, mask) else: raise ValueError('Unknown pruning method: %s' % self._pruning_method) if self._block_pooling_function not in ['AVG', 'MAX']: raise ValueError( 'Unknown pooling function for block sparsity: %s' % self._block_pooling_function) with tf.name_scope(weights.op.name + '_pruning_ops'): abs_weights = tf.abs(squeezed_weights) pool_window = [self._block_dim[0], self._block_dim[1]] pool_fn = pruning_utils.factorized_pool if not self._use_tpu: pool_fn = tf.pool abs_weights = tf.reshape(abs_weights, [ 1, abs_weights.get_shape()[0], abs_weights.get_shape()[1], 1 ]) pooled_weights = pool_fn(abs_weights, window_shape=pool_window, pooling_type=self._block_pooling_function, strides=pool_window, padding='SAME', name=weights.op.name + '_pooled') if pooled_weights.get_shape().ndims != 2: pooled_weights = tf.squeeze(pooled_weights) if self._pruning_method == 'threshold': smoothed_threshold, new_mask = self._update_mask( pooled_weights, threshold) elif self._pruning_method in [ 'random_independent', 'random_cumulative' ]: smoothed_threshold, new_mask = self._update_random_mask( pooled_weights, mask) else: raise ValueError('Unknown pruning method: %s' % self._pruning_method) ## this is the process that updates the mask. updated_mask = pruning_utils.kronecker_product( new_mask, tf.ones(self._block_dim)) sliced_mask = tf.slice(updated_mask, [0, 0], [ squeezed_weights.get_shape()[0], squeezed_weights.get_shape()[1] ]) return smoothed_threshold, tf.reshape(sliced_mask, tf.shape(weights))
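# --- Illustrative sketch (not part of the pruning class) ---
# After pruning at block granularity, the block mask is blown back up to the
# weight shape with a Kronecker product against a block of ones and then
# sliced, because 'SAME' pooling can make the expanded mask slightly larger
# than the weights.  A NumPy sketch with a made-up 5x6 weight and 2x2 blocks:
import numpy as np

weights_shape = (5, 6)
block_dim = (2, 2)
pooled_shape = (int(np.ceil(weights_shape[0] / block_dim[0])),
                int(np.ceil(weights_shape[1] / block_dim[1])))   # (3, 3)

block_mask = (np.random.rand(*pooled_shape) > 0.5).astype(np.float32)
expanded = np.kron(block_mask, np.ones(block_dim, dtype=np.float32))  # (6, 6)
mask = expanded[:weights_shape[0], :weights_shape[1]]                 # (5, 6)
print(mask.shape)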
def __build_graph(self): """ Build TF graph for analogy-making """ self.__sem_target = tf.placeholder( shape=[self._n_slots, self._sem_dim], dtype=tf.float32) self.__struct_target = tf.placeholder( shape=[self._max_arity, self._n_slots, self._n_slots], dtype=tf.float32) self.__sem_base = tf.placeholder(\ shape=[None, self._n_slots, self._sem_dim], dtype=tf.float32) self.__struct_base = tf.placeholder( shape=[None, self._max_arity, self._n_slots, self._n_slots], dtype=tf.float32) #Construct recoding matrix if Path("recode/recode_mat.{}.pickle".format(self._n_slots)).is_file(): #a recoding matrix with given parameters is already # created and serialized, load it print("Loading recoding matix...", end="") sys.stdout.flush() with open("recode/recode_mat.{}.pickle".format(self._n_slots), "rb") as file_h: recode_mat = tf.constant(pickle.load(file_h)) print("Done.") sys.stdout.flush() else: #create recoding matrix and serialize it to a file recode_mat = tf.constant(self.__construct_recode_mat()) with open("recode/recode_mat.{}.pickle".format(self._n_slots), "wb") as file_h: with tf.Session(): pickle.dump(recode_mat.eval(), file_h) #generate all possible states of the semantics of the target sem_targets = tf.reshape( tf.matmul( tf.reshape(recode_mat, [self._n_states * self._n_slots, self._n_slots]), self.__sem_target), [1, self._n_states, self._n_slots * self._sem_dim]) #generate all possible states of the structure of the target struct_targets = tf.reshape( tf.transpose( tf.reshape( tf.concat([ tf.matmul( tf.reshape( tf.matmul( tf.reshape(recode_mat, [ self._n_states * self._n_slots, self._n_slots ]), self.__struct_target[a_i], ), [self._n_states, self._n_slots, self._n_slots ]), tf.transpose(recode_mat, [0, 2, 1]), ) for a_i in range(self._max_arity) ], 0), [ self._max_arity, self._n_states, self._n_slots * self._n_slots ]), [1, 0, 2]), [ 1, self._n_states, self._max_arity * self._n_slots * self._n_slots ]) #computer number of bases n_bases = tf.shape(self.__sem_base)[0] #reshapoe bases sem_base = tf.reshape( tf.tile( tf.reshape(self.__sem_base, [n_bases, self._n_slots * self._sem_dim]), [1, self._n_states]), [n_bases, self._n_states, self._n_slots * self._sem_dim]) struct_base = tf.reshape( tf.tile( tf.reshape( self.__struct_base, [n_bases, self._max_arity * self._n_slots * self._n_slots ]), [1, self._n_states]), [ n_bases, self._n_states, self._max_arity * self._n_slots * self._n_slots ]) #compute semantics denominator for cosine similarity denom_sem = tf.multiply( tf.sqrt( tf.reduce_sum(tf.multiply(sem_targets, sem_targets), axis=[2])), tf.sqrt(tf.reduce_sum(tf.multiply(sem_base, sem_base), axis=[2]))) #compute numerator num_sem = tf.reduce_sum(tf.multiply(sem_targets, sem_base), axis=[2]) #compute cosine similarity sem_cos = -K.losses.cosine_similarity(sem_targets, sem_base, axis=[2]) #tf.add(tf.multiply(tf.divide(num_sem, denom_sem), 0.5), 0.5) print(sem_cos) #compute structure denominator for cosine similarity denom_struct = tf.multiply( tf.sqrt( tf.reduce_sum(tf.multiply(struct_targets, struct_targets), axis=[2])), tf.sqrt( tf.reduce_sum(tf.multiply(struct_base, struct_base), axis=[2]))) #compute numerator num_struct = tf.reduce_sum(tf.multiply(struct_targets, struct_base), axis=[2]) #compute cosine similarity struct_cos = -K.losses.cosine_similarity( struct_targets, struct_base, axis=[2]) #tf.divide(num_struct, denom_struct) similarities = tf.add(tf.multiply(sem_cos, 1 - self._sigma), tf.multiply(struct_cos, self._sigma)) self._sem_cos = sem_cos self._struct_cos = struct_cos #get maximum 
similarity base_max_similarities = tf.reduce_max(similarities, axis=[1]) #get index of base with max similarity self.__best_base_index = tf.argmax(base_max_similarities) #get the index of the recoding which lead to the max similarity best_recoding_no = \ tf.argmax( tf.reshape( tf.slice( similarities, [self.__best_base_index, 0], [1, self._n_states]), [self._n_states])) #maximum similarity value self.__best_base_similarity = tf.reduce_max(base_max_similarities) #best recoding self.__best_recoding = tf.slice(recode_mat, [best_recoding_no, 0, 0], [1, self._n_slots, self._n_slots]) #best recoding of semantics self.__best_target_sem_recoding = tf.reshape( tf.slice(sem_targets, [0, best_recoding_no, 0], [1, 1, self._n_slots * self._sem_dim]), [self._n_slots, self._sem_dim]) #best recoding of structure self.__best_target_struct_recoding = tf.reshape( tf.slice(struct_targets, [0, best_recoding_no, 0], [1, 1, self._max_arity * self._n_slots * self._n_slots]), [self._max_arity, self._n_slots, self._n_slots])
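# --- Illustrative sketch (not part of the original graph) ---
# The selection above blends the two cosine similarities with weight sigma,
# takes the best score per base, and then recovers which recoding produced it.
# A NumPy version with made-up numbers of bases and states:
import numpy as np

sigma = 0.3                           # structure weight (example value)
sem_cos = np.random.rand(4, 10)       # [n_bases, n_states]
struct_cos = np.random.rand(4, 10)

similarities = (1 - sigma) * sem_cos + sigma * struct_cos
base_max_similarities = similarities.max(axis=1)
best_base_index = int(np.argmax(base_max_similarities))
best_recoding_no = int(np.argmax(similarities[best_base_index]))
print(best_base_index, best_recoding_no, base_max_similarities[best_base_index])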
def build(input_reader_config, model_config, lstm_config, unroll_length, data_augmentation_options=None, batch_size=1): """Builds a tensor dictionary based on the InputReader config. Args: input_reader_config: An input_reader_builder.InputReader object. model_config: A model.proto object containing the config for the desired DetectionModel. lstm_config: LSTM specific configs. unroll_length: Unrolled length for LSTM training. data_augmentation_options: A list of tuples, where each tuple contains a data augmentation function and a dictionary containing arguments and their values (see preprocessor.py). batch_size: Batch size for queue outputs. Returns: A dictionary of tensors based on items in the input_reader_config. Raises: ValueError: On invalid input reader proto. ValueError: If no input paths are specified. """ if not isinstance(input_reader_config, input_reader_pb2.InputReader): raise ValueError('input_reader_config not of type ' 'input_reader_pb2.InputReader.') external_reader_config = input_reader_config.external_input_reader external_input_reader_config = external_reader_config.Extensions[ input_reader_google_pb2.GoogleInputReader.google_input_reader] input_reader_type = external_input_reader_config.WhichOneof('input_reader') if input_reader_type == 'tf_record_video_input_reader': config = external_input_reader_config.tf_record_video_input_reader reader_type_class = tf.TFRecordReader else: raise ValueError('Unsupported reader in input_reader_config: %s' % input_reader_type) if not config.input_path: raise ValueError('At least one input path must be specified in ' '`input_reader_config`.') key, value = parallel_reader.parallel_read( config.input_path[:], # Convert `RepeatedScalarContainer` to list. reader_class=reader_type_class, num_epochs=(input_reader_config.num_epochs if input_reader_config.num_epochs else None), num_readers=input_reader_config.num_readers, shuffle=input_reader_config.shuffle, dtypes=[tf.string, tf.string], capacity=input_reader_config.queue_capacity, min_after_dequeue=input_reader_config.min_after_dequeue) # TODO(yinxiao): Add loading instance mask option. decoder = tf_sequence_example_decoder.TFSequenceExampleDecoder() keys_to_decode = [ fields.InputDataFields.image, fields.InputDataFields.groundtruth_boxes, fields.InputDataFields.groundtruth_classes ] tensor_dict = decoder.decode(value, items=keys_to_decode) tensor_dict['image'].set_shape([None, None, None, 3]) tensor_dict['groundtruth_boxes'].set_shape([None, None, 4]) height = model_config.ssd.image_resizer.fixed_shape_resizer.height width = model_config.ssd.image_resizer.fixed_shape_resizer.width # If data augmentation is specified in the config file, the preprocessor # will be called here to augment the data as specified. Most common # augmentations include horizontal flip and cropping. 
if data_augmentation_options: images_pre = tf.split(tensor_dict['image'], config.video_length, axis=0) bboxes_pre = tf.split(tensor_dict['groundtruth_boxes'], config.video_length, axis=0) labels_pre = tf.split(tensor_dict['groundtruth_classes'], config.video_length, axis=0) images_proc, bboxes_proc, labels_proc = [], [], [] cache = preprocessor_cache.PreprocessorCache() for i, _ in enumerate(images_pre): image_dict = { fields.InputDataFields.image: images_pre[i], fields.InputDataFields.groundtruth_boxes: tf.squeeze(bboxes_pre[i], axis=0), fields.InputDataFields.groundtruth_classes: tf.squeeze(labels_pre[i], axis=0), } image_dict = preprocessor.preprocess( image_dict, data_augmentation_options, func_arg_map=preprocessor.get_default_func_arg_map(), preprocess_vars_cache=cache) # Pads detection count to _PADDING_SIZE. image_dict[fields.InputDataFields.groundtruth_boxes] = tf.pad( image_dict[fields.InputDataFields.groundtruth_boxes], [[0, _PADDING_SIZE], [0, 0]]) image_dict[fields.InputDataFields.groundtruth_boxes] = tf.slice( image_dict[fields.InputDataFields.groundtruth_boxes], [0, 0], [_PADDING_SIZE, -1]) image_dict[fields.InputDataFields.groundtruth_classes] = tf.pad( image_dict[fields.InputDataFields.groundtruth_classes], [[0, _PADDING_SIZE]]) image_dict[fields.InputDataFields.groundtruth_classes] = tf.slice( image_dict[fields.InputDataFields.groundtruth_classes], [0], [_PADDING_SIZE]) images_proc.append(image_dict[fields.InputDataFields.image]) bboxes_proc.append( image_dict[fields.InputDataFields.groundtruth_boxes]) labels_proc.append( image_dict[fields.InputDataFields.groundtruth_classes]) tensor_dict['image'] = tf.concat(images_proc, axis=0) tensor_dict['groundtruth_boxes'] = tf.stack(bboxes_proc, axis=0) tensor_dict['groundtruth_classes'] = tf.stack(labels_proc, axis=0) else: # Pads detection count to _PADDING_SIZE per frame. tensor_dict['groundtruth_boxes'] = tf.pad( tensor_dict['groundtruth_boxes'], [[0, 0], [0, _PADDING_SIZE], [0, 0]]) tensor_dict['groundtruth_boxes'] = tf.slice( tensor_dict['groundtruth_boxes'], [0, 0, 0], [-1, _PADDING_SIZE, -1]) tensor_dict['groundtruth_classes'] = tf.pad( tensor_dict['groundtruth_classes'], [[0, 0], [0, _PADDING_SIZE]]) tensor_dict['groundtruth_classes'] = tf.slice( tensor_dict['groundtruth_classes'], [0, 0], [-1, _PADDING_SIZE]) tensor_dict['image'], _ = preprocessor.resize_image(tensor_dict['image'], new_height=height, new_width=width) num_steps = config.video_length / unroll_length init_states = { 'lstm_state_c': tf.zeros([height / 32, width / 32, lstm_config.lstm_state_depth]), 'lstm_state_h': tf.zeros([height / 32, width / 32, lstm_config.lstm_state_depth]), 'lstm_state_step': tf.constant(num_steps, shape=[]), } batch = sqss.batch_sequences_with_states(input_key=key, input_sequences=tensor_dict, input_context={}, input_length=None, initial_states=init_states, num_unroll=unroll_length, batch_size=batch_size, num_threads=batch_size, make_keys_unique=True, capacity=batch_size * batch_size) return _build_training_batch_dict(batch, unroll_length, batch_size)
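# --- Illustrative sketch (not part of the input builder) ---
# Padding then slicing to _PADDING_SIZE gives every frame a fixed number of
# groundtruth rows: short lists are zero-padded, long lists are truncated.
# A NumPy version with a made-up padding size of 5:
import numpy as np

PADDING_SIZE = 5   # stand-in for _PADDING_SIZE

def pad_detections(boxes, padding_size):
    """boxes: [num_detections, 4] -> [padding_size, 4]."""
    padded = np.pad(boxes, [(0, padding_size), (0, 0)], mode='constant')
    return padded[:padding_size, :]

print(pad_detections(np.ones((2, 4)), PADDING_SIZE).shape)   # (5, 4), 3 zero rows
print(pad_detections(np.ones((8, 4)), PADDING_SIZE).shape)   # (5, 4), truncated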
def _parser_fn(serialized_example): """Parses a single tf.Example into image and label tensors.""" features = {} features['image/ct_image'] = tf.FixedLenFeature([], tf.string) features['image/label'] = tf.FixedLenFeature([], tf.string) parsed = tf.parse_single_example(serialized_example, features=features) spatial_dims = [FLAGS.ct_resolution] * 3 if FLAGS.sampled_2d_slices: noise_shape = [FLAGS.ct_resolution] * 2 + [FLAGS.image_c] else: noise_shape = [FLAGS.ct_resolution] * 3 image = tf.decode_raw(parsed['image/ct_image'], tf.float32) label = tf.decode_raw(parsed['image/label'], tf.float32) if dataset_str != 'train': # Preprocess intensity, clip to 0 ~ 1. # The training set is already preprocessed. image = tf.clip_by_value(image / 1024.0 + 0.5, 0, 1) image = tf.reshape(image, spatial_dims) label = tf.reshape(label, spatial_dims) if dataset_str == 'eval' and FLAGS.sampled_2d_slices: return _get_stacked_2d_slices(image, label) if FLAGS.sampled_2d_slices: # Take random slices of images and label begin_idx = tf.random_uniform(shape=[], minval=0, maxval=FLAGS.ct_resolution - FLAGS.image_c + 1, dtype=tf.int32) slice_begin = [0, 0, begin_idx] slice_size = [ FLAGS.ct_resolution, FLAGS.ct_resolution, FLAGS.image_c ] image = tf.slice(image, slice_begin, slice_size) label = tf.slice(label, slice_begin, slice_size) if dataset_str == 'train': for flip_axis in [0, 1, 2]: image, label = data_aug_lib.maybe_flip( image, label, flip_axis) image, label = data_aug_lib.maybe_rot180(image, label, static_axis=2) image = data_aug_lib.intensity_shift( image, label, FLAGS.per_class_intensity_scale, FLAGS.per_class_intensity_shift) image = data_aug_lib.image_corruption( image, label, FLAGS.ct_resolution, FLAGS.image_corrupt_ratio_mean, FLAGS.image_corrupt_ratio_stddev) image = data_aug_lib.maybe_add_noise( image, noise_shape, 1, 4, FLAGS.image_noise_probability, FLAGS.image_noise_ratio) image, label = data_aug_lib.projective_transform( image, label, FLAGS.ct_resolution, FLAGS.image_translate_ratio, FLAGS.image_transform_ratio, FLAGS.sampled_2d_slices) if FLAGS.sampled_2d_slices: # Only get the center slice of label. label = tf.slice(label, [0, 0, FLAGS.image_c // 2], [FLAGS.ct_resolution, FLAGS.ct_resolution, 1]) spatial_dims_w_blocks = [ FLAGS.image_nx_block, FLAGS.ct_resolution // FLAGS.image_nx_block, FLAGS.image_ny_block, FLAGS.ct_resolution // FLAGS.image_ny_block ] if not FLAGS.sampled_2d_slices: spatial_dims_w_blocks += [FLAGS.ct_resolution] image = tf.reshape(image, spatial_dims_w_blocks + [FLAGS.image_c]) label = tf.reshape(label, spatial_dims_w_blocks) label = tf.cast(label, tf.int32) label = tf.one_hot(label, FLAGS.label_c) data_dtype = tf.as_dtype(FLAGS.mtf_dtype) image = tf.cast(image, data_dtype) label = tf.cast(label, data_dtype) return image, label
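# --- Illustrative sketch (not part of the parser) ---
# With sampled_2d_slices enabled, training takes a random slab of image_c
# consecutive axial slices from the volume and keeps only the centre slice of
# the label, as above.  A NumPy version with a made-up resolution of 64 and
# image_c of 3:
import numpy as np

ct_resolution, image_c = 64, 3
image = np.random.rand(ct_resolution, ct_resolution, ct_resolution)
label = np.random.randint(0, 2, size=image.shape)

begin_idx = np.random.randint(0, ct_resolution - image_c + 1)
image_slab = image[:, :, begin_idx:begin_idx + image_c]          # [64, 64, 3]
label_center = label[:, :, begin_idx + image_c // 2][..., None]  # [64, 64, 1]
print(image_slab.shape, label_center.shape)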
def main(): if FLAGS.datasource == 'sinusoid': if FLAGS.train: test_num_updates = 1 else: test_num_updates = 10 else: if FLAGS.datasource == 'miniimagenet': if FLAGS.train: test_num_updates = 1 # eval on at least one update during training else: test_num_updates = 10 else: test_num_updates = 10 if not FLAGS.train: orig_meta_batch_size = FLAGS.meta_batch_size # always use meta batch size of 1 when testing. FLAGS.meta_batch_size = 1 if FLAGS.datasource == 'sinusoid': data_generator = DataGenerator(FLAGS.update_batch_size * 2, FLAGS.meta_batch_size) else: if FLAGS.metatrain_iterations == 0 and FLAGS.datasource == 'miniimagenet': assert FLAGS.meta_batch_size == 1 assert FLAGS.update_batch_size == 1 data_generator = DataGenerator( 1, FLAGS.meta_batch_size) # only use one datapoint, else: if FLAGS.datasource == 'miniimagenet': # TODO - use 15 val examples for imagenet? if FLAGS.train: data_generator = DataGenerator( FLAGS.update_batch_size + 15, FLAGS.meta_batch_size ) # only use one datapoint for testing to save memory else: data_generator = DataGenerator( FLAGS.update_batch_size * 2, FLAGS.meta_batch_size ) # only use one datapoint for testing to save memory else: data_generator = DataGenerator( FLAGS.update_batch_size * 2, FLAGS.meta_batch_size ) # only use one datapoint for testing to save memory dim_output = data_generator.dim_output if FLAGS.baseline == 'oracle': assert FLAGS.datasource == 'sinusoid' dim_input = 3 FLAGS.pretrain_iterations += FLAGS.metatrain_iterations FLAGS.metatrain_iterations = 0 else: dim_input = data_generator.dim_input if FLAGS.datasource == 'miniimagenet' or FLAGS.datasource == 'omniglot': tf_data_load = True num_classes = data_generator.num_classes if FLAGS.train: # only construct training model if needed random.seed(5) image_tensor, label_tensor = data_generator.make_data_tensor() inputa = tf.slice(image_tensor, [0, 0, 0], [-1, num_classes * FLAGS.update_batch_size, -1]) inputb = tf.slice(image_tensor, [0, num_classes * FLAGS.update_batch_size, 0], [-1, -1, -1]) labela = tf.slice(label_tensor, [0, 0, 0], [-1, num_classes * FLAGS.update_batch_size, -1]) labelb = tf.slice(label_tensor, [0, num_classes * FLAGS.update_batch_size, 0], [-1, -1, -1]) input_tensors = { 'inputa': inputa, 'inputb': inputb, 'labela': labela, 'labelb': labelb } print("inputa shape", inputa.shape) random.seed(6) image_tensor, label_tensor = data_generator.make_data_tensor( train=False) inputa = tf.slice(image_tensor, [0, 0, 0], [-1, num_classes * FLAGS.update_batch_size, -1]) inputb = tf.slice(image_tensor, [0, num_classes * FLAGS.update_batch_size, 0], [-1, -1, -1]) labela = tf.slice(label_tensor, [0, 0, 0], [-1, num_classes * FLAGS.update_batch_size, -1]) labelb = tf.slice(label_tensor, [0, num_classes * FLAGS.update_batch_size, 0], [-1, -1, -1]) metaval_input_tensors = { 'inputa': inputa, 'inputb': inputb, 'labela': labela, 'labelb': labelb } else: tf_data_load = False input_tensors = None model = MAML(dim_input, dim_output, test_num_updates=test_num_updates) if FLAGS.train or not tf_data_load: model.construct_model(input_tensors=input_tensors, prefix='metatrain_') if tf_data_load: model.construct_model(input_tensors=metaval_input_tensors, prefix='metaval_') model.summ_op = tf.summary.merge_all() saver = loader = tf.train.Saver(tf.get_collection( tf.GraphKeys.TRAINABLE_VARIABLES), max_to_keep=10) sess = tf.InteractiveSession() if not FLAGS.train: # change to original meta batch size when loading model. 
FLAGS.meta_batch_size = orig_meta_batch_size if FLAGS.train_update_batch_size == -1: FLAGS.train_update_batch_size = FLAGS.update_batch_size if FLAGS.train_update_lr == -1: FLAGS.train_update_lr = FLAGS.update_lr exp_string = 'cls_' + str(FLAGS.num_classes) + '.mbs_' + str( FLAGS.meta_batch_size) + '.ubs_' + str( FLAGS.train_update_batch_size) + '.numstep' + str( FLAGS.num_updates) + '.updatelr' + str(FLAGS.train_update_lr) if FLAGS.num_filters != 64: exp_string += 'hidden' + str(FLAGS.num_filters) if FLAGS.max_pool: exp_string += 'maxpool' if FLAGS.stop_grad: exp_string += 'stopgrad' if FLAGS.baseline: exp_string += FLAGS.baseline if FLAGS.norm == 'batch_norm': exp_string += 'batchnorm' elif FLAGS.norm == 'layer_norm': exp_string += 'layernorm' elif FLAGS.norm == 'None': exp_string += 'nonorm' else: print('Norm setting not recognized.') resume_itr = 0 model_file = None tf.global_variables_initializer().run() tf.train.start_queue_runners() if not FLAGS.rand_init: if FLAGS.resume or not FLAGS.train: model_file = tf.train.latest_checkpoint(FLAGS.logdir + '/' + exp_string) if FLAGS.test_iter > 0: model_file = model_file[:model_file.index('model' )] + 'model' + str( FLAGS.test_iter) if model_file: ind1 = model_file.index('model') resume_itr = int(model_file[ind1 + 5:]) print("Restoring model weights from " + model_file) saver.restore(sess, model_file) if FLAGS.train: train(model, saver, sess, exp_string, data_generator, resume_itr) else: test(model, saver, sess, exp_string, data_generator, test_num_updates)
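# The tf.slice calls in main() above split each task's examples into support ("a")
# and query ("b") sets along the example axis. The sketch below shows that split in
# isolation; the shapes (meta_batch=4, num_classes=5, update_batch_size=1, dim=784)
# are illustrative assumptions only.
import tensorflow as tf

meta_batch, num_classes, update_batch_size, dim = 4, 5, 1, 784
num_support = num_classes * update_batch_size

with tf.Graph().as_default():
  image_tensor = tf.random_uniform([meta_batch, 2 * num_support, dim])
  # First num_support examples of every task form the support set ...
  inputa = tf.slice(image_tensor, [0, 0, 0], [-1, num_support, -1])
  # ... and the remainder (size -1 means "to the end") forms the query set.
  inputb = tf.slice(image_tensor, [0, num_support, 0], [-1, -1, -1])
  print(inputa.shape, inputb.shape)  # (4, 5, 784) (4, 5, 784)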
def _run(): """Forward pass through the network.""" with slim.arg_scope([slim.dropout], is_training=is_training): with slim.arg_scope( [slim.conv2d, slim.fully_connected], weights_initializer=tf.truncated_normal_initializer(stddev=0.01), weights_regularizer=slim.l2_regularizer(self._l2_regularization), activation_fn=tf.nn.relu, trainable=is_training): with slim.arg_scope( [slim.conv2d, slim.max_pool2d], stride=1, padding='SAME'): with slim.arg_scope( [slim.conv2d, slim.fully_connected], normalizer_fn=slim.batch_norm, normalizer_params=batch_norm): _, grasp_image = images net = slim.conv2d( grasp_image, 64, [6, 6], stride=2, scope='conv1_1', activation_fn=None, normalizer_fn=None, normalizer_params=None) # Old checkpoints (such as those used for tests) did not have # scaling on the separate batch norm operations (those not # associated with a conv operation), so only setting the scale # parameter in arg_scope would break the tests. We set scale= # False for these separate batch norm operations temporarily. # However, future users are encouraged to not set scale=False so # that barch_norm parameters are consistent through the whole # network. net = tf.nn.relu(slim.batch_norm(net, scale=False)) net = slim.max_pool2d(net, [3, 3], stride=3, scope='pool1') self.activation_layers.append(net) for l in range(2, 2 + self.num_convs[0]): net = slim.conv2d(net, 64, [5, 5], scope='conv%d' % l) self.activation_layers.append(net) net = slim.max_pool2d(net, [3, 3], stride=3, scope='pool2') end_points['pool2'] = net self.activation_layers.append(net) logging.debug('pool2') logging.debug(net.get_shape()) if grasp_param_names is None: grasp_param_blocks = [grasp_params] grasp_param_block_names = ['fcgrasp'] else: grasp_param_blocks = [] grasp_param_block_names = [] # Note: Creating variables must happen in a deterministic # order, otherwise some workers will look for variables on the # wrong parameter servers, so we sort the grasp_param_names # here. for block_name in sorted(grasp_param_names): offset, size = grasp_param_names[block_name] grasp_param_blocks += [ tf.slice(grasp_params, [0, offset], [-1, size]) ] grasp_param_block_names += [block_name] grasp_param_tensors = [] for block, name in zip(grasp_param_blocks, grasp_param_block_names): grasp_param_tensors += [ slim.fully_connected( block, 256, scope=name, activation_fn=None, normalizer_fn=None, normalizer_params=None) ] fcgrasp = tf.add_n(grasp_param_tensors) # Old checkpoints (such as those used for tests) did not have # scaling on the separate batch norm operations (those not # associated with a conv operation), so only setting the scale # parameter in arg_scope would break the tests. We set scale= # False for these separate batch norm operations temporarily. # However, future users are encouraged to not set scale=False so # that barch_norm parameters are consistent through the whole # network. fcgrasp = tf.nn.relu(slim.batch_norm(fcgrasp, scale=False)) fcgrasp = slim.fully_connected(fcgrasp, 64, scope='fcgrasp2') context = tf.reshape(fcgrasp, [-1, 1, 1, 64]) end_points['fcgrasp'] = fcgrasp # Tile the image embedding action_batch_size times to align # with the expanded action dimension of action_batch_size. # Same image is used with all the actions in a action_batch. 
# net pre expansion should be [batch, *, *, *] # net post expansion should be [batch x action_batch, *, *, *] if tile_batch: net = contrib_seq2seq.tile_batch(net, self._action_batch_size) net = tf.add(net, context) logging.debug('net post add %s', net) end_points['vsum'] = net self.activation_layers.append(net) logging.debug('vsum') logging.debug(net.get_shape()) for l in range(2 + sum(self.num_convs[:1]), 2 + sum(self.num_convs[:2])): net = slim.conv2d(net, 64, [3, 3], scope='conv%d' % l) logging.debug('conv%d', l) self.activation_layers.append(net) logging.debug(net.get_shape()) net = slim.max_pool2d(net, [2, 2], stride=2, scope='pool3') logging.debug('pool3') logging.debug(net.get_shape()) self.activation_layers.append(net) for l in range(2 + sum(self.num_convs[:2]), 2 + sum(self.num_convs[:3])): net = slim.conv2d( net, 64, [3, 3], scope='conv%d' % l, padding='VALID') self.activation_layers.append(net) logging.debug('final conv') logging.debug(net.get_shape()) end_points['final_conv'] = net batch_size = tf.shape(net)[0] if goal_spatial_fn is not None: goal_spatial = goal_spatial_fn() # Tile goal to match net batch size (e.g. CEM). goal_batch_size = tf.shape(goal_spatial)[0] goal_spatial = tf.tile( goal_spatial, [batch_size//goal_batch_size, 1, 1, 1]) # Merging features in style of Fang 2017. net = tf.concat([net, goal_spatial], axis=3) net = slim.flatten(net, scope='flatten') if goal_vector_fn is not None: goal_vector = goal_vector_fn() goal_batch_size = tf.shape(goal_vector)[0] goal_vector = tf.tile( goal_vector, [batch_size//goal_batch_size, 1]) net = tf.concat([net, goal_vector], axis=1) for l in range(self.hid_layers): net = slim.fully_connected(net, 64, scope='fc%d' % l) name = 'logit' if num_classes > 1: name = 'logit_%d' % num_classes logits = slim.fully_connected( net, num_classes, activation_fn=None, scope=name, normalizer_fn=None, normalizer_params=None) end_points['logits'] = logits if softmax: predictions = tf.nn.softmax(logits) else: predictions = tf.nn.sigmoid(logits) if tile_batch: if num_classes > 1: predictions = tf.reshape( predictions, [-1, self._action_batch_size, num_classes]) else: predictions = tf.reshape(predictions, [-1, self._action_batch_size]) end_points['predictions'] = predictions return logits, end_points
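# A sketch of the named-block slicing used for grasp_params in _run(): each block is
# cut from the flat parameter vector by its (offset, size) entry, iterating names in
# sorted order so creation stays deterministic. The layout below is invented for
# illustration and is not the real grasp parameterization.
import tensorflow as tf

param_layout = {'pose': (0, 7), 'gripper': (7, 1)}  # name -> (offset, size)

with tf.Graph().as_default():
  grasp_params = tf.random_uniform([32, 8])  # [batch, total_param_size]
  blocks = {}
  for name in sorted(param_layout):  # sorted for deterministic ordering
    offset, size = param_layout[name]
    blocks[name] = tf.slice(grasp_params, [0, offset], [-1, size])
  print({k: v.shape.as_list() for k, v in blocks.items()})
  # {'gripper': [32, 1], 'pose': [32, 7]}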
def embedding_postprocessor( input_tensor, use_token_type=False, token_type_ids=None, token_type_vocab_size=16, token_type_embedding_name='token_type_embeddings', use_position_embeddings=True, position_embedding_name='position_embeddings', initializer_range=0.02, max_position_embeddings=512, dropout_prob=0.1, ): """Performs various post-processing on a word embedding tensor. Args: input_tensor: float Tensor of shape [batch_size, seq_length, embedding_size]. use_token_type: bool. Whether to add embeddings for `token_type_ids`. token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length]. Must be specified if `use_token_type` is True. token_type_vocab_size: int. The vocabulary size of `token_type_ids`. token_type_embedding_name: string. The name of the embedding table variable for token type ids. use_position_embeddings: bool. Whether to add position embeddings for the position of each token in the sequence. position_embedding_name: string. The name of the embedding table variable for positional embeddings. initializer_range: float. Range of the weight initialization. max_position_embeddings: int. Maximum sequence length that might ever be used with this model. This can be longer than the sequence length of input_tensor, but cannot be shorter. dropout_prob: float. Dropout probability applied to the final output tensor. Returns: float tensor with same shape as `input_tensor`. Raises: ValueError: One of the tensor shapes or input values is invalid. """ input_shape = get_shape_list(input_tensor, expected_rank=3) batch_size = input_shape[0] seq_length = input_shape[1] width = input_shape[2] output = input_tensor if use_token_type: if token_type_ids is None: raise ValueError('`token_type_ids` must be specified if' '`use_token_type` is True.') token_type_table = tf.get_variable( name=token_type_embedding_name, shape=[token_type_vocab_size, width], initializer=create_initializer(initializer_range), ) # This vocab will be small so we always do one-hot here, since it is always # faster for a small vocabulary. flat_token_type_ids = tf.reshape(token_type_ids, [-1]) one_hot_ids = tf.one_hot(flat_token_type_ids, depth=token_type_vocab_size) token_type_embeddings = tf.matmul(one_hot_ids, token_type_table) token_type_embeddings = tf.reshape(token_type_embeddings, [batch_size, seq_length, width]) output += token_type_embeddings if use_position_embeddings: assert_op = tf.assert_less_equal(seq_length, max_position_embeddings) with tf.control_dependencies([assert_op]): full_position_embeddings = tf.get_variable( name=position_embedding_name, shape=[max_position_embeddings, width], initializer=create_initializer(initializer_range), ) # Since the position embedding table is a learned variable, we create it # using a (long) sequence length `max_position_embeddings`. The actual # sequence length might be shorter than this, for faster training of # tasks that do not have long sequences. # # So `full_position_embeddings` is effectively an embedding table # for position [0, 1, 2, ..., max_position_embeddings-1], and the current # sequence has positions [0, 1, 2, ... seq_length-1], so we can just # perform a slice. position_embeddings = tf.slice(full_position_embeddings, [0, 0], [seq_length, -1]) num_dims = len(output.shape.as_list()) # Only the last two dimensions are relevant (`seq_length` and `width`), so # we broadcast among the first dimensions, which is typically just # the batch size. 
      position_broadcast_shape = []
      for _ in range(num_dims - 2):
        position_broadcast_shape.append(1)
      position_broadcast_shape.extend([seq_length, width])
      position_embeddings = tf.reshape(position_embeddings,
                                       position_broadcast_shape)
      output += position_embeddings

  output = layer_norm_and_dropout(output, dropout_prob)
  return output
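# A compact sketch of the position-embedding handling above: only the first
# seq_length rows of the learned table are used, and a leading 1 in the reshape
# lets them broadcast over the batch dimension. Sizes are illustrative.
import tensorflow as tf

max_position_embeddings, seq_length, width = 512, 128, 64  # assumed sizes

with tf.Graph().as_default():
  full_table = tf.get_variable(
      'position_embeddings', shape=[max_position_embeddings, width],
      initializer=tf.truncated_normal_initializer(stddev=0.02))
  # Keep only the rows needed for the current sequence length.
  position_embeddings = tf.slice(full_table, [0, 0], [seq_length, -1])
  # The leading 1 broadcasts across the batch dimension on addition.
  position_embeddings = tf.reshape(position_embeddings, [1, seq_length, width])
  word_embeddings = tf.zeros([8, seq_length, width])  # stand-in input
  output = word_embeddings + position_embeddings
  print(output.shape)  # (8, 128, 64)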
def get(self): """ Provides input data to the graph. """ # calculate size of each record (this lists what is contained in the db and how many bytes are occupied) record_bytes = 0 encoding_bytes = 4 kp_xyz_entries = 3 * self.num_kp record_bytes += encoding_bytes*kp_xyz_entries encoding_bytes = 4 kp_uv_entries = 2 * self.num_kp record_bytes += encoding_bytes*kp_uv_entries kp_vis_entries = self.num_kp record_bytes += encoding_bytes*kp_vis_entries image_bytes = self.image_size[0] * self.image_size[1] * 3 record_bytes += image_bytes """ READ DATA ITEMS""" # Start reader reader = tf.FixedLengthRecordReader(header_bytes=0, record_bytes=record_bytes) _, value = reader.read(tf.train.string_input_producer([self.path_to_db])) # decode to floats bytes_read = 0 data_dict = dict() record_bytes_float32 = tf.decode_raw(value, tf.float32) # 1. Read keypoint xyz keypoint_xyz21 = tf.reshape(tf.slice(record_bytes_float32, [bytes_read//4], [kp_xyz_entries]), [self.num_kp, 3]) bytes_read += encoding_bytes*kp_xyz_entries keypoint_xyz21 /= 1000.0 # scale to meters keypoint_xyz21 = self.convert_kp(keypoint_xyz21) # calculate wrist coord if self.use_wrist_coord: wrist_xyz = keypoint_xyz21[16, :] + 2.0*(keypoint_xyz21[0, :] - keypoint_xyz21[16, :]) keypoint_xyz21 = tf.concat([tf.expand_dims(wrist_xyz, 0), keypoint_xyz21[1:, :]], 0) data_dict['keypoint_xyz21'] = keypoint_xyz21 # 2. Read keypoint uv AND VIS keypoint_uv_vis21 = tf.reshape(tf.slice(record_bytes_float32, [bytes_read//4], [kp_uv_entries+kp_vis_entries]), [self.num_kp, 3]) bytes_read += encoding_bytes*(kp_uv_entries+kp_vis_entries) keypoint_uv_vis21 = self.convert_kp(keypoint_uv_vis21) keypoint_uv21 = keypoint_uv_vis21[:, :2] keypoint_vis21 = tf.equal(keypoint_uv_vis21[:, 2], 1.0) # calculate wrist vis if self.use_wrist_coord: wrist_vis = tf.logical_or(keypoint_vis21[16], keypoint_vis21[0]) keypoint_vis21 = tf.concat([tf.expand_dims(wrist_vis, 0), keypoint_vis21[1:]], 0) wrist_uv = keypoint_uv21[16, :] + 2.0*(keypoint_uv21[0, :] - keypoint_uv21[16, :]) keypoint_uv21 = tf.concat([tf.expand_dims(wrist_uv, 0), keypoint_uv21[1:, :]], 0) data_dict['keypoint_vis21'] = keypoint_vis21 if self.coord_uv_noise: noise = tf.truncated_normal([42, 2], mean=0.0, stddev=self.coord_uv_noise_sigma) keypoint_uv21 += noise data_dict['keypoint_uv21'] = keypoint_uv21 # decode to uint8 record_bytes_uint8 = tf.decode_raw(value, tf.uint8) # 4. Read image image = tf.reshape(tf.slice(record_bytes_uint8, [bytes_read], [image_bytes]), [self.image_size[0], self.image_size[1], 3]) image = tf.cast(image, tf.float32) bytes_read += image_bytes # subtract mean image = image / 255.0 - 0.5 if self.hue_aug: image = tf.image.random_hue(image, self.hue_aug_max) data_dict['image'] = image """ CONSTANTS """ # Camera intrinsics sx = 822.79041 sy = 822.79041 tx = 318.47345 ty = 250.31296 data_dict['cam_mat'] = tf.constant([[sx, 0.0, tx], [0.0, sy, ty], [0.0, 0.0, 1.0]]) # Hand side: this dataset only contains left hands data_dict['hand_side'] = tf.one_hot(tf.constant(0, dtype=tf.int32), depth=2, on_value=1.0, off_value=0.0, dtype=tf.float32) assert bytes_read == record_bytes, "Doesnt add up." """ DEPENDENT DATA ITEMS: XYZ represenations. 
""" # make coords relative to root joint kp_coord_xyz_root = keypoint_xyz21[0, :] # this is the palm coord kp_coord_xyz21_rel = keypoint_xyz21 - kp_coord_xyz_root # relative coords in metric coords index_root_bone_length = tf.sqrt(tf.reduce_sum(tf.square(kp_coord_xyz21_rel[12, :] - kp_coord_xyz21_rel[11, :]))) data_dict['keypoint_scale'] = index_root_bone_length data_dict['keypoint_xyz21_normed'] = kp_coord_xyz21_rel / index_root_bone_length # normalized by length of 12->11 # calculate local coordinates kp_coord_xyz21_local = bone_rel_trafo(data_dict['keypoint_xyz21_normed']) kp_coord_xyz21_local = tf.squeeze(kp_coord_xyz21_local) data_dict['keypoint_xyz21_local'] = kp_coord_xyz21_local # calculate viewpoint and coords in canonical coordinates kp_coord_xyz21_rel_can, rot_mat = canonical_trafo(data_dict['keypoint_xyz21_normed']) kp_coord_xyz21_rel_can, rot_mat = tf.squeeze(kp_coord_xyz21_rel_can), tf.squeeze(rot_mat) data_dict['keypoint_xyz21_can'] = kp_coord_xyz21_rel_can data_dict['rot_mat'] = tf.matrix_inverse(rot_mat) """ DEPENDENT DATA ITEMS: HAND CROP """ if self.hand_crop: crop_center = keypoint_uv21[12, ::-1] # catch problem, when no valid kp available (happens almost never) crop_center = tf.cond(tf.reduce_all(tf.is_finite(crop_center)), lambda: crop_center, lambda: tf.constant([0.0, 0.0])) crop_center.set_shape([2, ]) if self.crop_center_noise: noise = tf.truncated_normal([2], mean=0.0, stddev=self.crop_center_noise_sigma) crop_center += noise crop_scale_noise = tf.constant(1.0) if self.crop_scale_noise: crop_scale_noise = tf.squeeze(tf.random_uniform([1], minval=1.0, maxval=1.2)) if not self.use_wrist_coord: wrist_uv = keypoint_uv21[16, :] + 2.0*(keypoint_uv21[0, :] - keypoint_uv21[16, :]) keypoint_uv21 = tf.concat([tf.expand_dims(wrist_uv, 0), keypoint_uv21[1:, :]], 0) # select visible coords only kp_coord_h = tf.boolean_mask(keypoint_uv21[:, 1], keypoint_vis21) kp_coord_w = tf.boolean_mask(keypoint_uv21[:, 0], keypoint_vis21) kp_coord_hw = tf.stack([kp_coord_h, kp_coord_w], 1) # determine size of crop (measure spatial extend of hw coords first) min_coord = tf.maximum(tf.reduce_min(kp_coord_hw, 0), 0.0) max_coord = tf.minimum(tf.reduce_max(kp_coord_hw, 0), self.image_size) # find out larger distance wrt the center of crop crop_size_best = 2*tf.maximum(max_coord - crop_center, crop_center - min_coord) crop_size_best = tf.reduce_max(crop_size_best) crop_size_best = tf.minimum(tf.maximum(crop_size_best, 50.0), 500.0) # catch problem, when no valid kp available crop_size_best = tf.cond(tf.reduce_all(tf.is_finite(crop_size_best)), lambda: crop_size_best, lambda: tf.constant(200.0)) crop_size_best.set_shape([]) # calculate necessary scaling scale = tf.cast(self.crop_size, tf.float32) / crop_size_best scale = tf.minimum(tf.maximum(scale, 1.0), 10.0) scale *= crop_scale_noise data_dict['crop_scale'] = scale if self.crop_offset_noise: noise = tf.truncated_normal([2], mean=0.0, stddev=self.crop_offset_noise_sigma) crop_center += noise # Crop image img_crop = crop_image_from_xy(tf.expand_dims(image, 0), crop_center, self.crop_size, scale) data_dict['image_crop'] = tf.squeeze(img_crop) # Modify uv21 coordinates crop_center_float = tf.cast(crop_center, tf.float32) keypoint_uv21_u = (data_dict['keypoint_uv21'][:, 0] - crop_center_float[1]) * scale + self.crop_size // 2 keypoint_uv21_v = (data_dict['keypoint_uv21'][:, 1] - crop_center_float[0]) * scale + self.crop_size // 2 keypoint_uv21 = tf.stack([keypoint_uv21_u, keypoint_uv21_v], 1) data_dict['keypoint_uv21'] = keypoint_uv21 # Modify camera 
intrinsics scale = tf.reshape(scale, [1, ]) scale_matrix = tf.dynamic_stitch([[0], [1], [2], [3], [4], [5], [6], [7], [8]], [scale, [0.0], [0.0], [0.0], scale, [0.0], [0.0], [0.0], [1.0]]) scale_matrix = tf.reshape(scale_matrix, [3, 3]) crop_center_float = tf.cast(crop_center, tf.float32) trans1 = crop_center_float[0] * scale - self.crop_size // 2 trans2 = crop_center_float[1] * scale - self.crop_size // 2 trans1 = tf.reshape(trans1, [1, ]) trans2 = tf.reshape(trans2, [1, ]) trans_matrix = tf.dynamic_stitch([[0], [1], [2], [3], [4], [5], [6], [7], [8]], [[1.0], [0.0], -trans2, [0.0], [1.0], -trans1, [0.0], [0.0], [1.0]]) trans_matrix = tf.reshape(trans_matrix, [3, 3]) data_dict['cam_mat'] = tf.matmul(trans_matrix, tf.matmul(scale_matrix, data_dict['cam_mat'])) """ DEPENDENT DATA ITEMS: Scoremap from the SUBSET of 21 keypoints""" # create scoremaps from the subset of 2D annoataion keypoint_hw21 = tf.stack([keypoint_uv21[:, 1], keypoint_uv21[:, 0]], -1) scoremap_size = self.image_size if self.hand_crop: scoremap_size = (self.crop_size, self.crop_size) scoremap = self.create_multiple_gaussian_map(keypoint_hw21, scoremap_size, self.sigma, valid_vec=keypoint_vis21) if self.scoremap_dropout: scoremap = tf.nn.dropout(scoremap, self.scoremap_dropout_prob, noise_shape=[1, 1, 21]) scoremap *= self.scoremap_dropout_prob data_dict['scoremap'] = scoremap if self.random_crop_to_size: tensor_stack = tf.concat([data_dict['image'], tf.expand_dims(tf.cast(data_dict['hand_parts'], tf.float32), -1), tf.cast(data_dict['hand_mask'], tf.float32)], 2) s = tensor_stack.get_shape().as_list() tensor_stack_cropped = tf.random_crop(tensor_stack, [self.random_crop_size, self.random_crop_size, s[2]]) data_dict = dict() # delete everything else because the random cropping makes the data invalid anyway data_dict['image'], data_dict['hand_parts'], data_dict['hand_mask'] = tensor_stack_cropped[:, :, :3],\ tf.cast(tensor_stack_cropped[:, :, 3], tf.int32),\ tf.cast(tensor_stack_cropped[:, :, 4:], tf.int32) names, tensors = zip(*data_dict.items()) if self.shuffle: tensors = tf.train.shuffle_batch_join([tensors], batch_size=self.batch_size, capacity=100, min_after_dequeue=50, enqueue_many=False) else: tensors = tf.train.batch_join([tensors], batch_size=self.batch_size, capacity=100, enqueue_many=False) return dict(zip(names, tensors))
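# A sketch of the offset-based record decoding in get(): the raw record is decoded
# once with tf.decode_raw, and each field is cut out with tf.slice at a running
# element offset, then reshaped. num_kp and the zero-filled record below stand in
# for real data from the FixedLengthRecordReader.
import tensorflow as tf

num_kp = 21
kp_xyz_entries = 3 * num_kp
kp_uv_vis_entries = 3 * num_kp
total_floats = kp_xyz_entries + kp_uv_vis_entries

with tf.Graph().as_default():
  # Stand-in for the serialized record bytes (4 bytes per float32).
  raw = tf.constant(b'\x00' * (4 * total_floats))
  floats = tf.decode_raw(raw, tf.float32)
  offset = 0
  keypoint_xyz = tf.reshape(
      tf.slice(floats, [offset], [kp_xyz_entries]), [num_kp, 3])
  offset += kp_xyz_entries
  keypoint_uv_vis = tf.reshape(
      tf.slice(floats, [offset], [kp_uv_vis_entries]), [num_kp, 3])
  with tf.Session() as sess:
    xyz, uv_vis = sess.run([keypoint_xyz, keypoint_uv_vis])
    print(xyz.shape, uv_vis.shape)  # (21, 3) (21, 3)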