Example #1
 def body(index, summation):
     processed = tf.slice(stacked_tensor, [0, index - 1, 0], [-1, -1, -1])
     summand = tf.reduce_mean(processed, 1)
     return tf.subtract(index, 1), tf.add(summation, summand)
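
For context, a minimal driver for this loop body might look like the following sketch. It is not part of the original example; the tensor shapes, the cond function, and the initial loop variables are assumptions.

import tensorflow.compat.v1 as tf

# Assumed setup: `stacked_tensor` has shape [batch, time, features] and is
# captured by `body` as a closure variable, exactly as in the snippet above.
stacked_tensor = tf.random_uniform([2, 5, 3])

def body(index, summation):
    # Slice from time step `index - 1` to the end, then average over time.
    suffix = tf.slice(stacked_tensor, [0, index - 1, 0], [-1, -1, -1])
    summand = tf.reduce_mean(suffix, 1)
    return tf.subtract(index, 1), tf.add(summation, summand)

def cond(index, summation):
    return tf.greater(index, 0)

index0 = tf.constant(5)        # walk backwards over all 5 time steps
summation0 = tf.zeros([2, 3])  # running sum of suffix means, one row per batch item
_, total = tf.while_loop(cond, body, [index0, summation0])
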
Example #2
  def get_baseline_batch(self, hparams):
    """Get the Tensor expressions from the reader.

    Args:
      hparams: Hyperparameters object with specgram parameters.

    Returns:
      A dict of key:tensor pairs. This includes "pitch", "velocity", "audio",
      "qualities", "instrument_source", "instrument_family", and "spectrogram".
    """
    example = self.get_example(hparams.batch_size)
    audio = tf.slice(example["audio"], [0], [64000])
    audio = tf.reshape(audio, [1, 64000])
    pitch = tf.slice(example["pitch"], [0], [1])
    velocity = tf.slice(example["velocity"], [0], [1])
    instrument_source = tf.slice(example["instrument_source"], [0], [1])
    instrument_family = tf.slice(example["instrument_family"], [0], [1])
    qualities = tf.slice(example["qualities"], [0], [10])
    qualities = tf.reshape(qualities, [1, 10])

    # Get Specgrams
    hop_length = hparams.hop_length
    n_fft = hparams.n_fft
    if hop_length and n_fft:
      specgram = utils.tf_specgram(
          audio,
          n_fft=n_fft,
          hop_length=hop_length,
          mask=hparams.mask,
          log_mag=hparams.log_mag,
          re_im=hparams.re_im,
          dphase=hparams.dphase,
          mag_only=hparams.mag_only)
      shape = [1] + SPECGRAM_REGISTRY[(n_fft, hop_length)]
      if hparams.mag_only:
        shape[-1] = 1
      specgram = tf.reshape(specgram, shape)
      tf.logging.info("SPECGRAM BEFORE PADDING", specgram)

      if hparams.pad:
        # Pad and crop specgram to 256x256
        num_padding = 2**int(np.ceil(np.log(shape[2]) / np.log(2))) - shape[2]
        tf.logging.info("num_pading: %d" % num_padding)
        specgram = tf.reshape(specgram, shape)
        specgram = tf.pad(specgram, [[0, 0], [0, 0], [0, num_padding], [0, 0]])
        specgram = tf.slice(specgram, [0, 0, 0, 0], [-1, shape[1] - 1, -1, -1])
        tf.logging.info("SPECGRAM AFTER PADDING", specgram)

    # Form a Batch
    if self.is_training:
      (audio, velocity, pitch, specgram,
       instrument_source, instrument_family,
       qualities) = tf.train.shuffle_batch(
           [
               audio, velocity, pitch, specgram,
               instrument_source, instrument_family, qualities
           ],
           batch_size=hparams.batch_size,
           capacity=20 * hparams.batch_size,
           min_after_dequeue=10 * hparams.batch_size,
           enqueue_many=True)
    elif hparams.batch_size > 1:
      (audio, velocity, pitch, specgram,
       instrument_source, instrument_family, qualities) = tf.train.batch(
           [
               audio, velocity, pitch, specgram,
               instrument_source, instrument_family, qualities
           ],
           batch_size=hparams.batch_size,
           capacity=10 * hparams.batch_size,
           enqueue_many=True)

    audio.set_shape([hparams.batch_size, 64000])

    batch = dict(
        pitch=pitch,
        velocity=velocity,
        audio=audio,
        instrument_source=instrument_source,
        instrument_family=instrument_family,
        qualities=qualities,
        spectrogram=specgram)

    return batch
Example #3
def main(_):
  tf.logging.set_verbosity(tf.logging.INFO)

  with tf.Graph().as_default():
    # Loads content images.
    eval_content_inputs_, _ = image_utils.imagenet_inputs(
        FLAGS.batch_size, FLAGS.image_size)

    # Process style and content weight flags.
    content_weights = ast.literal_eval(FLAGS.content_weights)
    style_weights = ast.literal_eval(FLAGS.style_weights)

    # Loads evaluation style images.
    eval_style_inputs_, _, _ = image_utils.arbitrary_style_image_inputs(
        FLAGS.eval_style_dataset_file,
        batch_size=FLAGS.batch_size,
        image_size=FLAGS.image_size,
        center_crop=True,
        shuffle=True,
        augment_style_images=False,
        random_style_image_size=False)

    # Computes stylized noise.
    stylized_noise, _, _, _ = build_model.build_model(
        tf.random_uniform(
            [min(4, FLAGS.batch_size), FLAGS.image_size, FLAGS.image_size, 3]),
        tf.slice(eval_style_inputs_, [0, 0, 0, 0],
                 [min(4, FLAGS.batch_size), -1, -1, -1]),
        trainable=False,
        is_training=False,
        reuse=None,
        inception_end_point='Mixed_6e',
        style_prediction_bottleneck=100,
        adds_losses=False)

    # Computes stylized images.
    stylized_images, _, loss_dict, _ = build_model.build_model(
        eval_content_inputs_,
        eval_style_inputs_,
        trainable=False,
        is_training=False,
        reuse=True,
        inception_end_point='Mixed_6e',
        style_prediction_bottleneck=100,
        adds_losses=True,
        content_weights=content_weights,
        style_weights=style_weights,
        total_variation_weight=FLAGS.total_variation_weight)

    # Adds image summaries to TensorBoard.
    tf.summary.image('image/{}/0_eval_content_inputs'.format(FLAGS.eval_name),
                     eval_content_inputs_, 3)
    tf.summary.image('image/{}/1_eval_style_inputs'.format(FLAGS.eval_name),
                     eval_style_inputs_, 3)
    tf.summary.image('image/{}/2_eval_stylized_images'.format(FLAGS.eval_name),
                     stylized_images, 3)
    tf.summary.image('image/{}/3_stylized_noise'.format(FLAGS.eval_name),
                     stylized_noise, 3)

    metrics = {}
    for key, value in loss_dict.items():
      metrics[key] = tf.metrics.mean(value)

    names_values, names_updates = slim.metrics.aggregate_metric_map(metrics)
    for name, value in names_values.items():
      slim.summaries.add_scalar_summary(value, name, print_summary=True)
    eval_op = list(names_updates.values())
    num_evals = FLAGS.num_evaluation_styles // FLAGS.batch_size

    slim.evaluation.evaluation_loop(
        master=FLAGS.master,
        checkpoint_dir=FLAGS.checkpoint_dir,
        logdir=FLAGS.eval_dir,
        eval_op=eval_op,
        num_evals=num_evals,
        eval_interval_secs=FLAGS.eval_interval_secs)
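
The tf.slice call above keeps only the first min(4, FLAGS.batch_size) style images while leaving height, width, and channels untouched. As a standalone illustration of that pattern (the batch and image sizes below are made up):

import tensorflow.compat.v1 as tf

style_batch = tf.random_uniform([8, 256, 256, 3])  # assumed [batch, height, width, channels]
# Keep the first 4 images; -1 means "everything to the end" along that dimension.
first_four = tf.slice(style_batch, [0, 0, 0, 0], [4, -1, -1, -1])  # shape [4, 256, 256, 3]
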
Example #4
    def body(self, features):
        hparams = self.hparams
        input_shape = common_layers.shape_list(features['inputs'])
        batch_size, _, frame_width, frame_height, frame_channels = input_shape  # pylint: disable=unused-variable

        # Swap time and batch axes.
        input_frames = common_video.swap_time_and_batch_axes(
            tf.to_float(features['inputs']))
        target_frames = common_video.swap_time_and_batch_axes(
            features['targets'])

        # Get actions if they exist, otherwise use zeros.
        input_actions = self.get_input_if_exists(
            features, 'input_action', batch_size,
            hparams.video_num_input_frames)
        target_actions = self.get_input_if_exists(
            features, 'target_action', batch_size,
            hparams.video_num_target_frames)

        # Get rewards if they exist, otherwise use zeros.
        # TODO(blazej) enable rewards.
        # input_rewards = self.get_input_if_exists(
        #     features, 'input_reward', batch_size, hparams.video_num_input_frames)
        # target_rewards = self.get_input_if_exists(
        #     features, 'target_reward', batch_size,hparams.video_num_target_frames)
        # all_rewards = tf.concat([input_rewards, target_rewards], axis=0)

        all_actions = tf.concat([input_actions, target_actions], axis=0)
        # Flatten the actions tensor to shape [frames, batch_size, action_dims].
        actions_shape = common_layers.shape_list(all_actions)
        all_actions = tf.reshape(all_actions, [
            actions_shape[0], -1,
            reduce(lambda x, y: x * y, actions_shape[2:])
        ])
        all_frames = tf.concat([input_frames, target_frames], axis=0)

        all_frames = tf.unstack(all_frames, axis=0)
        all_actions = tf.unstack(all_actions, axis=0)

        # TODO(blazej) - most likely this downsize is too strong.
        all_frames = [
            tf.image.resize_images(image, (IMG_HEIGHT, IMG_WIDTH),
                                   method=tf.image.ResizeMethod.BICUBIC)
            for image in all_frames
        ]

        enc_out_all, pred_out_all, _, van_on_enc_all = construct_model(
            all_frames,
            all_actions,
            context_frames=hparams.context_frames,
            hparams=hparams,
            is_training=self.is_training)

        enc_pred_loss, _ = calc_loss_psnr(
            enc_out_all[1:],
            pred_out_all,
            'enc_pred_loss',
            hparams=hparams,
            use_l1_loss=hparams.enc_pred_use_l1_loss)

        van_on_enc_loss, _ = calc_loss_psnr(van_on_enc_all,
                                            all_frames[1:],
                                            'van_on_enc_loss',
                                            hparams=hparams)

        enc_pred_loss_scale_delay = max(hparams.enc_pred_loss_scale_delay, 1)
        enc_pred_loss_scale = tf.nn.sigmoid(
            (tf.to_float(tf.train.get_or_create_global_step()) -
             enc_pred_loss_scale_delay) /
            (enc_pred_loss_scale_delay * .1)) * hparams.enc_pred_loss_scale
        tf.summary.scalar('enc_pred_loss_scale', enc_pred_loss_scale)
        epva_loss = enc_pred_loss * enc_pred_loss_scale + van_on_enc_loss
        tf.summary.scalar('epva_loss', epva_loss)

        predictions = tf.stack(van_on_enc_all)

        if hparams.clip_pixel_values:
            predictions = tf.clip_by_value(predictions, 0.0, 1.0)

        # TODO(mbz): clean this up!
        def fix_video_dims_and_concat_on_x_axis(x):
            x = tf.transpose(x, [1, 3, 4, 0, 2])
            x = tf.reshape(x, [batch_size, frame_height, frame_channels, -1])
            x = tf.transpose(x, [0, 3, 1, 2])
            return x

        frames_gd = fix_video_dims_and_concat_on_x_axis(target_frames)
        frames_pd = fix_video_dims_and_concat_on_x_axis(predictions)
        side_by_side_video = tf.concat([frames_gd, frames_pd], axis=1)
        tf.summary.image('full_video', side_by_side_video)

        predictions = tf.unstack(predictions)
        predictions = [
            tf.image.resize_images(image, (frame_width, frame_height),
                                   method=tf.image.ResizeMethod.BICUBIC)
            for image in predictions
        ]
        predictions = tf.stack(predictions)

        predictions = common_video.swap_time_and_batch_axes(predictions)
        predictions = tf.slice(
            predictions, [0, hparams.video_num_input_frames - 1, 0, 0, 0],
            [-1] * 5)

        return predictions, {'extra': epva_loss}
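
The final tf.slice above drops the frames that correspond to the conditioning inputs and keeps the tail of the time axis. A small illustration of the same pattern on a dummy tensor (the shapes and num_input value are assumptions):

import tensorflow.compat.v1 as tf

num_input = 4                                  # assumed hparams.video_num_input_frames
preds = tf.random_uniform([2, 10, 64, 64, 3])  # assumed [batch, time, height, width, channels]
# Start at time step num_input - 1 and take everything to the end in every dimension.
tail = tf.slice(preds, [0, num_input - 1, 0, 0, 0], [-1] * 5)  # shape [2, 7, 64, 64, 3]
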
Example #5
def random_image_crop(image,
                      boxes,
                      min_object_covered=0.9,
                      aspect_ratio_range=(0.75, 1.33),
                      area_range=(0.5, 1.0),
                      overlap_threshold=0.3):
    """
    Performs random crop. Given the input image and its bounding boxes,
    this op randomly crops a subimage.  Given a user-provided set of input constraints,
    the crop window is resampled until it satisfies these constraints.
    If within 100 trials it is unable to find a valid crop, the original
    image is returned. Both input boxes and returned boxes are in normalized
    form (i.e., they lie in the unit square [0, 1]).

    Arguments:
        image: a float tensor with shape [height, width, 3].
        boxes: a float tensor containing bounding boxes. It has shape
            [num_boxes, 4]. Boxes are in normalized form, meaning
            their coordinates vary between [0, 1].
            Each row is in the form of [ymin, xmin, ymax, xmax].
        min_object_covered: the cropped image must cover at least this fraction of
            at least one of the input bounding boxes.
        aspect_ratio_range: allowed range for aspect ratio of cropped image.
        area_range: allowed range for area ratio between cropped image and the
            original image.
        overlap_threshold: minimum overlap thresh with new cropped
            image to keep the box.
    Returns:
        image: cropped image, a float tensor with shape [None, None, 3].
        boxes: a float tensor with shape [num_remaining, 4], remaining boxes.
            Where 0 <= num_remaining <= num_boxes.
        window: a float tensor with shape [4], in normalized coordinates.
        keep_indices: an int tensor with shape [num_remaining],
            indices of remaining boxes in input boxes tensor.
    """

    sample_distorted_bounding_box = tf.image.sample_distorted_bounding_box(
        tf.shape(image),
        bounding_boxes=tf.expand_dims(boxes, 0),
        min_object_covered=min_object_covered,
        aspect_ratio_range=aspect_ratio_range,
        area_range=area_range,
        max_attempts=100,
        use_image_if_no_bounding_boxes=True)
    begin, size, window = sample_distorted_bounding_box
    image = tf.slice(image, begin, size)
    image.set_shape([None, None, 3])
    window = tf.squeeze(window, axis=[0, 1])

    # remove boxes that are completely outside the cropped image
    boxes, inside_window_ids = prune_completely_outside_window(boxes, window)
    # NOTE: this step may be redundant; prune_non_overlapping_boxes below may be sufficient.

    # remove boxes that are too much outside the cropped image
    boxes, keep_indices = prune_non_overlapping_boxes(
        boxes, tf.expand_dims(window, 0), min_overlap=overlap_threshold)

    # change coordinates of the remaining boxes
    boxes = change_coordinate_frame(boxes, window)

    keep_indices = tf.gather(inside_window_ids, keep_indices)
    return image, boxes, window, keep_indices
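
A hedged usage sketch for random_image_crop, assuming the helpers it calls (prune_completely_outside_window, prune_non_overlapping_boxes, change_coordinate_frame) are available in the same module; the image shape and box values are made up:

import tensorflow.compat.v1 as tf

image = tf.random_uniform([480, 640, 3])    # dummy image
boxes = tf.constant([[0.1, 0.1, 0.6, 0.5],  # dummy boxes in [ymin, xmin, ymax, xmax]
                     [0.3, 0.4, 0.9, 0.95]])
cropped, remaining_boxes, window, keep_indices = random_image_crop(
    image, boxes, min_object_covered=0.75, overlap_threshold=0.3)
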
Example #6
    def call(self, inputs, prev_state):
        """Evaluates one timestep of the current neural stack cell.

    See section 3.4 of Grefenstette et al., 2015.

    Args:
      inputs: The inputs to the neural stack cell should be a tf.float32 tensor
        with shape [batch_size, embedding_size]
      prev_state: The NeuralStackState from the previous timestep.

    Returns:
      A tuple of the output of the stack as well as the new NeuralStackState.
    """
        batch_size = tf.shape(inputs)[0]

        # Call the controller and get controller interface values.
        with tf.control_dependencies([prev_state.read_strengths]):
            controller_output = self.call_controller(
                inputs, prev_state.read_values, prev_state.controller_state,
                batch_size)

        # Always write input values to memory regardless of push strength.
        # See Equation-1 in Grefenstette et al., 2015.
        new_memory_values = prev_state.memory_values + tf.reduce_sum(
            tf.expand_dims(controller_output.write_values, axis=2) *
            prev_state.write_strengths,
            axis=1)

        # Attenuate the read strengths of existing memory values depending on the
        # current pop strength.
        # See Equation-2 in Grefenstette et al., 2015.
        new_read_strengths = prev_state.read_strengths
        for h in range(self._num_read_heads - 1, -1, -1):
            new_read_strengths = tf.nn.relu(new_read_strengths - tf.nn.relu(
                tf.slice(controller_output.pop_strengths, [0, h, 0, 0],
                         [-1, 1, -1, -1]) -
                tf.expand_dims(tf.reduce_sum(
                    new_read_strengths * self.get_read_mask(h), axis=2),
                               axis=3)))

        # Combine all write heads and their associated push values into a single set
        # of read weights.
        new_read_strengths += tf.reduce_sum(controller_output.push_strengths *
                                            prev_state.write_strengths,
                                            axis=1,
                                            keep_dims=True)

        # Calculate the "top" value of the stack by looking at read strengths.
        # See Equation-3 in Grefenstette et al., 2015.
        new_read_values = tf.reduce_sum(
            tf.minimum(
                new_read_strengths,
                tf.nn.relu(1 - tf.expand_dims(tf.reduce_sum(
                    new_read_strengths * tf.concat([
                        self.get_read_mask(h)
                        for h in range(self._num_read_heads)
                    ],
                                                   axis=1),
                    axis=2),
                                              axis=3))) *
            tf.expand_dims(new_memory_values, axis=1),
            axis=2)

        # Temporarily split write strengths apart so they can be shifted in
        # different directions.
        write_strengths_by_head = tf.split(prev_state.write_strengths,
                                           self._num_write_heads,
                                           axis=1)
        # Shift the write strengths for each write head in the direction indicated
        # by get_write_head_offset().
        new_write_strengths = tf.concat([
            tf.roll(
                write_strength, shift=self.get_write_head_offset(h), axis=2)
            for h, write_strength in enumerate(write_strengths_by_head)
        ],
                                        axis=1)

        return (controller_output.outputs,
                NeuralStackState(controller_state=controller_output.state,
                                 read_values=new_read_values,
                                 memory_values=new_memory_values,
                                 read_strengths=new_read_strengths,
                                 write_strengths=new_write_strengths))
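
The per-head tf.slice in the pop-strength loop above selects read head h while keeping the head axis (size 1), so the result still broadcasts against the read mask. A standalone illustration with assumed sizes:

import tensorflow.compat.v1 as tf

strengths = tf.random_uniform([2, 3, 8, 1])  # assumed [batch, num_read_heads, memory_size, 1]
h = 1
# All batch entries, only head h (a slice of size 1 along axis 1), everything else intact.
head_h = tf.slice(strengths, [0, h, 0, 0], [-1, 1, -1, -1])  # shape [2, 1, 8, 1]
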
Example #7
        def metric_fn(**kwargs):
            """Returns a dictionary that has the evaluation metrics."""
            if params['nms_configs'].get('pyfunc', True):
                detections_bs = []
                nms_configs = params['nms_configs']
                for index in range(kwargs['boxes'].shape[0]):
                    detections = tf.numpy_function(
                        functools.partial(nms_np.per_class_nms,
                                          nms_configs=nms_configs),
                        [
                            kwargs['boxes'][index],
                            kwargs['scores'][index],
                            kwargs['classes'][index],
                            tf.slice(kwargs['image_ids'], [index], [1]),
                            tf.slice(kwargs['image_scales'], [index], [1]),
                            params['num_classes'],
                            nms_configs['max_output_size'],
                        ], tf.float32)
                    detections_bs.append(detections)
                detections_bs = postprocess.transform_detections(
                    tf.stack(detections_bs))
            else:
                # These two branches should be equivalent, but currently they are not.
                # TODO(tanmingxing): enable the non-pyfunc path after the bug is fixed.
                nms_boxes, nms_scores, nms_classes, _ = postprocess.per_class_nms(
                    params, kwargs['boxes'], kwargs['scores'],
                    kwargs['classes'], kwargs['image_scales'])
                img_ids = tf.cast(tf.expand_dims(kwargs['image_ids'], -1),
                                  nms_scores.dtype)
                detections_bs = [
                    img_ids * tf.ones_like(nms_scores),
                    nms_boxes[:, :, 1],
                    nms_boxes[:, :, 0],
                    nms_boxes[:, :, 3] - nms_boxes[:, :, 1],
                    nms_boxes[:, :, 2] - nms_boxes[:, :, 0],
                    nms_scores,
                    nms_classes,
                ]
                detections_bs = tf.stack(detections_bs,
                                         axis=-1,
                                         name='detections')

            if params.get('testdev_dir', None):
                logging.info('Eval testdev_dir %s', params['testdev_dir'])
                eval_metric = coco_metric.EvaluationMetric(
                    testdev_dir=params['testdev_dir'])
                coco_metrics = eval_metric.estimator_metric_fn(
                    detections_bs, tf.zeros([1]))
            else:
                logging.info('Eval val with groundtruths %s.',
                             params['val_json_file'])
                eval_metric = coco_metric.EvaluationMetric(
                    filename=params['val_json_file'],
                    label_map=params['label_map'])
                coco_metrics = eval_metric.estimator_metric_fn(
                    detections_bs, kwargs['groundtruth_data'])

            # Add metrics to output.
            cls_loss = tf.metrics.mean(kwargs['cls_loss_repeat'])
            box_loss = tf.metrics.mean(kwargs['box_loss_repeat'])
            output_metrics = {
                'cls_loss': cls_loss,
                'box_loss': box_loss,
            }
            output_metrics.update(coco_metrics)
            return output_metrics
Example #8
    def model_fn(features, labels, mode, params=None):
        """Constructs the object detection model.

    Args:
      features: Dictionary of feature tensors, returned from `input_fn`.
      labels: Dictionary of groundtruth tensors if mode is TRAIN or EVAL,
        otherwise None.
      mode: Mode key from tf.estimator.ModeKeys.
      params: Parameter dictionary passed from the estimator.

    Returns:
      An `EstimatorSpec` that encapsulates the model and its serving
        configurations.
    """
        params = params or {}
        total_loss, train_op, detections, export_outputs = None, None, None, None
        is_training = mode == tf.estimator.ModeKeys.TRAIN

        # Make sure to set the Keras learning phase. True during training,
        # False for inference.
        tf.keras.backend.set_learning_phase(is_training)
        # Set policy for mixed-precision training with Keras-based models.
        if use_tpu and train_config.use_bfloat16:
            from tensorflow.python.keras.engine import base_layer_utils  # pylint: disable=g-import-not-at-top
            # Enable v2 behavior, as `mixed_bfloat16` is only supported in TF 2.0.
            base_layer_utils.enable_v2_dtype_behavior()
            tf.compat.v2.keras.mixed_precision.experimental.set_policy(
                'mixed_bfloat16')
        detection_model = detection_model_fn(is_training=is_training,
                                             add_summaries=(not use_tpu))
        scaffold_fn = None

        if mode == tf.estimator.ModeKeys.TRAIN:
            labels = unstack_batch(labels,
                                   unpad_groundtruth_tensors=train_config.
                                   unpad_groundtruth_tensors)
        elif mode == tf.estimator.ModeKeys.EVAL:
            # When evaluating on training data, check whether the groundtruth
            # must be unpadded.
            boxes_shape = (labels[fields.InputDataFields.groundtruth_boxes].
                           get_shape().as_list())
            unpad_groundtruth_tensors = boxes_shape[
                1] is not None and not use_tpu
            labels = unstack_batch(
                labels, unpad_groundtruth_tensors=unpad_groundtruth_tensors)

        if mode in (tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL):
            provide_groundtruth(detection_model, labels)

        preprocessed_images = features[fields.InputDataFields.image]

        side_inputs = detection_model.get_side_inputs(features)

        if use_tpu and train_config.use_bfloat16:
            with contrib_tpu.bfloat16_scope():
                prediction_dict = detection_model.predict(
                    preprocessed_images,
                    features[fields.InputDataFields.true_image_shape],
                    **side_inputs)
                prediction_dict = ops.bfloat16_to_float32_nested(
                    prediction_dict)
        else:
            prediction_dict = detection_model.predict(
                preprocessed_images,
                features[fields.InputDataFields.true_image_shape],
                **side_inputs)

        def postprocess_wrapper(args):
            return detection_model.postprocess(args[0], args[1])

        if mode in (tf.estimator.ModeKeys.EVAL, tf.estimator.ModeKeys.PREDICT):
            if use_tpu and postprocess_on_cpu:
                detections = contrib_tpu.outside_compilation(
                    postprocess_wrapper,
                    (prediction_dict,
                     features[fields.InputDataFields.true_image_shape]))
            else:
                detections = postprocess_wrapper(
                    (prediction_dict,
                     features[fields.InputDataFields.true_image_shape]))

        if mode == tf.estimator.ModeKeys.TRAIN:
            load_pretrained = hparams.load_pretrained if hparams else False
            if train_config.fine_tune_checkpoint and load_pretrained:
                if not train_config.fine_tune_checkpoint_type:
                    # train_config.from_detection_checkpoint field is deprecated. For
                    # backward compatibility, set train_config.fine_tune_checkpoint_type
                    # based on train_config.from_detection_checkpoint.
                    if train_config.from_detection_checkpoint:
                        train_config.fine_tune_checkpoint_type = 'detection'
                    else:
                        train_config.fine_tune_checkpoint_type = 'classification'
                asg_map = detection_model.restore_map(
                    fine_tune_checkpoint_type=train_config.
                    fine_tune_checkpoint_type,
                    load_all_detection_checkpoint_vars=(
                        train_config.load_all_detection_checkpoint_vars))
                available_var_map = (
                    variables_helper.get_variables_available_in_checkpoint(
                        asg_map,
                        train_config.fine_tune_checkpoint,
                        include_global_step=False))
                if use_tpu:

                    def tpu_scaffold():
                        tf.train.init_from_checkpoint(
                            train_config.fine_tune_checkpoint,
                            available_var_map)
                        return tf.train.Scaffold()

                    scaffold_fn = tpu_scaffold
                else:
                    tf.train.init_from_checkpoint(
                        train_config.fine_tune_checkpoint, available_var_map)

        if mode in (tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL):
            if (mode == tf.estimator.ModeKeys.EVAL
                    and eval_config.use_dummy_loss_in_eval):
                total_loss = tf.constant(1.0)
                losses_dict = {'Loss/total_loss': total_loss}
            else:
                losses_dict = detection_model.loss(
                    prediction_dict,
                    features[fields.InputDataFields.true_image_shape])
                losses = [loss_tensor for loss_tensor in losses_dict.values()]
                if train_config.add_regularization_loss:
                    regularization_losses = detection_model.regularization_losses(
                    )
                    if use_tpu and train_config.use_bfloat16:
                        regularization_losses = ops.bfloat16_to_float32_nested(
                            regularization_losses)
                    if regularization_losses:
                        regularization_loss = tf.add_n(
                            regularization_losses, name='regularization_loss')
                        losses.append(regularization_loss)
                        losses_dict[
                            'Loss/regularization_loss'] = regularization_loss
                total_loss = tf.add_n(losses, name='total_loss')
                losses_dict['Loss/total_loss'] = total_loss

            if 'graph_rewriter_config' in configs:
                graph_rewriter_fn = graph_rewriter_builder.build(
                    configs['graph_rewriter_config'], is_training=is_training)
                graph_rewriter_fn()

            # TODO(rathodv): Stop creating optimizer summary vars in EVAL mode once we
            # can write learning rate summaries on TPU without host calls.
            global_step = tf.train.get_or_create_global_step()
            training_optimizer, optimizer_summary_vars = optimizer_builder.build(
                train_config.optimizer)

        if mode == tf.estimator.ModeKeys.TRAIN:
            if use_tpu:
                training_optimizer = contrib_tpu.CrossShardOptimizer(
                    training_optimizer)

            # Optionally freeze some layers by setting their gradients to be zero.
            trainable_variables = None
            include_variables = (train_config.update_trainable_variables
                                 if train_config.update_trainable_variables
                                 else None)
            exclude_variables = (train_config.freeze_variables
                                 if train_config.freeze_variables else None)
            trainable_variables = slim.filter_variables(
                tf.trainable_variables(),
                include_patterns=include_variables,
                exclude_patterns=exclude_variables)

            clip_gradients_value = None
            if train_config.gradient_clipping_by_norm > 0:
                clip_gradients_value = train_config.gradient_clipping_by_norm

            if not use_tpu:
                for var in optimizer_summary_vars:
                    tf.summary.scalar(var.op.name, var)
            summaries = [] if use_tpu else None
            if train_config.summarize_gradients:
                summaries = [
                    'gradients', 'gradient_norm', 'global_gradient_norm'
                ]
            train_op = slim.optimizers.optimize_loss(
                loss=total_loss,
                global_step=global_step,
                learning_rate=None,
                clip_gradients=clip_gradients_value,
                optimizer=training_optimizer,
                update_ops=detection_model.updates(),
                variables=trainable_variables,
                summaries=summaries,
                name='')  # Preventing scope prefix on all variables.

        if mode == tf.estimator.ModeKeys.PREDICT:
            exported_output = exporter_lib.add_output_tensor_nodes(detections)
            export_outputs = {
                tf.saved_model.signature_constants.PREDICT_METHOD_NAME:
                tf.estimator.export.PredictOutput(exported_output)
            }

        eval_metric_ops = None
        scaffold = None
        if mode == tf.estimator.ModeKeys.EVAL:
            class_agnostic = (fields.DetectionResultFields.detection_classes
                              not in detections)
            groundtruth = _prepare_groundtruth_for_eval(
                detection_model, class_agnostic,
                eval_input_config.max_number_of_boxes)
            use_original_images = fields.InputDataFields.original_image in features
            if use_original_images:
                eval_images = features[fields.InputDataFields.original_image]
                true_image_shapes = tf.slice(
                    features[fields.InputDataFields.true_image_shape], [0, 0],
                    [-1, 3])
                original_image_spatial_shapes = features[
                    fields.InputDataFields.original_image_spatial_shape]
            else:
                eval_images = features[fields.InputDataFields.image]
                true_image_shapes = None
                original_image_spatial_shapes = None

            eval_dict = eval_util.result_dict_for_batched_example(
                eval_images,
                features[inputs.HASH_KEY],
                detections,
                groundtruth,
                class_agnostic=class_agnostic,
                scale_to_absolute=True,
                original_image_spatial_shapes=original_image_spatial_shapes,
                true_image_shapes=true_image_shapes)

            if fields.InputDataFields.image_additional_channels in features:
                eval_dict[fields.InputDataFields.
                          image_additional_channels] = features[
                              fields.InputDataFields.image_additional_channels]

            if class_agnostic:
                category_index = label_map_util.create_class_agnostic_category_index(
                )
            else:
                category_index = label_map_util.create_category_index_from_labelmap(
                    eval_input_config.label_map_path)
            vis_metric_ops = None
            if not use_tpu and use_original_images:
                keypoint_edges = [(kp.start, kp.end)
                                  for kp in eval_config.keypoint_edge]

                eval_metric_op_vis = vis_utils.VisualizeSingleFrameDetections(
                    category_index,
                    max_examples_to_draw=eval_config.num_visualizations,
                    max_boxes_to_draw=eval_config.max_num_boxes_to_visualize,
                    min_score_thresh=eval_config.min_score_threshold,
                    use_normalized_coordinates=False,
                    keypoint_edges=keypoint_edges or None)
                vis_metric_ops = eval_metric_op_vis.get_estimator_eval_metric_ops(
                    eval_dict)

            # Eval metrics on a single example.
            eval_metric_ops = eval_util.get_eval_metric_ops_for_evaluators(
                eval_config, list(category_index.values()), eval_dict)
            for loss_key, loss_tensor in iter(losses_dict.items()):
                eval_metric_ops[loss_key] = tf.metrics.mean(loss_tensor)
            for var in optimizer_summary_vars:
                eval_metric_ops[var.op.name] = (var, tf.no_op())
            if vis_metric_ops is not None:
                eval_metric_ops.update(vis_metric_ops)
            eval_metric_ops = {str(k): v for k, v in eval_metric_ops.items()}

            if eval_config.use_moving_averages:
                variable_averages = tf.train.ExponentialMovingAverage(0.0)
                variables_to_restore = variable_averages.variables_to_restore()
                keep_checkpoint_every_n_hours = (
                    train_config.keep_checkpoint_every_n_hours)
                saver = tf.train.Saver(
                    variables_to_restore,
                    keep_checkpoint_every_n_hours=keep_checkpoint_every_n_hours
                )
                scaffold = tf.train.Scaffold(saver=saver)

        # EVAL executes on CPU, so use regular non-TPU EstimatorSpec.
        if use_tpu and mode != tf.estimator.ModeKeys.EVAL:
            return contrib_tpu.TPUEstimatorSpec(mode=mode,
                                                scaffold_fn=scaffold_fn,
                                                predictions=detections,
                                                loss=total_loss,
                                                train_op=train_op,
                                                eval_metrics=eval_metric_ops,
                                                export_outputs=export_outputs)
        else:
            if scaffold is None:
                keep_checkpoint_every_n_hours = (
                    train_config.keep_checkpoint_every_n_hours)
                saver = tf.train.Saver(
                    sharded=True,
                    keep_checkpoint_every_n_hours=keep_checkpoint_every_n_hours,
                    save_relative_paths=True)
                tf.add_to_collection(tf.GraphKeys.SAVERS, saver)
                scaffold = tf.train.Scaffold(saver=saver)
            return tf.estimator.EstimatorSpec(mode=mode,
                                              predictions=detections,
                                              loss=total_loss,
                                              train_op=train_op,
                                              eval_metric_ops=eval_metric_ops,
                                              export_outputs=export_outputs,
                                              scaffold=scaffold)
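
In the EVAL branch above, tf.slice(..., [0, 0], [-1, 3]) keeps every row of the batched shape tensor and only its first three columns (height, width, channels). A minimal illustration of that slice; the extra fourth column below is purely hypothetical padding, added only so the slice has something to drop:

import tensorflow.compat.v1 as tf

shapes = tf.constant([[480, 640, 3, 0],   # dummy per-image shape rows with a padding column
                      [320, 320, 3, 0]])
hwc = tf.slice(shapes, [0, 0], [-1, 3])   # all rows, first 3 columns -> [[480, 640, 3], [320, 320, 3]]
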
Example #9
    def get(self):
        """ Provides input data to the graph. """
        # calculate size of each record (this lists what is contained in the db and how many bytes are occupied)
        record_bytes = 2

        encoding_bytes = 4
        kp_xyz_entries = 3 * self.num_kp
        record_bytes += encoding_bytes * kp_xyz_entries

        encoding_bytes = 4
        kp_uv_entries = 2 * self.num_kp
        record_bytes += encoding_bytes * kp_uv_entries

        cam_matrix_entries = 9
        record_bytes += encoding_bytes * cam_matrix_entries

        image_bytes = self.image_size[0] * self.image_size[1] * 3
        record_bytes += image_bytes

        hand_parts_bytes = self.image_size[0] * self.image_size[1]
        record_bytes += hand_parts_bytes

        kp_vis_bytes = self.num_kp
        record_bytes += kp_vis_bytes
        """ READ DATA ITEMS"""
        # Start reader
        reader = tf.FixedLengthRecordReader(header_bytes=0,
                                            record_bytes=record_bytes)
        _, value = reader.read(
            tf.train.string_input_producer([self.path_to_db]))

        # decode to floats
        bytes_read = 0
        data_dict = dict()
        record_bytes_float32 = tf.decode_raw(value, tf.float32)

        # 1. Read keypoint xyz
        keypoint_xyz = tf.reshape(
            tf.slice(record_bytes_float32, [bytes_read // 4],
                     [kp_xyz_entries]), [self.num_kp, 3])
        bytes_read += encoding_bytes * kp_xyz_entries

        # calculate palm coord
        if not self.use_wrist_coord:
            palm_coord_l = tf.expand_dims(
                0.5 * (keypoint_xyz[0, :] + keypoint_xyz[12, :]), 0)
            palm_coord_r = tf.expand_dims(
                0.5 * (keypoint_xyz[21, :] + keypoint_xyz[33, :]), 0)
            keypoint_xyz = tf.concat([
                palm_coord_l, keypoint_xyz[1:21, :], palm_coord_r,
                keypoint_xyz[-20:, :]
            ], 0)

        data_dict['keypoint_xyz'] = keypoint_xyz

        # 2. Read keypoint uv
        keypoint_uv = tf.cast(
            tf.reshape(
                tf.slice(record_bytes_float32, [bytes_read // 4],
                         [kp_uv_entries]), [self.num_kp, 2]), tf.int32)
        bytes_read += encoding_bytes * kp_uv_entries

        keypoint_uv = tf.cast(keypoint_uv, tf.float32)

        # calculate palm coord
        if not self.use_wrist_coord:
            palm_coord_uv_l = tf.expand_dims(
                0.5 * (keypoint_uv[0, :] + keypoint_uv[12, :]), 0)
            palm_coord_uv_r = tf.expand_dims(
                0.5 * (keypoint_uv[21, :] + keypoint_uv[33, :]), 0)
            keypoint_uv = tf.concat([
                palm_coord_uv_l, keypoint_uv[1:21, :], palm_coord_uv_r,
                keypoint_uv[-20:, :]
            ], 0)

        if self.coord_uv_noise:
            noise = tf.truncated_normal([42, 2],
                                        mean=0.0,
                                        stddev=self.coord_uv_noise_sigma)
            keypoint_uv += noise

        data_dict['keypoint_uv'] = keypoint_uv

        # 3. Camera intrinsics
        cam_mat = tf.reshape(
            tf.slice(record_bytes_float32, [bytes_read // 4],
                     [cam_matrix_entries]), [3, 3])
        bytes_read += encoding_bytes * cam_matrix_entries
        data_dict['cam_mat'] = cam_mat

        # decode to uint8
        bytes_read += 2
        record_bytes_uint8 = tf.decode_raw(value, tf.uint8)

        # 4. Read image
        image = tf.reshape(
            tf.slice(record_bytes_uint8, [bytes_read], [image_bytes]),
            [self.image_size[0], self.image_size[1], 3])
        image = tf.cast(image, tf.float32)
        bytes_read += image_bytes

        # scale pixel values to [-0.5, 0.5]
        image = image / 255.0 - 0.5
        if self.hue_aug:
            image = tf.image.random_hue(image, self.hue_aug_max)
        data_dict['image'] = image

        # 5. Read mask
        hand_parts_mask = tf.reshape(
            tf.slice(record_bytes_uint8, [bytes_read], [hand_parts_bytes]),
            [self.image_size[0], self.image_size[1]])
        hand_parts_mask = tf.cast(hand_parts_mask, tf.int32)
        bytes_read += hand_parts_bytes
        data_dict['hand_parts'] = hand_parts_mask
        hand_mask = tf.greater(hand_parts_mask, 1)
        bg_mask = tf.logical_not(hand_mask)
        data_dict['hand_mask'] = tf.cast(tf.stack([bg_mask, hand_mask], 2),
                                         tf.int32)

        # 6. Read visibility
        keypoint_vis = tf.reshape(
            tf.slice(record_bytes_uint8, [bytes_read], [kp_vis_bytes]),
            [self.num_kp])
        keypoint_vis = tf.cast(keypoint_vis, tf.bool)
        bytes_read += kp_vis_bytes

        # calculate palm visibility
        if not self.use_wrist_coord:
            palm_vis_l = tf.expand_dims(
                tf.logical_or(keypoint_vis[0], keypoint_vis[12]), 0)
            palm_vis_r = tf.expand_dims(
                tf.logical_or(keypoint_vis[21], keypoint_vis[33]), 0)
            keypoint_vis = tf.concat([
                palm_vis_l, keypoint_vis[1:21], palm_vis_r, keypoint_vis[-20:]
            ], 0)
        data_dict['keypoint_vis'] = keypoint_vis

        assert bytes_read == record_bytes, "Doesn't add up."
        """ DEPENDENT DATA ITEMS: SUBSET of 21 keypoints"""
        # figure out dominant hand by analysis of the segmentation mask
        one_map, zero_map = tf.ones_like(hand_parts_mask), tf.zeros_like(
            hand_parts_mask)
        cond_l = tf.logical_and(tf.greater(hand_parts_mask, one_map),
                                tf.less(hand_parts_mask, one_map * 18))
        cond_r = tf.greater(hand_parts_mask, one_map * 17)
        hand_map_l = tf.where(cond_l, one_map, zero_map)
        hand_map_r = tf.where(cond_r, one_map, zero_map)
        num_px_left_hand = tf.reduce_sum(hand_map_l)
        num_px_right_hand = tf.reduce_sum(hand_map_r)

        # PRODUCE the 21-keypoint subset using the segmentation masks.
        # We only keep the more prominent hand for each frame and discard the second set of keypoints.
        kp_coord_xyz_left = keypoint_xyz[:21, :]
        kp_coord_xyz_right = keypoint_xyz[-21:, :]

        cond_left = tf.logical_and(
            tf.cast(tf.ones_like(kp_coord_xyz_left), tf.bool),
            tf.greater(num_px_left_hand, num_px_right_hand))
        kp_coord_xyz21 = tf.where(cond_left, kp_coord_xyz_left,
                                  kp_coord_xyz_right)

        hand_side = tf.where(
            tf.greater(num_px_left_hand,
                       num_px_right_hand), tf.constant(0, dtype=tf.int32),
            tf.constant(1, dtype=tf.int32))  # left hand = 0; right hand = 1
        data_dict['hand_side'] = tf.one_hot(hand_side,
                                            depth=2,
                                            on_value=1.0,
                                            off_value=0.0,
                                            dtype=tf.float32)

        data_dict['keypoint_xyz21'] = kp_coord_xyz21

        # make coords relative to root joint
        kp_coord_xyz_root = kp_coord_xyz21[0, :]  # this is the palm coord
        kp_coord_xyz21_rel = kp_coord_xyz21 - kp_coord_xyz_root  # relative coords in metric coords
        index_root_bone_length = tf.sqrt(
            tf.reduce_sum(
                tf.square(kp_coord_xyz21_rel[12, :] -
                          kp_coord_xyz21_rel[11, :])))
        data_dict['keypoint_scale'] = index_root_bone_length
        data_dict[
            'keypoint_xyz21_normed'] = kp_coord_xyz21_rel / index_root_bone_length  # normalized by length of 12->11

        # calculate local coordinates
        kp_coord_xyz21_local = bone_rel_trafo(
            data_dict['keypoint_xyz21_normed'])
        kp_coord_xyz21_local = tf.squeeze(kp_coord_xyz21_local)
        data_dict['keypoint_xyz21_local'] = kp_coord_xyz21_local

        # calculate viewpoint and coords in canonical coordinates
        kp_coord_xyz21_rel_can, rot_mat = canonical_trafo(
            data_dict['keypoint_xyz21_normed'])
        kp_coord_xyz21_rel_can, rot_mat = tf.squeeze(
            kp_coord_xyz21_rel_can), tf.squeeze(rot_mat)
        kp_coord_xyz21_rel_can = flip_right_hand(kp_coord_xyz21_rel_can,
                                                 tf.logical_not(cond_left))
        data_dict['keypoint_xyz21_can'] = kp_coord_xyz21_rel_can
        data_dict['rot_mat'] = tf.matrix_inverse(rot_mat)

        # Set of 21 for visibility
        keypoint_vis_left = keypoint_vis[:21]
        keypoint_vis_right = keypoint_vis[-21:]
        keypoint_vis21 = tf.where(cond_left[:, 0], keypoint_vis_left,
                                  keypoint_vis_right)
        data_dict['keypoint_vis21'] = keypoint_vis21

        # Set of 21 for UV coordinates
        keypoint_uv_left = keypoint_uv[:21, :]
        keypoint_uv_right = keypoint_uv[-21:, :]
        keypoint_uv21 = tf.where(cond_left[:, :2], keypoint_uv_left,
                                 keypoint_uv_right)
        data_dict['keypoint_uv21'] = keypoint_uv21
        """ DEPENDENT DATA ITEMS: HAND CROP """
        if self.hand_crop:
            crop_center = keypoint_uv21[12, ::-1]

            # catch the case when no valid keypoint is available (happens almost never)
            crop_center = tf.cond(tf.reduce_all(tf.is_finite(crop_center)),
                                  lambda: crop_center,
                                  lambda: tf.constant([0.0, 0.0]))
            crop_center.set_shape([
                2,
            ])

            if self.crop_center_noise:
                noise = tf.truncated_normal(
                    [2], mean=0.0, stddev=self.crop_center_noise_sigma)
                crop_center += noise

            crop_scale_noise = tf.constant(1.0)
            if self.crop_scale_noise:
                crop_scale_noise = tf.squeeze(
                    tf.random_uniform([1], minval=1.0, maxval=1.2))

            # select visible coords only
            kp_coord_h = tf.boolean_mask(keypoint_uv21[:, 1], keypoint_vis21)
            kp_coord_w = tf.boolean_mask(keypoint_uv21[:, 0], keypoint_vis21)
            kp_coord_hw = tf.stack([kp_coord_h, kp_coord_w], 1)

            # determine size of crop (measure spatial extent of hw coords first)
            min_coord = tf.maximum(tf.reduce_min(kp_coord_hw, 0), 0.0)
            max_coord = tf.minimum(tf.reduce_max(kp_coord_hw, 0),
                                   self.image_size)

            # find the larger distance w.r.t. the center of the crop
            crop_size_best = 2 * tf.maximum(max_coord - crop_center,
                                            crop_center - min_coord)
            crop_size_best = tf.reduce_max(crop_size_best)
            crop_size_best = tf.minimum(tf.maximum(crop_size_best, 50.0),
                                        500.0)

            # catch the case when no valid keypoint is available
            crop_size_best = tf.cond(
                tf.reduce_all(tf.is_finite(crop_size_best)),
                lambda: crop_size_best, lambda: tf.constant(200.0))
            crop_size_best.set_shape([])

            # calculate necessary scaling
            scale = tf.cast(self.crop_size, tf.float32) / crop_size_best
            scale = tf.minimum(tf.maximum(scale, 1.0), 10.0)
            scale *= crop_scale_noise
            data_dict['crop_scale'] = scale

            if self.crop_offset_noise:
                noise = tf.truncated_normal(
                    [2], mean=0.0, stddev=self.crop_offset_noise_sigma)
                crop_center += noise

            # Crop image
            img_crop = crop_image_from_xy(tf.expand_dims(image, 0),
                                          crop_center, self.crop_size, scale)
            data_dict['image_crop'] = tf.squeeze(img_crop)

            # Modify uv21 coordinates
            crop_center_float = tf.cast(crop_center, tf.float32)
            keypoint_uv21_u = (keypoint_uv21[:, 0] - crop_center_float[1]
                               ) * scale + self.crop_size // 2
            keypoint_uv21_v = (keypoint_uv21[:, 1] - crop_center_float[0]
                               ) * scale + self.crop_size // 2
            keypoint_uv21 = tf.stack([keypoint_uv21_u, keypoint_uv21_v], 1)
            data_dict['keypoint_uv21'] = keypoint_uv21

            # Modify camera intrinsics
            scale = tf.reshape(scale, [
                1,
            ])
            scale_matrix = tf.dynamic_stitch([
                [0], [1], [2], [3], [4], [5], [6], [7], [8]
            ], [scale, [0.0], [0.0], [0.0], scale, [0.0], [0.0], [0.0], [1.0]])
            scale_matrix = tf.reshape(scale_matrix, [3, 3])

            crop_center_float = tf.cast(crop_center, tf.float32)
            trans1 = crop_center_float[0] * scale - self.crop_size // 2
            trans2 = crop_center_float[1] * scale - self.crop_size // 2
            trans1 = tf.reshape(trans1, [
                1,
            ])
            trans2 = tf.reshape(trans2, [
                1,
            ])
            trans_matrix = tf.dynamic_stitch(
                [[0], [1], [2], [3], [4], [5], [6], [7], [8]],
                [[1.0], [0.0], -trans2, [0.0], [1.0], -trans1, [0.0], [0.0],
                 [1.0]])
            trans_matrix = tf.reshape(trans_matrix, [3, 3])

            data_dict['cam_mat'] = tf.matmul(trans_matrix,
                                             tf.matmul(scale_matrix, cam_mat))
        """ DEPENDENT DATA ITEMS: Scoremap from the SUBSET of 21 keypoints"""
        # create scoremaps from the subset of 2D annotations
        keypoint_hw21 = tf.stack([keypoint_uv21[:, 1], keypoint_uv21[:, 0]],
                                 -1)

        scoremap_size = self.image_size

        if self.hand_crop:
            scoremap_size = (self.crop_size, self.crop_size)

        scoremap = self.create_multiple_gaussian_map(keypoint_hw21,
                                                     scoremap_size,
                                                     self.sigma,
                                                     valid_vec=keypoint_vis21)

        if self.scoremap_dropout:
            scoremap = tf.nn.dropout(scoremap,
                                     self.scoremap_dropout_prob,
                                     noise_shape=[1, 1, 21])
            scoremap *= self.scoremap_dropout_prob

        data_dict['scoremap'] = scoremap

        if self.scale_to_size:
            image, keypoint_uv21, keypoint_vis21 = data_dict[
                'image'], data_dict['keypoint_uv21'], data_dict[
                    'keypoint_vis21']
            s = image.get_shape().as_list()
            image = tf.image.resize_images(image, self.scale_target_size)
            scale = (self.scale_target_size[0] / float(s[0]),
                     self.scale_target_size[1] / float(s[1]))
            keypoint_uv21 = tf.stack([
                keypoint_uv21[:, 0] * scale[1], keypoint_uv21[:, 1] * scale[0]
            ], 1)

            data_dict = dict(
            )  # delete everything else because the scaling makes the data invalid anyway
            data_dict['image'] = image
            data_dict['keypoint_uv21'] = keypoint_uv21
            data_dict['keypoint_vis21'] = keypoint_vis21

        elif self.random_crop_to_size:
            tensor_stack = tf.concat([
                data_dict['image'],
                tf.expand_dims(tf.cast(data_dict['hand_parts'], tf.float32),
                               -1),
                tf.cast(data_dict['hand_mask'], tf.float32)
            ], 2)
            s = tensor_stack.get_shape().as_list()
            tensor_stack_cropped = tf.random_crop(
                tensor_stack,
                [self.random_crop_size, self.random_crop_size, s[2]])
            data_dict = dict(
            )  # delete everything else because the random cropping makes the data invalid anyway
            data_dict['image'], data_dict['hand_parts'], data_dict['hand_mask'] = tensor_stack_cropped[:, :, :3],\
                                                                                  tf.cast(tensor_stack_cropped[:, :, 3], tf.int32),\
                                                                                  tf.cast(tensor_stack_cropped[:, :, 4:], tf.int32)

        names, tensors = zip(*data_dict.items())

        if self.shuffle:
            tensors = tf.train.shuffle_batch_join([tensors],
                                                  batch_size=self.batch_size,
                                                  capacity=100,
                                                  min_after_dequeue=50,
                                                  enqueue_many=False)
        else:
            tensors = tf.train.batch_join([tensors],
                                          batch_size=self.batch_size,
                                          capacity=100,
                                          enqueue_many=False)

        return dict(zip(names, tensors))
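
The reader above repeatedly takes tf.slice views into the flat decoded record at a running offset and reshapes them into keypoints, images, and masks. A small, self-contained illustration of that pattern (the buffer contents, offset, and keypoint count are made up):

import tensorflow.compat.v1 as tf

flat = tf.range(30, dtype=tf.float32)  # stand-in for the decoded float32 record
offset_floats = 3                      # i.e. bytes_read // 4 in the example above
num_kp = 7
keypoints = tf.reshape(
    tf.slice(flat, [offset_floats], [num_kp * 3]), [num_kp, 3])  # shape [7, 3]
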
Example #10
def _crop_to_square(image,
                    decode_image=False,
                    side_length=IMAGE_SIZE,
                    crop_padding=CROP_PADDING,
                    area_range=(0.08, 1.0),
                    is_training=True,
                    resize_only=False,
                    eval_crop_method=enums.EvalCropMethod.RESIZE_THEN_CROP):
    """Produces a (possibly distorted) square crop of an image.

  Given an input image, either as an encoded bytes string or a decoded image
  Tensor, produces a square version of it with the desired side length, using a
  combination of cropping and resizing.

  If `resize_only` is True, simply resize the image to be
  `side_length`x`side_length`, possibly distorting it if the original image is
  not square.

  If `is_training` is True, then sample a random box to crop from the image and
  then resize the result to be `side_length`x`side_length`.

  If `is_training` is False then we follow `eval_crop_method` to determine the
  strategy of cropping and resizing. Generally the approach is to end up with a
  center crop of size `side_length`x`side_length` taken from the image resized
  to have a minimum dimension of `side_length` + `crop_padding`. By setting
  eval_crop_method appropriately, this can be accomplished by first resizing and
  then cropping, first cropping and then resizing, or a less common approach of
  cropping the central `side_length`/(`side_length`+`crop_padding`) pixels in
  each dimension followed by resizing (and distorting) to
  `side_length`x`side_length`.

  If `decode_image` is True (i.e., `image` is an encoded jpeg image string),
  when possible we crop before decoding, which can provide substantial speedups.

  Args:
    image: An image represented either as a 3D Tensor with any numeric DType or
      else as an encoded jpeg image string.
    decode_image: Whether `image` is an encoded jpeg image string or not.
    side_length: The side length, in both spatial dimensions, of the output
      image.
    crop_padding: When `is_training` is False, this determines how much padding
      to apply around the central square crop.
    area_range: List of floats. The cropped area of the image must contain a
      fraction of the supplied image within this range. Only relevant when
      `is_training` is True and `resize_only` is False.
    is_training: Whether this should operate in training (non-deterministic
      random crop window) or eval (deterministic central crop window) mode.
    resize_only: Whether to just resize the image to the target `side_length`
      without performing any cropping. This is likely to distort the image.
    eval_crop_method: The strategy for obtaining the desired square crop in eval
      mode. See EvalCropMethod for valid values.

  Returns:
    An image Tensor of shape [`side_length`, `side_length`, 3]. If `image` was
    provided then the output has the same dtype as `image`. If `image_bytes` was
    provided then the output dtype is tf.uint8.

  Raises:
    ValueError: If both or neither of `image` and `image_bytes` was passed.
  """
    with tf.name_scope('crop_to_square'):
        if not decode_image:
            image = _validate_image_dimensions(image)

        if resize_only:
            if decode_image:
                image = _decode_and_maybe_crop_image(image)
            resized = _resize_image(image, (side_length, side_length))
            return tf.ensure_shape(resized, [side_length, side_length, 3])

        image_shape = (tf.shape(image) if not decode_image else
                       tf.image.extract_jpeg_shape(image))
        if is_training:
            # During training, always crop then resize.
            crop_window = _distorted_crop_window(image_shape,
                                                 area_range=area_range)
            if decode_image:
                cropped = _decode_and_maybe_crop_image(
                    image, _convert_3d_crop_window_to_2d(crop_window))
            else:
                cropped = tf.slice(image, crop_window[:3], crop_window[3:])
            resized = _resize_image(cropped, [side_length, side_length])
            return tf.ensure_shape(resized, [side_length, side_length, 3])
        else:
            # For eval, the ordering depends on eval_crop_method.
            crop_frac = (side_length / (side_length + crop_padding))
            if eval_crop_method == enums.EvalCropMethod.RESIZE_THEN_CROP:
                if decode_image:
                    image = _decode_and_maybe_crop_image(image)
                resize_dim = side_length + crop_padding
                resized = _resize_to_min_dim(image, resize_dim)
                crop_window = _center_crop_window(tf.shape(resized),
                                                  crop_dim=side_length)
                cropped = tf.slice(resized, crop_window[:3], crop_window[3:])
                return tf.ensure_shape(cropped, [side_length, side_length, 3])
            elif eval_crop_method == enums.EvalCropMethod.CROP_THEN_RESIZE:
                crop_window = _center_crop_window(image_shape,
                                                  crop_frac=crop_frac)
                if decode_image:
                    cropped = _decode_and_maybe_crop_image(
                        image, _convert_3d_crop_window_to_2d(crop_window))
                else:
                    cropped = tf.slice(image, crop_window[:3], crop_window[3:])
                resized = _resize_image(cropped, [side_length, side_length])
                return tf.ensure_shape(resized, [side_length, side_length, 3])
            elif eval_crop_method == enums.EvalCropMethod.CROP_THEN_DISTORT:
                if decode_image:
                    image = _decode_and_maybe_crop_image(image)
                # Note that tf.image.central_crop does not produce a square crop. It
                # preserves the input aspect ratio.
                cropped = tf.image.central_crop(image,
                                                central_fraction=crop_frac)
                resized = _resize_image(cropped, [side_length, side_length])
                return tf.ensure_shape(resized, [side_length, side_length, 3])
            elif eval_crop_method == enums.EvalCropMethod.IDENTITY:
                if decode_image:
                    image = _decode_and_maybe_crop_image(image)
                return tf.ensure_shape(image, [side_length, side_length, 3])
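
The RESIZE_THEN_CROP eval path above relies on private helpers (_resize_to_min_dim, _center_crop_window) that are not shown here. Below is a minimal, self-contained sketch of the same idea, assuming TensorFlow 2.x, a decoded image Tensor, and illustrative values for side_length and crop_padding (224 and 32 are assumptions, not values from the original).

import tensorflow as tf

def center_crop_resize_then_crop(image, side_length=224, crop_padding=32):
    # Resize so the shorter side equals side_length + crop_padding, then take
    # a central side_length x side_length crop with tf.slice.
    shape = tf.shape(image)
    height = tf.cast(shape[0], tf.float32)
    width = tf.cast(shape[1], tf.float32)
    resize_min = tf.cast(side_length + crop_padding, tf.float32)
    scale = resize_min / tf.minimum(height, width)
    new_hw = tf.cast(tf.round(tf.stack([height, width]) * scale), tf.int32)
    resized = tf.image.resize(image, new_hw)
    offset_h = (new_hw[0] - side_length) // 2
    offset_w = (new_hw[1] - side_length) // 2
    return tf.slice(resized, [offset_h, offset_w, 0],
                    [side_length, side_length, -1])

With these example values the central fraction kept is side_length / (side_length + crop_padding) = 224 / 256 = 0.875, which is the same crop_frac used by the CROP_THEN_RESIZE and CROP_THEN_DISTORT branches.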
Example #11
def unstack_batch(tensor_dict, unpad_groundtruth_tensors=True):
    """Unstacks all tensors in `tensor_dict` along 0th dimension.

  Unstacks tensor from the tensor dict along 0th dimension and returns a
  tensor_dict containing values that are lists of unstacked, unpadded tensors.

  Tensors in the `tensor_dict` are expected to be of one of the three shapes:
  1. [batch_size]
  2. [batch_size, height, width, channels]
  3. [batch_size, num_boxes, d1, d2, ... dn]

  When unpad_groundtruth_tensors is set to true, unstacked tensors of form 3
  above are sliced along the `num_boxes` dimension using the value in tensor
  fields.InputDataFields.num_groundtruth_boxes.

  Note that this function has a static list of input data fields and has to be
  kept in sync with the InputDataFields defined in core/standard_fields.py

  Args:
    tensor_dict: A dictionary of batched groundtruth tensors.
    unpad_groundtruth_tensors: Whether to remove padding along `num_boxes`
      dimension of the groundtruth tensors.

  Returns:
    A dictionary where the keys are from fields.InputDataFields and values are
    a list of unstacked (optionally unpadded) tensors.

  Raises:
    ValueError: If unpad_groundtruth_tensors is True and `tensor_dict` does
      not contain the `num_groundtruth_boxes` tensor.
  """
    unbatched_tensor_dict = {
        key: tf.unstack(tensor)
        for key, tensor in tensor_dict.items()
    }
    if unpad_groundtruth_tensors:
        if (fields.InputDataFields.num_groundtruth_boxes
                not in unbatched_tensor_dict):
            raise ValueError(
                '`num_groundtruth_boxes` not found in tensor_dict. '
                'Keys available: {}'.format(unbatched_tensor_dict.keys()))
        unbatched_unpadded_tensor_dict = {}
        unpad_keys = set([
            # List of input data fields that are padded along the num_boxes
            # dimension. This list has to be kept in sync with InputDataFields in
            # standard_fields.py.
            fields.InputDataFields.groundtruth_instance_masks,
            fields.InputDataFields.groundtruth_classes,
            fields.InputDataFields.groundtruth_boxes,
            fields.InputDataFields.groundtruth_keypoints,
            fields.InputDataFields.groundtruth_keypoint_visibilities,
            fields.InputDataFields.groundtruth_group_of,
            fields.InputDataFields.groundtruth_difficult,
            fields.InputDataFields.groundtruth_is_crowd,
            fields.InputDataFields.groundtruth_area,
            fields.InputDataFields.groundtruth_weights
        ]).intersection(set(unbatched_tensor_dict.keys()))

        for key in unpad_keys:
            unpadded_tensor_list = []
            for num_gt, padded_tensor in zip(
                    unbatched_tensor_dict[
                        fields.InputDataFields.num_groundtruth_boxes],
                    unbatched_tensor_dict[key]):
                tensor_shape = shape_utils.combined_static_and_dynamic_shape(
                    padded_tensor)
                slice_begin = tf.zeros([len(tensor_shape)], dtype=tf.int32)
                slice_size = tf.stack(
                    [num_gt] +
                    [-1 if dim is None else dim for dim in tensor_shape[1:]])
                unpadded_tensor = tf.slice(padded_tensor, slice_begin,
                                           slice_size)
                unpadded_tensor_list.append(unpadded_tensor)
            unbatched_unpadded_tensor_dict[key] = unpadded_tensor_list

        unbatched_tensor_dict.update(unbatched_unpadded_tensor_dict)

    return unbatched_tensor_dict
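
To see the unpadding step in isolation: the snippet below (box values and counts invented for illustration) slices a zero-padded [max_boxes, 4] box tensor down to its true number of boxes, mirroring the tf.slice call above.

import tensorflow as tf

padded_boxes = tf.constant([[0.0, 0.0, 0.5, 0.5],
                            [0.2, 0.2, 0.8, 0.8],
                            [0.0, 0.0, 0.0, 0.0]])  # padded to max_boxes=3
num_groundtruth_boxes = tf.constant(2)
unpadded_boxes = tf.slice(padded_boxes, [0, 0],
                          tf.stack([num_groundtruth_boxes, 4]))
# unpadded_boxes has shape [2, 4]; the all-zero padding row is dropped.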
Example #12
    def _grow_alive_seq(self, state):
        """Grow alive sequences by one token, and collect top 2*beam_size sequences.

    2*beam_size sequences are collected because some sequences may have reached
    the EOS token. 2*beam_size ensures that at least beam_size sequences are
    still alive.

    Args:
      state: A dictionary with the current loop state.
    Returns:
      Tuple of
      (Top 2*beam_size sequences [batch_size, 2 * beam_size, cur_index + 1],
       Scores of returned sequences [batch_size, 2 * beam_size],
       New alive cache, for each of the 2 * beam_size sequences)
    """
        i = state[_StateKeys.CUR_INDEX]
        alive_seq = state[_StateKeys.ALIVE_SEQ]
        alive_log_probs = state[_StateKeys.ALIVE_LOG_PROBS]
        alive_cache = state[_StateKeys.ALIVE_CACHE]

        beams_to_keep = 2 * self.beam_size

        # Get logits for the next candidate IDs for the alive sequences. Get the new
        # cache values at the same time.
        if self.padded_decode:
            flat_ids = tf.reshape(
                tf.slice(alive_seq, [0, 0, i],
                         [self.batch_size, self.beam_size, 1]),
                [self.batch_size * self.beam_size, -1])
        else:
            flat_ids = _flatten_beam_dim(alive_seq)  # [batch_size * beam_size]
        flat_cache = tf.nest.map_structure(_flatten_beam_dim, alive_cache)

        flat_logits, flat_cache = self.symbols_to_logits_fn(
            flat_ids, i, flat_cache)

        # Unflatten logits to shape [batch_size, beam_size, vocab_size]
        logits = _unflatten_beam_dim(flat_logits, self.batch_size,
                                     self.beam_size)
        new_cache = tf.nest.map_structure(
            lambda t: _unflatten_beam_dim(t, self.batch_size, self.beam_size),
            flat_cache)

        # Convert logits to normalized log probs
        candidate_log_probs = _log_prob_from_logits(logits)

        # Calculate new log probabilities if each of the alive sequences were
        # extended by the candidate IDs.
        # Shape [batch_size, beam_size, vocab_size]
        log_probs = candidate_log_probs + tf.expand_dims(alive_log_probs,
                                                         axis=2)

        # Each batch item has beam_size * vocab_size candidate sequences. For each
        # batch item, get the k candidates with the highest log probabilities.
        flat_log_probs = tf.reshape(log_probs,
                                    [-1, self.beam_size * self.vocab_size])
        topk_log_probs, topk_indices = tf.nn.top_k(flat_log_probs,
                                                   k=beams_to_keep)

        # Extract the alive sequences that generate the highest log probabilities
        # after being extended.
        topk_beam_indices = topk_indices // self.vocab_size
        topk_seq, new_cache = _gather_beams([alive_seq, new_cache],
                                            topk_beam_indices, self.batch_size,
                                            beams_to_keep)

        # Append the most probable IDs to the topk sequences
        topk_ids = topk_indices % self.vocab_size
        if self.padded_decode:
            topk_seq = tf.transpose(topk_seq, perm=[2, 0, 1])
            topk_seq = tf.tensor_scatter_nd_update(topk_seq, [i + 1], topk_ids)
            topk_seq = tf.transpose(topk_seq, perm=[1, 2, 0])
        else:
            topk_ids = tf.expand_dims(topk_ids, axis=2)
            topk_seq = tf.concat([topk_seq, topk_ids], axis=2)
        return topk_seq, topk_log_probs, new_cache
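
A standalone sketch of the padded_decode slice above, with shapes invented for illustration: only the token at the current step i is pulled out of the statically shaped alive_seq before being flattened.

import tensorflow as tf

batch_size, beam_size, max_decode_length = 2, 3, 5
alive_seq = tf.zeros([batch_size, beam_size, max_decode_length], tf.int32)
i = 1  # current decode step
flat_ids = tf.reshape(
    tf.slice(alive_seq, [0, 0, i], [batch_size, beam_size, 1]),
    [batch_size * beam_size, -1])  # shape [batch_size * beam_size, 1]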
Example #13
def slice_layer(x, offsets, lengths):
    y = []
    for i in zip(offsets, lengths):
        y.append(tf.slice(x, [0, i[0]], [-1, i[1]]))
    return y
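
A quick usage example for slice_layer (values chosen arbitrarily): it splits the feature axis of a [batch, features] tensor into contiguous pieces.

import tensorflow as tf

x = tf.reshape(tf.range(12), [2, 6])            # batch of 2, 6 features each
parts = slice_layer(x, offsets=[0, 2], lengths=[2, 4])
# parts[0] has shape [2, 2] (features 0-1); parts[1] has shape [2, 4] (features 2-5).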
Example #14
 def filter(xt, k):
     xt = tf.transpose(xt)  # N x M
     xt = tf.reshape(xt, [-1, 1])  # NM x 1
     w = tf.slice(W, [k, 0], [1, -1])  # 1 x F
     y = tf.matmul(xt, w)  # NM x F
     return tf.reshape(y, [-1, M, self.F])  # N x M x F
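
The closure above captures W, M and self.F from its enclosing scope, so it does not run on its own. A self-contained sketch with those names passed in explicitly (shapes taken from the original comments) might look like:

import tensorflow as tf

def filter_row(xt, W, k, M, F):
    # xt: [M, N] input; W: [K, F] filter bank. Apply the k-th filter row.
    xt = tf.transpose(xt)             # N x M
    xt = tf.reshape(xt, [-1, 1])      # N*M x 1
    w = tf.slice(W, [k, 0], [1, -1])  # 1 x F
    y = tf.matmul(xt, w)              # N*M x F
    return tf.reshape(y, [-1, M, F])  # N x M x F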
Example #15
    def prepare_processing_graph(self, model_settings, summaries_dir):
        """Builds a TensorFlow graph to apply the input distortions.

    Creates a graph that loads a WAVE file, decodes it, scales the volume,
    shifts it in time, adds in background noise, calculates a spectrogram, and
    then builds an MFCC fingerprint from that.

    This must be called with an active TensorFlow session running, and it
    creates multiple placeholder inputs, and one output:

      - wav_filename_placeholder_: Filename of the WAV to load.
      - foreground_volume_placeholder_: How loud the main clip should be.
      - time_shift_padding_placeholder_: Where to pad the clip.
      - time_shift_offset_placeholder_: How much to move the clip in time.
      - background_data_placeholder_: PCM sample data for background noise.
      - background_volume_placeholder_: Loudness of mixed-in background.
      - output_: Output 2D fingerprint of processed audio.

    Args:
      model_settings: Information about the current model being trained.
      summaries_dir: Path to save training summary information to.

    Raises:
      ValueError: If the preprocessing mode isn't recognized.
      Exception: If the preprocessor wasn't compiled in.
    """
        with tf.get_default_graph().name_scope('data'):
            desired_samples = model_settings['desired_samples']
            self.wav_filename_placeholder_ = tf.placeholder(
                tf.string, [], name='wav_filename')
            wav_loader = io_ops.read_file(self.wav_filename_placeholder_)
            wav_decoder = audio.decode_wav(wav_loader,
                                           desired_channels=1,
                                           desired_samples=desired_samples)
            # Allow the audio sample's volume to be adjusted.
            self.foreground_volume_placeholder_ = tf.placeholder(
                tf.float32, [], name='foreground_volume')
            scaled_foreground = tf.multiply(
                wav_decoder.audio, self.foreground_volume_placeholder_)
            # Shift the sample's start position, and pad any gaps with zeros.
            self.time_shift_padding_placeholder_ = tf.placeholder(
                tf.int32, [2, 2], name='time_shift_padding')
            self.time_shift_offset_placeholder_ = tf.placeholder(
                tf.int32, [2], name='time_shift_offset')
            padded_foreground = tf.pad(
                tensor=scaled_foreground,
                paddings=self.time_shift_padding_placeholder_,
                mode='CONSTANT')
            sliced_foreground = tf.slice(padded_foreground,
                                         self.time_shift_offset_placeholder_,
                                         [desired_samples, -1])
            # Mix in background noise.
            self.background_data_placeholder_ = tf.placeholder(
                tf.float32, [desired_samples, 1], name='background_data')
            self.background_volume_placeholder_ = tf.placeholder(
                tf.float32, [], name='background_volume')
            background_mul = tf.multiply(self.background_data_placeholder_,
                                         self.background_volume_placeholder_)
            background_add = tf.add(background_mul, sliced_foreground)
            background_clamp = tf.clip_by_value(background_add, -1.0, 1.0)
            # Run the spectrogram and MFCC ops to get a 2D 'fingerprint' of the audio.
            spectrogram = audio_ops.audio_spectrogram(
                background_clamp,
                window_size=model_settings['window_size_samples'],
                stride=model_settings['window_stride_samples'],
                magnitude_squared=True)
            tf.summary.image('spectrogram',
                             tf.expand_dims(spectrogram, -1),
                             max_outputs=1)
            # The number of buckets in each FFT row in the spectrogram will depend on
            # how many input samples there are in each window. This can be quite
            # large, with a 160 sample window producing 127 buckets for example. We
            # don't need this level of detail for classification, so we often want to
            # shrink them down to produce a smaller result. That's what this section
            # implements. One method is to use average pooling to merge adjacent
            # buckets, but a more sophisticated approach is to apply the MFCC
            # algorithm to shrink the representation.
            if model_settings['preprocess'] == 'average':
                self.output_ = tf.nn.pool(
                    input=tf.expand_dims(spectrogram, -1),
                    window_shape=[1, model_settings['average_window_width']],
                    strides=[1, model_settings['average_window_width']],
                    pooling_type='AVG',
                    padding='SAME')
                tf.summary.image('shrunk_spectrogram',
                                 self.output_,
                                 max_outputs=1)
            elif model_settings['preprocess'] == 'mfcc':
                self.output_ = audio_ops.mfcc(
                    spectrogram,
                    wav_decoder.sample_rate,
                    dct_coefficient_count=model_settings['fingerprint_width'])
                tf.summary.image('mfcc',
                                 tf.expand_dims(self.output_, -1),
                                 max_outputs=1)
            elif model_settings['preprocess'] == 'micro':
                if not frontend_op:
                    raise Exception(
                        'Micro frontend op is currently not available when running'
                        ' TensorFlow directly from Python, you need to build and run'
                        ' through Bazel')
                sample_rate = model_settings['sample_rate']
                window_size_ms = (model_settings['window_size_samples'] *
                                  1000) / sample_rate
                window_step_ms = (model_settings['window_stride_samples'] *
                                  1000) / sample_rate
                int16_input = tf.cast(tf.multiply(background_clamp, 32768),
                                      tf.int16)
                micro_frontend = frontend_op.audio_microfrontend(
                    int16_input,
                    sample_rate=sample_rate,
                    window_size=window_size_ms,
                    window_step=window_step_ms,
                    num_channels=model_settings['fingerprint_width'],
                    out_scale=1,
                    out_type=tf.float32)
                self.output_ = tf.multiply(micro_frontend, (10.0 / 256.0))
                tf.summary.image('micro',
                                 tf.expand_dims(
                                     tf.expand_dims(self.output_, -1), 0),
                                 max_outputs=1)
            else:
                raise ValueError(
                    'Unknown preprocess mode "%s" (should be "mfcc", '
                    ' "average", or "micro")' % (model_settings['preprocess']))

            # Merge all the summaries and write them out to /tmp/retrain_logs (by
            # default)
            self.merged_summaries_ = tf.summary.merge_all(scope='data')
            if summaries_dir:
                self.summary_writer_ = tf.summary.FileWriter(
                    summaries_dir + '/data', tf.get_default_graph())
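
The time-shift step above, pad then slice back to desired_samples, can be exercised on a toy waveform (numbers invented for illustration):

import tensorflow as tf

desired_samples = 6
waveform = tf.reshape(tf.range(1.0, 7.0), [desired_samples, 1])  # [6, 1] mono clip
# Shift the clip 2 samples later: pad 2 zeros at the front, then slice a
# desired_samples-long window starting at offset 0.
time_shift_padding = [[2, 0], [0, 0]]
time_shift_offset = [0, 0]
padded = tf.pad(waveform, time_shift_padding, mode='CONSTANT')
shifted = tf.slice(padded, time_shift_offset, [desired_samples, -1])
# shifted == [[0], [0], [1], [2], [3], [4]]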
Example #16
    def train(imPath, logPath, modelPath, pmPath, nTrain, nValid, nTest,
              restoreVariables, nSteps, gpuIndex, testPMIndex):
        os.environ['CUDA_VISIBLE_DEVICES'] = '%d' % gpuIndex

        outLogPath = logPath
        trainWriterPath = pathjoin(logPath, 'Train')
        validWriterPath = pathjoin(logPath, 'Valid')
        outModelPath = pathjoin(modelPath, 'model.ckpt')
        outPMPath = pmPath

        batchSize = UNet2D.hp['batchSize']
        imSize = UNet2D.hp['imSize']
        nChannels = UNet2D.hp['nChannels']
        nClasses = UNet2D.hp['nClasses']

        # --------------------------------------------------
        # data
        # --------------------------------------------------

        Train = np.zeros((nTrain, imSize, imSize, nChannels))
        Valid = np.zeros((nValid, imSize, imSize, nChannels))
        Test = np.zeros((nTest, imSize, imSize, nChannels))
        LTrain = np.zeros((nTrain, imSize, imSize, nClasses))
        LValid = np.zeros((nValid, imSize, imSize, nClasses))
        LTest = np.zeros((nTest, imSize, imSize, nClasses))

        print('loading data, computing mean / st dev')
        if not os.path.exists(modelPath):
            os.makedirs(modelPath)
        if restoreVariables:
            datasetMean = loadData(pathjoin(modelPath, 'datasetMean.data'))
            datasetStDev = loadData(pathjoin(modelPath, 'datasetStDev.data'))
        else:
            datasetMean = 0
            datasetStDev = 0
            for iSample in range(nTrain + nValid + nTest):
                I = im2double(tifread('%s/I%05d_Img.tif' % (imPath, iSample)))
                datasetMean += np.mean(I)
                datasetStDev += np.std(I)
            datasetMean /= (nTrain + nValid + nTest)
            datasetStDev /= (nTrain + nValid + nTest)
            saveData(datasetMean, pathjoin(modelPath, 'datasetMean.data'))
            saveData(datasetStDev, pathjoin(modelPath, 'datasetStDev.data'))

        perm = np.arange(nTrain + nValid + nTest)
        np.random.shuffle(perm)

        for iSample in range(0, nTrain):
            path = '%s/I%05d_Img.tif' % (imPath, perm[iSample])
            im = im2double(tifread(path))
            Train[iSample, :, :, 0] = (im - datasetMean) / datasetStDev
            path = '%s/I%05d_Ant.tif' % (imPath, perm[iSample])
            im = tifread(path)
            for i in range(nClasses):
                LTrain[iSample, :, :, i] = (im == i + 1)

        for iSample in range(0, nValid):
            path = '%s/I%05d_Img.tif' % (imPath, perm[nTrain + iSample])
            im = im2double(tifread(path))
            Valid[iSample, :, :, 0] = (im - datasetMean) / datasetStDev
            path = '%s/I%05d_Ant.tif' % (imPath, perm[nTrain + iSample])
            im = tifread(path)
            for i in range(nClasses):
                LValid[iSample, :, :, i] = (im == i + 1)

        for iSample in range(0, nTest):
            path = '%s/I%05d_Img.tif' % (imPath,
                                         perm[nTrain + nValid + iSample])
            im = im2double(tifread(path))
            Test[iSample, :, :, 0] = (im - datasetMean) / datasetStDev
            path = '%s/I%05d_Ant.tif' % (imPath,
                                         perm[nTrain + nValid + iSample])
            im = tifread(path)
            for i in range(nClasses):
                LTest[iSample, :, :, i] = (im == i + 1)

        # --------------------------------------------------
        # optimization
        # --------------------------------------------------

        tfLabels = tf.placeholder("float",
                                  shape=[None, imSize, imSize, nClasses],
                                  name='labels')

        globalStep = tf.Variable(0, trainable=False)
        learningRate0 = 0.01
        decaySteps = 1000
        decayRate = 0.95
        learningRate = tf.train.exponential_decay(learningRate0,
                                                  globalStep,
                                                  decaySteps,
                                                  decayRate,
                                                  staircase=True)

        with tf.name_scope('optim'):
            loss = tf.reduce_mean(
                -tf.reduce_sum(tf.multiply(tfLabels, tf.log(UNet2D.nn)), 3))
            updateOps = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
            # optimizer = tf.train.MomentumOptimizer(1e-3,0.9)
            optimizer = tf.train.MomentumOptimizer(learningRate, 0.9)
            # optimizer = tf.train.GradientDescentOptimizer(learningRate)
            with tf.control_dependencies(updateOps):
                optOp = optimizer.minimize(loss, global_step=globalStep)

        with tf.name_scope('eval'):
            error = []
            for iClass in range(nClasses):
                labels0 = tf.reshape(
                    tf.to_int32(
                        tf.slice(tfLabels, [0, 0, 0, iClass],
                                 [-1, -1, -1, 1])),
                    [batchSize, imSize, imSize])
                predict0 = tf.reshape(
                    tf.to_int32(tf.equal(tf.argmax(UNet2D.nn, 3), iClass)),
                    [batchSize, imSize, imSize])
                correct = tf.multiply(labels0, predict0)
                nCorrect0 = tf.reduce_sum(correct)
                nLabels0 = tf.reduce_sum(labels0)
                error.append(1 -
                             tf.to_float(nCorrect0) / tf.to_float(nLabels0))
            errors = tf.tuple(error)

        # --------------------------------------------------
        # inspection
        # --------------------------------------------------

        with tf.name_scope('scalars'):
            tf.summary.scalar('avg_cross_entropy', loss)
            for iClass in range(nClasses):
                tf.summary.scalar('avg_pixel_error_%d' % iClass, error[iClass])
            tf.summary.scalar('learning_rate', learningRate)
        with tf.name_scope('images'):
            split0 = tf.slice(UNet2D.nn, [0, 0, 0, 0], [-1, -1, -1, 1])
            split1 = tf.slice(UNet2D.nn, [0, 0, 0, 1], [-1, -1, -1, 1])
            if nClasses > 2:
                split2 = tf.slice(UNet2D.nn, [0, 0, 0, 2], [-1, -1, -1, 1])
            tf.summary.image('pm0', split0)
            tf.summary.image('pm1', split1)
            if nClasses > 2:
                tf.summary.image('pm2', split2)
        merged = tf.summary.merge_all()

        # --------------------------------------------------
        # session
        # --------------------------------------------------

        saver = tf.train.Saver()
        sess = tf.Session(
            config=tf.ConfigProto(allow_soft_placement=True)
        )  # config parameter needed to save variables when using GPU

        if os.path.exists(outLogPath):
            shutil.rmtree(outLogPath)
        trainWriter = tf.summary.FileWriter(trainWriterPath, sess.graph)
        validWriter = tf.summary.FileWriter(validWriterPath, sess.graph)

        if restoreVariables:
            saver.restore(sess, outModelPath)
            print("Model restored.")
        else:
            sess.run(tf.global_variables_initializer())

        # --------------------------------------------------
        # train
        # --------------------------------------------------

        batchData = np.zeros((batchSize, imSize, imSize, nChannels))
        batchLabels = np.zeros((batchSize, imSize, imSize, nClasses))
        for i in range(nSteps):
            # train

            perm = np.arange(nTrain)
            np.random.shuffle(perm)

            for j in range(batchSize):
                batchData[j, :, :, :] = Train[perm[j], :, :, :]
                batchLabels[j, :, :, :] = LTrain[perm[j], :, :, :]

            summary, _ = sess.run(
                [merged, optOp],
                feed_dict={
                    UNet2D.tfData: batchData,
                    tfLabels: batchLabels,
                    UNet2D.tfTraining: 1
                })
            trainWriter.add_summary(summary, i)

            # validation

            perm = np.arange(nValid)
            np.random.shuffle(perm)

            for j in range(batchSize):
                batchData[j, :, :, :] = Valid[perm[j], :, :, :]
                batchLabels[j, :, :, :] = LValid[perm[j], :, :, :]

            summary, es = sess.run(
                [merged, errors],
                feed_dict={
                    UNet2D.tfData: batchData,
                    tfLabels: batchLabels,
                    UNet2D.tfTraining: 0
                })
            validWriter.add_summary(summary, i)

            e = np.mean(es)
            print('step %05d, e: %f' % (i, e))

            if i == 0:
                if restoreVariables:
                    lowestError = e
                else:
                    lowestError = np.inf

            if np.mod(i, 100) == 0 and e < lowestError:
                lowestError = e
                print("Model saved in file: %s" %
                      saver.save(sess, outModelPath))

        # --------------------------------------------------
        # test
        # --------------------------------------------------

        if not os.path.exists(outPMPath):
            os.makedirs(outPMPath)

        for i in range(nTest):
            j = np.mod(i, batchSize)

            batchData[j, :, :, :] = Test[i, :, :, :]
            batchLabels[j, :, :, :] = LTest[i, :, :, :]

            if j == batchSize - 1 or i == nTest - 1:

                output = sess.run(UNet2D.nn,
                                  feed_dict={
                                      UNet2D.tfData: batchData,
                                      tfLabels: batchLabels,
                                      UNet2D.tfTraining: 0
                                  })

                for k in range(j + 1):
                    pm = output[k, :, :, testPMIndex]
                    gt = batchLabels[k, :, :, testPMIndex]
                    im = np.sqrt(normalize(batchData[k, :, :, 0]))
                    imwrite(
                        np.uint8(255 * np.concatenate(
                            (im, np.concatenate((pm, gt), axis=1)), axis=1)),
                        '%s/I%05d.png' % (outPMPath, i - j + k + 1))

        # --------------------------------------------------
        # save hyper-parameters, clean-up
        # --------------------------------------------------

        saveData(UNet2D.hp, pathjoin(modelPath, 'hp.data'))

        trainWriter.close()
        validWriter.close()
        sess.close()
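
The per-class error in the eval block above starts by slicing a single class channel out of the one-hot label volume; here is a minimal standalone sketch of that step with toy shapes (label values invented for illustration):

import tensorflow as tf

batchSize, imSize, nClasses = 1, 2, 3
labels = tf.constant([[[[1., 0., 0.],
                        [0., 1., 0.]],
                       [[0., 0., 1.],
                        [1., 0., 0.]]]])  # [1, 2, 2, 3] one-hot labels
iClass = 1
labels_i = tf.slice(labels, [0, 0, 0, iClass], [-1, -1, -1, 1])  # [1, 2, 2, 1]
labels_i = tf.reshape(tf.cast(labels_i, tf.int32), [batchSize, imSize, imSize])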
Example #17
    def prepare_processing_graph(self, flags):
        """Builds a TensorFlow graph to apply the input distortions.

    Creates a graph that loads a WAVE file, decodes it, scales the volume,
    shifts it in time, adds in background noise, calculates a spectrogram, and
    then builds an MFCC fingerprint from that.

    This must be called with an active TensorFlow session running, and it
    creates multiple placeholder inputs, and one output:

      - wav_filename_placeholder_: Filename of the WAV to load.
      - foreground_volume_placeholder_: How loud the main clip should be.
      - foreground_resampling_placeholder_: Controls signal stretching/squeezing
      - time_shift_padding_placeholder_: Where to pad the clip.
      - time_shift_offset_placeholder_: How much to move the clip in time.
      - background_data_placeholder_: PCM sample data for background noise.
      - background_volume_placeholder_: Loudness of mixed-in background.
      - output_: Output 2D fingerprint of processed audio or raw audio.

    Args:
      flags: data and model parameters, described at model_train.py

    Raises:
      ValueError: If the preprocessing mode isn't recognized.
      Exception: If the preprocessor wasn't compiled in.
    """
        with tf.get_default_graph().name_scope('data'):
            desired_samples = flags.desired_samples
            self.wav_filename_placeholder_ = tf.placeholder(
                tf.string, [], name='wav_filename')
            wav_loader = io_ops.read_file(self.wav_filename_placeholder_)
            wav_decoder = tf.audio.decode_wav(wav_loader,
                                              desired_channels=1,
                                              desired_samples=desired_samples)

            # Allow the audio sample's volume to be adjusted.
            self.foreground_volume_placeholder_ = tf.placeholder(
                tf.float32, [], name='foreground_volume')
            # Signal resampling to generate more training data: it will
            # stretch or squeeze the input signal proportionally to this value.
            self.foreground_resampling_placeholder_ = tf.placeholder(
                tf.float32, [])

            if self.foreground_resampling_placeholder_ != 1.0:
                image = tf.expand_dims(wav_decoder.audio, 0)
                image = tf.expand_dims(image, 2)
                shape = tf.shape(wav_decoder.audio)
                image_resized = tf.image.resize(
                    images=image,
                    size=(tf.cast((tf.cast(shape[0], tf.float32) *
                                   self.foreground_resampling_placeholder_),
                                  tf.int32), 1),
                    preserve_aspect_ratio=False)
                image_resized_cropped = tf.image.resize_with_crop_or_pad(
                    image_resized,
                    target_height=desired_samples,
                    target_width=1,
                )
                image_resized_cropped = tf.squeeze(image_resized_cropped,
                                                   axis=[0, 3])
                scaled_foreground = tf.multiply(
                    image_resized_cropped, self.foreground_volume_placeholder_)
            else:
                scaled_foreground = tf.multiply(
                    wav_decoder.audio, self.foreground_volume_placeholder_)
            # Shift the sample's start position, and pad any gaps with zeros.
            self.time_shift_padding_placeholder_ = tf.placeholder(
                tf.int32, [2, 2], name='time_shift_padding')
            self.time_shift_offset_placeholder_ = tf.placeholder(
                tf.int32, [2], name='time_shift_offset')
            padded_foreground = tf.pad(
                tensor=scaled_foreground,
                paddings=self.time_shift_padding_placeholder_,
                mode='CONSTANT')
            sliced_foreground = tf.slice(padded_foreground,
                                         self.time_shift_offset_placeholder_,
                                         [desired_samples, -1])
            # Mix in background noise.
            self.background_data_placeholder_ = tf.placeholder(
                tf.float32, [desired_samples, 1], name='background_data')
            self.background_volume_placeholder_ = tf.placeholder(
                tf.float32, [], name='background_volume')
            background_mul = tf.multiply(self.background_data_placeholder_,
                                         self.background_volume_placeholder_)
            background_add = tf.add(background_mul, sliced_foreground)
            background_clamp = tf.clip_by_value(background_add, -1.0, 1.0)

            if flags.preprocess == 'raw':
                # background_clamp dims: [time, channels]
                # remove channel dim
                self.output_ = tf.squeeze(background_clamp, axis=1)
            # The options below are kept for backward compatibility with the
            # previous version of hotword detection on microcontrollers. In
            # this case audio feature extraction is done separately from the
            # neural net and the user will have to manage it.
            elif flags.preprocess == 'mfcc':
                # Run the spectrogram op (short-time FFTs) and the MFCC op to
                # get a 2D audio feature.
                # background_clamp dims: [time, channels]
                spectrogram = audio_ops.audio_spectrogram(
                    background_clamp,
                    window_size=flags.window_size_samples,
                    stride=flags.window_stride_samples,
                    magnitude_squared=flags.fft_magnitude_squared)
                # spectrogram: [channels/batch, frames, fft_feature]

                # extract mfcc features from spectrogram by audio_ops.mfcc:
                # 1 Input is spectrogram frames.
                # 2 Weight the spectrogram into bands using a triangular mel filterbank.
                # 3 Apply logarithmic scaling.
                # 4 Take a discrete cosine transform (DCT) and return the lowest
                #   dct_coefficient_count coefficients.
                mfcc = audio_ops.mfcc(
                    spectrogram=spectrogram,
                    sample_rate=flags.sample_rate,
                    upper_frequency_limit=flags.mel_upper_edge_hertz,
                    lower_frequency_limit=flags.mel_lower_edge_hertz,
                    filterbank_channel_count=flags.mel_num_bins,
                    dct_coefficient_count=flags.dct_num_features)
                # mfcc: [channels/batch, frames, dct_coefficient_count]
                # remove channel dim
                self.output_ = tf.squeeze(mfcc, axis=0)
            elif flags.preprocess == 'micro':
                if not frontend_op:
                    raise Exception(
                        'Micro frontend op is currently not available when running'
                        ' TensorFlow directly from Python, you need to build and run'
                        ' through Bazel')
                int16_input = tf.cast(
                    tf.multiply(background_clamp, MAX_ABS_INT16), tf.int16)
                # audio_microfrontend does:
                # 1. A slicing window function of raw audio
                # 2. Short-time FFTs
                # 3. Filterbank calculations
                # 4. Noise reduction
                # 5. PCAN Auto Gain Control
                # 6. Logarithmic scaling

                # int16_input dims: [time, channels]
                micro_frontend = frontend_op.audio_microfrontend(
                    int16_input,
                    sample_rate=flags.sample_rate,
                    window_size=flags.window_size_ms,
                    window_step=flags.window_stride_ms,
                    num_channels=flags.mel_num_bins,
                    upper_band_limit=flags.mel_upper_edge_hertz,
                    lower_band_limit=flags.mel_lower_edge_hertz,
                    out_scale=1,
                    out_type=tf.float32)
                # int16_input dims: [frames, num_channels]
                self.output_ = tf.multiply(micro_frontend, (10.0 / 256.0))
            else:
                raise ValueError(
                    'Unknown preprocess mode "%s" (should be "raw", '
                    ' "mfcc", or "micro")' % (flags.preprocess))
Example #18
def CustomCropImages(images, input_shape,
                     target_shape,
                     target_locations):
  """Crop a list of images at with a custom crop location and size.

  Args:
    images: List of tensors of shape [batch_size, h, w, c].
    input_shape: Shape [h, w, c] of the input images.
    target_shape: Shape [h, w] of the cropped output.
    target_locations: List of crop center coordinates tensors of shape [b, 2].
  Returns:
    crops: List of cropped tensors of shape [batch_size] + target_shape + [3].
  """
  if len(input_shape) != 3:
    raise ValueError(
        'The input shape has to be of the form (height, width, channels) '
        'but has len {}'.format(len(input_shape)))
  if len(target_shape) != 2:
    raise ValueError('The target shape has to be of the form (height, width) '
                     'but has len {}'.format(len(target_shape)))
  if len(images) != len(target_locations):
    raise ValueError('There should be one target location per image. Found {} '
                     'images for {} locations'.format(len(images),
                                                      len(target_locations)))
  if input_shape[0] == target_shape[0] and input_shape[1] == target_shape[1]:
    return [image for image in images]
  if input_shape[0] < target_shape[0] or input_shape[1] < target_shape[1]:
    raise ValueError('The target shape {} is larger than the input image size '
                     '{}'.format(target_shape, input_shape[:2]))
  assert_ops = []
  for image, target_location in zip(images, target_locations):
    # Assert all images have the same shape.
    assert_ops.append(
        tf.assert_equal(
            input_shape[:2],
            tf.shape(image)[1:3],
            message=('All images must have the same width and height '
                     'for CustomCropImages.')))

  with tf.control_dependencies(assert_ops):
    crops = []
    for image, target_location in zip(images, target_locations):
      # If bounding box is outside of image boundaries, move it
      x_coordinates = tf.slice(
          target_location,
          [0, 1], [tf.shape(target_location)[0], 1])
      y_coordinates = tf.slice(
          target_location,
          [0, 0], [tf.shape(target_location)[0], 1])

      x_coordinates = tf.math.maximum(
          tf.cast(x_coordinates, tf.float32),
          tf.cast(target_shape[1] // 2, tf.float32))
      y_coordinates = tf.math.maximum(
          tf.cast(y_coordinates, tf.float32),
          tf.cast(target_shape[0] // 2, tf.float32))
      x_coordinates = tf.math.minimum(
          tf.cast(x_coordinates, tf.float32),
          tf.cast(tf.shape(image)[2] - target_shape[1] // 2, tf.float32))
      y_coordinates = tf.math.minimum(
          tf.cast(y_coordinates, tf.float32),
          tf.cast(tf.shape(image)[1] - target_shape[0] // 2, tf.float32)
          )

      target_location = tf.concat([x_coordinates, y_coordinates], 1)
      crops.append(
          tf.image.extract_glimpse(image, target_shape, tf.cast(
              target_location, tf.float32), centered=False, normalized=False))
  return crops
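
The min/max arithmetic above simply clamps each crop center so the target window stays inside the image; for a single (x, y) center the same clamp can be written with tf.clip_by_value (numbers invented for illustration):

import tensorflow as tf

image_height, image_width = 480, 640
target_height, target_width = 100, 100
center = tf.constant([630.0, 20.0])  # (x, y), deliberately near the right/top edges
clamped_x = tf.clip_by_value(center[0], target_width // 2,
                             image_width - target_width // 2)    # -> 590.0
clamped_y = tf.clip_by_value(center[1], target_height // 2,
                             image_height - target_height // 2)  # -> 50.0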
Example #19
def main(_):
    # We want to see all the logging messages for this tutorial.
    tf.logging.set_verbosity(tf.logging.INFO)
    np.set_printoptions(threshold=np.inf, linewidth=10000)

    flags = vars(FLAGS)
    for key in sorted(flags.keys()):
        tf.logging.info('%s = %s', key, flags[key])

    if FLAGS.random_seed_weights != -1:
        tf.random.set_random_seed(FLAGS.random_seed_weights)

    # Start a new TensorFlow session.
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.allow_soft_placement = True
    #config.log_device_placement = False
    sess = tf.InteractiveSession(config=config)

    # Begin by making sure we have the training data we need. If you already have
    # training data of your own, use `--data_url= ` on the command line to avoid
    # downloading.

    label_count = len(
        input_data.prepare_words_list(FLAGS.wanted_words.split(','),
                                      FLAGS.silence_percentage,
                                      FLAGS.unknown_percentage))

    model_settings = models.prepare_model_settings(
        label_count, FLAGS.sample_rate, FLAGS.nchannels,
        FLAGS.clip_duration_ms, FLAGS.representation, FLAGS.window_size_ms,
        FLAGS.window_stride_ms, 1, FLAGS.dct_coefficient_count,
        FLAGS.filterbank_channel_count,
        [int(x) for x in FLAGS.filter_counts.split(',')],
        [int(x)
         for x in FLAGS.filter_sizes.split(',')], FLAGS.final_filter_len,
        FLAGS.dropout_prob, FLAGS.batch_size, FLAGS.dilate_after_layer,
        FLAGS.stride_after_layer, FLAGS.connection_type)

    fingerprint_size = model_settings['fingerprint_size']
    time_shift_samples = int((FLAGS.time_shift_ms * FLAGS.sample_rate) / 1000)
    # Figure out the learning rates for each training phase. Since it's often
    # effective to have high learning rates at the start of training, followed by
    # lower levels towards the end, the number of steps and learning rates can be
    # specified as comma-separated lists to define the rate at each stage. For
    # example --how_many_training_steps=10000,3000 --learning_rate=0.001,0.0001
    # will run 13,000 training loops in total, with a rate of 0.001 for the first
    # 10,000, and 0.0001 for the final 3,000.
    training_steps_list = list(
        map(int, FLAGS.how_many_training_steps.split(',')))
    learning_rates_list = list(map(float, FLAGS.learning_rate.split(',')))
    if len(training_steps_list) != len(learning_rates_list):
        raise Exception(
            '--how_many_training_steps and --learning_rate must be equal length '
            'lists, but are %d and %d long instead' %
            (len(training_steps_list), len(learning_rates_list)))

    actual_batch_size = tf.placeholder(tf.int32, [1])

    fingerprint_input = tf.placeholder(tf.float32, [None, fingerprint_size],
                                       name='fingerprint_input')

    hidden, logits, dropout_prob = models.create_model(
        fingerprint_input,
        model_settings,
        FLAGS.model_architecture,
        is_training=True)

    # Define loss and optimizer
    ground_truth_input = tf.placeholder(tf.int64, [None],
                                        name='groundtruth_input')

    # Optionally we can add runtime checks to spot when NaNs or other symptoms of
    # numerical errors start occurring during training.
    control_dependencies = []
    if FLAGS.check_nans:
        checks = tf.add_check_numerics_ops()
        control_dependencies = [checks]

    # Create the back propagation and training evaluation machinery in the graph.
    with tf.name_scope('cross_entropy'):
        cross_entropy_mean = tf.losses.sparse_softmax_cross_entropy(
            labels=tf.slice(ground_truth_input, [0], actual_batch_size),
            logits=tf.slice(logits, [0, 0],
                            tf.concat([actual_batch_size, [-1]], 0)))
    tf.summary.scalar('cross_entropy', cross_entropy_mean)
    with tf.name_scope('train'), tf.control_dependencies(control_dependencies):
        learning_rate_input = tf.placeholder(tf.float32, [],
                                             name='learning_rate_input')
        if FLAGS.optimizer == 'sgd':
            train_step = tf.train.GradientDescentOptimizer(
                learning_rate_input).minimize(cross_entropy_mean)
        elif FLAGS.optimizer == 'adam':
            train_step = tf.train.AdamOptimizer(learning_rate_input).minimize(
                cross_entropy_mean)
        elif FLAGS.optimizer == 'adagrad':
            train_step = tf.train.AdagradOptimizer(
                learning_rate_input).minimize(cross_entropy_mean)
        elif FLAGS.optimizer == 'rmsprop':
            train_step = tf.train.RMSPropOptimizer(
                learning_rate_input).minimize(cross_entropy_mean)
    predicted_indices = tf.argmax(logits, 1)
    correct_prediction = tf.equal(predicted_indices, ground_truth_input)
    confusion_matrix = tf.confusion_matrix(tf.slice(ground_truth_input, [0],
                                                    actual_batch_size),
                                           tf.slice(predicted_indices, [0],
                                                    actual_batch_size),
                                           num_classes=label_count)
    evaluation_step = tf.reduce_mean(
        tf.cast(tf.slice(correct_prediction, [0], actual_batch_size),
                tf.float32))
    tf.summary.scalar('accuracy', evaluation_step)

    global_step = tf.train.get_or_create_global_step()
    increment_global_step = tf.assign(global_step, global_step + 1)

    saver = tf.train.Saver(tf.global_variables(), max_to_keep=0)

    # Merge all the summaries and write them out to /tmp/retrain_logs (by default)
    merged_summaries = tf.summary.merge_all()
    train_writer = tf.summary.FileWriter(FLAGS.summaries_dir + '/train',
                                         sess.graph)
    validation_writer = tf.summary.FileWriter(FLAGS.summaries_dir +
                                              '/validation')

    tf.global_variables_initializer().run()

    start_step = 1

    if FLAGS.start_checkpoint:
        models.load_variables_from_checkpoint(sess, FLAGS.start_checkpoint)
        start_step = 1 + global_step.eval(session=sess)

    t0 = dt.datetime.now()
    tf.logging.info('Training from time %s, step: %d ', t0.isoformat(),
                    start_step)

    # Save graph.pbtxt.
    tf.train.write_graph(sess.graph_def, FLAGS.train_dir,
                         FLAGS.model_architecture + '.pbtxt')

    # Save list of words.
    if FLAGS.start_checkpoint == '':
        with gfile.GFile(os.path.join(FLAGS.train_dir, \
                                      FLAGS.model_architecture + '_labels.txt'), 'w') as f:
            f.write(FLAGS.wanted_words.replace(',', '\n'))

    # log complexity of model
    total_parameters = 0
    for variable in tf.trainable_variables():
        shape = variable.get_shape()
        variable_parameters = 1
        for dim in shape:
            variable_parameters *= int(dim)
        total_parameters += variable_parameters
    tf.logging.info('number of trainable parameters: %d', total_parameters)

    checkpoint_path = os.path.join(FLAGS.train_dir,
                                   FLAGS.model_architecture + '.ckpt')
    if FLAGS.start_checkpoint == '':
        tf.logging.info('Saving to "%s-%d"', checkpoint_path, 0)
        saver.save(sess, checkpoint_path, global_step=0)

    audio_processor = input_data.AudioProcessor(
        FLAGS.data_url, FLAGS.data_dir, FLAGS.silence_percentage,
        FLAGS.unknown_percentage, FLAGS.wanted_words.split(','),
        FLAGS.labels_touse.split(','),
        FLAGS.validation_percentage, FLAGS.validation_offset_percentage,
        FLAGS.validation_files.split(','), FLAGS.testing_percentage,
        FLAGS.testing_files.split(','), FLAGS.subsample_skip,
        FLAGS.subsample_word, FLAGS.partition_word, FLAGS.partition_n,
        FLAGS.partition_training_files.split(','),
        FLAGS.partition_validation_files.split(','), FLAGS.random_seed_batch,
        FLAGS.testing_equalize_ratio, FLAGS.testing_max_samples,
        model_settings)

    # exit if how_many_training_steps==0
    if FLAGS.how_many_training_steps == '0':
        # pre-process a batch of data to make sure settings are valid
        train_fingerprints, train_ground_truth, _ = audio_processor.get_data(
            FLAGS.batch_size, 0, model_settings, FLAGS.background_frequency,
            FLAGS.background_volume, time_shift_samples,
            FLAGS.time_shift_random, 'training', sess)
        sess.run(
            [evaluation_step],
            feed_dict={
                fingerprint_input: train_fingerprints,
                ground_truth_input: train_ground_truth,
                learning_rate_input: learning_rates_list[0],
                actual_batch_size: [FLAGS.batch_size],
                dropout_prob: model_settings['dropout_prob']
            })
        return

    training_set_size = audio_processor.set_size('training')
    testing_set_size = audio_processor.set_size('testing')
    validation_set_size = audio_processor.set_size('validation')

    # Training loop.
    training_steps_max = np.sum(training_steps_list)
    for training_step in xrange(start_step, training_steps_max + 1):
        if training_set_size > 0 and FLAGS.save_step_interval > 0:
            # Figure out what the current learning rate is.
            training_steps_sum = 0
            for i in range(len(training_steps_list)):
                training_steps_sum += training_steps_list[i]
                if training_step <= training_steps_sum:
                    learning_rate_value = learning_rates_list[i]
                    break
            # Pull the audio samples we'll use for training.
            train_fingerprints, train_ground_truth, _ = audio_processor.get_data(
                FLAGS.batch_size, 0, model_settings,
                FLAGS.background_frequency, FLAGS.background_volume,
                time_shift_samples, FLAGS.time_shift_random, 'training', sess)
            # Run the graph with this batch of training data.
            train_summary, train_accuracy, cross_entropy_value, _, _ = sess.run(
                [
                    merged_summaries, evaluation_step, cross_entropy_mean,
                    train_step, increment_global_step
                ],
                feed_dict={
                    fingerprint_input: train_fingerprints,
                    ground_truth_input: train_ground_truth,
                    learning_rate_input: learning_rate_value,
                    actual_batch_size: [FLAGS.batch_size],
                    dropout_prob: model_settings['dropout_prob']
                })
            train_writer.add_summary(train_summary, training_step)
            t1 = dt.datetime.now() - t0
            tf.logging.info(
                'Elapsed %f, Step #%d: rate %f, accuracy %.1f%%, cross entropy %f'
                % (t1.total_seconds(), training_step, learning_rate_value,
                   train_accuracy * 100, cross_entropy_value))

            # Save the model checkpoint periodically.
            if (training_step % FLAGS.save_step_interval == 0
                    or training_step == training_steps_max):
                tf.logging.info('Saving to "%s-%d"', checkpoint_path,
                                training_step)
                saver.save(sess, checkpoint_path, global_step=training_step)

        is_last_step = (training_step == training_steps_max)
        if validation_set_size > 0 and (is_last_step or
                                        (training_step %
                                         FLAGS.eval_step_interval) == 0):
            validate_and_test('validation', validation_set_size, model_settings, \
                              time_shift_samples, sess, merged_summaries, evaluation_step, \
                              confusion_matrix, logits, hidden, validation_writer, \
                              audio_processor, is_last_step, fingerprint_input, \
                              ground_truth_input, actual_batch_size, dropout_prob, \
                              training_step, t0)
    if testing_set_size > 0:
        validate_and_test('testing', testing_set_size, model_settings, time_shift_samples, \
                          sess, merged_summaries, evaluation_step, confusion_matrix, \
                          logits, hidden, validation_writer, audio_processor, \
                          True, fingerprint_input, ground_truth_input, \
                          actual_batch_size, dropout_prob, training_steps_max, t0)
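
The loss, confusion matrix and accuracy above all slice their inputs down to the actual batch size; the same pattern in isolation, with toy shapes invented for illustration:

import tensorflow as tf

label_count = 4
logits = tf.random.uniform([8, label_count])        # a padded batch of 8
ground_truth = tf.constant([0, 1, 2, 3, 0, 1, 2, 3], tf.int64)
actual_batch_size = [5]                             # only the first 5 are real
logits_used = tf.slice(logits, [0, 0], tf.concat([actual_batch_size, [-1]], 0))
labels_used = tf.slice(ground_truth, [0], actual_batch_size)
# logits_used: [5, 4], labels_used: [5]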
Example #20
def evolved_transformer_decoder(decoder_input,
                                encoder_output,
                                decoder_self_attention_bias,
                                encoder_decoder_attention_bias,
                                hparams,
                                cache=None,
                                decode_loop_step=None,
                                name="decoder",
                                nonpadding=None,
                                save_weights_to=None,
                                make_image_summary=True,
                                losses=None):
    """Evolved Transformer decoder. See arxiv.org/abs/1901.11117 for more details.

  Args:
    decoder_input: a Tensor.
    encoder_output: a Tensor.
    decoder_self_attention_bias: bias Tensor for self-attention (see
      common_attention.attention_bias()).
    encoder_decoder_attention_bias: bias Tensor for encoder-decoder attention
      (see common_attention.attention_bias()).
    hparams: hyperparameters for model.
    cache: dict, containing tensors which are the results of previous
      layers, used for fast decoding.
    decode_loop_step: An integer, step number of the decoding loop. Only used
      for inference on TPU.
    name: a string.
    nonpadding: optional Tensor with shape [batch_size, encoder_length]
      indicating what positions are not padding.  This is used to mask out
      padding in convolutional layers.  We generally only need this mask for
      "packed" datasets, because for ordinary datasets, no padding is ever
      followed by nonpadding.
    save_weights_to: an optional dictionary to capture attention weights for
      visualization; the weights tensor will be appended there under a string
      key created from the variable scope (including name).
    make_image_summary: Whether to make an attention image summary.
    losses: Not supported.

  Returns:
    Decoder output tensor.
  """
    del losses

    num_trainable_top_decoder_layers = hparams.get(
        "num_trainable_top_decoder_layers", -1)  # -1 means train all weights.

    if num_trainable_top_decoder_layers >= 0:
        encoder_output = tf.stop_gradient(encoder_output)

    attention_dropout_broadcast_dims = (
        common_layers.comma_separated_string_to_integer_list(
            getattr(hparams, "attention_dropout_broadcast_dims", "")))

    with tf.variable_scope(name):
        hidden_state = decoder_input

        num_layers = hparams.num_decoder_layers or hparams.num_hidden_layers
        for layer in range(num_layers):
            if num_trainable_top_decoder_layers == num_layers - layer:
                hidden_state = tf.stop_gradient(hidden_state)
            layer_name = "layer_%d" % layer
            layer_cache = cache[layer_name] if cache is not None else None
            with tf.variable_scope(layer_name):

                with tf.variable_scope(_SIXTEEN_HEAD_ATTENTION_NAME):
                    residual_state = hidden_state
                    hidden_state = common_layers.layer_preprocess(
                        hidden_state, hparams)

                    attention_cache = layer_cache[
                        _SIXTEEN_HEAD_ATTENTION_NAME] if layer_cache is not None else None
                    left_state = common_attention.multihead_attention(
                        hidden_state,
                        None,
                        decoder_self_attention_bias,
                        hparams.attention_key_channels or hparams.hidden_size,
                        hparams.attention_value_channels
                        or hparams.hidden_size,
                        hparams.hidden_size,
                        _capped_double_heads(hparams.num_heads),
                        hparams.attention_dropout,
                        attention_type=hparams.self_attention_type,
                        max_relative_position=hparams.max_relative_position,
                        heads_share_relative_embedding=(
                            hparams.heads_share_relative_embedding),
                        add_relative_to_values=hparams.add_relative_to_values,
                        save_weights_to=save_weights_to,
                        cache=attention_cache,
                        make_image_summary=make_image_summary,
                        dropout_broadcast_dims=attention_dropout_broadcast_dims,
                        max_length=hparams.get("max_length"),
                        decode_loop_step=decode_loop_step,
                        vars_3d=hparams.get("attention_variables_3d"),
                        activation_dtype=hparams.get("activation_dtype",
                                                     "float32"),
                        weight_dtype=hparams.get("weight_dtype", "float32"))

                if encoder_output is not None:
                    with tf.variable_scope(_FIRST_ATTEND_TO_ENCODER_NAME):
                        attention_cache = (
                            layer_cache[_FIRST_ATTEND_TO_ENCODER_NAME]
                            if layer_cache is not None else None)
                        right_state = common_attention.multihead_attention(
                            hidden_state,
                            encoder_output,
                            encoder_decoder_attention_bias,
                            hparams.attention_key_channels
                            or hparams.hidden_size,
                            hparams.attention_value_channels
                            or hparams.hidden_size,
                            hparams.hidden_size,
                            hparams.num_heads,
                            hparams.attention_dropout,
                            max_relative_position=hparams.max_relative_position,
                            heads_share_relative_embedding=(
                                hparams.heads_share_relative_embedding),
                            add_relative_to_values=hparams.add_relative_to_values,
                            save_weights_to=save_weights_to,
                            cache=attention_cache,
                            make_image_summary=make_image_summary,
                            dropout_broadcast_dims=attention_dropout_broadcast_dims,
                            max_length=hparams.get("max_length"),
                            vars_3d=hparams.get("attention_variables_3d"),
                            activation_dtype=hparams.get(
                                "activation_dtype", "float32"),
                            weight_dtype=hparams.get("weight_dtype",
                                                     "float32"))

                        left_state = tf.nn.dropout(
                            left_state,
                            1 - hparams.layer_prepostprocess_dropout)
                        right_state = tf.nn.dropout(
                            right_state,
                            1 - hparams.layer_prepostprocess_dropout)

                        hidden_state = residual_state + left_state + right_state

                else:
                    hidden_state = common_layers.layer_postprocess(
                        residual_state, left_state, hparams)

                with tf.variable_scope(_CONV_BRANCHES_NAME):
                    residual_state = hidden_state
                    hidden_state = common_layers.layer_preprocess(
                        hidden_state, hparams)

                    if nonpadding is not None:
                        # Mask padding from conv layers.
                        mask = tf.tile(tf.expand_dims(nonpadding, 2),
                                       [1, 1, hparams.hidden_size])
                        hidden_state *= mask

                    if layer_cache:
                        if decode_loop_step is None:
                            hidden_state = layer_cache[
                                _CONV_BRANCHES_FIRST_LAYER_NAME] = tf.concat(
                                    [
                                        layer_cache[
                                            _CONV_BRANCHES_FIRST_LAYER_NAME],
                                        hidden_state
                                    ],
                                    axis=1)[:, -1 * _DECODER_LEFT_CONV_PADDING - 1:, :]
                            left_state = hidden_state
                            right_state = hidden_state[
                                :, _DECODER_LEFT_CONV_PADDING -
                                _DECODER_RIGHT_CONV_PADDING:, :]

                        else:
                            # Inplace update is required for inference on TPU.
                            # Inplace_ops only supports inplace_update on the first dimension.
                            tmp = tf.transpose(
                                layer_cache[_CONV_BRANCHES_FIRST_LAYER_NAME],
                                perm=[1, 0, 2])
                            tmp = tf.expand_dims(tmp, axis=1)
                            tmp = inplace_ops.alias_inplace_update(
                                tmp,
                                decode_loop_step * tf.shape(hidden_state)[1] +
                                _DECODER_LEFT_CONV_PADDING,
                                tf.transpose(hidden_state, perm=[1, 0, 2]))
                            tmp = tf.squeeze(tmp, axis=1)
                            hidden_state = layer_cache[
                                _CONV_BRANCHES_FIRST_LAYER_NAME] = tf.transpose(
                                    tmp, perm=[1, 0, 2])

                            batch_size = hidden_state.shape.as_list()[0]
                            left_state = tf.slice(
                                hidden_state, [0, decode_loop_step, 0], [
                                    batch_size, _DECODER_LEFT_CONV_PADDING + 1,
                                    hparams.hidden_size
                                ])
                            right_state = tf.slice(hidden_state, [
                                0,
                                decode_loop_step + _DECODER_LEFT_CONV_PADDING -
                                _DECODER_RIGHT_CONV_PADDING, 0
                            ], [
                                batch_size, _DECODER_RIGHT_CONV_PADDING + 1,
                                hparams.hidden_size
                            ])

                    else:  # No caching.
                        left_state = tf.pad(
                            hidden_state,
                            paddings=[[0, 0], [_DECODER_LEFT_CONV_PADDING, 0],
                                      [0, 0]])
                        right_state = tf.pad(
                            hidden_state,
                            paddings=[[0, 0], [_DECODER_RIGHT_CONV_PADDING, 0],
                                      [0, 0]])

                    left_output_dim = int(hparams.hidden_size * 2)
                    separable_conv_11x1 = tf.layers.SeparableConv1D(
                        left_output_dim,
                        11,
                        padding="VALID",
                        name="separable_conv11x1",
                        activation=tf.nn.relu)
                    left_state = separable_conv_11x1.apply(left_state)
                    left_state = tf.nn.dropout(
                        left_state, 1 - hparams.layer_prepostprocess_dropout)

                    right_output_dim = int(hparams.hidden_size / 2)
                    separable_conv_7x1_1 = tf.layers.SeparableConv1D(
                        right_output_dim,
                        7,
                        padding="VALID",
                        name="separable_conv_7x1_1")
                    right_state = separable_conv_7x1_1.apply(right_state)
                    right_state = tf.nn.dropout(
                        right_state, 1 - hparams.layer_prepostprocess_dropout)
                    right_state = tf.pad(
                        right_state, [[0, 0], [0, 0],
                                      [0, left_output_dim - right_output_dim]],
                        constant_values=0)

                    hidden_state = left_state + right_state

                    hidden_state = common_layers.layer_preprocess(
                        hidden_state, hparams)
                    if nonpadding is not None:
                        # Mask padding from conv layers.
                        mask = tf.tile(tf.expand_dims(nonpadding, 2),
                                       [1, 1, hparams.hidden_size * 2])
                        hidden_state *= mask

                    if layer_cache:
                        if decode_loop_step is None:
                            hidden_state = layer_cache[
                                _CONV_BRANCHES_SECOND_LAYER_NAME] = tf.concat(
                                    [
                                        layer_cache[
                                            _CONV_BRANCHES_SECOND_LAYER_NAME],
                                        hidden_state
                                    ],
                                    axis=1)[:, -1 * _DECODER_FINAL_CONV_PADDING - 1:, :]

                        else:
                            # Inplace update is required for inference on TPU.
                            # Inplace_ops only supports inplace_update on the first dimension.
                            tmp = tf.transpose(
                                layer_cache[_CONV_BRANCHES_SECOND_LAYER_NAME],
                                perm=[1, 0, 2])
                            tmp = tf.expand_dims(tmp, axis=1)
                            tmp = inplace_ops.alias_inplace_update(
                                tmp, (decode_loop_step +
                                      _DECODER_FINAL_CONV_PADDING) *
                                tf.shape(hidden_state)[1],
                                tf.transpose(hidden_state, perm=[1, 0, 2]))
                            tmp = tf.squeeze(tmp, axis=1)
                            hidden_state = layer_cache[
                                _CONV_BRANCHES_SECOND_LAYER_NAME] = tf.transpose(
                                    tmp, perm=[1, 0, 2])

                            batch_size = hidden_state.shape.as_list()[0]
                            hidden_state = tf.slice(
                                hidden_state, [0, decode_loop_step, 0], [
                                    batch_size, _DECODER_FINAL_CONV_PADDING +
                                    1, hparams.hidden_size * 2
                                ])
                    else:
                        hidden_state = tf.pad(
                            hidden_state,
                            paddings=[[0, 0], [_DECODER_FINAL_CONV_PADDING, 0],
                                      [0, 0]])

                    separable_conv_7x1_2 = tf.layers.SeparableConv1D(
                        hparams.hidden_size,
                        7,
                        padding="VALID",
                        name="separable_conv_7x1_2")
                    hidden_state = separable_conv_7x1_2.apply(hidden_state)

                    hidden_state = common_layers.layer_postprocess(
                        residual_state, hidden_state, hparams)

                with tf.variable_scope(_VANILLA_ATTENTION_NAME):
                    residual_state = hidden_state
                    hidden_state = common_layers.layer_preprocess(
                        hidden_state, hparams)

                    attention_cache = layer_cache[
                        _VANILLA_ATTENTION_NAME] if layer_cache is not None else None
                    hidden_state = common_attention.multihead_attention(
                        hidden_state,
                        None,
                        decoder_self_attention_bias,
                        hparams.attention_key_channels or hparams.hidden_size,
                        hparams.attention_value_channels
                        or hparams.hidden_size,
                        hparams.hidden_size,
                        hparams.num_heads,
                        hparams.attention_dropout,
                        attention_type=hparams.self_attention_type,
                        max_relative_position=hparams.max_relative_position,
                        heads_share_relative_embedding=(
                            hparams.heads_share_relative_embedding),
                        add_relative_to_values=hparams.add_relative_to_values,
                        save_weights_to=save_weights_to,
                        cache=attention_cache,
                        make_image_summary=make_image_summary,
                        dropout_broadcast_dims=attention_dropout_broadcast_dims,
                        max_length=hparams.get("max_length"),
                        decode_loop_step=decode_loop_step,
                        vars_3d=hparams.get("attention_variables_3d"),
                        activation_dtype=hparams.get("activation_dtype",
                                                     "float32"),
                        weight_dtype=hparams.get("weight_dtype", "float32"))
                    hidden_state = common_layers.layer_postprocess(
                        residual_state, hidden_state, hparams)

                if encoder_output is not None:
                    with tf.variable_scope(_SECOND_ATTEND_TO_ENCODER_NAME):
                        residual_state = hidden_state
                        hidden_state = common_layers.layer_preprocess(
                            hidden_state, hparams)

                        attention_cache = (
                            layer_cache[_SECOND_ATTEND_TO_ENCODER_NAME]
                            if layer_cache is not None else None)
                        hidden_state = common_attention.multihead_attention(
                            hidden_state,
                            encoder_output,
                            encoder_decoder_attention_bias,
                            hparams.attention_key_channels
                            or hparams.hidden_size,
                            hparams.attention_value_channels
                            or hparams.hidden_size,
                            hparams.hidden_size,
                            hparams.num_heads,
                            hparams.attention_dropout,
                            max_relative_position=hparams.max_relative_position,
                            heads_share_relative_embedding=(
                                hparams.heads_share_relative_embedding),
                            add_relative_to_values=hparams.add_relative_to_values,
                            save_weights_to=save_weights_to,
                            cache=attention_cache,
                            make_image_summary=make_image_summary,
                            dropout_broadcast_dims=attention_dropout_broadcast_dims,
                            max_length=hparams.get("max_length"),
                            vars_3d=hparams.get("attention_variables_3d"),
                            activation_dtype=hparams.get(
                                "activation_dtype", "float32"),
                            weight_dtype=hparams.get("weight_dtype",
                                                     "float32"))
                        hidden_state = common_layers.layer_postprocess(
                            residual_state, hidden_state, hparams)

                with tf.variable_scope("dense_layers"):
                    residual_state = hidden_state
                    hidden_state = common_layers.layer_preprocess(
                        hidden_state, hparams)

                    hidden_state = tf.layers.dense(hidden_state,
                                                   int(hparams.hidden_size *
                                                       4),
                                                   activation=tf.nn.swish)
                    hidden_state = tf.nn.dropout(
                        hidden_state, 1 - hparams.layer_prepostprocess_dropout)

                    hidden_state = common_layers.layer_preprocess(
                        hidden_state, hparams)

                    hidden_state = tf.layers.dense(hidden_state,
                                                   hparams.hidden_size)
                    hidden_state = common_layers.layer_postprocess(
                        residual_state, hidden_state, hparams)

        decoder_output = common_layers.layer_preprocess(hidden_state, hparams)
        if num_trainable_top_decoder_layers == 0:
            decoder_output = tf.stop_gradient(decoder_output)
        return decoder_output
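The cached-decoding path in the conv branches keeps only as much left context as the widest convolution needs: each new decoder step is appended to the cache and the slice [:, -padding - 1:, :] drops everything older. A minimal eager-mode sketch of that pattern, with hypothetical sizes (LEFT_PADDING stands in for _DECODER_LEFT_CONV_PADDING):

import tensorflow as tf

LEFT_PADDING = 10                                 # hypothetical padding size
batch, hidden = 2, 4

cache = tf.zeros([batch, LEFT_PADDING, hidden])   # cached left context
new_step = tf.ones([batch, 1, hidden])            # current decoder step

# Append the new step, then keep only the last LEFT_PADDING + 1 positions,
# which is exactly the window a "VALID" convolution of width 11 consumes.
cache = tf.concat([cache, new_step], axis=1)[:, -LEFT_PADDING - 1:, :]
print(cache.shape)  # (2, 11, 4)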
Example #21
0
  def prepare_processing_graph(self, flags):
    """Builds a TensorFlow graph to apply the input distortions.

    Creates a graph that loads a WAVE file, decodes it, scales the volume,
    shifts it in time, adds in background noise, calculates a spectrogram, and
    then builds an MFCC fingerprint from that.

    This must be called with an active TensorFlow session running; it creates
    several placeholder inputs and one output:

      - wav_filename_placeholder_: Filename of the WAV to load.
      - foreground_volume_placeholder_: How loud the main clip should be.
      - foreground_resampling_placeholder_: Controls signal stretching/squeezing.
      - time_shift_padding_placeholder_: Where to pad the clip.
      - time_shift_offset_placeholder_: How much to move the clip in time.
      - background_data_placeholder_: PCM sample data for background noise.
      - background_volume_placeholder_: Loudness of mixed-in background.
      - output_: Output 2D fingerprint of processed audio or raw audio.

    Args:
      flags: data and model parameters, described in model_train.py.

    Raises:
      ValueError: If the preprocessing mode isn't recognized.
      Exception: If the preprocessor wasn't compiled in.
    """
    with tf.compat.v1.get_default_graph().name_scope('data'):
      desired_samples = flags.desired_samples
      self.wav_filename_placeholder_ = tf.compat.v1.placeholder(
          tf.string, [], name='wav_filename')
      wav_loader = io_ops.read_file(self.wav_filename_placeholder_)
      wav_decoder = tf.audio.decode_wav(
          wav_loader, desired_channels=1, desired_samples=desired_samples)
      # Allow the audio sample's volume to be adjusted.
      self.foreground_volume_placeholder_ = tf.compat.v1.placeholder(
          tf.float32, [], name='foreground_volume')
      # Signal resampling to generate more training data: the input signal is
      # stretched or squeezed proportionally to this placeholder's value.
      self.foreground_resampling_placeholder_ = tf.compat.v1.placeholder(
          tf.float32, [])

      if self.foreground_resampling_placeholder_ != 1.0:
        image = tf.expand_dims(wav_decoder.audio, 0)
        image = tf.expand_dims(image, 2)
        shape = tf.shape(wav_decoder.audio)
        image_resized = tf.image.resize(
            images=image,
            size=(tf.cast((tf.cast(shape[0], tf.float32) *
                           self.foreground_resampling_placeholder_),
                          tf.int32), 1),
            preserve_aspect_ratio=False)
        image_resized_cropped = tf.image.resize_with_crop_or_pad(
            image_resized,
            target_height=desired_samples,
            target_width=1,
        )
        image_resized_cropped = tf.squeeze(image_resized_cropped, axis=[0, 3])
        scaled_foreground = tf.multiply(image_resized_cropped,
                                        self.foreground_volume_placeholder_)
      else:
        scaled_foreground = tf.multiply(wav_decoder.audio,
                                        self.foreground_volume_placeholder_)
      # Shift the sample's start position, and pad any gaps with zeros.
      self.time_shift_padding_placeholder_ = tf.compat.v1.placeholder(
          tf.int32, [2, 2], name='time_shift_padding')
      self.time_shift_offset_placeholder_ = tf.compat.v1.placeholder(
          tf.int32, [2], name='time_shift_offset')
      padded_foreground = tf.pad(
          tensor=scaled_foreground,
          paddings=self.time_shift_padding_placeholder_,
          mode='CONSTANT')
      sliced_foreground = tf.slice(padded_foreground,
                                   self.time_shift_offset_placeholder_,
                                   [desired_samples, -1])
      # Mix in background noise.
      self.background_data_placeholder_ = tf.compat.v1.placeholder(
          tf.float32, [desired_samples, 1], name='background_data')
      self.background_volume_placeholder_ = tf.compat.v1.placeholder(
          tf.float32, [], name='background_volume')
      background_mul = tf.multiply(self.background_data_placeholder_,
                                   self.background_volume_placeholder_)
      background_add = tf.add(background_mul, sliced_foreground)
      background_clamp = tf.clip_by_value(background_add, -1.0, 1.0)

      if flags.preprocess == 'raw':
        # return raw audio
        self.output_ = background_clamp
        tf.compat.v1.summary.image(
            'input_audio',
            tf.expand_dims(tf.expand_dims(background_clamp, -1), -1),
            max_outputs=1)
      else:
        # Run the spectrogram and MFCC ops to get a 2D audio 'fingerprint'
        spectrogram = audio_ops.audio_spectrogram(
            background_clamp,
            window_size=flags.window_size_samples,
            stride=flags.window_stride_samples,
            magnitude_squared=True)
        tf.compat.v1.summary.image(
            'spectrogram', tf.expand_dims(spectrogram, -1), max_outputs=1)
        # The number of buckets in each FFT row in the spectrogram will depend
        # on how many input samples there are in each window. This can be quite
        # large, with a 160 sample window producing 127 buckets for example. We
        # don't need this level of detail for classification, so we often want
        # to shrink them down to produce a smaller result. That's what this
        # section implements. One method is to use average pooling to merge
        # adjacent buckets, but a more sophisticated approach is to apply the
        # MFCC algorithm to shrink the representation.
        if flags.preprocess == 'average':
          self.output_ = tf.nn.pool(
              input=tf.expand_dims(spectrogram, -1),
              window_shape=[1, flags.average_window_width],
              strides=[1, flags.average_window_width],
              pooling_type='AVG',
              padding='SAME')
          tf.compat.v1.summary.image('shrunk_spectrogram',
                                     self.output_,
                                     max_outputs=1)
        elif flags.preprocess == 'mfcc':
          self.output_ = audio_ops.mfcc(
              spectrogram,
              wav_decoder.sample_rate,
              dct_coefficient_count=flags.fingerprint_width)
          tf.compat.v1.summary.image(
              'mfcc', tf.expand_dims(self.output_, -1), max_outputs=1)
        elif flags.preprocess == 'micro':
          if not frontend_op:
            raise Exception(
                'Micro frontend op is currently not available when running'
                ' TensorFlow directly from Python, you need to build and run'
                ' through Bazel')
          sample_rate = flags.sample_rate
          window_size_ms = (flags.window_size_samples * 1000) / sample_rate
          window_step_ms = (flags.window_stride_samples * 1000) / sample_rate
          int16_input = tf.cast(tf.multiply(background_clamp, 32768), tf.int16)
          micro_frontend = frontend_op.audio_microfrontend(
              int16_input,
              sample_rate=sample_rate,
              window_size=window_size_ms,
              window_step=window_step_ms,
              num_channels=flags.fingerprint_width,
              out_scale=1,
              out_type=tf.float32)
          self.output_ = tf.multiply(micro_frontend, (10.0 / 256.0))
          tf.compat.v1.summary.image(
              'micro',
              tf.expand_dims(tf.expand_dims(self.output_, -1), 0),
              max_outputs=1)
        else:
          raise ValueError('Unknown preprocess mode "%s" (should be "mfcc", '
                           '"average", or "micro")' % flags.preprocess)

      # Merge all the summaries and write them out to /tmp/retrain_logs (by
      # default)
      self.merged_summaries_ = tf.compat.v1.summary.merge_all(scope='data')
      if flags.summaries_dir:
        self.summary_writer_ = tf.compat.v1.summary.FileWriter(
            flags.summaries_dir + '/data', tf.compat.v1.get_default_graph())
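The time-shift step above works by padding the clip on one side and then slicing a fixed-length window back out, so the audio slides in time while the output keeps exactly desired_samples samples. A standalone sketch of that pad-then-slice idea with hypothetical sample counts (TF 2.x eager):

import tensorflow as tf

desired_samples = 8
audio = tf.reshape(tf.range(desired_samples, dtype=tf.float32),
                   [desired_samples, 1])

# Shift the clip 3 samples to the right: pad 3 zeros at the front,
# then slice a window of the original length starting at offset 0.
time_shift = 3
padded = tf.pad(audio, [[time_shift, 0], [0, 0]], mode='CONSTANT')
shifted = tf.slice(padded, [0, 0], [desired_samples, -1])
print(shifted[:, 0].numpy())  # [0. 0. 0. 0. 1. 2. 3. 4.]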
Example #22
0
    print('chunk data and apply model')

    for f in range(len(score_list)):
        stage = math.floor(f / 2)
        IS_EVEN = (f % 2) == 0  #even are big chunks, odd small chunks
        IS_LAST = f == len(
            score_list) - 1  #last is remainder and not abbreviated

        if IS_EVEN and not IS_LAST:
            chunk = tf.expand_dims(
                tf.slice(waveform, [stage * max_samples, 0], [max_samples, 1]),
                0)  #eliminate last dets later
            scores = score_fn(waveform=chunk,
                              context_step_samples=context_step_samples)[
                                  'scores']  #["scores"].numpy().ravel()

        elif IS_EVEN and IS_LAST:
            pos = stage * max_samples
            if pos != 0:
                rem = samples_all % pos
            else:  #only should occur in single file case
                rem = samples_all
            chunk = tf.expand_dims(tf.slice(waveform, [pos, 0], [rem, 1]), 0)

            scores = score_fn(
                waveform=chunk,
Example #23
0
    def _update_block_mask(self, weights, threshold, mask):
        """Performs block-granular masking of the weights.

    Block pruning occurs only if the block_height or block_width is > 1 and
    if the weight tensor, when squeezed, has ndims = 2. Otherwise, elementwise
    pruning occurs.
    Args:
      weights: The weight tensor that needs to be masked.
      threshold: The current threshold value. The function will compute a new
        threshold and return the exponential moving average using the current
        value of the threshold.
      mask: The mask from the previous pruning update.

    Returns:
      new_threshold: The new value of the threshold based on weights, and
        sparsity at the current global_step
      new_mask: A numpy array of the same size and shape as weights containing
        0 or 1 to indicate which of the values in weights falls below
        the threshold

    Raises:
      ValueError: if block pooling function is not AVG or MAX
    """
        squeezed_weights = tf.squeeze(weights)
        if squeezed_weights.get_shape().ndims != 2 or self._block_dim == [
                1, 1
        ]:
            if self._pruning_method == 'threshold':
                return self._update_mask(weights, threshold)
            # random_cumulative removes weights at random while taking previous
            # random modifications into account; random_independent simply
            # removes weights at random.
            elif self._pruning_method in [
                    'random_independent', 'random_cumulative'
            ]:
                return self._update_random_mask(weights, mask)
            else:
                raise ValueError('Unknown pruning method: %s' %
                                 self._pruning_method)

        if self._block_pooling_function not in ['AVG', 'MAX']:
            raise ValueError(
                'Unknown pooling function for block sparsity: %s' %
                self._block_pooling_function)

        with tf.name_scope(weights.op.name + '_pruning_ops'):
            abs_weights = tf.abs(squeezed_weights)

            pool_window = [self._block_dim[0], self._block_dim[1]]
            pool_fn = pruning_utils.factorized_pool

            if not self._use_tpu:
                pool_fn = tf.pool
                abs_weights = tf.reshape(abs_weights, [
                    1,
                    abs_weights.get_shape()[0],
                    abs_weights.get_shape()[1], 1
                ])

            pooled_weights = pool_fn(abs_weights,
                                     window_shape=pool_window,
                                     pooling_type=self._block_pooling_function,
                                     strides=pool_window,
                                     padding='SAME',
                                     name=weights.op.name + '_pooled')

            if pooled_weights.get_shape().ndims != 2:
                pooled_weights = tf.squeeze(pooled_weights)

            if self._pruning_method == 'threshold':
                smoothed_threshold, new_mask = self._update_mask(
                    pooled_weights, threshold)
            elif self._pruning_method in [
                    'random_independent', 'random_cumulative'
            ]:
                smoothed_threshold, new_mask = self._update_random_mask(
                    pooled_weights, mask)
            else:
                raise ValueError('Unknown pruning method: %s' %
                                 self._pruning_method)

            # Expand the block-level mask back to elementwise granularity via a
            # Kronecker product with a block of ones.
            updated_mask = pruning_utils.kronecker_product(
                new_mask, tf.ones(self._block_dim))
            sliced_mask = tf.slice(updated_mask, [0, 0], [
                squeezed_weights.get_shape()[0],
                squeezed_weights.get_shape()[1]
            ])

        return smoothed_threshold, tf.reshape(sliced_mask, tf.shape(weights))
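Block pruning above pools the absolute weights over block-sized windows, prunes at the block level, and then expands the block mask back to the full weight shape (the Kronecker product with a block of ones). A minimal eager-mode sketch of that flow, assuming 2x2 blocks and a fixed threshold instead of the class's moving-average threshold, and using tf.repeat in place of pruning_utils.kronecker_product:

import tensorflow as tf

weights = tf.random.normal([4, 6])
block_h, block_w = 2, 2
threshold = 0.5

# Pool |weights| over non-overlapping blocks.
abs_w = tf.reshape(tf.abs(weights), [1, 4, 6, 1])
pooled = tf.nn.avg_pool2d(abs_w, ksize=[block_h, block_w],
                          strides=[block_h, block_w], padding='SAME')
pooled = tf.squeeze(pooled)                        # shape [2, 3]

# Keep blocks whose pooled magnitude exceeds the threshold ...
block_mask = tf.cast(pooled > threshold, tf.float32)

# ... and expand the block mask back to the weight tensor's shape.
mask = tf.repeat(tf.repeat(block_mask, block_h, axis=0), block_w, axis=1)
print(mask.shape)  # (4, 6)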
Example #24
0
    def __build_graph(self):
        """
            Build TF graph for analogy-making
        """
        self.__sem_target = tf.placeholder(
            shape=[self._n_slots, self._sem_dim], dtype=tf.float32)
        self.__struct_target = tf.placeholder(
            shape=[self._max_arity, self._n_slots, self._n_slots],
            dtype=tf.float32)

        self.__sem_base = tf.placeholder(
                shape=[None, self._n_slots, self._sem_dim],
                dtype=tf.float32)
        self.__struct_base = tf.placeholder(
            shape=[None, self._max_arity, self._n_slots, self._n_slots],
            dtype=tf.float32)

        #Construct recoding matrix
        if Path("recode/recode_mat.{}.pickle".format(self._n_slots)).is_file():
            #a recoding matrix with given parameters is already
            # created and serialized, load it
            print("Loading recoding matix...", end="")
            sys.stdout.flush()
            with open("recode/recode_mat.{}.pickle".format(self._n_slots),
                      "rb") as file_h:
                recode_mat = tf.constant(pickle.load(file_h))
            print("Done.")
            sys.stdout.flush()
        else:
            #create recoding matrix and serialize it to a file
            recode_mat = tf.constant(self.__construct_recode_mat())
            with open("recode/recode_mat.{}.pickle".format(self._n_slots),
                      "wb") as file_h:
                with tf.Session():
                    pickle.dump(recode_mat.eval(), file_h)

        #generate all possible states of the semantics of the target
        sem_targets = tf.reshape(
            tf.matmul(
                tf.reshape(recode_mat,
                           [self._n_states * self._n_slots, self._n_slots]),
                self.__sem_target),
            [1, self._n_states, self._n_slots * self._sem_dim])

        #generate all possible states of the structure of the target
        struct_targets = tf.reshape(
            tf.transpose(
                tf.reshape(
                    tf.concat([
                        tf.matmul(
                            tf.reshape(
                                tf.matmul(
                                    tf.reshape(recode_mat, [
                                        self._n_states * self._n_slots,
                                        self._n_slots
                                    ]),
                                    self.__struct_target[a_i],
                                ),
                                [self._n_states, self._n_slots, self._n_slots
                                 ]),
                            tf.transpose(recode_mat, [0, 2, 1]),
                        ) for a_i in range(self._max_arity)
                    ], 0), [
                        self._max_arity, self._n_states,
                        self._n_slots * self._n_slots
                    ]), [1, 0, 2]), [
                        1, self._n_states,
                        self._max_arity * self._n_slots * self._n_slots
                    ])

        #compute the number of bases
        n_bases = tf.shape(self.__sem_base)[0]

        #reshape the bases
        sem_base = tf.reshape(
            tf.tile(
                tf.reshape(self.__sem_base,
                           [n_bases, self._n_slots * self._sem_dim]),
                [1, self._n_states]),
            [n_bases, self._n_states, self._n_slots * self._sem_dim])

        struct_base = tf.reshape(
            tf.tile(
                tf.reshape(
                    self.__struct_base,
                    [n_bases, self._max_arity * self._n_slots * self._n_slots
                     ]), [1, self._n_states]), [
                         n_bases, self._n_states,
                         self._max_arity * self._n_slots * self._n_slots
                     ])

        #compute semantics denominator for cosine similarity
        denom_sem = tf.multiply(
            tf.sqrt(
                tf.reduce_sum(tf.multiply(sem_targets, sem_targets),
                              axis=[2])),
            tf.sqrt(tf.reduce_sum(tf.multiply(sem_base, sem_base), axis=[2])))
        #compute numerator
        num_sem = tf.reduce_sum(tf.multiply(sem_targets, sem_base), axis=[2])
        #compute cosine similarity (Keras returns the negative cosine
        # similarity, so negate it)
        sem_cos = -K.losses.cosine_similarity(sem_targets, sem_base, axis=[2])
        #compute structure denominator for cosine similarity
        denom_struct = tf.multiply(
            tf.sqrt(
                tf.reduce_sum(tf.multiply(struct_targets, struct_targets),
                              axis=[2])),
            tf.sqrt(
                tf.reduce_sum(tf.multiply(struct_base, struct_base),
                              axis=[2])))
        #compute numerator
        num_struct = tf.reduce_sum(tf.multiply(struct_targets, struct_base),
                                   axis=[2])

        #compute cosine similarity
        struct_cos = -K.losses.cosine_similarity(
            struct_targets, struct_base, axis=[2])

        similarities = tf.add(tf.multiply(sem_cos, 1 - self._sigma),
                              tf.multiply(struct_cos, self._sigma))

        self._sem_cos = sem_cos
        self._struct_cos = struct_cos

        #get maximum similarity
        base_max_similarities = tf.reduce_max(similarities, axis=[1])
        #get index of base with max similarity
        self.__best_base_index = tf.argmax(base_max_similarities)
        #get the index of the recoding that led to the max similarity
        best_recoding_no = \
            tf.argmax(
                tf.reshape(
                        tf.slice(
                            similarities,
                            [self.__best_base_index, 0], [1, self._n_states]),
                        [self._n_states]))
        #maximum similarity value
        self.__best_base_similarity = tf.reduce_max(base_max_similarities)
        #best recoding
        self.__best_recoding = tf.slice(recode_mat, [best_recoding_no, 0, 0],
                                        [1, self._n_slots, self._n_slots])
        #best recoding of semantics
        self.__best_target_sem_recoding = tf.reshape(
            tf.slice(sem_targets, [0, best_recoding_no, 0],
                     [1, 1, self._n_slots * self._sem_dim]),
            [self._n_slots, self._sem_dim])
        #best recoding of structure
        self.__best_target_struct_recoding = tf.reshape(
            tf.slice(struct_targets, [0, best_recoding_no, 0],
                     [1, 1, self._max_arity * self._n_slots * self._n_slots]),
            [self._max_arity, self._n_slots, self._n_slots])
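The negation of K.losses.cosine_similarity above is needed because the Keras function is defined as a loss and returns -1 for vectors pointing in the same direction; negating it recovers the ordinary cosine similarity that the manual numerator/denominator computation would produce. A small eager-mode check of that equivalence on random tensors with hypothetical shapes:

import tensorflow as tf

a = tf.random.normal([3, 5, 8])
b = tf.random.normal([3, 5, 8])

# Manual cosine similarity along the last axis.
num = tf.reduce_sum(a * b, axis=2)
denom = tf.norm(a, axis=2) * tf.norm(b, axis=2)
manual = num / denom

# Keras returns the *negative* cosine similarity, so negate it.
keras_cos = -tf.keras.losses.cosine_similarity(a, b, axis=2)

print(tf.reduce_max(tf.abs(manual - keras_cos)).numpy())  # ~0 up to float error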
Example #25
0
def build(input_reader_config,
          model_config,
          lstm_config,
          unroll_length,
          data_augmentation_options=None,
          batch_size=1):
    """Builds a tensor dictionary based on the InputReader config.

  Args:
    input_reader_config: An input_reader_builder.InputReader object.
    model_config: A model.proto object containing the config for the desired
      DetectionModel.
    lstm_config: LSTM specific configs.
    unroll_length: Unrolled length for LSTM training.
    data_augmentation_options: A list of tuples, where each tuple contains a
      data augmentation function and a dictionary containing arguments and their
      values (see preprocessor.py).
    batch_size: Batch size for queue outputs.

  Returns:
    A dictionary of tensors based on items in the input_reader_config.

  Raises:
    ValueError: On invalid input reader proto.
    ValueError: If no input paths are specified.
  """
    if not isinstance(input_reader_config, input_reader_pb2.InputReader):
        raise ValueError('input_reader_config not of type '
                         'input_reader_pb2.InputReader.')

    external_reader_config = input_reader_config.external_input_reader
    external_input_reader_config = external_reader_config.Extensions[
        input_reader_google_pb2.GoogleInputReader.google_input_reader]
    input_reader_type = external_input_reader_config.WhichOneof('input_reader')

    if input_reader_type == 'tf_record_video_input_reader':
        config = external_input_reader_config.tf_record_video_input_reader
        reader_type_class = tf.TFRecordReader
    else:
        raise ValueError('Unsupported reader in input_reader_config: %s' %
                         input_reader_type)

    if not config.input_path:
        raise ValueError('At least one input path must be specified in '
                         '`input_reader_config`.')
    key, value = parallel_reader.parallel_read(
        config.input_path[:],  # Convert `RepeatedScalarContainer` to list.
        reader_class=reader_type_class,
        num_epochs=(input_reader_config.num_epochs
                    if input_reader_config.num_epochs else None),
        num_readers=input_reader_config.num_readers,
        shuffle=input_reader_config.shuffle,
        dtypes=[tf.string, tf.string],
        capacity=input_reader_config.queue_capacity,
        min_after_dequeue=input_reader_config.min_after_dequeue)

    # TODO(yinxiao): Add loading instance mask option.
    decoder = tf_sequence_example_decoder.TFSequenceExampleDecoder()

    keys_to_decode = [
        fields.InputDataFields.image, fields.InputDataFields.groundtruth_boxes,
        fields.InputDataFields.groundtruth_classes
    ]
    tensor_dict = decoder.decode(value, items=keys_to_decode)

    tensor_dict['image'].set_shape([None, None, None, 3])
    tensor_dict['groundtruth_boxes'].set_shape([None, None, 4])

    height = model_config.ssd.image_resizer.fixed_shape_resizer.height
    width = model_config.ssd.image_resizer.fixed_shape_resizer.width

    # If data augmentation is specified in the config file, the preprocessor
    # will be called here to augment the data as specified. Most common
    # augmentations include horizontal flip and cropping.
    if data_augmentation_options:
        images_pre = tf.split(tensor_dict['image'],
                              config.video_length,
                              axis=0)
        bboxes_pre = tf.split(tensor_dict['groundtruth_boxes'],
                              config.video_length,
                              axis=0)
        labels_pre = tf.split(tensor_dict['groundtruth_classes'],
                              config.video_length,
                              axis=0)
        images_proc, bboxes_proc, labels_proc = [], [], []
        cache = preprocessor_cache.PreprocessorCache()

        for i, _ in enumerate(images_pre):
            image_dict = {
                fields.InputDataFields.image:
                images_pre[i],
                fields.InputDataFields.groundtruth_boxes:
                tf.squeeze(bboxes_pre[i], axis=0),
                fields.InputDataFields.groundtruth_classes:
                tf.squeeze(labels_pre[i], axis=0),
            }
            image_dict = preprocessor.preprocess(
                image_dict,
                data_augmentation_options,
                func_arg_map=preprocessor.get_default_func_arg_map(),
                preprocess_vars_cache=cache)
            # Pads detection count to _PADDING_SIZE.
            image_dict[fields.InputDataFields.groundtruth_boxes] = tf.pad(
                image_dict[fields.InputDataFields.groundtruth_boxes],
                [[0, _PADDING_SIZE], [0, 0]])
            image_dict[fields.InputDataFields.groundtruth_boxes] = tf.slice(
                image_dict[fields.InputDataFields.groundtruth_boxes], [0, 0],
                [_PADDING_SIZE, -1])
            image_dict[fields.InputDataFields.groundtruth_classes] = tf.pad(
                image_dict[fields.InputDataFields.groundtruth_classes],
                [[0, _PADDING_SIZE]])
            image_dict[fields.InputDataFields.groundtruth_classes] = tf.slice(
                image_dict[fields.InputDataFields.groundtruth_classes], [0],
                [_PADDING_SIZE])
            images_proc.append(image_dict[fields.InputDataFields.image])
            bboxes_proc.append(
                image_dict[fields.InputDataFields.groundtruth_boxes])
            labels_proc.append(
                image_dict[fields.InputDataFields.groundtruth_classes])
        tensor_dict['image'] = tf.concat(images_proc, axis=0)
        tensor_dict['groundtruth_boxes'] = tf.stack(bboxes_proc, axis=0)
        tensor_dict['groundtruth_classes'] = tf.stack(labels_proc, axis=0)
    else:
        # Pads detection count to _PADDING_SIZE per frame.
        tensor_dict['groundtruth_boxes'] = tf.pad(
            tensor_dict['groundtruth_boxes'],
            [[0, 0], [0, _PADDING_SIZE], [0, 0]])
        tensor_dict['groundtruth_boxes'] = tf.slice(
            tensor_dict['groundtruth_boxes'], [0, 0, 0],
            [-1, _PADDING_SIZE, -1])
        tensor_dict['groundtruth_classes'] = tf.pad(
            tensor_dict['groundtruth_classes'], [[0, 0], [0, _PADDING_SIZE]])
        tensor_dict['groundtruth_classes'] = tf.slice(
            tensor_dict['groundtruth_classes'], [0, 0], [-1, _PADDING_SIZE])

    tensor_dict['image'], _ = preprocessor.resize_image(tensor_dict['image'],
                                                        new_height=height,
                                                        new_width=width)

    num_steps = config.video_length / unroll_length

    init_states = {
        'lstm_state_c':
        tf.zeros([height / 32, width / 32, lstm_config.lstm_state_depth]),
        'lstm_state_h':
        tf.zeros([height / 32, width / 32, lstm_config.lstm_state_depth]),
        'lstm_state_step':
        tf.constant(num_steps, shape=[]),
    }

    batch = sqss.batch_sequences_with_states(input_key=key,
                                             input_sequences=tensor_dict,
                                             input_context={},
                                             input_length=None,
                                             initial_states=init_states,
                                             num_unroll=unroll_length,
                                             batch_size=batch_size,
                                             num_threads=batch_size,
                                             make_keys_unique=True,
                                             capacity=batch_size * batch_size)

    return _build_training_batch_dict(batch, unroll_length, batch_size)
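Padding the ground-truth boxes to _PADDING_SIZE and then slicing back to exactly that many rows, as done above, gives every frame a fixed-shape detection tensor: short lists are zero-padded and long lists are truncated. A minimal sketch with a hypothetical padding size of 5:

import tensorflow as tf

PADDING_SIZE = 5  # hypothetical stand-in for _PADDING_SIZE
boxes = tf.constant([[0.1, 0.1, 0.5, 0.5],
                     [0.2, 0.2, 0.9, 0.9]])             # 2 boxes, 4 coords each

padded = tf.pad(boxes, [[0, PADDING_SIZE], [0, 0]])     # append 5 zero rows
fixed = tf.slice(padded, [0, 0], [PADDING_SIZE, -1])    # keep the first 5 rows
print(fixed.shape)  # (5, 4): 2 real boxes followed by 3 all-zero rows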
Example #26
0
        def _parser_fn(serialized_example):
            """Parses a single tf.Example into image and label tensors."""
            features = {}
            features['image/ct_image'] = tf.FixedLenFeature([], tf.string)
            features['image/label'] = tf.FixedLenFeature([], tf.string)
            parsed = tf.parse_single_example(serialized_example,
                                             features=features)

            spatial_dims = [FLAGS.ct_resolution] * 3
            if FLAGS.sampled_2d_slices:
                noise_shape = [FLAGS.ct_resolution] * 2 + [FLAGS.image_c]
            else:
                noise_shape = [FLAGS.ct_resolution] * 3

            image = tf.decode_raw(parsed['image/ct_image'], tf.float32)
            label = tf.decode_raw(parsed['image/label'], tf.float32)

            if dataset_str != 'train':
                # Preprocess intensity, clip to 0 ~ 1.
                # The training set is already preprocessed.
                image = tf.clip_by_value(image / 1024.0 + 0.5, 0, 1)

            image = tf.reshape(image, spatial_dims)
            label = tf.reshape(label, spatial_dims)

            if dataset_str == 'eval' and FLAGS.sampled_2d_slices:
                return _get_stacked_2d_slices(image, label)

            if FLAGS.sampled_2d_slices:
                # Take random slices of images and label
                begin_idx = tf.random_uniform(shape=[],
                                              minval=0,
                                              maxval=FLAGS.ct_resolution -
                                              FLAGS.image_c + 1,
                                              dtype=tf.int32)
                slice_begin = [0, 0, begin_idx]
                slice_size = [
                    FLAGS.ct_resolution, FLAGS.ct_resolution, FLAGS.image_c
                ]

                image = tf.slice(image, slice_begin, slice_size)
                label = tf.slice(label, slice_begin, slice_size)

            if dataset_str == 'train':
                for flip_axis in [0, 1, 2]:
                    image, label = data_aug_lib.maybe_flip(
                        image, label, flip_axis)
                image, label = data_aug_lib.maybe_rot180(image,
                                                         label,
                                                         static_axis=2)
                image = data_aug_lib.intensity_shift(
                    image, label, FLAGS.per_class_intensity_scale,
                    FLAGS.per_class_intensity_shift)
                image = data_aug_lib.image_corruption(
                    image, label, FLAGS.ct_resolution,
                    FLAGS.image_corrupt_ratio_mean,
                    FLAGS.image_corrupt_ratio_stddev)
                image = data_aug_lib.maybe_add_noise(
                    image, noise_shape, 1, 4, FLAGS.image_noise_probability,
                    FLAGS.image_noise_ratio)
                image, label = data_aug_lib.projective_transform(
                    image, label, FLAGS.ct_resolution,
                    FLAGS.image_translate_ratio, FLAGS.image_transform_ratio,
                    FLAGS.sampled_2d_slices)

            if FLAGS.sampled_2d_slices:
                # Only get the center slice of label.
                label = tf.slice(label, [0, 0, FLAGS.image_c // 2],
                                 [FLAGS.ct_resolution, FLAGS.ct_resolution, 1])

            spatial_dims_w_blocks = [
                FLAGS.image_nx_block,
                FLAGS.ct_resolution // FLAGS.image_nx_block,
                FLAGS.image_ny_block,
                FLAGS.ct_resolution // FLAGS.image_ny_block
            ]
            if not FLAGS.sampled_2d_slices:
                spatial_dims_w_blocks += [FLAGS.ct_resolution]

            image = tf.reshape(image, spatial_dims_w_blocks + [FLAGS.image_c])
            label = tf.reshape(label, spatial_dims_w_blocks)

            label = tf.cast(label, tf.int32)
            label = tf.one_hot(label, FLAGS.label_c)

            data_dtype = tf.as_dtype(FLAGS.mtf_dtype)
            image = tf.cast(image, data_dtype)
            label = tf.cast(label, data_dtype)
            return image, label
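When sampled_2d_slices is enabled, the parser above draws a random starting depth and slices out image_c consecutive slices of the volume. A standalone sketch of that random-window slicing, with a hypothetical resolution and channel count and TF 2.x tf.random.uniform in place of the TF1 tf.random_uniform:

import tensorflow as tf

ct_resolution, image_c = 16, 3          # hypothetical FLAGS values
volume = tf.random.normal([ct_resolution, ct_resolution, ct_resolution])

begin_idx = tf.random.uniform(
    shape=[], minval=0, maxval=ct_resolution - image_c + 1, dtype=tf.int32)
stack = tf.slice(volume, [0, 0, begin_idx],
                 [ct_resolution, ct_resolution, image_c])
print(stack.shape)  # (16, 16, 3)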
Example #27
0
def main():
    if FLAGS.datasource == 'sinusoid':
        if FLAGS.train:
            test_num_updates = 1
        else:
            test_num_updates = 10
    else:
        if FLAGS.datasource == 'miniimagenet':
            if FLAGS.train:
                test_num_updates = 1  # eval on at least one update during training
            else:
                test_num_updates = 10
        else:
            test_num_updates = 10

    if not FLAGS.train:
        orig_meta_batch_size = FLAGS.meta_batch_size
        # always use meta batch size of 1 when testing.
        FLAGS.meta_batch_size = 1

    if FLAGS.datasource == 'sinusoid':
        data_generator = DataGenerator(FLAGS.update_batch_size * 2,
                                       FLAGS.meta_batch_size)
    else:
        if FLAGS.metatrain_iterations == 0 and FLAGS.datasource == 'miniimagenet':
            assert FLAGS.meta_batch_size == 1
            assert FLAGS.update_batch_size == 1
            data_generator = DataGenerator(
                1, FLAGS.meta_batch_size)  # only use one datapoint,
        else:
            if FLAGS.datasource == 'miniimagenet':  # TODO - use 15 val examples for imagenet?
                if FLAGS.train:
                    data_generator = DataGenerator(
                        FLAGS.update_batch_size + 15, FLAGS.meta_batch_size
                    )  # only use one datapoint for testing to save memory
                else:
                    data_generator = DataGenerator(
                        FLAGS.update_batch_size * 2, FLAGS.meta_batch_size
                    )  # only use one datapoint for testing to save memory
            else:
                data_generator = DataGenerator(
                    FLAGS.update_batch_size * 2, FLAGS.meta_batch_size
                )  # only use one datapoint for testing to save memory

    dim_output = data_generator.dim_output
    if FLAGS.baseline == 'oracle':
        assert FLAGS.datasource == 'sinusoid'
        dim_input = 3
        FLAGS.pretrain_iterations += FLAGS.metatrain_iterations
        FLAGS.metatrain_iterations = 0
    else:
        dim_input = data_generator.dim_input

    if FLAGS.datasource == 'miniimagenet' or FLAGS.datasource == 'omniglot':
        tf_data_load = True
        num_classes = data_generator.num_classes

        if FLAGS.train:  # only construct training model if needed
            random.seed(5)
            image_tensor, label_tensor = data_generator.make_data_tensor()
            inputa = tf.slice(image_tensor, [0, 0, 0],
                              [-1, num_classes * FLAGS.update_batch_size, -1])
            inputb = tf.slice(image_tensor,
                              [0, num_classes * FLAGS.update_batch_size, 0],
                              [-1, -1, -1])
            labela = tf.slice(label_tensor, [0, 0, 0],
                              [-1, num_classes * FLAGS.update_batch_size, -1])
            labelb = tf.slice(label_tensor,
                              [0, num_classes * FLAGS.update_batch_size, 0],
                              [-1, -1, -1])
            input_tensors = {
                'inputa': inputa,
                'inputb': inputb,
                'labela': labela,
                'labelb': labelb
            }
            print("inputa shape", inputa.shape)

        random.seed(6)
        image_tensor, label_tensor = data_generator.make_data_tensor(
            train=False)
        inputa = tf.slice(image_tensor, [0, 0, 0],
                          [-1, num_classes * FLAGS.update_batch_size, -1])
        inputb = tf.slice(image_tensor,
                          [0, num_classes * FLAGS.update_batch_size, 0],
                          [-1, -1, -1])
        labela = tf.slice(label_tensor, [0, 0, 0],
                          [-1, num_classes * FLAGS.update_batch_size, -1])
        labelb = tf.slice(label_tensor,
                          [0, num_classes * FLAGS.update_batch_size, 0],
                          [-1, -1, -1])
        metaval_input_tensors = {
            'inputa': inputa,
            'inputb': inputb,
            'labela': labela,
            'labelb': labelb
        }
    else:
        tf_data_load = False
        input_tensors = None

    model = MAML(dim_input, dim_output, test_num_updates=test_num_updates)
    if FLAGS.train or not tf_data_load:
        model.construct_model(input_tensors=input_tensors, prefix='metatrain_')
    if tf_data_load:
        model.construct_model(input_tensors=metaval_input_tensors,
                              prefix='metaval_')
    model.summ_op = tf.summary.merge_all()

    saver = loader = tf.train.Saver(
        tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES), max_to_keep=10)

    sess = tf.InteractiveSession()

    if not FLAGS.train:
        # change to original meta batch size when loading model.
        FLAGS.meta_batch_size = orig_meta_batch_size

    if FLAGS.train_update_batch_size == -1:
        FLAGS.train_update_batch_size = FLAGS.update_batch_size
    if FLAGS.train_update_lr == -1:
        FLAGS.train_update_lr = FLAGS.update_lr

    exp_string = 'cls_' + str(FLAGS.num_classes) + '.mbs_' + str(
        FLAGS.meta_batch_size) + '.ubs_' + str(
            FLAGS.train_update_batch_size) + '.numstep' + str(
                FLAGS.num_updates) + '.updatelr' + str(FLAGS.train_update_lr)

    if FLAGS.num_filters != 64:
        exp_string += 'hidden' + str(FLAGS.num_filters)
    if FLAGS.max_pool:
        exp_string += 'maxpool'
    if FLAGS.stop_grad:
        exp_string += 'stopgrad'
    if FLAGS.baseline:
        exp_string += FLAGS.baseline
    if FLAGS.norm == 'batch_norm':
        exp_string += 'batchnorm'
    elif FLAGS.norm == 'layer_norm':
        exp_string += 'layernorm'
    elif FLAGS.norm == 'None':
        exp_string += 'nonorm'
    else:
        print('Norm setting not recognized.')

    resume_itr = 0
    model_file = None

    tf.global_variables_initializer().run()
    tf.train.start_queue_runners()

    if not FLAGS.rand_init:
        if FLAGS.resume or not FLAGS.train:
            model_file = tf.train.latest_checkpoint(FLAGS.logdir + '/' +
                                                    exp_string)
            if FLAGS.test_iter > 0:
                model_file = (model_file[:model_file.index('model')] +
                              'model' + str(FLAGS.test_iter))
            if model_file:
                ind1 = model_file.index('model')
                resume_itr = int(model_file[ind1 + 5:])
                print("Restoring model weights from " + model_file)
                saver.restore(sess, model_file)

    if FLAGS.train:
        train(model, saver, sess, exp_string, data_generator, resume_itr)
    else:
        test(model, saver, sess, exp_string, data_generator, test_num_updates)
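The inputa/inputb split above carves each meta-batch along its second (example) axis: the first num_classes * update_batch_size examples form the support set, and the remainder (slice size -1, i.e. to the end) forms the query set. A minimal sketch with hypothetical sizes:

import tensorflow as tf

meta_batch, num_classes, update_batch_size, dim_input = 4, 5, 2, 16
n_support = num_classes * update_batch_size        # 10 examples per task

image_tensor = tf.random.normal([meta_batch, n_support * 2, dim_input])

inputa = tf.slice(image_tensor, [0, 0, 0], [-1, n_support, -1])   # support set
inputb = tf.slice(image_tensor, [0, n_support, 0], [-1, -1, -1])  # query set
print(inputa.shape, inputb.shape)  # (4, 10, 16) (4, 10, 16)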
Example #28
0
    def _run():
      """Forward pass through the network."""
      with slim.arg_scope([slim.dropout], is_training=is_training):
        with slim.arg_scope(
            [slim.conv2d, slim.fully_connected],
            weights_initializer=tf.truncated_normal_initializer(stddev=0.01),
            weights_regularizer=slim.l2_regularizer(self._l2_regularization),
            activation_fn=tf.nn.relu,
            trainable=is_training):
          with slim.arg_scope(
              [slim.conv2d, slim.max_pool2d], stride=1, padding='SAME'):
            with slim.arg_scope(
                [slim.conv2d, slim.fully_connected],
                normalizer_fn=slim.batch_norm,
                normalizer_params=batch_norm):
              _, grasp_image = images
              net = slim.conv2d(
                  grasp_image,
                  64, [6, 6],
                  stride=2,
                  scope='conv1_1',
                  activation_fn=None,
                  normalizer_fn=None,
                  normalizer_params=None)
              # Old checkpoints (such as those used for tests) did not have
              # scaling on the separate batch norm operations (those not
              # associated with a conv operation), so only setting the scale
              # parameter in arg_scope would break the tests. We set scale=
              # False for these separate batch norm operations temporarily.
              # However, future users are encouraged to not set scale=False so
              # that batch_norm parameters are consistent through the whole
              # network.
              net = tf.nn.relu(slim.batch_norm(net, scale=False))
              net = slim.max_pool2d(net, [3, 3], stride=3, scope='pool1')
              self.activation_layers.append(net)
              for l in range(2, 2 + self.num_convs[0]):
                net = slim.conv2d(net, 64, [5, 5], scope='conv%d' % l)
                self.activation_layers.append(net)
              net = slim.max_pool2d(net, [3, 3], stride=3, scope='pool2')
              end_points['pool2'] = net
              self.activation_layers.append(net)
              logging.debug('pool2')
              logging.debug(net.get_shape())

              if grasp_param_names is None:
                grasp_param_blocks = [grasp_params]
                grasp_param_block_names = ['fcgrasp']
              else:
                grasp_param_blocks = []
                grasp_param_block_names = []
                # Note: Creating variables must happen in a deterministic
                # order, otherwise some workers will look for variables on the
                # wrong parameter servers, so we sort the grasp_param_names
                # here.
                for block_name in sorted(grasp_param_names):
                  offset, size = grasp_param_names[block_name]
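                  # Each named block is a contiguous column slice of
                  # grasp_params: all rows, `size` columns starting at `offset`.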
                  grasp_param_blocks += [
                      tf.slice(grasp_params, [0, offset], [-1, size])
                  ]
                  grasp_param_block_names += [block_name]

              grasp_param_tensors = []
              for block, name in zip(grasp_param_blocks,
                                     grasp_param_block_names):
                grasp_param_tensors += [
                    slim.fully_connected(
                        block,
                        256,
                        scope=name,
                        activation_fn=None,
                        normalizer_fn=None,
                        normalizer_params=None)
                ]

              fcgrasp = tf.add_n(grasp_param_tensors)

              # Old checkpoints (such as those used for tests) did not have
              # scaling on the separate batch norm operations (those not
              # associated with a conv operation), so only setting the scale
              # parameter in arg_scope would break the tests. We set
              # scale=False for these separate batch norm operations
              # temporarily. However, future users are encouraged not to set
              # scale=False so that batch_norm parameters are consistent
              # throughout the whole network.
              fcgrasp = tf.nn.relu(slim.batch_norm(fcgrasp, scale=False))
              fcgrasp = slim.fully_connected(fcgrasp, 64, scope='fcgrasp2')
              context = tf.reshape(fcgrasp, [-1, 1, 1, 64])
              end_points['fcgrasp'] = fcgrasp
              # Tile the image embedding action_batch_size times to align
              # with the expanded action dimension of action_batch_size.
              # Same image is used with all the actions in a action_batch.
              # net pre expansion should be [batch, *, *, *]
              # net post expansion should be [batch x action_batch, *, *, *]
              if tile_batch:
                net = contrib_seq2seq.tile_batch(net, self._action_batch_size)
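              # context has shape [N, 1, 1, 64]; the add below broadcasts it
              # across the spatial dimensions of net.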
              net = tf.add(net, context)
              logging.debug('net post add %s', net)
              end_points['vsum'] = net
              self.activation_layers.append(net)
              logging.debug('vsum')
              logging.debug(net.get_shape())
              for l in range(2 + sum(self.num_convs[:1]),
                             2 + sum(self.num_convs[:2])):
                net = slim.conv2d(net, 64, [3, 3], scope='conv%d' % l)
                logging.debug('conv%d', l)
                self.activation_layers.append(net)
              logging.debug(net.get_shape())
              net = slim.max_pool2d(net, [2, 2], stride=2, scope='pool3')
              logging.debug('pool3')
              logging.debug(net.get_shape())
              self.activation_layers.append(net)
              for l in range(2 + sum(self.num_convs[:2]),
                             2 + sum(self.num_convs[:3])):
                net = slim.conv2d(
                    net, 64, [3, 3], scope='conv%d' % l, padding='VALID')
                self.activation_layers.append(net)
              logging.debug('final conv')
              logging.debug(net.get_shape())
              end_points['final_conv'] = net

              batch_size = tf.shape(net)[0]
              if goal_spatial_fn is not None:
                goal_spatial = goal_spatial_fn()
                # Tile goal to match net batch size (e.g. CEM).
                goal_batch_size = tf.shape(goal_spatial)[0]
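                # The integer division assumes the net batch size is a multiple
                # of the goal batch size.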
                goal_spatial = tf.tile(
                    goal_spatial, [batch_size//goal_batch_size, 1, 1, 1])
                # Merging features in style of Fang 2017.
                net = tf.concat([net, goal_spatial], axis=3)
              net = slim.flatten(net, scope='flatten')

              if goal_vector_fn is not None:
                goal_vector = goal_vector_fn()
                goal_batch_size = tf.shape(goal_vector)[0]
                goal_vector = tf.tile(
                    goal_vector, [batch_size//goal_batch_size, 1])
                net = tf.concat([net, goal_vector], axis=1)

              for l in range(self.hid_layers):
                net = slim.fully_connected(net, 64, scope='fc%d' % l)

              name = 'logit'
              if num_classes > 1:
                name = 'logit_%d' % num_classes
              logits = slim.fully_connected(
                  net,
                  num_classes,
                  activation_fn=None,
                  scope=name,
                  normalizer_fn=None,
                  normalizer_params=None)
              end_points['logits'] = logits
              if softmax:
                predictions = tf.nn.softmax(logits)
              else:
                predictions = tf.nn.sigmoid(logits)
              if tile_batch:
                if num_classes > 1:
                  predictions = tf.reshape(
                      predictions, [-1, self._action_batch_size, num_classes])
                else:
                  predictions = tf.reshape(predictions,
                                           [-1, self._action_batch_size])
              end_points['predictions'] = predictions
              return logits, end_points
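
A minimal sketch of what the contrib_seq2seq.tile_batch call above does to the 4-D feature map, assuming TF 1.x; the names features and multiplier are illustrative and not part of the original code:

import tensorflow as tf

def tile_batch_like(features, multiplier):
  """Repeats each batch entry `multiplier` times, mirroring tf.contrib.seq2seq.tile_batch."""
  shape = tf.shape(features)
  # [batch, H, W, C] -> [batch, multiplier, H, W, C] -> [batch * multiplier, H, W, C]
  tiled = tf.tile(tf.expand_dims(features, 1), [1, multiplier, 1, 1, 1])
  return tf.reshape(tiled, tf.concat([[-1], shape[1:]], axis=0))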
Example #29
def embedding_postprocessor(
    input_tensor,
    use_token_type=False,
    token_type_ids=None,
    token_type_vocab_size=16,
    token_type_embedding_name='token_type_embeddings',
    use_position_embeddings=True,
    position_embedding_name='position_embeddings',
    initializer_range=0.02,
    max_position_embeddings=512,
    dropout_prob=0.1,
):
    """Performs various post-processing on a word embedding tensor.

  Args:
    input_tensor: float Tensor of shape [batch_size, seq_length,
      embedding_size].
    use_token_type: bool. Whether to add embeddings for `token_type_ids`.
    token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length].
      Must be specified if `use_token_type` is True.
    token_type_vocab_size: int. The vocabulary size of `token_type_ids`.
    token_type_embedding_name: string. The name of the embedding table variable
      for token type ids.
    use_position_embeddings: bool. Whether to add position embeddings for the
      position of each token in the sequence.
    position_embedding_name: string. The name of the embedding table variable
      for positional embeddings.
    initializer_range: float. Range of the weight initialization.
    max_position_embeddings: int. Maximum sequence length that might ever be
      used with this model. This can be longer than the sequence length of
      input_tensor, but cannot be shorter.
    dropout_prob: float. Dropout probability applied to the final output tensor.

  Returns:
    float tensor with same shape as `input_tensor`.

  Raises:
    ValueError: One of the tensor shapes or input values is invalid.
  """
    input_shape = get_shape_list(input_tensor, expected_rank=3)
    batch_size = input_shape[0]
    seq_length = input_shape[1]
    width = input_shape[2]

    output = input_tensor

    if use_token_type:
        if token_type_ids is None:
            raise ValueError('`token_type_ids` must be specified if '
                             '`use_token_type` is True.')
        token_type_table = tf.get_variable(
            name=token_type_embedding_name,
            shape=[token_type_vocab_size, width],
            initializer=create_initializer(initializer_range),
        )
        # This vocab will be small so we always do one-hot here, since it is always
        # faster for a small vocabulary.
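        # The one-hot matmul below is numerically equivalent to
        # tf.nn.embedding_lookup(token_type_table, flat_token_type_ids).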
        flat_token_type_ids = tf.reshape(token_type_ids, [-1])
        one_hot_ids = tf.one_hot(flat_token_type_ids,
                                 depth=token_type_vocab_size)
        token_type_embeddings = tf.matmul(one_hot_ids, token_type_table)
        token_type_embeddings = tf.reshape(token_type_embeddings,
                                           [batch_size, seq_length, width])
        output += token_type_embeddings

    if use_position_embeddings:
        assert_op = tf.assert_less_equal(seq_length, max_position_embeddings)
        with tf.control_dependencies([assert_op]):
            full_position_embeddings = tf.get_variable(
                name=position_embedding_name,
                shape=[max_position_embeddings, width],
                initializer=create_initializer(initializer_range),
            )
            # Since the position embedding table is a learned variable, we create it
            # using a (long) sequence length `max_position_embeddings`. The actual
            # sequence length might be shorter than this, for faster training of
            # tasks that do not have long sequences.
            #
            # So `full_position_embeddings` is effectively an embedding table
            # for position [0, 1, 2, ..., max_position_embeddings-1], and the current
            # sequence has positions [0, 1, 2, ... seq_length-1], so we can just
            # perform a slice.
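            # For example, with max_position_embeddings=512 and seq_length=128,
            # the slice keeps the first 128 rows of the [512, width] table.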
            position_embeddings = tf.slice(full_position_embeddings, [0, 0],
                                           [seq_length, -1])
            num_dims = len(output.shape.as_list())

            # Only the last two dimensions are relevant (`seq_length` and `width`), so
            # we broadcast among the first dimensions, which is typically just
            # the batch size.
            position_broadcast_shape = []
            for _ in range(num_dims - 2):
                position_broadcast_shape.append(1)
            position_broadcast_shape.extend([seq_length, width])
            position_embeddings = tf.reshape(position_embeddings,
                                             position_broadcast_shape)
            output += position_embeddings

    output = layer_norm_and_dropout(output, dropout_prob)
    return output
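
A hedged usage sketch, assuming the function sits in a BERT-style modeling module where get_shape_list, create_initializer, and layer_norm_and_dropout are available; the shapes below are illustrative only:

import tensorflow as tf

# Word embeddings for a batch of 2 sequences of length 5 with width 8.
word_embeddings = tf.random_normal([2, 5, 8])
token_type_ids = tf.zeros([2, 5], dtype=tf.int32)  # every token in segment 0

output = embedding_postprocessor(
    input_tensor=word_embeddings,
    use_token_type=True,
    token_type_ids=token_type_ids,
    token_type_vocab_size=2,
    use_position_embeddings=True,
    max_position_embeddings=512,
    dropout_prob=0.1,
)
# `output` has the same shape as `word_embeddings`: [2, 5, 8].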
Example #30
    def get(self):
        """ Provides input data to the graph. """
        # Calculate the size of each record (this lists what is contained in
        # the db and how many bytes are occupied).
        record_bytes = 0

        encoding_bytes = 4
        kp_xyz_entries = 3 * self.num_kp
        record_bytes += encoding_bytes*kp_xyz_entries

        encoding_bytes = 4
        kp_uv_entries = 2 * self.num_kp
        record_bytes += encoding_bytes*kp_uv_entries

        kp_vis_entries = self.num_kp
        record_bytes += encoding_bytes*kp_vis_entries

        image_bytes = self.image_size[0] * self.image_size[1] * 3
        record_bytes += image_bytes
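        # Record layout (in bytes): num_kp*3*4 for xyz, num_kp*2*4 for uv,
        # num_kp*4 for visibility, then H*W*3 uint8 bytes for the image.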

        """ READ DATA ITEMS"""
        # Start reader
        reader = tf.FixedLengthRecordReader(header_bytes=0, record_bytes=record_bytes)
        _, value = reader.read(tf.train.string_input_producer([self.path_to_db]))

        # decode to floats
        bytes_read = 0
        data_dict = dict()
        record_bytes_float32 = tf.decode_raw(value, tf.float32)

        # 1. Read keypoint xyz
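        # The float32 view indexes 4-byte elements, so byte offsets are divided by 4.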
        keypoint_xyz21 = tf.reshape(tf.slice(record_bytes_float32, [bytes_read//4], [kp_xyz_entries]), [self.num_kp, 3])
        bytes_read += encoding_bytes*kp_xyz_entries
        keypoint_xyz21 /= 1000.0  # scale to meters
        keypoint_xyz21 = self.convert_kp(keypoint_xyz21)

        # calculate wrist coord
        if self.use_wrist_coord:
            wrist_xyz = keypoint_xyz21[16, :] + 2.0*(keypoint_xyz21[0, :] - keypoint_xyz21[16, :])
            keypoint_xyz21 = tf.concat([tf.expand_dims(wrist_xyz, 0),
                                        keypoint_xyz21[1:, :]], 0)

        data_dict['keypoint_xyz21'] = keypoint_xyz21

        # 2. Read keypoint uv AND VIS
        keypoint_uv_vis21 = tf.reshape(tf.slice(record_bytes_float32, [bytes_read//4], [kp_uv_entries+kp_vis_entries]), [self.num_kp, 3])
        bytes_read += encoding_bytes*(kp_uv_entries+kp_vis_entries)
        keypoint_uv_vis21 = self.convert_kp(keypoint_uv_vis21)
        keypoint_uv21 = keypoint_uv_vis21[:, :2]
        keypoint_vis21 = tf.equal(keypoint_uv_vis21[:, 2], 1.0)

        # calculate wrist vis
        if self.use_wrist_coord:
            wrist_vis = tf.logical_or(keypoint_vis21[16], keypoint_vis21[0])
            keypoint_vis21 = tf.concat([tf.expand_dims(wrist_vis, 0),
                                        keypoint_vis21[1:]], 0)

            wrist_uv = keypoint_uv21[16, :] + 2.0*(keypoint_uv21[0, :] - keypoint_uv21[16, :])
            keypoint_uv21 = tf.concat([tf.expand_dims(wrist_uv, 0),
                                       keypoint_uv21[1:, :]], 0)

        data_dict['keypoint_vis21'] = keypoint_vis21

        if self.coord_uv_noise:
            noise = tf.truncated_normal([self.num_kp, 2], mean=0.0, stddev=self.coord_uv_noise_sigma)
            keypoint_uv21 += noise

        data_dict['keypoint_uv21'] = keypoint_uv21

        # decode to uint8
        record_bytes_uint8 = tf.decode_raw(value, tf.uint8)

        # 4. Read image
        image = tf.reshape(tf.slice(record_bytes_uint8, [bytes_read], [image_bytes]),
                               [self.image_size[0], self.image_size[1], 3])
        image = tf.cast(image, tf.float32)
        bytes_read += image_bytes

        # subtract mean
        image = image / 255.0 - 0.5
        if self.hue_aug:
            image = tf.image.random_hue(image, self.hue_aug_max)
        data_dict['image'] = image

        """ CONSTANTS """
        # Camera intrinsics
        sx = 822.79041
        sy = 822.79041
        tx = 318.47345
        ty = 250.31296
        data_dict['cam_mat'] = tf.constant([[sx, 0.0, tx], [0.0, sy, ty], [0.0, 0.0, 1.0]])

        # Hand side: this dataset only contains left hands
        data_dict['hand_side'] = tf.one_hot(tf.constant(0, dtype=tf.int32), depth=2, on_value=1.0, off_value=0.0, dtype=tf.float32)

        assert bytes_read == record_bytes, "Doesn't add up."

        """ DEPENDENT DATA ITEMS: XYZ represenations. """
        # make coords relative to root joint
        kp_coord_xyz_root = keypoint_xyz21[0, :] # this is the palm coord
        kp_coord_xyz21_rel = keypoint_xyz21 - kp_coord_xyz_root  # relative coords in metric coords
        index_root_bone_length = tf.sqrt(tf.reduce_sum(tf.square(kp_coord_xyz21_rel[12, :] - kp_coord_xyz21_rel[11, :])))
        data_dict['keypoint_scale'] = index_root_bone_length
        data_dict['keypoint_xyz21_normed'] = kp_coord_xyz21_rel / index_root_bone_length  # normalized by length of 12->11

        # calculate local coordinates
        kp_coord_xyz21_local = bone_rel_trafo(data_dict['keypoint_xyz21_normed'])
        kp_coord_xyz21_local = tf.squeeze(kp_coord_xyz21_local)
        data_dict['keypoint_xyz21_local'] = kp_coord_xyz21_local

        # calculate viewpoint and coords in canonical coordinates
        kp_coord_xyz21_rel_can, rot_mat = canonical_trafo(data_dict['keypoint_xyz21_normed'])
        kp_coord_xyz21_rel_can, rot_mat = tf.squeeze(kp_coord_xyz21_rel_can), tf.squeeze(rot_mat)
        data_dict['keypoint_xyz21_can'] = kp_coord_xyz21_rel_can
        data_dict['rot_mat'] = tf.matrix_inverse(rot_mat)

        """ DEPENDENT DATA ITEMS: HAND CROP """
        if self.hand_crop:
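            # Center the crop on keypoint 12; [::-1] swaps (u, v) into (v, u),
            # i.e. (row, col) order.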
            crop_center = keypoint_uv21[12, ::-1]

            # Catch the case when no valid keypoint is available (happens almost never).
            crop_center = tf.cond(tf.reduce_all(tf.is_finite(crop_center)), lambda: crop_center,
                                  lambda: tf.constant([0.0, 0.0]))
            crop_center.set_shape([2, ])

            if self.crop_center_noise:
                noise = tf.truncated_normal([2], mean=0.0, stddev=self.crop_center_noise_sigma)
                crop_center += noise

            crop_scale_noise = tf.constant(1.0)
            if self.crop_scale_noise:
                crop_scale_noise = tf.squeeze(tf.random_uniform([1], minval=1.0, maxval=1.2))

            if not self.use_wrist_coord:
                wrist_uv = keypoint_uv21[16, :] + 2.0*(keypoint_uv21[0, :] - keypoint_uv21[16, :])
                keypoint_uv21 = tf.concat([tf.expand_dims(wrist_uv, 0),
                                           keypoint_uv21[1:, :]], 0)

            # select visible coords only
            kp_coord_h = tf.boolean_mask(keypoint_uv21[:, 1], keypoint_vis21)
            kp_coord_w = tf.boolean_mask(keypoint_uv21[:, 0], keypoint_vis21)
            kp_coord_hw = tf.stack([kp_coord_h, kp_coord_w], 1)

            # Determine the size of the crop (measure the spatial extent of the hw coords first).
            min_coord = tf.maximum(tf.reduce_min(kp_coord_hw, 0), 0.0)
            max_coord = tf.minimum(tf.reduce_max(kp_coord_hw, 0), self.image_size)

            # Find the larger distance w.r.t. the crop center.
            crop_size_best = 2*tf.maximum(max_coord - crop_center, crop_center - min_coord)
            crop_size_best = tf.reduce_max(crop_size_best)
            crop_size_best = tf.minimum(tf.maximum(crop_size_best, 50.0), 500.0)

            # Catch the case when no valid keypoint is available.
            crop_size_best = tf.cond(tf.reduce_all(tf.is_finite(crop_size_best)),
                                     lambda: crop_size_best,
                                     lambda: tf.constant(200.0))
            crop_size_best.set_shape([])

            # calculate necessary scaling
            scale = tf.cast(self.crop_size, tf.float32) / crop_size_best
            scale = tf.minimum(tf.maximum(scale, 1.0), 10.0)
            scale *= crop_scale_noise
            data_dict['crop_scale'] = scale

            if self.crop_offset_noise:
                noise = tf.truncated_normal([2], mean=0.0, stddev=self.crop_offset_noise_sigma)
                crop_center += noise

            # Crop image
            img_crop = crop_image_from_xy(tf.expand_dims(image, 0), crop_center, self.crop_size, scale)
            data_dict['image_crop'] = tf.squeeze(img_crop)

            # Modify uv21 coordinates
            crop_center_float = tf.cast(crop_center, tf.float32)
            keypoint_uv21_u = (data_dict['keypoint_uv21'][:, 0] - crop_center_float[1]) * scale + self.crop_size // 2
            keypoint_uv21_v = (data_dict['keypoint_uv21'][:, 1] - crop_center_float[0]) * scale + self.crop_size // 2
            keypoint_uv21 = tf.stack([keypoint_uv21_u, keypoint_uv21_v], 1)
            data_dict['keypoint_uv21'] = keypoint_uv21

            # Modify camera intrinsics
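            # The cropped image relates to the original by a scaling S followed
            # by a translation T, so the intrinsics become T @ S @ cam_mat.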
            scale = tf.reshape(scale, [1, ])
            scale_matrix = tf.dynamic_stitch([[0], [1], [2],
                                              [3], [4], [5],
                                              [6], [7], [8]], [scale, [0.0], [0.0],
                                                               [0.0], scale, [0.0],
                                                               [0.0], [0.0], [1.0]])
            scale_matrix = tf.reshape(scale_matrix, [3, 3])

            crop_center_float = tf.cast(crop_center, tf.float32)
            trans1 = crop_center_float[0] * scale - self.crop_size // 2
            trans2 = crop_center_float[1] * scale - self.crop_size // 2
            trans1 = tf.reshape(trans1, [1, ])
            trans2 = tf.reshape(trans2, [1, ])
            trans_matrix = tf.dynamic_stitch([[0], [1], [2],
                                              [3], [4], [5],
                                              [6], [7], [8]], [[1.0], [0.0], -trans2,
                                                               [0.0], [1.0], -trans1,
                                                               [0.0], [0.0], [1.0]])
            trans_matrix = tf.reshape(trans_matrix, [3, 3])

            data_dict['cam_mat'] = tf.matmul(trans_matrix, tf.matmul(scale_matrix, data_dict['cam_mat']))

        """ DEPENDENT DATA ITEMS: Scoremap from the SUBSET of 21 keypoints"""
        # create scoremaps from the subset of 2D annoataion
        keypoint_hw21 = tf.stack([keypoint_uv21[:, 1], keypoint_uv21[:, 0]], -1)

        scoremap_size = self.image_size
        
        if self.hand_crop:
            scoremap_size = (self.crop_size, self.crop_size)

        scoremap = self.create_multiple_gaussian_map(keypoint_hw21,
                                                     scoremap_size,
                                                     self.sigma,
                                                     valid_vec=keypoint_vis21)
        
        if self.scoremap_dropout:
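            # tf.nn.dropout rescales kept values by 1/keep_prob; multiplying by
            # keep_prob afterwards undoes that rescaling so the score map
            # amplitude stays comparable to the no-dropout case.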
            scoremap = tf.nn.dropout(scoremap, self.scoremap_dropout_prob,
                                        noise_shape=[1, 1, 21])
            scoremap *= self.scoremap_dropout_prob

        data_dict['scoremap'] = scoremap

        if self.random_crop_to_size:
            tensor_stack = tf.concat([data_dict['image'],
                                      tf.expand_dims(tf.cast(data_dict['hand_parts'], tf.float32), -1),
                                      tf.cast(data_dict['hand_mask'], tf.float32)], 2)
            s = tensor_stack.get_shape().as_list()
            tensor_stack_cropped = tf.random_crop(tensor_stack,
                                                  [self.random_crop_size, self.random_crop_size, s[2]])
            data_dict = dict()  # delete everything else because the random cropping makes the data invalid anyway
            data_dict['image'], data_dict['hand_parts'], data_dict['hand_mask'] = tensor_stack_cropped[:, :, :3],\
                                                                                  tf.cast(tensor_stack_cropped[:, :, 3], tf.int32),\
                                                                                  tf.cast(tensor_stack_cropped[:, :, 4:], tf.int32)

        names, tensors = zip(*data_dict.items())

        if self.shuffle:
            tensors = tf.train.shuffle_batch_join([tensors],
                                                  batch_size=self.batch_size,
                                                  capacity=100,
                                                  min_after_dequeue=50,
                                                  enqueue_many=False)
        else:
            tensors = tf.train.batch_join([tensors],
                                          batch_size=self.batch_size,
                                          capacity=100,
                                          enqueue_many=False)

        return dict(zip(names, tensors))