Exemplo n.º 1
0
def execute(configs):
    """Train a VQ-VAE on each configured dataset and report test metrics.

    The graph is built once: a two-layer convolutional encoder, a nearest
    neighbour lookup into a ``som_dim x som_dim`` grid of embeddings, and a
    convolutional decoder ending in a sigmoid. For every dataset name in
    ``configs["DATASETS"]`` a fresh session is trained for
    ``configs["n_epochs"]`` epochs, the test split is evaluated in batches
    of 100, and reconstruction MSE, NMI and cluster purity are printed.
    Unless ``configs["debug_mode"]`` is truthy, the metrics of the first run
    are also written to a TSV file under ``../results``.

    Args:
        configs: mapping providing the keys read below: ``"random_state"``,
            ``"latent_dim"``, ``"som_dim"``, ``"conv_size"``,
            ``"batch_size"``, ``"n_epochs"``, ``"DATASETS"`` (iterable of
            ``"mnist"`` / ``"fashion"``) and ``"debug_mode"``.

    Raises:
        ValueError: if ``configs["DATASETS"]`` contains an unknown name.
    """
    # Validate dataset names before building anything. Previously an unknown
    # name either crashed later with an unbound local or silently reused the
    # arrays of the previously processed dataset.
    supported_datasets = ("mnist", "fashion")
    data_sets = list(configs["DATASETS"])
    for data_set in data_sets:
        if data_set not in supported_datasets:
            raise ValueError(
                "unknown dataset {!r}; expected one of {}".format(
                    data_set, supported_datasets))

    tf.reset_default_graph()
    random.seed(configs["random_state"])
    nprand.seed(configs["random_state"])
    DECAY_FACTOR = 0.80
    decay_steps = 1000
    latent_dim = configs["latent_dim"]
    som_dim = [configs["som_dim"], configs["som_dim"]]
    global_step = tf.Variable(0, trainable=False, name="global_step")
    # Codebook: one latent vector per cell of the 2-D grid.
    embeddings = tf.get_variable(
        "embeddings",
        som_dim + [latent_dim],
        initializer=tf.truncated_normal_initializer(stddev=0.05))

    x = tf.placeholder(tf.float32, shape=[None, 784])
    x_image = tf.reshape(x, [-1, 28, 28, 1])
    # `train` is fed on every run but no op currently consumes it; it is kept
    # so the feed interface stays stable if a train-dependent layer returns.
    train = tf.placeholder(tf.bool, name="train")
    batch_size = tf.shape(x)[0]

    with tf.variable_scope("encoder"):
        h_conv1 = tf.nn.relu(
            conv2d(x_image, [4, 4, 1, configs["conv_size"]], "conv1"))
        h_pool1 = max_pool_2x2(h_conv1)
        h_conv2 = tf.nn.relu(
            conv2d(h_pool1, [4, 4, configs["conv_size"], configs["conv_size"]],
                   "conv2"))
        h_pool2 = max_pool_2x2(h_conv2)
        # Assumes the two pooling steps reduce 28x28 to 7x7 (depends on the
        # padding used by the external conv2d/max_pool_2x2 helpers).
        flat_size = 7 * 7 * configs["conv_size"]
        h_flat = tf.reshape(h_pool2, [batch_size, flat_size])
        z_e = tf.keras.layers.Dense(latent_dim)(h_flat)

    # Squared distance from every encoding to every codebook entry, then the
    # flat index of the closest entry converted back to 2-D grid coordinates.
    z_dist = tf.squared_difference(tf.expand_dims(tf.expand_dims(z_e, 1), 1),
                                   tf.expand_dims(embeddings, 0))
    z_dist_red = tf.reduce_sum(z_dist, axis=-1)
    z_dist_flat = tf.reshape(z_dist_red, [batch_size, -1])
    k = tf.argmin(z_dist_flat, axis=-1)
    k_1 = k // som_dim[1]
    k_2 = k % som_dim[1]
    k_stacked = tf.stack([k_1, k_2], axis=1)
    z_q = tf.gather_nd(embeddings, k_stacked)

    def decoder(z_tensor):
        """Map latent vectors back to image space (mirror of the encoder)."""
        with tf.variable_scope("decoder", reuse=tf.AUTO_REUSE):
            h_flat_dec = tf.keras.layers.Dense(flat_size)(z_tensor)
            h_reshaped = tf.reshape(h_flat_dec, tf.shape(h_pool2))
            h_unpool1 = tf.keras.layers.UpSampling2D((2, 2))(h_reshaped)
            h_deconv1 = tf.nn.relu(
                conv2d(h_unpool1,
                       [4, 4, configs["conv_size"], configs["conv_size"]],
                       "deconv1"))
            h_unpool2 = tf.keras.layers.UpSampling2D((2, 2))(h_deconv1)
            return tf.nn.sigmoid(
                conv2d(h_unpool2, [4, 4, configs["conv_size"], 1], "deconv2"))

    x_hat = decoder(z_q)

    beta = 0.25
    loss_rec_mse = tf.losses.mean_squared_error(x_image, x_hat)
    loss_vq = tf.reduce_mean(tf.squared_difference(tf.stop_gradient(z_e), z_q))
    loss_commit = tf.reduce_mean(
        tf.squared_difference(z_e, tf.stop_gradient(z_q)))
    loss = loss_rec_mse + loss_vq + beta * loss_commit

    learning_rate = tf.placeholder_with_default(0.001, [])
    lr_decay = tf.train.exponential_decay(learning_rate,
                                          global_step,
                                          decay_steps,
                                          DECAY_FACTOR,
                                          staircase=True)

    decoder_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                     "decoder")
    decoder_grads = list(zip(tf.gradients(loss, decoder_vars), decoder_vars))
    encoder_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                     "encoder")
    # The codebook lookup is not differentiable, so the reconstruction
    # gradient w.r.t. z_q is copied onto z_e and pushed into the encoder.
    grad_z = tf.gradients(loss_rec_mse, z_q)

    encoder_grads = [(tf.gradients(z_e, var, grad_z)[0] +
                      beta * tf.gradients(loss_commit, var)[0], var)
                     for var in encoder_vars]
    embed_grads = list(zip(tf.gradients(loss_vq, embeddings), [embeddings]))

    optimizer = tf.train.AdamOptimizer(lr_decay)
    # Pass global_step so it is incremented on every update; without it the
    # counter stays at 0 and the exponential decay above never takes effect.
    train_step = optimizer.apply_gradients(
        decoder_grads + encoder_grads + embed_grads,
        global_step=global_step)

    BATCH_SIZE = configs["batch_size"]
    EPOCHS = configs["n_epochs"]
    NUM_TESTS = 1
    EVAL_BATCH_SIZE = 100

    for data_set in data_sets:

        if data_set == "mnist":
            ds_train, ds_test = tf.keras.datasets.mnist.load_data()
        else:  # "fashion" (names were validated above)
            ds_train, ds_test = tf.keras.datasets.fashion_mnist.load_data()

        # Flatten every image to a single row vector.
        # NOTE(review): pixel values are fed exactly as loaded (no rescaling
        # is visible here) while the decoder ends in a sigmoid -- confirm the
        # expected input range.
        data_train = ds_train[0]
        data_train = np.reshape(
            data_train,
            (data_train.shape[0], data_train.shape[1] * data_train.shape[2]))
        data_test = ds_test[0]
        data_test = np.reshape(
            data_test,
            (data_test.shape[0], data_test.shape[1] * data_test.shape[2]))
        labels_test = ds_test[1]
        aggregated_mses = []
        aggregated_NMIs = []
        aggregated_purities = []

        for _ in range(NUM_TESTS):
            with tf.Session() as sess:
                sess.run(tf.global_variables_initializer())
                indices_unsup = np.arange(data_train.shape[0])
                batches_per_epoch = data_train.shape[0] // BATCH_SIZE
                with tqdm(total=EPOCHS * batches_per_epoch) as pbar:
                    for epoch in range(EPOCHS):
                        np.random.shuffle(indices_unsup)
                        # Cheap progress indicator on the first test images.
                        test_mse = sess.run(
                            loss_rec_mse,
                            feed_dict={
                                x: data_test[:EVAL_BATCH_SIZE],
                                train: False
                            })
                        for i in range(batches_per_epoch):
                            batch_data = data_train[
                                indices_unsup[BATCH_SIZE * i:BATCH_SIZE *
                                              (i + 1)]]
                            # Refresh the displayed training losses every
                            # 100 batches (always on the first batch, so the
                            # names below are bound before first use).
                            if i % 100 == 0:
                                train_mse, train_commit = sess.run(
                                    [loss_rec_mse, loss_commit],
                                    feed_dict={
                                        x: batch_data,
                                        train: False
                                    })
                            sess.run(train_step,
                                     feed_dict={
                                         x: batch_data,
                                         train: True
                                     })
                            pbar.set_postfix(epoch=epoch,
                                             train_mse=train_mse,
                                             train_commit=train_commit,
                                             test_mse=test_mse,
                                             refresh=False)
                            pbar.update(1)

                test_k_all = []
                test_x_hat_all = []
                for i in trange(data_test.shape[0] // EVAL_BATCH_SIZE):
                    batch_data = data_test[EVAL_BATCH_SIZE * i:
                                           EVAL_BATCH_SIZE * (i + 1)]
                    # Fetch assignments and reconstructions in one pass
                    # instead of running the graph twice per batch.
                    batch_k, batch_x_hat = sess.run([k, x_hat],
                                                    feed_dict={
                                                        x: batch_data,
                                                        train: False
                                                    })
                    test_k_all.extend(batch_k)
                    test_x_hat_all.extend(batch_x_hat)
                test_x_hat_all = np.array(test_x_hat_all)
                test_k_all = np.array(test_k_all)

            # Only whole evaluation batches are processed, so compare against
            # the matching prefix of the test set instead of assuming exactly
            # 10000 samples of 784 pixels.
            num_evaluated = len(test_k_all)
            aggregated_mses.append(
                mean_squared_error(
                    data_test[:num_evaluated],
                    np.reshape(test_x_hat_all,
                               [num_evaluated, data_test.shape[1]])))
            aggregated_NMIs.append(
                normalized_mutual_info_score(test_k_all,
                                             labels_test[:num_evaluated]))
            aggregated_purities.append(
                cluster_purity(test_k_all, labels_test[:num_evaluated]))

        print("Results for {}".format(data_set))
        print("Test MSE: {} +- {}\nTest NMI: {} +- {}\nTest purity: {} +- {}".
              format(np.mean(aggregated_mses),
                     np.std(aggregated_mses) / np.sqrt(NUM_TESTS),
                     np.mean(aggregated_NMIs),
                     np.std(aggregated_NMIs) / np.sqrt(NUM_TESTS),
                     np.mean(aggregated_purities),
                     np.std(aggregated_purities) / np.sqrt(NUM_TESTS)))

        if not configs["debug_mode"]:
            with open(
                    "../results/vqvae_{}_{}_somdim_{}.tsv".format(
                        data_set, configs["random_state"], configs["som_dim"]),
                    'w') as fp:
                csv_fp = csv.writer(fp, delimiter='\t')
                csv_fp.writerow(["model", "mse", "nmi", "purity"])
                csv_fp.writerow([
                    "vqvae",
                    str(aggregated_mses[0]),
                    str(aggregated_NMIs[0]),
                    str(aggregated_purities[0])
                ])
def fuse_features(nodes, weight_method):
    """Combine a list of feature tensors into one weighted sum.

    Args:
      nodes: a list of tensorflow features at different levels.
      weight_method: feature fusion method. One of:
        - "attn": one learned scalar per input, normalized with a softmax.
        - "fastattn": one learned scalar per input, passed through ReLU and
          divided by the sum of all weights (plus 0.0001).
        - "channel_attn": like "attn", but with one learned weight per
          channel of each input.
        - "channel_fastattn": like "fastattn", but with one learned weight
          per channel of each input.
        - "sum": plain sum of the inputs, no learned weights.

    Returns:
      A tensor denoting the fused feature.

    Raises:
      ValueError: if `weight_method` is not one of the names above.
    """
    dtype = nodes[0].dtype

    def _learned_weights(initial_value, rectify):
        """Create one 'WSM' variable per input, cast to the feature dtype."""
        weights = []
        for _ in nodes:
            weight = tf.cast(tf.Variable(initial_value, name='WSM'),
                             dtype=dtype)
            weights.append(tf.nn.relu(weight) if rectify else weight)
        return weights

    def _fast_normalized_sum(weights):
        """Scale each input by weight / (sum of weights + eps) and add up."""
        total = tf.add_n(weights)
        scaled = [
            node * weight / (total + 0.0001)
            for node, weight in zip(nodes, weights)
        ]
        return tf.add_n(scaled)

    if weight_method == 'sum':
        return tf.add_n(nodes)

    if weight_method == 'attn':
        softmax_weights = tf.nn.softmax(
            tf.stack(_learned_weights(1.0, rectify=False)))
        stacked_nodes = tf.stack(nodes, axis=-1)
        return tf.reduce_sum(stacked_nodes * softmax_weights, -1)

    if weight_method == 'fastattn':
        return _fast_normalized_sum(_learned_weights(1.0, rectify=True))

    if weight_method == 'channel_attn':
        num_filters = int(nodes[0].shape[-1])
        channel_weights = _learned_weights(lambda: tf.ones([num_filters]),
                                           rectify=False)
        softmax_weights = tf.nn.softmax(tf.stack(channel_weights, -1),
                                        axis=-1)
        stacked_nodes = tf.stack(nodes, axis=-1)
        return tf.reduce_sum(stacked_nodes * softmax_weights, -1)

    if weight_method == 'channel_fastattn':
        num_filters = int(nodes[0].shape[-1])
        return _fast_normalized_sum(
            _learned_weights(lambda: tf.ones([num_filters]), rectify=True))

    raise ValueError('unknown weight_method {}'.format(weight_method))
Exemplo n.º 3
0
def build_act_enjoy(make_obs_ph, q_func, num_actions, noisy=False, scope="deepq", reuse=None, attack=None, model_path=''):
    """Build the epsilon-greedy act function and, optionally, an attack function.

    Args:
        make_obs_ph: callable taking a name and returning the observation
            placeholder (wrapped with `U.ensure_tf_input`).
        q_func: callable building the Q-network; its result must expose
            `get_logits`, which is used to obtain the Q-values.
        num_actions: number of discrete actions.
        noisy: forwarded to `q_func`.
        scope: variable scope under which everything is built.
        reuse: forwarded to `tf.variable_scope`.
        attack: None for no attack, or one of 'fgsm', 'iterative', 'cwl2'.
        model_path: path handed to `U.load_state` before the attack graph is
            constructed.

    Returns:
        `act` when `attack` is None, otherwise the tuple
        `(act, craft_adv_obs)` where `craft_adv_obs` produces adversarial
        observations (scaled by 255).

    Raises:
        ValueError: if `attack` is not one of the supported values.
    """
    import warnings

    # Reject unknown attacks before building any graph; previously this only
    # surfaced later as an unbound `adv_observations`.
    if attack not in (None, 'fgsm', 'iterative', 'cwl2'):
        raise ValueError(
            "unknown attack {!r}; expected None, 'fgsm', 'iterative' or "
            "'cwl2'".format(attack))

    with tf.variable_scope(scope, reuse=reuse):
        observations_ph = U.ensure_tf_input(make_obs_ph("observation"))
        stochastic_ph = tf.placeholder(tf.bool, (), name="stochastic")
        update_eps_ph = tf.placeholder(tf.float32, (), name="update_eps")

        eps = tf.get_variable("eps", (), initializer=tf.constant_initializer(0))

        q_values = q_func(observations_ph.get(), num_actions, scope="q_func", noisy=noisy)
        q_values = q_values.get_logits(observations_ph.get())
        deterministic_actions = tf.argmax(q_values, axis=1)

        # Epsilon-greedy: with probability eps replace the greedy action by a
        # uniformly random one, independently per batch element.
        batch_size = tf.shape(observations_ph.get())[0]
        random_actions = tf.random_uniform(tf.stack([batch_size]), minval=0, maxval=num_actions, dtype=tf.int64)
        chose_random = tf.random_uniform(tf.stack([batch_size]), minval=0, maxval=1, dtype=tf.float32) < eps
        stochastic_actions = tf.where(chose_random, random_actions, deterministic_actions)

        output_actions = tf.cond(stochastic_ph, lambda: stochastic_actions, lambda: deterministic_actions)
        # A negative update_eps leaves eps unchanged.
        update_eps_expr = eps.assign(tf.cond(update_eps_ph >= 0, lambda: update_eps_ph, lambda: eps))

        act = U.function(inputs=[observations_ph, stochastic_ph, update_eps_ph],
                         outputs=output_actions,
                         givens={update_eps_ph: -1.0, stochastic_ph: True},
                         updates=[update_eps_expr])

        # Load model before attacks graph construction so that TF won't
        # complain can't load parameters for attack.
        # Loading stays best-effort, but a failure is now reported instead of
        # being swallowed (and KeyboardInterrupt/SystemExit propagate).
        try:
            U.load_state(model_path)
        except Exception as load_error:
            warnings.warn("could not load model state from {!r}: {}".format(
                model_path, load_error))

        if attack is None:
            return act

        # NOTE(review): only the 'fgsm' wrapper forwards `noisy` to q_func;
        # confirm whether the other two attacks should do the same.
        if attack == 'fgsm':
            def wrapper(x):
                return q_func(x, num_actions, scope="q_func", reuse=True, concat_softmax=True, noisy=noisy)
            adversary = FastGradientMethod(CallableModelWrapper(wrapper, 'probs'), sess=U.get_session())
            adv_observations = adversary.generate(observations_ph.get(), eps=1.0/255.0,
                                                  clip_min=0, clip_max=1.0) * 255.0
        elif attack == 'iterative':
            def wrapper(x):
                return q_func(x, num_actions, scope="q_func", reuse=True, concat_softmax=True)
            adversary = BasicIterativeMethod(CallableModelWrapper(wrapper, 'probs'), sess=U.get_session())
            adv_observations = adversary.generate(observations_ph.get(), eps=1.0/255.0,
                                                  clip_min=0, clip_max=1.0) * 255.0
        else:  # 'cwl2' (validated above)
            def wrapper(x):
                return q_func(x, num_actions, scope="q_func", reuse=True)
            adversary = CarliniWagnerL2(CallableModelWrapper(wrapper, 'logits'), sess=U.get_session())
            cw_params = {'binary_search_steps': 1,
                         'max_iterations': 100,
                         'learning_rate': 0.1,
                         'initial_const': 10,
                         'clip_min': 0,
                         'clip_max': 1.0}
            adv_observations = adversary.generate(observations_ph.get(), **cw_params) * 255.0

        craft_adv_obs = U.function(inputs=[observations_ph, stochastic_ph, update_eps_ph],
                                   outputs=adv_observations,
                                   givens={update_eps_ph: -1.0, stochastic_ph: True},
                                   updates=[update_eps_expr])

        return act, craft_adv_obs
 def mode(self):
     """Return the modes of all `self.categoricals`, stacked on the last axis as int32."""
     per_categorical_modes = [dist.mode() for dist in self.categoricals]
     stacked_modes = tf.stack(per_categorical_modes, axis=-1)
     return tf.cast(stacked_modes, tf.int32)
Exemplo n.º 5
0
def single_level_feature_crop(features, level_boxes, detection_prior_levels,
                              min_mask_level, mask_crop_size):
  """Gather a square feature crop for every detection from its FPN level.

  The feature tensor is flattened to [-1, num_downsample_channels] and the
  crop is read with a single `tf.gather`, using flat indices of the form
  batch * batch_stride + level * level_stride + y * row_stride + x.

  Args:
    features: a float tensor of shape [batch_size, num_levels,
      max_feature_size, max_feature_size, num_downsample_channels].
    level_boxes: a float Tensor of the level boxes to crop from, of shape
      [batch_size, num_instances, 4]. It is cast to int32; column 0 is used
      as the starting y index and column 1 as the starting x index.
    detection_prior_levels: an int Tensor of instance assigned level of shape
      [batch_size, num_instances].
    min_mask_level: minimum FPN level to crop mask feature from; subtracted
      from `detection_prior_levels` to index the level axis.
    mask_crop_size: an int of mask crop size.

  Returns:
    crop_features: a float Tensor of shape [batch_size * num_instances,
      mask_crop_size, mask_crop_size, num_downsample_channels]. This is the
      instance feature crop.
  """
  (batch_size, num_levels, max_feature_size,
   _, num_downsample_channels) = features.get_shape().as_list()
  _, num_of_instances, _ = level_boxes.get_shape().as_list()
  level_boxes = tf.cast(level_boxes, tf.int32)
  assert num_of_instances == detection_prior_levels.get_shape().as_list()[1]

  # Expand each start coordinate into the full run of `mask_crop_size`
  # consecutive indices along a new trailing axis.
  x_start = level_boxes[:, :, 1]
  y_start = level_boxes[:, :, 0]
  x_indices = tf.stack(
      [x_start + step for step in range(mask_crop_size)], axis=2)
  y_indices = tf.stack(
      [y_start + step for step in range(mask_crop_size)], axis=2)

  level_index = detection_prior_levels - min_mask_level

  # Strides of the flattened [batch, level, y, x] layout.
  row_stride = max_feature_size
  level_stride = max_feature_size * row_stride
  batch_stride = num_levels * level_stride

  # Each term is tiled up to [batch, instance, crop_y, crop_x] before summing.
  batch_offset = tf.tile(
      tf.reshape(tf.range(batch_size) * batch_stride,
                 [batch_size, 1, 1, 1]),
      [1, num_of_instances, mask_crop_size, mask_crop_size])
  level_offset = tf.tile(
      tf.reshape(level_index * level_stride,
                 [batch_size, num_of_instances, 1, 1]),
      [1, 1, mask_crop_size, mask_crop_size])
  y_offset = tf.tile(
      tf.reshape(y_indices * row_stride,
                 [batch_size, num_of_instances, mask_crop_size, 1]),
      [1, 1, 1, mask_crop_size])
  x_offset = tf.tile(
      tf.reshape(x_indices,
                 [batch_size, num_of_instances, 1, mask_crop_size]),
      [1, 1, mask_crop_size, 1])
  flat_indices = tf.reshape(
      batch_offset + level_offset + y_offset + x_offset, [-1])

  flat_features = tf.reshape(features, [-1, num_downsample_channels])
  gathered = tf.gather(flat_features, flat_indices)
  return tf.reshape(
      gathered,
      [batch_size * num_of_instances,
       mask_crop_size, mask_crop_size,
       num_downsample_channels])
Exemplo n.º 6
0
    def _createStackBidirectionalDynamicRNN(self,
                                            use_gpu,
                                            use_shape,
                                            use_state_tuple,
                                            initial_states_fw=None,
                                            initial_states_bw=None,
                                            scope=None):
        """Build a two-layer stacked bidirectional dynamic RNN for testing.

        Creates forward and backward LSTM cell stacks with 2 and 3 units,
        feeds them a time-major stack of one shared placeholder (transposed
        to batch-major) and asserts the static shape of the outputs.

        Args:
          use_gpu: unused.
          use_shape: if true, the input placeholder has a fixed batch
            dimension; otherwise the batch dimension is None.
          use_state_tuple: unused; the cells are always built with
            `state_is_tuple=False`.
          initial_states_fw: forwarded to `stack_bidirectional_dynamic_rnn`.
          initial_states_bw: forwarded to `stack_bidirectional_dynamic_rnn`.
          scope: forwarded to `stack_bidirectional_dynamic_rnn`.

        Returns:
          A tuple `(input_value, inputs, outputs, st_fw, st_bw,
          sequence_length)`, where `input_value` is a random numpy array of
          shape (batch_size, input_size).
        """
        del use_gpu, use_state_tuple  # Accepted but not used.

        self.layers = [2, 3]
        input_size, batch_size, max_length = 5, 2, 8

        initializer = tf.random_uniform_initializer(-0.01,
                                                    0.01,
                                                    seed=self._seed)
        sequence_length = tf.placeholder(tf.int64)

        def build_cell_stack():
            """Create one LSTM cell per entry of `self.layers`."""
            return [
                rnn_cell.LSTMCell(units,
                                  input_size,
                                  initializer=initializer,
                                  state_is_tuple=False)
                for units in self.layers
            ]

        self.cells_fw = build_cell_stack()
        self.cells_bw = build_cell_stack()

        # The same placeholder object is repeated for every time step.
        if use_shape:
            step_shape = (batch_size, input_size)
        else:
            step_shape = (None, input_size)
        step_input = tf.placeholder(tf.float32, shape=step_shape)
        inputs = [step_input] * max_length

        # Stack to (time, batch, input) and switch to (batch, time, input).
        inputs_c = tf.transpose(tf.stack(inputs), [1, 0, 2])
        outputs, st_fw, st_bw = contrib_rnn.stack_bidirectional_dynamic_rnn(
            self.cells_fw,
            self.cells_bw,
            inputs_c,
            initial_states_fw=initial_states_fw,
            initial_states_bw=initial_states_bw,
            dtype=tf.float32,
            sequence_length=sequence_length,
            scope=scope)

        # Outputs has shape (batch_size, max_length, 2 * layers[-1]).
        expected_shape = [
            batch_size if use_shape else None, max_length,
            2 * self.layers[-1]
        ]
        self.assertAllEqual(outputs.get_shape().as_list(), expected_shape)

        input_value = np.random.randn(batch_size, input_size)

        return input_value, inputs, outputs, st_fw, st_bw, sequence_length
Exemplo n.º 7
0
def iou_loss(pred_boxes: FloatType,
             target_boxes: FloatType,
             iou_type: Text = 'iou') -> tf.Tensor:
    """Compute an IoU-style loss between predicted and target boxes.

    The last axis of both inputs is split into groups of four coordinates,
    one group per anchor. For every anchor the overlap measure selected by
    `iou_type` is obtained from `_iou_per_anchor` and turned into a loss of
    `1 - measure`. Anchors whose target box has zero area are masked out and
    contribute zero loss. The per-anchor losses are summed.

    Args:
      pred_boxes: predicted boxes, with coordinate [y_min, x_min, y_max,
        x_max]*. It can be multiple anchors, with each anchor box having
        four coordinates.
      target_boxes: target boxes, with coordinate [y_min, x_min, y_max,
        x_max]*. It can be multiple anchors, with each anchor box having
        four coordinates.
      iou_type: one of ['iou', 'ciou', 'diou', 'giou'].

    Returns:
      IoU loss float `Tensor`.

    Raises:
      ValueError: if `iou_type` is not one of the supported names.
    """
    if iou_type not in ('iou', 'ciou', 'diou', 'giou'):
        raise ValueError(
            'Unknown loss_type {}, not iou/ciou/diou/giou'.format(iou_type))

    pred_boxes = tf.convert_to_tensor(pred_boxes, tf.float32)
    target_boxes = tf.cast(target_boxes, pred_boxes.dtype)

    # One tensor per coordinate: (y_min, x_min, y_max, x_max) per anchor.
    pred_coords = tf.unstack(pred_boxes, None, axis=-1)
    target_coords = tf.unstack(target_boxes, None, axis=-1)
    assert len(pred_coords) == len(target_coords)
    assert len(pred_coords) % 4 == 0

    per_anchor_losses = []
    for start in range(0, len(pred_coords), 4):
        anchor_pred = pred_coords[start:start + 4]
        anchor_target = target_coords[start:start + 4]

        # Targets with zero area are treated as padding.
        t_ymin, t_xmin, t_ymax, t_xmax = anchor_target
        target_area = (t_ymax - t_ymin) * (t_xmax - t_xmin)
        valid = tf.cast(tf.not_equal(target_area, 0), t_ymin.dtype)

        # Loss should be valid * (1 - iou) = valid - masked_iou; zeroing the
        # predictions of padded anchors yields the masked IoU.
        masked_pred = [coord * valid for coord in anchor_pred]
        masked_iou = tf.squeeze(
            _iou_per_anchor(masked_pred, anchor_target, iou_type))
        per_anchor_losses.append(valid - masked_iou)

    if len(per_anchor_losses) == 1:
        return per_anchor_losses[0]
    return tf.reduce_sum(tf.stack(per_anchor_losses), 0)
Exemplo n.º 8
0
def build_bifpn_layer(feats, fpn_name, fpn_config, is_training, input_size,
                      fpn_num_filters, min_level, max_level, separable_conv,
                      apply_bn_for_resampling, conv_after_downsample,
                      use_native_resize_op, conv_bn_relu_pattern,
                      pooling_type):
    """Builds a feature pyramid given previous feature pyramid and config.

    For every node of the config, the referenced input features are
    resampled to the node's width, fused according to
    `config.weight_method` ('attn', 'fastattn' or 'sum'), and passed through
    a 3x3 convolution and batch norm. Each new node is appended to `feats`
    (the list is modified in place) so later nodes can reference it.

    Returns:
      A dict mapping each level in [min_level, max_level] to the most
      recently built node whose 'width_ratio' equals `F(level)`; levels with
      no matching node are absent.

    Raises:
      ValueError: if `config.weight_method` is not a known method.
    """
    config = fpn_config if fpn_config else get_fpn_config(fpn_name)

    num_output_connections = [0] * len(feats)
    for node_idx, fnode in enumerate(config.nodes):
        with tf.variable_scope('fnode{}'.format(node_idx)):
            logging.info('fnode %d : %s', node_idx, fnode)
            new_node_width = int(fnode['width_ratio'] * input_size)
            input_offsets = fnode['inputs_offsets']

            # Bring every input to the width and filter count of this node.
            nodes = []
            for idx, input_offset in enumerate(input_offsets):
                source_node = feats[input_offset]
                num_output_connections[input_offset] += 1
                resample_name = '{}_{}_{}'.format(idx, input_offset,
                                                  len(feats))
                nodes.append(
                    resample_feature_map(
                        source_node, resample_name, new_node_width,
                        fpn_num_filters, apply_bn_for_resampling,
                        is_training, conv_after_downsample,
                        use_native_resize_op, pooling_type))

            # Combine all nodes.
            dtype = nodes[0].dtype
            weight_method = config.weight_method
            if weight_method == 'sum':
                new_node = tf.add_n(nodes)
            elif weight_method == 'attn':
                edge_weights = [
                    tf.cast(tf.Variable(1.0, name='WSM'), dtype=dtype)
                    for _ in input_offsets
                ]
                normalized_weights = tf.nn.softmax(tf.stack(edge_weights))
                stacked_nodes = tf.stack(nodes, axis=-1)
                new_node = tf.reduce_sum(
                    tf.multiply(stacked_nodes, normalized_weights), -1)
            elif weight_method == 'fastattn':
                edge_weights = [
                    tf.nn.relu(
                        tf.cast(tf.Variable(1.0, name='WSM'), dtype=dtype))
                    for _ in input_offsets
                ]
                weights_sum = tf.add_n(edge_weights)
                new_node = tf.add_n([
                    node * weight / (weights_sum + 0.0001)
                    for node, weight in zip(nodes, edge_weights)
                ])
            else:
                raise ValueError('unknown weight_method {}'.format(
                    config.weight_method))

            with tf.variable_scope('op_after_combine{}'.format(len(feats))):
                # Without the conv-bn-relu pattern the activation is applied
                # before the convolution and the convolution keeps its bias.
                if not conv_bn_relu_pattern:
                    new_node = utils.relu_fn(new_node)

                conv_op = (functools.partial(tf.layers.separable_conv2d,
                                             depth_multiplier=1)
                           if separable_conv else tf.layers.conv2d)

                new_node = conv_op(new_node,
                                   filters=fpn_num_filters,
                                   kernel_size=(3, 3),
                                   padding='same',
                                   use_bias=not conv_bn_relu_pattern,
                                   name='conv')

                new_node = utils.batch_norm_relu(
                    new_node,
                    is_training_bn=is_training,
                    relu=bool(conv_bn_relu_pattern),
                    data_format='channels_last',
                    name='bn')

            feats.append(new_node)
            num_output_connections.append(0)

    # For each level pick the latest node with the matching width ratio,
    # scanning the config nodes from newest to oldest.
    output_feats = {}
    newest_first = list(reversed(config.nodes))
    for level in range(min_level, max_level + 1):
        for steps_back, fnode in enumerate(newest_first):
            if fnode['width_ratio'] == F(level):
                output_feats[level] = feats[-1 - steps_back]
                break
    return output_feats
Exemplo n.º 9
0
    def generate_trips(self, min_gap=1, max_gap=5):
        """Generate a tf Dataset of training samples built from offset frames.

        For every stride from `min_gap` to `max_gap` (inclusive) and every
        valid position, four frames are stacked along axis 1 in this order:
        the frame `stride` before the centre, the frame `stride` after the
        centre, the centre frame itself, and a frame at the centre index
        plus a random jitter (drawn with minval=-40, maxval=40 and clamped
        to the valid index range). The samples of all strides are
        concatenated.

        Args:
          min_gap: (int) the minimum offset between two frames of a sample.
          max_gap: (int) the maximum offset between two frames of a sample.
            Asserted to be smaller than `self.length()`.

        Returns:
          A tf.data.Dataset of ViewTrip elements built from the timestamp,
          rgb, pano, depth, normal and pose sequences.
        """
        def mapper(timestamps, rgbs, panos, depths, normals, poses):
            """A function mapping a data tuple to ViewTrip."""
            return ViewTrip(self.scene_id, self.sequence_id, timestamps,
                            rgbs, panos, depths, normals,
                            tf.zeros([1]), poses, self.intrinsics[0],
                            self.resolution[0])

        def stack_frames(sequence, stride, rand_inds):
            """Stack the before/after/centre/jittered frames of a sequence."""
            return tf.stack([
                sequence[:-2 * stride], sequence[2 * stride:],
                sequence[stride:-stride],
                tf.gather(sequence, rand_inds)
            ],
                            axis=1)

        gap_check = tf.Assert(tf.less(max_gap, self.length()),
                              [max_gap, self.length()])
        with tf.control_dependencies([gap_check]):
            # One bucket per field, in the order expected by `mapper`:
            # timestamp, rgb, pano, depth, normal, pose.
            buckets = [[] for _ in range(6)]
            # generate samples with an offset that ranges
            # from 'min_gap' to 'max_gap'.
            for stride in range(min_gap, max_gap + 1):
                centre_inds = tf.range(stride, self.length() - stride)
                jitter = tf.random.uniform(
                    minval=-40,
                    maxval=40,
                    shape=[self.length() - 2 * stride],
                    dtype=tf.int32)
                rand_inds = tf.minimum(tf.maximum(centre_inds + jitter, 0),
                                       self.length() - 1)
                sequences = (self.timestamp, self.rgb, self.pano,
                             self.depth, self.normal, self.pose)
                for bucket, sequence in zip(buckets, sequences):
                    bucket.append(stack_frames(sequence, stride, rand_inds))

            merged = tuple(tf.concat(bucket, 0) for bucket in buckets)
            dataset = tf.data.Dataset.from_tensor_slices(merged)
            return dataset.map(mapper)
Exemplo n.º 10
0
    def get(self):
        """Provide input data to the graph.

        Builds a queue-based reader over the fixed-length binary records in
        ``self.path_to_db``, decodes the keypoints and the image of one record,
        derives the dependent items (root-relative / normalized / local /
        canonical coordinates, optional hand crop, score maps) and batches the
        result.

        Record layout, in the order it is consumed below:
          1. ``3 * num_kp`` float32 values, reshaped to ``[num_kp, 3]`` (xyz).
          2. ``3 * num_kp`` float32 values, reshaped to ``[num_kp, 3]``; the
             first two columns are used as uv, the third as a visibility flag.
          3. ``image_size[0] * image_size[1] * 3`` uint8 values (the image).

        Returns:
            dict mapping item names to batched tensors, as produced by
            ``tf.train.shuffle_batch_join`` (if ``self.shuffle``) or
            ``tf.train.batch_join``.

        Raises:
            AssertionError: if the number of bytes consumed while decoding does
                not match the computed record size.
        """
        # Size of one record: every keypoint value occupies 4 bytes (float32),
        # every image value occupies 1 byte (uint8).
        encoding_bytes = 4
        kp_xyz_entries = 3 * self.num_kp
        kp_uv_entries = 2 * self.num_kp
        kp_vis_entries = self.num_kp
        image_bytes = self.image_size[0] * self.image_size[1] * 3

        record_bytes = 0
        record_bytes += encoding_bytes*kp_xyz_entries
        record_bytes += encoding_bytes*kp_uv_entries
        record_bytes += encoding_bytes*kp_vis_entries
        record_bytes += image_bytes

        # ---- READ DATA ITEMS ----
        # Start reader
        reader = tf.FixedLengthRecordReader(header_bytes=0, record_bytes=record_bytes)
        _, value = reader.read(tf.train.string_input_producer([self.path_to_db]))

        # decode to floats; `bytes_read` tracks the byte offset into the record
        bytes_read = 0
        data_dict = dict()
        record_bytes_float32 = tf.decode_raw(value, tf.float32)

        # 1. Read keypoint xyz (offset is divided by 4 to index float32 values)
        keypoint_xyz21 = tf.reshape(tf.slice(record_bytes_float32, [bytes_read//4], [kp_xyz_entries]), [self.num_kp, 3])
        bytes_read += encoding_bytes*kp_xyz_entries
        keypoint_xyz21 /= 1000.0  # scale to meters
        keypoint_xyz21 = self.convert_kp(keypoint_xyz21)

        # calculate wrist coord: extrapolate from keypoint 16 through keypoint 0
        # and store the result in slot 0
        if self.use_wrist_coord:
            wrist_xyz = keypoint_xyz21[16, :] + 2.0*(keypoint_xyz21[0, :] - keypoint_xyz21[16, :])
            keypoint_xyz21 = tf.concat([tf.expand_dims(wrist_xyz, 0),
                                        keypoint_xyz21[1:, :]], 0)

        data_dict['keypoint_xyz21'] = keypoint_xyz21

        # 2. Read keypoint uv AND VIS (one [num_kp, 3] block)
        keypoint_uv_vis21 = tf.reshape(tf.slice(record_bytes_float32, [bytes_read//4], [kp_uv_entries+kp_vis_entries]), [self.num_kp, 3])
        bytes_read += encoding_bytes*(kp_uv_entries+kp_vis_entries)
        keypoint_uv_vis21 = self.convert_kp(keypoint_uv_vis21)
        keypoint_uv21 = keypoint_uv_vis21[:, :2]
        keypoint_vis21 = tf.equal(keypoint_uv_vis21[:, 2], 1.0)

        # calculate wrist vis
        if self.use_wrist_coord:
            wrist_vis = tf.logical_or(keypoint_vis21[16], keypoint_vis21[0])
            keypoint_vis21 = tf.concat([tf.expand_dims(wrist_vis, 0),
                                        keypoint_vis21[1:]], 0)

            wrist_uv = keypoint_uv21[16, :] + 2.0*(keypoint_uv21[0, :] - keypoint_uv21[16, :])
            keypoint_uv21 = tf.concat([tf.expand_dims(wrist_uv, 0),
                                       keypoint_uv21[1:, :]], 0)

        data_dict['keypoint_vis21'] = keypoint_vis21

        if self.coord_uv_noise:
            # The noise must have exactly the shape of the tensor it is added
            # to; a hard-coded [42, 2] breaks for any other keypoint count.
            noise = tf.truncated_normal(tf.shape(keypoint_uv21), mean=0.0, stddev=self.coord_uv_noise_sigma)
            keypoint_uv21 += noise

        data_dict['keypoint_uv21'] = keypoint_uv21

        # decode to uint8
        record_bytes_uint8 = tf.decode_raw(value, tf.uint8)

        # 4. Read image (byte offset can be used directly on the uint8 view)
        image = tf.reshape(tf.slice(record_bytes_uint8, [bytes_read], [image_bytes]),
                           [self.image_size[0], self.image_size[1], 3])
        image = tf.cast(image, tf.float32)
        bytes_read += image_bytes

        # rescale to [-0.5, 0.5]
        image = image / 255.0 - 0.5
        if self.hue_aug:
            image = tf.image.random_hue(image, self.hue_aug_max)
        data_dict['image'] = image

        # ---- CONSTANTS ----
        # Camera intrinsics
        sx = 822.79041
        sy = 822.79041
        tx = 318.47345
        ty = 250.31296
        data_dict['cam_mat'] = tf.constant([[sx, 0.0, tx], [0.0, sy, ty], [0.0, 0.0, 1.0]])

        # Hand side: this dataset only contains left hands
        data_dict['hand_side'] = tf.one_hot(tf.constant(0, dtype=tf.int32), depth=2, on_value=1.0, off_value=0.0, dtype=tf.float32)

        assert bytes_read == record_bytes, "Doesnt add up."

        # ---- DEPENDENT DATA ITEMS: XYZ representations ----
        # make coords relative to root joint
        kp_coord_xyz_root = keypoint_xyz21[0, :]  # this is the palm coord
        kp_coord_xyz21_rel = keypoint_xyz21 - kp_coord_xyz_root  # relative coords in metric coords
        index_root_bone_length = tf.sqrt(tf.reduce_sum(tf.square(kp_coord_xyz21_rel[12, :] - kp_coord_xyz21_rel[11, :])))
        data_dict['keypoint_scale'] = index_root_bone_length
        data_dict['keypoint_xyz21_normed'] = kp_coord_xyz21_rel / index_root_bone_length  # normalized by length of 12->11

        # calculate local coordinates
        kp_coord_xyz21_local = bone_rel_trafo(data_dict['keypoint_xyz21_normed'])
        kp_coord_xyz21_local = tf.squeeze(kp_coord_xyz21_local)
        data_dict['keypoint_xyz21_local'] = kp_coord_xyz21_local

        # calculate viewpoint and coords in canonical coordinates
        kp_coord_xyz21_rel_can, rot_mat = canonical_trafo(data_dict['keypoint_xyz21_normed'])
        kp_coord_xyz21_rel_can, rot_mat = tf.squeeze(kp_coord_xyz21_rel_can), tf.squeeze(rot_mat)
        data_dict['keypoint_xyz21_can'] = kp_coord_xyz21_rel_can
        data_dict['rot_mat'] = tf.matrix_inverse(rot_mat)

        # ---- DEPENDENT DATA ITEMS: HAND CROP ----
        if self.hand_crop:
            # crop center is keypoint 12, with (u, v) flipped to (v, u)
            crop_center = keypoint_uv21[12, ::-1]

            # catch problem, when no valid kp available (happens almost never)
            crop_center = tf.cond(tf.reduce_all(tf.is_finite(crop_center)), lambda: crop_center,
                                  lambda: tf.constant([0.0, 0.0]))
            crop_center.set_shape([2, ])

            if self.crop_center_noise:
                noise = tf.truncated_normal([2], mean=0.0, stddev=self.crop_center_noise_sigma)
                crop_center += noise

            crop_scale_noise = tf.constant(1.0)
            if self.crop_scale_noise:
                crop_scale_noise = tf.squeeze(tf.random_uniform([1], minval=1.0, maxval=1.2))

            if not self.use_wrist_coord:
                # the crop extent is always measured with the extrapolated
                # wrist position in slot 0
                wrist_uv = keypoint_uv21[16, :] + 2.0*(keypoint_uv21[0, :] - keypoint_uv21[16, :])
                keypoint_uv21 = tf.concat([tf.expand_dims(wrist_uv, 0),
                                           keypoint_uv21[1:, :]], 0)

            # select visible coords only
            kp_coord_h = tf.boolean_mask(keypoint_uv21[:, 1], keypoint_vis21)
            kp_coord_w = tf.boolean_mask(keypoint_uv21[:, 0], keypoint_vis21)
            kp_coord_hw = tf.stack([kp_coord_h, kp_coord_w], 1)

            # determine size of crop (measure spatial extent of hw coords first)
            min_coord = tf.maximum(tf.reduce_min(kp_coord_hw, 0), 0.0)
            max_coord = tf.minimum(tf.reduce_max(kp_coord_hw, 0), self.image_size)

            # find out larger distance wrt the center of crop
            crop_size_best = 2*tf.maximum(max_coord - crop_center, crop_center - min_coord)
            crop_size_best = tf.reduce_max(crop_size_best)
            crop_size_best = tf.minimum(tf.maximum(crop_size_best, 50.0), 500.0)

            # catch problem, when no valid kp available
            crop_size_best = tf.cond(tf.reduce_all(tf.is_finite(crop_size_best)), lambda: crop_size_best,
                                     lambda: tf.constant(200.0))
            crop_size_best.set_shape([])

            # calculate necessary scaling (clamped to [1, 10] before the noise)
            scale = tf.cast(self.crop_size, tf.float32) / crop_size_best
            scale = tf.minimum(tf.maximum(scale, 1.0), 10.0)
            scale *= crop_scale_noise
            data_dict['crop_scale'] = scale

            if self.crop_offset_noise:
                noise = tf.truncated_normal([2], mean=0.0, stddev=self.crop_offset_noise_sigma)
                crop_center += noise

            # Crop image
            img_crop = crop_image_from_xy(tf.expand_dims(image, 0), crop_center, self.crop_size, scale)
            data_dict['image_crop'] = tf.squeeze(img_crop)

            # Modify uv21 coordinates (crop_center is stored as (v, u))
            crop_center_float = tf.cast(crop_center, tf.float32)
            keypoint_uv21_u = (data_dict['keypoint_uv21'][:, 0] - crop_center_float[1]) * scale + self.crop_size // 2
            keypoint_uv21_v = (data_dict['keypoint_uv21'][:, 1] - crop_center_float[0]) * scale + self.crop_size // 2
            keypoint_uv21 = tf.stack([keypoint_uv21_u, keypoint_uv21_v], 1)
            data_dict['keypoint_uv21'] = keypoint_uv21

            # Modify camera intrinsics: build diag(scale, scale, 1) ...
            scale = tf.reshape(scale, [1, ])
            scale_matrix = tf.dynamic_stitch([[0], [1], [2],
                                              [3], [4], [5],
                                              [6], [7], [8]], [scale, [0.0], [0.0],
                                                               [0.0], scale, [0.0],
                                                               [0.0], [0.0], [1.0]])
            scale_matrix = tf.reshape(scale_matrix, [3, 3])

            # ... and the translation that moves the crop center to the middle
            # of the crop
            trans1 = crop_center_float[0] * scale - self.crop_size // 2
            trans2 = crop_center_float[1] * scale - self.crop_size // 2
            trans1 = tf.reshape(trans1, [1, ])
            trans2 = tf.reshape(trans2, [1, ])
            trans_matrix = tf.dynamic_stitch([[0], [1], [2],
                                              [3], [4], [5],
                                              [6], [7], [8]], [[1.0], [0.0], -trans2,
                                                               [0.0], [1.0], -trans1,
                                                               [0.0], [0.0], [1.0]])
            trans_matrix = tf.reshape(trans_matrix, [3, 3])

            data_dict['cam_mat'] = tf.matmul(trans_matrix, tf.matmul(scale_matrix, data_dict['cam_mat']))

        # ---- DEPENDENT DATA ITEMS: Scoremap from the SUBSET of 21 keypoints ----
        # create scoremaps from the subset of 2D annotation
        keypoint_hw21 = tf.stack([keypoint_uv21[:, 1], keypoint_uv21[:, 0]], -1)

        scoremap_size = self.image_size

        if self.hand_crop:
            scoremap_size = (self.crop_size, self.crop_size)

        scoremap = self.create_multiple_gaussian_map(keypoint_hw21,
                                                     scoremap_size,
                                                     self.sigma,
                                                     valid_vec=keypoint_vis21)

        if self.scoremap_dropout:
            # drop whole maps (noise_shape spans the last axis only), then
            # multiply by the keep probability to undo dropout's rescaling
            scoremap = tf.nn.dropout(scoremap, self.scoremap_dropout_prob,
                                     noise_shape=[1, 1, 21])
            scoremap *= self.scoremap_dropout_prob

        data_dict['scoremap'] = scoremap

        if self.random_crop_to_size:
            # NOTE(review): 'hand_parts' and 'hand_mask' are never stored in
            # data_dict by this method, so enabling this option raises a
            # KeyError here - confirm whether the option is meant to be
            # supported for this reader.
            tensor_stack = tf.concat([data_dict['image'],
                                      tf.expand_dims(tf.cast(data_dict['hand_parts'], tf.float32), -1),
                                      tf.cast(data_dict['hand_mask'], tf.float32)], 2)
            s = tensor_stack.get_shape().as_list()
            tensor_stack_cropped = tf.random_crop(tensor_stack,
                                                  [self.random_crop_size, self.random_crop_size, s[2]])
            data_dict = dict()  # delete everything else because the random cropping makes the data invalid anyway
            data_dict['image'], data_dict['hand_parts'], data_dict['hand_mask'] = tensor_stack_cropped[:, :, :3],\
                                                                                  tf.cast(tensor_stack_cropped[:, :, 3], tf.int32),\
                                                                                  tf.cast(tensor_stack_cropped[:, :, 4:], tf.int32)

        # batch all items together; names are zipped back on afterwards
        names, tensors = zip(*data_dict.items())

        if self.shuffle:
            tensors = tf.train.shuffle_batch_join([tensors],
                                                  batch_size=self.batch_size,
                                                  capacity=100,
                                                  min_after_dequeue=50,
                                                  enqueue_many=False)
        else:
            tensors = tf.train.batch_join([tensors],
                                          batch_size=self.batch_size,
                                          capacity=100,
                                          enqueue_many=False)

        return dict(zip(names, tensors))
Exemplo n.º 11
0
def transformer_model(input_tensor,
                      attention_mask=None,
                      hidden_size=768,
                      num_hidden_layers=12,
                      num_attention_heads=12,
                      intermediate_size=3072,
                      intermediate_act_fn=gelu,
                      hidden_dropout_prob=0.1,
                      attention_probs_dropout_prob=0.1,
                      initializer_range=0.02,
                      do_return_all_layers=False):
  """Multi-headed, multi-layer Transformer from "Attention is All You Need".

  This is almost an exact implementation of the original Transformer encoder,
  extended to also hand back the attention probabilities of every layer.

  See the original paper:
  https://arxiv.org/abs/1706.03762

  Also see:
  https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/models/transformer.py

  Args:
    input_tensor: float Tensor of shape [batch_size, seq_length, hidden_size].
    attention_mask: (optional) int32 Tensor of shape [batch_size, seq_length,
      seq_length], with 1 for positions that can be attended to and 0 in
      positions that should not be.
    hidden_size: int. Hidden size of the Transformer.
    num_hidden_layers: int. Number of layers (blocks) in the Transformer.
    num_attention_heads: int. Number of attention heads in the Transformer.
    intermediate_size: int. The size of the "intermediate" (a.k.a., feed
      forward) layer.
    intermediate_act_fn: function. The non-linear activation function to apply
      to the output of the intermediate/feed-forward layer.
    hidden_dropout_prob: float. Dropout probability for the hidden layers.
    attention_probs_dropout_prob: float. Dropout probability of the attention
      probabilities.
    initializer_range: float. Range of the initializer (stddev of truncated
      normal).
    do_return_all_layers: Whether to return the outputs of all layers or just
      the final layer.

  Returns:
    A tuple `(output, attn_maps)`:
      output: if `do_return_all_layers` is False, the final hidden layer
        reshaped back to the shape of `input_tensor`; otherwise the outputs of
        all layers, each reshaped that way and stacked along a new leading
        axis.
      attn_maps: the second return value of `attention_layer` for every layer,
        stacked along a new leading axis (one entry per layer).

  Raises:
    ValueError: A Tensor shape or parameter is invalid.
  """
  if hidden_size % num_attention_heads != 0:
    raise ValueError(
        "The hidden size (%d) is not a multiple of the number of attention "
        "heads (%d)" % (hidden_size, num_attention_heads))

  attention_head_size = int(hidden_size / num_attention_heads)
  input_shape = get_shape_list(input_tensor, expected_rank=3)
  batch_size = input_shape[0]
  seq_length = input_shape[1]
  input_width = input_shape[2]

  # The Transformer performs sum residuals on all layers so the input needs
  # to be the same as the hidden size.
  if input_width != hidden_size:
    raise ValueError("The width of the input tensor (%d) != hidden size (%d)" %
                     (input_width, hidden_size))

  # We keep the representation as a 2D tensor to avoid re-shaping it back and
  # forth from a 3D tensor to a 2D tensor. Re-shapes are normally free on
  # the GPU/CPU but may not be free on the TPU, so we want to minimize them to
  # help the optimizer.
  prev_output = reshape_to_matrix(input_tensor)

  attn_maps = []
  all_layer_outputs = []
  for layer_idx in range(num_hidden_layers):
    with tf.variable_scope("layer_%d" % layer_idx):
      layer_input = prev_output

      with tf.variable_scope("attention"):
        # Only self-attention is computed here, so its output feeds the
        # projection directly (there is never a second head list entry to
        # concatenate).
        with tf.variable_scope("self"):
          attention_output, probs = attention_layer(
              from_tensor=layer_input,
              to_tensor=layer_input,
              attention_mask=attention_mask,
              num_attention_heads=num_attention_heads,
              size_per_head=attention_head_size,
              attention_probs_dropout_prob=attention_probs_dropout_prob,
              initializer_range=initializer_range,
              do_return_2d_tensor=True,
              batch_size=batch_size,
              from_seq_length=seq_length,
              to_seq_length=seq_length)
          attn_maps.append(probs)

        # Run a linear projection of `hidden_size` then add a residual
        # with `layer_input`.
        with tf.variable_scope("output"):
          attention_output = tf.layers.dense(
              attention_output,
              hidden_size,
              kernel_initializer=create_initializer(initializer_range))
          attention_output = dropout(attention_output, hidden_dropout_prob)
          attention_output = layer_norm(attention_output + layer_input)

      # The activation is only applied to the "intermediate" hidden layer.
      with tf.variable_scope("intermediate"):
        intermediate_output = tf.layers.dense(
            attention_output,
            intermediate_size,
            activation=intermediate_act_fn,
            kernel_initializer=create_initializer(initializer_range))

      # Down-project back to `hidden_size` then add the residual.
      with tf.variable_scope("output"):
        layer_output = tf.layers.dense(
            intermediate_output,
            hidden_size,
            kernel_initializer=create_initializer(initializer_range))
        layer_output = dropout(layer_output, hidden_dropout_prob)
        layer_output = layer_norm(layer_output + attention_output)
        prev_output = layer_output
        all_layer_outputs.append(layer_output)

  attn_maps = tf.stack(attn_maps, 0)
  if do_return_all_layers:
    return tf.stack([reshape_from_matrix(layer, input_shape)
                     for layer in all_layer_outputs], 0), attn_maps
  else:
    return reshape_from_matrix(prev_output, input_shape), attn_maps
Exemplo n.º 12
0
def _generate_detections_tf(cls_outputs,
                            box_outputs,
                            anchor_boxes,
                            indices,
                            classes,
                            image_id,
                            image_scale,
                            min_score_thresh=0.2,
                            max_boxes_to_draw=50,
                            soft_nms_sigma=0.0,
                            iou_threshold=0.5,
                            use_native_nms=True):
    """Turn top-k model outputs and anchors into NMS-filtered detections.

    Args:
      cls_outputs: class scores (logits) of the N selected top-k anchors over
        all feature levels; a sigmoid is applied to them here.
      box_outputs: [N, 4] box regression outputs of the selected anchors.
      anchor_boxes: anchors of all feature levels; the rows picked by
        `indices` are used.
      indices: [N] indices from the top-k selection.
      classes: [N] class prediction of every selected anchor.
      image_id: an integer number to specify the image id.
      image_scale: scale between original image and detector input image; the
        box values are multiplied by it.
      min_score_thresh: score threshold handed to the native NMS op.
      max_boxes_to_draw: maximum number of boxes the native NMS op returns.
      soft_nms_sigma: Soft-NMS sigma handed to the native NMS op (see Bodla et
        al, https://arxiv.org/abs/1704.04503).
      iou_threshold: IOU threshold above which boxes are considered to overlap
        too much.
      use_native_nms: if True use
        `tf.image.non_max_suppression_with_scores`, otherwise `nms_tf`. The
        `nms_tf` path only receives `iou_threshold`; `min_score_thresh`,
        `max_boxes_to_draw` and `soft_nms_sigma` are not used there.

    Returns:
      detections: a tensor with one row per kept box holding
        [image_id, box[0], box[1], box[2] - box[0], box[3] - box[1], score,
        class + 1], where the four box-derived values are multiplied by
        `image_scale` (presumably [y, x, height, width] - depends on the box
        layout returned by `decode_box_outputs_tf`).
    """
    logging.info('Using tf version of post-processing.')
    selected_anchors = tf.gather(anchor_boxes, indices)

    scores = tf.math.sigmoid(cls_outputs)

    # apply bounding box regression to anchors; the decoder is fed the
    # transposed tensors
    box_outputs_t = tf.transpose(box_outputs, [1, 0])
    anchors_t = tf.transpose(selected_anchors, [1, 0])
    boxes = decode_box_outputs_tf(box_outputs_t, anchors_t)

    if not use_native_nms:
        logging.info('Using customized nms.')
        # pack boxes and scores into one [N, 5] tensor for the custom NMS
        score_column = tf.expand_dims(scores, axis=1)
        boxes_with_scores = tf.concat([boxes, score_column], axis=1)
        top_detection_idx = nms_tf(boxes_with_scores, iou_threshold)
        kept = tf.gather(boxes_with_scores, top_detection_idx)
        scores = kept[:, 4]
        boxes = kept[:, :4]
    else:
        logging.info('Using native nms.')
        top_detection_idx, scores = tf.image.non_max_suppression_with_scores(
            boxes,
            scores,
            max_boxes_to_draw,
            iou_threshold=iou_threshold,
            score_threshold=min_score_thresh,
            soft_nms_sigma=soft_nms_sigma)
        boxes = tf.gather(boxes, top_detection_idx)

    # convert the two corners into first corner + extent
    box_height = boxes[:, 2] - boxes[:, 0]
    box_width = boxes[:, 3] - boxes[:, 1]

    num_kept = tf.size(top_detection_idx)
    id_column = tf.cast(tf.repeat(image_id, num_kept), tf.float32)
    # class ids are shifted by one
    class_column = tf.cast(tf.gather(classes, top_detection_idx) + 1,
                           tf.float32)

    columns = [
        id_column,
        boxes[:, 0] * image_scale,
        boxes[:, 1] * image_scale,
        box_height * image_scale,
        box_width * image_scale,
        scores,
        class_column,
    ]
    return tf.stack(columns, axis=1)
Exemplo n.º 13
0
def _generate_detections_tf(cls_outputs,
                            box_outputs,
                            anchor_boxes,
                            indices,
                            classes,
                            image_id,
                            image_scale,
                            image_size,
                            min_score_thresh=MIN_SCORE_THRESH,
                            max_boxes_to_draw=MAX_DETECTIONS_PER_IMAGE,
                            soft_nms_sigma=0.25,
                            iou_threshold=0.5):
    """Turn top-k model outputs and anchors into NMS-filtered detections.

    Args:
      cls_outputs: class scores (logits) of the N selected top-k anchors over
        all feature levels; a sigmoid is applied to them here.
      box_outputs: [N, 4] box regression outputs of the selected anchors.
      anchor_boxes: anchors of all feature levels; the rows picked by
        `indices` are used.
      indices: [N] indices from the top-k selection.
      classes: [N] class prediction of every selected anchor.
      image_id: the image id; it is tiled once per kept detection.
      image_scale: scale between original image and detector input image; the
        clipped box values are multiplied by it.
      image_size: a tuple (height, width) or an integer for image size; must
        be non-empty. It is normalised with `utils.parse_image_size`.
      min_score_thresh: score threshold handed to the NMS op.
      max_boxes_to_draw: maximum number of boxes the NMS op returns.
      soft_nms_sigma: Soft-NMS sigma handed to the NMS op (see Bodla et al,
        https://arxiv.org/abs/1704.04503).
      iou_threshold: IOU threshold above which boxes are considered to overlap
        too much.

    Returns:
      detections: a tensor with one row per kept box holding
        [image_id, ymin, xmin, ymax, xmax, score, class], where the box values
        are clipped to the image size and then multiplied by `image_scale`,
        and the class id is shifted by one.

    Raises:
      ValueError: if `image_size` is empty.
    """
    if not image_size:
        raise ValueError(
            'tf version generate_detection needs non-empty image_size')

    logging.info('Using tf version of post-processing.')
    selected_anchors = tf.gather(anchor_boxes, indices)

    scores = tf.math.sigmoid(cls_outputs)
    # apply bounding box regression to anchors
    boxes = decode_box_outputs_tf(box_outputs, selected_anchors)

    # TF API is slightly different from paper, here we follow the paper value:
    # https://github.com/tensorflow/tensorflow/issues/40253.
    top_detection_idx, scores = tf.image.non_max_suppression_with_scores(
        boxes,
        scores,
        max_boxes_to_draw,
        iou_threshold=iou_threshold,
        score_threshold=min_score_thresh,
        soft_nms_sigma=soft_nms_sigma)
    boxes = tf.gather(boxes, top_detection_idx)

    image_size = utils.parse_image_size(image_size)
    max_y, max_x = image_size[0], image_size[1]

    id_column = tf.cast(
        tf.tile(image_id, tf.shape(top_detection_idx)), tf.float32)

    # clip every box coordinate to the image, then rescale
    box_columns = [
        tf.clip_by_value(boxes[:, col], 0, upper) * image_scale
        for col, upper in enumerate((max_y, max_x, max_y, max_x))
    ]

    # class ids are shifted by one
    class_column = tf.cast(tf.gather(classes, top_detection_idx) + 1,
                           tf.float32)

    return tf.stack([id_column] + box_columns + [scores, class_column],
                    axis=1)
Exemplo n.º 14
0
def rnn(cell,
        inputs,
        sequence_length=None,
        initial_state=None,
        ff_keep_prob=1.,
        recur_keep_prob=1.,
        enforce_dropout=False,
        dtype=tf.float32,
        scope=None):
    """Unroll `cell` over `inputs` with a `tf.while_loop`.

    Args:
      cell: the recurrent cell; called as `cell(input_t, state)` and expected
        to provide `zero_state`, `output_size` and `state_size`.
      inputs: batch-major input tensor (B, T, D); it is transposed to
        time-major internally.
      sequence_length: (optional) per-example lengths. Steps at or beyond an
        example's length emit zeros and carry its state through unchanged.
      initial_state: (optional) initial state; `cell.zero_state` is used when
        it is None.
      ff_keep_prob: keep probability for dropout on the inputs; the same mask
        is shared over all time steps.
      recur_keep_prob: keep probability for the mask multiplied onto the state
        before every cell call.
      enforce_dropout: forwarded as `training` to `tf.layers.dropout`; pass
        None to use `tf.nn.dropout` instead.
      dtype: dtype of the zero state when `initial_state` is None.
      scope: variable scope name, 'RNN' when not given.

    Returns:
      A pair `(outputs, final_state)` with `outputs` transposed back to
      batch-major (B, T, D).

    Raises:
      ValueError: if neither `initial_state` nor `dtype` is provided.
    """
    inputs = tf.transpose(inputs, [1, 0, 2])  # (B,T,D) => (T,B,D)

    parallel_iterations = 32
    if sequence_length is not None:
        sequence_length = tf.to_int32(sequence_length)

    with tf.variable_scope(scope or 'RNN'):
        time_steps, batch_size, _ = tf.unstack(tf.shape(inputs), 3)
        # only the static depth is needed, for the input dropout mask shape
        _, _, static_depth = inputs.get_shape().as_list()

        if initial_state is None:
            if not dtype:
                raise ValueError(
                    'If no initial_state is provided, dtype must be.')
            state = cell.zero_state(batch_size, dtype)
        else:
            state = initial_state

        zero_output = tf.zeros(tf.stack([batch_size, cell.output_size]),
                               inputs.dtype)
        if sequence_length is not None:
            min_sequence_length = tf.reduce_min(sequence_length)
            max_sequence_length = tf.reduce_max(sequence_length)

        time = tf.constant(0, dtype=tf.int32, name='time')

        output_ta = tf.TensorArray(dtype=inputs.dtype,
                                   size=time_steps,
                                   tensor_array_name='dynamic_rnn_output')
        input_ta = tf.TensorArray(dtype=inputs.dtype,
                                  size=time_steps,
                                  tensor_array_name='dynamic_rnn_input')

        # Input dropout: a leading 1 in the noise shape shares the mask across
        # time steps.
        if ff_keep_prob < 1:
            noise_shape = tf.stack([1, batch_size, static_depth])
            if enforce_dropout is None:
                inputs = tf.nn.dropout(inputs,
                                       ff_keep_prob,
                                       noise_shape=noise_shape)
            else:
                inputs = tf.layers.dropout(inputs,
                                           1 - ff_keep_prob,
                                           noise_shape=noise_shape,
                                           training=enforce_dropout)

        # Recurrent dropout: one mask, built once, applied at every step. Only
        # the last `output_size` columns of the state are masked; the leading
        # columns are multiplied by ones.
        if recur_keep_prob < 1:
            ones = tf.ones(tf.stack([batch_size, cell.output_size]))
            if enforce_dropout is None:
                recur_mask = tf.nn.dropout(ones, recur_keep_prob)
            else:
                recur_mask = tf.layers.dropout(ones,
                                               1 - recur_keep_prob,
                                               training=enforce_dropout)
            num_unmasked = cell.state_size // cell.output_size - 1
            state_dropout = tf.concat([ones] * num_unmasked + [recur_mask], 1)
        else:
            state_dropout = 1

        input_ta = input_ta.unstack(inputs)

        def _time_step(time, state, output_ta_t):
            """Run one step of the loop and record its output."""
            input_t = input_ta.read(time)

            def _run_cell():
                return cell(input_t, state * state_dropout)

            def _skip_step():
                # every sequence has ended: emit zeros, keep the state
                return zero_output, state

            def _step_with_masking():
                cell_output, cell_state = _run_cell()

                def _all_active():
                    return cell_output, cell_state

                def _some_finished():
                    # rows whose sequence has ended get zeros / the old state
                    masked_output = tf.where(time >= sequence_length,
                                             zero_output, cell_output)
                    masked_state = tf.where(time >= sequence_length, state,
                                            cell_state)
                    return masked_output, masked_state

                return tf.cond(time < min_sequence_length, _all_active,
                               _some_finished)

            if sequence_length is None:
                output, new_state = _run_cell()
            else:
                output, new_state = tf.cond(time >= max_sequence_length,
                                            _skip_step, _step_with_masking)

            output_ta_t = output_ta_t.write(time, output)

            return (time + 1, new_state, output_ta_t)

        _, final_state, output_final_ta = tf.while_loop(
            cond=lambda step, _state, _ta: step < time_steps,
            body=_time_step,
            loop_vars=(time, state, output_ta),
            parallel_iterations=parallel_iterations)

        stacked_outputs = output_final_ta.stack()

        outputs = tf.transpose(stacked_outputs, [1, 0, 2])  # (T,B,D) => (B,T,D)
        return outputs, final_state
Exemplo n.º 15
0
def _prepare_groundtruth_for_eval(detection_model, class_agnostic,
                                  max_number_of_boxes):
  """Gathers the model's groundtruth into a dict of batched eval tensors.

  Each per-image groundtruth list stored on `detection_model` is stacked along
  a new leading batch axis. Optional fields are only added when the model
  reports having them.

  Args:
    detection_model: A `DetectionModel` object.
    class_agnostic: Whether the detections are class_agnostic. When true, the
      one-hot class encoding is replaced by a single all-ones column, so every
      box ends up with class id 1.
    max_number_of_boxes: Max number of groundtruth boxes.

  Returns:
    groundtruth: A single dictionary (this function does not return a tuple)
    with the following fields:
      'groundtruth_boxes': [batch_size, num_boxes, 4] float32 tensor of boxes,
        in normalized coordinates.
      'groundtruth_classes': [batch_size, num_boxes] int64 tensor of 1-indexed
        classes.
      'groundtruth_masks': 4D float32 tensor of instance masks (if provided in
        groundtruth)
      'groundtruth_is_crowd': [batch_size, num_boxes] bool tensor indicating
        is_crowd annotations (if provided in groundtruth).
      'groundtruth_area': [batch_size, num_boxes] float32 tensor indicating
        the area (in the original absolute coordinates) of annotations (if
        provided in groundtruth).
      'num_groundtruth_boxes': [batch_size] tensor in which every entry is
        `max_number_of_boxes`.
      'groundtruth_keypoints': [batch_size, num_boxes, num_keypoints, 2] float32
        tensor of keypoints (if provided in groundtruth).
      'groundtruth_keypoint_visibilities': stacked keypoint visibilities (if
        provided in groundtruth).
      'groundtruth_group_of': [batch_size, num_boxes] bool tensor indicating
        group_of annotations (if provided in groundtruth).
      'groundtruth_labeled_classes': [batch_size, num_classes] int64
        tensor of 1-indexed classes, zero-padded on the right when the batch
        holds more than one image.
  """
  input_data_fields = fields.InputDataFields()

  def _stacked(field_name):
    # Batches the per-image tensors registered under `field_name`.
    return tf.stack(detection_model.groundtruth_lists(field_name))

  boxes = _stacked(fields.BoxListFields.boxes)
  boxes_shape = tf.shape(boxes)
  if class_agnostic:
    # For class-agnostic models, groundtruth one-hot encodings collapse to all
    # ones.
    one_hot_classes = tf.ones([boxes_shape[0], boxes_shape[1], 1])
  else:
    one_hot_classes = _stacked(fields.BoxListFields.classes)
  label_id_offset = 1  # Applying label id offset (b/63711816)
  class_ids = tf.argmax(one_hot_classes, axis=2) + label_id_offset

  groundtruth = {
      input_data_fields.groundtruth_boxes: boxes,
      input_data_fields.groundtruth_classes: class_ids
  }

  # Optional per-box annotations, copied only when the model carries them.
  if detection_model.groundtruth_has_field(fields.BoxListFields.masks):
    groundtruth[input_data_fields.groundtruth_instance_masks] = _stacked(
        fields.BoxListFields.masks)

  if detection_model.groundtruth_has_field(fields.BoxListFields.is_crowd):
    groundtruth[input_data_fields.groundtruth_is_crowd] = _stacked(
        fields.BoxListFields.is_crowd)

  if detection_model.groundtruth_has_field(input_data_fields.groundtruth_area):
    groundtruth[input_data_fields.groundtruth_area] = _stacked(
        input_data_fields.groundtruth_area)

  if detection_model.groundtruth_has_field(fields.BoxListFields.keypoints):
    groundtruth[input_data_fields.groundtruth_keypoints] = _stacked(
        fields.BoxListFields.keypoints)

  if detection_model.groundtruth_has_field(
      fields.BoxListFields.keypoint_visibilities):
    groundtruth[input_data_fields.groundtruth_keypoint_visibilities] = (
        _stacked(fields.BoxListFields.keypoint_visibilities))

  if detection_model.groundtruth_has_field(fields.BoxListFields.group_of):
    groundtruth[input_data_fields.groundtruth_group_of] = _stacked(
        fields.BoxListFields.group_of)

  if detection_model.groundtruth_has_field(
      fields.InputDataFields.groundtruth_labeled_classes):
    indicator_list = detection_model.groundtruth_lists(
        fields.InputDataFields.groundtruth_labeled_classes)
    # Turn each per-image indicator vector into the list of 1-indexed class
    # ids whose entry is set.
    labeled_classes = [
        tf.where(indicator)[:, 0] + label_id_offset
        for indicator in indicator_list
    ]
    if len(labeled_classes) > 1:
      # Images can have different numbers of labeled classes; right-pad each
      # id list with zeros up to num_classes so they can be stacked.
      num_classes = indicator_list[0].shape[0]
      labeled_classes = [
          tf.pad(ids, [[0, num_classes - tf.shape(ids)[0]]])
          for ids in labeled_classes
      ]
    groundtruth[input_data_fields.groundtruth_labeled_classes] = tf.stack(
        labeled_classes)

  groundtruth[input_data_fields.num_groundtruth_boxes] = tf.tile(
      [max_number_of_boxes], multiples=[boxes_shape[0]])
  return groundtruth
Exemplo n.º 16
0
def add_metric_fn_inputs(params,
                         cls_outputs,
                         box_outputs,
                         metric_fn_inputs,
                         max_detection_points=anchors.MAX_DETECTION_POINTS):
  """Flattens all pyramid levels, selects predictions, stores them for metrics.

  Side effects: the four result tensors are written into `metric_fn_inputs`,
  and when `params['data_format']` is 'channels_first' the entries of
  `cls_outputs` / `box_outputs` are replaced in place by their
  channels-last transposes.

  Args:
    params: a parameter dictionary that includes `min_level`, `max_level`,
      `batch_size`, `num_classes` and `data_format`.
    cls_outputs: an OrderDict with keys representing levels and values
      representing class logits; each value is reshaped here to
      [batch_size, -1, num_classes].
    box_outputs: an OrderDict with keys representing levels and values
      representing box regression targets; each value is reshaped here to
      [batch_size, -1, 4].
    metric_fn_inputs: a dictionary that will hold the selections under the
      keys 'cls_outputs_all', 'box_outputs_all', 'indices_all' and
      'classes_all'.
    max_detection_points: an integer specifying the maximum detection points
      to keep before NMS. Keep all anchors if max_detection_points <= 0.
  """
  batch_size = params['batch_size']
  num_classes = params['num_classes']

  # Collect every level as [batch, anchors_at_level, ...] and join them along
  # the anchor axis.
  per_level_cls = []
  per_level_box = []
  for level in range(params['min_level'], params['max_level'] + 1):
    if params['data_format'] == 'channels_first':
      # Move channels last; note this overwrites the caller's dict entries.
      cls_outputs[level] = tf.transpose(cls_outputs[level], [0, 2, 3, 1])
      box_outputs[level] = tf.transpose(box_outputs[level], [0, 2, 3, 1])

    per_level_cls.append(
        tf.reshape(cls_outputs[level], [batch_size, -1, num_classes]))
    per_level_box.append(tf.reshape(box_outputs[level], [batch_size, -1, 4]))
  cls_outputs_all = tf.concat(per_level_cls, 1)
  box_outputs_all = tf.concat(per_level_box, 1)

  if max_detection_points > 0:
    # Prune anchors and detections to only keep max_detection_points.
    # Due to some issues, top_k is currently slow in graph model.
    flat_logits = tf.reshape(cls_outputs_all, [batch_size, -1])
    _, topk_flat_idx = tf.math.top_k(
        flat_logits, k=max_detection_points, sorted=False)
    # A flat index over (anchor, class) decomposes into its two coordinates.
    indices = topk_flat_idx // num_classes
    classes = topk_flat_idx % num_classes
    anchor_class_idx = tf.stack([indices, classes], axis=2)
    selected_cls = tf.gather_nd(
        cls_outputs_all, anchor_class_idx, batch_dims=1)
    selected_box = tf.gather_nd(
        box_outputs_all, tf.expand_dims(indices, 2), batch_dims=1)
  else:
    # Keep all anchors, but for each anchor, just keep the max probability
    # over the classes.
    best_class = tf.math.argmax(
        cls_outputs_all, axis=-1, output_type=tf.int32)
    num_anchors = cls_outputs_all.shape[1]

    classes = best_class
    indices = tf.tile(
        tf.expand_dims(tf.range(num_anchors), axis=0), [batch_size, 1])
    selected_cls = tf.reduce_max(cls_outputs_all, -1)
    selected_box = box_outputs_all

  metric_fn_inputs['cls_outputs_all'] = selected_cls
  metric_fn_inputs['box_outputs_all'] = selected_box
  metric_fn_inputs['indices_all'] = indices
  metric_fn_inputs['classes_all'] = classes
Exemplo n.º 17
0
def det_post_process(params: Dict[Any, Any], cls_outputs: Dict[int, tf.Tensor],
                     box_outputs: Dict[int, tf.Tensor], scales: List[float],
                     min_score_thresh, max_boxes_to_draw):
    """Post-processes the box/class predictions into per-image detections.

    Args:
      params: a parameter dictionary that includes `min_level`, `max_level`,
        `batch_size`, and `num_classes` (plus the anchor settings and
        `image_size` read below).
      cls_outputs: an OrderDict with keys representing levels and values
        representing logits in [batch_size, height, width, num_anchors].
      box_outputs: an OrderDict with keys representing levels and values
        representing box regression targets in [batch_size, height, width,
        num_anchors * 4].
      scales: a list of float values indicating image scale.
      min_score_thresh: A float representing the threshold for deciding when
        to remove boxes based on score.
      max_boxes_to_draw: Max number of boxes to draw.

    Returns:
      detections_batch: a batch of detection results. Each detection is a
        tensor with each row as [image_id, ymin, xmin, ymax, xmax, score,
        class].
    """
    batch_size = params['batch_size']
    if not batch_size:
        # Use combined version for dynamic batch size.
        return det_post_process_combined(params, cls_outputs, box_outputs,
                                         scales, min_score_thresh,
                                         max_boxes_to_draw)

    # TODO(tanmingxing): refactor the code to make it more explicit.
    # Placeholder entries; add_metric_fn_inputs overwrites all four keys.
    outputs = {
        key: [None]
        for key in ('cls_outputs_all', 'box_outputs_all', 'indices_all',
                    'classes_all')
    }
    # -1 disables top-k pruning, so every anchor is kept.
    det_model_fn.add_metric_fn_inputs(params, cls_outputs, box_outputs,
                                      outputs, -1)

    # Create anchor_label for picking top-k predictions.
    eval_anchors = anchors.Anchors(params['min_level'], params['max_level'],
                                   params['num_scales'],
                                   params['aspect_ratios'],
                                   params['anchor_scale'],
                                   params['image_size'])
    anchor_labeler = anchors.AnchorLabeler(eval_anchors, params['num_classes'])

    # Stacking needs equally sized rows, which only matters for real batches.
    needs_padding = batch_size > 1

    # Add all detections for each input image.
    detections_batch = []
    for index in range(batch_size):
        detections = anchor_labeler.generate_detections(
            outputs['cls_outputs_all'][index],
            outputs['box_outputs_all'][index],
            outputs['indices_all'][index],
            outputs['classes_all'][index],
            image_id=[index],
            image_scale=[scales[index]],
            image_size=params['image_size'],
            min_score_thresh=min_score_thresh,
            max_boxes_to_draw=max_boxes_to_draw,
            disable_pyfun=params.get('disable_pyfun'))
        if needs_padding:
            # pad to fixed length if batch size > 1.
            missing_rows = max_boxes_to_draw - tf.shape(detections)[0]
            detections = tf.pad(detections, [[0, missing_rows], [0, 0]])
        detections_batch.append(detections)
    return tf.stack(detections_batch, name='detections')
Exemplo n.º 18
0
def _stitch(features):
  """Stitch features on the first dimension.

  Joins the per-step pieces of every sequence into one flat sequence: the
  step-wise 'task' tokens are concatenated (with a randomly chosen connector
  inserted after every step that is followed by another non-empty step), the
  '*_refs' spans are shifted so they index into the flattened task, and an
  eos entry is appended to 'task', 'verbs' and 'verb_refs'.

  Token ids greater than 1 are treated as real tokens throughout; id 1 is the
  eos value appended below. NOTE(review): id 0 is presumably padding - confirm
  against the vocabulary.

  Side effect: `features` is modified in place ('task' is rewritten and the
  squeezed tensors are stored back under their keys).

  Args:
    features: dict of tensors. Must contain 'task', 'verbs' and 'verb_refs';
      'raw_task' is optional. NOTE(review): 'task' is assumed to be
      [num_sequences, num_steps, tokens_per_step] - confirm against caller.

  Returns:
    A new dict, `stitched_features`, holding the stitched tensors.
  """
  # True where a task token is a real token (id > 1).
  full_mask = tf.greater(features['task'], 1)
  # True for every step that contains at least one real token.
  step_mask = tf.reduce_any(full_mask, axis=-1)
  # Shift the step mask left by one: entry i says whether step i+1 is
  # non-empty, i.e. whether step i needs a connector after it.
  step_mask_exclude_last = tf.pad(step_mask,
                                  [[0, 0], [0, 1]],
                                  constant_values=False)[:, 1:]
  num_sequences = common_layers.shape_list(features['task'])[0]
  num_steps = common_layers.shape_list(features['task'])[1]
  connectors = tf.constant(PADDED_CONCATENATORS)
  # Select connectors
  # One random connector index is drawn per (sequence, step) slot.
  connector_indices = tf.random.uniform(
      [num_sequences * num_steps], minval=0,
      maxval=len(PADDED_CONCATENATORS), dtype=tf.int32)
  selected_connectors = tf.reshape(
      tf.gather(connectors, connector_indices),
      [num_sequences, num_steps, len(PADDED_CONCATENATORS[0])])
  # Zero out connectors for steps that are not followed by another step.
  selected_connectors = tf.multiply(
      selected_connectors,
      tf.expand_dims(tf.to_int32(step_mask_exclude_last), 2),
      name='connector_mask')
  # Append each step's connector tokens after that step's task tokens.
  features['task'] = tf.concat([features['task'], selected_connectors], axis=-1)
  # Exclusive running sum of real-token counts per step: the position at which
  # each step starts once padding is removed and the steps are concatenated.
  ref_offsets = tf.expand_dims(
      tf.cumsum(tf.reduce_sum(tf.to_int32(tf.greater(features['task'], 1)), -1),
                exclusive=True, axis=-1), 2)
  # Flatten the step axis, then compact the real tokens to the left of each
  # row by scattering them through a sparse tensor.
  features['task'] = tf.reshape(features['task'], [num_sequences, -1])
  full_mask = tf.greater(features['task'], 1)
  full_mask_int = tf.to_int32(full_mask)
  indices = tf.where(tf.sequence_mask(lengths=tf.reduce_sum(full_mask_int, -1)))
  values = tf.boolean_mask(tf.reshape(features['task'], [-1]),
                           tf.reshape(full_mask, [-1]))
  sparse_task = tf.sparse.SparseTensor(
      indices=indices, values=values,
      dense_shape=tf.to_int64(tf.shape(features['task'])))
  # Stitch task and raw_task
  stitched_features = {}
  stitched_features['task'] = tf.sparse_tensor_to_dense(sparse_task)
  # Trim trailing columns beyond the longest stitched row.
  max_len = tf.reduce_max(
      tf.reduce_sum(tf.to_int32(tf.greater(stitched_features['task'], 1)), -1))
  stitched_features['task'] = stitched_features['task'][:, :max_len]
  if 'raw_task' in features:
    # Same connector choice as above, but in string form; steps without a
    # following step get the empty string.
    connector_strs = tf.reshape(
        tf.gather(tf.constant(CONCATENATORS_STR), connector_indices),
        [num_sequences, num_steps])
    masked_connector_strs = tf.where(
        step_mask_exclude_last,
        connector_strs, tf.fill(tf.shape(connector_strs), ''))
    # Join each step with its connector, then join all steps of a sequence.
    stitched_features['raw_task'] = tf.strings.reduce_join(
        tf.strings.reduce_join(tf.concat([
            tf.expand_dims(features['raw_task'], 2),
            tf.expand_dims(masked_connector_strs, 2)], axis=2), axis=-1), -1)
  # Stitch screen sequences
  # A step counts as an action when its verb reference span is non-empty
  # (end index > start index).
  action_lengths = tf.reduce_sum(tf.to_int32(
      tf.greater(features['verb_refs'][:, :, 0, 1],
                 features['verb_refs'][:, :, 0, 0])), -1)
  max_action_length = tf.reduce_max(action_lengths)
  def _pad(tensor, padding_value=0):
    # Truncates axis 1 to max_action_length, then appends one extra slot
    # (filled with padding_value) that later holds the eos entry.
    shape_list = common_layers.shape_list(tensor)
    assert len(shape_list) >= 2
    padding_list = [[0, 0], [0, 1]] + [[0, 0]] * (len(shape_list) - 2)
    return tf.pad(tensor[:, :max_action_length],
                  padding_list, constant_values=padding_value)
  for key in features.keys():
    if key.endswith('_refs'):
      # Reference spans: drop axis 2, shift by the step offset so they index
      # into the stitched task, and zero out empty spans (start == end).
      features[key] = tf.squeeze(features[key], 2)
      ref_mask = tf.expand_dims(tf.to_int32(
          tf.not_equal(features[key][:, :, 0],
                       features[key][:, :, 1])), 2)
      stitched_features[key] = tf.multiply(
          (features[key] + ref_offsets), ref_mask, name='ref_mask')
      stitched_features[key] = _pad(stitched_features[key])
    # The '*_refs' names listed below are already handled by the branch above.
    elif key in ['verbs', 'objects', 'consumed', 'obj_dom_pos',
                 'obj_text', 'obj_type', 'obj_clickable', 'obj_screen_pos',
                 'verb_refs', 'obj_refs', 'input_refs', 'obj_dom_dist']:
      # Per-step features: drop axis 2 and pad; 'obj_type' pads with -1.
      features[key] = tf.squeeze(features[key], 2)
      stitched_features[key] = features[key]
      stitched_features[key] = _pad(
          stitched_features[key],
          padding_value=-1 if key == 'obj_type' else 0)
    elif key not in ['task', 'raw_task']:
      # Any other feature: keep only the entry at index 0 of axis 1.
      stitched_features[key] = features[key][:, 0]
  # Append eos to 'task'
  stitched_features['task'] = tf.pad(stitched_features['task'],
                                     [[0, 0], [0, 1]])
  task_mask = tf.to_int32(tf.greater(stitched_features['task'], 1))
  # The mask shifted right by one minus the mask is 1 exactly at the first
  # position after the last real token, so adding it writes eos (1) there.
  task_eos_mask = tf.pad(task_mask, [[0, 0], [1, 0]], constant_values=1)[:, :-1]
  stitched_features['task'] = stitched_features['task'] + (
      task_eos_mask - task_mask)
  # Append eos
  # Same shifted-mask trick, applied to 'verbs'.
  verb_mask = tf.to_int32(tf.greater(stitched_features['verbs'], 1))
  verb_eos_mask = tf.pad(verb_mask, [[0, 0], [1, 0]], constant_values=1)[:, :-1]
  verb_eos = verb_eos_mask - verb_mask
  stitched_features['verbs'] = stitched_features['verbs'] + verb_eos
  # Append last step refs to 'verb_refs'
  # Column of the eos token in each task row; the eos reference span is
  # [eos_column, eos_column + 1].
  task_lengths = tf.where(tf.equal(stitched_features['task'], 1))[:, 1]
  eos_pos = tf.to_int32(tf.stack([task_lengths, task_lengths + 1], axis=1))
  action_mask = tf.to_int32(
      tf.sequence_mask(action_lengths, max_action_length + 1))
  action_and_eos_mask = tf.pad(action_mask, [[0, 0], [1, 0]],
                               constant_values=1)[:, :-1]
  # 1 only at the slot right after the last action of each sequence.
  verb_ref_eos = action_and_eos_mask - action_mask
  eos_refs = tf.multiply(
      tf.tile(tf.expand_dims(eos_pos, 1), [1, max_action_length + 1, 1]),
      tf.expand_dims(verb_ref_eos, 2), name='verb_ref_eos')
  stitched_features['verb_refs'] += eos_refs
  return stitched_features
Exemplo n.º 19
0
def add_input_distortions(flip_left_right, random_crop, random_scale,
                          random_brightness):
    """Builds the graph ops that apply random training-time distortions.

    During training it can help to run the images through simple distortions
    like crops, scales, and flips, which reflect the kind of variation
    expected in real data. The pipeline built here is:

    1. Decode the JPEG and resize it (bilinear) to
       `margin * resize` times the model input size, where
       `margin = 1 + random_crop / 100` and `resize` is drawn uniformly from
       `[1, 1 + random_scale / 100]`.
    2. Take a randomly positioned crop of exactly the model input size. With
       both percentages at zero the resized image already has that size, so
       nothing is cropped away.
    3. Optionally mirror the crop horizontally at random.
    4. Multiply the pixel values by a factor drawn uniformly from
       `[1 - random_brightness / 100, 1 + random_brightness / 100]`.

    Args:
      flip_left_right: Boolean whether to randomly mirror images horizontally.
      random_crop: Integer percentage setting the total margin used around
        the crop box.
      random_scale: Integer percentage of how much to vary the scale by.
      random_brightness: Integer range to randomly multiply the pixel values
        by.

    Returns:
      The jpeg input layer and the distorted result tensor.
    """
    # Decode to a float image with a leading batch axis for the resize op.
    jpeg_data = tf.placeholder(tf.string, name='DistortJPGInput')
    image_decoded = tf.image.decode_jpeg(jpeg_data, channels=MODEL_INPUT_DEPTH)
    image_float = tf.cast(image_decoded, dtype=tf.float32)
    image_batched = tf.expand_dims(image_float, 0)

    # Fixed crop margin times a random zoom gives the pre-crop scale.
    margin_scale = 1.0 + (random_crop / 100.0)
    resize_scale = 1.0 + (random_scale / 100.0)
    margin_factor = tf.constant(margin_scale)
    zoom_factor = tf.random_uniform(tensor_shape.scalar(),
                                    minval=1.0,
                                    maxval=resize_scale)
    total_scale = tf.multiply(margin_factor, zoom_factor)
    scaled_width = tf.multiply(total_scale, MODEL_INPUT_WIDTH)
    scaled_height = tf.multiply(total_scale, MODEL_INPUT_HEIGHT)
    scaled_size = tf.cast(tf.stack([scaled_height, scaled_width]),
                          dtype=tf.int32)
    enlarged_batched = tf.image.resize_bilinear(image_batched, scaled_size)
    enlarged = tf.squeeze(enlarged_batched, squeeze_dims=[0])

    # Randomly placed window of exactly the model's input size.
    cropped = tf.random_crop(
        enlarged,
        [MODEL_INPUT_HEIGHT, MODEL_INPUT_WIDTH, MODEL_INPUT_DEPTH])
    maybe_flipped = (tf.image.random_flip_left_right(cropped)
                     if flip_left_right else cropped)

    # Symmetric multiplicative brightness jitter around 1.0.
    brightness_delta = random_brightness / 100.0
    brightness_factor = tf.random_uniform(tensor_shape.scalar(),
                                          minval=1.0 - brightness_delta,
                                          maxval=1.0 + brightness_delta)
    brightened = tf.multiply(maybe_flipped, brightness_factor)
    distort_result = tf.expand_dims(brightened, 0, name='DistortResult')
    return jpeg_data, distort_result
Exemplo n.º 20
0
    def __init__(self,
                 session,
                 player_id,
                 state_representation_size,
                 num_actions,
                 hidden_layers_sizes=128,
                 replay_buffer_capacity=10000,
                 batch_size=128,
                 replay_buffer_class=ReplayBuffer,
                 learning_rate=0.01,
                 update_target_network_every=1000,
                 learn_every=10,
                 discount_factor=1.0,
                 min_buffer_size_to_learn=1000,
                 epsilon_start=1.0,
                 epsilon_end=0.1,
                 epsilon_decay_duration=int(1e6),
                 optimizer_str="sgd",
                 loss_str="mse"):
        """Initialize the DQN agent.

        Stores the hyperparameters, creates the replay buffer, and builds the
        TensorFlow graph: input placeholders, the online and target Q-networks,
        the TD loss and the training op.

        Args:
          session: stored as `self._session`.
          player_id: stored as `self.player_id`.
          state_representation_size: width of the info-state placeholders.
          num_actions: number of actions; also the size of the output layer.
          hidden_layers_sizes: an int (single hidden layer) or a list of ints.
          replay_buffer_capacity: int passed to `replay_buffer_class`.
          batch_size: stored as `self._batch_size`.
          replay_buffer_class: callable building the replay buffer from its
            capacity.
          learning_rate: learning rate given to the optimizer.
          update_target_network_every: stored for use elsewhere in the class.
          learn_every: stored for use elsewhere in the class.
          discount_factor: multiplier on the bootstrapped next-state value.
          min_buffer_size_to_learn: stored for use elsewhere in the class.
          epsilon_start: stored for use elsewhere in the class.
          epsilon_end: stored for use elsewhere in the class.
          epsilon_decay_duration: stored for use elsewhere in the class.
          optimizer_str: "adam" or "sgd".
          loss_str: "mse" or "huber".

        Raises:
          ValueError: if `replay_buffer_capacity` is not an int, or if
            `loss_str` / `optimizer_str` is not one of the supported names.
        """

        # This call to locals() is used to store every argument used to
        # initialize the class instance, so it can be copied with no
        # hyperparameter change. It has to run before any other local name
        # is bound.
        self._kwargs = locals()

        self.player_id = player_id
        self._session = session
        self._num_actions = num_actions
        if isinstance(hidden_layers_sizes, int):
            hidden_layers_sizes = [hidden_layers_sizes]
        # The network ends in one output unit per action.
        self._layer_sizes = hidden_layers_sizes + [num_actions]
        self._batch_size = batch_size
        self._update_target_network_every = update_target_network_every
        self._learn_every = learn_every
        self._min_buffer_size_to_learn = min_buffer_size_to_learn
        self._discount_factor = discount_factor

        self._epsilon_start = epsilon_start
        self._epsilon_end = epsilon_end
        self._epsilon_decay_duration = epsilon_decay_duration

        # TODO(author6) Allow for optional replay buffer config.
        if not isinstance(replay_buffer_capacity, int):
            raise ValueError("Replay buffer capacity not an integer.")
        self._replay_buffer = replay_buffer_class(replay_buffer_capacity)
        self._prev_timestep = None
        self._prev_action = None

        # Step counter to keep track of learning, eps decay and target network.
        self._step_counter = 0

        # Keep track of the last training loss achieved in an update step.
        self._last_loss_value = None

        # Create required TensorFlow placeholders to perform the Q-network
        # updates.
        def _placeholder(name, shape, dtype=tf.float32):
            return tf.placeholder(shape=shape, dtype=dtype, name=name)

        state_shape = [None, state_representation_size]
        self._info_state_ph = _placeholder("info_state_ph", state_shape)
        self._action_ph = _placeholder("action_ph", [None], dtype=tf.int32)
        self._reward_ph = _placeholder("reward_ph", [None])
        self._is_final_step_ph = _placeholder("is_final_step_ph", [None])
        self._next_info_state_ph = _placeholder(
            "next_info_state_ph", [None, state_representation_size])
        self._legal_actions_mask_ph = _placeholder(
            "legal_actions_mask_ph", [None, num_actions])

        # Online network scores the current state, target network the next.
        self._q_network = snt.nets.MLP(output_sizes=self._layer_sizes)
        self._q_values = self._q_network(self._info_state_ph)
        self._target_q_network = snt.nets.MLP(output_sizes=self._layer_sizes)
        self._target_q_values = self._target_q_network(
            self._next_info_state_ph)

        # Stop gradient to prevent updates to the target network while learning
        self._target_q_values = tf.stop_gradient(self._target_q_values)

        self._update_target_network = self._create_target_network_update_op(
            self._q_network, self._target_q_network)

        # Create the loss operations.
        # Sum a large negative constant to illegal action logits before taking
        # the max. This prevents illegal action values from being considered
        # as target.
        illegal_mask = 1 - self._legal_actions_mask_ph
        illegal_penalty = illegal_mask * ILLEGAL_ACTION_LOGITS_PENALTY
        frozen_target_q = tf.stop_gradient(self._target_q_values)
        penalized_next_q = tf.math.add(frozen_target_q, illegal_penalty)
        max_next_q = tf.reduce_max(penalized_next_q, axis=-1)

        # One-step TD target; the bootstrap term vanishes on final steps.
        not_final = 1 - self._is_final_step_ph
        bootstrap = not_final * self._discount_factor * max_next_q
        target = self._reward_ph + bootstrap

        # Pick Q(s, a) for the action actually taken in each batch row.
        row_ids = tf.range(tf.shape(self._q_values)[0])
        action_indices = tf.stack([row_ids, self._action_ph], axis=-1)
        predictions = tf.gather_nd(self._q_values, action_indices)

        if loss_str == "mse":
            loss_fn = tf.losses.mean_squared_error
        elif loss_str == "huber":
            loss_fn = tf.losses.huber_loss
        else:
            raise ValueError("Not implemented, choose from 'mse', 'huber'.")

        per_batch_loss = loss_fn(labels=target, predictions=predictions)
        self._loss = tf.reduce_mean(per_batch_loss)

        if optimizer_str == "adam":
            optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
        elif optimizer_str == "sgd":
            optimizer = tf.train.GradientDescentOptimizer(
                learning_rate=learning_rate)
        else:
            raise ValueError("Not implemented, choose from 'adam' and 'sgd'.")
        self._optimizer = optimizer

        self._learn_step = self._optimizer.minimize(self._loss)
        self._initialize()
Exemplo n.º 21
0
def _iou_per_anchor(pred_boxes: FloatType,
                    target_boxes: FloatType,
                    iou_type: Text = 'iou') -> tf.Tensor:
    """Computes the IoU (or a GIoU/DIoU/CIoU variant) for a single anchor.

    All arithmetic is element-wise, so every coordinate may be a scalar or a
    tensor holding that coordinate for many boxes; the result has the same
    shape as one coordinate.

    Args:
      pred_boxes: predicted boxes, with coordinate [y_min, x_min, y_max,
        x_max].
      target_boxes: target boxes, with coordinate [y_min, x_min, y_max,
        x_max].
      iou_type: one of ['iou', 'ciou', 'diou', 'giou'].

    Returns:
      A float `Tensor` with the requested IoU value (not a loss).

    Raises:
      ValueError: if `iou_type` is not one of the supported names.
    """
    # Validate with a real exception: an `assert` is stripped under `-O`, and
    # an unknown type would then silently be computed as 'ciou'.
    if iou_type not in ('iou', 'giou', 'diou', 'ciou'):
        raise ValueError(
            "Unknown iou_type %r, choose from 'iou', 'giou', 'diou', 'ciou'."
            % (iou_type,))

    # t_ denotes target boxes and p_ denotes predicted boxes.
    t_ymin, t_xmin, t_ymax, t_xmax = target_boxes
    p_ymin, p_xmin, p_ymax, p_xmax = pred_boxes

    # Clamp sizes at zero so degenerate (inverted) boxes get zero area.
    zero = tf.convert_to_tensor(0.0, t_ymin.dtype)
    p_width = tf.maximum(zero, p_xmax - p_xmin)
    p_height = tf.maximum(zero, p_ymax - p_ymin)
    t_width = tf.maximum(zero, t_xmax - t_xmin)
    t_height = tf.maximum(zero, t_ymax - t_ymin)
    p_area = p_width * p_height
    t_area = t_width * t_height

    intersect_ymin = tf.maximum(p_ymin, t_ymin)
    intersect_xmin = tf.maximum(p_xmin, t_xmin)
    intersect_ymax = tf.minimum(p_ymax, t_ymax)
    intersect_xmax = tf.minimum(p_xmax, t_xmax)
    intersect_width = tf.maximum(zero, intersect_xmax - intersect_xmin)
    intersect_height = tf.maximum(zero, intersect_ymax - intersect_ymin)
    intersect_area = intersect_width * intersect_height

    union_area = p_area + t_area - intersect_area
    iou_v = tf.math.divide_no_nan(intersect_area, union_area)
    if iou_type == 'iou':
        return iou_v  # iou is the simplest form.

    # Smallest box enclosing both the prediction and the target.
    enclose_ymin = tf.minimum(p_ymin, t_ymin)
    enclose_xmin = tf.minimum(p_xmin, t_xmin)
    enclose_ymax = tf.maximum(p_ymax, t_ymax)
    enclose_xmax = tf.maximum(p_xmax, t_xmax)

    if iou_type == 'giou':  # giou is the generalized iou.
        enclose_width = tf.maximum(zero, enclose_xmax - enclose_xmin)
        enclose_height = tf.maximum(zero, enclose_ymax - enclose_ymin)
        enclose_area = enclose_width * enclose_height
        giou_v = iou_v - tf.math.divide_no_nan(
            (enclose_area - union_area), enclose_area)
        return giou_v

    # diou / ciou: penalize the squared distance between box centers,
    # normalized by the squared diagonal of the enclosing box. Both squares
    # are computed element-wise; a norm taken without an axis would collapse
    # every box in the tensor into one scalar.
    center_dy = (t_ymin + t_ymax) / 2 - (p_ymin + p_ymax) / 2
    center_dx = (t_xmin + t_xmax) / 2 - (p_xmin + p_xmax) / 2
    center_dist_sq = center_dy**2 + center_dx**2
    diag_sq = ((enclose_ymax - enclose_ymin)**2 +
               (enclose_xmax - enclose_xmin)**2)
    diou_v = iou_v - tf.math.divide_no_nan(center_dist_sq, diag_sq)
    if iou_type == 'diou':  # diou is the distance iou.
        return diou_v

    # ciou additionally penalizes aspect-ratio mismatch through v.
    v = _get_v(p_height, p_width, t_height, t_width)
    alpha = tf.math.divide_no_nan(v, ((1 - iou_v) + v))
    return diou_v - alpha * v  # the last one is ciou.
Exemplo n.º 22
0
def _using_motion_vector_with_distortion(depth,
                                         translation,
                                         rotation_angles,
                                         intrinsic_mat,
                                         distortion_coeff=0.0):
    """A helper for using_motion_vector. See docstring therein."""

    if translation.shape.ndims not in (2, 4):
        raise ValueError('\'translation\' should have rank 2 or 4, not %d' %
                         translation.shape.ndims)
    if translation.shape[-1] != 3:
        raise ValueError('translation\'s last dimension should be 3, not %d' %
                         translation.shape[1])
    if translation.shape.ndims == 2:
        translation = tf.expand_dims(tf.expand_dims(translation, 1), 1)

    _, height, width = tf.unstack(tf.shape(depth))
    grid = tf.squeeze(tf.stack(
        tf.meshgrid(tf.range(width), tf.range(height), (1, ))),
                      axis=3)  # 3 x height x width
    grid = tf.to_float(grid)
    intrinsic_mat_inv = tf.linalg.inv(intrinsic_mat)

    normalized_grid = tf.einsum('bij,jhw->bihw', intrinsic_mat_inv, grid)
    radii_squared = tf.reduce_sum(tf.square(normalized_grid[:, :2, :, :]),
                                  axis=1)

    undistortion_factor = quadratic_inverse_distortion_scale(
        distortion_coeff, radii_squared)
    undistortion_factor = tf.stack([
        undistortion_factor, undistortion_factor,
        tf.ones_like(undistortion_factor)
    ],
                                   axis=1)
    normalized_grid *= undistortion_factor

    rot_mat = transform_utils.matrix_from_angles(rotation_angles)
    # We have to treat separately the case of a per-image rotation vector and a
    # per-image rotation field, because the broadcasting capabilities of einsum
    # are limited.
    if rotation_angles.shape.ndims == 2:
        # The calculation here is identical to the one in inverse_warp above.
        # Howeverwe use einsum for better clarity. Under the hood, einsum performs
        # the reshaping and invocation of BatchMatMul, instead of doing it manually,
        # as in inverse_warp.
        pcoords = tf.einsum('bij,bjhw,bhw->bihw', rot_mat, normalized_grid,
                            depth)
    elif rotation_angles.shape.ndims == 4:
        # We push the H and W dimensions to the end, and transpose the rotation
        # matrix elements (as noted above).
        rot_mat = tf.transpose(rot_mat, [0, 3, 4, 1, 2])
        pcoords = tf.einsum('bijhw,bjhw,bhw->bihw', rot_mat, normalized_grid,
                            depth)

    pcoords += tf.transpose(translation, [0, 3, 1, 2])

    x, y, z = tf.unstack(pcoords, axis=1)
    x /= z
    y /= z
    scale = quadraric_distortion_scale(distortion_coeff,
                                       tf.square(x) + tf.square(y))
    x *= scale
    y *= scale

    pcoords = tf.einsum('bij,bjhw->bihw', intrinsic_mat,
                        tf.stack([x, y, tf.ones_like(x)], axis=1))
    x, y, _ = tf.unstack(pcoords, axis=1)

    return x, y, z
Exemplo n.º 23
0
def multilevel_crop_and_resize(features, boxes, output_size=7):
  """Crops and resizes boxes from a multilevel feature pyramid.

  Every box is assigned to a pyramid level based on its size, rescaled to that
  level's resolution, and then cropped and resized into an
  (output_size, output_size) patch taken from the corresponding feature map.

  Args:
    features: A dictionary mapping pyramid level to a feature tensor of shape
      [batch_size, height_l, width_l, num_filters].
    boxes: A 3-D Tensor of shape [batch_size, num_boxes, 4]. Each row is a box
      given as [y1, x1, y2, x2] in un-normalized coordinates.
    output_size: A scalar giving the output crop size.

  Returns:
    A 5-D tensor of feature crops with shape
    [batch_size, num_boxes, output_size, output_size, num_filters].
  """
  with tf.name_scope('multilevel_crop_and_resize'):
    pyramid_levels = list(features.keys())
    min_level = min(pyramid_levels)
    max_level = max(pyramid_levels)

    # Every level is padded to the spatial size of the `min_level` feature map
    # so that the whole pyramid can be stacked into a single tensor of shape
    # [batch_size, levels, height, width, num_filters].
    _, max_feature_height, max_feature_width, _ = (
        features[min_level].get_shape().as_list())
    padded_features = [
        tf.image.pad_to_bounding_box(
            features[level], 0, 0, max_feature_height, max_feature_width)
        for level in range(min_level, max_level + 1)
    ]
    features_all = tf.stack(padded_features, axis=1)

    # Choose a level per box from the square root of the box area.
    box_width = boxes[:, :, 3] - boxes[:, :, 1]
    box_height = boxes[:, :, 2] - boxes[:, :, 0]
    areas_sqrt = tf.sqrt(box_height * box_width)
    unclipped_levels = tf.floordiv(
        tf.log(tf.div(areas_sqrt, 224.0)), tf.log(2.0)) + 4.0
    box_levels = tf.cast(unclipped_levels, dtype=tf.int32)
    # Clamp the assignment into [min_level, max_level].
    box_levels = tf.minimum(max_level, tf.maximum(box_levels, min_level))

    # Express box corners and sizes in the resolution of the assigned level.
    scale_to_level = tf.cast(
        tf.pow(tf.constant(2.0), tf.cast(box_levels, tf.float32)),
        dtype=boxes.dtype)
    scaled_boxes = boxes / tf.expand_dims(scale_to_level, axis=2)
    scaled_width = box_width / scale_to_level
    scaled_height = box_height / scale_to_level
    # Re-encode each box as [y1, x1, height, width].
    level_boxes = tf.concat(
        [
            scaled_boxes[:, :, 0:2],
            tf.expand_dims(scaled_height, -1),
            tf.expand_dims(scaled_width, -1),
        ],
        axis=-1)

    # Shift levels into [0, max_level - min_level] so they index features_all.
    level_indices = box_levels - min_level
    level_strides = tf.pow([[2.0]], tf.cast(level_indices, tf.float32))
    # Last valid (y, x) coordinate of the un-padded map at each box's level.
    height_limit = tf.expand_dims(
        [[tf.cast(max_feature_height, tf.float32)]] / level_strides - 1,
        axis=-1)
    width_limit = tf.expand_dims(
        [[tf.cast(max_feature_width, tf.float32)]] / level_strides - 1,
        axis=-1)
    boundary = tf.cast(
        tf.concat([height_limit, width_limit], axis=-1), level_boxes.dtype)

    return selective_crop_and_resize(
        features_all, level_boxes, level_indices, boundary, output_size)
Exemplo n.º 24
0
    def _add_seq2seq(self):
        """Add the whole sequence-to-sequence model to the graph.

        Builds, in order: the shared embedding lookups, the encoder, the
        reduced initial decoder state, the decoder, the output projection
        and then, depending on ``hps.mode``, either the loss (``train`` /
        ``eval``) or the top-k outputs (``decode``).

        Attributes set on ``self``:
            rand_unif_init, trunc_norm_init: variable initializers.
            _enc_states: the encoder outputs returned by ``_add_encoder``.
            _dec_in_state: result of ``_reduce_states`` on the two final
                encoder states.
            _dec_out_state, attn_dists, p_gens, coverage: values returned by
                ``_add_decoder``.
            _loss: set in ``train`` / ``eval`` mode.
            _coverage_loss, _total_loss: set in ``train`` / ``eval`` mode
                only when ``hps.coverage`` is truthy.
            _topk_ids, _topk_log_probs: set in ``decode`` mode.
        """
        hps = self._hps
        vsize = self._vocab.size()  # size of the vocabulary

        with tf.variable_scope('seq2seq'):
            # Some initializers
            self.rand_unif_init = tf.random_uniform_initializer(
                -hps.rand_unif_init_mag, hps.rand_unif_init_mag, seed=123)
            self.trunc_norm_init = tf.truncated_normal_initializer(
                stddev=hps.trunc_norm_init_std)

            # Add embedding matrix (shared by the encoder and decoder inputs)
            with tf.variable_scope('embedding'):
                embedding = tf.get_variable('embedding', [vsize, hps.emb_dim],
                                            dtype=tf.float32,
                                            initializer=self.trunc_norm_init)
                if hps.mode == "train":
                    self._add_emb_vis(embedding)  # add to tensorboard
                emb_enc_inputs = tf.nn.embedding_lookup(
                    embedding, self._enc_batch
                )  # tensor with shape (batch_size, max_enc_steps, emb_size)
                emb_dec_inputs = [
                    tf.nn.embedding_lookup(embedding, x)
                    for x in tf.unstack(self._dec_batch, axis=1)
                ]  # list length max_dec_steps containing shape (batch_size, emb_size)

            # Add the encoder.
            enc_outputs, fw_st, bw_st = self._add_encoder(
                emb_enc_inputs, self._enc_lens)
            self._enc_states = enc_outputs

            # Our encoder is bidirectional and our decoder is unidirectional so we need to reduce the final encoder hidden state to the right size to be the initial decoder hidden state
            self._dec_in_state = self._reduce_states(fw_st, bw_st)

            # Add the decoder.
            with tf.variable_scope('decoder'):
                decoder_outputs, self._dec_out_state, self.attn_dists, self.p_gens, self.coverage = self._add_decoder(
                    emb_dec_inputs)

            # Add the output projection to obtain the vocabulary distribution
            with tf.variable_scope('output_projection'):
                w = tf.get_variable('w', [hps.hidden_dim, vsize],
                                    dtype=tf.float32,
                                    initializer=self.trunc_norm_init)
                # NOTE(review): w_t is not referenced again in this method.
                w_t = tf.transpose(w)
                v = tf.get_variable('v', [vsize],
                                    dtype=tf.float32,
                                    initializer=self.trunc_norm_init)
                vocab_scores = [
                ]  # vocab_scores is the vocabulary distribution before applying softmax. Each entry on the list corresponds to one decoder step
                for i, output in enumerate(decoder_outputs):
                    if i > 0:
                        tf.get_variable_scope().reuse_variables()
                    vocab_scores.append(tf.nn.xw_plus_b(
                        output, w, v))  # apply the linear layer

                vocab_dists = [
                    tf.nn.softmax(s) for s in vocab_scores
                ]  # The vocabulary distributions. List length max_dec_steps of (batch_size, vsize) arrays. The words are in the order they appear in the vocabulary file.

            # For pointer-generator model, calc final distribution from copy distribution and vocabulary distribution
            if FLAGS.pointer_gen:
                final_dists = self._calc_final_dist(vocab_dists,
                                                    self.attn_dists)
            else:  # final distribution is just vocabulary distribution
                final_dists = vocab_dists

            if hps.mode in ['train', 'eval']:
                # Calculate the loss
                with tf.variable_scope('loss'):
                    if FLAGS.pointer_gen:
                        # Calculate the loss per step
                        # This is fiddly; we use tf.gather_nd to pick out the probabilities of the gold target words
                        loss_per_step = [
                        ]  # will be list length max_dec_steps containing shape (batch_size)
                        batch_nums = tf.range(
                            0, limit=hps.batch_size)  # shape (batch_size)
                        for dec_step, dist in enumerate(final_dists):
                            targets = self._target_batch[:,
                                                         dec_step]  # The indices of the target words. shape (batch_size)
                            indices = tf.stack((batch_nums, targets),
                                               axis=1)  # shape (batch_size, 2)
                            gold_probs = tf.gather_nd(
                                dist, indices
                            )  # shape (batch_size). prob of correct words on this step
                            # Negative log-likelihood of the gold word at this step.
                            losses = -tf.log(gold_probs)
                            loss_per_step.append(losses)

                        # Apply dec_padding_mask and get loss
                        self._loss = _mask_and_avg(loss_per_step,
                                                   self._dec_padding_mask)

                    else:  # baseline model
                        self._loss = tf.contrib.seq2seq.sequence_loss(
                            tf.stack(vocab_scores, axis=1), self._target_batch,
                            self._dec_padding_mask
                        )  # this applies softmax internally

                    tf.summary.scalar('loss', self._loss)

                    # Calculate coverage loss from the attention distributions
                    # NOTE(review): self._total_loss is only assigned inside this
                    # branch, i.e. when hps.coverage is truthy.
                    if hps.coverage:
                        with tf.variable_scope('coverage_loss'):
                            self._coverage_loss = _coverage_loss(
                                self.attn_dists, self._dec_padding_mask)
                            tf.summary.scalar('coverage_loss',
                                              self._coverage_loss)
                        self._total_loss = self._loss + hps.cov_loss_wt * self._coverage_loss
                        tf.summary.scalar('total_loss', self._total_loss)

        # The decode-mode ops below are created outside the 'seq2seq' variable scope.
        if hps.mode == "decode":
            # We run decode beam search mode one decoder step at a time
            assert len(
                final_dists
            ) == 1  # final_dists is a singleton list containing shape (batch_size, extended_vsize)
            final_dists = final_dists[0]
            topk_probs, self._topk_ids = tf.nn.top_k(
                final_dists, hps.batch_size * 2
            )  # take the k largest probs. note batch_size=beam_size in decode mode
            self._topk_log_probs = tf.log(topk_probs)
Exemplo n.º 25
0
def selective_crop_and_resize(features,
                              boxes,
                              box_levels,
                              boundaries,
                              output_size=7,
                              sample_offset=0.5):
  """Crops and resizes boxes on a set of feature maps.

  Given several feature maps indexed by level and a set of boxes that are each
  mapped to one level, this crops and resizes every box from its own feature
  map to produce the per-box features.

  The ROIAlign technique is followed (see https://arxiv.org/pdf/1703.06870.pdf,
  figure 3 for reference): for each box an (output_size, output_size) grid of
  sample points is laid over the box, and the feature value at every sample
  point is obtained by bilinear interpolation.

  For performance, the gather and the interpolation are done for all levels in
  a single operation. The step-by-step algorithm is:
    1. The stacked multi-level features are gathered into a
       [batch_size, num_boxes, output_size*2, output_size*2, num_filters]
       Tensor that holds the four neighboring feature points of every vertex
       of the output grid.
    2. An interpolation kernel of shape
       [batch_size, num_boxes, output_size*2, output_size*2] is computed. Its
       last two axes can be seen as the 2x2 interpolation kernels of all
       vertices of the output grid, stacked together.
    3. The gathered features are multiplied element-wise by the interpolation
       kernel, and a 2x2 average pooling reduces the spatial dimensions to
       output_size.

  Args:
    features: a 5-D tensor of shape
      [batch_size, num_levels, max_height, max_width, num_filters] from which
      the crops are taken.
    boxes: a 3-D tensor of shape [batch_size, num_boxes, 4] describing each box
      w.r.t. its feature map. boxes[:, :, 0:2] is the (y, x) grid position
      (float) of the top-left corner and boxes[:, :, 2:4] is the (h, w) box
      size (float), both in pixels of the corresponding feature map.
    box_levels: a tensor holding the 0-based feature level index of each box.
      It is reshaped to [batch_size, num_boxes, 1, 1] internally.
    boundaries: a 3-D tensor of shape [batch_size, num_boxes, 2] giving the
      (y, x) boundary of the feature map of each box. Sample points that fall
      beyond the boundary are clipped to it.
    output_size: a scalar indicating the output crop size.
    sample_offset: a float in [0, 1] giving the sub-pixel offset of the sample
      points from the grid points.

  Returns:
    features_per_box: a 5-D tensor of shape
      [batch_size, num_boxes, output_size, output_size, num_filters]
      holding the cropped features.
  """
  (batch_size, num_levels, max_feature_height, max_feature_width,
   num_filters) = features.get_shape().as_list()
  _, num_boxes, _ = boxes.get_shape().as_list()
  # Every output vertex contributes two neighbors along each spatial axis.
  neighbor_size = output_size * 2

  # Sample positions of the output grid w.r.t. the box's feature map.
  box_top = boxes[:, :, 0]
  box_left = boxes[:, :, 1]
  box_height = boxes[:, :, 2]
  box_width = boxes[:, :, 3]
  grid_columns = []
  grid_rows = []
  for step in range(output_size):
    position = step + sample_offset
    grid_columns.append(box_left + position * box_width / output_size)
    grid_rows.append(box_top + position * box_height / output_size)
  grid_x = tf.stack(grid_columns, axis=2)
  grid_y = tf.stack(grid_rows, axis=2)

  # Lower neighbor of every sample point, kept non-negative.
  floor_y = tf.floor(grid_y)
  floor_x = tf.floor(grid_x)
  x_low = tf.maximum(0., floor_x)
  y_low = tf.maximum(0., floor_y)

  # Pair each lower neighbor with the next one, both clipped to the boundary.
  x_limit = tf.expand_dims(boundaries[:, :, 1], -1)
  x_pairs = tf.stack(
      [tf.minimum(x_low, x_limit), tf.minimum(x_low + 1, x_limit)], axis=3)
  y_limit = tf.expand_dims(boundaries[:, :, 0], -1)
  y_pairs = tf.stack(
      [tf.minimum(y_low, y_limit), tf.minimum(y_low + 1, y_limit)], axis=3)

  x_indices = tf.cast(
      tf.reshape(x_pairs, [batch_size, num_boxes, neighbor_size]),
      dtype=tf.int32)
  y_indices = tf.cast(
      tf.reshape(y_pairs, [batch_size, num_boxes, neighbor_size]),
      dtype=tf.int32)

  # Flat index into `features` reshaped to [-1, num_filters]:
  # batch * batch_stride + level * level_stride + y * row_stride + x.
  row_stride = max_feature_width
  level_stride = max_feature_height * row_stride
  batch_stride = num_levels * level_stride
  batch_term = tf.tile(
      tf.reshape(tf.range(batch_size) * batch_stride, [batch_size, 1, 1, 1]),
      [1, num_boxes, neighbor_size, neighbor_size])
  level_term = tf.tile(
      tf.reshape(box_levels * level_stride, [batch_size, num_boxes, 1, 1]),
      [1, 1, neighbor_size, neighbor_size])
  row_term = tf.tile(
      tf.reshape(y_indices * row_stride,
                 [batch_size, num_boxes, neighbor_size, 1]),
      [1, 1, 1, neighbor_size])
  column_term = tf.tile(
      tf.reshape(x_indices, [batch_size, num_boxes, 1, neighbor_size]),
      [1, 1, neighbor_size, 1])
  flat_indices = tf.reshape(
      batch_term + level_term + row_term + column_term, [-1])

  flat_features = tf.reshape(features, [-1, num_filters])
  gathered = tf.reshape(
      tf.gather(flat_features, flat_indices),
      [batch_size, num_boxes, neighbor_size, neighbor_size, num_filters])

  # Bilinear interpolation of the four neighbors f00, f01, f10 and f11:
  # f(y, x) = [hy, ly] * [[f00, f01], * [hx, lx]^T
  #                       [f10, f11]]
  # f(y, x) = (hy*hx)f00 + (hy*lx)f01 + (ly*hx)f10 + (ly*lx)f11
  ly = grid_y - y_low
  lx = grid_x - x_low
  hy = 1.0 - ly
  hx = 1.0 - lx
  kernel_x = tf.reshape(tf.stack([hx, lx], axis=3),
                        [batch_size, num_boxes, 1, neighbor_size])
  kernel_y = tf.reshape(tf.stack([hy, ly], axis=3),
                        [batch_size, num_boxes, neighbor_size, 1])
  # Broadcasting yields the full kernel. The factor 4 cancels the division by
  # four done by the 2x2 average pooling below.
  interpolation_kernel = kernel_y * kernel_x * 4

  # Weight the gathered neighbors, then pool every 2x2 group into one value.
  weighted = gathered * tf.cast(
      tf.expand_dims(interpolation_kernel, axis=4), dtype=gathered.dtype)
  weighted = tf.reshape(
      weighted,
      [batch_size * num_boxes, neighbor_size, neighbor_size, num_filters])
  pooled = tf.nn.avg_pool(weighted, [1, 2, 2, 1], [1, 2, 2, 1], 'VALID')
  features_per_box = tf.reshape(
      pooled,
      [batch_size, num_boxes, output_size, output_size, num_filters])

  return features_per_box
Exemplo n.º 26
0
    def __init__(self, num_emb, batch_size, emb_dim, hidden_dim,
                 sequence_length, start_token, mid_layer,
                 learning_rate=0.005, l2_reg_lambda=0):
        """Build the graph: embeddings, recurrent unrolling, loss and update op.

        Args:
            num_emb: number of rows of the embedding matrix; also used as the
                one-hot depth for the tokens in `self.x`.
            batch_size: fixed batch size used for all placeholders.
            emb_dim: number of columns of the embedding matrix.
            hidden_dim: size of the recurrent hidden state.
            sequence_length: number of steps the recurrence is unrolled for.
            start_token: token id fed as the input of the first step.
            mid_layer: forwarded to `self.create_output_unit`.
            learning_rate: NOTE(review): not read in this method; the rate
                is fed at run time through the `self.learning_rate`
                placeholder instead.
            l2_reg_lambda: weight of the L2 penalty over `self.r_params`
                (the embedding matrix excluded).
        """
        self.num_emb = num_emb
        self.batch_size = batch_size
        self.emb_dim = emb_dim
        self.hidden_dim = hidden_dim
        self.sequence_length = sequence_length
        # self.learning_rate = tf.Variable(float(learning_rate), trainable=False)
        # Parameters that are differentiated and updated by `reward_updates`.
        self.r_params = []
        # Global-norm threshold used when clipping gradients below.
        self.grad_clip = 5.0
        self.mid_layer = mid_layer
        self.start_token = tf.constant([start_token] * self.batch_size, dtype=tf.int32)

        self.expected_reward = tf.Variable(tf.zeros([self.sequence_length]))

        with tf.variable_scope('generator'):
            self.r_embeddings = tf.Variable(self.init_matrix([self.num_emb, self.emb_dim]))
            self.r_params.append(self.r_embeddings)
            # presumably both factory calls below append their own variables
            # to self.r_params - TODO confirm against their definitions.
            self.r_recurrent_unit = self.create_recurrent_unit(self.r_params)  # maps h_tm1 to h_t for generator
            self.r_output_unit = self.create_output_unit(self.r_params, self.mid_layer)  # maps h_t to o_t (output token logits)

        # placeholder definition
        self.x = tf.placeholder(tf.int32, shape=[self.batch_size, self.sequence_length]) # sequence of tokens generated by generator
        self.weight = tf.placeholder(tf.float32, shape=[self.batch_size])
        self.temperature = tf.placeholder(tf.float32, name='temperature')
        self.learning_rate = tf.placeholder(tf.float32, name="lr")
        self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob")
        self.rewards = tf.placeholder(tf.float32, shape=[self.batch_size, self.sequence_length]) # get from rollout policy and discriminator

        # processed for batch
        # The first half of `weight` is softmax-normalized as is; the second
        # half is divided by the temperature, softmax-normalized and negated.
        self.pos_weight = tf.nn.softmax(self.weight[:self.batch_size//2])
        self.neg_weight = -1.0 * tf.nn.softmax(self.weight[self.batch_size//2:] / self.temperature)
        self.f_weight = tf.concat([self.pos_weight, self.neg_weight], axis=0)
        with tf.device("/cpu:0"):
            self.word = tf.nn.dropout(tf.nn.embedding_lookup(self.r_embeddings, self.x), self.dropout_keep_prob)
            self.processed_x = tf.transpose(self.word, perm=[1, 0, 2])  # seq_length x batch_size x emb_dim

        # Initial states
        self.h0 = tf.zeros([self.batch_size, self.hidden_dim])
        # Two stacked zero tensors; presumably (hidden state, cell state) of
        # the recurrent unit - TODO confirm in create_recurrent_unit.
        self.h0 = tf.stack([self.h0, self.h0])
        # self.avg_h0 = tf.zeros([self.batch_size, self.hidden_dim])

        # Collects, per step, the first component of the incoming state.
        gen_h = tensor_array_ops.TensorArray(dtype=tf.float32, size=self.sequence_length,
                                             dynamic_size=False, infer_shape=True)


        # supervised pretraining for generator
        r_predictions = tensor_array_ops.TensorArray(
            dtype=tf.float32, size=self.sequence_length,
            dynamic_size=False, infer_shape=True)

        # Per-step embedded inputs, read inside the loop body.
        ta_emb_x = tensor_array_ops.TensorArray(
            dtype=tf.float32, size=self.sequence_length)
        ta_emb_x = ta_emb_x.unstack(self.processed_x)

        def _pretrain_recurrence(i, x_t, h_tm1, r_predictions, gen_h):
            """One step: record the incoming state, advance, record the output."""
            gen_h = gen_h.write(i, tf.unstack(h_tm1)[0])
            h_t = self.r_recurrent_unit(x_t, h_tm1)
            o_t = self.r_output_unit(h_t)
            r_predictions = r_predictions.write(i, o_t)  # batch x vocab_size
            # The next input is the embedded token at position i of self.x.
            x_tp1 = ta_emb_x.read(i)
            return i + 1, x_tp1, h_t, r_predictions, gen_h

        # The first step is fed the embedded start token and the zero state.
        _, _, _, self.r_predictions, self.gen_h = control_flow_ops.while_loop(
            cond=lambda i, _1, _2, _3, _4: i < self.sequence_length,
            body=_pretrain_recurrence,
            loop_vars=(tf.constant(0, dtype=tf.int32),
                       tf.nn.embedding_lookup(self.r_embeddings, self.start_token),
                       self.h0, r_predictions, gen_h))

        self.r_predictions = tf.transpose(self.r_predictions.stack(), perm=[1, 0, 2])  # batch_size x seq_length x vocab_size

        # clip_reward & log_pred :  batch*seq  x vocab_size
        # The one-hot mask keeps only the prediction entry of the token that
        # is actually present in self.x; predictions are clipped to [1e-20, 1].
        self.clipped_reward = tf.one_hot(tf.to_int32(tf.reshape(self.x, [-1])), self.num_emb, 1.0, 0.0) * \
            tf.clip_by_value(tf.reshape(self.r_predictions, [-1, self.num_emb]), 1e-20, 1.0)
        self.reward_per_step_snt = tf.reshape(tf.reduce_sum(self.clipped_reward, -1), [self.batch_size, self.sequence_length])
        self.sent_reward = tf.reduce_sum(self.reward_per_step_snt, axis=1)
        # Weighted sentence rewards (negated) plus the L2 penalty on all
        # parameters except the embedding matrix.
        self.reward_loss = -tf.reduce_sum(self.sent_reward * self.f_weight) + \
                            l2_reg_lambda * (tf.add_n([tf.nn.l2_loss(var) for var in self.r_params if var not in [self.r_embeddings]]))

        reward_opt = self.optimizer(self.learning_rate)

        # Gradients are clipped by global norm before being applied.
        self.reward_grad, _ = tf.clip_by_global_norm(tf.gradients(self.reward_loss, self.r_params), self.grad_clip)
        self.reward_updates = reward_opt.apply_gradients(zip(self.reward_grad, self.r_params))
Exemplo n.º 27
0
 def sample(self):
     """Draw one sample per categorical and shift the stacked result by `self.low`."""
     offset = self.low
     draws = [categorical.sample() for categorical in self.categoricals]
     # Each categorical contributes one entry along the last axis.
     stacked_draws = tf.stack(draws, axis=-1)
     return offset + tf.cast(stacked_draws, tf.int32)
Exemplo n.º 28
0
def unstack_batch(tensor_dict, unpad_groundtruth_tensors=True):
  """Unstacks every tensor in `tensor_dict` along its 0th dimension.

  Each batched tensor is split along the batch dimension, so the returned
  dictionary maps every key to a list of per-example tensors.

  Tensors in `tensor_dict` are expected to have one of three shapes:
  1. [batch_size]
  2. [batch_size, height, width, channels]
  3. [batch_size, num_boxes, d1, d2, ... dn]

  When `unpad_groundtruth_tensors` is true, the unstacked tensors of form 3
  are additionally sliced along their `num_boxes` dimension down to the value
  stored under fields.InputDataFields.num_groundtruth_boxes.

  Note that this function carries a static list of input data fields which has
  to be kept in sync with the InputDataFields defined in
  core/standard_fields.py.

  Args:
    tensor_dict: A dictionary of batched groundtruth tensors.
    unpad_groundtruth_tensors: Whether to remove the padding along the
      `num_boxes` dimension of the groundtruth tensors.

  Returns:
    A dictionary whose keys are those of `tensor_dict` and whose values are
    lists of unstacked (optionally unpadded) tensors.

  Raises:
    ValueError: If `unpad_groundtruth_tensors` is True and `tensor_dict` does
      not contain the `num_groundtruth_boxes` tensor.
  """
  unbatched_tensor_dict = {}
  for key, tensor in tensor_dict.items():
    unbatched_tensor_dict[key] = tf.unstack(tensor)

  if not unpad_groundtruth_tensors:
    return unbatched_tensor_dict

  num_boxes_key = fields.InputDataFields.num_groundtruth_boxes
  if num_boxes_key not in unbatched_tensor_dict:
    raise ValueError('`num_groundtruth_boxes` not found in tensor_dict. '
                     'Keys available: {}'.format(
                         unbatched_tensor_dict.keys()))

  # Input data fields that are padded along the num_boxes dimension. This
  # collection has to be kept in sync with InputDataFields in
  # standard_fields.py.
  padded_fields = {
      fields.InputDataFields.groundtruth_instance_masks,
      fields.InputDataFields.groundtruth_classes,
      fields.InputDataFields.groundtruth_boxes,
      fields.InputDataFields.groundtruth_keypoints,
      fields.InputDataFields.groundtruth_keypoint_visibilities,
      fields.InputDataFields.groundtruth_group_of,
      fields.InputDataFields.groundtruth_difficult,
      fields.InputDataFields.groundtruth_is_crowd,
      fields.InputDataFields.groundtruth_area,
      fields.InputDataFields.groundtruth_weights,
  }
  # Only the padded fields that are actually present need unpadding.
  unpad_keys = padded_fields.intersection(set(unbatched_tensor_dict.keys()))

  box_counts = unbatched_tensor_dict[num_boxes_key]
  unpadded_by_key = {}
  for key in unpad_keys:
    trimmed_tensors = []
    for num_gt, padded_tensor in zip(box_counts, unbatched_tensor_dict[key]):
      tensor_shape = shape_utils.combined_static_and_dynamic_shape(
          padded_tensor)
      slice_begin = tf.zeros([len(tensor_shape)], dtype=tf.int32)
      # Keep the first `num_gt` rows and every trailing dimension in full.
      trailing_sizes = [
          -1 if dim is None else dim for dim in tensor_shape[1:]
      ]
      slice_size = tf.stack([num_gt] + trailing_sizes)
      trimmed_tensors.append(
          tf.slice(padded_tensor, slice_begin, slice_size))
    unpadded_by_key[key] = trimmed_tensors

  unbatched_tensor_dict.update(unpadded_by_key)
  return unbatched_tensor_dict
Exemplo n.º 29
0
    def metric_fn(**kwargs):
      """Builds the dictionary of evaluation metrics.

      Runs per-class NMS on the predictions (through a numpy function by
      default, or through `postprocess.per_class_nms` otherwise), feeds the
      resulting detections to the COCO metric and adds the mean class and box
      losses to the returned dictionary.
      """
      nms_configs = params['nms_configs']
      if nms_configs.get('pyfunc', True):

        def _nms_for_image(index):
          # Runs the numpy NMS implementation on a single image of the batch.
          return tf.numpy_function(
              functools.partial(nms_np.per_class_nms, nms_configs=nms_configs),
              [
                  kwargs['boxes'][index],
                  kwargs['scores'][index],
                  kwargs['classes'][index],
                  tf.slice(kwargs['image_ids'], [index], [1]),
                  tf.slice(kwargs['image_scales'], [index], [1]),
                  params['num_classes'],
                  nms_configs['max_output_size'],
              ], tf.float32)

        per_image_detections = [
            _nms_for_image(index)
            for index in range(kwargs['boxes'].shape[0])
        ]
        detections_bs = postprocess.transform_detections(
            tf.stack(per_image_detections))
      else:
        # These two branches should be equivalent, but currently they are not.
        # TODO(tanmingxing): enable the non_pyfun path after bug fix.
        nms_boxes, nms_scores, nms_classes, _ = postprocess.per_class_nms(
            params, kwargs['boxes'], kwargs['scores'], kwargs['classes'],
            kwargs['image_scales'])
        img_ids = tf.cast(
            tf.expand_dims(kwargs['image_ids'], -1), nms_scores.dtype)
        # One column per detection attribute; the box columns are taken in
        # the order index 1, index 0, then the two coordinate differences.
        detection_columns = [
            img_ids * tf.ones_like(nms_scores),
            nms_boxes[:, :, 1],
            nms_boxes[:, :, 0],
            nms_boxes[:, :, 3] - nms_boxes[:, :, 1],
            nms_boxes[:, :, 2] - nms_boxes[:, :, 0],
            nms_scores,
            nms_classes,
        ]
        detections_bs = tf.stack(
            detection_columns, axis=-1, name='detnections')

      if params.get('testdev_dir', None):
        logging.info('Eval testdev_dir %s', params['testdev_dir'])
        eval_metric = coco_metric.EvaluationMetric(
            testdev_dir=params['testdev_dir'])
        # A zero tensor is passed in place of groundtruth data on this path.
        coco_metrics = eval_metric.estimator_metric_fn(detections_bs,
                                                       tf.zeros([1]))
      else:
        logging.info('Eval val with groudtruths %s.', params['val_json_file'])
        eval_metric = coco_metric.EvaluationMetric(
            filename=params['val_json_file'], label_map=params['label_map'])
        coco_metrics = eval_metric.estimator_metric_fn(
            detections_bs, kwargs['groundtruth_data'])

      # Loss metrics first, then everything reported by the COCO metric.
      output_metrics = {
          'cls_loss': tf.metrics.mean(kwargs['cls_loss_repeat']),
          'box_loss': tf.metrics.mean(kwargs['box_loss_repeat']),
      }
      output_metrics.update(coco_metrics)
      return output_metrics
Exemplo n.º 30
0
    def _log_prob(self, data, num_samples=1):
        """Compute a lower bound on the log likelihood of `data`.

        Args:
            data: batch of observations; its leading dimension is taken as
                the batch size.
            num_samples: only forwarded to `self.proposal.log_prob` (when
                that method accepts a `num_samples` keyword). The
                energy-based part of the bound always uses one sample set.

        Returns:
            The proposal log-probability plus the log energy of the data,
            minus the estimated log normalizer.
        """
        # Due to memory issues the bound itself is computed with a single
        # sample set; the requested count is kept for the proposal only.
        proposal_num_samples = num_samples
        num_samples = 1
        batch_size = tf.shape(data)[0]
        num_proposals = self.K - 1

        # Sample from the proposal and compute the weights of the "unseen"
        # samples. These are shared across the batch dimension.
        proposal_samples = self.proposal.sample(num_samples * num_proposals)
        if not self.reparameterize_proposal_samples:
            proposal_samples = tf.stop_gradient(proposal_samples)

        # [num_samples, K - 1]
        flat_proposal_samples = tf.reshape(proposal_samples,
                                           [-1] + self.data_dim)
        log_energy_proposal = tf.reshape(
            self.energy_fn(flat_proposal_samples),
            [num_samples, num_proposals])
        tf.summary.histogram("log_energy_proposal", log_energy_proposal)
        tf.summary.scalar("min_log_energy_proposal",
                          tf.reduce_min(log_energy_proposal))
        tf.summary.scalar("max_log_energy_proposal",
                          tf.reduce_max(log_energy_proposal))

        # [num_samples] -> [batch_size, num_samples]
        proposal_lse = tf.reduce_logsumexp(log_energy_proposal, axis=1)
        tiled_proposal_lse = tf.tile(proposal_lse[tf.newaxis, :],
                                     [batch_size, 1])

        # Compute the weights of the observed data.
        # [batch_size]
        log_energy_data = tf.reshape(self.energy_fn(data), [batch_size])
        tf.summary.histogram("log_energy_data", log_energy_data)
        tf.summary.scalar("min_log_energy_data",
                          tf.reduce_min(log_energy_data))
        tf.summary.scalar("max_log_energy_data",
                          tf.reduce_max(log_energy_data))

        # [batch_size, num_samples]
        tiled_log_energy_data = tf.tile(log_energy_data[:, tf.newaxis],
                                        [1, num_samples])

        # Combine the proposal-sample weights with the true data weights and
        # average over the K terms (in log space).
        # [batch_size, num_samples]
        stacked_log_weights = tf.stack(
            [tiled_log_energy_data, tiled_proposal_lse], axis=-1)
        log_z_hat = tf.reduce_logsumexp(stacked_log_weights, axis=-1)
        log_z_hat -= tf.log(tf.to_float(self.K))
        # Perform the log-sum-exp reduction for IWAE.
        # [batch_size]
        log_z_hat = tf.reduce_logsumexp(log_z_hat, axis=1) - tf.log(
            tf.to_float(num_samples))

        try:
            # Try giving the proposal lower bound num_samples if it can use it.
            proposal_lp = self.proposal.log_prob(
                data, num_samples=proposal_num_samples)
        except TypeError:
            proposal_lp = self.proposal.log_prob(data)

        return proposal_lp + log_energy_data - log_z_hat