Пример #1
0
    def _interpolate(im, x, y, out_size):
        """Bilinearly sample `im` at the sampling coordinates `x`, `y`.

        Args:
            im: image batch, indexed here as (batch, height, width, channels).
            x: horizontal sampling coordinates; cast to float32. Assumed to be
                a flat vector with out_size[0] * out_size[1] entries per batch
                element -- TODO confirm against the caller.
            y: vertical sampling coordinates, same layout as `x`.
            out_size: pair `(out_height, out_width)`; only its product is used,
                as the per-image repeat count of the batch offsets.

        Returns:
            A float32 tensor with one row per sampling point and the image
            channels on the last axis: the weighted sum of the four pixels
            surrounding each sampling point.
        """
        with tf.variable_scope('_interpolate'):
            # Dynamic dimensions of the input batch.
            batch = tf.shape(im)[0]
            rows = tf.shape(im)[1]
            cols = tf.shape(im)[2]
            depth = tf.shape(im)[3]

            x = tf.cast(x, tf.float32)
            y = tf.cast(y, tf.float32)
            rows_f = tf.cast(rows, tf.float32)
            cols_f = tf.cast(cols, tf.float32)
            out_rows, out_cols = out_size[0], out_size[1]
            lowest = tf.zeros([], dtype=tf.int32)
            highest_y = tf.cast(tf.shape(im)[1] - 1, tf.int32)
            highest_x = tf.cast(tf.shape(im)[2] - 1, tf.int32)

            # Divide by the half extent, add one, multiply by the half extent
            # again. Algebraically this shifts the coordinates by half the
            # image width/height.
            x = (x / (cols_f / 2.0) + 1.0) * (cols_f / 2.0)
            y = (y / (rows_f / 2.0) + 1.0) * (rows_f / 2.0)

            # Integer pixel coordinates of the four neighbours.
            left = tf.cast(tf.floor(x), tf.int32)
            right = left + 1
            top = tf.cast(tf.floor(y), tf.int32)
            bottom = top + 1

            # Keep every neighbour inside the image.
            left = tf.clip_by_value(left, lowest, highest_x)
            right = tf.clip_by_value(right, lowest, highest_x)
            top = tf.clip_by_value(top, lowest, highest_y)
            bottom = tf.clip_by_value(bottom, lowest, highest_y)

            # Offsets into the image flattened to [batch * rows * cols, depth].
            row_stride = cols
            image_stride = cols * rows
            image_start = _repeat(
                tf.range(batch) * image_stride, out_rows * out_cols)
            top_start = image_start + top * row_stride
            bottom_start = image_start + bottom * row_stride
            idx_top_left = top_start + left
            idx_bottom_left = bottom_start + left
            idx_top_right = top_start + right
            idx_bottom_right = bottom_start + right

            # Look the four neighbours up in the flattened image.
            pixels = tf.reshape(im, tf.stack([-1, depth]))
            pixels = tf.cast(pixels, tf.float32)
            px_top_left = tf.gather(pixels, idx_top_left)
            px_bottom_left = tf.gather(pixels, idx_bottom_left)
            px_top_right = tf.gather(pixels, idx_top_right)
            px_bottom_right = tf.gather(pixels, idx_bottom_right)

            # Bilinear weights; note they are derived from the *clipped*
            # neighbour coordinates.
            left_f = tf.cast(left, tf.float32)
            right_f = tf.cast(right, tf.float32)
            top_f = tf.cast(top, tf.float32)
            bottom_f = tf.cast(bottom, tf.float32)
            w_top_left = tf.expand_dims((right_f - x) * (bottom_f - y), 1)
            w_bottom_left = tf.expand_dims((right_f - x) * (y - top_f), 1)
            w_top_right = tf.expand_dims((x - left_f) * (bottom_f - y), 1)
            w_bottom_right = tf.expand_dims((x - left_f) * (y - top_f), 1)
            return tf.add_n([
                w_top_left * px_top_left,
                w_bottom_left * px_bottom_left,
                w_top_right * px_top_right,
                w_bottom_right * px_bottom_right,
            ])
Пример #2
0
def add_contrastive_loss(hidden,
                         hidden_norm=True,
                         temperature=1.0,
                         tpu_context=None,
                         weights=1.0):
    """Build the contrastive loss over two stacked views of a batch.

  Args:
    hidden: hidden vector (`Tensor`) of shape (2 * bsz, dim); it is split in
      half along axis 0 into the two views.
    hidden_norm: if true, L2-normalize `hidden` along its last axis first.
    temperature: a `floating` number the similarity logits are divided by.
    tpu_context: context information for tpu. When not None, the two views are
      gathered across replicas and the labels are offset by the replica id.
    weights: a weighting number or vector forwarded to the cross-entropy loss.

  Returns:
    A tuple `(loss, logits_ab, labels)`:
    loss: the sum of the two directional cross-entropy losses.
    logits_ab: the first-view vs. second-view logits.
    labels: the one-hot labels of the contrastive prediction task.
  """

    def _similarity(lhs, rhs):
        # Temperature-scaled pairwise dot products between rows of lhs and rhs.
        return tf.matmul(lhs, rhs, transpose_b=True) / temperature

    if hidden_norm:
        hidden = tf.math.l2_normalize(hidden, -1)
    view_a, view_b = tf.split(hidden, 2, 0)
    batch_size = tf.shape(view_a)[0]

    if tpu_context is None:
        # Single replica: the candidates are just the local batch.
        view_a_all = view_a
        view_b_all = view_b
        labels = tf.one_hot(tf.range(batch_size), batch_size * 2)
        masks = tf.one_hot(tf.range(batch_size), batch_size)
    else:
        # Gather both views across replicas and build replica-local labels.
        view_a_all = tpu_cross_replica_concat(view_a, tpu_context)
        view_b_all = tpu_cross_replica_concat(view_b, tpu_context)
        global_batch_size = tf.shape(view_a_all)[0]
        # TODO(iamtingchen): more elegant way to convert u32 to s32 for replica_id.
        replica_id = tf.cast(tf.cast(xla.replica_id(), tf.uint32), tf.int32)
        labels_idx = tf.range(batch_size) + replica_id * batch_size
        labels = tf.one_hot(labels_idx, global_batch_size * 2)
        masks = tf.one_hot(labels_idx, global_batch_size)

    # Same-view logits: subtract LARGE_NUM at each example's own position so
    # it cannot be picked as its own match.
    logits_aa = _similarity(view_a, view_a_all)
    logits_aa = logits_aa - masks * LARGE_NUM
    logits_bb = _similarity(view_b, view_b_all)
    logits_bb = logits_bb - masks * LARGE_NUM
    # Cross-view logits.
    logits_ab = _similarity(view_a, view_b_all)
    logits_ba = _similarity(view_b, view_a_all)

    loss_a = tf.losses.softmax_cross_entropy(
        labels, tf.concat([logits_ab, logits_aa], 1), weights=weights)
    loss_b = tf.losses.softmax_cross_entropy(
        labels, tf.concat([logits_ba, logits_bb], 1), weights=weights)

    return loss_a + loss_b, logits_ab, labels
Пример #3
0
 def _tf_fn():
   """Return a grid of 'ij'-ordered indices over `sshape`, cast to `self._dtype`."""
   # One index range per dimension; `sshape` and `self` come from the
   # enclosing scope.
   per_axis = [tf.range(dim) for dim in sshape]
   grid = tf.meshgrid(*per_axis, indexing='ij')
   # Stack so the last axis holds the coordinate of each grid cell.
   return tf.cast(tf.stack(grid, axis=-1), dtype=self._dtype)
    def __init__(self,
                 sess,
                 model,
                 batch_size=1,
                 confidence=CONFIDENCE,
                 targeted=TARGETED,
                 learning_rate=LEARNING_RATE,
                 binary_search_steps=BINARY_SEARCH_STEPS,
                 max_iterations=MAX_ITERATIONS,
                 abort_early=ABORT_EARLY,
                 initial_const=INITIAL_CONST,
                 boxmin=-0.5,
                 boxmax=0.5,
                 x_window=0,
                 y_window=0,
                 window_size=-1):
        """
        The L_2 optimized attack, restricted to a square window of the image.

        This attack is the most efficient and should be used as the primary
        attack to evaluate potential defenses.

        Builds the TensorFlow graph that produces adversarial examples for the
        supplied model. Only the pixels inside the window are perturbed.

        sess: TensorFlow session, stored on the instance.
        model: Object exposing `image_size`, `num_channels`, `num_labels` and
          a `predict(images)` method returning pre-softmax outputs.
        confidence: Confidence of adversarial examples: higher produces examples
          that are farther away, but more strongly classified as adversarial.
        batch_size: Number of attacks to run simultaneously.
        targeted: True if we should perform a targeted attack, False otherwise.
        learning_rate: The learning rate for the attack algorithm. Smaller values
          produce better results but are slower to converge.
        binary_search_steps: The number of times we perform binary search to
          find the optimal tradeoff-constant between distance and confidence.
        max_iterations: The maximum number of iterations. Larger values are more
          accurate; setting too small will require a large learning rate and will
          produce poor results.
        abort_early: If true, allows early aborts if gradient descent gets stuck.
        initial_const: The initial tradeoff-constant to use to tune the relative
          importance of distance and confidence. If binary_search_steps is large,
          the initial constant is not important.
        boxmin: Minimum pixel value (default -0.5).
        boxmax: Maximum pixel value (default 0.5).
        x_window: Column offset of the perturbed window inside the image.
        y_window: Row offset of the perturbed window inside the image.
        window_size: Side length of the perturbed window; -1 (the default)
          means the whole image (`model.image_size`).
        """
        image_size, num_channels, num_labels = (model.image_size,
                                                model.num_channels,
                                                model.num_labels)
        if window_size == -1:
            window_size = image_size

        self.sess = sess
        self.TARGETED = targeted
        self.LEARNING_RATE = learning_rate
        self.MAX_ITERATIONS = max_iterations
        self.BINARY_SEARCH_STEPS = binary_search_steps
        self.ABORT_EARLY = abort_early
        self.CONFIDENCE = confidence
        self.initial_const = initial_const
        self.batch_size = batch_size

        self.repeat = binary_search_steps >= 10

        self.I_KNOW_WHAT_I_AM_DOING_AND_WANT_TO_OVERRIDE_THE_PRESOFTMAX_CHECK = False

        # The perturbation only covers the window; the image itself is always
        # full size. Keeping the two shapes separate lets the perturbation be
        # added to the image when window_size != image_size.
        window_shape = (batch_size, window_size, window_size, num_channels)
        image_shape = (batch_size, image_size, image_size, num_channels)

        # the variable we're going to optimize over (sized to the window so
        # that only that portion of the image is modified)
        modifier = tf.Variable(np.zeros(window_shape, dtype=np.float32))

        # these are variables to be more efficient in sending data to tf
        self.timg = tf.Variable(np.zeros(image_shape), dtype=tf.float32)
        self.tlab = tf.Variable(np.zeros((batch_size, num_labels)),
                                dtype=tf.float32)
        self.const = tf.Variable(np.zeros(batch_size), dtype=tf.float32)

        # and here's what we use to assign them
        self.assign_timg = tf.placeholder(tf.float32, image_shape)
        self.assign_tlab = tf.placeholder(tf.float32, (batch_size, num_labels))
        self.assign_const = tf.placeholder(tf.float32, [batch_size])

        # the resulting image, tanh'd to keep bounded from boxmin to boxmax
        self.boxmul = (boxmax - boxmin) / 2.
        self.boxplus = (boxmin + boxmax) / 2.

        # Embed the window-sized modifier into an all-zero, image-sized
        # tensor so that pixels outside the window receive no perturbation.
        mask = tf.zeros(image_shape, tf.float32)
        modifier_shape = tf.shape(modifier)
        # Index grid over every element of the modifier.
        oo, ii, jj, kk = tf.meshgrid(tf.range(modifier_shape[0]),
                                     tf.range(modifier_shape[1]),
                                     tf.range(modifier_shape[2]),
                                     tf.range(modifier_shape[3]),
                                     indexing='ij')
        # Shift the row/column indices to the window position in the image.
        ii += y_window
        jj += x_window
        # Write the modifier values at the shifted positions.
        mask_to_apply = tf.tensor_scatter_nd_update(
            mask, tf.stack([oo, ii, jj, kk], axis=-1), modifier)

        self.newimg = tf.tanh(mask_to_apply +
                              self.timg) * self.boxmul + self.boxplus

        # prediction BEFORE-SOFTMAX of the model
        self.output = model.predict(self.newimg)

        # distance to the input data
        self.l2dist = tf.reduce_sum(
            tf.square(self.newimg -
                      (tf.tanh(self.timg) * self.boxmul + self.boxplus)),
            [1, 2, 3])

        # compute the probability of the label class versus the maximum other
        real = tf.reduce_sum((self.tlab) * self.output, 1)
        other = tf.reduce_max(
            (1 - self.tlab) * self.output - (self.tlab * 10000), 1)

        if self.TARGETED:
            # if targeted, optimize for making the other class most likely
            loss1 = tf.maximum(0.0, other - real + self.CONFIDENCE)
        else:
            # if untargeted, optimize for making this class least likely.
            loss1 = tf.maximum(0.0, real - other + self.CONFIDENCE)

        # sum up the losses
        self.loss2 = tf.reduce_sum(self.l2dist)
        self.loss1 = tf.reduce_sum(self.const * loss1)
        self.loss = self.loss1 + self.loss2

        # Setup the adam optimizer and keep track of variables we're creating
        start_vars = set(x.name for x in tf.global_variables())
        optimizer = tf.train.AdamOptimizer(self.LEARNING_RATE)
        self.train = optimizer.minimize(self.loss, var_list=[modifier])
        end_vars = tf.global_variables()
        new_vars = [x for x in end_vars if x.name not in start_vars]

        # these are the variables to initialize when we run
        self.setup = []
        self.setup.append(self.timg.assign(self.assign_timg))
        self.setup.append(self.tlab.assign(self.assign_tlab))
        self.setup.append(self.const.assign(self.assign_const))

        # Reset the optimized variable and the optimizer's own variables.
        # `mask` is a plain tensor with no initializer, so it must not be
        # listed here; `modifier` is the variable that needs resetting.
        self.init = tf.variables_initializer(var_list=[modifier] + new_vars)
Пример #5
0
  def __init__(self,
               session,
               player_id,
               state_representation_size,
               num_actions,
               hidden_layers_sizes=128,
               replay_buffer_capacity=10000,
               batch_size=128,
               replay_buffer_class=ReplayBuffer,
               learning_rate=0.01,
               update_target_network_every=1000,
               learn_every=10,
               discount_factor=1.0,
               min_buffer_size_to_learn=1000,
               epsilon_start=1.0,
               epsilon_end=0.1,
               epsilon_decay_duration=int(1e6),
               optimizer_str="sgd",
               loss_str="mse"):
    """Initialize the DQN agent and build its TensorFlow graph.

    Args:
      session: TensorFlow session, stored on the instance.
      player_id: identifier of the player, stored on the instance.
      state_representation_size: width of the info-state placeholders and
        input size of both MLPs.
      num_actions: number of actions; output size of both MLPs.
      hidden_layers_sizes: an int (one hidden layer) or a list of layer sizes.
      replay_buffer_capacity: capacity passed to `replay_buffer_class`.
      batch_size: stored on the instance.
      replay_buffer_class: class instantiated with the capacity.
      learning_rate: learning rate handed to the optimizer.
      update_target_network_every: stored on the instance.
      learn_every: stored on the instance.
      discount_factor: discount applied to the bootstrapped next-state value.
      min_buffer_size_to_learn: stored on the instance.
      epsilon_start: stored on the instance.
      epsilon_end: stored on the instance.
      epsilon_decay_duration: stored on the instance.
      optimizer_str: "adam" or "sgd".
      loss_str: "mse" or "huber".

    Raises:
      ValueError: if `replay_buffer_capacity` is not an int, or if `loss_str`
        or `optimizer_str` is not one of the supported names.
    """
    # Snapshot the constructor arguments first, before any other local is
    # created, so the instance can be copied with the same hyperparameters.
    self._kwargs = locals()

    self.player_id = player_id
    self._session = session
    self._num_actions = num_actions
    # A bare int means a single hidden layer of that size.
    if isinstance(hidden_layers_sizes, int):
      hidden_layers_sizes = [hidden_layers_sizes]
    self._layer_sizes = hidden_layers_sizes
    self._batch_size = batch_size
    self._update_target_network_every = update_target_network_every
    self._learn_every = learn_every
    self._min_buffer_size_to_learn = min_buffer_size_to_learn
    self._discount_factor = discount_factor

    self._epsilon_start = epsilon_start
    self._epsilon_end = epsilon_end
    self._epsilon_decay_duration = epsilon_decay_duration

    # TODO(author6) Allow for optional replay buffer config.
    if not isinstance(replay_buffer_capacity, int):
      raise ValueError("Replay buffer capacity not an integer.")
    self._replay_buffer = replay_buffer_class(replay_buffer_capacity)
    self._prev_timestep = None
    self._prev_action = None

    # Counts steps; used for learning, epsilon decay and target updates.
    self._step_counter = 0

    # Loss of the most recent update step, if any.
    self._last_loss_value = None

    # Placeholders feeding the Q-network update.
    self._info_state_ph = tf.placeholder(
        name="info_state_ph",
        dtype=tf.float32,
        shape=[None, state_representation_size])
    self._action_ph = tf.placeholder(
        name="action_ph", dtype=tf.int32, shape=[None])
    self._reward_ph = tf.placeholder(
        name="reward_ph", dtype=tf.float32, shape=[None])
    self._is_final_step_ph = tf.placeholder(
        name="is_final_step_ph", dtype=tf.float32, shape=[None])
    self._next_info_state_ph = tf.placeholder(
        name="next_info_state_ph",
        dtype=tf.float32,
        shape=[None, state_representation_size])
    self._legal_actions_mask_ph = tf.placeholder(
        name="legal_actions_mask_ph",
        dtype=tf.float32,
        shape=[None, num_actions])

    # Online network, evaluated on the current info state.
    self._q_network = simple_nets.MLP(
        state_representation_size, self._layer_sizes, num_actions)
    self._q_values = self._q_network(self._info_state_ph)

    # Target network, evaluated on the next info state.
    self._target_q_network = simple_nets.MLP(
        state_representation_size, self._layer_sizes, num_actions)
    self._target_q_values = self._target_q_network(self._next_info_state_ph)

    # The target network must not receive gradients while learning.
    self._target_q_values = tf.stop_gradient(self._target_q_values)

    self._update_target_network = self._create_target_network_update_op(
        self._q_network, self._target_q_network)

    # Add a large negative constant to the values of illegal actions before
    # taking the max, so an illegal action is never used as the target.
    illegal_mask = 1 - self._legal_actions_mask_ph
    illegal_penalty = illegal_mask * ILLEGAL_ACTION_LOGITS_PENALTY
    penalized_next_q = tf.math.add(
        tf.stop_gradient(self._target_q_values), illegal_penalty)
    best_next_q = tf.reduce_max(penalized_next_q, axis=-1)
    # The bootstrapped term is zeroed on final steps.
    bootstrap = (
        (1 - self._is_final_step_ph) * self._discount_factor * best_next_q)
    target = self._reward_ph + bootstrap

    # Pick, for every row, the Q-value of the action that was taken.
    row_ids = tf.range(tf.shape(self._q_values)[0])
    taken = tf.stack([row_ids, self._action_ph], axis=-1)
    predictions = tf.gather_nd(self._q_values, taken)

    if loss_str not in ("mse", "huber"):
      raise ValueError("Not implemented, choose from 'mse', 'huber'.")
    loss_class = (
        tf.losses.mean_squared_error
        if loss_str == "mse" else tf.losses.huber_loss)

    self._loss = tf.reduce_mean(
        loss_class(labels=target, predictions=predictions))

    if optimizer_str not in ("adam", "sgd"):
      raise ValueError("Not implemented, choose from 'adam' and 'sgd'.")
    optimizer_class = (
        tf.train.AdamOptimizer
        if optimizer_str == "adam" else tf.train.GradientDescentOptimizer)
    self._optimizer = optimizer_class(learning_rate=learning_rate)

    self._learn_step = self._optimizer.minimize(self._loss)
    self._initialize()
    def _scan_initial_state(self):
        """Build the initial TensorArrays and index for the binning scan.

        availability: TensorArray[queue_size, num_sequences]
          Number of tokens still available in the ith bin; every entry
          starts at packed_length. See the implementation note below.

        contents: TensorArray[queue_size * packed_length, num_sequences * 2]
          The contents of the packed strings together with a bit mask marking
          where sequences begin. Stored flat and accessed in offsets of
          packed_length. Starts out all zeros.

        top_index: scalar in [0, queue_size)
          Integer tensor saying which index is the "top" bin. See the
          implementation note below. Starts at zero.

        IMPLEMENTATION_NOTE:
          The FFD algorithm periodically pops the topmost queue and pushes a
          new one to replace it. To get those semantics from a fixed-size
          TensorArray, indexing operations are shifted by top_index. For
          example, instead of:
            `queue_available.read(i)`

          a read is performed as:
            `queue_available.read((i - top_index) % queue_size)`

          to account for the "ith" logical FFD queue being stored at position
          j. The pop / push update then amounts to incrementing top_index
          (and zeroing the old top_index position).

        Returns:
          The tuple `(availability, contents, top_index)`: the state for the
          binning scan.
        """
        # Every bin starts with its full capacity available.
        full_bins = tf.ones((self._queue_size, self._num_sequences),
                            dtype=INDEX_DTYPE) * self._packed_length
        flat_size = self._packed_length * self._queue_size
        flat_indices = tf.range(flat_size, dtype=INDEX_DTYPE)
        blank_rows = tf.zeros((flat_size, self._num_sequences * 2),
                              dtype=self._token_dtype)

        availability = tf.TensorArray(
            dtype=INDEX_DTYPE,
            size=self._queue_size,
            dynamic_size=False,
            clear_after_read=False,
            element_shape=(self._num_sequences, ))
        availability = availability.scatter(
            tf.range(self._queue_size, dtype=INDEX_DTYPE), full_bins)

        contents = tf.TensorArray(
            dtype=self._token_dtype,
            size=flat_size,
            dynamic_size=False,
            clear_after_read=False,
            element_shape=(self._num_sequences * 2, ))
        contents = contents.scatter(flat_indices, blank_rows)

        # Which index counts as the "top" bucket for the first-fit
        # descending algorithm.
        top_index = tf.zeros((), dtype=INDEX_DTYPE)

        return availability, contents, top_index
Пример #7
0
def compute_mel_filterbank_features(waveforms,
                                    sample_rate=16000,
                                    dither=1.0 / np.iinfo(np.int16).max,
                                    preemphasis=0.97,
                                    frame_length=25,
                                    frame_step=10,
                                    fft_length=None,
                                    window_fn=functools.partial(
                                        tf.signal.hann_window, periodic=True),
                                    lower_edge_hertz=80.0,
                                    upper_edge_hertz=7600.0,
                                    num_mel_bins=80,
                                    log_noise_floor=1e-3,
                                    apply_mask=True):
    """Extract log mel-filterbank features from waveforms using tf ops.

  Args:
    waveforms: float32 tensor with shape [batch_size, max_len]
    sample_rate: sampling rate of the waveform
    dither: stddev of Gaussian noise added to the waveform to prevent
      quantization artefacts; skipped when not positive
    preemphasis: waveform high-pass filtering constant; skipped when not
      positive
    frame_length: frame length in ms
    frame_step: frame step in ms
    fft_length: number of fft bins; defaults to the smallest power of two
      that is >= the frame length in samples
    window_fn: windowing function
    lower_edge_hertz: lowest frequency of the filterbank
    upper_edge_hertz: highest frequency of the filterbank
    num_mel_bins: filterbank size
    log_noise_floor: lower clip applied to the mel energies before the log
    apply_mask: When working on a batch of samples, set padding frames to zero
  Returns:
    filterbanks: a float32 tensor with shape [batch_size, len, num_bins, 1]
  """
    # Length of each waveform: the largest index holding a non-zero sample,
    # plus one. Samples that are exactly 0.0 are common inside real audio, so
    # simply counting the non-zero samples would not give the length.
    sample_index = tf.expand_dims(tf.range(tf.shape(waveforms)[1]), 0)
    is_nonzero = tf.to_int32(tf.not_equal(waveforms, 0.0))
    wav_lens = tf.reduce_max(sample_index * is_nonzero, axis=-1) + 1

    if dither > 0:
        noise = tf.random_normal(tf.shape(waveforms), stddev=dither)
        waveforms += noise
    if preemphasis > 0:
        # First-order difference filter; it shortens the signal by one sample.
        current, previous = waveforms[:, 1:], waveforms[:, :-1]
        waveforms = current - preemphasis * previous
        wav_lens -= 1

    # Convert the frame parameters from milliseconds to samples.
    frame_length_samples = int(frame_length * sample_rate / 1e3)
    frame_step_samples = int(frame_step * sample_rate / 1e3)
    if fft_length is None:
        fft_length = int(2**(np.ceil(np.log2(frame_length_samples))))

    # `stfts` is a complex64 Tensor holding the short-time Fourier transform
    # of each waveform. Its shape is [batch_size, ?, fft_unique_bins] where
    # fft_unique_bins = fft_length // 2 + 1.
    stfts = tf.contrib.signal.stft(
        waveforms,
        frame_length=frame_length_samples,
        frame_step=frame_step_samples,
        fft_length=fft_length,
        window_fn=window_fn,
        pad_end=True)

    # Number of frames per waveform (ceil division); frames whose index is
    # <= that count are kept by the mask.
    stft_lens = (wav_lens + (frame_step_samples - 1)) // frame_step_samples
    frame_index = tf.expand_dims(tf.range(tf.shape(stfts)[1]), 0)
    masks = tf.to_float(
        tf.less_equal(frame_index, tf.expand_dims(stft_lens, 1)))

    # Magnitude of the complex-valued STFT: a float32 Tensor of shape
    # [batch_size, ?, fft_unique_bins].
    magnitudes = tf.abs(stfts)

    # Warp the linear-scale magnitude spectrograms onto the mel scale.
    num_linear_bins = magnitudes.shape[-1].value
    mel_weights = tf.contrib.signal.linear_to_mel_weight_matrix(
        num_mel_bins, num_linear_bins, sample_rate, lower_edge_hertz,
        upper_edge_hertz)
    mel_energies = tf.tensordot(magnitudes, mel_weights, 1)
    # Note: Shape inference for tensordot does not currently handle this case.
    mel_energies.set_shape(
        magnitudes.shape[:-1].concatenate(mel_weights.shape[-1:]))

    log_mel = tf.log(tf.maximum(log_noise_floor, mel_energies))

    if apply_mask:
        log_mel *= tf.expand_dims(tf.to_float(masks), -1)

    return tf.expand_dims(log_mel, -1, name="mel_sgrams")
Пример #8
0
def real_svg_top(body_output,
                 unused_targets,
                 model_hparams,
                 unused_vocab_size,
                 hard=False):
    """Applies the Mixture Density Network on top of the LSTM outputs.

  Args:
    body_output: outputs from LSTM with shape [batch, seqlen, 1, hidden_size]
    unused_targets: what the ground truth SVG outputted should be (unused).
    model_hparams: hyper-parameters; `num_mixture` and `mode` are always read,
      `mix_temperature` and `gauss_temperature` only when sampling.
    unused_vocab_size: unused
    hard: whether to force predict mode functionality, or return all MDN
      components

  Returns:
    The MDN output. Shape [batch, seqlen, 1, 10] (a one-hot command of size 4
      plus 6 sampled arguments) if in predict mode or hard=True; otherwise the
      raw dense output of shape [batch, seqlen, 1, 4 + 6 * num_mix * 3].
  """
    # mixture of gaussians for 6 args plus 4 extra states for cmds
    num_mix = model_hparams.num_mixture
    nout = 4 + 6 * num_mix * 3

    # the 'hard' option is meant to be used if 'top' is called within body
    with tf.variable_scope('real_top', reuse=tf.AUTO_REUSE):
        ret = tf.layers.dense(body_output, nout, name='top')
        batch_size = common_layers.shape_list(ret)[0]

        if not (hard or model_hparams.mode == tf.estimator.ModeKeys.PREDICT):
            # Training / eval: hand back every MDN component untouched.
            return ret

        temperature = model_hparams.mix_temperature

        def _tempered_softmax(logits):
            # Softmax over the last axis of `logits / temperature`, with the
            # max subtracted before exponentiation.
            scaled = tf.identity(logits) / temperature
            unnormalized = tf.exp(
                scaled - tf.reduce_max(scaled, axis=[-1], keepdims=True))
            return unnormalized / tf.reduce_sum(
                unnormalized, axis=[-1], keepdims=True)

        # Sample one of the 4 commands; the sample is already a hard index
        # of shape [batch, seq, 1], so turn it into a one-hot vector.
        command_probs = _tempered_softmax(ret[:, :, :, :4])
        command = tf.distributions.Categorical(probs=command_probs).sample()
        command = tf.one_hot(command, 4)

        # args are [batch, seq, 1, 6*3*num_mix]; want
        # [batch * seq * 6, 3*num_mix]
        arguments = ret[:, :, :, 4:]
        arguments = tf.reshape(arguments, [-1, 3 * num_mix])

        # each of these is [batch*seq*6, num_mix]
        out_logmix, out_mean, out_logstd = _get_mdn_coef(arguments)

        # Pick one mixture component per row from the tempered mixture
        # weights.
        mix_probs = _tempered_softmax(out_logmix)
        component = tf.distributions.Categorical(probs=mix_probs).sample()
        component = tf.cast(component, tf.int32)
        component = tf.reshape(component, [-1])
        # (row, component) pairs for gather_nd
        picks = tf.stack([tf.range(tf.size(component)), component], axis=-1)

        chosen_mean = tf.gather_nd(out_mean, picks)
        chosen_logstd = tf.gather_nd(out_logstd, picks)

        # Sample from the chosen Gaussian; the noise is scaled by
        # sqrt(gauss_temperature).
        rand_gaussian = (tf.random.normal(tf.shape(chosen_mean)) *
                         tf.sqrt(model_hparams.gauss_temperature))
        arguments = chosen_mean + tf.exp(chosen_logstd) * rand_gaussian
        arguments = tf.reshape(arguments, [batch_size, -1, 1, 6])

        # Concatenate the sampled command with its sampled arguments.
        ret = tf.concat([command, arguments], axis=-1)

    return ret
Пример #9
0
def _generate_detections_per_image(boxes,
                                   scores,
                                   max_total_size=100,
                                   nms_iou_threshold=0.3,
                                   score_threshold=0.05,
                                   pre_nms_num_boxes=5000):
    """Run per-class NMS for one image and keep the best-scoring detections.

  Args:
    boxes: a tensor with shape [N, num_classes, 4] or [N, 1, 4], the box
      predictions on all feature levels. N is the number of total anchors on
      all levels. When the second dimension is 1 the same boxes are used for
      every class.
    scores: a tensor with shape [N, num_classes], the class scores on all
      feature levels. num_classes is the number of classes predicted by the
      model.
    max_total_size: a scalar, the maximum number of boxes retained over all
      classes (also the per-class NMS output size).
    nms_iou_threshold: a float, the IOU threshold above which boxes are
      considered to overlap too much.
    score_threshold: a float, the score threshold below which boxes are
      removed.
    pre_nms_num_boxes: an int, the number of top candidate detections kept per
      class before NMS.

  Returns:
    nms_boxes: Tensor of shape [max_total_size, 4] with the top detected
      boxes.
    nms_scores: Tensor of shape [max_total_size] with the confidence scores
      sorted in descending order; padding entries are -1.
    nms_classes: `int` Tensor of shape [max_total_size] with the class index
      of each detected box.
    valid_detections: scalar int32 Tensor counting the entries whose score is
      greater than -1; only that many leading entries are valid detections.
  """
    per_class_boxes = []
    per_class_scores = []
    per_class_ids = []
    box_class_dim = boxes.get_shape().as_list()[1]
    num_classes = scores.get_shape().as_list()[1]
    for class_id in range(num_classes):
        # With class-agnostic boxes (second dimension of 1) every class
        # reads column 0.
        class_boxes = boxes[:, min(box_class_dim - 1, class_id)]
        class_scores = scores[:, class_id]

        # Keep only the best pre_nms_num_boxes candidates before NMS.
        num_candidates = tf.minimum(
            tf.shape(class_scores)[-1], pre_nms_num_boxes)
        class_scores, top_idx = tf.nn.top_k(class_scores, k=num_candidates)
        class_boxes = tf.gather(class_boxes, top_idx)

        kept_idx, num_kept = tf.image.non_max_suppression_padded(
            tf.cast(class_boxes, tf.float32),
            tf.cast(class_scores, tf.float32),
            max_total_size,
            iou_threshold=nms_iou_threshold,
            score_threshold=score_threshold,
            pad_to_max_output_size=True,
            name='nms_detections_' + str(class_id))
        kept_boxes = tf.gather(class_boxes, kept_idx)
        kept_scores = tf.gather(class_scores, kept_idx)
        # Padding slots (beyond the valid count) get a score of -1.
        is_valid = tf.less(tf.range(max_total_size), [num_kept])
        kept_scores = tf.where(
            is_valid, kept_scores, -tf.ones_like(kept_scores))

        per_class_boxes.append(kept_boxes)
        per_class_scores.append(kept_scores)
        per_class_ids.append(tf.fill([max_total_size], class_id))

    # Merge all classes and keep the max_total_size best-scoring entries.
    all_boxes = tf.concat(per_class_boxes, axis=0)
    all_scores = tf.concat(per_class_scores, axis=0)
    all_ids = tf.concat(per_class_ids, axis=0)
    best_scores, best_idx = tf.nn.top_k(
        all_scores, k=max_total_size, sorted=True)
    best_boxes = tf.gather(all_boxes, best_idx)
    best_ids = tf.gather(all_ids, best_idx)
    valid_detections = tf.reduce_sum(
        tf.cast(tf.greater(best_scores, -1), tf.int32))
    return best_boxes, best_scores, best_ids, valid_detections
Пример #10
0
def _local_perm(inputs, targets, is_masked, perm_size, seq_len):
    """Draws a random factorization order and derives the attention masks.

    Args:
      inputs: int64 Tensor of shape [seq_len] holding the input ids.
      targets: int64 Tensor of shape [seq_len] holding the target ids.
      is_masked: bool Tensor of shape [seq_len]; True marks a position that
        was selected for partial prediction.
      perm_size: length of the longest permutation span. May be set to
        reuse_len, but should not exceed it or there will be data leaks.
      seq_len: int, sequence length.

    Returns:
      A 5-tuple `(perm_mask, new_targets, target_mask, inputs_k, inputs_q)`:
      the float32 [seq_len, seq_len] permutation mask, the shifted targets,
      the float32 target mask, the unchanged `inputs` (as `inputs_k`) and the
      target mask once more (as `inputs_q`).
    """

    # Random factorization order: view the positions as [perm_size, -1],
    # shuffle along the leading axis, then flatten back to [seq_len].
    order = tf.range(seq_len, dtype=tf.int64)
    order = tf.reshape(order, [-1, perm_size])
    order = tf.transpose(order)
    order = tf.random_shuffle(order)
    order = tf.transpose(order)
    order = tf.reshape(order, [-1])

    # Positions that hold neither [SEP] nor [CLS].
    is_sep = tf.equal(inputs, SEP_ID)
    is_cls = tf.equal(inputs, CLS_ID)
    is_regular = tf.logical_not(tf.logical_or(is_sep, is_cls))

    # Regular tokens that were not picked for prediction, and the complement.
    is_visible = tf.logical_and(tf.logical_not(is_masked), is_regular)
    is_hidden_or_func = tf.logical_not(is_visible)

    # Visible tokens get the smallest rank (-1) so that
    # (1) every other position can attend to them, and
    # (2) they cannot attend to masked positions (no information leak).
    lowest_rank = -tf.ones([seq_len], dtype=tf.int64)
    rank = tf.where(is_visible, lowest_rank, order)

    # target_mask: 1 for masked regular tokens (mask as input, has loss),
    # 0 for everything else (token / [SEP] / [CLS] as input, no loss).
    is_target = tf.logical_and(is_hidden_or_func, is_regular)
    target_mask = tf.cast(is_target, tf.float32)

    # Targets must not see themselves, so they keep their own rank as the
    # query rank; all other positions are bumped by one.
    query_rank = tf.where(is_target, rank, rank + 1)

    # perm_mask[i, j] == 1: i cannot attend to j (rank_i <= rank_j and j is
    # masked or functional); 0 otherwise.
    blocked = tf.logical_and(query_rank[:, None] <= rank[None, :],
                             is_hidden_or_func)
    perm_mask = tf.cast(blocked, tf.float32)

    # New target: [next token] for LM and [curr token] (self) for PLM.
    new_targets = tf.concat([inputs[0:1], targets[:-1]], axis=0)

    # inputs_k is the raw input; inputs_q is the target mask.
    return perm_mask, new_targets, target_mask, inputs, target_mask
Пример #11
0
    def parser(record):
        """Parses one serialized tfrecord into a dict of model features.

        Pops "input", "target" and "is_masked" from the parsed example, runs
        `_local_perm` separately on the first `reuse_len` positions and on
        the remaining ones, stitches the two results together and stores the
        derived tensors back into the example dict.

        Args:
            record: serialized example passed to `tf.parse_single_example`.

        Returns:
            The example dict, after `_convert_example` was called on it.
        """
        feature_spec = {
            "input": tf.FixedLenFeature([seq_len], tf.int64),
            "target": tf.FixedLenFeature([seq_len], tf.int64),
            "seg_id": tf.FixedLenFeature([seq_len], tf.int64),
            "label": tf.FixedLenFeature([1], tf.int64),
            "is_masked": tf.FixedLenFeature([seq_len], tf.int64),
        }
        example = tf.parse_single_example(serialized=record,
                                          features=feature_spec)

        inputs = example.pop("input")
        target = example.pop("target")
        is_masked = tf.cast(example.pop("is_masked"), tf.bool)

        non_reuse_len = seq_len - reuse_len
        assert perm_size <= reuse_len and perm_size <= non_reuse_len

        # Permute the reused prefix and the remaining suffix independently.
        head_mask, head_target, head_tmask, head_k, head_q = _local_perm(
            inputs[:reuse_len], target[:reuse_len], is_masked[:reuse_len],
            perm_size, reuse_len)
        tail_mask, tail_target, tail_tmask, tail_k, tail_q = _local_perm(
            inputs[reuse_len:], target[reuse_len:], is_masked[reuse_len:],
            perm_size, non_reuse_len)

        # Assemble the full mask block-wise: prefix rows are padded with ones
        # over the suffix columns, suffix rows with zeros over the prefix
        # columns.
        head_rows = tf.concat(
            [head_mask, tf.ones([reuse_len, non_reuse_len])], axis=1)
        tail_rows = tf.concat(
            [tf.zeros([non_reuse_len, reuse_len]), tail_mask], axis=1)
        perm_mask = tf.concat([head_rows, tail_rows], axis=0)
        target = tf.concat([head_target, tail_target], axis=0)
        target_mask = tf.concat([head_tmask, tail_tmask], axis=0)
        input_k = tf.concat([head_k, tail_k], axis=0)
        input_q = tf.concat([head_q, tail_q], axis=0)

        if num_predict is None:
            example["target"] = tf.reshape(target, [seq_len])
            example["target_mask"] = tf.reshape(target_mask, [seq_len])
        else:
            positions = tf.range(seq_len, dtype=tf.int64)
            is_target = tf.cast(target_mask, tf.bool)
            positions = tf.boolean_mask(positions, is_target)

            # Extra padding due to CLS/SEP introduced after prepro.
            actual_num_predict = tf.shape(positions)[0]
            pad_len = num_predict - actual_num_predict

            # target_mapping: one one-hot row per target position, followed
            # by all-zero padding rows.
            mapping_rows = tf.one_hot(positions, seq_len, dtype=tf.float32)
            mapping_pad = tf.zeros([pad_len, seq_len],
                                   dtype=mapping_rows.dtype)
            target_mapping = tf.concat([mapping_rows, mapping_pad], axis=0)
            example["target_mapping"] = tf.reshape(target_mapping,
                                                   [num_predict, seq_len])

            # target: keep only the target positions, zero-padded.
            kept_targets = tf.boolean_mask(target, is_target)
            target_pad = tf.zeros([pad_len], dtype=kept_targets.dtype)
            target = tf.concat([kept_targets, target_pad], axis=0)
            example["target"] = tf.reshape(target, [num_predict])

            # target mask: ones for real targets, zeros for the padding.
            real_part = tf.ones([actual_num_predict], dtype=tf.float32)
            pad_part = tf.zeros([pad_len], dtype=tf.float32)
            target_mask = tf.concat([real_part, pad_part], axis=0)
            example["target_mask"] = tf.reshape(target_mask, [num_predict])

        # Reshape back to fixed (static) shapes.
        example["perm_mask"] = tf.reshape(perm_mask, [seq_len, seq_len])
        example["input_k"] = tf.reshape(input_k, [seq_len])
        example["input_q"] = tf.reshape(input_q, [seq_len])

        _convert_example(example, use_bfloat16)

        for name, tensor in example.items():
            logging.info("%s: %s", name, tensor)

        return example
Пример #12
0
def _get_final_index(sequence_length, time_major=True):
  """Builds index pairs addressing the final step of every sequence.

  Args:
    sequence_length: 1-D integer tensor of per-sequence lengths with a static
      leading dimension.
    time_major: if True each row is `(last_step, batch_index)`, otherwise
      `(batch_index, last_step)`.

  Returns:
    A tensor stacking the two index vectors along axis 1, where `last_step`
    is `max(0, sequence_length - 1)`.
  """
  last_step = tf.maximum(0, sequence_length - 1)
  batch_ids = tf.range(sequence_length.shape[0])
  if time_major:
    columns = [last_step, batch_ids]
  else:
    columns = [batch_ids, last_step]
  return tf.stack(columns, axis=1)

def sum_python(N):
    """Return the sum of the squares of 0, 1, ..., N - 1, computed with NumPy."""
    squares = np.square(np.arange(N))
    return squares.sum()


#%%
# NumPy baseline: sum of squares below 10**5.
sum_python(10**5)
#%%
# TensorFlow teaser: the same computation expressed as a graph.

# Symbolic int64 input; its value is supplied at evaluation time.
N = tf.placeholder('int64', name='input_to_fun')

# A recipe for producing the result (nothing is computed yet).
result = tf.reduce_sum(tf.range(N)**2)
# Bare expression so that a notebook/IDE cell displays the tensor.
result
#%%
# Evaluate the recipe with N fed as 10**5 (requires a default session).
result.eval({N: 10**5})
# Graph logger for TensorBoard.
# NOTE(review): `sess` is not defined in this part of the file - presumably
# created earlier; confirm.
writer = tf.summary.FileWriter('Tensorboard_logs', graph=sess.graph)
#%%
with tf.name_scope('Placeholder_examples'):
    # Default placeholder that can hold an arbitrary float32 value:
    # scalar, vector, matrix, etc.
    arbitrary_input = tf.placeholder('float32')

    # Input vector of arbitrary length.
    input_vector = tf.placeholder('float32', shape=(None, ))

    # Input vector that must have 10 elements and an integer type.
Пример #14
0
    def _build_train_op(self):
        """Builds the quantile-regression training op.

        Returns:
          A pair `(train_op, loss)`: the op performing one optimizer step on
          the mean loss, and the per-example loss tensor.
        """
        target_distribution = tf.stop_gradient(
            self._build_target_distribution())

        # Row index of every batch element, shaped batch_size x 1.
        batch_rows = tf.range(
            tf.shape(self._replay_net_outputs.logits)[0])[:, None]
        # (row, action) pairs, shaped batch_size x 2.
        row_action_pairs = tf.concat(
            [batch_rows, self._replay.actions[:, None]], 1)
        # For each batch element, pick the logits of the action it took.
        chosen_action_logits = tf.gather_nd(self._replay_net_outputs.logits,
                                            row_action_pairs)

        # Input `u' of Eq. 9: pairwise differences between target values and
        # the chosen-action logits.
        bellman_errors = (target_distribution[:, None, :] -
                          chosen_action_logits[:, :, None])

        # Eq. 9 of paper: quadratic inside the kappa band, linear outside it.
        inside_band = tf.to_float(tf.abs(bellman_errors) <= self.kappa)
        quadratic_part = inside_band * 0.5 * bellman_errors**2
        outside_band = tf.to_float(tf.abs(bellman_errors) > self.kappa)
        linear_part = outside_band * self.kappa * (
            tf.abs(bellman_errors) - 0.5 * self.kappa)
        huber_loss = quadratic_part + linear_part

        # Quantile midpoints.  See Lemma 2 of paper.
        tau_hat = (tf.range(self._num_atoms, dtype=tf.float32) +
                   0.5) / self._num_atoms

        # Eq. 10 of paper.
        is_negative = tf.to_float(bellman_errors < 0)
        quantile_huber_loss = tf.abs(tau_hat[None, :, None] -
                                     is_negative) * huber_loss

        # Average over the target value dimension, then sum over tau.
        loss = tf.reduce_sum(tf.reduce_mean(quantile_huber_loss, 2), 1)

        if self._replay_scheme != 'prioritized':
            update_priorities_op = tf.no_op()
        else:
            target_priorities = self._replay.tf_get_priority(
                self._replay.indices)
            # The original prioritized experience replay uses a linear
            # exponent schedule 0.4 -> 1.0. Comparing the schedule to a fixed
            # exponent of 0.5 on 5 games (Asterix, Pong, Q*Bert, Seaquest,
            # Space Invaders) suggested a fixed exponent actually performs
            # better, except on Pong.
            loss_weights = 1.0 / tf.sqrt(target_priorities + 1e-10)
            loss_weights = loss_weights / tf.reduce_max(loss_weights)

            # Rainbow and prioritized replay are parametrized by an exponent
            # alpha, but in both cases it is set to 0.5 - for simplicity's
            # sake it is left as is here, using the more direct tf.sqrt().
            # Taking the square root "makes sense", as we are dealing with a
            # squared loss. A small nonzero value is added to the loss to
            # avoid 0 priority items: while technically that may be okay,
            # setting all items to 0 priority will cause troubles, and also
            # result in 1.0 / 0.0 = NaN correction terms.
            update_priorities_op = self._replay.tf_set_priority(
                self._replay.indices, tf.sqrt(loss + 1e-10))

            # Weight loss by inverse priorities.
            loss = loss_weights * loss

        # Make the priority update run as part of every training step.
        with tf.control_dependencies([update_priorities_op]):
            if self.summary_writer is not None:
                with tf.variable_scope('Losses'):
                    tf.summary.scalar('QuantileLoss', tf.reduce_mean(loss))
            train_op = self.optimizer.minimize(tf.reduce_mean(loss))
            return train_op, loss
Пример #15
0
def naive_log_likelihood(x, presence=None):
    """Scores `x` under the vote mixture and picks a winning vote per point.

    Reads the module-level `_votes`, `_scales`, `_vote_presence_prob` and
    `_n_votes`, and builds the component distribution with `_get_pdf`.

    Args:
        x: tensor whose first two static dimensions are the batch size and
            the number of input points.
        presence: optional tensor; when given it is cast to float and
            multiplied into the per-point mixture log-probability.

    Returns:
        An `OutputTuple` with the batch-mean log-probability, the winning
        votes and their presences, mixing logits / log-probs and the
        posterior mixing probabilities.
    """

    batch_size, n_input_points = x.shape[:2].as_list()

    # Component distribution, one per vote, with a broadcast axis for points.
    # Presumably [B, 1, n_votes, n_input_dims] - TODO confirm.
    votes_b = tf.expand_dims(_votes, 1)
    scales_b = tf.expand_dims(tf.expand_dims(_scales, 1), -1)
    component_pdf = _get_pdf(votes_b, scales_b)

    # Log-probability of every point under every vote, per input dimension.
    x_b = tf.expand_dims(x, 2)
    log_prob_per_dim = component_pdf.log_prob(x_b)

    # Sum over the last (input) dimension, then append one extra "dummy"
    # component with constant log-prob -2 * log(10) along axis 2.
    vote_log_prob = tf.reduce_sum(log_prob_per_dim, -1)
    dummy_vote_log_prob = (tf.zeros([batch_size, n_input_points, 1]) -
                           2. * tf.log(10.))
    vote_log_prob = tf.concat([vote_log_prob, dummy_vote_log_prob], 2)

    # Mixing logits from the vote presence probabilities, with the matching
    # dummy logit appended along axis 1.
    mixing_logits = math_ops.safe_log(_vote_presence_prob)
    dummy_logit = tf.zeros([batch_size, 1]) - 2. * tf.log(10.)
    mixing_logits = tf.concat([mixing_logits, dummy_logit], 1)

    # Normalise the mixing logits into log-probabilities over components.
    log_normaliser = tf.reduce_logsumexp(mixing_logits, 1, keepdims=True)
    mixing_log_prob = mixing_logits - log_normaliser

    # Per-point mixture log-likelihood: logsumexp over components of
    # (mixing log-prob + component log-prob).
    mixing_log_prob_b = tf.expand_dims(mixing_log_prob, 1)
    per_point_log_prob = tf.reduce_logsumexp(
        mixing_log_prob_b + vote_log_prob, 2)

    if presence is not None:
        presence = tf.to_float(presence)
        per_point_log_prob = per_point_log_prob * presence

    # Sum over points, then average over the batch.
    per_example_log_prob = tf.reduce_sum(per_point_log_prob, 1)
    batch_log_prob = tf.reduce_mean(per_example_log_prob)

    # Posterior logits per point and component; the winner is the argmax
    # over all components except the trailing dummy one.
    posterior_logits = mixing_log_prob_b + vote_log_prob
    winning_vote_idx = tf.argmax(posterior_logits[:, :, :-1], 2)

    # Pair every winning index with its batch index for gather_nd.
    batch_idx = tf.expand_dims(tf.range(batch_size, dtype=tf.int64), -1)
    batch_idx = snt.TileByDim([1], [winning_vote_idx.shape[-1]])(batch_idx)
    gather_idx = tf.stack([batch_idx, winning_vote_idx], -1)

    winning_vote = tf.gather_nd(_votes, gather_idx)
    winning_pres = tf.gather_nd(_vote_presence_prob, gather_idx)

    # A vote counts as present when its logit exceeds the dummy logit.
    vote_presence = tf.greater(mixing_logits[:, :-1], mixing_logits[:, -1:])

    # Integer-divide the vote index by `_n_votes`; assumes votes are ordered
    # by capsule - TODO confirm.
    is_from_capsule = winning_vote_idx // _n_votes

    posterior_mixing_probs = tf.nn.softmax(posterior_logits, -1)[..., :-1]

    assert winning_vote.shape == x.shape

    return OutputTuple(
        log_prob=batch_log_prob,
        vote_presence=tf.to_float(vote_presence),
        winner=winning_vote,
        winner_pres=winning_pres,
        is_from_capsule=is_from_capsule,
        mixing_logits=mixing_logits,
        mixing_log_prob=mixing_log_prob,
        # TODO(adamrk): this is broken
        soft_winner=tf.zeros_like(winning_vote),
        soft_winner_pres=tf.zeros_like(winning_pres),
        posterior_mixing_probs=posterior_mixing_probs,
    )
Пример #16
0
def _create_make_unique(inputs):
  """Replaces the lower bits of each element with iota.

  The iota is used to derive the index, and also serves the purpose to
  make each element unique to break ties.

  Args:
    inputs: A tensor with rank of 2 and dtype of tf.float32.
      [batch_size, original_size]. Both dimensions are read from the static
      shape.

  Returns:
    A tensor after element wise transformation, with dtype the same as inputs.
    [batch_size, original_size].

  Raises:
    ValueError: If the rank of the input tensor does not equal 2.
  """
  if inputs.shape.ndims != 2:
    raise ValueError("Input of top_k_with_unique must be rank-2 "
                     "but got: %s" % inputs.shape)

  height = inputs.shape[0]
  width = inputs.shape[1]
  zeros = tf.zeros([height, width], dtype=tf.int32)

  # Count_mask is used to mask away the low order bits to ensure that every
  # element is distinct. ceil(log2(width)) is computed with exact integer
  # arithmetic so that floating-point rounding can never reserve one bit too
  # many when width is an exact power of two.
  log2_ceiling = (int(width) - 1).bit_length()
  next_power_of_two = 1 << log2_ceiling
  count_mask = ~(next_power_of_two - 1)
  count_mask_r0 = tf.constant(count_mask, dtype=tf.int32)
  count_mask_r2 = tf.fill([height, width], count_mask_r0)

  # Smallest_normal is the bit representation of the smallest positive normal
  # floating point number. The sign is zero, exponent is one, and the fraction
  # is zero.
  smallest_normal = 1 << 23
  smallest_normal_r0 = tf.constant(smallest_normal, dtype=tf.int32)
  smallest_normal_r2 = tf.fill([height, width], smallest_normal_r0)

  # Low_bit_mask is used to mask away the sign bit when computing the absolute
  # value. It is written as 0x7FFFFFFF directly: `~(1 << 31)` is -2**31 - 1,
  # which does not fit in int32 and only produced this bit pattern through
  # silent integer wrap-around during the conversion to a constant.
  low_bit_mask = (1 << 31) - 1
  low_bit_mask_r0 = tf.constant(low_bit_mask, dtype=tf.int32)
  low_bit_mask_r2 = tf.fill([height, width], low_bit_mask_r0)

  # Column index of every element, tiled over all rows.
  iota = tf.tile(tf.expand_dims(tf.range(width, dtype=tf.int32), 0),
                 [height, 1])

  # Compare the absolute value with positive zero to handle negative zero.
  # Zeros are replaced by the smallest normal number carrying the same sign.
  input_r2 = tf.bitcast(inputs, tf.int32)
  abs_r2 = tf.bitwise.bitwise_and(input_r2, low_bit_mask_r2)
  if_zero_r2 = tf.equal(abs_r2, zeros)
  smallest_normal_preserving_sign_r2 = tf.bitwise.bitwise_or(
      input_r2, smallest_normal_r2)
  input_no_zeros_r2 = tf.where(
      if_zero_r2, smallest_normal_preserving_sign_r2, input_r2)

  # Discard the low-order bits and replace with iota.
  and_r2 = tf.bitwise.bitwise_and(input_no_zeros_r2, count_mask_r2)
  or_r2 = tf.bitwise.bitwise_or(and_r2, iota)
  return tf.bitcast(or_r2, tf.float32)
def _scan_step_fn(state, example, packed_length, queue_size, spacing,
                  num_sequences, token_dtype):  # pylint: disable=g-doc-args
    """Transform function used by tf.data.experimental.scan to process an example.

  This is written as a stateless function rather than a class method because we
  trace it with AutoGraph (in order to simplify the conditional), and this way
  we don't have to worry about handling re-tracing semantics.

  The example is placed into a queue slot that still has room for it. If no
  slot has room, the slot at `top_index` is emitted, reset, and reused for the
  example.

  Args:
    See the SequenceDatasetPacker class. `state` is unpacked as the tuple
    `(availability, contents, top_index)`.

  Returns:
    The updated queue state, and a pair `(emitted, output_contents)` holding
    either a packed example (`emitted` is True) or a dummy sequence which will
    be filtered out downstream.
  """

    # Unpack the queue state; the names are rebound below as it is updated.
    availability, contents, top_index = state

    # Length of every sequence of the example, and which queue slots have
    # enough space left for all of them.
    lengths = tf.concat([tf.shape(i) for i in example], axis=0)
    start_availability = availability.stack()
    can_fit = tf.reduce_all(tf.greater_equal(start_availability, lengths),
                            axis=1)
    any_can_fit = tf.reduce_any(can_fit, axis=0)

    # AutoGraph will convert this block to a tf.cond
    if any_can_fit:
        # This indicates where in the FFD queue rotation a given index sits
        shifted_range = (tf.range(queue_size, dtype=INDEX_DTYPE) -
                         top_index) % queue_size

        # Mark any indices which cannot accommodate the current example.
        exclusion_mask = tf.cast(tf.logical_not(can_fit),
                                 INDEX_DTYPE) * queue_size

        # Index in [0, queue_size) in which to place the sample. Note, this index
        # is the position in the actual TensorArray, not the index of the FFD queue.
        queue_index = (tf.reduce_min(shifted_range + exclusion_mask) +
                       top_index) % queue_size

        # NOTE(taylorrobie): We emit a non-empty Tensor for downstream checks.
        output_contents = -tf.ones((1, num_sequences), dtype=token_dtype)

    else:
        # Nothing fits: emit the rows of the slot at `top_index`.
        index_range = top_index * packed_length + tf.range(packed_length)
        output_contents = contents.gather(index_range)

        # Reset the queue state.
        availability = availability.write(
            top_index,
            packed_length * tf.ones((num_sequences, ), dtype=INDEX_DTYPE))
        empty_contents = tf.zeros((packed_length, num_sequences * 2),
                                  dtype=token_dtype)
        contents = contents.scatter(index_range, empty_contents)

        # The freshly emptied slot receives the example; the top of the queue
        # moves on to the next slot.
        queue_index = top_index
        top_index = (top_index + 1) % queue_size

    # Reserve room for the example plus the configured spacing.
    pre_assign_availability = availability.read(queue_index)
    space_left = pre_assign_availability - lengths - spacing
    availability = availability.write(queue_index, space_left)

    # ============================================================================
    # == Update contents =========================================================
    # ============================================================================
    # Consider the following case for a seq-to-seq packing:
    #   (padding is represented as underscores)
    #
    #   Queue starting state:
    #     [1, 3, 2, 4, 6, 1, _, _, _, _, _, ...]
    #     [5, 9, _, _, _, _, _, _, _, _, _, ...]
    #
    #   Examples:
    #     [4, 2, 4], [3]
    #
    #   Desired new queue state:
    #     [1, 3, 2, 4, 6, 1, _, _, 4, 2, 4, _, _, ...]
    #     [5, 9, _, _, 3, _, _, _, _, _, _, _, _, ...]
    #
    # This could be accomplished by creating a TensorArray for each of the two
    # sequences, and scattering into the respective arrays. However TensorArray
    # writes are extremely expensive relative to other operations. So instead we
    # store the contents in a single TensorArray of shape (packed_length, 2), and
    # we pad and concatenate the examples such that they can be added in a single
    # assign:
    #
    #              [_, _, _, _, 4, 2, 4]
    #              [3, _, _, _, _, _, _]
    #                        +
    #  [1, 3, 2, 4, 6, 1, _, _, _, _, _, ...]
    #  [5, 9, _, _, _, _, _, _, _, _, _, ...]
    #
    # And in practice, the extra work of padding is negligible compared to
    # the gain from vectorizing the TensorArray assign. We also store a bit mask
    # denoting where sequences start which is used to compute segment and
    # position metadata:
    #
    #              [_, _, _, _, 1, _, _]
    #              [1, _, _, _, _, _, _]
    #                        +
    #  [1, _, _, _, _, _, _, _, _, _, _, ...]
    #  [1, _, _, _, _, _, _, _, _, _, _, ...]
    #
    # Both the contents and the mask are concatenated in the same TensorArray
    # for performance.

    # Per-sequence write window inside the slot, and the smallest window
    # [leftmost, rightmost) that covers all of them.
    start_index = packed_length - pre_assign_availability
    end_index = start_index + lengths
    leftmost = tf.reduce_min(start_index, axis=0)
    rightmost = tf.reduce_max(end_index, axis=0)
    delta = rightmost - leftmost
    # (pad_before, pad_after) for each sequence so that all have length delta.
    pad_indices = [
        tf.stack((start_index[i] - leftmost, rightmost - end_index[i]))
        for i in range(num_sequences)
    ]

    padded_examples = [
        tf.pad(ex, padding[tf.newaxis, :])
        for ex, padding in zip(example, pad_indices)
    ]
    # One column per sequence after the transpose.
    padded_examples = tf.transpose(tf.stack(padded_examples))
    # One-hot marker at the row where each sequence starts.
    mask_update = tf.one_hot(start_index - leftmost,
                             delta,
                             dtype=contents.dtype,
                             axis=0)

    # Token columns followed by start-mask columns.
    content_update = tf.concat([padded_examples, mask_update], axis=1)

    # Add the update onto the rows it covers (read, add, write back).
    index_range = (
        queue_index * packed_length +  # Offset into the right section.
        tf.range(delta, dtype=INDEX_DTYPE) + leftmost)
    contents = contents.scatter(index_range,
                                contents.gather(index_range) + content_update)

    state = (availability, contents, top_index)
    return state, (tf.logical_not(any_can_fit), output_contents)
Пример #18
0
    def __init__(self, item_num, args, reuse=None):
        """Builds the placeholders, the self-attention encoder, loss and scoring ops.

        Args:
            item_num: number of items; the item embedding table is created
                with `item_num + 1` rows.
            args: settings object; reads `maxlen`, `hidden_units`, `l2_emb`,
                `random_seed`, `num_blocks` and `num_heads`.
            reuse: forwarded to the "SASRec" variable scope and to the
                embedding helpers.
        """
        self.args = args

        # Feeds.
        self.is_training = tf.placeholder(tf.bool, shape=())
        self.input_seq = tf.placeholder(tf.int32, shape=(None, args.maxlen))
        self.pos = tf.placeholder(tf.int32, shape=None)
        self.exemplar_logits = tf.placeholder(tf.float32, shape=(None, None))
        self.exemplar_pos = tf.placeholder(tf.int32, shape=None)
        self.max_item = tf.placeholder(tf.int32, shape=())
        self.lr = tf.placeholder(tf.float32, shape=())
        self.dropout_rate = tf.placeholder(tf.float32, shape=())

        # 1.0 where the input id is non-zero, 0.0 where it is zero, with a
        # trailing axis for broadcasting over the hidden units.
        pad_mask = tf.to_float(tf.not_equal(self.input_seq, 0))
        pad_mask = tf.expand_dims(pad_mask, -1)

        with tf.variable_scope("SASRec", reuse=reuse):
            # Sequence embedding and the item embedding table.
            self.seq, item_emb_table = embedding(
                self.input_seq,
                vocab_size=item_num + 1,
                num_units=args.hidden_units,
                zero_pad=True,
                scale=True,
                l2_reg=args.l2_emb,
                scope="input_embeddings",
                with_t=True,
                reuse=reuse)

            # Positional encoding: position ids 0..len-1 tiled over the batch.
            position_ids = tf.tile(
                tf.expand_dims(tf.range(tf.shape(self.input_seq)[1]), 0),
                [tf.shape(self.input_seq)[0], 1])
            pos_emb, _ = embedding(
                position_ids,
                vocab_size=args.maxlen,
                num_units=args.hidden_units,
                zero_pad=False,
                scale=False,
                l2_reg=args.l2_emb,
                scope="dec_pos",
                reuse=reuse,
                with_t=True)
            self.seq = self.seq + pos_emb

            # Dropout, then zero out the masked positions again.
            self.seq = tf.layers.dropout(
                self.seq,
                rate=self.dropout_rate,
                training=tf.convert_to_tensor(self.is_training),
                seed=args.random_seed)
            self.seq = self.seq * pad_mask

            # Stack of self-attention + feed-forward blocks.
            for block_idx in range(args.num_blocks):
                with tf.variable_scope("num_blocks_%d" % block_idx):
                    # Self-attention
                    attn_queries = normalize(self.seq)
                    self.seq = multihead_attention(
                        queries=attn_queries,
                        keys=self.seq,
                        num_units=args.hidden_units,
                        num_heads=args.num_heads,
                        dropout_rate=self.dropout_rate,
                        seed=args.random_seed,
                        is_training=self.is_training,
                        causality=True,
                        scope="self_attention")

                    # Feed forward
                    ff_input = normalize(self.seq)
                    self.seq = feedforward(
                        ff_input,
                        num_units=[args.hidden_units, args.hidden_units],
                        dropout_rate=self.dropout_rate,
                        is_training=self.is_training,
                        seed=args.random_seed)
                    self.seq = self.seq * pad_mask

            self.seq = normalize(self.seq)

        # The representation is the encoder output at the last position.
        self.rep = self.seq[:, -1, :]

        # Loss: softmax cross-entropy over items 1..max_item, labels taken
        # from `pos - 1`.
        seq_emb = tf.reshape(
            self.rep, [tf.shape(self.input_seq)[0], args.hidden_units])
        label_ids = self.pos - 1
        self.labels = tf.one_hot(label_ids, self.max_item)
        candidate_emb = tf.nn.embedding_lookup(
            item_emb_table, tf.range(1, self.max_item + 1))
        self.logits = tf.matmul(seq_emb, tf.transpose(candidate_emb))
        self.loss = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(labels=self.labels,
                                                    logits=self.logits))

        self.global_step = tf.Variable(0, name='global_step', trainable=False)
        self.optimizer = tf.train.AdamOptimizer(learning_rate=self.lr)

        # Prediction: score the fed test items and rank them.
        self.test_item = tf.placeholder(tf.int32, shape=None)
        self.test_item_emb = tf.nn.embedding_lookup(item_emb_table,
                                                    self.test_item)
        self.test_logits = tf.matmul(seq_emb,
                                     tf.transpose(self.test_item_emb))
        self.test_logits = tf.reshape(
            self.test_logits,
            [tf.shape(self.input_seq)[0], tf.shape(self.test_item)[0]])
        # Double argsort of the negated logits gives each item's rank.
        self.pred_last = tf.argsort(tf.argsort(-self.test_logits))
Пример #19
0
def dropblock(net,
              is_training,
              keep_prob,
              dropblock_size,
              data_format='channels_first'):
    """Applies DropBlock regularization to a 4-D feature map.

    DropBlock is a form of structured dropout in which contiguous square
    regions of a feature map are zeroed together. See
    https://arxiv.org/pdf/1810.12890.pdf for details.

    Args:
      net: `Tensor` input tensor with a static rank-4 shape.
      is_training: `bool` for whether the model is training.
      keep_prob: `float` or `Tensor` keep_prob parameter of DropBlock. `None`
          means no DropBlock.
      dropblock_size: `int` size of blocks to be dropped by DropBlock. It is
          capped at the spatial size of `net`.
      data_format: `str` either "channels_first" for `[batch, channels, height,
          width]` or "channels_last" for `[batch, height, width, channels]`.

    Returns:
      `net` unchanged when not training or when `keep_prob` is None; otherwise
      `net` with blocks zeroed and the result rescaled by the fraction of
      kept units.

    Raises:
      ValueError: if the two spatial dimensions of `net` are not equal.
    """
    if not is_training or keep_prob is None:
        return net

    tf.logging.info(
        'Applying DropBlock: dropblock_size {}, net.shape {}'.format(
            dropblock_size, net.shape))

    channels_last = data_format == 'channels_last'
    static_shape = net.get_shape().as_list()
    # Only square maps are supported, so which spatial axis is called "width"
    # does not matter below.
    if channels_last:
        _, width, height, _ = static_shape
    else:
        _, _, width, height = static_shape
    if width != height:
        raise ValueError('Input tensor with width!=height is not supported.')

    dropblock_size = min(dropblock_size, width)
    # gamma parameter of DropBlock: rate at which block centers are seeded.
    gamma = (1.0 - keep_prob) * width**2 / dropblock_size**2 / (
        width - dropblock_size + 1)**2

    # Restrict block centers so that a full block fits inside the feature map.
    first_valid = int(dropblock_size // 2)
    end_valid = width - (dropblock_size - 1) // 2
    col_idx, row_idx = tf.meshgrid(tf.range(width), tf.range(width))
    cols_inside = tf.logical_and(col_idx >= first_valid, col_idx < end_valid)
    rows_inside = tf.logical_and(row_idx >= first_valid, row_idx < end_valid)
    center_ok = tf.logical_and(cols_inside, rows_inside)

    # Add the batch and channel axes so the mask broadcasts against `net`.
    center_ok = tf.expand_dims(center_ok, 0)
    center_ok = tf.expand_dims(center_ok, -1 if channels_last else 0)

    noise = tf.random_uniform(net.shape, dtype=tf.float32)
    # A unit stays 1 unless it is a valid center AND its noise falls below
    # gamma; invalid centers get +1 and therefore always pass the threshold.
    keep_mask = (
        1 - tf.cast(center_ok, dtype=tf.float32) + tf.cast(
            (1 - gamma), dtype=tf.float32) + noise) >= 1
    keep_mask = tf.cast(keep_mask, dtype=tf.float32)

    if dropblock_size == width:
        # The block covers the whole map: one seeded zero drops everything.
        spatial_axes = [1, 2] if channels_last else [2, 3]
        keep_mask = tf.reduce_min(keep_mask, axis=spatial_axes, keepdims=True)
    else:
        if channels_last:
            window = [1, dropblock_size, dropblock_size, 1]
        else:
            window = [1, 1, dropblock_size, dropblock_size]
        # Min-pooling (negated max-pool) grows each seeded zero into a block.
        keep_mask = -tf.nn.max_pool(
            -keep_mask,
            ksize=window,
            strides=[1, 1, 1, 1],
            padding='SAME',
            data_format='NHWC' if channels_last else 'NCHW')

    kept_fraction = tf.cast(tf.reduce_sum(keep_mask), tf.float32) / tf.cast(
        tf.size(keep_mask), tf.float32)

    # Rescale by the kept fraction, then zero the dropped blocks.
    net = net / tf.cast(kept_fraction, net.dtype) * tf.cast(
        keep_mask, net.dtype)
    return net
    def compute_knowledge_selection_and_loss(self, features, encoder_output,
                                             fact_embedding, fact_lengths,
                                             margin, num_negative_samples):
        """Compute knowledge selection logits and the associated losses.

        Args:
          features: dict of input features. Reads "inputs", "triple_labels",
            "subject_mask", "predicate_mask" and "object_mask".
          encoder_output: <tf.float32>[batch_size, input_length, hidden_dim]
          fact_embedding: <tf.float32>[batch_size*triple_num,
            max_triple_length, emb_dim]
          fact_lengths: <tf.int32>[batch_size*triple_num]
          margin: margin value used in the TransE loss.
          num_negative_samples: number of shuffled negative examples drawn for
            the TransE loss. Must be >= 1 because the summed negative loss is
            divided by `2 * num_negative_samples`.

        Returns:
          A 4-tuple `(triple_logits, avg_triple_loss,
          original_knowledge_encoder_output, transe_loss)`:
            triple_logits: similarity logits between the context summary and
              each triple.
            avg_triple_loss: mean weighted cross-entropy of `triple_logits`
              against `features["triple_labels"]`, or 0.0 in PREDICT mode.
            original_knowledge_encoder_output: averaged fact embeddings
              reshaped to [-1, triple_num, hidden_dim].
            transe_loss: `margin + positive_loss - negative_loss`, clipped to
              [0, 100].

        Raises:
          ValueError: if `hparams.similarity_fuction` is neither "dot_product"
            nor "bilinear".
        """
        hparams = self._hparams
        encoder_output_shape = common_layers.shape_list(encoder_output)
        encoder_hidden_dim = encoder_output_shape[-1]
        inputs = features["inputs"]
        # <tf.float32>[batch_size, input_length, emb_dim]
        inputs = tf.squeeze(inputs, 2)
        # <tf.float32>[batch_size, input_length]
        context_padding = common_attention.embedding_to_padding(inputs)
        # <tf.float32>[batch_size]
        context_lens = tf.to_float(
            common_attention.padding_to_length(context_padding))
        # <tf.float32>[batch_size, 1]
        context_lens = tf.expand_dims(context_lens, -1)
        # Compute context vector summary.
        # <tf.float32>[batch_size, hidden_dim]
        context_vector_summary = compute_summary_embedding(
            encoder_output, context_lens, hparams)
        knowledge_encoder_output = compute_average_embedding(
            fact_embedding, fact_lengths)
        # <tf.float32>[batch_size, triple_num, emb_dim]
        knowledge_encoder_output = tf.reshape(
            knowledge_encoder_output,
            [-1, self.triple_num, encoder_hidden_dim])
        original_knowledge_encoder_output = knowledge_encoder_output
        if hparams.similarity_fuction == "dot_product":
            triple_logits = tf.squeeze(
                tf.matmul(knowledge_encoder_output,
                          tf.expand_dims(context_vector_summary, 2)), -1)
        elif hparams.similarity_fuction == "bilinear":
            # Tile the context vector summary.
            # <tf.float32>[batch_size, triple_num*hidden_dim]
            tiled_context_vector = tf.tile(context_vector_summary,
                                           [1, self.triple_num])
            # <tf.float32>[batch_size, triple_num, hidden_dim]
            context_vector = tf.reshape(
                tiled_context_vector,
                [-1, self.triple_num, encoder_hidden_dim])
            # compute outer product
            context_vector = tf.expand_dims(context_vector, -1)
            knowledge_encoder_output = tf.expand_dims(knowledge_encoder_output,
                                                      2)
            # <tf.float32>[batch_size, triple_num, hidden_dim, hidden_dim]
            outer_product = tf.matmul(context_vector, knowledge_encoder_output)
            outer_product = tf.reshape(
                outer_product,
                [-1, self.triple_num, encoder_hidden_dim * encoder_hidden_dim])
            # NOTE: the layer name (including its spelling) is part of the
            # variable names, so it must not be changed.
            triple_logits = tf.squeeze(
                tf.layers.dense(outer_product, 1, name="knolwedge_final_mlp"),
                -1)
        else:
            # Without this branch `triple_logits` would be unbound and the
            # method would fail later with an opaque error.
            raise ValueError(
                "Unsupported similarity_fuction: {!r}; expected 'dot_product' "
                "or 'bilinear'.".format(hparams.similarity_fuction))

        avg_triple_loss = 0.0
        triple_labels = features["triple_labels"]

        # Guards the division for triples whose mask is entirely zero.
        epsilon = 1e-5

        def _masked_mean_embedding(mask_key):
            """Averages `fact_embedding` over the positions set in a mask."""
            mask = tf.reshape(
                features[mask_key],
                [-1, self.triple_num, hparams.max_triple_length])
            # One row per triple: [bs*tn, max_triple_length]
            mask = tf.reshape(mask, [-1, hparams.max_triple_length])
            # Number of selected positions per triple: [bs*tn, 1]
            length = tf.cast(
                tf.expand_dims(tf.reduce_sum(mask, -1), 1), tf.float32)
            # Expand dimension 2 to be able to broadcast over emb_dim.
            mask = tf.cast(tf.expand_dims(mask, 2), tf.float32)
            # [bs*tn, emb_dim]
            return tf.reduce_sum(tf.multiply(fact_embedding, mask),
                                 1) / (length + epsilon)

        subject_vect = _masked_mean_embedding("subject_mask")
        object_vect = _masked_mean_embedding("object_mask")
        predicate_vect = _masked_mean_embedding("predicate_mask")

        # Shuffled rows to generate adversarial samples
        shuffled_subject_vect = []
        shuffled_object_vect = []
        for _ in range(num_negative_samples):
            shuffled_subject_vect.append(
                tf.gather(
                    subject_vect,
                    tf.random.shuffle(tf.range(
                        tf.shape(subject_vect)[0]))))  # [bs*tn,d]
            shuffled_object_vect.append(
                tf.gather(
                    object_vect,
                    tf.random.shuffle(tf.range(
                        tf.shape(object_vect)[0]))))  # [bs*tn,d]

        # KB pretraining loss
        positive_loss = tf.reduce_mean(
            tf.squared_difference(subject_vect + predicate_vect, object_vect))
        negative_loss = 0
        for neg_subject, neg_object in zip(shuffled_subject_vect,
                                           shuffled_object_vect):
            # Corrupt the subject, then the object, of every triple.
            negative_loss += tf.reduce_mean(
                tf.squared_difference(neg_subject + predicate_vect,
                                      object_vect))
            negative_loss += tf.reduce_mean(
                tf.squared_difference(subject_vect + predicate_vect,
                                      neg_object))

        # TransE Loss: average over the 2 * num_negative_samples terms above.
        negative_loss = negative_loss / (2 * num_negative_samples)

        transe_loss = tf.clip_by_value(margin + positive_loss - negative_loss,
                                       clip_value_min=0,
                                       clip_value_max=100)
        if hparams.mode != tf.estimator.ModeKeys.PREDICT:
            triple_losses = tf.nn.weighted_cross_entropy_with_logits(
                labels=triple_labels,
                logits=triple_logits,
                pos_weight=hparams.pos_weight)
            avg_triple_loss = tf.reduce_mean(triple_losses)
            tf.summary.scalar("triple_loss", avg_triple_loss)

        return triple_logits, avg_triple_loss, original_knowledge_encoder_output, transe_loss
Пример #21
0
def make_ordered_one_hot_vectors(num, num_tokens):
    """Builds one hot vectors of size [num, num_tokens], ordered by token id.

    Every token id is repeated ceil(num / num_tokens) times in ascending
    order, and the resulting id sequence is truncated to its first `num`
    entries before being one-hot encoded.
    """
    repeats_per_token = int(np.ceil(num / float(num_tokens)))
    token_ids = tf.range(num_tokens)
    # Stacking along axis 1 gives [num_tokens, repeats]; flattening it
    # row-major places all copies of an id next to each other.
    repeated_ids = tf.stack([token_ids] * repeats_per_token, axis=1)
    ordered_ids = tf.reshape(repeated_ids, [-1])[:num]
    return tf.one_hot(ordered_ids, depth=num_tokens)
Пример #22
0
def positions_for(tokens, past_length):
    """Returns position indices for `tokens`, offset by `past_length`.

    The positions `past_length + [0, ..., nsteps - 1]` are built from the
    second dimension of `tokens` and handed to `expand_tile` together with
    the batch size (first dimension of `tokens`).
    """
    dynamic_shape = tf.shape(tokens)
    batch_size, nsteps = dynamic_shape[0], dynamic_shape[1]
    step_positions = past_length + tf.range(nsteps)
    # presumably expand_tile replicates the positions once per batch row —
    # TODO confirm against its definition.
    return expand_tile(step_positions, batch_size)
Пример #23
0
  def _build_sampler(self):
    """Build the sampler ops and the log_prob ops.

    For each of `controller_num_layers` steps the controller LSTM is run
    twice: once to sample an index over the previously stored hidden states
    (`skip_index`) and once to sample an id scored against `self.w_emb`
    (`func`). Both samples are appended, in that order, to the architecture
    sequence.

    Sets `self.sample_arc`, `self.sample_log_probs`, `self.ppl`,
    `self.sample_entropy` and `self.all_h`.
    """
    params = self.params
    hidden_size = params.controller_hidden_size
    num_layers = params.controller_num_layers

    arc_tokens = []
    log_probs = []
    entropies = []

    def _rescale(raw_logits):
      """Applies the optional temperature and tanh squashing to logits."""
      if params.controller_temperature:
        raw_logits = raw_logits / params.controller_temperature
      if params.controller_tanh_constant:
        raw_logits = params.controller_tanh_constant * tf.tanh(raw_logits)
      return raw_logits

    def _sample_and_record(step_logits):
      """Samples one id from logits and records it with its statistics."""
      choice = tf.multinomial(step_logits, 1)
      choice = tf.to_int32(choice)
      choice = tf.reshape(choice, [1])
      arc_tokens.append(choice)

      # Cross-entropy of the sampled label, i.e. its negative log-probability.
      nll = tf.nn.sparse_softmax_cross_entropy_with_logits(
          logits=step_logits, labels=choice)
      log_probs.append(nll)
      entropies.append(tf.stop_gradient(nll * tf.exp(-nll)))
      return choice

    # Hidden states seen so far and their projections through attn_w_1; both
    # lists start with a zero entry.
    hidden_states = [tf.zeros([1, hidden_size], dtype=tf.float32)]
    projected_states = [tf.zeros([1, hidden_size], dtype=tf.float32)]

    # sampler ops
    cell = tf.zeros([1, hidden_size], dtype=tf.float32)
    hidden = tf.zeros([1, hidden_size], dtype=tf.float32)
    inputs = self.g_emb

    for layer_id in range(1, num_layers + 1):
      # First LSTM step: choose which earlier state to connect to.
      cell, hidden = _lstm(inputs, cell, hidden, self.w_lstm)
      hidden_states.append(hidden)
      projected_states.append(tf.matmul(hidden, self.attn_w_1))

      # Score every earlier state (the entry just appended is excluded).
      query = tf.matmul(hidden, self.attn_w_2)
      query = tf.tanh(query + tf.concat(projected_states[:-1], axis=0))
      logits = tf.reshape(tf.matmul(query, self.attn_v), [1, layer_id])
      logits = _rescale(logits)

      # Penalize candidates by their squared distance from this layer.
      distance_sq = tf.to_float(layer_id - tf.range(0, layer_id)) ** 2
      logits -= tf.reshape(distance_sq, [1, layer_id]) / 6.0

      skip_index = _sample_and_record(logits)

      # Feed the chosen earlier state back in, scaled down by its distance.
      inputs = tf.nn.embedding_lookup(
          tf.concat(hidden_states[:-1], axis=0), skip_index)
      inputs /= (0.1 + tf.to_float(layer_id - skip_index))

      # Second LSTM step: choose the function id for this layer.
      cell, hidden = _lstm(inputs, cell, hidden, self.w_lstm)
      logits = _rescale(tf.matmul(hidden, self.w_emb, transpose_b=True))
      func = _sample_and_record(logits)
      inputs = tf.nn.embedding_lookup(self.w_emb, func)

    self.sample_arc = tf.concat(arc_tokens, axis=0)

    self.sample_log_probs = tf.concat(log_probs, axis=0)
    self.ppl = tf.exp(tf.reduce_mean(self.sample_log_probs))

    self.sample_entropy = tf.reduce_sum(tf.concat(entropies, axis=0))

    self.all_h = hidden_states
Пример #24
0
 def initialize(shape, dtype):
     """Returns a tensor of `shape` holding 1, 2, ..., prod(shape) in order."""
     num_elements = np.prod(shape)
     counting_values = 1 + tf.range(num_elements, dtype=dtype)
     return tf.reshape(counting_values, shape)
Пример #25
0
def create_cm_sketch(topk_obj_ids, topk_obj_weights, all_entity_sketches,
                     cm_width):
    """Create count-min sketches for a set of weighted entities.

    The sketch is assembled as a sparse tensor indexed by
    (batch, depth, hash id) and then densified, which avoids materializing
    one one-hot vector per entity.

    Args:
      topk_obj_ids: batch_size, topk
      topk_obj_weights: batch_size, topk
      all_entity_sketches: num_entities, depth
      cm_width: width of count-min sketch

    Returns:
      k hot dense vectors: batch_size, depth, width
    """
    # batch_size, topk, depth
    obj_sketches = tf.gather(all_entity_sketches, topk_obj_ids, axis=0)
    batch_size = tf.shape(obj_sketches)[0]
    topk = tf.shape(obj_sketches)[1]
    cm_depth = tf.shape(obj_sketches)[2]

    # Every index/value vector below is flattened to
    # batch_size * topk * depth so that the entries line up one-to-one.

    # Hash bucket of each (entity, depth) pair.
    flat_hash_ids = tf.reshape(obj_sketches, shape=[-1])

    # Entity weight, repeated once per depth row.
    tiled_weights = tf.tile(tf.expand_dims(topk_obj_weights, axis=2),
                            multiples=[1, 1, cm_depth])
    flat_weights = tf.reshape(tiled_weights, shape=[-1])

    # Batch index of each entry: (batch_size, 1, 1) tiled to full shape.
    batch_ids = tf.reshape(tf.range(batch_size), [-1, 1, 1])
    batch_ids = tf.tile(batch_ids, multiples=[1, topk, cm_depth])
    flat_batch_ids = tf.reshape(batch_ids, shape=[-1])

    # Depth index of each entry: (1, 1, depth) tiled to full shape.
    depth_ids = tf.reshape(tf.range(cm_depth), [1, 1, -1])
    depth_ids = tf.tile(depth_ids, multiples=[batch_size, topk, 1])
    flat_depth_ids = tf.reshape(depth_ids, shape=[-1])

    sparse_indices = tf.stack(
        [flat_batch_ids, flat_depth_ids, flat_hash_ids], axis=1)
    sparse_indices = tf.cast(sparse_indices, dtype=tf.int64)

    # Order the entries by ascending weight so that, when several entries
    # target the same cell, the later (larger) value is the one meant to be
    # kept.
    ascending = tf.argsort(flat_weights, direction='ASCENDING', stable=True)
    ordered_weights = tf.gather(flat_weights, ascending)
    ordered_indices = tf.gather(sparse_indices, ascending)

    # Indices are deliberately not validated: duplicates are expected and
    # the last (largest) value per cell is intended to win. This behaviour
    # is by design.
    sparse_sketch = tf.SparseTensor(
        indices=ordered_indices,
        values=ordered_weights,
        dense_shape=[batch_size, cm_depth, cm_width])
    # batch_size, cm_depth, cm_width
    return tf.sparse.to_dense(sparse_sketch, validate_indices=False)
Пример #26
0
    def __init__(self,
                 session,
                 player_id,
                 info_state_size,
                 num_actions,
                 loss_str="a2c",
                 loss_class=None,
                 hidden_layers_sizes=(128, ),
                 batch_size=16,
                 critic_learning_rate=0.01,
                 pi_learning_rate=0.001,
                 entropy_cost=0.01,
                 num_critic_before_pi=8,
                 additional_discount_factor=1.0,
                 max_global_gradient_norm=None,
                 optimizer_str="sgd"):
        """Initialize the PolicyGradient agent.

    Builds the placeholders, the shared torso network with its policy and
    critic heads, the critic and policy losses, and their training ops, then
    calls `self._initialize()`.

    Note that users must provide *only one of* `loss_str` or `loss_class`;
    because `loss_str` defaults to "a2c", it has to be passed as None when a
    `loss_class` is given.

    Args:
      session: Tensorflow session.
      player_id: int, player identifier. Usually its position in the game.
      info_state_size: int, info_state vector size.
      num_actions: int, number of actions per info state.
      loss_str: string or None. If string, must be one of ["rpg", "qpg", "rm",
        "a2c"] and defined in `_get_loss_class`. If None, a loss class must be
        passed through `loss_class`. Defaults to "a2c".
      loss_class: Class or None. If Class, it must define the policy gradient
        loss. If None a loss class in a string format must be passed through
        `loss_str`. Defaults to None.
      hidden_layers_sizes: iterable, defines the neural network layers. Defaults
          to (128,), which produces a NN: [INPUT] -> [128] -> ReLU -> [OUTPUT].
      batch_size: int, batch size to use for Q and Pi learning. Defaults to 16.
      critic_learning_rate: float, learning rate used for Critic (Q or V).
        Defaults to 0.01.
      pi_learning_rate: float, learning rate used for Pi. Defaults to 0.001.
      entropy_cost: float, entropy cost used to multiply the entropy loss. Can
        be set to None to skip entropy computation. Defaults to 0.01.
      num_critic_before_pi: int, number of Critic (Q or V) updates before each
        Pi update. Defaults to 8 (every 8th critic learning step, Pi also
        learns).
      additional_discount_factor: float, additional discount to compute returns.
        Defaults to 1.0, in which case, no extra discount is applied.
      max_global_gradient_norm: float or None, maximum global norm of a gradient
        to which the gradient is shrunk if its value is larger.
      optimizer_str: String defining which optimizer to use. Supported values
        are {sgd, adam}

    Raises:
      ValueError: if `optimizer_str` is neither "adam" nor "sgd".
    """
        # Exactly one of loss_str / loss_class must be truthy.
        assert bool(loss_str) ^ bool(
            loss_class), "Please provide only one option."
        # Snapshot of the constructor arguments (includes `self`). It must be
        # taken here, before any other local name is bound in this function.
        self._kwargs = locals()
        loss_class = loss_class if loss_class else self._get_loss_class(
            loss_str)
        self._loss_class = loss_class

        self.player_id = player_id
        self._session = session
        self._num_actions = num_actions
        self._layer_sizes = hidden_layers_sizes
        self._batch_size = batch_size
        self._extra_discount = additional_discount_factor
        self._num_critic_before_pi = num_critic_before_pi

        self._episode_data = []
        self._dataset = collections.defaultdict(list)
        self._prev_time_step = None
        self._prev_action = None

        # Step counters
        self._step_counter = 0
        self._episode_counter = 0
        self._num_learn_steps = 0

        # Keep track of the last training loss achieved in an update step.
        self._last_loss_value = None

        # Placeholders
        self._info_state_ph = tf.placeholder(shape=[None, info_state_size],
                                             dtype=tf.float32,
                                             name="info_state_ph")
        self._action_ph = tf.placeholder(shape=[None],
                                         dtype=tf.int32,
                                         name="action_ph")
        self._return_ph = tf.placeholder(shape=[None],
                                         dtype=tf.float32,
                                         name="return_ph")

        # Network
        # activate final as we plug logit and qvalue heads afterwards.
        self._net_torso = snt.nets.MLP(output_sizes=self._layer_sizes,
                                       activate_final=True)
        torso_out = self._net_torso(self._info_state_ph)
        self._policy_logits_layer = snt.Linear(output_size=self._num_actions,
                                               name="policy_head")

        # Torso followed by the policy head, exposed as a single module.
        self.policy_logits_network = snt.Sequential(
            [self._net_torso, self._policy_logits_layer])

        self._policy_logits = self._policy_logits_layer(torso_out)
        self._policy_probs = tf.nn.softmax(self._policy_logits)

        self._savers = []

        # Add baseline (V) head for A2C.
        if loss_class.__name__ == "BatchA2CLoss":
            self._baseline_layer = snt.Linear(output_size=1, name="baseline")
            # Drop the trailing unit axis so the baseline is one value per row.
            self._baseline = tf.squeeze(self._baseline_layer(torso_out),
                                        axis=1)
        else:
            # Every other loss class gets a Q-value head (one output per action).
            self._q_values_layer = snt.Linear(output_size=self._num_actions,
                                              name="q_values_head")
            self._q_values = self._q_values_layer(torso_out)

        # Critic loss
        # Baseline loss in case of A2C
        if loss_class.__name__ == "BatchA2CLoss":
            self._critic_loss = tf.reduce_mean(
                tf.losses.mean_squared_error(labels=self._return_ph,
                                             predictions=self._baseline))
        else:
            # Q-loss otherwise.
            # Pair each row index with its taken action to pick Q(s, a).
            action_indices = tf.stack(
                [tf.range(tf.shape(self._q_values)[0]), self._action_ph],
                axis=-1)
            value_predictions = tf.gather_nd(self._q_values, action_indices)
            self._critic_loss = tf.reduce_mean(
                tf.losses.mean_squared_error(labels=self._return_ph,
                                             predictions=value_predictions))
        if optimizer_str == "adam":
            self._critic_optimizer = tf.train.AdamOptimizer(
                learning_rate=critic_learning_rate)
        elif optimizer_str == "sgd":
            self._critic_optimizer = tf.train.GradientDescentOptimizer(
                learning_rate=critic_learning_rate)
        else:
            raise ValueError("Not implemented, choose from 'adam' and 'sgd'.")

        def minimize_with_clipping(optimizer, loss):
            # Builds the training op for `loss`; gradients are clipped by
            # global norm only when max_global_gradient_norm is set.
            grads_and_vars = optimizer.compute_gradients(loss)
            if max_global_gradient_norm is not None:
                grads, variables = zip(*grads_and_vars)
                grads, _ = tf.clip_by_global_norm(grads,
                                                  max_global_gradient_norm)
                grads_and_vars = list(zip(grads, variables))

            return optimizer.apply_gradients(grads_and_vars)

        self._critic_learn_step = minimize_with_clipping(
            self._critic_optimizer, self._critic_loss)

        # Pi loss
        pg_class = loss_class(entropy_cost=entropy_cost)
        if loss_class.__name__ == "BatchA2CLoss":
            self._pi_loss = pg_class.loss(policy_logits=self._policy_logits,
                                          baseline=self._baseline,
                                          actions=self._action_ph,
                                          returns=self._return_ph)
        else:
            self._pi_loss = pg_class.loss(policy_logits=self._policy_logits,
                                          action_values=self._q_values)
        # No else branch is needed here: an unsupported optimizer_str has
        # already raised ValueError when the critic optimizer was created.
        if optimizer_str == "adam":
            self._pi_optimizer = tf.train.AdamOptimizer(
                learning_rate=pi_learning_rate)
        elif optimizer_str == "sgd":
            self._pi_optimizer = tf.train.GradientDescentOptimizer(
                learning_rate=pi_learning_rate)

        self._pi_learn_step = minimize_with_clipping(self._pi_optimizer,
                                                     self._pi_loss)
        self._loss_str = loss_str
        self._initialize()
  def _build_single_q_network(self, observations, head, state_t, state_tp1,
                              done_mask, reward_t, error_weight):
    """Builds the computational graph for a single Q network.

    Briefly, this part is calculating the following two quantities:
    1. q_value = q_fn(observations)
    2. td_error = q_fn(state_t) - reward_t - gamma * q_fn(state_tp1)
    The optimization target is to minimize the td_error.

    Args:
      observations: shape = [batch_size, hparams.fingerprint_length]. The input
        of the Q function.
      head: shape = [1]. The index of the head chosen for decision in bootstrap
        DQN.
      state_t: shape = [batch_size, hparams.fingerprint_length]. The state at
        time step t.
      state_tp1: a list of tensors, with total number of batch_size, each has
        shape = [num_actions, hparams.fingerprint_length]. Note that the
        num_actions can be different for each tensor. The state at time step
        t+1, tp1 is short for t plus 1.
      done_mask: shape = [batch_size, 1] Whether state_tp1 is the terminal
        state.
      reward_t: shape = [batch_size, 1] the reward at time step t.
      error_weight: shape = [batch_size, 1] weight for the loss.

    Returns:
      q_values: Tensor of [batch_size, 1]. The q values for the observations.
      td_error: Tensor of [batch_size, 1]. The TD error.
      weighted_error: scalar Tensor. The mean of the Huber loss of the TD
        error weighted by error_weight.
      q_fn_vars: List of tf.Variables. The variables of q_fn when computing
        the q_values of state_t
      q_tp1_vars: List of tf.Variables. The variables of q_fn when computing
        the q_values of state_tp1

    """
    with tf.variable_scope('q_fn'):
      # q_value have shape [batch_size, 1].
      q_values = tf.gather(self.q_fn(observations), head, axis=-1)

    # calculating q_fn(state_t)
    # The Q network shares parameters with the action graph.
    with tf.variable_scope('q_fn', reuse=True):
      q_t = self.q_fn(state_t, reuse=True)
    q_fn_vars = tf.trainable_variables(
        scope=tf.get_variable_scope().name + '/q_fn')

    # calculating q_fn(state_tp1) under a separate 'q_tp1' variable scope.
    with tf.variable_scope('q_tp1', reuse=tf.AUTO_REUSE):
      q_tp1 = [self.q_fn(s_tp1, reuse=tf.AUTO_REUSE) for s_tp1 in state_tp1]
    q_tp1_vars = tf.trainable_variables(
        scope=tf.get_variable_scope().name + '/q_tp1')

    if self.double_q:
      with tf.variable_scope('q_fn', reuse=True):
        q_tp1_online = [self.q_fn(s_tp1, reuse=True) for s_tp1 in state_tp1]
      num_heads = self.num_bootstrap_heads or 1
      # determine the action to choose based on online Q estimator.
      q_tp1_online_idx = [
          tf.stack([tf.argmax(q, axis=0),
                    tf.range(num_heads, dtype=tf.int64)],
                   axis=1) for q in q_tp1_online
      ]
      # use the index from max online q_values to compute the value
      # function
      v_tp1 = tf.stack(
          [tf.gather_nd(q, idx) for q, idx in zip(q_tp1, q_tp1_online_idx)],
          axis=0)
    else:
      v_tp1 = tf.stack([tf.reduce_max(q) for q in q_tp1], axis=0)

    # if s_{t+1} is the terminal state, we do not evaluate the Q value of
    # the state.
    q_tp1_masked = (1.0 - done_mask) * v_tp1

    q_t_target = reward_t + self.gamma * q_tp1_masked

    # stop gradient from flowing to the computing graph which computes
    # the Q value of s_{t+1}.
    # td_error has shape [batch_size, 1]
    td_error = q_t - tf.stop_gradient(q_t_target)

    # If use bootstrap, each head is trained with a different subset of the
    # training sample. Like the idea of dropout.
    if self.num_bootstrap_heads:
      head_mask = tf.keras.backend.random_binomial(
          shape=(1, self.num_bootstrap_heads), p=0.6)
      td_error = tf.reduce_mean(td_error * head_mask, axis=1)

    # Huber loss with threshold 1.0 (see http://web.stanford.edu/~boyd/cvxbook/,
    # Chapter 6 pp. 298): quadratic for small errors, linear for large ones,
    # which keeps the optimization from being dominated by outliers.
    # The quadratic branch needs the 0.5 factor so that both branches meet at
    # |td_error| == 1 (0.5 * 1^2 == 1 - 0.5); without it the loss drops from
    # 1.0 to 0.5 when crossing the threshold.
    abs_td_error = tf.abs(td_error)
    errors = tf.where(
        abs_td_error < 1.0, 0.5 * tf.square(td_error),
        abs_td_error - 0.5)
    weighted_error = tf.reduce_mean(error_weight * errors)
    return q_values, td_error, weighted_error, q_fn_vars, q_tp1_vars
Пример #28
0
def naive_mcmc_ll(x, presence=None):
    """Score points against the vote mixture using only the arg-max vote.

    Instead of taking the product of closed-form part probabilities given
    _all_ of the data, the vote with the highest posterior logit is picked for
    every input point and used as a proxy for that part.

    `presence` is accepted but not used by this function. Intermediate shapes
    are printed to stdout as a debugging aid.
    """

    def _show_shape(label, tensor):
        # Debug trace: prints the label followed by the tensor's static shape.
        print(label, tensor.shape)

    batch_size, n_input_points = x.shape[:2].as_list()

    # Build the per-vote component distributions.
    # [B, 1, n_votes, n_input_dims]
    votes_bcast = tf.expand_dims(_votes, axis=1)
    scales_bcast = tf.expand_dims(_scales, axis=1)
    scales_bcast = tf.expand_dims(scales_bcast, axis=-1)
    component_pdf = _get_pdf(votes_bcast, scales_bcast)
    print("vote_component_pdf: ", component_pdf)

    # Evaluate every point under every vote component.
    # [B, n_points, n_caps x n_votes, n_input_dims]
    points_bcast = tf.expand_dims(x, axis=2)
    _show_shape("expanded_x: ", points_bcast)
    log_prob_per_dim = component_pdf.log_prob(points_bcast)
    _show_shape("vote_log_prob_dim: ", log_prob_per_dim)

    # Collapse the input dimensions (e.g. the coordinates of a 2d point).
    # [B, n_points, n_caps x n_votes]
    log_prob = tf.reduce_sum(log_prob_per_dim, axis=-1)
    _show_shape("vote_log_prob: ", log_prob)

    # One extra constant "dummy" column (-2 * log(10)) is appended after the
    # real votes along the last axis.
    dummy_log_prob = (
        tf.zeros([batch_size, n_input_points, 1]) - 2. * tf.log(10.))
    _show_shape("dummy_vote: ", dummy_log_prob)
    log_prob = tf.concat([log_prob, dummy_log_prob], axis=2)
    _show_shape("cat vote_log_prob: ", log_prob)

    # Conditional logits a_(k,n), again with a dummy entry appended.
    logits = math_ops.safe_log(_vote_presence_prob)
    dummy_logit = tf.zeros([batch_size, 1]) - 2. * tf.log(10.)
    logits = tf.concat([logits, dummy_logit], axis=1)
    _show_shape("mixing_logits : ", logits)

    # Normalise the logits into log-probabilities over votes (+ dummy).
    log_normaliser = tf.reduce_logsumexp(logits, axis=1, keepdims=True)
    normalised_logits = logits - log_normaliser
    _show_shape("mixing_log_prob : ", normalised_logits)

    # Posterior logits for each point; the mixing term is broadcast over
    # points.
    posterior_logits = tf.expand_dims(normalised_logits, axis=1) + log_prob
    _show_shape("posterior_mixing_per_point: ", posterior_logits)

    # Best real vote per point; the trailing dummy column is excluded.
    # [B, n_points]
    best_idx = tf.argmax(posterior_logits[:, :, :-1], axis=2)
    _show_shape("winning_vote_idx: ", best_idx)

    # Pair each winning index with its batch index for gather_nd.
    batch_ids = tf.expand_dims(tf.range(batch_size, dtype=tf.int64), axis=-1)
    batch_ids = snt.TileByDim([1], [best_idx.shape[-1]])(batch_ids)
    gather_idx = tf.stack([batch_ids, best_idx], axis=-1)

    best_vote = tf.gather_nd(_votes, gather_idx)
    _show_shape("winning_vote: ", best_vote)
    best_pres = tf.gather_nd(_vote_presence_prob, gather_idx)
    _show_shape("winning_pres: ", best_pres)

    # A vote counts as present when its logit exceeds the dummy logit.
    present = tf.greater(logits[:, :-1], logits[:, -1:])
    _show_shape("vote_presence: ", present)

    # Integer division maps a vote index to a capsule index; this relies on
    # votes being laid out capsule by capsule.
    capsule_id = best_idx // _n_votes
    _show_shape("is_from_capsule: ", capsule_id)

    posterior_probs = tf.nn.softmax(posterior_logits, -1)[..., :-1]

    assert best_vote.shape == x.shape

    # log_prob is intentionally left unset (None) in the result.
    return OutputTuple(
        log_prob=None,
        vote_presence=tf.to_float(present),
        winner=best_vote,
        winner_pres=best_pres,
        is_from_capsule=capsule_id,
        mixing_logits=logits,
        mixing_log_prob=normalised_logits,
        # TODO(adamrk): this is broken
        soft_winner=tf.zeros_like(best_vote),
        soft_winner_pres=tf.zeros_like(best_pres),
        posterior_mixing_probs=posterior_probs,
    )
def _compute_object_logits(hparams, object_hidden,
                           screen_encoding, screen_encoding_bias):
  """The output layer for a specific domain.

  Scores every candidate object against the decoder state with the alignment
  function selected by `hparams.alignment`, then derives a second, 2-unit
  output from the decoder state and the encoding of the top-scoring object.

  Args:
    hparams: hyperparameters. Reads `alignment` and `hidden_size`; the
      "mlp_attention" branch also reads `norm_type` and `norm_epsilon`.
    object_hidden: decoder-side tensor. It is expanded at axis 3 (matmul
      branches) or axis 2 (mlp branch), so presumably
      [batch_size, num_steps, hidden_size] -- TODO confirm with caller.
    screen_encoding: encoding of the candidate objects; the mlp branch reads
      its first three dims as batch, steps and objects, so presumably
      [batch_size, num_steps, num_objects, depth] -- TODO confirm.
    screen_encoding_bias: tensor added to the squeezed alignment logits
      (presumably a padding mask/bias -- verify against caller).

  Returns:
    A tuple `(obj_logits, consumed_logits)`: `obj_logits` is the alignment
    score with axis 3 squeezed plus `screen_encoding_bias`;
    `consumed_logits` is a dense layer with 2 units applied to
    `object_hidden` concatenated with the screen encoding gathered at the
    arg-max object.

  Raises:
    ValueError: if `hparams.alignment` is not one of the supported values.
  """
  with tf.variable_scope("compute_object_logits", reuse=tf.AUTO_REUSE):
    # NOTE: several branches rebind `object_hidden` / `screen_encoding` to
    # their dense projections; the code after the branches uses the rebound
    # values.
    if hparams.alignment == "cosine_similarity":
      object_hidden = tf.layers.dense(
          object_hidden, units=hparams.hidden_size)
      screen_encoding = tf.layers.dense(
          screen_encoding, units=hparams.hidden_size)
      # Dot product of L2-normalized vectors.
      norm_screen_encoding = tf.math.l2_normalize(screen_encoding, axis=-1)
      norm_obj_hidden = tf.math.l2_normalize(object_hidden, axis=-1)
      align_logits = tf.matmul(norm_screen_encoding,
                               tf.expand_dims(norm_obj_hidden, 3))
    elif hparams.alignment == "scaled_cosine_similarity":
      object_hidden = tf.layers.dense(
          object_hidden, units=hparams.hidden_size)
      # Reshape so the last dimension is statically hparams.hidden_size
      # before the dense projection.
      screen_encoding = tf.reshape(
          screen_encoding,
          common_layers.shape_list(
              screen_encoding)[:-1] + [hparams.hidden_size])
      screen_encoding = tf.layers.dense(
          screen_encoding, units=hparams.hidden_size)
      norm_screen_encoding = tf.math.l2_normalize(screen_encoding, axis=-1)
      norm_obj_hidden = tf.math.l2_normalize(object_hidden, axis=-1)
      dot_products = tf.matmul(norm_screen_encoding,
                               tf.expand_dims(norm_obj_hidden, 3))
      # Learned affine rescaling of the cosine similarity.
      align_logits = tf.layers.dense(dot_products, units=1)
    elif hparams.alignment == "dot_product_attention":
      # Only the decoder side is projected; screen_encoding is used as is.
      object_hidden = tf.layers.dense(
          object_hidden, units=hparams.hidden_size)
      align_logits = tf.matmul(screen_encoding,
                               tf.expand_dims(object_hidden, 3))
    elif hparams.alignment == "mlp_attention":
      batch_size = tf.shape(screen_encoding)[0]
      num_steps = tf.shape(screen_encoding)[1]
      num_objects = tf.shape(screen_encoding)[2]
      # Repeat the decoder state once per object so it can be concatenated
      # with each object's encoding.
      tiled_object_hidden = tf.tile(tf.expand_dims(object_hidden, 2),
                                    [1, 1, num_objects, 1])
      align_feature = tf.concat([tiled_object_hidden, screen_encoding], axis=-1)
      align_feature = tf.reshape(
          align_feature,
          [batch_size, num_steps, num_objects, hparams.hidden_size * 2])
      with tf.variable_scope("align", reuse=tf.AUTO_REUSE):
        align_hidden = tf.layers.dense(align_feature, units=hparams.hidden_size)
        align_hidden = common_layers.apply_norm(
            align_hidden, hparams.norm_type, hparams.hidden_size,
            epsilon=hparams.norm_epsilon)
        align_hidden = tf.nn.tanh(align_hidden)
        align_logits = tf.layers.dense(align_hidden, units=1)
    else:
      raise ValueError("Unsupported alignment: %s" % hparams.alignment)

    # Drop the trailing singleton axis and add the bias.
    obj_logits = tf.squeeze(align_logits, [3]) + screen_encoding_bias
    # [batch_size, num_steps]
    batch_size = common_layers.shape_list(obj_logits)[0]
    num_steps = common_layers.shape_list(obj_logits)[1]
    # Build (batch, step, object) index triples for gather_nd.
    # [batch_size * num_steps, 1]
    batch_indices = tf.to_int64(tf.reshape(
        tf.tile(tf.expand_dims(tf.range(batch_size), 1), [1, num_steps]),
        [-1, 1]))
    step_indices = tf.to_int64(tf.reshape(
        tf.tile(tf.expand_dims(tf.range(num_steps), 0), [batch_size, 1]),
        [-1, 1]))
    # Index of the highest-scoring object at every (batch, step).
    object_indices = tf.reshape(tf.argmax(obj_logits, -1), [-1, 1])
    indices = tf.concat([batch_indices, step_indices, object_indices], -1)
    # Encoding of the arg-max object for every (batch, step).
    # [batch_size, num_steps, depth]
    depth = tf.shape(screen_encoding)[-1]
    best_logits = tf.reshape(
        tf.gather_nd(screen_encoding, indices=indices),
        [batch_size, num_steps, depth])
    # The reshape below fixes the last dim at hidden_size * 2, i.e. it assumes
    # object_hidden and best_logits are each hidden_size wide -- TODO confirm
    # for the branches that do not project screen_encoding.
    consumed_logits = tf.layers.dense(
        tf.reshape(tf.concat([object_hidden, best_logits], -1),
                   [batch_size, num_steps, hparams.hidden_size * 2]),
        2)
    # NOTE(review): tf.reduce_all makes this assertion fail only when *every*
    # element of consumed_logits is NaN. If the intent is to catch any NaN,
    # tf.reduce_any would be required -- confirm with the author.
    with tf.control_dependencies([tf.assert_equal(
        tf.reduce_all(tf.math.is_nan(consumed_logits)), False,
        data=[tf.shape(best_logits), best_logits,
              tf.constant("screen_encoding"), screen_encoding,
              tf.constant("indices"), indices],
        summarize=10000, message="consumed_logits_nan")]):
      # tf.identity ties the returned tensor to the assertion above.
      consumed_logits = tf.identity(consumed_logits)
    return obj_logits, consumed_logits
Пример #30
0
def add_distance_loss_to_center(labels, logits, groundtruth_coords):
    """Add distance loss function for ClickRegression.

    Penalizes the distance between the predicted point (`logits`) and the
    center of the target, where the center is either the middle of the
    ground-truth box or the center of mass of the label mask. The loss is
    registered through `tf.losses` rather than returned in the code shown
    here.

    Args:
        labels: label mask. Entries equal to the dataset's `ignore_label` are
            zeroed before use. It is multiplied by an int32 weight tensor, so
            presumably an int32 tensor -- TODO confirm.
        logits: predicted coordinates; divided by `FLAGS.image_size` before
            comparison. Presumably in (y, x) order to match `center` --
            verify against caller.
        groundtruth_coords: mapping with 'xmin', 'xmax', 'ymin' and 'ymax'
            entries; read when `FLAGS.use_groundtruth_box` is set and passed
            to `calc_distance_to_edge` when `FLAGS.ratio_box_distance` is set.
    """
    # 1 where the label is valid, 0 where it equals the ignore label.
    weights = tf.to_int32(
        tf.not_equal(
            labels,
            model_input.dataset_descriptors[FLAGS.dataset].ignore_label))
    labels *= weights

    # Use GT box to get center if it exists. Less computation required.
    # Otherwise, calculate from label mask.
    if FLAGS.use_groundtruth_box:
        center_x = (groundtruth_coords['xmin'] +
                    groundtruth_coords['xmax']) / 2.0
        center_y = (groundtruth_coords['ymin'] +
                    groundtruth_coords['ymax']) / 2.0
        # NOTE(review): unlike the mask branch below, this center is not
        # divided by FLAGS.image_size; presumably the box coordinates are
        # already normalized -- confirm.
        center = tf.stack([center_y, center_x], axis=1)
    else:
        # Make array of coordinates (each row contains two coordinates:
        # row index, column index)
        ii, jj = tf.meshgrid(tf.range(FLAGS.image_size),
                             tf.range(FLAGS.image_size),
                             indexing='ij')
        coords = tf.stack([tf.reshape(ii, (-1, )),
                           tf.reshape(jj, (-1, ))],
                          axis=-1)
        coords = tf.cast(coords, tf.int32)

        # Rearrange input into one vector per volume
        volumes_flat = tf.reshape(
            labels, [-1, FLAGS.image_size * FLAGS.image_size * 1, 1])
        # Compute total mass for each volume. Add ZERO_DIV_OFFSET to prevent
        # division by 0
        total_mass = tf.cast(tf.reduce_sum(volumes_flat, axis=1),
                             tf.float32) + ZERO_DIV_OFFSET
        # Compute centre of mass: label-weighted sum of the pixel coordinates
        # divided by the total mass.
        center = tf.cast(tf.reduce_sum(volumes_flat * coords, axis=1),
                         tf.float32) / total_mass
        center = center / FLAGS.image_size

    # Normalize coordinates by size of image
    logits = logits / FLAGS.image_size

    # Calculate loss based on the distance metric specified
    # Loss added later in model_fn by tf.losses.get_total_loss()
    # NOTE: in the code shown here, any other FLAGS.distance_metric value
    # registers no loss.
    if FLAGS.distance_metric == 'mse':
        tf.losses.mean_squared_error(center, logits)
    elif FLAGS.distance_metric in [
            'euclidean', 'euclidean_sqrt', 'euclidean_iter'
    ]:
        # ZERO_DIV_OFFSET is added inside the sqrt so the argument is never
        # exactly zero.
        distance_to_center = tf.sqrt(
            tf.reduce_sum(tf.square(logits - center), axis=-1) +
            ZERO_DIV_OFFSET)
        if FLAGS.ratio_box_distance:
            # Express the distance as a ratio: distance to center divided by
            # (distance to center - distance to box edge).
            distance_to_box = calc_distance_to_edge(groundtruth_coords, logits)
            box_distance_to_center = (tf.to_float(distance_to_center) -
                                      distance_to_box)
            loss = distance_to_center / (box_distance_to_center +
                                         ZERO_DIV_OFFSET)
        else:
            loss = distance_to_center

        if FLAGS.distance_metric == 'euclidean_sqrt':
            loss = tf.sqrt(loss)
        if FLAGS.distance_metric == 'euclidean_iter':
            # Exponent 1 / step, where step grows by one every
            # FLAGS.euclidean_step global steps.
            iter_num = tf.to_float(tf.train.get_or_create_global_step())
            step = (iter_num // FLAGS.euclidean_step) + 1.0
            loss = tf.pow(loss, tf.to_float(1.0 / step))
        tf.losses.compute_weighted_loss(loss)