Example #1
    def define_self_prediction_rew(self, convfeat, rep_size, enlargement,
                                   scope):
        # RND: randomly initialized, frozen target network.
        for ph in self.ph_ob.values():
            if len(ph.shape.as_list()) == 5:  # B,T,H,W,C
                logger.info("CnnTarget: using '%s' shape %s as image input" %
                            (ph.name, str(ph.shape)))
                xr = ph[:, 1:]
                xr = tf.cast(xr, tf.float32)
                xr = tf.reshape(xr, (-1, *ph.shape.as_list()[-3:]))[:, :, :,
                                                                    -1:]
                xr = tf.clip_by_value((xr - self.ph_mean) / self.ph_std, -5.0,
                                      5.0)

                xr = tf.nn.leaky_relu(
                    conv(xr,
                         'c1r',
                         nf=convfeat * 1,
                         rf=8,
                         stride=4,
                         init_scale=np.sqrt(2)))
                xr = tf.nn.leaky_relu(
                    conv(xr,
                         'c2r',
                         nf=convfeat * 2 * 1,
                         rf=4,
                         stride=2,
                         init_scale=np.sqrt(2)))
                xr = tf.nn.leaky_relu(
                    conv(xr,
                         'c3r',
                         nf=convfeat * 2 * 1,
                         rf=3,
                         stride=1,
                         init_scale=np.sqrt(2)))
                rgbr = [to2d(xr)]
                X_r = fc(rgbr[0], 'fc1r', nh=rep_size, init_scale=np.sqrt(2))

        # Random features of the expert (demonstration) observations.
        yes_gpu = any(get_available_gpus())
        with tf.variable_scope(
                tf.get_variable_scope(),
                reuse=True), tf.device('/gpu:0' if yes_gpu else '/cpu:0'):
            X_im = np.load(os.getcwd() + '/policies/obs.npy')
            Xr_im = tf.cast(X_im, tf.float32) / 255.
            Xr_im = tf.reshape(Xr_im, (-1, *ph.shape.as_list()[-3:]))[:, :, :,
                                                                      -1:]
            Xr_im = tf.clip_by_value((Xr_im - tf.reduce_mean(Xr_im)) /
                                     (tf.math.reduce_std(Xr_im)**0.5), -5.0,
                                     5.0)
            Xr_im = tf.nn.leaky_relu(
                conv(Xr_im,
                     'c1r',
                     nf=convfeat * 1,
                     rf=8,
                     stride=4,
                     init_scale=np.sqrt(2)))
            Xr_im = tf.nn.leaky_relu(
                conv(Xr_im,
                     'c2r',
                     nf=convfeat * 2 * 1,
                     rf=4,
                     stride=2,
                     init_scale=np.sqrt(2)))
            Xr_im = tf.nn.leaky_relu(
                conv(Xr_im,
                     'c3r',
                     nf=convfeat * 2 * 1,
                     rf=3,
                     stride=1,
                     init_scale=np.sqrt(2)))
            Xr_im = [to2d(Xr_im)[::self.demonstration_stride]]
            Xr_im = fc(Xr_im[0], 'fc1r', nh=rep_size, init_scale=np.sqrt(2))
            Xr_im = tf.stop_gradient(Xr_im)

        # Predictor network.
        for ph in self.ph_ob.values():
            if len(ph.shape.as_list()) == 5:  # B,T,H,W,C (batch, time, height, width, channels)
                logger.info("CnnTarget: using '%s' shape %s as image input" %
                            (ph.name, str(ph.shape)))
                xrp = ph[:, 1:]
                xrp = tf.cast(xrp, tf.float32)
                xrp = tf.reshape(xrp, (-1, *ph.shape.as_list()[-3:]))[:, :, :,
                                                                      -1:]
                xrp = tf.clip_by_value((xrp - self.ph_mean) / self.ph_std,
                                       -5.0, 5.0)

                xrp = tf.nn.leaky_relu(
                    conv(xrp,
                         'c1rp_pred',
                         nf=convfeat,
                         rf=8,
                         stride=4,
                         init_scale=np.sqrt(2)))
                xrp = tf.nn.leaky_relu(
                    conv(xrp,
                         'c2rp_pred',
                         nf=convfeat * 2,
                         rf=4,
                         stride=2,
                         init_scale=np.sqrt(2)))
                xrp = tf.nn.leaky_relu(
                    conv(xrp,
                         'c3rp_pred',
                         nf=convfeat * 2,
                         rf=3,
                         stride=1,
                         init_scale=np.sqrt(2)))
                rgbrp = to2d(xrp)
                X_r_hat = tf.nn.relu(
                    fc(rgbrp,
                       'fc1r_hat1_pred',
                       nh=256 * enlargement,
                       init_scale=np.sqrt(2)))
                X_r_hat = tf.nn.relu(
                    fc(X_r_hat,
                       'fc1r_hat2_pred',
                       nh=256 * enlargement,
                       init_scale=np.sqrt(2)))
                X_r_hat = fc(X_r_hat,
                             'fc1r_hat3_pred',
                             nh=rep_size,
                             init_scale=np.sqrt(2))

        self.feat_var = tf.reduce_mean(tf.nn.moments(X_r, axes=[0])[1])
        self.max_feat = tf.reduce_max(tf.abs(X_r))
        self.int_rew = tf.reduce_mean(
            tf.square(tf.stop_gradient(X_r) - X_r_hat),
            axis=-1,
            keep_dims=True)
        self.int_rew = tf.reshape(self.int_rew,
                                  (self.sy_nenvs, self.sy_nsteps - 1))
        # Imitation bonus: average dot product between each step's random
        # features and the demonstration features, used to scale up the RND
        # reward near demonstrated states.
        im_rew = tf.reduce_mean(tf.tensordot(tf.stop_gradient(X_r),
                                             Xr_im,
                                             axes=[[1], [1]]),
                                axis=1)
        im_rew = tf.reshape(im_rew, (self.sy_nenvs, self.sy_nsteps - 1))
        self.int_rew = self.int_rew * (1 + tf.math.tanh(im_rew / 100))

        noisy_targets = tf.stop_gradient(X_r)
        self.aux_loss = tf.reduce_mean(tf.square(noisy_targets - X_r_hat), -1)
        mask = tf.random_uniform(shape=tf.shape(self.aux_loss),
                                 minval=0.,
                                 maxval=1.,
                                 dtype=tf.float32)
        mask = tf.cast(mask < self.proportion_of_exp_used_for_predictor_update,
                       tf.float32)
        self.aux_loss = tf.reduce_sum(mask * self.aux_loss) / tf.maximum(
            tf.reduce_sum(mask), 1.)
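
The intrinsic reward above is the per-step mean squared error between a frozen random target embedding and a trained predictor embedding, modulated by similarity to demonstration features. A minimal standalone sketch of the basic RND reward (illustrative TF1 code, not the author's class; all names are hypothetical):

    import tensorflow as tf

    # Frozen random target vs. trained predictor on the same observation.
    obs = tf.placeholder(tf.float32, [None, 84, 84, 1])
    flat = tf.layers.flatten(obs)
    with tf.variable_scope('target'):
        target_feat = tf.layers.dense(flat, 64)  # frozen via stop_gradient below
    with tf.variable_scope('predictor'):
        pred_feat = tf.layers.dense(flat, 64)
    int_rew = tf.reduce_mean(
        tf.square(tf.stop_gradient(target_feat) - pred_feat), axis=-1)
    aux_loss = tf.reduce_mean(int_rew)  # gradients update the predictor only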
Example #2
    def build(self, input_shape):
        if self.data_format == 'channels_last':
            channel_axis = -1
            input_row, input_col = input_shape[1:-1]
        else:
            channel_axis = 1
            input_row, input_col = input_shape[2:]

        if input_shape[channel_axis] is None:
            raise ValueError('The channel dimension of the inputs '
                             'should be defined. Found `None`.')
        input_filter = int(input_shape[channel_axis])

        if (((input_row is None) and
             ((self.share_row_combining_weights,
               self.share_col_combining_weights) in [(True, False),
                                                     (False, False)]))
                or ((input_col is None) and
                    ((self.share_row_combining_weights,
                      self.share_col_combining_weights) in [(False, True),
                                                            (False, False)]))):
            raise ValueError('The spatial dimensions of the inputs to '
                             ' a LowRankLocallyConnected2D layer '
                             'should be fully-defined, but layer received '
                             'the inputs shape ' + str(input_shape))

        # Compute output shapes.
        # Compute using the first filter since output will be same across filters.
        kernel_size = self.kernel_size[0] if isinstance(
            self.kernel_size, list) else self.kernel_size

        dilations = self.dilations[0] if isinstance(self.dilations,
                                                    list) else self.dilations

        output_row = conv_utils.conv_output_length(input_row,
                                                   kernel_size[0],
                                                   self.padding,
                                                   self.strides[0],
                                                   dilation=dilations)
        output_col = conv_utils.conv_output_length(input_col,
                                                   kernel_size[1],
                                                   self.padding,
                                                   self.strides[1],
                                                   dilation=dilations)

        if isinstance(self.kernel_size, list):
            # Different filters.
            self.kernel_bases = []
            for i, kernel_size in enumerate(self.kernel_size):
                kernel_bases_shape = (kernel_size[0], kernel_size[1],
                                      input_filter, self.filters)
                self.kernel_bases.append(
                    self.add_weight(shape=kernel_bases_shape,
                                    initializer=self.kernel_initializer,
                                    name='kernel_bases%d' % i,
                                    regularizer=self.kernel_regularizer,
                                    constraint=self.kernel_constraint))
        else:
            self.kernel_bases_shape = (self.kernel_size[0],
                                       self.kernel_size[1], input_filter,
                                       self.spatial_rank * self.filters)
            self.kernel_shape = (output_row, output_col, self.kernel_size[0],
                                 self.kernel_size[1], input_filter,
                                 self.filters)
            self.kernel_bases = self.add_weight(
                shape=self.kernel_bases_shape,
                initializer=self.kernel_initializer,
                name='kernel_bases',
                regularizer=self.kernel_regularizer,
                constraint=self.kernel_constraint)

        self.output_row = output_row
        self.output_col = output_col

        if not (self.share_row_combining_weights
                or self.share_col_combining_weights):
            if self.input_dependent:
                self.combining_weights = None
            else:
                self.combining_weights_shape = (output_row, output_col,
                                                self.spatial_rank)

                initializer = (
                    tf.constant_initializer(1. / np.sqrt(self.spatial_rank))
                    if self.combining_weights_initializer == 'conv_init' else
                    self.combining_weights_initializer)

                self.wts = self.add_weight(
                    shape=self.combining_weights_shape,
                    initializer=initializer,
                    name='combining_weights',
                    regularizer=self.combining_weights_regularizer,
                    constraint=self.combining_weights_constraint)
                # If self.wts itself were overwritten, the variable would be
                # removed from layer.weights, so the result is assigned to a
                # separate attribute instead.
                self.combining_weights = self.wts

        else:
            c = 1. / (float(self.share_row_combining_weights) + float(
                self.share_col_combining_weights))  # Scale for init.
            initializer = (tf.constant_initializer(c /
                                                   np.sqrt(self.spatial_rank))
                           if self.combining_weights_initializer == 'conv_init'
                           else self.combining_weights_initializer)
            combining_weights_shape_row = (output_row, self.spatial_rank)
            combining_weights_shape_col = (output_col, self.spatial_rank)

            self.wts_row = tf.constant([[0.]])
            self.wts_col = tf.constant([[0.]])
            if self.share_row_combining_weights:
                self.wts_row = self.add_weight(
                    shape=combining_weights_shape_row,
                    initializer=initializer,
                    name='combining_weights_row',
                    regularizer=self.combining_weights_regularizer,
                    constraint=self.combining_weights_constraint)

            if self.share_col_combining_weights:
                self.wts_col = self.add_weight(
                    shape=combining_weights_shape_col,
                    initializer=initializer,
                    name='combining_weights_col',
                    regularizer=self.combining_weights_regularizer,
                    constraint=self.combining_weights_constraint)

            if self.share_row_combining_weights and self.share_col_combining_weights:
                self.combining_weights = tf.math.add(self.wts_col[tf.newaxis],
                                                     self.wts_row[:,
                                                                  tf.newaxis],
                                                     name='combining_weights')
                self.combining_weights_shape = (output_row, output_col,
                                                self.spatial_rank)

            elif self.share_row_combining_weights:
                self.combining_weights = tf.identity(self.wts_row,
                                                     name='combining_weights')
                self.combining_weights_shape = combining_weights_shape_row

            elif self.share_col_combining_weights:
                self.combining_weights = tf.identity(self.wts_col,
                                                     name='combining_weights')
                self.combining_weights_shape = combining_weights_shape_col

        if not self.input_dependent:
            if self.normalize_weights == 'softmax':
                # Normalize the weights to sum to 1.
                self.combining_weights = tf.nn.softmax(
                    self.combining_weights,
                    axis=-1,
                    name='normalized_combining_weights')
            elif self.normalize_weights == 'norm':
                # Normalize the weights to sum to preserve kernel var.
                self.combining_weights = tf.math.l2_normalize(
                    self.combining_weights,
                    axis=-1,
                    epsilon=1e-12,
                    name='normalized_combining_weights')

        if (self.input_dependent or isinstance(self.kernel_size, list)
                or ((self.share_row_combining_weights,
                     self.share_col_combining_weights) in [(True, False),
                                                           (False, True)])):
            # Different kernel bases cannot be combined into a single kernel;
            # also, one spatial axis may be undefined when combining weights
            # are shared along a single dimension.
            self.kernel = None
        else:
            self.kernel = tf.tensordot(
                self.combining_weights,
                tf.reshape(self.kernel_bases,
                           (self.kernel_size[0], self.kernel_size[1],
                            input_filter, self.spatial_rank, self.filters)),
                [[-1], [-2]],
                name='kernel')

        self.bias_spatial = 0.
        self.bias_channels = 0.
        if self.use_spatial_bias:
            if not (self.share_row_combining_weights
                    or self.share_col_combining_weights):
                self.bias_spatial = self.add_weight(
                    shape=(output_row, output_col, 1),
                    initializer=self.bias_initializer,
                    name='spatial_bias',
                    regularizer=self.bias_regularizer,
                    constraint=self.bias_constraint)

            else:
                self.bias_row = 0.
                self.bias_col = 0.
                if self.share_row_combining_weights:
                    self.bias_row = self.add_weight(
                        shape=(output_row, 1, 1),
                        initializer=self.bias_initializer,
                        name='bias_row',
                        regularizer=self.bias_regularizer,
                        constraint=self.bias_constraint)

                if self.share_col_combining_weights:
                    self.bias_col = self.add_weight(
                        shape=(1, output_col, 1),
                        initializer=self.bias_initializer,
                        name='bias_col',
                        regularizer=self.bias_regularizer,
                        constraint=self.bias_constraint)
                self.bias_spatial = tf.math.add(self.bias_row,
                                                self.bias_col,
                                                name='spatial_bias')

        if self.use_bias:
            self.bias_channels = self.add_weight(
                shape=(1, 1, self.filters),
                initializer=self.bias_initializer,
                name='bias_channels',
                regularizer=self.bias_regularizer,
                constraint=self.bias_constraint)

        self.bias = tf.math.add(self.bias_spatial,
                                self.bias_channels,
                                name='bias')

        if self.data_format == 'channels_last':
            self.input_spec = InputSpec(ndim=4, axes={-1: input_filter})
        else:
            self.input_spec = InputSpec(ndim=4, axes={1: input_filter})

        self.built = True
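
A usage sketch for the layer whose build() is shown above, mirroring the constructor arguments exercised by the test later on this page (the `layers` module import and remaining constructor defaults are assumed):

    import numpy as np
    import tensorflow as tf

    images = tf.constant(np.random.randn(1, 32, 32, 3), dtype=tf.float32)
    layer = layers.LowRankLocallyConnected2D(
        filters=16,
        kernel_size=(3, 3),
        strides=(1, 1),
        padding='valid',
        spatial_rank=2,
        share_row_combining_weights=False,
        share_col_combining_weights=False,
        data_format='channels_last')
    outputs = layer(images)  # build() runs on the first call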
Example #3
def _scale_expression(expr, w):
  """Scale a linear expression by w."""
  b = tf.matmul(expr.b, w)
  w = tf.tensordot(expr.w, w, axes=1)
  return LinearExpression(w=w, b=b, lower=expr.lower, upper=expr.upper)
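
_scale_expression contracts the trailing axis of expr.w (and multiplies expr.b) by w; a shape sketch under an assumed namedtuple matching the fields used above:

    import collections
    import tensorflow as tf

    LinearExpression = collections.namedtuple(
        'LinearExpression', ['w', 'b', 'lower', 'upper'])  # assumed container

    expr = LinearExpression(w=tf.ones([4, 3, 5]),  # [..., in_dim]
                            b=tf.ones([3, 5]),
                            lower=None, upper=None)
    w = tf.ones([5, 2])                            # [in_dim, out_dim]
    scaled = _scale_expression(expr, w)            # w: [4, 3, 2], b: [3, 2]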
Example #4
def compute_mel_filterbank_features(waveforms,
                                    sample_rate=16000,
                                    dither=1.0 / np.iinfo(np.int16).max,
                                    preemphasis=0.97,
                                    frame_length=25,
                                    frame_step=10,
                                    fft_length=None,
                                    window_fn=functools.partial(
                                        tf.signal.hann_window, periodic=True),
                                    lower_edge_hertz=80.0,
                                    upper_edge_hertz=7600.0,
                                    num_mel_bins=80,
                                    log_noise_floor=1e-3,
                                    apply_mask=True):
    """Implement mel-filterbank extraction using tf ops.

  Args:
    waveforms: float32 tensor with shape [batch_size, max_len]
    sample_rate: sampling rate of the waveform
    dither: stddev of Gaussian noise added to waveform to prevent quantization
      artefacts
    preemphasis: waveform high-pass filtering constant
    frame_length: frame length in ms
    frame_step: frame step in ms
    fft_length: number of fft bins
    window_fn: windowing function
    lower_edge_hertz: lowest frequency of the filterbank
    upper_edge_hertz: highest frequency of the filterbank
    num_mel_bins: filterbank size
    log_noise_floor: clip small values to prevent numeric overflow in log
    apply_mask: When working on a batch of samples, set padding frames to zero
  Returns:
    filterbanks: a float32 tensor with shape [batch_size, len, num_bins, 1]
  """
    # `stfts` is a complex64 Tensor representing the short-time Fourier
    # Transform of each signal in `signals`. Its shape is
    # [batch_size, ?, fft_unique_bins]
    # where fft_unique_bins = fft_length // 2 + 1

    # Find the wave length: the largest index for which the value is !=0
    # note that waveforms samples that are exactly 0.0 are quite common, so
    # simply doing sum(waveforms != 0, axis=-1) will not work correctly.
    wav_lens = tf.reduce_max(
        tf.expand_dims(tf.range(tf.shape(waveforms)[1]), 0) *
        tf.to_int32(tf.not_equal(waveforms, 0.0)),
        axis=-1) + 1
    if dither > 0:
        waveforms += tf.random_normal(tf.shape(waveforms), stddev=dither)
    if preemphasis > 0:
        waveforms = waveforms[:, 1:] - preemphasis * waveforms[:, :-1]
        wav_lens -= 1
    frame_length = int(frame_length * sample_rate / 1e3)
    frame_step = int(frame_step * sample_rate / 1e3)
    if fft_length is None:
        fft_length = int(2**(np.ceil(np.log2(frame_length))))

    stfts = tf.contrib.signal.stft(waveforms,
                                   frame_length=frame_length,
                                   frame_step=frame_step,
                                   fft_length=fft_length,
                                   window_fn=window_fn,
                                   pad_end=True)

    stft_lens = (wav_lens + (frame_step - 1)) // frame_step
    masks = tf.to_float(
        tf.less_equal(tf.expand_dims(tf.range(tf.shape(stfts)[1]), 0),
                      tf.expand_dims(stft_lens, 1)))

    # An energy spectrogram is the magnitude of the complex-valued STFT.
    # A float32 Tensor of shape [batch_size, ?, 257].
    magnitude_spectrograms = tf.abs(stfts)

    # Warp the linear-scale, magnitude spectrograms into the mel-scale.
    num_spectrogram_bins = magnitude_spectrograms.shape[-1].value
    linear_to_mel_weight_matrix = (
        tf.contrib.signal.linear_to_mel_weight_matrix(num_mel_bins,
                                                      num_spectrogram_bins,
                                                      sample_rate,
                                                      lower_edge_hertz,
                                                      upper_edge_hertz))
    mel_spectrograms = tf.tensordot(magnitude_spectrograms,
                                    linear_to_mel_weight_matrix, 1)
    # Note: Shape inference for tensordot does not currently handle this case.
    mel_spectrograms.set_shape(magnitude_spectrograms.shape[:-1].concatenate(
        linear_to_mel_weight_matrix.shape[-1:]))

    log_mel_sgram = tf.log(tf.maximum(log_noise_floor, mel_spectrograms))

    if apply_mask:
        log_mel_sgram *= tf.expand_dims(tf.to_float(masks), -1)

    return tf.expand_dims(log_mel_sgram, -1, name="mel_sgrams")
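
A usage sketch on a dummy batch, keeping all defaults (requires the TF1 tf.contrib APIs used above):

    import tensorflow as tf

    waveforms = tf.random_normal([2, 16000])  # two one-second 16 kHz clips
    mel = compute_mel_filterbank_features(waveforms)
    # mel: [batch, frames, num_mel_bins, 1] == [2, ?, 80, 1]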
Example #5
def lagrangian_optimizer_fmeasure(
    train_set, epsilon, learning_rate, learning_rate_constraint, loops):
  """Implements surrogate-based Lagrangian optimizer (Algorithm 3).

  Specifically solves:
    max F-measure s.t. F-measure(group1) >= F-measure(group0) - epsilon.

  Args:
    train_set: (features, labels, groups)
    epsilon: float, constraint slack.
    learning_rate: float, learning rate for model parameters.
    learning_rate_constraint: float, learning rate for Lagrange multipliers.
    loops: int, number of iterations.

  Returns:
    stochastic_model containing list of models and probabilities,
    deterministic_model.
  """
  x_train, y_train, z_train = train_set
  dimension = x_train.shape[-1]

  tf.reset_default_graph()

  # Data tensors.
  features_tensor = tf.constant(x_train.astype("float32"), name="features")
  labels_tensor = tf.constant(y_train.astype("float32"), name="labels")

  # Linear model.
  weights = tf.Variable(tf.zeros(dimension, dtype=tf.float32),
                        name="weights")
  threshold = tf.Variable(0, name="threshold", dtype=tf.float32)
  predictions_tensor = (tf.tensordot(features_tensor, weights, axes=(1, 0))
                        + threshold)

  # Contexts.
  context = tfco.rate_context(predictions_tensor, labels_tensor)
  context0 = context.subset(z_train < 1)
  context1 = context.subset(z_train > 0)

  # F-measure rates.
  fm_overall = tfco.f_score(context)
  fm1 = tfco.f_score(context1)
  fm0 = tfco.f_score(context0)

  # Rate minimization problem.
  problem = tfco.RateMinimizationProblem(-fm_overall, [fm0 <= fm1 + epsilon])

  # Optimizer.
  optimizer = tfco.LagrangianOptimizerV1(
      tf.train.AdamOptimizer(learning_rate=learning_rate),
      constraint_optimizer=tf.train.AdamOptimizer(
          learning_rate=learning_rate_constraint))
  train_op = optimizer.minimize(problem)

  # Start TF session and initialize variables.
  session = tf.Session()
  session.run(tf.global_variables_initializer())

  # We maintain a list of objectives and model weights during training.
  objectives = []
  violations = []
  models = []

  # Perform full gradient updates.
  for ii in range(loops):

    # Gradient updates.
    session.run(train_op)

    # Checkpoint once in 10 iterations.
    if ii % 10 == 0:
      # Model weights.
      model = [session.run(weights), session.run(threshold)]
      models.append(model)

      # Objective.
      objective = -evaluation.expected_fmeasure(
          x_train, y_train, [model], [1.0])
      objectives.append(objective)

      # Violation.
      fmeasure0, fmeasure1 = evaluation.expected_group_fmeasures(
          x_train, y_train, z_train, [model], [1.0])
      violations.append([fmeasure0 - fmeasure1 - epsilon])

  # Use the recorded objectives and constraints to find the best iterate.
  best_iterate = tfco.find_best_candidate_index(
      np.array(objectives), np.array(violations))
  deterministic_model = models[best_iterate]

  # Use shrinking to find a sparse distribution over iterates.
  probabilities = tfco.find_best_candidate_distribution(
      np.array(objectives), np.array(violations))
  models_pruned = [models[i] for i in range(len(models)) if
                   probabilities[i] > 0.0]
  probabilities_pruned = probabilities[probabilities > 0.0]

  return (models_pruned, probabilities_pruned), deterministic_model
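
A usage sketch on synthetic data (assumes the tfco and evaluation modules imported by the surrounding file):

    import numpy as np

    x = np.random.randn(200, 5)                      # features
    y = (np.random.rand(200) > 0.5).astype('float')  # binary labels
    z = (np.random.rand(200) > 0.5).astype('float')  # group membership

    (models, probs), det_model = lagrangian_optimizer_fmeasure(
        (x, y, z), epsilon=0.05, learning_rate=0.1,
        learning_rate_constraint=0.5, loops=100)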
Example #6
def create_model(
    bert_config,
    is_training,
    input_ids,
    input_mask,
    segment_ids,
    labels,
    num_labels,
    use_one_hot_embeddings,
    num_segments,
    aggregation_method,
    pretrained_model='bert',
    from_distilled_student=False,
):
    """Creates a classification model."""
    scope = ""
    if from_distilled_student:
        scope = "student"
    parade_model = Parade(bert_config=bert_config,
                          is_training=is_training,
                          input_ids=input_ids,
                          input_mask=input_mask,
                          segment_ids=segment_ids,
                          num_segments=num_segments,
                          pretrained_model=pretrained_model,
                          use_one_hot_embeddings=use_one_hot_embeddings,
                          scope=scope)
    output_layer = None
    if aggregation_method == 'cls_attn':
        output_layer = parade_model.reduced_by_attn()
    elif aggregation_method == 'cls_avg':
        output_layer = parade_model.reduced_by_avg()
    elif aggregation_method == 'cls_max':
        output_layer = parade_model.reduced_by_max()
    elif aggregation_method == 'cls_transformer':
        output_layer = parade_model.reduced_by_transformer(
            is_training, num_transformer_layers=2)
    else:
        raise ValueError(
            "Un-supported model type: {}".format(aggregation_method))

    with tf.variable_scope(scope):
        output_weights = tf.get_variable(
            "output_weights", [num_labels, parade_model.hidden_size],
            initializer=tf.truncated_normal_initializer(stddev=0.02))
        output_bias = tf.get_variable("output_bias", [num_labels],
                                      initializer=tf.zeros_initializer())

    with tf.variable_scope("loss"):
        if is_training:
            # I.e., 0.1 dropout
            output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)
        logits = tf.tensordot(output_layer, output_weights, axes=[-1, -1])
        logits = tf.nn.bias_add(logits, output_bias)

        log_probs = tf.nn.log_softmax(logits, axis=-1)
        one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32)
        per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
        loss = tf.reduce_mean(per_example_loss)

        return (loss, per_example_loss, log_probs)
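
The classification head contracts the hidden axis of output_layer against the hidden axis of output_weights ([num_labels, hidden_size]); a minimal equivalence check for that tensordot (illustrative shapes):

    import tensorflow as tf

    out = tf.random_normal([8, 768])  # [batch, hidden]
    w = tf.random_normal([2, 768])    # [num_labels, hidden]
    logits_td = tf.tensordot(out, w, axes=[-1, -1])  # [8, 2]
    logits_mm = tf.matmul(out, w, transpose_b=True)  # same values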
Example #7
def multihead_graph_attention(query_antecedent,
                              memory_antecedent,
                              bias,
                              total_key_depth,
                              total_value_depth,
                              output_depth,
                              num_heads,
                              dropout_rate,
                              image_shapes=None,
                              attention_type="edge_vector",
                              name="multihead_graph_attention",
                              save_weights_to=None,
                              make_image_summary=True,
                              dropout_broadcast_dims=None,
                              adjacency_matrix=None,
                              num_edge_types=5,
                              vars_3d=False,
                              **kwargs):
    """Multihead scaled-dot-product attention with input/output transformations.

  Args:
    query_antecedent: a Tensor with shape [batch, length_q, channels]
    memory_antecedent: a Tensor with shape [batch, length_m, channels] or None
    bias: bias Tensor (see attention_bias())
    total_key_depth: an integer
    total_value_depth: an integer
    output_depth: an integer
    num_heads: an integer dividing total_key_depth and total_value_depth
    dropout_rate: a floating point number
    image_shapes: optional tuple of integer scalars.
                  see comments for attention_image_summary()
    attention_type: a string, either "dot_product", "dot_product_relative",
                    "local_mask_right", "local_unmasked", "masked_dilated_1d",
                    "unmasked_dilated_1d", graph, or any attention function
                    with the signature (query, key, value, **kwargs)
    name: an optional string.
    save_weights_to: an optional dictionary to capture attention weights
      for visualization; the weights tensor will be appended there under
      a string key created from the variable scope (including name).
    make_image_summary: Whether to make an attention image summary.
    dropout_broadcast_dims:  an optional list of integers less than 4
      specifying in which dimensions to broadcast the dropout decisions.
      saves memory.
    adjacency_matrix: an optional tensor of shape [batch, len_q, len_q]
      containing edge vectors for attention
    num_edge_types: number of edge types, an int
    vars_3d: use 3-dimensional variables for input/output transformations
    **kwargs (dict): Parameters for the attention function

  Returns:
    The result of the attention transformation. The output shape is
        [batch_size, length_q, output_depth]

  Raises:
    ValueError: if the key depth or value depth are not divisible by the
      number of attention heads.
  """
    if total_key_depth % num_heads != 0:
        raise ValueError("Key depth (%d) must be divisible by the number of "
                         "attention heads (%d)." %
                         (total_key_depth, num_heads))
    if total_value_depth % num_heads != 0:
        raise ValueError("Value depth (%d) must be divisible by the number of "
                         "attention heads (%d)." %
                         (total_value_depth, num_heads))
    vars_3d_num_heads = num_heads if vars_3d else None
    with tf.variable_scope(name,
                           default_name="multihead_attention",
                           values=[query_antecedent, memory_antecedent]):

        q, k, v = common_attention.compute_qkv(
            query_antecedent,
            memory_antecedent,
            total_key_depth,
            total_value_depth,
            vars_3d_num_heads=vars_3d_num_heads)
        q = common_attention.split_heads(q, num_heads)
        k = common_attention.split_heads(k, num_heads)
        v = common_attention.split_heads(v, num_heads)

        key_depth_per_head = total_key_depth // num_heads
        if not vars_3d:
            q *= key_depth_per_head**-0.5

        additional_returned_value = None
        if callable(attention_type):  # generic way to extend multihead_attention
            x = attention_type(q, k, v, **kwargs)
            if isinstance(x, tuple):
                x, additional_returned_value = x  # Unpack

        elif attention_type == "edge_vector":
            x = graph_attention(q,
                                k,
                                v,
                                bias,
                                dropout_rate,
                                image_shapes,
                                save_weights_to=save_weights_to,
                                make_image_summary=make_image_summary,
                                dropout_broadcast_dims=dropout_broadcast_dims,
                                adjacency_matrix=adjacency_matrix,
                                num_edge_types=num_edge_types)

        x = common_attention.combine_heads(x)

        # Set last dim specifically.
        x.set_shape(x.shape.as_list()[:-1] + [total_value_depth])

        if vars_3d:
            o_var = tf.get_variable(
                "o", [num_heads, total_value_depth // num_heads, output_depth])
            o_var = tf.reshape(o_var, [total_value_depth, output_depth])
            x = tf.tensordot(x, o_var, axes=1)
        else:
            x = common_layers.dense(x,
                                    output_depth,
                                    use_bias=False,
                                    name="output_transform")
        if additional_returned_value is not None:
            return x, additional_returned_value
        return x
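
In the vars_3d branch the per-head output variable is flattened before a single tensordot; a standalone sketch of that reshape-then-contract step (illustrative shapes only):

    import tensorflow as tf

    num_heads, depth_per_head, output_depth = 4, 16, 32
    x = tf.random_normal([2, 10, num_heads * depth_per_head])  # combined heads
    o_var = tf.random_normal([num_heads, depth_per_head, output_depth])
    o_mat = tf.reshape(o_var, [num_heads * depth_per_head, output_depth])
    y = tf.tensordot(x, o_mat, axes=1)  # [2, 10, 32]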
Example #8
def color_transform(masks):
    with tf.name_scope("color_transform"):
        n_components = masks.shape.as_list()[-1]
        colors = tf.constant(get_mask_plot_colors(n_components),
                             name="mask_colors")
        return tf.tensordot(masks, colors, axes=1)
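
A standalone sketch with an explicit color table standing in for get_mask_plot_colors (defined elsewhere in that file):

    import tensorflow as tf

    masks = tf.nn.softmax(tf.random_normal([8, 8, 3]), axis=-1)  # [H, W, K]
    colors = tf.constant([[1., 0., 0.], [0., 1., 0.], [0., 0., 1.]])  # [K, 3]
    colored = tf.tensordot(masks, colors, axes=1)  # [H, W, 3] blended RGB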
Example #9
def compute_attention_component(antecedent,
                                total_depth,
                                filter_width=1,
                                padding="VALID",
                                name="c",
                                vars_3d_num_heads=0,
                                sparsity_technique=None,
                                threshold=3.0,
                                training=True,
                                clip_alpha=None,
                                initial_sparsity=None,
                                split_heads=False,
                                num_heads=None):
    """Computes attention compoenent (query, key or value).

  Args:
    antecedent: a Tensor with shape [batch, length, channels]
    total_depth: an integer
    filter_width: An integer specifying how wide you want the attention
      component to be.
    padding: One of "VALID", "SAME" or "LEFT". Default is VALID: No padding.
    name: a string specifying scope name.
    vars_3d_num_heads: an optional integer (if we want to use 3d variables)
    sparsity_technique: technique used for sparsifying weights.
    threshold: log alpha threshold used for evaluation with variational dropout.
    training: whether model is being trained or not.
    clip_alpha: alpha clipping threshold for variational dropout.
    initial_sparsity: initial sparsity level for lottery ticket &
      scratch experiments.
    split_heads: Whether to prune each head separately.
    num_heads: The number of heads in the attention module.

  Returns:
    c : [batch, length, depth] tensor
  """
    # We don't support 3d attention variables or filter_width > 1 with sparsity
    # techniques
    assert not sparsity_technique or (not vars_3d_num_heads
                                      and filter_width == 1)

    if vars_3d_num_heads > 0:
        assert filter_width == 1
        input_depth = antecedent.get_shape().as_list()[-1]
        depth_per_head = total_depth // vars_3d_num_heads
        initializer_stddev = input_depth**-0.5
        if "q" in name:
            initializer_stddev *= depth_per_head**-0.5
        var = tf.get_variable(
            name,
            [input_depth, vars_3d_num_heads, total_depth // vars_3d_num_heads],
            initializer=tf.random_normal_initializer(
                stddev=initializer_stddev))
        var = tf.cast(var, antecedent.dtype)
        var = tf.reshape(var, [input_depth, total_depth])
        return tf.tensordot(antecedent, var, axes=1)
    if filter_width == 1:
        if sparsity_technique:
            if split_heads:
                # Prune each heads weights separately so that they are free
                # to have different weight magnitude distributions.
                if num_heads is None:
                    raise ValueError(
                        "`num_heads` must be set for split head pruning.")
                if total_depth % num_heads != 0:
                    raise ValueError(
                        "`total_depth` must be divisible by `num_heads`.")
                input_depth = antecedent.get_shape().as_list()[-1]
                depth_per_head = int(total_depth / num_heads)
                masked_head_weights = []
                for head_id in range(num_heads):
                    head_name = name + "_shard_{}".format(head_id)
                    with tf.variable_scope(head_name) as vs:
                        head_weights = tf.get_variable(
                            "kernel", [input_depth, depth_per_head])
                        masked_head_weights.append(
                            pruning.apply_mask(head_weights, vs))
                component_weights = tf.concat(masked_head_weights, axis=1)

                # compute the full component result
                return tf.tensordot(antecedent, component_weights, axes=1)
            else:
                return common_sparse.dense(
                    antecedent,
                    total_depth,
                    use_bias=False,
                    sparsity_technique=sparsity_technique,
                    threshold=threshold,
                    training=training,
                    clip_alpha=clip_alpha,
                    name=name,
                    initial_sparsity=initial_sparsity)
        else:
            return common_layers.dense(antecedent,
                                       total_depth,
                                       use_bias=False,
                                       name=name)
    else:
        return common_layers.conv1d(antecedent,
                                    total_depth,
                                    filter_width,
                                    padding=padding,
                                    name=name)
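
With default arguments the function reduces to a bias-free dense projection; a usage sketch (assumes the tensor2tensor-style common_layers dependency is importable):

    import tensorflow as tf

    x = tf.random_normal([2, 10, 64])  # [batch, length, channels]
    q = compute_attention_component(x, total_depth=128, name='q')  # [2, 10, 128]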
Example #10
    def test_correct_output(self, normalize_weights, input_dependent,
                            data_format, combining_weights_initializer):
        spatial_rank = 2
        kernel_size = 3
        filters = 16
        input_chs = 3
        if data_format == 'channels_last':
            input_shape = (1, 32, 32, input_chs)
        if data_format == 'channels_first':
            input_shape = (1, input_chs, 32, 32)

        images = tf.constant(np.random.randn(*input_shape), dtype=tf.float32)
        layer1 = tf.keras.layers.LocallyConnected2D(filters=filters,
                                                    kernel_size=(kernel_size,
                                                                 kernel_size),
                                                    strides=(1, 1),
                                                    padding='valid',
                                                    data_format=data_format)

        layer2 = layers.LowRankLocallyConnected2D(
            filters=filters,
            kernel_size=(kernel_size, kernel_size),
            strides=(1, 1),
            padding='valid',
            spatial_rank=spatial_rank,
            normalize_weights=normalize_weights,
            combining_weights_initializer=combining_weights_initializer,
            share_row_combining_weights=False,
            share_col_combining_weights=False,
            data_format=data_format,
            input_dependent=input_dependent)

        output1 = layer1(images)
        output2 = layer2(images)

        assign_ops = []

        # Kernel from locally connected network.
        kernel1 = layer1.kernel

        combining_weights = layer2.combining_weights
        if input_dependent:
            combining_weights = tf.reduce_mean(combining_weights, axis=0)
        # Kernel from low rank locally connected network.
        kernel2 = tf.tensordot(
            combining_weights,
            tf.reshape(layer2.kernel_bases,
                       (layer2.kernel_size[0], layer2.kernel_size[1],
                        input_chs, layer2.spatial_rank, layer2.filters)),
            [[-1], [-2]],
            name='kernel')
        kernel2 = kernel_low_rank_lc_to_lc(kernel2, data_format)

        assign_ops.append(tf.assign(kernel1, kernel2))

        # Test results consistent with keras locallyconnected2d layer.
        self.evaluate(tf.global_variables_initializer())
        for op in assign_ops:
            self.evaluate(op)

        max_error = np.max(np.abs(self.evaluate(output1 - output2)))
        self.assertLess(max_error, 1e-5)
Example #11
File: ControlGen.py Project: X11/DAST
    def build_model(self, args):
        # auto-encoder
        with tf.variable_scope('encoder_decoder'):
            # word embedding
            embedding = tf.get_variable('embedding',
                                        initializer=self.word_init)
            # embedding = tf.get_variable('embedding', [self.vocab_size, self.dim_emb])
            enc_inputs = tf.nn.embedding_lookup(embedding, self.enc_inputs)
            dec_inputs = tf.nn.embedding_lookup(embedding, self.dec_inputs)
            with tf.variable_scope('projection'):
                # style information
                projection = {}
                projection['W'] = tf.get_variable(
                    'W', [self.dim_h, self.vocab_size])
                projection['b'] = tf.get_variable('b', [self.vocab_size])
            encoder = self.create_cell(self.dim_h, args.n_layers, self.dropout,
                                       'encoder')
            decoder = self.create_cell(self.dim_h, args.n_layers, self.dropout,
                                       'decoder')
            self.loss_rec, origin_info, transfer_info = self.reconstruction(
                encoder, enc_inputs, self.labels, decoder, dec_inputs,
                self.targets, self.dec_mask, projection)
            _, soft_tsf_ids, self.rec_ids, self.tsf_ids = self.run_decoder(
                decoder, dec_inputs, embedding, projection, origin_info,
                transfer_info)

            # make the real sents and fake sents the same length
            if args.trim_padding:
                fake_probs = fake_probs[:, :1 + self.batch_len, :]

        # discriminator
        with tf.variable_scope('discriminator'):
            classifier_embedding = tf.get_variable('embedding',
                                                   initializer=self.word_init)
            # classifier_embedding = tf.get_variable('embedding', [self.vocab_size, self.dim_emb])
            # Remove BOS; use dec_inputs so the noise added to enc_inputs is avoided.
            real_sents = tf.nn.embedding_lookup(classifier_embedding,
                                                self.dec_inputs[:, 1:])
            fake_sents = tf.tensordot(soft_tsf_ids, classifier_embedding,
                                      [[2], [0]])
            fake_sents = fake_sents[:, :-1, :]  # match the length of real_sents

            # mask the sequences
            mask = tf.sequence_mask(self.enc_lens,
                                    self.max_len - 1,
                                    dtype=tf.float32)
            mask = tf.expand_dims(mask, -1)
            real_sents *= mask
            fake_sents *= mask

            self.loss_d, self.loss_g = self.run_discriminator(
                real_sents, fake_sents, self.labels, args)

        #####   optimizer   #####
        self.loss = self.loss_rec + self.rho * self.loss_g

        theta_eg = retrive_var(['encoder_decoder'])
        theta_d = retrive_var(['discriminator'])

        opt = tf.train.AdamOptimizer(self.learning_rate, beta1=0.5)

        grad, _ = zip(*opt.compute_gradients(self.loss, theta_eg))
        grad, _ = tf.clip_by_global_norm(grad, 30.0)

        self.optimize_tot = opt.apply_gradients(zip(grad, theta_eg))
        self.optimize_rec = opt.minimize(self.loss_rec, var_list=theta_eg)
        self.optimize_d = opt.minimize(self.loss_d, var_list=theta_d)

        self.saver = tf.train.Saver(max_to_keep=5)
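
The generator feeds "soft" token distributions to the discriminator by contracting them with the embedding table instead of doing a hard lookup; a standalone sketch of that soft embedding (illustrative shapes):

    import tensorflow as tf

    soft_ids = tf.nn.softmax(tf.random_normal([4, 7, 100]))   # [batch, time, vocab]
    embedding = tf.random_normal([100, 32])                   # [vocab, emb_dim]
    soft_emb = tf.tensordot(soft_ids, embedding, [[2], [0]])  # [4, 7, 32]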
Example #12
def encoder(features, mode, vocab, hps):
    """Model function.

  Attention seq2seq model, augmented with an encoder
  over the targets of the nearest neighbors.

  Args:
    features: Dictionary of input Tensors.
    mode: train or eval. Keys from tf.estimator.ModeKeys.
    vocab: A list of strings of words in the vocabulary.
    hps: Hyperparams.

  Returns:
    Encoder outputs.
  """

    # [batch_size, src_len]
    src_inputs = features["src_inputs"]
    src_len = features["src_len"]

    with tf.variable_scope("embeddings"):
        scale = (3.0 / hps.emb_dim)**0.5
        embeddings = tf.get_variable("embeddings", [vocab.size(), hps.emb_dim],
                                     dtype=tf.float32,
                                     initializer=tf.random_uniform_initializer(
                                         minval=-scale, maxval=scale))

    # [batch_size, src_len, emb_dim]
    src_input_emb = tf.nn.embedding_lookup(embeddings, src_inputs)

    if mode == tf_estimator.ModeKeys.TRAIN and hps.emb_drop > 0.:
        src_input_emb = tf.nn.dropout(src_input_emb,
                                      keep_prob=1.0 - hps.emb_drop)
    src_att_context, neighbor_att_context = None, None
    src_copy_context, neighbor_copy_context = None, None
    with tf.variable_scope("src_encoder"):

        # 2 * [batch_size, src_len, encoder_dim]
        src_encoder_outputs, src_encoder_states = tf.nn.bidirectional_dynamic_rnn(
            cell_fw=get_rnn_cell(mode=mode,
                                 hps=hps,
                                 input_dim=hps.emb_dim,
                                 num_units=hps.encoder_dim,
                                 num_layers=hps.num_encoder_layers,
                                 dropout=hps.encoder_drop,
                                 cell_type="lstm"),
            cell_bw=get_rnn_cell(mode=mode,
                                 hps=hps,
                                 input_dim=hps.emb_dim,
                                 num_units=hps.encoder_dim,
                                 num_layers=hps.num_encoder_layers,
                                 dropout=hps.encoder_drop,
                                 cell_type="lstm"),
            inputs=src_input_emb,
            dtype=tf.float32,
            sequence_length=src_len)

        # [batch_size, src_len, 2*encoder_dim]
        src_encoder_outputs = tf.concat(src_encoder_outputs, 2)

        with tf.variable_scope("src_att_context"):
            src_att_context = _build_context(
                hps=hps, encoder_outputs=src_encoder_outputs)
        if hps.use_copy:
            with tf.variable_scope("src_copy_context"):
                src_copy_context = _build_context(
                    hps=hps, encoder_outputs=src_encoder_outputs)

    if hps.encode_neighbor or hps.att_neighbor or hps.sum_neighbor:
        # [batch_size, neighbor_len]
        neighbor_inputs = features["neighbor_inputs"]
        neighbor_len = features["neighbor_len"]

        # [batch_size, neighbor_len, emb_dim]
        neighbor_input_emb = tf.nn.embedding_lookup(embeddings,
                                                    neighbor_inputs)

        if mode == tf_estimator.ModeKeys.TRAIN and hps.emb_drop > 0.:
            neighbor_input_emb = tf.nn.dropout(neighbor_input_emb,
                                               keep_prob=1.0 - hps.emb_drop)
        if hps.binary_neighbor:
            neighbor_binary_input = features["neighbor_binary"]
            if hps.binary_dim == 1:
                neighbor_binary_emb = tf.to_float(neighbor_binary_input)
                neighbor_binary_emb = tf.expand_dims(neighbor_binary_emb,
                                                     axis=-1)
            else:
                with tf.variable_scope("binary_emb"):
                    scale = (3.0 / hps.binary_dim)**0.5
                    binary_embeddings = tf.get_variable(
                        "binary_emb", [2, hps.binary_dim],
                        dtype=tf.float32,
                        initializer=tf.random_uniform_initializer(
                            minval=-scale, maxval=scale))
                neighbor_binary_emb = tf.nn.embedding_lookup(
                    binary_embeddings, neighbor_binary_input)

            neighbor_input_emb = tf.concat(
                [neighbor_input_emb, neighbor_binary_emb], axis=2)

        with tf.variable_scope("neighbor_encoder"):
            # 2 * [batch_size, neighbor_len, encoder_dim]
            input_dim = hps.emb_dim
            if hps.binary_neighbor:
                input_dim += hps.binary_dim
            neighbor_encoder_outputs, neighbor_encoder_states = \
                tf.nn.bidirectional_dynamic_rnn(
                    cell_fw=get_rnn_cell(
                        mode=mode, hps=hps,
                        input_dim=input_dim,
                        num_units=hps.neighbor_dim,
                        num_layers=1,
                        dropout=hps.encoder_drop,
                        cell_type="lstm"),
                    cell_bw=get_rnn_cell(
                        mode=mode, hps=hps,
                        input_dim=input_dim,
                        num_units=hps.neighbor_dim,
                        num_layers=1,
                        dropout=hps.encoder_drop,
                        cell_type="lstm"),
                    inputs=neighbor_input_emb,
                    dtype=tf.float32,
                    sequence_length=neighbor_len)
            # [batch_size, neighbor_len, 2*encoder_dim]
            neighbor_encoder_outputs = tf.concat(neighbor_encoder_outputs, 2)
            if hps.att_neighbor:
                with tf.variable_scope("neighbor_att_context"):
                    neighbor_att_context = _build_context(
                        hps=hps, encoder_outputs=neighbor_encoder_outputs)
                if hps.use_copy:
                    with tf.variable_scope("neighbor_copy_context"):
                        neighbor_copy_context = _build_context(
                            hps=hps, encoder_outputs=neighbor_encoder_outputs)
    att_context, copy_context = None, None
    if hps.att_neighbor:
        att_context = tf.concat([src_att_context, neighbor_att_context], 1)
        if hps.use_copy:
            copy_context = tf.concat([src_copy_context, neighbor_copy_context],
                                     1)
    else:
        att_context = src_att_context
        if hps.use_copy:
            copy_context = src_copy_context
    if hps.encode_neighbor:
        neighbor_fw_states, neighbor_bw_states = neighbor_encoder_states
        neighbor_h = tf.concat(
            [neighbor_fw_states[-1].h, neighbor_bw_states[-1].h], axis=1)
        if mode == tf_estimator.ModeKeys.TRAIN and hps.drop > 0.:
            neighbor_h = tf.nn.dropout(neighbor_h, keep_prob=1.0 - hps.drop)
        mem_input = tf.layers.dense(
            neighbor_h,
            units=hps.decoder_dim,
            activation=tf.nn.tanh,
            use_bias=True,
            kernel_initializer=tf.contrib.layers.xavier_initializer(),
            name="mem_input")
        if mode == tf_estimator.ModeKeys.TRAIN and hps.drop > 0.:
            mem_input = tf.nn.dropout(mem_input, keep_prob=1.0 - hps.drop)
    elif hps.sum_neighbor:

        src_fw_states, src_bw_states = src_encoder_states
        src_h = tf.concat([src_fw_states[-1].h, src_bw_states[-1].h], axis=1)

        if mode == tf_estimator.ModeKeys.TRAIN and hps.drop > 0.:
            src_h = tf.nn.dropout(src_h, keep_prob=1.0 - hps.drop)
        src_h = tf.layers.dense(
            src_h,
            units=hps.decoder_dim,
            activation=tf.nn.tanh,
            use_bias=True,
            kernel_initializer=tf.contrib.layers.xavier_initializer(),
            name="proj_src_h")
        neighbor_encoder_outputs = tf.layers.dense(
            neighbor_encoder_outputs,
            units=hps.decoder_dim,
            activation=tf.nn.tanh,
            use_bias=True,
            kernel_initializer=tf.contrib.layers.xavier_initializer(),
            name="proj_neighbor_out")

        # Attention weights of each neighbor position against the source state.
        alpha = tf.einsum("bij,bj->bi", neighbor_encoder_outputs, src_h)
        alpha = tf.nn.softmax(alpha)
        mem_input = tf.reduce_sum(
            neighbor_encoder_outputs * tf.expand_dims(alpha, -1), 1)
        # mem_input = tf.reduce_mean(mem_input, axis=1)
        if mode == tf_estimator.ModeKeys.TRAIN and hps.drop > 0.:
            mem_input = tf.nn.dropout(mem_input, keep_prob=1.0 - hps.drop)
    else:
        assert hps.rnn_cell != "hyper_lstm"
        assert hps.att_type != "hyper"
        mem_input = None

    if hps.use_bridge:
        with tf.variable_scope("bridge"):
            out_dim = hps.num_decoder_layers * hps.decoder_dim
            fw_states, bw_states = src_encoder_states
            c_states, h_states = [], []
            for (fw, bw) in zip(fw_states, bw_states):
                c_states.append(tf.concat((fw.c, bw.c), 1))
                h_states.append(tf.concat((fw.h, bw.h), 1))
            cs, hs = c_states[-1], h_states[-1]

            if mode == tf_estimator.ModeKeys.TRAIN and hps.drop > 0.:
                hs = tf.nn.dropout(hs, keep_prob=1.0 - hps.drop)
                cs = tf.nn.dropout(cs, keep_prob=1.0 - hps.drop)

            h_state = tf.layers.dense(
                hs,
                units=out_dim,
                activation=tf.nn.tanh,
                use_bias=True,
                kernel_initializer=tf.contrib.layers.xavier_initializer(),
                name="h_layer")
            c_state = tf.layers.dense(
                cs,
                units=out_dim,
                activation=tf.nn.tanh,
                use_bias=True,
                kernel_initializer=tf.contrib.layers.xavier_initializer(),
                name="c_layer")
    else:
        h_state, c_state = None, None
    return EncoderOutputs(embeddings=embeddings,
                          mem_input=mem_input,
                          att_context=att_context,
                          copy_context=copy_context,
                          states=(h_state, c_state))
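
The sum_neighbor branch pools neighbor encoder outputs by attention against the projected source state; a standalone sketch of that pooling (illustrative shapes):

    import tensorflow as tf

    neighbor_out = tf.random_normal([4, 12, 64])  # [batch, neighbor_len, dim]
    src_h = tf.random_normal([4, 64])             # [batch, dim]
    alpha = tf.nn.softmax(tf.einsum('bij,bj->bi', neighbor_out, src_h))
    mem_input = tf.reduce_sum(neighbor_out * tf.expand_dims(alpha, -1), 1)  # [4, 64]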
Example #13
def lagrangian_optimizer_kld(
    train_set, additive_slack, learning_rate, learning_rate_constraint, loops):
  """Implements surrogate-based Lagrangian optimizer (Algorithm 2).

  Specifically solves:
    min_{theta} sum_{G = 0, 1} KLD(p, pprG(theta))
      s.t. error_rate <= additive_slack,
    where p is the overall proportion of positives and pprG is the positive
    prediction rate for group G.

  We frame this as a constrained optimization problem:
    min_{theta, xi_pos0, xi_pos1, xi_neg0, xi_neg1} {
      -p log(xi_pos0) - (1-p) log(xi_neg0) - p log(xi_pos1)
        -(1-p) log(xi_neg1)}
    s.t.
      error_rate <= additive_slack,
        xi_pos0 <= ppr0(theta), xi_neg0 <= npr0(theta),
        xi_pos1 <= ppr1(theta), xi_neg1 <= npr1(theta),
  and formulate the Lagrangian:
    max_{lambda's >= 0} min_{xi's} {
      -p log(xi_pos0) - (1-p) log(xi_neg0) - p log(xi_pos1)
        -(1-p) log(xi_neg1)
       + lambda_pos0 (xi_pos0 - ppr0(theta))
       + lambda_neg0 (xi_neg0 - npr0(theta))
       + lambda_pos1 (xi_pos1 - ppr1(theta))
       + lambda_neg1 (xi_neg1 - npr1(theta))}
    s.t.
      error_rate <= additive_slack.

  We do best response for the slack variables xi:
    BR for xi_pos0 = p / lambda_pos0
    BR for xi_neg0 = (1 - p) / lambda_neg0
    BR for xi_pos1 = p / lambda_pos1
    BR for xi_neg1 = (1 - p) / lambda_neg1
  We do gradient ascent on the lambda's, where
    Gradient w.r.t. lambda_pos0
      = BR for xi_pos0 - ppr0(theta)
      = p / lambda_pos0 - ppr0(theta)
      = Gradient w.r.t. lambda_pos0 of
        (p log(lambda_pos0) - lambda_pos0 ppr0(theta))
    Gradient w.r.t. lambda_neg0
      = Gradient w.r.t. lambda_neg0 of
        ((1 - p) log(lambda_neg0) - lambda_neg0 npr0(theta))
    Gradient w.r.t. lambda_pos1
      = Gradient w.r.t. lambda_pos1 of
        (p log(lambda_pos1) - lambda_pos1 ppr1(theta))
    Gradient w.r.t. lambda_neg1
      = Gradient w.r.t. lambda_neg1 of
        ((1 - p) log(lambda_neg1) - lambda_neg1 npr1(theta)).
  We do gradient descent on thetas's, with ppr's and npr's replaced with hinge
  surrogates. We use concave lower bounds on ppr's and npr's, so that when they
  get negated in the updates, we get convex upper bounds.

  See Appendix D.1 in the paper for more details.

  Args:
    train_set: (features, labels, groups)
    additive_slack: float, additive slack on error rate constraint
    learning_rate: float, learning rate for model parameters
    learning_rate_constraint: float, learning rate for Lagrange multipliers
    loops: int, number of iterations

  Returns:
    A stochastic model as a (list of models, list of probabilities) pair, and
    a deterministic model.
  """
  x_train, y_train, z_train = train_set
  dimension = x_train.shape[-1]

  tf.reset_default_graph()

  # Data tensors.
  features_tensor = tf.constant(x_train.astype("float32"), name="features")
  labels_tensor = tf.constant(y_train.astype("float32"), name="labels")

  # Linear model.
  weights = tf.Variable(tf.zeros(dimension, dtype=tf.float32),
                        name="weights")
  threshold = tf.Variable(0, name="threshold", dtype=tf.float32)
  predictions_tensor = (tf.tensordot(features_tensor, weights, axes=(1, 0))
                        + threshold)

  # Group-specific predictions.
  predictions_group0 = tf.boolean_mask(predictions_tensor, mask=(z_train < 1))
  num_examples0 = np.sum(z_train < 1)
  predictions_group1 = tf.boolean_mask(predictions_tensor, mask=(z_train > 0))
  num_examples1 = np.sum(z_train > 0)

  # We use the TF Constrained Optimization (TFCO) library to set up the
  # constrained optimization problem. The library doesn't currently support best
  # responses for slack variables. So we maintain explicit Lagrange multipliers
  # for the slack variables, and let the library deal with the Lagrange
  # multipliers for the error rate constraint.

  # Since we need to perform a gradient descent update on the model parameters,
  # and an ascent update on the Lagrange multipliers on the slack variables, we
  # create a single "minimization" objective using stop gradients, where a
  # descent gradient update has the effect of minimizing over the model
  # parameters and maximizing over the Lagrange multipliers for the slack
  # variables. As noted above, the ascent update on the Lagrange multipliers for
  # the error rate constraint is done by the library internally.

  # Placeholders for Lagrange multipliers for the four slack variables.
  lambda_pos0 = tf.Variable(0.5, dtype=tf.float32, name="lambda_pos0")
  lambda_neg0 = tf.Variable(0.5, dtype=tf.float32, name="lambda_neg0")
  lambda_pos1 = tf.Variable(0.5, dtype=tf.float32, name="lambda_pos1")
  lambda_neg1 = tf.Variable(0.5, dtype=tf.float32, name="lambda_neg1")

  # Set up prediction rates and surrogate relaxations on them.
  p = np.mean(y_train)  # Proportion of positives.

  # Positive and negative prediction rates for group 0 and group 1.
  ppr_group0 = tf.reduce_sum(tf.cast(
      tf.greater(predictions_group0, tf.zeros(num_examples0, dtype="float32")),
      "float32")) / num_examples0
  npr_group0 = 1 - ppr_group0
  ppr_group1 = tf.reduce_sum(tf.cast(
      tf.greater(predictions_group1, tf.zeros(num_examples1, dtype="float32")),
      "float32")) / num_examples1
  npr_group1 = 1 - ppr_group1

  # Hinge concave lower bounds on the positive and negative prediction rates.
  # In the gradient updates, these get negated and become convex upper bounds.
  # For group 0:
  ppr_hinge_group0 = tf.reduce_sum(
      1 - tf.nn.relu(1 - predictions_group0)) * 1.0 / num_examples0
  npr_hinge_group0 = tf.reduce_sum(
      1 - tf.nn.relu(1 + predictions_group0)) * 1.0 / num_examples0
  # For group 1:
  ppr_hinge_group1 = tf.reduce_sum(
      1 - tf.nn.relu(1 - predictions_group1)) * 1.0 / num_examples1
  npr_hinge_group1 = tf.reduce_sum(
      1 - tf.nn.relu(1 + predictions_group1)) * 1.0 / num_examples1

  # Set up KL-divergence objective for constrained optimization.
  # We use stop gradients to ensure that a single descent gradient update on the
  # objective has the effect of minimizing over the model parameters and
  # maximizing over the Lagrange multipliers for the slack variables.

  # KL-divergence for group 0.
  kld_hinge_pos_group0 = (
      - tf.stop_gradient(lambda_pos0) * ppr_hinge_group0
      - p * tf.log(lambda_pos0) + lambda_pos0 * tf.stop_gradient(ppr_group0))
  kld_hinge_neg_group0 = (
      - tf.stop_gradient(lambda_neg0) * npr_hinge_group0
      - (1 - p) * tf.log(lambda_neg0)
      + lambda_neg0 * tf.stop_gradient(npr_group0))
  kld_hinge_group0 = kld_hinge_pos_group0 + kld_hinge_neg_group0

  # KL-divergence for group 1.
  kld_hinge_pos_group1 = (
      - tf.stop_gradient(lambda_pos1) * ppr_hinge_group1
      - p * tf.log(lambda_pos1) + lambda_pos1 * tf.stop_gradient(ppr_group1))
  kld_hinge_neg_group1 = (
      - tf.stop_gradient(lambda_neg1) * npr_hinge_group1
      - (1 - p) * tf.log(lambda_neg1)
      + lambda_neg1 * tf.stop_gradient(npr_group1))
  kld_hinge_group1 = kld_hinge_pos_group1 + kld_hinge_neg_group1

  # Wrap the objective into a rate object.
  objective = tfco.wrap_rate(kld_hinge_group0 + kld_hinge_group1)

  # Set up error rate constraint for constrained optimization.
  context = tfco.rate_context(predictions_tensor, labels_tensor)
  error = tfco.error_rate(context)
  constraints = [error <= additive_slack]

  # Create rate minimization problem object.
  problem = tfco.RateMinimizationProblem(objective, constraints)

  # Set up optimizer.
  optimizer = tfco.LagrangianOptimizerV1(
      tf.train.AdamOptimizer(learning_rate=learning_rate),
      constraint_optimizer=tf.train.AdamOptimizer(
          learning_rate=learning_rate_constraint))
  train_op = optimizer.minimize(problem)

  # Start TF session and initialize variables.
  session = tf.Session()
  session.run(tf.global_variables_initializer())

  # We maintain a list of objectives and model weights during training.
  objectives = []
  violations = []
  models = []

  # Perform full gradient updates.
  for ii in range(loops):

    # Gradient updates.
    session.run(train_op)

    # Checkpoint once every 10 iterations.
    if ii % 10 == 0:
      # Model weights.
      model = [session.run(weights), session.run(threshold)]
      models.append(model)

      # Objective.
      klds = evaluation.expected_group_klds(
          x_train, y_train, z_train, [model], [1.0])
      objectives.append(sum(klds))

      # Violation.
      error = evaluation.expected_error_rate(
          x_train, y_train, [model], [1.0])
      violations.append([error - additive_slack])

  # Use the recorded objectives and constraints to find the best iterate.
  best_iterate = tfco.find_best_candidate_index(
      np.array(objectives), np.array(violations))
  deterministic_model = models[best_iterate]

  # Use shrinking to find a sparse distribution over iterates.
  probabilities = tfco.find_best_candidate_distribution(
      np.array(objectives), np.array(violations))
  models_pruned = [models[i] for i in range(len(models)) if
                   probabilities[i] > 0.0]
  probabilities_pruned = probabilities[probabilities > 0.0]

  return (models_pruned, probabilities_pruned), deterministic_model
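The stop-gradient construction above lets a single descent step minimize over the model parameters while performing ascent on the slack multipliers, whose best responses are analytic (e.g. xi_pos0* = p / lambda_pos0, which is where the log terms come from). A toy sketch of the same trick, assuming TF1 graph mode as in the example; all names here are hypothetical:

import tensorflow as tf

theta = tf.Variable(1.0)
lam = tf.Variable(0.5)
# Descent on `obj` w.r.t. theta minimizes lam * theta^2, while descent w.r.t.
# lam has gradient -1/lam + theta^2, i.e. an ascent step on
# log(lam) - lam * theta^2.
obj = (tf.stop_gradient(lam) * theta**2
       - tf.log(lam) + lam * tf.stop_gradient(theta**2))
train_op = tf.train.GradientDescentOptimizer(0.1).minimize(obj)

with tf.Session() as session:
    session.run(tf.global_variables_initializer())
    session.run(train_op)
    print(session.run([theta, lam]))  # theta: 1.0 -> 0.9, lam: 0.5 -> 0.6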
Example #14
    def prepare_processing_graph(self, model_settings):
        """Builds a TensorFlow graph to apply the input distortions.

    Creates a graph that loads a WAVE file, decodes it, scales the volume,
    shifts it in time, adds in background noise, calculates a spectrogram, and
    then builds an MFCC fingerprint from that.

    This must be called with an active TensorFlow session running; it creates
    several placeholder inputs and one output:

      - wav_filename_placeholder_: Filename of the WAV to load.
      - foreground_volume_placeholder_: How loud the main clip should be.
      - time_shift_placeholder_: How much the clip is shifted.
      - background_data_placeholder_: PCM sample data for background noise.
      - background_volume_placeholder_: Loudness of mixed-in background.
      - mfcc_: Output 2D fingerprint of processed audio.

    Args:
      model_settings: Information about the current model being trained.
    """
        desired_samples = model_settings['desired_samples']
        self.wav_filename_placeholder_ = tf.placeholder(tf.string, [],
                                                        name='filename')
        wav_loader = io_ops.read_file(self.wav_filename_placeholder_)
        wav_decoder = contrib_audio.decode_wav(wav_loader,
                                               desired_channels=1,
                                               desired_samples=desired_samples)
        # Allow the audio sample's volume to be adjusted.
        self.foreground_volume_placeholder_ = tf.placeholder(
            tf.float32, [], name='foreground_volume')
        scaled_foreground = tf.multiply(wav_decoder.audio,
                                        self.foreground_volume_placeholder_)
        # Shift the sample's start position, and pad any gaps with zeros.
        self.time_shift_placeholder_ = tf.placeholder(tf.int32,
                                                      name='timeshift')
        # TODO(see--): Write test with np.roll
        shifted_foreground = tf_roll(scaled_foreground,
                                     self.time_shift_placeholder_)
        # Mix in background noise.
        self.background_data_placeholder_ = tf.placeholder(
            tf.float32, [desired_samples, 1], name='background_data')
        self.background_volume_placeholder_ = tf.placeholder(
            tf.float32, [], name='background_volume')
        background_mul = tf.multiply(self.background_data_placeholder_,
                                     self.background_volume_placeholder_)
        background_add = tf.add(background_mul, shifted_foreground)
        # removed clipping: tf.clip_by_value(background_add, -1.0, 1.0)
        self.background_clamp_ = background_add
        self.background_clamp_ = tf.reshape(
            self.background_clamp_, (1, model_settings['desired_samples']))
        # Run the spectrogram and MFCC ops to get a 2D 'fingerprint' of the audio.
        #stfts = tf.contrib.signal.stft(
        stfts = tf.signal.stft(
            self.background_clamp_,
            frame_length=model_settings['window_size_samples'],
            frame_step=model_settings['window_stride_samples'],
            fft_length=None)
        self.spectrogram_ = tf.abs(stfts)
        num_spectrogram_bins = self.spectrogram_.shape[-1].value
        lower_edge_hertz, upper_edge_hertz = 80.0, 7600.0
        linear_to_mel_weight_matrix = \
            tf.signal.linear_to_mel_weight_matrix(
                model_settings['dct_coefficient_count'],
                num_spectrogram_bins, model_settings['sample_rate'],
                lower_edge_hertz, upper_edge_hertz)
        mel_spectrograms = tf.tensordot(self.spectrogram_,
                                        linear_to_mel_weight_matrix, 1)
        mel_spectrograms.set_shape(self.spectrogram_.shape[:-1].concatenate(
            linear_to_mel_weight_matrix.shape[-1:]))
        log_mel_spectrograms = tf.log(mel_spectrograms + 1e-6)
        self.mfcc_ = tf.signal.mfccs_from_log_mel_spectrograms(
            log_mel_spectrograms
        )[:, :, :model_settings['num_log_mel_features']]  # :13
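A standalone sketch of the spectrogram-to-MFCC chain used above, assuming TF1, 16 kHz mono audio, and hypothetical frame sizes (30 ms windows with a 10 ms stride):

import tensorflow as tf

audio = tf.placeholder(tf.float32, [1, 16000], name='audio')  # 1 s at 16 kHz
stfts = tf.signal.stft(audio, frame_length=480, frame_step=160, fft_length=512)
spectrogram = tf.abs(stfts)                      # [1, frames, 257]
num_bins = spectrogram.shape[-1].value
mel_weights = tf.signal.linear_to_mel_weight_matrix(
    40, num_bins, 16000, 80.0, 7600.0)
mel = tf.tensordot(spectrogram, mel_weights, 1)  # [1, frames, 40]
log_mel = tf.log(mel + 1e-6)
mfcc = tf.signal.mfccs_from_log_mel_spectrograms(log_mel)[..., :13]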
Example #15
def multihead_attention(query_antecedent,
                        memory_antecedent,
                        bias,
                        total_key_depth,
                        total_value_depth,
                        output_depth,
                        num_heads,
                        dropout_rate,
                        shared_rel=False,
                        max_relative_position=None,
                        image_shapes=None,
                        attention_type="dot_product",
                        block_length=128,
                        block_width=128,
                        q_filter_width=1,
                        kv_filter_width=1,
                        q_padding="VALID",
                        kv_padding="VALID",
                        cache=None,
                        gap_size=0,
                        num_memory_blocks=2,
                        name="multihead_attention",
                        save_weights_to=None,
                        make_image_summary=True,
                        dropout_broadcast_dims=None,
                        max_length=None,
                        vars_3d=False,
                        scale_dotproduct=True,
                        **kwargs):
    """Multihead scaled-dot-product attention with input/output transformations.

  Args:
    query_antecedent: a Tensor with shape [batch, length_q, channels]
    memory_antecedent: a Tensor with shape [batch, length_m, channels] or None
    bias: bias Tensor (see attention_bias())
    total_key_depth: an integer
    total_value_depth: an integer
    output_depth: an integer
    num_heads: an integer dividing total_key_depth and total_value_depth
    dropout_rate: a floating point number
    shared_rel: boolean to share relative embeddings
    max_relative_position: Maximum distance between inputs to generate
                           unique relation embeddings for. Only relevant
                           when using "dot_product_relative" attention.
    image_shapes: optional tuple of integer scalars.
                  see comments for attention_image_summary()
    attention_type: a string, either "dot_product", "dot_product_relative",
                    "local_mask_right", "local_unmasked", "masked_dilated_1d",
                    "unmasked_dilated_1d", graph, or any attention function
                    with the signature (query, key, value, **kwargs)
    block_length: an integer - relevant for "local_mask_right"
    block_width: an integer - relevant for "local_unmasked"
    q_filter_width: An integer specifying how wide you want the query to be.
    kv_filter_width: An integer specifying how wide you want the keys and values
                     to be.
    q_padding: One of "VALID", "SAME" or "LEFT". Default is "VALID": no
               padding.
    kv_padding: One of "VALID", "SAME" or "LEFT". Default is "VALID": no
               padding.
    cache: dict containing Tensors which are the results of previous
           attentions, used for fast decoding. Expects the dict to contain two
           keys ('k' and 'v'), for the initial call the values for these keys
           should be empty Tensors of the appropriate shape.
               'k' [batch_size, 0, key_channels]
               'v' [batch_size, 0, value_channels]
    gap_size: Integer option for dilated attention to indicate spacing between
              memory blocks.
    num_memory_blocks: Integer option to indicate how many memory blocks to look
                       at.
    name: an optional string.
    save_weights_to: an optional dictionary to capture attention weights
      for visualization; the weights tensor will be appended there under
      a string key created from the variable scope (including name).
    make_image_summary: Whether to make an attention image summary.
    dropout_broadcast_dims:  an optional list of integers less than 4
      specifying in which dimensions to broadcast the dropout decisions.
      saves memory.
    max_length: an integer - needed by relative attention
    vars_3d: use 3-dimensional variables for input/output transformations
    scale_dotproduct: whether to normalize the attention product.
    **kwargs (dict): Parameters for the attention function

  Caching:
    WARNING: For decoder self-attention, i.e. when memory_antecedent == None,
    the caching assumes that the bias contains future masking.

    The caching works by saving all the previous key and value values so that
    you are able to send just the last query location to this attention
    function. I.e. if the cache dict is provided it assumes the query is of the
    shape [batch_size, 1, hidden_dim] rather than the full memory.

  Returns:
    The result of the attention transformation. The output shape is
        [batch_size, length_q, hidden_dim]
    unless the cache dict is provided in which case only the last memory
    position is calculated and the output shape is [batch_size, 1, hidden_dim]
    Optionally returns additional loss parameters (e.g. load balance loss for
    the experts) returned by the attention_type function.

  Raises:
    ValueError: if the key depth or value depth are not divisible by the
      number of attention heads.
  """
    if total_key_depth % num_heads != 0:
        raise ValueError("Key depth (%d) must be divisible by the number of "
                         "attention heads (%d)." %
                         (total_key_depth, num_heads))
    if total_value_depth % num_heads != 0:
        raise ValueError("Value depth (%d) must be divisible by the number of "
                         "attention heads (%d)." %
                         (total_value_depth, num_heads))
    vars_3d_num_heads = num_heads if vars_3d else 0
    with tf.variable_scope(name,
                           default_name="multihead_attention",
                           values=[query_antecedent, memory_antecedent]):

        if cache is None or memory_antecedent is None:
            q, k, v = common_attention.compute_qkv(
                query_antecedent,
                memory_antecedent,
                total_key_depth,
                total_value_depth,
                q_filter_width,
                kv_filter_width,
                q_padding,
                kv_padding,
                vars_3d_num_heads=vars_3d_num_heads)
        if cache is not None:
            if attention_type != "dot_product":
                # TODO(petershaw): Support caching when using relative position
                # representations, i.e. "dot_product_relative" attention.
                raise NotImplementedError(
                    "Caching is not guaranteed to work with attention types other than"
                    " dot_product.")
            if bias is None:
                raise ValueError(
                    "Bias required for caching. See function docstring "
                    "for details.")

            if memory_antecedent is not None:
                # Encoder-Decoder Attention Cache
                q = common_attention.compute_attention_component(
                    query_antecedent,
                    total_key_depth,
                    q_filter_width,
                    q_padding,
                    "q",
                    vars_3d_num_heads=vars_3d_num_heads)
                k = cache["k_encdec"]
                v = cache["v_encdec"]
            else:
                k = common_attention.split_heads(k, num_heads)
                v = common_attention.split_heads(v, num_heads)
                decode_loop_step = kwargs.get("decode_loop_step")
                if decode_loop_step is None:
                    k = cache["k"] = tf.concat([cache["k"], k], axis=2)
                    v = cache["v"] = tf.concat([cache["v"], v], axis=2)
                else:
                    # Inplace update is required for inference on TPU.
                    # Inplace_ops only supports inplace_update on the first dimension.
                    # The performance of current implementation is better than updating
                    # the tensor by adding the result of matmul(one_hot,
                    # update_in_current_step)
                    tmp_k = tf.transpose(cache["k"], perm=[2, 0, 1, 3])
                    tmp_k = inplace_ops.alias_inplace_update(
                        tmp_k, decode_loop_step, tf.squeeze(k, axis=2))
                    k = cache["k"] = tf.transpose(tmp_k, perm=[1, 2, 0, 3])
                    tmp_v = tf.transpose(cache["v"], perm=[2, 0, 1, 3])
                    tmp_v = inplace_ops.alias_inplace_update(
                        tmp_v, decode_loop_step, tf.squeeze(v, axis=2))
                    v = cache["v"] = tf.transpose(tmp_v, perm=[1, 2, 0, 3])

        q = common_attention.split_heads(q, num_heads)
        if cache is None:
            k = common_attention.split_heads(k, num_heads)
            v = common_attention.split_heads(v, num_heads)

        key_depth_per_head = total_key_depth // num_heads
        if not vars_3d:
            if scale_dotproduct:
                q *= key_depth_per_head**-0.5

        additional_returned_value = None
        if callable(
                attention_type):  # Generic way to extend multihead_attention
            x = attention_type(q, k, v, **kwargs)
            if isinstance(x, tuple):
                x, additional_returned_value = x  # Unpack
        elif attention_type == "dot_product":
            x = common_attention.dot_product_attention(
                q,
                k,
                v,
                bias,
                dropout_rate,
                image_shapes,
                save_weights_to=save_weights_to,
                make_image_summary=make_image_summary,
                dropout_broadcast_dims=dropout_broadcast_dims)
        elif attention_type == "dot_product_relative":
            x = common_attention.dot_product_attention_relative(
                q,
                k,
                v,
                bias,
                max_relative_position,
                dropout_rate,
                image_shapes,
                make_image_summary=make_image_summary)
        elif attention_type == "dot_product_relative_v2":
            x = common_attention.dot_product_self_attention_relative_v2(
                q,
                k,
                v,
                bias,
                max_length,
                dropout_rate,
                image_shapes,
                make_image_summary=make_image_summary,
                dropout_broadcast_dims=dropout_broadcast_dims)
        elif attention_type == "local_within_block_mask_right":
            x = common_attention.masked_within_block_local_attention_1d(
                q, k, v, block_length=block_length)
        elif attention_type == "rel_local_mask_right":
            x = common_attention.masked_rel_local_attention_1d(
                q,
                k,
                v,
                block_length=block_length,
                make_image_summary=make_image_summary,
                dropout_rate=dropout_rate,
                share_rel_embed=shared_rel)
        elif attention_type == "local_mask_right":
            x = common_attention.masked_local_attention_1d(
                q,
                k,
                v,
                block_length=block_length,
                make_image_summary=make_image_summary)
        elif attention_type == "local_unmasked":
            x = common_attention.local_attention_1d(q,
                                                    k,
                                                    v,
                                                    block_length=block_length,
                                                    filter_width=block_width)
        elif attention_type == "masked_dilated_1d":
            x = common_attention.masked_dilated_self_attention_1d(
                q, k, v, block_length, block_width, gap_size,
                num_memory_blocks)
        else:
            assert attention_type == "unmasked_dilated_1d"
            x = common_attention.dilated_self_attention_1d(
                q, k, v, block_length, block_width, gap_size,
                num_memory_blocks)
        x = common_attention.combine_heads(x)

        # Set last dim specifically.
        x.set_shape(x.shape.as_list()[:-1] + [total_value_depth])

        if vars_3d:
            o_var = tf.get_variable(
                "o", [num_heads, total_value_depth // num_heads, output_depth])
            o_var = tf.cast(o_var, x.dtype)
            o_var = tf.reshape(o_var, [total_value_depth, output_depth])
            x = tf.tensordot(x, o_var, axes=1)
        else:
            x = common_layers.dense(x,
                                    output_depth,
                                    use_bias=False,
                                    name="output_transform")
        if additional_returned_value is not None:
            return x, additional_returned_value
        return x
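A hedged sketch of incremental decoding with the cache, using hypothetical sizes; `decoder_self_attention_bias` is assumed built upstream with future masking. Per the concat on axis 2 above, the cache holds split-head tensors that grow by one position per step:

batch, heads, depth = 4, 8, 512
cache = {
    "k": tf.zeros([batch, heads, 0, depth // heads]),
    "v": tf.zeros([batch, heads, 0, depth // heads]),
}
next_token_embedding = tf.zeros([batch, 1, depth])  # last decoded position only
y = multihead_attention(
    query_antecedent=next_token_embedding,
    memory_antecedent=None,                # decoder self-attention
    bias=decoder_self_attention_bias,      # must contain future masking
    total_key_depth=depth,
    total_value_depth=depth,
    output_depth=depth,
    num_heads=heads,
    dropout_rate=0.0,
    cache=cache)                           # y: [batch, 1, depth]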
Example #16
    def lagrangian_optimizer(train_set,
                             epsilon=epsilon,
                             learning_rate=0.01,
                             learning_rate_constraint=0.01,
                             loops=2000):
        tf.reset_default_graph()

        x_train, y_train, z_train = train_set
        num_examples = x_train.shape[0]
        dimension = x_train.shape[-1]

        # Data tensors.
        features_tensor = tf.constant(x_train.astype("float32"),
                                      name="features")
        labels_tensor = tf.constant(y_train.astype("float32"), name="labels")

        # Linear model.
        weights = tf.Variable(tf.zeros(dimension, dtype=tf.float32),
                              name="weights")
        threshold = tf.Variable(0, name="threshold", dtype=tf.float32)
        predictions_tensor = (
            tf.tensordot(features_tensor, weights, axes=(1, 0)) + threshold)

        predictions_group0 = tf.boolean_mask(predictions_tensor,
                                             mask=(z_train < 1))
        num0 = np.sum(z_train < 1)
        predictions_group1 = tf.boolean_mask(predictions_tensor,
                                             mask=(z_train > 0))
        num1 = np.sum(z_train > 0)

        # Set up rates.
        context = tfco.rate_context(predictions_tensor, labels_tensor)
        true_positive_rate = tfco.true_positive_rate(context)
        true_negative_rate = tfco.true_negative_rate(context)

        context0 = context.subset(z_train < 1)
        true_positive_rate0 = tfco.true_positive_rate(context0)

        context1 = context.subset(z_train > 0)
        true_positive_rate1 = tfco.true_positive_rate(context1)

        # Set up slack variables.
        slack_tpr = tf.Variable(0.5, dtype=tf.float32)
        slack_tnr = tf.Variable(0.5, dtype=tf.float32)

        # Projection ops for slacks.
        projection_ops = []
        projection_ops.append(
            tf.assign(slack_tpr, tf.clip_by_value(slack_tpr, 0.001, 0.999)))
        projection_ops.append(
            tf.assign(slack_tnr, tf.clip_by_value(slack_tnr, 0.001, 0.999)))

        # Set up 1 - G-mean objective.
        objective = tfco.wrap_rate(1.0 - tf.sqrt(slack_tpr * slack_tnr))

        # Set up slack constraints.
        constraints = []
        constraints.append(tfco.wrap_rate(slack_tpr) <= true_positive_rate)
        constraints.append(tfco.wrap_rate(slack_tnr) <= true_negative_rate)

        # Set up fairness equal-opportunity constraints.
        constraints.append(
            true_positive_rate0 <= true_positive_rate1 + epsilon)
        constraints.append(
            true_positive_rate1 <= true_positive_rate0 + epsilon)

        # Set up constraint optimization problem.
        problem = tfco.RateMinimizationProblem(objective, constraints)

        # Set up solver.
        optimizer = tf.train.AdamOptimizer(learning_rate)
        constraint_optimizer = tf.train.AdamOptimizer(learning_rate_constraint)
        lagrangian_optimizer = tfco.ProxyLagrangianOptimizerV1(
            optimizer=optimizer, constraint_optimizer=constraint_optimizer)
        train_op = lagrangian_optimizer.minimize(problem)

        # Start TF session and initialize variables.
        session = tf.Session()
        tf.set_random_seed(654321)  # Set random seed for reproducibility.
        session.run(tf.global_variables_initializer())

        # We maintain a list of objectives and model weights during training.
        objectives = []
        violations = []
        models = []

        # Perform full gradient updates.
        for ii in range(loops):
            # Gradient update.
            session.run(train_op)
            # Projection.
            session.run(projection_ops)

            # Checkpoint once every 100 iterations.
            if ii % 100 == 0:
                # Model weights.
                model = [session.run(weights), session.run(threshold)]
                models.append(model)

                # Snapshot performance.
                error, tpr0, tpr1 = evaluate_expected_results(
                    train_set, [model], [1.0])
                objectives.append(error)
                violations.append(
                    [tpr0 - tpr1 - epsilon, tpr1 - tpr0 - epsilon])

        # Use the recorded objectives and constraints to find the best iterate.
        # Best model
        best_iterate = tfco.find_best_candidate_index(np.array(objectives),
                                                      np.array(violations))
        best_model = models[best_iterate]

        # Stochastic model over a subset of classifiers.
        probabilities = tfco.find_best_candidate_distribution(
            np.array(objectives), np.array(violations))
        models_pruned = [
            models[i] for i in range(len(models)) if probabilities[i] > 0.0
        ]
        probabilities_pruned = probabilities[probabilities > 0.0]

        # Stochastic model over all classifiers.
        probabilities_all = probabilities * 0.0 + 1.0 / len(probabilities)

        # Return Pruned models, Avg models, Best model
        results = {
            'stochastic': (models, probabilities_all),
            'pruned': (models_pruned, probabilities_pruned),
            'best': best_model,
            'objectives': objectives,
            'violations': violations
        }
        return results
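The 'stochastic' and 'pruned' entries above pair a list of linear models with a probability vector. A minimal sketch of scoring new points under such a mixture (hypothetical helper, not part of the source):

import numpy as np

def stochastic_scores(x, models, probabilities):
    # Expected linear score under the distribution over (weights, threshold).
    return sum(p * (x @ w + b) for p, (w, b) in zip(probabilities, models))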
Example #17
def smpl_model_batched(model_path, betas, pose, trans, simplify=False):
    """
  Construct a compute graph that takes in parameters and outputs a tensor as
  model vertices. Face indices are also returned as a numpy ndarray.

  Parameters:
  ---------
  pose: Also known as 'theta', a [24,3] tensor indicating child joint rotation
  relative to parent joint. For root joint it's global orientation.
  Represented in a axis-angle format.

  betas: Parameter for model shape. A tensor of shape [10] as coefficients of
  PCA components. Only 10 components were released by SMPL author.

  trans: Global translation tensor of shape [3].

  Return:
  ------
  A tensor for vertices, and a numpy ndarray as face indices.

  """
    # For detailed comments see smpl_np.py
    with open(model_path, 'rb') as f:
        params = pickle.load(f)

    J_regressor = tf.constant(
        np.array(params['J_regressor'].todense(), dtype=np.float64))
    weights = tf.constant(params['weights'], dtype=np.float64)
    posedirs = tf.constant(params['posedirs'], dtype=np.float64)
    v_template = tf.constant(params['v_template'], dtype=np.float64)
    shapedirs = tf.constant(params['shapedirs'], dtype=np.float64)
    f = params['f']

    kintree_table = params['kintree_table']
    id_to_col = {kintree_table[1, i]: i for i in range(kintree_table.shape[1])}
    parent = {
        i: id_to_col[kintree_table[0, i]]
        for i in range(1, kintree_table.shape[1])
    }

    v_shaped = tf.tensordot(betas, shapedirs, axes=[[1], [2]]) + v_template
    J = tf.matmul(J_regressor, v_shaped)
    pose_cube = tf.reshape(pose, (-1, 1, 3))
    R_cube_big = rodrigues(pose_cube)
    if simplify:
        v_posed = v_shaped
    else:
        R_cube = R_cube_big[1:]
        I_cube = tf.expand_dims(tf.eye(3, dtype=tf.float64), axis=0) + \
                 tf.zeros((R_cube.get_shape()[0], 3, 3), dtype=tf.float64)
        lrotmin = tf.squeeze(tf.reshape((R_cube - I_cube), (-1, 1)))
        v_posed = v_shaped + tf.tensordot(lrotmin, posedirs, axes=[[1], [2]])
    results = []
    results.append(
        with_zeros(
            tf.concat((R_cube_big[0], tf.reshape(J[0, :], (3, 1))), axis=1)))
    for i in range(1, kintree_table.shape[1]):
        results.append(
            tf.matmul(
                results[parent[i]],
                with_zeros(
                    tf.concat((R_cube_big[i],
                               tf.reshape(J[i, :] - J[parent[i], :], (3, 1))),
                              axis=1))))
    stacked = tf.stack(results, axis=0)
    results = stacked - \
              pack(
                tf.matmul(
                  stacked,
                  tf.reshape(
                    tf.concat((J, tf.zeros((24, 1), dtype=tf.float64)), axis=1),
                    (24, 4, 1)
                  )
                )
              )
    T = tf.tensordot(weights, results, axes=((1), (0)))
    rest_shape_h = tf.concat(
        (v_posed,
         tf.ones((v_posed.get_shape().as_list()[0], 1), dtype=tf.float64)),
        axis=1)
    v = tf.matmul(T, tf.reshape(rest_shape_h, (-1, 4, 1)))
    v = tf.reshape(v, (-1, 4))[:, :3]
    result = v + tf.reshape(trans, (1, 3))
    return result
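The parent map above is read off `kintree_table`, whose row 0 holds parent ids and row 1 joint ids. A toy NumPy sketch with a hypothetical four-joint table:

import numpy as np

kintree_table = np.array([[4294967295, 0, 0, 1],   # parent ids (root has none)
                          [0,          1, 2, 3]])  # joint ids
id_to_col = {kintree_table[1, i]: i for i in range(kintree_table.shape[1])}
parent = {i: id_to_col[kintree_table[0, i]]
          for i in range(1, kintree_table.shape[1])}
# parent == {1: 0, 2: 0, 3: 1}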
Example #18
 def step_state(state):
     return state + tf.reduce_sum(
         input_tensor=tf.tensordot(data, state, ([1], [1])))
Example #19
def runMoreRnn(path=None, epochs=10, saveResult=True):
	trainData, validData, testData, wordId = loadWordIdsFromFiles()
	trainData = np.array(trainData, np.float32)
	# validData = np.array(validData, np.float32)
	testData = np.array(testData, np.float32)
	vocabSz = len(wordId)

	info = loadInfo('rnn', path)
	learnRate = info['learning rate']
	batchSz = info['batch size']
	embedSz = info['embed size']
	rnnSz = info['rnn size']
	winSz = info['win size']
	numWin = (trainData.shape[0] - 1) // (batchSz * winSz)
	# each batch has winSz * numWin words
	batchLen = winSz * numWin

	testNumWin = (testData.shape[0] - 1) // (batchSz * winSz)
	testBatchLen = winSz * testNumWin

	inp = tf.placeholder(tf.int32, shape=[batchSz, winSz])
	# ans = tf.placeholder(tf.int32, shape=[batchSz * winSz])
	ans = tf.placeholder(tf.int32, shape=[batchSz, winSz])

	E = tf.Variable(tf.random_normal([vocabSz, embedSz], stddev=0.1))
	embed = tf.nn.embedding_lookup(E, inp)

	rnn = BasicRNNCell(rnnSz, activation='relu')
	initialState = rnn.zero_state(batchSz, tf.float32)
	output, nextState = tf.nn.dynamic_rnn(rnn, embed, initial_state=initialState)
	# output = tf.reshape(output, [batchSz * winSz, rnnSz])

	W = tf.Variable(tf.random_normal([rnnSz, vocabSz], stddev=.1))
	B = tf.Variable(tf.random_normal([vocabSz], stddev=.1))
	# logits = tf.matmul(output, W) + B
	logits = tf.tensordot(output, W, [[2], [0]]) + B

	ents = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=ans)
	loss = tf.reduce_sum(ents)
	train = tf.train.GradientDescentOptimizer(learnRate).minimize(loss)

	trainPerp = np.zeros(epochs + 1, dtype=np.float32)
	trainPerp[0] = info['train perplexity']
	testPerp = np.zeros(epochs + 1, dtype=np.float32)
	testPerp[0] = info['test perplexity']
	with tf.Session() as sess:
		loadSession(sess, 'rnn', path)
		startTime = time.time()
		epoch = 0
		print('epoch:', end=' ')
		while epoch < epochs:
			epoch += 1
			win = 0
			state = sess.run(initialState)
			testState = sess.run(initialState)
			# print(state, testState)
			winStart, winEnd = 0, winSz
			while win < numWin:
				inInp = np.array([trainData[i * batchLen + winStart:i * batchLen + winEnd] for i in range(batchSz)])
				# inAns = np.reshape(np.array([trainData[i * batchLen + winStart + 1: i * batchLen + winEnd + 1] for i in range(batchSz)]), batchSz * winSz)
				inAns = np.array([trainData[i * batchLen + winStart + 1: i * batchLen + winEnd + 1] for i in range(batchSz)])
				_, state, outLoss = sess.run([train, nextState, loss], {inp: inInp, ans: inAns, initialState: state})
				trainPerp[epoch] += outLoss
				if win < testNumWin:
					inInp = np.array([testData[i * testBatchLen + winStart:i * testBatchLen + winEnd] for i in range(batchSz)])
					# inAns = np.reshape(np.array([testData[i * testBatchLen + winStart + 1: i * testBatchLen + winEnd + 1] for i in range(batchSz)]), batchSz * winSz)
					inAns = np.array([testData[i * testBatchLen + winStart + 1: i * testBatchLen + winEnd + 1] for i in range(batchSz)])
					testState, testOutLoss = sess.run([nextState, loss], {inp: inInp, ans: inAns, initialState: testState})
					testPerp[epoch] += testOutLoss
				winStart, winEnd = winEnd, winEnd + winSz
				win += 1
			print(epoch + info['epochs'], end=' ')
		trainPerp[1:] = np.exp(trainPerp[1:] / (trainData.shape[0] // (batchSz * batchLen) * (batchSz * batchLen)))
		testPerp[1:] = np.exp(testPerp[1:] / (testData.shape[0] // (batchSz * testBatchLen) * (batchSz * testBatchLen)))
		print(f'\nelapsed: {time.time() - startTime}')
		print('train perplexity:', trainPerp[-1])
		print('test perplexity:', testPerp[-1])

		info['epochs'] += epochs
		info['train perplexity'] = trainPerp[-1]
		info['test perplexity'] = testPerp[-1]
		if saveResult:
			save(sess, info)
	drawPerplexity(trainPerp, testPerp, info['epochs'] - epochs)
Example #20
    for l in range(num_hidden_layers):
        current_layer = tf.layers.dense(previous_layer,
                                        num_hidden_neurons[l],
                                        activation=tf.nn.sigmoid)
        previous_layer = current_layer

    dnn_output = tf.layers.dense(previous_layer, matrix_size)

with tf.name_scope('loss'):
    print("dnn_output = ", dnn_output)

    x_trial = tf.transpose(dnn_output)
    print("x_trial = ", x_trial)

    temp1 = (tf.tensordot(tf.transpose(x_trial), x_trial, axes=1) * A)
    temp2 = (1 - tf.tensordot(tf.transpose(x_trial),
                              tf.tensordot(A, x_trial, axes=1),
                              axes=1)) * np.eye(matrix_size)
    func = tf.tensordot((temp1 - temp2), x_trial, axes=1)

    print(temp1)
    print(temp2)
    print(func)

    func = tf.transpose(func)
    x_trial = tf.transpose(x_trial)

    loss = tf.losses.mean_squared_error(func, x_trial)

learning_rate = 0.001
Example #21
 def option_values(values, policy):
   return tf.tensordot(
       values[:, policy, Ellipsis], self._policy_weights[policy], axes=[1, 0])
Example #22
  def testPCgradBasic(self,
                      denylist,
                      allowlist,
                      pcgrad_var_idx):
    tf.disable_eager_execution()
    for dtype in [tf.dtypes.float32, tf.dtypes.float64]:
      with self.session(graph=tf.Graph()):
        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
        const0_np = np.array([1., 0.], dtype=dtype.as_numpy_dtype)
        const1_np = np.array([-1., -1.], dtype=dtype.as_numpy_dtype)
        const2_np = np.array([-1., 1.], dtype=dtype.as_numpy_dtype)

        var0 = tf.Variable(var0_np, dtype=dtype, name='first_var/var0')
        var1 = tf.Variable(var1_np, dtype=dtype, name='second_var/var1')
        const0 = tf.constant(const0_np)
        const1 = tf.constant(const1_np)
        const2 = tf.constant(const2_np)
        loss0 = tf.tensordot(var0, const0, 1) + tf.tensordot(var1, const2, 1)
        loss1 = tf.tensordot(var0, const1, 1) + tf.tensordot(var1, const0, 1)

        learning_rate = lambda: 0.001
        opt = tf.train.GradientDescentOptimizer(learning_rate)
        losses = loss0 + loss1
        opt_grads = opt.compute_gradients(losses, var_list=[var0, var1])

        pcgrad_opt = pcgrad.PCGrad(
            tf.train.GradientDescentOptimizer(learning_rate),
            denylist=denylist,
            allowlist=allowlist)
        pcgrad_col_opt = pcgrad.PCGrad(
            tf.train.GradientDescentOptimizer(learning_rate),
            use_collection_losses=True,
            denylist=denylist,
            allowlist=allowlist)
        losses = [loss0, loss1]
        pcgrad_grads = pcgrad_opt.compute_gradients(
            losses, var_list=[var0, var1])
        tf.add_to_collection(pcgrad.PCGRAD_LOSSES_COLLECTION, loss0)
        tf.add_to_collection(pcgrad.PCGRAD_LOSSES_COLLECTION, loss1)
        pcgrad_grads_collection = pcgrad_col_opt.compute_gradients(
            None, var_list=[var0, var1])

        with tf.Graph().as_default():
          # Shouldn't return non-slot variables from other graphs.
          self.assertEmpty(opt.variables())

        self.evaluate(tf.global_variables_initializer())
        grad_vec, pcgrad_vec, pcgrad_col_vec = self.evaluate(
            [opt_grads, pcgrad_grads, pcgrad_grads_collection])
        # Make sure that both methods take grads of the same vars.
        self.assertAllCloseAccordingToType(pcgrad_vec, pcgrad_col_vec)

        results = [{
            'var': var0,
            'pcgrad_vec': [0.5, -1.5],
            'result': [0.9995, 2.0015]
        }, {
            'var': var1,
            'pcgrad_vec': [0.5, 1.5],
            'result': [2.9995, 3.9985]
        }]
        grad_var_idx = {0, 1}.difference(pcgrad_var_idx)

        self.assertAllCloseAccordingToType(
            grad_vec[0][0], [0.0, -1.0], atol=1e-5)
        self.assertAllCloseAccordingToType(
            grad_vec[1][0], [0.0, 1.0], atol=1e-5)
        pcgrad_vec_idx = 0
        for var_idx in pcgrad_var_idx:
          self.assertAllCloseAccordingToType(
              pcgrad_vec[pcgrad_vec_idx][0],
              results[var_idx]['pcgrad_vec'],
              atol=1e-5)
          pcgrad_vec_idx += 1

        for var_idx in grad_var_idx:
          self.assertAllCloseAccordingToType(
              pcgrad_vec[pcgrad_vec_idx][0], grad_vec[var_idx][0], atol=1e-5)
          pcgrad_vec_idx += 1

        self.evaluate(opt.apply_gradients(pcgrad_grads))
        self.assertAllCloseAccordingToType(
            self.evaluate([results[idx]['var'] for idx in pcgrad_var_idx]),
            [results[idx]['result'] for idx in pcgrad_var_idx])
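A NumPy sketch of the projection the expected values above encode: when two task gradients conflict (negative dot product), PCGrad projects each onto the normal plane of the other before summing. With the test's gradients for var0 this reproduces the expected [0.5, -1.5]:

import numpy as np

def pcgrad_sum(grads):
    projected = []
    for i, g in enumerate(grads):
        g = g.astype(float).copy()
        for j, other in enumerate(grads):
            if i != j and np.dot(g, other) < 0:
                g -= np.dot(g, other) / np.dot(other, other) * other
        projected.append(g)
    return sum(projected)

# var0: grad of loss0 is const0 = [1, 0]; grad of loss1 is const1 = [-1, -1].
print(pcgrad_sum([np.array([1., 0.]), np.array([-1., -1.])]))  # [ 0.5 -1.5]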
Example #23
def apply_ccm(image, ccm):
    """Applies a color correction matrix."""
    shape = tf.shape(image)
    image = tf.reshape(image, [-1, 3])
    image = tf.tensordot(image, ccm, axes=[[-1], [-1]])
    return tf.reshape(image, shape)
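With axes=[[-1], [-1]], the contraction runs over the channel axis of `image` and the last axis of `ccm`, i.e. out = image @ ccm^T. A hypothetical usage sketch (TF1):

image = tf.random_uniform([8, 8, 3])
ccm = tf.constant([[1.2, -0.1, -0.1],
                   [-0.1, 1.2, -0.1],
                   [-0.1, -0.1, 1.2]])
corrected = apply_ccm(image, ccm)  # shape [8, 8, 3]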
Example #24
def broadcast_matmul_train(x,
                           variational_params,
                           clip_alpha=None,
                           eps=common.EPSILON):
    R"""Training computation for VD matrix multiplication with N input matrices.

  Multiplies a 3D tensor `x` with a set of 2D parameters. Each 2D matrix
  `x[i, :, :]` in the input tensor is multiplied independently with the
  parameters, resulting in a 3D output tensor with shape
  `x.shape[:2] + variational_params[0].shape[1]`.

  Args:
    x: 3D Tensor representing the input batch.
    variational_params: 2-tuple of Tensors, where the first tensor is the
      unscaled weight values and the second is the log of the alpha values
      for the hard concrete distribution.
    clip_alpha: Int or None. If integer, we clip the log \alpha values to
      [-clip_alpha, clip_alpha]. If None, don't clip the values.
    eps: Small constant value to use in log and sqrt operations to avoid NaNs.

  Returns:
    Output Tensor of the batched matmul operation.

  Raises:
    RuntimeError: If the variational_params argument is not a 2-tuple.
  """
    theta, log_sigma2 = _verify_variational_params(variational_params)
    theta.get_shape().assert_has_rank(2)
    log_sigma2.get_shape().assert_has_rank(2)

    # The input data must be rank 2 or greater.
    assert x.get_shape().ndims >= 2
    input_rank = x.get_shape().ndims

    if clip_alpha is not None:
        # Compute the log_alphas and then compute the
        # log_sigma2 again so that we can clip on the
        # log alpha magnitudes
        log_alpha = common.compute_log_alpha(log_sigma2, theta, eps,
                                             clip_alpha)
        log_sigma2 = common.compute_log_sigma2(log_alpha, theta, eps)

    # Compute the mean and standard deviation of the distributions over the
    # activations
    mu_activation = tf.tensordot(x, theta, [[input_rank - 1], [0]])

    var_activation = tf.tensordot(tf.square(x), tf.exp(log_sigma2),
                                  [[input_rank - 1], [0]])
    std_activation = tf.sqrt(var_activation + eps)

    # Reshape the output back to the rank of the input
    input_shape = x.get_shape().as_list()
    weight_shape = theta.get_shape().as_list()
    output_shape = input_shape[:-1] + [weight_shape[1]]
    mu_activation.set_shape(output_shape)
    std_activation.set_shape(output_shape)

    # NOTE: We sample noise for each weight in theta, which will be shared by
    # each matrix product that was done. This is equivalent to sampling the same
    # set of weights for all matrix products done by this op in an iteration.
    # The element-wise multiply below broadcasts.
    num_pad_dims = len(output_shape) - 2
    padding = [tf.constant(1, dtype=tf.int32) for _ in range(num_pad_dims)]

    # NOTE: On GPU, the first dim may not be defined w/ the Transformer. Create
    # a tf.Tensor from the list shape and TF should match the first dim
    # appropriately
    batch_size = tf.shape(x)[0]
    data_dim = tf.shape(theta)[-1]
    noise_shape = tf.stack([batch_size] + padding + [data_dim], axis=0)

    output = mu_activation + std_activation * tf.random_normal(noise_shape)
    return output
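A NumPy sketch of the local reparameterization used above, under the usual assumption of independent Gaussian weights: sampling activations as mu + sigma * eps matches, in distribution, multiplying by weights drawn from N(theta, sigma^2). Shapes here are hypothetical:

import numpy as np

x = np.random.randn(4, 5)                 # batch of inputs
theta = np.random.randn(5, 3)             # weight means
log_sigma2 = np.random.randn(5, 3) - 5.0  # log weight variances
mu = x @ theta
std = np.sqrt(x**2 @ np.exp(log_sigma2) + 1e-8)
out = mu + std * np.random.randn(4, 3)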
Example #25
    def _static_subsample(self, indicator, batch_size, labels):
        """Returns subsampled minibatch.

    Args:
      indicator: boolean tensor of shape [N] whose True entries can be sampled.
        N should be a compile-time constant.
      batch_size: desired batch size. This scalar cannot be None.
      labels: boolean tensor of shape [N] denoting positive(=True) and negative
        (=False) examples. N should be a compile-time constant.

    Returns:
      sampled_idx_indicator: boolean tensor of shape [N], True for entries which
        are sampled. It ensures the length of output of the subsample is always
        batch_size, even when number of examples set to True in indicator is
        less than batch_size.

    Raises:
      ValueError: if labels and indicator are not 1D boolean tensors.
    """
        # Check if indicator and labels have a static size.
        if not indicator.shape.is_fully_defined():
            raise ValueError('indicator must be static in shape when '
                             'is_static is True')
        if not labels.shape.is_fully_defined():
            raise ValueError('labels must be static in shape when '
                             'is_static is True')
        if not isinstance(batch_size, int):
            raise ValueError('batch_size has to be an integer when '
                             'is_static is True.')

        input_length = tf.shape(indicator)[0]

        # Set the number of examples set True in indicator to be at least
        # batch_size.
        num_true_sampled = tf.reduce_sum(tf.cast(indicator, tf.float32))
        additional_false_sample = tf.less_equal(
            tf.cumsum(tf.cast(tf.logical_not(indicator), tf.float32)),
            batch_size - num_true_sampled)
        indicator = tf.logical_or(indicator, additional_false_sample)

        # Shuffle indicator and label. Need to store the permutation to restore the
        # order post sampling.
        permutation = tf.random_shuffle(tf.range(input_length))
        indicator = ops.matmul_gather_on_zeroth_axis(
            tf.cast(indicator, tf.float32), permutation)
        labels = ops.matmul_gather_on_zeroth_axis(tf.cast(labels, tf.float32),
                                                  permutation)

        # index (starting from 1) when indicator is True, 0 when False
        indicator_idx = tf.where(tf.cast(indicator, tf.bool),
                                 tf.range(1, input_length + 1),
                                 tf.zeros(input_length, tf.int32))

        # Replace -1 for negative, +1 for positive labels
        signed_label = tf.where(
            tf.cast(labels, tf.bool), tf.ones(input_length, tf.int32),
            tf.scalar_mul(-1, tf.ones(input_length, tf.int32)))
        # negative of index for negative label, positive index for positive label,
        # 0 when indicator is False.
        signed_indicator_idx = tf.multiply(indicator_idx, signed_label)
        sorted_signed_indicator_idx = tf.nn.top_k(signed_indicator_idx,
                                                  input_length,
                                                  sorted=True).values

        [num_positive_samples, num_negative_samples
         ] = self._get_num_pos_neg_samples(sorted_signed_indicator_idx,
                                           batch_size)

        sampled_idx = self._get_values_from_start_and_end(
            sorted_signed_indicator_idx, num_positive_samples,
            num_negative_samples, batch_size)

        # Shift the indices to start from 0 and remove any samples that are set as
        # False.
        sampled_idx = tf.abs(sampled_idx) - tf.ones(batch_size, tf.int32)
        sampled_idx = tf.multiply(
            tf.cast(tf.greater_equal(sampled_idx, tf.constant(0)), tf.int32),
            sampled_idx)

        sampled_idx_indicator = tf.cast(
            tf.reduce_sum(tf.one_hot(sampled_idx, depth=input_length), axis=0),
            tf.bool)

        # project back the order based on stored permutations
        reprojections = tf.one_hot(permutation,
                                   depth=input_length,
                                   dtype=tf.float32)
        return tf.cast(
            tf.tensordot(tf.cast(sampled_idx_indicator, tf.float32),
                         reprojections,
                         axes=[0, 0]), tf.bool)
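A NumPy sketch of the final reprojection above: a one-hot matrix built from the shuffle permutation, contracted with tensordot over axis 0, restores the pre-shuffle order (toy sizes):

import numpy as np

perm = np.array([2, 0, 1])              # shuffled[j] = original[perm[j]]
sampled = np.array([1., 0., 1.])        # sampled indicator, shuffled order
reprojections = np.eye(3)[perm]         # one_hot(permutation)
restored = np.tensordot(sampled, reprojections, axes=[0, 0])
# restored == [0., 1., 1.]  (original order)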
Example #26
        def attention(ratelayer, inputs, tag, attention_size=32):
            ratelayer.attention_size = attention_size
            ratelayer.tag = tag
            if isinstance(inputs, tuple):
                print("Attention layer - inputs is tuple, concat")
                # In case of Bi-RNN, concatenate the forward and the backward RNN outputs.
                inputs = tf.concat(inputs, 2)

            if ratelayer.time_major:
                # (T,B,D) => (B,T,D)
                inputs = tf.transpose(inputs, [1, 0, 2])

            hidden_size = inputs.shape[
                2].value  # D value - hidden size of the RNN layer
            print("hidden_size in attention layer", hidden_size)
            print("Att input shape", inputs.shape)

            # Trainable parameters
            with tf.variable_scope('v_' + ratelayer.tag):
                w_omega = tf.get_variable(initializer=tf.random_normal(
                    [hidden_size + FLAGS.latent_dim, ratelayer.attention_size],
                    stddev=0.1),
                                          name='w_omega')
                ratelayer.vars['w_omega'] = w_omega
                # b_omega = tf.get_variable(initializer=tf.random_normal(
                #     [ratelayer.attention_size], stddev=0.1), name='b_omega')
                # ratelayer.vars['b_omega'] = b_omega
                u_omega = tf.get_variable(initializer=tf.random_normal(
                    [ratelayer.attention_size], stddev=0.1),
                                          name='u_omega')
                ratelayer.vars['u_omega'] = u_omega
                b_v = tf.get_variable(initializer=tf.random_normal([1],
                                                                   stddev=0.1),
                                      name='b_v')
                ratelayer.vars['b_v'] = b_v
                # init for projection vars
                ratelayer.vars['project_' + ratelayer.tag] = tf.get_variable(
                    initializer=tf.random_normal(
                        [FLAGS.latent_dim, FLAGS.latent_dim], stddev=0.1),
                    name='project_' + ratelayer.tag + '_matrix')
                ratelayer.vars['project_bias_' +
                               ratelayer.tag] = tf.get_variable(
                                   initializer=tf.random_normal(
                                       [FLAGS.latent_dim], stddev=0.1),
                                   name='b_projection_' + ratelayer.tag)

            # transform and tile
            ratelayer.vars['projected_' + ratelayer.tag + '_latent'] = \
                dot(ratelayer.vars[ratelayer.tag + '_latent'], ratelayer.vars['project_' + ratelayer.tag]) \
                + ratelayer.vars['project_bias_' + ratelayer.tag]
            ratelayer.vars['projected_' + ratelayer.tag + '_latent'] = \
                tf.nn.sigmoid(ratelayer.vars['projected_'+ratelayer.tag+'_latent'])
            projected_latent = tf.tile(
                tf.expand_dims(ratelayer.vars['projected_' + ratelayer.tag +
                                              '_latent'],
                               axis=0), [inputs.shape[0], 1, 1])

            # concat and non-linear attention additive one like
            # in https://lilianweng.github.io/lil-log/2018/06/24/attention-attention.html
            v1 = tf.concat([inputs, projected_latent], axis=2)
            v = tf.tanh(tf.tensordot(v1, w_omega, axes=1))
            vu = tf.tensordot(v, u_omega, axes=1, name='vu')

            # For each of the timestamps its vector of size A from `v` is reduced with `u` vector
            print("vu shape", vu.shape)  # vu shape (4, 2005)
            alphas = tf.nn.softmax(vu, name='alphas', axis=0)  # (B,T) shape

            # Output of (Bi-)RNN is reduced with attention vector; the result has (B,D) shape
            output = tf.reduce_sum(inputs * tf.expand_dims(alphas, -1), 0)
            return output, alphas
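A NumPy sketch of the additive scoring above: concatenated features pass through a tanh layer, are reduced against the context vector `u_omega`, and the softmaxed scores weight the inputs (here softmaxing over the time axis, the common layout; shapes are hypothetical):

import numpy as np

B, T, D, A = 2, 5, 8, 4
inputs = np.random.randn(B, T, D)
w_omega = np.random.randn(D, A)
u_omega = np.random.randn(A)
v = np.tanh(np.tensordot(inputs, w_omega, axes=1))  # [B, T, A]
vu = np.tensordot(v, u_omega, axes=1)               # [B, T]
alphas = np.exp(vu) / np.exp(vu).sum(axis=1, keepdims=True)
output = (inputs * alphas[..., None]).sum(axis=1)   # [B, D]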
Example #27
    def body(self, features):
        hparams = self._hparams
        ps_devices = self._ps_devices
        single_device = (len(ps_devices) == 1)
        assert hparams.num_model_shards % len(ps_devices) == 0
        shards_per_device = hparams.num_model_shards // len(ps_devices)
        model_devices = [
            ps_devices[i // shards_per_device]
            for i in range(hparams.num_model_shards)
        ]
        print("model_devices = %s" % model_devices)
        mp = expert_utils.Parallelism(model_devices, reuse=False)
        targets_vocab_size = self._problem_hparams.vocabulary[
            "targets"].vocab_size
        # squeeze out channels, heights
        targets = tf.squeeze(features["targets_raw"], [2, 3])
        targets_embedding_var = mp(
            tf.get_variable,
            "embedding", [[targets_vocab_size, hparams.model_d]] * mp.n,
            initializer=tf.random_normal_initializer(0.0,
                                                     hparams.model_d**-0.5))
        shifted_targets = common_layers.shift_right_2d(targets)
        # Bypass the symbol modality and use a different embedding on each shard.
        if single_device:
            targets_embedding_var_combined = tf.concat(targets_embedding_var,
                                                       1)
            decoder_input_combined = common_layers.embedding(
                shifted_targets,
                targets_vocab_size,
                hparams.model_d * mp.n,
                multiplier=hparams.model_d**0.5,
                embedding_var=targets_embedding_var_combined,
            )
            decoder_input = tf.split(decoder_input_combined, mp.n, axis=2)
        else:
            targets_embedding_var_combined = None
            decoder_input = mp(
                common_layers.embedding,
                shifted_targets,
                targets_vocab_size,
                hparams.model_d,
                multiplier=hparams.model_d**0.5,
                embedding_var=targets_embedding_var,
            )
        decoder_self_attention_bias = mp(
            common_attention.attention_bias_lower_triangle,
            tf.shape(targets)[1])
        if "targets_segmentation" in features:
            # "Packed" dataset - keep the examples from seeing each other.
            targets_segmentation = features["targets_segmentation"]
            targets_position = features["targets_position"]
            decoder_self_attention_bias = mp(
                tf.add, decoder_self_attention_bias,
                mp(common_attention.attention_bias_same_segment,
                   targets_segmentation, targets_segmentation))
            decoder_input = mp(
                common_attention.add_timing_signal_1d_given_position,
                decoder_input, targets_position)
        else:
            targets_position = None
            decoder_input = mp(common_attention.add_timing_signal_1d,
                               decoder_input)

        if self.has_input:
            inputs = tf.squeeze(features["inputs_raw"], [2, 3])
            inputs_vocab_size = self._problem_hparams.vocabulary[
                "inputs"].vocab_size
            # share everything for now
            share_inputs_and_targets_embedding = True
            if share_inputs_and_targets_embedding:
                assert inputs_vocab_size == targets_vocab_size
                inputs_embedding_var = targets_embedding_var
                inputs_embedding_var_combined = targets_embedding_var_combined
            if single_device:
                encoder_input_combined = common_layers.embedding(
                    inputs,
                    inputs_vocab_size,
                    hparams.model_d * mp.n,
                    multiplier=hparams.model_d**0.5,
                    embedding_var=inputs_embedding_var_combined,
                )
                encoder_input = tf.split(encoder_input_combined, mp.n, axis=2)
            else:
                encoder_input = mp(
                    common_layers.embedding,
                    inputs,
                    inputs_vocab_size,
                    hparams.model_d,
                    multiplier=hparams.model_d**0.5,
                    embedding_var=inputs_embedding_var,
                )
            if "inputs_segmentation" in features:
                # "Packed" dataset - keep the examples from seeing each other.
                inputs_segmentation = features["inputs_segmentation"]
                inputs_position = features["inputs_position"]
                encoder_self_attention_bias = mp(
                    common_attention.attention_bias_same_segment,
                    inputs_segmentation, inputs_segmentation)
                encoder_decoder_attention_bias = mp(
                    common_attention.attention_bias_same_segment,
                    targets_segmentation, inputs_segmentation)
                encoder_input = mp(
                    common_attention.add_timing_signal_1d_given_position,
                    encoder_input, inputs_position)
            else:
                encoder_padding = tf.to_float(tf.equal(inputs, 0))
                ignore_padding = common_attention.attention_bias_ignore_padding(
                    encoder_padding)
                encoder_self_attention_bias = ignore_padding
                encoder_decoder_attention_bias = ignore_padding
                inputs_position = None
                encoder_input = mp(common_attention.add_timing_signal_1d,
                                   encoder_input)

            # encoder stack here
            with tf.variable_scope("encoder"):
                encoder_input = mp(tf.nn.dropout, encoder_input,
                                   1.0 - hparams.layer_prepostprocess_dropout)
                encoder_output = _layer_stack(mp, encoder_input,
                                              encoder_self_attention_bias,
                                              hparams.encoder_layers, hparams)
        else:
            encoder_decoder_attention_bias = None
            encoder_output = None

        with tf.variable_scope("decoder"):
            decoder_input = mp(tf.nn.dropout, decoder_input,
                               1.0 - hparams.layer_prepostprocess_dropout)
            decoder_output = _layer_stack(
                mp,
                decoder_input,
                decoder_self_attention_bias,
                layers=hparams.decoder_layers,
                hparams=hparams,
                encoder_output=encoder_output,
                encoder_decoder_attention_bias=encoder_decoder_attention_bias)

        # Bypass the symbol modality and compute logits directly.
        # We compute a different set of logits on each shard, and sum them.
        # Share the weights with the target embedding.
        output_var = targets_embedding_var
        output_var_combined = targets_embedding_var_combined
        if single_device:
            decoder_output = tf.concat(decoder_output, 2)
            logits = tf.tensordot(decoder_output, output_var_combined,
                                  [[2], [1]])
            num, denom = common_layers.padded_cross_entropy(
                logits, targets, hparams.label_smoothing)
            training_loss = num / denom
        else:
            logits = mp(tf.tensordot, decoder_output, output_var,
                        [[[2], [1]]] * mp.n)
            logits = expert_utils.all_reduce_ring(logits, mp)
            # On each device, we compute the loss for a part of the batch.
            # This is faster than computing the whole loss on one shard.
            mp, logits = expert_utils.reduce_by_device(mp, logits,
                                                       lambda l: l[0])

            def _loss_for_shard(logits, targets, shard):
                logits = common_layers.approximate_split(logits, mp.n,
                                                         0)[shard]
                targets = common_layers.approximate_split(targets, mp.n,
                                                          0)[shard]
                return common_layers.padded_cross_entropy(
                    logits, targets, hparams.label_smoothing)

            num, denom = mp(_loss_for_shard, logits, targets, range(mp.n))
            training_loss = tf.add_n(num) / tf.add_n(denom)
            logits = logits[0]
        logits = tf.expand_dims(tf.expand_dims(logits, 2), 3)
        # override training loss so that it is not computed externally.
        losses = {"training": training_loss}
        return logits, losses
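The attention half of this example boils down to two tensordot contractions
followed by a softmax. Below is a minimal NumPy sketch of that reduction; the
latent projection and concat are omitted for brevity, the names w_omega and
u_omega mirror the snippet, and all shapes are illustrative, assuming
time-major (T, B, D) inputs.

import numpy as np

T, B, D, A = 5, 4, 8, 16  # timesteps, batch, feature dim, attention dim
rng = np.random.default_rng(0)

inputs = rng.standard_normal((T, B, D)).astype(np.float32)
w_omega = rng.standard_normal((D, A)).astype(np.float32)
u_omega = rng.standard_normal((A,)).astype(np.float32)

v = np.tanh(np.tensordot(inputs, w_omega, axes=1))  # (T, B, A)
vu = np.tensordot(v, u_omega, axes=1)               # (T, B): one score per timestep
e = np.exp(vu - vu.max(axis=0))                     # numerically stable softmax
alphas = e / e.sum(axis=0)                          # normalized over the time axis
output = (inputs * alphas[..., None]).sum(axis=0)   # (T, B, D) -> (B, D) weighted sum

assert output.shape == (B, D)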
Example #28
def graph_fn(tensora, tensorb):
    # Contract the last axis of `tensora` with the first axis of `tensorb`.
    return tf.tensordot(tensora, tensorb, axes=1)
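For reference, axes=1 contracts the last axis of the first argument with the
first axis of the second, so for a matrix and a vector it is an ordinary
matrix-vector product. A quick sanity check with NumPy's np.tensordot, which
follows the same convention:

import numpy as np

a = np.arange(6.0).reshape(2, 3)  # shape (2, 3)
b = np.arange(3.0)                # shape (3,)

# axes=1 sums over the last axis of `a` and the first axis of `b`.
assert np.allclose(np.tensordot(a, b, axes=1), a @ b)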
Example #29
def boolean_mask(boxlist,
                 indicator,
                 fields=None,
                 scope=None,
                 use_static_shapes=False,
                 indicator_sum=None):
    """Select boxes from BoxList according to indicator and return new BoxList.

  `boolean_mask` returns the subset of boxes that are marked as "True" by the
  indicator tensor. By default, `boolean_mask` returns boxes corresponding to
  the input index list, as well as all additional fields stored in the boxlist
  (indexing into the first dimension).  However one can optionally only draw
  from a subset of fields.

  Args:
    boxlist: BoxList holding N boxes
    indicator: a rank-1 boolean tensor
    fields: (optional) list of fields to also gather from.  If None (default),
      all fields are gathered from.  Pass an empty fields list to only gather
      the box coordinates.
    scope: name scope.
    use_static_shapes: Whether to use an implementation with static shape
      gurantees.
    indicator_sum: An integer containing the sum of `indicator` vector. Only
      required if `use_static_shape` is True.

  Returns:
    subboxlist: a BoxList corresponding to the subset of the input BoxList
      specified by indicator
  Raises:
    ValueError: if `indicator` is not a rank-1 boolean tensor.
  """
    with tf.name_scope(scope, 'BooleanMask'):
        if indicator.shape.ndims != 1:
            raise ValueError('indicator should have rank 1')
        if indicator.dtype != tf.bool:
            raise ValueError('indicator should be a boolean tensor')
        if use_static_shapes:
            if not (indicator_sum and isinstance(indicator_sum, int)):
                raise ValueError('`indicator_sum` must be of type int')
            selected_positions = tf.cast(indicator, dtype=tf.float32)
            indexed_positions = tf.cast(
                tf.multiply(tf.cumsum(selected_positions), selected_positions),
                dtype=tf.int32)
            one_hot_selector = tf.one_hot(indexed_positions - 1,
                                          indicator_sum,
                                          dtype=tf.float32)
            sampled_indices = tf.cast(
                tf.tensordot(tf.cast(tf.range(tf.shape(indicator)[0]),
                                     dtype=tf.float32),
                             one_hot_selector,
                             axes=[0, 0]),
                dtype=tf.int32)
            return gather(boxlist, sampled_indices, use_static_shapes=True)
        else:
            subboxlist = box_list.BoxList(
                tf.boolean_mask(boxlist.get(), indicator))
            if fields is None:
                fields = boxlist.get_extra_fields()
            for field in fields:
                if not boxlist.has_field(field):
                    raise ValueError(
                        'boxlist must contain all specified fields')
                subfieldlist = tf.boolean_mask(boxlist.get_field(field),
                                               indicator)
                subboxlist.add_field(field, subfieldlist)
            return subboxlist
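The static-shape branch above deserves unpacking: because tf.boolean_mask
produces a dynamically shaped result, it instead builds a one-hot selector from
the running count of True entries and recovers the selected indices with a
single tensordot. Here is a NumPy sketch of the same trick, assuming
indicator_sum matches the actual number of True entries (the extra mask
multiply is needed because NumPy's eye()[-1] wraps to the last row, whereas
tf.one_hot maps index -1 to all zeros):

import numpy as np

indicator = np.array([True, False, True, True, False])
indicator_sum = 3  # known statically

selected = indicator.astype(np.float32)  # (N,)
# Running count of selected positions: 1..indicator_sum at True slots, 0 elsewhere.
indexed = (np.cumsum(selected) * selected).astype(np.int32)

# Row i is the one-hot code of "i is the indexed[i]-th selected element".
one_hot = np.eye(indicator_sum, dtype=np.float32)[indexed - 1] * selected[:, None]

# Contract the position range against the selector to recover the indices.
sampled_indices = np.tensordot(
    np.arange(len(indicator), dtype=np.float32), one_hot,
    axes=[0, 0]).astype(np.int32)

assert list(sampled_indices) == [0, 2, 3]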
Example #30
def startLstm(epochs=10, saveResult=True):
	trainData, validData, testData, wordId = loadWordIdsFromFiles()
	trainData = np.array(trainData, np.float32)
	# validData = np.array(validData, np.float32)
	testData = np.array(testData, np.float32)
	vocabSz = len(wordId)

	learnRate = 0.001
	embedSz = 128
	rnnSz, batchSz, winSz = 512, 10, 5
	numWin = (trainData.shape[0] - 1) // (batchSz * winSz)
	# each batch has winSz * numWin words
	batchLen = winSz * numWin

	testNumWin = (testData.shape[0] - 1) // (batchSz * winSz)
	testBatchLen = winSz * testNumWin

	inp = tf.placeholder(tf.int32, shape=[batchSz, winSz])
	# ans = tf.placeholder(tf.int32, shape=[batchSz * winSz])
	ans = tf.placeholder(tf.int32, shape=[batchSz, winSz])

	E = tf.Variable(tf.random_normal([vocabSz, embedSz], stddev=0.1))
	embed = tf.nn.embedding_lookup(E, inp)

	rnn = LSTMCell(rnnSz)
	initialState = rnn.zero_state(batchSz, tf.float32)
	output, nextState = tf.nn.dynamic_rnn(rnn, embed, initial_state=initialState)
	# output = tf.reshape(output, [batchSz * winSz, rnnSz])

	W = tf.Variable(tf.random_normal([rnnSz, vocabSz], stddev=.1))
	B = tf.Variable(tf.random_normal([vocabSz], stddev=.1))
	# logits = tf.matmul(output, W) + B
	logits = tf.tensordot(output, W, [[2], [0]]) + B

	ents = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=ans)
	loss = tf.reduce_sum(ents)
	train = tf.train.GradientDescentOptimizer(learnRate).minimize(loss)

	trainPerp = np.zeros(epochs, dtype=np.float32)
	testPerp = np.zeros(epochs, dtype=np.float32)
	with tf.Session() as sess:
		startTime = time.time()
		sess.run(tf.global_variables_initializer())
		epoch = 0
		print('epoch:', end=' ')
		while epoch < epochs:
			win = 0
			inState = sess.run(initialState)
			testState = sess.run(initialState)
			# print(inState, testState)
			winStart, winEnd = 0, winSz
			while win < numWin:
				inInp = np.array([trainData[i * batchLen + winStart:i * batchLen + winEnd] for i in range(batchSz)])
				# inAns = np.reshape(np.array([trainData[i * batchLen + winStart + 1: i * batchLen + winEnd + 1] for i in range(batchSz)]), batchSz * winSz)
				inAns = np.array([trainData[i * batchLen + winStart + 1: i * batchLen + winEnd + 1] for i in range(batchSz)])
				# Feed the previous window's state into initialState so state carries
				# across windows (feeding the fetched nextState would just echo the
				# fed value back and restart the RNN from the zero state every time).
				_, inState, outLoss = sess.run([train, nextState, loss], {inp: inInp, ans: inAns, initialState: inState})
				trainPerp[epoch] += outLoss
				if win < testNumWin:
					inInp = np.array([testData[i * testBatchLen + winStart:i * testBatchLen + winEnd] for i in range(batchSz)])
					# inAns = np.reshape(np.array([testData[i * testBatchLen + winStart + 1: i * testBatchLen + winEnd + 1] for i in range(batchSz)]), batchSz * winSz)
					inAns = np.array([testData[i * testBatchLen + winStart + 1: i * testBatchLen + winEnd + 1] for i in range(batchSz)])
					testState, testOutLoss = sess.run([nextState, loss], {inp: inInp, ans: inAns, initialState: testState})
					testPerp[epoch] += testOutLoss
				winStart, winEnd = winEnd, winEnd + winSz
				win += 1
			epoch += 1
			print(epoch, end=' ')
		trainPerp = np.exp(trainPerp / (trainData.shape[0] // (batchSz * batchLen) * (batchSz * batchLen)))
		testPerp = np.exp(testPerp / (testData.shape[0] // (batchSz * testBatchLen) * (batchSz * testBatchLen)))
		print(f'\nelapsed: {time.time() - startTime}')
		print('train perplexity:', trainPerp[-1])
		print('test perplexity:', testPerp[-1])

		info = {'style': 'lstm', 'batch size': batchSz, 'embed size': embedSz, 'rnn size': rnnSz, 'win size': winSz,
		        'learning rate': learnRate, 'epochs': epochs, 'train perplexity': trainPerp[-1], 'test perplexity': testPerp[-1]}
		if saveResult:
			save(sess, info)
	drawPerplexity(trainPerp, testPerp)
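For completeness, the perplexity numbers printed at the end follow the usual
identity: summed cross-entropy divided by the number of predicted tokens,
exponentiated. A minimal sketch of that bookkeeping (the counts here are
illustrative, not taken from the run above):

import numpy as np

total_loss = 52345.7   # summed cross-entropy over one epoch (illustrative)
total_tokens = 10000   # number of predicted words in that epoch

perplexity = np.exp(total_loss / total_tokens)
print(f'perplexity: {perplexity:.2f}')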