예제 #1
0
 def testNoPS(self):
     """Checks placement when worker and ps both use '/job:trainer'.

     Ten variables and their sum are created under the cluster's placer;
     each of them is expected on task 0, CPU 0 of '/job:trainer'.
     """
     params = cluster_factory.Cluster.Params()
     params.worker.name = '/job:trainer'
     params.worker.replicas = 1
     params.ps.name = '/job:trainer'
     params.ps.replicas = 1
     test_cluster = cluster_factory.Cluster(params)
     graph = tf.Graph()
     with graph.as_default(), tf.device(test_cluster.GetPlacer()):
         variables = [
             tf.get_variable('x%d' % idx, (10, 10, 10)) for idx in range(10)
         ]
         total = tf.add_n(variables)
     # Variables and the sum share the same expected device string.
     trainer_cpu = cluster.MakeDeviceString(job_name='/job:trainer',
                                            task_id=0,
                                            device_name='CPU',
                                            device_id=0)
     for var in variables:
         self.assertEqual(var.device, trainer_cpu)
     self.assertEqual(total.device, trainer_cpu)
예제 #2
0
 def testPSWithGPUs(self):
     """Checks variable placement across 4 ps replicas with 2 GPUs each.

     The i-th variable is expected on GPU `i % 2` of ps task `(i // 2) % 4`,
     while the sum of all variables is expected on the trainer's CPU 0.
     """
     p = cluster_factory.Cluster.Params()
     p.worker.name = '/job:trainer'
     p.worker.replicas = 1
     p.ps.name = '/job:ps'
     p.ps.replicas = 4
     p.ps.gpus_per_replica = 2
     c = cluster_factory.Cluster(p)
     g = tf.Graph()
     vs = []
     with g.as_default():
         with tf.device(c.GetPlacer()):
             for i in range(10):
                 vs.append(tf.get_variable('x%d' % i, (10, 10, 10)))
             sum_all = tf.add_n(vs)
     for i, v in enumerate(vs):
         self.assertEqual(
             v.device,
             # Floor division keeps the task id an integer on Python 3, where
             # `/` is true division and would yield values such as 0.5.
             cluster.MakeDeviceString(job_name='/job:ps',
                                      task_id=(i // 2) % 4,
                                      device_name='GPU',
                                      device_id=i % 2))
     self.assertEqual(
         sum_all.device,
         cluster.MakeDeviceString(job_name='/job:trainer',
                                  task_id=0,
                                  device_name='CPU',
                                  device_id=0))
예제 #3
0
 def testDefaultParamsWithDynamicShape(self):
     """Checks default placement for variables whose shape is not static.

     Each variable is initialized from a random tensor whose shape comes
     from a placeholder of unknown length, with shape validation disabled.
     All variables and their sum are expected on '/job:localhost' CPU 0.
     """

     def _MakeDynamicVariable(index):
         # The placeholder hides the static shape of the constant [2].
         shape = tf.placeholder_with_default(
             tf.constant([2], dtype=tf.int32), shape=[None])
         return tf.get_variable(
             'x%d_wb/var' % index,
             initializer=tf.random.uniform(shape, dtype=tf.float64),
             validate_shape=False)

     test_cluster = cluster_factory.Cluster(cluster_factory.Cluster.Params())
     graph = tf.Graph()
     with graph.as_default(), tf.device(test_cluster.GetPlacer()):
         variables = [_MakeDynamicVariable(idx) for idx in range(10)]
         total = tf.add_n(variables)
     localhost_cpu = cluster.MakeDeviceString(job_name='/job:localhost',
                                              task_id=0,
                                              device_name='CPU',
                                              device_id=0)
     for var in variables:
         self.assertEqual(var.device, localhost_cpu)
     self.assertEqual(total.device, localhost_cpu)
예제 #4
0
    def FrontendAndEncoderFProp(self,
                                theta,
                                input_batch_src,
                                initial_state=None):
        """Runs the optional frontend and then the encoder.

        Args:
          theta: A NestedMap object containing weights' values of this layer
            and its children layers.
          input_batch_src: An input NestedMap as per `BaseAsrFrontend.FProp`.
          initial_state: None or a NestedMap object containing the initial
            states. It is forwarded to the encoder as `state0` only when it
            is truthy.

        Returns:
          A NestedMap as from `AsrEncoder.FProp`. When auxiliary losses were
          collected while running the encoder, their sum is attached to the
          result as `aux_loss`.
        """
        if self.params.frontend:
            with tf.name_scope('frontend'):
                input_batch_src = self.frontend.FProp(theta.frontend,
                                                      input_batch_src)
        with layers_with_attention.AuxLossContext() as loss_ctx:
            # Only pass `state0` when an initial state was actually supplied.
            state_kwargs = {'state0': initial_state} if initial_state else {}
            outputs = self.encoder.FProp(theta.encoder, input_batch_src,
                                         **state_kwargs)
            # Attach the summed auxiliary loss if any was recorded.
            collected = loss_ctx.aux_losses
            if collected:
                assert isinstance(collected, list)
                assert len(collected) >= 1
                outputs.aux_loss = tf.add_n(collected)
            return outputs
예제 #5
0
 def testDefaultParams(self):
   """Checks that a default cluster places everything on localhost CPU 0.

   Also verifies that `add_summary` is false for the default cluster.
   """
   test_cluster = cluster_factory.Cluster(cluster_factory.Cluster.Params())
   self.assertFalse(test_cluster.add_summary)
   graph = tf.Graph()
   with graph.as_default(), tf.device(test_cluster.GetPlacer()):
     variables = [
         tf.get_variable('x%d' % idx, (10, 10, 10)) for idx in range(10)
     ]
     total = tf.add_n(variables)
   # Variables and the sum share the same expected device string.
   localhost_cpu = test_cluster._MakeDeviceString(
       job_name='/job:localhost', task_id=0, device_name='CPU', device_id=0)
   for var in variables:
     self.assertEqual(var.device, localhost_cpu)
   self.assertEqual(total.device, localhost_cpu)
예제 #6
0
    def testManyHotLabels(self):
        """Compares MultiLabelContrastiveLoss with per-positive softmax losses.

        The expected value is the mean, over the positive classes, of the
        softmax cross entropy of one positive logit against all negative
        logits.
        """
        batch_size, num_classes, num_positive = 7, 400, 5

        # To keep the test simple, the positive labels sit on the first
        # 'num_positive' classes of every example.
        labels = np.zeros((batch_size, num_classes), np.float32)
        labels[:, :num_positive] = 1.0

        random_part = np.random.uniform(size=labels.shape).astype(np.float32)
        logits = random_part * 10 + 1e7
        losses = label_lib.MultiLabelContrastiveLoss(
            tf.convert_to_tensor(labels, dtype=tf.float32),
            tf.convert_to_tensor(logits, dtype=tf.float32))

        # Reference computation: each positive is scored against all of the
        # negatives, with the positive placed in column 0.
        negatives = logits[:, num_positive:]
        positive_first = np.zeros(
            (batch_size, num_classes - num_positive + 1), np.float32)
        positive_first[:, 0] = 1

        per_positive_losses = [
            tf.nn.softmax_cross_entropy_with_logits(
                labels=positive_first,
                logits=np.concatenate(
                    [logits[:, pos:(pos + 1)], negatives], axis=1))
            for pos in range(num_positive)
        ]
        expected_loss = tf.add_n(per_positive_losses) / num_positive
        self.assertAllClose(expected_loss, losses)
예제 #7
0
 def Merge(xs):
     """Sums the sequences in `xs` position by position.

     The sequences are zipped together. A position whose first entry is
     None yields None; any other position yields `tf.add_n` over all entries
     at that position.

     Args:
       xs: An iterable of sequences to combine position-wise.

     Returns:
       A tuple with one merged entry per zipped position.
     """
     return tuple(
         None if column[0] is None else tf.add_n(list(column))
         for column in zip(*xs))
예제 #8
0
    def testParallelLayer(self):
        """Checks ParallelLayer output against a numpy reference.

        Three branches (two FC layers and a sequential FC + dropout branch)
        consume the same input and are merged by element-wise addition. The
        layer runs with `is_eval` set; the reference treats every branch as
        relu(x . w + b) and sums the three results.
        """

        def _FcParams(name):
            return lingvo_layers.FCLayer.Params().Set(name=name,
                                                      input_dim=32,
                                                      output_dim=4)

        graph = tf.Graph()
        with graph.as_default():
            tf.set_random_seed(24332)
            p = layers.ParallelLayer.Params().Set(
                name='test',
                merge=lambda outs: tuple([tf.add_n(o) for o in zip(*outs)]),
                sub=[
                    _FcParams('foo'),
                    _FcParams('bar'),
                    layers.SequentialLayer.Params().Set(
                        name='seq',
                        sub=[
                            _FcParams('baz'),
                            lingvo_layers.DropoutLayer.Params().Set(
                                name='dropout', keep_prob=0.5)
                        ])
                ])
            p.is_eval = True
            layer = p.Instantiate()
            x = tf.random_normal(shape=[2, 32])
            y = layer.FPropDefaultTheta(x)

        with self.session(graph=graph) as sess:
            sess.run(tf.global_variables_initializer())
            x_val, y_val, w = sess.run([x, y, layer.vars])

        def _ReluFc(fc_vars):
            # relu(act \dot w + b)
            return np.maximum(0, np.matmul(x_val, fc_vars.w) + fc_vars.b)

        branch_outputs = []
        for fc_vars in (w.foo, w.bar, w.seq.baz):
            branch_outputs.append(_ReluFc(fc_vars))
            self.assertEqual(branch_outputs[-1].shape, (2, 4))

        # Accumulate the branches in order, mirroring the additive merge.
        expected = branch_outputs[0]
        for branch in branch_outputs[1:]:
            expected = np.add(expected, branch)
        self.assertAllClose(expected, y_val)
예제 #9
0
파일: pruning.py 프로젝트: snsun/lingvo
    def _update_mask(self, weights, threshold):
        """Updates the mask for a given weight tensor.

    This functions first computes the cdf of the weight tensor, and estimates
    the threshold value such that 'desired_sparsity' fraction of weights
    have magnitude less than the threshold.

    Args:
      weights: The weight tensor that needs to be masked.
      threshold: The current threshold value. The function will compute a new
        threshold and return the exponential moving average using the current
        value of threshold

    Returns:
      new_threshold: The new value of the threshold based on weights, and
        sparsity at the current global_step
      new_mask: A numpy array of the same size and shape as weights containing
        0 or 1 to indicate which of the values in weights falls below
        the threshold

    Raises:
      ValueError: if sparsity is not defined
    """
        if self._sparsity is None:
            raise ValueError('Sparsity variable undefined')

        sparsity = self._get_sparsity(weights.op.name)
        with tf.name_scope(weights.op.name + '_pruning_ops'):
            abs_weights = tf.abs(weights)
            k = tf.cast(
                tf.round(
                    tf.cast(tf.size(abs_weights), tf.float32) *
                    (1 - sparsity)), tf.int32)
            # Sort the entire array
            values, _ = tf.nn.top_k(tf.reshape(abs_weights, [-1]),
                                    k=tf.size(abs_weights))
            # Grab the (k-1) th value
            current_threshold = tf.gather(values, k - 1)
            smoothed_threshold = tf.add_n([
                tf.multiply(current_threshold, 1 - self._spec.threshold_decay),
                tf.multiply(threshold, self._spec.threshold_decay)
            ])

            new_mask = tf.cast(
                tf.greater_equal(abs_weights, smoothed_threshold), tf.float32)

        return smoothed_threshold, new_mask
예제 #10
0
 def testPSRandomSize(self):
   """Checks how 200 randomly sized variables are spread over 10 ps replicas.

   Sizes are drawn from a seeded numpy generator, so the total number of
   elements placed on each device can be compared with fixed expected
   values. The sum of all variables is expected on the trainer's CPU 0.
   """
   params = cluster_factory.Cluster.Params()
   params.worker.name = '/job:trainer'
   params.ps.name = '/job:ps'
   params.ps.replicas = 10
   test_cluster = cluster_factory.Cluster(params)
   graph = tf.Graph()
   variables = []
   np.random.seed(301)
   with graph.as_default(), tf.device(test_cluster.GetPlacer()):
     # Creates 200 variables with different sizes.
     for idx in range(200):
       if idx % 13:
         upper_bound = 10000
       elif idx % 7:
         upper_bound = 100
       else:
         upper_bound = 10
       variables.append(
           tf.get_variable('x%d' % idx,
                           shape=np.random.randint(upper_bound)))
     total = tf.add_n([tf.reduce_sum(var) for var in variables])
   # Computes the total size of variables placed on each device.
   size_by_device = {}  # device name -> size
   for var in variables:
     num_elements = tf.TensorShape(var.op.get_attr('shape')).num_elements()
     size_by_device[var.device] = (
         size_by_device.get(var.device, 0) + num_elements)
   expected_sizes = [
       91701, 91361, 90346, 88738, 87240, 89265, 91944, 92472, 88051, 95053
   ]
   for device, expected in zip(sorted(size_by_device), expected_sizes):
     self.assertEqual(size_by_device[device], expected)
   self.assertEqual(
       total.device,
       cluster.MakeDeviceString(
           job_name='/job:trainer',
           replica_id=0,
           task_id=0,
           device_name='CPU',
           device_id=0))
예제 #11
0
 def GradSum(v, *gs):
     """Adds up the gradients in `gs` that are not None.

     Args:
       v: Value the gradients belong to; only used in the log message.
       *gs: Gradients, any of which may be None.

     Returns:
       `tf.add_n` over the non-None gradients, or None when there are none.
     """
     tf.logging.info('GradSum: %s: %s', v, gs)
     present = [g for g in gs if g is not None]
     if not present:
         return None
     return tf.add_n(present)
예제 #12
0
파일: graddrop.py 프로젝트: vcj-huy/lingvo
        def _Gradient(inputs, _, original_grad):
            """Computes the GradDrop-transformed gradient.

            Relies on `self` and `p` from the enclosing scope. `self._losses`
            is iterated as (loss, leak_ratio) pairs and gradients are taken
            with respect to `self._output_tensor`.

            Args:
              inputs: Tensor the per-loss gradients are multiplied with
                (or with its sign only, when `p.use_input_sign_only` is set)
                before their signs are compared.
              _: Unused.
              original_grad: The incoming gradient. It supplies the dtype of
                the result and, when `p.keep_gradnorm_constant` is set, the
                norm the result is rescaled to.

            Returns:
              The sum of the per-loss gradients after sign-based masking,
              cast to `original_grad.dtype`.

            Raises:
              ValueError: if none of the losses produces a gradient.
            """

            # Compute the gradients for each loss w.r.t. the inputs.
            # TODO(jngiam): Look into whether TF dedups this computation.
            # Leak ratios are collected together with the gradients so both
            # lists stay aligned when a loss without a gradient is skipped.
            per_loss_grads = []
            leak_ratios = []
            for loss, leak_ratio in self._losses:
                per_loss_grad = tf.gradients(loss, self._output_tensor)[0]
                if per_loss_grad is None:
                    tf.logging.warning(
                        'Loss %s did not result in a gradient during '
                        'GradDrop computation.', loss)
                    continue
                per_loss_grads.append(per_loss_grad)
                leak_ratios.append(leak_ratio)

            if not per_loss_grads:
                raise ValueError('No valid gradients for GradDrop.')

            # Multiply the gradients with the inputs.
            grads = per_loss_grads
            if p.use_input_sign_only:
                # Entries with |input| <= epsilon get 1.0 added before the
                # abs, which keeps the divisor away from zero there.
                input_abs = tf.abs(
                    tf.cast(tf.abs(inputs) <= p.epsilon, tf.float32) + inputs)
                grads = [grad * ((inputs) / (input_abs)) for grad in grads]
            else:
                grads = [grad * inputs for grad in grads]

            # Sum gradient over batch, assuming that batch is always on dim 0.
            if p.marginalize_batch_dim:
                grads = [
                    tf.reduce_sum(grad, axis=0, keepdims=True)
                    for grad in grads
                ]

            # First discretize all gradients into their sign values.
            grad_sign_positive = [
                tf.cast(grad > 0.0, tf.float32) for grad in grads
            ]
            grad_sign_negative = [
                tf.cast(grad < 0.0, tf.float32) for grad in grads
            ]

            # Calculate the probability of positive gradients based on equation (1)
            # in the GradDrop paper.
            grad_abs_sum = tf.add_n([tf.abs(grad) for grad in grads])
            prob_pos = (tf.add_n(grads) / (2. * grad_abs_sum + p.epsilon))
            # Implementation of different scales for the keep function. Larger
            # scales result in steeper keep functions.
            prob_pos *= p.keep_prob_function_scale

            if p.keep_prob_function == 'sigmoid':
                # Standard sigmoid has derivative of 0.25 at 0 so the factor of 4.0
                # allows the function scale in sigmoid to be compatible with the
                # function scale in the linear case.
                prob_pos = tf.sigmoid(4.0 * prob_pos)
            elif p.keep_prob_function == 'linear':
                prob_pos += 0.5

            # The main, default mode of GradDrop. Only gradients of one sign are kept,
            # and which sign is calculated via equation (1) of the main paper.
            # After this step prob_pos is +0.5 where positive gradients are
            # kept and -0.5 where negative gradients are kept.
            prob_pos = tf.cast(prob_pos >= tf.random.uniform(prob_pos.shape),
                               tf.float32) - 0.5
            grad_masks = [
                (gsp - gsn) * prob_pos >= 0
                for (gsn, gsp) in zip(grad_sign_negative, grad_sign_positive)
            ]

            # This diag value gives us the percentage of grads which are kept.
            gradmask_diag = [tf.cast(gm, tf.float32) for gm in grad_masks]
            diag = tf.reduce_mean(tf.add_n(gradmask_diag) / len(grad_masks))
            summary_utils.scalar('average_grad_mask', diag)
            # Masked-out entries still pass through scaled by the leak ratio.
            transformed_per_loss_grads = [
                grad * (leak + (1.0 - leak) * tf.cast(grad_mask, tf.float32))
                for (leak, grad,
                     grad_mask) in zip(leak_ratios, per_loss_grads, grad_masks)
            ]

            transformed_grad = tf.cast(tf.add_n(transformed_per_loss_grads),
                                       original_grad.dtype)

            if not p.keep_gradnorm_constant:
                return transformed_grad

            # Rescale so the result has (approximately) the original norm.
            transformed_grad_norm = tf.sqrt(tf.reduce_sum(transformed_grad**2))
            original_grad_norm = tf.sqrt(tf.reduce_sum(original_grad**2))
            return transformed_grad * original_grad_norm / (
                transformed_grad_norm + p.epsilon)
예제 #13
0
    def ComputeLoss(self, theta, predictions, input_batch):
        """Computes the weighted retrieval loss and related metrics.

        Args:
          theta: A `.NestedMap` object containing variable values of this
            task. Not read by this method.
          predictions: The output of `ComputePredictions`.
          input_batch: A `.NestedMap` object containing input tensors to this
            tower.

        Returns:
          A tuple (metrics, per_example_tensors), where
            - `metrics` is a dict of str keys to (metric, weight) values
            - `per_example_tensors` is an empty dict.
        """
        p = self.params

        # During TPU training, collect the encodings and ids from all TPUs so
        # the loss can be computed over all query-result pairs in the global
        # batch. To avoid duplicating work, each TPU operates on a
        # non-overlapping slice of these pairs: queries come from its local
        # batch and results from the global batch.

        # Encodings of the local and global examples, keyed by modality.
        local_encodings = py_utils.NestedMap({
            modality: tf.reshape(predictions[modality].encodings,
                                 [-1, p.joint_embedding_dim])
            for modality in predictions
        })
        global_encodings = tpu_utils.ConcatenateAcrossReplicas(
            local_encodings)

        metrics = {}
        weighted_terms = []
        for direction, loss_weight in p.loss_weights.items():
            query_modality, result_modality = direction
            if not loss_weight:
                logging.info('Skipping %s retrieval', direction)
                continue

            pair_inputs = label_lib.ExamplePairs.BetweenLocalAndGlobalBatches(
                input_batch,
                query_modality=query_modality,
                result_modality=result_modality)
            pair_labels = p.label_fn(pair_inputs)

            # [num_queries, num_results]
            similarities = self.score_function(
                local_encodings[query_modality],
                global_encodings[result_modality])

            # [num_queries]
            per_query_losses = label_lib.MultiLabelContrastiveLoss(
                labels=tf.reshape(pair_labels, similarities.shape),
                logits=similarities)

            mean_loss = tf.reduce_mean(per_query_losses)
            weighted_terms.append(loss_weight * mean_loss)
            metric_name = 'loss_{}_to_{}'.format(query_modality,
                                                 result_modality)
            metrics[metric_name] = (mean_loss, 1)

        reg_losses = utils.CollectRegularizationLosses(self)
        if p.regularization_loss_weight and reg_losses:
            tf.logging.info('Adding TF1 regularization loss: %s', reg_losses)
            reg_total = tf.reduce_sum(reg_losses)
            weighted_terms.append(p.regularization_loss_weight * reg_total)
            metrics['loss_regularization'] = (reg_total, 1)

        metrics['loss'] = (tf.add_n(weighted_terms), 1)
        return metrics, {}
예제 #14
0
    def FPropTower(self, theta, input_batch):
        """Runs the language model on a batch and assembles its metrics.

        Args:
          theta: Variable values; `theta.lm` is passed to the language model.
          input_batch: Batch with `ids`, `labels`, `paddings`, `weights`,
            `segment_ids`, `segment_pos`, `word_count` and `num_sentences`.

        Returns:
          A tuple (metrics, per_example), where `metrics` maps metric names
          to (value, weight) pairs and `per_example` is an empty dict. The
          'loss' metric is the average cross entropy plus the weighted sum
          of any auxiliary losses collected during the forward pass.
        """
        with layers_with_attention.AuxLossContext() as aux_loss_ctx:
            assert aux_loss_ctx is not None
            p = self.params
            dtype = py_utils.FPropDtype(p)
            tf.logging.info('input_batch=%r', input_batch)
            ids, label_ids = input_batch.ids, input_batch.labels
            paddings = tf.cast(input_batch.paddings, dtype)
            weights = tf.cast(input_batch.weights, dtype)
            tf.logging.info('inputs={}'.format(
                (ids, paddings, label_ids, weights)))

            initial_state = self.lm.zero_state(theta.lm, tf.shape(ids)[0])
            targets = py_utils.NestedMap(class_ids=label_ids,
                                         class_weights=weights)
            xent_output, _ = self.lm.FProp(
                theta.lm,
                ids,
                paddings,
                initial_state,
                targets,
                segment_ids=input_batch.segment_ids,
                segment_pos=input_batch.segment_pos)

            # +input_batch.num_sentences to account for the end of sequence
            # symbol.
            words_with_eos = input_batch.word_count + tf.cast(
                input_batch.num_sentences, dtype=tf.int32)
            num_words = tf.cast(tf.reduce_sum(words_with_eos), dtype)
            predicted_ids = tf.cast(xent_output.per_example_argmax,
                                    label_ids.dtype)
            num_sentences = tf.reduce_sum(input_batch.num_sentences)

            num_preds = tf.cast(xent_output.total_weight, dtype)
            is_correct = tf.cast(tf.equal(label_ids, predicted_ids), dtype)
            weighted_correct = tf.reduce_sum(is_correct * weights)
            mean_acc = weighted_correct / tf.math.maximum(num_preds, 1)
            avg_xent = xent_output.avg_xent

            collected = aux_loss_ctx.aux_losses
            if not collected:
                # scalar
                aux_loss = tf.zeros_like(avg_xent)
            else:
                assert isinstance(collected, list)
                assert len(collected) >= 1
                # scalar
                assert p.aux_loss_weight > 0
                aux_loss = p.aux_loss_weight * tf.add_n(collected)

            metrics = {
                'loss': (avg_xent + aux_loss, num_preds),
                'avg_xent': (avg_xent, num_preds),
                'aux_loss': (aux_loss, num_preds),
                'fraction_of_correct_next_step_preds': (mean_acc, num_preds),
                'log_pplx': (xent_output.avg_xent, num_preds),
                'log_pplx_per_word':
                (xent_output.total_xent / num_words, num_words),
                'num_predictions': (num_preds, 1),
                'num_words': (num_words, 1),
                'num_sentences': (num_sentences, 1)
            }
            return metrics, {}