def FProp(self, theta, inputs, *extra_inputs):
        """Runs `inputs` through every sub-layer in order.

        When `self.params.custom_gradient` is set, the forward pass is wrapped
        with `py_utils.CallDefun` and `Bak` is supplied as its gradient
        function. `Bak` walks the sub-layers in reverse and calls each layer's
        `ReverseAndGrad`, which hands back the layer's inputs together with
        the gradients, so the activations are re-derived layer by layer during
        the backward pass. Otherwise the sub-layers are simply chained.

        Args:
          theta: Holds the per-layer parameters in `theta.sub_layers`, aligned
            index-by-index with `self.sub_layers`.
          inputs: Activations fed to the first sub-layer.
          *extra_inputs: Forwarded unchanged to every sub-layer's `FProp` and
            `ReverseAndGrad`. They receive zero gradients in `Bak`.

        Returns:
          The activations produced by the last sub-layer (or `inputs` itself
          when there are no sub-layers).
        """

        initial_step_seed = py_utils.GetStepSeed()
        final_step_seed = py_utils.GenerateSeedFromName(
            tf.no_op(name='new_step_seed').name)
        num_layers = len(self.sub_layers)

        def Bak(inputs, outputs, d_outputs):
            """Backward step.

            Args:
              inputs: Unused; the layer inputs are obtained from
                `ReverseAndGrad` instead.
              outputs: The `[acts, layer_step_seeds]` structure returned by
                `Fwd`.
              d_outputs: Gradients matching `outputs`; only the first entry
                (the gradient w.r.t. the activations) is used.

            Returns:
              Gradients for `[initial_step_seed, theta, inputs, extra_inputs]`,
              in that order.
            """
            del inputs  # unused
            output_acts, step_seeds = outputs
            d_outputs = d_outputs[0]

            # Start from the incoming gradient so that `d_inputs` is defined
            # even when there are no sub-layers (the forward pass is then the
            # identity on the activations). With at least one layer this value
            # is overwritten in the loop below.
            d_inputs = d_outputs

            d_layer_thetas = []
            for layer_idx in reversed(range(num_layers)):
                f_seed, g_seed = step_seeds[layer_idx]
                layer = self.sub_layers[layer_idx]
                layer_theta = theta.sub_layers[layer_idx]

                input_acts, d_inputs, d_theta = layer.ReverseAndGrad(
                    layer_theta, output_acts, d_outputs, f_seed, g_seed,
                    *extra_inputs)

                d_layer_thetas.append(d_theta)
                # Passes reconstructed inputs to the previous layer.
                output_acts = input_acts
                d_outputs = d_inputs
            py_utils.ResetStepSeed(final_step_seed)
            d_theta = py_utils.NestedMap()
            # Gradients were collected last-layer-first; restore layer order.
            d_theta.sub_layers = list(reversed(d_layer_thetas))

            extra_grads = [tf.zeros_like(t) for t in extra_inputs]
            return [
                tf.zeros_like(initial_step_seed), d_theta, d_inputs,
                extra_grads
            ]

        def Fwd(xs):
            """Forward pass.

            Args:
              xs: `[initial_step_seed, theta, acts, extra_inputs]`.

            Returns:
              `[acts, layer_step_seeds]`, where `layer_step_seeds` holds one
              `(f_seed, g_seed)` pair per sub-layer as returned by that
              layer's `FProp`.
            """
            initial_step_seed, theta, acts, extra_inputs = xs

            py_utils.ResetStepSeed(initial_step_seed)
            layer_step_seeds = []

            for layer_theta, layer in zip(theta.sub_layers, self.sub_layers):
                acts, f_seed, g_seed = layer.FProp(layer_theta, acts,
                                                   *extra_inputs)
                layer_step_seeds += [(f_seed, g_seed)]
            return [acts, layer_step_seeds]

        if self.params.custom_gradient:
            acts, _ = py_utils.CallDefun(
                Fwd, [initial_step_seed, theta, inputs, extra_inputs], Bak)
            py_utils.ResetStepSeed(final_step_seed)
            return acts
        else:
            # Plain chaining; the per-layer step seeds are not needed here.
            acts = inputs
            for layer_theta, layer in zip(theta.sub_layers, self.sub_layers):
                acts, _, _ = layer.FProp(layer_theta, acts, *extra_inputs)
            return acts
示例#2
0
        def SendRecv(graph, dtype):
            """Builds a matching send/receive pair over one channel in `graph`.

            `shape`, `sender` and `recver` are taken from the enclosing scope.

            Args:
              graph: Graph made default while the ops are constructed.
              dtype: Element type of the channel; its `as_numpy_dtype` is used
                to cast the value that is sent.

            Returns:
              A tuple of (result of the `Send` defun call, result of the
              `Recv` defun call, the numpy value that was sent).
            """
            payload = np.array(3.1415 + 2j).astype(dtype.as_numpy_dtype)
            with graph.as_default():
                channel = sendrecv.Channel(dtype, shape, sender, recver,
                                           "test")

                with tf.device(sender):

                    # py_utils.CallDefun requires non-empty inputs, so a dummy
                    # tensor is passed in. The same applies to Recv below.
                    def Send(_):
                        channel.Send(tf.constant(payload))
                        return tf.convert_to_tensor(1.0)

                    send_dummy = tf.convert_to_tensor(0)
                    send_result = py_utils.CallDefun(Send, send_dummy)

                with tf.device(recver):

                    def Recv(_):
                        return channel.Recv()

                    recv_dummy = tf.convert_to_tensor(0)
                    received = py_utils.CallDefun(Recv, recv_dummy)
            return send_result, received, payload
示例#3
0
 def FProp(self, theta, current_step):
   """Applies `self._combined` to `current_step` through `py_utils.CallDefun`.

   Args:
     theta: Not used by this method.
     current_step: Step value; converted to a tensor before the call.

   Returns:
     Whatever `py_utils.CallDefun` returns for `self._combined`.
   """
   step_tensor = tf.convert_to_tensor(current_step)
   return py_utils.CallDefun(self._combined, step_tensor)
示例#4
0
 def FProp(self, theta, current_step):
   """Applies `self._exp` to `current_step` through `py_utils.CallDefun`.

   Args:
     theta: Not used by this method.
     current_step: Step value; cast to `self.params.dtype` before the call.

   Returns:
     Whatever `py_utils.CallDefun` returns for `self._exp`.
   """
   typed_step = tf.cast(current_step, dtype=self.params.dtype)
   return py_utils.CallDefun(self._exp, typed_step)
示例#5
0
    def FProp(self, theta, input_tensor):
        """Returns `input_tensor` unchanged, with the GradDrop gradient attached.

        The forward computation is `tf.identity`; the interesting part is the
        custom gradient `_Gradient`, registered through `py_utils.CallDefun`.
        It replaces the incoming gradient with a sign-masked combination of
        the per-loss gradients taken from `self._losses`.

        Args:
          theta: Not used by this method.
          input_tensor: Tensor to pass through.

        Returns:
          The output tensor, which is also stored in `self._output_tensor`.

        Raises:
          ValueError: If `FProp` was already called on this layer (the stored
            output tensor is not None).
        """
        p = self.params

        if self._output_tensor is not None:
            raise ValueError('FProp was already called.')

        def _Gradient(inputs, _, original_grad):
            """Computes the GradDrop-transformed gradient.

            Args:
              inputs: The inputs handed to the forward function.
              _: Forward outputs; ignored.
              original_grad: The incoming gradient. Used for the output dtype
                and, when `p.keep_gradnorm_constant` is set, for its norm.

            Returns:
              The transformed gradient, cast to `original_grad.dtype`.

            Raises:
              ValueError: If none of the losses yields a gradient w.r.t.
                `self._output_tensor`.
            """

            # Compute the gradients for each loss w.r.t. the inputs.
            # TODO(jngiam): Look into whether TF dedups this computation.
            per_loss_grads = []
            # Leak ratios are collected in lockstep with the gradients that
            # are actually kept. Building them from all of self._losses would
            # misalign them with per_loss_grads whenever a loss is skipped
            # below, pairing gradients with another loss's leak ratio.
            leak_ratios = []
            for loss, leak_ratio in self._losses:
                per_loss_grad = tf.gradients(loss, self._output_tensor)[0]
                if per_loss_grad is None:
                    tf.logging.warning(
                        'Loss %s did not result in a gradient during '
                        'GradDrop computation.', loss)
                else:
                    per_loss_grads.append(per_loss_grad)
                    leak_ratios.append(leak_ratio)

            if not per_loss_grads:
                raise ValueError('No valid gradients for GradDrop.')

            # Multiply the gradients with the inputs.
            grads = per_loss_grads
            if p.use_input_sign_only:
                # Adds 1 wherever |inputs| <= epsilon so the denominator does
                # not get close to zero; elsewhere inputs / |inputs| is the
                # sign of the input.
                input_abs = tf.abs(
                    tf.cast(tf.abs(inputs) <= p.epsilon, tf.float32) + inputs)
                grads = [grad * ((inputs) / (input_abs)) for grad in grads]
            else:
                grads = [grad * inputs for grad in grads]

            # Sum gradient over batch, assuming that batch is always on dim 0.
            if p.marginalize_batch_dim:
                grads = [
                    tf.reduce_sum(grad, axis=0, keepdims=True)
                    for grad in grads
                ]

            # First discretize all gradients into their sign values.
            grad_sign_positive = [
                tf.cast(grad > 0.0, tf.float32) for grad in grads
            ]
            grad_sign_negative = [
                tf.cast(grad < 0.0, tf.float32) for grad in grads
            ]

            # Calculate the probability of positive gradients based on equation (1)
            # in the GradDrop paper.
            grad_abs_sum = tf.add_n([tf.abs(grad) for grad in grads])
            prob_pos = (tf.add_n(grads) / (2. * grad_abs_sum + p.epsilon))
            # Implementation of different scales for the keep function. Larger
            # scales result in steeper keep functions.
            prob_pos *= p.keep_prob_function_scale

            if p.keep_prob_function == 'sigmoid':
                # Standard sigmoid has derivative of 0.25 at 0 so the factor of 4.0
                # allows the function scale in sigmoid to be compatible with the
                # function scale in the linear case.
                prob_pos = tf.sigmoid(4.0 * prob_pos)
            elif p.keep_prob_function == 'linear':
                prob_pos += 0.5

            # The main, default mode of GradDrop. Only gradients of one sign are kept,
            # and which sign is calculated via equation (1) of the main paper.
            # After this step prob_pos is +0.5 where the positive sign was
            # drawn and -0.5 where the negative sign was drawn.
            prob_pos = tf.cast(prob_pos >= tf.random.uniform(prob_pos.shape),
                               tf.float32) - 0.5
            # (gsp - gsn) is the gradient's sign in {-1, 0, 1}; an entry is
            # kept when that sign agrees with the drawn sign (or is zero).
            grad_masks = [
                (gsp - gsn) * prob_pos >= 0
                for (gsn, gsp) in zip(grad_sign_negative, grad_sign_positive)
            ]

            # This diag value gives us the percentage of grads which are kept.
            gradmask_diag = [tf.cast(gm, tf.float32) for gm in grad_masks]
            diag = tf.reduce_mean(tf.add_n(gradmask_diag) / len(grad_masks))
            summary_utils.scalar('average_grad_mask', diag)
            # Masked-out entries still pass a `leak` fraction of the gradient;
            # kept entries pass through in full.
            transformed_per_loss_grads = [
                grad * (leak + (1.0 - leak) * tf.cast(grad_mask, tf.float32))
                for (leak, grad,
                     grad_mask) in zip(leak_ratios, per_loss_grads, grad_masks)
            ]

            transformed_grad = tf.cast(tf.add_n(transformed_per_loss_grads),
                                       original_grad.dtype)

            if not p.keep_gradnorm_constant:
                return transformed_grad

            # Rescale so the result has (approximately) the same L2 norm as
            # the incoming gradient; epsilon guards against a zero norm.
            transformed_grad_norm = tf.sqrt(tf.reduce_sum(transformed_grad**2))
            original_grad_norm = tf.sqrt(tf.reduce_sum(original_grad**2))
            return transformed_grad * original_grad_norm / (
                transformed_grad_norm + p.epsilon)

        output_tensor = py_utils.CallDefun(tf.identity, input_tensor,
                                           _Gradient)
        self._output_tensor = tf.identity(output_tensor)
        return self._output_tensor