def decayed_lr(): """Helper to recompute learning rate; most helpful in eager-mode.""" global_step_recomp = math_ops.cast(global_step, dtype) completed_fraction = global_step_recomp / first_decay_steps def compute_step(completed_fraction, geometric=False): """Helper for `cond` operation.""" if geometric: i_restart = math_ops.floor( math_ops.log(1.0 - completed_fraction * (1.0 - t_mul)) / math_ops.log(t_mul)) sum_r = (1.0 - t_mul**i_restart) / (1.0 - t_mul) completed_fraction = (completed_fraction - sum_r) / t_mul**i_restart else: i_restart = math_ops.floor(completed_fraction) completed_fraction -= i_restart return i_restart, completed_fraction i_restart, completed_fraction = control_flow_ops.cond( math_ops.equal(t_mul, 1.0), lambda: compute_step(completed_fraction, geometric=False), lambda: compute_step(completed_fraction, geometric=True)) m_fac = m_mul**i_restart cosine_decayed = 0.5 * m_fac * (1.0 + math_ops.cos( constant_op.constant(math.pi) * completed_fraction)) decayed = (1 - alpha) * cosine_decayed + alpha return math_ops.multiply(learning_rate, decayed, name=name)
def _SinGrad(op, grad): """Returns grad * cos(x).""" x = op.inputs[0] with ops.control_dependencies([grad.op]): if x.dtype.is_complex: x = math_ops.conj(x) return grad * math_ops.cos(x)
def __call__(self, step): with ops.name_scope(self.name, "NoisyLinearCosineDecay", [self.initial_learning_rate, step]) as name: initial_learning_rate = ops.convert_to_tensor( self.initial_learning_rate, name="initial_learning_rate") dtype = initial_learning_rate.dtype decay_steps = math_ops.cast(self.decay_steps, dtype) initial_variance = math_ops.cast(self.initial_variance, dtype) variance_decay = math_ops.cast(self.variance_decay, dtype) num_periods = math_ops.cast(self.num_periods, dtype) alpha = math_ops.cast(self.alpha, dtype) beta = math_ops.cast(self.beta, dtype) global_step_recomp = math_ops.cast(step, dtype) global_step_recomp = math_ops.minimum(global_step_recomp, decay_steps) linear_decayed = (decay_steps - global_step_recomp) / decay_steps variance = initial_variance / ( math_ops.pow(1.0 + global_step_recomp, variance_decay)) std = math_ops.sqrt(variance) noisy_linear_decayed = ( linear_decayed + random_ops.random_normal( linear_decayed.shape, stddev=std)) completed_fraction = global_step_recomp / decay_steps fraction = 2.0 * num_periods * completed_fraction cosine_decayed = 0.5 * ( 1.0 + math_ops.cos(constant_op.constant(math.pi) * fraction)) noisy_linear_cosine_decayed = ( (alpha + noisy_linear_decayed) * cosine_decayed + beta) return math_ops.multiply( initial_learning_rate, noisy_linear_cosine_decayed, name=name)
def _TanGrad(op, grad): """Returns grad * 1/sec^2(x).""" x = op.inputs[0] with ops.control_dependencies([grad.op]): secx = math_ops.inv(math_ops.cos(x)) secx2 = math_ops.square(secx) return grad * secx2
def decayed_lr(learning_rate, global_step, decay_steps, initial_variance, variance_decay, num_periods, alpha, beta, name): """Helper to recompute learning rate; most helpful in eager-mode.""" with ops.name_scope(name, "NoisyLinearCosineDecay", [learning_rate, global_step]) as name: learning_rate = ops.convert_to_tensor(learning_rate, name="learning_rate") dtype = learning_rate.dtype decay_steps = math_ops.cast(decay_steps, dtype) initial_variance = math_ops.cast(initial_variance, dtype) variance_decay = math_ops.cast(variance_decay, dtype) num_periods = math_ops.cast(num_periods, dtype) alpha = math_ops.cast(alpha, dtype) beta = math_ops.cast(beta, dtype) global_step_recomp = math_ops.cast(global_step, dtype) global_step_recomp = math_ops.minimum(global_step_recomp, decay_steps) linear_decayed = (decay_steps - global_step_recomp) / decay_steps variance = initial_variance / ( math_ops.pow(1.0 + global_step_recomp, variance_decay)) std = math_ops.sqrt(variance) noisy_linear_decayed = ( linear_decayed + random_ops.random_normal( linear_decayed.shape, stddev=std)) completed_fraction = global_step_recomp / decay_steps fraction = 2.0 * num_periods * completed_fraction cosine_decayed = 0.5 * ( 1.0 + math_ops.cos(constant_op.constant(math.pi) * fraction)) noisy_linear_cosine_decayed = ( (alpha + noisy_linear_decayed) * cosine_decayed + beta) return math_ops.multiply( learning_rate, noisy_linear_cosine_decayed, name=name)
def _raised_cosine_window(name, default_name, window_length, periodic, dtype, a, b): """Helper function for computing a raised cosine window. Args: name: Name to use for the scope. default_name: Default name to use for the scope. window_length: A scalar `Tensor` or integer indicating the window length. periodic: A bool `Tensor` indicating whether to generate a periodic or symmetric window. dtype: A floating point `DType`. a: The alpha parameter to the raised cosine window. b: The beta parameter to the raised cosine window. Returns: A `Tensor` of shape `[window_length]` of type `dtype`. Raises: ValueError: If `dtype` is not a floating point type or `window_length` is not scalar or `periodic` is not scalar. """ if not dtype.is_floating: raise ValueError('dtype must be a floating point type. Found %s' % dtype) with ops.name_scope(name, default_name, [window_length, periodic]): window_length = ops.convert_to_tensor(window_length, dtype=dtypes.int32, name='window_length') window_length.shape.assert_has_rank(0) window_length_const = tensor_util.constant_value(window_length) if window_length_const == 1: return array_ops.ones([1], dtype=dtype) periodic = math_ops.cast( ops.convert_to_tensor(periodic, dtype=dtypes.bool, name='periodic'), dtypes.int32) periodic.shape.assert_has_rank(0) even = 1 - math_ops.mod(window_length, 2) n = math_ops.cast(window_length + periodic * even - 1, dtype=dtype) count = math_ops.cast(math_ops.range(window_length), dtype) cos_arg = constant_op.constant(2 * np.pi, dtype=dtype) * count / n if window_length_const is not None: return math_ops.cast(a - b * math_ops.cos(cos_arg), dtype=dtype) return control_flow_ops.cond( math_ops.equal(window_length, 1), lambda: array_ops.ones([1], dtype=dtype), lambda: math_ops.cast(a - b * math_ops.cos(cos_arg), dtype=dtype))
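For reference, the raised cosine family a - b * cos(2 * pi * n / N) computed by this helper covers the common Hann (a = b = 0.5) and Hamming (a = 0.54, b = 0.46) windows. Below is a minimal NumPy sketch of the symmetric (periodic=False) case only; the window length is an illustrative value, not something fixed by the helper.

import numpy as np

def raised_cosine_window(window_length, a, b):
  # Symmetric case of the helper above: n = window_length - 1.
  if window_length == 1:
    return np.ones(1)
  count = np.arange(window_length)
  return a - b * np.cos(2.0 * np.pi * count / (window_length - 1))

hann = raised_cosine_window(16, 0.5, 0.5)       # endpoints are exactly 0
hamming = raised_cosine_window(16, 0.54, 0.46)  # endpoints are 0.08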
def angles_to_projective_transforms(angles, image_height, image_width,
                                    name=None):
  """Returns projective transform(s) for the given angle(s).

  Args:
    angles: A scalar angle to rotate all images by, or (for batches of images)
      a vector with an angle to rotate each image in the batch. The rank must
      be statically known (the shape is not `TensorShape(None)`).
    image_height: Height of the image(s) to be transformed.
    image_width: Width of the image(s) to be transformed.
    name: The name of the op.

  Returns:
    A tensor of shape (num_images, 8). Projective transforms which can be given
    to `tf.contrib.image.transform`.
  """
  with ops.name_scope(name, "angles_to_projective_transforms"):
    angle_or_angles = ops.convert_to_tensor(
        angles, name="angles", dtype=dtypes.float32)
    if len(angle_or_angles.get_shape()) == 0:  # pylint: disable=g-explicit-length-test
      angles = angle_or_angles[None]
    elif len(angle_or_angles.get_shape()) == 1:
      angles = angle_or_angles
    else:
      raise TypeError("Angles should have rank 0 or 1.")
    x_offset = ((image_width - 1) -
                (math_ops.cos(angles) * (image_width - 1) -
                 math_ops.sin(angles) * (image_height - 1))) / 2.0
    y_offset = ((image_height - 1) -
                (math_ops.sin(angles) * (image_width - 1) +
                 math_ops.cos(angles) * (image_height - 1))) / 2.0
    num_angles = array_ops.shape(angles)[0]
    return array_ops.concat(
        values=[
            math_ops.cos(angles)[:, None],
            -math_ops.sin(angles)[:, None],
            x_offset[:, None],
            math_ops.sin(angles)[:, None],
            math_ops.cos(angles)[:, None],
            y_offset[:, None],
            array_ops.zeros((num_angles, 2), dtypes.float32),
        ],
        axis=1)
def testBackwardOverForward(self, forward_prop_first):
  c = constant_op.constant(1.)
  # Watching depends on nesting, not creation order
  if forward_prop_first:
    forward_accumulator = forwardprop.ForwardAccumulator(c, .1)
    gradient_tape = backprop.GradientTape()
  else:
    gradient_tape = backprop.GradientTape()
    forward_accumulator = forwardprop.ForwardAccumulator(c, .1)
  with gradient_tape as tape:
    with forward_accumulator as acc:
      tape.watch(c)
      d = math_ops.cos(c)
      self.assertTrue(tape_lib.should_record_backprop((acc.jvp(d),)))
    self.assertAllClose(-.1 * math_ops.cos(1.), tape.gradient(acc.jvp(d), c))
def _TanGrad(op, grad): """Returns grad * 1/sec^2(x).""" x = op.inputs[0] with ops.control_dependencies([grad.op]): x = math_ops.conj(x) secx = math_ops.reciprocal(math_ops.cos(x)) secx2 = math_ops.square(secx) return grad * secx2
def RFF_map(self, input_tensor, seed, stddev, input_shape, output_dim):
  """Random Fourier feature map applied to `input_tensor`.

  Follows the scikit-learn RBFSampler convention (omega drawn with scale
  sqrt(2 * gamma)) rather than the TensorFlow kernel-mapper convention
  (omega drawn with scale 1 / stddev). Note that `seed` is currently unused
  because `np.random` is called directly.
  """
  gamma = stddev
  omega_matrix_shape = [input_shape, output_dim]
  bias_shape = [output_dim]
  omega_matrix = constant_op.constant(
      np.sqrt(2 * gamma) * np.random.normal(size=omega_matrix_shape),
      dtype=dtypes.float32)
  bias = constant_op.constant(
      np.random.uniform(0.0, 2 * np.pi, size=bias_shape),
      dtype=dtypes.float32)
  x_omega_plus_bias = math_ops.add(
      math_ops.matmul(input_tensor, omega_matrix), bias)
  return math.sqrt(2.0 / output_dim) * math_ops.cos(x_omega_plus_bias)
def cosine_decay(learning_rate, global_step, decay_steps, alpha=0.0, name=None): """Applies cosine decay to the learning rate. See [Loshchilov & Hutter, ICLR2016], SGDR: Stochastic Gradient Descent with Warm Restarts. https://arxiv.org/abs/1608.03983 When training a model, it is often recommended to lower the learning rate as the training progresses. This function applies a cosine decay function to a provided initial learning rate. It requires a `global_step` value to compute the decayed learning rate. You can just pass a TensorFlow variable that you increment at each training step. The function returns the decayed learning rate. It is computed as: ```python global_step = min(global_step, decay_steps) cosine_decay = 0.5 * (1 + cos(pi * global_step / decay_steps)) decayed = (1 - alpha) * cosine_decay + alpha decayed_learning_rate = learning_rate * decayed ``` Example usage: ```python decay_steps = 1000 lr_decayed = cosine_decay(learning_rate, global_step, decay_steps) ``` Args: learning_rate: A scalar `float32` or `float64` Tensor or a Python number. The initial learning rate. global_step: A scalar `int32` or `int64` `Tensor` or a Python number. Global step to use for the decay computation. decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number. Number of steps to decay over. alpha: A scalar `float32` or `float64` Tensor or a Python number. Minimum learning rate value as a fraction of learning_rate. name: String. Optional name of the operation. Defaults to 'CosineDecay'. Returns: A scalar `Tensor` of the same type as `learning_rate`. The decayed learning rate. Raises: ValueError: if `global_step` is not supplied. """ if global_step is None: raise ValueError("cosine decay requires global_step") with ops.name_scope(name, "CosineDecay", [learning_rate, global_step]) as name: learning_rate = ops.convert_to_tensor(learning_rate, name="learning_rate") dtype = learning_rate.dtype global_step = math_ops.cast(global_step, dtype) decay_steps = math_ops.cast(decay_steps, dtype) global_step = math_ops.minimum(global_step, decay_steps) completed_fraction = global_step / decay_steps cosine_decayed = 0.5 * ( 1.0 + math_ops.cos(constant_op.constant(math.pi) * completed_fraction)) decayed = (1 - alpha) * cosine_decayed + alpha return math_ops.multiply(learning_rate, decayed)
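A minimal plain-Python sketch of the schedule formula from the docstring above, handy for inspecting the decay curve without building a graph; the initial rate and step values below are illustrative, not taken from the snippet.

import math

def cosine_decay_value(learning_rate, global_step, decay_steps, alpha=0.0):
  # Mirrors the pseudocode in the docstring above.
  global_step = min(global_step, decay_steps)
  completed_fraction = global_step / decay_steps
  cosine_decayed = 0.5 * (1.0 + math.cos(math.pi * completed_fraction))
  return learning_rate * ((1 - alpha) * cosine_decayed + alpha)

# The rate falls from 0.1 at step 0 to 0.1 * alpha at decay_steps and beyond.
lrs = [cosine_decay_value(0.1, step, decay_steps=1000)
       for step in (0, 250, 500, 1000)]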
def testBatchBackwardOverForward(self, forward_prop_first): x = constant_op.constant(1.) tangents = random_ops.random_normal(shape=[10], seed=1) expected = [-t * math_ops.cos(1.) for t in tangents] if forward_prop_first: batch_acc = forwardprop.ForwardAccumulator._batch_accumulator(x, tangents) gradient_tape = backprop.GradientTape(persistent=True) else: gradient_tape = backprop.GradientTape(persistent=True) batch_acc = forwardprop.ForwardAccumulator._batch_accumulator(x, tangents) with gradient_tape as tape: with batch_acc as acc: tape.watch(x) y = math_ops.cos(x) self.assertTrue(tape_lib.should_record_backprop((acc.jvp(y),))) jvps = acc.jvp(y) d2y_dx2 = [tape.gradient(dy_dx, x) for dy_dx in jvps] self.assertAllClose(expected, d2y_dx2)
def angles_to_projective_transforms(angles, image_height, image_width,
                                    name=None):
  """Returns projective transform(s) for the given angle(s).

  Args:
    angles: A scalar angle to rotate all images by, or (for batches of images)
      a vector with an angle to rotate each image in the batch. The rank must
      be statically known (the shape is not `TensorShape(None)`).
    image_height: Height of the image(s) to be transformed.
    image_width: Width of the image(s) to be transformed.
    name: The name of the op.

  Returns:
    A tensor of shape (num_images, 8). Projective transforms which can be given
    to `tf.contrib.image.transform`.
  """
  with ops.name_scope(name, "angles_to_projective_transforms"):
    angle_or_angles = ops.convert_to_tensor(
        angles, name="angles", dtype=dtypes.float32)
    if len(angle_or_angles.get_shape()) == 0:  # pylint: disable=g-explicit-length-test
      angles = angle_or_angles[None]
    elif len(angle_or_angles.get_shape()) == 1:
      angles = angle_or_angles
    else:
      raise TypeError("Angles should have rank 0 or 1.")
    x_offset = ((image_width - 1) -
                (math_ops.cos(angles) * (image_width - 1) -
                 math_ops.sin(angles) * (image_height - 1))) / 2.0
    y_offset = ((image_height - 1) -
                (math_ops.sin(angles) * (image_width - 1) +
                 math_ops.cos(angles) * (image_height - 1))) / 2.0
    num_angles = array_ops.shape(angles)[0]
    return array_ops.concat(
        values=[
            math_ops.cos(angles)[:, None],
            -math_ops.sin(angles)[:, None],
            x_offset[:, None],
            math_ops.sin(angles)[:, None],
            math_ops.cos(angles)[:, None],
            y_offset[:, None],
            array_ops.zeros((num_angles, 2), dtypes.float32),
        ],
        axis=1)
def Test(self): np.random.seed(1) n = shape_[-1] batch_shape = shape_[:-2] np_dtype = dtype_.as_numpy_dtype a = np.random.uniform(low=-1.0, high=1.0, size=n * n).reshape([n, n]).astype(np_dtype) if dtype_.is_complex: a += 1j * np.random.uniform(low=-1.0, high=1.0, size=n * n).reshape([n, n]).astype(np_dtype) a += np.conj(a.T) a = np.tile(a, batch_shape + (1, 1)) # Optimal stepsize for central difference is O(epsilon^{1/3}). epsilon = np.finfo(np_dtype).eps delta = 0.1 * epsilon**(1.0 / 3.0) # tolerance obtained by looking at actual differences using # np.linalg.norm(theoretical-numerical, np.inf) on -mavx build if dtype_ in (dtypes_lib.float32, dtypes_lib.complex64): tol = 1e-2 else: tol = 1e-7 with self.session(use_gpu=True): tf_a = constant_op.constant(a) if compute_v_: tf_e, tf_v = linalg_ops.self_adjoint_eig(tf_a) # (complex) Eigenvectors are only unique up to an arbitrary phase # We normalize the vectors such that the first component has phase 0. top_rows = tf_v[..., 0:1, :] if tf_a.dtype.is_complex: angle = -math_ops.angle(top_rows) phase = math_ops.complex(math_ops.cos(angle), math_ops.sin(angle)) else: phase = math_ops.sign(top_rows) tf_v *= phase outputs = [tf_e, tf_v] else: tf_e = linalg_ops.self_adjoint_eigvals(tf_a) outputs = [tf_e] for b in outputs: x_init = np.random.uniform(low=-1.0, high=1.0, size=n * n).reshape([n, n]).astype(np_dtype) if dtype_.is_complex: x_init += 1j * np.random.uniform( low=-1.0, high=1.0, size=n * n).reshape( [n, n]).astype(np_dtype) x_init += np.conj(x_init.T) x_init = np.tile(x_init, batch_shape + (1, 1)) theoretical, numerical = gradient_checker.compute_gradient( tf_a, tf_a.get_shape().as_list(), b, b.get_shape().as_list(), x_init_value=x_init, delta=delta) self.assertAllClose(theoretical, numerical, atol=tol, rtol=tol)
def __call__(self, step): with ops.name_scope(self.name, "SGDRLearningRate", [self.learning_rate, step, self.t_0, self.t_mul, self.m_mul]) as name: learning_rate = ops.convert_to_tensor(self.learning_rate, name="learning_rate") dtype = learning_rate.dtype step = math_ops.cast(step, dtype) t_0 = math_ops.cast(self.t_0, dtype) t_mul = math_ops.cast(self.t_mul, dtype) m_mul = math_ops.cast(self.m_mul, dtype) c_one = math_ops.cast(constant_op.constant(1.0), dtype) c_half = math_ops.cast(constant_op.constant(0.5), dtype) c_pi = math_ops.cast(constant_op.constant(math.pi), dtype) # Find normalized value of the current step x_val = math_ops.div(step, t_0) def compute_step(x_val, geometric=False): if geometric: # Consider geometric series where t_mul != 1 # 1 + t_mul + t_mul^2 ... = (1 - t_mul^i_restart) / (1 - t_mul) # First find how many restarts were performed for a given x_val # Find maximal integer i_restart value for which this equation holds # x_val >= (1 - t_mul^i_restart) / (1 - t_mul) # x_val * (1 - t_mul) <= (1 - t_mul^i_restart) # t_mul^i_restart <= (1 - x_val * (1 - t_mul)) # tensorflow allows only log with base e # i_restart <= log(1 - x_val * (1 - t_mul) / log(t_mul) # Find how many restarts were performed i_restart = math_ops.floor(math_ops.log(c_one - x_val * (c_one - t_mul)) / math_ops.log(t_mul)) # Compute the sum of all restarts before the current one sum_r = (c_one - t_mul ** i_restart) / (c_one - t_mul) # Compute our position within the current restart x_val = (x_val - sum_r) / t_mul ** i_restart else: # Find how many restarts were performed i_restart = math_ops.floor(x_val) # Compute our position within the current restart x_val = x_val - i_restart return i_restart, x_val i_restart, x_val = control_flow_ops.cond( math_ops.equal(t_mul, c_one), lambda: compute_step(x_val, geometric=False), lambda: compute_step(x_val, geometric=True) ) # If m_mul < 1, then the initial learning rate of every new restart will be # smaller, i.e., by a factor of m_mul ** i_restart at i_restart-th restart m_fac = learning_rate * (m_mul ** i_restart) return math_ops.multiply(c_half * m_fac, (math_ops.cos(x_val * c_pi) + c_one), name=name)
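A worked numeric sketch of the restart bookkeeping derived in the comments above: with t_mul = 2 the restart lengths form the geometric series 1, 2, 4, ..., so a normalized position x_val maps to the index of the current restart and the fraction completed within it. Plain Python, with illustrative values.

import math

def compute_step(x_val, t_mul):
  # Same derivation as the geometric branch above, outside the graph.
  if t_mul == 1.0:
    i_restart = math.floor(x_val)
    return i_restart, x_val - i_restart
  i_restart = math.floor(
      math.log(1.0 - x_val * (1.0 - t_mul)) / math.log(t_mul))
  sum_r = (1.0 - t_mul**i_restart) / (1.0 - t_mul)
  return i_restart, (x_val - sum_r) / t_mul**i_restart

# x_val = 4.5 with t_mul = 2: restarts of length 1, 2, 4 cover [0, 7), so we
# are in the third restart (index 2), 37.5% of the way through [3, 7).
print(compute_step(4.5, 2.0))  # -> (2, 0.375)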
def _add_sinusoids_signal(x, time, min_timescale=1.0, max_timescale=1.0e4):
  """Adds a bunch of sinusoids of different frequencies to a Tensor.

  Each channel of the input Tensor is incremented by a sinusoid of a different
  frequency and phase.

  This allows attention to learn to use absolute and relative positions.
  Timing signals should be added to some precursors of both the query and the
  memory inputs to attention.

  The use of relative position is possible because sin(x+y) and cos(x+y) can be
  expressed in terms of y, sin(x) and cos(x).

  In particular, we use a geometric sequence of timescales starting with
  min_timescale and ending with max_timescale. The number of different
  timescales is equal to channels / 2. For each timescale, we
  generate the two sinusoidal signals sin(timestep/timescale) and
  cos(timestep/timescale). All of these sinusoids are concatenated in
  the channels dimension.

  Args:
    x: a Tensor with shape [batch, length, channels]
    time: the current decoding time step, used when `x` has rank 2
    min_timescale: a float
    max_timescale: a float

  Returns:
    a Tensor the same shape as x.
  """
  channels = x.get_shape().as_list()[-1]
  if x.get_shape().ndims == 3:  # [batch_size, timesteps, dim]
    length = array_ops.shape(x)[1]
    position = math_ops.to_float(math_ops.range(length))
  elif x.get_shape().ndims == 2:  # [batch_size, dim]
    length = 1
    position = math_ops.to_float(math_ops.range(time, time + 1))
  else:
    raise ValueError("need a Tensor with rank 2 or 3")
  num_timescales = channels // 2
  log_timescale_increment = (
      math.log(float(max_timescale) / float(min_timescale)) /
      (math_ops.to_float(num_timescales) - 1))
  inv_timescales = min_timescale * math_ops.exp(
      math_ops.to_float(math_ops.range(num_timescales)) *
      -log_timescale_increment)
  scaled_time = array_ops.expand_dims(
      position, 1) * array_ops.expand_dims(inv_timescales, 0)
  signal = array_ops.concat(
      [math_ops.sin(scaled_time), math_ops.cos(scaled_time)], axis=1)
  signal = array_ops.pad(signal, [[0, 0], [0, math_ops.mod(channels, 2)]])
  if x.get_shape().ndims == 3:
    signal = array_ops.reshape(signal, [1, length, channels])
  else:
    signal = array_ops.reshape(signal, [1, channels])
  return x + signal
def decayed_lr(): """Helper to recompute learning rate; most helpful in eager-mode.""" global_step_recomp = math_ops.cast(global_step, dtype) global_step_recomp = math_ops.minimum(global_step_recomp, decay_steps) completed_fraction = global_step_recomp / decay_steps cosine_decayed = 0.5 * (1.0 + math_ops.cos( constant_op.constant(math.pi) * completed_fraction)) decayed = (1 - alpha) * cosine_decayed + alpha return math_ops.multiply(learning_rate, decayed)
def map(self, input_tensor):
  """Maps each row of input_tensor using random Fourier features.

  Args:
    input_tensor: a `Tensor` containing input features. Its shape is
    [batch_size, self._input_dim].

  Returns:
    A `Tensor` of shape [batch_size, self._output_dim] containing RFFM-mapped
    features.

  Raises:
    InvalidShapeError: if the shape of the `input_tensor` is inconsistent with
    expected input dimension.
  """
  input_tensor_shape = input_tensor.get_shape()
  if len(input_tensor_shape) != 2:
    raise dkm.InvalidShapeError(
        'The shape of the tensor should be 2. Got %d instead.' %
        len(input_tensor_shape))

  features_dim = input_tensor_shape[1]
  if features_dim != self._input_dim:
    raise dkm.InvalidShapeError(
        'Invalid dimension: expected %d input features, got %d instead.' %
        (self._input_dim, features_dim))

  # Add ops that compute (deterministically) omega_matrix and bias based on
  # the provided seed.
  # TODO(sibyl-vie3Poto): Storing the mapper's parameters (omega_matrix and bias) as
  # constants incurs no RPC calls to the parameter server during distributed
  # training. However, if the parameters grow too large (for instance if they
  # don't fit into memory or if they blow up the size of the GraphDef proto),
  # storing them as constants is no longer an option. In this case, we should
  # have a heuristic to choose out of one of the following alternatives:
  # a) store them as variables (in the parameter server)
  # b) store them as worker local variables
  # c) generating on the fly the omega matrix at each step
  np.random.seed(self._seed)
  omega_matrix_shape = [self._input_dim, self._output_dim]
  bias_shape = [self._output_dim]

  omega_matrix = constant_op.constant(
      np.random.normal(
          scale=1.0 / self._stddev, size=omega_matrix_shape),
      dtype=dtypes.float32)
  bias = constant_op.constant(
      np.random.uniform(
          low=0.0, high=2 * np.pi, size=bias_shape),
      dtype=dtypes.float32)

  x_omega_plus_bias = math_ops.add(
      math_ops.matmul(input_tensor, omega_matrix), bias)
  return math.sqrt(2.0 / self._output_dim) * math_ops.cos(x_omega_plus_bias)
def compute_step(warming_up=False): if warming_up: completed_fraction = global_step_recomp / warmup_steps gain = w_fac + (1 - w_fac) * completed_fraction else: completed_fraction = (global_step_recomp - warmup_steps ) / (decay_steps - warmup_steps) cosine_decayed = 0.5 * (1.0 + math_ops.cos( constant_op.constant(math.pi) * completed_fraction)) gain = (1 - self.alpha) * cosine_decayed + self.alpha return gain
def Test(self): np.random.seed(1) n = shape_[-1] batch_shape = shape_[:-2] np_dtype = dtype_.as_numpy_dtype a = np.random.uniform( low=-1.0, high=1.0, size=n * n).reshape([n, n]).astype(np_dtype) if dtype_.is_complex: a += 1j * np.random.uniform( low=-1.0, high=1.0, size=n * n).reshape([n, n]).astype(np_dtype) a += np.conj(a.T) a = np.tile(a, batch_shape + (1, 1)) # Optimal stepsize for central difference is O(epsilon^{1/3}). epsilon = np.finfo(np_dtype).eps delta = 0.1 * epsilon**(1.0 / 3.0) # tolerance obtained by looking at actual differences using # np.linalg.norm(theoretical-numerical, np.inf) on -mavx build if dtype_ in (dtypes_lib.float32, dtypes_lib.complex64): tol = 1e-2 else: tol = 1e-7 with self.session(use_gpu=True): tf_a = constant_op.constant(a) if compute_v_: tf_e, tf_v = linalg_ops.self_adjoint_eig(tf_a) # (complex) Eigenvectors are only unique up to an arbitrary phase # We normalize the vectors such that the first component has phase 0. top_rows = tf_v[..., 0:1, :] if tf_a.dtype.is_complex: angle = -math_ops.angle(top_rows) phase = math_ops.complex(math_ops.cos(angle), math_ops.sin(angle)) else: phase = math_ops.sign(top_rows) tf_v *= phase outputs = [tf_e, tf_v] else: tf_e = linalg_ops.self_adjoint_eigvals(tf_a) outputs = [tf_e] for b in outputs: x_init = np.random.uniform( low=-1.0, high=1.0, size=n * n).reshape([n, n]).astype(np_dtype) if dtype_.is_complex: x_init += 1j * np.random.uniform( low=-1.0, high=1.0, size=n * n).reshape([n, n]).astype(np_dtype) x_init += np.conj(x_init.T) x_init = np.tile(x_init, batch_shape + (1, 1)) theoretical, numerical = gradient_checker.compute_gradient( tf_a, tf_a.get_shape().as_list(), b, b.get_shape().as_list(), x_init_value=x_init, delta=delta) self.assertAllClose(theoretical, numerical, atol=tol, rtol=tol)
def map(self, input_tensor):
  """Maps each row of input_tensor using random Fourier features.

  Args:
    input_tensor: a `Tensor` containing input features. Its shape is
    [batch_size, self._input_dim].

  Returns:
    A `Tensor` of shape [batch_size, self._output_dim] containing RFFM-mapped
    features.

  Raises:
    InvalidShapeError: if the shape of the `input_tensor` is inconsistent with
    expected input dimension.
  """
  input_tensor_shape = input_tensor.get_shape()
  if len(input_tensor_shape) != 2:
    raise dkm.InvalidShapeError(
        'The shape of the tensor should be 2. Got %d instead.' %
        len(input_tensor_shape))

  features_dim = input_tensor_shape[1]
  if features_dim != self._input_dim:
    raise dkm.InvalidShapeError(
        'Invalid dimension: expected %d input features, got %d instead.' %
        (self._input_dim, features_dim))

  # Add ops that compute (deterministically) omega_matrix and bias based on
  # the provided seed.
  # TODO(sibyl-vie3Poto): Storing the mapper's parameters (omega_matrix and bias) as
  # constants incurs no RPC calls to the parameter server during distributed
  # training. However, if the parameters grow too large (for instance if they
  # don't fit into memory or if they blow up the size of the GraphDef proto),
  # storing them as constants is no longer an option. In this case, we should
  # have a heuristic to choose out of one of the following alternatives:
  # a) store them as variables (in the parameter server)
  # b) store them as worker local variables
  # c) generating on the fly the omega matrix at each step
  np.random.seed(self._seed)
  omega_matrix_shape = [self._input_dim, self._output_dim]
  bias_shape = [self._output_dim]

  omega_matrix = constant_op.constant(
      np.random.normal(
          scale=1.0 / self._stddev, size=omega_matrix_shape),
      dtype=dtypes.float32)
  bias = constant_op.constant(
      np.random.uniform(
          low=0.0, high=2 * np.pi, size=bias_shape),
      dtype=dtypes.float32)

  x_omega_plus_bias = math_ops.add(
      math_ops.matmul(input_tensor, omega_matrix), bias)
  return math.sqrt(2.0 / self._output_dim) * math_ops.cos(x_omega_plus_bias)
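A small NumPy sketch of the property the mapper above relies on: with omega drawn as N(0, 1/stddev**2) and a uniform phase, the inner product of mapped vectors approximates the RBF kernel exp(-||x - y||**2 / (2 * stddev**2)). The dimensions and points below are illustrative and not part of the class.

import numpy as np

rng = np.random.RandomState(0)
input_dim, output_dim, stddev = 4, 2000, 1.0
omega = rng.normal(scale=1.0 / stddev, size=(input_dim, output_dim))
bias = rng.uniform(low=0.0, high=2 * np.pi, size=output_dim)

def rff_map(x):
  # Same form as the return statement above: sqrt(2 / D) * cos(x @ omega + b).
  return np.sqrt(2.0 / output_dim) * np.cos(x.dot(omega) + bias)

x = rng.randn(input_dim)
y = x + 0.1 * rng.randn(input_dim)
approx = rff_map(x).dot(rff_map(y))
exact = np.exp(-np.sum((x - y) ** 2) / (2 * stddev ** 2))
# approx tracks exact; the error shrinks roughly like 1 / sqrt(output_dim).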
def decayed_lr(): """Helper to recompute learning rate; most helpful in eager-mode.""" global_step_recomp = math_ops.cast(global_step, dtype) global_step_recomp = math_ops.minimum(global_step_recomp, decay_steps) linear_decayed = (decay_steps - global_step_recomp) / decay_steps completed_fraction = global_step_recomp / decay_steps fraction = 2.0 * num_periods * completed_fraction cosine_decayed = 0.5 * ( 1.0 + math_ops.cos(constant_op.constant(math.pi) * fraction)) linear_cosine_decayed = (alpha + linear_decayed) * cosine_decayed + beta return math_ops.multiply(learning_rate, linear_cosine_decayed, name=name)
def Compute(x): e, v = linalg_ops.self_adjoint_eig(x) # (complex) Eigenvectors are only unique up to an arbitrary phase # We normalize the vectors such that the first component has phase 0. top_rows = v[..., 0:1, :] if dtype_.is_complex: angle = -math_ops.angle(top_rows) phase = math_ops.complex(math_ops.cos(angle), math_ops.sin(angle)) else: phase = math_ops.sign(top_rows) v *= phase return e, v
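A NumPy sketch of the phase normalization used above: each eigenvector column is multiplied by exp(-1j * angle(first component)), which is exactly complex(cos(angle), sin(angle)) with angle = -angle(top_rows), so the first entry of every column becomes real and non-negative and the arbitrary per-column phase is removed before gradients are compared. The matrix size is illustrative.

import numpy as np

rng = np.random.RandomState(0)
a = rng.randn(4, 4) + 1j * rng.randn(4, 4)
a = a + a.conj().T  # Hermitian, like the test inputs.
e, v = np.linalg.eigh(a)
top_row = v[0:1, :]
phase = np.exp(-1j * np.angle(top_row))
v_normalized = v * phase
# First components are now (numerically) real and non-negative.
print(np.allclose(v_normalized[0, :].imag, 0.0))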
def cosine_decay_fn(global_step): if global_step is None: raise ValueError("global_step is required for cosine_decay.") global_step = math_ops.minimum(global_step, decay_steps) completed_fraction = math_ops.to_float(global_step) / math_ops.to_float( decay_steps) fraction = 2.0 * num_periods * completed_fraction decayed = 0.5 * ( 1.0 + math_ops.cos(constant_op.constant(math.pi) * fraction)) if zero_after is not None: decayed = array_ops.where( math_ops.greater_equal(fraction, 2 * zero_after), 0.0, decayed) return decayed
def cosine_decay_fn(global_step): if global_step is None: raise ValueError("global_step is required for cosine_decay.") global_step = math_ops.minimum(global_step, decay_steps) completed_fraction = math_ops.to_float( global_step) / math_ops.to_float(decay_steps) fraction = 2.0 * num_periods * completed_fraction decayed = 0.5 * ( 1.0 + math_ops.cos(constant_op.constant(math.pi) * fraction)) if zero_after is not None: decayed = array_ops.where( math_ops.greater_equal(fraction, 2 * zero_after), 0.0, decayed) return decayed
def get_rotation_matrix(angles, image_height, image_width, name=None): """Returns projective transform(s) for the given angle(s). Args: angles: A scalar angle to rotate all images by, or (for batches of images) a vector with an angle to rotate each image in the batch. The rank must be statically known (the shape is not `TensorShape(None)`). image_height: Height of the image(s) to be transformed. image_width: Width of the image(s) to be transformed. name: The name of the op. Returns: A tensor of shape (num_images, 8). Projective transforms which can be given to operation `image_projective_transform_v2`. If one row of transforms is [a0, a1, a2, b0, b1, b2, c0, c1], then it maps the *output* point `(x, y)` to a transformed *input* point `(x', y') = ((a0 x + a1 y + a2) / k, (b0 x + b1 y + b2) / k)`, where `k = c0 x + c1 y + 1`. """ with ops.name_scope(name, 'rotation_matrix'): x_offset = ((image_width - 1) - (math_ops.cos(angles) * (image_width - 1) - math_ops.sin(angles) * (image_height - 1))) / 2.0 y_offset = ((image_height - 1) - (math_ops.sin(angles) * (image_width - 1) + math_ops.cos(angles) * (image_height - 1))) / 2.0 num_angles = array_ops.shape(angles)[0] return array_ops.concat(values=[ math_ops.cos(angles)[:, None], -math_ops.sin(angles)[:, None], x_offset[:, None], math_ops.sin(angles)[:, None], math_ops.cos(angles)[:, None], y_offset[:, None], array_ops.zeros((num_angles, 2), dtypes.float32), ], axis=1)
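A NumPy check of the mapping described in the docstring above: a flattened transform [a0, a1, a2, b0, b1, b2, c0, c1] sends an output point (x, y) to the input point ((a0*x + a1*y + a2) / k, (b0*x + b1*y + b2) / k) with k = c0*x + c1*y + 1. The apply_transform helper and the 3x3 image are illustrative only, not part of the API.

import numpy as np

def apply_transform(transform, x, y):
  # Hypothetical helper mirroring the docstring's mapping.
  a0, a1, a2, b0, b1, b2, c0, c1 = transform
  k = c0 * x + c1 * y + 1.0
  return (a0 * x + a1 * y + a2) / k, (b0 * x + b1 * y + b2) / k

# A 90-degree rotation for a 3x3 image, built the same way as above.
angle, h, w = np.pi / 2, 3, 3
x_off = ((w - 1) - (np.cos(angle) * (w - 1) - np.sin(angle) * (h - 1))) / 2.0
y_off = ((h - 1) - (np.sin(angle) * (w - 1) + np.cos(angle) * (h - 1))) / 2.0
transform = [np.cos(angle), -np.sin(angle), x_off,
             np.sin(angle), np.cos(angle), y_off, 0.0, 0.0]
# The centre pixel maps to itself (up to floating point error).
print(apply_transform(transform, 1.0, 1.0))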
def _add_sinusoids_signal(x, time, min_timescale=1.0, max_timescale=1.0e4):
  """Adds a bunch of sinusoids of different frequencies to a Tensor.

  Each channel of the input Tensor is incremented by a sinusoid of a different
  frequency and phase.

  This allows attention to learn to use absolute and relative positions.
  Timing signals should be added to some precursors of both the query and the
  memory inputs to attention.

  The use of relative position is possible because sin(x+y) and cos(x+y) can be
  expressed in terms of y, sin(x) and cos(x).

  In particular, we use a geometric sequence of timescales starting with
  min_timescale and ending with max_timescale. The number of different
  timescales is equal to channels / 2. For each timescale, we
  generate the two sinusoidal signals sin(timestep/timescale) and
  cos(timestep/timescale). All of these sinusoids are concatenated in
  the channels dimension.

  Args:
    x: a Tensor with shape [batch, length, channels]
    time: the current decoding time step, used when `x` has rank 2
    min_timescale: a float
    max_timescale: a float

  Returns:
    a Tensor the same shape as x.
  """
  channels = x.get_shape().as_list()[-1]
  if x.get_shape().ndims == 3:  # [batch_size, timesteps, dim]
    length = array_ops.shape(x)[1]
    position = math_ops.to_float(math_ops.range(length))
  elif x.get_shape().ndims == 2:  # [batch_size, dim]
    length = 1
    position = math_ops.to_float(math_ops.range(time, time + 1))
  else:
    raise ValueError("need a Tensor with rank 2 or 3")
  num_timescales = channels // 2
  log_timescale_increment = (
      math.log(float(max_timescale) / float(min_timescale)) /
      (math_ops.to_float(num_timescales) - 1))
  inv_timescales = min_timescale * math_ops.exp(
      math_ops.to_float(math_ops.range(num_timescales)) *
      -log_timescale_increment)
  scaled_time = array_ops.expand_dims(
      position, 1) * array_ops.expand_dims(inv_timescales, 0)
  signal = array_ops.concat(
      [math_ops.sin(scaled_time), math_ops.cos(scaled_time)], axis=1)
  signal = array_ops.pad(signal, [[0, 0], [0, math_ops.mod(channels, 2)]])
  if x.get_shape().ndims == 3:
    signal = array_ops.reshape(signal, [1, length, channels])
  else:
    signal = array_ops.reshape(signal, [1, channels])
  return x + signal
def __call__(self, step): with ops.name_scope_v2(self.name or "CosineDecay"): initial_learning_rate = ops.convert_to_tensor_v2_with_dispatch( self.initial_learning_rate, name="initial_learning_rate") dtype = initial_learning_rate.dtype decay_steps = math_ops.cast(self.decay_steps, dtype) global_step_recomp = math_ops.cast(step, dtype) global_step_recomp = math_ops.minimum(global_step_recomp, decay_steps) completed_fraction = global_step_recomp / decay_steps cosine_decayed = 0.5 * (1.0 + math_ops.cos( constant_op.constant(math.pi) * completed_fraction)) decayed = (1 - self.alpha) * cosine_decayed + self.alpha return math_ops.multiply(initial_learning_rate, decayed)
def __call__(self, step): with ops.name_scope_v2(self.name or "CosineDecay"): initial_learning_rate = ops.convert_to_tensor( self.initial_learning_rate, name="initial_learning_rate") dtype = initial_learning_rate.dtype decay_steps = math_ops.cast(self.decay_steps, dtype) global_step_recomp = math_ops.cast(step, dtype) global_step_recomp = math_ops.minimum(global_step_recomp, decay_steps) completed_fraction = global_step_recomp / decay_steps cosine_decayed = 0.5 * (1.0 + math_ops.cos( constant_op.constant(math.pi) * completed_fraction)) decayed = (1 - self.alpha) * cosine_decayed + self.alpha return math_ops.multiply(initial_learning_rate, decayed)
def _sample_n(self, n, seed=None): # We use 2 uniform random floats to generate polar random variates. # http://dl.acm.org/citation.cfm?id=179631 # Theorem 2. Let G, H be iid variates, uniformly distributed on [0,1]. # Let theta = 2*pi*H, let R = sqrt(df*(G^(-2/df) - 1)) for df > 0. # Let X = R*cos(theta), and let Y = R*sin(theta). # Then X ~ t_df and Y ~ t_df. # The variates X and Y are not independent. shape = array_ops.concat(0, ([2, n], self.batch_shape())) uniform = random_ops.random_uniform(shape=shape, dtype=self.dtype, seed=seed) samples_g, samples_h = array_ops.unpack(uniform, num=2) theta = (2.0 * math.pi) * samples_h r = math_ops.sqrt(self.df * (math_ops.pow(samples_g, -2 / self.df) - 1)) samples = r * math_ops.cos(theta) return samples * self.sigma + self.mu
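A NumPy sketch of the polar construction described in the comments above (Theorem 2 of the cited paper): with G, H uniform on [0, 1], theta = 2*pi*H and R = sqrt(df * (G**(-2/df) - 1)), the product R*cos(theta) is Student-t distributed with df degrees of freedom. The sample size and df below are illustrative.

import numpy as np

rng = np.random.RandomState(0)
df, n = 5.0, 100000
g, h = rng.uniform(size=(2, n))
theta = 2.0 * np.pi * h
r = np.sqrt(df * (g ** (-2.0 / df) - 1.0))
samples = r * np.cos(theta)
# For df > 2 a Student-t has variance df / (df - 2); here that is 5/3 ≈ 1.67.
print(samples.var())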
def restart_decay_fn(global_step): if global_step is None: raise ValueError("global_step is required for cosine_decay.") global_step = math_ops.minimum(global_step, decay_steps) num = math_ops.mod(num_periods * math_ops.to_float(global_step), decay_steps) fraction = num / math_ops.to_float(decay_steps) decayed = 0.5 * ( 1.0 + math_ops.cos(constant_op.constant(math.pi) * fraction)) if zero_after is not None: tmp = math_ops.to_float( num_periods * global_step) / math_ops.to_float(decay_steps) decayed = array_ops.where(math_ops.greater_equal(tmp, zero_after), 0.0, decayed) return decayed
def restart_decay_fn(global_step): if global_step is None: raise ValueError("global_step is required for cosine_decay.") global_step = math_ops.minimum(global_step, decay_steps) num = math_ops.mod(num_periods * math_ops.to_float(global_step), decay_steps) fraction = num / math_ops.to_float(decay_steps) decayed = 0.5 * ( 1.0 + math_ops.cos(constant_op.constant(math.pi) * fraction)) if zero_after is not None: tmp = math_ops.to_float( num_periods * global_step) / math_ops.to_float(decay_steps) decayed = array_ops.where( math_ops.greater_equal(tmp, zero_after), 0.0, decayed) return decayed
def getRotatePoint(map_shape, rotate_center, rotate_theta, origin_point):
  """Returns the coordinates of a point after rotating it by theta about a center.

  :param map_shape: shape of the original map; because the origin of Image
      coordinates is the top-left corner of the picture, the coordinate
      system has to be converted. Tensor-[height, width, channel]
  :param rotate_center: center of rotation. Tensor-[loc_x, loc_y]
  :param rotate_theta: rotation angle. Tensor-[theta]
  :param origin_point: point(s) to be rotated. Tensor-[loc_x, loc_y]
  :return: rotate_point_list: Tensor-[loc_x, loc_y]
  """
  row = map_shape[0]
  center_x = rotate_center[0]
  center_y = row - rotate_center[1]
  point_x = origin_point[0]
  point_y = row - origin_point[1]
  after_rotate_x = math_ops.round(
      (point_x - center_x) * math_ops.cos(rotate_theta) -
      (point_y - center_y) * math_ops.sin(rotate_theta) + center_x)
  after_rotate_y = row - math_ops.round(
      (point_x - center_x) * math_ops.sin(rotate_theta) +
      (point_y - center_y) * math_ops.cos(rotate_theta) + center_y)
  rotate_point = [after_rotate_x, after_rotate_y]
  rotate_point = tf.reshape(rotate_point, [2])
  return rotate_point
def decayed_lr(learning_rate, global_step, decay_steps, alpha, name): """Helper to recompute learning rate; most helpful in eager-mode.""" with ops.name_scope(name, "CosineDecay", [learning_rate, global_step]) as name: learning_rate = ops.convert_to_tensor(learning_rate, name="learning_rate") dtype = learning_rate.dtype decay_steps = math_ops.cast(decay_steps, dtype) global_step_recomp = math_ops.cast(global_step, dtype) global_step_recomp = math_ops.minimum(global_step_recomp, decay_steps) completed_fraction = global_step_recomp / decay_steps cosine_decayed = 0.5 * (1.0 + math_ops.cos( constant_op.constant(math.pi) * completed_fraction)) decayed = (1 - alpha) * cosine_decayed + alpha return math_ops.multiply(learning_rate, decayed)
def f(x, order): with backprop.GradientTape(persistent=persistent) as tape: tape.watch(x) # Note that having a tape active, even if we don't use it, forces us # down a different function call path. Symbolic gradients should work # here too; correctness of tape gradients are tested elsewhere. y = def_function.function(lambda: math_ops.cos(x))() tape_dy = tape.gradient(y, x) for _ in range(order): y, = gradients_impl.gradients(y, [x]) if order > 0: y1 = tape_dy for _ in range(order - 1): y1, = gradients_impl.gradients(y1, [x]) else: y1 = y return y, y1
def Compute(x): e, v = linalg_ops.eig(x) # We sort eigenvalues by e.real+e.imag to have consistent # order between runs b_dims = len(e.shape) - 1 idx = sort_ops.argsort(math_ops.real(e) + math_ops.imag(e), axis=-1) e = array_ops.gather(e, idx, batch_dims=b_dims) v = array_ops.gather(v, idx, batch_dims=b_dims) # (complex) Eigenvectors are only unique up to an arbitrary phase # We normalize the vectors such that the first component has phase 0. top_rows = v[..., 0:1, :] angle = -math_ops.angle(top_rows) phase = math_ops.complex(math_ops.cos(angle), math_ops.sin(angle)) v *= phase return e, v
def _sample_n(self, n, seed=None): # We use 2 uniform random floats to generate polar random variates. # http://dl.acm.org/citation.cfm?id=179631 # Theorem 2. Let G, H be iid variates, uniformly distributed on [0,1]. # Let theta = 2*pi*H, let R = sqrt(df*(G^(-2/df) - 1)) for df > 0. # Let X = R*cos(theta), and let Y = R*sin(theta). # Then X ~ t_df and Y ~ t_df. # The variates X and Y are not independent. shape = array_ops.concat(0, ([2, n], self.batch_shape())) uniform = random_ops.random_uniform(shape=shape, dtype=self.dtype, seed=seed) samples_g, samples_h = array_ops.unpack(uniform, num=2) theta = (2. * math.pi) * samples_h r = math_ops.sqrt(self.df * (math_ops.pow(samples_g, -2 / self.df) - 1)) samples = r * math_ops.cos(theta) return samples * self.sigma + self.mu
def _raised_cosine_window(name, default_name, window_length, periodic, dtype, a, b): """Helper function for computing a raised cosine window. Args: name: Name to use for the scope. default_name: Default name to use for the scope. window_length: A scalar `Tensor` or integer indicating the window length. periodic: A bool `Tensor` indicating whether to generate a periodic or symmetric window. dtype: A floating point `DType`. a: The alpha parameter to the raised cosine window. b: The beta parameter to the raised cosine window. Returns: A `Tensor` of shape `[window_length]` of type `dtype`. Raises: ValueError: If `dtype` is not a floating point type or `window_length` is not scalar or `periodic` is not scalar. """ if not dtype.is_floating: raise ValueError('dtype must be a floating point type. Found %s' % dtype) with ops.name_scope(name, default_name, [window_length, periodic]): window_length = ops.convert_to_tensor(window_length, dtype=dtypes.int32, name='window_length') window_length.shape.assert_has_rank(0) periodic = math_ops.cast( ops.convert_to_tensor(periodic, dtype=dtypes.bool, name='periodic'), dtypes.int32) periodic.shape.assert_has_rank(0) even = 1 - math_ops.mod(window_length, 2) n = math_ops.cast(window_length + periodic * even - 1, dtype=dtype) count = math_ops.cast(math_ops.range(window_length), dtype) cos_arg = constant_op.constant(2 * np.pi, dtype=dtype) * count / n return control_flow_ops.cond( math_ops.equal(window_length, 1), lambda: array_ops.ones([1], dtype=dtype), lambda: math_ops.cast(a - b * math_ops.cos(cos_arg), dtype=dtype))
def decayed_lr(learning_rate, global_step, first_decay_steps, t_mul, m_mul, alpha, name): """Helper to recompute learning rate; most helpful in eager-mode.""" with ops.name_scope(name, "SGDRDecay", [learning_rate, global_step]) as name: learning_rate = ops.convert_to_tensor(learning_rate, name="initial_learning_rate") dtype = learning_rate.dtype first_decay_steps = math_ops.cast(first_decay_steps, dtype) alpha = math_ops.cast(alpha, dtype) t_mul = math_ops.cast(t_mul, dtype) m_mul = math_ops.cast(m_mul, dtype) global_step_recomp = math_ops.cast(global_step, dtype) completed_fraction = global_step_recomp / first_decay_steps def compute_step(completed_fraction, geometric=False): """Helper for `cond` operation.""" if geometric: i_restart = math_ops.floor( math_ops.log(1.0 - completed_fraction * (1.0 - t_mul)) / math_ops.log(t_mul)) sum_r = (1.0 - t_mul**i_restart) / (1.0 - t_mul) completed_fraction = (completed_fraction - sum_r) / t_mul**i_restart else: i_restart = math_ops.floor(completed_fraction) completed_fraction -= i_restart return i_restart, completed_fraction i_restart, completed_fraction = control_flow_ops.cond( math_ops.equal(t_mul, 1.0), lambda: compute_step(completed_fraction, geometric=False), lambda: compute_step(completed_fraction, geometric=True)) m_fac = m_mul**i_restart cosine_decayed = 0.5 * m_fac * (1.0 + math_ops.cos( constant_op.constant(math.pi) * completed_fraction)) decayed = (1 - alpha) * cosine_decayed + alpha return math_ops.multiply(learning_rate, decayed, name=name)
def scaled_add_op(x, scale, y): cwd = os.getcwd() outputs = { "output_types": [dtypes.float32], "output_shapes": [tensor_shape.TensorShape([SIZE])], } base_dir = os.path.join(cwd, "tensorflow/python/ipu") gp_path = os.path.join(base_dir, "tests/add_scaled_vector_add_codelet.cc") lib_path = os.path.join(base_dir, "libadd_partial_gradients_custom.so") return ipu.custom_ops.precompiled_user_op( [x, scale, y, math_ops.cos(x), math_ops.cosh(y)], lib_path, gp_path, outs=outputs, inputs_with_gradients=[0, 2])
def _NormalizingSvd(tf_a): tf_s, tf_u, tf_v = linalg_ops.svd(tf_a, compute_uv=True, full_matrices=True) # Singular vectors are only unique up to an arbitrary phase. We normalize # the vectors such that the first component of u (if m >=n) or v (if n > m) # have phase 0. m = tf_a.shape[-2] n = tf_a.shape[-1] if m >= n: top_rows = tf_u[..., 0:1, :] else: top_rows = tf_v[..., 0:1, :] if tf_u.dtype.is_complex: angle = -math_ops.angle(top_rows) phase = math_ops.complex(math_ops.cos(angle), math_ops.sin(angle)) else: phase = math_ops.sign(top_rows) tf_u *= phase[..., :m] tf_v *= phase[..., :n] return tf_s, tf_u, tf_v
def cosine_decay(learning_rate, global_step, maximum_steps, name=None):
  """Cyclic cosine decay: the rate falls from `learning_rate` to 0 over
  `maximum_steps` steps and then restarts."""
  import math
  from tensorflow.python.ops import math_ops
  from tensorflow.python.framework import ops
  if global_step is None:
    raise ValueError("global_step is required for cosine_decay.")
  with ops.name_scope(name, "CosineDecay",
                      [learning_rate, global_step, maximum_steps]) as name:
    learning_rate = ops.convert_to_tensor(learning_rate, name="learning_rate")
    dtype = learning_rate.dtype
    global_step = math_ops.cast(global_step, dtype)
    maximum_steps = math_ops.cast(maximum_steps, dtype)
    p = math_ops.mod(global_step / maximum_steps, 1)
    return learning_rate * (0.5 + 0.5 * math_ops.cos(p * math.pi))
def decayed_lr(learning_rate, global_step, first_decay_steps, t_mul, m_mul, alpha, name): """Helper to recompute learning rate; most helpful in eager-mode.""" with ops.name_scope(name, "SGDRDecay", [learning_rate, global_step] ) as name: learning_rate = ops.convert_to_tensor( learning_rate, name="initial_learning_rate") dtype = learning_rate.dtype first_decay_steps = math_ops.cast(first_decay_steps, dtype) alpha = math_ops.cast(alpha, dtype) t_mul = math_ops.cast(t_mul, dtype) m_mul = math_ops.cast(m_mul, dtype) global_step_recomp = math_ops.cast(global_step, dtype) completed_fraction = global_step_recomp / first_decay_steps def compute_step(completed_fraction, geometric=False): """Helper for `cond` operation.""" if geometric: i_restart = math_ops.floor( math_ops.log(1.0 - completed_fraction * (1.0 - t_mul)) / math_ops.log(t_mul)) sum_r = (1.0 - t_mul**i_restart) / (1.0 - t_mul) completed_fraction = (completed_fraction - sum_r) / t_mul**i_restart else: i_restart = math_ops.floor(completed_fraction) completed_fraction -= i_restart return i_restart, completed_fraction i_restart, completed_fraction = control_flow_ops.cond( math_ops.equal(t_mul, 1.0), lambda: compute_step(completed_fraction, geometric=False), lambda: compute_step(completed_fraction, geometric=True)) m_fac = m_mul**i_restart cosine_decayed = 0.5 * m_fac * (1.0 + math_ops.cos( constant_op.constant(math.pi) * completed_fraction)) decayed = (1 - alpha) * cosine_decayed + alpha return math_ops.multiply(learning_rate, decayed, name=name)
def __call__(self, step): with ops.name_scope_v2(self.name or "SGDRDecay") as name: initial_learning_rate = ops.convert_to_tensor_v2( self.initial_learning_rate, name="initial_learning_rate") dtype = initial_learning_rate.dtype first_decay_steps = math_ops.cast(self.first_decay_steps, dtype) k_decay = math_ops.cast(self.k_decay, dtype) alpha = math_ops.cast(self.alpha, dtype) t_mul = math_ops.cast(self._t_mul, dtype) m_mul = math_ops.cast(self._m_mul, dtype) global_step_recomp = math_ops.cast(step, dtype) completed_fraction = global_step_recomp / first_decay_steps def compute_step(completed_fraction, geometric=False): """Helper for `cond` operation.""" if geometric: i_restart = math_ops.floor( math_ops.log(1.0 - completed_fraction * (1.0 - t_mul)) / math_ops.log(t_mul)) sum_r = (1.0 - t_mul**i_restart) / (1.0 - t_mul) completed_fraction = (completed_fraction - sum_r) / t_mul**i_restart else: i_restart = math_ops.floor(completed_fraction) completed_fraction -= i_restart return i_restart, completed_fraction i_restart, completed_fraction = control_flow_ops.cond( math_ops.equal(t_mul, 1.0), lambda: compute_step(completed_fraction, geometric=False), lambda: compute_step(completed_fraction, geometric=True)) m_fac = m_mul**i_restart cosine_decayed = 0.5 * m_fac * (1.0 + math_ops.cos( constant_op.constant(math.pi) * math_ops.pow(completed_fraction, k_decay))) decayed = (1 - alpha) * cosine_decayed + alpha return math_ops.multiply(initial_learning_rate, decayed, name=name)
def sample(self, n, seed=None, name="sample"): """Sample `n` observations from the Student t Distributions. Args: n: `Scalar`, type int32, the number of observations to sample. seed: Python integer, the random seed. name: The name to give this op. Returns: samples: a `Tensor` of shape `(n,) + self.batch_shape + self.event_shape` with values of type `self.dtype`. """ with ops.op_scope([self._df, self._mu, self._sigma, n], self.name): with ops.name_scope(name): n = ops.convert_to_tensor(n, name="n") n_val = tensor_util.constant_value(n) # We use 2 uniform random floats to generate polar random variates. # http://dl.acm.org/citation.cfm?id=179631 # Theorem 2. Let G, H be iid variates, uniformly distributed on [0,1]. # Let theta = 2*pi*H, let R = sqrt(df*(G^(-2/df) - 1)) for df > 0. # Let X = R*cos(theta), and let Y = R*sin(theta). # Then X ~ t_df and Y ~ t_df. # The variates X and Y are not independent. shape = array_ops.concat( 0, [array_ops.pack([2, n]), self.batch_shape()]) uniform = random_ops.random_uniform(shape=shape, dtype=self.dtype, seed=seed) samples_g, samples_h = array_ops.unpack(uniform, num=2) theta = (2 * np.pi) * samples_h r = math_ops.sqrt(self._df * (math_ops.pow(samples_g, -2 / self._df) - 1)) samples = r * math_ops.cos(theta) # Provide some hints to shape inference inferred_shape = tensor_shape.vector(n_val).concatenate( self.get_batch_shape()) samples.set_shape(inferred_shape) return samples * self._sigma + self._mu
def RFF_map(self, input_tensor, seed, stddev, output_dim): """ Refer to the scikit learn package "RFF sampler" and tensorflow RFF mapping. """ random_state = check_random_state(seed) gamma = stddev omega_matrix_shape = [3072, output_dim] bias_shape = [output_dim] """ Tensorflow Version is elaborated below: np.random.seed(9) self._stddev = stddev omega_matrix_shape = [self.arg.dim*2, output_dim] bias_shape = [output_dim] omega_matrix = constant_op.constant( np.random.normal( scale=1.0 / self._stddev, size=omega_matrix_shape), dtype=dtypes.float32) bias = constant_op.constant( np.random.uniform( low=0.0, high=2 * np.pi, size=bias_shape), dtype=dtypes.float32) x_omega_plus_bias = math_ops.add( math_ops.matmul(input_tensor, omega_matrix), bias) """ omega_matrix = constant_op.constant(np.sqrt(2 * gamma) * random_state.normal(size=omega_matrix_shape), dtype=dtypes.float32) bias = constant_op.constant( random_state.uniform( 0.0, 2 * np.pi, size=bias_shape), dtype=dtypes.float32) x_omega_plus_bias = math_ops.add( math_ops.matmul(input_tensor, omega_matrix), bias) return math.sqrt(2.0 / output_dim) * math_ops.cos(x_omega_plus_bias)
def sample(self, n, seed=None, name="sample"): """Sample `n` observations from the Student t Distributions. Args: n: `Scalar`, type int32, the number of observations to sample. seed: Python integer, the random seed. name: The name to give this op. Returns: samples: a `Tensor` of shape `(n,) + self.batch_shape + self.event_shape` with values of type `self.dtype`. """ with ops.name_scope(self.name): with ops.op_scope([self._df, self._mu, self._sigma, n], name): n = ops.convert_to_tensor(n, name="n") n_val = tensor_util.constant_value(n) # We use 2 uniform random floats to generate polar random variates. # http://dl.acm.org/citation.cfm?id=179631 # Theorem 2. Let G, H be iid variates, uniformly distributed on [0,1]. # Let theta = 2*pi*H, let R = sqrt(df*(G^(-2/df) - 1)) for df > 0. # Let X = R*cos(theta), and let Y = R*sin(theta). # Then X ~ t_df and Y ~ t_df. # The variates X and Y are not independent. shape = array_ops.concat(0, [array_ops.pack([2, n]), self.batch_shape()]) uniform = random_ops.random_uniform(shape=shape, dtype=self.dtype, seed=seed) samples_g, samples_h = array_ops.unpack(uniform, num=2) theta = (2 * np.pi) * samples_h r = math_ops.sqrt(self._df * (math_ops.pow(samples_g, -2 / self._df) - 1)) samples = r * math_ops.cos(theta) # Provide some hints to shape inference inferred_shape = tensor_shape.vector(n_val).concatenate( self.get_batch_shape()) samples.set_shape(inferred_shape) return samples * self._sigma + self._mu
def decayed_lr(): """Helper to recompute learning rate; most helpful in eager-mode.""" global_step_recomp = math_ops.cast(global_step, dtype) global_step_recomp = math_ops.minimum(global_step_recomp, decay_steps) linear_decayed = (decay_steps - global_step_recomp) / decay_steps variance = initial_variance / ( math_ops.pow(1.0 + global_step_recomp, variance_decay)) std = math_ops.sqrt(variance) noisy_linear_decayed = ( linear_decayed + random_ops.random_normal( linear_decayed.shape, stddev=std)) completed_fraction = global_step_recomp / decay_steps fraction = 2.0 * num_periods * completed_fraction cosine_decayed = 0.5 * ( 1.0 + math_ops.cos(constant_op.constant(math.pi) * fraction)) noisy_linear_cosine_decayed = ( (alpha + noisy_linear_decayed) * cosine_decayed + beta) return math_ops.multiply( learning_rate, noisy_linear_cosine_decayed, name=name)
def __call__(self, step): with ops.name_scope_v2(self.name or "LinearCosineDecay") as name: initial_learning_rate = ops.convert_to_tensor( self.initial_learning_rate, name="initial_learning_rate") dtype = initial_learning_rate.dtype decay_steps = math_ops.cast(self.decay_steps, dtype) num_periods = math_ops.cast(self.num_periods, dtype) alpha = math_ops.cast(self.alpha, dtype) beta = math_ops.cast(self.beta, dtype) global_step_recomp = math_ops.cast(step, dtype) global_step_recomp = math_ops.minimum(global_step_recomp, decay_steps) linear_decayed = (decay_steps - global_step_recomp) / decay_steps completed_fraction = global_step_recomp / decay_steps fraction = 2.0 * num_periods * completed_fraction cosine_decayed = 0.5 * ( 1.0 + math_ops.cos(constant_op.constant(math.pi) * fraction)) linear_cosine_decayed = (alpha + linear_decayed) * cosine_decayed + beta return math_ops.multiply(initial_learning_rate, linear_cosine_decayed, name=name)
def __call__(self, step): with ops.name_scope_v2(self.name or "SGDRDecay") as name: initial_learning_rate = ops.convert_to_tensor( self.initial_learning_rate, name="initial_learning_rate") dtype = initial_learning_rate.dtype first_decay_steps = math_ops.cast(self.first_decay_steps, dtype) alpha = math_ops.cast(self.alpha, dtype) t_mul = math_ops.cast(self._t_mul, dtype) m_mul = math_ops.cast(self._m_mul, dtype) global_step_recomp = math_ops.cast(step, dtype) completed_fraction = global_step_recomp / first_decay_steps def compute_step(completed_fraction, geometric=False): """Helper for `cond` operation.""" if geometric: i_restart = math_ops.floor( math_ops.log(1.0 - completed_fraction * (1.0 - t_mul)) / math_ops.log(t_mul)) sum_r = (1.0 - t_mul**i_restart) / (1.0 - t_mul) completed_fraction = (completed_fraction - sum_r) / t_mul**i_restart else: i_restart = math_ops.floor(completed_fraction) completed_fraction -= i_restart return i_restart, completed_fraction i_restart, completed_fraction = control_flow_ops.cond( math_ops.equal(t_mul, 1.0), lambda: compute_step(completed_fraction, geometric=False), lambda: compute_step(completed_fraction, geometric=True)) m_fac = m_mul**i_restart cosine_decayed = 0.5 * m_fac * (1.0 + math_ops.cos( constant_op.constant(math.pi) * completed_fraction)) decayed = (1 - alpha) * cosine_decayed + alpha return math_ops.multiply(initial_learning_rate, decayed, name=name)
def decayed_lr(learning_rate, global_step, decay_steps, num_periods, alpha, beta, name): """Helper to recompute learning rate; most helpful in eager-mode.""" with ops.name_scope(name, "LinearCosineDecay", [learning_rate, global_step]) as name: learning_rate = ops.convert_to_tensor(learning_rate, name="learning_rate") dtype = learning_rate.dtype decay_steps = math_ops.cast(decay_steps, dtype) num_periods = math_ops.cast(num_periods, dtype) alpha = math_ops.cast(alpha, dtype) beta = math_ops.cast(beta, dtype) global_step_recomp = math_ops.cast(global_step, dtype) global_step_recomp = math_ops.minimum(global_step_recomp, decay_steps) linear_decayed = (decay_steps - global_step_recomp) / decay_steps completed_fraction = global_step_recomp / decay_steps fraction = 2.0 * num_periods * completed_fraction cosine_decayed = 0.5 * ( 1.0 + math_ops.cos(constant_op.constant(math.pi) * fraction)) linear_cosine_decayed = (alpha + linear_decayed) * cosine_decayed + beta return math_ops.multiply(learning_rate, linear_cosine_decayed, name=name)
def get_multi_engine_graph_def(mode="FP32"): """Create a simple graph and return its graph_def.""" dtype = dtypes.float32 if mode.upper() == "FP16": dtype = dtypes.float16 else: pass g = ops.Graph() with g.as_default(): x = aops.placeholder(shape=[None, 3, 7, 5], name="input", dtype=dtype) with g.name_scope("Global_scope"): with g.name_scope("first_scope"): e = cop.constant( np.random.randn(3, 2, 3, 4), name="weights", dtype=dtype) conv = nn.conv2d( input=x, filter=e, data_format="NCHW", strides=[1, 1, 1, 1], padding="VALID", name="conv") b = cop.constant(np.random.randn(1, 4, 1, 1), name="bias1", dtype=dtype) t = conv * b b = cop.constant(np.random.randn(1, 4, 1, 1), name="bias2", dtype=dtype) q = conv / b edge = mops.sin(q) edge1 = mops.cos(conv) with g.name_scope("test_scope"): de = edge + edge1 t -= edge1 q *= edge t += q t -= de k = aops.squeeze(t, name="output") print(k.dtype) return g.as_graph_def()
def _sine_discontinuity(value): """A special case for dealing with discontinuities. Decides whether `value` is close to an integer, and if so computes: lim x->n |sin(x * pi)| / sin(x * pi) = sign(sin(n * pi)) = cos(n * pi) Args: value: The floating point Tensor value which may lead to a discontinuity. Returns: A tuple of (is_discontinuous, sign): is_discontinuous: A boolean Tensor of the same shape as `value`, indicating whether it is near an integer. sign: A floating point Tensor indicating the sign of the discontinuity (being near 1 or -1 when `is_discontinuous` is True), of the same shape and type as `value`. """ normalized = value / num_latent_values_float is_discontinuous = self._close_to_integer(normalized) sign = math_ops.cos(normalized * numpy.pi) return is_discontinuous, sign
def _SinGrad(op, grad): """Returns grad * cos(x).""" x = op.inputs[0] with ops.control_dependencies([grad]): x = math_ops.conj(x) return grad * math_ops.cos(x)
def noisy_linear_cosine_decay(learning_rate, global_step, decay_steps, initial_variance=1.0, variance_decay=0.55, num_periods=0.5, alpha=0.0, beta=0.001, name=None): """Applies noisy linear cosine decay to the learning rate. See [Bello et al., ICML2017] Neural Optimizer Search with RL. https://arxiv.org/abs/1709.07417 Note that linear cosine decay is more aggressive than cosine decay and larger initial learning rates can typically be used. When training a model, it is often recommended to lower the learning rate as the training progresses. This function applies a noisy linear cosine decay function to a provided initial learning rate. It requires a `global_step` value to compute the decayed learning rate. You can just pass a TensorFlow variable that you increment at each training step. The function returns the decayed learning rate. It is computed as: ```python global_step = min(global_step, decay_steps) linear_decay = (decay_steps - global_step) / decay_steps) cosine_decay = 0.5 * ( 1 + cos(pi * 2 * num_periods * global_step / decay_steps)) decayed = (alpha + linear_decay + eps_t) * cosine_decay + beta decayed_learning_rate = learning_rate * decayed ``` where eps_t is 0-centered gaussian noise with variance initial_variance / (1 + global_step) ** variance_decay Example usage: ```python decay_steps = 1000 lr_decayed = noisy_linear_cosine_decay( learning_rate, global_step, decay_steps) ``` Args: learning_rate: A scalar `float32` or `float64` Tensor or a Python number. The initial learning rate. global_step: A scalar `int32` or `int64` `Tensor` or a Python number. Global step to use for the decay computation. decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number. Number of steps to decay over. initial_variance: initial variance for the noise. See computation above. variance_decay: decay for the noise's variance. See computation above. num_periods: Number of periods in the cosine part of the decay. See computation above. alpha: See computation above. beta: See computation above. name: String. Optional name of the operation. Defaults to 'NoisyLinearCosineDecay'. Returns: A scalar `Tensor` of the same type as `learning_rate`. The decayed learning rate. Raises: ValueError: if `global_step` is not supplied. """ if global_step is None: raise ValueError("noisy linear cosine decay requires global_step") with ops.name_scope(name, "NoisyLinearCosineDecay", [learning_rate, global_step]) as name: learning_rate = ops.convert_to_tensor(learning_rate, name="learning_rate") dtype = learning_rate.dtype global_step = math_ops.cast(global_step, dtype) decay_steps = math_ops.cast(decay_steps, dtype) global_step = math_ops.minimum(global_step, decay_steps) initial_variance = math_ops.cast(initial_variance, dtype) variance_decay = math_ops.cast(variance_decay, dtype) num_periods = math_ops.cast(num_periods, dtype) alpha = math_ops.cast(alpha, dtype) beta = math_ops.cast(beta, dtype) linear_decayed = (decay_steps - global_step) / decay_steps variance = initial_variance / ( math_ops.pow(1.0 + global_step, variance_decay)) std = math_ops.sqrt(variance) noisy_linear_decayed = ( linear_decayed + random_ops.random_normal( linear_decayed.shape, stddev=std)) completed_fraction = global_step / decay_steps fraction = 2.0 * num_periods * completed_fraction cosine_decayed = 0.5 * ( 1.0 + math_ops.cos(constant_op.constant(math.pi) * fraction)) noisy_linear_cosine_decayed = ( (alpha + noisy_linear_decayed) * cosine_decayed + beta) return math_ops.multiply( learning_rate, noisy_linear_cosine_decayed, name=name)
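A NumPy sketch of the noise-free part of the schedule in the docstring above (the gaussian term eps_t set to zero), useful for eyeballing the curve; the initial rate and step counts below are illustrative.

import numpy as np

def linear_cosine_decay_value(lr, step, decay_steps,
                              num_periods=0.5, alpha=0.0, beta=0.001):
  # The docstring's formula with eps_t omitted.
  step = min(step, decay_steps)
  linear_decay = (decay_steps - step) / decay_steps
  cosine_decay = 0.5 * (
      1 + np.cos(np.pi * 2 * num_periods * step / decay_steps))
  return lr * ((alpha + linear_decay) * cosine_decay + beta)

values = [linear_cosine_decay_value(0.1, s, 1000) for s in (0, 500, 1000)]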