def decode_greedy(self, sequence_length, first_word):
    """Greedily decodes a sequence of words seeded with `first_word`.

    On every step the current token id is embedded, pushed through the LSTM
    cell together with the running state, projected to logits, and the most
    probable token is fed back in as the next input.

    Args:
      sequence_length: Number of decoding steps to perform.
      first_word: Word used as the first element of the output and as the
        first input to the LSTM cell.

    Returns:
      A Python list that starts with `first_word` and has one extra entry per
      decoding step.
    """
    state = self._lstm_cell.get_initial_state(dtype=tf.float32, batch_size=1)
    decoded = [first_word]
    # A leading axis is added so the id has a batch dimension of one.
    token_id = tf.expand_dims(self._words_to_indices(first_word), 0)

    for _ in range(sequence_length):
        embedded = tf.nn.embedding_lookup(self._embeddings, token_id)
        cell_output, state = self._lstm_cell(embedded, state)
        flat_output = tf.reshape(cell_output,
                                 [-1, self._lstm_cell.output_size])
        probabilities = tf.nn.softmax(self._logit_layer(flat_output))
        # Greedy choice: keep only the highest-probability token.
        token_id = tf.math.argmax(probabilities, axis=1)
        decoded.append(self._indices_to_words(token_id)[0])

    return decoded
def _map_fn(features, labels):
    """Runs `module` on one example by temporarily adding a leading axis.

    Args:
      features: Input passed to `module` after a size-1 axis is prepended.
      labels: Returned untouched.

    Returns:
      A `(features, labels)` tuple where `features` is the output of `module`
      with the leading axis removed again.
    """
    batched = tf.expand_dims(features, 0)
    transformed = module(batched)
    return tf.squeeze(transformed, 0), labels
def __init__(self,
             dim: int,
             mean_reversion: types.RealTensor,
             volatility: Union[types.RealTensor, Callable[..., types.RealTensor]],
             initial_discount_rate_fn,
             corr_matrix: types.RealTensor = None,
             dtype: tf.DType = None,
             name: str = None):
    """Initializes the HJM model.

    Args:
      dim: A Python scalar which corresponds to the number of factors
        comprising the model.
      mean_reversion: A real positive `Tensor` of shape `[dim]`. Corresponds
        to the mean reversion rate of each factor.
      volatility: A real positive `Tensor` of the same `dtype` and shape as
        `mean_reversion` or a callable with the following properties:
        (a) The callable should accept a scalar `Tensor` `t` and return a 1-D
        `Tensor` of shape `[dim]`. The function returns the instantaneous
        volatility `sigma(t)`.
        When `volatility` is specified as a real `Tensor`, each factor is
        assumed to have a constant instantaneous volatility.
        Corresponds to the instantaneous volatility of each factor.
      initial_discount_rate_fn: A Python callable that accepts expiry time as
        a real `Tensor` of the same `dtype` as `mean_reversion` and returns a
        `Tensor` of shape `input_shape`. Corresponds to the zero coupon bond
        yield at the present time for the input expiry time.
      corr_matrix: A `Tensor` of shape `[dim, dim]` and the same `dtype` as
        `mean_reversion`. Corresponds to the correlation matrix `Rho`.
        Default value: `None`, in which case the identity matrix is used.
      dtype: The default dtype to use when converting values to `Tensor`s.
        Default value: `None` which maps to `tf.float32`.
      name: Python string. The name to give to the ops created by this class.
        Default value: `None` which maps to the default name
        `gaussian_hjm_model`.
    """
    self._name = name or 'gaussian_hjm_model'
    with tf.name_scope(self._name):
        self._dtype = dtype or tf.float32
        self._dim = dim
        self._factors = dim

        def _instant_forward_rate_fn(t):
            # Instantaneous forward rate, computed as minus the forward
            # gradient of `-r(x) * x` with respect to `x`, evaluated at `t`.
            t = tf.convert_to_tensor(t, dtype=self._dtype)

            def _log_zero_coupon_bond(x):
                r = tf.convert_to_tensor(initial_discount_rate_fn(x),
                                         dtype=self._dtype)
                return -r * x

            rate = -gradient.fwd_gradient(
                _log_zero_coupon_bond,
                t,
                use_gradient_tape=True,
                unconnected_gradients=tf.UnconnectedGradients.ZERO)
            return rate

        def _initial_discount_rate_fn(t):
            # Wraps the user callable so its output is always a `Tensor` of
            # `self._dtype`.
            return tf.convert_to_tensor(initial_discount_rate_fn(t),
                                        dtype=self._dtype)

        self._instant_forward_rate_fn = _instant_forward_rate_fn
        self._initial_discount_rate_fn = _initial_discount_rate_fn
        # NOTE(review): the conversions below use the raw `dtype` argument
        # (which may be `None`) rather than `self._dtype`, so with
        # `dtype=None` the dtype is inferred from the input instead of being
        # forced to `tf.float32` - confirm this is intended.
        self._mean_reversion = tf.convert_to_tensor(mean_reversion,
                                                    dtype=dtype,
                                                    name='mean_reversion')

        # This model is not batched.
        self._batch_shape = []
        self._batch_rank = 0

        # Setup volatility
        if callable(volatility):
            self._volatility = volatility
        else:
            volatility = tf.convert_to_tensor(volatility, dtype=dtype)
            # One empty list of jump locations per factor: a constant
            # volatility is represented as a piecewise constant function
            # without jumps.
            jump_locations = [[]] * dim
            volatility = tf.expand_dims(volatility, axis=-1)
            self._volatility = piecewise.PiecewiseConstantFunc(
                jump_locations=jump_locations, values=volatility, dtype=dtype)

        if corr_matrix is None:
            corr_matrix = tf.eye(dim, dim, dtype=self._dtype)
        self._rho = tf.convert_to_tensor(corr_matrix, dtype=dtype, name='rho')
        self._sqrt_rho = tf.linalg.cholesky(self._rho)

        # Volatility function
        def _vol_fn(t, state):
            """Volatility function of Gaussian-HJM."""
            del state
            volatility = self._volatility(tf.expand_dims(
                t, -1))  # shape=(dim, 1)

            return self._sqrt_rho * volatility

        # Drift function
        def _drift_fn(t, state):
            """Drift function of Gaussian-HJM."""
            x = state
            # shape = [self._factors, self._factors]
            y = self.state_y(tf.expand_dims(t, axis=-1))[..., 0]
            drift = tf.math.reduce_sum(y, axis=-1) - self._mean_reversion * x
            return drift

        self._exact_discretization_setup(dim)
        # Starting the MRO lookup *after* `QuasiGaussianHJM` bypasses that
        # class's own `__init__` and calls the next initializer in the chain.
        super(quasi_gaussian_hjm.QuasiGaussianHJM,
              self).__init__(dim, _drift_fn, _vol_fn, self._dtype, self._name)
def call(self, inputs):
    """Propagates the first two moments of `inputs` through the dense layer.

    When neither the inputs nor the layer weights are random variables, the
    parent layer's `call` is used. Otherwise the output mean and full
    covariance are computed in closed form and, for a ReLU activation,
    pushed through the activation using the moment formulas of
    Wu et al. (2018).

    Args:
      inputs: Layer input; either a deterministic value or a
        `random_variable.RandomVariable`.

    Returns:
      The parent layer's output in the deterministic case, otherwise a
      `MultivariateNormalFullCovariance` random variable.

    Raises:
      NotImplementedError: If the activation is neither ReLU nor linear/None.
    """
    is_stochastic = any(
        isinstance(value, random_variable.RandomVariable)
        for value in (inputs, self.kernel, self.bias))
    if not is_stochastic:
        return super(DenseDVI, self).call(inputs)

    self.call_weights()
    inputs_mean, inputs_variance, inputs_covariance = get_moments(inputs)
    kernel_mean, kernel_variance, _ = get_moments(self.kernel)

    # Mean of the pre-activation: E[inputs] E[kernel] (+ E[bias]).
    mean = tf.tensordot(inputs_mean, kernel_mean, [[-1], [0]])
    if self.use_bias:
        bias_mean, _, bias_covariance = get_moments(self.bias)
        mean = tf.nn.bias_add(mean, bias_mean)

    # Covariance of the pre-activation:
    #   E[inputs**2] Cov(kernel) + E[W]^T Cov(inputs) E[W] + Cov(bias).
    # The kernel is treated as having zero off-diagonal covariance, so the
    # first term only contributes to the diagonal.
    second_moment = inputs_variance + inputs_mean**2
    covariance_diag = tf.tensordot(second_moment, kernel_variance,
                                   [[-1], [0]])

    # Quadratic form, evaluated right-to-left:
    # [..., features, features] x [features, units] -> [..., features, units]
    cov_w = tf.tensordot(inputs_covariance, kernel_mean, [[-1], [0]])
    # [..., features, units] x [features, units] -> [..., units, units]
    covariance = tf.tensordot(cov_w, kernel_mean, [[-2], [0]])
    if self.use_bias:
        covariance = covariance + bias_covariance
    covariance = tf.linalg.set_diag(
        covariance, tf.linalg.diag_part(covariance) + covariance_diag)

    if self.activation in (tf.keras.activations.relu, tf.nn.relu):
        # Activation moments; variable names follow Wu et al. (2018).
        eps = tf.keras.backend.epsilon()
        std_normal = tfp.distributions.Normal(0., 1.)

        variance = tf.linalg.diag_part(covariance)
        scale = tf.sqrt(variance)
        mu = mean / (scale + eps)
        mean = scale * soft_relu(mu)

        # [..., units, units]
        pairwise_variances = (tf.expand_dims(variance, -1) *
                              tf.expand_dims(variance, -2))
        rho = covariance / tf.sqrt(pairwise_variances + eps)
        rho_limit = 1. / (1. + eps)
        rho = tf.clip_by_value(rho, -rho_limit, rho_limit)
        s = covariance / (rho + eps)

        mu1 = tf.expand_dims(mu, -1)  # [..., units, 1]
        mu2 = tf.linalg.matrix_transpose(mu1)  # [..., 1, units]
        a = (soft_relu(mu1) * soft_relu(mu2) +
             rho * std_normal.cdf(mu1) * std_normal.cdf(mu2))

        gh = tf.asinh(rho)
        bar_rho = tf.sqrt(1. - rho**2)
        gr = gh + rho / (1. + bar_rho)
        # Magnitude-based, epsilon-padded versions of gr and rho are used
        # wherever they are multiplied or divided: gr*rho and rho/gr are
        # always positive.
        safe_gr = tf.abs(gr) + 0.5 * eps
        safe_rho = tf.abs(rho) + eps
        exponent = (-safe_rho / (2. * safe_gr * (1 + bar_rho)) +
                    (gh - rho) / (safe_gr * safe_rho) * mu1 * mu2)
        exp_negative_q = gr / (2. * math.pi) * tf.exp(exponent)
        covariance = s * (a + exp_negative_q)
    elif self.activation not in (tf.keras.activations.linear, None):
        raise NotImplementedError('Activation is {}. Deterministic variational '
                                  'inference is only available if activation is '
                                  'ReLU or None.'.format(self.activation))

    return generated_random_variables.MultivariateNormalFullCovariance(
        mean, covariance)
def vol_fn(_, x):
    """Returns ones shaped like `x` with one extra trailing axis.

    Args:
      _: Ignored (time argument).
      x: State whose shape and dtype determine the output.

    Returns:
      `tf.ones_like(x)` expanded along a new last axis.
    """
    unit_volatility = tf.ones_like(x)
    return tf.expand_dims(unit_volatility, -1)
def drift_fn(t, x):
    """Returns the enclosing-scope `drift` as a float32 tensor.

    Args:
      t: Ignored.
      x: Ignored.

    Returns:
      `drift` converted to a `tf.float32` constant with a new leading axis.
    """
    del t, x  # The drift depends on neither time nor state.
    constant_drift = tf.constant(drift, dtype=tf.float32)
    return tf.expand_dims(constant_drift, 0)
def _observation_log_probs(self, observations, mask):
    """Computes per-step, per-state log-probabilities of `observations`.

    Notation: E is the underlying event shape, M the number of steps of the
    HMM and N its number of states.

    Incoming shapes:

      observations : batch_o [M] E
      mask         : batch_m [M]        (optional)

    The batch shapes of the observations, of the mask and of this
    distribution are broadcast together into a single shape `batch`. The
    step axis is then moved to the front (so the result can be folded or
    scanned over) and a singleton axis is inserted for the states:

      observations : [M] batch [1] [E]

    so that `log_prob` of the observation distribution yields

      observation_log_probs : [M] batch [N]

    The mask, when given, is brought to the same `[M] batch [N]` shape and
    every entry where it is `True` is replaced by zero.

    Args:
      observations: Observed values.
      mask: Optional boolean tensor, or `None`.

    Returns:
      The log-probabilities, zeroed wherever `mask` is `True`.
    """
    event_rank = self._underlying_event_rank
    has_mask = mask is not None

    # Split the dynamic shape into batch part and [M] + E part.
    full_shape = tf.shape(input=observations)
    split_at = -1 - event_rank
    observation_batch_shape = full_shape[:split_at]
    steps_and_event_shape = full_shape[split_at:]

    batch_shape = tf.broadcast_dynamic_shape(observation_batch_shape,
                                             self.batch_shape_tensor())
    if has_mask:
        mask_batch_shape = tf.shape(mask)[:-1]
        batch_shape = tf.broadcast_dynamic_shape(batch_shape,
                                                 mask_batch_shape)

    observations = tf.broadcast_to(
        observations, tf.concat([batch_shape, steps_and_event_shape], axis=0))
    observation_rank = tf.rank(observations)
    # Bring the step axis to the front ...
    observations = distribution_util.move_dimension(
        observations, observation_rank - event_rank - 1, 0)
    # ... and add the broadcastable state axis just before the event axes.
    observations = tf.expand_dims(observations,
                                  observation_rank - event_rank)
    log_probs = self._observation_distribution.log_prob(observations)

    if not has_mask:
        return log_probs

    full_mask = tf.broadcast_to(
        mask, tf.concat([batch_shape, [self._num_steps]], axis=0))
    full_mask = distribution_util.move_dimension(full_mask, -1, 0)
    full_mask = tf.expand_dims(full_mask, -1)
    full_mask = tf.broadcast_to(full_mask, tf.shape(log_probs))
    return tf1.where(full_mask, tf.zeros_like(log_probs), log_probs)
def prepare_grid(*, times, time_step, dtype, num_time_steps=None,
                 times_grid=None):
    """Prepares grid of times for path generation.

    Args:
      times: Rank 1 `Tensor` of increasing positive real values. The times at
        which the path points are to be evaluated.
      time_step: Rank 0 real `Tensor`. Maximal distance between points in
        resulting grid.
      dtype: `tf.Dtype` of the input and output `Tensor`s.
      num_time_steps: Number of points on the grid. If supplied, a uniform
        grid is constructed for `[time_step, times[-1] - time_step]`
        consisting of max(0, num_time_steps - len(times)) points that is then
        concatenated with times. This parameter guarantees the number of
        points on the time grid is `max(len(times), num_time_steps)` and that
        `times` are included to the grid.
        Default value: `None`, which means that a uniform grid is created
        containing all points from `times` and the uniform grid of points
        between `[0, times[-1]]` with grid size equal to `time_step`.
      times_grid: An optional rank 1 `Tensor` representing time
        discretization grid. If `times` are not on the grid, then the nearest
        points from the grid are used (times beyond the last grid point map
        to the last grid point).
        Default value: `None`, which means that times grid is computed using
        `time_step` and `num_time_steps`.

    Returns:
      Tuple `(all_times, mask, time_indices)`.
      `all_times` is a 1-D real `Tensor`. If `num_time_steps` is supplied the
      shape of the output is `max(num_time_steps, len(times))`. Otherwise it
      consists of all points from `times` and the uniform grid of points
      between `[0, times[-1]]` with grid size equal to `time_step`. When
      `times_grid` is supplied it is returned as is.
      `mask` is a boolean 1-D `Tensor` of the same shape as `all_times`,
      showing which elements of `all_times` correspond to the values from
      `times`. The original contract states that `all_times[0]=0` and
      `mask[0]=False`; this relies on the grid-building helpers and is not
      enforced for a user-supplied `times_grid`.
      `time_indices` is an integer `Tensor` of the same shape as `times`
      indicating `times` indices in `all_times`.
    """
    if times_grid is None:
        if num_time_steps is None:
            all_times, time_indices = _grid_from_time_step(
                times=times, time_step=time_step, dtype=dtype)
        else:
            all_times, time_indices = _grid_from_num_times(
                times=times, time_step=time_step,
                num_time_steps=num_time_steps)
    else:
        all_times = times_grid
        # `searchsorted` returns, for each time, the index of the first grid
        # point that is >= that time. For a time past the end of the grid
        # this is `len(times_grid)`, which is not a valid index, so the upper
        # candidate is clamped to the last grid point before gathering.
        insertion_indices = tf.searchsorted(times_grid, times)
        last_index = tf.shape(times_grid)[-1] - 1
        upper_indices = tf.minimum(insertion_indices, last_index)
        lower_indices = tf.maximum(insertion_indices - 1, 0)
        # Pick whichever neighbouring grid point is closer to each time; ties
        # go to the lower neighbour.
        upper_distance = tf.math.abs(
            tf.gather(times_grid, upper_indices) - times)
        lower_distance = tf.math.abs(
            tf.gather(times_grid, lower_indices) - times)
        time_indices = tf.where(lower_distance > upper_distance,
                                upper_indices, lower_indices)
    # Create a boolean mask to identify the iterations that have to be
    # recorded. Use `tf.scatter_nd` because it handles duplicates.
    mask = tf.scatter_nd(
        indices=tf.expand_dims(tf.cast(time_indices, dtype=tf.int64), axis=1),
        updates=tf.fill(tf.shape(times), True),
        shape=tf.shape(all_times, out_type=tf.int64))
    return all_times, mask, time_indices
def main():
    """Downloads Inception5h, loads its graph and renders a DeepDream image.

    Steps:
      1. Download and extract the pre-trained model archive if it is not
         already present.
      2. Load the frozen graph into a new TensorFlow graph/session with a
         mean-subtracted, batched input placeholder.
      3. Report how many convolution layers / feature channels were imported.
      4. Run octave-wise gradient ascent on `image.jpg`.

    NOTE(review): `url`, `data_dir`, `img_noise`, `resize`,
    `calc_grad_tiled`, `showarray` and `T` are not defined inside this
    function; they are presumably provided at module level - confirm.
    """
    # Download google pre-trained neural network.
    local_zip_file = 'inception5h.zip'
    if not os.path.exists(local_zip_file):
        # Download (the response is closed as soon as it has been saved).
        with urllib.request.urlopen(url) as response, \
                open(local_zip_file, 'wb') as output:
            output.write(response.read())
        # Extract.
        with zipfile.ZipFile(local_zip_file, 'r') as zip_ref:
            zip_ref.extractall(data_dir)

    # Name of the frozen graph file inside the extracted archive.
    model_fn = 'tensorflow_inception_graph.pb'

    # Create the tf session and load the model. The interactive session
    # installs itself as the default session, so the handle is kept only for
    # that side effect.
    graph = tf.Graph()
    sess = tfc.InteractiveSession(graph=graph)  # noqa: F841
    # The serialized GraphDef lives in the extracted `.pb` file, not in the
    # zip archive itself.
    with tfc.gfile.FastGFile(os.path.join(data_dir, model_fn), 'rb') as f:
        graph_def = tfc.GraphDef()
        graph_def.ParseFromString(f.read())
    # Define the input tensor.
    # NOTE(review): `tfc` is assumed to be `tensorflow.compat.v1`, matching
    # its use for `InteractiveSession` and `gfile` above.
    t_input = tfc.placeholder(np.float32, name='input')
    imagenet_mean = 117.0
    t_preprocessed = tf.expand_dims(t_input - imagenet_mean, 0)
    tf.import_graph_def(graph_def, {'input': t_preprocessed})

    layers = [
        op.name for op in graph.get_operations()
        if op.type == 'Conv2D' and 'import/' in op.name
    ]
    feature_nums = [
        int(graph.get_tensor_by_name(name + ':0').get_shape()[-1])
        for name in layers
    ]

    print('Number of layers: ', len(layers))
    print('Total numbers of feature channels:', sum(feature_nums))

    def render_deepdream(t_obj, img0=img_noise, iter_n=10, step=1.5,
                         octave_n=4, octave_scale=1.4):
        """Runs multi-octave gradient ascent on `img0` to maximise `t_obj`."""
        # Optimization objective and its gradient w.r.t. the input image.
        t_score = tf.reduce_mean(t_obj)
        t_grad = tf.gradients(t_score, t_input)[0]

        # Split the image into a number of octaves: each pass keeps the
        # down-scaled image and stores the detail lost by down-scaling.
        img = img0
        octaves = []
        for _ in range(octave_n - 1):
            hw = img.shape[:2]
            lo = resize(img, np.int32(np.float32(hw) / octave_scale))
            hi = img - resize(lo, hw)
            img = lo
            octaves.append(hi)

        # Generate details octave by octave, from coarsest to finest.
        for octave in range(octave_n):
            if octave > 0:
                hi = octaves[-octave]
                img = resize(img, hi.shape[:2]) + hi
            for _ in range(iter_n):
                g = calc_grad_tiled(img, t_grad)
                # Normalise the step by the mean gradient magnitude.
                img += g * (step / (np.abs(g).mean() + 1e-7))

            # Output the deep dreamed image for this octave.
            showarray(img / 255.0)

    # Load the image to enhance.
    img0 = PIL.Image.open('image.jpg')
    img0 = np.float32(img0)

    # Apply gradient ascent to the chosen layer.
    render_deepdream(tf.square(T('mixed4c')), img0)
def __init__(self,
             dataset_spec,
             gamma: Union[float, tf.Tensor],
             reward_fn: Optional[Callable] = None,
             solve_for_state_action_ratio: bool = True,
             divergence_limit: Union[float, np.ndarray, tf.Tensor] = 0.0,
             divergence_type: Text = 'rkl',
             nu_learning_rate: Union[float, tf.Tensor] = 0.1,
             zeta_learning_rate: Union[float, tf.Tensor] = 0.1,
             algae_alpha: Union[float, tf.Tensor] = 1.0,
             limit_episodes: Optional[int] = None):
    """Initializes the solver.

    Args:
      dataset_spec: The spec of the dataset that will be given.
      gamma: The discount factor to use.
      reward_fn: A function that takes in an EnvStep and returns the reward
        for that step. If not specified, defaults to just EnvStep.reward.
      solve_for_state_action_ratio: Whether to solve for state-action density
        ratio. Defaults to True. When solving an environment with a large
        state/action space (taxi), better to set this to False to avoid OOM
        issues.
      divergence_limit: The limit on the f-divergence between the weights and
        the empirical distribution.
      divergence_type: The type of f-divergence to use, one of 'kl', 'rkl'
        (reverse KL, the default) or 'chi2'.
      nu_learning_rate: Learning rate for nu.
      zeta_learning_rate: Learning rate for zeta.
      algae_alpha: Regularizer coefficient on Df(dpi || dD).
      limit_episodes: How many episodes to take from the dataset. Defaults to
        None (take whole dataset).

    Raises:
      ValueError: If the dataset lacks log-probabilities while
        `solve_for_state_action_ratio` is False, if the observation or action
        spec is not categorical, or if `divergence_type` is unsupported.
    """

    def _as_vector(value):
        # Converts to float32 and promotes scalars to rank-1 tensors.
        tensor = tf.convert_to_tensor(value, dtype=tf.float32)
        if tf.rank(tensor) < 1:
            tensor = tf.expand_dims(tensor, -1)
        return tensor

    self._dataset_spec = dataset_spec
    self._gamma = gamma
    self._reward_fn = (
        (lambda env_step: env_step.reward) if reward_fn is None else reward_fn)

    self._solve_for_state_action_ratio = solve_for_state_action_ratio
    needs_log_probability = not self._solve_for_state_action_ratio
    if (needs_log_probability and
            not self._dataset_spec.has_log_probability()):
        raise ValueError('Dataset must contain log-probability when '
                         'solve_for_state_action_ratio is False.')

    # Get number of states/actions.
    observation_spec = self._dataset_spec.observation
    action_spec = self._dataset_spec.action
    if not common_lib.is_categorical_spec(observation_spec):
        raise ValueError('Observation spec must be discrete and bounded.')
    self._num_states = observation_spec.maximum + 1

    if not common_lib.is_categorical_spec(action_spec):
        raise ValueError('Action spec must be discrete and bounded.')
    self._num_actions = action_spec.maximum + 1

    # One extra slot on top of the tabular entries.
    if self._solve_for_state_action_ratio:
        table_size = self._num_states * self._num_actions
    else:
        table_size = self._num_states
    self._dimension = 1 + table_size

    # For learning data weight: the limit is duplicated along the last axis.
    self._divergence_limit = _as_vector(divergence_limit)
    self._two_sided_limit = tf.concat(
        [self._divergence_limit, self._divergence_limit], -1)
    self._num_limits = int(self._two_sided_limit.shape[0])
    # The lagrange multiplier w.r.t. data weight constraint.
    self._alpha = tf.Variable(np.zeros(self._two_sided_limit.shape),
                              dtype=tf.float32)

    self._algae_alpha = _as_vector(algae_alpha)
    if self._algae_alpha.shape[-1] != self._two_sided_limit.shape[-1]:
        # Broadcast the coefficient to one value per limit.
        self._algae_alpha *= tf.ones_like(self._two_sided_limit)
    # +1 where algae_alpha >= 0, -1 elsewhere.
    self._algae_alpha_sign = 2 * (
        tf.cast(self._algae_alpha >= 0, tf.float32) - 0.5)

    self._divergence_type = divergence_type
    if self._divergence_type not in ['kl', 'rkl', 'chi2']:
        raise ValueError('Unsupported divergence type %s.' %
                         self._divergence_type)

    self._nu_learning_rate = nu_learning_rate
    self._zeta_learning_rate = zeta_learning_rate

    # We have two variables to counteract the bias introduced by algae_alpha.
    table_shape = [self._dimension, self._num_limits]
    self._nu = tf.zeros(table_shape)
    self._nu2 = tf.zeros(table_shape)
    self._zeta = tf.zeros(table_shape)
    self._zeta2 = tf.zeros(table_shape)

    self._limit_episodes = limit_episodes
def train_step(self,
               dataset: dataset_lib.OffpolicyDataset,
               target_policy: tf_policy.TFPolicy,
               regularizer: float = 1e-6):
    """Performs single iteration of CoinDICE.

    The method works on the quantities cached on `self` (for example
    `self._a_vec`, `self._weighted_rewards`, `self._nu_indices`); `dataset`
    and `target_policy` are accepted for interface compatibility but are not
    read here.

    Args:
      dataset: The dataset to sample experience from.
      target_policy: The policy whose value we want to estimate.
      regularizer: A small constant added to the diagonal of the Hessians
        before solving the Newton systems.

    Returns:
      A list `[saddle_estimate, primal_estimate, divergence]` where the first
      two entries are the weighted average saddle loss and the summed primal
      loss, each multiplied by the sign of `algae_alpha`, and `divergence` is
      the divergence of the final data weights.
    """
    # First compute Lagrangian loss.
    saddle_bellman_residuals = (tf.matmul(self._a_vec, self._nu) -
                                self._weighted_rewards[:, None])
    saddle_bellman_residuals *= -1 * self._algae_alpha_sign
    saddle_zetas = tf.gather(self._zeta, self._nu_indices)
    saddle_initial_nu_values = tf.reduce_sum(  # Average over actions.
        self._initial_target_probs[:, :, None] *
        tf.gather(self._nu, self._initial_nu_indices),
        axis=1)
    saddle_init_nu_loss = ((1 - self._gamma) * saddle_initial_nu_values *
                           self._algae_alpha_sign)

    # This second optimization switches the sign of algae_alpha.
    # We add these two together to get the final loss, and thus counteract
    # the bias introduced by algae_alpha.
    saddle_bellman_residuals2 = (tf.matmul(self._a_vec, self._nu2) -
                                 self._weighted_rewards[:, None])
    saddle_bellman_residuals2 *= 1 * self._algae_alpha_sign
    saddle_zetas2 = tf.gather(self._zeta2, self._nu_indices)
    saddle_initial_nu_values2 = tf.reduce_sum(  # Average over actions.
        self._initial_target_probs[:, :, None] *
        tf.gather(self._nu2, self._initial_nu_indices),
        axis=1)
    saddle_init_nu_loss2 = ((1 - self._gamma) * saddle_initial_nu_values2 *
                            -1 * self._algae_alpha_sign)

    saddle_loss = 0.5 * (
        saddle_init_nu_loss + saddle_bellman_residuals * saddle_zetas +
        -tf.math.abs(self._algae_alpha) * 0.5 * tf.square(saddle_zetas) +
        -saddle_init_nu_loss2 + -saddle_bellman_residuals2 * saddle_zetas2 +
        tf.math.abs(self._algae_alpha) * 0.5 * tf.square(saddle_zetas2))

    # Find optimal weights by doing binary search on alpha (lambda in the
    # paper). The search bounds get one entry per limit so that they match
    # the shape of `self._alpha` for any number of divergence limits (for a
    # scalar `divergence_limit` this is `[-8., -8.]` / `[32., 32.]`).
    limits_shape = tf.shape(self._two_sided_limit)
    left = tf.fill(limits_shape, -8.)
    right = tf.fill(limits_shape, 32.)
    for _ in range(16):
        mid = 0.5 * (left + right)
        self._alpha.assign(mid)
        weights, log_weights = self._get_weights(saddle_loss)

        divergence = self._compute_divergence(weights, log_weights)
        divergence_violation = divergence - self._two_sided_limit
        # Move whichever bound keeps the violation's sign change bracketed.
        left = tf.where(divergence_violation > 0., mid, left)
        right = tf.where(divergence_violation > 0., right, mid)
    self._alpha.assign(0.5 * (left + right))
    weights, log_weights = self._get_weights(saddle_loss)

    # Now that we have weights, we reconstruct the Bellman residual matrices.
    data_weights = tf.stop_gradient(weights)
    avg_saddle_loss = (tf.reduce_sum(data_weights * saddle_loss, axis=0) /
                       tf.reduce_sum(data_weights, axis=0))

    weighted_state_action_count = tf.reduce_sum(
        tf.one_hot(self._nu_indices, self._dimension)[:, :, None] *
        weights[:, None, :],
        axis=0)
    weighted_state_action_count = tf.gather(weighted_state_action_count,
                                            self._nu_indices)
    my_td_mat = tf.einsum('ai, ab, ab, aj -> bij',
                          tf.one_hot(self._nu_indices, self._dimension),
                          1.0 / weighted_state_action_count, weights,
                          self._a_vec)
    my_bias = tf.reduce_sum(
        tf.transpose(weights)[:, :, None] *
        tf.one_hot(self._nu_indices, self._dimension)[None, :, :] *
        tf.reshape(self._weighted_rewards, [1, -1, 1]) * 1.0 /
        tf.transpose(weighted_state_action_count)[:, :, None],
        axis=1)

    # Solve for nu using primal form; i.e., E[(nu - B nu)^2] - (1-g) * E[nu0].
    # The tape is persistent because it is queried for gradients inside the
    # context and for their Jacobians (the Hessians) afterwards.
    with tf.GradientTape(watch_accessed_variables=False,
                         persistent=True) as tape:
        tape.watch([self._nu, self._nu2, self._alpha])
        bellman_residuals = tf.matmul(
            my_td_mat,
            tf.transpose(self._nu)[:, :, None]) - my_bias[:, :, None]
        bellman_residuals = tf.transpose(tf.squeeze(bellman_residuals, -1))
        bellman_residuals = tf.gather(bellman_residuals, self._nu_indices)
        initial_nu_values = tf.reduce_sum(  # Average over actions.
            self._initial_target_probs[:, :, None] *
            tf.gather(self._nu, self._initial_nu_indices),
            axis=1)

        bellman_residuals *= self._algae_alpha_sign

        init_nu_loss = ((1 - self._gamma) * initial_nu_values *
                        self._algae_alpha_sign)

        nu_loss = (tf.math.square(bellman_residuals) / 2.0 +
                   tf.math.abs(self._algae_alpha) * init_nu_loss)

        loss = (data_weights * nu_loss /
                tf.reduce_sum(data_weights, axis=0, keepdims=True))

        bellman_residuals2 = tf.matmul(
            my_td_mat,
            tf.transpose(self._nu2)[:, :, None]) - my_bias[:, :, None]
        bellman_residuals2 = tf.transpose(
            tf.squeeze(bellman_residuals2, -1))
        bellman_residuals2 = tf.gather(bellman_residuals2, self._nu_indices)
        initial_nu_values2 = tf.reduce_sum(  # Average over actions.
            self._initial_target_probs[:, :, None] *
            tf.gather(self._nu2, self._initial_nu_indices),
            axis=1)

        bellman_residuals2 *= -1 * self._algae_alpha_sign

        init_nu_loss2 = ((1 - self._gamma) * initial_nu_values2 * -1 *
                         self._algae_alpha_sign)

        nu_loss2 = (tf.math.square(bellman_residuals2) / 2.0 +
                    tf.math.abs(self._algae_alpha) * init_nu_loss2)

        loss2 = (data_weights * nu_loss2 /
                 tf.reduce_sum(data_weights, axis=0, keepdims=True))

        divergence = self._compute_divergence(weights, log_weights)
        divergence_violation = divergence - self._two_sided_limit

        # Extra loss if for the 'terminal' state (index = -1).
        extra_loss = tf.reduce_sum(tf.math.square(self._nu[-1, :]))
        extra_loss2 = tf.reduce_sum(tf.math.square(self._nu2[-1, :]))

        nu_grad = tape.gradient(loss + extra_loss, [self._nu])[0]
        nu_grad2 = tape.gradient(loss2 + extra_loss2, [self._nu2])[0]

    avg_loss = tf.reduce_sum(0.5 * (loss - loss2) /
                             tf.math.abs(self._algae_alpha),
                             axis=0)
    # The Jacobian of the gradient has one [dim, dim] Hessian block per
    # limit on its "diagonal" over the limit axes; only those are kept.
    nu_jacob = tape.jacobian(nu_grad, [self._nu])[0]
    nu_hess = tf.stack(
        [nu_jacob[:, i, :, i] for i in range(self._num_limits)], axis=0)

    nu_jacob2 = tape.jacobian(nu_grad2, [self._nu2])[0]
    nu_hess2 = tf.stack(
        [nu_jacob2[:, i, :, i] for i in range(self._num_limits)], axis=0)

    for idx, div in enumerate(divergence):
        tf.summary.scalar('divergence%d' % idx, div)

    # Perform Newton step on nu.
    nu_transformed = tf.transpose(
        tf.squeeze(
            tf.linalg.solve(
                nu_hess + regularizer * tf.eye(self._dimension),
                tf.expand_dims(-tf.transpose(nu_grad), axis=-1))))
    self._nu = self._nu + self._nu_learning_rate * nu_transformed
    nu_transformed2 = tf.transpose(
        tf.squeeze(
            tf.linalg.solve(
                nu_hess2 + regularizer * tf.eye(self._dimension),
                tf.expand_dims(-tf.transpose(nu_grad2), axis=-1))))
    self._nu2 = self._nu2 + self._nu_learning_rate * nu_transformed2

    # Perform step on zeta based on fact that zeta* = (nu* - bellman nu*)/a.
    zetas = tf.matmul(my_td_mat,
                      tf.transpose(self._nu)[:, :, None]) - my_bias[:, :, None]
    zetas = tf.transpose(tf.squeeze(zetas, -1))
    zetas *= -self._algae_alpha_sign
    zetas /= tf.math.abs(self._algae_alpha)
    self._zeta = self._zeta + self._zeta_learning_rate * (zetas - self._zeta)

    zetas2 = tf.matmul(my_td_mat,
                       tf.transpose(self._nu2)[:, :, None]) - my_bias[:, :, None]
    zetas2 = tf.transpose(tf.squeeze(zetas2, -1))
    zetas2 *= 1 * self._algae_alpha_sign
    zetas2 /= tf.math.abs(self._algae_alpha)
    self._zeta2 = (self._zeta2 + self._zeta_learning_rate *
                   (zetas2 - self._zeta2))

    return [
        avg_saddle_loss * self._algae_alpha_sign,
        avg_loss * self._algae_alpha_sign, divergence
    ]
def prepare_dataset(self, dataset: dataset_lib.OffpolicyDataset,
                    target_policy: tf_policy.TFPolicy):
    """Performs pre-computations on dataset to make solving easier.

    All episodes are flattened into per-transition samples of the form
    (initial_env_step, env_step, next_env_step), restricted to valid,
    non-final steps. From those the tabular indices, the Bellman residual
    matrices and the reward vectors are built, `nu`/`nu2` are initialised
    from the bias vector, and the intermediate results are cached on `self`.

    Args:
      dataset: Dataset providing `get_all_episodes`.
      target_policy: Policy whose action distribution is evaluated on the
        initial, current and next steps.
    """
    episodes, valid_steps = dataset.get_all_episodes(
        limit=self._limit_episodes)
    steps_per_episode = tf.shape(valid_steps)[1] - 1
    num_episodes = tf.shape(valid_steps)[0]
    num_samples = num_episodes * steps_per_episode
    valid_and_not_last = tf.logical_and(valid_steps, episodes.discount > 0)
    valid_indices = tf.squeeze(
        tf.where(tf.reshape(valid_and_not_last[:, :-1], [-1])))

    def _flatten_valid(select_steps):
        # Applies `select_steps` to every field of `episodes`, flattens the
        # result to one row per sample and keeps only the valid rows.
        def _per_field(t):
            flat = tf.squeeze(
                tf.reshape(select_steps(t), [num_samples, -1]))
            return tf.gather(flat, valid_indices)

        return tf.nest.map_structure(_per_field, episodes)

    def _all_action_indices(observation):
        # Tabular index of every (state, action) pair, one row per state.
        states = tf.tile(tf.reshape(observation, [-1, 1]),
                         [1, self._num_actions])
        actions = tf.tile(
            tf.reshape(tf.range(self._num_actions), [1, -1]),
            [observation.shape[0], 1])
        return self._get_index(states, actions)

    # Flatten all tensors so that each data sample is a tuple of
    # (initial_env_step, env_step, next_env_step).
    initial_env_step = _flatten_valid(
        lambda t: tf.repeat(
            t[:, 0:1, ...], axis=1, repeats=steps_per_episode))
    env_step = _flatten_valid(lambda t: t[:, 0:steps_per_episode, ...])
    next_env_step = _flatten_valid(
        lambda t: t[:, 1:steps_per_episode + 1, ...])

    tfagents_initial_env_step = dataset_lib.convert_to_tfagents_timestep(
        initial_env_step)
    tfagents_env_step = dataset_lib.convert_to_tfagents_timestep(env_step)
    tfagents_next_env_step = dataset_lib.convert_to_tfagents_timestep(
        next_env_step)

    # Get target probabilities for initial and next steps.
    initial_target_probs = target_policy.distribution(
        tfagents_initial_env_step).action.probs_parameter()
    next_target_probs = target_policy.distribution(
        tfagents_next_env_step).action.probs_parameter()

    # Map states and actions to indices into tabular representation.
    initial_nu_indices = _all_action_indices(initial_env_step.observation)
    next_nu_indices = _all_action_indices(next_env_step.observation)
    # Absorbing next states are redirected to index -1.
    next_nu_indices = tf.where(
        tf.expand_dims(next_env_step.is_absorbing(), -1),
        -1 * tf.ones_like(next_nu_indices), next_nu_indices)

    nu_indices = self._get_index(env_step.observation, env_step.action)

    target_log_probabilities = target_policy.distribution(
        tfagents_env_step).action.log_prob(env_step.action)
    if self._solve_for_state_action_ratio:
        policy_ratio = tf.ones([
            target_log_probabilities.shape[0],
        ])
    else:
        policy_ratio = tf.exp(target_log_probabilities -
                              env_step.get_log_probability())
    policy_ratios = tf.tile(tf.reshape(policy_ratio, [-1, 1]),
                            [1, self._num_actions])

    # Bellman residual matrix of size [n_data, n_dim].
    expected_next = tf.reduce_sum(
        self._gamma *
        tf.expand_dims(next_target_probs * policy_ratios, axis=-1) *
        tf.one_hot(next_nu_indices, self._dimension),
        axis=1)
    a_vec = tf.one_hot(nu_indices, self._dimension) - expected_next

    state_action_count = self._get_state_action_counts(env_step)
    # Bellman residual matrix of size [n_dim, n_dim].
    td_mat = tf.einsum('ai, a, aj -> ij',
                       tf.one_hot(nu_indices, self._dimension),
                       1.0 / tf.cast(state_action_count, tf.float32), a_vec)

    # Reward vector of size [n_data].
    weighted_rewards = policy_ratio * self._reward_fn(env_step)

    # Reward vector of size [n_dim].
    bias = tf.reduce_sum(tf.one_hot(nu_indices, self._dimension) *
                         tf.reshape(weighted_rewards, [-1, 1]) * 1.0 /
                         tf.cast(state_action_count, tf.float32)[:, None],
                         axis=0)

    # Initialize: every column of nu / nu2 starts out as the bias vector.
    self._nu = np.ones_like(self._nu) * bias[:, None]
    self._nu2 = np.ones_like(self._nu2) * bias[:, None]

    # Cache everything the training step needs.
    self._a_vec = a_vec
    self._td_mat = td_mat
    self._bias = bias
    self._weighted_rewards = weighted_rewards
    self._state_action_count = state_action_count
    self._nu_indices = nu_indices
    self._initial_nu_indices = initial_nu_indices
    self._initial_target_probs = initial_target_probs
def interpolate(x_values, spline_data, dtype=None, name=None):
  """Evaluates a batch of cubic splines at the requested `x_values`.

  Outside of the domain spanned by `spline_data.x_data` the spline is
  extrapolated with a constant. That is, for `x > max(spline_data.x_data)`,
  `interpolate(x, spline_data) = spline_data.y_data[-1]`, and for
  `x < min(spline_data.x_data)`,
  `interpolate(x, spline_data) = spline_data.y_data[0]`.

  For the interpolation formula refer to p.548 of [1].

  ## References:
  [1]: R. Sedgewick, Algorithms in C, 1990, p. 545-550.
    Link: http://index-of.co.uk/Algorithms/Algorithms%20in%20C.pdf

  Args:
    x_values: A real `Tensor` of shape `batch_shape + [num_points]`.
    spline_data: An instance of `SplineParameters`. `spline_data.x_data` should
      have the same batch shape as `x_values`.
    dtype: Optional dtype for `x_values`.
      Default value: `None` which maps to the default dtype inferred by
      TensorFlow.
    name: Python `str` name prefixed to ops created by this function.
      Default value: `None` which is mapped to the default name
      `cubic_spline_interpolate`.

  Returns:
    A `Tensor` of the same shape and `dtype` as `x_values`. Represents
    the interpolated values.

  Raises:
    ValueError:
      If `x_values` batch shape is different from `spline_data.x_data` batch
      shape.
  """
  with tf.compat.v1.name_scope(
      name,
      default_name="cubic_spline_interpolate",
      values=[spline_data, x_values]):
    knots_x = spline_data.x_data
    knots_y = spline_data.y_data
    coeffs = spline_data.spline_coeffs
    x_values = tf.convert_to_tensor(x_values, dtype=dtype, name="x_values")

    # There must be exactly one spline per batch entry of `x_values`, hence the
    # static batch shapes (all dimensions but the last) have to coincide.
    x_batch_shape = x_values.shape.as_list()[:-1]
    spline_batch_shape = knots_x.shape.as_list()[:-1]
    if x_batch_shape != spline_batch_shape:
      msg = ("The input tensor has a different number of rows than the "
             "number of splines: {} != {}")
      raise ValueError(msg.format(x_batch_shape, spline_batch_shape))

    # Position of the knot directly to the left of each query point. The value
    # is -1 for query points lying to the left of the first knot.
    left_knot = tf.searchsorted(knots_x, x_values, side="right") - 1
    # Leading index columns required to address batched data with `gather_nd`.
    batch_prefix = _prepare_indices(left_knot)
    num_knots = knots_x.shape.as_list()[-1]

    # Selector of the interval start, clipped to the first knot.
    lower_idx = tf.maximum(left_knot, 0)
    lower_selector = tf.concat(
        [batch_prefix, tf.expand_dims(lower_idx, -1)], -1)
    # Selector of the interval end, clipped to the last knot.
    upper_idx = tf.minimum(left_knot + 1, num_knots - 1)
    upper_selector = tf.concat(
        [batch_prefix, tf.expand_dims(upper_idx, -1)], -1)

    # Conceptually:
    #   dx = x_data[indices + 1] - x_data[indices]
    #   dy = y_data[indices + 1] - y_data[indices]
    # The indices differ per row/spline, hence the use of `gather_nd`.
    x0 = tf.gather_nd(knots_x, lower_selector)
    x1 = tf.gather_nd(knots_x, upper_selector)
    dx = x1 - x0

    y0 = tf.gather_nd(knots_y, lower_selector)
    y1 = tf.gather_nd(knots_y, upper_selector)
    dy = y1 - y0

    coeffs0 = tf.gather_nd(coeffs, lower_selector)
    coeffs1 = tf.gather_nd(coeffs, upper_selector)

    # Relative position inside the interval. Wherever the interval is
    # degenerate (`dx` is not positive) the position is replaced with zero.
    t = (x_values - x0) / dx
    t = tf.where(dx > 0, t, tf.zeros_like(t))

    upper_term = (t + 1.0) * coeffs1 * 2.0
    lower_term = (t - 2.0) * coeffs0 * 2.0
    df = upper_term - lower_term
    df1 = df * t * (t - 1) / 6.0
    result = y0 + (t * dy) + (dx * dx * df1)

    # Constant extrapolation outside of the domain of the spline.
    upper_bound = (tf.expand_dims(tf.reduce_max(knots_x, -1), -1)
                   + tf.zeros_like(result))
    lower_bound = (tf.expand_dims(tf.reduce_min(knots_x, -1), -1)
                   + tf.zeros_like(result))
    inside_domain = tf.logical_and(x_values <= upper_bound,
                                   x_values >= lower_bound)
    return tf.where(inside_domain,
                    result,
                    tf.where(x_values > upper_bound, y0, y1))
def _sample_paths(self,
                  times,
                  num_samples,
                  random_type,
                  skip,
                  seed,
                  normal_draws=None,
                  times_grid=None,
                  validate_args=False):
  """Returns a sample of paths from the process.

  Args:
    times: Times at which the paths are requested. Passed to `_prepare_grid`
      together with `times_grid` to build the simulation grid.
    num_samples: Number of paths to draw.
    random_type: A `random.RandomType` value. `SOBOL` is rejected.
    skip: Forwarded to `utils.generate_mc_normal_draws`.
    seed: Seed forwarded to the random number generators.
    normal_draws: Optional precomputed normal draws, indexed by the time step
      along the first axis. When `None`, the draws are generated here.
    times_grid: Optional time grid forwarded to `_prepare_grid`.
    validate_args: Python `bool`. When `True` and `normal_draws` is supplied,
      asserts that its first dimension equals the number of time steps.

  Returns:
    A `Tensor` of shape `[num_samples, num_requested_times, dim]` with the
    simulated rate paths.

  Raises:
    ValueError: If `random_type` is `random.RandomType.SOBOL`.
  """
  # Note: all the notations below are the same as in [1].
  num_requested_times = tf.shape(times)[0]
  grid_params = [self._mean_reversion, self._volatility]
  if self._corr_matrix is not None:
    grid_params.append(self._corr_matrix)
  # Add zeros as a starting location
  times, keep_mask = _prepare_grid(times, times_grid, *grid_params)
  dt = times[1:] - times[:-1]
  # Prefer the static number of steps whenever it is available.
  steps_num = (dt.shape.as_list()[-1] if dt.shape.is_fully_defined()
               else tf.shape(dt)[-1])

  # TODO(b/148133811): Re-enable Sobol test when TF 2.2 is released.
  if random_type == random.RandomType.SOBOL:
    raise ValueError('Sobol sequence for Euler sampling is temporarily '
                     'unsupported when `time_step` or `times` have a '
                     'non-constant value')

  if normal_draws is not None:
    if validate_args:
      draws_times = tf.shape(normal_draws)[0]
      asserts = tf.assert_equal(
          draws_times,
          tf.shape(times)[0] - 1,  # We have added `0` to `times`
          message='`tf.shape(normal_draws)[1]` should be equal to the '
          'number of all `times` plus the number of all jumps of '
          'the piecewise constant parameters.')
      with tf.compat.v1.control_dependencies([asserts]):
        normal_draws = tf.identity(normal_draws)
  elif random_type in (random.RandomType.SOBOL,
                       random.RandomType.HALTON,
                       random.RandomType.HALTON_RANDOMIZED,
                       random.RandomType.STATELESS,
                       random.RandomType.STATELESS_ANTITHETIC):
    # In order to use low-discrepancy random_type we need to generate the
    # sequence of independent random normals upfront. We also precompute
    # random numbers for stateless random type in order to ensure independent
    # samples for multiple function calls with different seeds.
    # For every other random type `normal_draws` stays `None` and the draws
    # are generated step by step inside the loop body below.
    normal_draws = utils.generate_mc_normal_draws(
        num_normal_draws=self._dim,
        num_time_steps=steps_num,
        num_sample_paths=num_samples,
        random_type=random_type,
        seed=seed,
        dtype=self._dtype,
        skip=skip)

  # The below is OK because we support exact discretization with piecewise
  # constant mr and vol.
  mean_reversion = self._mean_reversion(times)
  volatility = self._volatility(times)
  corr_matrix_root = None
  if self._corr_matrix is not None:
    corr_matrix = _get_parameters(
        times + tf.math.reduce_min(dt) / 2, self._corr_matrix)[0]
    corr_matrix_root = tf.linalg.cholesky(corr_matrix)

  exp_x_t = self._conditional_mean_x(times, mean_reversion, volatility)
  var_x_t = self._conditional_variance_x(times, mean_reversion, volatility)
  if self._dim == 1:
    mean_reversion = tf.expand_dims(mean_reversion, axis=0)

  def cond_fn(i, *args):
    """Keeps iterating while there are time increments left."""
    del args  # Only the step counter decides on termination.
    return i < tf.size(dt)

  def body_fn(i, written_count, current_x, rate_paths):
    """Simulate hull-white process to the next time point."""
    if normal_draws is not None:
      normals = normal_draws[i]
    else:
      normals = random.mv_normal_sample(
          (num_samples,),
          mean=tf.zeros((self._dim,), dtype=mean_reversion.dtype),
          random_type=random_type,
          seed=seed)

    if corr_matrix_root is not None:
      normals = tf.linalg.matvec(corr_matrix_root[i], normals)

    vol_x_t = tf.math.sqrt(tf.nn.relu(tf.transpose(var_x_t)[i]))
    # If numerically `vol_x_t == 0`, the gradient of `vol_x_t` becomes `NaN`.
    # To prevent this, we explicitly set `vol_x_t` to zero tensor at zero
    # values so that the gradient is set to zero at this values.
    vol_x_t = tf.where(vol_x_t > 0.0, vol_x_t, 0.0)
    next_x = (
        tf.math.exp(-tf.transpose(mean_reversion)[i + 1] * dt[i]) * current_x
        + tf.transpose(exp_x_t)[i]
        + vol_x_t * normals)
    f_0_t = self._instant_forward_rate_fn(times[i + 1])

    # Update `rate_paths`
    keep_next = keep_mask[i + 1]
    rate_paths = utils.maybe_update_along_axis(
        tensor=rate_paths,
        do_update=keep_next,
        ind=written_count,
        axis=1,
        new_tensor=tf.expand_dims(next_x, axis=1) + f_0_t)
    written_count += tf.cast(keep_next, dtype=tf.int32)
    return (i + 1, written_count, next_x, rate_paths)

  rate_paths = tf.zeros((num_samples, num_requested_times, self._dim),
                        dtype=self._dtype)
  # Include initial state, if necessary
  f0_t = self._instant_forward_rate_fn(times[0])
  rate_paths = utils.maybe_update_along_axis(
      tensor=rate_paths,
      do_update=keep_mask[0],
      ind=0,
      axis=1,
      new_tensor=f0_t)
  written_count = tf.cast(keep_mask[0], dtype=tf.int32)
  initial_x = tf.zeros((num_samples, self._dim), dtype=self._dtype)
  # TODO(b/157232803): Use tf.cumsum instead?
  loop_vars = (0, written_count, initial_x, rate_paths)
  _, _, _, rate_paths = tf.while_loop(cond_fn, body_fn, loop_vars)
  return rate_paths
def sample_discount_curve_paths(self,
                                times,
                                curve_times,
                                num_samples=1,
                                random_type=None,
                                seed=None,
                                skip=0,
                                time_step=None,
                                times_grid=None,
                                normal_draws=None,
                                validate_args=False,
                                name=None):
  """Returns a sample of simulated discount curves for the Hull-white model.

  ### References:
    [1]: Leif B.G. Andersen and Vladimir V. Piterbarg. Interest Rate Modeling,
    Volume II: Term Structure Models. 2010.

  Args:
    times: Rank 1 `Tensor` of positive real values. The times at which the
      discount curves are to be evaluated.
    curve_times: Rank 1 `Tensor` of positive real values. The maturities
      at which discount curve is computed at each simulation time.
    num_samples: Positive scalar `int`. The number of paths to draw.
    random_type: Enum value of `RandomType`. The type of (quasi)-random
      number generator to use to generate the paths.
      Default value: None which maps to the standard pseudo-random numbers.
    seed: Seed for the random number generator. The seed is
      only relevant if `random_type` is one of
      `[STATELESS, PSEUDO, HALTON_RANDOMIZED, PSEUDO_ANTITHETIC,
        STATELESS_ANTITHETIC]`. For `PSEUDO`, `PSEUDO_ANTITHETIC` and
      `HALTON_RANDOMIZED` the seed should be a Python integer. For
      `STATELESS` and `STATELESS_ANTITHETIC` it must be supplied as an
      integer `Tensor` of shape `[2]`.
      Default value: `None` which means no seed is set.
    skip: `int32` 0-d `Tensor`. The number of initial points of the Sobol or
      Halton sequence to skip. Used only when `random_type` is 'SOBOL',
      'HALTON', or 'HALTON_RANDOMIZED', otherwise ignored.
      Default value: `0`.
    time_step: Scalar real `Tensor`. Maximal distance between time grid points
      in Euler scheme. Used only when Euler scheme is applied.
      Default value: `None`.
    times_grid: An optional rank 1 `Tensor` representing time discretization
      grid. If `times` are not on the grid, then the nearest points from the
      grid are used. When supplied, `time_step` and jumps of the piecewise
      constant arguments are ignored.
      Default value: `None`, which means that the times grid is computed using
      `time_step`. When exact sampling is used, the shape should be equal to
      `[num_time_points + 1]` where `num_time_points` is `tf.shape(times)[0]`
      plus the number of jumps of the Hull-White piecewise constant
      parameters. The grid should include the initial time point which is
      usually set to `0.0`.
    normal_draws: A `Tensor` of shape `[num_samples, num_time_points, dim]`
      and the same `dtype` as `times`. Represents random normal draws to
      compute increments `N(0, t_{n+1}) - N(0, t_n)`. When supplied,
      `num_samples` argument is ignored and the first dimensions of
      `normal_draws` is used instead. When exact sampling is used,
      `num_time_points` should be equal to `tf.shape(times)[0]` plus the
      number of jumps of the Hull-White piecewise constant parameters.
      Default value: `None` which means that the draws are generated by the
      algorithm.
    validate_args: Python `bool`. When `True` and `normal_draws` are supplied,
      checks that `tf.shape(normal_draws)[1]` is equal to the total number of
      time steps performed by the sampler.
      When `False` invalid dimension may silently render incorrect outputs.
      Default value: `False`.
    name: Str. The name to give this op.
      Default value: `None` which maps to the name of the model suffixed with
      `_sample_discount_curve_paths`.

  Returns:
    A tuple containing two `Tensor`s. The first element is a `Tensor` of
    shape [num_samples, m, k, dim] and contains the simulated bond curves
    where `m` is the size of `curve_times`, `k` is the size of `times` and
    `dim` is the dimension of the process. The second element is a `Tensor`
    of shape [num_samples, k, dim] and contains the simulated short rate
    paths.

  Raises:
    ValueError:
      (a) If `times` has rank different from `1`.
      (b) If Euler scheme is used but `times` is not supplied.
      (c) When neither `times_grid` nor `time_step` are supplied and Euler
        scheme is used.
      (d) If `normal_draws` is supplied and `dim` is mismatched.
    tf.errors.InvalidArgumentError: If `normal_draws` is supplied and the
      number of time steps implied by `times_grid` or `time_step` is
      mismatched.
  """
  name = name or self._name + '_sample_discount_curve_paths'
  with tf.name_scope(name):
    times = tf.convert_to_tensor(times, self._dtype, name='times')
    curve_times = tf.convert_to_tensor(
        curve_times, self._dtype, name='curve_times')
    if times_grid is not None:
      times_grid = tf.convert_to_tensor(
          times_grid, self._dtype, name='times_grid')

    mean_reversion = self._mean_reversion(times)
    volatility = self._volatility(times)
    y_t = self._compute_yt(times, mean_reversion, volatility)
    rate_paths = self.sample_paths(
        times=times,
        num_samples=num_samples,
        random_type=random_type,
        skip=skip,
        time_step=time_step,
        times_grid=times_grid,
        normal_draws=normal_draws,
        validate_args=validate_args,
        seed=seed)
    short_rate = tf.expand_dims(rate_paths, axis=1)

    # Reshape all `Tensor`s so that they have the dimensions same as (or
    # broadcastable to) the output shape
    # ([num_samples, num_curve_times, num_sim_times, dim]).
    num_curve_nodes = tf.shape(curve_times)[0]  # m
    num_sim_steps = tf.shape(times)[0]  # k
    sim_shape = (1, 1, num_sim_steps, self._dim)

    times_per_factor = tf.repeat(
        tf.expand_dims(times, axis=-1), self._dim, axis=-1)
    sim_times = tf.reshape(times_per_factor, sim_shape)

    curve_offsets = tf.reshape(curve_times, (1, num_curve_nodes, 1, 1))
    curve_offsets = tf.repeat(curve_offsets, self._dim, axis=-1)

    mean_reversion = tf.reshape(
        mean_reversion, (1, 1, self._dim, num_sim_steps))
    # Transpose so the `dim` is the trailing dimension.
    mean_reversion = tf.transpose(mean_reversion, [0, 1, 3, 2])

    # Calculate the variable `y(t)` (described in [1], section 10.1.6.1)
    # so that we have the full Markovian state to compute the P(t,T).
    y_t = tf.reshape(tf.transpose(y_t), sim_shape)

    bond_curves = self._bond_reconstitution(
        sim_times, sim_times + curve_offsets, mean_reversion, short_rate, y_t)
    return bond_curves, rate_paths
def _coord_grid_to_mesh_grid(coord_grid):
  """Converts a sequence of per-axis coordinates into a single mesh `Tensor`.

  Args:
    coord_grid: A sequence of coordinate `Tensor`s, one per axis.

  Returns:
    For a single-axis grid, the only element of `coord_grid` with a trailing
    singleton dimension appended. Otherwise the outputs of `tf.meshgrid` (with
    "ij" indexing) stacked along a new last dimension.
  """
  if len(coord_grid) != 1:
    mesh_axes = tf.meshgrid(*coord_grid, indexing="ij")
    return tf.stack(values=mesh_axes, axis=-1)
  # A one-dimensional grid needs no meshing, only the trailing axis.
  return tf.expand_dims(coord_grid[0], -1)
def testExplicitBlocks(self, dynamic_shape, batch_shape):
  """Checks `Blockwise` with explicit `block_sizes` against a manual split.

  The blockwise bijector is built out of `Exp`, `Softplus` and `Affine`
  bijectors acting on blocks of sizes 2, 1 and 3. Its forward/inverse values
  and log-det-Jacobians are compared with the results of applying the three
  bijectors to the corresponding slices by hand.

  Args:
    dynamic_shape: Python `bool`. When `True`, `block_sizes` and the input are
      fed through placeholders with unknown static shape.
    batch_shape: Python `list` of `int`s. The batch shape that is prepended to
      the event of size 6.
  """
  block_sizes = tf.convert_to_tensor(value=[2, 1, 3])
  block_sizes = tf1.placeholder_with_default(
      block_sizes, shape=None if dynamic_shape else block_sizes.shape)
  exp = tfb.Exp()
  sp = tfb.Softplus()
  aff = tfb.Affine(scale_diag=[2., 3., 4.])
  blockwise = tfb.Blockwise(bijectors=[exp, sp, aff], block_sizes=block_sizes)

  x = tf.cast([0.1, 0.2, 0.3, 0.4, 0.5, 0.6], dtype=tf.float32)
  # Every iteration prepends a new leading axis, so the batch shape has to be
  # traversed from its last dimension to its first one. Walking it in forward
  # order would build `x` with a reversed batch shape whenever the batch shape
  # has more than one dimension.
  for s in reversed(batch_shape):
    x = tf.expand_dims(x, 0)
    x = tf.tile(x, [s] + [1] * (tensorshape_util.rank(x.shape) - 1))
  x = tf1.placeholder_with_default(
      x, shape=None if dynamic_shape else x.shape)

  # Identity to break the caching.
  blockwise_y = tf.identity(blockwise.forward(x))
  blockwise_fldj = blockwise.forward_log_det_jacobian(x, event_ndims=1)
  blockwise_x = blockwise.inverse(blockwise_y)
  blockwise_ildj = blockwise.inverse_log_det_jacobian(
      blockwise_y, event_ndims=1)

  # Static shapes are only known when placeholders keep the shape.
  if not dynamic_shape:
    self.assertEqual(blockwise_y.shape, batch_shape + [6])
    self.assertEqual(blockwise_fldj.shape, batch_shape + [])
    self.assertEqual(blockwise_x.shape, batch_shape + [6])
    self.assertEqual(blockwise_ildj.shape, batch_shape + [])
  self.assertAllEqual(
      self.evaluate(tf.shape(input=blockwise_y)), batch_shape + [6])
  self.assertAllEqual(
      self.evaluate(tf.shape(input=blockwise_fldj)), batch_shape + [])
  self.assertAllEqual(
      self.evaluate(tf.shape(input=blockwise_x)), batch_shape + [6])
  self.assertAllEqual(
      self.evaluate(tf.shape(input=blockwise_ildj)), batch_shape + [])

  # Reference values: apply each bijector to its own slice of the event.
  expl_y = tf.concat([
      exp.forward(x[..., :2]),
      sp.forward(x[..., 2:3]),
      aff.forward(x[..., 3:]),
  ], axis=-1)
  expl_fldj = sum([
      exp.forward_log_det_jacobian(x[..., :2], event_ndims=1),
      sp.forward_log_det_jacobian(x[..., 2:3], event_ndims=1),
      aff.forward_log_det_jacobian(x[..., 3:], event_ndims=1)
  ])
  expl_x = tf.concat([
      exp.inverse(expl_y[..., :2]),
      sp.inverse(expl_y[..., 2:3]),
      aff.inverse(expl_y[..., 3:])
  ], axis=-1)
  expl_ildj = sum([
      exp.inverse_log_det_jacobian(expl_y[..., :2], event_ndims=1),
      sp.inverse_log_det_jacobian(expl_y[..., 2:3], event_ndims=1),
      aff.inverse_log_det_jacobian(expl_y[..., 3:], event_ndims=1)
  ])

  self.assertAllClose(self.evaluate(expl_y), self.evaluate(blockwise_y))
  self.assertAllClose(self.evaluate(expl_fldj),
                      self.evaluate(blockwise_fldj))
  self.assertAllClose(self.evaluate(expl_x), self.evaluate(blockwise_x))
  self.assertAllClose(self.evaluate(expl_ildj),
                      self.evaluate(blockwise_ildj))
def coroutine_model():
  """Generator yielding the distributions of the model one at a time.

  Every `yield` hands a distribution to the caller and receives back the value
  sent in for it. In order, the yielded distributions are:
  `LogNormal(0., 1.)`, `Exponential(1.)`, a `Sample` of size 20 of a `Normal`
  whose scale is the value received for the first distribution, and a
  `StudentT` parameterized by the values received for the second and the third
  distributions.
  """
  scale_prior = tfd.LogNormal(0., 1.)
  g = yield scale_prior

  df_prior = tfd.Exponential(1.)
  df = yield df_prior

  loc_prior = tfd.Sample(tfd.Normal(0, g), 20)
  loc = yield loc_prior

  # The trailing axis lets `df` broadcast against `loc`.
  df_column = tf.expand_dims(df, -1)
  yield tfd.StudentT(df_column, loc, 1)
def _expand_to_beam_size(tensor_BxU, beam_size):
  """Inserts a new axis at position 1 and repeats it `beam_size` times.

  Args:
    tensor_BxU: A `Tensor` whose first dimension is presumably the batch
      dimension `B` (as the name suggests).
    beam_size: The number of copies to create along the inserted axis.

  Returns:
    A `Tensor` with one more dimension than `tensor_BxU`. The size of its axis
    1 is `beam_size`; all other axes are left untouched.
  """
  with_beam_axis = tf.expand_dims(tensor_BxU, axis=1)
  # Tile only the freshly inserted axis.
  multiples = [
      beam_size if axis == 1 else 1
      for axis in range(with_beam_axis.shape.ndims)
  ]
  return tf.tile(with_beam_axis, multiples)
def _expand_dims(self, inputs, axis):
  """Inserts a dimension of size one at `axis` for dense or sparse inputs.

  Args:
    inputs: The tensor to expand. Inputs reported as sparse by
      `tf_utils.is_sparse` are handled by `tf.sparse.expand_dims`; everything
      else goes through `tf.expand_dims`.
    axis: The position at which the new dimension is inserted.

  Returns:
    `inputs` with the additional dimension.
  """
  expand_fn = (
      tf.sparse.expand_dims if tf_utils.is_sparse(inputs) else tf.expand_dims)
  return expand_fn(inputs, axis)
def vol_fn(t, x):
  """Returns the constant volatility `vol` with a leading axis of size one.

  Both arguments are ignored: the result does not depend on the time `t` nor
  on the state `x`. `vol` is taken from the enclosing scope.
  """
  del t, x  # Unused, the volatility is constant.
  constant_vol = tf.constant(vol, dtype=tf.float32)
  return tf.expand_dims(constant_vol, 0)
def apply(self, x1, x2, example_ndims=0):
  """Apply the kernel function pairs of inputs.

  Args:
    x1: `Tensor` input to the kernel, of shape `B1 + E1 + F`, where `B1` and
      `E1` may be empty (ie, no batch/example dims, resp.) and `F` (the
      feature shape) must have rank equal to the kernel's `feature_ndims`
      property. Batch shape must broadcast with the batch shape of `x2` and
      with the kernel's batch shape. Example shape must broadcast with example
      shape of `x2`. `x1` and `x2` must have the same *number* of example dims
      (ie, same rank).
    x2: `Tensor` input to the kernel, of shape `B2 + E2 + F`, where `B2` and
      `E2` may be empty (ie, no batch/example dims, resp.) and `F` (the
      feature shape) must have rank equal to the kernel's `feature_ndims`
      property. Batch shape must broadcast with the batch shape of `x1` and
      with the kernel's batch shape. Example shape must broadcast with example
      shape of `x1`. `x1` and `x2` must have the same *number* of example dims
      (ie, same rank).
    example_ndims: A python integer, the number of example dims in the inputs.
      In essence, this parameter controls how broadcasting of the kernel's
      batch shape with input batch shapes works. The kernel batch shape will
      be broadcast against everything to the left of the combined example and
      feature dimensions in the input shapes.
      Default value: `0`.

  Returns:
    `Tensor` containing the results of applying the kernel function to inputs
    `x1` and `x2`. If the kernel parameters' batch shape is `Bk` then the shape
    of the `Tensor` resulting from this method call is
    `broadcast(Bk, B1, B2) + broadcast(E1, E2)`.

  Given an index set `S`, a kernel function is mathematically defined as a
  real- or complex-valued function on `S` satisfying the
  positive semi-definiteness constraint:

  ```none
  sum_i sum_j (c[i]*) c[j] k(x[i], x[j]) >= 0
  ```

  for any finite collections `{x[1], ..., x[N]}` in `S` and
  `{c[1], ..., c[N]}` in the reals (or the complex plane). '*' is the complex
  conjugate, in the complex case.

  This method most closely resembles the function described in the
  mathematical definition of a kernel. Given a PositiveSemidefiniteKernel `k`
  with scalar parameters and inputs `x` and `y` in `S`, `apply(x, y)` yields a
  single scalar value.

  #### Examples

  ```python
  import tensorflow_probability as tfp

  # Suppose `SomeKernel` acts on vectors (rank-1 tensors)
  scalar_kernel = tfp.positive_semidefinite_kernels.SomeKernel(param=.5)
  scalar_kernel.batch_shape
  # ==> []

  # `x` and `y` are batches of five 3-D vectors:
  x = np.ones([5, 3], np.float32)
  y = np.ones([5, 3], np.float32)
  scalar_kernel.apply(x, y).shape
  # ==> [5]
  ```

  The above output is the result of vectorized computation of the five values

  ```none
  [k(x[0], y[0]), k(x[1], y[1]), ..., k(x[4], y[4])]
  ```

  Now we can consider a kernel with batched parameters:

  ```python
  batch_kernel = tfp.positive_semidefinite_kernels.SomeKernel(param=[.2, .5])
  batch_kernel.batch_shape
  # ==> [2]
  batch_kernel.apply(x, y).shape
  # ==> Error! [2] and [5] can't broadcast.
  ```

  The parameter batch shape of `[2]` and the input batch shape of `[5]` can't
  be broadcast together. We can fix this in either of two ways:

  1. Give the parameter a shape of `[2, 1]` which will correctly
  broadcast with `[5]` to yield `[2, 5]`:

  ```python
  batch_kernel = tfp.positive_semidefinite_kernels.SomeKernel(
      param=[[.2], [.5]])
  batch_kernel.batch_shape
  # ==> [2, 1]
  batch_kernel.apply(x, y).shape
  # ==> [2, 5]
  ```

  2. By specifying `example_ndims`, which tells the kernel to treat the `5`
  in the input shape as part of the "example shape", and "pushing" the
  kernel batch shape to the left:

  ```python
  batch_kernel = tfp.positive_semidefinite_kernels.SomeKernel(param=[.2, .5])
  batch_kernel.batch_shape
  # ==> [2]
  batch_kernel.apply(x, y, example_ndims=1).shape
  # ==> [2, 5]
  ```
  """
  with self._name_and_control_scope(self._name):
    x1 = tf.convert_to_tensor(x1, name='x1', dtype_hint=self.dtype)
    x2 = tf.convert_to_tensor(x2, name='x2', dtype_hint=self.dtype)

    if example_ndims != 0:
      return self._apply(x1, x2, example_ndims=example_ndims)

    # Without example dims a temporary singleton example dimension is put
    # right before the feature dimensions; `_apply` is then called with one
    # more example dim and the extra axis is dropped from its result.
    example_axis = -(self.feature_ndims + 1)
    x1 = tf.expand_dims(x1, example_axis)
    x2 = tf.expand_dims(x2, example_axis)
    expanded = self._apply(x1, x2, example_ndims=example_ndims + 1)
    return tf.squeeze(expanded, axis=-1)
def test_solving_backward_pde_for_sde_with_const_coeffs(self):
  """Solves the backward PDE of a 2d constant-coefficient SDE numerically."""
  # Integration test for converting 2d SDE with constant coeffs to a
  # backward Kolmogorov PDE and solving it.
  # The SDE is:
  #   dS_x = (dW_1 + dW_2) / sqrt(2)
  #   dS_y = (dW_1 + dW_2) / sqrt(2)
  # It is of course trivial, but we'll solve it the hard way for the sake of
  # testing.
  # The Kolmogorov backwards PDE is:
  #   u_{t} + D u_{xx} / 2 + D u_{yy} / 2 + D u_{xy} = 0
  # The equation can be rewritten as `u_{t} + D u_{zz} = 0`, where
  #   z = (x + y) / sqrt(2).
  # If the final condition is a gaussian centered at (0, 0) with variance
  # sigma, then the solution is:
  #   `u(x, y, t) = gaussian((x + y)/sqrt(2), sigma + 2D(t_final - t)) *
  #   gaussian((x - y)/sqrt(2), sigma)`.

  def vol_fn(t, grid):
    """Volatility matrix with all four elements equal to 1 / sqrt(2)."""
    del t
    xs = grid[..., 1]
    vol_elem = tf.ones_like(xs) / np.sqrt(2)  # all 4 elements are equal.
    first_slice = tf.stack((vol_elem, vol_elem), axis=-1)
    second_slice = tf.stack((vol_elem, vol_elem), axis=-1)
    return tf.stack((first_slice, second_slice), axis=-1)

  def drift_fn(t, grid):
    """Zero drift of the same shape as `grid`."""
    return tf.zeros(grid.shape)

  process = GenericItoProcess(
      dim=2, volatility_fn=vol_fn, drift_fn=drift_fn, dtype=tf.float32)

  grid = grids.uniform_grid(
      minimums=[-10, -20],
      maximums=[10, 20],
      sizes=[201, 301],
      dtype=tf.float32)
  ys = self.evaluate(grid[0])
  xs = self.evaluate(grid[1])

  diff_coeff = 1
  time_step = 0.1
  final_t = 3
  final_variance = 1
  variance_along_diagonal = final_variance + 2 * diff_coeff * final_t

  def expected_fn(x, y):
    """Analytic solution at time zero evaluated at the point `(x, y)`."""
    along_diagonal = _gaussian((x + y) / np.sqrt(2), variance_along_diagonal)
    across_diagonal = _gaussian((x - y) / np.sqrt(2), final_variance)
    return along_diagonal * across_diagonal

  expected = np.array([[expected_fn(x, y) for x in xs] for y in ys])

  # Final condition: product of gaussians in `y` and `x` with a leading axis
  # of size one added in front.
  final_condition = np.outer(
      _gaussian(ys, final_variance), _gaussian(xs, final_variance))
  final_values = tf.expand_dims(
      tf.constant(final_condition, dtype=tf.float32), axis=0)

  solver_output = process.fd_solver_backward(
      start_time=final_t,
      end_time=0,
      coord_grid=grid,
      values_grid=final_values,
      time_step=time_step,
      dtype=tf.float32)
  result = self.evaluate(solver_output[0])

  # The maximal deviation relative to the peak must stay below 1%.
  relative_error = np.max(np.abs(result - expected)) / np.max(expected)
  self.assertLess(relative_error, 0.01)
def add_batch_dim(nest):
  """Prepends an axis of size one to every element of a nested structure.

  Args:
    nest: A (possibly nested) structure of tensors, traversed with
      `tf.nest.map_structure`.

  Returns:
    A structure of the same layout as `nest` in which every element has been
    passed through `tf.expand_dims(element, 0)`.
  """

  def _prepend_axis(element):
    return tf.expand_dims(element, 0)

  return tf.nest.map_structure(_prepend_axis, nest)
def step_fn(inputs):
  """Per-Replica StepFn.

  Runs a single optimization step on one replica: computes the loss (negative
  log-likelihood plus L2 and annealed KL terms), applies the gradients and
  updates the training metrics. `model`, `optimizer`, `strategy`, `metrics`,
  `training_diversity` and `train_dataset_size` are taken from the enclosing
  scope.

  Args:
    inputs: A pair `(images, labels)`.
  """
  images, labels = inputs
  if FLAGS.version2 and FLAGS.ensemble_size > 1:
    # Every ensemble member receives its own copy of the batch.
    images = tf.tile(images, [FLAGS.ensemble_size, 1, 1, 1])
    if not (FLAGS.member_sampling or FLAGS.expected_probs):
      labels = tf.tile(labels, [FLAGS.ensemble_size])

  if FLAGS.num_train_samples > 1:
    images = tf.tile(images, [FLAGS.num_train_samples, 1, 1, 1])

  with tf.GradientTape() as tape:
    logits = model(images, training=True)
    probs = tf.nn.softmax(logits)

    # Diversity evaluation.
    if FLAGS.version2 and FLAGS.ensemble_size > 1:
      per_member_shape = tf.concat(
          [[FLAGS.ensemble_size, -1], probs.shape[1:]], 0)
      per_probs = tf.reshape(probs, per_member_shape)
      diversity_results = ed.metrics.average_pairwise_diversity(
          per_probs, FLAGS.ensemble_size)

    if FLAGS.num_train_samples > 1:
      # Average the predictions over the training samples.
      per_sample_shape = tf.concat(
          [[FLAGS.num_train_samples, -1], probs.shape[1:]], 0)
      probs = tf.reduce_mean(tf.reshape(probs, per_sample_shape), 0)

    if FLAGS.member_sampling and FLAGS.version2 and FLAGS.ensemble_size > 1:
      # Keep the predictions of one uniformly sampled ensemble member; the
      # selection is expressed as a product with a one-hot row vector.
      idx = tf.random.uniform([], maxval=FLAGS.ensemble_size, dtype=tf.int64)
      idx_one_hot = tf.expand_dims(
          tf.one_hot(idx, FLAGS.ensemble_size, dtype=probs.dtype), 0)
      probs_shape = probs.shape
      flat_probs = tf.reshape(probs, [FLAGS.ensemble_size, -1])
      selected_probs = tf.matmul(idx_one_hot, flat_probs)
      probs = tf.reshape(selected_probs,
                         tf.concat([[-1], probs_shape[1:]], 0))
    elif FLAGS.expected_probs and FLAGS.version2 and FLAGS.ensemble_size > 1:
      # Average the predictions over the ensemble members.
      per_member_shape = tf.concat(
          [[FLAGS.ensemble_size, -1], probs.shape[1:]], 0)
      probs = tf.reduce_mean(tf.reshape(probs, per_member_shape), 0)

    negative_log_likelihood = tf.reduce_mean(
        tf.keras.losses.sparse_categorical_crossentropy(labels, probs))

    # Apply l2 on the slow weights and bias terms. This excludes BN
    # parameters and fast weight approximate posterior/prior parameters,
    # but pay caution to their naming scheme.
    filtered_variables = [
        tf.reshape(var, (-1,))
        for var in model.trainable_variables
        if 'kernel' in var.name or 'bias' in var.name
    ]
    l2_loss = FLAGS.l2 * 2 * tf.nn.l2_loss(
        tf.concat(filtered_variables, axis=0))

    kl = sum(model.losses) / train_dataset_size
    # The KL weight grows linearly with the iteration and is capped at one.
    kl_scale = tf.cast(optimizer.iterations + 1, kl.dtype)
    kl_scale = kl_scale / FLAGS.kl_annealing_steps
    kl_scale = tf.minimum(1., kl_scale)
    kl_loss = kl_scale * kl

    # Scale the loss given the TPUStrategy will reduce sum all gradients.
    loss = negative_log_likelihood + l2_loss + kl_loss
    scaled_loss = loss / strategy.num_replicas_in_sync

  grads = tape.gradient(scaled_loss, model.trainable_variables)

  # Separate learning rate implementation.
  if FLAGS.fast_weight_lr_multiplier != 1.0:
    # Apply different learning rate on the fast weight approximate
    # posterior/prior parameters. This excludes BN and slow weights,
    # but pay caution to the naming scheme.
    grad_list = [
        (grad * FLAGS.fast_weight_lr_multiplier, var)
        if ('batch_norm' not in var.name and 'kernel' not in var.name)
        else (grad, var)
        for grad, var in zip(grads, model.trainable_variables)
    ]
    optimizer.apply_gradients(grad_list)
  else:
    optimizer.apply_gradients(zip(grads, model.trainable_variables))

  metrics['train/ece'].update_state(labels, probs)
  metrics['train/loss'].update_state(loss)
  metrics['train/negative_log_likelihood'].update_state(
      negative_log_likelihood)
  metrics['train/accuracy'].update_state(labels, probs)
  if FLAGS.version2 and FLAGS.ensemble_size > 1:
    for metric_name, metric_value in diversity_results.items():
      training_diversity['train/' + metric_name].update_state(metric_value)
def posterior_marginals(self, observations, name=None):
  """Compute marginal posterior distribution for each state.

  This function computes, for each time step, the marginal
  conditional probability that the hidden Markov model was in
  each possible state given the observations that were made
  at each time step.
  So if the hidden states are `z[0],...,z[num_steps - 1]` and
  the observations are `x[0], ..., x[num_steps - 1]`, then
  this function computes `P(z[i] | x[0], ..., x[num_steps - 1])`
  for all `i` from `0` to `num_steps - 1`.

  This operation is sometimes called smoothing. It uses a form
  of the forward-backward algorithm.

  Note: the behavior of this function is undefined if the
  `observations` argument represents impossible observations
  from the model.

  Args:
    observations: A tensor representing a batch of observations
      made on the hidden Markov model. The rightmost dimension of this tensor
      gives the steps in a sequence of observations from a single sample from
      the hidden Markov model. The size of this dimension should match the
      `num_steps` parameter of the hidden Markov model object. The other
      dimensions are the dimensions of the batch and these are broadcast with
      the hidden Markov model's parameters.
    name: Python `str` name prefixed to Ops created by this method.
      Default value: `None` which maps to "posterior_marginals".

  Returns:
    posterior_marginal: A `Categorical` distribution object representing the
      marginal probability of the hidden Markov model being in each state at
      each step. The rightmost dimension of the `Categorical` distributions
      batch will equal the `num_steps` parameter providing one marginal
      distribution for each step. The other dimensions are the dimensions
      corresponding to the batch of observations.

  Raises:
    ValueError: if rightmost dimension of `observations` does not
      have size `num_steps`.
  """
  with tf.name_scope(name or "posterior_marginals"):
    with tf.control_dependencies(self._runtime_assertions):
      observation_tensor_shape = tf.shape(input=observations)

      with self._observation_shape_preconditions(observation_tensor_shape):
        event_rank = self._underlying_event_rank
        # Split the shape into the batch part and the `[num_steps] + event`
        # part.
        observation_batch_shape = observation_tensor_shape[:-1 - event_rank]
        observation_event_shape = observation_tensor_shape[-1 - event_rank:]

        batch_shape = tf.broadcast_dynamic_shape(
            observation_batch_shape, self.batch_shape_tensor())
        log_init = tf.broadcast_to(
            self._log_init,
            tf.concat([batch_shape, [self._num_states]], axis=0))
        log_transition = self._log_trans

        observations = tf.broadcast_to(
            observations,
            tf.concat([batch_shape, observation_event_shape], axis=0))
        observation_rank = tf.rank(observations)
        # Bring the step dimension to the front so that `tf.scan` iterates
        # over it, then add an axis for the hidden states.
        observations = distribution_util.move_dimension(
            observations, observation_rank - event_rank - 1, 0)
        observations = tf.expand_dims(
            observations, observation_rank - event_rank)
        observation_log_probs = self._observation_distribution.log_prob(
            observations)

        log_adjoint_prob = tf.zeros_like(log_init)

        def forward_step(log_previous_step, log_prob_observation):
          """Advances the forward log probabilities by one step."""
          propagated = _log_vector_matrix(log_previous_step, log_transition)
          return propagated + log_prob_observation

        log_prob = log_init + observation_log_probs[0]

        forward_log_probs = tf.scan(
            forward_step,
            observation_log_probs[1:],
            initializer=log_prob,
            name="forward_log_probs")
        # `tf.scan` does not emit the initializer, so prepend it.
        forward_log_probs = tf.concat(
            [[log_prob], forward_log_probs], axis=0)

        def backward_step(log_previous_step, log_prob_observation):
          """Advances the backward log adjoint probabilities by one step."""
          return _log_matrix_vector(
              log_transition, log_prob_observation + log_previous_step)

        backward_log_adjoint_probs = tf.scan(
            backward_step,
            observation_log_probs[1:],
            initializer=log_adjoint_prob,
            reverse=True,
            name="backward_log_adjoint_probs")

        total_log_prob = tf.reduce_logsumexp(
            input_tensor=forward_log_probs[-1], axis=-1)

        # The reversed scan does not emit the initializer either; it belongs
        # to the last step.
        backward_log_adjoint_probs = tf.concat(
            [backward_log_adjoint_probs, [log_adjoint_prob]], axis=0)

        log_likelihoods = forward_log_probs + backward_log_adjoint_probs

        # Normalize and move the step dimension next to the state dimension.
        normalized_log_probs = log_likelihoods - total_log_prob[..., tf.newaxis]
        marginal_log_probs = distribution_util.move_dimension(
            normalized_log_probs, 0, -2)

        return categorical.Categorical(logits=marginal_log_probs)
def _batch_interp_with_gather_nd(x, x_ref_min, x_ref_max, y_ref, nd, fill_value,
                                 batch_dims):
  """Multilinear interpolation on a regular N-D grid with leading batch dims.

  Shape conventions used below, with `n = batch_dims`:
    x.shape         = [A1, ..., An, D, nd]
    x_ref_min.shape = [A1, ..., An, nd]
    x_ref_max.shape = [A1, ..., An, nd]
    y_ref.shape     = [A1, ..., An, C1, ..., Cnd, B1, ..., BM]
  `y_ref[a1, ..., an, i1, ..., ind]` is the shape `[B1, ..., BM]` table entry
  at grid index `[i1, ..., ind]`.

  Args:
    x: Points at which to interpolate.
    x_ref_min: Coordinates of the first grid point in each interpolation dim.
    x_ref_max: Coordinates of the last grid point in each interpolation dim.
    y_ref: Table of reference values on the regular grid.
    nd: Number of interpolation dimensions (used for slicing shapes and for
      enumerating the `2**nd` cell corners via `_binary_count`).
    fill_value: If this is a numeric tensor (per
      `tf.debugging.is_numeric_tensor`), it replaces the result wherever `x`
      falls outside the grid in any interpolation dim. Otherwise the result
      computed from the clipped fractional index is returned unchanged.
    batch_dims: Number of leading batch dimensions shared by `x` and `y_ref`.

  Returns:
    The interpolated values; one `[B1, ..., BM]` entry per point in `x`.
  """
  dtype = x.dtype
  y_ref_shape = tf.shape(y_ref)

  # Number of grid points along each interpolation dim, and the largest valid
  # (floating point) index along each of them.
  grid_sizes = tf.cast(y_ref_shape[batch_dims:batch_dims + nd], dtype)
  last_grid_idx = grid_sizes - 1

  # Affinely map [x_ref_min, x_ref_max] onto [0, grid_sizes - 1]; the result
  # is the fractional table index of every x, per interpolation dim.
  lower = tf.expand_dims(x_ref_min, axis=-2)
  upper = tf.expand_dims(x_ref_max, axis=-2)
  frac_idx_raw = last_grid_idx * (x - lower) / (upper - lower)

  # NaN cannot survive the cast to an integer index, so remember where it was
  # and substitute a harmless 0; NaN is put back into the weights later.
  is_nan = tf.math.is_nan(frac_idx_raw)
  frac_idx_raw = tf.where(is_nan, tf.cast(0., dtype=dtype), frac_idx_raw)

  # Shape [A1, ..., An, D, nd].
  frac_idx = tf.clip_by_value(
      frac_idx_raw, tf.zeros((), dtype=dtype), last_grid_idx)

  # Pick the two neighbouring grid indices. Using floor/ceil directly would
  # make both coincide whenever x lies exactly on a grid point, which zeroes
  # the gradient there. Deriving `above` from `below + 1` (and then `below`
  # from `above - 1` at the upper edge) keeps them distinct without changing
  # the interpolated value.
  floor_idx = tf.floor(frac_idx)
  upper_idx = tf.minimum(floor_idx + 1, last_grid_idx)
  lower_idx = tf.maximum(upper_idx - 1, 0)

  # Length-nd lists of int32 tensors, each shaped like x.shape[:-1].
  lower_idx_per_dim = tf.unstack(tf.cast(lower_idx, dtype=tf.int32), axis=-1)
  upper_idx_per_dim = tf.unstack(tf.cast(upper_idx, dtype=tf.int32), axis=-1)

  # One trailing singleton for every value dim B1, ..., BM of y_ref.
  trailing_ones = tf.ones_like(y_ref_shape[batch_dims + nd:])

  def _append_singletons(tensor):
    """Reshapes `tensor` to `tensor.shape + [1] * M` for broadcasting."""
    return tf.reshape(
        tensor, tf.concat([tf.shape(tensor), trailing_ones], axis=0))

  # `weight_above` is the distance from the lower index (the original `t`),
  # `weight_below` its complement (the original `s`). Both have shape
  # [A1, ..., An, D, nd] + [1] * M.
  weight_above = _append_singletons(frac_idx - lower_idx)
  weight_below = 1 - weight_above

  # Put NaN back wherever x was NaN. This happens after `weight_below` has
  # been computed, exactly as in the original statement order.
  nan_mask = _append_singletons(is_nan)
  weight_above = tf.where(
      nan_mask, tf.constant(np.nan, dtype), weight_above)

  # The `nd` axis of the weights is the last axis of x.
  nd_axis = tf.rank(x) - 1

  # x sits inside a cell with 2**nd corners. Each corner contributes its
  # y_ref value times the volume of the sub-cell diagonally opposite to it:
  # a product of `weight_below` over the dims where the corner is 'below'
  # and of `weight_above` over the dims where it is 'above'.
  corner_terms = []
  for corner in _binary_count(nd):
    takes_below = [bit == 0 for bit in corner]
    corner_idx = [
        lower_idx_per_dim[k] if below else upper_idx_per_dim[k]
        for k, below in enumerate(takes_below)
    ]
    dims_using_s = [k for k, below in enumerate(takes_below) if below]
    dims_using_t = [k for k, below in enumerate(takes_below) if not below]

    # A single gather per weight tensor instead of one gather per dim.
    t_product = tf.reduce_prod(
        tf.gather(weight_above,
                  indices=tf.cast(dims_using_t, dtype=tf.int32),
                  axis=nd_axis),
        axis=nd_axis)
    s_product = tf.reduce_prod(
        tf.gather(weight_below,
                  indices=tf.cast(dims_using_s, dtype=tf.int32),
                  axis=nd_axis),
        axis=nd_axis)
    opposite_volume = t_product * s_product

    y_ref_pt = tf.gather_nd(
        y_ref, tf.stack(corner_idx, axis=-1), batch_dims=batch_dims)
    corner_terms.append(y_ref_pt * opposite_volume)

  y = tf.math.add_n(corner_terms)

  if not tf.debugging.is_numeric_tensor(fill_value):
    return y

  # A point is out of bounds if its unclipped index leaves [0, size - 1] in
  # any of the nd dims; the flag is then broadcast up to the shape of y.
  out_of_bounds = tf.reduce_any(
      (frac_idx_raw < 0) | (frac_idx_raw > last_grid_idx), axis=-1)
  out_of_bounds = _append_singletons(out_of_bounds)
  out_of_bounds = out_of_bounds | tf.fill(tf.shape(y), False)
  return tf.where(out_of_bounds, fill_value, y)
def posterior_mode(self, observations, name=None):
  """Compute the most likely sequence of hidden states given observations.

  Given a sequence of observations `x[0], ..., x[num_steps - 1]`, this
  returns the sequence of hidden states `z[0], ..., z[num_steps - 1]` of the
  underlying Markov chain that is most likely to have produced them. The
  computation is the [Viterbi algorithm](
  https://en.wikipedia.org/wiki/Viterbi_algorithm).

  Note: the behavior of this function is undefined if the `observations`
  argument represents impossible observations from the model.

  Note: if the most likely sequence is not unique, one of the equally likely
  sequences is returned.

  Args:
    observations: A tensor representing a batch of observations made on the
      hidden Markov model. The rightmost dimensions are the event dimensions
      of the observation distribution. The next dimension from the right
      indexes the steps of a sequence and its size should match the
      `num_steps` parameter of the model. All remaining dimensions are batch
      dimensions and are broadcast against the model's parameters.
    name: Python `str` name prefixed to Ops created by this method.
      Default value: "posterior_mode".

  Returns:
    posterior_mode: A `Tensor` holding the most likely sequence of hidden
      states. Its rightmost dimension has size `num_steps` (one state per
      step); the other dimensions are those of the batch.

  Raises:
    ValueError: if the `observations` tensor does not consist of sequences
      of `num_steps` observations.

  #### Examples

  ```python
  tfd = tfp.distributions

  # A simple weather model.

  # Represent a cold day with 0 and a hot day with 1.
  # Suppose the first day of a sequence has a 0.8 chance of being cold.

  initial_distribution = tfd.Categorical(probs=[0.8, 0.2])

  # Suppose a cold day has a 30% chance of being followed by a hot day
  # and a hot day has a 20% chance of being followed by a cold day.

  transition_distribution = tfd.Categorical(probs=[[0.7, 0.3],
                                                   [0.2, 0.8]])

  # Suppose additionally that on each day the temperature is
  # normally distributed with mean and standard deviation 0 and 5 on
  # a cold day and mean and standard deviation 15 and 10 on a hot day.

  observation_distribution = tfd.Normal(loc=[0., 15.], scale=[5., 10.])

  # This gives the hidden Markov model:

  model = tfd.HiddenMarkovModel(
      initial_distribution=initial_distribution,
      transition_distribution=transition_distribution,
      observation_distribution=observation_distribution,
      num_steps=7)

  # Suppose we observe gradually rising temperatures over a week:
  temps = [-2., 0., 2., 4., 6., 8., 10.]

  # We can now compute the most probable sequence of hidden states:

  model.posterior_mode(temps)

  # The result is [0 0 0 0 0 1 1] telling us that the transition
  # from "cold" to "hot" most likely happened between the
  # 5th and 6th days.
  ```
  """
  with tf.name_scope(name or "posterior_mode"), \
      tf.control_dependencies(self._runtime_assertions):
    obs_shape = tf.shape(input=observations)

    with self._observation_shape_preconditions(obs_shape):
      event_rank = self._underlying_event_rank

      # The trailing `1 + event_rank` dims are the step dim followed by the
      # event dims; everything before them is batch.
      split_at = -1 - event_rank
      obs_batch_shape = obs_shape[:split_at]
      obs_steps_and_event_shape = obs_shape[split_at:]

      batch_shape = tf.broadcast_dynamic_shape(
          obs_batch_shape, self.batch_shape_tensor())
      log_init = tf.broadcast_to(
          self._log_init,
          tf.concat([batch_shape, [self._num_states]], axis=0))
      observations = tf.broadcast_to(
          observations,
          tf.concat([batch_shape, obs_steps_and_event_shape], axis=0))

      # Bring the step dimension to the front so it can be scanned over.
      state_axis = tf.rank(observations) - event_rank
      observations = distribution_util.move_dimension(
          observations, state_axis - 1, 0)

      # To score every observation under every hidden state, insert an
      # extra axis right before the event dims; it broadcasts against the
      # last batch dim of `observation_distribution`.
      observations = tf.expand_dims(observations, state_axis)
      obs_log_probs = self._observation_distribution.log_prob(observations)

      first_step_log_prob = log_init + obs_log_probs[0]

      if self._num_steps == 1:
        # Single step: no transitions to trace back through.
        return tf.argmax(
            input=first_step_log_prob, axis=-1)[..., tf.newaxis]

      def _viterbi_forward(carry, step_obs_log_prob):
        """Best log-prob into each state and the predecessor achieving it."""
        best_so_far, _ = carry
        candidates = (best_so_far[..., tf.newaxis] +
                      self._log_trans +
                      step_obs_log_prob[..., tf.newaxis, :])
        best_predecessor = tf.argmax(input=candidates, axis=-2)
        best_log_prob = tf.reduce_max(input_tensor=candidates, axis=-2)
        return (best_log_prob, best_predecessor)

      initial_carry = (
          first_step_log_prob,
          tf.zeros(tf.shape(input=log_init), dtype=tf.int64))
      best_log_probs, backpointers = tf.scan(
          _viterbi_forward,
          obs_log_probs[1:],
          initializer=initial_carry,
          name="forward_log_probs")

      final_state = tf.argmax(input=best_log_probs[-1], axis=-1)

      # Backtracking needs C[i...j] = A[i...j, B[i...j]] with
      #   A = backpointers for one step, B = the chosen successor state.
      # tf.gather requires indices of known shape, so instead the element
      # of A is selected by multiplying with one_hot(B) and summing.
      def _viterbi_backward(successor_state, step_backpointers):
        """Looks up the predecessor of `successor_state`."""
        selector = tf.one_hot(
            successor_state, self._num_states, dtype=tf.int64)
        return tf.reduce_sum(
            input_tensor=step_backpointers * selector, axis=-1)

      earlier_states = tf.scan(
          _viterbi_backward, backpointers, final_state, reverse=True)
      states_by_step = tf.concat([earlier_states, [final_state]], axis=0)

      # Move the step dimension back to the rightmost position.
      return distribution_util.move_dimension(states_by_step, 0, -1)
def state_y(self,
            t: types.RealTensor,
            name: str = None) -> types.RealTensor:
  """Computes the state variable `y(t)` for the Gaussian HJM Model.

  The value is computed in closed form, one piecewise-constant volatility
  segment at a time, as

  y_ij(t) = exp(-(k_i + k_j) * t) * (
            int_0^t rho_ij * sigma_i(u) * sigma_j(u)
                    * exp((k_i + k_j) * u) * du)

  where `k` is the mean reversion and `rho` the correlation matrix.

  Args:
    t: A rank 1 real `Tensor` of shape `[num_times]` specifying the time `t`.
    name: Python string. The name to give to the ops created by this function.
      Default value: `None` which maps to the default name `state_y`.

  Returns:
    A real `Tensor` of shape `[self._dim, self._dim, num_times]` containing
    the computed y_ij(t).
  """
  with tf.name_scope(name or 'state_y'):
    t = tf.convert_to_tensor(t, dtype=self._dtype)
    # Replicate the times for every factor: shape `[dim] + t.shape`.
    t = tf.broadcast_to(t, tf.concat([[self._dim], tf.shape(t)], axis=0))
    time_index = tf.searchsorted(self._jump_locations, t)

    # mr_sum[i, j, 0] = k(i) + k(j); the trailing axis broadcasts over times.
    mean_rev_col = tf.expand_dims(self._mean_reversion, axis=-1)
    mr_sum = tf.expand_dims(
        mean_rev_col + tf.transpose(mean_rev_col), axis=-1)

    def _integrate_volatility_squared(vol, l_limit, u_limit):
      """Integrates rho_ij * vol_i * vol_j * exp(mr_sum * u) over a segment."""
      vol_i = tf.expand_dims(vol, axis=-2)
      vol_j = tf.transpose(vol_i, perm=[1, 0, 2])
      # sigma2_ij = rho_ij * sigma_i * sigma_j
      vol_squared = tf.expand_dims(self._rho, axis=-1) * (vol_i * vol_j)
      exp_difference = (
          tf.math.exp(mr_sum * u_limit) - tf.math.exp(mr_sum * l_limit))
      return vol_squared / mr_sum * exp_difference

    def _no_vol_knots():
      # Constant volatility: there are no completed segments to accumulate.
      return tf.zeros(shape=(self._dim, self._dim, 0), dtype=self._dtype)

    def _integrals_between_vol_knots():
      return _integrate_volatility_squared(
          self._jump_values_vol, self._padded_knots, self._jump_locations)

    is_constant_vol = tf.math.equal(
        tf.shape(self._jump_values_vol)[-1], 0)
    between_knots = tf.cond(
        is_constant_vol, _no_vol_knots, _integrals_between_vol_knots)

    # Running integral up to each volatility knot, with a leading zero for
    # times that precede the first knot.
    at_knots = tf.concat([
        tf.zeros((self._dim, self._dim, 1), dtype=self._dtype),
        utils.cumsum_using_matvec(between_knots),
    ], axis=-1)

    knots_with_zero = tf.concat(
        [self._zero_padding, self._jump_locations], axis=1)

    # Integral over the partial segment from the last knot before `t` to `t`.
    since_last_knot = _integrate_volatility_squared(
        self._volatility(t),
        tf.gather(knots_with_zero, time_index, batch_dims=1),
        t)
    # Integral over all complete segments before that knot.
    up_to_last_knot = tf.gather(at_knots, time_index, batch_dims=-1)

    return tf.math.exp(-mr_sum * t) * (since_last_knot + up_to_last_knot)
def regression_loss(logits, labels, num_steps, steps, seq_lens, loss_type,
                    normalize_indices, variance_lambda, huber_delta):
  """Loss function based on regressing to the correct indices.

  In the paper, this is called Cycle-back Regression. There are 3 variants
  of this loss:
  i) regression_mse: MSE of the predicted indices and ground truth indices.
  ii) regression_mse_var: MSE of the predicted indices that takes into account
  the variance of the similarities. This is important when the rate at which
  sequences go through different phases changes a lot. The variance scaling
  allows dynamic weighting of the MSE loss based on the similarities.
  iii) regression_huber: Huber loss between the predicted indices and ground
  truth indices.

  Args:
    logits: Tensor, Pre-softmax similarity scores after cycling back to the
      starting sequence.
    labels: Tensor, One hot labels containing the ground truth. The index where
      the cycle started is 1.
    num_steps: Integer, Number of steps in the sequence embeddings.
    steps: Tensor, step indices/frame indices of the embeddings of the shape
      [N, T] where N is the batch size, T is the number of the timesteps.
    seq_lens: Tensor, Lengths of the sequences from which the sampling was
      done. This can provide additional temporal information to the alignment
      loss.
    loss_type: String, This specifies the kind of regression loss function.
      Currently supported loss functions: regression_mse, regression_mse_var,
      regression_huber.
    normalize_indices: Boolean, If True, normalizes indices by sequence
      lengths. Useful for ensuring numerical instabilities don't arise as
      sequence indices can be large numbers.
    variance_lambda: Float, Weight of the variance of the similarity
      predictions while cycling back. If this is high then the low variance
      similarities are preferred by the loss while making this term low
      results in high variance of the similarities (more uniform/random
      matching).
    huber_delta: float, Huber delta described in tf.keras.losses.huber_loss.

  Returns:
    loss: Tensor, A scalar loss calculated using a variant of regression.

  Raises:
    ValueError: If `loss_type` is not one of the supported loss names. The
      check runs before any op is created.
  """
  # Fail fast on a bad configuration instead of building ops first.
  supported_losses = (
      'regression_mse', 'regression_mse_var', 'regression_huber')
  if loss_type not in supported_losses:
    raise ValueError(
        'Unsupported regression loss %s. Supported losses are: '
        'regression_mse, regression_mse_var and regression_huber.'
        % (loss_type,))

  # Just to be safe, we stop gradients from labels as we are generating labels.
  labels = tf.stop_gradient(labels)
  steps = tf.cast(tf.stop_gradient(steps), tf.float32)

  if normalize_indices:
    # Express every step index as a fraction of its sequence length.
    float_seq_lens = tf.cast(seq_lens, tf.float32)
    tile_seq_lens = tf.tile(
        tf.expand_dims(float_seq_lens, axis=1), [1, num_steps])
    steps = steps / tile_seq_lens

  # `beta` is the soft assignment over time steps; the true and predicted
  # times are the label-weighted and beta-weighted step indices.
  beta = tf.nn.softmax(logits)
  true_time = tf.reduce_sum(steps * labels, axis=1)
  pred_time = tf.reduce_sum(steps * beta, axis=1)

  if loss_type == 'regression_huber':
    return tf.reduce_mean(
        tf.keras.losses.huber_loss(
            y_true=true_time, y_pred=pred_time, delta=huber_delta))

  if loss_type == 'regression_mse':
    return tf.reduce_mean(
        tf.keras.losses.mean_squared_error(
            y_true=true_time, y_pred=pred_time))

  # Remaining case: 'regression_mse_var' (variance aware regression).
  pred_time_tiled = tf.tile(
      tf.expand_dims(pred_time, axis=1), [1, num_steps])
  pred_time_variance = tf.reduce_sum(
      tf.square(steps - pred_time_tiled) * beta, axis=1)

  # Using log of variance as it is numerically stabler.
  pred_time_log_var = tf.math.log(pred_time_variance)
  squared_error = tf.square(true_time - pred_time)
  return tf.reduce_mean(
      tf.math.exp(-pred_time_log_var) * squared_error
      + variance_lambda * pred_time_log_var)
def _add_right_batch_dim(obs, event_shape):
  """Inserts a size-1 axis into `obs` just before its event dimensions.

  The insertion position is `rank(obs) - rank_from_shape(event_shape)`, with
  both ranks obtained through the `prefer_static` helpers.

  Args:
    obs: Tensor of observations whose trailing dims are the event dims.
    event_shape: Shape of a single event.

  Returns:
    `obs` with one extra axis of size 1 at the computed position.
  """
  obs_rank = prefer_static.rank_from_shape(prefer_static.shape(obs))
  event_rank = prefer_static.rank_from_shape(event_shape)
  insert_at = obs_rank - event_rank
  return tf.expand_dims(obs, insert_at)
def _piecewise_fn(t, x):
  """Volatility function of the GBM with piecewise constant volatility.

  Evaluates `self._sigma` at `t` and multiplies it by `x` with a trailing
  size-1 axis appended.

  NOTE(review): `self` is not a parameter here; it is presumably captured
  from the enclosing method's scope. Confirm against the surrounding code.
  """
  sigma_at_t = self._sigma(t)
  x_expanded = tf.expand_dims(x, -1)
  return sigma_at_t * x_expanded