def _find_best_span(args):
  """Compute the best span."""
  current_start_scores, current_end_scores = args  # [seq_len], [seq_len]
  start_max, start_backpointers, _, _ = tf.scan(
      fn=_cumulative_max,
      elems=current_start_scores,
      initializer=(float("-inf"), -1, 0, 1),
      back_prop=False,
      reverse=False)
  end_max, end_backpointers, _, _ = tf.scan(
      fn=_cumulative_max,
      elems=current_end_scores,
      initializer=(float("-inf"), -1, seq_len - 1, -1),
      back_prop=False,
      reverse=True)
  # [seq_len]
  total_max = start_max + end_max
  best_index = tf.argmax(total_max, -1)
  best_start = start_backpointers[best_index]
  best_end = end_backpointers[best_index]
  best_score = total_max[best_index]
  return best_start, best_end, best_score
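# A NumPy reference sketch of the cumulative-max span search above, assuming
# `_cumulative_max` (not shown in this file) implements a running maximum
# with backpointers: the best span maximizes start_score[i] + end_score[j]
# subject to i <= j, found by combining a forward running max over start
# scores with a backward running max over end scores. Illustrative only.
import numpy as np

def find_best_span_reference(start_scores, end_scores):
  seq_len = len(start_scores)
  start_max = np.empty(seq_len)
  start_bp = np.empty(seq_len, dtype=int)
  best, arg = float("-inf"), -1
  for k in range(seq_len):  # Forward running max over start scores.
    if start_scores[k] > best:
      best, arg = start_scores[k], k
    start_max[k], start_bp[k] = best, arg
  end_max = np.empty(seq_len)
  end_bp = np.empty(seq_len, dtype=int)
  best, arg = float("-inf"), -1
  for k in reversed(range(seq_len)):  # Backward running max over end scores.
    if end_scores[k] > best:
      best, arg = end_scores[k], k
    end_max[k], end_bp[k] = best, arg
  total = start_max + end_max  # Best split point k with start <= k <= end.
  k = int(np.argmax(total))
  return start_bp[k], end_bp[k], total[k]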
def reverse_part(inputs, hparams, n_bits):
  """Reverse part of Benes block.

  Repeatedly applies interleaved Residual Switch layers and Reverse Shuffle
  layers. One set of weights is used for all Switch layers.

  Args:
    inputs: inputs for the reverse part. Should be outputs from forward part.
    hparams: params of the network.
    n_bits: count of repeated layer applications.

  Returns:
    tf.Tensor: output of the reverse part.
  """
  reverse_rsu = RSU("reverse_switch", hparams.dropout, hparams.mode)

  def reverse_step(state, _):
    with tf.variable_scope("reverse"):
      new_state = reverse_rsu(state)
      return reverse_shuffle_layer(new_state)

  reverse_outputs = tf.scan(
      reverse_step,
      tf.range(n_bits, n_bits * 2),
      initializer=inputs,
      parallel_iterations=1,
      swap_memory=True)
  return reverse_outputs[-1, :, :, :]
def forward_pass_states(self, processed_input, initial_hidden):
  all_hidden_states = tf.scan(
      self.forward_pass_gru,
      processed_input,
      initializer=initial_hidden,
      name='states')
  return all_hidden_states
def lstm(self, inps):
  weight, bias = self.recurrent_weight, self.recurrent_bias
  init_state = tf.zeros(
      shape=[2, inps.shape[0], self.hidden_size], dtype=tf.float32)

  def step(hprev, x):
    st_1, ct_1 = tf.unstack(hprev)
    (rows, columns, values, row_indices, row_offsets,
     column_indices) = self.dynamic_gate(x)
    fc_gate = kernels.spmm(rows, columns, values, row_indices, row_offsets,
                           column_indices,
                           tf.transpose(tf.concat([x, st_1], -1)),
                           False, False)
    fc_gate = tf.transpose(fc_gate) + bias
    i, f, g, o = tf.split(fc_gate, 4, axis=1)
    i, f, g, o = tf.sigmoid(i), tf.sigmoid(f), tf.tanh(g), tf.sigmoid(o)
    ct = ct_1 * f + g * i
    st = tf.tanh(ct) * o
    return tf.stack([st, ct])

  states = tf.scan(step, tf.transpose(inps, [1, 0, 2]),
                   initializer=init_state)
  return tf.transpose(states, [1, 2, 0, 3])[0]
def diagonal_neural_gpu(inputs, hparams, name=None):
  """Improved Neural GPU as in https://arxiv.org/abs/1702.08727."""
  with tf.variable_scope(name, "diagonal_neural_gpu"):

    def step(state_tup, inp):
      """Single step of the improved Neural GPU."""
      state, _ = state_tup
      x = state
      for layer in range(hparams.num_hidden_layers):
        x, new_loss = common_layers.diagonal_conv_gru(
            x, (hparams.kernel_height, hparams.kernel_width),
            hparams.hidden_size,
            dropout=hparams.dropout,
            name="dcgru_%d" % layer)
      # Padding input is zeroed-out in the modality, we check this by summing.
      padding_inp = tf.less(tf.reduce_sum(tf.abs(inp), axis=[1, 2]), 0.00001)
      new_state = tf.where(padding_inp, state, x)  # No-op where inp is padding.
      return new_state, new_loss

    final_state, losses = tf.scan(
        step,
        tf.transpose(inputs, [1, 0, 2, 3]),
        initializer=(inputs, tf.constant(0.0)),
        parallel_iterations=1,
        swap_memory=True)
    return final_state[0, :, :, :, :], 2.0 * tf.reduce_mean(losses)
def forward_part(block_out, hparams, n_bits):
  """Forward part of Benes block.

  Repeatedly applies interleaved Residual Switch layers and Shuffle layers.
  One set of weights is used for all Switch layers.

  Args:
    block_out: TODO(authors) document.
    hparams: params of the network.
    n_bits: count of repeated layer applications.

  Returns:
    tf.Tensor: output of the forward part.
  """
  forward_rsu = RSU("switch", hparams.dropout, hparams.mode)

  def forward_step(state, _):
    with tf.variable_scope("forward"):
      new_state = forward_rsu(state)
      return shuffle_layer(new_state)

  forward_outputs = tf.scan(
      forward_step,
      tf.range(0, n_bits),
      initializer=block_out,
      parallel_iterations=1,
      swap_memory=True)
  return forward_outputs[-1, :, :, :]
def truncated_rtrl(self, dL_dy, **kwargs):
  """The original LSTM_97 real-time recurrent training algorithm.

  :param dL_dy: dL/dy with shape (num_steps(=1), batch_size, *y.shape)
  :return: (grads_and_vars, dS/dW)
  """
  # Step 0: Split new_c outside the scan loop
  S = self._new_state_tensor[1]
  state_size = S.shape[1]
  Ss = tf.split(S, num_or_size_splits=state_size, axis=1)
  # Step 1: Update dS/dW = (dS/dW1, ..., dS/dWm)
  mascot = tf.placeholder(tf.float32)
  dS_dW = []
  for dS_dWj_tau, Wj in zip(self.gradient_buffer_placeholder,
                            self.custom_var_list):
    dS_dWj = []
    split_dS_dWj_tau = tf.split(
        dS_dWj_tau, num_or_size_splits=state_size, axis=-1)
    for dSi_dWj_tau, Si in zip(split_dS_dWj_tau, Ss):
      dSi_dWj = []
      for b in range(hub.batch_size):
        grad = tf.gradients(Si[b], Wj)[0]
        dSi_dWj_n = dSi_dWj_tau[b] + tf.expand_dims(grad, -1)
        dSi_dWj.append(tf.expand_dims(dSi_dWj_n, 0))
      # Concatenate along batches
      dSi_dWj = tf.concat(dSi_dWj, axis=0)  # (B, *W, 1)
      dS_dWj.append(dSi_dWj)
    dS_dWj = tf.concat(dS_dWj, axis=-1)  # (B, *W, S)
    dS_dW.append(dS_dWj)
  dS_dW = tuple(dS_dW)
  # Step 2: Compute dL/dW as dL/dy * dy/dS * dS/dW
  #         = \sum_{n over batches} \sum_{k over states} dL/dy * dy/dS * dS/dW
  # (1) dL_dy.shape = (?(=1), B, D)
  # (2) dy_dS = (dy1/dS, ..., dyn/dS) in self._grad_tensors
  # TODO
  dL_dy = tf.reshape(dL_dy, shape=(-1, 1, dL_dy.shape[2]))  # (B, 1, D)
  dy_dS = self._grad_tensors  # (B, D, S)

  def calc_dL_dW(_, mass):
    dldy, dyds, dsdw = mass
    # Calculate dL/dS
    dlds = tf.matmul(dldy, dyds)  # (1, S)
    # Calculate dL/dW
    dL_dW = []
    for dsdwj in dsdw:  # (*wj.shape, S)
      dL_dW.append(tf.reduce_sum(tf.multiply(dsdwj, dlds), axis=-1))
    return tuple(dL_dW)

  dL_dS_batch = tf.scan(
      calc_dL_dW, (dL_dy, dy_dS, dS_dW),
      initializer=(mascot,) * len(self.custom_var_list))
  dL_dS = [tf.reduce_sum(t, axis=0) for t in dL_dS_batch]
  # Step 3: Return (((dW1, W1), ..., (dWn, Wn)), dS/dW)
  grads_and_vars = [(g, v) for g, v in zip(dL_dS, self.custom_var_list)]
  return tuple(grads_and_vars), dS_dW
def forward_pass_states(self, processed_input, initial_hidden):
  all_hidden_states = tf.scan(
      self.forward_pass_lstm,
      processed_input,
      initializer=initial_hidden,
      name='states')
  all_hidden_states = all_hidden_states[:, 0, :, :]
  return all_hidden_states
def get_first_occurrence_indices(reference, symbol, optimize_for_tpu=False):
  """For each row in reference, get the index after the first occurrence of symbol.

  If symbol is not present on a row, return reference.shape[1] instead.

  Args:
    reference: [B, T] tensor of elements of the same type as symbol.
    symbol: int or [] scalar tensor of the same dtype as reference.
    optimize_for_tpu: bool, whether to use a TPU-capable variant.

  Returns:
    A [B] tensor of tf.int32 where x[i] is such that
    reference[i, x[i] - 1] == symbol, and reference[i, j] != symbol
    for j < x[i] - 1. If symbol is not present on row i then x[i] = T.
  """
  if optimize_for_tpu:
    # Run code which can be compiled on TPU.
    # Transpose reference to [T, B].
    reference = tf.transpose(reference, [1, 0])
    range_tensor = tf.range(reference.shape.as_list()[0])
    indexes = tf.stack([range_tensor] * reference.shape.as_list()[1], 1)
    symbol = tf.stack([symbol] * reference.shape.as_list()[1], 0)
    initial_indices = tf.constant(
        reference.shape.as_list()[0],
        shape=[reference.shape.as_list()[1]],
        dtype=tf.int32)

    # We want a function which moves backwards.
    def fn(current_index, elems):
      ref, ind = elems
      return tf.where(tf.equal(ref, symbol), ind + 1, current_index)

    min_indexes = tf.scan(
        fn, (reference, indexes),
        initializer=initial_indices,
        parallel_iterations=1,
        reverse=True)
    return min_indexes[0]

  batch_size, max_length = reference.get_shape().as_list()
  symbol = tf.convert_to_tensor(symbol)
  symbol.shape.assert_is_compatible_with([])
  # Add symbol at the end of each row, to make sure tf.where works.
  tensor = tf.concat(
      [reference, tf.tile(symbol[None, None], [batch_size, 1])], axis=1)
  index_all_occurrences = tf.where(tf.equal(tensor, symbol))
  index_all_occurrences = tf.cast(index_all_occurrences, tf.int32)
  # `index_all_occurrences` is a [N, 2] tensor with coordinates of all
  # positions of `symbol` in `tensor`. So N will be >= batch size since there
  # can be several `symbol` in one row of tensor. We need to take only the
  # position of the first occurrence for each row. `segment_min` does that,
  # taking the lowest column index for each row index.
  index_first_occurrences = tf.segment_min(index_all_occurrences[:, 1],
                                           index_all_occurrences[:, 0])
  index_first_occurrences.set_shape([batch_size])
  index_first_occurrences = tf.minimum(index_first_occurrences + 1,
                                       max_length)
  return index_first_occurrences
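# A quick sanity check of the non-TPU branch, assuming a TF1 session: row 0
# sees symbol 3 first at column 1, so the result is 2 (the index after it);
# row 1 never contains 3, so the result is T = 4. Toy data, illustrative only.
import tensorflow as tf

reference = tf.constant([[1, 3, 3, 0],
                         [1, 0, 2, 2]])
indices = get_first_occurrence_indices(reference, 3)

with tf.Session() as sess:
  print(sess.run(indices))  # [2 4]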
def discounted_return(reward, length, discount):
  """Discounted Monte-Carlo returns."""
  timestep = tf.range(reward.shape[1].value)
  mask = tf.cast(timestep[None, :] < length[:, None], tf.float32)
  return_ = tf.reverse(
      tf.transpose(
          tf.scan(lambda agg, cur: cur + discount * agg,
                  tf.transpose(tf.reverse(mask * reward, [1]), [1, 0]),
                  tf.zeros_like(reward[:, -1]), 1, False), [1, 0]), [1])
  return tf.check_numerics(tf.stop_gradient(return_), 'return')
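# A minimal check of the recurrence G_t = r_t + discount * G_{t+1} computed
# by the reversed scan above, on a toy batch (TF1 session assumed); the
# rewards and lengths here are illustrative stand-ins.
import tensorflow as tf

rewards = tf.constant([[1.0, 1.0, 1.0, 0.0]])  # [batch=1, time=4]
lengths = tf.constant([3])                     # Only the first 3 steps count.
returns = discounted_return(rewards, lengths, discount=0.9)

with tf.Session() as sess:
  # Expected: [1 + 0.9 + 0.81, 1 + 0.9, 1, 0] = [2.71, 1.9, 1.0, 0.0]
  print(sess.run(returns))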
def gamma_scales_log_prob_fn(params):
  assert num_classes == 2

  def unmarshal(params):
    results = []
    n_dimensions_used = 0
    if regression_use_beta_scales:
      dim_list = [num_features, num_features, 1]
    else:
      dim_list = [num_features, 1]
    for n_to_add in dim_list:
      results.append(
          params[Ellipsis, n_dimensions_used:n_dimensions_used + n_to_add])
      n_dimensions_used += n_to_add
    return tuple(results)

  log_prob = 0.
  if regression_use_beta_scales:
    beta, beta_log_scales, overall_log_scale = unmarshal(params)
    # p(per-variable scales)
    log_prob += tf.reduce_sum(
        tfd.TransformedDistribution(
            tfd.Gamma(0.5, 0.5),
            tfb.Invert(tfb.Exp())).log_prob(beta_log_scales), -1)
  else:
    beta, overall_log_scale = unmarshal(params)
    beta_log_scales = 0.0
  # p(overall scale)
  log_prob += tf.reduce_sum(
      tfd.Normal(0., 10.).log_prob(overall_log_scale), -1)
  # p(beta)
  log_prob += tf.reduce_sum(tfd.Normal(0., 1.).log_prob(beta), -1)
  # p(y | x, beta)
  scaled_beta = beta * tf.exp(overall_log_scale) * tf.exp(beta_log_scales)
  if batch_size:

    def body(_, i):
      logits = tf.einsum("nd,md->mn", x[i:i + batch_size], scaled_beta)
      return tf.reduce_sum(
          tfd.Bernoulli(logits=logits).log_prob(y[i:i + batch_size]), -1)

    log_prob += tf.reduce_sum(
        tf.scan(
            body,
            tf.range(0, x.shape[0], batch_size),
            initializer=tf.zeros(tf.shape(params)[:1]),
            parallel_iterations=1), 0)
  else:
    logits = tf.einsum("nd,md->mn", x, scaled_beta)
    log_prob += tf.reduce_sum(tfd.Bernoulli(logits=logits).log_prob(y), -1)
  return log_prob
def discounted_rewards(reward, done, gae_gamma, end_values):
  """Discounted rewards."""
  not_done = tf.expand_dims(1 - tf.cast(done, tf.float32), axis=2)
  end_values = end_values * not_done[-1, :, :]
  return_ = tf.scan(
      lambda agg, cur: cur + gae_gamma * agg,
      tf.expand_dims(reward, axis=2) * not_done,
      initializer=end_values,
      reverse=True,
      back_prop=False,
      parallel_iterations=2)
  return tf.check_numerics(return_, "return")
def lambda_advantage(reward, value, length, discount):
  """Generalized Advantage Estimation."""
  timestep = tf.range(reward.shape[1].value)
  mask = tf.cast(timestep[None, :] < length[:, None], tf.float32)
  next_value = tf.concat([value[:, 1:], tf.zeros_like(value[:, -1:])], 1)
  delta = reward + discount * next_value - value
  advantage = tf.reverse(
      tf.transpose(
          tf.scan(lambda agg, cur: cur + discount * agg,
                  tf.transpose(tf.reverse(mask * delta, [1]), [1, 0]),
                  tf.zeros_like(delta[:, -1]), 1, False), [1, 0]), [1])
  return tf.check_numerics(tf.stop_gradient(advantage), 'advantage')
def lambda_return(reward, value, length, discount, lambda_):
  """TD-lambda returns."""
  timestep = tf.range(reward.shape[1].value)
  mask = tf.cast(timestep[None, :] < length[:, None], tf.float32)
  sequence = mask * reward + discount * value * (1 - lambda_)
  discount = mask * discount * lambda_
  sequence = tf.stack([sequence, discount], 2)
  return_ = tf.reverse(
      tf.transpose(
          tf.scan(lambda agg, cur: cur[0] + cur[1] * agg,
                  tf.transpose(tf.reverse(sequence, [1]), [1, 2, 0]),
                  tf.zeros_like(value[:, -1]), 1, False), [1, 0]), [1])
  return tf.check_numerics(tf.stop_gradient(return_), 'return')
def _build(self, **kwargs):
  # self.init_state should be called for the first time inside this method
  # so that it can be initialized within the appropriate graph

  # :: Define output
  # Make sure input has been defined
  if self.input_ is None:
    raise ValueError('!! input not found')
  assert isinstance(self.input_, Input)
  # Input placeholder has a shape of [batch_size, num_steps, *sample_shape]
  self.input_.set_group_shape((None, None))
  # Transpose input so as to fit the input of tf.scan
  input_placeholder = self.input_()
  # Build a shadow in order to foreknow the nested structure of `initializer`
  initializer = self._build_while_free()
  # Get elems to feed tf.scan
  elems = transpose_tensor(input_placeholder, [1, 0])
  if self.loss_in_loop:
    from tframe.models import Predictor
    assert isinstance(self, Predictor)
    targets_placeholder = self._targets.tensor
    elems = (elems, transpose_tensor(targets_placeholder, [1, 0]))
  # Send stuff into tf.scan and get results
  results = tf.scan(self, elems, initializer=initializer, name='Scan')
  scan_outputs, state_sequences = self._unwrap_outputs(results)
  # Activate state slot
  assert isinstance(self._state_slot, NestedTensorSlot)
  # Get last state and distribute to all recurrent children
  last_state = Recurrent._extract_tensors(state_sequences, lambda t: t[-1])
  self._new_state_tensor = last_state
  self._distribute_last_tensors()
  # Plug last state to corresponding slot
  self._state_slot.plug(last_state)
  self._update_group.add(self._state_slot)
  # TODO: BETA
  if hub.use_rtrl:
    self._update_group.add(self.grad_buffer_slot)
  if hub.test_grad:
    self._update_group.add(self.grad_delta_slot)
  # Transpose scan outputs to get final outputs
  outputs = transpose_tensor(scan_outputs, [1, 0])
  # Output has a shape of [batch_size, num_steps, *output_shape]
  self.outputs.plug(outputs)
def ComputeChainStats(chain, target_mean, num_leapfrog_steps):
  # Chain is [num_steps, batch, num_dims]
  num_steps = tf.shape(chain)[0]
  counts = tf.to_float(tf.range(1, num_steps + 1))
  chain_mean = tf.cumsum(chain, 0) / counts[:, tf.newaxis, tf.newaxis]
  bias = target_mean - tf.reduce_mean(chain_mean, 1)
  variance = tf.reduce_mean(
      tf.square(chain_mean - tf.reduce_mean(chain_mean, 1, keep_dims=True)),
      1)
  inst_bias = target_mean - tf.reduce_mean(chain, 1)
  inst_variance = tf.reduce_mean(tf.square(target_mean - chain), 1)

  def reducer(_, idx):
    chain_mean = tf.reduce_mean(chain[idx // 2:idx], 0)
    bias = tf.reduce_mean(target_mean - chain_mean, 0)
    variance = tf.reduce_mean(
        tf.square(chain_mean - tf.reduce_mean(chain_mean, 0)), 0)
    return bias, variance

  indices = 1 + tf.range(num_steps)
  warmupped_bias, warmupped_variance = tf.scan(
      reducer, indices, initializer=(chain[0, 0], chain[0, 0]))

  half_steps = num_steps // 2
  half_chain = chain[half_steps:]
  error_sq = tf.reduce_mean(
      tf.square(tf.reduce_mean(half_chain, 0) - target_mean), 0)
  ess = utils.EffectiveSampleSize(half_chain) / tf.to_float(half_steps)
  ess_per_grad = ess / tf.to_float(num_leapfrog_steps)
  rhat = tfp.mcmc.potential_scale_reduction(half_chain)
  autocorr = tf.reduce_mean(
      utils.SanitizedAutoCorrelation(half_chain, 0, max_lags=300), 1)

  return ChainStats(
      bias=bias,
      variance=variance,
      error_sq=error_sq,
      inst_bias=inst_bias,
      inst_variance=inst_variance,
      ess=ess,
      ess_per_grad=ess_per_grad,
      rhat=rhat,
      warmupped_bias=warmupped_bias,
      warmupped_variance=warmupped_variance,
      autocorr=autocorr)
def _perform_update_steps(self, observ, action, old_mean, old_logstd, reward,
                          length):
  """Perform multiple update steps of value function and policy.

  The advantage is computed once at the beginning and shared across
  iterations. We need to decide for the summary of one iteration, and thus
  choose the one after half of the iterations.

  Args:
    observ: Sequences of observations.
    action: Sequences of actions.
    old_mean: Sequences of action means of the behavioral policy.
    old_logstd: Sequences of action log stddevs of the behavioral policy.
    reward: Sequences of rewards.
    length: Batch of sequence lengths.

  Returns:
    Summary tensor.
  """
  return_ = utility.discounted_return(reward, length, self._config.discount)
  value = self._network(observ, length).value
  if self._config.gae_lambda:
    advantage = utility.lambda_return(reward, value, length,
                                      self._config.discount,
                                      self._config.gae_lambda)
  else:
    advantage = return_ - value
  mean, variance = tf.nn.moments(advantage, axes=[0, 1], keep_dims=True)
  advantage = (advantage - mean) / (tf.sqrt(variance) + 1e-8)
  advantage = tf.Print(advantage,
                       [tf.reduce_mean(return_), tf.reduce_mean(value)],
                       'return and value: ')
  advantage = tf.Print(advantage, [tf.reduce_mean(advantage)],
                       'normalized advantage: ')
  # pylint: disable=g-long-lambda
  value_loss, policy_loss, summary = tf.scan(
      lambda _1, _2: self._update_step(observ, action, old_mean, old_logstd,
                                       reward, advantage, length),
      tf.range(self._config.update_epochs), [0., 0., ''],
      parallel_iterations=1)
  print_losses = tf.group(
      tf.Print(0, [tf.reduce_mean(value_loss)], 'value loss: '),
      tf.Print(0, [tf.reduce_mean(policy_loss)], 'policy loss: '))
  with tf.control_dependencies([value_loss, policy_loss, print_losses]):
    return summary[self._config.update_epochs // 2]
def horseshoe_log_prob_fn(params):
  assert num_classes == 2
  (z, r1_local, r2_local, r1_global, r2_global) = tf.split(
      params, [num_features, num_features, num_features, 1, 1], axis=-1)

  def indep(d):
    return tfd.Independent(d, 1)

  zero = tf.zeros(num_features)
  one = tf.ones(num_features)
  half = 0.5 * one
  p_z = indep(tfd.Normal(zero, one))
  p_r1_local = indep(tfd.HalfNormal(one))
  p_r2_local = indep(tfd.InverseGamma(half, half))
  p_r1_global = indep(tfd.HalfNormal([1.]))
  p_r2_global = indep(tfd.InverseGamma([0.5], [0.5]))
  log_prob = (
      p_z.log_prob(z) + p_r1_local.log_prob(r1_local) +
      p_r2_local.log_prob(r2_local) + p_r1_global.log_prob(r1_global) +
      p_r2_global.log_prob(r2_global))
  lambda_ = r1_local * tf.sqrt(r2_local)
  tau = r1_global * tf.sqrt(r2_global)
  beta = z * lambda_ * tau
  if batch_size:

    def body(_, i):
      logits = tf.einsum("nd,md->mn", x[i:i + batch_size], beta)
      return tfd.Independent(tfd.Bernoulli(logits=logits),
                             1).log_prob(y[i:i + batch_size])

    log_prob += tf.reduce_sum(
        tf.scan(
            body,
            tf.range(0, x.shape[0], batch_size),
            initializer=tf.zeros(tf.shape(params)[:1]),
            parallel_iterations=1), 0)
  else:
    logits = tf.einsum("nd,md->mn", x, beta)
    log_prob += tfd.Independent(tfd.Bernoulli(logits=logits), 1).log_prob(y)
  return log_prob
def integrate(self, func, y0, t):
  time_delta_grid = t[1:] - t[:-1]

  def scan_func(y, t_dt):
    # Recall the necessary variables
    n_ = self.n_
    F_b = self.F_b
    t, dt = t_dt
    # Differential update
    dy = self._step_func(func, t, dt, y)
    dy = tf.cast(dy, dtype=y.dtype)  # Failsafe
    out = y + dy  # The result after the differential update
    # Conditional to use the specialized integrator vs the normal
    # integrator (n_ == 0)
    if n_ > 0:
      # Extract the last n variables for fire times
      fire_t = y[-n_:]
      # Change in firing time is zero if the neuron didn't fire
      l = tf.zeros(tf.shape(fire_t), dtype=fire_t.dtype)
      # If it fired, the change is current time - last fire time
      l_ = t - fire_t
      # Check if voltage was below the firing threshold before the update
      z = tf.less(y[:n_], F_b)
      # Check if voltage is above the firing threshold after the update
      z_ = tf.greater_equal(out[:n_], F_b)
      # tf.where(cond, a, b) chooses elements from a/b based on condition
      df = tf.where(tf.logical_and(z, z_), l_, l)
      fire_t_ = fire_t + df  # Update firing times
      return tf.concat([out[:-n_], fire_t_], 0)
    else:
      return out

  y = tf.scan(scan_func, (t[:-1], time_delta_grid), y0)
  return tf.concat([[y0], y], axis=0)
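# A standalone sketch of the same scan-over-time pattern for the n_ == 0
# case, integrating dy/dt = -y with an explicit Euler step (TF1 session
# assumed); `euler_step` is an illustrative stand-in for _step_func.
import numpy as np
import tensorflow as tf

t = tf.constant(np.linspace(0.0, 2.0, 201), dtype=tf.float32)
y0 = tf.constant([1.0])
time_delta_grid = t[1:] - t[:-1]

def euler_step(y, t_dt):
  _, dt = t_dt
  return y + dt * (-y)  # dy/dt = -y

y = tf.scan(euler_step, (t[:-1], time_delta_grid), initializer=y0)
solution = tf.concat([[y0], y], axis=0)  # Prepend the initial condition.

with tf.Session() as sess:
  print(sess.run(solution[-1]))  # ~ exp(-2) ≈ 0.135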
def get_ht_ctx(self, emb_y, target_hidden_state_0, annotations, a_m, y_m):
  res = tf.scan(
      self.one_time_step,
      elems=(emb_y, y_m),
      initializer=(target_hidden_state_0,
                   tf.zeros([tf.shape(annotations)[0], self.context_dim]),
                   tf.zeros([
                       tf.shape(annotations)[0],
                       tf.shape(annotations)[1],
                       tf.shape(annotations)[2]
                   ]),
                   tf.zeros([
                       tf.shape(annotations)[0],
                       tf.shape(annotations)[1],
                       tf.shape(annotations)[2]
                   ]), annotations, a_m))
  return res
def gamma_scales2_log_prob_fn(params):
  assert num_classes == 2
  (z, local_scale, global_scale) = tf.split(
      params, [num_features, num_features, 1], axis=-1)

  def indep(d):
    return tfd.Independent(d, 1)

  zero = tf.zeros(num_features)
  one = tf.ones(num_features)
  half = 0.5 * one
  p_z = indep(tfd.Normal(zero, one))
  p_local_scale = indep(tfd.Gamma(half, half))
  p_global_scale = indep(tfd.Gamma([0.5], [0.5]))
  log_prob = (
      p_z.log_prob(z) + p_local_scale.log_prob(local_scale) +
      p_global_scale.log_prob(global_scale))
  beta = z * local_scale * global_scale
  if batch_size:

    def body(_, i):
      logits = tf.einsum("nd,md->mn", x[i:i + batch_size], beta)
      return tfd.Independent(tfd.Bernoulli(logits=logits),
                             1).log_prob(y[i:i + batch_size])

    log_prob += tf.reduce_sum(
        tf.scan(
            body,
            tf.range(0, x.shape[0], batch_size),
            initializer=tf.zeros(tf.shape(params)[:1]),
            parallel_iterations=1), 0)
  else:
    logits = tf.einsum("nd,md->mn", x, beta)
    log_prob += tfd.Independent(tfd.Bernoulli(logits=logits), 1).log_prob(y)
  return log_prob
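# A self-contained sketch of the minibatched log-likelihood accumulation
# these log-prob functions share: scan over slice offsets, compute the
# per-slice likelihood, then sum. `x`, `y`, `beta`, and the sizes below are
# toy stand-ins for the variables the closures capture, not the real ones.
import numpy as np
import tensorflow as tf
import tensorflow_probability as tfp

tfd = tfp.distributions

num_data, num_features, num_chains, batch_size = 100, 5, 3, 25
x = tf.constant(np.random.randn(num_data, num_features), tf.float32)
y = tf.constant(np.random.randint(0, 2, num_data), tf.int32)
beta = tf.constant(np.random.randn(num_chains, num_features), tf.float32)

def body(_, i):
  # Log-likelihood of one slice of the data, summed over that slice.
  logits = tf.einsum("nd,md->mn", x[i:i + batch_size], beta)
  return tf.reduce_sum(
      tfd.Bernoulli(logits=logits).log_prob(y[i:i + batch_size]), -1)

# Scan over slice offsets, then sum the per-slice terms.
log_lik = tf.reduce_sum(
    tf.scan(body, tf.range(0, num_data, batch_size),
            initializer=tf.zeros([num_chains]), parallel_iterations=1), 0)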
def calculate_generalized_advantage_estimator(
    reward, value, done, gae_gamma, gae_lambda):
  # pylint: disable=g-doc-args
  """Generalized advantage estimator.

  Returns:
    GAE estimator. It will be one element shorter than the input; this is
    because to compute GAE for [0, ..., N-1] one needs V for [1, ..., N].
  """
  # pylint: enable=g-doc-args

  next_value = value[1:, :]
  next_not_done = 1 - tf.cast(done[1:, :], tf.float32)
  delta = (reward[:-1, :] + gae_gamma * next_value * next_not_done
           - value[:-1, :])

  return_ = tf.reverse(tf.scan(
      lambda agg, cur: cur[0] + cur[1] * gae_gamma * gae_lambda * agg,
      [tf.reverse(delta, [0]), tf.reverse(next_not_done, [0])],
      tf.zeros_like(delta[0, :]),
      parallel_iterations=1), [0])

  return tf.check_numerics(return_, "return")
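# A NumPy reference for the backward recurrence the scan above implements,
#   A_t = delta_t + gamma * lam * (1 - done_{t+1}) * A_{t+1},
# handy for cross-checking the TF version on small inputs. Illustrative only.
import numpy as np

def gae_reference(reward, value, done, gamma, lam):
  # reward, value, done: [T, B] arrays; returns [T-1, B].
  next_not_done = 1.0 - done[1:].astype(np.float32)
  delta = reward[:-1] + gamma * value[1:] * next_not_done - value[:-1]
  advantage = np.zeros_like(delta)
  agg = np.zeros_like(delta[0])
  for t in reversed(range(delta.shape[0])):
    agg = delta[t] + gamma * lam * next_not_done[t] * agg
    advantage[t] = agg
  return advantage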
def _update_value(self, observ, reward, length):
  """Perform multiple update steps of the value baseline.

  We need to decide for the summary of one iteration, and thus choose the
  one after half of the iterations.

  Args:
    observ: Sequences of observations.
    reward: Sequences of rewards.
    length: Batch of sequence lengths.

  Returns:
    Summary tensor.
  """
  with tf.name_scope('update_value'):
    loss, summary = tf.scan(
        lambda _1, _2: self._update_value_step(observ, reward, length),
        tf.range(self._config.update_epochs_value), [0., ''],
        parallel_iterations=1)
    print_loss = tf.Print(0, [tf.reduce_mean(loss)], 'value loss: ')
    with tf.control_dependencies([loss, print_loss]):
      return summary[self._config.update_epochs_value // 2]
def hmc(y, X, epsilon, L, start_q, alpha, n):
  config = tf.ConfigProto()
  if args.mode == 'cpu':
    config = tf.ConfigProto(device_count={'GPU': 0},
                            allow_soft_placement=True)
  with tf.Session(config=config) as sess:
    ty = tf.Variable(initial_value=y)
    tX = tf.Variable(initial_value=X)
    tU = lambda beta: U(ty, tX, beta, alpha)
    # tgrad_U = lambda beta: tf.gradients(ys=U(ty, tX, beta, alpha), xs=beta)[0]
    tgrad_U = lambda beta: grad_U(ty, tX, beta, alpha)
    z = tf.Variable(initial_value=np.zeros(n, dtype='float32'))
    cur_q = tf.Variable(initial_value=start_q)
    sess.run(tf.global_variables_initializer())

    def update(current_q, _):
      q = tf.identity(current_q)
      p = tf.random.normal(current_q.get_shape())
      current_p = tf.identity(p)
      p = p - 0.5 * epsilon * tgrad_U(q)
      for i in range(L):
        # position step
        q = q + epsilon * p
        # momentum step
        if i < L - 1:
          p = p - epsilon * tgrad_U(q)
      # negate for symmetry
      p = -(p - 0.5 * epsilon * tgrad_U(q))
      current_U = tU(current_q)
      current_K = 0.5 * (tf.transpose(current_p) @ current_p)
      proposed_U = tU(q)
      proposed_K = 0.5 * (tf.transpose(p) @ p)
      ratio = (current_U - proposed_U + current_K - proposed_K)[0][0]
      return tf.cond(tf.less(tf.log(tf.random.uniform(())), ratio),
                     lambda: q, lambda: current_q)

    sc = tf.squeeze(tf.scan(update, z, initializer=cur_q))
    return sess.run(sc)
def lstm(self, inps):
  weight, bias = self.recurrent_weight, self.recurrent_bias
  init_state = tf.zeros(
      shape=[2, inps.shape[0], self.hidden_size], dtype=tf.float32)

  def step(hprev, x):
    st_1, ct_1 = tf.unstack(hprev)
    fc_gate = tf.matmul(weight, tf.transpose(tf.concat([x, st_1], -1)))
    fc_gate = tf.transpose(fc_gate) + bias
    i, f, g, o = tf.split(fc_gate, 4, axis=1)
    i, f, g, o = tf.sigmoid(i), tf.sigmoid(f), tf.tanh(g), tf.sigmoid(o)
    ct = ct_1 * f + g * i
    st = tf.tanh(ct) * o
    return tf.stack([st, ct])

  states = tf.scan(step, tf.transpose(inps, [1, 0, 2]),
                   initializer=init_state)
  return tf.transpose(states, [1, 2, 0, 3])[0]
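# A standalone sketch of the same scan-based LSTM loop with hypothetical
# shapes (batch=2, time=3, input=4, hidden=5); the recurrent weight is
# [4 * hidden, input + hidden], so one matmul produces all four gates.
# Illustrative only, not the class's actual initialization.
import tensorflow as tf

batch, time, n_in, n_hid = 2, 3, 4, 5
inps = tf.random.normal([batch, time, n_in])
weight = tf.Variable(tf.random.normal([4 * n_hid, n_in + n_hid]))
bias = tf.Variable(tf.zeros([4 * n_hid]))
init_state = tf.zeros([2, batch, n_hid])

def step(hprev, x):
  st_1, ct_1 = tf.unstack(hprev)
  fc_gate = tf.transpose(
      tf.matmul(weight, tf.transpose(tf.concat([x, st_1], -1)))) + bias
  i, f, g, o = tf.split(fc_gate, 4, axis=1)
  ct = ct_1 * tf.sigmoid(f) + tf.tanh(g) * tf.sigmoid(i)
  st = tf.tanh(ct) * tf.sigmoid(o)
  return tf.stack([st, ct])

states = tf.scan(step, tf.transpose(inps, [1, 0, 2]), initializer=init_state)
hidden_seq = tf.transpose(states, [1, 2, 0, 3])[0]  # [batch, time, hidden]

with tf.Session() as sess:
  sess.run(tf.global_variables_initializer())
  print(sess.run(hidden_seq).shape)  # (2, 3, 5)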
def construct_model(self, images, actions, rewards):
  """Build convolutional lstm video predictor using CDNA, or DNA.

  Args:
    images: list of tensors of ground truth image sequences; there should
      be a 4D image ?xWxHxC for each timestep.
    actions: list of action tensors; each action should be in the shape
      ?x1xZ.
    rewards: list of reward tensors; each reward should be in the shape
      ?x1xZ.

  Returns:
    gen_images: predicted future image frames.
    gen_rewards: predicted future rewards.
    latent_mean: mean of approximated posterior.
    latent_std: std of approximated posterior.

  Raises:
    ValueError: if more than 1 mask specified for DNA model.
  """
  context_frames = self.hparams.video_num_input_frames
  buffer_size = self.hparams.reward_prediction_buffer_size
  if buffer_size == 0:
    buffer_size = context_frames
  if buffer_size > context_frames:
    raise ValueError("Buffer size is bigger than context frames %d %d." %
                     (buffer_size, context_frames))

  batch_size = common_layers.shape_list(images[0])[0]
  ss_func = self.get_scheduled_sample_func(batch_size)

  def process_single_frame(prev_outputs, inputs):
    """Process a single frame of the video."""
    cur_image, input_reward, action = inputs
    time_step, prev_image, prev_reward, frame_buf, lstm_states = prev_outputs

    # Sample from softmax (by argmax). This is a no-op for non-softmax loss.
    prev_image = self.get_sampled_frame(prev_image)

    generated_items = [prev_image]
    groundtruth_items = [cur_image]
    done_warm_start = tf.greater(time_step, context_frames - 1)
    input_image, = self.get_scheduled_sample_inputs(
        done_warm_start, groundtruth_items, generated_items, ss_func)

    # Prediction
    pred_image, lstm_states, _ = self.construct_predictive_tower(
        input_image, None, action, lstm_states, latent)

    if self.hparams.reward_prediction:
      reward_input_image = self.get_sampled_frame(pred_image)
      if self.hparams.reward_prediction_stop_gradient:
        reward_input_image = tf.stop_gradient(reward_input_image)
      with tf.control_dependencies([time_step]):
        frame_buf = [reward_input_image] + frame_buf[:-1]
      pred_reward = self.reward_prediction(frame_buf, None, action, latent)
      pred_reward = common_video.decode_to_shape(
          pred_reward, common_layers.shape_list(input_reward), "reward_dec")
    else:
      pred_reward = prev_reward

    time_step += 1
    outputs = (time_step, pred_image, pred_reward, frame_buf, lstm_states)

    return outputs

  # Latent tower
  latent = None
  if self.hparams.stochastic_model:
    latent_mean, latent_std = self.construct_latent_tower(images, time_axis=0)
    latent = common_video.get_gaussian_tensor(latent_mean, latent_std)

  # HACK: Do the first step outside to initialize all the variables
  lstm_states = [None] * (5 if self.hparams.small_mode else 7)
  frame_buffer = [tf.zeros_like(images[0])] * buffer_size
  inputs = images[0], rewards[0], actions[0]
  init_image_shape = common_layers.shape_list(images[0])
  if self.is_per_pixel_softmax:
    init_image_shape[-1] *= 256
  init_image = tf.zeros(init_image_shape, dtype=images.dtype)
  prev_outputs = (tf.constant(0), init_image, tf.zeros_like(rewards[0]),
                  frame_buffer, lstm_states)

  initializers = process_single_frame(prev_outputs, inputs)
  first_gen_images = tf.expand_dims(initializers[1], axis=0)
  first_gen_rewards = tf.expand_dims(initializers[2], axis=0)

  inputs = (images[1:-1], rewards[1:-1], actions[1:-1])

  outputs = tf.scan(process_single_frame, inputs, initializers)
  gen_images, gen_rewards = outputs[1:3]

  gen_images = tf.concat((first_gen_images, gen_images), axis=0)
  gen_rewards = tf.concat((first_gen_rewards, gen_rewards), axis=0)

  if self.hparams.stochastic_model:
    return gen_images, gen_rewards, [latent_mean], [latent_std]
  else:
    return gen_images, gen_rewards, None, None
def define_ppo_epoch(memory, hparams, action_space, batch_size,
                     distributional_size=1, distributional_subscale=0.04,
                     distributional_threshold=0.0, epoch=-1):
  """PPO epoch."""
  observation, reward, done, action, old_pdf, value_sm = memory

  # This is to avoid propagating gradients through simulated environment.
  observation = tf.stop_gradient(observation)
  action = tf.stop_gradient(action)
  reward = tf.stop_gradient(reward)
  if hasattr(hparams, "rewards_preprocessing_fun"):
    reward = hparams.rewards_preprocessing_fun(reward)
  done = tf.stop_gradient(done)
  value_sm = tf.stop_gradient(value_sm)
  old_pdf = tf.stop_gradient(old_pdf)

  value = value_sm
  if distributional_size > 1:
    value = _distributional_to_value(
        value_sm, distributional_size, distributional_subscale,
        distributional_threshold)

  advantage = calculate_generalized_advantage_estimator(
      reward, value, done, hparams.gae_gamma, hparams.gae_lambda)

  if distributional_size > 1:
    # Create discounted reward values range.
    half = distributional_size // 2
    value_range = tf.to_float(tf.range(-half, half)) + 0.5  # Mid-bucket value.
    value_range *= distributional_subscale
    # Acquire new discounted rewards by using the above range as end-values.
    end_values = tf.expand_dims(value_range, 0)
    discounted_reward = discounted_rewards(
        reward, done, hparams.gae_gamma, end_values)
    # Re-normalize the discounted rewards to integers, in [0, dist_size] range.
    discounted_reward /= distributional_subscale
    discounted_reward += half
    discounted_reward = tf.maximum(discounted_reward, 0.0)
    discounted_reward = tf.minimum(discounted_reward, distributional_size)
    # Multiply the rewards by 2 for greater fidelity and round to integers.
    discounted_reward = tf.stop_gradient(tf.round(2 * discounted_reward))
    # The probabilities corresponding to the end values from old predictions.
    discounted_reward_prob = tf.stop_gradient(value_sm[-1])
    discounted_reward_prob = tf.nn.softmax(discounted_reward_prob, axis=-1)
  else:
    discounted_reward = tf.stop_gradient(advantage + value[:-1])
    discounted_reward_prob = discounted_reward  # Unused in this case.

  advantage_mean, advantage_variance = tf.nn.moments(
      advantage, axes=[0, 1], keep_dims=True)
  advantage_normalized = tf.stop_gradient(
      (advantage - advantage_mean) / (tf.sqrt(advantage_variance) + 1e-8))

  add_lists_elementwise = lambda l1, l2: [x + y for x, y in zip(l1, l2)]

  number_of_batches = ((hparams.epoch_length - 1) *
                       hparams.optimization_epochs //
                       hparams.optimization_batch_size)
  epoch_length = hparams.epoch_length
  if hparams.effective_num_agents is not None:
    number_of_batches *= batch_size
    number_of_batches //= hparams.effective_num_agents
    epoch_length //= hparams.effective_num_agents

  assert number_of_batches > 0, "Set the parameters so that number_of_batches > 0"
  lr = learning_rate.learning_rate_schedule(hparams)

  shuffled_indices = [tf.random.shuffle(tf.range(epoch_length - 1))
                      for _ in range(hparams.optimization_epochs)]
  shuffled_indices = tf.concat(shuffled_indices, axis=0)
  shuffled_indices = shuffled_indices[:number_of_batches *
                                      hparams.optimization_batch_size]
  indices_of_batches = tf.reshape(
      shuffled_indices, shape=(-1, hparams.optimization_batch_size))
  input_tensors = [observation, action, discounted_reward,
                   discounted_reward_prob, advantage_normalized, old_pdf]

  ppo_step_rets = tf.scan(
      lambda a, i: add_lists_elementwise(  # pylint: disable=g-long-lambda
          a,
          define_ppo_step(
              [tf.gather(t, indices_of_batches[i, :]) for t in input_tensors],
              hparams, action_space, lr,
              epoch=epoch,
              distributional_size=distributional_size,
              distributional_subscale=distributional_subscale)),
      tf.range(number_of_batches),
      [0., 0., 0.],
      parallel_iterations=1)

  ppo_summaries = [tf.reduce_mean(ret) / number_of_batches
                   for ret in ppo_step_rets]
  ppo_summaries.append(lr)
  summaries_names = [
      "policy_loss", "value_loss", "entropy_loss", "learning_rate"
  ]

  summaries = [tf.summary.scalar(summary_name, summary)
               for summary_name, summary in zip(summaries_names,
                                                ppo_summaries)]
  losses_summary = tf.summary.merge(summaries)

  for summary_name, summary in zip(summaries_names, ppo_summaries):
    losses_summary = tf.Print(losses_summary, [summary],
                              summary_name + ": ")

  return losses_summary
def regular_log_prob_fn(params):
  if regression_hier_type == "none":
    beta = params
    beta_scaled = beta
  elif regression_hier_type == "centered":
    mu_0 = params[Ellipsis, -1]
    tau_0 = tf.nn.softplus(params[Ellipsis, -2])
    beta = params[Ellipsis, :-2]
    beta_scaled = beta
  elif regression_hier_type == "non_centered":
    mu_0 = params[Ellipsis, -1]
    tau_0 = tf.nn.softplus(params[Ellipsis, -2])
    beta = params[Ellipsis, :-2]
    beta_scaled = beta / tf.expand_dims(tau_0, -1) + tf.expand_dims(mu_0, -1)
  else:
    raise ValueError("Unknown regression_hier_type:" + regression_hier_type)

  if batch_size:

    def body(_, i):
      y_dist = tfd.Categorical(
          logits=tf.einsum(
              "ij,kjm->kim", x[i:i + batch_size],
              tf.reshape(beta_scaled, [-1, num_features, num_classes])))
      return tf.reduce_sum(y_dist.log_prob(y[i:i + batch_size]), -1)

    log_prob = tf.reduce_sum(
        tf.scan(
            body,
            tf.range(0, x.shape[0], batch_size),
            initializer=tf.zeros(tf.shape(params)[:1]),
            parallel_iterations=1), 0)
  else:
    y_dist = tfd.Categorical(
        logits=tf.einsum(
            "ij,kjm->kim", x,
            tf.reshape(beta_scaled, [-1, num_features, num_classes])))
    log_prob = tf.reduce_sum(y_dist.log_prob(y), -1)

  def make_beta_dist(loc, scale):
    if regression_beta_prior == "normal":
      return tfd.Normal(loc=loc, scale=scale)
    else:
      if tf.convert_to_tensor(loc).shape.ndims == 0:
        loc = tf.fill(
            tf.stack([tf.shape(params)[0], num_features * num_classes]), loc)
      if tf.convert_to_tensor(scale).shape.ndims == 0:
        scale = tf.fill(
            tf.stack([tf.shape(params)[0], num_features * num_classes]),
            scale)
      scale = tf.linalg.LinearOperatorDiag(scale)
      return tfd.MultivariateStudentTLinearOperator(
          loc=loc, scale=scale, df=t_dof)

  if regression_hier_type == "none":
    beta_dist = make_beta_dist(loc=0.0, scale=10.0)
  else:
    mu_0_dist = tfd.Normal(loc=0.0, scale=10.0)
    tau_0_dist = tfd.Gamma(2.0, 1.0)
    log_prob += mu_0_dist.log_prob(mu_0) + tau_0_dist.log_prob(tau_0)
    if regression_hier_type == "centered":
      mu_0 = tf.tile(
          tf.expand_dims(mu_0, -1), [1, num_features * num_classes])
      tau_0 = tf.tile(
          tf.expand_dims(tau_0, -1), [1, num_features * num_classes])
      beta_dist = make_beta_dist(loc=mu_0, scale=1.0 / tau_0)
    elif regression_hier_type == "non_centered":
      beta_dist = make_beta_dist(loc=0.0, scale=1.0)
  log_prob += tf.reduce_sum(beta_dist.log_prob(beta), -1)
  return log_prob
def vtrace_from_importance_weights(log_rhos,
                                   discounts,
                                   rewards,
                                   values,
                                   bootstrap_value,
                                   clip_rho_threshold=1.0,
                                   clip_pg_rho_threshold=1.0,
                                   name='vtrace_from_importance_weights'):
  r"""V-trace from log importance weights.

  Calculates V-trace actor critic targets as described in

  "IMPALA: Scalable Distributed Deep-RL with
  Importance Weighted Actor-Learner Architectures"
  by Espeholt, Soyer, Munos et al.

  In the notation used throughout documentation and comments, T refers to the
  time dimension ranging from 0 to T-1. B refers to the batch size and
  NUM_ACTIONS refers to the number of actions. This code also supports the
  case where all tensors have the same number of additional dimensions, e.g.,
  `rewards` is `[T, B, C]`, `values` is `[T, B, C]`, `bootstrap_value` is
  `[B, C]`.

  Args:
    log_rhos: A float32 tensor of shape `[T, B, NUM_ACTIONS]` representing
      the log importance sampling weights, i.e.
      log(target_policy(a) / behaviour_policy(a)). V-trace performs
      operations on rhos in log-space for numerical stability.
    discounts: A float32 tensor of shape `[T, B]` with discounts encountered
      when following the behaviour policy.
    rewards: A float32 tensor of shape `[T, B]` containing rewards generated
      by following the behaviour policy.
    values: A float32 tensor of shape `[T, B]` with the value function
      estimates wrt. the target policy.
    bootstrap_value: A float32 of shape `[B]` with the value function
      estimate at time T.
    clip_rho_threshold: A scalar float32 tensor with the clipping threshold
      for importance weights (rho) when calculating the baseline targets
      (vs). rho^bar in the paper. If None, no clipping is applied.
    clip_pg_rho_threshold: A scalar float32 tensor with the clipping
      threshold on rho_s in \rho_s \delta log \pi(a|x)
      (r + \gamma v_{s+1} - V(x_s)). If None, no clipping is applied.
    name: The name scope that all V-trace operations will be created in.

  Returns:
    A VTraceReturns namedtuple (vs, pg_advantages) where:
      vs: A float32 tensor of shape `[T, B]`. Can be used as target to train
        a baseline (V(x_t) - vs_t)^2.
      pg_advantages: A float32 tensor of shape `[T, B]`. Can be used as the
        advantage in the calculation of policy gradients.
  """
  log_rhos = tf.convert_to_tensor(log_rhos, dtype=tf.float32)
  discounts = tf.convert_to_tensor(discounts, dtype=tf.float32)
  rewards = tf.convert_to_tensor(rewards, dtype=tf.float32)
  values = tf.convert_to_tensor(values, dtype=tf.float32)
  bootstrap_value = tf.convert_to_tensor(bootstrap_value, dtype=tf.float32)
  if clip_rho_threshold is not None:
    clip_rho_threshold = tf.convert_to_tensor(clip_rho_threshold,
                                              dtype=tf.float32)
  if clip_pg_rho_threshold is not None:
    clip_pg_rho_threshold = tf.convert_to_tensor(clip_pg_rho_threshold,
                                                 dtype=tf.float32)

  # Make sure tensor ranks are consistent.
  rho_rank = log_rhos.shape.ndims  # Usually 2.
  values.shape.assert_has_rank(rho_rank)
  bootstrap_value.shape.assert_has_rank(rho_rank - 1)
  discounts.shape.assert_has_rank(rho_rank)
  rewards.shape.assert_has_rank(rho_rank)
  if clip_rho_threshold is not None:
    clip_rho_threshold.shape.assert_has_rank(0)
  if clip_pg_rho_threshold is not None:
    clip_pg_rho_threshold.shape.assert_has_rank(0)

  with tf.name_scope(
      name, values=[log_rhos, discounts, rewards, values, bootstrap_value]):
    rhos = tf.exp(log_rhos)
    if clip_rho_threshold is not None:
      clipped_rhos = tf.minimum(clip_rho_threshold, rhos,
                                name='clipped_rhos')
    else:
      clipped_rhos = rhos

    cs = tf.minimum(1.0, rhos, name='cs')
    # Append bootstrapped value to get [v1, ..., v_t+1]
    values_t_plus_1 = tf.concat(
        [values[1:], tf.expand_dims(bootstrap_value, 0)], axis=0)
    deltas = clipped_rhos * (rewards + discounts * values_t_plus_1 - values)

    # Note that all sequences are reversed, computation starts from the back.
    sequences = (
        tf.reverse(discounts, axis=[0]),
        tf.reverse(cs, axis=[0]),
        tf.reverse(deltas, axis=[0]),
    )

    # V-trace vs are calculated through a scan from the back to the
    # beginning of the given trajectory.
    def scanfunc(acc, sequence_item):
      discount_t, c_t, delta_t = sequence_item
      return delta_t + discount_t * c_t * acc

    initial_values = tf.zeros_like(bootstrap_value)
    vs_minus_v_xs = tf.scan(
        fn=scanfunc,
        elems=sequences,
        initializer=initial_values,
        parallel_iterations=1,
        back_prop=False,
        name='scan')
    # Reverse the results back to original order.
    vs_minus_v_xs = tf.reverse(vs_minus_v_xs, [0], name='vs_minus_v_xs')

    # Add V(x_s) to get v_s.
    vs = tf.add(vs_minus_v_xs, values, name='vs')

    # Advantage for policy gradient.
    vs_t_plus_1 = tf.concat(
        [vs[1:], tf.expand_dims(bootstrap_value, 0)], axis=0)
    if clip_pg_rho_threshold is not None:
      clipped_pg_rhos = tf.minimum(clip_pg_rho_threshold, rhos,
                                   name='clipped_pg_rhos')
    else:
      clipped_pg_rhos = rhos
    pg_advantages = (
        clipped_pg_rhos * (rewards + discounts * vs_t_plus_1 - values))

    # Make sure no gradients backpropagated through the returned values.
    return VTraceReturns(
        vs=tf.stop_gradient(vs),
        pg_advantages=tf.stop_gradient(pg_advantages))
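# A toy invocation sketch with T=5, B=2 (log_rhos already reduced to [T, B],
# consistent with the rank checks above); inputs are random placeholders and
# VTraceReturns is assumed to be the namedtuple defined alongside this
# function. TF1 session assumed.
import tensorflow as tf

T, B = 5, 2
log_rhos = 0.1 * tf.random.normal([T, B])  # Log importance weights.
discounts = tf.fill([T, B], 0.99)
rewards = tf.random.normal([T, B])
values = tf.random.normal([T, B])
bootstrap_value = tf.random.normal([B])

vtrace = vtrace_from_importance_weights(
    log_rhos, discounts, rewards, values, bootstrap_value)

with tf.Session() as sess:
  vs, pg_adv = sess.run([vtrace.vs, vtrace.pg_advantages])
  print(vs.shape, pg_adv.shape)  # (5, 2) (5, 2)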
def shuffle_network(inputs, hparams):
  """Neural Shuffle-Network with skip connections between blocks.

  Args:
    inputs: inputs to the Shuffle-Exchange network. Length should be a
      power of 2.
    hparams: Model configuration.

  Returns:
    tf.Tensor: Outputs of the Shuffle-Exchange last layer.
  """

  def forward_step(state, layer_nr):
    with tf.variable_scope("forward"):
      last_state, residuals = state
      prev = residuals[layer_nr, :, :, :]
      switch = SwitchLayer("switch", hparams.dropout, hparams.mode)
      cur = switch(last_state, prev)
      return shuffle_layer(cur), residuals

  def reverse_step(state, layer_nr):
    with tf.variable_scope("reverse"):
      last_state, residuals = state
      prev = residuals[layer_nr, :, :, :]
      switch = SwitchLayer("reverse_switch", hparams.dropout, hparams.mode)
      cur = switch(last_state, prev)
      return reverse_shuffle_layer(cur), residuals

  input_shape = tf.shape(inputs)
  n_bits = tf.log(tf.cast(input_shape[1] - 1, tf.float32)) / tf.log(2.0)
  n_bits = tf.cast(n_bits, tf.int32) + 1

  queue_shape = [n_bits * 2, input_shape[0], input_shape[1], input_shape[2]]
  residuals_queue = tf.zeros(queue_shape)

  block_out = tf.tanh(inputs)
  for k in range(hparams.num_hidden_layers):
    with tf.variable_scope("benes_block_" + str(k), reuse=tf.AUTO_REUSE):
      forward_outputs, _ = tf.scan(
          forward_step,
          tf.range(0, n_bits),
          initializer=(block_out, residuals_queue),
          parallel_iterations=1,
          swap_memory=True)
      forward_tensors = [tf.expand_dims(block_out, axis=0), forward_outputs]
      forward_outputs = tf.concat(forward_tensors, axis=0)
      forward_last = forward_outputs[-1, :, :, :]

      reverse_outputs, _ = tf.scan(
          reverse_step,
          tf.range(n_bits, n_bits * 2),
          initializer=(forward_last, residuals_queue),
          parallel_iterations=1,
          swap_memory=True)
      block_out = reverse_outputs[-1, :, :, :]
      residuals_queue = tf.concat([forward_outputs, reverse_outputs], axis=0)

  last_layer = SwitchLayer("last_layer", hparams.dropout, hparams.mode)
  return last_layer(block_out, residuals_queue[n_bits * 2, :, :, :])