def __init__(self, observations, env_spec):
    with tf.name_scope('fully_conv_model'):
        spatial_streams = {
            name: spatial_stream(observations[name], spec)
            for name, spec in env_spec.observation_spec.items()
            if spec.is_spatial
        }

        fc = Concatenate()([Flatten()(x) for x in spatial_streams.values()])
        fc = Dense(
            256,
            activation='relu',
            name='fc',
            kernel_initializer=tf.keras.initializers.Orthogonal())(fc)

        with tf.name_scope('policy'):
            self.policy = {}
            for name, spec in env_spec.action_spec.items():
                with tf.name_scope(name):
                    if spec.obs_space:
                        logits = Conv2D(
                            1,
                            1,
                            activation='linear',
                            data_format='channels_first',
                            kernel_initializer=tf.keras.initializers.Orthogonal(gain=0.1))(
                                spatial_streams[spec.obs_space])
                        logits = Flatten()(logits)
                    else:
                        logits = Dense(
                            np.prod(spec.sizes),
                            activation='linear',
                            kernel_initializer=tf.keras.initializers.Orthogonal(gain=0.1))(fc)

                    if name == 'function_id':
                        logits = tf.where(
                            observations['available_actions'] > 0,
                            logits,
                            -1000 * tf.ones_like(logits),
                            name='mask_unavailable_functions')

                    self.policy[name] = tfp.distributions.Categorical(logits=logits)

        with tf.name_scope('actions'):
            self.actions = {
                name: dist.sample(name=name + '_sample')
                for name, dist in self.policy.items()
            }

        with tf.name_scope('value'):
            self.value = value_output(fc)
def _variable_summaries(var):
    """Attach a lot of summaries to a Tensor (for TensorBoard visualization)."""
    with tf.name_scope('summaries'):
        mean = tf.reduce_mean(var)
        tf.summary.scalar('mean', mean)
        with tf.name_scope('stddev'):
            stddev = tf.sqrt(tf.reduce_mean(tf.square(var - mean)))
        tf.summary.scalar('stddev', stddev)
        tf.summary.scalar('max', tf.reduce_max(var))
        tf.summary.scalar('min', tf.reduce_min(var))
        tf.summary.histogram('histogram', var)
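# Hedged usage sketch (not part of the original listing): one way _variable_summaries
# might be attached to a variable and written out with a TF1 summary writer. The
# variable name 'weights' and the log directory are illustrative assumptions.
weights = tf.Variable(tf.random_normal([64, 32]), name='weights')
_variable_summaries(weights)
merged = tf.summary.merge_all()
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    writer = tf.summary.FileWriter('/tmp/variable_summaries', sess.graph)
    writer.add_summary(sess.run(merged), global_step=0)
    writer.close()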
def _summary():
    with tf.name_scope('ActorCriticLoss'):
        tf.summary.scalar("values", tf.reduce_mean(value))
        tf.summary.scalar("returns", tf.reduce_mean(returns))
        tf.summary.scalar("advantages", tf.reduce_mean(advantages))
        tf.summary.scalar("explained_variance_of_return_by_value",
                          common.explained_variance(value, returns))
def _ph_op(self):
    with tf.name_scope("init_ph"):
        x, y, y_feature = self._input_shapes

        # x driving series
        self.x = tf.placeholder(dtype=tf.float32, shape=(None,) + x, name='x')
        # future values of driving series
        self.y = tf.placeholder(dtype=tf.float32, shape=(None,) + y, name='y')
        # future values of the ancillary series
        self.y_features = tf.placeholder(
            dtype=tf.float32, shape=(None,) + y_feature, name='y_features')

        self.mu = tf.placeholder_with_default(0., shape=(), name='mu')
        self.std = tf.placeholder_with_default(1., shape=(), name='std')
        self.keep_prob = tf.placeholder_with_default(1., shape=(), name='keep_prob')
        self.is_training = tf.placeholder_with_default(True, shape=(), name='is_training')
        self.gen_len = tf.placeholder_with_default(1, shape=(), name='gen_len')
        self.flag = tf.placeholder(shape=(), dtype=tf.bool)
def preprocess_observations(self):
    def one_hot_encode(x, scale):
        x = tf.squeeze(x, axis=1)
        x = tf.cast(x, tf.int32)
        return tf.one_hot(x, scale, axis=1)

    def preprocess_observation(input_obs, spec):
        if spec.is_spatial:
            features = Lambda(
                lambda x: tf.split(x, x.get_shape()[1], axis=1))(input_obs)
            for f in spec.features:
                if f.type == FeatureType.CATEGORICAL:
                    features[f.index] = Lambda(
                        lambda x: one_hot_encode(x, f.scale))(features[f.index])
                else:
                    features[f.index] = Lambda(lambda x: x / f.scale)(features[f.index])
            return features
        else:
            return input_obs

    with tf.name_scope('preprocess_observations'):
        return {
            name: preprocess_observation(self.input_observations[name], spec)
            for name, spec in self.env_spec.observation_spec.items()
        }
def _train_op(self):
    with tf.name_scope("train_op"):
        d_opt = tf.train.GradientDescentOptimizer(self.d_lr)
        var_list = tf.trainable_variables(self.scope + "/discriminator")
        gvs, d_norm = clip_grads(self.d_loss, var_list)
        self.d_train = d_opt.minimize(self.d_loss,
                                      var_list=var_list,
                                      global_step=self._global_step)

        g_opt = tf.train.AdamOptimizer(self.g_lr)
        var_list = tf.trainable_variables(self.scope + "/generator")
        gvs, g_norm = clip_grads(self.g_loss, var_list)
        self.g_train = g_opt.minimize(self.g_loss,
                                      var_list=var_list,
                                      global_step=self._global_step)
        # g_train = g_opt.apply_gradients(gvs, global_step=self._global_step)

        self.train = tf.cond(self.flag, lambda: self.g_train, lambda: self.d_train)

        self._summary_dict.update({
            "distance": self._gen_norm(self.x_fake, self.y),
            "g_norm": g_norm,
            "d_norm": d_norm,
            "g_loss": self.g_loss,
            "d_loss": self.d_loss
        })
def _build_select_slate_op(self):
    p_no_click = self._prob_no_click_ph
    p = self._doc_affinity_scores_ph
    q = self._net_outputs.q_values[0]
    with tf.name_scope('select_slate'):
        self._output_slate = self._select_slate_fn(self._slate_size, p_no_click, p, q)

    self._output_slate = tf.Print(
        self._output_slate, [tf.constant('cp 1'), self._output_slate, p, q],
        summarize=10000)
    self._output_slate = tf.reshape(self._output_slate, (self._slate_size,))

    self._action_counts = tf.get_variable(
        'action_counts',
        shape=[self._num_candidates],
        initializer=tf.zeros_initializer())
    output_slate = tf.reshape(self._output_slate, [-1])
    output_one_hot = tf.one_hot(output_slate, self._num_candidates)
    update_ops = []
    for i in range(self._slate_size):
        update_ops.append(tf.assign_add(self._action_counts, output_one_hot[i]))
    self._select_action_update_op = tf.group(*update_ops)
def _summary_op(self):
    with tf.name_scope("summary_op"):
        # self._summary_list += tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
        metrics = regr_metrics(y=self.y, y_hat=self.y_hat)
        metrics = {k: tf.reduce_mean(v) for k, v in metrics.items()}
        self._summary_dict.update(metrics)
        self.summary = summary_op(t_dict=self._summary_dict)
def _build_networks(self):
    with tf.name_scope('networks'):
        self._replay_net_outputs = self._network_adapter(self._replay.states, 'Online')
        self._replay_next_target_net_outputs = self._network_adapter(
            self._replay.states, 'Target')
        self._net_outputs = self._network_adapter(self.state_ph, 'Online')
        self._q_argmax = tf.argmax(self._net_outputs.q_values, axis=1)[0]
def _build_networks(self):
    with tf.name_scope('networks'):
        self._replay_net_outputs = self._network_adapter(self._replay.states, 'Online')
        self._replay_next_target_net_outputs = self._network_adapter(
            self._replay.states, 'Target')
        self._net_outputs = self._network_adapter(self.state_ph, 'Online')
        self._build_select_slate_op()
def value_loss(self):
    with tf.name_scope('value_loss'):
        loss = tf.losses.mean_squared_error(
            self.model.value, self.input_returns) * self.value_factor
        tf.summary.scalar('value_loss', loss, family='losses')
        return loss
def _loss_op(self):
    with tf.name_scope("loss_op"):
        self.d_loss = tf.reduce_mean(self._fake_d) - tf.reduce_mean(self._true_d)
        self.g_loss = -tf.reduce_mean(self._fake_d)

        # reg = self._reg(tf.shape(self.x)[0], self.d, self.x, self.x_fake)
        # self.d_loss += reg

        self.loss = [self.d_loss, self.g_loss]
def _train_op(self):
    with tf.name_scope("train_op"):
        opt = train_fn(global_step=self._global_step)
        gvs, norm = clip_grads(self.loss, self.vars)
        # self.train = opt.apply_gradients(gvs, global_step=self._global_step)
        self.train = opt.minimize(self.loss,
                                  var_list=self.vars,
                                  global_step=self._global_step)
        self._summary_dict.update({"norm": norm})
def prediction_loss(self, truths, palette):
    def spatial_loss(truth_features, predicted_features, space_desc):
        feature_losses = []
        for truth, prediction, spec in zip(truth_features, predicted_features,
                                           space_desc.features):
            if spec.type == FeatureType.CATEGORICAL:
                truth = tf.transpose(truth, (0, 2, 3, 1))
                prediction = tf.transpose(prediction, (0, 2, 3, 1))
                feature_losses.append(tf.losses.softmax_cross_entropy(truth, prediction))

                summary_image = tf.argmax(tf.concat([truth, prediction], 2), 3)
                summary_image = tf.gather(palette[space_desc.index][spec.index],
                                          summary_image)
                tf.summary.image(spec.name, summary_image)
            else:
                feature_losses.append(tf.losses.mean_squared_error(truth, prediction))

                summary_image = tf.concat([truth, prediction], 3)
                tf.summary.image(spec.name, tf.transpose(summary_image, (0, 2, 3, 1)))

            tf.summary.scalar(spec.name, feature_losses[-1])

        return tf.reduce_mean(tf.stack(feature_losses))

    with tf.name_scope('prediction_loss'):
        spatial_losses = []
        for s in self.env_spec.spaces:
            with tf.name_scope(s.name):
                loss = spatial_loss(truths[s.index], self.out_pred[s.index], s)
                spatial_losses.append(loss)
                tf.summary.scalar('loss', loss)

        loss = tf.reduce_mean(tf.stack(spatial_losses))
        tf.summary.scalar('loss', loss)
        return loss
def selu(x):
    """SELU activation https://arxiv.org/abs/1706.02515

    :param x: input tensor
    :return: scaled exponential linear activation of x
    """
    with tf.name_scope('elu') as scope:
        alpha = 1.6732632423543772848170429916717
        scale = 1.0507009873554804934193349852946
        return scale * tf.where(x >= 0.0, x, alpha * tf.nn.elu(x))
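# Hedged numeric check (not part of the original listing): for x >= 0 the SELU above
# returns scale * x, and for x < 0 it returns scale * alpha * (exp(x) - 1), since
# tf.nn.elu(x) = exp(x) - 1 on the negative side. Values below are illustrative.
x = tf.constant([-2.0, -0.5, 0.0, 0.5, 2.0])
y = selu(x)
with tf.Session() as sess:
    print(sess.run(y))  # approx. [-1.52, -0.69, 0.0, 0.53, 2.10]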
def init_subagents(self,
                   model_fns,
                   obs_specs,
                   act_specs,
                   policy_clses,
                   n_subagents=0,
                   subagent_variable_scopes=[]):
    assert n_subagents == len(model_fns) == len(obs_specs) == len(policy_clses) \
        == len(act_specs) == len(subagent_variable_scopes), \
        "n_subagents must equal the number of model_fns, obs_specs, act_specs, " \
        "policy_clses and subagent_variable_scopes"

    self.subagents = {}
    for model_fn, obs_spec, act_spec, policy_cls, subagent_variable_scope in zip(
            model_fns, obs_specs, act_specs, policy_clses, subagent_variable_scopes):
        subagent = Subagent()
        subagent_dir = self.subagent_dirs[subagent_variable_scope]

        print(LOGGING_MSG_HEADER, 'resetting tf graph for subagent: ',
              subagent_variable_scope)
        tf.reset_default_graph()

        subagent.sess_mgr = SessionManager(
            base_path=subagent_dir,
            training_enabled=False,
            model_variable_scope=subagent_variable_scope)
        subagent.sess = subagent.sess_mgr.sess
        subagent.variable_scope = subagent_variable_scope

        with subagent.sess.graph.as_default():
            with tf.name_scope(subagent.sess_mgr.main_tf_vs.original_name_scope):
                subagent.model = model_fn(obs_spec, act_spec)
                subagent.value = subagent.model.outputs[-1]
                subagent.policy = policy_cls(act_spec, subagent.model.outputs[:-1])
                print(LOGGING_MSG_HEADER, subagent.variable_scope,
                      ' model setup successful')

                subagent.sess_mgr.restore_or_init()
                print(LOGGING_MSG_HEADER, subagent.variable_scope,
                      ' model restore successful')

        self.subagents[subagent_variable_scope] = subagent

    self.subagents_idx_key_dict = {}
    for idx, subagent_variable_scope in enumerate(self.subagents.keys()):
        self.subagents_idx_key_dict[idx] = subagent_variable_scope

    print(LOGGING_MSG_HEADER + "{} subagents are available: {}".format(
        self.n_subagents, self.subagents_idx_key_dict))
    print("Type the corresponding index to select a subagent.")
def __init__(self,
             observations,
             env_spec,
             dense_layer_size=(512,),
             activation='elu',
             output_predictions_fn=None):
    with tf.name_scope('model'):
        with tf.name_scope('input'):
            spatial_features = [
                input_block(observations[name], name, spec)
                for name, spec in env_spec.observation_spec.items()
                if spec.is_spatial
            ]

        with tf.name_scope('core'):
            spatial_features = [Flatten()(f) for f in spatial_features]
            spatial_features = Concatenate(name='concatenate_features')(spatial_features)

            dense = spatial_features
            for i, size in enumerate(dense_layer_size):
                op = Dense(size, activation=activation, name='state_dense_' + str(i))
                dense = op(dense)
                tf.summary.histogram('state_dense_' + str(i) + '_kernel_weights',
                                     op.weights[0])

            tf.summary.scalar('dense_zero_fraction', tf.nn.zero_fraction(dense))
            tf.summary.histogram('dense_input', spatial_features)
            tf.summary.histogram('dense_output', dense)

        with tf.name_scope('value'):
            self.value = value_output(dense)

        with tf.name_scope('policy'):
            self.policy = policy_output(dense, observations['available_actions'],
                                        env_spec.action_spec)

        with tf.name_scope('actions'):
            self.actions = sample_policy(self.policy)

        if output_predictions_fn:
            with tf.name_scope('prediction'):
                self.prediction = [
                    output_predictions_fn(dense, s)
                    for s in env_spec.observation_spec.values()
                    if s.is_spatial
                ]
def state_rewards(states,
                  actions,
                  rewards,
                  next_states,
                  contexts,
                  weight_index=None,
                  state_indices=None,
                  weight_vector=1.0,
                  offset_vector=0.0,
                  summarize=False):
    """Returns the rewards that are a linear mapping of next_states.

    Args:
      states: A [batch_size, num_state_dims] Tensor representing a batch
        of states.
      actions: A [batch_size, num_action_dims] Tensor representing a batch
        of actions.
      rewards: A [batch_size] Tensor representing a batch of rewards.
      next_states: A [batch_size, num_state_dims] Tensor representing a batch
        of next states.
      contexts: A list of [batch_size, num_context_dims] Tensor representing
        a batch of contexts.
      weight_index: (integer) Index of the contexts list that specifies weighting.
      state_indices: (a list of Numpy integer array) Indices of states dimensions
        to be mapped.
      weight_vector: (a number or a list or Numpy array) The weighting vector,
        broadcastable to `next_states`.
      offset_vector: (a number or a list or Numpy array) The offset vector.
      summarize: (boolean) enable summary ops.

    Returns:
      A new tf.float32 [batch_size] rewards Tensor, and
        tf.float32 [batch_size] discounts tensor.
    """
    del states, actions, rewards  # unused args
    stats = {}
    record_tensor(next_states, state_indices, stats)
    next_states = index_states(next_states, state_indices)
    weight = tf.constant(
        weight_vector, dtype=next_states.dtype, shape=next_states[0].shape)
    weights = tf.expand_dims(weight, 0)
    offset = tf.constant(
        offset_vector, dtype=next_states.dtype, shape=next_states[0].shape)
    offsets = tf.expand_dims(offset, 0)
    if weight_index is not None:
        weights *= contexts[weight_index]
    rewards = tf.to_float(tf.reduce_sum(weights * (next_states + offsets), axis=1))
    if summarize:
        with tf.name_scope('RewardFn/'):
            summarize_stats(stats)
    return rewards, tf.ones_like(rewards)
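# Hedged, self-contained sketch (not part of the original listing): the core of
# state_rewards is a weighted sum over (next_states + offset) along the selected
# state dimensions. The tensors and numbers below are illustrative only.
next_states = tf.constant([[1.0, 2.0], [3.0, 4.0]])   # [batch_size=2, num_state_dims=2]
weights = tf.expand_dims(tf.constant([0.5, 1.0]), 0)  # broadcast over the batch
offsets = tf.expand_dims(tf.constant([0.0, -1.0]), 0)
rewards = tf.reduce_sum(weights * (next_states + offsets), axis=1)
with tf.Session() as sess:
    print(sess.run(rewards))  # [1.5, 4.5]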
def _network_adapter(self, states, scope):
    self._validate_states(states)

    with tf.name_scope('network'):
        q_value_list = []
        for slate in self._all_possible_slates:
            user = tf.squeeze(states[:, 0, :, :], axis=2)
            docs = []
            for i in slate:
                docs.append(tf.squeeze(states[:, i + 1, :, :], axis=2))
            q_value_list.append(self.network(user, tf.concat(docs, axis=1), scope))
        q_values = tf.concat(q_value_list, axis=1)

    return dqn_agent.DQNNetworkType(q_values)
def preprocess_spatial_observation(input_obs,
                                   spec,
                                   categorical_embedding_dims=16,
                                   non_categorical_scaling='log'):
    with tf.name_scope('preprocess_spatial_obs'):
        features = Lambda(lambda x: tf.split(x, x.get_shape()[1], axis=1))(input_obs)

        for f in spec.features:
            if f.is_categorical:
                features[f.index] = Lambda(lambda x: tf.squeeze(x, axis=1))(features[f.index])
                features[f.index] = Embedding(f.scale,
                                              categorical_embedding_dims)(features[f.index])
                features[f.index] = Permute((3, 1, 2))(features[f.index])
            else:
                if non_categorical_scaling == 'log':
                    features[f.index] = Lambda(lambda x: tf.log(x + 1e-10))(features[f.index])
                elif non_categorical_scaling == 'normalize':
                    features[f.index] = Lambda(lambda x: x / f.scale)(features[f.index])

        return features
def _loss_op(self):
    with tf.name_scope("loss_op"):
        weights = tf.ones_like(self.y, name='weights')
        self.loss = sequence_loss(self.y_hat,
                                  self.y,
                                  weights=weights,
                                  loss_fn=which_loss(self._config.loss))
        self._summary_dict.update({"loss": self.loss})

        if hasattr(self, '_reg'):
            reg = tf.reduce_sum(self._reg)
            self.loss += reg
            self._summary_dict.update({"loss": self.loss, "reg": reg})
        else:
            self._summary_dict.update({"loss": self.loss})
def tanh_similarity(states,
                    actions,
                    rewards,
                    next_states,
                    contexts,
                    mse_scale=1.0,
                    state_scales=1.0,
                    goal_scales=1.0,
                    summarize=False):
    """Returns the similarity between next_states and contexts using tanh and mse.

    Args:
      states: A [batch_size, num_state_dims] Tensor representing a batch
        of states.
      actions: A [batch_size, num_action_dims] Tensor representing a batch
        of actions.
      rewards: A [batch_size] Tensor representing a batch of rewards.
      next_states: A [batch_size, num_state_dims] Tensor representing a batch
        of next states.
      contexts: A list of [batch_size, num_context_dims] Tensor representing
        a batch of contexts.
      mse_scale: A float, to scale mse before tanh.
      state_scales: multiplicative scale for (next) states. A scalar or 1D tensor,
        must be broadcastable to number of state dimensions.
      goal_scales: multiplicative scale for contexts. A scalar or 1D tensor,
        must be broadcastable to number of goal dimensions.
      summarize: (boolean) enable summary ops.

    Returns:
      A new tf.float32 [batch_size] rewards Tensor, and
        tf.float32 [batch_size] discounts tensor.
    """
    del states, actions, rewards  # Unused
    mse = tf.reduce_mean(
        tf.squared_difference(next_states * state_scales, contexts[0] * goal_scales), -1)
    tanh = tf.tanh(mse_scale * mse)
    if summarize:
        with tf.name_scope('RewardFn/'):
            tf.summary.scalar('mean_mse', tf.reduce_mean(mse))
            tf.summary.histogram('mse', mse)
            tf.summary.scalar('mean_tanh', tf.reduce_mean(tanh))
            tf.summary.histogram('tanh', tanh)
    rewards = tf.to_float(1 - tanh)
    return rewards, tf.ones_like(rewards)
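# Hedged, self-contained sketch (not part of the original listing): the reward above
# is 1 - tanh(mse_scale * MSE(next_state, goal)), so it approaches 1 as the next
# state matches the goal and decays toward 0 as the MSE grows. Values are illustrative.
next_states = tf.constant([[0.0, 0.0], [1.0, 1.0], [3.0, 3.0]])
goals = tf.constant([[0.0, 0.0], [0.0, 0.0], [0.0, 0.0]])
mse = tf.reduce_mean(tf.squared_difference(next_states, goals), -1)
reward = 1.0 - tf.tanh(mse)  # mse_scale = 1.0
with tf.Session() as sess:
    print(sess.run(reward))  # approx. [1.0, 0.24, 0.0]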
def entropy_loss(self):
    with tf.name_scope('entropy_loss'):
        entropies = [dist.entropy() for name, dist in self.model.policy.items()]
        entropy = tf.reduce_mean(tf.add_n(entropies))
        entropy_loss = -entropy * self.entropy_factor

        entropy_masked = tf.stack(entropies, axis=-1) * tf.gather(
            self.function_args_mask, self.input_actions['function_id'])
        entropy_masked = tf.reduce_mean(tf.reduce_sum(entropy_masked, axis=-1))

        tf.summary.scalar('policy_entropy', entropy, family='entropy')
        tf.summary.scalar('policy_entropy_masked', entropy_masked, family='entropy')
        tf.summary.scalar('entropy_loss', entropy_loss, family='losses')

        return entropy_loss
def _network_adapter(self, states, scope):
    self._validate_states(states)

    with tf.name_scope('network'):
        # Since we decompose the slate optimization into an item-level
        # optimization problem, the observation space is the user state
        # observation plus all documents' observations. In the Dopamine DQN agent
        # implementation, there is one head for each possible action value, which
        # is designed for computing the argmax operation in the action space.
        # In our implementation, we generate one output for each document.
        q_value_list = []
        for i in range(self._num_candidates):
            user = tf.squeeze(states[:, 0, :, :], axis=2)
            doc = tf.squeeze(states[:, i + 1, :, :], axis=2)
            q_value_list.append(self.network(user, doc, scope))
        q_values = tf.concat(q_value_list, axis=1)

    return dqn_agent.DQNNetworkType(q_values)
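# Hedged, self-contained sketch (not part of the original listing): the comment above
# describes one Q output per candidate document rather than one head per slate.
# toy_q_network is an illustrative stand-in for self.network; unlike the agent's
# scope-based reuse, its variables are not shared across candidates here.
def toy_q_network(user, doc):
    # One scalar Q-value per (user, doc) pair.
    return tf.layers.dense(tf.concat([user, doc], axis=1), 1)

batch_size, user_dim, doc_dim, num_candidates = 4, 3, 3, 5
user = tf.placeholder(tf.float32, [batch_size, user_dim])
docs = [tf.placeholder(tf.float32, [batch_size, doc_dim]) for _ in range(num_candidates)]
q_values = tf.concat([toy_q_network(user, d) for d in docs], axis=1)  # [batch, num_candidates]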
def policy_loss(self):
    with tf.name_scope('policy_loss'):
        log_probs = [
            dist.log_prob(self.input_actions[name])
            for name, dist in self.model.policy.items()
        ]
        log_probs = tf.stack(log_probs, axis=-1)
        log_probs = log_probs * tf.gather(self.function_args_mask,
                                          self.input_actions['function_id'])

        advantage = self.input_returns - self.model.value
        policy_loss = -tf.reduce_mean(
            tf.reduce_sum(log_probs, axis=-1) *
            tf.stop_gradient(advantage)) * self.policy_factor

        tf.summary.scalar('policy_loss', policy_loss, family='losses')

        return policy_loss
def _attend(self, query, key, value, key_class_id):
    """Transformer attention function."""
    with tf.name_scope('attend'):
        q_shape = tf.shape(query)
        v_shape = tf.shape(value)
        n_q = q_shape[0]
        h_q = q_shape[1]
        w_q = q_shape[2]
        d = q_shape[3]
        n_v = v_shape[0]
        h_v = v_shape[1]
        w_v = v_shape[2]
        c = v_shape[3]

        q = tf.reshape(query, [-1, d])  # [n_q*Hq*Wq, d]
        k = tf.reshape(key, [-1, d])

        # [n_v*Hv*Wv, d] x [Nq*Hq*Wq, d] --> [n_v*Hv*Wv, Nq*Hq*Wq]
        logits = tf.matmul(k, q, transpose_b=True)
        d_scale = tf.rsqrt(tf.cast(d, logits.dtype))

        # logits: [n_v, Hv*Wv, n_q*Hq*Wq]
        logits = tf.reshape(d_scale * logits, [n_v, h_v * w_v, -1])

        # attn: [n_v, Hv*Wv, n_q*Hq*Wq]
        attn = self.get_support_set_softmax(logits, key_class_id)

        # aggregate:
        v = tf.reshape(value, [n_v, h_v * w_v, c])

        # [n_v, Hv*Wv, n_q*Hq*Wq] x [n_v, Hv*Wv, c] --> [n_v, n_q*Hq*Wq, c]
        v_agg = tf.einsum('ijk,ijl->ikl', attn, v)
        v_agg = tf.reshape(v_agg, [n_v, n_q, h_q, w_q, c])
        v_agg.set_shape([None, None, None, None, value.shape[-1]])

        return v_agg  # [N_c, n_q, Hq, Wq, c]
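# Hedged, self-contained shape check (not part of the original listing): the einsum
# 'ijk,ijl->ikl' above aggregates values over key positions (j), producing one
# feature vector per query position (k) for every value image (i). Sizes illustrative.
n_v, hw_v, n_q_hw_q, c = 2, 6, 12, 8
attn = tf.nn.softmax(tf.random_normal([n_v, hw_v, n_q_hw_q]), axis=1)
v = tf.random_normal([n_v, hw_v, c])
v_agg = tf.einsum('ijk,ijl->ikl', attn, v)
with tf.Session() as sess:
    print(sess.run(tf.shape(v_agg)))  # [2, 12, 8]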
def create_sampling_ops(self, use_staging):
    """Creates the ops necessary to sample from the replay buffer.

    Creates the transition dictionary containing the sampling tensors.

    Args:
      use_staging: bool, when True it would use a staging area to prefetch the
        next sampling batch.
    """
    with tf.name_scope('sample_replay'):
        with tf.device('/cpu:*'):
            transition_type = self.memory.get_transition_elements()
            transition_tensors = tf.py_func(
                self.memory.sample_transition_batch, [],
                [return_entry.type for return_entry in transition_type],
                name='replay_sample_py_func')
            self._set_transition_shape(transition_tensors, transition_type)
            if use_staging:
                transition_tensors = self._set_up_staging(transition_tensors)
                self._set_transition_shape(transition_tensors, transition_type)

            # Unpack sample transition into member variables.
            self.unpack_transition(transition_tensors, transition_type)
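# Hedged, self-contained sketch (not part of the original listing): the sampling op
# above bridges a pure-Python replay buffer into the graph with tf.py_func, so a
# session run of the returned tensors triggers the Python sampling code. ToyMemory
# is an illustrative stand-in for the wrapped memory.
import numpy as np

class ToyMemory(object):
    def __init__(self):
        self._transitions = []

    def add(self, obs, action, reward, terminal):
        self._transitions.append((obs, action, reward, terminal))

    def sample_transition(self):
        obs, action, reward, terminal = self._transitions[
            np.random.randint(len(self._transitions))]
        return obs, np.int32(action), np.float32(reward), np.uint8(terminal)

memory = ToyMemory()
memory.add(np.zeros(4, dtype=np.uint8), 1, 0.5, 0)
sample_op = tf.py_func(memory.sample_transition, [],
                       [tf.uint8, tf.int32, tf.float32, tf.uint8],
                       name='toy_replay_sample_py_func')
with tf.Session() as sess:
    print(sess.run(sample_op))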
def _loss_op(self):
    with tf.name_scope("loss_op"):
        # labels = tf.distributions.Uniform(low=0.7, high=1.2).sample(tf.shape(self._true_d))
        labels = tf.ones_like(self._true_d)
        d_loss_true = tf.reduce_mean(
            tf.nn.sigmoid_cross_entropy_with_logits(logits=self._true_d, labels=labels))

        # labels = tf.distributions.Uniform(low=0., high=0.3).sample(tf.shape(self._fake_d))
        labels = tf.zeros_like(self._fake_d)
        d_loss_fake = tf.reduce_mean(
            tf.nn.sigmoid_cross_entropy_with_logits(logits=self._fake_d, labels=labels))

        self.d_loss = d_loss_true + d_loss_fake
        self.g_loss = tf.reduce_mean(
            tf.nn.sigmoid_cross_entropy_with_logits(
                logits=self._fake_d, labels=tf.ones_like(self._fake_d)))

        self.loss = [self.d_loss, self.g_loss]
def diff_distance(states,
                  actions,
                  rewards,
                  next_states,
                  contexts,
                  state_scales=1.0,
                  goal_scales=1.0,
                  reward_scales=1.0,
                  weight_index=None,
                  weight_vector=None,
                  summarize=False,
                  termination_epsilon=1e-4,
                  state_indices=None,
                  goal_indices=None,
                  norm='L2',
                  epsilon=1e-10):
    """Returns the difference in euclidean distance between states/next_states and contexts.

    Args:
      states: A [batch_size, num_state_dims] Tensor representing a batch
        of states.
      actions: A [batch_size, num_action_dims] Tensor representing a batch
        of actions.
      rewards: A [batch_size] Tensor representing a batch of rewards.
      next_states: A [batch_size, num_state_dims] Tensor representing a batch
        of next states.
      contexts: A list of [batch_size, num_context_dims] Tensor representing
        a batch of contexts.
      state_scales: multiplicative scale for (next) states. A scalar or 1D tensor,
        must be broadcastable to number of state dimensions.
      goal_scales: multiplicative scale for goals. A scalar or 1D tensor,
        must be broadcastable to number of goal dimensions.
      reward_scales: multiplicative scale for rewards. A scalar or 1D tensor,
        must be broadcastable to number of reward dimensions.
      weight_index: (integer) The context list index that specifies weight.
      weight_vector: (a number or a list or Numpy array) The weighting vector,
        broadcastable to `next_states`.
      summarize: (boolean) enable summary ops.
      termination_epsilon: terminate if dist is less than this quantity.
      state_indices: (a list of integers) list of state indices to select.
      goal_indices: (a list of integers) list of goal indices to select.
      norm: L1 or L2.
      epsilon: small offset to ensure non-negative/zero distance.

    Returns:
      A new tf.float32 [batch_size] rewards Tensor, and
        tf.float32 [batch_size] discounts tensor.
    """
    del actions, rewards  # Unused
    stats = {}
    record_tensor(next_states, state_indices, stats, 'next_states')
    next_states = index_states(next_states, state_indices)
    states = index_states(states, state_indices)
    goals = index_states(contexts[0], goal_indices)

    next_sq_dists = tf.squared_difference(next_states * state_scales,
                                          goals * goal_scales)
    sq_dists = tf.squared_difference(states * state_scales, goals * goal_scales)
    record_tensor(sq_dists, None, stats, 'sq_dists')

    if weight_vector is not None:
        next_sq_dists *= tf.convert_to_tensor(weight_vector, dtype=next_states.dtype)
        sq_dists *= tf.convert_to_tensor(weight_vector, dtype=next_states.dtype)
    if weight_index is not None:
        next_sq_dists *= contexts[weight_index]
        sq_dists *= contexts[weight_index]

    if norm == 'L1':
        next_dist = tf.sqrt(next_sq_dists + epsilon)
        dist = tf.sqrt(sq_dists + epsilon)
        next_dist = tf.reduce_sum(next_dist, -1)
        dist = tf.reduce_sum(dist, -1)
    elif norm == 'L2':
        next_dist = tf.reduce_sum(next_sq_dists, -1)
        next_dist = tf.sqrt(next_dist + epsilon)  # tf.gradients fails when tf.sqrt(-0.0)
        dist = tf.reduce_sum(sq_dists, -1)
        dist = tf.sqrt(dist + epsilon)  # tf.gradients fails when tf.sqrt(-0.0)
    else:
        raise NotImplementedError(norm)

    discounts = next_dist > termination_epsilon
    if summarize:
        with tf.name_scope('RewardFn/'):
            tf.summary.scalar('mean_dist', tf.reduce_mean(dist))
            tf.summary.histogram('dist', dist)
            summarize_stats(stats)

    diff = dist - next_dist
    diff *= reward_scales
    return tf.to_float(diff), tf.to_float(discounts)
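# Hedged, self-contained sketch (not part of the original listing): with norm='L2'
# the reward above is ||s - g|| - ||s' - g||, i.e. positive when the next state
# moves closer to the goal. The points below are illustrative only.
states = tf.constant([[3.0, 4.0]])       # distance 5 from the goal
next_states = tf.constant([[0.0, 3.0]])  # distance 3 from the goal
goals = tf.constant([[0.0, 0.0]])
dist = tf.sqrt(tf.reduce_sum(tf.squared_difference(states, goals), -1) + 1e-10)
next_dist = tf.sqrt(tf.reduce_sum(tf.squared_difference(next_states, goals), -1) + 1e-10)
with tf.Session() as sess:
    print(sess.run(dist - next_dist))  # approx. [2.0]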
def __init__(self,
             num_actions,
             observation_size,
             stack_size,
             use_staging=True,
             replay_capacity=1000000,
             batch_size=32,
             update_horizon=1,
             gamma=1.0,
             wrapped_memory=None):
    """Initializes a graph wrapper for the python replay memory.

    Args:
      num_actions: int, number of possible actions.
      observation_size: int, size of an input frame.
      stack_size: int, number of frames to use in state stack.
      use_staging: bool, when True it would use a staging area to prefetch the
        next sampling batch.
      replay_capacity: int, number of transitions to keep in memory.
      batch_size: int.
      update_horizon: int, length of update ('n' in n-step update).
      gamma: float, the discount factor.
      wrapped_memory: The 'inner' memory data structure. Defaults to None, which
        creates the standard DQN replay memory.

    Raises:
      ValueError: If update_horizon is not positive.
      ValueError: If discount factor is not in [0, 1].
    """
    if replay_capacity < update_horizon + 1:
        raise ValueError(
            'Update horizon (%i) should be significantly smaller '
            'than replay capacity (%i).' % (update_horizon, replay_capacity))
    if not update_horizon >= 1:
        raise ValueError('Update horizon must be positive.')
    if not 0.0 <= gamma <= 1.0:
        raise ValueError('Discount factor (gamma) must be in [0, 1].')

    # Allow subclasses to create self.memory.
    if wrapped_memory is not None:
        self.memory = wrapped_memory
    else:
        self.memory = OutOfGraphReplayMemory(num_actions, observation_size,
                                             stack_size, replay_capacity,
                                             batch_size, update_horizon, gamma)

    with tf.name_scope('replay'):
        with tf.name_scope('add_placeholders'):
            self.add_obs_ph = tf.placeholder(tf.uint8, [observation_size],
                                             name='add_obs_ph')
            self.add_action_ph = tf.placeholder(tf.int32, [], name='add_action_ph')
            self.add_reward_ph = tf.placeholder(tf.float32, [], name='add_reward_ph')
            self.add_terminal_ph = tf.placeholder(tf.uint8, [], name='add_terminal_ph')
            self.add_legal_actions_ph = tf.placeholder(
                tf.float32, [num_actions], name='add_legal_actions_ph')

        add_transition_ph = [
            self.add_obs_ph, self.add_action_ph, self.add_reward_ph,
            self.add_terminal_ph, self.add_legal_actions_ph
        ]

        with tf.device('/cpu:*'):
            self.add_transition_op = tf.py_func(
                self.memory.add, add_transition_ph, [], name='replay_add_py_func')

            self.transition = tf.py_func(
                self.memory.sample_transition_batch, [],
                [tf.uint8, tf.int32, tf.float32, tf.uint8, tf.uint8, tf.int32,
                 tf.float32],
                name='replay_sample_py_func')

            if use_staging:
                # To hide the py_func latency use a staging area to pre-fetch the
                # next batch of transitions.
                (states, actions, rewards, next_states, terminals, indices,
                 next_legal_actions) = self.transition

                # StagingArea requires all the shapes to be defined.
                states.set_shape([batch_size, observation_size, stack_size])
                actions.set_shape([batch_size])
                rewards.set_shape([batch_size])
                next_states.set_shape([batch_size, observation_size, stack_size])
                terminals.set_shape([batch_size])
                indices.set_shape([batch_size])
                next_legal_actions.set_shape([batch_size, num_actions])

                # Create the staging area in CPU.
                prefetch_area = tf.contrib.staging.StagingArea([
                    tf.uint8, tf.int32, tf.float32, tf.uint8, tf.uint8, tf.int32,
                    tf.float32
                ])

                self.prefetch_batch = prefetch_area.put(
                    (states, actions, rewards, next_states, terminals, indices,
                     next_legal_actions))
            else:
                self.prefetch_batch = tf.no_op()

        if use_staging:
            # Get the sample_transition_batch in GPU. This would do the copy from
            # CPU to GPU.
            self.transition = prefetch_area.get()

        (self.states, self.actions, self.rewards, self.next_states,
         self.terminals, self.indices, self.next_legal_actions) = self.transition

        # Since these are py_func tensors, no information about their shape is
        # present. Setting the shape only for the necessary tensors.
        self.states.set_shape([None, observation_size, stack_size])
        self.next_states.set_shape([None, observation_size, stack_size])