def _prepare_networks(self, hparams, sess):
  self.action = tf.placeholder(shape=(1,), dtype=tf.int32)
  batch_env = batch_env_factory(hparams)
  self.reward, self.done = batch_env.simulate(self.action)
  self.observation = batch_env.observ
  self.reset_op = batch_env.reset(tf.constant([0], dtype=tf.int32))

  environment_wrappers = hparams.environment_spec.wrappers
  wrappers = copy.copy(environment_wrappers) if environment_wrappers else []

  to_initialize = [batch_env]
  for w in wrappers:
    batch_env = w[0](batch_env, **w[1])
    to_initialize.append(batch_env)

  def initialization_lambda():
    for batch_env in to_initialize:
      batch_env.initialize(sess)

  self.initialize = initialization_lambda

  obs_copy = batch_env.observ + 0

  actor_critic = get_policy(tf.expand_dims(obs_copy, 0), hparams)
  self.policy_probs = actor_critic.policy.probs[0, 0, :]
  self.value = actor_critic.value[0, :]
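
# Illustrative sketch (not part of the original module): one way the ops
# prepared above might be driven from a session loop. It assumes `agent` is
# an object on which _prepare_networks(hparams, sess) has already run, that
# the module-level `import tensorflow as tf` is in scope, and that actions
# are sampled from `agent.policy_probs`. All names below are hypothetical.
def _example_interaction_loop(agent, sess, num_steps=100):
  import numpy as np  # local import to keep the sketch self-contained
  agent.initialize()        # runs initialization_lambda for all wrapped envs
  sess.run(agent.reset_op)  # resets environment 0
  for _ in range(num_steps):
    probs = sess.run(agent.policy_probs)
    # Renormalize to guard against float32 rounding before sampling.
    action = np.random.choice(len(probs), p=probs / probs.sum())
    # Feeding the chosen action steps the batch environment via simulate().
    _, done = sess.run([agent.reward, agent.done],
                       feed_dict={agent.action: [action]})
    if done[0]:
      sess.run(agent.reset_op)
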
def define_collect(hparams, scope, eval_phase, collect_level=-1,
                   policy_to_actions_lambda=None, on_simulated=False):
  """Collect trajectories."""
  with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
    batch_env = batch_env_factory(hparams)

    environment_wrappers = hparams.environment_spec.wrappers
    wrappers = copy.copy(environment_wrappers) if environment_wrappers else []

    # Put memory wrapper at the level you want to gather observations at.
    # Negative indices need to be shifted for insert to work correctly.
    collect_level = (collect_level if collect_level >= 0
                     else len(wrappers) + collect_level + 1)
    wrappers.insert(collect_level, [_MemoryWrapper, {}])
    rollout_metadata = None
    speculum = None
    for w in wrappers:
      batch_env = w[0](batch_env, **w[1])
      if w[0] == _MemoryWrapper:
        rollout_metadata = _rollout_metadata(batch_env)
        speculum = batch_env.speculum

    eval_phase = tf.convert_to_tensor(eval_phase)
    on_simulated = tf.convert_to_tensor(on_simulated)

    memory = [
        tf.get_variable("collect_memory_{}".format(name),
                        shape=[hparams.epoch_length] + shape,
                        dtype=dtype,
                        initializer=tf.zeros_initializer(),
                        trainable=False)
        for (shape, dtype, name) in rollout_metadata
    ]

    cumulative_rewards = tf.get_variable("cumulative_rewards",
                                         len(batch_env),
                                         trainable=False)

    should_reset_var = tf.Variable(True, trainable=False)
    zeros_tensor = tf.zeros(len(batch_env))

    def group():
      return tf.group(batch_env.reset(tf.range(len(batch_env))),
                      tf.assign(cumulative_rewards, zeros_tensor))

    reset_op = tf.cond(
        tf.logical_or(should_reset_var,
                      tf.logical_or(eval_phase, on_simulated)),
        group, tf.no_op)

    with tf.control_dependencies([reset_op]):
      reset_once_op = tf.assign(should_reset_var, False)

    with tf.control_dependencies([reset_once_op]):

      def step(index, scores_sum, scores_num):
        """Single step."""
        index %= hparams.epoch_length  # Only needed in eval runs.
        # Note - the only way to ensure making a copy of tensor is to run simple
        # operation. We are waiting for tf.copy:
        # https://github.com/tensorflow/tensorflow/issues/11186
        obs_copy = batch_env.observ + 0

        def env_step(arg1, arg2):  # pylint: disable=unused-argument
          """Step of the environment."""
          actor_critic = get_policy(tf.expand_dims(obs_copy, 0), hparams)
          policy = actor_critic.policy
          if policy_to_actions_lambda:
            action = policy_to_actions_lambda(policy)
          else:
            action = tf.cond(eval_phase, policy.mode, policy.sample)

          postprocessed_action = actor_critic.action_postprocessing(action)
          simulate_output = batch_env.simulate(postprocessed_action[0, ...])

          pdf = policy.prob(action)[0]
          value_function = actor_critic.value[0]
          pdf = tf.reshape(pdf, shape=(hparams.num_agents,))
          value_function = tf.reshape(value_function,
                                      shape=(hparams.num_agents,))

          with tf.control_dependencies(simulate_output):
            return tf.identity(pdf), tf.identity(value_function)

        pdf, value_function = tf.while_loop(
            lambda _1, _2: tf.equal(speculum.size(), 0),
            env_step,
            [
                tf.constant(0.0, shape=(hparams.num_agents,)),
                tf.constant(0.0, shape=(hparams.num_agents,))
            ],
            parallel_iterations=1,
            back_prop=False,
        )

        with tf.control_dependencies([pdf, value_function]):
          obs, reward, done, action = speculum.dequeue()
          done = tf.reshape(done, (len(batch_env),))
          to_save = [obs, reward, done, action, pdf, value_function]
          save_ops = [
              tf.scatter_update(memory_slot, index, value)
              for memory_slot, value in zip(memory, to_save)
          ]
          cumulate_rewards_op = cumulative_rewards.assign_add(reward)
          agent_indices_to_reset = tf.where(done)[:, 0]

        with tf.control_dependencies([cumulate_rewards_op]):
          scores_sum_delta = tf.reduce_sum(
              tf.gather(cumulative_rewards, agent_indices_to_reset))
          scores_num_delta = tf.count_nonzero(done, dtype=tf.int32)

        with tf.control_dependencies(save_ops +
                                     [scores_sum_delta, scores_num_delta]):
          reset_env_op = batch_env.reset(agent_indices_to_reset)
          reset_cumulative_rewards_op = tf.scatter_update(
              cumulative_rewards, agent_indices_to_reset,
              tf.gather(zeros_tensor, agent_indices_to_reset))

        with tf.control_dependencies(
            [reset_env_op, reset_cumulative_rewards_op]):
          return [
              index + 1, scores_sum + scores_sum_delta,
              scores_num + scores_num_delta
          ]

      def stop_condition(i, _, resets):
        return tf.cond(eval_phase,
                       lambda: resets < hparams.num_eval_agents,
                       lambda: i < hparams.epoch_length)

      init = [tf.constant(0), tf.constant(0.0), tf.constant(0)]
      index, scores_sum, scores_num = tf.while_loop(stop_condition,
                                                    step,
                                                    init,
                                                    parallel_iterations=1,
                                                    back_prop=False)

    mean_score = tf.cond(tf.greater(scores_num, 0),
                         lambda: scores_sum / tf.cast(scores_num, tf.float32),
                         lambda: 0.)
    printing = tf.Print(0, [mean_score, scores_sum, scores_num],
                        "mean_score: ")
    with tf.control_dependencies([index, printing]):
      memory = [tf.identity(mem) for mem in memory]
      mean_score_summary = tf.cond(
          tf.greater(scores_num, 0),
          lambda: tf.summary.scalar("mean_score_this_iter", mean_score), str)
      summaries = tf.summary.merge([
          mean_score_summary,
          tf.summary.scalar("episodes_finished_this_iter", scores_num)
      ])
      return memory, summaries
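
# Illustrative sketch (not in the original file): how the collect graph built
# by the variant above might be run. The hparams object is assumed to carry
# the fields accessed above (epoch_length, num_agents, num_eval_agents,
# environment_spec, ...). Note this early variant does not expose an
# environment-initialization callable, so only the TF variables are
# initialized here. Names below are hypothetical.
def _example_run_collect(hparams, summary_dir="/tmp/collect"):
  memory, summaries = define_collect(
      hparams, scope="collect_example", eval_phase=False)
  writer = tf.summary.FileWriter(summary_dir)
  with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    rollout = None
    for epoch in range(3):
      # Each run fills `memory` with one rollout of hparams.epoch_length steps.
      rollout, summary_str = sess.run([memory, summaries])
      writer.add_summary(summary_str, epoch)
  return rollout
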
def define_collect(hparams, scope, eval_phase,
                   collect_level=-1, policy_to_actions_lambda=None):
  """Collect trajectories.

  Args:
    hparams: HParams.
    scope: var scope.
    eval_phase: bool, is eval phase.
    collect_level: int, which level to collect observations.
    policy_to_actions_lambda: lambda.

  Returns:
    Returns memory (observations, rewards, dones, actions, pdfs,
    value_functions) containing a rollout of environment from collect_level
    of nested wrapper structure. Note that pdfs and value_functions are
    meaningful only if collect_level == -1.
  """
  to_initialize = []
  with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
    batch_env = batch_env_factory(hparams)
    to_initialize.append(batch_env)
    environment_wrappers = hparams.environment_spec.wrappers
    wrappers = copy.copy(environment_wrappers) if environment_wrappers else []

    # Put memory wrapper at the level you want to gather observations at.
    # Negative indices need to be shifted for insert to work correctly.
    collect_level = (collect_level if collect_level >= 0
                     else len(wrappers) + collect_level + 1)
    wrappers.insert(collect_level, [_MemoryWrapper, {}])
    rollout_metadata = None
    speculum = None
    for w in wrappers:
      tf.logging.info("Applying wrapper %s(%s) to env %s."
                      % (str(w[0]), str(w[1]), str(batch_env)))
      batch_env = w[0](batch_env, **w[1])
      to_initialize.append(batch_env)
      if w[0] == _MemoryWrapper:
        rollout_metadata = _rollout_metadata(batch_env)
        speculum = batch_env.speculum

    def initialization_lambda(sess):
      for batch_env in to_initialize:
        batch_env.initialize(sess)

    memory = [
        tf.get_variable("collect_memory_%d_%s" % (hparams.epoch_length, name),
                        shape=[hparams.epoch_length] + shape,
                        dtype=dtype,
                        initializer=tf.zeros_initializer(),
                        trainable=False)
        for (shape, dtype, name) in rollout_metadata
    ]

    cumulative_rewards = tf.get_variable("cumulative_rewards",
                                         len(batch_env),
                                         trainable=False)

    eval_phase = tf.convert_to_tensor(eval_phase)
    should_reset_var = tf.Variable(True, trainable=False)
    zeros_tensor = tf.zeros(len(batch_env))

    if "force_beginning_resets" in hparams:
      force_beginning_resets = hparams.force_beginning_resets
    else:
      force_beginning_resets = False
    force_beginning_resets = tf.convert_to_tensor(force_beginning_resets)

    def reset_ops_group():
      return tf.group(batch_env.reset(tf.range(len(batch_env))),
                      tf.assign(cumulative_rewards, zeros_tensor))

    reset_op = tf.cond(
        tf.logical_or(should_reset_var.read_value(), force_beginning_resets),
        reset_ops_group, tf.no_op)

    with tf.control_dependencies([reset_op]):
      reset_once_op = tf.assign(should_reset_var, False)

    with tf.control_dependencies([reset_once_op]):

      def step(index, scores_sum, scores_num):
        """Single step."""
        index %= hparams.epoch_length  # Only needed in eval runs.
        # Note - the only way to ensure making a copy of tensor is to run simple
        # operation. We are waiting for tf.copy:
        # https://github.com/tensorflow/tensorflow/issues/11186
        obs_copy = batch_env.observ + 0

        def env_step(arg1, arg2, arg3):  # pylint: disable=unused-argument
          """Step of the environment."""
          actor_critic = get_policy(tf.expand_dims(obs_copy, 0), hparams)
          policy = actor_critic.policy
          if policy_to_actions_lambda:
            action = policy_to_actions_lambda(policy)
          else:
            action = tf.cond(eval_phase, policy.mode, policy.sample)

          postprocessed_action = actor_critic.action_postprocessing(action)
          reward, done = batch_env.simulate(postprocessed_action[0, ...])

          pdf = policy.prob(action)[0]
          value_function = actor_critic.value[0]
          pdf = tf.reshape(pdf, shape=(hparams.num_agents,))
          value_function = tf.reshape(value_function,
                                      shape=(hparams.num_agents,))
          done = tf.reshape(done, shape=(hparams.num_agents,))

          with tf.control_dependencies([reward, done]):
            return (tf.identity(pdf), tf.identity(value_function),
                    tf.identity(done))

        # TODO(piotrmilos): while_body is executed at most once,
        # thus should be replaced with tf.cond
        pdf, value_function, top_level_done = tf.while_loop(
            lambda _1, _2, _3: tf.equal(speculum.size(), 0),
            env_step,
            [
                tf.constant(0.0, shape=(hparams.num_agents,)),
                tf.constant(0.0, shape=(hparams.num_agents,)),
                tf.constant(False, shape=(hparams.num_agents,))
            ],
            parallel_iterations=1,
            back_prop=False,
        )

        with tf.control_dependencies([pdf, value_function]):
          obs, reward, done, action = speculum.dequeue()
          to_save = [obs, reward, done, action, pdf, value_function]
          save_ops = [
              tf.scatter_update(memory_slot, index, value)
              for memory_slot, value in zip(memory, to_save)
          ]
          cumulate_rewards_op = cumulative_rewards.assign_add(reward)
          agent_indices_to_reset = tf.where(top_level_done)[:, 0]

        with tf.control_dependencies([cumulate_rewards_op]):
          # TODO(piotrmilos): possibly we need cumulative_rewards.read_value()
          scores_sum_delta = tf.reduce_sum(
              tf.gather(cumulative_rewards.read_value(),
                        agent_indices_to_reset))
          scores_num_delta = tf.count_nonzero(done, dtype=tf.int32)

        with tf.control_dependencies(save_ops +
                                     [scores_sum_delta, scores_num_delta]):
          reset_env_op = batch_env.reset(agent_indices_to_reset)
          reset_cumulative_rewards_op = tf.scatter_update(
              cumulative_rewards, agent_indices_to_reset,
              tf.gather(zeros_tensor, agent_indices_to_reset))

        with tf.control_dependencies(
            [reset_env_op, reset_cumulative_rewards_op]):
          return [
              index + 1, scores_sum + scores_sum_delta,
              scores_num + scores_num_delta
          ]

      def stop_condition(i, _, resets):
        return tf.cond(eval_phase,
                       lambda: resets < hparams.num_eval_agents,
                       lambda: i < hparams.epoch_length)

      init = [tf.constant(0), tf.constant(0.0), tf.constant(0)]
      index, scores_sum, scores_num = tf.while_loop(
          stop_condition, step, init, parallel_iterations=1, back_prop=False)

    # We handle force_beginning_resets differently. We assume that all envs
    # are reset at the end of an episode (though it actually happens at the
    # beginning of the next one).
    scores_num = tf.cond(force_beginning_resets,
                         lambda: scores_num + len(batch_env),
                         lambda: scores_num)

    with tf.control_dependencies([scores_sum]):
      scores_sum = tf.cond(
          force_beginning_resets,
          lambda: scores_sum + tf.reduce_sum(cumulative_rewards.read_value()),
          lambda: scores_sum)

    mean_score = tf.cond(tf.greater(scores_num, 0),
                         lambda: scores_sum / tf.cast(scores_num, tf.float32),
                         lambda: 0.)
    printing = tf.Print(0, [mean_score, scores_sum, scores_num],
                        "mean_score: ")
    with tf.control_dependencies([index, printing]):
      memory = [mem.read_value() for mem in memory]

      # When generating real data together with PPO training we must use a
      # single agent. For PPO to work we reshape the history, as if it was
      # generated by real_ppo_effective_num_agents agents.
      if getattr(hparams, "effective_num_agents", None):
        new_memory = []
        effective_num_agents = hparams.effective_num_agents
        assert hparams.epoch_length % effective_num_agents == 0, (
            "The rollout of hparams.epoch_length will be distributed amongst "
            "effective_num_agents of agents")
        new_epoch_length = int(hparams.epoch_length / effective_num_agents)
        for mem, info in zip(memory, rollout_metadata):
          shape, _, name = info
          new_shape = [effective_num_agents, new_epoch_length] + shape[1:]
          perm = list(range(len(shape) + 1))
          perm[0] = 1
          perm[1] = 0
          mem = tf.transpose(mem, perm=perm)
          mem = tf.reshape(mem, shape=new_shape)
          mem = tf.transpose(mem,
                             perm=perm,
                             name="collect_memory_%d_%s"
                             % (new_epoch_length, name))
          new_memory.append(mem)
        memory = new_memory

      mean_score_summary = tf.cond(
          tf.greater(scores_num, 0),
          lambda: tf.summary.scalar("mean_score_this_iter", mean_score), str)
      summaries = tf.summary.merge([
          mean_score_summary,
          tf.summary.scalar("episodes_finished_this_iter", scores_num)
      ])
      return memory, summaries, initialization_lambda
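
# Illustrative sketch (not in the original file) of the transpose/reshape/
# transpose trick used above for effective_num_agents: a rollout of shape
# [epoch_length, 1, ...] collected by a single agent is laid out as if
# epoch_length // effective_num_agents steps had been collected by
# effective_num_agents agents. The constants below are made up for
# illustration; plain numpy stands in for the tf ops.
def _example_effective_num_agents_reshape():
  import numpy as np
  epoch_length, effective_num_agents = 6, 2
  mem = np.arange(epoch_length).reshape(epoch_length, 1)  # [T, num_agents=1]
  perm = [1, 0]
  new_shape = [effective_num_agents, epoch_length // effective_num_agents]
  out = np.transpose(np.reshape(np.transpose(mem, perm), new_shape), perm)
  # mem was [[0], [1], [2], [3], [4], [5]]; out is [[0, 3], [1, 4], [2, 5]]
  # with shape [3, 2]: three time steps for each of two "agents".
  return out
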
def define_collect(hparams, scope, eval_phase,
                   collect_level=-1, policy_to_actions_lambda=None):
  """Collect trajectories."""
  to_initialize = []
  with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
    batch_env = batch_env_factory(hparams)
    to_initialize.append(batch_env)
    environment_wrappers = hparams.environment_spec.wrappers
    wrappers = copy.copy(environment_wrappers) if environment_wrappers else []

    # Put memory wrapper at the level you want to gather observations at.
    # Negative indices need to be shifted for insert to work correctly.
    collect_level = (collect_level if collect_level >= 0
                     else len(wrappers) + collect_level + 1)
    wrappers.insert(collect_level, [_MemoryWrapper, {}])
    rollout_metadata = None
    speculum = None
    for w in wrappers:
      batch_env = w[0](batch_env, **w[1])
      to_initialize.append(batch_env)
      if w[0] == _MemoryWrapper:
        rollout_metadata = _rollout_metadata(batch_env)
        speculum = batch_env.speculum

    def initialization_lambda(sess):
      for batch_env in to_initialize:
        batch_env.initialize(sess)

    memory = [
        tf.get_variable("collect_memory_{}".format(name),
                        shape=[hparams.epoch_length] + shape,
                        dtype=dtype,
                        initializer=tf.zeros_initializer(),
                        trainable=False)
        for (shape, dtype, name) in rollout_metadata
    ]

    cumulative_rewards = tf.get_variable("cumulative_rewards",
                                         len(batch_env),
                                         trainable=False)

    eval_phase = tf.convert_to_tensor(eval_phase)
    should_reset_var = tf.Variable(True, trainable=False)
    zeros_tensor = tf.zeros(len(batch_env))

    if "force_beginning_resets" in hparams:
      force_beginning_resets = hparams.force_beginning_resets
    else:
      force_beginning_resets = False

    def group():
      return tf.group(batch_env.reset(tf.range(len(batch_env))),
                      tf.assign(cumulative_rewards, zeros_tensor))

    reset_op = tf.cond(
        tf.logical_or(should_reset_var,
                      tf.convert_to_tensor(force_beginning_resets)),
        group, tf.no_op)

    with tf.control_dependencies([reset_op]):
      reset_once_op = tf.assign(should_reset_var, False)

    with tf.control_dependencies([reset_once_op]):

      def step(index, scores_sum, scores_num):
        """Single step."""
        index %= hparams.epoch_length  # Only needed in eval runs.
        # Note - the only way to ensure making a copy of tensor is to run simple
        # operation. We are waiting for tf.copy:
        # https://github.com/tensorflow/tensorflow/issues/11186
        obs_copy = batch_env.observ + 0

        def env_step(arg1, arg2):  # pylint: disable=unused-argument
          """Step of the environment."""
          actor_critic = get_policy(tf.expand_dims(obs_copy, 0), hparams)
          policy = actor_critic.policy
          if policy_to_actions_lambda:
            action = policy_to_actions_lambda(policy)
          else:
            action = tf.cond(eval_phase, policy.mode, policy.sample)

          postprocessed_action = actor_critic.action_postprocessing(action)
          simulate_output = batch_env.simulate(postprocessed_action[0, ...])

          pdf = policy.prob(action)[0]
          value_function = actor_critic.value[0]
          pdf = tf.reshape(pdf, shape=(hparams.num_agents,))
          value_function = tf.reshape(value_function,
                                      shape=(hparams.num_agents,))

          with tf.control_dependencies(simulate_output):
            return tf.identity(pdf), tf.identity(value_function)

        pdf, value_function = tf.while_loop(
            lambda _1, _2: tf.equal(speculum.size(), 0),
            env_step,
            [tf.constant(0.0, shape=(hparams.num_agents,)),
             tf.constant(0.0, shape=(hparams.num_agents,))],
            parallel_iterations=1,
            back_prop=False,
        )

        with tf.control_dependencies([pdf, value_function]):
          obs, reward, done, action = speculum.dequeue()
          done = tf.reshape(done, (len(batch_env),))
          to_save = [obs, reward, done, action, pdf, value_function]
          save_ops = [
              tf.scatter_update(memory_slot, index, value)
              for memory_slot, value in zip(memory, to_save)
          ]
          cumulate_rewards_op = cumulative_rewards.assign_add(reward)
          agent_indices_to_reset = tf.where(done)[:, 0]

        with tf.control_dependencies([cumulate_rewards_op]):
          scores_sum_delta = tf.reduce_sum(
              tf.gather(cumulative_rewards, agent_indices_to_reset))
          scores_num_delta = tf.count_nonzero(done, dtype=tf.int32)

        with tf.control_dependencies(save_ops +
                                     [scores_sum_delta, scores_num_delta]):
          reset_env_op = batch_env.reset(agent_indices_to_reset)
          reset_cumulative_rewards_op = tf.scatter_update(
              cumulative_rewards, agent_indices_to_reset,
              tf.gather(zeros_tensor, agent_indices_to_reset))

        with tf.control_dependencies(
            [reset_env_op, reset_cumulative_rewards_op]):
          return [
              index + 1, scores_sum + scores_sum_delta,
              scores_num + scores_num_delta
          ]

      def stop_condition(i, _, resets):
        return tf.cond(eval_phase,
                       lambda: resets < hparams.num_eval_agents,
                       lambda: i < hparams.epoch_length)

      init = [tf.constant(0), tf.constant(0.0), tf.constant(0)]
      index, scores_sum, scores_num = tf.while_loop(
          stop_condition, step, init, parallel_iterations=1, back_prop=False)

    mean_score = tf.cond(tf.greater(scores_num, 0),
                         lambda: scores_sum / tf.cast(scores_num, tf.float32),
                         lambda: 0.)
    printing = tf.Print(0, [mean_score, scores_sum, scores_num],
                        "mean_score: ")
    with tf.control_dependencies([index, printing]):
      memory = [tf.identity(mem) for mem in memory]
      mean_score_summary = tf.cond(
          tf.greater(scores_num, 0),
          lambda: tf.summary.scalar("mean_score_this_iter", mean_score), str)
      summaries = tf.summary.merge([
          mean_score_summary,
          tf.summary.scalar("episodes_finished_this_iter", scores_num)
      ])
      return memory, summaries, initialization_lambda
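
# Illustrative sketch (not in the original file): driving the variant above,
# which also returns the wrapper-initialization callable. The hparams fields
# are assumptions based on the attributes accessed in define_collect; the
# function name is hypothetical.
def _example_collect_epoch(hparams):
  memory, summaries, initialize_env = define_collect(
      hparams, scope="collect", eval_phase=False)
  with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    initialize_env(sess)  # lets every wrapped batch_env set itself up
    rollout, _ = sess.run([memory, summaries])
  # rollout is a list aligned with _rollout_metadata:
  # [observations, rewards, dones, actions, pdfs, value_functions],
  # each with leading dimension hparams.epoch_length.
  return rollout
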