def visualize(logdir, outdir, num_agents, num_episodes, checkpoint=None,
              env_processes=True):
  """Recover checkpoint and render videos from it.

  Args:
    logdir: Logging directory of the trained algorithm.
    outdir: Directory to store rendered videos in.
    num_agents: Number of environments to simulate in parallel.
    num_episodes: Total number of episodes to simulate.
    checkpoint: Checkpoint name to load; defaults to most recent.
    env_processes: Whether to step environments in separate processes.
  """
  config = utility.load_config(logdir)
  with tf.device('/cpu:0'):
    batch_env = utility.define_batch_env(
        lambda: _create_environment(config, outdir), num_agents, env_processes)
    graph = utility.define_simulation_graph(batch_env, config.algorithm, config)
    total_steps = num_episodes * config.max_length
    loop = _define_loop(graph, total_steps)
  saver = utility.define_saver(exclude=(r'.*_temporary/.*', r'global_step'))
  sess_config = tf.ConfigProto(allow_soft_placement=True)
  sess_config.gpu_options.allow_growth = True
  with tf.Session(config=sess_config) as sess:
    utility.initialize_variables(sess, saver, config.logdir, checkpoint,
                                 resume=True)
    for unused_score in loop.run(sess, saver, total_steps):
      pass
  batch_env.close()
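# A minimal usage sketch, not part of the original module: the directory paths
# below are placeholders. Renders five episodes from the most recent checkpoint
# using a single in-process environment.
visualize(logdir='/tmp/ppo_logdir', outdir='/tmp/ppo_videos', num_agents=1,
          num_episodes=5, checkpoint=None, env_processes=False)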
def main(argv):
  del argv  # Unused.
  config = utility.load_config(FLAGS.logdir)
  policy_layers = config.policy_layers
  value_layers = config.value_layers
  env = config.env(render=True, log_path=FLAGS.log_path, env_randomizer=None)
  network = config.network
  with tf.Session() as sess:
    agent = simple_ppo_agent.SimplePPOPolicy(
        sess, env, network,
        policy_layers=policy_layers,
        value_layers=value_layers,
        checkpoint=os.path.join(FLAGS.logdir, FLAGS.checkpoint))
    sum_reward = 0
    observation = env.reset()
    while True:
      action = agent.get_action([observation])
      observation, reward, done, _ = env.step(action[0])
      # This sleep prevents a serial communication error on the real robot.
      time.sleep(0.002)
      sum_reward += reward
      if done:
        break
    tf.logging.info("reward: {}".format(sum_reward))
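# main() above assumes TF1-style command-line flags and an app entry point. A
# sketch of how they might be declared; the flag names match the attributes
# used above, but the defaults and help strings are assumptions.
import tensorflow as tf

flags = tf.app.flags
flags.DEFINE_string('logdir', None, 'Directory containing the trained checkpoints.')
flags.DEFINE_string('checkpoint', None, 'Checkpoint file name inside logdir.')
flags.DEFINE_string('log_path', None, 'Optional path for environment logging.')
FLAGS = flags.FLAGS

if __name__ == '__main__':
  tf.app.run(main)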
def main(argv):
  del argv  # Unused.
  config = utility.load_config(LOG_DIR)
  policy_layers = config.policy_layers
  value_layers = config.value_layers
  env = config.env(render=True)
  network = config.network
  with tf.Session() as sess:
    agent = simple_ppo_agent.SimplePPOPolicy(
        sess, env, network,
        policy_layers=policy_layers,
        value_layers=value_layers,
        checkpoint=os.path.join(LOG_DIR, CHECKPOINT))
    sum_reward = 0
    observation = env.reset()
    while True:
      action = agent.get_action([observation])
      observation, reward, done, _ = env.step(action[0])
      # Brief pause between steps (see the note on the real-robot variant above).
      time.sleep(0.002)
      sum_reward += reward
      if done:
        break
    tf.logging.info("reward: %s", sum_reward)
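# This variant reads its run directory and checkpoint name from module-level
# constants instead of flags. A sketch of how they might look; both values are
# placeholders, not taken from the original source.
LOG_DIR = '/tmp/minitaur_ppo_logdir'
CHECKPOINT = 'model.ckpt-14000000'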
def __init__(self, world, id, json_data):
  self.tf_scope = 'agent'
  # Give each agent its own graph and session so that its variables and ops
  # stay isolated from other agents and from the default graph.
  self.graph = tf.Graph()
  self.sess = tf.Session(graph=self.graph)
  super().__init__(world, id, json_data)
  self._build_graph(json_data)
  self._init_normalizers()
  return
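# A generic sketch of the per-agent graph/session pattern used above, with
# illustrative names (not the class's actual helpers): ops are created under
# the agent's own graph and executed through the session bound to it.
class _IsolatedGraphExample(object):

  def __init__(self):
    self.graph = tf.Graph()
    self.sess = tf.Session(graph=self.graph)
    with self.graph.as_default():
      # Ops created here belong to self.graph, not the global default graph.
      self.counter = tf.Variable(0, name='counter')
      self.increment = tf.assign_add(self.counter, 1)
      init = tf.variables_initializer([self.counter])
    self.sess.run(init)

  def step(self):
    return self.sess.run(self.increment)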
def train(config, env_processes):
  """Training and evaluation entry point yielding scores.

  Resolves some configuration attributes, creates environments, graph, and
  training loop. By default, assigns all operations to the CPU.

  Args:
    config: Object providing configurations via attributes.
    env_processes: Whether to step environments in separate processes.

  Yields:
    Evaluation scores.
  """
  tf.reset_default_graph()
  with config.unlocked:
    config.network = functools.partial(
        utility.define_network, config.network, config)
    config.policy_optimizer = getattr(tf.train, config.policy_optimizer)
    config.value_optimizer = getattr(tf.train, config.value_optimizer)
  if config.update_every % config.num_agents:
    tf.logging.warn('Number of agents should divide episodes per update.')
  with tf.device('/cpu:0'):
    batch_env = utility.define_batch_env(
        lambda: _create_environment(config), config.num_agents, env_processes)
    graph = utility.define_simulation_graph(batch_env, config.algorithm, config)
    loop = _define_loop(
        graph, config.logdir,
        config.update_every * config.max_length,
        config.eval_episodes * config.max_length)
    total_steps = int(
        config.steps / config.update_every *
        (config.update_every + config.eval_episodes))
  # Exclude episode related variables since the Python state of environments is
  # not checkpointed and thus new episodes start after resuming.
  saver = utility.define_saver(exclude=(r'.*_temporary/.*',))
  sess_config = tf.ConfigProto(allow_soft_placement=True)
  sess_config.gpu_options.allow_growth = True
  with tf.Session(config=sess_config) as sess:
    utility.initialize_variables(sess, saver, config.logdir)
    for score in loop.run(sess, saver, total_steps):
      yield score
  batch_env.close()
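# A sketch of how the score generator returned by train() might be consumed;
# the caller shown here is illustrative, not the original entry point.
def run_training(config, env_processes=True):
  for score in train(config, env_processes):
    tf.logging.info('Evaluation score: %s', score)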