def visualize(logdir, outdir, num_agents, num_episodes, checkpoint=None, env_processes=True):
  """Recover checkpoint and render videos from it.

  Args:
    logdir: Logging directory of the trained algorithm.
    outdir: Directory to store rendered videos in.
    num_agents: Number of environments to simulate in parallel.
    num_episodes: Total number of episodes to simulate.
    checkpoint: Checkpoint name to load; defaults to most recent.
    env_processes: Whether to step environments in separate processes.
  """
  config = utility.load_config(logdir)
  with tf.device('/cpu:0'):
    batch_env = utility.define_batch_env(lambda: _create_environment(config, outdir), num_agents,
                                         env_processes)
    graph = utility.define_simulation_graph(batch_env, config.algorithm, config)
    total_steps = num_episodes * config.max_length
    loop = _define_loop(graph, total_steps)
  saver = utility.define_saver(exclude=(r'.*_temporary/.*', r'global_step'))
  sess_config = tf.ConfigProto(allow_soft_placement=True)
  sess_config.gpu_options.allow_growth = True
  with tf.Session(config=sess_config) as sess:
    utility.initialize_variables(sess, saver, config.logdir, checkpoint, resume=True)
    for unused_score in loop.run(sess, saver, total_steps):
      pass
  batch_env.close()
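
# A hedged usage sketch, not part of the original script: visualize() relies on
# module-level helpers (_create_environment, _define_loop, utility) defined
# elsewhere in the same file, and would typically be driven by command-line
# flags like the illustrative ones below.
import tensorflow as tf

flags = tf.app.flags
flags.DEFINE_string('logdir', None, 'Logging directory of the trained algorithm.')
flags.DEFINE_string('outdir', None, 'Directory to store rendered videos in.')
FLAGS = flags.FLAGS


def _visualize_main(argv):
  del argv  # Unused.
  visualize(FLAGS.logdir, FLAGS.outdir, num_agents=1, num_episodes=5)


if __name__ == '__main__':
  tf.app.run(_visualize_main)
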
def main(argv):
    del argv  # Unused.
    config = utility.load_config(FLAGS.logdir)
    policy_layers = config.policy_layers
    value_layers = config.value_layers
    env = config.env(render=True, log_path=FLAGS.log_path, env_randomizer=None)
    network = config.network

    with tf.Session() as sess:
        agent = simple_ppo_agent.SimplePPOPolicy(sess,
                                                 env,
                                                 network,
                                                 policy_layers=policy_layers,
                                                 value_layers=value_layers,
                                                 checkpoint=os.path.join(
                                                     FLAGS.logdir,
                                                     FLAGS.checkpoint))

        sum_reward = 0
        observation = env.reset()
        while True:
            action = agent.get_action([observation])
            observation, reward, done, _ = env.step(action[0])
            # This sleep is to prevent serial communication error on the real robot.
            time.sleep(0.002)
            sum_reward += reward
            if done:
                break
        tf.logging.info("reward: {}".format(sum_reward))
Example #3
def main(argv):
    del argv  # Unused.
    config = utility.load_config(LOG_DIR)
    policy_layers = config.policy_layers
    value_layers = config.value_layers
    env = config.env(render=True)
    network = config.network

    with tf.Session() as sess:
        agent = simple_ppo_agent.SimplePPOPolicy(sess,
                                                 env,
                                                 network,
                                                 policy_layers=policy_layers,
                                                 value_layers=value_layers,
                                                 checkpoint=os.path.join(
                                                     LOG_DIR, CHECKPOINT))

        sum_reward = 0
        observation = env.reset()
        while True:
            action = agent.get_action([observation])
            observation, reward, done, _ = env.step(action[0])
            time.sleep(0.002)
            sum_reward += reward
            if done:
                break
        tf.logging.info("reward: %s", sum_reward)
Example #4
    def __init__(self, world, id, json_data):
        self.tf_scope = 'agent'
        self.graph = tf.Graph()
        self.sess = tf.Session(graph=self.graph)

        super().__init__(world, id, json_data)
        self._build_graph(json_data)
        self._init_normalizers()
        return
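
# A minimal sketch of the per-agent graph/session pattern the constructor above
# relies on: each agent builds its variables inside its own tf.Graph and runs
# them in a matching tf.Session, so agents do not share TensorFlow state. The
# class and variable names here are illustrative, not from the original code.
import tensorflow as tf


class _IsolatedGraphAgent(object):

    def __init__(self):
        self.graph = tf.Graph()
        self.sess = tf.Session(graph=self.graph)
        with self.graph.as_default(), tf.variable_scope('agent'):
            self.step = tf.Variable(0, trainable=False, name='step')
            init_op = tf.variables_initializer([self.step])
        self.sess.run(init_op)
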
Example #5
def train(config, env_processes):
    """Training and evaluation entry point yielding scores.

  Resolves some configuration attributes, creates environments, graph, and
  training loop. By default, assigns all operations to the CPU.

  Args:
    config: Object providing configurations via attributes.
    env_processes: Whether to step environments in separate processes.

  Yields:
    Evaluation scores.
  """
    tf.reset_default_graph()
    with config.unlocked:
        config.network = functools.partial(utility.define_network,
                                           config.network, config)
        config.policy_optimizer = getattr(tf.train, config.policy_optimizer)
        config.value_optimizer = getattr(tf.train, config.value_optimizer)
    if config.update_every % config.num_agents:
        tf.logging.warn(
            'Number of agents should divide episodes per update.')
    with tf.device('/cpu:0'):
        batch_env = utility.define_batch_env(
            lambda: _create_environment(config), config.num_agents,
            env_processes)
        graph = utility.define_simulation_graph(batch_env, config.algorithm,
                                                config)
        loop = _define_loop(graph, config.logdir,
                            config.update_every * config.max_length,
                            config.eval_episodes * config.max_length)
        total_steps = int(config.steps / config.update_every *
                          (config.update_every + config.eval_episodes))
    # Exclude episode related variables since the Python state of environments is
    # not checkpointed and thus new episodes start after resuming.
    saver = utility.define_saver(exclude=(r'.*_temporary/.*', ))
    sess_config = tf.ConfigProto(allow_soft_placement=True)
    sess_config.gpu_options.allow_growth = True
    with tf.Session(config=sess_config) as sess:
        utility.initialize_variables(sess, saver, config.logdir)
        for score in loop.run(sess, saver, total_steps):
            yield score
    batch_env.close()
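
# A hedged usage sketch: train() is a generator that yields an evaluation score
# after each evaluation phase, so a caller iterates over it and decides what to
# log or when to stop. The wrapper name below is an assumption for illustration.
def _run_training(logdir, env_processes=True):
    config = utility.load_config(logdir)
    for score in train(config, env_processes):
        tf.logging.info('Evaluation score %f.', score)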