Example #1
def GetCheckpointer(self, path_, max_to_keep_):
    checkpointer = Checkpointer(path_,
                                global_step=self._ckpt.step,
                                tf_agent=self._agent,
                                max_to_keep=max_to_keep_)
    checkpointer.initialize_or_restore()
    return checkpointer
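The helper above only restores state; a minimal sketch, assuming a training loop in the same class (`checkpoint_path`, `experience`, `num_iterations`, and `save_every` are hypothetical names, not part of the original project), of pairing it with periodic saves:

# Hedged usage sketch; checkpoint_path, experience, num_iterations and
# save_every are hypothetical names. Checkpointer.save(global_step) writes a
# new checkpoint and prunes older ones beyond max_to_keep.
checkpointer = self.GetCheckpointer(checkpoint_path, max_to_keep_=3)
for _ in range(num_iterations):
    self._agent.train(experience)
    if int(self._ckpt.step) % save_every == 0:
        checkpointer.save(global_step=self._ckpt.step)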
Example #2
def save_model():

  optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=1e-3)
  obs_spec = TensorSpec((7,), dtype=tf.float32, name='observation')
  action_spec = BoundedTensorSpec((1,), dtype=tf.int32, minimum=0, maximum=3, name='action')
  actor_net = ActorDistributionRnnNetwork(obs_spec, action_spec, lstm_size=(100, 100))
  value_net = ValueRnnNetwork(obs_spec)
  agent = ppo_agent.PPOAgent(
    time_step_spec=time_step_spec(obs_spec),
    action_spec=action_spec,
    optimizer=optimizer,
    actor_net=actor_net,
    value_net=value_net,
    normalize_observations=True,
    normalize_rewards=True,
    use_gae=True,
    num_epochs=1,
  )
  checkpointer = Checkpointer(
    ckpt_dir='checkpoints/policy',
    max_to_keep=1,
    agent=agent,
    policy=agent.policy,
    global_step=tf.compat.v1.train.get_or_create_global_step())
  checkpointer.initialize_or_restore()
  saver = policy_saver.PolicySaver(agent.policy)
  saver.save('final_policy')
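Not shown above: the directory written by PolicySaver can later be reloaded as a SavedModel and queried directly. A minimal sketch; the batched `time_step` fed to the policy is assumed to come from an evaluation environment that is not part of the example:

# Minimal reload sketch; `time_step` is assumed to come from an evaluation
# environment not shown in the example above.
import tensorflow as tf

saved_policy = tf.saved_model.load('final_policy')
policy_state = saved_policy.get_initial_state(1)  # the RNN policy carries state
action_step = saved_policy.action(time_step, policy_state)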
Example #3
def GetCheckpointer(self):
    checkpointer = Checkpointer(
        self._params["ML"]["BehaviorTFAAgents"]["CheckpointPath", "", ""],
        global_step=self._ckpt.step,
        tf_agent=self._agent,
        max_to_keep=self._params["ML"]["BehaviorTFAAgents"][
            "NumCheckpointsToKeep", "", 3])
    checkpointer.initialize_or_restore()
    return checkpointer
Example #4
def get_checkpointer(self):
    """Checkpointer handling the saving and loading of agents

    Keyword Arguments:
        log_path {string} -- path to the checkpoints (default: {"/"})

    Returns:
        Checkpointer -- tf-checkpoint handler
    """
    checkpointer = Checkpointer(
        self._params["ML"]["Agent"]["checkpoint_path"],
        global_step=self._ckpt.step,
        tf_agent=self._agent,
        max_to_keep=self._params["ML"]["Agent"]["max_ckpts_to_keep"])
    checkpointer.initialize_or_restore()
    return checkpointer
Example #5
# NOTE: this excerpt begins mid-call; the opening lines below are an assumed
# reconstruction, and `env`, `q_net`, and `optimizer` are hypothetical names
# that do not appear in the original snippet.
agent = categorical_dqn_agent.CategoricalDqnAgent(
    env.time_step_spec(),
    env.action_spec(),
    categorical_q_network=q_net,
    optimizer=optimizer,
    min_q_value=min_q,
    max_q_value=max_q,
    epsilon_greedy=lambda: decay_epsilon_greedy(train_step),
    n_step_update=n_steps,
    target_categorical_q_network=target_q_net,
    target_update_tau=tau,
    target_update_period=1,
    td_errors_loss_fn=loss,
    gamma=gamma,
    train_step_counter=train_step)
agent.initialize()

# 3. Restoring agent's training progress (Checkpoint)...
checkpoint_dir = 'checkpoint/'
train_checkpointer = Checkpointer(ckpt_dir=checkpoint_dir,
                                  max_to_keep=1,
                                  agent=agent,
                                  policy=agent.policy)
train_checkpointer.initialize_or_restore()


# 8. Evaluating the agent.
def evaluate(env, policy, num_episodes):
    total_return = 0.0

    for _ in range(num_episodes):
        time_step = env.reset()
        episode_return = 0.0
        while not time_step.is_last():
            action_step = policy.action(time_step)
            time_step = env.step(action_step.action)
            episode_return += time_step.reward
        total_return += episode_return

    # Assumed completion of the truncated excerpt: average the episode returns.
    return total_return / num_episodes
Example #6
def main(_):
    # Environment
    env_name = "Breakout-v4"
    train_num_parallel_environments = 5
    max_steps_per_episode = 1000
    # Replay buffer
    replay_buffer_capacity = 50000
    init_replay_buffer = 500
    # Driver
    collect_steps_per_iteration = 1 * train_num_parallel_environments
    # Training
    train_batch_size = 32
    train_iterations = 100000
    train_summary_interval = 200
    train_checkpoint_interval = 200
    # Evaluation
    eval_num_parallel_environments = 5
    eval_summary_interval = 500
    eval_num_episodes = 20
    # File paths
    path = pathlib.Path(__file__)
    parent_dir = path.parent.resolve()
    folder_name = path.stem + time.strftime("_%Y%m%d_%H%M%S")
    train_checkpoint_dir = str(parent_dir / folder_name / "train_checkpoint")
    train_summary_dir = str(parent_dir / folder_name / "train_summary")
    eval_summary_dir = str(parent_dir / folder_name / "eval_summary")

    # Parallel training environment
    tf_env = TFPyEnvironment(
        ParallelPyEnvironment([
            lambda: suite_atari.load(
                env_name,
                env_wrappers=
                [lambda env: TimeLimit(env, duration=max_steps_per_episode)],
                gym_env_wrappers=[AtariPreprocessing, FrameStack4],
            )
        ] * train_num_parallel_environments))
    tf_env.seed([42] * tf_env.batch_size)
    tf_env.reset()

    # Parallel evaluation environment
    eval_tf_env = TFPyEnvironment(
        ParallelPyEnvironment([
            lambda: suite_atari.load(
                env_name,
                env_wrappers=
                [lambda env: TimeLimit(env, duration=max_steps_per_episode)],
                gym_env_wrappers=[AtariPreprocessing, FrameStack4],
            )
        ] * eval_num_parallel_environments))
    eval_tf_env.seed([42] * eval_tf_env.batch_size)
    eval_tf_env.reset()

    # Creating the Deep Q-Network
    preprocessing_layer = keras.layers.Lambda(
        lambda obs: tf.cast(obs, np.float32) / 255.)

    conv_layer_params = [(32, (8, 8), 4), (64, (4, 4), 2), (64, (3, 3), 1)]
    fc_layer_params = [512]

    q_net = QNetwork(tf_env.observation_spec(),
                     tf_env.action_spec(),
                     preprocessing_layers=preprocessing_layer,
                     conv_layer_params=conv_layer_params,
                     fc_layer_params=fc_layer_params)

    # Creating the DQN Agent
    optimizer = keras.optimizers.RMSprop(learning_rate=2.5e-4,
                                         rho=0.95,
                                         momentum=0.0,
                                         epsilon=0.00001,
                                         centered=True)

    epsilon_fn = keras.optimizers.schedules.PolynomialDecay(
        initial_learning_rate=1.0,  # initial ε
        decay_steps=2500000,
        end_learning_rate=0.01)  # final ε

    global_step = tf.compat.v1.train.get_or_create_global_step()

    agent = DqnAgent(
        tf_env.time_step_spec(),
        tf_env.action_spec(),
        q_network=q_net,
        optimizer=optimizer,
        target_update_period=200,
        td_errors_loss_fn=keras.losses.Huber(reduction="none"),
        gamma=0.99,  # discount factor
        train_step_counter=global_step,
        epsilon_greedy=lambda: epsilon_fn(global_step))
    agent.initialize()

    # Creating the Replay Buffer
    replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
        data_spec=agent.collect_data_spec,
        batch_size=tf_env.batch_size,
        max_length=replay_buffer_capacity)

    # Observer: Replay Buffer Observer
    replay_buffer_observer = replay_buffer.add_batch

    # Observer: Training Metrics
    train_metrics = [
        tf_metrics.NumberOfEpisodes(),
        tf_metrics.EnvironmentSteps(),
        tf_metrics.AverageReturnMetric(batch_size=tf_env.batch_size),
        tf_metrics.AverageEpisodeLengthMetric(batch_size=tf_env.batch_size),
    ]

    # Creating the Collect Driver
    collect_driver = DynamicStepDriver(tf_env,
                                       agent.collect_policy,
                                       observers=[replay_buffer_observer] +
                                       train_metrics,
                                       num_steps=collect_steps_per_iteration)

    # Initialize replay buffer
    initial_collect_policy = RandomTFPolicy(tf_env.time_step_spec(),
                                            tf_env.action_spec())
    init_driver = DynamicStepDriver(
        tf_env,
        initial_collect_policy,
        observers=[replay_buffer_observer,
                   ShowProgress()],
        num_steps=init_replay_buffer)
    final_time_step, final_policy_state = init_driver.run()

    # Creating the Dataset
    dataset = replay_buffer.as_dataset(sample_batch_size=train_batch_size,
                                       num_steps=2,
                                       num_parallel_calls=3).prefetch(3)

    # Optimize by wrapping some of the code in a graph using TF function.
    collect_driver.run = function(collect_driver.run)
    agent.train = function(agent.train)

    print("\n\n++++++++++++++++++++++++++++++++++\n")

    # Create checkpoint
    train_checkpointer = Checkpointer(
        ckpt_dir=train_checkpoint_dir,
        max_to_keep=1,
        agent=agent,
        # replay_buffer=replay_buffer,
        global_step=global_step,
        # metrics=metric_utils.MetricsGroup(train_metrics, 'train_metrics')
    )

    # Restore checkpoint
    # train_checkpointer.initialize_or_restore()

    # Summary writers and metrics
    train_summary_writer = tf.summary.create_file_writer(train_summary_dir)
    eval_summary_writer = tf.summary.create_file_writer(eval_summary_dir)
    eval_metrics = [
        tf_metrics.NumberOfEpisodes(),
        tf_metrics.EnvironmentSteps(),
        tf_metrics.AverageReturnMetric(batch_size=eval_tf_env.batch_size,
                                       buffer_size=eval_num_episodes),
        tf_metrics.AverageEpisodeLengthMetric(
            batch_size=eval_tf_env.batch_size, buffer_size=eval_num_episodes)
    ]

    # Create evaluate callback function
    eval_callback = evaluate(eval_metrics=eval_metrics,
                             eval_tf_env=eval_tf_env,
                             eval_policy=agent.policy,
                             eval_num_episodes=eval_num_episodes,
                             train_step=global_step,
                             eval_summary_writer=eval_summary_writer)

    # Train agent
    train_agent(tf_env=tf_env,
                train_iterations=train_iterations,
                global_step=global_step,
                agent=agent,
                dataset=dataset,
                collect_driver=collect_driver,
                train_metrics=train_metrics,
                train_checkpointer=train_checkpointer,
                train_checkpoint_interval=train_checkpoint_interval,
                train_summary_writer=train_summary_writer,
                train_summary_interval=train_summary_interval,
                eval_summary_interval=eval_summary_interval,
                eval_callback=eval_callback)

    print("\n\n++++++++++ END OF TF_AGENTS RL TRAINING ++++++++++\n\n")
Example #7
#     num_steps=update_period,
# )
# final_time_step, final_policy_state = init_driver.run()

# %% Dataset
dataset = replay_buffer.as_dataset(sample_batch_size=SAMPLE_BATCH_SIZE,
                                   num_steps=2,
                                   num_parallel_calls=PARALLEL_STEPS)

# %% Checkpoint
from tf_agents.utils.common import Checkpointer

train_checkpointer = Checkpointer(
    ckpt_dir=CHECKPOINT_DIR,
    max_to_keep=4,
    agent=agent,
    policy=agent.policy,
    replay_buffer=replay_buffer,
    global_step=train_step,
)

train_checkpointer.initialize_or_restore()

# %% Policy saver
tf_policy_saver = policies.policy_saver.PolicySaver(agent.policy)


def save_policy():
    tf_policy_saver.save(POLICY_SAVE_DIR)


# %% Iterate
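As a companion to `save_policy()` above, a minimal sketch of reloading the exported policy and running one greedy episode; `eval_env` is an assumed batch-size-1 TFPyEnvironment that is not defined in the snippet:

# Minimal reload sketch; `eval_env` is a hypothetical evaluation environment,
# and POLICY_SAVE_DIR is the directory written by save_policy() above.
import tensorflow as tf


def run_one_episode(eval_env):
    policy = tf.saved_model.load(POLICY_SAVE_DIR)
    time_step = eval_env.reset()
    policy_state = policy.get_initial_state(eval_env.batch_size)
    while not time_step.is_last():
        action_step = policy.action(time_step, policy_state)
        policy_state = action_step.state
        time_step = eval_env.step(action_step.action)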