def GetCheckpointer(self, path_, max_to_keep_):
    checkpointer = Checkpointer(path_,
                                global_step=self._ckpt.step,
                                tf_agent=self._agent,
                                max_to_keep=max_to_keep_)
    checkpointer.initialize_or_restore()
    return checkpointer
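# A minimal, self-contained sketch of the mechanism the getters in this
# section rely on: Checkpointer forwards its keyword arguments to
# tf.train.Checkpoint, so any trackable object (an agent, a policy, a step
# counter) can be registered. Only a step counter is tracked here for
# brevity; the directory name is illustrative.
import tensorflow as tf
from tf_agents.utils.common import Checkpointer

global_step = tf.compat.v1.train.get_or_create_global_step()
demo_checkpointer = Checkpointer('checkpoints/demo',
                                 max_to_keep=3,
                                 global_step=global_step)
demo_checkpointer.initialize_or_restore()  # no-op on the very first run
global_step.assign_add(1)
demo_checkpointer.save(global_step)  # writes a checkpoint under checkpoints/demo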
import tensorflow as tf
from tf_agents.agents.ppo import ppo_agent
from tf_agents.networks.actor_distribution_rnn_network import ActorDistributionRnnNetwork
from tf_agents.networks.value_rnn_network import ValueRnnNetwork
from tf_agents.policies import policy_saver
from tf_agents.specs import BoundedTensorSpec, TensorSpec
from tf_agents.trajectories.time_step import time_step_spec
from tf_agents.utils.common import Checkpointer

def save_model():
    optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=1e-3)
    obs_spec = TensorSpec((7,), dtype=tf.float32, name='observation')
    action_spec = BoundedTensorSpec((1,), dtype=tf.int32,
                                    minimum=0, maximum=3, name='action')
    actor_net = ActorDistributionRnnNetwork(obs_spec, action_spec,
                                            lstm_size=(100, 100))
    value_net = ValueRnnNetwork(obs_spec)
    agent = ppo_agent.PPOAgent(
        time_step_spec=time_step_spec(obs_spec),
        action_spec=action_spec,
        optimizer=optimizer,
        actor_net=actor_net,
        value_net=value_net,
        normalize_observations=True,
        normalize_rewards=True,
        use_gae=True,
        num_epochs=1,
    )
    checkpointer = Checkpointer(
        ckpt_dir='checkpoints/policy',
        max_to_keep=1,
        agent=agent,
        policy=agent.policy,
        global_step=tf.compat.v1.train.get_or_create_global_step())
    checkpointer.initialize_or_restore()
    saver = policy_saver.PolicySaver(agent.policy)
    saver.save('final_policy')
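# A hedged sketch of loading the policy exported by save_model() above.
# 'final_policy' is the export directory used there; the dummy time step is
# an assumption that merely matches the (7,)-float observation spec, and an
# initial policy state is needed because the actor network is recurrent.
import tensorflow as tf
from tf_agents.trajectories import time_step as ts

loaded_policy = tf.saved_model.load('final_policy')
policy_state = loaded_policy.get_initial_state(batch_size=1)
first_step = ts.restart(tf.zeros([1, 7], tf.float32), batch_size=1)
action_step = loaded_policy.action(first_step, policy_state)
print(action_step.action)  # int32 action in [0, 3]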
def GetCheckpointer(self):
    checkpointer = Checkpointer(
        self._params["ML"]["BehaviorTFAAgents"]["CheckpointPath", "", ""],
        global_step=self._ckpt.step,
        tf_agent=self._agent,
        max_to_keep=self._params["ML"]["BehaviorTFAAgents"][
            "NumCheckpointsToKeep", "", 3])
    checkpointer.initialize_or_restore()
    return checkpointer
def get_checkpointer(self):
    """Checkpointer handling the saving and loading of agents.

    The checkpoint path and the number of checkpoints to keep are read
    from self._params.

    Returns:
        Checkpointer -- tf-checkpoint handler
    """
    checkpointer = Checkpointer(
        self._params["ML"]["Agent"]["checkpoint_path"],
        global_step=self._ckpt.step,
        tf_agent=self._agent,
        max_to_keep=self._params["ML"]["Agent"]["max_ckpts_to_keep"])
    checkpointer.initialize_or_restore()
    return checkpointer
# (Head of the agent constructor truncated in the source; the keyword
# arguments below match tf_agents' categorical_dqn_agent.CategoricalDqnAgent.)
    min_q_value=min_q,
    max_q_value=max_q,
    epsilon_greedy=lambda: decay_epsilon_greedy(train_step),
    n_step_update=n_steps,
    target_categorical_q_network=target_q_net,
    target_update_tau=tau,
    target_update_period=1,
    td_errors_loss_fn=loss,
    gamma=gamma,
    train_step_counter=train_step)
agent.initialize()

# 3. Restoring agent's training progress (Checkpoint)...
checkpoint_dir = 'checkpoint/'
train_checkpointer = Checkpointer(ckpt_dir=checkpoint_dir,
                                  max_to_keep=1,
                                  agent=agent,
                                  policy=agent.policy)
train_checkpointer.initialize_or_restore()

# 8. Evaluating the agent.
def evaluate(env, policy, num_episodes):
    total_return = 0.0
    for _ in range(num_episodes):
        time_step = env.reset()
        episode_return = 0.0
        while not time_step.is_last():
            action_step = policy.action(time_step)
            time_step = env.step(action_step.action)
            episode_return += time_step.reward
        total_return += episode_return
    return total_return / num_episodes
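# Example use of the evaluate() helper above, assuming the `env` created
# earlier in the same script:
avg_return = evaluate(env, agent.policy, num_episodes=10)
print('Average return over 10 episodes:', avg_return)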
def main(_):
    # Environment
    env_name = "Breakout-v4"
    train_num_parallel_environments = 5
    max_steps_per_episode = 1000
    # Replay buffer
    replay_buffer_capacity = 50000
    init_replay_buffer = 500
    # Driver
    collect_steps_per_iteration = 1 * train_num_parallel_environments
    # Training
    train_batch_size = 32
    train_iterations = 100000
    train_summary_interval = 200
    train_checkpoint_interval = 200
    # Evaluation
    eval_num_parallel_environments = 5
    eval_summary_interval = 500
    eval_num_episodes = 20
    # File paths
    path = pathlib.Path(__file__)
    parent_dir = path.parent.resolve()
    folder_name = path.stem + time.strftime("_%Y%m%d_%H%M%S")
    train_checkpoint_dir = str(parent_dir / folder_name / "train_checkpoint")
    train_summary_dir = str(parent_dir / folder_name / "train_summary")
    eval_summary_dir = str(parent_dir / folder_name / "eval_summary")

    # Parallel training environment
    tf_env = TFPyEnvironment(
        ParallelPyEnvironment([
            lambda: suite_atari.load(
                env_name,
                env_wrappers=[
                    lambda env: TimeLimit(env, duration=max_steps_per_episode)
                ],
                gym_env_wrappers=[AtariPreprocessing, FrameStack4],
            )
        ] * train_num_parallel_environments))
    tf_env.seed([42] * tf_env.batch_size)
    tf_env.reset()

    # Parallel evaluation environment
    eval_tf_env = TFPyEnvironment(
        ParallelPyEnvironment([
            lambda: suite_atari.load(
                env_name,
                env_wrappers=[
                    lambda env: TimeLimit(env, duration=max_steps_per_episode)
                ],
                gym_env_wrappers=[AtariPreprocessing, FrameStack4],
            )
        ] * eval_num_parallel_environments))
    eval_tf_env.seed([42] * eval_tf_env.batch_size)
    eval_tf_env.reset()

    # Creating the Deep Q-Network
    preprocessing_layer = keras.layers.Lambda(
        lambda obs: tf.cast(obs, np.float32) / 255.)
    conv_layer_params = [(32, (8, 8), 4), (64, (4, 4), 2), (64, (3, 3), 1)]
    fc_layer_params = [512]
    q_net = QNetwork(tf_env.observation_spec(),
                     tf_env.action_spec(),
                     preprocessing_layers=preprocessing_layer,
                     conv_layer_params=conv_layer_params,
                     fc_layer_params=fc_layer_params)

    # Creating the DQN Agent
    optimizer = keras.optimizers.RMSprop(learning_rate=2.5e-4,
                                         rho=0.95,
                                         momentum=0.0,
                                         epsilon=0.00001,
                                         centered=True)
    epsilon_fn = keras.optimizers.schedules.PolynomialDecay(
        initial_learning_rate=1.0,  # initial ε
        decay_steps=2500000,
        end_learning_rate=0.01)  # final ε
    global_step = tf.compat.v1.train.get_or_create_global_step()
    agent = DqnAgent(tf_env.time_step_spec(),
                     tf_env.action_spec(),
                     q_network=q_net,
                     optimizer=optimizer,
                     target_update_period=200,
                     td_errors_loss_fn=keras.losses.Huber(reduction="none"),
                     gamma=0.99,  # discount factor
                     train_step_counter=global_step,
                     epsilon_greedy=lambda: epsilon_fn(global_step))
    agent.initialize()

    # Creating the Replay Buffer
    replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
        data_spec=agent.collect_data_spec,
        batch_size=tf_env.batch_size,
        max_length=replay_buffer_capacity)

    # Observer: Replay Buffer Observer
    replay_buffer_observer = replay_buffer.add_batch

    # Observer: Training Metrics
    train_metrics = [
        tf_metrics.NumberOfEpisodes(),
        tf_metrics.EnvironmentSteps(),
        tf_metrics.AverageReturnMetric(batch_size=tf_env.batch_size),
        tf_metrics.AverageEpisodeLengthMetric(batch_size=tf_env.batch_size),
    ]

    # Creating the Collect Driver
    collect_driver = DynamicStepDriver(
        tf_env,
        agent.collect_policy,
        observers=[replay_buffer_observer] + train_metrics,
        num_steps=collect_steps_per_iteration)

    # Initialize replay buffer
    initial_collect_policy = RandomTFPolicy(tf_env.time_step_spec(),
                                            tf_env.action_spec())
    init_driver = DynamicStepDriver(
        tf_env,
        initial_collect_policy,
        observers=[replay_buffer_observer, ShowProgress()],
        num_steps=init_replay_buffer)
    final_time_step, final_policy_state = init_driver.run()

    # Creating the Dataset
    dataset = replay_buffer.as_dataset(sample_batch_size=train_batch_size,
                                       num_steps=2,
                                       num_parallel_calls=3).prefetch(3)

    # Optimize by wrapping some of the code in a graph using TF function.
    collect_driver.run = function(collect_driver.run)
    agent.train = function(agent.train)

    print("\n\n++++++++++++++++++++++++++++++++++\n")

    # Create checkpoint
    train_checkpointer = Checkpointer(
        ckpt_dir=train_checkpoint_dir,
        max_to_keep=1,
        agent=agent,
        # replay_buffer=replay_buffer,
        global_step=global_step,
        # metrics=metric_utils.MetricsGroup(train_metrics, 'train_metrics')
    )
    # Restore checkpoint
    # train_checkpointer.initialize_or_restore()

    # Summary writers and metrics
    train_summary_writer = tf.summary.create_file_writer(train_summary_dir)
    eval_summary_writer = tf.summary.create_file_writer(eval_summary_dir)
    eval_metrics = [
        tf_metrics.NumberOfEpisodes(),
        tf_metrics.EnvironmentSteps(),
        tf_metrics.AverageReturnMetric(batch_size=eval_tf_env.batch_size,
                                       buffer_size=eval_num_episodes),
        tf_metrics.AverageEpisodeLengthMetric(
            batch_size=eval_tf_env.batch_size,
            buffer_size=eval_num_episodes)
    ]

    # Create evaluate callback function
    eval_callback = evaluate(eval_metrics=eval_metrics,
                             eval_tf_env=eval_tf_env,
                             eval_policy=agent.policy,
                             eval_num_episodes=eval_num_episodes,
                             train_step=global_step,
                             eval_summary_writer=eval_summary_writer)

    # Train agent
    train_agent(tf_env=tf_env,
                train_iterations=train_iterations,
                global_step=global_step,
                agent=agent,
                dataset=dataset,
                collect_driver=collect_driver,
                train_metrics=train_metrics,
                train_checkpointer=train_checkpointer,
                train_checkpoint_interval=train_checkpoint_interval,
                train_summary_writer=train_summary_writer,
                train_summary_interval=train_summary_interval,
                eval_summary_interval=eval_summary_interval,
                eval_callback=eval_callback)

    print("\n\n++++++++++ END OF TF_AGENTS RL TRAINING ++++++++++\n\n")
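# The train_agent() helper called above is not shown in this snippet. A
# minimal sketch of what such a loop commonly looks like in TF-Agents code
# follows; the name, signature, and loop body are assumptions, not the
# original implementation.
def train_agent(tf_env, train_iterations, global_step, agent, dataset,
                collect_driver, train_metrics, train_checkpointer,
                train_checkpoint_interval, train_summary_writer,
                train_summary_interval, eval_summary_interval, eval_callback):
    iterator = iter(dataset)
    time_step = None
    policy_state = agent.collect_policy.get_initial_state(tf_env.batch_size)
    with train_summary_writer.as_default():
        for _ in range(train_iterations):
            # Collect transitions, then sample a batch and train on it.
            time_step, policy_state = collect_driver.run(time_step,
                                                         policy_state)
            trajectories, _ = next(iterator)
            train_loss = agent.train(trajectories)
            step = global_step.numpy()
            if step % train_summary_interval == 0:
                tf.summary.scalar("loss", train_loss.loss, step=step)
                for metric in train_metrics:
                    tf.summary.scalar(metric.name, metric.result(), step=step)
            if step % train_checkpoint_interval == 0:
                train_checkpointer.save(global_step)
            if step % eval_summary_interval == 0:
                eval_callback()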
#     num_steps=update_period,
# )
# final_time_step, final_policy_state = init_driver.run()

# %% Dataset
dataset = replay_buffer.as_dataset(sample_batch_size=SAMPLE_BATCH_SIZE,
                                   num_steps=2,
                                   num_parallel_calls=PARALLEL_STEPS)

# %% Checkpoint
from tf_agents.utils.common import Checkpointer

train_checkpointer = Checkpointer(
    ckpt_dir=CHECKPOINT_DIR,
    max_to_keep=4,
    agent=agent,
    policy=agent.policy,
    replay_buffer=replay_buffer,
    global_step=train_step,
)
train_checkpointer.initialize_or_restore()

# %% Policy saver
tf_policy_saver = policies.policy_saver.PolicySaver(agent.policy)

def save_policy():
    tf_policy_saver.save(POLICY_SAVE_DIR)

# %% Iterate
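# A hedged sketch of the iteration cell the "# %% Iterate" header introduces,
# assuming a collect driver and constants (N_ITERATIONS, CHECKPOINT_INTERVAL)
# defined elsewhere in the script; the loop body follows the usual TF-Agents
# collect/train pattern rather than the original code.
iterator = iter(dataset)
for _ in range(N_ITERATIONS):
    collect_driver.run()
    trajectories, _ = next(iterator)
    agent.train(trajectories)
    if train_step.numpy() % CHECKPOINT_INTERVAL == 0:
        train_checkpointer.save(train_step)
        save_policy()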