def train_dyke_agent(train_env: TFPyEnvironment,
                     eval_env: TFPyEnvironment,
                     agent: DqnAgent,
                     train_steps: int,
                     steps_per_episode: int,
                     eval_episodes: int) -> Dict[str, Any]:
    """
    Trains the DQN agent on the dyke maintenance task.

    :param train_env: The training environment.
    :param eval_env: The environment for testing agent performance.
    :param agent: The agent.
    :param train_steps: The number of training steps to use.
    :param steps_per_episode: The number of time steps that can be taken in a single dyke environment episode.
    :param eval_episodes: The number of episodes to use per evaluation.
    :return: A mapping to various metrics pertaining to the training's results.
    """
    losses: np.ndarray = np.zeros(shape=(train_steps, steps_per_episode))
    evaluations: np.ndarray = np.zeros(shape=(train_steps, eval_episodes))
    train_metrics: Tuple = (AverageReturnMetric,)
    train_metric_results: np.ndarray = np.zeros(shape=(len(train_metrics), train_steps, steps_per_episode))
    for step in range(train_steps):
        # we uniformly sample experiences (single time steps) from one episode per train step
        print('STEP %d/%d' % (step + 1, train_steps))
        train_env.reset()
        rep_buf = _dyke_replay_buffer(train_env, agent, steps_per_episode)
        train_metric_inst: Tuple = tuple([metric() for metric in train_metrics])  # instantiate the metrics
        obs: Tuple = (rep_buf.add_batch,) + train_metric_inst
        _ = DynamicStepDriver(
            env=train_env,
            policy=agent.collect_policy,
            observers=obs,
            num_steps=steps_per_episode).run()  # experience a single episode using the agent's current configuration
        dataset: tf.data.Dataset = rep_buf.as_dataset(
            sample_batch_size=_REP_BUF_BATCH_SIZE,
            num_steps=_REP_BUF_NUM_STEPS)
        iterator = iter(dataset)
        for tr in range(steps_per_episode):
            trajectories, _ = next(iterator)
            losses[step, tr] = agent.train(experience=trajectories).loss
            for met in range(len(train_metrics)):
                train_metric_results[met, step, tr] = train_metric_inst[met].result().numpy()
        evaluations[step, :] = _evaluate_dyke_agent(eval_env, agent, eval_episodes)
    return {
        'loss': losses,
        'eval': evaluations,
        'train-metrics': train_metric_results
    }

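# The helper `_dyke_replay_buffer` and the constants `_REP_BUF_BATCH_SIZE` and
# `_REP_BUF_NUM_STEPS` are referenced above but not defined in this snippet.
# Below is a minimal sketch of what such a helper could look like, assuming a
# standard TFUniformReplayBuffer built from the agent's collect data spec; the
# names and values are illustrative assumptions, not the original implementation.
from tf_agents.replay_buffers import tf_uniform_replay_buffer

_REP_BUF_BATCH_SIZE = 64   # assumed sampling batch size
_REP_BUF_NUM_STEPS = 2     # assumed trajectory length per sample (DQN trains on adjacent step pairs)


def _dyke_replay_buffer(train_env: TFPyEnvironment,
                        agent: DqnAgent,
                        steps_per_episode: int) -> tf_uniform_replay_buffer.TFUniformReplayBuffer:
    # One episode of experience is collected per train step above, so an
    # episode-sized buffer is sufficient for this sketch.
    return tf_uniform_replay_buffer.TFUniformReplayBuffer(
        data_spec=agent.collect_data_spec,
        batch_size=train_env.batch_size,
        max_length=steps_per_episode)
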
def save_environment_agent_video(
        filename: str,
        agent: tf_agent.TFAgent,
        tf_env: TFPyEnvironment,
        py_env: TimeLimit,
        num_episodes: int = 1,
) -> None:
    """
    Save a video of an agent acting in the environment.

    Render method needs to be available in the python version of the environment.

    TODO:
        - how to prevent opening a window when saving a video?
        - sometimes nothing is saved?
        - gym wrappers monitoring VideoRecorder

    :param filename: A valid path to which a file with the video will be saved.
    :param agent: An agent whose policy will be evaluated.
    :param tf_env: A TensorFlow environment used for interaction with the agent.
    :param py_env: A Python OpenAI Gym environment used for rendering the video.
        Environment has to provide `render` method.
    :param num_episodes: A number of episodes to evaluate.
    :return: A video is saved to filename.
    """
    with imageio.get_writer(filename, fps=60) as video:
        for _ in range(num_episodes):
            time_step = tf_env.reset()
            video.append_data(py_env.render())
            while not time_step.is_last():
                action_step = agent.policy.action(time_step)
                time_step = tf_env.step(action_step.action)
                video.append_data(py_env.render())
    py_env.close()

def compute_total_reward(env: TFPyEnvironment, policy):
    total_reward = 0.0
    time_step = env.reset()
    while not time_step.is_last():
        policy_step = policy.action(time_step)
        time_step = env.step(policy_step.action)
        total_reward += time_step.reward
    return total_reward.numpy()[0]

def step_episode(
        environment: TFPyEnvironment,
        policy: tf_policy.TFPolicy,
        replay_buffer: ReplayBuffer
) -> typing.Tuple[int, int]:
    done = False
    environment.reset()
    curr_episode_rewards = []
    episode_reward = 0
    episode_length = 0
    while not done:
        reward, done = step(environment, policy, replay_buffer)
        curr_episode_rewards.append(reward)
        episode_length += 1
        if done:
            episode_reward = sum(curr_episode_rewards)
    return episode_reward, episode_length

def _evaluate_dyke_agent(env: TFPyEnvironment,
                         agent: DqnAgent,
                         num_episodes: int = 10) -> np.ndarray:
    returns: np.ndarray = np.zeros(shape=(num_episodes,))
    for ep in range(num_episodes):
        time_step: TimeStep = env.reset()
        episode_return: float = 0.0
        while not time_step.is_last():
            action_step = agent.policy.action(time_step)
            time_step = env.step(action_step.action)
            episode_return += time_step.reward
        returns[ep] = episode_return
    return returns

def test_ppo(self):
    env_class = PolicyUnittestEnv
    learning_rate = 1e-1
    iterations = 20
    batch_size = 100
    steps_per_episode = 13
    env = env_class(batch_size, steps_per_episode)
    env = TFPyEnvironment(env)
    eval_env = env_class(batch_size, steps_per_episode)
    eval_env = TFPyEnvironment(eval_env)

    algorithm = create_algorithm(env, learning_rate=learning_rate)
    driver = SyncOffPolicyDriver(env,
                                 algorithm,
                                 debug_summaries=DEBUGGING,
                                 summarize_grads_and_vars=DEBUGGING)
    replayer = driver.exp_replayer
    eval_driver = OnPolicyDriver(eval_env,
                                 algorithm,
                                 training=False,
                                 greedy_predict=True)

    env.reset()
    eval_env.reset()
    time_step = driver.get_initial_time_step()
    policy_state = driver.get_initial_policy_state()
    for i in range(iterations):
        time_step, policy_state = driver.run(
            max_num_steps=batch_size * steps_per_episode,
            time_step=time_step,
            policy_state=policy_state)

        experience = replayer.replay_all()
        driver.train(experience, num_updates=4, mini_batch_size=25)
        replayer.clear()
        eval_env.reset()
        eval_time_step, _ = eval_driver.run(
            max_num_steps=(steps_per_episode - 1) * batch_size)
        logging.info("%d reward=%f", i,
                     float(tf.reduce_mean(eval_time_step.reward)))

    eval_env.reset()
    eval_time_step, _ = eval_driver.run(
        max_num_steps=(steps_per_episode - 1) * batch_size)
    logging.info("reward=%f", float(tf.reduce_mean(eval_time_step.reward)))
    self.assertAlmostEqual(1.0,
                           float(tf.reduce_mean(eval_time_step.reward)),
                           delta=1e-1)

def compute_avg_return(env: tf_py_environment.TFPyEnvironment, policy, num_episodes):
    total_return = 0.0
    for _ in range(num_episodes):
        time_step = env.reset()
        episode_return = 0.0
        while not time_step.is_last():
            action_step = policy.action(time_step)
            time_step = env.step(action_step.action)
            episode_return += time_step.reward
        total_return += episode_return
    avg_return = total_return / num_episodes
    return avg_return.numpy()[0]

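# A minimal usage sketch for the metric helper above: evaluating a random policy on
# CartPole. The environment name and episode count are illustrative only.
from tf_agents.environments import suite_gym, tf_py_environment
from tf_agents.policies.random_tf_policy import RandomTFPolicy

eval_py_env = suite_gym.load('CartPole-v0')
eval_env = tf_py_environment.TFPyEnvironment(eval_py_env)
random_policy = RandomTFPolicy(eval_env.time_step_spec(), eval_env.action_spec())
print(compute_avg_return(eval_env, random_policy, num_episodes=5))
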
def load_model_checkpoint(c):
    """Return the model restored from the latest checkpoint in ``c.model_dir``."""
    dir_name = tf.train.latest_checkpoint(c.model_dir)
    # if ver_name == 'None':
    #     check_or_make_dir(dir_name)
    # else:
    #     dir_name = os.path.join(dir_name, ver_name)
    dummy_env = TFPyEnvironment(StockEnvBasic(**c.default_env))
    time_step = dummy_env.reset()
    temp = ValueNet(**c.model_vars)  # initialize model
    temp(time_step.observation)  # build the network by calling it once
    checkpoint2 = tf.train.Checkpoint(module=temp)
    status = checkpoint2.restore(dir_name)
    return temp, checkpoint2

def create_video(py_environment: PyEnvironment,
                 tf_environment: TFPyEnvironment,
                 policy: tf_policy,
                 num_episodes=10,
                 video_filename='imageio.mp4'):
    print("Generating video %s" % video_filename)
    with imageio.get_writer(video_filename, fps=60) as video:
        for episode in range(num_episodes):
            print("Generating episode %d of %d" % (episode, num_episodes))
            time_step = tf_environment.reset()
            video.append_data(py_environment.render())
            while not time_step.is_last():
                action_step = policy.action(time_step)
                time_step = tf_environment.step(action_step.action)
                video.append_data(py_environment.render())

def compute_average_reward(env: tf_py_environment.TFPyEnvironment,
                           policy: tf_policy.Base,
                           num_episodes=10) -> float:
    total_reward = 0
    for _ in range(num_episodes):
        time_step: ts.TimeStep = env.reset()
        episode_reward = 0
        while not time_step.is_last():
            action_step = policy.action(time_step)
            time_step = env.step(action_step.action)
            episode_reward += time_step.reward
            # print(action_step.action.numpy()[0], end=' ')
            print(time_step.observation.numpy())
        total_reward += episode_reward
    return total_reward / num_episodes

def test_planning_policy_batch_environment_model():
    """
    Ensure that planning policy is operational.
    """

    # number of trajectories for planning and planning horizon
    population_size = 3
    planner_horizon = 5
    number_of_particles = 1

    # setup the environment and a model of it
    py_env = suite_gym.load("MountainCar-v0")
    tf_env = TFPyEnvironment(py_env)
    reward = MountainCarReward(tf_env.observation_spec(), tf_env.action_spec())
    terminates = MountainCarTermination(tf_env.observation_spec())
    network = LinearTransitionNetwork(tf_env.observation_spec())
    transition_model = KerasTransitionModel(
        [network],
        tf_env.observation_spec(),
        tf_env.action_spec(),
    )
    initial_state = MountainCarInitialState(tf_env.observation_spec())
    environment_model = EnvironmentModel(
        transition_model=transition_model,
        reward_model=reward,
        termination_model=terminates,
        initial_state_distribution_model=initial_state,
    )

    # setup the trajectory optimiser
    random_policy = RandomTFPolicy(tf_env.time_step_spec(), tf_env.action_spec())
    trajectory_optimiser = PolicyTrajectoryOptimiser(random_policy,
                                                     planner_horizon,
                                                     population_size,
                                                     number_of_particles)
    planning_policy = PlanningPolicy(environment_model, trajectory_optimiser)

    # test whether it runs
    collect_driver_planning_policy = DynamicEpisodeDriver(tf_env,
                                                          planning_policy,
                                                          num_episodes=1)
    time_step = tf_env.reset()
    collect_driver_planning_policy.run(time_step)

def compute_average_return(env: tf_py_environment.TFPyEnvironment,
                           policy,
                           num_episodes: int = 1) -> float:
    total_return = 0.0
    for _ in range(num_episodes):
        time_step_ = env.reset()
        episode_return = 0.0
        while not any(time_step_.is_last()):
            action_step = policy.action(time_step_)
            time_step_ = env.step(action=action_step.action)
            episode_return += np.mean(time_step_.reward)
        total_return += episode_return
    average_return = total_return / num_episodes
    return average_return

def create_video(py_environment: PyEnvironment,
                 tf_environment: TFPyEnvironment,
                 policy: tf_policy,
                 num_episodes=10,
                 video_filename='imageio.mp4'):
    print("Generating video %s" % video_filename)
    with imageio.get_writer(video_filename, fps=60) as video:
        for episode in range(num_episodes):
            episode_return = 0.0
            time_step = tf_environment.reset()
            video.append_data(py_environment.render())
            while not time_step.is_last():
                action_step = policy.action(time_step)
                time_step = tf_environment.step(action_step.action)
                episode_return += time_step.reward
                video.append_data(py_environment.render())
            print(
                f"Generated episode {episode} of {num_episodes}. Return:{episode_return} "
            )

def evaluate_episode(policy, env_params):
    """Use naive while loop to evaluate policy in single episode."""
    if 'n_monsters' in env_params:
        env = MultiMonsterEnvironment
    elif 'is_jumping' in env_params:
        env = JumpingEnvironment
    else:
        env = LakeMonsterEnvironment
    py_env = env(**env_params)
    tf_env = TFPyEnvironment(py_env)
    ts = tf_env.reset()
    n_steps = 0
    while not ts.is_last():
        action = policy.action(ts)
        ts = tf_env.step(action.action)
        n_steps += 1

    reward = ts.reward.numpy().item()
    return reward, n_steps * py_env.step_size

def create_policy_eval_video(self, env, policy, filename, num_episodes=5, fps=30):
    filename = filename + ".mp4"
    tf_env = TFPyEnvironment(env)
    with imageio.get_writer(filename, fps=fps) as video:
        for _ in range(num_episodes):
            time_step = tf_env.reset()
            tf_env.step(1)
            video.append_data(env.render())
            while not time_step.is_last():
                action_step = policy.action(time_step)
                time_step = tf_env.step(action_step.action)
                video.append_data(env.render())
        video.close()
    return self.embed_mp4(filename)

def create_many_policy_gif(uid, file_path, monster_speed=4.0):
    """Create a gif superimposing the actions of many policies."""
    n_steps = 300  # = timeout_factor / step_size
    step_size = 0.01
    fps = 10
    p_paths = glob.glob(configs.POLICY_DIR + uid + '*')
    all_positions = []
    colors = []

    for p_path in tqdm(p_paths):
        color = (np.random.randint(256), np.random.randint(128), 0)
        policy = tf.saved_model.load(p_path)
        env_params = policy.get_metadata()
        env_params = tf_to_py(env_params)

        # overwriting parameters
        env_params['step_size'] = step_size
        env_params['monster_speed'] = monster_speed
        py_env = LakeMonsterEnvironment(**env_params)
        tf_env = TFPyEnvironment(py_env)

        time_step = tf_env.reset()
        agent_positions = {}
        for step in range(n_steps):
            if not time_step.is_last():
                action = policy.action(time_step)
                time_step = tf_env.step(action.action)
            theta = py_env.total_monster_rotation - py_env.total_agent_rotation
            c, s = np.cos(theta), np.sin(theta)
            rot_matrix = np.array(((c, -s), (s, c)))
            agent_positions[step] = np.dot(rot_matrix, np.array((py_env.r, 0)))
        all_positions.append(agent_positions)
        colors.append(color)

    with imageio.get_writer(file_path, mode='I', fps=fps) as gif:
        for step in range(n_steps):
            positions = [item[step] for item in all_positions]
            im = render_many_agents(positions, colors, step, step_size, 4, monster_speed)
            gif.append_data(np.array(im))
    pygifsicle.optimize(file_path)

def episode_as_video(py_env, policy, filepath, fps=10):
    """Create mp4 video through py_environment render method."""
    tf_env = TFPyEnvironment(py_env)
    with imageio.get_writer('tmp.mp4', fps=fps) as video:
        time_step = tf_env.reset()
        video.append_data(py_env.render())
        while not time_step.is_last():
            action = policy.action(time_step).action
            time_step = tf_env.step(action)
            video.append_data(py_env.render())
        for _ in range(3 * fps):  # play for 3 more seconds
            video.append_data(py_env.render())

    # giving video file a more descriptive name
    _, result = py_env.determine_reward()
    assert filepath.split('.')[1] == 'mp4'
    split = filepath.split('.')
    split[0] += '-' + result
    filepath = '.'.join(split)
    os.rename('tmp.mp4', filepath)

def compute_mean_reward(environment: TFPyEnvironment,
                        policy: tf_policy.Base,
                        num_episodes=10) -> float:
    """
    Evaluate mean reward over `num_episodes`.

    Implementation is taken from the official TensorFlow documentation tutorial:
    https://www.tensorflow.org/agents/tutorials/6_reinforce_tutorial#metrics_and_evaluation
    """
    total_reward = 0.0
    for _ in range(num_episodes):
        time_step = environment.reset()
        episode_reward = 0.0
        while not time_step.is_last():
            action_step = policy.action(time_step)
            time_step = environment.step(action_step.action)
            episode_reward += time_step.reward
        total_reward += episode_reward
    avg_rewards = total_reward / num_episodes
    return avg_rewards.numpy()[0]

def episode_as_gif(py_env, policy, save_path, fps=10, show_path=True):
    """Create gif through py_environment render method."""
    tf_env = TFPyEnvironment(py_env)
    path = []
    with imageio.get_writer(save_path, mode='I', fps=fps) as gif:
        time_step = tf_env.reset()
        # using the policy_state to deal with scripted_policy possibility
        policy_state = policy.get_initial_state(batch_size=1)
        gif.append_data(py_env.render())
        while not time_step.is_last():
            action = policy.action(time_step, policy_state)
            time_step = tf_env.step(action.action)
            im, real_position = py_env.render('return_real')
            path.append(real_position)
            if show_path:
                im = render_agent_path(im, path)
            policy_state = action.state
            gif.append_data(np.array(im))
        for _ in range(fps):  # play for 1 more second
            gif.append_data(py_env.render())
    pygifsicle.optimize(save_path)

def main(_):
    # Environment
    env_name = "Breakout-v4"
    train_num_parallel_environments = 5
    max_steps_per_episode = 1000
    # Replay buffer
    replay_buffer_capacity = 50000
    init_replay_buffer = 500
    # Driver
    collect_steps_per_iteration = 1 * train_num_parallel_environments
    # Training
    train_batch_size = 32
    train_iterations = 100000
    train_summary_interval = 200
    train_checkpoint_interval = 200
    # Evaluation
    eval_num_parallel_environments = 5
    eval_summary_interval = 500
    eval_num_episodes = 20
    # File paths
    path = pathlib.Path(__file__)
    parent_dir = path.parent.resolve()
    folder_name = path.stem + time.strftime("_%Y%m%d_%H%M%S")
    train_checkpoint_dir = str(parent_dir / folder_name / "train_checkpoint")
    train_summary_dir = str(parent_dir / folder_name / "train_summary")
    eval_summary_dir = str(parent_dir / folder_name / "eval_summary")

    # Parallel training environment
    tf_env = TFPyEnvironment(
        ParallelPyEnvironment([
            lambda: suite_atari.load(
                env_name,
                env_wrappers=[lambda env: TimeLimit(env, duration=max_steps_per_episode)],
                gym_env_wrappers=[AtariPreprocessing, FrameStack4],
            )
        ] * train_num_parallel_environments))
    tf_env.seed([42] * tf_env.batch_size)
    tf_env.reset()

    # Parallel evaluation environment
    eval_tf_env = TFPyEnvironment(
        ParallelPyEnvironment([
            lambda: suite_atari.load(
                env_name,
                env_wrappers=[lambda env: TimeLimit(env, duration=max_steps_per_episode)],
                gym_env_wrappers=[AtariPreprocessing, FrameStack4],
            )
        ] * eval_num_parallel_environments))
    eval_tf_env.seed([42] * eval_tf_env.batch_size)
    eval_tf_env.reset()

    # Creating the Deep Q-Network
    preprocessing_layer = keras.layers.Lambda(
        lambda obs: tf.cast(obs, np.float32) / 255.)
    conv_layer_params = [(32, (8, 8), 4), (64, (4, 4), 2), (64, (3, 3), 1)]
    fc_layer_params = [512]
    q_net = QNetwork(tf_env.observation_spec(),
                     tf_env.action_spec(),
                     preprocessing_layers=preprocessing_layer,
                     conv_layer_params=conv_layer_params,
                     fc_layer_params=fc_layer_params)

    # Creating the DQN Agent
    optimizer = keras.optimizers.RMSprop(lr=2.5e-4,
                                         rho=0.95,
                                         momentum=0.0,
                                         epsilon=0.00001,
                                         centered=True)
    epsilon_fn = keras.optimizers.schedules.PolynomialDecay(
        initial_learning_rate=1.0,  # initial ε
        decay_steps=2500000,
        end_learning_rate=0.01)  # final ε
    global_step = tf.compat.v1.train.get_or_create_global_step()
    agent = DqnAgent(
        tf_env.time_step_spec(),
        tf_env.action_spec(),
        q_network=q_net,
        optimizer=optimizer,
        target_update_period=200,
        td_errors_loss_fn=keras.losses.Huber(reduction="none"),
        gamma=0.99,  # discount factor
        train_step_counter=global_step,
        epsilon_greedy=lambda: epsilon_fn(global_step))
    agent.initialize()

    # Creating the Replay Buffer
    replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
        data_spec=agent.collect_data_spec,
        batch_size=tf_env.batch_size,
        max_length=replay_buffer_capacity)

    # Observer: Replay Buffer Observer
    replay_buffer_observer = replay_buffer.add_batch

    # Observer: Training Metrics
    train_metrics = [
        tf_metrics.NumberOfEpisodes(),
        tf_metrics.EnvironmentSteps(),
        tf_metrics.AverageReturnMetric(batch_size=tf_env.batch_size),
        tf_metrics.AverageEpisodeLengthMetric(batch_size=tf_env.batch_size),
    ]

    # Creating the Collect Driver
    collect_driver = DynamicStepDriver(tf_env,
                                       agent.collect_policy,
                                       observers=[replay_buffer_observer] + train_metrics,
                                       num_steps=collect_steps_per_iteration)

    # Initialize replay buffer
    initial_collect_policy = RandomTFPolicy(tf_env.time_step_spec(),
                                            tf_env.action_spec())
    init_driver = DynamicStepDriver(
        tf_env,
        initial_collect_policy,
        observers=[replay_buffer_observer, ShowProgress()],
        num_steps=init_replay_buffer)
    final_time_step, final_policy_state = init_driver.run()

    # Creating the Dataset
    dataset = replay_buffer.as_dataset(sample_batch_size=train_batch_size,
                                       num_steps=2,
                                       num_parallel_calls=3).prefetch(3)

    # Optimize by wrapping some of the code in a graph using TF function.
    collect_driver.run = function(collect_driver.run)
    agent.train = function(agent.train)

    print("\n\n++++++++++++++++++++++++++++++++++\n")

    # Create checkpoint
    train_checkpointer = Checkpointer(
        ckpt_dir=train_checkpoint_dir,
        max_to_keep=1,
        agent=agent,
        # replay_buffer=replay_buffer,
        global_step=global_step,
        # metrics=metric_utils.MetricsGroup(train_metrics, 'train_metrics')
    )
    # Restore checkpoint
    # train_checkpointer.initialize_or_restore()

    # Summary writers and metrics
    train_summary_writer = tf.summary.create_file_writer(train_summary_dir)
    eval_summary_writer = tf.summary.create_file_writer(eval_summary_dir)
    eval_metrics = [
        tf_metrics.NumberOfEpisodes(),
        tf_metrics.EnvironmentSteps(),
        tf_metrics.AverageReturnMetric(batch_size=eval_tf_env.batch_size,
                                       buffer_size=eval_num_episodes),
        tf_metrics.AverageEpisodeLengthMetric(batch_size=eval_tf_env.batch_size,
                                              buffer_size=eval_num_episodes)
    ]

    # Create evaluate callback function
    eval_callback = evaluate(eval_metrics=eval_metrics,
                             eval_tf_env=eval_tf_env,
                             eval_policy=agent.policy,
                             eval_num_episodes=eval_num_episodes,
                             train_step=global_step,
                             eval_summary_writer=eval_summary_writer)

    # Train agent
    train_agent(tf_env=tf_env,
                train_iterations=train_iterations,
                global_step=global_step,
                agent=agent,
                dataset=dataset,
                collect_driver=collect_driver,
                train_metrics=train_metrics,
                train_checkpointer=train_checkpointer,
                train_checkpoint_interval=train_checkpoint_interval,
                train_summary_writer=train_summary_writer,
                train_summary_interval=train_summary_interval,
                eval_summary_interval=eval_summary_interval,
                eval_callback=eval_callback)

    print("\n\n++++++++++ END OF TF_AGENTS RL TRAINING ++++++++++\n\n")

def test_off_policy_algorithm(self, algorithm_ctor, use_rollout_state,
                              sync_driver):
    logging.info("{} {}".format(algorithm_ctor.__name__, sync_driver))

    batch_size = 128
    if use_rollout_state:
        steps_per_episode = 5
        mini_batch_length = 8
        unroll_length = 8
        env_class = RNNPolicyUnittestEnv
    else:
        steps_per_episode = 12
        mini_batch_length = 2
        unroll_length = 12
        env_class = PolicyUnittestEnv
    env = TFPyEnvironment(
        env_class(
            batch_size,
            steps_per_episode,
            action_type=ActionType.Continuous))
    eval_env = TFPyEnvironment(
        env_class(
            batch_size,
            steps_per_episode,
            action_type=ActionType.Continuous))

    common.set_global_env(env)
    algorithm = algorithm_ctor()
    algorithm.set_summary_settings(summarize_grads_and_vars=True)
    algorithm.use_rollout_state = use_rollout_state

    if sync_driver:
        driver = SyncOffPolicyDriver(env, algorithm)
    else:
        driver = AsyncOffPolicyDriver([env],
                                      algorithm,
                                      num_actor_queues=1,
                                      unroll_length=unroll_length,
                                      learn_queue_cap=1,
                                      actor_queue_cap=1)
    eval_driver = OnPolicyDriver(eval_env, algorithm, training=False)

    eval_env.reset()
    driver.start()
    if sync_driver:
        time_step = driver.get_initial_time_step()
        policy_state = driver.get_initial_policy_state()
        for i in range(5):
            time_step, policy_state = driver.run(
                max_num_steps=batch_size * steps_per_episode,
                time_step=time_step,
                policy_state=policy_state)

    for i in range(500):
        if sync_driver:
            time_step, policy_state = driver.run(
                max_num_steps=batch_size * mini_batch_length * 2,
                time_step=time_step,
                policy_state=policy_state)
            whole_replay_buffer_training = False
            clear_replay_buffer = False
        else:
            driver.run_async()
            whole_replay_buffer_training = True
            clear_replay_buffer = True

        driver.algorithm.train(
            mini_batch_size=128,
            mini_batch_length=mini_batch_length,
            whole_replay_buffer_training=whole_replay_buffer_training,
            clear_replay_buffer=clear_replay_buffer)
        eval_env.reset()
        eval_time_step, _ = eval_driver.run(
            max_num_steps=(steps_per_episode - 1) * batch_size)
        logging.log_every_n_seconds(
            logging.INFO,
            "%d reward=%f" % (i, float(tf.reduce_mean(eval_time_step.reward))),
            n_seconds=1)
    driver.stop()

    self.assertAlmostEqual(
        1.0, float(tf.reduce_mean(eval_time_step.reward)), delta=2e-1)

def train_agent(
        env: TFPyEnvironment,
        agent: Union[ReinforceAgent, PPOAgent],
        data_collection_driver: DynamicEpisodeDriver,
        replay_buffer: TFUniformReplayBuffer,
        num_iters: int,
        global_step=None,
        metrics: Optional[Sequence[tf_metric.TFStepMetric]] = None,
        policy_metrics: Optional[Sequence[tf_metric.TFStepMetric]] = None,
        policy_summary_writers: Optional[Sequence[tf.summary.SummaryWriter]] = None,
        eval_env: Optional[TFPyEnvironment] = None,
        eval_summary_writer: Optional[tf.summary.SummaryWriter] = None,
        num_eval_episodes: int = 1,
        eval_metrics: Optional[List[tf_metric.TFStepMetric]] = None,
        per_step_eval_metrics: Optional[List[Any]] = None,
        eval_freq: int = 10,
        log_freq: int = 5,
        save_freq: int = 5,
        model_save_path: Optional[str] = None,
        tf_log_stream_path: Optional[str] = None) -> None:
    """
    Function for putting the pieces together to train and evaluate an agent.

    :param env: The environment for which the agent will be trained.
    :param agent: The agent to train.
    :param data_collection_driver: The driver used for data collection and metric tracking.
    :param replay_buffer: Replay buffer in which to store experience.
    :param num_iters: The number of training iterations to perform.
    :param global_step: A counter of the number of training iterations.
    :param metrics: A list of the metrics to track during training.
    :param policy_metrics: A list of metrics related to the policy distribution to track during
        training.
    :param policy_summary_writers: A list of summary writers to facilitate overlaying plots of
        policy metrics in TensorBoard.
    :param eval_env: The environment in which to play out evaluations of the policy.
    :param eval_summary_writer: The summary writer used for evaluation metrics.
    :param num_eval_episodes: The number of evaluation episodes to run at each evaluation point.
    :param eval_metrics: The metrics to track when evaluating the policy (with episodic
        resolution).
    :param per_step_eval_metrics: The metrics to track when evaluating the policy (with time step
        resolution).
    :param eval_freq: The number of training iterations between runs of policy evaluation logging.
    :param log_freq: The frequency with which to log values to TensorBoard.
    :param save_freq: The number of training iterations between model saves.
    :param model_save_path: Directory in which to save model checkpoints (weights etc). If None
        model will not be saved.
    :param tf_log_stream_path: Path to a file to which tf.print calls are written.
    """
    # Get the initial states of the agent and environment before training.
    time_step = env.reset()
    policy_state = agent.collect_policy.get_initial_state(env.batch_size)

    # Set up the model saving infrastructure if a path to save to is provided.
    save_model = bool(model_save_path)
    if save_model:
        # Ensure that we save all trackable values (i.e. variables) from the TensorFlow Agent.
        checkpoint = tf.train.Checkpoint(agent=agent)
        # The checkpoint manager enables us to save multiple versions of the check point at
        # different training steps. We save the 20 most recent saves to span a wide section of
        # training.
        checkpoint_manager = tf.train.CheckpointManager(checkpoint, model_save_path, max_to_keep=20)
    else:
        # Warn the user that training will continue but models will not be saved.
        warn("No save directory provided. Model will not be saved.")

    if metrics is None:
        metrics = []
    if per_step_eval_metrics is None:
        per_step_eval_metrics = []
    # Set up a minimal training loop to simply test training mechanics work.
    for i in range(num_iters):
        with tf.summary.record_if(lambda: tf.math.equal(global_step % log_freq, 0)):
            # Collect experience.
            time_step, policy_state = data_collection_driver.run(
                time_step=time_step,
                policy_state=policy_state
            )
            # Now the replay buffer should have data in it so we can collect the data and train
            # the agent.
            experience = replay_buffer.gather_all()
            agent.train(experience)
            # Clear the replay buffer and return to play.
            replay_buffer.clear()
            for metric in metrics:
                metric.tf_summaries(
                    train_step=global_step,
                    step_metrics=metrics[:2]
                )
            # Run the policy tracking metrics one at a time each on their own summary writer to
            # enable shared axes on TensorBoard.
            for metric, summary_writer in zip(policy_metrics, policy_summary_writers):
                with summary_writer.as_default():
                    tf.summary.scalar(name=metric.name, data=metric.result(), step=global_step)
        if eval_summary_writer and eval_metrics and eval_env:
            if i > 0 and global_step % eval_freq == 0:
                evaluate_policy(
                    eval_metrics,
                    eval_env,
                    agent.policy,
                    per_step_metrics=per_step_eval_metrics,
                    num_episodes=num_eval_episodes,
                    train_step=global_step,
                    summary_writer=eval_summary_writer,
                    summary_prefix="Metrics",
                    logging=True,
                    tf_log_stream_path=tf_log_stream_path
                )
        # Periodically save the model provided that we have the infrastructure in place.
        if save_model and i > 0 and (i + 1) % save_freq == 0:
            checkpoint_manager.save(i + 1)
        if i % (num_iters // 100) == 0:
            print(f"\tCompleted: {i / num_iters * 100} %")
    # Only save the final checkpoint when a checkpoint manager was created above.
    if save_model:
        checkpoint_manager.save(num_iters)

def reset_and_fire_on_life_lost(trajectory):
    global prev_lives
    lives = tf_env.pyenv.envs[0].ale.lives()
    if prev_lives != lives:
        tf_env.reset()
        tf_env.step(1)
        prev_lives = lives


watch_driver = DynamicStepDriver(tf_env,
                                 saved_policy,
                                 observers=[
                                     save_frames,
                                     reset_and_fire_on_life_lost,
                                     ShowProgress(1000)
                                 ],
                                 num_steps=1000)
tf_env.reset()  # reset the env
time_step = tf_env.step(1)  # fire the ball to begin playing
policy_state = saved_policy.get_initial_state()  # empty state ()
final_time_step, final_policy_state = watch_driver.run(time_step, policy_state)

# render a window that shows the agent playing (works in a Jupyter notebook)
renderingUtils = RenderingUtils(frames)
renderingUtils.plot_animation()
renderingUtils.generate_gif("breakout.gif")
renderingUtils.create_policy_eval_video(env, saved_policy, "trained-agent")

# Main training loop
time_step, policy_state = None, None
for it in range(N_ITERATIONS):
    if COLLECT_RANDOM:
        print('Running random driver...')
        time_step, policy_state = random_driver.run(time_step, policy_state)
    print('Running agent driver...')
    time_step, policy_state = driver.run(time_step, policy_state)
    print('Training...')
    for train_it in range(BUFFER_LENGTH // BATCH_SIZE):
        experience, _ = replay_buffer.get_next(sample_batch_size=BATCH_SIZE, num_steps=2)
        agent.train(experience)
        if (train_it + 1) % 100 == 0:
            print('{0} training iterations'.format(train_it + 1))
    print('Saving...')
    # Save to checkpoint
    checkpointer.save(global_step)
    # Save policy
    policy_saver.save(os.path.relpath('policy'))
    # Show total reward of actual policy for 1 episode
    total_reward = 0.0
    eval_ts = eval_env.reset()
    num_steps = 0
    while (not eval_ts.is_last()) and num_steps < EVAL_MAX_STEPS:
        action_step = agent.policy.action(eval_ts)
        eval_ts = eval_env.step(action_step.action)
        total_reward += eval_ts.reward
        num_steps += 1
    print('Iteration = {0}: Steps taken: = {1} of {2}: Total reward = {3}'.format(
        it, num_steps, EVAL_MAX_STEPS, total_reward))

def test_all_mepo_variants_work(transition_model, trajectory_sampler,
                                model_free_agent_type):
    """
    Mepo Agent has prespecified transition model, trajectory sampler and model-free agent
    types. Here we check that all combinations execute without errors.
    """

    # setup the environment and prespecified model components
    py_env = suite_gym.load("MountainCarContinuous-v0")
    tf_env = TFPyEnvironment(py_env)
    time_step_spec = tf_env.time_step_spec()
    observation_spec = tf_env.observation_spec()
    action_spec = tf_env.action_spec()
    reward_model = MountainCarReward(observation_spec, action_spec)
    initial_state_distribution_model = MountainCarInitialState(observation_spec)

    # some parameters need to be set correctly
    ensemble_size = 2
    num_elites = 10
    population_size = num_elites + 10
    horizon = 1

    # define agent, many transition model and trajectory optimiser parameters can
    # be arbitrary
    agent = MepoAgent(
        time_step_spec,
        action_spec,
        transition_model,
        1,
        10,
        tf.nn.relu,
        ensemble_size,
        False,
        1,
        1,
        [tf.keras.callbacks.EarlyStopping(monitor="loss", patience=3)],
        reward_model,
        initial_state_distribution_model,
        trajectory_sampler,
        horizon,
        population_size,
        model_free_agent_type,
        1,
        10,
        tf.nn.relu,
        2,
    )

    # we need some training data
    random_policy = RandomTFPolicy(
        time_step_spec,
        action_spec,
        info_spec=agent.collect_policy.info_spec,
    )
    model_training_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
        random_policy.trajectory_spec, batch_size=1, max_length=1000)
    collect_driver_random_policy = TFDriver(
        tf_env,
        random_policy,
        observers=[model_training_buffer.add_batch],
        max_steps=10,
        disable_tf_function=True,
    )
    initial_time_step = tf_env.reset()
    collect_driver_random_policy.run(initial_time_step)
    pets_agent_trainer = BackgroundPlanningAgentTrainer(10, 10)
    tf_training_scheduler = pets_agent_trainer.create_training_scheduler(
        agent, model_training_buffer)
    training_losses = tf_training_scheduler.maybe_train(
        tf.constant(10, dtype=tf.int64))
    assert EnvironmentModelComponents.TRANSITION in training_losses

    # test the agent
    collect_driver_planning_policy = TFDriver(
        tf_env,
        agent.collect_policy,
        observers=[model_training_buffer.add_batch],
        max_steps=10,
        disable_tf_function=True,
    )
    time_step = tf_env.reset()
    collect_driver_planning_policy.run(time_step)

# 8. Evaluating the agent.
def evaluate(env, policy, num_episodes):
    total_return = 0.0
    for _ in range(num_episodes):
        time_step = env.reset()
        episode_return = 0.0
        while not time_step.is_last():
            action_step = policy.action(time_step)
            time_step = env.step(action_step.action)
            episode_return += time_step.reward
        total_return += episode_return
    average_return = total_return / num_episodes
    return average_return.numpy()[0]


# Resetting the train step.
agent.train_step_counter.assign(0)

# Resetting eval environment.
eval_env.reset()

# Evaluate the agent's policy once before training.
num_of_episodes = 1
avg_return = evaluate(eval_env, agent.policy, num_of_episodes)
print('\nAverage return in', num_of_episodes, 'episodes =', avg_return)

carla_environment.close()

        # Tail of the environment's `_step` method:
        if cumulative_done:
            self._episode_ended = True
            return ts.termination(self._state, reward)
        else:
            return ts.transition(self._state, reward, discount=0.98)


from tf_agents.environments.tf_py_environment import TFPyEnvironment

tf_env = TFPyEnvironment(YoushiEnv())  # wrap an environment instance, not the class
# tf_env = YoushiEnv()
# tf_agent = tf.saved_model.load(saved_models_path)
q_net = tf.saved_model.load("MyPolicyHard")
time_step = tf_env.reset()
display = DisplayIA.Display()
lost = False
score = 0
print(type(q_net))
while not time_step.is_last():
    display.refresh(time_step.observation)
    action_step = q_net.action(time_step)
    time_step = tf_env.step(action_step.action)  # step with the action tensor from the PolicyStep
    score += 1
print(score)

def test_off_policy_algorithm(self, algorithm_ctor, use_rollout_state,
                              sync_driver):
    logging.info("{} {}".format(algorithm_ctor.__name__, sync_driver))

    batch_size = 128
    if use_rollout_state:
        steps_per_episode = 5
        mini_batch_length = 8
        unroll_length = 8
        env_class = RNNPolicyUnittestEnv
    else:
        steps_per_episode = 12
        mini_batch_length = 2
        unroll_length = 12
        env_class = PolicyUnittestEnv
    env = TFPyEnvironment(
        env_class(batch_size,
                  steps_per_episode,
                  action_type=ActionType.Continuous))
    eval_env = TFPyEnvironment(
        env_class(batch_size,
                  steps_per_episode,
                  action_type=ActionType.Continuous))

    algorithm = algorithm_ctor(env)
    algorithm.use_rollout_state = use_rollout_state

    if sync_driver:
        driver = SyncOffPolicyDriver(env,
                                     algorithm,
                                     use_rollout_state=use_rollout_state,
                                     debug_summaries=True,
                                     summarize_grads_and_vars=True)
    else:
        driver = AsyncOffPolicyDriver(
            [env],
            algorithm,
            use_rollout_state=algorithm.use_rollout_state,
            num_actor_queues=1,
            unroll_length=unroll_length,
            learn_queue_cap=1,
            actor_queue_cap=1,
            debug_summaries=True,
            summarize_grads_and_vars=True)
    replayer = driver.exp_replayer
    eval_driver = OnPolicyDriver(eval_env,
                                 algorithm,
                                 training=False,
                                 greedy_predict=True)

    eval_env.reset()
    driver.start()
    if sync_driver:
        time_step = driver.get_initial_time_step()
        policy_state = driver.get_initial_policy_state()
        for i in range(5):
            time_step, policy_state = driver.run(
                max_num_steps=batch_size * steps_per_episode,
                time_step=time_step,
                policy_state=policy_state)

    for i in range(500):
        if sync_driver:
            time_step, policy_state = driver.run(
                max_num_steps=batch_size * mini_batch_length * 2,
                time_step=time_step,
                policy_state=policy_state)
            experience, _ = replayer.replay(
                sample_batch_size=128,
                mini_batch_length=mini_batch_length)
        else:
            driver.run_async()
            experience = replayer.replay_all()

        driver.train(experience,
                     mini_batch_size=128,
                     mini_batch_length=mini_batch_length)
        eval_env.reset()
        eval_time_step, _ = eval_driver.run(
            max_num_steps=(steps_per_episode - 1) * batch_size)
        logging.info("%d reward=%f", i,
                     float(tf.reduce_mean(eval_time_step.reward)))
    driver.stop()

    self.assertAlmostEqual(1.0,
                           float(tf.reduce_mean(eval_time_step.reward)),
                           delta=2e-1)

def evaluate_policy(metrics: List[Any],
                    environment: TFPyEnvironment,
                    policy: tf_agents.policies.tf_policy.Base,
                    per_step_metrics: Optional[List[tf.Module]] = None,
                    num_episodes: int = 1,
                    train_step: Optional[Any] = None,
                    summary_writer: Optional[tf.summary.SummaryWriter] = None,
                    summary_prefix: str = "Eval",
                    logging: bool = False,
                    tf_log_stream_path: Optional[str] = None) -> None:
    """
    Track performance (via metrics) using policy in the environment provided.

    Prints a dictionary of results {metric_name: metric_value}.

    *NOTE*: Because placeholders are not compatible with Eager mode this is not compatible with
    python policies.

    This function is adapted from tf_agents.eval.metric_utils.eager_compute to allow for per
    time step logging.

    :param metrics: List of metrics to compute.
    :param environment: tf_environment instance.
    :param policy: tf_policy instance used to step the environment.
    :param per_step_metrics: List of metrics to be passed as observers to run every time step
        during evaluation.
    :param num_episodes: Number of episodes to compute the metrics over.
    :param train_step: An optional step to write summaries against.
    :param summary_writer: An optional writer for generating metric summaries.
    :param summary_prefix: An optional prefix scope for metric summaries.
    :param logging: Option to enable logging to the console of standard metrics.
    :param tf_log_stream_path: Path to a file which tf.print calls are set to write to. If None
        tf.print statements print to sys.stdout.
    """
    # Reset the state of all metrics (e.g. running totals for averages).
    for metric in metrics + per_step_metrics:
        metric.reset()

    # Attain the initial state of the environment and policy.
    time_step = environment.reset()
    policy_state = policy.get_initial_state(environment.batch_size)

    # Set up a driver to run the evaluation episodes while logging the desired metrics.
    driver = DynamicEpisodeDriver(
        environment,
        policy,
        observers=metrics,
        transition_observers=per_step_metrics,
        num_episodes=num_episodes)
    # Run the driver which adds experience to the replay buffer.
    driver.run(time_step, policy_state)

    # If we have the required prerequisites then perform the TensorBoard logging as well as
    # logging results to the console.
    if train_step and summary_writer:
        # Utilise a (possibly) different summary writer to put the evaluation metrics to
        # TensorBoard.
        with summary_writer.as_default():
            for m in metrics:
                # Attain the full name of the metric to record.
                tag = "/".join([summary_prefix, m.name])
                # Simply calculating and forming the scalar summary in the current context with a
                # default summary writer does the logging to TensorBoard for us.
                tf.summary.scalar(name=tag, data=m.result(), step=train_step)
    # If requested to then log metrics to the console.
    if logging and train_step:
        for m in metrics:
            tf.print(f"Evaluation at step {train_step.numpy()}: {m.name}\t{m.result()}",
                     output_stream=f'file://{tf_log_stream_path}' if tf_log_stream_path
                     else sys.stdout)

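# A minimal usage sketch for `evaluate_policy` above. It assumes an existing evaluation
# TFPyEnvironment `eval_env` and a trained agent `agent` (both hypothetical here); the
# metric choices, episode count and summary directory are illustrative only.
import tensorflow as tf
from tf_agents.metrics import tf_metrics

eval_metrics = [
    tf_metrics.AverageReturnMetric(buffer_size=10),
    tf_metrics.AverageEpisodeLengthMetric(buffer_size=10),
]
per_step_metrics = [tf_metrics.EnvironmentSteps()]
evaluate_policy(
    eval_metrics,
    eval_env,
    agent.policy,
    per_step_metrics=per_step_metrics,
    num_episodes=10,
    train_step=tf.compat.v1.train.get_or_create_global_step(),
    summary_writer=tf.summary.create_file_writer("/tmp/eval_summaries"),
    logging=True,
)
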