import numpy as np
from gym.wrappers.monitoring.video_recorder import VideoRecorder


def play(env, model, video_path, num_episodes, timesteps, metadata):
    # Logged state and action trajectories, returned/recorded per step.
    theta, theta_dot, actions = [], [], []
    video_recorder = None
    for _ in range(num_episodes):
        video_recorder = VideoRecorder(
            env=env, path=video_path, metadata=metadata,
            enabled=video_path is not None)
        obs = env.reset()
        for t in range(timesteps):
            # Wrap the observation in the (1, 1, obs_dim) batch shape the
            # model expects.
            obs = [np.array([[list(obs)]])]
            video_recorder.capture_frame()
            action = model.predict(obs)[0]
            obs, rew, done, info = env.step(action)
            env.render()
            theta.append(obs[0])
            theta_dot.append(obs[1])
            actions.append(action[0])
            if done:
                print("Episode finished after {} timesteps".format(t + 1))
                # Save video of the first episode.
                print("Saved video.")
                video_recorder.close()
                video_recorder.enabled = False
                break
    env.close()
    return theta
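# A hypothetical driver for the model-based `play` above (not from the
# original source): `RandomModel` is an illustrative stand-in for a trained
# model with a stable-baselines-style `.predict(obs)` method, run on a
# Pendulum-style env whose per-step observation indexing matches the
# theta/theta_dot logging.
import gym


class RandomModel:
    """Minimal stand-in for a trained model; samples random actions."""

    def __init__(self, action_space):
        self.action_space = action_space

    def predict(self, obs):
        # Return a one-element batch of actions, like model.predict would.
        return [self.action_space.sample()]


if __name__ == "__main__":
    env = gym.make("Pendulum-v0")
    theta_trace = play(env, RandomModel(env.action_space),
                       video_path="pendulum.mp4", num_episodes=1,
                       timesteps=200, metadata={"note": "demo"})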
import numpy as np
from gym.wrappers.monitoring.video_recorder import VideoRecorder


def play(env, act, stochastic, video_path):
    video_recorder = VideoRecorder(env, video_path, enabled=video_path is not None)
    obs = env.reset()
    while True:
        env.unwrapped.render()
        video_recorder.capture_frame()
        # `act` expects a batch dimension, hence the [None].
        action = act(np.array(obs)[None], stochastic=stochastic)[0]
        obs, rew, done, _ = env.step(action)
        if done:
            obs = env.reset()
            if video_recorder.enabled:
                # Save video of the first episode only.
                print("Saved video.")
                video_recorder.close()
                video_recorder.enabled = False
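# Every snippet in this file follows the same VideoRecorder lifecycle:
# construct with a target path, call capture_frame() once per step, then
# close() and disable. A self-contained sketch of that pattern under the
# classic gym API (an assumption for illustration, not taken from any of
# the snippets here):
import gym
from gym.wrappers.monitoring.video_recorder import VideoRecorder

env = gym.make("CartPole-v1")
recorder = VideoRecorder(env, path="episode.mp4")
obs, done = env.reset(), False
while not done:
    recorder.capture_frame()  # renders the env and buffers one frame
    obs, reward, done, info = env.step(env.action_space.sample())
recorder.close()              # flushes the frames and writes the .mp4
recorder.enabled = False      # later capture_frame() calls become no-ops
env.close()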
from itertools import count

from gym.wrappers.monitoring.video_recorder import VideoRecorder


def save_video(agent, video_path):
    video_recorder = VideoRecorder(
        agent.env, video_path, enabled=video_path is not None)
    state = agent.env.reset()
    state = state2tensor(state)
    for t in count():
        agent.env.unwrapped.render()
        video_recorder.capture_frame()
        action = select_best_action(state=state, agent=agent)
        next_state, rew, done, info = agent.env.step(action.item())
        state = state2tensor(next_state)
        if done:
            # Save video of the first episode.
            print("Saved video.")
            video_recorder.close()
            video_recorder.enabled = False
            break
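# `save_video` depends on two helpers defined elsewhere in its project. The
# sketches below are plausible PyTorch implementations written here as
# assumptions; in particular, `agent.policy_net` is hypothetical and not
# from the source:
import torch


def state2tensor(state):
    # Turn a gym observation (numpy array) into a batched float tensor.
    return torch.as_tensor(state, dtype=torch.float32).unsqueeze(0)


def select_best_action(state, agent):
    # Greedy action: index of the largest Q-value from the policy network.
    with torch.no_grad():
        return agent.policy_net(state).argmax(dim=1)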
import logging
import os

from gym.wrappers.monitoring.video_recorder import VideoRecorder


def _hrl_run_episodes(env, agent: HIROAgent, n_steps, n_episodes,
                      max_episode_len=None, logger=None,
                      step_number=None, video_outdir=None):
    """Run multiple episodes and return returns."""
    # Exactly one of n_steps and n_episodes must be specified.
    assert (n_steps is None) != (n_episodes is None)
    evaluation_videos_dir = f'{video_outdir}/evaluation_videos'
    os.makedirs(evaluation_videos_dir, exist_ok=True)
    video_recorder = VideoRecorder(
        env, path=f'{evaluation_videos_dir}/evaluation_{step_number}.mp4')
    video_recorder.enabled = step_number is not None
    logger = logger or logging.getLogger(__name__)
    scores = []
    successes = 0
    terminate = False
    timestep = 0
    env.evaluate = True
    reset = True
    while not terminate:
        if reset:
            obs_dict = env.reset()
            fg = obs_dict['desired_goal']
            obs = obs_dict['observation']
            sg = env.subgoal_space.sample()
            done = False
            test_r = 0
            episode_len = 0
            info = {}
        a = agent.act_low_level(obs, sg)
        obs_dict, r, done, info = env.step(a)
        video_recorder.capture_frame()
        obs = obs_dict['observation']
        # Select a subgoal for the lower-level controller.
        n_sg = agent.act_high_level(obs, fg, sg, timestep)
        test_r += r
        episode_len += 1
        timestep += 1
        reset = (done or episode_len == max_episode_len
                 or info.get("needs_reset", False))
        agent.observe(obs, fg, n_sg, r, done, reset, timestep)
        sg = n_sg
        if reset:
            logger.info("evaluation episode %s length:%s R:%s",
                        len(scores), episode_len, test_r)
            success = agent.evaluate_final_goal(fg, obs)
            successes += 1 if success else 0
            logger.info(f"{successes} successes so far.")
            # As mixing float and numpy float causes errors in statistics
            # functions, every score is cast to float here.
            scores.append(float(test_r))
        if n_steps is None:
            terminate = len(scores) >= n_episodes
        else:
            terminate = timestep >= n_steps
    # If all steps were used for a single unfinished episode
    if len(scores) == 0:
        scores.append(float(test_r))
        logger.info("evaluation episode %s length:%s R:%s",
                    len(scores), episode_len, test_r)
    # Divide by the number of completed episodes so this also works in
    # n_steps mode, where n_episodes is None.
    success_rate = successes / len(scores)
    logger.info(f"Success Rate: {success_rate}")
    if step_number is not None:
        print("Saved video.")
        video_recorder.close()
    return scores, success_rate
import gym
import numpy as np
import tensorflow as tf
from gym.wrappers.monitoring.video_recorder import VideoRecorder


def run_learner(env_name, agent_class, max_episodes=1000, n_deterministic_episodes=10,
                output_freq=10, env_seed=None, tf_seed=None, max_time=200,
                video_save_root=None, **kwargs):
    """
    Run the reinforcement learning process (in either deterministic or training mode).

    :param env_name: name of the environment to use for training
    :param agent_class: class that can be used to create an agent object
    :param max_episodes: number of episodes to use for training
    :param n_deterministic_episodes: number of deterministic episodes to test the controller
    :param output_freq: frequency at which to output episode results
    :param env_seed: random seed for the environment (to control the start conditions)
    :param tf_seed: random seed for TensorFlow (to control network initialization)
    :param max_time: maximum number of time steps before terminating an episode
    :param video_save_root: directory specifying the video save location
    :param kwargs: passed to the agent's __init__ method
    :return: trained agent and data from the run
    """
    # Create the OpenAI Gym environment
    env = gym.make(env_name)

    # Set up the video saving directory
    if video_save_root and not video_save_root.exists():
        video_save_root.mkdir()

    # Apply random seeds
    if env_seed:
        env.seed(env_seed)
    if tf_seed:
        tf.compat.v1.random.set_random_seed(tf_seed)

    # Find state and action sizes of the environment
    state_size = env.observation_space.shape[0]
    if not hasattr(env.action_space, 'n'):
        action_size = env.action_space.shape[0]
    else:
        action_size = env.action_space.n

    # Create the agent
    agent = agent_class(state_size, action_size,
                        # size of action (assumes all actions are scaled the same)
                        actor_limit=env.action_space.high[0],
                        **kwargs)

    # Initialize data lists for the episodes
    return_info = dict(score_list=[], deterministic_runs=[], states=[],
                       actions=[], rewards=[], dones=[])

    # Iterate through episodes
    for episode in range(max_episodes + n_deterministic_episodes):
        # Reset the environment for the beginning of the episode
        state = env.reset()

        # Actions are deterministic only once training is finished
        deterministic_action = episode >= max_episodes

        # Sum of rewards over an episode
        episode_rewards_sum = 0

        # Episode data lists
        states, rewards, actions, dones = [], [], [], []

        # Set up a video recorder if we want a video of this episode
        video_recorder = None
        if deterministic_action and video_save_root:
            video_path = str(video_save_root / f'episode-{agent.current_episode_number}.mp4')
            video_recorder = VideoRecorder(env, video_path,
                                           enabled=video_save_root is not None)

        # Conduct the episode
        done = False
        t = 0
        while not done:
            # Show the frame and save the movie frame if this is in testing mode
            if deterministic_action:
                env.render()
                if video_recorder:  # only record when a save root was given
                    video_recorder.capture_frame()

            # Get an action from the agent
            action = agent.act(state, deterministic=deterministic_action)

            # Get signals from the environment
            new_state, reward, done, info = env.step(action)

            # Save data
            states.append(state)
            actions.append(action)
            rewards.append(reward)
            dones.append(done)
            episode_rewards_sum += reward

            # Enforce the maximum time for an episode
            done = done or t == max_time - 1

            # Tell the agent a step in the environment has occurred
            if not deterministic_action:
                agent.on_t_update(state, action, new_state, reward, done)

            # Update the state value
            state = new_state

            if done:
                # Output data
                if episode % output_freq == 0 or deterministic_action:
                    print(f"episode: {episode}/{max_episodes}, "
                          f"score: {episode_rewards_sum}, "
                          f"critic loss: {'-' if not agent.critic_losses else np.abs(agent.critic_losses[-1])}, "
                          f"actor loss: {'-' if not agent.actor_losses else np.abs(agent.actor_losses[-1])}, "
                          f"loss: {'-' if not agent.updates else np.abs(agent.updates[-1])}, "
                          f"greedy: {deterministic_action}")
                if episode > 0 and episode % 100 == 0:
                    avg_reward = np.average(return_info['score_list'][-100:])
                    print(f"Average reward over 100 episodes: {avg_reward}")

                if deterministic_action:
                    # Save the episode reward
                    return_info['deterministic_runs'].append(episode_rewards_sum)

                    # Close the video recorder
                    if video_recorder:
                        video_recorder.close()
                        video_recorder.enabled = False
                else:
                    # Save the episode reward
                    return_info['score_list'].append(episode_rewards_sum)

                # Inform the agent that an episode has completed
                agent.on_episode_complete(episode)

                # Update the data lists
                return_info['states'].append(states)
                return_info['actions'].append(actions)
                return_info['rewards'].append(rewards)
                return_info['dones'].append(dones)

            # Update time
            t += 1

    # Save the agent if training has occurred
    if max_episodes > 0:
        agent.save(agent.sess, global_step=max_episodes)

    return agent, return_info
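# A hypothetical invocation of `run_learner` (illustrative only; `DDPGAgent`
# stands in for whatever agent class the original project supplies):
from pathlib import Path

if __name__ == "__main__":
    agent, run_data = run_learner(
        env_name="Pendulum-v0",          # continuous actions, so action_space.high exists
        agent_class=DDPGAgent,           # assumed agent class, not defined here
        max_episodes=500,
        n_deterministic_episodes=10,
        env_seed=0,
        tf_seed=0,
        video_save_root=Path("videos"),  # the 10 test episodes get recorded here
    )
    print(f"mean test return: {np.mean(run_data['deterministic_runs'])}")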
import os
import pathlib
import time

from gym.wrappers.monitoring.video_recorder import VideoRecorder


def run_policy(env, get_action, env_params_list, max_ep_len=None, episode_id=0,
               record=False, recording_path=None, no_render=False,
               use_baselines=False):
    '''
    Run an episode of a trained policy.

    Args:
        env: Environment
        get_action: Policy function
        env_params_list: List of tasks from which one must be loaded
        max_ep_len: Maximum number of steps allowed in the episode
        episode_id: Id of the episode to load in `env_params_list`
        record: Whether a video of the episode should be recorded
        recording_path: Path to which the video must be saved
        no_render: Whether the episode must run without rendering frames
        use_baselines: Whether the policy was trained using OpenAI Baselines
    '''
    if record:
        if os.name == "nt":
            # Windows limits paths to 260 characters; truncate the recording
            # path so the final filename stays below that limit.
            full_path = os.path.join(pathlib.Path().absolute(), recording_path)
            nb_char_to_remove = len(full_path) - 245
            if nb_char_to_remove > 0:
                recording_path = recording_path[:-nb_char_to_remove]
        video_recorder = VideoRecorder(
            env, recording_path + "_ep" + str(episode_id) + ".mp4", enabled=True)

    # Load the task parameters for this episode.
    if use_baselines:
        env.get_raw_env().set_environment(**env_params_list[episode_id])
    else:
        env.set_environment(**env_params_list[episode_id])

    if use_baselines:
        _, o = env.reset()
    else:
        o = env.reset()
    r, d, ep_ret, ep_len = 0, False, 0, 0
    while True:
        if record and video_recorder.enabled:
            video_recorder.capture_frame()
        if not record and not no_render:
            env.render()
            time.sleep(1e-3)

        a = get_action(o)
        o, r, d, i = env.step(a)
        if use_baselines:
            ep_ret += i[0]["original_reward"][0]
        else:
            ep_ret += r
        ep_len += 1

        if d or (ep_len == max_ep_len):
            print('Episode %d \t EpRet %.3f \t EpLen %d' % (episode_id, ep_ret, ep_len))
            if record and video_recorder.enabled:
                video_recorder.close()
                video_recorder.enabled = False
            break
    return ep_ret
import os

import numpy as np
import tensorflow as tf
from gym.wrappers.monitoring.video_recorder import VideoRecorder

# BASE_VIDEO_PATH, MAX_STEPS_PER_EPISODE, preprocessor_for_ddn and saver are
# module-level objects provided by the surrounding project.


def work(self):
    self.summary_writer.add_graph(self.sess.graph)
    n_episodes = 1000
    episode_i = 0
    episode_len = 0
    cur_state = preprocessor_for_ddn.obs_to_state(self.env.reset())
    cum_reward = 0
    while episode_i < n_episodes:
        # Set up a video recorder for this episode.
        video_path = os.path.join(BASE_VIDEO_PATH, f"{episode_i}.mp4")
        video_recorder = VideoRecorder(self.env, video_path,
                                       enabled=video_path is not None)

        # 1) sync from the global model to the local model
        # self._copy_to_local()

        # 2) collect up to t_max steps (restart the episode if it terminates)
        for _ in range(self.t_max):
            action = self.local_model.get_action(cur_state, self.sess)
            next_state, reward, done, info = self.env.step(action)
            next_state = preprocessor_for_ddn.obs_to_state(next_state)

            video_recorder.capture_frame()

            # Discounted return accumulated from the start of the episode.
            cum_reward += np.power(self.gamma, episode_len) * reward
            if reward != 0:
                print('cum_reward: ' + str(cum_reward))
            episode_len += 1

            if done or episode_len >= MAX_STEPS_PER_EPISODE:
                self.history.append(episode_len)
                summary = tf.Summary()
                summary.value.add(tag='Perf/episode_len',
                                  simple_value=float(episode_len))
                summary.value.add(tag='Perf/episode_reward',
                                  simple_value=float(cum_reward))
                self.summary_writer.add_summary(summary, episode_i)
                print(episode_i)
                print(summary)
                print('worker {}: episode {} finished in {} steps, '
                      'cumulative reward: {}'.format(
                          self.name, episode_i, episode_len, cum_reward))

                if episode_i % 100 == 0 and episode_i != 0:
                    saver.save(self.sess,
                               self.model_path + '/model-' + str(episode_i) + '.cptk')
                    print("Saved Model")

                cum_reward = 0
                episode_i += 1
                episode_len = 0
                cur_state = preprocessor_for_ddn.obs_to_state(self.env.reset())
                break
            cur_state = next_state

        # Save the video for this episode.
        print(f"Saving video to {video_path}")
        video_recorder.close()
        video_recorder.enabled = False
        print("Video saved")