def play(agent_dir, num_episodes, max_episode_steps, save_videos):
    """Restore a trained agent's best checkpoint and play episodes.

    Args:
        agent_dir: Directory holding the "best-weights" checkpoints; also
            where the video monitor output is written.
        num_episodes: Number of completed episodes to play before returning.
        max_episode_steps: Per-episode time limit passed to the env factory.
        save_videos: If True, wrap the env in a Monitor recording every episode.
    """
    # Agent class and env id are read back from the gin config used at train time.
    agent = get_agent(gin.query_parameter("train.agent"))(make_env_fn(
        gin.query_parameter("train.env_id"),
        episode_time_limit=max_episode_steps))
    agent.pretrain_setup(gin.query_parameter("train.total_timesteps"))
    ckpt_path = tf.train.latest_checkpoint(
        os.path.join(agent_dir, "best-weights"))
    checkpoint = tf.train.Checkpoint(agent)
    # expect_partial(): training-only variables (e.g. optimizer slots) are
    # allowed to be missing when only playing.
    checkpoint.restore(
        ckpt_path).assert_existing_objects_matched().expect_partial()
    env = agent.make_env()
    if save_videos:
        # video_callable=lambda _: True records every episode instead of
        # the monitor's default sparse schedule.
        env = Monitor(
            env,
            os.path.join(agent_dir, "monitor"),
            video_callable=lambda _: True,
            force=True,
        )
    try:
        episodes = 0
        obs = env.reset()
        while episodes < num_episodes:
            # Add a batch dimension for the network; strip it again with
            # action[0] before stepping the unbatched env.
            action = agent.act(np.expand_dims(obs, 0),
                               deterministic=True).numpy()
            obs, _, done, _ = env.step(action[0])
            env.render()
            if done:
                obs = env.reset()
                episodes += 1
    except KeyboardInterrupt:
        # Flush the monitor's files on Ctrl-C.
        env.close()
def run(self):
    """ Run the agent to see it work """
    from gym.wrappers import Monitor
    # force=True overwrites any previous recording in ./video.
    env = Monitor(self.env, './video', force=True)
    state = env.reset()
    reward_sum = 0
    episode_number = 0
    while episode_number < 2:
        # forward the policy network and sample an action from the returned probability
        aprob, h = policy_forward(state)
        # randomly take 1 of two actions. we are sampling from a bernoulli
        # distribution here
        action = 0 if np.random.uniform() < aprob else 1
        # step the environment and get new measurements
        state, reward, done, info = env.step(action)
        reward_sum += reward
        env.render()
        if done:  # an episode finished
            episode_number += 1
            print("Episode finished with total reward", reward_sum)
            reward_sum = 0
            state = env.reset()  # reset env
class Environment(object):
    """Thin wrapper around a gym/PLE game with 84x84 grayscale preprocessing."""

    def __init__(self, game, record=False, width=84, height=84, seed=0):
        # game: gym environment id; record: wrap with a video Monitor.
        self.game = gym.make(game)
        self.game.seed(seed)
        if record:
            self.game = Monitor(self.game, './video', force=True)
        self.width = width
        self.height = height
        self._toTensor = T.Compose([T.ToPILImage(), T.ToTensor()])
        # Bare reference so the gym_ple import (env registration side
        # effect) is not flagged as unused.
        gym_ple

    def play_sample(self, mode: str = 'human'):
        """Play one episode with random actions (quick sanity check)."""
        observation = self.game.reset()
        while True:
            screen = self.game.render(mode=mode)
            if mode == 'rgb_array':
                screen = self.preprocess(screen)
            action = self.game.action_space.sample()
            observation, reward, done, info = self.game.step(action)
            if done:
                break
        self.game.close()

    def preprocess(self, screen):
        """Resize, grayscale, and scale a raw frame to float32 in [0, 1]."""
        preprocessed: np.array = cv2.resize(screen, (self.height, self.width))  # resize to 84x84
        preprocessed = np.dot(preprocessed[..., :3], [0.299, 0.587, 0.114])  # convert to grayscale
        # preprocessed: np.array = preprocessed.transpose((2, 0, 1))  # rearrange to (C, W, H)
        preprocessed: np.array = preprocessed.astype('float32') / 255.
        return preprocessed

    def init(self):
        """
        @return observation
        """
        return self.game.reset()

    def get_screen(self):
        # Current frame, preprocessed like a regular observation.
        screen = self.game.render('rgb_array')
        screen = self.preprocess(screen)
        return screen

    def step(self, action: int):
        # Pass-through to the underlying gym env (no preprocessing here).
        observation, reward, done, info = self.game.step(action)
        return observation, reward, done, info

    def reset(self):
        """
        :return: observation array
        """
        observation = self.game.reset()
        observation = self.preprocess(observation)
        return observation

    @property
    def action_space(self):
        # Number of discrete actions, not the Space object itself.
        return self.game.action_space.n
def __init__(self, render=None, max_episode_steps=2000, deterministic=True):
    """Build a wrapped SuperMarioBros-Nes retro env plus its agent.

    Args:
        render: if not None, auto-render every `render` steps.
        max_episode_steps: cap applied by the TimeLimit wrapper.
        deterministic: forwarded to the Agent.
    """
    monitor = None
    action_repeat = True
    episodic_life = True
    env = retro.make("SuperMarioBros-Nes")
    if monitor is not None:
        env = Monitor(env, monitor)
    if render is not None:
        env = AutoRenderer(env, auto_render_period=render)
    if action_repeat:
        env = FrameStack(env, 8)
    env = TimeLimit(env, max_episode_steps=max_episode_steps)
    if episodic_life:
        env = EpisodicLifeEnv(env, [0] * 9)
    env.reset()
    # One no-op step (all 9 buttons released) just to capture the info
    # dict, whose values serve as the observation schema.
    _, _, _, first_info = env.step(
        [0] * 9)  # TODO the order of the info dict is random
    self.first_info = first_info
    env.reset()
    self.env = env
    raw_env = env.unwrapped
    # Button indices for the controls the agent can press.
    self.index_right = raw_env.buttons.index("RIGHT")
    self.index_a = raw_env.buttons.index("A")
    self.index_b = raw_env.buttons.index("B")
    # Observation size = number of entries in the info dict.
    self.obs_shape = len(first_info.values())
    self.agent = Agent(2, self.obs_shape, deterministic, embed=True)
    self.weight_shape = self.agent.weight_shape()
    self.n_weights = self.weight_shape[0] * self.agent.weight_shape()[1]
    # Feature scaler fitted offline and stored alongside the code.
    with open("scaler.pickle", "rb") as pickle_out_file:
        self.scaler = pickle.load(pickle_out_file)
def test_video_callable_records_videos():
    """A default Monitor records the very first episode as a video."""
    with helpers.tempdir() as temp:
        monitored = Monitor(gym.make('CartPole-v0'), temp)
        monitored.reset()
        monitored.close()
        results = monitoring.load_results(temp)
        assert len(results['videos']) == 1, "Videos: {}".format(results['videos'])
def test_video_callable_false_does_not_record():
    """With video_callable=False, no videos must be written."""
    with helpers.tempdir() as temp:
        monitored = Monitor(gym.make('CartPole-v0'), temp, video_callable=False)
        monitored.reset()
        monitored.close()
        assert len(monitoring.load_results(temp)['videos']) == 0
def test_semisuper_succeeds():
    """Regression test. Ensure that this can write"""
    with helpers.tempdir() as temp:
        env = gym.make('SemisuperPendulumDecay-v0')
        # Monitor is an env wrapper taking (env, directory); the legacy
        # Monitor(directory)(env) call form is no longer supported.
        env = Monitor(env, temp)
        env.reset()
        env.step(env.action_space.sample())
        env.close()
def test_semisuper_succeeds():
    """Regression test: the monitor can record a semisupervised env."""
    with helpers.tempdir() as temp:
        monitored = Monitor(gym.make('SemisuperPendulumDecay-v0'), temp)
        monitored.reset()
        monitored.step(monitored.action_space.sample())
        monitored.close()
def test_video_callable_false_does_not_record():
    """With video_callable=False the monitor must write no videos."""
    with helpers.tempdir() as temp:
        env = gym.make('CartPole-v0')
        # Monitor wraps the env directly: Monitor(env, directory, ...).
        # The legacy Monitor(directory, ...)(env) call form no longer exists.
        env = Monitor(env, temp, video_callable=False)
        env.reset()
        env.close()
        results = monitoring.load_results(temp)
        assert len(results['videos']) == 0
def main():
    """Train DDPG on the filtered env, evaluate periodically, and log to CSV."""
    startTime = time.time()
    env = filter_env.makeFilteredEnv(gym.make(ENV_NAME))
    results_file = open("MoreExactReward12.csv", 'a')
    agent = DDPG(env, results_file)
    env = Monitor(env, directory='experiments/' + ENV_NAME, force=True)
    results_file.write("Episodes Spent Training; " + str(TEST) +
                       " Episode Eval Avg; Learned Reward Map \n")
    for episode in range(EPISODES):
        state = env.reset()
        if (episode % 20 == 0):
            print("episode:", episode)
        # Train: act with exploration noise and store transitions.
        for step in range(env.spec.timestep_limit):
            action = agent.noise_action(state)
            next_state, reward, done, _ = env.step(action)
            agent.perceive(state, action, reward, next_state, done)
            state = next_state
            if done:
                break
        # Testing: every 100 episodes (after warm-up), average TEST
        # noise-free episodes.
        if episode % 100 == 0 and episode > 100:
            total_reward = 0
            for i in range(TEST):
                state = env.reset()
                for j in range(env.spec.timestep_limit):
                    #env.render()
                    action = agent.action(state)  # direct action for test
                    state, reward, done, _ = env.step(action)
                    total_reward += reward
                    if done:
                        break
            ave_reward = total_reward / TEST
            print('episode: ', episode, 'Evaluation Average Reward:',
                  ave_reward)
            results_file.write(str(episode) + "; " + str(ave_reward) + ";")
            # Log the learned reward map (last actor-network layer) on one line.
            results_file.write("%s \n" % (np.array_str(
                agent.actor_network.net[-1].eval())).replace("\n", " "))
    results_file.write("Time Training (" + str(EPISODES) + "episodes);" +
                       str(time.time() - startTime) + "\n")
    results_file.write(
        "Final Learned Reward Map; %s \n" %
        (np.array_str(agent.actor_network.net[-1].eval())).replace("\n", " "))
    results_file.write("Evaluation Episode; Reward \n")
    # Final evaluation: 100 noise-free episodes, one reward row each.
    for episode in range(100):
        total_reward = 0
        state = env.reset()
        for j in range(env.spec.timestep_limit):
            action = agent.action(state)  # direct action for test
            state, reward, done, _ = env.step(action)
            total_reward += reward
            if done:
                break
        results_file.write(str(episode) + "; " + str(total_reward) + "\n")
    results_file.write("endExperiment\n\n")
    results_file.close()
def main():
    """Train DDPG with early stopping, then run 100 evaluation episodes."""
    # Early-stop sentinel: lowered once evaluation reward exceeds 800.
    finishedTraining = EPISODES
    startTime = time.time()
    env = filter_env.makeFilteredEnv(gym.make(ENV_NAME))
    results_file = open("ResultsNew.csv", 'a')
    agent = DDPG(env, results_file)
    env = Monitor(env, directory='experiments/' + ENV_NAME, force=True)
    results_file.write("Episodes Spent Training; " + str(TEST) +
                       " Episode Eval Avg \n")
    for episode in range(EPISODES):
        state = env.reset()
        if (episode % 20 == 0):
            print("episode:", episode)
        # Train: act with exploration noise and store transitions.
        for step in range(env.spec.timestep_limit):
            action = agent.noise_action(state)
            next_state, reward, done, _ = env.step(action)
            agent.perceive(state, action, reward, next_state, done)
            state = next_state
            if done:
                break
        # Testing: every 100 episodes (after warm-up), average TEST
        # noise-free episodes.
        if (episode + 1) % 100 == 0 and episode > 100:
            total_reward = 0
            for i in range(TEST):
                state = env.reset()
                for j in range(env.spec.timestep_limit):
                    env.render()
                    action = agent.action(state)  # direct action for test
                    state, reward, done, _ = env.step(action)
                    total_reward += reward
                    if done:
                        break
            ave_reward = total_reward / TEST
            print('episode: ', episode, 'Evaluation Average Reward:',
                  ave_reward)
            results_file.write(str(episode) + "; " + str(ave_reward) + "\n")
            # Once the agent clears 800 average reward, schedule training
            # to stop 300 episodes later.
            if ave_reward > 800 and finishedTraining > episode + 300:
                finishedTraining = episode + 300
        elif (episode >= finishedTraining):
            break
    results_file.write("Time Training (" + str(EPISODES) + "episodes);" +
                       str(time.time() - startTime) + "\n")
    results_file.write("Evaluation Episode; Reward \n")
    # Final evaluation over 100 fixed test scenarios provided by the env.
    for episode in range(100):
        total_reward = 0
        env.reset()
        # set_test(episode) selects a deterministic per-episode scenario;
        # env.env.env unwraps Monitor and filter wrappers.
        state = env.env.env.set_test(episode)
        for j in range(env.spec.timestep_limit):
            action = agent.action(state)  # direct action for test
            state, reward, done, _ = env.step(action)
            total_reward += reward
            if done:
                break
        results_file.write(str(episode) + "; " + str(total_reward) + "\n")
    results_file.write("endExperiment\n\n")
    results_file.close()
def test_video_callable_records_videos():
    """A default monitor records the first episode as a video."""
    with helpers.tempdir() as temp:
        env = gym.make('CartPole-v0')
        # Monitor wraps the env directly: Monitor(env, directory). The
        # legacy Monitor(directory)(env) call form no longer exists.
        env = Monitor(env, temp)
        env.reset()
        env.close()
        results = monitoring.load_results(temp)
        assert len(results['videos']) == 1, "Videos: {}".format(
            results['videos'])
def test_write_upon_reset_false():
    """Nothing is flushed to disk at reset time when write_upon_reset=False."""
    with helpers.tempdir() as temp:
        monitored = Monitor(gym.make('CartPole-v0'),
                            directory=temp,
                            video_callable=False,
                            write_upon_reset=False)
        monitored.reset()

        # No files yet: results are only written when the monitor closes.
        premature = glob.glob(os.path.join(temp, '*'))
        assert not premature, "Files: {}".format(premature)

        monitored.close()
        assert len(glob.glob(os.path.join(temp, '*'))) > 0
def cart_pole_with_qlearning():
    """Solve CartPole-v0 with tabular Q-learning over a binned state space."""
    from gym.wrappers import Monitor
    env = gym.make('CartPole-v0')
    experiment_filename = './cartpole-experiment-1'
    env = Monitor(env, experiment_filename, force=True)
    observation = env.reset()

    goal_average_steps = 195
    max_number_of_steps = 200
    number_of_iterations_to_average = 100

    number_of_features = env.observation_space.shape[0]
    last_time_steps = np.ndarray(0)

    # Bin edges for each of the four observation dimensions.
    cart_position_bins = pd.cut([-2.4, 2.4], bins=10, retbins=True)[1][1:-1]
    pole_angle_bins = pd.cut([-2, 2], bins=10, retbins=True)[1][1:-1]
    cart_velocity_bins = pd.cut([-1, 1], bins=10, retbins=True)[1][1:-1]
    angle_rate_bins = pd.cut([-3.5, 3.5], bins=10, retbins=True)[1][1:-1]

    learner = QLearner(state_discretization=Binning(
        [[-2.4, 2.4], [-2, 2], [-1., 1], [-3.5, 3.5]], [10] * 4),
                       discrete_actions=[i for i in range(env.action_space.n)],
                       alpha=0.2,
                       gamma=1,
                       random_action_rate=0.5,
                       random_action_decay_rate=0.99)

    for episode in range(50000):
        action = learner.set_initial_state(observation)

        for step in range(max_number_of_steps - 1):
            observation, reward, done, info = env.step(action)

            if done:
                reward = -200  # strong penalty for dropping the pole
                observation = env.reset()

            action = learner.move(observation, reward)

            if done:
                # Track a sliding window of recent episode lengths.
                last_time_steps = np.append(last_time_steps, [int(step + 1)])
                if len(last_time_steps) > number_of_iterations_to_average:
                    last_time_steps = np.delete(last_time_steps, 0)
                break

        if last_time_steps.mean() > goal_average_steps:
            # Fixed: the original used Python 2 print statements and
            # unichr, which are errors under Python 3.
            print("Goal reached!")
            print("Episodes before solve: ", episode + 1)
            print(u"Best 100-episode performance {} {} {}".format(
                last_time_steps.max(),
                chr(177),  # plus minus sign
                last_time_steps.std()))
            break

    env.close()
def run(episodes=1):
    """Record `episodes` runs of the obstacle env to the 'out' directory."""
    env = Monitor(gym.make('obstacle-v0'), 'out', force=True)
    for _ in range(episodes):
        env.reset()
        # Capture in-between frames as well as regular steps.
        env.unwrapped.automatic_rendering_callback = env.video_recorder.capture_frame
        done = False
        while not done:
            chosen = env.unwrapped.dynamics.desired_action
            _, _, done, _ = env.step(chosen)
            env.render()
    env.close()
def test_write_upon_reset_true():
    """With write_upon_reset=True, result files exist right after reset()."""
    with helpers.tempdir() as temp:
        env = gym.make('CartPole-v0')
        # Monitor wraps the env directly; the legacy Monitor(**kwargs)(env)
        # call form no longer exists.
        env = Monitor(env,
                      directory=temp,
                      video_callable=False,
                      write_upon_reset=True)
        env.reset()
        files = glob.glob(os.path.join(temp, '*'))
        assert len(files) > 0, "Files: {}".format(files)
        env.close()
        files = glob.glob(os.path.join(temp, '*'))
        assert len(files) > 0
def main():
    """Run a random policy on a monitored MiniGrid env across sampled tasks."""
    base = gym.make('MiniGrid-Empty-v0', size=ENV_SIZE)
    base = ch.envs.Torch(base)
    runner = ch.envs.Runner(base)
    # Record every episode to ./vid.
    env = Monitor(runner, "./vid",
                  video_callable=lambda episode_id: True,
                  force=True)
    for task_config in env.sample_tasks(4):
        env.reset_task(task_config)
        env.reset()
        transition = env.run(get_random_action, episodes=5, render=RENDER)
def test_write_upon_reset_true():
    """Results are written as soon as reset() runs when write_upon_reset=True."""
    with helpers.tempdir() as temp:
        env = gym.make('CartPole-v0')
        # TODO: Fix Cartpole to not configure itself automatically
        # assert not env._configured
        monitored = Monitor(env,
                            directory=temp,
                            video_callable=False,
                            write_upon_reset=True)
        monitored.configure()
        monitored.reset()

        written = glob.glob(os.path.join(temp, '*'))
        assert len(written) > 0, "Files: {}".format(written)

        monitored.close()
        assert len(glob.glob(os.path.join(temp, '*'))) > 0
def cart_pole_1():
    """Drive CartPole-v0 with random actions for up to 100 steps, logging each step."""
    env = gym.make('CartPole-v0')
    # action_space is Discrete(2): actions are the integers 0 and 1.
    # observation_space is Box(4,): a 4-dimensional box; its bounds are
    #   high: [ 4.8000002e+00  3.4028235e+38  4.1887903e-01  3.4028235e+38]
    #   low:  [-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38]
    env = Monitor(env=env,
                  directory='./tmp/cartpole-experiment-0202',
                  video_callable=False,
                  write_upon_reset=True)
    observation = env.reset()  # reset the env state, returns the observation
    for step in range(100):
        env.render()  # redraw one frame of the environment
        print('[cart_pole_1] observation old:', observation)
        action = env.action_space.sample()
        print('[cart_pole_1] action', action)
        # Advance one timestep: returns observation, reward, done, info.
        observation, reward, done, info = env.step(action)
        print('[cart_pole_1] observation new:', observation,
              '[reward, done, info]:', reward, done, info)
        if done:
            print("[observation] Done after {} time steps".format(step + 1))
            break
    env.close()
def test_env_reuse():
    """Autoreset env keeps alternating running/done steps under the monitor."""
    with helpers.tempdir() as temp:
        env = gym.make('Autoreset-v0')
        # Monitor wraps the env directly: Monitor(env, directory). The
        # legacy Monitor(directory)(env) call form no longer exists.
        env = Monitor(env, temp)
        env.reset()

        env.step(None)
        _, _, done, _ = env.step(None)
        assert done

        env.step(None)
        _, _, done, _ = env.step(None)
        assert done

        env.close()
def test_steps_limit_restart():
    """The step-limited env signals done and restarts a new episode."""
    with helpers.tempdir() as temp:
        monitored = Monitor(gym.make('test.StepsLimitCartpole-v0'),
                            temp, video_callable=False)
        monitored.reset()

        # First step: the episode is still running.
        _, _, done, info = monitored.step(monitored.action_space.sample())
        assert not done

        # Second step hits the limit: done is signalled and the env resets
        # itself, bumping the episode counter.
        _, _, done, info = monitored.step(monitored.action_space.sample())
        assert done
        assert monitored.episode_id == 1

        monitored.close()
def enjoy(policy, env, save_path=None, save_video=False, obs_fn=None, nepochs=100):
    """
    Enjoy and flush your result using Monitor class.

    :param policy: trained policy network (moved to GPU, eval mode).
    :param env: gym environment to play in.
    :param save_path: directory videos are written to (required if save_video).
    :param save_video: wrap env in a Monitor that records videos.
    :param obs_fn: optional observation preprocessing function.
    :param nepochs: number of episodes to play.
    """
    if save_video:
        assert save_path is not None, 'A path to save videos must be provided!'
    policy.cuda()
    policy.eval()
    if save_video:
        env = Monitor(env, directory=save_path)

    # Bug fix: the loop previously hardcoded range(0, 100) and ignored the
    # `nepochs` parameter entirely.
    for e in range(nepochs):
        done = False
        obs = env.reset()
        episode_rwd = 0
        while not done:
            env.render()
            if obs_fn is not None:
                obs = obs_fn(obs)
            obs = Variable(torch.from_numpy(obs[np.newaxis])).float().cuda()
            value, action, logprob, mean = policy(obs)
            action = action.data[0].cpu().numpy()
            obs, reward, done, _ = env.step(action)
            episode_rwd += reward
        print('Episode reward is', episode_rwd)
def log_policy_rollout(self, policy, env_name, pytorch_policy=False):
    """Roll out `policy` for one episode in a monitored env and report totals."""
    env = Monitor(gym.make(env_name), './video', force=True)

    done = False
    total_reward = 0
    steps = 0
    observation = env.reset()
    while not done:
        if pytorch_policy:
            # Torch policies take a float32 tensor and return a tensor action.
            observation = torch.tensor(observation, dtype=torch.float32)
            action = policy.act(observation)[0].data.cpu().numpy()
        else:
            action = policy.act(observation)[0]
        observation, reward, done, info = env.step(action)
        total_reward += reward
        steps += 1

    print('Total reward:', total_reward)
    print('Total length:', steps)
    env.close()

    show_video()
class Simulation():
    """Gym episode driver that records video and converts observations to torch tensors."""

    def __init__(self, environment="CartPole-v0", save_every=5):
        env = gym.make(environment)
        # Record a video every `save_every` episodes.
        self.env = Monitor(
            env,
            './video',
            video_callable=lambda episode_no: episode_no % save_every == 0,
            force=True)
        if environment == "Pong-v0":
            # NOTE(review): this wraps the *raw* env, so the Monitor built
            # above is discarded for Pong -- confirm that is intended.
            self.env = wrap_deepmind(env, frame_stack=True, scale=True)
        self.environment = environment
        #self.env.seed(0)

    def reset(self):
        """Reset the env; returns the observation as a (1, ...) float tensor."""
        observation = self.env.reset()
        if self.environment == "Pong-v0":
            # Frame-stacked Pong observations: stack, then move channels first.
            observation = torch.from_numpy(np.stack(observation)).transpose_(
                0, 2).transpose_(1, 2).float().unsqueeze(0)
        else:
            observation = torch.from_numpy(observation).float().unsqueeze(0)
        return observation

    def step(self, action):
        """Step the env; observation is converted the same way as reset()."""
        observation, reward, is_done, info = self.env.step(action)
        if self.environment == "Pong-v0":
            observation = torch.from_numpy(np.stack(observation)).transpose_(
                0, 2).transpose_(1, 2).float().unsqueeze(0)
        else:
            observation = torch.from_numpy(observation).float().unsqueeze(0)
        return observation, reward, is_done, info

    def render(self):
        self.env.render()

    def close(self):
        self.env.close()
def play(N=1000):
    """Play the trained agent for roughly N environment steps with rendering.

    Relies on module-level globals: `agent`, `DQN_online`, `sess_o`,
    `directory`.
    """
    # Change this to 'AssaultNoFrameskip-v4' to play the second game
    env = wrap_atari_deepmind('BreakoutNoFrameskip-v4', False)
    env = Monitor(env, directory + "/", force=True)
    # Copy the trained online-network weights into the playing agent.
    agent.copy(DQN_online[4], sess_o)
    tot_reward = []
    episode = 1
    i = 0
    # Episodes run to completion, so slightly more than N steps may elapse.
    while i < N:
        r = 0
        s = env.reset()
        terminal = False
        episode_reward = 0
        while not terminal:
            env.render()
            a = agent.get_action(agent, env, np.array(s))
            s_next, r, terminal, dizi = env.step(a)
            episode_reward += r
            i = i + 1
            s = s_next
        tot_reward.append(episode_reward)
        print("Episode reward: ", episode_reward)
        episode = episode + 1
    env.close()
def test(model, args, verbose=True):
    """Run one monitored episode with the trained model and report its reward."""
    # Record the rollout to ./recordings and switch the model to eval mode.
    env = Monitor(gym.make(args.env), './recordings', force=True)
    model.eval()

    rewards = []
    done = False
    s = env.reset()
    hx, cx = init_hidden(1, args.size_hidden)

    while not done:
        # Render if enabled
        if args.render:
            env.render()
        # Policy forward pass; sample an action from the softmax distribution.
        logit, _, _, _ = model.forward(s, hx, cx)
        prob = F.softmax(logit, dim=-1)
        action = prob.multinomial(1).data
        s, r, done, _ = env.step(action.squeeze().numpy())
        rewards.append(r)

    # Close environment and show performance
    env.close()
    if verbose is True:
        print('Test agent achieved a reward of', np.sum(rewards))
def test_one(agent, dir_record, itr):
    """Play one recorded evaluation episode and return its total reward."""
    agent.env.seed(itr)
    recorder = Monitor(agent.env, directory=dir_record)

    observation = recorder.reset()
    agent.frame_sequence.insert(atari_img_preprocess(observation))
    while True:
        before = agent.frame_sequence.memory_as_array()

        ## Find next action
        action = agent.next_action()
        observation, reward, done, _ = recorder.step(action)
        agent.frame_sequence.insert(atari_img_preprocess(observation))
        after = agent.frame_sequence.memory_as_array()

        ## Save results into the replay memory (reward clipped to [-1, 1])
        agent.replay_memory.insert(before, action, np.clip(reward, -1, 1),
                                   after, done)
        if done:
            break

    total_reward = recorder.get_episode_rewards()[0]
    recorder.close()
    return total_reward
def evaluate(agent, env, n_episodes=5, render=False, record=False):
    """Return the mean episode reward of `agent` over `n_episodes`."""
    if record:
        env = Monitor(env, './videos/', force=True)

    rewards = []
    for _ in range(n_episodes):
        state = obs_reshape(env.reset())
        episode_reward = 0.0
        steps = 0
        done = False
        while not done:
            # Agent expects a leading batch dimension.
            action = agent.act(state.reshape(1, *state.shape))
            state, reward, done, _ = env.step(action[0])
            state = obs_reshape(state)
            episode_reward += reward
            steps += 1
            if render:
                env.render()
        rewards.append(episode_reward)

    if render:
        env.close()
    return np.mean(rewards)
def main():
    """Evaluate a saved Dueling DQN checkpoint on SpaceInvaders and log the average reward."""
    parser = argparse.ArgumentParser(description='Load the Saved Model')
    parser.add_argument('-checkpoint',
                        '--checkpoint',
                        help='Give me a checkpoint for the model',
                        required=True)
    args = vars(parser.parse_args())
    checkpoint_path = 'saved_networks/DUEL_DQN-SpaceInvaders-v0_evaluation/SpaceInvaders-v0-' + args[
        'checkpoint']
    env = gym.make(ENV_NAME)
    # if TRAIN:  # Train mode
    #     for _ in range(NUM_EPISODES):
    #         terminal = False
    #         observation = env.reset()
    #         for _ in range(random.randint(1, NO_OP_STEPS)):
    #             last_observation = observation
    #             observation, _, _, _ = env.step(0)  # Do nothing
    #         state = agent.get_initial_state(observation, last_observation)
    #         while not terminal:
    #             last_observation = observation
    #             action = agent.get_action(state)
    #             observation, reward, terminal, _ = env.step(action)
    #             # env.render()
    #             processed_observation = preprocess(observation, last_observation)
    #             state = agent.run(state, action, reward, terminal, processed_observation)
    agent = Agent(num_actions=env.action_space.n,
                  checkpoint_path=checkpoint_path)
    env = Monitor(env, './SpaceInvaders-1', force=True)
    total_reward = 0.0
    with open('log_DuelingDQN.txt', 'a+') as open_file:
        for _ in range(NUM_EPISODES_AT_TEST):
            terminal = False
            observation = env.reset()
            # Random number of no-op steps to randomize the start state.
            for _ in range(random.randint(1, NO_OP_STEPS)):
                last_observation = observation
                observation, _, _, _ = env.step(0)  # Do nothing
            state = agent.get_initial_state(observation, last_observation)
            while not terminal:
                last_observation = observation
                action = agent.get_action_at_test(state)
                observation, reward, terminal, _ = env.step(action)
                #env.render()
                processed_observation = preprocess(observation,
                                                   last_observation)
                # Slide the frame stack: drop the oldest frame, append the newest.
                state = np.append(state[1:, :, :], processed_observation,
                                  axis=0)
                ## Collect all the things you want
                total_reward += reward
        avg_reward = total_reward / float(NUM_EPISODES_AT_TEST)
        open_file.write(args['checkpoint'] + '\t' + 'average_reward=' +
                        str(avg_reward))
        open_file.write('\n')
def log_policy_rollout(params, actor, env_name, video_name):
    """Sample one monitored episode with `actor` and print its totals."""
    cur_time = time.strftime("[%Y-%m-%d_%H:%M:%S]", time.localtime())
    # NOTE(review): Monitor expects a *directory*; this passes an .mp4
    # filename -- confirm the recorder output layout is what you want.
    save_path_name = os.path.join(
        params.save_path, 'video',
        '{}->{}{}.mp4'.format(params.prefix, video_name, cur_time))
    env = gen_env(env_name)
    env = Monitor(env, save_path_name, force=True)

    done = False
    episode_reward = 0.
    episode_length = 0.
    action_list = []
    observation = env.reset()
    print('\n > Sampling trajectory...')
    while not done:
        action = actor.gen_action(
            torch.tensor(observation, dtype=torch.float32).cuda())[0]
        action_list.append(action)
        action = action.cpu()
        # Discrete action spaces expect a plain int, not a tensor.
        # (isinstance replaces the unidiomatic `type(...) is type(int(0))`.)
        if isinstance(env.action_space.sample(), int):
            action = int(action)
        observation, reward, done, info = env.step(action)
        episode_reward += reward
        episode_length += 1

    # print("Action Series: {}".format(action_list))
    print(' > Total reward:', episode_reward)
    print(' > Total length:', episode_length)
    print('------------------------------------')
    env.close()
    print('Finished Sampling, saved video in {}.\n'.format(save_path_name))
def run_video_agent(model, eps=500):
    """Record one BipedalWalker episode (up to `eps` steps) driven by `model`."""
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    env = gym.make('BipedalWalker-v3')
    # Record every episode to ./vid.
    env = Monitor(env, './vid', video_callable=lambda episode_id: True,
                  force=True)

    obs = env.reset()
    fitness = 0.0
    # (removed: `last_obs = obs` was assigned but never read)
    for _ in range(eps):
        env.render()
        obs = torch.from_numpy(obs).float().to(device)
        action = (model(obs).detach()).cpu().numpy()
        obs, reward, done, info = env.step(action)
        fitness += reward
        if done:
            break
    env.close()
    print("Best score ", fitness)
def main():
    """
    You can test your game when you finish setting up your environment.
    Input range from 0 to 5:
    0 : South (Down)  1 : North (Up)  2 : East (Right)
    3 : West (Left)   4 : Pick up     5 : Drop off
    """
    GAME = "Assignment1-Taxi-v2"
    env = gym.make(GAME)
    n_state = env.observation_space.n
    n_action = env.action_space.n
    env = Monitor(env, "taxi_simple", force=True)

    s = env.reset()
    for _ in range(100):
        env.render()
        # Read the next action (0-5) from the user.
        action = int(input("Please type in the next action:"))
        s, r, done, info = env.step(action)
        print(s)
        print(r)
        print(done)
        print(info)
    # close environment and monitor
    env.close()
def simulate(env, agent, deterministic=True, num_episodes=3, render=True,
             wait_after_render=1e-3, render_kwargs=None, record_video=False):
    """Roll out `agent` in `env`; returns a list of (steps, return) per episode."""
    render_kwargs = render_kwargs or dict()
    assert env.max_episode_steps > 0
    if record_video:
        env = Monitor(env, directory='./data')

    episode_info = []
    for _ in range(num_episodes):
        obs = env.reset()
        agent.reset()
        steps, ret = 0, 0
        done = False
        while not done:
            if render:
                env.render(**render_kwargs)
                # Brief pause so the render window can keep up.
                time.sleep(wait_after_render)
            # Pure evaluation: no gradients needed.
            with torch.no_grad():
                action = agent.act(obs, deterministic)
            obs, reward, done, _ = env.step(action)
            ret += reward
            steps += 1
        episode_info.append((steps, ret))
    return episode_info
def save_video():
    """Record one Pendulum episode driven by the saved Deep DPG actor."""
    from gym.wrappers import Monitor

    algorithm = "Deep_DPG"
    env_id = "Pendulum-v0"
    env = gym.make(env_id)
    env = Monitor(env, f"videos/{algorithm}-{env_id}", force=True)

    # Action bounds and network sizes come straight from the env spec.
    lower_bound = env.action_space.low.item()
    upper_bound = env.action_space.high.item()
    n_obs = env.observation_space.shape[0]
    n_act = env.action_space.shape[0]

    model = get_actor_model(n_obs, n_act, lower_bound, upper_bound)
    model.load_weights(f"checkpoints/{algorithm}-{env_id}/latest")

    state = env.reset()
    done = False
    while not done:
        batched = tf.expand_dims(state, axis=0)
        action = get_action(batched, model, None, training=False)
        state, _, done = env_step(env, action[0])
    env.close()
class Environment_gym:
    """Monitored pursuit/evasion gym env plus agent state bookkeeping."""

    def __init__(self):
        # Build an environment
        # Create and record episode - remove Monitor statement if recording not desired
        self.env = Monitor(
            gym.make('one-stationary-evader-v0'),
            './tmp/pursuit_evasion_infer_pursuer_vs_stationary_evader',
            force=True)

    def reset(self):
        # Reset agent state to top-left grid corner
        #Reset state
        self.state = self.env.reset()
        #Initialize Agent Parameters
        #Get observed state space
        self.observed_state_space = self.env.get_observed_state_space()
        #Set initial state distribution (one-hot on the initial state)
        self.initial_state_dist = []
        self.initial_state = self.env.get_initial_state()
        # NOTE(review): the loop below reuses self.state as its loop
        # variable, so the value returned is the *last* element of
        # observed_state_space, not the reset observation assigned above --
        # confirm this is intended.
        for self.state in self.observed_state_space:
            if self.state == self.initial_state:
                self.initial_state_dist.append(1)
            else:
                self.initial_state_dist.append(0)
        return self.state
def test_only_complete_episodes_written():
    """The monitor records only episodes that ran to completion."""
    with helpers.tempdir() as temp:
        monitored = Monitor(gym.make('CartPole-v0'), temp, video_callable=False)

        # Run one full episode to completion...
        monitored.reset()
        done = False
        while not done:
            _, _, done, _ = monitored.step(monitored.action_space.sample())

        # ...then start (but do not finish) a second one.
        monitored.reset()
        monitored.step(monitored.action_space.sample())
        monitored.close()

        # Only 1 episode should be written
        results = monitoring.load_results(temp)
        assert len(results['episode_lengths']) == 1, \
            "Found {} episodes written; expecting 1".format(len(results['episode_lengths']))
class OpenAIGym(Environment):
    """TensorForce-style Environment adapter around an OpenAI Gym env."""

    def __init__(self, gym_id, monitor=None, monitor_safe=False, monitor_video=0):
        """
        Initialize OpenAI Gym.

        Args:
            gym_id: OpenAI Gym environment ID. See https://gym.openai.com/envs
            monitor: Output directory. Setting this to None disables monitoring.
            monitor_safe: Setting this to True prevents existing log files to be overwritten. Default False.
            monitor_video: Save a video every monitor_video steps. Setting this to 0 disables recording of videos.
        """
        self.gym_id = gym_id
        self.gym = gym.make(gym_id)  # Might raise gym.error.UnregisteredEnv or gym.error.DeprecatedEnv

        if monitor:
            if monitor_video == 0:
                video_callable = False
            else:
                video_callable = (lambda x: x % monitor_video == 0)
            self.gym = Monitor(self.gym, monitor, force=not monitor_safe, video_callable=video_callable)

    def __str__(self):
        return 'OpenAIGym({})'.format(self.gym_id)

    def close(self):
        # Drop the reference; the gym env cleans up on garbage collection.
        self.gym = None

    def reset(self):
        """Reset the env and return the initial state."""
        return self.gym.reset()

    def execute(self, action):
        """Take one step; returns (state, reward, terminal)."""
        if isinstance(self.gym.action_space, gym.spaces.Box):
            action = [action]  # some gym environments expect a list (f.i. Pendulum-v0)
        state, reward, terminal, _ = self.gym.step(action)
        return state, reward, terminal

    @property
    def states(self):
        # State spec: scalar for Discrete spaces, otherwise the Box shape.
        if isinstance(self.gym.observation_space, Discrete):
            return dict(shape=(), type='float')
        else:
            return dict(shape=tuple(self.gym.observation_space.shape), type='float')

    @property
    def actions(self):
        # Action spec: discrete, single continuous, or one continuous
        # action per dimension of a multi-dimensional space.
        if isinstance(self.gym.action_space, Discrete):
            return dict(continuous=False, num_actions=self.gym.action_space.n)
        elif len(self.gym.action_space.shape) == 1:
            return dict(continuous=True)
        elif len(self.gym.action_space.shape) > 1:
            return {'action' + str(n): dict(continuous=True) for n in range(len(self.gym.action_space.shape))}
        else:
            raise TensorForceError()

    def monitor(self, path):
        """Wrap the current env in a Monitor writing to `path`."""
        self.gym = Monitor(self.gym, path)
class GymEnvironment(VideoCapableEnvironment):
    """
    Wraps an Open AI Gym environment
    """

    def __init__(self, env_name, state_builder=ALEStateBuilder(), repeat_action=4, no_op=30, monitoring_path=None):
        """
        :param env_name: gym environment id.
        :param state_builder: StateBuilder turning raw frames into states.
        :param repeat_action: int frameskip, or (min, max) frameskip tuple.
        :param no_op: upper bound on random initial no-op steps at reset.
        :param monitoring_path: if set, record videos to this directory.
        """
        assert isinstance(state_builder, StateBuilder), 'state_builder should inherit from StateBuilder'
        assert isinstance(repeat_action, (int, tuple)), 'repeat_action should be int or tuple'
        if isinstance(repeat_action, int):
            assert repeat_action >= 1, "repeat_action should be >= 1"
        elif isinstance(repeat_action, tuple):
            assert len(repeat_action) == 2, 'repeat_action should be a length-2 tuple: (min frameskip, max frameskip)'
            assert repeat_action[0] < repeat_action[1], 'repeat_action[0] should be < repeat_action[1]'

        super(GymEnvironment, self).__init__()

        self._state_builder = state_builder
        self._env = gym.make(env_name)
        self._env.env.frameskip = repeat_action
        self._no_op = max(0, no_op)
        self._done = True

        if monitoring_path is not None:
            self._env = Monitor(self._env, monitoring_path, video_callable=need_record)

    @property
    def available_actions(self):
        # Size of the discrete action space.
        return self._env.action_space.n

    @property
    def state(self):
        # Built state, or None before the first reset.
        return None if self._state is None else self._state_builder(self._state)

    @property
    def lives(self):
        # Remaining lives reported by the underlying ALE emulator.
        return self._env.env.ale.lives()

    @property
    def frame(self):
        # Current raw frame as a PIL image.
        return Image.fromarray(self._state)

    def do(self, action):
        """Step the env; returns (built state, reward, done)."""
        self._state, self._reward, self._done, _ = self._env.step(action)
        self._score += self._reward
        return self.state, self._reward, self._done

    def reset(self):
        """Reset the env, applying random no-ops for start-state stochasticity."""
        super(GymEnvironment, self).reset()

        self._state = self._env.reset()

        # Random number of initial no-op to introduce stochasticity
        if self._no_op > 0:
            for _ in six.moves.range(np.random.randint(1, self._no_op)):
                self._state, _, _, _ = self._env.step(0)

        return self.state
def test_env_reuse():
    """Autoreset-v0 alternates not-done/done steps across reuse."""
    with helpers.tempdir() as temp:
        env = Monitor(gym.make('Autoreset-v0'), temp)
        env.reset()

        # Two full cycles: first step running, second step done.
        for _ in range(2):
            _, _, done, _ = env.step(None)
            assert not done
            _, _, done, _ = env.step(None)
            assert done

        env.close()
class GymEnvironment(Environment):
    """Environment adapter around an OpenAI Gym env with optional video monitoring."""

    def __init__(self, env_id, directory=None, force=True, monitor_video=0):
        # directory: if set, wrap with a Monitor; monitor_video controls
        # how often (in episodes) a video is recorded (0 = never).
        super(GymEnvironment, self).__init__(env_id=env_id)
        self._env = gym.make(env_id)

        if directory:
            if monitor_video == 0:
                video_callable = False
            else:
                video_callable = (lambda x: x % monitor_video == 0)
            self._env = Monitor(self._env, directory,
                                video_callable=video_callable, force=force)

    def __str__(self):
        return 'OpenAIGym({})'.format(self._env_id)

    def close(self):
        # Idempotent close.
        if not self._closed:
            self._env.close()
            self._closed = True

    def reset(self, return_spec=True):
        """Reset; returns an EnvSpec (default) or the raw state."""
        self._reset()
        state = self._env.reset()
        if return_spec:
            return EnvSpec(action=None, state=None, reward=0, done=False,
                           next_state=state)
        return state

    def step(self, action, state, return_spec=True):
        """Step the env with action-format massaging for the space type.

        NOTE(review): the inner `isinstance(action, (list, np.ndarray))`
        test is always true inside the outer check, so list/array actions
        are always unwrapped to their first element -- confirm intended.
        """
        self._step()
        if isinstance(action, (list, np.ndarray)):
            if isinstance(self._env.action_space, Discrete) or isinstance(action, (list, np.ndarray)):
                action = action[0]
        if isinstance(self._env.action_space, Box) and not isinstance(action, (list, np.ndarray)):
            action = list(action)
        next_state, reward, done, _ = self._env.step(action)
        if return_spec:
            return EnvSpec(
                action=action,
                state=state,
                reward=reward,
                done=done,
                next_state=next_state)
        return next_state, reward, done

    @property
    def num_states(self):
        # Dimensionality of the (Box) observation space.
        return self._env.observation_space.shape[0]

    @property
    def num_actions(self):
        # Action dimensionality (Box) or action count (Discrete).
        if isinstance(self._env.action_space, Box):
            return self._env.action_space.shape[0]
        else:
            return self._env.action_space.n

    @property
    def is_continuous(self):
        return not isinstance(self._env.action_space, Discrete)
def evaluate(self, n_games=1, save_path="./records", use_monitor=True, record_video=True, verbose=True, t_max=100000):
    """Plays entire games start to end, records the logs (and possibly mp4 video), returns rewards.

    :param n_games: number of complete games to play
    :param save_path: where to save the report
    :param use_monitor: if True, wraps the env in a gym Monitor (required for video)
    :param record_video: if True, records mp4 video
    :param verbose: if True, prints a per-game reward summary
    :param t_max: hard cap on the number of steps per game
    :return: list of total rewards, one per game
    :raises ValueError: if record_video is requested without use_monitor
    """
    env = self.make_env()

    if not use_monitor and record_video:
        # BUG FIX: the original did `raise warn(...)` — warnings.warn()
        # returns None, so that line raised a confusing
        # "TypeError: exceptions must derive from BaseException" instead of
        # a readable error. Raise an explicit ValueError with the same text.
        raise ValueError("Cannot video without gym monitor. If you still want video, set use_monitor to True")

    if record_video:
        env = Monitor(env, save_path, force=True)
    elif use_monitor:
        # Monitor for stats only — never record video.
        env = Monitor(env, save_path, video_callable=lambda i: False, force=True)

    game_rewards = []
    for _ in range(n_games):
        # initial observation
        observation = env.reset()
        # initial agent memory: one zero tensor per recurrent state layer
        prev_memories = [np.zeros((1,) + tuple(mem.output_shape[1:]),
                                  dtype=get_layer_dtype(mem))
                         for mem in self.agent.agent_states]

        t = 0
        total_reward = 0
        while True:
            res = self.agent_step(self.preprocess_observation(observation)[None, ...], *prev_memories)
            action, new_memories = res[0], res[1:]

            observation, reward, done, info = env.step(action[0])
            total_reward += reward
            prev_memories = new_memories

            if done or t >= t_max:
                if verbose:
                    print("Episode finished after {} timesteps with reward={}".format(t + 1, total_reward))
                break
            t += 1
        game_rewards.append(total_reward)

    env.close()
    del env
    return game_rewards
def test_no_monitor_reset_unless_done():
    """A monitored env may only be reset before its first step or after done."""

    def assert_reset_raises(env):
        try:
            env.reset()
        except error.Error:
            return
        assert False, "Env allowed a reset when it shouldn't have"

    with helpers.tempdir() as temp:
        # Without a monitor, resets are unrestricted.
        env = gym.make('CartPole-v0')
        env.reset()
        for _ in range(2):
            env.step(env.action_space.sample())
        env.reset()

        # A fresh monitor permits any number of leading resets...
        env = Monitor(env, temp, video_callable=False)
        env.reset()
        env.reset()
        env.reset()
        # ...but once stepping has started, mid-episode resets are illegal.
        env.step(env.action_space.sample())
        env.step(env.action_space.sample())
        assert_reset_raises(env)

        # After the episode finishes, resetting becomes legal again.
        finished = False
        while not finished:
            _, _, finished, _ = env.step(env.action_space.sample())
        env.reset()
        env.reset()
        env.step(env.action_space.sample())
        assert_reset_raises(env)

        env.close()
class PolicyMonitor(object):
    """
    Helps evaluating a policy by running an episode in an environment,
    saving a video, and plotting summaries to Tensorboard.

    Args:
        env: environment to run in
        policy_net: A policy estimator
        summary_writer: a tf.train.SummaryWriter used to write Tensorboard summaries
        saver: optional tf.train.Saver; if given, a checkpoint is written
            after every evaluation episode.
    """
    def __init__(self, env, policy_net, summary_writer, saver=None):
        # Videos go to a "videos" directory beside the summary logdir.
        self.video_dir = os.path.join(summary_writer.get_logdir(), "../videos")
        self.video_dir = os.path.abspath(self.video_dir)

        # Record every episode (video_callable always True); resume=True
        # keeps any monitor files from previous runs.
        self.env = Monitor(env, directory=self.video_dir, video_callable=lambda x: True, resume=True)
        self.global_policy_net = policy_net
        self.summary_writer = summary_writer
        self.saver = saver
        self.sp = StateProcessor()

        # Checkpoints live beside the logdir as well.
        self.checkpoint_path = os.path.abspath(os.path.join(summary_writer.get_logdir(), "../checkpoints/model"))

        try:
            os.makedirs(self.video_dir)
        except FileExistsError:
            pass

        # Local policy net, evaluated in its own variable scope so its
        # parameters can be copied from the global net.
        with tf.variable_scope("policy_eval"):
            self.policy_net = PolicyEstimator(policy_net.num_outputs)

        # Op to copy params from global policy/value net parameters
        self.copy_params_op = make_copy_params_op(
            tf.contrib.slim.get_variables(scope="global", collection=tf.GraphKeys.TRAINABLE_VARIABLES),
            tf.contrib.slim.get_variables(scope="policy_eval", collection=tf.GraphKeys.TRAINABLE_VARIABLES))

    def _policy_net_predict(self, state, sess):
        """Return the action-probability vector for a single state."""
        feed_dict = { self.policy_net.states: [state] }
        preds = sess.run(self.policy_net.predictions, feed_dict)
        return preds["probs"][0]

    def eval_once(self, sess):
        """Run one evaluation episode; write reward/length summaries.

        Returns (total_reward, episode_length).
        """
        with sess.as_default(), sess.graph.as_default():
            # Copy params to local model
            global_step, _ = sess.run([tf.contrib.framework.get_global_step(), self.copy_params_op])

            # Run an episode
            done = False
            state = atari_helpers.atari_make_initial_state(self.sp.process(self.env.reset()))
            total_reward = 0.0
            episode_length = 0
            while not done:
                action_probs = self._policy_net_predict(state, sess)
                # Sample from the policy distribution rather than argmax.
                action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
                next_state, reward, done, _ = self.env.step(action)
                next_state = atari_helpers.atari_make_next_state(state, self.sp.process(next_state))
                total_reward += reward
                episode_length += 1
                state = next_state

            # Add summaries
            episode_summary = tf.Summary()
            episode_summary.value.add(simple_value=total_reward, tag="eval/total_reward")
            episode_summary.value.add(simple_value=episode_length, tag="eval/episode_length")
            self.summary_writer.add_summary(episode_summary, global_step)
            self.summary_writer.flush()

            if self.saver is not None:
                self.saver.save(sess, self.checkpoint_path)

            tf.logging.info("Eval results at step {}: total_reward {}, episode_length {}".format(global_step, total_reward, episode_length))

            return total_reward, episode_length

    def continuous_eval(self, eval_every, sess, coord):
        """
        Continuously evaluates the policy every [eval_every] seconds.
        """
        try:
            while not coord.should_stop():
                self.eval_once(sess)
                # Sleep until next evaluation cycle
                time.sleep(eval_every)
        except tf.errors.CancelledError:
            # Coordinator shut us down; exit quietly.
            return
# NOTE(review): this chunk is a fragment — the bare `return` statements below
# need an enclosing def (presumably an epsilon-greedy get_action()), which is
# outside this view. The indentation here is reconstructed from syntax.
epsilon = 0.05
if sample > epsilon:
    # Exploit: greedy action from the policy network (argmax over outputs),
    # moved back to the CPU.
    with torch.no_grad():
        return (policy_net(state.to(device)).max(1)[1].data[0]).to(torch.device("cpu"))
else:
    # Explore: uniform random action — hard-codes 4 actions; confirm env.
    return random.randrange(4)

# Record a video every 50th episode; resume=True keeps prior monitor output.
env = Monitor(env, directory=monitor_path, video_callable=lambda count: count % 50 == 0, resume=True)

# Restore the policy network from checkpoint file dqn6.model (the single-item
# loop looks like a leftover from iterating over several checkpoints).
for i in [6]:
    print("Loading Checkpoint from dqn{}.model".format(i))
    checkpoint = torch.load("dqn{}.model".format(i))
    episode = checkpoint['episode']
    policy_net.load_state_dict(checkpoint['state_dict'])

for i_episode in range(200):
    state = env.reset()
    state = process(state)
    # Seed the 4-frame state window by stacking the first frame 4 times
    # along the channel axis.
    state = torch.cat(tuple([state] * 4), dim=1)
    episode_reward = 0
    for t in count():
        action = get_action()
        next_state, reward, done, _ = env.step(action)
        # NOTE(review): num_steps is never initialized in this chunk —
        # presumably defined earlier; confirm.
        num_steps+=1
        episode_reward += reward
        next_state = process(next_state)
        # Slide the frame window: drop the oldest frame, append the newest.
        next_state = torch.cat((state[:,1:,:,:],next_state), dim=1)
        # NOTE(review): `state` is never advanced to `next_state` inside this
        # loop, so the window is always built from the initial state — this
        # looks like a bug; verify against the training-loop version.
        if done:
            break
    print("reward is {}".format(episode_reward))
    print(num_steps)
def deep_q_learning(sess,
                    env,
                    q_estimator,
                    target_estimator,
                    state_processor,
                    num_episodes,
                    experiment_dir,
                    replay_memory_size=500000,
                    replay_memory_init_size=50000,
                    update_target_estimator_every=10000,
                    discount_factor=0.99,
                    epsilon_start=1.0,
                    epsilon_end=0.1,
                    epsilon_decay_steps=500000,
                    batch_size=32,
                    record_video_every=50):
    """
    Q-Learning algorithm for off-policy TD control using Function Approximation.
    Finds the optimal greedy policy while following an epsilon-greedy policy.

    This function is a generator: it yields (total_t, EpisodeStats) after
    every episode, so callers can monitor training as it runs.

    Args:
        sess: Tensorflow Session object
        env: OpenAI environment
        q_estimator: Estimator object used for the q values
        target_estimator: Estimator object used for the targets
        state_processor: A StateProcessor object
        num_episodes: Number of episodes to run for
        experiment_dir: Directory to save Tensorflow summaries in
        replay_memory_size: Size of the replay memory
        replay_memory_init_size: Number of random experiences to sample when
            initializing the replay memory.
        update_target_estimator_every: Copy parameters from the Q estimator to the
            target estimator every N steps
        discount_factor: Gamma discount factor
        epsilon_start: Chance to sample a random action when taking an action.
            Epsilon is decayed over time and this is the start value
        epsilon_end: The final minimum value of epsilon after decaying is done
        epsilon_decay_steps: Number of steps to decay epsilon over
        batch_size: Size of batches to sample from the replay memory
        record_video_every: Record a video every N episodes

    Returns:
        An EpisodeStats object with two numpy arrays for episode_lengths and
        episode_rewards (note: as a generator, the final `return stats` only
        terminates iteration; consume the yielded values instead).
    """

    Transition = namedtuple("Transition", ["state", "action", "reward", "next_state", "done"])

    # The replay memory
    replay_memory = []

    # Keeps track of useful statistics
    stats = plotting.EpisodeStats(
        episode_lengths=np.zeros(num_episodes),
        episode_rewards=np.zeros(num_episodes))

    # Create directories for checkpoints and summaries
    checkpoint_dir = os.path.join(experiment_dir, "checkpoints")
    checkpoint_path = os.path.join(checkpoint_dir, "model")
    monitor_path = os.path.join(experiment_dir, "monitor")

    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)
    if not os.path.exists(monitor_path):
        os.makedirs(monitor_path)

    saver = tf.train.Saver()
    # Load a previous checkpoint if we find one
    latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
    if latest_checkpoint:
        print("Loading model checkpoint {}...\n".format(latest_checkpoint))
        saver.restore(sess, latest_checkpoint)

    # Resume the global step counter from the (possibly restored) graph.
    total_t = sess.run(tf.contrib.framework.get_global_step())

    # The epsilon decay schedule: linear from epsilon_start to epsilon_end.
    epsilons = np.linspace(epsilon_start, epsilon_end, epsilon_decay_steps)

    # The policy we're following
    policy = make_epsilon_greedy_policy(
        q_estimator,
        len(VALID_ACTIONS))

    # Populate the replay memory with initial experience
    print("Populating replay memory...")
    state = env.reset()
    state = state_processor.process(sess, state)
    # 4-frame stack along the channel axis forms the agent's state.
    state = np.stack([state] * 4, axis=2)
    for i in range(replay_memory_init_size):
        action_probs = policy(sess, state, epsilons[min(total_t, epsilon_decay_steps-1)])
        action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
        next_state, reward, done, _ = env.step(VALID_ACTIONS[action])
        next_state = state_processor.process(sess, next_state)
        # Slide the frame window: drop the oldest frame, append the new one.
        next_state = np.append(state[:,:,1:], np.expand_dims(next_state, 2), axis=2)
        replay_memory.append(Transition(state, action, reward, next_state, done))
        if done:
            state = env.reset()
            state = state_processor.process(sess, state)
            state = np.stack([state] * 4, axis=2)
        else:
            state = next_state

    # Record videos
    # Use the gym env Monitor wrapper
    env = Monitor(env,
                  directory=monitor_path,
                  resume=True,
                  video_callable=lambda count: count % record_video_every ==0)

    for i_episode in range(num_episodes):

        # Save the current checkpoint
        saver.save(tf.get_default_session(), checkpoint_path)

        # Reset the environment
        state = env.reset()
        state = state_processor.process(sess, state)
        state = np.stack([state] * 4, axis=2)
        loss = None

        # One step in the environment
        for t in itertools.count():

            # Epsilon for this time step
            epsilon = epsilons[min(total_t, epsilon_decay_steps-1)]

            # Add epsilon to Tensorboard
            episode_summary = tf.Summary()
            episode_summary.value.add(simple_value=epsilon, tag="epsilon")
            q_estimator.summary_writer.add_summary(episode_summary, total_t)

            # Maybe update the target estimator
            if total_t % update_target_estimator_every == 0:
                copy_model_parameters(sess, q_estimator, target_estimator)
                print("\nCopied model parameters to target network.")

            # Print out which step we're on, useful for debugging.
            print("\rStep {} ({}) @ Episode {}/{}, loss: {}".format(
                    t, total_t, i_episode + 1, num_episodes, loss), end="")
            sys.stdout.flush()

            # Take a step
            action_probs = policy(sess, state, epsilon)
            action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
            next_state, reward, done, _ = env.step(VALID_ACTIONS[action])
            next_state = state_processor.process(sess, next_state)
            next_state = np.append(state[:,:,1:], np.expand_dims(next_state, 2), axis=2)

            # If our replay memory is full, pop the first element
            # NOTE(review): list.pop(0) is O(n); a deque(maxlen=...) would be
            # O(1) — left unchanged here.
            if len(replay_memory) == replay_memory_size:
                replay_memory.pop(0)

            # Save transition to replay memory
            replay_memory.append(Transition(state, action, reward, next_state, done))

            # Update statistics
            stats.episode_rewards[i_episode] += reward
            stats.episode_lengths[i_episode] = t

            # Sample a minibatch from the replay memory
            samples = random.sample(replay_memory, batch_size)
            states_batch, action_batch, reward_batch, next_states_batch, done_batch = map(np.array, zip(*samples))

            # Calculate q values and targets (Double DQN): select actions with
            # the online net, evaluate them with the target net.
            q_values_next = q_estimator.predict(sess, next_states_batch)
            best_actions = np.argmax(q_values_next, axis=1)
            q_values_next_target = target_estimator.predict(sess, next_states_batch)
            # Terminal transitions get no bootstrapped future value
            # (np.invert flips the done mask).
            targets_batch = reward_batch + np.invert(done_batch).astype(np.float32) * \
                discount_factor * q_values_next_target[np.arange(batch_size), best_actions]

            # Perform gradient descent update
            states_batch = np.array(states_batch)
            loss = q_estimator.update(sess, states_batch, action_batch, targets_batch)

            if done:
                break

            state = next_state
            total_t += 1

        # Add summaries to tensorboard
        episode_summary = tf.Summary()
        episode_summary.value.add(simple_value=stats.episode_rewards[i_episode], node_name="episode_reward", tag="episode_reward")
        episode_summary.value.add(simple_value=stats.episode_lengths[i_episode], node_name="episode_length", tag="episode_length")
        q_estimator.summary_writer.add_summary(episode_summary, total_t)
        q_estimator.summary_writer.flush()

        # Yield running stats truncated to the episodes completed so far.
        yield total_t, plotting.EpisodeStats(
            episode_lengths=stats.episode_lengths[:i_episode+1],
            episode_rewards=stats.episode_rewards[:i_episode+1])

    # NOTE(review): env.monitor.close() is the legacy gym API; newer gym
    # versions require env.close() on the Monitor wrapper itself — confirm
    # the pinned gym version before changing.
    env.monitor.close()
    return stats