from obstacle_tower_env import ObstacleTowerEnv
import matplotlib.pyplot as plt


def main():
    config = {'starting-floor': 0,
              'total-floors': 5,
              'dense-reward': 1,
              'lighting-type': 0,
              'visual-theme': 0,
              'default-theme': 0,
              'agent-perspective': 1,
              'allowed-rooms': 0,
              'allowed-modules': 0,
              'allowed-floors': 0,
              }
    env = ObstacleTowerEnv('./ObstacleTower/obstacletower', worker_id=1,
                           retro=True, realtime_mode=False, config=config)
    env.seed(1)

    print(env.observation_space)
    print(env.action_space)

    # Show the initial observation, then one random step's observation.
    obs = env.reset()
    plt.imshow(obs)
    plt.show()

    obs, reward, done, info = env.step(env.action_space.sample())
    print('obs', obs)
    print('reward', reward)
    print('done', done)
    print('info', info)
    plt.imshow(obs)
    plt.show()

    env.close()
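
# Note: with retro=True the two prints above typically report an 84x84x3
# uint8 image observation space and a flattened discrete action space; the
# exact spaces depend on the installed obstacle_tower_env version.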
def create_env(starting_floor=0, total_floors=10, worker_id=1):
    """Set up the environment according to the assignment instructions.

    starting_floor must be strictly less than total_floors.
    """
    assert starting_floor < total_floors, \
        "Invalid floors specified. start: {} total: {}".format(
            starting_floor, total_floors)
    config = {'starting-floor': starting_floor,
              'total-floors': total_floors,
              'dense-reward': 1,
              'lighting-type': 0,
              'visual-theme': 0,
              'default-theme': 0,
              'agent-perspective': 1,
              'allowed-rooms': 0,
              'allowed-modules': 0,
              'allowed-floors': 0
              }
    env = ObstacleTowerEnv('./ObstacleTower/obstacletower', worker_id=worker_id,
                           docker_training=False, retro=True,
                           realtime_mode=False, config=config)
    env.seed(1)
    # _ = env.reset()
    return env
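
# Example usage of create_env (a minimal sketch; the floor values and
# worker_id here are illustrative, not prescribed by the assignment):
if __name__ == '__main__':
    env = create_env(starting_floor=0, total_floors=5, worker_id=2)
    print(env.observation_space, env.action_space)
    env.close()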
import argparse
import os
import random

import numpy as np
import torch

from obstacle_tower_env import ObstacleTowerEnv
# Project-local modules; the exact import paths are assumed:
from ppo import PPO, Memory
from wrappers import PyTorchFrame, FrameStack, HumanActionEnv


def main():
    parser = argparse.ArgumentParser(description='PPO Atari')
    parser.add_argument('--checkpoint', type=str, default=None,
                        help='Where checkpoint file should be loaded from '
                             '(usually results/checkpoint.pth)')
    parser.add_argument('--seed', type=int, default=419,
                        help='Random seed for training')
    parser.add_argument('--lr', type=float, default=1e-4,
                        help='learning rate')
    # parser.add_argument('--continue', action='store_true')
    args = parser.parse_args()

    # Create a fresh results/experiment_<i> directory for this run.
    i = 0
    if not os.path.exists("results"):
        os.mkdir("results")
    while True:
        file_name = "results/experiment_" + str(i)
        if not os.path.exists(file_name):
            dir_to_make = file_name
            break
        i += 1
    os.mkdir(dir_to_make)
    save_loc = dir_to_make + "/"
    print("Saving results to", dir_to_make)

    ############## Hyperparameters ##############
    solved_reward = 230      # stop training if avg_reward > solved_reward
    log_interval = 5         # print avg reward in the interval
    max_episodes = 50000     # max training episodes
    max_timesteps = 512      # max timesteps in one episode
    update_timestep = 1024   # update policy every n timesteps
    lr = args.lr             # was hardcoded to 0.1, which ignored the --lr flag
    betas = (0.9, 0.999)
    gamma = 0.7              # discount factor
    K_epochs = 8             # update policy for K epochs
    eps_clip = 0.2           # clip parameter for PPO
    random_seed = args.seed
    #############################################

    # np.random.seed(random_seed)
    random.seed(random_seed)

    config = {'starting-floor': 0, 'total-floors': 9, 'dense-reward': 10,
              'lighting-type': 0, 'visual-theme': 0, 'default-theme': 0,
              'agent-perspective': 1, 'allowed-rooms': 0,
              'allowed-modules': 0, 'allowed-floors': 0,
              }
    worker_id = int(np.random.randint(999, size=1))
    env = ObstacleTowerEnv('./ObstacleTower/obstacletower',
                           docker_training=False, worker_id=worker_id,
                           retro=True, realtime_mode=False, config=config,
                           greyscale=True)
    env.seed(args.seed)
    env = PyTorchFrame(env)
    env = FrameStack(env, 10)
    env = HumanActionEnv(env)

    memory = Memory()

    env_shape = env.observation_space.shape
    state_dim = np.prod(env_shape)
    action_dim = env.action_space.n
    n_latent_var = 600       # number of variables in hidden layer
    ppo = PPO(state_dim, action_dim, n_latent_var, lr, betas, gamma,
              K_epochs, eps_clip)

    if args.checkpoint:
        print(f"Loading a policy - {args.checkpoint}")
        ppo.policy.load_state_dict(torch.load(args.checkpoint))

    # logging variables
    running_reward = 0
    avg_length = 0
    timestep = 0

    # training loop
    for i_episode in range(1, max_episodes + 1):
        state = env.reset()
        for t in range(max_timesteps):
            timestep += 1

            # Running policy_old:
            action = ppo.policy_old.act(np.array(state), memory)
            state, reward, done, _ = env.step(action)

            # Saving reward and is_terminal:
            memory.rewards.append(reward)
            memory.is_terminals.append(done)

            # update if it's time
            if timestep % update_timestep == 0:
                ppo.update(memory)
                memory.clear_memory()
                timestep = 0

            running_reward += reward
            if done:
                break

        avg_length += t

        # stop training if avg_reward > solved_reward
        if running_reward > (log_interval * solved_reward):
            print("########## Solved! ##########")
            # env_name was undefined here; save into the experiment directory.
            torch.save(ppo.policy.state_dict(),
                       os.path.join(save_loc, 'PPO_solved.pth'))
            break

        # logging
        if i_episode % log_interval == 0:
            avg_length = int(avg_length / log_interval)
            running_reward = int(running_reward / log_interval)

            print('Episode {} \t avg length: {} \t reward: {}'.format(
                i_episode, avg_length, running_reward))
            running_reward = 0
            avg_length = 0

            torch.save(ppo.policy.state_dict(),
                       os.path.join(save_loc,
                                    "checkpoint_" + str(i_episode) + "_eps.pth"))
            print("Saved models after", i_episode)

    torch.save(ppo.policy.state_dict(),
               os.path.join(save_loc, "final_checkpoint.pth"))
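
# The training loop above assumes a `Memory` rollout buffer that
# `policy_old.act` fills with states, actions, and log-probs, and that
# `ppo.update` consumes. A minimal sketch of that interface (field names
# inferred from usage; the repo's real class may differ):
class MemorySketch:
    def __init__(self):
        self.actions = []
        self.states = []
        self.logprobs = []
        self.rewards = []
        self.is_terminals = []

    def clear_memory(self):
        del self.actions[:]
        del self.states[:]
        del self.logprobs[:]
        del self.rewards[:]
        del self.is_terminals[:]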
        next_obs, reward, done, info = env.step(action)
        yield big_obs(next_obs, info)
        if done:
            break
        obs = next_obs
    env.close()


if __name__ == '__main__':
    config = {
        'starting-floor': 0,
        'total-floors': 9,
        'dense-reward': 1,
        'lighting-type': 0,
        'visual-theme': 0,
        'default-theme': 0,
        'agent-perspective': 1,
        'allowed-rooms': 0,
        'allowed-modules': 0,
        'allowed-floors': 0,
    }
    env = ObstacleTowerEnv('./ObstacleTower/obstacletower', worker_id=1,
                           docker_training=False, retro=True,
                           realtime_mode=False, config=config)
    env.seed(1)
    agent = RandomAgent(env.observation_space, env.action_space)
    export_video('export_.mp4', 168, 168, 10, run_fn(env, agent))
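
# `RandomAgent` is defined elsewhere in the repo; a minimal sketch of the
# interface this script relies on (implementation assumed):
class RandomAgentSketch:
    def __init__(self, observation_space, action_space):
        self.action_space = action_space

    def act(self, observation):
        # Ignore the observation and sample a uniformly random action.
        return self.action_space.sample()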
import random

import numpy as np
import torch

from obstacle_tower_env import ObstacleTowerEnv
# Project-local modules; the exact import paths are assumed:
from agent import DQNAgent
from replay_buffer import ReplayBuffer
from wrappers import PyTorchFrame
from utils import plot, animate

# `device` was undefined in the original snippet; presumably defined elsewhere.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


def main():
    config = {'starting-floor': 0, 'total-floors': 9, 'dense-reward': 1,
              'lighting-type': 0, 'visual-theme': 0, 'default-theme': 0,
              'agent-perspective': 1, 'allowed-rooms': 0,
              'allowed-modules': 0, 'allowed-floors': 0,
              }
    env = ObstacleTowerEnv('./ObstacleTower/obstacletower', worker_id=1,
                           retro=True, realtime_mode=False, config=config)
    print(env.observation_space)
    print(env.action_space)

    hyper_params = {
        "seed": 6,                       # which seed to use
        "replay-buffer-size": int(5e3),  # replay buffer size
        "learning-rate": 1e-4,           # learning rate for Adam optimizer
        "discount-factor": 0.99,         # discount factor
        "num-steps": int(1e6),           # total number of steps to run the environment for
        "batch-size": 32,                # number of transitions to optimize at the same time
        "learning-starts": 5000,         # number of steps before learning starts
        "learning-freq": 1,              # number of iterations between every optimization step
        "use-double-dqn": True,          # use double deep Q-learning
        "target-update-freq": 1000,      # number of iterations between every target network update
        "eps-start": 1.0,                # e-greedy start threshold
        "eps-end": 0.01,                 # e-greedy end threshold
        "eps-fraction": 0.05,            # fraction of num-steps over which epsilon is annealed
        "print-freq": 10,
    }

    np.random.seed(hyper_params["seed"])
    random.seed(hyper_params["seed"])

    # assert "NoFrameskip" in hyper_params["env"], "Require environment with no frameskip"
    # env = gym.make(hyper_params["env"])
    env.seed(hyper_params["seed"])

    # env = NoopResetEnv(env, noop_max=30)
    # env = MaxAndSkipEnv(env, skip=4)
    # env = EpisodicLifeEnv(env)
    # env = FireResetEnv(env)
    # env = WarpFrame(env)
    env = PyTorchFrame(env)
    # env = ClipRewardEnv(env)
    # env = FrameStack(env, 4)

    replay_buffer = ReplayBuffer(hyper_params["replay-buffer-size"])
    agent = DQNAgent(
        env.observation_space,
        env.action_space,
        replay_buffer,
        use_double_dqn=hyper_params["use-double-dqn"],
        lr=hyper_params["learning-rate"],
        batch_size=hyper_params["batch-size"],
        gamma=hyper_params["discount-factor"],
    )

    # Resume from a saved checkpoint and keep counting episodes from there.
    model_num = 500
    agent.policy_network.load_state_dict(
        torch.load('./Models/' + str(model_num) + '_policy.pt',
                   map_location=device))

    eps_timesteps = hyper_params["eps-fraction"] * float(hyper_params["num-steps"])
    episode_rewards = [0.0]
    ep_nums = model_num

    state = env.reset()
    for t in range(hyper_params["num-steps"]):
        # Epsilon-greedy action selection: anneal epsilon linearly from
        # eps-start to eps-end over the first eps_timesteps steps.
        fraction = min(1.0, float(t) / eps_timesteps)
        eps_threshold = hyper_params["eps-start"] + fraction * (
            hyper_params["eps-end"] - hyper_params["eps-start"])
        sample = random.random()
        if sample > eps_threshold:
            action = agent.act(np.array(state))
        else:
            action = env.action_space.sample()

        # Take a step and store the transition (done cast to float).
        next_state, reward, done, _ = env.step(action)
        agent.memory.add(state, action, reward, next_state, float(done))
        state = next_state

        episode_rewards[-1] += reward
        if done:
            state = env.reset()
            episode_rewards.append(0.0)
            ep_nums += 1
            if ep_nums % 50 == 0:
                agent.save_models(ep_nums)
                plot(episode_rewards, ep_nums)

        if t > hyper_params["learning-starts"] and t % hyper_params["learning-freq"] == 0:
            agent.optimise_td_loss()

        if t > hyper_params["learning-starts"] and t % hyper_params["target-update-freq"] == 0:
            agent.update_target_network()

        num_episodes = len(episode_rewards)

        if done and hyper_params["print-freq"] is not None and \
                len(episode_rewards) % hyper_params["print-freq"] == 0:
            mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
print("********************************************************") print("steps: {}".format(t)) print("episodes: {}".format(num_episodes)) print("mean 100 episode reward: {}".format(mean_100ep_reward)) print("% time spent exploring: {}".format(int(100 * eps_threshold))) print("********************************************************") #if done and ep_nums % 10 == 0: # animate(env,agent,"anim/progress_"+str(ep_nums)) # state = env.reset() animate(env,agent,"anim/final") env.close()
        file_name = "results/experiment_" + str(i)
        if not os.path.exists(file_name):
            dir_to_make = file_name
            break
        i += 1
    os.mkdir(dir_to_make)
    save_loc = dir_to_make + "/"
    print("Saving results to", dir_to_make)

    config = {'starting-floor': 0, 'total-floors': 9, 'dense-reward': 1,
              'lighting-type': 0, 'visual-theme': 0, 'default-theme': 0,
              'agent-perspective': 1, 'allowed-rooms': 0,
              'allowed-modules': 0, 'allowed-floors': 0,
              }
    worker_id = int(np.random.randint(999, size=1))
    env = ObstacleTowerEnv('./ObstacleTower/obstacletower',
                           docker_training=False, worker_id=worker_id,
                           retro=True, realtime_mode=False, config=config,
                           greyscale=True)
    env.seed(args.seed)
    np.random.seed(args.seed)
    random.seed(args.seed)

    env = PyTorchFrame(env)  # Change Name
    # env = FrameStack(env, 10)
    env = HumanActionEnv(env)

    state = env.reset()

    # Defines shapes for placeholders in tf graphs
    state_shape = state.shape
    frame_height = state.shape[1]
    frame_width = state.shape[2]
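
# `PyTorchFrame` is a project-local wrapper; a common implementation (a
# sketch under assumptions, not necessarily this repo's) moves the channel
# axis first so observations match PyTorch's CHW convention:
import gym
import numpy as np


class PyTorchFrameSketch(gym.ObservationWrapper):
    """Reorder image observations from HWC to CHW."""

    def __init__(self, env):
        super().__init__(env)
        shape = env.observation_space.shape
        self.observation_space = gym.spaces.Box(
            low=0.0, high=255.0, shape=(shape[-1], shape[0], shape[1]),
            dtype=env.observation_space.dtype)

    def observation(self, observation):
        # Move the trailing channel axis to the front: (H, W, C) -> (C, H, W).
        return np.rollaxis(observation, 2)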
#                            config=config)
#         return env
#     return _thunk
#
# envs = [make_env(i) for i in range(1, num_envs)]
# envs = SubprocVecEnv(envs)

config = {'starting-floor': 0, 'total-floors': 9, 'dense-reward': 1,
          'lighting-type': 0, 'visual-theme': 0, 'default-theme': 0,
          'agent-perspective': 1, 'allowed-rooms': 0,
          'allowed-modules': 0, 'allowed-floors': 0,
          }
env = ObstacleTowerEnv('./ObstacleTower/obstacletower', worker_id=1,
                       docker_training=False, retro=True,
                       realtime_mode=False, config=config)


def init_weights(m):
    if isinstance(m, nn.Linear):
        nn.init.normal_(m.weight, mean=0., std=0.1)
        nn.init.constant_(m.bias, 0.1)


class ActorCritic(nn.Module):
    def __init__(self, num_inputs, num_outputs, hidden_size, std=0.0):
        super(ActorCritic, self).__init__()

        self.critic = nn.Sequential(
            nn.Linear(num_inputs, hidden_size),
            nn.ReLU(),
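
# The class body above is cut off. A self-contained sketch of the usual
# actor-critic completion (a value head plus a categorical policy head; the
# repo's actual continuation is not shown, so treat this as assumed):
import torch.nn as nn
from torch.distributions import Categorical


class ActorCriticSketch(nn.Module):
    def __init__(self, num_inputs, num_outputs, hidden_size):
        super().__init__()
        self.critic = nn.Sequential(
            nn.Linear(num_inputs, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, 1),        # state-value head
        )
        self.actor = nn.Sequential(
            nn.Linear(num_inputs, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, num_outputs),
            nn.Softmax(dim=-1),               # action probabilities
        )
        self.apply(init_weights)

    def forward(self, x):
        dist = Categorical(self.actor(x))
        value = self.critic(x)
        return dist, value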
          'total-floors': 9,
          'dense-reward': 1,
          'lighting-type': 0,
          'visual-theme': 0,
          'default-theme': 0,
          'agent-perspective': 1,
          'allowed-rooms': 0,
          'allowed-modules': 0,
          'allowed-floors': 0,
          }
worker_id = int(np.random.randint(999, size=1))
print(worker_id)
env = ObstacleTowerEnv('./ObstacleTower/obstacletower', docker_training=False,
                       worker_id=worker_id, retro=True,
                       realtime_mode=args.realtime, config=config,
                       greyscale=False)
env = ObstacleTowerEvaluation(env, eval_seeds)

while not env.evaluation_complete:
    # Deleted the try/except because the error txt file was confusing.
    episode_rew = run_episode(env)

env.close()

# error_occurred is presumably set earlier in the original script.
if error_occurred:
    print(-100.0)
else:
    print(env.results['average_reward'] * 10000)
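
# `run_episode` is defined elsewhere in this script; a minimal sketch of the
# rollout it presumably performs (the real version would act with the trained
# policy rather than sampling randomly):
def run_episode_sketch(env):
    obs = env.reset()
    done = False
    episode_reward = 0.0
    while not done:
        obs, reward, done, info = env.step(env.action_space.sample())
        episode_reward += reward
    return episode_reward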
if __name__ == "__main__": config = { 'starting-floor': 0, 'total-floors': 9, 'dense-reward': 1, 'lighting-type': 0, 'visual-theme': 0, 'default-theme': 0, 'agent-perspective': 1, 'allowed-rooms': 0, 'allowed-modules': 0, 'allowed-floors': 0, } env = ObstacleTowerEnv('./ObstacleTower/obstacletower', worker_id=1, retro=True, realtime_mode=True, config=config) # env = WarpFrame(env) # env = PyTorchFrame(env) # env = ClipRewardEnv(env) # env = FrameStack(env, 4) agent = MyAgent(env.observation_space, env.action_space) state = env.reset() for t in itertools.count(): env.render() # Animate action = agent.act(np.array(state)) next_state, reward, done, _ = env.step(action)