Example #1
import matplotlib.pyplot as plt
from obstacle_tower_env import ObstacleTowerEnv


def main():
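    # Obstacle Tower reset parameters: starting-floor/total-floors bound the
    # generated floors, dense-reward=1 selects the dense reward function, and the
    # remaining keys fix the lighting, theme, agent perspective, and the allowed
    # room/module/floor variation.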
    config = {'starting-floor': 0, 'total-floors': 5, 'dense-reward': 1,
              'lighting-type': 0, 'visual-theme': 0, 'default-theme': 0, 'agent-perspective': 1, 'allowed-rooms': 0,
              'allowed-modules': 0,
              'allowed-floors': 0,
              }
    env = ObstacleTowerEnv('./ObstacleTower/obstacletower',
                           worker_id=1, retro=True, realtime_mode=False, config=config)
    env.seed(1)
    print(env.observation_space)
    print(env.action_space)

    obs = env.reset()

    plt.imshow(obs)
    plt.show()

    obs, reward, done, info = env.step(env.action_space.sample())
    print('obs', obs)
    print('reward', reward)
    print('done', done)
    print('info', info)

    plt.imshow(obs)
    plt.show()
    env.close()
Example #2
def create_env(starting_floor=0, total_floors=10, worker_id=1):
    """
    Set up the environment according to the assignment instructions.
    starting_floor must be strictly smaller than total_floors (enforced by the
    assert below).
    """
    assert starting_floor < total_floors, \
        "Invalid floors specified: start={} total={}".format(starting_floor, total_floors)

    config = {'starting-floor': starting_floor, 'total-floors': total_floors, 'dense-reward': 1,
              'lighting-type': 0, 'visual-theme': 0, 'default-theme': 0, 'agent-perspective': 1, 'allowed-rooms': 0,
              'allowed-modules': 0,
              'allowed-floors': 0
              }
    env = ObstacleTowerEnv('./ObstacleTower/obstacletower', worker_id=worker_id, docker_training=False,
                           retro=True, realtime_mode=False, config=config)
    env.seed(1)
    #_ = env.reset()
    return env
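Since the snippet only defines the factory, here is a minimal usage sketch (an illustration, not part of the original example): build the environment with the defaults, take one random step, and close it.

env = create_env(starting_floor=0, total_floors=10, worker_id=2)
obs = env.reset()
obs, reward, done, info = env.step(env.action_space.sample())
print(reward, done)
env.close()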
Example #3
def main():

    parser = argparse.ArgumentParser(description='PPO for Obstacle Tower')
    parser.add_argument(
        '--checkpoint',
        type=str,
        default=None,
        help='Where the checkpoint file should be loaded from '
             '(usually results/checkpoint.pth)')
    parser.add_argument('--seed',
                        type=int,
                        default=419,
                        help='Random seed for training')
    parser.add_argument('--lr', type=float, default=1e-4, help="learning rate")
    # parser.add_argument('--continue', action='store_true')
    args = parser.parse_args()

    i = 0
    if not os.path.exists("results"):
        os.mkdir("results")
    while True:
        file_name = "results/experiment_" + str(i)
        if not os.path.exists(file_name):
            dir_to_make = file_name
            break
        i += 1

    os.mkdir(dir_to_make)
    save_loc = dir_to_make + "/"
    print("Saving results to", dir_to_make)
    ############## Hyperparameters ##############
    solved_reward = 230  # stop training if avg_reward > solved_reward
    log_interval = 5  # print avg reward in the interval
    max_episodes = 50000  # max training episodes
    max_timesteps = 512  # max timesteps in one episode
    n_latent_var = 32  # number of variables in hidden layer
    update_timestep = 1024  # update policy every n timesteps
    lr = args.lr  # learning rate taken from the --lr argument
    betas = (0.9, 0.999)
    gamma = 0.7  # discount factor
    K_epochs = 8  # update policy for K epochs
    eps_clip = 0.2  # clip parameter for PPO
    random_seed = args.seed
    #############################################

    # np.random.seed(random_seed)
    random.seed(random_seed)
    config = {
        'starting-floor': 0,
        'total-floors': 9,
        'dense-reward': 10,
        'lighting-type': 0,
        'visual-theme': 0,
        'default-theme': 0,
        'agent-perspective': 1,
        'allowed-rooms': 0,
        'allowed-modules': 0,
        'allowed-floors': 0,
    }
    worker_id = int(np.random.randint(999, size=1))
    env = ObstacleTowerEnv('./ObstacleTower/obstacletower',
                           docker_training=False,
                           worker_id=worker_id,
                           retro=True,
                           realtime_mode=False,
                           config=config,
                           greyscale=True)
    env.seed(args.seed)
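    # Wrapper stack (project-specific helpers): PyTorchFrame presumably reorders
    # observations into channel-first tensors, FrameStack keeps the last 10
    # frames, and HumanActionEnv restricts the action space to a smaller,
    # human-playable set of actions.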
    env = PyTorchFrame(env)
    env = FrameStack(env, 10)
    env = HumanActionEnv(env)

    memory = Memory()
    env_shape = env.observation_space.shape
    state_dim = np.prod(env_shape)
    action_dim = env.action_space.n
    n_latent_var = 600  # overrides the value set in the hyperparameter block above
    ppo = PPO(state_dim, action_dim, n_latent_var, lr, betas, gamma, K_epochs,
              eps_clip)
    if args.checkpoint:
        print(f"Loading a policy - {args.checkpoint}")
        ppo.policy.load_state_dict(torch.load(args.checkpoint))

    # logging variables
    running_reward = 0
    avg_length = 0
    timestep = 0

    # training loop
    for i_episode in range(1, max_episodes + 1):
        state = env.reset()
        for t in range(max_timesteps):
            timestep += 1

            # Running policy_old:
            action = ppo.policy_old.act(np.array(state), memory)
            state, reward, done, _ = env.step(action)

            # Saving reward and is_terminal:
            memory.rewards.append(reward)
            memory.is_terminals.append(done)

            # update if its time
            if timestep % update_timestep == 0:
                ppo.update(memory)
                memory.clear_memory()
                timestep = 0
            running_reward += reward

            if done:
                break

        avg_length += t
        # stop training if avg_reward > solved_reward
        if running_reward > (log_interval * solved_reward):
            print("########## Solved! ##########")
            torch.save(ppo.policy.state_dict(),
                       os.path.join(save_loc, 'PPO_solved.pth'))
            break

        # logging
        if i_episode % log_interval == 0:
            avg_length = int(avg_length / log_interval)
            running_reward = int((running_reward / log_interval))

            print('Episode {} \t avg length: {} \t reward: {}'.format(
                i_episode, avg_length, running_reward))
            running_reward = 0
            avg_length = 0
            torch.save(
                ppo.policy.state_dict(),
                os.path.join(save_loc,
                             "checkpoint_" + str(i_episode) + "_eps.pth"))
            print("Saved models after", i_episode)
    torch.save(ppo.policy.state_dict(),
               os.path.join(save_loc, "final_checkpoint.pth"))
Example #4
        next_obs, reward, done, info = env.step(action)
        yield big_obs(next_obs, info)
        if done:
            break
        obs = next_obs
    env.close()


if __name__ == '__main__':
    config = {
        'starting-floor': 0,
        'total-floors': 9,
        'dense-reward': 1,
        'lighting-type': 0,
        'visual-theme': 0,
        'default-theme': 0,
        'agent-perspective': 1,
        'allowed-rooms': 0,
        'allowed-modules': 0,
        'allowed-floors': 0,
    }
    env = ObstacleTowerEnv('./ObstacleTower/obstacletower',
                           worker_id=1,
                           docker_training=False,
                           retro=True,
                           realtime_mode=False,
                           config=config)
    env.seed(1)
    agent = RandomAgent(env.observation_space, env.action_space)
    export_video('export_.mp4', 168, 168, 10, run_fn(env, agent))
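RandomAgent itself is not shown in the snippet; a minimal sketch of the kind of agent the call above implies, assuming an act(observation) interface that simply samples from the action space:

class RandomAgent:
    def __init__(self, observation_space, action_space):
        self.observation_space = observation_space
        self.action_space = action_space

    def act(self, observation):
        # Ignore the observation and return a uniformly random action.
        return self.action_space.sample()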
Example #5
def main():
    config = {'starting-floor': 0, 'total-floors': 9, 'dense-reward': 1,
              'lighting-type': 0, 'visual-theme': 0, 'default-theme': 0, 'agent-perspective': 1, 'allowed-rooms': 0,
              'allowed-modules': 0,
              'allowed-floors': 0,
              }
    env = ObstacleTowerEnv('./ObstacleTower/obstacletower',
                           worker_id=1, retro=True, realtime_mode=False, config=config)
    print(env.observation_space)
    print(env.action_space)

    hyper_params = {
        "seed": 6,  # which seed to use
        "replay-buffer-size": int(5e3),  # replay buffer size
        "learning-rate": 1e-4,  # learning rate for Adam optimizer
        "discount-factor": 0.99,  # discount factor
        "num-steps": int(1e6),  # total number of steps to run the environment for
        "batch-size": 32,  # number of transitions to optimize at the same time
        "learning-starts": 5000,  # number of steps before learning starts
        "learning-freq": 1,  # number of iterations between every optimization step
        "use-double-dqn": True,  # use double deep Q-learning
        "target-update-freq": 1000,  # number of iterations between every target network update
        "eps-start": 1.0,  # e-greedy start threshold
        "eps-end": 0.01,  # e-greedy end threshold
        "eps-fraction": 0.05,  # fraction of num-steps
        "print-freq": 10
    }

    np.random.seed(hyper_params["seed"])
    random.seed(hyper_params["seed"])

    #assert "NoFrameskip" in hyper_params["env"], "Require environment with no frameskip"
    #env = gym.make(hyper_params["env"])
    env.seed(hyper_params["seed"])

    #env = NoopResetEnv(env, noop_max=30)
    #env = MaxAndSkipEnv(env, skip=4)
    #env = EpisodicLifeEnv(env)
    #env = FireResetEnv(env)
    # env = WarpFrame(env)
    env = PyTorchFrame(env)
    # env = ClipRewardEnv(env)
    # env = FrameStack(env, 4)

    replay_buffer = ReplayBuffer(hyper_params["replay-buffer-size"])

    agent = DQNAgent(
        env.observation_space,
        env.action_space,
        replay_buffer,
        use_double_dqn=hyper_params["use-double-dqn"],
        lr=hyper_params["learning-rate"],
        batch_size=hyper_params["batch-size"],
        gamma=hyper_params["discount-factor"]
    )

    model_num = 500
    agent.policy_network.load_state_dict(torch.load('./Models/' + str(model_num) + '_policy.pt',map_location=torch.device(device)))

    eps_timesteps = hyper_params["eps-fraction"] * float(hyper_params["num-steps"])
    episode_rewards = [0.0]
    ep_nums = model_num

    state = env.reset()
    for t in range(hyper_params["num-steps"]):
        fraction = min(1.0, float(t) / eps_timesteps)
        eps_threshold = hyper_params["eps-start"] + fraction * (hyper_params["eps-end"] - hyper_params["eps-start"])
        sample = random.random()
        # Epsilon-greedy action selection: exploit the learned policy with
        # probability (1 - eps_threshold), otherwise take a random action; then
        # step the environment, store the transition in the replay buffer, and
        # accumulate the episode reward.
        if sample > eps_threshold:
            action = agent.act(np.array(state))
        else:
            action = env.action_space.sample()

        next_state, reward, done, _ = env.step(action)
        agent.memory.add(state, action, reward, next_state, float(done))
        state = next_state

        episode_rewards[-1] += reward
        if done:
            state = env.reset()
            episode_rewards.append(0.0)
            ep_nums += 1
            if ep_nums % 50 == 0:
                agent.save_models(ep_nums)
                plot(episode_rewards, ep_nums)




        if t > hyper_params["learning-starts"] and t % hyper_params["learning-freq"] == 0:
            agent.optimise_td_loss()

        if t > hyper_params["learning-starts"] and t % hyper_params["target-update-freq"] == 0:
            agent.update_target_network()

        num_episodes = len(episode_rewards)

        if (done and hyper_params["print-freq"] is not None
                and len(episode_rewards) % hyper_params["print-freq"] == 0):
            mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
            print("********************************************************")
            print("steps: {}".format(t))
            print("episodes: {}".format(num_episodes))
            print("mean 100 episode reward: {}".format(mean_100ep_reward))
            print("% time spent exploring: {}".format(int(100 * eps_threshold)))
            print("********************************************************")


        #if done and ep_nums % 10 == 0:
        #    animate(env,agent,"anim/progress_"+str(ep_nums))
        #    state = env.reset()

    animate(env, agent, "anim/final")


    env.close()
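The loop above calls plot(episode_rewards, ep_nums), which is not defined in the snippet; a minimal sketch of such a helper, assuming matplotlib and an illustrative output filename:

import matplotlib.pyplot as plt


def plot(episode_rewards, ep_num):
    # Save the reward curve collected so far; the filename is only illustrative.
    plt.figure()
    plt.plot(episode_rewards)
    plt.xlabel('episode')
    plt.ylabel('reward')
    plt.savefig('rewards_{}.png'.format(ep_num))
    plt.close()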
Example #6
        file_name = "results/experiment_"+str(i)
        if not os.path.exists(file_name):
            dir_to_make = file_name
            break
        i+=1
    os.mkdir(dir_to_make)
    save_loc = dir_to_make+"/"
    print("Saving results to", dir_to_make)
    config = {'starting-floor': 0, 'total-floors': 9, 'dense-reward': 1,
              'lighting-type': 0, 'visual-theme': 0, 'default-theme': 0, 'agent-perspective': 1, 'allowed-rooms': 0,
              'allowed-modules': 0,
              'allowed-floors': 0,
              }

    worker_id = int(np.random.randint(999, size=1))
    env = ObstacleTowerEnv('./ObstacleTower/obstacletower', docker_training=False, worker_id=worker_id,retro=True, realtime_mode=False, config=config, greyscale=True)
    env.seed(args.seed)
    np.random.seed(args.seed)
    random.seed(args.seed)
    env = PyTorchFrame(env) # Change Name
    # env = FrameStack(env, 10)
    env = HumanActionEnv(env)


    state = env.reset()


    # Defines shapes for placeholders in tf graphs
    state_shape = state.shape
    frame_height = state.shape[1]
    frame_width = state.shape[2]
Example #7
#                       config=config)
#        return env
#
#    return _thunk
#
#envs = [make_env(i) for i in range(1, num_envs)]
#envs = SubprocVecEnv(envs)

config = {'starting-floor': 0, 'total-floors': 9, 'dense-reward': 1,
          'lighting-type': 0, 'visual-theme': 0, 'default-theme': 0, 'agent-perspective': 1, 'allowed-rooms': 0,
          'allowed-modules': 0,
          'allowed-floors': 0,
          }

env = ObstacleTowerEnv('./ObstacleTower/obstacletower', worker_id=1, docker_training=False, retro=True,
                       realtime_mode=False,
                       config=config)

def init_weights(m):
    if isinstance(m, nn.Linear):
        nn.init.normal_(m.weight, mean=0., std=0.1)
        nn.init.constant_(m.bias, 0.1)
        

class ActorCritic(nn.Module):
    def __init__(self, num_inputs, num_outputs, hidden_size, std=0.0):
        super(ActorCritic, self).__init__()
        
        self.critic = nn.Sequential(
            nn.Linear(num_inputs, hidden_size),
            nn.ReLU(),
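The ActorCritic definition above is cut off; for orientation only, a generic discrete-action actor-critic in the same style might look like the sketch below (an assumption standing in for the truncated code, not the author's exact network).

import torch.nn as nn
from torch.distributions import Categorical


class ActorCriticSketch(nn.Module):
    def __init__(self, num_inputs, num_outputs, hidden_size):
        super().__init__()
        # Value head: state -> scalar value estimate.
        self.critic = nn.Sequential(
            nn.Linear(num_inputs, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, 1),
        )
        # Policy head: state -> categorical distribution over discrete actions.
        self.actor = nn.Sequential(
            nn.Linear(num_inputs, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, num_outputs),
            nn.Softmax(dim=-1),
        )

    def forward(self, x):
        dist = Categorical(self.actor(x))
        value = self.critic(x)
        return dist, value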
Example #8
        'total-floors': 9,
        'dense-reward': 1,
        'lighting-type': 0,
        'visual-theme': 0,
        'default-theme': 0,
        'agent-perspective': 1,
        'allowed-rooms': 0,
        'allowed-modules': 0,
        'allowed-floors': 0,
    }
    worker_id = int(np.random.randint(999, size=1))
    print(worker_id)
    env = ObstacleTowerEnv('./ObstacleTower/obstacletower',
                           docker_training=False,
                           worker_id=worker_id,
                           retro=True,
                           realtime_mode=args.realtime,
                           config=config,
                           greyscale=False)
    env = ObstacleTowerEvaluation(env, eval_seeds)

    while not env.evaluation_complete:
        # Deleted the try catch because the error txt file was confusing
        episode_rew = run_episode(env)

    env.close()
    if error_occurred:
        print(-100.0)
    else:
        print(env.results['average_reward'] * 10000)
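run_episode is called above but not defined in the snippet; a minimal sketch of such a helper under the evaluation loop shown here, with a random policy standing in for the actual agent (an assumption for illustration):

def run_episode(env):
    # Play one episode to completion and return the accumulated reward.
    done = False
    episode_reward = 0.0
    obs = env.reset()
    while not done:
        action = env.action_space.sample()  # placeholder policy
        obs, reward, done, info = env.step(action)
        episode_reward += reward
    return episode_reward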
Example #9
if __name__ == "__main__":
    config = {
        'starting-floor': 0,
        'total-floors': 9,
        'dense-reward': 1,
        'lighting-type': 0,
        'visual-theme': 0,
        'default-theme': 0,
        'agent-perspective': 1,
        'allowed-rooms': 0,
        'allowed-modules': 0,
        'allowed-floors': 0,
    }
    env = ObstacleTowerEnv('./ObstacleTower/obstacletower',
                           worker_id=1,
                           retro=True,
                           realtime_mode=True,
                           config=config)

    # env = WarpFrame(env)
    # env = PyTorchFrame(env)
    # env = ClipRewardEnv(env)
    # env = FrameStack(env, 4)

    agent = MyAgent(env.observation_space, env.action_space)

    state = env.reset()
    for t in itertools.count():
        env.render()  # Animate
        action = agent.act(np.array(state))
        next_state, reward, done, _ = env.step(action)