Example #1
def q1_run(num_timesteps):
    # Get Atari games.
    benchmark = gym.benchmark_spec('Atari40M')

    # Change the index to select a different game.
    task = benchmark.tasks[3]

    # Run training
    seed = 0  # Use a seed of zero (you may want to randomize the seed!)
    env = get_env(task, seed, expt_dir='tmp/gym-results2')

    optimizer_spec = OptimizerSpec(
        constructor=optim.RMSprop,
        kwargs=dict(lr=LEARNING_RATE, alpha=ALPHA, eps=EPS),
    )

    exploration_schedule = LinearSchedule(1000000, 0.1)

    dqn_learning(
        env=env,
        q_func=DQN,
        runname="normal_run",
        optimizer_spec=optimizer_spec,
        exploration=exploration_schedule,
        stopping_criterion=stopping_criterion2(num_timesteps),
        replay_buffer_size=REPLAY_BUFFER_SIZE,
        batch_size=BATCH_SIZE,
        gamma=GAMMA,
        learning_starts=LEARNING_STARTS,
        learning_freq=LEARNING_FREQ,
        frame_history_len=FRAME_HISTORY_LEN,
        target_update_freq=TARGET_UPDATE_FREQ
    )
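
The call above leans on two helpers defined elsewhere in the project: OptimizerSpec and stopping_criterion2. As a rough reference only, here is a minimal sketch consistent with how they are used in Examples #1 and #7 (the two-field namedtuple and the Monitor-based criterion are assumptions, not the project's actual code):

from collections import namedtuple

# Assumed shape: the examples only ever pass `constructor` and `kwargs`.
OptimizerSpec = namedtuple("OptimizerSpec", ["constructor", "kwargs"])

def stopping_criterion2(num_timesteps):
    # Assumed behaviour, mirroring the explicit criterion in Example #7:
    # stop once the Monitor wrapper has recorded num_timesteps environment steps.
    def should_stop(env):
        return get_wrapper_by_name(env, "Monitor").get_total_steps() >= num_timesteps
    return should_stop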
Example #2
def q2_run(num_timesteps):
    schedulers = {"no_explore": ConstantSchedule(0.1),
                  "delayed_decay": PiecewiseSchedule([(0, 1.0), (0.25e6, 1.0), (1.25e6, 0.1)], outside_value=0.1),
                  "slower_decay": LinearSchedule(1500000, 0.1)}

    for name, exploration_schedule in schedulers.items():
        # Get Atari games.
        benchmark = gym.benchmark_spec('Atari40M')

        # Change the index to select a different game.
        task = benchmark.tasks[3]

        # Run training
        seed = 0  # Use a seed of zero (you may want to randomize the seed!)
        env = get_env(task, seed)
        env.reset()

        optimizer_spec = OptimizerSpec(constructor=optim.RMSprop, kwargs=dict(lr=LEARNING_RATE, alpha=ALPHA, eps=EPS))

        dqn_learning(
            env=env,
            q_func=DQN,
            runname=name,
            optimizer_spec=optimizer_spec,
            exploration=exploration_schedule,
            stopping_criterion=stopping_criterion2(num_timesteps),
            replay_buffer_size=REPLAY_BUFFER_SIZE,
            batch_size=BATCH_SIZE,
            gamma=GAMMA,
            learning_starts=LEARNING_STARTS,
            learning_freq=LEARNING_FREQ,
            frame_history_len=FRAME_HISTORY_LEN,
            target_update_freq=TARGET_UPDATE_FREQ
        )
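
The three schedules compared above share a tiny interface: each exposes value(t), the exploration epsilon to use at timestep t. ConstantSchedule returns a fixed value and PiecewiseSchedule interpolates between the given (timestep, value) endpoints; a minimal sketch of the linear variant, with the (schedule_timesteps, final_p, initial_p=1.0) signature inferred from the calls above, looks like this:

class LinearSchedule(object):
    def __init__(self, schedule_timesteps, final_p, initial_p=1.0):
        self.schedule_timesteps = schedule_timesteps
        self.final_p = final_p
        self.initial_p = initial_p

    def value(self, t):
        # Anneal linearly from initial_p to final_p, then hold at final_p.
        fraction = min(float(t) / self.schedule_timesteps, 1.0)
        return self.initial_p + fraction * (self.final_p - self.initial_p)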
Example #3
    def __init__(self, settings):
        self.check_settings(settings)

        # Constants
        self.batch_size = settings["batch_size"]
        self.checkpoint_frequency = settings["checkpoint_frequency"]
        self.device = settings["device"]
        self.dtype = (torch.cuda.FloatTensor
                      if self.device.type == "cuda" else torch.FloatTensor)
        self.env_name = settings["env"]
        self.env = get_env(settings["env"], 6)
        self.eps_cliff = settings["eps_cliff"]
        self.eps_start = settings["eps_start"]
        self.eps_end = settings["eps_end"]
        self.frame_history_len = settings["frame_history_len"]
        self.gamma = settings["gamma"]
        self.learning_freq = settings["learning_freq"]
        self.learning_start = settings["learning_start"]
        self.logs_dir = settings["logs_dir"]
        self.log_freq = settings["log_freq"]
        self.memory_size = settings["memory_size"]
        self.model_name = settings["model_name"]
        self.num_actions = self.env.action_space.n
        settings["num_actions"] = self.num_actions
        settings["num_channels"] = self.frame_history_len
        self.out_dir = settings["out_dir"]
        self.target_update_freq = settings["target_update_freq"]
        self.total_timesteps = settings["total_timesteps"]

        # Init models
        self.Q = DQN(settings).to(self.device)
        self.target_Q = DQN(settings).to(self.device)
        self.target_Q.load_state_dict(self.Q.state_dict())
        self.target_Q.eval()

        # Init model supporting objects
        self.memory = ReplayBuffer(self.memory_size, self.frame_history_len)
        self.optimizer = optim.RMSprop(self.Q.parameters(),
                                       lr=settings["lr"],
                                       alpha=0.95,
                                       eps=0.01)
        self.loss = F.smooth_l1_loss

        # Logging
        self.writer = SummaryWriter(self.logs_dir)
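
The eps_start / eps_end / eps_cliff settings suggest an epsilon that decays linearly until eps_cliff timesteps and then stays at eps_end. The class excerpt does not show that logic, so the method below is purely a hypothetical illustration of how those three settings would combine:

    def epsilon_at(self, t):
        # Hypothetical helper (not in the original class): linear decay from
        # eps_start to eps_end over eps_cliff steps, constant afterwards.
        if t >= self.eps_cliff:
            return self.eps_end
        frac = t / float(self.eps_cliff)
        return self.eps_start + frac * (self.eps_end - self.eps_start)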
Example #4
    def eval_model(self, epoch, n=100):
        self.Q.eval()
        env = get_env(self.env_name, 6, monitor=False)
        rewards = []
        durations = []
        for _e in tqdm(range(n)):
            memory = ReplayBuffer(10000, self.frame_history_len)
            state = env.reset()[..., np.newaxis]
            reward_acc = 0.0
            for t in range(10000):
                if state is None:
                    break

                memory.store_frame(state)
                recent_observations = memory.encode_recent_observation()

                action = self.select_epsilon_greedy_action(
                    recent_observations, None, 0.05).item()
                state, reward, done, _ = env.step(action)

                if done:
                    state = env.reset()

                state = state[..., np.newaxis]
                reward_acc += reward

            rewards.append(reward_acc)  # record this rollout's accumulated reward
            durations.append(t)
        self.Q.train()
        sum_rewards = sum(rewards)
        sum_durations = sum(durations)
        self.writer.add_scalar(
            f"Mean Reward ({n} episodes)",
            round(sum_rewards / len(rewards), 2),
            epoch,
        )
        self.writer.add_scalar(
            f"Mean Duration ({n} episodes)",
            round(sum_durations / len(durations), 2),
            epoch,
        )
        self.writer.add_scalar(
            f"Mean Reward per Timestep ({n} episodes)",
            round(sum_rewards / sum_durations, 2),
            epoch,
        )
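
eval_model relies on select_epsilon_greedy_action(obs, t, eps), which is defined elsewhere in the class. A minimal sketch of the usual epsilon-greedy rule it presumably implements (the observation preprocessing is an assumption, and `random`/`torch` are assumed to be imported in this module):

    def select_epsilon_greedy_action(self, obs, t, eps):
        # With probability eps pick a random action; otherwise act greedily
        # with respect to the online network Q. (Sketch; `t` is unused here.)
        if random.random() < eps:
            return torch.tensor([random.randrange(self.num_actions)])
        obs_t = torch.from_numpy(obs).type(self.dtype).unsqueeze(0) / 255.0
        with torch.no_grad():
            return self.Q(obs_t).argmax(dim=1).cpu()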
Example #5
def bonus_run(num_timesteps):
    def make_range_black(arr: np.ndarray, start, end):
        arr[:, start:end, :] = 0

    frame_filters = {"no_left_side": lambda x: make_range_black(x, 0, x.shape[1] // 4),
                     "no_middle_side": lambda x: make_range_black(x, x.shape[1] // 4, x.shape[1] // 2), }

    for name, frame_filter in frame_filters.items():
        # Get Atari games.
        benchmark = gym.benchmark_spec('Atari40M')

        # Change the index to select a different game.
        task = benchmark.tasks[3]

        # Run training
        seed = 0  # Use a seed of zero (you may want to randomize the seed!)
        env = get_env(task, seed)
        env.reset()

        optimizer_spec = OptimizerSpec(constructor=optim.RMSprop, kwargs=dict(lr=LEARNING_RATE, alpha=ALPHA, eps=EPS))

        dqn_learning(
            env=env,
            q_func=DQN,
            runname=name,
            frame_filter=frame_filter,
            optimizer_spec=optimizer_spec,
            exploration=LinearSchedule(1000000, 0.1),
            stopping_criterion=stopping_criterion2(num_timesteps),
            replay_buffer_size=REPLAY_BUFFER_SIZE,
            batch_size=BATCH_SIZE,
            gamma=GAMMA,
            learning_starts=LEARNING_STARTS,
            learning_freq=LEARNING_FREQ,
            frame_history_len=FRAME_HISTORY_LEN,
            target_update_freq=TARGET_UPDATE_FREQ
        )
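
Note that make_range_black mutates its argument and returns None, so the filters above only work through their side effect; frame_filter is an argument specific to this project's dqn_learning and is assumed to be applied to each observation in place before it is stored. A tiny standalone check of that in-place behaviour (the 84x84x1 frame shape is an assumption):

import numpy as np

def make_range_black(arr, start, end):
    arr[:, start:end, :] = 0

frame = np.random.randint(0, 255, size=(84, 84, 1), dtype=np.uint8)   # dummy observation
make_range_black(frame, 0, frame.shape[1] // 4)                       # "no_left_side"
assert (frame[:, : frame.shape[1] // 4, :] == 0).all()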
Example #6
                        type=int,
                        default=30,
                        help='Number of epochs to train')
    parser.add_argument('--k',
                        type=int,
                        default=10,
                        help='Number of Value Iterations')
    parser.add_argument('--l_i',
                        type=int,
                        default=3,
                        help='Number of channels in input layer')
    parser.add_argument('--l_h',
                        type=int,
                        default=150,
                        help='Number of channels in first hidden layer')
    parser.add_argument(
        '--l_q',
        type=int,
        default=9,
        help='Number of channels in q layer (~actions) in VI-module')
    parser.add_argument('--batch_size',
                        type=int,
                        default=128,
                        help='Batch size')
    config = parser.parse_args()

    seed = 0  # Use a seed of zero (you may want to randomize the seed!)
    env = get_env(seed)

    main(env, 1000000, config=config)
Example #7
def main(env, num_timesteps):
    # Change the index to select a different game.
    task = benchmark.tasks[3]

    # Run training
    seed = random.randint(0, 100)  # Randomize the seed for each run
    env = get_env(task, seed)

    def stopping_criterion(env):
        # notice that here t is the number of steps of the wrapped env,
        # which is different from the number of steps in the underlying env
        return get_wrapper_by_name(env, "Monitor").get_total_steps() >= num_timesteps

    optimizer_spec = OptimizerSpec(
        constructor=optim.RMSprop,
        kwargs=dict(lr=LEARNING_RATE, alpha=ALPHA, eps=EPS),
    )

    exploration_schedule = LinearSchedule(1000000, 0.1)

    # empty dict to hold all results
    Stats = {}

    new_lr = 0.001
    new_gamma = 0.999
    exploration_sches = [LinearSchedule(1000000, 0.1), ConstantSchedule(0.05),
                         ConstantSchedule(0.15), LinearSchedule(500000, 0.05)]

    optimizer_spec = OptimizerSpec(
        constructor=optim.RMSprop,
        kwargs=dict(lr=new_lr, alpha=ALPHA, eps=EPS),
    )

    env = get_env(task, seed)
    Stats["lr=0.001, gamma=0.999"] = dqn_learing(
        env=env,
        q_func=DQN,
        optimizer_spec=optimizer_spec,
        exploration=exploration_schedule,
        stopping_criterion=stopping_criterion,
        replay_buffer_size=REPLAY_BUFFER_SIZE,
        batch_size=BATCH_SIZE,
        gamma=new_gamma,
        learning_starts=LEARNING_STARTS,
        learning_freq=LEARNING_FREQ,
        frame_history_len=FRAME_HISTORY_LEN,
        target_update_freq=TARGER_UPDATE_FREQ,
        feature_tested="lr=0.001, gamma=0.999"
    )

    optimizer_spec = OptimizerSpec(
        constructor=optim.RMSprop,
        kwargs=dict(lr=LEARNING_RATE, alpha=ALPHA, eps=EPS),
    )

    env = get_env(task, seed)
    Stats["Default"] = dqn_learing(
        env=env,
        q_func=DQN,
        optimizer_spec=optimizer_spec,
        exploration=exploration_schedule,
        stopping_criterion=stopping_criterion,
        replay_buffer_size=REPLAY_BUFFER_SIZE,
        batch_size=BATCH_SIZE,
        gamma=GAMMA,
        learning_starts=LEARNING_STARTS,
        learning_freq=LEARNING_FREQ,
        frame_history_len=FRAME_HISTORY_LEN,
        target_update_freq=TARGER_UPDATE_FREQ,
        feature_tested=""
    )

    plt.clf()
    plt.xlabel('Timesteps')
    plt.ylabel('Mean Reward (past 100 episodes)')
    num_items = len(Stats["lr=0.001, gamma=0.999"]["mean_episode_rewards"])
    plt.plot(range(num_items), Stats["lr=0.001, gamma=0.999"]["mean_episode_rewards"], label="lr=0.001, gamma=0.999")
    num_items = len(Stats["Default"]["mean_episode_rewards"])
    plt.plot(range(num_items), Stats["Default"]["mean_episode_rewards"], label="Default")
    plt.legend()
    plt.title("Performance")
    plt.savefig('Final-Performance.png')
Example #8
        learning_freq=LEARNING_FREQ,
        frame_history_len=FRAME_HISTORY_LEN,
        target_update_freq=TARGER_UPDATE_FREQ,
        feature_tested=""
    )

    plt.clf()
    plt.xlabel('Timesteps')
    plt.ylabel('Mean Reward (past 100 episodes)')
    num_items = len(Stats["lr=0.001, gamma=0.999"]["mean_episode_rewards"])
    plt.plot(range(num_items), Stats["lr=0.001, gamma=0.999"]["mean_episode_rewards"], label="lr=0.001, gamma=0.999")
    num_items = len(Stats["Default"]["mean_episode_rewards"])
    plt.plot(range(num_items), Stats["Default"]["mean_episode_rewards"], label="Default")
    plt.legend()
    plt.title("Performance")
    plt.savefig('Final-Performance.png')


if __name__ == '__main__':
    # Get Atari games.
    benchmark = gym.benchmark_spec('Atari40M')

    # Change the index to select a different game.
    task = benchmark.tasks[3]

    # Run training
    seed = 0  # datetime.now()  # Use a seed of zero (you may want to randomize the seed!)
    env = get_env(task, seed)

    main(env, task.max_timesteps)
Example #9
        state, reward, done, _ = env.step(action)
        state = state[..., np.newaxis]
        memory.store_effect(last_idx, action, reward, done)

        reward_acc += reward

        if done:
            break

        time.sleep(0.05)

    logging.info(f"Total Reward: {reward_acc}")
    logging.info(f"Average Reward per Timestep: {reward_acc / _step}")
    logging.info(f"Timesteps: {_step}")


if __name__ == "__main__":
    # Initialize environment
    env = get_env("StarGunnerDeterministic-v4", 6, monitor=False)

    # Initialize model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    num_actions = env.action_space.n
    num_channels = 4
    model = load_model_checkpoint("out/checkpoints/dqn_1250000", num_actions,
                                  num_channels).to(device)

    # play using model
    play_using_model(env, model, device)
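
load_model_checkpoint and play_using_model come from the same project as Examples #3, #4 and #10. A minimal sketch of the loader, assuming the checkpoint file holds a plain state_dict and DQN takes the same settings dict shown in Example #10:

def load_model_checkpoint(path, num_actions, num_channels):
    # Assumption: the checkpoint contains only the model's state_dict.
    settings = {"num_actions": num_actions, "num_channels": num_channels}
    model = DQN(settings)
    model.load_state_dict(torch.load(path, map_location="cpu"))
    model.eval()
    return model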
Example #10
        reward_per_timestep = total_reward / total_timesteps
        timesteps_per_episode = total_timesteps / curr_episode
        all_rewards_per_episode.append(reward_per_episode)

        print_policy_statistics(reward_per_episode,
                                reward_per_timestep,
                                timesteps_per_episode,
                                wrapped_stdev(all_rewards_per_episode),
                                episode_num=curr_episode)

    return (total_reward / num_episodes,
            total_reward / total_timesteps,
            total_timesteps / num_episodes,
            wrapped_stdev(all_rewards_per_episode))


if __name__ == '__main__':
    env = get_env(GAME, 6, monitor=False)

    if len(sys.argv) != 2:
        print('Incorrect number of arguments: python3 runner.py <random|dqn>')
        exit(1)

    if sys.argv[1] == 'dqn':
        print('Simulating DQN...')

        device = torch.device("cpu")
        settings = {
            "num_actions": env.action_space.n,
            "num_channels": CHANNELS
        }
        model = DQN(settings)
        model.load_state_dict(
Example #11
    optimizer_spec = OptimizerSpec(
        constructor=optim.RMSprop,
        kwargs=dict(lr=LEARNING_RATE, alpha=ALPHA, eps=EPS),
    )

    exploration_schedule = LinearSchedule(1000000, 0.1)

    dqn_learing(
        env=env,
        q_func=DQN,
        optimizer_spec=optimizer_spec,
        exploration=exploration_schedule,
        stopping_criterion=stopping_criterion,
        replay_buffer_size=REPLAY_BUFFER_SIZE,
        batch_size=BATCH_SIZE,
        gamma=GAMMA,
        learning_starts=LEARNING_STARTS,
        learning_freq=LEARNING_FREQ,
        frame_history_len=FRAME_HISTORY_LEN,
        target_update_freq=TARGER_UPDATE_FREQ,
    )


if __name__ == '__main__':

    env_id = 'Pong' + 'NoFrameskip-v4'

    env = get_env(env_id, seed=0)

    main(env, 10e6 * 40)
Example #12
    optimizer_spec = OptimizerSpec(
        constructor=optim.RMSprop,
        kwargs=dict(lr=LEARNING_RATE, alpha=ALPHA, eps=EPS),
    )

    exploration_schedule = LinearSchedule(1000000, 0.1)

    dqn_learing(
        env=env,
        q_func=DQN,
        optimizer_spec=optimizer_spec,
        exploration=exploration_schedule,
        stopping_criterion=stopping_criterion,
        replay_buffer_size=REPLAY_BUFFER_SIZE,
        batch_size=BATCH_SIZE,
        gamma=GAMMA,
        learning_starts=LEARNING_STARTS,
        learning_freq=LEARNING_FREQ,
        frame_history_len=FRAME_HISTORY_LEN,
        target_update_freq=TARGER_UPDATE_FREQ,
    )


if __name__ == '__main__':

    # Run training
    seed = 0  # Use a seed of zero (you may want to randomize the seed!)
    env = get_env('Breakout-v0', seed)

    main(env, 1e6)