def test_frozenlake_value_debug():
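    # Debug rollout: step a single SimpleGrid frozen-lake run with a greedy
    # (epsilon=0.0) policy over a pre-loaded value table, rendering every step.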
    for i in range(1):
        ll_runs = 1
        steps = 20000
        ep_s = ExponentialDecay(steps // 10, 0.5, 0.05, steps)
        device = 'cuda'
        actions = 2
        env = gym.make('SimpleGrid-v3', n=ll_runs, device=device, map_string=frozen_lake, max_steps=40)
        critic = DiscreteVTable((env.height, env.width)).to(device)
        critic.weights.data = v_table
        policy = VPolicy(critic, actions, EpsilonGreedyProperDiscreteDist, epsilon=0.0).to(device)
        batch_size = 16 * ll_runs
        join = OneObsToState()

        done = torch.tensor([0], dtype=torch.uint8)
        state = env.reset()
        env.render()
        print("")
        while not done.all():
            lookahead_state, lookahead_reward, lookahead_done, info = env.lookahead()
            action_dist = policy(join(lookahead_state), lookahead_reward, lookahead_done)
            action = action_dist.sample()
            n, reward, done, reset, info = env.step(action)
            env.render()
            print("")
def test_frozenlake_window_sizes():
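    # Sweep deep Q batch sizes (8x, 16x, 32x and 64x ll_runs) on frozen lake
    # with a fixed replay window.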
    for _ in range(3):
        ll_runs = 600
        steps = 30000
        ep_s = ExponentialDecay(steps // 10, 0.05, steps)
        replay_window = ll_runs * steps // 20

        batch_size = 8 * ll_runs
        run_deep_q_on(frozen_lake, ll_runs=ll_runs, eps_sched=ep_s, replay_window=replay_window, batch_size=8 * ll_runs,
                      workers=1,
                      steps=steps, logging_freq=100, run_id=f'frozenlake_bt_{batch_size}', warmup=1000)

        batch_size = 16 * ll_runs
        run_deep_q_on(frozen_lake, ll_runs=ll_runs, eps_sched=ep_s, replay_window=replay_window,
                      batch_size=16 * ll_runs, workers=1,
                      steps=steps, logging_freq=100, run_id=f'frozenlake_bt_{batch_size}', warmup=1000)

        batch_size = 32 * ll_runs
        run_deep_q_on(frozen_lake, ll_runs=ll_runs, eps_sched=ep_s, replay_window=replay_window,
                      batch_size=32 * ll_runs, workers=1,
                      steps=steps, logging_freq=100, run_id=f'frozenlake_bt_{batch_size}', warmup=1000)

        batch_size = 64 * ll_runs
        run_deep_q_on(frozen_lake, ll_runs=ll_runs, eps_sched=ep_s, replay_window=replay_window,
                      batch_size=64 * ll_runs, workers=1,
                      steps=steps, logging_freq=100, run_id=f'frozenlake_bt_{batch_size}', warmup=1000)
def test_F3_oh_value():
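    # TD value learning on the F3 environment with a one-hot value network,
    # normalized observations/rewards, lookahead, and batched tensors on the GPU.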

    for i in range(3):
        ll_runs = 1
        steps = 10000
        ep_s = ExponentialDecay(steps / 16, 0.5, 0.05, steps)
        lr_s = ConstantSched(0.05)
        device = 'cuda'
        actions = 3
        obs_shape = (1, )
        batch_size = 32

        env = gym.make('F3-v0')
        #env = RewardPerStep(env, reward_per_step=-0.01)
        env = TimeLimit(env, max_episode_steps=20)
        env = NormalizeFunctional(env,
                                  obs_f=normalize_obs,
                                  reward_f=normalize_reward)
        env = LookAhead(env)
        env = Reset(env)
        #env = Monitor(env)
        env = BatchTensor(env, device='cuda')

        #critic = FixupV(obs_shape, 4).to(device)
        critic = OneHotV(obs_shape, 12).to(device)
        policy = VPolicy(critic,
                         actions,
                         EpsilonGreedyProperDiscreteDist,
                         epsilon=1.0).to(device)

        exp_buffer = ExpBuffer(max_timesteps=steps // 10,
                               ll_runs=ll_runs,
                               batch_size=batch_size,
                               observation_shape=obs_shape)

        stepper = td_value.Stepper(env, OneObsToState(), exp_buffer)

        run_on(stepper=stepper,
               learner=td_value.train_one_value,
               env=env,
               critic=critic,
               policy=policy,
               ll_runs=ll_runs,
               eps_sched=ep_s,
               actions=actions,
               exp_buffer=exp_buffer,
               batch_size=batch_size,
               discount=0.8,
               lr_sched=lr_s,
               rendermode='episodic',
               steps=steps,
               logging_freq=1,
               run_id=f'f3_value_{i}',
               warmup_steps=0)
def test_cliffwalk_q_baseline():
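    # Baseline tabular Q learning on the cliff-walk map with 600 parallel runs.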
    for i in range(3):
        ll_runs = 600
        steps = 5000
        ep_s = ExponentialDecay(steps // 10, 0.4, 0.02, steps)
        device = 'cuda'
        actions = 4
        env = gym.make('SimpleGrid-v3', n=ll_runs, device=device, map_string=cliff_walk, max_steps=40)
        critic = DiscreteQTable((env.height, env.width), actions).to(device)
        policy = QPolicy(critic, actions, EpsilonGreedyProperDiscreteDist, epsilon=1.0).to(device)
        batch_size = 16 * ll_runs
        exp_buffer = ExpBuffer(max_timesteps=steps//10, ll_runs=ll_runs, batch_size=batch_size, observation_shape=env.observation_space_shape)
        run_on(stepper=one_step, learner=train_one, env=env, critic=critic, policy=policy,
               ll_runs=ll_runs, eps_sched=ep_s,
               exp_buffer=exp_buffer, batch_size=batch_size, discount=0.8,
               steps=steps, logging_freq=100, run_id=f'cliffwalk_q_{i}', warmup_steps=10)
def test_anthill_importance_sampled():
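    # Tabular Q learning on the anthill map using a prioritized,
    # importance-sampled replay buffer.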
    for i in range(10):
        ll_runs = 600
        steps = 40000
        ep_s = ExponentialDecay(steps // 10, 0.05, steps)
        replay_window = ll_runs * steps // 10
        device = 'cuda'
        actions = 4
        env = gym.make('SimpleGrid-v2', n=ll_runs, device=device, map_string=anthill)
        critic = DiscreteQTable((env.height, env.width), actions).to(device)
        policy = QPolicy(critic, actions, EpsilonGreedyProperDiscreteDist, epsilon=1.0).to(device)
        batch_size = 16 * ll_runs
        exp_buffer = PrioritizedExpBuffer(replay_window, batch_size, True, *env.observation_space_shape)
        run_deep_q_on(env=env, critic=critic, policy=policy,
                      ll_runs=ll_runs, eps_sched=ep_s,
                      exp_buffer=exp_buffer, batch_size=batch_size,
                      workers=1, discount=0.8,
                      steps=steps, logging_freq=100, run_id=f'anthill_imp_smp_{i}', warmup=1000)
def test_puddlejump_baseline():
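    # Baseline tabular Q learning on the puddle-jumping map.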
    for i in range(5):
        ll_runs = 600
        steps = 40000
        ep_s = ExponentialDecay(steps // 10, 0.05, steps)
        replay_window = ll_runs * steps // 10
        device = 'cuda'
        actions = 4
        env = gym.make('SimpleGrid-v3', n=ll_runs, device=device, map_string=puddle_jumping, max_steps=100)
        critic = DiscreteQTable((env.height, env.width), actions).to(device)
        policy = QPolicy(critic, actions, EpsilonGreedyProperDiscreteDist, epsilon=1.0).to(device)
        exp_buffer = ExpBuffer(replay_window, *env.observation_space_shape)
        batch_size = 16 * ll_runs
        run_deep_q_on(env=env, critic=critic, policy=policy,
                      ll_runs=ll_runs, eps_sched=ep_s,
                      exp_buffer=exp_buffer, batch_size=batch_size,
                      workers=1, discount=0.8,
                      steps=steps, logging_freq=100, run_id=f'puddle_baseline_{i}', warmup=1000)
def test_frozenlake_value_importance_sampled():
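    # Value-table learning on frozen lake with a prioritized,
    # importance-sampled replay buffer.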
    for i in range(3):
        ll_runs = 600
        steps = 5000
        ep_s = ExponentialDecay(steps // 10, 0.4, 0.02, steps)
        device = 'cuda'
        actions = 2
        env = gym.make('SimpleGrid-v3', n=ll_runs, device=device, map_string=frozen_lake, max_steps=40)
        critic = DiscreteVTable((env.height, env.width)).to(device)
        policy = VPolicy(critic, actions, EpsilonGreedyProperDiscreteDist, epsilon=0.5).to(device)
        batch_size = 16 * ll_runs
        exp_buffer = PrioritizedExpBuffer(max_timesteps=steps//10, ll_runs=ll_runs, batch_size=batch_size,
                                          observation_shape=env.observation_space_shape, importance_sample=True)

        run_on(stepper=one_step_value, learner=train_one_value, env=env, critic=critic, policy=policy,
               ll_runs=ll_runs, eps_sched=ep_s,
               exp_buffer=exp_buffer, batch_size=batch_size, discount=0.99,
               steps=steps, logging_freq=100, run_id=f'frozenlake_value_imps_{i}', warmup_steps=10, lr=0.05)
def test_frozenlake_deepq_grid_search():
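    # Grid search over the discount factor (0.84 to 1.0 in steps of 0.04) for
    # deep Q learning on frozen lake with a small per-timestep penalty.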
    for _ in range(3):
        for discount in np.arange(0.84, 1.0, 0.04):
            ll_runs = 600
            batch_size = 16 * ll_runs
            steps = 15000
            ep_s = ExponentialDecay(half_life=steps // 7.0,
                                    scale=0.4,
                                    bias=0.02,
                                    steps=steps)
            lr_s = ConstantSched(0.05)
            device = 'cuda'
            actions = 4
            env = gym.make('SimpleGrid-v3',
                           n=ll_runs,
                           device=device,
                           map_string=frozen_lake,
                           max_steps=40,
                           reward_per_timestep=-0.01)
            critic = FixupQ((env.height, env.width), actions, 4).to(device)
            policy = QPolicy(critic,
                             actions,
                             EpsilonGreedyProperDiscreteDist,
                             epsilon=1.0).to(device)
            exp_buffer = ExpBuffer(
                max_timesteps=steps // 8,
                ll_runs=ll_runs,
                batch_size=batch_size,
                observation_shape=env.observation_space_shape)
            run_on(stepper=one_step,
                   learner=train_one,
                   env=env,
                   critic=critic,
                   policy=policy,
                   ll_runs=ll_runs,
                   eps_sched=ep_s,
                   lr_sched=lr_s,
                   exp_buffer=exp_buffer,
                   batch_size=batch_size,
                   discount=discount,
                   steps=steps,
                   logging_freq=100,
                   run_id=f'frozenlake_deepq_discount_{discount}',
                   warmup_steps=10)
def test_F5_deep_q_proj():
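    # Deep Q learning on the F5 environment with an ensemble Q network and an
    # epsilon-greedy behaviour policy alongside a greedy evaluation policy.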
    for i in range(10):
        ll_runs = 1
        steps = 1000
        ep_s = ExponentialDecay(steps / 15, 0.3, 0.05, steps)
        lr_s = ConstantSched(0.05)
        device = 'cuda'
        actions = 3
        obs_shape = (1, )
        batch_size = 16 * ll_runs

        env = gym.make('F5-v0')
        env = TimeLimit(env, max_episode_steps=50)
        env = NormalizeFunctional(env,
                                  obs_f=norm_f5,
                                  reward_f=normalize_reward)
        env = Reset(env)
        env = Monitor(env)
        env = BatchTensor(env, device='cuda')

        #critic = ProjFixupQ(obs_shape, actions, 20, 4).to(device)
        critic = EnsembleQ(obs_shape, actions, hidden=20, blocks=4).to(device)
        behaviour_policy = QPolicy(critic, actions,
                                   EpsilonGreedyProperDiscreteDist).to(device)
        greedy_policy = QPolicy(critic, actions, GreedyDist).to(device)
        exp_buffer = ExpBuffer(max_timesteps=steps // 10,
                               ll_runs=ll_runs,
                               batch_size=batch_size,
                               observation_shape=obs_shape)
        algo = Q(env,
                 critic,
                 behaviour_policy,
                 greedy_policy,
                 exp_buffer,
                 device=device,
                 plot=FastPlot(actions))
        algo.run(run_id='base_line',
                 steps=steps,
                 batch_size=batch_size,
                 discount_factor=0.95,
                 lr_sched=lr_s,
                 eps_sched=ep_s,
                 logging_freq=10)
def test_frozenlake_value_grid():
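    # Grid search over the per-timestep step penalty (0.002 to 0.1) for value
    # learning on frozen lake with a FixupV critic.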
    for i in range(3):
        for step_penalty in np.arange(0.002, 0.1, 0.002):
            eps = 0.4
            steps = 8000
            ll_runs = 600
            ep_s = ExponentialDecay(steps // 5, eps, 0.02, steps)
            lr_s = ConstantSched(0.05)
            device = 'cuda'
            actions = 2
            env = gym.make('SimpleGrid-v3',
                           n=ll_runs,
                           device=device,
                           map_string=frozen_lake,
                           max_steps=40,
                           reward_per_timestep=-step_penalty)
            critic = FixupV((env.height, env.width), 4).to(device)
            policy = VPolicy(critic,
                             actions,
                             EpsilonGreedyProperDiscreteDist,
                             epsilon=0.5).to(device)
            batch_size = 16 * ll_runs
            exp_buffer = ExpBuffer(
                max_timesteps=steps // 8,
                ll_runs=ll_runs,
                batch_size=batch_size,
                observation_shape=env.observation_space_shape)

            run_on(stepper=one_step_value,
                   learner=train_one_value,
                   env=env,
                   critic=critic,
                   policy=policy,
                   ll_runs=ll_runs,
                   eps_sched=ep_s,
                   exp_buffer=exp_buffer,
                   batch_size=batch_size,
                   discount=0.99,
                   steps=steps,
                   logging_freq=100,
                   run_id=f'frozenlake_step_{step_penalty}',
                   warmup_steps=10,
                   lr_sched=lr_s)
def test_fake_lunar_lander():
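    # Tabular Q learning on the GridLunarLander environment; the table is sized
    # from the flattened observation space.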
    for i in range(2):
        ll_runs = 600
        steps = 20000
        ep_s = ExponentialDecay(steps // 10, 0.05, steps)
        replay_window = ll_runs * steps // 10
        device = 'cuda'
        actions = 5
        env = gym.make('GridLunarLander-v0', n=ll_runs, device=device)
        length = sum([np.prod(shape) for shape in env.observation_space_shape])
        critic = DiscreteQTable((length,), actions).to(device)
        policy = QPolicy(critic, actions, EpsilonGreedyProperDiscreteDist, epsilon=1.0).to(device)

        exp_buffer = ExpBuffer(replay_window, *env.observation_space_shape)
        batch_size = 16 * ll_runs
        run_deep_q_on(env=env, critic=critic, policy=policy,
                      ll_runs=ll_runs, eps_sched=ep_s,
                      exp_buffer=exp_buffer, batch_size=batch_size,
                      workers=1, discount=0.8,
                      steps=steps, logging_freq=100, run_id=f'lunarlander_baseline_{i}', warmup=1000)
def test_lawn_deepq_baseline():
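    # Baseline deep Q learning on the lawn map with a single run and parallel rendering.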
    for i in range(3):
        ll_runs = 1
        steps = 10000
        ep_s = ExponentialDecay(steps // 10, 0.4, 0.02, steps)
        lr_s = ConstantSched(0.05)
        device = 'cuda'
        actions = 4
        env = gym.make('SimpleGrid-v3',
                       n=ll_runs,
                       device=device,
                       map_string=lawn,
                       max_steps=40)
        critic = FixupQ((env.height, env.width), actions, 4).to(device)
        policy = QPolicy(critic,
                         actions,
                         EpsilonGreedyProperDiscreteDist,
                         epsilon=1.0).to(device)
        batch_size = 16 * ll_runs
        exp_buffer = ExpBuffer(max_timesteps=steps // 10,
                               ll_runs=ll_runs,
                               batch_size=batch_size,
                               observation_shape=env.observation_space_shape)
        run_on(stepper=one_step,
               learner=train_one,
               env=env,
               critic=critic,
               policy=policy,
               ll_runs=ll_runs,
               eps_sched=ep_s,
               exp_buffer=exp_buffer,
               batch_size=batch_size,
               discount=0.8,
               lr_sched=lr_s,
               rendermode='parallel',
               steps=steps,
               logging_freq=100,
               run_id=f'lawn_deepq_{i}',
               warmup_steps=10)