Example #1
def main(env, num_timesteps, config):
    def stopping_criterion(env):
        # notice that here t is the number of steps of the wrapped env,
        # which is different from the number of steps in the underlying env
        return get_wrapper_by_name(
            env, "Monitor").get_total_steps() >= num_timesteps

    optimizer_spec = OptimizerSpec(
        constructor=optim.RMSprop,
        kwargs=dict(lr=LEARNING_RATE, alpha=ALPHA, eps=EPS),
    )

    exploration_schedule = LinearSchedule(1000000, 0.1)

    dqn_learing(
        config=config,
        env=env,
        q_func=VIN,
        optimizer_spec=optimizer_spec,
        exploration=exploration_schedule,
        stopping_criterion=stopping_criterion,
        replay_buffer_size=REPLAY_BUFFER_SIZE,
        batch_size=BATCH_SIZE,
        gamma=GAMMA,
        learning_starts=LEARNING_STARTS,
        learning_freq=LEARNING_FREQ,
        frame_history_len=FRAME_HISTORY_LEN,
        target_update_freq=TARGER_UPDATE_FREQ,
    )
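Every example on this page builds an OptimizerSpec before handing it to the learner. Below is a minimal sketch of that pattern, assuming the two-field namedtuple definition commonly used in these DQN assignments (Example #8's repository evidently adds a third lr_schedule field); make_optimizer is a hypothetical helper showing how the learner would consume the spec.

from collections import namedtuple

import torch.optim as optim

# Assumed two-field definition; some repositories add an lr_schedule field.
OptimizerSpec = namedtuple("OptimizerSpec", ["constructor", "kwargs"])

def make_optimizer(spec, params):
    # Hypothetical helper: instantiate the optimizer for the Q-network parameters.
    return spec.constructor(params, **spec.kwargs)

# Illustrative values only; the examples read these from module-level constants.
spec = OptimizerSpec(
    constructor=optim.RMSprop,
    kwargs=dict(lr=1e-4, alpha=0.95, eps=1e-2),
)
# Inside the learner: optimizer = make_optimizer(spec, q_network.parameters())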
Example #2
def q2_run(num_timesteps):
    schedulers = {"no_explore": ConstantSchedule(0.1),
                  "delayed_decay": PiecewiseSchedule([(0, 1.0), (0.25e6, 1.0), (1.25e6, 0.1)], outside_value=0.1),
                  "slower_decay": LinearSchedule(1500000, 0.1)}

    for name, exploration_schedule in schedulers.items():
        # Get Atari games.
        benchmark = gym.benchmark_spec('Atari40M')

        # Change the index to select a different game.
        task = benchmark.tasks[3]

        # Run training
        seed = 0  # Use a seed of zero (you may want to randomize the seed!)
        env = get_env(task, seed)
        env.reset()

        optimizer_spec = OptimizerSpec(constructor=optim.RMSprop, kwargs=dict(lr=LEARNING_RATE, alpha=ALPHA, eps=EPS))

        dqn_learning(
            env=env,
            q_func=DQN,
            runname=name,
            optimizer_spec=optimizer_spec,
            exploration=exploration_schedule,
            stopping_criterion=stopping_criterion2(num_timesteps),
            replay_buffer_size=REPLAY_BUFFER_SIZE,
            batch_size=BATCH_SIZE,
            gamma=GAMMA,
            learning_starts=LEARNING_STARTS,
            learning_freq=LEARNING_FREQ,
            frame_history_len=FRAME_HISTORY_LEN,
            target_update_freq=TARGET_UPDATE_FREQ
        )
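Example #2 compares three exploration schedules that share one interface: a value(t) method returning the exploration rate at step t. The sketch below follows the OpenAI baselines LinearSchedule these assignments are typically based on; the repository's own utility may differ in detail.

# Sketch of the schedule interface assumed above (modeled on the OpenAI
# baselines LinearSchedule); the learner queries exploration.value(t) each step.
class LinearSchedule:
    def __init__(self, schedule_timesteps, final_p, initial_p=1.0):
        self.schedule_timesteps = schedule_timesteps
        self.final_p = final_p
        self.initial_p = initial_p

    def value(self, t):
        # Anneal linearly from initial_p to final_p over schedule_timesteps,
        # then hold final_p.
        fraction = min(float(t) / self.schedule_timesteps, 1.0)
        return self.initial_p + fraction * (self.final_p - self.initial_p)

# e.g. LinearSchedule(1500000, 0.1).value(750000) -> 0.55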
Example #3
def main(env, num_timesteps):
    def stopping_criterion(env):
        return get_wrapper_by_name(
            env, "Monitor").get_total_steps() >= num_timesteps

    optimizer_spec = OptimizerSpec(
        constructor=optim.RMSprop,
        kwargs=dict(lr=LEARNING_RATE, alpha=ALPHA, eps=EPS),
    )

    exploration_schedule = LinearSchedule(2000000, 0.05)

    dqn_learing(
        env=env,
        q_func=DQN,
        optimizer_spec=optimizer_spec,
        exploration=exploration_schedule,
        stopping_criterion=stopping_criterion,
        replay_buffer_size=REPLAY_BUFFER_SIZE,
        batch_size=BATCH_SIZE,
        gamma=GAMMA,
        learning_starts=LEARNING_STARTS,
        learning_freq=LEARNING_FREQ,
        frame_history_len=FRAME_HISTORY_LEN,
        target_update_freq=TARGER_UPDATE_FREQ,
    )
Example #4
def main(env, num_timesteps):
    def stopping_criterion(env):
        # notice that here t is the number of steps of the wrapped env,
        # which is different from the number of steps in the underlying env
        return get_wrapper_by_name(env, "Monitor").get_total_steps() >= num_timesteps

    optimizer_spec = OptimizerSpec(
        constructor=optim.RMSprop,
        kwargs=dict(lr=LEARNING_RATE, alpha=ALPHA, eps=EPS),
    )

    run = runs.runs[RUN_INDEX]
    exploration_schedule = run.schedule

    print("Starting {}; max_timesteps = {}".format(run.run_name, task.max_timesteps))

    dqn_learing(
        env=env,
        q_func=DQN,
        optimizer_spec=optimizer_spec,
        exploration=exploration_schedule,
        stopping_criterion=stopping_criterion,
        replay_buffer_size=REPLAY_BUFFER_SIZE,
        batch_size=BATCH_SIZE,
        gamma=GAMMA,
        learning_starts=LEARNING_STARTS,
        learning_freq=LEARNING_FREQ,
        frame_history_len=FRAME_HISTORY_LEN,
        target_update_freq=TARGER_UPDATE_FREQ,
        statistics_file_name=run.statistics_file_name
    )
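Example #4 pulls its run configuration from a runs module that is not shown on this page; the code only touches run_name, schedule and statistics_file_name. A hypothetical reconstruction of such a registry follows (all names and values are illustrative, and LinearSchedule/ConstantSchedule are assumed importable from the project's utilities).

from collections import namedtuple

# Hypothetical registry (e.g. a runs.py module exposing a list named runs);
# only the attributes accessed in Example #4 are modeled.
Run = namedtuple("Run", ["run_name", "schedule", "statistics_file_name"])

runs = [
    Run("linear_1m_eps0.1", LinearSchedule(1000000, 0.1), "stats_linear.pkl"),
    Run("constant_eps0.05", ConstantSchedule(0.05), "stats_constant.pkl"),
]

RUN_INDEX = 0  # selects which run main() executes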
Example #5
def main(env, num_timesteps, experiment_config, experiment_name):

    q_func = DQNLRelu if experiment_config['adv_model'] else DQN

    def stopping_criterion(env):
        # notice that here t is the number of steps of the wrapped env,
        # which is different from the number of steps in the underlying env
        return get_wrapper_by_name(
            env, "Monitor").get_total_steps() >= num_timesteps

    optimizer_spec = OptimizerSpec(
        constructor=optim.RMSprop,
        kwargs=dict(lr=experiment_config['lr'],
                    alpha=experiment_config['alpha'],
                    eps=experiment_config['eps']),
    )

    exploration_schedule = LinearSchedule(1000000,
                                          experiment_config['min_eps'])

    dqn_learing(experiment_name=experiment_name,
                env=env,
                q_func=q_func,
                optimizer_spec=optimizer_spec,
                exploration=exploration_schedule,
                stopping_criterion=stopping_criterion,
                replay_buffer_size=experiment_config['replay_size'],
                batch_size=experiment_config['batch'],
                gamma=experiment_config['gamma'],
                learning_starts=experiment_config['learning_start'],
                learning_freq=experiment_config['learning_freq'],
                frame_history_len=experiment_config['frame_hist'],
                target_update_freq=experiment_config['target_update_freq'],
                output_path=experiment_config['output'])
Example #6
def q1_run(num_timesteps):
    # Get Atari games.
    benchmark = gym.benchmark_spec('Atari40M')

    # Change the index to select a different game.
    task = benchmark.tasks[3]

    # Run training
    seed = 0  # Use a seed of zero (you may want to randomize the seed!)
    env = get_env(task, seed, expt_dir='tmp/gym-results2')

    optimizer_spec = OptimizerSpec(
        constructor=optim.RMSprop,
        kwargs=dict(lr=LEARNING_RATE, alpha=ALPHA, eps=EPS),
    )

    exploration_schedule = LinearSchedule(1000000, 0.1)

    dqn_learning(
        env=env,
        q_func=DQN,
        runname="normal_run",
        optimizer_spec=optimizer_spec,
        exploration=exploration_schedule,
        stopping_criterion=stopping_criterion2(num_timesteps),
        replay_buffer_size=REPLAY_BUFFER_SIZE,
        batch_size=BATCH_SIZE,
        gamma=GAMMA,
        learning_starts=LEARNING_STARTS,
        learning_freq=LEARNING_FREQ,
        frame_history_len=FRAME_HISTORY_LEN,
        target_update_freq=TARGET_UPDATE_FREQ
    )
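Examples #2, #6 and #9 call stopping_criterion2(num_timesteps) instead of defining the predicate inside main. A hypothetical reconstruction of that factory, mirroring the Monitor-based closure the other examples define inline:

# Hypothetical reconstruction of stopping_criterion2: a factory returning the
# same Monitor-step predicate that the other examples build as a closure.
def stopping_criterion2(num_timesteps):
    def should_stop(env):
        # Stop once the Monitor wrapper has recorded num_timesteps steps.
        return get_wrapper_by_name(env, "Monitor").get_total_steps() >= num_timesteps
    return should_stop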
Example #7
File: main.py Project: zivzone/gym-airsim
def main(env):
	global args
	args = parser.parse_args()

	optimizer_spec = OptimizerSpec(
		constructor=optim.RMSprop,
		kwargs=dict(lr=LEARNING_RATE, alpha=ALPHA, eps=EPS),
	)

	exploration_schedule = LinearSchedule(1000000, 0.1)

	dqn_learing(
		env=env,
		q_func=DQN,
		checkpoint_path=args.checkpoint,
		optimizer_spec=optimizer_spec,
		exploration=exploration_schedule,
		stopping_criterion=None,
		replay_buffer_size=REPLAY_BUFFER_SIZE,
		batch_size=BATCH_SIZE,
		gamma=GAMMA,
		learning_starts=LEARNING_STARTS,
		learning_freq=LEARNING_FREQ,
		frame_history_len=FRAME_HISTORY_LEN,
		target_update_freq=TARGET_UPDATE_FREQ,
	)
Example #8
def main(env, num_timesteps):
    # This is just a rough estimate
    num_iterations = float(num_timesteps) / 4.0

    # define learning rate and exploration schedules below
    lr_multiplier = 1.0
    lr_schedule = PiecewiseSchedule([
        (0, 1e-4 * lr_multiplier),
        (num_iterations / 10, 1e-4 * lr_multiplier),
        (num_iterations / 2,  5e-5 * lr_multiplier),
    ], outside_value=5e-5 * lr_multiplier)

    optimizer_spec = OptimizerSpec(
        constructor=optim.Adam,
        kwargs=dict(eps=1e-4),
        lr_schedule=lr_schedule
    )

    exploration_schedule = PiecewiseSchedule([
        (0, 1.0),
        (1e6, 0.1),
        (num_iterations / 2, 0.01),
    ], outside_value=0.01)

    dqn_learing(
        env=env,
        q_func=DQN,
        optimizer_spec=optimizer_spec,
        exploration=exploration_schedule,
        stopping_criterion=stopping_criterion(num_timesteps),
        replay_buffer_size=REPLAY_BUFFER_SIZE,
        batch_size=BATCH_SIZE,
        gamma=GAMMA,
        learning_starts=LEARNING_STARTS,
        learning_freq=LEARNING_FREQ,
        frame_history_len=FRAME_HISTORY_LEN,
        target_update_freq=TARGER_UPDATE_FREQ,
        grad_norm_clipping=GRAD_NORM_CLIPPING
    )
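Example #8 drives both the learning rate and exploration with PiecewiseSchedule. The sketch below follows the OpenAI baselines implementation (linear interpolation between the given (timestep, value) endpoints, outside_value elsewhere); the repository's own version may differ.

# Sketch of PiecewiseSchedule, modeled on the OpenAI baselines utility: linear
# interpolation between the given (timestep, value) endpoints, and
# outside_value for any t beyond them.
class PiecewiseSchedule:
    def __init__(self, endpoints, outside_value=None):
        assert [e[0] for e in endpoints] == sorted(e[0] for e in endpoints)
        self._endpoints = endpoints
        self._outside_value = outside_value

    def value(self, t):
        for (l_t, l), (r_t, r) in zip(self._endpoints[:-1], self._endpoints[1:]):
            if l_t <= t < r_t:
                alpha = float(t - l_t) / (r_t - l_t)
                return l + alpha * (r - l)
        # t falls outside the listed endpoints.
        assert self._outside_value is not None
        return self._outside_value

# e.g. PiecewiseSchedule([(0, 1.0), (1e6, 0.1)], outside_value=0.1).value(5e5) -> 0.55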
Example #9
def bonus_run(num_timesteps):
    def make_range_black(arr: np.ndarray, start, end):
        # Zero out the given column range in place and return the frame, so the
        # filter lambdas below yield the blacked-out frame rather than None.
        arr[:, start:end, :] = 0
        return arr

    frame_filters = {"no_left_side": lambda x: make_range_black(x, 0, x.shape[1] // 4),
                     "no_middle_side": lambda x: make_range_black(x, x.shape[1] // 4, x.shape[1] // 2), }

    for name, frame_filter in frame_filters.items():
        # Get Atari games.
        benchmark = gym.benchmark_spec('Atari40M')

        # Change the index to select a different game.
        task = benchmark.tasks[3]

        # Run training
        seed = 0  # Use a seed of zero (you may want to randomize the seed!)
        env = get_env(task, seed)
        env.reset()

        optimizer_spec = OptimizerSpec(constructor=optim.RMSprop, kwargs=dict(lr=LEARNING_RATE, alpha=ALPHA, eps=EPS))

        dqn_learning(
            env=env,
            q_func=DQN,
            runname=name,
            frame_filter=frame_filter,
            optimizer_spec=optimizer_spec,
            exploration=LinearSchedule(1000000, 0.1),
            stopping_criterion=stopping_criterion2(num_timesteps),
            replay_buffer_size=REPLAY_BUFFER_SIZE,
            batch_size=BATCH_SIZE,
            gamma=GAMMA,
            learning_starts=LEARNING_STARTS,
            learning_freq=LEARNING_FREQ,
            frame_history_len=FRAME_HISTORY_LEN,
            target_update_freq=TARGET_UPDATE_FREQ
        )
Example #10
def main(env):

    optimizer_spec = OptimizerSpec(
        constructor=optim.RMSprop,
        kwargs=dict(lr=LEARNING_RATE, alpha=ALPHA, eps=EPS),
    )

    exploration_schedule = LinearSchedule(1000000, 0.1)

    dqn_learing(
        env=env,
        q_func=DQN,
        optimizer_spec=optimizer_spec,
        exploration=exploration_schedule,
        replay_buffer_size=REPLAY_BUFFER_SIZE,
        batch_size=BATCH_SIZE,
        gamma=GAMMA,
        learning_starts=LEARNING_STARTS,
        learning_freq=LEARNING_FREQ,
        frame_history_len=FRAME_HISTORY_LEN,
        target_update_freq=TARGER_UPDATE_FREQ,
        num_actions1=num_actions1,
        num_actions2=num_actions2
    )
Example #11
def main(env, num_timesteps):
    # Change the index to select a different game.
    task = benchmark.tasks[3]

    # Run training
    seed = random.randint(0, 100)  # Use a random seed between 0 and 100
    env = get_env(task, seed)

    def stopping_criterion(env):
        # notice that here t is the number of steps of the wrapped env,
        # which is different from the number of steps in the underlying env
        return get_wrapper_by_name(env, "Monitor").get_total_steps() >= num_timesteps

    optimizer_spec = OptimizerSpec(
        constructor=optim.RMSprop,
        kwargs=dict(lr=LEARNING_RATE, alpha=ALPHA, eps=EPS),
    )

    exploration_schedule = LinearSchedule(1000000, 0.1)

    # empty dict to hold all results
    Stats = {}

    new_lr = 0.001
    new_gamma = 0.999
    exploration_sches = [LinearSchedule(1000000, 0.1), ConstantSchedule(0.05),
                         ConstantSchedule(0.15), LinearSchedule(500000, 0.05)]

    optimizer_spec = OptimizerSpec(
        constructor=optim.RMSprop,
        kwargs=dict(lr=new_lr, alpha=ALPHA, eps=EPS),
    )

    env = get_env(task, seed)
    Stats["lr=0.001, gamma=0.999"] = dqn_learing(
        env=env,
        q_func=DQN,
        optimizer_spec=optimizer_spec,
        exploration=exploration_schedule,
        stopping_criterion=stopping_criterion,
        replay_buffer_size=REPLAY_BUFFER_SIZE,
        batch_size=BATCH_SIZE,
        gamma=new_gamma,
        learning_starts=LEARNING_STARTS,
        learning_freq=LEARNING_FREQ,
        frame_history_len=FRAME_HISTORY_LEN,
        target_update_freq=TARGER_UPDATE_FREQ,
        feature_tested="lr=0.001, gamma=0.999"
    )

    optimizer_spec = OptimizerSpec(
        constructor=optim.RMSprop,
        kwargs=dict(lr=LEARNING_RATE, alpha=ALPHA, eps=EPS),
    )

    env = get_env(task, seed)
    Stats["Default"] = dqn_learing(
        env=env,
        q_func=DQN,
        optimizer_spec=optimizer_spec,
        exploration=exploration_schedule,
        stopping_criterion=stopping_criterion,
        replay_buffer_size=REPLAY_BUFFER_SIZE,
        batch_size=BATCH_SIZE,
        gamma=GAMMA,
        learning_starts=LEARNING_STARTS,
        learning_freq=LEARNING_FREQ,
        frame_history_len=FRAME_HISTORY_LEN,
        target_update_freq=TARGER_UPDATE_FREQ,
        feature_tested=""
    )

    plt.clf()
    plt.xlabel('Timesteps')
    plt.ylabel('Mean Reward (past 100 episodes)')
    num_items = len(Stats["lr=0.001, gamma=0.999"]["mean_episode_rewards"])
    plt.plot(range(num_items), Stats["lr=0.001, gamma=0.999"]["mean_episode_rewards"], label="lr=0.001, gamma=0.999")
    num_items = len(Stats["Default"]["mean_episode_rewards"])
    plt.plot(range(num_items), Stats["Default"]["mean_episode_rewards"], label="Default")
    plt.legend()
    plt.title("Performance")
    plt.savefig('Final-Performance.png')
Example #12
benchmark = gym.benchmark_spec('Atari40M')
task = benchmark.tasks[3]
seed = 0  # Use a seed of zero (you may want to randomize the seed!)
env = get_env(task, seed)
num_timesteps = task.max_timesteps


def stopping_criterion(env):
    # notice that here t is the number of steps of the wrapped env,
    # which is different from the number of steps in the underlying env
    return get_wrapper_by_name(env,
                               "Monitor").get_total_steps() >= num_timesteps


optimizer_spec = OptimizerSpec(
    constructor=optim.RMSprop,
    kwargs=dict(lr=LEARNING_RATE, alpha=ALPHA, eps=EPS),
)

exploration_schedule = LinearSchedule(1000000, 0.1)

USE_CUDA = torch.cuda.is_available()
dtype = torch.cuda.FloatTensor if USE_CUDA else torch.FloatTensor


class Variable(autograd.Variable):
    def __init__(self, data, *args, **kwargs):
        if USE_CUDA:
            data = data.cuda()
        super(Variable, self).__init__(data, *args, **kwargs)
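Example #12's Variable subclass exists only to move data onto the GPU. torch.autograd.Variable was merged into Tensor in PyTorch 0.4, so on current versions the same effect is a plain device move; a minimal modern equivalent:

import torch

# Modern replacement for the Variable wrapper above (PyTorch >= 0.4):
# just place tensors on the chosen device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def to_device(data):
    # Convert arrays/tensors as needed and move them to the GPU when available.
    return torch.as_tensor(data).to(device)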
Example #13
def atari_learn(env, args, num_timesteps):
    logdir = os.path.join('data', args.exp_name)

    num_iterations = float(num_timesteps) / 4.0

    # lr_multiplier = 1.0
    # lr_schedule = PiecewiseSchedule([
    #     (0, 1e-4 * lr_multiplier),
    #     (num_iterations / 10, 1e-4 * lr_multiplier),
    #     (num_iterations / 2, 5e-5 * lr_multiplier),
    # ],
    #     outside_value=5e-5 * lr_multiplier)
    # optimizer = dqn.OptimizerSpec(
    #     constructor=tf.train.AdamOptimizer,
    #     kwargs=dict(epsilon=1e-4),
    #     lr_schedule=lr_schedule
    # )

    def stopping_criterion(env):
        return get_wrapper_by_name(
            env, "Monitor").get_total_steps() >= num_timesteps

    # optimizer_spec = OptimizerSpec(
    #     constructor=optim.RMSprop,
    #     kwargs=dict(lr=LEARNING_RATE, alpha=ALPHA, eps=EPS),
    # )

    optimizer_spec = OptimizerSpec(
        constructor=optim.Adam,
        kwargs=dict(lr=LEARNING_RATE),
    )

    exploration_schedule = LinearSchedule(30000, 0.01)

    # exploration_schedule = PiecewiseSchedule(
    #     [
    #         (0, 1.0),
    #         (1e6, 0.1),
    #         (num_iterations / 2, 0.01),
    #     ], outside_value=0.01
    # )

    logz.configure_output_dir(logdir)

    if args.dueling:
        dqn_learning(
            env=env,
            method=args.method,
            game=args.env,
            q_func=Dueling_DQN,
            optimizer_spec=optimizer_spec,
            exploration=exploration_schedule,
            stopping_criterion=stopping_criterion,
            replay_buffer_size=REPLAY_BUFFER_SIZE,
            batch_size=args.batch_size,
            gamma=args.gamma,
            learning_starts=LEARNING_STARTS,
            learning_freq=LEARNING_FREQ,
            frame_history_len=FRAME_HISTORY_LEN,
            target_update_freq=TARGET_UPDATE_FREQ,
            double=args.double,
            dueling=args.dueling,
            logdir=logdir,
            svrl=args.svrl,
            me_type=args.me_type,
            maskp=args.maskp,
            maskstep=args.maskstep,
            maskscheduler=args.maskscheduler,
        )
    else:
        dqn_learning(
            env=env,
            method=args.method,
            game=args.env,
            q_func=DQN,
            optimizer_spec=optimizer_spec,
            exploration=exploration_schedule,
            stopping_criterion=stopping_criterion,
            replay_buffer_size=REPLAY_BUFFER_SIZE,
            batch_size=args.batch_size,
            gamma=args.gamma,
            learning_starts=LEARNING_STARTS,
            learning_freq=LEARNING_FREQ,
            frame_history_len=FRAME_HISTORY_LEN,
            target_update_freq=TARGET_UPDATE_FREQ,
            double=args.double,
            dueling=args.dueling,
            logdir=logdir,
            svrl=args.svrl,
            me_type=args.me_type,
            maskp=args.maskp,
            maskstep=args.maskstep,
            maskscheduler=args.maskscheduler,
        )

    env.close()