def main(env, num_timesteps, config):
    def stopping_criterion(env):
        # Note that here t is the number of steps of the wrapped env,
        # which is different from the number of steps in the underlying env.
        return get_wrapper_by_name(env, "Monitor").get_total_steps() >= num_timesteps

    optimizer_spec = OptimizerSpec(
        constructor=optim.RMSprop,
        kwargs=dict(lr=LEARNING_RATE, alpha=ALPHA, eps=EPS),
    )

    exploration_schedule = LinearSchedule(1000000, 0.1)

    # "dqn_learing" and "TARGER_UPDATE_FREQ" are assumed to match the
    # (misspelled) names defined in the imported training module, so the
    # spellings are kept as-is.
    dqn_learing(
        config=config,
        env=env,
        q_func=VIN,
        optimizer_spec=optimizer_spec,
        exploration=exploration_schedule,
        stopping_criterion=stopping_criterion,
        replay_buffer_size=REPLAY_BUFFER_SIZE,
        batch_size=BATCH_SIZE,
        gamma=GAMMA,
        learning_starts=LEARNING_STARTS,
        learning_freq=LEARNING_FREQ,
        frame_history_len=FRAME_HISTORY_LEN,
        target_update_freq=TARGER_UPDATE_FREQ,
    )
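
# get_wrapper_by_name is not defined in this file. A minimal sketch, assuming
# it walks the gym wrapper chain and returns the first wrapper whose class
# name contains the given string (a reconstruction, not the canonical helper):
def get_wrapper_by_name(env, classname):
    wrapper = env
    while True:
        if classname in wrapper.__class__.__name__:
            return wrapper
        if hasattr(wrapper, 'env'):
            wrapper = wrapper.env
        else:
            raise ValueError("Couldn't find wrapper named %s" % classname)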
def q2_run(num_timesteps):
    schedulers = {
        "no_explore": ConstantSchedule(0.1),
        "delayed_decay": PiecewiseSchedule(
            [(0, 1.0), (0.25e6, 1.0), (1.25e6, 0.1)], outside_value=0.1),
        "slower_decay": LinearSchedule(1500000, 0.1),
    }

    for name, exploration_schedule in schedulers.items():
        # Get Atari games.
        benchmark = gym.benchmark_spec('Atari40M')

        # Change the index to select a different game.
        task = benchmark.tasks[3]

        # Run training.
        seed = 0  # Use a seed of zero (you may want to randomize the seed!)
        env = get_env(task, seed)
        env.reset()

        optimizer_spec = OptimizerSpec(
            constructor=optim.RMSprop,
            kwargs=dict(lr=LEARNING_RATE, alpha=ALPHA, eps=EPS),
        )

        dqn_learning(
            env=env,
            q_func=DQN,
            runname=name,
            optimizer_spec=optimizer_spec,
            exploration=exploration_schedule,
            stopping_criterion=stopping_criterion2(num_timesteps),
            replay_buffer_size=REPLAY_BUFFER_SIZE,
            batch_size=BATCH_SIZE,
            gamma=GAMMA,
            learning_starts=LEARNING_STARTS,
            learning_freq=LEARNING_FREQ,
            frame_history_len=FRAME_HISTORY_LEN,
            target_update_freq=TARGET_UPDATE_FREQ,
        )
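
# stopping_criterion2 is called as a factory above but not defined here. A
# minimal sketch, assuming it closes over num_timesteps and mirrors the inline
# criteria defined in the other runners in this file:
def stopping_criterion2(num_timesteps):
    def criterion(env):
        return get_wrapper_by_name(env, "Monitor").get_total_steps() >= num_timesteps
    return criterion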
def main(env, num_timesteps):
    def stopping_criterion(env):
        return get_wrapper_by_name(env, "Monitor").get_total_steps() >= num_timesteps

    optimizer_spec = OptimizerSpec(
        constructor=optim.RMSprop,
        kwargs=dict(lr=LEARNING_RATE, alpha=ALPHA, eps=EPS),
    )

    exploration_schedule = LinearSchedule(2000000, 0.05)

    dqn_learing(
        env=env,
        q_func=DQN,
        optimizer_spec=optimizer_spec,
        exploration=exploration_schedule,
        stopping_criterion=stopping_criterion,
        replay_buffer_size=REPLAY_BUFFER_SIZE,
        batch_size=BATCH_SIZE,
        gamma=GAMMA,
        learning_starts=LEARNING_STARTS,
        learning_freq=LEARNING_FREQ,
        frame_history_len=FRAME_HISTORY_LEN,
        target_update_freq=TARGER_UPDATE_FREQ,
    )
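
# LinearSchedule(2000000, 0.05) anneals epsilon from 1.0 to 0.05 over the
# first 2M steps and then holds it. Minimal sketches of LinearSchedule and the
# ConstantSchedule used by the schedulers above, assuming the OpenAI baselines
# convention:
class LinearSchedule(object):
    def __init__(self, schedule_timesteps, final_p, initial_p=1.0):
        self.schedule_timesteps = schedule_timesteps
        self.final_p = final_p
        self.initial_p = initial_p

    def value(self, t):
        # Linearly interpolate from initial_p to final_p, then hold final_p.
        fraction = min(float(t) / self.schedule_timesteps, 1.0)
        return self.initial_p + fraction * (self.final_p - self.initial_p)


class ConstantSchedule(object):
    def __init__(self, value):
        self._value = value

    def value(self, t):
        # The exploration rate never changes.
        return self._value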
def main(env, num_timesteps):
    def stopping_criterion(env):
        # Note that here t is the number of steps of the wrapped env,
        # which is different from the number of steps in the underlying env.
        return get_wrapper_by_name(env, "Monitor").get_total_steps() >= num_timesteps

    optimizer_spec = OptimizerSpec(
        constructor=optim.RMSprop,
        kwargs=dict(lr=LEARNING_RATE, alpha=ALPHA, eps=EPS),
    )

    # runs, RUN_INDEX, and task are assumed to be module-level globals; each
    # run bundles a name, an exploration schedule, and an output file name.
    run = runs.runs[RUN_INDEX]
    exploration_schedule = run.schedule
    print("Starting {}; max_timesteps = {}".format(run.run_name, task.max_timesteps))

    dqn_learing(
        env=env,
        q_func=DQN,
        optimizer_spec=optimizer_spec,
        exploration=exploration_schedule,
        stopping_criterion=stopping_criterion,
        replay_buffer_size=REPLAY_BUFFER_SIZE,
        batch_size=BATCH_SIZE,
        gamma=GAMMA,
        learning_starts=LEARNING_STARTS,
        learning_freq=LEARNING_FREQ,
        frame_history_len=FRAME_HISTORY_LEN,
        target_update_freq=TARGER_UPDATE_FREQ,
        statistics_file_name=run.statistics_file_name,
    )
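
# The runs module is not shown. A hypothetical reconstruction of runs.py,
# inferred only from the attribute accesses above (the field names follow
# those accesses; the example schedule and file name are guesses), so that
# runs.runs[RUN_INDEX] resolves:
from collections import namedtuple

Run = namedtuple("Run", ["run_name", "schedule", "statistics_file_name"])
runs = [
    Run("linear_decay", LinearSchedule(1000000, 0.1), "stats_linear_decay.pkl"),
]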
def main(env, num_timesteps, experiment_config, experiment_name):
    q_func = DQNLRelu if experiment_config['adv_model'] else DQN

    def stopping_criterion(env):
        # Note that here t is the number of steps of the wrapped env,
        # which is different from the number of steps in the underlying env.
        return get_wrapper_by_name(env, "Monitor").get_total_steps() >= num_timesteps

    optimizer_spec = OptimizerSpec(
        constructor=optim.RMSprop,
        kwargs=dict(lr=experiment_config['lr'],
                    alpha=experiment_config['alpha'],
                    eps=experiment_config['eps']),
    )

    exploration_schedule = LinearSchedule(1000000, experiment_config['min_eps'])

    dqn_learing(
        experiment_name=experiment_name,
        env=env,
        q_func=q_func,
        optimizer_spec=optimizer_spec,
        exploration=exploration_schedule,
        stopping_criterion=stopping_criterion,
        replay_buffer_size=experiment_config['replay_size'],
        batch_size=experiment_config['batch'],
        gamma=experiment_config['gamma'],
        learning_starts=experiment_config['learning_start'],
        learning_freq=experiment_config['learning_freq'],
        frame_history_len=experiment_config['frame_hist'],
        target_update_freq=experiment_config['target_update_freq'],
        output_path=experiment_config['output'],
    )
def q1_run(num_timesteps):
    # Get Atari games.
    benchmark = gym.benchmark_spec('Atari40M')

    # Change the index to select a different game.
    task = benchmark.tasks[3]

    # Run training.
    seed = 0  # Use a seed of zero (you may want to randomize the seed!)
    env = get_env(task, seed, expt_dir='tmp/gym-results2')

    optimizer_spec = OptimizerSpec(
        constructor=optim.RMSprop,
        kwargs=dict(lr=LEARNING_RATE, alpha=ALPHA, eps=EPS),
    )

    exploration_schedule = LinearSchedule(1000000, 0.1)

    dqn_learning(
        env=env,
        q_func=DQN,
        runname="normal_run",
        optimizer_spec=optimizer_spec,
        exploration=exploration_schedule,
        stopping_criterion=stopping_criterion2(num_timesteps),
        replay_buffer_size=REPLAY_BUFFER_SIZE,
        batch_size=BATCH_SIZE,
        gamma=GAMMA,
        learning_starts=LEARNING_STARTS,
        learning_freq=LEARNING_FREQ,
        frame_history_len=FRAME_HISTORY_LEN,
        target_update_freq=TARGET_UPDATE_FREQ,
    )
def main(env):
    global args
    args = parser.parse_args()

    optimizer_spec = OptimizerSpec(
        constructor=optim.RMSprop,
        kwargs=dict(lr=LEARNING_RATE, alpha=ALPHA, eps=EPS),
    )

    exploration_schedule = LinearSchedule(1000000, 0.1)

    dqn_learing(
        env=env,
        q_func=DQN,
        checkpoint_path=args.checkpoint,
        optimizer_spec=optimizer_spec,
        exploration=exploration_schedule,
        stopping_criterion=None,
        replay_buffer_size=REPLAY_BUFFER_SIZE,
        batch_size=BATCH_SIZE,
        gamma=GAMMA,
        learning_starts=LEARNING_STARTS,
        learning_freq=LEARNING_FREQ,
        frame_history_len=FRAME_HISTORY_LEN,
        target_update_freq=TARGET_UPDATE_FREQ,
    )
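
# OptimizerSpec is assumed to be a plain named tuple pairing an optimizer
# constructor with its keyword arguments; the Adam variants below also pass an
# lr_schedule, so a sketch with an optional third field (Python 3.7+):
from collections import namedtuple

OptimizerSpec = namedtuple(
    "OptimizerSpec", ["constructor", "kwargs", "lr_schedule"], defaults=(None,))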
def main(env, num_timesteps):
    # This is just a rough estimate.
    num_iterations = float(num_timesteps) / 4.0

    # Define learning rate and exploration schedules below.
    lr_multiplier = 1.0
    lr_schedule = PiecewiseSchedule([
        (0, 1e-4 * lr_multiplier),
        (num_iterations / 10, 1e-4 * lr_multiplier),
        (num_iterations / 2, 5e-5 * lr_multiplier),
    ], outside_value=5e-5 * lr_multiplier)

    optimizer_spec = OptimizerSpec(
        constructor=optim.Adam,
        kwargs=dict(eps=1e-4),
        lr_schedule=lr_schedule,
    )

    exploration_schedule = PiecewiseSchedule([
        (0, 1.0),
        (1e6, 0.1),
        (num_iterations / 2, 0.01),
    ], outside_value=0.01)

    # Unlike the inline criteria elsewhere in this file, stopping_criterion is
    # assumed here to be a module-level factory called with num_timesteps.
    dqn_learing(
        env=env,
        q_func=DQN,
        optimizer_spec=optimizer_spec,
        exploration=exploration_schedule,
        stopping_criterion=stopping_criterion(num_timesteps),
        replay_buffer_size=REPLAY_BUFFER_SIZE,
        batch_size=BATCH_SIZE,
        gamma=GAMMA,
        learning_starts=LEARNING_STARTS,
        learning_freq=LEARNING_FREQ,
        frame_history_len=FRAME_HISTORY_LEN,
        target_update_freq=TARGER_UPDATE_FREQ,
        grad_norm_clipping=GRAD_NORM_CLIPPING,
    )
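
# PiecewiseSchedule interpolates between (time, value) endpoints and returns
# outside_value beyond them. A minimal linear-interpolation sketch, assuming
# the OpenAI baselines convention:
class PiecewiseSchedule(object):
    def __init__(self, endpoints, outside_value=None):
        # endpoints must be sorted by time.
        self._endpoints = endpoints
        self._outside_value = outside_value

    def value(self, t):
        for (l_t, l_v), (r_t, r_v) in zip(self._endpoints[:-1], self._endpoints[1:]):
            if l_t <= t < r_t:
                alpha = float(t - l_t) / (r_t - l_t)
                return l_v + alpha * (r_v - l_v)
        # t falls outside the listed endpoints.
        return self._outside_value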
def bonus_run(num_timesteps):
    def make_range_black(arr: np.ndarray, start, end):
        # Zero out a vertical strip of columns; return the array so the
        # lambdas below yield the filtered frame instead of None.
        arr[:, start:end, :] = 0
        return arr

    frame_filters = {
        "no_left_side": lambda x: make_range_black(x, 0, x.shape[1] // 4),
        "no_middle_side": lambda x: make_range_black(x, x.shape[1] // 4, x.shape[1] // 2),
    }

    for name, frame_filter in frame_filters.items():
        # Get Atari games.
        benchmark = gym.benchmark_spec('Atari40M')

        # Change the index to select a different game.
        task = benchmark.tasks[3]

        # Run training.
        seed = 0  # Use a seed of zero (you may want to randomize the seed!)
        env = get_env(task, seed)
        env.reset()

        optimizer_spec = OptimizerSpec(
            constructor=optim.RMSprop,
            kwargs=dict(lr=LEARNING_RATE, alpha=ALPHA, eps=EPS),
        )

        dqn_learning(
            env=env,
            q_func=DQN,
            runname=name,
            frame_filter=frame_filter,
            optimizer_spec=optimizer_spec,
            exploration=LinearSchedule(1000000, 0.1),
            stopping_criterion=stopping_criterion2(num_timesteps),
            replay_buffer_size=REPLAY_BUFFER_SIZE,
            batch_size=BATCH_SIZE,
            gamma=GAMMA,
            learning_starts=LEARNING_STARTS,
            learning_freq=LEARNING_FREQ,
            frame_history_len=FRAME_HISTORY_LEN,
            target_update_freq=TARGET_UPDATE_FREQ,
        )
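
# Illustrative sanity check of the blackout filters on a dummy HWC frame
# (hypothetical, mirroring the nested helper above): zeroing a slice of
# axis 1 blacks out a vertical strip of the image.
import numpy as np

frame = np.ones((84, 84, 3), dtype=np.uint8)
frame[:, : frame.shape[1] // 4, :] = 0   # same operation as "no_left_side"
assert (frame[:, : frame.shape[1] // 4] == 0).all()
assert (frame[:, frame.shape[1] // 4:] == 1).all()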
def main(env):
    optimizer_spec = OptimizerSpec(
        constructor=optim.RMSprop,
        kwargs=dict(lr=LEARNING_RATE, alpha=ALPHA, eps=EPS),
    )

    exploration_schedule = LinearSchedule(1000000, 0.1)

    # num_actions1 and num_actions2 are assumed to be module-level globals
    # giving the sizes of this variant's two action spaces.
    dqn_learing(
        env=env,
        q_func=DQN,
        optimizer_spec=optimizer_spec,
        exploration=exploration_schedule,
        replay_buffer_size=REPLAY_BUFFER_SIZE,
        batch_size=BATCH_SIZE,
        gamma=GAMMA,
        learning_starts=LEARNING_STARTS,
        learning_freq=LEARNING_FREQ,
        frame_history_len=FRAME_HISTORY_LEN,
        target_update_freq=TARGER_UPDATE_FREQ,
        num_actions1=num_actions1,
        num_actions2=num_actions2,
    )
def main(env, num_timesteps):
    # Get Atari games.
    benchmark = gym.benchmark_spec('Atari40M')

    # Change the index to select a different game.
    task = benchmark.tasks[3]

    # Run training with a randomized seed.
    seed = random.randint(0, 100)
    env = get_env(task, seed)

    def stopping_criterion(env):
        # Note that here t is the number of steps of the wrapped env,
        # which is different from the number of steps in the underlying env.
        return get_wrapper_by_name(env, "Monitor").get_total_steps() >= num_timesteps

    optimizer_spec = OptimizerSpec(
        constructor=optim.RMSprop,
        kwargs=dict(lr=LEARNING_RATE, alpha=ALPHA, eps=EPS),
    )

    exploration_schedule = LinearSchedule(1000000, 0.1)

    # Empty dict to hold all results.
    Stats = {}

    new_lr = 0.001
    new_gamma = 0.999

    # Candidate exploration schedules (defined for reference; only the default
    # LinearSchedule above is used in the runs below).
    exploration_sches = [LinearSchedule(1000000, 0.1),
                         ConstantSchedule(0.05),
                         ConstantSchedule(0.15),
                         LinearSchedule(500000, 0.05)]

    # Run 1: higher learning rate and discount factor.
    optimizer_spec = OptimizerSpec(
        constructor=optim.RMSprop,
        kwargs=dict(lr=new_lr, alpha=ALPHA, eps=EPS),
    )
    env = get_env(task, seed)
    Stats["lr=0.001, gamma=0.999"] = dqn_learing(
        env=env,
        q_func=DQN,
        optimizer_spec=optimizer_spec,
        exploration=exploration_schedule,
        stopping_criterion=stopping_criterion,
        replay_buffer_size=REPLAY_BUFFER_SIZE,
        batch_size=BATCH_SIZE,
        gamma=new_gamma,
        learning_starts=LEARNING_STARTS,
        learning_freq=LEARNING_FREQ,
        frame_history_len=FRAME_HISTORY_LEN,
        target_update_freq=TARGER_UPDATE_FREQ,
        feature_tested="lr=0.001, gamma=0.999",
    )

    # Run 2: default hyperparameters.
    optimizer_spec = OptimizerSpec(
        constructor=optim.RMSprop,
        kwargs=dict(lr=LEARNING_RATE, alpha=ALPHA, eps=EPS),
    )
    env = get_env(task, seed)
    Stats["Default"] = dqn_learing(
        env=env,
        q_func=DQN,
        optimizer_spec=optimizer_spec,
        exploration=exploration_schedule,
        stopping_criterion=stopping_criterion,
        replay_buffer_size=REPLAY_BUFFER_SIZE,
        batch_size=BATCH_SIZE,
        gamma=GAMMA,
        learning_starts=LEARNING_STARTS,
        learning_freq=LEARNING_FREQ,
        frame_history_len=FRAME_HISTORY_LEN,
        target_update_freq=TARGER_UPDATE_FREQ,
        feature_tested="",
    )

    # Plot the mean reward curves of both runs.
    plt.clf()
    plt.xlabel('Timesteps')
    plt.ylabel('Mean Reward (past 100 episodes)')
    num_items = len(Stats["lr=0.001, gamma=0.999"]["mean_episode_rewards"])
    plt.plot(range(num_items), Stats["lr=0.001, gamma=0.999"]["mean_episode_rewards"],
             label="lr=0.001, gamma=0.999")
    num_items = len(Stats["Default"]["mean_episode_rewards"])
    plt.plot(range(num_items), Stats["Default"]["mean_episode_rewards"], label="Default")
    plt.legend()
    plt.title("Performance")
    plt.savefig('Final-Performance.png')
benchmark = gym.benchmark_spec('Atari40M')
task = benchmark.tasks[3]

seed = 0  # Use a seed of zero (you may want to randomize the seed!)
env = get_env(task, seed)
num_timesteps = task.max_timesteps


def stopping_criterion(env):
    # Note that here t is the number of steps of the wrapped env,
    # which is different from the number of steps in the underlying env.
    return get_wrapper_by_name(env, "Monitor").get_total_steps() >= num_timesteps


optimizer_spec = OptimizerSpec(
    constructor=optim.RMSprop,
    kwargs=dict(lr=LEARNING_RATE, alpha=ALPHA, eps=EPS),
)

exploration_schedule = LinearSchedule(1000000, 0.1)

USE_CUDA = torch.cuda.is_available()
dtype = torch.cuda.FloatTensor if USE_CUDA else torch.FloatTensor


class Variable(autograd.Variable):
    def __init__(self, data, *args, **kwargs):
        if USE_CUDA:
            data = data.cuda()
        super(Variable, self).__init__(data, *args, **kwargs)
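
# On PyTorch >= 0.4, autograd.Variable is deprecated: tensors carry autograd
# state directly, so the wrapper class above can be replaced by moving tensors
# to a device. A minimal sketch of the modern equivalent:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
example = torch.zeros(1, dtype=torch.float32, device=device)  # illustrative tensor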
def atari_learn(env, args, num_timesteps):
    logdir = os.path.join('data', args.exp_name)
    num_iterations = float(num_timesteps) / 4.0

    # lr_multiplier = 1.0
    # lr_schedule = PiecewiseSchedule([
    #     (0, 1e-4 * lr_multiplier),
    #     (num_iterations / 10, 1e-4 * lr_multiplier),
    #     (num_iterations / 2, 5e-5 * lr_multiplier),
    # ], outside_value=5e-5 * lr_multiplier)
    # optimizer = dqn.OptimizerSpec(
    #     constructor=tf.train.AdamOptimizer,
    #     kwargs=dict(epsilon=1e-4),
    #     lr_schedule=lr_schedule
    # )

    def stopping_criterion(env):
        return get_wrapper_by_name(env, "Monitor").get_total_steps() >= num_timesteps

    # optimizer_spec = OptimizerSpec(
    #     constructor=optim.RMSprop,
    #     kwargs=dict(lr=LEARNING_RATE, alpha=ALPHA, eps=EPS),
    # )
    optimizer_spec = OptimizerSpec(
        constructor=optim.Adam,
        kwargs=dict(lr=LEARNING_RATE),
    )

    exploration_schedule = LinearSchedule(30000, 0.01)
    # exploration_schedule = PiecewiseSchedule([
    #     (0, 1.0),
    #     (1e6, 0.1),
    #     (num_iterations / 2, 0.01),
    # ], outside_value=0.01)

    logz.configure_output_dir(logdir)

    # Select the Q-network architecture up front; dueling and vanilla runs
    # share all other arguments.
    q_func = Dueling_DQN if args.dueling else DQN

    dqn_learning(
        env=env,
        method=args.method,
        game=args.env,
        q_func=q_func,
        optimizer_spec=optimizer_spec,
        exploration=exploration_schedule,
        stopping_criterion=stopping_criterion,
        replay_buffer_size=REPLAY_BUFFER_SIZE,
        batch_size=args.batch_size,
        gamma=args.gamma,
        learning_starts=LEARNING_STARTS,
        learning_freq=LEARNING_FREQ,
        frame_history_len=FRAME_HISTORY_LEN,
        target_update_freq=TARGET_UPDATE_FREQ,
        double=args.double,
        dueling=args.dueling,
        logdir=logdir,
        svrl=args.svrl,
        me_type=args.me_type,
        maskp=args.maskp,
        maskstep=args.maskstep,
        maskscheduler=args.maskscheduler,
    )

    env.close()
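
# The argparse setup for atari_learn is not shown. A hypothetical minimal
# parser, reconstructed only from the args.* attributes accessed above --
# every flag type and default here is a guess:
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--exp_name', type=str, default='dqn_experiment')
parser.add_argument('--env', type=str, default='PongNoFrameskip-v4')
parser.add_argument('--method', type=str, default='dqn')
parser.add_argument('--batch_size', type=int, default=32)
parser.add_argument('--gamma', type=float, default=0.99)
parser.add_argument('--double', action='store_true')
parser.add_argument('--dueling', action='store_true')
parser.add_argument('--svrl', action='store_true')
parser.add_argument('--me_type', type=str, default=None)
parser.add_argument('--maskp', type=float, default=0.5)
parser.add_argument('--maskstep', type=int, default=0)
parser.add_argument('--maskscheduler', action='store_true')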