def main(env, num_timesteps, experiment_config, experiment_name):
    q_func = DQNLRelu if experiment_config['adv_model'] else DQN

    def stopping_criterion(env):
        # notice that here t is the number of steps of the wrapped env,
        # which is different from the number of steps in the underlying env
        return get_wrapper_by_name(env, "Monitor").get_total_steps() >= num_timesteps

    optimizer_spec = OptimizerSpec(
        constructor=optim.RMSprop,
        kwargs=dict(lr=experiment_config['lr'],
                    alpha=experiment_config['alpha'],
                    eps=experiment_config['eps']),
    )

    exploration_schedule = LinearSchedule(1000000, experiment_config['min_eps'])

    dqn_learing(experiment_name=experiment_name,
                env=env,
                q_func=q_func,
                optimizer_spec=optimizer_spec,
                exploration=exploration_schedule,
                stopping_criterion=stopping_criterion,
                replay_buffer_size=experiment_config['replay_size'],
                batch_size=experiment_config['batch'],
                gamma=experiment_config['gamma'],
                learning_starts=experiment_config['learning_start'],
                learning_freq=experiment_config['learning_freq'],
                frame_history_len=experiment_config['frame_hist'],
                target_update_freq=experiment_config['target_update_freq'],
                output_path=experiment_config['output'])
def main(env, num_timesteps, config):

    def stopping_criterion(env):
        # notice that here t is the number of steps of the wrapped env,
        # which is different from the number of steps in the underlying env
        return get_wrapper_by_name(env, "Monitor").get_total_steps() >= num_timesteps

    optimizer_spec = OptimizerSpec(
        constructor=optim.RMSprop,
        kwargs=dict(lr=LEARNING_RATE, alpha=ALPHA, eps=EPS),
    )

    exploration_schedule = LinearSchedule(1000000, 0.1)

    dqn_learing(
        config=config,
        env=env,
        q_func=VIN,
        optimizer_spec=optimizer_spec,
        exploration=exploration_schedule,
        stopping_criterion=stopping_criterion,
        replay_buffer_size=REPLAY_BUFFER_SIZE,
        batch_size=BATCH_SIZE,
        gamma=GAMMA,
        learning_starts=LEARNING_STARTS,
        learning_freq=LEARNING_FREQ,
        frame_history_len=FRAME_HISTORY_LEN,
        target_update_freq=TARGER_UPDATE_FREQ,
    )
def q1_run(num_timesteps):
    # Get Atari games.
    benchmark = gym.benchmark_spec('Atari40M')

    # Change the index to select a different game.
    task = benchmark.tasks[3]

    # Run training
    seed = 0  # Use a seed of zero (you may want to randomize the seed!)
    env = get_env(task, seed, expt_dir='tmp/gym-results2')

    optimizer_spec = OptimizerSpec(
        constructor=optim.RMSprop,
        kwargs=dict(lr=LEARNING_RATE, alpha=ALPHA, eps=EPS),
    )

    exploration_schedule = LinearSchedule(1000000, 0.1)

    dqn_learning(
        env=env,
        q_func=DQN,
        runname="normal_run",
        optimizer_spec=optimizer_spec,
        exploration=exploration_schedule,
        stopping_criterion=stopping_criterion2(num_timesteps),
        replay_buffer_size=REPLAY_BUFFER_SIZE,
        batch_size=BATCH_SIZE,
        gamma=GAMMA,
        learning_starts=LEARNING_STARTS,
        learning_freq=LEARNING_FREQ,
        frame_history_len=FRAME_HISTORY_LEN,
        target_update_freq=TARGET_UPDATE_FREQ
    )
def main(env):
    global args
    args = parser.parse_args()

    optimizer_spec = OptimizerSpec(
        constructor=optim.RMSprop,
        kwargs=dict(lr=LEARNING_RATE, alpha=ALPHA, eps=EPS),
    )

    exploration_schedule = LinearSchedule(1000000, 0.1)

    dqn_learing(
        env=env,
        q_func=DQN,
        checkpoint_path=args.checkpoint,
        optimizer_spec=optimizer_spec,
        exploration=exploration_schedule,
        stopping_criterion=None,
        replay_buffer_size=REPLAY_BUFFER_SIZE,
        batch_size=BATCH_SIZE,
        gamma=GAMMA,
        learning_starts=LEARNING_STARTS,
        learning_freq=LEARNING_FREQ,
        frame_history_len=FRAME_HISTORY_LEN,
        target_update_freq=TARGET_UPDATE_FREQ,
    )
def q2_run(num_timesteps):
    schedulers = {
        "no_explore": ConstantSchedule(0.1),
        "delayed_decay": PiecewiseSchedule(
            [(0, 1.0), (0.25e6, 1.0), (1.25e6, 0.1)], outside_value=0.1),
        "slower_decay": LinearSchedule(1500000, 0.1),
    }

    for name, exploration_schedule in schedulers.items():
        # Get Atari games.
        benchmark = gym.benchmark_spec('Atari40M')

        # Change the index to select a different game.
        task = benchmark.tasks[3]

        # Run training
        seed = 0  # Use a seed of zero (you may want to randomize the seed!)
        env = get_env(task, seed)
        env.reset()

        optimizer_spec = OptimizerSpec(
            constructor=optim.RMSprop,
            kwargs=dict(lr=LEARNING_RATE, alpha=ALPHA, eps=EPS))

        dqn_learning(
            env=env,
            q_func=DQN,
            runname=name,
            optimizer_spec=optimizer_spec,
            exploration=exploration_schedule,
            stopping_criterion=stopping_criterion2(num_timesteps),
            replay_buffer_size=REPLAY_BUFFER_SIZE,
            batch_size=BATCH_SIZE,
            gamma=GAMMA,
            learning_starts=LEARNING_STARTS,
            learning_freq=LEARNING_FREQ,
            frame_history_len=FRAME_HISTORY_LEN,
            target_update_freq=TARGET_UPDATE_FREQ
        )
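# For reference: a minimal sketch of the value(t) semantics the three exploration
# schedules above are assumed to have (modelled on the OpenAI Baselines schedules).
# The project's own utils.schedule classes may differ in constructor argument order
# and edge-case handling, so treat this as an illustration, not the actual code.
class ConstantSchedule:
    def __init__(self, value):
        self._v = value

    def value(self, t):
        # epsilon stays fixed for the whole run
        return self._v


class LinearSchedule:
    def __init__(self, schedule_timesteps, final_p, initial_p=1.0):
        self.schedule_timesteps = schedule_timesteps
        self.final_p = final_p
        self.initial_p = initial_p

    def value(self, t):
        # linearly anneal from initial_p to final_p over schedule_timesteps, then hold
        fraction = min(float(t) / self.schedule_timesteps, 1.0)
        return self.initial_p + fraction * (self.final_p - self.initial_p)


class PiecewiseSchedule:
    def __init__(self, endpoints, outside_value=None):
        self.endpoints = endpoints          # list of (t, value) pairs, sorted by t
        self.outside_value = outside_value

    def value(self, t):
        # linear interpolation between neighbouring endpoints; outside_value elsewhere
        for (l_t, l_v), (r_t, r_v) in zip(self.endpoints[:-1], self.endpoints[1:]):
            if l_t <= t < r_t:
                alpha = float(t - l_t) / (r_t - l_t)
                return l_v + alpha * (r_v - l_v)
        return self.outside_value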
def main(env, num_timesteps):

    def stopping_criterion(env):
        return get_wrapper_by_name(env, "Monitor").get_total_steps() >= num_timesteps

    optimizer_spec = OptimizerSpec(
        constructor=optim.RMSprop,
        kwargs=dict(lr=LEARNING_RATE, alpha=ALPHA, eps=EPS),
    )

    exploration_schedule = LinearSchedule(2000000, 0.05)

    dqn_learing(
        env=env,
        q_func=DQN,
        optimizer_spec=optimizer_spec,
        exploration=exploration_schedule,
        stopping_criterion=stopping_criterion,
        replay_buffer_size=REPLAY_BUFFER_SIZE,
        batch_size=BATCH_SIZE,
        gamma=GAMMA,
        learning_starts=LEARNING_STARTS,
        learning_freq=LEARNING_FREQ,
        frame_history_len=FRAME_HISTORY_LEN,
        target_update_freq=TARGER_UPDATE_FREQ,
    )
def main(env, num_timesteps, config):

    def stopping_criterion(env):
        # notice that here t is the number of steps of the wrapped env,
        # which is different from the number of steps in the underlying env
        return get_wrapper_by_name(env, "Monitor").get_total_steps() >= num_timesteps

    optimizer_spec = OptimizerSpec(
        constructor=optim.RMSprop,
        kwargs=dict(lr=LEARNING_RATE, alpha=ALPHA, eps=EPS),
    )

    exploration_schedule = LinearSchedule(1000000, 0.1)

    dqn_testing(
        config=config,
        env=env,
        q_func=VIN,
        exploration=exploration_schedule,
    )
def bonus_run(num_timesteps):

    def make_range_black(arr: np.ndarray, start, end):
        # blacks out the given column range in place (returns None)
        arr[:, start:end, :] = 0

    frame_filters = {
        "no_left_side": lambda x: make_range_black(x, 0, x.shape[1] // 4),
        "no_middle_side": lambda x: make_range_black(x, x.shape[1] // 4, x.shape[1] // 2),
    }

    for name, frame_filter in frame_filters.items():
        # Get Atari games.
        benchmark = gym.benchmark_spec('Atari40M')

        # Change the index to select a different game.
        task = benchmark.tasks[3]

        # Run training
        seed = 0  # Use a seed of zero (you may want to randomize the seed!)
        env = get_env(task, seed)
        env.reset()

        optimizer_spec = OptimizerSpec(
            constructor=optim.RMSprop,
            kwargs=dict(lr=LEARNING_RATE, alpha=ALPHA, eps=EPS))

        dqn_learning(
            env=env,
            q_func=DQN,
            runname=name,
            frame_filter=frame_filter,
            optimizer_spec=optimizer_spec,
            exploration=LinearSchedule(1000000, 0.1),
            stopping_criterion=stopping_criterion2(num_timesteps),
            replay_buffer_size=REPLAY_BUFFER_SIZE,
            batch_size=BATCH_SIZE,
            gamma=GAMMA,
            learning_starts=LEARNING_STARTS,
            learning_freq=LEARNING_FREQ,
            frame_history_len=FRAME_HISTORY_LEN,
            target_update_freq=TARGET_UPDATE_FREQ
        )
def main(config, env):
    """
    Run DQN on Atari
    :param config:
    :param env:
    :return:
    """
    FLAGS = update_tf_wrapper_args(args, utils.gatedpixelcnn_bonus.FLAGS)

    def stopping_criterion(env, t):
        # t := number of steps of wrapped env
        # different from number of steps in underlying env
        return get_wrapper_by_name(env, "Monitor").get_total_steps() >= \
            config.max_timesteps

    # optimizer_spec = OptimizerSpec(
    #     constructor=torch.optim.Adam,
    #     kwargs=dict(lr=config.learning_rate, eps=config.epsilon),
    # )
    optimizer_spec = OptimizerSpec(
        constructor=torch.optim.RMSprop,
        kwargs=dict(lr=config.learning_rate,
                    momentum=config.momentum,
                    eps=config.epsilon))

    exploration_schedule = LinearSchedule(1000000, 0.1)

    dqn_learn(
        env=env,
        q_func=DQN,
        optimizer_spec=optimizer_spec,
        density=PixelBonus,
        cnn_kwargs=FLAGS,
        config=config,
        exploration=exploration_schedule,
        stopping_criterion=stopping_criterion,
    )
def main(env):
    optimizer_spec = OptimizerSpec(
        constructor=optim.RMSprop,
        kwargs=dict(lr=LEARNING_RATE, alpha=ALPHA, eps=EPS),
    )

    exploration_schedule = LinearSchedule(1000000, 0.1)

    dqn_learing(
        env=env,
        q_func=DQN,
        optimizer_spec=optimizer_spec,
        exploration=exploration_schedule,
        replay_buffer_size=REPLAY_BUFFER_SIZE,
        batch_size=BATCH_SIZE,
        gamma=GAMMA,
        learning_starts=LEARNING_STARTS,
        learning_freq=LEARNING_FREQ,
        frame_history_len=FRAME_HISTORY_LEN,
        target_update_freq=TARGER_UPDATE_FREQ,
        num_actions1=num_actions1,
        num_actions2=num_actions2
    )
num_timesteps = task.max_timesteps


def stopping_criterion(env):
    # notice that here t is the number of steps of the wrapped env,
    # which is different from the number of steps in the underlying env
    return get_wrapper_by_name(env, "Monitor").get_total_steps() >= num_timesteps


optimizer_spec = OptimizerSpec(
    constructor=optim.RMSprop,
    kwargs=dict(lr=LEARNING_RATE, alpha=ALPHA, eps=EPS),
)

exploration_schedule = LinearSchedule(1000000, 0.1)

USE_CUDA = torch.cuda.is_available()
dtype = torch.cuda.FloatTensor if torch.cuda.is_available() else torch.FloatTensor


class Variable(autograd.Variable):
    def __init__(self, data, *args, **kwargs):
        if USE_CUDA:
            data = data.cuda()
        super(Variable, self).__init__(data, *args, **kwargs)


OptimizerSpec = namedtuple("OptimizerSpec", ["constructor", "kwargs"])
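# The stopping criteria in these snippets reach the Monitor wrapper through
# get_wrapper_by_name. A minimal sketch of how such a helper is commonly written
# (an assumption for illustration; the actual utils implementation may differ):
import gym


def get_wrapper_by_name(env, classname):
    # Walk the gym wrapper chain until a wrapper whose class name matches.
    current = env
    while True:
        if classname in current.__class__.__name__:
            return current
        elif isinstance(current, gym.Wrapper):
            current = current.env
        else:
            raise ValueError("Couldn't find wrapper named %s" % classname)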
def train_model(env,
                conv_layers,
                learning_rate=5e-4,
                total_timesteps=100000,
                buffer_size=50000,
                exploration_fraction=0.1,
                exploration_final_eps=0.02,
                train_freq=1,
                batch_size=32,
                print_freq=1,
                checkpoint_freq=100000,
                checkpoint_path=None,
                learning_starts=1000,
                gamma=1.0,
                target_network_update_freq=500,
                double_dqn=False,
                **network_kwargs) -> tf.keras.Model:
    """Train a DQN model.

    Parameters
    ----------
    env: gym.Env
        openai gym environment
    conv_layers: list
        a list of triples that defines the conv network
    learning_rate: float
        learning rate for the adam optimizer
    total_timesteps: int
        number of env steps to run the environment for
    buffer_size: int
        size of the replay buffer
    exploration_fraction: float
        fraction of the entire training period over which the exploration rate is annealed
    exploration_final_eps: float
        final value of random action probability
    train_freq: int
        update the model every `train_freq` steps
    batch_size: int
        size of a batch sampled from the replay buffer for training
    print_freq: int
        how often to print out training progress; set to None to disable printing
    checkpoint_freq: int
        how often to store a checkpoint during training
    checkpoint_path: str
        the fs path for storing the checkpoints
    learning_starts: int
        how many steps of the model to collect transitions for before learning starts
    gamma: float
        discount factor
    target_network_update_freq: int
        update the target network every `target_network_update_freq` steps
    double_dqn: bool
        specifies if double q-learning is used during training

    Returns
    -------
    dqn: an instance of tf.Module that contains the trained model
    """
    q_func = build_dueling_q_func(conv_layers, **network_kwargs)

    dqn = DeepQ(model_builder=q_func,
                observation_shape=env.observation_space.shape,
                num_actions=env.action_space.n,
                learning_rate=learning_rate,
                gamma=gamma,
                double_dqn=double_dqn)

    manager = None
    if checkpoint_path is not None:
        load_path = osp.expanduser(checkpoint_path)
        ckpt = tf.train.Checkpoint(model=dqn.q_network)
        manager = tf.train.CheckpointManager(ckpt, load_path, max_to_keep=5)
        ckpt.restore(manager.latest_checkpoint)
        print("Restoring from {}".format(manager.latest_checkpoint))

    current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    train_log_dir = 'logs/gradient_tape/' + current_time + '/train'
    train_summary_writer = tf.summary.create_file_writer(train_log_dir)

    # Create the replay buffer
    replay_buffer = ReplayMemory(buffer_size)

    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(total_timesteps=int(exploration_fraction * total_timesteps),
                                 initial_prob=1.0,
                                 final_prob=exploration_final_eps)

    dqn.update_target()

    episode_rewards = [0.0]
    obs = env.reset()
    obs = np.expand_dims(np.array(obs), axis=0)

    for t in range(total_timesteps):
        update_eps = exploration.step_to(t)
        action, _, _, _ = dqn.step(tf.constant(obs), update_eps=update_eps)
        action = action[0].numpy()
        new_obs, reward, done, _ = env.step(action)

        # Store transition in the replay buffer.
        new_obs = np.expand_dims(np.array(new_obs), axis=0)
        replay_buffer.add(obs[0], action, reward, new_obs[0], float(done))
        obs = new_obs

        episode_rewards[-1] += reward
        if done:
            obs = env.reset()
            obs = np.expand_dims(np.array(obs), axis=0)
            episode_rewards.append(0.0)

        if t > learning_starts and t % train_freq == 0:
            # Minimize the error in Bellman's equation on a batch sampled from the replay buffer.
            obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size)
            weights, _ = tf.ones_like(rewards), None
            td_loss = dqn.train(obses_t, actions, rewards, obses_tp1, dones, weights)

        if t > learning_starts and t % target_network_update_freq == 0:
            # Update target network every target_network_update_freq steps
            dqn.update_target()

        reward_100_mean = np.round(np.mean(episode_rewards[-101:-1]), 1)
        number_episodes = len(episode_rewards) - 1
        if done and print_freq is not None and number_episodes % print_freq == 0:
            format_str = "Steps: {}, Episodes: {}, 100 ep reward average: {}, Reward: {}, Epsilon-greedy %explore: {}"
            print(format_str.format(t, number_episodes, reward_100_mean,
                                    episode_rewards[-2],
                                    int(100 * exploration.value(t))))
            with train_summary_writer.as_default():
                tf.summary.scalar('loss', dqn.train_loss_metrics.result(), step=t)
                tf.summary.scalar('reward', episode_rewards[-2], step=t)

        if checkpoint_path is not None and t % checkpoint_freq == 0:
            manager.save()

        # Every training step, reset the loss metric
        dqn.train_loss_metrics.reset_states()

    return dqn.q_network
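# Illustrative only: a hypothetical call to train_model on an Atari-style env.
# The (filters, kernel_size, stride) triples passed as conv_layers and the env id
# are assumptions for this sketch, not part of the function above.
if __name__ == "__main__":
    import gym

    env = gym.make("PongNoFrameskip-v4")
    q_network = train_model(
        env,
        conv_layers=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],  # assumed triple format
        total_timesteps=1_000_000,
        buffer_size=100_000,
        double_dqn=True,
    )
    q_network.save_weights("pong_dqn_weights")  # returned object is a tf.keras.Model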
def dqn_learing(
        env,
        q_func,
        optimizer_spec,
        exploration=LinearSchedule(1000000, 0.1),
        stopping_criterion=None,
        replay_buffer_size=1000000,
        batch_size=32,
        gamma=0.99,
        learning_starts=50000,
        learning_freq=4,
        frame_history_len=4,
        target_update_freq=10000,
        grad_norm_clipping=10):
    """Run Deep Q-learning algorithm.

    You can specify your own convnet using q_func.
    All schedules are w.r.t. total number of steps taken in the environment.

    Parameters
    ----------
    env: gym.Env
        gym environment to train on.
    q_func: function
        Model to use for computing the q function. It should accept the
        following named arguments:
            input_channel: int
                number of channels of the input.
            num_actions: int
                number of actions
    optimizer_spec: OptimizerSpec
        Specifying the constructor and kwargs, as well as learning rate schedule
        for the optimizer
    exploration: Schedule (defined in utils.schedule)
        schedule for probability of choosing random action.
    stopping_criterion: (env, t) -> bool
        should return true when it's ok for the RL algorithm to stop.
        takes in env and the number of steps executed so far.
    replay_buffer_size: int
        How many memories to store in the replay buffer.
    batch_size: int
        How many transitions to sample each time experience is replayed.
    gamma: float
        Discount Factor
    learning_starts: int
        After how many environment steps to start replaying experiences
    learning_freq: int
        How many steps of environment to take between every experience replay
    frame_history_len: int
        How many past frames to include as input to the model.
    target_update_freq: int
        How many experience replay rounds (not steps!) to perform between
        each update to the target Q network
    grad_norm_clipping: float or None
        If not None gradients' norms are clipped to this value.
    """
    assert type(env.observation_space) == gym.spaces.Box
    assert type(env.action_space) == gym.spaces.Discrete

    ###############
    # BUILD MODEL #
    ###############

    if len(env.observation_space.shape) == 1:
        # This means we are running on low-dimensional observations (e.g. RAM)
        input_arg = env.observation_space.shape[0]
    else:
        img_h, img_w, img_c = env.observation_space.shape
        input_arg = frame_history_len * img_c
    num_actions = env.action_space.n

    # Construct an epsilon greedy policy with given exploration schedule
    def select_epilson_greedy_action(model, obs, t):
        sample = random.random()
        eps_threshold = exploration.value(t)
        if sample > eps_threshold:
            obs = torch.from_numpy(obs).type(dtype).unsqueeze(0) / 255.0
            # Use volatile = True if variable is only used in inference mode,
            # i.e. don't save the history
            return model(Variable(obs, volatile=True)).data.max(1)[1].cpu()
        else:
            return torch.IntTensor([[random.randrange(num_actions)]])

    # Initialize target q function and q function
    Q = q_func(input_arg, num_actions).type(dtype)
    target_Q = q_func(input_arg, num_actions).type(dtype)

    # Construct Q network optimizer
    # optimizer_func = construct_optimizer_func(Q, optimizer_spec)
    optimizer = torch.optim.Adam(Q.parameters())

    # Construct the replay buffer
    replay_buffer = ReplayBuffer(replay_buffer_size, frame_history_len)

    ###############
    # RUN ENV     #
    ###############
    num_param_updates = 0
    mean_episode_reward = -float('nan')
    best_mean_episode_reward = -float('inf')
    last_obs = env.reset()
    LOG_EVERY_N_STEPS = 10000

    for t in count():
        ### Check stopping criterion
        if stopping_criterion is not None and stopping_criterion(env):
            break

        ### Step the env and store the transition
        # Store latest observation in replay memory; last_idx can be used to
        # store action, reward, done
        last_idx = replay_buffer.store_frame(last_obs)
        # encode_recent_observation will take the latest observation
        # that you pushed into the buffer and compute the corresponding
        # input that should be given to a Q network by appending some
        # previous frames.
        # recent_observations: shape (img_h, img_w, frame_history_len) are input to the model
        recent_observations = replay_buffer.encode_recent_observation().transpose(2, 0, 1)

        # Choose random action if not yet started learning
        if t > learning_starts:
            action = select_epilson_greedy_action(Q, recent_observations, t)[0, 0]
        else:
            action = random.randrange(num_actions)
        # Advance one step
        obs, reward, done, _ = env.step(action)
        replay_buffer.store_effect(last_idx, action, reward, done)
        # Resets the environment when reaching an episode boundary.
        if done:
            obs = env.reset()
        last_obs = obs

        ### Perform experience replay and train the network.
        # Note that this is only done if the replay buffer contains enough samples
        # for us to learn something useful -- until then, the model will not be
        # initialized and random actions should be taken
        if (t > learning_starts and t % learning_freq == 0
                and replay_buffer.can_sample(batch_size)):
            # Use the replay buffer to sample a batch of transitions
            # Note: done_mask[i] is 1 if the next state corresponds to the end of an episode,
            # in which case there is no Q-value at the next state; at the end of an
            # episode, only the current state reward contributes to the target
            obs_batch, act_batch, rew_batch, next_obs_batch, done_mask = replay_buffer.sample(batch_size)
            # Convert numpy nd_array to torch variables for calculation
            obs_batch = Variable(
                torch.from_numpy(obs_batch.transpose(0, 3, 1, 2)).type(dtype) / 255.0)
            act_batch = Variable(torch.from_numpy(act_batch).long())
            rew_batch = Variable(torch.from_numpy(rew_batch))
            next_obs_batch = Variable(
                torch.from_numpy(next_obs_batch.transpose(0, 3, 1, 2)).type(dtype) / 255.0)
            done_mask = torch.from_numpy(done_mask)
            if USE_CUDA:
                act_batch = act_batch.cuda()
                rew_batch = rew_batch.cuda()
                done_mask = done_mask.cuda()

            # Compute current Q value; q_func takes only the state and outputs a value
            # for every state-action pair. We choose Q based on the action taken.
            current_Q_values = Q(obs_batch).gather(1, act_batch.unsqueeze(1))
            # Compute next Q value, based on which action gives max Q values
            next_max_Q_values = Variable(torch.zeros(batch_size).type(dtype))
            # Detach variable from the current graph since we don't want gradients to be propagated
            next_max_Q_values[done_mask == 0] = target_Q(next_obs_batch).detach().max(1)[0]
            # Compute Bellman error, use huber loss to mitigate outlier impact
            target_Q_values = rew_batch + (gamma * next_max_Q_values)
            bellman_error = F.smooth_l1_loss(current_Q_values, target_Q_values)
            # Clear previous gradients (the optimizer was already constructed above;
            # the original re-created it via an undefined optimizer_func, which raised a NameError)
            optimizer.zero_grad()
            # run backward pass and clip the gradient
            bellman_error.backward()
            nn.utils.clip_grad_norm(Q.parameters(), grad_norm_clipping)
            # Perform the update
            optimizer.step()
            num_param_updates += 1

            # Periodically copy the Q network weights to the target Q network
            if num_param_updates % target_update_freq == 0:
                target_Q.load_state_dict(Q.state_dict())

        ### 4. Log progress
        episode_rewards = get_wrapper_by_name(env, "Monitor").get_episode_rewards()
        if len(episode_rewards) > 0:
            mean_episode_reward = np.mean(episode_rewards[-100:])
            if len(episode_rewards) > 100:
                best_mean_episode_reward = max(best_mean_episode_reward, mean_episode_reward)
        if t % LOG_EVERY_N_STEPS == 0 and t > learning_starts:
            print("Timestep %d" % (t,))
            print("mean reward (100 episodes) %f" % mean_episode_reward)
            print("best mean reward %f" % best_mean_episode_reward)
            print("episodes %d" % len(episode_rewards))
            print("exploration %f" % exploration.value(t))
            print("learning_rate %f" % optimizer_spec.lr_schedule.value(t))
            sys.stdout.flush()
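# The Variable(..., volatile=True) pattern above is from pre-0.4 PyTorch. A minimal,
# self-contained sketch of the same target/Huber-loss update in current PyTorch
# (hypothetical helper with a generic Q network; not the project's code):
import torch
import torch.nn.functional as F


def td_update(Q, target_Q, optimizer, batch, gamma=0.99, grad_clip=10.0):
    # batch: tensors (obs, act, rew, next_obs, done) with a leading batch dimension
    obs, act, rew, next_obs, done = batch
    q_sa = Q(obs).gather(1, act.long().unsqueeze(1)).squeeze(1)
    with torch.no_grad():  # replaces volatile=True / manual .detach() bookkeeping
        next_max = target_Q(next_obs).max(1)[0]
        target = rew + gamma * (1.0 - done.float()) * next_max
    loss = F.smooth_l1_loss(q_sa, target)  # Huber loss on the Bellman error
    optimizer.zero_grad()
    loss.backward()
    torch.nn.utils.clip_grad_norm_(Q.parameters(), grad_clip)
    optimizer.step()
    return loss.item()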
env.seed(SEED)
torch.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)

env = wrap_deepmind(env)
env = JoypadSpace(env, COMPLEX_MOVEMENT)

expt_dir = 'Game_play3'
env = wrappers.Monitor(env, expt_dir, force=True, video_callable=False)

optimizer_spec = OptimizerSpec(
    constructor=optim.RMSprop,
    kwargs=dict(lr=LEARNING_RATE, alpha=ALPHA, eps=EPS),
)

exploration_schedule = LinearSchedule(2000000, 0.05, 0.05)
annelation_schedule = LinearSchedule(2000000, 1.0, 0.4)

# recollect_experience(env2, DQN)

dqfd_learn(
    env=env,
    q_func=DQN,
    optimizer_spec=optimizer_spec,
    exploration=exploration_schedule,
    replay_buffer_size=REPLAY_BUFFER_SIZE,
    batch_size=BATCH_SIZE,
    gamma=GAMMA,
    learning_starts=LEARNING_STARTS,
    learning_freq=LEARNING_FREQ,
    alpha=ALPHA_P,
    annelation=annelation_schedule,
plt.style.use('ggplot')

NUM_EPISODES = 12000
BATCH_SIZE = 128
GAMMA = 1.0
REPLAY_MEMORY_SIZE = 1000000
LEARNING_RATE = 0.00025
ALPHA = 0.95
EPS = 0.01

optimizer_spec = OptimizerSpec(
    constructor=optim.RMSprop,
    kwargs=dict(lr=LEARNING_RATE, alpha=ALPHA, eps=EPS),
)

exploration_schedule = LinearSchedule(50000, 0.1, 1)

agent = hDQN(
    optimizer_spec=optimizer_spec,
    replay_memory_size=REPLAY_MEMORY_SIZE,
    batch_size=BATCH_SIZE,
)

env = StochasticMDPEnv()

agent, stats, visits = hdqn_learning(
    env=env,
    agent=agent,
    num_episodes=NUM_EPISODES,
    exploration_schedule=exploration_schedule,
    gamma=GAMMA,
def atari_learn(env, args, num_timesteps):
    logdir = os.path.join('data', args.exp_name)
    num_iterations = float(num_timesteps) / 4.0

    # lr_multiplier = 1.0
    # lr_schedule = PiecewiseSchedule([
    #     (0, 1e-4 * lr_multiplier),
    #     (num_iterations / 10, 1e-4 * lr_multiplier),
    #     (num_iterations / 2, 5e-5 * lr_multiplier),
    # ],
    #     outside_value=5e-5 * lr_multiplier)
    # optimizer = dqn.OptimizerSpec(
    #     constructor=tf.train.AdamOptimizer,
    #     kwargs=dict(epsilon=1e-4),
    #     lr_schedule=lr_schedule
    # )

    def stopping_criterion(env):
        return get_wrapper_by_name(env, "Monitor").get_total_steps() >= num_timesteps

    # optimizer_spec = OptimizerSpec(
    #     constructor=optim.RMSprop,
    #     kwargs=dict(lr=LEARNING_RATE, alpha=ALPHA, eps=EPS),
    # )
    optimizer_spec = OptimizerSpec(
        constructor=optim.Adam,
        kwargs=dict(lr=LEARNING_RATE),
    )

    exploration_schedule = LinearSchedule(30000, 0.01)
    # exploration_schedule = PiecewiseSchedule(
    #     [
    #         (0, 1.0),
    #         (1e6, 0.1),
    #         (num_iterations / 2, 0.01),
    #     ], outside_value=0.01
    # )

    logz.configure_output_dir(logdir)

    if args.dueling:
        dqn_learning(
            env=env,
            method=args.method,
            game=args.env,
            q_func=Dueling_DQN,
            optimizer_spec=optimizer_spec,
            exploration=exploration_schedule,
            stopping_criterion=stopping_criterion,
            replay_buffer_size=REPLAY_BUFFER_SIZE,
            batch_size=args.batch_size,
            gamma=args.gamma,
            learning_starts=LEARNING_STARTS,
            learning_freq=LEARNING_FREQ,
            frame_history_len=FRAME_HISTORY_LEN,
            target_update_freq=TARGET_UPDATE_FREQ,
            double=args.double,
            dueling=args.dueling,
            logdir=logdir,
            svrl=args.svrl,
            me_type=args.me_type,
            maskp=args.maskp,
            maskstep=args.maskstep,
            maskscheduler=args.maskscheduler,
        )
    else:
        dqn_learning(
            env=env,
            method=args.method,
            game=args.env,
            q_func=DQN,
            optimizer_spec=optimizer_spec,
            exploration=exploration_schedule,
            stopping_criterion=stopping_criterion,
            replay_buffer_size=REPLAY_BUFFER_SIZE,
            batch_size=args.batch_size,
            gamma=args.gamma,
            learning_starts=LEARNING_STARTS,
            learning_freq=LEARNING_FREQ,
            frame_history_len=FRAME_HISTORY_LEN,
            target_update_freq=TARGET_UPDATE_FREQ,
            double=args.double,
            dueling=args.dueling,
            logdir=logdir,
            svrl=args.svrl,
            me_type=args.me_type,
            maskp=args.maskp,
            maskstep=args.maskstep,
            maskscheduler=args.maskscheduler,
        )
    env.close()
def dqn_learn(env,
              exploration=LinearSchedule(EPOCHS // 2, 0.1),
              optimizer_spec=optimizer):
    # Initialization
    Q = DQN(STATE_VEC_DIM, DQN_HIDDEN_DIM1, DQN_HIDDEN_DIM2, NUM_ACTIONS)
    Q_target = DQN(STATE_VEC_DIM, DQN_HIDDEN_DIM1, DQN_HIDDEN_DIM2, NUM_ACTIONS)
    optimizer = optimizer_spec.constructor(Q.parameters(), **optimizer_spec.kwargs)
    replay_buffer = deque()
    loss_func = torch.nn.MSELoss()
    num_param_updates = 0

    for epoch_id in range(EPOCHS):
        print("\n ###### Epoch: %s/%s ######" % (epoch_id, EPOCHS))
        start = time.time()
        total_loss = 0

        # Reset the environment; obs is a list
        obs = env.reset()

        while True:
            # Select an action (epsilon-greedy)
            sample = random.random()
            threshold = exploration.value(epoch_id)
            if sample > threshold:
                observation = torch.tensor(obs).unsqueeze(0).type(DTYPE)  # observation is a tensor
                value = Q(observation).cpu().data.numpy()
                action = value.argmax(-1)[0]
            else:
                action = np.random.randint(NUM_ACTIONS)

            # Execute the action
            reward, new_obs, done, _ = env.step(action)
            replay_buffer.append((obs, action, reward, new_obs, done))
            if len(replay_buffer) > REPLAY_SIZE:
                replay_buffer.popleft()
            obs = new_obs

            if len(replay_buffer) > BATCH_SIZE:
                # Experience replay: first prepare the input data
                minibatch = random.sample(replay_buffer, BATCH_SIZE)
                state_batch = [data[0] for data in minibatch]
                action_batch = [data[1] for data in minibatch]
                reward_batch = [data[2] for data in minibatch]
                next_state_batch = [data[3] for data in minibatch]
                done_batch = [data[4] for data in minibatch]

                # The first dimension is batch_size
                state_tensor = Variable(torch.tensor(state_batch).type(DTYPE))
                action_tensor = Variable(torch.tensor(action_batch).type(DLONGTYPE))
                reward_tensor = Variable(torch.tensor(reward_batch).type(DTYPE))
                next_state_tensor = Variable(torch.tensor(next_state_batch).type(DTYPE))
                done_tensor = Variable(torch.tensor(done_batch).type(DTYPE))

                # Estimates from the Q network
                q_values = Q(state_tensor)
                # action_tensor has shape [32]; action_tensor.unsqueeze(1) is [32, 1];
                # q_values is [32, 19]; gather picks out the Q value of the action taken
                q_s_a = q_values.gather(1, action_tensor.unsqueeze(1))
                # q_s_a becomes [32]
                q_s_a = q_s_a.squeeze()

                # Target value: .max(1) takes the max over the second dimension and
                # returns both the values and their indices, so [0] selects the max
                # action value. Q_target(next_state_tensor).max(1)[0] has shape [batch_size].
                target_v = reward_tensor + GAMMA * (
                    1 - done_tensor) * Q_target(next_state_tensor).detach().max(1)[0]

                loss = loss_func(q_s_a, target_v)
                total_loss += loss.item()
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                num_param_updates += 1
                if num_param_updates % REPLACE_TARGET_FREQ == 0:
                    Q_target.load_state_dict(Q.state_dict())

            if done:
                break

        end = time.time()
        print("Epoch %s Time: %.2f s Total Loss: %.2f" % (epoch_id, end - start, total_loss))

        # Periodically evaluate the greedy policy (here every 5 epochs)
        if epoch_id % 5 == 0:
            print("--------------------------------------------------------")
            print("-------------- Entering evaluation phase")
            print("--------------------------------------------------------")
            obs = env.reset(False)
            while True:
                observation = torch.tensor(obs).unsqueeze(0).type(DTYPE)
                value = Q(observation).cpu().data.numpy()
                action = value.argmax(-1)[0]
                reward, new_obs, done, info = env.step(action, False)
                obs = new_obs
                if done:
                    gold_results = info[0]
                    pred_results = info[1]
                    break
            acc, p, r, f = get_ner_fmeasure(gold_results, pred_results)
            print("acc: %.4f, p: %.4f, r: %.4f, f: %.4f; \n" % (acc, p, r, f))
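# The DQN class used above is not shown in this snippet. A minimal two-hidden-layer
# MLP that matches its constructor signature DQN(state_dim, hidden1, hidden2,
# num_actions) might look like this (a hypothetical sketch, not the original model):
import torch.nn as nn


class DQN(nn.Module):
    def __init__(self, state_dim, hidden1, hidden2, num_actions):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(state_dim, hidden1),
            nn.ReLU(),
            nn.Linear(hidden1, hidden2),
            nn.ReLU(),
            nn.Linear(hidden2, num_actions),  # one Q value per action
        )

    def forward(self, x):
        return self.net(x)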
action_dim = env.action_space.n
kwargs = {
    "action_dim": action_dim,
    "discount": args.discount,
    "gradient_clip": args.gradient_clip,
}

# Initialize policy
# ----------------------------------------------
if args.policy == "DQN":
    kwargs["policy_freq"] = int(args.policy_freq) // int(args.num_envs)
    kwargs["learning_rate"] = 1e-4
    policy = DQN.DQN(**kwargs)
    eps_schedule = LinearSchedule(1.0, 0.01, 1e6)  # annealing epsilon
    args.batch_size = 64
elif args.policy == "Double_DQN":
    kwargs["policy_freq"] = int(args.policy_freq) // int(args.num_envs)
    kwargs["learning_rate"] = 1e-4
    policy = Double_DQN.DoubleDQN(**kwargs)
    eps_schedule = LinearSchedule(1.0, 0.01, 1e6)  # annealing epsilon
    args.batch_size = 64
# ----------------------------------------------
elif args.policy == "Dueling_DQN":
    kwargs["policy_freq"] = int(args.policy_freq) // int(args.num_envs)
    kwargs["learning_rate"] = 1e-4
    policy = Dueling_DQN.DuelingDQN(**kwargs)
    eps_schedule = LinearSchedule(1.0, 0.01, 1e6)  # annealing epsilon
elif args.policy == "Dueling_Double_DQN":
    kwargs["policy_freq"] = int(args.policy_freq) // int(args.num_envs)
def main(env, num_timesteps):
    # Change the index to select a different game.
    task = benchmark.tasks[3]

    # Run training
    seed = random.randint(0, 100)  # Pick a random seed between 0 and 100
    env = get_env(task, seed)

    def stopping_criterion(env):
        # notice that here t is the number of steps of the wrapped env,
        # which is different from the number of steps in the underlying env
        return get_wrapper_by_name(env, "Monitor").get_total_steps() >= num_timesteps

    optimizer_spec = OptimizerSpec(
        constructor=optim.RMSprop,
        kwargs=dict(lr=LEARNING_RATE, alpha=ALPHA, eps=EPS),
    )

    exploration_schedule = LinearSchedule(1000000, 0.1)

    # empty dict to hold all results
    Stats = {}
    new_lr = 0.001
    new_gamma = 0.999
    exploration_sches = [LinearSchedule(1000000, 0.1),
                         ConstantSchedule(0.05),
                         ConstantSchedule(0.15),
                         LinearSchedule(500000, 0.05)]

    optimizer_spec = OptimizerSpec(
        constructor=optim.RMSprop,
        kwargs=dict(lr=new_lr, alpha=ALPHA, eps=EPS),
    )
    env = get_env(task, seed)
    Stats["lr=0.001, gamma=0.999"] = dqn_learing(
        env=env,
        q_func=DQN,
        optimizer_spec=optimizer_spec,
        exploration=exploration_schedule,
        stopping_criterion=stopping_criterion,
        replay_buffer_size=REPLAY_BUFFER_SIZE,
        batch_size=BATCH_SIZE,
        gamma=new_gamma,
        learning_starts=LEARNING_STARTS,
        learning_freq=LEARNING_FREQ,
        frame_history_len=FRAME_HISTORY_LEN,
        target_update_freq=TARGER_UPDATE_FREQ,
        feature_tested="lr=0.001, gamma=0.999"
    )

    optimizer_spec = OptimizerSpec(
        constructor=optim.RMSprop,
        kwargs=dict(lr=LEARNING_RATE, alpha=ALPHA, eps=EPS),
    )
    env = get_env(task, seed)
    Stats["Default"] = dqn_learing(
        env=env,
        q_func=DQN,
        optimizer_spec=optimizer_spec,
        exploration=exploration_schedule,
        stopping_criterion=stopping_criterion,
        replay_buffer_size=REPLAY_BUFFER_SIZE,
        batch_size=BATCH_SIZE,
        gamma=GAMMA,
        learning_starts=LEARNING_STARTS,
        learning_freq=LEARNING_FREQ,
        frame_history_len=FRAME_HISTORY_LEN,
        target_update_freq=TARGER_UPDATE_FREQ,
        feature_tested=""
    )

    plt.clf()
    plt.xlabel('Timesteps')
    plt.ylabel('Mean Reward (past 100 episodes)')
    num_items = len(Stats["lr=0.001, gamma=0.999"]["mean_episode_rewards"])
    plt.plot(range(num_items),
             Stats["lr=0.001, gamma=0.999"]["mean_episode_rewards"],
             label="lr=0.001, gamma=0.999")
    num_items = len(Stats["Default"]["mean_episode_rewards"])
    plt.plot(range(num_items),
             Stats["Default"]["mean_episode_rewards"],
             label="Default")
    plt.legend()
    plt.title("Performance")
    plt.savefig('Final-Performance.png')
def q_learning(env,
               num_episodes,
               discount_factor=1.0,
               lr=0.00025,
               exploration_schedule=LinearSchedule(50000, 0.1, 1.0)):
    """
    Q-Learning algorithm: Off-policy TD control.
    Finds the optimal greedy policy while following an epsilon-greedy policy.

    Args:
        env: OpenAI environment.
        num_episodes: Number of episodes to run for (must be divisible by 1000), e.g. 12000.
        discount_factor: Gamma, the time-discount factor.
        lr: TD learning rate.
        exploration_schedule: Schedule (defined in utils.schedule)
            schedule for probability of choosing a random action.

    Returns:
        A tuple (Q, stats, visits).
        Q is the optimal action-value function, a dictionary mapping state -> action values.
        stats is an EpisodeStats object with two numpy arrays for episode_lengths and episode_rewards.
        visits is a 2D array counting how many times each state is visited in every 1000 episodes.
    """
    # The final action-value function.
    # A nested dictionary that maps state -> (action -> action-value).
    Q = defaultdict(lambda: np.zeros(env.nA))

    # Keep track of useful statistics
    stats = plotting.EpisodeStats(
        episode_lengths=np.zeros(num_episodes),
        episode_rewards=np.zeros(num_episodes))

    n_thousand_episode = int(np.floor(num_episodes / 1000))
    visits = np.zeros((n_thousand_episode, env.nS))
    total_timestep = 0

    for i_thousand_episode in range(n_thousand_episode):
        for i_episode in range(1000):
            current_state = env.reset()
            visits[i_thousand_episode][current_state - 1] += 1

            # Keep track of the number of time-steps per episode, only for plotting
            for t in itertools.count():
                total_timestep += 1
                # Get the annealed exploration rate (epsilon) from exploration_schedule
                epsilon = exploration_schedule.value(total_timestep)
                # Improve the epsilon-greedy policy using the latest updated Q
                policy = make_epsilon_greedy_policy(Q, epsilon, env.nA)

                # Choose the action based on the epsilon-greedy policy
                action_probs = policy(current_state)
                action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
                next_state, reward, done, _ = env.step(action)
                visits[i_thousand_episode][next_state - 1] += 1

                # Use the greedy action to evaluate Q, not the one we actually follow
                greedy_next_action = Q[next_state].argmax()
                # Evaluate Q using the estimated action value of (next_state, greedy_next_action)
                td_target = reward + discount_factor * Q[next_state][greedy_next_action]
                td_error = td_target - Q[current_state][action]
                Q[current_state][action] += lr * td_error

                # Update statistics
                stats.episode_rewards[i_thousand_episode * 1000 + i_episode] += reward
                stats.episode_lengths[i_thousand_episode * 1000 + i_episode] = t

                if done:
                    break
                else:
                    current_state = next_state

    return Q, stats, visits
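# Illustrative usage of q_learning (an assumption for the sketch: any discrete env
# exposing nS, nA, reset(), and step(), such as the StochasticMDPEnv used elsewhere
# in these snippets; the exact attributes of that env are not shown here):
env = StochasticMDPEnv()
Q, stats, visits = q_learning(
    env,
    num_episodes=12000,
    discount_factor=1.0,
    lr=0.00025,
    exploration_schedule=LinearSchedule(50000, 0.1, 1.0))
print("Greedy action in state 1:", Q[1].argmax())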
def dqn_learn(env,
              q_func,
              optimizer_spec,
              density,
              cnn_kwargs,
              config,
              exploration=LinearSchedule(1000000, 0.1),
              stopping_criterion=None):
    """
    Run Deep Q-learning algorithm.
    """
    # this is just to make sure that you're operating in the correct environment
    assert type(env.observation_space) == gym.spaces.Box
    assert type(env.action_space) == gym.spaces.Discrete

    ###############
    # BUILD MODEL #
    ###############
    if len(env.observation_space.shape) == 1:
        # This means we are running on low-dimensional observations (e.g. RAM)
        input_shape = env.observation_space.shape
    else:
        img_h, img_w, img_c = env.observation_space.shape
        input_shape = (img_h, img_w, config.frame_history_len * img_c)
    num_actions = env.action_space.n

    # define Q network and target network (instantiate 2 DQN's)
    in_channel = input_shape[-1]
    Q = q_func(in_channel, num_actions)
    target_Q = deepcopy(Q)

    # define C network and target C
    C = q_func(in_channel, num_actions)
    target_C = deepcopy(C)

    # call tensorflow wrapper to get density model
    if config.bonus:
        tf_config = tf.ConfigProto(allow_soft_placement=True,
                                   log_device_placement=False)
        tf_config.gpu_options.allow_growth = True
        sess = tf.Session(config=tf_config)
        pixel_bonus = density(cnn_kwargs, sess, num_actions)
        tf.initialize_all_variables().run(session=sess)

    if USE_CUDA:
        Q.cuda()
        target_Q.cuda()
        C.cuda()
        target_C.cuda()

    # define eps-greedy exploration strategy
    def select_action(model, bonus_model, obs, t):
        """
        Selects a random action with probability eps; otherwise returns the best action.
        :param exploration:
        :param t:
        :return:
        """
        def get_best_action(obs):
            obs = torch.from_numpy(obs).type(FloatTensor).unsqueeze(0) / 255.0
            Q_val = model(Variable(obs, volatile=True))
            C_val = bonus_model(Variable(obs, volatile=True))
            b = C_val
            if config.gaussian_ts:
                # Thompson-sampling style bonus: draw from a zero-mean Gaussian scaled
                # by the count network output (.sample() added here; the original
                # multiplied the distribution object itself, which raises a TypeError)
                b = config.alpha * torch.distributions.normal.Normal(0, C_val).sample()
            return (Q_val + b).data.max(1)[1].view(1, 1)

        if config.egreedy_exploration:
            sample = random.random()
            eps_threshold = exploration.value(t)
            if sample > eps_threshold:
                return get_best_action(obs)
            else:
                # return random action
                return LongTensor([[random.randrange(num_actions)]])
        # no exploration; just take best action
        else:
            return get_best_action(obs)

    # construct torch optimizer
    optimizer = optimizer_spec.constructor(Q.parameters(), **optimizer_spec.kwargs)
    # C optimizer
    C_optimizer = optimizer_spec.constructor(C.parameters(), **optimizer_spec.kwargs)

    # construct the replay buffer
    if config.mmc:
        replay_buffer = MMCReplayBuffer(config.replay_buffer_size, config.frame_history_len)
    else:
        replay_buffer = ReplayBuffer(config.replay_buffer_size, config.frame_history_len)

    ###############
    # RUN ENV     #
    ###############
    num_param_updates = 0
    mean_episode_reward = -float('nan')
    best_mean_episode_reward = -float('inf')
    last_obs = env.reset()
    prev = time.time()

    # index trackers for updating mc returns
    episode_indices_in_buffer = []
    reward_each_timestep = []
    timesteps_in_buffer = []
    cur_timestep = 0

    # t denotes frames
    for t in itertools.count():
        ### 1. Check stopping criterion
        if stopping_criterion is not None and stopping_criterion(env, t):
            break

        ### 2. Step the env and store the transition
        # process last_obs to include context from previous frame
        last_idx = replay_buffer.store_frame(last_obs)
        # record where this is in the buffer
        episode_indices_in_buffer.append(last_idx)
        timesteps_in_buffer.append(cur_timestep)
        # one more step in episode
        cur_timestep += 1

        # take latest observation pushed into buffer and compute corresponding input
        # that should be given to a Q network by appending some previous frames
        recent_obs = replay_buffer.encode_recent_observation()
        # recent_obs.shape is also (84, 84, 4)

        # choose random action if not yet started learning
        if t > config.learning_starts:
            action = select_action(Q, C, recent_obs, t)[0][0]
        else:
            action = random.randrange(num_actions)

        # advance one step
        obs, reward, done, _ = env.step(action)
        # clip reward to be in [-1, +1]
        reward = max(-1.0, min(reward, 1.0))

        ###############################################
        # do density model stuff here
        if config.bonus:  # just assume this is true
            intrinsic_reward = pixel_bonus.bonus(obs, action, t, num_actions)
            if t % config.log_freq == 0:
                logging.info('t: {}\t intrinsic reward: {}'.format(t, intrinsic_reward))
                curr = time.time()
                diff = curr - prev
                prev = curr
                logging.info("Timestep %d" % (t,))
                logging.info("Time elapsed %f" % diff)
                # utils.save_image(pixel_bonus.sample_images(img_dim**2),
                #                  'images/iteration_{}.png'.format(t), nrow=img_dim, padding=0)
                # pixel_bonus.sample_images(3, t)
                # utils.save_image(frame / 8., 'images/obs_{}.png'.format(t), padding=0)
            bonus = intrinsic_reward
            # TODO: add bonus/intrinsic_reward to replay buffer
            pixel_bonus.writer.add_scalar('data/bonus', bonus, t)
            # add intrinsic reward to clipped reward
            # NOTE: don't add bonus since we separate Q and C
            reward += intrinsic_reward
            # clip reward to be in [-1, +1] once again
            reward = max(-1.0, min(reward, 1.0))
            assert -1.0 <= reward <= 1.0
        ################################################

        # store reward in list to use for calculating MMC update
        reward_each_timestep.append(reward)
        replay_buffer.store_effect(last_idx, action, reward, done, bonus)

        # reset environment when reaching episode boundary
        if done:
            # only if computing MC return
            if config.mmc:
                # episode has terminated --> need to do MMC update here
                # loop through all transitions of this past episode and add in mc_returns
                print(len(timesteps_in_buffer), len(reward_each_timestep))
                assert len(timesteps_in_buffer) == len(reward_each_timestep)
                mc_returns = np.zeros(len(timesteps_in_buffer))

                # compute mc returns
                r = 0
                for i in reversed(range(len(mc_returns))):
                    r = reward_each_timestep[i] + config.gamma * r
                    mc_returns[i] = r

                # populate replay buffer
                for j in range(len(mc_returns)):
                    # get transition tuple in reward buffer and update
                    update_idx = episode_indices_in_buffer[j]
                    # put mmc return back into replay buffer
                    replay_buffer.mc_return_t[update_idx] = mc_returns[j]

                # reset because end of episode
                episode_indices_in_buffer = []
                timesteps_in_buffer = []
                cur_timestep = 0
                reward_each_timestep = []

            # reset
            obs = env.reset()
        last_obs = obs

        ### 3. Perform experience replay and train the network.
        # note that this is only done if the replay buffer contains enough samples
        # for us to learn something useful -- until then, the model will not be
        # initialized and random actions should be taken
        if (t > config.learning_starts and t % config.learning_freq == 0
                and replay_buffer.can_sample(config.batch_size)):
            # sample batch of transitions
            if config.mmc:
                # also grab MMC batch if computing MMC return
                obs_batch, act_batch, rew_batch, next_obs_batch, bonus_batch, done_mask, mc_batch = \
                    replay_buffer.sample(config.batch_size)
                mc_batch = Variable(torch.from_numpy(mc_batch).type(FloatTensor))
            else:
                obs_batch, act_batch, rew_batch, next_obs_batch, bonus_batch, done_mask = \
                    replay_buffer.sample(config.batch_size)

            # convert variables to torch tensor variables
            obs_batch = Variable(torch.from_numpy(obs_batch).type(FloatTensor) / 255.0)
            act_batch = Variable(torch.from_numpy(act_batch).type(LongTensor))
            rew_batch = Variable(torch.from_numpy(rew_batch).type(FloatTensor))
            next_obs_batch = Variable(torch.from_numpy(next_obs_batch).type(FloatTensor) / 255.0)
            bonus_batch = Variable(torch.from_numpy(bonus_batch).type(FloatTensor))
            not_done_mask = Variable(torch.from_numpy(1 - done_mask).type(FloatTensor))

            # 3.c: train the model: perform gradient step and update the network
            current_Q_values = Q(obs_batch).gather(1, act_batch.unsqueeze(1)).squeeze()
            # this gives you a FloatTensor of size 32 // gives values of max
            next_max_q = target_Q(next_obs_batch).detach().max(1)[0]
            # torch.FloatTensor of size 32
            next_Q_values = not_done_mask * next_max_q
            # this is [r(x,a) + gamma * max_a' Q(x', a')]
            target_Q_values = rew_batch + (config.gamma * next_Q_values)

            if config.mmc:
                # replace target_Q_values with mixed target
                target_Q_values = ((1 - config.beta) * target_Q_values) + (config.beta * mc_batch)

            # use huber loss
            loss = F.smooth_l1_loss(current_Q_values, target_Q_values)
            # zero out gradient
            optimizer.zero_grad()
            # backward pass
            loss.backward()
            # gradient clipping
            for params in Q.parameters():
                params.grad.data.clamp_(-1, 1)
            # perform param update
            optimizer.step()
            num_param_updates += 1

            # periodically update the target network
            if num_param_updates % config.target_update_freq == 0:
                target_Q = deepcopy(Q)

            ######### REPEAT ABOVE FOR C NETWORK ##################
            current_C_values = C(obs_batch).gather(1, act_batch.unsqueeze(1)).squeeze()
            # this gives you a FloatTensor of size 32 // gives values of max
            next_max_c = target_C(next_obs_batch).detach().max(1)[0]
            # torch.FloatTensor of size 32
            next_C_values = not_done_mask * next_max_c
            # this is [r(x,a) + gamma * max_a' Q(x', a')]
            target_C_values = bonus_batch + (config.gamma * next_C_values)
            # if config.mmc:
            #     # replace target_C_values with mixed target
            #     target_C_values = ((1 - config.beta) * target_C_values) + (config.beta * mc_batch)

            # use huber loss
            C_loss = F.smooth_l1_loss(current_C_values, target_C_values)
            # zero out gradient
            C_optimizer.zero_grad()
            # backward pass
            C_loss.backward()
            # gradient clipping
            for params in C.parameters():
                params.grad.data.clamp_(-1, 1)
            # perform param update
            C_optimizer.step()
            num_param_updates += 1

            # periodically update the target network
            if num_param_updates % config.target_update_freq == 0:
                target_C = deepcopy(C)

        ### 4. Log progress
        episode_rewards = get_wrapper_by_name(env, "Monitor").get_episode_rewards()
        if len(episode_rewards) > 0:
            mean_episode_reward = np.mean(episode_rewards[-100:])
            if len(episode_rewards) > 100:
                best_mean_episode_reward = max(best_mean_episode_reward, mean_episode_reward)

        # Tensorboard logging
        pixel_bonus.writer.add_scalar('data/bonus', intrinsic_reward, t)
        pixel_bonus.writer.add_scalar('data/Q_loss', loss, t)
        # pixel_bonus.writer.add_scalar('data/C_loss', C_loss, t)
        pixel_bonus.writer.add_scalar('data/episode_reward', episode_rewards[-1], t)

        # save statistics
        Statistic["mean_episode_rewards"].append(mean_episode_reward)
        Statistic["best_mean_episode_rewards"].append(best_mean_episode_reward)
        Statistic["episode_rewards"].append(episode_rewards)

        if t % config.log_freq == 0 and t > config.learning_starts:
            # curr = time.time()
            # diff = curr - prev
            # prev = curr
            # logging.info("Timestep %d" % (t,))
            # logging.info("Time elapsed %f" % diff)
            logging.info("mean reward (100 episodes) %f" % mean_episode_reward)
            logging.info("best mean reward %f" % best_mean_episode_reward)
            logging.info("episodes %d" % len(episode_rewards))
            logging.info("exploration %f" % exploration.value(t))
            sys.stdout.flush()
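# Minimal sketch of the mixed Monte Carlo (MMC) target used above: the one-step TD
# target is blended with the episode's Monte Carlo return using config.beta. This is
# a standalone numpy illustration with toy values, not the project's code.
import numpy as np


def mixed_mc_target(reward, next_max_q, mc_return, not_done, gamma=0.99, beta=0.1):
    td_target = reward + gamma * not_done * next_max_q   # r + gamma * max_a' Q(s', a')
    return (1.0 - beta) * td_target + beta * mc_return   # blend with the MC return


# Example: one transition with reward 1.0, bootstrap value 2.5, and MC return 3.0
print(mixed_mc_target(1.0, 2.5, 3.0, not_done=1.0))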
if __name__ == '__main__':
    # make env
    args = xworld_args.parser().parse_args()
    args.visible_radius_unit_side = config.visible_radius_unit_side
    args.visible_radius_unit_front = config.visible_radius_unit_front
    args.ego_centric = config.ego_centric
    args.map_config = config.map_config_file
    env = xworld_navi_goal.XWorldNaviGoal(args)
    env.teacher.israndom_goal = False
    env.teacher.goal_id = 0

    # exploration strategy
    exp_schedule = LinearExploration(env, config.eps_begin, config.eps_end,
                                     config.eps_nsteps)

    # learning rate schedule
    lr_schedule = LinearSchedule(config.lr_begin, config.lr_end, config.lr_nsteps)

    # train model
    model = DRQN(env, config)
    shutil.copyfile('./configs/drqn_xworld.py', config.output_path + 'config.py')
    shutil.copy(os.path.realpath(__file__), config.output_path)
    shutil.copy(config.map_config_file, config.output_path)
    if config.deploy_only:
        model.deploy()
    else:
        model.run(exp_schedule, lr_schedule)
action_dim = env.action_space.n
kwargs = {
    "action_dim": action_dim,
    "discount": args.discount,
    "gradient_clip": args.gradient_clip,
}

# Initialize policy
# ----------------------------------------------
if args.policy == "DQN_per":
    kwargs["policy_freq"] = int(args.policy_freq) // int(args.num_envs)
    kwargs["learning_rate"] = 1e-4
    policy = DQN_per.DQN_PER(**kwargs)
    eps_schedule = LinearSchedule(1.0, 0.01, 1e6)  # annealing epsilon
    beta_schedule = LinearSchedule(args.beta0_per, 1.0,
                                   args.max_timesteps - args.start_timesteps)  # annealing beta
    args.batch_size = 64
elif args.policy == "Double_DQN_per":
    kwargs["policy_freq"] = int(args.policy_freq) // int(args.num_envs)
    kwargs["learning_rate"] = 1e-4
    policy = Double_DQN_per.DoubleDQN_PER(**kwargs)
    eps_schedule = LinearSchedule(1.0, 0.01, 1e6)  # annealing epsilon
    beta_schedule = LinearSchedule(args.beta0_per, 1.0,
                                   args.max_timesteps - args.start_timesteps)  # annealing beta
    args.batch_size = 64
# ----------------------------------------------
elif args.policy == "Dueling_DQN_per":
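# The beta_schedule above anneals the prioritized-replay importance-sampling exponent
# beta from beta0 toward 1. A minimal numpy sketch of how such weights are typically
# computed (an assumption for illustration; the DQN_per implementation may differ):
import numpy as np


def importance_weights(priorities, beta, alpha=0.6):
    probs = priorities ** alpha
    probs /= probs.sum()                 # P(i): sampling probability per transition
    n = len(priorities)
    weights = (n * probs) ** (-beta)     # w_i = (N * P(i))^(-beta)
    return weights / weights.max()       # normalize by the max weight for stability


# Example: three transitions with TD-error-based priorities, beta midway through annealing
print(importance_weights(np.array([0.5, 1.0, 2.0]), beta=0.5))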