import random
from datetime import datetime

import gym
import numpy as np
import torch
import torch.nn.functional as F
import torch.optim as optim

# Project-local helpers assumed to be defined elsewhere in this repo:
# ReplayMemory, Atari_Dueling_DQN, epsilon_at_t, device,
# NoopResetEnv, MaxAndSkipEnv, wrap_deepmind, wrap_pytorch


def main():
    #Make OpenAI gym environment + wrappers
    date_time = datetime.now().strftime("_%H:%M:%S_%m-%d-%Y")
    env = gym.make("PongNoFrameskip-v4")
    env = gym.wrappers.Monitor(env, './data_dqn_ataripong' + date_time)
    monitor = env  #keep a handle on the Monitor wrapper for episode stats
    assert 'NoFrameskip' in env.spec.id
    env = NoopResetEnv(env, noop_max=30)
    env = MaxAndSkipEnv(env, skip=4)  #skip 4 frames & max over last_obs
    env = wrap_deepmind(env)
    env = wrap_pytorch(env)

    #obs shape = num_channels x width x height
    obs_space_shape = env.observation_space.shape[0]
    action_space_shape = env.action_space.n

    #Set random seeds
    seed = 6582
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)
    env.seed(seed)

    #Initialize Replay Memory (Line 1)
    replay_memory = ReplayMemory(max_size=100000)

    #Make Q-Network and Target Q-Network (Lines 2 & 3)
    qnet = Atari_Dueling_DQN(obs_space_shape, action_space_shape).to(device)
    target_qnet = Atari_Dueling_DQN(obs_space_shape, action_space_shape).to(device)
    target_qnet.load_state_dict(qnet.state_dict())

    #Training Parameters (Changes from Mnih et al. outlined in README.md)
    optimizer = optim.Adam(qnet.parameters())
    num_frames = 1400000
    gamma = 0.99
    replay_start_size = 50000
    target_network_update_freq = 10000

    #Train
    obs = env.reset()
    num_episodes = 0
    for t in range(1, num_frames + 1):
        epsilon = epsilon_at_t(t)

        #-------------------------------------------------------------------
        #Take one step in the environment & add to Replay Memory (Lines 7-11)
        #-------------------------------------------------------------------
        torch.set_grad_enabled(False)
        #Select action with epsilon-greedy exploration (Lines 7, 8)
        if random.random() > epsilon:
            ts_obs = torch.from_numpy(obs.astype(np.float32)).unsqueeze(0).to(device)
            ts_qvals = qnet(ts_obs)
            action = ts_qvals.max(-1)[1].item()
        else:
            action = random.randrange(action_space_shape)
        torch.set_grad_enabled(True)

        #Execute action and get reward + next_obs (Lines 9, 10)
        next_obs, reward, done, _ = env.step(action)

        #Store transition in Replay Memory (Line 11)
        replay_memory.add(obs, next_obs, action, reward, done)
        obs = next_obs

        if done:
            obs = env.reset()
            num_episodes += 1

        #Populate Replay Memory with <replay_start_size> experiences before learning
        if t > replay_start_size:
            #---------------------------------------------------
            #Sample batch, compute loss & update network (Lines 12-15)
            #---------------------------------------------------
            obs_minibatch, next_obs_minibatch, actions_minibatch, rewards_minibatch, done_minibatch = replay_memory.sample()

            ts_obs, ts_rewards, ts_next_obs, ts_done = map(
                lambda x: torch.FloatTensor(x).to(device),
                [obs_minibatch, rewards_minibatch, next_obs_minibatch, done_minibatch])
            ts_actions = torch.LongTensor(actions_minibatch).to(device)

            torch.set_grad_enabled(False)
            #Compute target values (as per the Double-DQN update rule)
            ts_next_qvals_outer = qnet(ts_next_obs)         #(32, num_actions) (outer Q-net, evaluates value)
            ts_next_qvals_inner = target_qnet(ts_next_obs)  #(32, num_actions) (inner Q-net, evaluates action)
            ts_next_action_inner = ts_next_qvals_inner.argmax(-1, keepdim=True)  #(32, 1)
            #Use the inner net's actions to index the outer net's Q-values
            ts_next_action_qvals_outer = ts_next_qvals_outer.gather(-1, ts_next_action_inner).view(-1)  #(32,)
            ts_target_q = ts_rewards + gamma * ts_next_action_qvals_outer * (1 - ts_done)
            torch.set_grad_enabled(True)

            #Compute predicted Q-values for the actions actually taken
            ts_pred_q = qnet(ts_obs).gather(-1, ts_actions.unsqueeze(-1)).view(-1)  #(32,)

            #Calculate loss & perform gradient descent (Line 14)
            loss = F.smooth_l1_loss(ts_pred_q, ts_target_q)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            #Update target network every <target_network_update_freq> steps (Line 15)
            if t % target_network_update_freq == 0:
                target_qnet.load_state_dict(qnet.state_dict())

            #Log to terminal
            episode_rewards = monitor.get_episode_rewards()
            print('Timesteps', t, 'Episode', num_episodes, 'Mean Reward',
                  np.mean(episode_rewards[-100:]))

    env.close()
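# ---------------------------------------------------------------------------
# Note: epsilon_at_t and ReplayMemory are used in main() above but not defined
# in this excerpt. The sketches below are illustrative assumptions, not the
# repo's actual implementations: epsilon_at_t assumes the linear annealing
# schedule of Mnih et al. (the README notes deviations, so the real schedule
# may differ), and ReplayMemory assumes a uniform-sampling ring buffer
# matching the add()/sample() calls in main().
# ---------------------------------------------------------------------------
import random

import numpy as np


def epsilon_at_t(t, eps_start=1.0, eps_end=0.1, anneal_frames=1000000):
    """Linearly anneal epsilon from eps_start to eps_end over anneal_frames steps."""
    fraction = min(t / anneal_frames, 1.0)
    return eps_start + fraction * (eps_end - eps_start)


class ReplayMemory:
    """Fixed-size ring buffer of (obs, next_obs, action, reward, done) transitions."""

    def __init__(self, max_size, batch_size=32):
        self.max_size = max_size
        self.batch_size = batch_size
        self.buffer = []
        self.pos = 0

    def add(self, obs, next_obs, action, reward, done):
        transition = (obs, next_obs, action, reward, done)
        if len(self.buffer) < self.max_size:
            self.buffer.append(transition)
        else:
            self.buffer[self.pos] = transition  #overwrite the oldest transition
        self.pos = (self.pos + 1) % self.max_size

    def sample(self):
        #Uniformly sample a minibatch; arrays come back in the order main() unpacks
        batch = random.sample(self.buffer, self.batch_size)
        obs, next_obs, actions, rewards, dones = map(np.array, zip(*batch))
        return obs, next_obs, actions, rewards, dones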
import sys

import torch.optim as optim

# Project-local helpers assumed from this repo: make_atari, wrap_deepmind,
# wrap_pytorch, CnnDQN, ReplayBuffer, USE_CUDA

num_frames = 1000000
batch_size = 32
learning_rate = 0.0001

# create environment
# env_id = "PongNoFrameskip-v4"
# env_id = 'SpaceInvadersNoFrameskip-v4'
# env_id = 'MsPacmanNoFrameskip-v4'
# env_id = 'VideoPinballNoFrameskip-v4'
# env_id = 'MontezumaRevengeNoFrameskip-v4'
# env_id = 'QbertNoFrameskip-v4'
env_id = sys.argv[1]
env = make_atari(env_id)
# env = gym.wrappers.Monitor(env, 'stats', video_callable=lambda episode_id: False, force=True, resume=False)
env = wrap_deepmind(env)
env = wrap_pytorch(env)

# create networks
current_model = CnnDQN(env.observation_space.shape, env.action_space.n)
target_model = CnnDQN(env.observation_space.shape, env.action_space.n)
if USE_CUDA:
    current_model = current_model.cuda()
    target_model = target_model.cuda()

# setup optimizer
optimizer = optim.Adam(current_model.parameters(), lr=learning_rate)

# initialize replay memory
replay_buffer = ReplayBuffer(100000)

# train model
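# ---------------------------------------------------------------------------
# Note: CnnDQN is not defined in these excerpts. The sketch below is an
# assumption, not the repo's actual class: it uses the standard DeepMind
# convolutional trunk (three conv layers + two linear layers) and an act()
# helper matching the epsilon-greedy call model.act(state, epsilon) made in
# the training loop further down.
# ---------------------------------------------------------------------------
import random

import numpy as np
import torch
import torch.nn as nn


class CnnDQN(nn.Module):
    """DeepMind-style DQN: three conv layers followed by two linear layers."""

    def __init__(self, input_shape, num_actions):
        super(CnnDQN, self).__init__()
        self.input_shape = input_shape
        self.num_actions = num_actions
        self.features = nn.Sequential(
            nn.Conv2d(input_shape[0], 32, kernel_size=8, stride=4), nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2), nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1), nn.ReLU(),
        )
        #Infer the flattened feature size by pushing a dummy observation through
        with torch.no_grad():
            n_features = self.features(torch.zeros(1, *input_shape)).view(1, -1).size(1)
        self.fc = nn.Sequential(
            nn.Linear(n_features, 512), nn.ReLU(),
            nn.Linear(512, num_actions),
        )

    def forward(self, x):
        x = self.features(x)
        return self.fc(x.view(x.size(0), -1))

    def act(self, state, epsilon):
        """Epsilon-greedy action selection for a single (un-batched) observation."""
        if random.random() > epsilon:
            with torch.no_grad():
                state_t = torch.FloatTensor(np.float32(state)).unsqueeze(0)
                state_t = state_t.to(next(self.parameters()).device)
                q_values = self.forward(state_t)
            return q_values.max(1)[1].item()
        return random.randrange(self.num_actions)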
def get_env(env_id, frame_stack):
    env = make_atari(env_id)
    # Pass frame_stack by keyword: in the baselines-style wrap_deepmind
    # signature the second positional argument is episode_life, not frame_stack
    env = wrap_deepmind(env, frame_stack=frame_stack)
    env = wrap_pytorch(env)
    return env
import math

import matplotlib.pyplot as plt
import numpy as np
import torch.optim as optim

# Project-local helpers assumed from this repo: make_atari, wrap_deepmind,
# wrap_pytorch, CnnDQN, ReplayBuffer, compute_td_loss, USE_CUDA


def main():
    env_id = "PongNoFrameskip-v4"
    env = make_atari(env_id)
    env = wrap_deepmind(env)
    env = wrap_pytorch(env)

    observation_space = env.observation_space.shape
    action_space = env.action_space.n
    model = CnnDQN(observation_space, action_space)
    if USE_CUDA:
        model = model.cuda()

    optimizer = optim.Adam(model.parameters())
    replay_buffer = ReplayBuffer(1000)

    batch_size = 32
    gamma = 0.99
    replay_initial = 100
    num_frames = 14000

    losses = []
    all_rewards = []
    x_axis1 = []
    x_axis2 = []
    episode_reward = 0

    epsilon_start = 1.0
    epsilon_final = 0.01
    epsilon_decay = 30000
    # The exploration rate should decay as the number of iterations grows
    epsilon_by_frame = lambda frame_idx: epsilon_final + (epsilon_start - epsilon_final) * math.exp(-1. * frame_idx / epsilon_decay)

    state = env.reset()
    for frame_idx in range(1, num_frames + 1):
        # Render the environment
        env.render()
        epsilon = epsilon_by_frame(frame_idx)
        action = model.act(state, epsilon)
        next_state, reward, done, _ = env.step(action)
        replay_buffer.push(state, action, reward, next_state, done)
        state = next_state
        episode_reward += reward

        if done:
            state = env.reset()
            x_axis1.append(frame_idx)
            all_rewards.append(episode_reward)
            episode_reward = 0

        if frame_idx + 1 > replay_initial:
            loss = compute_td_loss(model, optimizer, replay_buffer, gamma, batch_size)
            x_axis2.append(frame_idx)
            losses.append(np.array(loss.data.cpu()))

        if frame_idx % 100 == 0:
            plt.figure(1)
            plt.subplot(121)
            plt.plot(x_axis1, all_rewards)
            plt.subplot(122)
            plt.plot(x_axis2, losses)
            plt.show()

    env.close()
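# ---------------------------------------------------------------------------
# Note: compute_td_loss is called in main() above but not shown. The sketch
# below is an assumption matching the call signature
# compute_td_loss(model, optimizer, replay_buffer, gamma, batch_size): a
# vanilla one-step DQN loss with a single network (main() creates no target
# network), assuming replay_buffer.sample(batch_size) returns arrays in the
# same order they were push()ed.
# ---------------------------------------------------------------------------
import numpy as np
import torch
import torch.nn.functional as F


def compute_td_loss(model, optimizer, replay_buffer, gamma, batch_size):
    """One gradient step on the TD(0) error over a uniform minibatch."""
    state, action, reward, next_state, done = replay_buffer.sample(batch_size)

    device = next(model.parameters()).device
    state = torch.FloatTensor(np.float32(state)).to(device)
    next_state = torch.FloatTensor(np.float32(next_state)).to(device)
    action = torch.LongTensor(action).to(device)
    reward = torch.FloatTensor(reward).to(device)
    done = torch.FloatTensor(done).to(device)

    # Q(s, a) for the actions actually taken
    q_value = model(state).gather(1, action.unsqueeze(1)).squeeze(1)

    # Bootstrapped target r + gamma * max_a' Q(s', a'), zeroed at terminal states
    with torch.no_grad():
        next_q_value = model(next_state).max(1)[0]
    expected_q_value = reward + gamma * next_q_value * (1 - done)

    loss = F.mse_loss(q_value, expected_q_value)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss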
def get_env():
    env = make_atari("PongNoFrameskip-v4")
    env = wrap_deepmind(env)
    env = wrap_pytorch(env)
    return env