Example #1
def main():
    #Make OpenAI gym environment + wrappers
    date_time = datetime.now().strftime("_%H:%M:%S_%m-%d-%Y")
    env = gym.make("PongNoFrameskip-v4")
    env = gym.wrappers.Monitor(env, './data_dqn_ataripong' + date_time)
    assert 'NoFrameskip' in env.spec.id
    env = NoopResetEnv(env, noop_max=30)
    env = MaxAndSkipEnv(env, skip=4)  #skip 4 frames & max over last_obs
    env = wrap_deepmind(env)
    env = wrap_pytorch(env)  #obs shape = num_channels x width x height
    obs_space_shape = env.observation_space.shape[0]
    action_space_shape = env.action_space.n

    #Set random seeds
    seed = 6582
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)
    env.seed(seed)

    #Initialize Replay Memory (Line 1)
    replay_memory = ReplayMemory(max_size=100000)

    #Make Q-Network and Target Q-Network (Lines 2 & 3)
    qnet = Atari_Dueling_DQN(obs_space_shape, action_space_shape).to(device)
    target_qnet = Atari_Dueling_DQN(obs_space_shape,
                                    action_space_shape).to(device)
    target_qnet.load_state_dict(qnet.state_dict())

    #Training Parameters (Changes from Mnih et al. outlined in README.md)
    optimizer = optim.Adam(qnet.parameters())
    num_frames = 1400000
    gamma = 0.99
    replay_start_size = 50000
    target_network_update_freq = 10000

    #Train
    obs = env.reset()
    num_episodes = 0
    for t in range(1, num_frames + 1):
        epsilon = epsilon_at_t(t)

        #-------------------------------------------------------------------
        #Take one step in the environment & add to Replay Memory (Line 7-11)
        #-------------------------------------------------------------------
        #Select action with epsilon-greedy exploration (Line 7,8)
        if random.random() > epsilon:
            with torch.no_grad():
                ts_obs = torch.from_numpy(obs.astype(np.float32)).unsqueeze(0).to(device)
                ts_qvals = qnet(ts_obs)
                action = ts_qvals.max(-1)[1].item()
        else:
            action = random.randrange(action_space_shape)

        #Execute action and get reward + next_obs (Line 9, 10)
        next_obs, reward, done, _ = env.step(action)

        #Store transition in Replay Memory
        replay_memory.add(obs, next_obs, action, reward, done)

        obs = next_obs

        if done:
            obs = env.reset()
            num_episodes += 1

        #Populate Replay Memory with <replay_start_size> experiences before learning
        if t > replay_start_size:
            #---------------------------------------------
            #Sample batch & compute loss & update network (Lines 12 - 15)
            #---------------------------------------------
            (obs_minibatch, next_obs_minibatch, actions_minibatch,
             rewards_minibatch, done_minibatch) = replay_memory.sample()

            ts_obs, ts_rewards, ts_next_obs, ts_done = map(
                lambda x: torch.FloatTensor(x).to(device), [
                    obs_minibatch, rewards_minibatch, next_obs_minibatch,
                    done_minibatch
                ])
            ts_actions = torch.LongTensor(actions_minibatch).to(device)

            with torch.no_grad():
                # Compute target values (Double-DQN update rule: the online network
                # selects the next action, the target network evaluates it)
                ts_next_qvals_online = qnet(ts_next_obs)          #(32, num_actions)
                ts_next_qvals_target = target_qnet(ts_next_obs)   #(32, num_actions)
                ts_next_actions = ts_next_qvals_online.argmax(-1, keepdim=True)  #(32, 1)
                ts_next_action_qvals = ts_next_qvals_target.gather(
                    -1, ts_next_actions).view(-1)  #(32,)
                ts_target_q = ts_rewards + gamma * ts_next_action_qvals * (1 - ts_done)

            #Compute predicted Q-values for the actions actually taken
            ts_pred_q = qnet(ts_obs).gather(-1, ts_actions).view(-1)  #(32,)

            #Calculate Loss & Perform gradient descent (Line 14)
            loss = F.smooth_l1_loss(ts_pred_q, ts_target_q)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            #Update target network every <target_network_update_freq> steps (Line 15)
            if t % target_network_update_freq == 0:
                target_qnet.load_state_dict(qnet.state_dict())

        #Log to Terminal (unwrap the nested gym wrappers to reach the Monitor's episode rewards)
        episode_rewards = env.env.env.env.env.env.env.env.get_episode_rewards()
        print('Timesteps', t, 'Episode', num_episodes, 'Mean Reward',
              np.mean(episode_rewards[-100:]))
    env.close()
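Example #1 calls epsilon_at_t(t) but does not show its definition. A minimal sketch of one possible schedule is given below, assuming a linear anneal from 1.0 to 0.01 over the first million frames; the constants and the linear form are assumptions, not part of the original code.

def epsilon_at_t(t, eps_start=1.0, eps_end=0.01, decay_frames=1000000):
    #Linearly anneal epsilon from eps_start to eps_end over decay_frames steps
    fraction = min(float(t) / decay_frames, 1.0)
    return eps_start + fraction * (eps_end - eps_start)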
Example #2
    num_frames = 1000000
    batch_size = 32
    learning_rate = 0.0001

    # create environment
    # env_id = "PongNoFrameskip-v4"
    # env_id = 'SpaceInvadersNoFrameskip-v4'
    # env_id = 'MsPacmanNoFrameskip-v4'
    # env_id = 'VideoPinballNoFrameskip-v4'
    # env_id = 'MontezumaRevengeNoFrameskip-v4'
    # env_id = 'QbertNoFrameskip-v4'
    env_id = sys.argv[1]
    env    = make_atari(env_id)
    # env = gym.wrappers.Monitor(env, 'stats', video_callable=lambda episode_id: False, force=True, resume=False)
    env    = wrap_deepmind(env)
    env    = wrap_pytorch(env)

    # create networks
    current_model = CnnDQN(env.observation_space.shape, env.action_space.n)
    target_model  = CnnDQN(env.observation_space.shape, env.action_space.n)
    if USE_CUDA:
        current_model = current_model.cuda()
        target_model  = target_model.cuda()

    # setup optimizer
    optimizer = optim.Adam(current_model.parameters(), lr = learning_rate)

    # initialize replay memory
    replay_buffer = ReplayBuffer(100000)

    # train model
Example #3
def get_env(env_id, frame_stack):
    env = make_atari(env_id)
    env = wrap_deepmind(env, frame_stack=frame_stack)
    env = wrap_pytorch(env)
    return env
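A brief usage sketch for get_env; the environment id and frame_stack flag below are illustrative values, not taken from the original:

env = get_env("PongNoFrameskip-v4", frame_stack=True)
obs = env.reset()
print(env.observation_space.shape, env.action_space.n)  #channel-first frames (e.g. (4, 84, 84)) and the discrete action count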
Example #4
def main():
    env_id = "PongNoFrameskip-v4"
    env = make_atari(env_id)
    env = wrap_deepmind(env)
    env = wrap_pytorch(env)

    observation_space = env.observation_space.shape
    action_space = env.action_space.n

    model = CnnDQN(observation_space, action_space)

    if USE_CUDA:
        model = model.cuda()

    optimizer = optim.Adam(model.parameters())

    replay_buffer = ReplayBuffer(1000)

    batch_size = 32
    gamma = 0.99
    replay_initial = 100
    num_frames = 14000

    losses = []
    all_rewards = []
    x_axis1 = []
    x_axis2 = []
    episode_reward = 0

    epsilon_start = 1.0
    epsilon_final = 0.01
    epsilon_decay = 30000

    # The exploration rate should decrease as the number of frames increases
    epsilon_by_frame = lambda frame_idx: epsilon_final + (epsilon_start - epsilon_final) * math.exp(-1. * frame_idx / epsilon_decay)

    state = env.reset()

    for frame_idx in range(1, num_frames + 1):
        #Render the game window
        env.render()
        epsilon = epsilon_by_frame(frame_idx)
        action = model.act(state, epsilon)
        next_state, reward, done, _ = env.step(action)
        replay_buffer.push(state, action, reward, next_state, done)
        state = next_state
        episode_reward += reward

        if done:
            state = env.reset()
            x_axis1.append(frame_idx)
            all_rewards.append(episode_reward)
            episode_reward = 0

        if frame_idx+1 > replay_initial:
            loss = compute_td_loss(model, optimizer, replay_buffer, gamma, batch_size)
            x_axis2.append(frame_idx)
            losses.append(loss.item())

        if frame_idx % 100 == 0:
            #Plot episode rewards (left) and TD losses (right) every 100 frames
            plt.figure(1)
            plt.subplot(121)
            plt.plot(x_axis1, all_rewards)
            plt.subplot(122)
            plt.plot(x_axis2, losses)
            plt.show()

    env.close()
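Example #4 calls compute_td_loss, which is not shown above. The sketch below assumes a standard one-step DQN target, that replay_buffer.sample(batch_size) returns (state, action, reward, next_state, done) arrays in that order, and that numpy and torch are imported as in the other examples; these details are assumptions rather than the original implementation.

def compute_td_loss(model, optimizer, replay_buffer, gamma, batch_size):
    #Sample a minibatch of transitions (assumed return order of sample())
    state, action, reward, next_state, done = replay_buffer.sample(batch_size)

    device = next(model.parameters()).device  #match the model's device (CPU or GPU)
    state = torch.FloatTensor(np.float32(state)).to(device)
    next_state = torch.FloatTensor(np.float32(next_state)).to(device)
    action = torch.LongTensor(action).to(device)
    reward = torch.FloatTensor(reward).to(device)
    done = torch.FloatTensor(done).to(device)

    #Q(s, a) for the actions actually taken
    q_value = model(state).gather(1, action.unsqueeze(1)).squeeze(1)

    #One-step TD target: r + gamma * max_a' Q(s', a'), zeroed at episode end
    with torch.no_grad():
        next_q_value = model(next_state).max(1)[0]
        expected_q_value = reward + gamma * next_q_value * (1 - done)

    loss = (q_value - expected_q_value).pow(2).mean()

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss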
Example #5
def get_env():
    env = make_atari("PongNoFrameskip-v4")
    env = wrap_deepmind(env)
    env = wrap_pytorch(env)
    return env