Example #1
def do_q_learning(env, reward_function, train_episodes, figure=False):
    alpha = 0.01
    gamma = 0.9
    epsilon = 0.1
    policy = DQNPolicy(env, lr=alpha, gamma=gamma, input=2,
                       output=4)  # 2-d state input, 4 discrete actions: up, right, down, left
    replay_buffer = ReplayBuffer()
    # Optionally play with a random (untrained) policy first to inspect the environment:
    # run_current_policy(env.env, policy)
    agg_interval = 100
    avg_history = {'episodes': [], 'timesteps': [], 'reward': []}
    # Train the network to predict actions for each of the states
    for episode_i in range(train_episodes):
        episode_timestep = 0
        episode_reward = 0.0
        env.__init__()  # re-initialise the custom environment (used here in place of a reset())
        # todo : the first current state should be 0
        cur_state = env.cur_state
        counter = 0
        done = False
        while not done:
            # cap each episode at 30 timesteps
            counter += 1
            done = counter >= 30

            # epsilon-greedy action selection (todo : verify this call works as expected)
            action = policy.select_action(cur_state.reshape(1, -1), epsilon)

            # take action in the environment
            next_state = env.step(action)
            reward = reward_function(next_state)

            # add the transition to replay buffer
            replay_buffer.add(cur_state, action, next_state, reward, done)

            # sample minibatch of transitions from the replay buffer
            # the sampling is done every timestep and not every episode
            sample_transitions = replay_buffer.sample()

            # update the policy using the sampled transitions
            policy.update_policy(**sample_transitions)

            episode_reward += reward
            episode_timestep += 1

            cur_state = next_state

        avg_history['episodes'].append(episode_i + 1)
        avg_history['timesteps'].append(episode_timestep)
        avg_history['reward'].append(episode_reward)

        learning_policy_progress.update()  # progress bar provided by the enclosing scope

    if figure:
        plt.plot(avg_history['episodes'], avg_history['reward'])
        plt.title('Reward')
        plt.xlabel('Episode')
        plt.ylabel('Reward')
        plt.show()
    return policy.q_model
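
Both examples rely on a DQNPolicy and a ReplayBuffer (and matplotlib's plt) that are defined elsewhere in the project and are not shown here. As a rough guide only, a minimal replay buffer compatible with the add/sample calls above might look like the sketch below; the dictionary keys returned by sample() are hypothetical and would have to match the keyword arguments of update_policy.

import random
from collections import deque

import numpy as np


class MinimalReplayBuffer:
    """A sketch, not the project's ReplayBuffer."""

    def __init__(self, capacity=10000, batch_size=32):
        self.buffer = deque(maxlen=capacity)
        self.batch_size = batch_size

    def add(self, cur_state, action, next_state, reward, done):
        # store one transition, discarding the oldest once capacity is reached
        self.buffer.append((cur_state, action, next_state, reward, done))

    def sample(self):
        # uniform minibatch sample; called every timestep in the loops above
        batch = random.sample(self.buffer, min(self.batch_size, len(self.buffer)))
        cur_states, actions, next_states, rewards, dones = map(np.array, zip(*batch))
        # the key names below are hypothetical placeholders
        return {'cur_states': cur_states, 'actions': actions,
                'next_states': next_states, 'rewards': rewards, 'dones': dones}
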
Example #2
    while not done:
        # select action
        action = cp_policy.select_action(cur_state.reshape(1, -1), cp_epsilon)

        # take action in the environment
        next_state, reward, done, info = cp_env.step(action)

        # add the transition to replay buffer
        replay_buffer.add(cur_state, action, next_state, reward, done)

        # sample minibatch of transitions from the replay buffer
        # the sampling is done every timestep and not every episode
        sample_transitions = replay_buffer.sample()

        # update the policy using the sampled transitions
        cp_policy.update_policy(**sample_transitions)

        episode_reward += reward
        episode_timestep += 1

        cur_state = next_state

    avg_reward += episode_reward
    avg_timestep += episode_timestep

    if (episode_i + 1) % agg_interval == 0:
        cp_avg_history['episodes'].append(episode_i + 1)
        cp_avg_history['timesteps'].append(avg_timestep / float(agg_interval))
        cp_avg_history['reward'].append(avg_reward / float(agg_interval))
        avg_timestep = 0
        avg_reward = 0.0
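
Example #2 only accumulates per-interval averages into cp_avg_history; a small plotting helper in the spirit of the figure=True branch of Example #1 could render them afterwards (a sketch, assuming the dictionary layout built above):

import matplotlib.pyplot as plt


def plot_aggregated_history(history, agg_interval):
    # history is expected to look like cp_avg_history above:
    # equal-length 'episodes', 'timesteps' and 'reward' lists
    _, (ax_r, ax_t) = plt.subplots(1, 2, figsize=(10, 4))
    ax_r.plot(history['episodes'], history['reward'])
    ax_r.set_title('Avg reward per {} episodes'.format(agg_interval))
    ax_r.set_xlabel('Episode')
    ax_r.set_ylabel('Reward')
    ax_t.plot(history['episodes'], history['timesteps'])
    ax_t.set_title('Avg timesteps per {} episodes'.format(agg_interval))
    ax_t.set_xlabel('Episode')
    ax_t.set_ylabel('Timesteps')
    plt.tight_layout()
    plt.show()


# usage: plot_aggregated_history(cp_avg_history, agg_interval)
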
Example #3
        env.render()

        # Keep track of max position
        if next_state[0] > max_position:
            max_position = next_state[0]

        # Adjust reward for task completion
        if next_state[0] >= 0.5:
            reward += 10

        replay_buffer.add(cur_state, action, next_state, reward, done)

        # TODO : Change the sample size and check any improvements
        sampled_transitions = replay_buffer.sample()
        # the Q-update is applied at every timestep of every episode, just like TD updates
        env_policy.update_policy(**sampled_transitions)
        ep_reward += reward
        ep_timesteps += 1

        cur_state = next_state

    history['reward'].append(ep_reward)
    history['timesteps'].append(ep_timesteps)
    history['episodes'].append(episode+1)
    if episode % mod_episode == 0:
        print('Episode : {} Success : {} Reward : {} Timesteps : {} Max position : {}'.format(
            episode, max_position >= 0.5, history['reward'][-1], history['timesteps'][-1], max_position))

    # decay the epsilon after every episode
    epsilon -= epsilon_decay
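
All three snippets keep epsilon outside the policy and pass it into select_action on every step; Example #3 then decays it linearly after each episode. The epsilon-greedy rule they assume, plus a decay with a lower bound, can be sketched as follows (the function name, the floor, and the concrete numbers are illustrative, not taken from the original code):

import random

import numpy as np


def epsilon_greedy_action(q_values, epsilon):
    # explore with probability epsilon, otherwise act greedily on the Q-values
    if random.random() < epsilon:
        return random.randrange(len(q_values))
    return int(np.argmax(q_values))


# linear decay with a floor, mirroring `epsilon -= epsilon_decay` above
epsilon, epsilon_decay, min_epsilon = 1.0, 0.001, 0.01
for _ in range(2000):
    epsilon = max(epsilon - epsilon_decay, min_epsilon)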