Example #1
import sys

import numpy as np

# QLearningAgent is assumed to be defined elsewhere in the source module.


def run_q_learning(env, num_episodes, gamma, alpha, epsilon):
    agent = QLearningAgent(env.action_space.n,
                           gamma=gamma,
                           alpha=alpha,
                           epsilon=epsilon)

    stats = {
        'episode_lengths': np.zeros(num_episodes),
        'episode_rewards': np.zeros(num_episodes)
    }

    for i_episode in range(num_episodes):

        if (i_episode + 1) % 20 == 0:
            print("\rEpisode {}/{}.".format(i_episode + 1, num_episodes),
                  end="")
            sys.stdout.flush()

        state = env.reset()
        done = False
        t = 0
        while not done:
            action = agent.step(state)
            next_state, reward, done, _ = env.step(action)
            t += 1

            # Update statistics (increment t first so the recorded
            # episode length counts the step just taken).
            stats['episode_rewards'][i_episode] += reward
            stats['episode_lengths'][i_episode] = t

            agent.update(state, action, next_state, reward)
            state = next_state

    print()

    return agent, stats
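
Both examples construct a QLearningAgent whose definition is not shown. A minimal tabular sketch consistent with the interface used above (the constructor arguments and the step/update calls) could look like the following; the method bodies are assumptions, not the original implementation.

import numpy as np
from collections import defaultdict


class QLearningAgent:
    """Minimal tabular Q-learning agent (a sketch of the assumed interface)."""

    def __init__(self, n_actions, gamma=0.9, alpha=0.1, epsilon=0.1):
        self.n_actions = n_actions
        self.gamma = gamma      # discount factor
        self.alpha = alpha      # learning rate
        self.epsilon = epsilon  # exploration probability
        self.q = defaultdict(lambda: np.zeros(n_actions))

    def step(self, state):
        # Epsilon-greedy action selection over the tabular Q-values.
        if np.random.rand() < self.epsilon:
            return np.random.randint(self.n_actions)
        return int(np.argmax(self.q[state]))

    def update(self, state, action, next_state, reward):
        # Q-learning update: Q(s,a) += alpha * (r + gamma * max_a' Q(s',a') - Q(s,a)).
        target = reward + self.gamma * np.max(self.q[next_state])
        self.q[state][action] += self.alpha * (target - self.q[state][action])

With such an agent, Example #1 runs against any Gym-style environment whose step method returns the classic (next_state, reward, done, info) 4-tuple.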
Example #2
import os
import pickle
import sys

import numpy as np
import pandas as pd

# RESULTS_PATH, EpisodeStats, QLearningAgent, execute_option and global_set
# are assumed to be defined elsewhere in the source repository.


def run_q_learning(num_episodes,
                   max_eps_length,
                   env,
                   dom_no,
                   loop_no,
                   run_no,
                   with_options=False,
                   factored=False):
    if with_options:
        print('with skills')
        if factored:
            skills_path = (RESULTS_PATH + 'generated_options/' +
                           str(dom_no) + '/' + str(loop_no) + '/' +
                           str(run_no) + '_factored_skills.pickle')
        else:
            skills_path = (RESULTS_PATH + 'generated_options/' +
                           str(dom_no) + '/generated_skills.pickle')
        # Use a context manager so the pickle file is closed after loading.
        with open(skills_path, "rb") as skills_file:
            skills = pickle.load(skills_file)
        # Each loaded skill (option) is exposed as one extra discrete action.
        number_of_actions = env.action_space.n + len(skills)
    else:
        number_of_actions = env.action_space.n
    agent = QLearningAgent(number_of_actions,
                           gamma=0.9,
                           alpha=0.12,
                           epsilon=0.1)

    stats = EpisodeStats(episode_lengths=np.zeros(num_episodes),
                         episode_rewards=np.zeros(num_episodes))

    input_list = []
    output_list = []
    for i_episode in range(num_episodes):

        if (i_episode + 1) % 20 == 0:
            print("\rEpisode {}/{}.".format(i_episode + 1, num_episodes),
                  end="")
            sys.stdout.flush()

        state = env.reset()
        for t in range(max_eps_length):

            action = agent.step(state)
            if action >= env.action_space.n:
                # Indices past the primitive actions select a learned option.
                option_input = state + (action - env.action_space.n, )
                input_list.append(list(option_input))
                option = skills[action - env.action_space.n]
                if factored:
                    state_in = option.factor_state(state, global_set)
                else:
                    state_in = state
                if option.in_initialisation_set(state_in):
                    next_state, reward, done, _, total_steps = execute_option(
                        25,
                        env,
                        option,
                        state_in,
                        factored=factored,
                        state_uf=state)
                    stats.episode_lengths[i_episode] = t + total_steps - 1
                else:
                    # Option not applicable here: penalise and stay in place.
                    next_state = state
                    reward = -1
                    done = False
                output_list.append(list(next_state))
            else:
                next_state, reward, done, _ = env.step(action)
                stats.episode_lengths[i_episode] = t

            # Update statistics
            stats.episode_rewards[i_episode] += reward

            agent.update(state, action, next_state, reward)

            if done:
                break

            state = next_state
    # Copy the column names so appending 'Action' does not mutate global_set.
    cols = list(global_set)
    df_out = pd.DataFrame.from_records(output_list, columns=cols)
    df_in = pd.DataFrame.from_records(input_list, columns=cols + ['Action'])
    data_path = './data/' + str(loop_no)
    os.makedirs(data_path, exist_ok=True)
    if run_no > 0:
        # Fold in the data accumulated during the previous task.
        df_in2 = pd.read_csv(data_path + '/input_data_task_' +
                             str(run_no - 1) + '.csv',
                             index_col=False)
        df_out2 = pd.read_csv(data_path + '/output_data_task_' +
                              str(run_no - 1) + '.csv',
                              index_col=False)
        # DataFrame.append was removed in pandas 2.0; use pd.concat instead.
        df_in = pd.concat([df_in, df_in2], ignore_index=True, sort=False)
        df_out = pd.concat([df_out, df_out2], ignore_index=True, sort=False)
    # Drop stray 'Unnamed: 0' columns left over from earlier CSV round-trips.
    df_in.drop(df_in.filter(regex='Unnamed'), axis=1, inplace=True)
    df_out.drop(df_out.filter(regex='Unnamed'), axis=1, inplace=True)
    # Write without the index so re-reading does not recreate 'Unnamed' columns.
    df_in.to_csv(data_path + '/input_data_task_' + str(run_no) + '.csv',
                 index=False)
    df_out.to_csv(data_path + '/output_data_task_' + str(run_no) + '.csv',
                  index=False)
    return agent, stats
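
Example #2 additionally depends on RESULTS_PATH, EpisodeStats, execute_option and global_set, which live elsewhere in the source repository. A hypothetical invocation, assuming a Gym-style env with a discrete action space and the classic 4-tuple step API, might look like this; all argument values are illustrative only.

# Hypothetical usage; env and the argument values are assumptions.
agent, stats = run_q_learning(num_episodes=300,
                              max_eps_length=200,
                              env=env,
                              dom_no=0,
                              loop_no=0,
                              run_no=0,
                              with_options=False)
print('mean reward, last 50 episodes:', stats.episode_rewards[-50:].mean())

Passing with_options=True additionally requires the pickled skills files under RESULTS_PATH to exist.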