Example #1
    def test(self, q_values):
        # load environment
        loader = EnvironmentLoader("environments/maps")
        env = loader.load_map(self.map)

        # set up agent
        agent = QLearningAgent(alpha=0,
                               epsilon_policy=self.epsilon_policy,
                               discount=0,
                               action_space=env.action_space,
                               state_space=env.state_space,
                               q_values=q_values)

        # reset everything
        agent.reset()
        env.reset()

        # iterate over the epsilon values (the policy may yield values indefinitely)
        for epsilon in self.epsilon_policy:

            # set epsilon for current epoch
            agent.epsilon = epsilon

            env.reset_position()
            done = False

            pygame.display.set_caption('KI-Labor GridWorld - Test')

            while not done:
                env.renderer.update_info(self.hyperparameters, epsilon)

                event = self.event_occured(
                    timeout_ms=RenderSettings.TIME_BETWEEN_FRAMES,
                    renderer=env.renderer)
                env.render(q_values)

                done = self.step(agent, env)

                if done and event is None:
                    event = self.event_occured(
                        timeout_ms=RenderSettings.TIME_BETWEEN_FRAMES,
                        renderer=env.renderer)
                    env.render(q_values, True)

                # handle events
                if event == "reset":
                    return
                elif event == "skip":
                    break
                elif event == "pause":
                    while self.event_occured(renderer=env.renderer) != "pause":
                        continue
Example #2
def main():
    # Set log level.
    logging.basicConfig(level=logging.DEBUG)

    # Set a random seed for consistency in agent AND environment.
    if config.RANDOM_SEED is not None:
        np.random.seed(config.RANDOM_SEED)

    # Make environment.
    env = EnvCatcher(grid_size=config.GRID_SIZE,
                     env_type='episodic',
                     verbose=False,
                     random_seed=config.RANDOM_SEED)

    # Make agent.
    agent = QLearningAgent(actions=list(range(env.action_space)),
                           learning_rate=config.LEARNING_RATE,
                           discount_factor=config.DISCOUNT_FACTOR,
                           epsilon=config.EPSILON)

    # Run the RL Loop.
    episode_returns = rl.run_loop(agent=agent,
                                  env=env,
                                  max_num_steps=config.MAX_NUM_STEPS,
                                  report_every_n=config.REPORT_EVERY_N)

    # Save the data.
    date_string = time.strftime("%Y%m%d-%H%M%S")
    filename = ('qlearn_grid_{}_nep_{}_'.format(
        config.GRID_SIZE, len(episode_returns)) + date_string + '.csv')
    rl.save_episode_returns(filename=filename, episode_returns=episode_returns)
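Example #2 hands the actual training off to rl.run_loop, which is not reproduced above. A minimal sketch of such a step-bounded loop, assuming a Gym-style env.step(action) -> (obs, reward, done, info) and hypothetical agent.act / agent.learn methods (the real QLearningAgent interface of that codebase is not shown here), could look like this:

def run_loop(agent, env, max_num_steps, report_every_n):
    """Sketch only: collect per-episode returns over a fixed step budget."""
    episode_returns = []
    obs = env.reset()
    episode_return = 0.0
    for _ in range(max_num_steps):
        action = agent.act(obs)                           # hypothetical method name
        next_obs, reward, done, _ = env.step(action)      # assumed Gym-style signature
        agent.learn(obs, action, reward, next_obs, done)  # hypothetical method name
        episode_return += reward
        obs = next_obs
        if done:
            episode_returns.append(episode_return)
            if report_every_n and len(episode_returns) % report_every_n == 0:
                print('episode {}: return {:.2f}'.format(len(episode_returns), episode_return))
            episode_return = 0.0
            obs = env.reset()
    return episode_returns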
Example #3
def generate_graph(loop_no=None):
    wall_1, wall_2, key_1, key_2, door_1, door_2 = generate_room(loop_no)
    G = nx.DiGraph()
    for i in range(100):
        if i < 50:
            rand_x = np.random.randint(1, 3)
            rand_y = np.random.randint(5, 7)
            start = (rand_x, rand_y)
            env_name = 'gym_minigrid.envs:MiniGrid-MainRoom-v1'
            env = gym.make(env_name,
                           agent_start=start,
                           key_pos=[key_1, key_2],
                           wall_pos=[wall_1, wall_2],
                           door_pos=[door_1, door_2])
        else:
            start_pos = (6, 2)
            if key_2 != (6, 1):
                start_pos = (6, 1)
            env = gym.make('MiniGrid-MainRoom-v1',
                           agent_start=start_pos,
                           keys_carried=[1, 0],
                           door_state=[1, 0],
                           key_pos=[key_1, key_2],
                           wall_pos=[wall_1, wall_2],
                           door_pos=[door_1, door_2])
        number_of_actions = env.action_space.n
        agent = QLearningAgent(number_of_actions,
                               gamma=0.9,
                               alpha=0.12,
                               epsilon=0.1)
        state = env.reset()
        for _ in range(1000):  # random rollout; avoid shadowing the outer loop variable i
            action = np.random.randint(0, number_of_actions)
            next_state, reward, done, _ = env.step(action)
            agent.update(state, action, next_state, reward)

            if done:
                break
            G.add_edge(state, next_state, action=action)
            state = next_state
    return G, number_of_actions
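The graph returned by generate_graph stores visited states as nodes and the sampled action on each edge. A brief usage sketch with standard networkx calls (assuming the custom MiniGrid-MainRoom-v1 environment is registered):

# Usage sketch: inspect the transition graph built by generate_graph.
G, n_actions = generate_graph(loop_no=0)
print(G.number_of_nodes(), 'states,', G.number_of_edges(), 'transitions')

# The sampled action is stored as an edge attribute.
for u, v, data in list(G.edges(data=True))[:5]:
    print(u, '->', v, 'via action', data['action'])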
Example #4
def test_cube(max_episode, max_step):
    env = TreasureCube(max_step=max_step)
    agent = QLearningAgent()
    episode_rewards = []
    for episode_num in range(max_episode):
        state = env.reset()
        terminate = False
        t = 0
        episode_reward = 0
        while not terminate:
            action = agent.take_action(state)
            reward, terminate, next_state = env.step(action)
            episode_reward += reward
            # you can comment the following two lines, if the output is too much
            # env.render()  # comment
            # print(f'step: {t}, action: {action}, reward: {reward}')  # comment
            t += 1
            agent.train(state, action, next_state, reward)
            state = next_state
        print(
            f'episode: {episode_num}, total_steps: {t}, episode reward: {episode_reward}'
        )
        episode_rewards.append(episode_reward)
    print(agent.getQTable())
    return showPlot(list(range(max_episode)), episode_rewards, 'episode',
                    'episode rewards')
Example #5
def run_q_learning(env, num_episodes, gamma, alpha, epsilon):
    agent = QLearningAgent(env.action_space.n,
                           gamma=gamma,
                           alpha=alpha,
                           epsilon=epsilon)

    stats = {
        'episode_lengths': np.zeros(num_episodes),
        'episode_rewards': np.zeros(num_episodes)
    }

    for i_episode in range(num_episodes):

        if (i_episode + 1) % 20 == 0:
            print("\rEpisode {}/{}.".format(i_episode + 1, num_episodes),
                  end="")
            sys.stdout.flush()

        state = env.reset()
        done = False
        t = 0
        while not done:
            action = agent.step(state)
            next_state, reward, done, _ = env.step(action)

            # Update statistics
            stats['episode_rewards'][i_episode] += reward
            stats['episode_lengths'][i_episode] = t

            agent.update(state, action, next_state, reward)
            t += 1
            state = next_state

    print()

    return agent, stats
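Examples #3, #5, and #8 all construct the agent as QLearningAgent(n_actions, gamma=..., alpha=..., epsilon=...) and drive it through step(state) and update(state, action, next_state, reward), but the class itself is not listed here. A minimal tabular sketch that is consistent with those call sites and with the standard one-step Q-learning update (the defaultdict Q-table and the epsilon-greedy step are assumptions, not the original implementation):

from collections import defaultdict

import numpy as np


class QLearningAgent:
    """Sketch of a tabular Q-learning agent matching the step/update call sites above."""

    def __init__(self, n_actions, gamma=0.9, alpha=0.1, epsilon=0.1):
        self.n_actions = n_actions
        self.gamma = gamma      # discount factor
        self.alpha = alpha      # learning rate
        self.epsilon = epsilon  # exploration rate
        # Q-table: hashable state -> array of action values, created lazily.
        self.q = defaultdict(lambda: np.zeros(n_actions))

    def step(self, state):
        # Epsilon-greedy action selection.
        if np.random.random() < self.epsilon:
            return np.random.randint(self.n_actions)
        return int(np.argmax(self.q[state]))

    def update(self, state, action, next_state, reward):
        # One-step Q-learning: Q(s,a) += alpha * (r + gamma * max_a' Q(s',a') - Q(s,a)).
        td_target = reward + self.gamma * np.max(self.q[next_state])
        self.q[state][action] += self.alpha * (td_target - self.q[state][action])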
Example #6
  def setUp(self):
    # Set a random seed for consistency in agent AND environment.
    if RANDOM_SEED is not None:
        np.random.seed(RANDOM_SEED)

    # Make environment.
    self.env = EnvCatcher(grid_size=GRID_SIZE,
                          env_type='episodic',
                          verbose=False,
                          random_seed=RANDOM_SEED)

    # Make agent.
    self.agent = QLearningAgent(actions=list(range(self.env.action_space)), 
                                learning_rate=LEARNING_RATE,
                                discount_factor=DISCOUNT_FACTOR, 
                                epsilon=EPSILON)
Example #7
    def train(self):
        # load environment
        loader = EnvironmentLoader("environments/maps")
        env = loader.load_map(self.map)

        # set up agent
        agent = QLearningAgent(self.hyperparameters["alpha"],
                               self.epsilon_policy,
                               self.hyperparameters["discount"],
                               env.action_space, env.state_space)

        # reset everything
        epoch_counter = 0
        agent.reset()
        env.reset()

        # iterate over the epsilon values (the policy may yield values indefinitely)
        for epsilon in self.epsilon_policy:

            # set epsilon for current epoch
            agent.epsilon = epsilon

            env.reset_position()
            done = False

            while not done:
                render_current_epoch = RenderSettings.ENABLED and epoch_counter % RenderSettings.INTERVAL == 0
                save_current_epoch = SaveSettings.ENABLED and epoch_counter % SaveSettings.INTERVAL == 0

                if epoch_counter % RenderSettings.UPDATE_FREQ_TITLE == 0 or render_current_epoch:
                    pygame.display.set_caption(
                        f'KI-Labor GridWorld - Epoch {epoch_counter}'
                    )  # takes around 0.1ms on average
                    env.renderer.update_info(self.hyperparameters, epsilon)

                if render_current_epoch:
                    event = self.event_occured(
                        timeout_ms=RenderSettings.TIME_BETWEEN_FRAMES,
                        renderer=env.renderer)
                else:
                    event = self.event_occured(renderer=env.renderer)

                if render_current_epoch:
                    env.render(agent.get_q__values())

                if save_current_epoch and self.save_name is not None:
                    AgentManager.save_agent_state(
                        agent,
                        f"{SaveSettings.SAVE_PATH}/{self.save_name}_{epoch_counter}.txt"
                    )

                done = self.step(agent, env)

                if done and render_current_epoch and event is None:
                    event = self.event_occured(
                        timeout_ms=RenderSettings.TIME_BETWEEN_FRAMES,
                        renderer=env.renderer)
                    env.render(agent.get_q__values(), True)

                # handle events
                if event == "reset":
                    return
                elif event == "skip":
                    break
                elif event == "pause":
                    while self.event_occured(renderer=env.renderer) != "pause":
                        continue

            epoch_counter += 1
Example #8
def run_q_learning(num_episodes,
                   max_eps_length,
                   env,
                   dom_no,
                   loop_no,
                   run_no,
                   with_options=False,
                   factored=False):
    if with_options:
        print('with skills')
        if factored:
            skills_file = open(
                RESULTS_PATH + 'generated_options/' + str(dom_no) + '/' +
                str(loop_no) + '/' + str(run_no) + '_factored_skills.pickle',
                "rb")
        else:
            skills_file = open(
                RESULTS_PATH + 'generated_options/' + str(dom_no) +
                '/generated_skills.pickle', "rb")
        skills = pickle.load(skills_file)
        number_of_actions = env.action_space.n + len(skills)
    else:
        number_of_actions = env.action_space.n
    agent = QLearningAgent(number_of_actions,
                           gamma=0.9,
                           alpha=0.12,
                           epsilon=0.1)

    stats = EpisodeStats(episode_lengths=np.zeros(num_episodes),
                         episode_rewards=np.zeros(num_episodes))

    input_list = []
    output_list = []
    for i_episode in range(num_episodes):

        if (i_episode + 1) % 20 == 0:
            print("\rEpisode {}/{}.".format(i_episode + 1, num_episodes),
                  end="")
            sys.stdout.flush()

        state = env.reset()
        for t in range(max_eps_length):

            action = agent.step(state)
            if action >= env.action_space.n:
                option_input = state + (action - env.action_space.n, )  # avoid shadowing the built-in input()
                input_list.append(list(option_input))
                option = skills[action - env.action_space.n]
                if factored:
                    state_in = option.factor_state(state, global_set)
                else:
                    state_in = state
                if option.in_initialisation_set(state_in):
                    next_state, reward, done, _, total_steps = execute_option(
                        25,
                        env,
                        option,
                        state_in,
                        factored=factored,
                        state_uf=state)
                    stats.episode_lengths[i_episode] = t + total_steps - 1
                else:
                    next_state = state
                    reward = -1
                    done = False
                output_list.append(list(next_state))
            else:
                next_state, reward, done, _ = env.step(action)
                stats.episode_lengths[i_episode] = t

            # Update statistics
            stats.episode_rewards[i_episode] += reward

            agent.update(state, action, next_state, reward)

            if done:
                break

            state = next_state
    cols = list(global_set)  # copy so that appending 'Action' does not mutate global_set
    df_out = pd.DataFrame.from_records(output_list, columns=cols)
    cols.append('Action')
    df_in = pd.DataFrame.from_records(input_list, columns=cols)
    data_path = './data/' + str(loop_no)
    if not (os.path.isdir(data_path)):
        os.makedirs(data_path)
    if run_no > 0:
        print("SHOULDNT B HERE")
        df_in2 = pd.read_csv(data_path + '/input_data_task_' +
                             str(run_no - 1) + '.csv',
                             index_col=False)
        df_out2 = pd.read_csv(data_path + '/output_data_task_' +
                              str(run_no - 1) + '.csv',
                              index_col=False)
        df_in = pd.concat([df_in, df_in2], ignore_index=True, sort=False)
        df_out = pd.concat([df_out, df_out2], ignore_index=True, sort=False)
    df_in.drop(df_in.filter(regex='Unname'), axis=1, inplace=True)
    df_out.drop(df_out.filter(regex='Unname'), axis=1, inplace=True)
    df_in.to_csv(data_path + '/input_data_task_' + str(run_no) + '.csv')
    df_out.to_csv(data_path + '/output_data_task_' + str(run_no) + '.csv')
    return agent, stats
Example #9
    def run_config(self, configuration):
        env, problem_size, num_models, num_episodes, num_steps, policy = configuration
        policies = PolicyCollection.get_batch(policy)

        # create a new graph
        graph = tf.Graph()
        with graph.as_default():

            # and a configuration as well.
            tf_config = tf.ConfigProto(log_device_placement=True)
            tf_config.intra_op_parallelism_threads = 8
            tf_config.inter_op_parallelism_threads = 8
            tf_config.gpu_options.allow_growth = True

            with tf.Session(graph=graph, config=tf_config) as sess:

                env = env("test", [num_models], problem_size)
                state_space = env.state_space
                action_space = env.action_space

                # --------------------- Determine the optimal reward --------------------

                # Determine the agent count
                num_policies = len(policies)
                optimal_ih_rew, minimal_ih_rew, min_q, max_q, _ = env.get_optimal(num_steps, 0.99)

                # --------------------------------------------------------------------------

                # Iterate over all policies and create an agent using that specific policy
                agents = list()
                environments = list()
                densities = list()
                get_best_shared = list()
                shared_steps = list()
                for pol_num in range(num_policies):

                    # Get policies and unique name
                    pe = policies[pol_num]
                    unique_name = str(pol_num)

                    # extract important fields
                    policy = pe[1]
                    policy_config = pe[2]
                    policy_config['num_models'] = num_models
                    policy_config['min_q'] = min_q
                    policy_config['max_q'] = max_q
                    policy_config['action_space'] = action_space

                    current_env = env.clone(unique_name)
                    environments.append(current_env)
                    agent = QLearningAgent(sess, unique_name, current_env, policy, policy_config)
                    agents.append(agent)
                    shared_steps.append(policy_config['shared_steps'])
                    densities.append([agent.ref_complete_densities])

                    if 'shared_learning' in policy_config and policy_config['shared_learning']:
                        get_best_shared.append(agent.get_best_heads)

                # init variables
                init = tf.global_variables_initializer()
                sess.run(init)

                feed_dict = {}
                for agent in agents:
                    feed_dict[agent.use_best] = True

                # retrieve the learn operations
                update_and_receive_rewards = [agent.q_tensor_update for agent in agents]
                reset_ops = [envs.reset_op for envs in environments]
                cum_rew_ops = [envs.cum_rewards for envs in environments]

                # create trainings rewards
                tr_rewards = np.zeros((num_episodes, num_policies, num_models))

                # iterate over episodes
                for episode in range(num_episodes):

                    # reset all environments
                    sess.run(reset_ops)

                    # for each agent sample a new head
                    state_dict = {}
                    for k in range(num_policies):
                        agents[k].sample_head()
                        state_dict[agents[k].use_best] = False

                    # repeat this for the number of steps
                    for k in range(num_steps):
                        # every shared_steps[m] steps, refresh the best heads for policy m
                        for m in range(num_policies):
                            if shared_steps[m] > 0 and k % shared_steps[m] == 0:
                                sess.run(agents[m].get_best_heads)

                        # receive rewards and add
                        sess.run(update_and_receive_rewards, feed_dict=state_dict)

                    # copy values
                    tr_rewards[episode, :, :] = sess.run(cum_rew_ops)

        return tr_rewards
Example #10
                            # Get policies and unique name
                            pe = policies[pol_num]
                            unique_name = str(pol_num)

                            # extract important fields
                            policy = pe[1]
                            policy_config = pe[2]
                            policy_config['num_models'] = num_models
                            policy_config['min_q'] = min_q
                            policy_config['max_q'] = max_q
                            policy_config['action_space'] = action_space

                            current_env = env.clone(unique_name)
                            environments.append(current_env)
                            agent = QLearningAgent(sess, unique_name,
                                                   current_env, policy,
                                                   policy_config)
                            agents.append(agent)

                            if plot_models > 0 and pol_num in record_indices:

                                # setup densities
                                if 'pseudo_count_type' in policy_config and policy_config[
                                        'pseudo_count_type']:
                                    num_densities = 2
                                    densities.append([
                                        agent.cb_complete_densities,
                                        agent.ref_complete_densities
                                    ])
                                else:
                                    num_densities = 1