Example #1
def generate_grids(cols):
    grids = []
    for i in range(cols):
        map_name_base = '{}x{}-base'.format(cols, cols)
        env = LochLomondEnv(problem_id=i,
                            is_stochastic=True,
                            reward_hole=-0.02,
                            map_name_base=map_name_base)

        env.render()
        grid = EnvMDP.to_decoded(env).reshape(env.nrow * env.ncol)
        grids.append(np.hstack(([i], grid)))

    return grids
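A hypothetical usage sketch (not part of the original snippet; the file name and the choice of 8 problems on the 8x8 map are assumptions): stack the per-problem grids and save them for later inspection.

# Hypothetical usage: build grids for problems 0-7 on the 8x8 map and save them as CSV.
grids = generate_grids(8)
np.savetxt('grids_8x8.csv', np.array(grids), delimiter=',', fmt='%s')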
Example #2
def run(problem_id=0, max_episodes=10000, max_iters_per=2000, reward_hole=0.0):
    env = LochLomondEnv(problem_id=problem_id,
                        is_stochastic=True,
                        reward_hole=reward_hole)

    np.random.seed(12)

    results = []

    for episode in range(max_episodes):
        env.reset()
        print('-' * 50)
        print_headers()

        for iteration in range(max_iters_per):
            action = env.action_space.sample()
            observation, reward, done, info = env.step(action)
            print(",".join([
                str(episode),
                str(iteration),
                str(reward),
                str(done),
                str(info),
                str(action)
            ]))

            if done and reward == reward_hole:
                env.render()
                print("Hole Found in " + str(iteration) + " iterations")
                results.append({'iters': iteration, 'success': False})
                break

            if done and reward == 1.0:
                env.render()
                print("Frisbee acquired in " + str(iteration) + " iterations")
                results.append({'iters': iteration, 'success': True})
                break

    return results
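A hedged usage sketch (the aggregation below is an assumption, not part of the original): summarise the returned results list into a success count. Note that only episodes that terminated (goal or hole) appear in results.

# Minimal sketch: run the random agent on problem 0 and report how often it reached the goal.
results = run(problem_id=0, max_episodes=100, max_iters_per=2000, reward_hole=0.0)
successes = sum(1 for r in results if r['success'])
print('Goal reached in {} of {} finished episodes'.format(successes, len(results)))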
Example #3
for e in range(max_episodes):  # iterate over episodes
    observation = env.reset()  # reset the state of the env to the starting state

    for iter in range(max_iter_per_episode):
        # env.render()  # for debugging/development you may want to visualize the individual steps by uncommenting this line
        action = env.action_space.sample()  # your agent goes here (the current agent takes random actions)
        observation, reward, done, info = env.step(action)  # observe what happens when you take the action

        # TODO: You'll need to add code here to collect the rewards for plotting/reporting in a suitable manner

        print("e,iter,reward,done =" + str(e) + " " + str(iter) + " " +
              str(reward) + " " + str(done))

        # Check if we are done and monitor rewards etc...
        if done and reward == reward_hole:
            env.render()
            print("We have reached a hole :-( [we can't move, so stop trying; just give up]")
            break

        if done and reward == +1.0:
            env.render()
            print("We have reached the goal :-) [stop trying to move; we can't. That's OK, we have achieved the goal]")
            break
Example #4
observation_list = list()
reward_list = list()

# Generate the specific problem
env = LochLomondEnv(problem_id=problem_id, is_stochastic=True, reward_hole=reward_hole)

# Let's visualize the problem/env
print('env', env.desc)

# Reset the random generator to a known state (for reproducibility)
np.random.seed(12)

for e in range(max_episodes):  # iterate over episodes
    observation = env.reset()  # reset the state of the env to the starting state

    for iter in range(max_iter_per_episode):
        env.render()  # visualize the individual steps (useful for debugging/development)
        # action = env.action_space.sample()  # your agent goes here (the current agent takes random actions)
        random_agent = runRandom()  # instantiate the random agent (runRandom is defined elsewhere)
        action = random_agent.action()
        observation, reward, done, info = env.step(action)  # observe what happens when you take the action
        print("================================================")
        print("info", info)
        # Collect the observations and rewards for plotting/reporting
        observation_list.append(observation)
        reward_list.append(reward)

        print("e,iter,reward,done =" + str(e) + " " + str(iter) + " " + str(reward) + " " + str(done))

        # Check if we are done and monitor rewards etc...
Example #5
def run_senseless_agent(problem_id, map):

    reward_hole = 0.0
    max_episodes = 10000
    max_iter_per_episode = 1000

    env = LochLomondEnv(problem_id=problem_id, is_stochastic=True,
                        map_name_base=map,
                        reward_hole=reward_hole)

    env.render()
    env.action_space.sample()

    np.random.seed(12)

    # variables for performance evaluation
    # number of times goal is reached out of max_episodes/ (performance measures where reward is collected)
    goal_episodes = []
    # number of episodes agent falls in hole
    hole_episodes = []
    # average number of iterations taken to reach goal per rewarded episode
    goal_iterations = []

    rewards = []

    # number of episodes before goal is first reached
    first_goal = 0

    for e in range(max_episodes):

        rewards_current_episode = 0
        state = env.reset()

        for iter in range(max_iter_per_episode):

            action = env.action_space.sample()
            state, reward, done, info = env.step(action)

            rewards_current_episode += reward

            if (done and reward == reward_hole):
                hole_episodes.append(e)
                break

            if (done and reward == +1.0):
                # env.render()
                goal_episodes.append(e)
                goal_iterations.append(iter+2)

                # sets first goal to episode
                if first_goal == 0:
                    first_goal = e
                break

        rewards.append(rewards_current_episode)

    # calculating steps to goal
    goal_iteration_average = mean(goal_iterations)
    goal_iteration_bestcase = min(goal_iterations)
    goal_iteration_worstcase = max(goal_iterations)


    # splits collected rewards into per 100 episodes
    rewards_per_100_eps = np.split(np.array(rewards), max_episodes / 100)
    rewards_per_100_eps = [str(sum(r / 100)) for r in rewards_per_100_eps]


    return len(goal_episodes), len(hole_episodes), goal_iteration_average, goal_iteration_bestcase, \
           goal_iteration_worstcase,  first_goal, rewards_per_100_eps
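A possible call site (hypothetical; the unpacking mirrors the function's return order and the map name follows the '{}x{}-base' convention used in the other examples):

# Hypothetical driver: evaluate the senseless (random) agent on problem 0 of the 8x8 map.
goals, holes, avg_iters, best_iters, worst_iters, first_goal, rewards_100 = \
    run_senseless_agent(problem_id=0, map='8x8-base')
print('Goals reached:', goals, 'Holes hit:', holes)
print('Iterations to goal (avg/best/worst):', avg_iters, best_iters, worst_iters)
print('First successful episode:', first_goal)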
Example #6
class MyAbstractAIAgent():
    """
    Abstract agent that works as a base for all our agents.
    """
    def __init__(self, problem_id, map_name_base="8x8-base"):
        # map_name_base="4x4-base"
        if not (0 <= problem_id <= 7):
            raise ValueError("Problem ID must be 0 <= problem_id <= 7")

        self.map_name_base = map_name_base
        self.env = LochLomondEnv(problem_id=problem_id,
                                 is_stochastic=self.is_stochastic(),
                                 reward_hole=self.reward_hole(),
                                 map_name_base=map_name_base)

        self.problem_id = problem_id
        self.reset()
        self.out = 'out/'
        self.policy = {}
        self._train = []
        self.graphs = {}

    def is_stochastic(self):
        raise NotImplementedError

    def reward_hole(self):
        raise NotImplementedError

    def reset(self):
        self.rewards = 0
        self.failures = 0
        self.eval = []
        self.timeouts = 0

    def solve(self, episodes=10000, iterations=1000, seed=None, gamma=0.95):
        print('Solving with {} Agent'.format(self.name().capitalize()))
        print('Problem: ', self.problem_id)
        print('Grid: ', self.map_name_base)
        print('Episodes that will run...: ', episodes)

        self.train(episodes=episodes, iterations=iterations)
        rewards = self.rewards
        timeouts = self.timeouts
        failures = self.failures

        for e in range(1, episodes + 1):  # iterate over episodes
            state = self.env.reset()
            self.set_episode_seed(e, seed)

            if e % 1000 == 0:
                print("Eval Episode", e)

            for i in range(1, iterations + 1):
                action = self.action(state)
                state, reward, done, info = self.env.step(action)

                if done:
                    if reward == 1.0:
                        rewards += int(reward)
                    else:
                        failures += 1

                    # break the cycle
                    break

            if not done:
                timeouts += 1

            self.eval.append([
                self.problem_id, e, i,
                to_human(action),
                int(reward), rewards, rewards / e, failures, timeouts
            ])

    def action(self, state):
        raise NotImplementedError

    def train(self, episodes, iterations):
        raise NotImplementedError

    def env(self):
        return self.env

    def set_episode_seed(self, episode, seed=None):
        # by default no seed for abstract agent
        return None

    def alias(self):
        return '{}out_{}_{}_{}'.format(self.out, self.name(), self.problem_id,
                                       self.env.ncol)

    def evaluate(self, episodes):
        self.env.reset()
        print("This is the environment: ")
        print(self.env.render())

        if (len(self.policy) > 0):
            print("This is the final policy: ")
            print_table(
                policy_to_arrows(self.policy, self.env.ncol, self.env.ncol))

        print('Saving Evaluation Files...')
        self.write_eval_files()

        # Plotting mean rewards
        print('Saving Plots...')
        labels = ['Episodes', 'Mean Reward']
        title = 'Problem {}. Plot for {} Agent'.format(
            self.problem_id,
            self.name().capitalize())

        if (len(self._train) > 0):
            subtitle = 'Episodes vs Mean Reward (Training Phase).'
            self.plot_train(range(episodes), labels, title, subtitle, 'mr')

            subtitle = 'First 1000 Episodes vs Mean Reward (Training Phase).'
            self.plot_train(range(1000), labels, title, subtitle,
                            'mr_first_1000')

            subtitle = 'Last 1000 Episodes vs Mean Reward (Training Phase).'
            self.plot_train(range(episodes - 1000, episodes), labels,
                            title, subtitle, 'mr_last_1000')

        if (len(self.eval) > 0):
            subtitle = 'Episodes vs Mean Reward (Evaluation Phase).'
            self.plot_evaluation(range(episodes), labels, title, subtitle,
                                 'mr')

            subtitle = 'First 1000 Episodes vs Mean Reward (Evaluation Phase).'
            self.plot_evaluation(range(1000), labels, title, subtitle,
                                 'mr_first_1000')

            subtitle = 'Last 1000 Episodes vs Mean Reward (Evaluation Phase).'
            self.plot_evaluation(range(episodes - 1000, episodes), labels,
                                 title, subtitle, 'mr_last_1000')

        if (len(self.graphs) > 0):
            subtitle = 'Utilities plot'
            self.plot_utilities(['Episodes', 'U'], title, subtitle)

    def write_eval_files(self):
        def data_for_file(name):
            if name == 'policy':
                return policy_to_list(self.policy)
            if name == 'u':
                return u_to_list(self.U)
            if name == 'eval':
                return self.eval
            if name == 'q':
                return self.Q
            if name == 'train':
                return self._train
            if name == 'graphs':
                return self.graphs

            return []

        for file in self.files():
            if file == 'graphs':
                filename = '{}_{}.json'.format(self.alias(), file)
                with open(filename, 'w') as outfile:
                    json.dump(data_for_file(file), outfile)
            else:
                filename = '{}_{}.csv'.format(self.alias(), file)
                data = [self.header(file)] + data_for_file(file)
                np.savetxt(filename, data, delimiter=",", fmt='%s')
            print('\tFile saved: {}'.format(filename))

    def header(self, key):
        headers = {
            'eval': [
                'id', 'episode', 'iteration', 'action', 'reward', 'rewards',
                'mean_rewards', 'failures', 'timeouts'
            ],
            'policy': ['x', 'y', 'action'],
            'u': ['x', 'y', 'u'],
            'train': [
                'id', 'episode', 'iteration', 'reward', 'rewards',
                'mean_rewards', 'failures', 'timeouts'
            ],
            'graphs': ['x', 'y', 'value'],
            'q': ['position', 'x', 'y', 'action', 'action_friendly', 'value']
        }

        if key in headers:
            return headers[key]

    def plot_train(self, rows, labels, title, subtitle, suffix=''):
        """ Plots mean rewards from training phase """
        train = np.array(self._train)

        x = pd.to_numeric(train[:, 1])
        y = pd.to_numeric(train[:, 5])
        filename = '{}_train_{}.png'.format(self.alias(), suffix)

        self.plot(x, y, rows, labels, filename, title, subtitle)

    def plot_evaluation(self, rows, labels, title, subtitle, suffix=''):
        """ Plots mean rewards from evaluation phase """
        evaluation = np.array(self.eval)

        x = pd.to_numeric(evaluation[:, 1])
        y = pd.to_numeric(evaluation[:, 6])
        filename = '{}_eval_{}.png'.format(self.alias(), suffix)

        self.plot(x, y, rows, labels, filename, title, subtitle)

    def plot_utilities(self, labels, title, subtitle):
        for state, value in self.graphs.items():
            x, y = zip(*value)
            plt.plot(x, y, label=str(state))

        plt.ylim([-0.1, 1.05])
        plt.legend(loc='lower right')
        plt.xlabel(labels[0])
        plt.ylabel(labels[1])
        filename = '{}_utilities.png'.format(self.alias())
        plt.suptitle(title, fontsize=12)
        plt.title(subtitle, fontsize=10)
        plt.savefig(filename)
        plt.close()

        print('\tPlot saved: {}'.format(filename))

    def plot(self, x, y, rows, labels, filename, title, subtitle):
        plt.plot(x[rows], y[rows])
        plt.xlabel(labels[0])
        plt.ylabel(labels[1])
        plt.suptitle(title, fontsize=12)
        plt.title(subtitle, fontsize=10)
        plt.savefig(filename)
        plt.close()

        print('\tPlot saved: {}'.format(filename))
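The class above leaves is_stochastic, reward_hole, action, and train abstract, and also calls a name() method (and a files() method in write_eval_files) that a subclass must provide. A minimal hypothetical subclass (a random agent; the class name and its parameter choices are assumptions, not part of the original) sketches the contract a concrete agent has to satisfy:

class RandomAgent(MyAbstractAIAgent):
    """Hypothetical minimal subclass: takes random actions and needs no training."""

    def name(self):
        return 'random'

    def is_stochastic(self):
        return True

    def reward_hole(self):
        return 0.0

    def action(self, state):
        # Ignore the state and sample a random action from the environment.
        return self.env.action_space.sample()

    def train(self, episodes, iterations):
        # A random agent has nothing to learn, so training is a no-op.
        return None

    def files(self):
        # Only the evaluation log is written out for this sketch.
        return ['eval']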