Example No. 1
    def __init__(self, env, **kwargs):
        super(LearningAgent, self).__init__(
            env
        )  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(
            self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        add_total = False  # local flag; not used further in this method
        self.success = 0
        self.total = 0
        self.counter = 0
        self.epsilon_reset_counter = 0
        self.trial_counter = 0.0
        self.min_epsilon = 0.001
        self.eps_freq = 1.0
        self.filled_cell_count = 0
        self.total_cell_count = 0
        self.updated_func_counter = 0
        global stats_df_counter
        global stats_df

        for key, value in kwargs.iteritems():
            print "%s = %s" % (key, value)
            if key == 'alp':
                self.alpha = value
            elif key == 'gma':
                self.gamma = value
            elif key == 'eps':
                self.epsl = value
        self.epsilon = self.epsl
        print "epsilon: ", self.epsilon
        self.qt = QTable(self.alpha, self.gamma)
        print '-' * 80
Example No. 2
 def test_argmax_without_init(self):
     """ Test max(key) function. """
     qtable = QTable(self.actions)
     state = collections.OrderedDict()
     state['from'] = 1
     state['to'] = 2
     state['rank'] = [0, 1, 2]
     self.assertTrue(qtable.argmax(state) in self.actions)
Example No. 3
 def test_max_without_init(self):
     """ Test max(key) function. """
     qtable = QTable(self.actions)
     state = collections.OrderedDict()
     state['from'] = 1
     state['to'] = 2
     state['rank'] = [0, 1, 2]
     self.assertEqual(self.default, qtable.max(state))
Example No. 4
 def test_argmax_with_init(self):
     """ Test max(key) function. """
     qtable = QTable(self.actions)
     state = collections.OrderedDict()
     state['from'] = 1
     state['to'] = 2
     state['rank'] = [0, 1, 2]
     qtable[state][0] = 1.0
     self.assertEqual(0, qtable.argmax(state))
Example No. 5
 def test_argmax_with_parity(self):
     """ Test max(key) function. """
     qtable = QTable(self.actions)
     state = collections.OrderedDict()
     state['from'] = 1
     state['to'] = 2
     state['rank'] = [0, 1, 2]
     qtable[state][0] = 1.0
     qtable[state][1] = 1.0
     self.assertTrue(qtable.argmax(state) in [0, 1])
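
The tests in Examples No. 2-5 (and the related tests in Examples No. 9 and 15-17) exercise a small dict-backed Q-table with lazily initialized default values and random tie-breaking in argmax. The sketch below is one minimal implementation consistent with those tests; it is an illustration only, not the project's actual QTable, and the constructor's default argument is an assumption.

import random


class QTable(object):
    """Minimal sketch of a dict-backed Q-table consistent with the tests above;
    not the project's actual implementation."""

    def __init__(self, actions, default=0.0):
        self.actions = actions  # e.g. [0, 1, 2]
        self.default = default  # assumed default Q-value
        self._table = {}

    def _state_to_key(self, state):
        # Flatten the ordered state dict (including list values) into a hashable tuple.
        return tuple((k, tuple(v) if isinstance(v, list) else v)
                     for k, v in state.items())

    def __getitem__(self, state):
        key = self._state_to_key(state)
        if key not in self._table:
            self._table[key] = {a: self.default for a in self.actions}
        return self._table[key]

    def __setitem__(self, state, values):
        self._table[self._state_to_key(state)] = dict(values)

    def __str__(self):
        return str(self._table)

    def max(self, state):
        return max(self[state].values())

    def argmax(self, state):
        q_values = self[state]
        best = max(q_values.values())
        # Break ties at random, as test_argmax_with_parity expects.
        return random.choice([a for a, v in q_values.items() if v == best])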
Example No. 6
def main():
    # step 1: loading the environment
    env = gym.make("FrozenLake-v0")

    # step 2: creating the Q-table
    state_size = env.observation_space.n
    action_size = env.action_space.n
    q = QTable(state_size, action_size)

    # step 3: creating the epsilon decay
    e = Epsilon(initial_epsilon=1.0, max_epsilon=1.0, min_epsilon=0.01, decay_rate=0.005)

    # step 4: Q-table training
    total_episodes = 100000
    max_steps = 100
    q, rewards = train_qtable(env, q, e, total_episodes, max_steps)

    print("Score over time {:.4f}".format(sum(rewards) / total_episodes))
    q.print()

    # Play
    env.reset()

    rewards = []

    for episode in range(1000):
        state = env.reset()
        step = 0
        total_rewards = 0

        for step in range(100):
            action = q.select_action(env, state)

            new_state, reward, done, info = env.step(action)

            total_rewards += reward
            state = new_state

            if done:
                break

        rewards.append(total_rewards)

        if episode % 100 == 0:
            print("******************************************")
            print("EPISODE {}".format(episode))
            print("Number of steps: {}".format(step))
            env.render()

    print("Score over time {:.4f}".format(sum(rewards) / 1000))

    env.close()
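
Examples No. 6 and 7 rely on helpers named train_qtable and Epsilon that are not shown on this page. The sketch below is one plausible train_qtable (epsilon-greedy tabular Q-learning with exponential epsilon decay); the attributes read from e and the q.update signature are assumptions, not the original API.

import numpy as np


def train_qtable(env, q, e, total_episodes, max_steps, verbose=False):
    """Sketch of an epsilon-greedy training loop; not the original implementation."""
    rewards = []
    epsilon = e.initial_epsilon  # assumed attribute of the Epsilon helper
    for episode in range(total_episodes):
        state = env.reset()
        episode_reward = 0.0
        for _ in range(max_steps):
            # Explore with probability epsilon, otherwise exploit the table.
            if np.random.uniform(0, 1) < epsilon:
                action = env.action_space.sample()
            else:
                action = q.select_action(env, state)
            new_state, reward, done, _ = env.step(action)
            q.update(state, action, reward, new_state)  # assumed Q-learning update
            episode_reward += reward
            state = new_state
            if done:
                break
        rewards.append(episode_reward)
        # Decay epsilon exponentially towards min_epsilon.
        epsilon = e.min_epsilon + (e.max_epsilon - e.min_epsilon) * np.exp(-e.decay_rate * episode)
        if verbose and episode % 10000 == 0:
            print("episode {}: epsilon {:.3f}".format(episode, epsilon))
    return q, rewards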
Example No. 7
def main():
    # Step 1: create the Taxi-v2 environment
    env = gym.make("Taxi-v2")

    # Step 2: create the QTable
    q = QTable(env.observation_space.n, env.action_space.n, learning_rate=0.7, gamma=0.99)

    # Step 3: create the Epsilon decay
    e = Epsilon()

    # Step 4: Q-table training
    total_episodes = 100000
    max_steps = 100
    q, rewards = train_qtable(env, q, e, total_episodes, max_steps, verbose=True)

    print("Score over time {:.4f}".format(sum(rewards) / total_episodes))
    q.print()

    env.render()

    # Play
    env.reset()

    rewards = []

    for episode in range(1000):
        state = env.reset()
        step = 0
        total_rewards = 0

        for step in range(100):
            action = q.select_action(env, state)

            new_state, reward, done, info = env.step(action)

            total_rewards += reward
            state = new_state

            if done:
                break

        rewards.append(total_rewards)

        if episode % 100 == 0:
            print("******************************************")
            print("EPISODE {}".format(episode))
            print("Number of steps: {}".format(step))
            env.render()

    print("Score over time {:.4f}".format(sum(rewards) / 1000))

    env.close()
Example No. 8
class Agent:
    def __init__(self):
        self.qtable = QTable()

    def load_definitions(self, *defs):
        pass

    def train(self, env, epsilon=0.1, update_q_table=True):
        # RL training parameters
        alpha=0.1
        gamma=0.6
        
        steps = 0
        while not env.state.done:
            # ------------------------------------
            # Choose to explore or exploit
            # ------------------------------------
            if np.random.uniform(0, 1) < epsilon: # Explore action space
                action = self.qtable.get_random_action(env.state) 
            else: # Exploit the action space
                action = self.qtable.get_recommended_action(env.state) 

            if PRINT_ACTIONS_TAKEN: print(action, "\n\n")

            old_state = env.state.get_copy()
            next_state, reward, done, to_undo = env.step(action) # will return error and undo, if unsuccessful
            # ------------------------------------
            # Update the qtable
            # ------------------------------------
            if update_q_table and not isinstance(action, Undo): 
                self.qtable.update(old_state, next_state, action, reward, alpha, gamma)

            # ------------------------------------
            # if it's an already visited state, you should undo it so the proof search goes faster
            # ------------------------------------
            if to_undo:
                env.step(Undo())


            steps += 1

        print("The proof of", env.theorem.name)
        print("...took", steps, "steps.")
        print("Proof generated:", env.state.past_actions)

    def evaluate(self, env):
        self.train(env, epsilon=0, update_q_table=False) # only exploit, not explore

    def apply_antisymmetry(self):
        pass
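
The training loop above delegates the value update to self.qtable.update(old_state, next_state, action, reward, alpha, gamma). A minimal sketch of a Q-table with that update signature, applying the standard tabular Q-learning rule, is shown below; it assumes states are hashable and is not the project's real class.

import collections


class QTable:
    """Sketch of a Q-table matching the update() signature used above;
    an illustration of the standard rule, not the original class."""

    def __init__(self, default=0.0):
        self.default = default
        # q[state][action] -> value, created lazily; states are assumed hashable.
        self.q = collections.defaultdict(dict)

    def update(self, old_state, next_state, action, reward, alpha, gamma):
        # Standard tabular Q-learning:
        #   Q(s, a) <- Q(s, a) + alpha * (reward + gamma * max_a' Q(s', a') - Q(s, a))
        best_next = max(self.q[next_state].values()) if self.q[next_state] else self.default
        old_value = self.q[old_state].get(action, self.default)
        self.q[old_state][action] = old_value + alpha * (reward + gamma * best_next - old_value)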
Example No. 9
    def test_state_to_key(self):
        """ Test _state_to_key(key) function. """
        qtable = QTable(self.actions)
        state1 = collections.OrderedDict()
        state1['from'] = 1
        state1['to'] = 2
        state1['rank'] = [0, 1, 2]

        state2 = collections.OrderedDict()
        state2['from'] = 1
        state2['to'] = 2
        state2['rank'] = [0, 2, 1]

        self.assertNotEqual(qtable._state_to_key(state1), qtable._state_to_key(state2))
Example No. 10
def state_lookup():
    w = World(5, 5, [(1, 1, 3), (2, 2, 4)], [(3, 3, 3), (4, 4, 4)], -1, 13, 13)
    q = QTable(w)
    a = Agent(0, 0)

    state = get_current_state(w, a)

    assert (q[state] == {'north': 0, 'south': 0, 'east': 0, 'west': 0})
Example No. 11
def get_max_neighbors_test():
    a = Agent(3, 2)
    w = World(5, 5, [(1, 1, 3), (2, 2, 4)], [(3, 3, 3), (4, 4, 4)], -1, 13, 13)
    q = QTable(w)

    q[get_current_state(w, a)]['south'] = 13

    assert (get_max_neighbors(w.get_neighbors(*a.get_position()),
                              get_current_state(w, a), q) == ['south'])
Example No. 12
def manager(world,
            agent,
            algo,
            learning_rate,
            discount_rate,
            policy,
            num_steps,
            setup=None):
    """
        This function acts as the main routine: it runs the given algorithm
        on the given world with the given learning rate, discount rate and
        policy.

        Parameters:
            world (World)
                An instance of a World Object representing the world

            agent (Agent):
                An instance of an Agent Object representing the agent

            algo (function):
                A function to call with the world, agent, qtable, learning
                rate, discount rate and policy as parameters; it decides
                where the agent should move and also updates the qtable

            learning_rate (float)

            discount_rate (float)

            policy (string)
                PRANDOM, PEPLOIT or PGREEDY

            num_steps (int)
                how many steps to run for

            setup (list of tuples)
                this is an optional parameter

                it represents different policies that should be activated after
                a particular number of steps

                if this parameter is supplied to the function, it is expected
                to be a list of tuples, where each tuple is expected to
                be of this format:
                    (integer, string)

                where the integer is the number of steps, and the string is the
                policy to switch to after that many steps have been run
    """
    if not setup:
        setup = []

    q = QTable(world._w, world._h)
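
The optional setup schedule described in the docstring is consumed in Example No. 29 through a helper called get_new_policy. The sketch below shows one way such a schedule could be applied; only the call shape mirrors Example No. 29, the body is an assumption, and it works for either the (integer, string) or (integer, function) form of the tuples.

def get_new_policy(setup, current_step, current_policy):
    """Sketch: switch policies according to the (steps, policy) entries in setup."""
    for steps, policy in setup:
        # Once exactly that many steps have run, switch to the scheduled policy.
        if current_step == steps:
            return policy
    return current_policy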
Example No. 13
def main(flags):
    '''
        Runs an agent in an environment.
        params:
            flags (dict): configuration
    '''
    env = gym.make('FrozenLake-v0')
    agent = QTable(env, gamma=flags.gamma, alpha=flags.learning_rate)

    trainer = Trainer(env, agent, flags)
    rewards, lengths = trainer.train(flags.num_episodes, flags.max_steps)

    plot_results(rewards, lengths)
Example No. 14
def p_random_test():
    agent = Agent(3, 3)
    world = World(5, 5, [(1, 1, 3)], [(1, 2, 3)], -1, 13, 13)
    q = QTable(world)

    assert (p_random(agent, world, q) in ["north", "south", "east", "west"])

    world = World(5, 5, [(4, 3, 3)], [(1, 1, 3)], -1, 13, 13)

    assert (p_random(agent, world, q) == "east")

    agent.pick_up()
    agent._set_position(1, 2)
    assert (p_random(agent, world, q) == "north")
Example No. 15
 def test_set_without_init(self):
     """ Test qtable[state][action] = var if state not exists."""
     qtable = QTable(self.actions)
     state = collections.OrderedDict()
     state['from'] = 1
     state['to'] = 2
     state['rank'] = [0, 2, 1]
     qtable[state][0] = 1.0
     changed = {
         0: 1.0,
         1: self.default,
         2: self.default,
     }
     self.assertEqual(changed, qtable[state])
Example No. 16
 def test_get_with_init(self):
     """ Test qtable[state] if state not exists."""
     qtable = QTable(self.actions)
     state = collections.OrderedDict()
     state['from'] = 1
     state['to'] = 2
     state['rank'] = [0, 1, 2]
     wanted = {
         0: self.default,
         1: self.default,
         2: self.default,
     }
     qtable[state] = wanted
     self.assertEqual(wanted, qtable[state])
Example No. 17
    def test_dill(self):
        """ Test the dillability of the class. """
        qtable = QTable(self.actions)

        state1 = collections.OrderedDict()
        state1['from'] = 1
        state1['to'] = 2
        state1['rank'] = [0, 1, 2]

        state2 = collections.OrderedDict()
        state2['from'] = 1
        state2['to'] = 2
        state2['rank'] = [0, 2, 1]

        # create the states
        _ = qtable[state1]
        _ = qtable[state2]

        wanted = str(qtable)
        test = str(dill.loads(dill.dumps(qtable)))
        self.assertEqual(wanted, test)
Example No. 18
def main(filename=None,
         time_to_run=5,
         probability_moving=0.8,
         constant_reward=-0.04):
    """
    Main function for the program.
    :param filename: a txt file
    :param time_to_run: number of seconds to learn for
    :param probability_moving: probability of moving in the desired direction
    :param constant_reward: reward for moving (usually negative)
    :return: None
    """
    # Initialize the lookup table
    qtable = QTable()
    board = None
    # read the board from the file
    with open(filename, 'r') as f:
        board = f.read().split('\n')
        for index in range(len(board)):
            board[index] = board[index].split("\t")
        board = [list(x) for x in board]
    for row in board:
        for element in range(len(row)):
            row[element] = int(row[element])

    # Initialize the board
    board_object = Board(len(board), len(board[0]), board)
    # populate the lookup table with the movement reward as initial values
    board_object.populate_qtable(qtable, constant_reward)
    # Initialize the agent
    agent = Agent(qtable, board_object, time_to_run, probability_moving,
                  constant_reward)
    # Run the agent (start learning)
    results = agent.run()
    # print the results
    print(results)
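
The parsing loop above implies that the board file is plain text with one row per line and tab-separated integer cell values. A hypothetical input and call are shown below; the file name and the values are illustrative only.

# rewards.txt (tab-separated integers, one row per line), e.g.:
#     -1    -1    10
#     -1    -100  -1
#     -1    -1    -1
main(filename="rewards.txt", time_to_run=5, probability_moving=0.8, constant_reward=-0.04)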
Example No. 19
N_STATES = 6
ACTIONS = [0, 1]
EPSILON = 0.9
ALPHA = 0.1
GAMMA = 0.9
MAX_EPISODES = 100
FRESH_TIME = 0.1


if __name__ == "__main__":
    env = gym.make('GoAhead-v0')

    rl1 = QTable(
        n_states=N_STATES,
        epsilon=EPSILON,
        gamma=GAMMA,
        alpha=ALPHA,
        actions=ACTIONS
        )
    
    rl2 = DQN(
        n_states=1,
        n_actions=2,
        epsilon=EPSILON,
        gamma=GAMMA
    )

    for i_episode in range(MAX_EPISODES):
        observation = env.reset()
        for t in range(100):
            env.render(fresh_time=FRESH_TIME)
Example No. 20
def run(env,
        n_episodes,
        n_steps,
        initial_value=0,
        learning_rate=0.8,
        gamma=0.9,
        epsilon=0.1):

    env.reset()
    number_agents = len(env.agents)
    env.step(dict(zip(range(number_agents), [2] * number_agents)))

    my_env = LocalEnv(env.rail.grid, env.agents)
    initial_state = my_env.initial_state
    qtable = QTable(number_agents, initial_state, initial_value, gamma,
                    learning_rate)

    lap_time = datetime.now()

    steps_per_episode = []

    for episode in range(n_episodes):

        np.random.seed()
        my_env.restart_agents()
        current_state = my_env.get_current_state()

        if episode % 100 == 0:
            print('episode:', episode)
            print('in', datetime.now() - lap_time)
            lap_time = datetime.now()

        count = 0
        for step in range(n_steps):

            if choose_at_random(epsilon):
                current_possible_actions = my_env.get_current_possible_actions()
                rand_index = np.random.choice(len(current_possible_actions))
                action = current_possible_actions[rand_index]
            else:
                action = qtable.get_max_action(current_state)

            if number_agents == 1:
                reward, new_state, handles_done = my_env.step({0: action})
            else:
                reward, new_state, handles_done = my_env.step(
                    dict(zip(range(number_agents), action)))

            count += 1

            if len(handles_done) == number_agents:
                break

#            if number_agents != 1 :
#                action = list(action)
#                for handle in handles_done:
#                    action[handle] = 4
#                action = tuple(action)
#
#            print('')
#            print(current_state)
#            print(action)
#            print(new_state)
            qtable.update_table(current_state, action, new_state, reward)
            current_state = new_state

        steps_per_episode.append(count)

    return steps_per_episode, my_env, qtable
Example No. 21
 def __init__(self, env):
     self.env = env
     self.qtable = QTable(env.action_space)
Example No. 22
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""
    def __init__(self, env, **kwargs):
        super(LearningAgent, self).__init__(
            env
        )  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(
            self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        add_total = False  # local flag; update() keeps its own copy
        self.success = 0
        self.total = 0
        self.counter = 0
        self.epsilon_reset_counter = 0
        self.trial_counter = 0.0
        self.min_epsilon = 0.001
        self.eps_freq = 1.0
        self.filled_cell_count = 0
        self.total_cell_count = 0
        self.updated_func_counter = 0
        global stats_df_counter
        global stats_df

        for key, value in kwargs.iteritems():
            print "%s = %s" % (key, value)
            if key == 'alp':
                self.alpha = value
            elif key == 'gma':
                self.gamma = value
            elif key == 'eps':
                self.epsl = value
        self.epsilon = self.epsl
        print "epsilon: ", self.epsilon
        self.qt = QTable(self.alpha, self.gamma)
        print '-' * 80

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required
        totalTime = self.env.get_deadline(self)
        self.qt.printVal(totalTime)
        self.trial_counter += 1.0
        if self.epsilon > self.min_epsilon:
            self.epsilon = (5.0 * self.epsl) / self.trial_counter
            self.eps_freq = math.ceil(1.0 / self.epsilon)
            print "self.epsilon:", self.epsilon, ", self.eps_freq: ", self.eps_freq, "\n"

    def update(self, t):
        global stats_df
        global stats_df_counter
        self.counter += 1
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        current_state = self.env.sense(self)
        self.state = current_state

        deadline = self.env.get_deadline(self)
        # TODO: Update state

        # TODO: Select action according to your policy

        #action = random.choice([None, 'forward', 'left', 'right'])
        #if self.total > 0 and self.total % self.epsilon_freq == 0.0:
        #    print "simulated annealing at ", self.total
        #    action = random.choice([None, 'forward', 'left', 'right'])
        #else:
        if self.epsilon > self.min_epsilon and deadline != 0 and deadline != self.eps_freq and math.floor(
                deadline % self.eps_freq) == 0.0:
            #self.epsilon_reset_counter += 1
            action = random.choice([None, 'forward', 'left', 'right'])
            print "annealing now.", "self.epsilon:", self.epsilon, ", action: ", action, ", deadline:", deadline

        else:
            #print "self.counter: ", self.counter, ", multiplier:", (self.counter * self.epsilon)
            action = self.qt.get_next_action(self.next_waypoint, deadline,
                                             current_state)

        # Execute action and get reward
        reward = self.env.act(self, action)

        add_total = False
        if deadline == 0:
            add_total = True
        if reward > 10:
            self.success += 1
            add_total = True
        if add_total:
            self.total += 1
            print("success: {} / {}".format(self.success, self.total))

        if self.total == 100:

            for item, frame in self.qt.qtable.iteritems():
                for item2, frame2 in frame.iteritems():
                    for item3, frame3 in frame2.iteritems():
                        for item4, frame4 in frame3.iteritems():
                            self.total_cell_count += 1
                            #print("f4:", frame4)
                            if frame4 != 0.0:
                                #print "\n"
                                self.printNav(item2)
                                self.printTraffic(item3)
                                self.printTrafficLight(item4)
                                self.printAction(item)
                                print "Q-Val: {0:.5f}".format(frame4)
                                self.filled_cell_count += 1
            print '-' * 80
            print "updated cells: ", self.filled_cell_count, ", self.total_cell_count:", self.total_cell_count, ", updated_func_counter:", self.updated_func_counter
            print "self.alpha:", self.alpha, "self.gamma:", self.gamma, ", self.epsilon:", self.epsl, ", success:", self.success, " in steps: ", deadline
            stats_df.loc[stats_df_counter] = [
                self.alpha, self.gamma, self.epsl, self.success, deadline
            ]
            stats_df_counter += 1
            print '_' * 80
            #    print '_'*20
        # TODO: Learn policy based on state, action, reward
        next_state_value = self.env.sense(self)
        next_state_deadline = self.env.get_deadline(self)
        next_state_waypoint = self.planner.next_waypoint()
        self.qt.update(self.next_waypoint, deadline, current_state, action,
                       reward, next_state_value, next_state_waypoint, self,
                       self.env)
        self.updated_func_counter += 1

    def printAction(self, code):
        print '|',
        if code == 'AN':
            print "Action: None",
        elif code == 'BF':
            print "Action: Forward",
        elif code == 'CR':
            print "Action: Right",
        elif code == 'DL':
            print "Action: Left",
        print '|',

    def printNav(self, code):
        print '|',
        if code == 0:
            print "Nav: None",
        elif code == 1:
            print "Nav: Forward",
        elif code == 2:
            print "Nav: Right",
        elif code == 3:
            print "Nav: Left",

    def printTraffic(self, code):
        left_mask = 0b000011
        right_mask = 0b001100
        oncoming_mask = 0b110000

        left_filtered = code & left_mask
        right_filtered = code & right_mask
        oncoming_filtered = code & oncoming_mask

        print '| Traffic state: ',
        if left_filtered == 0:
            print "Left: None",
        elif left_filtered == 1:
            print "Left: Forward",
        elif left_filtered == 2:
            print "Left: Right",
        elif left_filtered == 3:
            print "Left: Left",
        print '-+-',

        if right_filtered == 0:
            print "Right: None",
        elif right_filtered == 4:
            print "Right: Forward",
        elif right_filtered == 8:
            print "Right: Right",
        elif right_filtered == 12:
            print "Right: Left",
        print '-+-',

        if oncoming_filtered == 0:
            print "Oncoming: None",
        elif oncoming_filtered == 16:
            print "Oncoming: Forward",
        elif oncoming_filtered == 32:
            print "Oncoming: Right",
        elif oncoming_filtered == 48:
            print "Oncoming: Left",

    def printTrafficLight(self, code):
        print '| ',
        if code == 0:
            print "Light: Red",
        else:
            print "Light: Green",
Example No. 23
 def loadQTable(self):
     self.qtable = QTable(self.states, self.actions, getNextState=self._getNextState)
     self.qtable[self.states[0], self.actions[0]].nextState = self.states[0]
     self.qtable[self.states[-1], self.actions[1]].nextState = self.states[-1]
     for key in self.qtable:
Example No. 24
def main():
    """Main procedure"""

    # Init data structures
    qtable = QTable([-10, 10], \
                    [ \
                        (-AREA_SIZE, AREA_SIZE, 8), \
                        (-1, 1, 10), \
                        (-SAFE_ANGLE_RAD, SAFE_ANGLE_RAD, 28), (-1, 1, 28) \
                    ])

    # Inverted pendulum model
    initial_state = (0.0, 0.0, 0.0, 0.0)
    model = Model(initial_state, AREA_SIZE, SAFE_ANGLE_RAD)

    # Reinforcement learning
    qtable.learn(model,
                 LEARNING_ITERATION, \
                 MAX_STATE_TRANSITIONS, \
                 SIMULATION_TIME_DELTA, \
                 LEARNING_RATE, \
                 DISCOUNT_FACTOR)

    # Visualize QTable
    qtable.draw(os.path.join(_DIR, "./../output/qtable.png"))

    # Clear temporary files
    delete_temp_files()

    # Run inverted pendulum system simulation
    model.reset()

    for i in range(0, round(SIMULATION_TIME / SIMULATION_TIME_DELTA) + 1):
        print("%.2fsec" % (i * SIMULATION_TIME_DELTA), \
              model.get_state(), \
              qtable.get_q_vals(model.get_state()))
        print(model.is_system_safe())

        if not model.is_system_safe():
            print("FAIL")
            break

        force = qtable.get_best_action(model.get_state())
        model.simulate(force, SIMULATION_TIME_DELTA)

        model.draw_state(force, os.path.join(_DIR, \
                                             "./../output/state_%06dms.png" % \
                                             (i)), qtable)

    # Generate video
    print("\nGenerating video...", end="")
    sys.stdout.flush()

    if _platform == "linux":                                         # GNU/Linux
        subprocess.call(os.path.join(_DIR, "./../make_video.sh"), shell=True)
    elif _platform == "darwin":                                           # OS X
        pass
    elif _platform == "win32" or _platform == "cygwin":             # Windows...
        subprocess.call(os.path.join(_DIR, "./../make_video.bat"), shell=True)
        import winsound
        freq = 2500
        dur = 1000
        winsound.Beep(freq, dur)

    print("DONE")
Example No. 25
#    Num Observation         Min                     Max
#    0   Cart Position       -4.8                    4.8
#    1   Cart Velocity       -Inf                    Inf
#    2   Pole Angle          -0.418 rad (-24 deg)    0.418 rad (24 deg)
#    3   Pole Velocity       -Inf                    Inf

bounds = list(zip(env.observation_space.low, env.observation_space.high))

# Velocity bounds by default are infinite, so rebind them.
bounds[1] = [-1, 1]
bounds[3] = [-math.radians(50), math.radians(50)]

observation_space = discretize_observation_space(
    bounds, [15, 5, math.radians(1), math.radians(2)])
actions = [i for i in range(env.action_space.n)]
table = QTable(observation_space, actions)

prev_score = 0

for episode in range(n_episodes):
    observation = env.reset()
    state_action_pairs = []
    t_steps_taken = 0

    while True:
        env.render()
        state = discretize_state(observation, observation_space)
        action = table.decide_action(state)
        state_action_pairs.append((state, action))
        observation, reward, done, info = env.step(action)
        t_steps_taken += 1
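
Example No. 25 depends on discretize_observation_space and discretize_state, which are not shown on this page. The sketch below is one way to implement them, treating the second argument as a per-dimension bin width; both that interpretation and the helper bodies are assumptions.

import numpy as np


def discretize_observation_space(bounds, widths):
    """Sketch: build bin edges for each observation dimension (width semantics assumed)."""
    return [np.arange(low, high, width) for (low, high), width in zip(bounds, widths)]


def discretize_state(observation, observation_space):
    # Map each continuous component to its bin index, producing a hashable tuple
    # that can be used as a Q-table key.
    return tuple(int(np.digitize(value, edges))
                 for value, edges in zip(observation, observation_space))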
Example No. 26
def state_space_present_test():
    w = World(5, 5, [(1, 1, 3), (2, 2, 4)], [(3, 3, 3), (4, 4, 4)], -1, 13, 13)
    q = QTable(w)

    assert (len(q._table) == 5 * 5 * 2 * 2 * 2 * 2 * 2)
Example No. 27
    scheduler = 5
    with open('waiting_time_{0}.csv'.format(scheduler), mode='w') as waiting_time_file, \
        open('action_selection.csv', mode='w') as action_selection:
        waiting_time_file = csv.writer(waiting_time_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        action_selection = csv.writer(action_selection, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)

        simulator = LogicSimulator(waiting_time_file=waiting_time_file, action_file=action_selection)
        simulator.schedulers = [
            FifoScheduler(simulator),
            LQFScheduler(simulator),
            FixedTimeScheduler(simulator, 300),
            FixedTimeScheduler(simulator, 200),
            FixedTimeScheduler(simulator, 400),
            #PrioWEScheduler(env)
        ]
        agent = QTable(256, len(simulator.schedulers), simulator.schedulers)
        agent.load_table()
        done = False     
        hour = 1
        state = simulator.get_state()
        while not done:
            state, _, done = simulator.step(agent.act(state, greedy=False))
            #_, _, done = simulator.step(scheduler)
            if (simulator.time % simulator.time_steps_per_hour == 0):
                print('Simulating hour: {0}'.format(hour))
                simulator.save_stats()
                hour += 1
        """
        plt.subplot(4,1,1)
        plt.plot(simulator.x, simulator.ny, 'b',label='NORTH')
        plt.legend(loc='upper left')
Example No. 28
def main():
    # print(colorsys.rgb_to_hsv(86, 201, 123))
    # exit()
    f = open("results.txt", "a")

    red = (217, 41, 56)
    purple = (148, 105, 191)
    blue = (36, 132, 191)
    green = (50, 166, 46)
    orange = (242, 98, 15)
    expColors = [blue, orange, green, red, purple]
    render = False
    seedC = 42
    pygame.init()
    clock = pygame.time.Clock()
    for run in range(2):

        plot1Surface = pygame.Surface((580, 440))
        plot1Surface.fill((199, 189, 189))
        plot2Surface = pygame.Surface((480, 440))
        plot2Surface.fill((199, 189, 189))

        # plt.show()
        np.random.seed(seedC)
        frameRate = 10
        cellSize = 40
        agentSize = 4
        mainSurfaceSize = (1380, 820)
        flags = DOUBLEBUF
        mainSurface = pygame.display.set_mode(mainSurfaceSize, flags)
        mainSurface.set_alpha(None)
        pygame.display.set_caption("QLearning and SARSA")
        mainSurface.fill((199, 189, 189))
        pickupPoints = [(0, 0), (2, 2), (4, 4)]
        dropoffPoints = [(1, 4), (4, 0), (4, 2)]
        NUM_STATES = 50
        NUM_ACTIONS = 6
        qtableLocation = (1100, 0)
        numGrid = (5, 5)

        pickupItemCount1 = [5, 5, 5]
        dropoffItemCount1 = [0, 0, 0]
        startingState = State(0, 4, 0)
        startLocation1 = (0, 0)

        world1 = PDWorld(startLocation1, cellSize, mainSurfaceSize, numGrid,
                         startingState, agentSize, pickupPoints, dropoffPoints,
                         pickupItemCount1, dropoffItemCount1)

        pickupItemCount2 = [5, 5, 5]
        dropoffItemCount2 = [0, 0, 0]
        startingState2 = State(0, 4, 0)
        startLocation2 = (220, 0)
        world2 = PDWorld(startLocation2, cellSize, mainSurfaceSize, numGrid,
                         startingState, agentSize, pickupPoints, dropoffPoints,
                         pickupItemCount2, dropoffItemCount2)

        pickupItemCount3 = [5, 5, 5]
        dropoffItemCount3 = [0, 0, 0]
        startingState3 = State(0, 4, 0)
        startLocation3 = (440, 0)
        world3 = PDWorld(startLocation3, cellSize, mainSurfaceSize, numGrid,
                         startingState, agentSize, pickupPoints, dropoffPoints,
                         pickupItemCount3, dropoffItemCount3)

        pickupItemCount4 = [5, 5, 5]
        dropoffItemCount4 = [0, 0, 0]
        startingState4 = State(0, 4, 0)
        startLocation4 = (660, 0)
        world4 = PDWorld(startLocation4, cellSize, mainSurfaceSize, numGrid,
                         startingState, agentSize, pickupPoints, dropoffPoints,
                         pickupItemCount4, dropoffItemCount4)

        pickupItemCount5 = [5, 5, 5]
        dropoffItemCount5 = [0, 0, 0]
        startingState5 = State(0, 4, 0)
        startLocation5 = (880, 0)
        world5 = PDWorld(startLocation5, cellSize, mainSurfaceSize, numGrid,
                         startingState, agentSize, pickupPoints, dropoffPoints,
                         pickupItemCount5, dropoffItemCount5)

        policy1 = Policy(PolicyType.RANDOM)
        policy2 = Policy(PolicyType.RANDOM)
        policy3 = Policy(PolicyType.RANDOM)
        policy4 = Policy(PolicyType.RANDOM)
        policy5 = Policy(PolicyType.RANDOM)

        qtable1 = QTable(NUM_STATES, NUM_ACTIONS, mainSurfaceSize,
                         qtableLocation, Populate.ZEROS, world1)
        qtable2 = QTable(NUM_STATES, NUM_ACTIONS, mainSurfaceSize,
                         qtableLocation, Populate.ZEROS, world2)
        qtable3 = QTable(NUM_STATES, NUM_ACTIONS, mainSurfaceSize,
                         qtableLocation, Populate.ZEROS, world3)
        qtable4 = QTable(NUM_STATES, NUM_ACTIONS, mainSurfaceSize,
                         qtableLocation, Populate.ZEROS, world4)
        qtable5 = QTable(NUM_STATES, NUM_ACTIONS, mainSurfaceSize,
                         qtableLocation, Populate.ZEROS, world5)
        #
        r1 = RLearning(1, world1, qtable1, policy1, RL.Q_LEARNING, 0.3, 0.5,
                       0.2, 0, 8000, 0, f)
        r2 = RLearning(2, world2, qtable2, policy2, RL.Q_LEARNING, 0.3, 0.5,
                       0.2, 0, 8000, 0, f)
        r3 = RLearning(3, world3, qtable3, policy3, RL.SARSA, 0.3, 0.5, 0.2, 0,
                       8000, 0, f)
        r4 = RLearning(4, world4, qtable4, policy4, RL.SARSA, 0.3, 1, 0.2, 0,
                       8000, 0, f)
        r5 = RLearning(5, world5, qtable5, policy5, RL.Q_LEARNING, 0.3, 0.5,
                       0.2, 0, 8000, 0, f)

        qtables = [qtable1, qtable2, qtable3, qtable4, qtable5]
        rl = [r1, r2, r3, r4, r5]
        currentStates = []
        nextStates = []
        for i in range(len(rl)):
            currentStates.append(startingState)
            nextStates.append(startingState)
        i = 0
        for r in rl:
            r.world.qtable = qtables[i]
            i += 1
            r.nextEpisode()
        # pygame.time.wait(1500)
        selected = 0
        for step in range(8000):
            mainSurface.fill((199, 189, 189))
            # if step == 400:
            #     frameRate = 0.5
            clickBoxes = []
            for r in range(len(rl)):
                minX = rl[r].world.startLocation[0]
                minY = rl[r].world.startLocation[1]
                maxX = rl[r].world.startLocation[
                    0] + cellSize * numGrid[0] + 10 + 6
                maxY = rl[r].world.startLocation[
                    1] + cellSize * numGrid[1] + 10 + 6
                clickBoxes.append([minX, maxX, minY, maxY])

            event = pygame.event.poll()
            if event.type == pygame.QUIT:
                exit()
            if event.type == pygame.MOUSEBUTTONDOWN:
                mousex, mousey = pygame.mouse.get_pos()
                for i in range(len(rl)):
                    if mousex > clickBoxes[i][0] and mousex < clickBoxes[i][
                            1] and mousey > clickBoxes[i][
                                2] and mousey < clickBoxes[i][3]:
                        selected = i
                    # if r.expNum-1 == selected:
                    #     r.world.selected = True
                    #     print(clickBoxes[selected], mousex, mousey)
                    #
                    # else:
                    #     r.world.selected = False

            for r in rl:
                r.color = expColors[r.expNum - 1]
                # print(r.r)
                currentStates[r.expNum - 1] = r.s
                event = pygame.event.poll()
                if event.type == pygame.QUIT:
                    exit()
                if event.type == MOUSEBUTTONDOWN:
                    #
                    # pygame.display.update()
                    # mainSurface.fill((199, 189, 189))
                    if event.button == 3:
                        for ri in range(len(rl)):
                            if selected == ri:
                                if rl[ri].qtable.selected == 1:
                                    rl[ri].qtable.selected = 0
                                else:
                                    rl[ri].qtable.selected = 1

                expN = r.expNum
                if expN - 1 == selected:
                    r.world.selected = True
                    r.world.colorMode = True
                else:
                    r.world.selected = False
                    r.world.colorMode = False

                # if step == 3999 or step == 199 or step == r.steps:
                #     original = []
                #     for i in range(5):
                #         original.append(rl[i].world.state.b)
                #         for j in range(2):
                #             txt = ["_without_package.png","_with_package.png"]
                #             rl[i].world.state.b = j
                #
                #             rl[i].update()
                #             # mainSurface.fill(Color.VL_GREY)
                #             # pygame.display.update()
                #             rl[i].draw(mainSurface)
                #             pygame.display.update()
                #             filename = 'Run_' + str(run+1) + '_Experiment_' + str(i+1) + '_' +str(step) + txt[j]
                #             # 422 + 274
                #
                #             qtables[i].update()
                #             qtables[i].draw(mainSurface)
                #             qtableSurface = pygame.Surface((422,816))
                #             qtableSurface.blit(qtables[i].surface,(0,0))
                #             surface = pygame.Surface((274,410))
                #             surface.fill((199, 189, 189))
                #             surface.blit(rl[i].world.surface,(0,0))
                #             surface.blit(rl[i].surface,(12,274))
                #             pygame.image.save(qtableSurface,'Run_'+str(run+1)+'_Experiment_'+str(i+1)+'_qtable_'+txt[j]+'.png')
                #             pygame.image.save(surface,filename)
                #         rl[i].world.state.b = original[i]

                if r.expNum == 1 and step == 4000:
                    r.policy.switchPolicy(PolicyType.GREEDY)
                if r.expNum == 2 and step == 200:
                    r.policy.switchPolicy(PolicyType.EXPLOIT)
                if r.expNum == 3 and step == 200:
                    r.policy.switchPolicy(PolicyType.EXPLOIT)
                if r.expNum == 4 and step == 200:
                    r.policy.switchPolicy(PolicyType.EXPLOIT)
                if r.expNum == 5 and step == 200:
                    r.policy.switchPolicy(PolicyType.EXPLOIT)
                if r.expNum == 5 and r.isTerminalState():
                    r.world.dropoffPoints = pickupPoints
                    r.world.pickupPoints = dropoffPoints
                    # exit()

                # actns = r.world.getApplicableActions(r.world.state)
                # # if not r.isTerminalState():
                # newstate = r.applyaction(r.world.state,np.random.choice(actns))
                # r.world.state = newstate
                if r.isTerminalState():
                    r.nextEpisode()
                    r.minStep.append(r.currentStep)
                    r.currentStep = 0
                    r.world.reset()

                r.update()
                # r.world.draw(mainSurface)

            for r in rl:
                r.draw(mainSurface)
                if step < r.steps:
                    r.nextStep()

            qtables[selected].update()
            qtables[selected].draw(mainSurface)

            for r in range(len(rl)):
                if r == selected:
                    color = (0, 0, 0)
                    offsetx = 0
                    offsety = 29

                    startL = (rl[selected].world.startLocation[0] +
                              cellSize * numGrid[0] + offsetx,
                              rl[selected].world.startLocation[1] +
                              cellSize * numGrid[1] + offsety)
                    startL1 = (rl[selected].world.startLocation[0] +
                               cellSize * numGrid[0] + offsetx + 15,
                               rl[selected].world.startLocation[1] +
                               cellSize * numGrid[1] + offsety)
                    startL2 = (rl[selected].world.startLocation[0] +
                               cellSize * numGrid[0] + offsetx + 15,
                               rl[selected].world.startLocation[1] +
                               cellSize * numGrid[1] + offsety + 135)
                    startL3 = (1095, 364)
                    startLL = (1095, 415)

                    # pygame.draw.circle(mainSurface, color, startL1, 2)
                    # pygame.draw.circle(mainSurface, color, startL2, 2)

                    # pygame.draw.circle(mainSurface, color, startLL, 2)
                    pygame.draw.line(mainSurface, expColors[selected], startL,
                                     startL1, 2)
                    pygame.draw.line(mainSurface, expColors[selected], startL1,
                                     startL2, 2)
                    pygame.draw.line(mainSurface, expColors[selected], startL2,
                                     startL3, 2)
                    pygame.draw.line(mainSurface, expColors[selected], startL3,
                                     startLL, 2)
                    pygame.draw.circle(mainSurface, (255, 255, 255), startL, 7)
                    pygame.draw.circle(mainSurface, expColors[selected],
                                       startL, 5)
                    pygame.draw.circle(mainSurface, (255, 255, 255), startL, 3)

                    pygame.draw.circle(mainSurface, color, startLL, 4)

                # lw = 38
                # startColorCo = (rl[r].world.startLocation[0] + 10,
                #                 rl[r].world.startLocation[1] + cellSize * numGrid[1] + lw)
                # # pygame.draw.circle(mainSurface, purple, startColorCo, 2)
                #
                # startColorCoE = (rl[r].world.startLocation[0] + cellSize * numGrid[0] + 8,
                #                  rl[r].world.startLocation[1] + cellSize * numGrid[1] + lw)
                # pygame.draw.line(mainSurface,expColors[r],startColorCo,startColorCoE, 6)

            for r in rl:
                nextStates[r.expNum - 1] = r.s

            if step % 1 == 0:
                # plt.figure(figsize=(5,5))
                plot1Surface.fill((199, 189, 189))
                fig, ax = plt.subplots(figsize=(6.2, 4.4), facecolor='#C7BDBD')
                canvas = agg.FigureCanvasAgg(fig)
                t = range(step + 1)

                ax.set(xlabel='step', ylabel='reward', title='Step vs Reward')
                ax.set_facecolor('#C7BDBD')
                mpl.rcParams['legend.facecolor'] = '#C7BDBD'
                mpl.rcParams["legend.fancybox"] = False
                for r in rl:
                    s = r.rewardPerTimeStep
                    lt = 1
                    ls = ':'
                    if r.expNum - 1 == selected:
                        lt = 2.0
                        ls = '-'
                    ax.plot(t,
                            s,
                            label='Exp ' + str(r.expNum),
                            linewidth=lt,
                            linestyle=ls)
                    ax.grid()
                    ax.legend()
                canvas.draw()
                renderer = canvas.get_renderer()
                raw_data = renderer.tostring_rgb()
                size = canvas.get_width_height()
                image = pygame.image.fromstring(raw_data, size, "RGB")

                # fig.savefig("stepVreward.png", transparent=True)
                # image = pygame.image.load('stepVreward.png')
                # image = pygame.transform.scale(image,(400,400))
                plt.close('all')
                rect = image.get_rect()
                plot1Surface.blit(image, rect)

            # plot2Surface.fill((199, 160, 189))

            for r in rl:
                if r.isTerminalState():
                    fig1, ax1 = plt.subplots(figsize=(4.8, 4.4))
                    ax1.set(xlabel='s/e',
                            ylabel='steps',
                            title='steps per terminal episode')
                    plot2Surface.fill((199, 189, 189))
                    for rk in rl:
                        t = range(rk.episodes - 1)
                        s = rk.minStep
                        ax1.plot(t, s, marker='o')
                    fig1.savefig('sevs.png', transparent=True)
                    image1 = pygame.image.load('sevs.png')
                    rect1 = image1.get_rect()
                    plot2Surface.blit(image1, rect1)

            mainSurface.blit(plot1Surface, (10, 370))
            mainSurface.blit(plot2Surface, (610, 370))
            plt.close('all')
            pygame.display.update()
            clock.tick(frameRate)
            seedC += 1

        for r in rl:
            l = str(run + 1) + ' ' + str(r.expNum) + ' '
            f.write(l)
            r.saveRunStatistics()

    f.close()
Example No. 29
def manager(world,
            agent,
            learning_function,
            learning_rate,
            discount_rate,
            policy,
            num_steps,
            setup=None,
            swap_after_iter=None,
            filename="give_me_a_name.txt",
            state_space='big'):
    """
        This function acts as the main routine: it runs the given
        learning_function on the given world with the given learning rate,
        discount rate and policy.

        Parameters:
            world (World)
                An instance of a World Object representing the world

            agent (Agent):
                An instance of an Agent Object representing the agent

            learning_function (function):
                A function to call with the world, agent, qtable, learning
                rate, discount rate and policy as parameters; it decides
                where the agent should move and also updates the qtable

            learning_rate (float)

            discount_rate (float)

            policy (function)
                The function for the given policy

            num_steps (int)
                how many steps to run for

            setup (list of tuples)
                this is an optional parameter

                it represents different policies that should be activated after
                a particular number of steps

                if this parameter is supplied to the function, it is expected
                to be a list of tuples, where each tuple is expected to
                be of this format:
                    (integer, function)

                where the integer is the number of steps, and the function is the
                policy to switch to after that many steps have been run
    """
    if not setup:
        setup = []

    q = QTable(world, state_space=state_space)
    current_step = 0

    # Set these to None for the SARSA algorithm; the first action is chosen inside the loop below.
    action = None
    next_action = None
    iteration = 1

    movements = []

    steps_this_iter = 0
    steps_per_iter = []

    heatmap = [[0 for _ in range(world._w)] for __ in range(world._h)]
    swapped = False

    while current_step < num_steps:
        movements.append((agent.get_position(), agent.is_holding_block()))
        heatmap[agent.get_position()[1]][agent.get_position()[0]] += 1

        if is_world_solved(world, agent):
            world.reset_world()
            agent.reset_to_start()
            iteration += 1

            steps_per_iter.append(steps_this_iter)
            steps_this_iter = 0

        steps_this_iter += 1

        if swap_after_iter and (swap_after_iter +
                                1) == iteration and not swapped:
            world.swap_pickup_dropoff()
            swapped = True

        # The policy will tell us what our next action will be
        #
        # We have to also compute the next action because SARSA requires this
        # information.  We just return a new agent object that has pretended to
        # move for the first action to the policy function.
        #
        #   There is probably a better way of doing this
        if not action:
            action = policy(agent, world, q, state_space=state_space)
            next_action = policy(agent.pretend_move(action),
                                 world,
                                 q,
                                 state_space=state_space)
        else:
            action = next_action
            next_action = policy(agent.pretend_move(action),
                                 world,
                                 q,
                                 state_space=state_space)

        #   Update the q table based on the state we are in and the action we
        #   have chosen
        learning_function(world,
                          agent,
                          q,
                          action,
                          next_action,
                          learning_rate,
                          discount_rate,
                          state_space=state_space)

        # If we are on a pick up and don't have a block, pick it up.  If we are
        # on a drop off and have a block, drop it off
        pickup_dropoff(world, agent)

        # Move to our new location based on our action
        agent.move(action)

        current_step += 1

        policy = get_new_policy(setup, current_step, policy)

    write_experiment_output(OUT_DIR, filename, world, agent, q, policy,
                            iteration, movements, heatmap, state_space,
                            steps_per_iter)
Example No. 30
 def __init__(self):
     self.qtable = QTable()