Example #1
    def __init__(self,
                 num_states=(2 * 4) * (2 * 4) * 2,
                 num_actions=5,
                 alpha=1.0,
                 alpha_decay=0.99996,
                 gamma=0.9,
                 epsilon=1.0,
                 epsilon_decay=0.99996,
                 max_steps_per_episode=1000,
                 max_num_episodes=1000000,
                 save_model_per=1000000,
                 verbose=False):

        self.world = World()
        self.game = Play_game()

        # inputs
        self.alpha = alpha
        self.alpha_decay = alpha_decay
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.max_steps_per_episode = max_steps_per_episode
        self.max_num_episodes = max_num_episodes
        self.save_model_per = save_model_per

        # initialize
        self.num_states = num_states  # 4x2 grid, two players, with or without ball
        self.num_actions = num_actions

        self.q_table = np.full(shape=(num_states, num_actions), fill_value=1.0)
        self.q_tables = {
            'A': deepcopy(self.q_table),
            'B': deepcopy(self.q_table)
        }

        self.state = {}
        self.actions = {
            'A': 0,
            'B': 0
        }  # map N, S, E, W, and stick to [0,1,2,3,4]

        # error
        self.ERRs = []
        self.steps_to_plot = []

        self.verbose = verbose
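
In these defaults, num_states = (2 * 4) * (2 * 4) * 2 = 128: each of the two players stands on one of the eight cells of the 2 x 4 grid, and the ball is held by either A or B. num_actions = 5 covers the moves N, S, E, W and stick, encoded as 0-4 (see the actions comment above).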
Example #2
# Imports these examples rely on; World and Play_game come from the project's
# own environment module, which is not shown here.
import random as rand
from copy import deepcopy

import numpy as np
import matplotlib.pyplot as plt
from scipy.optimize import linprog


class FriendQLearner:
    def __init__(self,
                 num_states=(2 * 4) * (2 * 4) * 2,
                 num_actions=5,
                 alpha=0.05,
                 alpha_decay=0.99999,
                 gamma=0.9,
                 epsilon=1.0,
                 max_steps_per_episode=1000,
                 max_num_episodes=10000000,
                 save_model_per=1000000,
                 verbose=True):

        self.world = World()
        self.game = Play_game()

        # inputs
        self.alpha = alpha
        self.alpha_decay = alpha_decay
        self.gamma = gamma
        self.epsilon = epsilon
        self.max_steps_per_episode = max_steps_per_episode
        self.max_num_episodes = max_num_episodes
        self.save_model_per = save_model_per

        # initialize
        self.num_states = num_states  # 4x2 grid, two players, with or without ball
        self.num_actions = num_actions

        q_table = np.full(shape=(num_states, num_actions, num_actions),
                          fill_value=0.0)
        self.q_tables = {'A': deepcopy(q_table), 'B': deepcopy(q_table)}

        self.state = {}

        # error
        self.ERRs = []
        self.steps_to_plot = []

        self.verbose = verbose

    def friendQ_agent(self):

        self.step_count = 1

        for episode in range(self.max_num_episodes):
            # reset game
            self.all_states, self.state, _ = self.game.init_game()

            # play game and learn
            for t in range(self.max_steps_per_episode):

                # get actions
                current_actions = self.get_actions()

                # observe
                state_prime, r, done, _ = self.game.play(current_actions)

                # update
                self.update_Q(current_actions, state_prime, r['A'], 'A')
                self.update_Q(current_actions, state_prime, r['B'], 'B')
                self.state = state_prime

                self.step_count += 1

                # save and plot
                if self.step_count > 0 and self.step_count % self.save_model_per == 0:
                    experiment_id = 1
                    self.save_data(experiment_id)
                    self.plot_error(experiment_id)

                if done:
                    self.alpha *= self.alpha_decay
                    break

            if self.verbose:
                print('episode: ', episode)

            if self.step_count > 1000000:
                break

    def get_actions(self):
        # off-policy: both players explore uniformly at random
        actions = {}
        actions['A'] = rand.randint(0, self.num_actions - 1)
        actions['B'] = rand.randint(0, self.num_actions - 1)

        return actions

    def update_Q(self, actions, state_prime, r, player_name):

        state_index = self.all_states[self.state]
        state_prime_index = self.all_states[state_prime]

        # Friend-Q value of s': the best joint action in this player's own table
        V = np.amax(self.q_tables[player_name][state_prime_index])

        # note: unlike the other learners here, this target uses the raw reward r
        # rather than (1 - self.gamma) * r
        error = ((r + self.gamma * V) -
                 self.q_tables[player_name][state_index, actions['A'],
                                            actions['B']]) * self.alpha
        self.q_tables[player_name][state_index, actions['A'],
                                   actions['B']] += error

        # collect |Q-value difference| at the benchmark state-action pair:
        # state 'B21' with player A moving S (1) and player B sticking (4)
        if player_name == 'A' and self.state == 'B21' and actions[
                'A'] == 1 and actions['B'] == 4:
            self.ERRs.append(abs(error))
            self.steps_to_plot.append(self.step_count)

        if self.verbose:
            print('Action of B at state s: ',
                  np.argmax(self.q_tables['B'][self.all_states['B21']]) % 5)

    def plot_error(self, experiment_id):

        err_to_plot = self.ERRs[::20]
        step_to_plot = self.steps_to_plot[::20]

        plt.plot(step_to_plot, err_to_plot, '-', linewidth=1.3)
        plt.ylim(0, 0.5)
        plt.xlim(0, 1000000)
        plt.xlabel("Simulation Iteration")
        plt.ylabel("Q-value Difference")
        plt.title("Friend-Q")
        plt.savefig('outputs/FriendQ_exp_' + str(experiment_id) + '_' +
                    str(self.step_count) + '.png')
        plt.show()

    def save_data(self, experiment_id):
        error_file_name = 'outputs/data_FriendQ_error_exp_' + str(
            experiment_id) + '.txt'
        with open(error_file_name, 'w') as error_file:
            for item in self.ERRs:
                error_file.write("%s\n" % item)

        step_file_name = 'outputs/data_FriendQ_step_exp_' + str(
            experiment_id) + '.txt'
        with open(step_file_name, 'w') as step_file:
            for item in self.steps_to_plot:
                step_file.write("%s\n" % item)

        if self.verbose:
            print('epsilon:', self.epsilon)
            print('alpha: ', self.alpha)
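
The only place FriendQLearner departs from ordinary Q-learning is the value of the next state: each player assumes the other will cooperate, so V(s') is simply the best entry of its own joint-action table, which is exactly what np.amax computes in update_Q above. A minimal, self-contained rehearsal with made-up numbers:

import numpy as np

q_s_prime = np.zeros((5, 5))   # one player's Q[s'] slice: rows = A's action, cols = B's action
q_s_prime[1, 4] = 1.0          # pretend the joint action (A: S, B: stick) looks best
V = np.amax(q_s_prime)         # the Friend-Q value used in the update, here 1.0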
Example #3
class CeQLearner:
    def __init__(
            self,
            num_states=(2 * 4) * (2 * 4) * 2,
            num_actions=5,
            alpha=1.0,
            alpha_decay=0.99997,
            gamma=0.9,
            epsilon=1.0,
            max_steps_per_episode=1000,  #1000
            max_num_episodes=10000000,  #10000000
            save_model_per=10000,
            verbose=True):

        self.world = World()
        self.game = Play_game()

        # inputs
        self.alpha = alpha
        self.alpha_decay = alpha_decay
        self.gamma = gamma
        self.epsilon = epsilon
        self.max_steps_per_episode = max_steps_per_episode
        self.max_num_episodes = max_num_episodes
        self.save_model_per = save_model_per

        # initialize
        self.num_states = num_states  # 4x2 grid, two players, with or without ball
        self.num_actions = num_actions

        q_table = np.full(shape=(num_states, num_actions, num_actions),
                          fill_value=1.0)
        self.q_tables = {'A': deepcopy(q_table), 'B': deepcopy(q_table)}

        self.state = {}

        # error
        self.ERRs = []
        self.steps_to_plot = []

        self.verbose = verbose

    def ceQ_agent(self):

        self.step_count = 1

        for episode in range(self.max_num_episodes):
            # reset game
            self.all_states, self.state, _ = self.game.init_game()

            # play game and learn
            for t in range(self.max_steps_per_episode):

                # get actions
                current_actions = self.get_actions()

                # observe
                state_prime, r, done, _ = self.game.play(current_actions)

                # update
                self.update_Q(current_actions, state_prime, r['A'], r['B'])
                self.state = state_prime

                self.step_count += 1

                # save and plot
                if self.step_count > 0 and self.step_count % self.save_model_per == 0:

                    experiment_id = 1
                    self.save_data(experiment_id)
                    self.plot_error(experiment_id)

                if done:
                    self.alpha *= self.alpha_decay
                    break

            if self.step_count > 1000000:
                break

    def get_actions(self):
        # off-policy: both players explore uniformly at random
        actions = {}
        actions['A'] = rand.randint(0, self.num_actions - 1)
        actions['B'] = rand.randint(0, self.num_actions - 1)

        return actions

    def get_V(self, q_table_0, q_table_1, state_prime):
        num_states, num_action_0, num_action_1 = q_table_0.shape

        q_table_0 = q_table_0[state_prime]
        q_table_1 = q_table_1[state_prime]

        # utilitarian objective: maximize the players' combined expected Q over
        # the joint-action distribution (negated because linprog minimizes)
        c = -(q_table_0.flatten() + q_table_1.flatten())

        A_ub = []

        # rationality constraints for the first player: following the
        # recommended row i must be at least as good as deviating to row j
        for i in range(num_action_0):
            for j in range(num_action_0):
                if i != j:
                    A_ub.append(
                        np.vstack([
                            np.zeros((i, num_action_1)),
                            q_table_0[j, :] - q_table_0[i, :],
                            np.zeros((num_action_0 - i - 1, num_action_1))
                        ]).flatten())

        # rationality constraints for the second player: following the
        # recommended column i must be at least as good as deviating to column j
        for i in range(num_action_1):
            for j in range(num_action_1):
                if i != j:
                    A_ub.append(
                        np.vstack([
                            np.zeros((i, num_action_0)),
                            q_table_1[:, j] - q_table_1[:, i],
                            np.zeros((num_action_1 - i - 1, num_action_0))
                        ]).transpose().flatten())

        A_ub = np.stack(A_ub, 0)
        b_ub = np.zeros([A_ub.shape[0]])

        # the joint-action probabilities sum to one and are nonnegative
        A_eq = [[1.0] * num_action_0 * num_action_1]
        b_eq = [1.0]

        bounds = [[0.0, None]] * num_action_0 * num_action_1

        res = linprog(c,
                      A_ub=A_ub,
                      b_ub=b_ub,
                      A_eq=A_eq,
                      b_eq=b_eq,
                      bounds=bounds)

        return res

    def update_Q(self, actions, state_prime, r_A, r_B):

        state_index = self.all_states[self.state]
        state_prime_index = self.all_states[state_prime]

        # V is the utilitarian CE value of s': the summed expected Q of both
        # players at the correlated equilibrium found by the LP in get_V
        res = self.get_V(self.q_tables['A'], self.q_tables['B'],
                         state_prime_index)

        if res.success:
            V = -res.fun
            error_A = (((1 - self.gamma) * r_A + self.gamma * V) -
                       self.q_tables['A'][state_index, actions['A'],
                                          actions['B']]) * self.alpha
            error_B = (((1 - self.gamma) * r_B + self.gamma * V) -
                       self.q_tables['B'][state_index, actions['A'],
                                          actions['B']]) * self.alpha
        else:
            # skip the update when the LP solver fails
            error_A = 0.
            error_B = 0.

        self.q_tables['A'][state_index, actions['A'], actions['B']] += error_A
        self.q_tables['B'][state_index, actions['A'], actions['B']] += error_B

        # collect |Q-value difference| at the benchmark state-action pair:
        # state 'B21' with player A moving S (1) and player B sticking (4)
        if self.state == 'B21' and actions['A'] == 1 and actions['B'] == 4:
            self.ERRs.append(abs(error_A))
            self.steps_to_plot.append(self.step_count)

    def plot_error(self, experiment_id):

        err_to_plot = self.ERRs
        step_to_plot = self.steps_to_plot

        plt.plot(step_to_plot, err_to_plot, '-', linewidth=0.8)
        plt.ylim(0, 0.5)
        # plt.xlim(0, 1000000)
        plt.xlabel("Simulation Iteration")
        plt.ylabel("Q-value Difference")
        plt.title("Ce-Q")
        plt.savefig('outputs/CeQ_exp_' + str(experiment_id) + '_' +
                    str(self.step_count) + '.png')

        plt.show()

    def save_data(self, experiment_id):
        error_file_name = 'outputs/data_CeQ_error_exp_' + str(
            experiment_id) + '.txt'
        with open(error_file_name, 'w') as error_file:
            for item in self.ERRs:
                error_file.write("%s\n" % item)

        step_file_name = 'outputs/data_CeQ_step_exp_' + str(
            experiment_id) + '.txt'
        with open(step_file_name, 'w') as step_file:
            for item in self.steps_to_plot:
                step_file.write("%s\n" % item)

        if self.verbose:
            print(self.ERRs)
            print('epsilon:', self.epsilon)
            print('alpha: ', self.alpha)
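
The get_V method above is the core of this Ce-Q variant: it looks for a utilitarian correlated equilibrium in the one-shot game defined by the two Q tables at s', maximizing the players' combined expected Q over distributions on joint actions, subject to the constraint that neither player gains by deviating from a recommended action. The standalone sketch below reruns the same construction on a made-up 2x2 coordination game, so the LP can be inspected away from the rest of the class; the payoff matrices are purely illustrative.

import numpy as np
from scipy.optimize import linprog

Q_A = np.array([[2.0, 0.0], [0.0, 1.0]])   # row player's payoffs (made up)
Q_B = np.array([[1.0, 0.0], [0.0, 2.0]])   # column player's payoffs (made up)
n = 2

c = -(Q_A + Q_B).flatten()                 # maximize total expected payoff

A_ub = []
for i in range(n):                         # row player: deviating from row i to row j must not help
    for j in range(n):
        if i != j:
            row = np.zeros((n, n))
            row[i, :] = Q_A[j, :] - Q_A[i, :]
            A_ub.append(row.flatten())
for i in range(n):                         # column player: deviating from column i to j must not help
    for j in range(n):
        if i != j:
            col = np.zeros((n, n))
            col[:, i] = Q_B[:, j] - Q_B[:, i]
            A_ub.append(col.flatten())

res = linprog(c, A_ub=np.stack(A_ub), b_ub=np.zeros(len(A_ub)),
              A_eq=[[1.0] * (n * n)], b_eq=[1.0],
              bounds=[(0.0, None)] * (n * n))
print(res.x.reshape(n, n))                 # all probability mass lands on the diagonal
print(-res.fun)                            # utilitarian CE value: 3.0

Note that update_Q then plugs -res.fun, the summed value, in as V for both players; a per-player variant would instead weight each player's own table by the equilibrium distribution.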
Example #4
class QLearner:
    def __init__(self,
                 num_states=(2 * 4) * (2 * 4) * 2,
                 num_actions=5,
                 alpha=1.0,
                 alpha_decay=0.99996,
                 gamma=0.9,
                 epsilon=1.0,
                 epsilon_decay=0.99996,
                 max_steps_per_episode=1000,
                 max_num_episodes=1000000,
                 save_model_per=1000000,
                 verbose=False):

        self.world = World()
        self.game = Play_game()

        # inputs
        self.alpha = alpha
        self.alpha_decay = alpha_decay
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.max_steps_per_episode = max_steps_per_episode
        self.max_num_episodes = max_num_episodes
        self.save_model_per = save_model_per

        # initialize
        self.num_states = num_states  # 4x2 grid, two players, with or without ball
        self.num_actions = num_actions

        self.q_table = np.full(shape=(num_states, num_actions), fill_value=1.0)
        self.q_tables = {
            'A': deepcopy(self.q_table),
            'B': deepcopy(self.q_table)
        }

        self.state = {}
        self.actions = {
            'A': 0,
            'B': 0
        }  # map N, S, E, W, and stick to [0,1,2,3,4]

        # error
        self.ERRs = []
        self.steps_to_plot = []

        self.verbose = verbose

    def Qlearning_agent(self, verbose):
        # note: the verbose argument is unused; logging follows self.verbose
        self.step_count = 1

        for episode in range(self.max_num_episodes):
            # reset game
            self.all_states, self.state, _ = self.game.init_game()
            self.actions = self.get_first_actions(self.state)

            # play game and learn
            for t in range(self.max_steps_per_episode):
                # update each player's q_table separately
                state_prime, r, done, _ = self.game.play(self.actions)
                self.actions['A'] = self.update_Q(state_prime, r['A'], 'A')
                self.actions['B'] = self.update_Q(state_prime, r['B'], 'B')

                self.state = state_prime
                self.step_count += 1

                # save and plot
                if self.step_count > 0 and self.step_count % self.save_model_per == 0:
                    experiment_id = 1
                    self.save_data(experiment_id)
                    self.plot_error(experiment_id)

                if done:
                    self.epsilon *= self.epsilon_decay
                    self.alpha *= self.alpha_decay
                    break

            if self.step_count > 1000000:
                break

    def get_first_actions(self, s):
        first_actions = {}

        first_actions['A'] = rand.randint(0, self.num_actions - 1)
        first_actions['B'] = rand.randint(0, self.num_actions - 1)

        if self.verbose: print('s =', s, 'a =', first_actions)

        return first_actions

    def update_Q(self, state_prime, r, player_name):
        # epsilon-greedy selection function
        state_index = self.all_states[self.state]
        state_prime_index = self.all_states[state_prime]

        if rand.random() < self.epsilon:
            action = rand.randint(0, self.num_actions - 1)

        else:
            action = np.argmax(self.q_tables[player_name][state_prime_index])

        # update Q table
        V = np.amax(self.q_tables[player_name][state_prime_index])

        error = (
            ((1 - self.gamma) * r + self.gamma * V) -
            self.q_tables[player_name][state_index,
                                       self.actions[player_name]]) * self.alpha

        self.q_tables[player_name][state_index,
                                   self.actions[player_name]] += error

        # collect |Q-value difference| at the benchmark state-action pair:
        # state 'B21' with player A having taken S (1); zero-error steps are skipped
        if player_name == 'A' and self.state == 'B21' and self.actions[
                'A'] == 1 and error != 0.0:
            self.ERRs.append(abs(error))
            self.steps_to_plot.append(self.step_count)

        if self.verbose: print('s =', state_prime, 'a =', action, 'r =', r)
        return action

    def plot_error(self, experiment_id):
        err_to_plot = self.ERRs[::20]
        step_to_plot = self.steps_to_plot[::20]

        plt.plot(step_to_plot, err_to_plot, '-', linewidth=0.3)
        plt.ylim(0, 0.5)
        plt.xlim(0, 1000000)
        plt.xlabel("Simulation Iteration")
        plt.ylabel("Q-value Difference")
        plt.title("Q-learner")

        plt.savefig('outputs/Q_exp_' + str(experiment_id) + '_' +
                    str(self.step_count) + '.png')
        plt.show()

    def save_data(self, experiment_id):
        error_file_name = 'outputs/data_Q_error_exp_' + str(
            experiment_id) + '.txt'
        with open(error_file_name, 'w') as error_file:
            for item in self.ERRs:
                error_file.write("%s\n" % item)

        step_file_name = 'outputs/data_Q_step_exp_' + str(
            experiment_id) + '.txt'
        with open(step_file_name, 'w') as step_file:
            for item in self.steps_to_plot:
                step_file.write("%s\n" % item)

        if self.verbose:
            print(self.ERRs)
            print('epsilon:', self.epsilon)
            print('alpha: ', self.alpha)
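
QLearner is the single-agent baseline: each player keeps its own (state, action) table, picks its next move epsilon-greedily, and backs up the greedy value of s'. The TD target uses the same (1 - gamma) * r + gamma * V form as the Foe-Q and Ce-Q examples, which keeps the Q-values on the scale of the rewards. A self-contained rehearsal of one update with made-up numbers:

import random as rand

import numpy as np

alpha, gamma, epsilon = 1.0, 0.9, 1.0
q_row = np.ones(5)                   # Q[s'] for one player, optimistic init
action = rand.randint(0, 4) if rand.random() < epsilon else int(np.argmax(q_row))
V = np.amax(q_row)                   # greedy value of s'
r, q_sa = 0.0, 1.0                   # reward and the current Q(s, a)
q_sa += alpha * (((1 - gamma) * r + gamma * V) - q_sa)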
Example #5
class FoeQLearner:
    def __init__(self,
                 num_states=(2 * 4) * (2 * 4) * 2,
                 num_actions=5,
                 alpha=1.0,
                 alpha_decay=0.99997,
                 gamma=0.9,
                 epsilon=1.0,
                 max_steps_per_episode=1000,
                 max_num_episodes=1000000,
                 save_model_per=20000,
                 verbose=True):

        self.world = World()
        self.game = Play_game()

        # inputs
        self.alpha = alpha
        self.alpha_decay = alpha_decay
        self.gamma = gamma
        self.epsilon = epsilon
        self.max_steps_per_episode = max_steps_per_episode
        self.max_num_episodes = max_num_episodes
        self.save_model_per = save_model_per

        # initialize
        self.num_states = num_states  # 4x2 grid, two players, with or without ball
        self.num_actions = num_actions

        q_table = np.full(shape=(num_states, num_actions, num_actions),
                          fill_value=1.0)
        self.q_tables = {'A': deepcopy(q_table), 'B': deepcopy(q_table)}

        self.state = {}

        # error
        self.ERRs = []
        self.steps_to_plot = []

        self.verbose = verbose

    def foeQ_agent(self):

        self.step_count = 1

        for episode in range(self.max_num_episodes):
            # reset game
            self.all_states, self.state, _ = self.game.init_game()

            # play game and learn
            for t in range(self.max_steps_per_episode):

                # get actions
                current_actions = self.get_actions()

                # observe
                state_prime, r, done, _ = self.game.play(current_actions)

                # update
                self.update_Q(current_actions, state_prime, r['A'], 'A')
                self.update_Q(current_actions, state_prime, r['B'], 'B')
                self.state = state_prime

                self.step_count += 1

                # save and plot
                if self.step_count > 0 and self.step_count % self.save_model_per == 0:
                    experiment_id = 3
                    self.save_data(experiment_id)
                    self.plot_error(experiment_id)

                if done:
                    self.alpha *= self.alpha_decay
                    break

            if self.step_count > 1000000:
                break

    def get_actions(self):
        # off-policy: both players explore uniformly at random
        actions = {}
        actions['A'] = rand.randint(0, self.num_actions - 1)
        actions['B'] = rand.randint(0, self.num_actions - 1)

        return actions

    def get_V(self, q_table, state_prime):

        num_states, num_action_0, num_action_1 = q_table.shape

        # maximin LP over the first action axis: maximize V subject to
        # V <= sum_a p(a) * Q[a, b] for every opposing action b
        # (the -1.0 is there because linprog minimizes)
        c = [-1.0] + [0.0] * num_action_0

        A_ub = np.transpose(
            np.concatenate([[[1.0] * num_action_0], -q_table[state_prime]], 0))
        b_ub = [0.0] * num_action_1

        A_eq = [[0.0] + [1.0] * num_action_0]  # the probabilities sum to one
        b_eq = [1.0]

        bounds = [[None, None]] + [[0.0, None]] * num_action_0  # V free, p >= 0

        res = linprog(c,
                      A_ub=A_ub,
                      b_ub=b_ub,
                      A_eq=A_eq,
                      b_eq=b_eq,
                      bounds=bounds)

        # fall back to the optimistic initial Q value when the solver fails
        V = res.x[0] if res.success else 1.0
        return V

    def update_Q(self, actions, state_prime, r, player_name):

        state_index = self.all_states[self.state]
        state_prime_index = self.all_states[state_prime]

        # V is this player's maximin (security) value of s', from the LP in get_V
        V = self.get_V(self.q_tables[player_name], state_prime_index)

        error = (((1 - self.gamma) * r + self.gamma * V) -
                 self.q_tables[player_name][state_index, actions['A'],
                                            actions['B']]) * self.alpha

        self.q_tables[player_name][state_index, actions['A'],
                                   actions['B']] += error

        # collect |Q-value difference| at the benchmark state-action pair:
        # state 'B21' with player A moving S (1) and player B sticking (4)
        if player_name == 'A' and self.state == 'B21' and actions[
                'A'] == 1 and actions['B'] == 4:
            self.ERRs.append(abs(error))
            self.steps_to_plot.append(self.step_count)

    def plot_error(self, experiment_id):

        err_to_plot = self.ERRs
        step_to_plot = self.steps_to_plot

        plt.plot(step_to_plot, err_to_plot, '-', linewidth=0.5)
        plt.ylim(0, 0.5)
        plt.xlim(0, 1000000)
        plt.xlabel("Simulation Iteration")
        plt.ylabel("Q-value Difference")
        plt.title("Foe-Q")
        plt.savefig('outputs/FoeQ_exp_' + str(experiment_id) + '_' +
                    str(self.step_count) + '.png')
        plt.show()

    def save_data(self, experiment_id):
        error_file_name = 'outputs/data_FoeQ_error_exp_' + str(
            experiment_id) + '.txt'
        with open(error_file_name, 'w') as error_file:
            for item in self.ERRs:
                error_file.write("%s\n" % item)

        step_file_name = 'outputs/data_FoeQ_step_exp_' + str(
            experiment_id) + '.txt'
        with open(step_file_name, 'w') as step_file:
            for item in self.steps_to_plot:
                step_file.write("%s\n" % item)

        if self.verbose:
            print(self.ERRs)
            print('epsilon:', self.epsilon)
            print('alpha: ', self.alpha)
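
The get_V above solves a maximin LP over the table's first action axis: find a mixture p over those five actions so that the worst-case expected Q against any single opposing action is as large as possible, and return that guaranteed value V. The standalone sketch below runs the identical construction on matching pennies (payoffs made up for illustration), where the security value is 0 and the optimal mix is 50/50.

import numpy as np
from scipy.optimize import linprog

Q = np.array([[1.0, -1.0], [-1.0, 1.0]])   # row player's payoffs (made up)
n_rows, n_cols = Q.shape

c = [-1.0] + [0.0] * n_rows                # maximize V (linprog minimizes)
A_ub = np.transpose(np.concatenate([[[1.0] * n_rows], -Q], 0))
b_ub = [0.0] * n_cols                      # V <= sum_a p(a) * Q[a, b] for every column b
A_eq = [[0.0] + [1.0] * n_rows]            # the probabilities sum to one
b_eq = [1.0]
bounds = [(None, None)] + [(0.0, None)] * n_rows

res = linprog(c, A_ub=A_ub, b_ub=b_ub, A_eq=A_eq, b_eq=b_eq, bounds=bounds)
print(res.x)                               # roughly [0.0, 0.5, 0.5]: value, then the mix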