Example #1
    def soft_2player_value_iteration(self, alpha, tol=1e-10):
        # Soft value iteration for the two-player game: the player controls
        # the transition with probability alpha, the opponent with 1 - alpha.
        self.v = np.zeros(self.env.n_states)
        self.q2p = np.zeros(
            (self.env.n_states, self.env.n_actions, self.env.n_actions))

        while True:
            v_old = np.copy(self.v)
            # Joint Q-value over (player action a, opponent action o).
            for a in range(self.env.n_actions):
                for o in range(self.env.n_actions):
                    self.q2p[:, a, o] = self.env.r[:, a] + self.env.gamma * (
                        alpha * self.env.sparseT[a].dot(self.v) +
                        (1 - alpha) * self.env.sparseT[o].dot(self.v))
            # Opponent minimizes over o, player soft-maximizes over a.
            self.v = softmax(np.min(self.q2p, axis=2)).reshape(self.env.n_states)
            if np.linalg.norm(self.v - v_old) < tol:
                break

        # Opponent policy: uniform over the feasible actions that minimize
        # the player-marginalized Q-value.
        opponent_policy = np.zeros((self.env.n_states, self.env.n_actions))
        q_opponent = special.logsumexp(self.q2p, axis=1)
        for index in range(self.env.n_states):
            possible_actions = np.array(
                self.env.get_possible_actions(
                    state=self.env.index_to_state(index)))
            worst_action_mask = (
                q_opponent[index, possible_actions] ==
                np.min(q_opponent[index, possible_actions]))
            worst_actions = possible_actions[worst_action_mask]
            opponent_policy[index, :] = np.array([
                1 / len(worst_actions) if a in worst_actions else 0
                for a in range(self.env.n_actions)
            ])

        return softmax_probs(np.min(self.q2p, axis=2)), opponent_policy
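These examples rely on softmax and softmax_probs helpers that are not shown on this page. Judging from how they are called (softmax collapses the action axis into a single value, softmax_probs returns action probabilities), they are presumably a log-sum-exp soft maximum and the matching Boltzmann distribution. A minimal sketch under that assumption; the definitions below are illustrative, not the original helpers:

import numpy as np
from scipy import special


def softmax(q):
    # Soft maximum over the last (action) axis: log sum_a exp(q[..., a]).
    # Matches uses such as softmax(self.q).reshape(n_states) above.
    return special.logsumexp(q, axis=-1)


def softmax_probs(q):
    # Boltzmann policy: pi(a | s) proportional to exp(q(s, a)),
    # normalized over the last axis.
    return np.exp(q - special.logsumexp(q, axis=-1, keepdims=True))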
Example #2
    def soft_value_iteration_fixed_horizon(self, horizon=10, tol=1e-10):
        # Fixed-horizon soft value iteration: runs exactly `horizon` soft
        # Bellman backups (tol is accepted for interface parity but unused).
        self.v = np.zeros(self.env.n_states)
        self.q = np.zeros((self.env.n_states, self.env.n_actions))

        for _ in range(horizon):
            for a in range(self.env.n_actions):
                self.q[:, a] = (self.env.r[:, a] +
                                self.env.gamma * self.env.sparseT[a].dot(self.v))
            # v(s) = soft maximum of q(s, a) over actions.
            self.v = softmax(self.q).reshape(self.env.n_states)

        return softmax_probs(self.q)
Example #3
    def soft_value_iteration(self, tol=1e-10):
        # Soft value iteration to convergence: iterate the soft Bellman
        # backup until the value function changes by less than tol.
        self.v = np.zeros(self.env.n_states)
        self.q = np.zeros((self.env.n_states, self.env.n_actions))

        while True:
            v_old = np.copy(self.v)
            for a in range(self.env.n_actions):
                self.q[:, a] = (self.env.r[:, a] +
                                self.env.gamma * self.env.sparseT[a].dot(self.v))
            self.v = softmax(self.q).reshape(self.env.n_states)
            if np.linalg.norm(self.v - v_old) < tol:
                break
        return softmax_probs(self.q)
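For reference, the backup that soft_value_iteration iterates can be written without the surrounding class. A standalone sketch, assuming a dense transition array T of shape (n_actions, n_states, n_states) and a reward matrix r of shape (n_states, n_actions); both names are chosen here for illustration:

import numpy as np
from scipy import special


def soft_value_iteration(r, T, gamma, tol=1e-10):
    # r: (n_states, n_actions), T: (n_actions, n_states, n_states).
    n_states, n_actions = r.shape
    v = np.zeros(n_states)
    while True:
        # q(s, a) = r(s, a) + gamma * sum_s' T[a, s, s'] * v(s')
        q = r + gamma * np.einsum('ast,t->sa', T, v)
        v_new = special.logsumexp(q, axis=1)  # soft Bellman backup
        if np.linalg.norm(v_new - v) < tol:
            return np.exp(q - v_new[:, None])  # Boltzmann policy
        v = v_new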
Example #4
    def rational_opponent_two_players_soft_Q(self,
                                             alpha,
                                             tol=1e-10,
                                             reuseQ=False):
        # Opponent takes control w.p. 1 - alpha.
        self.lr = 0.5  # 0.5 for deterministic environments, 0.1 for stochastic transitions
        if (not reuseQ) or not self.oldQavailable:
            self.q2p = np.zeros(
                (self.env.n_states, self.env.n_actions, self.env.n_actions))
            self.q_player = np.zeros((self.env.n_states, self.env.n_actions))
            self.q_opponent = np.zeros((self.env.n_states, self.env.n_actions))
            # Mask actions that are unavailable in a state with -inf so they
            # are never picked by the min / soft-max operators.
            for i in range(self.env.n_states):
                possible = self.env.get_possible_actions(state_id=i)
                for a in range(self.env.n_actions):
                    if a not in possible:
                        self.q2p[i, a, :] = -np.inf
                        self.q_player[i, a] = -np.inf
                        self.q_opponent[i, a] = -np.inf
                for o in range(self.env.n_actions):
                    if o not in possible:
                        self.q2p[i, :, o] = -np.inf
        else:
            print("Recycling Q's")
        n_episodes = 30000
        player = Agent(self.env,
                       policy=self.env.uniform_policy())  # stochastic policy
        opponent = Agent(self.env, policy=self.env.uniform_policy())
        counter = 0
        for _ in range(n_episodes):
            common_state = player.env.get_random_initial_state()
            player.state = common_state
            opponent.state = common_state
            player.index = self.env.state_to_index(player.state)
            opponent.index = player.index
            delta = 0
            while True:
                if player.index in self.env.terminal_indexes:
                    # Episode finished: count consecutive converged episodes
                    # and stop after 30 of them.
                    if 0 < delta < tol:
                        counter += 1
                        if counter == 30:
                            # Debug plots
                            self.oldQavailable = True
                            return player.policy, opponent.policy
                    break
                a = player.choose_action()
                player_next_state = player.env.take_action(player.state, a)
                player_next_is = player.env.state_to_index(player_next_state)

                o = opponent.choose_action()
                opponent_next_state = opponent.env.take_action(
                    opponent.state, o)
                opponent_next_is = opponent.env.state_to_index(
                    opponent_next_state)

                tot_reward = (1 - alpha) * opponent.env.r[
                    opponent.index, o] + alpha * player.env.r[player.index, a]

                # TD update on the joint Q-value; self.v is assumed to be
                # initialized elsewhere (e.g. by a value-iteration pass).
                update = (tot_reward + self.env.gamma *
                          ((1 - alpha) * self.v[opponent_next_is] +
                           alpha * self.v[player_next_is]) -
                          self.q2p[player.index, a, o])
                self.q2p[player.index, a, o] += self.lr * update
                delta = np.max([delta, np.abs(update)])

                # Marginalize the joint Q-values: the player assumes a
                # worst-case opponent (min over o), the opponent assumes a
                # soft-max player (soft-max over a).
                possible_actions = np.array(
                    opponent.env.get_possible_actions(state=opponent.state))
                self.q_player[player.index, a] = np.min(
                    self.q2p[player.index, a, possible_actions])
                self.q_opponent[opponent.index, o] = softmax(
                    self.q2p[opponent.index, possible_actions, o].reshape(1, -1)
                    - np.log(self.env.n_actions))
                self.v[player.index] = softmax(
                    self.q_player[player.index, possible_actions].reshape(1, -1)
                    - np.log(self.env.n_actions))

                # Update policies.
                # Deterministic opponent (disabled):
                #opponent.policy[opponent.index] = possible_actions[np.argmin(self.q_opponent[opponent.index,possible_actions])]
                # Tie-breaking opponent (disabled):
                #worst_action_indices = np.where(np.round(self.q_opponent[opponent.index,possible_actions],2) == np.min(np.round(self.q_opponent[opponent.index,possible_actions],2)),True,False)
                #worst_actions = possible_actions[np.array(worst_action_indices)]
                #opponent.policy[opponent.index, :] = np.array([1/len(worst_actions) if a in worst_actions else 0 for a in range(self.env.n_actions)])

                # Epsilon-greedy opponent: probability (1 - epsilon) is spread
                # over the feasible actions with the lowest (rounded) marginal
                # Q-value, epsilon over the remaining feasible actions.
                epsilon = 0.1
                q_opp_rounded = np.round(
                    self.q_opponent[opponent.index, possible_actions], 2)
                worst_actions = possible_actions[q_opp_rounded ==
                                                 np.min(q_opp_rounded)]
                probs = np.zeros(self.env.n_actions)
                if len(possible_actions) == len(worst_actions):
                    probs[worst_actions] = 1 / len(worst_actions)
                else:
                    probs[possible_actions] = epsilon / (
                        len(possible_actions) - len(worst_actions))
                    probs[worst_actions] = (1 - epsilon) / len(worst_actions)
                opponent.policy[opponent.index, :] = probs

                # Entropy opponent (alternative, disabled):
                #poss_actions_prob = softmax_probs(-1*self.q_opponent[player.index,possible_actions].reshape(1,-1))
                #probs = np.zeros([self.env.n_actions])
                #probs[possible_actions] = poss_actions_prob
                #opponent.policy[opponent.index, :] = probs

                # Entropy (soft-max) player over the feasible actions.
                poss_actions_prob = softmax_probs(
                    self.q_player[player.index,
                                  possible_actions].reshape(1, -1))  #/alpha)
                probs = np.zeros(self.env.n_actions)
                probs[possible_actions] = poss_actions_prob
                player.policy[player.index, :] = probs

                # The player's transition is realized with probability alpha,
                # the opponent's with probability 1 - alpha.
                next_is = np.random.choice([player_next_is, opponent_next_is],
                                           p=[alpha, 1 - alpha])
                next_state = player.env.index_to_state(next_is)

                player.state = next_state
                opponent.state = next_state  # both agents act from the same state
                player.index = next_is
                opponent.index = next_is

        self.oldQavailable = True
        return player.policy, opponent.policy
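The epsilon-greedy opponent update in the episode loop above puts probability mass (1 - epsilon) on the feasible actions with the lowest rounded marginal Q-value and epsilon on the remaining feasible ones. The same assignment as a standalone helper, for clarity; eps_greedy_worst is not part of the original code:

import numpy as np


def eps_greedy_worst(q_row, possible_actions, n_actions, epsilon=0.1):
    # q_row: marginal opponent Q-values for one state (length n_actions).
    q_rounded = np.round(q_row[possible_actions], 2)
    worst = possible_actions[q_rounded == q_rounded.min()]
    probs = np.zeros(n_actions)
    if len(worst) == len(possible_actions):
        probs[worst] = 1.0 / len(worst)
    else:
        probs[possible_actions] = epsilon / (len(possible_actions) - len(worst))
        probs[worst] = (1 - epsilon) / len(worst)
    return probs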
Example #5
    def two_players_soft_Q(self,
                           alpha,
                           beta,
                           beta_op,
                           n_episodes,
                           lr,
                           tol=1e-10,
                           reuseQ=False):

        if (not reuseQ) or not self.oldQavailable:
            self.q2p = np.zeros(
                (self.env.n_states, self.env.n_actions, self.env.n_actions))
            self.q_player = np.zeros((self.env.n_states, self.env.n_actions))
            self.q_opponent = np.zeros((self.env.n_states, self.env.n_actions))
            # Mask unavailable actions so they are never selected: +inf where
            # a (soft) minimum is taken, -inf where a (soft) maximum is taken.
            for i in range(self.env.n_states):
                possible = self.env.get_possible_actions(state_id=i)
                for a in range(self.env.n_actions):
                    if a not in possible:
                        self.q2p[i, a, :] = np.inf
                        self.q_player[i, a] = -np.inf
                        self.q_opponent[i, a] = np.inf
                for o in range(self.env.n_actions):
                    if o not in possible:
                        self.q2p[i, :, o] = np.inf

        player = Agent(self.env,
                       policy=self.env.uniform_policy())  # stochastic policy
        opponent = Agent(self.env, policy=self.env.uniform_policy())
        counter = 0
        for _ in range(n_episodes):
            step = 0
            common_state = player.env.get_random_initial_state()
            player.state = common_state
            opponent.state = common_state
            player.index = player.env.state_to_index(player.state)
            opponent.index = player.env.state_to_index(opponent.state)
            delta = 0
            while True:
                step += 1
                if player.index in self.env.terminal_indexes or step > 1000:
                    # Episode finished (terminal state or 1000-step cap):
                    # count consecutive converged episodes, stop after 30.
                    if 0 < delta < tol:
                        counter += 1
                        if counter == 30:
                            return player.policy, opponent.policy
                    break

                a = player.choose_action()
                player_next_state = player.env.take_action(player.state, a)
                player_next_is = player.env.state_to_index(player_next_state)

                o = opponent.choose_action()
                opponent_next_state = opponent.env.take_action(
                    opponent.state, o)
                opponent_next_is = opponent.env.state_to_index(
                    opponent_next_state)

                tot_reward = (1 - alpha) * opponent.env.r[
                    opponent.index, o] + alpha * player.env.r[player.index, a]

                # TD update on the joint Q-value.
                update = (tot_reward + self.env.gamma *
                          ((1 - alpha) * self.v[opponent_next_is] +
                           alpha * self.v[player_next_is]) -
                          self.q2p[player.index, a, o])
                self.q2p[player.index, a, o] += lr * update
                delta = np.max([delta, np.abs(update)])

                possible_actions = np.array(
                    opponent.env.get_possible_actions(state=opponent.state))
                # Marginalize the joint Q-values with temperature-scaled soft
                # operators: a soft minimum over opponent actions (temperature
                # beta_op) for the player, a soft maximum over player actions
                # (temperature beta) for the opponent.
                self.q_player[player.index, a] = -beta_op * softmax(
                    self.q2p[player.index, a, possible_actions].reshape(1, -1)
                    / -beta_op)
                self.q_opponent[opponent.index, o] = beta * softmax(
                    self.q2p[opponent.index, possible_actions, o].reshape(1, -1)
                    / beta)

                # Update the V value with a soft maximum over the player's Q.
                self.v[player.index] = beta * softmax(
                    self.q_player[player.index, possible_actions].reshape(1, -1)
                    / beta)

                # Update policies: Boltzmann over the feasible actions, with
                # temperature beta for the player and -beta_op for the
                # (minimizing) opponent. The commented lines are a variant
                # computed over all actions rather than the feasible ones.
                # opponent.policy[opponent.index, :] = softmax_probs(1/-beta_op*self.q_opponent[opponent.index, :].reshape(1,-1)) #/alpha)
                # player.policy[player.index, :] = softmax_probs(1/beta*self.q_player[player.index, :].reshape(1,-1)) #/alpha)

                poss_actions_prob = softmax_probs(
                    1 / beta *
                    self.q_player[player.index, possible_actions].reshape(
                        1, -1))  #/alpha)
                probs = np.zeros(player.env.n_actions)
                probs[possible_actions] = poss_actions_prob
                player.policy[player.index, :] = probs

                poss_actions_prob = softmax_probs(
                    1 / -beta_op *
                    self.q_opponent[opponent.index, possible_actions].reshape(
                        1, -1))  #/alpha)
                probs = np.zeros(player.env.n_actions)
                probs[possible_actions] = poss_actions_prob
                opponent.policy[opponent.index, :] = probs

                # The player's transition is realized with probability alpha,
                # the opponent's with probability 1 - alpha.
                next_is = np.random.choice([player_next_is, opponent_next_is],
                                           p=[alpha, 1 - alpha])
                next_state = player.env.index_to_state(next_is)

                player.state = next_state
                opponent.state = next_state
                player.index = next_is
                opponent.index = next_is
        self.oldQavailable = True
        return player.policy, opponent.policy
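The marginalization step in this last example uses temperature-scaled soft operators: beta * softmax(x / beta) behaves like a smooth maximum that approaches max(x) as beta goes to 0, and -beta_op * softmax(x / -beta_op) is the corresponding smooth minimum. A small numerical check, again assuming the log-sum-exp reading of softmax sketched earlier:

import numpy as np
from scipy.special import logsumexp

x = np.array([1.0, 2.0, 5.0])
for beta in (1.0, 0.1, 0.01):
    soft_max = beta * logsumexp(x / beta)    # -> max(x) = 5 as beta -> 0
    soft_min = -beta * logsumexp(x / -beta)  # -> min(x) = 1 as beta -> 0
    print(beta, soft_max, soft_min)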