Example #1
import numpy as np

# Domain and Agent are assumed to be imported from the project's own modules


class Game:
    def __init__(self, p, s, policy, steps):

        # current state
        self.p = p
        self.s = s
        self.t = 0
        self.gamma = 0.95
        self.isWon = False

        # max number of steps played
        self.steps = steps

        self.reward = 0

        # instantiate the domain (dynamics, reward signal, terminal states)
        self.domain = Domain()

        # create the agent
        self.agent = Agent(self.domain, policy)

        # create a memory for the taken trajectory
        self.trajectory = []
        self.fullTrajectory = []

    def getReward(self):
        return self.reward

    def playGame(self):
        i = 0

        # we play as long as we have not reached a terminal state or played the given number of steps
        while i < self.steps:

            # generate an action based on the policy of the agent
            action = self.agent.policy(self.p, self.s)

            # getting the resulting state from state and action
            next_state = self.domain.dynamics(self.p, self.s, action, self.t)
            p, s, t = next_state

            # fill the trajectory
            r = self.domain.rewardSignal(p, s)
            self.reward = self.reward + pow(self.gamma, i) * r
            self.trajectory.append(((p, s), action, r))

            # update our current state
            self.p, self.s, self.t = next_state

            if self.domain.isTerminalState(self.p, self.s):
                # check if won
                if self.domain.rewardSignal(self.p, self.s) == 1:
                    self.isWon = True

                break

            i += 1


    def playGameTillEnd(self):
        i = 0
        # we play as long as we are not in a terminal state
        while not self.domain.isTerminalState(self.p, self.s):

            # generate an action based on the policy of the agent
            action = self.agent.policy(self.p, self.s)

            # getting the resulting state from state and action
            next_state = self.domain.dynamics(self.p, self.s, action, self.t)
            p, s, t = next_state

            # fill the trajectory
            r = self.domain.rewardSignal(p, s)
            self.reward = self.reward + pow(self.gamma, i) * r
            self.fullTrajectory.append(((self.p, self.s), action, (p, s), r))

            # update our current state
            self.p, self.s, self.t = next_state
            i = i + 1

        # check if won
        if self.domain.rewardSignal(self.p, self.s) == 1:
            self.isWon = True

    # plays a game greedily with respect to a previously fitted Q model 'Qprev',
    # choosing at each step between the two actions +4 (accelerate) and -4 (decelerate)
    def playGameGivenQ(self, Qprev, maxiter=1000):
        i = 0
        accelerate = np.zeros((1, 3))
        decelerate = np.zeros((1, 3))

        while not self.domain.isTerminalState(self.p, self.s) and i < maxiter:

            accelerate[0][0] = self.p
            accelerate[0][1] = self.s
            accelerate[0][2] = 4

            decelerate[0][0] = self.p
            decelerate[0][1] = self.s
            decelerate[0][2] = -4

            if Qprev.predict(accelerate) >= Qprev.predict(decelerate):
                action = 4
            else:
                action = -4
            # getting the resulting state from state and action
            next_state = self.domain.dynamics(self.p, self.s, action, self.t)
            p, s, t = next_state

            # fill the trajectory
            r = self.domain.rewardSignal(p, s)
            self.reward = self.reward + pow(self.gamma, i) * r
            self.fullTrajectory.append(((self.p, self.s), action, (p, s), r))

            # update our current state
            self.p, self.s, self.t = next_state
            i = i + 1

        # check if won
        if self.domain.rewardSignal(self.p, self.s) == 1:
            self.isWon = True

    # sets the parameters for an FQI policy game (see the usage sketch after this class)
    # 'policy_FQI' is a string giving the type of SL model (tree, linear or network) that the FQI algorithm will use
    # 'N' is the number of iterations the FQI algorithm will run if this is the chosen policy
    # 'trajectory' is the trajectory that the FQI will use to build its model;
    # it must be a list of (x, u, r) tuples, where x is a (p, s) tuple
    def setToFQI(self, policy_FQI, trajectory, N, nb_games):

        # change the policy name
        self.policy_name = policy_FQI
        # create the FQI agent and replace the original one
        self.agent = Agent(self.domain, policy_FQI, trajectory, N, nb_games)

    # sets the parameters for a parametric Q-learning (PQL) policy game
    # 'policy_PQL' is a string giving the type of SL model (radial basis or network) that the PQL algorithm will use
    # 'trajectory' is the trajectory that the PQL will use to build its model;
    # it must be a list of (x, u, r) tuples, where x is a (p, s) tuple
    def setToPQL(self, policy_PQL, trajectory, PATH):

        # change the policy name
        self.policy_name = policy_PQL
        # create the PQL agent and replace the original one
        self.agent = Agent(self.domain,
                           policy_PQL,
                           trajectory,
                           QLearning=True,
                           PATH=PATH)
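
A minimal usage sketch for the Game class (assuming the project's Domain and Agent classes behave as used above); the initial state, policy argument, step budget and FQI parameters below are illustrative placeholders, not values taken from the source.

# hypothetical usage; the initial state, policy argument and step budget are illustrative
game = Game(p=-0.5, s=0.0, policy="random", steps=1000)
game.playGame()
print("discounted return:", game.getReward())
print("won:", game.isWon)

# the same game can be switched to an FQI policy built from a recorded trajectory
# (assumed to be a list of ((p, s), u, r) tuples, as described in setToFQI)
# game.setToFQI("tree", game.trajectory, N=50, nb_games=1)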
Example #2
import os

import numpy as np
# dump/load for model persistence are assumed to come from joblib
from joblib import dump, load
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.linear_model import LinearRegression

# Domain is assumed to be imported from the project's own modules


class FittedQItLearner:
    """
    Initializes the learner.

    'model_type' is a string stating either "tree", "linear" or "network", the model that will be used
    for the estimation of the Q function

    'trajectory' is a list of tuple (x,u,r), where x is (p,s)

    'N' is the max number of iteration

    """
    def __init__(self, model_type, trajectory, N, nb_games):

        # parameters of the models
        self.trees_n_estimators = 50

        self.model_type = model_type
        self.trajectory = trajectory
        self.nb_games = nb_games
        self.domain = Domain()
        self.model = self.Q_iter(N)

    # previous FQI implementation; caches each iteration's fitted model on disk
    def Q_iterOld(self, N):

        X_U = []
        R = []
        win = 0
        lost = 0

        for x, u, r in self.trajectory:

            # extraction of the elements of the trajectory into their respective lists
            p, s = x

            X_U.append((p, s, u))
            R.append(r)

            # count the number of won games in the training data
            if r == 1:
                win += 1

            # count the number of lost games in the training data
            if r == -1:
                lost += 1

        print(" \nthere were " + str(win) + " winned games and " + str(lost) +
              " lost games")
        print(" \nstarting iterations of the FQI")

        X_U = np.array(X_U)
        R = np.array(R)
        name_model = ''

        # compute approximation of Q0
        if (self.model_type == "linear"):

            model = LinearRegression()

            # creating the name of the model
            name_model = self.model_type + "_" + str(
                self.nb_games) + "_games_it_" + str(0)

            # if the exact same model has already been made (same name), just load it
            if os.path.isfile(name_model):
                model = load(name_model)

            # otherwise, create the model
            else:
                model = LinearRegression().fit(X_U, R)

        elif (self.model_type == "tree"):

            model = ExtraTreesRegressor(n_estimators=self.trees_n_estimators,
                                        random_state=0)

            # creating the name of the model
            name_model = self.model_type + '_' + str(
                self.trees_n_estimators) + "_" + str(
                    self.nb_games) + "_games_it_" + str(0)

            # if the exact same model has already been made (same name), just load it
            if os.path.isfile(name_model):
                model = load(name_model)

            # otherwise, create the model
            else:
                model.fit(X_U, R)

        elif (self.model_type == "network"):
            print("error, model not done yet")

        else:
            print("error, model type not available")

        # saving the model for potential later uses
        dump(model, name_model)

        self.model = model

        for i in range(1, N + 1):

            print(" \nFQI iteration " + str(i) + " out of " + str(N) + " \n")
            R_i = []

            # writing down the name of the model we are about to build
            name_model = ''

            if self.model_type == "tree":
                name_model = self.model_type + '_' + str(
                    self.trees_n_estimators) + "_" + str(
                        self.nb_games) + "_games_it_" + str(i)

            if self.model_type == 'linear':
                name_model = self.model_type + "_" + str(
                    self.nb_games) + "_games_it_" + str(i)

            if self.model_type == 'network':
                print(" no such models yet, need to create model names")

            # if we have already built a model for this scenario, skip the training set build
            if not (os.path.isfile(name_model)):
                for x, u, r in self.trajectory:

                    # building the training set for the next iteration of Q
                    p, s = x

                    # building the next state and the cumulated reward
                    p2, s2, t2 = self.domain.dynamics(
                        p, s, u, 0)  # t is not used here
                    cumulated_r = r + self.domain.DISCOUNT_FACTOR * self.maxPreviousQ(
                        p2, s2)

                    # append in trajectory order so the targets stay aligned with X_U
                    # (np.append copies on every call, so this can be slow)
                    R_i = np.append(R_i, cumulated_r)

            R_i = np.array(R_i)

            # if the exact same model has already been made (same name), just load it
            if os.path.isfile(name_model):
                self.model = load(name_model)
                print("existing model was found")

            else:
                # otherwise training the new model to approximate Qi
                self.model.fit(X_U, R_i)
                dump(self.model, name_model)

        return self.model

    # current FQI implementation; rebuilds the regression targets in memory at each iteration
    def Q_iter(self, N):
        print("number of tuples : " + str(len(self.trajectory)))

        Q0 = self.getNewModel()

        xtrain = np.zeros((len(self.trajectory), 3))
        ytrain = np.zeros(len(self.trajectory))
        j = 0

        # generate the initial training set
        for (pt, st), action, (pnext, snext), reward in self.trajectory:
            xtrain[j][0] = pt
            xtrain[j][1] = st
            xtrain[j][2] = action
            ytrain[j] = reward
            j = j + 1

        Q0.fit(xtrain, ytrain)

        # useful variables
        Qprev = Q0
        currStateAction1 = np.zeros((1, 3))  # state paired with action -4 (left)
        currStateAction2 = np.zeros((1, 3))  # state paired with action +4 (right)

        print("\nFitted Q learning:\n")

        #Fitted Q algorithm
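        # at each iteration, the regression target for a transition (x, u, x', r) is
        #   y = r + DISCOUNT_FACTOR * max( Qprev(x', -4), Qprev(x', +4) )
        # i.e. the fitted Q iteration update restricted to the two available actions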
        for i in range(N):
            print(" iteration " + str(i) + " out of " + str(N))
            j = 0

            #Rebuild the training set
            for (pt, st), action, (pnext, snext), reward in self.trajectory:

                # TODO re use the more general maxPreviousQ function ( and adapt it )
                currStateAction1[0][0] = pnext
                currStateAction1[0][1] = snext
                currStateAction1[0][2] = -4

                currStateAction2[0][0] = pnext
                currStateAction2[0][1] = snext
                currStateAction2[0][2] = 4

                ytrain[j] = reward + self.domain.DISCOUNT_FACTOR * max(
                    Qprev.predict(currStateAction1),
                    Qprev.predict(currStateAction2))
                j = j + 1

            # TODO make more general
            Qcurr = self.getNewModel()
            Qcurr.fit(xtrain, ytrain)
            Qprev = Qcurr

        print("done iterating")

        # Qprev holds the most recently fitted model (and is also defined when N == 0)
        return Qprev

    # gives the maximum Q-value predicted by the last fitted SL model for a given
    # state (p, s), taken over all possible actions u
    def maxPreviousQ(self, p, s):

        best_reward = float("-inf")

        for action in self.domain.ACTIONS:

            to_predict = [[p, s, action]]  # to avoid dimensionality problems
            reward = self.model.predict((to_predict))

            if (reward > best_reward):
                best_reward = reward

        return best_reward

    # returns the reward estimated by the Q_N model for a given state and action
    def rewardFromModelOld(self, x, u):

        p, s = x
        to_predict = [[p, s, u]]  # to avoid dimensionality problems
        reward = self.model.predict((to_predict))

        return reward

    # returns the reward estimated by the Q_N model for a given state and action
    def rewardFromModel(self, x, u):

        currStateAction = np.zeros((1, 3))
        p, s = x

        currStateAction[0][0] = p
        currStateAction[0][1] = s
        currStateAction[0][2] = u
        reward = self.model.predict(currStateAction)

        return reward

    def getNewModel(self):

        if (self.model_type == "linear"):

            return LinearRegression()

        elif (self.model_type == "tree"):

            return ExtraTreesRegressor(n_estimators=self.trees_n_estimators,
                                       random_state=0)

        elif (self.model_type == "network"):
            print("error, model not done yet")
            return 0

        else:
            print("error, model type not available")

            return 0
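
A minimal usage sketch for FittedQItLearner, assuming a trajectory in the ((p, s), u, (p_next, s_next), r) format recorded by Game.playGameTillEnd; the model type, iteration count and query state below are illustrative.

# hypothetical usage; 'game' is a Game instance whose fullTrajectory was filled by
# playGameTillEnd, and the model type / iteration count are illustrative choices
learner = FittedQItLearner("tree", game.fullTrajectory, N=50, nb_games=1)

# query the fitted Q_N model for a given state (p, s) and each of the two actions
q_plus = learner.rewardFromModel((-0.5, 0.0), 4)
q_minus = learner.rewardFromModel((-0.5, 0.0), -4)
print("greedy action at (-0.5, 0.0):", 4 if q_plus >= q_minus else -4)

# the fitted model can also drive a greedy game via Game.playGameGivenQ
# game.playGameGivenQ(learner.model)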