Example #1
class History:
    def __init__(self, isMultiThreaded=False):

        self.transitions = {}
        self.transitionKeys = ["s", "a", "r", "s_", "terminal"]

        for key in self.transitionKeys:
            self.transitions[key] = []

        if isMultiThreaded:
            self.histLock = Lock()
        else:
            self.histLock = EmptyLock()

    def learn(self, s, a, r, s_, terminal=False):
        self.histLock.acquire()

        self.transitions["s"].append(s.copy())
        self.transitions["a"].append(a)
        self.transitions["r"].append(r)
        self.transitions["s_"].append(s_.copy())
        self.transitions["terminal"].append(terminal)

        self.histLock.release()

    def GetHistory(self, reset=True):
        self.histLock.acquire()
        transitions = self.transitions.copy()
        if reset:
            for key in self.transitionKeys:
                self.transitions[key] = []
        self.histLock.release()

        return transitions

    def Reset(self):
        self.histLock.acquire()
        for key in self.transitionKeys:
            self.transitions[key] = []
        self.histLock.release()

    def RemoveNonTerminalHistory(self):
        self.histLock.acquire()
        idx = len(self.transitions["terminal"]) - 1
        # check idx first to avoid indexing an empty list when no terminal transition exists
        while idx >= 0 and not self.transitions["terminal"][idx]:
            for key in self.transitionKeys:
                self.transitions[key].pop(-1)
            idx -= 1
        self.histLock.release()

    def ExtractHistory(self, transitions):
        s = np.array(transitions["s"], dtype=float)
        a = np.array(transitions["a"], dtype=int)
        r = np.array(transitions["r"], dtype=float)
        s_ = np.array(transitions["s_"], dtype=float)
        terminal = np.array(transitions["terminal"], dtype=bool)

        return s, a, r, s_, terminal
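
A minimal usage sketch for this buffer (hypothetical state vectors; it assumes the History class above, together with the project-specific EmptyLock helper and threading.Lock, is importable):

import numpy as np

hist = History()                            # single-threaded, so EmptyLock is used
s, s_ = np.array([0.0, 1.0]), np.array([1.0, 0.0])

hist.learn(s, a=0, r=1.0, s_=s_, terminal=True)

transitions = hist.GetHistory(reset=True)   # shallow copy of the lists, then the buffer is cleared
states, actions, rewards, nextStates, terminal = hist.ExtractHistory(transitions)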
Example #2
class QLearningTable:
    def __init__(self,
                 modelParams,
                 qTableName,
                 qTableDirectory,
                 loadTable=True,
                 isMultiThreaded=False,
                 agentName=""):
        self.qTableFullName = qTableDirectory + qTableName

        if isMultiThreaded:
            self.checkStateLoc = Lock()
        else:
            self.checkStateLoc = EmptyLock()

        self.TrialsData = "TrialsData"
        self.NumRunsTotalSlot = 0
        self.NumRunsExperimentSlot = 1
        self.AvgRewardSlot = 2
        self.AvgRewardExperimentSlot = 3

        slotsInTable = max(4, modelParams.numActions)
        self.actions = list(range(modelParams.numActions))
        self.slots = list(range(slotsInTable))  # table columns: action-value slots, reused as metadata slots in the TrialsData row
        self.table = pd.DataFrame(columns=self.slots, dtype=np.float)

        self.params = modelParams

    def InitModel(self, session, resetModel=False):
        if os.path.isfile(self.qTableFullName + '.gz') and not resetModel:
            self.ReadTable()

        self.check_state_exist(self.TrialsData)

        self.numTotRuns = self.table.ix[self.TrialsData, self.NumRunsTotalSlot]
        self.avgTotReward = self.table.ix[self.TrialsData, self.AvgRewardSlot]
        self.numExpRuns = 0
        self.avgExpReward = 0

        self.table.ix[self.TrialsData, self.AvgRewardExperimentSlot] = 0
        self.table.ix[self.TrialsData, self.NumRunsExperimentSlot] = 0

    def TakeDfltValues(self):
        return False

    def InitTTable(self, ttable):
        self.ttable = ttable
        self.reverseTable = ttable.reverseKey
        self.normalTable = ttable.normalKey
        self.timeoutPropogation = 10

    def ReadTable(self):
        self.table = pd.read_pickle(self.qTableFullName + '.gz',
                                    compression='gzip')

    def SaveTable(self):
        self.table.to_pickle(self.qTableFullName + '.gz', 'gzip')

    def choose_absolute_action(self, observation):
        state = str(observation)
        self.check_state_exist(state)
        state_action = self.table.ix[state, self.actions]

        state_actionReindex = state_action.reindex(
            np.random.permutation(state_action.index))
        action = state_actionReindex.idxmax()

        return action, state_action[action]

    def ExploreProb(self):
        return self.params.ExploreProb(self.numTotRuns)

    def choose_action(self, state, validActions, targetValues=False):
        state = str(state)

        exploreProb = self.params.ExploreProb(self.numTotRuns)
        actionVals = self.ActionsValues(state, validActions, targetValues)

        if np.random.uniform() > exploreProb:
            # choose best action

            # several actions may share the maximal value; flatten() keeps the
            # result iterable even when only one action attains the maximum
            maxArgs = np.argwhere(
                actionVals == np.amax(actionVals[validActions])).flatten()

            # choose from valid actions
            maxArgsValid = [x for x in maxArgs if x in validActions]
            action = np.random.choice(maxArgsValid)
        else:
            # choose random action
            action = np.random.choice(validActions)

        return action, actionVals

    def ActionsValues(self, state, validActions, targetValues=False):
        s = str(state)
        self.check_state_exist(s)
        vals = self.table.ix[s, :]

        return vals

    def NumRuns(self):
        return self.numTotRuns

    def learn(self, statesVec, actionsVec, rewardsVec, nextStateVec, terminal):
        for i in range(len(rewardsVec)):
            s = str(statesVec[i])
            s_ = str(nextStateVec[i])
            self.check_state_exist(s)
            self.check_state_exist(s_)
            self.learnIMP(s, actionsVec[i], rewardsVec[i], s_, terminal[i])

    def learnIMP(self, s, a, r, s_, terminal):
        q_predict = self.table.ix[s, a]

        if not terminal:
            q_target = r + self.params.discountFactor * self.table.ix[
                s_, :].max()
        else:
            q_target = r  # next state is terminal

        # update
        self.table.ix[s,
                      a] += self.params.learningRate * (q_target - q_predict)

    def end_run(self, r, saveTable=False):
        self.avgTotReward = (self.numTotRuns * self.avgTotReward +
                             r) / (self.numTotRuns + 1)
        self.avgExpReward = (self.numExpRuns * self.avgExpReward +
                             r) / (self.numExpRuns + 1)

        self.numTotRuns += 1
        self.numExpRuns += 1

        self.table.ix[self.TrialsData, self.AvgRewardSlot] = self.avgTotReward
        self.table.ix[self.TrialsData,
                      self.AvgRewardExperimentSlot] = self.avgExpReward

        self.table.ix[self.TrialsData, self.NumRunsTotalSlot] = self.numTotRuns
        self.table.ix[self.TrialsData,
                      self.NumRunsExperimentSlot] = self.numExpRuns

        # print("num total runs = ", self.numTotRuns, "avg total = ", self.avgTotReward)
        # print("num experiment runs = ", self.numExpRuns, "avg experiment = ", self.avgExpReward)

        if saveTable:
            self.SaveTable()

    def Reset(self):
        self.table = pd.DataFrame(columns=self.slots, dtype=np.float)
        self.check_state_exist(self.TrialsData)

        self.numTotRuns = self.table.ix[self.TrialsData, self.NumRunsTotalSlot]
        self.avgTotReward = self.table.ix[self.TrialsData, self.AvgRewardSlot]
        self.numExpRuns = 0
        self.avgExpReward = 0

        self.table.ix[self.TrialsData, self.AvgRewardExperimentSlot] = 0
        self.table.ix[self.TrialsData, self.NumRunsExperimentSlot] = 0

    def check_state_exist(self, state, stateToInitValues=None):
        self.checkStateLoc.acquire()
        newState = False
        if state not in self.table.index:
            # append new state to q table
            self.table = self.table.append(
                pd.Series([0] * len(self.slots),
                          index=self.table.columns,
                          name=state))

            if stateToInitValues in self.table.index:
                self.table.ix[state, :] = self.table.ix[stateToInitValues, :]
            newState = True

        self.checkStateLoc.release()
        return newState
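
A minimal usage sketch with a hypothetical parameter object (the real modelParams class and the EmptyLock helper are not shown in this snippet, and the table relies on the legacy pandas .ix indexer):

class Params:                               # hypothetical stand-in for modelParams
    numActions = 4
    discountFactor = 0.95
    learningRate = 0.1

    def ExploreProb(self, numRuns):
        return max(0.05, 1.0 - numRuns / 1000.0)

qTable = QLearningTable(Params(), "toy_qtable", "./", loadTable=False)
qTable.InitModel(session=None)              # the session argument is unused by the table

action, actionVals = qTable.choose_action([3, 7], validActions=[0, 1, 2, 3])
qTable.learn([[3, 7]], [action], [1.0], [[3, 8]], [False])
qTable.end_run(r=1.0)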
Example #3
class History:
    def __init__(self,
                 params,
                 historyFileName='',
                 directory='',
                 isMultiThreaded=False,
                 createAllHistFiles=False):

        self.params = params
        self.transitions = {}
        self.transitionKeys = [
            "sFrame", "sVars", "a", "r", "s_Frame", "s_Vars", "terminal"
        ]

        self.transitions["currIdx"] = 0
        self.transitions["size"] = 0

        if len(params.frameSize) == 1:
            frame_shape = (params.maxReplaySize, params.frameSize[0], 1)
            self.copyState = self.CopyState1D
        else:
            frame_shape = (params.maxReplaySize, params.frameSize[0],
                           params.frameSize[1], 1)
            self.copyState = self.CopyState2D

        self.transitions["sFrame"] = np.zeros(frame_shape, dtype=np.float)
        self.transitions["s_Frame"] = np.zeros(frame_shape, dtype=np.float)

        self.transitions["sVars"] = np.zeros(
            (params.maxReplaySize, params.gameVarsSize), dtype=np.float)
        self.transitions["s_Vars"] = np.zeros(
            (params.maxReplaySize, params.gameVarsSize), dtype=np.float)

        self.transitions["a"] = np.zeros(params.maxReplaySize, dtype=np.int32)
        self.transitions["r"] = np.zeros(params.maxReplaySize, dtype=np.float)
        self.transitions["terminal"] = np.zeros(params.maxReplaySize,
                                                dtype=np.bool)

        self.transitions["maxAbsReward"] = 0.0

        self.transitions["maxFrameVals"] = np.ones(params.frameSize, float)
        self.transitions["maxVarsVals"] = np.ones(params.gameVarsSize, float)

        self.isMultiThreaded = isMultiThreaded
        if isMultiThreaded:
            self.histLock = Lock()
        else:
            self.histLock = EmptyLock()

        self.metaDataFields = ["maxVarsVals", "maxFrameVals", "maxAbsReward"]

        if historyFileName != '':
            self.histFileName = directory + historyFileName
        else:
            self.histFileName = historyFileName

    def CopyState1D(self, key, s, idx):
        self.transitions[key][idx, :, 0] = s.copy()

    def CopyState2D(self, key, s, idx):
        self.transitions[key][idx, :, :, 0] = s.copy()

    def add_transition(self, s, a, r, s_, terminal=False):
        self.histLock.acquire()
        currIdx = self.transitions["currIdx"]
        self.transitions["currIdx"] = (self.transitions["currIdx"] +
                                       1) % self.params.maxReplaySize
        self.transitions["size"] = min(self.transitions["size"] + 1,
                                       self.params.maxReplaySize)
        self.histLock.release()

        self.transitions["maxAbsReward"] = max(
            self.transitions["maxAbsReward"], abs(r))

        sFrame = s[0]
        s_Frame = s_[0]

        self.copyState("sFrame", sFrame, currIdx)
        self.copyState("s_Frame", s_Frame, currIdx)

        self.transitions["a"][currIdx] = a
        self.transitions["r"][currIdx] = r
        self.transitions["terminal"][currIdx] = terminal

        if self.params.gameVarsSize > 0:
            sVars = s[1]
            s_Vars = s_[1]

            self.transitions["maxVarsVals"] = np.maximum(
                self.transitions["maxVarsVals"],
                np.maximum(abs(sVars), abs(s_Vars)))
            self.transitions["sVars"][currIdx, :] = sVars.copy()
            self.transitions["s_Vars"][currIdx, :] = s_Vars.copy()

    def Load(self):
        if os.path.isfile(self.histFileName +
                          '.gz') and os.path.getsize(self.histFileName +
                                                     '.gz') > 0:
            self.transitions = pd.read_pickle(self.histFileName + '.gz',
                                              compression='gzip')

    def Save(self):
        pd.to_pickle(self.transitions, self.histFileName + '.gz', 'gzip')

    def Reset(self):
        self.transitions["currIdx"] = 0
        self.transitions["size"] = 0

    def CleanHistory(self):
        for key in self.transitionKeys:
            self.transitions[key] = []

    def get_sample(self, sample_size):
        i = sample(range(0, self.transitions["size"]), sample_size)

        r = self.transitions["r"][i]

        r = r / self.transitions[
            "maxAbsReward"] if self.params.normalizeRewards else r

        sVars = self.transitions["sVars"][i]
        sVars = sVars / self.transitions[
            "maxVarsVals"] if self.params.normalizeState else sVars

        s_Vars = self.transitions["s_Vars"][i]
        s_Vars = s_Vars / self.transitions[
            "maxVarsVals"] if self.params.normalizeState else s_Vars

        s = [self.transitions["sFrame"][i], sVars]
        s_ = [self.transitions["s_Frame"][i], s_Vars]

        return s, self.transitions["a"][i], r, s_, self.transitions[
            "terminal"][i]

    def DrawState(self, realState=True):
        if realState:
            s, _, _, _, _ = self.get_sample(1)
            return s

    def Size(self):
        return self.transitions["size"]
Example #4
class DQN_WithTargetAndDefault(DQN_WithTarget):
    def __init__(self,
                 modelParams,
                 nnName,
                 nnDirectory,
                 loadNN,
                 isMultiThreaded=False,
                 agentName="",
                 createSaver=True):
        super(DQN_WithTargetAndDefault,
              self).__init__(modelParams=modelParams,
                             nnName=nnName,
                             nnDirectory=nnDirectory,
                             isMultiThreaded=isMultiThreaded,
                             loadNN=loadNN,
                             agentName=agentName,
                             createSaver=False)

        self.defaultDecisionMaker = modelParams.defaultDecisionMaker
        self.rewardHistDefault = []
        self.trialsOfDfltRun = modelParams.numTrials2CmpResults

        if isMultiThreaded:
            self.rewardHistDfltLock = Lock()
        else:
            self.rewardHistDfltLock = EmptyLock()

        self.defaultScope = self.scope + "_dflt"
        self.initValDflt = 1000.0
        with tf.variable_scope(self.defaultScope):
            self.valueDefaultDm = tf.get_variable(
                "value_dflt",
                shape=(),
                initializer=tf.constant_initializer(self.initValDflt),
                dtype=tf.float32)

        self.init_op = tf.global_variables_initializer()
        self.sess.run(self.init_op)

        if createSaver:
            self.saver = tf.train.Saver()
            fnameNNMeta = self.directoryName + ".meta"
            if os.path.isfile(fnameNNMeta) and loadNN:
                self.saver.restore(self.sess, self.directoryName)
            else:
                self.CopyDqn2Target(0)
                self.Save()

    def choose_action(self, state, validActions, targetValues=False):
        if targetValues:
            if self.ValueDefault() > self.ValueTarget():
                return self.defaultDecisionMaker.choose_action(
                    state, validActions, targetValues)
            else:
                return super(DQN_WithTargetAndDefault,
                             self).choose_action(state, validActions,
                                                 targetValues)
        else:
            if self.ValueDefault() == self.initValDflt:
                return super(DQN_WithTargetAndDefault,
                             self).choose_action(state, validActions)
            else:
                return self.defaultDecisionMaker.choose_action(
                    state, validActions)

    def ValueDefault(self):
        return self.valueDefaultDm.eval(session=self.sess)

    def ActionsValues(self, state, validActions, targetValues=False):
        if targetValues:
            if self.ValueDefault() > self.ValueTarget():
                return self.defaultDecisionMaker.ActionsValues(
                    state, targetValues)
            else:
                return super(DQN_WithTargetAndDefault,
                             self).ActionsValues(state, targetValues)
        else:
            if self.ValueDefault() == self.initValDflt:
                return self.defaultDecisionMaker.ActionsValues(
                    state, targetValues)
            else:
                return super(DQN_WithTargetAndDefault,
                             self).ActionsValues(state, targetValues)

    def ExploreProb(self):
        if self.ValueDefault() == self.initValDflt:
            return 0.0
        else:
            return super(DQN_WithTargetAndDefault, self).ExploreProb()

    def end_run(self, r, toSave=False):
        if self.ValueDefault() == self.initValDflt:
            self.rewardHistDfltLock.acquire()

            self.rewardHistDefault.append(r)
            if len(self.rewardHistDefault) >= self.trialsOfDfltRun:
                avgReward = np.average(np.array(self.rewardHistDefault))
                assign = self.valueDefaultDm.assign(avgReward)
                self.sess.run(assign)
                self.Save()

            self.rewardHistDfltLock.release()

            print("\t",
                  threading.current_thread().getName(),
                  " : take default dm value #", len(self.rewardHistDefault))
        else:
            super(DQN_WithTargetAndDefault, self).end_run(r, toSave)

    def DecisionMakerType(self):
        return "DQN_WithTargetAndDefault"

    def TakeDfltValues(self):
        return self.ValueDefault() > self.ValueTarget()

    def NumDfltRuns(self):
        return len(self.rewardHistDefault)

    def DfltValueInitialized(self):
        return self.ValueDefault() != self.initValDflt

    def actionValuesSpecific(self, state, dmId):  # dmId = dflt, target, curr
        if dmId == "dflt":
            return self.defaultDecisionMaker.ActionsValues(state)
        else:
            return super(DQN_WithTargetAndDefault,
                         self).actionValuesSpecific(state, dmId)


# class CopyDqn:
#     def __init__(self, argListFrom, argListTo):
#         self.sess = tf.Session()
#         argListFrom["params"].tfSession = self.sess
Example #5
class DQN_WithTarget(DQN):
    def __init__(self,
                 modelParams,
                 nnName,
                 nnDirectory,
                 loadNN,
                 agentName="",
                 isMultiThreaded=False,
                 createSaver=True):
        super(DQN_WithTarget, self).__init__(modelParams=modelParams,
                                             nnName=nnName,
                                             nnDirectory=nnDirectory,
                                             isMultiThreaded=isMultiThreaded,
                                             loadNN=loadNN,
                                             agentName=agentName,
                                             createSaver=False)

        self.numTrials2CmpResults = modelParams.numTrials2CmpResults

        self.targetScope = self.scope + "_target"

        self.lastTrainNumRuns = 0

        with tf.variable_scope(self.targetScope):
            self.numRunsTarget = tf.get_variable(
                "numRuns_target",
                shape=(),
                initializer=tf.zeros_initializer(),
                dtype=tf.int32)
            self.valueTarget = tf.get_variable(
                "value_target",
                shape=(),
                initializer=tf.constant_initializer(-100.0),
                dtype=tf.float32)

        with tf.variable_scope(self.scope):
            self.valueDqn = tf.get_variable(
                "value_dqn",
                shape=(),
                initializer=tf.constant_initializer(-1000.0),
                dtype=tf.float32)

        # Construct target network
        if modelParams.type == "DQN_Embedding":
            self.targetOutput = self.build_dqn_withEmbedding(
                modelParams.nn_Func, self.targetScope)
        else:
            self.targetOutput = self.build_dqn(modelParams.nn_Func,
                                               self.targetScope)

        self.init_op = tf.global_variables_initializer()
        self.sess.run(self.init_op)

        if createSaver:
            self.saver = tf.train.Saver()
            fnameNNMeta = self.directoryName + ".meta"
            if os.path.isfile(fnameNNMeta) and loadNN:
                self.saver.restore(self.sess, self.directoryName)
            else:
                self.CopyDqn2Target(0)
                self.Save()

        if modelParams.outputGraph:
            # $ tensorboard --logdir=logs
            tf.summary.FileWriter(nnDirectory + "/", self.sess.graph)

        self.rewardHist = []
        if isMultiThreaded:
            self.rewardHistLock = Lock()
        else:
            self.rewardHistLock = EmptyLock()

    def CopyNN(self, scopeTo, scopeFrom):
        fromParams = [
            t for t in tf.trainable_variables() if t.name.startswith(scopeFrom)
        ]
        fromParams = sorted(fromParams, key=lambda v: v.name)

        toParams = [
            t for t in tf.trainable_variables() if t.name.startswith(scopeTo)
        ]
        toParams = sorted(toParams, key=lambda v: v.name)

        update_ops = []
        for fromVar, toVar in zip(fromParams, toParams):
            op = toVar.assign(fromVar)
            update_ops.append(op)

        self.sess.run(update_ops)

    def CopyDqn2Target(self, numRuns2Save):
        self.CopyNN(self.targetScope, self.scope)

        if numRuns2Save is not None:
            assign = self.numRunsTarget.assign(numRuns2Save)
            self.sess.run(assign)

    def CopyTarget2DQN(self, numRuns):
        self.CopyNN(self.scope, self.targetScope)

        assign = self.numRuns.assign(numRuns)
        self.sess.run(assign)
        self.Save()

        self.rewardHistLock.acquire()
        self.rewardHist = []
        self.rewardHistLock.release()

    def choose_action(self, state, validActions, targetValues=False):
        if targetValues:
            if np.random.uniform() > self.TargetExploreProb():
                vals = self.targetOutput.eval(
                    {self.inputLayer: state.reshape(1, self.num_input)},
                    session=self.sess)

                # consider every action that attains the (valid-action) maximum
                maxArgs = np.argwhere(
                    vals[0] == np.amax(vals[0][validActions])).flatten()
                maxArgsValid = [x for x in maxArgs if x in validActions]
                action = np.random.choice(maxArgsValid)
            else:
                action = np.random.choice(validActions)
        else:
            action = super(DQN_WithTarget,
                           self).choose_action(state, validActions,
                                               targetValues)

        return action

    def ActionsValues(self, state, validActions, targetValues=False):
        if targetValues:
            allVals = self.targetOutput.eval(
                {self.inputLayer: state.reshape(1, self.num_input)},
                session=self.sess)
            return allVals[0]
        else:
            return super(DQN_WithTarget,
                         self).ActionsValues(state, targetValues)

    def TargetExploreProb(self):
        return 0

    def NumRunsTarget(self):
        return self.numRunsTarget.eval(session=self.sess)

    def ValueTarget(self):
        return self.valueTarget.eval(session=self.sess)

    def ValueDqn(self):
        return self.valueDqn.eval(session=self.sess)

    def CalcValueDqn(self):
        # calculate results and compare to target
        self.rewardHistLock.acquire()
        rewardHist = self.rewardHist.copy()
        self.rewardHistLock.release()

        if len(rewardHist) >= self.numTrials2CmpResults:
            avgReward = np.average(np.array(rewardHist))
            assign = self.valueDqn.assign(avgReward)
            self.sess.run(assign)

    def learn(self, s, a, r, s_, terminal, numRuns2Save=None):
        self.CalcValueDqn()
        if self.ValueDqn() > self.ValueTarget():
            self.CopyDqn2Target(self.lastTrainNumRuns)

        self.lastTrainNumRuns = numRuns2Save

        super(DQN_WithTarget, self).learn(s, a, r, s_, terminal, numRuns2Save)

    def end_run(self, r, toSave=False):
        super(DQN_WithTarget, self).end_run(r, toSave)

        # append the reward to the reward history and pop the oldest entry if necessary
        self.rewardHistLock.acquire()
        self.rewardHist.append(r)
        if len(self.rewardHist) > self.numTrials2CmpResults:
            self.rewardHist.pop(0)

        self.rewardHistLock.release()

    def DecisionMakerType(self):
        return "DQN_WithTarget"