import os
import threading
from random import sample
from threading import Lock

import numpy as np
import pandas as pd
import tensorflow as tf


class History():
    def __init__(self, isMultiThreaded=False):
        self.transitions = {}
        self.transitionKeys = ["s", "a", "r", "s_", "terminal"]

        for key in self.transitionKeys:
            self.transitions[key] = []

        if isMultiThreaded:
            self.histLock = Lock()
        else:
            self.histLock = EmptyLock()

    def learn(self, s, a, r, s_, terminal=False):
        self.histLock.acquire()
        self.transitions["s"].append(s.copy())
        self.transitions["a"].append(a)
        self.transitions["r"].append(r)
        self.transitions["s_"].append(s_.copy())
        self.transitions["terminal"].append(terminal)
        self.histLock.release()

    def GetHistory(self, reset=True):
        self.histLock.acquire()
        transitions = self.transitions.copy()
        if reset:
            for key in self.transitionKeys:
                self.transitions[key] = []
        self.histLock.release()

        return transitions

    def Reset(self):
        self.histLock.acquire()
        for key in self.transitionKeys:
            self.transitions[key] = []
        self.histLock.release()

    def RemoveNonTerminalHistory(self):
        self.histLock.acquire()
        idx = len(self.transitions["terminal"]) - 1
        # pop transitions back to the most recent terminal one
        # (bounds check first, so an all-non-terminal history is handled safely)
        while idx >= 0 and not self.transitions["terminal"][idx]:
            for key in self.transitionKeys:
                self.transitions[key].pop(-1)
            idx -= 1
        self.histLock.release()

    def ExtractHistory(self, transitions):
        s = np.array(transitions["s"], dtype=float)
        a = np.array(transitions["a"], dtype=int)
        r = np.array(transitions["r"], dtype=float)
        s_ = np.array(transitions["s_"], dtype=float)
        terminal = np.array(transitions["terminal"], dtype=bool)

        return s, a, r, s_, terminal
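

# EmptyLock is referenced throughout but not defined in this excerpt; the
# repository presumably provides it elsewhere. A minimal no-op sketch that
# satisfies the acquire/release calls used here (an assumption, not the
# original class):
class EmptyLock:
    """Do-nothing stand-in for threading.Lock in single-threaded runs."""

    def acquire(self):
        pass

    def release(self):
        pass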


class QLearningTable:
    def __init__(self, modelParams, qTableName, qTableDirectory,
                 loadTable=True, isMultiThreaded=False, agentName=""):
        self.qTableFullName = qTableDirectory + qTableName

        if isMultiThreaded:
            self.checkStateLoc = Lock()
        else:
            self.checkStateLoc = EmptyLock()

        self.TrialsData = "TrialsData"
        self.NumRunsTotalSlot = 0
        self.NumRunsExperimentSlot = 1
        self.AvgRewardSlot = 2
        self.AvgRewardExperimentSlot = 3

        slotsInTable = max(4, modelParams.numActions)

        self.actions = list(range(modelParams.numActions))
        self.slots = list(range(slotsInTable))  # a list

        self.table = pd.DataFrame(columns=self.slots, dtype=np.float)

        self.params = modelParams

    def InitModel(self, session, resetModel=False):
        if os.path.isfile(self.qTableFullName + '.gz') and not resetModel:
            self.ReadTable()

        self.check_state_exist(self.TrialsData)

        self.numTotRuns = self.table.ix[self.TrialsData, self.NumRunsTotalSlot]
        self.avgTotReward = self.table.ix[self.TrialsData, self.AvgRewardSlot]
        self.numExpRuns = 0
        self.avgExpReward = 0

        self.table.ix[self.TrialsData, self.AvgRewardExperimentSlot] = 0
        self.table.ix[self.TrialsData, self.NumRunsExperimentSlot] = 0

    def TakeDfltValues(self):
        return False

    def InitTTable(self, ttable):
        self.ttable = ttable
        self.reverseTable = ttable.reverseKey
        self.normalTable = ttable.normalKey
        self.timeoutPropogation = 10

    def ReadTable(self):
        self.table = pd.read_pickle(self.qTableFullName + '.gz', compression='gzip')

    def SaveTable(self):
        self.table.to_pickle(self.qTableFullName + '.gz', 'gzip')

    def choose_absolute_action(self, observation):
        state = str(observation)
        self.check_state_exist(state)

        state_action = self.table.ix[state, self.actions]
        state_actionReindex = state_action.reindex(
            np.random.permutation(state_action.index))
        action = state_actionReindex.idxmax()

        return action, state_action[action]

    def ExploreProb(self):
        return self.params.ExploreProb(self.numTotRuns)

    def choose_action(self, state, validActions, targetValues=False):
        state = str(state)
        exploreProb = self.params.ExploreProb(self.numTotRuns)
        actionVals = self.ActionsValues(state, validActions, targetValues)

        if np.random.uniform() > exploreProb:
            # choose best action (some actions have the same value)
            maxArgs = np.argwhere(
                actionVals == np.amax(actionVals[validActions])).squeeze()
            # choose from valid actions
            maxArgsValid = [x for x in maxArgs if x in validActions]
            action = np.random.choice(maxArgsValid)
        else:
            # choose random action
            action = np.random.choice(validActions)

        return action, actionVals

    def ActionsValues(self, state, validActions, targetValues=False):
        s = str(state)
        self.check_state_exist(s)
        vals = self.table.ix[s, :]

        return vals

    def NumRuns(self):
        return self.numTotRuns

    def learn(self, statesVec, actionsVec, rewardsVec, nextStateVec, terminal):
        for i in range(len(rewardsVec)):
            s = str(statesVec[i])
            s_ = str(nextStateVec[i])
            self.check_state_exist(s)
            self.check_state_exist(s_)
            self.learnIMP(s, actionsVec[i], rewardsVec[i], s_, terminal[i])

    def learnIMP(self, s, a, r, s_, terminal):
        q_predict = self.table.ix[s, a]
        if not terminal:
            q_target = r + self.params.discountFactor * self.table.ix[s_, :].max()
        else:
            q_target = r  # next state is terminal

        # update
        self.table.ix[s, a] += self.params.learningRate * (q_target - q_predict)

    def end_run(self, r, saveTable=False):
        self.avgTotReward = (self.numTotRuns * self.avgTotReward + r) / (self.numTotRuns + 1)
        self.avgExpReward = (self.numExpRuns * self.avgExpReward + r) / (self.numExpRuns + 1)

        self.numTotRuns += 1
        self.numExpRuns += 1

        self.table.ix[self.TrialsData, self.AvgRewardSlot] = self.avgTotReward
        self.table.ix[self.TrialsData, self.AvgRewardExperimentSlot] = self.avgExpReward
        self.table.ix[self.TrialsData, self.NumRunsTotalSlot] = self.numTotRuns
        self.table.ix[self.TrialsData, self.NumRunsExperimentSlot] = self.numExpRuns

        # print("num total runs = ", self.numTotRuns, "avg total = ", self.avgTotReward)
        # print("num experiment runs = ", self.numExpRuns, "avg experiment = ", self.avgExpReward)

        if saveTable:
            self.SaveTable()

    def Reset(self):
        self.table = pd.DataFrame(columns=self.slots, dtype=np.float)
        self.check_state_exist(self.TrialsData)

        self.numTotRuns = self.table.ix[self.TrialsData, self.NumRunsTotalSlot]
        self.avgTotReward = self.table.ix[self.TrialsData, self.AvgRewardSlot]
        self.numExpRuns = 0
        self.avgExpReward = 0

        self.table.ix[self.TrialsData, self.AvgRewardExperimentSlot] = 0
        self.table.ix[self.TrialsData, self.NumRunsExperimentSlot] = 0

    def check_state_exist(self, state, stateToInitValues=None):
        self.checkStateLoc.acquire()
        newState = False
        if state not in self.table.index:
            # append new state to q table
            self.table = self.table.append(
                pd.Series([0] * len(self.slots),
                          index=self.table.columns,
                          name=state))
            if stateToInitValues in self.table.index:
                self.table.ix[state, :] = self.table.ix[stateToInitValues, :]
            newState = True
        self.checkStateLoc.release()

        return newState
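

# Illustrative usage sketch for QLearningTable. _ToyParams is a hypothetical
# stand-in for the modelParams object the repository normally supplies; the
# sketch also assumes the legacy pandas/numpy versions this code targets
# (DataFrame.ix, DataFrame.append and np.float are removed in current releases).
def _qtable_usage_sketch():
    class _ToyParams:
        numActions = 2
        discountFactor = 0.95
        learningRate = 0.1

        def ExploreProb(self, numRuns):
            return 0.1  # fixed epsilon, just for the sketch

    qTable = QLearningTable(_ToyParams(), "toyQTable", "./")
    qTable.InitModel(session=None, resetModel=True)

    # one tabular update: Q(s, a) += lr * (r + gamma * max_a' Q(s', a') - Q(s, a))
    qTable.learn(statesVec=[[0]], actionsVec=[1], rewardsVec=[1.0],
                 nextStateVec=[[1]], terminal=[False])

    action, actionVals = qTable.choose_action([1], validActions=[0, 1])
    qTable.end_run(r=1.0)
    return action, actionVals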


class History:
    def __init__(self, params, historyFileName='', directory='',
                 isMultiThreaded=False, createAllHistFiles=False):
        self.params = params

        self.transitions = {}
        self.transitionKeys = [
            "sFrame", "sVars", "a", "r", "s_Frame", "s_Vars", "terminal"
        ]

        self.transitions["currIdx"] = 0
        self.transitions["size"] = 0

        if len(params.frameSize) == 1:
            frame_shape = (params.maxReplaySize, params.frameSize[0], 1)
            self.copyState = self.CopyState1D
        else:
            frame_shape = (params.maxReplaySize, params.frameSize[0],
                           params.frameSize[1], 1)
            self.copyState = self.CopyState2D

        self.transitions["sFrame"] = np.zeros(frame_shape, dtype=np.float)
        self.transitions["s_Frame"] = np.zeros(frame_shape, dtype=np.float)

        self.transitions["sVars"] = np.zeros(
            (params.maxReplaySize, params.gameVarsSize), dtype=np.float)
        self.transitions["s_Vars"] = np.zeros(
            (params.maxReplaySize, params.gameVarsSize), dtype=np.float)

        self.transitions["a"] = np.zeros(params.maxReplaySize, dtype=np.int32)
        self.transitions["r"] = np.zeros(params.maxReplaySize, dtype=np.float)
        self.transitions["terminal"] = np.zeros(params.maxReplaySize, dtype=np.bool)

        self.transitions["maxAbsReward"] = 0.0
        self.transitions["maxFrameVals"] = np.ones(params.frameSize, float)
        self.transitions["maxVarsVals"] = np.ones(params.gameVarsSize, float)

        self.isMultiThreaded = isMultiThreaded
        if isMultiThreaded:
            self.histLock = Lock()
        else:
            self.histLock = EmptyLock()

        self.metaDataFields = ["maxVarsVals", "maxFrameVals", "maxAbsReward"]

        if historyFileName != '':
            self.histFileName = directory + historyFileName
        else:
            self.histFileName = historyFileName

    def CopyState1D(self, key, s, idx):
        self.transitions[key][idx, :, 0] = s.copy()

    def CopyState2D(self, key, s, idx):
        self.transitions[key][idx, :, :, 0] = s.copy()

    def add_transition(self, s, a, r, s_, terminal=False):
        self.histLock.acquire()
        currIdx = self.transitions["currIdx"]
        self.transitions["currIdx"] = (self.transitions["currIdx"] + 1) % self.params.maxReplaySize
        self.transitions["size"] = min(self.transitions["size"] + 1,
                                       self.params.maxReplaySize)
        self.histLock.release()

        self.transitions["maxAbsReward"] = max(self.transitions["maxAbsReward"], abs(r))

        sFrame = s[0]
        s_Frame = s_[0]

        self.copyState("sFrame", sFrame, currIdx)
        self.copyState("s_Frame", s_Frame, currIdx)

        self.transitions["a"][currIdx] = a
        self.transitions["r"][currIdx] = r
        self.transitions["terminal"][currIdx] = terminal

        if self.params.gameVarsSize > 0:
            sVars = s[1]
            s_Vars = s_[1]
            self.transitions["maxVarsVals"] = np.maximum(
                self.transitions["maxVarsVals"],
                np.maximum(abs(sVars), abs(s_Vars)))
            self.transitions["sVars"][currIdx, :] = sVars.copy()
            self.transitions["s_Vars"][currIdx, :] = s_Vars.copy()

    def Load(self):
        if os.path.isfile(self.histFileName + '.gz') and os.path.getsize(self.histFileName + '.gz') > 0:
            self.transitions = pd.read_pickle(self.histFileName + '.gz', compression='gzip')

    def Save(self):
        pd.to_pickle(self.transitions, self.histFileName + '.gz', 'gzip')

    def Reset(self):
        self.transitions["currIdx"] = 0
        self.transitions["size"] = 0

    def CleanHistory(self):
        for key in self.transitionKeys:
            self.transitions[key] = []

    def get_sample(self, sample_size):
        i = sample(range(0, self.transitions["size"]), sample_size)

        r = self.transitions["r"][i]
        r = r / self.transitions["maxAbsReward"] if self.params.normalizeRewards else r

        sVars = self.transitions["sVars"][i]
        sVars = sVars / self.transitions["maxVarsVals"] if self.params.normalizeState else sVars

        # next-state game variables
        s_Vars = self.transitions["s_Vars"][i]
        s_Vars = s_Vars / self.transitions["maxVarsVals"] if self.params.normalizeState else s_Vars

        s = [self.transitions["sFrame"][i], sVars]
        s_ = [self.transitions["s_Frame"][i], s_Vars]

        return s, self.transitions["a"][i], r, s_, self.transitions["terminal"][i]

    def DrawState(self, realState=True):
        if realState:
            s, _, _, _, _ = self.get_sample(1)
            return s

    def Size(self):
        return self.transitions["size"]
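

# Illustrative usage sketch for the replay-buffer History defined directly
# above. _ReplayParams is a hypothetical stand-in for the params object the
# repository normally supplies; the sketch assumes a legacy numpy in which the
# np.float / np.bool aliases used in __init__ still exist.
def _replay_history_sketch():
    class _ReplayParams:
        frameSize = [8]        # 1D frame, so CopyState1D is selected
        gameVarsSize = 2
        maxReplaySize = 1000
        normalizeRewards = True
        normalizeState = True

    hist = History(_ReplayParams())

    s = [np.zeros(8), np.array([1.0, 2.0])]    # [frame, game variables]
    s_ = [np.ones(8), np.array([2.0, 3.0])]
    for _ in range(32):
        hist.add_transition(s, a=1, r=0.5, s_=s_, terminal=False)

    # rewards / variables come back normalized because the flags above are set
    states, actions, rewards, nextStates, terminal = hist.get_sample(16)
    return states, actions, rewards, nextStates, terminal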


class DQN_WithTargetAndDefault(DQN_WithTarget):
    def __init__(self, modelParams, nnName, nnDirectory, loadNN,
                 isMultiThreaded=False, agentName="", createSaver=True):
        super(DQN_WithTargetAndDefault, self).__init__(modelParams=modelParams,
                                                       nnName=nnName,
                                                       nnDirectory=nnDirectory,
                                                       isMultiThreaded=isMultiThreaded,
                                                       loadNN=loadNN,
                                                       agentName=agentName,
                                                       createSaver=False)

        self.defaultDecisionMaker = modelParams.defaultDecisionMaker

        self.rewardHistDefault = []
        self.trialsOfDfltRun = modelParams.numTrials2CmpResults

        if isMultiThreaded:
            self.rewardHistDfltLock = Lock()
        else:
            self.rewardHistDfltLock = EmptyLock()

        self.defaultScope = self.scope + "_dflt"
        self.initValDflt = 1000.0
        with tf.variable_scope(self.defaultScope):
            self.valueDefaultDm = tf.get_variable(
                "value_dflt",
                shape=(),
                initializer=tf.constant_initializer(self.initValDflt),
                dtype=tf.float32)

        self.init_op = tf.global_variables_initializer()
        self.sess.run(self.init_op)

        if createSaver:
            self.saver = tf.train.Saver()
            fnameNNMeta = self.directoryName + ".meta"
            if os.path.isfile(fnameNNMeta) and loadNN:
                self.saver.restore(self.sess, self.directoryName)
            else:
                self.CopyDqn2Target(0)
                self.Save()

    def choose_action(self, state, validActions, targetValues=False):
        if targetValues:
            if self.ValueDefault() > self.ValueTarget():
                return self.defaultDecisionMaker.choose_action(state, validActions, targetValues)
            else:
                return super(DQN_WithTargetAndDefault, self).choose_action(state, validActions, targetValues)
        else:
            if self.ValueDefault() == self.initValDflt:
                return super(DQN_WithTargetAndDefault, self).choose_action(state, validActions)
            else:
                return self.defaultDecisionMaker.choose_action(state, validActions)

    def ValueDefault(self):
        return self.valueDefaultDm.eval(session=self.sess)

    def ActionsValues(self, state, validActions, targetValues=False):
        if targetValues:
            if self.ValueDefault() > self.ValueTarget():
                return self.defaultDecisionMaker.ActionsValues(state, targetValues)
            else:
                return super(DQN_WithTargetAndDefault, self).ActionsValues(state, targetValues)
        else:
            if self.ValueDefault() == self.initValDflt:
                return self.defaultDecisionMaker.ActionsValues(state, targetValues)
            else:
                return super(DQN_WithTargetAndDefault, self).ActionsValues(state, targetValues)

    def ExploreProb(self):
        if self.ValueDefault() == self.initValDflt:
            return 0.0
        else:
            return super(DQN_WithTargetAndDefault, self).ExploreProb()

    def end_run(self, r, toSave=False):
        if self.ValueDefault() == self.initValDflt:
            self.rewardHistDfltLock.acquire()
            self.rewardHistDefault.append(r)
            if len(self.rewardHistDefault) >= self.trialsOfDfltRun:
                avgReward = np.average(np.array(self.rewardHistDefault))
                assign = self.valueDefaultDm.assign(avgReward)
                self.sess.run(assign)
                self.Save()
            self.rewardHistDfltLock.release()
            print("\t", threading.current_thread().getName(),
                  " : take default dm value #", len(self.rewardHistDefault))
        else:
            super(DQN_WithTargetAndDefault, self).end_run(r, toSave)

    def DecisionMakerType(self):
        return "DQN_WithTargetAndDefault"

    def TakeDfltValues(self):
        return self.ValueDefault() > self.ValueTarget()

    def NumDfltRuns(self):
        return len(self.rewardHistDefault)

    def DfltValueInitialized(self):
        return self.ValueDefault() != self.initValDflt

    def actionValuesSpecific(self, state, dmId):
        # dmId = dflt, target, curr
        if dmId == "dflt":
            return self.defaultDecisionMaker.ActionsValues(state)
        else:
            return super(DQN_WithTargetAndDefault, self).actionValuesSpecific(state, dmId)


# class CopyDqn:
#     def __init__(self, argListFrom, argListTo):
#         self.sess = tf.Session()
#         argListFrom["params"].tfSession = self.sess


class DQN_WithTarget(DQN):
    def __init__(self, modelParams, nnName, nnDirectory, loadNN, agentName="",
                 isMultiThreaded=False, createSaver=True):
        super(DQN_WithTarget, self).__init__(modelParams=modelParams,
                                             nnName=nnName,
                                             nnDirectory=nnDirectory,
                                             isMultiThreaded=isMultiThreaded,
                                             loadNN=loadNN,
                                             agentName=agentName,
                                             createSaver=False)

        self.numTrials2CmpResults = modelParams.numTrials2CmpResults

        self.targetScope = self.scope + "_target"
        self.lastTrainNumRuns = 0

        with tf.variable_scope(self.targetScope):
            self.numRunsTarget = tf.get_variable(
                "numRuns_target",
                shape=(),
                initializer=tf.zeros_initializer(),
                dtype=tf.int32)
            self.valueTarget = tf.get_variable(
                "value_target",
                shape=(),
                initializer=tf.constant_initializer(-100.0),
                dtype=tf.float32)

        with tf.variable_scope(self.scope):
            self.valueDqn = tf.get_variable(
                "value_dqn",
                shape=(),
                initializer=tf.constant_initializer(-1000.0),
                dtype=tf.float32)

        # Construct target network
        if modelParams.type == "DQN_Embedding":
            self.targetOutput = self.build_dqn_withEmbedding(modelParams.nn_Func, self.targetScope)
        else:
            self.targetOutput = self.build_dqn(modelParams.nn_Func, self.targetScope)

        self.init_op = tf.global_variables_initializer()
        self.sess.run(self.init_op)

        if createSaver:
            self.saver = tf.train.Saver()
            fnameNNMeta = self.directoryName + ".meta"
            if os.path.isfile(fnameNNMeta) and loadNN:
                self.saver.restore(self.sess, self.directoryName)
            else:
                self.CopyDqn2Target(0)
                self.Save()

        if modelParams.outputGraph:
            # $ tensorboard --logdir=logs
            tf.summary.FileWriter(nnDirectory + "/", self.sess.graph)

        self.rewardHist = []
        if isMultiThreaded:
            self.rewardHistLock = Lock()
        else:
            self.rewardHistLock = EmptyLock()

    def CopyNN(self, scopeTo, scopeFrom):
        fromParams = [t for t in tf.trainable_variables() if t.name.startswith(scopeFrom)]
        fromParams = sorted(fromParams, key=lambda v: v.name)

        toParams = [t for t in tf.trainable_variables() if t.name.startswith(scopeTo)]
        toParams = sorted(toParams, key=lambda v: v.name)

        update_ops = []
        for fromVar, toVar in zip(fromParams, toParams):
            op = toVar.assign(fromVar)
            update_ops.append(op)

        self.sess.run(update_ops)

    def CopyDqn2Target(self, numRuns2Save):
        self.CopyNN(self.targetScope, self.scope)
        if numRuns2Save != None:
            assign = self.numRunsTarget.assign(numRuns2Save)
            self.sess.run(assign)

    def CopyTarget2DQN(self, numRuns):
        self.CopyNN(self.scope, self.targetScope)
        assign = self.numRuns.assign(numRuns)
        self.sess.run(assign)
        self.Save()

        self.rewardHistLock.acquire()
        self.rewardHist = []
        self.rewardHistLock.release()

    def choose_action(self, state, validActions, targetValues=False):
        if targetValues:
            if np.random.uniform() > self.TargetExploreProb():
                vals = self.targetOutput.eval(
                    {self.inputLayer: state.reshape(1, self.num_input)},
                    session=self.sess)
                maxArgs = list(np.argwhere(vals[0] == np.amax(vals[0][validActions]))[0])
                maxArgsValid = [x for x in maxArgs if x in validActions]
                action = np.random.choice(maxArgsValid)
            else:
                action = np.random.choice(validActions)
        else:
            action = super(DQN_WithTarget, self).choose_action(state, validActions, targetValues)

        return action

    def ActionsValues(self, state, validActions, targetValues=False):
        if targetValues:
            allVals = self.targetOutput.eval(
                {self.inputLayer: state.reshape(1, self.num_input)},
                session=self.sess)
            return allVals[0]
        else:
            return super(DQN_WithTarget, self).ActionsValues(state, targetValues)

    def TargetExploreProb(self):
        return 0

    def NumRunsTarget(self):
        return self.numRunsTarget.eval(session=self.sess)

    def ValueTarget(self):
        return self.valueTarget.eval(session=self.sess)

    def ValueDqn(self):
        return self.valueDqn.eval(session=self.sess)

    def CalcValueDqn(self):
        # calculate results and compare to target
        self.rewardHistLock.acquire()
        rewardHist = self.rewardHist.copy()
        self.rewardHistLock.release()

        if len(rewardHist) >= self.numTrials2CmpResults:
            avgReward = np.average(np.array(rewardHist))
            assign = self.valueDqn.assign(avgReward)
            self.sess.run(assign)

    def learn(self, s, a, r, s_, terminal, numRuns2Save=None):
        self.CalcValueDqn()
        if self.ValueDqn() > self.ValueTarget():
            self.CopyDqn2Target(self.lastTrainNumRuns)
        self.lastTrainNumRuns = numRuns2Save

        super(DQN_WithTarget, self).learn(s, a, r, s_, terminal, numRuns2Save)

    def end_run(self, r, toSave=False):
        super(DQN_WithTarget, self).end_run(r, toSave)

        # insert reward into the reward history and pop the first entry if necessary
        self.rewardHistLock.acquire()
        self.rewardHist.append(r)
        if len(self.rewardHist) > self.numTrials2CmpResults:
            self.rewardHist.pop(0)
        self.rewardHistLock.release()

    def DecisionMakerType(self):
        return "DQN_WithTarget"
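

# Standalone sketch of the scope-to-scope weight copy that CopyNN performs,
# assuming the TensorFlow 1.x graph/session API used throughout these classes.
# The "online" / "target" scope names below are illustrative only.
def _copy_scope_sketch():
    g = tf.Graph()
    with g.as_default():
        with tf.variable_scope("online"):
            w_online = tf.get_variable("w", shape=(3,), initializer=tf.ones_initializer())
        with tf.variable_scope("target"):
            w_target = tf.get_variable("w", shape=(3,), initializer=tf.zeros_initializer())

        # gather both variable sets, sort them by name so they pair up, and
        # build one assign op per (source, destination) pair
        fromParams = sorted([t for t in tf.trainable_variables() if t.name.startswith("online")],
                            key=lambda v: v.name)
        toParams = sorted([t for t in tf.trainable_variables() if t.name.startswith("target")],
                          key=lambda v: v.name)
        update_ops = [toVar.assign(fromVar) for fromVar, toVar in zip(fromParams, toParams)]

        with tf.Session(graph=g) as sess:
            sess.run(tf.global_variables_initializer())
            sess.run(update_ops)
            return sess.run(w_target)  # now equals the "online" weights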