def update(self, transitionsBatch):
    """Update the Q-values from the given batch of transitions.

    :param transitionsBatch: iterable of tuples
        ``(qState, action, reward, nextQState, isTerminal, nextStateLegalActions)``
        — note the order matches the loop unpacking below.
    :return: the loss value reported by ``train_on_batch``.
    """
    trainingBatchQStates = []
    trainingBatchTargetQValues = []

    for aQState, anAction, aReward, aNextQState, isTerminal, nextStateLegalActions in transitionsBatch:
        # Current Q-value estimates for this state; copy so only the taken
        # action's entry is overwritten with the TD target.
        actionsQValues = self.model.model.predict(np.array([aQState]))[0]
        targetQValues = actionsQValues.copy()

        if isTerminal:
            # Terminal state: no future value to bootstrap from.
            updatedQValueForAction = aReward
        else:
            nextActionsQValues = self.model.model.predict(np.array([aNextQState]))[0]
            nextStateLegalActionsIndices = [
                Directions.getIndex(action) for action in nextStateLegalActions
            ]
            # Exclude action index 4 (presumably STOP — confirm against
            # Directions.getIndex); ignore if it is not among the legal actions.
            try:
                nextStateLegalActionsIndices.remove(4)
            except ValueError:
                pass
            nextStateLegalActionsQValues = np.array(
                nextActionsQValues)[nextStateLegalActionsIndices]
            maxNextActionQValue = max(nextStateLegalActionsQValues)
            # Standard Q-learning target: r + gamma * max_a' Q(s', a').
            updatedQValueForAction = (
                aReward + self.trainingRoom.discount * maxNextActionQValue)

        targetQValues[Directions.getIndex(anAction)] = updatedQValueForAction
        trainingBatchQStates.append(aQState)
        trainingBatchTargetQValues.append(targetQValues)

    return self.model.model.train_on_batch(
        x=np.array(trainingBatchQStates),
        y=np.array(trainingBatchTargetQValues))
def getAction(self, rawState, epsilon):
    """Epsilon-greedy action selection.

    With probability ``epsilon`` return a uniformly random legal action,
    otherwise the legal action with the highest Q-value. STOP is never
    considered.

    :param rawState: game state providing ``getLegalActions()``.
    :param epsilon: exploration probability in [0, 1].
    :return: a legal action, or ``None`` if no legal action exists on the
        greedy branch (mirrors the original fall-through behavior).
    """
    legalActions = rawState.getLegalActions()
    # Guarded removal: the original unconditional remove() raised
    # ValueError whenever STOP was not among the legal actions.
    if Directions.STOP in legalActions:
        legalActions.remove(Directions.STOP)

    if util.flipCoin(epsilon):
        # Explore.
        return random.choice(legalActions)

    if not legalActions:
        return None

    # Exploit: every candidate is already legal, so the sort-and-scan of the
    # original collapses to a single max() over the Q-values. Ties resolve to
    # the first maximal action, exactly as the original stable sort did.
    return max(legalActions,
               key=lambda action: self.getQValue(rawState, action))
def getGhostDirections(state):
    """Encode each ghost's heading as its Directions index scaled by 1/4.

    :param state: game state providing ``getGhostStates()``.
    :return: 1-D numpy array of direction indices divided by 4.0.
    """
    indices = []
    for ghostState in state.getGhostStates():
        indices.append(Directions.getIndex(ghostState.getDirection()))
    return np.array(indices) / 4.0
def remember(self, state, action, reward, nextState):
    """Store one transition in replay memory.

    The entry is keyed by the state's hash concatenated with the action's
    direction index, so repeating the same (state, action) pair overwrites
    the previous transition.
    """
    from game import Directions
    memoryKey = f"{state.__hash__()}{Directions.getIndex(action)}"
    self.replayMemory[memoryKey] = (state, action, reward, nextState)