def agent_step(self, reward, observation):
    """One SARSA step: update Q(s,a) toward r + gamma * Q(s',a') and choose a'.

    If the previous action earned the bad-action penalty, that (state, action)
    pair is removed from the allowed-action and Q tables and a replacement
    action is chosen for the *same* observation instead of doing a TD update.

    :param reward: scalar reward for the previous action.
    :param observation: new observation; its intArray encodes the state.
    :return: the Action object to execute next.
    """
    lastState = self.lastObservation.intArray
    lastAction = self.lastAction.intArray
    lastStateId = SamplingUtility.getStateId(lastState)
    lastActionIdx = self.all_allowed_actions[lastStateId].index(tuple(lastAction))

    if reward == self.Bad_Action_Penalty:
        # The last action was illegal: drop it from both tables so it can
        # never be chosen again, then re-select for the same observation.
        self.all_allowed_actions[lastStateId].pop(lastActionIdx)
        self.Q_value_function[lastStateId].pop(lastActionIdx)
        newAction = self.egreedy(self.lastObservation.intArray)
        returnAction = Action()
        returnAction.intArray = newAction
        self.lastAction = copy.deepcopy(returnAction)
        return returnAction

    newState = observation.intArray
    newAction = self.egreedy(newState)
    if isinstance(newAction, tuple):
        newAction = list(newAction)

    # SARSA TD update.  Reuse the ids/indices computed above instead of
    # re-deriving them with repeated getStateId()/list.index() calls
    # (the original recomputed them up to three times per step).
    newStateId = SamplingUtility.getStateId(newState)
    newActionIdx = self.all_allowed_actions[newStateId].index(tuple(newAction))
    Q_sa = self.Q_value_function[lastStateId][lastActionIdx]
    Q_sprime_aprime = self.Q_value_function[newStateId][newActionIdx]
    new_Q_sa = Q_sa + self.sarsa_stepsize * (
        reward + self.sarsa_gamma * Q_sprime_aprime - Q_sa)
    if not self.policyFrozen:
        self.Q_value_function[lastStateId][lastActionIdx] = new_Q_sa

    returnAction = Action()
    returnAction.intArray = newAction
    self.lastAction = copy.deepcopy(returnAction)
    self.lastObservation = copy.deepcopy(observation)
    return returnAction
def agent_end(self, reward):
    """Terminal SARSA update: move Q(s,a) toward the final reward.

    There is no successor state at episode end, so the target is just
    ``reward`` (equivalently gamma * Q(s',a') == 0).

    :param reward: the terminal reward for the last action taken.
    """
    lastState = self.lastObservation.intArray
    lastAction = self.lastAction.intArray
    # Compute the Q-table coordinates once; the original re-derived the
    # state id and action index four times via getStateId()/list.index().
    lastStateId = SamplingUtility.getStateId(lastState)
    lastActionIdx = self.all_allowed_actions[lastStateId].index(tuple(lastAction))
    Q_sa = self.Q_value_function[lastStateId][lastActionIdx]
    new_Q_sa = Q_sa + self.sarsa_stepsize * (reward - Q_sa)
    if not self.policyFrozen:
        self.Q_value_function[lastStateId][lastActionIdx] = new_Q_sa
def random_player(self, state):
    """Return a uniformly random allowed action for *state*.

    The allowed-action cache entry for the state's id is refreshed before
    sampling; the Q-value table is deliberately left untouched here.
    """
    stateId = SamplingUtility.getStateId(state)
    actions = InvasiveUtility.getActions(state, self.nbrReaches, self.habitatSize)
    self.all_allowed_actions[stateId] = actions
    pick = self.randGenerator.randint(0, len(actions) - 1)
    return actions[pick]
def agent_step(self, reward, observation):
    """One learning step: TD-update Q(s,a) and select the next action.

    Same shape as the SARSA step but parameterized by ``self.stepsize`` and
    ``self.discount``.  When the policy is a pure random player the TD-update
    section below can be disabled (it only writes when ``policyFrozen`` is
    False), as the original in-line notes indicated.

    :param reward: scalar reward for the previous action.
    :param observation: new observation; its intArray encodes the state.
    :return: the Action object to execute next.
    """
    lastState = self.lastObservation.intArray
    lastAction = self.lastAction.intArray
    lastStateId = SamplingUtility.getStateId(lastState)
    lastActionIdx = self.all_allowed_actions[lastStateId].index(tuple(lastAction))

    if reward == self.Bad_Action_Penalty:
        # Illegal action: remove it from both tables and re-choose for the
        # same (unchanged) observation.
        self.all_allowed_actions[lastStateId].pop(lastActionIdx)
        self.Q_value_function[lastStateId].pop(lastActionIdx)
        newAction = self.egreedy(self.lastObservation.intArray)
        returnAction = Action()
        returnAction.intArray = newAction
        self.lastAction = copy.deepcopy(returnAction)
        return returnAction

    newState = observation.intArray
    # For a random player, egreedy is substituted by random_player.
    newAction = self.egreedy(newState)
    if isinstance(newAction, tuple):
        newAction = list(newAction)

    # TD update; coordinates are computed once and reused instead of the
    # original's repeated getStateId()/list.index() calls.
    newStateId = SamplingUtility.getStateId(newState)
    newActionIdx = self.all_allowed_actions[newStateId].index(tuple(newAction))
    Q_sprime_aprime = self.Q_value_function[newStateId][newActionIdx]
    Q_sa = self.Q_value_function[lastStateId][lastActionIdx]
    new_Q_sa = Q_sa + self.stepsize * (
        reward + self.discount * Q_sprime_aprime - Q_sa)
    if not self.policyFrozen:
        self.Q_value_function[lastStateId][lastActionIdx] = new_Q_sa

    returnAction = Action()
    returnAction.intArray = newAction
    self.lastAction = copy.deepcopy(returnAction)
    self.lastObservation = copy.deepcopy(observation)
    return returnAction
def egreedy(self, state):
    """Epsilon-greedy action selection over the allowed actions of *state*.

    Lazily initializes the allowed-action list and a zeroed Q-row for unseen
    states.  With probability ``sarsa_epsilon`` (while exploration is not
    frozen) a random allowed action is returned; otherwise the first action
    with the maximal Q-value (ties broken by lowest index).
    """
    stateId = SamplingUtility.getStateId(state)
    # `stateId not in dict` replaces the Python-2-only dict.has_key() and
    # the redundant `len(...) == 0` pre-check (an empty dict contains no
    # keys, so membership alone covers both cases).  Works on 2 and 3.
    if stateId not in self.Q_value_function:
        self.all_allowed_actions[stateId] = InvasiveUtility.getActions(
            state, self.nbrReaches, self.habitatSize)
        self.Q_value_function[stateId] = len(self.all_allowed_actions[stateId]) * [0.0]
    if not self.exploringFrozen and self.randGenerator.random() < self.sarsa_epsilon:
        # Explore: uniform over the currently-allowed actions.
        index = self.randGenerator.randint(0, len(self.all_allowed_actions[stateId]) - 1)
    else:
        # Exploit: first index achieving the maximum Q-value.
        index = self.Q_value_function[stateId].index(max(self.Q_value_function[stateId]))
    return self.all_allowed_actions[stateId][index]
def random_player(self, state):
    """Sample one action uniformly at random from the actions legal in *state*.

    Also overwrites the cached allowed-action list for this state's id;
    no Q-values are created or modified by this method.
    """
    stateId = SamplingUtility.getStateId(state)
    self.all_allowed_actions[stateId] = InvasiveUtility.getActions(
        state, self.nbrReaches, self.habitatSize)
    allowed = self.all_allowed_actions[stateId]
    return allowed[self.randGenerator.randint(0, len(allowed) - 1)]
def agent_step(self, reward, observation):
    """One learning step: TD-update Q(s,a) and select the next action.

    Uses ``self.stepsize`` and ``self.discount`` for the update.  When the
    agent is run as a pure random player the update is a no-op anyway
    because ``policyFrozen`` guards the single write to the Q-table.

    :param reward: scalar reward for the previous action.
    :param observation: new observation; its intArray encodes the state.
    :return: the Action object to execute next.
    """
    lastState = self.lastObservation.intArray
    lastAction = self.lastAction.intArray
    lastStateId = SamplingUtility.getStateId(lastState)
    lastActionIdx = self.all_allowed_actions[lastStateId].index(tuple(lastAction))

    if reward == self.Bad_Action_Penalty:
        # Illegal action: purge it from both tables and pick a replacement
        # for the same observation (no TD update on this branch).
        self.all_allowed_actions[lastStateId].pop(lastActionIdx)
        self.Q_value_function[lastStateId].pop(lastActionIdx)
        newAction = self.egreedy(self.lastObservation.intArray)
        returnAction = Action()
        returnAction.intArray = newAction
        self.lastAction = copy.deepcopy(returnAction)
        return returnAction

    newState = observation.intArray
    # For a random player, egreedy is substituted by random_player.
    newAction = self.egreedy(newState)
    if isinstance(newAction, tuple):
        newAction = list(newAction)

    # TD update; the state id and action index are computed once and reused
    # rather than re-derived via repeated getStateId()/list.index() calls.
    newStateId = SamplingUtility.getStateId(newState)
    newActionIdx = self.all_allowed_actions[newStateId].index(tuple(newAction))
    Q_sprime_aprime = self.Q_value_function[newStateId][newActionIdx]
    Q_sa = self.Q_value_function[lastStateId][lastActionIdx]
    new_Q_sa = Q_sa + self.stepsize * (
        reward + self.discount * Q_sprime_aprime - Q_sa)
    if not self.policyFrozen:
        self.Q_value_function[lastStateId][lastActionIdx] = new_Q_sa

    returnAction = Action()
    returnAction.intArray = newAction
    self.lastAction = copy.deepcopy(returnAction)
    self.lastObservation = copy.deepcopy(observation)
    return returnAction
def agent_step(self, reward, observation):
    """SARSA step that also logs the budget cost of a rejected action.

    Identical to the plain SARSA step except that, when the previous action
    earned the bad-action penalty, the budget cost of that action is printed
    before a replacement action is returned.

    :param reward: scalar reward for the previous action.
    :param observation: new observation; its intArray encodes the state.
    :return: the Action object to execute next.
    """
    lastState = self.lastObservation.intArray
    lastAction = self.lastAction.intArray
    lastStateId = SamplingUtility.getStateId(lastState)
    lastActionIdx = self.all_allowed_actions[lastStateId].index(tuple(lastAction))

    if reward == self.Bad_Action_Penalty:
        # Illegal action: remove it from both tables, then log its budget
        # cost for diagnostics.  The parenthesized single-argument print
        # form behaves identically under Python 2 and Python 3 (the bare
        # `print X` statement is a syntax error on Python 3).
        self.all_allowed_actions[lastStateId].pop(lastActionIdx)
        self.Q_value_function[lastStateId].pop(lastActionIdx)
        newAction = self.egreedy(self.lastObservation.intArray)
        print(InvasiveUtility.get_budget_cost_actions(
            lastAction, lastState, self.actionParameterObj))
        returnAction = Action()
        returnAction.intArray = newAction
        self.lastAction = copy.deepcopy(returnAction)
        return returnAction

    newState = observation.intArray
    newAction = self.egreedy(newState)
    if isinstance(newAction, tuple):
        newAction = list(newAction)

    # SARSA TD update; coordinates are hoisted and reused instead of the
    # original's repeated getStateId()/list.index() recomputation.
    newStateId = SamplingUtility.getStateId(newState)
    newActionIdx = self.all_allowed_actions[newStateId].index(tuple(newAction))
    Q_sa = self.Q_value_function[lastStateId][lastActionIdx]
    Q_sprime_aprime = self.Q_value_function[newStateId][newActionIdx]
    new_Q_sa = Q_sa + self.sarsa_stepsize * (
        reward + self.sarsa_gamma * Q_sprime_aprime - Q_sa)
    if not self.policyFrozen:
        self.Q_value_function[lastStateId][lastActionIdx] = new_Q_sa

    returnAction = Action()
    returnAction.intArray = newAction
    self.lastAction = copy.deepcopy(returnAction)
    self.lastObservation = copy.deepcopy(observation)
    return returnAction