def start(self, observation):
    state = observation.intArray[0]
    action = self.egreedy(state)
    return_action = Action()
    return_action.intArray = [action]
    self.lastAction = copy.deepcopy(return_action)
    self.lastObservation = copy.deepcopy(observation)
    return return_action
def step(self, reward, observation):
    # Generate a random action, 0 or 1
    int_action = self.rand_generator.randint(0, 1)
    return_action = Action()
    return_action.intArray = [int_action]
    # Store the action and observation on the agent so they persist past this call
    self.last_action = copy.deepcopy(return_action)
    self.last_observation = copy.deepcopy(observation)
    return return_action
def choose_action(self):
    # Query the learner's policy at the last state and take its recommended action
    pi_s = self._learner.policy(self._laststate)
    lastaction = pi_s.policy_action
    self._lastaction = lastaction
    assert self._lastaction is not None
    return_action = Action()
    return_action.intArray = self._lastaction.tolist()
    return return_action
def step(self, reward, observation):
    state = observation.intArray[0]
    last_state = self.lastObservation.intArray[0]
    last_action = self.lastAction.intArray[0]

    # Choose the next action epsilon-greedily, then apply the SARSA update:
    # Q(s,a) <- Q(s,a) + alpha * (r + gamma * Q(s',a') - Q(s,a))
    action = self.egreedy(state)
    Q_sa = self.value_function[last_state][last_action]
    Q_sprime_aprime = self.value_function[state][action]
    new_Q_sa = Q_sa + self.sarsa_stepsize * (reward + self.sarsa_gamma * Q_sprime_aprime - Q_sa)
    if not self.policyFrozen:
        self.value_function[last_state][last_action] = new_Q_sa

    return_action = Action()
    return_action.intArray = [action]
    self.lastAction = copy.deepcopy(return_action)
    self.lastObservation = copy.deepcopy(observation)
    return return_action
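# The start/step methods above call an egreedy helper that is not shown in this
# section. The following is only a minimal sketch, under the assumption that
# value_function is a per-state list of action values and that rand_generator,
# sarsa_epsilon, numActions, and exploringFrozen exist with the obvious meanings.
def egreedy(self, state):
    # Explore with probability sarsa_epsilon (unless exploration is frozen),
    # otherwise act greedily with respect to the current value function.
    if not self.exploringFrozen and self.rand_generator.random() < self.sarsa_epsilon:
        return self.rand_generator.randint(0, self.numActions - 1)
    return self.value_function[state].index(max(self.value_function[state]))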
def get_Action(self):
    return Action.from_AbstractType(self.get_AbstractType())