Example #1
    def start(self, observation):
        # First step of an episode: read the integer state out of the
        # observation and pick an action epsilon-greedily.
        state = observation.intArray[0]
        action = self.egreedy(state)
        return_action = Action()
        return_action.intArray = [action]

        # Remember the action and observation so the update in step()
        # can use the previous (state, action) pair.
        self.lastAction = copy.deepcopy(return_action)
        self.lastObservation = copy.deepcopy(observation)

        return return_action
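Examples #1, #2, and #4 are method bodies of an RL-Glue agent class and rely on the copy module and the codec's Action type being imported at module level. A minimal header, assuming the standard RL-Glue Python codec layout (the import path is an assumption, not shown in the snippets):

    import copy
    from rlglue.types import Action  # assumed RL-Glue Python codec import path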
Example #2
    def step(self, reward, observation):
        # Pick a random action, 0 or 1 (assumes both endpoints of randint
        # are inclusive, as with random.Random).
        int_action = self.rand_generator.randint(0, 1)
        return_action = Action()
        return_action.intArray = [int_action]

        # Store the action and observation on the agent so the next step can use them.
        self.last_action = copy.deepcopy(return_action)
        self.last_observation = copy.deepcopy(observation)

        return return_action
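Example #2 does not show where self.rand_generator comes from; it is presumably seeded in the agent's constructor. One possible setup, assuming Python's random.Random (whose randint includes both endpoints, matching the "0 or 1" comment); the class name, attribute, and seed handling here are assumptions, not part of the original snippet:

    import random

    class RandomAgent:
        def __init__(self, seed=0):
            # Hypothetical setup: seed the generator so runs are reproducible.
            # random.Random.randint(0, 1) returns 0 or 1, both inclusive.
            self.rand_generator = random.Random(seed)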
Example #3
    def choose_action(self):
        # Query the learner's policy at the last observed state and take
        # the action it recommends.
        pi_s = self._learner.policy(self._laststate)
        lastaction = pi_s.policy_action

        self._lastaction = lastaction
        assert self._lastaction is not None

        # Wrap the chosen action in an RL-Glue Action for the environment.
        return_action = Action()
        return_action.intArray = self._lastaction.tolist()
        return return_action
Example #4
    def step(self, reward, observation):
        state = observation.intArray[0]
        last_state = self.lastObservation.intArray[0]
        last_action = self.lastAction.intArray[0]

        # Choose the next action epsilon-greedily from the current state.
        action = self.egreedy(state)

        # SARSA update: Q(s,a) <- Q(s,a) + alpha * (r + gamma * Q(s',a') - Q(s,a)).
        Q_sa = self.value_function[last_state][last_action]
        Q_sprime_aprime = self.value_function[state][action]

        new_Q_sa = Q_sa + self.sarsa_stepsize * (reward + self.sarsa_gamma * Q_sprime_aprime - Q_sa)

        # Only learn while the policy is not frozen (e.g. during evaluation).
        if not self.policyFrozen:
            self.value_function[last_state][last_action] = new_Q_sa

        return_action = Action()
        return_action.intArray = [action]

        # Remember this step's action and observation for the next update.
        self.lastAction = copy.deepcopy(return_action)
        self.lastObservation = copy.deepcopy(observation)

        return return_action
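Examples #1 and #4 both call self.egreedy(state), which is not shown. A minimal sketch of such a helper for a tabular agent, assuming attributes self.sarsa_epsilon, self.numActions, and self.randGenerator (a random.Random instance); these names are assumptions, only value_function and policyFrozen appear in the snippets above:

    def egreedy(self, state):
        # Hypothetical helper: with probability epsilon take a random action,
        # otherwise act greedily with respect to the tabular value function.
        if not self.policyFrozen and self.randGenerator.random() < self.sarsa_epsilon:
            return self.randGenerator.randint(0, self.numActions - 1)
        # Greedy action: index of the largest Q-value for this state.
        return int(max(range(self.numActions),
                       key=lambda a: self.value_function[state][a]))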
Example #5
    def get_Action(self):
        return Action.from_AbstractType(self.get_AbstractType())