def policy(self, state, turn, available, epsilon=0.08):
    # Epsilon-greedy policy over state values: player 1 picks the successor
    # state with the highest value, player 2 the one with the lowest.
    maxvalue = -99999
    minvalue = 99999
    action_list = []
    if np.random.rand(1) < epsilon:
        # Explore: any available move is a candidate.
        action_list = available
    else:
        if turn == 1:
            for i in available:
                # Try the move, look up the resulting state's value, then undo.
                state[i] = turn
                key = encode(state)
                if self.value[key] > maxvalue:
                    action_list = []
                    maxvalue = self.value[key]
                    action_list.append(i)
                elif self.value[key] == maxvalue:
                    action_list.append(i)
                state[i] = 0
        else:
            for i in available:
                state[i] = turn
                key = encode(state)
                if self.value[key] < minvalue:
                    action_list = []
                    minvalue = self.value[key]
                    action_list.append(i)
                elif self.value[key] == minvalue:
                    action_list.append(i)
                state[i] = 0
    return random.choice(action_list)
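
# The snippets in this section lean on a few helpers that are not shown here:
# encode/decode, available_actions, and is_finished. A minimal sketch of what
# they might look like, assuming a board stored as a list of 9 cells with
# 0 = empty, 1 = player 1, 2 = player 2 (these implementations are
# assumptions, not taken from the source):
import itertools
import random

import numpy as np

def encode(state):
    # Serialize the board into a string key, e.g. [0, 1, 2, ...] -> "012...".
    return ''.join(str(cell) for cell in state)

def decode(encoded):
    # Inverse of encode: turn a string key back into a list of ints.
    return [int(ch) for ch in encoded]

def available_actions(state):
    # Indices of the empty cells.
    return [i for i, cell in enumerate(state) if cell == 0]

WIN_LINES = [(0, 1, 2), (3, 4, 5), (6, 7, 8),   # rows
             (0, 3, 6), (1, 4, 7), (2, 5, 8),   # columns
             (0, 4, 8), (2, 4, 6)]              # diagonals

def is_finished(state):
    # Returns (done, winner); winner is 0 for a draw or an unfinished game.
    for a, b, c in WIN_LINES:
        if state[a] != 0 and state[a] == state[b] == state[c]:
            return True, state[a]
    if not available_actions(state):
        return True, 0
    return False, 0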

def policy(self, state, turn, epsilon=0.08):
    # Epsilon-greedy policy over action values: player 1 maximizes Q(s, a),
    # player 2 minimizes it.
    maxvalue = -99999
    minvalue = 99999
    encoded_state = encode(state)
    available = available_actions(state)
    action_list = []
    if np.random.rand(1) < epsilon:
        action_list = available
    else:
        if turn == 1:
            for action in available:
                encoded = encoded_state + str(action)
                if self.action_value[encoded] > maxvalue:
                    action_list = []
                    maxvalue = self.action_value[encoded]
                    action_list.append(action)
                elif self.action_value[encoded] == maxvalue:
                    action_list.append(action)
        else:
            for action in available:
                encoded = encoded_state + str(action)
                if self.action_value[encoded] < minvalue:
                    action_list = []
                    minvalue = self.action_value[encoded]
                    action_list.append(action)
                elif self.action_value[encoded] == minvalue:
                    action_list.append(action)
    return random.choice(action_list)

def init_value(self):
    # Enumerate all 3^9 boards and start every (state, action) value at zero.
    state_list = itertools.product([0, 1, 2], repeat=9)
    for state in state_list:
        state = list(state)
        encoded_state = encode(state)
        available = available_actions(state)
        for action in available:
            encoded = encoded_state + str(action)
            self.action_value[encoded] = 0

def init_value(self):
    # Non-terminal states get a small random value and a random initial
    # policy; terminal states are fixed at zero.
    state_list = itertools.product([0, 1, 2], repeat=9)
    for state in state_list:
        state = list(state)
        encoded = encode(state)
        done, winner = is_finished(state)
        if not done:
            self._value[encoded] = random.uniform(-0.5, 0.5)
            self._policy[encoded] = random.choice(available_actions(state))
        else:
            # terminal state value
            self._value[encoded] = 0

def init_value(self):
    # Terminal states anchor the value table: +1 if player 1 has won,
    # -1 if player 2 has won, 0 for draws and unfinished games.
    state_list = itertools.product([0, 1, 2], repeat=9)
    for state in state_list:
        state = list(state)
        done, winner = is_finished(state)
        encoded = encode(state)
        if not done:
            self.value[encoded] = 0
        elif winner == 1:
            self.value[encoded] = 1
        elif winner == 2:
            self.value[encoded] = -1
        else:
            self.value[encoded] = 0

def assign_policy(self, state, x):
    encoded = encode(state)
    self._policy[encoded] = x

def policy(self, state):
    encoded = encode(state)
    return self._policy[encoded]

def assign_value(self, state, x):
    encoded = encode(state)
    self._value[encoded] = x

def value(self, state):
    encoded = encode(state)
    return self._value[encoded]

def update(agent, state, next_state, learning_rate=0.4):
    # TD(0) backup: move the current state's value toward the value of the
    # state that actually followed it.
    s = encode(state)
    s_next = encode(next_state)
    agent.value[s] += learning_rate * (agent.value[s_next] - agent.value[s])
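
# A hypothetical self-play loop (an assumption, not part of the source
# snippets) showing where update() fits: after each move, the previous state
# is backed up toward the state that followed it. With the terminal values
# from the init_value variant above (+1 / -1 / 0), wins and losses propagate
# backward through the value table over many episodes.
def play_episode(agent):
    state = [0] * 9                       # empty board
    turn = 1                              # player 1 moves first
    done, _ = is_finished(state)
    while not done:
        action = agent.policy(state, turn, available_actions(state))
        next_state = list(state)
        next_state[action] = turn
        update(agent, state, next_state)  # TD(0) backup toward the successor
        state = next_state
        turn = 2 if turn == 1 else 1      # alternate players
        done, _ = is_finished(state)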

def assign_q(self, state, action, x):
    encoded = encode(state) + str(action)
    self.action_value[encoded] = x

def q(self, state, action):
    encoded = encode(state) + str(action)
    return self.action_value[encoded]
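
# Hypothetical usage of the accessors; Agent stands in for whichever class
# holds the action_value table and the methods above (the class name and
# values here are assumptions, for illustration only):
agent = Agent()
agent.init_value()              # the action-value variant: every Q(s, a) = 0
board = [0] * 9
agent.assign_q(board, 4, 0.25)  # store Q(empty board, center move) = 0.25
print(agent.q(board, 4))        # 0.25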