def eval_step(self, state):
    ''' Predict the action for evaluation purposes.

    Args:
        state (dict): The current state.

    Returns:
        best_action (int): An action id.
        probs (numpy.array): The probabilities over legal actions.
    '''
    q_values = self.q_estimator.predict_nograd(
        np.expand_dims(state['obs'], 0))[0]
    # Exponentiate the Q-values so they are positive, then mask out illegal
    # actions and renormalize (a softmax restricted to the legal actions).
    probs = remove_illegal(np.exp(q_values), state['legal_actions'])
    best_action = np.argmax(probs)
    return best_action, probs
def step(self, state):
    ''' Predict the action for generating training data, with the
    predictions disconnected from the computation graph.

    Args:
        state (dict): The current state.

    Returns:
        action (int): An action id.
    '''
    A = self.predict(state['obs'])
    A = remove_illegal(A, state['legal_actions'])
    # Sample an action from the predicted distribution over legal actions.
    action = np.random.choice(np.arange(len(A)), p=A)
    return action
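# The step/eval_step methods in this module all lean on remove_illegal to
# zero out illegal actions and renormalize the remaining probability mass.
# The helper below is a minimal sketch of that behavior, not necessarily the
# exact utility this codebase imports; it assumes legal_actions is a list of
# integer action ids and that numpy is imported as np at the module top.
def _remove_illegal_sketch(action_probs, legal_actions):
    ''' Mask out illegal actions and renormalize the remaining mass.

    Args:
        action_probs (numpy.array): Unnormalized scores over all actions.
        legal_actions (list): Ids of the actions legal in the current state.

    Returns:
        probs (numpy.array): A valid distribution supported on legal actions.
    '''
    probs = np.zeros(action_probs.shape[0])
    probs[legal_actions] = action_probs[legal_actions]
    if np.sum(probs) == 0:
        # Every legal action had zero mass; fall back to uniform over them.
        probs[legal_actions] = 1.0 / len(legal_actions)
    else:
        probs /= np.sum(probs)
    return probs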
def eval_step(self, state):
    ''' Predict the action for evaluation, using either the average policy
    or the inner RL agent's best response, depending on self.evaluate_with.

    Args:
        state (dict): The current state.

    Returns:
        action (int): An action id.
        probs (numpy.array): The probabilities over legal actions.
    '''
    if self.evaluate_with == 'best_response':
        action, probs = self._rl_agent.eval_step(state)
    elif self.evaluate_with == 'average_policy':
        obs = state['obs']
        legal_actions = state['legal_actions']
        probs = self._act(obs)
        probs = remove_illegal(probs, legal_actions)
        action = np.random.choice(len(probs), p=probs)
    else:
        raise ValueError(
            "'evaluate_with' should be either 'average_policy' or 'best_response'.")
    return action, probs
def step(self, state):
    ''' Returns the action to be taken.

    Args:
        state (dict): The current state.

    Returns:
        action (int): An action id.
    '''
    obs = state['obs']
    legal_actions = state['legal_actions']
    if self._mode == MODE.best_response:
        probs = self._rl_agent.predict(obs)
        self._add_transition(obs, probs)
    elif self._mode == MODE.average_policy:
        probs = self._act(obs)
    # Both modes sample from the masked, renormalized distribution.
    probs = remove_illegal(probs, legal_actions)
    action = np.random.choice(len(probs), p=probs)
    return action
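# A hedged usage sketch of how step and eval_step fit into a rollout. The
# env object (reset, is_over, step, payoff) is a hypothetical stand-in for
# whatever environment API this agent is paired with; only agent.step and
# agent.eval_step correspond to the methods defined above.
def _demo_rollout(agent, env, evaluation=False):
    ''' Play one trajectory: sample from the stochastic policy during
    training (step), and use the evaluation policy otherwise (eval_step).
    '''
    state = env.reset()            # hypothetical: returns a state dict
    while not env.is_over():       # hypothetical terminal check
        if evaluation:
            action, _probs = agent.eval_step(state)
        else:
            action = agent.step(state)
        state = env.step(action)   # hypothetical transition function
    return env.payoff()            # hypothetical final payoff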