def evaluate(
    self,
    evaluator: Evaluator,
    logged_actions: Optional[np.ndarray],
    logged_propensities: Optional[np.ndarray],
    logged_rewards: Optional[np.ndarray],
    logged_values: Optional[np.ndarray],
):
    self.model_propensities, model_values_on_logged_actions, maxq_action_idxs = (
        None,
        None,
        None,
    )
    if self.all_action_scores is not None:
        # Move scores to host memory so the numpy-based metrics below can use them.
        self.all_action_scores = self.all_action_scores.cpu().numpy()
        self.model_propensities = Evaluator.softmax(
            self.all_action_scores, self.rl_temperature
        )
        maxq_action_idxs = self.all_action_scores.argmax(axis=1)
        if logged_actions is not None:
            # Q-value of the logged action: the one-hot logged action selects
            # one score per row via an elementwise product and row sum.
            model_values_on_logged_actions = np.sum(
                (logged_actions * self.all_action_scores), axis=1, keepdims=True
            )

    evaluator.report(
        self.loss.cpu().numpy(),
        logged_actions,
        logged_propensities,
        logged_rewards,
        logged_values,
        self.model_propensities,
        self.all_action_scores,
        model_values_on_logged_actions,
        maxq_action_idxs,
    )
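# A minimal standalone sketch of the temperature-scaled softmax assumed to
# back Evaluator.softmax above: scores are divided by the temperature before
# normalizing, so lower temperatures sharpen the propensity distribution
# toward the greedy action. This is an illustration under that assumption,
# not the library's actual implementation.
import numpy as np

def softmax_with_temperature(scores: np.ndarray, temperature: float) -> np.ndarray:
    # Scale by temperature, then subtract the row max for numerical stability.
    scaled = scores / temperature
    scaled = scaled - scaled.max(axis=1, keepdims=True)
    exp = np.exp(scaled)
    return exp / exp.sum(axis=1, keepdims=True)

# Example: at temperature 0.35, mass concentrates on the max-Q action.
# softmax_with_temperature(np.array([[1.0, 2.0, 0.5]]), 0.35)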
def evaluate(
    self,
    evaluator: Evaluator,
    logged_actions: Optional[np.ndarray],
    logged_propensities: Optional[np.ndarray],
    logged_values: Optional[np.ndarray],
):
    # Run the scoring net and pull its outputs from the Caffe2 workspace.
    workspace.RunNet(self.all_q_score_model.net)
    all_action_scores = workspace.FetchBlob(self.all_q_score_output)
    maxq_action_idxs = workspace.FetchBlob(self.maxq_action_idxs)
    # logged_actions is Optional, so only compute the per-action dot product
    # when it is present.
    model_values_on_logged_actions = None
    if logged_actions is not None:
        model_values_on_logged_actions = np.sum(
            (logged_actions * all_action_scores), axis=1, keepdims=True
        )
    model_propensities = Evaluator.softmax(all_action_scores, self.rl_temperature)
    logged_rewards = workspace.FetchBlob("rewards")

    evaluator.report(
        workspace.FetchBlob(self.loss_blob),
        logged_actions,
        logged_propensities,
        logged_rewards,
        logged_values,
        model_propensities,
        all_action_scores,
        model_values_on_logged_actions,
        maxq_action_idxs,
    )
def evaluate(self, evaluator: Evaluator, logged_value: Optional[torch.Tensor]):
    evaluator.report(
        self.loss.cpu().numpy(),
        None,
        None,
        None,
        logged_value.cpu().numpy() if logged_value is not None else None,
        None,
        None,
        None,
        self.all_action_scores.cpu().numpy(),
        None,
    )
def evaluate(self, evaluator: Evaluator):  # FIXME
    evaluator.report(
        self.loss.cpu().numpy(),
        None,
        None,
        None,
        None,
        None,
        None,
        None,
        self.all_action_scores.cpu().numpy(),
        None,
    )
def evaluate(
    self,
    evaluator: Evaluator,
    logged_actions: Optional[np.ndarray],
    logged_propensities: Optional[np.ndarray],
    logged_values: Optional[np.ndarray],
):
    # logged_actions and logged_propensities are accepted for interface
    # compatibility but are not reported by this variant.
    evaluator.report(
        self.loss.cpu().numpy(),
        None,
        None,
        None,
        logged_values,
        None,
        None,
        self.all_action_scores.cpu().numpy(),
        None,
    )
def evaluate(
    self,
    evaluator: Evaluator,
    logged_actions: torch.Tensor,
    logged_propensities: Optional[torch.Tensor],
    logged_rewards: torch.Tensor,
    logged_values: Optional[torch.Tensor],
):
    self.model_propensities, model_values_on_logged_actions, maxq_action_idxs = (
        None,
        None,
        None,
    )
    if self.all_action_scores is not None:
        self.model_propensities = Evaluator.softmax(
            self.all_action_scores.cpu().numpy(), self.rl_temperature
        )
        maxq_action_idxs = self.all_action_scores.argmax(dim=1, keepdim=True)
        if logged_actions is not None:
            # Q-value of the logged action: the one-hot logged action selects
            # one score per row via an elementwise product and row sum.
            model_values_on_logged_actions = (
                torch.sum(
                    (logged_actions * self.all_action_scores), dim=1, keepdim=True
                )
                .cpu()
                .numpy()
            )

    evaluator.report(
        self.loss.cpu().numpy(),
        logged_actions.cpu().numpy(),
        logged_propensities.cpu().numpy()
        if logged_propensities is not None
        else None,
        logged_rewards.cpu().numpy(),
        logged_values.cpu().numpy() if logged_values is not None else None,
        self.model_propensities,
        self.reward_estimates.cpu().numpy(),
        self.all_action_scores.cpu().numpy(),
        model_values_on_logged_actions,
        maxq_action_idxs,
    )
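# Illustration of the model_values_on_logged_actions computation above: with
# one-hot logged actions, the elementwise product followed by a row sum picks
# out the Q-value of the action that was actually taken. Hypothetical
# standalone example, not part of the trainer.
import torch

all_action_scores = torch.tensor([[1.0, 3.0, 2.0], [0.5, 0.1, 0.9]])
logged_actions = torch.tensor([[0.0, 1.0, 0.0], [0.0, 0.0, 1.0]])  # one-hot
model_values = torch.sum(logged_actions * all_action_scores, dim=1, keepdim=True)
# model_values == tensor([[3.0], [0.9]])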
def evaluate(
    self,
    evaluator: Evaluator,
    logged_actions: Optional[np.ndarray],
    logged_propensities: Optional[np.ndarray],
    logged_values: Optional[np.ndarray],
):
    # Run the Q-score net and fetch the model's value for each logged
    # state/action pair; this variant reports no per-action scores.
    workspace.RunNet(self.q_score_model.net)
    model_values_on_logged_actions = workspace.FetchBlob(self.q_score_output)

    evaluator.report(
        workspace.FetchBlob(self.loss_blob),
        None,
        None,
        None,
        logged_values,
        None,
        None,
        model_values_on_logged_actions,
    )
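# A hedged sketch of how these evaluate() hooks are typically driven: after
# each training step, the caller passes the logged quantities so the Evaluator
# can accumulate its metrics. The trainer interface, batch field names, and
# loop structure here are assumptions for illustration, not the library's
# actual API; the signature mirrors the first variant above.
def evaluate_batches(trainer, evaluator, batches):
    for batch in batches:
        trainer.train(batch)  # populates trainer.loss / trainer.all_action_scores
        trainer.evaluate(
            evaluator,
            batch.get("actions"),         # one-hot logged actions, or None
            batch.get("propensities"),    # logging-policy propensities, or None
            batch.get("rewards"),         # logged rewards, or None
            batch.get("episode_values"),  # logged discounted returns, or None
        )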