def evaluate(
    self,
    evaluator: Evaluator,
    logged_actions: Optional[np.ndarray],
    logged_propensities: Optional[np.ndarray],
    logged_values: Optional[np.ndarray],
):
    workspace.RunNet(self.all_q_score_model.net)
    all_action_scores = workspace.FetchBlob(self.all_q_score_output)
    maxq_action_idxs = workspace.FetchBlob(self.maxq_action_idxs)
    model_values_on_logged_actions = np.sum(
        (logged_actions * all_action_scores), axis=1, keepdims=True
    )
    model_propensities = Evaluator.softmax(all_action_scores, self.rl_temperature)
    logged_rewards = workspace.FetchBlob("rewards")
    evaluator.report(
        workspace.FetchBlob(self.loss_blob),
        logged_actions,
        logged_propensities,
        logged_rewards,
        logged_values,
        model_propensities,
        all_action_scores,
        model_values_on_logged_actions,
        maxq_action_idxs,
    )
def evaluate(
    self,
    evaluator: Evaluator,
    logged_actions: Optional[np.ndarray],
    logged_propensities: Optional[np.ndarray],
    logged_rewards: Optional[np.ndarray],
    logged_values: Optional[np.ndarray],
):
    self.model_propensities, model_values_on_logged_actions, maxq_action_idxs = (
        None,
        None,
        None,
    )
    if self.all_action_scores is not None:
        self.all_action_scores = self.all_action_scores.cpu().numpy()
        self.model_propensities = Evaluator.softmax(
            self.all_action_scores, self.rl_temperature
        )
        maxq_action_idxs = self.all_action_scores.argmax(axis=1)
        if logged_actions is not None:
            model_values_on_logged_actions = np.sum(
                (logged_actions * self.all_action_scores), axis=1, keepdims=True
            )
    evaluator.report(
        self.loss.cpu().numpy(),
        logged_actions,
        logged_propensities,
        logged_rewards,
        logged_values,
        self.model_propensities,
        self.all_action_scores,
        model_values_on_logged_actions,
        maxq_action_idxs,
    )
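# Illustrative aside (not part of the library): the one-hot trick used above.
# Multiplying the one-hot logged actions by the per-action score matrix and
# summing across the action axis picks out the model's score for the action
# that was actually logged. The values below are made up for the demo.
import numpy as np

all_action_scores = np.array([[1.0, 2.0, 3.0],
                              [4.0, 5.0, 6.0]])
logged_actions = np.array([[0.0, 1.0, 0.0],   # action 1 was logged
                           [1.0, 0.0, 0.0]])  # action 0 was logged
model_values_on_logged_actions = np.sum(
    logged_actions * all_action_scores, axis=1, keepdims=True
)
assert model_values_on_logged_actions.tolist() == [[2.0], [5.0]]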
def __init__(
    self, env, assume_optimal_policy: bool, use_int_features: bool = False
) -> None:
    Evaluator.__init__(self, 1)

    self._env = env
    samples = env.generate_samples(200000, 1.0)
    self.logged_states = samples.states
    self.logged_actions = samples.actions
    self.logged_propensities = np.array(samples.propensities).reshape(-1, 1)

    # Create integer logged actions
    self.logged_actions_int: List[int] = []
    for action in self.logged_actions:
        self.logged_actions_int.append(self._env.action_to_index(action))

    self.logged_actions_one_hot = np.zeros(
        [len(self.logged_actions), len(env.ACTIONS)], dtype=np.float32
    )
    for i, action in enumerate(self.logged_actions):
        self.logged_actions_one_hot[i, env.action_to_index(action)] = 1

    self.logged_values = env.true_values_for_sample(
        self.logged_states, self.logged_actions, assume_optimal_policy
    )
    self.logged_rewards = env.true_rewards_for_sample(
        self.logged_states, self.logged_actions
    )

    self.estimated_ltv_values = np.zeros(
        [len(self.logged_states), len(self._env.ACTIONS)], dtype=np.float32
    )
    for action in range(len(self._env.ACTIONS)):
        self.estimated_ltv_values[:, action] = self._env.true_values_for_sample(
            self.logged_states,
            [self._env.index_to_action(action)] * len(self.logged_states),
            True,
        ).flatten()

    self.estimated_reward_values = np.zeros(
        [len(self.logged_states), len(self._env.ACTIONS)], dtype=np.float32
    )
    for action in range(len(self._env.ACTIONS)):
        self.estimated_reward_values[:, action] = self._env.true_rewards_for_sample(
            self.logged_states,
            [self._env.index_to_action(action)] * len(self.logged_states),
        ).flatten()

    self.use_int_features = use_int_features
def evaluate(self, evaluator: Evaluator, logged_value: Optional[torch.Tensor]):
    evaluator.report(
        self.loss.cpu().numpy(),
        None,
        None,
        None,
        logged_value.cpu().numpy() if logged_value is not None else None,
        None,
        None,
        None,
        self.all_action_scores.cpu().numpy(),
        None,
    )
def evaluate(self, evaluator: Evaluator):
    # FIXME
    evaluator.report(
        self.loss.cpu().numpy(),
        None,
        None,
        None,
        None,
        None,
        None,
        None,
        self.all_action_scores.cpu().numpy(),
        None,
    )
def test_evaluator_ground_truth(self):
    environment = GridworldContinuous()
    states, actions, rewards, next_states, next_actions, is_terminal, \
        possible_next_actions, _ = environment.generate_samples(100000, 1.0)
    true_values = environment.true_values_for_sample(states, actions, False)
    # Hijack the reward timeline to insert the ground truth
    reward_timelines = []
    for tv in true_values:
        reward_timelines.append({0: tv})
    trainer = self.get_sarsa_trainer(environment)
    evaluator = Evaluator(trainer, DISCOUNT)
    tdps = environment.preprocess_samples(
        states,
        actions,
        rewards,
        next_states,
        next_actions,
        is_terminal,
        possible_next_actions,
        reward_timelines,
        self.minibatch_size,
    )
    for tdp in tdps:
        trainer.train_numpy(tdp, evaluator)
    self.assertLess(evaluator.td_loss[-1], 0.05)
    self.assertLess(evaluator.mc_loss[-1], 0.12)
def policy(self, states):
    with core.DeviceScope(self.c2_device):
        if isinstance(self.trainer, DiscreteActionTrainer):
            workspace.FeedBlob("states", states)
        elif isinstance(self.trainer, ContinuousActionDQNTrainer):
            num_actions = len(self.trainer.action_normalization_parameters)
            actions = np.eye(num_actions, dtype=np.float32)
            actions = np.tile(actions, reps=(len(states), 1))
            states = np.repeat(states, repeats=num_actions, axis=0)
            workspace.FeedBlob("states", states)
            workspace.FeedBlob("actions", actions)
        else:
            raise NotImplementedError("Invalid trainer passed to GymPredictor")
        workspace.RunNetOnce(self.trainer.internal_policy_model.net)
        policy_output_blob = self.trainer.internal_policy_output
        q_scores = workspace.FetchBlob(policy_output_blob)
        if isinstance(self.trainer, DiscreteActionTrainer):
            assert q_scores.shape[0] == 1
            q_scores = q_scores[0]
        q_scores_softmax = Evaluator.softmax(
            q_scores.reshape(1, -1), self.trainer.rl_temperature
        )[0]
        if np.isnan(q_scores_softmax).any() or np.max(q_scores_softmax) < 1e-3:
            q_scores_softmax[:] = 1.0 / q_scores_softmax.shape[0]
        policies = [
            np.argmax(q_scores),
            np.random.choice(q_scores.shape[0], p=q_scores_softmax),
        ]
        return policies
def evaluate(
    self,
    evaluator: Evaluator,
    logged_actions: Optional[np.ndarray],
    logged_propensities: Optional[np.ndarray],
    logged_values: Optional[np.ndarray],
):
    evaluator.report(
        self.loss.cpu().numpy(),
        None,
        None,
        None,
        logged_values,
        None,
        None,
        self.all_action_scores.cpu().numpy(),
        None,
    )
def evaluate(
    self,
    evaluator: Evaluator,
    logged_actions: torch.Tensor,
    logged_propensities: Optional[torch.Tensor],
    logged_rewards: torch.Tensor,
    logged_values: Optional[torch.Tensor],
):
    self.model_propensities, model_values_on_logged_actions, maxq_action_idxs = (
        None,
        None,
        None,
    )
    if self.all_action_scores is not None:
        self.model_propensities = Evaluator.softmax(
            self.all_action_scores.cpu().numpy(), self.rl_temperature
        )
        maxq_action_idxs = self.all_action_scores.argmax(dim=1, keepdim=True)
        if logged_actions is not None:
            model_values_on_logged_actions = (
                torch.sum(
                    (logged_actions * self.all_action_scores), dim=1, keepdim=True
                )
                .cpu()
                .numpy()
            )
    evaluator.report(
        self.loss.cpu().numpy(),
        logged_actions.cpu().numpy(),
        logged_propensities.cpu().numpy()
        if logged_propensities is not None
        else None,
        logged_rewards.cpu().numpy(),
        logged_values.cpu().numpy() if logged_values is not None else None,
        self.model_propensities,
        self.reward_estimates.cpu().numpy(),
        self.all_action_scores.cpu().numpy(),
        model_values_on_logged_actions,
        maxq_action_idxs,
    )
def evaluate(
    self,
    evaluator: Evaluator,
    logged_actions: Optional[np.ndarray],
    logged_propensities: Optional[np.ndarray],
    logged_values: Optional[np.ndarray],
):
    workspace.RunNet(self.q_score_model.net)
    model_values_on_logged_actions = workspace.FetchBlob(self.q_score_output)
    evaluator.report(
        workspace.FetchBlob(self.loss_blob),
        None,
        None,
        None,
        logged_values,
        None,
        None,
        model_values_on_logged_actions,
    )
def test_evaluator_timeline(self):
    environment = Gridworld()
    samples = environment.generate_samples(100000, 1.0)
    trainer = self.get_sarsa_trainer(environment)
    evaluator = Evaluator(1)
    tdps = environment.preprocess_samples(samples, self.minibatch_size)
    for tdp in tdps:
        trainer.train_numpy(tdp, evaluator)
    self.assertLess(evaluator.td_loss[-1], 0.2)
    self.assertLess(evaluator.mc_loss[-1], 0.2)
def test_evaluator_ground_truth(self):
    environment = GridworldContinuous()
    samples = environment.generate_samples(500000, 1.0, DISCOUNT)
    # Hijack the episode values to insert the ground truth
    samples.episode_values = environment.true_values_for_sample(
        samples.states, samples.actions, False
    )
    trainer = self.get_sarsa_trainer(environment)
    evaluator = Evaluator(None, 10, DISCOUNT, None, None)
    tdps = environment.preprocess_samples(samples, self.minibatch_size)
    for tdp in tdps:
        trainer.train(tdp, evaluator)
    self.assertLess(evaluator.mc_loss[-1], 0.15)
def test_evaluator_timeline(self, environment):
    states, actions, rewards, next_states, next_actions, is_terminal, \
        possible_next_actions, reward_timelines = \
        environment.generate_samples(100000, 1.0)
    trainer = self.get_sarsa_trainer(environment)
    evaluator = Evaluator(trainer, DISCOUNT)
    tdp = environment.preprocess_samples(
        states,
        actions,
        rewards,
        next_states,
        next_actions,
        is_terminal,
        possible_next_actions,
        reward_timelines,
    )
    trainer.stream_tdp(tdp, evaluator)
    self.assertLess(evaluator.td_loss[-1], 0.2)
    self.assertLess(evaluator.mc_loss[-1], 0.2)
def test_compute_episode_value_from_samples(self):
    samples = [
        MockSample("1", 3, 1),
        MockSample("1", 5, 2),
        MockSample("1", 6, 1),
        MockSample("3", 10, 2),
        MockSample("3", 11, 1),
        MockSample("6", 2, 3),
        MockSample("6", 4, 2),
        MockSample("6", 5, 0),
        MockSample("6", 8, 1),
    ]
    logged_values = Evaluator.compute_episode_value_from_samples(samples, 0.5)
    expected_values = [1.625, 2.5, 1, 2.5, 1, 3.515625, 2.0625, 0.125, 1]
    for i, val in enumerate(logged_values):
        self.assertEqual(val, expected_values[i])
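# Illustrative sketch (not the library implementation) of how the expected
# values above arise: each sample's episode value is assumed to be the
# discounted sum of the remaining rewards in the same MDP, with the discount
# exponent taken from the gap in sequence numbers. E.g. for MDP "1" at
# sequence 3: 1 + 0.5 ** (5 - 3) * 2 + 0.5 ** (6 - 3) * 1 = 1.625.
# The field names below mirror the MockSample constructor order and are
# assumptions for this demo.
from collections import namedtuple

Sample = namedtuple("Sample", ["mdp_id", "sequence_number", "reward"])

def episode_values(samples, gamma):
    values = []
    for i, s in enumerate(samples):
        value = 0.0
        for later in samples[i:]:
            if later.mdp_id != s.mdp_id:
                break
            value += gamma ** (later.sequence_number - s.sequence_number) * later.reward
        values.append(value)
    return values

assert episode_values(
    [Sample("1", 3, 1), Sample("1", 5, 2), Sample("1", 6, 1)], 0.5
) == [1.625, 2.5, 1.0]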
def test_evaluator_ground_truth(self):
    environment = Gridworld()
    samples = environment.generate_samples(200000, 1.0)
    true_values = environment.true_values_for_sample(
        samples.states, samples.actions, False
    )
    # Hijack the reward timeline to insert the ground truth
    samples.reward_timelines = []
    for tv in true_values:
        samples.reward_timelines.append({0: tv})
    trainer = self.get_sarsa_trainer(environment)
    evaluator = Evaluator(environment.ACTIONS, 10, DISCOUNT, None, None)
    tdps = environment.preprocess_samples(samples, self.minibatch_size)
    for _ in range(2):
        for tdp in tdps:
            trainer.train_numpy(tdp, evaluator)
    self.assertLess(evaluator.mc_loss[-1], 0.1)
def test_evaluator_ground_truth(self):
    environment = GridworldContinuous()
    samples = environment.generate_samples(200000, 1.0)
    true_values = environment.true_values_for_sample(
        samples.states, samples.actions, False
    )
    # Hijack the reward timeline to insert the ground truth
    samples.reward_timelines = []
    for tv in true_values:
        samples.reward_timelines.append({0: tv})
    trainer = self.get_sarsa_trainer(environment)
    evaluator = Evaluator(None, 10, DISCOUNT)
    tdps = environment.preprocess_samples(samples, self.minibatch_size)
    for tdp in tdps:
        tdp.rewards = tdp.rewards.flatten()
        tdp.not_terminals = tdp.not_terminals.flatten()
        trainer.train(tdp, evaluator)
    self.assertLess(evaluator.mc_loss[-1], 0.15)
def test_evaluator_ground_truth(self, environment):
    states, actions, rewards, next_states, next_actions, is_terminal, \
        possible_next_actions, _ = environment.generate_samples(100000, 1.0)
    true_values = environment.true_values_for_sample(states, actions, False)
    # Hijack the reward timeline to insert the ground truth
    reward_timelines = []
    for tv in true_values:
        reward_timelines.append({0: tv})
    trainer = self.get_sarsa_trainer(environment)
    evaluator = Evaluator(trainer, DISCOUNT)
    tdp = environment.preprocess_samples(
        states,
        actions,
        rewards,
        next_states,
        next_actions,
        is_terminal,
        possible_next_actions,
        reward_timelines,
    )
    trainer.stream_tdp(tdp, evaluator)
    self.assertLess(evaluator.td_loss[-1], 0.05)
    self.assertLess(evaluator.mc_loss[-1], 0.12)
def _test_evaluator_ground_truth_dueling(
    self, use_gpu=False, use_all_avail_gpus=False
):
    environment = Gridworld()
    samples = environment.generate_samples(100000, 1.0, DISCOUNT)
    true_values = environment.true_values_for_sample(
        samples.states, samples.actions, False
    )
    # Hijack the episode values to insert the ground truth
    samples.episode_values = true_values
    trainer = self.get_sarsa_trainer(
        environment, True, use_gpu=use_gpu, use_all_avail_gpus=use_all_avail_gpus
    )
    evaluator = Evaluator(environment.ACTIONS, 10, DISCOUNT, None, None)
    tdps = environment.preprocess_samples(
        samples, self.minibatch_size, use_gpu=use_gpu
    )
    for tdp in tdps:
        trainer.train(tdp, evaluator)
    self.assertLess(evaluator.mc_loss[-1], 0.1)
def policy(self, states):
    with core.DeviceScope(self.c2_device):
        if isinstance(self.trainer, DiscreteActionTrainer):
            workspace.FeedBlob("states", states)
        else:
            raise NotImplementedError("Invalid trainer passed to GymPredictor")
        workspace.RunNetOnce(self.trainer.internal_policy_model.net)
        policy_output_blob = self.trainer.internal_policy_output
        q_scores = workspace.FetchBlob(policy_output_blob)
        if isinstance(self.trainer, DiscreteActionTrainer):
            assert q_scores.shape[0] == 1
            q_scores = q_scores[0]
        q_scores_softmax = Evaluator.softmax(
            q_scores.reshape(1, -1), self.trainer.rl_temperature
        )[0]
        if np.isnan(q_scores_softmax).any() or np.max(q_scores_softmax) < 1e-3:
            q_scores_softmax[:] = 1.0 / q_scores_softmax.shape[0]
        policies = [
            np.argmax(q_scores),
            np.random.choice(q_scores.shape[0], p=q_scores_softmax),
        ]
        return policies
def test_evaluator_timeline(self):
    environment = GridworldContinuous()
    states, actions, rewards, next_states, next_actions, is_terminal, \
        possible_next_actions, reward_timelines = \
        environment.generate_samples(100000, 1.0)
    trainer = self.get_sarsa_trainer(environment)
    evaluator = Evaluator(trainer, DISCOUNT)
    tdps = environment.preprocess_samples(
        states,
        actions,
        rewards,
        next_states,
        next_actions,
        is_terminal,
        possible_next_actions,
        reward_timelines,
        self.minibatch_size,
    )
    for tdp in tdps:
        trainer.train_numpy(tdp, evaluator)
    self.assertLess(evaluator.td_loss[-1], 0.2)
    self.assertLess(evaluator.mc_loss[-1], 0.2)
def policy(self, states):
    if isinstance(self.trainer, DQNTrainer):
        input = states
    elif isinstance(self.trainer, ParametricDQNTrainer):
        num_actions = len(self.trainer.action_normalization_parameters)
        actions = np.eye(num_actions, dtype=np.float32)
        actions = np.tile(actions, reps=(len(states), 1))
        states = np.repeat(states, repeats=num_actions, axis=0)
        input = np.hstack((states, actions))
    else:
        raise NotImplementedError("Invalid trainer passed to GymPredictor")
    q_scores = self.trainer.internal_prediction(input)
    if isinstance(self.trainer, DQNTrainer):
        assert q_scores.shape[0] == 1
        q_scores = q_scores[0]
    q_scores_softmax = Evaluator.softmax(
        q_scores.reshape(1, -1), self.trainer.rl_temperature
    )[0]
    if np.isnan(q_scores_softmax).any() or np.max(q_scores_softmax) < 1e-3:
        q_scores_softmax[:] = 1.0 / q_scores_softmax.shape[0]
    policies = [
        np.argmax(q_scores),
        np.random.choice(q_scores.shape[0], p=q_scores_softmax),
    ]
    return policies
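# Evaluator.softmax is called throughout this section but not defined here.
# A minimal sketch of a temperature-scaled softmax consistent with how it is
# called (2-D scores in, per-row probabilities out); the real implementation
# may differ in details such as stabilization:
import numpy as np

def softmax(x, temperature):
    # Lower temperature sharpens the distribution; higher flattens it.
    x = x / temperature
    x = x - np.max(x, axis=1, keepdims=True)  # stabilize before exponentiating
    e = np.exp(x)
    return e / np.sum(e, axis=1, keepdims=True)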
def evaluate(self, predictor):
    # test only float features
    predictions = predictor.predict(self.logged_states)
    estimated_reward_values = predictor.estimate_reward(self.logged_states)
    if isinstance(predictor.trainer, ParametricDQNTrainer):
        predictions = predictions.reshape([-1, self._env.action_dim])
        estimated_reward_values = estimated_reward_values.reshape(
            [-1, self._env.action_dim]
        )

    value_error_sum = 0.0
    reward_error_sum = 0.0
    for i in range(len(self.logged_states)):
        logged_action = self.logged_actions[i]
        logged_value = self.logged_values[i][0]
        target_value = predictions[i][logged_action]
        value_error_sum += abs(logged_value - target_value)
        logged_reward = self.logged_rewards[i][0]
        estimated_reward = estimated_reward_values[i][logged_action]
        reward_error_sum += abs(logged_reward - estimated_reward)
    value_error_mean = value_error_sum / np.sum(np.abs(self.logged_values))
    reward_error_mean = reward_error_sum / np.sum(np.abs(self.logged_rewards))

    logger.info("EVAL Q-Value MAE ERROR: {0:.3f}".format(value_error_mean))
    self.mc_loss.append(value_error_mean)
    logger.info("EVAL REWARD MAE ERROR: {0:.3f}".format(reward_error_mean))
    self.reward_loss.append(reward_error_mean)

    target_propensities = Evaluator.softmax(
        predictions, GymEvaluator.SOFTMAX_TEMPERATURE
    )

    reward_inverse_propensity_score, reward_direct_method, reward_doubly_robust = self.doubly_robust_one_step_policy_estimation(
        self.logged_actions_one_hot,
        self.logged_rewards,
        self.logged_propensities,
        target_propensities,
        estimated_reward_values,
    )
    self.reward_inverse_propensity_score.append(reward_inverse_propensity_score)
    self.reward_direct_method.append(reward_direct_method)
    self.reward_doubly_robust.append(reward_doubly_robust)

    logger.info(
        "Reward Inverse Propensity Score : normalized {0:.3f} raw {1:.3f}".format(
            reward_inverse_propensity_score.normalized,
            reward_inverse_propensity_score.raw,
        )
    )
    logger.info(
        "Reward Direct Method : normalized {0:.3f} raw {1:.3f}".format(
            reward_direct_method.normalized, reward_direct_method.raw
        )
    )
    logger.info(
        "Reward Doubly Robust P.E. : normalized {0:.3f} raw {1:.3f}".format(
            reward_doubly_robust.normalized, reward_doubly_robust.raw
        )
    )

    value_inverse_propensity_score, value_direct_method, value_doubly_robust = self.doubly_robust_one_step_policy_estimation(
        self.logged_actions_one_hot,
        self.logged_values,
        self.logged_propensities,
        target_propensities,
        predictions,
    )
    self.value_inverse_propensity_score.append(value_inverse_propensity_score)
    self.value_direct_method.append(value_direct_method)
    self.value_doubly_robust.append(value_doubly_robust)

    logger.info(
        "Value Inverse Propensity Score : normalized {0:.3f} raw {1:.3f}".format(
            value_inverse_propensity_score.normalized,
            value_inverse_propensity_score.raw,
        )
    )
    logger.info(
        "Value Direct Method : normalized {0:.3f} raw {1:.3f}".format(
            value_direct_method.normalized, value_direct_method.raw
        )
    )
    logger.info(
        "Value One-Step Doubly Robust P.E. : normalized {0:.3f} raw {1:.3f}".format(
            value_doubly_robust.normalized, value_doubly_robust.raw
        )
    )

    sequential_doubly_robust = self.doubly_robust_sequential_policy_estimation(
        self.logged_actions_one_hot,
        self.logged_rewards,
        self.logged_terminals,
        self.logged_propensities,
        target_propensities,
        predictions,
    )
    self.value_sequential_doubly_robust.append(sequential_doubly_robust)
    logger.info(
        "Value Sequential Doubly Robust P.E. : normalized {0:.3f} raw {1:.3f}".format(
            sequential_doubly_robust.normalized, sequential_doubly_robust.raw
        )
    )

    weighted_doubly_robust = self.weighted_doubly_robust_sequential_policy_estimation(
        self.logged_actions_one_hot,
        self.logged_rewards,
        self.logged_terminals,
        self.logged_propensities,
        target_propensities,
        predictions,
        num_j_steps=1,
        whether_self_normalize_importance_weights=True,
    )
    self.value_weighted_doubly_robust.append(weighted_doubly_robust)
    logger.info(
        "Value Weighted Sequential Doubly Robust P.E. : normalized {0:.3f} raw {1:.3f}".format(
            weighted_doubly_robust.normalized, weighted_doubly_robust.raw
        )
    )

    magic_doubly_robust = self.weighted_doubly_robust_sequential_policy_estimation(
        self.logged_actions_one_hot,
        self.logged_rewards,
        self.logged_terminals,
        self.logged_propensities,
        target_propensities,
        predictions,
        num_j_steps=GymEvaluator.NUM_J_STEPS_FOR_MAGIC_ESTIMATOR,
        whether_self_normalize_importance_weights=True,
    )
    self.value_magic_doubly_robust.append(magic_doubly_robust)
    logger.info(
        "Value Magic Doubly Robust P.E. : normalized {0:.3f} raw {1:.3f}".format(
            magic_doubly_robust.normalized, magic_doubly_robust.raw
        )
    )

    avg_rewards, avg_discounted_rewards = self._env.run_ep_n_times(
        100, predictor, test=True
    )

    episode_starts = np.nonzero(self.logged_terminals.squeeze())[0] + 1
    logged_discounted_performance = (
        self.logged_values[0][0] + np.sum(self.logged_values[episode_starts[:-1]])
    ) / np.sum(self.logged_terminals)

    true_discounted_value_PE = avg_discounted_rewards / logged_discounted_performance
    self.true_discounted_value_PE.append(true_discounted_value_PE)
    logger.info(
        "True Discounted Value P.E : normalized {0:.3f} raw {1:.3f}".format(
            true_discounted_value_PE, avg_discounted_rewards
        )
    )

    logged_performance = np.sum(self.logged_rewards) / np.sum(self.logged_terminals)
    true_value_PE = avg_rewards / logged_performance
    self.true_value_PE.append(true_value_PE)
    logger.info(
        "True Value P.E : normalized {0:.3f} raw {1:.3f}".format(
            true_value_PE, avg_rewards
        )
    )
def evaluate(self, predictor):
    # Test feeding float features & int features
    if self.use_int_features:
        float_features, int_features = self._split_int_and_float_features(
            self.logged_states
        )
        # Since all gridworld features are float types, swap these so
        # all inputs are now int_features for testing purposes
        float_features, int_features = int_features, float_features
        prediction_string = predictor.predict(float_features, int_features)
    # Test only feeding float features
    else:
        prediction_string = predictor.predict(self.logged_states)

    # Convert action string to integer
    prediction = np.zeros(
        [len(prediction_string), len(self._env.ACTIONS)], dtype=np.float32
    )
    for x in range(len(self.logged_states)):
        for action_index, action in enumerate(self._env.ACTIONS):
            prediction[x][action_index] = prediction_string[x].get(action, 1e-9)

    # Print out scores using all states
    all_states = []
    for x in self._env.STATES:
        all_states.append({x: 1.0})
    if self.use_int_features:
        all_states_float, all_states_int = self._split_int_and_float_features(
            all_states
        )
        all_states_prediction_string = predictor.predict(
            all_states_float, all_states_int
        )
    else:
        all_states_prediction_string = predictor.predict(all_states)
    all_states_prediction = np.zeros(
        [len(all_states_prediction_string), len(self._env.ACTIONS)],
        dtype=np.float32,
    )
    for x in range(len(all_states)):
        for action_index, action in enumerate(self._env.ACTIONS):
            all_states_prediction[x][action_index] = all_states_prediction_string[
                x
            ].get(action, 1e-9)
    print(all_states_prediction[:, 0].reshape(5, 5), "\n")
    print(all_states_prediction[:, 1].reshape(5, 5), "\n")
    print(all_states_prediction[:, 2].reshape(5, 5), "\n")
    print(all_states_prediction[:, 3].reshape(5, 5), "\n")

    error_sum = 0.0
    num_error_prints = 0
    for x in range(len(self.logged_states)):
        logged_value = self.logged_values[x][0]
        target_value = prediction_string[x].get(self.logged_actions[x], 1e-9)
        error = abs(logged_value - target_value)
        if num_error_prints < 10 and error > 0.2:
            print(
                "GOT THIS STATE WRONG: ",
                x,
                self._env._pos(list(self.logged_states[x].keys())[0]),
                self.logged_actions[x],
                logged_value,
                target_value,
            )
            num_error_prints += 1
            if num_error_prints == 10:
                print("MAX ERRORS PRINTED")
        error_sum += error
    error_mean = error_sum / float(len(self.logged_states))

    logger.info("EVAL ERROR: {0:.3f}".format(error_mean))
    self.mc_loss.append(error_mean)

    target_propensities = Evaluator.softmax(
        prediction, GridworldEvaluator.SOFTMAX_TEMPERATURE
    )

    reward_inverse_propensity_score, reward_direct_method, reward_doubly_robust = self.doubly_robust_one_step_policy_estimation(
        self.logged_actions_one_hot,
        self.logged_rewards,
        self.logged_propensities,
        target_propensities,
        self.estimated_reward_values,
    )
    self.reward_inverse_propensity_score.append(reward_inverse_propensity_score)
    self.reward_direct_method.append(reward_direct_method)
    self.reward_doubly_robust.append(reward_doubly_robust)

    logger.info(
        "Reward Inverse Propensity Score : normalized {0:.3f} raw {1:.3f}".format(
            reward_inverse_propensity_score.normalized,
            reward_inverse_propensity_score.raw,
        )
    )
    logger.info(
        "Reward Direct Method : normalized {0:.3f} raw {1:.3f}".format(
            reward_direct_method.normalized, reward_direct_method.raw
        )
    )
    logger.info(
        "Reward Doubly Robust P.E. : normalized {0:.3f} raw {1:.3f}".format(
            reward_doubly_robust.normalized, reward_doubly_robust.raw
        )
    )

    value_inverse_propensity_score, value_direct_method, value_doubly_robust = self.doubly_robust_one_step_policy_estimation(
        self.logged_actions_one_hot,
        self.logged_values,
        self.logged_propensities,
        target_propensities,
        self.estimated_ltv_values,
    )
    self.value_inverse_propensity_score.append(value_inverse_propensity_score)
    self.value_direct_method.append(value_direct_method)
    self.value_doubly_robust.append(value_doubly_robust)

    logger.info(
        "Value Inverse Propensity Score : normalized {0:.3f} raw {1:.3f}".format(
            value_inverse_propensity_score.normalized,
            value_inverse_propensity_score.raw,
        )
    )
    logger.info(
        "Value Direct Method : normalized {0:.3f} raw {1:.3f}".format(
            value_direct_method.normalized, value_direct_method.raw
        )
    )
    logger.info(
        "Value One-Step Doubly Robust P.E. : normalized {0:.3f} raw {1:.3f}".format(
            value_doubly_robust.normalized, value_doubly_robust.raw
        )
    )

    sequential_doubly_robust = self.doubly_robust_sequential_policy_estimation(
        self.logged_actions_one_hot,
        self.logged_rewards,
        self.logged_terminals,
        self.logged_propensities,
        target_propensities,
        self.estimated_ltv_values,
    )
    self.value_sequential_doubly_robust.append(sequential_doubly_robust)
    logger.info(
        "Value Sequential Doubly Robust P.E. : normalized {0:.3f} raw {1:.3f}".format(
            sequential_doubly_robust.normalized, sequential_doubly_robust.raw
        )
    )

    weighted_doubly_robust = self.weighted_doubly_robust_sequential_policy_estimation(
        self.logged_actions_one_hot,
        self.logged_rewards,
        self.logged_terminals,
        self.logged_propensities,
        target_propensities,
        self.estimated_ltv_values,
        num_j_steps=1,
        whether_self_normalize_importance_weights=True,
    )
    self.value_weighted_doubly_robust.append(weighted_doubly_robust)
    logger.info(
        "Value Weighted Sequential Doubly Robust P.E. : normalized {0:.3f} raw {1:.3f}".format(
            weighted_doubly_robust.normalized, weighted_doubly_robust.raw
        )
    )

    magic_doubly_robust = self.weighted_doubly_robust_sequential_policy_estimation(
        self.logged_actions_one_hot,
        self.logged_rewards,
        self.logged_terminals,
        self.logged_propensities,
        target_propensities,
        self.estimated_ltv_values,
        num_j_steps=GridworldEvaluator.NUM_J_STEPS_FOR_MAGIC_ESTIMATOR,
        whether_self_normalize_importance_weights=True,
    )
    self.value_magic_doubly_robust.append(magic_doubly_robust)
    logger.info(
        "Value Magic Doubly Robust P.E. : normalized {0:.3f} raw {1:.3f}".format(
            magic_doubly_robust.normalized, magic_doubly_robust.raw
        )
    )
def evaluate_predictions(self, prediction, all_states_prediction):
    print(all_states_prediction[:, 0].reshape(5, 5), "\n")
    print(all_states_prediction[:, 1].reshape(5, 5), "\n")
    print(all_states_prediction[:, 2].reshape(5, 5), "\n")
    print(all_states_prediction[:, 3].reshape(5, 5), "\n")

    error_sum = 0.0
    num_error_prints = 0
    for x in range(len(self.logged_states)):
        int_action = self._env.action_to_index(self.logged_actions[x])
        logged_value = self.logged_values[x][0]
        target_value = prediction[x][int_action]
        error = abs(logged_value - target_value)
        if num_error_prints < 10 and error > 0.2:
            print(
                "GOT THIS STATE WRONG: ",
                x,
                self._env._pos(list(self.logged_states[x].keys())[0]),
                self.logged_actions[x],
                logged_value,
                target_value,
            )
            num_error_prints += 1
            if num_error_prints == 10:
                print("MAX ERRORS PRINTED")
        error_sum += error
    error_mean = error_sum / float(len(self.logged_states))

    logger.info("EVAL ERROR: {0:.3f}".format(error_mean))
    self.mc_loss.append(error_mean)

    target_propensities = Evaluator.softmax(
        prediction, GridworldEvaluator.SOFTMAX_TEMPERATURE
    )

    reward_inverse_propensity_score, reward_direct_method, reward_doubly_robust = self.doubly_robust_one_step_policy_estimation(
        self.logged_actions_one_hot,
        self.logged_rewards,
        self.logged_propensities,
        target_propensities,
        self.estimated_reward_values,
    )
    self.reward_inverse_propensity_score.append(reward_inverse_propensity_score)
    self.reward_direct_method.append(reward_direct_method)
    self.reward_doubly_robust.append(reward_doubly_robust)

    logger.info(
        "Reward Inverse Propensity Score : normalized {0:.3f} raw {1:.3f}".format(
            reward_inverse_propensity_score.normalized,
            reward_inverse_propensity_score.raw,
        )
    )
    logger.info(
        "Reward Direct Method : normalized {0:.3f} raw {1:.3f}".format(
            reward_direct_method.normalized, reward_direct_method.raw
        )
    )
    logger.info(
        "Reward Doubly Robust P.E. : normalized {0:.3f} raw {1:.3f}".format(
            reward_doubly_robust.normalized, reward_doubly_robust.raw
        )
    )

    value_inverse_propensity_score, value_direct_method, value_doubly_robust = self.doubly_robust_one_step_policy_estimation(
        self.logged_actions_one_hot,
        self.logged_values,
        self.logged_propensities,
        target_propensities,
        self.estimated_ltv_values,
    )
    self.value_inverse_propensity_score.append(value_inverse_propensity_score)
    self.value_direct_method.append(value_direct_method)
    self.value_doubly_robust.append(value_doubly_robust)

    logger.info(
        "Value Inverse Propensity Score : normalized {0:.3f} raw {1:.3f}".format(
            value_inverse_propensity_score.normalized,
            value_inverse_propensity_score.raw,
        )
    )
    logger.info(
        "Value Direct Method : normalized {0:.3f} raw {1:.3f}".format(
            value_direct_method.normalized, value_direct_method.raw
        )
    )
    logger.info(
        "Value One-Step Doubly Robust P.E. : normalized {0:.3f} raw {1:.3f}".format(
            value_doubly_robust.normalized, value_doubly_robust.raw
        )
    )

    sequential_doubly_robust = self.doubly_robust_sequential_policy_estimation(
        self.logged_actions_one_hot,
        self.logged_rewards,
        self.logged_terminals,
        self.logged_propensities,
        target_propensities,
        self.estimated_ltv_values,
    )
    self.value_sequential_doubly_robust.append(sequential_doubly_robust)
    logger.info(
        "Value Sequential Doubly Robust P.E. : normalized {0:.3f} raw {1:.3f}".format(
            sequential_doubly_robust.normalized, sequential_doubly_robust.raw
        )
    )

    weighted_doubly_robust = self.weighted_doubly_robust_sequential_policy_estimation(
        self.logged_actions_one_hot,
        self.logged_rewards,
        self.logged_terminals,
        self.logged_propensities,
        target_propensities,
        self.estimated_ltv_values,
        num_j_steps=1,
        whether_self_normalize_importance_weights=True,
    )
    self.value_weighted_doubly_robust.append(weighted_doubly_robust)
    logger.info(
        "Value Weighted Sequential Doubly Robust P.E. : normalized {0:.3f} raw {1:.3f}".format(
            weighted_doubly_robust.normalized, weighted_doubly_robust.raw
        )
    )

    magic_doubly_robust = self.weighted_doubly_robust_sequential_policy_estimation(
        self.logged_actions_one_hot,
        self.logged_rewards,
        self.logged_terminals,
        self.logged_propensities,
        target_propensities,
        self.estimated_ltv_values,
        num_j_steps=GridworldEvaluator.NUM_J_STEPS_FOR_MAGIC_ESTIMATOR,
        whether_self_normalize_importance_weights=True,
    )
    self.value_magic_doubly_robust.append(magic_doubly_robust)
    logger.info(
        "Value Magic Doubly Robust P.E. : normalized {0:.3f} raw {1:.3f}".format(
            magic_doubly_robust.normalized, magic_doubly_robust.raw
        )
    )
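# A minimal numpy sketch of the standard one-step doubly robust estimator that
# doubly_robust_one_step_policy_estimation is assumed to implement (the real
# method also reports normalized/raw pairs; the function and variable names
# here are illustrative, not the library's):
import numpy as np

def doubly_robust_one_step(actions_one_hot, rewards, logged_propensities,
                           target_propensities, estimated_values):
    # Importance weight of the target policy over the logging policy
    # for the action that was actually taken.
    target_prop_for_logged = np.sum(
        target_propensities * actions_one_hot, axis=1, keepdims=True
    )
    importance_weights = target_prop_for_logged / logged_propensities
    # Direct method: expected value under the target policy per the model.
    direct_method = np.sum(
        target_propensities * estimated_values, axis=1, keepdims=True
    )
    # Model's estimate for the logged action.
    estimate_for_logged = np.sum(
        estimated_values * actions_one_hot, axis=1, keepdims=True
    )
    ips = importance_weights * rewards
    # DR corrects the model estimate with an importance-weighted residual.
    doubly_robust = direct_method + importance_weights * (
        rewards - estimate_for_logged
    )
    return np.mean(ips), np.mean(direct_method), np.mean(doubly_robust)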
def train(
    self, training_samples: TrainingDataPage, evaluator: Optional[Evaluator] = None
):
    if self.minibatch == 0:
        # Assume that the tensors are the right shape after the first minibatch
        assert (
            training_samples.states.shape[0] == self.minibatch_size
        ), "Invalid shape: " + str(training_samples.states.shape)
        assert training_samples.actions.shape == torch.Size(
            [self.minibatch_size, len(self._actions)]
        ), "Invalid shape: " + str(training_samples.actions.shape)
        assert training_samples.rewards.shape == torch.Size(
            [self.minibatch_size, 1]
        ), "Invalid shape: " + str(training_samples.rewards.shape)
        assert (
            training_samples.next_states.shape == training_samples.states.shape
        ), "Invalid shape: " + str(training_samples.next_states.shape)
        assert (
            training_samples.not_terminals.shape == training_samples.rewards.shape
        ), "Invalid shape: " + str(training_samples.not_terminals.shape)
        if training_samples.possible_next_actions is not None:
            assert (
                training_samples.possible_next_actions.shape
                == training_samples.actions.shape
            ), "Invalid shape: " + str(
                training_samples.possible_next_actions.shape
            )
        if training_samples.propensities is not None:
            assert (
                training_samples.propensities.shape
                == training_samples.rewards.shape
            ), "Invalid shape: " + str(training_samples.propensities.shape)

    # Apply reward boost if specified
    reward_boosts = torch.sum(
        training_samples.actions.float() * self.reward_boosts, dim=1, keepdim=True
    )
    boosted_rewards = training_samples.rewards + reward_boosts

    self.minibatch += 1
    states = training_samples.states.detach().requires_grad_(True)
    actions = training_samples.actions
    rewards = boosted_rewards
    next_states = training_samples.next_states
    discount_tensor = torch.full(training_samples.time_diffs.shape, self.gamma).type(
        self.dtype
    )
    not_done_mask = training_samples.not_terminals

    if self.use_seq_num_diff_as_time_diff:
        discount_tensor = discount_tensor.pow(training_samples.time_diffs)

    if self.maxq_learning:
        # Compute max a' Q(s', a') over all possible actions using target network
        possible_next_actions = training_samples.possible_next_actions
        next_q_values = self.get_max_q_values(
            next_states, possible_next_actions, self.double_q_learning
        )
    else:
        # SARSA
        next_actions = training_samples.next_actions
        next_q_values = self.get_next_action_q_values(next_states, next_actions)

    filtered_next_q_vals = next_q_values * not_done_mask

    if self.minibatch < self.reward_burnin:
        target_q_values = rewards
    else:
        target_q_values = rewards + (discount_tensor * filtered_next_q_vals)

    # Get Q-value of action taken
    all_q_values = self.q_network(states)
    self.all_action_scores = all_q_values.detach()
    q_values = torch.sum(all_q_values * actions, 1, keepdim=True)

    loss = self.q_network_loss(q_values, target_q_values)
    self.loss = loss.detach()

    self.q_network_optimizer.zero_grad()
    loss.backward()
    if self.gradient_handler:
        self.gradient_handler(self.q_network.parameters())
    self.q_network_optimizer.step()

    if self.minibatch < self.reward_burnin:
        # Reward burnin: force target network
        self._soft_update(self.q_network, self.q_network_target, 1.0)
    else:
        # Use the soft update rule to update target network
        self._soft_update(self.q_network, self.q_network_target, self.tau)

    # Get reward estimates
    reward_estimates = self.reward_network(states)
    self.reward_estimates = reward_estimates.detach()
    reward_estimates_for_logged_actions = reward_estimates.gather(
        1, actions.argmax(dim=1, keepdim=True)
    )
    reward_loss = F.mse_loss(reward_estimates_for_logged_actions, rewards)
    self.reward_network_optimizer.zero_grad()
    reward_loss.backward()
    self.reward_network_optimizer.step()

    self.loss_reporter.report(
        td_loss=float(self.loss), reward_loss=float(reward_loss)
    )

    training_metadata = {}
    if evaluator is not None:
        model_propensities = torch.from_numpy(
            Evaluator.softmax(
                self.all_action_scores.cpu().numpy(), self.rl_temperature
            )
        )
        cpe_stats = BatchStatsForCPE(
            logged_actions=training_samples.actions,
            logged_propensities=training_samples.propensities,
            logged_rewards=rewards,
            logged_values=None,  # Compute at end of each epoch for CPE
            model_propensities=model_propensities,
            model_rewards=self.reward_estimates,
            model_values=self.all_action_scores,
            model_values_on_logged_actions=None,  # Compute at end of each epoch for CPE
            model_action_idxs=self.all_action_scores.argmax(dim=1, keepdim=True),
        )
        evaluator.report(cpe_stats)
        training_metadata["model_rewards"] = self.reward_estimates.cpu().numpy()

    return training_metadata
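# _soft_update is not shown in this section; a minimal sketch of the Polyak
# averaging it is assumed to perform, matching how it is called above
# (tau=1.0 during reward burn-in copies the online network into the target
# network; small tau blends slowly toward it):
def _soft_update(source_network, target_network, tau):
    for src_param, tgt_param in zip(
        source_network.parameters(), target_network.parameters()
    ):
        tgt_param.data.copy_(tau * src_param.data + (1.0 - tau) * tgt_param.data)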
def train_network(params): logger.info("Running Parametric DQN workflow with params:") logger.info(params) # Set minibatch size based on # of devices being used to train params["training"]["minibatch_size"] *= minibatch_size_multiplier( params["use_gpu"], params["use_all_avail_gpus"]) rl_parameters = RLParameters(**params["rl"]) training_parameters = TrainingParameters(**params["training"]) rainbow_parameters = RainbowDQNParameters(**params["rainbow"]) if params["in_training_cpe"] is not None: in_training_cpe_parameters = InTrainingCPEParameters( **params["in_training_cpe"]) else: in_training_cpe_parameters = None trainer_params = ContinuousActionModelParameters( rl=rl_parameters, training=training_parameters, rainbow=rainbow_parameters, in_training_cpe=in_training_cpe_parameters, ) dataset = JSONDataset(params["training_data_path"], batch_size=training_parameters.minibatch_size) state_normalization = read_norm_file(params["state_norm_data_path"]) action_normalization = read_norm_file(params["action_norm_data_path"]) num_batches = int(len(dataset) / training_parameters.minibatch_size) logger.info("Read in batch data set {} of size {} examples. Data split " "into {} batches of size {}.".format( params["training_data_path"], len(dataset), num_batches, training_parameters.minibatch_size, )) trainer = ParametricDQNTrainer( trainer_params, state_normalization, action_normalization, use_gpu=params["use_gpu"], use_all_avail_gpus=params["use_all_avail_gpus"], ) trainer = update_model_for_warm_start(trainer) state_preprocessor = Preprocessor(state_normalization, params["use_gpu"]) action_preprocessor = Preprocessor(action_normalization, params["use_gpu"]) if trainer_params.in_training_cpe is not None: evaluator = Evaluator( None, 100, trainer_params.rl.gamma, trainer, trainer_params.in_training_cpe.mdp_sampled_rate, ) else: evaluator = Evaluator( None, 100, trainer_params.rl.gamma, trainer, float(DEFAULT_NUM_SAMPLES_FOR_CPE) / len(dataset), ) start_time = time.time() for epoch in range(params["epochs"]): dataset.reset_iterator() for batch_idx in range(num_batches): report_training_status(batch_idx, num_batches, epoch, params["epochs"]) batch = dataset.read_batch(batch_idx) tdp = preprocess_batch_for_training( state_preprocessor, batch, action_preprocessor=action_preprocessor) tdp.set_type(trainer.dtype) trainer.train(tdp, evaluator) evaluator.collect_parametric_action_samples( mdp_ids=tdp.mdp_ids, sequence_numbers=tdp.sequence_numbers.cpu().numpy(), logged_state_actions=np.concatenate( (tdp.states.cpu().numpy(), tdp.actions.cpu().numpy()), axis=1), logged_rewards=tdp.rewards.cpu().numpy(), logged_propensities=tdp.propensities.cpu().numpy(), logged_terminals=(1.0 - tdp.not_terminals), possible_state_actions=tdp.state_pas_concat.cpu().numpy(), pas_lens=tdp.possible_actions_lengths.cpu().numpy(), ) cpe_start_time = time.time() evaluator.recover_samples_to_be_unshuffled() evaluator.score_cpe(trainer_params.rl.gamma) evaluator.clear_collected_samples() logger.info("CPE evaluation took {} seconds.".format(time.time() - cpe_start_time)) through_put = (len(dataset) * params["epochs"]) / (time.time() - start_time) logger.info("Training finished. Processed ~{} examples / s.".format( round(through_put))) return export_trainer_and_predictor(trainer, params["model_output_path"])
def train_network(params):
    writer = None
    if params["model_output_path"] is not None:
        writer = SummaryWriter(
            log_dir=os.path.join(
                os.path.expanduser(params["model_output_path"]), "training_data"
            )
        )

    logger.info("Running DQN workflow with params:")
    logger.info(params)

    action_names = np.array(params["actions"])
    rl_parameters = RLParameters(**params["rl"])
    training_parameters = TrainingParameters(**params["training"])
    rainbow_parameters = RainbowDQNParameters(**params["rainbow"])
    if params["in_training_cpe"] is not None:
        in_training_cpe_parameters = InTrainingCPEParameters(
            **params["in_training_cpe"]
        )
    else:
        in_training_cpe_parameters = None

    trainer_params = DiscreteActionModelParameters(
        actions=params["actions"],
        rl=rl_parameters,
        training=training_parameters,
        rainbow=rainbow_parameters,
        in_training_cpe=in_training_cpe_parameters,
    )

    dataset = JSONDataset(
        params["training_data_path"], batch_size=training_parameters.minibatch_size
    )
    state_normalization = read_norm_file(params["state_norm_data_path"])

    num_batches = int(len(dataset) / training_parameters.minibatch_size)
    logger.info(
        "Read in batch data set {} of size {} examples. Data split "
        "into {} batches of size {}.".format(
            params["training_data_path"],
            len(dataset),
            num_batches,
            training_parameters.minibatch_size,
        )
    )

    trainer = DQNTrainer(trainer_params, state_normalization, params["use_gpu"])
    trainer = update_model_for_warm_start(trainer)
    preprocessor = Preprocessor(state_normalization, params["use_gpu"])

    if trainer_params.in_training_cpe is not None:
        evaluator = Evaluator(
            trainer_params.actions,
            10,
            trainer_params.rl.gamma,
            trainer,
            trainer_params.in_training_cpe.mdp_sampled_rate,
        )
    else:
        evaluator = Evaluator(
            trainer_params.actions,
            10,
            trainer_params.rl.gamma,
            trainer,
            float(DEFAULT_NUM_SAMPLES_FOR_CPE) / len(dataset),
        )

    start_time = time.time()
    for epoch in range(int(params["epochs"])):
        for batch_idx in range(num_batches):
            report_training_status(
                batch_idx, num_batches, epoch, int(params["epochs"])
            )
            batch = dataset.read_batch(batch_idx)
            tdp = preprocess_batch_for_training(preprocessor, batch, action_names)

            trainer.train(tdp)

            trainer.evaluate(
                evaluator, tdp.actions, None, tdp.rewards, tdp.episode_values
            )

            evaluator.collect_discrete_action_samples(
                mdp_ids=tdp.mdp_ids,
                sequence_numbers=tdp.sequence_numbers.cpu().numpy(),
                states=tdp.states.cpu().numpy(),
                logged_actions=tdp.actions.cpu().numpy(),
                logged_rewards=tdp.rewards.cpu().numpy(),
                logged_propensities=tdp.propensities.cpu().numpy(),
                logged_terminals=np.invert(
                    tdp.not_terminals.cpu().numpy().astype(np.bool)
                ),
            )

        cpe_start_time = time.time()
        evaluator.recover_samples_to_be_unshuffled()
        evaluator.score_cpe()
        if writer is not None:
            evaluator.log_to_tensorboard(writer, epoch)
        evaluator.clear_collected_samples()
        logger.info(
            "CPE evaluation took {} seconds.".format(time.time() - cpe_start_time)
        )

    through_put = (len(dataset) * int(params["epochs"])) / (time.time() - start_time)
    logger.info(
        "Training finished. Processed ~{} examples / s.".format(round(through_put))
    )

    if writer is not None:
        writer.close()

    return export_trainer_and_predictor(trainer, params["model_output_path"])
def evaluate(self, predictor):
    # Test feeding float features & int features
    if self.use_int_features:
        float_features, int_features = self._split_int_and_float_features(
            self.logged_states
        )
        # Since all gridworld features are float types, swap these so
        # all inputs are now int_features for testing purposes
        float_features, int_features = int_features, float_features
        prediction_string = predictor.predict(float_features, int_features)
    # Test only feeding float features
    else:
        prediction_string = predictor.predict(self.logged_states)

    # Convert action string to integer
    prediction = np.zeros(
        [len(prediction_string), len(self._env.ACTIONS)], dtype=np.float32
    )
    for x in range(len(self.logged_states)):
        for action_index, action in enumerate(self._env.ACTIONS):
            prediction[x][action_index] = prediction_string[x][action]

    error_sum = 0.0
    for x in range(len(self.logged_states)):
        logged_value = self.logged_values[x][0]
        target_value = prediction_string[x][self.logged_actions[x]]
        error_sum += abs(logged_value - target_value)
    error_mean = error_sum / float(len(self.logged_states))

    logger.info("EVAL ERROR: {0:.3f}".format(error_mean))
    self.mc_loss.append(error_mean)

    target_propensities = Evaluator.softmax(
        prediction, GridworldEvaluator.SOFTMAX_TEMPERATURE
    )

    value_inverse_propensity_score, value_direct_method, value_doubly_robust = self.doubly_robust_policy_estimation(
        self.logged_actions_one_hot,
        self.logged_values,
        self.logged_propensities,
        target_propensities,
        self.estimated_ltv_values,
    )
    self.value_inverse_propensity_score.append(value_inverse_propensity_score)
    self.value_direct_method.append(value_direct_method)
    self.value_doubly_robust.append(value_doubly_robust)

    logger.info(
        "Value Inverse Propensity Score : {0:.3f}".format(
            value_inverse_propensity_score
        )
    )
    logger.info("Value Direct Method : {0:.3f}".format(value_direct_method))
    logger.info("Value Doubly Robust P.E. : {0:.3f}".format(value_doubly_robust))

    reward_inverse_propensity_score, reward_direct_method, reward_doubly_robust = self.doubly_robust_policy_estimation(
        self.logged_actions_one_hot,
        self.logged_rewards,
        self.logged_propensities,
        target_propensities,
        self.estimated_reward_values,
    )
    self.reward_inverse_propensity_score.append(reward_inverse_propensity_score)
    self.reward_direct_method.append(reward_direct_method)
    self.reward_doubly_robust.append(reward_doubly_robust)

    logger.info(
        "Reward Inverse Propensity Score: {0:.3f}".format(
            reward_inverse_propensity_score
        )
    )
    logger.info("Reward Direct Method : {0:.3f}".format(reward_direct_method))
    logger.info("Reward Doubly Robust P.E. : {0:.3f}".format(reward_doubly_robust))
def get_recent_reward_loss(self):
    return Evaluator.calculate_recent_window_average(
        self.reward_loss, Evaluator.RECENT_WINDOW_SIZE, num_entries=1
    )
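# calculate_recent_window_average is not defined in this section; a plausible
# sketch, assuming it averages the last `window_size` entries and falls back
# to NaN(s) when nothing has been collected yet (num_entries only sizing the
# fallback). The actual library implementation may differ:
import numpy as np

def calculate_recent_window_average(arr, window_size, num_entries):
    if len(arr) > 0:
        begin = max(0, len(arr) - window_size)
        return np.mean(np.array(arr[begin:]), axis=0)
    return float("nan") if num_entries == 1 else [float("nan")] * num_entries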
def train(self, training_samples: TrainingDataPage):
    if self.minibatch == 0:
        # Assume that the tensors are the right shape after the first minibatch
        assert (
            training_samples.states.shape[0] == self.minibatch_size
        ), "Invalid shape: " + str(training_samples.states.shape)
        assert training_samples.actions.shape == torch.Size(
            [self.minibatch_size, len(self._actions)]
        ), "Invalid shape: " + str(training_samples.actions.shape)
        assert training_samples.rewards.shape == torch.Size(
            [self.minibatch_size, 1]
        ), "Invalid shape: " + str(training_samples.rewards.shape)
        assert (
            training_samples.next_states.shape == training_samples.states.shape
        ), "Invalid shape: " + str(training_samples.next_states.shape)
        assert (
            training_samples.not_terminal.shape == training_samples.rewards.shape
        ), "Invalid shape: " + str(training_samples.not_terminal.shape)
        if training_samples.possible_next_actions_mask is not None:
            assert (
                training_samples.possible_next_actions_mask.shape
                == training_samples.actions.shape
            ), "Invalid shape: " + str(
                training_samples.possible_next_actions_mask.shape
            )
        if training_samples.propensities is not None:
            assert (
                training_samples.propensities.shape
                == training_samples.rewards.shape
            ), "Invalid shape: " + str(training_samples.propensities.shape)
        if training_samples.metrics is not None:
            assert (
                training_samples.metrics.shape[0] == self.minibatch_size
            ), "Invalid shape: " + str(training_samples.metrics.shape)

    boosted_rewards = self.boost_rewards(
        training_samples.rewards, training_samples.actions
    )

    self.minibatch += 1
    states = training_samples.states.detach().requires_grad_(True)
    actions = training_samples.actions
    rewards = boosted_rewards
    discount_tensor = torch.full(training_samples.time_diffs.shape, self.gamma).type(
        self.dtype
    )
    not_done_mask = training_samples.not_terminal

    if self.use_seq_num_diff_as_time_diff:
        discount_tensor = discount_tensor.pow(training_samples.time_diffs)

    all_next_q_values, all_next_q_values_target = self.get_detached_q_values(
        training_samples.next_states
    )

    if self.maxq_learning:
        # Compute max a' Q(s', a') over all possible actions using target network
        next_q_values, max_q_action_idxs = self.get_max_q_values(
            all_next_q_values,
            all_next_q_values_target,
            training_samples.possible_next_actions_mask,
        )
    else:
        # SARSA
        next_q_values, max_q_action_idxs = self.get_max_q_values(
            all_next_q_values,
            all_next_q_values_target,
            training_samples.next_actions,
        )

    filtered_next_q_vals = next_q_values * not_done_mask

    if self.minibatch < self.reward_burnin:
        target_q_values = rewards
    else:
        target_q_values = rewards + (discount_tensor * filtered_next_q_vals)

    # Get Q-value of action taken
    all_q_values = self.q_network(states)
    self.all_action_scores = all_q_values.detach()
    q_values = torch.sum(all_q_values * actions, 1, keepdim=True)

    loss = self.q_network_loss(q_values, target_q_values)
    self.loss = loss.detach()

    self.q_network_optimizer.zero_grad()
    loss.backward()
    if self.gradient_handler:
        self.gradient_handler(self.q_network.parameters())
    self.q_network_optimizer.step()

    if self.minibatch < self.reward_burnin:
        # Reward burnin: force target network
        self._soft_update(self.q_network, self.q_network_target, 1.0)
    else:
        # Use the soft update rule to update target network
        self._soft_update(self.q_network, self.q_network_target, self.tau)

    if training_samples.metrics is None:
        metrics_reward_concat_real_vals = training_samples.rewards
    else:
        metrics_reward_concat_real_vals = torch.cat(
            (training_samples.metrics, training_samples.rewards), dim=1
        )

    ######### Train separate reward network for CPE evaluation #############
    reward_estimates = self.reward_network(states)
    logged_action_idxs = actions.argmax(dim=1, keepdim=True)
    reward_estimates_for_logged_actions = reward_estimates.gather(
        1, self.reward_idx_offsets + logged_action_idxs
    )
    reward_loss = F.mse_loss(
        reward_estimates_for_logged_actions, metrics_reward_concat_real_vals
    )
    self.reward_network_optimizer.zero_grad()
    reward_loss.backward()
    self.reward_network_optimizer.step()

    ######### Train separate q-network for CPE evaluation #############
    metric_q_values = self.q_network_cpe(states).gather(
        1, self.reward_idx_offsets + logged_action_idxs
    )
    metric_target_q_values = self.q_network_cpe_target(states).detach()
    max_q_values_metrics = metric_target_q_values.gather(
        1, self.reward_idx_offsets + max_q_action_idxs
    )
    filtered_max_q_values_metrics = max_q_values_metrics * not_done_mask
    if self.minibatch < self.reward_burnin:
        target_metric_q_values = metrics_reward_concat_real_vals
    else:
        target_metric_q_values = metrics_reward_concat_real_vals + (
            discount_tensor * filtered_max_q_values_metrics
        )
    metric_q_value_loss = self.q_network_loss(
        metric_q_values, target_metric_q_values
    )
    self.q_network_cpe.zero_grad()
    metric_q_value_loss.backward()
    self.q_network_cpe_optimizer.step()

    if self.minibatch < self.reward_burnin:
        # Reward burnin: force target network
        self._soft_update(self.q_network_cpe, self.q_network_cpe_target, 1.0)
    else:
        # Use the soft update rule to update target network
        self._soft_update(self.q_network_cpe, self.q_network_cpe_target, self.tau)

    model_propensities = torch.from_numpy(
        Evaluator.softmax(self.all_action_scores.cpu().numpy(), self.rl_temperature)
    )
    self.loss_reporter.report(
        td_loss=self.loss,
        reward_loss=reward_loss,
        logged_actions=logged_action_idxs,
        logged_propensities=training_samples.propensities,
        logged_rewards=rewards,
        logged_values=None,  # Compute at end of each epoch for CPE
        model_propensities=model_propensities,
        model_rewards=reward_estimates[
            :,
            torch.arange(
                self.reward_idx_offsets[0],
                self.reward_idx_offsets[0] + self.num_actions,
            ),
        ],
        model_values=self.all_action_scores,
        model_values_on_logged_actions=None,  # Compute at end of each epoch for CPE
        model_action_idxs=self.all_action_scores.argmax(dim=1, keepdim=True),
    )

    training_metadata = {}
    training_metadata["model_rewards"] = reward_estimates.detach().cpu().numpy()

    return training_metadata
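# The gathers above rely on the CPE networks emitting one column per
# (metric, action) pair, flattened metric-major; reward_idx_offsets is then
# assumed to hold [0, num_actions, 2 * num_actions, ...]. A tiny standalone
# demo of the broadcast + gather (shapes and values are illustrative):
import torch

num_actions, num_metrics = 2, 3
reward_idx_offsets = torch.arange(num_metrics) * num_actions  # [0, 2, 4]
reward_estimates = torch.arange(12.0).reshape(2, 6)  # batch of 2 examples
logged_action_idxs = torch.tensor([[1], [0]])
# [batch, 1] + [num_metrics] broadcasts to [batch, num_metrics], so the gather
# picks the logged action's column within each metric's block.
picked = reward_estimates.gather(1, reward_idx_offsets + logged_action_idxs)
assert picked.tolist() == [[1.0, 3.0, 5.0], [6.0, 8.0, 10.0]]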