import os
import pickle

import numpy as np
import torch
import torch.autograd as ag
import torch.optim as optim

# Project-local components (BaseAlgorithm, Critic, Logger, ReplayBuffer, run_rollout,
# compute_pareto_front, sample_goals) are assumed to be imported from the surrounding package.


class DQN(BaseAlgorithm):

    def __init__(self, env, params):
        """
        DQN algorithm from Mnih et al., 2015. This implementation includes mechanisms from
        Universal Value Function Approximators (Schaul et al., 2015) and Agent57
        (Badia et al., 2020).

        Parameters
        ----------
        env: BaseEnv
            Learning environment.
        params: dict
            Dictionary of parameters.

        Attributes
        ----------
        batch_size: int
            Batch size.
        gamma: float
            Discount factor in [0, 1].
        layers: tuple of ints
            Sizes of the hidden layers of the critics.
        goal_conditioned: bool
            Whether the algorithm is goal conditioned. This is the idea behind UVFA (Schaul et al., 2015).
        replace_target_cnt: int
            Frequency with which target critics are replaced by the current critics (in learning steps).
        logdir: str
            Logging directory.
        save_policy_every: int
            Frequency at which the policy is saved (in episodes).
        eval_and_log_every: int
            Frequency at which logs are printed (in episodes).
        n_evals_if_stochastic: int
            Number of evaluation episodes if the environment is stochastic.
        stochastic: bool
            Whether the environment is stochastic.
        epsilon: float
            Probability of sampling a random action (epsilon-greedy exploration). Set to 0 during evaluation.
        dims: dict
            Dimensions of states and actions.
        cost_function: BaseMultiCostFunction
            Multi-cost function.
        nb_costs: int
            Number of cost functions.
        use_constraints: bool
            Whether the algorithm uses constraints.
        pareto_size: int
            Number of random goals sampled to generate the Pareto front.
        """
        super(DQN, self).__init__(env, params)

        # Save parameters.
        self.batch_size = self.algo_params['batch_size']
        self.gamma = self.algo_params['gamma']
        self.layers = tuple(self.algo_params['layers'])
        self.goal_conditioned = self.algo_params['goal_conditioned']
        self.replace_target_cnt = self.algo_params['replace_target_count']
        self.logdir = params['logdir']
        self.save_policy_every = self.algo_params['save_policy_every']
        self.eval_and_log_every = self.algo_params['eval_and_log_every']
        self.n_evals_if_stochastic = self.algo_params['n_evals_if_stochastic']
        self.stochastic = params['model_params']['stochastic']
        self.epsilon = self.algo_params['epsilon_greedy']
        self.cost_function = self.env.unwrapped.cost_function
        self.nb_costs = self.env.unwrapped.cost_function.nb_costs
        self.use_constraints = self.cost_function.use_constraints
        self.pareto_size = self.algo_params['pareto_size']
        # DQN is not a multi-objective algorithm, unless it is goal-conditioned.
        self.is_multi_obj = True if self.goal_conditioned else False
        self.dims = dict(s=env.observation_space.shape[0],
                         a=env.action_space.n)

        if self.goal_conditioned:
            self.goal_dim = self.env.unwrapped.cost_function.goal_dim
            eval_goals = self.cost_function.get_eval_goals(1)
            goals, index, inverse = np.unique(eval_goals,
                                              return_inverse=True,
                                              return_index=True,
                                              axis=0)
            goal_keys = [str(g) for g in goals]
        else:
            self.goal_dim = 0
            goal_keys = [str(self.cost_function.beta)]

        # Initialize logger.
        if self.logdir:
            os.makedirs(self.logdir + 'models/', exist_ok=True)
            stats_keys = (['mean_agg', 'std_agg'] +
                          ['mean_C{}'.format(i) for i in range(self.nb_costs)] +
                          ['std_C{}'.format(i) for i in range(self.nb_costs)])
            keys = ['Episode', 'Best score so far', 'Eval score']
            for k in goal_keys:
                for s in stats_keys:
                    keys.append('Eval, g: ' + k + ': ' + s)
            keys += (['Loss {}'.format(i + 1) for i in range(self.nb_costs)] +
                     ['Train, Cost {}'.format(i + 1) for i in range(self.nb_costs)] +
                     ['Train, Aggregated cost'])
            self.logger = Logger(keys=keys, logdir=self.logdir)

        # Initialize replay buffer.
        self.replay_buffer = ReplayBuffer(self.algo_params['buffer_size'])

        # Initialize critics.
        self.Q_eval = Critic(
            n_critics=self.nb_costs,
            dim_state=self.dims['s'],
            dim_goal=self.goal_dim,
            dim_actions=self.dims['a'],
            goal_ids=((), ()),  # no goal input: the mixing parameter is applied after the critics
            layers=self.layers)
        self.Q_next = Critic(
            n_critics=self.nb_costs,
            dim_state=self.dims['s'],
            dim_goal=self.goal_dim,
            dim_actions=self.dims['a'],
            goal_ids=((), ()),
            layers=self.layers)

        # Initialize optimizers.
        self.optimizers = [
            optim.Adam(q.parameters(), lr=self.algo_params['lr'])
            for q in self.Q_eval.qs
        ]

        # If we use constraints, we train one Q-network per constraint with a negative reward of -1
        # whenever the constraint is violated. Each of these networks learns to estimate the number
        # of times the constraint will be violated in the future. We then use them to guide action
        # selection: we pick an action that is not expected to violate the constraints when one
        # exists, and the action that minimizes constraint violations when all actions lead to
        # violations.
        if self.use_constraints:
            self.nb_constraints = len(self.cost_function.constraints_ids)
            self.Q_eval_constraints = Critic(
                n_critics=self.nb_constraints,
                dim_state=self.dims['s'],
                dim_goal=self.goal_dim,
                dim_actions=self.dims['a'],
                goal_ids=self.cost_function.constraints_ids,
                layers=self.layers)
            self.Q_next_constraints = Critic(
                n_critics=self.nb_constraints,
                dim_state=self.dims['s'],
                dim_goal=self.goal_dim,
                dim_actions=self.dims['a'],
                goal_ids=self.cost_function.constraints_ids,
                layers=self.layers)
            self.optimizers_constraints = [
                optim.Adam(q.parameters(), lr=self.algo_params['lr'])
                for q in self.Q_eval_constraints.qs
            ]
        else:
            self.nb_constraints = 0

        # Initialize counters.
        self.learn_step_counter = 0
        self.env_step_counter = 0
        self.episode = 0
        self.best_cost = np.inf
        self.aggregated_costs = []
        self.costs = []
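
    # Sketch of the configuration read by this constructor (hypothetical values; the exact
    # structure of `params` is defined by BaseAlgorithm, which is assumed to expose part of it
    # as `self.algo_params`):
    #
    #     algo_params = {
    #         'batch_size': 64, 'gamma': 0.99, 'layers': (64, 64),
    #         'goal_conditioned': False, 'replace_target_count': 500,
    #         'save_policy_every': 50, 'eval_and_log_every': 10,
    #         'n_evals_if_stochastic': 10, 'epsilon_greedy': 0.1,
    #         'pareto_size': 100, 'buffer_size': 100000, 'lr': 1e-3,
    #     }
    #     params = {'logdir': 'runs/dqn/', 'model_params': {'stochastic': False}, ...}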

    def _replace_target_network(self):
        """
        Replaces the target networks with the evaluation networks every `self.replace_target_cnt`
        learning steps.
        """
        if self.replace_target_cnt is not None and self.learn_step_counter % self.replace_target_cnt == 0:
            self.Q_next.set_goal_params(self.Q_eval.get_params())
            if self.use_constraints:
                self.Q_next_constraints.set_goal_params(
                    self.Q_eval_constraints.get_params())

    def _update(self, batch_size):
        """
        Performs network updates according to the DQN algorithm. Here we update several critics:
        one per cost and one per constraint. These critics are then used at decision time: we first
        keep the actions that are not expected to violate the constraints, then select the action
        that maximizes a convex combination of the (negative) costs weighted by the mixing
        parameter beta. Beta can be provided by the experimenter (dqn) or selected by the agent
        (goal_dqn).

        Parameters
        ----------
        batch_size: int
            Batch size.

        Returns
        -------
        losses: list
            One loss per cost critic, plus one per constraint critic when constraints are used.
        """
        # Reset gradients of the optimizers.
        for opt in self.optimizers:
            opt.zero_grad()
        if self.use_constraints:
            for opt in self.optimizers_constraints:
                opt.zero_grad()

        # Update target networks.
        self._replace_target_network()

        # Sample a batch.
        state, action, cost_aggregated, costs, next_state, goal, done, constraints = \
            self.replay_buffer.sample(batch_size)

        # Concatenate the goal if the policy is goal conditioned.
        if self.goal_conditioned:
            state = ag.Variable(
                torch.FloatTensor(
                    np.float32(np.concatenate([state, goal], axis=1))))
            next_state = ag.Variable(
                torch.FloatTensor(
                    np.float32(np.concatenate([next_state, goal], axis=1))))
        else:
            state = ag.Variable(torch.FloatTensor(np.float32(state)))
            next_state = ag.Variable(torch.FloatTensor(np.float32(next_state)))
        action = ag.Variable(torch.LongTensor(action))
        indices = np.arange(self.batch_size)

        # Rewards are the negative scaled costs.
        rewards = [
            -ag.Variable(torch.FloatTensor(list(c_func.scale(c))))
            for c_func, c in zip(self.cost_function.costs, costs.transpose())
        ]

        # Double-DQN targets: actions are selected by the online critics and evaluated by the target critics.
        q_preds = self.Q_eval.forward(state)
        q_preds = [q_p[indices, action] for q_p in q_preds]
        q_nexts = self.Q_next.forward(next_state)
        q_evals = self.Q_eval.forward(next_state)
        max_actions = [torch.argmax(q_ev, dim=1) for q_ev in q_evals]
        q_targets = [
            r + self.gamma * q_nex[indices, max_act]
            for r, q_nex, max_act in zip(rewards, q_nexts, max_actions)
        ]
        losses = [(q_pre - ag.Variable(q_targ.data)).pow(2).mean()
                  for q_pre, q_targ in zip(q_preds, q_targets)]
        for loss in losses:
            loss.backward()
        for opt in self.optimizers:
            opt.step()

        if self.use_constraints:
            constraints = [
                ag.Variable(torch.FloatTensor(constraints[:, i]))
                for i in range(self.nb_constraints)
            ]
            q_preds = list(self.Q_eval_constraints.forward(state))
            q_preds = [q_p[indices, action.squeeze()] for q_p in q_preds]
            q_nexts = self.Q_next_constraints.forward(next_state)
            q_evals = self.Q_eval_constraints.forward(next_state)
            for i_q in range(self.nb_constraints):
                max_actions = torch.argmax(q_evals[i_q], dim=1)
                # Undiscounted target: the constraint critic estimates the number of future violations.
                q_target = -constraints[i_q] + 1 * q_nexts[i_q][indices, max_actions]
                losses.append(
                    (q_preds[i_q] - ag.Variable(q_target.data)).pow(2).mean())
                losses[-1].backward()
            for opt in self.optimizers_constraints:
                opt.step()

        self.learn_step_counter += 1
        return losses
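
    # Illustration of the target computed above for each cost critic i (Double-DQN style,
    # with rewards defined as negative scaled costs):
    #
    #     a* = argmax_a Q_eval_i(s', a)                        (action picked by the online critic)
    #     y_i = -scaled_cost_i + gamma * Q_next_i(s', a*)      (evaluated by the target critic)
    #     loss_i = mean((Q_eval_i(s, a) - y_i) ** 2)
    #
    # Constraint critics use the same scheme with a reward of -1 per violation and no discounting,
    # so they estimate the number of expected future violations.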

    def store_episodes(self, episodes):
        lengths = []
        for e in episodes:
            for t in range(e['env_states'].shape[0] - 1):
                self.replay_buffer.push(
                    state=e['env_states'][t],
                    action=e['actions'][t],
                    aggregated_cost=e['aggregated_costs'][t],
                    costs=e['costs'][t],
                    next_state=e['env_states'][t + 1],
                    constraints=e['constraints'][t],
                    goal=e['goal'],
                    done=e['dones'][t])
            lengths.append(e['env_states'].shape[0] - 1)
        return lengths

    def act(self, state, deterministic=False):
        """
        Policy that uses the learned critics.

        Parameters
        ----------
        state: 1D nd.array
            Current state.
        deterministic: bool
            Whether the policy should be deterministic (e.g. in evaluation mode).

        Returns
        -------
        action: nd.array
            Action vector.
        q_constraints: nd.array or None
            Values of the critics estimating the expected constraint violations (None when
            constraints are not used or a random action is taken).
        """
        if np.random.rand() > self.epsilon or deterministic:
            if self.use_constraints:
                # If we use constraints, the set of actions is first filtered by the constraint
                # critics so that we pick an action that is not expected to lead to constraint
                # violations. Among the remaining actions, we take the one that maximizes the
                # mixture of critics estimating the negative costs expected in the future. If all
                # actions lead to constraint violations, we choose the one that minimizes them.
                with torch.no_grad():
                    if state.dtype == object:
                        state = ag.Variable(
                            torch.FloatTensor(list(state)).unsqueeze(0))
                    else:
                        state = ag.Variable(
                            torch.FloatTensor(state).unsqueeze(0))
                    q_value1, q_value2 = self.Q_eval.forward(state)
                    beta = self.cost_function.beta
                    q_constraints = torch.cat(
                        self.Q_eval_constraints.forward(state)).numpy()
                    # Clamp to 0: constraint Q-values must be negative.
                    q_constraints_clipped = q_constraints.clip(max=0)
                    q_constraints_worst = q_constraints_clipped.min(axis=0)
                    valid_ids = np.argwhere(q_constraints_worst > -1).flatten()
                    if valid_ids.size == 0:
                        action = np.argmax(q_constraints.sum(axis=0))
                    else:
                        q_value = (1 - beta) * q_value1[0, valid_ids] + \
                            beta * q_value2[0, valid_ids]
                        action = valid_ids[np.argmax(q_value.numpy())]
            else:
                # Without constraints, the best action is the one that maximizes the mixture of
                # values weighted by the chosen mixing parameter beta (set by the experimenter or
                # by the agent).
                with torch.no_grad():
                    if state.dtype == object:
                        state = ag.Variable(
                            torch.FloatTensor(list(state)).unsqueeze(0))
                    else:
                        state = ag.Variable(
                            torch.FloatTensor(state).unsqueeze(0))
                    q_value1, q_value2 = self.Q_eval.forward(state)
                    beta = self.cost_function.beta
                    q_value = (1 - beta) * q_value1 + beta * q_value2
                    action = int(q_value.max(1)[1].data[0])
                q_constraints = None
        else:
            # Epsilon-greedy exploration: random action with probability epsilon.
            action = np.random.randint(self.dims['a'])
            q_constraints = None
        return np.atleast_1d(action), q_constraints
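
    # Worked example of the constraint filter above (hypothetical numbers, 2 constraints, 3 actions):
    #
    #     q_constraints (clipped to <= 0):  [[-0.2, -1.4,  0.0],
    #                                        [ 0.0, -0.3, -0.1]]
    #     worst case per action:            [-0.2, -1.4, -0.1]
    #
    # Actions 0 and 2 pass the `> -1` test, so the mixed cost critics choose among {0, 2}.
    # If no action passed, we would pick the action with the largest summed constraint value
    # instead, i.e. the one expected to violate constraints the least.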

    def update(self):
        """
        Runs one update of the algorithm and returns the losses as floats.
        """
        if self.env_step_counter > 0:
            losses = self._update(self.batch_size)
            return [np.atleast_1d(l.data)[0] for l in losses]
        else:
            return [0] * (2 + self.nb_constraints)

    def save_model(self, path):
        """
        Extract the model state dicts and save them.

        Parameters
        ----------
        path: str
            Saving path.
        """
        q_eval = self.Q_eval.get_model()
        to_save = [q_eval]
        if self.use_constraints:
            q_constraints = self.Q_eval_constraints.get_model()
            to_save.append(q_constraints)
        with open(path, 'wb') as f:
            torch.save(to_save, f)

    def load_model(self, path):
        """
        Load a model from file and feed the critics' state dicts.

        Parameters
        ----------
        path: str
            Loading path.
        """
        with open(path, 'rb') as f:
            out = torch.load(f)
        try:
            self.Q_eval.set_model(out[0])
        except Exception:
            self.Q_eval.set_model(out)
        if self.use_constraints:
            self.Q_eval_constraints.set_model(out[1])

    def learn(self, num_train_steps):
        """
        Main training loop.

        Parameters
        ----------
        num_train_steps: int
            Number of training steps (environment steps).
        """
        while self.env_step_counter < num_train_steps:
            if self.goal_conditioned:
                goal = self.env.unwrapped.sample_cost_function_params()
            else:
                goal = None
            episodes = run_rollout(
                policy=self,
                env=self.env,
                n=1,
                goal=goal,
                eval=False,
                additional_keys=('costs', 'constraints'),
            )
            lengths = self.store_episodes(episodes)
            self.env_step_counter += np.sum(lengths)
            self.episode += 1
            self.aggregated_costs.append(
                np.sum(episodes[0]['aggregated_costs']))
            self.costs.append(np.sum(episodes[0]['costs'], axis=0))

            # Update the critics (one update for every two environment steps collected).
            if len(self.replay_buffer) > self.batch_size:
                update_losses = []
                for _ in range(int(np.sum(lengths) * 0.5)):
                    update_losses.append(self.update())
                update_losses = np.array(update_losses)
                losses = update_losses.mean(axis=0)
            else:
                losses = [np.nan] * 2

            if self.episode % self.eval_and_log_every == 0:
                # Run evaluations.
                new_logs, eval_costs = self.evaluate(
                    n=self.n_evals_if_stochastic if self.stochastic else 1)
                # Compute train scores.
                train_agg_cost = np.mean(self.aggregated_costs)
                train_costs = np.array(self.costs).mean(axis=0)
                self.log(self.episode, new_logs, losses, train_agg_cost,
                         train_costs)
                # Reset training score tracking.
                self.aggregated_costs = []
                self.costs = []

            if self.episode % self.save_policy_every == 0:
                self.save_model(self.logdir +
                                '/models/policy_{}.cp'.format(self.episode))

        self.evaluate_pareto()
        print('Run has terminated successfully')

    def evaluate(self, n=None, goal=None, best=None, reset_same_model=False):
        # Run evaluation rollouts.
        if n is None:
            n = self.n_evals_if_stochastic if self.env.unwrapped.stochastic else 1
        if self.goal_conditioned:
            if goal is not None:
                eval_goals = np.array([goal] * n)
            else:
                eval_goals = self.cost_function.get_eval_goals(n)
                n = eval_goals.shape[0]
        else:
            eval_goals = None
        eval_episodes = run_rollout(
            policy=self,
            env=self.env,
            n=n,
            goal=eval_goals,
            eval=True,
            reset_same_model=reset_same_model,
            additional_keys=('costs', 'constraints'),
        )
        new_logs, costs = self.compute_eval_score(eval_episodes, eval_goals)
        return new_logs, costs

    def compute_eval_score(self, eval_episodes, eval_goals):
        aggregated_costs = [
            np.sum(e['aggregated_costs']) for e in eval_episodes
        ]
        costs = np.array([np.sum(e['costs'], axis=0) for e in eval_episodes],
                         dtype=np.float64)
        new_logs = dict()
        if self.goal_conditioned:
            goals, index, inverse = np.unique(eval_goals,
                                              return_inverse=True,
                                              return_index=True,
                                              axis=0)
            agg_means = []
            for g, i in zip(goals, np.arange(index.size)):
                ind_g = np.argwhere(inverse == i).flatten()
                costs_mean = np.mean(costs[ind_g], axis=0)
                costs_std = np.std(costs[ind_g], axis=0)
                agg_rew_mean = np.mean(np.array(aggregated_costs)[ind_g], axis=0)
                agg_rew_std = np.std(np.array(aggregated_costs)[ind_g], axis=0)
                for i_r in range(self.nb_costs):
                    new_logs['Eval, g: ' + str(g) + ': ' +
                             'mean_C{}'.format(i_r)] = costs_mean[i_r]
                    new_logs['Eval, g: ' + str(g) + ': ' +
                             'std_C{}'.format(i_r)] = costs_std[i_r]
                new_logs['Eval, g: ' + str(g) + ': ' + 'mean_agg'] = agg_rew_mean
                new_logs['Eval, g: ' + str(g) + ': ' + 'std_agg'] = agg_rew_std
                agg_means.append(agg_rew_mean)
            new_logs['Eval score'] = np.mean(agg_means)
        else:
            costs_mean = np.mean(np.atleast_2d(costs), axis=0)
            costs_std = np.std(np.atleast_2d(costs), axis=0)
            for i_r in range(self.nb_costs):
                new_logs['Eval, g: ' + str(self.cost_function.beta) + ': ' +
                         'mean_C{}'.format(i_r)] = costs_mean[i_r]
                new_logs['Eval, g: ' + str(self.cost_function.beta) + ': ' +
                         'std_C{}'.format(i_r)] = costs_std[i_r]
            new_logs['Eval score'] = np.mean(aggregated_costs)
            new_logs['Eval, g: ' + str(self.cost_function.beta) + ': ' +
                     'mean_agg'] = np.mean(aggregated_costs)
            new_logs['Eval, g: ' + str(self.cost_function.beta) + ': ' +
                     'std_agg'] = np.std(aggregated_costs)
        return new_logs, costs

    def log(self, episode, new_logs, losses, train_agg_cost, train_costs):
        if new_logs['Eval score'] < self.best_cost:
            self.best_cost = new_logs['Eval score']
            self.save_model(self.logdir + '/models/best_model.cp')
        train_log_dict = {
            'Episode': episode,
            'Best score so far': self.best_cost
        }
        for i in range(self.nb_costs):
            train_log_dict['Loss {}'.format(i + 1)] = losses[i]
            train_log_dict['Train, Cost {}'.format(i + 1)] = train_costs[i]
        train_log_dict['Train, Aggregated cost'] = train_agg_cost
        new_logs.update(train_log_dict)
        self.logger.add(new_logs)
        self.logger.print_last()
        self.logger.save()

    def evaluate_pareto(self, load_model=True):
        if load_model:
            self.load_model(self.logdir + '/models/best_model.cp')
        if self.goal_conditioned:
            print('----------------\nForming Pareto front')
            goals = sample_goals(self.pareto_size, self.cost_function.goal_dim)
            res = dict()
            costs_mean = []
            costs_std = []
            n = self.n_evals_if_stochastic if self.env.unwrapped.stochastic else 1
            for i_g, g in enumerate(goals):
                if (i_g + 1) % 20 == 0:
                    print('\t{:.2f} %'.format(
                        (i_g + 1) / goals.shape[0] * 100))
                gs = np.atleast_2d(np.array([g for _ in range(n)]))
                if gs.shape[0] != n:
                    gs = gs.transpose()
                episodes = run_rollout(
                    policy=self,
                    env=self.env,
                    n=n,
                    goal=gs,
                    eval=True,
                    additional_keys=['costs'],
                )
                costs = np.array(
                    [np.array(e['costs']).sum(axis=0) for e in episodes])
                costs_mean.append(costs.mean(axis=0))
                costs_std.append(costs.std(axis=0))
            res['F_all'] = np.array(costs_mean)
            res['F_std_all'] = np.array(costs_std)
            res['G_all'] = goals
            # Keep only the non-dominated goals to form the Pareto front.
            front_ids = compute_pareto_front(costs_mean)
            costs_mean = np.array(costs_mean)[front_ids]
            costs_std = np.array(costs_std)[front_ids]
            res['F'] = costs_mean
            res['F_std'] = costs_std
            with open(self.logdir + 'res_eval.pk', 'wb') as f:
                pickle.dump(res, f)
        else:
            print('----------------\nForming Pareto front')
            res = dict()
            costs_mean = []
            costs_std = []
            n = self.n_evals_if_stochastic if self.env.unwrapped.stochastic else 1
            episodes = run_rollout(
                policy=self,
                env=self.env,
                n=n,
                eval=True,
                additional_keys=['costs'],
            )
            costs = np.array(
                [np.array(e['costs']).sum(axis=0) for e in episodes])
            costs_mean.append(costs.mean(axis=0))
            costs_std.append(costs.std(axis=0))
            res['F'] = np.array(costs_mean)
            res['F_std'] = np.array(costs_std)
            for k in list(res.keys()):
                res[k + '_all'] = res[k]
            res['G_all'] = np.array([[
                self.cost_function.beta_default
                for _ in range(len(costs_mean))
            ]])
            with open(self.logdir + 'res_eval.pk', 'wb') as f:
                pickle.dump(res, f)
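
# Minimal usage sketch (hypothetical): `make_env` and the exact structure of `params` are
# placeholders; the environment must expose `unwrapped.cost_function` as assumed above.
#
#     env = make_env(model_params={'stochastic': False})
#     agent = DQN(env, params)
#     agent.learn(num_train_steps=100000)  # trains, logs and checkpoints periodically
#     agent.evaluate_pareto()              # dumps the Pareto front to logdir + 'res_eval.pk'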