def generate_advantage(data_dict, baseline_network):
    ''' @brief: calculate the advantage estimates and value-function targets
            for every rollout path in @data_dict, using the value predictions
            of @baseline_network.

        @input:
            data_dict: iterable of path dicts; each path must provide
                'rewards' (per-step rewards; presumably a 1-d numpy array --
                TODO confirm against the sampler)
            baseline_network: value-function model exposing .predict(path)
                and .args (advantage_method, gamma, gae_lam)

        @output:
            For every path this sets:
                path['baseline']: predicted values V(s_t)
                path['returns']: gamma-discounted rollout returns
                path['advantage']: 'raw' or 'gae' advantage estimates
                path['target_return']: regression target for the baseline
            Returns the concatenated advantages over all paths, standardized
            to mean 0 / stddev 1.

        @raise:
            AssertionError if args.advantage_method is neither 'raw' nor 'gae'
    '''
    # the predicted value function (baseline function)
    for path in data_dict:
        path["baseline"] = baseline_network.predict(path)

    advantage_method = baseline_network.args.advantage_method
    gamma = baseline_network.args.gamma
    gae_lam = baseline_network.args.gae_lam

    # estimate the advantages
    if advantage_method == 'raw':
        for path in data_dict:
            # the gamma discounted rollout value function
            path["returns"] = utils.discount(path["rewards"], gamma)
            path["advantage"] = path["returns"] - path["baseline"]
            path['target_return'] = path['returns']
    else:
        if advantage_method != 'gae':
            # NOTE: the original `assert cond, logger.error(...)` always
            # produced a None assertion message (logger.error returns None)
            # and was stripped under `python -O`; log then raise explicitly,
            # keeping AssertionError so existing callers still catch it.
            msg = 'invalid advantage estimation method: {}'.format(
                advantage_method)
            logger.error(msg)
            raise AssertionError(msg)

        for path in data_dict:
            # the gamma discounted rollout value function
            path["returns"] = utils.discount(path["rewards"], gamma)

            num_steps = len(path['returns'])
            path["advantage"] = np.zeros(path['returns'].shape)

            # generate the GAE advantage backwards. Beyond the final step the
            # bootstrap value and the advantage are both taken as 0, which
            # reproduces the original terminal-step special case
            # (delta_T = r_T - V(s_T)); assumes rollouts end in a terminal
            # state -- TODO confirm truncated rollouts should not bootstrap.
            next_advantage = 0.0
            for i_step in reversed(range(num_steps)):
                next_baseline = path['baseline'][i_step + 1] \
                    if i_step + 1 < num_steps else 0.0
                delta = path['rewards'][i_step] \
                    + gamma * next_baseline \
                    - path['baseline'][i_step]
                next_advantage = delta + gamma * gae_lam * next_advantage
                path['advantage'][i_step] = next_advantage

            path['target_return'] = path['advantage'] + path['baseline']

    # standardized advantage function (mean 0 / stddev 1). NOTE: only the
    # returned concatenated copy is standardized; the per-path 'advantage'
    # arrays keep their raw scale.
    advant_n = np.concatenate([path["advantage"] for path in data_dict])
    advant_n -= advant_n.mean()
    advant_n /= (advant_n.std() + 1e-8)
    return advant_n
def generate_advantage(self, data_dict, feed_dict):
    ''' @brief: calculate the advantage estimates and value-function targets
            for every rollout path in @data_dict, using the value predictions
            of self.baseline_network.

        @input:
            data_dict: iterable of path dicts; each path must provide
                'rewards' (per-step rewards; presumably a 1-d numpy array --
                TODO confirm against the sampler)
            feed_dict: batched input for the GNN baseline; only used when
                self.args.use_gnn_as_value is set

        @output:
            For every path this sets:
                path['baseline']: predicted values V(s_t)
                path['returns']: gamma-discounted rollout returns
                path['advantage']: 'raw' or 'gae' advantage estimates
                path['target_return']: regression target for the baseline
            Returns the concatenated advantages over all paths, standardized
            to mean 0 / stddev 1.

        @raise:
            AssertionError if the GNN baseline predicts a different number of
            steps than the paths contain, or if args.advantage_method is
            neither 'raw' nor 'gae'
    '''
    # get the baseline function
    if self.args.use_gnn_as_value:
        # the GNN predicts one flat array for the whole batch; slice it back
        # into per-path segments in rollout order
        baseline_data = self.baseline_network.predict(feed_dict)
        current_id = 0
        for path in data_dict:
            path['baseline'] = baseline_data[
                current_id: current_id + len(path['rewards'])]
            current_id += len(path['rewards'])
        if current_id != len(baseline_data):
            # NOTE: the original `assert cond, logger.error(...)` always
            # produced a None assertion message (logger.error returns None)
            # and was stripped under `python -O`; log then raise explicitly,
            # keeping AssertionError so existing callers still catch it.
            msg = 'Extra baseline predicted? ({} vs {})'.format(
                current_id, len(baseline_data))
            logger.error(msg)
            raise AssertionError(msg)
    else:
        for path in data_dict:
            # the predicted value function (baseline function)
            path["baseline"] = self.baseline_network.predict(path)

    # estimate the advantages
    if self.args.advantage_method == 'raw':
        for path in data_dict:
            # the gamma discounted rollout value function
            path["returns"] = utils.discount(path["rewards"],
                                             self.args.gamma)
            path["advantage"] = path["returns"] - path["baseline"]
            path['target_return'] = path['returns']
    else:
        if self.args.advantage_method != 'gae':
            # see NOTE above: explicit raise instead of assert-with-logger
            msg = 'invalid advantage estimation method: {}'.format(
                self.args.advantage_method)
            logger.error(msg)
            raise AssertionError(msg)

        gamma = self.args.gamma
        gae_lam = self.args.gae_lam
        for path in data_dict:
            # the gamma discounted rollout value function
            path["returns"] = utils.discount(path["rewards"], gamma)

            num_steps = len(path['returns'])
            path["advantage"] = np.zeros(path['returns'].shape)

            # generate the GAE advantage backwards. Beyond the final step the
            # bootstrap value and the advantage are both taken as 0, which
            # reproduces the original terminal-step special case
            # (delta_T = r_T - V(s_T)); assumes rollouts end in a terminal
            # state -- TODO confirm truncated rollouts should not bootstrap.
            next_advantage = 0.0
            for i_step in reversed(range(num_steps)):
                next_baseline = path['baseline'][i_step + 1] \
                    if i_step + 1 < num_steps else 0.0
                delta = path['rewards'][i_step] \
                    + gamma * next_baseline \
                    - path['baseline'][i_step]
                next_advantage = delta + gamma * gae_lam * next_advantage
                path['advantage'][i_step] = next_advantage

            path['target_return'] = path['advantage'] + path['baseline']

    # standardized advantage function (mean 0 / stddev 1). NOTE: only the
    # returned concatenated copy is standardized; the per-path 'advantage'
    # arrays keep their raw scale.
    advant_n = np.concatenate([path["advantage"] for path in data_dict])
    advant_n -= advant_n.mean()
    advant_n /= (advant_n.std() + 1e-8)
    return advant_n