def testSine(self):
    for optimizer in [MAMLFirstOrderOptimizer()]:
        tf.reset_default_graph()
        with tf.Session():
            input_phs = tf.placeholder(dtype=tf.float32, shape=[None, 1])
            target_phs = tf.placeholder(dtype=tf.float32, shape=[None, 1])
            network = Mlp(input_phs, 1, hidden_size=(32, 32), name='sin')

            loss = tf.reduce_mean(tf.square(network.output - target_phs))
            input_ph_dict = OrderedDict({'x': input_phs, 'y': target_phs})

            optimizer.build_graph(loss, network, input_ph_dict)

            sess = tf.get_default_session()
            sess.run(tf.global_variables_initializer())

            for i in range(5000):
                xs = np.random.normal(0, 3, (1000, 1))
                ys = np.sin(xs)
                inputs = {'x': xs, 'y': ys}
                optimizer.optimize(inputs)
                if i % 100 == 0:
                    print(optimizer.loss(inputs))

            xs = np.random.normal(0, 3, (100, 1))
            ys = np.sin(xs)
            y_pred = sess.run(
                network.output,
                feed_dict=dict(list(zip(input_ph_dict.values(), (xs, ys)))))

            self.assertLessEqual(np.mean((ys - y_pred) ** 2), 0.02)
def __init__(self,
             max_path_length,
             *args,
             name="dice_maml",
             learning_rate=1e-3,
             **kwargs):
    super(DICEMAML, self).__init__(*args, **kwargs)

    self.optimizer = MAMLFirstOrderOptimizer(learning_rate=learning_rate)
    self.max_path_length = max_path_length
    self._optimization_keys = [
        'observations', 'actions', 'adjusted_rewards', 'mask', 'agent_infos'
    ]
    self.name = name

    self.build_graph()
def __init__( self, *args, name="vpg_maml", learning_rate=1e-3, inner_type='likelihood_ratio', exploration=False, **kwargs ): super(VPGMAML, self).__init__(*args, **kwargs) assert inner_type in ["log_likelihood", "likelihood_ratio"] self.optimizer = MAMLFirstOrderOptimizer(learning_rate=learning_rate) self.inner_type = inner_type self._optimization_keys = ['observations', 'actions', 'advantages', 'agent_infos'] self.name = name self.exploration = exploration if exploration: # add adjusted average rewards tp optimization keys self._optimization_keys.append('adj_avg_rewards') self.build_graph()
def __init__(self, *args, name="vpg", learning_rate=1e-3, inner_type='likelihood_ratio', **kwargs): super(VPG, self).__init__(*args, **kwargs) assert inner_type in ["log_likelihood", "likelihood_ratio"] self.inner_type = inner_type self.recurrent = getattr(self.policy, 'recurrent', False) if self.recurrent: self.optimizer = RL2FirstOrderOptimizer( learning_rate=learning_rate) else: self.optimizer = MAMLFirstOrderOptimizer( learning_rate=learning_rate) self._optimization_keys = [ 'observations', 'actions', 'advantages', 'agent_infos' ] self.name = name self.build_graph()
class VPGMAML(MAMLAlgo):
    """
    Algorithm for VPG MAML

    Args:
        policy (Policy): policy object
        name (str): tf variable scope
        learning_rate (float): learning rate for the meta-objective
        exploration (bool): use exploration / pre-update sampling term / E-MAML term
        inner_type (str): inner optimization objective - either log_likelihood or likelihood_ratio
        inner_lr (float) : gradient step size used for inner step
        meta_batch_size (int): number of meta-learning tasks
        num_inner_grad_steps (int) : number of gradient updates taken per maml iteration
        trainable_inner_step_size (boolean): whether make the inner step size a trainable variable
    """

    def __init__(
            self,
            *args,
            name="vpg_maml",
            learning_rate=1e-3,
            inner_type='likelihood_ratio',
            exploration=False,
            **kwargs
            ):
        super(VPGMAML, self).__init__(*args, **kwargs)

        assert inner_type in ["log_likelihood", "likelihood_ratio"]
        self.optimizer = MAMLFirstOrderOptimizer(learning_rate=learning_rate)
        self.inner_type = inner_type
        self._optimization_keys = ['observations', 'actions', 'advantages', 'agent_infos']
        self.name = name
        self.exploration = exploration
        if exploration:  # add adjusted average rewards to optimization keys
            self._optimization_keys.append('adj_avg_rewards')

        self.build_graph()

    def _adapt_objective_sym(self, action_sym, adv_sym, dist_info_old_sym, dist_info_new_sym):
        if self.inner_type == 'likelihood_ratio':
            with tf.variable_scope("likelihood_ratio"):
                likelihood_ratio_adapt = self.policy.distribution.likelihood_ratio_sym(
                    action_sym, dist_info_old_sym, dist_info_new_sym)
            with tf.variable_scope("surrogate_loss"):
                surr_obj_adapt = -tf.reduce_mean(likelihood_ratio_adapt * adv_sym)
        elif self.inner_type == 'log_likelihood':
            with tf.variable_scope("log_likelihood"):
                log_likelihood_adapt = self.policy.distribution.log_likelihood_sym(
                    action_sym, dist_info_new_sym)
            with tf.variable_scope("surrogate_loss"):
                surr_obj_adapt = -tf.reduce_mean(log_likelihood_adapt * adv_sym)
        else:
            raise NotImplementedError

        return surr_obj_adapt

    def build_graph(self):
        """
        Creates the computation graph

        Notes:
            Pseudocode:
            for task in meta_batch_size:
                make_vars
                init_init_dist_sym
            for step in num_inner_grad_steps:
                for task in meta_batch_size:
                    make_vars
                    update_init_dist_sym
            set objectives for optimizer
        """

        """ Create Variables """
        with tf.variable_scope(self.name):
            self.step_sizes = self._create_step_size_vars()

            """ --- Build inner update graph for adapting the policy and sampling trajectories --- """
            # this graph is only used for adapting the policy and not computing the meta-updates
            self.adapted_policies_params, self.adapt_input_ph_dict = self._build_inner_adaption()

        """ ----- Build graph for the meta-update ----- """
        self.meta_op_phs_dict = OrderedDict()
        obs_phs, action_phs, adv_phs, dist_info_old_phs, all_phs_dict = self._make_input_placeholders('step0')
        self.meta_op_phs_dict.update(all_phs_dict)

        distribution_info_vars, current_policy_params = [], []
        all_surr_objs = []

        for i in range(self.meta_batch_size):
            dist_info_sym = self.policy.distribution_info_sym(obs_phs[i], params=None)
            distribution_info_vars.append(dist_info_sym)  # step 0
            current_policy_params.append(self.policy.policy_params)  # set to real policy_params (tf.Variable)

        initial_distribution_info_vars = distribution_info_vars
        initial_action_phs = action_phs

        with tf.variable_scope(self.name):
            """ Inner updates"""
            for step_id in range(1, self.num_inner_grad_steps + 1):
                surr_objs, adapted_policy_params = [], []

                # inner adaptation step for each task
                for i in range(self.meta_batch_size):
                    surr_loss = self._adapt_objective_sym(
                        action_phs[i], adv_phs[i], dist_info_old_phs[i], distribution_info_vars[i])

                    adapted_params_var = self._adapt_sym(surr_loss, current_policy_params[i])

                    adapted_policy_params.append(adapted_params_var)
                    surr_objs.append(surr_loss)

                all_surr_objs.append(surr_objs)

                # Create new placeholders for the next step
                obs_phs, action_phs, adv_phs, dist_info_old_phs, all_phs_dict = \
                    self._make_input_placeholders('step%i' % step_id)
                self.meta_op_phs_dict.update(all_phs_dict)

                # dist_info_vars_for_next_step
                distribution_info_vars = [
                    self.policy.distribution_info_sym(obs_phs[i], params=adapted_policy_params[i])
                    for i in range(self.meta_batch_size)]
                current_policy_params = adapted_policy_params

            """ Outer objective """
            surr_objs = []

            # meta-objective
            for i in range(self.meta_batch_size):
                log_likelihood = self.policy.distribution.log_likelihood_sym(
                    action_phs[i], distribution_info_vars[i])
                surr_obj = -tf.reduce_mean(log_likelihood * adv_phs[i])

                if self.exploration:
                    # add adj_avg_reward placeholder
                    adj_avg_rewards = tf.placeholder(
                        dtype=tf.float32,
                        shape=[None],
                        name='adj_avg_rewards' + '_' + str(self.num_inner_grad_steps) + '_' + str(i))
                    self.meta_op_phs_dict['step%i_task%i_%s' % (self.num_inner_grad_steps, i, 'adj_avg_rewards')] = adj_avg_rewards

                    log_likelihood_initial = self.policy.distribution.log_likelihood_sym(
                        initial_action_phs[i], initial_distribution_info_vars[i])
                    surr_obj += -tf.reduce_mean(adj_avg_rewards) * tf.reduce_mean(log_likelihood_initial)

                surr_objs.append(surr_obj)

            """ Mean over meta tasks """
            meta_objective = tf.reduce_mean(tf.stack(surr_objs, 0))

            self.optimizer.build_graph(
                loss=meta_objective,
                target=self.policy,
                input_ph_dict=self.meta_op_phs_dict,
            )

    def optimize_policy(self, all_samples_data, log=True):
        """
        Performs MAML outer step

        Args:
            all_samples_data (list) : list of lists of lists of samples (each is a dict) split by
                gradient update and meta task
            log (bool) : whether to log statistics

        Returns:
            None
        """
        meta_op_input_dict = self._extract_input_dict_meta_op(all_samples_data, self._optimization_keys)

        if log:
            logger.log("Optimizing")
        loss_before = self.optimizer.optimize(input_val_dict=meta_op_input_dict)

        if log:
            logger.log("Computing statistics")
        loss_after = self.optimizer.loss(input_val_dict=meta_op_input_dict)

        if log:
            logger.logkv('LossBefore', loss_before)
            logger.logkv('LossAfter', loss_after)
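# The inner adaptation above relies on MAMLAlgo._adapt_sym, which is defined elsewhere in the
# codebase and not shown here. The following is a minimal, hypothetical sketch of the single
# SGD-style update such a step typically computes (theta' = theta - alpha * grad), assuming
# `params` and `step_sizes` are OrderedDicts keyed by parameter name; the real implementation
# may differ in naming and details.
from collections import OrderedDict
import tensorflow as tf

def adapt_sym_sketch(surr_obj, params, step_sizes):
    # gradients of the inner surrogate loss w.r.t. each policy parameter
    grads = tf.gradients(surr_obj, list(params.values()))
    gradients = OrderedDict(zip(params.keys(), grads))
    # one gradient-descent step per parameter with its (possibly trainable) step size
    adapted_params = OrderedDict(
        (key, params[key] - step_sizes[key] * gradients[key])
        for key in params)
    return adapted_params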
class VPG(Algo):
    """
    Algorithm for VPG

    Args:
        policy (Policy): policy object
        name (str): tf variable scope
        learning_rate (float): learning rate for the objective
        inner_type (str): optimization objective - either log_likelihood or likelihood_ratio
    """

    def __init__(self,
                 *args,
                 name="vpg",
                 learning_rate=1e-3,
                 inner_type='likelihood_ratio',
                 **kwargs):
        super(VPG, self).__init__(*args, **kwargs)

        assert inner_type in ["log_likelihood", "likelihood_ratio"]
        self.inner_type = inner_type
        self.recurrent = getattr(self.policy, 'recurrent', False)
        if self.recurrent:
            self.optimizer = RL2FirstOrderOptimizer(learning_rate=learning_rate)
        else:
            self.optimizer = MAMLFirstOrderOptimizer(learning_rate=learning_rate)
        self._optimization_keys = [
            'observations', 'actions', 'advantages', 'agent_infos'
        ]
        self.name = name

        self.build_graph()

    def build_graph(self):
        """
        Creates the computation graph
        """

        """ ----- Build graph for the meta-update ----- """
        self.meta_op_phs_dict = OrderedDict()
        obs_ph, action_ph, adv_ph, dist_info_old_ph, all_phs_dict = self._make_input_placeholders(
            'train', recurrent=self.recurrent)
        self.meta_op_phs_dict.update(all_phs_dict)

        # dist_info_vars_for_next_step
        if self.recurrent:
            distribution_info_vars, hidden_ph, next_hidden_var = self.policy.distribution_info_sym(obs_ph)
        else:
            distribution_info_vars = self.policy.distribution_info_sym(obs_ph)
            hidden_ph, next_hidden_var = None, None

        """ Outer objective """
        if self.inner_type == 'log_likelihood':
            log_likelihood = self.policy.distribution.log_likelihood_sym(action_ph, distribution_info_vars)
            surr_obj = -tf.reduce_mean(log_likelihood * adv_ph)
        elif self.inner_type == 'likelihood_ratio':
            likelihood_ratio_adapt = self.policy.distribution.likelihood_ratio_sym(
                action_ph, dist_info_old_ph, distribution_info_vars)
            surr_obj = -tf.reduce_mean(likelihood_ratio_adapt * adv_ph)
        else:
            raise NotImplementedError

        self.optimizer.build_graph(
            loss=surr_obj,
            target=self.policy,
            input_ph_dict=self.meta_op_phs_dict,
            hidden_ph=hidden_ph,
            next_hidden_var=next_hidden_var
        )

    def optimize_policy(self, samples_data, log=True):
        """
        Performs the policy gradient step

        Args:
            samples_data (list) : list of sample dicts
            log (bool) : whether to log statistics

        Returns:
            None
        """
        input_dict = self._extract_input_dict(samples_data, self._optimization_keys, prefix='train')

        if log:
            logger.log("Optimizing")
        loss_before = self.optimizer.optimize(input_val_dict=input_dict)

        if log:
            logger.log("Computing statistics")
        loss_after = self.optimizer.loss(input_val_dict=input_dict)

        if log:
            logger.logkv('LossBefore', loss_before)
            logger.logkv('LossAfter', loss_after)
class DICEMAML(MAMLAlgo):
    """
    Algorithm for DICE VPG MAML

    Args:
        max_path_length (int): maximum path length
        policy (Policy) : policy object
        name (str): tf variable scope
        learning_rate (float): learning rate for the meta-objective
        second_order_baseline (boolean): whether to use a second order baseline (Foerster et al.)
        inner_lr (float) : gradient step size used for inner step
        meta_batch_size (int): number of meta-learning tasks
        num_inner_grad_steps (int) : number of gradient updates taken per maml iteration
        trainable_inner_step_size (boolean): whether make the inner step size a trainable variable
    """

    def __init__(self,
                 max_path_length,
                 *args,
                 name="dice_maml",
                 learning_rate=1e-3,
                 second_order_baseline=False,
                 **kwargs):
        super(DICEMAML, self).__init__(*args, **kwargs)

        self.optimizer = MAMLFirstOrderOptimizer(learning_rate=learning_rate)
        self.max_path_length = max_path_length
        self._optimization_keys = [
            'observations', 'actions', 'rewards', 'baselines', 'mask', 'agent_infos'
        ]
        self.name = name
        self.second_order_baseline = second_order_baseline

        self.build_graph()

    def _adapt_objective_sym(self, action_stacked_sym, reward_sym, baseline_sym, mask_sym, dist_info_stacked_sym):
        with tf.variable_scope("log_likelihood"):
            log_likelihood_adapt = self.policy.distribution.log_likelihood_sym(
                action_stacked_sym, dist_info_stacked_sym)
            log_likelihood_adapt = tf.reshape(log_likelihood_adapt, tf.shape(mask_sym))
        with tf.variable_scope("dice_loss"):
            dice_loss = -tf.reduce_mean(
                magic_box_cumsum(log_likelihood_adapt) * reward_sym * mask_sym)
            baseline_loss = -(1 - tf.reduce_mean(
                magic_box(log_likelihood_adapt) * baseline_sym * mask_sym))
            if self.second_order_baseline:
                second_order_bl_loss = (1 - magic_box(log_likelihood_adapt))[:, 1:] * \
                                       (1 - magic_box_cumsum(log_likelihood_adapt))[:, :-1] * \
                                       baseline_sym[:, 1:] * mask_sym[:, 1:]
                second_order_bl_loss = tf.reduce_mean(second_order_bl_loss)
            else:
                second_order_bl_loss = 0

        return dice_loss + baseline_loss + second_order_bl_loss

    def _build_inner_adaption(self):
        """
        Creates the (DICE) symbolic graph for the one-step inner gradient update (It'll be called
        several times if more gradient steps are needed)

        The required placeholders are created internally via _make_dice_input_placeholders.

        Returns:
            adapted_policies_params (list): list of OrderedDicts containing the symbolic post-update parameters
            adapt_input_ph_dict (OrderedDict): dict of placeholders used for the adaptation step
        """
        obs_phs, action_phs, reward_phs, baseline_phs, mask_phs, dist_info_old_phs, adapt_input_ph_dict = \
            self._make_dice_input_placeholders('adapt')

        adapted_policies_params = []

        for i in range(self.meta_batch_size):
            with tf.variable_scope("adapt_task_%i" % i):
                with tf.variable_scope("adapt_objective"):
                    obs_stacked = self._reshape_obs_phs(obs_phs[i])
                    action_stacked = self._reshape_action_phs(action_phs[i])
                    distribution_info_stacked = self.policy.distribution_info_sym(
                        obs_stacked, params=self.policy.policies_params_phs[i])

                    # inner surrogate objective
                    adapt_loss = self._adapt_objective_sym(
                        action_stacked, reward_phs[i], baseline_phs[i], mask_phs[i], distribution_info_stacked)

                # get tf operation for adapted (post-update) policy
                with tf.variable_scope("adapt_step"):
                    adapted_policy_param = self._adapt_sym(adapt_loss, self.policy.policies_params_phs[i])
                adapted_policies_params.append(adapted_policy_param)

        return adapted_policies_params, adapt_input_ph_dict

    def build_graph(self):
        """
        Creates the computation graph for DICE MAML

        Notes:
            Pseudocode:
            for task in meta_batch_size:
                make_vars
                init_init_dist_sym
            for step in num_inner_grad_steps:
                for task in meta_batch_size:
                    make_vars
                    update_init_dist_sym
            set objectives for optimizer
        """

        """ Build graph for sampling """
        with tf.variable_scope(self.name + '_sampling'):
            self.step_sizes = self._create_step_size_vars()

            """ --- Build inner update graph for adapting the policy and sampling trajectories --- """
            # this graph is only used for adapting the policy and not computing the meta-updates
            self.adapted_policies_params, self.adapt_input_ph_dict = self._build_inner_adaption()

        """ Build graph for meta-update """
        meta_update_scope = tf.variable_scope(self.name + '_meta_update')

        with meta_update_scope:
            obs_phs, action_phs, reward_phs, baseline_phs, mask_phs, dist_info_old_phs, all_phs_dict = \
                self._make_dice_input_placeholders('step0')
            self.meta_op_phs_dict = OrderedDict(all_phs_dict)

            distribution_info_vars, current_policy_params, all_surr_objs = [], [], []

            for i in range(self.meta_batch_size):
                obs_stacked = self._reshape_obs_phs(obs_phs[i])
                dist_info_sym = self.policy.distribution_info_sym(obs_stacked, params=None)
                distribution_info_vars.append(dist_info_sym)  # step 0
                current_policy_params.append(self.policy.policy_params)  # set to real policy_params (tf.Variable)

        with meta_update_scope:
            """ Inner updates"""
            for step_id in range(1, self.num_inner_grad_steps + 1):
                with tf.variable_scope("inner_update_%i" % step_id):
                    surr_objs, adapted_policy_params = [], []

                    # inner adaptation step for each task
                    for i in range(self.meta_batch_size):
                        action_stacked = self._reshape_action_phs(action_phs[i])
                        surr_loss = self._adapt_objective_sym(
                            action_stacked, reward_phs[i], baseline_phs[i], mask_phs[i], distribution_info_vars[i])

                        adapted_params_var = self._adapt_sym(surr_loss, current_policy_params[i])

                        adapted_policy_params.append(adapted_params_var)
                        surr_objs.append(surr_loss)

                    all_surr_objs.append(surr_objs)

                    # Create new placeholders for the next step
                    obs_phs, action_phs, reward_phs, baseline_phs, mask_phs, dist_info_old_phs, all_phs_dict = \
                        self._make_dice_input_placeholders('step%i' % step_id)
                    self.meta_op_phs_dict.update(all_phs_dict)

                    # dist_info_vars_for_next_step
                    distribution_info_vars = []
                    for i in range(self.meta_batch_size):
                        obs_stacked = self._reshape_obs_phs(obs_phs[i])
                        distribution_info_vars.append(
                            self.policy.distribution_info_sym(obs_stacked, params=adapted_policy_params[i]))

                    current_policy_params = adapted_policy_params

            """ Outer (meta-)objective """
            with tf.variable_scope("outer_update"):
                surr_objs = []

                # meta-objective
                for i in range(self.meta_batch_size):
                    action_stacked = self._reshape_action_phs(action_phs[i])
                    surr_obj = self._adapt_objective_sym(
                        action_stacked, reward_phs[i], baseline_phs[i], mask_phs[i], distribution_info_vars[i])
                    surr_objs.append(surr_obj)

                """ Mean over meta tasks """
                meta_objective = tf.reduce_mean(tf.stack(surr_objs, 0))

                self.optimizer.build_graph(
                    loss=meta_objective,
                    target=self.policy,
                    input_ph_dict=self.meta_op_phs_dict,
                )

    def optimize_policy(self, all_samples_data, log=True):
        """
        Performs MAML outer step

        Args:
            all_samples_data (list) : list of lists of lists of samples (each is a dict) split by
                gradient update and meta task
            log (bool) : whether to log statistics

        Returns:
            None
        """
        meta_op_input_dict = self._extract_input_dict_meta_op(all_samples_data, self._optimization_keys)

        if log:
            logger.log("Optimizing")
        loss_before = self.optimizer.optimize(input_val_dict=meta_op_input_dict)

        if log:
            logger.log("Computing statistics")
        loss_after = self.optimizer.loss(input_val_dict=meta_op_input_dict)

        if log:
            logger.logkv('LossBefore', loss_before)
            logger.logkv('LossAfter', loss_after)

    def _make_dice_input_placeholders(self, prefix=''):
        """
        In contrast to _make_input_placeholders, each placeholder has one additional (time) dimension
        of size self.max_path_length

        Args:
            prefix (str) : a string to prepend to the name of each variable

        Returns:
            (tuple) : a tuple containing lists of placeholders for each input type and meta task,
            and for convenience, an OrderedDict containing all placeholders created
        """
        obs_phs, action_phs, reward_phs, baseline_phs, mask_phs, dist_info_phs = [], [], [], [], [], []
        dist_info_specs = self.policy.distribution.dist_info_specs

        all_phs_dict = OrderedDict()

        for task_id in range(self.meta_batch_size):
            # observation ph
            ph = tf.placeholder(
                dtype=tf.float32,
                shape=[None, self.max_path_length, self.policy.obs_dim],
                name='obs' + '_' + prefix + '_' + str(task_id))
            all_phs_dict['%s_task%i_%s' % (prefix, task_id, 'observations')] = ph
            obs_phs.append(ph)

            # action ph
            ph = tf.placeholder(
                dtype=tf.float32,
                shape=[None, self.max_path_length, self.policy.action_dim],
                name='action' + '_' + prefix + '_' + str(task_id))
            all_phs_dict['%s_task%i_%s' % (prefix, task_id, 'actions')] = ph
            action_phs.append(ph)

            # reward ph
            ph = tf.placeholder(
                dtype=tf.float32,
                shape=[None, self.max_path_length],
                name='rewards' + '_' + prefix + '_' + str(task_id))
            all_phs_dict['%s_task%i_%s' % (prefix, task_id, 'rewards')] = ph
            reward_phs.append(ph)

            # baseline ph
            ph = tf.placeholder(
                dtype=tf.float32,
                shape=[None, self.max_path_length],
                name='baselines' + '_' + prefix + '_' + str(task_id))
            all_phs_dict['%s_task%i_%s' % (prefix, task_id, 'baselines')] = ph
            baseline_phs.append(ph)

            # mask ph
            ph = tf.placeholder(
                dtype=tf.float32,
                shape=[None, self.max_path_length],
                name='mask' + '_' + prefix + '_' + str(task_id))
            all_phs_dict['%s_task%i_%s' % (prefix, task_id, 'mask')] = ph
            mask_phs.append(ph)

            # distribution / agent info
            dist_info_ph_dict = {}
            for info_key, shape in dist_info_specs:
                ph = tf.placeholder(
                    dtype=tf.float32,
                    shape=[None, self.max_path_length] + list(shape),
                    name='%s_%s_%i' % (info_key, prefix, task_id))
                all_phs_dict['%s_task%i_agent_infos/%s' % (prefix, task_id, info_key)] = ph
                dist_info_ph_dict[info_key] = ph
            dist_info_phs.append(dist_info_ph_dict)

        return obs_phs, action_phs, reward_phs, baseline_phs, mask_phs, dist_info_phs, all_phs_dict

    def _reshape_obs_phs(self, obs_sym):
        # reshape from 3-D tensor of shape (num_paths, max_path_length, ndim_obs)
        # to (num_paths * max_path_length, ndim_obs)
        return tf.reshape(obs_sym, [-1, self.policy.obs_dim])

    def _reshape_action_phs(self, action_sym):
        # reshape from 3-D tensor of shape (num_paths, max_path_length, ndim_act)
        # to (num_paths * max_path_length, ndim_act)
        return tf.reshape(action_sym, [-1, self.policy.action_dim])
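# The DICE losses above depend on magic_box and magic_box_cumsum, which are imported from elsewhere
# in the codebase and not shown here. For orientation, this is a minimal sketch of the DiCE
# magic-box operator of Foerster et al. (2018); the exact signatures and the time axis used by
# magic_box_cumsum in this repo are assumptions.
import tensorflow as tf

def magic_box(logprobs):
    # DiCE magic box: evaluates to 1 in the forward pass, while its gradient equals the gradient
    # of exp(logprobs), so score-function (REINFORCE) terms appear automatically under tf.gradients.
    return tf.exp(logprobs - tf.stop_gradient(logprobs))

def magic_box_cumsum(logprobs, axis=-1):
    # Apply the magic box to the cumulative sum of per-step log-likelihoods along the time
    # dimension, so the reward at time t depends on all actions taken up to and including t.
    return magic_box(tf.cumsum(logprobs, axis=axis))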
def testGauss(self):
    for optimizer in [MAMLFirstOrderOptimizer()]:
        tf.reset_default_graph()
        with tf.Session():
            input_phs = tf.placeholder(dtype=tf.float32, shape=[None, 100])
            target_mean_ph = tf.placeholder(dtype=tf.float32, shape=[None, 1])
            target_std_ph = tf.placeholder(dtype=tf.float32, shape=[None, 1])
            mean_network = Mlp(input_phs, 1, hidden_size=(8, 8), name='mean')
            std_network = Mlp(input_phs, 1, hidden_size=(8, 8), name='std')

            target_std = tf.exp(target_std_ph)
            pred_std = tf.exp(std_network.output)
            numerator = tf.square(target_mean_ph - mean_network.output) + \
                tf.square(target_std) - tf.square(pred_std)
            denominator = 2 * tf.square(pred_std) + 1e-8
            loss = tf.reduce_mean(
                tf.reduce_sum(numerator / denominator + std_network.output - target_std_ph, axis=-1))

            joined_network = CombinedMlp([mean_network, std_network])

            input_ph_dict = OrderedDict({
                'x': input_phs,
                'y_mean': target_mean_ph,
                'y_std': target_std_ph
            })

            optimizer.build_graph(loss, joined_network, input_ph_dict)

            sess = tf.get_default_session()
            sess.run(tf.global_variables_initializer())

            for i in range(2000):
                means = np.random.random(size=(1000,))
                stds = np.random.random(size=(1000,))
                inputs = np.vstack([
                    np.random.normal(mean, np.exp(std), 100)
                    for mean, std in zip(means, stds)
                ])
                all_inputs = {
                    'x': inputs,
                    'y_mean': means.reshape(-1, 1),
                    'y_std': stds.reshape(-1, 1)
                }
                optimizer.optimize(all_inputs)
                if i % 100 == 0:
                    print(optimizer.loss(all_inputs))

            means = np.random.random(size=(20,))
            stds = np.random.random(size=(20,))
            inputs = np.stack([
                np.random.normal(mean, np.exp(std), 100)
                for mean, std in zip(means, stds)
            ], axis=0)
            values_dict = OrderedDict({
                'x': inputs,
                'y_mean': means.reshape(-1, 1),
                'y_std': stds.reshape(-1, 1)
            })
            mean_pred, std_pred = sess.run(
                joined_network.output,
                feed_dict=dict(list(zip(input_ph_dict.values(), values_dict.values()))))

            # flatten the (20, 1) predictions so the comparison with the (20,) targets is
            # elementwise rather than broadcast to a (20, 20) matrix
            self.assertTrue(np.mean(np.square(mean_pred.flatten() - means)) < 0.2)
            self.assertTrue(np.mean(np.square(std_pred.flatten() - stds)) < 0.2)