def optimize_policy(self, itr, samples_data):
    all_input_values = tuple(
        ext.extract(samples_data, "observations", "actions", "advantages"))
    agent_infos = samples_data["agent_infos"]
    state_info_list = [agent_infos[k] for k in self.policy.state_info_keys]
    dist_info_list = [
        agent_infos[k] for k in self.policy.distribution.dist_info_keys
    ]
    all_input_values += tuple(state_info_list) + tuple(dist_info_list)
    if self.policy.recurrent:
        all_input_values += (samples_data["valids"], )

    logger.log("Computing loss before")
    loss_before = self.optimizer.loss(all_input_values)
    logger.log("Computing KL before")
    mean_kl_before = self.optimizer.constraint_val(all_input_values)
    logger.log("Optimizing")
    self.optimizer.optimize(all_input_values)
    logger.log("Computing KL after")
    mean_kl = self.optimizer.constraint_val(all_input_values)
    logger.log("Computing loss after")
    loss_after = self.optimizer.loss(all_input_values)

    logger.record_tabular('LossBefore', loss_before)
    logger.record_tabular('LossAfter', loss_after)
    logger.record_tabular('MeanKLBefore', mean_kl_before)
    logger.record_tabular('MeanKL', mean_kl)
    logger.record_tabular('dLoss', loss_before - loss_after)
    return dict()
def optimize_policy(self, itr, all_samples_data):
    # we collected the rollouts to compute the grads and then the test!
    assert len(all_samples_data) == self.num_grad_updates + 1
    if not self.use_maml:
        all_samples_data = [all_samples_data[0]]

    input_list = []
    for step in range(len(all_samples_data)):  # these are the gradient steps
        obs_list, action_list, adv_list = [], [], []
        for i in range(self.meta_batch_size):
            inputs = ext.extract(all_samples_data[step][i],
                                 "observations", "actions", "advantages")
            obs_list.append(inputs[0])
            action_list.append(inputs[1])
            adv_list.append(inputs[2])
        input_list += obs_list + action_list + adv_list  # [ [obs_0], [act_0], [adv_0], [obs_1], ... ]

        if step == 0:  ##CF not used?
            init_inputs = input_list

    if self.use_maml:
        dist_info_list = []
        for i in range(self.meta_batch_size):
            agent_infos = all_samples_data[self.kl_constrain_step][i]['agent_infos']
            dist_info_list += [
                agent_infos[k] for k in self.policy.distribution.dist_info_keys
            ]
        input_list += tuple(dist_info_list)
        logger.log("Computing KL before")
        mean_kl_before = self.optimizer.constraint_val(input_list)

    logger.log("Computing loss before")
    loss_before = self.optimizer.loss(input_list)
    logger.log("Optimizing")
    self.optimizer.optimize(input_list)
    logger.log("Computing loss after")
    loss_after = self.optimizer.loss(input_list)

    if self.use_maml:
        logger.log("Computing KL after")
        mean_kl = self.optimizer.constraint_val(input_list)
        logger.record_tabular('MeanKLBefore', mean_kl_before)  # this now won't be 0!
        logger.record_tabular('MeanKL', mean_kl)
    logger.record_tabular('LossBefore', loss_before)
    logger.record_tabular('LossAfter', loss_after)
    logger.record_tabular('dLoss', loss_before - loss_after)
    return dict()
def obtain_samples(self, itr, reset_args=None, return_dict=False, log_prefix=''):
    init_policy_params = cur_policy_params = self.algo.policy.get_param_values()
    if hasattr(self.algo.env, "get_param_values"):
        try:
            cur_env_params = self.algo.env.get_param_values()
        except Exception:
            cur_env_params = None
    else:
        cur_env_params = None

    import time
    start = time.time()
    if type(reset_args) != list and type(reset_args) != np.ndarray:
        reset_args = [reset_args] * self.n_envs
    if self.algo.policy.all_param_vals:
        cur_policy_params = [
            flatten_tensors(x.values()) for x in self.algo.policy.all_param_vals
        ]
    else:
        cur_policy_params = [cur_policy_params] * self.n_envs

    # do tasks sequentially and parallelize within rollouts per task.
    paths = {}
    for i in range(self.n_envs):
        paths[i] = parallel_sampler.sample_paths(
            policy_params=cur_policy_params[i],
            env_params=cur_env_params,
            max_samples=self.algo.batch_size / self.n_envs,
            max_path_length=self.algo.max_path_length,
            scope=self.algo.scope,
            reset_arg=reset_args[i],
            show_prog_bar=False,
        )
    total_time = time.time() - start
    logger.record_tabular(log_prefix + "TotalExecTime", total_time)

    if not return_dict:
        flatten_list = lambda l: [item for sublist in l for item in sublist]
        paths = flatten_list(paths.values())

    self.algo.policy.set_param_values(init_policy_params)

    # currently don't support partial paths (if desired, add code to truncate paths)
    assert self.algo.whole_paths

    return paths
def log_diagnostics(self, paths, prefix=''): progs = [ path["observations"][-1][-4] - path["observations"][0][-4] for path in paths ] logger.record_tabular(prefix+'AverageForwardProgress', np.mean(progs)) logger.record_tabular(prefix+'MaxForwardProgress', np.max(progs)) logger.record_tabular(prefix+'MinForwardProgress', np.min(progs)) logger.record_tabular(prefix+'StdForwardProgress', np.std(progs))
def fit(self, xs, ys, log=True):
    if self._subsample_factor < 1:
        num_samples_tot = xs.shape[0]
        idx = np.random.randint(
            0, num_samples_tot, int(num_samples_tot * self._subsample_factor))
        xs, ys = xs[idx], ys[idx]

    if self._normalize_inputs:
        # recompute normalizing constants for inputs
        self._x_mean_var.set_value(
            np.mean(xs, axis=0, keepdims=True).astype(theano.config.floatX))
        self._x_std_var.set_value(
            (np.std(xs, axis=0, keepdims=True) + 1e-8).astype(theano.config.floatX))
    if self._normalize_outputs:
        # recompute normalizing constants for outputs
        self._y_mean_var.set_value(
            np.mean(ys, axis=0, keepdims=True).astype(theano.config.floatX))
        self._y_std_var.set_value(
            (np.std(ys, axis=0, keepdims=True) + 1e-8).astype(theano.config.floatX))

    if self._name:
        prefix = self._name + "_"
    else:
        prefix = ""

    # FIXME: needs batch computation to avoid OOM.
    loss_before, loss_after, mean_kl, batch_count = 0., 0., 0., 0
    for batch in iterate_minibatches_generic(
            input_lst=[xs, ys], batchsize=self._batchsize, shuffle=True):
        batch_count += 1
        xs_batch, ys_batch = batch
        if self._use_trust_region:
            old_means, old_log_stds = self._f_pdists(xs_batch)
            inputs = [xs_batch, ys_batch, old_means, old_log_stds]
        else:
            inputs = [xs_batch, ys_batch]
        loss_before += self._optimizer.loss(inputs)
        self._optimizer.optimize(inputs)
        loss_after += self._optimizer.loss(inputs)
        if self._use_trust_region:
            mean_kl += self._optimizer.constraint_val(inputs)

    if log:
        logger.record_tabular(prefix + 'LossBefore', loss_before / batch_count)
        logger.record_tabular(prefix + 'LossAfter', loss_after / batch_count)
        logger.record_tabular(prefix + 'dLoss', (loss_before - loss_after) / batch_count)
        if self._use_trust_region:
            logger.record_tabular(prefix + 'MeanKL', mean_kl / batch_count)
def fit(self, xs, ys):
    if self.normalize_inputs:
        # recompute normalizing constants for inputs
        new_mean = np.mean(xs, axis=0, keepdims=True)
        new_std = np.std(xs, axis=0, keepdims=True) + 1e-8
        tf.get_default_session().run(
            tf.group(
                tf.assign(self.x_mean_var, new_mean),
                tf.assign(self.x_std_var, new_std),
            ))
    if self.use_trust_region and self.first_optimized:
        old_prob = self.f_prob(xs)
        inputs = [xs, ys, old_prob]
        optimizer = self.tr_optimizer
    else:
        inputs = [xs, ys]
        optimizer = self.optimizer
    loss_before = optimizer.loss(inputs)
    if self.name:
        prefix = self.name + "_"
    else:
        prefix = ""
    logger.record_tabular(prefix + 'LossBefore', loss_before)
    optimizer.optimize(inputs)
    loss_after = optimizer.loss(inputs)
    logger.record_tabular(prefix + 'LossAfter', loss_after)
    logger.record_tabular(prefix + 'dLoss', loss_before - loss_after)
    self.first_optimized = True
def log_diagnostics(self, paths, prefix=''): progs = [ path["observations"][-1][-3] - path["observations"][0][-3] for path in paths ] #if np.mean(progs) > 4.5: # import pdb; pdb.set_trace() #path = paths[0] #t = -10 #lb, ub = self.action_bounds #scaling = (ub - lb) * 0.5 #rew = path['rewards'][t] #act = path['actions'][t] #ctrl_cost = 0.5*self.ctrl_cost_coeff*np.sum(np.square(act/scaling)) logger.record_tabular('AverageForwardProgress', np.mean(progs)) logger.record_tabular('MaxForwardProgress', np.max(progs)) logger.record_tabular('MinForwardProgress', np.min(progs)) logger.record_tabular('StdForwardProgress', np.std(progs))
def fit(self, xs, ys):
    if self._subsample_factor < 1:
        num_samples_tot = xs.shape[0]
        idx = np.random.randint(
            0, num_samples_tot, int(num_samples_tot * self._subsample_factor))
        xs, ys = xs[idx], ys[idx]

    sess = tf.get_default_session()
    if self._normalize_inputs:
        # recompute normalizing constants for inputs
        sess.run([
            tf.assign(self._x_mean_var, np.mean(xs, axis=0, keepdims=True)),
            tf.assign(self._x_std_var, np.std(xs, axis=0, keepdims=True) + 1e-8),
        ])
    if self._normalize_outputs:
        # recompute normalizing constants for outputs
        sess.run([
            tf.assign(self._y_mean_var, np.mean(ys, axis=0, keepdims=True)),
            tf.assign(self._y_std_var, np.std(ys, axis=0, keepdims=True) + 1e-8),
        ])
    if self._use_trust_region:
        old_means, old_log_stds = self._f_pdists(xs)
        inputs = [xs, ys, old_means, old_log_stds]
    else:
        inputs = [xs, ys]
    loss_before = self._optimizer.loss(inputs)
    if self._name:
        prefix = self._name + "_"
    else:
        prefix = ""
    logger.record_tabular(prefix + 'LossBefore', loss_before)
    self._optimizer.optimize(inputs)
    loss_after = self._optimizer.loss(inputs)
    logger.record_tabular(prefix + 'LossAfter', loss_after)
    if self._use_trust_region:
        logger.record_tabular(prefix + 'MeanKL',
                              self._optimizer.constraint_val(inputs))
    logger.record_tabular(prefix + 'dLoss', loss_before - loss_after)
def optimize_policy(self, itr, samples_data):
    logger.log("optimizing policy")
    inputs = ext.extract(samples_data, "observations", "actions", "advantages")
    agent_infos = samples_data["agent_infos"]
    state_info_list = [agent_infos[k] for k in self.policy.state_info_keys]
    inputs += tuple(state_info_list)
    if self.policy.recurrent:
        inputs += (samples_data["valids"], )
    dist_info_list = [
        agent_infos[k] for k in self.policy.distribution.dist_info_keys
    ]
    loss_before = self.optimizer.loss(inputs)
    self.optimizer.optimize(inputs)
    loss_after = self.optimizer.loss(inputs)
    logger.record_tabular("LossBefore", loss_before)
    logger.record_tabular("LossAfter", loss_after)

    mean_kl, max_kl = self.opt_info['f_kl'](*(list(inputs) + dist_info_list))
    logger.record_tabular('MeanKL', mean_kl)
    logger.record_tabular('MaxKL', max_kl)
def optimize_policy(self, itr, all_samples_data):
    logger.log("optimizing policy")
    assert len(all_samples_data) == self.num_grad_updates + 1
    if not self.use_maml:
        all_samples_data = [all_samples_data[0]]

    input_list = []
    for step in range(len(all_samples_data)):
        obs_list, action_list, adv_list = [], [], []
        for i in range(self.meta_batch_size):
            inputs = ext.extract(all_samples_data[step][i],
                                 "observations", "actions", "advantages")
            obs_list.append(inputs[0])
            action_list.append(inputs[1])
            adv_list.append(inputs[2])
        input_list += obs_list + action_list + adv_list

        if step == 0:
            init_inputs = input_list

    loss_before = self.optimizer.loss(input_list)
    self.optimizer.optimize(input_list)
    loss_after = self.optimizer.loss(input_list)
    logger.record_tabular("LossBefore", loss_before)
    logger.record_tabular("LossAfter", loss_after)

    dist_info_list = []
    for i in range(self.meta_batch_size):
        agent_infos = all_samples_data[-1][i]['agent_infos']
        dist_info_list += [
            agent_infos[k] for k in self.policy.distribution.dist_info_keys
        ]
    if self.use_maml:
        mean_kl, max_kl = self.opt_info['f_kl'](*(list(input_list) + dist_info_list))
        logger.record_tabular('MeanKL', mean_kl)
        logger.record_tabular('MaxKL', max_kl)
def fit(self, xs, ys):
    if self._normalize_inputs:
        # recompute normalizing constants for inputs
        self._x_mean_var.set_value(np.mean(xs, axis=0, keepdims=True))
        self._x_std_var.set_value(np.std(xs, axis=0, keepdims=True) + 1e-8)
    if self._use_trust_region:
        old_prob = self._f_prob(xs)
        inputs = [xs, ys, old_prob]
    else:
        inputs = [xs, ys]
    loss_before = self._optimizer.loss(inputs)
    if self._name:
        prefix = self._name + "_"
    else:
        prefix = ""
    logger.record_tabular(prefix + 'LossBefore', loss_before)
    self._optimizer.optimize(inputs)
    loss_after = self._optimizer.loss(inputs)
    logger.record_tabular(prefix + 'LossAfter', loss_after)
    logger.record_tabular(prefix + 'dLoss', loss_before - loss_after)
def compute_updated_dists(self, samples):
    """
    Compute fast gradients once per iteration and pull them out of tensorflow
    for sampling with the post-update policy.
    """
    start = time.time()
    num_tasks = len(samples)
    param_keys = self.all_params.keys()
    update_param_keys = param_keys
    no_update_param_keys = []

    sess = tf.get_default_session()

    obs_list, action_list, adv_list = [], [], []
    for i in range(num_tasks):
        inputs = ext.extract(samples[i], 'observations', 'actions', 'advantages')
        obs_list.append(inputs[0])
        action_list.append(inputs[1])
        adv_list.append(inputs[2])
    inputs = obs_list + action_list + adv_list

    # To do a second update, replace self.all_params below with the params
    # that were used to collect the policy.
    init_param_values = None
    if self.all_param_vals is not None:  # skip this in first iteration
        init_param_values = self.get_variable_values(self.all_params)

    step_size = self.step_size
    for i in range(num_tasks):
        if self.all_param_vals is not None:  # skip this in first iteration
            self.assign_params(self.all_params, self.all_param_vals[i])

    if 'all_fast_params_tensor' not in dir(self):  # only enter if first iteration
        # make computation graph once
        self.all_fast_params_tensor = []
        for i in range(num_tasks):
            # compute gradients for a current task (symbolic)
            gradients = dict(
                zip(update_param_keys,
                    tf.gradients(self.surr_objs[i],
                                 [self.all_params[key] for key in update_param_keys])))
            # gradient update for params of current task (symbolic)
            fast_params_tensor = OrderedDict(
                zip(update_param_keys, [
                    self.all_params[key] - step_size * gradients[key]
                    for key in update_param_keys
                ]))
            # undo gradient update for no_update_params (symbolic)
            for k in no_update_param_keys:
                fast_params_tensor[k] = self.all_params[k]
            # tensors that represent the updated params for all of the tasks (symbolic)
            self.all_fast_params_tensor.append(fast_params_tensor)

    # pull new param vals out of tensorflow, so gradient computation is only done once
    # (first is the vars, second the values); these are the updated values of the
    # params after the gradient step
    self.all_param_vals = sess.run(
        self.all_fast_params_tensor,
        feed_dict=dict(list(zip(self.input_list_for_grad, inputs))))

    # reset parameters to original ones
    if init_param_values is not None:  # skip this in first iteration
        self.assign_params(self.all_params, init_param_values)

    # compile the _cur_f_dist with updated params
    outputs = []
    inputs = tf.split(self.input_tensor, num_tasks, 0)
    for i in range(num_tasks):
        # TODO - use a placeholder to feed in the params, so that we don't have to recompile every time.
        task_inp = inputs[i]
        info, _ = self.dist_info_sym(task_inp, dict(),
                                     all_params=self.all_param_vals[i],
                                     is_training=False)
        outputs.append([info['mean'], info['log_std']])

    self._cur_f_dist = tensor_utils.compile_function(
        inputs=[self.input_tensor],
        outputs=outputs,
    )
    total_time = time.time() - start
    logger.record_tabular("ComputeUpdatedDistTime", total_time)
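# A minimal, hypothetical illustration (not part of this codebase) of the fast-adaptation
# step that compute_updated_dists performs symbolically above: for each task i, the
# adapted parameters are theta_i' = theta - step_size * grad_i(theta). Names below
# (fast_update, theta, grads) are assumptions for the sketch, written in plain numpy.
import numpy as np


def fast_update(theta, grads_per_task, step_size):
    """Return one adapted parameter dict per task: theta - step_size * grad."""
    return [
        {k: theta[k] - step_size * g[k] for k in theta}
        for g in grads_per_task
    ]


theta = {"W": np.zeros((3, 2)), "b": np.zeros(2)}
grads = [{"W": np.ones((3, 2)), "b": np.ones(2)} for _ in range(4)]  # 4 tasks
adapted = fast_update(theta, grads, step_size=0.1)  # one parameter set per task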
def train(self):
    # TODO - make this a util
    flatten_list = lambda l: [item for sublist in l for item in sublist]

    with tf.Session() as sess:
        # Code for loading a previous policy. Somewhat hacky because it needs to be in the session.
        if self.load_policy is not None:
            import joblib
            self.policy = joblib.load(self.load_policy)['policy']
        self.init_opt()
        # initialize uninitialized vars (only initialize vars that were not loaded)
        uninit_vars = []
        for var in tf.global_variables():
            # note - this is hacky; there may be a better way to do this in newer TF.
            try:
                sess.run(var)
            except tf.errors.FailedPreconditionError:
                uninit_vars.append(var)
        sess.run(tf.variables_initializer(uninit_vars))

        self.start_worker()
        start_time = time.time()
        for itr in range(self.start_itr, self.n_itr):
            itr_start_time = time.time()
            with logger.prefix('itr #%d | ' % itr):
                logger.log("Sampling set of tasks/goals for this meta-batch...")

                # sample environment configuration
                env = self.env
                while not ('sample_env_params' in dir(env) or 'sample_goals' in dir(env)):
                    env = env._wrapped_env
                if 'sample_goals' in dir(env):
                    learner_env_params = env.sample_goals(self.meta_batch_size)
                elif 'sample_env_params' in dir(env):
                    learner_env_params = env.sample_env_params(self.meta_batch_size)

                self.policy.switch_to_init_dist()  # Switch to pre-update policy

                all_samples_data, all_paths = [], []
                for step in range(self.num_grad_updates + 1):
                    logger.log('** Step ' + str(step) + ' **')
                    logger.log("Obtaining samples...")
                    paths = self.obtain_samples(itr, reset_args=learner_env_params,
                                                log_prefix=str(step))
                    all_paths.append(paths)
                    logger.log("Processing samples...")
                    samples_data = {}
                    for key in paths.keys():  # the keys are the tasks
                        # don't log here; it would spam the console with every task
                        samples_data[key] = self.process_samples(itr, paths[key], log=False)
                    all_samples_data.append(samples_data)
                    # for logging purposes only
                    self.process_samples(itr, flatten_list(paths.values()),
                                         prefix=str(step), log=True)
                    logger.log("Logging diagnostics...")
                    self.log_diagnostics(flatten_list(paths.values()), prefix=str(step))
                    if step < self.num_grad_updates:
                        logger.log("Computing policy updates...")
                        self.policy.compute_updated_dists(samples_data)

                logger.log("Optimizing policy...")
                # This needs to take all samples_data so that it can construct
                # the graph for meta-optimization.
                self.optimize_policy(itr, all_samples_data)
                logger.log("Saving snapshot...")
                params = self.get_itr_snapshot(itr, all_samples_data[-1])
                if self.store_paths:
                    params["paths"] = all_samples_data[-1]["paths"]
                logger.save_itr_params(itr, params)
                logger.log("Saved")
                logger.record_tabular('Time', time.time() - start_time)
                logger.record_tabular('ItrTime', time.time() - itr_start_time)
                logger.dump_tabular(with_prefix=False)

                # The rest is example plotting code, useful for visualizing
                # trajectories across a few different tasks.
                if False and itr % 2 == 0 and self.env.observation_space.shape[0] <= 4:  # point-mass
                    logger.log("Saving visualization of paths")
                    for ind in range(min(5, self.meta_batch_size)):
                        plt.clf()
                        plt.plot(learner_env_params[ind][0], learner_env_params[ind][1],
                                 'k*', markersize=10)
                        plt.hold(True)

                        preupdate_paths = all_paths[0]
                        postupdate_paths = all_paths[-1]

                        pre_points = preupdate_paths[ind][0]['observations']
                        post_points = postupdate_paths[ind][0]['observations']
                        plt.plot(pre_points[:, 0], pre_points[:, 1], '-r', linewidth=2)
                        plt.plot(post_points[:, 0], post_points[:, 1], '-b', linewidth=1)

                        pre_points = preupdate_paths[ind][1]['observations']
                        post_points = postupdate_paths[ind][1]['observations']
                        plt.plot(pre_points[:, 0], pre_points[:, 1], '--r', linewidth=2)
                        plt.plot(post_points[:, 0], post_points[:, 1], '--b', linewidth=1)

                        pre_points = preupdate_paths[ind][2]['observations']
                        post_points = postupdate_paths[ind][2]['observations']
                        plt.plot(pre_points[:, 0], pre_points[:, 1], '-.r', linewidth=2)
                        plt.plot(post_points[:, 0], post_points[:, 1], '-.b', linewidth=1)

                        plt.plot(0, 0, 'k.', markersize=5)
                        plt.xlim([-0.8, 0.8])
                        plt.ylim([-0.8, 0.8])
                        plt.legend(['goal', 'preupdate path', 'postupdate path'])
                        plt.savefig(osp.join(logger.get_snapshot_dir(),
                                             'prepost_path' + str(ind) + '.png'))
                elif False and itr % 2 == 0:  # swimmer or cheetah
                    logger.log("Saving visualization of paths")
                    for ind in range(min(5, self.meta_batch_size)):
                        plt.clf()
                        goal_vel = learner_env_params[ind]
                        plt.title('Swimmer paths, goal vel=' + str(goal_vel))
                        plt.hold(True)
                        prepathobs = all_paths[0][ind][0]['observations']
                        postpathobs = all_paths[-1][ind][0]['observations']
                        plt.plot(prepathobs[:, 0], prepathobs[:, 1], '-r', linewidth=2)
                        plt.plot(postpathobs[:, 0], postpathobs[:, 1], '--b', linewidth=1)
                        plt.plot(prepathobs[-1, 0], prepathobs[-1, 1], 'r*', markersize=10)
                        plt.plot(postpathobs[-1, 0], postpathobs[-1, 1], 'b*', markersize=10)
                        plt.xlim([-1.0, 5.0])
                        plt.ylim([-1.0, 1.0])
                        plt.legend(['preupdate path', 'postupdate path'], loc=2)
                        plt.savefig(osp.join(logger.get_snapshot_dir(),
                                             'swim1d_prepost_itr' + str(itr) + '_id' + str(ind) + '.pdf'))
        self.shutdown_worker()
def evaluate(self, epoch, pool):
    logger.log("Collecting samples for evaluation")
    paths = parallel_sampler.sample_paths(
        policy_params=self.policy.get_param_values(),
        max_samples=self.eval_samples,
        max_path_length=self.max_path_length,
    )

    average_discounted_return = np.mean([
        special.discount_return(path["rewards"], self.discount) for path in paths
    ])

    returns = [sum(path["rewards"]) for path in paths]

    all_qs = np.concatenate(self.q_averages)
    all_ys = np.concatenate(self.y_averages)

    average_q_loss = np.mean(self.qf_loss_averages)
    average_policy_surr = np.mean(self.policy_surr_averages)
    average_action = np.mean(
        np.square(np.concatenate([path["actions"] for path in paths])))

    policy_reg_param_norm = np.linalg.norm(
        self.policy.get_param_values(regularizable=True))
    qfun_reg_param_norm = np.linalg.norm(
        self.qf.get_param_values(regularizable=True))

    logger.record_tabular('Epoch', epoch)
    logger.record_tabular('AverageReturn', np.mean(returns))
    logger.record_tabular('StdReturn', np.std(returns))
    logger.record_tabular('MaxReturn', np.max(returns))
    logger.record_tabular('MinReturn', np.min(returns))
    if len(self.es_path_returns) > 0:
        logger.record_tabular('AverageEsReturn', np.mean(self.es_path_returns))
        logger.record_tabular('StdEsReturn', np.std(self.es_path_returns))
        logger.record_tabular('MaxEsReturn', np.max(self.es_path_returns))
        logger.record_tabular('MinEsReturn', np.min(self.es_path_returns))
    logger.record_tabular('AverageDiscountedReturn', average_discounted_return)
    logger.record_tabular('AverageQLoss', average_q_loss)
    logger.record_tabular('AveragePolicySurr', average_policy_surr)
    logger.record_tabular('AverageQ', np.mean(all_qs))
    logger.record_tabular('AverageAbsQ', np.mean(np.abs(all_qs)))
    logger.record_tabular('AverageY', np.mean(all_ys))
    logger.record_tabular('AverageAbsY', np.mean(np.abs(all_ys)))
    logger.record_tabular('AverageAbsQYDiff', np.mean(np.abs(all_qs - all_ys)))
    logger.record_tabular('AverageAction', average_action)
    logger.record_tabular('PolicyRegParamNorm', policy_reg_param_norm)
    logger.record_tabular('QFunRegParamNorm', qfun_reg_param_norm)

    self.env.log_diagnostics(paths)
    self.policy.log_diagnostics(paths)

    self.qf_loss_averages = []
    self.policy_surr_averages = []
    self.q_averages = []
    self.y_averages = []
    self.es_path_returns = []
def optimize_policy(self, itr, samples_data):
    # Init vars
    rewards = samples_data['rewards']
    actions = samples_data['actions']
    observations = samples_data['observations']

    agent_infos = samples_data["agent_infos"]
    state_info_list = [agent_infos[k] for k in self.policy.state_info_keys]
    dist_info_list = [
        agent_infos[k] for k in self.policy.distribution.dist_info_keys
    ]
    if self.policy.recurrent:
        recurrent_vals = [samples_data["valids"]]
    else:
        recurrent_vals = []

    # Compute sample Bellman error.
    feat_diff = []
    for path in samples_data['paths']:
        feats = self._features(path)
        feats = np.vstack([feats, np.zeros(feats.shape[1])])
        feat_diff.append(feats[1:] - feats[:-1])
    if self.policy.recurrent:
        max_path_length = max(
            [len(path["advantages"]) for path in samples_data["paths"]])
        # pad feature diffs
        feat_diff = np.array([
            tensor_utils.pad_tensor(fd, max_path_length) for fd in feat_diff
        ])
    else:
        feat_diff = np.vstack(feat_diff)

    #################
    # Optimize dual #
    #################

    # Here we need to optimize the dual through BFGS in order to obtain the \eta
    # value. Initialize the dual function g(\theta, v), with \eta > 0.
    # First eval delta_v.
    f_dual = self.opt_info['f_dual']
    f_dual_grad = self.opt_info['f_dual_grad']

    # Set BFGS eval function
    def eval_dual(input):
        param_eta = input[0]
        param_v = input[1:]
        val = f_dual(*([rewards, feat_diff] + state_info_list + recurrent_vals +
                       [param_eta, param_v]))
        return val.astype(np.float64)

    # Set BFGS gradient eval function
    def eval_dual_grad(input):
        param_eta = input[0]
        param_v = input[1:]
        grad = f_dual_grad(*([rewards, feat_diff] + state_info_list +
                             recurrent_vals + [param_eta, param_v]))
        eta_grad = float(grad[0])
        v_grad = grad[1]
        return np.hstack([eta_grad, v_grad])

    # Initial BFGS parameter values.
    x0 = np.hstack([self.param_eta, self.param_v])

    # Set parameter boundaries: \eta > 0, v unrestricted.
    bounds = [(-np.inf, np.inf) for _ in x0]
    bounds[0] = (0., np.inf)

    # Optimize through BFGS
    logger.log('optimizing dual')
    eta_before = x0[0]
    dual_before = eval_dual(x0)
    params_ast, _, _ = self.optimizer(
        func=eval_dual,
        x0=x0,
        fprime=eval_dual_grad,
        bounds=bounds,
        maxiter=self.max_opt_itr,
        disp=0)
    dual_after = eval_dual(params_ast)

    # Optimal values have been obtained
    self.param_eta = params_ast[0]
    self.param_v = params_ast[1:]

    ###################
    # Optimize policy #
    ###################
    cur_params = self.policy.get_param_values(trainable=True)
    f_loss = self.opt_info["f_loss"]
    f_loss_grad = self.opt_info['f_loss_grad']
    input = [rewards, observations, feat_diff, actions] + state_info_list + \
        recurrent_vals + [self.param_eta, self.param_v]

    # Set loss eval function
    def eval_loss(params):
        self.policy.set_param_values(params, trainable=True)
        val = f_loss(*input)
        return val.astype(np.float64)

    # Set loss gradient eval function
    def eval_loss_grad(params):
        self.policy.set_param_values(params, trainable=True)
        grad = f_loss_grad(*input)
        flattened_grad = tensor_utils.flatten_tensors(list(map(np.asarray, grad)))
        return flattened_grad.astype(np.float64)

    loss_before = eval_loss(cur_params)
    logger.log('optimizing policy')
    params_ast, _, _ = self.optimizer(
        func=eval_loss,
        x0=cur_params,
        fprime=eval_loss_grad,
        disp=0,
        maxiter=self.max_opt_itr)
    loss_after = eval_loss(params_ast)

    f_kl = self.opt_info['f_kl']
    mean_kl = f_kl(*([observations, actions] + state_info_list + dist_info_list +
                     recurrent_vals)).astype(np.float64)

    logger.log('eta %f -> %f' % (eta_before, self.param_eta))

    logger.record_tabular("LossBefore", loss_before)
    logger.record_tabular("LossAfter", loss_after)
    logger.record_tabular('DualBefore', dual_before)
    logger.record_tabular('DualAfter', dual_after)
    logger.record_tabular('MeanKL', mean_kl)
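# A minimal, standalone sketch (hypothetical, not from this repo) of the bounded
# L-BFGS call pattern used above: `self.optimizer` is assumed to behave like
# scipy.optimize.fmin_l_bfgs_b, which returns (x_opt, f_opt, info) and accepts
# `bounds` so that eta stays positive while v is unconstrained. toy_dual and
# toy_dual_grad are made-up stand-ins for the REPS dual and its gradient.
import numpy as np
from scipy.optimize import fmin_l_bfgs_b


def toy_dual(x):
    # placeholder dual: a simple convex function of [eta, v0, v1]
    return (x[0] - 1.0) ** 2 + np.sum(x[1:] ** 2)


def toy_dual_grad(x):
    g = 2.0 * x.copy()
    g[0] = 2.0 * (x[0] - 1.0)
    return g


x0 = np.array([0.5, 0.3, -0.2])
bounds = [(0.0, None)] + [(None, None)] * (len(x0) - 1)  # eta > 0, v unrestricted
x_opt, f_opt, info = fmin_l_bfgs_b(func=toy_dual, x0=x0, fprime=toy_dual_grad,
                                   bounds=bounds, maxiter=50, disp=0)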
def obtain_samples(self, itr, reset_args=None, return_dict=False, log_prefix=''):
    # reset_args: arguments to pass to the environments to reset
    # return_dict: whether to return a dictionary or a list of paths
    logger.log("Obtaining samples for iteration %d..." % itr)

    paths = {}
    for i in range(self.vec_env.num_envs):
        paths[i] = []
    # if the reset args are not list/numpy, we set the same args for each env
    if reset_args is not None and (type(reset_args) != list
                                   and type(reset_args) != np.ndarray):
        reset_args = [reset_args] * self.vec_env.num_envs

    n_samples = 0
    obses = self.vec_env.reset(reset_args)
    dones = np.asarray([True] * self.vec_env.num_envs)
    running_paths = [None] * self.vec_env.num_envs

    pbar = ProgBarCounter(self.algo.batch_size)
    policy_time = 0
    env_time = 0
    process_time = 0

    policy = self.algo.policy
    import time

    while n_samples < self.algo.batch_size:
        t = time.time()
        policy.reset(dones)
        actions, agent_infos = policy.get_actions(obses)
        policy_time += time.time() - t

        t = time.time()
        next_obses, rewards, dones, env_infos = self.vec_env.step(actions, reset_args)
        env_time += time.time() - t

        t = time.time()
        agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
        env_infos = tensor_utils.split_tensor_dict_list(env_infos)
        if env_infos is None:
            env_infos = [dict() for _ in range(self.vec_env.num_envs)]
        if agent_infos is None:
            agent_infos = [dict() for _ in range(self.vec_env.num_envs)]
        for idx, observation, action, reward, env_info, agent_info, done in zip(
                itertools.count(), obses, actions, rewards, env_infos,
                agent_infos, dones):
            if running_paths[idx] is None:
                running_paths[idx] = dict(
                    observations=[],
                    actions=[],
                    rewards=[],
                    env_infos=[],
                    agent_infos=[],
                )
            running_paths[idx]["observations"].append(observation)
            running_paths[idx]["actions"].append(action)
            running_paths[idx]["rewards"].append(reward)
            running_paths[idx]["env_infos"].append(env_info)
            running_paths[idx]["agent_infos"].append(agent_info)
            if done:
                paths[idx].append(dict(
                    observations=self.env_spec.observation_space.flatten_n(
                        running_paths[idx]["observations"]),
                    actions=self.env_spec.action_space.flatten_n(
                        running_paths[idx]["actions"]),
                    rewards=tensor_utils.stack_tensor_list(
                        running_paths[idx]["rewards"]),
                    env_infos=tensor_utils.stack_tensor_dict_list(
                        running_paths[idx]["env_infos"]),
                    agent_infos=tensor_utils.stack_tensor_dict_list(
                        running_paths[idx]["agent_infos"]),
                ))
                n_samples += len(running_paths[idx]["rewards"])
                running_paths[idx] = None
        process_time += time.time() - t
        pbar.inc(len(obses))
        obses = next_obses

    pbar.stop()

    logger.record_tabular(log_prefix + "PolicyExecTime", policy_time)
    logger.record_tabular(log_prefix + "EnvExecTime", env_time)
    logger.record_tabular(log_prefix + "ProcessExecTime", process_time)

    if not return_dict:
        flatten_list = lambda l: [item for sublist in l for item in sublist]
        paths = flatten_list(paths.values())

    return paths
def train(self):
    cur_std = self.sigma0
    cur_mean = self.policy.get_param_values()
    es = cma_es_lib.CMAEvolutionStrategy(cur_mean, cur_std)

    parallel_sampler.populate_task(self.env, self.policy)
    if self.plot:
        plotter.init_plot(self.env, self.policy)

    cur_std = self.sigma0
    cur_mean = self.policy.get_param_values()

    itr = 0
    while itr < self.n_itr and not es.stop():
        if self.batch_size is None:
            # Sample from multivariate normal distribution.
            xs = es.ask()
            xs = np.asarray(xs)
            # For each sample, do a rollout.
            infos = (stateful_pool.singleton_pool.run_map(
                sample_return,
                [(x, self.max_path_length, self.discount) for x in xs]))
        else:
            cum_len = 0
            infos = []
            xss = []
            done = False
            while not done:
                sbs = stateful_pool.singleton_pool.n_parallel * 2
                # Sample from multivariate normal distribution.
                # You want to ask for sbs samples here.
                xs = es.ask(sbs)
                xs = np.asarray(xs)
                xss.append(xs)
                sinfos = stateful_pool.singleton_pool.run_map(
                    sample_return,
                    [(x, self.max_path_length, self.discount) for x in xs])
                for info in sinfos:
                    infos.append(info)
                    cum_len += len(info['returns'])
                    if cum_len >= self.batch_size:
                        xs = np.concatenate(xss)
                        done = True
                        break

        # Evaluate fitness of samples (negated, since CMA-ES solves a minimization
        # problem).
        fs = -np.array([info['returns'][0] for info in infos])
        # When batching, we may have generated more samples than were actually
        # evaluated, so cut off the extras here.
        xs = xs[:len(fs)]
        # Update CMA-ES params based on sample fitness.
        es.tell(xs, fs)

        logger.push_prefix('itr #%d | ' % itr)
        logger.record_tabular('Iteration', itr)
        logger.record_tabular('CurStdMean', np.mean(cur_std))
        undiscounted_returns = np.array(
            [info['undiscounted_return'] for info in infos])
        logger.record_tabular('AverageReturn', np.mean(undiscounted_returns))
        logger.record_tabular('StdReturn', np.std(undiscounted_returns))
        logger.record_tabular('MaxReturn', np.max(undiscounted_returns))
        logger.record_tabular('MinReturn', np.min(undiscounted_returns))
        logger.record_tabular('AverageDiscountedReturn', np.mean(fs))
        logger.record_tabular(
            'AvgTrajLen',
            np.mean([len(info['returns']) for info in infos]))

        self.env.log_diagnostics(infos)
        self.policy.log_diagnostics(infos)

        logger.save_itr_params(
            itr,
            dict(
                itr=itr,
                policy=self.policy,
                env=self.env,
            ))
        logger.dump_tabular(with_prefix=False)
        if self.plot:
            plotter.update_plot(self.policy, self.max_path_length)
        logger.pop_prefix()
        # Update iteration.
        itr += 1

    # Set final params.
    self.policy.set_param_values(es.result()[0])
    parallel_sampler.terminate_task()
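# A minimal, self-contained sketch (hypothetical, not from this repo) of the ask/tell
# loop above, assuming the standard `cma` package API (pip install cma), which the
# bundled cma_es_lib mirrors: ask() proposes candidate parameter vectors, tell()
# updates the search distribution from their (to-be-minimized) fitness values.
# negative_return is a made-up stand-in for a negated rollout return.
import numpy as np
import cma


def negative_return(params):
    # stand-in for a rollout: pretend the optimum is at the all-ones vector
    return float(np.sum((params - 1.0) ** 2))


es = cma.CMAEvolutionStrategy(x0=np.zeros(5), sigma0=0.5)
for _ in range(20):
    candidates = es.ask()                      # sample candidate solutions
    fitnesses = [negative_return(x) for x in candidates]
    es.tell(candidates, fitnesses)             # update search mean/covariance
best_params = es.result[0]                     # best solution so far (cma >= 2 exposes .result as a property)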
def log_diagnostics(self, paths):
    log_stds = np.vstack(
        [path["agent_infos"]["log_std"] for path in paths])
    logger.record_tabular('AveragePolicyStd', np.mean(np.exp(log_stds)))
def train(self):
    with tf.Session() as sess:
        if self.load_policy is not None:
            import joblib
            self.policy = joblib.load(self.load_policy)['policy']
        self.init_opt()
        # initialize uninitialized vars (I know, it's ugly)
        uninit_vars = []
        for var in tf.global_variables():
            try:
                sess.run(var)
            except tf.errors.FailedPreconditionError:
                uninit_vars.append(var)
        sess.run(tf.variables_initializer(uninit_vars))

        self.start_worker()
        start_time = time.time()
        for itr in range(self.start_itr, self.n_itr):
            itr_start_time = time.time()
            with logger.prefix('itr #%d | ' % itr):
                logger.log("Obtaining samples...")
                paths = self.obtain_samples(itr)
                logger.log("Processing samples...")
                samples_data = self.process_samples(itr, paths)
                logger.log("Logging diagnostics...")
                self.log_diagnostics(paths)
                logger.log("Optimizing policy...")
                self.optimize_policy(itr, samples_data)
                logger.log("Saving snapshot...")
                params = self.get_itr_snapshot(itr, samples_data)
                if self.store_paths:
                    params["paths"] = samples_data["paths"]
                logger.save_itr_params(itr, params)
                logger.log("Saved")
                logger.record_tabular('Time', time.time() - start_time)
                logger.record_tabular('ItrTime', time.time() - itr_start_time)
                logger.dump_tabular(with_prefix=False)
                if self.plot:
                    self.update_plot()
                    if self.pause_for_plot:
                        input("Plotting evaluation run: Press Enter to "
                              "continue...")
        self.shutdown_worker()