)
actions_var = env.action_space.new_tensor_variable(
    'actions',
    extra_dims=1
)
d_rewards_var = TT.vector('d_rewards')
importance_weights_var = TT.vector('importance_weight')

# policy.dist_info_sym returns a dictionary whose values are symbolic expressions for quantities related to the
# distribution of the actions. For a Gaussian policy, it contains the mean and (log) standard deviation.
dist_info_vars = policy.dist_info_sym(observations_var)
snap_dist_info_vars = snap_policy.dist_info_sym(observations_var)

surr = TT.sum(-dist.log_likelihood_sym_1traj_GPOMDP(actions_var, dist_info_vars) * d_rewards_var)

params = policy.get_params(trainable=True)
snap_params = snap_policy.get_params(trainable=True)

importance_weights = dist.likelihood_ratio_sym_1traj_GPOMDP(actions_var, snap_dist_info_vars, dist_info_vars)

grad = theano.grad(surr, params)

# Placeholders for externally evaluated gradients, one per parameter tensor.
eval_grad1 = TT.matrix('eval_grad1', dtype=grad[0].dtype)
eval_grad2 = TT.vector('eval_grad2', dtype=grad[1].dtype)
eval_grad3 = TT.col('eval_grad3', dtype=grad[2].dtype)
eval_grad4 = TT.vector('eval_grad4', dtype=grad[3].dtype)

# Importance-weighted surrogate on the current policy, and plain surrogate on the snapshot policy.
surr_on1 = TT.sum(-dist.log_likelihood_sym_1traj_GPOMDP(actions_var, dist_info_vars)
                  * d_rewards_var * importance_weights_var)
surr_on2 = TT.sum(snap_dist.log_likelihood_sym_1traj_GPOMDP(actions_var, snap_dist_info_vars) * d_rewards_var)
grad_SVRG = theano.grad(surr_on2, snap_params)
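# The fragment above only builds the symbolic pieces of the SVRG-style estimator. A hedged sketch of how they
# would typically be compiled into callables follows; f_importance_weights and f_surr_grads are illustrative
# names (not from the source), and observations_var is assumed to have been created earlier with
# env.observation_space.new_tensor_variable.
f_importance_weights = theano.function(
    inputs=[observations_var, actions_var],
    outputs=importance_weights,
    allow_input_downcast=True)

# Per-trajectory gradient of the importance-weighted surrogate w.r.t. the current policy parameters,
# concatenated with the snapshot-policy gradient (the two terms combined by an SVRG-style update).
f_surr_grads = theano.function(
    inputs=[observations_var, actions_var, d_rewards_var, importance_weights_var],
    outputs=theano.grad(surr_on1, params) + grad_SVRG,
    allow_input_downcast=True)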
observations_var = env.observation_space.new_tensor_variable(
    'observations',
    # It should have 1 extra dimension since we want to represent a list of observations
    extra_dims=1
)
actions_var = env.action_space.new_tensor_variable('actions', extra_dims=1)
d_rewards_var = TT.vector('d_rewards')

# policy.dist_info_sym returns a dictionary whose values are symbolic expressions for quantities related to the
# distribution of the actions. For a Gaussian policy, it contains the mean and (log) standard deviation.
dist_info_vars = policy.dist_info_sym(observations_var)

surr = TT.sum(-dist.log_likelihood_sym_1traj_GPOMDP(actions_var, dist_info_vars) * d_rewards_var)

params = policy.get_params(trainable=True)
grad = theano.grad(surr, params)

# Placeholders for externally evaluated gradients, one per parameter tensor.
eval_grad1 = TT.matrix('eval_grad1', dtype=grad[0].dtype)
eval_grad2 = TT.vector('eval_grad2', dtype=grad[1].dtype)
eval_grad3 = TT.col('eval_grad3', dtype=grad[2].dtype)
eval_grad4 = TT.vector('eval_grad4', dtype=grad[3].dtype)
eval_grad5 = TT.vector('eval_grad5', dtype=grad[4].dtype)

f_train = theano.function(
    inputs=[observations_var, actions_var, d_rewards_var],
    outputs=grad)
f_update = theano.function(
    inputs=[eval_grad1, eval_grad2, eval_grad3, eval_grad4, eval_grad5],
    outputs=None,
    updates=sgd([eval_grad1, eval_grad2, eval_grad3, eval_grad4, eval_grad5],
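# The excerpt breaks off inside the sgd(...) call. A hedged completion of what that update function plausibly
# looks like, assuming learning_rate is defined earlier in the script and sgd is lasagne.updates.sgd:
f_update = theano.function(
    inputs=[eval_grad1, eval_grad2, eval_grad3, eval_grad4, eval_grad5],
    outputs=None,
    updates=sgd([eval_grad1, eval_grad2, eval_grad3, eval_grad4, eval_grad5],
                params, learning_rate=learning_rate),
    allow_input_downcast=True)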
def run_task(*_):
    # normalize() makes sure that the actions for the environment lie
    # within the range [-1, 1] (only works for environments with continuous actions)
    env = normalize(
        GymEnv(env_name="LunarLanderContinuous-v2", force_reset=True))
    # Initialize a neural network policy with two hidden layers of 64 units each
    policy = GaussianMLPPolicy(env.spec, hidden_sizes=(64, 64))

    # Initialize a linear baseline estimator using default hand-crafted features
    baseline = LinearFeatureBaseline(env.spec)

    # We will collect 3 trajectories per iteration
    N = 3
    # Each trajectory will have at most 400 time steps
    T = 400
    # Number of iterations
    n_itr = 1000
    # Set the discount factor for the problem
    discount = 0.99
    # Learning rate for the gradient update
    learning_rate = 0.001

    # Construct the computation graph

    # Create a Theano variable for storing the observations
    # We could have simply written `observations_var = TT.matrix('observations')` instead for this example. However,
    # doing it in a slightly more abstract way allows us to delegate to the environment for handling the correct data
    # type for the variable. For instance, for an environment with discrete observations, we might want to use integer
    # types if the observations are represented as one-hot vectors.
    observations_var = env.observation_space.new_tensor_variable(
        'observations',
        # It should have 1 extra dimension since we want to represent a list of observations
        extra_dims=1)
    actions_var = env.action_space.new_tensor_variable('actions', extra_dims=1)
    advantages_var = TT.vector('advantages')

    # policy.dist_info_sym returns a dictionary whose values are symbolic expressions for quantities related to the
    # distribution of the actions. For a Gaussian policy, it contains the mean and (log) standard deviation.
    dist_info_vars = policy.dist_info_sym(observations_var)

    # policy.distribution returns a distribution object under rllab.distributions. It contains many utilities for
    # computing distribution-related quantities, given the computed dist_info_vars. Below we use dist.log_likelihood_sym
    # to compute the symbolic log-likelihood. For this example, the corresponding distribution is an instance of the
    # class rllab.distributions.DiagonalGaussian
    dist = policy.distribution

    # Note that we negate the objective, since most optimizers assume a
    # minimization problem
    surr = -TT.mean(
        dist.log_likelihood_sym(actions_var, dist_info_vars) * advantages_var)

    # Get the list of trainable parameters.
    params = policy.get_params(trainable=True)
    grads = theano.grad(surr, params)

    f_train = theano.function(
        inputs=[observations_var, actions_var, advantages_var],
        outputs=None,
        updates=adam(grads, params, learning_rate=learning_rate),
        allow_input_downcast=True)

    for epoch in range(n_itr):
        logger.push_prefix('epoch #%d | ' % epoch)
        logger.log("Training started")
        paths = []

        for _ in range(N):
            observations = []
            actions = []
            rewards = []

            observation = env.reset()

            for _ in range(T):
                # policy.get_action() returns a pair of values. The second one is a dictionary whose values contain
                # sufficient statistics for the action distribution. It should at least contain entries that would be
                # returned by calling policy.dist_info(), which is the non-symbolic analog of policy.dist_info_sym().
                # Storing these statistics is useful, e.g., when forming importance sampling ratios. In our case it
                # is not needed.
                action, _ = policy.get_action(observation)
                # Recall that the last entry of the tuple stores diagnostic information about the environment. In our
                # case it is not needed.
                next_observation, reward, terminal, _ = env.step(action)
                observations.append(observation)
                actions.append(action)
                rewards.append(reward)
                observation = next_observation
                if terminal:
                    # Finish rollout if terminal state reached
                    break

            # We need to compute the empirical return for each time step along the
            # trajectory
            path = dict(
                observations=np.array(observations),
                actions=np.array(actions),
                rewards=np.array(rewards),
            )
            path_baseline = baseline.predict(path)
            advantages = []
            returns = []
            return_so_far = 0
            for t in range(len(rewards) - 1, -1, -1):
                return_so_far = rewards[t] + discount * return_so_far
                returns.append(return_so_far)
                advantage = return_so_far - path_baseline[t]
                advantages.append(advantage)
            # The advantages are stored backwards in time, so we need to reverse them
            advantages = np.array(advantages[::-1])
            # And we need to do the same thing for the list of returns
            returns = np.array(returns[::-1])

            advantages = (advantages - np.mean(advantages)) / (np.std(advantages) + 1e-8)

            path["advantages"] = advantages
            path["returns"] = returns

            paths.append(path)

        baseline.fit(paths)

        observations = np.concatenate([p["observations"] for p in paths])
        actions = np.concatenate([p["actions"] for p in paths])
        advantages = np.concatenate([p["advantages"] for p in paths])

        f_train(observations, actions, advantages)

        returns_to_check = [sum(p["rewards"]) for p in paths]
        print('Average Return:', np.mean(returns_to_check))

        ############################################################################
        logger.log("Training finished")
        # Record the tabular stats before dumping them so they appear in this epoch's output
        logger.record_tabular('Epoch', epoch)
        logger.record_tabular('Steps', epoch * N * T)
        logger.record_tabular('AverageReturn', np.mean(returns_to_check))
        logger.record_tabular('StdReturn', np.std(returns_to_check))
        logger.record_tabular('MaxReturn', np.max(returns_to_check))
        logger.record_tabular('MinReturn', np.min(returns_to_check))
        logger.save_itr_params(epoch, params)
        logger.dump_tabular(with_prefix=False)
        logger.pop_prefix()
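# Hedged usage sketch (the specific arguments are illustrative): run_task is the kind of function that is
# typically handed to rllab's experiment launcher.
from rllab.misc.instrument import run_experiment_lite

run_experiment_lite(
    run_task,
    n_parallel=1,
    snapshot_mode="last",
    seed=1,
)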
# policy.dist_info_sym returns a dictionary whose values are symbolic expressions for quantities related to the
# distribution of the actions. For a Gaussian policy, it contains the mean and (log) standard deviation.
dist_info_vars = policy.dist_info_sym(observations_var)

# policy.distribution returns a distribution object under rllab.distributions. It contains many utilities for
# computing distribution-related quantities, given the computed dist_info_vars. Below we use dist.log_likelihood_sym
# to compute the symbolic log-likelihood. For this example, the corresponding distribution is an instance of the
# class rllab.distributions.DiagonalGaussian
dist = policy.distribution

# Note that we negate the objective, since most optimizers assume a
# minimization problem
surr = -TT.mean(dist.log_likelihood_sym(actions_var, dist_info_vars) * advantages_var)

# Get the list of trainable parameters.
params = policy.get_params(trainable=True)
grads = theano.grad(surr, params)

f_train = theano.function(
    inputs=[observations_var, actions_var, advantages_var],
    outputs=None,
    updates=adam(grads, params, learning_rate=learning_rate),
    allow_input_downcast=True
)

for _ in range(n_itr):
    paths = []

    for _ in range(N):
        observations = []
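# The comments above reference rllab.distributions.DiagonalGaussian. As a point of reference, a minimal NumPy
# sketch (not from the source) of the per-sample log-likelihood that dist.log_likelihood_sym builds symbolically:
import numpy as np

def diag_gaussian_log_likelihood(actions, means, log_stds):
    # actions, means, log_stds: arrays of shape (batch, action_dim)
    zs = (actions - means) / np.exp(log_stds)
    return (-0.5 * np.sum(np.square(zs), axis=-1)
            - np.sum(log_stds, axis=-1)
            - 0.5 * actions.shape[-1] * np.log(2 * np.pi))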
            compress=True)
print('------- initial training complete ---------------')

init_param_value = np.copy(policy.get_param_values())

task_grads = []
for i in range(2):
    task_grads.append([])

if not load_split_data:
    split_data = []
    net_weights = []
    for i in range(grad_epochs):
        cur_param_val = np.copy(policy.get_param_values())
        cur_param = copy.deepcopy(policy.get_params())
        cp = []
        for param in policy._mean_network.get_params():
            cp.append(np.copy(param.get_value()))
        net_weights.append(cp)

        paths = algo.sampler.obtain_samples(0)
        split_data.append(paths)
        algo.sampler.process_samples(0, paths)
        samples_data = algo.sampler.process_samples(0, paths)
        opt_data = algo.optimize_policy(0, samples_data)

    joblib.dump(split_data,
                'data/trained/gradient_temp/rl_split_' + append + '/split_data.pkl',
def perform_evaluation(num_parallel, hidden_size, batch_size, pathlength, random_split, prioritized_split, adaptive_sample, initialize_epochs, grad_epochs, test_epochs, append, task_size, load_init_policy, load_split_data, alternate_update, accumulate_gradient, imbalance_sample, sample_ratio, split_percentages, env_name, seed, test_num=1, param_update_start=50, param_update_frequency=50, param_update_end=200, use_param_variance=0, param_variance_batch=10000, param_variance_sample=100, reverse_metric=False): reps = 1 learning_curves = [] kl_divergences = [] for i in range(len(split_percentages)): learning_curves.append([]) kl_divergences.append([]) performances = [] diretory = 'data/trained/gradient_temp/rl_split_' + append if not os.path.exists(diretory): os.makedirs(diretory) os.makedirs(diretory + '/policies') for testit in range(test_num): print('======== Start Test ', testit, ' ========') env = normalize(GymEnv(env_name, record_log=False, record_video=False)) dartenv = env._wrapped_env.env.env if env._wrapped_env.monitoring: dartenv = dartenv.env np.random.seed(testit * 3 + seed) random.seed(testit * 3 + seed) pre_training_learning_curve = [] policy = GaussianMLPPolicy( env_spec=env.spec, # The neural network policy should have two hidden layers, each with 32 hidden units. hidden_sizes=hidden_size, # append_dim=2, net_mode=0, ) baseline = LinearFeatureBaseline(env_spec=env.spec, additional_dim=0) if load_init_policy: policy = joblib.load(diretory + '/init_policy.pkl') if adaptive_sample: new_batch_size = int(batch_size / task_size) else: new_batch_size = batch_size algo = TRPO( # _MultiTask( env=env, policy=policy, baseline=baseline, batch_size=new_batch_size, max_path_length=pathlength, n_itr=5, discount=0.995, step_size=0.02, gae_lambda=0.97, whole_paths=False, # task_num=task_size, ) algo.init_opt() from rllab.sampler import parallel_sampler parallel_sampler.initialize(n_parallel=num_parallel) parallel_sampler.set_seed(0) algo.start_worker() if not load_init_policy: for i in range(initialize_epochs): print('------ Iter ', i, ' in Init Training --------') if adaptive_sample: paths = [] reward_paths = [] for t in range(task_size): paths += algo.sampler.obtain_samples(0, t) #reward_paths += algo.sampler.obtain_samples(0) elif imbalance_sample: paths = [] reward_paths = [] for t in range(task_size): algo.batch_size = batch_size * sample_ratio[t] task_path = algo.sampler.obtain_samples(0, t) paths += task_path if t == 0: reward_paths += task_path else: paths = algo.sampler.obtain_samples(0) samples_data = algo.sampler.process_samples(0, paths) opt_data = algo.optimize_policy(0, samples_data) pol_aft = (policy.get_param_values()) print(algo.mean_kl(samples_data)) print(dict(logger._tabular)['AverageReturn']) pre_training_learning_curve.append( dict(logger._tabular)['AverageReturn']) joblib.dump(policy, diretory + '/init_policy.pkl', compress=True) print('------- initial training complete ---------------') if not load_split_data: split_data = [] net_weights = [] net_weight_values = [] for i in range(grad_epochs): cur_param_val = np.copy(policy.get_param_values()) cur_param = copy.deepcopy(policy.get_params()) cp = [] for param in policy._mean_network.get_params(): cp.append(np.copy(param.get_value())) net_weights.append(cp) net_weight_values.append(np.copy(policy.get_param_values())) if adaptive_sample: paths = [] reward_paths = [] for t in range(task_size): paths += algo.sampler.obtain_samples(0, t) #reward_paths += algo.sampler.obtain_samples(0) elif imbalance_sample: paths = [] 
reward_paths = [] for t in range(task_size): algo.batch_size = batch_size * sample_ratio[t] task_path = algo.sampler.obtain_samples(0, t) paths += task_path if t == 0: reward_paths += task_path else: paths = algo.sampler.obtain_samples(0) split_data.append(paths) samples_data = algo.sampler.process_samples(0, paths) opt_data = algo.optimize_policy(0, samples_data) pre_training_learning_curve.append( dict(logger._tabular)['AverageReturn']) joblib.dump(split_data, diretory + '/split_data.pkl', compress=True) joblib.dump(net_weights, diretory + '/net_weights.pkl', compress=True) joblib.dump(net_weight_values, diretory + '/net_weight_values.pkl', compress=True) joblib.dump(pre_training_learning_curve, diretory + '/pretrain_learningcurve_' + str(testit) + '.pkl', compress=True) else: split_data = joblib.load(diretory + '/split_data.pkl') net_weights = joblib.load(diretory + '/net_weights.pkl') net_weight_values = joblib.load(diretory + '/net_weight_values.pkl') pre_training_learning_curve = joblib.load( diretory + '/pretrain_learningcurve_' + str(testit) + '.pkl') task_grads = [] variance_grads = [] for i in range(task_size): task_grads.append([]) for i in range(grad_epochs): policy.set_param_values(net_weight_values[i]) task_paths = [] for j in range(task_size): task_paths.append([]) for path in split_data[i]: taskid = path['env_infos']['state_index'][-1] task_paths[taskid].append(path) for j in range(task_size): samples_data = algo.sampler.process_samples( 0, task_paths[j], False) grad = get_gradient(algo, samples_data, False) task_grads[j].append(grad) if use_param_variance == 1 and i == grad_epochs - 1: for j in range(param_variance_sample): samples_data_ori = algo.sampler.process_samples( 0, split_data[i], False) samples_data = {} indices = np.arange(len(samples_data_ori['observations'])) np.random.shuffle(indices) samples_data["observations"] = samples_data_ori[ "observations"][indices[0:param_variance_batch]] samples_data["actions"] = samples_data_ori["actions"][ indices[0:param_variance_batch]] samples_data["rewards"] = samples_data_ori["rewards"][ indices[0:param_variance_batch]] samples_data["advantages"] = samples_data_ori[ "advantages"][indices[0:param_variance_batch]] samples_data["agent_infos"] = {} samples_data["agent_infos"]["log_std"] = samples_data_ori[ "agent_infos"]["log_std"][ indices[0:param_variance_batch]] samples_data["agent_infos"]["mean"] = samples_data_ori[ "agent_infos"]["mean"][indices[0:param_variance_batch]] grad = get_gradient(algo, samples_data, False) variance_grads.append(grad) algo.sampler.process_samples(0, split_data[i]) weight_variances = [] for i in range(len(task_grads[0][0]) - 1): weight_variances.append(np.zeros(task_grads[0][0][i].shape)) if use_param_variance == 1: for k in range(len(task_grads[0][0]) - 1): one_grad = [] for g in range(len(variance_grads)): one_grad.append(np.asarray(variance_grads[g][k])) weight_variances[k] += np.var(one_grad, axis=0) print('------- collected gradient info -------------') split_counts = [] for i in range(len(task_grads[0][0]) - 1): split_counts.append(np.zeros(task_grads[0][0][i].shape)) for i in range(len(task_grads[0])): for k in range(len(task_grads[0][i]) - 1): region_gradients = [] for region in range(len(task_grads)): region_gradients.append(task_grads[region][i][k]) region_gradients = np.array(region_gradients) if not random_split: split_counts[k] += np.var( region_gradients, axis=0 ) # * np.abs(net_weights[i][k])# + 100 * (len(task_grads[0][i])-k) elif prioritized_split: split_counts[k] += 
np.random.random( split_counts[k].shape) * (len(task_grads[0][i]) - k) else: split_counts[k] += np.random.random(split_counts[k].shape) for j in range(len(split_counts)): plt.figure() plt.title(policy._mean_network.get_params()[j].name) if len(split_counts[j].shape) == 2: plt.imshow(split_counts[j]) plt.colorbar() elif len(split_counts[j].shape) == 1: plt.plot(split_counts[j]) plt.savefig(diretory + '/' + policy._mean_network.get_params()[j].name + '.png') if use_param_variance: plt.figure() plt.title(policy._mean_network.get_params()[j].name) if len(weight_variances[j].shape) == 2: plt.imshow(weight_variances[j]) plt.colorbar() elif len(weight_variances[j].shape) == 1: plt.plot(weight_variances[j]) plt.savefig(diretory + '/' + policy._mean_network.get_params()[j].name + '_variances.png') algo.shutdown_worker() # organize the metric into each edges and sort them split_metrics = [] metrics_list = [] variance_list = [] for k in range(len(task_grads[0][0]) - 1): for index, value in np.ndenumerate(split_counts[k]): split_metrics.append( [k, index, value, weight_variances[k][index]]) metrics_list.append(value) variance_list.append(weight_variances[k][index]) if use_param_variance == 0: split_metrics.sort(key=lambda x: x[2], reverse=True) else: split_metrics.sort(key=lambda x: x[3], reverse=True) # test the effect of splitting total_param_size = len(policy._mean_network.get_param_values()) pred_list = [] # use the optimized network init_param_value = np.copy(policy.get_param_values()) for split_id, split_percentage in enumerate(split_percentages): split_param_size = split_percentage * total_param_size masks = [] for k in range(len(task_grads[0][0]) - 1): masks.append(np.zeros(split_counts[k].shape)) if split_percentage <= 1.0: for i in range(int(split_param_size)): masks[split_metrics[i][0]][split_metrics[i][1]] = 1 else: threshold = np.mean(metrics_list) + np.std(metrics_list) print('threashold,', threshold) for i in range(len(split_metrics)): if split_metrics[i][2] < threshold: break else: masks[split_metrics[i][0]][split_metrics[i][1]] = 1 mask_split_flat = np.array([]) for k in range(int((len(task_grads[0][0]) - 1) / 2)): for j in range(task_size): mask_split_flat = np.concatenate([ mask_split_flat, np.array(masks[k * 2]).flatten(), np.array(masks[k * 2 + 1]).flatten() ]) mask_share_flat = np.ones(len(mask_split_flat)) mask_share_flat -= mask_split_flat if np.abs(split_percentage - 1.0) < 0.0001: mask_split_flat = np.concatenate( [mask_split_flat, np.ones(dartenv.act_dim * task_size)]) mask_share_flat = np.concatenate( [mask_share_flat, np.zeros(dartenv.act_dim * task_size)]) else: mask_split_flat = np.concatenate( [mask_split_flat, np.zeros(dartenv.act_dim)]) mask_share_flat = np.concatenate( [mask_share_flat, np.ones(dartenv.act_dim)]) policy.set_param_values(init_param_value) if split_param_size != 0: if dartenv.avg_div != task_size: dartenv.avg_div = task_size dartenv.obs_dim += dartenv.avg_div high = np.inf * np.ones(dartenv.obs_dim) low = -high dartenv.observation_space = spaces.Box(low, high) env._wrapped_env._observation_space = rllab.envs.gym_env.convert_gym_space( dartenv.observation_space) env.spec = rllab.envs.env_spec.EnvSpec( observation_space=env.observation_space, action_space=env.action_space, ) split_policy = GaussianMLPPolicy( env_spec=env.spec, # The neural network policy should have two hidden layers, each with 32 hidden units. 
hidden_sizes=hidden_size, # append_dim=2, net_mode=8, split_num=task_size, split_masks=masks, split_init_net=policy, split_std=np.abs(split_percentage - 1.0) < 0.0001, ) else: split_policy = copy.deepcopy(policy) if split_param_size == 0: baseline_add = 0 else: baseline_add = task_size # use 0 for now, though task_size should in theory improve performance more split_baseline = LinearFeatureBaseline(env_spec=env.spec, additional_dim=baseline_add) new_batch_size = batch_size if (split_param_size != 0 and alternate_update) or adaptive_sample: new_batch_size = int(batch_size / task_size) split_algo = TRPO( # _MultiTask( env=env, policy=split_policy, baseline=split_baseline, batch_size=new_batch_size, max_path_length=pathlength, n_itr=5, discount=0.995, step_size=0.02, gae_lambda=0.97, whole_paths=False, # task_num=task_size, ) split_algo.init_opt() parallel_sampler.initialize(n_parallel=num_parallel) parallel_sampler.set_seed(0) split_algo.start_worker() if split_param_size != 0: parallel_sampler.update_env_params({ 'avg_div': dartenv.avg_div, 'obs_dim': dartenv.obs_dim, 'observation_space': dartenv.observation_space }) print('Network parameter size: ', total_param_size, len(split_policy.get_param_values())) split_init_param = np.copy(split_policy.get_param_values()) avg_error = 0.0 avg_learning_curve = [] for rep in range(int(reps)): split_policy.set_param_values(split_init_param) learning_curve = [] kl_div_curve = [] for i in range(test_epochs): # if not split if split_param_size == 0: paths, _ = get_samples(split_algo, task_size, adaptive_sample, imbalance_sample, batch_size, sample_ratio) # sanity check samp_num = 0 for p in paths: samp_num += len(p['observations']) print('samp_num: ', samp_num, adaptive_sample, imbalance_sample) samples_data = split_algo.sampler.process_samples( 0, paths) opt_data = split_algo.optimize_policy(0, samples_data) if imbalance_sample: reward = 0 for path in reward_paths: reward += np.sum(path["rewards"]) reward /= len(reward_paths) else: reward = float( (dict(logger._tabular)['AverageReturn'])) kl_div_curve.append(split_algo.mean_kl(samples_data)) print('reward: ', reward) print(split_algo.mean_kl(samples_data)) elif alternate_update: reward = 0 total_traj = 0 task_rewards = [] for j in range(task_size): paths = split_algo.sampler.obtain_samples(0, j) # split_algo.sampler.process_samples(0, paths) samples_data = split_algo.sampler.process_samples( 0, paths) opt_data = split_algo.optimize_policy( 0, samples_data) reward += float((dict( logger._tabular)['AverageReturn'])) * float( (dict(logger._tabular)['NumTrajs'])) total_traj += float( (dict(logger._tabular)['NumTrajs'])) task_rewards.append( dict(logger._tabular)['AverageReturn']) reward /= total_traj print('reward for different tasks: ', task_rewards, reward) elif accumulate_gradient: paths, _ = get_samples(split_algo, task_size, adaptive_sample, imbalance_sample, batch_size, sample_ratio) task_paths = [] task_rewards = [] for j in range(task_size): task_paths.append([]) task_rewards.append([]) for path in paths: taskid = path['env_infos']['state_index'][-1] task_paths[taskid].append(path) task_rewards[taskid].append(np.sum( path['rewards'])) pre_opt_parameter = np.copy( split_policy.get_param_values()) # compute the split gradient first split_policy.set_param_values(pre_opt_parameter) accum_grad = np.zeros(pre_opt_parameter.shape) processed_task_data = [] for j in range(task_size): if len(task_paths[j]) == 0: processed_task_data.append([]) continue split_policy.set_param_values(pre_opt_parameter) # 
split_algo.sampler.process_samples(0, task_paths[j]) samples_data = split_algo.sampler.process_samples( 0, task_paths[j], False) processed_task_data.append(samples_data) #split_algo.optimize_policy(0, samples_data) # if j == 1: accum_grad += split_policy.get_param_values( ) - pre_opt_parameter # sanity check samp_num = 0 for p in paths: samp_num += len(p['observations']) print('samp_num: ', samp_num) # compute the gradient together split_policy.set_param_values(pre_opt_parameter) all_data = split_algo.sampler.process_samples(0, paths) if imbalance_sample: reward = 0 for path in reward_paths: reward += np.sum(path["rewards"]) reward /= len(reward_paths) else: reward = float( (dict(logger._tabular)['AverageReturn'])) split_algo.optimize_policy(0, all_data) all_data_grad = split_policy.get_param_values( ) - pre_opt_parameter # do a line search to project the udpate onto the constraint manifold sum_grad = all_data_grad # * mask_split_flat + all_data_grad * mask_share_flat ls_steps = [] loss_before = split_algo.loss(all_data) for s in range(50): ls_steps.append(0.97**s) for step in ls_steps: split_policy.set_param_values(pre_opt_parameter + sum_grad * step) if split_algo.mean_kl( all_data )[0] < split_algo.step_size: # and split_algo.loss(all_data)[0] < loss_before[0]: break # step=1 split_policy.set_param_values(pre_opt_parameter + sum_grad * step) for j in range(task_size): task_rewards[j] = np.mean(task_rewards[j]) print('reward for different tasks: ', task_rewards, reward) print('mean kl: ', split_algo.mean_kl(all_data), ' step size: ', step) task_mean_kls = [] for j in range(task_size): if len(processed_task_data[j]) == 0: task_mean_kls.append(0) else: task_mean_kls.append( split_algo.mean_kl( processed_task_data[j])[0]) print('mean kl for different tasks: ', task_mean_kls) kl_div_curve.append( np.concatenate( [split_algo.mean_kl(all_data), task_mean_kls])) else: paths = split_algo.sampler.obtain_samples(0) reward = float( (dict(logger._tabular)['AverageReturn'])) task_paths = [] task_rewards = [] for j in range(task_size): task_paths.append([]) task_rewards.append([]) for path in paths: taskid = path['env_infos']['state_index'][-1] task_paths[taskid].append(path) task_rewards[taskid].append(np.sum( path['rewards'])) pre_opt_parameter = np.copy( split_policy.get_param_values()) # optimize the shared part # split_algo.sampler.process_samples(0, paths) samples_data = split_algo.sampler.process_samples( 0, paths) for layer in split_policy._mean_network._layers: for param in layer.get_params(): if 'split' in param.name: layer.params[param].remove('trainable') split_policy._cached_params = {} split_policy._cached_param_dtypes = {} split_policy._cached_param_shapes = {} split_algo.init_opt() print( 'Optimizing shared parameter size: ', len(split_policy.get_param_values(trainable=True))) split_algo.optimize_policy(0, samples_data) # optimize the tasks for layer in split_policy._mean_network._layers: for param in layer.get_params(): if 'split' in param.name: layer.params[param].add('trainable') if 'share' in param.name: layer.params[param].remove('trainable') # shuffle the optimization order opt_order = np.arange(task_size) np.random.shuffle(opt_order) split_policy._cached_params = {} split_policy._cached_param_dtypes = {} split_policy._cached_param_shapes = {} split_algo.init_opt() for taskid in opt_order: # split_algo.sampler.process_samples(0, task_paths[taskid]) samples_data = split_algo.sampler.process_samples( 0, task_paths[taskid]) print( 'Optimizing parameter size: ', len( 
split_policy.get_param_values( trainable=True))) split_algo.optimize_policy(0, samples_data) for layer in split_policy._mean_network._layers: for param in layer.get_params(): if 'share' in param.name: layer.params[param].add('trainable') for j in range(task_size): task_rewards[j] = np.mean(task_rewards[j]) print('reward for different tasks: ', task_rewards, reward) learning_curve.append(reward) if (i + initialize_epochs + grad_epochs) % param_update_frequency == 0 and ( i + initialize_epochs + grad_epochs) < param_update_end and ( i + initialize_epochs + grad_epochs) > param_update_start: print("Updating model parameters...") parallel_sampler.update_env_params( {'task_expand_flag': True}) print('============= Finished ', split_percentage, ' Rep ', rep, ' test ', i, ' ================') print(diretory) joblib.dump(split_policy, diretory + '/policies/policy_' + str(rep) + '_' + str(i) + '_' + str(split_percentage) + '.pkl', compress=True) avg_learning_curve.append(learning_curve) kl_divergences[split_id].append(kl_div_curve) joblib.dump(split_policy, diretory + '/policies/final_policy_' + str(split_percentage) + '.pkl', compress=True) avg_error += float(reward) pred_list.append(avg_error / reps) print(split_percentage, avg_error / reps) split_algo.shutdown_worker() print(avg_learning_curve) avg_learning_curve = np.mean(avg_learning_curve, axis=0) learning_curves[split_id].append(avg_learning_curve) # output the learning curves so far joblib.dump(learning_curves, diretory + '/learning_curve.pkl', compress=True) avg_learning_curve = [] for lc in range(len(learning_curves)): avg_learning_curve.append(np.mean(learning_curves[lc], axis=0)) plt.figure() for lc in range(len(learning_curves)): plt.plot(avg_learning_curve[lc], label=str(split_percentages[lc])) plt.legend(bbox_to_anchor=(0.3, 0.3), bbox_transform=plt.gcf().transFigure, numpoints=1) plt.savefig(diretory + '/split_learning_curves.png') if len(kl_divergences[0]) > 0: #print('kldiv:', kl_divergences) avg_kl_div = [] for i in range(len(kl_divergences)): if len(kl_divergences[i]) > 0: avg_kl_div.append(np.mean(kl_divergences[i], axis=0)) #print(avg_kl_div) joblib.dump(avg_kl_div, diretory + '/kl_divs.pkl', compress=True) for i in range(len(avg_kl_div)): one_perc_kl_div = np.array(avg_kl_div[i]) #print(i, one_perc_kl_div) plt.figure() for j in range(len(one_perc_kl_div[0])): append = 'task%d' % j if j == 0: append = 'all' plt.plot(one_perc_kl_div[:, j], label=str(split_percentages[i]) + append, alpha=0.3) plt.legend(bbox_to_anchor=(0.3, 0.3), bbox_transform=plt.gcf().transFigure, numpoints=1) plt.savefig(diretory + '/kl_div_%s.png' % str(split_percentages[i])) performances.append(pred_list) np.savetxt(diretory + '/performance.txt', performances) plt.figure() plt.plot(split_percentages, np.mean(performances, axis=0)) plt.savefig(diretory + '/split_performance.png') joblib.dump(learning_curves, diretory + '/learning_curve.pkl', compress=True) avg_learning_curve = [] for i in range(len(learning_curves)): avg_learning_curve.append(np.mean(learning_curves[i], axis=0)) plt.figure() for i in range(len(split_percentages)): plt.plot(avg_learning_curve[i], label=str(split_percentages[i])) plt.legend(bbox_to_anchor=(0.3, 0.3), bbox_transform=plt.gcf().transFigure, numpoints=1) plt.savefig(diretory + '/split_learning_curves.png') #np.savetxt(diretory + '/learning_curves.txt', avg_learning_curve) if len(kl_divergences[0]) > 0: avg_kl_div = [] for i in range(len(kl_divergences)): avg_kl_div.append(np.mean(kl_divergences[i], axis=0)) 
joblib.dump(avg_kl_div, diretory + '/kl_divs.pkl', compress=True) for i in range(len(avg_kl_div)): one_perc_kl_div = np.array(avg_kl_div[i]) plt.figure() for j in range(len(one_perc_kl_div[0])): append = 'task%d' % j if j == 0: append = 'all' plt.plot(one_perc_kl_div[:, j], label=str(split_percentages[i]) + append, alpha=0.3) plt.legend(bbox_to_anchor=(0.3, 0.3), bbox_transform=plt.gcf().transFigure, numpoints=1) plt.savefig(diretory + '/kl_div_%s.png' % str(split_percentages[i])) plt.close('all') print(diretory)
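# For readability, a compact NumPy sketch (illustrative names, not the source implementation) of the split
# metric computed above: per-task gradients are compared across tasks, the variance of each parameter entry is
# accumulated over epochs, and the highest-variance fraction of entries is marked for splitting. The original
# code additionally excludes the last parameter tensor, supports random / prioritized baselines for the metric,
# and can rank by a per-parameter gradient-variance estimate instead (use_param_variance).
import numpy as np

def split_masks_from_task_grads(task_grads, split_fraction):
    # task_grads[task][epoch][layer] is a gradient array for one layer
    n_epochs = len(task_grads[0])
    n_layers = len(task_grads[0][0])
    split_counts = [np.zeros_like(g) for g in task_grads[0][0]]
    for epoch in range(n_epochs):
        for k in range(n_layers):
            region_gradients = np.array(
                [task_grads[t][epoch][k] for t in range(len(task_grads))])
            # disagreement between tasks on this parameter block
            split_counts[k] += np.var(region_gradients, axis=0)
    # rank every parameter entry by accumulated variance and split the top fraction
    flat = np.concatenate([c.ravel() for c in split_counts])
    n_split = int(split_fraction * flat.size)
    if n_split == 0:
        return [np.zeros_like(c) for c in split_counts]
    threshold = np.sort(flat)[::-1][n_split - 1]
    return [(c >= threshold).astype(float) for c in split_counts]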
def doit(mode):
    from rllab.envs.box2d.cartpole_env import CartpoleEnv
    from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
    from rllab.baselines.zero_baseline import ZeroBaseline
    from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy
    from rllab.envs.normalized_env import normalize
    import numpy as np
    import theano
    import theano.tensor as TT
    from lasagne.updates import adam

    # normalize() makes sure that the actions for the environment lie
    # within the range [-1, 1] (only works for environments with continuous actions)
    env = normalize(CartpoleEnv())
    # Initialize a neural network policy with a single hidden layer of 8 hidden units
    policy = GaussianMLPPolicy(env.spec, hidden_sizes=(8,))

    # Initialize a baseline estimator depending on the requested mode
    if "linbaseline" in mode:
        print('linear baseline')
        baseline = LinearFeatureBaseline(env.spec)
    elif "vanilla" in mode:
        print("zero baseline")
        baseline = ZeroBaseline(env.spec)
    elif mode == "batchavg":
        print('batch average baseline')
        # use a zero baseline but subtract the mean of the discounted returns (see below)
        baseline = ZeroBaseline(env.spec)

    if "_ztrans" in mode:
        print('z transform advantages')
    else:
        print('no z transform')

    # We will collect 50 trajectories per iteration
    N = 50
    # Each trajectory will have at most 50 time steps
    T = 50
    # Number of iterations
    n_itr = 50
    # Set the discount factor for the problem
    discount = 0.99
    # Learning rate for the gradient update
    learning_rate = 0.1

    # Construct the computation graph

    # Create a Theano variable for storing the observations
    # We could have simply written `observations_var = TT.matrix('observations')` instead for this example. However,
    # doing it in a slightly more abstract way allows us to delegate to the environment for handling the correct data
    # type for the variable. For instance, for an environment with discrete observations, we might want to use integer
    # types if the observations are represented as one-hot vectors.
    observations_var = env.observation_space.new_tensor_variable(
        'observations',
        # It should have 1 extra dimension since we want to represent a list of observations
        extra_dims=1
    )
    actions_var = env.action_space.new_tensor_variable(
        'actions',
        extra_dims=1
    )
    advantages_var = TT.vector('advantages')

    # policy.dist_info_sym returns a dictionary whose values are symbolic expressions for quantities related to the
    # distribution of the actions. For a Gaussian policy, it contains the mean and (log) standard deviation.
    dist_info_vars = policy.dist_info_sym(observations_var)

    # policy.distribution returns a distribution object under rllab.distributions. It contains many utilities for
    # computing distribution-related quantities, given the computed dist_info_vars. Below we use dist.log_likelihood_sym
    # to compute the symbolic log-likelihood. For this example, the corresponding distribution is an instance of the
    # class rllab.distributions.DiagonalGaussian
    dist = policy.distribution

    # Note that we negate the objective, since most optimizers assume a
    # minimization problem
    surr = -TT.mean(dist.log_likelihood_sym(actions_var, dist_info_vars) * advantages_var)

    # Get the list of trainable parameters.
    params = policy.get_params(trainable=True)
    grads = theano.grad(surr, params)

    f_train = theano.function(
        inputs=[observations_var, actions_var, advantages_var],
        outputs=None,
        updates=adam(grads, params, learning_rate=learning_rate),
        allow_input_downcast=True
    )

    results = []
    for _ in range(n_itr):
        paths = []

        for _ in range(N):
            observations = []
            actions = []
            rewards = []

            observation = env.reset()

            for _ in range(T):
                # policy.get_action() returns a pair of values. The second one is a dictionary whose values contain
                # sufficient statistics for the action distribution. It should at least contain entries that would be
                # returned by calling policy.dist_info(), which is the non-symbolic analog of policy.dist_info_sym().
                # Storing these statistics is useful, e.g., when forming importance sampling ratios. In our case it
                # is not needed.
                action, _ = policy.get_action(observation)
                # Recall that the last entry of the tuple stores diagnostic information about the environment. In our
                # case it is not needed.
                next_observation, reward, terminal, _ = env.step(action)
                observations.append(observation)
                actions.append(action)
                rewards.append(reward)
                observation = next_observation
                if terminal:
                    # Finish rollout if terminal state reached
                    break

            # We need to compute the empirical return for each time step along the
            # trajectory
            path = dict(
                observations=np.array(observations),
                actions=np.array(actions),
                rewards=np.array(rewards),
            )
            path_baseline = baseline.predict(path)
            advantages = []
            returns = []
            return_so_far = 0
            for t in range(len(rewards) - 1, -1, -1):
                return_so_far = rewards[t] + discount * return_so_far
                returns.append(return_so_far)
                advantage = return_so_far - path_baseline[t]
                advantages.append(advantage)
            # The advantages are stored backwards in time, so we need to reverse them
            advantages = np.array(advantages[::-1])
            # And we need to do the same thing for the list of returns
            returns = np.array(returns[::-1])

            if "_ztrans" in mode:
                advantages = (advantages - np.mean(advantages)) / (np.std(advantages) + 1e-8)

            path["advantages"] = advantages
            path["returns"] = returns

            paths.append(path)

        baseline.fit(paths)

        observations = np.concatenate([p["observations"] for p in paths])
        actions = np.concatenate([p["actions"] for p in paths])
        advantages = np.concatenate([p["advantages"] for p in paths])

        if mode == 'batchavg':
            # in this case `advantages` up to here are just our good old returns, without baseline or z transformation.
            # now we subtract their mean across all episodes.
            advantages = advantages - np.mean(advantages)

        f_train(observations, actions, advantages)

        avgr = np.mean([sum(p["rewards"]) for p in paths])
        print('Average Return:', avgr)
        results.append(avgr)

    return results
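# Hedged usage sketch: comparing the baseline variants by calling doit() with the mode strings the function
# itself checks for, and plotting the per-iteration average returns (matplotlib assumed available).
import matplotlib.pyplot as plt

for mode in ["vanilla", "linbaseline", "batchavg", "linbaseline_ztrans"]:
    plt.plot(doit(mode), label=mode)
plt.xlabel("iteration")
plt.ylabel("average return")
plt.legend()
plt.savefig("baseline_comparison.png")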
algo = TRPO(
    env=env,
    policy=policy,
    baseline=baseline,
    batch_size=75000,
    max_path_length=env.horizon,
    n_itr=5,
    discount=0.995,
    step_size=0.01,
    gae_lambda=0.97,
)
algo.init_opt()

if not load_path_from_file:
    init_param = policy.get_param_values()
    init_param_obj = copy.deepcopy(policy.get_params())

    from rllab.sampler import parallel_sampler
    parallel_sampler.initialize(n_parallel=7)

    env.wrapped_env.env.env.perturb_MP = False

    algo.start_worker()

    pol_weights = []
    all_paths = []
    policy_params = []
    for i in range(50):
        init_param = policy.get_param_values()
        init_param_obj = copy.deepcopy(policy.get_params())
        policy_params.append(np.copy(init_param))
    baseline=baseline,
    batch_size=150000,
    max_path_length=env.horizon,
    n_itr=5,
    discount=0.995,
    step_size=0.01,
    gae_lambda=0.97,
)
algo.init_opt()

one_iter_grad = []
mps = []

if not load_path_from_file:
    init_param = policy.get_param_values()
    init_param_obj = copy.deepcopy(policy.get_params())

    from rllab.sampler import parallel_sampler
    parallel_sampler.initialize(n_parallel=7)

    env.wrapped_env.env.env.perturb_MP = False

    pol_weights = []
    all_paths = []
    policy_params = []

    init_param = np.copy(policy.get_param_values())
    algo.start_worker()
    for i in range(100):
        policy.set_param_values(init_param)

        ##### get data ###################
        for it in range(1):
    env=env,
    policy=policy,
    baseline=baseline,
    batch_size=10000,
    max_path_length=env.horizon,
    n_itr=5,
    discount=0.995,
    step_size=0.01,
    gae_lambda=0.97,
)
algo.init_opt()

if not load_path_from_file:
    init_param = policy.get_param_values()
    init_param_obj = copy.deepcopy(policy.get_params())

    from rllab.sampler import parallel_sampler
    parallel_sampler.initialize(n_parallel=2)

    env.wrapped_env.env.env.perturb_MP = False

    algo.start_worker()

    pol_weights = []
    all_paths = []
    policy_params = []
    for i in range(50):
        init_param = policy.get_param_values()
        init_param_obj = copy.deepcopy(policy.get_params())
        policy_params.append(np.copy(init_param))
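# The three excerpts above repeat the same TRPO setup and differ only in batch_size and n_parallel. A hedged
# sketch of a small helper that factors out the shared boilerplate (make_trpo is an illustrative name, not from
# the source); only calls that already appear in the excerpts are used.
def make_trpo(env, policy, baseline, batch_size, n_parallel):
    from rllab.sampler import parallel_sampler
    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=batch_size,
        max_path_length=env.horizon,
        n_itr=5,
        discount=0.995,
        step_size=0.01,
        gae_lambda=0.97,
    )
    algo.init_opt()
    parallel_sampler.initialize(n_parallel=n_parallel)
    algo.start_worker()
    return algo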