def run_task(*_):
    """Set up and launch symmetric PPO-Clip training on DartHopper-v1.

    Matches the rllab task-function convention: positional arguments from
    the experiment launcher are accepted and ignored.
    """
    environment = normalize(
        GymEnv("DartHopper-v1", record_log=False, record_video=False))

    # Gaussian MLP policy with two hidden layers of 128 and 64 units.
    hopper_policy = GaussianMLPPolicy(
        env_spec=environment.spec,
        hidden_sizes=(128, 64),
        net_mode=0,
    )
    print('trainable parameter size: ',
          hopper_policy.get_param_values(trainable=True).shape)

    value_baseline = LinearFeatureBaseline(env_spec=environment.spec,
                                           additional_dim=0)

    # Identity observation/action permutations are supplied and the symmetry
    # loss weight is 0.0, so the symmetry term is effectively disabled here.
    # (0.0001 presumably stands in for index "+0", since a plain 0 carries
    # no sign — TODO confirm against PPO_Clip_Sym.)
    trainer = PPO_Clip_Sym(
        env=environment,
        policy=hopper_policy,
        baseline=value_baseline,
        batch_size=20000,
        max_path_length=environment.horizon,
        n_itr=200,
        discount=0.99,
        step_size=0.02,
        gae_lambda=0.97,
        whole_paths=False,
        observation_permutation=np.array(
            [0.0001, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]),
        action_permutation=np.array([0.0001, 1, 2]),
        sym_loss_weight=0.0,
    )
    trainer.train()
def test_trpo_relu_nan():
    """One TRPO iteration on a tiny ReLU policy must leave parameters NaN-free."""
    dummy_env = DummyEnv()
    relu_policy = GaussianMLPPolicy(
        env_spec=dummy_env.spec,
        hidden_nonlinearity=naive_relu,
        hidden_sizes=(1,),
    )
    zero_baseline = ZeroBaseline(env_spec=dummy_env.spec)
    trainer = TRPO(
        env=dummy_env,
        policy=relu_policy,
        baseline=zero_baseline,
        n_itr=1,
        batch_size=1000,
        max_path_length=100,
        step_size=0.001,
    )
    trainer.train()
    # A single NaN anywhere poisons the sum, so this catches any NaN weight.
    param_total = np.sum(relu_policy.get_param_values())
    assert not np.isnan(param_total)
def test_trpo_deterministic_nan():
    """TRPO with a near-deterministic Gaussian policy must stay NaN-free."""
    dummy_env = DummyEnv()
    det_policy = GaussianMLPPolicy(env_spec=dummy_env.spec, hidden_sizes=(1,))
    # Push the single log-std parameter to log(1e-8), making the policy
    # almost deterministic — a stress case for the natural-gradient math.
    det_policy._l_log_std.param.set_value([np.float32(np.log(1e-8))])
    zero_baseline = ZeroBaseline(env_spec=dummy_env.spec)
    trainer = TRPO(
        env=dummy_env,
        policy=det_policy,
        baseline=zero_baseline,
        n_itr=10,
        batch_size=1000,
        max_path_length=100,
        step_size=0.01,
    )
    trainer.train()
    # NaN in any weight propagates through the sum.
    assert not np.isnan(np.sum(det_policy.get_param_values()))
def run_task(*_):
    """Train a symmetry-regularized TRPO agent on DartWalker3d-v1.

    Follows the rllab task-function convention: launcher-supplied
    positional arguments are accepted and ignored.
    """
    env = normalize(
        GymEnv("DartWalker3d-v1", record_log=False, record_video=False))
    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        # Mean network: two hidden layers of 128 and 64 units.
        hidden_sizes=(128, 64),
        net_mode=0,
    )
    # Optional warm start from a previously trained policy (disabled):
    #policy = joblib.load('data/local/experiment/walker3d_symmetry1_sd13_2alivebonus_2velrew_targetvelocity1_15frameskip_5en1absenergypenalty_2d_hardvelenforce_contsupport/policy.pkl')
    # increase policy std a bit for exploration
    #policy.get_params()[-1].set_value(policy.get_params()[-1].get_value() + 0.5)
    print('trainable parameter size: ',
          policy.get_param_values(trainable=True).shape)
    baseline = LinearFeatureBaseline(env_spec=env.spec, additional_dim=0)
    algo = TRPO_Symmetry(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=60000,
        max_path_length=env.horizon,
        n_itr=500,
        discount=0.99,
        step_size=0.02,
        gae_lambda=0.97,
        # Index maps encoding the robot's left/right mirror symmetry.
        # NOTE(review): a negative entry presumably means the mirrored
        # value is negated, and 0.0001 stands in for "+0" since a plain 0
        # cannot carry a sign — confirm against TRPO_Symmetry.
        observation_permutation=np.array(
            [0.0001,-1, 2,-3,-4, -5,-6,7, 14,-15,-16, 17, 18,-19,
             8,-9,-10, 11, 12,-13,
             20,21,-22, 23,-24,-25, -26,-27,28, 35,-36,-37, 38, 39,-40,
             29,-30,-31, 32, 33,-34, 42, 41]),
        #observation_permutation=np.array([0.0001, 1, 5,6,7, 2,3,4, 8,9,10, 14,15,16, 11,12,13]),
        #action_permutation=np.array([3,4,5, 0.00001,1,2]),
        action_permutation=np.array(
            [-0.0001, -1, 2, 9, -10, -11, 12, 13, -14, 3, -4, -5, 6, 7, -8]),
        sym_loss_weight=2.0,
        whole_paths=False,
    )
    algo.train()
def run_task(*_):
    """Train a symmetry-regularized TRPO agent on DartHumanWalker-v1.

    Follows the rllab task-function convention: launcher-supplied
    positional arguments are accepted and ignored.
    """
    env = normalize(
        GymEnv("DartHumanWalker-v1", record_log=False, record_video=False))
    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        # Mean network: two hidden layers of 128 and 64 units.
        hidden_sizes=(128, 64),
        net_mode=0,
    )
    # Optional warm start from a previously trained policy (disabled):
    #policy = joblib.load('data/local/experiment/humanwalker_symmetry1_sd11_1alivebonus_2velrew_targetvelocity1_15frameskip_5en1absenergypenalty_spd20002000/policy.pkl')
    # increase policy std a bit for exploration
    #policy.get_params()[-1].set_value(policy.get_params()[-1].get_value() + 0.5)
    print('trainable parameter size: ',
          policy.get_param_values(trainable=True).shape)
    baseline = LinearFeatureBaseline(env_spec=env.spec, additional_dim=0)
    algo = TRPO_Symmetry(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=50000,
        max_path_length=env.horizon,
        n_itr=1000,
        discount=0.99,
        step_size=0.02,
        gae_lambda=0.97,
        # Index maps encoding the humanoid's left/right mirror symmetry.
        # NOTE(review): a negative entry presumably means the mirrored
        # value is negated, and the small fractional entries (0.0001 /
        # 0.001) stand in for signed index 0 — confirm against
        # TRPO_Symmetry.
        observation_permutation=np.array(
            [0.0001,-1,2,-3,-4, -11,12,-13,14,15,16, -5,6,-7,8,9,10,
             -17,18, -19, -24,25,-26,27, -20,21,-22,23,
             28,29,-30,31,-32,-33, -40,41,-42,43,44,45,
             -34,35,-36,37,38,39, -46,47, -48, -53,54,-55,56,
             -49,50,-51,52, 58,57]),
        action_permutation=np.array(
            [-6,7,-8, 9, 10,11, -0.001,1,-2, 3, 4,5, -12,13, -14,
             -19,20,-21,22, -15,16,-17,18]),
        sym_loss_weight=1.0,
        action_reg_weight=0.0,
        whole_paths=False,
    )
    algo.train()
outputs = None, updates = sgd([eval_grad1, eval_grad2, eval_grad3, eval_grad4], params, learning_rate=learning_rate) ) f_baseline_g = theano.function( inputs = [observations_var, actions_var], outputs = all_der ) alla = [] for est in range(10): if (load_policy): policy.set_param_values(np.loadtxt('policy_novar.txt'), trainable=True) avg_return = np.zeros(n_itr) #np.savetxt("policy_novar.txt",snap_policy.get_param_values(trainable=True)) for j_int in range(n_itr): paths = parallel_sampler.sample_paths_on_trajectories(policy.get_param_values(),N,T,show_bar=False) #baseline.fit(paths) observations = [p["observations"] for p in paths] actions = [p["actions"] for p in paths] d_rewards = [p["rewards"] for p in paths] temp = list() for x in d_rewards: z=list() t=1 for y in x: z.append(y*t) t*=discount temp.append(np.array(z)) d_rewards=temp minT=T cum_num = []
def perform_evaluation(num_parallel, hidden_size, batch_size, pathlength, random_split, prioritized_split, adaptive_sample, initialize_epochs, grad_epochs, test_epochs, append, task_size, load_init_policy, load_split_data, alternate_update, accumulate_gradient, imbalance_sample, sample_ratio, split_percentages, env_name, seed, test_num=1, param_update_start=50, param_update_frequency=50, param_update_end=200, use_param_variance=0, param_variance_batch=10000, param_variance_sample=100, reverse_metric=False): reps = 1 learning_curves = [] kl_divergences = [] for i in range(len(split_percentages)): learning_curves.append([]) kl_divergences.append([]) performances = [] diretory = 'data/trained/gradient_temp/rl_split_' + append if not os.path.exists(diretory): os.makedirs(diretory) os.makedirs(diretory + '/policies') for testit in range(test_num): print('======== Start Test ', testit, ' ========') env = normalize(GymEnv(env_name, record_log=False, record_video=False)) dartenv = env._wrapped_env.env.env if env._wrapped_env.monitoring: dartenv = dartenv.env np.random.seed(testit * 3 + seed) random.seed(testit * 3 + seed) pre_training_learning_curve = [] policy = GaussianMLPPolicy( env_spec=env.spec, # The neural network policy should have two hidden layers, each with 32 hidden units. 
hidden_sizes=hidden_size, # append_dim=2, net_mode=0, ) baseline = LinearFeatureBaseline(env_spec=env.spec, additional_dim=0) if load_init_policy: policy = joblib.load(diretory + '/init_policy.pkl') if adaptive_sample: new_batch_size = int(batch_size / task_size) else: new_batch_size = batch_size algo = TRPO( # _MultiTask( env=env, policy=policy, baseline=baseline, batch_size=new_batch_size, max_path_length=pathlength, n_itr=5, discount=0.995, step_size=0.02, gae_lambda=0.97, whole_paths=False, # task_num=task_size, ) algo.init_opt() from rllab.sampler import parallel_sampler parallel_sampler.initialize(n_parallel=num_parallel) parallel_sampler.set_seed(0) algo.start_worker() if not load_init_policy: for i in range(initialize_epochs): print('------ Iter ', i, ' in Init Training --------') if adaptive_sample: paths = [] reward_paths = [] for t in range(task_size): paths += algo.sampler.obtain_samples(0, t) #reward_paths += algo.sampler.obtain_samples(0) elif imbalance_sample: paths = [] reward_paths = [] for t in range(task_size): algo.batch_size = batch_size * sample_ratio[t] task_path = algo.sampler.obtain_samples(0, t) paths += task_path if t == 0: reward_paths += task_path else: paths = algo.sampler.obtain_samples(0) samples_data = algo.sampler.process_samples(0, paths) opt_data = algo.optimize_policy(0, samples_data) pol_aft = (policy.get_param_values()) print(algo.mean_kl(samples_data)) print(dict(logger._tabular)['AverageReturn']) pre_training_learning_curve.append( dict(logger._tabular)['AverageReturn']) joblib.dump(policy, diretory + '/init_policy.pkl', compress=True) print('------- initial training complete ---------------') if not load_split_data: split_data = [] net_weights = [] net_weight_values = [] for i in range(grad_epochs): cur_param_val = np.copy(policy.get_param_values()) cur_param = copy.deepcopy(policy.get_params()) cp = [] for param in policy._mean_network.get_params(): cp.append(np.copy(param.get_value())) net_weights.append(cp) 
net_weight_values.append(np.copy(policy.get_param_values())) if adaptive_sample: paths = [] reward_paths = [] for t in range(task_size): paths += algo.sampler.obtain_samples(0, t) #reward_paths += algo.sampler.obtain_samples(0) elif imbalance_sample: paths = [] reward_paths = [] for t in range(task_size): algo.batch_size = batch_size * sample_ratio[t] task_path = algo.sampler.obtain_samples(0, t) paths += task_path if t == 0: reward_paths += task_path else: paths = algo.sampler.obtain_samples(0) split_data.append(paths) samples_data = algo.sampler.process_samples(0, paths) opt_data = algo.optimize_policy(0, samples_data) pre_training_learning_curve.append( dict(logger._tabular)['AverageReturn']) joblib.dump(split_data, diretory + '/split_data.pkl', compress=True) joblib.dump(net_weights, diretory + '/net_weights.pkl', compress=True) joblib.dump(net_weight_values, diretory + '/net_weight_values.pkl', compress=True) joblib.dump(pre_training_learning_curve, diretory + '/pretrain_learningcurve_' + str(testit) + '.pkl', compress=True) else: split_data = joblib.load(diretory + '/split_data.pkl') net_weights = joblib.load(diretory + '/net_weights.pkl') net_weight_values = joblib.load(diretory + '/net_weight_values.pkl') pre_training_learning_curve = joblib.load( diretory + '/pretrain_learningcurve_' + str(testit) + '.pkl') task_grads = [] variance_grads = [] for i in range(task_size): task_grads.append([]) for i in range(grad_epochs): policy.set_param_values(net_weight_values[i]) task_paths = [] for j in range(task_size): task_paths.append([]) for path in split_data[i]: taskid = path['env_infos']['state_index'][-1] task_paths[taskid].append(path) for j in range(task_size): samples_data = algo.sampler.process_samples( 0, task_paths[j], False) grad = get_gradient(algo, samples_data, False) task_grads[j].append(grad) if use_param_variance == 1 and i == grad_epochs - 1: for j in range(param_variance_sample): samples_data_ori = algo.sampler.process_samples( 0, split_data[i], 
False) samples_data = {} indices = np.arange(len(samples_data_ori['observations'])) np.random.shuffle(indices) samples_data["observations"] = samples_data_ori[ "observations"][indices[0:param_variance_batch]] samples_data["actions"] = samples_data_ori["actions"][ indices[0:param_variance_batch]] samples_data["rewards"] = samples_data_ori["rewards"][ indices[0:param_variance_batch]] samples_data["advantages"] = samples_data_ori[ "advantages"][indices[0:param_variance_batch]] samples_data["agent_infos"] = {} samples_data["agent_infos"]["log_std"] = samples_data_ori[ "agent_infos"]["log_std"][ indices[0:param_variance_batch]] samples_data["agent_infos"]["mean"] = samples_data_ori[ "agent_infos"]["mean"][indices[0:param_variance_batch]] grad = get_gradient(algo, samples_data, False) variance_grads.append(grad) algo.sampler.process_samples(0, split_data[i]) weight_variances = [] for i in range(len(task_grads[0][0]) - 1): weight_variances.append(np.zeros(task_grads[0][0][i].shape)) if use_param_variance == 1: for k in range(len(task_grads[0][0]) - 1): one_grad = [] for g in range(len(variance_grads)): one_grad.append(np.asarray(variance_grads[g][k])) weight_variances[k] += np.var(one_grad, axis=0) print('------- collected gradient info -------------') split_counts = [] for i in range(len(task_grads[0][0]) - 1): split_counts.append(np.zeros(task_grads[0][0][i].shape)) for i in range(len(task_grads[0])): for k in range(len(task_grads[0][i]) - 1): region_gradients = [] for region in range(len(task_grads)): region_gradients.append(task_grads[region][i][k]) region_gradients = np.array(region_gradients) if not random_split: split_counts[k] += np.var( region_gradients, axis=0 ) # * np.abs(net_weights[i][k])# + 100 * (len(task_grads[0][i])-k) elif prioritized_split: split_counts[k] += np.random.random( split_counts[k].shape) * (len(task_grads[0][i]) - k) else: split_counts[k] += np.random.random(split_counts[k].shape) for j in range(len(split_counts)): plt.figure() 
plt.title(policy._mean_network.get_params()[j].name) if len(split_counts[j].shape) == 2: plt.imshow(split_counts[j]) plt.colorbar() elif len(split_counts[j].shape) == 1: plt.plot(split_counts[j]) plt.savefig(diretory + '/' + policy._mean_network.get_params()[j].name + '.png') if use_param_variance: plt.figure() plt.title(policy._mean_network.get_params()[j].name) if len(weight_variances[j].shape) == 2: plt.imshow(weight_variances[j]) plt.colorbar() elif len(weight_variances[j].shape) == 1: plt.plot(weight_variances[j]) plt.savefig(diretory + '/' + policy._mean_network.get_params()[j].name + '_variances.png') algo.shutdown_worker() # organize the metric into each edges and sort them split_metrics = [] metrics_list = [] variance_list = [] for k in range(len(task_grads[0][0]) - 1): for index, value in np.ndenumerate(split_counts[k]): split_metrics.append( [k, index, value, weight_variances[k][index]]) metrics_list.append(value) variance_list.append(weight_variances[k][index]) if use_param_variance == 0: split_metrics.sort(key=lambda x: x[2], reverse=True) else: split_metrics.sort(key=lambda x: x[3], reverse=True) # test the effect of splitting total_param_size = len(policy._mean_network.get_param_values()) pred_list = [] # use the optimized network init_param_value = np.copy(policy.get_param_values()) for split_id, split_percentage in enumerate(split_percentages): split_param_size = split_percentage * total_param_size masks = [] for k in range(len(task_grads[0][0]) - 1): masks.append(np.zeros(split_counts[k].shape)) if split_percentage <= 1.0: for i in range(int(split_param_size)): masks[split_metrics[i][0]][split_metrics[i][1]] = 1 else: threshold = np.mean(metrics_list) + np.std(metrics_list) print('threashold,', threshold) for i in range(len(split_metrics)): if split_metrics[i][2] < threshold: break else: masks[split_metrics[i][0]][split_metrics[i][1]] = 1 mask_split_flat = np.array([]) for k in range(int((len(task_grads[0][0]) - 1) / 2)): for j in 
range(task_size): mask_split_flat = np.concatenate([ mask_split_flat, np.array(masks[k * 2]).flatten(), np.array(masks[k * 2 + 1]).flatten() ]) mask_share_flat = np.ones(len(mask_split_flat)) mask_share_flat -= mask_split_flat if np.abs(split_percentage - 1.0) < 0.0001: mask_split_flat = np.concatenate( [mask_split_flat, np.ones(dartenv.act_dim * task_size)]) mask_share_flat = np.concatenate( [mask_share_flat, np.zeros(dartenv.act_dim * task_size)]) else: mask_split_flat = np.concatenate( [mask_split_flat, np.zeros(dartenv.act_dim)]) mask_share_flat = np.concatenate( [mask_share_flat, np.ones(dartenv.act_dim)]) policy.set_param_values(init_param_value) if split_param_size != 0: if dartenv.avg_div != task_size: dartenv.avg_div = task_size dartenv.obs_dim += dartenv.avg_div high = np.inf * np.ones(dartenv.obs_dim) low = -high dartenv.observation_space = spaces.Box(low, high) env._wrapped_env._observation_space = rllab.envs.gym_env.convert_gym_space( dartenv.observation_space) env.spec = rllab.envs.env_spec.EnvSpec( observation_space=env.observation_space, action_space=env.action_space, ) split_policy = GaussianMLPPolicy( env_spec=env.spec, # The neural network policy should have two hidden layers, each with 32 hidden units. 
hidden_sizes=hidden_size, # append_dim=2, net_mode=8, split_num=task_size, split_masks=masks, split_init_net=policy, split_std=np.abs(split_percentage - 1.0) < 0.0001, ) else: split_policy = copy.deepcopy(policy) if split_param_size == 0: baseline_add = 0 else: baseline_add = task_size # use 0 for now, though task_size should in theory improve performance more split_baseline = LinearFeatureBaseline(env_spec=env.spec, additional_dim=baseline_add) new_batch_size = batch_size if (split_param_size != 0 and alternate_update) or adaptive_sample: new_batch_size = int(batch_size / task_size) split_algo = TRPO( # _MultiTask( env=env, policy=split_policy, baseline=split_baseline, batch_size=new_batch_size, max_path_length=pathlength, n_itr=5, discount=0.995, step_size=0.02, gae_lambda=0.97, whole_paths=False, # task_num=task_size, ) split_algo.init_opt() parallel_sampler.initialize(n_parallel=num_parallel) parallel_sampler.set_seed(0) split_algo.start_worker() if split_param_size != 0: parallel_sampler.update_env_params({ 'avg_div': dartenv.avg_div, 'obs_dim': dartenv.obs_dim, 'observation_space': dartenv.observation_space }) print('Network parameter size: ', total_param_size, len(split_policy.get_param_values())) split_init_param = np.copy(split_policy.get_param_values()) avg_error = 0.0 avg_learning_curve = [] for rep in range(int(reps)): split_policy.set_param_values(split_init_param) learning_curve = [] kl_div_curve = [] for i in range(test_epochs): # if not split if split_param_size == 0: paths, _ = get_samples(split_algo, task_size, adaptive_sample, imbalance_sample, batch_size, sample_ratio) # sanity check samp_num = 0 for p in paths: samp_num += len(p['observations']) print('samp_num: ', samp_num, adaptive_sample, imbalance_sample) samples_data = split_algo.sampler.process_samples( 0, paths) opt_data = split_algo.optimize_policy(0, samples_data) if imbalance_sample: reward = 0 for path in reward_paths: reward += np.sum(path["rewards"]) reward /= len(reward_paths) 
else: reward = float( (dict(logger._tabular)['AverageReturn'])) kl_div_curve.append(split_algo.mean_kl(samples_data)) print('reward: ', reward) print(split_algo.mean_kl(samples_data)) elif alternate_update: reward = 0 total_traj = 0 task_rewards = [] for j in range(task_size): paths = split_algo.sampler.obtain_samples(0, j) # split_algo.sampler.process_samples(0, paths) samples_data = split_algo.sampler.process_samples( 0, paths) opt_data = split_algo.optimize_policy( 0, samples_data) reward += float((dict( logger._tabular)['AverageReturn'])) * float( (dict(logger._tabular)['NumTrajs'])) total_traj += float( (dict(logger._tabular)['NumTrajs'])) task_rewards.append( dict(logger._tabular)['AverageReturn']) reward /= total_traj print('reward for different tasks: ', task_rewards, reward) elif accumulate_gradient: paths, _ = get_samples(split_algo, task_size, adaptive_sample, imbalance_sample, batch_size, sample_ratio) task_paths = [] task_rewards = [] for j in range(task_size): task_paths.append([]) task_rewards.append([]) for path in paths: taskid = path['env_infos']['state_index'][-1] task_paths[taskid].append(path) task_rewards[taskid].append(np.sum( path['rewards'])) pre_opt_parameter = np.copy( split_policy.get_param_values()) # compute the split gradient first split_policy.set_param_values(pre_opt_parameter) accum_grad = np.zeros(pre_opt_parameter.shape) processed_task_data = [] for j in range(task_size): if len(task_paths[j]) == 0: processed_task_data.append([]) continue split_policy.set_param_values(pre_opt_parameter) # split_algo.sampler.process_samples(0, task_paths[j]) samples_data = split_algo.sampler.process_samples( 0, task_paths[j], False) processed_task_data.append(samples_data) #split_algo.optimize_policy(0, samples_data) # if j == 1: accum_grad += split_policy.get_param_values( ) - pre_opt_parameter # sanity check samp_num = 0 for p in paths: samp_num += len(p['observations']) print('samp_num: ', samp_num) # compute the gradient together 
split_policy.set_param_values(pre_opt_parameter) all_data = split_algo.sampler.process_samples(0, paths) if imbalance_sample: reward = 0 for path in reward_paths: reward += np.sum(path["rewards"]) reward /= len(reward_paths) else: reward = float( (dict(logger._tabular)['AverageReturn'])) split_algo.optimize_policy(0, all_data) all_data_grad = split_policy.get_param_values( ) - pre_opt_parameter # do a line search to project the udpate onto the constraint manifold sum_grad = all_data_grad # * mask_split_flat + all_data_grad * mask_share_flat ls_steps = [] loss_before = split_algo.loss(all_data) for s in range(50): ls_steps.append(0.97**s) for step in ls_steps: split_policy.set_param_values(pre_opt_parameter + sum_grad * step) if split_algo.mean_kl( all_data )[0] < split_algo.step_size: # and split_algo.loss(all_data)[0] < loss_before[0]: break # step=1 split_policy.set_param_values(pre_opt_parameter + sum_grad * step) for j in range(task_size): task_rewards[j] = np.mean(task_rewards[j]) print('reward for different tasks: ', task_rewards, reward) print('mean kl: ', split_algo.mean_kl(all_data), ' step size: ', step) task_mean_kls = [] for j in range(task_size): if len(processed_task_data[j]) == 0: task_mean_kls.append(0) else: task_mean_kls.append( split_algo.mean_kl( processed_task_data[j])[0]) print('mean kl for different tasks: ', task_mean_kls) kl_div_curve.append( np.concatenate( [split_algo.mean_kl(all_data), task_mean_kls])) else: paths = split_algo.sampler.obtain_samples(0) reward = float( (dict(logger._tabular)['AverageReturn'])) task_paths = [] task_rewards = [] for j in range(task_size): task_paths.append([]) task_rewards.append([]) for path in paths: taskid = path['env_infos']['state_index'][-1] task_paths[taskid].append(path) task_rewards[taskid].append(np.sum( path['rewards'])) pre_opt_parameter = np.copy( split_policy.get_param_values()) # optimize the shared part # split_algo.sampler.process_samples(0, paths) samples_data = 
split_algo.sampler.process_samples( 0, paths) for layer in split_policy._mean_network._layers: for param in layer.get_params(): if 'split' in param.name: layer.params[param].remove('trainable') split_policy._cached_params = {} split_policy._cached_param_dtypes = {} split_policy._cached_param_shapes = {} split_algo.init_opt() print( 'Optimizing shared parameter size: ', len(split_policy.get_param_values(trainable=True))) split_algo.optimize_policy(0, samples_data) # optimize the tasks for layer in split_policy._mean_network._layers: for param in layer.get_params(): if 'split' in param.name: layer.params[param].add('trainable') if 'share' in param.name: layer.params[param].remove('trainable') # shuffle the optimization order opt_order = np.arange(task_size) np.random.shuffle(opt_order) split_policy._cached_params = {} split_policy._cached_param_dtypes = {} split_policy._cached_param_shapes = {} split_algo.init_opt() for taskid in opt_order: # split_algo.sampler.process_samples(0, task_paths[taskid]) samples_data = split_algo.sampler.process_samples( 0, task_paths[taskid]) print( 'Optimizing parameter size: ', len( split_policy.get_param_values( trainable=True))) split_algo.optimize_policy(0, samples_data) for layer in split_policy._mean_network._layers: for param in layer.get_params(): if 'share' in param.name: layer.params[param].add('trainable') for j in range(task_size): task_rewards[j] = np.mean(task_rewards[j]) print('reward for different tasks: ', task_rewards, reward) learning_curve.append(reward) if (i + initialize_epochs + grad_epochs) % param_update_frequency == 0 and ( i + initialize_epochs + grad_epochs) < param_update_end and ( i + initialize_epochs + grad_epochs) > param_update_start: print("Updating model parameters...") parallel_sampler.update_env_params( {'task_expand_flag': True}) print('============= Finished ', split_percentage, ' Rep ', rep, ' test ', i, ' ================') print(diretory) joblib.dump(split_policy, diretory + '/policies/policy_' 
+ str(rep) + '_' + str(i) + '_' + str(split_percentage) + '.pkl', compress=True) avg_learning_curve.append(learning_curve) kl_divergences[split_id].append(kl_div_curve) joblib.dump(split_policy, diretory + '/policies/final_policy_' + str(split_percentage) + '.pkl', compress=True) avg_error += float(reward) pred_list.append(avg_error / reps) print(split_percentage, avg_error / reps) split_algo.shutdown_worker() print(avg_learning_curve) avg_learning_curve = np.mean(avg_learning_curve, axis=0) learning_curves[split_id].append(avg_learning_curve) # output the learning curves so far joblib.dump(learning_curves, diretory + '/learning_curve.pkl', compress=True) avg_learning_curve = [] for lc in range(len(learning_curves)): avg_learning_curve.append(np.mean(learning_curves[lc], axis=0)) plt.figure() for lc in range(len(learning_curves)): plt.plot(avg_learning_curve[lc], label=str(split_percentages[lc])) plt.legend(bbox_to_anchor=(0.3, 0.3), bbox_transform=plt.gcf().transFigure, numpoints=1) plt.savefig(diretory + '/split_learning_curves.png') if len(kl_divergences[0]) > 0: #print('kldiv:', kl_divergences) avg_kl_div = [] for i in range(len(kl_divergences)): if len(kl_divergences[i]) > 0: avg_kl_div.append(np.mean(kl_divergences[i], axis=0)) #print(avg_kl_div) joblib.dump(avg_kl_div, diretory + '/kl_divs.pkl', compress=True) for i in range(len(avg_kl_div)): one_perc_kl_div = np.array(avg_kl_div[i]) #print(i, one_perc_kl_div) plt.figure() for j in range(len(one_perc_kl_div[0])): append = 'task%d' % j if j == 0: append = 'all' plt.plot(one_perc_kl_div[:, j], label=str(split_percentages[i]) + append, alpha=0.3) plt.legend(bbox_to_anchor=(0.3, 0.3), bbox_transform=plt.gcf().transFigure, numpoints=1) plt.savefig(diretory + '/kl_div_%s.png' % str(split_percentages[i])) performances.append(pred_list) np.savetxt(diretory + '/performance.txt', performances) plt.figure() plt.plot(split_percentages, np.mean(performances, axis=0)) plt.savefig(diretory + '/split_performance.png') 
joblib.dump(learning_curves, diretory + '/learning_curve.pkl', compress=True) avg_learning_curve = [] for i in range(len(learning_curves)): avg_learning_curve.append(np.mean(learning_curves[i], axis=0)) plt.figure() for i in range(len(split_percentages)): plt.plot(avg_learning_curve[i], label=str(split_percentages[i])) plt.legend(bbox_to_anchor=(0.3, 0.3), bbox_transform=plt.gcf().transFigure, numpoints=1) plt.savefig(diretory + '/split_learning_curves.png') #np.savetxt(diretory + '/learning_curves.txt', avg_learning_curve) if len(kl_divergences[0]) > 0: avg_kl_div = [] for i in range(len(kl_divergences)): avg_kl_div.append(np.mean(kl_divergences[i], axis=0)) joblib.dump(avg_kl_div, diretory + '/kl_divs.pkl', compress=True) for i in range(len(avg_kl_div)): one_perc_kl_div = np.array(avg_kl_div[i]) plt.figure() for j in range(len(one_perc_kl_div[0])): append = 'task%d' % j if j == 0: append = 'all' plt.plot(one_perc_kl_div[:, j], label=str(split_percentages[i]) + append, alpha=0.3) plt.legend(bbox_to_anchor=(0.3, 0.3), bbox_transform=plt.gcf().transFigure, numpoints=1) plt.savefig(diretory + '/kl_div_%s.png' % str(split_percentages[i])) plt.close('all') print(diretory)
#task_num=task_size, ) algo.init_opt() from rllab.sampler import parallel_sampler parallel_sampler.initialize(n_parallel=num_parallel) parallel_sampler.set_seed(0) algo.start_worker() for i in range(initialize_epochs): print('------ Iter ', i, ' in Init Training ', diretory, '--------') paths = algo.sampler.obtain_samples(0) samples_data = algo.sampler.process_samples(0, paths) opt_data = algo.optimize_policy(0, samples_data) pol_aft = (policy.get_param_values()) print(algo.mean_kl(samples_data)) print(dict(logger._tabular)['AverageReturn']) data_perc_list = [0.999, 0.7, 0.5, 0.3, 0.1, 0.05, 0.01] testpaths = algo.sampler.obtain_samples(0) for perc in data_perc_list: sampnum = int(batch_size * perc) grads = [] for i in range(var_test_time): idx = np.random.choice(len(testpaths), len(testpaths)) algo.sampler.process_samples(0, testpaths) selected_paths = [] current_sample_num = 0 for id in idx:
extra_dims=1 ) d_rewards_var = TT.vector('d_rewards') importance_weights_var = TT.vector('importance_weight') # policy.dist_info_sym returns a dictionary, whose values are symbolic expressions for quantities related to the # distribution of the actions. For a Gaussian policy, it contains the mean and (log) standard deviation. dist_info_vars = policy.dist_info_sym(observations_var) snap_dist_info_vars = snap_policy.dist_info_sym(observations_var) surr = TT.sum(- dist.log_likelihood_sym_1traj_GPOMDP(actions_var, dist_info_vars) * d_rewards_var) params = policy.get_params(trainable=True) snap_params = snap_policy.get_params(trainable=True) # save initial parameters policy_parameters = policy.get_param_values(trainable=True) importance_weights = dist.likelihood_ratio_sym_1traj_GPOMDP(actions_var, dist_info_vars, snap_dist_info_vars) grad = theano.grad(surr, params) eval_grad1 = TT.matrix('eval_grad0',dtype=grad[0].dtype) eval_grad2 = TT.vector('eval_grad1',dtype=grad[1].dtype) eval_grad3 = TT.matrix('eval_grad3',dtype=grad[2].dtype) eval_grad4 = TT.vector('eval_grad4',dtype=grad[3].dtype) eval_grad5 = TT.matrix('eval_grad5',dtype=grad[4].dtype) eval_grad6 = TT.vector('eval_grad6',dtype=grad[5].dtype) surr_on1 = TT.sum(- dist.log_likelihood_sym_1traj_GPOMDP(actions_var, dist_info_vars) * d_rewards_var) #surr_on2 = TT.sum(- snap_dist.log_likelihood_sym_1traj_GPOMDP(actions_var, snap_dist_info_vars) * d_rewards_var ) #grad_SVRG =[sum(x) for x in zip([eval_grad1, eval_grad2, eval_grad3, eval_grad4], # theano.grad(surr_on1, params),
if not load_init_policy: for i in range(initialize_epochs): paths = algo.sampler.obtain_samples(0) # if not split samples_data = algo.sampler.process_samples(0, paths) opt_data = algo.optimize_policy(0, samples_data) print(dict(logger._tabular)['AverageReturn']) joblib.dump(policy, 'data/trained/gradient_temp/rl_split_' + append + '/init_policy.pkl', compress=True) print('------- initial training complete ---------------') init_param_value = np.copy(policy.get_param_values()) task_grads = [] for i in range(2): task_grads.append([]) if not load_split_data: split_data = [] net_weights = [] for i in range(grad_epochs): cur_param_val = np.copy(policy.get_param_values()) cur_param = copy.deepcopy(policy.get_params()) cp = [] for param in policy._mean_network.get_params(): cp.append(np.copy(param.get_value()))
policy = GaussianMLPPolicy( env_spec=env.spec, # The neural network policy should have two hidden layers, each with 32 hidden units. hidden_sizes=(10, 5), # append_dim=2, net_mode=0, ) policy = joblib.load( 'data/local/experiment/hopper_footstrength_rest1_sd4_boundedrandwalk_2000finish/policy_1500.pkl' ) baseline = LinearFeatureBaseline(env_spec=env.spec, additional_dim=0) init_param = policy.get_param_values() ###### get baseline gradient ################################### '''algobase = TRPO( env=env, policy=policy, baseline=baseline, batch_size=2000000, max_path_length=env.horizon, n_itr=5, discount=0.995, step_size=0.01, gae_lambda=0.97, )
alla = [] alla2 = [] alla3 = [] for k in range(10): alla4 = [] if (load_policy): snap_policy.set_param_values(np.loadtxt('policy_novar.txt'), trainable=True) policy.set_param_values(np.loadtxt('policy_novar.txt'), trainable=True) avg_return = np.zeros(s_tot) #np.savetxt("policy_novar.txt",snap_policy.get_param_values(trainable=True)) j = 0 while j < s_tot - N: paths = parallel_sampler.sample_paths_on_trajectories( snap_policy.get_param_values(), N, T, show_bar=False) #baseline.fit(paths) j += N observations = [p["observations"] for p in paths] actions = [p["actions"] for p in paths] d_rewards = [p["rewards"] for p in paths] temp = list() for x in d_rewards: z = list() t = 1 for y in x: z.append(y * t) t *= discount temp.append(np.array(z)) d_rewards = temp s_g = f_train(observations[0], actions[0], d_rewards[0])
def train(env, policy, policy_init, num_episodes, episode_cap, horizon, **alg_args):
    """Build an rllab environment/policy pair and run TRPO on it.

    Args:
        env: name of the rllab environment class to instantiate.
        policy: policy architecture selector, 'linear' or 'simple-nn'.
        policy_init: output-layer weight initializer, 'zeros' or 'normal'.
        num_episodes: episodes per batch (also used as the episode cap).
        episode_cap: if truthy, bound each batch to `num_episodes` episodes.
        horizon: maximum path length per episode.
        **alg_args: extra keyword arguments forwarded to TRPO.

    Raises:
        Exception: on an unrecognized `policy` or `policy_init` value.
    """
    # Getting the environment
    env_class = rllab_env_from_name(env)
    env = normalize(env_class())

    # Policy initialization
    if policy_init == 'zeros':
        initializer = LI.Constant(0)
    elif policy_init == 'normal':
        initializer = LI.Normal()
    else:
        raise Exception('Unrecognized policy initialization.')

    # Setting the policy type: no hidden layers for 'linear',
    # one 16-unit hidden layer for 'simple-nn'.
    if policy == 'linear':
        hidden_sizes = tuple()
    elif policy == 'simple-nn':
        hidden_sizes = [16]
    else:
        raise Exception('NOT IMPLEMENTED.')

    # Creating the policy: an explicit mean network so the output-layer
    # initializer can be controlled.
    obs_dim = env.observation_space.flat_dim
    action_dim = env.action_space.flat_dim
    mean_network = MLP(
        input_shape=(obs_dim,),
        output_dim=action_dim,
        hidden_sizes=hidden_sizes,
        hidden_nonlinearity=NL.tanh,
        output_nonlinearity=None,
        output_b_init=None,
        output_W_init=initializer,
    )
    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=hidden_sizes,
        mean_network=mean_network,
        log_weights=True,
    )

    # Creating baseline
    baseline = LinearFeatureBaseline(env_spec=env.spec)

    # Adding max_episodes constraint. If -1, this is unbounded
    if episode_cap:
        alg_args['max_episodes'] = num_episodes

    # Run algorithm
    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=horizon * num_episodes,
        whole_paths=True,
        max_path_length=horizon,
        **alg_args
    )
    algo.train()
    print('----- ENDING ------')
    print(policy.get_param_values())
# NOTE(review): whitespace-mangled fragment; indentation reconstructed.
# Symbolic (Theano) setup for an SVRG-style policy-gradient estimator.
d_rewards_var = TT.vector('d_rewards')
importance_weights_var = TT.vector('importance_weight')
# policy.dist_info_sym returns a dictionary, whose values are symbolic expressions
# for quantities related to the distribution of the actions. For a Gaussian
# policy, it contains the mean and (log) standard deviation.
dist_info_vars = policy.dist_info_sym(observations_var)
snap_dist_info_vars = snap_policy.dist_info_sym(observations_var)
# Negated GPOMDP surrogate (minimization form) for the current policy.
surr = TT.sum(-dist.log_likelihood_sym_1traj_GPOMDP(actions_var, dist_info_vars) * d_rewards_var)
params = policy.get_params(trainable=True)
snap_params = snap_policy.get_params(trainable=True)
# save initial parameters
policy_parameters = policy.get_param_values(trainable=True)
# Per-trajectory likelihood ratio between current and snapshot policies.
importance_weights = dist.likelihood_ratio_sym_1traj_GPOMDP(actions_var, dist_info_vars, snap_dist_info_vars)
grad = theano.grad(surr, params)
# Placeholders matching each parameter gradient's shape/dtype.
# NOTE(review): the Theano *names* ('eval_grad0', 'eval_grad1') do not match
# the Python variable names — preserved from the original; verify intent.
eval_grad1 = TT.matrix('eval_grad0', dtype=grad[0].dtype)
eval_grad2 = TT.vector('eval_grad1', dtype=grad[1].dtype)
eval_grad3 = TT.col('eval_grad3', dtype=grad[2].dtype)
eval_grad4 = TT.vector('eval_grad4', dtype=grad[3].dtype)
surr_on1 = TT.sum(-dist.log_likelihood_sym_1traj_GPOMDP(actions_var, dist_info_vars) * d_rewards_var)
#surr_on2 = TT.sum(- snap_dist.log_likelihood_sym_1traj_GPOMDP(actions_var, snap_dist_info_vars) * d_rewards_var )
#grad_SVRG =[sum(x) for x in zip([eval_grad1, eval_grad2, eval_grad3, eval_grad4],
# NOTE(review): whitespace-mangled fragment left byte-identical — it begins
# mid-statement inside an 'if/elif' chain whose opening branch is outside this
# view, so the indentation cannot be reconstructed safely. Logically it:
# samples paths per task (optionally rebalanced via sample_ratio), runs one
# optimize_policy step, logs KL/AverageReturn, dumps the policy to
# '<diretory>/init_policy.pkl' (sic — 'diretory' is the variable's real name),
# snapshots the parameters, and allocates one gradient bucket per task.
paths += algo.sampler.obtain_samples(0, t) reward_paths += algo.sampler.obtain_samples(0) elif imbalance_sample: paths = [] reward_paths = [] for t in range(task_size): algo.batch_size = batch_size * sample_ratio[t] task_path = algo.sampler.obtain_samples(0, t) paths += task_path if t == 0: reward_paths += task_path else: paths = algo.sampler.obtain_samples(0) samples_data = algo.sampler.process_samples(0, paths) opt_data = algo.optimize_policy(0, samples_data) pol_aft = (policy.get_param_values()) print(algo.mean_kl(samples_data)) print(dict(logger._tabular)['AverageReturn']) joblib.dump(policy, diretory + '/init_policy.pkl', compress=True) print('------- initial training complete ---------------') init_param_value = np.copy(policy.get_param_values()) task_grads = [] for i in range(task_size): task_grads.append([]) if not load_split_data: split_data = []
# NOTE(review): whitespace-mangled fragment; indentation reconstructed —
# verify the extent of the 'if not load_path_from_file' block.
algo = TRPO(
    env=env,
    policy=policy,
    baseline=baseline,
    batch_size=75000,
    max_path_length=env.horizon,
    n_itr=5,
    discount=0.995,
    step_size=0.01,
    gae_lambda=0.97,
)
algo.init_opt()

if not load_path_from_file:
    # Remember the starting parameters (values and parameter objects).
    init_param = policy.get_param_values()
    init_param_obj = copy.deepcopy(policy.get_params())
    from rllab.sampler import parallel_sampler
    parallel_sampler.initialize(n_parallel=7)
    # Disable model-parameter perturbation while collecting reference paths.
    env.wrapped_env.env.env.perturb_MP = False
    algo.start_worker()
    pol_weights = []
    all_paths = []
    policy_params = []
    # NOTE(review): the body of this collection loop continues beyond the
    # visible fragment.
    for i in range(50):
        init_param = policy.get_param_values()
        init_param_obj = copy.deepcopy(policy.get_params())
# NOTE(review): whitespace-mangled fragment; indentation reconstructed.
# Symbolic (Theano) setup for a single-trajectory GPOMDP gradient estimator.
actions_var = env.action_space.new_tensor_variable('actions', extra_dims=1)
d_rewards_var = TT.vector('d_rewards')
# policy.dist_info_sym returns a dictionary, whose values are symbolic
# expressions for quantities related to the distribution of the actions.
# For a Gaussian policy, it contains the mean and (log) standard deviation.
dist_info_vars = policy.dist_info_sym(observations_var)
# negate the objective for minimization problem
surr = TT.sum(-dist.log_likelihood_sym_1traj_GPOMDP(actions_var, dist_info_vars) * d_rewards_var)
# get the list of trainable parameters
params = policy.get_params(trainable=True)
# save initial parameters
policy_parameters = policy.get_param_values(trainable=True)
grad = theano.grad(surr, params)
# Placeholders matching each parameter gradient's shape/dtype.
# NOTE(review): the Theano names ('eval_grad0', 'eval_grad1') do not match the
# Python variable names — preserved from the original.
eval_grad1 = TT.matrix('eval_grad0', dtype=grad[0].dtype)  # (4, 8) hiddenlayer.w = LI.GlorotUniform() aka Xavier Uniform Init
eval_grad2 = TT.vector('eval_grad1', dtype=grad[1].dtype)  # (8, ) hiddenlayer.b = LI.Constant(0.),
eval_grad3 = TT.col('eval_grad3', dtype=grad[2].dtype)  # (8, 1) output.w = LI.GlorotUniform(),
eval_grad4 = TT.vector('eval_grad4', dtype=grad[3].dtype)  # (1, ) output.b = LI.Constant(0.),
# NOTE(review): whitespace-mangled fragment; indentation reconstructed by
# analogy with the sibling SVRG run loops in this file.
for k in range(10):
    if (load_policy):
        # Start every run from the same saved parameter vector.
        snap_policy.set_param_values(np.loadtxt('policy_novar.txt'), trainable=True)
        policy.set_param_values(np.loadtxt('policy_novar.txt'), trainable=True)
    # Per-run bookkeeping containers.
    avg_return = list()
    n_sub_iter = []
    rewards_sub_iter = []
    rewards_snapshot = []
    importance_weights = []
    variance_svrg = []
    variance_sgd = []
    #np.savetxt("policy_novar.txt",snap_policy.get_param_values(trainable=True))
    j = 0
    # Consume the trajectory budget s_tot in snapshot batches of N.
    while j < s_tot - N:
        paths = parallel_sampler.sample_paths_on_trajectories(snap_policy.get_param_values(), N, T, show_bar=False)
        #baseline.fit(paths)
        paths = paths[:N]
        j += N
        observations = [p["observations"] for p in paths]
        actions = [p["actions"] for p in paths]
        d_rewards = [p["rewards"] for p in paths]
        # Discount per-step rewards in place: r_t -> r_t * discount**t.
        temp = list()
        for x in d_rewards:
            z = list()
            t = 1
            for y in x:
                z.append(y * t)
                t *= discount
            temp.append(np.array(z))
        d_rewards = temp
# NOTE(review): whitespace-mangled fragment; indentation reconstructed.
for k in range(10):
    print("Run #{}".format(k))
    # load policy
    if learn_std:
        file_name = 'roboschool_inv_pendulum_policy' + '.txt'
    else:
        file_name = 'roboschool_inv_pendulum_policy_novar' + '.txt'
    if load_policy:
        policy.set_param_values(np.loadtxt('save_model/' + file_name), trainable=True)
    else:
        # First run: persist the fresh parameters, then make every
        # subsequent run reload them so all runs share one init.
        np.savetxt("save_model/" + file_name, policy.get_param_values(trainable=True))
        load_policy = True
    # intial setup
    avg_return = list()
    eps_list = []
    max_rewards = -np.inf
    num_traj = 0
    # loop till done
    while num_traj <= max_num_traj:
        # sample snapshot batch of trajectories
        paths = parallel_sampler.sample_paths_on_trajectories(policy.get_param_values(), snap_bs, traj_length, show_bar=False)
        paths = paths[:snap_bs]
# NOTE(review): whitespace-mangled fragment left byte-identical — it begins
# mid-constructor-call (keyword arguments of what is presumably 'algo = TRPO('
# whose opening is outside this view), so the head cannot be reconstructed
# safely. Logically it: finishes building a TRPO instance (batch_size=150000,
# discount=0.995, step_size=0.01, gae_lambda=0.97), calls init_opt(), and —
# when not loading cached paths — snapshots the initial parameters, starts a
# 7-worker parallel sampler with model-parameter perturbation disabled, and
# enters a 100-iteration loop that resets the policy to 'init_param' before
# each data-collection pass (loop body continues beyond this fragment).
policy=policy, baseline=baseline, batch_size=150000, max_path_length=env.horizon, n_itr=5, discount=0.995, step_size=0.01, gae_lambda=0.97, ) algo.init_opt() one_iter_grad = [] mps = [] if not load_path_from_file: init_param = policy.get_param_values() init_param_obj = copy.deepcopy(policy.get_params()) from rllab.sampler import parallel_sampler parallel_sampler.initialize(n_parallel=7) env.wrapped_env.env.env.perturb_MP = False pol_weights = [] all_paths = [] policy_params = [] init_param = np.copy(policy.get_param_values()) algo.start_worker() for i in range(100): policy.set_param_values(init_param) ##### get data ###################
# NOTE(review): whitespace-mangled fragment; indentation reconstructed.
# Cross-run result containers (keyed per run elsewhere in the script).
variance_sgd_data = {}
importance_weights_data = {}
rewards_snapshot_data = {}
rewards_subiter_data = {}
n_sub_iter_data = {}
diff_lr_data = {}
alfa_t_data = {}
parallel_sampler.initialize(10)
for k in range(10):
    if (load_policy):
        # Start every run from the same saved parameter vector.
        snap_policy.set_param_values(np.loadtxt('policy_swimmer.txt'), trainable=True)
        policy.set_param_values(np.loadtxt('policy_swimmer.txt'), trainable=True)
    else:
        # Keep the current policy in sync with the snapshot policy.
        policy.set_param_values(snap_policy.get_param_values(trainable=True), trainable=True)
    avg_return = []
    #np.savetxt("policy_novar.txt",snap_policy.get_param_values(trainable=True))
    # Per-run bookkeeping containers.
    n_sub_iter = []
    rewards_sub_iter = []
    rewards_snapshot = []
    importance_weights = []
    variance_svrg = []
    variance_sgd = []
    diff_lr = []
    alfa_t = []
    j = 0
    # Consume the trajectory budget s_tot in snapshot batches of N.
    # NOTE(review): the loop body continues beyond the visible fragment.
    while j < s_tot - N:
        paths = parallel_sampler.sample_paths_on_trajectories(policy.get_param_values(), N, T, show_bar=False)
# NOTE(review): whitespace-mangled fragment; indentation reconstructed.
all_policy_param_data = {}
ar_data = {}
parallel_sampler.initialize(4)
for k in range(5):
    if (load_policy):
        # policy.set_param_values(np.loadtxt('policy.txt'), trainable=True)
        # Each run k loads its own starting point 'pcb<k+1>.txt'.
        # FIX(review): np.str was a plain alias of builtin str and was removed
        # in NumPy 1.24; str(k + 1) yields the identical filename.
        policy.set_param_values(np.loadtxt('pcb' + str(k + 1) + '.txt'), trainable=True)
    avg_return = np.zeros(n_itr)
    rewards = []
    all_policy_param = []
    all_rew = []
    #np.savetxt("policy_novar.txt",snap_policy.get_param_values(trainable=True))
    for j in range(n_itr):
        # Snapshot the parameters every 100 iterations.
        if (j % 100 == 0):
            all_policy_param.append(policy.get_param_values())
        paths = parallel_sampler.sample_paths_on_trajectories(policy.get_param_values(), N, T, show_bar=False)
        paths = paths[:N]
        observations = [p["observations"] for p in paths]
        actions = [p["actions"] for p in paths]
        d_rewards = [p["rewards"] for p in paths]
        # Track the undiscounted return of every sampled path.
        rewards.append(np.array([sum(p["rewards"]) for p in paths]))
        # Discount per-step rewards in place: r_t -> r_t * discount**t.
        temp = list()
        for x in d_rewards:
            z = list()
            t = 1
            for y in x:
                z.append(y * t)
                t *= discount
            temp.append(np.array(z))
# NOTE(review): whitespace-mangled fragment left byte-identical — it ends on a
# dangling 'for ob, ac, rw in zip(...):' header whose body lies beyond this
# view, so a syntactically valid reconstruction is not possible here.
# Logically it: compiles a Theano SGD update function over the five gradient
# placeholders, then runs 10 experiment repetitions of n_itr iterations each —
# sampling N trajectories of length T, discounting per-step rewards
# (r_t -> r_t * discount**t), seeding the gradient estimate from the first
# trajectory via f_train, and (in the truncated loop) accumulating over the
# remaining trajectories.
f_update = theano.function( inputs=[eval_grad1, eval_grad2, eval_grad3, eval_grad4, eval_grad5], outputs=None, updates=sgd([eval_grad1, eval_grad2, eval_grad3, eval_grad4, eval_grad5], params, learning_rate=learning_rate)) alla = [] for i in range(10): if (load_policy): policy.set_param_values(np.loadtxt('policy.txt'), trainable=True) avg_return = np.zeros(n_itr) #np.savetxt("policy_novar.txt",snap_policy.get_param_values(trainable=True)) for j in range(n_itr): paths = parallel_sampler.sample_paths_on_trajectories( policy.get_param_values(), N, T, show_bar=False) #baseline.fit(paths) observations = [p["observations"] for p in paths] actions = [p["actions"] for p in paths] d_rewards = [p["rewards"] for p in paths] temp = list() for x in d_rewards: z = list() t = 1 for y in x: z.append(y * t) t *= discount temp.append(np.array(z)) d_rewards = temp s_g = f_train(observations[0], actions[0], d_rewards[0]) for ob, ac, rw in zip(observations[1:], actions[1:], d_rewards[1:]):
# NOTE(review): whitespace-mangled fragment; indentation reconstructed.
# Cross-run result containers (keyed per run elsewhere in the script).
importance_weights_data = {}
rewards_snapshot_data = {}
rewards_subiter_data = {}
n_sub_iter_data = {}
all_policy_param_data = {}
parallel_sampler.initialize(4)
for k in range(10):
    if (load_policy):
        # snap_policy.set_param_values(np.loadtxt('policy.txt'), trainable=True)
        # policy.set_param_values(np.loadtxt('policy.txt'), trainable=True)
        # Each run k loads its own starting point 'pc<k+1>.txt'.
        # FIX(review): np.str was a plain alias of builtin str and was removed
        # in NumPy 1.24; str(k + 1) yields the identical filename.
        snap_policy.set_param_values(np.loadtxt('pc' + str(k + 1) + '.txt'), trainable=True)
        policy.set_param_values(np.loadtxt('pc' + str(k + 1) + '.txt'), trainable=True)
    else:
        # Keep the current policy in sync with the snapshot policy.
        policy.set_param_values(snap_policy.get_param_values(trainable=True), trainable=True)
    avg_return = np.zeros(s_tot)
    #np.savetxt("policy_novar.txt",snap_policy.get_param_values(trainable=True))
    # Per-run bookkeeping containers.
    n_sub_iter = []
    rewards_sub_iter = []
    rewards_snapshot = []
    importance_weights = []
    variance_svrg = []
    variance_sgd = []
    all_policy_param = []
    j = 0
    # Consume the trajectory budget s_tot in snapshot batches of N.
    # NOTE(review): the loop body continues beyond the visible fragment.
    while j < s_tot - N:
        all_policy_param.append(policy.get_param_values())
        paths = parallel_sampler.sample_paths_on_trajectories(policy.get_param_values(), N, T, show_bar=False)