def main():
    # General env properties
    env_tgt = gym.make('minigolf-v0')
    env_src = gym.make('minigolf-v0')
    param_space_size = 4
    state_space_size = 1
    env_param_space_size = 4
    episode_length = 20
    gaussian_transitions = False

    env_param = sc.EnvParam(env_tgt, param_space_size, state_space_size,
                            env_param_space_size, episode_length, gaussian_transitions)

    mean_initial_param = np.random.normal(np.ones(param_space_size) * 0.2, 0.01)
    variance_initial_param = 0
    variance_action = 0.1
    feats = polynomial

    simulation_param = sc.SimulationParam(mean_initial_param, variance_initial_param,
                                          variance_action, arguments.batch_size,
                                          arguments.iterations, arguments.gamma, None,
                                          arguments.learning_rate, arguments.ess_min,
                                          "Yes" if arguments.adaptive else "No",
                                          arguments.n_min, use_adam=arguments.use_adam)

    # Source tasks
    pis = [[0.20097172, 0.20182519, 0.19957835, 0.20096946],
           [0.34099334, 0.21422279, 0.20053974, 0.20105477],
           [0.46923638, 0.22986188, 0.20266549, 0.20137892],
           [0.64977232, 0.26575410, 0.21014003, 0.20300604],
           [0.89955698, 0.32707635, 0.23490234, 0.21518798],
           [1.09006747, 0.35577241, 0.24517702, 0.22017502],
           [1.22329955, 0.40621784, 0.28787368, 0.24836521],
           [1.34824502, 0.43750823, 0.29981691, 0.25448715],
           [1.24846429, 0.42882867, 0.27008977, 0.22433061],
           [1.41946655, 0.53908188, 0.33195278, 0.25586648]]

    putter_length = np.random.uniform(0.7, 1.0, arguments.n_source_models)
    friction = np.random.uniform(0.1, 0.15, arguments.n_source_models)
    hole_size = np.random.uniform(0.10, 0.15, arguments.n_source_models)
    envs = [[putter_length[i], friction[i], hole_size[i], 0.09]
            for i in range(arguments.n_source_models)]

    policy_params = []
    env_params = []
    num_policy = len(pis)
    for e in envs:
        for p in pis:
            policy_params.append(p)
            env_params.append(e)
    policy_params = np.array(policy_params)
    env_params = np.array(env_params)

    source_envs = []
    for param in np.array(envs):
        source_envs.append(gym.make('minigolf-v0'))
        source_envs[-1].setParams(param)

    n_config_cv = policy_params.shape[0]
    n_source = [arguments.n_source_samples * len(pis) for _ in envs]

    data = stc.sourceTaskCreationSpec(env_src, episode_length, arguments.n_source_samples,
                                      arguments.gamma, variance_action, policy_params,
                                      env_params, param_space_size, state_space_size,
                                      env_param_space_size, features=feats, env_target=env_tgt)

    # Envs for discrete model estimation
    possible_env_params = envs  # possible envs are the source envs
    possible_envs = []
    for param in np.array(possible_env_params):
        possible_envs.append(gym.make('minigolf-v0'))
        possible_envs[-1].setParams(param)

    stats = {}
    for estimator in estimators:
        stats[estimator] = []

    for estimator in estimators:
        print(estimator)
        model_estimation = 0
        off_policy = 0
        discrete_estimation = 0
        model = None
        env_src_models = None

        # Create a new dataset object
        source_dataset = sc.SourceDataset(*data, n_config_cv)
        source_dataset.policy_per_model = num_policy

        if estimator in ["GPOMDP", "REINFORCE", "REINFORCE-BASELINE"]:
            name = estimator
        else:
            off_policy = 1
            name = estimator[:-3]

            if estimator.endswith("SR"):
                # Create a fake dataset for the sample-reuse algorithm
                data_sr = stc.sourceTaskCreationSpec(env_src, episode_length, 1,
                                                     arguments.gamma, variance_action,
                                                     np.array([[0, 0, 0, 0]]),
                                                     np.array([[1.0, 0.131, 0.1, 0.09]]),
                                                     param_space_size, state_space_size,
                                                     env_param_space_size, features=feats,
                                                     env_target=env_tgt)
                source_dataset = sc.SourceDataset(*data_sr, 1)
            elif estimator.endswith("DI"):
                model_estimation = 1
                discrete_estimation = 1
                model = Models(possible_envs)
            elif estimator.endswith("GP") or estimator.endswith("ES") or \
                    estimator.endswith("MI") or estimator.endswith("NS"):
                model_estimation = 1
                model = ModelEstimatorRKHS(kernel_rho=10, kernel_lambda=[100, 10],
                                           sigma_env=env_tgt.sigma_noise,
                                           sigma_pi=np.sqrt(variance_action),
                                           T=arguments.rkhs_horizon, R=arguments.rkhs_samples,
                                           lambda_=0.0, source_envs=source_envs,
                                           n_source=n_source, max_gp=arguments.max_gp_samples,
                                           state_dim=1, linear_kernel=False,
                                           balance_coeff=arguments.balance_coeff, alpha_gp=1,
                                           print_mse=arguments.print_mse, features=polynomial,
                                           param_dim=param_space_size, target_env=env_tgt,
                                           heteroscedastic=True)
                if estimator.endswith("GP"):  # or estimator.endswith("NS"):
                    model.use_gp = True
                elif estimator.endswith("MI"):
                    model.use_gp_generate_mixture = True

                if estimator.endswith("NS"):
                    n_models = int(source_dataset.episodes_per_config.shape[0] /
                                   source_dataset.policy_per_model)
                    transition_models = []
                    for i in range(n_models):
                        model_estimator = ModelEstimatorRKHS(
                            kernel_rho=10, kernel_lambda=[100, 10],
                            sigma_env=env_tgt.sigma_noise, sigma_pi=np.sqrt(variance_action),
                            T=arguments.rkhs_horizon, R=arguments.rkhs_samples, lambda_=0.0,
                            source_envs=source_envs, n_source=n_source,
                            max_gp=arguments.max_gp_samples, state_dim=1, linear_kernel=False,
                            balance_coeff=arguments.balance_coeff, alpha_gp=1,
                            print_mse=arguments.print_mse, features=polynomial,
                            param_dim=param_space_size, target_env=env_tgt,
                            heteroscedastic=True, max_gp_src=arguments.max_gp_samples_src)
                        transition_models.append(model_estimator)
                    env_src_models = SourceEstimator(source_dataset, transition_models)

        result = la.learnPolicy(env_param, simulation_param, source_dataset, name,
                                off_policy=off_policy, model_estimation=model_estimation,
                                dicrete_estimation=discrete_estimation, model_estimator=model,
                                verbose=not arguments.quiet, features=polynomial,
                                source_estimator=env_src_models if estimator.endswith("NS") else None)
        stats[estimator].append(result)

    return stats
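# The entry points in this file read their hyperparameters from a module-level `arguments`
# namespace. A minimal sketch of how that namespace could be built with argparse is given
# below; the flags, types and defaults are assumptions for illustration, only the attribute
# names are the ones actually accessed by the script above (other entry points below read a
# few extra attributes of the same shape, e.g. random_src, dump_estimated_model).
import argparse

def parse_arguments():
    parser = argparse.ArgumentParser()
    parser.add_argument("--batch_size", type=int, default=10)
    parser.add_argument("--iterations", type=int, default=200)
    parser.add_argument("--gamma", type=float, default=0.99)
    parser.add_argument("--learning_rate", type=float, default=1e-3)
    parser.add_argument("--ess_min", type=int, default=25)
    parser.add_argument("--adaptive", action="store_true")
    parser.add_argument("--n_min", type=int, default=3)
    parser.add_argument("--use_adam", action="store_true")
    parser.add_argument("--n_source_models", type=int, default=5)
    parser.add_argument("--n_source_samples", type=int, default=10)
    parser.add_argument("--rkhs_horizon", type=int, default=20)
    parser.add_argument("--rkhs_samples", type=int, default=50)
    parser.add_argument("--max_gp_samples", type=int, default=1000)
    parser.add_argument("--max_gp_samples_src", type=int, default=1000)
    parser.add_argument("--balance_coeff", action="store_true")
    parser.add_argument("--print_mse", action="store_true")
    parser.add_argument("--quiet", action="store_true")
    return parser.parse_args()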
tot_rewards_real = []
x = range(num_batch)
# folder = "20190122_165737"

param_space_size = 4
mean_initial_param = np.zeros(param_space_size)
variance_initial_param = np.zeros(param_space_size)
batch_size = 10
discount_factor = 0.99
ess_min = 20
adaptive = "No"
n_min = 3
variance_action = 0.1

simulation_param = sc.SimulationParam(mean_initial_param, variance_initial_param,
                                      variance_action, batch_size, num_batch,
                                      discount_factor, None, None, ess_min, adaptive, n_min)

features = identity
env_tgt = gym.make('cartpolec-v0')
variance_env = env_tgt.getEnvParam()[-1]
param_space_size = 4
state_space_size = 4
env_param_space_size = 3
episode_length = 200
discount_factor_timestep = np.asarray([discount_factor**i for i in range(episode_length)])
env_param = sc.EnvParam(env_tgt, param_space_size, state_space_size,
                        env_param_space_size, episode_length)
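# discount_factor_timestep holds gamma^t for t = 0, ..., episode_length - 1. A minimal
# (assumed) usage sketch: dotting each reward trajectory with this vector gives the
# discounted return of an episode. The helper below is illustrative and not part of the
# original script.
import numpy as np

def discounted_returns(rewards, discount_factor_timestep):
    # rewards: array of shape [n_episodes, episode_length] with per-step rewards
    return np.sum(rewards * discount_factor_timestep[np.newaxis, :], axis=1)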
def main(transition_model):
    """ cartpole sample reuse on the estimated planning environment """
    env_tgt = gym.make('cartpolec-v0')
    variance_env = 0
    env_planning = PlanningEnv(transition_model, env_tgt, np.sqrt(variance_env))
    param_space_size = 4
    state_space_size = 4
    env_param_space_size = 3
    episode_length = 200

    env_param = sc.EnvParam(env_planning, param_space_size, state_space_size,
                            env_param_space_size, episode_length)

    mean_initial_param = np.random.normal(np.zeros(param_space_size), 0.01)
    variance_initial_param = 0
    variance_action = 0.1
    batch_size = 10
    discount_factor = 0.99
    ess_min = 25
    adaptive = "No"
    n_min = 5

    simulation_param = sc.SimulationParam(mean_initial_param, variance_initial_param,
                                          variance_action, batch_size, num_batch,
                                          discount_factor, None, None, ess_min, adaptive, n_min)

    stats = {}
    for estimator in estimators:
        stats[estimator] = []

    for estimator in estimators:
        print(estimator)
        source_dataset_batch_size = 1
        policy_params = np.array([[0, 0, 0, 0]])
        env_params = np.array([[1.0, 0.5, 0.09]])
        name = estimator[:-3]

        [source_task, source_param, episodes_per_configuration, next_states_unclipped,
         actions_clipped, next_states_unclipped_denoised] = stc.sourceTaskCreationSpec(
            env_tgt, episode_length, source_dataset_batch_size, discount_factor,
            variance_action, policy_params, env_params, param_space_size, state_space_size,
            env_param_space_size)

        source_dataset = sc.SourceDataset(source_task, source_param,
                                          episodes_per_configuration, next_states_unclipped,
                                          actions_clipped, next_states_unclipped_denoised, 1)

        off_policy = 0
        name = estimator
        simulation_param.batch_size = 10
        simulation_param.learning_rate = learning_rate

        result = la.learnPolicy(env_param, simulation_param, source_dataset, name,
                                off_policy=off_policy)
        stats[estimator].append(result)

    return stats
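# main() above expects an already-fitted transition model for PlanningEnv. A minimal
# invocation sketch, assuming the model was serialized elsewhere with pickle; the file name
# is a placeholder, not a path defined by this repository.
import pickle

if __name__ == "__main__":
    with open("transition_model.pkl", "rb") as f:
        transition_model = pickle.load(f)
    stats = main(transition_model)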
def main():
    # General env properties
    env_tgt = gym.make('LQG1D-v0')
    env_src = gym.make('LQG1D-v0')
    param_space_size = 1
    state_space_size = 1
    env_param_space_size = 3
    episode_length = 20
    gaussian_transition = True

    env_param = sc.EnvParam(env_tgt, param_space_size, state_space_size,
                            env_param_space_size, episode_length, gaussian_transition)

    mean_initial_param = -0.1 * np.ones(param_space_size)
    variance_initial_param = 0
    variance_action = 0.1

    simulation_param = sc.SimulationParam(mean_initial_param, variance_initial_param,
                                          variance_action, arguments.batch_size,
                                          arguments.iterations, arguments.gamma, None,
                                          arguments.learning_rate, arguments.ess_min,
                                          "Yes" if arguments.adaptive else "No",
                                          arguments.n_min, use_adam=arguments.use_adam)

    # Source tasks
    pis = [[-0.1], [-0.15], [-0.2], [-0.25], [-0.3], [-0.35], [-0.4], [-0.45]]
    if arguments.random_src:
        A = np.random.uniform(0.6, 1.4, arguments.n_source_models)
        B = np.random.uniform(0.8, 1.2, arguments.n_source_models)
    else:
        A = np.array(arguments.src_A)
        B = np.array(arguments.src_B)
    envs = [[A[i], B[i], 0.09] for i in range(A.shape[0])]
    print(envs)

    policy_params = []
    env_params = []
    for e in envs:
        for p in pis:
            policy_params.append(p)
            env_params.append(e)
    policy_params = np.array(policy_params)
    env_params = np.array(env_params)

    n_config_cv = policy_params.shape[0]

    data = stc.sourceTaskCreationSpec(env_src, episode_length, arguments.n_source_samples,
                                      arguments.gamma, variance_action, policy_params,
                                      env_params, param_space_size, state_space_size,
                                      env_param_space_size)

    stats = {}
    for estimator in estimators:
        stats[estimator] = []

    for estimator in estimators:
        print(estimator)

        # Create a new dataset object
        source_dataset = sc.SourceDataset(*data, n_config_cv)
        off_policy = 0 if estimator in ["GPOMDP", "REINFORCE", "REINFORCE-BASELINE"] else 1
        name = estimator

        if estimator.endswith("SR"):
            # Create a fake dataset for the sample-reuse algorithm
            data_sr = stc.sourceTaskCreationSpec(env_src, episode_length, 1, arguments.gamma,
                                                 variance_action, np.array([[-0.1]]),
                                                 np.array([[1.0, 1.0, 0.09]]),
                                                 param_space_size, state_space_size,
                                                 env_param_space_size)
            source_dataset = sc.SourceDataset(*data_sr, 1)
            name = estimator[:-3]

        result = la.learnPolicy(env_param, simulation_param, source_dataset, name,
                                off_policy=off_policy, model_estimation=0,
                                dicrete_estimation=0, model_estimator=None,
                                verbose=not arguments.quiet)
        stats[estimator].append(result)

    return stats
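# Estimator names carry a three-character suffix ("-SR" above, "-DI"/"-GP"/"-ES"/"-MI"/"-NS"
# in the model-estimation scripts) that selects the transfer variant; the scripts strip it
# with estimator[:-3] before handing the base name to la.learnPolicy. A small illustration
# (the second name is a made-up example, not necessarily one used in the experiments):
for estimator in ["IS-SN", "MIS-SR"]:
    print(estimator, "->", estimator[:-3])  # IS-SN -> IS, MIS-SR -> MIS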
def main():
    # General env properties
    env_tgt = gym.make('minigolf-v0')
    env_src = gym.make('minigolf-v0')
    param_space_size = 4
    state_space_size = 1
    env_param_space_size = 4
    episode_length = 20
    gaussian_transitions = False

    env_param = sc.EnvParam(env_tgt, param_space_size, state_space_size,
                            env_param_space_size, episode_length, gaussian_transitions)

    mean_initial_param = np.random.normal(np.ones(param_space_size) * 0.2, 0.01)
    variance_initial_param = 0
    variance_action = 0.1
    feats = polynomial

    simulation_param = sc.SimulationParam(mean_initial_param, variance_initial_param,
                                          variance_action, arguments.batch_size,
                                          arguments.iterations, arguments.gamma, None,
                                          arguments.learning_rate, arguments.ess_min,
                                          "Yes" if arguments.adaptive else "No",
                                          arguments.n_min, use_adam=arguments.use_adam)

    # Source tasks
    pis = [[0.20097172, 0.20182519, 0.19957835, 0.20096946],
           [0.34099334, 0.21422279, 0.20053974, 0.20105477],
           [0.46923638, 0.22986188, 0.20266549, 0.20137892],
           [0.64977232, 0.26575410, 0.21014003, 0.20300604],
           [0.89955698, 0.32707635, 0.23490234, 0.21518798],
           [1.09006747, 0.35577241, 0.24517702, 0.22017502],
           [1.22329955, 0.40621784, 0.28787368, 0.24836521],
           [1.34824502, 0.43750823, 0.29981691, 0.25448715],
           [1.24846429, 0.42882867, 0.27008977, 0.22433061],
           [1.41946655, 0.53908188, 0.33195278, 0.25586648]]

    putter_length = np.random.uniform(0.7, 1.0, arguments.n_source_models)
    friction = np.random.uniform(0.065, 0.196, arguments.n_source_models)
    hole_size = np.random.uniform(0.10, 0.15, arguments.n_source_models)
    envs = [[putter_length[i], friction[i], hole_size[i], 0.09]
            for i in range(arguments.n_source_models)]

    policy_params = []
    env_params = []
    num_policy = len(pis)
    for e in envs:
        for p in pis:
            policy_params.append(p)
            env_params.append(e)
    policy_params = np.array(policy_params)
    env_params = np.array(env_params)

    n_config_cv = policy_params.shape[0]

    data = stc.sourceTaskCreationSpec(env_src, episode_length, arguments.n_source_samples,
                                      arguments.gamma, variance_action, policy_params,
                                      env_params, param_space_size, state_space_size,
                                      env_param_space_size, features=feats, env_target=env_tgt)

    stats = {}
    for estimator in estimators:
        stats[estimator] = []

    for estimator in estimators:
        print(estimator)

        # Create a new dataset object
        source_dataset = sc.SourceDataset(*data, n_config_cv)
        source_dataset.policy_per_model = num_policy
        off_policy = 0 if estimator in ["GPOMDP", "REINFORCE", "REINFORCE-BASELINE"] else 1
        name = estimator

        if estimator.endswith("SR"):
            # Create a fake dataset for the sample-reuse algorithm
            data_sr = stc.sourceTaskCreationSpec(env_src, episode_length, 1, arguments.gamma,
                                                 variance_action, np.array([[0, 0, 0, 0]]),
                                                 np.array([[1.0, 0.131, 0.1, 0.09]]),
                                                 param_space_size, state_space_size,
                                                 env_param_space_size, features=feats,
                                                 env_target=env_tgt)
            source_dataset = sc.SourceDataset(*data_sr, 1)
            source_dataset.policy_per_model = num_policy
            name = estimator[:-3]

        result = la.learnPolicy(env_param, simulation_param, source_dataset, name,
                                off_policy=off_policy, model_estimation=0,
                                dicrete_estimation=0, model_estimator=None,
                                verbose=not arguments.quiet, features=feats)
        stats[estimator].append(result)

    return stats
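# Each source configuration pairs one source policy with one source environment, so the
# nested loop above yields len(envs) * len(pis) rows and n_config_cv counts exactly that
# product. A tiny self-contained illustration of the pairing (toy values, not the ones used
# above):
def pair_configurations(pis, envs):
    policy_params, env_params = [], []
    for e in envs:
        for p in pis:
            policy_params.append(p)
            env_params.append(e)
    return policy_params, env_params

pols, confs = pair_configurations([[0.2] * 4, [0.3] * 4], [[0.8, 0.12, 0.11, 0.09]])
assert len(pols) == 2 * 1  # len(pis) * len(envs)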
def main():
    # General env properties
    env_tgt = gym.make('LQG1D-v0')
    env_src = gym.make('LQG1D-v0')
    param_space_size = 1
    state_space_size = 1
    env_param_space_size = 3
    episode_length = 20
    gaussian_transitions = True

    env_param = sc.EnvParam(env_tgt, param_space_size, state_space_size,
                            env_param_space_size, episode_length, gaussian_transitions)

    mean_initial_param = -0.1 * np.ones(param_space_size)
    variance_initial_param = 0
    variance_action = 0.1

    simulation_param = sc.SimulationParam(mean_initial_param, variance_initial_param,
                                          variance_action, arguments.batch_size,
                                          arguments.iterations, arguments.gamma, None,
                                          arguments.learning_rate, arguments.ess_min, "No",
                                          arguments.n_min, use_adam=arguments.use_adam)

    # Source tasks
    pis = [[-0.1], [-0.2], [-0.3], [-0.4], [-0.5], [-0.6], [-0.7], [-0.8]]
    A = np.random.uniform(0.6, 1.4, arguments.n_source_models)
    B = np.random.uniform(0.8, 1.2, arguments.n_source_models)
    envs = [[A[i], B[i], 0.09] for i in range(A.shape[0])]

    policy_params = []
    env_params = []
    for p in pis:
        for e in envs:
            policy_params.append(p)
            env_params.append(e)
    policy_params = np.array(policy_params)
    env_params = np.array(env_params)

    source_envs = []
    for param in np.array(envs):
        source_envs.append(gym.make('LQG1D-v0'))
        source_envs[-1].setParams(param)

    n_config_cv = policy_params.shape[0]
    n_source = [arguments.n_source_samples * len(pis) for _ in envs]

    data = stc.sourceTaskCreationSpec(env_src, episode_length, arguments.n_source_samples,
                                      arguments.gamma, variance_action, policy_params,
                                      env_params, param_space_size, state_space_size,
                                      env_param_space_size)

    # Envs for discrete model estimation
    possible_env_params = [[1.0, 1.0, 0.09], [1.5, 1.0, 0.09], [0.5, 1.0, 0.09],
                           [1.2, 0.8, 0.09], [0.8, 1.2, 0.09], [1.1, 0.9, 0.09],
                           [0.9, 1.1, 0.09], [1.5, 0.5, 0.09]]
    possible_envs = []
    for param in np.array(possible_env_params):
        possible_envs.append(gym.make('LQG1D-v0'))
        possible_envs[-1].setParams(param)

    stats = {}
    for estimator in estimators:
        stats[estimator] = []

    for estimator in estimators:
        print(estimator)
        model_estimation = 0
        off_policy = 0
        discrete_estimation = 0
        model = None

        # Create a new dataset object
        source_dataset = sc.SourceDataset(*data, n_config_cv)

        if estimator in ["GPOMDP", "REINFORCE", "REINFORCE-BASELINE"]:
            name = estimator
        else:
            off_policy = 1
            name = estimator[:-3]

            if estimator.endswith("SR"):
                # Create a fake dataset for the sample-reuse algorithm
                data_sr = stc.sourceTaskCreationSpec(env_src, episode_length, 1,
                                                     arguments.gamma, variance_action,
                                                     np.array([[-0.1]]),
                                                     np.array([[1.0, 1.0, 0.09]]),
                                                     param_space_size, state_space_size,
                                                     env_param_space_size)
                source_dataset = sc.SourceDataset(*data_sr, 1)
            elif estimator.endswith("DI"):
                model_estimation = 1
                discrete_estimation = 1
                model = Models(possible_envs)
            elif estimator.endswith("GP") or estimator.endswith("ES") or \
                    estimator.endswith("MI"):
                model_estimation = 1
                model = ModelEstimatorRKHS(kernel_rho=1, kernel_lambda=[1, 1],
                                           sigma_env=env_tgt.sigma_noise,
                                           sigma_pi=np.sqrt(variance_action),
                                           T=episode_length, R=arguments.rkhs_samples,
                                           lambda_=0.0, source_envs=source_envs,
                                           n_source=n_source, max_gp=arguments.max_gp_samples,
                                           state_dim=1, linear_kernel=True,
                                           balance_coeff=arguments.balance_coeff,
                                           target_env=env_tgt if arguments.print_mse else None)
                if estimator.endswith("GP"):
                    model.use_gp = True
                elif estimator.endswith("MI"):
                    model.use_gp_generate_mixture = True

        result = la.learnPolicy(env_param, simulation_param, source_dataset, name,
                                off_policy=off_policy, model_estimation=model_estimation,
                                dicrete_estimation=discrete_estimation, model_estimator=model,
                                verbose=not arguments.quiet)
        stats[estimator].append(result)

    return stats
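# main() returns a dict mapping each estimator name to its list of results. A minimal sketch
# of running several independent repetitions and persisting the raw results with pickle; the
# run loop, number of repetitions and file name are assumptions, not part of this script.
import pickle

if __name__ == "__main__":
    all_stats = [main() for _ in range(5)]
    with open("lqg1d_transfer_stats.pkl", "wb") as f:
        pickle.dump(all_stats, f)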
def main(id):
    # General env properties
    env_tgt = gym.make('cartpolec-v0')
    env_src = gym.make('cartpolec-v0')
    param_space_size = 4
    state_space_size = 4
    env_param_space_size = 3
    episode_length = 200
    gaussian_transition = True

    env_param = sc.EnvParam(env_tgt, param_space_size, state_space_size,
                            env_param_space_size, episode_length, gaussian_transition)

    mean_initial_param = np.random.normal(np.zeros(param_space_size), 0.01)
    variance_initial_param = 0
    variance_action = 0.1

    simulation_param = sc.SimulationParam(mean_initial_param, variance_initial_param,
                                          variance_action, arguments.batch_size,
                                          arguments.iterations, arguments.gamma, None,
                                          arguments.learning_rate, arguments.ess_min,
                                          "Yes" if arguments.adaptive else "No",
                                          arguments.n_min, use_adam=arguments.use_adam)

    # Source tasks
    pis = [[-0.04058811, 0.06820783, 0.09962419, -0.01481458],
           [-0.04327763, 0.01926409, 0.10651812, 0.07304843],
           [-0.04660533, -0.08301117, 0.14598312, 0.31524803],
           [-0.04488895, -0.04959011, 0.20856307, 0.52564195],
           [-0.02085553, 0.11530108, 0.24525215, 0.58338479],
           [-0.03072567, 0.15546779, 0.27241488, 0.65833969],
           [-0.05493752, 0.11100809, 0.30213226, 0.73134919],
           [-0.02389198, 0.18004238, 0.30697023, 0.72447482],
           [-0.0702051, 0.17653729, 0.32254312, 0.72004621],
           [-0.09675066, 0.16063462, 0.32343255, 0.73801456]]

    m = np.random.uniform(0.8, 1.2, arguments.n_source_models)
    l = np.random.uniform(0.4, 0.6, arguments.n_source_models)
    envs = [[m[i], l[i], 0.09] for i in range(m.shape[0])]

    policy_params = []
    env_params = []
    num_policy = len(pis)
    for e in envs:
        for p in pis:
            policy_params.append(p)
            env_params.append(e)
    policy_params = np.array(policy_params)
    env_params = np.array(env_params)

    source_envs = []
    for param in np.array(envs):
        source_envs.append(gym.make('cartpolec-v0'))
        source_envs[-1].setParams(param)

    n_config_cv = policy_params.shape[0]
    n_source = [arguments.n_source_samples * len(pis) for _ in envs]

    data = stc.sourceTaskCreationSpec(env_src, episode_length, arguments.n_source_samples,
                                      arguments.gamma, variance_action, policy_params,
                                      env_params, param_space_size, state_space_size,
                                      env_param_space_size)

    # Envs for discrete model estimation
    possible_env_params = [[1.0, 0.5, 0.09], [0.8, 0.3, 0.09], [1.2, 0.7, 0.09],
                           [1.1, 0.6, 0.09], [0.9, 0.4, 0.09], [0.9, 0.6, 0.09],
                           [1.1, 0.4, 0.09], [1.5, 1.0, 0.09]]
    possible_envs = []
    for param in np.array(possible_env_params):
        possible_envs.append(gym.make('cartpolec-v0'))
        possible_envs[-1].setParams(param)

    stats = {}
    for estimator in estimators:
        stats[estimator] = []

    for estimator in estimators:
        print(estimator)
        model_estimation = 0
        off_policy = 0
        discrete_estimation = 0
        model = None
        env_src_models = None

        # Create a new dataset object
        source_dataset = sc.SourceDataset(*data, n_config_cv)
        source_dataset.policy_per_model = num_policy

        if estimator in ["GPOMDP", "REINFORCE", "REINFORCE-BASELINE"]:
            name = estimator
        else:
            off_policy = 1
            name = estimator[:-3]

            if estimator.endswith("SR"):
                # Create a fake dataset for the sample-reuse algorithm
                data_sr = stc.sourceTaskCreationSpec(env_src, episode_length, 1,
                                                     arguments.gamma, variance_action,
                                                     np.array([[0, 0, 0, 0]]),
                                                     np.array([[1.0, 0.5, 0.09]]),
                                                     param_space_size, state_space_size,
                                                     env_param_space_size)
                source_dataset = sc.SourceDataset(*data_sr, 1)
            elif estimator.endswith("DI"):
                model_estimation = 1
                discrete_estimation = 1
                model = Models(possible_envs)
            elif estimator.endswith("GP") or estimator.endswith("ES") or \
                    estimator.endswith("MI") or estimator.endswith("NS"):
                model_estimation = 1
                model = ModelEstimatorRKHS(kernel_rho=1, kernel_lambda=[1, 1, 1, 1, 1],
                                           sigma_env=env_tgt.sigma_env,
                                           sigma_pi=np.sqrt(variance_action),
                                           T=arguments.rkhs_horizon, R=arguments.rkhs_samples,
                                           lambda_=0.0, source_envs=source_envs,
                                           n_source=n_source, max_gp=arguments.max_gp_samples,
                                           state_dim=4, linear_kernel=False,
                                           balance_coeff=arguments.balance_coeff,
                                           alpha_gp=1e-5,
                                           target_env=env_tgt if arguments.print_mse else None,
                                           id=id)
                if estimator.endswith("GP"):
                    model.use_gp = True
                elif estimator.endswith("MI"):
                    model.use_gp_generate_mixture = True

                if estimator.endswith("NS"):
                    n_models = int(source_dataset.episodes_per_config.shape[0] /
                                   source_dataset.policy_per_model)
                    transition_models = []
                    for i in range(n_models):
                        model_estimator = ModelEstimatorRKHS(
                            kernel_rho=1, kernel_lambda=[1, 1, 1, 1, 1],
                            sigma_env=env_tgt.sigma_env, sigma_pi=np.sqrt(variance_action),
                            T=arguments.rkhs_horizon, R=arguments.rkhs_samples, lambda_=0.0,
                            source_envs=source_envs, n_source=n_source,
                            max_gp=arguments.max_gp_samples_src, state_dim=4,
                            linear_kernel=False, balance_coeff=arguments.balance_coeff,
                            alpha_gp=1e-5,
                            target_env=env_tgt if arguments.print_mse else None, id=id)
                        transition_models.append(model_estimator)
                    env_src_models = SourceEstimator(source_dataset, transition_models)

        result = la.learnPolicy(env_param, simulation_param, source_dataset, name,
                                off_policy=off_policy, model_estimation=model_estimation,
                                dicrete_estimation=discrete_estimation, model_estimator=model,
                                verbose=not arguments.quiet,
                                dump_model=arguments.dump_estimated_model,
                                iteration_dump=arguments.iteration_dump,
                                source_estimator=env_src_models if estimator.endswith("NS") else None)
        stats[estimator].append(result)

    return stats
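# This variant of main() takes a run identifier, which is forwarded to ModelEstimatorRKHS
# (e.g., to tell parallel runs apart when estimated models are dumped). A sketch of launching
# several runs in parallel with multiprocessing; the pool size and number of runs are
# assumptions, not settings from this repository.
from multiprocessing import Pool

if __name__ == "__main__":
    n_runs = 10
    with Pool(processes=5) as pool:
        results = pool.map(main, range(n_runs))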
def main():
    """ lqg1d sample reuse """
    env_tgt = gym.make('LQG1D-v0')
    env_src = gym.make('LQG1D-v0')
    param_space_size = 1
    state_space_size = 1
    env_param_space_size = 3
    episode_length = 20
    gaussian_transitions = True

    env_param = sc.EnvParam(env_tgt, param_space_size, state_space_size,
                            env_param_space_size, episode_length, gaussian_transitions)

    mean_initial_param = 0 * np.ones(param_space_size)
    variance_initial_param = 0
    variance_action = 0.1
    batch_size = 10
    discount_factor = 0.99
    ess_min = 25
    adaptive = "No"
    n_min = 3

    simulation_param = sc.SimulationParam(mean_initial_param, variance_initial_param,
                                          variance_action, batch_size, num_batch,
                                          discount_factor, None, None, ess_min, adaptive, n_min)

    # Source task for lqg1d
    source_dataset_batch_size = 1
    discount_factor = 0.99
    pis = [[-0.1]]  # , [-0.2], [-0.3], [-0.4], [-0.5], [-0.6], [-0.7], [-0.8]
    A = np.random.uniform(0.5, 1.5, 1)
    B = np.random.uniform(0.8, 1.2, 1)
    variance_env = 0.09
    envs = []
    for i in range(len(A)):
        envs.append([A[i], B[i], variance_env])

    policy_params = []
    env_params = []
    for p in pis:
        for e in envs:
            policy_params.append(p)
            env_params.append(e)
    policy_params = np.array(policy_params)
    env_params = np.array(env_params)

    n_config_cv = policy_params.shape[0]

    [source_task, source_param, episodes_per_configuration, next_states_unclipped,
     actions_clipped, next_states_unclipped_denoised] = stc.sourceTaskCreationSpec(
        env_src, episode_length, source_dataset_batch_size, discount_factor, variance_action,
        policy_params, env_params, param_space_size, state_space_size, env_param_space_size)

    stats = {}
    for estimator in estimators:
        stats[estimator] = []

    self_normalised = 0
    for estimator, learning_rate in zip(estimators, learning_rates):
        print(estimator)
        simulation_param.learning_rate = learning_rate

        if estimator in ["GPOMDP", "REINFORCE", "REINFORCE-BASELINE"]:
            off_policy = 0
            name = estimator
            simulation_param.batch_size = 10
            self_normalised = 0
        elif estimator == "IS-SN":
            self_normalised = 1
            name = estimator[:-3]
            # estimator = estimator[:-3]
            off_policy = 1
        elif estimator.endswith("SR"):
            # if sample reuse
            source_dataset_batch_size = 1
            discount_factor = 0.99
            policy_params = np.array([[-1]])
            env_params = np.array([[1.5, 1, 0.09]])
            n_config_cv = 1
            name = estimator[:-3]
            self_normalised = 0
            [source_task, source_param, episodes_per_configuration, next_states_unclipped,
             actions_clipped, next_states_unclipped_denoised] = stc.sourceTaskCreationSpec(
                env_src, episode_length, source_dataset_batch_size, discount_factor,
                variance_action, policy_params, env_params, param_space_size,
                state_space_size, env_param_space_size)
        else:
            off_policy = 1
            name = estimator
            self_normalised = 0
            simulation_param.learning_rate = learning_rate

        source_dataset = sc.SourceDataset(source_task, source_param,
                                          episodes_per_configuration, next_states_unclipped,
                                          actions_clipped, next_states_unclipped_denoised,
                                          n_config_cv)
        simulation_param.learning_rate = learning_rate

        result = la.learnPolicy(env_param, simulation_param, source_dataset, name,
                                off_policy=off_policy, self_normalised=self_normalised)
        stats[estimator].append(result)

    return stats
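# Unlike the other entry points, this one pairs each estimator with its own learning rate
# through zip(estimators, learning_rates), so the two module-level lists must match in order
# and length. A sketch of how they might be declared; the names beyond those appearing above
# and the step sizes are assumptions, not tuned settings.
estimators = ["REINFORCE", "GPOMDP", "IS", "IS-SN", "IS-SR"]
learning_rates = [2e-5, 2e-5, 1e-5, 1e-5, 1e-5]
assert len(estimators) == len(learning_rates)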