def train_and_save_policy(env_name, seed, save_period, total_timesteps, hidden_dim=100):
    save_dir = "trained_policy/%s/seed_%d_hidden_%d" % (env_name, seed, hidden_dim)
    os.makedirs(save_dir, exist_ok=True)
    if len(glob.glob("%s/step_*.pkl" % save_dir)) > 0:
        print("already trained: %s" % save_dir)
        return

    def callback(_locals, _globals):
        global n_steps
        model_filepath = "%s/step_%d.pkl" % (save_dir, n_steps + 1)
        if (n_steps + 1) % save_period == 0:
            print('Saving a model to %s' % model_filepath)
            model.save(model_filepath)
        n_steps += 1
        return True

    global n_steps
    n_steps = 0
    env = gym.make(precise_env_name(env_name))
    env.seed(seed)
    set_global_seeds(seed)
    model = SAC(env, ent_coef='auto', seed=seed, hidden_dim=hidden_dim)
    model.learn(total_timesteps=total_timesteps, log_interval=10, seed=seed, callback=callback)

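# Illustrative usage (hedged example, not part of the original code): pre-train a SAC
# behavior policy and checkpoint it every `save_period` steps. The env name and the
# numeric values below are placeholders, assuming they match what precise_env_name() accepts.
# train_and_save_policy(env_name='Hopper', seed=0, save_period=100000, total_timesteps=1000000)
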
def load_trained_agent(env_name, trained_policy_seed, trained_policy_step, bias_offset=0, seed=0, hidden_dim=64):
    env = gym.make(precise_env_name(env_name))
    trained_agent = SAC.load("trained_policy/%s/seed_%d_hidden_%d/step_%d.pkl" % (
        env_name, trained_policy_seed, hidden_dim, trained_policy_step), env, seed=seed, hidden_dim=hidden_dim)
    parameters = trained_agent.get_parameters()
    for i, parameter in enumerate(parameters):
        name, value = parameter
        if name == 'actor/f2_log_std/bias:0':
            # Shift the actor's log-std bias to adjust the stochasticity of the loaded policy.
            parameters[i] = (name, value + bias_offset)
    trained_agent.load_parameters(parameters)
    return trained_agent

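# Illustrative usage (hedged example): load a checkpointed behavior policy and make it
# noisier by shifting its log-std bias. The step and offset values are placeholders.
# noisy_agent = load_trained_agent('Hopper', trained_policy_seed=0,
#                                  trained_policy_step=1000000, bias_offset=0.5)
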
def make_vectorized_env(env_name, n_envs=multiprocessing.cpu_count()):
    def make_env(env_id, seed=0):
        def _init():
            env = gym.make(env_id)
            env.seed(seed)
            return env
        set_global_seeds(seed)
        return _init
    vec_env = SubprocVecEnv([make_env(precise_env_name(env_name), i) for i in range(n_envs)])
    return vec_env

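# Illustrative usage (hedged example): build a parallel environment for policy evaluation;
# the env name and worker count are placeholders.
# eval_env = make_vectorized_env('Hopper', n_envs=4)
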
def generate_cluster(env_name, trained_policy_seed, trained_policy_step, num_episodes, num_clusters, seed, hidden_dim=64):
    """
    :return: cluster info
    - dict with key 'representatives': standardized observations of the num_clusters
      cluster representatives, selected from the batch trajectory via a cover tree
    """
    save_dir = "batch_trajectory/{}/seed_{}_hidden_{}/step_{}".format(
        env_name, trained_policy_seed, hidden_dim, trained_policy_step)
    os.makedirs(save_dir, exist_ok=True)
    trajectory_filepath = '%s/episode_%d_seed_%d.npy' % (save_dir, num_episodes, seed)
    cluster_filepath = '%s/episode_%d_seed_%d_clusters_%d.npy' % (
        save_dir, num_episodes, seed, num_clusters)

    if os.path.exists(cluster_filepath):
        cluster_result = np.load(cluster_filepath, allow_pickle=True)[()]
        print('Clusters have already been generated: %s...' % cluster_filepath)
    else:
        print('%s does not exist... generating clusters...' % cluster_filepath)
        env = gym.make(precise_env_name(env_name))
        env.seed(seed)
        set_global_seeds(seed)

        # Collect and standardize all observations from the batch trajectory.
        trajectory = np.load(trajectory_filepath, allow_pickle=True)
        obs = []
        for traj in trajectory:
            for (o, a, r, no, d) in traj:
                obs.append(o)
        obs_mean = np.mean(obs, axis=0, keepdims=True)
        obs_std = np.std(obs, axis=0, keepdims=True) + 1e-3
        stan_obs = (obs - obs_mean) / obs_std
        np.random.shuffle(stan_obs)

        # Build a cover tree over a subsample of the standardized observations.
        import time
        start_time = time.time()
        covertree = CoverTree(stan_obs[:10000], scipy.spatial.distance.euclidean, leafsize=10)
        print('used_time: {}'.format(time.time() - start_time))
        print(covertree.root.ctr_idx)

        # Traverse the cover tree breadth-first, collecting node centers as representatives
        # until num_clusters representatives have been found.
        current_parents = [covertree.root]
        representatives = set([])
        candidates = []
        while len(representatives) < num_clusters:
            if not candidates:
                for child in current_parents[0].children:
                    if isinstance(child, CoverTree._LeafNode):
                        candidates.append(child)
                    else:
                        current_parents.append(child)
                representatives.add(current_parents.pop(0).ctr_idx)
            else:
                representatives.add(candidates.pop().ctr_idx)
        print(representatives)

        cluster_result = {'representatives': stan_obs[list(representatives)]}
        np.save(cluster_filepath, cluster_result)
    return cluster_result

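# Illustrative usage (hedged example): the returned dict exposes the cluster representatives
# as standardized observations; all argument values below are placeholders.
# clusters = generate_cluster('Hopper', trained_policy_seed=0, trained_policy_step=1000000,
#                             num_episodes=100, num_clusters=10, seed=0)
# print(clusters['representatives'].shape)
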
def run(env_name, trained_policy_seed, trained_policy_step, trajectory_episode, trajectory_seed, alg, total_timesteps, seed, alg_params={}):
    hidden_dim = 64 if alg_params.get('hidden_dim') is None else alg_params['hidden_dim']
    env = gym.make(precise_env_name(env_name))
    state_dim, action_dim, max_action = env.observation_space.shape[0], env.action_space.shape[0], float(env.action_space.high[0])
    trained_agent = load_trained_agent(env_name, trained_policy_seed, trained_policy_step, hidden_dim=hidden_dim)
    parameters = trained_agent.get_parameters()

    # Load trajectory & train/valid split
    split_ratio = 0.8
    trajectory_all = generate_trajectory(env_name, trained_policy_seed, trained_policy_step, trajectory_episode,
                                         trajectory_seed, hidden_dim=hidden_dim)
    trajectory_train = trajectory_all[:int(len(trajectory_all) * split_ratio)]
    trajectory_valid = trajectory_all[int(len(trajectory_all) * split_ratio):]
    log_interval = 10000  # max(100, total_timesteps // 300)
    alg_name, trajectory = alg, trajectory_all
    batch_trajectory = None

    # Load model
    if alg == 'bc':
        model = BC(state_dim, action_dim, max_action, hidden_dim=hidden_dim)
    elif alg == 'vaebc':
        model = VAEBC(state_dim, action_dim, max_action, hidden_dim=hidden_dim)
    elif alg == 'klac':
        kl_coef, gradient_norm_panelty, gradient_norm_limit = alg_params['kl_coef'], alg_params['gradient_norm_panelty'], alg_params['gradient_norm_limit']
        alg_name = 'klac_klcoef_{}_grad_norm_panelty_{}_grad_norm_limit_{}'.format(kl_coef, gradient_norm_panelty, gradient_norm_limit)
        model = KLAC(state_dim, action_dim, max_action, kl_coef=kl_coef, gradient_norm_panelty=gradient_norm_panelty,
                     gradient_norm_limit=gradient_norm_limit, hidden_dim=hidden_dim)
        trajectory = trajectory_train
    elif alg == 'bopah_single':
        kl_coef, gradient_norm_panelty, gradient_norm_limit = alg_params['kl_coef'], alg_params['gradient_norm_panelty'], alg_params['gradient_norm_limit']
        alg_name = 'bopah_single_klcoef_{}_grad_norm_panelty_{}_grad_norm_limit_{}'.format(kl_coef, gradient_norm_panelty, gradient_norm_limit)
        model = BOPAHSingle(state_dim, action_dim, max_action, kl_coef=kl_coef, gradient_norm_panelty=gradient_norm_panelty,
                            gradient_norm_limit=gradient_norm_limit, hidden_dim=hidden_dim)
        trajectory = trajectory_train
        batch_trajectory = trajectory_valid
    elif alg == 'bopah':
        kl_coef, gradient_norm_panelty, gradient_norm_limit, dependent_limit, num_clusters \
            = alg_params['kl_coef'], alg_params['gradient_norm_panelty'], alg_params['gradient_norm_limit'], alg_params['dependent_limit'], alg_params['num_clusters']
        alg_name = 'bopah_klcoef_{}_grad_norm_panelty_{}_grad_norm_limit_{}_dependent_limit_{}'.format(
            kl_coef, gradient_norm_panelty, gradient_norm_limit, dependent_limit)
        if alg_params.get('total_loss'):
            alg_name += '_total_loss'
        cluster_info = generate_cluster(env_name, trained_policy_seed, trained_policy_step, trajectory_episode,
                                        num_clusters, trajectory_seed, hidden_dim=hidden_dim)
        model = BOPAH(trajectory_train, trajectory_valid, state_dim, action_dim, max_action, kl_coef=kl_coef,
                      gradient_norm_panelty=gradient_norm_panelty, gradient_norm_limit=gradient_norm_limit,
                      hidden_dim=hidden_dim, cluster_info=cluster_info, dependent_limit=dependent_limit,
                      seed=seed, total_loss=alg_params.get('total_loss'))
        trajectory = trajectory_train
        batch_trajectory = trajectory_valid
    elif alg == 'bcq':
        alg_name += '_perturb_{}'.format(alg_params['perturb'])
        model = BCQ(state_dim, action_dim, max_action, trajectory=trajectory_all, hidden_dim=hidden_dim,
                    perturb=alg_params['perturb'])
    elif alg == 'bear':
        alg_name += '_thres_{}'.format(alg_params['thres'])
        model = BEAR(state_dim, action_dim, max_action, hidden_dim=hidden_dim, threshold=alg_params['thres'])
    else:
        raise NotImplementedError()

    # Set result path
    result_dir = "eval_results/%s/seed_%d/step_%d/trajectory_%d/seed_%d_hidden_%d/%s" % (
        env_name, trained_policy_seed, trained_policy_step, trajectory_episode, trajectory_seed, hidden_dim, alg_name)
    os.makedirs(result_dir, exist_ok=True)
    result_filepath = "%s/seed_%d.npy" % (result_dir, seed)
    if os.path.exists(result_filepath):
        print('Result file already exists: %s' % result_filepath)
        return np.load(result_filepath, allow_pickle=True)[()]

    # Run algorithm and save the result
    print('==============================================')
    print('Run: ', result_filepath)
    vec_env = make_vectorized_env(env_name)  # for policy evaluation
    eval_timesteps, evals, info_values = model.batch_learn(trajectory, vec_env, total_timesteps=total_timesteps,
                                                           log_interval=log_interval, seed=seed,
                                                           result_filepath=result_filepath,
                                                           valid_trajectory=batch_trajectory)
    result = {'eval_timesteps': eval_timesteps, 'evals': evals, 'info_values': info_values}
    np.save(result_filepath, result)
    os.remove(result_filepath + '.tmp.npy')
    return result

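# Illustrative usage (hedged example): run BOPAH on a batch of 100 trajectories. The required
# alg_params keys follow the branches above; every numeric/boolean value here is a placeholder.
# result = run('Hopper', trained_policy_seed=0, trained_policy_step=1000000,
#              trajectory_episode=100, trajectory_seed=0, alg='bopah', total_timesteps=500000, seed=0,
#              alg_params={'kl_coef': 1.0, 'gradient_norm_panelty': 0.0, 'gradient_norm_limit': 10.0,
#                          'dependent_limit': 1, 'num_clusters': 10})
# print(result['eval_timesteps'], result['evals'])
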
def generate_trajectory(env_name, trained_policy_seed, trained_policy_step, num_episodes, seed, hidden_dim=64):
    """
    :return: trajectory
    - list of [(obs, action, reward, next_obs, done), ... ]
    - len(trajectory): num_episodes
    - len(trajectory[0]): time steps of 0th episode
    """
    save_dir = "batch_trajectory/{}/seed_{}_hidden_{}/step_{}".format(
        env_name, trained_policy_seed, hidden_dim, trained_policy_step)
    os.makedirs(save_dir, exist_ok=True)
    trajectory_filepath = '%s/episode_%d_seed_%d.npy' % (save_dir, num_episodes, seed)

    if os.path.exists(trajectory_filepath):
        trajectory_result = np.load(trajectory_filepath, allow_pickle=True)
        print('Trajectory has already been generated: %s...' % trajectory_filepath)
    else:
        print('%s does not exist... generating trajectories...' % trajectory_filepath)
        env = gym.make(precise_env_name(env_name))
        env.seed(seed)
        set_global_seeds(seed)
        if trained_policy_seed != 'uniform':
            trained_agent = load_trained_agent(env_name, trained_policy_seed, trained_policy_step, seed=seed, hidden_dim=hidden_dim)

        trajectory_result = []
        for episode in tqdm(range(num_episodes), desc='generate_trajectory', ncols=70):
            obs = env.reset()
            trajectory_one = []
            for t in range(10000):
                if trained_policy_seed != 'uniform':
                    action, _ = trained_agent.predict(obs, deterministic=False)
                else:
                    action = env.action_space.sample()
                next_obs, reward, done, info = env.step(action)
                # Do not treat time-limit truncation as a true terminal state.
                terminal = done
                if info.get('TimeLimit.truncated'):
                    terminal = False
                trajectory_one.append((obs, action, reward, next_obs, terminal))
                if done:
                    break
                obs = next_obs
            trajectory_result.append(trajectory_one)
        trajectory_result = np.array(trajectory_result)
        np.save(trajectory_filepath, trajectory_result)
    return trajectory_result

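# Illustrative usage (hedged example): collect the batch dataset from a trained policy, or pass
# trained_policy_seed='uniform' to collect it with uniformly random actions. Values are placeholders.
# batch = generate_trajectory('Hopper', trained_policy_seed=0, trained_policy_step=1000000,
#                             num_episodes=100, seed=0)
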