def _get_best_single_option_policy(self):
    best_returns = float('-inf')
    best_z = None
    for z in range(self._num_skills):
        fixed_z_policy = FixedOptionPolicy(self._policy, self._num_skills, z)
        paths = rollouts(self._eval_env, fixed_z_policy,
                         self._max_path_length, self._best_skill_n_rollouts,
                         render=False)
        total_returns = np.mean([path['rewards'].sum() for path in paths])
        if total_returns > best_returns:
            best_returns = total_returns
            best_z = z
    return FixedOptionPolicy(self._policy, self._num_skills, best_z)
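# Hedged sketch (not the repository's implementation): FixedOptionPolicy is
# assumed to wrap a skill-conditioned policy by appending a fixed one-hot skill
# vector to every observation before querying the underlying policy. The class
# name and constructor arguments mirror how it is called above; everything else
# below is illustrative.
import numpy as np

class FixedOptionPolicySketch:
    def __init__(self, base_policy, num_skills, z):
        self._base_policy = base_policy
        self._num_skills = num_skills
        self._z = z

    def get_action(self, obs):
        # Concatenate the one-hot encoding of skill z onto the raw observation,
        # then delegate to the skill-conditioned base policy.
        one_hot = np.zeros(self._num_skills)
        one_hot[self._z] = 1.0
        aug_obs = np.concatenate([obs, one_hot])
        return self._base_policy.get_action(aug_obs)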
def dump_trace(picklefile: str, args):
    filename = '{}_{}_{}_trace.png'.format(
        os.path.splitext(picklefile)[0], args.dim_0, args.dim_1)
    with tf.Session(), tf.variable_scope(picklefile):
        data = joblib.load(picklefile)
        policy = data['policy']
        env = data['env']
        num_skills = (data['policy'].observation_space.flat_dim
                      - data['env'].spec.observation_space.flat_dim)
        plt.figure(figsize=(6, 6))
        palette = sns.color_palette('hls', num_skills)
        with policy.deterministic(args.deterministic):
            skills = (range(num_skills)
                      if args.specific_skill == _use_all_skills
                      else [args.specific_skill])
            for z in skills:
                fixed_z_policy = FixedOptionPolicy(policy, num_skills, z)
                for path_index in range(args.n_paths):
                    obs = env.reset()
                    if args.use_qpos:
                        qpos = env.wrapped_env.env.model.data.qpos[:, 0]
                        obs_vec = [qpos]
                    else:
                        obs_vec = [obs]
                    for t in range(args.max_path_length):
                        action, _ = fixed_z_policy.get_action(obs)
                        (obs, _, _, _) = env.step(action)
                        if args.use_qpos:
                            qpos = env.wrapped_env.env.model.data.qpos[:, 0]
                            obs_vec.append(qpos)
                        elif args.use_action:
                            obs_vec.append(action)
                        else:
                            obs_vec.append(obs)
                    obs_vec = np.array(obs_vec)
                    x = obs_vec[:, args.dim_0]
                    y = obs_vec[:, args.dim_1]
                    plt.plot(x, y, c=palette[z])
        use_plot_lims = np.isfinite(env.observation_space.bounds).all()
        if use_plot_lims:
            xlim, ylim = np.asarray(env.observation_space.bounds).T
            plt.xlim(xlim)
            plt.ylim(ylim)
        plt.savefig(filename)
        plt.close()
def _save_traces(self, filename):
    utils._make_dir(filename)
    obs_vec = []
    for z in range(self._num_skills):
        fixed_z_policy = FixedOptionPolicy(self._policy, self._num_skills, z)
        paths = rollouts(self._eval_env, fixed_z_policy,
                         self._max_path_length, n_paths=3, render=False)
        obs_vec.append([path['observations'].tolist() for path in paths])
    with open(filename, 'w') as f:
        json.dump(obs_vec, f)
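# Hedged sketch of the sampling helper that the functions above rely on:
# `rollouts` is assumed to return a list of path dicts, each holding at least
# 'observations' and 'rewards' as NumPy arrays. This is an illustrative
# re-implementation under that assumption, not the project's own sampler
# (which, for example, also collects rendered frames when render_mode is set).
import numpy as np

def rollouts_sketch(env, policy, max_path_length, n_paths,
                    render=False, render_mode=None):
    paths = []
    for _ in range(n_paths):
        obs = env.reset()
        observations, rewards = [], []
        for _ in range(max_path_length):
            action, _ = policy.get_action(obs)
            obs, reward, done, _ = env.step(action)
            observations.append(obs)
            rewards.append(reward)
            if render:
                env.render()  # the real helper would also store frames for video
            if done:
                break
        paths.append({'observations': np.array(observations),
                      'rewards': np.array(rewards)})
    return paths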
def collect_expert_trajectories(expert_snapshot, max_path_length):
    """Roll out each skill once (deterministically) from a saved snapshot."""
    tf.logging.info('Collecting expert trajectories')
    with tf.Session() as sess:
        data = joblib.load(expert_snapshot)
        policy = data['policy']
        env = data['env']
        num_skills = (data['policy'].observation_space.flat_dim
                      - data['env'].spec.observation_space.flat_dim)
        traj_vec = []
        with policy.deterministic(True):
            for z in range(num_skills):
                fixed_z_policy = FixedOptionPolicy(policy, num_skills, z)
                # Use the function argument, not an `args` namespace that is
                # undefined in this scope.
                new_paths = rollouts(env, fixed_z_policy, max_path_length,
                                     n_paths=1)
                path = new_paths[0]
                traj_vec.append(path)
    tf.reset_default_graph()
    return traj_vec
def get_best_skill(policy, env, num_skills, max_path_length):
    """Evaluate every skill with a few deterministic rollouts and return the best one."""
    tf.logging.info('Finding best skill to finetune...')
    reward_list = []
    with policy.deterministic(True):
        for z in range(num_skills):
            fixed_z_policy = FixedOptionPolicy(policy, num_skills, z)
            new_paths = rollouts(env, fixed_z_policy, max_path_length, n_paths=2)
            total_returns = np.mean(
                [path['rewards'].sum() for path in new_paths])
            tf.logging.info('Reward for skill %d = %.3f', z, total_returns)
            reward_list.append(total_returns)
    best_z = np.argmax(reward_list)
    # Log the (float) return with a float format, matching the per-skill log above.
    tf.logging.info('Best skill found: z = %d, reward = %.3f',
                    best_z, reward_list[best_z])
    return best_z
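# Hedged usage sketch: load a trained snapshot, pick the highest-return skill
# with get_best_skill, and freeze it for fine-tuning. The snapshot path is a
# placeholder; get_best_skill and FixedOptionPolicy are the names used above,
# and the num_skills expression mirrors how the other scripts recover it.
import joblib
import tensorflow as tf

def finetune_best_skill_example(snapshot_path, max_path_length=1000):
    with tf.Session():
        data = joblib.load(snapshot_path)
        policy, env = data['policy'], data['env']
        num_skills = (policy.observation_space.flat_dim
                      - env.spec.observation_space.flat_dim)
        best_z = get_best_skill(policy, env, num_skills, max_path_length)
        # Wrap the base policy so it always executes the selected skill.
        return FixedOptionPolicy(policy, num_skills, best_z)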
def main():
    import csv

    parser = argparse.ArgumentParser()
    parser.add_argument('file', type=str, help='Path to the snapshot file.')
    parser.add_argument('--max-path-length', '-l', type=int, default=100)
    parser.add_argument('--speedup', '-s', type=float, default=1)
    parser.add_argument('--deterministic', '-d', dest='deterministic',
                        action='store_true')
    parser.add_argument('--no-deterministic', '-nd', dest='deterministic',
                        action='store_false')
    parser.add_argument('--separate_videos', type=bool, default=False)
    parser.set_defaults(deterministic=True)
    # unity_env args
    parser.add_argument('--idx', type=int, default=0)
    parser.add_argument('--no_graphics', type=bool, default=False)
    args = parser.parse_args()

    filename = os.path.splitext(args.file)[0] + '.avi'
    best_filename = os.path.splitext(args.file)[0] + '_best.avi'
    worst_filename = os.path.splitext(args.file)[0] + '_worst.avi'

    path_list = []
    reward_list = []
    with tf.Session() as sess:
        data = joblib.load(args.file)
        policy = data['policy']
        env = data['env']
        num_skills = (data['policy'].observation_space.flat_dim
                      - data['env'].spec.observation_space.flat_dim)
        with policy.deterministic(args.deterministic):
            for z in range(num_skills):
                fixed_z_policy = FixedOptionPolicy(policy, num_skills, z)
                new_paths = rollouts(env, fixed_z_policy, args.max_path_length,
                                     n_paths=1, render=True,
                                     render_mode='rgb_array')
                path_list.append(new_paths)
                total_returns = np.mean(
                    [path['rewards'].sum() for path in new_paths])
                reward_list.append(total_returns)

                if args.separate_videos:
                    base = os.path.splitext(args.file)[0]
                    end = '_skill_%02d.avi' % z
                    skill_filename = base + end
                    utils._save_video(new_paths, skill_filename)

                # Dump this skill's observations to a per-skill CSV file.
                file_path = args.file.split('/')
                file_path = file_path[-1].split('.')[0]
                file_path = './data/' + file_path
                if not os.path.exists(file_path):
                    os.mkdir(file_path)
                print(file_path)
                with open(file_path + '/path%02d.csv' % z, 'w',
                          newline='') as csvfile:
                    spamwriter = csv.writer(csvfile, delimiter=' ',
                                            quotechar='|',
                                            quoting=csv.QUOTE_MINIMAL)
                    spamwriter.writerow(
                        ['X', '-X', 'Y', '-Y', 'X_speed', 'Y_speed'])
                    for ob in path_list[-1][0]['observations']:
                        spamwriter.writerow(ob)

        if not args.separate_videos:
            paths = [path for paths in path_list for path in paths]
            utils._save_video(paths, filename)

        # Returns are floats, so report them with a float format.
        print('Best reward: %.3f' % np.max(reward_list))
        print('Worst reward: %.3f' % np.min(reward_list))

        # Record extra long videos for best and worst skills:
        best_z = np.argmax(reward_list)
        worst_z = np.argmin(reward_list)
        for (z, filename) in [(best_z, best_filename),
                              (worst_z, worst_filename)]:
            fixed_z_policy = FixedOptionPolicy(policy, num_skills, z)
            new_paths = rollouts(env, fixed_z_policy, 3 * args.max_path_length,
                                 n_paths=1, render=True,
                                 render_mode='rgb_array')
            utils._save_video(new_paths, filename)
        env.terminate()
args = parser.parse_args()
filename = '{}_{}_{}_trace.png'.format(
    os.path.splitext(args.file)[0], args.dim_0, args.dim_1)
with tf.Session() as sess:
    data = joblib.load(args.file)
    policy = data['policy']
    env = data['env']
    num_skills = (data['policy'].observation_space.flat_dim
                  - data['env'].spec.observation_space.flat_dim)
    plt.figure(figsize=(6, 6))
    palette = sns.color_palette('hls', num_skills)
    with policy.deterministic(args.deterministic):
        for z in range(num_skills):
            fixed_z_policy = FixedOptionPolicy(policy, num_skills, z)
            for path_index in range(args.n_paths):
                obs = env.reset()
                if args.use_qpos:
                    qpos = env.wrapped_env.env.model.data.qpos[:, 0]
                    obs_vec = [qpos]
                else:
                    obs_vec = [obs]
                for t in range(args.max_path_length):
                    action, _ = fixed_z_policy.get_action(obs)
                    (obs, _, _, _) = env.step(action)
                    if args.use_qpos:
                        qpos = env.wrapped_env.env.model.data.qpos[:, 0]
                        obs_vec.append(qpos)
                    elif args.use_action:
                        obs_vec.append(action)
filename = os.path.splitext(args.file)[0] + '.avi'
best_filename = os.path.splitext(args.file)[0] + '_best.avi'
worst_filename = os.path.splitext(args.file)[0] + '_worst.avi'

path_list = []
reward_list = []
with tf.Session() as sess:
    data = joblib.load(args.file)
    policy = data['policy']
    env = data['env']
    num_skills = (data['policy'].observation_space.flat_dim
                  - data['env'].spec.observation_space.flat_dim)
    with policy.deterministic(args.deterministic):
        for z in range(num_skills):
            fixed_z_policy = FixedOptionPolicy(policy, num_skills, z)
            new_paths = rollouts(env, fixed_z_policy, args.max_path_length,
                                 n_paths=1, render=True,
                                 render_mode='rgb_array')
            path_list.append(new_paths)
            total_returns = np.mean(
                [path['rewards'].sum() for path in new_paths])
            reward_list.append(total_returns)
            if args.separate_videos:
                base = os.path.splitext(args.file)[0]
                end = '_skill_%02d.avi' % z
                skill_filename = base + end
                utils._save_video(new_paths, skill_filename)
    if not args.separate_videos:
        paths = [path for paths in path_list for path in paths]
path_list = []
reward_list = []
with tf.Session() as sess:
    data = joblib.load(args.file)
    policy = data['policy']
    env = data['env']
    #pdb.set_trace()
    num_skills = get_num_skills(policy, env, args.concat_type)
    #num_skills = data['policy'].observation_space.flat_dim - data['env'].spec.observation_space.flat_dim
    concat_type = args.concat_type
    with policy.deterministic(args.deterministic):
        for z in range(num_skills):
            fixed_z_policy = FixedOptionPolicy(policy, num_skills, z,
                                               concat_type)
            new_paths = rollouts(env, fixed_z_policy, args.max_path_length,
                                 n_paths=1, render=True,
                                 render_mode='rgb_array')
            path_list.append(new_paths)
            total_returns = np.mean(
                [path['rewards'].sum() for path in new_paths])
            reward_list.append(total_returns)
            if args.separate_videos:
                base = os.path.splitext(args.file)[0]
                end = '_skill_%02d.avi' % z
                skill_filename = base + end
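# Hedged sketch of the get_num_skills helper assumed by the fragment above: for
# a plain concatenation-style skill encoding, the skill dimension is simply the
# difference between the policy's augmented observation space and the raw
# environment observation space, matching the commented-out inline expression.
# The 'concatenation' label and the handling of other concat_type values are
# assumptions, not the repository's actual logic.
def get_num_skills_sketch(policy, env, concat_type):
    if concat_type == 'concatenation':
        return (policy.observation_space.flat_dim
                - env.spec.observation_space.flat_dim)
    raise NotImplementedError(
        'concat_type %r is not handled in this sketch' % concat_type)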