def configure(folder: str, format_strs: Optional[Sequence[str]] = None) -> None:
    """Configure Stable Baselines logger to be `accumulate_means()`-compatible.

    After this function is called, `stable_baselines.logger.{configure,reset}()`
    are replaced with stubs that raise RuntimeError.

    Args:
        folder: Argument from `stable_baselines.logger.configure`.
        format_strs: A list of output format strings. For details on available
            output formats see `stable_baselines.logger.make_output_format`.
    """
    # Replace `stable_baselines.logger` methods with erroring stubs to
    # prevent unexpected logging state from mixed logging configuration.
    sb_logger.configure = _sb_logger_configure_replacement
    sb_logger.reset = _sb_logger_reset_replacement
    if format_strs is None:
        format_strs = ["stdout", "log", "csv"]
    output_formats = _build_output_formats(folder, format_strs)
    default_logger = sb_logger.Logger(folder, output_formats)
    hier_logger = _HierarchicalLogger(default_logger, format_strs)
    sb_logger.Logger.CURRENT = hier_logger
    sb_logger.log("Logging to %s" % folder)
    assert is_configured()
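# Hedged usage sketch (not from this repo): configure() above is assumed to be
# importable from this module; the output folder and format names below are
# made up purely for illustration.
configure("output/experiment_1", format_strs=["stdout", "csv"])
sb_logger.log("hierarchical logging is now active")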
def do_ppo(args, start_theta, parent_this_run_dir, full_space_save_dir): """ Runs the test """ logger.log(f"#######CMA and then PPO TRAIN: {args}") this_conti_ppo_run_dir = get_ppo_part(parent_this_run_dir) log_dir = get_log_dir(this_conti_ppo_run_dir) conti_ppo_save_dir = get_save_dir(this_conti_ppo_run_dir) logger.configure(log_dir) full_param_traj_dir_path = get_full_params_dir(this_conti_ppo_run_dir) if os.path.exists(full_param_traj_dir_path): import shutil shutil.rmtree(full_param_traj_dir_path) os.makedirs(full_param_traj_dir_path) if os.path.exists(conti_ppo_save_dir): import shutil shutil.rmtree(conti_ppo_save_dir) os.makedirs(conti_ppo_save_dir) def make_env(): env_out = gym.make(args.env) env_out.env.disableViewer = True env_out.env.visualize = False env_out = bench.Monitor(env_out, logger.get_dir(), allow_early_resets=True) return env_out env = DummyVecEnv([make_env]) if args.normalize: env = VecNormalize(env) model = PPO2.load(f"{full_space_save_dir}/ppo2") model.set_from_flat(start_theta) if args.normalize: env.load_running_average(full_space_save_dir) model.set_env(env) run_info = {"run_num": args.run_num, "env_id": args.env, "full_param_traj_dir_path": full_param_traj_dir_path} # model = PPO2(policy=policy, env=env, n_steps=args.n_steps, nminibatches=args.nminibatches, lam=0.95, gamma=0.99, # noptepochs=10, # ent_coef=0.0, learning_rate=3e-4, cliprange=0.2, optimizer=args.optimizer) model.tell_run_info(run_info) episode_returns = model.learn(total_timesteps=args.ppo_num_timesteps) model.save(f"{conti_ppo_save_dir}/ppo2") env.save_running_average(conti_ppo_save_dir) return episode_returns, full_param_traj_dir_path
def train(args): total_timesteps = int(args.num_timesteps) seed = args.seed # get params alg_kwargs, policy_kwargs = get_params(args) env = build_env(args) policy = get_policy(args) # if args.use_typeVector: # model = policy(CnnVectorPolicy, env, verbose=1, policy_kwargs=policy_kwargs, **alg_kwargs) # else: # model = policy(CnnPolicy, env, verbose=1, policy_kwargs=policy_kwargs, **alg_kwargs) model = policy(CnnPolicy, env, verbose=1, policy_kwargs=policy_kwargs, **alg_kwargs) model.learn( total_timesteps=total_timesteps, log_interval=args.log_interval, # save_interval=args.save_interval ) logger.log('Trained Over.') return model, env
def main():
    # requires n_comp_to_use, pc1_chunk_size
    import sys
    logger.log(sys.argv)
    common_arg_parser = get_common_parser()
    cma_args, cma_unknown_args = common_arg_parser.parse_known_args()

    this_run_dir = get_dir_path_for_this_run(cma_args)
    traj_params_dir_name = get_full_params_dir(this_run_dir)
    intermediate_data_dir = get_intermediate_data_dir(this_run_dir)
    save_dir = get_save_dir(this_run_dir)

    if not os.path.exists(intermediate_data_dir):
        os.makedirs(intermediate_data_dir)

    '''
    ==========================================================================
    get the pc vectors
    ==========================================================================
    '''
    result = do_pca(cma_args.n_components, cma_args.n_comp_to_use,
                    traj_params_dir_name, intermediate_data_dir,
                    proj=False, origin="mean_param",
                    use_IPCA=cma_args.use_IPCA,
                    chunk_size=cma_args.chunk_size, reuse=True)
    logger.debug("after pca")
    final_pcs = result["first_n_pcs"]

    all_param_iterator = get_allinone_concat_df(dir_name=traj_params_dir_name,
                                                use_IPCA=True,
                                                chunk_size=cma_args.pc1_chunk_size)
    plane_angles_vs_final_plane_along_the_way = []
    # incremental PCA so the full parameter trajectory never has to fit in memory
    ipca = IncrementalPCA(n_components=cma_args.n_comp_to_use)
    for chunk in all_param_iterator:
        logger.log(f"currently at {all_param_iterator._currow}")
        ipca.partial_fit(chunk.values)
        first_n_pcs = ipca.components_[:cma_args.n_comp_to_use]
        assert final_pcs.shape[0] == first_n_pcs.shape[0]

        plane_angle = cal_angle_between_nd_planes(first_n_pcs, final_pcs)
        plane_angles_vs_final_plane_along_the_way.append(plane_angle)

    plot_dir = get_plot_dir(cma_args)
    if not os.path.exists(plot_dir):
        os.makedirs(plot_dir)

    plane_angles_vs_final_plane_plot_dir = get_plane_angles_vs_final_plane_along_the_way_plot_dir(
        plot_dir, cma_args.n_comp_to_use)
    if not os.path.exists(plane_angles_vs_final_plane_plot_dir):
        os.makedirs(plane_angles_vs_final_plane_plot_dir)

    angles_plot_name = "plane_angles_vs_final_plane_along_the_way"
    plot_2d(plane_angles_vs_final_plane_plot_dir, angles_plot_name,
            np.arange(len(plane_angles_vs_final_plane_along_the_way)),
            plane_angles_vs_final_plane_along_the_way,
            "num of chunks", "angle with diff in degrees", False)
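# Hedged sketch (an assumption, not this repo's implementation): cal_angle_between_nd_planes()
# used above is assumed to measure how far apart two k-dimensional subspaces are.
# One common definition is the largest principal angle, obtained from the singular
# values of the product of orthonormal bases of the two subspaces.
import numpy as np

def principal_angle_between_subspaces(A, B):
    """Largest principal angle, in degrees, between the row spaces of A and B."""
    Qa, _ = np.linalg.qr(A.T)                      # orthonormal basis of span(rows of A)
    Qb, _ = np.linalg.qr(B.T)                      # orthonormal basis of span(rows of B)
    cosines = np.linalg.svd(Qa.T @ Qb, compute_uv=False)
    cosines = np.clip(cosines, -1.0, 1.0)          # guard against round-off
    return np.degrees(np.arccos(cosines.min()))    # smallest cosine -> largest angle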
def do_cma(cma_args, first_n_pcs, origin_param, save_dir, starting_coord, var):
    tic = time.time()

    # TODO: better starting locations, record how many samples
    logger.log(f"CMAES STARTING :{starting_coord}")
    es = cma.CMAEvolutionStrategy(starting_coord, var)
    total_num_of_evals = 0
    total_num_timesteps = 0

    mean_rets = []
    min_rets = []
    max_rets = []
    eval_returns = None

    optimization_path = []
    while total_num_timesteps < cma_args.cma_num_timesteps and not es.stop():
        solutions = es.ask()
        optimization_path.extend(solutions)
        # map each low-dimensional candidate back to the full parameter space
        thetas = [np.matmul(coord, first_n_pcs) + origin_param for coord in solutions]
        logger.log(f"current time steps num: {total_num_timesteps} "
                   f"total time steps: {cma_args.cma_num_timesteps}")
        eval_returns = Parallel(n_jobs=cma_args.cores_to_use) \
            (delayed(eval_return)(cma_args, save_dir, theta, cma_args.eval_num_timesteps, i)
             for (i, theta) in enumerate(thetas))

        mean_rets.append(np.mean(eval_returns))
        min_rets.append(np.min(eval_returns))
        max_rets.append(np.max(eval_returns))

        total_num_of_evals += len(eval_returns)
        total_num_timesteps += cma_args.eval_num_timesteps * len(eval_returns)

        logger.log(f"current eval returns: {str(eval_returns)}")
        logger.log(f"total timesteps so far: {total_num_timesteps}")

        negative_eval_returns = [-r for r in eval_returns]

        es.tell(solutions, negative_eval_returns)
        es.logger.add()  # write data to disk to be plotted
        es.disp()

    toc = time.time()
    logger.log(f"####################################CMA took {toc - tic} seconds")

    es_logger = es.logger
    if not hasattr(es_logger, 'xmean'):
        es_logger.load()

    n_comp_used = first_n_pcs.shape[0]
    optimization_path_mean = np.vstack(
        (starting_coord, es_logger.xmean[:, 5:5 + n_comp_used]))

    return (mean_rets, min_rets, max_rets,
            np.array(optimization_path), np.array(optimization_path_mean))
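# Hypothetical call-site sketch for do_cma(); shapes and values below are
# illustrative only, and `cma_args` / `save_dir` are assumed to already be in
# scope (parsed args and the model save directory, as in the other scripts).
# Each low-dimensional CMA candidate `coord` is mapped back to full parameter
# space as coord @ first_n_pcs + origin_param, matching the loop above.
first_n_pcs = np.random.randn(2, 10000)   # 2 principal directions (assumed dim)
origin_param = np.zeros(10000)            # e.g. mean of the parameter trajectory
starting_coord = np.zeros(2)              # start CMA at the subspace origin
mean_rets, min_rets, max_rets, opt_path, opt_path_mean = do_cma(
    cma_args, first_n_pcs, origin_param, save_dir, starting_coord, var=1.0)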
def main(): # Parse command line args parser = arg_parser() parser.add_argument("-ns", "--num-timesteps", type=str, default="1e6") parser.add_argument("-hw", "--use-hardware", action="store_true") parser.add_argument("-ld", "--logdir", type=str, default="logs") parser.add_argument("-l", "--load", type=str, default=None) parser.add_argument("-s", "--save", action="store_true") parser.add_argument("-si", "--save-interval", type=float, default=5e4) parser.add_argument("-p", "--play", action="store_true") parser.add_argument("-sd", "--seed", type=int, default=-1) parser.add_argument( "-o", "--output-formats", nargs="*", default=["stdout", "log", "csv"] ) args = parser.parse_args() # Set default seed if args.seed == -1: seed = np.random.randint(1, 1000) print("Seed is", seed) else: seed = args.seed device_type = "hardware" if args.use_hardware else "simulator" logdir = "{}/{}/{}/{}/seed-{}".format( args.logdir, device_type, "QubeSwingupEnv", args.num_timesteps, str(seed) ) logger.configure(logdir, args.output_formats) # Round save interval to a multiple of 2048 save_interval = int(np.ceil(args.save_interval / 2048)) if args.save else 0 # Run training script (+ loading/saving) model, env = train( QubeSwingupEnv, num_timesteps=int(float(args.num_timesteps)), hardware=args.use_hardware, logdir=logdir, save=args.save, save_interval=save_interval, load=args.load, seed=seed, ) if args.play: logger.log("Running trained model") obs = np.zeros((env.num_envs,) + env.observation_space.shape) obs[:] = env.reset() while True: actions = model.step(obs)[0] obs[:] = env.step(actions)[0] if not args.use_hardware: env.render() env.close()
def main(): import sys logger.log(sys.argv) common_arg_parser = get_common_parser() args, cma_unknown_args = common_arg_parser.parse_known_args() this_run_dir = get_dir_path_for_this_run(args) plot_dir_alg = get_plot_dir(args) traj_params_dir_name = get_full_params_dir(this_run_dir) intermediate_data_dir = get_intermediate_data_dir(this_run_dir, params_scope="pi") save_dir = get_save_dir(this_run_dir) if not os.path.exists(intermediate_data_dir): os.makedirs(intermediate_data_dir) if not os.path.exists(plot_dir_alg): os.makedirs(plot_dir_alg) final_file = get_full_param_traj_file_path(traj_params_dir_name, "pi_final") final_params = pd.read_csv(final_file, header=None).values[0] def make_env(): env_out = gym.make(args.env) env_out = bench.Monitor(env_out, logger.get_dir(), allow_early_resets=True) return env_out env = DummyVecEnv([make_env]) if args.normalize: env = VecNormalize(env) model = PPO2.load(f"{save_dir}/ppo2") # this also loads V function model.set_pi_from_flat(final_params) if args.normalize: env.load_running_average(save_dir) obz_tensor = model.act_model.fake_input_tensor some_neuron = model.act_model.policy_neurons[2][-1] grads = tf.gradients(tf.math.negative(some_neuron), obz_tensor) grads = list(zip(grads, obz_tensor)) trainer = tf.train.AdamOptimizer(learning_rate=0.01, epsilon=1e-5) train_op = trainer.apply_gradients(grads) for i in range(10000): obz, _ = model.sess.run([obz_tensor, train_op])
def main(): import sys logger.log(sys.argv) common_arg_parser = get_common_parser() cma_args, cma_unknown_args = common_arg_parser.parse_known_args() run_nums = cma_args.run_nums_to_check run_nums = [int(run_num) for run_num in run_nums.split(":")] final_params_list = [] start_params_list = [] for run_num in run_nums: cma_args.run_num = run_num if os.path.exists(get_dir_path_for_this_run(cma_args)): this_run_dir = get_dir_path_for_this_run(cma_args) plot_dir_alg = get_plot_dir(cma_args) traj_params_dir_name = get_full_params_dir(this_run_dir) intermediate_data_dir = get_intermediate_data_dir( this_run_dir, params_scope="pi") save_dir = get_save_dir(this_run_dir) if not os.path.exists(intermediate_data_dir): os.makedirs(intermediate_data_dir) if not os.path.exists(plot_dir_alg): os.makedirs(plot_dir_alg) start_file = get_full_param_traj_file_path(traj_params_dir_name, "pi_start") start_params = pd.read_csv(start_file, header=None).values[0] final_file = get_full_param_traj_file_path(traj_params_dir_name, "pi_final") final_params = pd.read_csv(final_file, header=None).values[0] final_params_list.append(final_params) start_params_list.append(start_params) cma_args.run_num += 1 final_params_distances = [] for i in range(len(final_params_list)): for j in range(i + 1, len(final_params_list)): final_params_distances.append( LA.norm(final_params_list[i] - final_params_list[j], ord=2)) plot_dir = get_plot_dir(cma_args) if not os.path.exists(plot_dir): os.makedirs(plot_dir) np.savetxt(f"{plot_dir}/final_params_distances.txt", final_params_distances, delimiter=",")
def do_eval_returns(plot_args, intermediate_data_dir, two_pcs_eval, origin_param, xcoordinates_to_eval, ycoordinates_to_eval, save_dir, pca_center="final_param", reuse=True): eval_string = f"xnum_{np.min(xcoordinates_to_eval)}:{np.max(xcoordinates_to_eval)}:{plot_args.xnum}_" \ f"ynum_{np.min(ycoordinates_to_eval)}:{np.max(ycoordinates_to_eval)}:{plot_args.ynum}" if not reuse or not os.path.exists( get_eval_returns_filename(intermediate_dir=intermediate_data_dir, eval_string=eval_string, n_comp=2, pca_center=pca_center)): from stable_baselines.ppo2.run_mujoco import eval_return thetas_to_eval = [ origin_param + x * two_pcs_eval[0] + y * two_pcs_eval[1] for y in ycoordinates_to_eval for x in xcoordinates_to_eval ] tic = time.time() eval_returns = Parallel(n_jobs=plot_args.cores_to_use, max_nbytes='100M')\ (delayed(eval_return)(plot_args, save_dir, theta, plot_args.eval_num_timesteps, i) for (i, theta) in enumerate(thetas_to_eval)) toc = time.time() logger.log( f"####################################1st version took {toc-tic} seconds" ) np.savetxt(get_eval_returns_filename( intermediate_dir=intermediate_data_dir, eval_string=eval_string, n_comp=2, pca_center=pca_center), eval_returns, delimiter=',') else: eval_returns = np.loadtxt(get_eval_returns_filename( intermediate_dir=intermediate_data_dir, eval_string=eval_string, n_comp=2, pca_center=pca_center), delimiter=',') return eval_returns
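# Hedged call-site sketch for do_eval_returns(): the x/y evaluation grids are
# assumed to be built with np.linspace over the range spanned by the projected
# trajectory; the bounds below are placeholders, and `plot_args`, `two_pcs_eval`,
# `origin_param`, `intermediate_data_dir`, `save_dir` are assumed in scope.
xcoords = np.linspace(-10.0, 10.0, num=plot_args.xnum)
ycoords = np.linspace(-10.0, 10.0, num=plot_args.ynum)
returns_grid = do_eval_returns(plot_args, intermediate_data_dir, two_pcs_eval,
                               origin_param, xcoords, ycoords, save_dir,
                               pca_center="final_param", reuse=True)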
def main(): import sys logger.log(sys.argv) common_arg_parser = get_common_parser() cma_args, cma_unknown_args = common_arg_parser.parse_known_args() this_run_dir = get_dir_path_for_this_run(cma_args) traj_params_dir_name = get_full_params_dir(this_run_dir) intermediate_data_dir = get_intermediate_data_dir(this_run_dir) save_dir = get_save_dir(this_run_dir) if not os.path.exists(intermediate_data_dir): os.makedirs(intermediate_data_dir) ''' ========================================================================================== get the pc vectors ========================================================================================== ''' from stable_baselines.low_dim_analysis.common import do_pca, plot_2d origin = "mean_param" result = do_pca(cma_args.n_components, cma_args.n_comp_to_use, traj_params_dir_name, intermediate_data_dir, proj=False, origin=origin, use_IPCA=cma_args.use_IPCA, chunk_size=cma_args.chunk_size) final_params = result["final_concat_params"] all_pcs = result["pcs_components"] logger.log("grab start params") start_file = get_full_param_traj_file_path(traj_params_dir_name, "start") start_params = pd.read_csv(start_file, header=None).values[0] angles = [] for pc in all_pcs: angles.append(cal_angle(pc, final_params - start_params)) plot_dir = get_plot_dir(cma_args) if not os.path.exists(plot_dir): os.makedirs(plot_dir) angles_plot_name = f"angles with final - start start n_comp:{all_pcs.shape[0]} dim space of mean pca plane, " plot_2d(plot_dir, angles_plot_name, np.arange(all_pcs.shape[0]), angles, "num of pcs", "angle with diff", False)
def log_info(self):
    """ log the information of the dataset """
    logger.log("Total trajectories: %d" % self.num_traj)
    logger.log("Total transitions: %d" % self.num_transition)
    logger.log("Average returns: %f" % self.avg_ret)
    logger.log("Std for returns: %f" % self.std_ret)
def log_info(self): """ Log the information of the dataset. """ logger.log("Total trajectories: {}".format(self.num_traj)) logger.log("Total transitions: {}".format(self.num_transition)) logger.log("Average returns: {}".format(self.avg_ret)) logger.log("Std for returns: {}".format(self.std_ret))
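# Hedged sketch of how the statistics logged above are assumed to be computed
# from per-trajectory reward lists; the names below are illustrative only.
returns = [float(np.sum(rewards)) for rewards in per_trajectory_rewards]
avg_ret, std_ret = np.mean(returns), np.std(returns)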
def plot_cma_returns(plot_dir_alg, name, mean_rets, min_rets, max_rets, show):
    X = np.arange(len(mean_rets))
    fig, ax = plt.subplots()
    plt.xlabel('num of eval')
    plt.ylabel('mean returns with min and max filled')

    ax.plot(X, mean_rets)
    ax.fill_between(X, min_rets, max_rets, alpha=0.5)

    file_path = f"{plot_dir_alg}/{name}.pdf"
    if os.path.isfile(file_path):
        os.remove(file_path)

    logger.log(f"saving cma plot to {file_path}")
    fig.savefig(file_path, dpi=300, bbox_inches='tight', format='pdf')
    if show:
        plt.show()
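# Minimal usage sketch for plot_cma_returns() with synthetic per-generation
# statistics, just to show the expected argument shapes; the output directory
# is a made-up example.
import os
os.makedirs("plots/cma", exist_ok=True)
mean_rets = [100.0, 150.0, 210.0]
min_rets = [80.0, 120.0, 190.0]
max_rets = [130.0, 180.0, 240.0]
plot_cma_returns("plots/cma", "returns_over_generations",
                 mean_rets, min_rets, max_rets, show=False)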
def main(): """ Runs the test """ args = mujoco_arg_parser().parse_args() logger.configure() model, env = train(args.env, num_timesteps=args.num_timesteps, seed=args.seed) if args.play: logger.log("Running trained model") obs = np.zeros((env.num_envs,) + env.observation_space.shape) obs[:] = env.reset() while True: actions = model.step(obs)[0] obs[:] = env.step(actions)[0] env.render('human')
def plot_2d(plot_dir_alg, name, X, Y, xlabel, ylabel, show): fig, ax = plt.subplots() plt.xlabel(xlabel) plt.ylabel(ylabel) ax.plot(X, Y) file_path = f"{plot_dir_alg}/{name}.pdf" if os.path.isfile(file_path): os.remove(file_path) if not os.path.exists(plot_dir_alg): os.makedirs(plot_dir_alg) logger.log(f"####saving to {file_path}") fig.savefig(file_path, dpi=300, bbox_inches='tight', format='pdf') if show: plt.show()
def train(args):
    total_timesteps = int(args.num_timesteps)
    seed = args.seed

    # get params
    alg_kwargs = get_params(args)
    env = build_env(args)

    model = PPO2(CnnPolicy, env, verbose=1, **alg_kwargs)
    model.learn(total_timesteps=total_timesteps,
                log_interval=args.log_interval,
                save_interval=args.save_interval)

    logger.log('Trained Over.')
    return model, env
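# Hypothetical driver sketch for train(); how `args` is parsed is an assumption
# here (build_args is a made-up helper), only the fields used above
# (num_timesteps, seed, log_interval, save_interval) matter for the call.
if __name__ == "__main__":
    args = build_args()  # hypothetical helper returning an argparse.Namespace
    model, env = train(args)
    model.save("ppo2_cnn")
    env.close()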
def plot_2d_2(plot_dir_alg, name, X, grad_vs_v, pc1_vs_V, xlabel, ylabel, show): fig, ax = plt.subplots() plt.xlabel(xlabel) plt.ylabel(ylabel) ax.plot(X, grad_vs_v) ax.plot(X, pc1_vs_V) plt.legend(['in so far grad_vs_v', 'in so far pc1_vs_V'], loc='upper left') file_path = f"{plot_dir_alg}/{name}.pdf" if os.path.isfile(file_path): os.remove(file_path) logger.log(f"####saving to {file_path}") fig.savefig(file_path, dpi=300, bbox_inches='tight', format='pdf') if show: plt.show()
def main(): """ Runs the test """ args = mujoco_arg_parser().parse_args() logger.configure() train(args.env, num_timesteps=args.num_timesteps, seed=args.seed) env = make_mujoco_env(args.env, args.seed) model = PPO1(MlpPolicy, env, timesteps_per_actorbatch=2048, clip_param=0.2, entcoeff=0.0, optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64, gamma=0.99, lam=0.95, schedule='linear') model.learn(total_timesteps=args.num_timesteps) model.save("ppo1") # env.close() del model # remove to demonstrate saving and loading # env = make_mujoco_env(args.env, args.seed) model = PPO1.load("ppo1") logger.log("~!!!!!!!!") episode_rew = 0 obs = env.reset() while True: action, _states = model.predict(obs) ob, reward, done, info = env.step(action) episode_rew += reward env.render() if done: print(f'episode_rew={episode_rew}') episode_rew = 0 obs = env.reset()
def __init__(self, env_fns, spaces=None): """ If you don't specify observation_space, we'll have to create a dummy environment to get it. """ if spaces: observation_space, action_space = spaces else: logger.log('Creating dummy env object to get spaces') with logger.scoped_configure(format_strs=[]): dummy = env_fns[0]() observation_space, action_space = dummy.observation_space, dummy.action_space dummy.close() del dummy VecEnv.__init__(self, len(env_fns), observation_space, action_space) obs_spaces = observation_space.spaces if isinstance( self.observation_space, gym.spaces.Tuple) else (self.observation_space, ) self.obs_bufs = [ tuple( Array(_NP_TO_CT[s.dtype.type], int(np.prod(s.shape))) for s in obs_spaces) for _ in env_fns ] self.obs_shapes = [s.shape for s in obs_spaces] self.obs_dtypes = [s.dtype for s in obs_spaces] self.parent_pipes = [] self.procs = [] for i, (env_fn, obs_buf) in enumerate(zip(env_fns, self.obs_bufs)): wrapped_fn = CloudpickleWrapper(env_fn) parent_pipe, child_pipe = Pipe() proc = Process(target=_subproc_worker, args=(child_pipe, parent_pipe, wrapped_fn, obs_buf, self.obs_shapes, i)) proc.daemon = True self.procs.append(proc) self.parent_pipes.append(parent_pipe) proc.start() child_pipe.close() self.waiting_step = False
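# Hedged usage sketch: this __init__ is assumed to belong to a shared-memory
# vectorized env (ShmemVecEnv-style class name is an assumption); the env id
# and worker count below are examples only.
import gym
venv = ShmemVecEnv([lambda: gym.make("CartPole-v1") for _ in range(4)])
obs = venv.reset()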
def main(origin="final_param"): import sys logger.log(sys.argv) common_arg_parser = get_common_parser() cma_args, cma_unknown_args = common_arg_parser.parse_known_args() cma_args = { "alg": 'ppo2', "env": "DartHopper-v1", "num_timesteps": 5000, "normalize": True, "n_steps": 2048, "nminibatches": 32, "run_num": 0 } this_run_dir = get_dir_path_for_this_run(cma_args) traj_params_dir_name = get_full_params_dir(this_run_dir) intermediate_data_dir = get_intermediate_data_dir(this_run_dir) save_dir = get_save_dir(this_run_dir) if not os.path.exists(intermediate_data_dir): os.makedirs(intermediate_data_dir) ''' ========================================================================================== get the pc vectors ========================================================================================== ''' from stable_baselines.low_dim_analysis.common import do_pca result = do_pca(cma_args.n_components, cma_args.n_comp_to_use, traj_params_dir_name, intermediate_data_dir, proj=True, origin=origin, use_IPCA=cma_args.use_IPCA, chunk_size=cma_args.chunk_size)
def test(model_path, env, args): policy = get_policy(args) model = policy.load(model_path) test_episode = args.test_episode num_env = args.num_env take_nums = args.take_nums avg_reward = 0 logger.log('Begin testing, total ' + str(test_episode * num_env) + ' episodes...') for i_episode in range(test_episode): obs = env.reset() for _ in range(take_nums): action, _states = model.predict(obs) obs, rewards, dones, info = env.step(action) avg_reward += np.sum(rewards) avg_reward /= (test_episode * num_env) logger.log('Average reward: ' + str(avg_reward))
def main():
    # requires n_comp_to_use, pc1_chunk_size
    import sys
    logger.log(sys.argv)
    common_arg_parser = get_common_parser()
    cma_args, cma_unknown_args = common_arg_parser.parse_known_args()

    this_run_dir = get_dir_path_for_this_run(cma_args)
    traj_params_dir_name = get_full_params_dir(this_run_dir)
    intermediate_data_dir = get_intermediate_data_dir(this_run_dir)
    save_dir = get_save_dir(this_run_dir)

    if not os.path.exists(intermediate_data_dir):
        os.makedirs(intermediate_data_dir)

    '''
    ==========================================================================
    get the pc vectors
    ==========================================================================
    '''
    logger.log("grab final params")
    final_file = get_full_param_traj_file_path(traj_params_dir_name, "final")
    final_params = pd.read_csv(final_file, header=None).values[0]

    logger.log("grab start params")
    start_file = get_full_param_traj_file_path(traj_params_dir_name, "start")
    start_params = pd.read_csv(start_file, header=None).values[0]

    V = final_params - start_params

    pcs_components = np.loadtxt(
        get_pcs_filename(intermediate_dir=intermediate_data_dir,
                         n_comp=cma_args.num_comp_to_load),
        delimiter=',')

    angle = cal_angle(V, pcs_components[0])
    logger.log(f"@@@@@@@@@@@@ {angle}")
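# Hedged sketch of what cal_angle() used above is assumed to compute: the angle,
# in degrees, between two flat parameter vectors.
import numpy as np

def angle_between(u, v):
    cos = np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v))
    return np.degrees(np.arccos(np.clip(cos, -1.0, 1.0)))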
def test(model, env, args): logger.log("Test...") n_episode = args.test_episode num_env = args.num_env state = model.initial_state if hasattr(model, 'initial_state') else None dones = np.zeros((1, )) total_rewards = 0 for i_episode in range(n_episode): obs = env.reset() for i in range(100): if state is not None: actions, _, state, _ = model.step(obs, S=state, M=dones) else: actions, _, _, _ = model.step(obs) obs, rew, done, info = env.step(actions) for r in rew: total_rewards += np.sum(r) done = done[0] if done: break avg_reward = total_rewards / (n_episode * num_env) if args.log: logger.log("Path: ", args.save_dir) logger.log("Test ", n_episode, " episodes, average reward is: ", avg_reward) logger.log("Test over.") else: print("Test ", n_episode, " episodes, average reward is: ", avg_reward) print("Test over.")
def learn(self, total_timesteps, callback=None, log_interval=100, tb_log_name="PPO1", reset_num_timesteps=True): new_tb_log = self._init_num_timesteps(reset_num_timesteps) callback = self._init_callback(callback) with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: self._setup_learn() assert issubclass(self.policy, ActorCriticPolicy), "Error: the input policy for the PPO1 model must be " \ "an instance of common.policies.ActorCriticPolicy." with self.sess.as_default(): self.adam.sync() callback.on_training_start(locals(), globals()) # Prepare for rollouts seg_gen = traj_segment_generator(self.policy_pi, self.env, self.timesteps_per_actorbatch, callback=callback) episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 t_start = time.time() # rolling buffer for episode lengths len_buffer = deque(maxlen=100) # rolling buffer for episode rewards reward_buffer = deque(maxlen=100) while True: if timesteps_so_far >= total_timesteps: break if self.schedule == 'constant': cur_lrmult = 1.0 elif self.schedule == 'linear': cur_lrmult = max( 1.0 - float(timesteps_so_far) / total_timesteps, 0) else: raise NotImplementedError logger.log("********** Iteration %i ************" % iters_so_far) seg = seg_gen.__next__() # Stop training early (triggered by the callback) if not seg.get('continue_training', True): # pytype: disable=attribute-error break add_vtarg_and_adv(seg, self.gamma, self.lam) # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets)) observations, actions = seg["observations"], seg["actions"] atarg, tdlamret = seg["adv"], seg["tdlamret"] # true_rew is the reward without discount if writer is not None: total_episode_reward_logger( self.episode_reward, seg["true_rewards"].reshape( (self.n_envs, -1)), seg["dones"].reshape( (self.n_envs, -1)), writer, self.num_timesteps) # predicted value function before udpate vpredbefore = seg["vpred"] # standardized advantage function estimate atarg = (atarg - atarg.mean()) / atarg.std() dataset = Dataset(dict(ob=observations, ac=actions, atarg=atarg, vtarg=tdlamret), shuffle=not self.policy.recurrent) optim_batchsize = self.optim_batchsize or observations.shape[ 0] # set old parameter values to new parameter values self.assign_old_eq_new(sess=self.sess) logger.log("Optimizing...") logger.log(fmt_row(13, self.loss_names)) # Here we do a bunch of optimization epochs over the data for k in range(self.optim_epochs): # list of tuples, each of which gives the loss for a minibatch losses = [] for i, batch in enumerate( dataset.iterate_once(optim_batchsize)): steps = ( self.num_timesteps + k * optim_batchsize + int(i * (optim_batchsize / len(dataset.data_map)))) if writer is not None: # run loss backprop with summary, but once every 10 runs save the metadata # (memory, compute time, ...) if self.full_tensorboard_log and (1 + k) % 10 == 0: run_options = tf.compat.v1.RunOptions( trace_level=tf.compat.v1.RunOptions. 
FULL_TRACE) run_metadata = tf.compat.v1.RunMetadata() summary, grad, *newlosses = self.lossandgrad( batch["ob"], batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult, sess=self.sess, options=run_options, run_metadata=run_metadata) writer.add_run_metadata( run_metadata, 'step%d' % steps) else: summary, grad, *newlosses = self.lossandgrad( batch["ob"], batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult, sess=self.sess) writer.add_summary(summary, steps) else: _, grad, *newlosses = self.lossandgrad( batch["ob"], batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult, sess=self.sess) self.adam.update(grad, self.optim_stepsize * cur_lrmult) losses.append(newlosses) logger.log(fmt_row(13, np.mean(losses, axis=0))) logger.log("Evaluating losses...") losses = [] for batch in dataset.iterate_once(optim_batchsize): newlosses = self.compute_losses(batch["ob"], batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult, sess=self.sess) losses.append(newlosses) mean_losses, _, _ = mpi_moments(losses, axis=0) logger.log(fmt_row(13, mean_losses)) for (loss_val, name) in zipsame(mean_losses, self.loss_names): logger.record_tabular("loss_" + name, loss_val) logger.record_tabular( "ev_tdlam_before", explained_variance(vpredbefore, tdlamret)) # local values lrlocal = (seg["ep_lens"], seg["ep_rets"]) # list of tuples listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) lens, rews = map(flatten_lists, zip(*listoflrpairs)) len_buffer.extend(lens) reward_buffer.extend(rews) if len(len_buffer) > 0: logger.record_tabular("EpLenMean", np.mean(len_buffer)) logger.record_tabular("EpRewMean", np.mean(reward_buffer)) logger.record_tabular("EpThisIter", len(lens)) episodes_so_far += len(lens) current_it_timesteps = MPI.COMM_WORLD.allreduce( seg["total_timestep"]) timesteps_so_far += current_it_timesteps self.num_timesteps += current_it_timesteps iters_so_far += 1 logger.record_tabular("EpisodesSoFar", episodes_so_far) logger.record_tabular("TimestepsSoFar", self.num_timesteps) logger.record_tabular("TimeElapsed", time.time() - t_start) if self.verbose >= 1 and MPI.COMM_WORLD.Get_rank() == 0: logger.dump_tabular() callback.on_training_end() return self
def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_log_name="DDPG", \ reset_num_timesteps=True, replay_wrapper=None): new_tb_log = self._init_num_timesteps(reset_num_timesteps) if replay_wrapper is not None: self.replay_buffer = replay_wrapper(self.replay_buffer) with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: self._setup_learn(seed) # a list for tensorboard logging, to prevent logging with the same step number, if it already occured self.tb_seen_steps = [] rank = MPI.COMM_WORLD.Get_rank() # we assume symmetric actions. assert np.all( np.abs(self.env.action_space.low) == self.env.action_space.high) if self.verbose >= 2: logger.log('Using agent with the following configuration:') logger.log(str(self.__dict__.items())) eval_episode_rewards_history = deque(maxlen=100) episode_rewards_history = deque(maxlen=100) self.episode_reward = np.zeros((1, )) episode_successes = [] with self.sess.as_default(), self.graph.as_default(): # Prepare everything. self._reset() obs = self.env.reset() eval_obs = None if self.eval_env is not None: eval_obs = self.eval_env.reset() episode_reward = 0. episode_step = 0 episodes = 0 step = 0 total_steps = 0 start_time = time.time() epoch_episode_rewards = [] epoch_episode_steps = [] epoch_actor_losses = [] epoch_critic_losses = [] epoch_adaptive_distances = [] eval_episode_rewards = [] eval_qs = [] epoch_actions = [] epoch_qs = [] epoch_episodes = 0 epoch = 0 while True: for _ in range(log_interval): # Perform rollouts. for _ in range(self.nb_rollout_steps): if total_steps >= total_timesteps: return self # Predict next action. action, q_value = self._policy(obs, apply_noise=True, compute_q=True) assert action.shape == self.env.action_space.shape # Execute next action. if rank == 0 and self.render: self.env.render() # Randomly sample actions from a uniform distribution # with a probabilty self.random_exploration (used in HER + DDPG) if np.random.rand() < self.random_exploration: rescaled_action = action = self.action_space.sample( ) else: rescaled_action = action * np.abs( self.action_space.low) rescaled_action = np.where(action)[0][0] new_obs, reward, done, info = self.env.step( rescaled_action) if writer is not None: ep_rew = np.array([reward]).reshape((1, -1)) ep_done = np.array([done]).reshape((1, -1)) self.episode_reward = total_episode_reward_logger( self.episode_reward, ep_rew, ep_done, writer, self.num_timesteps) step += 1 total_steps += 1 self.num_timesteps += 1 if rank == 0 and self.render: self.env.render() episode_reward += reward episode_step += 1 # Book-keeping. epoch_actions.append(action) epoch_qs.append(q_value) self._store_transition(obs, action, reward, new_obs, done) obs = new_obs if callback is not None: # Only stop training if return value is False, not when it is None. # This is for backwards compatibility with callbacks that have no return statement. if callback(locals(), globals()) is False: return self if done: # Episode done. epoch_episode_rewards.append(episode_reward) episode_rewards_history.append(episode_reward) epoch_episode_steps.append(episode_step) episode_reward = 0. episode_step = 0 epoch_episodes += 1 episodes += 1 maybe_is_success = info.get('is_success') if maybe_is_success is not None: episode_successes.append( float(maybe_is_success)) self._reset() if not isinstance(self.env, VecEnv): obs = self.env.reset() # Train. 
epoch_actor_losses = [] epoch_critic_losses = [] epoch_adaptive_distances = [] for t_train in range(self.nb_train_steps): # Not enough samples in the replay buffer if not self.replay_buffer.can_sample( self.batch_size): break # Adapt param noise, if necessary. if len(self.replay_buffer) >= self.batch_size and \ t_train % self.param_noise_adaption_interval == 0: distance = self._adapt_param_noise() epoch_adaptive_distances.append(distance) # weird equation to deal with the fact the nb_train_steps will be different # to nb_rollout_steps step = (int(t_train * (self.nb_rollout_steps / self.nb_train_steps)) + self.num_timesteps - self.nb_rollout_steps) critic_loss, actor_loss = self._train_step( step, writer, log=t_train == 0) epoch_critic_losses.append(critic_loss) epoch_actor_losses.append(actor_loss) self._update_target_net() # Evaluate. eval_episode_rewards = [] eval_qs = [] if self.eval_env is not None: eval_episode_reward = 0. for _ in range(self.nb_eval_steps): if total_steps >= total_timesteps: return self eval_action, eval_q = self._policy( eval_obs, apply_noise=False, compute_q=True) eval_obs, eval_r, eval_done, _ = self.eval_env.step( eval_action * np.abs(self.action_space.low)) if self.render_eval: self.eval_env.render() eval_episode_reward += eval_r eval_qs.append(eval_q) if eval_done: if not isinstance(self.env, VecEnv): eval_obs = self.eval_env.reset() eval_episode_rewards.append( eval_episode_reward) eval_episode_rewards_history.append( eval_episode_reward) eval_episode_reward = 0. mpi_size = MPI.COMM_WORLD.Get_size() # Log stats. # XXX shouldn't call np.mean on variable length lists duration = time.time() - start_time stats = self._get_stats() combined_stats = stats.copy() combined_stats['rollout/return'] = np.mean( epoch_episode_rewards) combined_stats['rollout/return_history'] = np.mean( episode_rewards_history) combined_stats['rollout/episode_steps'] = np.mean( epoch_episode_steps) combined_stats['rollout/actions_mean'] = np.mean( epoch_actions) combined_stats['rollout/Q_mean'] = np.mean(epoch_qs) combined_stats['train/loss_actor'] = np.mean( epoch_actor_losses) combined_stats['train/loss_critic'] = np.mean( epoch_critic_losses) if len(epoch_adaptive_distances) != 0: combined_stats['train/param_noise_distance'] = np.mean( epoch_adaptive_distances) combined_stats['total/duration'] = duration combined_stats['total/steps_per_second'] = float( step) / float(duration) combined_stats['total/episodes'] = episodes combined_stats['rollout/episodes'] = epoch_episodes combined_stats['rollout/actions_std'] = np.std( epoch_actions) # Evaluation statistics. if self.eval_env is not None: combined_stats['eval/return'] = np.mean( eval_episode_rewards) combined_stats['eval/return_history'] = np.mean( eval_episode_rewards_history) combined_stats['eval/Q'] = np.mean(eval_qs) combined_stats['eval/episodes'] = len( eval_episode_rewards) def as_scalar(scalar): """ check and return the input if it is a scalar, otherwise raise ValueError :param scalar: (Any) the object to check :return: (Number) the scalar if x is a scalar """ if isinstance(scalar, np.ndarray): assert scalar.size == 1 return scalar[0] elif np.isscalar(scalar): return scalar else: raise ValueError('expected scalar, got %s' % scalar) combined_stats_sums = MPI.COMM_WORLD.allreduce( np.array( [as_scalar(x) for x in combined_stats.values()])) combined_stats = { k: v / mpi_size for (k, v) in zip(combined_stats.keys(), combined_stats_sums) } # Total statistics. 
combined_stats['total/epochs'] = epoch + 1 combined_stats['total/steps'] = step for key in sorted(combined_stats.keys()): logger.record_tabular(key, combined_stats[key]) if len(episode_successes) > 0: logger.logkv("success rate", np.mean(episode_successes[-100:])) logger.dump_tabular() logger.info('') logdir = logger.get_dir() if rank == 0 and logdir: if hasattr(self.env, 'get_state'): with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as file_handler: pickle.dump(self.env.get_state(), file_handler) if self.eval_env and hasattr(self.eval_env, 'get_state'): with open( os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as file_handler: pickle.dump(self.eval_env.get_state(), file_handler)
def main(): # requires n_comp_to_use, pc1_chunk_size import sys logger.log(sys.argv) common_arg_parser = get_common_parser() cma_args, cma_unknown_args = common_arg_parser.parse_known_args() this_run_dir = get_dir_path_for_this_run(cma_args) traj_params_dir_name = get_full_params_dir(this_run_dir) intermediate_data_dir = get_intermediate_data_dir(this_run_dir) save_dir = get_save_dir(this_run_dir) if not os.path.exists(intermediate_data_dir): os.makedirs(intermediate_data_dir) ''' ========================================================================================== get the pc vectors ========================================================================================== ''' logger.log("grab final params") final_file = get_full_param_traj_file_path(traj_params_dir_name, "final") final_params = pd.read_csv(final_file, header=None).values[0] all_param_iterator = get_allinone_concat_df( dir_name=traj_params_dir_name, use_IPCA=True, chunk_size=cma_args.pc1_chunk_size) all_grads_iterator = get_allinone_concat_df( dir_name=traj_params_dir_name, use_IPCA=True, chunk_size=cma_args.pc1_chunk_size, index="grads") angles_with_pc1_along_the_way = [] grad_vs_final_min_current_param = [] ipca = IncrementalPCA(1) # for sparse PCA to speed up for chunk in all_param_iterator: logger.log(f"currently at {all_param_iterator._currow}") target_direction = final_params - chunk.values[-1] ipca.partial_fit(chunk.values) angle_with_pc1 = cal_angle(target_direction, ipca.components_[0]) angles_with_pc1_along_the_way.append(angle_with_pc1) grads = all_grads_iterator.__next__().values for i, grad in enumerate(grads): grad_angle = cal_angle(grad, final_params - chunk.values[i]) grad_vs_final_min_current_param.append(grad_angle) plot_dir = get_plot_dir(cma_args) if not os.path.exists(plot_dir): os.makedirs(plot_dir) angles_plot_name = f"final - current VS so far pc1" \ f"cma_args.pc1_chunk_size: {cma_args.pc1_chunk_size}" plot_2d(plot_dir, angles_plot_name, np.arange(len(angles_with_pc1_along_the_way)), angles_with_pc1_along_the_way, "num of chunks", "angle with diff in degrees", False) grad_vs_current_plot_name = f"##final - current param VS current grad" \ f"cma_args.pc1_chunk_size: {cma_args.pc1_chunk_size}" plot_2d(plot_dir, grad_vs_current_plot_name, np.arange(len(grad_vs_final_min_current_param)), grad_vs_final_min_current_param, "num of chunks", "angle with diff in degrees", False)
def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_log_name="TRPO", reset_num_timesteps=True): new_tb_log = self._init_num_timesteps(reset_num_timesteps) with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: self._setup_learn(seed) with self.sess.as_default(): seg_gen = traj_segment_generator( self.policy_pi, self.env, self.timesteps_per_batch, reward_giver=self.reward_giver, gail=self.using_gail) episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 t_start = time.time() len_buffer = deque( maxlen=40) # rolling buffer for episode lengths reward_buffer = deque( maxlen=40) # rolling buffer for episode rewards self.episode_reward = np.zeros((self.n_envs, )) true_reward_buffer = None if self.using_gail: true_reward_buffer = deque(maxlen=40) # Initialize dataloader batchsize = self.timesteps_per_batch // self.d_step self.expert_dataset.init_dataloader(batchsize) # Stats not used for now # TODO: replace with normal tb logging # g_loss_stats = Stats(loss_names) # d_loss_stats = Stats(reward_giver.loss_name) # ep_stats = Stats(["True_rewards", "Rewards", "Episode_length"]) while True: if callback is not None: # Only stop training if return value is False, not when it is None. This is for backwards # compatibility with callbacks that have no return statement. if callback(locals(), globals()) is False: break if total_timesteps and timesteps_so_far >= total_timesteps: break logger.log("********** Iteration %i ************" % iters_so_far) def fisher_vector_product(vec): return self.allmean( self.compute_fvp( vec, *fvpargs, sess=self.sess)) + self.cg_damping * vec # ------------------ Update G ------------------ logger.log("Optimizing Policy...") # g_step = 1 when not using GAIL mean_losses = None vpredbefore = None tdlamret = None observation = None action = None seg = None for k in range(self.g_step): with self.timed("sampling"): seg = seg_gen.__next__() add_vtarg_and_adv(seg, self.gamma, self.lam) # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets)) observation, action, atarg, tdlamret = seg["ob"], seg[ "ac"], seg["adv"], seg["tdlamret"] vpredbefore = seg[ "vpred"] # predicted value function before update atarg = (atarg - atarg.mean()) / atarg.std( ) # standardized advantage function estimate # true_rew is the reward without discount if writer is not None: self.episode_reward = total_episode_reward_logger( self.episode_reward, seg["true_rew"].reshape( (self.n_envs, -1)), seg["dones"].reshape( (self.n_envs, -1)), writer, self.num_timesteps) args = seg["ob"], seg["ob"], seg["ac"], atarg fvpargs = [arr[::5] for arr in args] self.assign_old_eq_new(sess=self.sess) with self.timed("computegrad"): steps = self.num_timesteps + (k + 1) * ( seg["total_timestep"] / self.g_step) run_options = tf.RunOptions( trace_level=tf.RunOptions.FULL_TRACE) run_metadata = tf.RunMetadata( ) if self.full_tensorboard_log else None # run loss backprop with summary, and save the metadata (memory, compute time, ...) 
if writer is not None: summary, grad, *lossbefore = self.compute_lossandgrad( *args, tdlamret, sess=self.sess, options=run_options, run_metadata=run_metadata) if self.full_tensorboard_log: writer.add_run_metadata( run_metadata, 'step%d' % steps) writer.add_summary(summary, steps) else: _, grad, *lossbefore = self.compute_lossandgrad( *args, tdlamret, sess=self.sess, options=run_options, run_metadata=run_metadata) lossbefore = self.allmean(np.array(lossbefore)) grad = self.allmean(grad) if np.allclose(grad, 0): logger.log("Got zero gradient. not updating") else: with self.timed("conjugate_gradient"): stepdir = conjugate_gradient( fisher_vector_product, grad, cg_iters=self.cg_iters, verbose=self.rank == 0 and self.verbose >= 1) assert np.isfinite(stepdir).all() shs = .5 * stepdir.dot( fisher_vector_product(stepdir)) # abs(shs) to avoid taking square root of negative values lagrange_multiplier = np.sqrt( abs(shs) / self.max_kl) # logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g)) fullstep = stepdir / lagrange_multiplier expectedimprove = grad.dot(fullstep) surrbefore = lossbefore[0] stepsize = 1.0 thbefore = self.get_flat() thnew = None for _ in range(10): thnew = thbefore + fullstep * stepsize self.set_from_flat(thnew) mean_losses = surr, kl_loss, *_ = self.allmean( np.array( self.compute_losses(*args, sess=self.sess))) improve = surr - surrbefore logger.log("Expected: %.3f Actual: %.3f" % (expectedimprove, improve)) if not np.isfinite(mean_losses).all(): logger.log( "Got non-finite value of losses -- bad!" ) elif kl_loss > self.max_kl * 1.5: logger.log( "violated KL constraint. shrinking step." ) elif improve < 0: logger.log( "surrogate didn't improve. shrinking step." ) else: logger.log("Stepsize OK!") break stepsize *= .5 else: logger.log("couldn't compute a good step") self.set_from_flat(thbefore) if self.nworkers > 1 and iters_so_far % 20 == 0: # list of tuples paramsums = MPI.COMM_WORLD.allgather( (thnew.sum(), self.vfadam.getflat().sum())) assert all( np.allclose(ps, paramsums[0]) for ps in paramsums[1:]) with self.timed("vf"): for _ in range(self.vf_iters): # NOTE: for recurrent policies, use shuffle=False? for (mbob, mbret) in dataset.iterbatches( (seg["ob"], seg["tdlamret"]), include_final_partial_batch=False, batch_size=128, shuffle=True): grad = self.allmean( self.compute_vflossandgrad( mbob, mbob, mbret, sess=self.sess)) self.vfadam.update(grad, self.vf_stepsize) for (loss_name, loss_val) in zip(self.loss_names, mean_losses): logger.record_tabular(loss_name, loss_val) logger.record_tabular( "explained_variance_tdlam_before", explained_variance(vpredbefore, tdlamret)) if self.using_gail: # ------------------ Update D ------------------ logger.log("Optimizing Discriminator...") logger.log(fmt_row(13, self.reward_giver.loss_name)) assert len(observation) == self.timesteps_per_batch batch_size = self.timesteps_per_batch // self.d_step # NOTE: uses only the last g step for observation d_losses = [ ] # list of tuples, each of which gives the loss for a minibatch # NOTE: for recurrent policies, use shuffle=False? 
for ob_batch, ac_batch in dataset.iterbatches( (observation, action), include_final_partial_batch=False, batch_size=batch_size, shuffle=True): ob_expert, ac_expert = self.expert_dataset.get_next_batch( ) # update running mean/std for reward_giver if self.reward_giver.normalize: self.reward_giver.obs_rms.update( np.concatenate((ob_batch, ob_expert), 0)) # Reshape actions if needed when using discrete actions if isinstance(self.action_space, gym.spaces.Discrete): if len(ac_batch.shape) == 2: ac_batch = ac_batch[:, 0] if len(ac_expert.shape) == 2: ac_expert = ac_expert[:, 0] *newlosses, grad = self.reward_giver.lossandgrad( ob_batch, ac_batch, ob_expert, ac_expert) self.d_adam.update(self.allmean(grad), self.d_stepsize) d_losses.append(newlosses) logger.log(fmt_row(13, np.mean(d_losses, axis=0))) # lr: lengths and rewards lr_local = (seg["ep_lens"], seg["ep_rets"], seg["ep_true_rets"]) # local values list_lr_pairs = MPI.COMM_WORLD.allgather( lr_local) # list of tuples lens, rews, true_rets = map(flatten_lists, zip(*list_lr_pairs)) true_reward_buffer.extend(true_rets) else: # lr: lengths and rewards lr_local = (seg["ep_lens"], seg["ep_rets"] ) # local values list_lr_pairs = MPI.COMM_WORLD.allgather( lr_local) # list of tuples lens, rews = map(flatten_lists, zip(*list_lr_pairs)) len_buffer.extend(lens) reward_buffer.extend(rews) if len(len_buffer) > 0: logger.record_tabular("EpLenMean", np.mean(len_buffer)) logger.record_tabular("EpRewMean", np.mean(reward_buffer)) if self.using_gail: logger.record_tabular("EpTrueRewMean", np.mean(true_reward_buffer)) logger.record_tabular("EpThisIter", len(lens)) episodes_so_far += len(lens) current_it_timesteps = MPI.COMM_WORLD.allreduce( seg["total_timestep"]) timesteps_so_far += current_it_timesteps self.num_timesteps += current_it_timesteps iters_so_far += 1 logger.record_tabular("EpisodesSoFar", episodes_so_far) logger.record_tabular("TimestepsSoFar", self.num_timesteps) logger.record_tabular("TimeElapsed", time.time() - t_start) if self.verbose >= 1 and self.rank == 0: logger.dump_tabular() return self
def main(): import sys logger.log(sys.argv) common_arg_parser = get_common_parser() cma_args, cma_unknown_args = common_arg_parser.parse_known_args() origin_name = "final_param" this_run_dir = get_dir_path_for_this_run(cma_args) plot_dir_alg = get_plot_dir(cma_args) traj_params_dir_name = get_full_params_dir(this_run_dir) intermediate_data_dir = get_intermediate_data_dir(this_run_dir, params_scope="pi") save_dir = get_save_dir(this_run_dir) if not os.path.exists(intermediate_data_dir): os.makedirs(intermediate_data_dir) if not os.path.exists(plot_dir_alg): os.makedirs(plot_dir_alg) start_file = get_full_param_traj_file_path(traj_params_dir_name, "pi_start") start_params = pd.read_csv(start_file, header=None).values[0] ''' ========================================================================================== get the pc vectors ========================================================================================== ''' pca_indexes = cma_args.other_pca_index pca_indexes = [int(pca_index) for pca_index in pca_indexes.split(":")] n_comp_to_project_on = pca_indexes result = do_pca(n_components=cma_args.n_components, traj_params_dir_name=traj_params_dir_name, intermediate_data_dir=intermediate_data_dir, use_IPCA=cma_args.use_IPCA, chunk_size=cma_args.chunk_size, reuse=True) logger.debug("after pca") if origin_name == "final_param": origin_param = result["final_params"] elif origin_name == "start_param": origin_param = start_params else: origin_param = result["mean_param"] proj_coords = project(result["pcs_components"], pcs_slice=n_comp_to_project_on, origin_name=origin_name, origin_param=origin_param, IPCA_chunk_size=cma_args.chunk_size, traj_params_dir_name=traj_params_dir_name, intermediate_data_dir=intermediate_data_dir, n_components=cma_args.n_components, reuse=True) ''' ========================================================================================== eval all xy coords ========================================================================================== ''' other_pcs_plot_dir = get_other_pcs_plane_plot_dir(plot_dir_alg, pca_indexes) if not os.path.exists(other_pcs_plot_dir): os.makedirs(other_pcs_plot_dir) plot_3d_trajectory_path_only( other_pcs_plot_dir, f"{pca_indexes}_final_origin_3d_path_plot", proj_coords, explained_ratio=result["explained_variance_ratio"][pca_indexes])
def main(): import sys logger.log(sys.argv) common_arg_parser = get_common_parser() cma_args, cma_unknown_args = common_arg_parser.parse_known_args() # origin = "final_param" origin = cma_args.origin this_run_dir = get_dir_path_for_this_run(cma_args) traj_params_dir_name = get_full_params_dir(this_run_dir) intermediate_data_dir = get_intermediate_data_dir(this_run_dir) save_dir = get_save_dir(this_run_dir) if not os.path.exists(intermediate_data_dir): os.makedirs(intermediate_data_dir) cma_run_num, cma_intermediate_data_dir = generate_run_dir( get_cma_returns_dirname, intermediate_dir=intermediate_data_dir, n_comp=cma_args.n_comp_to_use) ''' ========================================================================================== get the pc vectors ========================================================================================== ''' proj_or_not = (cma_args.n_comp_to_use == 2) result = do_pca(cma_args.n_components, cma_args.n_comp_to_use, traj_params_dir_name, intermediate_data_dir, proj=proj_or_not, origin=origin, use_IPCA=cma_args.use_IPCA, chunk_size=cma_args.chunk_size, reuse=False) ''' ========================================================================================== eval all xy coords ========================================================================================== ''' from stable_baselines.low_dim_analysis.common import plot_contour_trajectory, gen_subspace_coords,do_eval_returns\ , do_proj_on_first_n if origin == "final_param": origin_param = result["final_concat_params"] else: origin_param = result["mean_param"] final_param = result["final_concat_params"] last_proj_coord = do_proj_on_first_n(final_param, result["first_n_pcs"], origin_param) starting_coord = last_proj_coord logger.log(f"CMA STASRTING CORRD: {starting_coord}") # starting_coord = (1/2*np.max(xcoordinates_to_eval), 1/2*np.max(ycoordinates_to_eval)) # use mean assert result["first_n_pcs"].shape[0] == cma_args.n_comp_to_use mean_rets, min_rets, max_rets, opt_path, opt_path_mean = do_cma( cma_args, result["first_n_pcs"], origin_param, save_dir, starting_coord, cma_args.cma_var) dump_rows_write_csv(cma_intermediate_data_dir, opt_path_mean, "opt_mean_path") plot_dir = get_plot_dir(cma_args) cma_plot_dir = get_cma_plot_dir(plot_dir, cma_args.n_comp_to_use, cma_run_num, origin) if not os.path.exists(cma_plot_dir): os.makedirs(cma_plot_dir) ret_plot_name = f"cma return on {cma_args.n_comp_to_use} dim space of real pca plane, " \ f"explained {np.sum(result['explained_variance_ratio'][:cma_args.n_comp_to_use])}" plot_cma_returns(cma_plot_dir, ret_plot_name, mean_rets, min_rets, max_rets, show=False) if cma_args.n_comp_to_use == 2: proj_coords = result["proj_coords"] assert proj_coords.shape[1] == 2 xcoordinates_to_eval, ycoordinates_to_eval = gen_subspace_coords( cma_args, np.vstack((proj_coords, opt_path_mean)).T) eval_returns = do_eval_returns(cma_args, intermediate_data_dir, result["first_n_pcs"], origin_param, xcoordinates_to_eval, ycoordinates_to_eval, save_dir, pca_center=origin, reuse=False) plot_contour_trajectory(cma_plot_dir, f"{origin}_origin_eval_return_contour_plot", xcoordinates_to_eval, ycoordinates_to_eval, eval_returns, proj_coords[:, 0], proj_coords[:, 1], result["explained_variance_ratio"][:2], num_levels=25, show=False, sub_alg_path=opt_path_mean) opt_mean_path_in_old_basis = [ mean_projected_param.dot(result["first_n_pcs"]) + result["mean_param"] for mean_projected_param in opt_path_mean ] distance_to_final = [ LA.norm(opt_mean - final_param, ord=2) for opt_mean in 
opt_mean_path_in_old_basis ] distance_to_final_plot_name = f"distance_to_final over generations " plot_2d(cma_plot_dir, distance_to_final_plot_name, np.arange(len(distance_to_final)), distance_to_final, "num generation", "distance_to_final", False)
def main(): import sys logger.log(sys.argv) common_arg_parser = get_common_parser() cma_args, cma_unknown_args = common_arg_parser.parse_known_args() origin = "mean_param" this_run_dir = get_dir_path_for_this_run(cma_args) traj_params_dir_name = get_full_params_dir(this_run_dir) intermediate_data_dir = get_intermediate_data_dir(this_run_dir) save_dir = get_save_dir(this_run_dir) if not os.path.exists(intermediate_data_dir): os.makedirs(intermediate_data_dir) cma_run_num, cma_intermediate_data_dir = generate_run_dir( get_cma_returns_dirname, intermediate_dir=intermediate_data_dir, n_comp=cma_args.n_comp_to_use) ''' ========================================================================================== get the pc vectors ========================================================================================== ''' logger.log("grab final params") final_file = get_full_param_traj_file_path(traj_params_dir_name, "final") final_param = pd.read_csv(final_file, header=None).values[0] final_pca = IncrementalPCA(n_components=2) # for sparse PCA to speed up theta_file = get_full_param_traj_file_path(traj_params_dir_name, 0) concat_df = pd.read_csv(theta_file, header=None, chunksize=10000) tic = time.time() for chunk in concat_df: logger.log(f"currnet at : {concat_df._currow}") if chunk.shape[0] < 2: logger.log(f"last column too few: {chunk.shape[0]}") continue final_pca.partial_fit(chunk.values) toc = time.time() logger.log( '\nElapsed time computing the chunked PCA {:.2f} s\n'.format(toc - tic)) logger.log(final_pca.explained_variance_ratio_) pcs_components = final_pca.components_ first_2_pcs = pcs_components[:2] mean_param = final_pca.mean_ origin_param = mean_param theta_file = get_full_param_traj_file_path(traj_params_dir_name, 0) concat_df = pd.read_csv(theta_file, header=None, chunksize=10000) proj_coords = do_proj_on_first_n_IPCA(concat_df, first_2_pcs, origin_param) ''' ========================================================================================== eval all xy coords ========================================================================================== ''' from stable_baselines.low_dim_analysis.common import plot_contour_trajectory, gen_subspace_coords,do_eval_returns, \ get_allinone_concat_df, do_proj_on_first_n from stable_baselines.ppo2.run_mujoco import eval_return last_proj_coord = do_proj_on_first_n(final_param, first_2_pcs, origin_param) starting_coord = last_proj_coord tic = time.time() #TODO better starting locations, record how many samples, logger.log(f"CMAES STARTING :{starting_coord}") es = cma.CMAEvolutionStrategy(starting_coord, 5) total_num_of_evals = 0 total_num_timesteps = 0 mean_rets = [] min_rets = [] max_rets = [] eval_returns = None optimization_path = [] while total_num_timesteps < cma_args.cma_num_timesteps and not es.stop(): solutions = es.ask() optimization_path.extend(solutions) thetas = [ np.matmul(coord, first_2_pcs) + origin_param for coord in solutions ] logger.log( f"current time steps num: {total_num_timesteps} total time steps: {cma_args.cma_num_timesteps}" ) eval_returns = Parallel(n_jobs=cma_args.cores_to_use) \ (delayed(eval_return)(cma_args, save_dir, theta, cma_args.eval_num_timesteps, i) for (i, theta) in enumerate(thetas)) mean_rets.append(np.mean(eval_returns)) min_rets.append(np.min(eval_returns)) max_rets.append(np.max(eval_returns)) total_num_of_evals += len(eval_returns) total_num_timesteps += cma_args.eval_num_timesteps * len(eval_returns) logger.log(f"current eval returns: {str(eval_returns)}") logger.log(f"total timesteps so 
far: {total_num_timesteps}") negative_eval_returns = [-r for r in eval_returns] es.tell(solutions, negative_eval_returns) es.logger.add() # write data to disc to be plotted es.disp() toc = time.time() logger.log( f"####################################CMA took {toc-tic} seconds") es_logger = es.logger if not hasattr(es_logger, 'xmean'): es_logger.load() n_comp_used = first_2_pcs.shape[0] optimization_path_mean = np.vstack( (starting_coord, es_logger.xmean[:, 5:5 + n_comp_used])) dump_rows_write_csv(cma_intermediate_data_dir, optimization_path_mean, "opt_mean_path") plot_dir = get_plot_dir(cma_args) cma_plot_dir = get_cma_plot_dir(plot_dir, cma_args.n_comp_to_use, cma_run_num, origin=origin) if not os.path.exists(cma_plot_dir): os.makedirs(cma_plot_dir) ret_plot_name = f"cma return on {cma_args.n_comp_to_use} dim space of real pca plane, " \ f"explained {np.sum(final_pca.explained_variance_ratio_[:2])}" plot_cma_returns(cma_plot_dir, ret_plot_name, mean_rets, min_rets, max_rets, show=False) assert proj_coords.shape[1] == 2 xcoordinates_to_eval, ycoordinates_to_eval = gen_subspace_coords( cma_args, np.vstack((proj_coords, optimization_path_mean)).T) from stable_baselines.ppo2.run_mujoco import eval_return thetas_to_eval = [ origin_param + x * first_2_pcs[0] + y * first_2_pcs[1] for y in ycoordinates_to_eval for x in xcoordinates_to_eval ] tic = time.time() eval_returns = Parallel(n_jobs=-1, max_nbytes='100M') \ (delayed(eval_return)(cma_args, save_dir, theta, cma_args.eval_num_timesteps, i) for (i, theta) in enumerate(thetas_to_eval)) toc = time.time() logger.log( f"####################################1st version took {toc-tic} seconds" ) plot_contour_trajectory( cma_plot_dir, f"cma redo___{origin}_origin_eval_return_contour_plot", xcoordinates_to_eval, ycoordinates_to_eval, eval_returns, proj_coords[:, 0], proj_coords[:, 1], final_pca.explained_variance_ratio_, num_levels=25, show=False, sub_alg_path=optimization_path_mean.T) opt_mean_path_in_old_basis = [ mean_projected_param.dot(first_2_pcs) + mean_param for mean_projected_param in optimization_path_mean ] distance_to_final = [ LA.norm(opt_mean - final_param, ord=2) for opt_mean in opt_mean_path_in_old_basis ] distance_to_final_plot_name = f"cma redo distance_to_final over generations " plot_2d(cma_plot_dir, distance_to_final_plot_name, np.arange(len(distance_to_final)), distance_to_final, "num generation", "distance_to_final", False)