def train_agent(train, pickle_file, agent_type, env_kwargs, parms):
    """Load a cached RL agent from ``bin/`` or train and cache a new one.

    :param train: training DataFrame for the portfolio-allocation env
    :param pickle_file: file name of the cached model inside ``bin/``
    :param agent_type: one of ``"a2c"``, ``"ddpg"``, ``"ppo"``
    :param env_kwargs: keyword arguments forwarded to PortfolioAllocEnv
    :param parms: model hyper-parameters forwarded to ``get_model``
    :return: the loaded or freshly trained model
    :raises ValueError: if a cached file exists but ``agent_type`` is unknown
    """
    bin_path = "bin/" + pickle_file
    if path.exists(bin_path):
        if agent_type == "a2c":
            print("Loading A2C Agent")
            RL_model = A2C.load(
                bin_path,
                tensorboard_log=f"{config.TENSORBOARD_LOG_DIR}/{agent_type}")
        elif agent_type == "ddpg":
            print("Loading DDPG Agent")
            RL_model = DDPG.load(
                bin_path,
                tensorboard_log=f"{config.TENSORBOARD_LOG_DIR}/{agent_type}")
        elif agent_type == "ppo":
            print("Loading PPO2 Agent")
            RL_model = PPO2.load(
                bin_path,
                tensorboard_log=f"{config.TENSORBOARD_LOG_DIR}/{agent_type}")
        else:
            # BUG FIX: previously an unknown agent_type fell through with
            # RL_model unbound, raising UnboundLocalError at the return below.
            raise ValueError(f"Unknown agent_type: {agent_type!r}")
    else:
        # No cached model: train from scratch and cache the result.
        e_train_gym = ipenv.PortfolioAllocEnv(df=train, **env_kwargs)
        env_train, _ = e_train_gym.get_sb_env()
        agent = ipagent.IPRLAgent(env=env_train)
        model = agent.get_model(model_name=agent_type, model_kwargs=parms)
        RL_model = agent.train_model(model=model,
                                     tb_log_name=agent_type,
                                     total_timesteps=1000000)
        RL_model.save(bin_path)
    return RL_model
def test(env_id, seed, policy):
    """
    Train PPO2 model for atari environment, for testing purposes

    :param env_id: (str) the environment id string
    :param seed: (int) Used to seed the random generator.
    :param policy: (Object) The policy model to use (MLP, CNN, LSTM, ...)
    """
    env = DummyVecEnv([PadEnvRender for _ in range(1)])  # Need for lstm
    env = VecFrameStack(env, 8)
    model = PPO2.load('./pad_5combo_ppo2.pkl', env)

    while True:
        obs, done = env.reset(), False
        episode_rew = 0
        while not done:
            env.render()
            action, _ = model.predict(obs)
            obs, rew, done, _ = env.step(action)
            done = done.any()
            episode_rew += rew
            time.sleep(1 / 24.)
        if done:
            # BUG FIX: previously printed the last step's reward (`rew`)
            # instead of the accumulated episode return.
            print('Episode reward:', episode_rew)
def do_ppo(args, start_theta, parent_this_run_dir, full_space_save_dir):
    """Continue PPO training from a CMA-provided parameter vector.

    :param args: parsed run arguments (env, normalize, run_num, ppo_num_timesteps)
    :param start_theta: flat parameter vector to initialise the model with
    :param parent_this_run_dir: directory of the parent (CMA) run
    :param full_space_save_dir: directory containing the pretrained ppo2 model
    :return: (episode_returns, full_param_traj_dir_path)
    """
    import shutil  # hoisted: previously imported twice inside the branches

    def _recreate_dir(dir_path):
        # Wipe stale results so this run starts from a clean directory.
        if os.path.exists(dir_path):
            shutil.rmtree(dir_path)
        os.makedirs(dir_path)

    logger.log(f"#######CMA and then PPO TRAIN: {args}")
    this_conti_ppo_run_dir = get_ppo_part(parent_this_run_dir)
    log_dir = get_log_dir(this_conti_ppo_run_dir)
    conti_ppo_save_dir = get_save_dir(this_conti_ppo_run_dir)
    logger.configure(log_dir)

    full_param_traj_dir_path = get_full_params_dir(this_conti_ppo_run_dir)
    _recreate_dir(full_param_traj_dir_path)
    _recreate_dir(conti_ppo_save_dir)

    def make_env():
        env_out = gym.make(args.env)
        env_out.env.disableViewer = True
        env_out.env.visualize = False
        env_out = bench.Monitor(env_out, logger.get_dir(), allow_early_resets=True)
        return env_out

    env = DummyVecEnv([make_env])
    if args.normalize:
        env = VecNormalize(env)

    model = PPO2.load(f"{full_space_save_dir}/ppo2")
    model.set_from_flat(start_theta)
    if args.normalize:
        env.load_running_average(full_space_save_dir)
    model.set_env(env)

    run_info = {"run_num": args.run_num,
                "env_id": args.env,
                "full_param_traj_dir_path": full_param_traj_dir_path}
    model.tell_run_info(run_info)

    episode_returns = model.learn(total_timesteps=args.ppo_num_timesteps)
    model.save(f"{conti_ppo_save_dir}/ppo2")
    env.save_running_average(conti_ppo_save_dir)
    return episode_returns, full_param_traj_dir_path
def load(self, path, env):
    """Restore the saved model matching this runner's configured algorithm."""
    if self.trpo():
        algo = TRPO
    elif self.ppo():
        algo = PPO2
    else:
        algo = SAC
    return algo.load(path, env=env)
def __init__(self, policy_file="../models/controllers/PPO/CartPole-v1.pkl",
             mass_prior=None, length_prior=None, episodes_per_params=1,
             seed=1995, params=None, steps_per_episode=50,
             sufficient_stats="Cross-Correlation"):
    """Simulator/data-collector around a pretrained CartPole PPO policy.

    :param policy_file: path to the pickled PPO2 controller
    :param mass_prior: mass prior; ``None`` selects a uniform prior [0.1, 2.0]
    :param length_prior: length prior; ``None`` selects a uniform prior [0.1, 2.0]
    :param episodes_per_params: episodes rolled out per parameter sample
    :param seed: RNG seed
    :param params: names of the varied physics parameters
        (defaults to ``["length", "masspole"]``)
    :param steps_per_episode: rollout length per episode
    :param sufficient_stats: summary-statistics method name
    """
    # BUG FIX: the default was a mutable list shared across all instances.
    if params is None:
        params = ["length", "masspole"]
    self.env = CartPoleEnv()
    self.seed = seed
    self.cached_data = None
    self.params_scaler = None
    self.params = params
    self.steps_per_episode = steps_per_episode
    self.sufficient_stats = sufficient_stats
    self.policy = PPO2.load(policy_file)
    if mass_prior is None:
        self.m_low = 0.1
        self.m_high = 2.0
        self.m_prior = self.sample_mass_from_uniform_prior
    if length_prior is None:
        self.l_low = 0.1
        self.l_high = 2.0
        self.l_prior = self.sample_length_from_uniform_prior
def __init__(self, policy_file="../models/controllers/PPO/Pendulum-v0.pkl",
             mass_prior=None, length_prior=None, episodes_per_params=1,
             seed=1995, params=None, steps_per_episode=200,
             sufficient_stats="Cross-Correlation", load_from_file=False,
             assets_path=".", filename=""):
    """Simulator/data-collector around a pretrained Pendulum PPO policy.

    :param policy_file: path to the pickled PPO2 controller
    :param mass_prior: mass prior; ``None`` selects a uniform prior [0.1, 2.0]
    :param length_prior: length prior; ``None`` selects a uniform prior [0.1, 2.0]
    :param episodes_per_params: episodes rolled out per parameter sample
    :param seed: RNG seed
    :param params: names of the varied physics parameters
        (defaults to ``["length", "mass"]``)
    :param steps_per_episode: rollout length per episode
    :param sufficient_stats: summary-statistics method name
    :param load_from_file: whether cached data should be read from disk
    :param assets_path: directory holding cached data files
    :param filename: cached data file name inside ``assets_path``
    """
    # BUG FIX: the default was a mutable list shared across all instances.
    if params is None:
        params = ["length", "mass"]
    self.env = PendulumEnv()
    self.seed = seed
    self.cached_data = None
    self.params_scaler = None
    self.params = params
    self.steps_per_episode = steps_per_episode
    self.sufficient_stats = sufficient_stats
    self.assets_path = assets_path
    self.load_from_file = load_from_file
    # BUG FIX: was os.path.join(assets_path + filename) — string
    # concatenation made the join a no-op and dropped the path separator.
    self.data_file = os.path.join(assets_path, filename)
    self.policy = PPO2.load(policy_file)
    if mass_prior is None:
        self.m_low = 0.1
        self.m_high = 2.0
        self.m_prior = self.sample_mass_from_uniform_prior
    if length_prior is None:
        self.l_low = 0.1
        self.l_high = 2.0
        self.l_prior = self.sample_length_from_uniform_prior
def main():
    """Load a trained PPO2 policy and run gradient ascent on an input tensor
    to find an observation that maximally excites one policy neuron."""
    import sys
    logger.log(sys.argv)
    common_arg_parser = get_common_parser()
    args, cma_unknown_args = common_arg_parser.parse_known_args()
    this_run_dir = get_dir_path_for_this_run(args)
    plot_dir_alg = get_plot_dir(args)

    traj_params_dir_name = get_full_params_dir(this_run_dir)
    intermediate_data_dir = get_intermediate_data_dir(this_run_dir, params_scope="pi")
    save_dir = get_save_dir(this_run_dir)

    if not os.path.exists(intermediate_data_dir):
        os.makedirs(intermediate_data_dir)
    if not os.path.exists(plot_dir_alg):
        os.makedirs(plot_dir_alg)

    # Final flat policy-parameter vector saved by the training run.
    final_file = get_full_param_traj_file_path(traj_params_dir_name, "pi_final")
    final_params = pd.read_csv(final_file, header=None).values[0]

    def make_env():
        env_out = gym.make(args.env)
        env_out = bench.Monitor(env_out, logger.get_dir(), allow_early_resets=True)
        return env_out

    env = DummyVecEnv([make_env])
    if args.normalize:
        env = VecNormalize(env)

    model = PPO2.load(f"{save_dir}/ppo2")  # this also loads V function
    model.set_pi_from_flat(final_params)
    if args.normalize:
        env.load_running_average(save_dir)

    # NOTE(review): fake_input_tensor / policy_neurons appear to be
    # project-specific extensions of the act model — confirm their shapes.
    obz_tensor = model.act_model.fake_input_tensor
    some_neuron = model.act_model.policy_neurons[2][-1]
    # Minimizing the negated activation == maximizing the activation.
    grads = tf.gradients(tf.math.negative(some_neuron), obz_tensor)
    grads = list(zip(grads, obz_tensor))
    trainer = tf.train.AdamOptimizer(learning_rate=0.01, epsilon=1e-5)
    train_op = trainer.apply_gradients(grads)
    for i in range(10000):
        obz, _ = model.sess.run([obz_tensor, train_op])
def neuron_values_generator(args, save_dir, pi_theta, eval_timesteps):
    """Yield per-step neuron activations from a saved PPO2 policy while it
    acts in (and renders) its environment.

    :param args: run arguments (args.env, args.normalize, ...)
    :param save_dir: directory holding the saved ``ppo2`` model
    :param pi_theta: optional flat policy-parameter vector to load into the model
    :param eval_timesteps: unused here; the generator runs until the consumer stops
    """
    # logger.log(f"#######EVAL: {args}")

    neuron_values_list = []  # NOTE(review): never populated/used — dead local

    def make_env():
        env_out = gym.make(args.env)
        env_out = bench.Monitor(env_out, logger.get_dir(), allow_early_resets=True)
        return env_out

    env = DummyVecEnv([make_env])
    if args.normalize:
        env = VecNormalize(env)

    # policy = MlpPolicy
    # # model = PPO2.load(f"{save_dir}/ppo2") # this also loads V function
    # model = PPO2(policy=policy, env=env, n_steps=args.n_steps, nminibatches=args.nminibatches, lam=0.95, gamma=0.99, noptepochs=10,
    #              ent_coef=0.0, learning_rate=3e-4, cliprange=0.2, optimizer=args.optimizer)
    model = PPO2.load(f"{save_dir}/ppo2")  # this also loads V function
    if pi_theta is not None:
        model.set_pi_from_flat(pi_theta)

    if args.normalize:
        env.load_running_average(save_dir)

    obs = np.zeros((env.num_envs, ) + env.observation_space.shape)
    obs[:] = env.reset()
    env.render()
    ep_infos = []
    # Infinite generator: the consumer decides when to stop iterating.
    while 1:
        # step_with_neurons is a project-specific hook returning activations
        # alongside the chosen actions.
        neuron_values, actions, _, _, _ = model.step_with_neurons(obs)
        # neuron_values = model.give_neuron_values(obs)
        # neuron_values_list.append( neuron_values )
        yield neuron_values
        obs, rew, done, infos = env.step(actions)
        env.render()
        # time.sleep(1)
        for info in infos:
            maybe_ep_info = info.get('episode')
            if maybe_ep_info is not None:
                ep_infos.append(maybe_ep_info)

        # env.render()
        done = done.any()
        if done:
            episode_rew = safe_mean([ep_info['r'] for ep_info in ep_infos])
            print(f'episode_rew={episode_rew}')
            obs = env.reset()
def cont_learn():
    """Resume PPO2 training of the CarRacing agent from its saved weights."""
    print('Continue learning....')
    racing_env = gym.make('CarRacing-v0')
    racing_env = Monitor(racing_env, log_dir, allow_early_resets=True)
    vec_env = DummyVecEnv([lambda: racing_env])
    model = PPO2.load("CarRacing_model_PPO2.pkl")
    model.set_env(vec_env)
    model.learn(300000)
    print("Saving model to CarRacing_model.pkl")
    model.save("CarRacing_model_PPO2.pkl")
    plot_results(log_dir)
def visualize_neurons(args, save_dir, pi_theta, eval_timesteps):
    """Run a saved PPO2 policy for ``eval_timesteps`` steps, collecting neuron
    activations, and return the mean episode reward.

    :param args: run arguments (args.env, args.normalize, ...)
    :param save_dir: directory holding the saved ``ppo2`` model
    :param pi_theta: optional flat policy-parameter vector to load into the model
    :param eval_timesteps: number of environment steps to run
    :return: mean episode reward over completed episodes
    """
    # logger.log(f"#######EVAL: {args}")

    def make_env():
        env_out = gym.make(args.env)
        env_out.env.disableViewer = True
        env_out.env.visualize = False
        env_out = bench.Monitor(env_out, logger.get_dir(), allow_early_resets=True)
        return env_out

    env = DummyVecEnv([make_env])
    if args.normalize:
        env = VecNormalize(env)

    model = PPO2.load(f"{save_dir}/ppo2")  # this also loads V function
    if pi_theta is not None:
        model.set_pi_from_flat(pi_theta)

    if args.normalize:
        env.load_running_average(save_dir)

    obs = np.zeros((env.num_envs, ) + env.observation_space.shape)
    obs[:] = env.reset()
    ep_infos = []
    for _ in range(eval_timesteps):
        actions = model.step(obs)[0]
        # Collected but not returned here; presumably consumed via side
        # effects inside the model hook — TODO confirm.
        neuron_values = model.give_neuron_values(obs)

        obs, rew, done, infos = env.step(actions)
        for info in infos:
            maybe_ep_info = info.get('episode')
            if maybe_ep_info is not None:
                ep_infos.append(maybe_ep_info)

        # env.render()
        done = done.any()
        if done:
            # Only log rewards when evaluating the saved params (pi_theta None).
            if pi_theta is None:
                episode_rew = safe_mean([ep_info['r'] for ep_info in ep_infos])
                print(f'episode_rew={episode_rew}')
            obs = env.reset()

    # NOTE(review): raises/NaNs if no episode finished within eval_timesteps.
    return safe_mean([ep_info['r'] for ep_info in ep_infos])
def test_cnn_lstm_policy(request, policy):
    """Smoke-test a short train / evaluate / save / load round trip for a
    recurrent PPO2 policy; always clean up the model file afterwards."""
    saved_path = './test_model_{}.format'.replace('format', '{}').format  # placeholder removed below
    saved_path = './test_model_{}.zip'.format(request.node.name)
    try:
        train_env = make_env(0)
        model = PPO2(policy, train_env, nminibatches=1)
        model.learn(total_timesteps=15)
        eval_env = model.get_env()
        evaluate_policy(model, eval_env, n_eval_episodes=5)
        # saving
        model.save(saved_path)
        del model, eval_env
        # loading
        _ = PPO2.load(saved_path, policy=policy)
    finally:
        if os.path.exists(saved_path):
            os.remove(saved_path)
def run():
    """ Run a trained model for the pong problem """
    env = gym.make('CarRacing-v0')
    env = DummyVecEnv([lambda: env])

    # model = PPO2.load("CarRacing_model_PPO1_"+ str(5) +".pkl", env)
    model = PPO2.load("CarRacing_model_PPO2_5.pkl", env)
    avg_rew = evaluate(model=model, env=env, num_steps=10000)

    # Replay episodes forever, printing the accumulated return of each one.
    while True:
        obs = env.reset()
        done = False
        episode_rew = 0
        while not done:
            env.render()
            action, _ = model.predict(obs)
            obs, rew, done, _ = env.step(action)
            episode_rew = episode_rew + rew
        print("Episode reward", episode_rew)
def main(env, load_path, fig_path):
    """Roll out a trained PPO2 manipulation policy, dump per-episode
    end-effector trajectories, and write the smoothed average trajectory.

    :param env: gym environment id to instantiate
    :param load_path: model name under ./models/ (also selects ./log/<name>)
    :param fig_path: output figure name under ./figs/
    """
    # skip over 1-baxter-no-penalty (no log monitor.csv)
    if load_path == "1-baxter-no-penalty":
        plot = False
    else:
        plot = True

    # arguments
    print("env %s; load_path %s; fig_path %s;" % (env, load_path, fig_path))
    log_path = os.getcwd() + "/log/" + load_path
    os.makedirs(os.getcwd() + "/figs/" + "/", exist_ok=True)
    fig_path = os.getcwd() + "/figs/" + "/" + fig_path
    load_path = os.getcwd() + "/models/" + load_path

    # make environment, flattened environment, vectorized environment
    env = gym.make(env)
    env = gym.wrappers.FlattenDictWrapper(env, ['observation', 'achieved_goal', 'desired_goal'])
    env = DummyVecEnv([lambda: env])

    # load model
    model = PPO2.load(load_path, env=env)
    obs_initial = env.reset()
    obs = obs_initial

    # plot results
    if plot:
        plot_results(fig_path, log_path)

    # initializations
    niter = 10
    counter = 0
    timestep = 0
    results = [[[0, 0, 0] for i in range(100)], [[0, 0, 0, 0] for i in range(100)]]
    current = [[[0, 0, 0] for i in range(100)], [[0, 0, 0, 0] for i in range(100)]]
    print("==============================")

    # check initial positions and quaternions
    print("grip", env.envs[0].env.env.sim.data.get_site_xpos('grip'))
    print("box", env.envs[0].env.env.sim.data.get_site_xpos('box'))
    print("tool", env.envs[0].env.env.sim.data.get_site_xpos('tool'))
    print("mocap", env.envs[0].env.env.sim.data.mocap_pos)
    print("quat", env.envs[0].env.env.sim.data.mocap_quat)
    print("==============================")

    # mocap quaternion check
    for i in range(5):
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        quat = env.envs[0].env.env.sim.data.mocap_quat
        print("obs", obs)
        print("quat", quat)
    print("==============================")

    def _write_traj_file(filename, xyzs, quats):
        # One "x y z qw qx qy qz" line per timestep; shared by the
        # per-episode dump and the final averaged dump (was duplicated).
        os.makedirs(log_path + "/", exist_ok=True)
        file = open(filename, 'w+')
        for xyz, quat in zip(xyzs, quats):
            for coord in xyz:
                file.write(str(coord) + " ")
            for quat_coord in quat:
                file.write(str(quat_coord) + " ")
            file.write("\n")
        file.close()

    # start rendering
    dists = []
    box_goal_pos = np.array([0.6, 0.05, -0.17])
    while True:
        if counter == niter:
            break
        action, _states = model.predict(obs)
        obs_old = obs
        obs, rewards, dones, info = env.step(action)
        quaternion = env.envs[0].env.env.sim.data.mocap_quat
        # NOTE(review): this compares the truthiness of the two arrays
        # (obs.all() vs obs_initial.all()), not the arrays elementwise
        # ((obs == obs_initial).all()). Kept as-is to preserve the original
        # episode-boundary behavior — confirm intent.
        if obs.all() == obs_initial.all():
            if counter % 10 == 0:
                xyzs = current[0]
                quats = current[1]
                print(xyzs)
                print(quats)
                filename = log_path + "/" + "results_" + str(counter) + ".txt"
                _write_traj_file(filename, xyzs, quats)
            box_end_pos = np.array(obs_old[0][3:6].tolist())
            print(box_end_pos)
            print(np.shape(box_end_pos))
            print(box_goal_pos)
            print(np.shape(box_goal_pos))
            dists.append(np.linalg.norm(box_goal_pos - box_end_pos))
            current = [[[0, 0, 0] for i in range(100)], [[0, 0, 0, 0] for i in range(100)]]
            timestep = 0
            counter += 1
        print(timestep)
        print("obs", obs)
        print("quat", quaternion)
        # for average trajectory, smoothed
        for i in range(3):
            results[0][timestep][i] += obs[0][:3].tolist()[i]
        for j in range(4):
            results[1][timestep][j] += quaternion[0].tolist()[j]
        # for current trajectory
        for i in range(3):
            current[0][timestep][i] += obs[0][:3].tolist()[i]
        for j in range(4):
            current[1][timestep][j] += quaternion[0].tolist()[j]
        # NOTE(review): timestep is only reset on an episode boundary; an
        # episode longer than 100 steps would index past the buffers.
        timestep += 1
        env.render()

    # smooth paths by taking average, and calculate mean distance to goal state
    for timestep in range(100):
        for i in range(3):
            # BUG FIX: was results[0][timeste][i] — NameError typo.
            results[0][timestep][i] /= niter
        for j in range(4):
            # BUG FIX: was results[0][timestep][j] — quaternions live in
            # results[1]; the old code divided positions twice and never
            # averaged the quaternions.
            results[1][timestep][j] /= niter
    dist = np.mean(dists)

    # print and write to file
    xyzs = results[0]
    quats = results[1]
    filename = log_path + "/" + "results_avg.txt"
    _write_traj_file(filename, xyzs, quats)

    # print average distances
    print("average distance of box from end goal: %f" % dist)
def __init__(self):
    """Load the pretrained PPO2 planner agent shipped with the examples."""
    agent_path = os.path.expanduser(
        "~/Code/drl_local_planner_ros_stable_baselines/example_agents/ppo2_1_raw_data_cont_0/ppo2_1_raw_data_cont_0.pkl")  # noqa
    self.model = PPO2.load(agent_path)
# qvel[i,:] = env_in.sim.data.qvel[[7,8,10,12,14,16,17,19,21,23]] qvel[i, :] = env_in.GetMotorVelocities() # torque[i,:] = env_in.sim.data.actuator_force torque[i, :] = env_in.GetMotorTorques() i = (i + 1) % total_data_length return True ############################################################################### # # Use this code for testing the basic controller # Create the stoch mujoco environment # env = stoch2_gym_mjc_env.Stoch2Env() env = stoch2_gym_env.Stoch2Env(render=True) model_test = PPO2.load( dir_name + "/tflow_log/model_trot") # model_trot_200kiter_0pen_notermination obs = env.reset() print("Render mode...") for _ in range(15): action, _states = model_test.predict(obs, deterministic=True) obs, reward, done, _ = env.step(action, callback=render_callback) # if done: # break pickle.dump(qpos[0:total_data_length:int(total_data_length / total_data_length_for_saving)], open(input_file, "wb")) # save it into a file named save.p pickle.dump(action, open(action_file, "wb"))
def visualize_policy_and_collect_COM(
        augment_num_timesteps, top_num_to_include_slice, augment_seed, augment_run_num,
        network_size, policy_env, policy_num_timesteps, policy_run_num, policy_seed,
        eval_seed, eval_run_num, learning_rate, additional_note, metric_param):
    """Register an input-augmented Dart walker env, load the trained policy,
    roll it out for 3000 steps while recording video, neuron activations and
    Lagrangian quantities (M, COM, Coriolis, q, dq), then plot each COM row."""
    result_dir = get_result_dir(policy_env, policy_num_timesteps, policy_run_num,
                                policy_seed, eval_seed, eval_run_num,
                                additional_note, metric_param)
    args = AttributeDict()

    args.normalize = True
    args.num_timesteps = augment_num_timesteps
    args.run_num = augment_run_num
    args.alg = "ppo2"
    args.seed = augment_seed

    logger.log(f"#######VISUALIZE: {args}")
    # non_linear_global_dict
    linear_global_dict, non_linear_global_dict, lagrangian_values, input_values, layers_values, all_weights = read_all_data(
        policy_env, policy_num_timesteps, policy_run_num, policy_seed, eval_seed,
        eval_run_num, additional_note=additional_note)
    timestamp = get_time_stamp('%Y_%m_%d_%H_%M_%S')
    # Unique label encoding every hyper-parameter of this run; it becomes part
    # of the registered env id below.
    experiment_label = f"learning_rate_{learning_rate}timestamp_{timestamp}_augment_num_timesteps{augment_num_timesteps}" \
                       f"_top_num_to_include{top_num_to_include_slice.start}_{top_num_to_include_slice.stop}" \
                       f"_augment_seed{augment_seed}_augment_run_num{augment_run_num}_network_size{network_size}" \
                       f"_policy_num_timesteps{policy_num_timesteps}_policy_run_num{policy_run_num}_policy_seed{policy_seed}" \
                       f"_eval_seed{eval_seed}_eval_run_num{eval_run_num}_additional_note_{additional_note}"

    entry_point = 'gym.envs.dart:DartWalker2dEnv_aug_input'

    this_run_dir = get_experiment_path_for_this_run(
        entry_point, args.num_timesteps, args.run_num, args.seed,
        learning_rate=learning_rate, top_num_to_include=top_num_to_include_slice,
        result_dir=result_dir, network_size=network_size, metric_param=metric_param)
    traj_params_dir_name = get_full_params_dir(this_run_dir)
    save_dir = get_save_dir(this_run_dir)
    aug_plot_dir = get_aug_plot_dir(this_run_dir) + "_vis"

    final_file = get_full_param_traj_file_path(traj_params_dir_name, "pi_final")
    final_params = pd.read_csv(final_file, header=None).values[0]

    args.env = f'{experiment_label}_{entry_point}-v1'
    # Register the augmented-input env under the freshly minted id.
    register(
        id=args.env,
        entry_point=entry_point,
        max_episode_steps=1000,
        kwargs={
            'linear_global_dict': linear_global_dict,
            'non_linear_global_dict': non_linear_global_dict,
            'top_to_include_slice': top_num_to_include_slice,
            'aug_plot_dir': aug_plot_dir,
            "lagrangian_values": lagrangian_values,
            "layers_values": layers_values
        })

    def make_env():
        env_out = gym.make(args.env)
        env_out = bench.Monitor(env_out, logger.get_dir(), allow_early_resets=True)
        return env_out

    env = DummyVecEnv([make_env])
    walker_env = env.envs[0].env.env

    walker_env.disableViewer = False

    if args.normalize:
        env = VecNormalize(env)

    set_global_seeds(args.seed)
    walker_env.seed(args.seed)

    model = PPO2.load(f"{save_dir}/ppo2", seed=augment_seed)
    model.set_pi_from_flat(final_params)

    if args.normalize:
        env.load_running_average(save_dir)

    # Dart robot skeleton of the (unwrapped) walker env.
    sk = env.venv.envs[0].env.env.robot_skeleton
    # Rebound: shadows the lagrangian_values loaded by read_all_data above.
    lagrangian_values = {}

    obs = np.zeros((env.num_envs, ) + env.observation_space.shape)
    obs[:] = env.reset()

    env = VecVideoRecorder(env, aug_plot_dir,
                           record_video_trigger=lambda x: x == 0,
                           video_length=3000, name_prefix="vis_this_policy")

    # Each entry is a list of column vectors, one per timestep; hstacked below.
    lagrangian_values["M"] = [sk.M.reshape((-1, 1))]
    lagrangian_values["COM"] = [sk.C.reshape((-1, 1))]
    lagrangian_values["Coriolis"] = [sk.c.reshape((-1, 1))]
    lagrangian_values["q"] = [sk.q.reshape((-1, 1))]
    lagrangian_values["dq"] = [sk.dq.reshape((-1, 1))]

    contact_values = {}

    neuron_values = model.give_neuron_values(obs)
    raw_layer_values_list = [[neuron_value.reshape((-1, 1))] for neuron_value in neuron_values]

    env.render()
    ep_infos = []
    steps_to_first_done = 0
    first_done = False

    # epi_rew = 0
    for _ in range(3000):
        actions = model.step(obs)[0]

        # yield neuron_values
        obs, rew, done, infos = env.step(actions)
        # epi_rew+= rew[0]
        if done and not first_done:
            first_done = True

        if not first_done:
            steps_to_first_done += 1

        neuron_values = model.give_neuron_values(obs)

        for i, layer in enumerate(neuron_values):
            raw_layer_values_list[i].append(layer.reshape((-1, 1)))

        # fill_contacts_jac_dict(infos[0]["contacts"], contact_dict=contact_values, neuron_values=neuron_values)

        lagrangian_values["M"].append(sk.M.reshape((-1, 1)))
        lagrangian_values["q"].append(sk.q.reshape((-1, 1)))
        lagrangian_values["dq"].append(sk.dq.reshape((-1, 1)))
        lagrangian_values["COM"].append(sk.C.reshape((-1, 1)))
        lagrangian_values["Coriolis"].append(sk.c.reshape((-1, 1)))

        # env.render()

        # time.sleep(1)
        for info in infos:
            maybe_ep_info = info.get('episode')
            if maybe_ep_info is not None:
                ep_infos.append(maybe_ep_info)

        env.render()
        done = done.any()
        if done:
            episode_rew = safe_mean([ep_info['r'] for ep_info in ep_infos])
            print(f'episode_rew={episode_rew}')
            # print(f'episode_rew={epi_rew}')
            # epi_rew = 0
            obs = env.reset()

    #Hstack into a big matrix
    lagrangian_values["M"] = np.hstack(lagrangian_values["M"])
    lagrangian_values["COM"] = np.hstack(lagrangian_values["COM"])
    lagrangian_values["Coriolis"] = np.hstack(lagrangian_values["Coriolis"])
    lagrangian_values["q"] = np.hstack(lagrangian_values["q"])
    lagrangian_values["dq"] = np.hstack(lagrangian_values["dq"])

    # for contact_body_name, l in contact_values.items():
    #     body_contact_dict = contact_values[contact_body_name]
    #     for name, l in body_contact_dict.items():
    #         body_contact_dict[name] = np.hstack(body_contact_dict[name])

    input_values = np.hstack(raw_layer_values_list[0])

    layers_values = [np.hstack(layer_list) for layer_list in raw_layer_values_list][1:-2]  # drop variance and inputs

    for i, com in enumerate(lagrangian_values["COM"]):
        plt.figure()
        plt.plot(np.arange(len(com)), com)
        plt.xlabel("time")
        plt.ylabel(f"COM{i}")
        plt.savefig(f"{aug_plot_dir}/COM{i}.jpg")
        plt.close()
return env set_global_seeds(seed) return _init # Make Environments envs = [make_env(rank=0, seed=0, framerange=(start_frame, start_frame))] env = SubprocVecEnv(envs) ''' env = PyBulletEnvironment(max_steps, GUI, data_path = data_path, reward_func = reward_func) env = DummyVecEnv(env) ''' # Load Training Agent agent = PPO2.load(model_path, env=env) # Run Agent on Environment reward_log = [] for i in range(repeats): state = env.reset() deterministic = False sum_reward = 0 for _ in range(max_steps): action, _ = agent.predict(state, deterministic=deterministic) #print(action) state, reward, done, info = env.step(action) time.sleep(sleep_time) sum_reward += reward[0] reward_log.append(sum_reward / max_steps) print('Reward Fraction Achieved', np.mean(reward_log))
def train_agent_ppo2(config, agent_name, total_timesteps, policy,
                     gamma=0.99, n_steps=128, ent_coef=0.01, learning_rate=0.00025,
                     vf_coef=0.5, max_grad_norm=0.5, lam=0.95, nminibatches=4,
                     noptepochs=4, cliprange=0.2, num_envs=1, robot_radius=0.46,
                     rew_fnc=3, num_stacks=1, stack_offset=15,
                     disc_action_space=False, debug=False, normalize=False,
                     stage=0, pretrained_model_name="", task_mode="static"):
    """Train (stage 0) or continue training (stage > 0) a PPO2 navigation
    agent, print its hyper-parameters, and save the final model.

    :param config: config mapping with a ['PATHES'] section for output dirs
    :param agent_name: name under which models/logs are stored
    :param total_timesteps: training budget in environment steps
    :param policy: policy class name as a string (resolved via eval below)
    :param stage: 0 trains from scratch; otherwise loads pretrained_model_name
    """
    # Setting seed
    seed = random.randint(0, 1000)
    np.random.seed(seed)
    tf.random.set_random_seed(seed)
    random.seed(seed)

    # Define pathes to store things
    path_to_tensorboard_log = config['PATHES']['path_to_tensorboard_log']
    # Module-level path used elsewhere (e.g. by train_callback).
    global path_to_models
    path_to_models = config['PATHES']['path_to_models']

    agent_dir = '%s/%s' % (path_to_models, agent_name)
    if not os.path.exists(agent_dir):
        os.makedirs(agent_dir)

    # Loading simulation environment
    env = load_train_env(num_envs, robot_radius, rew_fnc, num_stacks,
                         stack_offset, debug, task_mode, policy,
                         disc_action_space, normalize)

    if stage == 0:
        # NOTE(review): eval(policy) executes an arbitrary string — safe only
        # because policy comes from trusted configuration.
        model = PPO2(eval(policy), env, gamma=gamma, n_steps=n_steps,
                     ent_coef=ent_coef, learning_rate=learning_rate,
                     vf_coef=vf_coef, max_grad_norm=max_grad_norm, lam=lam,
                     nminibatches=nminibatches, noptepochs=noptepochs,
                     cliprange=cliprange, verbose=1,
                     tensorboard_log='%s' % (path_to_tensorboard_log))
    else:
        # Pretrained model is loaded to continue training.
        model = PPO2.load(
            "%s/%s/%s.pkl" % (path_to_models, pretrained_model_name, pretrained_model_name),
            env,
            tensorboard_log='%s' % (path_to_tensorboard_log))

    # Document agent
    print("Starting PPO2 Training of agent: %s" % (agent_name))
    print("------------------------------------------------------")
    print("gamma \t\t\t\t %f" % model.gamma)
    print("n_steps \t\t\t %d" % model.n_steps)
    print("ent_coef \t\t\t %f" % model.ent_coef)
    print("learning_rate \t\t\t %f" % learning_rate)
    print("vf_coef \t\t\t %f" % model.vf_coef)
    print("max_grad_norm \t\t\t %f" % model.max_grad_norm)
    print("lam \t\t\t\t %f" % model.lam)
    print("nminibatches \t\t\t %d" % model.nminibatches)
    print("noptepochs \t\t\t %d" % model.noptepochs)
    print("cliprange \t\t\t %f" % cliprange)
    print("total_timesteps \t\t %d" % total_timesteps)
    print("Policy \t\t\t\t %s" % policy)
    print("reward_fnc \t\t\t %d" % rew_fnc)
    print("Normalized state: %d" % normalize)
    print("discrete action space %d" % disc_action_space)
    print("Number of stacks: %d, stack offset: %d" % (num_stacks, stack_offset))
    print("\n")

    # Starting training
    # Keep the timestep counter when resuming from a pretrained model.
    reset_num_timesteps = False
    if stage == 0:
        reset_num_timesteps = True

    model.learn(total_timesteps=total_timesteps, log_interval=100,
                callback=train_callback, tb_log_name=agent_name,
                reset_num_timesteps=reset_num_timesteps)

    # Saving final model
    model.save("%s/%s/%s" % (path_to_models, agent_name,
                             "%s_stage_%d" % (agent_name, stage)))
    print("Training finished.")
    env.close()
def visualize_policy_and_collect_COM(seed, run_num, policy_env,
                                     policy_num_timesteps, policy_seed,
                                     policy_run_num):
    """Load a trained PPO2 Dart policy, roll it out for 3000 steps while
    recording video, neuron activations and Lagrangian quantities
    (M, COM, Coriolis, q, dq), then plot each COM component over time."""
    logger.log(sys.argv)
    common_arg_parser = get_common_parser()
    args, cma_unknown_args = common_arg_parser.parse_known_args()
    args.env = policy_env
    args.seed = policy_seed
    args.num_timesteps = policy_num_timesteps
    args.run_num = policy_run_num
    this_run_dir = get_dir_path_for_this_run(args)
    traj_params_dir_name = get_full_params_dir(this_run_dir)
    save_dir = get_save_dir(this_run_dir)

    # Final flat policy-parameter vector saved by the training run.
    final_file = get_full_param_traj_file_path(traj_params_dir_name, "pi_final")
    final_params = pd.read_csv(final_file, header=None).values[0]

    def make_env():
        env_out = gym.make(args.env)
        env_out.env.disableViewer = False
        env_out = bench.Monitor(env_out, logger.get_dir(), allow_early_resets=True)
        env_out.seed(seed)
        return env_out

    env = DummyVecEnv([make_env])
    if args.normalize:
        env = VecNormalize(env)

    model = PPO2.load(f"{save_dir}/ppo2", seed=seed)
    model.set_pi_from_flat(final_params)
    if args.normalize:
        env.load_running_average(save_dir)

    # Dart robot skeleton of the (unwrapped) env.
    sk = env.venv.envs[0].env.env.robot_skeleton
    lagrangian_values = {}

    obs = np.zeros((env.num_envs, ) + env.observation_space.shape)
    obs[:] = env.reset()

    plot_dir = get_plot_dir(policy_env=args.env,
                            policy_num_timesteps=policy_num_timesteps,
                            policy_run_num=policy_run_num,
                            policy_seed=policy_seed, eval_seed=seed,
                            eval_run_num=run_num, additional_note="")
    # Start from a clean plot directory for this evaluation.
    if os.path.exists(plot_dir):
        shutil.rmtree(plot_dir)
    os.makedirs(plot_dir)
    env = VecVideoRecorder(env, plot_dir,
                           record_video_trigger=lambda x: x == 0,
                           video_length=3000,
                           name_prefix="3000000agent-{}".format(args.env))

    # Each entry is a list of column vectors, one per timestep; hstacked below.
    lagrangian_values["M"] = [sk.M.reshape((-1, 1))]
    lagrangian_values["COM"] = [sk.C.reshape((-1, 1))]
    lagrangian_values["Coriolis"] = [sk.c.reshape((-1, 1))]
    lagrangian_values["q"] = [sk.q.reshape((-1, 1))]
    lagrangian_values["dq"] = [sk.dq.reshape((-1, 1))]

    contact_values = {}

    neuron_values = model.give_neuron_values(obs)
    raw_layer_values_list = [[neuron_value.reshape((-1, 1))] for neuron_value in neuron_values]

    env.render()
    ep_infos = []
    steps_to_first_done = 0
    first_done = False

    # epi_rew = 0
    for _ in range(3000):
        actions = model.step(obs)[0]

        # yield neuron_values
        obs, rew, done, infos = env.step(actions)
        # epi_rew+= rew[0]
        if done and not first_done:
            first_done = True

        if not first_done:
            steps_to_first_done += 1

        neuron_values = model.give_neuron_values(obs)

        for i, layer in enumerate(neuron_values):
            raw_layer_values_list[i].append(layer.reshape((-1, 1)))

        # fill_contacts_jac_dict(infos[0]["contacts"], contact_dict=contact_values, neuron_values=neuron_values)

        lagrangian_values["M"].append(sk.M.reshape((-1, 1)))
        lagrangian_values["q"].append(sk.q.reshape((-1, 1)))
        lagrangian_values["dq"].append(sk.dq.reshape((-1, 1)))
        lagrangian_values["COM"].append(sk.C.reshape((-1, 1)))
        lagrangian_values["Coriolis"].append(sk.c.reshape((-1, 1)))

        # env.render()

        # time.sleep(1)
        for info in infos:
            maybe_ep_info = info.get('episode')
            if maybe_ep_info is not None:
                ep_infos.append(maybe_ep_info)

        env.render()
        done = done.any()
        if done:
            episode_rew = safe_mean([ep_info['r'] for ep_info in ep_infos])
            print(f'episode_rew={episode_rew}')
            # print(f'episode_rew={epi_rew}')
            # epi_rew = 0
            obs = env.reset()

    #Hstack into a big matrix
    lagrangian_values["M"] = np.hstack(lagrangian_values["M"])
    lagrangian_values["COM"] = np.hstack(lagrangian_values["COM"])
    lagrangian_values["Coriolis"] = np.hstack(lagrangian_values["Coriolis"])
    lagrangian_values["q"] = np.hstack(lagrangian_values["q"])
    lagrangian_values["dq"] = np.hstack(lagrangian_values["dq"])

    # for contact_body_name, l in contact_values.items():
    #     body_contact_dict = contact_values[contact_body_name]
    #     for name, l in body_contact_dict.items():
    #         body_contact_dict[name] = np.hstack(body_contact_dict[name])

    input_values = np.hstack(raw_layer_values_list[0])

    layers_values = [np.hstack(layer_list) for layer_list in raw_layer_values_list][1:-2]  # drop variance and inputs

    for i, com in enumerate(lagrangian_values["COM"]):
        plt.figure()
        plt.plot(np.arange(len(com)), com)
        plt.xlabel("time")
        plt.ylabel(f"COM{i}")
        plt.savefig(f"{plot_dir}/COM{i}.jpg")
        plt.close()
model = PPO2(CustomPolicy, env, n_steps=int(2048 / 128), nminibatches=64, noptepochs=10, lam=0.98, verbose=1, tensorboard_log='/home/xi/model/log') # model = PPO2.load("ppo2_ipadgame") # model.set_env(env) # model.tensorboard_log='/home/xi/model/log' # env.load_running_average("/home/xi/model/") model.learn(total_timesteps=50000) # model.save("ppo2_ipadgame") # env.save_running_average("/home/xi/model/") # print ('done') env = gym.make(env_id) env = DummyVecEnv([lambda: env]) env = VecNormalize(env) obs = env.reset() model = PPO2.load("ppo2_ipadgame") env.load_running_average("/home/xi/model/") for i in range(1000): action, _states = model.predict(obs) obs, rewards, dones, info = env.step(action) env.render()
def run_experiment_with_trained(augment_num_timesteps, linear_co_threshold, augment_seed, augment_run_num, network_size,
                                policy_env, policy_num_timesteps, policy_run_num, policy_seed, eval_seed, eval_run_num,
                                learning_rate, additional_note, result_dir, keys_to_include, metric_param,
                                linear_top_vars_list=None, linear_correlation_neuron_list=None,
                                visualize=False, lagrangian_inds_to_include=None, neurons_inds_to_include=None,
                                use_lagrangian=True):
    """Train a PPO2 agent on an augmented-input Dart environment and return its log dir.

    The augmented env's observation is extended either with selected Lagrangian
    quantities (``use_lagrangian=True``) or with neuron activations of a
    previously trained policy (``use_lagrangian=False``), which is loaded and
    embedded into the env via ``register`` kwargs.

    Returns:
        The log directory for this run.

    Raises:
        NotImplementedError: if ``policy_env`` has no registered augmented
            entry point.
    """
    trained_model = None
    if not use_lagrangian:
        # Load the previously trained policy whose neuron activations will be
        # fed to the augmented env.
        with tf.variable_scope("trained_model"):
            common_arg_parser = get_common_parser()
            trained_args, cma_unknown_args = common_arg_parser.parse_known_args()
            trained_args.env = policy_env
            trained_args.seed = policy_seed
            trained_args.num_timesteps = policy_num_timesteps
            trained_args.run_num = policy_run_num
            trained_this_run_dir = get_dir_path_for_this_run(trained_args)
            trained_traj_params_dir_name = get_full_params_dir(trained_this_run_dir)
            trained_save_dir = get_save_dir(trained_this_run_dir)

            trained_final_file = get_full_param_traj_file_path(trained_traj_params_dir_name, "pi_final")
            trained_final_params = pd.read_csv(trained_final_file, header=None).values[0]

            trained_model = PPO2.load(f"{trained_save_dir}/ppo2", seed=augment_seed)
            trained_model.set_pi_from_flat(trained_final_params)

    args = AttributeDict()
    args.normalize = True
    args.num_timesteps = augment_num_timesteps
    args.run_num = augment_run_num
    args.alg = "ppo2"
    args.seed = augment_seed

    logger.log(f"#######TRAIN: {args}")

    # Unique label encoding every hyperparameter of this run; it becomes part
    # of the registered env id below.
    timestamp = get_time_stamp('%Y_%m_%d_%H_%M_%S')
    experiment_label = f"learning_rate_{learning_rate}timestamp_{timestamp}_augment_num_timesteps{augment_num_timesteps}" \
                       f"_top_num_to_include{linear_co_threshold.start}_{linear_co_threshold.stop}" \
                       f"_augment_seed{augment_seed}_augment_run_num{augment_run_num}_network_size{network_size}" \
                       f"_policy_num_timesteps{policy_num_timesteps}_policy_run_num{policy_run_num}_policy_seed{policy_seed}" \
                       f"_eval_seed{eval_seed}_eval_run_num{eval_run_num}_additional_note_{additional_note}"

    if policy_env == "DartWalker2d-v1":
        entry_point = 'gym.envs.dart:DartWalker2dEnv_aug_input'
    elif policy_env == "DartHopper-v1":
        entry_point = 'gym.envs.dart:DartHopperEnv_aug_input'
    elif policy_env == "DartHalfCheetah-v1":
        entry_point = 'gym.envs.dart:DartHalfCheetahEnv_aug_input'
    elif policy_env == "DartSnake7Link-v1":
        entry_point = 'gym.envs.dart:DartSnake7LinkEnv_aug_input'
    else:
        # BUGFIX: was `raise NotImplemented()` -- `NotImplemented` is the
        # binary-dispatch sentinel, not an exception, and calling it raises a
        # confusing TypeError. NotImplementedError is the correct exception.
        raise NotImplementedError(f"no augmented entry point for policy_env {policy_env!r}")

    this_run_dir = get_experiment_path_for_this_run(entry_point, args.num_timesteps, args.run_num,
                                                    args.seed, learning_rate=learning_rate,
                                                    top_num_to_include=linear_co_threshold,
                                                    result_dir=result_dir, network_size=network_size)
    full_param_traj_dir_path = get_full_params_dir(this_run_dir)
    log_dir = get_log_dir(this_run_dir)
    save_dir = get_save_dir(this_run_dir)

    # Fresh directories for this run (create_dir_remove wipes pre-existing ones).
    create_dir_remove(this_run_dir)
    create_dir_remove(full_param_traj_dir_path)
    create_dir_remove(save_dir)
    create_dir_remove(log_dir)
    logger.configure(log_dir)

    linear_top_vars_list_wanted_to_print = []
    if (use_lagrangian and lagrangian_inds_to_include is None) or (not use_lagrangian and neurons_inds_to_include is None):
        # note this is only linear
        if linear_top_vars_list is None or linear_correlation_neuron_list is None:
            linear_top_vars_list, linear_correlation_neuron_list = read_linear_top_var(
                policy_env, policy_num_timesteps, policy_run_num, policy_seed, eval_seed,
                eval_run_num, additional_note, metric_param=metric_param)

        lagrangian_inds_to_include, neurons_inds_to_include, linear_top_vars_list_wanted_to_print = \
            get_wanted_lagrangians_and_neurons(keys_to_include, linear_top_vars_list,
                                               linear_correlation_neuron_list, linear_co_threshold)

    # Persist the chosen augmentation indices next to the logs for provenance.
    with open(f"{log_dir}/lagrangian_inds_to_include.json", 'w') as fp:
        json.dump(lagrangian_inds_to_include, fp)
    with open(f"{log_dir}/linear_top_vars_list_wanted_to_print.json", 'w') as fp:
        json.dump(linear_top_vars_list_wanted_to_print, fp)
    with open(f"{log_dir}/neurons_inds_to_include.json", 'w') as fp:
        json.dump(neurons_inds_to_include, fp)

    args.env = f'{experiment_label}_{entry_point}-v1'

    # Register the augmented env: either neuron-augmented (carrying the trained
    # model) or Lagrangian-augmented.
    if not use_lagrangian:
        register(
            id=args.env,
            entry_point=entry_point,
            max_episode_steps=1000,
            kwargs={"lagrangian_inds_to_include": None,
                    "trained_model": trained_model,
                    "neurons_inds_to_include": neurons_inds_to_include}
        )
    else:
        register(
            id=args.env,
            entry_point=entry_point,
            max_episode_steps=1000,
            kwargs={"lagrangian_inds_to_include": lagrangian_inds_to_include,
                    "trained_model": None,
                    "neurons_inds_to_include": None}
        )

    def make_env():
        # Factory used by DummyVecEnv; Monitor records episode statistics.
        env_out = gym.make(args.env)
        env_out.env.visualize = visualize
        env_out = bench.Monitor(env_out, logger.get_dir(), allow_early_resets=True)
        return env_out

    env = DummyVecEnv([make_env])
    walker_env = env.envs[0].env.env
    walker_env.disableViewer = not visualize

    if args.normalize:
        env = VecNormalize(env)

    policy = MlpPolicy

    # Seed everything for reproducibility.
    set_global_seeds(args.seed)
    walker_env.seed(args.seed)

    num_dof = walker_env.robot_skeleton.ndofs
    show_M_matrix(num_dof, lagrangian_inds_to_include, linear_co_threshold, log_dir)

    # Extra run info passed to the (project-extended) PPO2 for bookkeeping.
    run_info = {"run_num": args.run_num,
                "env_id": args.env,
                "full_param_traj_dir_path": full_param_traj_dir_path}

    layers = [network_size, network_size]
    policy_kwargs = {"net_arch": [dict(vf=layers, pi=layers)]}
    model = PPO2(policy=policy, env=env, n_steps=4096, nminibatches=64, lam=0.95, gamma=0.99,
                 noptepochs=10, ent_coef=0.0, learning_rate=learning_rate, cliprange=0.2,
                 optimizer='adam', policy_kwargs=policy_kwargs, seed=args.seed)
    model.tell_run_info(run_info)
    model.learn(total_timesteps=args.num_timesteps, seed=args.seed)

    model.save(f"{save_dir}/ppo2")

    if args.normalize:
        env.save_running_average(save_dir)
    return log_dir
qpos[i, :] = env_in.GetMotorAngles() # qvel[i,:] = env_in.sim.data.qvel[[7,8,10,12,14,16,17,19,21,23]] qvel[i, :] = env_in.GetMotorVelocities() # torque[i,:] = env_in.sim.data.actuator_force torque[i, :] = env_in.GetMotorTorques() i = (i + 1) % total_data_length return True ############################################################################### # # Use this code for testing the basic controller # Create the stoch mujoco environment # env = stoch2_gym_mjc_env.Stoch2Env() env = vision60_gym_bullet_env.Vision60BulletEnv(render=True) model_test = PPO2.load(dir_name + "/model_trot") obs = env.reset() print("Render mode...") for _ in range(10): action, _states = model_test.predict(obs, deterministic=True) obs, reward, done, _ = env.step(action, callback=render_callback) # if done: # break pickle.dump(qpos[0:total_data_length:int(total_data_length / 100)], open("save.p", "wb")) # save it into a file named save.p # print(np.shape(qpos[0:total_data_length:int(total_data_length/100)])) # print(np.shape(qpos))
def main(env, load, save_path, load_path=None, train_timesteps=1.25e6, eval_timesteps=5e3):
    """Train a PPO2 agent on a goal-based gym env and report mean rewards.

    `env` is a gym env id; `load` resumes from `load_path` (relative to
    ./models/); the trained model is written to `save_path` under ./models/.
    """
    # Echo the effective arguments.
    print(
        "env %s; load %s; save_path %s; load_path %s; train_timesteps %s; eval_timesteps %s;"
        % (env, load, save_path, load_path, train_timesteps, eval_timesteps))

    # Timestep counts may arrive as strings/floats; normalise to int.
    train_timesteps = int(float(train_timesteps))
    eval_timesteps = int(float(eval_timesteps))

    cwd = os.getcwd()

    # Directory holding saved models.
    model_dir = cwd + "/models/"
    os.makedirs(model_dir, exist_ok=True)

    # Per-run logging directory (Monitor files land here).
    log_dir = cwd + "/log/" + save_path
    os.makedirs(log_dir, exist_ok=True)

    # Resolve bare names to paths under the models directory.
    save_path = model_dir + save_path
    if load:
        if not load_path:
            print("no load path given, exiting...")
            sys.exit()
        load_path = model_dir + load_path

    # Environment stack: flatten dict observations, record episode stats,
    # vectorise for PPO2.
    env = gym.make(env)
    env = gym.wrappers.FlattenDictWrapper(
        env, ['observation', 'achieved_goal', 'desired_goal'])
    env = Monitor(env, log_dir, allow_early_resets=True)
    env = DummyVecEnv([lambda: env])

    # Resume from a checkpoint or start fresh.
    if load:
        print("loading model from: " + load_path)
        model = PPO2.load(load_path, env=env)
    else:
        print("training model from scratch")
        model = PPO2(MlpPolicy, env, verbose=1)

    # Baseline evaluation before (further) training.
    reward_pre = evaluate(model, env, num_steps=eval_timesteps)

    # Module-level state, presumably consumed by a training callback defined
    # elsewhere; with callback=None below it is only initialised here.
    global best_mean_reward, n_steps
    best_mean_reward, n_steps = -np.inf, 0

    model.learn(total_timesteps=train_timesteps, callback=None)

    print("saving model to:" + save_path)
    model.save(save_path)

    # Post-training evaluation on the same env.
    reward_post = evaluate(model, env, num_steps=eval_timesteps)

    print("reward before training:" + str(reward_pre))
    print("reward after training:" + str(reward_post))
    print("done")
env.close()  # release whatever env the preceding section left open

# Curriculum implemented is extremely basic one. (Need to improvise later on)
# Curriculum is a newly made folder. In google drive, read the note
all_arenas = sorted(glob.glob('configs/Curriculum/*.yaml'))
print(all_arenas)

model_name = 'ppo_model_after_bc'
# Training timesteps per curriculum stage, indexed in lockstep with
# `all_arenas` (more arena configs than entries here raises IndexError,
# matching the original behavior).
all_frames_vec = [200000, 200000, 100000, 200000, 200000, 200000, 200000, 200000]  # no of tsteps per curriculum

# Train sequentially on each arena, warm-starting each stage from the model
# saved by the previous one. (Idiom fix: enumerate instead of range(len(...)).)
for i, arena in enumerate(all_arenas):
    env = create_env_fn(num_actors=1, inference=False, config=arena, seed=0)
    env = make_vec_env(env, n_envs=4)

    print('####################')
    print("## Curriculum {} ##".format(i))
    print('####################')

    model = PPO2.load(model_name, env)
    frames_idx = all_frames_vec[i]
    print('{} arena is used for training for {} timesteps'.format(arena, frames_idx))
    model.learn(total_timesteps=frames_idx)

    # Next stage resumes from this checkpoint.
    model_name = "ppo_model_after_training_arena_{}".format(i)
    model.save(model_name)

    env.close()
    del model
    del env

print('Training complete!!')
def eval_trained_policy_and_collect_data(eval_seed, eval_run_num, policy_env, policy_num_timesteps, policy_seed, policy_run_num, additional_note):
    """Roll out a previously trained PPO2 policy and dump analysis data.

    Loads the saved model and its final flat policy parameters, runs a
    30000-step rollout, and records per-step Lagrangian quantities and neuron
    activations, which are then written to the run's data directory
    (pickle + .npy + per-layer weight text files).
    """
    logger.log(sys.argv)
    common_arg_parser = get_common_parser()
    args, cma_unknown_args = common_arg_parser.parse_known_args()
    # Override parsed args with the identifying parameters of the trained run.
    args.env = policy_env
    args.seed = policy_seed
    args.num_timesteps = policy_num_timesteps
    args.run_num = policy_run_num
    this_run_dir = get_dir_path_for_this_run(args)
    traj_params_dir_name = get_full_params_dir(this_run_dir)
    save_dir = get_save_dir(this_run_dir)

    # Final flattened policy parameters saved at the end of training.
    final_file = get_full_param_traj_file_path(traj_params_dir_name, "pi_final")
    final_params = pd.read_csv(final_file, header=None).values[0]

    def make_env():
        # Factory used by DummyVecEnv; Monitor records episode statistics.
        env_out = gym.make(args.env)
        env_out = bench.Monitor(env_out, logger.get_dir(), allow_early_resets=True)
        env_out.seed(eval_seed)
        return env_out

    env = DummyVecEnv([make_env])
    # Unwrap down to the raw environment (through Monitor and TimeLimit).
    running_env = env.envs[0].env.env

    set_global_seeds(eval_seed)
    running_env.seed(eval_seed)

    # NOTE(review): args.normalize is never assigned in this function --
    # presumably it comes from the common arg parser defaults; confirm.
    if args.normalize:
        env = VecNormalize(env)

    model = PPO2.load(f"{save_dir}/ppo2", seed=eval_seed)
    model.set_pi_from_flat(final_params)

    if args.normalize:
        env.load_running_average(save_dir)

    # is it necessary?
    # Re-derive the raw env reference, now through the VecNormalize wrapper.
    running_env = env.venv.envs[0].env.env

    lagrangian_values = {}

    obs = np.zeros((env.num_envs,) + env.observation_space.shape)
    obs[:] = env.reset()

    # env = VecVideoRecorder(env, "./",
    #                        record_video_trigger=lambda x: x == 0, video_length=3000,
    #                        name_prefix="3000000agent-{}".format(args.env))

    # init lagrangian values
    # NOTE(review): `lagrangian_keys` is not defined in this function --
    # presumably a module-level constant; confirm.
    for lagrangian_key in lagrangian_keys:
        flat_array = running_env.get_lagrangian_flat_array(lagrangian_key)
        lagrangian_values[lagrangian_key] = [flat_array]

    # One column vector per layer per step; index 0 holds the initial state.
    neuron_values = model.give_neuron_values(obs)
    raw_layer_values_list = [[neuron_value.reshape((-1,1))] for
                             neuron_value in neuron_values]

    # env.render()
    ep_infos = []
    steps_to_first_done = 0
    first_done = False

    for _ in range(30000):
        actions = model.step(obs)[0]

        # yield neuron_values
        obs, rew, done, infos = env.step(actions)
        # NOTE(review): `done` is the vec-env array; truthiness assumes a
        # single sub-env -- confirm num_envs == 1.
        if done and not first_done:
            first_done = True
        if not first_done:
            steps_to_first_done += 1
        neuron_values = model.give_neuron_values(obs)

        for i, layer in enumerate(neuron_values):
            raw_layer_values_list[i].append(layer.reshape((-1,1)))

        # fill_contacts_jac_dict(infos[0]["contacts"], contact_dict=contact_values, neuron_values=neuron_values)

        # filling lagrangian values
        for lagrangian_key in lagrangian_keys:
            flat_array = running_env.get_lagrangian_flat_array(lagrangian_key)
            lagrangian_values[lagrangian_key].append(flat_array)

        # env.render()
        # time.sleep(1)
        for info in infos:
            maybe_ep_info = info.get('episode')
            if maybe_ep_info is not None:
                ep_infos.append(maybe_ep_info)

        # env.render()
        done = done.any()
        if done:
            episode_rew = safe_mean([ep_info['r'] for ep_info in ep_infos])
            print(f'episode_rew={episode_rew}')
            obs = env.reset()

    # Hstack into a big matrix: one row per component, one column per step.
    for lagrangian_key in lagrangian_keys:
        lagrangian_values[lagrangian_key] = np.hstack(lagrangian_values[lagrangian_key])

    # for contact_body_name, l in contact_values.items():
    #     body_contact_dict = contact_values[contact_body_name]
    #     for name, l in body_contact_dict.items():
    #         body_contact_dict[name] = np.hstack(body_contact_dict[name])

    input_values = np.hstack(raw_layer_values_list[0])

    # Slice [1:-2] drops the input layer and the last two entries
    # ("drop variance and inputs" per the original note).
    layers_values = [np.hstack(layer_list) for layer_list in raw_layer_values_list][1:-2]  # drop variance and inputs

    data_dir = get_data_dir(policy_env=args.env, policy_num_timesteps=policy_num_timesteps,
                            policy_run_num=policy_run_num, policy_seed=policy_seed,
                            eval_seed=eval_seed, eval_run_num=eval_run_num,
                            additional_note=additional_note)
    # Fresh output directory for this evaluation.
    if os.path.exists(data_dir):
        shutil.rmtree(data_dir)
    os.makedirs(data_dir)

    lagrangian_values_fn = f"{data_dir}/lagrangian.pickle"

    with open(lagrangian_values_fn, 'wb') as handle:
        pickle.dump(lagrangian_values, handle, protocol=pickle.HIGHEST_PROTOCOL)

    input_values_fn = f"{data_dir}/input_values.npy"
    layers_values_fn = f"{data_dir}/layer_values.npy"

    np.save(input_values_fn, input_values)
    np.save(layers_values_fn, layers_values)

    # Dump every weight matrix of the network as a readable text file.
    all_weights = model.get_all_weight_values()

    for ind, weights in enumerate(all_weights):
        fname = f"{data_dir}/weights_layer_{ind}.txt"
        np.savetxt(fname, weights)
def main():
    """Roll out a trained PPO2 Dart policy, recording Lagrangian quantities,
    neuron activations, and contact data, then write plots and weight dumps.
    """
    import sys
    logger.log(sys.argv)
    common_arg_parser = get_common_parser()
    args, cma_unknown_args = common_arg_parser.parse_known_args()
    this_run_dir = get_dir_path_for_this_run(args)
    plot_dir_alg = get_plot_dir(args)

    traj_params_dir_name = get_full_params_dir(this_run_dir)
    save_dir = get_save_dir(this_run_dir)

    if not os.path.exists(plot_dir_alg):
        os.makedirs(plot_dir_alg)

    # Final flattened policy parameters saved at the end of training.
    final_file = get_full_param_traj_file_path(traj_params_dir_name, "pi_final")
    final_params = pd.read_csv(final_file, header=None).values[0]

    def make_env():
        # Factory used by DummyVecEnv; Monitor records episode statistics.
        env_out = gym.make(args.env)
        env_out = bench.Monitor(env_out, logger.get_dir(), allow_early_resets=True)
        return env_out

    env = DummyVecEnv([make_env])
    # env_out = gym.make(args.env)
    # env_out = bench.Monitor(env_out, logger.get_dir(), allow_early_resets=True)

    # NOTE(review): args.normalize is never assigned here -- presumably set by
    # the common arg parser defaults; confirm.
    if args.normalize:
        env = VecNormalize(env)
    # policy = MlpPolicy

    model = PPO2.load(f"{save_dir}/ppo2")  # this also loads V function
    model.set_pi_from_flat(final_params)
    if args.normalize:
        env.load_running_average(save_dir)

    # Robot skeleton of the raw Dart env (through VecNormalize -> Monitor -> TimeLimit).
    sk = env.venv.envs[0].env.env.robot_skeleton

    lagrangian_values = {}

    obs = np.zeros((env.num_envs, ) + env.observation_space.shape)
    obs[:] = env.reset()

    # env = VecVideoRecorder(env, "./",
    #                        record_video_trigger=lambda x: x == 0, video_length=3000,
    #                        name_prefix="3000000agent-{}".format(args.env))

    # Seed each series with the initial state, one column vector per step.
    lagrangian_values["M"] = [sk.M.reshape((-1, 1))]
    lagrangian_values["COM"] = [sk.C.reshape((-1, 1))]
    lagrangian_values["Coriolis"] = [sk.c.reshape((-1, 1))]
    lagrangian_values["q"] = [sk.q.reshape((-1, 1))]
    lagrangian_values["dq"] = [sk.dq.reshape((-1, 1))]

    contact_values = {}

    neuron_values = model.give_neuron_values(obs)
    layer_values_list = [[neuron_value.reshape((-1, 1))] for neuron_value in neuron_values]

    env.render()
    ep_infos = []
    steps_to_first_done = 0
    first_done = False

    for _ in range(3000):
        actions = model.step(obs)[0]

        # yield neuron_values
        obs, rew, done, infos = env.step(actions)
        # NOTE(review): `done` is the vec-env array; truthiness assumes a
        # single sub-env -- confirm num_envs == 1.
        if done and not first_done:
            first_done = True
        if not first_done:
            steps_to_first_done += 1
        neuron_values = model.give_neuron_values(obs)

        for i, layer in enumerate(neuron_values):
            layer_values_list[i].append(layer.reshape((-1, 1)))

        fill_contacts_jac_dict(infos[0]["contacts"], contact_dict=contact_values, neuron_values=neuron_values)

        lagrangian_values["M"].append(sk.M.reshape((-1, 1)))
        lagrangian_values["q"].append(sk.q.reshape((-1, 1)))
        lagrangian_values["dq"].append(sk.dq.reshape((-1, 1)))
        lagrangian_values["COM"].append(sk.C.reshape((-1, 1)))
        lagrangian_values["Coriolis"].append(sk.c.reshape((-1, 1)))

        env.render()
        # time.sleep(1)
        for info in infos:
            maybe_ep_info = info.get('episode')
            if maybe_ep_info is not None:
                ep_infos.append(maybe_ep_info)

        # env.render()
        done = done.any()
        if done:
            episode_rew = safe_mean([ep_info['r'] for ep_info in ep_infos])
            print(f'episode_rew={episode_rew}')
            obs = env.reset()

    # Hstack into a big matrix: one row per component, one column per step.
    lagrangian_values["M"] = np.hstack(lagrangian_values["M"])
    lagrangian_values["COM"] = np.hstack(lagrangian_values["COM"])
    lagrangian_values["Coriolis"] = np.hstack(lagrangian_values["Coriolis"])
    lagrangian_values["q"] = np.hstack(lagrangian_values["q"])
    lagrangian_values["dq"] = np.hstack(lagrangian_values["dq"])

    for contact_body_name, l in contact_values.items():
        body_contact_dict = contact_values[contact_body_name]
        for name, l in body_contact_dict.items():
            body_contact_dict[name] = np.hstack(body_contact_dict[name])

    # Slice [1:-2] drops the input layer and the last two entries ("drop
    # variance" per the original note).
    layer_values_list = [np.hstack(layer_list) for layer_list in layer_values_list][1:-2]  # drop variance

    # Fresh output directory for plots/weights (hard-coded machine path).
    out_dir = f"/home/panda-linux/PycharmProjects/low_dim_update_dart/low_dim_update_stable/neuron_vis/plots_{args.env}_{args.num_timesteps}"
    if os.path.exists(out_dir):
        shutil.rmtree(out_dir)
    os.makedirs(out_dir)

    # Dump every weight matrix of the network as a readable text file.
    all_weights = model.get_all_weight_values()

    for ind, weights in enumerate(all_weights):
        fname = f"{out_dir}/weights_layer_{ind}.txt"
        np.savetxt(fname, weights)

    # Plot only the prefix before the first episode end.
    PLOT_CUTOFF = steps_to_first_done
    plot_everything(lagrangian_values, layer_values_list, out_dir, PLOT_CUTOFF)
    scatter_the_linear_significant_ones(lagrangian_values, layer_values_list, threshold=0.6, out_dir=out_dir)
    scatter_the_nonlinear_significant_but_not_linear_ones(
        lagrangian_values, layer_values_list,
        linear_threshold=0.3, nonlinear_threshold=0.6, out_dir=out_dir)

    # A large block of commented-out exploratory code previously lived here:
    # contact-dict CSV export, neuron/lagrangian correlation computations, and
    # 1-/2-permutation linear-regression analyses. It was dead code and has
    # been removed; recover it from version control if needed.
    pass
def load_model(load_dir=LOAD_DIR):
    """Restore a saved PPO2 agent from *load_dir* (defaults to LOAD_DIR)."""
    restored = PPO2.load(load_dir)
    return restored