import os

import numpy as np

from stable_baselines import DDPG
from stable_baselines.ddpg import AdaptiveParamNoiseSpec


def test_ddpg_normalization():
    """
    Test that observation and return normalization statistics are properly saved and loaded.
    """
    param_noise = AdaptiveParamNoiseSpec(initial_stddev=0.05, desired_action_stddev=0.05)
    model = DDPG('MlpPolicy', 'Pendulum-v0', memory_limit=50000,
                 normalize_observations=True, normalize_returns=True,
                 nb_rollout_steps=128, nb_train_steps=1,
                 batch_size=64, param_noise=param_noise)
    model.learn(1000)

    # read the running-mean/std statistics before saving
    obs_rms_params = model.sess.run(model.obs_rms_params)
    ret_rms_params = model.sess.run(model.ret_rms_params)

    model.save('./test_ddpg')
    loaded_model = DDPG.load("test_ddpg")

    # the loaded model must reproduce the same normalization statistics
    obs_rms_params_2 = loaded_model.sess.run(loaded_model.obs_rms_params)
    ret_rms_params_2 = loaded_model.sess.run(loaded_model.ret_rms_params)

    for param, param_loaded in zip(obs_rms_params + ret_rms_params,
                                   obs_rms_params_2 + ret_rms_params_2):
        assert np.allclose(param, param_loaded)

    del model, loaded_model

    if os.path.exists("./test_ddpg"):
        os.remove("./test_ddpg")
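# A minimal sketch for running the test standalone (assumes stable-baselines 2.x
# with its TensorFlow 1.x backend; under pytest this guard is simply ignored):
if __name__ == '__main__':
    test_ddpg_normalization()
    print("normalization statistics survived the save/load round-trip")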
from os import path

from stable_baselines import A2C, DDPG, PPO2

# `config`, `ipenv`, and `ipagent` are project-level modules assumed to be
# importable in this file.


def train_agent(train, pickle_file, agent_type, env_kwargs, parms):
    bin_path = "bin/" + pickle_file

    if path.exists(bin_path):
        # a trained agent already exists on disk: load it instead of retraining
        if agent_type == "a2c":
            print("Loading A2C Agent")
            RL_model = A2C.load(
                bin_path,
                tensorboard_log=f"{config.TENSORBOARD_LOG_DIR}/{agent_type}")
        elif agent_type == "ddpg":
            print("Loading DDPG Agent")
            RL_model = DDPG.load(
                bin_path,
                tensorboard_log=f"{config.TENSORBOARD_LOG_DIR}/{agent_type}")
        elif agent_type == "ppo":
            print("Loading PPO2 Agent")
            RL_model = PPO2.load(
                bin_path,
                tensorboard_log=f"{config.TENSORBOARD_LOG_DIR}/{agent_type}")
    else:
        # no saved agent: build the environment, train from scratch, and cache the result
        e_train_gym = ipenv.PortfolioAllocEnv(df=train, **env_kwargs)
        env_train, _ = e_train_gym.get_sb_env()
        agent = ipagent.IPRLAgent(env=env_train)
        model = agent.get_model(model_name=agent_type, model_kwargs=parms)
        RL_model = agent.train_model(model=model,
                                     tb_log_name=agent_type,
                                     total_timesteps=1000000)
        RL_model.save(bin_path)

    return RL_model
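# Usage sketch: the DataFrame, file name, env kwargs, and hyperparameters below
# are illustrative placeholders, not values from the original project.
#
#   train_df = pd.read_csv("data/train.csv")
#   model = train_agent(train=train_df,
#                       pickle_file="ppo_portfolio.pkl",
#                       agent_type="ppo",
#                       env_kwargs={"initial_amount": 1000000},
#                       parms={"n_steps": 256, "learning_rate": 2.5e-4})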
import os

import gym
import numpy as np

from stable_baselines import DDPG
from stable_baselines.common.vec_env import DummyVecEnv


def main(env, load_path, fig_path):
    # arguments
    print("env %s; load_path %s; fig_path %s;" % (env, load_path, fig_path))
    log_path = os.getcwd() + "/log/" + load_path
    os.makedirs(os.getcwd() + "/figs/", exist_ok=True)
    fig_path = os.getcwd() + "/figs/" + fig_path
    load_path = os.getcwd() + "/models/" + load_path

    # make environment, flattened environment, vectorized environment
    env = gym.make(env)
    env = gym.wrappers.FlattenDictWrapper(env, ['observation', 'achieved_goal', 'desired_goal'])
    env = DummyVecEnv([lambda: env])

    # load model
    model = DDPG.load(load_path, env=env)
    obs_initial = env.reset()
    obs = obs_initial

    # plot results
    plot_results(fig_path, log_path)

    # initializations
    niter = 10
    counter = 0
    timestep = 0
    results = [[[0, 0, 0] for i in range(100)], [[0, 0, 0, 0] for i in range(100)]]
    current = [[[0, 0, 0] for i in range(100)], [[0, 0, 0, 0] for i in range(100)]]
    print("==============================")

    # check initial positions and quaternions
    print("grip", env.envs[0].env.env.sim.data.get_site_xpos('grip'))
    print("box", env.envs[0].env.env.sim.data.get_site_xpos('box'))
    print("tool", env.envs[0].env.env.sim.data.get_site_xpos('tool'))
    print("mocap", env.envs[0].env.env.sim.data.mocap_pos)
    print("quat", env.envs[0].env.env.sim.data.mocap_quat)
    print("==============================")

    # mocap quaternion check
    for i in range(5):
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        quat = env.envs[0].env.env.sim.data.mocap_quat
        print("obs", obs)
        print("quat", quat)
    print("==============================")

    # start rendering
    dists = []
    box_goal_pos = np.array([0.6, 0.05, -0.17])
    while True:
        if counter == niter:
            break
        action, _states = model.predict(obs)
        obs_old = obs
        obs, rewards, dones, info = env.step(action)
        quaternion = env.envs[0].env.env.sim.data.mocap_quat
        # episode finished: the vectorized env has reset to the initial observation
        if (obs == obs_initial).all():
            if counter % 10 == 0:
                xyzs = current[0]
                quats = current[1]
                print(xyzs)
                print(quats)
                filename = log_path + "/" + "results_" + str(counter) + ".txt"
                os.makedirs(log_path + "/", exist_ok=True)
                with open(filename, 'w+') as file:
                    for xyz, quat in zip(xyzs, quats):
                        for coord in xyz:
                            file.write(str(coord) + " ")
                        for quat_coord in quat:
                            file.write(str(quat_coord) + " ")
                        file.write("\n")
            # distance between the final box position and the goal position
            box_end_pos = np.array(obs_old[0][3:6].tolist())
            print(box_end_pos)
            print(np.shape(box_end_pos))
            print(box_goal_pos)
            print(np.shape(box_goal_pos))
            dists.append(np.linalg.norm(box_goal_pos - box_end_pos))
            current = [[[0, 0, 0] for i in range(100)], [[0, 0, 0, 0] for i in range(100)]]
            timestep = 0
            counter += 1
        print(timestep)
        print("obs", obs)
        print("quat", quaternion)

        # for average trajectory, smoothed
        for i in range(3):
            results[0][timestep][i] += obs[0][:3].tolist()[i]
        for j in range(4):
            results[1][timestep][j] += quaternion[0].tolist()[j]

        # for current trajectory
        for i in range(3):
            current[0][timestep][i] += obs[0][:3].tolist()[i]
        for j in range(4):
            current[1][timestep][j] += quaternion[0].tolist()[j]

        timestep += 1
        env.render()

    # smooth paths by taking average, and calculate mean distance to goal state
    for timestep in range(100):
        for i in range(3):
            results[0][timestep][i] /= niter
        for j in range(4):
            results[1][timestep][j] /= niter
    dist = np.mean(dists)

    # print and write to file
    xyzs = results[0]
    quats = results[1]
    filename = log_path + "/" + "results_avg.txt"
    os.makedirs(log_path + "/", exist_ok=True)
    with open(filename, 'w+') as file:
        for xyz, quat in zip(xyzs, quats):
            for coord in xyz:
                file.write(str(coord) + " ")
            for quat_coord in quat:
                file.write(str(quat_coord) + " ")
            file.write("\n")

    # print average distances
    print("average distance of box from end goal: %f" % dist)
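# `plot_results` is called in main() above but not defined in this file. A minimal
# sketch, assuming the log directory holds a `monitor.csv` written by stable-baselines'
# Monitor wrapper (one JSON metadata line, then columns r, l, t):
import matplotlib.pyplot as plt
import pandas as pd


def plot_results(fig_path, log_path):
    # skiprows=1 skips the JSON metadata header that Monitor prepends
    data = pd.read_csv(log_path + "/monitor.csv", skiprows=1)
    # cumulative episode lengths give the timestep at which each episode ended
    plt.plot(data["l"].cumsum(), data["r"])
    plt.xlabel("timesteps")
    plt.ylabel("episode reward")
    plt.savefig(fig_path)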
import os
import sys

import gym
import numpy as np

from stable_baselines import DDPG
from stable_baselines.bench import Monitor
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines.ddpg.policies import MlpPolicy


def main(env, load, save_path, load_path=None, train_timesteps=1.25e6, eval_timesteps=5e3):
    # arguments
    print("env %s; load %s; save_path %s; load_path %s; train_timesteps %s; eval_timesteps %s;"
          % (env, load, save_path, load_path, train_timesteps, eval_timesteps))
    train_timesteps = int(float(train_timesteps))
    eval_timesteps = int(float(eval_timesteps))

    # models path
    model_dir = os.getcwd() + "/models/"
    os.makedirs(model_dir, exist_ok=True)

    # logging path
    log_dir = os.getcwd() + "/log/" + save_path
    os.makedirs(log_dir, exist_ok=True)

    # absolute save path and models path
    save_path = model_dir + save_path
    if load and not load_path:
        print("no load path given, exiting...")
        sys.exit()
    elif load:
        load_path = model_dir + load_path

    # make environment, flattened environment, monitor, vectorized environment
    env = gym.make(env)
    env = gym.wrappers.FlattenDictWrapper(env, ['observation', 'achieved_goal', 'desired_goal'])
    env = Monitor(env, log_dir, allow_early_resets=True)
    env = DummyVecEnv([lambda: env])

    # load model, or start from scratch
    if load:
        print("loading model from: " + load_path)
        model = DDPG.load(load_path, env=env)
    else:
        print("training model from scratch")
        model = DDPG(MlpPolicy, env, verbose=1)

    # evaluate current model
    mean_reward_before_train = evaluate(model, env, num_steps=eval_timesteps)

    # train model
    global best_mean_reward, n_steps
    best_mean_reward, n_steps = -np.inf, 0
    model.learn(total_timesteps=train_timesteps, callback=None)

    # save model
    print("saving model to: " + save_path)
    model.save(save_path)

    # evaluate post-training model
    mean_reward_after_train = evaluate(model, env, num_steps=eval_timesteps)

    # results
    print("reward before training: " + str(mean_reward_before_train))
    print("reward after training: " + str(mean_reward_after_train))
    print("done")
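# `evaluate` is called in main() above but not defined in this file. A minimal
# sketch of a fixed-step evaluation loop for a single vectorized env, returning
# the mean per-episode reward (an assumption about the original helper's behavior):
def evaluate(model, env, num_steps=1000):
    episode_rewards = [0.0]
    obs = env.reset()
    for _ in range(num_steps):
        action, _states = model.predict(obs)
        obs, rewards, dones, _info = env.step(action)
        episode_rewards[-1] += rewards[0]
        if dones[0]:
            # start accumulating a fresh episode
            obs = env.reset()
            episode_rewards.append(0.0)
    mean_reward = float(np.mean(episode_rewards))
    print("mean reward: %.2f, episodes: %d" % (mean_reward, len(episode_rewards)))
    return mean_reward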