def test(testing_data, model_file, result):
    model = TRPO.load(model_file)

    # set testing environment
    stock_test_data = StocksData.read_csv(testing_data)
    stocks_test_env = StocksEnv(stock_test_data, bars_count=10, reset_on_close=False)
    obs = stocks_test_env.reset()

    # set vars for recording results
    result_df = pandas.DataFrame([], columns=['date', 'open', 'action', 'reward'])
    net_reward = 0.0

    while True:
        action, _states = model.predict(obs)
        obs, reward, done, info = stocks_test_env.step(action)

        # print and record the offset, action taken, reward, opening price
        df = pandas.DataFrame([[
            stock_test_data.date[int(info["offset"])],
            stock_test_data.open[int(info["offset"])],
            Actions(action).name,
            reward
        ]], columns=['date', 'open', 'action', 'reward'])
        print(df)
        result_df = result_df.append(df, ignore_index=True)
        net_reward += reward

        # at end of episode, record results and exit
        if done:
            print('Net Reward: ', net_reward)
            result_df.to_csv(result, index=False)
            break

def test_models(env):
    # seeds = [1, 2, 3]
    seeds = [1]
    for s in seeds:
        # Load Models
        # models = [A2C.load(f'data/models/a2c_{s}'),
        #           ACKTR.load(f'data/models/acktr_{s}'),
        #           DDPG.load(f'data/models/ddpg_{s}'),
        #           PPO2.load(f'data/models/ppo_{s}'),
        #           SAC.load(f'data/models/sac_{s}'),
        #           TD3.load(f'data/models/td3_{s}'),
        #           TRPO.load(f'data/models/trpo_{s}')]
        models = [PPO2.load(f'data/models/ppo_{s}'),
                  SAC.load(f'data/models/sac_{s}'),
                  TD3.load(f'data/models/td3_{s}'),
                  TRPO.load(f'data/models/trpo_{s}')]
        for m in models:
            # run_policy(m, env)
            og_params = m.get_parameters()
            generalization_test(m, env)
            for i in range(50):
                params = prune_policy(m.__class__.__name__, og_params, 0.1)
                m.load_parameters(params)
                generalization_test(m, env)

def test(model_path: str, exp_config: dict):
    test_env, _ = init_env(exp_config)

    if ALG == 'ddpg':
        model = DDPG.load(model_path, env=test_env)
    elif ALG == 'trpo':
        model = TRPO.load(model_path, env=test_env)
    elif ALG == 'ppo2':
        model = PPO2.load(model_path, env=test_env)
    elif ALG == 'her':
        # model = HER.load(model_path, env=test_env)
        raise NotImplementedError()
    else:
        raise ValueError(f'Unknown algorithm "{ALG}"!')

    monitor = test_env.envs[0]  # type: Monitor
    assert isinstance(monitor, Monitor)
    raw_env = monitor.unwrapped  # type: GaussianPendulumEnv
    assert isinstance(raw_env, GaussianPendulumEnv)

    raw_env.configure(
        seed=42,
        mass_mean=(0.05, 1.5),
        mass_stdev=(0.01, 0.15),
        embed_knowledge=exp_config.get('embed_knowledge', False),
        perfect_knowledge=exp_config.get('perfect_knowledge', False),
        gym_env=test_env,
    )

    runs = np.zeros((TEST_RUNS, 4))
    fixed_masses = np.linspace(0.030, 1.600, TEST_RUNS)

    for test_ep in range(runs.shape[0]):
        obs = test_env.reset()
        if TEST_LINSPACE_MASS:
            p = raw_env.physical_props
            raw_env.physical_props = p[0], fixed_masses[test_ep], p[2]
        mass_distr_params = raw_env.mass_distr_params.copy()
        sampled_mass = raw_env.physical_props[1]
        while True:
            action, states = model.predict(obs, deterministic=True)
            obs, rewards, dones, info = test_env.step(action)
            rewards_by_episode = monitor.episode_rewards
            episode = len(rewards_by_episode)
            if episode != test_ep:
                break
        last_tot_reward = rewards_by_episode[-1]
        runs[test_ep, :] = mass_distr_params[0], mass_distr_params[1], sampled_mass, last_tot_reward

    avg_reward = runs[:, 3].mean()
    print(f'Avg. test reward: {avg_reward}\n')
    return runs

def load_model(path: str, env, desc: str):
    """
    Loads a model from a Stable Baselines checkpoint file into a memory representation.

    Args:
        path (str)   : Path to the Stable Baselines checkpoint file
        env (SB Env) : Environment to attach to the loaded model
        desc (str)   : Text description of what model this is

    Returns:
        The loaded model
    """
    if desc == "ddpg":
        return DDPG.load(path, env)
    elif desc == "ppo":
        env = DummyVecEnv([lambda: env])
        return PPO2.load(path, env)
    elif desc == "trpo":
        env = DummyVecEnv([lambda: env])
        return TRPO.load(path, env)
    elif desc == "td3":
        return TD3.load(path, env)
    elif desc == "sac":
        return SAC.load(path, env)
    else:
        raise RuntimeError(f"Model Name {desc} not supported")

def train_trpo(seed):
    """
    Test TRPO on the uav_env (cartesian, discrete).

    TRPO(policy, env, gamma=0.99, timesteps_per_batch=1024, max_kl=0.01,
         cg_iters=10, lam=0.98, entcoeff=0.0, cg_damping=0.01,
         vf_stepsize=0.0003, vf_iters=3, verbose=0, tensorboard_log=None,
         _init_setup_model=True)
    """
    algo = 'TRPO'
    num_timesteps = 3000000

    env = set_up_env(seed)

    global best_mean_reward, n_steps
    best_mean_reward, n_steps = -np.inf, 0

    # Tested with: timesteps_per_batch=1024
    model = TRPO(policy=MlpPolicy, env=env, gamma=0.99, timesteps_per_batch=128,
                 max_kl=0.01, cg_iters=10, lam=0.98, entcoeff=0.0,
                 cg_damping=0.01, vf_stepsize=0.0003, vf_iters=3, verbose=0,
                 tensorboard_log="./logs/{}/tensorboard/{}/".format(EXPERIMENT_NATURE, algo))

    model.learn(total_timesteps=num_timesteps, callback=callback, seed=seed,
                log_interval=500, tb_log_name="seed_{}".format(seed))

    model = TRPO.load(log_dir + 'best_model.pkl')

    evaluation = evaluate_model(env, model, 100)

    os.makedirs('./logs/{}/csv/{}/'.format(EXPERIMENT_NATURE, algo), exist_ok=True)
    os.rename('/tmp/gym/monitor.csv',
              "./logs/{}/csv/{}/seed_{}.csv".format(EXPERIMENT_NATURE, algo, seed))

    env.close()
    del model, env
    gc.collect()

    return evaluation

def mainUp(arg):
    test = arg == TEST
    env = fet.FurutaEnvPosTrpoUp(cm.RUN, render=not test)
    # env.setRender(True)
    model = TRPO.load(POLICY_PATH + "trpo_pos_policy_up.zip")

    buf_rew = []
    test_cutoff_count = 0
    test_count = 0
    overspeed = 0
    total_count = 0
    while True:
        test_count += 1
        if test and test_count >= TEST_COUNT_UP:
            print("\n***Average reward: %.3f\tAverage count: %.3f\tShort runs: %d" %
                  (sum(buf_rew) / float(len(buf_rew)),
                   total_count / float(test_count),
                   test_cutoff_count - overspeed))
            break
        obs, done = env.reset(), False
        episode_rew = 0
        count = 0
        while not done:
            action, _ = model.predict(obs)
            obs, rew, done, _ = env.step(action)
            if speedCheck(obs):
                overspeed += 1
            episode_rew += rew
            count += 1
            total_count += 1
        buf_rew.append(episode_rew)
        if test and count <= TEST_CUTOFF_MAX:
            test_cutoff_count += 1
        print("Episode average reward: %.3f\tCount: %d" % (episode_rew / count, count))

def my_compute_data(self, args, env, params, n_episodes):
    env = gym.make('gym_quadcopter:quadcopter-v' + str(args.env))
    for alg, start_index, end_index, step, suffix in params:
        re_d = []
        sr_d = []
        rewards, s_rates = [], []
        for i in range(start_index, end_index, step):
            print("")
            print(f"Working on alg={alg}, start_index={start_index}, end_index={end_index}, "
                  f"step={step}, suffix={suffix}, i={i}")
            path = f"{self.base_dir}models/{alg}/quadcopter-v{args.env}-{i}{suffix}.pkl"
            print(f"Evaluating model at {path}")
            if not os.path.exists(path):
                print(f"WARNING: File {path} does not exist --> SKIPPING")
                continue
            if alg == "ddpg":
                model = DDPG.load(path)
            elif alg == "ppo":
                model = PPO2.load(path)
            else:
                model = TRPO.load(path)
            r, su = mean_eval(n_episodes, model, env, False, False)
            print(f"Average Success Rate: {su}")
            rewards.append(r)
            s_rates.append(su[0])
        i_max = np.argmax(s_rates)
        re_d.append(rewards)
        sr_d.append(s_rates)
    return re_d, sr_d

def loader(algo, env_name):
    if algo == 'dqn':
        return DQN.load("trained_agents/" + algo + "/" + env_name + ".pkl")
    elif algo == 'ppo2':
        return PPO2.load("trained_agents/" + algo + "/" + env_name + ".pkl")
    elif algo == 'a2c':
        return A2C.load("trained_agents/" + algo + "/" + env_name + ".pkl")
    elif algo == 'acer':
        return ACER.load("trained_agents/" + algo + "/" + env_name + ".pkl")
    elif algo == 'trpo':
        return TRPO.load("trained_agents/" + algo + "/" + env_name + ".pkl")

def mainHybrid(arg):
    test = arg == TEST
    env = fet.FurutaEnvPosTrpo(cm.RUN, render=not test)
    # env.setRender(True)
    modelBal = TRPO.load(POLICY_PATH + "trpo_pos_policy_bal.zip")
    modelUp = TRPO.load(POLICY_PATH + "trpo_pos_policy_up.zip")

    buf_rew = []
    test_cutoff_count = 0
    test_count = 0
    overspeed = 0
    complete_count = 0
    while True:
        test_count += 1
        if test and test_count >= TEST_COUNT_HYBRID:
            print("\n***Average reward: %.3f\tLong runs: %d\tComplete: %d" %
                  (sum(buf_rew) / float(len(buf_rew)),
                   test_cutoff_count - overspeed,
                   complete_count))
            break
        obs, done = env.reset(), False
        episode_rew = 0
        count = 0
        while not done:
            if abs(obs[2]) > cm.deg2Rad(cm.ANGLE_TERMINAL_MIN_D):
                action, _ = modelUp.predict(obs)
            else:
                action, _ = modelBal.predict(obs)
            obs, rew, done, _ = env.step(action)
            if speedCheck(obs):
                overspeed += 1
            episode_rew += rew
            count += 1
        if count > 999:
            complete_count += 1
        buf_rew.append(episode_rew)
        if test and count >= TEST_CUTOFF_MAX:
            test_cutoff_count += 1
        print("Episode reward: %.3f" % (episode_rew))

def f_checkpoints_range_2_mean_performance(
        self, checkpoints: range) -> Tuple[np.ndarray, np.ndarray]:
    logging.debug(
        f"[f_checkpoints_range_2_mean_performance]: checkpoints={checkpoints}")
    rewards = np.zeros(len(checkpoints))
    s_rates = np.zeros(len(checkpoints))

    # Intent
    # - Iterate over this range to load the associated Stable Baselines model checkpoint
    # - Pass that model to the `mean_eval` evaluation function, which evaluates the model on
    #   - a certain number of episodes
    #   - a certain env
    #   - continuous or non-continuous space
    #   - each evaluation returns a reward and an average success rate
    #
    # Evaluate N checkpoints on M queries, then average over M, so as to finally
    # have N rewards and N success rates.
    j = 0
    # NOTE: i can range in any way while j iterates over the numpy array
    for i in checkpoints:
        path = f"{self.args.training_base_path}/models/quadcopter-{i}{self.args.suffix}"
        logging.debug(f"Evaluating model at {path}")
        if self.args.model['name'] == "ddpg":
            model = DDPG.load(path)
        elif self.args.model['name'] == "ppo":
            model = PPO2.load(path)
        elif self.args.model['name'] == "trpo":
            model = TRPO.load(path)
        elif self.args.model['name'] == "td3":
            model = TD3.load(path)
        elif self.args.model['name'] == "sac":
            model = SAC.load(path)
        logging.debug(
            f"Evaluating Model {self.args.model['name']} for {self.args.n_episodes} episodes "
            f"in {self.args.env} environment with continuous={str(self.args.continuous)}")
        rewards_list, success_rates_list = mean_eval(
            num_episodes=self.args.n_episodes,
            checkpoint_id=i,
            model=model,
            env=self.env,
            v=True,
            continuous=self.args.continuous,
            plots_dir=self.args.plots_dir)
        rewards_mean = np.mean(rewards_list)
        success_rates_mean = np.mean(success_rates_list)
        logging.debug(
            f"Evaluation Checkpoint={i} --> Average Reward = {rewards_mean}, "
            f"Average Success Rate = {success_rates_mean}")
        rewards[j] = rewards_mean
        s_rates[j] = success_rates_mean
        j += 1
    return rewards, s_rates

def load_model(path: str, algorithm: str):
    from stable_baselines import PPO2, DQN, A2C, ACER, GAIL, TRPO

    if algorithm == 'PPO2':
        return PPO2.load(path)
    if algorithm == 'DQN':
        return DQN.load(path)
    if algorithm == 'A2C':
        return A2C.load(path)
    if algorithm == 'ACER':
        return ACER.load(path)
    if algorithm == 'GAIL':
        return GAIL.load(path)
    if algorithm == 'TRPO':
        return TRPO.load(path)
    return None

def main():
    # unpause simulation so that the robot receives data on all topics
    gazebo_connection.GazeboConnection().unpauseSim()
    # create node
    rospy.init_node('pickbot_gym', anonymous=True, log_level=rospy.FATAL)

    env = gym.make('Pickbot-v0')
    model = TRPO.load("pickbot_model_trpo_discrete_2019-03-11 10:22:01")

    while True:
        obs, done = env.reset(), False
        episode_rew = 0
        while not done:
            # query the policy at every step instead of reusing the first action
            action, _states = model.predict(obs)
            obs, rewards, done, info = env.step(action)
            episode_rew += rewards
        print("Episode reward", episode_rew)

def render_to_gif():
    def save_frames_as_gif(frames, path='./', filename='growspace_with_trpo.gif'):
        # Mess with this to change frame size
        plt.figure(figsize=(frames[0].shape[1] / 72.0, frames[0].shape[0] / 72.0), dpi=72)

        patch = plt.imshow(frames[0])
        plt.axis('off')

        def animate(i):
            patch.set_data(frames[i])

        anim = animation.FuncAnimation(plt.gcf(), animate, frames=len(frames), interval=50)
        anim.save(path + filename, writer='imagemagick', fps=60)

    env = gym.make('GrowSpaceEnv-Control-v0')

    model = TRPO(MlpPolicy, env, verbose=1)
    # model.learn(total_timesteps=2500)
    # model.save("trpo_cartpole")
    # del model  # remove to demonstrate saving and loading
    model = TRPO.load("trpo_cartpole")

    frames = []
    obs = env.reset()
    for _ in range(150):
        # while True:
        frames.append(env.render(mode="rgb_array"))
        action, _states = model.predict(obs)
        obs, rewards, done, info = env.step(action)
        # if done:
        #     break
        # env.render()
    env.close()
    save_frames_as_gif(frames)

def render_growspace_with_trpo():
    env = gym.make('GrowSpaceEnv-Control-v0')

    model = TRPO(MlpPolicy, env, verbose=1)
    # model.learn(total_timesteps=2500)
    # model.save("trpo_cartpole")
    #
    # del model  # remove to demonstrate saving and loading
    model = TRPO.load("trpo_cartpole")

    obs = env.reset()
    for t in range(150):
        print(t)
        # while True:
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        # if dones:
        #     env.reset()
        env.render()

def trpo(env_id, timesteps, policy="MlpPolicy", log_interval=None,
         tensorboard_log=None, seed=None, load_weights=None):
    from stable_baselines import TRPO

    env = gym.make(env_id)

    if load_weights is not None:
        model = TRPO.load(load_weights, env=env, verbose=0)
    else:
        model = TRPO(policy, env, verbose=1, tensorboard_log=tensorboard_log)

    callback = WandbRenderEnvCallback(model_name="trpo", env_name=env_id)

    model.learn(total_timesteps=timesteps, log_interval=log_interval, callback=callback)

def visual_test(model_path: str):
    test_env, _ = init_env()
    model = TRPO.load(model_path, env=test_env)

    monitor = test_env.envs[0]  # type: Monitor
    assert isinstance(monitor, Monitor)
    raw_env = monitor.unwrapped  # type: CartPoleEnv
    assert isinstance(raw_env, CartPoleEnv)

    for _ in range(5):
        obs = test_env.reset()
        for _ in range(500):
            test_env.render()
            action, states = model.predict(obs)
            obs, rewards, dones, info = test_env.step(action)
            sleep(1. / 60)

    test_env.close()

def train(game, num_timesteps, num_envs, dir_name, model_name, prev_model_name):
    dir_name = get_valid_filename(dir_name)
    model_name = get_valid_filename(model_name)
    log_dir = f"logs/{dir_name}/{model_name}-training"
    model_dir = f"models/{dir_name}"
    os.makedirs(log_dir, exist_ok=True)
    os.makedirs(model_dir, exist_ok=True)

    env = make_vec_envs(game, False, num_envs)
    prev_model_path = f"{model_dir}/{prev_model_name}.zip"
    if prev_model_name is not None and os.path.exists(prev_model_path):
        model = TRPO.load(prev_model_path, env=env)
        model.tensorboard_log = log_dir
    else:
        model = TRPO(policy="MlpPolicy", env=env, gamma=0.8, verbose=1,
                     tensorboard_log=log_dir)
    model.learn(num_timesteps)
    model.save(f"{model_dir}/{model_name}.zip")
    env.close()

def tst():
    def _init_openmpi():
        """Pre-load libmpi.dll and register the OpenMPI distribution."""
        import os
        import ctypes
        if os.name != 'nt' or 'OPENMPI_HOME' in os.environ:
            return
        try:
            openmpi_home = os.path.abspath(os.path.dirname(__file__))
            openmpi_bin = os.path.join(openmpi_home, 'bin')
            os.environ['OPENMPI_HOME'] = openmpi_home
            os.environ['PATH'] = ';'.join((openmpi_bin, os.environ['PATH']))
            ctypes.cdll.LoadLibrary(os.path.join(openmpi_bin, 'libmpi.dll'))
        except Exception:
            pass

    _init_openmpi()

    import gym
    from stable_baselines.common.policies import MlpPolicy, CnnPolicy
    from stable_baselines import TRPO

    env = gym.make('BreakoutNoFrameskip-v4')  # 'CartPole-v1'
    model = TRPO(CnnPolicy, env, timesteps_per_batch=1024, verbose=1)
    model.learn(total_timesteps=25000)
    model.save("trpo_cartpole")

    del model  # remove to demonstrate saving and loading

    model = TRPO.load("trpo_cartpole")
    obs = env.reset()
    while True:
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        env.render()

def mainBal(arg):
    test = arg == TEST
    env = fet.FurutaEnvPosTrpoBal(cm.RUN, render=not test)
    # env.setRender(not test)
    model = TRPO.load(POLICY_PATH + "trpo_pos_policy_bal.pkl")

    buf_rew = []
    test_cutoff_count = 0
    complete_count = 0
    test_count = 0
    overspeed = 0
    total_count = 0
    while True:
        test_count += 1
        if test and test_count >= TEST_COUNT_BAL:
            print("\n***Average reward: %.3f\tLong runs: %d\tAverage count: %.3f\tCompleted: %d\tOverspeed: %d***\n" %
                  (sum(buf_rew) / float(len(buf_rew)),
                   test_cutoff_count,
                   total_count / float(test_count),
                   complete_count,
                   overspeed))
            break
        obs, done = env.reset(), False
        # obs[4] = ARM_TARGET_RAD
        episode_rew = 0
        count = 0
        while not done:
            action, _ = model.predict(obs)
            obs, rew, done, _ = env.step(action)
            # obs[4] = ARM_TARGET_RAD
            if speedCheck(obs):
                overspeed += 1
            episode_rew += rew
            count += 1
            total_count += 1
        if count > 999:
            complete_count += 1
        buf_rew.append(episode_rew)
        if test and count >= TEST_CUTOFF_MIN:
            test_cutoff_count += 1
        print("Episode reward: %.3f\tCount: %d" % (episode_rew, count))

def visual_test(model_path: str, exp_config: dict):
    test_env, _ = init_env(exp_config)

    if ALG == 'ddpg':
        model = DDPG.load(model_path, env=test_env)
    elif ALG == 'trpo':
        model = TRPO.load(model_path, env=test_env)
    elif ALG == 'ppo2':
        model = PPO2.load(model_path, env=test_env)
    elif ALG == 'her':
        # model = HER.load(model_path, env=test_env)
        raise NotImplementedError()
    else:
        raise ValueError(f'Unknown algorithm "{ALG}"!')

    monitor = test_env.envs[0]  # type: Monitor
    assert isinstance(monitor, Monitor)
    raw_env = monitor.unwrapped  # type: GaussianPendulumEnv
    assert isinstance(raw_env, GaussianPendulumEnv)

    for _ in range(5):
        obs = test_env.reset()
        mass_distr_params = raw_env.mass_distr_params
        sampled_mass = raw_env.physical_props[1]
        print(f'==> distribution params: {mass_distr_params} (mean, stdev) | sampled mass: {sampled_mass}')
        for _ in range(200):
            test_env.render()
            action, states = model.predict(obs, deterministic=True)
            obs, rewards, dones, info = test_env.step(action)
            sleep(1. / 60)

    test_env.close()

environment = 'Swimmer-v2'
path = ('Results/' + environment + '_seed=' + str(seed) + '_run=' + str(run) +
        '_total_timesteps=' + str(total_timesteps) + '_trpo_episode_reward.npy')
pathmodel = ('Results/' + environment + '_seed=' + str(seed) + '_run=' + str(run) +
             '_total_timesteps=' + str(total_timesteps) + '_trpo')

env = gym.make(environment)
env = DummyVecEnv([lambda: env])
# Automatically normalize the input features
env = VecNormalize(env, norm_obs=True, norm_reward=False, clip_obs=10.)

model = TRPO(MlpPolicy, env, verbose=1)
model.learn(total_timesteps=total_timesteps, path=path, seed=seed)
model.save(pathmodel)

# Don't forget to save the running average when saving the agent
log_dir = "/tmp/"
env.save_running_average(log_dir)

'''
del model  # remove to demonstrate saving and loading
'''
# load the agent that was just saved above
model = TRPO.load(pathmodel)
obs = env.reset()
while True:
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    env.render()

                     env, n_steps=64, verbose=1, tensorboard_log=out_dir)
    elif args.model == 'sac':
        model = SAC("CnnPolicy", env)
    train(model, env, out_dir)
else:
    # results_plotter.plot_results([log_dir], time_steps, results_plotter.X_TIMESTEPS, "rl")
    path = '{}/best_model.zip'.format(args.eval)
    env = CarEnv(args.eval, cam_idx_list=(0, 3, 4))
    env.next_weather()
    # env = Monitor(env, args.eval)
    # print(env.num_envs)
    if args.model == 'trpo':
        model = TRPO.load(path)
    elif args.model == 'acer':
        model = ACER.load(path)
    elif args.model == 'ppo':
        model = PPO2.load(path)
    elif args.model == 'acktr':
        model = ACKTR.load(path)
    elif args.model == 'ddpg':
        model = DDPG.load(path)
    elif args.model == 'a2c':
        model = A2C.load(path)
    elif args.model == 'sac':
        model = SAC.load(path)
    # mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=5, return_episode_rewards=True)
    # eps_rewards, eps_len = evaluate_policy(model, env, n_eval_episodes=5, return_episode_rewards=True)
    # print(eps_rewards)

    model.set_env(env)
    model.learn(total_timesteps=int(args.total_timesteps))

    # library helper
    plot_results(
        [log_dir],
        int(args.total_timesteps),
        results_plotter.X_TIMESTEPS,
        "TRPO muscle" + identifer,
    )
    plt.savefig("convergence_plot" + identifer + ".png")
    model.save("policy-" + identifer)
else:
    # Use trained policy for the simulation.
    model = TRPO.load("trpo_" + identifer)

    obs = env.reset()
    done = False
    score = 0
    while not done:
        action, _states = model.predict(obs)
        obs, rewards, done, info = env.step(action)
        score += rewards
        if info["ctime"] > final_time:
            break
    print("Final Score:", score)
    env.post_processing(
        filename_video="video-" + identifer + ".mp4",
        SAVE_DATA=True,
    )

from stable_baselines import TRPO
from stable_baselines import PPO2
from snake_env.gym_swimmer_env import SwimmerLocomotionEnv
import numpy as np

fixed_path = [(-0.2 * i, 0) for i in range(30)]
use_random_path = False
robot_k = 1.0
robot_link_length = 0.3

# these are for testing
# model = TRPO.load("trpo_swimmer")
model = TRPO.load("real_trpo_swimmer_traj_following")

env = SwimmerLocomotionEnv(path=fixed_path,
                           random_path=use_random_path,
                           use_hard_path=False,
                           robot_link_length=robot_link_length,
                           robot_k=robot_k,
                           record_trajectory=True)
obs = env.reset()

total_reward = 0
x_list = []
for i in range(10000):
    action, _states = model.predict(obs)
    # step_time = 0.5
    # action = [-0.8*np.sin(step_time*i), 0.8*np.cos(step_time*i)]
    # print("start of step")
    print(action)
    x_list.append(action[1])

elif AGENT_ALGORITHM == "PPO2":
    # Create model
    model = PPO2(MlpPolicy, env, verbose=1, tensorboard_log=global_path + "tb")
    # Load if pretrained
    if PRETRAINED_MODEL:
        model = PPO2.load(global_path + pretrained_model_name, env=env)
        print("INFO: Loaded model " + global_path + pretrained_model_name)
elif AGENT_ALGORITHM == "TRPO":
    # Create model
    model = TRPO(MlpPolicy, env, verbose=1, tensorboard_log=global_path + "tb")
    # Load if pretrained
    if PRETRAINED_MODEL:
        model = TRPO.load(global_path + pretrained_model_name, env=env)
        print("INFO: Loaded model " + global_path + pretrained_model_name)
else:
    raise RuntimeError('ERROR: Agent not recognized')


def evaluate(model, num_steps=1000, pub=None):
    """
    Evaluate a RL agent
    :param model: (BaseRLModel object) the RL Agent
    :param num_steps: (int) number of timesteps to evaluate it
    :return: (float) Mean reward for the last 100 episodes
    """
    episode_rewards = [0.0]
    obs = env.reset()

def __init__(self, obs_shape, action_space, base=None, base_kwargs=None,
             load_expert=None, env_name=None, rl_baseline_zoo_dir=None,
             expert_algo=None, normalize=True):
    super(Policy, self).__init__()

    # TODO: Pass these parameters in
    self.epsilon = 0.1
    self.dril = True

    if base_kwargs is None:
        base_kwargs = {}

    if base is None:
        if env_name in ['duckietown']:
            base = DuckieTownCNN
        elif len(obs_shape) == 3:
            base = CNNBase
        elif len(obs_shape) == 1:
            base = MLPBase
        else:
            raise NotImplementedError

    self.base = base(obs_shape[0], normalize=normalize, **base_kwargs)

    self.action_space = None
    if action_space.__class__.__name__ == "Discrete":
        num_outputs = action_space.n
        self.dist = Categorical(self.base.output_size, num_outputs)
        self.action_space = "Discrete"
    elif action_space.__class__.__name__ == "Box":
        num_outputs = action_space.shape[0]
        self.dist = DiagGaussian(self.base.output_size, num_outputs)
        self.action_space = "Box"
    elif action_space.__class__.__name__ == "MultiBinary":
        raise Exception('Error')
    else:
        raise NotImplementedError

    if load_expert == True and env_name not in ['duckietown', 'highway-v0']:
        print('[Loading Expert --- Base]')
        model_path = os.path.join(rl_baseline_zoo_dir, 'trained_agents', f'{expert_algo}')
        try:
            import mpi4py
            from stable_baselines import TRPO
        except ImportError:
            mpi4py = None
            DDPG, TRPO = None, None
        from stable_baselines import PPO2

        model_path = f'{model_path}/{env_name}.pkl'
        if env_name in ['AntBulletEnv-v0']:
            baselines_model = TRPO.load(model_path)
        else:
            baselines_model = PPO2.load(model_path)

        for key, value in baselines_model.get_parameters().items():
            print(key, value.shape)

        if base.__name__ == 'CNNBase':
            print(['Loading CNNBase expert model'])
            params = copy_cnn_weights(baselines_model)
        elif load_expert == True and base.__name__ == 'MLPBase':
            print(['Loading MLPBase expert model'])
            params = copy_mlp_weights(baselines_model)

        # TODO: I am not sure what this is doing
        try:
            self.load_state_dict(params)
            self.obs_shape = obs_shape[0]
        except:
            self.base = base(obs_shape[0] + 1, **base_kwargs)
            self.load_state_dict(params)
            self.obs_shape = obs_shape[0] + 1

def main():
    parser = argparse.ArgumentParser(
        description='Plotting mechanisms for GARAT and related modifications')
    parser.add_argument('--sim_env', default="InvertedPendulum-v2", type=str,
                        help="Name of the simulator/source environment")
    parser.add_argument('--real_env', default="InvertedPendulumModified-v2", type=str,
                        help="Name of the real/target environment")
    parser.add_argument(
        '--load_policy_path',
        default="data/models/TRPO_initial_policy_steps_InvertedPendulum-v2_2000000_.pkl",
        help="relative path of policy to be used for generating plots")
    parser.add_argument(
        '--load_atp_path',
        default="data/models/garat/Single_GAIL_sim2real_TRPO_2000000_1000_50_0/",
        type=str,
        help="relative path for stored Action transformation policies")
    parser.add_argument('--seed', default=0, type=int, help="Random seed")

    args = parser.parse_args()

    # Set seed
    np.random.seed(args.seed)

    sim_env = gym.make(args.sim_env)
    real_env = gym.make(args.real_env)
    policy = TRPO.load(args.load_policy_path)

    action_tf_policy_list_single = []
    action_tf_policy_list_double = []
    action_tf_policy_list_shared_double = []
    action_tf_policy_list_airl = []
    num_grounding = 50

    atp_path_single = args.load_atp_path
    atp_path_double = args.load_atp_path.replace('_0', '_2')
    atp_path_shared_double = args.load_atp_path.replace('_0', '_1')
    atp_path_airl = args.load_atp_path.replace(
        'Single_GAIL_sim2real_TRPO_2000000_1000_50_0',
        'Single_AIRL_sim2real_TRPO_2000000_1000_50_1')

    print('################## Begin File loading ##################')
    for index in range(num_grounding):
        file_path_single = os.path.join(
            atp_path_single, "action_transformer_policy1_" + str(index) + ".pkl")
        print(file_path_single)
        action_tf_policy_list_single.append(PPO2.load(file_path_single))

        file_path_double = os.path.join(
            atp_path_double, "action_transformer_policy1_" + str(index) + ".pkl")
        print(file_path_double)
        action_tf_policy_list_double.append(PPO2.load(file_path_double))

        file_path_shared_double = os.path.join(
            atp_path_shared_double, "action_transformer_policy1_" + str(index) + ".pkl")
        print(file_path_shared_double)
        action_tf_policy_list_shared_double.append(PPO2.load(file_path_shared_double))

        # file_path_airl = os.path.join(atp_path_airl, "action_transformer_policy1_" + str(index) + ".pkl")
        # print(file_path_airl)
        # action_tf_policy_list_airl.append(PPO2.load(file_path_airl))

    results_dict = {}
    print('################## File loading Completed ##################')

    results_single = calculate_transition_errors(sim_env, real_env, policy,
                                                 action_tf_policy_list_single)
    print('############## Begin Double Discriminator Calculations')
    results_shared_double = calculate_transition_errors(
        sim_env, real_env, policy, action_tf_policy_list_shared_double)
    results_double = calculate_transition_errors(sim_env, real_env, policy,
                                                 action_tf_policy_list_double)
    print('############## Begin AIRL Calculations')
    # results_airl = calculate_transition_errors(sim_env, real_env, policy, action_tf_policy_list_airl)

    results_dict['GARAT'] = results_single
    results_dict['GARAT Double Discriminator'] = results_double
    results_dict['GARAT Double Discriminator (Generator LR modifications)'] = results_shared_double
    # results_dict['GARAT AIRL'] = results_airl

    plot_results(results_dict)

def _evaluation_worker(test_data, model_type, model_path, perfect_knowledge,
                       episode_length=200, mpc_sequences=2000, model_kwargs=None):
    # init environment
    env = gym.make('GaussianPendulum-v0')
    model_kwargs = model_kwargs or dict()
    vision = False

    if model_type.startswith('mpc'):
        # load model
        if model_type == 'mpc-mdn':
            model = MDN_Model.load(model_path, **model_kwargs)
        elif model_type == 'mpc-mlp':
            model = MlpModel.load(model_path, **model_kwargs)
        elif model_type == 'mpc-sim':
            model = PendulumSim(env, **model_kwargs)
        elif model_type == 'mpc-vae-mlp':
            model = VaeTorchModel(model_path, **model_kwargs)
            vision = True
        else:
            raise NotImplementedError

        mpc = MPC(env, model, horizon=20, n_action_sequences=mpc_sequences, np_random=None)

        def next_action(obs):
            return mpc.get_action(obs)

        model_info = dict(type=model_type, horizon=mpc.horizon,
                          sequences=mpc.n_action_sequences,
                          perfect_knowledge=perfect_knowledge)

    elif model_type == 'trpo':
        # load model
        model = TRPO.load(model_path, env=env, **model_kwargs)

        def next_action(obs):
            action, _ = model.predict(obs, deterministic=True)
            return action

        model_info = dict(type='trpo', perfect_knowledge=perfect_knowledge)

    else:
        raise NotImplementedError

    rewards = _run_model(env, next_action, test_data,
                         episode_length=episode_length,
                         embed_knowledge=perfect_knowledge,
                         perfect_knowledge=perfect_knowledge,
                         vision=vision)

    results = pd.DataFrame(test_data)
    results = results.assign(rewards=pd.Series(rewards).values)
    results = results.assign(model_info=[model_info] * len(results))
    return results

def main():
    # parameters for the gym_carla environment
    params = {
        'number_of_vehicles': 8,
        'number_of_walkers': 0,
        'display_size': 256,  # screen size of bird-eye render
        'max_past_step': 1,  # the number of past steps to draw
        'dt': 0.1,  # time interval between two frames
        'discrete': True,  # whether to use discrete control space
        'continuous_accel_range': [-3.0, 3.0],  # continuous acceleration range
        'ego_vehicle_filter': 'vehicle.lincoln*',  # filter for defining ego vehicle
        'port': 2000,  # connection port
        'town': 'Town06',  # which town to simulate
        'task_mode': 'acc_1',  # mode of the task, [random, roundabout (only for Town03)]
        'max_time_episode': 1000,  # maximum timesteps per episode
        'max_waypt': 12,  # maximum number of waypoints
        'obs_range': 32,  # observation range (meter)
        'lidar_bin': 0.125,  # bin size of lidar sensor (meter)
        'd_behind': 12,  # distance behind the ego vehicle (meter)
        'out_lane_thres': 2.0,  # threshold for out of lane
        'desired_speed': 16.67,  # desired speed (m/s)
        'max_ego_spawn_times': 200,  # maximum times to spawn ego vehicle
        'display_route': True,  # whether to render the desired route
        'pixor_size': 64,  # size of the pixor labels
        'pixor': False,  # whether to output PIXOR observation
        'RGB_cam': True,  # whether to use RGB camera sensor
    }

    solver_params = {
        'layers': [64, 64, 64],
        'alpha': 0.001,
        'gamma': 0.99,
        'epsilon': 0.1,
        'replay_memory_size': 500000,
        'update_target_estimator_every': 10000,
        'batch_size': 64,
    }

    # Set gym-carla environment
    env = gym.make('carla-v0', params=params)
    # check_env(env)
    obs = env.reset()

    checkpoint_callback = CheckpointCallback(save_freq=5000,
                                             save_path='./trpo_checkpoint/',
                                             name_prefix='trpo_check')

    # model = DQN.load("./trpo_checkpoint/trpo_check_200_steps.zip", env=env, tensorboard_log="./trpo")
    model = TRPO(MlpPolicy, env, verbose=1, tensorboard_log="./trpo")
    model.learn(total_timesteps=35000, tb_log_name="35k-with-checkoint",
                callback=checkpoint_callback)
    model.save("trpo_carla")

    del model  # remove to demonstrate saving and loading

    model = TRPO.load("trpo_carla")
    obs = env.reset()
    for i in range(100):
        while True:
            action, _states = model.predict(obs)
            obs, rewards, dones, info = env.step(action)
            if dones:
                obs = env.reset()
                break

def evaluate(game, num_eps, num_envs, dir_name, model_name):
    dir_name = get_valid_filename(dir_name)
    model_name = get_valid_filename(model_name)
    log_dir = f"logs/{dir_name}/{model_name}"
    os.makedirs(log_dir, exist_ok=True)

    env = make_vec_envs(game, True, num_envs, model_name=model_name)
    model_path = f"models/{dir_name}/{model_name}.zip"
    model = TRPO.load(model_path, env=env)
    model.tensorboard_log = log_dir

    eps_done = 0
    ep_rewards = np.array([0] * num_eps)
    curr_rewards = [0] * num_envs
    obs = env.reset()
    while eps_done != num_eps:
        # For vectorised environments, they are automatically reset when done,
        # so the returned obs is the start state of the next episode
        action, _ = model.predict(obs)
        obs, reward, done, _ = env.step(action)
        env.render(mode="human")
        for i in range(num_envs):
            curr_rewards[i] += reward[i]
            if done[i]:
                ep_rewards[eps_done] = curr_rewards[i]
                curr_rewards[i] = 0
                eps_done += 1
    print("All episodes completed")
    env.close()

    mean = ep_rewards.mean()
    std_dev = ep_rewards.std()
    # Outliers: outside of 3 standard deviations
    outlier_threshold_upper = mean + 3 * std_dev
    outlier_threshold_lower = mean - 3 * std_dev
    trimmed_rewards = np.array([
        rew for rew in ep_rewards
        if outlier_threshold_lower <= rew <= outlier_threshold_upper
    ])
    avg_reward = trimmed_rewards.mean()
    best_reward = ep_rewards.max()
    print(f"Average score over {num_eps} games: {avg_reward:.2f}")
    print(f"Best score: {best_reward}")

    summary_writer = tf.summary.FileWriter(log_dir)
    sess = tf.Session()

    rew_var = tf.Variable(0, dtype=tf.int64)
    rew_val = tf.summary.scalar(f"Reward / Episode ({model_name})", rew_var)
    for i in range(num_eps):
        rew = ep_rewards[i]
        sess.run(rew_var.assign(rew))
        summary_writer.add_summary(sess.run(rew_val), i)

    best_val = tf.summary.scalar(f"Best Reward", rew_var)
    sess.run(rew_var.assign(best_reward))
    summary_writer.add_summary(sess.run(best_val), 0)

    avg_var = tf.Variable(0.0, dtype=tf.float64)
    avg_val = tf.summary.scalar(f"Trimmed Average ({model_name})", avg_var)
    sess.run(avg_var.assign(avg_reward))
    summary_writer.add_summary(sess.run(avg_val), 0)

    summary_writer.flush()
    summary_writer.close()
    sess.close()