def main(logdir):
    """Resume SAC training on the blimp altitude task from a saved checkpoint.

    Args:
        logdir: Directory used for monitor logs, checkpoints and the final
            model. The best model so far is expected at
            ``<logdir>/best_model.zip``.
    """
    # Timing parameters: the env runs at 100 Hz and each episode lasts 30 s.
    SLEEP_RATE = 100  # 100Hz
    N_EPISODE = 1000
    EPISODE_TIME = 30
    EPISODE_LENGTH = SLEEP_RATE * EPISODE_TIME
    TOTAL_TIMESTEPS = EPISODE_LENGTH * N_EPISODE

    # BUG FIX: a hard-coded path previously overrode the `logdir` argument,
    # so the parameter was silently ignored. It is now respected.
    checkpoint_path = os.path.join(logdir, 'checkpoint')
    callback_path = logdir
    final_model_path = os.path.join(logdir, 'final_model')

    # env
    env = BlimpEnv(SLEEP_RATE)
    env = Monitor(env, logdir)
    # env = make_vec_env(lambda: env, n_envs=1, monitor_dir=logdir)
    print("Observation space:", env.observation_space)
    print("Shape:", env.observation_space.shape)
    print("Action space:", env.action_space)

    # Callbacks: periodic checkpoints plus best-training-reward snapshots.
    SAVE_FREQ = EPISODE_LENGTH * 20  # every 20 episodes (comment was wrong)
    checkpoint_callback = CheckpointCallback(save_freq=SAVE_FREQ,
                                             save_path=checkpoint_path,
                                             name_prefix='sac_callback_model')
    save_on_best_training_reward_callback = SaveOnBestTrainingRewardCallback(
        check_freq=SAVE_FREQ, log_dir=callback_path)
    callback = CallbackList(
        [checkpoint_callback, save_on_best_training_reward_callback])

    # Training got killed for some reason, so continue from the checkpoint.
    model_path = os.path.join(logdir, 'best_model.zip')
    model = SAC.load(model_path)
    model.set_env(env)

    print("---------- Start Learning -----------")
    model.learn(total_timesteps=TOTAL_TIMESTEPS,
                log_interval=SAVE_FREQ,
                callback=callback)
    print("---------- Finish Learning ----------")

    model.save(final_model_path)
    del model  # remove to demonstrate saving and loading
    model = SAC.load(final_model_path)

    results_plotter.plot_results([logdir], TOTAL_TIMESTEPS,
                                 results_plotter.X_TIMESTEPS, "SAC BLIMP")
    plt.show()
def main():
    """Load a pretrained SAC teaching agent, roll it out, and plot results."""
    env = gym.make("teaching-env-v0",
                   teacher_path=os.path.join(os.getcwd(), "../saved_models",
                                             sys.argv[1]),
                   validation_path=DATA_PATH,
                   max_queries=config.MAX_QUERIES)

    agent_model = SAC(MlpPolicy,
                      env,
                      train_freq=1,
                      batch_size=64,
                      learning_rate=3e-4,
                      learning_starts=0,
                      buffer_size=1000,
                      random_exploration=config.EPSILON_EXPLORATION,
                      gamma=config.GAMMA,
                      verbose=1)
    #agent_model.learn(total_timesteps=config.MAX_QUERIES * config.NUM_TRAIN_EPISODES)
    #agent_model.save('test_SAC')

    # BUG FIX: SAC.load is a classmethod that RETURNS a new model; the
    # original `agent_model.load(...)` silently discarded the loaded weights.
    agent_model = SAC.load('test_SAC', env=env)

    obs = env.reset()
    total_reward = float('-inf')
    prog = tqdm(range(config.MAX_QUERIES), postfix={'Reward': total_reward})
    actions = []  # For visualization
    total_reward = 0.0
    for i in prog:
        action = select_action(agent_model, obs,
                               epsilon=config.EPSILON_EXPLORATION)
        #action, _states = agent_model.predict(obs, deterministic=False)
        obs, reward, done, info = env.step(action)
        total_reward += reward
        prog.set_postfix({'Reward': total_reward})
        # np.asscalar was removed in NumPy >= 1.23; item() is the replacement.
        actions.append(np.asarray(action).item())

    plt.hist(actions, bins=config.NUM_BINS, range=(-5, 5), density=True)
    plt.savefig('./visualizations/histograms/SAC')
    plt.clf()

    # Plot student's predicted function
    inputs = np.linspace(-5, 5, num=1000)
    outputs = env.student_model(inputs.reshape(-1, 1))
    plt.scatter(inputs, outputs, s=0.1, label='SAC')
    plt.title("SAC Student's Approximation")
    plt.ylim((-60, 100))
    plt.savefig('./visualizations/functions/SAC')
    plt.clf()
def main():
    """Run a trained SAC insertion policy against the remote environment."""
    parser = argparse.ArgumentParser("Insertion, Manual mode")
    parser.add_argument('checkpoint_path', type=str, help='Path to checkpoint')
    parser.add_argument('--host',
                        default="192.168.2.121",
                        type=str,
                        help='IP of the server (default is a Windows#2)')
    parser.add_argument(
        '--port',
        default=9090,
        type=int,
        help='Port that should be used to connect to the server')
    parser.add_argument(
        '--use_coord',
        action="store_true",
        help=('If set, the environment\'s observation space will be'
              'coordinates instead of images'))
    args = parser.parse_args()

    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

    # BUG FIX: the original called gym.make(..., kwargs={...}), which forwards
    # a single keyword argument literally named `kwargs` to the env
    # constructor. The options must be individual keyword arguments.
    env = gym.make('insertion-v0',
                   host=args.host,
                   port=args.port,
                   use_coord=args.use_coord)

    print(f"Observation space: {env.observation_space}")
    print(f"Action space: {env.action_space}")

    # BUG FIX: SAC.load is a classmethod returning a NEW model, so the
    # original `model.load(...)` threw away the loaded weights and kept the
    # untrained model. The policy class (Mlp/Cnn) is stored inside the
    # checkpoint, so the manual branch on args.use_coord is unnecessary.
    model = SAC.load(args.checkpoint_path,
                     env=env,
                     tensorboard_log="../insertion_tensorboard/")

    obs = env.reset()
    for i in range(10000):
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        env.render()
def train_SAC(env, out_dir, seed=None, **kwargs):
    """Train (or resume) a SAC agent on a MuJoCo environment.

    Monitor logs are written to ``<out_dir>/log/monitor.csv``; TensorBoard
    events go to ``<out_dir>/log/tb``.
    """
    global output_dir
    output_dir = out_dir

    log_dir = os.path.join(out_dir, 'log')
    os.makedirs(log_dir, exist_ok=True)
    tb_dir = os.path.join(log_dir, 'tb')

    env = Monitor(make_mujoco_env(env, 0), log_dir + "/")

    # Flip to True to resume from a previously saved final model.
    continue_train = False
    if continue_train:
        print("Loading pretrained agent")
        model = SAC.load(os.path.join(out_dir, 'final_model.pkl'),
                         env=env,
                         tensorboard_log=tb_dir,
                         verbose=1,
                         **kwargs)
    else:
        # `policy` is expected to be defined at module level.
        model = SAC(policy,
                    env,
                    verbose=1,
                    tensorboard_log=tb_dir,
                    full_tensorboard_log=False,
                    **kwargs)

    # `n_timesteps` and `callback` are module-level names as well.
    model.learn(total_timesteps=n_timesteps,
                seed=seed,
                callback=callback,
                log_interval=10)
    return model
def test_models(env):
    """Evaluate saved agents and measure robustness under weight pruning."""
    # seeds = [1, 2, 3]
    for seed in [1]:
        # Load the saved models for this seed.
        loaders = [(PPO2, 'ppo'), (SAC, 'sac'), (TD3, 'td3'), (TRPO, 'trpo')]
        models = [cls.load(f'data/models/{tag}_{seed}') for cls, tag in loaders]
        for agent in models:
            # run_policy(agent, env)
            original_params = agent.get_parameters()
            generalization_test(agent, env)
            # Re-prune 50 times, always starting from the ORIGINAL weights.
            for _ in range(50):
                pruned = prune_policy(agent.__class__.__name__,
                                      original_params, 0.1)
                agent.load_parameters(pruned)
                generalization_test(agent, env)
def example():
    """Tutorial: visualize a trained SAC actor's policy (video + GUI)."""
    task = generate_task(task_generator_id='picking')
    world_params = {"skip_frame": 3, "seed": 0}

    model = SAC.load("./model_2000000_steps.zip")

    def policy_fn(obs):
        # Deterministic action from the trained model.
        return model.predict(obs, deterministic=True)[0]

    # Recording a video of the policy is done in one line.
    viewer.record_video_of_policy(task=task,
                                  world_params=world_params,
                                  policy_fn=policy_fn,
                                  file_name="pushing_video",
                                  number_of_resets=10,
                                  max_time_steps=10 * 100)
    # Similarly for interactive visualization in the GUI.
    viewer.view_policy(task=task,
                       world_params=world_params,
                       policy_fn=policy_fn,
                       max_time_steps=40 * 600,
                       number_of_resets=40)
def get_new_weights():
    """Run the saved SAC strategy for one episode and return final weights."""
    portfolio_env = PortfolioEnv(
        settings['data_file'], settings['output_file'],
        settings['strategy_name'], settings['total_steps'],
        settings['window_length'], settings['capital_base'],
        settings['lot_size'], settings['leverage'],
        settings['commission_percent'], settings['commission_fixed'],
        settings['max_slippage_percent'], settings['start_idx'],
        settings['compute_indicators'], settings['compute_reward'],
        settings['compute_position'], settings['debug'])

    model = SAC.load(MODELS_DIR + settings['model_name'])

    # Roll one full episode with deterministic actions.
    obs = portfolio_env.reset()
    finished = False
    while not finished:
        action, _ = model.predict(obs, deterministic=True)
        obs, _, finished, _ = portfolio_env.step(action)
        # portfolio_env.render(mode='ansi')

    return portfolio_env.current_weights
def evaluate_policy(policy_file, policy_type, envname, num_rollouts):
    """Roll out a saved policy and summarize its episode returns.

    Args:
        policy_file: path to the saved model.
        policy_type: "ppo" or "sac".
        envname: gym environment id.
        num_rollouts: number of evaluation episodes.

    Returns:
        (mean, std) of the per-episode returns.

    Raises:
        NotImplementedError: if `policy_type` is unsupported.
    """
    if policy_type == "ppo":
        model = PPO2.load(policy_file)
        get_action = lambda obs: model.predict(obs)[0]
    elif policy_type == "sac":
        model = SAC.load(policy_file)
        get_action = lambda obs: model.predict(obs, deterministic=True)[0]
    else:
        raise NotImplementedError()

    env = gym.make(envname)
    returns = []
    for _ in range(num_rollouts):
        obs = env.reset()
        done = False
        episode_return = 0.0
        while not done:
            obs, reward, done, _ = env.step(get_action(obs))
            episode_return += reward
        returns.append(episode_return)
    return np.mean(returns), np.std(returns)
def rollout_policy(filename, traj_len, seed, env_name, n_trajs=1):
    """Collect `n_trajs` fixed-length rollouts of a saved SAC policy.

    Returns:
        A list of imitation `types.TrajectoryWithRew` objects, each with
        traj_len + 1 observations and traj_len actions/rewards.
    """
    model = SAC.load(filename)
    env = gym.make(env_name)
    env.seed(seed)

    trajs = []
    for _ in range(int(n_trajs)):
        observations = []
        actions = []
        rewards = []
        obs = env.reset()
        observations.append(obs)
        for _ in range(traj_len):
            action = model.predict(obs, deterministic=True)[0]
            obs, reward, done, _ = env.step(action)
            # assert not done
            actions.append(action)
            observations.append(obs)
            rewards.append(reward)
        trajs.append(
            types.TrajectoryWithRew(
                obs=np.array(observations),
                acts=np.array(actions),
                infos=[{} for _ in range(traj_len)],
                rews=np.array(rewards),
            ))
    return trajs
def load_model(path: str, env, desc: str):
    """
    Loads a model from a stable baseline checkpoint file into a memory representation

    Args:
        path (str) : Path to the Stable Baseline Checkpoint File
        env (SB Env) : Environment the model will be attached to
        desc (str) : Text Description of what model this is

    Returns:
        The loaded model

    Raises:
        RuntimeError: if `desc` names an unsupported algorithm
    """
    if desc == "ddpg":
        return DDPG.load(path, env)
    if desc in ("ppo", "trpo"):
        # PPO2/TRPO expect a vectorized environment.
        wrapped = DummyVecEnv([lambda: env])
        algorithm = PPO2 if desc == "ppo" else TRPO
        return algorithm.load(path, wrapped)
    if desc == "td3":
        return TD3.load(path, env)
    if desc == "sac":
        return SAC.load(path, env)
    raise RuntimeError(f"Model Name {desc} not supported")
def test_SAC(env, out_dir, seed=None, **kwargs):
    """Load the final SAC model from `out_dir` and evaluate it for 20 episodes."""
    trained_model = SAC.load(os.path.join(out_dir, 'final_model'), env=env)
    env.seed(seed)
    # Evaluate the trained agent; `evaluate` handles logging to out_dir.
    mean_reward = evaluate(env, trained_model, out_dir, num_episodes=20)
    return
def play():
    """Replay a trained SAC policy in the PointMassDense env indefinitely.

    Relies on module-level `expDir`, `name`, and `nIter` for the model path.
    """
    # BUG FIX: np.format_float_scientific returns a *string*; the original
    # "%d" placeholder raised TypeError. "%s" accepts the formatted value.
    model = SAC.load(expDir + "/%s/%s" %
                     (name, np.format_float_scientific(nIter)))
    env = gym.make('PointMassDense-1-v1')
    # BUG FIX: `obs` was read before assignment; the env must be reset first.
    obs = env.reset()
    while True:
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        env.render(mode='human')
def __init__(self, env):
    """Load the RL and rule-guided SAC models plus an IDM fallback agent."""
    self.env = env

    # Pure-RL SAC policy.
    rl_model_path = "/home/icv/Trustworth/TiRL/models/sac-5"
    rl_log_path = "/home/icv/Trustworth/TiRL/data/sac-5"
    self.model_rl = SAC.load(rl_model_path,
                             env=env,
                             tensorboard_log=rl_log_path)

    # Rule-regularized SAC policy.
    rule_model_path = "/home/icv/Trustworth/TiRL/models/sac_rule3"
    rule_log_path = "/home/icv/Trustworth/TiRL/data/sac_rule3"
    self.model_rule = SAC.load(rule_model_path,
                               env=env,
                               tensorboard_log=rule_log_path)

    self.agent_rule = IDM(env)
    print("load model successfully")
    self.reset()
def play(save_dir, env):
    """Run two evaluation episodes with a saved SAC model (no exploration warmup)."""
    model = SAC.load(save_dir + '/model_dir/sac/test_25_25_14_15',
                     env=env,
                     custom_objects=dict(learning_starts=0))  ### ADD NUM
    for _ in range(2):
        obs, done = env.reset(), False
        while not done:
            chosen_action, _ = model.predict(obs)
            obs, reward, done, info = env.step(chosen_action)
def load_model(config):
    """Load the test agent whose algorithm is named in config["algo_name"].

    The checkpoint path comes from the module-level `args["test_agent_path"]`.
    Aborts via assertion when the algorithm name is unknown.
    """
    model = None
    algo = config["algo_name"]
    if algo == "TD3":
        model = TD3.load("agents/{}".format(args["test_agent_path"]))
    elif algo == "A2C":
        model = A2C.load("agents/{}".format(args["test_agent_path"]))
    elif algo == "SAC":
        model = SAC.load("agents/{}".format(args["test_agent_path"]))
    elif algo == "PPO2":
        model = PPO2.load("agents/{}".format(args["test_agent_path"]))
    assert model is not None, "Alg name not found, cannot load model, exiting. "
    return model
def f_checkpoints_range_2_mean_performance(
        self, checkpoints: range) -> Tuple[np.ndarray, np.ndarray]:
    """Evaluate a range of saved checkpoints and average their performance.

    For each checkpoint id, load the matching saved model and run `mean_eval`
    over `self.args.n_episodes` episodes; the per-episode rewards and success
    rates are averaged into one entry per checkpoint.

    Args:
        checkpoints: checkpoint ids to evaluate (any start/step allowed).

    Returns:
        (rewards, success_rates): one mean value per checkpoint.

    Raises:
        ValueError: if `self.args.model['name']` is not a known algorithm.
            (Previously `model` was simply left unbound, producing a
            confusing NameError downstream.)
    """
    logging.debug(
        f"[f_checkpoints_range_2_mean_performance]: checkpoints={checkpoints}"
    )
    rewards = np.zeros(len(checkpoints))
    s_rates = np.zeros(len(checkpoints))

    # Evaluating N checkpoints on M episodes, then averaging over M so we
    # finally have N rewards and N success rates.
    # NOTE: `i` can range in any way while `j` indexes the output arrays.
    for j, i in enumerate(checkpoints):
        path = f"{self.args.training_base_path}/models/quadcopter-{i}{self.args.suffix}"
        logging.debug(f"Evaluating model at {path}")

        name = self.args.model['name']
        if name == "ddpg":
            model = DDPG.load(path)
        elif name == "ppo":
            model = PPO2.load(path)
        elif name == "trpo":
            model = TRPO.load(path)
        elif name == "td3":
            model = TD3.load(path)
        elif name == "sac":
            model = SAC.load(path)
        else:
            # BUG FIX: the original fell through with `model` unbound.
            raise ValueError(f"Unsupported model name: {name}")

        logging.debug(
            f"Evaluating Model {self.args.model['name']} for {self.args.n_episodes} episodes in {self.args.env} environment with continuous={str(self.args.continuous)}"
        )
        rewards_list, success_rates_list = mean_eval(
            num_episodes=self.args.n_episodes,
            checkpoint_id=i,
            model=model,
            env=self.env,
            v=True,
            continuous=self.args.continuous,
            plots_dir=self.args.plots_dir)

        rewards_mean = np.mean(rewards_list)
        success_rates_mean = np.mean(success_rates_list)
        logging.debug(
            f"Evaluation Checkpoint={i} --> Average Reward = {rewards_mean}, Average Success Rate = {success_rates_mean}"
        )
        rewards[j] = rewards_mean
        s_rates[j] = success_rates_mean

    return rewards, s_rates
def load_model(self):
    """Resolve the checkpoint path and load the SAC model into self.model.

    Path layout:
        data/saved_models/<folder or model_name>/<model_name>[_<episode>.pkl]
    """
    model_path = "data/saved_models/"
    # BUG FIX: the original tested the bare name `folder`, which raises
    # NameError — the value lives on the instance as `self.folder`.
    if self.folder:
        model_path = model_path + self.folder + "/"
    else:
        model_path = model_path + self.model_name + "/"
    model_path = model_path + self.model_name
    if self.episode:
        model_path = model_path + "_" + self.episode + ".pkl"
    self.model = SAC.load(model_path)
def load_model(model_path, params):
    """Build the environment named in `params` and load the matching agent.

    Returns:
        (orig_env, model): the unwrapped env and the loaded SB model.

    Raises:
        NotImplementedError: for algorithms other than PPO2 / SAC.
    """
    orig_env = globals()[params['env']](**params['env_options'])
    vec_env = DummyVecEnv([lambda: orig_env])

    algorithm = params['alg']
    if algorithm == 'PPO2':
        model = PPO2.load(model_path, env=vec_env)
    elif algorithm == 'SAC':
        model = SAC.load(model_path, env=vec_env)
    else:
        raise NotImplementedError
    return orig_env, model
def record(exp):
    """Record evaluation videos of a saved SAC model while it keeps learning."""
    model = SAC.load(exp)

    env = make_vec_env(
        'PointMassDense-%d-v1' % num_objs,
        1,
        wrapper_class=FlattenDictWrapper,
        wrapper_env_kwargs=['observation', 'achieved_goal', 'desired_goal'])
    env = VecVideoRecorder(
        env,
        osp.join(logger, "videos_2"),
        record_video_trigger=lambda step: step % save_video_interval == 0,
        video_length=save_video_length)

    model.set_env(env)
    model.learn(total_timesteps=2000, log_interval=100)
    # model.save(expDir + "/%s/%d" %(name, nIter))
    env.close()
def __init__(self, env):
    """Car-following controller parameters plus a trained SAC model."""
    # Car-following parameters (these match the standard IDM symbol names —
    # T: time headway, g0: jam distance, a/b: accel/decel, delta: exponent;
    # confirm against where they are used).
    self.T = 1.5
    self.g0 = 4
    self.a = 0.73
    self.b = 1.67
    self.delta = 4
    self.decision_dt = 0.75
    self.length_x = 5  # front vehicle length

    self.env = env
    model_path = "/home/icv/Trustworth/TiRL/models/sac-5"
    tb_path = "/home/icv/Trustworth/TiRL/data/sac-5"
    self.model = SAC.load(model_path, env=env, tensorboard_log=tb_path)
    print("load model successfully")
def test_single_episode(model):
    """Load a SAC checkpoint and roll out one validation episode.

    Args:
        model: path to the saved SAC checkpoint.

    Returns:
        The accumulated episode reward.
    """
    agent = SAC.load(model)
    # `env` is module-level; _validate resets it to a fixed validation
    # scenario (argument semantics defined by the env — confirm there).
    obs, done = env._validate(1, 8, 1.0, 0.585), False
    episode_rew = 0
    while not done:
        #env.render()
        # BUG FIX: a stable-baselines SAC model is not callable, so the
        # original `act(obs[None])[0]` raised TypeError. Use predict().
        action = agent.predict(obs)[0]
        obs, rew, done, _ = env.step(action)
        episode_rew += rew
    print("Episode total reward", episode_rew)
    return episode_rew
def mk_env_agent(env_class, registered_model, params, gui=False):
    """Build a vectorized env for `env_class` and attach the registered model.

    Params stored alongside the model are merged with — and overridden by —
    the caller-supplied `params`.

    Returns:
        (env, model)
    """
    model = SAC.load(registered_model.source)

    # Parameters were saved next to the model file.  # FIXME
    with open(f'{registered_model.source}.json', 'r') as fp:
        stored_params = json.load(fp)
    params = {**stored_params, **params}  # merge, overriding loaded params

    env = make_vec_env(lambda: env_class(params['NJ'], params, gui=gui),
                       n_envs=1)
    model.set_env(env)
    env.env_method('set_render_info', {
        'name': registered_model.name,
        'version': registered_model.version
    })  # FIXME
    return env, model
def train_SAC(env, out_dir, seed=None, **kwargs):
    """Train a SAC agent on a gym env, or resume from a saved final model.

    Monitor logs land in ``<out_dir>/log/monitor.csv``; TensorBoard events
    in ``<out_dir>/log/tb``.
    """
    global output_dir
    output_dir = out_dir
    log_dir = os.path.join(out_dir, 'log')
    os.makedirs(log_dir, exist_ok=True)

    env = gym.make(env)
    env = Monitor(env, log_dir + '/', allow_early_resets=True)

    # Hyper-parameters were once read from kwargs; they are fixed here.
    # policy = kwargs['policy']
    policy = 'MlpPolicy'
    # n_timesteps = kwargs['n_timesteps']
    n_timesteps = int(1e6)
    # NOTE: the original also built an AdaptiveParamNoiseSpec that was never
    # passed to the model (SAC takes action noise, not param noise); that
    # dead code has been removed.

    continue_model = False
    if continue_model:
        # Continue training
        print("Loading pretrained agent")
        model = SAC.load(os.path.join(out_dir, 'final_model.pkl'),
                         env=env,
                         tensorboard_log=os.path.join(log_dir, 'tb'),
                         verbose=1,
                         **kwargs)
    else:
        model = SAC(policy,
                    env,
                    verbose=1,
                    tensorboard_log=os.path.join(log_dir, 'tb'),
                    full_tensorboard_log=False,
                    **kwargs)

    # `callback` is expected to be defined at module level.
    model.learn(total_timesteps=n_timesteps,
                seed=seed,
                callback=callback,
                log_interval=10)
    return model
def sac(env_id,
        timesteps,
        policy="MlpPolicy",
        log_interval=None,
        tensorboard_log=None,
        seed=None,
        load_weights=None):
    """Train SAC on `env_id`, optionally starting from saved weights."""
    env = gym.make(env_id)

    if load_weights is None:
        model = SAC(policy, env, verbose=1, tensorboard_log=tensorboard_log)
    else:
        model = SAC.load(load_weights, env, verbose=0)

    wandb_callback = WandbRenderEnvCallback(model_name="sac", env_name=env_id)
    model.learn(total_timesteps=timesteps,
                log_interval=log_interval,
                callback=wandb_callback)
def get_SAC_model(model_settings, model_path, ckpt_path, ckpt_step, tb_path):
    """Create a fresh SAC model or restore one from a checkpoint.

    Returns:
        (model, env)
    """
    policy_kwargs = dict(layers=model_settings['NET_LAYERS'])
    env = get_single_process_env(model_settings, model_path, ckpt_step)

    if ckpt_path is None:
        model = SAC(SACMlpPolicy,
                    env,
                    _init_setup_model=True,
                    policy_kwargs=policy_kwargs,
                    **model_settings['train_configs'],
                    verbose=1,
                    tensorboard_log=tb_path)
    else:
        print("Loading model from checkpoint '{}'".format(ckpt_path))
        model = SAC.load(ckpt_path,
                         env=env,
                         _init_setup_model=True,
                         policy_kwargs=policy_kwargs,
                         **model_settings['train_configs'],
                         verbose=1,
                         tensorboard_log=tb_path)
        # Resume the timestep counter where the checkpoint left off.
        model.num_timesteps = ckpt_step
    return model, env
def get_trajectories(env, policy_path, policy_type, n_rollouts, time_horizon):
    """Collect observation trajectories from a saved policy.

    For "dads" the stored observations are returned directly; otherwise the
    policy is rolled out `n_rollouts` times for `time_horizon` steps.

    Raises:
        NotImplementedError: for an unknown `policy_type`.
    """
    if policy_type == "sac":
        from stable_baselines import SAC
        model = SAC.load(policy_path)

        def get_action(obs):
            return model.predict(obs, deterministic=True)[0]
    elif policy_type == "gail":
        from imitation.policies import serialize
        from stable_baselines3.common.vec_env import DummyVecEnv
        venv = DummyVecEnv([lambda: env])
        model = serialize.load_policy("ppo", policy_path, venv)

        def get_action(obs):
            return model.predict(obs)[0]
    elif policy_type == "dads":
        return load_data(policy_path)["observations"]
    else:
        raise NotImplementedError()

    trajectories = []
    for _ in range(n_rollouts):
        obs = env.reset()
        trajectory = [list(obs)]
        for _ in range(time_horizon - 1):
            # trajectory.extend(list(action))
            obs, reward, done, info = env.step(get_action(obs))
            trajectory.append(list(obs))
        trajectories.append(trajectory)
    return trajectories
inference = True # Enjoy trained agent num_of_paths = 1 max_ep_steps = 800 algorithm = "SAC" # PPO2, SAC, DDPG model_save_name = "SAC_1_Ex3_EKF_gyro-v0_model_1" #"ppo2_ekf_0", "sac_ekf_model_2" env_name = 'Ex3_EKF_gyro-v0' # 'Ex3_EKF_gyro-v0', 'Pendulum-v0','Ex3_pureEKF_gyro' if algorithm == "PPO2": from stable_baselines.common import make_vec_env from stable_baselines import PPO2 model = PPO2.load(model_save_name) env = make_vec_env(env_name) elif algorithm == "SAC": from stable_baselines import SAC model = SAC.load(model_save_name) env = gym.make(env_name) elif algorithm == "DDPG": from stable_baselines.common.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise, AdaptiveParamNoiseSpec from stable_baselines import DDPG model = DDPG.load(model_save_name) env = gym.make(env_name) if inference: save_figs = False LOG_PATH = "./logs" fig_file_type = "pdf" roll_out_paths = {} roll_out_paths = { "s": [], "r": [],
action_noise=action_noise, tensorboard_log=tensorboard_log_dir) model.learn(total_timesteps=total_timesteps_, log_interval=1, tb_log_name=tensorboard_log_name) model.save(model_save_name) del model # remove to demonstrate saving and loading if inference: if algorithm == "PPO2": env = make_vec_env('Ex3_EKF_gyro-v0') model = PPO2.load("ppo2_ekf_0") elif algorithm == "SAC": env = gym.make('Ex3_EKF_gyro-v0') model = SAC.load("sac_ekf_model_0") # Enjoy trained agent num_of_paths = 1 max_ep_steps = 1000 save_figs = False LOG_PATH = "./logs" fig_file_type = "pdf" roll_out_paths = {} roll_out_paths = { "s": [], "r": [], "s_": [], "state_of_interest": [], "reference": [], "episode_length": [], "return": [],
"""Watch a pretrained SAC walker until it falls (reward -100) or 1500 steps."""
import gym
import numpy as np
from stable_baselines.sac.policies import MlpPolicy
from stable_baselines import SAC

env = gym.make('BipedalWalkerHardcore-v2')
model = SAC.load("sac_walker500000")

obs = env.reset()
for _ in range(1500):
    action, _ = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    #print(rewards)
    env.render()
    # A reward of -100 is the env's fall/failure signal — stop watching.
    if rewards == -100:
        break
env.close()
"""Train a tiny SAC agent on Pendulum, save it, then reload it."""
import gym
import numpy as np
import imageio
from stable_baselines.sac.policies import MlpPolicy
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines import SAC

env = DummyVecEnv([lambda: gym.make('Pendulum-v0')])

model = SAC(MlpPolicy, env, verbose=1)
model.learn(total_timesteps=1000, log_interval=10)
model.save("../models/sac_pendulum")

del model  # remove to demonstrate saving and loading
model = SAC.load("../models/sac_pendulum")

#obs = env.reset()
#while True:
#    action, _states = model.predict(obs)
#    obs, rewards, dones, info = env.step(action)
#    env.render()