import gym
from stable_baselines.common.vec_env import DummyVecEnv, VecNormalize


def test_vec_normalize():
    env = DummyVecEnv([lambda: gym.make("Pendulum-v0")])
    normalized_vec_env = VecNormalize(env)
    obs = normalized_vec_env.reset()
    for _ in range(10):
        action = [normalized_vec_env.action_space.sample()]
        obs, reward, _, _ = normalized_vec_env.step(action)
        print(obs, reward)
import logging

from stable_baselines.common.vec_env import VecNormalize


def _precompute_normalization(env, num_envs, samples, config):
    env = VecNormalize(env, training=True, **config)
    logging.info("Precomputing normalization. This may take a while.")
    env.reset()
    log_step = 5000 // num_envs
    for i in range(samples // num_envs):
        actions = [env.action_space.sample() for _ in range(num_envs)]
        obs, rewards, dones, info = env.step(actions)
        if i % log_step == 0:
            logging.info("Progress: {}/{}".format(i * num_envs, samples))
    logging.info("Successfully precomputed normalization parameters.")
    env.reset()
    env.training = False
    return env
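# --- Hedged usage sketch (not from the original sources): one way the helper above
# --- might be called. It assumes gym's Pendulum-v0 and stable-baselines' DummyVecEnv,
# --- as in the first snippet; the `norm_config` kwargs are illustrative only.
import logging

import gym
from stable_baselines.common.vec_env import DummyVecEnv

logging.basicConfig(level=logging.INFO)
raw_env = DummyVecEnv([lambda: gym.make("Pendulum-v0")])
norm_config = {"norm_obs": True, "norm_reward": True}  # hypothetical VecNormalize kwargs
eval_env = _precompute_normalization(raw_env, num_envs=1, samples=2000, config=norm_config)
# The returned wrapper has training=False, so its running statistics stay frozen
# while a policy is evaluated on it.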
def test_vec_env(tmpdir):
    """Test VecNormalize Object"""
    clip_obs = 0.5
    clip_reward = 5.0

    orig_venv = DummyVecEnv([make_env])
    norm_venv = VecNormalize(orig_venv, norm_obs=True, norm_reward=True,
                             clip_obs=clip_obs, clip_reward=clip_reward)

    _, done = norm_venv.reset(), [False]
    while not done[0]:
        actions = [norm_venv.action_space.sample()]
        obs, rew, done, _ = norm_venv.step(actions)
        assert np.max(np.abs(obs)) <= clip_obs
        assert np.max(np.abs(rew)) <= clip_reward

    path = str(tmpdir.join("vec_normalize"))
    norm_venv.save(path)
    deserialized = VecNormalize.load(path, venv=orig_venv)
    check_vec_norm_equal(norm_venv, deserialized)
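# --- Hedged sketch (not part of the original test): a minimal `make_env` of the kind the
# --- test above expects, plus the imports it relies on. The environment id and seed are
# --- placeholders; `check_vec_norm_equal` is assumed to be a helper defined elsewhere
# --- that compares the statistics of two VecNormalize wrappers.
import gym
import numpy as np
from stable_baselines.common.vec_env import DummyVecEnv, VecNormalize


def make_env():
    env = gym.make("Pendulum-v0")
    env.seed(0)
    return env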
class PPO2_SB():
    def __init__(self):
        self.love = 'Ramona'
        self.env_fns = []
        self.env_names = []

    def make_env(self, env_id, rank, seed=0):
        """
        Utility function for multiprocessed env.

        :param env_id: (str) the environment ID
        :param num_env: (int) the number of environments you wish to have in subprocesses
        :param seed: (int) the initial seed for RNG
        :param rank: (int) index of the subprocess
        """
        def _init():
            env = Template_Gym()
            env.seed(seed + rank)
            return env
        set_global_seeds(seed)
        return _init

    def train(self, num_e=1, n_timesteps=1000000, save_fraction=0.1, save='saves/aud5'):
        env_id = "default"
        num_e = 1  # Number of processes to use
        # Create the vectorized environment
        #env = DummyVecEnv([lambda: env])  #Ramona
        self.env = SubprocVecEnv([self.make_env(env_id, i) for i in range(num_e)])
        #env = Template_Gym()
        #self.env = DummyVecEnv([lambda: env])
        self.env = VecNormalize(self.env, norm_obs=True, norm_reward=True)
        self.model = PPO2(CustomPolicy_4, self.env, verbose=0, learning_rate=1e-4,
                          nminibatches=1, tensorboard_log="./day1")
        #self.model = PPO2.load("default9", self.env, policy=CustomPolicy, tensorboard_log="./test/")
        n_timesteps = int(n_timesteps * save_fraction)
        training_loop = int(1 / save_fraction)
        log_dir = "saves"
        for i in range(training_loop):
            self.model.learn(n_timesteps)
            self.model.save(save + str(i))
            self.env.save_running_average(log_dir)
        self.env.save_running_average(log_dir)

    def evaluate(self, num_env=1, num_steps=21900, load="saves/aud5", runs=10):
        """
        Evaluate a RL agent

        :param model: (BaseRLModel object) the RL Agent
        :param num_steps: (int) number of timesteps to evaluate it
        :return: (float) Mean reward
        """
        env_id = 'default'
        num_e = 1
        log_dir = "saves"
        self.env = SubprocVecEnv([self.make_env(env_id, i) for i in range(num_env)])
        #self.model = PPO2(CustomPolicy, self.env, verbose=1, learning_rate=1e-5, tensorboard_log="./default")
        self.env = VecNormalize(self.env, norm_obs=True, norm_reward=True)
        self.env.load_running_average(log_dir)
        for run in range(runs):
            self.model = PPO2.load(load + str(run), self.env, policy=CustomPolicy_4, tensorboard_log="./default/")
            self.env.load_running_average(log_dir)
            episode_rewards = [[0.0] for _ in range(self.env.num_envs)]
            #self.total_pips = []
            obs = self.env.reset()
            state = None
            # When using VecEnv, done is a vector
            done = [False for _ in range(self.env.num_envs)]
            for step in range(num_steps):
                # _states are only useful when using LSTM policies
                action, state = self.model.predict(obs, state=state, mask=done, deterministic=True)
                # action, rewards and dones are arrays because we are using a vectorized env
                obs, rewards, dones, _ = self.env.step(action)
                #self.total_pips.append(self.env.player.placement)
                # Stats
                for i in range(self.env.num_envs):
                    episode_rewards[i][-1] += rewards[i]
                    if dones[i]:
                        episode_rewards[i].append(0.0)
            mean_rewards = [0.0 for _ in range(self.env.num_envs)]
            n_episodes = 0
            for i in range(self.env.num_envs):
                mean_rewards[i] = np.mean(episode_rewards[i])
                n_episodes += len(episode_rewards[i])
            # Compute mean reward
            mean_reward = np.mean(mean_rewards)
            print("Mean reward:", mean_reward, "Num episodes:", n_episodes)
        return mean_reward

    def pre_train(self, num_e=1, load="saves/m19"):
        env_id = 'default'
        num_e = 1
        log_dir = "saves"
        # Using only one expert trajectory;
        # you can specify `traj_limitation=-1` for using the whole dataset
        #env = make_env()
        dataset = ExpertDataset(expert_path='default2.npz', traj_limitation=1, batch_size=128)
        self.env = SubprocVecEnv([self.make_env(env_id, i) for i in range(num_e)])
        #env = Template_Gym()
        #self.env = DummyVecEnv([lambda: env])
        self.env = VecNormalize(self.env, norm_obs=True, norm_reward=True)
        #model = GAIL("MlpPolicy", env=env, expert_dataset=dataset, verbose=1)
        self.env.load_running_average("saves")
        self.model = PPO2(CustomPolicy, self.env, verbose=1, nminibatches=1,
                          learning_rate=1e-5, tensorboard_log="./m1ln4")
        #self.model = PPO2.load("saves/m19", self.env, policy=CustomPolicy, tensorboard_log="./default/")
        self.env.load_running_average("saves")
        # Pretrain the PPO2 model
        self.model.pretrain(dataset, n_epochs=10000)
        # As an option, you can train the RL agent afterwards
        #self.model.learn(int(100000000))
        # Test the pre-trained model
        self.env = self.model.get_env()
        self.env.load_running_average("saves")
        obs = self.env.reset()
        reward_sum = 0.0
        for _ in range(1000000):
            action, _ = self.model.predict(obs)
            obs, reward, done, _ = self.env.step(action)
            reward_sum += reward
            #self.env.render()
            if done:
                print(reward_sum)
                reward_sum = 0.0
                obs = self.env.reset()
        self.env.close()

    def gen_pre_train(self, num_e=1, save='default2', episodes=1000):
        #self.create_envs(game_name=game, state_name=state, num_env=num_e)
        #self.env = SubprocVecEnv(self.env_fns)
        env_id = 'default'
        num_e = 1
        self.env = SubprocVecEnv([self.make_env(env_id, i) for i in range(num_e)])
        #env = Template_Gym()
        #self.env = DummyVecEnv([lambda: env])
        self.env = VecNormalize(self.env, norm_obs=True, norm_reward=True)
        #model = GAIL("MlpPolicy", env=env, expert_dataset=dataset, verbose=1)
        self.env.load_running_average("saves")
        self.model = PPO2.load("saves/m19", self.env, policy=CustomPolicy, tensorboard_log="./default/")
        self.env.load_running_average("saves")
        #self.expert_agent = generate_expert_traj(self.model, save, self.env, n_episodes=episodes)
class Optimization():
    def __init__(self, config):
        self.reward_strategy = 'sortino2'
        #self.input_data_file = 'data/coinbase_hourly.csv'
        self.params_db_file = 'sqlite:///params.db'
        # number of parallel jobs
        self.n_jobs = 1
        # maximum number of trials for finding the best hyperparams
        self.n_trials = 1000
        # number of test episodes per trial
        self.n_test_episodes = 10
        # number of evaluations for pruning per trial
        self.n_evaluations = 10
        self.config = config
        #self.df = pd.read_csv(input_data_file)
        #self.df = df.drop(['Symbol'], axis=1)
        #self.df = df.sort_values(['Date'])
        #self.df = add_indicators(df.reset_index())
        #self.train_len = int(len(df) * 0.8)
        #self.df = df[:train_len]
        #self.validation_len = int(train_len * 0.8)
        #self.train_df = df[:validation_len]
        #self.test_df = df[validation_len:]

    def make_env(self, env_id, rank, seed=0, eval=False, config=pc.configeurcad4h):
        """
        Utility function for multiprocessed env.

        :param env_id: (str) the environment ID
        :param num_env: (int) the number of environments you wish to have in subprocesses
        :param seed: (int) the initial seed for RNG
        :param rank: (int) index of the subprocess
        """
        def _init():
            self.config = config
            self.eval = eval
            env = Template_Gym(config=self.config, eval=self.eval)
            env.seed(seed + rank)
            return env
        set_global_seeds(seed)
        return _init

    # Examples of Optuna parameter types:
    # Categorical parameter
    #optimizer = trial.suggest_categorical('optimizer', ['MomentumSGD', 'Adam'])
    # Int parameter
    #num_layers = trial.suggest_int('num_layers', 1, 3)
    # Uniform parameter
    #dropout_rate = trial.suggest_uniform('dropout_rate', 0.0, 1.0)
    # Loguniform parameter
    #learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1e-2)
    # Discrete-uniform parameter
    #drop_path_rate = trial.suggest_discrete_uniform('drop_path_rate', 0.0, 1.0, 0.1)

    def optimize_envs(self, trial):
        return {
            'reward_func': self.reward_strategy,
            'forecast_len': int(trial.suggest_loguniform('forecast_len', 1, 200)),
            'confidence_interval': trial.suggest_uniform('confidence_interval', 0.7, 0.99),
        }

    def optimize_config(self, trial):
        return {
            'sl': trial.suggest_loguniform('sl', 1.0, 10.0),
            'tp': trial.suggest_loguniform('tp', 1.0, 10.0)
        }

    def optimize_ppo2(self, trial):
        return {
            'n_steps': int(trial.suggest_int('n_steps', 16, 2048)),
            'gamma': trial.suggest_loguniform('gamma', 0.9, 0.9999),
            'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 1.),
            'ent_coef': trial.suggest_loguniform('ent_coef', 1e-8, 1e-1),
            'cliprange': trial.suggest_uniform('cliprange', 0.1, 0.4),
            'noptepochs': int(trial.suggest_int('noptepochs', 1, 48)),
            'lam': trial.suggest_uniform('lam', 0.8, 1.)
        }

    def optimize_lstm(self, trial):
        return {
            'lstm': trial.suggest_categorical('optimizer', ['lstm', 'mlp'])
        }

    def ob_types(self, trial):
        return {
            'lstm': trial.suggest_categorical('optimizer', ['lstm', 'mlp'])
        }

    def optimize_agent(self, trial):
        #self.env_params = self.optimize_envs(trial)
        env_id = "default"
        num_e = 1  # Number of processes to use
        #self.config_param = self.optimize_config(trial)
        #self.config.sl = self.config_param['sl']
        #self.config.sl = self.config_param['tp']
        #self.model_type = self.optimize_lstm(trial)
        #self.model_type = self.model_type['lstm']
        self.model_type = "mlp"
        if self.model_type == 'mlp':
            self.policy = CustomPolicy_5
        else:
            self.policy = CustomPolicy_4
        self.train_env = SubprocVecEnv([self.make_env(env_id + 'train', i, eval=False, config=self.config)
                                        for i in range(num_e)])
        self.train_env = VecNormalize(self.train_env, norm_obs=True, norm_reward=True)
        self.test_env = SubprocVecEnv([self.make_env(env_id + 'test', i, eval=True, config=self.config)
                                       for i in range(num_e)])
        self.test_env = VecNormalize(self.test_env, norm_obs=True, norm_reward=True)
        try:
            self.test_env.load_running_average("saves")
            self.train_env.load_running_average("saves")
        except Exception:
            print('cant load')
        self.model_params = self.optimize_ppo2(trial)
        self.model = PPO2(self.policy, self.train_env, verbose=0, nminibatches=1,
                          tensorboard_log="./gbp_chf_single", **self.model_params)
        #self.model = PPO2(CustomPolicy_2, self.env, verbose=0, learning_rate=1e-4, nminibatches=1, tensorboard_log="./min1")
        last_reward = -np.finfo(np.float16).max
        #evaluation_interval = int(len(train_df) / self.n_evaluations)
        evaluation_interval = 36525
        for eval_idx in range(self.n_evaluations):
            try:
                self.model.learn(evaluation_interval)
                self.test_env.save_running_average("saves")
                self.train_env.save_running_average("saves")
            except Exception:
                print('did not work')
            rewards = []
            n_episodes, reward_sum = 0, 0.0
            print('Eval')
            obs = self.test_env.reset()
            while n_episodes < self.n_test_episodes:
                action, _ = self.model.predict(obs, deterministic=True)
                obs, reward, done, _ = self.test_env.step(action)
                reward_sum += reward
                if done:
                    rewards.append(reward_sum)
                    reward_sum = 0.0
                    n_episodes += 1
                    obs = self.test_env.reset()
            last_reward = np.mean(rewards)
            trial.report(-1 * last_reward, eval_idx)
            if trial.should_prune(eval_idx):
                raise optuna.structs.TrialPruned()
        return -1 * last_reward

    def optimize(self, config):
        self.config = config
        # Earlier study names tried: 'ppo2_single_ready', 'ppo2_single_ready_nosltp',
        # 'ppo2_single_ready_nosltp_all_yeah', 'ppo2_eur_gbp_op', 'ppo2_gbp_chf_op',
        # 'ppo2_gbp_chf_h1_new1', 'ppo2_gbp_chf_h4_r_new11', 'ppo2_gbp_chf_h4_r_withvolfixed',
        # 'ppo2_gbp_chf_h4_r_withvolclosefix212'
        study_name = 'ppo2_gbp_chf_h4_loged_sortinonew'
        study = optuna.create_study(study_name=study_name, storage=self.params_db_file, load_if_exists=True)
        try:
            study.optimize(self.optimize_agent, n_trials=self.n_trials, n_jobs=self.n_jobs)
        except KeyboardInterrupt:
            pass
        print('Number of finished trials: ', len(study.trials))
        print('Best trial:')
        trial = study.best_trial
        print('Value: ', trial.value)
        print('Params: ')
        for key, value in trial.params.items():
            print('    {}: {}'.format(key, value))
        return study.trials_dataframe()

#if __name__ == '__main__':
#    optimize()
alt_reward = []
mix_reward = []
temp_reward = []
valveChange = []
speedPunishes = []
for i in range(7):
    data.append([])
for i in range(3):
    actions.append([])
lastValves = [0.15, 0.2, 0.15]
for i in range(1000):
    action, _states = model.predict(obs, deterministic=True)
    obs, rewards, dones, info = eval_env.step(action)
    Or_obs = eval_env.get_original_obs()
    time.append(i)
    for j in range(7):
        data[j].append(Or_obs[0][j])
    for j in range(3):
        actions[j].append(action[0][j])
    alt_reward.append(abs(Or_obs[0][0] - Or_obs[0][1]) / 10)
    mix_reward.append(abs(Or_obs[0][6] - 5.5))
    temp_reward.append(abs(Or_obs[0][5] - 900) / 1000)

plt.figure(figsize=(11, 8))
plt.subplot(4, 2, 1)
plt.xlabel('Time(s)')
plt.ylabel('Offset (m)')
cur_m = 0
env.reset()
fp_path = '/Users/austin/PycharmProjects/RLDock/'
with open('run.pml', 'w') as fp:
    i = 0
    with open('pdbs_traj/test' + str(i) + '.pdb', 'w') as f:
        cur_m = env.render()
        f.write(cur_m.toPDB())
    fp.write("load " + fp_path + 'pdbs_traj/test' + str(i) + '.pdb ')
    fp.write(", ligand, " + str(i + 1) + "\n")

    for i in range(1, 100):
        action, _states = model.predict(obs)
        obs, rewards, done, info = env.step(action)
        print(action, rewards, done)

        atom = env.render()
        header = atom.dump_header()
        states.append(atom.dump_coords())
        cur_m = atom
        with open('pdbs_traj/test' + str(i) + '.pdb', 'w') as f:
            f.write(cur_m.toPDB())
        fp.write("load " + fp_path + 'pdbs_traj/test' + str(i) + '.pdb ')
        fp.write(", ligand, " + str(i + 1) + "\n")

        if done:
            obs = env.reset()
env.close()
class PPO2_SB():
    def __init__(self):
        self.love = 'Ramona'
        self.env_fns = []
        self.env_names = []

    def make_env(self, env_id, rank, seed=0, eval=False, config=pc.configeurcad4h):
        """
        Utility function for multiprocessed env.

        :param env_id: (str) the environment ID
        :param num_env: (int) the number of environments you wish to have in subprocesses
        :param seed: (int) the initial seed for RNG
        :param rank: (int) index of the subprocess
        """
        def _init():
            self.config = config
            self.eval = eval
            env = Template_Gym(config=self.config, eval=self.eval)
            env.seed(seed + rank)
            return env
        set_global_seeds(seed)
        return _init

    def train(self, num_e=1, n_timesteps=1000000, save_fraction=0.0125,
              save='saves/audbuyh4120', config=pc.configgbpchf4h):
        env_id = "default"
        num_e = 1  # Number of processes to use
        # Create the vectorized environment
        #env = DummyVecEnv([lambda: env])  #Ramona
        self.config = config
        self.env = SubprocVecEnv([self.make_env(env_id, i, eval=False, config=self.config)
                                  for i in range(num_e)])
        #env = Template_Gym()
        #self.env = DummyVecEnv([lambda: env])
        self.env = VecNormalize(self.env, norm_obs=True, norm_reward=True)
        #self.model = PPO2(CustomPolicy_4, self.env, verbose=0, nminibatches=1, tensorboard_log="./gbp_chf_4h_r", **self.config.params)
        #self.model = PPO2(CustomPolicy_5, self.env, verbose=0, nminibatches=1, tensorboard_log="./aud_chf", learning_rate=1e-5)
        #self.model = PPO2.load('saves/playerdetails39', self.env, policy=CustomPolicy, tensorboard_log="./playerdetailsex")
        self.model = PPO2.load(self.config.path + str(79) + '.pkl', self.env,
                               policy=CustomPolicy_5, tensorboard_log="./default/")
        #self.model = PPO2.load("default9", self.env, policy=CustomPolicy, tensorboard_log="./test/")
        n_timesteps = int(n_timesteps * save_fraction)
        training_loop = int(1 / save_fraction)
        log_dir = "saves"
        #self.env.load_running_average(log_dir)
        for i in range(training_loop):
            self.model.learn(n_timesteps)
            self.model.save(self.config.path + '8' + str(i))
            self.env.save_running_average(log_dir)
        self.env.save_running_average(log_dir)

    def evaluate(self, num_env=1, num_steps=1461, load='saves/audbuyh1', runs=80, config=pc.configgbpchf4h):
        """
        Evaluate a RL agent

        :param model: (BaseRLModel object) the RL Agent
        :param num_steps: (int) number of timesteps to evaluate it
        :return: (float) Mean reward
        """
        env_id = config.year + config.pair
        num_e = 1
        self.config = config
        log_dir = self.config.log
        #log_dir = self.config.norm
        self.env = SubprocVecEnv([self.make_env(env_id, i, eval=True, config=self.config)
                                  for i in range(num_env)])
        #self.model = PPO2(CustomPolicy, self.env, verbose=1, learning_rate=1e-5, tensorboard_log="./default")
        self.env = VecNormalize(self.env, norm_obs=True, norm_reward=True)
        try:
            self.env.load_running_average(log_dir)
        except Exception:
            print('cant load')
        for run in range(runs):
            self.model = PPO2.load(self.config.path + '8' + str(run) + '.pkl', self.env,
                                   policy=CustomPolicy_5, tensorboard_log="./default/")
            #self.env.load_running_average(log_dir)
            episode_rewards = [[0.0] for _ in range(self.env.num_envs)]
            #self.total_pips = []
            obs = self.env.reset()
            state = None
            # When using VecEnv, done is a vector
            done = [False for _ in range(self.env.num_envs)]
            for step in range(num_steps):
                # _states are only useful when using LSTM policies
                action, state = self.model.predict(obs, state=state, mask=done, deterministic=True)
                # action, rewards and dones are arrays because we are using a vectorized env
                obs, rewards, dones, _ = self.env.step(action)
                #self.total_pips.append(self.env.player.placement)
                # Stats
                for i in range(self.env.num_envs):
                    episode_rewards[i][-1] += rewards[i]
                    if dones[i]:
                        episode_rewards[i].append(0.0)
            #self.env.save_running_average(log_dir)
            mean_rewards = [0.0 for _ in range(self.env.num_envs)]
            n_episodes = 0
            for i in range(self.env.num_envs):
                mean_rewards[i] = np.mean(episode_rewards[i])
                n_episodes += len(episode_rewards[i])
            # Compute mean reward
            mean_reward = np.mean(mean_rewards)
            print("Mean reward:", mean_reward, "Num episodes:", n_episodes)
        #self.env.save(log_dir)
        return mean_reward

    def live(self, num_env=1, num_steps=1461, load='saves/gbp_usd_buy', runs=1, config=pc.configgbpcad4h):
        """
        Run a trained agent in live mode.

        :param num_steps: (int) number of timesteps to run it
        :return: (float) Mean reward
        """
        self.config = config
        env_id = self.config.pair
        num_e = 1
        log_dir = self.config.log
        self.config.live = True
        self.config.load = False
        self.env = SubprocVecEnv([self.make_env(env_id, i, eval=True, config=self.config)
                                  for i in range(num_env)])
        self.env = VecNormalize(self.env, norm_obs=True, norm_reward=True)
        try:
            self.env.load_running_average(self.config.log)
        except Exception:
            print('cant load')
        self.env.num_envs = 1
        for run in range(runs):
            self.model = PPO2.load(self.config.path + str(self.config.best) + '.pkl', self.env,
                                   policy=CustomPolicy_5, tensorboard_log="./default/")
            episode_rewards = [[0.0] for _ in range(self.env.num_envs)]
            print(datetime.datetime.now())
            print(time.ctime())
            print('Market Check')
            print("Market time check")
            obs = self.env.reset()
            state = None
            # When using VecEnv, done is a vector
            done = [False for _ in range(self.env.num_envs)]
            for step in range(num_steps):
                # _states are only useful when using LSTM policies
                print("live step")
                action, state = self.model.predict(obs, state=state, mask=done, deterministic=True)
                # action, rewards and dones are arrays because we are using a vectorized env
                obs, rewards, dones, _ = self.env.step(action)
                #self.total_pips.append(self.env.player.placement)
                # Stats
                for i in range(self.env.num_envs):
                    episode_rewards[i][-1] += rewards[i]
                    if dones[i]:
                        episode_rewards[i].append(0.0)
                print(datetime.datetime.now())
                print(time.ctime())
            mean_rewards = [0.0 for _ in range(self.env.num_envs)]
            n_episodes = 0
            for i in range(self.env.num_envs):
                mean_rewards[i] = np.mean(episode_rewards[i])
                n_episodes += len(episode_rewards[i])
            # Compute mean reward
            mean_reward = np.mean(mean_rewards)
            print("Mean reward_gbp_buy:", mean_reward, "Num episodes:", n_episodes)
        return mean_reward

    # The two functions below are not working at the moment
    def pre_train(self, num_e=1, load="saves/m19"):
        env_id = 'default'
        num_e = 1
        log_dir = "saves"
        # Using only one expert trajectory;
        # you can specify `traj_limitation=-1` for using the whole dataset
        #env = make_env()
        dataset = ExpertDataset(expert_path='default2.npz', traj_limitation=1, batch_size=128)
        self.env = SubprocVecEnv([self.make_env(env_id, i) for i in range(num_e)])
        #env = Template_Gym()
        #self.env = DummyVecEnv([lambda: env])
        self.env = VecNormalize(self.env, norm_obs=True, norm_reward=True)
        #model = GAIL("MlpPolicy", env=env, expert_dataset=dataset, verbose=1)
        self.env.save_running_average("saves" + self.config.pair)
        self.model = PPO2(CustomPolicy, self.env, verbose=1, nminibatches=1,
                          learning_rate=1e-5, tensorboard_log="./m1ln4")
        #self.model = PPO2.load("saves/m19", self.env, policy=CustomPolicy, tensorboard_log="./default/")
        self.env.save_running_average("saves" + self.config.pair)
        # Pretrain the PPO2 model
        self.model.pretrain(dataset, n_epochs=10000)
        # As an option, you can train the RL agent afterwards
        #self.model.learn(int(100000000))
        # Test the pre-trained model
        self.env = self.model.get_env()
        self.env.save_running_average("saves" + self.config.pair)
        obs = self.env.reset()
        reward_sum = 0.0
        for _ in range(1000000):
            action, _ = self.model.predict(obs)
            obs, reward, done, _ = self.env.step(action)
            reward_sum += reward
            #self.env.render()
            if done:
                print(reward_sum)
                reward_sum = 0.0
                obs = self.env.reset()
        self.env.close()

    def gen_pre_train(self, num_e=1, save='default2', episodes=1000):
        #self.create_envs(game_name=game, state_name=state, num_env=num_e)
        #self.env = SubprocVecEnv(self.env_fns)
        env_id = 'default'
        num_e = 1
        self.env = SubprocVecEnv([self.make_env(env_id, i) for i in range(num_e)])
        #env = Template_Gym()
        #self.env = DummyVecEnv([lambda: env])
        self.env = VecNormalize(self.env, norm_obs=True, norm_reward=True)
        #model = GAIL("MlpPolicy", env=env, expert_dataset=dataset, verbose=1)
        self.env.load_running_average("saves")
        self.model = PPO2.load("saves/m19", self.env, policy=CustomPolicy, tensorboard_log="./default/")
        self.env.load_running_average("saves")
        #self.expert_agent = generate_expert_traj(self.model, save, self.env, n_episodes=episodes)

    def gen_pre_train_2(self, game, state, num_e=1, save='default2', episodes=10):
        self.create_envs(game_name=game, state_name=state, num_env=num_e)
        env = SubprocVecEnv(self.env_fns)
        self.expert_agent = "moose"
        self.generate_expert_traj(self.expert_agent, save, env, n_episodes=episodes)
class RocketTrainer:
    def __init__(self, algorithm="SAC", load=True, agent_name="Agent001"):
        self.agent_name = agent_name
        #self.env = LearningRocket(visualize=False)
        #self.env = NormalizeActionWrapper(self.env)
        #self.eval_env = LearningRocket(visualize=True)
        #self.eval_env = NormalizeActionWrapper(self.eval_env)
        #self.env = SubprocVecEnv([lambda: LearningRocket(visualize=False) for i in range(4)])
        self.env = make_vec_env(LearningRocket, n_envs=16)
        #self.eval_env = VecNormalize(DummyVecEnv([lambda: LearningRocket(visualize=True) for i in range(1)]))
        self.eval_env = make_vec_env(lambda: LearningRocket(visualize=True), n_envs=1)
        #self.eval_env = VecNormalize(self.eval_env)
        self.eval_callback = EvalCallback(self.eval_env, best_model_save_path='Agent007',
                                          log_path='./logs/', eval_freq=10000,
                                          deterministic=True, render=False, n_eval_episodes=1)
        kai_policy = dict(act_fun=tf.nn.tanh, net_arch=[400, 300])
        #check_env(self.env, warn=True)
        """
        if algorithm == "SAC":
            if load is True:
                self.model = SAC.load(agent_name, env=self.env, tensorboard_log="./rocket_tensorboard/")
                #self.model.ent_coef = 0.2
            else:
                self.model = SAC('MlpPolicy', self.env, verbose=1,
                                 tensorboard_log="./rocket_tensorboard/", ent_coef=5)
            print("Trainer Set for SAC")
        """
        if algorithm == "TD3":
            n_actions = self.env.action_space.shape[-1]
            action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions))
            if load is True:
                self.model = TD3.load(agent_name, env=self.env, tensorboard_log="./rocket_tensorboard/")
                #file = open('replay_buffer', 'rb')
                #self.model.replay_buffer = pickle.load(file)
                #file.close()
            else:
                self.model = TD3(MlpPolicy, self.env, action_noise=action_noise, batch_size=768,
                                 gamma=0.95, learning_rate=1e-4, learning_starts=20000, verbose=1,
                                 tensorboard_log="./rocket_tensorboard/",
                                 policy_kwargs=dict(layers=[400, 300]))
            print("Trainer Set for TD3")
        elif algorithm == "PPO2":
            if load is True:
                self.model = PPO2.load(agent_name, env=self.env, tensorboard_log="./rocket_tensorboard/")
                self.eval_env = VecNormalize.load(self.agent_name + "vEnv", self.eval_env)
                #self.eval_env.clip_obs = 500
                #self.env = VecNormalize(self.env)
                self.env = VecNormalize.load(self.agent_name + "vEnv", self.env)
                #self.env.clip_obs = 500
                #self.env.norm_obs = False
                #self.eval_env.norm_obs = False
            else:
                self.model = PPO2(PPOMlpPolicy, self.env, n_steps=1024, nminibatches=32, lam=0.98,
                                  gamma=0.999, noptepochs=4, ent_coef=0.01, verbose=1,
                                  tensorboard_log="./rocket_tensorboard/",
                                  policy_kwargs=dict(layers=[400, 300]))
                self.eval_env = VecNormalize(self.eval_env)
                self.env = VecNormalize(self.env)
                #self.eval_env.clip_obs = 500
                #self.env.clip_obs = 500
                #self.env.norm_obs = False
                #self.eval_env.norm_obs = False
            print("Trainer set for PPO2. I am speed.")

    def train(self, visualize=False, lesson_length=100000, lessons=1):
        print("Today I'm teaching rocket science. How hard can it be?")
        #self.env.render(visualize)
        for i in range(lessons):
            print("*sigh* here we go again.")
            self.model.learn(total_timesteps=lesson_length, callback=self.eval_callback)
            self.model.save(self.agent_name)
            self.env.save(self.agent_name + "vEnv")
            #self.eval_env = VecNormalize.load(self.agent_name + "vEnv", self.eval_env)
            #a_file = open('replay_buffer', 'wb')
            #pickle.dump(self.model.replay_buffer, a_file)
            #a_file.close()
            print("{} Batches Done.".format(i + 1))
        #plt.close()
        mean_reward, std_reward = evaluate_policy(self.model, self.eval_env, n_eval_episodes=1)
        print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")
        self.evaluate()

    def lecture(self):
        teacher = DummyExpert()
        #teacher = NormalizeActionWrapper(teacher)
        print("Let me show you how it's done.")
        generate_expert_traj(teacher.teach, 'dummy_expert_rocket', self.env, n_episodes=10)

    def evaluate(self):
        self.eval_env.training = False
        self.eval_env.norm_reward = False
        print("Watch this!")
        obs = self.eval_env.reset()
        #self.eval_env.render(True)
        mean_reward, std_reward = evaluate_policy(self.model, self.eval_env, n_eval_episodes=1)
        print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")

        reward_list = []
        reward_sum: List[float] = []
        action_list = [[] for _ in range(3)]
        Time = []
        steps = 0
        cumulativeReward = 0
        data = [[] for _ in range(obs.size)]

        for j in range(1000):
            action, states = self.model.predict(obs, deterministic=True)
            obs, reward, done, info = self.eval_env.step(action)
            #re_obs = self.eval_env.rescale_observation((obs))
            #obs = self.eval_env.get_original_obs()
            #action = self.eval_env.rescale_action(action)
            reward_list.append(reward[0])
            cumulativeReward += reward[0]
            reward_sum.append(cumulativeReward)
            action_list[0].append(action[0])
            #for i in range(3):
            #    action_list[i].append(action[i])
            for i in range(obs.size):
                data[i].append(obs[0][i])
            steps += 1
            Time.append(steps)
        print("Another happy landing.")

        plt.figure(figsize=(11, 8))
        plt.subplot(3, 2, 3)
        plt.xlabel('Time(s)')
        plt.ylabel('Position (m)')
        plt.plot(Time, data[0], label='X Position')
        plt.plot(Time, data[1], label='Speed')
        #plt.plot(Time, data[2], label='Z Position')
        plt.legend(loc='best')
        plt.subplot(3, 2, 1)
        plt.xlabel('Time(s)')
        plt.ylabel('Reward')
        plt.plot(Time, reward_list, label='Reward')
        plt.plot(Time, reward_sum, label='Total Reward')
        plt.legend(loc='best')
        plt.subplot(3, 2, 2)
        plt.xlabel('Time(s)')
        plt.ylabel('Actions')
        plt.plot(Time, action_list[0], label='Thrust')
        #plt.plot(Time, action_list[1], label='GimbalX')
        #plt.plot(Time, action_list[2], label='GimbalY')
        plt.legend(loc='best')
        plt.subplot(3, 2, 4)
        plt.xlabel('Time(s)')
        plt.ylabel('Attitude')
        #plt.plot(Time, data[4], label='Roll')
        #plt.plot(Time, data[4], label='Pitch')
        #plt.plot(Time, data[5], label='Yaw')
        plt.legend(loc='best')
        plt.subplot(3, 2, 5)
        plt.xlabel('Time(s)')
        plt.ylabel('Velocity')
        #plt.plot(Time, data[2], label='vX')
        #plt.plot(Time, data[3], label='vY')
        #plt.plot(Time, data[5], label='vZ')
        plt.legend(loc='best')
        plt.subplot(3, 2, 6)
        plt.xlabel('Time(s)')
        plt.ylabel('RotVel')
        #plt.plot(Time, data[12], label='Fuel')
        #plt.plot(Time, data[6], label='Rot X')
        #plt.plot(Time, data[7], label='Rot Y')
        plt.legend(loc='best')
        plt.tight_layout()
        plt.show()
class Optimization():
    def __init__(self):
        self.reward_strategy = 'sortino2'
        #self.input_data_file = 'data/coinbase_hourly.csv'
        self.params_db_file = 'sqlite:///params.db'
        # number of parallel jobs
        self.n_jobs = 1
        # maximum number of trials for finding the best hyperparams
        self.n_trials = 100
        # number of test episodes per trial
        self.n_test_episodes = 10
        # number of evaluations for pruning per trial
        self.n_evaluations = 10
        #self.df = pd.read_csv(input_data_file)
        #self.df = df.drop(['Symbol'], axis=1)
        #self.df = df.sort_values(['Date'])
        #self.df = add_indicators(df.reset_index())
        #self.train_len = int(len(df) * 0.8)
        #self.df = df[:train_len]
        #self.validation_len = int(train_len * 0.8)
        #self.train_df = df[:validation_len]
        #self.test_df = df[validation_len:]

    def make_env(self, env_id, rank, seed=0, eval=False):
        """
        Utility function for multiprocessed env.

        :param env_id: (str) the environment ID
        :param num_env: (int) the number of environments you wish to have in subprocesses
        :param seed: (int) the initial seed for RNG
        :param rank: (int) index of the subprocess
        """
        def _init():
            self.eval = eval
            env = Template_Gym(eval=self.eval)
            env.seed(seed + rank)
            return env
        set_global_seeds(seed)
        return _init

    def optimize_envs(self, trial):
        return {
            'reward_func': self.reward_strategy,
            'forecast_len': int(trial.suggest_loguniform('forecast_len', 1, 200)),
            'confidence_interval': trial.suggest_uniform('confidence_interval', 0.7, 0.99),
        }

    def optimize_ppo2(self, trial):
        return {
            'n_steps': int(trial.suggest_loguniform('n_steps', 16, 2048)),
            'gamma': trial.suggest_loguniform('gamma', 0.9, 0.9999),
            'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 1.),
            'ent_coef': trial.suggest_loguniform('ent_coef', 1e-8, 1e-1),
            'cliprange': trial.suggest_uniform('cliprange', 0.1, 0.4),
            'noptepochs': int(trial.suggest_loguniform('noptepochs', 1, 48)),
            'lam': trial.suggest_uniform('lam', 0.8, 1.)
        }

    def optimize_agent(self, trial):
        #self.env_params = self.optimize_envs(trial)
        env_id = "default"
        num_e = 1  # Number of processes to use
        self.train_env = DummyVecEnv([lambda: Template_Gym(eval=False)])
        #self.train_env = SubprocVecEnv([self.make_env(env_id, i, eval=False) for i in range(num_e)])
        self.train_env = VecNormalize(self.train_env, norm_obs=True, norm_reward=True)
        self.test_env = DummyVecEnv([lambda: Template_Gym(eval=True)])
        #self.test_env = SubprocVecEnv([self.make_env(env_id, i, eval=True) for i in range(num_e)])
        # Note: originally wrapped self.train_env here, which discarded the test env; wrap the test env instead.
        self.test_env = VecNormalize(self.test_env, norm_obs=True, norm_reward=True)
        self.model_params = self.optimize_ppo2(trial)
        self.model = PPO2(CustomPolicy_2, self.train_env, verbose=0, nminibatches=1,
                          tensorboard_log=Path("./tensorboard2").name, **self.model_params)
        #self.model = PPO2(CustomPolicy_2, self.env, verbose=0, learning_rate=1e-4, nminibatches=1, tensorboard_log="./min1")
        last_reward = -np.finfo(np.float16).max
        #evaluation_interval = int(len(train_df) / self.n_evaluations)
        evaluation_interval = 3000
        for eval_idx in range(self.n_evaluations):
            try:
                self.model.learn(evaluation_interval)
            except AssertionError:
                raise
            rewards = []
            n_episodes, reward_sum = 0, 0.0
            obs = self.test_env.reset()
            while n_episodes < self.n_test_episodes:
                action, _ = self.model.predict(obs)
                obs, reward, done, _ = self.test_env.step(action)
                reward_sum += reward
                if done:
                    rewards.append(reward_sum)
                    reward_sum = 0.0
                    n_episodes += 1
                    obs = self.test_env.reset()
            last_reward = np.mean(rewards)
            trial.report(-1 * last_reward, eval_idx)
            if trial.should_prune(eval_idx):
                raise optuna.structs.TrialPruned()
        return -1 * last_reward

    def optimize(self):
        study_name = 'ppo2_' + self.reward_strategy
        study = optuna.create_study(study_name=study_name, storage=self.params_db_file, load_if_exists=True)
        try:
            study.optimize(self.optimize_agent, n_trials=self.n_trials, n_jobs=self.n_jobs)
        except KeyboardInterrupt:
            pass
        print('Number of finished trials: ', len(study.trials))
        print('Best trial:')
        trial = study.best_trial
        print('Value: ', trial.value)
        print('Params: ')
        for key, value in trial.params.items():
            print('    {}: {}'.format(key, value))
        return study.trials_dataframe()

#if __name__ == '__main__':
#    optimize()
class SAC_SB():
    def __init__(self):
        self.love = 'Ramona'
        self.env_fns = []
        self.env_names = []

    def make_env(self, env_id, rank, seed=0):
        """
        Utility function for multiprocessed env.

        :param env_id: (str) the environment ID
        :param num_env: (int) the number of environments you wish to have in subprocesses
        :param seed: (int) the initial seed for RNG
        :param rank: (int) index of the subprocess
        """
        def _init():
            env = Template_Gym()
            env.seed(seed + rank)
            return env
        set_global_seeds(seed)
        return _init

    def train(self, num_e=1, n_timesteps=10000000, save_fraction=0.1, save='saves/m1'):
        env_id = "default"
        num_e = 32  # Number of processes to use
        # Create the vectorized environment
        #env = DummyVecEnv([lambda: env])  #Ramona
        #self.env = SubprocVecEnv([self.make_env(env_id, i) for i in range(num_e)])
        env = Template_Gym()
        self.env = DummyVecEnv([lambda: env])
        self.env = VecNormalize(self.env, norm_obs=True, norm_reward=True)
        #self.model = PPO2(CustomPolicy_2, self.env, verbose=0, learning_rate=1e-5, tensorboard_log="./test6")
        self.model = SAC(CustomPolicy_sac, self.env, verbose=1, learning_rate=1e-5, tensorboard_log="./m1lstm1")
        #self.model = PPO2.load("default9", self.env, policy=CustomPolicy, tensorboard_log="./test/")
        n_timesteps = int(n_timesteps * save_fraction)
        training_loop = int(1 / save_fraction)
        for i in range(training_loop):
            self.model.learn(n_timesteps)
            self.model.save(save + str(i))

    def evaluate(self, num_env=32, num_steps=50, load="saves/defaultlstmday", runs=10):
        """
        Evaluate a RL agent

        :param model: (BaseRLModel object) the RL Agent
        :param num_steps: (int) number of timesteps to evaluate it
        :return: (float) Mean reward
        """
        env_id = 'default'
        num_e = 1
        self.env = SubprocVecEnv([self.make_env(env_id, i) for i in range(num_env)])
        #self.model = PPO2(CustomPolicy, self.env, verbose=1, learning_rate=1e-5, tensorboard_log="./default")
        self.env = VecNormalize(self.env, norm_obs=False, norm_reward=True)
        for run in range(runs):
            self.model = PPO2.load(load + str(run), self.env, policy=CustomPolicy_2, tensorboard_log="./default/")
            episode_rewards = [[0.0] for _ in range(self.env.num_envs)]
            #self.total_pips = []
            obs = self.env.reset()
            for step in range(num_steps):
                # _states are only useful when using LSTM policies
                actions, _states = self.model.predict(obs)
                # action, rewards and dones are arrays because we are using a vectorized env
                obs, rewards, dones, info = self.env.step(actions)
                #self.total_pips.append(self.env.player.placement)
                # Stats
                for i in range(self.env.num_envs):
                    episode_rewards[i][-1] += rewards[i]
                    if dones[i]:
                        episode_rewards[i].append(0.0)
            mean_rewards = [0.0 for _ in range(self.env.num_envs)]
            n_episodes = 0
            for i in range(self.env.num_envs):
                mean_rewards[i] = np.mean(episode_rewards[i])
                n_episodes += len(episode_rewards[i])
            # Compute mean reward
            mean_reward = np.mean(mean_rewards)
            print("Mean reward:", mean_reward, "Num episodes:", n_episodes)
        return mean_reward