def _init_environment(self, datapath, window_size):
    df = pd.read_csv(datapath)
    bid_price_columns = [i for i in range(1, len(df.columns), 20)]
    print(bid_price_columns)
    ask_price_columns = [i for i in range(3, len(df.columns), 20)]
    bidPrices = df[df.columns[bid_price_columns]]
    askPrices = df[df.columns[ask_price_columns]]
    df_concat = pd.concat([bidPrices, askPrices])
    midPrices = df_concat.groupby(df_concat.index).mean().transpose().values[-len(self.securities):]
    print(midPrices[:, 0])

    self.env = DummyVecEnv([lambda: securities_trading_env(np.array(midPrices).T)])
    self.env = VecCheckNan(self.env, raise_exception=True)

    n_actions = self.env.action_space.shape[-1]
    param_noise = None
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                sigma=0.5 * np.ones(n_actions))
    print(n_actions)

    if self.policy == "DDPG":
        self.model = DDPG(ddpgMlpPolicy, self.env, verbose=int(self.verbose),
                          param_noise=param_noise, action_noise=action_noise)
    elif self.policy == "TD3":
        self.model = TD3(td3MlpPolicy, self.env, verbose=int(self.verbose))
    elif self.policy == "GAIL":
        # note: this branch also instantiates TD3, not a GAIL model
        self.model = TD3(td3MlpPolicy, self.env, verbose=int(self.verbose))
    else:
        self.model = PPO2(MlpLnLstmPolicy, self.env, verbose=int(self.verbose))

    if self.load:  # load model
        self.model = self.model.load("save/" + modelpath + ".h5")

    # init model class
    self.gym_model = Agent(market_event_securities, market_event_queue, securities,
                           queue, host, policy, strategy, cash_balance,
                           self.model, self.env, window_size, self.inventory)
def test_check_nan():
    """Test VecCheckNan Object"""
    env = DummyVecEnv([NanAndInfEnv])
    env = VecCheckNan(env, raise_exception=True)

    env.step([[0]])

    try:
        env.step([[float('NaN')]])
    except ValueError:
        pass
    else:
        assert False

    try:
        env.step([[float('inf')]])
    except ValueError:
        pass
    else:
        assert False

    try:
        env.step([[-1]])
    except ValueError:
        pass
    else:
        assert False

    try:
        env.step([[1]])
    except ValueError:
        pass
    else:
        assert False
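The test above assumes a NanAndInfEnv helper. A minimal sketch of such an environment, modeled on the helper in stable-baselines' own test suite (the class body here is an assumption, not part of this section):

import gym
import numpy as np
from gym import spaces

class NanAndInfEnv(gym.Env):
    """Sketch of a toy env that deliberately emits NaN/Inf observations
    (assumed helper; the canonical version lives in stable-baselines' tests)."""
    metadata = {'render.modes': ['human']}

    def __init__(self):
        super(NanAndInfEnv, self).__init__()
        self.action_space = spaces.Box(low=-np.inf, high=np.inf, shape=(1,), dtype=np.float64)
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(1,), dtype=np.float64)

    @staticmethod
    def step(action):
        # Positive actions yield NaN, negative actions yield Inf, zero is valid,
        # so every branch of the test above is exercised.
        if np.all(np.array(action) > 0):
            obs = float('NaN')
        elif np.all(np.array(action) < 0):
            obs = float('inf')
        else:
            obs = 0.0
        return [obs], 0.0, False, {}

    @staticmethod
    def reset():
        return [0.0]

    def render(self, mode='human', close=False):
        pass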
def main():
    save_path = args.checkpoint_dir + args.policy + "/" + args.policy
    env = gym.make("SegmentationEnv-v0",
                   objs_dir=args.objs_dir,
                   max_scenes=args.max_scenes,
                   sample_size=args.sample_size,
                   diff_punishment=args.diff_punishment,
                   max_steps_per_scene=args.max_steps_per_scene,
                   scene_mode=args.scene_mode,
                   training=False,
                   point_mode=args.point_mode,
                   voxel_size=args.voxel_size,
                   voxel_mode=args.voxel_mode,
                   single_scenes=args.single_scenes,
                   early_diff=args.early_diff)
    env = DummyVecEnv([lambda: env])  # the algorithms require a vectorized environment to run
    env = VecCheckNan(env, raise_exception=True)

    model = PPO2.load(save_path, env=env)

    n_episodes = 10
    for i in range(n_episodes):
        total_reward = 0
        obs = env.reset()
        while True:
            action, _states = model.predict(obs, deterministic=True)
            obs, reward, done, info = env.step(action)
            total_reward += reward
            if done:
                print("Total Reward: ", total_reward)
                break

    env.close()
def objective(trial):
    # copy to preserve original params
    _params = params.copy()
    _params['hyper_params'] = HYPERPARAMS_SAMPLER[args.algorithm.lower()](trial)

    # network architecture
    net_arch = trial.suggest_categorical('net_arch', ['8x8', '16x16', '32x32'])
    layers = map(int, net_arch.split('x'))
    policy_kwargs = dict(act_fun=tf.nn.relu, net_arch=list(layers))

    print(f'*** beginning trial {trial.number}')
    print('\thyper-parameters:')
    for param, value in _params['hyper_params'].items():
        print(f'\t\t{param}:{value}')
    print(f'\t\tnet_arch: {net_arch}')

    _params['save_dir'] = _params['save_dir'] / 'optimizer'

    try:
        # purge any previously saved models
        purge_model(_params, args, interactive=False)

        ######################################################
        # learning phase - on possibly multiple environments #
        ######################################################
        godot_instances = [GodotInstance(o_port, a_port)
                           for o_port, a_port in get_godot_instances(args.n_godot_instances)]
        env = create_env(args, env_id, godot_instances, _params, session_path)
        env = VecCheckNan(env, warn_once=False, raise_exception=True)

        # learn and save model
        model = init_model(session_path, _params, env, args, policy_kwargs=policy_kwargs)
        learn(env, model, _params, args, session_path)
        env.close()

        ##########################################################################
        # evaluation phase - single environment (deterministic action selection) #
        ##########################################################################
        env = create_env(args, env_id, [GODOT_EVAL_INSTANCE], _params, session_path, eval=True)
        env = VecCheckNan(env, warn_once=False, raise_exception=True)

        # load previously learned model and evaluate it
        model = init_model(session_path, _params, env, args, eval=True)
        mean_reward, _ = evaluate(model, env, args, n_episodes=n_episodes_per_eval)
        env.close()
    except (AssertionError, ValueError) as e:
        print(f'pruning optimizer trial {trial} due to exception {e}')
        raise optuna.exceptions.TrialPruned()

    # optuna minimizes the objective by default, so we need to flip the sign to maximize
    cost = -1 * mean_reward
    return cost
def make_env(n_envs=1, normalize=True, multiprocess=False, log_dir_env=None):
    """
    Initializes an OpenAI Gym environment for training and evaluation.

    :param n_envs: Number of parallel environments to initialize
    :param normalize: Normalization of state values
    :param multiprocess: Use multiprocessing with the SubprocVecEnv instead of
        DummyVecEnv. Not recommended.
    :param log_dir_env: Parent directory of the environments' log directory
    :return: the wrapped, vectorized environment
    """
    def init_env(log_dir_env):
        if log_dir_env is None:
            log_dir_env = os.path.join(log_dir, "env_direct")
            os.makedirs(log_dir_env, exist_ok=True)
        log_dir_single = os.path.join(log_dir_env, str(uuid.uuid4()))
        env = gym.make('AtcEnv-v0')
        env = TimeLimit(env, 8000)
        os.makedirs(log_dir_single, exist_ok=True)
        env = Monitor(env, log_dir_single, allow_early_resets=True)
        return env

    if multiprocess:
        env = SubprocVecEnv([lambda: init_env(log_dir_env) for i in range(n_envs)])
    else:
        env = DummyVecEnv([lambda: init_env(log_dir_env) for i in range(n_envs)])

    env = VecCheckNan(env, raise_exception=True)
    if normalize:
        env = VecNormalize(env, norm_obs=True, norm_reward=False, clip_obs=10.)
    return env
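A hypothetical call site for make_env, assuming 'AtcEnv-v0' is registered and the module-level log_dir used by init_env exists:

# Hypothetical usage: four normalized, single-process environments.
env = make_env(n_envs=4, normalize=True, multiprocess=False)
obs = env.reset()  # stacked observations, one row per parallel env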
def _check_nan(env: gym.Env) -> None:
    """Check for Inf and NaN using the VecWrapper."""
    vec_env = VecCheckNan(DummyVecEnv([lambda: env]))
    for _ in range(10):
        action = [env.action_space.sample()]
        _, _, _, _ = vec_env.step(action)
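A minimal sketch of exercising the same check directly, using 'Pendulum-v0' purely as a stand-in environment:

import gym
from stable_baselines.common.vec_env import DummyVecEnv, VecCheckNan

# Probe an env for NaN/Inf by stepping it with random actions.
# 'Pendulum-v0' is an arbitrary stand-in; VecCheckNan warns by default,
# pass raise_exception=True to fail fast instead.
env = gym.make('Pendulum-v0')
vec_env = VecCheckNan(DummyVecEnv([lambda: env]))
vec_env.reset()
for _ in range(10):
    vec_env.step([env.action_space.sample()])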
        ghz_state = StandardGHZStateQ(N)
    else:
        ghz_state = StandardGHZState(N)

    env = QubitEnv(N=N, V=C6, geometry=geometry, t_list=np.linspace(0, t, t_num),
                   Omega_range=OMEGA_RANGE, Delta_range=DELTA_RANGE,
                   ghz_state=ghz_state, verbose=ENV_VERBOSE)
    return env


generating_envs_start_time = time.time()
env = EnvType([lambda: make_gym_env() for i in range(n_envs)])
env = VecCheckNan(env, raise_exception=True)
generating_envs_end_time = time.time()
print(
    f"Generated {n_envs} envs in {generating_envs_end_time - generating_envs_start_time:.3f}s"
)

model = PPO2(
    MlpLstmPolicy,
    env,
    learning_rate=LEARNING_RATE,
    verbose=1,
    nminibatches=1,
    n_steps=t_num,
    tensorboard_log='./tensorboard_logs',
    # ent_coef=0.05
)
import __future__

import gym
from stable_baselines.common.vec_env import VecCheckNan
from stable_baselines.common import make_vec_env
from stable_baselines import PPO2

env = make_vec_env('CMuRL_Env:CMuRL-Env-v0')
env = VecCheckNan(env, raise_exception=True)

model = PPO2.load('CMuRL_Model_v6')

obs = env.reset()
while True:
    action, _states = model.predict(obs)
    obs, rewards, _, _ = env.step(action)
    env.render()
def main():
    global save_path, log_dir, model, best_mean_reward

    mk_dir(args.checkpoint_dir + args.policy)
    save_path = args.checkpoint_dir + args.policy + "/" + args.policy
    log_dir = args.summary_dir + args.policy
    mk_dir(log_dir)

    env = gym.make("SegmentationEnv-v0",
                   objs_dir=args.objs_dir,
                   max_scenes=args.max_scenes,
                   sample_size=args.sample_size,
                   diff_punishment=args.diff_punishment,
                   max_steps_per_scene=args.max_steps_per_scene,
                   scene_mode=args.scene_mode,
                   point_mode=args.point_mode,
                   voxel_size=args.voxel_size,
                   voxel_mode=args.voxel_mode,
                   single_scenes=args.single_scenes,
                   early_diff=args.early_diff,
                   wall_weight=args.wall_weight)

    env = Monitor(env, log_dir, allow_early_resets=True)
    env = DummyVecEnv([lambda: env])  # the algorithms require a vectorized environment to run
    env = VecCheckNan(env, raise_exception=True)

    net_module = importlib.import_module(args.policy)
    model = PPO2(net_module.Policy, env,
                 verbose=args.verbose,
                 tensorboard_log=log_dir,
                 learning_rate=args.learning_rate,
                 ent_coef=args.ent_coef,
                 cliprange=args.cliprange,
                 cliprange_vf=args.cliprange_vf,
                 lam=args.lam,
                 gamma=args.gamma,
                 seed=args.seed,
                 n_cpu_tf_sess=args.n_cpu_tf_sess,
                 noptepochs=args.noptepochs,
                 nminibatches=args.nminibatches,
                 n_steps=args.n_steps,
                 max_grad_norm=args.max_grad_norm)

    if os.path.isfile("expert_trajectories.npz") and args.pretrain == 1:
        print("------------start pretrain------------")
        # dataset = ExpertDataset(expert_path="expert_trajectories.npz", special_shape=True, traj_limitation=100, batch_size=16)
        dataset = ExpertDataset(expert_path="expert_trajectories.npz", special_shape=True,
                                train_fraction=args.train_fraction,
                                batch_size=args.pretrain_batch_size)
        # model.pretrain(dataset, learning_rate=0.001, n_epochs=1000)
        model = model.pretrain(dataset, val_interval=1,
                               learning_rate=args.pretrain_learning_rate,
                               n_epochs=args.pretrain_n_epochs)
        print("pretrain finished -- save model")
        model.save(save_path)

    returns = []
    print("Calculate mean reward")
    n_episodes = 10
    for i in range(n_episodes):
        total_reward = 0
        obs = env.reset()
        while True:
            action, _states = model.predict(obs, deterministic=True)
            obs, reward, done, info = env.step(action)
            total_reward += reward
            if done:
                returns.append(total_reward)
                break
    returns = np.array(returns)
    best_mean_reward = np.mean(returns)
    print("Best mean reward: {:.2f}".format(best_mean_reward))

    model.learn(total_timesteps=args.total_timesteps, callback=callback)
    env.close()
MAX_FOR_BLOCKS = 230

botToUse = sys.argv[1]
# oldLog = sys.argv[2]
oldLog = "{\"log\":[{\"PayloadRolled\":{\"from\":1,\"block\":1}},{\"PayloadPlaced\":{\"from\":1,\"block\":1,\"orientation\":0,\"x\":0,\"y\":0}},{\"PayloadRolled\":{\"from\":0,\"block\":1}},{\"PayloadPlaced\":{\"from\":0,\"block\":1,\"orientation\":0,\"x\":8,\"y\":0}},{\"PayloadRolled\":{\"from\":1,\"block\":1}},{\"PayloadPlaced\":{\"from\":1,\"block\":1,\"orientation\":0,\"x\":2,\"y\":0}},{\"PayloadRolled\":{\"from\":0,\"block\":0}},{\"PayloadPlaced\":{\"from\":0,\"block\":0,\"orientation\":1,\"x\":3,\"y\":2}},{\"PayloadRolled\":{\"from\":1,\"block\":0}},{\"PayloadPlaced\":{\"from\":1,\"block\":0,\"orientation\":1,\"x\":3,\"y\":6}},{\"PayloadRolled\":{\"from\":0,\"block\":2}},{\"PayloadConsidering\":{\"play_index\":0}}]}"

is_box_space = True
dirname = "D:\\4-System\\rusty\\"
filename = "50000_heutistic_pretrain_"
filename += "box" if is_box_space else "discrete"

np.seterr(all='raise')

origEnv = gym.make("rustybox-v0" if is_box_space else "rustydiscrete-v0")
origEnv.max_invalid_tries = 7
env = VecCheckNan(DummyVecEnv([lambda: origEnv]))

# Instantiate the agent
model = PPO2.load("models/ppo2boxbestparam/2e4-30.pkl", env=env)
# model.load("models/pretrain/"+filename)

rustLib.field_restore_log(origEnv.field, oldLog.encode('utf-8'))
obs = field_to_array(origEnv.field)
actions, _states = model.predict(obs)

if is_box_space:
    action = 0
    current_max = -1
    for i in range(0, len(actions)):
        action_probability = actions[i]
        newIndex = MAX_FOR_BLOCKS + i
        action_possible = obs[int(newIndex / 10)][newIndex % 10] == 1
# Load Dataset
stocks_df = dth.load_data(config.data_path)

# make train, val, test df
train, val, test = dth.train_val_test_split(stocks_df)

# Training Env
train_env = DummyVecEnv([
    lambda: valueTradingEnv(df=train, sample=config.trainsampling,
                            episodic=config.episodic, yearrange=config.yearrange,
                            save_path=config.env_path.joinpath("train"))
    for i in range(config.num_envs)
])
train_env = VecCheckNan(train_env, raise_exception=True)

# Validation Env
val_env = DummyVecEnv([
    lambda: valueTradingEnv(df=val, sample=False,
                            episodic=config.episodic, yearrange=config.yearrange,
                            save_path=config.env_path.joinpath("val"))
    for i in range(config.num_envs)
])
val_env = VecCheckNan(val_env, raise_exception=True)

# test_env
test_env = DummyVecEnv([
    lambda: valueTradingEnv(df=test,
import gym
import sys
from stable_baselines.common.policies import MlpPolicy
from stable_baselines import PPO2
from stable_baselines.common.vec_env import DummyVecEnv, VecCheckNan
import numpy as np
from os import listdir
from os.path import isfile, join

np.seterr(all='raise')
npseed = 230386042
np.random.seed(npseed)

gym.register(id="rustyblocks-v0", entry_point="custom_env:RustyBlocksEnv")
env = gym.make("rustyblocks-v0")
env.max_invalid_tries = 7
env = VecCheckNan(DummyVecEnv([lambda: env]))

begin = 0
step_size = int(2e4)

# dirname = "models/"
# logdirname = "boardlog/"
# modeldir = "ppo2boxbestparam/"
# file_step = "2e4-"
# for f in listdir(dirname+modeldir):
#     if f.startswith(file_step):
#         end = f[len(file_step):]
#         num = int(end[:len(end)-4])
#         if num > begin:
#             begin = num

# seed 420420420
# Instantiate the agent
# model = PPO2('MlpPolicy', env, verbose=1, max_grad_norm=1.42481794257356, cliprange=1.36870169927419, vf_coef=0.487354638658612, ent_coef=0.000130839434944482, gamma=0.993211512071304, lam=0.92669713813749, learning_rate=0.00150606967404027, n_steps=709, noptepochs=35, nminibatches=1)

# Train the agent
def single_run(self, folder_path, num_evals, policy_kwargs=None,
               is_baseline=False, baseline_policy=None):
    # initialize cProfile
    profiler_object = cProfile.Profile()
    profiler_object.enable()

    config = configparser.ConfigParser()
    config.read('gym_config/config.ini')

    rl_time_steps = config.getint('rl', 'time_steps')
    ent_coef = config.getfloat('rl', 'ent_coef')
    n_steps = config.getint('rl', 'n_steps')
    nminibatches = config.getint('rl', 'nminibatches')
    noptepochs = config.getint('rl', 'noptepochs')
    learning_rate = config.getfloat('rl', 'learning_rate')
    time_steps = config.getint('garden', 'time_steps')
    step = config.getint('garden', 'step')
    num_plants_per_type = config.getint('garden', 'num_plants_per_type')
    num_plant_types = config.getint('garden', 'num_plant_types')
    garden_x = config.getint('garden', 'X')
    garden_y = config.getint('garden', 'Y')
    # Z axis contains a matrix for every plant type plus one for water levels.
    garden_z = 2 * config.getint('garden', 'num_plant_types') + 1
    sector_width = config.getint('garden', 'sector_width')
    sector_height = config.getint('garden', 'sector_height')
    action_low = config.getfloat('action', 'low')
    action_high = config.getfloat('action', 'high')
    obs_low = config.getint('obs', 'low')
    obs_high = config.getint('obs', 'high')

    env = gym.make(
        'simalphagarden-v0',
        wrapper_env=SimAlphaGardenWrapper(time_steps, garden_x, garden_y,
                                          sector_width, sector_height,
                                          num_plant_types, num_plants_per_type,
                                          step=step),
        garden_x=garden_x,
        garden_y=garden_y,
        garden_z=garden_z,
        sector_width=sector_width,
        sector_height=sector_height,
        action_low=action_low,
        action_high=action_high,
        obs_low=obs_low,
        obs_high=obs_high,
    )
    env = DummyVecEnv([lambda: env])
    # TODO: Normalize input features? VecNormalize
    env = VecCheckNan(env, raise_exception=False)

    if is_baseline:
        copyfile('gym_config/config.ini', folder_path + '/config.ini')

        # Evaluate baseline on 50 random environments of same parameters.
        self.evaluate_policy(folder_path, num_evals, env, garden_x, garden_y,
                             sector_width, sector_height, is_baseline=True,
                             baseline_policy=baseline_policy, step=1)

        # Graph evaluations
        self.graph_utils.graph_evaluations(folder_path, garden_x, garden_y,
                                           time_steps, step, num_evals,
                                           num_plant_types)
    else:
        pathlib.Path(folder_path + '/ppo_v2_tensorboard').mkdir(parents=True, exist_ok=True)

        # Instantiate the agent
        model = PPO2(CustomCnnPolicy, env,
                     policy_kwargs=policy_kwargs,
                     ent_coef=ent_coef,
                     n_steps=n_steps,
                     nminibatches=nminibatches,
                     noptepochs=noptepochs,
                     learning_rate=learning_rate,
                     verbose=1,
                     tensorboard_log=folder_path + '/ppo_v2_tensorboard/')
        # model = PPO2(MlpPolicy, env, ent_coef=ent_coef, n_steps=n_steps, nminibatches=nminibatches, noptepochs=noptepochs, learning_rate=learning_rate, verbose=1, tensorboard_log=folder_path + '/ppo_v2_tensorboard/')

        # Train the agent
        model.learn(total_timesteps=rl_time_steps)  # this will crash explaining that the invalid value originated from the env
        model.save(folder_path + '/model')

        copyfile('gym_config/config.ini', folder_path + '/config.ini')

        # Evaluate model on 50 random environments of same parameters.
        self.evaluate_policy(folder_path, num_evals, env, garden_x, garden_y,
                             sector_width, sector_height, is_baseline=False)

        # Graph evaluations
        # self.graph_utils.graph_evaluations(folder_path, garden_x, garden_y, time_steps, step, num_evals, num_plant_types)

    profiler_object.disable()

    # dump the profiler stats
    s = io.StringIO()
    ps = pstats.Stats(profiler_object, stream=s).sort_stats('cumulative')
    pathlib.Path(folder_path + '/Timings').mkdir(parents=True, exist_ok=True)
    ps.dump_stats(folder_path + '/Timings/dump.txt')

    # convert to human readable format
    out_stream = open(folder_path + '/Timings/time.txt', 'w')
    ps = pstats.Stats(folder_path + '/Timings/dump.txt', stream=out_stream)
    ps.strip_dirs().sort_stats('cumulative').print_stats()
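The example above passes raise_exception=False, while most of the other snippets use raise_exception=True. For contrast, a short sketch of the wrapper's options as they exist in stable-baselines:

# VecCheckNan options in stable-baselines:
#   raise_exception=False (default): emit a warning when NaN/Inf appears
#   raise_exception=True: raise a ValueError reporting whether the invalid
#       value came from the actions or from the environment
#   warn_once=True (default): warn only on the first occurrence
#   check_inf=True (default): treat Inf like NaN
env = VecCheckNan(env, raise_exception=True, warn_once=False, check_inf=True)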