def test_sync_vec_normalize():
    env = DummyVecEnv([make_env])

    assert unwrap_vec_normalize(env) is None

    env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10.0, clip_reward=10.0)

    assert isinstance(unwrap_vec_normalize(env), VecNormalize)

    env = VecFrameStack(env, 1)

    assert isinstance(unwrap_vec_normalize(env), VecNormalize)

    eval_env = DummyVecEnv([make_env])
    eval_env = VecNormalize(eval_env, training=False, norm_obs=True, norm_reward=True,
                            clip_obs=10.0, clip_reward=10.0)
    eval_env = VecFrameStack(eval_env, 1)

    env.reset()
    # Initialize running mean
    for _ in range(100):
        env.step([env.action_space.sample()])

    obs = env.reset()
    original_obs = env.get_original_obs()
    dummy_rewards = np.random.rand(10)

    # Normalization must be different
    assert not np.allclose(obs, eval_env.normalize_obs(original_obs))

    sync_envs_normalization(env, eval_env)

    # Now they must be synced
    assert np.allclose(obs, eval_env.normalize_obs(original_obs))
    assert np.allclose(env.normalize_reward(dummy_rewards), eval_env.normalize_reward(dummy_rewards))
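# `make_env` is not defined in the snippet above. It is assumed to be a simple
# factory returning a fresh (optionally Monitor-wrapped) gym env; the env id
# below is illustrative only:

import gym
from stable_baselines3.common.monitor import Monitor

def make_env():
    return Monitor(gym.make("Pendulum-v1"))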
def main(args):
    wandb.init(project=args.project_name, name=args.run_name)

    n_envs = len(os.sched_getaffinity(0))
    factory = EnvFactory(args.env)

    render_env = factory.make_env()  # for rendering

    callback = CallbackList([])

    # Wrap the environment in a parallel-processing-friendly wrapper, unless debug is on
    if args.debug:
        envs = DummyVecEnv([factory.make_env for _ in range(n_envs)])
    else:
        envs = SubprocVecEnv([factory.make_env for _ in range(n_envs)])

    if args.stats_path is None:
        envs = VecNormalize(envs, norm_obs=True, clip_obs=np.inf, norm_reward=False, clip_reward=np.inf)
    else:
        envs = VecNormalize.load(args.stats_path, envs)

    eval_callback = WAndBEvalCallback(render_env, args.eval_every, envs)
    callback.callbacks.append(eval_callback)

    print("Do random explorations to build running averages")
    envs.reset()
    for _ in tqdm(range(1000)):
        random_action = np.stack([envs.action_space.sample() for _ in range(n_envs)])
        envs.step(random_action)
    envs.training = False  # freeze the running averages (what a terrible variable name...)

    # We use PPO by default, but it should be easy to swap out for other algorithms.
    if args.pretrained_path is not None:
        pretrained_path = args.pretrained_path
        learner = PPO.load(pretrained_path, envs, device=args.device)
        learner.learn(total_timesteps=args.total_timesteps, callback=callback)
    else:
        policy_kwargs = dict(
            activation_fn=nn.ReLU,
            net_arch=[dict(vf=args.value_dims, pi=args.policy_dims)],
            log_std_init=args.log_std_init,
            squash_output=False)
        learner = PPO(MlpPolicy, envs, n_steps=args.n_steps, verbose=1,
                      policy_kwargs=policy_kwargs, device=args.device, target_kl=2e-2)
        if args.device == 'cpu':
            torch.cuda.empty_cache()
        learner.learn(total_timesteps=args.total_timesteps, callback=callback)

    render_env.close()
    envs.close()
def main(args):
    expert = None
    expert_state_dim = 0
    if args.policy_path is not None:
        policy_path = args.policy_path
        expert = PPO.load(policy_path)
        expert_state_dim = expert.observation_space.shape[0]

    factory = EnvFactory(args.env)
    env = DummyVecEnv([factory.make_env])
    if args.stats_path is not None:
        env = VecNormalize.load(args.stats_path, env)
        env.training = False
    else:
        env = VecNormalize(env, training=False)

    obs = env.reset()
    env.render()
    total_reward = 0
    while True:
        if expert is None:
            action = env.action_space.sample()
            action = np.zeros_like(action)
        else:
            good_obs = obs[:, :expert_state_dim]
            action, _ = expert.predict(good_obs, deterministic=True)
        obs, reward, done, info = env.step(action)
        env.render()
        reward = env.get_original_reward()
        total_reward += reward[0]
        if done:
            print("Total reward: {:.3f}".format(total_reward))
            obs = env.reset()
            total_reward = 0
def record_video_example():
    # Record a video.
    env_id = "CartPole-v1"
    video_folder = "logs/videos/"
    video_length = 100

    env = DummyVecEnv([lambda: gym.make(env_id)])
    obs = env.reset()

    # Record the video starting at the first step
    env = VecVideoRecorder(env, video_folder,
                           record_video_trigger=lambda x: x == 0,
                           video_length=video_length,
                           name_prefix=f"random-agent-{env_id}")

    env.reset()
    for _ in range(video_length + 1):
        action = [env.action_space.sample()]
        obs, _, _, _ = env.step(action)
    # Save the video
    env.close()
def test_sync_vec_normalize(make_env):
    env = DummyVecEnv([make_env])

    assert unwrap_vec_normalize(env) is None

    env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=100.0, clip_reward=100.0)

    assert isinstance(unwrap_vec_normalize(env), VecNormalize)

    if not isinstance(env.observation_space, spaces.Dict):
        env = VecFrameStack(env, 1)
        assert isinstance(unwrap_vec_normalize(env), VecNormalize)

    eval_env = DummyVecEnv([make_env])
    eval_env = VecNormalize(eval_env, training=False, norm_obs=True, norm_reward=True,
                            clip_obs=100.0, clip_reward=100.0)

    if not isinstance(env.observation_space, spaces.Dict):
        eval_env = VecFrameStack(eval_env, 1)

    env.seed(0)
    env.action_space.seed(0)

    env.reset()
    # Initialize running mean
    latest_reward = None
    for _ in range(100):
        _, latest_reward, _, _ = env.step([env.action_space.sample()])

    # Check that unnormalized reward is same as original reward
    original_latest_reward = env.get_original_reward()
    assert np.allclose(original_latest_reward, env.unnormalize_reward(latest_reward))

    obs = env.reset()
    dummy_rewards = np.random.rand(10)
    original_obs = env.get_original_obs()

    # Check that unnormalization works
    assert allclose(original_obs, env.unnormalize_obs(obs))
    # Normalization must be different (between different environments)
    assert not allclose(obs, eval_env.normalize_obs(original_obs))

    # Test syncing of parameters
    sync_envs_normalization(env, eval_env)

    # Now they must be synced
    assert allclose(obs, eval_env.normalize_obs(original_obs))
    assert allclose(env.normalize_reward(dummy_rewards), eval_env.normalize_reward(dummy_rewards))
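# The test above calls an `allclose` helper that is not defined in this snippet.
# A minimal sketch of what it could look like, assuming it only needs to
# generalize np.allclose to dict observations with matching keys:

import numpy as np

def allclose(obs_1, obs_2):
    """np.allclose() that also accepts dict observations."""
    if isinstance(obs_1, dict):
        return all(np.allclose(obs_1[key], obs_2[key]) for key in obs_1.keys())
    return np.allclose(obs_1, obs_2)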
def _make_warmstart(env_fn, **kwargs):
    """Warm-start VecNormalize by stepping through 100 actions."""
    venv = DummyVecEnv([env_fn])
    venv = VecNormalize(venv, **kwargs)
    venv.reset()
    venv.get_original_obs()

    for _ in range(100):
        actions = [venv.action_space.sample()]
        venv.step(actions)
    return venv
def _make_warmstart_cartpole():
    """Warm-start VecNormalize by stepping through CartPole"""
    venv = DummyVecEnv([lambda: gym.make("CartPole-v1")])
    venv = VecNormalize(venv)
    venv.reset()
    venv.get_original_obs()

    for _ in range(100):
        actions = [venv.action_space.sample()]
        venv.step(actions)
    return venv
def _make_warmstart_dict_env():
    """Warm-start VecNormalize by stepping through BitFlippingEnv"""
    venv = DummyVecEnv([make_dict_env])
    venv = VecNormalize(venv)
    venv.reset()
    venv.get_original_obs()

    for _ in range(100):
        actions = [venv.action_space.sample()]
        venv.step(actions)
    return venv
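# `make_dict_env` is referenced but not defined here. Going by the docstring
# ("stepping through BitFlippingEnv"), a plausible factory is SB3's built-in
# BitFlippingEnv, which exposes a Dict observation space; the exact kwargs are
# an assumption:

from stable_baselines3.common.envs import BitFlippingEnv

def make_dict_env():
    """Return an env with a Dict observation space for VecNormalize tests."""
    return BitFlippingEnv(n_bits=10, continuous=True)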
def test_obs_rms_vec_normalize():
    env_fns = [lambda: DummyRewardEnv(0), lambda: DummyRewardEnv(1)]
    env = DummyVecEnv(env_fns)
    env = VecNormalize(env)
    env.reset()
    assert np.allclose(env.obs_rms.mean, 0.5, atol=1e-4)
    assert np.allclose(env.ret_rms.mean, 0.0, atol=1e-4)
    env.step([env.action_space.sample() for _ in range(len(env_fns))])
    assert np.allclose(env.obs_rms.mean, 1.25, atol=1e-4)
    assert np.allclose(env.ret_rms.mean, 2, atol=1e-4)

    # Check convergence to true mean
    for _ in range(3000):
        env.step([env.action_space.sample() for _ in range(len(env_fns))])
    assert np.allclose(env.obs_rms.mean, 2.0, atol=1e-3)
    assert np.allclose(env.ret_rms.mean, 5.688, atol=1e-3)
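# `DummyRewardEnv` is not shown in this snippet. Below is a reconstruction that
# is consistent with the asserted statistics (reset obs of envs 0/1 are 0 and 1,
# giving an initial mean of 0.5; after one step the obs are 1 and 3, giving
# 1.25; the long-run obs mean is 2.0). Treat the exact reward sequence
# [0, 1, 3, 4] as an assumption:

import gym
import numpy as np

class DummyRewardEnv(gym.Env):
    """Env whose observation equals its reward, cycling through fixed values."""

    def __init__(self, return_reward_idx=0):
        self.action_space = gym.spaces.Discrete(2)
        self.observation_space = gym.spaces.Box(low=np.array([-1.0]), high=np.array([1.0]))
        self.returned_rewards = [0, 1, 3, 4]
        self.return_reward_idx = return_reward_idx
        self.t = self.return_reward_idx

    def step(self, action):
        self.t += 1
        returned_value = self.returned_rewards[self.t % len(self.returned_rewards)]
        done = self.t == len(self.returned_rewards)
        return np.array([returned_value]), returned_value, done, {}

    def reset(self):
        self.t = self.return_reward_idx
        return np.array([self.returned_rewards[self.return_reward_idx]])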
def DRL_prediction(df, model, name, last_state, iter_num, unique_trade_date,
                   rebalance_window, turbulence_threshold, initial):
    ### make a prediction based on trained model ###

    ## trading env
    trade_data = data_split(df,
                            start=unique_trade_date[iter_num - rebalance_window],
                            end=unique_trade_date[iter_num])
    env_trade = DummyVecEnv([lambda: StockEnvTrade(trade_data,
                                                   turbulence_threshold=turbulence_threshold,
                                                   initial=initial,
                                                   previous_state=last_state,
                                                   model_name=name,
                                                   iteration=iter_num)])
    obs_trade = env_trade.reset()

    for i in range(len(trade_data.index.unique())):
        action, _states = model.predict(obs_trade)
        obs_trade, rewards, dones, info = env_trade.step(action)
        if i == (len(trade_data.index.unique()) - 2):
            # print(env_test.render())
            last_state = env_trade.render()

    df_last_state = pd.DataFrame({'last_state': last_state})
    df_last_state.to_csv('results/last_state_{}_{}.csv'.format(name, i), index=False)
    return last_state
def record_video(env_id, model, video_length=500, prefix='', video_folder='videos/'):
    """
    :param env_id: (str)
    :param model: (RL model)
    :param video_length: (int)
    :param prefix: (str)
    :param video_folder: (str)
    """
    eval_env = DummyVecEnv([lambda: gym.make(env_id)])
    # Start the video at step=0 and record `video_length` steps
    eval_env = VecVideoRecorder(eval_env, video_folder=video_folder,
                                record_video_trigger=lambda step: step == 0,
                                video_length=video_length,
                                name_prefix=prefix)

    obs = eval_env.reset()
    for _ in range(video_length):
        action, _ = model.predict(obs)
        obs, _, _, _ = eval_env.step(action)

    # Close the video recorder
    eval_env.close()
def eval_100_trials(args):
    with open(args.config) as fp:
        json_data = json.load(fp)
    config = GameConfig.deserialize(json_data)
    config.agents_config[args.agent]["save_path"] += "_vs_time_pt.zip"

    env = DummyVecEnv([lambda: retro.make(config.game_name, state=config.eval_state[1])])
    agent = AgentLoader.get_agent(args.agent, config.agents_config, env, load=True)

    rew_list = []
    trials = 100
    for i in tqdm(range(trials)):
        obs = env.reset()
        done = False
        reward = 0
        while not done:
            actions, _ = agent.agent.predict(obs)
            obs, rew, done, info = env.step(actions)
            reward += rew
        rew_list.append(reward)
    env.close()

    count = sum(i > 0 for i in rew_list)
    print("win percentage = {}%".format(count / trials * 100))
def get_sb_env(self):
    def get_self():
        return deepcopy(self)  # requires `from copy import deepcopy`

    e = DummyVecEnv([get_self])
    obs = e.reset()
    return e, obs
def test_predict(model_class, env_id, device):
    if device == "cuda" and not th.cuda.is_available():
        pytest.skip("CUDA not available")

    # Skip invalid algorithm/action-space combinations
    if env_id == "CartPole-v1":
        if model_class in [SAC, TD3]:
            return
    elif model_class in [DQN]:
        return

    # Test detection of different shapes by the predict method
    model = model_class("MlpPolicy", env_id, device=device)
    # Check that the policy is on the right device
    assert get_device(device).type == model.policy.device.type

    env = gym.make(env_id)
    vec_env = DummyVecEnv([lambda: gym.make(env_id), lambda: gym.make(env_id)])

    obs = env.reset()
    action, _ = model.predict(obs)
    assert action.shape == env.action_space.shape
    assert env.action_space.contains(action)

    vec_env_obs = vec_env.reset()
    action, _ = model.predict(vec_env_obs)
    assert action.shape[0] == vec_env_obs.shape[0]
def test(seed, model_filename, vec_filename, train, test, body_info=0, render=False):
    print("Testing:")
    print(f"  Seed {seed}, model {model_filename} vec {vec_filename}")
    print(f"  Train on {train}, test on {test}, w/ bodyinfo {body_info}")
    eval_env = utils.make_env(render=render, robot_body=test, body_info=body_info)
    eval_env = DummyVecEnv([eval_env])
    eval_env = VecNormalize.load(vec_filename, eval_env)
    eval_env.norm_reward = False

    eval_env.seed(seed)
    model = PPO.load(model_filename)

    obs = eval_env.reset()
    if render:
        eval_env.env_method("set_view")
    distance_x = 0
    # print(obs)
    total_reward = 0
    for step in range(1000):
        action, _states = model.predict(obs, deterministic=True)
        obs, reward, done, info = eval_env.step(action)
        if done:
            break
        else:
            # the last observation will be after reset, so skip the last
            distance_x = eval_env.envs[0].robot.body_xyz[0]
        total_reward += reward[0]
        if render:
            time.sleep(0.01)

    eval_env.close()
    print(f"train {train}, test {test}, body_info {body_info}, step {step}, "
          f"total_reward {total_reward}, distance_x {distance_x}")
    return total_reward, distance_x
def record_video(env_id, model, video_length=500, prefix='', video_folder='videos'):
    """
    :param env_id: (str)
    :param model: (RL model)
    :param video_length: (int)
    :param prefix: (str)
    :param video_folder: (str)
    """
    eval_env = DummyVecEnv([make_env(env_id, i, log_dir=_log_dir) for i in range(1)])
    # eval_env = gym.make(env_id)
    # Note: the original assigned this to `val_env`, discarding the normalized env
    eval_env = VecNormalize.load(_log_dir + 'vec_normalize_5734400.pkl', eval_env)

    # Start the video at step=0 and record `video_length` steps
    eval_env = VecVideoRecorder(eval_env, video_folder='tmp',
                                record_video_trigger=lambda step: step == 0,
                                video_length=video_length,
                                name_prefix=prefix)

    obs = eval_env.reset()
    for i in range(video_length):
        action, _ = model.predict(obs)
        obs, _, _, _ = eval_env.step(action)

    # Close the video recorder
    eval_env.close()
def eval_time(args):
    with open(args.config) as fp:
        json_data = json.load(fp)
    video_path = os.path.join("./videos", args.agent)
    config = GameConfig.deserialize(json_data)
    config.agents_config[args.agent]["save_path"] += "_vs_time_pt_check.zip"

    env = DummyVecEnv([lambda: retro.make(config.game_name, state=config.eval_state[1])])
    agent = AgentLoader.get_agent(args.agent, config.agents_config, env, load=True)
    env.close()

    env = DummyVecEnv([
        lambda: retro.make(config.game_name, state=config.eval_state[1], record=video_path)
    ])
    obs = env.reset()
    done = False
    while not done:
        actions, _ = agent.agent.predict(obs)
        obs, rew, done, info = env.step(actions)
        # env.render()
    env.close()
def DRL_prediction(self, model, name, last_state, iter_num, turbulence_threshold, initial):
    ### make a prediction based on trained model ###

    ## trading env
    trade_data = data_split(self.df,
                            start=self.unique_trade_date[iter_num - self.rebalance_window],
                            end=self.unique_trade_date[iter_num])
    trade_env = DummyVecEnv([lambda: StockTradingEnv(trade_data,
                                                     self.stock_dim,
                                                     self.hmax,
                                                     self.initial_amount,
                                                     self.buy_cost_pct,
                                                     self.sell_cost_pct,
                                                     self.reward_scaling,
                                                     self.state_space,
                                                     self.action_space,
                                                     self.tech_indicator_list,
                                                     turbulence_threshold=turbulence_threshold,
                                                     initial=initial,
                                                     previous_state=last_state,
                                                     model_name=name,
                                                     mode='trade',
                                                     iteration=iter_num,
                                                     print_verbosity=self.print_verbosity)])
    trade_obs = trade_env.reset()

    for i in range(len(trade_data.index.unique())):
        action, _states = model.predict(trade_obs)
        trade_obs, rewards, dones, info = trade_env.step(action)
        if i == (len(trade_data.index.unique()) - 2):
            # print(env_test.render())
            last_state = trade_env.render()

    df_last_state = pd.DataFrame({'last_state': last_state})
    df_last_state.to_csv('results/last_state_{}_{}.csv'.format(name, i), index=False)
    return last_state
def main(config, agent):
    with open(config) as fp:
        json_data = json.load(fp)
    video_path = os.path.join("./videos", agent, "pong")
    config = GameConfig.deserialize(json_data)
    # Note: the original referenced a global `args.agent`; the `agent` parameter is used instead
    config.agents_config[agent]["save_path"] += "best_model.zip"
    # config.agents_config[agent]["save_path"] = "my_models/pong/pong_ppo/best_model.zip"
    print(config.agents_config[agent]["save_path"])

    # env = retro.make(config.game_name)
    env = gym.make("PongNoFrameskip-v4")
    agent = AgentLoader.get_agent(agent, config.agents_config, env, load=True)
    env.close()

    env = gym.make("PongNoFrameskip-v4")
    env = DummyVecEnv([lambda: env])
    # env = retro.make(config.game_name, record=video_path)
    env = VecVideoRecorder(
        env,
        video_path,
        record_video_trigger=lambda x: x == 0,
    )

    obs = env.reset()
    done = False
    while not done:
        actions, _ = agent.agent.predict(obs)
        obs, rew, done, info = env.step(actions)
    env.close()
def train_stable_baselines(submodule, flags):
    """Train policies using the PPO algorithm in stable-baselines."""
    from stable_baselines3.common.vec_env import DummyVecEnv

    flow_params = submodule.flow_params
    # Path to the saved files
    exp_tag = flow_params['exp_tag']
    result_name = '{}/{}'.format(exp_tag, strftime("%Y-%m-%d-%H:%M:%S"))

    # Perform training.
    start_time = timeit.default_timer()
    # print experiment.json information
    print("=========================================")
    print('Beginning training.')
    print('Algorithm :', flags.algorithm)
    model = run_model_stablebaseline(flow_params, flags.num_cpus,
                                     flags.rollout_size, flags.num_steps,
                                     flags.algorithm, flags.exp_config)
    stop_time = timeit.default_timer()
    run_time = stop_time - start_time
    print("Training is Finished")
    print("total runtime: ", run_time)

    # Save the model to a desired folder and then delete it to demonstrate
    # loading.
    print('Saving the trained model!')
    path = os.path.realpath(os.path.expanduser('~/baseline_results'))
    ensure_dir(path)
    save_path = os.path.join(path, result_name)
    model.save(save_path)

    # dump the flow params
    with open(os.path.join(path, result_name) + '.json', 'w') as outfile:
        json.dump(flow_params, outfile, cls=FlowParamsEncoder, sort_keys=True, indent=4)

    # Replay the result by loading the model
    print('Loading the trained model and testing it out!')
    if flags.exp_config.lower() == "ppo":
        from stable_baselines3 import PPO
        model = PPO.load(save_path)
    elif flags.exp_config.lower() == "ddpg":
        from stable_baselines3 import DDPG
        model = DDPG.load(save_path)
    flow_params = get_flow_params(os.path.join(path, result_name) + '.json')
    flow_params['sim'].render = True
    env = env_constructor(params=flow_params, version=0)()

    # The algorithms require a vectorized environment to run
    eval_env = DummyVecEnv([lambda: env])
    obs = eval_env.reset()
    reward = 0
    for _ in range(flow_params['env'].horizon):
        action, _states = model.predict(obs)
        obs, rewards, dones, info = eval_env.step(action)
        reward += rewards
    print('the final reward is {}'.format(reward))
def main():
    # multiprocess environment
    # n_cpu = 8
    # env = SubprocVecEnv([lambda: gym.make('DYROSTocabi-v1') for i in range(n_cpu)])
    # env = VecNormalize(env, norm_obs=True, clip_obs=2.0, norm_reward=False, training=True)

    n_cpu = 1
    env = gym.make('DYROSTocabi-v1')
    env = DummyVecEnv([lambda: env])
    env = VecNormalize(env, norm_obs=True, clip_obs=2.0, norm_reward=False, training=True)

    model = PPO('MlpPolicy', env, verbose=1, n_steps=int(4096 / n_cpu), wandb_use=False)
    model.learn(total_timesteps=40000000)

    file_name = "ppo2_DYROSTocabi_" + str(datetime.datetime.now())
    model.save(file_name)
    env.save(file_name + "_env.pkl")

    model.policy.to("cpu")
    for name, param in model.policy.state_dict().items():
        weight_file_name = "./result/" + name + ".txt"
        np.savetxt(weight_file_name, param.data)

    np.savetxt("./result/obs_mean.txt", env.obs_rms.mean)
    np.savetxt("./result/obs_variance.txt", env.obs_rms.var)

    del model  # remove to demonstrate saving and loading
    del env

    # file_name = "ppo2_DYROSTocabi_2021-01-08 07:18:00.267089"

    env = gym.make('DYROSTocabi-v1')
    env = DummyVecEnv([lambda: env])
    env = VecNormalize.load(file_name + "_env.pkl", env)
    env.training = False

    model = PPO.load(file_name, env=env, wandb_use=False)

    # Enjoy trained agent
    obs = np.copy(env.reset())
    epi_reward = 0
    while True:
        action, _states = model.predict(obs, deterministic=True)
        obs, rewards, dones, info = env.step(action)
        env.render()
        epi_reward += rewards
        if dones:
            print("Episode Reward: ", epi_reward)
            epi_reward = 0
def test_discrete(model_class, env):
    env = DummyVecEnv([lambda: env])
    model = model_class('MlpPolicy', env, gamma=0.5, seed=1).learn(3000)

    evaluate_policy(model, env, n_eval_episodes=20, reward_threshold=90)
    obs = env.reset()

    assert np.shape(model.predict(obs)[0]) == np.shape(obs)
def train_stable_baselines3(submodule, flags):
    """Train policies using the PPO algorithm in stable-baselines3."""
    from stable_baselines3.common.vec_env import DummyVecEnv
    from stable_baselines3 import PPO
    import torch

    start_time = timeit.default_timer()
    flow_params = submodule.flow_params
    # Path to the saved files
    exp_tag = flow_params['exp_tag']
    result_name = '{}/{}'.format(exp_tag, strftime("%Y-%m-%d-%H:%M:%S"))

    # Perform training.
    print("cuda is available: ", torch.cuda.is_available())
    print('Beginning training.')
    print("==========================================")
    model = run_model_stablebaseline3(flow_params, flags.num_cpus,
                                      flags.rollout_size, flags.num_steps)

    # Save the model to a desired folder and then delete it to demonstrate
    # loading.
    print('Saving the trained model!')
    path = os.path.realpath(os.path.expanduser('~/baseline_results'))
    ensure_dir(path)
    save_path = os.path.join(path, result_name)
    model.save(save_path)

    # dump the flow params
    # check time for choose GPU and CPU
    stop_time = timeit.default_timer()
    run_time = stop_time - start_time
    with open(os.path.join(path, result_name) + '.json', 'w') as outfile:
        json.dump(flow_params, outfile, cls=FlowParamsEncoder, sort_keys=True, indent=4)

    # Replay the result by loading the model
    print('Loading the trained model and testing it out!')
    # Note: `load` is a classmethod returning a new model; the original called
    # `model.load(save_path)` and discarded the result
    model = PPO.load(save_path)
    flow_params = get_flow_params(os.path.join(path, result_name) + '.json')
    flow_params['sim'].render = False
    flow_params['env'].horizon = 1500  # 150 seconds of operation
    env = env_constructor(params=flow_params, version=0)()

    # The algorithms require a vectorized environment to run
    eval_env = DummyVecEnv([lambda: env])
    obs = eval_env.reset()
    reward = 0
    for _ in range(flow_params['env'].horizon):
        action, _states = model.predict(obs)
        obs, rewards, dones, info = eval_env.step(action)
        reward += rewards
    print("--------------------------------------------------------")
    flow_params['sim'].render = True
    simulation = Experiment(flow_params)
    simulation.run(num_runs=1)
    print('the final reward is {}'.format(reward))
    print("total run_time:", run_time, "s")
def play(env_name, load_file, total_timesteps):
    env = DummyVecEnv([lambda: gym.make(env_name)])
    model = PPO.load(load_file, verbose=1)

    obs = env.reset()
    for i in range(total_timesteps):
        action, _states = model.predict(obs)
        obs, reward, done, info = env.step(action)
        # env.render()  # dummy
        if done:
            print(info[0]['episode'])

    del model
    env.close()
def test(test_n, seed, model_filename, vec_filename, train, test,
         test_as_class=0, render=False, save_file="default.yml"):
    print("Testing:")
    total_rewards = []
    distance_xs = []
    for i in range(test_n):
        print(f"  Seed {seed + i}, model {model_filename} vec {vec_filename}")
        print(f"  Train on {train}, test on {test}, w/ bodyinfo {test_as_class}")
        eval_env = utils.make_env(render=render, wrapper=None, robot_body=test, body_info=test_as_class)
        eval_env = DummyVecEnv([eval_env])
        eval_env = VecNormalize.load(vec_filename, eval_env)
        eval_env.norm_reward = False

        eval_env.seed(seed + i)
        model = PPO.load(model_filename)

        obs = eval_env.reset()
        if render:
            eval_env.env_method("set_view")
        distance_x = 0
        # print(obs)
        total_reward = 0
        for step in range(1000):
            action, _states = model.predict(obs, deterministic=True)
            obs, reward, done, info = eval_env.step(action)
            if done:
                break
            else:
                # the last observation will be after reset, so skip the last
                distance_x = eval_env.envs[0].robot.body_xyz[0]
            total_reward += reward[0]
            if render:
                time.sleep(0.01)

        eval_env.close()
        print(f"train {train}, test {test}, test_as_class {test_as_class}, step {step}, "
              f"total_reward {total_reward}, distance_x {distance_x}")
        total_rewards.append(total_reward)
        distance_xs.append(distance_x)

    # avoid yaml turning float64 into a numpy array
    total_rewards = [float(x) for x in total_rewards]
    distance_xs = [float(x) for x in distance_xs]

    data = {
        "title": "test",
        "train": train,
        "test": test,
        "total_reward": total_rewards,
        "distance_x": distance_xs,
    }
    with open(f"{save_file}", "w") as f:
        yaml.dump(data, f)
def test_check_nan():
    """Test VecCheckNan Object"""
    env = DummyVecEnv([NanAndInfEnv])
    env = VecCheckNan(env, raise_exception=True)

    env.step([[0]])

    with pytest.raises(ValueError):
        env.step([[float('NaN')]])

    with pytest.raises(ValueError):
        env.step([[float('inf')]])

    with pytest.raises(ValueError):
        env.step([[-1]])

    with pytest.raises(ValueError):
        env.step([[1]])

    env.step(np.array([[0, 1], [0, 1]]))

    env.reset()
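# `NanAndInfEnv` is not shown here. A sketch that matches the behavior the test
# expects (action 0 is fine, negative actions produce an inf observation,
# positive actions produce a NaN observation, and NaN/inf actions are caught by
# VecCheckNan itself before stepping); the exact details are an assumption:

import numpy as np
import gym
from gym import spaces

class NanAndInfEnv(gym.Env):
    """Environment that emits NaN/inf observations depending on the action."""

    def __init__(self):
        super().__init__()
        self.action_space = spaces.Box(low=-np.inf, high=np.inf, shape=(1,), dtype=np.float64)
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(1,), dtype=np.float64)

    @staticmethod
    def step(action):
        if all(np.array(action) > 0):
            obs = float("NaN")
        elif all(np.array(action) < 0):
            obs = float("inf")
        else:
            obs = 0.0
        return [obs], 0.0, False, {}

    @staticmethod
    def reset():
        return [0.0]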
def random_train_model():
    import gym
    import datetime as dt
    import random  # used below; missing from the original imports
    import matplotlib.pyplot as plt
    from stable_baselines3 import PPO
    from stable_baselines3.common.vec_env import DummyVecEnv
    import pandas as pd
    from lutils.stock import LTdxHq
    import tushare as ts

    pro = ts.pro_api()
    stock_codes = pro.stock_basic(exchange='', list_status='L',
                                  fields='ts_code,symbol,name,area,industry,list_date')

    env = DummyVecEnv([lambda: LStockDailyEnv()])
    # model = PPO('MlpPolicy', env, verbose=1)
    model = PPO.load('ppo_stock')
    model.set_env(env)
    for i in range(10):
        code = random.choice(stock_codes['ts_code'])[:-3]
        print('load data: %s' % code)
        ltdxhq = LTdxHq()
        df = ltdxhq.get_k_data_1min(code)  # 000032 300142 603636 600519
        ltdxhq.close()
        df = df[:-240]

        env.set_attr('df', df)
        env.reset()

        model.learn(20000)

    model.save('ppo_stock')
def create_env_trading(self, env_class, data, turbulence_threshold=150):
    env_trade = DummyVecEnv([
        lambda: env_class(df=data,
                          stock_dim=self.stock_dim,
                          hmax=self.hmax,
                          initial_amount=self.initial_amount,
                          transaction_cost_pct=self.transaction_cost_pct,
                          reward_scaling=self.reward_scaling,
                          state_space=self.state_space,
                          action_space=self.action_space,
                          tech_indicator_list=self.tech_indicator_list,
                          turbulence_threshold=turbulence_threshold)
    ])
    obs_trade = env_trade.reset()
    return env_trade, obs_trade
def record_video(env_name, train_env, model, videoLength=500, prefix='', videoPath='videos/'):
    print('record_video function')
    # Wrap the env in a Vec Video Recorder
    local_eval_env = DummyVecEnv([make_env(env_name, i, log_dir=log_dir) for i in range(num_cpu)])
    local_eval_env = VecNormalize(local_eval_env, norm_obs=True, norm_reward=True, clip_obs=10.)
    sync_envs_normalization(train_env, local_eval_env)

    local_eval_env = VecVideoRecorder(local_eval_env, video_folder=videoPath,
                                      record_video_trigger=lambda step: step == 0,
                                      video_length=videoLength,
                                      name_prefix=prefix)

    obs = local_eval_env.reset()
    for _ in range(videoLength):
        action, _ = model.predict(obs)
        obs, _, _, _ = local_eval_env.step(action)

    # Close the video recorder
    local_eval_env.close()
def test_predict(model_class, env_id):
    if env_id == 'CartPole-v1' and model_class not in [PPO, A2C]:
        return

    # test detection of different shapes by the predict method
    model = model_class('MlpPolicy', env_id)
    env = gym.make(env_id)
    vec_env = DummyVecEnv([lambda: gym.make(env_id), lambda: gym.make(env_id)])

    obs = env.reset()
    action, _ = model.predict(obs)
    assert action.shape == env.action_space.shape
    assert env.action_space.contains(action)

    vec_env_obs = vec_env.reset()
    action, _ = model.predict(vec_env_obs)
    assert action.shape[0] == vec_env_obs.shape[0]