import numpy as np
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize


def main(args):
    # Optionally load a pretrained expert policy
    expert = None
    expert_state_dim = 0
    if args.policy_path is not None:
        policy_path = args.policy_path
        expert = PPO.load(policy_path)
        expert_state_dim = expert.observation_space.shape[0]

    # EnvFactory is assumed to be provided by the surrounding project
    factory = EnvFactory(args.env)
    env = DummyVecEnv([factory.make_env])
    if args.stats_path is not None:
        env = VecNormalize.load(args.stats_path, env)
        env.training = False
    else:
        env = VecNormalize(env, training=False)

    obs = env.reset()
    env.render()
    total_reward = 0
    while True:
        if expert is None:
            # No expert available: feed a batched zero action to the vec env
            action = np.zeros((env.num_envs,) + env.action_space.shape)
        else:
            # The expert may have been trained on a prefix of the observation
            good_obs = obs[:, :expert_state_dim]
            action, _ = expert.predict(good_obs, deterministic=True)
        obs, reward, done, info = env.step(action)
        env.render()
        # Accumulate the unnormalized reward, not the VecNormalize-scaled one
        reward = env.get_original_reward()
        total_reward += reward[0]
        if done:
            print("Total reward: {:.3f}".format(total_reward))
            obs = env.reset()
            total_reward = 0
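# Hedged sketch of a matching command-line entry point: the flag names below
# simply mirror the attributes main() reads (args.env, args.policy_path,
# args.stats_path) and are assumptions, not the original CLI.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--env", type=str, required=True)
    parser.add_argument("--policy-path", type=str, default=None)
    parser.add_argument("--stats-path", type=str, default=None)
    main(parser.parse_args())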
import datetime

import gym
import numpy as np
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize


def main():
    # multiprocess environment
    # n_cpu = 8
    # env = SubprocVecEnv([lambda: gym.make('DYROSTocabi-v1') for i in range(n_cpu)])
    # env = VecNormalize(env, norm_obs=True, clip_obs=2.0, norm_reward=False, training=True)

    n_cpu = 1
    env = gym.make('DYROSTocabi-v1')
    env = DummyVecEnv([lambda: env])
    env = VecNormalize(env, norm_obs=True, clip_obs=2.0, norm_reward=False, training=True)

    # wandb_use appears to be an argument of this project's customized PPO;
    # it is not a standard stable-baselines3 option
    model = PPO('MlpPolicy', env, verbose=1, n_steps=int(4096 / n_cpu), wandb_use=False)
    model.learn(total_timesteps=40000000)

    file_name = "ppo2_DYROSTocabi_" + str(datetime.datetime.now())
    model.save(file_name)
    env.save(file_name + "_env.pkl")

    # Export policy weights and observation-normalization statistics as text
    model.policy.to("cpu")
    for name, param in model.policy.state_dict().items():
        weight_file_name = "./result/" + name + ".txt"
        np.savetxt(weight_file_name, param.data)
    np.savetxt("./result/obs_mean.txt", env.obs_rms.mean)
    np.savetxt("./result/obs_variance.txt", env.obs_rms.var)

    del model  # remove to demonstrate saving and loading
    del env

    # file_name = "ppo2_DYROSTocabi_2021-01-08 07:18:00.267089"
    env = gym.make('DYROSTocabi-v1')
    env = DummyVecEnv([lambda: env])
    env = VecNormalize.load(file_name + "_env.pkl", env)
    env.training = False

    model = PPO.load(file_name, env=env, wandb_use=False)

    # Enjoy trained agent
    obs = np.copy(env.reset())
    epi_reward = 0
    while True:
        action, _states = model.predict(obs, deterministic=True)
        obs, rewards, dones, info = env.step(action)
        env.render()
        epi_reward += rewards
        if dones:
            print("Episode Reward: ", epi_reward)
            epi_reward = 0
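# Hedged sketch (not part of the original script): how the exported
# normalization statistics could be applied to raw observations outside of
# stable-baselines3, e.g. when the policy weights are consumed by another
# runtime. The clip value 2.0 mirrors clip_obs=2.0 above, and epsilon matches
# VecNormalize's default of 1e-8.
def normalize_obs_from_txt(raw_obs,
                           mean_path="./result/obs_mean.txt",
                           var_path="./result/obs_variance.txt",
                           epsilon=1e-8, clip_obs=2.0):
    obs_mean = np.loadtxt(mean_path)
    obs_var = np.loadtxt(var_path)
    # Same transform VecNormalize applies: clip((obs - mean) / sqrt(var + eps))
    return np.clip((raw_obs - obs_mean) / np.sqrt(obs_var + epsilon),
                   -clip_obs, clip_obs)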
def DRL_prediction(self, model, name, last_state, iter_num, turbulence_threshold, initial):
    ### make a prediction based on trained model ###

    ## trading env
    trade_data = data_split(self.df,
                            start=self.unique_trade_date[iter_num - self.rebalance_window],
                            end=self.unique_trade_date[iter_num])
    trade_env = DummyVecEnv([lambda: StockTradingEnv(trade_data,
                                                     self.stock_dim,
                                                     self.hmax,
                                                     self.initial_amount,
                                                     self.buy_cost_pct,
                                                     self.sell_cost_pct,
                                                     self.reward_scaling,
                                                     self.state_space,
                                                     self.action_space,
                                                     self.tech_indicator_list,
                                                     turbulence_threshold=turbulence_threshold,
                                                     initial=initial,
                                                     previous_state=last_state,
                                                     model_name=name,
                                                     mode='trade',
                                                     iteration=iter_num,
                                                     print_verbosity=self.print_verbosity)])
    trade_obs = trade_env.reset()

    for i in range(len(trade_data.index.unique())):
        action, _states = model.predict(trade_obs)
        trade_obs, rewards, dones, info = trade_env.step(action)
        if i == (len(trade_data.index.unique()) - 2):
            # print(env_test.render())
            last_state = trade_env.render()

    df_last_state = pd.DataFrame({'last_state': last_state})
    df_last_state.to_csv('results/last_state_{}_{}.csv'.format(name, i), index=False)
    return last_state
def DRL_prediction(df, model, name, last_state, iter_num, unique_trade_date,
                   rebalance_window, turbulence_threshold, initial):
    ### make a prediction based on trained model ###

    ## trading env
    trade_data = data_split(df,
                            start=unique_trade_date[iter_num - rebalance_window],
                            end=unique_trade_date[iter_num])
    env_trade = DummyVecEnv([lambda: StockEnvTrade(trade_data,
                                                   turbulence_threshold=turbulence_threshold,
                                                   initial=initial,
                                                   previous_state=last_state,
                                                   model_name=name,
                                                   iteration=iter_num)])
    obs_trade = env_trade.reset()

    for i in range(len(trade_data.index.unique())):
        action, _states = model.predict(obs_trade)
        obs_trade, rewards, dones, info = env_trade.step(action)
        if i == (len(trade_data.index.unique()) - 2):
            # print(env_test.render())
            last_state = env_trade.render()

    df_last_state = pd.DataFrame({'last_state': last_state})
    df_last_state.to_csv('results/last_state_{}_{}.csv'.format(name, i), index=False)
    return last_state
def random_train_test():
    import gym
    import datetime as dt
    import matplotlib.pyplot as plt
    from stable_baselines3 import PPO
    from stable_baselines3.common.vec_env import DummyVecEnv
    import pandas as pd
    from lutils.stock import LTdxHq

    # LStockDailyEnv and NEXT_OBSERVATION_SIZE come from the surrounding module
    ltdxhq = LTdxHq()
    df = ltdxhq.get_k_data_1min('000032')  # 000032 300142 603636
    df = df[-240:]
    ltdxhq.close()

    model = PPO.load('ppo_stock')

    env = DummyVecEnv([lambda: LStockDailyEnv(df)])
    obs = env.reset()
    rewards = []
    actions = []
    net_worths = []
    for i in range(NEXT_OBSERVATION_SIZE, df.shape[0]):
        action, _states = model.predict(obs)
        obs, reward, done, info = env.step(action)
        rewards.append(reward)
        actions.append(action[0][0])
        net_worths.append(info[0]['net_worth'])
        # print(info[0]['current_step'])
        env.render()

    # Plot rewards and actions, with net worth on a twin axis
    fig, ax = plt.subplots()
    ax.plot(rewards, label='rewards')
    ax.plot(actions, label='actions')
    ax.legend()
    ax2 = ax.twinx()
    ax2.plot(net_worths, label='net worth', color='red')
    ax2.legend()
    plt.show()
def stock_trade(stock_file):
    day_profits = []
    df = pd.read_csv(stock_file)
    df = df.sort_values('date')

    # The algorithms require a vectorized environment to run
    env = DummyVecEnv([lambda: StockTradingEnv(df)])

    model = PPO('MlpPolicy', env, verbose=0, tensorboard_log='./log')
    model.learn(total_timesteps=int(1e6))

    df_test = pd.read_csv(stock_file.replace('train', 'test'))

    env = DummyVecEnv([lambda: StockTradingEnv(df_test)])
    obs = env.reset()
    for i in range(len(df_test) - 1):
        action, _states = model.predict(obs)
        obs, rewards, done, info = env.step(action)
        profit = env.render()
        day_profits.append(profit)
        if done:
            break
    return day_profits
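# Hedged usage sketch for stock_trade; the CSV path below is hypothetical.
# Note that the test file is located by replacing 'train' with 'test' in the
# path, so the data must be laid out that way.
import matplotlib.pyplot as plt

profits = stock_trade('./stockdata/train/sh.600036.csv')  # hypothetical path
plt.plot(profits, label='daily profit')
plt.legend()
plt.show()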
import gym
import json
import datetime as dt
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3 import PPO

from env.StockTradingEnv import StockTradingEnv

import pandas as pd

df = pd.read_csv('./data/AAPL.csv')
df = df.sort_values('Date')

# The algorithms require a vectorized environment to run
env = DummyVecEnv([lambda: StockTradingEnv(df)])

model = PPO('MlpPolicy', env, verbose=1)
model.learn(total_timesteps=20000)

obs = env.reset()
for i in range(2000):
    action, _states = model.predict(obs)
    obs, rewards, done, info = env.step(action)
    env.render()
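# Optional (sketch): persist the trained agent so it can be reloaded without
# retraining; the file name here is an assumption, not from the original.
model.save("ppo_stock_trading")
model = PPO.load("ppo_stock_trading", env=env)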
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Project : crypto
# @Time    : 2021/4/26 13:48
# @Author  : Adolf
# @File    : ppo_base.py
# @Function:
import pandas as pd

from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv

from strategy.reinforcement_base.environment.crypto_env_v0 import CryptoEnv

# Train on BTC daily bars, evaluate on ETH daily bars
df = pd.read_csv("dataset/1d/BTC.csv")
df_test = pd.read_csv("dataset/1d/ETH.csv")

env = DummyVecEnv([lambda: CryptoEnv(df)])
model = PPO("MlpPolicy", env, verbose=1)
model.learn(total_timesteps=20000)

env_test = DummyVecEnv([lambda: CryptoEnv(df_test)])
obs = env_test.reset()
for i in range(2000):
    action, _states = model.predict(obs)
    obs, rewards, done, info = env_test.step(action)
    env_test.render()
import numpy as np
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv, SubprocVecEnv, VecFrameStack

# make_env and num_envs are assumed to be defined earlier in the script
envs = SubprocVecEnv([make_env] * num_envs)
envs = VecFrameStack(envs, n_stack=4)

model = PPO.load("./subzero_model")
model.set_env(envs)
obs = envs.reset()
print(obs.shape)

# Create one env for testing
env = DummyVecEnv([make_env])
env = VecFrameStack(env, n_stack=4)
obs = env.reset()

# model.predict(test_obs) would throw an error because the number of test
# envs differs from the number of training envs, so we complete the
# observation with zeroes
zero_completed_obs = np.zeros((num_envs,) + envs.observation_space.shape)
zero_completed_obs[0, :] = obs
obs = zero_completed_obs

while True:
    action, _states = model.predict(obs)
    # Only the first action corresponds to the real test env
    obs, rewards, dones, info = env.step(action[:1])
    env.render(mode="human")
    if dones.all():
        break
    # Re-pad the single-env observation before the next prediction
    zero_completed_obs = np.zeros((num_envs,) + envs.observation_space.shape)
    zero_completed_obs[0, :] = obs
    obs = zero_completed_obs
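# Design note (sketch, not from the original): with a feed-forward MlpPolicy,
# stable-baselines3's predict() accepts a batch of any size, so the
# zero-padding above is mainly needed for recurrent policies; the single test
# env can otherwise be stepped directly:
obs = env.reset()
while True:
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    env.render(mode="human")
    if dones.all():
        break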
import gym
# from stable_baselines.common.policies import MlpPolicy
from stable_baselines3.ppo import MlpPolicy
# from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines3.common.vec_env import DummyVecEnv
# from stable_baselines import PPO2
from stable_baselines3 import PPO

from env.StockTradingEnv import StockTradingEnv

import pandas as pd

df = pd.read_csv('./data/MSFT.csv')
df = df.sort_values('Date')

# The algorithms require a vectorized environment to run
env = DummyVecEnv([lambda: StockTradingEnv(df)])

model = PPO(MlpPolicy, env, verbose=1)
model.learn(total_timesteps=50)

obs = env.reset()
print(f"Number of iterations {len(df['Date'])}")
for i in range(len(df['Date'])):
    action, _states = model.predict(obs)
    obs, rewards, done, info = env.step(action)
    env.render(mode='live')
def test_rl():
    import gym
    import datetime as dt
    import matplotlib.pyplot as plt
    from stable_baselines3 import PPO
    from stable_baselines3.common.vec_env import DummyVecEnv
    import pandas as pd
    from lutils.stock import LTdxHq

    # StockDataFrame and LStockDailyEnv come from the surrounding module
    ltdxhq = LTdxHq()
    code = '600519'  # 000032 300142 603636 600519
    df = ltdxhq.get_k_data_1min(code, end='2021-09-02')
    # df = ltdxhq.get_k_data_daily('603636', end='2019-01-01')
    df = StockDataFrame(df.rename(columns={'vol': 'volume'}))

    df_eval = ltdxhq.get_k_data_1min(code, start='2021-09-01')
    df_eval = StockDataFrame(df_eval.rename(columns={'vol': 'volume'}))
    ltdxhq.close()

    # The algorithms require a vectorized environment to run
    env = DummyVecEnv([lambda: LStockDailyEnv(df)])

    model = PPO('MlpPolicy', env, verbose=1)  # , tensorboard_log='log')
    model.learn(100000)
    model.save('ppo_stock')
    # model = PPO.load('ppo_stock')

    # Evaluate on the held-out minutes from 2021-09-01 onwards
    eval_env = DummyVecEnv([lambda: LStockDailyEnv(df_eval)])
    obs = eval_env.reset()
    net_worths = []
    actions = []
    done, state = False, None
    while not done:
        action, state = model.predict(obs, state=state, deterministic=True)
        obs, reward, done, _info = eval_env.step(action)
        net_worths.append(_info[0]['net_worth'])
        actions.append(action[0])
        eval_env.render()

    plt.plot(net_worths)
    plt.plot(actions)
    plt.show()
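    # Optional (sketch): evaluate_policy offers a one-call aggregate score on
    # the evaluation env as an alternative to the manual loop above.
    from stable_baselines3.common.evaluation import evaluate_policy

    mean_reward, std_reward = evaluate_policy(model, eval_env, n_eval_episodes=1, render=True)
    print(mean_reward, std_reward)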