def main():
    train_env = SubprocVecEnv([make_env(ENV_ID, i) for i in range(NUM_ENV)])
    model = PPO2("MlpPolicy", train_env, verbose=1)
    model.learn(total_timesteps=100000)

    test_env = DummyVecEnv([lambda: gym.make(ENV_ID)])
    state = test_env.reset()
    for i in range(200):
        test_env.render()
        action, _ = model.predict(state, deterministic=True)
        state, reward, done, info = test_env.step(action)
        if done:
            break
    test_env.close()
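# The make_env helper used above (and in several later snippets) is not shown
# in the original. A minimal sketch, assuming the usual stable-baselines
# pattern: return a thunk that builds one seeded copy of the environment,
# which is what SubprocVecEnv and DummyVecEnv expect.
import gym
from stable_baselines.common import set_global_seeds

def make_env(env_id, rank, seed=0):
    def _init():
        env = gym.make(env_id)
        env.seed(seed + rank)
        return env
    set_global_seeds(seed)
    return _init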
def stock_trade(stock_file):
    day_profits = []
    df = pd.read_csv(stock_file)
    df = df.sort_values('date')

    # The algorithms require a vectorized environment to run
    env = DummyVecEnv([lambda: StockTradingEnv(df)])

    model = PPO2(MlpPolicy, env, verbose=0, tensorboard_log='./log',
                 gamma=0.95, n_steps=20, learning_rate=2.5e-2)
    model.learn(total_timesteps=int(1e5))

    df_test = pd.read_csv(stock_file.replace('train', 'test'))

    env = DummyVecEnv([lambda: StockTradingEnv(df_test)])
    obs = env.reset()
    for i in range(len(df_test) - 1):
        action, _states = model.predict(obs)
        obs, rewards, done, info = env.step(action)
        profit = env.render()
        day_profits.append(profit)
        if done:
            break
    return day_profits
def DRL_prediction(df, model, name, last_state, iter_num, unique_trade_date,
                   rebalance_window, turbulence_threshold, initial):
    ### make a prediction based on trained model ###

    ## trading env
    trade_data = data_split(df,
                            start=unique_trade_date[iter_num - rebalance_window],
                            end=unique_trade_date[iter_num])
    env_trade = DummyVecEnv([
        lambda: StockEnvTrade(trade_data,
                              turbulence_threshold=turbulence_threshold,
                              initial=initial,
                              previous_state=last_state,
                              model_name=name,
                              iteration=iter_num)
    ])
    obs_trade = env_trade.reset()

    for i in range(len(trade_data.index.unique())):
        action, _states = model.predict(obs_trade)
        obs_trade, rewards, dones, info = env_trade.step(action)
        if i == (len(trade_data.index.unique()) - 2):
            # print(env_test.render())
            last_state = env_trade.render()

    df_last_state = pd.DataFrame({'last_state': last_state})
    df_last_state.to_csv('results/last_state_{}_{}.csv'.format(name, i), index=False)
    return last_state
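# data_split is assumed from the surrounding project and not shown here. A
# minimal sketch in the style of the FinRL helper of the same name (the
# column names 'datadate' and 'tic' are assumptions): slice the frame to the
# date window and index rows by trading day, so len(df.index.unique()) in the
# loops above counts trading days.
def data_split(df, start, end):
    data = df[(df.datadate >= start) & (df.datadate < end)]
    data = data.sort_values(['datadate', 'tic'], ignore_index=True)
    data.index = data.datadate.factorize()[0]
    return data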
def run_PPO2_model(test_data_file, model_path):
    model = PPO2.load(model_path)
    day_profits = []
    buy_hold_profit = []

    df_test = pd.read_csv(test_data_file)
    env_test = DummyVecEnv([lambda: StockTradingEnv_US(df_test)])
    obs = env_test.reset()
    no_of_shares = 0
    buy_hold_commission = 0
    for i in range(len(df_test) - 1):
        if i > 22:
            break
        action, _states = model.predict(obs)
        obs, rewards, done, info = env_test.step(action)
        profit = env_test.render()
        day_profits.append(profit)
        if i == 0:
            buy_hold_profit.append(0)
            no_of_shares = INITIAL_ACCOUNT_BALANCE // df_test.iloc[0]['Close']
            buy_hold_commission = no_of_shares * df_test.iloc[0]['Close'] * 0.001
            print('Buy ' + str(no_of_shares) + ' shares and hold')
        else:
            buy_hold_profit_per_step = no_of_shares * (
                df_test.iloc[i]['Close'] - df_test.iloc[0]['Close']) - buy_hold_commission
            buy_hold_profit.append(buy_hold_profit_per_step)
            print('Buy and Hold: ' + '*' * 40)
            print('No of shares: ' + str(no_of_shares) +
                  ' average cost per share ' + str(df_test.iloc[0]['Close']))
            print('profit is ' + str(buy_hold_profit_per_step))
def main():
    cmd_parser = cmd_parse()
    options = cmd_parser.parse_args()

    ## Get the stock ticker data ##
    # print("The stock ticker used here is", options.ticker)
    file = Path("./data/" + options.ticker + ".csv")
    if file.is_file():
        df = pd.read_csv('./data/' + options.ticker + '.csv')
        df = df.sort_values('Date')
        print("Loading ticker data from: " + "./data/" + options.ticker + ".csv")
    else:
        print("Data file for ticker does not exist. Please download data first to ./data/"
              + options.ticker + ".csv")
        return  # no data, nothing to train on

    training_logs_path = options.output_file + "_training_logs.csv"
    eval_logs_path = options.output_file + "_eval_logs"

    ## Get the training set size ##
    print("The options.training_set_size is", options.training_set_size)
    ## Get the number of look-back days ##
    print("The options.look_back_days here is:", options.look_back_days)
    ## Get the model we are using to train the agent ##
    print("The model to train the agent here is:", options.model)

    # The algorithms require a vectorized environment to run
    env = DummyVecEnv([
        lambda: StockTradingEnv(df, options.look_back_days,
                                options.training_set_size, eval_logs_path)
    ])

    if options.model == "PPO2":
        model = PPO2(MlpPolicy, env, verbose=1)
        model.learn(total_timesteps=options.training_set_size)
        np.savetxt(training_logs_path, model.training_rewards, delimiter=",")

    obs = env.reset()
    for i in range(options.training_set_size, len(df['Date'])):
        action, _states = model.predict(obs)
        obs, rewards, done, info = env.step(action)
        env.render(title=options.ticker)
    env.close()
def main():
    train_env = SubprocVecEnv([make_env(ENV_ID, i) for i in range(NUM_ENV)])
    model = PPO2('MlpPolicy', train_env, verbose=1)
    model.learn(total_timesteps=10000)

    test_env = DummyVecEnv([lambda: gym.make(ENV_ID)])
    state = test_env.reset()
    for i in range(200):
        test_env.render()
        action, _ = model.predict(state)
        state, rewards, done, info = test_env.step(action)
        # Episode finished
        if done:
            break
    # Close the environment
    test_env.close()
def test(obj, gif_steps):
    if isinstance(obj, str):
        model, base_env = load_from_name(obj, best=True)
    else:
        raise ValueError("Unexpected input type to load")
    # print("ENV", base_env)
    # base_env.env.env.early_low_termination = True
    env = DummyVecEnv([lambda: base_env])
    obs = env.reset()

    if gif_steps == 0:
        total_reward = 0
        ep_len = 0
        for _ in range(10000):
            action, _states = model.predict(obs)
            obs, reward, done, info = env.step(action)
            total_reward += reward
            ep_len += 1
            if done:
                print("EP length:", ep_len, " reward:", total_reward)
                total_reward = 0
                ep_len = 0
                obs = env.reset()
            frame = env.render()
    else:
        gif_frames = list()
        rewards = list()
        for i in range(gif_steps):
            action, _states = model.predict(obs)
            obs, reward, done, info = env.step(action)
            if 'frames' in info[0]:
                gif_frames.extend(info[0]['frames'])
            frame = env.render(mode='rgb_array')
            gif_frames.append(frame)
            rewards.append(reward)
        print("REWARD", sum(rewards))

        import imageio
        render_path = os.path.join(RENDERS, obj + '.gif')
        print("saving to ", render_path)
        os.makedirs(os.path.dirname(render_path), exist_ok=True)
        imageio.mimsave(render_path, gif_frames[::5], subrectangles=True, duration=0.05)
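# load_from_name is project-specific and not shown in the original. A purely
# hypothetical sketch of what the call above assumes: resolve a run name to a
# saved checkpoint ('best' selects the best-scoring one) and rebuild its
# paired environment. Paths, file names, and the env-id mapping are all
# assumptions, not the original code.
import os
import gym
from stable_baselines import PPO2

def load_from_name(name, best=False):
    run_dir = os.path.join("runs", name)
    checkpoint = "best_model.zip" if best else "final_model.zip"
    model = PPO2.load(os.path.join(run_dir, checkpoint))
    env = gym.make(name)  # assumes the run name doubles as a gym env id
    return model, env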
def test_gym_integration(self):
    g = nx.Graph()
    g.add_edges_from([(0, 1)])
    nx.set_node_attributes(g, {0: (0, 1), 1: (1, 2)}, "coords")
    orders = [(1, 1, 1, 1, 0.5)]
    drivers = np.ones((2), dtype=int)

    env_id = "TaxiEnvBatchTest-v01"
    gym.envs.register(
        id=env_id,
        entry_point='gym_taxi.envs:TaxiEnvBatch',
        kwargs={
            'world': g,
            'orders': orders,
            'order_sampling_rate': 1,
            'drivers_per_node': drivers,
            'n_intervals': 10,
            'wc': 0.5
        }
    )

    DATA_PATH = os.path.join(os.environ['ALLDATA_PATH'], "macaoFiles", "taxi_env_batch_test")
    if os.path.isdir(DATA_PATH):
        shutil.rmtree(DATA_PATH)
    os.makedirs(DATA_PATH)

    def make_env():
        env = gym.make(env_id)
        env.seed(1)
        return env

    env = DummyVecEnv([make_env])
    model = A2C(MlpPolicy, env, verbose=1)
    model.learn(total_timesteps=10)

    obs = env.reset()
    images = []
    img = env.render(mode="rgb_array")
    images.append(img)
    for _ in range(10):
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        images.append(env.render(mode="rgb_array"))

    imageio.mimwrite(os.path.join(DATA_PATH, 'taxi_batch_a2c.gif'),
                     [np.array(img) for i, img in enumerate(images)],
                     format="GIF-PIL", fps=5)
def main():
    train_env = DummyVecEnv([make_env(ENV_ID, i) for i in range(NUM_ENV)])
    model = PPO2('CnnPolicy', train_env, verbose=0)
    model.learn(total_timesteps=1280000, callback=callback)

    test_env = DummyVecEnv([make_env(ENV_ID, 9)])
    state = test_env.reset()
    total_reward = 0
    while True:
        test_env.render()
        time.sleep(1 / 60)
        action, _ = model.predict(state)
        state, reward, done, info = test_env.step(action)
        total_reward += reward[0]
        if done:
            print('reward:', total_reward)
            state = test_env.reset()
            total_reward = 0
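# The `callback` passed to model.learn above is not defined in the original.
# A minimal sketch, assuming the stable-baselines 2.x function-style callback:
# it receives the training local and global namespaces and can stop training
# early by returning False.
def callback(_locals, _globals):
    # In PPO2.learn the update-loop counter is exposed as _locals['update']
    update = _locals.get('update', 0)
    if update % 10 == 0:
        print('update:', update)
    return True  # returning False would stop training early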
def run():
    """
    Run a trained model on the CarRacing environment.
    """
    env = gym.make('CarRacing-v0')
    env = DummyVecEnv([lambda: env])
    # model = PPO2.load("CarRacing_model_PPO1_" + str(5) + ".pkl", env)
    model = PPO2.load("CarRacing_model_PPO2_5.pkl", env)

    avg_rew = evaluate(model=model, env=env, num_steps=10000)

    while True:
        obs, done = env.reset(), False
        episode_rew = 0
        while not done:
            env.render()
            action, _ = model.predict(obs)
            obs, rew, done, _ = env.step(action)
            episode_rew += rew
        print("Episode reward", episode_rew)
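# `evaluate` is not shown in the original. A minimal sketch, assuming it
# steps a (single) vectorized env for num_steps and reports the mean episode
# reward; the signature matches the call above, the body is an assumption.
import numpy as np

def evaluate(model, env, num_steps=1000):
    obs = env.reset()
    episode_rewards, ep_rew = [], 0.0
    for _ in range(num_steps):
        action, _ = model.predict(obs)
        obs, reward, done, _ = env.step(action)
        ep_rew += reward[0]
        if done[0]:
            # DummyVecEnv auto-resets the env when an episode ends
            episode_rewards.append(ep_rew)
            ep_rew = 0.0
    mean_reward = float(np.mean(episode_rewards)) if episode_rewards else ep_rew
    print("Mean reward:", mean_reward, "Num episodes:", len(episode_rewards))
    return mean_reward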
def run_baseline_ddpg(env_name, train=True):
    import numpy as np
    # from stable_baselines.ddpg.policies import MlpPolicy
    from stable_baselines.common.vec_env import DummyVecEnv
    from stable_baselines.ddpg.noise import OrnsteinUhlenbeckActionNoise
    from stable_baselines import DDPG

    env = gym.make(env_name)
    env = DummyVecEnv([lambda: env])

    if train:
        # mlp
        from stable_baselines.ddpg.policies import FeedForwardPolicy

        class CustomPolicy(FeedForwardPolicy):
            def __init__(self, *args, **kwargs):
                super(CustomPolicy, self).__init__(*args, **kwargs,
                                                   layers=[64, 64, 64],
                                                   layer_norm=True,
                                                   feature_extraction="mlp")

        # the noise objects for DDPG
        n_actions = env.action_space.shape[-1]
        param_noise = None
        action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions) + 0.15,
                                                    sigma=0.3 * np.ones(n_actions))

        model = DDPG(CustomPolicy, env, verbose=1,
                     param_noise=param_noise, action_noise=action_noise,
                     tau=0.01,
                     observation_range=(env.observation_space.low, env.observation_space.high),
                     critic_l2_reg=0, actor_lr=1e-3, critic_lr=1e-3,
                     memory_limit=100000)
        model.learn(total_timesteps=int(1e5))
        model.save("checkpoints/ddpg_" + env_name)
    else:
        model = DDPG.load("checkpoints/ddpg_" + env_name)
        obs = env.reset()
        while True:
            action, _states = model.predict(obs)
            obs, rewards, dones, info = env.step(action)
            env.render()
            print("state: ", obs, " reward: ", rewards, " done: ", dones, "info: ", info)

    del model  # remove to demonstrate saving and loading
def test():
    env = DummyVecEnv([lambda: PrticleEnv(alpha, win_thre)])
    model = PPO2.load("ppo2_particle")

    # Enjoy trained agent
    obs = env.reset()
    dones = False
    x = 0
    y = 0
    x_prev = 20
    y_prev = 20
    while not dones:
        # Only re-render when the agent moves to a new integer cell
        if int(x) != int(x_prev) or int(y) != int(y_prev):
            env.render()
            x_prev, y_prev = x, y
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        x = obs[0][0]
        y = obs[0][2]
    env.close()
def test(self, model_epoch: int = 0, should_render: bool = True):
    train_provider, test_provider = self.data_provider.split_data_train_test(
        self.train_split_percentage)
    del train_provider

    init_envs = DummyVecEnv([make_env(test_provider) for _ in range(self.n_envs)])

    model_path = path.join('data', 'agents', f'{self.study_name}__{model_epoch}.pkl')
    model = self.Model.load(model_path, env=init_envs)

    test_env = DummyVecEnv([make_env(test_provider) for _ in range(1)])

    self.logger.info(f'Testing model ({self.study_name}__{model_epoch})')

    # A model trained on n_envs environments expects a batch of n_envs
    # observations, so pad the single test observation with zeros.
    zero_completed_obs = np.zeros((self.n_envs,) + init_envs.observation_space.shape)
    zero_completed_obs[0, :] = test_env.reset()

    state = None
    rewards = []

    for _ in range(len(test_provider.data_frame)):
        action, state = model.predict(zero_completed_obs, state=state)
        obs, reward, _, __ = test_env.step([action[0]])

        zero_completed_obs[0, :] = obs
        rewards.append(reward)

        if should_render:
            test_env.render(mode='human')

    self.logger.info(
        f'Finished testing model ({self.study_name}__{model_epoch}): '
        f'${np.sum(rewards):.2f}')
def view_ppo2_mlplstm():
    env = DummyVecEnv([lambda: gimbal(5, 500)])
    model = PPO2.load("./models/baseline_ppo2_t6_dynamicR")

    success_rate = 0
    reward_avg = 0
    for episodes in range(50):
        obs = env.reset()
        state = None
        done = [False]
        r = 0
        while True:
            action, state = model.predict(obs, state=state, mask=done)
            obs, rewards, done, _ = env.step(action)
            r += rewards
            env.render()
            if done:
                if r > -100:
                    success_rate += 1
                    reward_avg += r
                break

    # Guard against division by zero when no episode clears the threshold
    if success_rate:
        print("Success rate: ", success_rate, "Avg rewards: ", (reward_avg / success_rate))
    else:
        print("Success rate: 0")
def sb_lstm():
    train_env = DummyVecEnv([
        lambda: Environment(mode="train", interval=INTERVAL, pair=PAIR,
                            algo=ALGO, data_features=FEATURES)
    ])
    # For recurrent policies the number of environments must be a multiple
    # of nminibatches; with a single env that means nminibatches=1
    model = PPO2('MlpLstmPolicy', train_env, nminibatches=1, verbose=1)
    model.learn(TRAIN_TIMESTEPS)

    validate_env = DummyVecEnv([
        lambda: Environment(mode="validate", interval=INTERVAL, pair=PAIR,
                            algo=ALGO, data_features=FEATURES)
    ])
    obs = validate_env.reset()
    state = None
    done = [False for _ in range(validate_env.num_envs)]
    for _ in range(len(validate_env.envs[0].df)):
        action, state = model.predict(obs, state=state, mask=done)
        obs, reward, done, _ = validate_env.step(action)
        validate_env.render()
def test_cartpole():
    env = gym.make('CartPole-v0')
    env = DummyVecEnv([lambda: env])
    model = PPO2(MlpPolicy, env)
    model.learn(total_timesteps=100000)

    rewards = []
    for i in range(10):
        done = False
        cum_rewards = 0
        obs = env.reset()
        while not done:
            action, _states = model.predict(obs)
            obs, reward, done, info = env.step(action)
            cum_rewards += reward
            env.render()
        rewards.append(cum_rewards)
        print(cum_rewards)

    avg_rewards = sum(rewards) / len(rewards)
    print('average', avg_rewards)
    assert avg_rewards >= 200
    env.close()
def main():
    train_env = DummyVecEnv([make_env(ENV_ID, i) for i in range(NUM_ENV)])
    model = PPO2("CnnPolicy", train_env, verbose=0, cliprange=0.1)
    # Load a previously trained model instead of training from scratch
    model = PPO2.load("./logs/best_model.zip", env=train_env, verbose=0)
    callback = SaveOnBestTrainingRewardCallback(check_freq=10, log_dir=log_dir)
    # model.learn(total_timesteps=1280000, callback=callback)

    test_env = DummyVecEnv([make_env(ENV_ID, 9)])
    state = test_env.reset()
    total_reward = 0
    while True:
        test_env.render()
        time.sleep(1 / 60)
        action, _ = model.predict(state)
        state, reward, done, info = test_env.step(action)
        total_reward += reward[0]
        if done:
            print(f"reward:{total_reward}")
            state = test_env.reset()
            total_reward = 0
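# SaveOnBestTrainingRewardCallback is not defined in the snippet above. A
# sketch close to the stable-baselines 2.10 docs example is shown here for
# reference; it assumes the training envs are wrapped in bench.Monitor
# writing episode logs to log_dir.
import os
import numpy as np
from stable_baselines.common.callbacks import BaseCallback
from stable_baselines.results_plotter import load_results, ts2xy

class SaveOnBestTrainingRewardCallback(BaseCallback):
    def __init__(self, check_freq, log_dir, verbose=1):
        super(SaveOnBestTrainingRewardCallback, self).__init__(verbose)
        self.check_freq = check_freq
        self.log_dir = log_dir
        self.save_path = os.path.join(log_dir, 'best_model')
        self.best_mean_reward = -np.inf

    def _on_step(self):
        if self.n_calls % self.check_freq == 0:
            # Mean reward over the last 100 monitored episodes
            x, y = ts2xy(load_results(self.log_dir), 'timesteps')
            if len(x) > 0:
                mean_reward = np.mean(y[-100:])
                if mean_reward > self.best_mean_reward:
                    self.best_mean_reward = mean_reward
                    self.model.save(self.save_path)
        return True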
def main():
    # Create the training environment
    train_env = DummyVecEnv([make_env(ENV_ID, i) for i in range(NUM_ENV)])

    # Create the model
    model = PPO2('CnnPolicy', train_env, verbose=0, cliprange=0.1)

    # Load a saved model
    # model = PPO2.load('breakout_model', env=train_env, verbose=0)

    # Train the model
    model.learn(total_timesteps=1280000, callback=callback)

    # Create the test environment
    test_env = DummyVecEnv([make_env(ENV_ID, 9)])

    # Test the model
    state = test_env.reset()
    total_reward = 0
    while True:
        # Render the environment
        test_env.render()
        time.sleep(1 / 60)

        # Model inference
        action, _ = model.predict(state)

        # Execute one step
        state, reward, done, info = test_env.step(action)

        # Episode complete
        total_reward += reward[0]
        if done:
            print('reward:', total_reward)
            state = test_env.reset()
            total_reward = 0
def main(model_name, algo, testRange, isTargetPositionFixed, isDiscrete):
    panda_env = PandaGraspGymEnv(urdfRoot=object_data.getDataPath(),
                                 isRendering=True,
                                 useIK=True,
                                 isDiscrete=isDiscrete,
                                 numControlledJoints=7,
                                 isTargetPositionFixed=isTargetPositionFixed)
    env = DummyVecEnv([lambda: panda_env])

    if algo == "DDPG":
        model = DDPG.load(model_name)
    else:
        model = DQN.load(model_name)

    obs = env.reset()
    images = []
    img = env.get_images()
    for i in range(testRange):
        images.append(img)
        action, _states = model.predict(obs, deterministic=True)
        print("Step: {} Action: {}".format(i, action))
        obs, rewards, done, info = env.step(action)
        env.render(mode='human')
        img = env.get_images()

    os.makedirs(gif_dir, exist_ok=True)
    imageio.mimsave(gif_dir + model_name + '.gif',
                    [np.array(img[0]) for i, img in enumerate(images) if i % 2 == 0],
                    fps=29)
def run(self, max_episodes=500, max_timesteps=10000):
    """
    Run the PPO RL algorithm provided by the stable-baselines library
    (https://github.com/hill-a/stable-baselines) and save the generated
    model back to the training job's S3 bucket.
    """
    unity_file = self.download_unity_env()
    env = self.get_gym_env(unity_file)

    # ========================================== #
    # RL stable-baselines algorithms require a vectorized environment to run
    env = DummyVecEnv([lambda: env])
    model = PPO2(MlpPolicy, env, verbose=1)
    model.learn(total_timesteps=max_timesteps)

    obs = env.reset()
    for i in range(max_episodes):
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        env.render()

    sb_model_path = os.path.join('/tmp', 'ppo2_rldemo_sb')
    model.save(sb_model_path)

    # Note: the content of /opt/ml/model and /opt/ml/output is automatically uploaded
    # to the previously selected bucket (by the estimator) at the end of the execution.
    # os.environ['SM_MODEL_DIR'] corresponds to /opt/ml/model
    model_path = os.path.join(os.environ['SM_MODEL_DIR'], 'ppo2_rldemo')

    # Note: this model cannot be directly employed in Unity ml-agents;
    # it has to be converted into Barracuda format.
    generate_checkpoint_from_model(sb_model_path, model_path)
    # ========================================== #

    BaselinePPOTrainer.close_env(env)
def train_multitask(df, unique_trade_date, timesteps=10,
                    policy="MlpPolicy", model_name="multitask"):
    # df contains all tickers intermixed; pull out the individual tickers
    # and step through the dates.
    # timesteps = number of training steps per date
    start = time.time()
    df = data_split(df, start=unique_trade_date[0],
                    end=unique_trade_date[len(unique_trade_date) - 1])
    last_state, initial = [], True
    model = None

    for i in range(len(unique_trade_date) - 2):
        for ticker in df["tic"].unique():
            # The interval is every two days so we can optimize on the
            # change in account value
            start_date, end_date = unique_trade_date[i], unique_trade_date[i + 2]
            quanta_df = data_split(df, start=start_date, end=end_date)
            quanta_df = quanta_df[quanta_df["tic"] == ticker]
            if len(quanta_df.index) < 2:
                continue
            quanta_df = quanta_df.reset_index()

            quanta_env = DummyVecEnv([
                lambda: StockEnvTrade(quanta_df,
                                      previous_state=last_state,
                                      initial=initial,
                                      log_interval=1)
            ])
            quanta_env.reset()
            model = train_PPO_update(model, quanta_env, timesteps, policy=policy)
            last_state = quanta_env.render()
            initial = False

    model.save(f"{config.TRAINED_MODEL_DIR}/{model_name}")
    end = time.time()
    print('Training time (Multitask): ', (end - start) / 60, ' minutes')
    return model
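# train_PPO_update is not shown in the original. A minimal sketch under the
# assumption that it lazily creates the PPO2 model on the first call, then
# re-points it at each new quanta env and continues training without
# resetting the timestep counter.
from stable_baselines import PPO2

def train_PPO_update(model, env, timesteps, policy="MlpPolicy"):
    if model is None:
        model = PPO2(policy, env, verbose=0)
    else:
        model.set_env(env)
    model.learn(total_timesteps=timesteps, reset_num_timesteps=False)
    return model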
def DRL_prediction(self, model, name, last_state, iter_num,
                   turbulence_threshold, initial):
    ### make a prediction based on trained model ###

    ## trading env
    trade_data = data_split(
        self.df,
        start=self.unique_trade_date[iter_num - self.rebalance_window],
        end=self.unique_trade_date[iter_num])
    trade_env = DummyVecEnv([
        lambda: StockTradingEnv(trade_data,
                                self.stock_dim,
                                self.hmax,
                                self.initial_amount,
                                self.buy_cost_pct,
                                self.sell_cost_pct,
                                self.reward_scaling,
                                self.state_space,
                                self.action_space,
                                self.tech_indicator_list,
                                turbulence_threshold=turbulence_threshold,
                                initial=initial,
                                previous_state=last_state,
                                model_name=name,
                                mode='trade',
                                iteration=iter_num,
                                print_verbosity=self.print_verbosity)
    ])
    trade_obs = trade_env.reset()

    for i in range(len(trade_data.index.unique())):
        action, _states = model.predict(trade_obs)
        trade_obs, rewards, dones, info = trade_env.step(action)
        if i == (len(trade_data.index.unique()) - 2):
            # print(env_test.render())
            last_state = trade_env.render()

    df_last_state = pd.DataFrame({'last_state': last_state})
    df_last_state.to_csv('results/last_state_{}_{}.csv'.format(name, i), index=False)
    return last_state
def drl_prediction(df, model, name, last_state, turbulence_threshold):
    """Make a prediction based on a trained model."""
    trade_data = data_split(df, start=20160102, end=2021010)
    env_trade = DummyVecEnv([
        lambda: StockEnvTrade(trade_data,
                              turbulence_threshold=turbulence_threshold,
                              previous_state=last_state,
                              model_name=name)
    ])
    obs_trade = env_trade.reset()
    for i in range(len(trade_data.index.unique())):
        action, _states = model.predict(obs_trade)
        obs_trade, rewards, dones, info = env_trade.step(action)
        if i == (len(trade_data.index.unique()) - 2):
            last_state = env_trade.render()

    df_last_state = pd.DataFrame({'last_state': last_state})
    df_last_state.to_csv('results/last_state_{}.csv'.format(name), index=False)
    return last_state
def stock_trade(stock_file_train):
    df_train = pd.read_csv(stock_file_train)
    df_train = df_train.sort_values('date')

    # The algorithms require a vectorized environment to run
    env_train = DummyVecEnv([lambda: StockTradingEnv(df_train)])

    model = PPO2(MlpPolicy, env_train, verbose=0, tensorboard_log='./log')
    # model = DQN("MlpPolicy", env_train, verbose=0, tensorboard_log='./log')
    model.learn(total_timesteps=int(1e4))

    # ----------------- Test Model --------------------------------------
    day_profits = []
    buy_hold_profit = []
    df_test = pd.read_csv(stock_file_train.replace('train', 'test'))

    env_test = DummyVecEnv([lambda: StockTradingEnv(df_test)])
    obs = env_test.reset()
    no_of_shares = 0
    for i in range(len(df_test) - 1):
        action, _states = model.predict(obs)
        obs, rewards, done, info = env_test.step(action)
        profit = env_test.render()
        day_profits.append(profit)
        if i == 0:
            buy_hold_profit.append(0)
            no_of_shares = INITIAL_ACCOUNT_BALANCE // df_test.iloc[0]['close']
            print('Buy ' + str(no_of_shares) + ' shares and hold')
        else:
            buy_hold_profit.append(
                no_of_shares * (df_test.iloc[i]['close'] - df_test.iloc[i - 1]['close']))
        if done:
            break
    return day_profits, buy_hold_profit
def fund_trade(ts_code):
    day_profits = []
    df = read_fund_nav(ts_code, 3000).head(-30)
    df = df.sort_values(by='end_date', ascending=True).reset_index(drop=True)
    # TODO: the last row contains an estimated NAV, which causes an error;
    # drop it for now
    df.drop([len(df) - 1], inplace=True)

    # The algorithms require a vectorized environment to run
    env = DummyVecEnv([lambda: FundTradingEnv(df)])
    model = PPO2(MlpPolicy, env, verbose=0, tensorboard_log='./log', seed=1)
    model.learn(total_timesteps=int(1e4))

    print("Start testing")
    df_test = df.tail(30).reset_index(drop=True)

    env = DummyVecEnv([lambda: FundTradingEnv(df_test)])
    obs = env.reset()
    for i in range(len(df_test) - 1):
        action, _states = model.predict(obs)
        obs, rewards, done, info = env.step(action)
        profit = env.render()
        day_profits.append(profit)
        if done:
            break
    return day_profits
import gym
from gym import spaces
import numpy as np
from stable_baselines import PPO2
from stable_baselines.common.vec_env import DummyVecEnv

# n_cpu = 4
# total_timesteps = 200000000
# # total_timesteps = 200000
# env = SubprocVecEnv([lambda: gym.make('WalkingSpider-v0') for i in range(n_cpu)])
# model = PPO2(MlpPolicy, env, verbose=1)
# model.learn(total_timesteps=total_timesteps)
# model.save("experience_learned/ppo2_WalkingSpider_v0_testing")
# del model  # remove to demonstrate saving and loading

# Enjoy trained agent
model = PPO2.load("experience_learned/ppo2_WalkingSpider_v0_testing_3")
print("Enjoy trained agent")
env = DummyVecEnv([lambda: gym.make('WalkingSpider-v0')])
obs = env.reset()
while True:
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    env.render()

# Random environment
# env = gym.make('WalkingSpider-v0')
# env.reset()
# for _ in range(1000):
#     env.render()
#     observation, reward, done, info = env.step(env.action_space.sample())  # take a random action
#     print("Obs shape ", observation, " Action shape ", env.action_space.sample().shape)
model.save(path.join(SAVE_DIR, model_name), cloudpickle=True)

obs = testEnv.reset()
# Test for up to 365 * 5 = 1825 consecutive days
for testNo in range(365 * 5):
    action, _states = model.predict(obs)
    if np.isnan(action).any():
        print(testNo)
    obs, rewards, done, info = testEnv.step(action)
    if done:
        print("Done")
        break
    profit_list.append(info[0]['profit'])
    act_profit_list.append(info[0]['actual_profit'])

    singleDay_record = testEnv.render(mode="detail")
    singleDay_record['testNo'] = testNo
    singleDay_record['rewards'] = rewards[0]
    detail_list.append(singleDay_record)

    if testNo % 365 == 0:
        print("\n============= TESTING " + str(testNo) + " =============\n")
        testEnv.render()

detail_fileName = detail_fileName_model[:-5] + str(tstep) + '-' + str(modelNo) + detail_fileName_model[-4:]
pickle.dump(detail_list, open(path.join(SAVE_DIR, detail_fileName), "wb"))
final_result.append({
def stock_trade_US(stock_file_train, no_of_test_trading_days):
    df_train = pd.read_csv(stock_file_train)
    # df_train = df_train.sort_values('date')

    # The algorithms require a vectorized environment to run
    env_train = DummyVecEnv([lambda: StockTradingEnv_US(df_train)])

    total_timesteps = int(4e4)
    # total_timesteps = int(1e5)
    model = PPO2('MlpPolicy', env_train, verbose=0, tensorboard_log='./log',
                 seed=12345).learn(total_timesteps=total_timesteps)

    # Random Agent, after training
    # mean_reward, std_reward = evaluate_policy(model, env_train, n_eval_episodes=100)
    # print(f"after training, mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")

    # ----------------- Test Model --------------------------------------
    import sys
    sys.stdout = open(
        f'./output/output_SPY_{total_timesteps}_days_{no_of_test_trading_days}.txt', 'wt')

    day_profits = []
    buy_hold_profit = []
    df_test_raw = pd.read_csv(stock_file_train.replace('train', 'test'))

    # start from a random day
    # df_test = df_test_raw.iloc[200:].reset_index(drop=True)
    df_test = df_test_raw
    df_test = df_test.drop(['Adj Close'], axis=1)

    env_test = DummyVecEnv([lambda: StockTradingEnv_US(df_test)])
    obs = env_test.reset()
    no_of_shares = 0
    buy_hold_commission = 0
    for n in range(len(df_test) - 1):
        if n > no_of_test_trading_days:
            break
        action, _states = model.predict(obs)
        # let the agent start with a buy-all
        # if n == 0:
        #     action[0][0] = 0
        #     action[0][1] = 1
        obs, rewards, done, info = env_test.step(action)
        profit = env_test.render()
        day_profits.append(profit)
        if n == 0:
            buy_hold_profit.append(0)
            no_of_shares = INITIAL_ACCOUNT_BALANCE // df_test.iloc[0]['Close']
            buy_hold_commission = no_of_shares * df_test.iloc[0]['Close'] * 0.001
            print('Buy ' + str(no_of_shares) + ' shares and hold')
        else:
            buy_hold_profit_per_step = no_of_shares * (
                df_test.iloc[n]['Close'] - df_test.iloc[0]['Close']) - buy_hold_commission
            buy_hold_profit.append(buy_hold_profit_per_step)
            print('Buy and Hold: ' + '*' * 40)
            print('No of shares: ' + str(no_of_shares) +
                  ' average cost per share ' + str(df_test.iloc[0]['Close']))
            print('profit is ' + str(buy_hold_profit_per_step))
        if done:
            break

    good_model = day_profits[-1] > buy_hold_profit[-1]
    return day_profits, buy_hold_profit, good_model, model, total_timesteps
import pandas as pd
from stable_baselines.common.policies import MlpPolicy
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines import A2C

from env.BitcoinTradingEnv import BitcoinTradingEnv

df = pd.read_csv('./data/bitstamp.csv')
df = df.sort_values('Timestamp')

slice_point = int(len(df) - 50000)
train_df = df[:slice_point]
test_df = df[slice_point:]

train_env = DummyVecEnv([lambda: BitcoinTradingEnv(train_df, serial=True)])

model = A2C(MlpPolicy, train_env, verbose=1, tensorboard_log="./tensorboard/")
model.learn(total_timesteps=200000)

test_env = DummyVecEnv([lambda: BitcoinTradingEnv(test_df, serial=True)])

obs = test_env.reset()
for i in range(50000):
    action, _states = model.predict(obs)
    obs, rewards, done, info = test_env.step(action)
    test_env.render(mode="system", title="BTC")

test_env.close()
def test(self, model_epoch: int = 0, render_env: bool = True,
         render_report: bool = True, save_report: bool = False):
    train_provider, test_provider = self.data_provider.split_data_train_test(
        self.train_split_percentage)
    del train_provider

    init_envs = DummyVecEnv([make_env(test_provider) for _ in range(self.n_envs)])

    model_path = path.join('data', 'agents', f'{self.study_name}__{model_epoch}.pkl')
    model = self.Model.load(model_path, env=init_envs)

    test_env = DummyVecEnv([make_env(test_provider) for _ in range(1)])

    self.logger.info(f'Testing model ({self.study_name}__{model_epoch})')

    # A model trained on n_envs environments expects a batch of n_envs
    # observations, so pad the single test observation with zeros.
    zero_completed_obs = np.zeros((self.n_envs,) + init_envs.observation_space.shape)
    zero_completed_obs[0, :] = test_env.reset()

    state = None
    rewards = []

    for _ in range(len(test_provider.data_frame)):
        action, state = model.predict(zero_completed_obs, state=state)
        obs, reward, done, info = test_env.step([action[0]])

        zero_completed_obs[0, :] = obs
        rewards.append(reward)

        if render_env:
            test_env.render(mode='human')

        if done:
            net_worths = pd.DataFrame({
                'Date': info[0]['timestamps'],
                'Balance': info[0]['net_worths'],
            })
            net_worths.set_index('Date', drop=True, inplace=True)
            returns = net_worths.pct_change()[1:]

            if render_report:
                qs.plots.snapshot(returns.Balance, title='RL Trader Performance')

            if save_report:
                reports_path = path.join('data', 'reports',
                                         f'{self.study_name}__{model_epoch}.html')
                qs.reports.html(returns.Balance, file=reports_path)

    self.logger.info(
        f'Finished testing model ({self.study_name}__{model_epoch}): '
        f'${np.sum(rewards):.2f}')