import os

import pandas as pd
from stable_baselines3 import DDPG, PPO, TD3
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import DummyVecEnv

# MODEL_PATH, PARAM, DT, ENV and TEST_STOCK_NUM are assumed to be defined elsewhere in
# the project. The EvalCallback used below accepts save_freq and exposes
# best_mean_reward_overall / best_mean_reward_timestep, so it appears to be a
# project-specific extension of stable-baselines3's EvalCallback rather than the stock class.


def train():
    best_reward, best_reward_timesteps = None, None
    save_path = "model_save/" + MODEL_PATH + "/"
    os.makedirs(save_path, exist_ok=True)
    # log_dir = f"model_save/"
    log_dir = save_path

    # Build the training and validation environments and wrap them for SB3.
    env, env_eval = ENV(util='train', par=PARAM, dt=DT), ENV(util='val', par=PARAM, dt=DT)
    env, env_eval = Monitor(env, log_dir), Monitor(env_eval, log_dir)
    env, env_eval = DummyVecEnv([lambda: env]), DummyVecEnv([lambda: env_eval])
    # env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10.)

    if PARAM['algo'] == 'td3':
        model = TD3('MlpPolicy', env, verbose=1, batch_size=PARAM['batch_size'],
                    seed=PARAM['seed'], learning_starts=PARAM['learning_starts'])
    elif PARAM['algo'] == 'ddpg':
        model = DDPG('MlpPolicy', env, verbose=1, batch_size=PARAM['batch_size'],
                     seed=PARAM['seed'], learning_starts=PARAM['learning_starts'])
    elif PARAM['algo'] == 'ppo':
        model = PPO('MlpPolicy', env, verbose=1, batch_size=PARAM['batch_size'],
                    seed=PARAM['seed'])

    eval_callback = EvalCallback(env_eval, best_model_save_path=save_path + MODEL_PATH + '_best_model',
                                 log_path=log_dir, eval_freq=PARAM['eval_freq'],
                                 save_freq=PARAM['save_freq'], deterministic=True, render=False)
    model.learn(total_timesteps=int(PARAM['total_time_step']), callback=eval_callback,
                log_interval=500)
    print("best mean reward:", eval_callback.best_mean_reward_overall,
          "timesteps:", eval_callback.best_mean_reward_timestep)
    model.save(save_path + MODEL_PATH + '_final_timesteps')
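# For reference, a minimal sketch of the hyper-parameter dict that train() reads.
# The keys are taken from the code above; the name EXAMPLE_PARAM and all values are
# purely illustrative assumptions, not the settings used in the original project.
EXAMPLE_PARAM = {
    'algo': 'td3',            # one of 'td3', 'ddpg', 'ppo'
    'batch_size': 64,
    'seed': 0,
    'learning_starts': 1000,  # only read by TD3 / DDPG
    'eval_freq': 500,         # evaluation interval for the EvalCallback
    'save_freq': 500,         # checkpoint interval for the extended EvalCallback
    'total_time_step': 100000,
}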
def test(MODEL_TEST):
    log_dir = "model_save/" + MODEL_PATH + "/" + MODEL_PATH + MODEL_TEST
    env = ENV(util='test', par=PARAM, dt=DT)
    env.render = True
    env = Monitor(env, log_dir)

    if PARAM['algo'] == 'td3':
        model = TD3.load(log_dir)
    elif PARAM['algo'] == 'ddpg':
        model = DDPG.load(log_dir)
    elif PARAM['algo'] == 'ppo':
        model = PPO.load(log_dir)
    # plot_results(f"model_save/")

    trade_dt = pd.DataFrame([])   # trade_dt: per-step trading data for all stocks
    result_dt = pd.DataFrame([])  # result_dt: one-year test results for all stocks
    for i in range(TEST_STOCK_NUM):
        state = env.reset()
        stock_bh_id = 'stock_bh_' + str(i)          # buy-and-hold curve for this stock
        stock_port_id = 'stock_port_' + str(i)      # portfolio value for this stock
        stock_action_id = 'stock_action_' + str(i)  # actions taken for this stock
        flow_L_id = 'stock_flow_' + str(i)          # cash flow for this stock
        stock_bh_dt, stock_port_dt, action_policy_dt, flow_L_dt = [], [], [], []
        day = 0
        while True:
            action = model.predict(state)
            next_state, reward, done, info = env.step(action[0])
            state = next_state
            # print("trying:", day, "reward:", reward, "now profit:", env.profit)
            # Record the trading policy at every step.
            stock_bh_dt.append(env.buy_hold)
            stock_port_dt.append(env.Portfolio_unit)
            action_policy_dt.append(action[0][0])  # record the policy's action
            flow_L_dt.append(env.flow)
            day += 1
            if done:
                print('stock: {}, total profit: {:.2f}%, buy hold: {:.2f}%, sp: {:.4f}, mdd: {:.2f}%, romad: {:.4f}'
                      .format(i, env.profit * 100, env.buy_hold * 100, env.sp, env.mdd * 100, env.romad))
                # After trading, record: stock ID, profit (%), buy & hold (%),
                # Sharpe ratio, max drawdown (%), RoMaD.
                result = pd.DataFrame([[i, env.profit * 100, env.buy_hold * 100, env.sp, env.mdd * 100, env.romad]])
                break
        # Trading data for this single stock.
        trade_dt_stock = pd.DataFrame({stock_port_id: stock_port_dt, stock_bh_id: stock_bh_dt,
                                       stock_action_id: action_policy_dt, flow_L_id: flow_L_dt})
        trade_dt = pd.concat([trade_dt, trade_dt_stock], axis=1)  # merge trading data of all stocks (new columns)
        result_dt = pd.concat([result_dt, result], axis=0)        # merge result rows of all stocks (new rows)
    result_dt.columns = ['stock_id', 'profit(100%)', 'buy_hold(100%)', 'sp', 'mdd(100%)', 'romad']
    trade_dt.to_csv('out_dt/trade_' + MODEL_PATH + '.csv', index=False)
    result_dt.to_csv('out_dt/result_' + MODEL_PATH + '.csv', index=False)
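# Optional helper (hypothetical, not part of the original code): reads the result CSV
# written by test() and prints an aggregate comparison of the learned policy against
# buy-and-hold. The column names match those written above.
def summarize_results():
    result_dt = pd.read_csv('out_dt/result_' + MODEL_PATH + '.csv')
    print('mean profit: {:.2f}%, mean buy & hold: {:.2f}%, mean sharpe: {:.4f}'
          .format(result_dt['profit(100%)'].mean(),
                  result_dt['buy_hold(100%)'].mean(),
                  result_dt['sp'].mean()))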
# Alternative, simpler training routine (PPO only, single environment).
# SaveOnBestTrainingRewardCallback is assumed to be a custom callback defined elsewhere
# in the project that checks the Monitor log every check_freq steps and saves the model
# when the mean training reward improves.
def train():
    log_dir = f"model_save/"
    env = ENV(istest=False)
    env = Monitor(env, log_dir)
    env = DummyVecEnv([lambda: env])
    # env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10.)
    model = PPO('MlpPolicy', env, verbose=1, batch_size=PARAM['batch_size'], seed=PARAM['seed'])
    callback = SaveOnBestTrainingRewardCallback(check_freq=480, log_dir=log_dir)
    model.learn(total_timesteps=int(PARAM['total_time_step']), callback=callback, log_interval=480)
    model.save('model_save/' + MODEL_PATH)
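# A hedged sketch of a command-line entry point, assuming MODEL_PATH, PARAM, DT and ENV
# are configured at module level. Note that two train() definitions appear above; Python
# keeps whichever is defined last, so retain only the one you intend to run.
if __name__ == '__main__':
    train()
    # '_final_timesteps' matches the suffix used by the first train()'s model.save();
    # adjust MODEL_TEST if you want to load a best-model checkpoint instead.
    test('_final_timesteps')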