import os
from datetime import datetime

import gtimer as gt
import gym
import matplotlib.pyplot as plt
import pandas as pd

import rlkit.torch.pytorch_util as ptu
from rlkit.data_management.env_replay_buffer import EnvReplayBuffer
from rlkit.envs.wrappers import NormalizedBoxEnv
from rlkit.launchers.launcher_util import setup_logger
from rlkit.samplers.data_collector import MdpPathCollector
from rlkit.torch.sac.policies import MakeDeterministic
from rlkit.torch.torch_rl_algorithm import TorchBatchRLAlgorithm

# Project-local helpers (get_env, get_sac_model, get_trainer, load_dataset,
# my_eval_policy, plot_ma) are assumed to be defined or imported elsewhere
# in this project.


def experiment(variant):
    expl_env = get_env()
    eval_env = get_env()

    M = variant['layer_size']
    trainer = get_sac_model(env=eval_env, hidden_sizes=[M, M])
    policy = trainer.policy
    # Evaluate with the deterministic (mean) action of the SAC policy.
    eval_policy = MakeDeterministic(policy)
    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )
    expl_path_collector = MdpPathCollector(
        expl_env,
        policy,
    )
    replay_buffer = EnvReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
    )
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs'],
    )

    # Rolling evaluation results, appended to after every epoch.
    columns = ['Epoch', 'mean', 'std']
    eval_result = pd.DataFrame(columns=columns)
    eval_output_csv = os.path.join(variant['log_dir'], 'eval_result.csv')

    def post_epoch_func(self, epoch):
        # `eval_result` is reassigned here, so it must be declared nonlocal.
        nonlocal eval_result
        print('-------------post_epoch_func start-------------')
        eval_result = my_eval_policy(
            env=get_env(),
            algorithm=self,
            epoch=epoch,
            eval_result=eval_result,
            output_csv=eval_output_csv,
        )
        print('-------------post_epoch_func done-------------')

    algorithm.post_epoch_funcs = [
        post_epoch_func,
    ]
    algorithm.to(ptu.device)
    algorithm.train()
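
# A minimal `variant` sketch for experiment(). The keys mirror exactly what
# the function reads above; the numeric values are illustrative defaults, not
# tuned values from this project, and algorithm_kwargs follows the standard
# rlkit TorchBatchRLAlgorithm signature.
EXAMPLE_EXPERIMENT_VARIANT = dict(
    layer_size=256,
    replay_buffer_size=int(1e6),
    log_dir='./output/experiment_out/',
    algorithm_kwargs=dict(
        num_epochs=100,
        num_eval_steps_per_epoch=1000,
        num_expl_steps_per_train_loop=1000,
        num_trains_per_train_loop=1000,
        min_num_steps_before_training=1000,
        max_path_length=250,
        batch_size=256,
    ),
)
# experiment(EXAMPLE_EXPERIMENT_VARIANT)  # log_dir must exist before running
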
def train_model(variant):
    gt.reset_root()
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    log_dir = f"./output/train_out_{timestamp}/"
    setup_logger('name-of-experiment',
                 variant=variant,
                 snapshot_mode='gap_and_last',
                 snapshot_gap=20,
                 log_dir=log_dir)

    expl_env_kwargs = variant['expl_env_kwargs']
    eval_env_kwargs = variant['eval_env_kwargs']
    trainer_kwargs = variant['trainer_kwargs']

    # Train on the training returns; evaluate on the held-out validation
    # returns. Snapshot all three frames alongside the logs.
    df_ret_train, df_ret_val, df_feature = load_dataset()
    df_ret_train.to_csv(os.path.join(log_dir, 'df_ret_train.csv'))
    df_ret_val.to_csv(os.path.join(log_dir, 'df_ret_val.csv'))
    df_feature.to_csv(os.path.join(log_dir, 'df_feature.csv'))

    expl_env = NormalizedBoxEnv(
        gym.make('MarketEnv-v0',
                 returns=df_ret_train,
                 features=df_feature,
                 **expl_env_kwargs))
    eval_env = NormalizedBoxEnv(
        gym.make('MarketEnv-v0',
                 returns=df_ret_val,
                 features=df_feature,
                 **eval_env_kwargs))

    def post_epoch_func(self, epoch):
        # Plot n-epoch moving averages of the KPIs that rlkit logs to
        # progress.csv, one figure per KPI.
        progress_csv = os.path.join(log_dir, 'progress.csv')
        df = pd.read_csv(progress_csv)
        kpis = ['cagr', 'dd', 'mdd', 'wealths', 'std']
        srcs = ['evaluation', 'exploration']
        n = 50
        for kpi in kpis:
            series = [df[f'{s}/env_infos/final/{kpi} Mean'] for s in srcs]
            plot_ma(series=series, labels=srcs, title=kpi, n=n)
            plt.savefig(os.path.join(log_dir, f'{kpi}.png'))
            plt.close()

    trainer = get_trainer(env=eval_env, **trainer_kwargs)
    policy = trainer.policy
    eval_policy = MakeDeterministic(policy)
    # eval_policy = policy  # stochastic evaluation alternative
    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )
    expl_path_collector = MdpPathCollector(
        expl_env,
        policy,
    )
    replay_buffer = EnvReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
    )
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs'],
    )
    algorithm.post_epoch_funcs = [
        post_epoch_func,
    ]
    algorithm.to(ptu.device)
    algorithm.train()
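
# Hedged entry-point sketch for train_model(). It assumes get_trainer()
# forwards trainer_kwargs to rlkit's SACTrainer (these are SACTrainer's
# standard hyperparameters); the MarketEnv-v0 kwargs are project-specific and
# are left empty here. All values are illustrative, not tuned.
if __name__ == '__main__':
    example_variant = dict(
        replay_buffer_size=int(1e6),
        expl_env_kwargs=dict(),
        eval_env_kwargs=dict(),
        trainer_kwargs=dict(
            discount=0.99,
            soft_target_tau=5e-3,
            target_update_period=1,
            policy_lr=3e-4,
            qf_lr=3e-4,
            reward_scale=1.0,
            use_automatic_entropy_tuning=True,
        ),
        algorithm_kwargs=dict(
            num_epochs=100,
            num_eval_steps_per_epoch=1000,
            num_expl_steps_per_train_loop=1000,
            num_trains_per_train_loop=1000,
            min_num_steps_before_training=1000,
            max_path_length=250,
            batch_size=256,
        ),
    )
    train_model(example_variant)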