def demo_custom_env_finance_rl():
    from elegantrl2.agent import AgentPPO

    '''choose a DRL algorithm'''
    args = Arguments(if_on_policy=True)
    args.agent = AgentPPO()
    args.agent.lambda_entropy = 0.02  # todo ceta2
    args.gpu_id = int(sys.argv[-1][-4])
    args.random_seed = 1943210

    "TotalStep: 10e4, TargetReturn: 3.0, UsedTime: 200s, FinanceStock-v1"
    "TotalStep: 20e4, TargetReturn: 4.0, UsedTime: 400s, FinanceStock-v1"
    "TotalStep: 30e4, TargetReturn: 4.2, UsedTime: 600s, FinanceStock-v1"
    from envs.FinRL.StockTrading import StockTradingEnv, StockTradingVecEnv
    args.gamma = 0.999
    # args.env = StockTradingEnv(if_eval=False, gamma=args.gamma)
    args.env = StockTradingVecEnv(if_eval=False, gamma=args.gamma, env_num=2)
    args.env_eval = StockTradingEnv(if_eval=True, gamma=args.gamma)

    args.net_dim = 2 ** 9
    args.batch_size = args.net_dim * 4
    args.target_step = args.env.max_step * 2  # todo ceta0
    args.repeat_times = 2 ** 4
    args.eval_gap = 2 ** 8
    args.eval_times1 = 2 ** 0
    args.eval_times2 = 2 ** 1
    args.break_step = int(16e6)

    '''train and evaluate'''
    # train_and_evaluate(args)
    args.worker_num = 2
    train_and_evaluate_mp(args)
def demo_continuous_action_on_policy():
    args = Arguments(if_on_policy=True)  # hyper-parameters of on-policy differ from off-policy
    from elegantrl2.agent import AgentPPO
    args.agent = AgentPPO()
    args.agent.cri_target = True

    '''choose environment'''
    if_train_pendulum = 0
    if if_train_pendulum:
        "TotalStep: 4e5, TargetReward: -200, UsedTime: 400s"
        env = gym.make('Pendulum-v0')
        env.target_return = -200  # set target_return manually for env 'Pendulum-v0'
        args.env = PreprocessEnv(env=env)
        args.reward_scale = 2 ** -3  # RewardRange: -1800 < -200 < -50 < 0
        args.net_dim = 2 ** 7
        args.batch_size = args.net_dim * 2
        args.target_step = args.env.max_step * 16

    if_train_lunar_lander = 0
    if if_train_lunar_lander:
        "TotalStep: 4e5, TargetReward: 200, UsedTime: 900s"
        args.env = PreprocessEnv(env=gym.make('LunarLanderContinuous-v2'))
        args.reward_scale = 2 ** 0  # RewardRange: -800 < -200 < 200 < 302

    if_train_bipedal_walker = 1
    if if_train_bipedal_walker:
        "TotalStep: 8e5, TargetReward: 300, UsedTime: 1800s"
        args.env = PreprocessEnv(env=gym.make('BipedalWalker-v3'))
        args.reward_scale = 2 ** 0  # RewardRange: -200 < -150 < 300 < 334
        args.gamma = 0.97
        args.if_per_or_gae = True
        # args.agent.lambda_entropy = 0.05

    '''train and evaluate'''
    train_and_evaluate(args)
def check_agent():
    from elegantrl2.agent import AgentPPO as Agent
    agent = Agent()

    net_dim = 2 ** 7
    state_dim = 8
    action_dim = 2
    agent.init(net_dim, state_dim, action_dim, learning_rate=1e-4, if_use_gae=False, gpu_id=0)

    # print(agent.act.state_dict())
    for key, value in agent.act.state_dict().items():
        print(key, value.shape)
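# A minimal sketch (not part of the original demos): count the actor's trainable
# parameters with plain PyTorch, reusing the same `agent.init(...)` call and the
# `agent.act` module that check_agent() above inspects. The helper name is hypothetical.
def check_agent_param_count():
    from elegantrl2.agent import AgentPPO as Agent
    agent = Agent()
    agent.init(net_dim=2 ** 7, state_dim=8, action_dim=2,
               learning_rate=1e-4, if_use_gae=False, gpu_id=0)

    # agent.act is a torch.nn.Module, so .parameters() yields its learnable tensors
    n_params = sum(p.numel() for p in agent.act.parameters())
    print(f'| actor parameter count: {n_params}')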
def demo_continuous_action_on_policy_temp_mg():
    args = Arguments(if_on_policy=True)  # hyper-parameters of on-policy differ from off-policy
    from elegantrl2.agent import AgentPPO
    args.agent = AgentPPO()
    args.agent.cri_target = False  # todo ceta2
    args.learning_rate = 2 ** -14
    args.random_seed = 1943

    # args.gpu_id = (0, 1, 2, 3)
    args.gpu_id = (0, 1)  # (2, 3)
    # args.gpu_id = (2, 3)
    # args.gpu_id = int(sys.argv[-1][-4])
    args.random_seed = 1549

    '''choose environment'''
    if_train_pendulum = 0
    if if_train_pendulum:
        "TotalStep: 4e5, TargetReward: -200, UsedTime: 400s"
        env = gym.make('Pendulum-v0')
        env.target_return = -200  # set target_return manually for env 'Pendulum-v0'
        args.env = PreprocessEnv(env=env)
        args.reward_scale = 2 ** -3  # RewardRange: -1800 < -200 < -50 < 0
        args.net_dim = 2 ** 7
        args.batch_size = args.net_dim * 2
        args.target_step = args.env.max_step * 16

    if_train_bipedal_walker = 1
    if if_train_bipedal_walker:
        "TotalStep: 8e5, TargetReward: 300, UsedTime: 1800s"
        env_name = 'BipedalWalker-v3'
        env = gym.make(env_name)
        env.target_return = -50  # todo test
        args.eval_gap = 2 ** 5  # todo test
        args.env = PreprocessEnv(env=env)
        # args.env = PreprocessVecEnv(env=env, env_num=2)
        # args.env_eval = PreprocessEnv(env=env_name)
        args.reward_scale = 2 ** 0  # RewardRange: -200 < -150 < 300 < 334
        args.gamma = 0.97
        args.target_step = args.env.max_step * 4
        args.repeat_times = 2 ** 4
        args.if_per_or_gae = True
        args.agent.lambda_entropy = 0.05
        args.break_step = int(8e6)

    # if_train_finance_rl = 1
    # if if_train_finance_rl:

    '''train and evaluate'''
    # train_and_evaluate(args)
    args.worker_num = 2
    if isinstance(args.gpu_id, (int, str)):
        train_and_evaluate_mp(args)
    elif isinstance(args.gpu_id, (tuple, list)):
        train_and_evaluate_mg(args)
    else:
        print(f"Error in args.gpu_id {args.gpu_id}, type {type(args.gpu_id)}")
def demo_custom_env_finance_rl_nas89():
    args = Arguments(if_on_policy=True)  # hyper-parameters of on-policy differ from off-policy
    args.random_seed = 1943

    from elegantrl2.agent import AgentPPO
    args.agent = AgentPPO()
    args.agent.cri_target = True
    args.agent.lambda_entropy = 0.04

    from envs.FinRL.StockTrading import StockEnvDOW30, StockEnvNAS89, StockVecEnvNAS89
    args.gamma = 0.999

    if_dow30_daily = 1
    if if_dow30_daily:
        args.env = StockEnvDOW30(if_eval=False, gamma=args.gamma)
        args.env_eval = StockEnvDOW30(if_eval=True, gamma=args.gamma)
    else:  # elif if_nas89_minute:
        args.env = StockEnvNAS89(if_eval=False, gamma=args.gamma)
        args.env_eval = StockEnvNAS89(if_eval=True, gamma=args.gamma)

    args.repeat_times = 2 ** 4
    args.learning_rate = 2 ** -14
    args.net_dim = int(2 ** 8 * 1.5)
    args.batch_size = args.net_dim * 4
    args.target_step = args.env.max_step

    args.eval_gap = 2 ** 8
    args.eval_times1 = 2 ** 0
    args.eval_times2 = 2 ** 1
    args.break_step = int(16e6)
    args.if_allow_break = False

    if_single_env = 1
    if if_single_env:
        args.gpu_id = 0
        args.worker_num = 4
        train_and_evaluate_mp(args)

    if_batch_env = 0
    if if_batch_env:
        args.env = StockVecEnvNAS89(if_eval=False, gamma=args.gamma, env_num=2)
        args.gpu_id = 3
        args.random_seed += args.gpu_id
        args.worker_num = 2
        train_and_evaluate_mp(args)

    if_multi_learner = 0
    if if_multi_learner:
        args.env = StockVecEnvNAS89(if_eval=False, gamma=args.gamma, env_num=2)
        args.gpu_id = (0, 1)
        args.worker_num = 2
        train_and_evaluate_mg(args)
def demo_continuous_action_on_policy():
    args = Arguments(if_on_policy=True)  # hyper-parameters of on-policy differ from off-policy
    from elegantrl2.agent import AgentPPO
    args.agent = AgentPPO()
    args.gpu_id = int(sys.argv[-1][-4])
    args.agent.cri_target = True
    args.learning_rate = 2 ** -14
    args.random_seed = 1943

    '''choose environment'''
    if_train_pendulum = 0
    if if_train_pendulum:
        "TotalStep: 4e5, TargetReward: -200, UsedTime: 400s"
        env = gym.make('Pendulum-v0')
        env.target_return = -200  # set target_return manually for env 'Pendulum-v0'
        args.env = PreprocessEnv(env=env)
        args.reward_scale = 2 ** -3  # RewardRange: -1800 < -200 < -50 < 0
        args.net_dim = 2 ** 7
        args.batch_size = args.net_dim * 2
        args.target_step = args.env.max_step * 16

    if_train_lunar_lander = 0
    if if_train_lunar_lander:
        "TotalStep: 6e5, TargetReward: 200, UsedTime: 800s"
        env_name = 'LunarLanderContinuous-v2'
        # args.env = PreprocessEnv(env=env_name)
        args.env = PreprocessVecEnv(env=env_name, env_num=2)
        args.env_eval = PreprocessEnv(env=env_name)
        args.reward_scale = 2 ** 0  # RewardRange: -800 < -200 < 200 < 302
        args.break_step = int(8e6)
        args.if_per_or_gae = True
        args.target_step = args.env.max_step * 8
        args.repeat_times = 2 ** 4

    if_train_bipedal_walker = 1
    if if_train_bipedal_walker:
        "TotalStep: 8e5, TargetReward: 300, UsedTime: 1800s"
        env_name = 'BipedalWalker-v3'
        # args.env = PreprocessEnv(env=env_name)
        args.env = PreprocessVecEnv(env=env_name, env_num=2)
        args.env_eval = PreprocessEnv(env=env_name)
        args.reward_scale = 2 ** 0  # RewardRange: -200 < -150 < 300 < 334
        args.gamma = 0.97
        args.target_step = args.env.max_step * 8
        args.repeat_times = 2 ** 4
        args.if_per_or_gae = True
        args.agent.lambda_entropy = 0.05
        args.break_step = int(8e6)

    '''train and evaluate'''
    # train_and_evaluate(args)
    args.worker_num = 2
    train_and_evaluate_mp(args)
def demo_custom_env_finance_rl_dow30():  # 1.7+ 2.0+
    args = Arguments(if_on_policy=True)  # hyper-parameters of on-policy differ from off-policy
    args.random_seed = 19430

    from elegantrl2.agent import AgentPPO
    args.agent = AgentPPO()
    args.agent.cri_target = True
    args.agent.lambda_entropy = 0.02

    args.gamma = 0.995
    from envs.FinRL.StockTrading import StockEnvDOW30, StockVecEnvDOW30
    args.env = StockEnvDOW30(if_eval=False, gamma=args.gamma)
    args.env_eval = StockEnvDOW30(if_eval=True, gamma=args.gamma)

    args.repeat_times = 2 ** 4
    args.learning_rate = 2 ** -14
    args.net_dim = 2 ** 8
    args.batch_size = args.net_dim

    args.eval_gap = 2 ** 7
    args.eval_times1 = 2 ** 0
    args.eval_times2 = 2 ** 1
    args.break_step = int(10e6)
    args.if_allow_break = False

    if_single_env = 0
    if if_single_env:
        args.gpu_id = int(sys.argv[-1][-4])
        args.random_seed += int(args.gpu_id)
        args.target_step = args.env.max_step * 4
        args.worker_num = 4
        train_and_evaluate_mp(args)

    if_batch_env = 1
    if if_batch_env:
        args.env = StockVecEnvDOW30(if_eval=False, gamma=args.gamma, env_num=4)
        args.gpu_id = int(sys.argv[-1][-4])
        args.random_seed += args.gpu_id
        args.target_step = args.env.max_step
        args.worker_num = 4
        train_and_evaluate_mp(args)

    if_multi_learner = 0
    if if_multi_learner:
        args.env = StockVecEnvDOW30(if_eval=False, gamma=args.gamma, env_num=2)
        args.gpu_id = (0, 1)
        args.worker_num = 2
        train_and_evaluate_mg(args)
def demo_custom_env_finance_rl():
    args = Arguments(if_on_policy=True)  # hyper-parameters of on-policy differ from off-policy
    args.random_seed = 0

    from elegantrl2.agent import AgentPPO
    args.agent = AgentPPO()
    args.agent.cri_target = True
    args.agent.lambda_entropy = 0.04

    from envs.FinRL.StockTrading import StockEnvNAS89, StockVecEnvNAS89
    args.gamma = 0.999
    args.env = StockEnvNAS89(if_eval=False, gamma=args.gamma)
    args.env_eval = StockEnvNAS89(if_eval=True, gamma=args.gamma)

    args.repeat_times = 2 ** 4
    args.learning_rate = 2 ** -14
    args.net_dim = int(2 ** 8 * 1.5)
    args.batch_size = args.net_dim * 4

    if_single_env = 0
    if if_single_env:
        args.gpu_id = 0
        args.worker_num = 4
        train_and_evaluate_mp(args)

    if_batch_env = 1
    if if_batch_env:
        args.env = StockVecEnvNAS89(if_eval=False, gamma=args.gamma, env_num=2)
        args.gpu_id = 0
        args.worker_num = 2
        train_and_evaluate_mp(args)

    if_multi_learner = 0
    if if_multi_learner:
        args.env = StockVecEnvNAS89(if_eval=False, gamma=args.gamma, env_num=2)
        args.gpu_id = (0, 1)
        args.worker_num = 2
        train_and_evaluate_mg(args)

    "TotalStep: 52e5, TargetReturn: 2.35, UsedTime: 3934s, FinanceStock-v2"
    "TotalStep: 81e5, TargetReturn: 2.47, UsedTime: 6129s, FinanceStock-v2"
    "TotalStep: 19e5, TargetReturn: 2.50, UsedTime: 1654s, FinanceStock-v2 GPU 2, 3"
    "TotalStep: 65e5, TargetReturn: 4.61, UsedTime: 5659s, FinanceStock-v2 GPU 2, 3"
    "TotalStep: 18e5, TargetReturn: 2.50, UsedTime: 1452s, FinanceStock-v2 GPU 0, 1"
    "TotalStep: 61e5, TargetReturn: 3.92, UsedTime: 4921s, FinanceStock-v2 GPU 0, 1"
    "TotalStep: 4e5, TargetReturn: 2.20, UsedTime: 583s, FinanceStock-v2 GPU 0, 1, 2, 3"
    "TotalStep: 11e6, TargetReturn: 4.39, UsedTime: 9648s, FinanceStock-v2 GPU 0, 1, 2, 3"
def demo_custom_env_finance_rl():
    args = Arguments(if_on_policy=True)  # hyper-parameters of on-policy differ from off-policy
    args.random_seed = 0

    from elegantrl2.agent import AgentPPO
    args.agent = AgentPPO()
    args.agent.cri_target = True
    args.agent.lambda_entropy = 0.04

    from envs.FinRL.StockTrading import StockEnvNAS89, StockVecEnvNAS89
    args.gamma = 0.999
    args.env = StockEnvNAS89(if_eval=False, gamma=args.gamma)
    args.eval_env = StockEnvNAS89(if_eval=True, gamma=args.gamma)

    args.repeat_times = 2 ** 4
    args.learning_rate = 2 ** -14
    args.net_dim = int(2 ** 8 * 1.5)
    args.batch_size = args.net_dim * 4

    if_single_env = 0
    if if_single_env:
        args.gpu_id = 0
        args.worker_num = 4
        train_and_evaluate_mp(args)

    if_batch_env = 1
    if if_batch_env:
        args.env = StockVecEnvNAS89(if_eval=False, gamma=args.gamma, env_num=2)
        args.gpu_id = 0
        args.worker_num = 2
        train_and_evaluate_mp(args)

    if_multi_learner = 0
    if if_multi_learner:
        args.env = StockVecEnvNAS89(if_eval=False, gamma=args.gamma, env_num=2)
        args.gpu_id = (0, 1)
        args.worker_num = 2
        train_and_evaluate_mg(args)
def demo_get_video_to_watch_gym_render():
    import cv2  # pip3 install opencv-python
    import gym  # pip3 install gym==0.17 pyglet==1.5.0  (env.render() bug in gym==0.18, pyglet==1.6)
    import torch

    """parameters"""
    env_name = 'LunarLanderContinuous-v2'
    env = PreprocessEnv(env=gym.make(env_name))

    '''initialize agent'''
    agent = None  # None means use random action
    if agent is None:  # use random action
        device = None
    else:
        from elegantrl2.agent import AgentPPO
        agent = AgentPPO()  # use the policy network saved in cwd
        cwd = f'./{env_name}_{agent.__class__.__name__}/'  # current working directory path
        net_dim = 2 ** 9  # 2 ** 7
        state_dim = env.state_dim
        action_dim = env.action_dim
        agent.init(net_dim, state_dim, action_dim)
        agent.save_load_model(cwd=cwd, if_save=False)
        device = agent.device

    '''initialize evaluation and env.render()'''
    save_frame_dir = ''  # empty string: don't save video, just open the env.render() window
    # save_frame_dir = 'frames'  # save frames into this directory
    if save_frame_dir:
        os.makedirs(save_frame_dir, exist_ok=True)

    state = env.reset()
    episode_return = 0
    step = 0
    for i in range(2 ** 10):
        print(i) if i % 128 == 0 else None
        for j in range(1):
            if agent is None:
                action = env.action_space.sample()
            else:
                s_tensor = torch.as_tensor((state,), dtype=torch.float32, device=device)
                a_tensor = agent.act(s_tensor)
                action = a_tensor.detach().cpu().numpy()[0]  # with 'torch.no_grad()', '.detach()' is not needed
            next_state, reward, done, _ = env.step(action)

            episode_return += reward
            step += 1

            if done:
                print(f'{i:>6}, {step:6.0f}, {episode_return:8.3f}, {reward:8.3f}')
                state = env.reset()
                episode_return = 0
                step = 0
            else:
                state = next_state

        if save_frame_dir:
            frame = env.render('rgb_array')
            cv2.imwrite(f'{save_frame_dir}/{i:06}.png', frame)
            cv2.imshow('OpenCV Window', frame)
            cv2.waitKey(1)
        else:
            env.render()
    env.close()

    '''convert frames png/jpg to video mp4/avi using ffmpeg'''
    if save_frame_dir:
        frame_shape = cv2.imread(f'{save_frame_dir}/{3:06}.png').shape
        print(f"frame_shape: {frame_shape}")

        save_video = 'gym_render.mp4'
        print(f"| Convert frames to video using ffmpeg. Save in {save_video}")
        os.system(f'ffmpeg -r 60 -f image2 -s {frame_shape[1]}x{frame_shape[0]} '  # ffmpeg -s expects WIDTHxHEIGHT
                  f'-i ./{save_frame_dir}/%06d.png '
                  f'-crf 25 -vb 20M -pix_fmt yuv420p {save_video}')
def get_video_to_watch_gym_render():
    import cv2  # pip3 install opencv-python
    import gym  # pip3 install gym==0.17 pyglet==1.5.0  (env.render() bug in gym==0.18, pyglet==1.6)
    import torch

    '''choose env'''
    import pybullet_envs  # for python-bullet-gym
    dir(pybullet_envs)
    # from elegantrl2.env import PreprocessEnv
    env_name = ['BipedalWalker-v3',
                'AntBulletEnv-v0',
                'KukaBulletEnv-v0',
                'ReacherBulletEnv-v0',
                'PusherBulletEnv-v0',
                'ThrowerBulletEnv-v0',
                'StrikerBulletEnv-v0'][1]
    env = PreprocessEnv(env=gym.make(env_name))

    '''initialize agent'''
    agent = None

    from elegantrl2.agent import AgentPPO
    agent = AgentPPO()
    agent.if_use_dn = True
    net_dim = 2 ** 8
    cwd = f'./{env_name}_4/'

    # from elegantrl2.agent import AgentModSAC
    # agent = AgentModSAC()
    # agent.if_use_dn = True
    # net_dim = 2 ** 8
    # cwd = f'./{env_name}_2/'

    device = None
    if agent is not None:
        state_dim = env.state_dim
        action_dim = env.action_dim
        agent.init(net_dim, state_dim, action_dim)
        agent.save_load_model(cwd=cwd, if_save=False)
        device = agent.device

    rd.seed(194686)
    torch.manual_seed(1942876)

    '''initialize evaluation and env.render()'''
    save_frame_dir = 'frames'
    if save_frame_dir:
        os.makedirs(save_frame_dir, exist_ok=True)

    state = env.reset()
    episode_return = 0
    step = 0
    for i in range(2 ** 9):
        print(i) if i % 128 == 0 else None
        for j in range(1):
            if agent is not None:
                s_tensor = torch.as_tensor((state,), dtype=torch.float32, device=device)
                a_tensor = agent.act(s_tensor)
                action = a_tensor.detach().cpu().numpy()[0]  # with 'torch.no_grad()', '.detach()' is not needed
            else:
                action = env.action_space.sample()
            next_state, reward, done, _ = env.step(action)

            episode_return += reward
            step += 1

            if done:
                print(f'{i:>6}, {step:6.0f}, {episode_return:8.3f}, {reward:8.3f}')
                state = env.reset()
                episode_return = 0
                step = 0
            else:
                state = next_state

        frame = env.render('rgb_array')
        frame = frame[50:210, 50:270]  # crop the (240, 320) frame of AntBulletEnv-v0
        # frame = cv2.resize(frame[:, :500], (500 // 2, 720 // 2))
        cv2.imwrite(f'{save_frame_dir}/{i:06}.png', frame)
        cv2.imshow('', frame)
        cv2.waitKey(1)
    env.close()
    # exit()

    '''convert frames png/jpg to video mp4/avi using ffmpeg'''
    if save_frame_dir:
        frame_shape = cv2.imread(f'{save_frame_dir}/{3:06}.png').shape
        print(f"frame_shape: {frame_shape}")

        save_video = 'gym_render.mp4'
        print(f"| Convert frames to video using ffmpeg. Save in {save_video}")
        os.system(f'ffmpeg -r 60 -f image2 -s {frame_shape[1]}x{frame_shape[0]} '  # ffmpeg -s expects WIDTHxHEIGHT
                  f'-i ./{save_frame_dir}/%06d.png '
                  f'-crf 25 -vb 20M -pix_fmt yuv420p {save_video}')
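# A minimal sketch (an assumption, not part of the original workflow): stitch the frames
# saved by the render demos above into an .mp4 with OpenCV's VideoWriter instead of
# shelling out to ffmpeg. The helper name and default arguments are hypothetical; it only
# assumes the %06d.png frames already exist in `save_frame_dir`.
def frames_to_video_cv2(save_frame_dir='frames', save_video='gym_render_cv2.mp4', fps=60):
    import os
    import cv2  # pip3 install opencv-python

    frame_names = sorted(name for name in os.listdir(save_frame_dir) if name.endswith('.png'))
    first_frame = cv2.imread(os.path.join(save_frame_dir, frame_names[0]))
    height, width = first_frame.shape[:2]  # cv2.VideoWriter expects (width, height)

    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    writer = cv2.VideoWriter(save_video, fourcc, fps, (width, height))
    for name in frame_names:
        writer.write(cv2.imread(os.path.join(save_frame_dir, name)))
    writer.release()
    print(f"| Saved video: {save_video}")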
def check_stock_trading_env():
    if_eval = True  # False
    env = StockTradingEnv(if_eval=if_eval)
    action_dim = env.action_dim

    state = env.reset()
    print('| check_stock_trading_env, state_dim', len(state))

    from time import time
    timer = time()

    # ============================================================
    policy_name = 'Random Action 1e-2'
    step = 1
    done = False
    episode_return = 0
    # state = env.reset()  # the env was reset just above
    while not done:
        action = rd.uniform(-1, 1, size=action_dim) * 1e-2
        next_state, reward, done, _ = env.step(action)
        # print(';', len(next_state), env.day, reward)
        episode_return += reward
        step += 1

    print()
    print(f"| {policy_name}:")
    print(f"| step {step}, UsedTime {time() - timer:.3e}")
    print(f"| gamma_reward \t\t\t{env.gamma_reward:.3e}")
    print(f"| episode return \t\t{episode_return:.3e}")
    print(f"| discount return \t\t{episode_return / step / (1 - env.gamma):.3e}")
    print(f"| env episode return \t{env.episode_return:.3e}")

    # ============================================================
    policy_name = 'Buy 4 Action'
    step = 1
    done = False
    episode_return = 0
    state = env.reset()  # reset before the second rollout
    while not done:
        action = np.zeros(action_dim)
        action[:3] = 1
        next_state, reward, done, _ = env.step(action)
        # print(';', len(next_state), env.day, reward)
        episode_return += reward
        step += 1

    print()
    print(f"| {policy_name}:")
    print(f"| step {step}, UsedTime {time() - timer:.3e}")
    print(f"| gamma_reward \t\t\t{env.gamma_reward:.3e}")
    print(f"| episode return \t\t{episode_return:.3e}")
    print(f"| discount return \t\t{episode_return / step / (1 - env.gamma):.3e}")
    print(f"| env episode return \t{env.episode_return:.3e}")

    # ============================================================
    '''draw_cumulative_return'''
    import torch
    from elegantrl2.agent import AgentPPO
    from elegantrl2.run import Arguments
    args = Arguments(if_on_policy=True)
    args.agent = AgentPPO()
    args.env = StockTradingEnv(if_eval=True)
    args.if_remove = False
    args.cwd = './StockTradingEnv-v1_AgentPPO'
    args.init_before_training()

    env.draw_cumulative_return(args, torch)
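# A minimal sketch (an assumption, not part of the original checks): a third baseline for
# check_stock_trading_env() that simply holds all positions (all-zero actions), using the
# same StockTradingEnv API as the rollouts above. The helper name is hypothetical.
def check_stock_trading_env_hold_baseline():
    from time import time
    env = StockTradingEnv(if_eval=True)
    action = np.zeros(env.action_dim)  # do nothing: neither buy nor sell

    timer = time()
    step = 1
    done = False
    episode_return = 0
    state = env.reset()
    while not done:
        state, reward, done, _ = env.step(action)
        episode_return += reward
        step += 1

    print(f"| Hold (zero action): step {step}, UsedTime {time() - timer:.3e}, "
          f"episode return {episode_return:.3e}")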
def demo_custom_env_finance_rl_nas89():  # 1.7+ 2.0+
    args = Arguments(if_on_policy=True)  # hyper-parameters of on-policy differ from off-policy
    args.random_seed = 19430

    from elegantrl2.agent import AgentPPO
    args.agent = AgentPPO()
    args.agent.lambda_entropy = 0.02

    from envs.FinRL.StockTrading import StockEnvNAS89
    args.gamma = 0.999
    args.env = StockEnvNAS89(if_eval=False, gamma=args.gamma, turbulence_thresh=30)
    args.eval_env = StockEnvNAS89(if_eval=True, gamma=args.gamma, turbulence_thresh=15)

    args.net_dim = 2 ** 9
    args.repeat_times = 2 ** 4
    args.learning_rate = 2 ** -14
    args.batch_size = args.net_dim * 4

    args.eval_gap = 2 ** 8
    args.eval_times1 = 2 ** 0
    args.eval_times2 = 2 ** 1
    args.break_step = int(8e6)
    args.if_allow_break = False

    if_single_proc = 0
    if if_single_proc:
        args.gpu_id = int(sys.argv[-1][-4])
        args.random_seed += int(args.gpu_id)
        args.target_step = args.env.max_step * 4
        train_and_evaluate(args)

    if_single_env = 1
    if if_single_env:
        args.gpu_id = int(sys.argv[-1][-4])
        args.random_seed += int(args.gpu_id)
        args.target_step = args.env.max_step * 1
        args.worker_num = 4
        train_and_evaluate_mp(args)

    if_multi_learner = 0
    if if_multi_learner:
        args.gpu_id = (2, 3) if len(sys.argv) == 1 else eval(sys.argv[-1])  # python main.py -GPU 0,1
        args.repeat_times = 2 ** 4
        args.target_step = args.env.max_step
        args.worker_num = 4
        train_and_evaluate_mg(args)

    if_batch_env = 0
    if if_batch_env:
        from envs.FinRL.StockTrading import StockVecEnvNAS89
        args.env = StockVecEnvNAS89(if_eval=False, gamma=args.gamma, env_num=2)
        args.gpu_id = int(sys.argv[-1][-4])
        args.random_seed += args.gpu_id
        args.target_step = args.env.max_step
        args.worker_num = 4
        train_and_evaluate_mp(args)
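# A minimal sketch of an entry point (an assumption about how these demos are launched):
# several demos read the GPU id from `int(sys.argv[-1][-4])`, i.e. the fourth-to-last
# character of the last command-line token. The dummy token format below ('GPU_0.py')
# is hypothetical; it only satisfies that indexing so the demos run on GPU 0 by default.
if __name__ == '__main__':
    import sys
    if len(sys.argv) == 1:
        sys.argv.append('GPU_0.py')  # default token so sys.argv[-1][-4] == '0'
    demo_continuous_action_on_policy()  # replace with any other demo_*() function above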