Пример #1
0
def demo_custom_env_finance_rl():
    """Train PPO on the vectorized stock-trading environment (FinanceStock-v1).

    The GPU id is parsed from a digit of the last command-line argument.
    The no-op string literals record past benchmark results.
    """
    from elegantrl2.agent import AgentPPO
    '''choose an DRL algorithm'''
    args = Arguments(if_on_policy=True)
    args.agent = AgentPPO()
    args.agent.lambda_entropy = 0.02  # todo ceta2
    # Fixed: slicing sys.argv yields a one-character *string*; cast to int
    # for consistency with the other demos (e.g. int(sys.argv[-1][-4])).
    args.gpu_id = int(sys.argv[-1][-4])
    args.random_seed = 1943210

    "TotalStep: 10e4, TargetReturn: 3.0, UsedTime:  200s, FinanceStock-v1"
    "TotalStep: 20e4, TargetReturn: 4.0, UsedTime:  400s, FinanceStock-v1"
    "TotalStep: 30e4, TargetReturn: 4.2, UsedTime:  600s, FinanceStock-v1"
    # from envs.FinRL.StockTrading import StockTradingEnv
    args.gamma = 0.999
    # args.env = StockTradingEnv(if_eval=False, gamma=gamma)
    args.env = StockTradingVecEnv(if_eval=False, gamma=args.gamma, env_num=2)
    args.env_eval = StockTradingEnv(if_eval=True, gamma=args.gamma)

    args.net_dim = 2**9
    args.batch_size = args.net_dim * 4
    args.target_step = args.env.max_step * 2  # todo ceta0
    args.repeat_times = 2**4

    args.eval_gap = 2**8
    args.eval_times1 = 2**0
    args.eval_times2 = 2**1
    args.break_step = int(16e6)  # stop after 16M environment steps
    '''train and evaluate'''
    # train_and_evaluate(args)
    args.worker_num = 2
    train_and_evaluate_mp(args)
Пример #2
0
def demo_continuous_action_on_policy():
    """Run PPO on a continuous-action gym task (BipedalWalker-v3 enabled)."""
    from elegantrl2.agent import AgentPPO

    # hyper-parameters of on-policy is different from off-policy
    args = Arguments(if_on_policy=True)
    args.agent = AgentPPO()
    args.agent.cri_target = True

    '''choose environment'''
    train_pendulum = 0  # disabled preset
    if train_pendulum:
        "TotalStep: 4e5, TargetReward: -200, UsedTime: 400s"
        env = gym.make('Pendulum-v0')
        # 'Pendulum-v0' publishes no target reward, so set one by hand
        env.target_return = -200
        args.env = PreprocessEnv(env=env)
        args.reward_scale = 2**-3  # RewardRange: -1800 < -200 < -50 < 0
        args.net_dim = 2**7
        args.batch_size = args.net_dim * 2
        args.target_step = args.env.max_step * 16

    train_lunar_lander = 0  # disabled preset
    if train_lunar_lander:
        "TotalStep: 4e5, TargetReward: 200, UsedTime: 900s"
        args.env = PreprocessEnv(env=gym.make('LunarLanderContinuous-v2'))
        args.reward_scale = 2**0  # RewardRange: -800 < -200 < 200 < 302

    train_bipedal_walker = 1  # active preset
    if train_bipedal_walker:
        "TotalStep: 8e5, TargetReward: 300, UsedTime: 1800s"
        args.env = PreprocessEnv(env=gym.make('BipedalWalker-v3'))
        args.reward_scale = 2**0  # RewardRange: -200 < -150 < 300 < 334
        args.gamma = 0.97
        args.if_per_or_gae = True
        # args.agent.lambda_entropy = 0.05

    '''train and evaluate'''
    train_and_evaluate(args)
Пример #3
0
def check_agent():
    """Instantiate a PPO agent and print the shapes of its actor parameters."""
    from elegantrl2.agent import AgentPPO as Agent

    agent = Agent()
    agent.init(2**7,  # net_dim
               8,     # state_dim
               2,     # action_dim
               learning_rate=1e-4,
               if_use_gae=False,
               gpu_id=0)

    # print(agent.act.state_dict())
    for name, tensor in agent.act.state_dict().items():
        print(name, tensor.shape)
Пример #4
0
def demo_continuous_action_on_policy_temp_mg():
    """Multi-GPU PPO demo on continuous-action tasks (BipedalWalker enabled).

    Dispatches to multi-process training for a scalar ``gpu_id`` and to
    multi-GPU (mg) training for a tuple/list of GPU ids.
    """
    args = Arguments(if_on_policy=True)  # hyper-parameters of on-policy is different from off-policy
    from elegantrl2.agent import AgentPPO
    args.agent = AgentPPO()
    args.agent.cri_target = False  # todo ceta2
    args.learning_rate = 2 ** -14
    # args.gpu_id = (0, 1, 2, 3)
    args.gpu_id = (0, 1)  # (2, 3)
    # args.gpu_id = (2, 3)
    # args.gpu_id = int(sys.argv[-1][-4])
    # Fixed: random_seed was assigned twice (1943, then 1549); only the
    # final value ever took effect, so keep the single effective assignment.
    args.random_seed = 1549

    '''choose environment'''
    if_train_pendulum = 0
    if if_train_pendulum:
        "TotalStep: 4e5, TargetReward: -200, UsedTime: 400s"
        env = gym.make('Pendulum-v0')
        env.target_return = -200  # set target_reward manually for env 'Pendulum-v0'
        args.env = PreprocessEnv(env=env)
        args.reward_scale = 2 ** -3  # RewardRange: -1800 < -200 < -50 < 0
        args.net_dim = 2 ** 7
        args.batch_size = args.net_dim * 2
        args.target_step = args.env.max_step * 16

    if_train_bipedal_walker = 1
    if if_train_bipedal_walker:
        "TotalStep: 8e5, TargetReward: 300, UsedTime: 1800s"
        env_name = 'BipedalWalker-v3'
        env = gym.make(env_name)
        env.target_return = -50  # todo test
        args.eval_gap = 2 ** 5  # todo test
        args.env = PreprocessEnv(env=env)
        # args.env = PreprocessVecEnv(env=env, env_num=2)
        # args.env_eval = PreprocessEnv(env=env_name)
        args.reward_scale = 2 ** 0  # RewardRange: -200 < -150 < 300 < 334
        args.gamma = 0.97
        args.target_step = args.env.max_step * 4
        args.repeat_times = 2 ** 4
        args.if_per_or_gae = True
        args.agent.lambda_entropy = 0.05
        args.break_step = int(8e6)

    # if_train_finance_rl = 1
    # if if_train_finance_rl:

    '''train and evaluate'''
    # train_and_evaluate(args)
    args.worker_num = 2
    # idiomatic isinstance with a tuple of types instead of chained `or`s
    if isinstance(args.gpu_id, (int, str)):
        train_and_evaluate_mp(args)
    elif isinstance(args.gpu_id, (tuple, list)):
        train_and_evaluate_mg(args)
    else:
        print(f"Error in args.gpu_id {args.gpu_id}, type {type(args.gpu_id)}")
Пример #5
0
def demo_custom_env_finance_rl_nas89():
    """Train PPO on a FinRL stock-trading env (DOW30 daily or NAS89).

    One of three launch modes runs, toggled by the flags below:
    single env via multi-process workers (enabled), a batched vector
    env, or multi-GPU learners.
    """
    args = Arguments(
        if_on_policy=True
    )  # hyper-parameters of on-policy is different from off-policy
    args.random_seed = 1943

    from elegantrl2.agent import AgentPPO
    args.agent = AgentPPO()
    args.agent.cri_target = True  # use a target critic network
    args.agent.lambda_entropy = 0.04  # entropy-bonus coefficient

    from envs.FinRL.StockTrading import StockEnvDOW30, StockEnvNAS89, StockVecEnvNAS89
    args.gamma = 0.999  # long-horizon discount for trading rewards

    if_dow30_daily = 1  # 1: DOW30 env; 0: NAS89 env
    if if_dow30_daily:
        args.env = StockEnvDOW30(if_eval=False, gamma=args.gamma)
        args.env_eval = StockEnvDOW30(if_eval=True, gamma=args.gamma)
    else:  # elif if_nas89_minute:
        args.env = StockEnvNAS89(if_eval=False, gamma=args.gamma)
        args.env_eval = StockEnvNAS89(if_eval=True, gamma=args.gamma)

    args.repeat_times = 2**4
    args.learning_rate = 2**-14
    args.net_dim = int(2**8 * 1.5)
    args.batch_size = args.net_dim * 4
    args.target_step = args.env.max_step  # collect one full episode per update

    args.eval_gap = 2**8  # evaluation interval -- presumably seconds; confirm in run.py
    args.eval_times1 = 2**0
    args.eval_times2 = 2**1
    args.break_step = int(16e6)  # stop after 16M environment steps
    args.if_allow_break = False  # do not stop early on reaching target return

    if_single_env = 1
    if if_single_env:
        args.gpu_id = 0
        args.worker_num = 4
        train_and_evaluate_mp(args)

    if_batch_env = 0
    if if_batch_env:
        args.env = StockVecEnvNAS89(if_eval=False, gamma=args.gamma, env_num=2)
        args.gpu_id = 3
        args.random_seed += args.gpu_id  # decorrelate seeds across GPUs
        args.worker_num = 2
        train_and_evaluate_mp(args)

    if_multi_learner = 0
    if if_multi_learner:
        args.env = StockVecEnvNAS89(if_eval=False, gamma=args.gamma, env_num=2)
        args.gpu_id = (0, 1)
        args.worker_num = 2
        train_and_evaluate_mg(args)
Пример #6
0
def demo_continuous_action_on_policy():
    """Train PPO with vectorized envs on a continuous-action gym task.

    BipedalWalker-v3 is enabled; the Pendulum and LunarLander sections
    are kept as disabled presets. The GPU id is parsed from a digit of
    the last command-line argument.
    """
    args = Arguments(
        if_on_policy=True
    )  # hyper-parameters of on-policy is different from off-policy
    from elegantrl2.agent import AgentPPO
    args.agent = AgentPPO()
    # Fixed: slicing sys.argv yields a one-character *string*; cast to int
    # for consistency with the other demos (e.g. int(sys.argv[-1][-4])).
    args.gpu_id = int(sys.argv[-1][-4])
    args.agent.cri_target = True
    args.learning_rate = 2**-14
    args.random_seed = 1943
    '''choose environment'''
    if_train_pendulum = 0
    if if_train_pendulum:
        "TotalStep: 4e5, TargetReward: -200, UsedTime: 400s"
        env = gym.make('Pendulum-v0')
        env.target_return = -200  # set target_reward manually for env 'Pendulum-v0'
        args.env = PreprocessEnv(env=env)
        args.reward_scale = 2**-3  # RewardRange: -1800 < -200 < -50 < 0
        args.net_dim = 2**7
        args.batch_size = args.net_dim * 2
        args.target_step = args.env.max_step * 16

    if_train_lunar_lander = 0
    if if_train_lunar_lander:
        "TotalStep: 6e5, TargetReward: 200, UsedTime: 800s"
        env_name = 'LunarLanderContinuous-v2'
        # args.env = PreprocessEnv(env=env_name)
        args.env = PreprocessVecEnv(env=env_name, env_num=2)
        args.env_eval = PreprocessEnv(env=env_name)
        args.reward_scale = 2**0  # RewardRange: -800 < -200 < 200 < 302
        args.break_step = int(8e6)
        args.if_per_or_gae = True
        args.target_step = args.env.max_step * 8
        args.repeat_times = 2**4

    if_train_bipedal_walker = 1
    if if_train_bipedal_walker:
        "TotalStep: 8e5, TargetReward: 300, UsedTime: 1800s"
        env_name = 'BipedalWalker-v3'
        # args.env = PreprocessEnv(env=env_name)
        args.env = PreprocessVecEnv(env=env_name, env_num=2)
        args.env_eval = PreprocessEnv(env=env_name)
        args.reward_scale = 2**0  # RewardRange: -200 < -150 < 300 < 334
        args.gamma = 0.97
        args.target_step = args.env.max_step * 8
        args.repeat_times = 2**4
        args.if_per_or_gae = True
        args.agent.lambda_entropy = 0.05
        args.break_step = int(8e6)
    '''train and evaluate'''
    # train_and_evaluate(args)
    args.worker_num = 2
    train_and_evaluate_mp(args)
Пример #7
0
def demo_custom_env_finance_rl_dow30():  # 1.7+ 2.0+
    """Train PPO on the DOW30 stock-trading environment.

    Launch mode is selected by the flags below: single env with worker
    processes, a batched vector env (enabled), or multi-GPU learners.
    """
    args = Arguments(
        if_on_policy=True
    )  # hyper-parameters of on-policy is different from off-policy
    args.random_seed = 19430

    from elegantrl2.agent import AgentPPO
    args.agent = AgentPPO()
    args.agent.cri_target = True  # use a target critic network
    args.agent.lambda_entropy = 0.02  # entropy-bonus coefficient

    args.gamma = 0.995

    from envs.FinRL.StockTrading import StockEnvDOW30, StockVecEnvDOW30
    args.env = StockEnvDOW30(if_eval=False, gamma=args.gamma)
    args.env_eval = StockEnvDOW30(if_eval=True, gamma=args.gamma)

    args.repeat_times = 2**4
    args.learning_rate = 2**-14
    args.net_dim = 2**8
    args.batch_size = args.net_dim

    args.eval_gap = 2**7  # evaluation interval -- presumably seconds; confirm in run.py
    args.eval_times1 = 2**0
    args.eval_times2 = 2**1
    args.break_step = int(10e6)  # stop after 10M environment steps
    args.if_allow_break = False  # do not stop early on reaching target return

    if_single_env = 0
    if if_single_env:
        # GPU id is parsed from a digit of the last CLI argument
        args.gpu_id = int(sys.argv[-1][-4])
        args.random_seed += int(args.gpu_id)
        args.target_step = args.env.max_step * 4
        args.worker_num = 4
        train_and_evaluate_mp(args)

    if_batch_env = 1
    if if_batch_env:
        args.env = StockVecEnvDOW30(if_eval=False, gamma=args.gamma, env_num=4)
        args.gpu_id = int(sys.argv[-1][-4])
        args.random_seed += args.gpu_id  # decorrelate seeds across GPUs
        args.target_step = args.env.max_step
        args.worker_num = 4
        train_and_evaluate_mp(args)

    if_multi_learner = 0
    if if_multi_learner:
        args.env = StockVecEnvDOW30(if_eval=False, gamma=args.gamma, env_num=2)
        args.gpu_id = (0, 1)
        args.worker_num = 2
        train_and_evaluate_mg(args)
Пример #8
0
def demo_custom_env_finance_rl():
    """Train PPO on the NAS89 stock-trading environment.

    Launch mode is selected by the flags below (batched vector env is
    enabled). The trailing string literals are no-op records of past
    benchmark runs.
    """
    args = Arguments(
        if_on_policy=True
    )  # hyper-parameters of on-policy is different from off-policy
    args.random_seed = 0

    from elegantrl2.agent import AgentPPO
    args.agent = AgentPPO()
    args.agent.cri_target = True  # use a target critic network
    args.agent.lambda_entropy = 0.04  # entropy-bonus coefficient

    from envs.FinRL.StockTrading import StockEnvNAS89, StockVecEnvNAS89
    args.gamma = 0.999  # long-horizon discount for trading rewards
    args.env = StockEnvNAS89(if_eval=False, gamma=args.gamma)
    args.env_eval = StockEnvNAS89(if_eval=True, gamma=args.gamma)

    args.repeat_times = 2**4
    args.learning_rate = 2**-14
    args.net_dim = int(2**8 * 1.5)
    args.batch_size = args.net_dim * 4

    if_single_env = 0
    if if_single_env:
        args.gpu_id = 0
        args.worker_num = 4
        train_and_evaluate_mp(args)

    if_batch_env = 1
    if if_batch_env:
        args.env = StockVecEnvNAS89(if_eval=False, gamma=args.gamma, env_num=2)
        args.gpu_id = 0
        args.worker_num = 2
        train_and_evaluate_mp(args)

    if_multi_learner = 0
    if if_multi_learner:
        args.env = StockVecEnvNAS89(if_eval=False, gamma=args.gamma, env_num=2)
        args.gpu_id = (0, 1)
        args.worker_num = 2
        train_and_evaluate_mg(args)

    # Historical benchmark records (no-op string literals):
    "TotalStep: 52e5, TargetReturn: 2.35, UsedTime:  3934s, FinanceStock-v2"
    "TotalStep: 81e5, TargetReturn: 2.47, UsedTime:  6129s, FinanceStock-v2"
    "TotalStep: 19e5, TargetReturn: 2.50, UsedTime:  1654s, FinanceStock-v2 GPU 2, 3"
    "TotalStep: 65e5, TargetReturn: 4.61, UsedTime:  5659s, FinanceStock-v2 GPU 2, 3"
    "TotalStep: 18e5, TargetReturn: 2.50, UsedTime:  1452s, FinanceStock-v2 GPU 0, 1"
    "TotalStep: 61e5, TargetReturn: 3.92, UsedTime:  4921s, FinanceStock-v2 GPU 0, 1"
    "TotalStep:  4e5, TargetReturn: 2.20, UsedTime:   583s, FinanceStock-v2 GPU 0, 1, 2, 3"
    "TotalStep: 11e6, TargetReturn: 4.39, UsedTime:  9648s, FinanceStock-v2 GPU 0, 1, 2, 3"
Пример #9
0
def demo_custom_env_finance_rl():
    """Train PPO on the NAS89 stock-trading environment.

    Launch mode is selected by the flags below (batched vector env is
    enabled).
    """
    args = Arguments(
        if_on_policy=True
    )  # hyper-parameters of on-policy is different from off-policy
    args.random_seed = 0

    from elegantrl2.agent import AgentPPO
    args.agent = AgentPPO()
    args.agent.cri_target = True
    args.agent.lambda_entropy = 0.04

    from envs.FinRL.StockTrading import StockEnvNAS89, StockVecEnvNAS89
    args.gamma = 0.999
    args.env = StockEnvNAS89(if_eval=False, gamma=args.gamma)
    # Fixed: the attribute is `env_eval` everywhere else in this file;
    # the original `args.eval_env` would be silently ignored by the trainer.
    args.env_eval = StockEnvNAS89(if_eval=True, gamma=args.gamma)

    args.repeat_times = 2**4
    args.learning_rate = 2**-14
    args.net_dim = int(2**8 * 1.5)
    args.batch_size = args.net_dim * 4

    if_single_env = 0
    if if_single_env:
        args.gpu_id = 0
        args.worker_num = 4
        train_and_evaluate_mp(args)

    if_batch_env = 1
    if if_batch_env:
        args.env = StockVecEnvNAS89(if_eval=False, gamma=args.gamma, env_num=2)
        args.gpu_id = 0
        args.worker_num = 2
        train_and_evaluate_mp(args)

    if_multi_learner = 0
    if if_multi_learner:
        args.env = StockVecEnvNAS89(if_eval=False, gamma=args.gamma, env_num=2)
        args.gpu_id = (0, 1)
        args.worker_num = 2
        train_and_evaluate_mg(args)
Пример #10
0
def demo_get_video_to_watch_gym_render():
    """Roll out a random (or saved PPO) policy in a gym env and render it.

    If ``save_frame_dir`` is set, frames are written as PNGs and then
    assembled into an MP4 with ffmpeg; otherwise ``env.render()`` opens
    a live window.
    """
    import cv2  # pip3 install opencv-python
    import gym  # pip3 install gym==0.17 pyglet==1.5.0  # env.render() bug in gym==0.18, pyglet==1.6
    import torch

    """parameters"""
    env_name = 'LunarLanderContinuous-v2'
    env = PreprocessEnv(env=gym.make(env_name))

    '''initialize agent'''
    agent = None  # means use random action
    if agent is None:  # use random action
        device = None
    else:
        from elegantrl2.agent import AgentPPO
        agent = AgentPPO()  # means use the policy network which saved in cwd
        cwd = f'./{env_name}_{agent.__class__.__name__}/'  # current working directory path

        net_dim = 2 ** 9  # 2 ** 7
        state_dim = env.state_dim
        action_dim = env.action_dim

        agent.init(net_dim, state_dim, action_dim)
        agent.save_load_model(cwd=cwd, if_save=False)  # if_save=False means load
        device = agent.device

    '''initialize evaluete and env.render()'''
    save_frame_dir = ''  # means don't save video, just open the env.render()
    # save_frame_dir = 'frames'  # means save video in this directory
    if save_frame_dir:
        os.makedirs(save_frame_dir, exist_ok=True)

    state = env.reset()
    episode_return = 0
    step = 0
    for i in range(2 ** 10):
        if i % 128 == 0:  # progress heartbeat
            print(i)
        for j in range(1):
            if agent is None:
                action = env.action_space.sample()
            else:
                s_tensor = torch.as_tensor((state,), dtype=torch.float32, device=device)
                a_tensor = agent.act(s_tensor)
                action = a_tensor.detach().cpu().numpy()[0]  # if use 'with torch.no_grad()', then '.detach()' not need.
            next_state, reward, done, _ = env.step(action)

            episode_return += reward
            step += 1

            if done:
                print(f'{i:>6}, {step:6.0f}, {episode_return:8.3f}, {reward:8.3f}')
                state = env.reset()
                episode_return = 0
                step = 0
            else:
                state = next_state

        if save_frame_dir:
            frame = env.render('rgb_array')
            cv2.imwrite(f'{save_frame_dir}/{i:06}.png', frame)
            cv2.imshow('OpenCV Window', frame)
            cv2.waitKey(1)
        else:
            env.render()
    env.close()

    '''convert frames png/jpg to video mp4/avi using ffmpeg'''
    if save_frame_dir:
        frame_shape = cv2.imread(f'{save_frame_dir}/{3:06}.png').shape
        print(f"frame_shape: {frame_shape}")

        save_video = 'gym_render.mp4'
        # Fixed: this status message was passed to os.system(), where the
        # leading '|' is a shell syntax error; it is a log line, so print it.
        print(f"| Convert frames to video using ffmpeg. Save in {save_video}")
        # Fixed: cv2 images have shape (height, width, channels) but ffmpeg's
        # `-s` option expects WIDTHxHEIGHT, so the indices were swapped.
        os.system(f'ffmpeg -r 60 -f image2 -s {frame_shape[1]}x{frame_shape[0]} '
                  f'-i ./{save_frame_dir}/%06d.png '
                  f'-crf 25 -vb 20M -pix_fmt yuv420p {save_video}')
Пример #11
0
def get_video_to_watch_gym_render():
    """Roll out a saved PPO policy in a PyBullet/gym env, save cropped
    frames as PNGs, and assemble them into an MP4 with ffmpeg."""
    import cv2  # pip3 install opencv-python
    import gym  # pip3 install gym==0.17 pyglet==1.5.0  # env.render() bug in gym==0.18, pyglet==1.6
    import torch
    '''choose env'''
    import pybullet_envs  # for python-bullet-gym
    dir(pybullet_envs)  # touch the module so the env-registering import is kept

    # from elegantrl2.env import PreprocessEnv
    env_name = [
        'BipedalWalker-v3', 'AntBulletEnv-v0', 'KukaBulletEnv-v0',
        'ReacherBulletEnv-v0', 'PusherBulletEnv-v0', "ThrowerBulletEnv-v0",
        "StrikerBulletEnv-v0"
    ][1]
    env = PreprocessEnv(env=gym.make(env_name))
    '''initialize agent'''
    agent = None

    from elegantrl2.agent import AgentPPO
    agent = AgentPPO()
    agent.if_use_dn = True
    net_dim = 2**8
    cwd = f'./{env_name}_4/'  # directory holding the saved model

    # from elegantrl2.agent import AgentModSAC
    # agent = AgentModSAC()
    # agent.if_use_dn = True
    # net_dim = 2 ** 8
    # cwd = f'./{env_name}_2/'

    device = None
    if agent is not None:
        state_dim = env.state_dim
        action_dim = env.action_dim
        agent.init(net_dim, state_dim, action_dim)
        agent.save_load_model(cwd=cwd, if_save=False)  # if_save=False means load
        device = agent.device
        rd.seed(194686)
        torch.manual_seed(1942876)
    '''initialize evaluete and env.render()'''
    save_frame_dir = 'frames'

    if save_frame_dir:
        os.makedirs(save_frame_dir, exist_ok=True)

    state = env.reset()
    episode_return = 0
    step = 0
    for i in range(2**9):
        if i % 128 == 0:  # progress heartbeat
            print(i)
        for j in range(1):
            if agent is not None:
                s_tensor = torch.as_tensor((state, ),
                                           dtype=torch.float32,
                                           device=device)
                a_tensor = agent.act(s_tensor)
                action = a_tensor.detach().cpu().numpy(
                )[0]  # if use 'with torch.no_grad()', then '.detach()' not need.
            else:
                action = env.action_space.sample()
            next_state, reward, done, _ = env.step(action)

            episode_return += reward
            step += 1

            if done:
                print(
                    f'{i:>6}, {step:6.0f}, {episode_return:8.3f}, {reward:8.3f}'
                )
                state = env.reset()
                episode_return = 0
                step = 0
            else:
                state = next_state

        frame = env.render('rgb_array')
        frame = frame[50:210, 50:270]  # crop; (240, 320) AntPyBulletEnv-v0
        # frame = cv2.resize(frame[:, :500], (500//2, 720//2))
        cv2.imwrite(f'{save_frame_dir}/{i:06}.png', frame)
        cv2.imshow('', frame)
        cv2.waitKey(1)
    env.close()
    # exit()
    '''convert frames png/jpg to video mp4/avi using ffmpeg'''
    if save_frame_dir:
        frame_shape = cv2.imread(f'{save_frame_dir}/{3:06}.png').shape
        print(f"frame_shape: {frame_shape}")

        save_video = 'gym_render.mp4'
        # Fixed: this status message was passed to os.system(), where the
        # leading '|' is a shell syntax error; it is a log line, so print it.
        print(f"| Convert frames to video using ffmpeg. Save in {save_video}")
        # Fixed: cv2 frames have shape (height, width, channels) while ffmpeg's
        # `-s` option expects WIDTHxHEIGHT, so the indices were swapped.
        os.system(
            f'ffmpeg -r 60 -f image2 -s {frame_shape[1]}x{frame_shape[0]} '
            f'-i ./{save_frame_dir}/%06d.png '
            f'-crf 25 -vb 20M -pix_fmt yuv420p {save_video}')
Пример #12
0
def check_stock_trading_env():
    """Sanity-check StockTradingEnv with two fixed policies.

    Runs a small-random-action policy and a constant-buy policy,
    printing return statistics after each rollout, then draws the
    cumulative-return curve of a saved PPO agent.
    """
    if_eval = True  # False

    env = StockTradingEnv(if_eval=if_eval)
    action_dim = env.action_dim

    state = env.reset()
    print('| check_stock_trading_env, state_dim', len(state))

    from time import time
    timer = time()  # shared timer: the second rollout reports cumulative time

    # ============================================================
    policy_name = 'Random Action 1e-2'
    step = 1
    done = False
    episode_return = 0

    # state = env.reset()
    while not done:
        # small random actions scaled to [-1e-2, 1e-2] per asset
        action = rd.uniform(-1, 1, size=action_dim) * 1e-2
        next_state, reward, done, _ = env.step(action)
        # print(';', len(next_state), env.day, reward)
        episode_return += reward
        step += 1

    print()
    print(f"| {policy_name}:")
    print(f"| step {step}, UsedTime {time() - timer:.3e}")
    print(f"| gamma_reward \t\t\t{env.gamma_reward:.3e}")
    print(f"| episode return \t\t{episode_return:.3e}")
    print(
        f"| discount return \t\t{episode_return / step / (1 - env.gamma):.3e}")
    print(f"| env episode return \t{env.episode_return:.3e}")

    # ============================================================
    # NOTE(review): name says 'Buy 4' but only the first 3 actions are set
    # below -- confirm which is intended.
    policy_name = 'Buy  4 Action'
    step = 1
    done = False
    episode_return = 0

    # state = env.reset()
    # NOTE(review): env is deliberately(?) NOT reset here (the reset is
    # commented out), so this rollout starts from the previous terminal
    # state -- presumably StockTradingEnv.step() handles this; confirm.
    while not done:
        action = np.zeros(action_dim)
        action[:3] = 1  # constant full-strength buy on the first 3 assets

        next_state, reward, done, _ = env.step(action)
        # print(';', len(next_state), env.day, reward)
        episode_return += reward
        step += 1

    print()
    print(f"| {policy_name}:")
    print(f"| step {step}, UsedTime {time() - timer:.3e}")
    print(f"| gamma_reward \t\t\t{env.gamma_reward:.3e}")
    print(f"| episode return \t\t{episode_return:.3e}")
    print(
        f"| discount return \t\t{episode_return / step / (1 - env.gamma):.3e}")
    print(f"| env episode return \t{env.episode_return:.3e}")

    # ============================================================
    '''draw_cumulative_return'''
    from elegantrl2.agent import AgentPPO
    from elegantrl2.run import Arguments
    args = Arguments(if_on_policy=True)
    args.agent = AgentPPO()
    args.env = StockTradingEnv(if_eval=True)
    args.if_remove = False  # keep the existing cwd instead of wiping it
    args.cwd = './StockTradingEnv-v1_AgentPPO'  # load the saved model from here
    args.init_before_training()

    env.draw_cumulative_return(args, torch)
Пример #13
0
def demo_custom_env_finance_rl_nas89():  # 1.7+ 2.0+
    """Train PPO on the NAS89 stock-trading environment.

    The training env uses a looser turbulence threshold (30) than the
    evaluation env (15). Launch mode is selected by the flags below.
    """
    args = Arguments(
        if_on_policy=True
    )  # hyper-parameters of on-policy is different from off-policy
    args.random_seed = 19430

    from elegantrl2.agent import AgentPPO
    args.agent = AgentPPO()
    args.agent.lambda_entropy = 0.02

    from envs.FinRL.StockTrading import StockEnvNAS89
    args.gamma = 0.999
    args.env = StockEnvNAS89(if_eval=False,
                             gamma=args.gamma,
                             turbulence_thresh=30)
    # Fixed: the attribute is `env_eval` everywhere else in this file;
    # the original `args.eval_env` would be silently ignored by the trainer.
    args.env_eval = StockEnvNAS89(if_eval=True,
                                  gamma=args.gamma,
                                  turbulence_thresh=15)

    args.net_dim = 2**9
    args.repeat_times = 2**4
    args.learning_rate = 2**-14
    args.batch_size = args.net_dim * 4

    args.eval_gap = 2**8
    args.eval_times1 = 2**0
    args.eval_times2 = 2**1
    args.break_step = int(8e6)  # stop after 8M environment steps
    args.if_allow_break = False  # do not stop early on reaching target return

    if_single_proc = 0
    if if_single_proc:
        args.gpu_id = int(sys.argv[-1][-4])
        args.random_seed += int(args.gpu_id)
        args.target_step = args.env.max_step * 4
        train_and_evaluate(args)

    if_single_env = 1
    if if_single_env:
        args.gpu_id = int(sys.argv[-1][-4])
        args.random_seed += int(args.gpu_id)
        args.target_step = args.env.max_step * 1
        args.worker_num = 4
        train_and_evaluate_mp(args)

    if_multi_learner = 0
    if if_multi_learner:
        args.gpu_id = (2, 3) if len(sys.argv) == 1 else eval(
            sys.argv[-1])  # python main.py -GPU 0,1
        args.repeat_times = 2**4
        args.target_step = args.env.max_step
        args.worker_num = 4
        train_and_evaluate_mg(args)

    if_batch_env = 0
    if if_batch_env:
        from envs.FinRL.StockTrading import StockVecEnvNAS89
        args.env = StockVecEnvNAS89(if_eval=False, gamma=args.gamma, env_num=2)
        args.gpu_id = int(sys.argv[-1][-4])
        args.random_seed += args.gpu_id
        args.target_step = args.env.max_step
        args.worker_num = 4
        train_and_evaluate_mp(args)