Example No. 1
def demo2_continuous_action_space_on_policy():
    args = Arguments(if_on_policy=True)  # on-policy hyper-parameters differ from the off-policy defaults
    '''choose a DRL algorithm'''
    from agent import AgentPPO
    args.agent = AgentPPO()
    args.agent.if_use_gae = True
    '''choose environment'''
    "PPO    TotalStep: 4e5, TargetReward: -200, UsedTime: 400s, Pendulum-v0"
    env = gym.make('Pendulum-v0')
    env.target_reward = -200  # set target_reward manually for env 'Pendulum-v0'
    args.env = PreprocessEnv(env=env)
    args.reward_scale = 2**-3  # RewardRange: -1800 < -200 < -50 < 0

    "PPO    TotalStep: 8e5, TargetReward: 200, UsedTime: 1500s, LunarLanderContinuous-v2"
    # args.env = PreprocessEnv(env=gym.make('LunarLanderContinuous-v2'))
    # args.reward_scale = 2 ** 0  # RewardRange: -800 < -200 < 200 < 302

    "PPO    TotalStep: 8e5, TargetReward: 300, UsedTime: 1800s, BipedalWalker-v3"
    # args.env = PreprocessEnv(env=gym.make('BipedalWalker-v3'))
    # args.reward_scale = 2 ** 0  # RewardRange: -200 < -150 < 300 < 334
    # args.gamma = 0.96
    '''train and evaluate'''
    # train_and_evaluate(args)
    args.rollout_num = 4
    train_and_evaluate_mp(args)
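The demos reference gym, PreprocessEnv, Arguments and the train_and_evaluate* helpers without showing their imports. A minimal sketch of the module-level imports they assume (the module paths env and run are assumptions based on the ElegantRL repository layout):

import gym
from env import PreprocessEnv  # wraps a gym env and attaches state_dim, action_dim, target_reward, max_step
from run import Arguments, train_and_evaluate, train_and_evaluate_mp  # training entry points used by the demos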
Example No. 2
def demo3_custom_env_fin_rl():
    from agent import AgentPPO
    '''choose a DRL algorithm'''
    args = Arguments(if_on_policy=True)
    args.agent = AgentPPO()
    args.agent.if_use_gae = False

    "TotalStep:  5e4, TargetReward: 1.25, UsedTime:  20s, FinanceStock-v2"
    "TotalStep: 20e4, TargetReward: 1.50, UsedTime:  80s, FinanceStock-v2"
    from env import FinanceStockEnv  # a standard ElegantRL env; no need for PreprocessEnv()
    args.env = FinanceStockEnv(if_train=True, train_beg=0, train_len=1024)
    args.env_eval = FinanceStockEnv(if_train=False, train_beg=0, train_len=1024)  # eval_len = 1699 - train_len
    args.reward_scale = 2**0  # RewardRange: 0 < 1.0 < 1.25 < 1.5 < 1.6
    args.break_step = int(5e6)
    args.net_dim = 2**8
    args.max_step = args.env.max_step
    args.max_memo = (args.max_step - 1) * 8
    args.batch_size = 2**11
    args.repeat_times = 2**4
    args.eval_times1 = 2**2
    args.eval_times2 = 2**4
    args.if_allow_break = True
    '''train and evaluate'''
    # train_and_evaluate(args)
    args.rollout_num = 8
    train_and_evaluate_mp(args)
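FinanceStockEnv is assigned to args.env directly because it already follows the environment convention that PreprocessEnv provides for gym environments. A rough sketch of the interface such a custom env is expected to expose (the attribute names are assumptions inferred from how the demos read env.max_step and env.target_reward):

import numpy as np

class CustomEnvSketch:  # illustrative only, not part of ElegantRL
    def __init__(self):
        self.env_name = 'CustomEnv-v0'
        self.state_dim = 16       # length of the observation vector
        self.action_dim = 4       # length of the continuous action vector
        self.if_discrete = False  # continuous action space
        self.max_step = 1024      # steps per episode, read by args.max_step above
        self.target_reward = 1.5  # episode return at which the evaluator counts the env as solved

    def reset(self) -> np.ndarray:
        return np.zeros(self.state_dim, dtype=np.float32)

    def step(self, action: np.ndarray):
        next_state = np.zeros(self.state_dim, dtype=np.float32)
        reward, done = 0.0, False
        return next_state, reward, done, dict()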
Example No. 3
def demo2_continuous_action_space_off_policy():
    args = Arguments(if_on_policy=False)
    '''choose a DRL algorithm'''
    from agent import AgentModSAC  # AgentSAC, AgentTD3, AgentDDPG
    args.agent = AgentModSAC()
    '''choose environment'''
    "TD3    TotalStep: 3e4, TargetReward: -200, UsedTime: 300s, Pendulum-v0"
    "ModSAC TotalStep: 4e4, TargetReward: -200, UsedTime: 400s, Pendulum-v0"
    env = gym.make('Pendulum-v0')
    env.target_reward = -200  # set target_reward manually for env 'Pendulum-v0'
    args.env = PreprocessEnv(env=env)
    args.reward_scale = 2**-3  # RewardRange: -1800 < -200 < -50 < 0

    "TD3    TotalStep:  9e4, TargetReward: 100, UsedTime: 3ks, LunarLanderContinuous-v2"
    "TD3    TotalStep: 20e4, TargetReward: 200, UsedTime: 5ks, LunarLanderContinuous-v2"
    "SAC    TotalStep:  9e4, TargetReward: 200, UsedTime: 3ks, LunarLanderContinuous-v2"
    "ModSAC TotalStep:  5e4, TargetReward: 200, UsedTime: 1ks, LunarLanderContinuous-v2"
    # args.env = PreprocessEnv(env=gym.make('LunarLanderContinuous-v2'))
    # args.reward_scale = 2 ** 0  # RewardRange: -800 < -200 < 200 < 302
    # args.eval_times2 = 2 ** 4  # set a large eval_times to get a precise learning curve

    "ModSAC TotalStep: 2e5, TargetReward: 300, UsedTime: 5000s, BipedalWalker-v3"
    # args.env = PreprocessEnv(env=gym.make('BipedalWalker-v3'))
    # args.reward_scale = 2 ** 0  # RewardRange: -200 < -150 < 300 < 334
    # args.net_dim = 2 ** 8
    # args.break_step = int(2e5)
    # args.if_allow_break = True  # allow break training when reach goal (early termination)
    # args.break_step = int(2e5 * 4)  # break training after 'total_step > break_step'
    '''train and evaluate'''
    # train_and_evaluate(args)
    args.rollout_num = 4
    train_and_evaluate_mp(args)
Example No. 4
def demo4_bullet_mujoco_on_policy():
    args = Arguments(if_on_policy=True)  # on-policy hyper-parameters differ from the off-policy defaults

    import pybullet_envs  # registers the PyBullet Gym environments with gym
    dir(pybullet_envs)  # reference the import so it is not flagged as unused

    "TotalStep: 1e5, TargetReward: 18, UsedTime:  3ks, ReacherBulletEnv-v0"
    "TotalStep: 1e6, TargetReward: 18, UsedTime: 30ks, ReacherBulletEnv-v0"
    args.env = PreprocessEnv(gym.make('ReacherBulletEnv-v0'))

    from agent import AgentPPO
    args.agent = AgentPPO()
    args.agent.if_use_gae = True

    args.break_step = int(2e5 * 8)
    args.reward_scale = 2**0  # RewardRange: -15 < 0 < 18 < 25
    args.gamma = 0.96
    args.eval_times1 = 2**2
    args.eval_times2 = 2**5

    # train_and_evaluate(args)
    args.rollout_num = 4
    train_and_evaluate_mp(args)

    "TotalStep: 30e5, TargetReward: 1500, UsedTime:  6ks, AntBulletEnv-v0"
    "TotalStep: 75e5, TargetReward: 2500, UsedTime: 14ks, AntBulletEnv-v0"
    args.env = PreprocessEnv(env=gym.make('AntBulletEnv-v0'))

    from agent import AgentPPO
    args.agent = AgentPPO()
    args.agent.if_use_gae = True
    args.agent.lambda_entropy = 0.05  # 0.02
    args.agent.lambda_gae_adv = 0.97  # 0.98
    args.agent.if_use_dn = True
    args.net_dim = 2**8

    args.break_step = int(8e6 * 8)  # (5e5) 1e6, UsedTime: (15,000s) 30,000s
    args.reward_scale = 2**-2  # (-50) 0 ~ 2500 (3340)
    args.max_memo = args.env.max_step * 4
    args.batch_size = 2**11  # 10
    args.repeat_times = 2**3
    args.net_dim = 2**8
    args.show_gap = 2**8  # for Recorder
    args.eva_size1 = 2**1  # for Recorder
    args.eva_size2 = 2**3  # for Recorder

    # train_and_evaluate(args)
    args.rollout_num = 4
    train_and_evaluate_mp(args)
Example No. 5
def demo4_bullet_mujoco_off_policy():
    args = Arguments(if_on_policy=False)
    args.random_seed = 10086

    from agent import AgentModSAC  # AgentSAC, AgentTD3, AgentDDPG
    args.agent = AgentModSAC()  # AgentSAC(), AgentTD3(), AgentDDPG()
    args.agent.if_use_dn = True

    import pybullet_envs  # for python-bullet-gym
    dir(pybullet_envs)

    "TotalStep:  5e4, TargetReward: 18, UsedTime: 1100s, ReacherBulletEnv-v0"
    "TotalStep: 30e4, TargetReward: 25, UsedTime:     s, ReacherBulletEnv-v0"
    # args.env = PreprocessEnv(gym.make('ReacherBulletEnv-v0'))
    # args.env.max_step = 2 ** 10  # important, default env.max_step=150
    # args.reward_scale = 2 ** 0  # -80 < -30 < 18 < 28
    # args.gamma = 0.96
    # args.break_step = int(6e4 * 8)  # (4e4) 8e5, UsedTime: (300s) 700s
    # args.eval_times1 = 2 ** 2
    # args.eval_times2 = 2 ** 5
    # args.if_per = True
    #
    # train_and_evaluate(args)

    "TotalStep:  3e5, TargetReward: 1500, UsedTime:  8ks, AntBulletEnv-v0"
    "TotalStep:  6e5, TargetReward: 2500, UsedTime: 18ks, AntBulletEnv-v0"
    "TotalStep: 20e5, TargetReward: 3000, UsedTime: 80ks, AntBulletEnv-v0"
    "TotalStep: 48e5, TargetReward: 3186, UsedTime:175ks, AntBulletEnv-v0"
    args.env = PreprocessEnv(env=gym.make('AntBulletEnv-v0'))
    args.break_step = int(6e5 * 8)  # (5e5) 1e6, UsedTime: (15,000s) 30,000s
    args.if_allow_break = False
    args.reward_scale = 2**-2  # RewardRange: -50 < 0 < 2500 < 3340
    args.max_memo = 2**20
    args.batch_size = 2**9
    args.show_gap = 2**8  # for Recorder
    args.eva_size1 = 2**1  # for Recorder
    args.eva_size2 = 2**3  # for Recorder

    # train_and_evaluate(args)
    args.rollout_num = 4
    train_and_evaluate_mp(args)
Example No. 6
def demo4_bullet_mujoco_mpo():
    args = Arguments(if_on_policy=False)
    args.random_seed = 10086

    from agent import AgentMPO  # AgentSAC, AgentTD3, AgentDDPG
    args.agent = AgentMPO()  # AgentSAC(), AgentTD3(), AgentDDPG()

    import pybullet_envs  # for python-bullet-gym
    dir(pybullet_envs)

    args.env = PreprocessEnv(env=gym.make('AntBulletEnv-v0'))
    args.break_step = int(6e5 * 8)  # (5e5) 1e6, UsedTime: (15,000s) 30,000s
    args.if_allow_break = False
    args.reward_scale = 2**0  # RewardRange: -50 < 0 < 2500 < 3340
    args.max_memo = 2**20
    args.batch_size = 2**9
    args.show_gap = 2**8  # for Recorder
    args.eva_size1 = 2**1  # for Recorder
    args.eva_size2 = 2**3  # for Recorder

    # train_and_evaluate(args)
    args.rollout_num = 4
    train_and_evaluate_mp(args)
Example No. 7
def demo4_bullet_mujoco_off_policy_per():
    args = Arguments(if_on_policy=False)
    args.random_seed = 10086

    from agent import AgentModSAC  # AgentSAC, AgentTD3, AgentDDPG
    args.agent = AgentModSAC()  # AgentSAC(), AgentTD3(), AgentDDPG()
    args.agent.if_use_dn = True

    import pybullet_envs  # for python-bullet-gym
    dir(pybullet_envs)

    "TotalStep:  5e4, TargetReward: 18, UsedTime: 1100s, ReacherBulletEnv-v0"
    "TotalStep: 30e4, TargetReward: 25, UsedTime:     s, ReacherBulletEnv-v0"
    args.env = PreprocessEnv(gym.make('ReacherBulletEnv-v0'))
    args.env.max_step = 2**10  # important, default env.max_step=150
    args.reward_scale = 2**0  # -80 < -30 < 18 < 28
    args.gamma = 0.96
    args.break_step = int(6e4 * 8)  # (4e4) 8e5, UsedTime: (300s) 700s
    args.eval_times1 = 2**2
    args.eval_times2 = 2**5
    args.if_per = True
    args.rollout_num = 2
    train_and_evaluate_mp(args)
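Each demo is a self-contained entry point. A minimal sketch of launching one of them (here the PER demo defined above, which trains ModSAC with a prioritized replay buffer on ReacherBulletEnv-v0):

if __name__ == '__main__':
    demo4_bullet_mujoco_off_policy_per()  # 2 rollout workers; if_per=True enables prioritized replay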