Example #1
def demo4_bullet_mujoco_on_policy():
    args = Arguments(if_on_policy=True)  # on-policy hyper-parameters differ from off-policy ones

    import pybullet_envs  # for python-bullet-gym
    dir(pybullet_envs)

    "TotalStep: 1e5, TargetReturn: 18, UsedTime:  3ks, ReacherBulletEnv-v0, PPO"
    "TotalStep: 1e6, TargetReturn: 18, UsedTime: 30ks, ReacherBulletEnv-v0, PPO"
    args.env = PreprocessEnv(gym.make('ReacherBulletEnv-v0'))

    from elegantrl.agent import AgentPPO
    args.agent = AgentPPO()
    args.agent.if_use_gae = True

    args.break_step = int(2e5 * 8)
    args.reward_scale = 2 ** 0  # RewardRange: -15 < 0 < 18 < 25
    args.gamma = 0.96
    args.eval_times1 = 2 ** 2
    args.eval_times2 = 2 ** 5

    # train_and_evaluate(args)
    args.rollout_num = 4
    train_and_evaluate_mp(args)

    "TotalStep:  3e6, TargetReturn: 1500, UsedTime:  2ks, AntBulletEnv-v0, PPO"
    "TotalStep: 10e6, TargetReturn: 2500, UsedTime:  6ks, AntBulletEnv-v0, PPO"
    "TotalStep: 46e6, TargetReturn: 3017, UsedTime: 25ks, AntBulletEnv-v0, PPO"
    "TotalStep:  5e6, TargetReturn: 1500, UsedTime:  3ks, AntBulletEnv-v0, PPO if_use_dn"
    "TotalStep: 15e6, TargetReturn: 2500, UsedTime: 10ks, AntBulletEnv-v0, PPO if_use_dn"
    "TotalStep: 60e6, TargetReturn: 2949, UsedTime: 34ks, AntBulletEnv-v0, PPO if_use_dn"
    "TotalStep:  2e6, TargetReturn: 1500, UsedTime:  2ks, AntBulletEnv-v0, PPO if_use_cn"
    "TotalStep: 10e6, TargetReturn: 2500, UsedTime:  7ks, AntBulletEnv-v0, PPO if_use_cn"
    "TotalStep: 53e6, TargetReturn: 2834, UsedTime: 35ks, AntBulletEnv-v0, PPO if_use_cn"
    args.env = PreprocessEnv(env=gym.make('AntBulletEnv-v0'))

    from elegantrl.agent import AgentPPO
    args.agent = AgentPPO()
    args.agent.if_use_gae = True
    args.agent.lambda_entropy = 0.05
    args.agent.lambda_gae_adv = 0.97

    args.if_allow_break = False
    args.break_step = int(8e6 * 8)  # (5e5) 1e6, UsedTime: (15,000s) 30,000s
    args.reward_scale = 2 ** -2  # (-50) 0 ~ 2500 (3340)
    args.max_memo = args.env.max_step * 4
    args.batch_size = 2 ** 11  # 10
    args.repeat_times = 2 ** 3
    args.eval_gap = 2 ** 8  # for Recorder
    args.eval_times1 = 2 ** 1  # for Recorder
    args.eval_times2 = 2 ** 3  # for Recorder

    # train_and_evaluate(args)
    args.rollout_num = 4
    train_and_evaluate_mp(args)
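These snippets are excerpts from ElegantRL's run.py and rely on that module's top-level imports (gym, torch, PreprocessEnv, Arguments, the training functions) rather than importing everything inside each demo. Below is a minimal preamble sketch for running an excerpt on its own; the elegantrl.env and elegantrl.run module paths are assumptions about the project layout, not taken from the excerpts themselves.

import os
import time
from copy import deepcopy

import gym
import torch

# Assumed module paths; adjust to the installed ElegantRL version.
from elegantrl.env import PreprocessEnv
from elegantrl.run import Arguments, train_and_evaluate, train_and_evaluate_mp
# Older excerpts call train_and_evaluate__multiprocessing instead of train_and_evaluate_mp.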
Example #2
File: run.py  Project: Zir0ne/ElegantRL
def demo2_continuous_action_space_on_policy():
    import elegantrl.agent as agent
    """DEMO 2.1: Continuous action env (on-policy)"""
    args = Arguments(if_on_policy=True)  # on-policy hyper-parameters differ from off-policy ones
    args.agent = agent.AgentGaePPO()  # AgentPPO()

    '''choose environment'''
    # env = gym.make('Pendulum-v0')
    # env.target_reward = -200  # set target_reward manually for env 'Pendulum-v0'
    # args.env = PreprocessEnv(env=env)
    # args.reward_scale = 2 ** -3  # RewardRange: -1800 < -200 < -50 < 0
    # # TotalStep: 4e5, TargetReward: -200, UsedTime: 400s

    args.env = PreprocessEnv(env=gym.make('LunarLanderContinuous-v2'))
    args.reward_scale = 2 ** 0  # RewardRange: -800 < -200 < 200 < 302
    "TotalStep: 8e5, TargetReward: 200, UsedTime: 1500s"

    # args.env = PreprocessEnv(env=gym.make('BipedalWalker-v3'))
    # args.reward_scale = 2 ** 0  # RewardRange: -200 < -150 < 300 < 334
    # args.gamma = 0.96
    # "TotalStep: 8e5, TargetReward: 300, UsedTime: 1800s"

    '''train and evaluate'''
    # train_and_evaluate(args)
    args.rollout_num = 4
    train_and_evaluate__multiprocessing(args)
Example #3
File: run.py  Project: Zir0ne/ElegantRL
def demo2_continuous_action_space_off_policy():
    import elegantrl.agent as agent
    """DEMO 2.1: Continuous action env (off-policy)"""
    args = Arguments(if_on_policy=False)
    args.agent = agent.AgentModSAC()  # AgentSAC(), AgentTD3(), AgentDDPG()

    '''choose environment'''
    # env = gym.make('Pendulum-v0')
    # env.target_reward = -200  # set target_reward manually for env 'Pendulum-v0'
    # args.env = PreprocessEnv(env=env)
    # args.reward_scale = 2 ** -3  # RewardRange: -1800 < -200 < -50 < 0
    # "TotalStep: 4e5, TargetReward: -200, UsedTime: 400s"
    #
    # args.env = PreprocessEnv(env=gym.make('LunarLanderContinuous-v2'))
    # args.reward_scale = 2 ** 0  # RewardRange: -800 < -200 < 200 < 302
    # "TotalStep: 9e4, TargetReward: 200, UsedTime: 2500s"

    args.env = PreprocessEnv(env=gym.make('BipedalWalker-v3'))
    args.reward_scale = 2 ** -1  # RewardRange: -200 < -150 < 300 < 334
    args.gamma = 0.95
    "TotalStep: 2e5, TargetReward: 300, UsedTime: 3500s"

    '''train and evaluate'''
    # train_and_evaluate(args)
    args.rollout_num = 2
    train_and_evaluate__multiprocessing(args)
Example #4
def demo2_continuous_action_space_on_policy():
    args = Arguments(
        if_on_policy=True
    )  # on-policy hyper-parameters differ from off-policy ones
    '''choose a DRL algorithm'''
    from elegantrl.agent import AgentPPO
    args.agent = AgentPPO()
    args.agent.if_use_gae = True
    '''choose environment'''
    "PPO    TotalStep: 4e5, TargetReward: -200, UsedTime: 400s, Pendulum-v0"
    env = gym.make('Pendulum-v0')
    env.target_reward = -200  # set target_reward manually for env 'Pendulum-v0'
    args.env = PreprocessEnv(env=env)
    args.reward_scale = 2**-3  # RewardRange: -1800 < -200 < -50 < 0

    "PPO    TotalStep: 8e5, TargetReward: 200, UsedTime: 1500s, LunarLanderContinuous-v2"
    # args.env = PreprocessEnv(env=gym.make('LunarLanderContinuous-v2'))
    # args.reward_scale = 2 ** 0  # RewardRange: -800 < -200 < 200 < 302

    "PPO    TotalStep: 8e5, TargetReward: 300, UsedTime: 1800s, BipedalWalker-v3"
    # args.env = PreprocessEnv(env=gym.make('BipedalWalker-v3'))
    # args.reward_scale = 2 ** 0  # RewardRange: -200 < -150 < 300 < 334
    # args.gamma = 0.96
    '''train and evaluate'''
    # train_and_evaluate(args)
    args.rollout_num = 4
    train_and_evaluate_mp(args)
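Because train_and_evaluate_mp (and the older train_and_evaluate__multiprocessing) forks rollout worker processes, the demo functions are normally called from a guarded entry point. A minimal sketch; the guard itself is an assumption about how the script is launched, not part of the excerpt:

if __name__ == '__main__':
    # the __main__ guard keeps spawned worker processes from re-executing the demo on spawn-based platforms
    demo2_continuous_action_space_on_policy()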
Example #5
def demo2_continuous_action_space_off_policy():
    args = Arguments(if_on_policy=False)
    '''choose a DRL algorithm'''
    from elegantrl.agent import AgentModSAC  # AgentSAC, AgentTD3, AgentDDPG
    args.agent = AgentModSAC()
    '''choose environment'''
    "TD3    TotalStep: 3e4, TargetReward: -200, UsedTime: 300s, Pendulum-v0"
    "ModSAC TotalStep: 4e4, TargetReward: -200, UsedTime: 400s, Pendulum-v0"
    env = gym.make('Pendulum-v0')
    env.target_reward = -200  # set target_reward manually for env 'Pendulum-v0'
    args.env = PreprocessEnv(env=env)
    args.reward_scale = 2**-3  # RewardRange: -1800 < -200 < -50 < 0

    "TD3    TotalStep:  9e4, TargetReward: 100, UsedTime: 3ks, LunarLanderContinuous-v2"
    "TD3    TotalStep: 20e4, TargetReward: 200, UsedTime: 5ks, LunarLanderContinuous-v2"
    "SAC    TotalStep:  9e4, TargetReward: 200, UsedTime: 3ks, LunarLanderContinuous-v2"
    "ModSAC TotalStep:  5e4, TargetReward: 200, UsedTime: 1ks, LunarLanderContinuous-v2"
    # args.env = PreprocessEnv(env=gym.make('LunarLanderContinuous-v2'))
    # args.reward_scale = 2 ** 0  # RewardRange: -800 < -200 < 200 < 302
    # args.eval_times2 = 2 ** 4  # set a large eval_times to get a precise learning curve

    "ModSAC TotalStep: 2e5, TargetReward: 300, UsedTime: 5000s, BipedalWalker-v3"
    # args.env = PreprocessEnv(env=gym.make('BipedalWalker-v3'))
    # args.reward_scale = 2 ** 0  # RewardRange: -200 < -150 < 300 < 334
    # args.net_dim = 2 ** 8
    # args.break_step = int(2e5)
    # args.if_allow_break = True  # allow break training when reach goal (early termination)
    # args.break_step = int(2e5 * 4)  # break training after 'total_step > break_step'
    '''train and evaluate'''
    # train_and_evaluate(args)
    args.rollout_num = 4
    train_and_evaluate_mp(args)
Example #6
def demo1_discrete_action_space():
    args = Arguments(agent=None, env=None, gpu_id=None)  # see Arguments() to see hyper-parameters

    '''choose a DRL algorithm'''
    # from elegantrl.agent import AgentD3QN  # AgentDQN, AgentDoubleDQN, AgentDuelingDQN
    # args.agent = AgentD3QN()
    from elegantrl.agent import AgentDuelingDQN  # AgentDQN, AgentDoubleDQN, AgentD3QN
    args.agent = AgentDuelingDQN()

    '''choose environment'''
    "TotalStep: 2e3, TargetReturn: 200, UsedTime: 20s, CartPole-v0"
    "TotalStep: 2e3, TargetReturn: 200, UsedTime: 30s, CartPole-v0 rollout_num = 2"
    # args.env = PreprocessEnv(env=gym.make('CartPole-v0'))
    # args.net_dim = 2 ** 7  # change a default hyper-parameters
    # args.batch_size = 2 ** 7
    # args.target_step = 2 ** 8
    # args.eval_gap = 2 ** 0

    "TotalStep: 6e4, TargetReturn: 200, UsedTime: 600s, LunarLander-v2, D3DQN"
    "TotalStep: 4e4, TargetReturn: 200, UsedTime: 600s, LunarLander-v2, DuelDQN"
    args.env = PreprocessEnv(env=gym.make('LunarLander-v2'))
    args.net_dim = 2 ** 8
    args.batch_size = 2 ** 8

    '''train and evaluate'''
    train_and_evaluate(args)
Example #7
def demo4_bullet_mujoco_off_policy():
    args = Arguments(if_on_policy=False)
    args.random_seed = 10086

    from elegantrl.agent import AgentModSAC  # AgentSAC, AgentTD3, AgentDDPG
    args.agent = AgentModSAC()  # AgentSAC(), AgentTD3(), AgentDDPG()
    args.agent.if_use_dn = True

    import pybullet_envs  # for python-bullet-gym
    dir(pybullet_envs)

    "TotalStep:  5e4, TargetReturn: 18, UsedTime: 1100s, ReacherBulletEnv-v0"
    "TotalStep: 30e4, TargetReturn: 25, UsedTime:     s, ReacherBulletEnv-v0"
    args.env = PreprocessEnv(gym.make('ReacherBulletEnv-v0'))
    args.env.max_step = 2 ** 10  # important, default env.max_step=150
    args.reward_scale = 2 ** 0  # -80 < -30 < 18 < 28
    args.gamma = 0.96
    args.break_step = int(6e4 * 8)  # (4e4) 8e5, UsedTime: (300s) 700s
    args.eval_times1 = 2 ** 2
    args.eval_times2 = 2 ** 5
    args.if_per = True

    train_and_evaluate(args)

    "TotalStep:  3e5, TargetReward: 1500, UsedTime:  4ks, AntBulletEnv-v0 ModSAC if_use_dn"
    "TotalStep:  4e5, TargetReward: 2500, UsedTime:  6ks, AntBulletEnv-v0 ModSAC if_use_dn"
    "TotalStep: 10e5, TargetReward: 2879, UsedTime:   ks, AntBulletEnv-v0 ModSAC if_use_dn"
    "TotalStep:  3e5, TargetReward: 1500, UsedTime:  8ks, AntBulletEnv-v0 ModSAC if_use_cn"
    "TotalStep:  7e5, TargetReward: 2500, UsedTime: 18ks, AntBulletEnv-v0 ModSAC if_use_cn"
    "TotalStep: 16e5, TargetReward: 2923, UsedTime:   ks, AntBulletEnv-v0 ModSAC if_use_cn"
    args.env = PreprocessEnv(env=gym.make('AntBulletEnv-v0'))
    args.break_step = int(6e5 * 8)  # (5e5) 1e6, UsedTime: (15,000s) 30,000s
    args.if_allow_break = False
    args.reward_scale = 2 ** -2  # RewardRange: -50 < 0 < 2500 < 3340
    args.max_memo = 2 ** 21
    args.batch_size = 2 ** 8
    args.repeat_times = 2 ** 1
    args.eval_gap = 2 ** 9  # for Recorder
    args.eval_times1 = 2 ** 1  # for Recorder
    args.eval_times2 = 2 ** 3  # for Recorder

    # train_and_evaluate(args)
    args.rollout_num = 4
    train_and_evaluate_mp(args)
Example #8
def demo4_bullet_mujoco_on_policy():
    args = Arguments(
        if_on_policy=True
    )  # on-policy hyper-parameters differ from off-policy ones

    from elegantrl.agent import AgentPPO
    args.agent = AgentPPO()
    args.agent.if_use_gae = True

    import pybullet_envs  # for python-bullet-gym
    dir(pybullet_envs)

    "TotalStep: 1e5, TargetReward: 18, UsedTime:  3ks, ReacherBulletEnv-v0"
    "TotalStep: 1e6, TargetReward: 18, UsedTime: 30ks, ReacherBulletEnv-v0"
    args.env = PreprocessEnv(gym.make('ReacherBulletEnv-v0'))
    args.break_step = int(2e5 * 8)
    args.reward_scale = 2**0  # RewardRange: -15 < 0 < 18 < 25
    args.gamma = 0.96
    args.eval_times1 = 2**2
    args.eval_times2 = 2**5

    # train_and_evaluate(args)
    args.rollout_num = 4
    train_and_evaluate_mp(args)

    "TotalStep:  2e6, TargetReward: 1500, UsedTime:  3ks, AntBulletEnv-v0"
    "TotalStep: 13e6, TargetReward: 2400, UsedTime: 21ks, AntBulletEnv-v0"
    args.env = PreprocessEnv(env=gym.make('AntBulletEnv-v0'))
    args.env.max_step = 2**10

    args.break_step = int(2e6 * 8)  # (5e5) 1e6, UsedTime: (15,000s) 30,000s
    args.reward_scale = 2**-2  # (-50) 0 ~ 2500 (3340)
    args.max_memo = 2**11
    args.repeat_times = 2**3
    args.batch_size = 2**10
    args.net_dim = 2**9
    args.show_gap = 2**8  # for Recorder
    args.eval_times1 = 2**1  # for Recorder
    args.eval_times2 = 2**3  # for Recorder

    # train_and_evaluate(args)
    args.rollout_num = 4
    train_and_evaluate_mp(args)
Example #9
File: run.py  Project: Yan81192/ElegantRL
def demo1_discrete_action_space():
    args = Arguments(agent=None, env=None,
                     gpu_id=None)  # see Arguments() to see hyper-parameters
    '''choose a DRL algorithm'''
    from elegantrl.agent import AgentD3QN  # AgentDQN, AgentDoubleDQN, AgentDuelingDQN
    args.agent = AgentD3QN()
    '''choose environment'''
    # args.env = PreprocessEnv(env=gym.make('CartPole-v0'))
    # args.net_dim = 2 ** 7  # change a default hyper-parameters
    # args.batch_size = 2 ** 7
    "TotalStep: 2e3, TargetReward: , UsedTime: 10s"
    args.env = PreprocessEnv(env=gym.make('LunarLander-v2'))
    args.net_dim = 2**8
    args.batch_size = 2**8
    "TotalStep: 6e4, TargetReward: 200, UsedTime: 600s"
    '''train and evaluate'''
    # train_and_evaluate(args)
    args.rollout_num = 4
    train_and_evaluate__multiprocessing(args)
Example #10
File: run.py  Project: Zir0ne/ElegantRL
def demo1_discrete_action_space():
    import elegantrl.agent as agent

    """DEMO 1: Discrete action env of gym"""
    args = Arguments(agent=None, env=None, gpu_id=None)  # see Arguments() to see hyper-parameters

    '''choose a DRL algorithm'''
    # args.agent_rl = agent.AgentDuelingDQN()  # AgentDQN()
    args.agent = agent.AgentD3QN()  # AgentDoubleDQN()

    '''choose environment'''
    args.env = PreprocessEnv(env=gym.make('CartPole-v0'))
    args.net_dim = 2 ** 7  # change a default hyper-parameters
    # args.env = PreprocessEnv(env=gym.make('LunarLander-v2'))
    # args.net_dim = 2 ** 8

    '''train and evaluate'''
    # train_and_evaluate(args)
    args.rollout_num = 4
    train_and_evaluate__multiprocessing(args)
Example #11
File: run.py  Project: Yan81192/ElegantRL
def demo2_continuous_action_space_off_policy():
    args = Arguments(if_on_policy=False)
    '''choose a DRL algorithm'''
    from elegantrl.agent import AgentModSAC  # AgentSAC, AgentTD3, AgentDDPG
    args.agent = AgentModSAC()  # AgentSAC(), AgentTD3(), AgentDDPG()
    '''choose environment'''
    env = gym.make('Pendulum-v0')
    env.target_reward = -200  # set target_reward manually for env 'Pendulum-v0'
    args.env = PreprocessEnv(env=env)
    args.reward_scale = 2**-3  # RewardRange: -1800 < -200 < -50 < 0
    "TotalStep: 4e5, TargetReward: -200, UsedTime: 400s"
    # args.env = PreprocessEnv(env=gym.make('LunarLanderContinuous-v2'))
    # args.reward_scale = 2 ** 0  # RewardRange: -800 < -200 < 200 < 302
    "TotalStep: 9e4, TargetReward: 200, UsedTime: 2500s"
    # args.env = PreprocessEnv(env=gym.make('BipedalWalker-v3'))
    # args.reward_scale = 2 ** 0  # RewardRange: -200 < -150 < 300 < 334
    # args.net_dim = 2 ** 8
    # args.break_step = int(2e5)
    # args.if_allow_break = False
    "TotalStep: 2e5, TargetReward: 300, UsedTime: 5000s"
    '''train and evaluate'''
    # train_and_evaluate(args)
    args.rollout_num = 4
    train_and_evaluate__multiprocessing(args)
Example #12
def train_and_evaluate(args):
    args.init_before_training()
    '''basic arguments'''
    cwd = args.cwd
    env = args.env
    agent = args.agent
    gpu_id = args.gpu_id  # necessary for Evaluator?
    '''training arguments'''
    net_dim = args.net_dim
    max_memo = args.max_memo
    break_step = args.break_step
    batch_size = args.batch_size
    target_step = args.target_step
    repeat_times = args.repeat_times
    if_break_early = args.if_allow_break
    if_per = args.if_per
    gamma = args.gamma
    reward_scale = args.reward_scale
    '''evaluating arguments'''
    eval_gap = args.eval_gap
    eval_times1 = args.eval_times1
    eval_times2 = args.eval_times2
    if args.env_eval is not None:
        env_eval = args.env_eval
    elif env.env_name in set(gym.envs.registry.env_specs.keys()):
        env_eval = PreprocessEnv(gym.make(env.env_name))  # build a fresh eval env from the gym registry
    else:
        env_eval = deepcopy(env)

    del args  # In order to show these hyper-parameters clearly, I put them above.
    '''init: environment'''
    max_step = env.max_step
    state_dim = env.state_dim
    action_dim = env.action_dim
    if_discrete = env.if_discrete
    '''init: Agent, ReplayBuffer, Evaluator'''
    agent.init(net_dim, state_dim, action_dim, if_per)
    if_on_policy = getattr(agent, 'if_on_policy', False)

    buffer = ReplayBuffer(max_len=max_memo + max_step,
                          state_dim=state_dim,
                          action_dim=1 if if_discrete else action_dim,
                          if_on_policy=if_on_policy,
                          if_per=if_per,
                          if_gpu=True)

    evaluator = Evaluator(
        cwd=cwd,
        agent_id=gpu_id,
        device=agent.device,
        env=env_eval,
        eval_gap=eval_gap,
        eval_times1=eval_times1,
        eval_times2=eval_times2,
    )
    '''prepare for training'''
    agent.state = env.reset()
    if if_on_policy:
        steps = 0
    else:  # explore_before_training for off-policy
        with torch.no_grad():  # update replay buffer
            steps = explore_before_training(env, buffer, target_step,
                                            reward_scale, gamma)

        agent.update_net(buffer, target_step, batch_size,
                         repeat_times)  # pre-training and hard update
        if getattr(agent, 'act_target', None):  # hard update of target networks after pre-training
            agent.act_target.load_state_dict(agent.act.state_dict())
        if getattr(agent, 'cri_target', None):
            agent.cri_target.load_state_dict(agent.cri.state_dict())
    total_step = steps
    '''start training'''
    if_reach_goal = False
    while not ((if_break_early and if_reach_goal) or total_step > break_step
               or os.path.exists(f'{cwd}/stop')):
        steps = agent.explore_env(env, buffer, target_step, reward_scale,
                                  gamma)
        total_step += steps

        obj_a, obj_c = agent.update_net(buffer, target_step, batch_size,
                                        repeat_times)

        if_reach_goal = evaluator.evaluate_save(agent.act, steps, obj_a, obj_c)
        evaluator.draw_plot()

    print(
        f'| SavedDir: {cwd}\n| UsedTime: {time.time() - evaluator.start_time:.0f}'
    )
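For reference, the pieces above can be assembled into a minimal single-process run. This is a sketch built only from objects that appear in the excerpts (Arguments, AgentModSAC, PreprocessEnv, train_and_evaluate) and the Pendulum-v0 settings shown in Examples #5 and #11; it is not a verbatim demo from the project.

def demo_minimal_pendulum():
    args = Arguments(if_on_policy=False)  # off-policy hyper-parameters

    from elegantrl.agent import AgentModSAC  # AgentSAC, AgentTD3, AgentDDPG
    args.agent = AgentModSAC()

    env = gym.make('Pendulum-v0')
    env.target_reward = -200  # Pendulum-v0 needs a manually set target_reward
    args.env = PreprocessEnv(env=env)
    args.reward_scale = 2 ** -3  # RewardRange: -1800 < -200 < -50 < 0

    train_and_evaluate(args)  # the single-process loop shown in Example #12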