Example #1
File: run.py  Project: zhangs-cerdar/Python
def demo2_continuous_action_space():
    # from elegantrl.tutorial.run import Arguments, train_and_evaluate
    from elegantrl.tutorial.env import PreprocessEnv
    '''DEMO 2.1: choose an off-policy DRL algorithm'''
    from elegantrl.tutorial.agent import AgentSAC  # AgentTD3, AgentDDPG
    args = Arguments(if_on_policy=False)
    args.agent = AgentSAC()
    '''DEMO 2.2: choose an on-policy DRL algorithm (this overrides DEMO 2.1 above)'''
    from elegantrl.tutorial.agent import AgentPPO  # AgentGaePPO
    args = Arguments(if_on_policy=True)  # on-policy hyper-parameters differ from off-policy ones
    args.agent = AgentPPO()
    args.agent.if_use_gae = False

    "TotalStep: 4e5, TargetReward: -200, UsedTime: 400s, Pendulum-v0"
    args.env = PreprocessEnv('Pendulum-v0')
    args.env_eval = PreprocessEnv('Pendulum-v0', if_print=False)
    args.env_eval.target_reward = -200  # set target_reward manually for env 'Pendulum-v0'
    args.reward_scale = 2 ** -3  # RewardRange: -1800 < -200 < -50 < 0
    args.batch_size = 2 ** 7
    args.net_dim = 2 ** 7
    "TotalStep: 9e4, TargetReward: 200, UsedTime: 2500s, LunarLanderContinuous-v2"
    # args.env = PreprocessEnv('LunarLanderContinuous-v2')
    # args.reward_scale = 2 ** 0  # RewardRange: -800 < -200 < 200 < 302
    "TotalStep: 2e5, TargetReward: 300, UsedTime: 5000s, BipedalWalker-v3"
    # args.env = PreprocessEnv('BipedalWalker-v3')
    # args.reward_scale = 2 ** 0  # RewardRange: -200 < -150 < 300 < 334
    # args.break_step = int(2e5)  # stop training when total_step reaches break_step (forced termination)
    # args.if_allow_break = False  # if True, allow stopping training once the target reward is reached (early termination)

    train_and_evaluate(args)  # tutorial version
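Note: the reward_scale comment above ("RewardRange: -1800 < -200 < -50 < 0") refers to multiplying each raw reward before it is stored, which is what the (env, buffer, target_step, reward_scale, gamma) signatures in Example #4 suggest. A minimal hedged illustration, not ElegantRL code:

    # illustration only: how reward_scale appears to be applied to each step's reward
    raw_reward = -8.0            # a typical per-step reward of Pendulum-v0 (range is roughly [-16.3, 0])
    reward_scale = 2 ** -3       # same value as args.reward_scale above
    stored_reward = raw_reward * reward_scale  # -1.0, the value the replay buffer would see
    print(stored_reward)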
Example #2
def demo1_discrete_action_space():
    # from elegantrl.tutorial.run import Arguments, train_and_evaluate
    from elegantrl.tutorial.env import PreprocessEnv
    args = Arguments(agent=None, env=None, gpu_id=None)  # hyperparameters
    '''choose a DRL algorithm'''
    from elegantrl.tutorial.agent import AgentDoubleDQN  # AgentDQN
    args.agent = AgentDoubleDQN()
    '''choose environment'''
    "TotalStep: 2e3, TargetReward: 195, UsedTime: 10s, CartPole-v0"
    args.env = PreprocessEnv('CartPole-v0')  # or PreprocessEnv(gym.make('CartPole-v0'))
    args.net_dim = 2**7  # change the default hyper-parameters
    args.batch_size = 2**7
    "TotalStep: 6e4, TargetReward: 200, UsedTime: 600s, LunarLander-v2"
    # args.env = PreprocessEnv('LunarLander-v2')
    # args.net_dim = 2 ** 8
    # args.batch_size = 2 ** 8
    '''train and evaluate'''
    train_and_evaluate(args)
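If it is unclear what PreprocessEnv wraps, here is the plain gym interface underneath (old gym 0.x step API, which the -v0/-v2 env IDs above imply); a random-policy illustration, not ElegantRL code:

    import gym

    env = gym.make('CartPole-v0')
    state = env.reset()
    episode_return = 0.0
    done = False
    while not done:
        action = env.action_space.sample()            # random policy, just to show the interface
        state, reward, done, info = env.step(action)  # old gym returns a 4-tuple
        episode_return += reward
    print(f'random-policy return: {episode_return}')  # far below the 195 target above
    env.close()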
Example #3
def run__demo():
    # from elegantrl.tutorial.run import Arguments, train_and_evaluate
    from elegantrl.tutorial.env import PreprocessEnv

    import gym
    gym.logger.set_level(40)  # Block warning
    """DEMO 1: Discrete action env of gym"""
    args = Arguments(agent=None, env=None, gpu_id=None)  # hyperparameters
    '''choose a DRL algorithm'''
    from elegantrl.tutorial.agent import AgentDoubleDQN  # AgentDQN
    args.agent = AgentDoubleDQN()
    '''choose environment'''
    # "TotalStep: 2e3, TargetReward: 195, UsedTime: 10s, CartPole-v0"
    # args.env = PreprocessEnv(env=gym.make('CartPole-v0'))
    # args.net_dim = 2 ** 7  # change the default hyper-parameters
    # args.batch_size = 2 ** 7
    "TotalStep: 6e4, TargetReward: 200, UsedTime: 600s, LunarLander-v2"
    args.env = PreprocessEnv(env=gym.make('LunarLander-v2'))
    args.net_dim = 2**8
    args.batch_size = 2**8
    '''train and evaluate'''
    train_and_evaluate(args)
    exit()  # remove this line to continue with DEMO 2 below
    '''DEMO 2: Continuous action env of gym'''
    '''DEMO 2.1: choose an off-policy DRL algorithm'''
    from elegantrl.tutorial.agent import AgentSAC  # AgentTD3, AgentDDPG
    args = Arguments(if_on_policy=False)
    args.agent = AgentSAC()
    '''DEMO 2.2: choose an on-policy DRL algorithm (this overrides DEMO 2.1 above)'''
    from elegantrl.tutorial.agent import AgentPPO  # AgentGaePPO
    args = Arguments(if_on_policy=True)  # on-policy hyper-parameters differ from off-policy ones
    args.agent = AgentPPO()

    "TotalStep: 4e5, TargetReward: -200, UsedTime: 400s, Pendulum-v0"
    env = gym.make('Pendulum-v0')
    env.target_reward = -200  # set target_reward manually for env 'Pendulum-v0'
    args.env = PreprocessEnv(env=env)
    args.reward_scale = 2**-3  # RewardRange: -1800 < -200 < -50 < 0
    args.batch_size = 2**7
    args.net_dim = 2**7
    "TotalStep: 9e4, TargetReward: 200, UsedTime: 2500s, LunarLanderContinuous-v2"
    # args.env = PreprocessEnv(env=gym.make('LunarLanderContinuous-v2'))
    # args.reward_scale = 2 ** 0  # RewardRange: -800 < -200 < 200 < 302
    "TotalStep: 2e5, TargetReward: 300, UsedTime: 5000s, BipedalWalker-v3"
    # args.env = PreprocessEnv(env=gym.make('BipedalWalker-v3'))
    # args.reward_scale = 2 ** 0  # RewardRange: -200 < -150 < 300 < 334
    # args.break_step = int(2e5)  # stop training when total_step reaches break_step (forced termination)
    # args.if_allow_break = False  # if True, allow stopping training once the target reward is reached (early termination)

    train_and_evaluate(args)  # tutorial version
    exit()
    '''DEMO 3: Custom Continuous action env: FinanceStock-v1'''
    from elegantrl.tutorial.agent import AgentPPO
    args = Arguments(if_on_policy=True)
    args.agent = AgentPPO()
    args.agent.if_use_gae = False

    "TotalStep:  5e4, TargetReward: 1.25, UsedTime:   30s, FinanceStock-v2"
    "TotalStep: 16e4, TargetReward: 1.50, UsedTime:  160s, FinanceStock-v2"
    from elegantrl.env import FinanceStockEnv  # a standard env for ElegantRL; no need for PreprocessEnv()
    args.env = FinanceStockEnv(if_train=True, train_beg=0, train_len=1024)
    args.env_eval = FinanceStockEnv(if_train=False, train_beg=0, train_len=1024)  # eva_len = 1699 - train_len
    args.env_eval.target_reward = 1.25  # i.e. 1.25 times the initial account; converges to about 1.5

    args.break_step = int(5e6)  # stop training when total_step reaches break_step (forced termination)
    args.if_allow_break = True  # allow stopping training once the target reward is reached (early termination)
    args.batch_size = 2**11
    args.max_step = args.env.max_step
    args.max_memo = (args.max_step - 1) * 8

    train_and_evaluate(args)  # tutorial version
    # train_and_evaluate_mp(args)  # try multiprocessing in advanced version
    exit()
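DEMO 3 above calls FinanceStockEnv "a standard env for ElegantRL"; judging from what train_and_evaluate in Example #4 below reads from the env, such an env appears to need env_name, state_dim, action_dim, if_discrete, max_step, target_reward, plus reset() and step(). A minimal skeleton under that assumption (placeholder class and numbers, not ElegantRL's actual FinanceStockEnv):

    import numpy as np

    class CustomEnvSkeleton:
        """Illustrative skeleton of the attributes train_and_evaluate() reads from an env."""
        def __init__(self):
            self.env_name = 'CustomEnv-v0'
            self.state_dim = 4          # size of the observation vector
            self.action_dim = 2         # size of the (continuous) action vector
            self.if_discrete = False    # continuous action space
            self.max_step = 1024        # steps per episode
            self.target_reward = 1.25   # the Evaluator stops once this return is reached
            self.step_count = 0

        def reset(self):
            self.step_count = 0
            return np.zeros(self.state_dim, dtype=np.float32)

        def step(self, action):
            self.step_count += 1
            state = np.zeros(self.state_dim, dtype=np.float32)
            reward = 0.0
            done = self.step_count >= self.max_step
            return state, reward, done, None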
Example #4
File: run.py  Project: zhangs-cerdar/Python
def train_and_evaluate(args):
    # relies on os, torch, and the ReplayBuffer / Evaluator / PreprocessEnv / explore_before_training names defined elsewhere in run.py
    args.init_before_training()

    '''basic arguments'''
    cwd = args.cwd
    env = args.env
    agent = args.agent
    gpu_id = args.gpu_id

    '''training arguments'''
    net_dim = args.net_dim
    max_memo = args.max_memo
    break_step = args.break_step
    batch_size = args.batch_size
    target_step = args.target_step
    repeat_times = args.repeat_times
    if_break_early = args.if_allow_break
    gamma = args.gamma
    reward_scale = args.reward_scale

    '''evaluating arguments'''
    show_gap = args.show_gap
    eval_times = args.eval_times
    env_eval = PreprocessEnv(env.env_name, if_print=False) if args.env_eval is None else args.env_eval
    del args  # hyper-parameters are unpacked above so they are easy to see

    '''init: environment'''
    max_step = env.max_step
    state_dim = env.state_dim
    action_dim = env.action_dim
    if_discrete = env.if_discrete

    '''init: Agent, ReplayBuffer, Evaluator'''
    agent.init(net_dim, state_dim, action_dim)
    if_on_policy = getattr(agent, 'if_on_policy', False)

    buffer = ReplayBuffer(max_len=max_memo + max_step, if_on_policy=if_on_policy, if_gpu=True,
                          state_dim=state_dim, action_dim=1 if if_discrete else action_dim)

    evaluator = Evaluator(cwd=cwd, agent_id=gpu_id, device=agent.device, env=env_eval,
                          eval_times=eval_times, show_gap=show_gap)  # build Evaluator

    '''prepare for training'''
    agent.state = env.reset()
    if if_on_policy:
        steps = 0
    else:  # explore_before_training for off-policy
        with torch.no_grad():  # update replay buffer
            steps = explore_before_training(env, buffer, target_step, reward_scale, gamma)
        agent.update_net(buffer, target_step, batch_size, repeat_times)  # pre-training and hard update
        if getattr(agent, 'act_target', None):
            agent.act_target.load_state_dict(agent.act.state_dict())
        if getattr(agent, 'cri_target', None):
            agent.cri_target.load_state_dict(agent.cri.state_dict())
    total_step = steps

    '''start training'''
    if_reach_goal = False
    while not ((if_break_early and if_reach_goal)
               or total_step > break_step
               or os.path.exists(f'{cwd}/stop')):
        with torch.no_grad():  # speed up running
            steps = agent.explore_env(env, buffer, target_step, reward_scale, gamma)
        total_step += steps

        obj_a, obj_c = agent.update_net(buffer, target_step, batch_size, repeat_times)
        with torch.no_grad():  # speed up running
            if_reach_goal = evaluator.evaluate_save(agent.act, steps, obj_a, obj_c)
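explore_before_training (called above for off-policy agents) is not shown in this listing. A self-contained stand-in that captures the idea, filling a buffer with random-action transitions before any learning; the function name and the list-based "buffer" are stand-ins, not ElegantRL's actual API:

    def explore_before_training_sketch(env, target_step, reward_scale, gamma):
        """Collect target_step random-action transitions before training (sketch only)."""
        transitions = []
        state = env.reset()
        for _ in range(target_step):
            action = env.action_space.sample()            # random exploration
            next_state, reward, done, _ = env.step(action)
            mask = 0.0 if done else gamma                 # 0 cuts the bootstrap at episode end
            transitions.append((state, action, reward * reward_scale, mask, next_state))
            state = env.reset() if done else next_state
        return transitions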
Example #5
def run__demo():
    # from elegantrl.tutorial.run import Arguments, train_and_evaluate
    from elegantrl.tutorial.env import PreprocessEnv

    import gym
    gym.logger.set_level(40)  # block warning: 'WARN: Box bound precision lowered by casting to float32'
    """DEMO 1: Discrete action env of gym"""
    args = Arguments(agent=None, env=None, gpu_id=None)  # see Arguments() for the hyper-parameters
    '''choose a DRL algorithm'''
    from elegantrl.tutorial.agent import AgentDoubleDQN  # AgentDQN
    args.agent = AgentDoubleDQN()
    '''choose environment'''
    # args.env = PreprocessEnv(env=gym.make('CartPole-v0'))
    # args.net_dim = 2 ** 7  # change the default hyper-parameters
    # args.batch_size = 2 ** 7
    "TotalStep: 2e3, TargetReward: , UsedTime: 10s"
    args.env = PreprocessEnv(env=gym.make('LunarLander-v2'))
    args.net_dim = 2**8
    args.batch_size = 2**8
    "TotalStep: 6e4, TargetReward: 200, UsedTime: 600s"
    '''train and evaluate'''
    train_and_evaluate(args)
    exit()  # remove this line to continue with DEMO 2 below
    '''DEMO 2: Continuous action env of gym'''
    '''DEMO 2.1: choose an off-policy DRL algorithm'''
    from elegantrl.tutorial.agent import AgentSAC  # AgentTD3, AgentDDPG
    args = Arguments(if_on_policy=False)
    args.agent = AgentSAC()
    '''DEMO 2.2: choose an on-policy DRL algorithm (this overrides DEMO 2.1 above)'''
    from elegantrl.tutorial.agent import AgentPPO  # AgentGaePPO
    args = Arguments(if_on_policy=True)  # on-policy hyper-parameters differ from off-policy ones
    args.agent = AgentPPO()
    '''choose environment'''
    env = gym.make('Pendulum-v0')
    env.target_reward = -200  # set target_reward manually for env 'Pendulum-v0'
    args.env = PreprocessEnv(env=env)
    args.reward_scale = 2**-3  # RewardRange: -1800 < -200 < -50 < 0
    args.net_dim = 2**7
    args.batch_size = 2**7
    "TotalStep: 4e5, TargetReward: -200, UsedTime: 400s"
    # args.env = PreprocessEnv(env=gym.make('LunarLanderContinuous-v2'))
    # args.reward_scale = 2 ** 0  # RewardRange: -800 < -200 < 200 < 302
    "TotalStep: 9e4, TargetReward: 200, UsedTime: 2500s"
    # args.env = PreprocessEnv(env=gym.make('BipedalWalker-v3'))
    # args.reward_scale = 2 ** 0  # RewardRange: -200 < -150 < 300 < 334
    # args.break_step = int(2e5)
    # args.if_allow_break = False
    "TotalStep: 2e5, TargetReward: 300, UsedTime: 5000s"
    '''train and evaluate'''
    train_and_evaluate(args)
    # train_and_evaluate__multiprocessing(args)  # try multiprocessing in complete version
    exit()
    '''DEMO 3: Custom Continuous action env: FinanceStock-v1'''
    args = Arguments(if_on_policy=True)
    '''choose a DRL algorithm'''
    from elegantrl.tutorial.agent import AgentPPO
    args.agent = AgentPPO()

    from elegantrl.env import FinanceMultiStockEnv  # a standard env for ElegantRL; no need for PreprocessEnv()
    args.env = FinanceMultiStockEnv(if_train=True, train_beg=0, train_len=1024)
    args.env_eval = FinanceMultiStockEnv(if_train=False, train_beg=0, train_len=1024)  # eva_len = 1699 - train_len
    args.reward_scale = 2**0  # RewardRange: 0 < 1.0 < 1.25 <
    args.break_step = int(5e6)
    args.max_step = args.env.max_step
    args.max_memo = (args.max_step - 1) * 8
    args.batch_size = 2**11
    args.if_allow_break = False
    "TotalStep:  2e5, TargetReward: 1.25, UsedTime:  200s"
    "TotalStep:  4e5, TargetReward: 1.50, UsedTime:  400s"
    "TotalStep: 10e5, TargetReward: 1.62, UsedTime: 1000s"
    '''train and evaluate'''
    train_and_evaluate(args)
    # args.rollout_num = 8
    # train_and_evaluate__multiprocessing(args)  # try multiprocessing in complete version
    exit()
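These demo functions live in run.py, so the usual entry point (assumed; the listing does not show the module's __main__ block) would be:

    if __name__ == '__main__':
        run__demo()  # or demo1_discrete_action_space() / demo2_continuous_action_space()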