def demo2_continuous_action_space():
    # from elegantrl.tutorial.run import Arguments, train_and_evaluate
    from elegantrl.tutorial.env import PreprocessEnv

    '''DEMO 2.1: choose an off-policy DRL algorithm'''
    from elegantrl.tutorial.agent import AgentSAC  # AgentTD3, AgentDDPG
    args = Arguments(if_on_policy=False)
    args.agent = AgentSAC()

    '''DEMO 2.2: choose an on-policy DRL algorithm'''
    from elegantrl.tutorial.agent import AgentPPO  # AgentGaePPO
    args = Arguments(if_on_policy=True)  # on-policy hyper-parameters differ from off-policy
    args.agent = AgentPPO()
    args.agent.if_use_gae = False

    "TotalStep: 4e5, TargetReward: -200, UsedTime: 400s, Pendulum-v0"
    args.env = PreprocessEnv('Pendulum-v0')
    args.env_eval = PreprocessEnv('Pendulum-v0', if_print=False)
    args.env_eval.target_reward = -200  # set target_reward manually for env 'Pendulum-v0'
    args.reward_scale = 2 ** -3  # RewardRange: -1800 < -200 < -50 < 0
    args.batch_size = 2 ** 7
    args.net_dim = 2 ** 7

    "TotalStep: 9e4, TargetReward: 200, UsedTime: 2500s, LunarLanderContinuous-v2"
    # args.env = PreprocessEnv('LunarLanderContinuous-v2')
    # args.reward_scale = 2 ** 0  # RewardRange: -800 < -200 < 200 < 302

    "TotalStep: 2e5, TargetReward: 300, UsedTime: 5000s, BipedalWalker-v3"
    # args.env = PreprocessEnv('BipedalWalker-v3')
    # args.reward_scale = 2 ** 0  # RewardRange: -200 < -150 < 300 < 334
    # args.break_step = int(2e5)  # stop training when total_step reaches break_step (forced termination)
    # args.if_allow_break = False  # allow early termination once the target reward is reached

    train_and_evaluate(args)  # tutorial version
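
# Added illustration (not part of the original demo): why reward_scale = 2 ** -3 is a reasonable
# choice for Pendulum-v0. Episode returns lie roughly in [-1800, 0] (see the RewardRange comment
# above), so scaling per-step rewards by 1/8 brings the returns the critic must fit closer to
# unit scale. The numbers below are assumed example values, not measurements.
def _illustrate_reward_scale(reward_scale=2 ** -3):
    raw_episode_return = -1600.0  # a poor Pendulum-v0 episode (assumed value)
    scaled_return = raw_episode_return * reward_scale
    return scaled_return  # -200.0, a magnitude that is easier for the value network to estimate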
def demo1_discrete_action_space():
    # from elegantrl.tutorial.run import Arguments, train_and_evaluate
    from elegantrl.tutorial.env import PreprocessEnv

    args = Arguments(agent=None, env=None, gpu_id=None)  # hyper-parameters

    '''choose a DRL algorithm'''
    from elegantrl.tutorial.agent import AgentDoubleDQN  # AgentDQN
    args.agent = AgentDoubleDQN()

    '''choose environment'''
    "TotalStep: 2e3, TargetReward: 195, UsedTime: 10s, CartPole-v0"
    args.env = PreprocessEnv('CartPole-v0')  # or PreprocessEnv(gym.make('CartPole-v0'))
    args.net_dim = 2 ** 7  # change the default hyper-parameters
    args.batch_size = 2 ** 7

    "TotalStep: 6e4, TargetReward: 200, UsedTime: 600s, LunarLander-v2"
    # args.env = PreprocessEnv('LunarLander-v2')
    # args.net_dim = 2 ** 8
    # args.batch_size = 2 ** 8

    '''train and evaluate'''
    train_and_evaluate(args)
def run__demo():
    # from elegantrl.tutorial.run import Arguments, train_and_evaluate
    from elegantrl.tutorial.env import PreprocessEnv
    import gym
    gym.logger.set_level(40)  # block gym warnings

    """DEMO 1: Discrete action env of gym"""
    args = Arguments(agent=None, env=None, gpu_id=None)  # hyper-parameters

    '''choose a DRL algorithm'''
    from elegantrl.tutorial.agent import AgentDoubleDQN  # AgentDQN
    args.agent = AgentDoubleDQN()

    '''choose environment'''
    # "TotalStep: 2e3, TargetReward: 195, UsedTime: 10s, CartPole-v0"
    # args.env = PreprocessEnv(env=gym.make('CartPole-v0'))
    # args.net_dim = 2 ** 7  # change the default hyper-parameters
    # args.batch_size = 2 ** 7

    "TotalStep: 6e4, TargetReward: 200, UsedTime: 600s, LunarLander-v2"
    args.env = PreprocessEnv(env=gym.make('LunarLander-v2'))
    args.net_dim = 2 ** 8
    args.batch_size = 2 ** 8

    '''train and evaluate'''
    train_and_evaluate(args)
    exit()

    '''DEMO 2: Continuous action env of gym'''
    '''DEMO 2.1: choose an off-policy DRL algorithm'''
    from elegantrl.tutorial.agent import AgentSAC  # AgentTD3, AgentDDPG
    args = Arguments(if_on_policy=False)
    args.agent = AgentSAC()

    '''DEMO 2.2: choose an on-policy DRL algorithm'''
    from elegantrl.tutorial.agent import AgentPPO  # AgentGaePPO
    args = Arguments(if_on_policy=True)  # on-policy hyper-parameters differ from off-policy
    args.agent = AgentPPO()

    "TotalStep: 4e5, TargetReward: -200, UsedTime: 400s, Pendulum-v0"
    env = gym.make('Pendulum-v0')
    env.target_reward = -200  # set target_reward manually for env 'Pendulum-v0'
    args.env = PreprocessEnv(env=env)
    args.reward_scale = 2 ** -3  # RewardRange: -1800 < -200 < -50 < 0
    args.batch_size = 2 ** 7
    args.net_dim = 2 ** 7

    "TotalStep: 9e4, TargetReward: 200, UsedTime: 2500s, LunarLanderContinuous-v2"
    # args.env = PreprocessEnv(env=gym.make('LunarLanderContinuous-v2'))
    # args.reward_scale = 2 ** 0  # RewardRange: -800 < -200 < 200 < 302

    "TotalStep: 2e5, TargetReward: 300, UsedTime: 5000s, BipedalWalker-v3"
    # args.env = PreprocessEnv(env=gym.make('BipedalWalker-v3'))
    # args.reward_scale = 2 ** 0  # RewardRange: -200 < -150 < 300 < 334
    # args.break_step = int(2e5)  # stop training when total_step reaches break_step (forced termination)
    # args.if_allow_break = False  # allow early termination once the target reward is reached

    train_and_evaluate(args)  # tutorial version
    exit()

    '''DEMO 3: Custom continuous action env: FinanceStock-v2'''
    from elegantrl.tutorial.agent import AgentPPO
    args = Arguments(if_on_policy=True)
    args.agent = AgentPPO()
    args.agent.if_use_gae = False

    "TotalStep: 5e4, TargetReward: 1.25, UsedTime: 30s, FinanceStock-v2"
    "TotalStep: 16e4, TargetReward: 1.50, UsedTime: 160s, FinanceStock-v2"
    from elegantrl.env import FinanceStockEnv  # a standard env for ElegantRL; no PreprocessEnv() needed
    args.env = FinanceStockEnv(if_train=True, train_beg=0, train_len=1024)
    args.env_eval = FinanceStockEnv(if_train=False, train_beg=0, train_len=1024)  # eva_len = 1699 - train_len
    args.env_eval.target_reward = 1.25  # denotes 1.25 times the initial account; converges towards 1.5
    args.break_step = int(5e6)  # stop training when total_step reaches break_step (forced termination)
    args.if_allow_break = True  # allow early termination once the target reward is reached
    args.batch_size = 2 ** 11
    args.max_step = args.env.max_step
    args.max_memo = (args.max_step - 1) * 8

    train_and_evaluate(args)  # tutorial version
    # train_and_evaluate_mp(args)  # try multiprocessing in the advanced version
    exit()
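
# Added sketch (not the actual FinanceStockEnv): train_and_evaluate() below reads
# env.env_name, env.state_dim, env.action_dim, env.if_discrete and env.max_step, and the
# Evaluator compares episode returns against env_eval.target_reward. A custom env therefore
# only needs to expose those attributes plus reset()/step(); the values here are placeholders.
class CustomEnvSkeleton:
    def __init__(self):
        self.env_name = 'CustomEnv-v0'  # placeholder name
        self.state_dim = 16             # placeholder dimensions
        self.action_dim = 4
        self.if_discrete = False        # continuous actions
        self.max_step = 1024            # max steps per episode
        self.target_reward = 1.25       # used by the Evaluator for early stopping

    def reset(self):
        return [0.0] * self.state_dim   # initial state

    def step(self, action):
        next_state = [0.0] * self.state_dim
        reward, done = 0.0, False
        return next_state, reward, done, dict()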
def train_and_evaluate(args):
    import os
    import torch
    from elegantrl.tutorial.env import PreprocessEnv
    # ReplayBuffer, Evaluator and explore_before_training are defined alongside this function

    args.init_before_training()

    '''basic arguments'''
    cwd = args.cwd
    env = args.env
    agent = args.agent
    gpu_id = args.gpu_id

    '''training arguments'''
    net_dim = args.net_dim
    max_memo = args.max_memo
    break_step = args.break_step
    batch_size = args.batch_size
    target_step = args.target_step
    repeat_times = args.repeat_times
    if_break_early = args.if_allow_break
    gamma = args.gamma
    reward_scale = args.reward_scale

    '''evaluating arguments'''
    show_gap = args.show_gap
    eval_times = args.eval_times
    env_eval = PreprocessEnv(env.env_name, if_print=False) if args.env_eval is None else args.env_eval
    del args  # in order to show these hyper-parameters clearly, they are unpacked above

    '''init: environment'''
    max_step = env.max_step
    state_dim = env.state_dim
    action_dim = env.action_dim
    if_discrete = env.if_discrete

    '''init: Agent, ReplayBuffer, Evaluator'''
    agent.init(net_dim, state_dim, action_dim)
    if_on_policy = getattr(agent, 'if_on_policy', False)

    buffer = ReplayBuffer(max_len=max_memo + max_step, if_on_policy=if_on_policy, if_gpu=True,
                          state_dim=state_dim, action_dim=1 if if_discrete else action_dim)

    evaluator = Evaluator(cwd=cwd, agent_id=gpu_id, device=agent.device, env=env_eval,
                          eval_times=eval_times, show_gap=show_gap)  # build Evaluator

    '''prepare for training'''
    agent.state = env.reset()
    if if_on_policy:
        steps = 0
    else:  # explore_before_training for off-policy
        with torch.no_grad():  # update replay buffer
            steps = explore_before_training(env, buffer, target_step, reward_scale, gamma)
        agent.update_net(buffer, target_step, batch_size, repeat_times)  # pre-training and hard update

        if getattr(agent, 'act_target', None):
            agent.act_target.load_state_dict(agent.act.state_dict())
        if getattr(agent, 'cri_target', None):
            agent.cri_target.load_state_dict(agent.cri.state_dict())
    total_step = steps

    '''start training'''
    if_reach_goal = False
    while not ((if_break_early and if_reach_goal)
               or total_step > break_step
               or os.path.exists(f'{cwd}/stop')):
        with torch.no_grad():  # speed up running
            steps = agent.explore_env(env, buffer, target_step, reward_scale, gamma)
        total_step += steps

        obj_a, obj_c = agent.update_net(buffer, target_step, batch_size, repeat_times)

        with torch.no_grad():  # speed up running
            if_reach_goal = evaluator.evaluate_save(agent.act, steps, obj_a, obj_c)
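
# Added sketch, not the library's exact implementation: train_and_evaluate() above relies on
# explore_before_training(env, buffer, target_step, reward_scale, gamma) to fill the replay
# buffer with random transitions before the first off-policy update. The
# buffer.append_buffer(state, other) call and the (scaled_reward, mask, action) layout are
# assumptions about the tutorial ReplayBuffer API; verify them against the version you use.
def explore_before_training_sketch(env, buffer, target_step, reward_scale, gamma):
    import numpy.random as rd
    if_discrete = env.if_discrete
    action_dim = env.action_dim

    state = env.reset()
    steps = 0
    while steps < target_step:
        # random exploration: uniform actions, no policy network involved yet
        action = rd.randint(action_dim) if if_discrete else rd.uniform(-1, 1, size=action_dim)
        next_state, reward, done, _ = env.step(action)
        steps += 1

        scaled_reward = reward * reward_scale
        mask = 0.0 if done else gamma  # mask folds the discount factor and terminal flag together
        other = (scaled_reward, mask, action) if if_discrete else (scaled_reward, mask, *action)
        buffer.append_buffer(state, other)  # assumed ReplayBuffer method name

        state = env.reset() if done else next_state
    return steps  # number of transitions collected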
def run__demo():
    # from elegantrl.tutorial.run import Arguments, train_and_evaluate
    from elegantrl.tutorial.env import PreprocessEnv
    import gym
    gym.logger.set_level(40)  # block warning: 'WARN: Box bound precision lowered by casting to float32'

    """DEMO 1: Discrete action env of gym"""
    args = Arguments(agent=None, env=None, gpu_id=None)  # see Arguments() for the hyper-parameters

    '''choose a DRL algorithm'''
    from elegantrl.tutorial.agent import AgentDoubleDQN  # AgentDQN
    args.agent = AgentDoubleDQN()

    '''choose environment'''
    # args.env = PreprocessEnv(env=gym.make('CartPole-v0'))
    # args.net_dim = 2 ** 7  # change the default hyper-parameters
    # args.batch_size = 2 ** 7
    "TotalStep: 2e3, TargetReward: 195, UsedTime: 10s"

    args.env = PreprocessEnv(env=gym.make('LunarLander-v2'))
    args.net_dim = 2 ** 8
    args.batch_size = 2 ** 8
    "TotalStep: 6e4, TargetReward: 200, UsedTime: 600s"

    '''train and evaluate'''
    train_and_evaluate(args)
    exit()

    '''DEMO 2: Continuous action env of gym'''
    '''DEMO 2.1: choose an off-policy DRL algorithm'''
    from elegantrl.tutorial.agent import AgentSAC  # AgentTD3, AgentDDPG
    args = Arguments(if_on_policy=False)
    args.agent = AgentSAC()

    '''DEMO 2.2: choose an on-policy DRL algorithm'''
    from elegantrl.tutorial.agent import AgentPPO  # AgentGaePPO
    args = Arguments(if_on_policy=True)  # on-policy hyper-parameters differ from off-policy
    args.agent = AgentPPO()

    '''choose environment'''
    env = gym.make('Pendulum-v0')
    env.target_reward = -200  # set target_reward manually for env 'Pendulum-v0'
    args.env = PreprocessEnv(env=env)
    args.reward_scale = 2 ** -3  # RewardRange: -1800 < -200 < -50 < 0
    args.net_dim = 2 ** 7
    args.batch_size = 2 ** 7
    "TotalStep: 4e5, TargetReward: -200, UsedTime: 400s"

    # args.env = PreprocessEnv(env=gym.make('LunarLanderContinuous-v2'))
    # args.reward_scale = 2 ** 0  # RewardRange: -800 < -200 < 200 < 302
    "TotalStep: 9e4, TargetReward: 200, UsedTime: 2500s"

    # args.env = PreprocessEnv(env=gym.make('BipedalWalker-v3'))
    # args.reward_scale = 2 ** 0  # RewardRange: -200 < -150 < 300 < 334
    # args.break_step = int(2e5)
    # args.if_allow_break = False
    "TotalStep: 2e5, TargetReward: 300, UsedTime: 5000s"

    '''train and evaluate'''
    train_and_evaluate(args)
    # train_and_evaluate__multiprocessing(args)  # try multiprocessing in the complete version
    exit()

    '''DEMO 3: Custom continuous action env: FinanceStock-v1'''
    args = Arguments(if_on_policy=True)

    '''choose a DRL algorithm'''
    from elegantrl.tutorial.agent import AgentPPO
    args.agent = AgentPPO()

    from elegantrl.env import FinanceMultiStockEnv  # a standard env for ElegantRL; no PreprocessEnv() needed
    args.env = FinanceMultiStockEnv(if_train=True, train_beg=0, train_len=1024)
    args.env_eval = FinanceMultiStockEnv(if_train=False, train_beg=0, train_len=1024)  # eva_len = 1699 - train_len
    args.reward_scale = 2 ** 0  # RewardRange: 0 < 1.0 < 1.25 <
    args.break_step = int(5e6)
    args.max_step = args.env.max_step
    args.max_memo = (args.max_step - 1) * 8
    args.batch_size = 2 ** 11
    args.if_allow_break = False
    "TotalStep: 2e5, TargetReward: 1.25, UsedTime: 200s"
    "TotalStep: 4e5, TargetReward: 1.50, UsedTime: 400s"
    "TotalStep: 10e5, TargetReward: 1.62, UsedTime: 1000s"

    '''train and evaluate'''
    train_and_evaluate(args)
    # args.rollout_num = 8
    # train_and_evaluate__multiprocessing(args)  # try multiprocessing in the complete version
    exit()
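
# Added usage sketch: assuming this module is run as a script (it mirrors the tutorial run.py
# that the commented imports above refer to), pick one of the demo functions as the entry point.
# Only one runs at a time, since each demo ends with exit().
if __name__ == '__main__':
    run__demo()
    # demo1_discrete_action_space()
    # demo2_continuous_action_space()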