def td3(env, hyper, policy="MlpPolicy", verbose=0, tensorboard_log=None, seed=0,
        use_sde=True, learning_starts=100, device="auto"):
    policy_kwargs = make_policy_kwargs(hyper, "td3")
    hyper = action_noise(hyper, "td3", n_actions=env.action_space.shape[0])
    # optimize_memory_usage=False, policy_delay=2, target_policy_noise=0.2, target_noise_clip=0.5,
    model = TD3(policy,
                env,
                verbose=verbose,
                tensorboard_log=tensorboard_log,
                seed=seed,
                gamma=hyper['params_gamma'],
                learning_rate=hyper['params_lr'],
                batch_size=int(hyper['params_batch_size']),
                buffer_size=int(hyper['params_buffer_size']),
                action_noise=hyper['params_action_noise'],
                train_freq=int(hyper['params_train_freq']),
                gradient_steps=int(hyper['params_train_freq']),
                n_episodes_rollout=int(hyper['params_n_episodes_rollout']),
                learning_starts=learning_starts,
                policy_kwargs=policy_kwargs,
                device=device)
    return model
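# Hedged usage sketch for the td3() factory above. The hyperparameter keys mirror the
# ones the factory reads; the concrete values, the Pendulum environment, and the
# example_* function name are illustrative assumptions. make_policy_kwargs() and
# action_noise() are the project's own helpers (not defined here) and are assumed to
# derive 'params_action_noise' from the dict before the TD3 model is constructed.
def example_train_with_td3_factory():
    import gym

    env = gym.make("Pendulum-v1")
    hyper = {
        "params_gamma": 0.99,            # discount factor
        "params_lr": 1e-3,               # learning rate
        "params_batch_size": 256,
        "params_buffer_size": 100_000,
        "params_train_freq": 1,          # also reused for gradient_steps in td3()
        "params_n_episodes_rollout": 1,
    }
    model = td3(env, hyper, tensorboard_log="logs/td3_example")
    model.learn(total_timesteps=10_000)
    return model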
def train():
    best_reward, best_reward_timesteps = None, None
    save_path = "model_save/" + MODEL_PATH + "/"
    if save_path is not None:
        os.makedirs(save_path, exist_ok=True)
    # log_dir = f"model_save/"
    log_dir = save_path
    env, env_eval = ENV(util='train', par=PARAM, dt=DT), ENV(util='val', par=PARAM, dt=DT)
    env, env_eval = Monitor(env, log_dir), Monitor(env_eval, log_dir)
    env, env_eval = DummyVecEnv([lambda: env]), DummyVecEnv([lambda: env_eval])
    # env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10.)
    if PARAM['algo'] == 'td3':
        model = TD3('MlpPolicy', env, verbose=1, batch_size=PARAM['batch_size'],
                    seed=PARAM['seed'], learning_starts=PARAM['learning_starts'])
    elif PARAM['algo'] == 'ddpg':
        model = DDPG('MlpPolicy', env, verbose=1, batch_size=PARAM['batch_size'],
                     seed=PARAM['seed'], learning_starts=PARAM['learning_starts'])
    elif PARAM['algo'] == 'ppo':
        model = PPO('MlpPolicy', env, verbose=1, batch_size=PARAM['batch_size'],
                    seed=PARAM['seed'])
    eval_callback = EvalCallback(env_eval,
                                 best_model_save_path=save_path + MODEL_PATH + '_best_model',
                                 log_path=log_dir,
                                 eval_freq=PARAM['eval_freq'],
                                 save_freq=PARAM['save_freq'],
                                 deterministic=True,
                                 render=False)
    model.learn(total_timesteps=int(PARAM['total_time_step']),
                callback=eval_callback, log_interval=500)
    print("best mean reward:", eval_callback.best_mean_reward_overall,
          "timesteps:", eval_callback.best_mean_reward_timestep)
    model.save(save_path + MODEL_PATH + '_final_timesteps')
def __init__(self,
             policy: Union[str, Type[TD3Policy]],
             env: str,
             mapper: Callable[[Tensor], Union[Tensor, np.ndarray, list]] = None,
             verbose: bool = True,
             tensorboard_log: str = "log/") -> None:
    vecEnv = self._get_env(env, mapper)
    self._env = vecEnv
    self._policy = policy
    self._ddpg = TD3(self._policy,
                     self._env,
                     learning_rate=LEARNING_RATE,
                     buffer_size=BUFFER_SIZE,
                     learning_starts=LEARNING_STARTS,
                     batch_size=BATCH_SIZE,
                     tau=TAU,
                     gamma=GAMMA,
                     policy_delay=POLICY_DELAY,
                     train_freq=(N_EPISODES_ROLLOUT, 'episode'),
                     policy_kwargs={"agent_num": vecEnv.agent_num()},
                     verbose=verbose,
                     tensorboard_log=tensorboard_log)
    self._ddpg.replay_buffer = MultiAgentReplayBuffer(
        buffer_size=BUFFER_SIZE,
        observation_space=vecEnv.observation_space,
        action_space=vecEnv.action_space,
        device=self._ddpg.replay_buffer.device,
        n_envs=len(vecEnv.envs),
        n_agent=vecEnv.agent_num(),
        optimize_memory_usage=self._ddpg.replay_buffer.optimize_memory_usage)
def create_model(env, algorithm, save_path):
    # the noise object
    n_actions = env.action_space.shape[-1]
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                sigma=0.2 * np.ones(n_actions),
                                                theta=0.15)
    if algorithm == "ddpg":
        return DDPG(DDPG_MlpPolicy, env,
                    learning_rate=0.001,
                    buffer_size=1000000,
                    batch_size=64,
                    tau=0.001,
                    gamma=0.99,
                    train_freq=(10, "step"),
                    action_noise=action_noise,
                    policy_kwargs=dict(optimizer_class=th.optim.AdamW),
                    tensorboard_log=save_path)
    elif algorithm == "td3":
        return TD3(TD3_MlpPolicy, env, action_noise=action_noise, tensorboard_log=save_path)
    elif algorithm == "sac":
        return SAC(SAC_MlpPolicy, env, action_noise=action_noise, tensorboard_log=save_path)
    else:
        raise Exception("--> Alican's LOG: Unknown agent type!")
def train_TD3(self, model_name, model_params=config.TD3_PARAMS):
    """TD3 model"""
    from stable_baselines3 import TD3
    from stable_baselines3.td3.policies import MlpPolicy
    from stable_baselines3.common.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise

    env_train = self.env
    n_actions = env_train.action_space.shape[-1]
    action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions))

    start = time.time()
    model = TD3('MlpPolicy', env_train,
                batch_size=model_params['batch_size'],
                buffer_size=model_params['buffer_size'],
                learning_rate=model_params['learning_rate'],
                action_noise=action_noise,
                verbose=model_params['verbose'],
                tensorboard_log=f"{config.TENSORBOARD_LOG_DIR}/{model_name}")
    model.learn(total_timesteps=model_params['timesteps'], tb_log_name="TD3_run")
    end = time.time()

    model.save(f"{config.TRAINED_MODEL_DIR}/{model_name}")
    print('Training time (TD3): ', (end - start) / 60, ' minutes')
    return model
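# Hedged sketch of the model_params dict consumed by train_TD3() above. Only the keys
# the method actually reads ('batch_size', 'buffer_size', 'learning_rate', 'verbose',
# 'timesteps') are shown; the values and the EXAMPLE_TD3_PARAMS name are illustrative
# assumptions, not the project's config.TD3_PARAMS.
EXAMPLE_TD3_PARAMS = {
    "batch_size": 128,
    "buffer_size": 100_000,
    "learning_rate": 1e-3,
    "verbose": 1,
    "timesteps": 50_000,
}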
def load_model(env, algorithm, filename):
    if algorithm == "ddpg":
        return DDPG.load(filename, env=env)
    elif algorithm == "td3":
        return TD3.load(filename, env=env)
    elif algorithm == "sac":
        return SAC.load(filename, env=env)
    else:
        raise Exception("--> Alican's LOG: Unknown agent type!")
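# Hedged round-trip sketch for create_model() and load_model() above, assuming a
# continuous-action Gym environment; the Pendulum env, the save path, and the
# example_* function name are illustrative assumptions.
def example_save_and_reload_td3():
    import gym

    env = gym.make("Pendulum-v1")
    model = create_model(env, "td3", save_path="logs/td3_example")
    model.learn(total_timesteps=1_000)
    model.save("logs/td3_example/model")

    # Reload the saved weights into a fresh model bound to the same environment
    reloaded = load_model(env, "td3", "logs/td3_example/model")
    return reloaded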
def test_td3(action_noise):
    model = TD3('MlpPolicy',
                'Pendulum-v0',
                policy_kwargs=dict(net_arch=[64, 64]),
                learning_starts=100,
                verbose=1,
                create_eval_env=True,
                action_noise=action_noise)
    model.learn(total_timesteps=1000, eval_freq=500)
def test_save_load_large_model(tmp_path):
    """
    Test saving and loading a model with a large policy that is greater than 2GB.
    We test only one algorithm since all algorithms share the same code
    for loading and saving the model.
    """
    env = select_env(TD3)
    kwargs = dict(policy_kwargs=dict(net_arch=[8192, 8192, 8192]), device="cpu")
    model = TD3("MlpPolicy", env, **kwargs)

    # test saving
    model.save(tmp_path / "test_save")

    # test loading
    model = TD3.load(str(tmp_path / "test_save.zip"), env=env, **kwargs)

    # clear file from os
    os.remove(tmp_path / "test_save.zip")
def train_td3():
    log_dir = "model_save/"
    env = ENV(istest=False)
    env = Monitor(env, log_dir)
    env = DummyVecEnv([lambda: env])
    # env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10.)
    model = TD3("CnnPolicy", env,
                policy_kwargs=policy_kwargs,
                verbose=1,
                batch_size=2048,
                seed=1,
                learning_starts=500000)
    callback = SaveOnBestTrainingRewardCallback(check_freq=480, log_dir=log_dir)
    model.learn(total_timesteps=int(1000000), callback=callback, log_interval=480)
    model.save('model_save/td3_cnn')
def __init__(self, env, hyperparameters=DEFAULT_HYPERPARAMETERS):
    self.P = hyperparameters
    if self.P["model_class"] == "dqn":
        from stable_baselines3 import DQN
        self.model = DQN('MlpPolicy', env, verbose=self.P["verbose"])
        self.model_class = DQN
    elif self.P["model_class"] == "a2c":
        from stable_baselines3 import A2C
        from stable_baselines3.a2c import MlpPolicy
        self.model = A2C(MlpPolicy, env, verbose=self.P["verbose"])
        self.model_class = A2C
    elif self.P["model_class"] == "ddpg":
        from stable_baselines3 import DDPG
        from stable_baselines3.common.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise
        n_actions = env.action_space.shape[-1]
        action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions))
        self.model = DDPG('MlpPolicy', env, action_noise=action_noise, verbose=self.P["verbose"])
        self.model_class = DDPG
    elif self.P["model_class"] == "td3":
        from stable_baselines3 import TD3
        from stable_baselines3.td3.policies import MlpPolicy
        from stable_baselines3.common.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise
        n_actions = env.action_space.shape[-1]
        action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions))
        self.model = TD3(MlpPolicy, env, action_noise=action_noise, verbose=self.P["verbose"])
        self.model_class = TD3
    elif self.P["model_class"] == "ppo":
        from stable_baselines3 import PPO
        from stable_baselines3.ppo import MlpPolicy
        self.model = PPO(MlpPolicy, env, verbose=self.P["verbose"])
        self.model_class = PPO
    elif self.P["model_class"] == "sac":
        from stable_baselines3 import SAC
        from stable_baselines3.sac import MlpPolicy
        self.model = SAC(MlpPolicy, env, verbose=self.P["verbose"])
        self.model_class = SAC
    else:
        raise NotImplementedError()
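# Hedged sketch of the hyperparameter dict expected by the __init__ above. Only the two
# keys it actually reads ("model_class", "verbose") are shown; the values and the
# EXAMPLE_HYPERPARAMETERS name are illustrative assumptions about DEFAULT_HYPERPARAMETERS.
EXAMPLE_HYPERPARAMETERS = {
    "model_class": "td3",  # one of: dqn, a2c, ddpg, td3, ppo, sac
    "verbose": 1,
}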
def test(MODEL_TEST):
    log_dir = "model_save/" + MODEL_PATH + "/" + MODEL_PATH + MODEL_TEST
    env = ENV(util='test', par=PARAM, dt=DT)
    env.render = True
    env = Monitor(env, log_dir)
    if PARAM['algo'] == 'td3':
        model = TD3.load(log_dir)
    elif PARAM['algo'] == 'ddpg':
        model = DDPG.load(log_dir)
    elif PARAM['algo'] == 'ppo':
        model = PPO.load(log_dir)
    # plot_results(f"model_save/")
    trade_dt = pd.DataFrame([])   # trade_dt: trading data for all stocks
    result_dt = pd.DataFrame([])  # result_dt: one-year test results for all stocks
    for i in range(TEST_STOCK_NUM):
        state = env.reset()
        stock_bh_id = 'stock_bh_' + str(i)          # buy-and-hold record for each stock
        stock_port_id = 'stock_port_' + str(i)      # portfolio record for each stock
        stock_action_id = 'stock_action_' + str(i)  # action record for each stock
        flow_L_id = 'stock_flow_' + str(i)          # cash-flow record for each stock
        stock_bh_dt, stock_port_dt, action_policy_dt, flow_L_dt = [], [], [], []
        day = 0
        while True:
            action = model.predict(state)
            next_state, reward, done, info = env.step(action[0])
            state = next_state
            # print("trying:", day, "reward:", reward, "now profit:", env.profit)
            # record the trading policy at each step
            stock_bh_dt.append(env.buy_hold)
            stock_port_dt.append(env.Portfolio_unit)
            action_policy_dt.append(action[0][0])  # record the policy
            flow_L_dt.append(env.flow)
            day += 1
            if done:
                print('stock: {}, total profit: {:.2f}%, buy hold: {:.2f}%, sp: {:.4f}, mdd: {:.2f}%, romad: {:.4f}'
                      .format(i, env.profit * 100, env.buy_hold * 100, env.sp, env.mdd * 100, env.romad))
                # after trading, record: stock ID, profit (in %), buy & hold (in %),
                # Sharpe ratio, max drawdown (in %), RoMaD
                result = pd.DataFrame([[i, env.profit * 100, env.buy_hold * 100, env.sp, env.mdd * 100, env.romad]])
                break
        trade_dt_stock = pd.DataFrame({stock_port_id: stock_port_dt,
                                       stock_bh_id: stock_bh_dt,
                                       stock_action_id: action_policy_dt,
                                       flow_L_id: flow_L_dt})  # trading data for this stock
        trade_dt = pd.concat([trade_dt, trade_dt_stock], axis=1)  # merge trading data of all stocks (add columns)
        result_dt = pd.concat([result_dt, result], axis=0)        # merge result data of all stocks (add rows)
    result_dt.columns = ['stock_id', 'profit(100%)', 'buy_hold(100%)', 'sp', 'mdd(100%)', 'romad']
    trade_dt.to_csv('out_dt/trade_' + MODEL_PATH + '.csv', index=False)
    result_dt.to_csv('out_dt/result_' + MODEL_PATH + '.csv', index=False)
def main():
    args = parse_arguments()
    load_path = os.path.join("logs", args.env, args.agent, "best_model.zip")
    stats_path = os.path.join(args.log_dir, args.env, args.agent, "vec_normalize.pkl")

    if args.agent == 'ddpg':
        from stable_baselines3 import DDPG
        model = DDPG.load(load_path)
    elif args.agent == 'td3':
        from stable_baselines3 import TD3
        model = TD3.load(load_path)
    elif args.agent == 'ppo':
        from stable_baselines3 import PPO
        model = PPO.load(load_path)

    env = make_vec_env(args.env, n_envs=1)
    env = VecNormalize.load(stats_path, env)
    # do not update them at test time
    env.training = False
    # reward normalization is not needed at test time
    env.norm_reward = False
    # env = gym.make(args.env)

    img = []
    if args.render:
        env.render('human')
    done = False
    obs = env.reset()
    action = model.predict(obs)
    if args.gif:
        img.append(env.render('rgb_array'))
    if args.timesteps is None:
        while not done:
            action, _ = model.predict(obs)
            obs, reward, done, info = env.step(action)
            if args.gif:
                img.append(env.render('rgb_array'))
            else:
                env.render()
    else:
        for i in range(args.timesteps):
            action, _ = model.predict(obs)
            obs, reward, done, info = env.step(action)
            if args.gif:
                img.append(env.render('rgb_array'))
            else:
                env.render()
    if args.gif:
        imageio.mimsave(os.path.join("logs", args.env, args.agent, "recording.gif"),
                        [np.array(img) for i, img in enumerate(img) if i % 2 == 0],
                        fps=29)
def play():
    model = TD3.load("models/kuka_iiwa_insertion-v0")
    env = gym.make('kuka_iiwa_insertion-v0', use_gui=True)
    obs = env.reset()
    i = 0
    while True:
        i += 1
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        if i % 100 == 0 or dones:
            print(obs, rewards, dones, info)
        if dones:
            print("=" * 20 + " RESET " + "=" * 20)
            env.reset()
def create(self, n_envs=1):
    """Create the agent"""
    self.env = self.agent_helper.env
    log_dir = self.agent_helper.config_dir
    os.makedirs(log_dir, exist_ok=True)
    self.env = Monitor(self.env, log_dir)
    # TODO: Create the DDPG policy and define its hyperparameters here,
    # including the action space and observation space.
    # add policy
    policy_name = self.agent_helper.config['policy']
    self.policy = eval(policy_name)
    # action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions))
    n_actions = int(self.agent_helper.env.action_space.shape[0])
    action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                     sigma=self.agent_helper.config['rand_sigma'] * np.ones(n_actions))
    # FIXME: test:
    # self.model = DDPG("MlpPolicy", self.env, action_noise=action_noise, verbose=1,
    #                   tensorboard_log=self.agent_helper.graph_path)
    # TODO: fix the observation space and action space later. Test whether the observation
    # space input and the output action space are correct.
    # activ_function_name = self.agent_helper.config['nn_activ']
    # activ_function = eval(activ_function_name)
    # policy_kwargs = dict(activation_fn=activ_function,
    #                      net_arch=[dict(pi=[32, 32], qf=[32, 32])])
    policy_kwargs = dict(net_arch=self.agent_helper.config['layers'])
    self.model = TD3(
        self.policy,
        self.env,
        learning_rate=self.agent_helper.config['learning_rate'],
        buffer_size=self.agent_helper.config['buffer_size'],
        batch_size=self.agent_helper.config['batch_size'],
        tau=self.agent_helper.config['tau'],
        gamma=self.agent_helper.config['gamma'],
        gradient_steps=self.agent_helper.config['gradient_steps'],
        action_noise=action_noise,
        optimize_memory_usage=self.agent_helper.config['optimize_memory_usage'],
        create_eval_env=self.agent_helper.config['create_eval_env'],
        policy_kwargs=policy_kwargs,
        verbose=self.agent_helper.config['verbose'],
        learning_starts=self.agent_helper.config['learning_starts'],
        tensorboard_log=self.agent_helper.graph_path,
        policy_delay=self.agent_helper.config['policy_delay'],
        target_policy_noise=self.agent_helper.config['target_policy_noise'],
        target_noise_clip=self.agent_helper.config['target_noise_clip'],
        seed=self.agent_helper.seed
    )
def train_td3():
    log_dir = "model_save/"
    env = ENV(istest=False)
    env = Monitor(env, log_dir)
    env = DummyVecEnv([lambda: env])
    # env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10.)
    n_actions = env.action_space.shape[-1]
    action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions))
    # model = TD3("MlpPolicy", env, action_noise=action_noise, verbose=1, batch_size=2048, seed=1)
    model = TD3('MlpPolicy', env, verbose=1, batch_size=2048, seed=1, learning_starts=1440)
    callback = SaveOnBestTrainingRewardCallback(check_freq=480, log_dir=log_dir)
    model.learn(total_timesteps=int(2880), callback=callback, log_interval=100)
    model.save('model_save/td3_sp2')
def main():
    n_envs = 8
    env_id = "CartPole-v0"

    # def env_fn():
    #     return continuous_actions(gym.make(env_id))

    env = env_fn()
    # print(env.observation_space)
    # obs_size, = env.observation_space.shape
    # act_size = env.action_space.n
    sb3_env = SpaceWrap(env)
    # print(sb3_env.action_space)
    # exit(0)
    n_timesteps = 1000
    save_path = "log"
    eval_freq = 50
    tensorboard_log = ""
    sb3_learner_fn = lambda device: TD3(env=sb3_env,
                                        tensorboard_log=tensorboard_log,
                                        policy=MlpPolicy,
                                        device=device)
    learner_fn = lambda: SB3LearnWrapper(sb3_learner_fn("cuda"))
    policy_fn = lambda: SB3Wrapper(sb3_learner_fn("cuda").policy)
    example_policy_fn = lambda: SB3Wrapper(sb3_learner_fn("cpu").policy)
    # learner = (model)
    learn_rate = lambda x: 0.01
    # policy = SB3Wrapper(model.policy)
    # MlpPolicy(env.observation_space, env.action_space, learn_rate, device="cpu")
    data_store_size = 12800
    batch_size = 512
    logger = make_logger("log")
    run_loop(
        logger,
        learner_fn,  # A2CLearner(policy, 0.001, 0.99, logger, device),
        OccasionalUpdate(10, example_policy_fn()),
        lambda: StatelessActor(policy_fn()),
        env_fn,
        MakeCPUAsyncConstructor(4),
        lambda: TransitionAdder(env.observation_space, env.action_space),
        UniformSampleScheme(data_store_size),
        data_store_size,
        batch_size,
        n_envs=16,
        log_frequency=5)
def test_td3():
    log_dir = "model_save/best_model_td3_cnn"
    env = ENV(istest=True)
    env.render = True
    env = Monitor(env, log_dir)
    model = TD3.load(log_dir)
    plot_results("model_save/")
    for i in range(10):
        state = env.reset()
        while True:
            action = model.predict(state)
            next_state, reward, done, info = env.step(action[0])
            state = next_state
            # print("trying:", i, "action:", action, "now profit:", env.profit)
            if done:
                print('stock', i, ' total profit=', env.profit, ' buy hold=', env.buy_hold)
                break
def test_td3_train_with_batch_norm():
    model = TD3(
        "MlpPolicy",
        "Pendulum-v0",
        policy_kwargs=dict(net_arch=[16, 16], features_extractor_class=FlattenBatchNormDropoutExtractor),
        learning_starts=0,
        tau=0,  # do not copy the target
        seed=1,
    )

    (
        actor_bias_before,
        actor_running_mean_before,
        critic_bias_before,
        critic_running_mean_before,
        actor_target_bias_before,
        actor_target_running_mean_before,
        critic_target_bias_before,
        critic_target_running_mean_before,
    ) = clone_td3_batch_norm_stats(model)

    model.learn(total_timesteps=200)

    (
        actor_bias_after,
        actor_running_mean_after,
        critic_bias_after,
        critic_running_mean_after,
        actor_target_bias_after,
        actor_target_running_mean_after,
        critic_target_bias_after,
        critic_target_running_mean_after,
    ) = clone_td3_batch_norm_stats(model)

    assert ~th.isclose(actor_bias_before, actor_bias_after).all()
    assert ~th.isclose(actor_running_mean_before, actor_running_mean_after).all()
    assert ~th.isclose(critic_bias_before, critic_bias_after).all()
    assert ~th.isclose(critic_running_mean_before, critic_running_mean_after).all()
    assert th.isclose(actor_target_bias_before, actor_target_bias_after).all()
    assert th.isclose(actor_target_running_mean_before, actor_target_running_mean_after).all()
    assert th.isclose(critic_target_bias_before, critic_target_bias_after).all()
    assert th.isclose(critic_target_running_mean_before, critic_target_running_mean_after).all()
def prepare_stage(self):
    dir = f'experiments/{self.config.experiment_name}'
    if not os.path.exists(dir):
        os.mkdir(dir)
    else:
        # recovers the latest non-corrupted checkpoint, if existent
        checkpoints = []
        for file in glob.glob(f'{dir}/status_checkpoint*'):
            checkpoints.append(int(file.split('/status_checkpoint_')[1].split('.')[0]))
        checkpoints.sort()
        attempts = len(checkpoints) - 1
        while attempts >= 0:
            try:
                f = open(f'{dir}/status_checkpoint_{checkpoints[attempts]}.pkl', 'rb')
                self.results_episodes, self.results_episodes_validation, self.current_checkpoint, self.current_episode = pickle.load(f)
                # only recovers pickle if model also available
                env2 = DummyVecEnv([lambda: self.env])
                self.model = TD3.load(f'{dir}/model_checkpoint_{checkpoints[attempts]}', env=env2)
                self.log.write(f'RECOVERED checkpoint {checkpoints[attempts]}')
                attempts = -1
            except:
                self.log.write(
                    f'ERROR: Could not recover checkpoint {checkpoints[attempts]} {traceback.format_exc()}')
                self.results_episodes, self.results_episodes_validation, self.current_checkpoint, self.current_episode = [], [], 0, 0
                attempts -= 1
def main():
    n_envs = 8
    env_id = "CartPole-v0"

    # def env_fn():
    #     return continuous_actions(gym.make(env_id))

    env = env_fn()
    # print(env.observation_space)
    # obs_size, = env.observation_space.shape
    # act_size = env.action_space.n
    sb3_env = SpaceWrap(env)
    # print(sb3_env.action_space)
    # exit(0)
    n_timesteps = 1000
    save_path = "log"
    eval_freq = 50
    tensorboard_log = ""
    model = TD3(env=sb3_env, tensorboard_log=tensorboard_log, policy=MlpPolicy)
    learner = SB3LearnWrapper(model)
    device = "cpu"
    learn_rate = lambda x: 0.01
    policy = SB3Wrapper(model.policy)
    # MlpPolicy(env.observation_space, env.action_space, learn_rate, device="cpu")
    data_store_size = 12800
    batch_size = 16
    logger = make_logger("log")
    run_loop(
        logger,
        lambda: learner,  # A2CLearner(policy, 0.001, 0.99, logger, device),
        NoUpdate(),  # .10, policy),
        lambda: StatelessActor(policy),
        env_fn,
        ConcatVecEnv,
        lambda: TransitionAdder(env.observation_space, env.action_space),
        UniformSampleScheme(data_store_size),
        data_store_size,
        batch_size,
        n_envs=16,
        log_frequency=5)
def __init__(self, model='a2c', use_gp=False, gp_params=None, **kwargs):
    # wrapper around stable_baselines RL implementations
    assert model in ACCEPTED_MODELS, 'Unknown RL model, must be in {}'.format(ACCEPTED_MODELS)
    if model == 'a2c':
        self.rl = A2C(**kwargs)
    elif model == 'ppo':
        self.rl = PPO(**kwargs)
    elif model == 'dqn':
        self.rl = DQN(**kwargs)
    elif model == 'td3':
        self.rl = TD3(**kwargs)

    self.use_gp = use_gp
    if self.use_gp:
        assert gp_params is not None, 'Must provide parameters such as training data, number of iterations, etc. for GPR'
        self.n_train = gp_params['n_train']
        self.retraining_iter = gp_params['training_iter']
        self.cvar_limit = gp_params['cvar_limit']
        self.gp_limit = gp_params['gp_limit']

        self.likelihood = gpytorch.likelihoods.GaussianLikelihood()
        if 'data' in gp_params.keys():
            self.X_train = gp_params['data']['X_train']
            self.y_train = gp_params['data']['y_train']
        else:
            # hard coded to match dimensions of features
            self.X_train = torch.zeros(self.n_train, kwargs['env'].num_features)
            self.y_train = torch.zeros(self.n_train)
        self.gp = ExactGPModel(self.X_train, self.y_train, self.likelihood)
        self.mll = gpytorch.mlls.ExactMarginalLogLikelihood(self.likelihood, self.gp)
        self.opt = torch.optim.Adam(self.gp.parameters(), lr=0.1)

        self.shares = 0
        self.cash = 0
        self.obs = []  # holds up to 2 past observations, helps in keeping X, y aligned

        # for plotting
        self.pred_return = 0
        self.pred_lower = 0
        self.pred_upper = 0

        # for debugging
        self.goal_num_shares = 0
def run(env, algname, filename):
    if algname == "TD3":
        model = TD3.load(f"{algname}_pkl")
    elif algname == "SAC":
        if filename:
            model = SAC.load(f"{filename}")
        else:
            model = SAC.load(f"{algname}_pkl")
    elif algname == "DDPG":
        model = DDPG.load(f"{algname}_pkl")
    else:
        raise ValueError("Wrong algorithm name provided.")

    obs = env.reset()
    while True:
        action, _states = model.predict(obs)
        obs, rewards, done, info = env.step(action)
        env.render()
        if done:
            break
def train_TD3(env):
    print(f"action space shape -1: {env.action_space.shape[-1]}")
    # The noise objects for TD3
    n_actions = env.action_space.shape[-1]
    action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.02 * np.ones(n_actions))
    model = TD3(MlpPolicy, env,
                learning_rate=0.0003,
                buffer_size=100000,
                action_noise=action_noise,
                batch_size=128,
                learning_starts=128,
                verbose=1)
    model.learn(total_timesteps=2000000, log_interval=10)
    model.save("TD3_pkl")
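# Hedged end-to-end sketch: train_TD3() above saves to "TD3_pkl", which matches the
# path run() loads when algname == "TD3". The Pendulum environment and the example_*
# function name are illustrative assumptions; training runs for the full 2,000,000
# timesteps hard-coded in train_TD3().
def example_train_then_run_td3():
    import gym

    env = gym.make("Pendulum-v1")
    train_TD3(env)                   # trains and saves "TD3_pkl"
    run(env, "TD3", filename=None)   # reloads the saved model and rolls out one episode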
def test_td3():
    log_dir = "model_save/best_model_td3_sp2"
    env = ENV(istest=True)
    env.render = True
    env = Monitor(env, log_dir)
    model = TD3.load(log_dir)
    plot_results("model_save/")
    for i in range(10):
        state = env.reset()
        day = 0
        while True:
            action = model.predict(state)
            next_state, reward, done, info = env.step(action[0])
            state = next_state
            # print("trying:", day, "reward:", reward, "now profit:", env.profit)
            day += 1
            if done:
                print('stock: {}, total profit: {:.2f}%, buy hold: {:.2f}%, sp: {:.4f}, mdd: {:.2f}%, romad: {:.4f}'
                      .format(i, env.profit * 100, env.buy_hold * 100, env.sp, env.mdd * 100, env.romad))
                break
def learn(self, initial_models):
    mesa_algo = TD3(
        "MlpPolicy", self.env, verbose=1, learning_starts=1
    )  # Note: unnecessarily initializes parameters (could speed up a bit by fixing)
    mesa_algo.set_parameters(to_torch(initial_models), exact_match=False)

    LOG_DIR = "/home/jet/catkin_ws/src/marsha/marsha_ai/training/logs/"
    MODEL_DIR = "/home/jet/catkin_ws/src/marsha/marsha_ai/training/models/"

    callback_list = []
    callback_list.append(TensorboardCallback())
    callback_list.append(StopTrainingOnMaxEpisodes(max_episodes=5, verbose=1))
    """callback_list.append(EvalCallback(self.env,
                                         best_model_save_path=MODEL_DIR,
                                         log_path=LOG_DIR,
                                         deterministic=True,
                                         eval_freq=5,
                                         n_eval_episodes=1))"""

    mesa_algo.learn(total_timesteps=1000,
                    callback=callback_list)  # rospy.get_param("/hyperparameters/total_timesteps")
    print("finished training! Testing mesa network...")

    test_buffer = ReplayBuffer(100, TaskEnv.observation_space, TaskEnv.action_space, device="cuda")
    test_env = Monitor(self.env)
    done = False
    ob = test_env.reset()
    while not done:
        action, state = mesa_algo.predict(ob)
        next_ob, reward, done, info = test_env.step(action)
        test_buffer.add(ob, next_ob, action, reward, done, [info])
        ob = next_ob

    meta_buffer = {"test": test_buffer, "train": mesa_algo.replay_buffer}
    optimized_mesa_parameters = mesa_algo.get_parameters()
    tf_mesa_models = from_torch(optimized_mesa_parameters)
    return meta_buffer, tf_mesa_models
def train():
    log_dir = "model_save/"
    env = ENV(istest=False)
    env = Monitor(env, log_dir)
    env = DummyVecEnv([lambda: env])
    # env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10.)
    model = TD3('MlpPolicy', env,
                verbose=1,
                batch_size=PARAM['batch_size'],
                seed=PARAM['seed'],
                learning_starts=PARAM['learning_starts'])
    callback = SaveOnBestTrainingRewardCallback(check_freq=480, log_dir=log_dir)
    model.learn(total_timesteps=int(PARAM['total_time_step']), callback=callback, log_interval=480)
    model.save('model_save/' + MODEL_PATH)
def __init__(self):
    # self.observation_space = TaskEnv.observation_space
    # self.action_space = TaskEnv.action_space
    self.ros_interface = CatchInterface()
    self.env = MarshaGym(self.ros_interface)
    self.mesa_algo = TD3("MlpPolicy", self.env)
    self.tasks = [Task(self.env), Task(self.env)]
    self.replay_buffer = None  # will use one of the task replay buffers
    self.lambda_reg = 2.0  # regularization strength (2.0 according to the iMAML paper)
    self.meta_models = {
        "actor": Actor(),
        "critic_0": Critic(),
        "critic_1": Critic()
    }
    self.loss_functions = [self.actor_loss, self.critic_loss]
    self.optimized_mesa_models = None
def make_model(config, env):
    policy = config["policy_name"]
    if config["policy_name"] == "CustomTCNPolicy":
        policy = customActorCriticPolicyWrapper(
            env.observation_space.shape[0] // config["obs_input"], config["obs_input"])

    tb_log = None
    if config["tensorboard_log"]:
        tb_log = "./tb/{}/".format(config["session_ID"])

    ou_noise = None
    if config["ou_noise"]:
        ou_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(env.action_space.shape[0]),
                                                sigma=config["ou_sigma"] * np.ones(env.action_space.shape[0]),
                                                theta=config["ou_theta"],
                                                dt=config["ou_dt"],
                                                initial_noise=None)

    model = TD3(policy=policy,
                env=env,
                buffer_size=config["buffer_size"],
                learning_starts=config["learning_starts"],
                action_noise=ou_noise,
                target_policy_noise=config["target_policy_noise"],
                target_noise_clip=config["target_noise_clip"],
                gamma=config["gamma"],
                tau=config["tau"],
                learning_rate=eval(config["learning_rate"]),
                verbose=config["verbose"],
                tensorboard_log=tb_log,
                device="cpu",
                policy_kwargs=dict(net_arch=[int(config["policy_hid_dim"]),
                                             int(config["policy_hid_dim"])]))
    return model
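# Hedged sketch of the config dict consumed by make_model() above. The keys are the
# ones the function reads; the values and the EXAMPLE_TD3_CONFIG name are illustrative
# assumptions. Note that "learning_rate" is passed through eval(), so it is a string.
EXAMPLE_TD3_CONFIG = {
    "policy_name": "MlpPolicy",
    "obs_input": 1,
    "tensorboard_log": False,
    "session_ID": "TD3_example",
    "ou_noise": True,
    "ou_sigma": 0.2,
    "ou_theta": 0.15,
    "ou_dt": 1e-2,
    "buffer_size": 100_000,
    "learning_starts": 1_000,
    "target_policy_noise": 0.2,
    "target_noise_clip": 0.5,
    "gamma": 0.99,
    "tau": 0.005,
    "learning_rate": "1e-3",
    "verbose": 1,
    "policy_hid_dim": 64,
}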
def main():
    # Create log dir
    log_dir = './td3_data'
    os.makedirs(log_dir, exist_ok=True)
    vix_env = trading_vix_env.trading_vix_env()
    env = Monitor(vix_env, log_dir)

    # Create action noise because TD3 and DDPG use a deterministic policy
    n_actions = env.action_space.shape[-1]
    action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions))

    # Create the callback: check every 20000 steps
    callback = custom_call_back.CustomCallback(check_freq=20000, log_dir=log_dir)

    # Create RL model
    model = TD3('MlpPolicy', env, action_noise=action_noise, verbose=2, batch_size=10000)

    # Train the agent
    model.learn(total_timesteps=int(5e9), callback=callback)
algo = ARGS.exp.split("-")[2]

if os.path.isfile(ARGS.exp + '/success_model.zip'):
    path = ARGS.exp + '/success_model.zip'
elif os.path.isfile(ARGS.exp + '/best_model.zip'):
    path = ARGS.exp + '/best_model.zip'
else:
    print("[ERROR]: no model under the specified path", ARGS.exp)

if algo == 'a2c':
    model = A2C.load(path)
if algo == 'ppo':
    model = PPO.load(path)
if algo == 'sac':
    model = SAC.load(path)
if algo == 'td3':
    model = TD3.load(path)
if algo == 'ddpg':
    model = DDPG.load(path)

#### Parameters to recreate the environment ################
env_name = ARGS.exp.split("-")[1] + "-aviary-v0"
OBS = ObservationType.KIN if ARGS.exp.split("-")[3] == 'kin' else ObservationType.RGB

if ARGS.exp.split("-")[4] == 'rpm':
    ACT = ActionType.RPM
elif ARGS.exp.split("-")[4] == 'dyn':
    ACT = ActionType.DYN
elif ARGS.exp.split("-")[4] == 'pid':
    ACT = ActionType.PID
elif ARGS.exp.split("-")[4] == 'vel':
    ACT = ActionType.VEL