def sample_td3_params(trial):
    """
    Sampler for TD3 hyperparams.

    :param trial: (optuna.trial)
    :return: (dict)
    """
    gamma = trial.suggest_categorical('gamma', [0.9, 0.95, 0.98, 0.99, 0.995, 0.999, 0.9999])
    learning_rate = trial.suggest_loguniform('lr', 1e-5, 1)
    batch_size = trial.suggest_categorical('batch_size', [16, 32, 64, 100, 128, 256, 512])
    buffer_size = trial.suggest_categorical('buffer_size', [int(1e4), int(1e5), int(1e6)])
    train_freq = trial.suggest_categorical('train_freq', [1, 10, 100, 1000, 2000])
    gradient_steps = train_freq
    noise_type = trial.suggest_categorical('noise_type', ['ornstein-uhlenbeck', 'normal'])
    noise_std = trial.suggest_uniform('noise_std', 0, 1)

    hyperparams = {
        'gamma': gamma,
        'learning_rate': learning_rate,
        'batch_size': batch_size,
        'buffer_size': buffer_size,
        'train_freq': train_freq,
        'gradient_steps': gradient_steps,
    }

    if noise_type == 'normal':
        hyperparams['action_noise'] = NormalActionNoise(mean=np.zeros(trial.n_actions),
                                                        sigma=noise_std * np.ones(trial.n_actions))
    elif noise_type == 'ornstein-uhlenbeck':
        hyperparams['action_noise'] = OrnsteinUhlenbeckActionNoise(mean=np.zeros(trial.n_actions),
                                                                   sigma=noise_std * np.ones(trial.n_actions))

    return hyperparams
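# A minimal sketch of wiring the sampler above into an Optuna study.
# Note that `trial.n_actions` is not a standard Optuna attribute: it must be
# attached to the trial by the caller (as rl-baselines-zoo does). The objective
# body below, including the environment choice and training budget, is
# illustrative only, not taken from the source.
import gym
import optuna
from stable_baselines import TD3
from stable_baselines.common.evaluation import evaluate_policy

def objective(trial):
    env = gym.make('Pendulum-v0')  # any continuous-action env works here
    trial.n_actions = env.action_space.shape[0]
    model = TD3('MlpPolicy', env, **sample_td3_params(trial))
    model.learn(total_timesteps=int(5e4))
    mean_reward, _ = evaluate_policy(model, env, n_eval_episodes=10)
    return mean_reward

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)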
def main(env):
    n_actions = env.action_space.shape[0]
    param_noise = None
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                sigma=float(0.5) * np.ones(n_actions))

    # Using only one expert trajectory
    # you can specify `traj_limitation=-1` for using the whole dataset
    file_dir = "/home/vignesh/Thesis_Suture_data/trial2/ambf_data/"
    dataset = ExpertDataset(expert_path=file_dir + 'expert_psm_data.npz',
                            traj_limitation=1, batch_size=32)

    model = DDPG(MlpPolicy, env, gamma=0.95, verbose=1, nb_train_steps=300,
                 nb_rollout_steps=150, param_noise=param_noise, batch_size=128,
                 action_noise=action_noise, random_exploration=0.05,
                 normalize_observations=True,
                 tensorboard_log="./ddpg_dvrk_tensorboard/",
                 observation_range=(-1.5, 1.5))

    model.pretrain(dataset, n_epochs=1000)
    model.save("./gail_robot_env")
def ddpg(env_id, timesteps, policy="MlpPolicy", log_interval=None,
         tensorboard_log=None, seed=None, load_weights=None):
    from stable_baselines import DDPG

    env = gym.make(env_id)
    n_actions = env.action_space.shape[-1]
    param_noise = None
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                sigma=float(0.5) * np.ones(n_actions))

    if load_weights is not None:
        model = DDPG.load(load_weights, env=env)
    else:
        model = DDPG(policy, env, verbose=1, param_noise=param_noise,
                     action_noise=action_noise, tensorboard_log=tensorboard_log)

    callback = WandbRenderEnvCallback(model_name="ddpg", env_name=env_id)

    model.learn(total_timesteps=timesteps, log_interval=log_interval,
                callback=callback)
    save_model_weights(model, "ddpg", env_id, policy, seed=seed, path=".")
def main(env: PSMCartesianDDPGEnv):
    # the noise objects for DDPG
    n_actions = env.action.action_space.shape[0]
    param_noise = None
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                sigma=float(0.5) * np.ones(n_actions))

    model = DDPG(MlpPolicy, env, gamma=0.95, verbose=1, nb_train_steps=300,
                 nb_rollout_steps=150, param_noise=param_noise, batch_size=128,
                 action_noise=action_noise, random_exploration=0.05,
                 normalize_observations=True,
                 tensorboard_log="./ddpg_dvrk_tensorboard/",
                 observation_range=(-1.5, 1.5), critic_l2_reg=0.01)

    model.learn(total_timesteps=4000000, log_interval=100,
                callback=CheckpointCallback(save_freq=100000,
                                            save_path="./ddpg_dvrk_tensorboard/"))
    model.save("./ddpg_robot_env")
def train_DDPG(self, model_name, model_params=config.DDPG_PARAMS):
    """DDPG model"""
    from stable_baselines import DDPG
    from stable_baselines.ddpg.policies import DDPGPolicy
    from stable_baselines.common.noise import OrnsteinUhlenbeckActionNoise

    env_train = self.env
    n_actions = env_train.action_space.shape[-1]
    param_noise = None
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                sigma=float(0.5) * np.ones(n_actions))

    start = time.time()
    model = DDPG('MlpPolicy', env_train,
                 batch_size=model_params['batch_size'],
                 buffer_size=model_params['buffer_size'],
                 param_noise=param_noise,
                 action_noise=action_noise,
                 verbose=model_params['verbose'])
    model.learn(total_timesteps=model_params['timesteps'])
    end = time.time()

    model.save(f"{config.TRAINED_MODEL_DIR}/{model_name}")
    print('Training time (DDPG): ', (end - start) / 60, ' minutes')
    return model
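# A hedged example of what `config.DDPG_PARAMS` might contain, inferred only
# from the keys the method above reads ('batch_size', 'buffer_size',
# 'verbose', 'timesteps'); the concrete values are assumptions, not taken
# from the actual config module.
DDPG_PARAMS = {
    'batch_size': 128,
    'buffer_size': 50000,
    'verbose': 0,
    'timesteps': 10000,
}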
def sample_ddpg_params(trial):
    """
    Sampler for DDPG hyperparams.

    :param trial: (optuna.trial)
    :return: (dict)
    """
    gamma = trial.suggest_categorical('gamma', [0.9, 0.95, 0.98, 0.99, 0.995, 0.999, 0.9999])
    # actor_lr = trial.suggest_loguniform('actor_lr', 1e-5, 1)
    # critic_lr = trial.suggest_loguniform('critic_lr', 1e-5, 1)
    learning_rate = trial.suggest_loguniform('lr', 1e-5, 1)
    batch_size = trial.suggest_categorical('batch_size', [16, 32, 64, 100, 128, 256, 512])
    buffer_size = trial.suggest_categorical('memory_limit', [int(1e4), int(1e5), int(1e6)])
    noise_type = trial.suggest_categorical('noise_type', ['ornstein-uhlenbeck', 'normal'])
    noise_std = trial.suggest_uniform('noise_std', 0, 1)
    normalize_observations = trial.suggest_categorical('normalize_observations', [True, False])
    normalize_returns = trial.suggest_categorical('normalize_returns', [True, False])

    hyperparams = {
        'gamma': gamma,
        'actor_lr': learning_rate,
        'critic_lr': learning_rate,
        'batch_size': batch_size,
        'memory_limit': buffer_size,
        'normalize_observations': normalize_observations,
        'normalize_returns': normalize_returns
    }

    if noise_type == 'normal':
        hyperparams['action_noise'] = NormalActionNoise(mean=np.zeros(1),
                                                        sigma=noise_std * np.ones(1))
    elif noise_type == 'ornstein-uhlenbeck':
        hyperparams['action_noise'] = OrnsteinUhlenbeckActionNoise(mean=np.zeros(1),
                                                                   sigma=noise_std * np.ones(1))

    return hyperparams
def main(training_env: PSMCartesianHERDDPGEnv,
         eval_env: PSMCartesianHERDDPGEnv = None,
         log_dir='./.logs/results'):
    os.makedirs(log_dir, exist_ok=True)
    # training_env = Monitor(training_env, log_dir)

    n_actions = training_env.action_space.shape[0]
    noise_std = 0.2
    # Currently using OU noise
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                sigma=noise_std * np.ones(n_actions))

    model_class = DDPG  # works also with SAC, DDPG and TD3
    rl_model_kwargs = {
        'actor_lr': 1e-3,
        'critic_lr': 1e-3,
        'action_noise': action_noise,
        'nb_train_steps': 300,
        'nb_rollout_steps': 100,
        'gamma': 0.95,
        'observation_range': (-1.5, 1.5),
        'random_exploration': 0.05,
        'normalize_observations': True,
        'critic_l2_reg': 0.01
    }

    # Available strategies (cf paper): future, final, episode, random
    model = HER('MlpPolicy',
                training_env,
                model_class,
                verbose=1,
                n_sampled_goal=4,
                goal_selection_strategy='future',
                buffer_size=int(1e5),
                batch_size=128,
                tensorboard_log="./ddpg_dvrk_tensorboard/",
                **rl_model_kwargs)

    # Reset the model
    training_env.reset()

    # Create callbacks
    checkpoint_callback = CheckpointCallback(save_freq=100000,
                                             save_path="./ddpg_dvrk_tensorboard/")
    # save_path="./.model/model_checkpoint/")  # save_freq=100000
    # eval_callback = EvalCallback(training_env, best_model_save_path='./ddpg_dvrk_tensorboard/best_model',
    #                              log_path=log_dir, eval_freq=500)
    callback = CallbackList([checkpoint_callback])  # , eval_callback])

    # Train the model
    model.learn(4000000, log_interval=100, callback=callback)
    model.save("./her_robot_env")
def ddpg(env, seed):
    n_actions = env.action_space.shape[-1]
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                sigma=float(0.1) * np.ones(n_actions))
    return DDPG('MlpPolicy', env, action_noise=action_noise, verbose=1,
                tensorboard_log="./data/runs", seed=seed)
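# Minimal usage sketch for the `ddpg` factory above; 'Pendulum-v0' is an
# arbitrary continuous-action environment chosen for illustration, and the
# timestep budget is likewise illustrative.
import gym

model = ddpg(gym.make('Pendulum-v0'), seed=0)
model.learn(total_timesteps=100000)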
def DDPGAgent(multi_stock_env, num_episodes):
    models_folder = 'saved_models'
    rewards_folder = 'saved_rewards'

    env = DummyVecEnv([lambda: multi_stock_env])

    # the noise objects for DDPG
    n_actions = env.action_space.shape[-1]
    param_noise = None
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                sigma=float(0.5) * np.ones(n_actions))

    # Hyperparameters
    GAMMA = 0.99
    TAU = 0.001
    BATCH_SIZE = 16
    ACTOR_LEARNING_RATE = 0.0001
    CRITIC_LEARNING_RATE = 0.001
    BUFFER_SIZE = 500

    print("\nRunning DDPG Agent...\n")
    model = DDPG(MlpPolicy, env,
                 gamma=GAMMA, tau=TAU, batch_size=BATCH_SIZE,
                 actor_lr=ACTOR_LEARNING_RATE, critic_lr=CRITIC_LEARNING_RATE,
                 buffer_size=BUFFER_SIZE, verbose=1,
                 param_noise=param_noise, action_noise=action_noise)
    model.learn(total_timesteps=50000)
    model.save(f'{models_folder}/rl/ddpg.h5')

    del model
    model = DDPG.load(f'{models_folder}/rl/ddpg.h5')

    obs = env.reset()
    portfolio_value = []

    for e in range(num_episodes):
        action, _states = model.predict(obs)
        # carry the new observation forward (the original discarded the
        # returned observation, so every step predicted from the reset state)
        obs, reward, done, info = env.step(action)
        print(f"episode: {e + 1}/{num_episodes}, episode end value: {info[0]['cur_val']:.2f}")
        portfolio_value.append(round(info[0]['cur_val'], 3))

    # save portfolio value for each episode
    np.save(f'{rewards_folder}/rl/ddpg.npy', portfolio_value)
    print("\nDDPG Agent run complete and saved!")

    a = np.load(f'./saved_rewards/rl/ddpg.npy')
    print(f"\nCumulative Portfolio Value Average reward: {a.mean():.2f}, "
          f"Min: {a.min():.2f}, Max: {a.max():.2f}")

    plt.plot(a)
    plt.title("Portfolio Value Per Episode (DDPG)")
    plt.ylabel("Portfolio Value")
    plt.xlabel("Episodes")
    plt.show()
def train_DDPG(env_train, model_name, timesteps=10000):
    """DDPG model"""
    # the noise objects for DDPG
    n_actions = env_train.action_space.shape[-1]
    param_noise = None
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                sigma=float(0.5) * np.ones(n_actions))

    start = time.time()
    model = DDPG('MlpPolicy', env_train, param_noise=param_noise, action_noise=action_noise)
    model.learn(total_timesteps=timesteps)
    end = time.time()

    model.save(f"{config.TRAINED_MODEL_DIR}/{model_name}")
    print('Training time (DDPG): ', (end - start) / 60, ' minutes')
    return model
def sample_ddpg_params(trial):
    """
    Sampler for DDPG hyperparams.

    :param trial: (optuna.trial)
    :return: (dict)
    """
    gamma = trial.suggest_categorical('gamma', [0.9, 0.95, 0.98, 0.99, 0.995, 0.999, 0.9999])
    # actor_lr = trial.suggest_loguniform('actor_lr', 1e-5, 1)
    # critic_lr = trial.suggest_loguniform('critic_lr', 1e-5, 1)
    learning_rate = trial.suggest_loguniform('lr', 1e-5, 1)
    batch_size = trial.suggest_categorical('batch_size', [16, 32, 64, 128, 256])
    buffer_size = trial.suggest_categorical('memory_limit', [int(1e4), int(1e5), int(1e6)])
    noise_type = trial.suggest_categorical('noise_type',
                                           ['ornstein-uhlenbeck', 'normal', 'adaptive-param'])
    noise_std = trial.suggest_uniform('noise_std', 0, 1)
    normalize_observations = trial.suggest_categorical('normalize_observations', [True, False])
    normalize_returns = trial.suggest_categorical('normalize_returns', [True, False])

    hyperparams = {
        'gamma': gamma,
        'actor_lr': learning_rate,
        'critic_lr': learning_rate,
        'batch_size': batch_size,
        'memory_limit': buffer_size,
        'normalize_observations': normalize_observations,
        'normalize_returns': normalize_returns
    }

    if noise_type == 'adaptive-param':
        hyperparams['param_noise'] = AdaptiveParamNoiseSpec(initial_stddev=noise_std,
                                                            desired_action_stddev=noise_std)
        # Apply layer normalization when using parameter perturbation
        hyperparams['policy_kwargs'] = dict(layer_norm=True)
    elif noise_type == 'normal':
        hyperparams['action_noise'] = NormalActionNoise(mean=np.zeros(trial.n_actions),
                                                        sigma=noise_std * np.ones(trial.n_actions))
    elif noise_type == 'ornstein-uhlenbeck':
        hyperparams['action_noise'] = OrnsteinUhlenbeckActionNoise(mean=np.zeros(trial.n_actions),
                                                                   sigma=noise_std * np.ones(trial.n_actions))

    return hyperparams
def train_stable_baselines(submodule, flags):
    """Train policies using the DDPG algorithm in stable-baselines."""
    from stable_baselines.common.vec_env import DummyVecEnv
    from stable_baselines import DDPG

    flow_params = submodule.flow_params
    # Path to the saved files
    exp_tag = flow_params['exp_tag']
    result_name = '{}/{}'.format(exp_tag, strftime("%Y-%m-%d-%H:%M:%S"))

    # Perform training.
    print('Beginning training.')
    model = run_model_stablebaseline(flow_params, flags.num_cpus,
                                     flags.rollout_size, flags.num_steps)

    # Save the model to a desired folder and then delete it to demonstrate
    # loading.
    print('Saving the trained model!')
    path = os.path.realpath(os.path.expanduser('~/baseline_results'))
    ensure_dir(path)
    save_path = os.path.join(path, result_name)
    model.save(save_path)

    # dump the flow params
    with open(os.path.join(path, result_name) + '.json', 'w') as outfile:
        json.dump(flow_params, outfile, cls=FlowParamsEncoder, sort_keys=True,
                  indent=4)

    # Replay the result by loading the model
    print('Loading the trained model and testing it out!')
    model = DDPG.load(save_path)
    flow_params = get_flow_params(os.path.join(path, result_name) + '.json')
    flow_params['sim'].render = True
    env = env_constructor(params=flow_params, version=0)()

    # the noise objects for DDPG (unused during replay)
    n_actions = env.action_space.shape[-1]
    param_noise = None
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                sigma=float(0.5) * np.ones(n_actions))

    # The algorithms require a vectorized environment to run
    eval_env = DummyVecEnv([lambda: env])
    obs = eval_env.reset()
    reward = 0
    for _ in range(flow_params['env'].horizon):
        action, _states = model.predict(obs)
        obs, rewards, dones, info = eval_env.step(action)
        reward += rewards
    print('the final reward is {}'.format(reward))
def main(env):
    n_actions = env.action_space.shape[0]
    noise_std = 0.2
    # Currently using OU noise
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                sigma=noise_std * np.ones(n_actions))

    model_class = DDPG  # works also with SAC, DDPG and TD3
    rl_model_kwargs = {
        'actor_lr': 1e-3,
        'critic_lr': 1e-3,
        'action_noise': action_noise,
        'nb_train_steps': 300,
        'nb_rollout_steps': 100,
        'gamma': 0.95,
        'observation_range': (-1.5, 1.5),
        'random_exploration': 0.05,
        'normalize_observations': True,
        'critic_l2_reg': 0.01
    }

    # Available strategies (cf paper): future, final, episode, random
    model = HER('MlpPolicy', env, model_class, verbose=1, n_sampled_goal=4,
                goal_selection_strategy='future', buffer_size=int(1e5),
                batch_size=128, tensorboard_log="./ddpg_dvrk_tensorboard/",
                **rl_model_kwargs)

    # Reset the model
    env.reset()

    # Train the model
    model.learn(4000000, log_interval=100,
                callback=CheckpointCallback(save_freq=100000,
                                            save_path="./ddpg_dvrk_tensorboard/"))
    model.save("./her_robot_env")
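# A sketch of reloading the HER agent saved above for evaluation. HER.load
# needs the original goal-based env passed back in; the rollout loop is
# illustrative and assumes `env` is the same environment used for training.
from stable_baselines import HER

model = HER.load("./her_robot_env", env=env)
obs = env.reset()
done = False
while not done:
    action, _ = model.predict(obs, deterministic=True)
    obs, reward, done, info = env.step(action)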
def train_policy(num_of_envs, log_relative_path, maximum_episode_length,
                 skip_frame, seed_num, ddpg_config, total_time_steps,
                 validate_every_timesteps, task_name):
    print("Using MPI for multiprocessing with {} workers".format(
        MPI.COMM_WORLD.Get_size()))
    rank = MPI.COMM_WORLD.Get_rank()
    print("Worker rank: {}".format(rank))

    task = generate_task(task_generator_id=task_name,
                         dense_reward_weights=np.array(
                             [250, 0, 125, 0, 750, 0, 0, 0.005]),
                         fractional_reward_weight=1,
                         goal_height=0.15,
                         tool_block_mass=0.02)
    env = CausalWorld(task=task,
                      skip_frame=skip_frame,
                      enable_visualization=False,
                      seed=0,
                      max_episode_length=maximum_episode_length,
                      normalize_actions=False,
                      normalize_observations=False)

    n_actions = env.action_space.shape[-1]
    param_noise = None
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                sigma=float(0.5) * np.ones(n_actions))
    policy_kwargs = dict(layers=[256, 256])

    checkpoint_callback = CheckpointCallback(
        save_freq=int(validate_every_timesteps / num_of_envs),
        save_path=log_relative_path,
        name_prefix='model')

    model = DDPG(MlpPolicy, env, verbose=2,
                 param_noise=param_noise,
                 action_noise=action_noise,
                 policy_kwargs=policy_kwargs,
                 **ddpg_config)
    model.learn(total_timesteps=total_time_steps,
                tb_log_name="ddpg",
                callback=checkpoint_callback)
    return
def training(env):
    n_actions = env.action_space.shape[-1]
    param_noise = None
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                sigma=float(0.5) * np.ones(n_actions))

    model = DDPG(MlpPolicy, env, verbose=1, param_noise=param_noise,
                 action_noise=action_noise, render=True,
                 return_range=[-1.0, 1.0], observation_range=[-2.0, 2.0])
    model.learn(total_timesteps=40000)

    time = datetime.now().strftime("%m%d_%H%M%S")
    model.save("models\\ddpg_sbl_" + time)
    del model  # remove to demonstrate saving and loading

    testing(env, time)
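# `testing` is not defined in this snippet; a minimal sketch, assuming it
# reloads the model saved above and rolls out one deterministic episode.
def testing(env, time):
    model = DDPG.load("models\\ddpg_sbl_" + time)
    obs = env.reset()
    done = False
    while not done:
        action, _ = model.predict(obs, deterministic=True)
        obs, reward, done, _ = env.step(action)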
def __call__(self):
    policy_kwargs = dict(layers=[400, 300, 200, 100])
    n_actions = self.env.action_space.shape[-1]
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                sigma=float(0.1) * np.ones(n_actions))

    # check_env(self.env)
    model = DDPG(MlpPolicy, self.env,
                 policy_kwargs=policy_kwargs,
                 action_noise=action_noise,
                 memory_limit=50000,
                 tensorboard_log="/home/dfki.uni-bremen.de/mpatil/Documents/baselines_log",
                 verbose=1)

    time_steps = 3e4
    model.learn(total_timesteps=int(time_steps), log_interval=50,
                tb_log_name="ddpg_Docker_" + self.expt_name)
    model.save("/home/dfki.uni-bremen.de/mpatil/Documents/ddpg_stable_baselines_" + self.expt_name)

    print("Closing environment")
    self.env.close()
def init_ddpg(env_id, timesteps, policy="MlpPolicy", log_interval=None,
              tensorboard_log=None, seed=None):
    from stable_baselines import DDPG

    env = gym.make(env_id)
    n_actions = env.action_space.shape[-1]
    param_noise = None
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                sigma=float(0.5) * np.ones(n_actions))
    model = DDPG(policy, env, verbose=1, param_noise=param_noise,
                 action_noise=action_noise, tensorboard_log=tensorboard_log)
    return model
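# Usage sketch for `init_ddpg`: the function only constructs the model, so
# training and saving happen at the call site. The env id, timestep budget,
# and save path are illustrative.
model = init_ddpg("Pendulum-v0", timesteps=100000)
model.learn(total_timesteps=100000, log_interval=10)
model.save("ddpg_pendulum")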
def run(Model, Policy, gamma):
    env = gym.make('Stock-v0')
    env._init_data(train_data)

    if gamma != 0:
        # Note: `gamma` doubles as the OU-noise scale here; the discount
        # factor itself is overridden to 0.2 below.
        n_actions = env.action_space.shape[-1]
        action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                    sigma=float(gamma) * np.ones(n_actions))
        model = Model(Policy, env, verbose=1, action_noise=action_noise)
        model.gamma = 0.2
    else:
        model = Model(Policy, env, verbose=1)

    model.learn(total_timesteps=total_timesteps, log_interval=10)

    print("test model")
    env = gym.make('TestStock-v0')
    env._init_data(test_data)
    obs = env.reset()
    for _ in range(686):
        action, _ = model.predict(obs)
        obs, _, _, _ = env.step(action)
    return env.asset_memory
def DRL() -> None:
    ### PREPARATION
    # callback for validation
    eval_callback = EvalCallback(val_env,
                                 best_model_save_path=config.val_path,
                                 log_path=config.val_path,
                                 eval_freq=config.val_freq,
                                 deterministic=config.deterministic,
                                 n_eval_episodes=config.val_eps)

    ### SETUP AND TRAIN
    # Setup model
    if config.MODEL_NAME == "A2C":
        model = A2C(config.POLICY, train_env, verbose=1,
                    tensorboard_log=config.tb_path, seed=config.seed)
    elif config.MODEL_NAME == "PPO":
        model = PPO2(config.POLICY, train_env, verbose=1,
                     tensorboard_log=config.tb_path, nminibatches=1,
                     seed=config.seed)
    elif config.MODEL_NAME == "DDPG":
        # the noise objects for DDPG
        n_actions = train_env.action_space.shape[-1]
        param_noise = None
        action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                    sigma=float(0.5) * np.ones(n_actions))
        model = DDPG(config.POLICY, train_env,
                     param_noise=param_noise, action_noise=action_noise,
                     verbose=1, tensorboard_log=config.tb_path,
                     seed=config.seed)
        print("DDPG does not provide training output...")

    ###
    # Train Model
    model = model.learn(total_timesteps=config.learn_steps, callback=eval_callback)

    # Load best model after training
    if config.MODEL_NAME == "A2C":
        model = A2C.load(load_path=config.val_path.joinpath("best_model.zip"))
    elif config.MODEL_NAME == "PPO":
        model = PPO2.load(load_path=config.val_path.joinpath("best_model.zip"))
    elif config.MODEL_NAME == "DDPG":
        model = DDPG.load(load_path=config.val_path.joinpath("best_model.zip"))

    ### EVAL MODEL
    # Make prediction in test_env
    test_mean, test_std = evaluate_policy(model=model, env=test_env,
                                          deterministic=config.deterministic,
                                          n_eval_episodes=config.test_eps,
                                          return_episode_rewards=False)
    print(f"Test Mean: {test_mean}\n"
          f"Test Std: {test_std}")
    save_path = 'logs/agent_{}/models/'.format(args.agent_id)
    env = Monitor(env, 'logs/agent_{}/'.format(args.agent_id))  # logging monitor

    repo = git.Repo(search_parent_directories=False)
    commit_id = repo.head.object.hexsha
    with open('logs/agent_{}/reproduction_info.txt'.format(args.agent_id), 'w') as f:
        # Use file to refer to the file object
        f.write('Git commit id: {}\n\n'.format(commit_id))
        f.write('Program arguments:\n\n{}'.format(args))
        f.close()
else:
    save_path = '../logs/'
    env = Monitor(env, '../logs/')  # logging monitor

model_dir = save_path + '{}_final_model'.format(args.alg)  # model save/load directory

if args.alg == 'ddpg':
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                sigma=args.action_noise * np.ones(n_actions))
    param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(args.param_noise_stddev),
                                         desired_action_stddev=float(args.param_noise_stddev))
    model = DDPG(DDPGPolicy, env, verbose=1, param_noise=param_noise,
                 action_noise=action_noise, render=args.play)
elif args.alg == 'ppo2':
    model = PPO2(CommonMlpPolicy, env, verbose=1)
elif args.alg == 'trpo':
    model = TRPO(CommonMlpPolicy, env, verbose=1, model_dir=save_path)
elif args.alg == 'a2c':
    model = A2C(CommonMlpPolicy, env, verbose=1)
else:
    print(args.alg)
    raise Exception('Algorithm name is not defined!')
import gym
import numpy as np
import time
import torch

from stable_baselines.ddpg.policies import MlpPolicy
from stable_baselines import A2C
from stable_baselines import DDPG
from stable_baselines.common.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise

from FireflyEnv import ffenv_new_cord
from Config import Config

arg = Config()

action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(2),
                                            sigma=float(0.2) * np.ones(2))

arg.std_range = [0.0001, 0.001, 0.0001, 0.001]
env = ffenv_new_cord.FireflyEnv(arg)

model = DDPG(MlpPolicy, env,
             verbose=1,
             tensorboard_log="./DDPG_tb/",
             action_noise=action_noise,
             gamma=0.99,
             memory_policy=None,
             eval_env=None,
             nb_train_steps=50,
             nb_rollout_steps=100,
             nb_eval_steps=100,
             param_noise=None,
             normalize_observations=False,
             tau=0.001,
             batch_size=128,
def run_process(study_name, alg_param, env_param, log_path='.'):
    study_path = os.path.join(log_path, study_name)
    make_sure_path_exists(study_path)
    trial_path, trial_id = generate_trial_path(study_path)
    make_sure_path_exists(trial_path)

    with open(trial_path + '/alg_param.pkl', "wb+") as outfile:
        pickle.dump(alg_param, outfile)
    with open(trial_path + '/env_param.pkl', "wb+") as outfile:
        pickle.dump(env_param, outfile)

    num_nodes = alg_param['num_nodes']
    num_layers = alg_param['num_layers']
    learning_rate = alg_param['learning_rate']
    alg = alg_param['alg']
    nenv = alg_param['nenv']
    env = build_env(trial_path, env_param, nenv=nenv)

    if alg == 'dqn':
        from stable_baselines.deepq.policies import MlpPolicy
        from stable_baselines import DQN
        call_iter = 1000
        policy_kwargs = dict(layers=[num_nodes for _ in range(num_layers)])
        model = DQN(MlpPolicy, env, verbose=1, policy_kwargs=policy_kwargs,
                    tensorboard_log=trial_path)
    # DDPG calls back every step of every rollout
    elif alg == 'ddpg':
        from stable_baselines.ddpg.policies import MlpPolicy
        from stable_baselines.common.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise, AdaptiveParamNoiseSpec
        from stable_baselines import DDPG
        call_iter = 1000
        n_actions = env.action_space.shape[-1]
        param_noise = None
        action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                    sigma=float(0.5) * np.ones(n_actions))
        policy_kwargs = dict(layers=[num_nodes for _ in range(num_layers)])
        model = DDPG(MlpPolicy, env, verbose=1, param_noise=param_noise,
                     action_noise=action_noise, policy_kwargs=policy_kwargs,
                     tensorboard_log=trial_path)
    elif alg == 'td3':
        from stable_baselines import TD3
        from stable_baselines.td3.policies import MlpPolicy
        from stable_baselines.common.vec_env import DummyVecEnv
        from stable_baselines.ddpg.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise
        call_iter = 1000
        n_actions = env.action_space.shape[-1]
        action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                         sigma=0.1 * np.ones(n_actions))
        policy_kwargs = dict(layers=[num_nodes for _ in range(num_layers)])
        model = TD3(MlpPolicy, env, verbose=1, action_noise=action_noise,
                    learning_rate=learning_rate, policy_kwargs=policy_kwargs,
                    tensorboard_log=trial_path)
    # PPO1 calls back only after every rollout
    elif alg == 'ppo2':
        from stable_baselines.common.policies import MlpPolicy
        from stable_baselines import PPO2
        call_iter = 100
        policy_kwargs = dict(net_arch=[num_nodes for _ in range(num_layers)])
        model = PPO2(MlpPolicy, env, policy_kwargs=policy_kwargs, verbose=1,
                     learning_rate=learning_rate, tensorboard_log=trial_path,
                     n_steps=alg_param['n_steps'],
                     noptepochs=alg_param['noptepochs'],
                     nminibatches=alg_param['nminibatches'],
                     gamma=alg_param['gamma'],
                     ent_coef=alg_param['ent_coef'],
                     cliprange=alg_param['cliprange'],
                     lam=alg_param['lam'])

    best_mean_reward, n_steps = -np.inf, 0

    # callback frequency differs among algorithms
    def callback(_locals, _globals):
        from stable_baselines.results_plotter import load_results, ts2xy
        nonlocal n_steps, best_mean_reward, call_iter
        # Print stats every `call_iter` calls
        if (n_steps + 1) % call_iter == 0:
            # Evaluate policy training performance
            x, y = ts2xy(load_results(trial_path), 'timesteps')
            if len(x) > 0:
                mean_reward = np.mean(y[-200:])
                print(x[-1], 'timesteps')
                print("Best mean reward: {:.2f} - Last mean reward per episode: {:.2f}"
                      .format(best_mean_reward, mean_reward))
                # New best model, you could save the agent here
                if mean_reward > best_mean_reward:
                    best_mean_reward = mean_reward
                    # Example for saving best model
                    print("Saving new best model")
                    _locals['self'].save(trial_path + '/best_model.pkl')
        n_steps += 1
        return True

    # model = DDPG.load('log/A00/best_model.pkl')
    # model.set_env(env)

    print(f"Starting to train {trial_id}")
    model.learn(total_timesteps=int(1e6), tb_log_name='tb_log', callback=callback)
    model.save(trial_path + '/fully_trained_model')
        hyperparams['param_noise'] = AdaptiveParamNoiseSpec(
            initial_stddev=noise_std,
            desired_action_stddev=noise_std)
    elif 'normal' in noise_type:
        if 'lin' in noise_type:
            hyperparams['action_noise'] = LinearNormalActionNoise(
                mean=np.zeros(n_actions),
                sigma=noise_std * np.ones(n_actions),
                final_sigma=hyperparams.get('noise_std_final', 0.0) * np.ones(n_actions),
                max_steps=n_timesteps)
        else:
            hyperparams['action_noise'] = NormalActionNoise(
                mean=np.zeros(n_actions),
                sigma=noise_std * np.ones(n_actions))
    elif 'ornstein-uhlenbeck' in noise_type:
        hyperparams['action_noise'] = OrnsteinUhlenbeckActionNoise(
            mean=np.zeros(n_actions),
            sigma=noise_std * np.ones(n_actions))
    else:
        raise RuntimeError('Unknown noise type "{}"'.format(noise_type))
    print("Applying {} noise with std {}".format(noise_type, noise_std))
    del hyperparams['noise_type']
    del hyperparams['noise_std']
    if 'noise_std_final' in hyperparams:
        del hyperparams['noise_std_final']

if ALGOS[args.algo] is None:
    raise ValueError('{} requires MPI to be installed'.format(args.algo))

if os.path.isfile(args.trained_agent):
    # Continue training
    print("Loading pretrained agent")
    # Policy should not be changed
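# A hedged example of the hyperparameter dict this parser consumes: the noise
# keys are read and then deleted before the rest is forwarded to the algorithm
# constructor. Values are illustrative, not taken from any real config file.
hyperparams = {
    'noise_type': 'ornstein-uhlenbeck',
    'noise_std': 0.5,
    'noise_std_final': 0.0,  # only read by the 'lin' normal-noise variant
    'batch_size': 128,
    'buffer_size': int(1e6),
}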
from stable_baselines import TRPO
import numpy as np
import math
import matplotlib.pyplot as plt

# Create environment
env = gym.make('QuadGym-v0')
# env = gym.make('HalfCheetah-v2')

# the noise objects for DDPG
n_actions = env.action_space.shape[-1]
param_noise = None
action_noise = OrnsteinUhlenbeckActionNoise(mean=np.ones(n_actions) * 0.15,
                                            sigma=float(0.1) * np.ones(n_actions))

model = DDPG(MlpPolicy2, env, verbose=1, param_noise=param_noise,
             action_noise=action_noise, render=False,
             buffer_size=1000000, random_exploration=0.0)
# model = DDPG(MlpPolicy2, env, gamma=0.99, memory_policy=None, nb_train_steps=500,
#              nb_rollout_steps=50, nb_eval_steps=300, param_noise=None,
#              action_noise=action_noise, normalize_observations=False, tau=0.002,
#              batch_size=250, normalize_returns=False, enable_popart=False,
#              observation_range=(-10.0, 10.0), critic_l2_reg=0.0, actor_lr=0.0005,
#              critic_lr=0.0005, clip_norm=None, render=False, render_eval=False,
#              buffer_size=1000000, verbose=1, _init_setup_model=True)

model.learn(total_timesteps=1000000)
model.save("ddpg_quad")

qpos0_hist = np.ones((1, 49))
        original_adr = currentPath + '/tools/cfgs/' + args.cfg_file.split('/')[-1]
        target_adr = currentPath + '/logs/agent_{}/'.format(args.agent_id) + args.cfg_file.split('/')[-1]
        shutil.copyfile(original_adr, target_adr)
    else:
        save_path = 'logs/'
        env = Monitor(env, 'logs/', info_keywords=('reserved', ))  # logging monitor

    model_dir = save_path + '{}_final_model'.format(cfg.POLICY.NAME)  # model save/load directory

    if cfg.POLICY.NAME == 'DDPG':
        action_noise = OrnsteinUhlenbeckActionNoise(
            mean=np.zeros(n_actions),
            sigma=float(cfg.POLICY.ACTION_NOISE) * np.ones(n_actions))
        param_noise = AdaptiveParamNoiseSpec(
            initial_stddev=float(cfg.POLICY.PARAM_NOISE_STD),
            desired_action_stddev=float(cfg.POLICY.PARAM_NOISE_STD))
        model = DDPG(policy[cfg.POLICY.NET], env,
                     verbose=1,
                     param_noise=param_noise,
                     action_noise=action_noise,
                     policy_kwargs={'cnn_extractor': eval(cfg.POLICY.CNN_EXTRACTOR)})
    elif cfg.POLICY.NAME == 'PPO2':
        model = PPO2(policy[cfg.POLICY.NET],
from rl_visualization.visualization_env import VisualizationEnv

if __name__ == '__main__':
    env = gym.make('MountainCarContinuous-v0')
    env = VisualizationEnv(
        env,
        steps_lookback=10000,
        refresh_time=30,
        features_names=['Car Position', 'Car Velocity'],
        actions_names=['Push car to the left (negative value) or to the right (positive value)'])

    model = SAC(MlpPolicy, env, verbose=1,
                action_noise=OrnsteinUhlenbeckActionNoise(mean=np.zeros(1),
                                                          sigma=0.5 * np.ones(1)))
    model.learn(total_timesteps=60000)

    obs = env.reset()
    for i in range(100000):
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        env.render()

    env.close()
    env.join()
def main():
    args = parse_args()
    algorithm = args.algo
    agent = args.agent
    output = args.output
    # note: this is a non-empty path string, so it is always truthy when
    # passed as the `use_encoder` flag below
    use_encoder = './results/' + args.use_encoder + '/'
    time_steps = int(1e7)
    env_name = 'HalfCheetah-v2'

    tf.reset_default_graph()
    with tf.Session() as sess:

        def make_env(use_encoder=True, env_name=env_name):
            if use_encoder:
                return WrappedEnv(sess=sess,
                                  env_name=env_name,
                                  feature_dim=10,
                                  encoder_gamma=0.98,
                                  encoder_hidden_size=128,
                                  dynamics_hidden_size=256,
                                  invdyn_hidden_size=256,
                                  encoder_lr=0.0003,
                                  dynamics_lr=0.0003,
                                  invdyn_lr=0.0003)
            else:
                return gym.make(env_name)

        if algorithm == 'ddpg':
            from stable_baselines.common.cmd_util import SubprocVecEnv
            from stable_baselines.common.callbacks import CheckpointCallback
            from stable_baselines.ddpg.policies import MlpPolicy
            from stable_baselines.common.noise import OrnsteinUhlenbeckActionNoise
            from stable_baselines import DDPG

            env = make_env(use_encoder=use_encoder)
            sess.run(tf.global_variables_initializer())
            if use_encoder:
                env = random_run_for_encoder_training(env, num_epochs=200, num_iters=500)

            n_actions = env.action_space.shape[-1]
            param_noise = None
            action_noise = OrnsteinUhlenbeckActionNoise(
                mean=np.zeros(n_actions),
                sigma=float(0.22) * np.ones(n_actions))
            policy_kwargs = dict(act_fun=tf.nn.tanh, layers=[64, 64])

            model = DDPG(MlpPolicy, env, gamma=0.99, batch_size=128, verbose=1,
                         param_noise=param_noise, action_noise=action_noise,
                         policy_kwargs=policy_kwargs,
                         tensorboard_log=output + algorithm + '_' + str(use_encoder) + '/' + 'log/')
            checkpoint_callback = CheckpointCallback(
                save_freq=4000,
                save_path=output + algorithm + '_' + str(use_encoder) + '/',
                name_prefix='agent')
            model.set_env(env)
            model.learn(total_timesteps=time_steps,
                        callback=checkpoint_callback,
                        reset_num_timesteps=False)
            model.save(output + algorithm + '_' + str(use_encoder) + '/' + agent)
            env.close()
            del model

        elif algorithm == 'ppo':
            from stable_baselines.common.policies import MlpPolicy, MlpLnLstmPolicy
            from stable_baselines import PPO2
            from stable_baselines.common.cmd_util import SubprocVecEnv
            from stable_baselines.common.callbacks import CheckpointCallback

            env = make_env(use_encoder=use_encoder)
            sess.run(tf.global_variables_initializer())
            if use_encoder:
                env = random_run_for_encoder_training(env, num_epochs=200, num_iters=500)

            policy_kwargs = dict(act_fun=tf.nn.tanh, layers=[64, 64])
            model = PPO2(MlpPolicy, env, n_steps=2048, nminibatches=32,
                         lam=0.95, gamma=0.99, noptepochs=10, verbose=1,
                         policy_kwargs=policy_kwargs,
                         tensorboard_log=output + algorithm + '_' + str(use_encoder) + '/' + 'log/')
            checkpoint_callback = CheckpointCallback(
                save_freq=4096,
                save_path=output + algorithm + '_' + str(use_encoder) + '/',
                name_prefix='agent')
            model.learn(total_timesteps=time_steps,
                        callback=checkpoint_callback,
                        reset_num_timesteps=False)
            model.save(output + algorithm + '_' + str(use_encoder) + '/' + agent)
            env.close()
            del model
def train_decision(config=None,
                   save=False,
                   load=False,
                   calender=None,
                   history=None,
                   predict_results_dict=None,
                   test_mode=False,
                   start_date=None,
                   stop_date=None,
                   episode_steps=1000,
                   model='DDPG'):
    """
    Train the decision model: read data from the database and run decision training.

    Args:
        config: configuration file
        save: save the results
        calender: trading-day calendar
        history: market history
        all_quotes: concatenated market information
        predict_results_dict: prediction results
    """
    # First, convert the string dates in the prediction data
    MODEL = model
    predict_dict = {}
    for k, v in predict_results_dict.items():
        assert isinstance(v['predict_date'].iloc[0], str)
        tmp = v['predict_date'].apply(lambda x: arrow.get(x, 'YYYY-MM-DD').date())
        predict_dict[k] = v.rename(index=tmp)

    env = Portfolio_Prediction_Env(config=config,
                                   calender=calender,
                                   stock_history=history,
                                   window_len=1,
                                   prediction_history=predict_dict,
                                   start_trade_date=start_date,
                                   stop_trade_date=stop_date,
                                   save=save)

    # Test mode
    if test_mode:
        obs = env.reset()
        # check_env(env)
        for i in range(1000):
            W = np.random.uniform(0.0, 1.0, size=(6, ))
            offer = np.random.uniform(-10.0, 10.0, size=(6, ))
            obs, reward, done, infos = env.step(np.hstack((W, offer)))
            # env.render()
            if done:
                env.save_history()
                break
        env.close()

    # Training mode
    if MODEL == "DDPG":
        # Add noise
        n_actions = env.action_space.shape
        param_noise = None
        # OU noise, well suited to controlling inertial systems
        action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                    sigma=float(0.5) * np.ones(n_actions))
        model_path = search_file(os.path.join(sys.path[0], 'saved_models', MODEL), MODEL)
        if len(model_path) > 0 and load:
            model = DDPG.load(
                model_path[0],
                env=env,
                policy=CustomDDPGPolicy,
                param_noise=param_noise,
                action_noise=action_noise,
                # tensorboard_log='./tb_log',
            )
        else:
            model = DDPG(
                policy=CustomDDPGPolicy,
                env=env,
                verbose=1,
                param_noise=param_noise,
                action_noise=action_noise,
                # tensorboard_log='./tb_log',
            )
        # Number of training steps
        model.learn(total_timesteps=episode_steps)
        model.save(os.path.join(sys.path[0], 'saved_models', MODEL, MODEL + '.h5'))
    elif MODEL == 'TD3':
        n_actions = env.action_space.shape[-1]
        # OU noise, well suited to controlling inertial systems
        action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                    sigma=float(0.5) * np.ones(n_actions))
        model_path = search_file(os.path.join(sys.path[0], 'saved_models', MODEL), MODEL)
        if len(model_path) > 0 and load:
            model = TD3.load(
                model_path[0],
                env=env,
                policy=CustomTD3Policy,
                action_noise=action_noise,
                # tensorboard_log='./tb_log',
            )
        else:
            model = TD3(
                policy=CustomTD3Policy,
                env=env,
                verbose=1,
                action_noise=action_noise,
                # tensorboard_log='./tb_log',
            )
        # Number of training steps
        model.learn(total_timesteps=episode_steps)
        model.save(os.path.join(sys.path[0], 'saved_models', MODEL, MODEL + '.h5'))
    elif MODEL == "HER":
        """The env must be a GoalEnv."""
        model_class = DDPG
        # Available strategies (cf paper): future, final, episode, random
        goal_selection_strategy = 'future'  # equivalent to GoalSelectionStrategy.FUTURE
        # Wrap the model
        model = HER(policy=CustomDDPGPolicy,
                    env=env,
                    model_class=model_class,
                    n_sampled_goal=4,
                    goal_selection_strategy=goal_selection_strategy,
                    verbose=1)
        model.learn(total_timesteps=episode_steps)
        model.save(os.path.join(sys.path[0], 'saved_models', MODEL, MODEL + '.h5'))

    # Live-test mode
    obs = env.reset()
    for i in range(1000):
        action, _states = model.predict(obs)
        obs, reward, done, info = env.step(action)
        # env.render(info=info)
        if done:
            if save:
                env.save_history()
            env.reset()
            break
    env.close()
                    log_interval=1,
                    tb_log_name=tensorboard_log_name)
        model.save(model_save_name)
elif algorithm == "DDPG":
    if train:
        for i in range(model_num):
            from stable_baselines.ddpg.policies import MlpPolicy
            from stable_baselines.common.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise, AdaptiveParamNoiseSpec
            from stable_baselines import DDPG

            env = gym.make(env_name)

            # the noise objects for DDPG
            n_actions = env.action_space.shape[-1]
            param_noise = None
            action_noise = OrnsteinUhlenbeckActionNoise(
                mean=np.zeros(n_actions),
                sigma=float(0.5) * np.ones(n_actions))

            model = DDPG(MlpPolicy, env, verbose=1,
                         param_noise=param_noise,
                         action_noise=action_noise,
                         tensorboard_log=tensorboard_log_dir)
            model.learn(total_timesteps=total_timesteps_,
                        log_interval=1,
                        tb_log_name=tensorboard_log_name)
            model.save(model_save_name)
            del model  # remove to demonstrate saving and loading
def _preprocess_hyperparams(self, _hyperparams):
    # Convert to python object if needed
    if "policy_kwargs" in _hyperparams.keys() and isinstance(_hyperparams["policy_kwargs"], str):
        _hyperparams["policy_kwargs"] = eval(_hyperparams["policy_kwargs"])

    n_timesteps = _hyperparams.pop("n_timesteps", None)
    n_envs = _hyperparams.pop("n_envs", None)
    log_every = _hyperparams.pop("log_every", None)

    if not self.continue_learning:
        if not log_every:
            self.logger.debug("log_every not defined in yml file: using command line log_every {}".format(self.log_every))
            log_every = self.log_every
        else:
            self.logger.debug("using log_every as defined in yml file: {}".format(log_every))
    else:
        self.logger.debug("priority to command line log_every {}".format(self.log_every))
        log_every = self.log_every

    # Parse noise string
    if self.algo_name in ["ddpg", "sac", "td3"] and _hyperparams.get("noise_type") is not None:
        noise_type = _hyperparams["noise_type"].strip()
        noise_std = _hyperparams["noise_std"]
        n_actions = get_n_actions(env_name=self.env_name, env_variables=self.env_kwargs)
        self.logger.debug("n_actions: {}".format(n_actions))
        if "adaptive-param" in noise_type:
            assert self.algo_name == "ddpg", "Parameter noise is not supported by SAC"
            _hyperparams["param_noise"] = AdaptiveParamNoiseSpec(initial_stddev=noise_std,
                                                                 desired_action_stddev=noise_std)
        elif "normal" in noise_type:
            if "lin" in noise_type:
                _hyperparams["action_noise"] = LinearNormalActionNoise(
                    mean=np.zeros(n_actions),
                    sigma=noise_std * np.ones(n_actions),
                    final_sigma=_hyperparams.get("noise_std_final", 0.0) * np.ones(n_actions),
                    max_steps=n_timesteps,
                )
            else:
                _hyperparams["action_noise"] = NormalActionNoise(
                    mean=np.zeros(n_actions), sigma=noise_std * np.ones(n_actions)
                )
        elif "ornstein-uhlenbeck" in noise_type:
            _hyperparams["action_noise"] = OrnsteinUhlenbeckActionNoise(
                mean=np.zeros(n_actions), sigma=noise_std * np.ones(n_actions)
            )
        else:
            raise RuntimeError('Unknown noise type "{}"'.format(noise_type))
        self.logger.debug("Applying {} noise with std {}".format(noise_type, noise_std))
        del _hyperparams["noise_type"]
        del _hyperparams["noise_std"]
        if "noise_std_final" in _hyperparams:
            del _hyperparams["noise_std_final"]

    normalize_kwargs = _parse_normalize(dictionary=_hyperparams)

    if n_envs is None:
        self.logger.debug("n_envs not defined in yml file: using command line n_envs {}".format(self.num_envs))
        n_envs = self.num_envs
    else:
        self.logger.debug("using n_envs as defined in yml file: {}".format(n_envs))

    if not self.continue_learning:
        # priority to yml-defined n_timesteps
        if n_timesteps is None:
            self.logger.debug(
                "n_timesteps not defined in yml file: using command line n_timesteps {}".format(self.train_total_timesteps)
            )
            n_timesteps = self.train_total_timesteps
        else:
            self.logger.debug("using n_timesteps as total timesteps defined in yml file: {}".format(n_timesteps))
            n_timesteps = int(n_timesteps)
    else:
        if self.train_total_timesteps and self.train_total_timesteps != -1:
            assert self.train_total_timesteps <= int(n_timesteps), "train_total_timesteps <= n_timesteps: {}, {}".format(
                self.train_total_timesteps, n_timesteps
            )
            # priority to command line n_timesteps
            self.logger.debug("priority to command line n_timesteps {}".format(self.train_total_timesteps))
            n_timesteps = self.train_total_timesteps
        elif self.train_total_timesteps == -1:
            assert n_timesteps, "n_timesteps should have a value: {}".format(n_timesteps)
            n_timesteps = int(n_timesteps)
            self.logger.info("training in continual learning = training from scratch. n_timesteps {}".format(n_timesteps))
        else:
            assert n_timesteps, "n_timesteps should have a value: {}".format(n_timesteps)
            n_timesteps = int(n_timesteps // 2)
            self.logger.debug(
                "train_total_timesteps not specified in continue_learning: "
                "taking half of original n_timesteps defined in yml file {}".format(n_timesteps)
            )

    assert n_timesteps % log_every == 0, "it should be possible to divide n_timesteps by log_every: {}, {}".format(
        n_timesteps, log_every
    )

    return normalize_kwargs, n_envs, n_timesteps, log_every, _hyperparams