def sample_ddpg_params(trial):
    """
    Sampler for DDPG hyperparams.

    :param trial: (optuna.trial)
    :return: (dict)
    """
    gamma = trial.suggest_categorical(
        'gamma', [0.9, 0.95, 0.98, 0.99, 0.995, 0.999, 0.9999])
    # actor_lr = trial.suggest_loguniform('actor_lr', 1e-5, 1)
    # critic_lr = trial.suggest_loguniform('critic_lr', 1e-5, 1)
    learning_rate = trial.suggest_loguniform('lr', 1e-5, 1)
    batch_size = trial.suggest_categorical('batch_size', [16, 32, 64, 128, 256])
    buffer_size = trial.suggest_categorical(
        'memory_limit', [int(1e4), int(1e5), int(1e6)])
    noise_type = trial.suggest_categorical(
        'noise_type', ['ornstein-uhlenbeck', 'normal', 'adaptive-param'])
    noise_std = trial.suggest_uniform('noise_std', 0, 1)
    normalize_observations = trial.suggest_categorical(
        'normalize_observations', [True, False])
    normalize_returns = trial.suggest_categorical('normalize_returns', [True, False])

    hyperparams = {
        'gamma': gamma,
        'actor_lr': learning_rate,
        'critic_lr': learning_rate,
        'batch_size': batch_size,
        'memory_limit': buffer_size,
        'normalize_observations': normalize_observations,
        'normalize_returns': normalize_returns
    }

    if noise_type == 'adaptive-param':
        hyperparams['param_noise'] = AdaptiveParamNoiseSpec(
            initial_stddev=noise_std, desired_action_stddev=noise_std)
        # Apply layer normalization when using parameter perturbation
        hyperparams['policy_kwargs'] = dict(layer_norm=True)
    elif noise_type == 'normal':
        hyperparams['action_noise'] = NormalActionNoise(
            mean=np.zeros(trial.n_actions),
            sigma=noise_std * np.ones(trial.n_actions))
    elif noise_type == 'ornstein-uhlenbeck':
        hyperparams['action_noise'] = OrnsteinUhlenbeckActionNoise(
            mean=np.zeros(trial.n_actions),
            sigma=noise_std * np.ones(trial.n_actions))

    return hyperparams
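
# Hedged usage sketch (not part of the original source): one way the sampler above
# could be wired into an Optuna study for stable-baselines DDPG. 'Pendulum-v0' is
# only a placeholder continuous-control env, and attaching `n_actions` to the trial
# is an assumption made because sample_ddpg_params reads `trial.n_actions`.
import gym
import optuna
from stable_baselines import DDPG


def objective(trial):
    env = gym.make('Pendulum-v0')
    trial.n_actions = env.action_space.shape[-1]
    hyperparams = sample_ddpg_params(trial)
    model = DDPG('MlpPolicy', env, verbose=0, **hyperparams)
    model.learn(total_timesteps=10000)
    # Score the trial with a short rollout
    obs, episode_reward = env.reset(), 0.0
    for _ in range(200):
        action, _ = model.predict(obs)
        obs, reward, done, _ = env.step(action)
        episode_reward += reward
        if done:
            break
    return episode_reward


study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=10)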
def DDPGgive_results(files, balance, shares=None):
    env = create_stock_env(files, train=False, balance=balance, shares=shares)
    max_steps = env.max_steps - env.num_prev
    env = DummyVecEnv([lambda: env])
    n_actions = env.action_space.shape[-1]
    action_noise = NormalActionNoise(mean=0, sigma=2)
    param_noise = AdaptiveParamNoiseSpec(initial_stddev=1,
                                         desired_action_stddev=0.1,
                                         adoption_coefficient=1.01)
    model = DDPG(CustomDDPGPolicy, env, verbose=0,
                 param_noise=param_noise, action_noise=action_noise)
    # model = DDPG.load("/home/harshit/Documents/itsp-trade agent/Reinforcement-Learning-Stock-Trader/WebPortal/StockApp/Stock_stable.zip", env=env)
    model.learn(total_timesteps=100)

    profit = 0
    profitst = np.zeros((max_steps - 1, 2))
    actionst = np.zeros((n_actions // 2, max_steps - 1, 2))
    shares = np.zeros((len(files), max_steps - 1, 2))
    obs = env.reset()
    for i in range(max_steps - 1):  # the tracking arrays above hold max_steps - 1 entries
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        actionst[:, i, 1] = (-info[0]['action'][0][0:n_actions // 2]
                             + info[0]['action'][0][n_actions // 2:])
        actionst[:, i, 0] = i
        shares[:, i, 1] = info[0]['shares_held']
        shares[:, i, 0] = i
        # print('a', action)
        profit += rewards
        profitst[i] = [i, profit]
        if dones:
            break
    print(info[0]['action'][0])
    print(actionst)
    return profitst.tolist(), shares.tolist(), actionst.tolist()
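
# Hedged usage sketch (not from the original): calling the helper above with a list
# of price CSVs and a starting cash balance. The file names and balance are
# placeholders, and create_stock_env / CustomDDPGPolicy are assumed to be defined
# elsewhere in the project.
profits, shares_held, actions = DDPGgive_results(
    files=['AAPL.csv', 'GOOG.csv'], balance=10000)
print(profits[-1])  # last entry: [step_index, cumulative_profit]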
eval_env = Gait2DGenAct(integrator_accuracy=3e-2)
# env = Arm2DVecEnv(visualize=True)
callback_on_best = StopTrainingOnRewardThreshold(reward_threshold=1000, verbose=1)
eval_callback = EvalCallback(eval_env, callback_on_new_best=callback_on_best, verbose=1)

n_actions = env.action_space.shape[-1]
param_noise = None
action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                            sigma=float(0.1) * np.ones(n_actions),
                                            theta=0.05)
param_noise = AdaptiveParamNoiseSpec(initial_stddev=0.287)


class CustomTD3Policy(FeedForwardPolicy):
    def __init__(self, *args, **kwargs):
        super(CustomTD3Policy, self).__init__(*args, **kwargs,
                                              layers=[400, 400],
                                              layer_norm=True,
                                              feature_extraction="mlp")


model = TD3(CustomTD3Policy, env, verbose=1, action_noise=action_noise,
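            # Hedged completion (the fragment above is cut off mid-call): the
            # remaining keyword arguments and the training call below are plausible
            # guesses, not taken from the original. Note that TD3 in stable-baselines
            # accepts action_noise but no param_noise, so the AdaptiveParamNoiseSpec
            # created above goes unused here.
            buffer_size=int(1e6), batch_size=256)
model.learn(total_timesteps=int(1e6), callback=eval_callback)
model.save("td3_gait2d")  # hypothetical save path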
if 'frame_stack' in hyperparams:
    del hyperparams['frame_stack']

# Stop env processes to free memory
if args.optimize_hyperparameters and n_envs > 1:
    env.close()

# Parse noise string for DDPG and SAC
if algo_ in ['ddpg', 'sac', 'td3'] and hyperparams.get('noise_type') is not None:
    noise_type = hyperparams['noise_type'].strip()
    noise_std = hyperparams['noise_std']
    n_actions = env.action_space.shape[0]
    if 'adaptive-param' in noise_type:
        assert algo_ == 'ddpg', 'Parameter noise is not supported by SAC'
        hyperparams['param_noise'] = AdaptiveParamNoiseSpec(
            initial_stddev=noise_std, desired_action_stddev=noise_std)
    elif 'normal' in noise_type:
        if 'lin' in noise_type:
            hyperparams['action_noise'] = LinearNormalActionNoise(
                mean=np.zeros(n_actions),
                sigma=noise_std * np.ones(n_actions),
                final_sigma=hyperparams.get('noise_std_final', 0.0) * np.ones(n_actions),
                max_steps=n_timesteps)
        else:
            hyperparams['action_noise'] = NormalActionNoise(
                mean=np.zeros(n_actions),
                sigma=noise_std * np.ones(n_actions))
    elif 'ornstein-uhlenbeck' in noise_type:
        hyperparams['action_noise'] = OrnsteinUhlenbeckActionNoise(
            mean=np.zeros(n_actions),
            sigma=noise_std * np.ones(n_actions))
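
# Hedged illustration (not from the original): the shape of the hyperparams dict the
# parsing above expects, e.g. one algorithm entry loaded from an rl-zoo style YAML
# file. Keys mirror the lookups above; values are placeholders.
hyperparams = {
    'policy': 'MlpPolicy',
    'n_timesteps': int(1e6),
    'gamma': 0.98,
    'noise_type': 'ornstein-uhlenbeck',
    'noise_std': 0.5,
    'noise_std_final': 0.05,   # only read by the 'lin' (linear) noise variant
    'frame_stack': 4,          # stripped by the first lines above
}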
            if self.verbose > 0:
                print("Saving new best model to {}".format(self.save_path))
            self.model.save(self.save_path)

        return True


# Create log dir
log_dir = "tmp/"
os.makedirs(log_dir, exist_ok=True)

# Create and wrap the environment
env = gym.make('SatelliteEnvironment-v0')
env = Monitor(env, log_dir)

# Add some param noise for exploration
param_noise = AdaptiveParamNoiseSpec(initial_stddev=0.1, desired_action_stddev=0.1)
# Because we use parameter noise, we should use a MlpPolicy with layer normalization
model = DDPG(LnMlpPolicy, env, param_noise=param_noise, verbose=0)
# Create the callback: check every 1000 steps
callback = SaveOnBestTrainingRewardCallback(check_freq=1000, log_dir=log_dir)
# Train the agent
time_steps = 1e5
model.learn(total_timesteps=int(time_steps), callback=callback)

results_plotter.plot_results([log_dir], time_steps,
                             results_plotter.X_TIMESTEPS, "DDPG Satellite")
plt.show()
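
# Hedged follow-up sketch (not in the original script): loading the model the
# callback saved and running a short rollout. The "best_model" file name inside
# log_dir is an assumption about the callback's save_path; adjust it to wherever
# the callback actually writes.
model = DDPG.load(os.path.join(log_dir, "best_model"), env=env)
obs = env.reset()
for _ in range(1000):
    action, _ = model.predict(obs)
    obs, reward, done, _ = env.step(action)
    if done:
        obs = env.reset()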
    shutil.copyfile(original_adr, target_adr)
else:
    save_path = 'logs/'

env = Monitor(env, 'logs/', info_keywords=('reserved',))  # logging monitor
model_dir = save_path + '{}_final_model'.format(cfg.POLICY.NAME)  # model save/load directory

if cfg.POLICY.NAME == 'DDPG':
    action_noise = OrnsteinUhlenbeckActionNoise(
        mean=np.zeros(n_actions),
        sigma=float(cfg.POLICY.ACTION_NOISE) * np.ones(n_actions))
    param_noise = AdaptiveParamNoiseSpec(
        initial_stddev=float(cfg.POLICY.PARAM_NOISE_STD),
        desired_action_stddev=float(cfg.POLICY.PARAM_NOISE_STD))
    model = DDPG(policy[cfg.POLICY.NET], env, verbose=1,
                 param_noise=param_noise, action_noise=action_noise,
                 policy_kwargs={'cnn_extractor': eval(cfg.POLICY.CNN_EXTRACTOR)})
elif cfg.POLICY.NAME == 'PPO2':
    model = PPO2(policy[cfg.POLICY.NET], env, verbose=1,
                 model_dir=save_path,
                 policy_kwargs={
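                     # Hedged completion (the fragment stops mid-dict): a plausible
                     # kwarg mirroring the DDPG branch above, not taken from the
                     # original.
                     'cnn_extractor': eval(cfg.POLICY.CNN_EXTRACTOR)})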
def _preprocess_hyperparams(self, _hyperparams):
    # Convert to python object if needed
    if "policy_kwargs" in _hyperparams.keys() and isinstance(_hyperparams["policy_kwargs"], str):
        _hyperparams["policy_kwargs"] = eval(_hyperparams["policy_kwargs"])

    n_timesteps = _hyperparams.pop("n_timesteps", None)
    n_envs = _hyperparams.pop("n_envs", None)
    log_every = _hyperparams.pop("log_every", None)

    if not self.continue_learning:
        if not log_every:
            self.logger.debug("log_every not defined in yml file: using command line log_every {}".format(self.log_every))
            log_every = self.log_every
        else:
            self.logger.debug("using log_every as defined in yml file: {}".format(log_every))
    else:
        self.logger.debug("priority to command line log_every {}".format(self.log_every))
        log_every = self.log_every

    # Parse noise string
    if self.algo_name in ["ddpg", "sac", "td3"] and _hyperparams.get("noise_type") is not None:
        noise_type = _hyperparams["noise_type"].strip()
        noise_std = _hyperparams["noise_std"]
        n_actions = get_n_actions(env_name=self.env_name, env_variables=self.env_kwargs)
        self.logger.debug("n_actions: {}".format(n_actions))
        if "adaptive-param" in noise_type:
            assert self.algo_name == "ddpg", "Parameter noise is not supported by SAC"
            _hyperparams["param_noise"] = AdaptiveParamNoiseSpec(
                initial_stddev=noise_std, desired_action_stddev=noise_std)
        elif "normal" in noise_type:
            if "lin" in noise_type:
                _hyperparams["action_noise"] = LinearNormalActionNoise(
                    mean=np.zeros(n_actions),
                    sigma=noise_std * np.ones(n_actions),
                    final_sigma=_hyperparams.get("noise_std_final", 0.0) * np.ones(n_actions),
                    max_steps=n_timesteps,
                )
            else:
                _hyperparams["action_noise"] = NormalActionNoise(
                    mean=np.zeros(n_actions), sigma=noise_std * np.ones(n_actions)
                )
        elif "ornstein-uhlenbeck" in noise_type:
            _hyperparams["action_noise"] = OrnsteinUhlenbeckActionNoise(
                mean=np.zeros(n_actions), sigma=noise_std * np.ones(n_actions)
            )
        else:
            raise RuntimeError('Unknown noise type "{}"'.format(noise_type))
        self.logger.debug("Applying {} noise with std {}".format(noise_type, noise_std))
        del _hyperparams["noise_type"]
        del _hyperparams["noise_std"]
        if "noise_std_final" in _hyperparams:
            del _hyperparams["noise_std_final"]

    normalize_kwargs = _parse_normalize(dictionary=_hyperparams)

    if n_envs is None:
        self.logger.debug("n_envs not defined in yml file: using command line n_envs {}".format(self.num_envs))
        n_envs = self.num_envs
    else:
        self.logger.debug("using n_envs as num of envs defined in yml file: {}".format(n_envs))

    if not self.continue_learning:
        # priority to yml defined n_timesteps
        if n_timesteps is None:
            self.logger.debug(
                "n_timesteps not defined in yml file: using command line n_timesteps {}".format(self.train_total_timesteps)
            )
            n_timesteps = self.train_total_timesteps
        else:
            self.logger.debug("using n_timesteps as total timesteps defined in yml file: {}".format(n_timesteps))
            n_timesteps = int(n_timesteps)
    else:
        if self.train_total_timesteps and self.train_total_timesteps != -1:
            assert self.train_total_timesteps <= int(n_timesteps), "train_total_timesteps <= n_timesteps: {}, {}".format(
                self.train_total_timesteps, n_timesteps
            )
            # priority to command line n_timesteps
            self.logger.debug("priority to command line n_timesteps {}".format(self.train_total_timesteps))
            n_timesteps = self.train_total_timesteps
        elif self.train_total_timesteps == -1:
            assert n_timesteps, "n_timesteps should have a value: {}".format(n_timesteps)
            n_timesteps = int(n_timesteps)
            self.logger.info("training in continual learning = training from scratch. n_timesteps {}".format(n_timesteps))
        else:
            assert n_timesteps, "n_timesteps should have a value: {}".format(n_timesteps)
            n_timesteps = int(n_timesteps // 2)
            self.logger.debug(
                "train_total_timesteps not specified in continue_learning: "
                "taking half of original n_timesteps defined in yml file {}".format(n_timesteps)
            )

    assert n_timesteps % log_every == 0, "n_timesteps should be divisible by log_every: {}, {}".format(
        n_timesteps, log_every
    )

    return normalize_kwargs, n_envs, n_timesteps, log_every, _hyperparams
def learn(self):
    # Use deterministic actions for evaluation
    eval_path = self.model_dir + "/best_model"
    # TODO save checkpoints with vecnormalize callback pkl file
    save_vec_normalize = SaveVecNormalizeCallback(save_freq=1, save_path=eval_path)
    if self.norm:
        # Don't normalize the reward for the test env
        self.test_env = VecNormalize(self.test_env, norm_obs=True, norm_reward=False,
                                     clip_obs=10.)
    eval_callback = EvalCallback(self.test_env, best_model_save_path=eval_path,
                                 log_path=eval_path + '/logs', eval_freq=50000,
                                 n_eval_episodes=10,
                                 callback_on_new_best=save_vec_normalize,
                                 deterministic=True, render=False)
    checkpoint_callback = CheckpointCallback(save_freq=25000,
                                             save_path=self.model_dir + '/logs/',
                                             name_prefix='rl_model')
    time_callback = TrainingTimeCallback()
    tensorboard_file = None if self.config[self.algo]['tensorboard_logs'] is None \
        else "tensorboard_logs/" + self.model_dir

    if self.algo == 'SAC':
        if not self.env.envs[0].is_simplified() and \
                (self.env.envs[0].depth_obs or self.env.envs[0].full_obs):
            policy_kwargs = {
                "layers": self.config[self.algo]['layers'],
                "cnn_extractor": custom_obs_policy.create_augmented_nature_cnn(1)}
            policy = sacCnn
        elif self.env.envs[0].depth_obs or self.env.envs[0].full_obs:
            policy_kwargs = {}
            policy = sacCnn
        else:
            policy_kwargs = {"layers": self.config[self.algo]['layers'],
                             "layer_norm": False}
            policy = sacMlp
        if self.load_dir:
            top_folder_idx = self.load_dir.rfind('/')
            top_folder_str = self.load_dir[0:top_folder_idx]
            if self.norm:
                self.env = VecNormalize(self.env, training=True, norm_obs=False,
                                        norm_reward=False, clip_obs=10.)
                self.env = VecNormalize.load(os.path.join(top_folder_str, 'vecnormalize.pkl'),
                                             self.env)
            model = sb.SAC(policy, self.env, policy_kwargs=policy_kwargs, verbose=1,
                           gamma=self.config['discount_factor'],
                           buffer_size=self.config[self.algo]['buffer_size'],
                           batch_size=self.config[self.algo]['batch_size'],
                           learning_rate=self.config[self.algo]['step_size'],
                           tensorboard_log=tensorboard_file)
            model_load = sb.SAC.load(self.load_dir, self.env)
            params = model_load.get_parameters()
            model.load_parameters(params, exact_match=False)
        else:
            if self.norm:
                self.env = VecNormalize(self.env, norm_obs=True, norm_reward=True,
                                        clip_obs=10.)
            model = sb.SAC(policy, self.env, policy_kwargs=policy_kwargs, verbose=2,
                           gamma=self.config['discount_factor'],
                           buffer_size=self.config[self.algo]['buffer_size'],
                           batch_size=self.config[self.algo]['batch_size'],
                           learning_rate=self.config[self.algo]['step_size'],
                           tensorboard_log=tensorboard_file)
    elif self.algo == 'TRPO':
        model = sb.TRPO(MlpPolicy, self.env, verbose=2,
                        gamma=self.config['discount_factor'],
                        timesteps_per_batch=self.config[self.algo]['max_iters'],
                        vf_stepsize=self.config[self.algo]['step_size'],
                        tensorboard_log=tensorboard_file)
    elif self.algo == 'PPO':
        if not self.env.envs[0].is_simplified() and \
                (self.env.envs[0].depth_obs or self.env.envs[0].full_obs):
            policy_kwargs = {
                "layers": self.config[self.algo]['layers'],
                "cnn_extractor": custom_obs_policy.create_augmented_nature_cnn(1)}
            policy = CnnPolicy
        elif self.env.envs[0].depth_obs or self.env.envs[0].full_obs:
            policy_kwargs = {}
            policy = CnnPolicy
        else:
            policy_kwargs = {"layers": self.config[self.algo]['layers'],
                             "layer_norm": False}
            policy = MlpPolicy
        model = sb.PPO2(MlpPolicy, self.env, verbose=2,
                        gamma=self.config['discount_factor'],
                        learning_rate=self.config[self.algo]['learning_rate'],
                        tensorboard_log=tensorboard_file)
    elif self.algo == 'DQN':
        if self.load_dir:
            model = self.load_params()
        else:
            model = sb.DQN(DQNMlpPolicy, self.env, verbose=2,
                           gamma=self.config['discount_factor'],
                           batch_size=self.config[self.algo]['batch_size'],
                           prioritized_replay=self.config[self.algo]['prioritized_replay'],
                           tensorboard_log=tensorboard_file)
    elif self.algo == "DDPG":
        param_noise = AdaptiveParamNoiseSpec()
        model = sb.DDPG(ddpgMlp, self.env, verbose=2,
                        gamma=self.config['discount_factor'],
                        param_noise=param_noise,
                        tensorboard_log=tensorboard_file)

    try:
        model.learn(total_timesteps=int(self.config[self.algo]['total_timesteps']),
                    callback=[TensorboardCallback(self.env, tensorboard_file, self.algo,
                                                  self.log_freq, self.model_dir),
                              eval_callback])
    except KeyboardInterrupt:
        pass
    self.save(model, self.model_dir)
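
# Hedged illustration (not from the original): the nested config layout that learn()
# above indexes into. Keys mirror the lookups in the code; the values here are
# placeholders only.
config = {
    'discount_factor': 0.99,
    'SAC': {'layers': [64, 64], 'buffer_size': 1000000, 'batch_size': 256,
            'step_size': 3e-4, 'total_timesteps': 2000000, 'tensorboard_logs': True},
    'TRPO': {'max_iters': 1024, 'step_size': 1e-3,
             'total_timesteps': 1000000, 'tensorboard_logs': None},
    'PPO': {'layers': [64, 64], 'learning_rate': 2.5e-4,
            'total_timesteps': 1000000, 'tensorboard_logs': None},
    'DQN': {'batch_size': 32, 'prioritized_replay': True,
            'total_timesteps': 1000000, 'tensorboard_logs': None},
    'DDPG': {'total_timesteps': 1000000, 'tensorboard_logs': None},
}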
    repo = git.Repo(search_parent_directories=False)
    commit_id = repo.head.object.hexsha
    with open('logs/agent_{}/reproduction_info.txt'.format(args.agent_id), 'w') as f:
        # Use file to refer to the file object
        f.write('Git commit id: {}\n\n'.format(commit_id))
        f.write('Program arguments:\n\n{}'.format(args))
        f.close()
else:
    save_path = '../logs/'

env = Monitor(env, '../logs/')  # logging monitor
model_dir = save_path + '{}_final_model'.format(args.alg)  # model save/load directory

if args.alg == 'ddpg':
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                sigma=args.action_noise * np.ones(n_actions))
    param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(args.param_noise_stddev),
                                         desired_action_stddev=float(args.param_noise_stddev))
    model = DDPG(DDPGPolicy, env, verbose=1, param_noise=param_noise,
                 action_noise=action_noise, render=args.play)
elif args.alg == 'ppo2':
    model = PPO2(CommonMlpPolicy, env, verbose=1)
elif args.alg == 'trpo':
    model = TRPO(CommonMlpPolicy, env, verbose=1, model_dir=save_path)
elif args.alg == 'a2c':
    model = A2C(CommonMlpPolicy, env, verbose=1)
else:
    print(args.alg)
    raise Exception('Algorithm name is not defined!')

print('Model is Created')
try:
    print('Training Started')
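    # Hedged continuation (the fragment above ends inside the try block): a plausible
    # training/save flow with a KeyboardInterrupt guard, mirroring the other scripts
    # in this collection. `args.num_timesteps` is an assumed command line argument,
    # not taken from the original.
    model.learn(total_timesteps=int(args.num_timesteps))
except KeyboardInterrupt:
    print('Training Interrupted')
model.save(model_dir)
print('Model Saved to {}'.format(model_dir))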