def __init__(self, action_mode: ActionMode, obs_config: ObservationConfig, task_class, agent_config):
    # call parent constructor
    super(ESAgent, self).__init__(action_mode, obs_config, task_class, agent_config)

    # set up some parameters
    self.es_hparams = self.cfg["ESAgent"]["Hyperparameters"]
    self.n_workers = self.es_hparams["n_workers"]
    self.perturbations_per_batch = self.es_hparams["perturbations_per_batch"]

    # more than one worker requires headless mode
    if self.n_workers > 1 and not self.headless:
        print("Turning headless mode on, since more than one worker is running.")
        self.headless = True

    # round the number of perturbations per batch up to the next multiple of the number of workers
    if self.perturbations_per_batch % self.n_workers != 0:
        corrected_perturbations_per_batch = self.perturbations_per_batch + \
            (self.n_workers - self.perturbations_per_batch % self.n_workers)
        print("\nChanging the number of perturbations per batch from %d to %d."
              % (self.perturbations_per_batch, corrected_perturbations_per_batch))
        self.perturbations_per_batch = corrected_perturbations_per_batch

    # correct the validation interval
    if self.make_validation_during_training:
        # align the validation interval with the number of perturbations per batch
        if self.validation_interval >= self.perturbations_per_batch:
            remainder = self.validation_interval % self.perturbations_per_batch
        else:
            remainder = self.perturbations_per_batch % self.validation_interval
        if remainder != 0:
            if self.validation_interval >= self.perturbations_per_batch:
                new_valid_interval = self.validation_interval + (self.perturbations_per_batch - remainder)
            else:
                new_valid_interval = self.validation_interval + remainder
            if new_valid_interval - self.validation_interval > 20:
                question = "The validation interval needs to be adjusted from %d to %d. The difference is quite large, " \
                           "do you want to proceed anyway?" % (self.validation_interval, new_valid_interval)
                if not utils.query_yes_no(question):
                    print("Terminating ...")
                    sys.exit()
            print("\nChanging validation interval from %d to %d to align with the number of workers.\n"
                  % (self.validation_interval, new_valid_interval))
            self.validation_interval = new_valid_interval

    if self.save_weights:
        self.save_weights_interval = utils.adjust_save_interval(self.save_weights_interval, self.n_workers)
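# Illustrative sketch (not part of the agent): the correction above simply lifts a value
# to the next multiple of a divisor. The helper below reproduces that arithmetic in
# isolation; the name `round_up_to_multiple` is hypothetical and used only for illustration.
def round_up_to_multiple(value: int, divisor: int) -> int:
    """Return the smallest multiple of `divisor` that is >= `value`."""
    remainder = value % divisor
    return value if remainder == 0 else value + (divisor - remainder)

# e.g. round_up_to_multiple(50, 8) == 56, so 50 perturbations per batch with 8 workers
# would be corrected to 56, matching the print-out in the constructor above.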
def __init__(self, agent_config_path=None):
    # read config file
    self.cfg = None
    if not agent_config_path:
        question = "No config-file path provided. Do you really want to continue with the default config file?"
        if not utils.query_yes_no(question):
            print("Terminating ...")
            sys.exit()
        agent_config_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                                         "config", "default_config.yaml")

    with open(agent_config_path, "r") as stream:
        self.cfg = yaml.safe_load(stream)

    self.action_mode = self.__setup_action_mode()
    self.obs_config = self.__setup_obs_config()
    self.task_class = self.__setup_task_class()
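# Illustrative sketch (assumption, not the shipped default_config.yaml): after yaml.safe_load,
# self.cfg is a nested dict. The keys shown below are the ones accessed elsewhere in this
# section; the values are placeholders only.
#
# self.cfg = {
#     "Agent": {"Type": "OpenAIES"},  # or "DDPG"
#     "ESAgent": {"Hyperparameters": {"n_workers": 4,
#                                     "perturbations_per_batch": 64,
#                                     "layers_network": [128, 128]}},
#     "DDPG": {"Hyperparameters": {"gamma": 0.99, "tau": 0.001, "lr_actor": 0.0001, "lr_critic": 0.001},
#              "Setup": {"start_training": 1000, "replay_buffer_mode": "VANILLA", "buffer_size": 100000}},
# }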
def run(self):
    if self.mode == "online_training":
        self.run_online_training()
    elif self.mode == "validation":
        validation_model = Network(self.es_hparams["layers_network"], self.dim_actions, self.max_actions)
        validation_model.build((1, self.dim_observations))
        if not self.path_to_model:
            question = "You have not set a path to a model. Do you really want to validate a random model?"
            if not utils.query_yes_no(question):
                print("Terminating ...")
                sys.exit()
        else:
            print("\nReading model from ", self.path_to_model, "...\n")
            validation_model.load_weights(os.path.join(self.path_to_model, "weights", "variables", "variables"))
        self.run_validation(validation_model)
    else:
        raise ValueError("\n%s mode is not supported in OpenAI-ES.\n" % self.mode)
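# Illustrative sketch (assumption about how the checkpoint was written): the load path above
# is a TensorFlow checkpoint prefix, so the training side is expected to have produced it
# roughly like this, with directory names matching the path used in run():
#
#   prefix = os.path.join(log_dir, "weights", "variables", "variables")
#   model.save_weights(prefix)   # writes prefix.index / prefix.data-* checkpoint files
#
# which load_weights(prefix) can then restore into a freshly built Network.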
def run(self):
    if self.mode == "online_training":
        self.run_workers()
        self.run_online_training()
    elif self.mode == "offline_training":
        self.run_offline_training()
    elif self.mode == "validation":
        if not self.path_to_model:
            question = "You have not set a path to a model. Do you really want to validate a random model?"
            if not utils.query_yes_no(question):
                print("Terminating ...")
                sys.exit()
        self.run_validation(self.actor)
    elif self.mode == "validation_mult":
        # if an OpenAI-ES agent is evaluated here, use its network instead
        if self.cfg["Agent"]["Type"] == "OpenAIES":
            self.actor = ES_Network(self.cfg["ESAgent"]["Hyperparameters"]["layers_network"],
                                    self.dim_actions, self.max_actions)
            self.actor.build((1, self.dim_observations))
        self.run_validation_post()
    else:
        raise ValueError("\n%s mode is not supported in DDPG!\n" % self.mode)
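# Illustrative usage sketch (assumption: self.mode is set from the config / CLI in the base
# Agent): a typical driver script would construct the agent and simply call run(), letting
# the mode string select one of the branches above, e.g.
#
#   agent = DDPG(action_mode, obs_config, task_class, agent_config_path)
#   agent.run()   # dispatches to online/offline training or one of the validation modes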
def __init__(self, action_mode, obs_config, task_class, agent_config):
    # call parent constructor
    super(DDPG, self).__init__(action_mode, obs_config, task_class, agent_config)

    # define the dimensions
    self.dim_inputs_actor = self.dim_observations
    self.dim_inputs_critic = self.dim_observations + self.dim_actions

    # set up some hyperparameters
    hparams = self.cfg["DDPG"]["Hyperparameters"]
    self.gamma = hparams["gamma"]
    self.tau = hparams["tau"]
    self.sigma = hparams["sigma"]
    self.batch_size = hparams["batch_size"]
    self.training_interval = hparams["training_interval"]
    self.max_epsilon = hparams["max_epsilon"]
    self.min_epsilon = hparams["min_epsilon"]
    self.epsilon = self.max_epsilon
    self.epsilon_decay_episodes = hparams["epsilon_decay_episodes"]
    self.layers_actor = hparams["layers_actor"]
    self.layers_critic = hparams["layers_critic"]
    self.lr_actor = hparams["lr_actor"]
    self.lr_critic = hparams["lr_critic"]

    # some DDPG-specific setup
    setup = self.cfg["DDPG"]["Setup"]
    self.start_training = setup["start_training"]
    self.use_ou_noise = setup["use_ou_noise"]
    self.use_target_copying = setup["use_target_copying"]
    self.save_dones_in_buffer = setup["save_dones_in_buffer"]
    self.use_fixed_importance_sampling = setup["use_fixed_importance_sampling"]
    self.importance_sampling_weight = setup["importance_sampling_weight"]
    self.interval_copy_target = setup["interval_copy_target"]
    self.global_step_main = 0
    self.global_episode = 0
    self.write_buffer = setup["write_buffer"]
    self.path_to_read_buffer = None
    if setup["read_buffer_id"]:
        main_logging_dir, _ = os.path.split(os.path.dirname(self.root_log_dir))
        self.path_to_read_buffer = os.path.join(main_logging_dir, setup["read_buffer_id"], "")
        if not os.path.exists(self.path_to_read_buffer):
            raise FileNotFoundError("The given path to the read database's directory does not exist: %s"
                                    % self.path_to_read_buffer)

    # set up the replay buffer
    self.replay_buffer_mode = setup["replay_buffer_mode"]
    if self.replay_buffer_mode == "VANILLA":
        self.replay_buffer = ReplayBuffer(setup["buffer_size"],
                                          path_to_db_write=self.root_log_dir,
                                          path_to_db_read=self.path_to_read_buffer,
                                          dim_observations=self.dim_observations,
                                          dim_actions=self.dim_actions,
                                          write=self.write_buffer)
    elif self.replay_buffer_mode == "PER_PYTHON":
        self.replay_buffer = PrioReplayBuffer(setup["buffer_size"],
                                              path_to_db_write=self.root_log_dir,
                                              path_to_db_read=self.path_to_read_buffer,
                                              dim_observations=self.dim_observations,
                                              dim_actions=self.dim_actions,
                                              write=self.write_buffer,
                                              use_cpp=False)
    elif self.replay_buffer_mode == "PER_CPP":
        self.replay_buffer = PrioReplayBuffer(setup["buffer_size"],
                                              path_to_db_write=self.root_log_dir,
                                              path_to_db_read=self.path_to_read_buffer,
                                              dim_observations=self.dim_observations,
                                              dim_actions=self.dim_actions,
                                              write=self.write_buffer,
                                              use_cpp=True)
    else:
        raise ValueError("Unsupported replay buffer type. Please choose either VANILLA, PER_PYTHON or PER_CPP.")

    # if a pre-filled buffer is read, reduce the number of steps before training starts accordingly
    if self.path_to_read_buffer:
        if self.replay_buffer.length >= self.start_training:
            self.start_training = 0
        else:
            self.start_training = self.start_training - self.replay_buffer.length

    self.n_random_episodes = None  # set later in get_action method

    if self.mode == "online_training":
        print("\nStarting training in %d steps." % self.start_training)

    # set up tensorboard
    self.summary_writer = None
    if self.use_tensorboard:
        self.tensorboard_logger = TensorBoardLogger(root_log_dir=self.root_log_dir)

    # set up tensorboard for validation
    if self.make_validation_during_training:
        self.tensorboard_logger_validation = TensorBoardLoggerValidation(root_log_dir=self.root_log_dir)

        # align the validation interval with the number of workers
        if self.validation_interval >= self.n_workers:
            remainder = self.validation_interval % self.n_workers
        else:
            remainder = self.n_workers % self.validation_interval
        if remainder != 0:
            if self.validation_interval >= self.n_workers:
                new_valid_interval = self.validation_interval + (self.n_workers - remainder)
            else:
                new_valid_interval = self.validation_interval + remainder
            if new_valid_interval - self.validation_interval > 20:
                question = "The validation interval needs to be adjusted from %d to %d. The difference is quite large, " \
                           "do you want to proceed anyway?" % (self.validation_interval, new_valid_interval)
                if not utils.query_yes_no(question):
                    print("Terminating ...")
                    sys.exit()
            print("\nChanging validation interval from %d to %d to align with the number of workers.\n"
                  % (self.validation_interval, new_valid_interval))
            self.validation_interval = new_valid_interval

    # --- define actor and its target ---
    self.actor = ActorNetwork(self.layers_actor, self.dim_actions, self.max_actions,
                              sigma=self.sigma, use_ou_noise=self.use_ou_noise)
    self.target_actor = ActorNetwork(self.layers_actor, self.dim_actions, self.max_actions,
                                     sigma=self.sigma, use_ou_noise=self.use_ou_noise)

    # instantiate the models
    self.actor.build((1, self.dim_inputs_actor))
    self.target_actor.build((1, self.dim_inputs_actor))

    # set up the actor's optimizer
    self.optimizer_actor = tf.keras.optimizers.Adam(learning_rate=self.lr_actor)

    # --- define the critic and its target ---
    if type(self) == DDPG:
        self.critic = CriticNetwork(self.layers_critic, dim_obs=self.dim_observations,
                                    dim_outputs=1)  # one Q-value per state needed
        self.target_critic = CriticNetwork(self.layers_critic, dim_obs=self.dim_observations,
                                           dim_outputs=1)  # one Q-value per state needed

        # instantiate the models
        self.critic.build((1, self.dim_inputs_critic))
        self.target_critic.build((1, self.dim_inputs_critic))

        # set up the critic's optimizer
        self.optimizer_critic = tf.keras.optimizers.Adam(learning_rate=self.lr_critic)

    # --- copy weights to targets or load old model weights ---
    if type(self) == DDPG:
        self.init_or_load_weights(load_critic=(self.mode != "validation" and self.mode != "validation_mult"))
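# Illustrative sketch (standard DDPG soft-update rule, not necessarily the exact code used in
# init_or_load_weights or the training loop of this class): `tau` blends the online weights
# into the target weights each update, while `use_target_copying` / `interval_copy_target`
# suggest an alternative hard copy every N steps.
def soft_update(target_model, online_model, tau):
    """theta_target <- tau * theta_online + (1 - tau) * theta_target."""
    new_weights = [tau * w_online + (1.0 - tau) * w_target
                   for w_online, w_target in zip(online_model.get_weights(),
                                                 target_model.get_weights())]
    target_model.set_weights(new_weights)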