def __init__(self, observation_space, action_space, config):
    assert tf.executing_eagerly()
    Policy.__init__(self, observation_space, action_space, config)
    self._is_training = False
    self._loss_initialized = False
    self._sess = None

    if get_default_config:
        config = dict(get_default_config(), **config)

    if before_init:
        before_init(self, observation_space, action_space, config)

    self.config = config

    if action_sampler_fn:
        if not make_model:
            raise ValueError(
                "make_model is required if action_sampler_fn is given")
        self.dist_class = None
    else:
        self.dist_class, logit_dim = ModelCatalog.get_action_dist(
            action_space, self.config["model"])

    if make_model:
        self.model = make_model(self, observation_space, action_space,
                                config)
    else:
        self.model = ModelCatalog.get_model_v2(
            observation_space,
            action_space,
            logit_dim,
            config["model"],
            framework="tf",
        )

    self.model({
        SampleBatch.CUR_OBS: tf.convert_to_tensor(
            np.array([observation_space.sample()])),
        SampleBatch.PREV_ACTIONS: tf.convert_to_tensor(
            [_flatten_action(action_space.sample())]),
        SampleBatch.PREV_REWARDS: tf.convert_to_tensor([0.]),
    }, [
        tf.convert_to_tensor([s]) for s in self.model.get_initial_state()
    ], tf.convert_to_tensor([1]))

    if before_loss_init:
        before_loss_init(self, observation_space, action_space, config)

    self._initialize_loss_with_dummy_batch()
    self._loss_initialized = True

    if optimizer_fn:
        self._optimizer = optimizer_fn(self, config)
    else:
        self._optimizer = tf.train.AdamOptimizer(config["lr"])

    if after_init:
        after_init(self, observation_space, action_space, config)
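# Hedged context, not part of the original snippet: `get_default_config`,
# `before_init`, `action_sampler_fn`, `make_model`, `before_loss_init`,
# `optimizer_fn`, and `after_init` above are free variables captured from an
# enclosing policy-builder closure (in older RLlib versions,
# `build_eager_tf_policy`). A typical call looks roughly like the sketch
# below; the exact keyword set varies by RLlib version, and `MY_CONFIG` /
# `my_loss_fn` are hypothetical names.
from ray.rllib.policy.eager_tf_policy import build_eager_tf_policy

MyEagerPolicy = build_eager_tf_policy(
    name="MyEagerPolicy",
    get_default_config=lambda: MY_CONFIG,  # hypothetical default-config dict
    loss_fn=my_loss_fn,                    # hypothetical loss function
    make_model=None,                       # None -> ModelCatalog.get_model_v2
    optimizer_fn=None,                     # None -> tf.train.AdamOptimizer
)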
def __init__(self, observation_space, action_space, config):
    Policy.__init__(self, observation_space, action_space, config)
    # Build the selfish-mining MDP (OSM) from the configured alpha, gamma
    # and block horizon.
    self.osm = OSM(config['alpha'], config['gamma'], config['blocks'])
    self.osm.MDP_matrix_init()
    P, R = self.osm.get_MDP_matrix()
    # Solve the MDP (transition matrices P, reward matrices R) once with
    # policy iteration at discount 0.99; the resulting deterministic policy
    # is looked up at action-computation time.
    solver = mdptoolbox.mdp.PolicyIteration(P, R, 0.99)
    solver.run()
    self.blocks = config['blocks']
    self.optimal_policy = solver.policy
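# Hedged sketch, not from the original class: how a precomputed tabular
# policy like `self.optimal_policy` is typically consumed in the matching
# `compute_actions()`. It assumes each observation is (or can be mapped to)
# the integer MDP state index used when building P and R above; the real
# observation-to-state mapping is specific to the original implementation.
def compute_actions(self, obs_batch, state_batches=None, **kwargs):
    # Look up the precomputed deterministic action for each state index.
    actions = [self.optimal_policy[int(obs)] for obs in obs_batch]
    return actions, [], {}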
def __init__(self, observation_space, action_space, config,
             actor=None, critic=None):
    Policy.__init__(self, observation_space, action_space, config)
    self.actor = actor
    self.critic = critic
    self.models = [self.actor, self.critic]
def __init__(self, observation_space, action_space, config):
    Policy.__init__(self,
                    observation_space=observation_space,
                    action_space=action_space,
                    config=config)
    env_config = config['env_config']
    self.game_version = env_config['version']
    # Load the OpenSpiel game specified by the env config.
    self.game = pyspiel.load_game(self.game_version)
    self.policy_dict = None
def __init__(self, observation_space, action_space, config):
    Policy.__init__(self,
                    observation_space=observation_space,
                    action_space=action_space,
                    config=config)
    if config["custom_preprocessor"]:
        self.preprocessor = ModelCatalog.get_preprocessor_for_space(
            observation_space=self.observation_space.original_space,
            options={"custom_preprocessor": config["custom_preprocessor"]})
    else:
        raise ValueError(
            "A custom preprocessor for PokerCFRPolicy must be specified "
            "in its config.")
    env_id = config['env']
    assert env_id == POKER_ENV
    self.policy_dict = None
def __init__(self, observation_space, action_space, config): assert tf.executing_eagerly() self.framework = config.get("framework", "tfe") Policy.__init__(self, observation_space, action_space, config) # Log device and worker index. from ray.rllib.evaluation.rollout_worker import get_global_worker worker = get_global_worker() worker_idx = worker.worker_index if worker else 0 if tf.config.list_physical_devices("GPU"): logger.info( "TF-eager Policy (worker={}) running on GPU.".format( worker_idx if worker_idx > 0 else "local")) else: logger.info( "TF-eager Policy (worker={}) running on CPU.".format( worker_idx if worker_idx > 0 else "local")) self._is_training = False self._loss_initialized = False self._sess = None self._loss = loss_fn self.batch_divisibility_req = get_batch_divisibility_req(self) if \ callable(get_batch_divisibility_req) else \ (get_batch_divisibility_req or 1) self._max_seq_len = config["model"]["max_seq_len"] if get_default_config: config = dict(get_default_config(), **config) if validate_spaces: validate_spaces(self, observation_space, action_space, config) if before_init: before_init(self, observation_space, action_space, config) self.config = config self.dist_class = None if action_sampler_fn or action_distribution_fn: if not make_model: raise ValueError( "`make_model` is required if `action_sampler_fn` OR " "`action_distribution_fn` is given") else: self.dist_class, logit_dim = ModelCatalog.get_action_dist( action_space, self.config["model"]) if make_model: self.model = make_model(self, observation_space, action_space, config) else: self.model = ModelCatalog.get_model_v2( observation_space, action_space, logit_dim, config["model"], framework=self.framework, ) # Lock used for locking some methods on the object-level. # This prevents possible race conditions when calling the model # first, then its value function (e.g. in a loss function), in # between of which another model call is made (e.g. to compute an # action). self._lock = threading.RLock() # Auto-update model's inference view requirements, if recurrent. self._update_model_view_requirements_from_init_state() self.exploration = self._create_exploration() self._state_inputs = self.model.get_initial_state() self._is_recurrent = len(self._state_inputs) > 0 # Combine view_requirements for Model and Policy. self.view_requirements.update(self.model.view_requirements) if before_loss_init: before_loss_init(self, observation_space, action_space, config) if optimizer_fn: optimizers = optimizer_fn(self, config) else: optimizers = tf.keras.optimizers.Adam(config["lr"]) optimizers = force_list(optimizers) if getattr(self, "exploration", None): optimizers = self.exploration.get_exploration_optimizer( optimizers) # TODO: (sven) Allow tf policy to have more than 1 optimizer. # Just like torch Policy does. self._optimizer = optimizers[0] if optimizers else None self._initialize_loss_from_dummy_batch( auto_remove_unneeded_view_reqs=True, stats_fn=stats_fn, ) self._loss_initialized = True if after_init: after_init(self, observation_space, action_space, config) # Got to reset global_timestep again after fake run-throughs. self.global_timestep = 0
def __init__(
    self,
    observation_space: gym.spaces.Space,
    action_space: gym.spaces.Space,
    config: TrainerConfigDict,
    **kwargs,
):
    self.framework = config.get("framework", "tf2")

    # Log device.
    logger.info("Creating TF-eager policy running on {}.".format(
        "GPU" if get_gpu_devices() else "CPU"))

    Policy.__init__(self, observation_space, action_space, config)

    config = dict(self.get_default_config(), **config)
    self.config = config

    self._is_training = False
    # Global timestep should be a tensor.
    self.global_timestep = tf.Variable(0, trainable=False, dtype=tf.int64)
    self.explore = tf.Variable(
        self.config["explore"], trainable=False, dtype=tf.bool)

    self._loss_initialized = False
    # Backward compatibility workaround so Policy will call self.loss()
    # directly.
    # TODO(jungong): clean up after all policies are migrated to new
    #  sub-class implementation.
    self._loss = None

    self.batch_divisibility_req = self.get_batch_divisibility_req()
    self._max_seq_len = config["model"]["max_seq_len"]

    self.validate_spaces(observation_space, action_space, config)

    # If using default make_model(), dist_class will get updated when
    # the model is created next.
    self.dist_class = self._init_dist_class()
    self.model = self.make_model()

    self._init_view_requirements()

    self.exploration = self._create_exploration()
    self._state_inputs = self.model.get_initial_state()
    self._is_recurrent = len(self._state_inputs) > 0

    # Got to reset global_timestep again after fake run-throughs.
    self.global_timestep.assign(0)

    # Lock used for locking some methods on the object-level.
    # This prevents possible race conditions when calling the model
    # first, then its value function (e.g. in a loss function), in
    # between of which another model call is made (e.g. to compute an
    # action).
    self._lock = threading.RLock()

    # Only for `config.eager_tracing=True`: A counter to keep track of
    # how many times an eager-traced method (e.g.
    # `self._compute_actions_helper`) has been re-traced by tensorflow.
    # We will raise an error if more than n re-tracings have been
    # detected, since this would considerably slow down execution.
    # The variable below should only get incremented during the
    # tf.function trace operations, never when calling the already
    # traced function after that.
    self._re_trace_counter = 0
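# Hedged sketch, not from the original file: in this newer sub-class style
# (RLlib's EagerTFPolicyV2), hooks such as the loss are overridden methods
# rather than functions passed to a policy builder. `MyPolicy` and the loss
# body below are illustrative placeholders only.
import tensorflow as tf
from ray.rllib.policy.eager_tf_policy_v2 import EagerTFPolicyV2

class MyPolicy(EagerTFPolicyV2):
    def loss(self, model, dist_class, train_batch):
        # Forward pass through the model; a real loss would build an action
        # distribution from these logits and combine log-probs, advantages,
        # value targets, etc.
        logits, _ = model(train_batch)
        return tf.reduce_mean(tf.reduce_sum(tf.square(logits), axis=-1))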
def __init__(self, observation_space, action_space, config):
    Policy.__init__(self, observation_space, action_space, config)
def __init__(self, observation_space, action_space, config):
    Policy.__init__(self, observation_space, action_space, config)
    self.blocks = config['blocks']
    self.fiftyone = config['fiftyone']
    self.extended = config['extended']
def __init__(self, observation_space, action_space, config):
    Policy.__init__(self, observation_space, action_space, config)
    # Derive the infiltration fraction from the Nash equilibrium computed
    # for the configured alphas.
    x, y, r1, r2 = get_Nash_equilibrium(config['alphas'])
    self.infiltrating = y / config['alphas'][1]
def __init__(self, observation_space, action_space, config):
    Policy.__init__(self, observation_space, action_space, config)
    self.infiltrating = config['infiltrating']
def __init__(self, observation_space, action_space, config):
    Policy.__init__(self, observation_space, action_space, config)
    self.action_space_shape = action_space.shape
    self.n_products = config['number_of_products']
    self.n_sources = config['number_of_sources']
def __init__(self, observation_space, action_space, config):
    Policy.__init__(self, observation_space, action_space, config)
    # Example parameter.
    self.w = 1.0
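# Hedged sketch, not from the original class: a heuristic Policy that keeps a
# plain Python parameter like `self.w` usually also overrides the methods
# below so RLlib can query actions and sync "weights"; the bodies here are
# illustrative placeholders.
def compute_actions(self, obs_batch, state_batches=None, **kwargs):
    # One (random) action per observation in the batch.
    return [self.action_space.sample() for _ in obs_batch], [], {}

def get_weights(self):
    return {"w": self.w}

def set_weights(self, weights):
    self.w = weights["w"]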
def __init__(self, observation_space, action_space, config):
    # If this class runs as a @ray.remote actor, eager mode may not
    # have been activated yet.
    if not tf1.executing_eagerly():
        tf1.enable_eager_execution()
    self.framework = config.get("framework", "tfe")
    Policy.__init__(self, observation_space, action_space, config)

    # Log device and worker index.
    from ray.rllib.evaluation.rollout_worker import get_global_worker
    worker = get_global_worker()
    worker_idx = worker.worker_index if worker else 0
    if get_gpu_devices():
        logger.info("TF-eager Policy (worker={}) running on GPU.".format(
            worker_idx if worker_idx > 0 else "local"))
    else:
        logger.info("TF-eager Policy (worker={}) running on CPU.".format(
            worker_idx if worker_idx > 0 else "local"))

    self._is_training = False

    # Only for `config.eager_tracing=True`: A counter to keep track of
    # how many times an eager-traced method (e.g.
    # `self._compute_actions_helper`) has been re-traced by tensorflow.
    # We will raise an error if more than n re-tracings have been
    # detected, since this would considerably slow down execution.
    # The variable below should only get incremented during the
    # tf.function trace operations, never when calling the already
    # traced function after that.
    self._re_trace_counter = 0

    self._loss_initialized = False

    # To ensure backward compatibility:
    # Old way: If `loss` provided here, use as-is (as a function).
    if loss_fn is not None:
        self._loss = loss_fn
    # New way: Convert the overridden `self.loss` into a plain
    # function, so it can be called the same way as `loss` would
    # be, ensuring backward compatibility.
    elif self.loss.__func__.__qualname__ != "Policy.loss":
        self._loss = self.loss.__func__
    # `loss` not provided nor overridden from Policy -> Set to None.
    else:
        self._loss = None

    self.batch_divisibility_req = (
        get_batch_divisibility_req(self)
        if callable(get_batch_divisibility_req)
        else (get_batch_divisibility_req or 1))
    self._max_seq_len = config["model"]["max_seq_len"]

    if get_default_config:
        config = dict(get_default_config(), **config)

    if validate_spaces:
        validate_spaces(self, observation_space, action_space, config)

    if before_init:
        before_init(self, observation_space, action_space, config)

    self.config = config

    self.dist_class = None
    if action_sampler_fn or action_distribution_fn:
        if not make_model:
            raise ValueError(
                "`make_model` is required if `action_sampler_fn` OR "
                "`action_distribution_fn` is given")
    else:
        self.dist_class, logit_dim = ModelCatalog.get_action_dist(
            action_space, self.config["model"])

    if make_model:
        self.model = make_model(self, observation_space, action_space,
                                config)
    else:
        self.model = ModelCatalog.get_model_v2(
            observation_space,
            action_space,
            logit_dim,
            config["model"],
            framework=self.framework,
        )

    # Lock used for locking some methods on the object-level.
    # This prevents possible race conditions when calling the model
    # first, then its value function (e.g. in a loss function), in
    # between of which another model call is made (e.g. to compute an
    # action).
    self._lock = threading.RLock()

    # Auto-update model's inference view requirements, if recurrent.
    self._update_model_view_requirements_from_init_state()

    self.exploration = self._create_exploration()
    self._state_inputs = self.model.get_initial_state()
    self._is_recurrent = len(self._state_inputs) > 0

    # Combine view_requirements for Model and Policy.
    self.view_requirements.update(self.model.view_requirements)

    if before_loss_init:
        before_loss_init(self, observation_space, action_space, config)

    if optimizer_fn:
        optimizers = optimizer_fn(self, config)
    else:
        optimizers = tf.keras.optimizers.Adam(config["lr"])
    optimizers = force_list(optimizers)
    if getattr(self, "exploration", None):
        optimizers = self.exploration.get_exploration_optimizer(optimizers)

    # The list of local (tf) optimizers (one per loss term).
    self._optimizers: List[LocalOptimizer] = optimizers
    # Backward compatibility: A user's policy may only support a single
    # loss term and optimizer (no lists).
    self._optimizer: LocalOptimizer = optimizers[0] if optimizers else None

    self._initialize_loss_from_dummy_batch(
        auto_remove_unneeded_view_reqs=True,
        stats_fn=stats_fn,
    )
    self._loss_initialized = True

    if after_init:
        after_init(self, observation_space, action_space, config)

    # Got to reset global_timestep again after fake run-throughs.
    self.global_timestep = 0
def __init__(self, observation_space, action_space, config): assert tf.executing_eagerly() self.framework = "tfe" Policy.__init__(self, observation_space, action_space, config) self._is_training = False self._loss_initialized = False self._sess = None if get_default_config: config = dict(get_default_config(), **config) if validate_spaces: validate_spaces(self, observation_space, action_space, config) if before_init: before_init(self, observation_space, action_space, config) self.config = config self.dist_class = None if action_sampler_fn or action_distribution_fn: if not make_model: raise ValueError( "`make_model` is required if `action_sampler_fn` OR " "`action_distribution_fn` is given") else: self.dist_class, logit_dim = ModelCatalog.get_action_dist( action_space, self.config["model"]) if make_model: self.model = make_model(self, observation_space, action_space, config) else: self.model = ModelCatalog.get_model_v2( observation_space, action_space, logit_dim, config["model"], framework=self.framework, ) self.exploration = self._create_exploration() self._state_in = [ tf.convert_to_tensor([s]) for s in self.model.get_initial_state() ] input_dict = { SampleBatch.CUR_OBS: tf.convert_to_tensor( np.array([observation_space.sample()])), SampleBatch.PREV_ACTIONS: tf.convert_to_tensor( [flatten_to_single_ndarray(action_space.sample())]), SampleBatch.PREV_REWARDS: tf.convert_to_tensor([0.]), } if action_distribution_fn: dist_inputs, self.dist_class, _ = action_distribution_fn( self, self.model, input_dict[SampleBatch.CUR_OBS]) else: self.model(input_dict, self._state_in, tf.convert_to_tensor([1])) if before_loss_init: before_loss_init(self, observation_space, action_space, config) self._initialize_loss_with_dummy_batch() self._loss_initialized = True if optimizer_fn: self._optimizer = optimizer_fn(self, config) else: self._optimizer = tf.keras.optimizers.Adam(config["lr"]) if after_init: after_init(self, observation_space, action_space, config)
def __init__(self, observation_space, action_space, config): assert tf.executing_eagerly() self.framework = config.get("framework", "tfe") Policy.__init__(self, observation_space, action_space, config) self._is_training = False self._loss_initialized = False self._sess = None self._loss = loss_fn self.batch_divisibility_req = get_batch_divisibility_req(self) if \ callable(get_batch_divisibility_req) else \ (get_batch_divisibility_req or 1) self._max_seq_len = config["model"]["max_seq_len"] if get_default_config: config = dict(get_default_config(), **config) if validate_spaces: validate_spaces(self, observation_space, action_space, config) if before_init: before_init(self, observation_space, action_space, config) self.config = config self.dist_class = None if action_sampler_fn or action_distribution_fn: if not make_model: raise ValueError( "`make_model` is required if `action_sampler_fn` OR " "`action_distribution_fn` is given") else: self.dist_class, logit_dim = ModelCatalog.get_action_dist( action_space, self.config["model"]) if make_model: self.model = make_model(self, observation_space, action_space, config) else: self.model = ModelCatalog.get_model_v2( observation_space, action_space, logit_dim, config["model"], framework=self.framework, ) # Auto-update model's inference view requirements, if recurrent. self._update_model_inference_view_requirements_from_init_state() self.exploration = self._create_exploration() self._state_in = [ tf.convert_to_tensor([s]) for s in self.model.get_initial_state() ] # Update this Policy's ViewRequirements (if function given). if callable(view_requirements_fn): self.view_requirements.update(view_requirements_fn(self)) # Combine view_requirements for Model and Policy. self.view_requirements.update( self.model.inference_view_requirements) if before_loss_init: before_loss_init(self, observation_space, action_space, config) self._initialize_loss_from_dummy_batch( auto_remove_unneeded_view_reqs=True, stats_fn=stats_fn, ) self._loss_initialized = True if optimizer_fn: optimizers = optimizer_fn(self, config) else: optimizers = tf.keras.optimizers.Adam(config["lr"]) optimizers = force_list(optimizers) if getattr(self, "exploration", None): optimizers = self.exploration.get_exploration_optimizer( optimizers) # TODO: (sven) Allow tf policy to have more than 1 optimizer. # Just like torch Policy does. self._optimizer = optimizers[0] if optimizers else None if after_init: after_init(self, observation_space, action_space, config) # Got to reset global_timestep again after this fake run-through. self.global_timestep = 0
def __init__(self, obs_space, action_space, config):
    Policy.__init__(self, obs_space, action_space, config)
    self.distance = 0.25