Example #1
        def __init__(self, observation_space, action_space, config):
            assert tf.executing_eagerly()
            Policy.__init__(self, observation_space, action_space, config)
            self._is_training = False
            self._loss_initialized = False
            self._sess = None

            if get_default_config:
                config = dict(get_default_config(), **config)

            if before_init:
                before_init(self, observation_space, action_space, config)

            self.config = config

            if action_sampler_fn:
                if not make_model:
                    raise ValueError(
                        "make_model is required if action_sampler_fn is given")
                self.dist_class = None
            else:
                self.dist_class, logit_dim = ModelCatalog.get_action_dist(
                    action_space, self.config["model"])

            if make_model:
                self.model = make_model(self, observation_space, action_space,
                                        config)
            else:
                self.model = ModelCatalog.get_model_v2(
                    observation_space,
                    action_space,
                    logit_dim,
                    config["model"],
                    framework="tf",
                )

            self.model({
                SampleBatch.CUR_OBS: tf.convert_to_tensor(
                    np.array([observation_space.sample()])),
                SampleBatch.PREV_ACTIONS: tf.convert_to_tensor(
                    [_flatten_action(action_space.sample())]),
                SampleBatch.PREV_REWARDS: tf.convert_to_tensor([0.]),
            }, [
                tf.convert_to_tensor([s])
                for s in self.model.get_initial_state()
            ], tf.convert_to_tensor([1]))

            if before_loss_init:
                before_loss_init(self, observation_space, action_space, config)

            self._initialize_loss_with_dummy_batch()
            self._loss_initialized = True

            if optimizer_fn:
                self._optimizer = optimizer_fn(self, config)
            else:
                self._optimizer = tf.train.AdamOptimizer(config["lr"])

            if after_init:
                after_init(self, observation_space, action_space, config)
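
Names such as `get_default_config`, `before_init`, `make_model` and `action_sampler_fn` are not attributes of the class; they are free variables captured from the enclosing policy-builder function (RLlib's eager TF policy template). The block near the end just pushes one fake sample through the model so its variables get created eagerly. A minimal standalone sketch of that warm-up call, assuming Ray RLlib ~1.x and Gym are installed (catalog signatures changed in later releases):

    import numpy as np
    import gym
    import tensorflow as tf

    from ray.rllib.models import ModelCatalog, MODEL_DEFAULTS
    from ray.rllib.policy.sample_batch import SampleBatch

    obs_space = gym.spaces.Box(low=-1.0, high=1.0, shape=(4,), dtype=np.float32)
    act_space = gym.spaces.Discrete(2)
    model_config = MODEL_DEFAULTS.copy()

    # Same two catalog calls as in the template above.
    dist_class, logit_dim = ModelCatalog.get_action_dist(act_space, model_config)
    model = ModelCatalog.get_model_v2(
        obs_space, act_space, logit_dim, model_config, framework="tf")

    # One fake batch through the model so all variables exist before training.
    logits, state_out = model(
        {SampleBatch.CUR_OBS: tf.convert_to_tensor(
            np.array([obs_space.sample()]))},
        [tf.convert_to_tensor([s]) for s in model.get_initial_state()],
        tf.convert_to_tensor([1]),
    )
    print(logits.shape)  # (1, logit_dim)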
Example #2
    def __init__(self, observation_space, action_space, config):
        Policy.__init__(self, observation_space, action_space, config)
        self.osm = OSM(config['alpha'], config['gamma'], config['blocks'])
        self.osm.MDP_matrix_init()
        P, R = self.osm.get_MDP_matrix()
        solver = mdptoolbox.mdp.PolicyIteration(P, R, 0.99)
        solver.run()
        self.blocks = config['blocks']
        self.optimal_policy = solver.policy
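
The `OSM` helper (an optimal-selfish-mining MDP builder) is project-specific, but the solver half is plain pymdptoolbox. A self-contained sketch of the same PolicyIteration call, using the library's built-in toy forest MDP in place of the mining matrices:

    import mdptoolbox.example
    import mdptoolbox.mdp

    # Toy transition (P) and reward (R) matrices shipped with pymdptoolbox.
    P, R = mdptoolbox.example.forest()
    solver = mdptoolbox.mdp.PolicyIteration(P, R, 0.99)
    solver.run()
    print(solver.policy)  # tuple with the optimal action index per state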
Example #3
    def __init__(self,
                 observation_space,
                 action_space,
                 config,
                 actor=None,
                 critic=None):
        Policy.__init__(self, observation_space, action_space, config)
        self.actor = actor
        self.critic = critic
        self.models = [self.actor, self.critic]
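
This constructor only stores externally built `actor`/`critic` networks. The rest of such a policy typically forwards observations through the actor and exposes both models' weights; a hypothetical continuation of the same class (the Keras-style `predict`/`get_weights` calls are assumptions, not from the source):

    def compute_actions(self, obs_batch, state_batches=None, **kwargs):
        # Greedy actions straight from the injected actor network.
        actions = self.actor.predict(obs_batch)
        return actions, [], {}

    def get_weights(self):
        return [m.get_weights() for m in self.models]

    def set_weights(self, weights):
        for m, w in zip(self.models, weights):
            m.set_weights(w)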
Example #4
    def __init__(self, observation_space, action_space, config):
        Policy.__init__(self,
                        observation_space=observation_space,
                        action_space=action_space,
                        config=config)

        env_config = config['env_config']
        self.game_version = env_config['version']

        self.game = pyspiel.load_game(self.game_version)

        self.policy_dict = None
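
`pyspiel` here is DeepMind's OpenSpiel binding, and `env_config['version']` is expected to be an OpenSpiel game name ("kuhn_poker" below is an assumption, not taken from the source). A quick standalone sketch of loading a game the same way and rolling out one random episode:

    import random
    import pyspiel

    game = pyspiel.load_game("kuhn_poker")  # assumed game name
    state = game.new_initial_state()
    while not state.is_terminal():
        if state.is_chance_node():
            actions, probs = zip(*state.chance_outcomes())
            state.apply_action(random.choices(actions, weights=probs)[0])
        else:
            state.apply_action(random.choice(state.legal_actions()))
    print(state.returns())  # per-player returns for the finished episode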
Example #5
    def __init__(self, observation_space, action_space, config):
        Policy.__init__(self, observation_space=observation_space, action_space=action_space, config=config)

        if config["custom_preprocessor"]:
            self.preprocessor = ModelCatalog.get_preprocessor_for_space(
                observation_space=self.observation_space.original_space,
                options={"custom_preprocessor": config["custom_preprocessor"]})
        else:
            raise ValueError("Custom preprocessor for PokerCFRPolicy needs to be specified on its passed config.")

        env_id = config['env']
        assert env_id == POKER_ENV
        self.policy_dict = None
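
This constructor insists that a custom preprocessor is named in the config. A sketch of how such a preprocessor is typically registered with the older RLlib custom-preprocessor API (deprecated in 2.x; the class and registry name below are illustrative, not from the source):

    import numpy as np
    from ray.rllib.models import ModelCatalog
    from ray.rllib.models.preprocessors import Preprocessor

    class FlattenPreprocessor(Preprocessor):
        def _init_shape(self, obs_space, options):
            return (int(np.prod(obs_space.shape)),)

        def transform(self, observation):
            return np.asarray(observation, dtype=np.float32).reshape(self.shape)

    ModelCatalog.register_custom_preprocessor("my_flatten_prep", FlattenPreprocessor)
    # ...then pass {"custom_preprocessor": "my_flatten_prep"} in the policy config.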
Example #6
        def __init__(self, observation_space, action_space, config):
            assert tf.executing_eagerly()
            self.framework = config.get("framework", "tfe")
            Policy.__init__(self, observation_space, action_space, config)

            # Log device and worker index.
            from ray.rllib.evaluation.rollout_worker import get_global_worker
            worker = get_global_worker()
            worker_idx = worker.worker_index if worker else 0
            if tf.config.list_physical_devices("GPU"):
                logger.info(
                    "TF-eager Policy (worker={}) running on GPU.".format(
                        worker_idx if worker_idx > 0 else "local"))
            else:
                logger.info(
                    "TF-eager Policy (worker={}) running on CPU.".format(
                        worker_idx if worker_idx > 0 else "local"))

            self._is_training = False
            self._loss_initialized = False
            self._sess = None

            self._loss = loss_fn
            self.batch_divisibility_req = get_batch_divisibility_req(self) if \
                callable(get_batch_divisibility_req) else \
                (get_batch_divisibility_req or 1)
            self._max_seq_len = config["model"]["max_seq_len"]

            if get_default_config:
                config = dict(get_default_config(), **config)

            if validate_spaces:
                validate_spaces(self, observation_space, action_space, config)

            if before_init:
                before_init(self, observation_space, action_space, config)

            self.config = config
            self.dist_class = None
            if action_sampler_fn or action_distribution_fn:
                if not make_model:
                    raise ValueError(
                        "`make_model` is required if `action_sampler_fn` OR "
                        "`action_distribution_fn` is given")
            else:
                self.dist_class, logit_dim = ModelCatalog.get_action_dist(
                    action_space, self.config["model"])

            if make_model:
                self.model = make_model(self, observation_space, action_space,
                                        config)
            else:
                self.model = ModelCatalog.get_model_v2(
                    observation_space,
                    action_space,
                    logit_dim,
                    config["model"],
                    framework=self.framework,
                )
            # Lock used for locking some methods on the object-level.
            # This prevents possible race conditions when calling the model
            # first, then its value function (e.g. in a loss function), in
            # between of which another model call is made (e.g. to compute an
            # action).
            self._lock = threading.RLock()

            # Auto-update model's inference view requirements, if recurrent.
            self._update_model_view_requirements_from_init_state()

            self.exploration = self._create_exploration()
            self._state_inputs = self.model.get_initial_state()
            self._is_recurrent = len(self._state_inputs) > 0

            # Combine view_requirements for Model and Policy.
            self.view_requirements.update(self.model.view_requirements)

            if before_loss_init:
                before_loss_init(self, observation_space, action_space, config)

            if optimizer_fn:
                optimizers = optimizer_fn(self, config)
            else:
                optimizers = tf.keras.optimizers.Adam(config["lr"])
            optimizers = force_list(optimizers)
            if getattr(self, "exploration", None):
                optimizers = self.exploration.get_exploration_optimizer(
                    optimizers)
            # TODO: (sven) Allow tf policy to have more than 1 optimizer.
            #  Just like torch Policy does.
            self._optimizer = optimizers[0] if optimizers else None

            self._initialize_loss_from_dummy_batch(
                auto_remove_unneeded_view_reqs=True,
                stats_fn=stats_fn,
            )
            self._loss_initialized = True

            if after_init:
                after_init(self, observation_space, action_space, config)

            # Got to reset global_timestep again after fake run-throughs.
            self.global_timestep = 0
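
As in Example #1, every undefined name here (`loss_fn`, `stats_fn`, `make_model`, `before_init`, ...) is a keyword argument of the surrounding builder, RLlib's `build_eager_tf_policy` (Ray ~1.x; the import path and keyword set change between releases). A heavily simplified sketch of how such a template is normally consumed, with a toy loss whose only purpose is to show the hook's signature:

    import tensorflow as tf
    from ray.rllib.policy.eager_tf_policy import build_eager_tf_policy

    def toy_loss_fn(policy, model, dist_class, train_batch):
        # Not a real RL loss -- it just exercises the
        # (policy, model, dist_class, train_batch) hook signature.
        logits, _ = model(train_batch)
        return tf.reduce_mean(tf.square(logits))

    MyEagerPolicy = build_eager_tf_policy(
        name="MyEagerPolicy",
        loss_fn=toy_loss_fn,
    )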
Example #7
    def __init__(
        self,
        observation_space: gym.spaces.Space,
        action_space: gym.spaces.Space,
        config: TrainerConfigDict,
        **kwargs,
    ):
        self.framework = config.get("framework", "tf2")

        # Log device.
        logger.info("Creating TF-eager policy running on {}.".format(
            "GPU" if get_gpu_devices() else "CPU"))

        Policy.__init__(self, observation_space, action_space, config)

        config = dict(self.get_default_config(), **config)
        self.config = config

        self._is_training = False
        # Global timestep should be a tensor.
        self.global_timestep = tf.Variable(0, trainable=False, dtype=tf.int64)
        self.explore = tf.Variable(self.config["explore"],
                                   trainable=False,
                                   dtype=tf.bool)

        self._loss_initialized = False
        # Backward compatibility workaround so Policy will call self.loss() directly.
        # TODO(jungong): clean up after all policies are migrated to new sub-class
        # implementation.
        self._loss = None

        self.batch_divisibility_req = self.get_batch_divisibility_req()
        self._max_seq_len = config["model"]["max_seq_len"]

        self.validate_spaces(observation_space, action_space, config)

        # If using default make_model(), dist_class will get updated when
        # the model is created next.
        self.dist_class = self._init_dist_class()
        self.model = self.make_model()

        self._init_view_requirements()

        self.exploration = self._create_exploration()
        self._state_inputs = self.model.get_initial_state()
        self._is_recurrent = len(self._state_inputs) > 0

        # Got to reset global_timestep again after fake run-throughs.
        self.global_timestep.assign(0)

        # Lock used for locking some methods on the object-level.
        # This prevents possible race conditions when calling the model
        # first, then its value function (e.g. in a loss function), in
        # between of which another model call is made (e.g. to compute an
        # action).
        self._lock = threading.RLock()

        # Only for `config.eager_tracing=True`: A counter to keep track of
        # how many times an eager-traced method (e.g.
        # `self._compute_actions_helper`) has been re-traced by tensorflow.
        # We will raise an error if more than n re-tracings have been
        # detected, since this would considerably slow down execution.
        # The variable below should only get incremented during the
        # tf.function trace operations, never when calling the already
        # traced function after that.
        self._re_trace_counter = 0
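
A small sketch of why `global_timestep` and `explore` are kept as `tf.Variable`s in this newer variant: a variable can be mutated inside a traced `tf.function` without forcing a re-trace, whereas a plain Python value would be frozen into the trace (which is exactly the kind of slowdown `_re_trace_counter` is there to detect):

    import tensorflow as tf

    global_timestep = tf.Variable(0, trainable=False, dtype=tf.int64)

    @tf.function
    def step():
        # Mutating the variable does not invalidate the traced graph.
        global_timestep.assign_add(1)
        return global_timestep

    for _ in range(3):
        step()
    print(global_timestep.numpy())  # 3 -- one trace, three updates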
Example #8
    def __init__(self, observation_space, action_space, config):
        Policy.__init__(self, observation_space, action_space, config)
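
This constructor adds nothing beyond the base initializer. A policy built this way usually also stubs out the remaining abstract `Policy` methods; a hypothetical completion (RLlib ~1.x Policy API, class name and bodies are illustrative, not from the source):

    from ray.rllib.policy.policy import Policy

    class RandomHeuristicPolicy(Policy):
        def __init__(self, observation_space, action_space, config):
            Policy.__init__(self, observation_space, action_space, config)

        def compute_actions(self, obs_batch, state_batches=None, **kwargs):
            # One random action per observation in the batch.
            return [self.action_space.sample() for _ in obs_batch], [], {}

        def learn_on_batch(self, samples):
            return {}  # heuristic policy: nothing to learn

        def get_weights(self):
            return {}

        def set_weights(self, weights):
            pass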
Example #9
    def __init__(self, observation_space, action_space, config):
        Policy.__init__(self, observation_space, action_space, config)
        self.blocks = config['blocks']
        self.fiftyone = config['fiftyone']
        self.extended = config['extended']
Example #10
    def __init__(self, observation_space, action_space, config):
        Policy.__init__(self, observation_space, action_space, config)
        x, y, r1, r2 = get_Nash_equilibrium(config['alphas'])
        self.infiltrating = y / config['alphas'][1]
Example #11
    def __init__(self, observation_space, action_space, config):
        Policy.__init__(self, observation_space, action_space, config)
        self.infiltrating = config['infiltrating']
Example #12
    def __init__(self, observation_space, action_space, config):
        Policy.__init__(self, observation_space, action_space, config)
        self.action_space_shape = action_space.shape
        self.n_products = config['number_of_products']
        self.n_sources = config['number_of_sources']
Example #13
    def __init__(self, observation_space, action_space, config):
        Policy.__init__(self, observation_space, action_space, config)
        # example parameter
        self.w = 1.0
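
The lone scalar `w` is typically round-tripped through the weight API so the trainer can sync it across rollout workers; a sketch consistent with the RLlib custom-policy example this snippet appears to follow:

    def get_weights(self):
        return {"w": self.w}

    def set_weights(self, weights):
        self.w = weights["w"]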
Example #14
        def __init__(self, observation_space, action_space, config):
            # If this class runs as a @ray.remote actor, eager mode may not
            # have been activated yet.
            if not tf1.executing_eagerly():
                tf1.enable_eager_execution()
            self.framework = config.get("framework", "tfe")
            Policy.__init__(self, observation_space, action_space, config)

            # Log device and worker index.
            from ray.rllib.evaluation.rollout_worker import get_global_worker

            worker = get_global_worker()
            worker_idx = worker.worker_index if worker else 0
            if get_gpu_devices():
                logger.info(
                    "TF-eager Policy (worker={}) running on GPU.".format(
                        worker_idx if worker_idx > 0 else "local"))
            else:
                logger.info(
                    "TF-eager Policy (worker={}) running on CPU.".format(
                        worker_idx if worker_idx > 0 else "local"))

            self._is_training = False

            # Only for `config.eager_tracing=True`: A counter to keep track of
            # how many times an eager-traced method (e.g.
            # `self._compute_actions_helper`) has been re-traced by tensorflow.
            # We will raise an error if more than n re-tracings have been
            # detected, since this would considerably slow down execution.
            # The variable below should only get incremented during the
            # tf.function trace operations, never when calling the already
            # traced function after that.
            self._re_trace_counter = 0

            self._loss_initialized = False
            # To ensure backward compatibility:
            # Old way: If `loss` provided here, use as-is (as a function).
            if loss_fn is not None:
                self._loss = loss_fn
            # New way: Convert the overridden `self.loss` into a plain
            # function, so it can be called the same way as `loss` would
            # be, ensuring backward compatibility.
            elif self.loss.__func__.__qualname__ != "Policy.loss":
                self._loss = self.loss.__func__
            # `loss` not provided nor overridden from Policy -> Set to None.
            else:
                self._loss = None

            self.batch_divisibility_req = (get_batch_divisibility_req(self) if
                                           callable(get_batch_divisibility_req)
                                           else
                                           (get_batch_divisibility_req or 1))
            self._max_seq_len = config["model"]["max_seq_len"]

            if get_default_config:
                config = dict(get_default_config(), **config)

            if validate_spaces:
                validate_spaces(self, observation_space, action_space, config)

            if before_init:
                before_init(self, observation_space, action_space, config)

            self.config = config
            self.dist_class = None
            if action_sampler_fn or action_distribution_fn:
                if not make_model:
                    raise ValueError(
                        "`make_model` is required if `action_sampler_fn` OR "
                        "`action_distribution_fn` is given")
            else:
                self.dist_class, logit_dim = ModelCatalog.get_action_dist(
                    action_space, self.config["model"])

            if make_model:
                self.model = make_model(self, observation_space, action_space,
                                        config)
            else:
                self.model = ModelCatalog.get_model_v2(
                    observation_space,
                    action_space,
                    logit_dim,
                    config["model"],
                    framework=self.framework,
                )
            # Lock used for locking some methods on the object-level.
            # This prevents possible race conditions when calling the model
            # first, then its value function (e.g. in a loss function), in
            # between of which another model call is made (e.g. to compute an
            # action).
            self._lock = threading.RLock()

            # Auto-update model's inference view requirements, if recurrent.
            self._update_model_view_requirements_from_init_state()

            self.exploration = self._create_exploration()
            self._state_inputs = self.model.get_initial_state()
            self._is_recurrent = len(self._state_inputs) > 0

            # Combine view_requirements for Model and Policy.
            self.view_requirements.update(self.model.view_requirements)

            if before_loss_init:
                before_loss_init(self, observation_space, action_space, config)

            if optimizer_fn:
                optimizers = optimizer_fn(self, config)
            else:
                optimizers = tf.keras.optimizers.Adam(config["lr"])
            optimizers = force_list(optimizers)
            if getattr(self, "exploration", None):
                optimizers = self.exploration.get_exploration_optimizer(
                    optimizers)

            # The list of local (tf) optimizers (one per loss term).
            self._optimizers: List[LocalOptimizer] = optimizers
            # Backward compatibility: A user's policy may only support a single
            # loss term and optimizer (no lists).
            self._optimizer: LocalOptimizer = optimizers[
                0] if optimizers else None

            self._initialize_loss_from_dummy_batch(
                auto_remove_unneeded_view_reqs=True,
                stats_fn=stats_fn,
            )
            self._loss_initialized = True

            if after_init:
                after_init(self, observation_space, action_space, config)

            # Got to reset global_timestep again after fake run-throughs.
            self.global_timestep = 0
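
The backward-compatibility branch above (`self.loss.__func__.__qualname__ != "Policy.loss"`) is a general Python idiom for asking whether a subclass overrode a base-class method. A standalone sketch of the same check with illustrative class names:

    class Base:
        def loss(self):
            raise NotImplementedError

    class Child(Base):
        def loss(self):
            return 42

    def overrides_loss(obj):
        # Bound method -> underlying function -> qualified name ("Class.method").
        return obj.loss.__func__.__qualname__ != "Base.loss"

    print(overrides_loss(Base()))   # False: still the base implementation
    print(overrides_loss(Child()))  # True: Child overrides loss()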
Example #15
        def __init__(self, observation_space, action_space, config):
            assert tf.executing_eagerly()
            self.framework = "tfe"
            Policy.__init__(self, observation_space, action_space, config)
            self._is_training = False
            self._loss_initialized = False
            self._sess = None

            if get_default_config:
                config = dict(get_default_config(), **config)

            if validate_spaces:
                validate_spaces(self, observation_space, action_space, config)

            if before_init:
                before_init(self, observation_space, action_space, config)

            self.config = config
            self.dist_class = None
            if action_sampler_fn or action_distribution_fn:
                if not make_model:
                    raise ValueError(
                        "`make_model` is required if `action_sampler_fn` OR "
                        "`action_distribution_fn` is given")
            else:
                self.dist_class, logit_dim = ModelCatalog.get_action_dist(
                    action_space, self.config["model"])

            if make_model:
                self.model = make_model(self, observation_space, action_space,
                                        config)
            else:
                self.model = ModelCatalog.get_model_v2(
                    observation_space,
                    action_space,
                    logit_dim,
                    config["model"],
                    framework=self.framework,
                )
            self.exploration = self._create_exploration()
            self._state_in = [
                tf.convert_to_tensor([s])
                for s in self.model.get_initial_state()
            ]
            input_dict = {
                SampleBatch.CUR_OBS: tf.convert_to_tensor(
                    np.array([observation_space.sample()])),
                SampleBatch.PREV_ACTIONS: tf.convert_to_tensor(
                    [flatten_to_single_ndarray(action_space.sample())]),
                SampleBatch.PREV_REWARDS: tf.convert_to_tensor([0.]),
            }

            if action_distribution_fn:
                dist_inputs, self.dist_class, _ = action_distribution_fn(
                    self, self.model, input_dict[SampleBatch.CUR_OBS])
            else:
                self.model(input_dict, self._state_in,
                           tf.convert_to_tensor([1]))

            if before_loss_init:
                before_loss_init(self, observation_space, action_space, config)

            self._initialize_loss_with_dummy_batch()
            self._loss_initialized = True

            if optimizer_fn:
                self._optimizer = optimizer_fn(self, config)
            else:
                self._optimizer = tf.keras.optimizers.Adam(config["lr"])

            if after_init:
                after_init(self, observation_space, action_space, config)
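
Note the shape of the `action_distribution_fn` hook in this variant: it is called as `action_distribution_fn(policy, model, obs_batch)` and must return the distribution inputs, the distribution class to wrap them in, and state-outs. A hedged sketch of such a hook (the `Categorical` import path matches RLlib ~1.x; everything else is illustrative):

    from ray.rllib.models.tf.tf_action_dist import Categorical

    def my_action_distribution_fn(policy, model, obs_batch, **kwargs):
        # Run the model ourselves and tell the template which action
        # distribution class should wrap its outputs.
        dist_inputs, state_out = model({"obs": obs_batch}, [], None)
        return dist_inputs, Categorical, state_out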
Example #16
        def __init__(self, observation_space, action_space, config):
            assert tf.executing_eagerly()
            self.framework = config.get("framework", "tfe")
            Policy.__init__(self, observation_space, action_space, config)
            self._is_training = False
            self._loss_initialized = False
            self._sess = None

            self._loss = loss_fn
            self.batch_divisibility_req = get_batch_divisibility_req(self) if \
                callable(get_batch_divisibility_req) else \
                (get_batch_divisibility_req or 1)
            self._max_seq_len = config["model"]["max_seq_len"]

            if get_default_config:
                config = dict(get_default_config(), **config)

            if validate_spaces:
                validate_spaces(self, observation_space, action_space, config)

            if before_init:
                before_init(self, observation_space, action_space, config)

            self.config = config
            self.dist_class = None
            if action_sampler_fn or action_distribution_fn:
                if not make_model:
                    raise ValueError(
                        "`make_model` is required if `action_sampler_fn` OR "
                        "`action_distribution_fn` is given")
            else:
                self.dist_class, logit_dim = ModelCatalog.get_action_dist(
                    action_space, self.config["model"])

            if make_model:
                self.model = make_model(self, observation_space, action_space,
                                        config)
            else:
                self.model = ModelCatalog.get_model_v2(
                    observation_space,
                    action_space,
                    logit_dim,
                    config["model"],
                    framework=self.framework,
                )
            # Auto-update model's inference view requirements, if recurrent.
            self._update_model_inference_view_requirements_from_init_state()

            self.exploration = self._create_exploration()
            self._state_in = [
                tf.convert_to_tensor([s])
                for s in self.model.get_initial_state()
            ]

            # Update this Policy's ViewRequirements (if function given).
            if callable(view_requirements_fn):
                self.view_requirements.update(view_requirements_fn(self))
            # Combine view_requirements for Model and Policy.
            self.view_requirements.update(
                self.model.inference_view_requirements)

            if before_loss_init:
                before_loss_init(self, observation_space, action_space, config)

            self._initialize_loss_from_dummy_batch(
                auto_remove_unneeded_view_reqs=True,
                stats_fn=stats_fn,
            )
            self._loss_initialized = True

            if optimizer_fn:
                optimizers = optimizer_fn(self, config)
            else:
                optimizers = tf.keras.optimizers.Adam(config["lr"])
            optimizers = force_list(optimizers)
            if getattr(self, "exploration", None):
                optimizers = self.exploration.get_exploration_optimizer(
                    optimizers)
            # TODO: (sven) Allow tf policy to have more than 1 optimizer.
            #  Just like torch Policy does.
            self._optimizer = optimizers[0] if optimizers else None

            if after_init:
                after_init(self, observation_space, action_space, config)

            # Got to reset global_timestep again after this fake run-through.
            self.global_timestep = 0
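
`force_list` (from `ray.rllib.utils`) normalizes whatever `optimizer_fn` returned: a single optimizer is wrapped into a one-element list and an existing list passes through, so the template can always take `optimizers[0]`. A quick sketch of that normalization:

    import tensorflow as tf
    from ray.rllib.utils import force_list

    opt = tf.keras.optimizers.Adam(1e-3)
    print(force_list(opt))         # [<Adam ...>]  -- single object wrapped
    print(force_list([opt, opt]))  # a list passes through unchanged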
Example #17
    def __init__(self, obs_space, action_space, config):
        Policy.__init__(self, obs_space, action_space, config)
        self.distance = 0.25