示例#1
0
class RLlibTFA2FilterPolicy(AgentPolicy):
    """Agent policy restored from a pickled RLlib worker checkpoint.

    Besides the policy weights, the checkpoint's per-policy observation
    filter is restored and applied to every observation before inference.
    """

    def __init__(self, load_path, algorithm, policy_name, observation_space,
                 action_space):
        """Build the TF policy and load its weights and filter.

        Args:
            load_path: Path of the pickled checkpoint file.
            algorithm: One of "PPO", "A2C", "A3C", "PG" or "DQN".
            policy_name: Key of the policy inside the checkpoint's "state"
                and "filters" mappings; also used as the TF name scope.
            observation_space: Raw observation space; it is passed through
                the ModelCatalog preprocessor before reaching the policy.
            action_space: A gym Box or Discrete space.

        Raises:
            TypeError: If the action space or the algorithm is unsupported.
        """
        # Local import so the block does not depend on a module-level
        # `import copy`.
        import copy

        self._checkpoint_path = load_path
        self._algorithm = algorithm
        self._policy_name = policy_name
        self._observation_space = observation_space
        self._action_space = action_space
        self._sess = None

        if isinstance(action_space, gym.spaces.Box):
            self.is_continuous = True
        elif isinstance(action_space, gym.spaces.Discrete):
            self.is_continuous = False
        else:
            raise TypeError("Unsupport action space")

        if self._algorithm == "PPO":
            from ray.rllib.agents.ppo.ppo_tf_policy import PPOTFPolicy as LoadPolicy
        elif self._algorithm in ["A2C", "A3C"]:
            from ray.rllib.agents.a3c.a3c_tf_policy import A3CTFPolicy as LoadPolicy
        elif self._algorithm == "PG":
            from ray.rllib.agents.pg.pg_tf_policy import PGTFPolicy as LoadPolicy
        elif self._algorithm == "DQN":
            from ray.rllib.agents.dqn.dqn_tf_policy import DQNTFPolicy as LoadPolicy
        else:
            raise TypeError("Unsupport algorithm")

        self._prep = ModelCatalog.get_preprocessor_for_space(
            self._observation_space)
        # The session is entered and intentionally left open for the
        # lifetime of this object.
        self._sess = tf.Session(graph=tf.Graph())
        self._sess.__enter__()

        with tf.name_scope(self._policy_name):
            # obs_space need to be flattened before passed to PPOTFPolicy
            flat_obs_space = self._prep.observation_space
            # Deep copy: a shallow .copy() would share the nested "model"
            # dict with RLlib's module-level DEFAULT_CONFIG, so the
            # free_log_std assignment below would leak into every other
            # user of that default.
            # NOTE(review): the PPO default config is used for every
            # algorithm -- confirm this is intended.
            config = copy.deepcopy(ray.rllib.agents.ppo.ppo.DEFAULT_CONFIG)
            config['num_workers'] = 0
            config['model']['free_log_std'] = True
            self.policy = LoadPolicy(flat_obs_space, self._action_space,
                                     config)
            # SECURITY: pickle executes arbitrary code while loading; only
            # point this at trusted checkpoints.
            with open(self._checkpoint_path, "rb") as checkpoint_file:
                objs = pickle.load(checkpoint_file)
            objs = pickle.loads(objs["worker"])
            state = objs["state"]
            filters = objs["filters"]
            self.filters = filters[self._policy_name]
            weights = state[self._policy_name]
            self.policy.set_weights(weights)

    def act(self, obs):
        """Return the policy's non-exploring action for a single observation."""
        # single infer
        obs = self._prep.transform(obs)
        # update=False: apply the restored filter without updating it.
        obs = self.filters(obs, update=False)
        # compute_actions takes a batch; unwrap the one-element batch.
        action = self.policy.compute_actions([obs], explore=False)[0][0]

        return action
示例#2
0
    def __init__(
        self, load_path, algorithm, policy_name, observation_space, action_space
    ):
        """Rebuild a trained TF policy and look up its input/output tensors.

        Args:
            load_path: Path of the pickled checkpoint (converted with str()).
            algorithm: "PPO", "A2C", "A3C", "PG" or "DQN".
            policy_name: Key of the policy in the checkpoint's "state"
                mapping; also used as the TF name scope and tensor prefix.
            observation_space: Raw observation space; it is run through the
                ModelCatalog preprocessor before reaching the policy.
            action_space: Action space handed to the policy constructor.

        Raises:
            ValueError: If ``algorithm`` is not one of the supported names.
        """
        load_path = str(load_path)
        if algorithm == "PPO":
            from ray.rllib.agents.ppo.ppo_tf_policy import PPOTFPolicy as LoadPolicy
        elif algorithm in ["A2C", "A3C"]:
            from ray.rllib.agents.a3c.a3c_tf_policy import A3CTFPolicy as LoadPolicy
        elif algorithm == "PG":
            from ray.rllib.agents.pg.pg_tf_policy import PGTFPolicy as LoadPolicy
        elif algorithm == "DQN":
            from ray.rllib.agents.dqn.dqn_policy import DQNTFPolicy as LoadPolicy
        else:
            raise ValueError(f"Unsupported algorithm: {algorithm}")

        self._prep = ModelCatalog.get_preprocessor_for_space(observation_space)
        # NOTE(review): this session owns a fresh tf.Graph() but is never
        # entered / made default before the policy is built -- confirm the
        # policy's tensors really land in self._sess.graph.
        self._sess = tf.compat.v1.Session(graph=tf.Graph())

        with tf.compat.v1.name_scope(policy_name):
            # obs_space need to be flattened before passed to PPOTFPolicy
            flat_obs_space = self._prep.observation_space
            # Use the constructor argument: this method never assigns
            # self._action_space, so reading it raised AttributeError.
            policy = LoadPolicy(flat_obs_space, action_space, {})
            # SECURITY: pickle executes arbitrary code while loading; only
            # point this at trusted checkpoints.
            with open(load_path, "rb") as checkpoint_file:
                objs = pickle.load(checkpoint_file)
            objs = pickle.loads(objs["worker"])
            state = objs["state"]
            weights = state[policy_name]
            policy.set_weights(weights)

        graph = self._sess.graph

        # These tensor names were found by inspecting the trained model
        if algorithm == "PPO":
            # CRUCIAL FOR SAFETY:
            #   We use Tensor("split") instead of Tensor("add") to force
            #   PPO to be deterministic.
            self._input_node = graph.get_tensor_by_name(
                f"{policy_name}/observation:0"
            )
            self._output_node = graph.get_tensor_by_name(
                f"{policy_name}/split:0"
            )
        elif algorithm == "DQN":
            # Compare the local `algorithm`: self._algorithm is never
            # assigned in this constructor.
            self._input_node = graph.get_tensor_by_name(
                f"{policy_name}/observations:0"
            )
            self._output_node = tf.argmax(
                input=graph.get_tensor_by_name(
                    f"{policy_name}/value_out/BiasAdd:0"
                ),
                axis=1,
            )
        else:
            # A2C / A3C / PG: take the argmax over the fc_out tensor.
            self._input_node = graph.get_tensor_by_name(
                f"{policy_name}/observations:0"
            )
            self._output_node = tf.argmax(
                input=graph.get_tensor_by_name(
                    f"{policy_name}/fc_out/BiasAdd:0"
                ),
                axis=1,
            )
class RLlibTFCheckpointPolicy(AgentPolicy):
    """Agent policy that lazily restores an RLlib TF policy from a checkpoint.

    The constructor only records its arguments; the TF session, the policy
    and its weights are created by :meth:`setup`.
    """

    def __init__(self, load_path, algorithm, policy_name, observation_space,
                 action_space):
        """Record the configuration and classify the action space.

        Args:
            load_path: Path of the pickled checkpoint file.
            algorithm: One of "PPO", "A2C", "A3C", "PG" or "DQN"
                (validated in :meth:`setup`).
            policy_name: Key of the policy in the checkpoint's "state"
                mapping; also used as the TF name scope.
            observation_space: Raw observation space.
            action_space: A gym Box or Discrete space.

        Raises:
            TypeError: If ``action_space`` is neither Box nor Discrete.
        """
        self._load_path = load_path
        self._algorithm = algorithm
        self._policy_name = policy_name
        self._observation_space = observation_space
        self._action_space = action_space
        self._sess = None

        if isinstance(action_space, gym.spaces.Box):
            self.is_continuous = True
        elif isinstance(action_space, gym.spaces.Discrete):
            self.is_continuous = False
        else:
            raise TypeError("Unsupport action space")

    def setup(self):
        """Create the TF session and policy and load the checkpoint weights.

        Does nothing if a session already exists.

        Raises:
            TypeError: If the configured algorithm is unsupported.
        """
        if self._sess:
            return

        if self._algorithm == "PPO":
            from ray.rllib.agents.ppo.ppo_tf_policy import PPOTFPolicy as LoadPolicy
        elif self._algorithm in ["A2C", "A3C"]:
            from ray.rllib.agents.a3c.a3c_tf_policy import A3CTFPolicy as LoadPolicy
        elif self._algorithm == "PG":
            from ray.rllib.agents.pg.pg_tf_policy import PGTFPolicy as LoadPolicy
        elif self._algorithm == "DQN":
            from ray.rllib.agents.dqn.dqn_policy import DQNTFPolicy as LoadPolicy
        else:
            raise TypeError("Unsupport algorithm")

        self._prep = ModelCatalog.get_preprocessor_for_space(
            self._observation_space)
        # The session is entered here and stays entered; teardown() does
        # not release it yet.
        self._sess = tf.Session(graph=tf.Graph())
        self._sess.__enter__()

        with tf.name_scope(self._policy_name):
            # obs_space need to be flattened before passed to PPOTFPolicy
            flat_obs_space = self._prep.observation_space
            self.policy = LoadPolicy(flat_obs_space, self._action_space, {})
            # Close the checkpoint file deterministically instead of
            # leaving the handle to the garbage collector.
            # SECURITY: pickle executes arbitrary code while loading; only
            # point this at trusted checkpoints.
            with open(self._load_path, "rb") as checkpoint_file:
                objs = pickle.load(checkpoint_file)
            objs = pickle.loads(objs["worker"])
            state = objs["state"]
            weights = state[self._policy_name]
            self.policy.set_weights(weights)

    def teardown(self):
        """Currently a no-op; the session entered in setup() is not released."""
        # TODO: actually teardown the TF session
        pass

    def act(self, obs):
        """Return the non-exploring action for one observation.

        ``setup()`` must have been called first: it is what creates the
        preprocessor and the policy used here.
        """
        obs = self._prep.transform(obs)
        # compute_actions takes a batch; unwrap the one-element batch.
        action = self.policy.compute_actions([obs], explore=False)[0][0]

        return action
示例#4
0
class RLAgent(Agent):
    """Agent backed by an RLlib TF policy restored from a pickled checkpoint.

    The policy class is taken from the name ``LoadPolicy``, which must be
    available in the enclosing module's scope.
    """

    def __init__(self, load_path, policy_name, observation_space,
                 action_space):
        """Build the policy inside its own TF session and load its weights.

        Args:
            load_path: Path of the pickled checkpoint file.
            policy_name: Key of the policy in the checkpoint's "state"
                mapping; also used as the TF name scope.
            observation_space: Raw observation space; it is run through the
                ModelCatalog preprocessor before reaching the policy.
            action_space: A gym Box or Discrete space.

        Raises:
            TypeError: If ``action_space`` is neither Box nor Discrete.
        """
        self._checkpoint_path = load_path
        self._policy_name = policy_name
        self._observation_space = observation_space
        self._action_space = action_space
        self._sess = None

        if isinstance(action_space, gym.spaces.Box):
            self.is_continuous = True
        elif isinstance(action_space, gym.spaces.Discrete):
            self.is_continuous = False
        else:
            raise TypeError("Unsupport action space")

        self._prep = ModelCatalog.get_preprocessor_for_space(
            self._observation_space)
        # The session is entered and intentionally left open for the
        # lifetime of this object.
        self._sess = tf.compat.v1.Session(graph=tf.Graph())
        self._sess.__enter__()

        with tf.name_scope(self._policy_name):
            # obs_space need to be flattened before passed to PPOTFPolicy
            flat_obs_space = self._prep.observation_space
            self.policy = LoadPolicy(flat_obs_space, self._action_space, {})
            # Close the checkpoint file deterministically instead of
            # leaving the handle to the garbage collector.
            # SECURITY: pickle executes arbitrary code while loading; only
            # point this at trusted checkpoints.
            with open(self._checkpoint_path, "rb") as checkpoint_file:
                objs = pickle.load(checkpoint_file)
            objs = pickle.loads(objs["worker"])
            state = objs["state"]
            weights = state[self._policy_name]
            self.policy.set_weights(weights)

    def act(self, obs):
        """Return non-exploring action(s) for one observation or a list of them.

        A ``list`` is treated as a batch and yields the policy's batched
        result; any other value is treated as a single observation and
        yields a single action.
        """
        if isinstance(obs, list):
            # batch infer
            obs = [self._prep.transform(o) for o in obs]
            action = self.policy.compute_actions(obs, explore=False)[0]
        else:
            # single infer
            obs = self._prep.transform(obs)
            action = self.policy.compute_actions([obs], explore=False)[0][0]

        return action
示例#5
0
    def __init__(self, load_path, algorithm, policy_name, yaml_path):
        """Rebuild a trained TF policy and look up its input/output tensors.

        Args:
            load_path: Checkpoint location relative to
                ``BASE_DIR / "log/results/run"``.
            algorithm: Lower-case algorithm key: "ppo", "a2c", "a3c", "pg",
                "dqn", "maac", "maddpg", "mfac" or "networked_pg".
            policy_name: Key of the policy in the checkpoint's "state"
                mapping; also used as the TF name scope, as the tensor-name
                prefix and as the config's "agent_id".
            yaml_path: Path, relative to ``BASE_DIR``, of the YAML file
                handed to ``load_config``.

        Raises:
            ValueError: If ``algorithm`` is not one of the supported keys.
        """
        # Local import so the block does not depend on a module-level
        # `import copy`.
        import copy

        load_path = str(load_path)
        # Every branch binds both LoadPolicy and DEFAULT_CONFIG; previously
        # "ppo", "pg" and "dqn" left DEFAULT_CONFIG unbound and crashed with
        # UnboundLocalError further down.
        if algorithm == "ppo":
            from ray.rllib.agents.ppo.ppo_tf_policy import PPOTFPolicy as LoadPolicy
            from ray.rllib.agents.ppo import DEFAULT_CONFIG
        elif algorithm in ("a2c", "a3c"):
            # Tuple membership, not `in "a2c"`: the latter is a substring
            # test that also accepted "", "a", "2c", ...
            from ray.rllib.agents.a3c.a3c_tf_policy import A3CTFPolicy as LoadPolicy
            from ray.rllib.agents.a3c import DEFAULT_CONFIG
        elif algorithm == "pg":
            from ray.rllib.agents.pg.pg_tf_policy import PGTFPolicy as LoadPolicy
            from ray.rllib.agents.pg import DEFAULT_CONFIG
        elif algorithm == "dqn":
            from ray.rllib.agents.dqn import DQNTFPolicy as LoadPolicy
            from ray.rllib.agents.dqn import DEFAULT_CONFIG
        elif algorithm == "maac":
            from benchmark.agents.maac.tf_policy import CA2CTFPolicy as LoadPolicy
            from benchmark.agents.maac.tf_policy import DEFAULT_CONFIG
        elif algorithm == "maddpg":
            from benchmark.agents.maddpg.tf_policy import MADDPG2TFPolicy as LoadPolicy
            from benchmark.agents.maddpg.tf_policy import DEFAULT_CONFIG
        elif algorithm == "mfac":
            from benchmark.agents.mfac.tf_policy import MFACTFPolicy as LoadPolicy
            from benchmark.agents.mfac.tf_policy import DEFAULT_CONFIG
        elif algorithm == "networked_pg":
            from benchmark.agents.networked_pg.tf_policy import (
                NetworkedPG as LoadPolicy,
            )
            from benchmark.agents.networked_pg.tf_policy import (
                PG_DEFAULT_CONFIG as DEFAULT_CONFIG,
            )
        else:
            raise ValueError(f"Unsupported algorithm: {algorithm}")

        yaml_path = BASE_DIR / yaml_path
        load_path = BASE_DIR / f"log/results/run/{load_path}"

        config = load_config(yaml_path)
        # config["policy"] is indexed positionally: [1] is used as the
        # observation space, [2] as the action space and [-1] as a dict of
        # per-policy overrides.
        observation_space = config["policy"][1]
        action_space = config["policy"][2]
        # Work on a private copy: the imported DEFAULT_CONFIG is shared, and
        # mutating it would leak this policy's model settings and agent_id
        # into every later load.
        pconfig = copy.deepcopy(DEFAULT_CONFIG)

        pconfig["model"].update(config["policy"][-1].get("model", {}))
        pconfig["agent_id"] = policy_name

        self._prep = ModelCatalog.get_preprocessor_for_space(observation_space)
        self._sess = tf.Session(graph=tf.get_default_graph())

        with tf.name_scope(policy_name):
            # Observation space needs to be flattened before passed to the policy
            flat_obs_space = self._prep.observation_space
            policy = LoadPolicy(flat_obs_space, action_space, pconfig)
            # Initialize first; the checkpoint weights are applied afterwards.
            self._sess.run(tf.global_variables_initializer())
            # SECURITY: pickle executes arbitrary code while loading; only
            # point this at trusted checkpoints.
            with open(load_path, "rb") as checkpoint_file:
                objs = pickle.load(checkpoint_file)
            objs = pickle.loads(objs["worker"])
            state = objs["state"]
            weights = state[policy_name]
            policy.set_weights(weights)

        graph = self._sess.graph

        # These tensor names were found by inspecting the trained model
        # (iterate tf.get_default_graph().get_operations() to list them).
        if algorithm == "ppo":
            # CRUCIAL FOR SAFETY:
            #   We use Tensor("split") instead of Tensor("add") to force
            #   PPO to be deterministic.
            self._input_node = graph.get_tensor_by_name(
                f"{policy_name}/observation:0"
            )
            self._output_node = graph.get_tensor_by_name(
                f"{policy_name}/split:0"
            )
        elif algorithm == "dqn":
            self._input_node = graph.get_tensor_by_name(
                f"{policy_name}/observations:0"
            )
            self._output_node = tf.argmax(
                graph.get_tensor_by_name(
                    f"{policy_name}/value_out/BiasAdd:0"
                ),
                axis=1,
            )
        elif algorithm == "maac":
            self._input_node = graph.get_tensor_by_name(
                f"{policy_name}/policy-inputs:0"
            )
            self._output_node = tf.argmax(
                graph.get_tensor_by_name(
                    f"{policy_name}/logits_out/BiasAdd:0"
                ),
                axis=1,
            )
        elif algorithm == "maddpg":
            # NOTE(review): the tensor names hard-code agent index 2, and
            # argmax uses its default axis here while every other branch
            # passes axis=1 -- confirm both are intended.
            self._input_node = graph.get_tensor_by_name(
                f"{policy_name}/obs_2:0"
            )
            self._output_node = tf.argmax(
                graph.get_tensor_by_name(
                    f"{policy_name}/actor/AGENT_2_actor_RelaxedOneHotCategorical_1/sample/AGENT_2_actor_exp/forward/Exp:0"
                )
            )
        else:
            # a2c / a3c / pg / mfac / networked_pg: argmax over fc_out.
            self._input_node = graph.get_tensor_by_name(
                f"{policy_name}/observations:0"
            )
            self._output_node = tf.argmax(
                graph.get_tensor_by_name(f"{policy_name}/fc_out/BiasAdd:0"),
                axis=1,
            )