class RLlibTFA2FilterPolicy(AgentPolicy):
    def __init__(
        self, load_path, algorithm, policy_name, observation_space, action_space
    ):
        self._checkpoint_path = load_path
        self._algorithm = algorithm
        self._policy_name = policy_name
        self._observation_space = observation_space
        self._action_space = action_space
        self._sess = None

        if isinstance(action_space, gym.spaces.Box):
            self.is_continuous = True
        elif isinstance(action_space, gym.spaces.Discrete):
            self.is_continuous = False
        else:
            raise TypeError("Unsupported action space")

        if self._algorithm == "PPO":
            from ray.rllib.agents.ppo.ppo_tf_policy import PPOTFPolicy as LoadPolicy
        elif self._algorithm in ["A2C", "A3C"]:
            from ray.rllib.agents.a3c.a3c_tf_policy import A3CTFPolicy as LoadPolicy
        elif self._algorithm == "PG":
            from ray.rllib.agents.pg.pg_tf_policy import PGTFPolicy as LoadPolicy
        elif self._algorithm == "DQN":
            from ray.rllib.agents.dqn.dqn_tf_policy import DQNTFPolicy as LoadPolicy
        else:
            raise TypeError(f"Unsupported algorithm: {self._algorithm}")

        self._prep = ModelCatalog.get_preprocessor_for_space(self._observation_space)
        self._sess = tf.Session(graph=tf.Graph())
        self._sess.__enter__()

        with tf.name_scope(self._policy_name):
            # The observation space must be flattened before it is passed to the policy.
            flat_obs_space = self._prep.observation_space
            config = ray.rllib.agents.ppo.ppo.DEFAULT_CONFIG.copy()
            config["num_workers"] = 0
            config["model"]["free_log_std"] = True
            self.policy = LoadPolicy(flat_obs_space, self._action_space, config)
            objs = pickle.load(open(self._checkpoint_path, "rb"))
            objs = pickle.loads(objs["worker"])
            state = objs["state"]
            filters = objs["filters"]
            self.filters = filters[self._policy_name]
            weights = state[self._policy_name]
            self.policy.set_weights(weights)

    def act(self, obs):
        # Single inference: flatten, apply the observation filter, then query the policy.
        obs = self._prep.transform(obs)
        obs = self.filters(obs, update=False)
        action = self.policy.compute_actions([obs], explore=False)[0][0]
        return action
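# A minimal usage sketch (illustrative, not part of the original code): load a
# PPO checkpoint with its observation filter and query a single action. The
# checkpoint path, spaces, and "default_policy" name below are assumptions.
import gym

obs_space = gym.spaces.Box(low=-1e10, high=1e10, shape=(32,))
act_space = gym.spaces.Box(low=-1.0, high=1.0, shape=(3,))
policy = RLlibTFA2FilterPolicy(
    load_path="checkpoint_100/checkpoint-100",  # hypothetical RLlib checkpoint file
    algorithm="PPO",
    policy_name="default_policy",  # hypothetical policy id used during training
    observation_space=obs_space,
    action_space=act_space,
)
action = policy.act(obs_space.sample())  # one observation in, one action out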
def __init__(
    self, load_path, algorithm, policy_name, observation_space, action_space
):
    load_path = str(load_path)
    if algorithm == "PPO":
        from ray.rllib.agents.ppo.ppo_tf_policy import PPOTFPolicy as LoadPolicy
    elif algorithm in ["A2C", "A3C"]:
        from ray.rllib.agents.a3c.a3c_tf_policy import A3CTFPolicy as LoadPolicy
    elif algorithm == "PG":
        from ray.rllib.agents.pg.pg_tf_policy import PGTFPolicy as LoadPolicy
    elif algorithm == "DQN":
        from ray.rllib.agents.dqn.dqn_policy import DQNTFPolicy as LoadPolicy
    else:
        raise ValueError(f"Unsupported algorithm: {algorithm}")

    self._prep = ModelCatalog.get_preprocessor_for_space(observation_space)
    self._sess = tf.compat.v1.Session(graph=tf.Graph())
    # Enter the session so the policy graph is built into self._sess.graph,
    # matching the sibling checkpoint-policy classes.
    self._sess.__enter__()

    with tf.compat.v1.name_scope(policy_name):
        # The observation space must be flattened before it is passed to the policy.
        flat_obs_space = self._prep.observation_space
        policy = LoadPolicy(flat_obs_space, action_space, {})
        objs = pickle.load(open(load_path, "rb"))
        objs = pickle.loads(objs["worker"])
        state = objs["state"]
        weights = state[policy_name]
        policy.set_weights(weights)

    # These tensor names were found by inspecting the trained model.
    if algorithm == "PPO":
        # CRUCIAL FOR SAFETY:
        # We use Tensor("split") instead of Tensor("add") to force
        # PPO to be deterministic.
        self._input_node = self._sess.graph.get_tensor_by_name(
            f"{policy_name}/observation:0"
        )
        self._output_node = self._sess.graph.get_tensor_by_name(
            f"{policy_name}/split:0"
        )
    elif algorithm == "DQN":
        self._input_node = self._sess.graph.get_tensor_by_name(
            f"{policy_name}/observations:0"
        )
        self._output_node = tf.argmax(
            input=self._sess.graph.get_tensor_by_name(
                f"{policy_name}/value_out/BiasAdd:0"
            ),
            axis=1,
        )
    else:
        self._input_node = self._sess.graph.get_tensor_by_name(
            f"{policy_name}/observations:0"
        )
        self._output_node = tf.argmax(
            input=self._sess.graph.get_tensor_by_name(
                f"{policy_name}/fc_out/BiasAdd:0"
            ),
            axis=1,
        )
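# A minimal inference sketch (an assumption, not part of the original code): an
# `act` method that would sit alongside the `__init__` above. It flattens the
# observation with the preprocessor and runs the resolved input/output tensors
# directly, so the `policy` object is only needed to build the graph and load weights.
def act(self, obs):
    obs = self._prep.transform(obs)
    # Feed a single-element batch and return the first (and only) result.
    res = self._sess.run(self._output_node, feed_dict={self._input_node: [obs]})
    return res[0]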
class RLlibTFCheckpointPolicy(AgentPolicy):
    def __init__(
        self, load_path, algorithm, policy_name, observation_space, action_space
    ):
        self._load_path = load_path
        self._algorithm = algorithm
        self._policy_name = policy_name
        self._observation_space = observation_space
        self._action_space = action_space
        self._sess = None

        if isinstance(action_space, gym.spaces.Box):
            self.is_continuous = True
        elif isinstance(action_space, gym.spaces.Discrete):
            self.is_continuous = False
        else:
            raise TypeError("Unsupported action space")

    def setup(self):
        if self._sess:
            return

        if self._algorithm == "PPO":
            from ray.rllib.agents.ppo.ppo_tf_policy import PPOTFPolicy as LoadPolicy
        elif self._algorithm in ["A2C", "A3C"]:
            from ray.rllib.agents.a3c.a3c_tf_policy import A3CTFPolicy as LoadPolicy
        elif self._algorithm == "PG":
            from ray.rllib.agents.pg.pg_tf_policy import PGTFPolicy as LoadPolicy
        elif self._algorithm == "DQN":
            from ray.rllib.agents.dqn.dqn_policy import DQNTFPolicy as LoadPolicy
        else:
            raise TypeError(f"Unsupported algorithm: {self._algorithm}")

        self._prep = ModelCatalog.get_preprocessor_for_space(self._observation_space)
        self._sess = tf.Session(graph=tf.Graph())
        self._sess.__enter__()

        with tf.name_scope(self._policy_name):
            # The observation space must be flattened before it is passed to the policy.
            flat_obs_space = self._prep.observation_space
            self.policy = LoadPolicy(flat_obs_space, self._action_space, {})
            objs = pickle.load(open(self._load_path, "rb"))
            objs = pickle.loads(objs["worker"])
            state = objs["state"]
            weights = state[self._policy_name]
            self.policy.set_weights(weights)

    def teardown(self):
        # TODO: actually teardown the TF session
        pass

    def act(self, obs):
        obs = self._prep.transform(obs)
        action = self.policy.compute_actions([obs], explore=False)[0][0]
        return action
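# A minimal lifecycle sketch (illustrative, not part of the original code): the
# checkpoint policy is cheap to construct and only builds its TF graph when
# setup() is called. The path, spaces, and policy name below are assumptions.
import gym

obs_space = gym.spaces.Box(low=0.0, high=1.0, shape=(16,))
policy = RLlibTFCheckpointPolicy(
    load_path="checkpoint_50/checkpoint-50",  # hypothetical RLlib checkpoint file
    algorithm="DQN",
    policy_name="default_policy",  # hypothetical policy id used during training
    observation_space=obs_space,
    action_space=gym.spaces.Discrete(4),
)
policy.setup()  # builds the session and loads the checkpoint weights
action = policy.act(obs_space.sample())
policy.teardown()  # currently a no-op (see the TODO above)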
class RLAgent(Agent):
    def __init__(self, load_path, policy_name, observation_space, action_space):
        self._checkpoint_path = load_path
        self._policy_name = policy_name
        self._observation_space = observation_space
        self._action_space = action_space
        self._sess = None

        if isinstance(action_space, gym.spaces.Box):
            self.is_continuous = True
        elif isinstance(action_space, gym.spaces.Discrete):
            self.is_continuous = False
        else:
            raise TypeError("Unsupported action space")

        self._prep = ModelCatalog.get_preprocessor_for_space(self._observation_space)
        self._sess = tf.compat.v1.Session(graph=tf.Graph())
        self._sess.__enter__()

        with tf.name_scope(self._policy_name):
            # The observation space must be flattened before it is passed to the policy.
            flat_obs_space = self._prep.observation_space
            # `LoadPolicy` is expected to be imported at module level
            # (e.g. PPOTFPolicy from ray.rllib.agents.ppo.ppo_tf_policy).
            self.policy = LoadPolicy(flat_obs_space, self._action_space, {})
            objs = pickle.load(open(self._checkpoint_path, "rb"))
            objs = pickle.loads(objs["worker"])
            state = objs["state"]
            weights = state[self._policy_name]
            self.policy.set_weights(weights)

    def act(self, obs):
        if isinstance(obs, list):
            # Batch inference.
            obs = [self._prep.transform(o) for o in obs]
            action = self.policy.compute_actions(obs, explore=False)[0]
        else:
            # Single inference.
            obs = self._prep.transform(obs)
            action = self.policy.compute_actions([obs], explore=False)[0][0]
        return action
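# A minimal usage sketch (illustrative, not part of the original code): RLAgent.act
# accepts either one observation or a list of observations. The checkpoint path,
# spaces, and policy name below are assumptions.
import gym

obs_space = gym.spaces.Box(low=-1e10, high=1e10, shape=(32,))
agent = RLAgent(
    load_path="checkpoint_200/checkpoint-200",  # hypothetical RLlib checkpoint file
    policy_name="default_policy",  # hypothetical policy id used during training
    observation_space=obs_space,
    action_space=gym.spaces.Box(low=-1.0, high=1.0, shape=(3,)),
)
single_action = agent.act(obs_space.sample())  # single inference
batch_actions = agent.act([obs_space.sample() for _ in range(4)])  # batch inference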
def __init__(self, load_path, algorithm, policy_name, yaml_path):
    load_path = str(load_path)
    if algorithm == "ppo":
        from ray.rllib.agents.ppo.ppo_tf_policy import PPOTFPolicy as LoadPolicy
        from ray.rllib.agents.ppo import DEFAULT_CONFIG
    elif algorithm in ["a2c", "a3c"]:
        from ray.rllib.agents.a3c.a3c_tf_policy import A3CTFPolicy as LoadPolicy
        from ray.rllib.agents.a3c import DEFAULT_CONFIG
    elif algorithm == "pg":
        from ray.rllib.agents.pg.pg_tf_policy import PGTFPolicy as LoadPolicy
        from ray.rllib.agents.pg import DEFAULT_CONFIG
    elif algorithm == "dqn":
        from ray.rllib.agents.dqn import DQNTFPolicy as LoadPolicy
        from ray.rllib.agents.dqn import DEFAULT_CONFIG
    elif algorithm == "maac":
        from benchmark.agents.maac.tf_policy import CA2CTFPolicy as LoadPolicy
        from benchmark.agents.maac.tf_policy import DEFAULT_CONFIG
    elif algorithm == "maddpg":
        from benchmark.agents.maddpg.tf_policy import MADDPG2TFPolicy as LoadPolicy
        from benchmark.agents.maddpg.tf_policy import DEFAULT_CONFIG
    elif algorithm == "mfac":
        from benchmark.agents.mfac.tf_policy import MFACTFPolicy as LoadPolicy
        from benchmark.agents.mfac.tf_policy import DEFAULT_CONFIG
    elif algorithm == "networked_pg":
        from benchmark.agents.networked_pg.tf_policy import (
            NetworkedPG as LoadPolicy,
        )
        from benchmark.agents.networked_pg.tf_policy import (
            PG_DEFAULT_CONFIG as DEFAULT_CONFIG,
        )
    else:
        raise ValueError(f"Unsupported algorithm: {algorithm}")

    yaml_path = BASE_DIR / yaml_path
    load_path = BASE_DIR / f"log/results/run/{load_path}"

    config = load_config(yaml_path)
    observation_space = config["policy"][1]
    action_space = config["policy"][2]

    pconfig = DEFAULT_CONFIG
    pconfig["model"].update(config["policy"][-1].get("model", {}))
    pconfig["agent_id"] = policy_name

    self._prep = ModelCatalog.get_preprocessor_for_space(observation_space)
    self._sess = tf.Session(graph=tf.get_default_graph())

    with tf.name_scope(policy_name):
        # The observation space must be flattened before it is passed to the policy.
        flat_obs_space = self._prep.observation_space
        policy = LoadPolicy(flat_obs_space, action_space, pconfig)
        self._sess.run(tf.global_variables_initializer())
        objs = pickle.load(open(load_path, "rb"))
        objs = pickle.loads(objs["worker"])
        state = objs["state"]
        weights = state[policy_name]
        policy.set_weights(weights)

    # Uncomment to list all operations in the graph when hunting for tensor names:
    # for op in tf.get_default_graph().get_operations():
    #     print(str(op.name))

    # These tensor names were found by inspecting the trained model.
    if algorithm == "ppo":
        # CRUCIAL FOR SAFETY:
        # We use Tensor("split") instead of Tensor("add") to force
        # PPO to be deterministic.
        self._input_node = self._sess.graph.get_tensor_by_name(
            f"{policy_name}/observation:0"
        )
        self._output_node = self._sess.graph.get_tensor_by_name(
            f"{policy_name}/split:0"
        )
    elif algorithm == "dqn":
        self._input_node = self._sess.graph.get_tensor_by_name(
            f"{policy_name}/observations:0"
        )
        self._output_node = tf.argmax(
            self._sess.graph.get_tensor_by_name(
                f"{policy_name}/value_out/BiasAdd:0"
            ),
            axis=1,
        )
    elif algorithm == "maac":
        self._input_node = self._sess.graph.get_tensor_by_name(
            f"{policy_name}/policy-inputs:0"
        )
        self._output_node = tf.argmax(
            self._sess.graph.get_tensor_by_name(
                f"{policy_name}/logits_out/BiasAdd:0"
            ),
            axis=1,
        )
    elif algorithm == "maddpg":
        self._input_node = self._sess.graph.get_tensor_by_name(
            f"{policy_name}/obs_2:0"
        )
        self._output_node = tf.argmax(
            self._sess.graph.get_tensor_by_name(
                f"{policy_name}/actor/AGENT_2_actor_RelaxedOneHotCategorical_1/sample/AGENT_2_actor_exp/forward/Exp:0"
            )
        )
    else:
        self._input_node = self._sess.graph.get_tensor_by_name(
            f"{policy_name}/observations:0"
        )
        self._output_node = tf.argmax(
            self._sess.graph.get_tensor_by_name(f"{policy_name}/fc_out/BiasAdd:0"),
            axis=1,
        )
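# A minimal inference sketch (an assumption, not part of the original code): an
# `act` method to accompany the `__init__` above, supporting both a single
# observation and a list of observations via the resolved graph tensors.
def act(self, obs):
    if isinstance(obs, list):
        # Batch inference.
        obs = [self._prep.transform(o) for o in obs]
        return self._sess.run(self._output_node, feed_dict={self._input_node: obs})
    # Single inference.
    obs = self._prep.transform(obs)
    return self._sess.run(self._output_node, feed_dict={self._input_node: [obs]})[0]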