def setup(self):
    if self._sess:
        return

    if self._algorithm == "PPO":
        from ray.rllib.agents.ppo.ppo_tf_policy import PPOTFPolicy as LoadPolicy
    elif self._algorithm in ["A2C", "A3C"]:
        from ray.rllib.agents.a3c.a3c_tf_policy import A3CTFPolicy as LoadPolicy
    elif self._algorithm == "PG":
        from ray.rllib.agents.pg.pg_tf_policy import PGTFPolicy as LoadPolicy
    elif self._algorithm == "DQN":
        from ray.rllib.agents.dqn.dqn_policy import DQNTFPolicy as LoadPolicy
    else:
        raise TypeError("Unsupported algorithm")

    self._prep = ModelCatalog.get_preprocessor_for_space(self._observation_space)
    self._sess = tf.Session(graph=tf.Graph())
    self._sess.__enter__()

    with tf.name_scope(self._policy_name):
        # The observation space must be flattened before it is passed to the policy.
        flat_obs_space = self._prep.observation_space
        self.policy = LoadPolicy(flat_obs_space, self._action_space, {})
        objs = pickle.load(open(self._load_path, "rb"))
        objs = pickle.loads(objs["worker"])
        state = objs["state"]
        weights = state[self._policy_name]
        self.policy.set_weights(weights)

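# Usage sketch (an assumption, not part of the original snippet): once `setup`
# has run, a raw observation can be flattened by the preprocessor and fed to
# the standard RLlib `Policy.compute_actions` API. `obs` is a hypothetical
# environment observation; the `explore` kwarg exists in recent RLlib versions.
def act(self, obs):
    obs = self._prep.transform(obs)  # flatten Dict/Tuple obs into a 1-D vector
    actions, _, _ = self.policy.compute_actions([obs], explore=False)
    return actions[0]
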
def __init__(self, obs_space, action_space, config):
    self.action_space = action_space
    self.action_noise_std = config["action_noise_std"]
    self.preprocessor = ModelCatalog.get_preprocessor_for_space(obs_space)
    self.observation_filter = get_filter(config["observation_filter"],
                                         self.preprocessor.shape)
    self.single_threaded = config.get("single_threaded", False)
    self.sess = make_session(single_threaded=self.single_threaded)
    self.inputs = tf.placeholder(tf.float32,
                                 [None] + list(self.preprocessor.shape))

    # Policy network.
    dist_class, dist_dim = ModelCatalog.get_action_dist(
        self.action_space, config["model"], dist_type="deterministic")
    model = ModelCatalog.get_model({SampleBatch.CUR_OBS: self.inputs},
                                   obs_space, action_space, dist_dim,
                                   config["model"])
    dist = dist_class(model.outputs, model)
    self.sampler = dist.sample()

    self.variables = ray.experimental.tf_utils.TensorFlowVariables(
        model.outputs, self.sess)
    self.num_params = sum(
        np.prod(variable.shape.as_list())
        for _, variable in self.variables.variables.items())
    self.sess.run(tf.global_variables_initializer())

def __init__(self, load_path, observation_space, action_space):
    self._prep = ModelCatalog.get_preprocessor_for_space(observation_space)
    self._path_to_model = load_path

    if isinstance(action_space, gym.spaces.Box):
        self.is_continuous = True
    elif isinstance(action_space, gym.spaces.Discrete):
        self.is_continuous = False
    else:
        raise TypeError("Unsupported action space")

    self._sess = tf.Session(graph=tf.Graph())
    self._sess.__enter__()
    tf.saved_model.load(self._sess, export_dir=self._path_to_model, tags=["serve"])

    graph = tf.get_default_graph()
    if self.is_continuous:
        # These tensor names were found by inspecting the trained model.
        # Deterministic output:
        self.output_node = graph.get_tensor_by_name("default_policy/split:0")
        # With added Gaussian noise:
        # output_node = graph.get_tensor_by_name("default_policy/add:0")
    else:
        self.output_node = graph.get_tensor_by_name("default_policy/ArgMax:0")
    self.input_node = graph.get_tensor_by_name("default_policy/observation:0")

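# Usage sketch (an assumption): run a flattened observation through the
# restored graph's input/output tensors. `obs` is a hypothetical raw observation.
def act(self, obs):
    obs = self._prep.transform(obs)
    res = self._sess.run(self.output_node, feed_dict={self.input_node: [obs]})
    return res[0]
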
def __init__(self, load_path, algorithm, policy_name, observation_space, action_space):
    self._checkpoint_path = load_path
    self._policy_name = policy_name
    self._observation_space = observation_space
    self._action_space = action_space
    self._prep = ModelCatalog.get_preprocessor_for_space(self._observation_space)
    flat_obs_space = self._prep.observation_space

    ray.init(ignore_reinit_error=True, local_mode=True)
    from zhr_train_rllib.ppo_policy_modeldist_multiv_multiobj import PPOTorchPolicy as LoadPolicy

    config = ray.rllib.agents.ppo.ppo.DEFAULT_CONFIG.copy()
    config["num_workers"] = 0
    config["model"]["free_log_std"] = False
    config["exploration_config"]["type"] = "zhr.utils.saved_model_simple.StochasticSampling"
    self.policy = LoadPolicy(flat_obs_space, self._action_space, config)

    objs = pickle.load(open(self._checkpoint_path, "rb"))
    objs = pickle.loads(objs["worker"])
    state = objs["state"]
    filters = objs["filters"]
    self.filters = filters[self._policy_name]
    weights = state[self._policy_name]
    weights.pop("_optimizer_variables")
    self.policy.set_weights(weights)
    self.model = self.policy.model

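# Usage sketch (an assumption): the restored MeanStd filter is applied to the
# flattened observation before querying the Torch policy; `update=False`
# freezes the filter statistics at evaluation time. `obs` is hypothetical.
def act(self, obs):
    obs = self._prep.transform(obs)
    obs = self.filters(obs, update=False)
    actions, _, _ = self.policy.compute_actions([obs], explore=False)
    return actions[0]
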
def _build_policy_map(self, policy_dict, policy_config):
    policy_map = {}
    preprocessors = {}
    for name, (cls, obs_space, act_space, conf) in sorted(policy_dict.items()):
        logger.debug("Creating policy for {}".format(name))
        merged_conf = merge_dicts(policy_config, conf)
        if self.preprocessing_enabled:
            preprocessor = ModelCatalog.get_preprocessor_for_space(
                obs_space, merged_conf.get("model"))
            preprocessors[name] = preprocessor
            obs_space = preprocessor.observation_space
        else:
            preprocessors[name] = NoPreprocessor(obs_space)
        if isinstance(obs_space, gym.spaces.Dict) or \
                isinstance(obs_space, gym.spaces.Tuple):
            raise ValueError(
                "Found raw Tuple|Dict space as input to policy. "
                "Please preprocess these observations with a "
                "Tuple|DictFlatteningPreprocessor.")
        if tf:
            with tf.variable_scope(name):
                policy_map[name] = cls(obs_space, act_space, merged_conf)
        else:
            policy_map[name] = cls(obs_space, act_space, merged_conf)
    if self.worker_index == 0:
        logger.info("Built policy map: {}".format(policy_map))
        logger.info("Built preprocessor map: {}".format(preprocessors))
    return policy_map, preprocessors

def __init__(self, load_path, policy_name, observation_space, action_space):
    # `LoadPolicy` was undefined in the original snippet; the comment below
    # names PPOTFPolicy, so that import is assumed here.
    from ray.rllib.agents.ppo.ppo_tf_policy import PPOTFPolicy as LoadPolicy

    self._checkpoint_path = load_path
    self._policy_name = policy_name
    self._observation_space = observation_space
    self._action_space = action_space

    if isinstance(action_space, gym.spaces.Box):
        self.is_continuous = True
    elif isinstance(action_space, gym.spaces.Discrete):
        self.is_continuous = False
    else:
        raise TypeError("Unsupported action space")

    self._prep = ModelCatalog.get_preprocessor_for_space(self._observation_space)
    self._sess = tf.compat.v1.Session(graph=tf.Graph())
    self._sess.__enter__()

    with tf.compat.v1.name_scope(self._policy_name):
        # The observation space must be flattened before it is passed to PPOTFPolicy.
        flat_obs_space = self._prep.observation_space
        self.policy = LoadPolicy(flat_obs_space, self._action_space, {})
        objs = pickle.load(open(self._checkpoint_path, "rb"))
        objs = pickle.loads(objs["worker"])
        state = objs["state"]
        weights = state[self._policy_name]
        self.policy.set_weights(weights)

def __init__(self, load_path, algorithm, policy_name, observation_space, action_space):
    self._checkpoint_path = load_path
    self._policy_name = policy_name
    self._observation_space = observation_space
    self._action_space = action_space
    self._prep = ModelCatalog.get_preprocessor_for_space(self._observation_space)
    flat_obs_space = self._prep.observation_space

    ray.init(ignore_reinit_error=True, local_mode=True)
    from utils.ppo_policy import PPOTorchPolicy as LoadPolicy

    config = ray.rllib.agents.ppo.ppo.DEFAULT_CONFIG.copy()
    config["num_workers"] = 0
    config["model"]["use_lstm"] = True
    config["model"]["free_log_std"] = False
    self.policy = LoadPolicy(flat_obs_space, self._action_space, config)

    objs = pickle.load(open(self._checkpoint_path, "rb"))
    objs = pickle.loads(objs["worker"])
    state = objs["state"]
    filters = objs["filters"]
    self.filters = filters[self._policy_name]
    weights = state[self._policy_name]
    weights.pop("_optimizer_variables")
    self.policy.set_weights(weights)
    self.model = self.policy.model

    # Initial LSTM state, batched to batch size 1.
    self.rnn_state = self.model.get_initial_state()
    self.rnn_state = [self.rnn_state[0].unsqueeze(0), self.rnn_state[1].unsqueeze(0)]

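# Usage sketch (an assumption): with `use_lstm` the recurrent state must be
# threaded through successive `compute_actions` calls so the policy sees its
# own history. `obs` is a hypothetical raw observation.
def act(self, obs):
    obs = self._prep.transform(obs)
    obs = self.filters(obs, update=False)
    actions, self.rnn_state, _ = self.policy.compute_actions(
        [obs], state_batches=self.rnn_state, explore=False)
    return actions[0]
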
def __init__(self, load_path, algorithm, policy_name, observation_space, action_space):
    self._checkpoint_path = load_path
    self._policy_name = policy_name
    self._observation_space = observation_space
    self._action_space = action_space
    self._prep = ModelCatalog.get_preprocessor_for_space(self._observation_space)
    flat_obs_space = self._prep.observation_space

    ray.init(ignore_reinit_error=True, local_mode=True)
    from utils.ppo_policy import PPOTorchPolicy as LoadPolicy
    from utils.fc_model import FCMultiLayerNetwork
    ModelCatalog.register_custom_model("my_fc", FCMultiLayerNetwork)

    config = ray.rllib.agents.ppo.ppo.DEFAULT_CONFIG.copy()
    config["vf_share_layers"] = True
    config["num_workers"] = 0
    config["model"]["custom_model"] = "my_fc"
    config["model"]["free_log_std"] = False
    self.policy = LoadPolicy(flat_obs_space, self._action_space, config)

    objs = pickle.load(open(self._checkpoint_path, "rb"))
    objs = pickle.loads(objs["worker"])
    state = objs["state"]
    filters = objs["filters"]
    self.filters = filters[self._policy_name]
    weights = state[self._policy_name]
    weights.pop("_optimizer_variables")
    self.policy.set_weights(weights)
    self.model = self.policy.model

def __init__(self, load_path, algorithm, policy_name, observation_space, action_space):
    self._checkpoint_path = load_path
    self._policy_name = policy_name
    self._observation_space = observation_space
    self._action_space = action_space
    self._prep = ModelCatalog.get_preprocessor_for_space(self._observation_space)
    flat_obs_space = self._prep.observation_space

    ray.init(ignore_reinit_error=True, local_mode=True)
    from utils.ppo_policy import PPOTorchPolicy as LoadPolicy
    from utils.rnn_model import RNNDVEModel
    ModelCatalog.register_custom_model("my_rnn", RNNDVEModel)

    config = ray.rllib.agents.ppo.ppo.DEFAULT_CONFIG.copy()
    config["num_workers"] = 0
    config["model"]["custom_model"] = "my_rnn"
    self.policy = LoadPolicy(flat_obs_space, self._action_space, config)

    objs = pickle.load(open(self._checkpoint_path, "rb"))
    objs = pickle.loads(objs["worker"])
    state = objs["state"]
    filters = objs["filters"]
    self.filters = filters[self._policy_name]
    weights = state[self._policy_name]
    weights.pop("_optimizer_variables")
    self.policy.set_weights(weights)
    self.model = self.policy.model

    # Initial RNN state, reshaped to a single batch row.
    self.rnn_state = self.model.get_initial_state()
    self.rnn_state = [torch.reshape(self.rnn_state[0], shape=(1, -1))]

def __init__(self, load_path, observation_space, action_space):
    self._prep = ModelCatalog.get_preprocessor_for_space(observation_space)
    self._checkpoint_path = load_path

    if isinstance(action_space, gym.spaces.Box):
        self.is_continuous = True
    elif isinstance(action_space, gym.spaces.Discrete):
        self.is_continuous = False
    else:
        raise TypeError("Unsupported action space")

    self._sess = tf.Session(graph=tf.Graph())
    self._sess.__enter__()
    saver = tf.train.import_meta_graph(
        os.path.join(os.path.dirname(self._checkpoint_path), "model.meta"))
    saver.restore(
        self._sess, os.path.join(os.path.dirname(self._checkpoint_path), "model"))

    graph = tf.get_default_graph()
    if self.is_continuous:
        # These tensor names were found by inspecting the trained model.
        # Deterministic output:
        self.output_node = graph.get_tensor_by_name("default_policy/split:0")
        # With added Gaussian noise:
        # output_node = graph.get_tensor_by_name("default_policy/add:0")
    else:
        self.output_node = graph.get_tensor_by_name("default_policy/ArgMax:0")
    self.input_node = graph.get_tensor_by_name("default_policy/observation:0")

def __init__(self, load_path, algorithm, policy_name, observation_space, action_space):
    load_path = str(load_path)
    if algorithm == "PPO":
        from ray.rllib.agents.ppo.ppo_tf_policy import PPOTFPolicy as LoadPolicy
    elif algorithm in ["A2C", "A3C"]:
        from ray.rllib.agents.a3c.a3c_tf_policy import A3CTFPolicy as LoadPolicy
    elif algorithm == "PG":
        from ray.rllib.agents.pg.pg_tf_policy import PGTFPolicy as LoadPolicy
    elif algorithm == "DQN":
        from ray.rllib.agents.dqn.dqn_policy import DQNTFPolicy as LoadPolicy
    else:
        raise ValueError(f"Unsupported algorithm: {algorithm}")

    self._prep = ModelCatalog.get_preprocessor_for_space(observation_space)
    self._sess = tf.compat.v1.Session(graph=tf.Graph())
    self._sess.__enter__()  # build the policy inside this session's graph

    with tf.compat.v1.name_scope(policy_name):
        # The observation space must be flattened before it is passed to the policy.
        flat_obs_space = self._prep.observation_space
        policy = LoadPolicy(flat_obs_space, action_space, {})
        objs = pickle.load(open(load_path, "rb"))
        objs = pickle.loads(objs["worker"])
        state = objs["state"]
        weights = state[policy_name]
        policy.set_weights(weights)

    # These tensor names were found by inspecting the trained model.
    if algorithm == "PPO":
        # CRUCIAL FOR SAFETY:
        # We use Tensor("split") instead of Tensor("add") to force
        # PPO to be deterministic.
        self._input_node = self._sess.graph.get_tensor_by_name(
            f"{policy_name}/observation:0"
        )
        self._output_node = self._sess.graph.get_tensor_by_name(
            f"{policy_name}/split:0"
        )
    elif algorithm == "DQN":
        self._input_node = self._sess.graph.get_tensor_by_name(
            f"{policy_name}/observations:0"
        )
        self._output_node = tf.argmax(
            input=self._sess.graph.get_tensor_by_name(
                f"{policy_name}/value_out/BiasAdd:0"
            ),
            axis=1,
        )
    else:
        self._input_node = self._sess.graph.get_tensor_by_name(
            f"{policy_name}/observations:0"
        )
        self._output_node = tf.argmax(
            input=self._sess.graph.get_tensor_by_name(
                f"{policy_name}/fc_out/BiasAdd:0"
            ),
            axis=1,
        )

def __init__(self, obs_space, action_space, config):
    super().__init__(obs_space, action_space, config)
    self.action_noise_std = self.config["action_noise_std"]
    self.preprocessor = ModelCatalog.get_preprocessor_for_space(
        self.observation_space)
    self.observation_filter = get_filter(self.config["observation_filter"],
                                         self.preprocessor.shape)
    self.single_threaded = self.config.get("single_threaded", False)

    if self.config["framework"] == "tf":
        self.sess = make_session(single_threaded=self.single_threaded)
        # Set graph-level seed.
        if config.get("seed") is not None:
            with self.sess.as_default():
                tf1.set_random_seed(config["seed"])
        self.inputs = tf1.placeholder(tf.float32,
                                      [None] + list(self.preprocessor.shape))
    else:
        if not tf1.executing_eagerly():
            tf1.enable_eager_execution()
        self.sess = self.inputs = None
        if config.get("seed") is not None:
            # Tf2.x.
            if config.get("framework") == "tf2":
                tf.random.set_seed(config["seed"])
            # Tf-eager.
            elif tf1 and config.get("framework") == "tfe":
                tf1.set_random_seed(config["seed"])

    # Policy network.
    self.dist_class, dist_dim = ModelCatalog.get_action_dist(
        self.action_space, self.config["model"], dist_type="deterministic")
    self.model = ModelCatalog.get_model_v2(
        obs_space=self.preprocessor.observation_space,
        action_space=self.action_space,
        num_outputs=dist_dim,
        model_config=self.config["model"],
    )
    self.sampler = None
    if self.sess:
        dist_inputs, _ = self.model({SampleBatch.CUR_OBS: self.inputs})
        dist = self.dist_class(dist_inputs, self.model)
        self.sampler = dist.sample()
        self.variables = ray.experimental.tf_utils.TensorFlowVariables(
            dist_inputs, self.sess)
        self.sess.run(tf1.global_variables_initializer())
    else:
        self.variables = ray.experimental.tf_utils.TensorFlowVariables(
            [], None, self.model.variables())
    self.num_params = sum(
        np.prod(variable.shape.as_list())
        for _, variable in self.variables.variables.items())

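# Sketch of how `num_params` is typically consumed by the surrounding ES
# trainer (an assumption, not code from this class, and it presumes the
# companion `get_flat_weights`/`set_flat_weights` helpers exist on the
# policy): perturb the flat parameter vector with Gaussian noise and write it
# back. The 0.02 noise stdev is illustrative only.
theta = policy.get_flat_weights()  # flat vector of length num_params
noise = np.random.randn(policy.num_params).astype(np.float32)
policy.set_flat_weights(theta + 0.02 * noise)
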
def on_postprocess_trajectory(self, worker, episode, agent_id, policy_id,
                              policies, postprocessed_batch, original_batches,
                              **kwargs):
    to_update = postprocessed_batch[SampleBatch.CUR_OBS]
    other_id = 1 if agent_id == 0 else 0
    action_encoder = ModelCatalog.get_preprocessor_for_space(
        Box(-np.inf, np.inf, (ACTION_VEC_SIZE,), np.float32)  # Unbounded
    )

    # Set the opponent actions into the observation.
    _, opponent_batch = original_batches[other_id]
    opponent_actions = np.array([
        action_encoder.transform(a)
        for a in opponent_batch[SampleBatch.ACTIONS]
    ])
    to_update[:, -ACTION_VEC_SIZE:] = opponent_actions

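# Wiring sketch (an assumption): the method above would live on a
# `DefaultCallbacks` subclass, here hypothetically named
# `FillOpponentActions`, which is then passed to the trainer config.
from ray.rllib.agents.callbacks import DefaultCallbacks

class FillOpponentActions(DefaultCallbacks):
    # ... `on_postprocess_trajectory` as defined above ...
    pass

config = {"callbacks": FillOpponentActions}
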
def __init__(self, load_path, algorithm, policy_name, observation_space, action_space):
    self._checkpoint_path = load_path
    self._algorithm = algorithm
    self._policy_name = policy_name
    self._observation_space = observation_space
    self._action_space = action_space

    if isinstance(action_space, gym.spaces.Box):
        self.is_continuous = True
    elif isinstance(action_space, gym.spaces.Discrete):
        self.is_continuous = False
    else:
        raise TypeError("Unsupported action space")

    if self._algorithm == "PPO":
        from ray.rllib.agents.ppo.ppo_tf_policy import PPOTFPolicy as LoadPolicy
    elif self._algorithm in ["A2C", "A3C"]:
        from ray.rllib.agents.a3c.a3c_tf_policy import A3CTFPolicy as LoadPolicy
    elif self._algorithm == "PG":
        from ray.rllib.agents.pg.pg_tf_policy import PGTFPolicy as LoadPolicy
    elif self._algorithm == "DQN":
        from ray.rllib.agents.dqn.dqn_tf_policy import DQNTFPolicy as LoadPolicy
    else:
        raise TypeError("Unsupported algorithm")

    self._prep = ModelCatalog.get_preprocessor_for_space(self._observation_space)
    self._sess = tf.Session(graph=tf.Graph())
    self._sess.__enter__()

    import ray.rllib.agents.ppo as ppo
    config = ppo.DEFAULT_CONFIG.copy()
    config["num_workers"] = 0
    config["model"]["use_lstm"] = True

    with tf.name_scope(self._policy_name):
        # The observation space must be flattened before it is passed to PPOTFPolicy.
        flat_obs_space = self._prep.observation_space
        self.policy = LoadPolicy(flat_obs_space, self._action_space, config)
        objs = pickle.load(open(self._checkpoint_path, "rb"))
        objs = pickle.loads(objs["worker"])
        state = objs["state"]
        filters = objs["filters"]
        self.filters = filters[self._policy_name]
        weights = state[self._policy_name]
        self.policy.set_weights(weights)

    self.model = self.policy.model
    # print(self.model.summary())

    # Initial LSTM state, batched to batch size 1.
    self.rnn_state = self.model.get_initial_state()
    self.rnn_state = [[self.rnn_state[0]], [self.rnn_state[1]]]

def __init__(self, path_to_model, observation_space, action_space):
    self._prep = ModelCatalog.get_preprocessor_for_space(observation_space)
    self._path_to_model = path_to_model
    if isinstance(action_space, gym.spaces.Box):
        self.is_continuous = True
    elif isinstance(action_space, gym.spaces.Discrete):
        self.is_continuous = False
    else:
        raise TypeError("Unsupported action space")

def generate_policies(
    policy_id: str,
    policy_constructor_tuple: Tuple["PolicyClass", "gym.Space", "gym.Space", dict],
    policies: Dict[str, TFPolicy],
    policies_to_train: List[str],
    policy_config: dict,
    preprocessors: Dict[str, Any],
    obs_filters: Dict[str, Any],
    observation_filter: str,
):
    """
    Get policies for each ``agent_id``, and instantiate new ones
    for newly created agents.
    """
    policy_cls, obs_space, act_space, conf = policy_constructor_tuple

    # Both memberships must be parenthesized; without the parentheses Python
    # chains ``in`` and ``!=`` into a different (wrong) check.
    if (policy_id in preprocessors) != (policy_id in policies):
        raise ValueError("'preprocessors' and 'policies' do not agree.")
    if (policy_id in obs_filters) != (policy_id in policies):
        raise ValueError("'obs_filters' and 'policies' do not agree.")

    # If we haven't seen this id, we instantiate a new policy.
    if policy_id not in policies:
        merged_conf = merge_dicts(policy_config, conf)

        # We assume ``self.preprocessing_enabled == True`` in ``RolloutWorker``.
        preprocessor = ModelCatalog.get_preprocessor_for_space(
            obs_space, merged_conf.get("model"))
        preprocessors[policy_id] = preprocessor
        obs_space = preprocessor.observation_space

        if tf and tf.executing_eagerly():
            if hasattr(policy_cls, "as_eager"):
                policy_cls = policy_cls.as_eager()
                if policy_config["eager_tracing"]:
                    policy_cls = policy_cls.with_tracing()
            elif not issubclass(policy_cls, TFPolicy):
                pass  # could be some other type of policy
            else:
                raise ValueError("This policy does not support eager "
                                 "execution: {}".format(policy_cls))
        if tf:
            with tf.variable_scope(policy_id):
                policies[policy_id] = policy_cls(obs_space, act_space, merged_conf)
                policies_to_train.append(policy_id)
        else:
            policies[policy_id] = policy_cls(obs_space, act_space, merged_conf)
            policies_to_train.append(policy_id)
        obs_filters[policy_id] = get_filter(observation_filter, obs_space.shape)

    return policies, preprocessors, obs_filters, policies_to_train

def __init__(self, path_to_model, observation_space):
    path_to_model = str(path_to_model)  # might be a str or a Path; normalize to str
    self._prep = ModelCatalog.get_preprocessor_for_space(observation_space)
    self._sess = tf.compat.v1.Session(graph=tf.Graph())
    tf.compat.v1.saved_model.load(
        self._sess, export_dir=path_to_model, tags=["serve"]
    )
    self._output_node = self._sess.graph.get_tensor_by_name("default_policy/add:0")
    self._input_node = self._sess.graph.get_tensor_by_name(
        "default_policy/observation:0"
    )

def before_init(policy, observation_space, action_space, config):
    policy.action_noise_std = config["action_noise_std"]
    policy.preprocessor = ModelCatalog.get_preprocessor_for_space(
        observation_space)
    policy.observation_filter = get_filter(config["observation_filter"],
                                           policy.preprocessor.shape)
    policy.single_threaded = config.get("single_threaded", False)

    def _set_flat_weights(policy, theta):
        pos = 0
        theta_dict = policy.model.state_dict()
        new_theta_dict = {}
        for k in sorted(theta_dict.keys()):
            shape = policy.param_shapes[k]
            num_params = int(np.prod(shape))
            new_theta_dict[k] = torch.from_numpy(
                np.reshape(theta[pos:pos + num_params], shape))
            pos += num_params
        policy.model.load_state_dict(new_theta_dict)

    def _get_flat_weights(policy):
        # Get the parameter tensors.
        theta_dict = policy.model.state_dict()
        # Flatten it into a single np.ndarray.
        theta_list = []
        for k in sorted(theta_dict.keys()):
            theta_list.append(torch.reshape(theta_dict[k], (-1, )))
        cat = torch.cat(theta_list, dim=0)
        return cat.numpy()

    type(policy).set_flat_weights = _set_flat_weights
    type(policy).get_flat_weights = _get_flat_weights

    def _compute_actions(policy, obs_batch, add_noise=False, update=True):
        observation = policy.preprocessor.transform(obs_batch)
        observation = policy.observation_filter(
            observation[None], update=update)
        observation = convert_to_torch_tensor(observation)
        dist_inputs, _ = policy.model({
            SampleBatch.CUR_OBS: observation
        }, [], None)
        dist = policy.dist_class(dist_inputs, policy.model)
        action = dist.sample().detach().numpy()
        action = unbatch_actions(action)
        if add_noise and isinstance(policy.action_space, gym.spaces.Box):
            action += np.random.randn(*action.shape) * policy.action_noise_std
        return action

    type(policy).compute_actions = _compute_actions

def on_postprocess_trajectory(self, worker, episode, agent_id, policy_id,
                              policies, postprocessed_batch, original_batches,
                              **kwargs):
    to_update = postprocessed_batch[SampleBatch.CUR_OBS]
    other_id = 1 if agent_id == 0 else 0
    action_encoder = ModelCatalog.get_preprocessor_for_space(Discrete(2))

    # Set the opponent actions into the observation.
    _, opponent_batch = original_batches[other_id]
    opponent_actions = np.array([
        action_encoder.transform(a)
        for a in opponent_batch[SampleBatch.ACTIONS]
    ])
    to_update[:, -2:] = opponent_actions

def __init__(self, load_path, algorithm, policy_names, observation_space, action_space):
    self._checkpoint_path = load_path
    self._algorithm = algorithm
    self._policy_mapping = dict.fromkeys(policy_names, None)
    self._observation_space = observation_space
    self._action_space = action_space

    if isinstance(action_space, gym.spaces.Box):
        self.is_continuous = True
    elif isinstance(action_space, gym.spaces.Discrete):
        self.is_continuous = False
    else:
        raise TypeError("Unsupported action space")

    if self._algorithm == "PPO":
        from ray.rllib.agents.ppo.ppo_tf_policy import PPOTFPolicy as LoadPolicy
    elif self._algorithm in ["A2C", "A3C"]:
        from ray.rllib.agents.a3c.a3c_tf_policy import A3CTFPolicy as LoadPolicy
    elif self._algorithm == "PG":
        from ray.rllib.agents.pg.pg_tf_policy import PGTFPolicy as LoadPolicy
    elif self._algorithm == "DQN":
        from ray.rllib.agents.dqn.dqn_tf_policy import DQNTFPolicy as LoadPolicy
    else:
        raise TypeError("Unsupported algorithm")

    self._prep = ModelCatalog.get_preprocessor_for_space(self._observation_space)
    self._sess = tf.Session(graph=tf.Graph())
    self._sess.__enter__()

    objs = pickle.load(open(self._checkpoint_path, "rb"))
    objs = pickle.loads(objs["worker"])
    state = objs["state"]

    # Build one policy per name, each under its own variable scope.
    for name in self._policy_mapping:
        with tf.variable_scope(name):
            # The observation space must be flattened before it is passed to the policy.
            flat_obs_space = self._prep.observation_space
            self._policy_mapping[name] = PolicyWrapper(
                LoadPolicy, params=(flat_obs_space, self._action_space, {}))
            self._policy_mapping[name].set_preprocessor(self._prep)
            weights = state[name]
            self._policy_mapping[name].set_weights(weights)

def fill_in_actions(info):
    """Callback that saves opponent actions into the agent obs.

    If you don't care about opponent actions, you can leave this out."""
    to_update = info["post_batch"][SampleBatch.CUR_OBS]
    my_id = info["agent_id"]
    other_id = 1 if my_id == 0 else 0
    action_encoder = ModelCatalog.get_preprocessor_for_space(Discrete(2))

    # Set the opponent actions into the observation.
    _, opponent_batch = info["all_pre_batches"][other_id]
    opponent_actions = np.array([
        action_encoder.transform(a)
        for a in opponent_batch[SampleBatch.ACTIONS]
    ])
    to_update[:, -2:] = opponent_actions

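# Wiring sketch (an assumption, matching the legacy info-dict callback style
# that `fill_in_actions(info)` uses in older RLlib versions):
config = {
    "callbacks": {"on_postprocess_traj": fill_in_actions},
}
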
def _build_policy_map(
        self, policy_dict: MultiAgentPolicyConfigDict,
        policy_config: TrainerConfigDict
) -> Tuple[Dict[PolicyID, Policy], Dict[PolicyID, Preprocessor]]:
    policy_map = {}
    preprocessors = {}
    for name, (cls, obs_space, act_space, conf) in sorted(policy_dict.items()):
        logger.debug("Creating policy for {}".format(name))
        merged_conf = merge_dicts(policy_config, conf)
        merged_conf["num_workers"] = self.num_workers
        merged_conf["worker_index"] = self.worker_index
        if self.preprocessing_enabled:
            preprocessor = ModelCatalog.get_preprocessor_for_space(
                obs_space, merged_conf.get("model"))
            preprocessors[name] = preprocessor
            obs_space = preprocessor.observation_space
        else:
            preprocessors[name] = NoPreprocessor(obs_space)
        if isinstance(obs_space, gym.spaces.Dict) or \
                isinstance(obs_space, gym.spaces.Tuple):
            raise ValueError(
                "Found raw Tuple|Dict space as input to policy. "
                "Please preprocess these observations with a "
                "Tuple|DictFlatteningPreprocessor.")
        if tf1 and tf1.executing_eagerly():
            if hasattr(cls, "as_eager"):
                cls = cls.as_eager()
                if policy_config.get("eager_tracing"):
                    cls = cls.with_tracing()
            elif not issubclass(cls, TFPolicy):
                pass  # could be some other type of policy
            else:
                raise ValueError("This policy does not support eager "
                                 "execution: {}".format(cls))
        if tf1:
            with tf1.variable_scope(name):
                policy_map[name] = cls(obs_space, act_space, merged_conf)
        else:
            policy_map[name] = cls(obs_space, act_space, merged_conf)
    if self.worker_index == 0:
        logger.info("Built policy map: {}".format(policy_map))
        logger.info("Built preprocessor map: {}".format(preprocessors))
    return policy_map, preprocessors

def _build_policy_map(self, policy_dict, policy_config):
    policy_map = {}
    preprocessors = {}
    for name, (cls, obs_space, act_space, conf) in sorted(policy_dict.items()):
        merged_conf = merge_dicts(policy_config, conf)
        if self.preprocessing_enabled:
            preprocessor = ModelCatalog.get_preprocessor_for_space(
                obs_space, merged_conf.get("model"))
            preprocessors[name] = preprocessor
            obs_space = preprocessor.observation_space
        else:
            preprocessors[name] = NoPreprocessor(obs_space)
        if isinstance(obs_space, gym.spaces.Dict) or \
                isinstance(obs_space, gym.spaces.Tuple):
            raise ValueError(
                "Found raw Tuple|Dict space as input to policy graph. "
                "Please preprocess these observations with a "
                "Tuple|DictFlatteningPreprocessor.")
        with tf.variable_scope(name):
            policy_map[name] = cls(obs_space, act_space, merged_conf)
    return policy_map, preprocessors

def __init__(self, load_path, algorithm, policy_name, observation_space):
    load_path = str(load_path)
    self._prep = ModelCatalog.get_preprocessor_for_space(observation_space)
    self._sess = tf.compat.v1.Session(graph=tf.Graph())
    tf.compat.v1.saved_model.load(
        self._sess,
        export_dir=load_path,
        tags=["serve"],
        clear_devices=True,
    )

    # These tensor names were found by inspecting the trained model.
    if algorithm == "PPO":
        # CRUCIAL FOR SAFETY:
        # We use Tensor("split") instead of Tensor("add") to force
        # PPO to be deterministic.
        self._input_node = self._sess.graph.get_tensor_by_name(
            f"{policy_name}/observation:0"
        )
        self._output_node = self._sess.graph.get_tensor_by_name(
            f"{policy_name}/split:0"
        )
    # todo: need to check
    elif algorithm == "DQN":
        self._input_node = self._sess.graph.get_tensor_by_name(
            f"{policy_name}/observations:0"
        )
        self._output_node = tf.argmax(
            input=self._sess.graph.get_tensor_by_name(
                f"{policy_name}/value_out/BiasAdd:0"
            ),
            axis=1,
        )
    else:
        self._input_node = self._sess.graph.get_tensor_by_name(
            f"{policy_name}/observations:0"
        )
        self._output_node = tf.argmax(
            input=self._sess.graph.get_tensor_by_name(
                f"{policy_name}/fc_out/BiasAdd:0"
            ),
            axis=1,
        )

def __init__(self, load_path, algorithm, policy_name, yaml_path):
    load_path = str(load_path)
    # Each branch must also bind DEFAULT_CONFIG, which is read below; the
    # RLlib package-level imports are assumed for the ppo/pg/dqn branches.
    if algorithm == "ppo":
        from ray.rllib.agents.ppo.ppo_tf_policy import PPOTFPolicy as LoadPolicy
        from ray.rllib.agents.ppo import DEFAULT_CONFIG
    elif algorithm in ["a2c", "a3c"]:
        from ray.rllib.agents.a3c.a3c_tf_policy import A3CTFPolicy as LoadPolicy
        from ray.rllib.agents.a3c import DEFAULT_CONFIG
    elif algorithm == "pg":
        from ray.rllib.agents.pg.pg_tf_policy import PGTFPolicy as LoadPolicy
        from ray.rllib.agents.pg import DEFAULT_CONFIG
    elif algorithm == "dqn":
        from ray.rllib.agents.dqn import DQNTFPolicy as LoadPolicy
        from ray.rllib.agents.dqn import DEFAULT_CONFIG
    elif algorithm == "maac":
        from benchmark.agents.maac.tf_policy import CA2CTFPolicy as LoadPolicy
        from benchmark.agents.maac.tf_policy import DEFAULT_CONFIG
    elif algorithm == "maddpg":
        from benchmark.agents.maddpg.tf_policy import MADDPG2TFPolicy as LoadPolicy
        from benchmark.agents.maddpg.tf_policy import DEFAULT_CONFIG
    elif algorithm == "mfac":
        from benchmark.agents.mfac.tf_policy import MFACTFPolicy as LoadPolicy
        from benchmark.agents.mfac.tf_policy import DEFAULT_CONFIG
    elif algorithm == "networked_pg":
        from benchmark.agents.networked_pg.tf_policy import (
            NetworkedPG as LoadPolicy,
        )
        from benchmark.agents.networked_pg.tf_policy import (
            PG_DEFAULT_CONFIG as DEFAULT_CONFIG,
        )
    else:
        raise ValueError(f"Unsupported algorithm: {algorithm}")

    yaml_path = BASE_DIR / yaml_path
    load_path = BASE_DIR / f"log/results/run/{load_path}"

    config = load_config(yaml_path)
    observation_space = config["policy"][1]
    action_space = config["policy"][2]

    pconfig = DEFAULT_CONFIG
    pconfig["model"].update(config["policy"][-1].get("model", {}))
    pconfig["agent_id"] = policy_name

    self._prep = ModelCatalog.get_preprocessor_for_space(observation_space)
    self._sess = tf.Session(graph=tf.get_default_graph())

    with tf.name_scope(policy_name):
        # The observation space must be flattened before it is passed to the policy.
        flat_obs_space = self._prep.observation_space
        policy = LoadPolicy(flat_obs_space, action_space, pconfig)
        self._sess.run(tf.global_variables_initializer())
        objs = pickle.load(open(load_path, "rb"))
        objs = pickle.loads(objs["worker"])
        state = objs["state"]
        weights = state[policy_name]
        policy.set_weights(weights)

    # for op in tf.get_default_graph().get_operations():
    #     print(str(op.name))

    # These tensor names were found by inspecting the trained model.
    if algorithm == "ppo":
        # CRUCIAL FOR SAFETY:
        # We use Tensor("split") instead of Tensor("add") to force
        # PPO to be deterministic.
        self._input_node = self._sess.graph.get_tensor_by_name(
            f"{policy_name}/observation:0"
        )
        self._output_node = self._sess.graph.get_tensor_by_name(
            f"{policy_name}/split:0"
        )
    elif algorithm == "dqn":
        self._input_node = self._sess.graph.get_tensor_by_name(
            f"{policy_name}/observations:0"
        )
        self._output_node = tf.argmax(
            self._sess.graph.get_tensor_by_name(
                f"{policy_name}/value_out/BiasAdd:0"
            ),
            axis=1,
        )
    elif algorithm == "maac":
        self._input_node = self._sess.graph.get_tensor_by_name(
            f"{policy_name}/policy-inputs:0"
        )
        self._output_node = tf.argmax(
            self._sess.graph.get_tensor_by_name(
                f"{policy_name}/logits_out/BiasAdd:0"
            ),
            axis=1,
        )
    elif algorithm == "maddpg":
        self._input_node = self._sess.graph.get_tensor_by_name(
            f"{policy_name}/obs_2:0"
        )
        self._output_node = tf.argmax(
            self._sess.graph.get_tensor_by_name(
                f"{policy_name}/actor/AGENT_2_actor_RelaxedOneHotCategorical_1/sample/AGENT_2_actor_exp/forward/Exp:0"
            )
        )
    else:
        self._input_node = self._sess.graph.get_tensor_by_name(
            f"{policy_name}/observations:0"
        )
        self._output_node = tf.argmax(
            self._sess.graph.get_tensor_by_name(f"{policy_name}/fc_out/BiasAdd:0"),
            axis=1,
        )

def __init__(self, path_to_model, observation_space):
    self._prep = ModelCatalog.get_preprocessor_for_space(observation_space)
    self._path_to_model = path_to_model

def before_init(policy, observation_space, action_space, config):
    policy.action_noise_std = config["action_noise_std"]
    policy.action_space_struct = get_base_struct_from_space(action_space)
    policy.preprocessor = ModelCatalog.get_preprocessor_for_space(
        observation_space)
    policy.observation_filter = get_filter(config["observation_filter"],
                                           policy.preprocessor.shape)
    policy.single_threaded = config.get("single_threaded", False)

    def _set_flat_weights(policy, theta):
        pos = 0
        theta_dict = policy.model.state_dict()
        new_theta_dict = {}
        for k in sorted(theta_dict.keys()):
            shape = policy.param_shapes[k]
            num_params = int(np.prod(shape))
            new_theta_dict[k] = torch.from_numpy(
                np.reshape(theta[pos:pos + num_params], shape))
            pos += num_params
        policy.model.load_state_dict(new_theta_dict)

    def _get_flat_weights(policy):
        # Get the parameter tensors.
        theta_dict = policy.model.state_dict()
        # Flatten it into a single np.ndarray.
        theta_list = []
        for k in sorted(theta_dict.keys()):
            theta_list.append(torch.reshape(theta_dict[k], (-1, )))
        cat = torch.cat(theta_list, dim=0)
        return cat.cpu().numpy()

    type(policy).set_flat_weights = _set_flat_weights
    type(policy).get_flat_weights = _get_flat_weights

    def _compute_actions(policy, obs_batch, add_noise=False, update=True,
                         **kwargs):
        # Batch is given as list -> Try converting to numpy first.
        if isinstance(obs_batch, list) and len(obs_batch) == 1:
            obs_batch = obs_batch[0]
        observation = policy.preprocessor.transform(obs_batch)
        observation = policy.observation_filter(observation[None],
                                                update=update)
        observation = convert_to_torch_tensor(observation, policy.device)
        dist_inputs, _ = policy.model({SampleBatch.CUR_OBS: observation}, [],
                                      None)
        dist = policy.dist_class(dist_inputs, policy.model)
        action = dist.sample()

        def _add_noise(single_action, single_action_space):
            single_action = single_action.detach().cpu().numpy()
            if add_noise and isinstance(single_action_space, gym.spaces.Box):
                single_action += np.random.randn(*single_action.shape) * \
                    policy.action_noise_std
            return single_action

        action = tree.map_structure(_add_noise, action,
                                    policy.action_space_struct)
        action = unbatch(action)
        return action, [], {}

    def _compute_single_action(policy, observation, add_noise=False,
                               update=True, **kwargs):
        action, state_outs, extra_fetches = policy.compute_actions(
            [observation], add_noise=add_noise, update=update, **kwargs)
        return action[0], state_outs, extra_fetches

    type(policy).compute_actions = _compute_actions
    type(policy).compute_single_action = _compute_single_action

def generate_policies(
    policy_id: str,
    policy_constructor_tuple: Tuple["PolicyClass", "gym.Space", "gym.Space", dict],
    policies: Dict[str, TFPolicy],
    policies_to_train: List[str],
    dead_policies: Set[str],
    policy_config: dict,
    preprocessors: Dict[str, Any],
    obs_filters: Dict[str, Any],
    observation_filter: str,
    tf_sess,
):
    """
    Get policies for each ``agent_id``, and instantiate new ones
    for newly created agents.
    """
    policy_cls, obs_space, act_space, conf = policy_constructor_tuple

    if (policy_id in preprocessors) != (policy_id in policies):
        raise ValueError("'preprocessors' and 'policies' do not agree.")
    if (policy_id in obs_filters) != (policy_id in policies):
        raise ValueError("'obs_filters' and 'policies' do not agree.")

    # If we haven't seen this id, we instantiate a new policy.
    if policy_id not in policies:
        # We assume configs are homogeneous.
        # Use a dead policy for this new agent.
        if dead_policies:
            dead_policy_id = dead_policies.pop()
            dead_preprocessor = preprocessors.pop(dead_policy_id)
            dead_obs_space = dead_preprocessor.observation_space
            dead_policy = policies.pop(dead_policy_id)
            dead_obs_filter = obs_filters.pop(dead_policy_id)

            start = time.time()
            # Run variable initializer ops, assuming tf model.
            trainable_model_variables = dead_policy.model.trainable_variables()
            sess = dead_policy.get_session()
            sess.run([var.initializer for var in trainable_model_variables])

            preprocessors[policy_id] = dead_preprocessor
            policies[policy_id] = dead_policy
            obs_filters[policy_id] = dead_obs_filter
            policies_to_train.append(policy_id)

            # DEBUG
            print("sampler.py: Reinitializing dead model: %fs" % (time.time() - start))
        else:
            merged_conf = merge_dicts(policy_config, conf)

            # We assume ``self.preprocessing_enabled == True`` in ``RolloutWorker``.
            preprocessor = ModelCatalog.get_preprocessor_for_space(
                obs_space, merged_conf.get("model"))
            preprocessors[policy_id] = preprocessor
            obs_space = preprocessor.observation_space

            if tf and tf.executing_eagerly():
                if hasattr(policy_cls, "as_eager"):
                    policy_cls = policy_cls.as_eager()
                    if policy_config["eager_tracing"]:
                        policy_cls = policy_cls.with_tracing()
                elif not issubclass(policy_cls, TFPolicy):
                    pass  # could be some other type of policy
                else:
                    raise ValueError("This policy does not support eager "
                                     "execution: {}".format(policy_cls))
            if tf:
                # TODO: Is this necessary? Yes.
                with tf.variable_scope(policy_id):
                    # DEBUG
                    print("sampler.py: Default graph:", tf.get_default_graph())
                    print("sampler.py: Calling policy init.")
                    start = time.time()
                    policies[policy_id] = policy_cls(obs_space, act_space, merged_conf)
                    # DEBUG
                    print("sampler.py: Done policy init: %fs" % (time.time() - start))
                    policies_to_train.append(policy_id)
            else:
                policies[policy_id] = policy_cls(obs_space, act_space, merged_conf)
                policies_to_train.append(policy_id)

            # DEBUG
            # print("sampler.py: Getting new filter.")
            obs_filters[policy_id] = get_filter(observation_filter, obs_space.shape)
            # DEBUG
            # print("sampler.py: Got new filter.")

    return policies, preprocessors, obs_filters, policies_to_train, dead_policies

AGENT_ID = "Agent-007" env = gym.make( "smarts.env:hiway-v0", scenarios=scenario_paths, agent_specs={AGENT_ID: agent_spec}, # set headless to false if u want to use envision headless=False, visdom=False, seed=args.seed, ) torch.manual_seed(args.seed) np.random.seed(args.seed) preprocessor = ModelCatalog.get_preprocessor_for_space(OBSERVATION_SPACE) state_dim = 0 for val in OBSERVATION_SPACE.spaces.values(): state_dim += val.shape[0] state_dim = (state_dim, ) if type(ACTION_SPACE) == gym.spaces.Discrete: act_dim = ACTION_SPACE.n action_max = 1 else: act_dim = ACTION_SPACE.shape action_max = ACTION_SPACE.high[0] ppo = core.PPO(state_dim, act_dim, action_max,
def __init__(self, observation_space, action_space, model_config):
    self._prep = ModelCatalog.get_preprocessor_for_space(observation_space)
    self.model = TrainingModel(self._prep.observation_space,
                               action_space,
                               num_outputs=3,
                               model_config=model_config,
                               name="Name")

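# Forward-pass sketch (an assumption: `TrainingModel` follows the Torch
# ModelV2 interface, so it is called with an input dict plus RNN state and
# returns (outputs, state)). `flat_obs_batch` is a hypothetical batch of
# already-flattened observations.
def forward(self, flat_obs_batch):
    import torch
    obs = torch.as_tensor(flat_obs_batch, dtype=torch.float32)
    logits, _ = self.model({"obs": obs}, [], None)  # logits: (batch, 3)
    return logits
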