def __init__(self, obs_space, action_space, num_outputs, model_config, name,
             true_obs_shape=(4, ), action_embed_size=6, **kw):
    """Set up the action-embedding sub-network of the parametric model.

    The "true" observation space is built from
    ``model_config['custom_options']`` (spy or blind variant) and the
    embedding network consumes a flat ``Box`` whose length is the summed
    preprocessed size of that space's sub-spaces.

    Note that ``true_obs_shape`` is never read, and the incoming
    ``action_embed_size`` is always replaced: 6 when
    ``custom_options['extended']`` is truthy, 4 otherwise.
    """
    super(ParametricActionsModel, self).__init__(
        obs_space, action_space, num_outputs, model_config, name, **kw)

    custom = model_config['custom_options']
    space_factory = make_spy_space if custom['spy'] else make_blind_space
    true_obs_space = space_factory(custom['parties'], custom['blocks'])
    action_embed_size = 6 if custom['extended'] else 4

    # Flattened input width = sum of the preprocessed sizes of all parts.
    total_dim = sum(
        get_preprocessor(part)(part).size for part in true_obs_space)

    embed_input_space = Box(-1, 1, shape=(total_dim,))
    self.action_embed_model = FullyConnectedNetwork(
        embed_input_space, action_space, action_embed_size, model_config,
        name + "_action_embed")
    self.register_variables(self.action_embed_model.variables())
def get_preprocessor(env, options=None):
    """Create the observation preprocessor for an environment.

    Args:
        env (gym.Env|VectorEnv|ExternalEnv): Environment whose
            ``observation_space`` will be preprocessed.
        options (dict): Preprocessor options. A falsy value (``None`` or
            an empty dict) selects ``MODEL_DEFAULTS``.

    Returns:
        preprocessor (Preprocessor): Preprocessor instance built for
            ``env.observation_space``.

    Raises:
        Exception: If ``options`` has a key that is not in
            ``MODEL_DEFAULTS``.
    """
    options = options or MODEL_DEFAULTS
    for key in options:
        if key not in MODEL_DEFAULTS:
            raise Exception("Unknown config key `{}`, all keys: {}".format(
                key, list(MODEL_DEFAULTS)))

    custom_name = options.get("custom_preprocessor")
    if custom_name:
        logger.info("Using custom preprocessor {}".format(custom_name))
        custom_cls = _global_registry.get(RLLIB_PREPROCESSOR, custom_name)
        prep = custom_cls(env.observation_space, options)
    else:
        # NOTE(review): this one-argument call presumably resolves to a
        # space-level helper defined elsewhere, not to this definition --
        # confirm against the enclosing scope.
        default_cls = get_preprocessor(env.observation_space)
        prep = default_cls(env.observation_space, options)

    logger.debug("Created preprocessor {}: {} -> {}".format(
        prep, env.observation_space, prep.shape))
    return prep
def simulate_env_interaction(env, restart=True) -> SampleBatch:
    """Step through every trajectory of ``env`` and yield per-episode batches.

    This is a generator. For each name in ``env.trajectory_names`` the env
    is reset and stepped (with a sampled action) until ``done``; every step
    is appended to a ``SampleBatchBuilder`` and the built batch is yielded
    once the episode ends. The action stored in each row is the one
    reported in ``info["action"]``, passed through the ``reverse_action``
    hooks of all ``gym.ActionWrapper`` layers around the env.

    Args:
        env: Wrapped environment exposing ``trajectory_names``.
        restart: While truthy, the trajectory list is replayed again and
            again. When falsy the outer loop never runs, so nothing is
            yielded.

    Yields:
        The result of ``build_and_reset()`` after each finished episode.
    """
    prep = get_preprocessor(env.observation_space)(env.observation_space)
    builder = SampleBatchBuilder()

    # Collect the reverse_action hook of every ActionWrapper in the
    # wrapper chain, outermost wrapper first.
    undo_fns = []
    layer = env
    while hasattr(layer, "env"):
        if isinstance(layer, gym.ActionWrapper):
            undo_fns.append(layer.reverse_action)
        layer = layer.env

    def reverse_action(action):
        # Apply the collected hooks in reverse order (innermost first).
        for undo in reversed(undo_fns):
            action = undo(action)
        return action

    while restart:
        for eps_id, trajectory_name in enumerate(env.trajectory_names):
            t = 0
            prev_action = None
            prev_reward = 0
            done = False
            try:
                obs = env.reset()
            except TypeError:
                # reset() raised TypeError: skip to the next trajectory.
                continue

            while not done:
                next_obs, reward, done, info = env.step(
                    env.action_space.sample())
                action = reverse_action(info["action"])
                if prev_action is None:
                    # First step of the episode: use an all-zero action.
                    prev_action = np.zeros_like(action)

                builder.add_values(**{
                    "t": t,
                    SampleBatch.EPS_ID: eps_id,
                    SampleBatch.AGENT_INDEX: eps_id,
                    SampleBatch.OBS: prep.transform(obs),
                    SampleBatch.ACTIONS: action,
                    SampleBatch.ACTION_PROB: 1.0,
                    SampleBatch.ACTION_LOGP: 0,
                    SampleBatch.ACTION_DIST_INPUTS: 0,
                    SampleBatch.REWARDS: reward,
                    SampleBatch.PREV_ACTIONS: prev_action,
                    SampleBatch.PREV_REWARDS: prev_reward,
                    SampleBatch.DONES: done,
                    SampleBatch.INFOS: {
                        "trajectory_name": trajectory_name
                    },
                    SampleBatch.NEXT_OBS: prep.transform(next_obs),
                })
                obs, prev_action, prev_reward = next_obs, action, reward
                t += 1

            yield builder.build_and_reset()
def test_one_hot_preprocessor(self):
    """Check one-hot encoding of Discrete and MultiDiscrete spaces.

    Uses ``assertIsInstance`` / ``assertEqual`` so that a failure reports
    the actual type or shape instead of a bare "False is not true".
    """
    # Discrete(5): a single one-hot vector of length 5.
    space = Discrete(5)
    pp = get_preprocessor(space)(space)
    self.assertIsInstance(pp, OneHotPreprocessor)
    self.assertEqual(pp.shape, (5, ))
    check(pp.transform(3), [0.0, 0.0, 0.0, 1.0, 0.0])
    check(pp.transform(0), [1.0, 0.0, 0.0, 0.0, 0.0])

    # MultiDiscrete([2, 3, 4]): concatenated one-hots, 2 + 3 + 4 = 9 wide.
    space = MultiDiscrete([2, 3, 4])
    pp = get_preprocessor(space)(space)
    self.assertIsInstance(pp, OneHotPreprocessor)
    self.assertEqual(pp.shape, (9, ))
    check(pp.transform(np.array([1, 2, 0])),
          [0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0])
    check(pp.transform(np.array([0, 1, 3])),
          [1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0])
def __init__(self, obs_space, action_space, num_outputs, model_config, name,
             fc_size=64, lstm_state_size=256):
    """Create the fc -> LSTM -> (action head, value head) layers.

    Args:
        obs_space: Observation space; its preprocessed size is the input
            width of the first linear layer.
        action_space: Forwarded to the parent constructor.
        num_outputs: Width of the action head.
        model_config: Forwarded to the parent constructor.
        name: Forwarded to the parent constructor.
        fc_size: Width of the first linear layer (and LSTM input).
        lstm_state_size: Hidden size of the LSTM.
    """
    nn.Module.__init__(self)
    super().__init__(obs_space, action_space, num_outputs, model_config,
                     name)

    obs_preprocessor = get_preprocessor(obs_space)(obs_space)
    self.obs_size = obs_preprocessor.size
    self.fc_size = fc_size
    self.lstm_state_size = lstm_state_size

    # Layer stack: one linear layer, an LSTM, then two linear heads
    # (action logits and scalar value).
    self.fc1 = nn.Linear(self.obs_size, self.fc_size)
    self.lstm = nn.LSTM(
        self.fc_size, self.lstm_state_size, batch_first=True)
    self.action_branch = nn.Linear(self.lstm_state_size, num_outputs)
    self.value_branch = nn.Linear(self.lstm_state_size, 1)

    # Most recent "base" output (before the logits layer); unset for now.
    self._features = None
def get_preprocessor(registry, env, options=None):
    """Returns a suitable processor for the given environment.

    Args:
        registry (obj): Registry of named objects (ray.tune.registry).
        env (gym.Env): The gym environment to preprocess.
        options (dict): Options to pass to the preprocessor. ``None``
            (the default) means "no options".

    Returns:
        preprocessor (Preprocessor): Preprocessor for the env observations.

    Raises:
        Exception: If ``options`` contains a key that is not in
            ``MODEL_CONFIGS``.
    """
    # Use a fresh dict per call instead of a shared mutable default:
    # ``options`` is handed to the preprocessor constructor, so a shared
    # default object could leak state between unrelated calls.
    if options is None:
        options = {}

    for k in options:
        if k not in MODEL_CONFIGS:
            raise Exception("Unknown config key `{}`, all keys: {}".format(
                k, MODEL_CONFIGS))

    if "custom_preprocessor" in options:
        preprocessor = options["custom_preprocessor"]
        print("Using custom preprocessor {}".format(preprocessor))
        return registry.get(RLLIB_PREPROCESSOR,
                            preprocessor)(env.observation_space, options)

    # NOTE(review): this one-argument call presumably resolves to a
    # space-level helper defined elsewhere, not to this definition --
    # confirm against the enclosing scope.
    preprocessor = get_preprocessor(env.observation_space)
    return preprocessor(env.observation_space, options)
def restore_policy_from_checkpoint(
        policy_class: type,
        env_creator: Callable[[Dict[str, Any]], gym.Env],
        checkpoint_path: str,
        config: Dict[str, Any]) -> Policy:
    """Instantiate a policy and load its state from a checkpoint file.

    The checkpoint is unpickled, the state stored under
    ``['worker'] -> ['state']['default_policy']`` is extracted, and a new
    ``policy_class`` instance is created on the preprocessed observation
    space of a temporary environment before that state is applied.

    Args:
        policy_class: Policy type, called as
            ``policy_class(observation_space, action_space, config)``.
        env_creator: Factory that receives ``config["env_config"]`` (or an
            empty dict) and returns the environment.
        checkpoint_path: Path of the pickled checkpoint.
        config: Configuration passed to the policy constructor.

    Returns:
        The policy with the checkpointed state loaded via ``set_state``.
    """
    # SECURITY: pickle can execute arbitrary code while loading. Only use
    # this function on checkpoints from a trusted source.
    with open(checkpoint_path, "rb") as checkpoint_dump:
        checkpoint_state = pickle.load(checkpoint_dump)
    worker_state = pickle.loads(checkpoint_state['worker'])
    policy_state = worker_state['state']['default_policy']

    # The environment is only needed for its spaces; close it as soon as
    # they have been read so it does not leak resources.
    env = env_creator(config.get("env_config", {}))
    try:
        raw_observation_space = env.observation_space
        action_space = env.action_space
    finally:
        env.close()

    # Get preprocessed observation space
    preprocessor_class = get_preprocessor(raw_observation_space)
    preprocessor = preprocessor_class(raw_observation_space)
    observation_space = preprocessor.observation_space

    # Instantiate policy and load checkpoint state
    policy = policy_class(observation_space, action_space, config)
    policy.set_state(policy_state)
    return policy
def _unpack_obs(obs, space):
    """Unpack a flattened Dict or Tuple observation tensor.

    Args:
        obs: The flattened observation tensor, expected to have two
            dimensions with the second equal to the preprocessor's
            flattened size.
        space: The original space prior to flattening.

    Returns:
        ``obs`` unchanged when ``space`` is neither Dict nor Tuple;
        otherwise a list (Tuple) or OrderedDict (Dict) of recursively
        unpacked, reshaped slices.

    Raises:
        ValueError: If ``obs`` does not have the expected flattened shape.
    """
    if not isinstance(space, (gym.spaces.Dict, gym.spaces.Tuple)):
        return obs

    prep = get_preprocessor(space)(space)
    if len(obs.shape) != 2 or obs.shape[1] != prep.shape[0]:
        raise ValueError(
            "Expected flattened obs shape of [None, {}], got {}".format(
                prep.shape[0], obs.shape))
    # Report both lengths on failure so a mismatch can be diagnosed.
    assert len(prep.preprocessors) == len(space.spaces), \
        (len(prep.preprocessors), len(space.spaces))

    # Walk the flat vector left to right, cutting one slice per sub-space.
    offset = 0
    if isinstance(space, gym.spaces.Tuple):
        u = []
        for p, v in zip(prep.preprocessors, space.spaces):
            obs_slice = obs[:, offset:offset + p.size]
            offset += p.size
            u.append(
                _unpack_obs(
                    tf.reshape(obs_slice, [-1] + list(p.shape)), v))
    else:
        u = OrderedDict()
        for p, (k, v) in zip(prep.preprocessors, space.spaces.items()):
            obs_slice = obs[:, offset:offset + p.size]
            offset += p.size
            u[k] = _unpack_obs(
                tf.reshape(obs_slice, [-1] + list(p.shape)), v)
    return u
def get_preprocessor(env, options=None):
    """Create the observation preprocessor for an environment.

    Args:
        env (gym.Env|VectorEnv|ServingEnv): Environment whose
            ``observation_space`` will be preprocessed.
        options (dict): Preprocessor options. A falsy value (``None`` or
            an empty dict) selects ``MODEL_DEFAULTS``.

    Returns:
        preprocessor (Preprocessor): Preprocessor instance built for
            ``env.observation_space``.

    Raises:
        Exception: If ``options`` has a key that is not in
            ``MODEL_DEFAULTS``.
    """
    options = options or MODEL_DEFAULTS
    for key in options:
        if key not in MODEL_DEFAULTS:
            raise Exception("Unknown config key `{}`, all keys: {}".format(
                key, list(MODEL_DEFAULTS)))

    custom_name = options.get("custom_preprocessor")
    if not custom_name:
        # NOTE(review): this one-argument call presumably resolves to a
        # space-level helper defined elsewhere, not to this definition --
        # confirm against the enclosing scope.
        default_cls = get_preprocessor(env.observation_space)
        return default_cls(env.observation_space, options)

    print("Using custom preprocessor {}".format(custom_name))
    custom_cls = _global_registry.get(RLLIB_PREPROCESSOR, custom_name)
    return custom_cls(env.observation_space, options)
def _make_continuous_space(space):
    """Return a Box-style counterpart of ``space``.

    A ``Box`` is returned as is. A ``Discrete(n)`` becomes a ``Box`` of
    length ``n`` bounded by zeros and ones. Any other space is replaced by
    the ``observation_space`` of its preprocessor.
    """
    if isinstance(space, gym.spaces.Box):
        return space

    if isinstance(space, gym.spaces.Discrete):
        dims = (space.n,)
        return gym.spaces.Box(low=np.zeros(dims), high=np.ones(dims))

    preprocessor = get_preprocessor(space)(space)
    return preprocessor.observation_space
def get_preprocessor_for_space(observation_space, options=None):
    """Create a preprocessor instance for an observation space.

    Args:
        observation_space (Space): The input observation space.
        options (dict): Preprocessor options. A falsy value (``None`` or
            an empty dict) selects ``MODEL_DEFAULTS``.

    Returns:
        preprocessor (Preprocessor): Either the registered custom
            preprocessor named by ``options["custom_preprocessor"]`` or
            the class returned by ``get_preprocessor``, instantiated with
            ``(observation_space, options)``.

    Raises:
        Exception: If ``options`` has a key that is not in
            ``MODEL_DEFAULTS``.
    """
    options = options or MODEL_DEFAULTS
    for key in options:
        if key not in MODEL_DEFAULTS:
            raise Exception("Unknown config key `{}`, all keys: {}".format(
                key, list(MODEL_DEFAULTS)))

    custom_name = options.get("custom_preprocessor")
    if custom_name:
        logger.info("Using custom preprocessor {}".format(custom_name))
        prep_cls = _global_registry.get(RLLIB_PREPROCESSOR, custom_name)
    else:
        prep_cls = get_preprocessor(observation_space)
    prep = prep_cls(observation_space, options)

    logger.debug("Created preprocessor {}: {} -> {}".format(
        prep, observation_space, prep.shape))
    return prep
def render_q_function(env, agent):
    """Render the policy's Q-values for a fixed zero action as an image.

    Every observation returned by
    ``env.default_env.get_observation_array()`` is preprocessed and paired
    with the action ``[0, 0]``; the resulting Q-values are reshaped to
    ``(nx, ny)``, blurred, clipped to ``[-0.6, 1]``, inverted, mapped
    through ``viridis`` and returned as a 3-channel ``uint8`` array with
    the channel order reversed. Timing information is printed on the way.
    """
    action = np.array([0,0])
    prep = get_preprocessor(env.observation_space)(env.observation_space)

    start = time.time()
    observation, nx, ny = env.default_env.get_observation_array()
    print("Got %i observations in %.3f seconds"%(len(observation),time.time()-start))

    # One batch-of-one entry per observation, always with the same action.
    act_t = [np.expand_dims(action, axis=0) for _ in observation]
    obs_t = [np.expand_dims(prep.transform(sample), axis=0)
             for sample in observation]
    print("Prep took %.3f seconds"%(time.time()-start))

    q, _ = agent.get_policy().compute_q(obs_t, act_t)
    q_img = np.reshape(q, (nx,ny,1))
    print("Policy took %.3f seconds"%(time.time()-start))

    # Blur on a 3-channel copy, then collapse back to one channel.
    q_img = cv2.blur(np.tile(q_img, (1,1,3)), (5,5))
    q_img = np.mean(q_img, axis=-1)

    # Clip to [-0.6, 1], rescale to [0, 1] and invert before colour-mapping.
    q_img = 1-(np.clip(q_img, -0.6, 1)+0.6)/1.6
    q_img = (255*viridis(q_img)).astype(np.uint8)

    q_img = q_img[:,:,:3]  # Drop the alpha channel
    q_img = q_img[:,:,::-1]  # Reverse the channel order
    return q_img
def get_preprocessor(registry, env, options=None):
    """Returns a suitable processor for the given environment.

    Args:
        registry (obj): Registry of named objects (ray.tune.registry).
        env (gym.Env): The gym environment to preprocess.
        options (dict): Options to pass to the preprocessor. ``None``
            (the default) means "no options".

    Returns:
        preprocessor (Preprocessor): Preprocessor for the env observations.

    Raises:
        Exception: If ``options`` contains a key that is not in
            ``MODEL_CONFIGS``.
    """
    # Use a fresh dict per call instead of a shared mutable default:
    # ``options`` is handed to the preprocessor constructor, so a shared
    # default object could leak state between unrelated calls.
    if options is None:
        options = {}

    for k in options:
        if k not in MODEL_CONFIGS:
            raise Exception(
                "Unknown config key `{}`, all keys: {}".format(
                    k, MODEL_CONFIGS))

    if "custom_preprocessor" in options:
        preprocessor = options["custom_preprocessor"]
        print("Using custom preprocessor {}".format(preprocessor))
        return registry.get(RLLIB_PREPROCESSOR, preprocessor)(
            env.observation_space, options)

    # NOTE(review): this one-argument call presumably resolves to a
    # space-level helper defined elsewhere, not to this definition --
    # confirm against the enclosing scope.
    preprocessor = get_preprocessor(env.observation_space)
    return preprocessor(env.observation_space, options)
def get_preprocessor_for_space(observation_space, options=None):
    """Create a preprocessor instance for an observation space.

    Args:
        observation_space (Space): The input observation space.
        options (dict): Preprocessor options. A falsy value (``None`` or
            an empty dict) selects ``MODEL_DEFAULTS``.

    Returns:
        preprocessor (Preprocessor): Either the registered custom
            preprocessor named by ``options["custom_preprocessor"]``
            (a deprecation warning is logged in that case) or the class
            returned by ``get_preprocessor``, instantiated with
            ``(observation_space, options)``.

    Raises:
        Exception: If ``options`` has a key that is not in
            ``MODEL_DEFAULTS``.
    """
    options = options or MODEL_DEFAULTS
    for key in options:
        if key not in MODEL_DEFAULTS:
            raise Exception("Unknown config key `{}`, all keys: {}".format(
                key, list(MODEL_DEFAULTS)))

    custom_name = options.get("custom_preprocessor")
    if custom_name:
        logger.info("Using custom preprocessor {}".format(custom_name))
        logger.warning(
            "DeprecationWarning: Custom preprocessors are deprecated, "
            "since they sometimes conflict with the built-in "
            "preprocessors for handling complex observation spaces. "
            "Please use wrapper classes around your environment "
            "instead of preprocessors.")
        prep_cls = _global_registry.get(RLLIB_PREPROCESSOR, custom_name)
    else:
        prep_cls = get_preprocessor(observation_space)
    prep = prep_cls(observation_space, options)

    logger.debug("Created preprocessor {}: {} -> {}".format(
        prep, observation_space, prep.shape))
    return prep
def __init__(self, obs_space, action_space, num_outputs, model_config, name,
             true_obs_shape=(24, ), action_embed_size=None):
    """Set up the action-embedding network for the parametric model.

    The embedding network reads a flat ``Box`` whose length is the
    preprocessed size of ``obs_space.original_space['real_obs']``; the
    value passed as ``true_obs_shape`` is not used. When
    ``action_embed_size`` is ``None`` it defaults to ``action_space.n``.
    """
    super(ParametricActionsModel, self).__init__(obs_space, action_space,
                                                 num_outputs, model_config,
                                                 name)
    if action_embed_size is None:
        # Relies on ``action_space.n``, i.e. a Discrete() action space.
        action_embed_size = action_space.n

    # Ask the preprocessor chosen for the real_obs space how large its
    # flattened output is (an integer).
    real_obs_space = obs_space.original_space['real_obs']
    flat_obs_size = get_preprocessor(real_obs_space)(real_obs_space).size

    self.action_embed_model = FullyConnectedNetwork(
        obs_space=Box(-1, 1, shape=(flat_obs_size, )),
        action_space=action_space,
        num_outputs=action_embed_size,
        model_config=model_config,
        name=name + "_action_embed")
    self.base_model = self.action_embed_model.base_model
    self.register_variables(self.action_embed_model.variables())
def __init__(self, obs_space, action_space, num_outputs, model_config, name,
             fc_size=64, lstm_state_size=256):
    """Create the fc -> LSTM -> (action head, value head) layers.

    Also registers two view requirements, ``state_in_0`` and
    ``state_in_1``, each fed from the matching ``state_out_*`` column
    shifted by -1.

    Args:
        obs_space: Observation space; its preprocessed size is the input
            width of the first linear layer.
        action_space: Forwarded to the parent constructor.
        num_outputs: Width of the action head.
        model_config: Forwarded to the parent constructor.
        name: Forwarded to the parent constructor.
        fc_size: Width of the first linear layer (and LSTM input).
        lstm_state_size: Hidden size of the LSTM.
    """
    nn.Module.__init__(self)
    super().__init__(obs_space, action_space, num_outputs, model_config,
                     name)

    obs_preprocessor = get_preprocessor(obs_space)(obs_space)
    self.obs_size = obs_preprocessor.size
    self.fc_size = fc_size
    self.lstm_state_size = lstm_state_size

    # Layer stack: one linear layer, an LSTM, then two linear heads
    # (action logits and scalar value).
    self.fc1 = nn.Linear(self.obs_size, self.fc_size)
    self.lstm = nn.LSTM(
        self.fc_size, self.lstm_state_size, batch_first=True)
    self.action_branch = nn.Linear(self.lstm_state_size, num_outputs)
    self.value_branch = nn.Linear(self.lstm_state_size, 1)

    # Most recent "base" output (before the logits layer); unset for now.
    self._features = None

    # Expose both recurrent state inputs in this model's view.
    for i in range(2):
        requirement = ViewRequirement(
            "state_out_{}".format(i),
            shift=-1,
            space=Box(-1.0, 1.0, shape=(self.lstm_state_size,)))
        self.inference_view_requirements["state_in_{}".format(i)] = \
            requirement
def test_preprocessor(env):
    """Step ``env`` once and print the preprocessed observation length.

    Every robot index in ``env.default_env.base.robots`` receives the same
    action ``[0.2, -0.5]``; the observation stored under key/index ``0``
    is transformed and the length of the result is printed.
    """
    obs_space = env.observation_space
    preprocessor = get_preprocessor(obs_space)(env.observation_space)

    robot_count = len(env.default_env.base.robots)
    action = {robot_idx: [0.2, -0.5] for robot_idx in range(robot_count)}

    obs, _reward, _done, _info = env.step(action)
    flat_obs = preprocessor.transform(obs[0])
    print(len(flat_obs))
def test_nested_multidiscrete_one_hot_preprocessor(self):
    """Check one-hot encoding of a MultiDiscrete nested inside a Tuple.

    Uses ``assertEqual`` so that a failure reports the actual shape
    instead of a bare "False is not true".
    """
    space = Tuple((MultiDiscrete([2, 3, 4]), ))
    pp = get_preprocessor(space)(space)
    # Concatenated one-hots: 2 + 3 + 4 = 9 entries.
    self.assertEqual(pp.shape, (9, ))
    check(pp.transform((np.array([1, 2, 0]), )),
          [0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0])
    check(pp.transform((np.array([0, 1, 3]), )),
          [1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0])
def __init__(
    self,
    obs_space: spaces.Space,
    action_space: spaces.Space,
    num_outputs: int,
    model_config: ModelConfigDict,
    name: str,
):
    """Build the actor and the centralized critic sub-models.

    Settings are read from ``model_config["custom_model_config"]``:
    ``agent_number``, ``critic_mode``, ``action_model`` and
    ``value_model``. With ``critic_mode == "mean"`` the critic input is a
    Dict of ``own_obs`` / ``own_act`` / ``oppo_act``; otherwise it holds
    one observation entry and one action entry per agent.
    """
    super(CentralizedActorCriticModel,
          self).__init__(obs_space, action_space, num_outputs, model_config,
                         name)

    custom_config = model_config["custom_model_config"]
    self.n_agents = custom_config["agent_number"]

    # Assemble the critic's input fields in a fixed insertion order.
    critic_fields = OrderedDict()
    if custom_config["critic_mode"] == "mean":
        critic_fields["own_obs"] = self.obs_space
        critic_fields["own_act"] = self.action_space
        critic_fields["oppo_act"] = self.action_space
    else:
        for i in range(self.n_agents):
            critic_fields[f"AGENT-{i}"] = self.obs_space
        for i in range(self.n_agents):
            critic_fields[f"AGENT-{i}-action"] = self.action_space
    self.critic_obs = spaces.Dict(critic_fields)

    # One preprocessor each for critic input, observations and actions.
    self.critic_preprocessor = get_preprocessor(self.critic_obs)(
        self.critic_obs)
    self.obs_preprocessor = get_preprocessor(self.obs_space)(
        self.obs_space)
    self.act_preprocessor = get_preprocessor(self.action_space)(
        self.action_space)

    self.action_model = self._build_action_model(
        custom_config["action_model"])
    self.value_model = self._build_value_model(custom_config["value_model"])

    self.register_variables(self.action_model.variables)
    self.register_variables(self.value_model.variables)
def __init__(self, ioctx: IOContext = None):
    """Open the MineRL dataset and prepare the episode generators.

    Reads ``ioctx.input_config`` and ``ioctx.config`` to create the MineRL
    data pipeline, a batch iterator over it, a wrapped ``MinerRLDataEnv``
    with its episode generator and preprocessor, and the lists of
    observation / action / reverse-action / reward hooks found in the
    env's wrapper chain.

    NOTE(review): ``ioctx`` defaults to ``None`` but is dereferenced
    unconditionally, so a context must be supplied in practice.
    """
    super().__init__()
    print("Input reader initialization success!")
    import minerl
    patch_data_pipeline()

    input_config = ioctx.input_config
    env_name = ioctx.config.get("env")
    env_config = ioctx.config.get("env_config", {})

    # The MINERL_DATA_ROOT environment variable takes precedence over the
    # configured data_dir.
    data_dir = os.getenv("MINERL_DATA_ROOT",
                         input_config.get("data_dir", "data"))
    self.data = minerl.data.make(
        env_name,
        data_dir=data_dir,
        num_workers=input_config.get("num_workers", 4),
        worker_batch_size=input_config.get("worker_batch_size", 32),
        minimum_size_to_dequeue=input_config.get("minimum_size_to_dequeue",
                                                 32),
        force_download=input_config.get("force_download", False),
    )

    self.load_complete_episodes = input_config.get(
        "load_complete_episodes", True)
    self.generator = self.data.batch_iter(
        input_config.get("batch_size", 1),
        input_config.get("seq_len", 32),
        num_epochs=input_config.get("num_epochs", -1),
        preload_buffer_size=input_config.get("preload_buffer_size", 2),
        seed=input_config.get("seed", None),
    )

    env = wrap_env(MinerRLDataEnv(self.data), env_config, env_name)
    self.episode_generator = simulate_env_interaction(env)
    self.prep = get_preprocessor(env.observation_space)(
        env.observation_space)

    # Record the hooks of every wrapper in the chain, outermost first.
    self.obs_fns = []
    self.action_fns = []
    self.reverse_action_fns = []
    self.reward_fns = []
    layer = env
    while hasattr(layer, "env"):
        if isinstance(layer, gym.ObservationWrapper):
            self.obs_fns.append(layer.observation)
        if isinstance(layer, gym.ActionWrapper):
            self.action_fns.append(layer.action)
            self.reverse_action_fns.append(layer.reverse_action)
        if isinstance(layer, gym.RewardWrapper):
            self.reward_fns.append(layer.reward)
        layer = layer.env
def _init_shape(self, obs_space, options):
    """Create one sub-preprocessor per Dict entry and return the flat shape.

    The sub-spaces are taken from ``self._obs_space`` (which must be a
    ``spaces.Dict``) and the sub-preprocessors receive ``self._options``;
    the ``obs_space`` and ``options`` arguments are only logged.

    Returns:
        A 1-tuple holding the summed size of all sub-preprocessors.
    """
    logger.debug('obs_space:%s, options:%s' % (obs_space, options))
    assert isinstance(self._obs_space, spaces.Dict)

    self.preprocessors = []
    total_size = 0
    for sub_space in self._obs_space.spaces.values():
        logger.debug("Creating sub-preprocessor for {}".format(sub_space))
        sub_prep = get_preprocessor(sub_space)(sub_space, self._options)
        self.preprocessors.append(sub_prep)
        total_size += sub_prep.size
    return (total_size, )
def test_dict_flattening_preprocessor(self):
    """Check flattening of a Dict space that contains a nested Tuple.

    Uses ``assertIsInstance`` so that a failure reports the actual
    preprocessor type instead of a bare "False is not true".
    """
    space = Dict({
        "a": Discrete(2),
        "b": Tuple([Discrete(3), Box(-1.0, 1.0, (4, ))]),
    })
    pp = get_preprocessor(space)(space)
    self.assertIsInstance(pp, DictFlatteningPreprocessor)
    # 2 (one-hot "a") + 3 (one-hot Discrete) + 4 (Box) = 9 entries.
    self.assertEqual(pp.shape, (9, ))
    check(
        pp.transform({
            "a": 1,
            "b": (1, np.array([0.0, -0.5, 0.1, 0.6]))
        }), [0.0, 1.0, 0.0, 1.0, 0.0, 0.0, -0.5, 0.1, 0.6])
def __init__(self, ctx: ConnectorContext):
    """Create the preprocessor for the original observation space.

    ``ctx.observation_space`` is the space the Policy deals with. When it
    carries an ``original_space`` attribute, that space is the one whose
    data has to be preprocessed here; otherwise the context's space is
    used directly. Model options come from ``ctx.config["model"]``
    (empty dict when absent).
    """
    super().__init__(ctx)

    policy_space = ctx.observation_space
    obs_space = getattr(policy_space, "original_space", policy_space)

    model_options = ctx.config.get("model", {})
    self._preprocessor = get_preprocessor(obs_space)(obs_space,
                                                     model_options)
def __init__(
    self, obs_space, action_space, num_outputs, model_config, name, **kwargs
):
    """Build the actor network and the centralized value network.

    The critic input is a Dict with one observation entry and one action
    entry for each of four agents (the count is fixed in the body). The
    value network is a ``FullyConnectedNetwork`` over a Box with the
    critic preprocessor's shape.

    Note: ``model_config["custom_model_config"]`` is overwritten in place
    with an empty dict before the inner networks are created.
    """
    super(CCModel, self).__init__(
        obs_space, action_space, num_outputs, model_config, name
    )

    agent_number = 4
    # Observation entries first, then action entries.
    critic_fields = {}
    for i in range(agent_number):
        critic_fields[f"AGENT-{i}"] = obs_space
    for i in range(agent_number):
        critic_fields[f"AGENT-{i}-action"] = action_space
    critic_obs = gym.spaces.Dict(critic_fields)

    self.critic_preprocessor = get_preprocessor(critic_obs)(critic_obs)
    self.obs_preprocessor = get_preprocessor(obs_space)(obs_space)
    self.act_preprocessor = get_preprocessor(action_space)(action_space)

    model_config["custom_model_config"] = {}

    # Inner networks.
    self.action_model = DictCNN(
        obs_space,
        action_space,
        num_outputs,
        model_config,
        name + "_action",
        **kwargs,
    )
    critic_input_space = gym.spaces.Box(
        low=-1e10, high=1e10, shape=self.critic_preprocessor.shape)
    self.value_model = FullyConnectedNetwork(
        critic_input_space,
        action_space,
        1,
        model_config,
        name + "_vf",
    )

    self.register_variables(self.action_model.variables())
    self.register_variables(self.value_model.variables())
def __init__(self, obs_space, action_space, num_outputs, model_config,
             name):
    """Initialise both base classes and the model's placeholders.

    A preprocessor is created for ``obs_space.original_space``; the
    shared, actor and critic layers and the cached value output all
    start out as ``None``.
    """
    TorchModelV2.__init__(self, obs_space, action_space, num_outputs,
                          model_config, name)
    nn.Module.__init__(self)

    original_space = obs_space.original_space
    self.preprocessor = get_preprocessor(original_space)(
        obs_space.original_space)

    # Layers and value output are not created here.
    self.shared_layers = None
    self.actor_layers = None
    self.critic_layers = None
    self._value_out = None
def _unpack_obs(obs, space, tensorlib=tf):
    """Unpack a flattened Dict or Tuple observation array/tensor.

    Arguments:
        obs: The flattened observation tensor, expected to have two
            dimensions with the second equal to the preprocessor's
            flattened size.
        space: The original space prior to flattening
        tensorlib: The library used to unflatten (reshape) the array/tensor

    Returns:
        ``obs`` unchanged when ``space`` is neither Dict nor Tuple;
        otherwise a list (Tuple) or OrderedDict (Dict) of recursively
        unpacked, reshaped slices.

    Raises:
        ValueError: If ``obs`` does not have the expected flattened shape.
    """
    if not isinstance(space, (gym.spaces.Dict, gym.spaces.Tuple)):
        return obs

    # NOTE(review): the cache is keyed by id(space). CPython may reuse an
    # id once the original object is garbage-collected, so a stale entry
    # could in principle be returned for a different space -- confirm
    # that cached spaces outlive the cache.
    cache_key = id(space)
    if cache_key in _cache:
        prep = _cache[cache_key]
    else:
        prep = get_preprocessor(space)(space)
        # Make an attempt to cache the result, if enough space left.
        if len(_cache) < 999:
            _cache[cache_key] = prep

    if len(obs.shape) != 2 or obs.shape[1] != prep.shape[0]:
        raise ValueError(
            "Expected flattened obs shape of [None, {}], got {}".format(
                prep.shape[0], obs.shape))
    # Report both lengths on failure so a mismatch can be diagnosed.
    assert len(prep.preprocessors) == len(space.spaces), \
        (len(prep.preprocessors), len(space.spaces))

    # Walk the flat vector left to right, cutting one slice per sub-space.
    offset = 0
    if isinstance(space, gym.spaces.Tuple):
        u = []
        for p, v in zip(prep.preprocessors, space.spaces):
            obs_slice = obs[:, offset:offset + p.size]
            offset += p.size
            u.append(
                _unpack_obs(tensorlib.reshape(obs_slice,
                                              [-1] + list(p.shape)),
                            v,
                            tensorlib=tensorlib))
    else:
        u = OrderedDict()
        for p, (k, v) in zip(prep.preprocessors, space.spaces.items()):
            obs_slice = obs[:, offset:offset + p.size]
            offset += p.size
            u[k] = _unpack_obs(tensorlib.reshape(obs_slice,
                                                 [-1] + list(p.shape)),
                               v,
                               tensorlib=tensorlib)
    return u
def _unpack_obs(obs, space, tensorlib=tf):
    """Unpack a flattened Dict or Tuple observation array/tensor.

    Arguments:
        obs: The flattened observation tensor, expected to have two
            dimensions with the second equal to the preprocessor's
            flattened size.
        space: The original space prior to flattening
        tensorlib: The library used to unflatten (reshape) the array/tensor

    Returns:
        ``obs`` unchanged when ``space`` is neither Dict nor Tuple;
        otherwise a list (Tuple) or OrderedDict (Dict) of recursively
        unpacked, reshaped slices.

    Raises:
        ValueError: If ``obs`` does not have the expected flattened shape.
    """
    if not isinstance(space, (gym.spaces.Dict, gym.spaces.Tuple)):
        return obs

    prep = get_preprocessor(space)(space)
    if len(obs.shape) != 2 or obs.shape[1] != prep.shape[0]:
        raise ValueError(
            "Expected flattened obs shape of [None, {}], got {}".format(
                prep.shape[0], obs.shape))
    # Report both lengths on failure so a mismatch can be diagnosed.
    assert len(prep.preprocessors) == len(space.spaces), \
        (len(prep.preprocessors), len(space.spaces))

    # Walk the flat vector left to right, cutting one slice per sub-space.
    offset = 0
    if isinstance(space, gym.spaces.Tuple):
        u = []
        for p, v in zip(prep.preprocessors, space.spaces):
            obs_slice = obs[:, offset:offset + p.size]
            offset += p.size
            u.append(
                _unpack_obs(
                    tensorlib.reshape(obs_slice, [-1] + list(p.shape)),
                    v,
                    tensorlib=tensorlib))
    else:
        u = OrderedDict()
        for p, (k, v) in zip(prep.preprocessors, space.spaces.items()):
            obs_slice = obs[:, offset:offset + p.size]
            offset += p.size
            u[k] = _unpack_obs(
                tensorlib.reshape(obs_slice, [-1] + list(p.shape)),
                v,
                tensorlib=tensorlib)
    return u
def __init__(self, obs_space, action_space, num_outputs, model_config, name,
             fc_size=64, lstm_state_size=64):
    """Create the fc -> LSTM -> (action head, value head) layers.

    Args:
        obs_space: Observation space; its preprocessed size is the input
            width of the first linear layer.
        action_space: Forwarded to the parent constructor.
        num_outputs: Width of the action head.
        model_config: Forwarded to the parent constructor.
        name: Forwarded to the parent constructor.
        fc_size: Width of the first linear layer (and LSTM input).
        lstm_state_size: Hidden size of the LSTM.
    """
    super().__init__(obs_space, action_space, num_outputs, model_config,
                     name)

    obs_preprocessor = get_preprocessor(obs_space)(obs_space)
    self.obs_size = obs_preprocessor.size
    self.fc_size = fc_size
    self.lstm_state_size = lstm_state_size

    # Layer stack: one linear layer, an LSTM, then two linear heads
    # (action logits and scalar value).
    self.fc1 = nn.Linear(self.obs_size, self.fc_size)
    self.lstm = nn.LSTM(
        self.fc_size, self.lstm_state_size, batch_first=True)
    self.action_branch = nn.Linear(self.lstm_state_size, num_outputs)
    self.value_branch = nn.Linear(self.lstm_state_size, 1)

    # Cached value output, kept to save an extra forward pass.
    self._cur_value = None
def __init__(self, env_config):
    """Wrap a ``BitcoinEnv`` with an action mask and action embeddings.

    There are 6 actions when ``env_config['extended']`` is truthy and 4
    otherwise. The observation space is a Dict with ``action_mask``,
    ``avail_actions`` and the wrapped env's own space under ``bitcoin``.
    ``action_assignments`` is the identity matrix of size ``action_n``.
    """
    self.action_n = 6 if env_config['extended'] else 4
    self.extended = env_config['extended']
    self.action_space = Discrete(self.action_n)
    self.wrapped = BitcoinEnv(env_config)
    self.config = env_config
    self.alphas = env_config['alphas']
    self.max_hidden_block = env_config['max_hidden_block']
    # ``'' * 10`` is the empty string, so the trace starts out empty and
    # is capped at 10 entries.
    self.game_trace = deque('' * 10, 10)

    n_actions = self.action_n
    self.observation_space = Dict({
        "action_mask": Box(0, 1, shape=(n_actions,)),
        "avail_actions": Box(-10, 10, shape=(n_actions, n_actions)),
        "bitcoin": self.wrapped.observation_space,
    })

    # NOTE(review): both spaces are built but not used afterwards in
    # this method -- confirm whether they are still needed.
    party_count = len(self.alphas)
    spy_space = constants.make_spy_space(party_count,
                                         self.max_hidden_block)
    blind_space = constants.make_blind_space(len(self.alphas),
                                             self.max_hidden_block)

    self.prep = get_preprocessor(Discrete(3))(Discrete(3))

    # One row per action with a single 1 on the diagonal.
    self.action_assignments = np.eye(self.action_n)
def _get_size(obs_space):
    """Return the ``size`` of the preprocessor created for ``obs_space``."""
    preprocessor_cls = get_preprocessor(obs_space)
    preprocessor = preprocessor_cls(obs_space)
    return preprocessor.size
# flake8: noqa # __preprocessing_observations_start__ import gym env = gym.make("Pong-v0") # RLlib uses preprocessors to implement transforms such as one-hot encoding # and flattening of tuple and dict observations. from ray.rllib.models.preprocessors import get_preprocessor prep = get_preprocessor(env.observation_space)(env.observation_space) # <ray.rllib.models.preprocessors.GenericPixelPreprocessor object at 0x7fc4d049de80> # Observations should be preprocessed prior to feeding into a model env.reset().shape # (210, 160, 3) prep.transform(env.reset()).shape # (84, 84, 3) # __preprocessing_observations_end__ # __query_action_dist_start__ # Get a reference to the policy import numpy as np from ray.rllib.agents.ppo import PPOTrainer trainer = PPOTrainer(env="CartPole-v0", config={ "framework": "tf2", "num_workers": 0 })
def __init__(self, ctx: ConnectorContext):
    """Create the observation preprocessor for this connector.

    ``ctx.observation_space`` is the space the Policy deals with. When it
    carries an ``original_space`` attribute, the preprocessor is built
    for that original space, because the data to preprocess comes from
    the original observation space; otherwise the context's space is
    used directly. Model options come from ``ctx.config["model"]``
    (empty dict when absent).
    """
    super().__init__(ctx)

    # Fall back to the context's own space when no original space is
    # attached, which keeps the behavior for plain spaces unchanged.
    obs_space = getattr(ctx.observation_space, "original_space",
                        ctx.observation_space)

    self._preprocessor = get_preprocessor(obs_space)(
        obs_space, ctx.config.get("model", {}))