def action_space(self):
    lat_dim = self.low_policy_latent_dim
    if self.discrete_actions:
        return spaces.Discrete(lat_dim)  # the action is now just a selection
    else:
        ub = 1e6 * np.ones(lat_dim)
        return spaces.Box(-1 * ub, ub)
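# Hedged usage note (assumes the same `spaces` and `np` imports used above): the two
# branches produce differently typed actions, so downstream code must handle both.
# For example, with a latent dimension of 6:
example_discrete = spaces.Discrete(6).sample()  # an int skill index in [0, 6)
example_continuous = spaces.Box(-1e6 * np.ones(6), 1e6 * np.ones(6)).sample()  # a float vector of shape (6,)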
def __init__(self):
    EzPickle.__init__(self)
    self.seed()
    self.viewer = None

    self.world = Box2D.b2World()
    self.moon = None
    self.lander = None
    self.particles = []

    self.prev_reward = None

    # useful range is -1 .. +1, but spikes can be higher
    self.observation_space = spaces.Box(-np.inf, np.inf, shape=(8, ))

    if self.continuous:
        # Action is two floats [main engine, left-right engines].
        # Main engine: -1..0 off, 0..+1 throttle from 50% to 100% power. Engine can't work with less than 50% power.
        # Left-right: -1.0..-0.5 fire left engine, +0.5..+1.0 fire right engine, -0.5..0.5 off
        self.action_space = spaces.Box(-1, +1, (2, ))
    else:
        # Nop, fire left engine, main engine, right engine
        self.action_space = spaces.Discrete(4)

    self.reset()
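# Hedged example of the action encodings documented in the comments above
# (illustrative values only): in the continuous case the action is
# [main engine, left-right]; 0.5 maps to ~75% main-engine power, and a second
# component in (-0.5, 0.5) keeps the side engines off.
example_continuous_action = np.array([0.5, 0.0])
# In the discrete case: 0 = noop, 1 = fire left engine, 2 = fire main engine, 3 = fire right engine.
example_discrete_action = 2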
def __init__(self, env, base_policy, num_skills, steps_per_option=100):
    Serializable.quick_init(self, locals())
    self._base_policy = base_policy
    self._env = env
    self._steps_per_option = steps_per_option
    self._num_skills = num_skills
    self.observation_space = self._env.observation_space
    self.action_space = spaces.Discrete(num_skills)
    self.spec = EnvSpec(self.observation_space, self.action_space)
    self._obs = self.reset()
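# Hedged sketch (hypothetical, not part of the original wrapper) of how a step() for
# this meta-environment could execute a selected skill: run the frozen base policy,
# conditioned on the chosen skill index, for `steps_per_option` low-level steps.
# The one-hot (observation, skill) concatenation is an assumption about how
# `_base_policy` consumes the skill; the real implementation may differ.
def _example_meta_step(self, skill):
    total_reward = 0.0
    for _ in range(self._steps_per_option):
        aug_obs = np.concatenate([self._obs, np.eye(self._num_skills)[skill]])
        action, _ = self._base_policy.get_action(aug_obs)
        self._obs, reward, done, info = self._env.step(action)
        total_reward += reward
        if done:
            break
    return self._obs, total_reward, done, info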
def __init__(
        self,
        env_spec,
        env,  # the inner one, I believe
        pkl_path=None,  # for the entire hierarchical policy
        snn_pkl_path=None,
        snn_json_path=None,
        manager_pkl_path=None,  # default is to initialize a new manager from scratch
        period=2,  # how often the manager chooses latent skill
        latent_dim=6,
        bilinear_integration=True,
        trainable_snn=True,
        trainable_manager=True,
        hidden_sizes_snn=(64, 64),
        hidden_sizes_selector=(32, 32)):
    StochasticPolicy.__init__(self, env_spec)
    self.env = env
    self.period = period
    self.latent_dim = latent_dim  # unsure
    self.bilinear_integration = bilinear_integration  # unsure
    self.count = 0  # keep track of how long it's been since sampling a latent skill
    self.curr_latent = None  # something
    self.outer_action_space = spaces.Discrete(latent_dim)
    self.trainable_manager = trainable_manager

    if pkl_path:
        data = joblib.load(os.path.join(config.PROJECT_PATH, pkl_path))
        policy = data['policy']
        self.manager = policy.manager
        self.low_policy = policy.low_policy
        # following two lines used for random manager
        # outer_env_spec = EnvSpec(observation_space=self.env.observation_space, action_space=self.outer_action_space)
        # self.manager = CategoricalMLPPolicy(env_spec=outer_env_spec, latent_dim=latent_dim, )
    else:
        self.low_policy = GaussianMLPPolicy_snn_hier(
            env_spec=env.spec,
            env=env,
            pkl_path=snn_pkl_path,
            json_path=snn_json_path,
            trainable_snn=trainable_snn,
            latent_dim=latent_dim,
            bilinear_integration=bilinear_integration,
            external_latent=True,
            hidden_sizes_snn=hidden_sizes_snn,
            hidden_sizes_selector=hidden_sizes_selector)

        # loading manager from pkl file
        if manager_pkl_path:
            manager_data = joblib.load(
                os.path.join(config.PROJECT_PATH, manager_pkl_path))
            self.manager = manager_data['policy']
            print("loaded manager")
        else:
            # self.outer_env = hierarchize_snn(self.env, time_steps_agg=10, pkl_path=snn_pkl_path)
            outer_env_spec = EnvSpec(
                observation_space=self.env.observation_space,
                action_space=self.outer_action_space)
            self.manager = CategoricalMLPPolicy(
                env_spec=outer_env_spec,
                latent_dim=latent_dim,
            )

    Serializable.quick_init(self, locals())  # todo: is this where this belongs?
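# Hedged sketch (hypothetical, not the original method) of how the bookkeeping fields
# set above are typically used at rollout time: every `period` steps the manager picks
# a new discrete latent, which then conditions the low-level SNN policy.
def _example_get_action(self, observation):
    # Resample a skill from the manager once per `period` low-level steps.
    if self.count % self.period == 0:
        self.curr_latent, _ = self.manager.get_action(observation)
    self.count += 1
    # How curr_latent is injected into the SNN (e.g. as an external latent) is
    # policy-specific and not shown in these snippets; this call is illustrative.
    action, agent_info = self.low_policy.get_action(observation)
    return action, agent_info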
def action_space(self):
    # The previously computed `init`, `ub`, and `lb` locals were unused; the space
    # is simply a fixed 4-way discrete choice.
    return spaces.Discrete(4)
def observation_space(self):
    return spaces.Product([
        spaces.Discrete(self.numrow),
        spaces.Discrete(self.numcol),
        spaces.Discrete(2)
    ])
def action_space(self):
    return spaces.Discrete(4)
def __init__(
        self,
        env_spec,
        env,  # the inner one, I believe
        pkl_path=None,  # for the entire hierarchical policy, can take in npz too!
        snn_pkl_path=None,  # can actually be either pkl or npz
        snn_json_path=None,
        manager_pkl_path=None,  # default is to initialize a new manager from scratch
        period=10,  # how often the manager chooses latent skill
        latent_dim=6,
        bilinear_integration=True,
        trainable_snn=True,
        trainable_manager=True,
        continuous_latent=False,  # if True, the manager emits a continuous latent (Box) instead of a discrete skill
        hidden_sizes_snn=(64, 64),
        hidden_sizes_manager=(32, 32)):
    StochasticPolicy.__init__(self, env_spec)
    self.env = env
    self.period = period
    self.latent_dim = latent_dim  # unsure
    self.bilinear_integration = bilinear_integration  # unsure
    self.count = 0  # keep track of how long it's been since sampling a latent skill
    self.curr_latent = None
    self.curr_manager_obs = None
    self.outer_action_space = spaces.Discrete(latent_dim)
    self.trainable_manager = trainable_manager
    self.trainable_snn = trainable_snn
    self.continuous_latent = continuous_latent

    if pkl_path and '.npz' not in pkl_path:
        data = joblib.load(os.path.join(config.PROJECT_PATH, pkl_path))
        policy = data['policy']
        self.manager = policy.manager
        self.low_policy = policy.low_policy
        # todo: the above is wrong, need to figure out how to warm start the params
        # following two lines used for random manager
        # outer_env_spec = EnvSpec(observation_space=self.env.observation_space, action_space=self.outer_action_space)
        # self.manager = CategoricalMLPPolicy(env_spec=outer_env_spec, latent_dim=latent_dim, )
    else:
        if snn_pkl_path is not None and '.npz' in snn_pkl_path:
            npz_path = snn_pkl_path
            snn_pkl_path = None
        else:
            npz_path = None
        self.low_policy = GaussianMLPPolicy_snn_hier(
            env_spec=env.spec,
            env=env,
            pkl_path=snn_pkl_path,
            npz_path=npz_path,
            json_path=snn_json_path,
            trainable_snn=trainable_snn,
            latent_dim=latent_dim,
            bilinear_integration=bilinear_integration,
            external_latent=True,
            hidden_sizes_snn=hidden_sizes_snn,
            hidden_sizes_selector=hidden_sizes_manager)

        # loading manager from pkl file
        if manager_pkl_path:
            manager_data = joblib.load(
                os.path.join(config.PROJECT_PATH, manager_pkl_path))
            self.manager = manager_data['policy']
            print("loaded manager")
        else:
            # self.outer_env = hierarchize_snn(self.env, time_steps_agg=10, pkl_path=snn_pkl_path)
            if self.continuous_latent:
                outer_env_spec = EnvSpec(
                    observation_space=self.env.observation_space,
                    action_space=spaces.Box(-1.0, 1.0, shape=(latent_dim, )))
                self.manager = GaussianMLPPolicy(env_spec=outer_env_spec)
            else:
                outer_env_spec = EnvSpec(
                    observation_space=self.env.observation_space,
                    action_space=self.outer_action_space)
                self.manager = CategoricalMLPPolicy(
                    env_spec=outer_env_spec,
                    latent_dim=latent_dim,
                )

    # import ipdb; ipdb.set_trace()
    if pkl_path is not None and '.npz' in pkl_path:
        param_dict = dict(
            np.load(os.path.join(config.PROJECT_PATH, pkl_path)))
        param_values = param_dict['params']
        self.set_param_values(param_values)

    Serializable.quick_init(self, locals())  # todo: is this where this belongs?
def __init__(
        self,
        env_spec,
        env,  # the inner one, I believe
        pkl_path=None,  # for the entire hierarchical policy
        snn_pkl_path=None,
        snn_json_path=None,
        manager_pkl_path=None,  # default is to initialize a new manager from scratch
        max_period=10,  # possible periods
        latent_dim=6,
        bilinear_integration=True,
        trainable_snn=True,
        trainable_manager=True,
        hidden_sizes_snn=(64, 64),
        hidden_sizes_selector=(32, 32)):
    StochasticPolicy.__init__(self, env_spec)
    self.env = env
    self.periods = np.arange(1, max_period + 1)
    assert len(self.periods) > 0
    self.curr_period = self.periods[0]
    self.max_period = max(self.periods)
    self.latent_dim = latent_dim  # unsure
    self.bilinear_integration = bilinear_integration  # unsure
    self.count = 0  # keep track of how long it's been since sampling a latent skill
    self.curr_latent = None  # something
    self.outer_action_space = spaces.Discrete(latent_dim)
    self.trainable_manager = trainable_manager
    self.random_period = True
    self.fake_env = PeriodVaryingEnv(env)

    if pkl_path:
        data = joblib.load(os.path.join(config.PROJECT_PATH, pkl_path))
        policy = data['policy']
        self.manager = policy.manager
        self.low_policy = policy.low_policy
        # following two lines used for random manager
        # outer_env_spec = EnvSpec(observation_space=self.env.observation_space, action_space=self.outer_action_space)
        # self.manager = CategoricalMLPPolicy(env_spec=outer_env_spec, latent_dim=latent_dim, )
    else:
        # env spec that includes the extra parameter for time
        self.low_policy = GaussianMLPPolicy_snn_hier(
            env_spec=self.fake_env.spec,
            env=self.fake_env,
            pkl_path=snn_pkl_path,
            json_path=snn_json_path,
            trainable_snn=trainable_snn,
            latent_dim=latent_dim,
            bilinear_integration=bilinear_integration,
            external_latent=True,
            hidden_sizes_snn=hidden_sizes_snn,
            hidden_sizes_selector=hidden_sizes_selector)

        # loading manager from pkl file
        if manager_pkl_path:
            manager_data = joblib.load(
                os.path.join(config.PROJECT_PATH, manager_pkl_path))
            self.manager = manager_data['policy']
            print("loaded manager")
        else:
            # self.outer_env = hierarchize_snn(self.env, time_steps_agg=10, pkl_path=snn_pkl_path)
            outer_env_spec = EnvSpec(
                observation_space=self.fake_env.observation_space,
                action_space=self.outer_action_space)
            self.manager = CategoricalMLPPolicy(
                env_spec=outer_env_spec,
                latent_dim=latent_dim,
            )

    if isinstance(env, MazeEnv) or isinstance(env, GatherEnv):
        self.obs_robot_dim = env.robot_observation_space.flat_dim
        self.obs_maze_dim = env.maze_observation_space.flat_dim
    elif isinstance(env, NormalizedEnv):
        if isinstance(env.wrapped_env, MazeEnv) or isinstance(env.wrapped_env, GatherEnv):
            self.obs_robot_dim = env.wrapped_env.robot_observation_space.flat_dim
            self.obs_maze_dim = env.wrapped_env.maze_observation_space.flat_dim
        else:
            self.obs_robot_dim = env.wrapped_env.observation_space.flat_dim
            self.obs_maze_dim = 0
    else:
        self.obs_robot_dim = env.observation_space.flat_dim
        self.obs_maze_dim = 0

    Serializable.quick_init(self, locals())  # todo: ask if this fixes my problem
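# Hedged illustration of how the dimensions computed above could be used to split a
# flat observation into its robot and maze/gather parts (the slicing convention is an
# assumption; the wrapped environments define the actual layout).
def _example_split_obs(self, observation):
    obs_robot = observation[:self.obs_robot_dim]
    obs_maze = observation[self.obs_robot_dim:self.obs_robot_dim + self.obs_maze_dim]
    return obs_robot, obs_maze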