def __init__(self, goal_reward=10, actuation_cost_coeff=30,
             distance_cost_coeff=1, init_sigma=0.1):
    self.dynamics = PointDynamics(dim=2, sigma=0)
    self.init_mu = np.zeros(2, dtype=np.float32)
    self.init_sigma = init_sigma
    self.goal_positions = np.array(
        [[5, 0], [-5, 0], [0, 5], [0, -5]],
        dtype=np.float32)
    self.goal_threshold = 1.
    self.goal_reward = goal_reward
    self.action_cost_coeff = actuation_cost_coeff
    self.distance_cost_coeff = distance_cost_coeff
    self.xlim = (-7, 7)
    self.ylim = (-7, 7)
    self.vel_bound = 1.
    self.reset()
    self.observation = None

    self.reward_range = (-float('inf'), float('inf'))
    self.metadata = {'render.modes': []}
    self.spec = None

    self._ax = None
    self._env_lines = []
    self.fixed_plots = None
    self.dynamic_plots = []

    super().__init__()
    Serializable.quick_init(self, locals())

def __init__(self,
             env,
             policy,
             backup_policy,
             mix_policy,
             pos_eps_policy,
             neg_eps_policy,
             baseline,
             minibatch_size=500,
             n_sub_itr=10,
             optimizer=None,
             optimizer_args=None,
             delta=0.01,
             **kwargs):
    Serializable.quick_init(self, locals())
    self.optimizer = optimizer
    if optimizer is None:
        if optimizer_args is None:
            optimizer_args = dict()
        self.optimizer = CGOptimizer(**optimizer_args)
    self.opt_info = None
    self.backup_policy = backup_policy
    self.mix_policy = mix_policy
    self.pos_eps_policy = pos_eps_policy
    self.neg_eps_policy = neg_eps_policy
    self.minibatch_size = minibatch_size
    self.n_sub_itr = n_sub_itr
    self.delta = delta
    super(CATRPO, self).__init__(env=env, policy=policy, baseline=baseline, **kwargs)

def __init__(self, env_spec, obs_pl, action, scope_name=None):
    Serializable.quick_init(self, locals())

    self._obs_pl = obs_pl
    self._action = action
    self._scope_name = (
        tf.get_variable_scope().name if not scope_name else scope_name)
    super(NNPolicy, self).__init__(env_spec)

def __init__(self, inputs, name, hidden_layer_sizes):
    Parameterized.__init__(self)
    Serializable.quick_init(self, locals())

    self._name = name
    self._inputs = inputs
    self._layer_sizes = list(hidden_layer_sizes) + [1]

    self._output = self._output_for(self._inputs)

def __init__(self, radar_range=2, radar_resolution=1, discretized=True,
             use_maps='all', states_cache=None):
    """
    :param radar_range: number of 'radar' measurements taken towards each of the 4 sides (and their combinations)
    :param radar_resolution: distance between two consecutive measurements of the agent's 'radar'
    :param discretized: whether to discretize actions, mapping the ranges <-1,-0.33>, <-0.33,0.33>, <0.33,1> to -1, 0, 1
    :param use_maps: which maps to use; list of indexes or 'all'
    :param states_cache: pre-populated cache to use (observation -> set of states)
    """
    Serializable.quick_init(self, locals())
    self.radar_range = radar_range
    self.radar_resolution = radar_resolution
    self.discretized = discretized
    if states_cache is None:
        self.states_cache = dict()
    else:
        self.states_cache = states_cache
    self.agent_width = 2.4 / np.pi
    self.max_action_distance = 0.2
    self.do_render_init = True
    self.render_prev_pos = np.zeros(2)
    self.do_caching = True
    self.current_map_idx = None
    self.agent_pos = None
    self.agent_ori = None

    # Maps initialization
    if use_maps == 'all':
        raw_maps = self.all_maps
    else:
        # noinspection PyTypeChecker
        raw_maps = [self.all_maps[i] for i in use_maps]
    self.maps = []
    self.bit_maps = []
    for i in range(len(raw_maps)):
        # Normalize char map
        m = np.array([list(row.upper()) for row in raw_maps[i]])
        m[np.logical_or(m == '.', m == ' ')] = 'F'
        m[np.logical_or(m == 'X', m == '#')] = 'W'
        m[m == 'O'] = 'H'
        self.maps.append(m)
        # Make bit map
        bm = np.zeros(m.shape)
        bm[np.logical_or(m == 'W', m == 'H')] = 1
        self.bit_maps.append(bm)

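# Standalone illustration (not part of the original class) of the char-map
# normalization above: a raw ASCII map is rewritten into 'F'/'W'/'H' cells
# (presumably free / wall / hole) and a matching bit map marking impassable
# cells. Uses only numpy; the tiny map below is made up for the example.
import numpy as np

raw_map = ["##.",
           ". o",
           "x#."]
m = np.array([list(row.upper()) for row in raw_map])
m[np.logical_or(m == '.', m == ' ')] = 'F'   # free cells
m[np.logical_or(m == 'X', m == '#')] = 'W'   # walls
m[m == 'O'] = 'H'                            # holes
bm = np.zeros(m.shape)
bm[np.logical_or(m == 'W', m == 'H')] = 1    # 1 = impassable, 0 = free
print(m)
# [['W' 'W' 'F']
#  ['F' 'F' 'H']
#  ['W' 'W' 'F']]
print(bm)
# [[1. 1. 0.]
#  [0. 0. 1.]
#  [1. 1. 0.]]
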
def __init__(self, env_name, record_video=False, video_schedule=None,
             log_dir=None, record_log=False, force_reset=True):
    if log_dir is None:
        if logger.get_snapshot_dir() is None:
            logger.log("Warning: skipping Gym environment monitoring since snapshot_dir not configured.")
        else:
            log_dir = os.path.join(logger.get_snapshot_dir(), "gym_log")
    Serializable.quick_init(self, locals())

    env = gym.envs.make(env_name)

    # HACK: Gets rid of the TimeLimit wrapper that sets 'done = True' when
    # the time limit specified for each environment has been passed and
    # therefore the environment is not Markovian (terminal condition depends
    # on time rather than state).
    env = env.env

    self.env = env
    self.env_id = env.spec.id

    assert not (not record_log and record_video)

    if log_dir is None or record_log is False:
        self.monitoring = False
    else:
        if not record_video:
            video_schedule = NoVideoSchedule()
        else:
            if video_schedule is None:
                video_schedule = CappedCubicVideoSchedule()
        self.env = gym.wrappers.Monitor(self.env, log_dir,
                                        video_callable=video_schedule,
                                        force=True)
        self.monitoring = True

    self._observation_space = convert_gym_space(env.observation_space)
    logger.log("observation space: {}".format(self._observation_space))
    self._action_space = convert_gym_space(env.action_space)
    logger.log("action space: {}".format(self._action_space))
    self._horizon = env.spec.tags['wrapper_config.TimeLimit.max_episode_steps']
    self._log_dir = log_dir
    self._force_reset = force_reset

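# Standalone sketch of the TimeLimit-stripping hack used above, assuming an
# older gym release (the kind this wrapper targets) where gym.make() returns a
# TimeLimit wrapper that exposes the raw environment as `.env`.
import gym

wrapped = gym.make('CartPole-v0')   # TimeLimit(CartPole) in such gym versions
raw = wrapped.env                   # the unwrapped, time-unlimited env
# `raw` no longer ends episodes on a step budget, so termination depends on
# state only, keeping the environment Markovian.
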
def __init__(self, env_spec, hidden_layer_sizes=(100, 100), name='value_function'):
    Serializable.quick_init(self, locals())

    self._Do = flat_dim(env_spec.observation_space)
    self._observations_ph = tf.placeholder(
        tf.float32, shape=[None, self._Do], name='observations')

    super(NNVFunction, self).__init__(
        inputs=(self._observations_ph,),
        name=name,
        hidden_layer_sizes=hidden_layer_sizes)

def __init__(self, env_spec, q_functions):
    Serializable.quick_init(self, locals())

    self.q_functions = q_functions

    self._Da = flat_dim(env_spec.action_space)
    self._Do = flat_dim(env_spec.observation_space)

    self._observations_ph = tf.placeholder(
        tf.float32, shape=[None, self._Do], name='observations')
    self._actions_ph = tf.placeholder(
        tf.float32, shape=[None, self._Da], name='actions')

    self._output = self.output_for(
        self._observations_ph, self._actions_ph, reuse=True)

def __init__(self, env, num_orig_skills, subpath_infos=None):
    """
    Creates a top-level environment for an HRL agent. The original env's actions
    are replaced by N discrete actions, N being the number of skills.
    :param env: AsaEnv environment to wrap
    :param num_orig_skills: number of pre-trained skills that will be prepared in the HRL policy
    :param subpath_infos: 'all' or list of subpath information to keep, defaults to ['env_infos']
    """
    Serializable.quick_init(self, locals())
    super().__init__(env)
    self._num_orig_skills = num_orig_skills
    self.action_space = Discrete(self._num_orig_skills)
    self.hrl_policy = None
    if subpath_infos is None:
        subpath_infos = ['env_infos']
    self.subpath_infos = subpath_infos

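# Standalone illustration of the discrete top-level action space set up above,
# assuming `Discrete` behaves like gym.spaces.Discrete (the source may use a
# different Discrete implementation with the same interface): one action per
# pre-trained skill.
from gym.spaces import Discrete

num_orig_skills = 4
action_space = Discrete(num_orig_skills)
assert action_space.n == num_orig_skills
assert action_space.contains(0) and not action_space.contains(num_orig_skills)
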
def __init__(self, env, start_obss, end_obss):
    """
    Creates an environment tailored to train a single (missing) skill.
    Trajectories are initialized in a start_obss state and terminated (and
    reward is generated) upon reaching an end_obss state.
    :param env: AsaEnv environment to wrap. Environment is cloned to sustain integrity of the original env.
    :param start_obss: Tensor of experienced starting observations (where the skill should initiate)
    :param end_obss: Tensor of experienced ending observations (where the skill should terminate)
    """
    Serializable.quick_init(self, locals())
    Wrapper.__init__(self, AsaEnv.clone_wrapped(env))  # clones the base env along with all its wrappers
    if start_obss.shape != end_obss.shape:
        raise ValueError('start_obss ({}) and end_obss ({}) must be of same shape'
                         .format(start_obss.shape, end_obss.shape))
    self._end_obss = end_obss.reshape((end_obss.shape[0], -1))
    self._start_obss = start_obss.reshape((start_obss.shape[0], -1))
    self.current_obs_idx = None

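# Standalone illustration of the shape handling above: batches of observations
# are flattened to (num_observations, obs_dim), whatever their original shape.
# The shapes below are made up for the example.
import numpy as np

start_obss = np.random.rand(8, 3, 2)   # 8 starting observations of shape (3, 2)
end_obss = np.random.rand(8, 3, 2)     # must match start_obss.shape
assert start_obss.shape == end_obss.shape
flat_start = start_obss.reshape((start_obss.shape[0], -1))
flat_end = end_obss.reshape((end_obss.shape[0], -1))
print(flat_start.shape, flat_end.shape)  # (8, 6) (8, 6)
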
def __init__(self, goal=(0, -1), arm_distance_coeff=0):
    """
    goal (`list`): List of two elements denoting the x and y coordinates of
        the goal location. Either coordinate can also be the string 'any' to
        make the reward independent of that coordinate.
    arm_distance_coeff (`float`): Coefficient for the arm-to-object distance cost.
    """
    super(PusherEnv, self).__init__(file_path=self.FILE_PATH)
    Serializable.quick_init(self, locals())

    self._goal_mask = [coordinate != 'any' for coordinate in goal]
    self._goal = np.array(goal)[self._goal_mask].astype(np.float32)

    self._arm_distance_coeff = arm_distance_coeff
    self._action_cost_coeff = 0.1

    # Make the complete robot visible when visualizing.
    self.model.stat.extent = 10

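# Standalone illustration of the goal masking above: an 'any' coordinate is
# dropped, so only the remaining coordinate(s) can enter a distance-to-goal
# cost. `object_xy` is a made-up object position for the example.
import numpy as np

goal = (0, 'any')
goal_mask = [coordinate != 'any' for coordinate in goal]    # [True, False]
masked_goal = np.array(goal)[goal_mask].astype(np.float32)  # array([0.], dtype=float32)

object_xy = np.array([0.3, -0.8])
distance = np.linalg.norm(object_xy[goal_mask] - masked_goal)  # uses x only
print(goal_mask, masked_goal, distance)
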
def __init__(self, env_spec, hidden_layer_sizes, squash=True, name='policy'):
    Serializable.quick_init(self, locals())

    self._action_dim = flat_dim(env_spec.action_space)
    self._observation_dim = flat_dim(env_spec.observation_space)
    self._layer_sizes = list(hidden_layer_sizes) + [self._action_dim]
    self._squash = squash
    self._name = name

    self._observation_ph = tf.placeholder(
        tf.float32,
        shape=[None, self._observation_dim],
        name='observation')

    self._actions = self.actions_for(self._observation_ph)

    super(StochasticNNPolicy, self).__init__(
        env_spec, self._observation_ph, self._actions, self._name)

def __init__(self, env_spec, max_replay_buffer_size):
    super(SimpleReplayBuffer, self).__init__()
    Serializable.quick_init(self, locals())

    max_replay_buffer_size = int(max_replay_buffer_size)

    self._env_spec = env_spec
    self._observation_dim = flat_dim(env_spec.observation_space)
    self._action_dim = flat_dim(env_spec.action_space)
    self._max_buffer_size = max_replay_buffer_size
    self._observations = np.zeros(
        (max_replay_buffer_size, self._observation_dim))
    # It's a bit memory inefficient to save the observations twice,
    # but it makes the code *much* easier since you no longer have to
    # worry about termination conditions.
    self._next_obs = np.zeros(
        (max_replay_buffer_size, self._observation_dim))
    self._actions = np.zeros((max_replay_buffer_size, self._action_dim))
    self._rewards = np.zeros(max_replay_buffer_size)
    # self._terminals[i] = a terminal was received at time i
    self._terminals = np.zeros(max_replay_buffer_size, dtype='uint8')
    self._top = 0
    self._size = 0

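# Minimal sketch (not taken from the source) of how a fixed-size ring buffer
# like the one above typically records samples: write at _top, wrap _top
# around modulo the capacity, and let _size saturate at the capacity. The
# buffer's actual add_sample implementation may differ.
import numpy as np

class TinyRingBuffer:
    def __init__(self, capacity, obs_dim):
        self._observations = np.zeros((capacity, obs_dim))
        self._max_buffer_size = capacity
        self._top = 0
        self._size = 0

    def add_sample(self, observation):
        self._observations[self._top] = observation
        self._top = (self._top + 1) % self._max_buffer_size     # oldest entries get overwritten
        self._size = min(self._size + 1, self._max_buffer_size)  # size never exceeds capacity

buf = TinyRingBuffer(capacity=4, obs_dim=3)
for t in range(6):
    buf.add_sample(np.full(3, t))
print(buf._top, buf._size)  # 2 4 -- the two oldest samples were overwritten
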
def __init__(self, top_policy, skill_policy_prototype, skill_policies,
             skill_stop_functions=None, skill_max_timesteps=100):
    """
    :param top_policy: policy for the top-level agent, to be trained
    :param skill_policy_prototype: an empty policy serving as a prototype for newly created skill
           policies. New policies are generated by calling Serializable.clone() upon this prototype,
           producing a new instance of the policy initialized with the same parameters as the prototype.
    :param skill_policies: list of trained skill policies
    :param skill_stop_functions: list of stopping functions (path_dict -> bool) for trained skills
    :param skill_max_timesteps: maximum length of a skill execution
    """
    Serializable.quick_init(self, locals())
    self.top_policy = top_policy
    self.skill_policy_prototype = skill_policy_prototype
    self.skill_policies = skill_policies
    self.skill_max_timesteps = skill_max_timesteps
    num_orig_skills = len(skill_policies)

    # pad _skills_end_obss to align indexes with skill_policies
    self._skills_end_obss = [None for _ in range(num_orig_skills)]

    # if skill_stop_functions is not provided, a default stopping function
    # (always returning False) is assigned to all skills
    self._skill_stop_functions = skill_stop_functions if skill_stop_functions is not None \
        else [lambda path: False for _ in range(num_orig_skills)]
    assert len(self._skill_stop_functions) == num_orig_skills

    # Check top-level policy
    if not isinstance(top_policy.action_space, Discrete) \
            or top_policy.action_space.n != self.num_skills:
        raise TypeError('Top level policy must have Discrete(num_skills) action space.')

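# Hedged sketch of a skill stopping function, following the (path_dict -> bool)
# signature described in the docstring above. The 'observations' key and the
# threshold are illustrative assumptions, not taken from the source.
import numpy as np

def example_stop_function(path):
    # Stop the skill once the first coordinate of the latest observation exceeds 0.9.
    return bool(np.asarray(path['observations'])[-1][0] > 0.9)

# The default used above, when no functions are supplied, never stops a skill early:
def never_stop(path):
    return False
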
def __init__(self, env, delay=0.01):
    Serializable.quick_init(self, locals())
    gym.Wrapper.__init__(self, env)
    self._delay = delay

def __init__(self, *args, **kwargs):
    Serializable.quick_init(self, locals())
    self.reward_range = None
    self.metadata = None
    super().__init__(SequenceReacherEnv(*args, **kwargs))