def __init__(self, observation_space, action_space, eval_mode=False, ignore_response=True, discretization_bounds=(0.0, 10.0), number_bins=100, exploration_policy='epsilon_greedy', exploration_temperature=0.99, learning_rate=0.1, gamma=0.99, **kwargs): """TabularQAgent init. Args: observation_space: a gym.spaces object specifying the format of observations. action_space: a gym.spaces object that specifies the format of actions. eval_mode: Boolean indicating whether the agent is in training or eval mode. ignore_response: Boolean indicating whether the agent should ignore the response part of the observation. discretization_bounds: pair of real numbers indicating the min and max value for continuous attributes discretization. Values below the min will all be grouped in the first bin, while values above the max will all be grouped in the last bin. See the documentation of numpy.digitize for further details. number_bins: positive integer number of bins used to discretize continuous attributes. exploration_policy: either one of ['epsilon_greedy', 'min_count'] or a custom function. TODO(mmladenov): formalize requirements of this function. exploration_temperature: a real number passed as parameter to the exploration policy. learning_rate: a real number between 0 and 1 indicating how much to update Q-values, i.e. Q_t+1(s,a) = (1 - learning_rate) * Q_t(s, a) + learning_rate * (R(s,a) + ...). gamma: real value between 0 and 1 indicating the discount factor of the MDP. **kwargs: additional arguments like eval_mode. """ self._kwargs = kwargs super(TabularQAgent, self).__init__(action_space) # hard params self._gamma = gamma self._eval_mode = eval_mode self._previous_slate = None self._learning_rate = learning_rate # storage self._q_value_table = {} self._state_action_counts = {} self._previous_state_action_index = None # discretization and spaces self._discretization_bins = np.linspace(discretization_bounds[0], discretization_bounds[1], num=number_bins) single_doc_space = list( observation_space.spaces['doc'].spaces.values())[0] slate_tuple = tuple([single_doc_space] * self._slate_size) action_space = spaces.Tuple(slate_tuple) self._ignore_response = ignore_response state_action_space = { 'user': observation_space.spaces['user'], 'action': action_space } if not self._ignore_response: state_action_space['response'] = observation_space.spaces[ 'response'] self._state_action_space = spaces.Dict(state_action_space) self._observation_featurizer = agent_utils.GymSpaceWalker( self._state_action_space, self._discretize_gym_leaf) # exploration self._exploration_policy = exploration_policy self._exploration_temperature = exploration_temperature self._base_exploration_temperature = self._exploration_temperature self._exploration_functions = { 'epsilon_greedy': lambda observation: agent_utils.epsilon_greedy_exploration( # pylint: disable=g-long-lambda self._enumerate_state_action_indices(observation), self. _q_value_table, self._exploration_temperature), 'min_count': lambda observation: agent_utils.min_count_exploration( # pylint: disable=g-long-lambda self._enumerate_state_action_indices(observation), self. _state_action_counts) }
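# The docstring above abbreviates the Q-learning update ("R(s,a) + ...").
# For reference, a minimal self-contained sketch of the full tabular update
# it alludes to; `q_table` and the index arguments are hypothetical names,
# not the agent's actual attributes:
def q_learning_update(q_table, sa_index, reward, next_sa_indices,
                      learning_rate=0.1, gamma=0.99):
    # Q_{t+1}(s, a) = (1 - lr) * Q_t(s, a)
    #                 + lr * (R(s, a) + gamma * max_{a'} Q_t(s', a'))
    best_next = max((q_table.get(i, 0.0) for i in next_sa_indices),
                    default=0.0)
    q_table[sa_index] = ((1 - learning_rate) * q_table.get(sa_index, 0.0) +
                         learning_rate * (reward + gamma * best_next))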
def __init__(self, config, home_team, away_team, opp_actor=None): self.__version__ = "0.0.3" self.config = config self.config.competition_mode = False self.config.fast_mode = True self.game = None self.team_id = None self.ruleset = load_rule_set(config.ruleset, all_rules=False) self.home_team = home_team self.away_team = away_team self.actor = Agent("Gym Learner", human=True) self.opp_actor = opp_actor if opp_actor is not None else RandomBot( "Random") self._seed = None self.seed() self.root = None self.cv = None self.last_obs = None self.last_report_idx = 0 self.last_ball_team = None self.last_ball_x = None self.own_team = None self.opp_team = None self.layers = [ OccupiedLayer(), OwnPlayerLayer(), OppPlayerLayer(), OwnTackleZoneLayer(), OppTackleZoneLayer(), UpLayer(), StunnedLayer(), UsedLayer(), RollProbabilityLayer(), BlockDiceLayer(), ActivePlayerLayer(), TargetPlayerLayer(), MALayer(), STLayer(), AGLayer(), AVLayer(), MovemenLeftLayer(), BallLayer(), OwnHalfLayer(), OwnTouchdownLayer(), OppTouchdownLayer(), SkillLayer(Skill.BLOCK), SkillLayer(Skill.DODGE), SkillLayer(Skill.SURE_HANDS), SkillLayer(Skill.CATCH), SkillLayer(Skill.PASS) ] for action_type in FFAIEnv.positional_action_types: self.layers.append(AvailablePositionLayer(action_type)) arena = load_arena(self.config.arena) self.observation_space = spaces.Dict({ 'board': spaces.Box(low=0, high=1, shape=(len(self.layers), arena.height, arena.width)), 'state': spaces.Box(low=0, high=1, shape=(50, )), 'procedures': spaces.Box(low=0, high=1, shape=(len(FFAIEnv.procedures), )), 'available-action-types': spaces.Box(low=0, high=1, shape=(len(FFAIEnv.actions), )) }) self.action_space = spaces.Dict({ 'action-type': spaces.Discrete(len(FFAIEnv.actions)), 'x': spaces.Discrete(arena.width), 'y': spaces.Discrete(arena.height) })
def __init__(self,
             env,
             pixels_only=True,
             render_kwargs=None,
             normalize=False,
             observation_key='pixels',
             camera_ids=(-1, )):
    """Initializes a new pixel Wrapper.

    Args:
      env: The environment to wrap.
      pixels_only: If `True` (default), the original observation returned
        by the wrapped environment will be discarded, and a dictionary
        observation will only include pixels. If `False`, the observation
        dictionary will contain both the original observations and the
        pixel observations.
      render_kwargs: Optional `dict` containing keyword arguments passed
        to the `self.render` method.
      normalize: If `True`, pixel observations are normalized.
      observation_key: Optional custom string specifying the pixel
        observation's key in the `OrderedDict` of observations. Defaults
        to 'pixels'.
      camera_ids: Tuple of camera ids to render with. The pixel
        observation becomes a depthwise concatenation of the images
        gathered from these cameras.

    Raises:
      ValueError: If `env`'s observation spec is not compatible with the
        wrapper. Supported formats are a single array, or a dict of
        arrays.
      ValueError: If `env`'s observation already contains the specified
        `observation_key`.
    """
    super(PixelObservationWrapper, self).__init__(env)

    if render_kwargs is None:
        render_kwargs = {}

    render_mode = render_kwargs.pop('mode', 'rgb_array')
    assert render_mode == 'rgb_array', render_mode
    render_kwargs['mode'] = 'rgb_array'

    # Specify the ids of the cameras to render with. The observation
    # becomes a depthwise concatenation of all the images gathered from
    # these specified cameras.
    self._camera_ids = camera_ids
    self._render_kwargs_per_camera = [{
        'mode': 'rgb_array',
        'width': render_kwargs['width'],
        'height': render_kwargs['height'],
        'camera_id': camera_id,
    } for camera_id in self._camera_ids]
    wrapped_observation_space = env.observation_space
    self._env = env
    self._pixels_only = pixels_only
    self._render_kwargs = render_kwargs
    self._observation_key = observation_key
    self._box_warp = render_kwargs.pop('box_warp', False)

    if isinstance(wrapped_observation_space, spaces.Box):
        self._observation_is_dict = False
        invalid_keys = set([STATE_KEY])
    elif isinstance(wrapped_observation_space,
                    (spaces.Dict, collections.abc.MutableMapping)):
        self._observation_is_dict = True
        invalid_keys = set(wrapped_observation_space.spaces.keys())
    else:
        raise ValueError("Unsupported observation space structure.")

    if not pixels_only and observation_key in invalid_keys:
        raise ValueError(
            "Duplicate or reserved observation key {!r}.".format(
                observation_key))

    if pixels_only:
        self.observation_space = spaces.Dict()
    elif self._observation_is_dict:
        self.observation_space = copy.deepcopy(wrapped_observation_space)
    else:
        self.observation_space = spaces.Dict()
        self.observation_space.spaces[STATE_KEY] = wrapped_observation_space

    self._normalize = normalize

    # Extend observation space with pixels.
    pixels = self._get_pixels()

    if np.issubdtype(pixels.dtype, np.integer):
        low, high = (0, 255)
    elif np.issubdtype(pixels.dtype, np.floating):
        # Unbounded, so this also covers pixels normalized to [-1, 1].
        low, high = (-float('inf'), float('inf'))
    else:
        raise TypeError(pixels.dtype)

    pixels_space = spaces.Box(shape=pixels.shape,
                              low=low,
                              high=high,
                              dtype=pixels.dtype)
    self.observation_space.spaces[observation_key] = pixels_space
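# Sketch of the "depthwise concatenation" of multi-camera renders mentioned
# above, assuming each camera returns an HxWx3 uint8 image (shapes here are
# illustrative only):
import numpy as np

camera_images = [np.zeros((84, 84, 3), dtype=np.uint8) for _ in range(2)]
pixels = np.concatenate(camera_images, axis=-1)
assert pixels.shape == (84, 84, 6)  # channels from both cameras stack depthwise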
tf1, tf, tfv = try_import_tf()
_, nn = try_import_torch()

DICT_SPACE = spaces.Dict({
    "sensors": spaces.Dict({
        "position": spaces.Box(low=-100, high=100, shape=(3, )),
        "velocity": spaces.Box(low=-1, high=1, shape=(3, )),
        "front_cam": spaces.Tuple(
            (spaces.Box(low=0, high=1, shape=(10, 10, 3)),
             spaces.Box(low=0, high=1, shape=(10, 10, 3)))),
        "rear_cam": spaces.Box(low=0, high=1, shape=(10, 10, 3)),
    }),
    "inner_state": spaces.Dict({
        "charge": spaces.Discrete(100),
        "job_status": spaces.Dict({
            "task": spaces.Discrete(5),
            "progress": spaces.Box(low=0, high=100, shape=()),
        })
    })
})

DICT_SAMPLES = [DICT_SPACE.sample() for _ in range(10)]

# NOTE: the snippet was cut off mid-definition here; the body below is a
# plausible reconstruction of a nested tuple space mirroring DICT_SPACE,
# not necessarily the original definition.
TUPLE_SPACE = spaces.Tuple([
    spaces.Box(low=-100, high=100, shape=(3, )),
    spaces.Tuple((spaces.Box(low=0, high=1, shape=(10, 10, 3)),
                  spaces.Box(low=0, high=1, shape=(10, 10, 3)))),
    spaces.Discrete(5),
])

TUPLE_SAMPLES = [TUPLE_SPACE.sample() for _ in range(10)]
def __init__(self, env=None): # BowV1Env attributes self.env_id = 'NovelGridworld-Bow-v1' self.env = env # env to restore in reset self.map_size = 10 self.map = np.zeros((self.map_size, self.map_size), dtype=int) # 2D Map self.agent_location = (1, 1) # row, column self.direction_id = {'NORTH': 0, 'SOUTH': 1, 'WEST': 2, 'EAST': 3} self.agent_facing_str = 'NORTH' self.agent_facing_id = self.direction_id[self.agent_facing_str] self.block_in_front_str = 'air' self.block_in_front_id = 0 # air self.block_in_front_location = (0, 0) # row, column self.items = { 'air', 'bow', 'crafting_table', 'plank', 'stick', 'string', 'tree_log', 'wall', 'wool' } self.items_id = self.set_items_id( self.items ) # {'crafting_table': 1, 'plank': 2, ...} # air's ID is 0 self.unbreakable_items = {'air', 'wall'} self.goal_item_to_craft = 'bow' # items_quantity when the episode starts, do not include wall, quantity must be more than 0 self.items_quantity = {'crafting_table': 1, 'tree_log': 3, 'wool': 2} self.inventory_items_quantity = {item: 0 for item in self.items} self.selected_item = '' self.entities = set() self.available_locations = [] # locations that do not have item placed self.not_available_locations = [ ] # locations that have item placed or are above, below, left, right to an item # Action Space self.actions_id = dict() self.manipulation_actions_id = { 'Forward': 0, 'Left': 1, 'Right': 2, 'Break': 3, 'Extract_string': 4 } self.actions_id.update(self.manipulation_actions_id) self.recipes = { 'bow': { 'input': { 'stick': 3, 'string': 3 }, 'output': { 'bow': 1 } }, 'stick': { 'input': { 'plank': 2 }, 'output': { 'stick': 4 } }, 'plank': { 'input': { 'tree_log': 1 }, 'output': { 'plank': 4 } } } # Add a Craft action for each recipe self.craft_actions_id = { 'Craft_' + item: len(self.actions_id) + i for i, item in enumerate(sorted(self.recipes.keys())) } self.actions_id.update(self.craft_actions_id) # Add a Select action for each item except unbreakable items self.select_actions_id = { 'Select_' + item: len(self.actions_id) + i for i, item in enumerate( sorted(list(self.items ^ self.unbreakable_items))) } self.actions_id.update(self.select_actions_id) self.action_space = spaces.Discrete(len(self.actions_id)) self.last_action = 'Forward' # last actions executed self.step_count = 0 # no. of steps taken self.last_step_cost = 0 # last received step_cost # Observation Space self.max_items = 20 self.observation_space = spaces.Box(low=0, high=self.max_items, shape=(self.map_size, self.map_size, 1)) self.observation_space = spaces.Dict({'map': self.observation_space}) # Reward self.last_reward = 0 # last received reward self.reward_intermediate = 10 self.reward_done = 50 self.last_done = False # last done
def __init__(
        self,
        render_dt_msec=0,
        action_l2norm_penalty=0,  # disabled for now
        render_onscreen=False,
        render_size=84,
        reward_type="dense",
        action_scale=1.0,
        target_radius=0.60,
        boundary_dist=4,
        ball_radius=0.50,
        include_colors_in_obs=False,
        walls=None,
        num_colors=8,
        change_colors=True,
        fixed_colors=False,
        fixed_goal=None,
        randomize_position_on_reset=True,
        images_are_rgb=False,  # else black and white
        show_goal=True,
        use_env_labels=False,
        num_objects=1,
        include_white=False,
        **kwargs):
    if walls is None:
        walls = []
    if fixed_goal is not None:
        fixed_goal = np.array(fixed_goal)
    if len(kwargs) > 0:
        LOGGER = logging.getLogger(__name__)
        LOGGER.warning("Ignoring kwargs: %s", kwargs)
    self.quick_init(locals())
    self.render_dt_msec = render_dt_msec
    self.action_l2norm_penalty = action_l2norm_penalty
    self.render_onscreen = render_onscreen
    self.render_size = render_size
    self.reward_type = reward_type
    self.action_scale = action_scale
    self.target_radius = target_radius
    self.boundary_dist = boundary_dist
    self.ball_radius = ball_radius
    self.walls = walls
    self.fixed_goal = fixed_goal
    self.randomize_position_on_reset = randomize_position_on_reset
    self.images_are_rgb = images_are_rgb
    self.show_goal = show_goal
    self.num_objects = num_objects
    self.max_target_distance = self.boundary_dist - self.target_radius
    self.change_colors = change_colors
    self.fixed_colors = fixed_colors
    self.num_colors = num_colors

    self._target_position = None
    self._position = np.zeros((2))
    self.include_white = include_white
    self.initial_pass = False
    self.white_passed = False
    if change_colors:
        self.randomize_colors()
    else:
        self.object_colors = [Color('blue')]

    u = np.ones(2)
    self.action_space = spaces.Box(-u, u, dtype=np.float32)
    self.include_colors_in_obs = include_colors_in_obs

    o = self.boundary_dist * np.ones(2)
    ohe_range = spaces.Box(np.zeros(self.num_colors),
                           np.ones(self.num_colors),
                           dtype='float32')
    if include_colors_in_obs and self.fixed_colors:
        high = np.concatenate((o, np.ones(self.num_colors)))
        low = np.concatenate((-o, np.zeros(self.num_colors)))
    else:
        high = o
        low = -o

    self.obs_range = spaces.Box(low, high, dtype='float32')
    self.use_env_labels = use_env_labels
    if self.use_env_labels:
        self.observation_space = spaces.Dict([
            ('label', ohe_range),
            ('observation', self.obs_range),
            ('desired_goal', self.obs_range),
            ('achieved_goal', self.obs_range),
            ('state_observation', self.obs_range),
            ('state_desired_goal', self.obs_range),
            ('state_achieved_goal', self.obs_range),
        ])
    else:
        self.observation_space = spaces.Dict([
            ('observation', self.obs_range),
            ('desired_goal', self.obs_range),
            ('achieved_goal', self.obs_range),
            ('state_observation', self.obs_range),
            ('state_desired_goal', self.obs_range),
            ('state_achieved_goal', self.obs_range),
        ])

    self.drawer = None
    self.render_drawer = None
    self.color_index = 0
    self.colors = [
        Color('green'),
        Color('red'),
        Color('blue'),
        Color('black'),
        Color('purple'),
        Color('brown'),
        Color('pink'),
        Color('orange'),
        Color('grey'),
        Color('yellow')
    ]
def _init_train(self): if self.config.RL.DDPPO.force_distributed: self._is_distributed = True if is_slurm_batch_job(): add_signal_handlers() if self._is_distributed: local_rank, tcp_store = init_distrib_slurm( self.config.RL.DDPPO.distrib_backend ) if rank0_only(): logger.info( "Initialized DD-PPO with {} workers".format( torch.distributed.get_world_size() ) ) self.config.defrost() self.config.TORCH_GPU_ID = local_rank self.config.SIMULATOR_GPU_ID = local_rank # Multiply by the number of simulators to make sure they also get unique seeds self.config.TASK_CONFIG.SEED += ( torch.distributed.get_rank() * self.config.NUM_ENVIRONMENTS ) self.config.freeze() random.seed(self.config.TASK_CONFIG.SEED) np.random.seed(self.config.TASK_CONFIG.SEED) torch.manual_seed(self.config.TASK_CONFIG.SEED) self.num_rollouts_done_store = torch.distributed.PrefixStore( "rollout_tracker", tcp_store ) self.num_rollouts_done_store.set("num_done", "0") if rank0_only() and self.config.VERBOSE: logger.info(f"config: {self.config}") profiling_wrapper.configure( capture_start_step=self.config.PROFILING.CAPTURE_START_STEP, num_steps_to_capture=self.config.PROFILING.NUM_STEPS_TO_CAPTURE, ) self._init_envs() ppo_cfg = self.config.RL.PPO if torch.cuda.is_available(): self.device = torch.device("cuda", self.config.TORCH_GPU_ID) torch.cuda.set_device(self.device) else: self.device = torch.device("cpu") if rank0_only() and not os.path.isdir(self.config.CHECKPOINT_FOLDER): os.makedirs(self.config.CHECKPOINT_FOLDER) self._setup_actor_critic_agent(ppo_cfg) if self._is_distributed: self.agent.init_distributed(find_unused_params=True) logger.info( "agent number of parameters: {}".format( sum(param.numel() for param in self.agent.parameters()) ) ) obs_space = self.obs_space if self._static_encoder: self._encoder = self.actor_critic.net.visual_encoder obs_space = spaces.Dict( { "visual_features": spaces.Box( low=np.finfo(np.float32).min, high=np.finfo(np.float32).max, shape=self._encoder.output_shape, dtype=np.float32, ), **obs_space.spaces, } ) self._nbuffers = 2 if ppo_cfg.use_double_buffered_sampler else 1 self.rollouts = RolloutStorage( ppo_cfg.num_steps, self.envs.num_envs, obs_space, self.envs.action_spaces[0], ppo_cfg.hidden_size, num_recurrent_layers=self.actor_critic.net.num_recurrent_layers, is_double_buffered=ppo_cfg.use_double_buffered_sampler, ) self.rollouts.to(self.device) observations = self.envs.reset() batch = batch_obs( observations, device=self.device, cache=self._obs_batching_cache ) batch = apply_obs_transforms_batch(batch, self.obs_transforms) if self._static_encoder: with torch.no_grad(): batch["visual_features"] = self._encoder(batch) self.rollouts.buffers["observations"][0] = batch self.current_episode_reward = torch.zeros(self.envs.num_envs, 1) self.running_episode_stats = dict( count=torch.zeros(self.envs.num_envs, 1), reward=torch.zeros(self.envs.num_envs, 1), ) self.window_episode_stats = defaultdict( lambda: deque(maxlen=ppo_cfg.reward_window_size) ) self.env_time = 0.0 self.pth_time = 0.0 self.t_start = time.time()
def __init__( self, wad_name='semantic_goal_map_dev.wad', input_types=(RGB, TIME, AUTOMAP), episode_start_time=14, episode_timeout=1000, max_actions=None, set_window_visible=False, interactive=False, randomize_textures=1, randomize_maps=None, repeat_count=3, n_actions=DEFAULT_N_ACTIONS, set_automap_buffer_enabled=True, set_render_crosshair=False, set_render_corpses=False, set_render_decals=False, set_render_effects_sprites=False, set_render_hud=False, set_render_messages=False, set_render_minimal_hud=False, set_render_particles=False, set_render_weapon=True, set_screen_resolution=ScreenResolution.RES_320X240, set_sound_enabled=False, max_onscreen_objects=30, ): ''' Args: wad_name: .wad file to use sensor_type: subset of ['RGB', 'D', 'L'] RGB: RGB image as uint8 D: Depth as uint8 L: Labels (semantic, pixelwise) randomize_textures: wad must support randomizing textures randomize_maps: (map_number_low, map_number_high) ''' game = DoomGame() scenarios_dir = os.path.join(os.path.dirname(__file__), 'scenarios') game_path = os.path.join(os.path.dirname(__file__), 'freedoom2.wad') assert os.path.isfile(game_path) game.set_doom_scenario_path(os.path.join(scenarios_dir, wad_name)) # if map_cfg is None: # raise ValueError("map_cfg cannot be none--needed to normalize agent coords.") # game.set_doom_game_path(game_path) game.set_doom_map("map01") # Get observation spaces _obs_space = {} if set_screen_resolution not in VIABLE_SCREEN_RESOLUTIONS: raise NotImplementedError( 'Some screen resolutions do not work with gym.Monitor. Screen resolution must be one of {}' .format(VIABLE_SCREEN_RESOLUTIONS)) else: game.set_screen_resolution(set_screen_resolution) game.set_screen_format(ScreenFormat.RGB24) game.set_render_hud(set_render_hud) game.set_render_minimal_hud( set_render_minimal_hud) # If hud is enabled game.set_render_crosshair(set_render_crosshair) game.set_render_weapon(set_render_weapon) game.set_render_decals(set_render_decals) game.set_render_particles(set_render_particles) game.set_render_effects_sprites(set_render_effects_sprites) game.set_render_messages(set_render_messages) game.set_render_corpses(set_render_corpses) game.add_available_game_variable(GameVariable.AMMO2) game.set_episode_timeout(episode_timeout) game.set_episode_start_time(episode_start_time) game.set_window_visible(set_window_visible) game.set_sound_enabled(set_sound_enabled) if interactive: game.set_mode(Mode.SPECTATOR) else: game.set_mode(Mode.PLAYER) assert n_actions == 3, "Number of actions must be 3" game.add_available_button(Button.TURN_LEFT) game.add_available_button(Button.TURN_RIGHT) game.add_available_button(Button.MOVE_FORWARD) #game.add_game_args("-host 1 -deathmatch +sv_forcerespawn 1 +sv_noautoaim 1\ #+sv_respawnprotect 1 +sv_cheats 1") # +sv_spawnfarthest game.add_game_args("+sv_cheats 1") # +sv_spawnfarthest 1 if RGB in input_types: _obs_space[INPUT_TYPE_TO_SENSOR_NAME[RGB]] = spaces.Box( 0, 255, (game.get_screen_height(), game.get_screen_width(), game.get_screen_channels()), dtype=np.uint8) game.set_depth_buffer_enabled(DEPTH in input_types) if DEPTH in input_types: _obs_space[INPUT_TYPE_TO_SENSOR_NAME[DEPTH]] = spaces.Box( 0, 255, (game.get_screen_height(), game.get_screen_width(), 1), dtype=np.uint8) game.set_labels_buffer_enabled(LABELS in input_types) if LABELS in input_types: _obs_space[INPUT_TYPE_TO_SENSOR_NAME[LABELS]] = spaces.Box( 0, 255, (game.get_screen_height(), game.get_screen_width(), 1), dtype=np.uint8) _obs_space[INPUT_TYPE_TO_SENSOR_NAME[OBJECT_ID_NUM]] = spaces.Box( 0, 255, 
(max_onscreen_objects, ), dtype=np.uint8) _obs_space[ INPUT_TYPE_TO_SENSOR_NAME[OBJECT_LOC_BBOX]] = spaces.Box( 0, 255, (max_onscreen_objects, 4), dtype=np.uint8) if TIME in input_types: _obs_space[INPUT_TYPE_TO_SENSOR_NAME[TIME]] = spaces.Box( 0, episode_timeout, (1, ), dtype=np.float32) if GOAL in input_types: raise NotImplementedError( "GOAL observation not defined in base environment") game.set_automap_buffer_enabled(AUTOMAP in input_types) #set_automap_buffer_enabled) if AUTOMAP in input_types: game.set_automap_mode(AutomapMode.OBJECTS_WITH_SIZE) game.add_game_args("+viz_am_center 1") # Fixed map _obs_space[INPUT_TYPE_TO_SENSOR_NAME[AUTOMAP]] = spaces.Box( 0, 255, (game.get_screen_height(), game.get_screen_width(), game.get_screen_channels()), dtype=np.uint8) self.observation_space = spaces.Dict(_obs_space) self.action_space = spaces.Discrete(n_actions) self.game = game self.max_actions = max_actions self.max_onscreen_objects = max_onscreen_objects self.episode_number = 0 self.randomize_maps = randomize_maps self.randomize_textures = randomize_textures self.visualize = set_window_visible self.interactive = interactive self.interactive_delay = 0.02 self.repeat_count = repeat_count self.info = {'skip.repeat_count': self.repeat_count} self.screen_resolution = set_screen_resolution # self.viewer = None self.game.init() super().__init__()
def dict2dict_space(d): if isinstance(d, dict): return spaces.Dict({k: dict2dict_space(v) for k, v in d.items()}) else: return d
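# Assumed usage of dict2dict_space: a plain nested dict of gym spaces is
# converted into an equivalent spaces.Dict tree (leaf spaces pass through
# unchanged).
from gym import spaces

plain = {
    'agent': {
        'position': spaces.Box(low=-1, high=1, shape=(3, )),
        'id': spaces.Discrete(4),
    },
    'step': spaces.Discrete(100),
}
nested_space = dict2dict_space(plain)
assert isinstance(nested_space, spaces.Dict)
assert isinstance(nested_space.spaces['agent'], spaces.Dict)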
def __init__(self, config_file, **kwargs): gym.Env.__init__(self) Receptor.__init__(self) self.config = config = Configuration.from_file(config_file) # Add any optional args for k, v in kwargs.items(): config.set(k, v) self.name = config.name self._max_episode_steps = config.get('max_episode_steps') if 'max_episode_steps' in config else None self.hot_start = config.get('hot_start', 1) if config.get('render', False): distance = config.get('camera_distance', 2.0) yaw = config.get('camera_yaw', 180) pitch = config.get('camera_pitch', -41) target_position = config.get('camera_target_position', [0.0, 0.20, 0.50]) cId = p.connect(p.GUI)#, options="--opengl2") # cId = p.connect(p.GUI, "window_backend=2") # HACK p.resetDebugVisualizerCamera(distance, yaw, pitch, target_position) p.configureDebugVisualizer(p.COV_ENABLE_SEGMENTATION_MARK_PREVIEW, 0) else: cId = p.connect(p.SHARED_MEMORY) if cId < 0: p.connect(p.DIRECT) # p.loadPlugin("eglRendererPlugin") # HACK p.setAdditionalSearchPath(pybullet_data.getDataPath()) p.resetSimulation() timestep = config.get('timestep', 1 / 240.) sub_steps = int(1. / config.get('update_freq', 100) / timestep) iterations = config.get('solver_iterations', 150) p.setPhysicsEngineParameter(numSolverIterations=iterations, numSubSteps=sub_steps, fixedTimeStep=timestep) gravity = config.get('gravity', [0.0, 0.0, -9.81]) p.setGravity(gravity[0], gravity[1], gravity[2]) self.models = OrderedDict( sorted({child.name: Model(child) for child in config.find_all('model')}.items(), key=lambda t: t[0])) self.addons = OrderedDict( sorted( {child.name: AddonFactory.build(child.get('addon'), self, child) for child in config.find_all('addon')}.items(), key=lambda t: t[0])) self.receptors = OrderedDict(sorted({**self.models, self.name: self}.items(), key=lambda t: t[0])) self.collapse_rewards_func = sum if config.get('sum_rewards', False) else None self.collapse_terminals_func = any if config.get( 'terminal_if_any', False) else all if config.get('terminal_if_all', False) else None self.flatten_observations = config.get('flatten_observations', False) self.flatten_actions=config.get('flatten_actions', False) self.action_repeat=config.get('action_repeat', 1) self.seed() self.reset() self.observation_space, self.action_space = spaces.Dict({}), spaces.Dict({}) for name, receptor in self.receptors.items(): obs_space, act_space = receptor.build_spaces() if len(obs_space.spaces): self.observation_space.spaces[name] = obs_space if len(act_space.spaces): self.action_space.spaces[name] = act_space if self.flatten_observations: lows, highs = [flatten(get_bounds_for_space(self.observation_space, opt)) for opt in [True, False]] self.original_observation_space = self.observation_space self.observation_space = gym.spaces.Box(low=lows, high=highs) if self.flatten_actions: lows, highs = (flatten(get_bounds_for_space(self.action_space, opt)) for opt in [True, False]) self.original_action_space = self.action_space self.action_space = gym.spaces.Box(low=lows, high=highs)
def __init__(
    self, env, pixels_only=True, render_kwargs=None, pixel_keys=("pixels",)
):
    """Initializes a new pixel Wrapper.

    Args:
        env: The environment to wrap.
        pixels_only: If `True` (default), the original observation returned
            by the wrapped environment will be discarded, and a dictionary
            observation will only include pixels. If `False`, the
            observation dictionary will contain both the original
            observations and the pixel observations.
        render_kwargs: Optional `dict` containing keyword arguments passed
            to the `self.render` method.
        pixel_keys: Optional custom string specifying the pixel
            observation's key in the `OrderedDict` of observations.
            Defaults to 'pixels'.

    Raises:
        ValueError: If `env`'s observation spec is not compatible with the
            wrapper. Supported formats are a single array, or a dict of
            arrays.
        ValueError: If `env`'s observation already contains any of the
            specified `pixel_keys`.
    """
    super(PixelObservationWrapper, self).__init__(env)

    if render_kwargs is None:
        render_kwargs = {}

    for key in pixel_keys:
        render_kwargs.setdefault(key, {})

        render_mode = render_kwargs[key].pop("mode", "rgb_array")
        assert render_mode == "rgb_array", render_mode
        render_kwargs[key]["mode"] = "rgb_array"

    wrapped_observation_space = env.observation_space

    if isinstance(wrapped_observation_space, spaces.Box):
        self._observation_is_dict = False
        invalid_keys = set([STATE_KEY])
    elif isinstance(wrapped_observation_space, (spaces.Dict, MutableMapping)):
        self._observation_is_dict = True
        invalid_keys = set(wrapped_observation_space.spaces.keys())
    else:
        raise ValueError("Unsupported observation space structure.")

    if not pixels_only:
        # Make sure that no keys in `pixel_keys` overlap with the
        # observation keys.
        overlapping_keys = set(pixel_keys) & set(invalid_keys)
        if overlapping_keys:
            raise ValueError(
                "Duplicate or reserved pixel keys {!r}.".format(overlapping_keys)
            )

    if pixels_only:
        self.observation_space = spaces.Dict()
    elif self._observation_is_dict:
        self.observation_space = copy.deepcopy(wrapped_observation_space)
    else:
        self.observation_space = spaces.Dict()
        self.observation_space.spaces[STATE_KEY] = wrapped_observation_space

    # Extend observation space with pixels.
    pixels_spaces = {}
    for pixel_key in pixel_keys:
        pixels = self.env.render(**render_kwargs[pixel_key])

        if np.issubdtype(pixels.dtype, np.integer):
            low, high = (0, 255)
        elif np.issubdtype(pixels.dtype, np.floating):
            low, high = (-float("inf"), float("inf"))
        else:
            raise TypeError(pixels.dtype)

        pixels_space = spaces.Box(
            shape=pixels.shape, low=low, high=high, dtype=pixels.dtype
        )
        pixels_spaces[pixel_key] = pixels_space

    self.observation_space.spaces.update(pixels_spaces)

    self._env = env
    self._pixels_only = pixels_only
    self._render_kwargs = render_kwargs
    self._pixel_keys = pixel_keys
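# Illustrative usage with a stub env. This is a sketch under assumptions:
# STATE_KEY is not "pixels", the wrapper subclasses gym.ObservationWrapper,
# and the stub below is not part of the original code.
import gym
import numpy as np
from gym import spaces

class _StubEnv(gym.Env):
    observation_space = spaces.Box(-1.0, 1.0, shape=(4, ), dtype=np.float32)
    action_space = spaces.Discrete(2)

    def reset(self):
        return np.zeros(4, dtype=np.float32)

    def step(self, action):
        return np.zeros(4, dtype=np.float32), 0.0, False, {}

    def render(self, mode="rgb_array"):
        # An 8x8 RGB frame stands in for a real rendering backend.
        return np.zeros((8, 8, 3), dtype=np.uint8)

wrapped = PixelObservationWrapper(_StubEnv(), pixels_only=False)
assert "pixels" in wrapped.observation_space.spaces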
def __init__(self, task_class, observation_mode='state', action_mode='joint', multi_action_space=False, discrete_action_space=False): self.num_skip_control = 20 self.epi_obs = [] self.obs_record = True self.obs_record_id = None self._observation_mode = observation_mode self.obs_config = ObservationConfig() if observation_mode == 'state': self.obs_config.set_all_high_dim(False) self.obs_config.set_all_low_dim(True) elif observation_mode == 'state-goal': self.obs_config.set_all_high_dim(False) self.obs_config.set_all_low_dim(True) self.obs_config.set_goal_info(True) elif observation_mode == 'vision': self.obs_config.set_all(False) self.obs_config.set_camera_rgb(True) elif observation_mode == 'both': self.obs_config.set_all(True) else: raise ValueError('Unrecognised observation_mode: %s.' % observation_mode) self._action_mode = action_mode self.ac_config = None if action_mode == 'joint': self.ac_config = ActionConfig( SnakeRobotActionConfig.ABS_JOINT_POSITION) elif action_mode == 'trigon': self.ac_config = ActionConfig( SnakeRobotActionConfig.TRIGON_MODEL_PARAM, is_discrete=discrete_action_space) else: raise ValueError('Unrecognised action_mode: %s.' % action_mode) self.env = Environment(action_config=self.ac_config, obs_config=self.obs_config, headless=True) self.env.launch() self.task = self.env.get_task(task_class) self.max_episode_steps = self.task.episode_len _, obs = self.task.reset() if action_mode == 'joint': self.action_space = spaces.Box( low=-1.7, high=1.7, shape=(self.ac_config.action_size, ), dtype=np.float32) elif action_mode == 'trigon': if multi_action_space: # action_space1 = spaces.MultiBinary(n=1) low1 = np.array([-0.8, -0.8]) high1 = np.array([0.8, 0.8]) action_space1 = spaces.Box(low=low1, high=high1, dtype=np.float32) # low = np.array([-0.8, -0.8, 1.0, 3.0, -50, -10, -0.1, -0.1]) # high = np.array([0.8, 0.8, 3.0, 5.0, 50, 10, 0.1, 0.1]) low2 = np.array([1.0]) high2 = np.array([3.0]) action_space2 = spaces.Box(low=low2, high=high2, dtype=np.float32) self.action_space = spaces.Tuple( (action_space1, action_space2)) elif discrete_action_space: self.action_space = spaces.MultiDiscrete([4, 4, 4]) else: # low = np.array([0.0, -0.8, -0.8, 1.0, 3.0, -50, -10, -0.1, -0.1]) # high = np.array([1.0, 0.8, 0.8, 3.0, 5.0, 50, 10, 0.1, 0.1]) low = np.array([-1, -1, -1]) high = np.array([1, 1, 1]) self.action_space = spaces.Box(low=low, high=high, dtype=np.float32) if observation_mode == 'state' or observation_mode == 'state-goal': self.observation_space = spaces.Box( low=-np.inf, high=np.inf, shape=(obs.get_low_dim_data().shape[0] * self.num_skip_control, )) if observation_mode == 'state-goal': self.goal_dim = obs.get_goal_dim() elif observation_mode == 'vision': self.observation_space = spaces.Box( low=0, high=1, shape=obs.head_camera_rgb.shape) elif observation_mode == 'both': self.observation_space = spaces.Dict({ "state": spaces.Box(low=-np.inf, high=np.inf, shape=obs.get_low_dim_data().shape), "rattler_eye_rgb": spaces.Box(low=0, high=1, shape=obs.head_camera_rgb.shape), "rattler_eye_depth": spaces.Box(low=0, high=1, shape=obs.head_camera_depth.shape), }) self._gym_cam = None
def __init__(self, map_size_vertical, map_size_horizontal, no_goals,
             view_straightforward, view_left_right, start_position_x,
             start_position_y, start_orientation, reward_range, max_no_steps):
    # set some parameters
    self.map_size_vertical = map_size_vertical
    self.map_size_horizontal = map_size_horizontal
    self.no_goals = no_goals
    self.view_straightforward = view_straightforward
    self.view_left_right = view_left_right
    self.reward_range = reward_range
    self.max_no_steps = max_no_steps
    self.living_reward = 0.5 / self.max_no_steps

    # determine goal rewards
    if self.no_goals == 2:
        self.goal_rewards = [0.25, 0.75]
    elif self.no_goals == 3:
        self.goal_rewards = [0.125, 0.125, 0.75]
    elif self.no_goals == 4:
        self.goal_rewards = [0.1, 0.1, 0.1, 0.7]

    # build map with dummy surrounding
    self.map = np.zeros(
        (self.no_goals + 1,
         map_size_vertical + 2 * (self.view_straightforward - 1),
         map_size_horizontal + 2 * (self.view_straightforward - 1)),
        dtype=np.float32)
    self.map[0, :, :] = 1
    self.map[0,
             (self.view_straightforward - 1):(1 - self.view_straightforward),
             (self.view_straightforward - 1):(1 - self.view_straightforward)] = 0

    # shift positions due to dummy surrounding
    self.start_position_x = self.position_x = start_position_x + self.view_straightforward - 1
    self.start_position_y = self.position_y = start_position_y + self.view_straightforward - 1
    self.start_orientation = self.orientation = start_orientation  # down=0, right=1, up=2, left=3

    # determine random goal positions and place corresponding indicators in
    # the map. Flat indices encode (x, y) as index = y * width + x, so
    # x = index % map_size_horizontal and y = index // map_size_horizontal
    # (dividing by map_size_vertical would be wrong for non-square maps).
    self.start_position_index = (start_position_y * map_size_horizontal) + start_position_x
    self.max_goal_index = (map_size_horizontal * map_size_vertical) - 1

    # As in the original per-goal logic, each goal index is only re-sampled
    # while it collides with the *previous* index (goal 1 with the start
    # position); np.random.randint(0, n + 1) replaces the deprecated and
    # equivalent np.random.random_integers(0, n).
    previous_index = self.start_position_index
    for goal_no in range(1, 5):
        goal_index = np.random.randint(0, self.max_goal_index + 1)
        while goal_index == previous_index:
            goal_index = np.random.randint(0, self.max_goal_index + 1)
        goal_position_x = (goal_index % self.map_size_horizontal
                           ) + self.view_straightforward - 1
        goal_position_y = (goal_index // self.map_size_horizontal
                           ) + self.view_straightforward - 1
        setattr(self, 'goal%d_position_x' % goal_no, goal_position_x)
        setattr(self, 'goal%d_position_y' % goal_no, goal_position_y)
        if goal_no <= self.no_goals:
            self.map[goal_no, goal_position_y, goal_position_x] = 1.
        previous_index = goal_index
    # set observation and action space
    observation_shape = (self.map.shape[0], self.view_straightforward,
                         2 * self.view_left_right + 1)
    self.observation_space = spaces.Dict({
        "position":
        spaces.MultiDiscrete([self.map.shape[2], self.map.shape[1], 4]),
        "observation":
        spaces.Box(low=0., high=1., shape=observation_shape, dtype=np.float32)
    })
    self.action_space = spaces.Discrete(3)

    self.last_done = False
    self.last_obs = np.zeros(observation_shape)

    # init internal state and step counter
    self.internal_state = 'Empty'
    self.step_counter = 0
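# Sanity check for the flat-index <-> (x, y) encoding used above for goal
# placement (index = y * width + x, so x = index % width, y = index // width):
width, height = 7, 5
for index in range(width * height):
    x, y = index % width, index // width
    assert index == y * width + x and 0 <= x < width and 0 <= y < height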
def __init__(self,
             config: Config,
             dataset: Optional[Dataset] = None) -> None:
    """Constructor. Mimics the :ref:`Env` constructor.

    Args:
        config: config for the environment. Should contain id for simulator
            and ``task_name`` for each task, which are passed into
            ``make_sim`` and ``make_task``.
        dataset: reference to dataset used for first task instance level
            information. Can be defined as :py:`None`, in which case the
            dataset will be built using ``make_dataset`` and ``config``.
    """
    # let superclass instantiate current task by merging first task in TASKS to TASK
    if len(config.MULTI_TASK.TASKS):
        logger.info(
            "Overwriting config.TASK ({}) with first entry in config.MULTI_TASK.TASKS ({})."
            .format(config.TASK.TYPE, config.MULTI_TASK.TASKS[0].TYPE))
        config.defrost()
        config.TASK.merge_from_other_cfg(config.MULTI_TASK.TASKS[0])
        config.freeze()
    # the TASKS[0] dataset has higher priority than the default one (if
    # specified), so instantiate it before calling the superclass constructor
    if dataset is None and config.MULTI_TASK.TASKS[0].DATASET.TYPE:
        dataset = make_dataset(
            id_dataset=config.MULTI_TASK.TASKS[0].DATASET.TYPE,
            config=config.MULTI_TASK.TASKS[0].DATASET,
        )
    # initialize first task leveraging Env
    super().__init__(config, dataset=dataset)
    # instantiate other tasks
    self._tasks = [self._task]
    self._curr_task_idx = 0
    # keep each task's episode iterator to avoid re-creation
    self._episode_iterators = [self._episode_iterator]
    for task in config.MULTI_TASK.TASKS[1:]:
        self._tasks.append(
            make_task(
                task.TYPE,
                config=task,
                sim=self._sim,
                # each task gets its own dataset
                dataset=make_dataset(  # TODO: lazy make_dataset support
                    id_dataset=task.DATASET.TYPE,
                    config=task.DATASET),
            ))
        # get task episode iterator
        iter_option_dict = {
            k.lower(): v
            for k, v in task.EPISODE_ITERATOR_OPTIONS.items()
        }
        iter_option_dict["seed"] = self._config.SEED
        task_ep_iterator: EpisodeIterator = self._tasks[
            -1]._dataset.get_episode_iterator(**iter_option_dict)
        self._episode_iterators.append(task_ep_iterator)
    # episode counter
    self._eps_counter = -1
    self._cumulative_steps_counter = 0
    # when and how to change task is defined here
    self._task_sampling_behavior = (
        config.MULTI_TASK.TASK_ITERATOR.TASK_SAMPLING)
    self._change_task_behavior = (
        config.MULTI_TASK.TASK_ITERATOR.TASK_CHANGE_TIMESTEP)
    # a custom task label can be specified
    self._curr_task_label = self._task._config.get("TASK_LABEL",
                                                   self._curr_task_idx)
    # add task_idx to observation space
    self.observation_space = spaces.Dict({
        **self._sim.sensor_suite.observation_spaces.spaces,
        **self._task.sensor_suite.observation_spaces.spaces,
        "task_idx": spaces.Discrete(len(self._tasks)),
    })
def __init__(self): """ Initialise the environment """ self.random_goal = True self.goal_oriented = False self.obs_type = 5 self.reward_type = 1 self.action_type = 1 # Define action space self.action_space = spaces.Box( low=np.float32( np.array([-0.5, -0.25, -0.25, -0.25, -0.5, -0.005]) / 30), high=np.float32( np.array([0.5, 0.25, 0.25, 0.25, 0.5, 0.005]) / 30), dtype=np.float32) # Define observation space if self.obs_type == 1: self.obs_space_low = np.float32( np.concatenate((MIN_END_EFF_COORDS, JOINT_MIN), axis=0)) self.obs_space_high = np.float32( np.concatenate((MAX_END_EFF_COORDS, JOINT_MAX), axis=0)) elif self.obs_type == 2: self.obs_space_low = np.float32( np.concatenate((MIN_GOAL_COORDS, JOINT_MIN), axis=0)) self.obs_space_high = np.float32( np.concatenate((MAX_GOAL_COORDS, JOINT_MAX), axis=0)) elif self.obs_type == 3: self.obs_space_low = np.float32( np.concatenate(([-1.0] * 6, JOINT_MIN), axis=0)) self.obs_space_high = np.float32( np.concatenate(([1.0] * 6, JOINT_MAX), axis=0)) elif self.obs_type == 4: self.obs_space_low = np.float32( np.concatenate(([-1.0] * 3, JOINT_MIN), axis=0)) self.obs_space_high = np.float32( np.concatenate(([1.0] * 3, JOINT_MAX), axis=0)) elif self.obs_type == 5: self.obs_space_low = np.float32( np.concatenate(([-1.0] * 6, MIN_GOAL_COORDS, JOINT_MIN), axis=0)) self.obs_space_high = np.float32( np.concatenate(([1.0] * 6, MAX_GOAL_COORDS, JOINT_MAX), axis=0)) self.observation_space = spaces.Box(low=self.obs_space_low, high=self.obs_space_high, dtype=np.float32) if self.goal_oriented: self.observation_space = spaces.Dict( dict(desired_goal=spaces.Box(low=np.float32(MIN_GOAL_COORDS), high=np.float32(MAX_GOAL_COORDS), dtype=np.float32), achieved_goal=spaces.Box( low=np.float32(MIN_END_EFF_COORDS), high=np.float32(MAX_END_EFF_COORDS), dtype=np.float32), observation=self.observation_space)) # Initialise goal position if self.random_goal: self.goal = self.sample_random_goal() else: self.goal = FIXED_GOAL_COORDS # Connect to physics client. By default, do not render self.physics_client = p.connect(p.DIRECT) # Load URDFs self.create_world() # reset environment self.reset()
def __init__(
        self,
        model_path: str,
        frame_skip: int,
        camera_settings: Optional[Dict] = None,
):
    """Initializes a new MuJoCo environment.

    Args:
        model_path: The path to the MuJoCo XML file.
        frame_skip: The number of simulation steps per environment step. On
            hardware this influences the duration of each environment step.
        camera_settings: Settings to initialize the simulation camera. This
            can contain the keys `distance`, `azimuth`, and `elevation`.
    """
    self._seed()
    if not os.path.isfile(model_path):
        raise IOError(
            '[MujocoEnv]: Model path does not exist: {}'.format(model_path))
    self.frame_skip = frame_skip

    self.sim_robot = MujocoSimRobot(model_path,
                                    camera_settings=camera_settings)
    self.sim = self.sim_robot.sim
    self.model = self.sim_robot.model
    self.data = self.sim_robot.data

    self.metadata = {
        'render.modes': ['human', 'rgb_array', 'depth_array'],
        'video.frames_per_second': int(np.round(1.0 / self.dt))
    }
    self.mujoco_render_frames = False

    self.init_qpos = self.data.qpos.ravel().copy()
    self.init_qvel = self.data.qvel.ravel().copy()
    observation, _reward, done, _info = self.step(np.zeros(self.model.nu))
    assert not done

    bounds = self.model.actuator_ctrlrange.copy()
    act_upper = bounds[:, 1]
    act_lower = bounds[:, 0]

    # Define the action and observation spaces.
    # HACK: MJRL is still using gym 0.9.x so we can't provide a dtype.
    try:
        self.action_space = spaces.Box(act_lower,
                                       act_upper,
                                       dtype=np.float32)
        if isinstance(observation, collections.abc.Mapping):
            self.observation_space = spaces.Dict({
                k: spaces.Box(-np.inf, np.inf, shape=v.shape,
                              dtype=np.float32)
                for k, v in observation.items()
            })
        else:
            self.obs_dim = np.sum([
                o.size for o in observation
            ]) if type(observation) is tuple else observation.size
            # Use obs_dim here so tuple observations are also handled
            # (observation.shape is undefined for tuples).
            self.observation_space = spaces.Box(-np.inf,
                                                np.inf, (self.obs_dim, ),
                                                dtype=np.float32)
    except TypeError:
        # Fallback case for gym 0.9.x.
        self.action_space = spaces.Box(act_lower, act_upper)
        assert not isinstance(
            observation, collections.abc.Mapping
        ), 'gym 0.9.x does not support dictionary observation.'
        self.obs_dim = np.sum([
            o.size for o in observation
        ]) if type(observation) is tuple else observation.size
        self.observation_space = spaces.Box(-np.inf, np.inf,
                                            observation.shape)
def __init__(self, env_id, using_gym_wrapper = True, state_list=None, max_steps=50, reward="sparse", using_demo_init=False, demo_path = "demos", render = False): """ :param env_id: env name :param using_gym_wrapper: whether using gym wrapper provided by SURREAL :param using_demo_init: whether apply initialization of demonstration states :param state_list: select some dimension of states for the observation (None means all) NOTE: this option is only used when not using gym wrapper (using_gym_wrapper = False) """ self.reward_type = reward self.env = suite.make(env_id, has_renderer=render, has_offscreen_renderer=False, use_object_obs=True, use_camera_obs=False, reward_shaping=False if reward == "sparse" else True, control_freq=10, ignore_done=True, ) if using_demo_init: self.env = DemoSamplerWrapper(self.env, demo_path = os.path.join(demo_path, env_id), scheme_ratios = [0.5, 0.5]) self.using_gym_wrapper = using_gym_wrapper if using_gym_wrapper: self.env = GymWrapper(self.env, keys = state_list if state_list is not None else ["robot-state", "object-state"]) ob, rew, done, _ = self.env.step(np.random.rand(self.env.dof)) if not using_gym_wrapper: # ob = self.env.sim.get_state().flatten() self.state_list = state_list if self.state_list: ob = np.concatenate([ob[i] for i in self.state_list]) self.max_episode_steps = max_steps self.n_steps = 0 self.n_actions = self.env.dof self.d_observations = ob.size self.d_goals = self.get_d_goals() self.observation_space = spaces.Dict({ # "observation": self.env.observation_space, "observation": spaces.Box(- np.inf * np.ones(self.d_observations), np.inf * np.ones(self.d_observations)), "desired_goal": spaces.Box(- np.inf * np.ones(self.d_goals), np.inf * np.ones(self.d_goals)), "achieved_goal": spaces.Box(- np.inf * np.ones(self.d_goals), np.inf * np.ones(self.d_goals)), }) self.action_space = self.env.action_space self.acc_rew = 0
from gym import spaces
import numpy as np
import pytest

base_spaces = [
    spaces.Discrete(n=10),
    spaces.Box(0, 1, [3, 32, 32], dtype=np.float32),
    spaces.Tuple([
        spaces.Discrete(n=10),
        spaces.Box(0, 1, [3, 32, 32], dtype=np.float32),
    ]),
    spaces.Dict({
        "x": spaces.Tuple([
            spaces.Discrete(n=10),
            spaces.Box(0, 1, [3, 32, 32], dtype=np.float32),
        ]),
        "t": spaces.Discrete(1),
    }),
]


def equals(value, expected) -> bool:
    assert type(value) == type(expected)
    if isinstance(value, (int, float, bool)):
        return value == expected
    if isinstance(value, np.ndarray):
        return value.tolist() == expected.tolist()
    if isinstance(value, (tuple, list)):
        assert len(value) == len(expected)
        # NOTE: the snippet was truncated at this point; the recursive
        # comparison below is a plausible completion, not the original body.
        return all(equals(v, e) for v, e in zip(value, expected))
    if isinstance(value, dict):
        assert value.keys() == expected.keys()
        return all(equals(value[k], expected[k]) for k in value)
    # Fall back to direct comparison (covers e.g. numpy scalar types).
    return value == expected
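# Quick self-check (illustrative): a sampled nested observation should
# compare equal to itself under `equals`.
sample = base_spaces[3].sample()
assert equals(sample, sample)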
def __init__(
        self,
        render_dt_msec=0,
        action_l2norm_penalty=0,  # disabled for now
        render_onscreen=True,
        render_size=84,
        reward_type="dense",
        target_radius=0.5,
        boundary_dist=4,
        ball_radius=0.25,
        walls=None,
        fixed_goal=None,
        randomize_position_on_reset=True,
        images_are_rgb=False,  # else black and white
        show_goal=True,
        action_limit=1,
        initial_position=(0, 0),
        **kwargs
):
    if walls is None:
        walls = []
    if len(kwargs) > 0:
        import logging
        LOGGER = logging.getLogger(__name__)
        LOGGER.warning("Ignoring kwargs: %s", kwargs)
    self.quick_init(locals())
    self.render_dt_msec = render_dt_msec
    self.action_l2norm_penalty = action_l2norm_penalty
    self.render_onscreen = render_onscreen
    self.render_size = render_size
    self.reward_type = reward_type
    self.target_radius = target_radius
    self.boundary_dist = boundary_dist
    self.ball_radius = ball_radius
    self.walls = walls
    self.fixed_goal = fixed_goal
    self.randomize_position_on_reset = randomize_position_on_reset
    self.images_are_rgb = images_are_rgb
    self.show_goal = show_goal
    self.action_limit = action_limit
    self._max_episode_steps = 50
    self.max_target_distance = self.boundary_dist - self.target_radius

    self._target_position = None
    self._position = np.zeros((2))

    u = self.action_limit * np.ones(2)
    self.action_space = spaces.Box(-u, u, dtype=np.float32)

    o = self.boundary_dist * np.ones(2)
    self.obs_range = spaces.Box(-o, o, dtype='float32')
    self.observation_space = spaces.Dict([
        ('observation', self.obs_range),
        ('desired_goal', self.obs_range),
        ('achieved_goal', self.obs_range),
        ('state_observation', self.obs_range),
        ('state_desired_goal', self.obs_range),
        ('state_achieved_goal', self.obs_range),
    ])
    self.observation_space.shape = (2, )  # hack

    self._step = 0
    self.initial_position = initial_position
    self.drawer = None
def __init__(self,
             action_mode="cart",
             action_buckets=[-1, 0, 1],
             action_stepsize=1.0,
             reward_type="sparse"):
    """
    Parameters:
        action_mode: one of {"cart", "cartmixed", "cartprod", "impulse",
            "impulsemixed"}; see the mode descriptions below.
        action_stepsize: step size of the action to perform. A number for
            `cart` and `impulse`; a list for `cartmixed` and
            `impulsemixed`.
        action_buckets: list of buckets used when the mode is `cartprod`.
        reward_type: one of {"sparse", "dense"}. `sparse` rewards are like
            the standard HPG rewards; `dense` rewards (from the paper/gym)
            give -(distance to goal) at every timestep.

    Modes:
        `cart` is for manhattan-style movement, where each action moves
            the arm in one direction.
        `impulse` treats the action dimensions as velocity and
            adds/decreases the velocity by action_stepsize depending on
            the direction picked. Adds the current velocity to the state.
        `impulsemixed` and `cartmixed` do the above with multiple
            magnitudes of action_stepsize. They need action_stepsize as a
            list.
        `cartprod` takes combinations of actions as input.
    """
    try:
        self.env = gym.make("FetchReach-v1")
    except Exception:
        print(
            "You do not have the latest version of gym (gym-0.10.5). "
            "Falling back to v0 with movable table")
        self.env = gym.make("FetchReach-v0")
    self.action_directions = np.array([[1, 0, 0, 0], [0, 1, 0, 0],
                                       [0, 0, 1, 0]])
    self.valid_action_directions = np.float32(
        np.any(self.action_directions, axis=0))

    self.distance_threshold = self.env.distance_threshold
    # self.distance_threshold = 0.06
    self.action_mode = action_mode
    self.num_actions = self.generate_action_map(action_buckets,
                                                action_stepsize)

    obs_dim = 10 + 4 * (action_mode == "impulse"
                        or action_mode == "impulsemixed")
    self.observation_space = spaces.Dict(
        dict(
            desired_goal=spaces.Box(-np.inf, np.inf, shape=(3, ),
                                    dtype='float32'),
            achieved_goal=spaces.Box(-np.inf, np.inf, shape=(3, ),
                                     dtype='float32'),
            observation=spaces.Box(-np.inf, np.inf, shape=(obs_dim, ),
                                   dtype='float32'),
        ))

    self._max_episode_steps = self.env._max_episode_steps
    self.reward_type = reward_type
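# Illustrative reconstruction of a "cart"-style discrete action map as the
# docstring describes: one discrete action per axis direction, each moving
# the arm by +/- action_stepsize. A sketch only, not the class's actual
# generate_action_map.
import numpy as np

action_stepsize = 1.0
directions = np.eye(3)  # unit vectors along x, y, z
cart_action_map = [sign * action_stepsize * d
                   for d in directions
                   for sign in (1.0, -1.0)]
# 6 discrete actions -> continuous deltas [1,0,0], [-1,0,0], [0,1,0], ...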
from ..LoggedTestCase import LoggedTestCase
from aienvs.Environment import Env
from unittest.mock import Mock
from aiagents.AgentFactory import createAgent, classForName, classForNameTyped
from gym import spaces

compoundclass = 'aiagents.multi.BasicComplexAgent.BasicComplexAgent'
randomclass = 'aiagents.single.RandomAgent.RandomAgent'
action_space = spaces.Dict({'robot1': spaces.Discrete(4)})


class testComplexAgentComponent(LoggedTestCase):

    def test_smoke_no_agents(self):
        params = {}
        with self.assertRaises(Exception) as context:
            createAgent(action_space, None, params)
        self.assertEqual("Parameters must have key 'class' but got {}",
                         str(context.exception))

    def test_smoke_agents_no_dict(self):
        params = {'class': randomclass, 'parameters': {}, 'subAgentList': 3}
        with self.assertRaises(Exception) as context:
            createAgent(action_space, None, params)
        self.assertEqual(
            "the subAgentList parameter must contain a list, but got {'class': 'aiagents.single.RandomAgent.RandomAgent', 'parameters': {}, 'subAgentList': 3}",
            str(context.exception))

    def test_smoke_bad_parameters(self):
        params = {
            'class': randomclass,
def __init__(self): """ Initialise the environment """ self.random_goal = True self.goal_oriented = False self.obs_type = 4 self.reward_type = 1 self.action_type = 1 self.joint_limits = "small" self.action_coeff = 30 self.endeffector_pos = None self.torso_pos = None self.end_torso = None self.end_goal = None self.joint_positions = None self.reward = None self.obs = None self.action = None self.new_joint_positions = None self.dist = None self.old_dist = None self.term1 = None self.term2 = None if self.joint_limits == "small": self.joint_min = np.array([-3.1, -1.6, -1.6, -1.8, -3.1, 0.0]) self.joint_max = np.array([3.1, 1.6, 1.6, 1.8, 3.1, 0.0]) elif self.joint_limits == "large": self.joint_min = np.array([-3.2, -3.2, -3.2, -3.2, -3.2, -3.2]) self.joint_max = np.array([3.2, 3.2, 3.2, 3.2, 3.2, 3.2]) # Define action space self.action_space = spaces.Box( low=np.float32( np.array([-0.5, -0.25, -0.25, -0.25, -0.5, -0.005]) / self.action_coeff), high=np.float32( np.array([0.5, 0.25, 0.25, 0.25, 0.5, 0.005]) / self.action_coeff), dtype=np.float32) # Define observation space if self.obs_type == 1: self.obs_space_low = np.float32( np.concatenate((MIN_END_EFF_COORDS, self.joint_min), axis=0)) self.obs_space_high = np.float32( np.concatenate((MAX_END_EFF_COORDS, self.joint_max), axis=0)) elif self.obs_type == 2: self.obs_space_low = np.float32( np.concatenate((MIN_GOAL_COORDS, self.joint_min), axis=0)) self.obs_space_high = np.float32( np.concatenate((MAX_GOAL_COORDS, self.joint_max), axis=0)) elif self.obs_type == 3: self.obs_space_low = np.float32( np.concatenate(([-1.0] * 6, self.joint_min), axis=0)) self.obs_space_high = np.float32( np.concatenate(([1.0] * 6, self.joint_max), axis=0)) elif self.obs_type == 4: self.obs_space_low = np.float32( np.concatenate(([-1.0] * 3, self.joint_min), axis=0)) self.obs_space_high = np.float32( np.concatenate(([1.0] * 3, self.joint_max), axis=0)) elif self.obs_type == 5: self.obs_space_low = np.float32( np.concatenate(([-1.0] * 6, MIN_GOAL_COORDS, self.joint_min), axis=0)) self.obs_space_high = np.float32( np.concatenate(([1.0] * 6, MAX_GOAL_COORDS, self.joint_max), axis=0)) self.observation_space = spaces.Box(low=self.obs_space_low, high=self.obs_space_high, dtype=np.float32) if self.goal_oriented: self.observation_space = spaces.Dict( dict(desired_goal=spaces.Box(low=np.float32(MIN_GOAL_COORDS), high=np.float32(MAX_GOAL_COORDS), dtype=np.float32), achieved_goal=spaces.Box( low=np.float32(MIN_END_EFF_COORDS), high=np.float32(MAX_END_EFF_COORDS), dtype=np.float32), observation=self.observation_space)) # Initialise goal position if self.random_goal: self.goal = self.sample_random_goal() else: self.goal = FIXED_GOAL_COORDS # Connect to physics client. By default, do not render self.physics_client = p.connect(p.DIRECT) # Load URDFs self.create_world() # reset environment self.reset()
# NOTE: this snippet begins mid-method; the wrapper class and method header
# below are a plausible reconstruction (the body matches the common
# NormalizedEnv gym.ActionWrapper pattern that maps actions between the
# env's bounds and [-1, 1]; the method name is assumed).
class NormalizedEnv(gym.ActionWrapper):

    def reverse_action(self, action):
        low = self.action_space.low
        high = self.action_space.high
        action = 2 * (action - low) / (high - low) - 1
        action = np.clip(action, low, high)
        return action


max_steps = 200
episodes = 50

env = NormalizedEnv(gym.make("Pendulum-v0"))

observation_space = spaces.Dict({
    'robot': env.observation_space,
    'task': spaces.Box(-1, 1, shape=(0, )),
    'goal': spaces.Box(-1, 1, shape=(1, ))
})


def reward_function(goal, **kwargs):
    reward = goal["achieved"]
    return reward


agent = AgentSAC(agent_config, observation_space, env.action_space)

task = np.array([]).reshape((0, ))
goal = np.zeros(1)

results_episodes = []
rewards_episodes = []
def create_state_space(self): ''' needs: self.n_shapes -- self.n_bouncers ''' if self.moving_walls: n_objects = self.n_bouncers + 1 # plus walls else: n_objects = self.n_bouncers state_space = spaces.Dict({ # a vector containing the index of every shapes body in self.space.bodies; -1 for static "shape_body_ids": spaces.MultiDiscrete([self.n_shapes] * self.n_shapes), # sort of header, the index of every shapes' body in the list # a list of ? strings giving the type of each of the bodies in the list (excluding the static body) "body_types": spaces.MultiDiscrete([len(bc.ObjectTypes)] * (n_objects)), "bouncer_sizes": spaces.Box(low=-pymunk.inf, high=pymunk.inf, shape=(self.n_bouncers, 2), dtype=np.float32), "bouncer_weight": spaces.Box(low=-pymunk.inf, high=pymunk.inf, shape=(self.n_bouncers, ), dtype=np.float32), "positions": spaces.Box(low=-pymunk.inf, high=pymunk.inf, shape=(self.n_shapes, 2), dtype=np.float32), "velocities": spaces.Box(low=-pymunk.inf, high=pymunk.inf, shape=(self.n_shapes, 2), dtype=np.float32), "angles": spaces.Box(low=0., high=2. * np.pi, shape=(self.n_shapes, 1), dtype=np.float32), "angular_velocities": spaces.Box(low=-pymunk.inf, high=pymunk.inf, shape=(self.n_shapes, 1), dtype=np.float32), # 3 coordinates: dx, dy - the distance between the centers of the objects, and the # third coordinate contains the absolute distance ( == norm((dx,dy)) - o1.radius - o2.radius) "relative_position_matrix": spaces.Box(low=-pymunk.inf, high=pymunk.inf, shape=(self.n_shapes, self.n_shapes, 3), dtype=np.float32), # estimate; maybe a non-wall collision comes first "next_wall_to_hit_id": spaces.MultiDiscrete([4] * self.n_bouncers), "next_wall_to_hit_where": spaces.Box(low=-pymunk.inf, high=pymunk.inf, shape=(self.n_bouncers, 2), dtype=np.float32), "next_wall_to_hit_when": spaces.Box(low=-pymunk.inf, high=pymunk.inf, shape=(self.n_bouncers, 1), dtype=np.float32) }) # 1's for every pair that just started to collide this round # "collision_matrix": spaces.Discrete(2, shape=(self.n_bouncers + self.n_iwalls, self.n_bouncers + self.n_iwalls)), # "observation": self.observation_space}) self.state_space = state_space self.state = OrderedDict({ 'shape_body_ids': None, 'body_types': None, 'bouncer_sizes': None, 'bouncer_weight': None, 'positions': None, 'velocities': None, 'angles': None, 'angular_velocities': None, 'relative_position_matrix': None, 'next_wall_to_hit_id': None, 'next_wall_to_hit_where': None, 'next_wall_to_hit_when': None }) return self.state_space, self.state
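# Sketch of the "relative_position_matrix" feature described above: for each
# pair of shapes, (dx, dy) between centers plus a separation that subtracts
# both radii. Names and the uniform radii here are assumptions:
import numpy as np

positions = np.random.randn(4, 2).astype(np.float32)   # (n_shapes, 2)
radii = np.full(4, 0.5, dtype=np.float32)
delta = positions[None, :, :] - positions[:, None, :]  # (n, n, 2): dx, dy
separation = (np.linalg.norm(delta, axis=-1)
              - radii[:, None] - radii[None, :])       # (n, n)
rel_pos = np.concatenate([delta, separation[..., None]], axis=-1)  # (n, n, 3)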
def response_space(cls): return spaces.Dict({ 'reward': spaces.Box(low=-10.0, high=5.0, shape=tuple(), dtype=np.float32) })
class DevStrat_4_6(BTgymBaseStrategy):
    """
    Objectives:
        external state data feature search:
            time-embedded three-channel vector:
                - `Open` channel is the one-step difference of the Open price;
                - `High` and `Low` channels are the differences between the
                  current Open price and the current High or Low price,
                  respectively.

        internal state data feature search:
            time-embedded concatenated vector of broker and portfolio statistics.

        reward shaping search:
            potential-based shaping functions.

    Data:
        synthetic/real.
    """
    # NOTE: when changing time_dim, also change the UNREAL policy's
    # upsampling params for the auxiliary pixel-control task:
    time_dim = 30

    params = dict(
        # Note the fake `Width` dimension, added to allow 2d convolutions etc.:
        state_shape={
            'external': spaces.Box(low=-1, high=1, shape=(time_dim, 1, 3)),
            'internal': spaces.Box(low=-2, high=2, shape=(time_dim, 1, 5)),
            'metadata': spaces.Dict({
                'trial_num': spaces.Box(shape=(), low=0, high=10**10),
                'sample_num': spaces.Box(shape=(), low=0, high=10**10),
                'first_row': spaces.Box(shape=(), low=0, high=10**10)
            })
        },
        drawdown_call=5,
        target_call=19,
        portfolio_actions=('hold', 'buy', 'sell', 'close'),
        skip_frame=10,
        metadata={}
    )

    def __init__(self, **kwargs):
        """
        Args:
            **kwargs: see BTgymBaseStrategy args.
        """
        # super(DevStrat001, self)._set_params(self.params)
        super(DevStrat_4_6, self).__init__(**kwargs)
        self.log.debug('DEV_state_shape: {}'.format(self.p.state_shape))
        self.log.debug('DEV_skip_frame: {}'.format(self.p.skip_frame))
        self.log.debug('DEV_portfolio_actions: {}'.format(self.p.portfolio_actions))
        self.log.debug('DEV_drawdown_call: {}'.format(self.p.drawdown_call))
        self.log.debug('DEV_target_call: {}'.format(self.p.target_call))
        self.log.debug('DEV_dataset_stat:\n{}'.format(self.p.dataset_stat))
        self.log.debug('DEV_episode_stat:\n{}'.format(self.p.episode_stat))

        # Define data channels:
        self.channel_O = bt.Sum(self.data.open, -self.data.open(-1))
        self.channel_H = bt.Sum(self.data.high, -self.data.open)
        self.channel_L = bt.Sum(self.data.low, -self.data.open)

        # Episodic metadata:
        self.state['metadata'] = {
            'trial_num': np.asarray(self.p.metadata['trial_num']),
            'sample_num': np.asarray(self.p.metadata['sample_num']),
            'first_row': np.asarray(self.p.metadata['first_row'])
        }

    def get_state(self):
        T = 2e3  # EURUSD
        # T = 1e2  # EURUSD, Z-norm
        # T = 1  # BTCUSD

        x = np.stack(
            [
                np.frombuffer(self.channel_O.get(size=self.dim_time)),
                np.frombuffer(self.channel_H.get(size=self.dim_time)),
                np.frombuffer(self.channel_L.get(size=self.dim_time)),
            ],
            axis=-1
        )
        # Log-scale: NOT used, seems to hurt performance:
        # x = log_transform(x)

        # Amplify and squash into [-1, 1]; seems to be the best option as of
        # 4.10.17. The T param is meant to keep most of the signal in the
        # 'linear' part of tanh while squashing spikes:
        x_market = np.tanh(x * T)

        # Update inner state statistics and compose the state:
        self.update_sliding_stat()
        x_broker = np.concatenate(
            [
                np.asarray(self.sliding_stat['unrealized_pnl'])[..., None],
                # max_unrealized_pnl[..., None],
                # min_unrealized_pnl[..., None],
                np.asarray(self.sliding_stat['realized_pnl'])[..., None],
                np.asarray(self.sliding_stat['broker_value'])[..., None],
                np.asarray(self.sliding_stat['broker_cash'])[..., None],
                np.asarray(self.sliding_stat['exposure'])[..., None],
                # norm_episode_duration, gamma=5)[..., None],
                # norm_position_duration, gamma=2)[..., None],
            ],
            axis=-1
        )
        self.state['external'] = x_market[:, None, :]
        self.state['internal'] = x_broker[:, None, :]
        return self.state

    def get_reward(self):
        """
        Shapes the reward as the normalized single-trade realized profit/loss,
        augmented with potential-based reward shaping functions of the form:

            F(s, a, s') = gamma * FI(s') - FI(s);

        - potential FI_1 is the current normalized unrealized profit/loss;
        - potential FI_2 is the current normalized broker value.

        Paper:
            "Policy invariance under reward transformations: Theory and
            application to reward shaping" by A. Ng et al., 1999;
            http://www.robotics.stanford.edu/~ang/papers/shaping-icml99.pdf
        """
        # All sliding statistics for this step have already been updated by get_state().
        #
        # TODO: window size for stats averaging? Now it is time_dim - 1; would another value work better?
        # TODO: pass the actual gamma as a strategy param, OR maybe compute the reward on the algo side?

        # Potential-based shaping function 1:
        # based on the potential of the averaged profit/loss of the currently
        # open trade (unrealized p/l):
        unrealized_pnl = np.asarray(self.sliding_stat['unrealized_pnl'])
        f1 = .99 * np.average(unrealized_pnl[1:]) - np.average(unrealized_pnl[:-1])

        # Potential-based shaping function 2:
        # based on the potential of the averaged broker value, normalized
        # w.r.t. the max drawdown and target bounds:
        norm_broker_value = np.asarray(self.sliding_stat['broker_value'])
        f2 = .99 * np.average(norm_broker_value[1:]) - np.average(norm_broker_value[:-1])

        # Main reward function: normalized realized profit/loss:
        realized_pnl = np.asarray(self.sliding_stat['realized_pnl'])[-1]

        # Weights are subject to tuning:
        self.reward = 1.0 * f1 + 1.0 * f2 + 10.0 * realized_pnl
        # TODO: ------ignore-----:
        # 'Close-at-the-end' shaping term:
        # - 1.0 * self.exp_scale(avg_norm_episode_duration, gamma=6) * abs_max_norm_exposure
        # 'Do-not-expose-for-too-long' shaping term:
        # - 1.0 * self.exp_scale(avg_norm_position_duration, gamma=3)
        self.reward = np.clip(self.reward, -1, 1)
        return self.reward
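# Illustration (not from the source): f1 and f2 above are instances of the
# potential-based shaping of Ng et al. (1999), F(s, a, s') = gamma * FI(s') - FI(s),
# which provably leaves the optimal policy unchanged. A minimal standalone
# sketch with an assumed gamma and sliding-window potentials:
import numpy as np

def potential_based_shaping(phi_s, phi_s_next, gamma=0.99):
    """Shaping bonus F(s, a, s') = gamma * phi(s') - phi(s)."""
    return gamma * phi_s_next - phi_s

# the strategy's potentials are window averages shifted by one step, e.g.:
pnl = np.array([0.0, 0.1, 0.15, 0.12])  # hypothetical unrealized p/l window
f1 = potential_based_shaping(np.average(pnl[:-1]), np.average(pnl[1:]))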
def _change_task(self, is_reset=False):
    """Change the current task according to the specified loop behaviour.

    Args:
        is_reset (bool, optional): Whether we're changing the task during a
            ``reset()``. When False, the mid-episode change needs to be
            handled explicitly. Defaults to False.
    """
    prev_task_idx = self._curr_task_idx
    if self._task_sampling_behavior == TI_TASK_SAMPLING.SEQUENTIAL.name:
        self._curr_task_idx = (self._curr_task_idx + 1) % len(self._tasks)
    elif self._task_sampling_behavior == TI_TASK_SAMPLING.RANDOM.name:
        # sample from the other tasks with equal probability
        self._curr_task_idx = np.random.choice(
            [i for i in range(len(self._tasks)) if i != self._curr_task_idx],
            1,
        ).item()
    self._task = self._tasks[self._curr_task_idx]
    self._episode_iterator = self._episode_iterators[self._curr_task_idx]
    self.action_space = self._task.action_space
    # the task observation space may change too
    self.observation_space = spaces.Dict({
        **self._sim.sensor_suite.observation_spaces.spaces,
        **self._task.sensor_suite.observation_spaces.spaces,
        "task_idx": spaces.Discrete(len(self._tasks)),
    })
    self._curr_task_label = self._task._config.get("TASK_LABEL",
                                                   self._curr_task_idx)
    # point to the new dataset and its episodes
    self._dataset = self._task._dataset
    self.number_of_episodes = len(self._dataset.episodes)
    self._episodes = self._dataset.episodes
    logger.info(
        "Current task changed from {} (id: {}) to {} (id: {}).".format(
            self._tasks[prev_task_idx].__class__.__name__,
            prev_task_idx,
            self._task.__class__.__name__,
            self._curr_task_idx,
        ))
    # handle a mid-episode change of task
    if not is_reset:
        # keep the current position and sim state only if the scene does not
        # change; if the exact same task is passed, the same episode comes back
        if (self.current_episode.scene_id ==
                self._episode_iterator.episodes[0].scene_id):
            self._episode_iterator.episodes[
                0].start_position = self.current_episode.start_position
            self._episode_iterator.episodes[
                0].start_rotation = self.current_episode.start_rotation
            # self._episode_iterator._iterator = iter(self._episode_iterator.episodes)
            # self.reconfigure(self._config)
            # observations = self.task.reset(episode=self.current_episode)
            # update the current episode
            self._current_episode = next(self._episode_iterator)
            # reset measures (prev position)
            self._task.measurements.reset_measures(
                episode=self.current_episode, task=self.task)
        else:
            # the scene is different: the old position can't be kept, end the episode
            self._episode_over = True
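# Illustration (not from the source): the two sampling behaviours above in
# isolation. SEQUENTIAL walks the task list round-robin, while RANDOM draws
# uniformly from all *other* tasks, so the current task always changes:
import numpy as np

def next_task_idx(curr, n_tasks, behavior="SEQUENTIAL"):
    if behavior == "SEQUENTIAL":
        return (curr + 1) % n_tasks
    return np.random.choice(
        [i for i in range(n_tasks) if i != curr], 1).item()

assert next_task_idx(2, 3) == 0                 # wraps around
assert next_task_idx(0, 3, "RANDOM") in (1, 2)  # never resamples 0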
def __init__(self, display=False, steps_to_roll=10, episode_len=600,
             pause_frac=0.66, n_cells=1000):
    self.display = display
    if self.display:
        p.connect(p.GUI)
    else:
        p.connect(p.DIRECT)

    # set up the world; endEffector is the tip of the left finger
    self.baxterId, self.endEffectorId, self.objectId = setUpWorld()

    # change the friction of the object
    p.changeDynamics(self.objectId, -1, lateralFriction=1)

    # save the state of the environment for future resets
    # (saves time on URDF loading)
    self.savestate = p.saveState()

    # get joint ranges
    self.lowerLimits, self.upperLimits, self.jointRanges, self.restPoses = \
        getJointRanges(self.baxterId, includeFixed=False)

    # deal with display
    if self.display:
        p.configureDebugVisualizer(p.COV_ENABLE_RENDERING, 1)
        p.resetDebugVisualizerCamera(2., 180, 0., [0.52, 0.2, np.pi / 4.])
        p.getCameraImage(320, 200, renderer=p.ER_BULLET_HARDWARE_OPENGL)
        sleep(1.)

    # number of pybullet steps to roll at each gym step
    self.steps_to_roll = steps_to_roll
    # number of gym steps to roll before the end of the movement
    self.move_len = int(episode_len * pause_frac)
    # number of pybullet steps to roll after the movement, keeping the same action
    self.pause_steps = (episode_len - self.move_len) * steps_to_roll
    # counts the number of gym steps
    self.step_counter = 0
    # variable to store the grasping orientation
    self.diff_or_at_grab = None

    # create the discretisation of the goal space (quaternion space)
    bounds = [[-1, 1], [-1, 1], [-1, 1], [-1, 1]]
    self.cvt = utils.CVT(n_cells, bounds)
    self.grasped = False

    # observation and action spaces
    self.observation_space = spaces.Dict({
        'observation': spaces.Box(-1, 1, shape=(34,)),
        'desired_goal': spaces.Discrete(n_cells),
        'achieved_goal': spaces.Discrete(n_cells)
    })
    self.action_space = spaces.Box(-1, 1, shape=(8,))
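# Illustration (not from the source): utils.CVT is external code. A minimal
# sketch of such a goal-space discretiser, assuming it builds roughly uniform
# cells over the bounds via a centroidal Voronoi tessellation (k-means on
# uniform samples) and maps a point to its nearest-centroid cell index; the
# class and method names here are hypothetical:
import numpy as np

class SimpleCVT:
    def __init__(self, n_cells, bounds, n_samples=20000, n_iter=10, seed=0):
        rng = np.random.default_rng(seed)
        low, high = np.array(bounds, dtype=float).T  # bounds: [[lo, hi], ...]
        samples = rng.uniform(low, high, size=(n_samples, len(bounds)))
        self.centroids = samples[rng.choice(n_samples, n_cells, replace=False)]
        for _ in range(n_iter):  # Lloyd's iterations toward centroidal cells
            labels = self._nearest(samples)
            for k in range(n_cells):
                members = samples[labels == k]
                if len(members):
                    self.centroids[k] = members.mean(axis=0)

    def _nearest(self, points):
        # squared distances without building an (n, k, d) intermediate
        d2 = ((points ** 2).sum(axis=1)[:, None]
              - 2.0 * points @ self.centroids.T
              + (self.centroids ** 2).sum(axis=1)[None, :])
        return d2.argmin(axis=1)

    def get_grid_index(self, point):
        return int(self._nearest(np.asarray(point, dtype=float)[None, :])[0])

cvt = SimpleCVT(n_cells=1000, bounds=[[-1, 1]] * 4)
cell = cvt.get_grid_index([0., 0., 0., 1.])  # quaternion -> discrete goal id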
@classmethod
def observation_space(cls):
    return spaces.Dict({'user_id': spaces.Discrete(utils.NUM_USERS)})