def test_box_dtype_check():
    # Related Issues:
    # https://github.com/openai/gym/issues/2357
    # https://github.com/openai/gym/issues/2298
    space = Box(0, 2, tuple(), dtype=np.float32)

    # casting will match the correct type
    assert space.contains(0.5)

    # float64 is not in float32 space
    assert not space.contains(np.array(0.5))
    assert not space.contains(np.array(1))
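# A minimal sketch of why the test above behaves the way it does, assuming
# numpy's default promotion rules: a bare Python float is cast to the space's
# dtype before the bounds check, while an explicit ndarray keeps its own
# dtype (float64/int64 here), which Box.contains rejects for a float32 space.
import numpy as np

print(np.array(0.5).dtype)    # float64 -- not float32, so not contained
print(np.array(1).dtype)      # int64 (platform-dependent) -- also rejected
print(np.float32(0.5).dtype)  # float32 -- this one would be contained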
class Basic:
    """Mean, stddev, min, max for the last `lookback` bars, deltas for the
    last `deltas` bars, and most recent actual values of `fields`."""

    def __init__(self, lookback, deltas=4,
                 fields=('bid', 'bidsize', 'ask', 'asksize')):
        self.lookback = int(lookback)
        # Can't be a tuple for ndarray slicing to work
        self.fields = [Obs._fields.index(field) for field in fields]
        self.vals = deque(maxlen=lookback)
        self.deltas = int(deltas)
        assert self.lookback > self.deltas >= 0
        # obs_size = len(fields) * (5 + deltas)  # mean, std, min, max, last, [deltas] for each field
        self.observation_space = Box(low=-np.inf, high=np.inf,
                                     shape=(5 + deltas, len(fields)))
        #        bid  ask  bidsize  asksize
        # mean
        # std
        # min
        # max
        # last
        # delta1
        # ...

    def __call__(self, obs):
        obs = np.asarray(obs)
        assert obs.shape == (len(Obs._fields),)
        self.vals.append(obs[self.fields])
        vals = np.array(self.vals)
        xobs = np.vstack([vals.mean(axis=0), vals.std(axis=0),
                          vals.min(axis=0), vals.max(axis=0),
                          vals[-1],
                          np.diff(vals, axis=0)[-self.deltas:]])
        if len(self.vals) > self.deltas and np.all(np.isfinite(xobs)):
            assert self.observation_space.contains(xobs), \
                'shape {} expected {}\n{}'.format(
                    xobs.shape, self.observation_space.shape, xobs)
        else:
            xobs = None
        LOG.debug('XFORM %s', xobs)
        return xobs
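# A small, self-contained illustration (plain numpy, no gym) of the matrix
# layout Basic produces: rows are mean/std/min/max/last followed by the most
# recent `deltas` bar-to-bar differences; columns are the fields.
import numpy as np
from collections import deque

deltas = 2
vals = deque([np.array([1.0, 10.0]),
              np.array([2.0, 11.0]),
              np.array([4.0, 13.0])], maxlen=4)
v = np.array(vals)
xobs = np.vstack([v.mean(axis=0), v.std(axis=0), v.min(axis=0), v.max(axis=0),
                  v[-1], np.diff(v, axis=0)[-deltas:]])
print(xobs.shape)  # (5 + deltas, n_fields) == (7, 2)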
class ContinuousCartPoleEnv(CartPoleEnv):
    metadata = {
        'render.modes': ['human', 'rgb_array'],
        'video.frames_per_second': 50
    }

    def __init__(self):
        super().__init__()
        self.action_space = Box(-1, 1, shape=[1], dtype=np.float32)

    def step(self, action):
        assert self.action_space.contains(action), \
            "%r (%s) invalid" % (action, type(action))
        state = self.state
        x, x_dot, theta, theta_dot = state
        force = self.force_mag * action.item()
        costheta = math.cos(theta)
        sintheta = math.sin(theta)
        temp = (force + self.polemass_length * theta_dot * theta_dot * sintheta) / self.total_mass
        thetaacc = (self.gravity * sintheta - costheta * temp) / (
            self.length * (4.0 / 3.0 - self.masspole * costheta * costheta / self.total_mass))
        xacc = temp - self.polemass_length * thetaacc * costheta / self.total_mass
        if self.kinematics_integrator == 'euler':
            x = x + self.tau * x_dot
            x_dot = x_dot + self.tau * xacc
            theta = theta + self.tau * theta_dot
            theta_dot = theta_dot + self.tau * thetaacc
        else:  # semi-implicit euler
            x_dot = x_dot + self.tau * xacc
            x = x + self.tau * x_dot
            theta_dot = theta_dot + self.tau * thetaacc
            theta = theta + self.tau * theta_dot
        self.state = (x, x_dot, theta, theta_dot)
        done = x < -self.x_threshold \
            or x > self.x_threshold \
            or theta < -self.theta_threshold_radians \
            or theta > self.theta_threshold_radians
        done = bool(done)

        if not done:
            reward = 1.0
        elif self.steps_beyond_done is None:
            # Pole just fell!
            self.steps_beyond_done = 0
            reward = 1.0
        else:
            if self.steps_beyond_done == 0:
                logger.warn(
                    "You are calling 'step()' even though this environment "
                    "has already returned done = True. You should always "
                    "call 'reset()' once you receive 'done = True' -- any "
                    "further steps are undefined behavior.")
            self.steps_beyond_done += 1
            reward = 0.0

        return np.array(self.state), reward, done, {}
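# A hedged rollout sketch for ContinuousCartPoleEnv. It assumes the parent
# CartPoleEnv provides reset() and the usual dynamics attributes; samples
# drawn from the Box action space are float32 arrays of shape (1,), which is
# exactly what the assert in step() expects.
env = ContinuousCartPoleEnv()
obs = env.reset()
done = False
while not done:
    action = env.action_space.sample()  # np.float32 array, shape (1,)
    obs, reward, done, info = env.step(action)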
class Instrument(gym.Env):
    def __init__(self, polyphony: float = np.inf, key_range=(0, 127)):
        self.__polyphony = polyphony
        self.__key_start, self.__key_end = key_range
        note_region_length = self.__key_end - self.__key_start + 1
        self.__status = np.zeros((note_region_length,), dtype=np.int8)
        self.__teacher: Optional[Teacher] = Teacher()
        self.observation_space = Box(0, 1, (note_region_length,), dtype=np.int8)
        self.action_space = Box(-1, 1, (note_region_length,), dtype=np.int8)
        self.num_envs = 1
        self.remotes = []

    def reset(self):
        self.__status = np.zeros(self.__key_end - self.__key_start + 1,
                                 dtype=np.int8)
        return self.__status

    def step(self, action: np.ndarray):
        if self.action_space.contains(action):
            a = np.abs(action)
            if np.sum(a) > self.__polyphony:
                t = a[a > 0]
                t[self.__polyphony:] = 0
                np.random.shuffle(t)
                a[a > 0.5] = t
            self.__status = a.astype(np.int8)
        else:
            raise ValueError('Action not in action_space')
        return self.__status, self.reward(a), False, dict()

    def render(self, mode='human'):
        print(self.__status)

    def reward(self, action: np.ndarray) -> float:
        if np.sum(action) > self.__polyphony:
            return -1.0
        if self.__teacher is None:
            return 0.0
        else:
            return self.__teacher.evaluate(action)

    @property
    def polyphony(self):
        return self.__polyphony

    @property
    def region_length(self):
        return self.__key_end - self.__key_start + 1
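# Hypothetical usage sketch for Instrument. `Teacher` is defined elsewhere in
# the module; with no teacher the reward is 0.0 unless the polyphony limit is
# exceeded. Sampled int8 actions lie in {-1, 0, 1}; |action| marks keys that
# should sound.
env = Instrument(polyphony=4, key_range=(60, 72))  # a one-octave region
obs = env.reset()                                  # 13 zeros (int8)
action = env.action_space.sample()                 # int8 values in [-1, 1]
obs, reward, done, info = env.step(action)
env.render()                                       # prints current key status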
def _rescale_from_one_space_to_other(
        input_val: np.ndarray, input_space: spaces.Box,
        output_space: spaces.Box) -> np.ndarray:
    """
    Transforms a point in an input space to the corresponding point in the
    output space. The output point is as far away from the output boundaries
    as the input point is from the input boundaries.

    :param input_space: bounded Box
    :param output_space: bounded Box
    :param input_val: The input point
    :return: The output point in the output space
    """
    assert input_space.shape == output_space.shape
    assert input_space.contains(input_val)
    slope = (output_space.high - output_space.low) / (input_space.high - input_space.low)
    return slope * (input_val - input_space.high) + output_space.high
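# A quick sanity check of the rescaling helper above (a sketch; assumes
# `from gym import spaces`). The midpoint of [-1, 1] should land at the
# midpoint of [0, 10], and the endpoints should map to the endpoints.
from gym import spaces
import numpy as np

src = spaces.Box(low=-1.0, high=1.0, shape=(1,), dtype=np.float64)
dst = spaces.Box(low=0.0, high=10.0, shape=(1,), dtype=np.float64)

print(_rescale_from_one_space_to_other(np.array([0.0]), src, dst))   # [5.]
print(_rescale_from_one_space_to_other(np.array([-1.0]), src, dst))  # [0.]
print(_rescale_from_one_space_to_other(np.array([1.0]), src, dst))   # [10.]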
class Delta:
    """Bar-to-bar differences of bid, ask, last, and volume over the last
    `lookback + 1` bars, plus unrealized gain and position."""

    def __init__(self, lookback):
        self.lookback = lookback
        self.vals = deque(maxlen=lookback + 1)
        # diffs for bid/ask/last/vol + unrealized/pos
        obs_size = lookback * 4 + 2
        self.observation_space = Box(low=-np.inf, high=np.inf,
                                     shape=(obs_size,))

    def __call__(self, obs):
        obs = Obs._make(obs)
        self.vals.append([obs.bid, obs.ask, obs.last, obs.volume / 100])
        xobs = np.hstack([np.diff(np.array(self.vals), axis=0).flatten(),
                          obs.unrealized_gain / 100,
                          obs.position])
        if len(self.vals) == self.lookback + 1 and np.isfinite(xobs).all():
            assert self.observation_space.contains(xobs), \
                'shape {} expected {}\n{}'.format(
                    xobs.shape, self.observation_space.shape, xobs)
        else:
            xobs = None
        LOG.debug('XFORM %s', xobs)
        return xobs
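# Worked size check for Delta (plain numpy, no gym): with lookback = 3 the
# deque holds 4 bars, np.diff yields 3 rows x 4 fields = 12 numbers, and the
# two scalars (unrealized gain, position) bring the vector to 3*4 + 2 = 14.
import numpy as np

vals = np.arange(16, dtype=float).reshape(4, 4)  # 4 bars x 4 fields
xobs = np.hstack([np.diff(vals, axis=0).flatten(), 0.5, 1.0])
print(xobs.shape)  # (14,)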
class SawyerReachTorqueEnv(MujocoEnv, Serializable, MultitaskEnv):
    """Implements a torque-controlled Sawyer environment"""

    def __init__(
            self,
            frame_skip=30,
            action_scale=10,
            keep_vel_in_obs=True,
            use_safety_box=False,
            fix_goal=False,
            fixed_goal=(0.05, 0.6, 0.15),
            reward_type='hand_distance',
            indicator_threshold=.05,
            goal_low=None,
            goal_high=None,
    ):
        self.quick_init(locals())
        MultitaskEnv.__init__(self)
        self.action_scale = action_scale
        MujocoEnv.__init__(self, self.model_name, frame_skip=frame_skip)
        bounds = self.model.actuator_ctrlrange.copy()
        low = bounds[:, 0]
        high = bounds[:, 1]
        self.action_space = Box(low=low, high=high)
        if goal_low is None:
            goal_low = np.array([-0.1, 0.5, 0.02])
        else:
            goal_low = np.array(goal_low)
        if goal_high is None:
            goal_high = np.array([0.1, 0.7, 0.2])
        else:
            goal_high = np.array(goal_high)
        self.safety_box = Box(goal_low, goal_high)
        self.keep_vel_in_obs = keep_vel_in_obs
        self.goal_space = Box(goal_low, goal_high)
        obs_size = self._get_env_obs().shape[0]
        high = np.inf * np.ones(obs_size)
        low = -high
        self.obs_space = Box(low, high)
        self.achieved_goal_space = Box(-np.inf * np.ones(3), np.inf * np.ones(3))
        self.observation_space = Dict([
            ('observation', self.obs_space),
            ('desired_goal', self.goal_space),
            ('achieved_goal', self.achieved_goal_space),
            ('state_observation', self.obs_space),
            ('state_desired_goal', self.goal_space),
            ('state_achieved_goal', self.achieved_goal_space),
        ])
        self.fix_goal = fix_goal
        self.fixed_goal = np.array(fixed_goal)
        self.use_safety_box = use_safety_box
        self.prev_qpos = self.init_angles.copy()
        self.reward_type = reward_type
        self.indicator_threshold = indicator_threshold
        self.reset()

    @property
    def model_name(self):
        return get_asset_full_path('sawyer_xyz/sawyer_reach_torque.xml')

    def reset_to_prev_qpos(self):
        angles = self.data.qpos.copy()
        velocities = self.data.qvel.copy()
        angles[:] = self.prev_qpos.copy()
        velocities[:] = 0
        self.set_state(angles.flatten(), velocities.flatten())

    def is_outside_box(self):
        pos = self.get_endeff_pos()
        return not self.safety_box.contains(pos)

    def set_to_qpos(self, qpos):
        angles = self.data.qpos.copy()
        velocities = self.data.qvel.copy()
        angles[:] = qpos
        velocities[:] = 0
        self.set_state(angles.flatten(), velocities.flatten())

    def viewer_setup(self):
        self.viewer.cam.trackbodyid = 0
        self.viewer.cam.distance = 1.0

        # 3rd person view
        cam_dist = 0.3
        rotation_angle = 270
        cam_pos = np.array([0, 1.0, 0.5, cam_dist, -45, rotation_angle])

        for i in range(3):
            self.viewer.cam.lookat[i] = cam_pos[i]
        self.viewer.cam.distance = cam_pos[3]
        self.viewer.cam.elevation = cam_pos[4]
        self.viewer.cam.azimuth = cam_pos[5]
        self.viewer.cam.trackbodyid = -1

    def step(self, action):
        action = action * self.action_scale
        self.do_simulation(action, self.frame_skip)
        if self.use_safety_box:
            if self.is_outside_box():
                self.reset_to_prev_qpos()
            else:
                self.prev_qpos = self.data.qpos.copy()
        ob = self._get_obs()
        info = self._get_info()
        reward = self.compute_reward(action, ob)
        done = False
        return ob, reward, done, info

    def _get_env_obs(self):
        if self.keep_vel_in_obs:
            return np.concatenate([
                self.sim.data.qpos.flat,
                self.sim.data.qvel.flat,
                self.get_endeff_pos(),
            ])
        else:
            return np.concatenate([
                self.sim.data.qpos.flat,
                self.get_endeff_pos(),
            ])

    def _get_obs(self):
        ee_pos = self.get_endeff_pos()
        state_obs = self._get_env_obs()
        return dict(
            observation=state_obs,
            desired_goal=self._state_goal,
            achieved_goal=ee_pos,
            state_observation=state_obs,
            state_desired_goal=self._state_goal,
            state_achieved_goal=ee_pos,
        )

    def _get_info(self):
        hand_distance = np.linalg.norm(self._state_goal - self.get_endeff_pos())
        return dict(
            hand_distance=hand_distance,
            hand_success=float(hand_distance < self.indicator_threshold),
        )

    def get_endeff_pos(self):
        return self.data.body_xpos[self.endeff_id].copy()

    def reset_model(self):
        angles = self.data.qpos.copy()
        velocities = self.data.qvel.copy()
        angles[:] = self.init_angles
        velocities[:] = 0
        self.set_state(angles.flatten(), velocities.flatten())
        self.sim.forward()
        self.prev_qpos = self.data.qpos.copy()

    def reset(self):
        self.reset_model()
        self.set_goal(self.sample_goal())
        self.sim.forward()
        self.prev_qpos = self.data.qpos.copy()
        return self._get_obs()

    @property
    def init_angles(self):
        return [
            1.02866769e+00, -6.95207647e-01, 4.22932911e-01,
            1.76670458e+00, -5.69637604e-01, 6.24117280e-01,
            3.53404635e+00,
        ]

    @property
    def endeff_id(self):
        return self.model.body_names.index('leftclaw')

    def get_diagnostics(self, paths, prefix=''):
        statistics = OrderedDict()
        for stat_name in [
                'hand_distance',
                'hand_success',
        ]:
            stat = get_stat_in_paths(paths, 'env_infos', stat_name)
            statistics.update(
                create_stats_ordered_dict(
                    '%s%s' % (prefix, stat_name),
                    stat,
                    always_show_all_stats=True,
                ))
            statistics.update(
                create_stats_ordered_dict(
                    'Final %s%s' % (prefix, stat_name),
                    [s[-1] for s in stat],
                    always_show_all_stats=True,
                ))
        return statistics

    """
    Multitask functions
    """

    @property
    def goal_dim(self) -> int:
        return 3

    def get_goal(self):
        return {
            'desired_goal': self._state_goal,
            'state_desired_goal': self._state_goal,
        }

    def set_goal(self, goal):
        self._state_goal = goal['state_desired_goal']

    def sample_goals(self, batch_size):
        if self.fix_goal:
            goals = np.repeat(self.fixed_goal.copy()[None], batch_size, 0)
        else:
            goals = np.random.uniform(
                self.goal_space.low,
                self.goal_space.high,
                size=(batch_size, self.goal_space.low.size),
            )
        return {
            'desired_goal': goals,
            'state_desired_goal': goals,
        }

    def compute_rewards(self, actions, obs):
        achieved_goals = obs['achieved_goal']
        desired_goals = obs['desired_goal']
        hand_pos = achieved_goals
        goals = desired_goals
        distances = np.linalg.norm(hand_pos - goals, axis=1)
        if self.reward_type == 'hand_distance':
            r = -distances
        elif self.reward_type == 'hand_success':
            r = -(distances > self.indicator_threshold).astype(float)
        else:
            raise NotImplementedError("Invalid/no reward type.")
        return r

    def get_env_state(self):
        joint_state = self.sim.get_state()
        goal = self._state_goal.copy()
        return joint_state, goal

    def set_env_state(self, state):
        state, goal = state
        self.sim.set_state(state)
        self.sim.forward()
        self._state_goal = goal
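# Hedged usage sketch for SawyerReachTorqueEnv (requires MuJoCo and the
# sawyer_xyz assets referenced by model_name; not runnable standalone).
# Goal-conditioned envs like this return dict observations keyed by
# observation / desired_goal / achieved_goal.
env = SawyerReachTorqueEnv(use_safety_box=True)
obs = env.reset()
action = env.action_space.sample()  # torques within actuator_ctrlrange
obs, reward, done, info = env.step(action)
print(obs['achieved_goal'], obs['desired_goal'], info['hand_distance'])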
    def update(self, tasks, continuous_competence, all_raw_rewards):
        if not isinstance(continuous_competence, list):
            continuous_competence = [continuous_competence]
        if not isinstance(tasks[0], list):
            tasks = [tasks]
        # print(continuous_competence)
        if len(tasks) > 0:
            new_split = False
            all_order = None
            regions = [None] * len(tasks)
            for i, task in enumerate(tasks):
                for j, rb in enumerate(self.region_bounds):
                    if rb.contains(task):
                        regions[i] = j
                        break
            cps = continuous_competence

            # add new outcomes and tasks to regions
            for reg, cp, task in zip(regions, cps, tasks):
                self.regions[reg][0].append(cp)
                self.regions[reg][1].append(task)

            self.update_nb += 1

            # check if need to split
            ind_split = []
            new_bounds = []
            new_sub_regions = []
            for reg in range(self.nb_regions):
                if len(self.regions[reg][0]) > self.maxlen:
                    # try nb_split_attempts splits
                    best_split_score = 0
                    best_abs_interest_diff = 0
                    best_bounds = None
                    best_sub_regions = None
                    is_split = False
                    for i in range(self.nb_split_attempts):
                        sub_reg1 = [deque(), deque()]
                        sub_reg2 = [deque(), deque()]

                        # repeat until the two sub regions contain at least
                        # 1/4 of the mother region
                        while len(sub_reg1[0]) < self.maxlen / 4 \
                                or len(sub_reg2[0]) < self.maxlen / 4:
                            # decide on dimension
                            dim = np.random.choice(range(self.nb_dims))
                            threshold = self.region_bounds[reg].sample()[dim]
                            bounds1 = Box(self.region_bounds[reg].low,
                                          self.region_bounds[reg].high,
                                          dtype=np.float32)
                            bounds1.high[dim] = threshold
                            bounds2 = Box(self.region_bounds[reg].low,
                                          self.region_bounds[reg].high,
                                          dtype=np.float32)
                            bounds2.low[dim] = threshold
                            bounds = [bounds1, bounds2]
                            valid_bounds = True
                            # to enforce not too small boxes ADHOC TODO #5
                            if np.any(bounds1.high - bounds1.low < self.init_size / 15):
                                valid_bounds = False
                            if np.any(bounds2.high - bounds2.low < self.init_size / 15):
                                valid_bounds = False

                            # perform split in sub regions
                            sub_reg1 = [deque(), deque()]
                            sub_reg2 = [deque(), deque()]
                            for k, task in enumerate(self.regions[reg][1]):
                                if bounds1.contains(task):
                                    sub_reg1[1].append(task)
                                    sub_reg1[0].append(self.regions[reg][0][k])
                                else:
                                    sub_reg2[1].append(task)
                                    sub_reg2[0].append(self.regions[reg][0][k])
                            sub_regions = [sub_reg1, sub_reg2]

                        # compute interest
                        interest, _ = self.compute_interests(sub_regions)

                        # compute score
                        split_score = len(sub_reg1) * len(sub_reg2) * np.abs(interest[0] - interest[1])
                        if split_score >= best_split_score \
                                and np.abs(interest[0] - interest[1]) >= self.max_difference / 8 \
                                and valid_bounds:
                            best_abs_interest_diff = np.abs(interest[0] - interest[1])
                            # print(interest)
                            best_split_score = split_score
                            best_sub_regions = sub_regions
                            best_bounds = bounds
                            is_split = True
                            if interest[0] >= interest[1]:
                                order = [1, -1]
                            else:
                                order = [-1, 1]

                    if is_split:
                        ind_split.append(reg)
                        if best_abs_interest_diff > self.max_difference:
                            self.max_difference = best_abs_interest_diff
                    else:
                        # no valid split found: discard the oldest quarter of
                        # the region's data
                        self.regions[reg][0] = deque(
                            np.array(self.regions[reg][0])[-int(3 * len(self.regions[reg][0]) / 4):],
                            maxlen=self.maxlen + 1)
                        self.regions[reg][1] = deque(
                            np.array(self.regions[reg][1])[-int(3 * len(self.regions[reg][1]) / 4):],
                            maxlen=self.maxlen + 1)
                    new_bounds.append(best_bounds)
                    new_sub_regions.append(best_sub_regions)

            # implement splits
            for i, reg in enumerate(ind_split):
                all_order = [0] * self.nb_regions
                all_order.pop(reg)
                all_order.insert(reg, order[0])
                all_order.insert(reg, order[1])
                new_split = True
                self.region_bounds.pop(reg)
                self.region_bounds.insert(reg, new_bounds[i][0])
                self.region_bounds.insert(reg, new_bounds[i][1])
                self.regions.pop(reg)
                self.regions.insert(reg, new_sub_regions[i][0])
                self.regions.insert(reg, new_sub_regions[i][1])
                self.interest.pop(reg)
                self.interest.insert(reg, 0)
                self.interest.insert(reg, 0)
                self.probas.pop(reg)
                self.probas.insert(reg, 0)
                self.probas.insert(reg, 0)

            # recompute interest
            self.interest, self.probas = self.compute_interests(self.regions)

            assert len(self.probas) == len(self.regions)

            # book-keeping
            if new_split:
                self.all_boxes.append(copy.copy(self.region_bounds))
                self.all_interests.append(copy.copy(self.interest))
                self.split_iterations.append(self.update_nb)

            return new_split, all_order
        else:
            return False, None
class Everglades(gym.Env):
    def __init__(self, env_config):
        """
        :param env_config: dict with keys:
            player_num:
            game_config: a dict containing config values
        """
        game_config = env_config['game_config']
        player_num = env_config['player_num']

        self.server_address = None
        self.sub_socket = None
        self.pub_socket = None
        self.unit_configs = None
        self.await_connection_time = None
        self.parse_game_config(game_config)

        # todo: this needs to be built via info over the server
        self.node_defs = {
            1: {'defense': 1, 'is_watchtower': False, 'is_fortress': False},
            2: {'defense': 1.25, 'is_watchtower': True, 'is_fortress': False},
            3: {'defense': 1.5, 'is_watchtower': False, 'is_fortress': False},
            4: {'defense': 1.5, 'is_watchtower': False, 'is_fortress': True},
            5: {'defense': 1.5, 'is_watchtower': False, 'is_fortress': False},
            6: {'defense': 1.5, 'is_watchtower': False, 'is_fortress': False},
            7: {'defense': 1.5, 'is_watchtower': False, 'is_fortress': False},
            8: {'defense': 1.5, 'is_watchtower': False, 'is_fortress': True},
            9: {'defense': 1.5, 'is_watchtower': False, 'is_fortress': False},
            10: {'defense': 1.25, 'is_watchtower': True, 'is_fortress': False},
            11: {'defense': 1, 'is_watchtower': False, 'is_fortress': False},
        }

        self.player_num = player_num
        self.opposing_player_num = 1 if self.player_num == 0 else 0

        self.max_game_time = 300  # todo is this correct?
        self.num_units = 100
        self.num_nodes = 11
        self.unit_classes = ['controller', 'striker', 'tank']

        if self.unit_configs is None:
            self.unit_configs = {
                1: self.num_units // len(self.unit_classes),
                2: self.num_units // len(self.unit_classes),
                3: self.num_units - 2 * (self.num_units // len(self.unit_classes))
            }
        assert sum(self.unit_configs.values()) == self.num_units

        # self.group_definitions = {}
        # self.opp_group_definitions = {}
        self.setup_state_trackers()

        self.observation_space = None
        self.message_parsing_funcs = {
            evgtypes.PY_GROUP_MoveUpdate.__name__: self.update_group_locs,
            evgtypes.PY_GROUP_Initialization.__name__: self.initialize_groups,
            evgtypes.PY_GROUP_CombatUpdate.__name__: self.update_group_combat,
            evgtypes.PY_GROUP_CreateNew.__name__: self.create_new_group,
            evgtypes.PY_GROUP_TransferUnits.__name__: self.transfer_units,
            evgtypes.PY_NODE_ControlUpdate.__name__: self.update_control_state,
            evgtypes.PY_GAME_Scores.__name__: self.update_game_score,
            evgtypes.PY_NODE_Knowledge.__name__: self.update_node_knowledge,
            evgtypes.PY_GROUP_Knowledge.__name__: self.update_opp_group_knowledge,
            evgtypes.PubError.__name__: self.handle_bad_message,
        }
        self.build_obs_space()
        self.action_space = Box(low=np.ones(self.num_units),
                                high=np.repeat(self.num_nodes + 1, self.num_units))

        self.pub_addr = f'tcp://*:{self.pub_socket}'
        self.sub_addr = f'tcp://{self.server_address}:{self.sub_socket}'
        self.conn = None

    def parse_game_config(self, config):
        """
        Loads the dict config file

        :param config:
            server_address: IP address of the server
            pub_socket: publish socket number for the server
            sub_socket: subscribe socket number for the server
            unit_config: dict with keys corresponding to unit class numbers
                ([1-3] by default) and values corresponding to the number of
                units of that type. Must sum to the number of total units
        :return:
        """
        self.server_address = config.get('server_address')
        self.pub_socket = config.get('pub_socket', '5555')
        self.sub_socket = config.get('sub_socket', '5563')
        self.unit_configs = config.get('unit_config')
        self.await_connection_time = config.get('await_connection_time', 60)

    def build_obs_space(self):
        # control points state
        # [is fortress, is watchtower, pct controlled by me, pct controlled by opp]
        # n x 100 (num_units)
        # [node loc, health, group?, in transit? in combat?]
        # what should node loc be during transit? fraction of node left or node arriving at?
        # todo class should be a boolean
        unit_portion_low = np.array(
            [1, 0, 0, 0, 0])  # node loc, class, health, in transit, in combat
        unit_portion_high = np.array(
            [self.num_nodes, len(self.unit_classes) - 1, 100, 1, 1])
        unit_state_low = np.tile(unit_portion_low, self.num_units)
        unit_state_high = np.tile(unit_portion_high, self.num_units)

        control_point_portion_low = np.array([
            0, 0, -100, -1
        ])  # is fortress, is watchtower, percent controlled, num opp units
        control_point_portion_high = np.array([1, 1, 100, self.num_units])
        control_point_state_low = np.tile(control_point_portion_low, self.num_nodes)
        control_point_state_high = np.tile(control_point_portion_high, self.num_nodes)

        self.observation_space = Box(
            low=np.concatenate([[0], control_point_state_low, unit_state_low]),
            high=np.concatenate([[self.max_game_time],
                                 control_point_state_high, unit_state_high]))

    def setup_state_trackers(self):
        self.units = UnitDefs()
        self.opp_units_at_nodes = [0] * self.num_nodes
        self.control_states = [0] * self.num_nodes
        self.global_control_states = [0] * self.num_nodes
        self.game_time = 0
        self.game_scores = np.zeros(2)
        self.game_conclusion = 0
        self.winning_player = -1
        self.turn_num = 1

    def step(self, action):
        logger.debug('Action shape {}'.format(action.shape))
        [self.act(index, node_id) for index, node_id in enumerate(action)]
        self.turn_num += 1
        self.conn.send(evgcommands.EndTurn(self.conn.guid))

        waiting_for_action, is_game_over = False, False
        _counter = 0
        while not waiting_for_action and not is_game_over:
            waiting_for_action, is_game_started, is_game_over = self.update_state()
            _counter += 1
            if _counter > 100:
                raise TimeoutError('Game not prompted for action')

        state = self.build_state()
        info = {
            'friendly_units_rem':
                len(np.where(self.units.get_all_states()[:, 2] > 0)[0]),
            # 'opp_units_rem': len(np.where(self.opp_units.get_all_states()[:, 2] > 0)),
        }
        if is_game_over:
            logger.info('Game ended')
            info['game_ending_condition'] = self.game_conclusion
            reward = self.game_scores[self.player_num]
        else:
            reward = 0
        return state, reward, is_game_over, info

    def act(self, index, node_id):
        units = self.units.get_units(inds=index)
        for unit in units:
            if unit.state.health > 0:
                logger.debug(
                    'Acting for unit (ind={}, group={}) to node {}'.format(
                        index, unit.group_id, node_id))
                self.conn.send(
                    evgcommands.PY_GROUP_MoveToNode(self.conn.guid,
                                                    unit.group_id, node_id))

    def seed(self, seed=None):
        if seed is not None and type(seed) == dict:
            self.unit_configs = seed

    def reset(self):
        if self.conn is None:
            logger.info(
                f'Connecting to server at:\n\tPub Addr {self.pub_addr}\n\tSub Addr {self.sub_addr}'
            )
            self.conn = Connection(
                self.pub_addr,
                self.sub_addr,
                self.player_num,
                await_connection_time=self.await_connection_time)
        self.setup_state_trackers()

        for class_type, count in self.unit_configs.items():
            for i in range(count):
                self.conn.send(
                    evgcommands.PY_GROUP_InitGroup(self.conn.guid,
                                                   [class_type], [1],
                                                   [f'{class_type}-{i}']))

        waiting_for_action = False
        _counter = 0
        while len(self.units) < self.num_units:
            _waiting_for_action, _, _ = self.update_state()
            waiting_for_action = _waiting_for_action if _waiting_for_action else False
            _counter += 1
            if _counter > 50:
                logger.warning(
                    'Still awaiting receipt of group initialization. Attempt {}'
                    .format(_counter))
            sleep(.1)
        logger.info(
            'Received group initialization receipts ({} Units).'.format(
                len(self.units)))

        logger.info('Sending Go')
        self.conn.send(evgcommands.Go(self.conn.guid))

        _counter = 0
        while not waiting_for_action:
            waiting_for_action, _, _ = self.update_state()
            _counter += 1
            if _counter > 50:
                logger.warning(
                    'Still waiting for turn to start. Attempt {}'.format(
                        _counter))
            sleep(.1)
        logger.info('Game initialized')

        state = self.build_state()
        return state

    def render(self, mode='human'):
        if mode == 'string':
            return self.print_game_state()

    def close(self):
        if self.conn is not None:
            self.conn.close()

    def update_state(self):
        msgs = self.conn.receive_and_parse()
        waiting_for_action = False
        is_game_over = False
        is_game_started = False
        self.opp_units_at_nodes = [0] * self.num_nodes

        msgs_by_type = {}
        for msg in msgs:
            if hasattr(msg, 'player') and msg.player != self.player_num:
                continue
            msg_type = msg.__class__.__name__
            if msg_type not in msgs_by_type:
                msgs_by_type[msg_type] = []
            msgs_by_type[msg_type].append(msg)
        logger.debug('Message received types {}'.format(msgs_by_type.keys()))

        for msg_type in msgs_by_type:
            if msg_type in self.message_parsing_funcs:
                [
                    self.message_parsing_funcs[msg_type](msg)
                    for msg in msgs_by_type[msg_type]
                ]
            elif msg_type == str(evgtypes.TurnStart.__name__):
                waiting_for_action = True
            elif msg_type == str(evgtypes.GoodBye.__name__):
                logger.info('Goodbye received. Turn {}'.format(self.turn_num))
                is_game_over = True
            elif msg_type == str(evgtypes.NewClientACK.__name__):
                # is_game_started = True
                pass
            elif msg_type == str(evgtypes.TimeStamp.__name__):
                self.game_time = max([msg.time for msg in msgs_by_type[msg_type]])
                # NOTE: This is to avoid setting a bad time value when the
                # server sends a bad timestamp
                if self.game_time > self.max_game_time:
                    self.game_time = self.max_game_time
            else:
                logger.debug(f'Unrecognized message type {msg_type}')

        return waiting_for_action, is_game_started, is_game_over

    def build_state(self):
        state = np.array([self.max_game_time - self.game_time])

        # control points
        # is fortress, is watchtower, percent controlled
        for i in range(self.num_nodes):
            node_num = i + 1
            node_def = self.node_defs[node_num]
            is_fortress = 1 if node_def['is_fortress'] else 0
            is_watchtower = 1 if node_def['is_watchtower'] else 0
            control_pct = float(self.control_states[i])
            num_opp_units = self.opp_units_at_nodes[i]
            node_state = np.array(
                [is_fortress, is_watchtower, control_pct, num_opp_units])
            state = np.concatenate([state, node_state])

        # unit state
        # node loc, class, health, in transit, in combat
        state = np.concatenate([state, self.units.get_all_states().flatten()])

        self.verify_state(state)
        return state

    def initialize_groups(self, msg):
        logger.debug(f'~~~Initializing groups:\n{json.dumps(msg.__dict__)}')
        if msg.player != self.player_num:
            return
        group_num = msg.group
        for start_id, group_type, count in zip(msg.start, msg.types, msg.count):
            if count == 0:
                continue
            group_type_num = self.unit_classes.index(group_type.lower())
            state = UnitState(node_loc=int(msg.node),
                              health=100,
                              class_type=group_type_num,
                              in_combat=0,
                              in_transit=0)
            self.units.add_units([start_id + i for i in range(count)],
                                 [group_num] * count, [state] * count)

    def update_group_locs(self, msg):
        logger.debug(
            f'~~~Updating group locations:\n{json.dumps(msg.__dict__)}')
        if msg.player != self.player_num:
            return
        in_transit = msg.status == 'IN_TRANSIT'
        if msg.status == 'IN_TRANSIT':
            node_loc = (msg.start + msg.destination) / 2.
        elif msg.status == 'ARRIVED':
            node_loc = msg.destination
        else:
            node_loc = msg.start
        units = self.units.get_units(group_ids=msg.group)
        for unit in units:
            state = unit.state
            state.node_loc = node_loc
            state.in_transit = in_transit
            unit.add_state(state)

    def update_group_combat(self, msg):
        logger.debug(f'~~~Group combat update:\n{json.dumps(msg.__dict__)}')
        if msg.player != self.player_num:
            return
        units = self.units.get_units(unit_ids=msg.units)
        for unit, health in zip(units, msg.health):
            state = unit.state
            state.health = health
            unit.add_state(state)

    def create_new_group(self, msg):
        logger.warning('Create new group not supported')

    def transfer_units(self, msg):
        logger.warning('Transferring units not supported')

    def update_control_state(self, msg):
        logger.debug(f'~~~Control state update:\n{json.dumps(msg.__dict__)}')
        value = msg.controlvalue if msg.faction == self.player_num else -msg.controlvalue
        value = int(value * 100)
        if msg.player == self.player_num:
            self.control_states[msg.node - 1] = value
        self.global_control_states[msg.node - 1] = value

    def update_game_score(self, msg):
        logger.info(f'~~~Game score update:\n{json.dumps(msg.__dict__)}')
        self.game_scores = np.array([msg.player1, msg.player2])
        # 1: time expired, 2: base captured, 3: units eliminated
        self.game_conclusion = msg.status
        self.winning_player = -1 if (
            self.game_scores[0] == self.game_scores).all() else np.argmax(
                self.game_scores)

    def update_node_knowledge(self, msg):
        logger.debug(f'~~~Node knowledge update:\n{json.dumps(msg.__dict__)}')
        is_players = msg.player == self.player_num
        for node, knowledge, controller, percent in zip(
                msg.nodes, msg.knowledge, msg.controller, msg.percent):
            if controller == self.opposing_player_num:
                percent = -percent
            self.global_control_states[node - 1] = percent
            if is_players:
                self.control_states[node - 1] = percent

    def update_opp_group_knowledge(self, msg):
        if msg.player != self.player_num:
            return
        logger.debug(
            f'~~~Opponent knowledge update:\n{json.dumps(msg.__dict__)}')
        num_units = sum(msg.ucount)
        node_num = msg.node1 - 1
        tar_node_num = msg.node2 - 1
        if tar_node_num < 0:
            self.opp_units_at_nodes[node_num] += num_units
        else:
            self.opp_units_at_nodes[node_num] += num_units / 2.
            self.opp_units_at_nodes[tar_node_num] += num_units / 2.

    def handle_bad_message(self, msg):
        if 'movetonode' not in msg.message.lower():
            logger.info('Bad message {}'.format(json.dumps(msg.__dict__)))

    def verify_state(self, state):
        if not self.observation_space.contains(state):
            logger.info('Observation not within defined bounds')
            logger.info(self.print_game_state())
            logger.info(state)
        assert self.observation_space.shape == state.shape

    def print_game_state(self):
        units_at_node = {}
        for i in range(1, self.num_nodes + 1):
            units_at_node[i] = 0
        for i, unit in enumerate(self.units.get_all_units()):
            node_loc = unit.state.node_loc
            if node_loc not in units_at_node:
                units_at_node[node_loc] = 0
            units_at_node[node_loc] += 1

        string = 'Game state at turn {}'.format(self.turn_num)
        for node_num, num_units in sorted(units_at_node.items()):
            node_num = float(node_num)
            string += '\n{} ({:.1f} %): {} units, {} opp units'.format(
                int(node_num) if node_num.is_integer() else node_num,
                self.control_states[int(node_num) - 1]
                if node_num.is_integer() else 0, num_units,
                self.opp_units_at_nodes[int(node_num) - 1]
                if node_num.is_integer() else 0)
        return string
class SawyerPushAndReachXYZDoublePuckEnv(MultitaskEnv, SawyerXYZEnv):
    def __init__(self,
                 puck_low=(-.4, .2),
                 puck_high=(.4, 1),
                 reward_type='state_distance',
                 norm_order=1,
                 indicator_threshold=0.06,
                 hand_low=(-0.28, 0.3, 0.05),
                 hand_high=(0.28, 0.9, 0.3),
                 fix_goal=False,
                 fixed_goal=(0.15, 0.6, 0.02, -0.15, 0.6),
                 goal_low=(-0.25, 0.3, 0.02, -.2, .4, -.2, .4),
                 goal_high=(0.25, 0.875, 0.02, .2, .8, .2, .8),
                 hide_goal_markers=False,
                 init_puck_z=0.02,
                 num_resets_before_puck_reset=1,
                 num_resets_before_hand_reset=1,
                 always_start_on_same_side=True,
                 goal_always_on_same_side=True,
                 **kwargs):
        self.quick_init(locals())
        MultitaskEnv.__init__(self)
        SawyerXYZEnv.__init__(self,
                              hand_low=hand_low,
                              hand_high=hand_high,
                              model_name=self.model_name,
                              **kwargs)
        if puck_low is None:
            puck_low = self.hand_low[:2]
        if puck_high is None:
            puck_high = self.hand_high[:2]
        puck_low = np.array(puck_low)
        puck_high = np.array(puck_high)
        self.puck_low = puck_low
        self.puck_high = puck_high

        if goal_low is None:
            goal_low = np.hstack((self.hand_low, puck_low, puck_low))
        if goal_high is None:
            goal_high = np.hstack((self.hand_high, puck_high, puck_high))
        self.goal_low = np.array(goal_low)
        self.goal_high = np.array(goal_high)

        self.reward_type = reward_type
        self.norm_order = norm_order
        self.indicator_threshold = indicator_threshold

        self.fix_goal = fix_goal
        self.fixed_goal = np.array(fixed_goal)
        self._state_goal = None
        self.hide_goal_markers = hide_goal_markers

        self.action_space = Box(np.array([-1, -1, -1]),
                                np.array([1, 1, 1]),
                                dtype=np.float32)
        self.hand_and_two_puck_space = Box(
            np.hstack((self.hand_low, puck_low, puck_low)),
            np.hstack((self.hand_high, puck_high, puck_high)),
            dtype=np.float32)
        self.hand_space = Box(self.hand_low, self.hand_high, dtype=np.float32)
        self.puck_space = Box(self.puck_low, self.puck_high, dtype=np.float32)
        self.observation_space = Dict([
            ('observation', self.hand_and_two_puck_space),
            ('desired_goal', self.hand_and_two_puck_space),
            ('achieved_goal', self.hand_and_two_puck_space),
            ('state_observation', self.hand_and_two_puck_space),
            ('state_desired_goal', self.hand_and_two_puck_space),
            ('state_achieved_goal', self.hand_and_two_puck_space),
            ('proprio_observation', self.hand_space),
            ('proprio_desired_goal', self.hand_space),
            ('proprio_achieved_goal', self.hand_space),
        ])
        self.init_puck_z = init_puck_z
        self.reset_counter = 0
        self.num_resets_before_puck_reset = num_resets_before_puck_reset
        self.num_resets_before_hand_reset = num_resets_before_hand_reset
        self._always_start_on_same_side = always_start_on_same_side
        self._goal_always_on_same_side = goal_always_on_same_side
        self._set_puck_xys(self._sample_puck_xys())

    @property
    def model_name(self):
        return get_asset_full_path('sawyer_xyz/sawyer_push_two_puck.xml')

    def viewer_setup(self):
        self.viewer.cam.trackbodyid = 0
        self.viewer.cam.lookat[0] = 0
        self.viewer.cam.lookat[1] = 1.0
        self.viewer.cam.lookat[2] = 0.5
        self.viewer.cam.distance = 0.3
        self.viewer.cam.elevation = -45
        self.viewer.cam.azimuth = 270
        self.viewer.cam.trackbodyid = -1

    def step(self, action):
        self.set_xyz_action(action)
        u = np.zeros(7)
        self.do_simulation(u)
        self._set_goal_marker(self._state_goal)
        ob = self._get_obs()
        reward = self.compute_reward(action, ob)
        info = self._get_info()
        done = False
        return ob, reward, done, info

    def _get_obs(self):
        e = self.get_endeff_pos()
        b = self.get_puck1_pos()[:2]
        c = self.get_puck2_pos()[:2]
        flat_obs = np.concatenate((e, b, c))
        return dict(
            observation=flat_obs,
            desired_goal=self._state_goal,
            achieved_goal=flat_obs,
            state_observation=flat_obs,
            state_desired_goal=self._state_goal,
            state_achieved_goal=flat_obs,
            proprio_observation=flat_obs[:3],
            proprio_desired_goal=self._state_goal[:3],
            proprio_achieved_goal=flat_obs[:3],
        )

    def _get_info(self):
        hand_goal = self._state_goal[:3]
        puck1_goal = self._state_goal[3:5]
        puck2_goal = self._state_goal[5:7]

        # hand distance
        hand_diff = hand_goal - self.get_endeff_pos()
        hand_distance = np.linalg.norm(hand_diff, ord=self.norm_order)

        # puck1 distance
        puck1_diff = puck1_goal - self.get_puck1_pos()[:2]
        puck1_distance = np.linalg.norm(puck1_diff, ord=self.norm_order)

        # puck2 distance
        puck2_diff = puck2_goal - self.get_puck2_pos()[:2]
        puck2_distance = np.linalg.norm(puck2_diff, ord=self.norm_order)

        # state distance
        state_diff = np.hstack(
            (self.get_endeff_pos(), self.get_puck1_pos()[:2],
             self.get_puck2_pos()[:2])) - self._state_goal
        state_distance = np.linalg.norm(state_diff, ord=self.norm_order)

        return dict(
            hand_distance=hand_distance,
            puck1_distance=puck1_distance,
            puck2_distance=puck2_distance,
            puck_distance_sum=puck1_distance + puck2_distance,
            hand_and_puck_distance=hand_distance + puck1_distance + puck2_distance,
            state_distance=state_distance,
            hand_success=float(hand_distance < self.indicator_threshold),
            puck1_success=float(puck1_distance < self.indicator_threshold),
            puck2_success=float(puck2_distance < self.indicator_threshold),
            hand_and_puck_success=float(
                hand_distance + puck1_distance + puck2_distance
                < self.indicator_threshold),
            state_success=float(state_distance < self.indicator_threshold),
        )

    def get_puck1_pos(self):
        return self.data.get_body_xpos('puck1').copy()

    def get_puck2_pos(self):
        return self.data.get_body_xpos('puck2').copy()

    def _sample_puck_xys(self):
        if self._always_start_on_same_side:
            return np.array([-.1, 0.6]), np.array([0.1, 0.6])
        else:
            if np.random.randint(0, 2) == 0:
                return np.array([0.1, 0.6]), np.array([-.1, 0.6])
            else:
                return np.array([-.1, 0.6]), np.array([0.1, 0.6])

    def _set_goal_marker(self, goal):
        """This should be used ONLY for visualization. Use self._state_goal
        for logging, learning, etc.
        """
        self.data.site_xpos[self.model.site_name2id('hand-goal-site')] = (
            goal[:3])
        self.data.site_xpos[self.model.site_name2id('puck1-goal-site')][:2] = (
            goal[3:5])
        self.data.site_xpos[self.model.site_name2id('puck2-goal-site')][:2] = (
            goal[5:7])
        if self.hide_goal_markers:
            self.data.site_xpos[self.model.site_name2id('hand-goal-site'),
                                2] = (-1000)
            self.data.site_xpos[self.model.site_name2id('puck1-goal-site'),
                                2] = (-1000)
            self.data.site_xpos[self.model.site_name2id('puck2-goal-site'),
                                2] = (-1000)

    def _set_puck_xys(self, puck_xys):
        pos1, pos2 = puck_xys
        qpos = self.data.qpos.flat.copy()
        qvel = self.data.qvel.flat.copy()
        qpos[7:10] = np.hstack((pos1.copy(), np.array([self.init_puck_z])))
        qpos[10:14] = np.array([1, 0, 0, 0])
        qpos[14:17] = np.hstack((pos2.copy(), np.array([self.init_puck_z])))
        qpos[17:21] = np.array([1, 0, 0, 0])
        qvel[14:21] = 0
        self.set_state(qpos, qvel)

    def reset_model(self):
        if self.reset_counter % self.num_resets_before_hand_reset == 0:
            self._reset_hand()
        if self.reset_counter % self.num_resets_before_puck_reset == 0:
            self._set_puck_xys(self._sample_puck_xys())
        if not (self.puck_space.contains(self.get_puck1_pos()[:2])
                and self.puck_space.contains(self.get_puck2_pos()[:2])):
            self._set_puck_xys(self._sample_puck_xys())
        goal = self.sample_goal()
        self.set_goal(goal)
        self.reset_counter += 1
        self.reset_mocap_welds()
        return self._get_obs()

    def _reset_hand(self):
        velocities = self.data.qvel.copy()
        angles = self.data.qpos.copy()
        angles[:7] = self.init_angles[:7]
        self.set_state(angles.flatten(), velocities.flatten())
        for _ in range(10):
            self.data.set_mocap_pos('mocap', np.array([0, 0.4, 0.02]))
            self.data.set_mocap_quat('mocap', np.array([1, 0, 1, 0]))
        sim = self.sim
        if sim.model.nmocap > 0 and sim.model.eq_data is not None:
            for i in range(sim.model.eq_data.shape[0]):
                if sim.model.eq_type[i] == mujoco_py.const.EQ_WELD:
                    # Define the xyz + quat of the mocap relative to the hand
                    sim.model.eq_data[i, :] = np.array(
                        [0., 0., 0., 1., 0., 0., 0.])

    def reset(self):
        ob = self.reset_model()
        if self.viewer is not None:
            self.viewer_setup()
        return ob

    @property
    def init_angles(self):
        return [
            1.78026069e+00, -6.84415781e-01, -1.54549231e-01,
            2.30672090e+00, 1.93111471e+00, 1.27854012e-01,
            1.49353907e+00, 1.80196716e-03, 7.40415706e-01,
            2.09895360e-02, 1, 0, 0, 0,
            1.80196716e-03 + .3, 7.40415706e-01, 2.09895360e-02,
            1, 0, 0, 0
        ]

    """
    Multitask functions
    """

    def get_goal(self):
        return {
            'desired_goal': self._state_goal,
            'state_desired_goal': self._state_goal,
        }

    def set_goal(self, goal):
        self._state_goal = goal['state_desired_goal']
        self._set_goal_marker(self._state_goal)

    def set_to_goal(self, goal):
        hand_goal = goal['state_desired_goal'][:3]
        for _ in range(10):
            self.data.set_mocap_pos('mocap', hand_goal)
            self.data.set_mocap_quat('mocap', np.array([1, 0, 1, 0]))
            self.do_simulation(None, self.frame_skip)
        puck_goal = goal['state_desired_goal'][3:]
        self._set_puck_xys((puck_goal[:2], puck_goal[2:]))
        self.sim.forward()

    def sample_goals(self, batch_size):
        if self.fix_goal:
            goals = np.repeat(self.fixed_goal.copy()[None], batch_size, 0)
        else:
            if self._goal_always_on_same_side:
                goal_low = self.goal_low.copy()
                goal_high = self.goal_high.copy()
                # first puck
                goal_high[3] = 0
                # second puck
                goal_low[5] = 0
                goals = np.random.uniform(
                    goal_low,
                    goal_high,
                    size=(batch_size, self.goal_low.size),
                )
            else:
                goals = np.random.uniform(
                    self.goal_low,
                    self.goal_high,
                    size=(batch_size, self.goal_low.size),
                )
        return {
            'desired_goal': goals,
            'state_desired_goal': goals,
        }

    def compute_rewards(self, actions, obs):
        achieved_goals = obs['state_achieved_goal']
        desired_goals = obs['state_desired_goal']
        hand_pos = achieved_goals[:, :3]
        puck1_pos = achieved_goals[:, 3:5]
        puck2_pos = achieved_goals[:, 5:7]
        hand_goals = desired_goals[:, :3]
        puck1_goals = desired_goals[:, 3:5]
        puck2_goals = desired_goals[:, 5:7]

        hand_distances = np.linalg.norm(hand_goals - hand_pos,
                                        ord=self.norm_order, axis=1)
        puck1_distances = np.linalg.norm(puck1_goals - puck1_pos,
                                         ord=self.norm_order, axis=1)
        puck2_distances = np.linalg.norm(puck2_goals - puck2_pos,
                                         ord=self.norm_order, axis=1)

        if self.reward_type == 'hand_distance':
            r = -hand_distances
        elif self.reward_type == 'hand_success':
            r = -(hand_distances > self.indicator_threshold).astype(float)
        elif self.reward_type == 'puck1_distance':
            r = -puck1_distances
        elif self.reward_type == 'puck1_success':
            r = -(puck1_distances > self.indicator_threshold).astype(float)
        elif self.reward_type == 'puck2_distance':
            r = -puck2_distances
        elif self.reward_type == 'puck2_success':
            r = -(puck2_distances > self.indicator_threshold).astype(float)
        elif self.reward_type == 'state_distance':
            r = -np.linalg.norm(achieved_goals - desired_goals,
                                ord=self.norm_order, axis=1)
        elif self.reward_type == 'vectorized_state_distance':
            r = -np.abs(achieved_goals - desired_goals)
        else:
            raise NotImplementedError("Invalid/no reward type.")
        return r

    def get_diagnostics(self, paths, prefix=''):
        statistics = OrderedDict()
        for stat_name in [
                'hand_distance',
                'puck1_distance',
                'puck2_distance',
                'puck_distance_sum',
                'hand_and_puck_distance',
                'state_distance',
                'hand_success',
                'puck1_success',
                'puck2_success',
                'hand_and_puck_success',
                'state_success',
        ]:
            stat = get_stat_in_paths(paths, 'env_infos', stat_name)
            statistics.update(
                create_stats_ordered_dict(
                    '%s%s' % (prefix, stat_name),
                    stat,
                    always_show_all_stats=True,
                ))
            statistics.update(
                create_stats_ordered_dict(
                    'Final %s%s' % (prefix, stat_name),
                    [s[-1] for s in stat],
                    always_show_all_stats=True,
                ))
        return statistics

    def get_env_state(self):
        base_state = super().get_env_state()
        goal = self._state_goal.copy()
        return base_state, goal

    def set_env_state(self, state):
        base_state, goal = state
        super().set_env_state(base_state)
        self._state_goal = goal
        self._set_goal_marker(goal)
    def split(self, nid):
        # try nb_split_attempts splits
        reg = self.tree.get_node(nid).data
        best_split_score = 0
        best_abs_interest_diff = 0
        best_bounds = None
        best_sub_regions = None
        is_split = False
        for i in range(self.nb_split_attempts):
            sub_reg1 = [deque(maxlen=self.maxlen + 1), deque(maxlen=self.maxlen + 1)]
            sub_reg2 = [deque(maxlen=self.maxlen + 1), deque(maxlen=self.maxlen + 1)]

            # repeat until the two sub regions contain at least minlen of the
            # mother region TRICK NB 1
            while len(sub_reg1[0]) < self.minlen or len(sub_reg2[0]) < self.minlen:
                # decide on dimension
                dim = np.random.choice(range(self.nb_dims))
                threshold = reg.bounds.sample()[dim]
                bounds1 = Box(reg.bounds.low, reg.bounds.high, dtype=np.float32)
                bounds1.high[dim] = threshold
                bounds2 = Box(reg.bounds.low, reg.bounds.high, dtype=np.float32)
                bounds2.low[dim] = threshold
                bounds = [bounds1, bounds2]
                valid_bounds = True
                # to enforce not too small boxes TRICK NB 2
                if np.any(bounds1.high - bounds1.low < self.init_size / 15):
                    valid_bounds = False
                if np.any(bounds2.high - bounds2.low < self.init_size / 15):
                    valid_bounds = False

                # perform split in sub regions
                sub_reg1 = [deque(maxlen=self.maxlen + 1), deque(maxlen=self.maxlen + 1)]
                sub_reg2 = [deque(maxlen=self.maxlen + 1), deque(maxlen=self.maxlen + 1)]
                for j, task in enumerate(reg.cps_gs[1]):
                    if bounds1.contains(task):
                        sub_reg1[1].append(task)
                        sub_reg1[0].append(reg.cps_gs[0][j])
                    else:
                        sub_reg2[1].append(task)
                        sub_reg2[0].append(reg.cps_gs[0][j])
                sub_regions = [sub_reg1, sub_reg2]

            # compute interest
            interest = [self.compute_interest(sub_reg1),
                        self.compute_interest(sub_reg2)]

            # compute score
            split_score = len(sub_reg1) * len(sub_reg2) * np.abs(interest[0] - interest[1])
            # TRICK NB 3, max diff
            # and np.abs(interest[0] - interest[1]) >= self.max_difference / 8
            if split_score >= best_split_score and valid_bounds:
                is_split = True
                best_abs_interest_diff = np.abs(interest[0] - interest[1])
                best_split_score = split_score
                best_sub_regions = sub_regions
                best_bounds = bounds

        if is_split:
            if best_abs_interest_diff > self.max_difference:
                self.max_difference = best_abs_interest_diff

            # add new nodes to tree
            for i, (cps_gs, bounds) in enumerate(zip(best_sub_regions, best_bounds)):
                self.tree.create_node(parent=nid,
                                      data=Region(self.maxlen,
                                                  cps_gs=cps_gs,
                                                  bounds=bounds,
                                                  interest=interest[i]))
        else:
            # print("abort mission")
            # TRICK NB 6, remove old stuff if can't find split
            assert len(reg.cps_gs[0]) == (self.maxlen + 1)
            reg.cps_gs[0] = deque(islice(reg.cps_gs[0], int(self.maxlen / 4),
                                         self.maxlen + 1))
            reg.cps_gs[1] = deque(islice(reg.cps_gs[1], int(self.maxlen / 4),
                                         self.maxlen + 1))

        return is_split
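# A minimal, self-contained sketch of the Box-splitting trick used above:
# copy a region's bounds, then move one face of each copy to a sampled
# threshold along a random dimension. Values are illustrative.
import numpy as np
from gym.spaces import Box

region = Box(low=np.zeros(2, dtype=np.float32),
             high=np.ones(2, dtype=np.float32), dtype=np.float32)
dim = 0
threshold = region.sample()[dim]
child1 = Box(region.low, region.high, dtype=np.float32)
child1.high[dim] = threshold  # lower half along `dim`
child2 = Box(region.low, region.high, dtype=np.float32)
child2.low[dim] = threshold   # upper half along `dim`
# every point of the parent now falls in exactly one child (modulo the boundary)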
class FourWayGridWorld(gym.Env):
    def __init__(self, env_config=None):
        self.config = copy.deepcopy(default_config)
        if isinstance(env_config, dict):
            self.config.update(env_config)
        self.N = self.config['N']
        self.observation_space = Box(0, self.N, shape=(2, ))
        self.action_space = Box(-1, 1, shape=(2, ))
        self.map = np.ones((self.N + 1, self.N + 1), dtype=np.float32)
        self._fill_map()
        self.walls = []
        self._fill_walls()
        self.reset()

    def _fill_walls(self):
        """Suppose you have three walls: two vertical, one horizontal."""
        if self.config['use_walls']:
            print('Building Walls!!!')
            self.walls.append(Wall([4, 6], [12, 6]))
            self.walls.append(Wall([4, 10], [12, 10]))
            self.walls.append(Wall([12, 6], [12, 10]))

    def _fill_map(self):
        self.map.fill(-0.1)
        self.map[int(self.N / 2), 0] = self.config['down']
        self.map[0, int(self.N / 2)] = self.config['left']
        self.map[self.N, int(self.N / 2)] = self.config['right']
        self.map[int(self.N / 2), self.N] = self.config['up']
        self.traj = []

    def is_done(self, reward):
        if self.config['early_done']:
            return (not (0 < self.x < self.N)) or \
                   (not (0 < self.y < self.N)) or \
                   (self.step_num >= 2 * self.N) or \
                   (reward > 0.1)
        return self.step_num >= 2 * self.N

    def step(self, action):
        x = _clip(self.x + action[0], max(0, self.x - 1),
                  min(self.N, self.x + 1))
        y = _clip(self.y + action[1], max(0, self.y - 1),
                  min(self.N, self.y + 1))
        if any(w.intersect((self.x, self.y), (x, y)) for w in self.walls):
            pass
        else:
            self.x = x
            self.y = y
        loc = np.array((self.x, self.y))
        reward = self.map[int(round(self.x)), int(round(self.y))]
        self.step_num += 1
        if self.config['record_trajectory']:
            self.traj.append(loc)
        return loc, reward, self.is_done(reward), {}

    def render(self, mode=None, **plt_kwargs):
        import matplotlib.pyplot as plt
        fig = plt.figure(**plt_kwargs)
        ax = fig.add_subplot()
        img = ax.imshow(np.transpose(self.map)[::-1, :],
                        aspect=1,
                        extent=[-0.5, self.N + 0.5, -0.5, self.N + 0.5],
                        cmap=plt.cm.hot_r)
        fig.colorbar(img)
        ax.set_aspect(1)
        for w in self.walls:
            x = [w.start[0], w.end[0]]
            y = [w.start[1], w.end[1]]
            ax.plot(x, y, c='orange')
        if len(self.traj) > 1:
            traj = np.asarray(self.traj)
            ax.plot(traj[:, 0], traj[:, 1], c='blue', alpha=0.75)
        ax.set_xlabel('X-coordinate')
        ax.set_ylabel('Y-coordinate')
        ax.set_xlim(0, self.N)
        ax.set_ylim(0, self.N)
        if self.config["_show"]:
            plt.show()
        return fig, ax

    def reset(self):
        if self.config['init_loc'] is not None:
            loc = self.set_loc(self.config['init_loc'])
        else:
            if self.config['int_initialize']:
                loc = np.random.randint(0, self.N + 1,
                                        size=(2, )).astype(np.float32)
            else:
                loc = np.random.uniform(0, self.N,
                                        size=(2, )).astype(np.float32)
        self.x, self.y = loc[0], loc[1]
        self.step_num = 0
        self.traj = [loc]
        return loc

    def seed(self, s=None):
        if s is not None:
            np.random.seed(s)

    def set_loc(self, loc):
        loc = np.asarray(loc)
        assert self.observation_space.contains(loc)
        self.x = loc[0]
        self.y = loc[1]
        return loc
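# A hedged usage sketch for FourWayGridWorld. `default_config`, `Wall`, and
# `_clip` come from the surrounding module; the config keys used here are the
# ones the class itself reads (N, use_walls, early_done, ...).
env = FourWayGridWorld({'N': 16, 'use_walls': False})
obs = env.reset()
done, total = False, 0.0
while not done:
    obs, reward, done, info = env.step(env.action_space.sample())
    total += reward
print('episode return:', total)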
class UR5ePushAndReachXYZEnv(MultitaskEnv, UR5eXYZEnv): def __init__( self, puck_low=(-0.2, -0.2), # x, y puck_high=(0.2, 0.2), reward_type='state_distance', norm_order=1, indicator_threshold=0.05, hand_low=(-0.35, -0.35, 0.84), hand_high=(0.35, 0.3, 1.2), fix_goal=False, fixed_goal=( 0, 0, 0.84, 0, 0), # table top surface height .82, puck half height .02 goal_low=(-.2, -.2, .84, -.2, -.2), goal_high=(.2, .2, .84, .2, .2), hide_goal_markers=False, init_puck_z=0.84, init_hand_xyz=(0, 0, 0.9), reset_free=False, xml_path='ur5e_sim/ur5e_push_puck.xml', clamp_puck_on_step=True, puck_radius=.04, **kwargs): self.quick_init(locals()) self.model_name = get_asset_full_path(xml_path) MultitaskEnv.__init__(self) UR5eXYZEnv.__init__(self, hand_low=hand_low, hand_high=hand_high, model_name=self.model_name, **kwargs) if puck_low is None: puck_low = self.hand_low[:2] if puck_high is None: puck_high = self.hand_high[:2] puck_low = np.array(puck_low) puck_high = np.array(puck_high) self.puck_low = puck_low self.puck_high = puck_high # The goal ranges are composed by 5 values: # hand_goal_x, hand_goal_y, hand_goal_z, puck_goal_x, puck_goal_y # The robot first move gripper to puck goal and then push the puck to hand goal if goal_low is None: goal_low = np.hstack((self.hand_low, puck_low)) if goal_high is None: goal_high = np.hstack((self.hand_high, puck_high)) self.goal_low = np.array(goal_low) self.goal_high = np.array(goal_high) self.reward_type = reward_type self.norm_order = norm_order self.indicator_threshold = indicator_threshold self.fix_goal = fix_goal self.fixed_goal = np.array(fixed_goal) self._state_goal = None self.hide_goal_markers = hide_goal_markers self.action_space = Box(np.array([-1, -1, -1]), np.array([1, 1, 1]), dtype=np.float32) self.hand_and_puck_space = Box(np.hstack((self.hand_low, puck_low)), np.hstack((self.hand_high, puck_high)), dtype=np.float32) self.hand_space = Box(self.hand_low, self.hand_high, dtype=np.float32) self.observation_space = Dict([ ('observation', self.hand_and_puck_space), ('desired_goal', self.hand_and_puck_space), ('achieved_goal', self.hand_and_puck_space), ('state_observation', self.hand_and_puck_space), ('state_desired_goal', self.hand_and_puck_space), ('state_achieved_goal', self.hand_and_puck_space), ('proprio_observation', self.hand_space), ('proprio_desired_goal', self.hand_space), ('proprio_achieved_goal', self.hand_space), ]) self.init_puck_z = init_puck_z self.init_hand_xyz = np.array(init_hand_xyz) self._set_puck_xy(self.sample_puck_xy()) self.reset_free = reset_free self.reset_counter = 0 self.puck_space = Box(self.puck_low, self.puck_high, dtype=np.float32) self.clamp_puck_on_step = clamp_puck_on_step self.puck_radius = puck_radius self.reset() def viewer_setup(self): # FIXME change the cam according to the env self.viewer.cam.trackbodyid = 0 self.viewer.cam.lookat[0] = 0 self.viewer.cam.lookat[1] = -1 self.viewer.cam.lookat[2] = 2 self.viewer.cam.distance = 0.5 self.viewer.cam.elevation = -45 self.viewer.cam.azimuth = 90 self.viewer.cam.trackbodyid = -1 def step(self, action): self.set_xyz_action(action) # FIXME refer DOF u = np.zeros(6) # Since no actuator in robot, u takes no effect, # only run the simulation for frame_skip times self.do_simulation(u) if self.clamp_puck_on_step: curr_puck_pos = self.get_puck_pos()[:2] curr_puck_pos = np.clip(curr_puck_pos, self.puck_space.low, self.puck_space.high) self._set_puck_xy(curr_puck_pos) self._set_goal_marker(self._state_goal) ob = self._get_obs() reward = self.compute_reward(action, ob) info = 
self._get_info() done = False return ob, reward, done, info def _get_obs(self): e = self.get_endeff_pos() b = self.get_puck_pos()[:2] flat_obs = np.concatenate((e, b)) return dict( observation=flat_obs, desired_goal=self._state_goal, achieved_goal=flat_obs, state_observation=flat_obs, state_desired_goal=self._state_goal, state_achieved_goal=flat_obs, proprio_observation=flat_obs[:3], proprio_desired_goal=self._state_goal[:3], proprio_achieved_goal=flat_obs[:3], ) def _get_info(self): hand_goal = self._state_goal[:3] puck_goal = self._state_goal[3:] # hand distance hand_diff = hand_goal - self.get_endeff_pos() hand_distance = np.linalg.norm(hand_diff, ord=self.norm_order) hand_distance_l1 = np.linalg.norm(hand_diff, 1) hand_distance_l2 = np.linalg.norm(hand_diff, 2) # puck distance puck_diff = puck_goal - self.get_puck_pos()[:2] puck_distance = np.linalg.norm(puck_diff, ord=self.norm_order) puck_distance_l1 = np.linalg.norm(puck_diff, 1) puck_distance_l2 = np.linalg.norm(puck_diff, 2) # touch distance touch_diff = self.get_endeff_pos() - self.get_puck_pos() touch_distance = np.linalg.norm(touch_diff, ord=self.norm_order) touch_distance_l1 = np.linalg.norm(touch_diff, ord=1) touch_distance_l2 = np.linalg.norm(touch_diff, ord=2) # state distance state_diff = np.hstack((self.get_endeff_pos(), self.get_puck_pos()[:2])) - self._state_goal state_distance = np.linalg.norm(state_diff, ord=self.norm_order) state_distance_l1 = np.linalg.norm(state_diff, ord=1) state_distance_l2 = np.linalg.norm(state_diff, ord=2) return dict( hand_distance=hand_distance, hand_distance_l1=hand_distance_l1, hand_distance_l2=hand_distance_l2, puck_distance=puck_distance, puck_distance_l1=puck_distance_l1, puck_distance_l2=puck_distance_l2, hand_and_puck_distance=hand_distance + puck_distance, hand_and_puck_distance_l1=hand_distance_l1 + puck_distance_l1, hand_and_puck_distance_l2=hand_distance_l2 + puck_distance_l2, touch_distance=touch_distance, touch_distance_l1=touch_distance_l1, touch_distance_l2=touch_distance_l2, state_distance=state_distance, state_distance_l1=state_distance_l1, state_distance_l2=state_distance_l2, hand_success=float(hand_distance < self.indicator_threshold), puck_success=float(puck_distance < self.indicator_threshold), hand_and_puck_success=float( hand_distance + puck_distance < self.indicator_threshold), touch_success=float(touch_distance < self.indicator_threshold), state_success=float(state_distance < self.indicator_threshold), ) def get_puck_pos(self): return self.data.get_body_xpos('puck').copy() def sample_puck_xy(self): return np.array([0, -0.1]) def _set_goal_marker(self, goal): """This should be use ONLY for visualization. Use self._state_goal for logging, learning, etc. """ self.data.site_xpos[self.model.site_name2id('hand-goal-site')] = ( goal[:3]) self.data.site_xpos[self.model.site_name2id('puck-goal-site')][:2] = ( goal[3:]) if self.hide_goal_markers: self.data.site_xpos[self.model.site_name2id('hand-goal-site'), 2] = (-1000) self.data.site_xpos[self.model.site_name2id('puck-goal-site'), 2] = (-1000) def _set_puck_xy(self, pos): """ WARNING: this resets the sites (because set_state resets sights do). 
""" qpos = self.data.qpos.flat.copy() qvel = self.data.qvel.flat.copy() # FIXME for UR5e-2F85 model in push puck env, qpos[0:6] are arm joints, # [6:12] are gripper joints, [12:15] is puck position, [15, 19] is puck orientation qpos[12:15] = np.hstack((pos.copy(), np.array([self.init_puck_z]))) qpos[15:19] = np.array([1, 0, 0, 0]) qvel[12:19] = 0 # all velocity equal to zero self.set_state(qpos, qvel) def reset_model(self): self._reset_hand() if not self.reset_free: self._set_puck_xy(self.sample_puck_xy()) if not (self.puck_space.contains(self.get_puck_pos()[:2])): self._set_puck_xy(self.sample_puck_xy()) goal = self.sample_valid_goal() self.set_goal(goal) self.reset_counter += 1 self.reset_mocap_welds() return self._get_obs() def _reset_hand(self): velocities = self.data.qvel.copy() angles = self.data.qpos.copy() # FIXME refer DOF angles[:6] = self.init_angles[:6] self.set_state(angles.flatten(), velocities.flatten()) # TODO meaning of the range? for _ in range(9): self.data.set_mocap_pos('mocap', self.init_hand_xyz.copy()) self.data.set_mocap_quat('mocap', np.array([1, 0, 1, 0])) def reset(self): ob = self.reset_model() if self.viewer is not None: self.viewer_setup() return ob @property def init_angles(self): # FIXME refer DOF return [ 0, 0, 0, 0, 0, 0, 5.05442647e-04, 6.00496057e-01, 3.06443862e-02, 1, 0, 0, 0 ] """ Multitask functions """ def get_goal(self): return { 'desired_goal': self._state_goal, 'state_desired_goal': self._state_goal, } def set_goal(self, goal): self._state_goal = goal['state_desired_goal'] self._set_goal_marker(self._state_goal) def set_to_goal(self, goal): hand_goal = goal['state_desired_goal'][:3] # FIXME meaning of the range? for _ in range(9): self.data.set_mocap_pos('mocap', hand_goal) self.data.set_mocap_quat('mocap', np.array([1, 0, 1, 0])) self.do_simulation(None, self.frame_skip) puck_goal = goal['state_desired_goal'][3:] self._set_puck_xy(puck_goal) self.sim.forward() def sample_valid_goal(self): goal = self.sample_goal() hand_goal_xy = goal['state_desired_goal'][:2] puck_goal_xy = goal['state_desired_goal'][3:] dist = np.linalg.norm(hand_goal_xy - puck_goal_xy) while dist <= self.puck_radius: goal = self.sample_goal() hand_goal_xy = goal['state_desired_goal'][:2] puck_goal_xy = goal['state_desired_goal'][3:] dist = np.linalg.norm(hand_goal_xy - puck_goal_xy) return goal def sample_goals(self, batch_size): if self.fix_goal: goals = np.repeat(self.fixed_goal.copy()[None], batch_size, 0) else: goals = np.random.uniform( self.goal_low, self.goal_high, size=(batch_size, self.goal_low.size), ) return { 'desired_goal': goals, 'state_desired_goal': goals, } def compute_rewards(self, actions, obs): achieved_goals = obs['state_achieved_goal'] desired_goals = obs['state_desired_goal'] hand_pos = achieved_goals[:, :3] puck_pos = achieved_goals[:, 3:] hand_goals = desired_goals[:, :3] puck_goals = desired_goals[:, 3:] hand_distances = np.linalg.norm(hand_goals - hand_pos, ord=self.norm_order, axis=1) puck_distances = np.linalg.norm(puck_goals - puck_pos, ord=self.norm_order, axis=1) puck_zs = self.init_puck_z * np.ones((desired_goals.shape[0], 1)) touch_distances = np.linalg.norm( hand_pos - np.hstack((puck_pos, puck_zs)), ord=self.norm_order, axis=1, ) if self.reward_type == 'hand_distance': r = -hand_distances elif self.reward_type == 'hand_success': r = -(hand_distances > self.indicator_threshold).astype(float) elif self.reward_type == 'puck_distance': r = -puck_distances elif self.reward_type == 'puck_success': r = -(puck_distances > 
self.indicator_threshold).astype(float) elif self.reward_type == 'hand_and_puck_distance': r = -(puck_distances + hand_distances) elif self.reward_type == 'state_distance': r = -np.linalg.norm( achieved_goals - desired_goals, ord=self.norm_order, axis=1) elif self.reward_type == 'vectorized_state_distance': r = -np.abs(achieved_goals - desired_goals) elif self.reward_type == 'touch_distance': r = -touch_distances elif self.reward_type == 'touch_success': r = -(touch_distances > self.indicator_threshold).astype(float) else: raise NotImplementedError("Invalid/no reward type.") return r def get_diagnostics(self, paths, prefix=''): statistics = OrderedDict() for stat_name in [ 'hand_distance', 'hand_distance_l1', 'hand_distance_l2', 'puck_distance', 'puck_distance_l1', 'puck_distance_l2', 'hand_and_puck_distance', 'hand_and_puck_distance_l1', 'hand_and_puck_distance_l2', 'state_distance', 'state_distance_l1', 'state_distance_l2', 'touch_distance', 'touch_distance_l1', 'touch_distance_l2', 'hand_success', 'puck_success', 'hand_and_puck_success', 'state_success', 'touch_success', ]: stat_name = stat_name stat = get_stat_in_paths(paths, 'env_infos', stat_name) statistics.update( create_stats_ordered_dict( '%s%s' % (prefix, stat_name), stat, always_show_all_stats=True, )) statistics.update( create_stats_ordered_dict( 'Final %s%s' % (prefix, stat_name), [s[-1] for s in stat], always_show_all_stats=True, )) return statistics def get_env_state(self): base_state = super().get_env_state() goal = self._state_goal.copy() return base_state, goal def set_env_state(self, state): base_state, goal = state super().set_env_state(base_state) self._state_goal = goal self._set_goal_marker(goal)
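# --- Added sketch (not from the original source): the batched goal-distance
# computation that compute_rewards above relies on. norm_order selects the
# Minkowski norm; with ord=1 the 'state_distance' reward is the negative L1
# distance per sample. The array values here are made up for illustration.
import numpy as np

achieved_goals = np.array([[0.10, 0.50, 0.02, 0.00, 0.60],
                           [0.20, 0.40, 0.02, 0.10, 0.50]])
desired_goals = np.array([[0.15, 0.60, 0.02, -0.15, 0.60],
                          [0.15, 0.60, 0.02, -0.15, 0.60]])
r = -np.linalg.norm(achieved_goals - desired_goals, ord=1, axis=1)
# r has shape (2,): one reward per (achieved, desired) pair in the batch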
class SawyerPushAndReachXYZEnv(MultitaskEnv, SawyerXYZEnv): def __init__(self, puck_low=(-.4, .2), puck_high=(.4, 1), reward_type='state_distance', norm_order=1, indicator_threshold=0.06, hand_low=(-0.28, 0.3, 0.05), hand_high=(0.28, 0.9, 0.3), fix_goal=False, fixed_goal=(0.15, 0.6, 0.02, -0.15, 0.6), goal_low=(-0.25, 0.3, 0.02, -.2, .4), goal_high=(0.25, 0.875, 0.02, .2, .8), hide_goal_markers=False, init_puck_z=0.02, init_hand_xyz=(0, 0.4, 0.07), reset_free=False, xml_path='sawyer_xyz/sawyer_push_puck.xml', clamp_puck_on_step=False, puck_radius=.07, **kwargs): self.quick_init(locals()) self.model_name = get_asset_full_path(xml_path) MultitaskEnv.__init__(self) SawyerXYZEnv.__init__(self, hand_low=hand_low, hand_high=hand_high, model_name=self.model_name, **kwargs) if puck_low is None: puck_low = self.hand_low[:2] if puck_high is None: puck_high = self.hand_high[:2] puck_low = np.array(puck_low) puck_high = np.array(puck_high) self.puck_low = puck_low self.puck_high = puck_high if goal_low is None: goal_low = np.hstack((self.hand_low, puck_low)) if goal_high is None: goal_high = np.hstack((self.hand_high, puck_high)) self.goal_low = np.array(goal_low) self.goal_high = np.array(goal_high) self.reward_type = reward_type self.norm_order = norm_order self.indicator_threshold = indicator_threshold self.fix_goal = fix_goal self.fixed_goal = np.array(fixed_goal) self._state_goal = None self.hide_goal_markers = hide_goal_markers self.action_space = Box(np.array([-1, -1, -1]), np.array([1, 1, 1]), dtype=np.float32) self.hand_and_puck_space = Box(np.hstack((self.hand_low, puck_low)), np.hstack((self.hand_high, puck_high)), dtype=np.float32) self.hand_space = Box(self.hand_low, self.hand_high, dtype=np.float32) self.observation_space = Dict([ ('observation', self.hand_and_puck_space), ('desired_goal', self.hand_and_puck_space), ('achieved_goal', self.hand_and_puck_space), ('state_observation', self.hand_and_puck_space), ('state_desired_goal', self.hand_and_puck_space), ('state_achieved_goal', self.hand_and_puck_space), ('proprio_observation', self.hand_space), ('proprio_desired_goal', self.hand_space), ('proprio_achieved_goal', self.hand_space), ]) self.init_puck_z = init_puck_z self.init_hand_xyz = np.array(init_hand_xyz) self._set_puck_xy(self.sample_puck_xy()) self.reset_free = reset_free self.reset_counter = 0 self.puck_space = Box(self.puck_low, self.puck_high, dtype=np.float32) self.clamp_puck_on_step = clamp_puck_on_step self.puck_radius = puck_radius self.reset() def viewer_setup(self): self.viewer.cam.trackbodyid = 0 self.viewer.cam.lookat[0] = 0 self.viewer.cam.lookat[1] = 1.0 self.viewer.cam.lookat[2] = 0.5 self.viewer.cam.distance = 0.3 self.viewer.cam.elevation = -45 self.viewer.cam.azimuth = 270 self.viewer.cam.trackbodyid = -1 def step(self, action): self.set_xyz_action(action) u = np.zeros(8) u[7] = 1 self.do_simulation(u) if self.clamp_puck_on_step: curr_puck_pos = self.get_puck_pos()[:2] curr_puck_pos = np.clip(curr_puck_pos, self.puck_space.low, self.puck_space.high) self._set_puck_xy(curr_puck_pos) self._set_goal_marker(self._state_goal) ob = self._get_obs() reward = self.compute_reward(action, ob) info = self._get_info() done = False return ob, reward, done, info def _get_obs(self): e = self.get_endeff_pos() b = self.get_puck_pos()[:2] flat_obs = np.concatenate((e, b)) return dict( observation=flat_obs, desired_goal=self._state_goal, achieved_goal=flat_obs, state_observation=flat_obs, state_desired_goal=self._state_goal, state_achieved_goal=flat_obs, 
proprio_observation=flat_obs[:3], proprio_desired_goal=self._state_goal[:3], proprio_achieved_goal=flat_obs[:3], ) def _get_info(self): hand_goal = self._state_goal[:3] puck_goal = self._state_goal[3:] # hand distance hand_diff = hand_goal - self.get_endeff_pos() hand_distance = np.linalg.norm(hand_diff, ord=self.norm_order) hand_distance_l1 = np.linalg.norm(hand_diff, 1) hand_distance_l2 = np.linalg.norm(hand_diff, 2) # puck distance puck_diff = puck_goal - self.get_puck_pos()[:2] puck_distance = np.linalg.norm(puck_diff, ord=self.norm_order) puck_distance_l1 = np.linalg.norm(puck_diff, 1) puck_distance_l2 = np.linalg.norm(puck_diff, 2) # touch distance touch_diff = self.get_endeff_pos() - self.get_puck_pos() touch_distance = np.linalg.norm(touch_diff, ord=self.norm_order) touch_distance_l1 = np.linalg.norm(touch_diff, ord=1) touch_distance_l2 = np.linalg.norm(touch_diff, ord=2) # state distance state_diff = np.hstack((self.get_endeff_pos(), self.get_puck_pos()[:2])) - self._state_goal state_distance = np.linalg.norm(state_diff, ord=self.norm_order) state_distance_l1 = np.linalg.norm(state_diff, ord=1) state_distance_l2 = np.linalg.norm(state_diff, ord=2) return dict( hand_distance=hand_distance, hand_distance_l1=hand_distance_l1, hand_distance_l2=hand_distance_l2, puck_distance=puck_distance, puck_distance_l1=puck_distance_l1, puck_distance_l2=puck_distance_l2, hand_and_puck_distance=hand_distance + puck_distance, hand_and_puck_distance_l1=hand_distance_l1 + puck_distance_l1, hand_and_puck_distance_l2=hand_distance_l2 + puck_distance_l2, touch_distance=touch_distance, touch_distance_l1=touch_distance_l1, touch_distance_l2=touch_distance_l2, state_distance=state_distance, state_distance_l1=state_distance_l1, state_distance_l2=state_distance_l2, hand_success=float(hand_distance < self.indicator_threshold), puck_success=float(puck_distance < self.indicator_threshold), hand_and_puck_success=float( hand_distance + puck_distance < self.indicator_threshold), touch_success=float(touch_distance < self.indicator_threshold), state_success=float(state_distance < self.indicator_threshold), ) def get_puck_pos(self): return self.data.get_body_xpos('puck').copy() def sample_puck_xy(self): return np.array([0, 0.6]) def _set_goal_marker(self, goal): """ This should be use ONLY for visualization. Use self._state_goal for logging, learning, etc. """ self.data.site_xpos[self.model.site_name2id('hand-goal-site')] = ( goal[:3]) self.data.site_xpos[self.model.site_name2id('puck-goal-site')][:2] = ( goal[3:]) if self.hide_goal_markers: self.data.site_xpos[self.model.site_name2id('hand-goal-site'), 2] = (-1000) self.data.site_xpos[self.model.site_name2id('puck-goal-site'), 2] = (-1000) def _set_puck_xy(self, pos): """ WARNING: this resets the sites (because set_state resets sights do). 
""" qpos = self.data.qpos.flat.copy() qvel = self.data.qvel.flat.copy() qpos[8:11] = np.hstack((pos.copy(), np.array([self.init_puck_z]))) qpos[11:15] = np.array([1, 0, 0, 0]) qvel[8:15] = 0 self.set_state(qpos, qvel) def reset_model(self): self._reset_hand() if not self.reset_free: self._set_puck_xy(self.sample_puck_xy()) if not (self.puck_space.contains(self.get_puck_pos()[:2])): self._set_puck_xy(self.sample_puck_xy()) goal = self.sample_valid_goal() self.set_goal(goal) self.reset_counter += 1 self.reset_mocap_welds() return self._get_obs() def _reset_hand(self): velocities = self.data.qvel.copy() angles = self.data.qpos.copy() angles[:7] = self.init_angles[:7] self.set_state(angles.flatten(), velocities.flatten()) for _ in range(10): self.data.set_mocap_pos('mocap', self.init_hand_xyz.copy()) self.data.set_mocap_quat('mocap', np.array([1, 0, 1, 0])) def reset(self): ob = self.reset_model() if self.viewer is not None: self.viewer_setup() return ob @property def init_angles(self): return [ 1.7244448, -0.92036369, 0.10234232, 2.11178144, 2.97668632, -0.38664629, 0.54065733, 5.05442647e-04, 6.00496057e-01, 3.06443862e-02, 1, 0, 0, 0 ] """ Multitask functions """ def get_goal(self): return { 'desired_goal': self._state_goal, 'state_desired_goal': self._state_goal, } def set_goal(self, goal): self._state_goal = goal['state_desired_goal'] self._set_goal_marker(self._state_goal) def set_to_goal(self, goal): hand_goal = goal['state_desired_goal'][:3] for _ in range(10): self.data.set_mocap_pos('mocap', hand_goal) self.data.set_mocap_quat('mocap', np.array([1, 0, 1, 0])) self.do_simulation(None, self.frame_skip) puck_goal = goal['state_desired_goal'][3:] self._set_puck_xy(puck_goal) self.sim.forward() def sample_valid_goal(self): goal = self.sample_goal() hand_goal_xy = goal['state_desired_goal'][:2] puck_goal_xy = goal['state_desired_goal'][3:] dist = np.linalg.norm(hand_goal_xy - puck_goal_xy) while (dist <= self.puck_radius and not self.fix_goal): goal = self.sample_goal() hand_goal_xy = goal['state_desired_goal'][:2] puck_goal_xy = goal['state_desired_goal'][3:] dist = np.linalg.norm(hand_goal_xy - puck_goal_xy) return goal def sample_goals(self, batch_size): if self.fix_goal: goals = np.repeat(self.fixed_goal.copy()[None], batch_size, 0) else: goals = np.random.uniform( self.goal_low, self.goal_high, size=(batch_size, self.goal_low.size), ) return { 'desired_goal': goals, 'state_desired_goal': goals, } def compute_rewards(self, actions, obs): achieved_goals = obs['state_achieved_goal'] desired_goals = obs['state_desired_goal'] hand_pos = achieved_goals[:, :3] puck_pos = achieved_goals[:, 3:] hand_goals = desired_goals[:, :3] puck_goals = desired_goals[:, 3:] hand_distances = np.linalg.norm(hand_goals - hand_pos, ord=self.norm_order, axis=1) puck_distances = np.linalg.norm(puck_goals - puck_pos, ord=self.norm_order, axis=1) puck_zs = self.init_puck_z * np.ones((desired_goals.shape[0], 1)) touch_distances = np.linalg.norm( hand_pos - np.hstack((puck_pos, puck_zs)), ord=self.norm_order, axis=1, ) if self.reward_type == 'hand_distance': r = -hand_distances elif self.reward_type == 'hand_success': r = -(hand_distances > self.indicator_threshold).astype(float) elif self.reward_type == 'hand_success_positive': r = (hand_distances < self.indicator_threshold).astype(float) elif self.reward_type == 'puck_distance': r = -puck_distances elif self.reward_type == 'puck_success': r = -(puck_distances > self.indicator_threshold).astype(float) elif self.reward_type == 'puck_success_positive': r = 
(puck_distances < self.indicator_threshold).astype(float) elif self.reward_type == 'hand_and_puck_distance': r = -(puck_distances + hand_distances) elif self.reward_type == 'state_distance': r = -np.linalg.norm( achieved_goals - desired_goals, ord=self.norm_order, axis=1) elif self.reward_type == 'vectorized_state_distance': r = -np.abs(achieved_goals - desired_goals) elif self.reward_type == 'touch_distance': r = -touch_distances elif self.reward_type == 'touch_success': r = -(touch_distances > self.indicator_threshold).astype(float) else: raise NotImplementedError("Invalid/no reward type.") return r def get_diagnostics(self, paths, prefix=''): statistics = OrderedDict() for stat_name in [ 'hand_distance', 'hand_distance_l1', 'hand_distance_l2', 'puck_distance', 'puck_distance_l1', 'puck_distance_l2', 'hand_and_puck_distance', 'hand_and_puck_distance_l1', 'hand_and_puck_distance_l2', 'state_distance', 'state_distance_l1', 'state_distance_l2', 'touch_distance', 'touch_distance_l1', 'touch_distance_l2', 'hand_success', 'puck_success', 'hand_and_puck_success', 'state_success', 'touch_success', ]: stat_name = stat_name stat = get_stat_in_paths(paths, 'env_infos', stat_name) statistics.update( create_stats_ordered_dict( '%s%s' % (prefix, stat_name), stat, always_show_all_stats=True, )) statistics.update( create_stats_ordered_dict( 'Final %s%s' % (prefix, stat_name), [s[-1] for s in stat], always_show_all_stats=True, )) return statistics def get_env_state(self): base_state = super().get_env_state() goal = self._state_goal.copy() return base_state, goal def set_env_state(self, state): base_state, goal = state super().set_env_state(base_state) self._state_goal = goal self._set_goal_marker(goal)
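# --- Added sketch (assumptions noted, not in the original source): the rejection
# sampling idea behind sample_valid_goal above -- draw goals until the hand and
# puck targets are farther apart than puck_radius, so the combined goal is
# physically reachable. Goal layout assumed: [hand_x, hand_y, hand_z, puck_x, puck_y].
import numpy as np

def sample_valid_goal_xy(goal_low, goal_high, puck_radius):
    while True:
        g = np.random.uniform(goal_low, goal_high)
        if np.linalg.norm(g[:2] - g[3:]) > puck_radius:
            return g

goal = sample_valid_goal_xy(np.array([-0.25, 0.3, 0.02, -0.2, 0.4]),
                            np.array([0.25, 0.875, 0.02, 0.2, 0.8]),
                            puck_radius=0.07)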
class SawyerPushAndReachXYZEnv(MultitaskEnv, SawyerXYZEnv): def __init__( self, puck_low=(-.35, .25), puck_high=(.35, .95), norm_order=1, indicator_threshold=0.06, touch_threshold=0.1, # I just chose this number after doing a few runs and looking at a histogram hand_low=(-0.28, 0.3, 0.05), hand_high=(0.28, 0.9, 0.3), fix_goal=True, fixed_goal=(0.15, 0.6, 0.02, -0.15, 0.6), goal_low=(-0.25, 0.3, 0.02, -0.2, .4), goal_high=(0.25, 0.875, 0.02, .2, .8), init_puck_z=0.035, init_hand_xyz=(0, 0.4, 0.07), reset_free=False, xml_path='sawyer_xyz/sawyer_push_puck.xml', clamp_puck_on_step=False, puck_radius=.07, **kwargs ): self.quick_init(locals()) self.model_name = get_asset_full_path(xml_path) self.goal_type = kwargs.pop('goal_type', 'puck') self.dense_reward = kwargs.pop('dense_reward', False) self.indicator_threshold = kwargs.pop('goal_tolerance', indicator_threshold) self.fixed_goal = np.array(kwargs.pop('goal', fixed_goal)) self.task_agnostic = kwargs.pop('task_agnostic', False) self.init_puck_xy = np.array(kwargs.pop('init_puck_xy', [0, 0.6])) self.init_hand_xyz = np.array(kwargs.pop('init_hand_xyz', init_hand_xyz)) MultitaskEnv.__init__(self) SawyerXYZEnv.__init__( self, hand_low=hand_low, hand_high=hand_high, model_name=self.model_name, **kwargs ) if puck_low is None: puck_low = self.hand_low[:2] if puck_high is None: puck_high = self.hand_high[:2] self.puck_low = np.array(puck_low) self.puck_high = np.array(puck_high) if goal_low is None: goal_low = np.hstack((self.hand_low, puck_low)) if goal_high is None: goal_high = np.hstack((self.hand_high, puck_high)) self.goal_low = np.array(goal_low) self.goal_high = np.array(goal_high) self.norm_order = norm_order self.touch_threshold = touch_threshold self.fix_goal = fix_goal self._state_goal = None self.hide_goal_markers = self.task_agnostic self.action_space = Box(np.array([-1, -1, -1]), np.array([1, 1, 1]), dtype=np.float32) self.hand_and_puck_space = Box( np.hstack((self.hand_low, puck_low)), np.hstack((self.hand_high, puck_high)), dtype=np.float32 ) self.hand_space = Box(self.hand_low, self.hand_high, dtype=np.float32) self.observation_space = Dict([ ('observation', self.hand_and_puck_space), ('desired_goal', self.hand_and_puck_space), ('achieved_goal', self.hand_and_puck_space), ('state_observation', self.hand_and_puck_space), ('state_desired_goal', self.hand_and_puck_space), ('state_achieved_goal', self.hand_and_puck_space), ('proprio_observation', self.hand_space), ('proprio_desired_goal', self.hand_space), ('proprio_achieved_goal', self.hand_space), ]) self.init_puck_z = init_puck_z self._set_puck_xy(self.sample_puck_xy()) self.reset_free = reset_free self.reset_counter = 0 self.puck_space = Box(self.puck_low, self.puck_high, dtype=np.float32) self.clamp_puck_on_step = clamp_puck_on_step self.puck_radius = puck_radius self.reset() def viewer_setup(self): self.viewer.cam.trackbodyid = 0 self.viewer.cam.lookat[0] = 0 self.viewer.cam.lookat[1] = 1.0 self.viewer.cam.lookat[2] = 0.5 self.viewer.cam.distance = 0.3 self.viewer.cam.elevation = -45 self.viewer.cam.azimuth = 270 self.viewer.cam.trackbodyid = -1 def step(self, action): self.set_xyz_action(action) u = np.zeros(8) u[7] = 1 self.do_simulation(u) if self.clamp_puck_on_step: curr_puck_pos = self.get_puck_pos()[:2] curr_puck_pos = np.clip(curr_puck_pos, self.puck_space.low, self.puck_space.high) self._set_puck_xy(curr_puck_pos) self._set_goal_marker(self._state_goal) next_state = self._get_obs()['state_observation'] # reward = self.compute_rewards(action, ob) if not self.task_agnostic 
else 0.
        # info = self._get_info()
        # done = self.is_goal_state(ob['observation']) if not self.task_agnostic else False
        return next_state

    def _get_obs(self):
        e = self.get_endeff_pos()
        b = self.get_puck_pos()[:2]
        flat_obs = np.concatenate((e, b))
        return dict(
            observation=flat_obs,
            desired_goal=self._state_goal,
            achieved_goal=flat_obs,
            state_observation=flat_obs,
            state_desired_goal=self._state_goal,
            state_achieved_goal=flat_obs,
            proprio_observation=flat_obs[:3],
            proprio_desired_goal=self._state_goal[:3],
            proprio_achieved_goal=flat_obs[:3],
        )

    def _get_info(self, state=None):
        hand_goal = self._state_goal[:3]
        puck_goal = self._state_goal[3:]
        endeff_pos = self.get_endeff_pos()
        puck_pos = self.get_puck_pos()
        if state is not None:
            endeff_pos = state[:3]
            puck_pos[:2] = state[3:]
        # hand distance
        hand_diff = hand_goal - endeff_pos
        hand_distance = np.linalg.norm(hand_diff, ord=self.norm_order)
        hand_distance_l1 = np.linalg.norm(hand_diff, 1)
        hand_distance_l2 = np.linalg.norm(hand_diff, 2)
        # puck distance
        puck_diff = puck_goal - puck_pos[:2]
        puck_distance = np.linalg.norm(puck_diff, ord=self.norm_order)
        puck_distance_l1 = np.linalg.norm(puck_diff, 1)
        puck_distance_l2 = np.linalg.norm(puck_diff, 2)
        # touch distance
        touch_diff = endeff_pos - puck_pos
        touch_distance = np.linalg.norm(touch_diff, ord=self.norm_order)
        touch_distance_l1 = np.linalg.norm(touch_diff, ord=1)
        touch_distance_l2 = np.linalg.norm(touch_diff, ord=2)
        # state distance
        state_diff = np.hstack((endeff_pos, puck_pos[:2])) - self._state_goal
        state_distance = np.linalg.norm(state_diff, ord=self.norm_order)
        state_distance_l1 = np.linalg.norm(state_diff, ord=1)
        state_distance_l2 = np.linalg.norm(state_diff, ord=2)
        return dict(
            hand_distance=hand_distance,
            hand_distance_l1=hand_distance_l1,
            hand_distance_l2=hand_distance_l2,
            puck_distance=puck_distance,
            puck_distance_l1=puck_distance_l1,
            puck_distance_l2=puck_distance_l2,
            hand_and_puck_distance=hand_distance + puck_distance,
            hand_and_puck_distance_l1=hand_distance_l1 + puck_distance_l1,
            hand_and_puck_distance_l2=hand_distance_l2 + puck_distance_l2,
            touch_distance=touch_distance,
            touch_distance_l1=touch_distance_l1,
            touch_distance_l2=touch_distance_l2,
            state_distance=state_distance,
            state_distance_l1=state_distance_l1,
            state_distance_l2=state_distance_l2,
            hand_success=float(hand_distance < self.indicator_threshold),
            puck_success=float(puck_distance < self.indicator_threshold),
            hand_and_puck_success=float(
                hand_distance + puck_distance < self.indicator_threshold),
            touch_success=float(touch_distance < self.indicator_threshold),
            state_success=float(state_distance < self.indicator_threshold),
        )

    def distance_from_goal(self, state=None, goal=None):
        if state is None:
            endeff_pos, puck_pos = self.get_endeff_pos(), self.get_puck_pos()
        else:
            endeff_pos, puck_pos = state[:3], state[3:]
        if goal is None:
            hand_goal, puck_goal = self._state_goal[:3], self._state_goal[3:]
        else:
            hand_goal, puck_goal = goal[:3], goal[3:]
        distances = self._get_info(state)
        dict_key = self.goal_type + '_distance'
        try:
            return distances[dict_key]
        except KeyError:
            raise NotImplementedError("Invalid/no reward type.")

    def is_goal_state(self, state):
        """ Only used by deep skill chaining.
Args: state (np.ndarray): state array [endeff_x, endeff_x, endeff_x, puck_x, puck_y] Returns: True if is goal state, false otherwise """ dist = self.distance_from_goal(state=state) tolerance = self.indicator_threshold if self.goal_type != 'touch' else self.touch_threshold return dist < tolerance def get_puck_pos(self): return self.data.get_body_xpos('puck').copy() def sample_puck_xy(self): return self.init_puck_xy def _set_goal_marker(self, goal): """ This should be use ONLY for visualization. Use self._state_goal for logging, learning, etc. """ self.data.site_xpos[self.model.site_name2id('hand-goal-site')] = ( goal[:3] ) self.data.site_xpos[self.model.site_name2id('puck-goal-site')][:2] = ( goal[3:] ) # if self.hide_goal_markers or self.goal_type == 'touch' or self.goal_type == 'puck': self.data.site_xpos[self.model.site_name2id('hand-goal-site'), 2] = ( -1000 ) if self.hide_goal_markers or self.goal_type == 'touch' or self.goal_type == 'hand': self.data.site_xpos[self.model.site_name2id('puck-goal-site'), 2] = ( -1000 ) def _set_puck_xy(self, pos): """ WARNING: this resets the sites (because set_state resets sights do). """ qpos = self.data.qpos.flat.copy() qvel = self.data.qvel.flat.copy() qpos[8:11] = np.hstack((pos.copy(), np.array([self.init_puck_z]))) qpos[11:15] = np.array([1, 0, 0, 0]) qvel[8:15] = 0 self.set_state(qpos, qvel) def reset_model(self, start_pos=None, goal_pos=None): # Sets reset vars but doesn't actually reset hand position self._reset_hand(start_pos) if goal_pos is None: goal_pos = self.sample_valid_goal()['state_desired_goal'] self.set_goal(goal_pos) # Actually moves the hand's position. Could knock the puck out of the way, # so needs to be done BEFORE resetting puck position. self.step([0, 0]) if not self.reset_free: if start_pos is None: puck_xy = self.sample_puck_xy() else: puck_xy = start_pos[3:] self._set_puck_xy(puck_xy) if not (self.puck_space.contains(self.get_puck_pos()[:2])): self._set_puck_xy(self.sample_puck_xy()) self.reset_counter += 1 self.reset_mocap_welds() return self._get_obs() def _reset_hand(self, start_pos=None): velocities = self.data.qvel.copy() angles = self.data.qpos.copy() angles[:7] = self.init_angles[:7] self.set_state(angles.flatten(), velocities.flatten()) if start_pos is None: hand_pos = self.init_hand_xyz.copy() else: hand_pos = start_pos[:3] for _ in range(10): self.data.set_mocap_pos('mocap', hand_pos) self.data.set_mocap_quat('mocap', np.array([1, 0, 1, 0])) def reset(self, start_pos=None, goal_puck_pos=None): ob = self.reset_model(start_pos, goal_puck_pos) if self.viewer is not None: self.viewer_setup() return ob['state_observation'] @property def init_angles(self): return [1.7244448, -0.92036369, 0.10234232, 2.11178144, 2.97668632, -0.38664629, 0.54065733, 5.05442647e-04, 6.00496057e-01, 3.06443862e-02, 1, 0, 0, 0] """ Multitask functions """ def get_goal(self): return { 'desired_goal': self._state_goal, 'state_desired_goal': self._state_goal, } def set_goal(self, goal): self._state_goal = goal self._set_goal_marker(self._state_goal) def set_to_goal(self, goal): hand_goal = goal['state_desired_goal'][:3] for _ in range(10): self.data.set_mocap_pos('mocap', hand_goal) self.data.set_mocap_quat('mocap', np.array([1, 0, 1, 0])) self.do_simulation(None, self.frame_skip) puck_goal = goal['state_desired_goal'][3:] self._set_puck_xy(puck_goal) self.sim.forward() def sample_valid_goal(self): goal = self.sample_goal() hand_goal_xy = goal['state_desired_goal'][:2] puck_goal_xy = goal['state_desired_goal'][3:] dist = 
np.linalg.norm(hand_goal_xy-puck_goal_xy) while dist <= self.puck_radius: goal = self.sample_goal() hand_goal_xy = goal['state_desired_goal'][:2] puck_goal_xy = goal['state_desired_goal'][3:] dist = np.linalg.norm(hand_goal_xy - puck_goal_xy) return goal def sample_goals(self, batch_size): if self.fix_goal: goals = np.repeat( self.fixed_goal.copy()[None], batch_size, 0 ) else: goals = np.random.uniform( self.goal_low, self.goal_high, size=(batch_size, self.goal_low.size), ) return { 'desired_goal': goals, 'state_desired_goal': goals, } def compute_rewards(self, actions, obs): state = obs['observation'] dist = self.distance_from_goal(state) tolerance = self.indicator_threshold if self.goal_type != 'touch' else self.touch_threshold if self.dense_reward: return -dist else: return -(dist > tolerance).astype(float) def get_diagnostics(self, paths, prefix=''): statistics = OrderedDict() for stat_name in [ 'hand_distance', 'hand_distance_l1', 'hand_distance_l2', 'puck_distance', 'puck_distance_l1', 'puck_distance_l2', 'hand_and_puck_distance', 'hand_and_puck_distance_l1', 'hand_and_puck_distance_l2', 'state_distance', 'state_distance_l1', 'state_distance_l2', 'touch_distance', 'touch_distance_l1', 'touch_distance_l2', 'hand_success', 'puck_success', 'hand_and_puck_success', 'state_success', 'touch_success', ]: stat_name = stat_name stat = get_stat_in_paths(paths, 'env_infos', stat_name) statistics.update(create_stats_ordered_dict( '%s%s' % (prefix, stat_name), stat, always_show_all_stats=True, )) statistics.update(create_stats_ordered_dict( 'Final %s%s' % (prefix, stat_name), [s[-1] for s in stat], always_show_all_stats=True, )) return statistics def get_env_state(self): base_state = super().get_env_state() goal = self._state_goal.copy() return base_state, goal def set_env_state(self, state): base_state, goal = state super().set_env_state(base_state) self._state_goal = goal self._set_goal_marker(goal)
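# --- Added sketch (not from the original source): the sparse/dense reward switch
# used by compute_rewards in the class above. dist is the goal-type distance and
# tolerance is indicator_threshold (or touch_threshold for 'touch' goals).
def goal_reward(dist, tolerance, dense_reward):
    if dense_reward:
        return -dist
    return -float(dist > tolerance)

assert goal_reward(0.03, 0.06, dense_reward=False) == 0.0   # inside tolerance
assert goal_reward(0.10, 0.06, dense_reward=False) == -1.0  # outside tolerance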
class CartPoleContinuousEnv(CartPoleEnv): def __init__(self): super().__init__() self.action_space = Box(low=-1., high=1., shape=(1, ), dtype=np.float32) def step(self, action): err_msg = "%r (%s) invalid" % (action, type(action)) assert self.action_space.contains(action), err_msg x, x_dot, theta, theta_dot = self.state # === # This assignment overwrites the discrete behavior of the cartpole. # The action is the scale of the force magnitude itself force = action[0] * self.force_mag # === costheta = math.cos(theta) sintheta = math.sin(theta) # For the interested reader: # https://coneural.org/florian/papers/05_cart_pole.pdf temp = (force + self.polemass_length * theta_dot**2 * sintheta) / self.total_mass thetaacc = (self.gravity * sintheta - costheta * temp) / ( self.length * (4.0 / 3.0 - self.masspole * costheta**2 / self.total_mass)) xacc = temp - self.polemass_length * thetaacc * costheta / self.total_mass if self.kinematics_integrator == 'euler': x = x + self.tau * x_dot x_dot = x_dot + self.tau * xacc theta = theta + self.tau * theta_dot theta_dot = theta_dot + self.tau * thetaacc else: # semi-implicit euler x_dot = x_dot + self.tau * xacc x = x + self.tau * x_dot theta_dot = theta_dot + self.tau * thetaacc theta = theta + self.tau * theta_dot self.state = (x, x_dot, theta, theta_dot) done = bool(x < -self.x_threshold or x > self.x_threshold or theta < -self.theta_threshold_radians or theta > self.theta_threshold_radians) if not done: reward = 1.0 elif self.steps_beyond_done is None: # Pole just fell! self.steps_beyond_done = 0 reward = 1.0 else: if self.steps_beyond_done == 0: logger.warn( "You are calling 'step()' even though this " "environment has already returned done = True. You " "should always call 'reset()' once you receive 'done = " "True' -- any further steps are undefined behavior.") self.steps_beyond_done += 1 reward = 0.0 return np.array(self.state), reward, done, {}
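# --- Added usage sketch (not part of the original source): a random rollout
# through CartPoleContinuousEnv defined above. Actions are float32 arrays of
# shape (1,) in [-1, 1]; the scalar scales force_mag. Assumes a gym version with
# the (obs, reward, done, info) step API used throughout this file.
env = CartPoleContinuousEnv()
obs = env.reset()
done = False
while not done:
    obs, reward, done, info = env.step(env.action_space.sample())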
def update(self, goals, binary_competence, continuous_competence=None):
    if len(goals) > 0:
        new_split = False
        all_order = None
        regions = [None] * len(goals)
        for i, goal in enumerate(goals):
            for j, rb in enumerate(self.region_bounds):
                if rb.contains(goal):
                    regions[i] = j
                    break
        if self.continuous_competence:
            c_or_cps = continuous_competence
        else:
            c_or_cps = binary_competence
        # add new outcomes and goals to regions
        for reg, c_or_cp, goal in zip(regions, c_or_cps, goals):
            self.regions[reg][0].append(c_or_cp)
            self.regions[reg][1].append(goal)
        # check if need to split
        ind_split = []
        new_bounds = []
        new_sub_regions = []
        for reg in range(self.nb_regions):
            if len(self.regions[reg][0]) > self.maxlen:
                # try nb_split_attempts splits
                best_split_score = 0
                best_abs_interest_diff = 0
                best_bounds = None
                best_sub_regions = None
                is_split = False
                for i in range(self.nb_split_attempts):
                    sub_reg1 = [deque(), deque()]
                    sub_reg2 = [deque(), deque()]
                    # repeat until the two sub regions contain at least 1/4 of the mother region
                    while len(sub_reg1[0]) < self.maxlen / 4 or len(sub_reg2[0]) < self.maxlen / 4:
                        # decide on dimension
                        dim = np.random.choice(range(self.nb_dims))
                        threshold = self.region_bounds[reg].sample()[dim]
                        bounds1 = Box(self.region_bounds[reg].low,
                                      self.region_bounds[reg].high,
                                      dtype=np.float32)
                        bounds1.high[dim] = threshold
                        bounds2 = Box(self.region_bounds[reg].low,
                                      self.region_bounds[reg].high,
                                      dtype=np.float32)
                        bounds2.low[dim] = threshold
                        bounds = [bounds1, bounds2]
                        valid_bounds = True
                        if np.any(bounds1.high - bounds1.low < self.init_size / 5):
                            valid_bounds = False
                        if np.any(bounds2.high - bounds2.low < self.init_size / 5):
                            valid_bounds = False
                        # perform split in sub regions
                        sub_reg1 = [deque(), deque()]
                        sub_reg2 = [deque(), deque()]
                        for i, goal in enumerate(self.regions[reg][1]):
                            if bounds1.contains(goal):
                                sub_reg1[1].append(goal)
                                sub_reg1[0].append(self.regions[reg][0][i])
                            else:
                                sub_reg2[1].append(goal)
                                sub_reg2[0].append(self.regions[reg][0][i])
                        sub_regions = [sub_reg1, sub_reg2]
                    # compute interest
                    interest = np.zeros([2])
                    for i in range(2):
                        if self.continuous_competence:
                            cp_window = min(len(sub_regions[i][0]), self.window_cp)
                            cp = np.abs(np.array(sub_regions[i][0])[-cp_window:].mean())
                        else:
                            cp_window = min(len(sub_regions[i][0]) // 2, self.window_cp)
                            cp = np.array(sub_regions[i][0])[-2 * cp_window:-cp_window].mean() - np.array(
                                sub_regions[i][0])[-cp_window:].mean()
                        interest[i] = np.abs(cp)
                    # compute score from the sub-region sizes and the interest gap
                    split_score = len(sub_reg1[0]) * len(sub_reg2[0]) * np.abs(
                        interest[0] - interest[1])
                    if split_score >= best_split_score and np.abs(
                            interest[0] - interest[1]) >= self.max_difference / 2 and valid_bounds:
                        best_abs_interest_diff = np.abs(interest[0] - interest[1])
                        print(interest)
                        best_split_score = split_score
                        best_sub_regions = sub_regions
                        best_bounds = bounds
                        is_split = True
                        if interest[0] >= interest[1]:
                            order = [1, -1]
                        else:
                            order = [-1, 1]
                if is_split:
                    ind_split.append(reg)
                    if best_abs_interest_diff > self.max_difference:
                        self.max_difference = best_abs_interest_diff
                else:
                    self.regions[reg][0] = deque(
                        np.array(self.regions[reg][0])[-int(3 * len(self.regions[reg][0]) / 4):],
                        maxlen=self.maxlen + 1)
                    self.regions[reg][1] = deque(
                        np.array(self.regions[reg][1])[-int(3 * len(self.regions[reg][1]) / 4):],
                        maxlen=self.maxlen + 1)
                new_bounds.append(best_bounds)
                new_sub_regions.append(best_sub_regions)
        # implement splits
        for i, reg in enumerate(ind_split):
            all_order = [0] * self.nb_regions
            all_order.pop(reg)
            all_order.insert(reg, order[0])
            all_order.insert(reg, order[1])
            new_split = True
            self.region_bounds.pop(reg)
            self.region_bounds.insert(reg, new_bounds[i][0])
            self.region_bounds.insert(reg, new_bounds[i][1])
            self.regions.pop(reg)
            self.regions.insert(reg, new_sub_regions[i][0])
            self.regions.insert(reg, new_sub_regions[i][1])
            self.interest.pop(reg)
            self.interest.insert(reg, 0)
            self.interest.insert(reg, 0)
            self.probas.pop(reg)
            self.probas.insert(reg, 0)
            self.probas.insert(reg, 0)
        # recompute interest
        for i in range(self.nb_regions):
            if len(self.regions[i][0]) > 10:
                if self.continuous_competence:
                    cp_window = min(len(self.regions[i][0]), self.window_cp)
                    cp = np.abs(np.array(self.regions[i][0])[-cp_window:].mean())
                else:
                    cp_window = min(len(self.regions[i][0]) // 2, self.window_cp)
                    cp = np.array(self.regions[i][0])[-2 * cp_window:-cp_window].mean() - np.array(
                        self.regions[i][0])[-cp_window:].mean()
            else:
                cp = 0
            self.interest[i] = np.abs(cp)
        exp_int = np.exp(self.temperature * np.array(self.interest))
        probas = exp_int / exp_int.sum()
        self.probas = probas.tolist()
        assert len(self.probas) == len(self.regions)
        return new_split, all_order
    else:
        return False, None
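# --- Added sketch (not in the original source): the competence-progress signal
# computed in the update method above. In the binary-competence branch, progress
# is the difference between the mean outcome over the older half-window and the
# newer half-window; a region's interest is its absolute value.
import numpy as np

outcomes = np.array([0, 0, 1, 0, 1, 1, 1, 1], dtype=float)  # toy success history
window_cp = 4
cp_window = min(len(outcomes) // 2, window_cp)
cp = outcomes[-2 * cp_window:-cp_window].mean() - outcomes[-cp_window:].mean()
interest = np.abs(cp)  # large when competence is changing fast in either direction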
class DiscreteQuadEnv(gym.Env): """ Gym environment for our custom system (Crazyflie quad). """ def __init__(self): gym.Env.__init__(self) self.dyn_nn = torch.load( "_models/temp/2018-12-05--15-51-52.2_train-acc-old-50-ensemble_stack3_.pth" ) self.dyn_nn.eval() data_file = open( "_models/temp/2018-12-05--15-57-55.6_train-acc-old-50-ensemble_stack3_--data.pkl", 'rb') self.n = 3 self.state = None df = pickle.load(data_file) self.equilibrium = np.array([31687.1, 37954.7, 33384.8, 36220.11]) self.action_dict = self.init_action_dict() print(df.columns.values) r = 500 print("State Before") print(df.iloc[r, 12:12 + 9].values) print("State Change") print(df.iloc[r, :9].values) print("State After") print(df.iloc[r + 1, 12:12 + 9].values) print("Is this the same") print(df.iloc[r, :9].values + df.iloc[r, 12:12 + 9].values) print("Compare to State Before") print(df.iloc[r + 1, 12 + 9:12 + 9 + 9].values) s_stacked = df.iloc[2, 12:12 + 9 * self.n].values a_stacked = df.iloc[2, 12 + 9 * self.n:12 + 9 * self.n + 4 * self.n].values print(a_stacked) print("\n Prediction") print(self.dyn_nn.predict(s_stacked, a_stacked)) v_bats = df.iloc[:, -1] print(min(v_bats), max(v_bats)) self.dyn_data = df all_states = df.iloc[:, 12:12 + 9].values all_actions = df.iloc[:, 12 + 9 * self.n:12 + 9 * self.n + 4].values min_state_bounds = [ min(all_states[:, i]) for i in range(len(all_states[0, :])) ] max_state_bounds = [ max(all_states[:, i]) for i in range(len(all_states[0, :])) ] min_action_bounds = [ min(all_actions[:, i]) for i in range(len(all_actions[0, :])) ] max_action_bounds = [ max(all_actions[:, i]) for i in range(len(all_actions[0, :])) ] low_state_s = np.tile(min_state_bounds, self.n) low_state_a = np.tile(min_action_bounds, self.n - 1) high_state_s = np.tile(max_state_bounds, self.n) high_state_a = np.tile(max_action_bounds, self.n - 1) # self.action_space = Box(low=np.array(min_action_bounds), high=np.array(max_action_bounds)) self.action_space = gym.spaces.Discrete(16) # 2^4 actions self.observation_space = Box(low=np.append(low_state_s, low_state_a), \ high=np.append(high_state_s, high_state_a)) def state_failed(self, s): """ Check whether a state has failed, so the trajectory should terminate. This happens when either the roll or pitch angle > 30 Returns: failure flag [bool] """ if abs(s[3]) > 30.0 or abs(s[4]) > 30.0: return True return False def init_action_dict(self): motor_discretized = [[-5000, 0, 5000], [-5000, 0, 5000], [-5000, 0, 5000], [-5000, 0, 5000]] action_dict = dict() for ac, action in enumerate(list( itertools.product(*motor_discretized))): action_dict[ac] = np.asarray(action) + self.equilibrium print(action_dict) return action_dict def get_reward_state(self, s_next): """ Returns reward of being in a certain state. """ pitch = s_next[3] roll = s_next[4] if self.state_failed(s_next): # return -1 * 1000 return 0 loss = pitch**2 + roll**2 loss = np.sqrt(loss) reward = 1800 - loss # This should be positive. TODO: Double check reward = 30 - loss # reward = 1 return reward def sample_random_state(self): """ Samples random state from previous logs. Ensures that the sampled state is not in a failed state. 
""" state_failed = True acceptable = False while not acceptable: # random_state = self.observation_space.sample() row_idx = np.random.randint(self.dyn_data.shape[0]) random_state = self.dyn_data.iloc[row_idx, 12:12 + 9 * self.n].values random_state_s = self.dyn_data.iloc[row_idx, 12:12 + 9 * self.n].values random_state_a = self.dyn_data.iloc[row_idx, 12 + 9 * self.n + 4:12 + 9 * self.n + 4 + 4 * (self.n - 1)].values random_state = np.append(random_state_s, random_state_a) # if abs(random_state[3]) < 5 and abs(random_state[4]) < 5: # acceptable = True state_failed = self.state_failed(random_state) if not state_failed: acceptable = True return random_state def next_state(self, state, action): # Note that the states defined in env are different state_dynamics = state[:9 * self.n] action_dynamics = np.append( action, state[9 * self.n:9 * self.n + 4 * (self.n - 1)]) state_change = self.dyn_nn.predict(state_dynamics, action_dynamics) next_state = state_change next_state[3:6] = state[3:6] + state_change[3:6] # next_state = state[:9] + state_change past_state = state[:9 * (self.n - 1)] new_state = np.concatenate((next_state, state[:9 * (self.n - 1)])) new_action = np.concatenate( (action, state[9 * self.n:9 * self.n + 4 * (self.n - 2)])) # return np.concatenate((new_state, new_action)) def step(self, ac): action = self.action_dict[ac] new_state = self.next_state(self.state, action) self.state = new_state reward = self.get_reward_state(new_state) done = False if self.state_failed( new_state) or not self.observation_space.contains(new_state): done = True return self.state, reward, done, {} def reset(self): self.state = self.sample_random_state() return self.state
class Color: def __init__(self, color, shade): """ Implements a color class characterized by a color and shade attributes. Parameters ---------- color: str Color in red, blue, green. shade: str Shade in light, dark. """ self.color = color self.shade = shade if color == 'blue': if shade == 'light': self.space = Box(low=np.array([0.3, 0.7, 0.9]), high=np.array([0.5, 0.8, 1.]), dtype=np.float32) elif shade == 'dark': self.space = Box(low=np.array([0.0, 0., 0.8]), high=np.array([0.2, 0.2, 0.9]), dtype=np.float32) else: raise NotImplementedError("shade is either 'light' or 'dark'") elif color == 'red': if shade == 'light': self.space = Box(low=np.array([0.9, 0.4, 0.35]), high=np.array([1, 0.6, 0.65]), dtype=np.float32) elif shade == 'dark': self.space = Box(low=np.array([0.5, 0., 0.]), high=np.array([0.7, 0.1, 0.1]), dtype=np.float32) else: raise NotImplementedError("shade is either 'light' or 'dark'") elif color == 'green': if shade == 'light': self.space = Box(low=np.array([0.4, 0.8, 0.4]), high=np.array([0.6, 1, 0.5]), dtype=np.float32) elif shade == 'dark': self.space = Box(low=np.array([0., 0.4, 0.]), high=np.array([0.1, 0.6, 0.1]), dtype=np.float32) else: raise NotImplementedError elif color == 'dark': if shade == 'dark': self.space = Box(low=np.array([0., 0., 0.]), high=np.array([0.3, 0.3, 0.3]), dtype=np.float32) elif shade == 'light': self.space = Box(low=np.array([1., 1., 1.]), high=np.array([2., 2., 2.]), dtype=np.float32) else: raise NotImplementedError else: raise NotImplementedError("color is 'red', 'blue' or 'green'") def contains(self, rgb): """ Whether the class contains a given rgb code. Parameters ---------- rgb: 1D nd.array of size 3 Returns ------- contains: Bool True if rgb code in given Color class. """ contains = self.space.contains(rgb) if self.color == 'red' and self.shade == 'light': contains = contains and (rgb[2] - rgb[1] <= 0.05) return contains def sample(self): """ Sample an rgb code from the Color class Returns ------- rgb: 1D nd.array of size 3 """ rgb = np.random.uniform(self.space.low, self.space.high, 3) if self.color == 'red' and self.shade == 'light': rgb[2] = rgb[1] + np.random.uniform(-0.05, 0.05) return rgb
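# --- Added usage sketch (not part of the original source): sample an RGB code
# from a Color and check membership. The light-red case couples the blue channel
# to green, which is why contains() adds the rgb[2] - rgb[1] <= 0.05 test on top
# of the plain Box check.
import numpy as np

light_red = Color('red', 'light')
rgb = light_red.sample().astype(np.float32)
assert light_red.contains(rgb)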
class Environment(gym.Env):
    """
    Markov Decision Process associated to the microgrid.

    Parameters
    ----------
    microgrid: microgrid, mandatory
        The controlled microgrid.
    random_seed: int, optional
        Seed to be used to generate the needed random numbers to size microgrids.
    """

    def __init__(self, env_config, seed=42):
        # Set seed
        np.random.seed(seed)
        self.states_normalization = Preprocessing.normalize_environment_states(
            env_config['microgrid'])
        self.TRAIN = True
        # Microgrid
        self.env_config = env_config
        self.mg = env_config['microgrid']
        # State space
        self.mg.train_test_split()
        #np.zeros(2+self.mg.architecture['grid']*3+self.mg.architecture['genset']*1)
        # Number of states
        self.Ns = len(self.mg._df_record_state.keys()) + 1
        # Number of actions
        #training_reward_smoothing
        try:
            self.training_reward_smoothing = env_config['training_reward_smoothing']
        except:
            self.training_reward_smoothing = 'sqrt'
        try:
            self.resampling_on_reset = env_config['resampling_on_reset']
        except:
            self.resampling_on_reset = True
        if self.resampling_on_reset == True:
            self.forecast_args = env_config['forecast_args']
            self.baseline_sampling_args = env_config['baseline_sampling_args']
            self.saa = generate_sampler(self.mg, self.forecast_args)
        self.observation_space = Box(low=-1,
                                     high=np.inf,
                                     shape=(self.Ns, ),
                                     dtype=np.float64)
        #np.zeros(len(self.mg._df_record_state.keys()))
        # Action space
        self.metadata = {"render.modes": ["human"]}
        self.state, self.reward, self.done, self.info, self.round = None, None, None, None, None
        # Start the first round
        self.seed()
        self.reset()
        try:
            assert (self.observation_space.contains(self.state))
        except AssertionError:
            print("ERROR : INVALID STATE", self.state)

    def get_reward(self):
        if self.TRAIN == True:
            if self.training_reward_smoothing == 'sqrt':
                return -(self.mg.get_cost()**0.5)
            if self.training_reward_smoothing == 'peak_load':
                return -self.mg.get_cost() / self.mg.parameters['load'].values[0]
        return -self.mg.get_cost()

    def get_cost(self):
        return sum(self.mg._df_record_cost['cost'])

    def step(self, action):
        # CONTROL
        if self.done:
            print("WARNING : EPISODE DONE")  # should never reach this point
            return self.state, self.reward, self.done, self.info
        try:
            assert (self.observation_space.contains(self.state))
        except AssertionError:
            print("ERROR : INVALID STATE", self.state)
        try:
            assert (self.action_space.contains(action))
        except AssertionError:
            print("ERROR : INVALID ACTION", action)
        # UPDATE THE MICROGRID
        control_dict = self.get_action(action)
        self.mg.run(control_dict)
        # COMPUTE NEW STATE AND REWARD
        self.state = self.transition()
        self.reward = self.get_reward()
        self.done = self.mg.done
        self.info = {}
        self.round += 1
        return self.state, self.reward, self.done, self.info
        # control_dict = self.get_action(action)
        # self.mg.run(control_dict)
        # reward = self.reward()
        # s_ = self.transition()
        # self.state = s_
        # done = self.mg.done
        # self.round += 1
        # return s_, reward, done, {}

    def reset(self, testing=False):
        if "testing" in self.env_config:
            testing = self.env_config["testing"]
        self.round = 1
        # Resetting microgrid
        self.mg.reset(testing=testing)
        if testing == True:
            self.TRAIN = False
        elif self.resampling_on_reset == True:
            Preprocessing.sample_reset(self.mg.architecture['grid'] == 1,
                                       self.saa,
                                       self.mg,
                                       sampling_args=self.baseline_sampling_args)
        self.state, self.reward, self.done, self.info = self.transition(), 0, False, {}
        return self.state

    def get_action(self, action):
        """
        :param action: current action
        :return: control_dict : dict of controls
        """
        '''
        Actions are:
        binary variable whether charging or discharging
        battery power, normalized to 1
        binary variable whether importing or exporting
        grid power, normalized to 1
        binary variable whether genset is on or off
        genset power, normalized to 1
        '''
        control_dict = []
        return control_dict

    def states(self):
        # soc, price, load, pv
        'df status?'
        observation_space = []
        return observation_space

    # Transition function
    def transition(self):
        # net_load = round(self.mg.load - self.mg.pv)
        # soc = round(self.mg.battery.soc,1)
        # s_ = (net_load, soc)  # next state
        updated_values = self.mg.get_updated_values()
        updated_values = {
            x: float(updated_values[x]) / self.states_normalization[x]
            for x in self.states_normalization
        }
        updated_values['hour_sin'] = np.sin(
            2 * np.pi * updated_values['hour'])  # the hour is already divided by 24 in the line above
        updated_values['hour_cos'] = np.cos(2 * np.pi * updated_values['hour'])
        updated_values.pop('hour', None)
        s_ = np.array(list(updated_values.values()))
        #np.array(self.mg.get_updated_values().values)#.astype(np.float)#self.mg.get_updated_values()
        #s_ = [ s_[key] for key in s_.keys()]
        return s_

    def seed(self, seed=None):
        self.np_random, seed = seeding.np_random(seed)
        return [seed]

    def render(self, mode="human"):
        txt = "state: " + str(self.state) + " reward: " + str(
            self.reward) + " info: " + str(self.info)
        print(txt)

    # Mapping between action and the control_dict
    def get_action_continuous(self, action):
        """
        :param action: current action
        :return: control_dict : dict of controls
        """
        '''
        Actions are:
        binary variable whether charging or discharging
        battery power, normalized to 1
        binary variable whether importing or exporting
        grid power, normalized to 1
        binary variable whether genset is on or off
        genset power, normalized to 1
        '''
        print(action)
        mg = self.mg
        pv = mg.pv
        load = mg.load
        net_load = load - pv
        capa_to_charge = mg.battery.capa_to_charge
        p_charge_max = mg.battery.p_charge_max
        p_charge = max(0, min(-net_load, capa_to_charge, p_charge_max))
        capa_to_discharge = mg.battery.capa_to_discharge
        p_discharge_max = mg.battery.p_discharge_max
        p_discharge = max(0, min(net_load, capa_to_discharge, p_discharge_max))
        control_dict = {}
        if mg.architecture['battery'] == 1:
            control_dict['battery_charge'] = max(
                0, action[0] * min(action[1] * mg.battery.capacity,
                                   mg.battery.capa_to_charge,
                                   mg.battery.power_charge))
            control_dict['battery_discharge'] = max(
                0, (1 - action[0]) * min(action[1] * mg.battery.capacity,
                                         mg.battery.capa_to_discharge,
                                         mg.battery.power_discharge))
        if mg.architecture['grid'] == 1:
            if mg.grid.status == 1:
                control_dict['grid_import'] = max(
                    0, action[2] * min(action[3] * mg.grid.power_import,
                                       mg.grid.power_import))
                control_dict['grid_export'] = max(0, (1 - action[2]) * min(
                    action[3] * mg.grid.power_export, mg.grid.power_export))
        if mg.architecture['genset'] == 1:
            control_dict['genset'] = max(
                0, action[4] * min(action[5] * mg.genset.rated_power,
                                   mg.genset.rated_power))
        return control_dict

    def get_action_discrete(self, action):
        """
        :param action: current action
        :return: control_dict : dict of controls
        """
        '''
        Actions are:
        binary variable whether charging or discharging
        battery power, normalized to 1
        binary variable whether importing or exporting
        grid power, normalized to 1
        binary variable whether genset is on or off
        genset power, normalized to 1
        '''
        control_dict = {}
        control_dict['pv_consumed'] = action[0]
        if self.mg.architecture['battery'] == 1:
            control_dict['battery_charge'] = action[1] * action[3]
            control_dict['battery_discharge'] = action[2] * (1 - action[3])
        if self.mg.architecture['genset'] == 1:
            control_dict['genset'] = action[4]
            if self.mg.architecture['grid'] == 1:
                control_dict['grid_import'] = action[5] * action[7]
                control_dict['grid_export'] = action[6] * (1 - action[7])
        elif self.mg.architecture['grid'] == 1:
            control_dict['grid_import'] = action[4] * action[6]
            control_dict['grid_export'] = action[5] * (1 - action[6])
        return control_dict

    # Mapping between action and the control_dict
    def get_action_priority_list(self, action):
        """
        :param action: current action
        :return: control_dict : dict of controls
        """
        '''
        Actions are:
        binary variable whether charging or discharging
        battery power, normalized to 1
        binary variable whether importing or exporting
        grid power, normalized to 1
        binary variable whether genset is on or off
        genset power, normalized to 1
        '''
        mg = self.mg
        pv = mg.pv
        load = mg.load
        net_load = load - pv
        capa_to_charge = mg.battery.capa_to_charge
        p_charge_max = mg.battery.p_charge_max
        p_charge = max(0, min(-net_load, capa_to_charge, p_charge_max))
        capa_to_discharge = mg.battery.capa_to_discharge
        p_discharge_max = mg.battery.p_discharge_max
        p_discharge = max(0, min(net_load, capa_to_discharge, p_discharge_max))
        control_dict = self.actions_agent_discret(mg, action)
        return control_dict

    def actions_agent_discret(self, mg, action):
        if mg.architecture['genset'] == 1 and mg.architecture['grid'] == 1:
            control_dict = self.action_grid_genset(mg, action)
        elif mg.architecture['genset'] == 1 and mg.architecture['grid'] == 0:
            control_dict = self.action_genset(mg, action)
        else:
            control_dict = self.action_grid(mg, action)
        return control_dict

    def action_grid(self, mg, action):
        # slack is grid
        pv = mg.pv
        load = mg.load
        net_load = load - pv
        capa_to_charge = mg.battery.capa_to_charge
        p_charge_max = mg.battery.p_charge_max
        p_charge_pv = max(0, min(-net_load, capa_to_charge, p_charge_max))
        p_charge_grid = max(0, min(capa_to_charge, p_charge_max))
        capa_to_discharge = mg.battery.capa_to_discharge
        p_discharge_max = mg.battery.p_discharge_max
        p_discharge = max(0, min(net_load, capa_to_discharge, p_discharge_max))
        # Charge
        if action == 0:
            control_dict = {
                'pv_consummed': min(pv, load),
                'battery_charge': p_charge_pv,
                'battery_discharge': 0,
                'grid_import': 0,
                'grid_export': max(0, pv - min(pv, load) - p_charge_pv),
                'genset': 0
            }
        if action == 4:
            load = load + p_charge_grid
            control_dict = {
                'pv_consummed': min(pv, load),
                'battery_charge': p_charge_grid,
                'battery_discharge': 0,
                'grid_import': max(0, load - min(pv, load)),
                'grid_export': max(0, pv - min(pv, load) - p_charge_grid),
                'genset': 0
            }
        # discharge fully
        elif action == 1:
            control_dict = {
                'pv_consummed': min(pv, load),
                'battery_charge': 0,
                'battery_discharge': p_discharge,
                'grid_import': max(0, load - min(pv, load) - p_discharge),
                'grid_export': 0,
                'genset': 0
            }
        # Import
        elif action == 2:
            control_dict = {
                'pv_consummed': min(pv, load),
                'battery_charge': 0,
                'battery_discharge': 0,
                'grid_import': max(0, net_load),
                'grid_export': 0,
                'genset': 0
            }
        # Export
        elif action == 3:
            control_dict = {
                'pv_consummed': min(pv, load),
                'battery_charge': 0,
                'battery_discharge': 0,
                'grid_import': 0,
                'grid_export': abs(min(net_load, 0)),
                'genset': 0
            }
        return control_dict

    def action_grid_genset(self, mg, action):
        # slack is grid
        pv = mg.pv
        load = mg.load
        net_load = load - pv
        status = mg.grid.status  # whether there is an outage or not
        capa_to_charge = mg.battery.capa_to_charge
        p_charge_max = mg.battery.p_charge_max
        p_charge_pv = max(0, min(-net_load, capa_to_charge, p_charge_max))
        p_charge_grid = max(0, min(capa_to_charge, p_charge_max))
        capa_to_discharge = mg.battery.capa_to_discharge
        p_discharge_max = mg.battery.p_discharge_max
        p_discharge = max(0, min(net_load, capa_to_discharge, p_discharge_max))
        capa_to_genset = mg.genset.rated_power * mg.genset.p_max
        p_genset = max(0, min(net_load, capa_to_genset))
        # Charge
        if action == 0:
            control_dict = {
                'pv_consummed': min(pv, load),
                'battery_charge': p_charge_pv,
                'battery_discharge': 0,
                'grid_import': 0,
                'grid_export': max(0, pv - min(pv, load) - p_charge_pv) * status,
                'genset': 0
            }
        if action == 5:
            load = load + p_charge_grid
            control_dict = {
                'pv_consummed': min(pv, load),
                'battery_charge': p_charge_grid,
                'battery_discharge': 0,
                'grid_import': max(0, load - min(pv, load)) * status,
                'grid_export': max(0, pv - min(pv, load) - p_charge_grid) * status,
                'genset': 0
            }
        # discharge fully
        elif action == 1:
            control_dict = {
                'pv_consummed': min(pv, load),
                'battery_charge': 0,
                'battery_discharge': p_discharge,
                'grid_import': max(0, load - min(pv, load) - p_discharge) * status,
                'grid_export': 0,
                'genset': 0
            }
        # Import
        elif action == 2:
            control_dict = {
                'pv_consummed': min(pv, load),
                'battery_charge': 0,
                'battery_discharge': 0,
                'grid_import': max(0, net_load) * status,
                'grid_export': 0,
                'genset': 0
            }
        # Export
        elif action == 3:
            control_dict = {
                'pv_consummed': min(pv, load),
                'battery_charge': 0,
                'battery_discharge': 0,
                'grid_import': 0,
                'grid_export': abs(min(net_load, 0)) * status,
                'genset': 0
            }
        # Genset
        elif action == 4:
            control_dict = {
                'pv_consummed': min(pv, load),
                'battery_charge': 0,
                'battery_discharge': 0,
                'grid_import': 0,
                'grid_export': 0,
                'genset': max(net_load, 0)
            }
        elif action == 6:
            control_dict = {
                'pv_consummed': min(pv, load),
                'battery_charge': 0,
                'battery_discharge': p_discharge,
                'grid_import': 0,
                'grid_export': 0,
                'genset': max(0, load - min(pv, load) - p_discharge),
            }
        return control_dict

    def action_genset(self, mg, action):
        # slack is genset
        pv = mg.pv
        load = mg.load
        net_load = load - pv
        capa_to_charge = mg.battery.capa_to_charge
        p_charge_max = mg.battery.p_charge_max
        p_charge = max(0, min(-net_load, capa_to_charge, p_charge_max))
        capa_to_discharge = mg.battery.capa_to_discharge
        p_discharge_max = mg.battery.p_discharge_max
        p_discharge = max(0, min(net_load, capa_to_discharge, p_discharge_max))
        capa_to_genset = mg.genset.rated_power * mg.genset.p_max
        p_genset = max(0, min(net_load, capa_to_genset))
        # Charge
        if action == 0:
            control_dict = {
                'pv_consummed': min(pv, load),
                'battery_charge': p_charge,
                'battery_discharge': 0,
                'grid_import': 0,
                'grid_export': 0,
                'genset': 0
            }
        # discharge fully
        elif action == 1:
            control_dict = {
                'pv_consummed': min(pv, load),
                'battery_charge': 0,
                'battery_discharge': p_discharge,
                'grid_import': 0,
                'grid_export': 0,
                'genset': max(0, load - min(pv, load) - p_discharge)
            }
        # Genset
        elif action == 2:
            control_dict = {
                'pv_consummed': min(pv, load),
                'battery_charge': 0,
                'battery_discharge': 0,
                'grid_import': 0,
                'grid_export': 0,
                'genset': max(0, load - min(pv, load))
            }
        return control_dict
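# --- Added sketch (not from the original source): the cyclic hour encoding used
# in the transition() method above. The hour is normalized to [0, 1] and mapped
# to a point on the unit circle, so hour 23 and hour 0 end up as nearby features.
import numpy as np

hour_norm = 23 / 24.0  # 'hour' is already divided by 24 by the normalization step
hour_sin = np.sin(2 * np.pi * hour_norm)
hour_cos = np.cos(2 * np.pi * hour_norm)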
class CopterEnvBase(gym.Env): metadata = { 'render.modes': ['human', 'rgb_array'], 'video.frames_per_second': 50 } def __init__(self, strict_actions=False): self.viewer = None self.setup = CopterSetup() self._seed() self._strict_action_space = strict_actions self.allowed_fly_range = Box(np.array([-10, -10, 0]), np.array([10, 10, 10])) self.observation_space = Box(-np.inf, np.inf, (4 * 3 + 4)) def _seed(self, seed=None): self.np_random, seed = seeding.np_random(seed) return [seed] def _step(self, action): if self._strict_action_space: assert self.action_space.contains( action), "%r (%s) invalid" % (action, type(action)) else: action = np.clip(action, self.action_space.low, self.action_space.high) self._control = self._control_from_action(action) for i in range(2): simulate(self.copterstatus, self.setup, self._control, 0.01) done = not self.allowed_fly_range.contains(self.copterstatus.position) reward = -1 if done else 0 return self._get_state(), reward, done, { "rotor-speed": self.copterstatus.rotor_speeds } def _get_state(self): s = self.copterstatus return np.concatenate([ s.position, s.velocity, s.attitude, s.angular_velocity, s.rotor_speeds ]) def _reset(self): self.copterstatus = CopterStatus() return self._get_state() def _render(self, mode='human', close=False): if close: if self.viewer is not None: self.viewer.close() self.viewer = None return if self.viewer is None: self.viewer = rendering.Viewer(500, 500) self.center = self.copterstatus.position[0] self.center = 0.9 * self.center + 0.1 * self.copterstatus.position[0] self.viewer.set_bounds(-7 + self.center, 7 + self.center, -1, 13) # draw ground _draw_ground(self.viewer, self.center) _draw_copter(self.viewer, self.setup, self.copterstatus) return self.viewer.render(return_rgb_array=mode == 'rgb_array') def _control_from_action(self, action): raise NotImplementedError()
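# --- Added sketch (not in the original source): the lenient action handling in
# CopterEnvBase._step above. With strict_actions=False, out-of-range actions are
# clipped to the Box bounds instead of triggering the assertion.
import numpy as np
from gym.spaces import Box

space = Box(np.array([-1.0, -1.0]), np.array([1.0, 1.0]), dtype=np.float32)
action = np.array([1.5, -2.0], dtype=np.float32)
clipped = np.clip(action, space.low, space.high)  # -> [1.0, -1.0]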
class FarmEnv(gym.Env):
    """
    Description:
        A wind farm is controlled by yaw angles.
    Observation:
        Type: Box(2n+4)
        Num        Observation                                  Min       Max
        0..n-1     current yaw angle, normalized by max_yaw     -1        1
        n..2n-1    turbulence intensity (rescaled)              0         1
        2n         power ratio relative to nominal power        -1        1
        2n+1       sin(wind angle)                              -1        1
        2n+2       cos(wind angle)                              -1        1
        2n+3       wind speed, normalized by 25.5 m/s           0         1
    Actions:
        Type: Box(n) in [-1, 1] (continuous mode) or MultiDiscrete(n) with
        2 * max_yaw choices per turbine (discrete mode)
        Num        Action                  Min         Max
        0..n-1     yaw angle (°)           -max_yaw    max_yaw
    Reward:
        Reward is the farm power ratio relative to the nominal (zero-yaw) power,
        scaled by 100, for every step taken, including the termination step
    Starting State:
        All yaws at 0; wind angle and speed drawn uniformly within the
        configured bounds
    Episode Termination:
        Episode length reaches 30 steps.
    Solved Requirements:
        TBD
    """

    def __init__(self, config):
        self.farm = config['farm']
        self.numwt = config['num_wind_turbines']
        # initialize yaw boundaries
        self.allowed_yaw = config["max_yaw"]
        self.min_wind_speed = config["min_wind_speed"]
        self.max_wind_speed = config["max_wind_speed"]
        self.min_wind_angle = config["min_wind_angle"]
        self.max_wind_angle = config["max_wind_angle"]
        self.continuous_action_space = config["continuous_action_space"]
        self.best_explored_power = {}
        self.count_steps = 0
        self.initialized_yaw_angle = 0
        self.cur_yaws = np.full((self.numwt, ), 0, dtype=np.int32)
        self.turbine_powers = np.full((self.numwt, ), 0, dtype=np.float64)
        self.turbulent_intensities = np.full((self.numwt, ), 0, dtype=np.float64)
        self.thrust_coefs = np.full((self.numwt, ), 0, dtype=np.float64)
        self.wt_speed_u = np.full((self.numwt, ), 0, dtype=np.float64)
        self.wt_speed_v = np.full((self.numwt, ), 0, dtype=np.float64)
        self.cur_wind_speed = [8.]  # in m/s
        self.cur_wind_angle = [270.]  # in degrees
        self.initial_wind_angle = 0  # in degrees
        self.max_wind_direction_variation = 10  # max wind angle variation (degrees) during an episode
        self.cur_nominal_power = 0
        self.cur_power = 0
        self.cur_power_ratio = 0
        self.cur_nominal_ti_sum = 0
        if self.continuous_action_space:
            # action space is the yaw angles for the wt , to be multiplied by allowed_yaw°
            action_low = np.full((self.numwt, ), -1., dtype=np.float32)
            action_high = np.full((self.numwt, ), 1., dtype=np.float32)
            self.action_space = Box(low=action_low,
                                    high=action_high,
                                    shape=(self.numwt, ))
        else:
            # discrete action space
            self.action_space = MultiDiscrete(
                np.full((self.numwt, ), 2 * self.allowed_yaw, dtype=np.float32))
        print(f'action space : {self.action_space}')
        print(f'action space : {self.action_space.sample()}')
        # observation space TODO
        observation_high = np.concatenate(
            (  # np.array([self.max_wind_angle, self.max_wind_speed])),
                np.array([1.]
* self.numwt ), # yaw max positions for all wind turbines # np.array([1] * self.numwt), # x axis wind speed for all wind turbines # np.array([1] * self.numwt), # y axis wind speed for all wind turbines np.array([1] * self.numwt ), # max turbulence intensity for all wind turbines np.array([1]), # max power ratio # np.array([1] * self.numwt), # max thrust coef for all wind turbines # np.array([1] * self.numwt), # max power coef for all wind turbines np.array([1]), # max sinus wind angle np.array([1]), # max cosinus wind angle np.array( [1])), # max normalized wind speed (range 2 to 25.5 m.s-1) axis=0) observation_low = np.concatenate( ( # np.array([self.min_wind_angle, self.min_wind_speed])), np.array([-1] * self.numwt ), # yaw min positions for all wind turbines # np.array([-1] * self.numwt), # x axis wind speed for all wind turbines # np.array([-1] * self.numwt), # y axis wind speed for all wind turbines np.array([0.] * self.numwt ), # min turbulence intensity for all wind turbines np.array([-1]), # min power ratio # np.array([0] * self.numwt), # min thrust coef for all wind turbines # np.array([0] * self.numwt), # min power coef for all wind turbines np.array([-1]), # min sinus wind angle np.array([-1]), # min cosinus wind angle np.array( [0])), # min normalized wind speed (range 2 to 25.5 m.s-1) axis=0) print(f'observation low : {observation_low}') print(f'observation high : {observation_high}') self.observation_space = Box(low=observation_low, high=observation_high, shape=(self.numwt * 2 + 4, ), dtype=np.float64) print(f'observation space : {self.observation_space}') def reset(self, wd=None, ws=None): self.count_steps = 0 self.cur_yaws = np.full((self.numwt, ), 0, dtype=np.int32) # Define wind conditions for this episode # check wind speed in range (2 to 25,5) assert self.max_wind_speed < 25.5, "max wind speed too high" assert self.min_wind_speed > 2., "min wind speed too low" if wd: self.cur_wind_angle = wd else: self.cur_wind_angle = np.random.randint(self.min_wind_angle, self.max_wind_angle) if ws: self.cur_wind_speed = ws else: self.cur_wind_speed = np.random.uniform(self.min_wind_speed, self.max_wind_speed) self.initial_wind_angle = self.cur_wind_angle # Update the flow in the model print(f'wind angle {self.cur_wind_angle}') print(f'wind speed {self.cur_wind_speed}') self.farm.reinitialize_flow_field(wind_direction=[self.cur_wind_angle], wind_speed=[self.cur_wind_speed]) self.farm.calculate_wake() self.cur_nominal_power = self.farm.get_farm_power() self.best_explored_power[self.cur_wind_angle] = self.cur_nominal_power self.cur_nominal_ti_sum = np.sum(self.farm.get_turbine_ti()) state = self.get_observation() # print(f'initial state is {state}') # print(f'observation space is {self.observation_space}') return state # return current state of the environment def get_observation(self): self.turbulent_intensities = (np.array(self.farm.get_turbine_ti()) - 0.055) / 0.07 #rescaling self.cur_power = self.farm.get_farm_power() # self.thrust_coefs = self.farm.get_turbine_ct() # # turbine_powers = self.farm.get_turbine_power() # self.turbine_powers = turbine_powers / np.max(turbine_powers) # # wind_speed_points_at_wt = pd.DataFrame(self.farm.get_set_of_points(self.farm_layout[0], self.farm_layout[1], [80.] 
* self.numwt).head(self.numwt)) # u_wind_speed_points_at_wt = np.array(wind_speed_points_at_wt.u) # v_wind_speed_points_at_wt = np.array(wind_speed_points_at_wt.v) # self.wt_speed_u = u_wind_speed_points_at_wt / self.cur_wind_speed # self.wt_speed_v = v_wind_speed_points_at_wt / self.cur_wind_speed current_yaws = self.cur_yaws / self.allowed_yaw self.cur_power_ratio = ( self.cur_power - self.cur_nominal_power) / self.cur_nominal_power observation = np.concatenate( ( # self.wt_speed_u, # self.wt_speed_v, current_yaws, self.turbulent_intensities, # self.thrust_coefs, # self.turbine_powers, np.array([self.cur_power_ratio]), np.array([sind(self.cur_wind_angle)]), np.array([cosd(self.cur_wind_angle)]), np.array([self.cur_wind_speed / 25.5]), ), axis=0) return observation def step(self, action, no_variation=False): # check actions validity err_msg = "%r (%s) invalid" % (action, type(action)) assert self.action_space.contains(action), err_msg # Execute the actions if self.continuous_action_space: self.cur_yaws = action * self.allowed_yaw else: self.cur_yaws = action - self.allowed_yaw print(f'current yaws {self.cur_yaws}') if not no_variation: # Apply wind variation if self.cur_wind_angle <= self.initial_wind_angle + self.max_wind_direction_variation[ 0] or self.cur_wind_angle >= self.initial_wind_angle - self.max_wind_direction_variation[ 0]: self.cur_wind_angle = self.cur_wind_angle + np.random.randint( -1, 2) self.farm.reinitialize_flow_field( wind_direction=[self.cur_wind_angle], wind_speed=[self.cur_wind_speed]) print(f'new {self.cur_wind_angle}') self.farm.calculate_wake() self.cur_nominal_power = self.farm.get_farm_power() # Get the Observations from the simulation self.farm.calculate_wake(yaw_angles=self.cur_yaws) observation = self.get_observation() # check observation err_msg = "%r (%s) invalid" % (observation, type(observation)) assert self.observation_space.contains(observation), err_msg # reward calc # power_ratio = (self.cur_power - self.best_explored_power[self.cur_wind_angle]) / self.best_explored_power[self.cur_wind_angle] reward = self.cur_power_ratio * 100 print(f'power ratio {self.cur_power_ratio}') # if self.cur_power > self.best_explored_power[self.cur_wind_angle]: # self.best_explored_power[self.cur_wind_angle] = self.cur_power self.count_steps += 1 # Done Evaluation if self.count_steps == 30: done = True else: done = False return observation, reward, done, {}
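# A hypothetical usage sketch for FarmEnv. The `farm` entry must be a
# FLORIS-style wake-model interface exposing reinitialize_flow_field,
# calculate_wake, get_farm_power and get_turbine_ti (the methods the class
# above calls); building that object is outside the scope of this snippet,
# and all numeric values here are illustrative.
config = {
    'farm': farm,  # wake-model interface, constructed elsewhere
    'num_wind_turbines': 3,
    'max_yaw': 25,
    'min_wind_speed': 5.,
    'max_wind_speed': 12.,
    'min_wind_angle': 250,
    'max_wind_angle': 290,
    'continuous_action_space': True,
}
env = FarmEnv(config)
obs = env.reset()
for _ in range(30):  # episodes terminate after 30 steps
    obs, reward, done, info = env.step(env.action_space.sample())
    if done:
        break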
class PhyHumanoid(gym.Env):
    def __init__(self, config):
        args = ['--arg_file']
        args.append(os.path.join("data", config["config_file"]))

        self.enable_draw = config.get("enable_draw", False)
        self.show_kin = config.get("show_kin", False)
        self.draw_frame_skip = config.get("draw_frame_skip", 5)
        # self.enable_draw = config["enable_draw"]
        # self.show_kin = config["show_kin"]

        if self.enable_draw:
            self.num_draw_frame = 0
            self.win_width = 800
            self.win_height = int(self.win_width * 9.0 / 16.0)
            self._init_draw()

        self.playback_speed = 1

        self._core = DeepMimicCore.cDeepMimicCore(self.enable_draw)

        rand_seed = np.random.randint(np.iinfo(np.int32).max)
        self._core.SeedRand(rand_seed)

        self._core.ParseArgs(args)
        self._core.Init()
        self._core.SetPlaybackSpeed(self.playback_speed)

        fps = 60
        self.update_timestep = 1.0 / fps
        num_substeps = self._core.GetNumUpdateSubsteps()
        self.timestep = self.update_timestep / num_substeps

        self.agent_id = 0
        self.act_size = self._core.GetActionSize(self.agent_id)
        print(self.act_size)
        self.action_space = Box(-2 * np.pi,
                                2 * np.pi,
                                shape=(self.act_size,),
                                dtype=np.float32)

        self.state_size = self._core.GetStateSize(self.agent_id)
        self.goal_size = self._core.GetGoalSize(self.agent_id)
        self.ground_size = 0
        self.obs_size = self.state_size + self.goal_size + self.ground_size
        self.observation_space = Box(-np.inf,
                                     np.inf,
                                     shape=(self.obs_size,),
                                     dtype=np.float32)

        if self.enable_draw:
            self.reshaping = False
            self._setup_draw()
            if self.show_kin:
                self._core.Keyboard(107, 0, 0)

    def _init_draw(self):
        glutInit()

        # glutInitContextVersion(3, 2)
        glutInitContextFlags(GLUT_FORWARD_COMPATIBLE)
        glutInitContextProfile(GLUT_CORE_PROFILE)

        glutInitDisplayMode(GLUT_RGBA | GLUT_DOUBLE | GLUT_DEPTH)
        glutInitWindowSize(self.win_width, self.win_height)
        glutCreateWindow(b'ExtDeepMimic')
        return

    def _setup_draw(self):
        self._reshape(self.win_width, self.win_height)
        self._core.Reshape(self.win_width, self.win_height)
        return

    def _draw(self):
        if self.num_draw_frame % self.draw_frame_skip == 0:
            self._update_intermediate_buffer()
            self._core.Draw()
            glutSwapBuffers()
            self.reshaping = False
        self.num_draw_frame += 1
        return

    def _reshape(self, w, h):
        self.reshaping = True
        self.win_width = w
        self.win_height = h
        return

    def _shutdown(self):
        print('Shutting down...')
        self._core.Shutdown()
        sys.exit(0)
        return

    def _update_intermediate_buffer(self):
        if not self.reshaping:
            if (self.win_width != self._core.GetWinWidth()
                    or self.win_height != self._core.GetWinHeight()):
                self._core.Reshape(self.win_width, self.win_height)
        return

    def reset(self):
        self._core.Reset()
        if self.enable_draw:
            self.num_draw_frame = 0
            self._draw()
        obs = self._get_observation()
        return obs

    def step(self, action):
        prev_obs = self._get_observation()
        self._core.SetAction(self.agent_id, np.asarray(action).tolist())
        # self._core.Update(self.timestep)

        need_update = True
        while need_update:
            self._core.Update(self.timestep)
            if self.enable_draw:
                self._draw()

            valid_episode = self._core.CheckValidEpisode()
            if valid_episode:
                done = self._core.IsEpisodeEnd()
                if done:
                    obs = self._get_observation()
                    valid_obs, obs = self._check_observation(obs, prev_obs)
                    if valid_obs:
                        reward = self._core.CalcReward(self.agent_id)
                        return obs, reward, True, {"valid": 1.0}
                    else:
                        return obs, 0.0, True, {"valid": 0.0}
                else:
                    need_update = not self._core.NeedNewAction(self.agent_id)
            else:
                obs = self._get_observation()
                valid_obs, obs = self._check_observation(obs, prev_obs)
                return obs, 0.0, True, {"valid": 0.0}

        obs = self._get_observation()
        valid_obs, obs = self._check_observation(obs, prev_obs)
        if valid_obs:
            reward = self._core.CalcReward(self.agent_id)
            return obs, reward, False, {"valid": 1.0}
        else:
            return obs, 0.0, True, {"valid": 0.0}

    def _get_observation(self):
        state = np.array(self._core.RecordState(self.agent_id))
        goal = np.array(self._core.RecordGoal(self.agent_id))
        # ground = np.array(self._core.RecordGround(self.agent_id))
        obs = np.concatenate([state, goal], axis=0)
        return obs

    def _check_observation(self, obs, prev_obs):
        if np.isnan(obs).any():
            return False, prev_obs
        elif self.observation_space.contains(obs):
            return True, obs
        else:
            return False, prev_obs

    def get_state_size(self, agent_id):
        return self._core.GetStateSize(agent_id)

    def get_goal_size(self, agent_id):
        return self._core.GetGoalSize(agent_id)

    def get_action_size(self, agent_id):
        return self._core.GetActionSize(agent_id)

    def get_num_actions(self, agent_id):
        return self._core.GetNumActions(agent_id)

    def build_state_offset(self, agent_id):
        return np.array(self._core.BuildStateOffset(agent_id))

    def build_state_scale(self, agent_id):
        return np.array(self._core.BuildStateScale(agent_id))

    def build_goal_offset(self, agent_id):
        return np.array(self._core.BuildGoalOffset(agent_id))

    def build_goal_scale(self, agent_id):
        return np.array(self._core.BuildGoalScale(agent_id))

    def build_action_offset(self, agent_id):
        return np.array(self._core.BuildActionOffset(agent_id))

    def build_action_scale(self, agent_id):
        return np.array(self._core.BuildActionScale(agent_id))

    def build_action_bound_min(self, agent_id):
        return np.array(self._core.BuildActionBoundMin(agent_id))

    def build_action_bound_max(self, agent_id):
        return np.array(self._core.BuildActionBoundMax(agent_id))

    def build_state_norm_groups(self, agent_id):
        return np.array(self._core.BuildStateNormGroups(agent_id))

    def build_goal_norm_groups(self, agent_id):
        return np.array(self._core.BuildGoalNormGroups(agent_id))

    def get_reward_min(self, agent_id):
        return self._core.GetRewardMin(agent_id)

    def get_reward_max(self, agent_id):
        return self._core.GetRewardMax(agent_id)
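# A hypothetical usage sketch for PhyHumanoid. It assumes the DeepMimicCore
# Python bindings are built and that "config_file" names a DeepMimic arg
# file under data/ (the file name below is illustrative).
env = PhyHumanoid({"config_file": "run_humanoid3d_walk_args.txt",
                   "enable_draw": False})
obs = env.reset()
done = False
while not done:
    obs, reward, done, info = env.step(env.action_space.sample())
    # info["valid"] flags whether the returned observation passed the
    # NaN / bounds checks in _check_observation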
    def split(self, nid):
        # Try nb_split_attempts splits on region corresponding to node <nid>
        reg = self.tree.get_node(nid).data
        best_split_score = 0
        best_bounds = None
        best_sub_regions = None
        best_alp = None
        is_split = False
        for i in range(self.nb_split_attempts):
            sub_reg1 = [
                deque(maxlen=self.maxlen + 1),
                deque(maxlen=self.maxlen + 1)
            ]
            sub_reg2 = [
                deque(maxlen=self.maxlen + 1),
                deque(maxlen=self.maxlen + 1)
            ]

            # repeat until the two sub regions contain at least minlen of the mother region
            while len(sub_reg1[0]) < self.minlen or len(sub_reg2[0]) < self.minlen:
                # decide on dimension
                dim = self.random_state.choice(range(self.nb_dims))
                threshold = reg.bounds.sample()[dim]
                bounds1 = Box(reg.bounds.low, reg.bounds.high, dtype=np.float32)
                bounds1.high[dim] = threshold
                bounds2 = Box(reg.bounds.low, reg.bounds.high, dtype=np.float32)
                bounds2.low[dim] = threshold
                bounds = [bounds1, bounds2]
                valid_bounds = True

                if np.any(bounds1.high - bounds1.low <
                          self.dims_ranges * self.min_dims_range_ratio):
                    valid_bounds = False
                if np.any(bounds2.high - bounds2.low <
                          self.dims_ranges * self.min_dims_range_ratio):
                    valid_bounds = False

                # perform split in sub regions
                sub_reg1 = [
                    deque(maxlen=self.maxlen + 1),
                    deque(maxlen=self.maxlen + 1)
                ]
                sub_reg2 = [
                    deque(maxlen=self.maxlen + 1),
                    deque(maxlen=self.maxlen + 1)
                ]
                for j, task in enumerate(reg.r_t_pairs[1]):
                    if bounds1.contains(task):
                        sub_reg1[1].append(task)
                        sub_reg1[0].append(reg.r_t_pairs[0][j])
                    else:
                        sub_reg2[1].append(task)
                        sub_reg2[0].append(reg.r_t_pairs[0][j])
                sub_regions = [sub_reg1, sub_reg2]

            # compute alp
            alp = [self.compute_alp(sub_reg1), self.compute_alp(sub_reg2)]

            # compute score, weighting the ALP difference by the number of
            # points in each sub-region
            split_score = len(sub_reg1[0]) * len(sub_reg2[0]) * np.abs(alp[0] - alp[1])
            if split_score >= best_split_score and valid_bounds:
                is_split = True
                best_split_score = split_score
                best_sub_regions = sub_regions
                best_bounds = bounds
                best_alp = alp

        if is_split:
            # add new nodes to tree
            for i, (r_t_pairs, bounds) in enumerate(zip(best_sub_regions, best_bounds)):
                self.tree.create_node(identifier=self.tree.size(),
                                      parent=nid,
                                      data=Region(self.maxlen,
                                                  r_t_pairs=r_t_pairs,
                                                  bounds=bounds,
                                                  alp=best_alp[i]))
        else:
            assert len(reg.r_t_pairs[0]) == (self.maxlen + 1)
            reg.r_t_pairs[0] = deque(
                islice(reg.r_t_pairs[0], int(self.maxlen * self.discard_ratio),
                       self.maxlen + 1))
            reg.r_t_pairs[1] = deque(
                islice(reg.r_t_pairs[1], int(self.maxlen * self.discard_ratio),
                       self.maxlen + 1))

        return is_split
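# The core operation in split() is cutting a gym Box region in two along a
# randomly chosen dimension at a sampled threshold. A minimal standalone
# sketch of just that step, with toy 2-D bounds (all values illustrative):
import numpy as np
from gym.spaces import Box

region = Box(np.array([0., 0.]), np.array([1., 1.]), dtype=np.float32)

dim = np.random.randint(region.shape[0])  # dimension to split on
threshold = region.sample()[dim]          # cut point inside the region

# Box copies its low/high arrays, so mutating the children leaves the
# parent region untouched
lo_half = Box(region.low, region.high, dtype=np.float32)
lo_half.high[dim] = threshold
hi_half = Box(region.low, region.high, dtype=np.float32)
hi_half.low[dim] = threshold

# every point of the parent falls in at least one child
point = region.sample()
assert lo_half.contains(point) or hi_half.contains(point)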
class ParametrizedWaves(object):
    """The main OpenAI Gym class. It encapsulates an environment with
    arbitrary behind-the-scenes dynamics. An environment can be
    partially or fully observed.

    The main API methods that users of this class need to know are:

        step
        reset
        render
        close
        seed

    And set the following attributes:

        action_space: The Space object corresponding to valid actions
        observation_space: The Space object corresponding to valid observations
        reward_range: A tuple corresponding to the min and max possible rewards

    Note: a default reward range set to [-inf,+inf] already exists. Set it if
    you want a narrower range.

    The methods are accessed publicly as "step", "reset", etc.. The
    non-underscored versions are wrapper methods to which we may add
    functionality over time.
    """

    # Set this in SOME subclasses
    metadata = {'render.modes': []}
    reward_range = (-float('inf'), float('inf'))
    spec = None

    def __init__(self):
        super().__init__()
        self.amplitude = np.random.randint(100) / 10
        self.frequency = np.random.randint(100) / 10
        self.action_space = Box(low=-10 * np.ones(2), high=10 * np.ones(2))
        self.observation_space = Box(low=-10 * np.ones(2), high=10 * np.ones(2))

    def step(self, action):
        """Run one timestep of the environment's dynamics. When end of
        episode is reached, you are responsible for calling `reset()`
        to reset this environment's state.

        Accepts an action and returns a tuple (observation, reward, done, info).

        Args:
            action (object): an action provided by the agent

        Returns:
            observation (object): agent's observation of the current environment
            reward (float): amount of reward returned after previous action
            done (boolean): whether the episode has ended, in which case further
                step() calls will return undefined results
            info (dict): contains auxiliary diagnostic information (helpful for
                debugging, and sometimes learning)
        """
        if not self.action_space.contains(action):
            raise Exception("Action %s is not in action space." % action)

        x = np.linspace(0, 1, 1000)
        wave_ground_truth = eval_wave(x, self.amplitude, self.frequency)
        wave = eval_wave(x, action[0], action[1])

        # reward = -squared_loss(wave_ground_truth, wave)
        reward = -squared_loss(np.array([self.amplitude, self.frequency]),
                               np.array([action[0], action[1]]))
        observation = np.array([self.amplitude, self.frequency])
        done = True
        return observation, reward, done, {}

    def reset(self):
        """Resets the state of the environment and returns an initial observation.

        Returns:
            observation (object): the initial observation of the space.
        """
        self.amplitude = np.random.randint(100) / 10
        self.frequency = np.random.randint(100) / 10
        observation = np.array([self.amplitude, self.frequency])
        return observation

    def render(self, mode='human'):
        """Renders the environment.

        The set of supported modes varies per environment. (And some
        environments do not support rendering at all.) By convention, if mode is:

        - human: render to the current display or terminal and return nothing.
          Usually for human consumption.
        - rgb_array: Return an numpy.ndarray with shape (x, y, 3), representing
          RGB values for an x-by-y pixel image, suitable for turning into a video.
        - ansi: Return a string (str) or StringIO.StringIO containing a
          terminal-style text representation. The text can include newlines
          and ANSI escape sequences (e.g. for colors).

        Note:
            Make sure that your class's metadata 'render.modes' key includes
            the list of supported modes. It's recommended to call super()
            in implementations to use the functionality of this method.

        Args:
            mode (str): the mode to render with

        Example:

        class MyEnv(Env):
            metadata = {'render.modes': ['human', 'rgb_array']}

            def render(self, mode='human'):
                if mode == 'rgb_array':
                    return np.array(...)  # return RGB frame suitable for video
                elif mode == 'human':
                    ...  # pop up a window and render
                else:
                    super(MyEnv, self).render(mode=mode)  # just raise an exception
        """
        raise NotImplementedError

    def close(self):
        """Override close in your subclass to perform any necessary cleanup.

        Environments will automatically close() themselves when
        garbage collected or when the program exits.
        """
        return

    def seed(self, seed=None):
        """Sets the seed for this env's random number generator(s).

        Note:
            Some environments use multiple pseudorandom number generators.
            We want to capture all such seeds used in order to ensure that
            there aren't accidental correlations between multiple generators.

        Returns:
            list<bigint>: Returns the list of seeds used in this env's random
              number generators. The first value in the list should be the
              "main" seed, or the value which a reproducer should pass to
              'seed'. Often, the main seed equals the provided 'seed', but
              this won't be true if seed=None, for example.
        """
        logger.warn("Could not seed environment %s", self)
        return

    @property
    def unwrapped(self):
        """Completely unwrap this env.

        Returns:
            gym.Env: The base non-wrapped gym.Env instance
        """
        return self

    def __str__(self):
        if self.spec is None:
            return '<{} instance>'.format(type(self).__name__)
        else:
            return '<{}<{}>>'.format(type(self).__name__, self.spec.id)
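# A minimal interaction sketch for ParametrizedWaves. It assumes eval_wave
# and squared_loss are the helpers defined elsewhere in this module; the
# episode is one-shot, so done is True after a single step.
env = ParametrizedWaves()
obs = env.reset()                    # the hidden (amplitude, frequency) pair
action = env.action_space.sample()   # a guessed (amplitude, frequency)
obs, reward, done, info = env.step(action)
# reward is the negative squared error between the guessed and true
# (amplitude, frequency) parameters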