def step(self, action):
    if self.MANUAL_COLLISION:
        old_pos = self.env.get_xy()
        inner_next_obs, inner_rew, done, info = self.env.step(action)
        new_pos = self.env.get_xy()
        if self._is_in_collision(new_pos):
            # Undo the move on collision and keep the episode alive.
            self.env.set_xy(old_pos)
            done = False
    else:
        inner_next_obs, inner_rew, done, info = self.env.step(action)
    next_obs = self.get_current_obs()
    x, y = self.env.get_body_com("torso")[:2]
    # ref_x = x + self._init_torso_x
    # ref_y = y + self._init_torso_y
    info['outer_rew'] = 0
    info['inner_rew'] = inner_rew
    reward = self.coef_inner_rew * inner_rew
    minx, maxx, miny, maxy = self._goal_range
    if minx <= x <= maxx and miny <= y <= maxy:
        done = True
        reward += self.goal_rew
        # we keep here the original reward, so that the AvgReturn is
        # directly the freq of success
        info['rew_rew'] = 1
    return Step(next_obs, reward, done, **info)
def step(self, action): """ action map: 0: left 1: down 2: right 3: up :param action: should be a one-hot vector encoding the action :return: """ possible_next_states = self.get_possible_next_states( self.state, action) probs = [x[1] for x in possible_next_states] next_state_idx = np.random.choice(len(probs), p=probs) next_state = possible_next_states[next_state_idx][0] next_x = next_state // self.n_col next_y = next_state % self.n_col next_state_type = self.desc[next_x, next_y] if next_state_type == 'H': done = True reward = 0 elif next_state_type in ['F', 'S']: done = False reward = 0 elif next_state_type == 'G': done = True reward = 1 else: raise NotImplementedError self.state = next_state return Step(observation=self.state, reward=reward, done=done)
def step(self, action):
    self.forward_dynamics(action)
    next_obs = self.get_current_obs()
    lb, ub = self.action_bounds
    # Normalize the action by half the action range before penalizing it.
    scaling = (ub - lb) * 0.5
    ctrl_cost = 0.5 * self.ctrl_cost_coeff * np.sum(
        np.square(action / scaling))
    forward_reward = self.get_body_comvel("torso")[0]
    reward = forward_reward - ctrl_cost
    done = False
    return Step(next_obs, reward, done)
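# Standalone sketch of the control-cost term used above (and in several of
# the other step() implementations below). The bounds, action values, and
# coefficient are made up; only the formula matches the code.
import numpy as np

lb, ub = np.array([-1.0, -1.0]), np.array([1.0, 1.0])
action = np.array([0.5, -2.0])
scaling = (ub - lb) * 0.5          # half the action range per dimension
ctrl_cost_coeff = 1e-2
ctrl_cost = 0.5 * ctrl_cost_coeff * np.sum(np.square(action / scaling))
print(ctrl_cost)  # 0.5 * 1e-2 * (0.25 + 4.0) = 0.02125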
def step(self, action):
    # Copy action first to remove side effects
    action_copy = copy(action)
    self.forward_dynamics(action_copy)
    next_obs = self.get_current_obs()
    action_copy = np.clip(action_copy, *self.action_bounds)
    ctrl_cost = 1e-1 * 0.5 * np.sum(np.square(action_copy))
    assert action.all() == action_copy.all()
    run_cost = -1 * self.get_body_comvel("torso")[0]
    cost = ctrl_cost + run_cost
    reward = -cost
    done = False
    return Step(next_obs, reward, done)
def step(self, action):
    self.forward_dynamics(action)
    next_obs = self.get_current_obs()
    action = np.clip(action, *self.action_bounds)
    lb, ub = self.action_bounds
    scaling = (ub - lb) * 0.5
    ctrl_cost = 0.5 * self.ctrl_cost_coeff * \
        np.sum(np.square(action / scaling))
    forward_reward = self.get_body_comvel("torso")[0]
    reward = forward_reward - ctrl_cost
    qpos = self.sim.data.qpos
    done = not (qpos[0] > 0.8 and qpos[0] < 2.0
                and qpos[2] > -1.0 and qpos[2] < 1.0)
    return Step(next_obs, reward, done)
def step(self, action):
    qpos = np.copy(self.sim.data.qpos)
    qpos[2] += action[1]
    ori = qpos[2]
    # compute increment in each direction
    dx = math.cos(ori) * action[0]
    dy = math.sin(ori) * action[0]
    # ensure that the robot is within reasonable range
    qpos[0] = np.clip(qpos[0] + dx, -7, 7)
    qpos[1] = np.clip(qpos[1] + dy, -7, 7)
    self.sim.data.qpos[:] = qpos
    self.sim.forward()
    next_obs = self.get_current_obs()
    return Step(next_obs, 0, False)
def step(self, action): self.forward_dynamics(action) comvel = self.get_body_comvel("torso") forward_reward = comvel[0] lb, ub = self.action_bounds scaling = (ub - lb) * 0.5 ctrl_cost = 0.5 * 1e-2 * np.sum(np.square(action / scaling)) contact_cost = 0.5 * 1e-3 * np.sum( np.square(np.clip(self.sim.data.cfrc_ext, -1, 1))), survive_reward = 0.05 reward = forward_reward - ctrl_cost - contact_cost + survive_reward state = self._state notdone = np.isfinite(state).all() \ and state[2] >= 0.2 and state[2] <= 1.0 done = not notdone ob = self.get_current_obs() return Step(ob, float(reward), done)
def step(self, action):
    # enforce action space
    a = action.copy()  # NOTE: we MUST copy the action before modifying it
    a = np.clip(a, self.action_space.low, self.action_space.high)

    dist = np.linalg.norm(self._point - self._goal)
    done = dist < np.linalg.norm(self.action_space.low)

    # dense reward
    reward = -dist
    # done bonus
    if done:
        reward += self._done_bonus

    # sometimes we don't want to terminate
    done = done and not self._never_done

    return Step(np.copy(self._point), reward, done)
def step(self, action): """ Note: override this method with great care, as it post-processes the observations, etc. """ # Copy action first to remove side effects action_copy = copy(action) reward_computer = self.compute_reward(action_copy) # forward the state action_copy = self._inject_action_noise(action_copy) for _ in range(self.frame_skip): self.forward_dynamics(action_copy) # notifies that we have stepped the world next(reward_computer) # actually get the reward reward = next(reward_computer) self._invalidate_state_caches() done = self.is_current_done() next_obs = self.get_current_obs() return Step(observation=next_obs, reward=reward, done=done)
def step(self, action):
    self.forward_dynamics(action)
    next_obs = self.get_current_obs()

    alive_bonus = self.alive_bonus
    data = self.sim.data

    comvel = self.get_body_comvel("torso")
    lin_vel_reward = comvel[0]
    lb, ub = self.action_bounds
    scaling = (ub - lb) * 0.5
    ctrl_cost = .5 * self.ctrl_cost_coeff * np.sum(
        np.square(action / scaling))
    impact_cost = .5 * self.impact_cost_coeff * np.sum(
        np.square(np.clip(data.cfrc_ext, -1, 1)))
    vel_deviation_cost = 0.5 * self.vel_deviation_cost_coeff * np.sum(
        np.square(comvel[1:]))
    reward = lin_vel_reward + alive_bonus - ctrl_cost - \
        impact_cost - vel_deviation_cost
    done = data.qpos[2] < 0.8 or data.qpos[2] > 2.0

    return Step(next_obs, reward, done)
def step(self, action):
    # Pop the oldest queued action and execute it in the wrapped env.
    queued_action = self._queued_actions[:self._action_flat_dim]
    next_obs, reward, done, info = self.env.step(queued_action)
    # Append the newly received action to the back of the queue.
    self._queued_actions = np.concatenate(
        [self._queued_actions[self._action_flat_dim:], action])
    return Step(next_obs, reward, done, **info)
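# Standalone sketch of the FIFO action queue manipulated above; the queue
# length and action values are made up. Each step executes the action that
# was submitted a fixed number of steps earlier.
import numpy as np

action_flat_dim = 2
queued_actions = np.zeros(2 * action_flat_dim)   # delay of two steps
new_action = np.array([0.3, -0.7])

executed = queued_actions[:action_flat_dim]      # oldest action, all zeros here
queued_actions = np.concatenate(
    [queued_actions[action_flat_dim:], new_action])
print(executed)        # [0. 0.]
print(queued_actions)  # [ 0.   0.   0.3 -0.7]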
def step(self, action):
    next_obs, reward, done, info = self.env.step(action)
    return Step(self.inject_obs_noise(next_obs), reward, done, **info)
def step(self, action):
    next_obs, reward, done, info = self.env.step(action)
    self.add_to_buffer(next_obs)
    return Step(self.buffer, reward, done, **info)
def step(self, action):
    next_obs, reward, done, info = self.env.step(action)
    return Step(self.occlude(next_obs), reward, done, **info)