Example #1
 def step(self, action):
     if self.MANUAL_COLLISION:
         old_pos = self.env.get_xy()
         inner_next_obs, inner_rew, done, info = self.env.step(action)
         new_pos = self.env.get_xy()
         if self._is_in_collision(new_pos):
             self.env.set_xy(old_pos)
             done = False
     else:
         inner_next_obs, inner_rew, done, info = self.env.step(action)
     next_obs = self.get_current_obs()
     x, y = self.env.get_body_com("torso")[:2]
     # ref_x = x + self._init_torso_x
     # ref_y = y + self._init_torso_y
     info['outer_rew'] = 0
     info['inner_rew'] = inner_rew
     reward = self.coef_inner_rew * inner_rew
     minx, maxx, miny, maxy = self._goal_range
     if minx <= x <= maxx and miny <= y <= maxy:
         done = True
         reward += self.goal_rew
         # we keep the original reward here, so that the AvgReturn is directly
         # the frequency of success
         info['rew_rew'] = 1
     return Step(next_obs, reward, done, **info)
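For context, Example #1 relies on a _is_in_collision helper that is not shown above. A minimal sketch of what it might look like, assuming the maze layout lives in hypothetical attributes MAZE_STRUCTURE (a 2-D list whose wall cells are 1) and MAZE_SIZE_SCALING (the edge length of one cell); only self._init_torso_x / self._init_torso_y appear in the example itself:

 def _is_in_collision(self, pos):
     # sketch only: report whether the (x, y) position falls inside a wall cell
     x, y = pos
     structure = self.MAZE_STRUCTURE      # assumed attribute, not from the example
     size = self.MAZE_SIZE_SCALING        # assumed attribute, not from the example
     for i, row in enumerate(structure):
         for j, cell in enumerate(row):
             if cell != 1:
                 continue
             minx = j * size - size * 0.5 - self._init_torso_x
             maxx = j * size + size * 0.5 - self._init_torso_x
             miny = i * size - size * 0.5 - self._init_torso_y
             maxy = i * size + size * 0.5 - self._init_torso_y
             if minx <= x <= maxx and miny <= y <= maxy:
                 return True
     return False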
Example #2
    def step(self, action):
        """
        action map:
        0: left
        1: down
        2: right
        3: up
        :param action: should be a one-hot vector encoding the action
        :return: a Step namedtuple of (observation, reward, done)
        """
        possible_next_states = self.get_possible_next_states(
            self.state, action)

        probs = [x[1] for x in possible_next_states]
        next_state_idx = np.random.choice(len(probs), p=probs)
        next_state = possible_next_states[next_state_idx][0]

        next_x = next_state // self.n_col
        next_y = next_state % self.n_col

        next_state_type = self.desc[next_x, next_y]
        if next_state_type == 'H':
            done = True
            reward = 0
        elif next_state_type in ['F', 'S']:
            done = False
            reward = 0
        elif next_state_type == 'G':
            done = True
            reward = 1
        else:
            raise NotImplementedError
        self.state = next_state
        return Step(observation=self.state, reward=reward, done=done)
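Example #2 delegates the transition model to get_possible_next_states, which must return a list of (next_state, probability) pairs whose probabilities sum to 1. A minimal deterministic sketch under the action map from the docstring, assuming self.desc is indexed as desc[row, col] and that walls (if any) are marked 'W'; it accepts either an integer action or a one-hot vector:

    def get_possible_next_states(self, state, action):
        # sketch only: deterministic moves on an n_row x n_col grid
        if np.ndim(action) > 0:
            action = int(np.argmax(action))      # convert a one-hot vector
        x, y = state // self.n_col, state % self.n_col
        # increments matching the docstring: 0 left, 1 down, 2 right, 3 up
        dx, dy = [(0, -1), (1, 0), (0, 1), (-1, 0)][action]
        next_x = min(max(x + dx, 0), self.n_row - 1)
        next_y = min(max(y + dy, 0), self.n_col - 1)
        # assumed convention: 'W' cells block movement; holes and goal absorb
        if self.desc[next_x, next_y] == 'W' or self.desc[x, y] in ['H', 'G']:
            return [(state, 1.0)]
        return [(next_x * self.n_col + next_y, 1.0)]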
Example #3
 def step(self, action):
     self.forward_dynamics(action)
     next_obs = self.get_current_obs()
     lb, ub = self.action_bounds
     scaling = (ub - lb) * 0.5
     ctrl_cost = 0.5 * self.ctrl_cost_coeff * np.sum(
         np.square(action / scaling))
     forward_reward = self.get_body_comvel("torso")[0]
     reward = forward_reward - ctrl_cost
     done = False
     return Step(next_obs, reward, done)
Example #4
    def step(self, action):
        # Copy action first to remove side effects
        action_copy = copy(action)

        self.forward_dynamics(action_copy)
        next_obs = self.get_current_obs()
        action_copy = np.clip(action_copy, *self.action_bounds)
        ctrl_cost = 1e-1 * 0.5 * np.sum(np.square(action_copy))
        assert action.all() == action_copy.all()
        run_cost = -1 * self.get_body_comvel("torso")[0]
        cost = ctrl_cost + run_cost
        reward = -cost
        done = False
        return Step(next_obs, reward, done)
Example #5
 def step(self, action):
     self.forward_dynamics(action)
     next_obs = self.get_current_obs()
     action = np.clip(action, *self.action_bounds)
     lb, ub = self.action_bounds
     scaling = (ub - lb) * 0.5
     ctrl_cost = 0.5 * self.ctrl_cost_coeff * \
         np.sum(np.square(action / scaling))
     forward_reward = self.get_body_comvel("torso")[0]
     reward = forward_reward - ctrl_cost
     qpos = self.sim.data.qpos
     done = not (0.8 < qpos[0] < 2.0 and -1.0 < qpos[2] < 1.0)
     return Step(next_obs, reward, done)
Example #6
 def step(self, action):
     qpos = np.copy(self.sim.data.qpos)
     qpos[2] += action[1]
     ori = qpos[2]
     # compute increment in each direction
     dx = math.cos(ori) * action[0]
     dy = math.sin(ori) * action[0]
     # ensure that the robot is within reasonable range
     qpos[0] = np.clip(qpos[0] + dx, -7, 7)
     qpos[1] = np.clip(qpos[1] + dy, -7, 7)
     self.sim.data.qpos[:] = qpos
     self.sim.forward()
     next_obs = self.get_current_obs()
     return Step(next_obs, 0, False)
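In Example #6 the action is kinematic rather than a torque: action[0] is a displacement along the current heading and action[1] an increment to the orientation stored in qpos[2]. A short usage sketch, assuming the environment instance is called env, that reset() exists, and that the returned Step unpacks into (observation, reward, done, info):

# sketch: drive the point robot forward while turning slowly
obs = env.reset()
for _ in range(200):
    action = np.array([0.1, 0.05])   # 0.1 forward, rotate heading by 0.05 rad
    obs, reward, done, info = env.step(action)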
Example #7
 def step(self, action):
     self.forward_dynamics(action)
     comvel = self.get_body_comvel("torso")
     forward_reward = comvel[0]
     lb, ub = self.action_bounds
     scaling = (ub - lb) * 0.5
     ctrl_cost = 0.5 * 1e-2 * np.sum(np.square(action / scaling))
     contact_cost = 0.5 * 1e-3 * np.sum(
         np.square(np.clip(self.sim.data.cfrc_ext, -1, 1)))
     survive_reward = 0.05
     reward = forward_reward - ctrl_cost - contact_cost + survive_reward
     state = self._state
     notdone = np.isfinite(state).all() and 0.2 <= state[2] <= 1.0
     done = not notdone
     ob = self.get_current_obs()
     return Step(ob, float(reward), done)
Example #8
    def step(self, action):
        # enforce action space
        a = action.copy()  # NOTE: we MUST copy the action before modifying it
        a = np.clip(a, self.action_space.low, self.action_space.high)

        dist = np.linalg.norm(self._point - self._goal)
        done = dist < np.linalg.norm(self.action_space.low)

        # dense reward
        reward = -dist
        # done bonus
        if done:
            reward += self._done_bonus

        # sometimes we don't want to terminate
        done = done and not self._never_done

        return Step(np.copy(self._point), reward, done)
Example #9
    def step(self, action):
        """
        Note: override this method with great care, as it post-processes the
        observations, etc.
        """
        # Copy action first to remove side effects
        action_copy = copy(action)

        reward_computer = self.compute_reward(action_copy)
        # forward the state
        action_copy = self._inject_action_noise(action_copy)
        for _ in range(self.frame_skip):
            self.forward_dynamics(action_copy)
        # notifies that we have stepped the world
        next(reward_computer)
        # actually get the reward
        reward = next(reward_computer)
        self._invalidate_state_caches()
        done = self.is_current_done()
        next_obs = self.get_current_obs()
        return Step(observation=next_obs, reward=reward, done=done)
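Example #9 assumes compute_reward is a generator that gets advanced twice: the first next() lets it cache pre-step quantities, and the second next() yields the reward computed from the post-step state. A minimal sketch of a generator following that protocol (the forward-progress reward and the 1e-2 coefficient are illustrative assumptions, not taken from the example):

    def compute_reward(self, action):
        # phase 1: record anything needed before the world is stepped
        prev_com_x = self.get_body_com("torso")[0]
        yield
        # phase 2: the simulation has advanced; emit the reward
        forward_progress = self.get_body_com("torso")[0] - prev_com_x
        ctrl_cost = 1e-2 * np.sum(np.square(action))   # illustrative coefficient
        yield forward_progress - ctrl_cost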
Example #10
    def step(self, action):
        self.forward_dynamics(action)
        next_obs = self.get_current_obs()

        alive_bonus = self.alive_bonus
        data = self.sim.data

        comvel = self.get_body_comvel("torso")

        lin_vel_reward = comvel[0]
        lb, ub = self.action_bounds
        scaling = (ub - lb) * 0.5
        ctrl_cost = .5 * self.ctrl_cost_coeff * np.sum(
            np.square(action / scaling))
        impact_cost = .5 * self.impact_cost_coeff * np.sum(
            np.square(np.clip(data.cfrc_ext, -1, 1)))
        vel_deviation_cost = 0.5 * self.vel_deviation_cost_coeff * np.sum(
            np.square(comvel[1:]))
        reward = lin_vel_reward + alive_bonus - ctrl_cost - \
            impact_cost - vel_deviation_cost
        done = data.qpos[2] < 0.8 or data.qpos[2] > 2.0

        return Step(next_obs, reward, done)
Example #11
 def step(self, action):
     queued_action = self._queued_actions[:self._action_flat_dim]
     next_obs, reward, done, info = self.env.step(queued_action)
     self._queued_actions = np.concatenate(
         [self._queued_actions[self._action_flat_dim:], action])
     return Step(next_obs, reward, done, **info)
Example #12
 def step(self, action):
     next_obs, reward, done, info = self.env.step(action)
     return Step(self.inject_obs_noise(next_obs), reward, done, **info)
Example #13
 def step(self, action):
     next_obs, reward, done, info = self.env.step(action)
     self.add_to_buffer(next_obs)
     return Step(self.buffer, reward, done, **info)
Example #14
 def step(self, action):
     next_obs, reward, done, info = self.env.step(action)
     return Step(self.occlude(next_obs), reward, done, **info)
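Examples #11 through #14 are thin delegating wrappers: each forwards the action to an inner env.step and post-processes the result (queueing actions, injecting observation noise, stacking a history buffer, or occluding part of the observation). Because they keep the same step signature they compose; a hypothetical sketch with placeholder class names, assuming base_env exposes an action_space with sample():

# hypothetical composition: base_env -> delayed actions -> obs noise -> occlusion
env = OcclusionWrapper(NoisyObservationWrapper(DelayedActionWrapper(base_env)))
next_obs, reward, done, info = env.step(base_env.action_space.sample())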