def step(self, action): """ action map: 0: left 1: down 2: right 3: up :param action: should be a one-hot vector encoding the action :return: """ possible_next_states = self.get_possible_next_states( self.state, action) probs = [x[1] for x in possible_next_states] next_state_idx = np.random.choice(len(probs), p=probs) next_state = possible_next_states[next_state_idx][0] next_x = next_state // self.n_col next_y = next_state % self.n_col next_state_type = self.desc[next_x, next_y] if next_state_type == 'H': done = True reward = 0 elif next_state_type in ['F', 'S']: done = False reward = 0 elif next_state_type == 'G': done = True reward = 1 else: raise NotImplementedError self.state = next_state return Step(observation=self.state, reward=reward, done=done)
def step(self, action):
    self._state = self._state + action
    x, y = self._state
    x -= self._goal[0]
    y -= self._goal[1]
    reward = -(x**2 + y**2)**0.5
    done = abs(x) < 0.01 and abs(y) < 0.01
    next_observation = np.copy(self._state)
    return Step(observation=next_observation, reward=reward, done=done)
def step(self, action):
    self.forward_dynamics(action)
    next_obs = self.get_current_obs()
    action = np.clip(action, *self.action_bounds)
    ctrl_cost = 1e-1 * 0.5 * np.sum(np.square(action))
    run_cost = 1. * np.abs(self.get_body_comvel("torso")[0] - self._goal_vel)
    cost = ctrl_cost + run_cost
    reward = -cost
    done = False
    return Step(next_obs, reward, done)
def step(self, action):
    self.forward_dynamics(action)
    next_obs = self.get_current_obs()
    x, _, y = self.model.data.site_xpos[0]
    dist_penalty = 0.01 * x**2 + (y - 2)**2
    v1, v2 = self.model.data.qvel[1:3]
    vel_penalty = 1e-3 * v1**2 + 5e-3 * v2**2
    alive_bonus = 10
    r = float(alive_bonus - dist_penalty - vel_penalty)
    done = y <= 1
    return Step(next_obs, r, done)
def step(self, action):
    _, _, done, info = self.inner_env.step(action)
    if done:
        return Step(self.get_current_obs(), -10, done, **info)
    com = self.inner_env.get_body_com("torso")
    x, y = com[:2]
    reward = 0
    new_objs = []
    for obj in self.objects:
        ox, oy, typ = obj
        # object within zone!
        if (ox - x)**2 + (oy - y)**2 < self.catch_range**2:
            if typ == APPLE:
                reward = reward + 1
            else:
                reward = reward - 1
        else:
            new_objs.append(obj)
    self.objects = new_objs
    done = len(self.objects) == 0
    return Step(self.get_current_obs(), reward, done, **info)
def step(self, action):
    self.forward_dynamics(action)
    next_obs = self.get_current_obs()
    lb, ub = self.action_bounds
    scaling = (ub - lb) * 0.5
    ctrl_cost = 0.5 * self.ctrl_cost_coeff * np.sum(
        np.square(action / scaling))
    forward_reward = -1.5 * np.abs(
        self.get_body_comvel("torso")[0] - self._goal_vel)
    reward = forward_reward - ctrl_cost
    done = False
    return Step(next_obs, reward, done)
def step(self, action):
    self.forward_dynamics(action)
    next_obs = self.get_current_obs()
    lb, ub = self.action_bounds
    scaling = (ub - lb) * 0.5
    ctrl_cost = 0.5 * self.ctrl_cost_coeff * np.sum(
        np.square(action / scaling))
    forward_reward = self.get_body_comvel("torso")[0]
    # forward_reward = -1.5 * np.abs(self.get_body_comvel("torso")[0] - 0.15)
    # max achievable vel is around 0.20 for vpg.
    reward = forward_reward - ctrl_cost
    done = False
    return Step(next_obs, reward, done)
def step(self, action):
    self.forward_dynamics(action)
    next_obs = self.get_current_obs()
    lb, ub = self.action_bounds
    scaling = (ub - lb) * 0.5
    vel = self.get_body_comvel("torso")[0]
    reward = vel + self.alive_coeff - \
        0.5 * self.ctrl_cost_coeff * np.sum(np.square(action / scaling))
    state = self._state
    notdone = np.isfinite(state).all() and \
        (np.abs(state[3:]) < 100).all() and (state[0] > .7) and \
        (abs(state[2]) < .2)
    done = not notdone
    return Step(next_obs, reward, done)
def step(self, action):
    self.forward_dynamics(action)
    next_obs = self.get_current_obs()
    action = np.clip(action, *self.action_bounds)
    lb, ub = self.action_bounds
    scaling = (ub - lb) * 0.5
    ctrl_cost = 0.5 * self.ctrl_cost_coeff * \
        np.sum(np.square(action / scaling))
    forward_reward = self.get_body_comvel("torso")[0]
    reward = forward_reward - ctrl_cost
    qpos = self.model.data.qpos
    done = not (qpos[0] > 0.8 and qpos[0] < 2.0 and
                qpos[2] > -1.0 and qpos[2] < 1.0)
    return Step(next_obs, reward, done)
def step(self, action):
    qpos = np.copy(self.model.data.qpos)
    qpos[2, 0] += action[1]
    ori = qpos[2, 0]
    # compute increment in each direction
    dx = math.cos(ori) * action[0]
    dy = math.sin(ori) * action[0]
    # ensure that the robot is within reasonable range
    qpos[0, 0] = np.clip(qpos[0, 0] + dx, -7, 7)
    qpos[1, 0] = np.clip(qpos[1, 0] + dy, -7, 7)
    self.model.data.qpos = qpos
    self.model.forward()
    next_obs = self.get_current_obs()
    return Step(next_obs, 0, False)
def step(self, action):
    if isinstance(self._wrapped_env.action_space, Box):
        # rescale the action
        lb, ub = self._wrapped_env.action_space.bounds
        scaled_action = lb + (action + 1.) * 0.5 * (ub - lb)
        scaled_action = np.clip(scaled_action, lb, ub)
    else:
        scaled_action = action
    wrapped_step = self._wrapped_env.step(scaled_action)
    next_obs, reward, done, info = wrapped_step
    if self._normalize_obs:
        next_obs = self._apply_normalize_obs(next_obs)
    if self._normalize_reward:
        reward = self._apply_normalize_reward(reward)
    return Step(next_obs, reward * self._scale_reward, done, **info)
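The rescaling above maps a normalized action in [-1, 1] affinely onto the wrapped action space's [lb, ub]: -1 goes to lb, 0 to the midpoint, and +1 to ub, with the final clip guarding against inputs outside [-1, 1]. A standalone check of that map, with made-up bounds:

import numpy as np

lb, ub = np.array([-2.0, 0.0]), np.array([2.0, 1.0])  # hypothetical bounds
for a in (-1.0, 0.0, 1.0):
    scaled = np.clip(lb + (a + 1.) * 0.5 * (ub - lb), lb, ub)
    print(a, scaled)  # -1 -> lb, 0 -> midpoint, 1 -> ub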
def step(self, action): self.forward_dynamics(action) comvel = self.get_body_comvel("torso") forward_reward = self.goal_direction * comvel[0] lb, ub = self.action_bounds scaling = (ub - lb) * 0.5 ctrl_cost = 0.5 * 1e-2 * np.sum(np.square(action / scaling)) contact_cost = 0.5 * 1e-3 * np.sum( np.square(np.clip(self.model.data.cfrc_ext, -1, 1))), survive_reward = 0.05 reward = forward_reward - ctrl_cost - contact_cost + survive_reward state = self._state notdone = np.isfinite(state).all() \ and state[2] >= 0.2 and state[2] <= 1.0 done = not notdone ob = self.get_current_obs() return Step(ob, float(reward), done)
def step(self, action): """ Note: override this method with great care, as it post-processes the observations, etc. """ reward_computer = self.compute_reward(action) # forward the state action = self._inject_action_noise(action) for _ in range(self.frame_skip): self.forward_dynamics(action) # notifies that we have stepped the world next(reward_computer) # actually get the reward reward = next(reward_computer) self._invalidate_state_caches() done = self.is_current_done() next_obs = self.get_current_obs() return Step(observation=next_obs, reward=reward, done=done)
def step(self, action): self.forward_dynamics(action) com = self.get_body_com("torso") # ref_x = x + self._init_torso_x goal_reward = np.exp(-np.sum( np.abs(com[:2] - self._goal_pos))) # make it happy, not suicidal lb, ub = self.action_bounds scaling = (ub - lb) * 0.5 ctrl_cost = 0.5 * 1e-2 * np.sum(np.square(action / scaling)) contact_cost = 0.5 * 1e-3 * np.sum( np.square(np.clip(self.model.data.cfrc_ext, -1, 1))), survive_reward = 0.05 reward = goal_reward - ctrl_cost - contact_cost + survive_reward state = self._state notdone = np.isfinite(state).all() \ and state[2] >= 0.2 and state[2] <= 1.0 done = not notdone ob = self.get_current_obs() return Step(ob, float(reward), done)
def step(self, action, **kwargs):
    """
    Run one timestep of the environment's dynamics. When the end of an
    episode is reached, reset() should be called to reset the environment's
    internal state.

    Input
    -----
    action : an action provided by the agent

    Outputs
    -------
    (observation, reward, done, info)
    observation : agent's observation of the current environment
    reward [Float] : amount of reward due to the previous action
    done : a boolean, indicating whether the episode has ended
    info : a dictionary containing other diagnostic information from the previous action
    """
    prev_state = self._state
    self._state = prev_state + np.clip(action, -0.1, 0.1)
    reward = self.reward(prev_state, action, self._state)
    done = self.done(self._state)
    next_observation = np.copy(self._state)
    return Step(next_observation, reward, done)
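The docstring above describes the (observation, reward, done, info) contract that every step() in this file follows. A hypothetical rollout helper against that contract (rollout, env, and policy are placeholders introduced here, not names defined in this file; only the Step fields are assumed):

def rollout(env, policy, max_path_length=500):
    # Roll out one episode and accumulate reward using the Step fields.
    obs = env.reset()
    total_reward = 0.0
    for _ in range(max_path_length):
        action = policy(obs)     # placeholder: any callable mapping obs -> action
        step = env.step(action)  # Step with .observation, .reward, .done, .info
        total_reward += step.reward
        obs = step.observation
        if step.done:
            break
    return total_reward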
def step(self, action):
    if self.MANUAL_COLLISION:
        old_pos = self.wrapped_env.get_xy()
        _, _, done, info = self.wrapped_env.step(action)
        new_pos = self.wrapped_env.get_xy()
        if self._is_in_collision(new_pos):
            self.wrapped_env.set_xy(old_pos)
            done = False
    else:
        _, _, done, info = self.wrapped_env.step(action)
    next_obs = self.get_current_obs()
    x, y = self.wrapped_env.get_body_com("torso")[:2]
    # ref_x = x + self._init_torso_x
    # ref_y = y + self._init_torso_y
    reward = 0
    minx, maxx, miny, maxy = self._goal_range
    # print "goal range: x [%s,%s], y [%s,%s], now [%s,%s]" % (
    #     str(minx), str(maxx), str(miny), str(maxy), str(x), str(y))
    if minx <= x <= maxx and miny <= y <= maxy:
        done = True
        reward = 1
    return Step(next_obs, reward, done, **info)
def step(self, action):
    self.forward_dynamics(action)
    next_obs = self.get_current_obs()

    alive_bonus = self.alive_bonus
    data = self.model.data
    comvel = self.get_body_comvel("torso")

    lin_vel_reward = comvel[0]
    lb, ub = self.action_bounds
    scaling = (ub - lb) * 0.5
    ctrl_cost = .5 * self.ctrl_cost_coeff * np.sum(
        np.square(action / scaling))
    impact_cost = .5 * self.impact_cost_coeff * np.sum(
        np.square(np.clip(data.cfrc_ext, -1, 1)))
    vel_deviation_cost = 0.5 * self.vel_deviation_cost_coeff * np.sum(
        np.square(comvel[1:]))
    reward = lin_vel_reward + alive_bonus - ctrl_cost - \
        impact_cost - vel_deviation_cost
    done = data.qpos[2] < 0.8 or data.qpos[2] > 2.0
    return Step(next_obs, reward, done)
def step(self, action):
    next_obs, reward, done, info = self._wrapped_env.step(action)
    return Step(self.inject_obs_noise(next_obs), reward, done, **info)
def step(self, action):
    queued_action = self._queued_actions[:self.action_dim]
    next_obs, reward, done, info = self._wrapped_env.step(queued_action)
    self._queued_actions = np.concatenate(
        [self._queued_actions[self.action_dim:], action])
    return Step(next_obs, reward, done, **info)
def step(self, action):
    next_obs, reward, done, info = self.env.step(action)
    return Step(next_obs, reward, done, **info)
def step(self, action):
    next_obs, reward, done, info = self._wrapped_env.step(action)
    self.add_to_buffer(next_obs)
    return Step(self.buffer, reward, done, **info)