def step(self, action):
    """
    Perform a step in gazebo. When end of episode
    is reached, reset() should be called to reset
    the environment's internal state.

    Input
    -----
    action : an action provided by the agent

    Outputs
    -------
    (observation, reward, done, info)
    observation : agent's observation of the current environment
    reward : float amount of reward due to the previous action
    done : a boolean, indicating whether the episode has ended
    info : a dictionary containing other diagnostic information
           from the previous action
    """
    self._robot.send_command(action)

    obs = self.get_observation()

    reward = self.reward(obs.achieved_goal, self.goal)
    done = self.done(obs.achieved_goal, self.goal)
    next_observation = obs.observation
    return Step(observation=next_observation, reward=reward, done=done)

def step(self, action):
    """
    action map:
    0: left
    1: down
    2: right
    3: up

    :param action: should be a one-hot vector encoding the action
    :return: a Step containing the next state, the reward, and the done flag
    """
    possible_next_states = self.get_possible_next_states(self.state, action)

    probs = [x[1] for x in possible_next_states]
    next_state_idx = np.random.choice(len(probs), p=probs)
    next_state = possible_next_states[next_state_idx][0]

    next_x = next_state // self.n_col
    next_y = next_state % self.n_col

    next_state_type = self.desc[next_x, next_y]
    if next_state_type == 'H':
        done = True
        reward = 0
    elif next_state_type in ['F', 'S']:
        done = False
        reward = 0
    elif next_state_type == 'G':
        done = True
        reward = 1
    else:
        raise NotImplementedError

    self.state = next_state
    return Step(observation=self.state, reward=reward, done=done)

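# A minimal, self-contained sketch of the stochastic transition sampling used
# above, assuming `possible_next_states` is a list of (state, probability)
# pairs as a hypothetical `get_possible_next_states` might return. It only
# illustrates the np.random.choice pattern; the example transitions are made up.
import numpy as np

def sample_next_state(possible_next_states, rng=np.random):
    """Draw one successor state according to its transition probability."""
    probs = [p for (_, p) in possible_next_states]
    idx = rng.choice(len(probs), p=probs)
    return possible_next_states[idx][0]

# Example: from some state, slip left/forward/right with probability 0.1/0.8/0.1.
print(sample_next_state([(4, 0.1), (9, 0.8), (6, 0.1)]))
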
def step(self, action):
    """Step the environment state.

    Args:
        action (np.ndarray): The action to take in the environment.

    Returns:
        np.ndarray: Observation. The observation of the environment.
        float: Reward. The reward acquired at this time step.
        boolean: Done. Whether the environment was completed at this time
            step. Only True when the goal is reached and `self._never_done`
            is not set.
    """
    # enforce action space
    a = action.copy()  # NOTE: we MUST copy the action before modifying it
    a = np.clip(a, self.action_space.low, self.action_space.high)

    self._point = np.clip(self._point + a, -self._arena_size,
                          self._arena_size)
    dist = np.linalg.norm(self._point - self._goal)
    done = dist < np.linalg.norm(self.action_space.low)

    # dense reward
    reward = -dist
    # done bonus
    if done:
        reward += self._done_bonus

    # sometimes we don't want to terminate
    done = done and not self._never_done

    return Step(np.copy(self._point), reward, done, task=self._task)

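# A standalone sketch of the point-mass update above, with the environment
# state passed in explicitly instead of living on `self`. The arena size,
# goal, action limit, and done bonus are illustrative values, not taken from
# the original class.
import numpy as np

def point_step(point, action, goal, arena_size=5.0, action_limit=0.1,
               done_bonus=0.0, never_done=False):
    a = np.clip(action.copy(), -action_limit, action_limit)
    point = np.clip(point + a, -arena_size, arena_size)
    dist = np.linalg.norm(point - goal)
    done = dist < np.linalg.norm([-action_limit, -action_limit])
    reward = -dist                      # dense reward: negative distance
    if done:
        reward += done_bonus            # bonus for reaching the goal
    done = done and not never_done      # optionally never terminate
    return point, reward, done

point, reward, done = point_step(np.zeros(2), np.array([0.1, 0.0]),
                                 goal=np.array([1.0, 1.0]))
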
def step(self, action):
    """
    Perform a step in gazebo. When end of episode
    is reached, reset() should be called to reset
    the environment's internal state.

    Input
    -----
    action : an action provided by the agent

    Outputs
    -------
    (observation, reward, done, info)
    observation : agent's observation of the current environment
    reward : float amount of reward due to the previous action
    done : a boolean, indicating whether the episode has ended
    info : a dictionary containing other diagnostic information
           from the previous action
    """
    assert action.shape == self.action_space.shape

    a = action.copy()  # Note: you MUST copy the action if you modify it
    a *= self._action_scale
    a = np.clip(a, self.action_space.low, self.action_space.high)

    self._robot.send_command(a)

    obs = self.get_observation()

    reward = self.reward(obs.achieved_goal, self.goal)
    reward -= self._control_cost_coeff * np.linalg.norm(a)
    done = self.done(obs.achieved_goal, self.goal)
    next_observation = obs.observation
    return Step(observation=next_observation, reward=reward, done=done)

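# A sketch of the action pre-processing and control-cost penalty used above,
# assuming box action bounds `low`/`high`; the scale and cost coefficient are
# placeholder values, not the originals.
import numpy as np

def scale_clip_and_cost(action, low, high, action_scale=1.0,
                        control_cost_coeff=0.01):
    a = action.copy()                   # never modify the caller's array
    a *= action_scale
    a = np.clip(a, low, high)
    control_cost = control_cost_coeff * np.linalg.norm(a)
    return a, control_cost

a, cost = scale_clip_and_cost(np.array([0.5, -2.0]),
                              low=np.array([-1.0, -1.0]),
                              high=np.array([1.0, 1.0]))
# `cost` would then be subtracted from the task reward, as in the step above.
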
def step(self, action):
    r"""
    Run one timestep of the environment's dynamics. When end of episode
    is reached, reset() should be called to reset the environment's
    internal state.

    Parameters
    ----------
    action : array_like
        An action provided by the agent.

    Returns
    -------
    : :py:func:`garage.envs.base.Step`
        A step in the rollout. Contains the following information:
            - observation (array_like): Agent's observation of the current
              environment.
            - reward (float): Amount of reward due to the previous action.
            - done (bool): Is the current step a terminal or goal state,
              ending the rollout.
            - actions (array_like): The action taken at the current step.
            - state (array_like): The cloned simulation state at the current
              cell, used for resetting if chosen to start a rollout.
            - is_terminal (bool): Whether or not the current cell is a
              terminal state.
            - is_goal (bool): Whether or not the current cell is a goal
              state.
    """
    self._env_state_before_action = self._env_state.copy()

    self._action = action
    self._actions.append(action)
    action_return = self._action

    # Update the simulation by one step
    obs = self.simulator.step(self._action)
    if (obs is None) or self.open_loop or self.blackbox_sim_state:
        obs = np.array(self._init_state)
    if self.simulator.is_terminal() or self.simulator.is_goal():
        self._done = True

    # Calculate the reward for this step
    self._reward = self.reward_function.give_reward(
        action=self._action, info=self.simulator.get_reward_info())
    self._cum_reward += self._reward

    # Update instance attributes
    self._step = self._step + 1
    self._simulator_state = self.simulator.clone_state()
    self._env_state = np.concatenate((self._simulator_state,
                                      np.array([self._cum_reward]),
                                      np.array([self._step])),
                                     axis=0)

    return Step(observation=obs,
                reward=self._reward,
                done=self._done,
                actions=action_return,
                state=self._env_state_before_action,
                is_terminal=self.simulator.is_terminal(),
                is_goal=self.simulator.is_goal())

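# A sketch of how the packed `_env_state` vector above can be built and split
# back apart: the cloned simulator state, the cumulative reward, and the step
# counter are concatenated into one flat array. The layout is inferred from
# the np.concatenate call above; the simulator state here is a dummy vector.
import numpy as np

def pack_env_state(simulator_state, cum_reward, step):
    return np.concatenate((simulator_state,
                           np.array([cum_reward]),
                           np.array([step])), axis=0)

def unpack_env_state(env_state):
    simulator_state = env_state[:-2]
    cum_reward = env_state[-2]
    step = int(env_state[-1])
    return simulator_state, cum_reward, step

packed = pack_env_state(np.array([0.1, 0.2, 0.3]), cum_reward=-1.5, step=7)
assert unpack_env_state(packed)[2] == 7
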
def step(self, action):
    """
    Run one timestep of the environment's dynamics. When end of episode
    is reached, reset() should be called to reset the environment's
    internal state.

    Input
    -----
    action : an action provided by the agent

    Outputs
    -------
    (observation, reward, done, info)
    observation : agent's observation of the current environment
    reward [Float] : amount of reward due to the previous action
    done : a boolean, indicating whether the episode has ended
    info : a dictionary containing other diagnostic information
           from the previous action
    """
    self._action = action
    self._actions.append(action)
    action_return = self._action

    # Update the simulation by one step
    obs = self.simulator.step(self._action)
    if (obs is None) or self.open_loop or self.blackbox_sim_state:
        obs = np.array(self._init_state)
    if self.simulator.is_terminal() or self.simulator.is_goal():
        self._done = True

    # Calculate the reward for this step
    self._reward = self.reward_function.give_reward(
        action=self._action, info=self.simulator.get_reward_info())
    self._cum_reward += self._reward

    # Update instance attributes
    self._step = self._step + 1
    self._simulator_state = self.simulator.clone_state()
    self._env_state = np.concatenate((self._simulator_state,
                                      np.array([self._cum_reward]),
                                      np.array([self._step])),
                                     axis=0)

    if self._done:
        # Replay the full action sequence from the initial state as a sanity
        # check; drop into the debugger if the replay no longer reaches a
        # goal or terminal state.
        self.simulator.simulate(self._actions, self._init_state)
        if not (self.simulator.is_goal() or self.simulator.is_terminal()):
            pdb.set_trace()

    return Step(observation=obs,
                reward=self._reward,
                done=self._done,
                cache=self._info,
                actions=action_return,
                state=self._env_state,
                is_terminal=self.simulator.is_terminal(),
                is_goal=self.simulator.is_goal())

def step(self, action):
    # no clipping / rescaling of actions
    if self._control_method == "torque_control":
        self.forward_dynamics(action)
        self.sim.forward()
    else:
        reset_mocap2body_xpos(self.sim)
        self.sim.data.mocap_pos[:] = self.sim.data.mocap_pos + action
        for _ in range(50):
            self.sim.step()
    self._step += 1

    achieved_goal = self.position
    obs = self.get_obs()
    achieved_dist = self._goal_distance(achieved_goal, self._goal)

    # TODO sparse reward
    # dense reward: 0 at the start position, approaching 1 at the goal
    reward = 1. - achieved_dist / self._goal_distance(self._start_pos,
                                                      self._goal)

    done = (achieved_dist < self._distance_threshold)
    if done:
        reward = 20.

    info = {
        "episode": {
            "l": self._step,
            "r": reward,
            "d": done,
            "position": self.position,
            "task": np.copy(self.onehot)
        }
    }
    # just take gripper position as observation; never terminate the episode
    return Step(obs, reward, False, **info)

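# A sketch of the distance-based reward shaping above: the reward rises from
# 0 at the start position toward 1 at the goal, with a fixed bonus once the
# gripper is within the distance threshold. All numbers are illustrative.
import numpy as np

def reach_reward(achieved, goal, start, distance_threshold=0.05,
                 done_bonus=20.0):
    achieved_dist = np.linalg.norm(achieved - goal)
    start_dist = np.linalg.norm(start - goal)
    reward = 1.0 - achieved_dist / start_dist
    done = achieved_dist < distance_threshold
    if done:
        reward = done_bonus
    return reward, done

reward, done = reach_reward(np.array([0.4, 0.0, 0.2]),
                            goal=np.array([0.5, 0.0, 0.2]),
                            start=np.array([0.0, 0.0, 0.2]))
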
def step(self, action):
    # enforce action space
    a = action.copy()  # NOTE: we MUST copy the action before modifying it
    a = np.clip(a, self.action_space.low, self.action_space.high)

    # apply the clipped displacement to the point
    self._point = self._point + a

    dist = np.linalg.norm(self._point - self._goal)
    done = dist < np.linalg.norm(self.action_space.low)

    # dense reward
    reward = -dist
    # done bonus
    if done:
        reward += self._done_bonus

    # sometimes we don't want to terminate
    done = done and not self._never_done

    return Step(np.copy(self._point), reward, done)

def step(self, action):
    """
    :param action: power on left & right motor
    :type action: numpy.ndarray
    """
    if self.discretized:
        # discretization from {<-1,-0.33>, <-0.33,0.33>, <0.33,1>}
        # to [-1, 0, 1]
        action = np.clip(np.round(action * 1.5), a_min=-1, a_max=1)

    # Set new state
    next_pos, next_ori = self._get_new_position(action)
    self.agent_pos = next_pos
    self.agent_ori = next_ori
    obs = self.get_current_obs()

    # Determine reward and termination
    next_state_type = self._tile_type_at_pos(next_pos)
    if next_state_type == 'H':
        done = True
        reward = -1
    elif next_state_type in ['F', 'S']:
        done = False
        reward = -self.STEP_PENALTY
    elif next_state_type == 'G':
        done = True
        reward = 1
    else:
        raise NotImplementedError

    # Cache observation
    if self.do_caching:
        k = tuple(np.array(obs, dtype='int8'))
        v = (self.current_map_idx, tuple(self.agent_pos), self.agent_ori)
        if k not in self.states_cache:
            self.states_cache[k] = {v}
        else:
            self.states_cache[k].add(v)

    # Return observation and others
    return Step(observation=obs,
                reward=reward,
                done=done,
                agent_pos=self.agent_pos,
                agent_ori=self.agent_ori,
                map=self.current_map_idx)

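# A tiny check of the motor-command discretization above: values in
# <-1, -0.33>, <-0.33, 0.33>, and <0.33, 1> map to -1, 0, and 1 respectively
# after scaling by 1.5, rounding, and clipping.
import numpy as np

raw = np.array([-1.0, -0.5, -0.2, 0.0, 0.2, 0.5, 1.0])
discrete = np.clip(np.round(raw * 1.5), a_min=-1, a_max=1)
print(discrete)  # elementwise: [-1, -1, 0, 0, 0, 1, 1]
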
def step(self, action):
    assert self.hrl_policy is not None
    skill_path = skill_rollout(
        env=TfEnv(self.env),
        agent=self.hrl_policy.get_skill_policy(action),
        skill_stopping_func=self.hrl_policy.get_skill_stopping_func(action),
        max_path_length=self.hrl_policy.skill_max_timesteps,
        # do not reset the env, continue from the current state
        reset_start_rollout=False)
    next_obs = AsaEnv.get_current_obs_wrapped(self.env)
    reward = np.sum(skill_path['rewards'])
    term = skill_path['terminated'][-1]
    return Step(observation=next_obs,
                reward=reward,
                done=term,
                subpath_infos=SubpolicyPathInfo(skill_path,
                                                store=self.subpath_infos))

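# A self-contained sketch of the hierarchical step above: a selected skill
# (sub-policy) is rolled out on the inner environment until its stopping
# condition or a step limit, and the outer step returns the summed reward and
# the final termination flag. `skill_policy`, `stopping_func`, and the inner
# `env` protocol are assumptions, not the original skill_rollout implementation.
import numpy as np

def skill_step(env, obs, skill_policy, stopping_func, max_timesteps):
    rewards, done = [], False
    for _ in range(max_timesteps):
        obs, reward, done, _ = env.step(skill_policy(obs))
        rewards.append(reward)
        if done or stopping_func(obs):
            break
    return obs, float(np.sum(rewards)), done
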
def step(self, action):
    # no clipping / rescaling of actions
    if self._for_her:
        # actions are in [-1, 1]
        action = (action + 1.) / 2.
        # actions are in [0, 1]
        action = action * (self.action_space.high -
                           self.action_space.low) + self.action_space.low
        action = np.clip(action, self.action_space.low,
                         self.action_space.high)
    action = np.array(action).flatten()

    if self._control_method == "torque_control":
        self.forward_dynamics(action)
        self.sim.forward()
    else:
        reset_mocap2body_xpos(self.sim)
        self.sim.data.mocap_pos[0, :3] = (self.sim.data.mocap_pos[0, :3] +
                                          action[:3])
        self.set_gripper_state(action[3])
        for _ in range(1):
            self.sim.step()
        self.sim.forward()
    self._step += 1

    grasped = self._grasp()  # TODO fix grasp recognition (always returns False)
    object_pos = self.sim.data.get_site_xpos('object0')
    desired_goal = object_pos
    if self._for_her:
        if not grasped:
            achieved_goal = self.position
        else:
            achieved_goal = np.squeeze(object_pos.copy())
            desired_goal = self.goal
    else:
        achieved_goal = self.position

    obs = self.get_obs()

    # penalize x/y movement of object
    penalize_2d_motion = np.linalg.norm(object_pos[:2] -
                                        np.array(TASKS[self._task][:2]))
    # reward positive change in z direction (up)
    lifting = object_pos[2] - self.init_box_height
    achieved_dist = self._goal_distance(achieved_goal, desired_goal)

    # done once the object is grasped and lifted above a small threshold
    done = grasped and lifting > 0.005

    if self._for_her:
        reward = self.compute_reward(achieved_goal, desired_goal, {})
    else:
        if self._sparse_reward:
            reward = (0. - penalize_2d_motion + lifting +
                      float(grasped) * 0.3)
        else:
            reward = ((.3 + lifting * 30.) * float(grasped) + 1. -
                      achieved_dist / self._goal_distance(self._start_pos,
                                                          object_pos) -
                      penalize_2d_motion)
    if done:
        reward = 20.

    if self._for_her:
        info = {
            "l": self._step,
            "r": reward,
            "d": done,
            "grasped": grasped,
            "is_success": grasped and lifting > 0.005
        }
    else:
        info = {
            "episode": {
                "l": self._step,
                "r": reward,
                "d": done,
                "grasped": grasped,
                "position": self.position,
                "task": np.copy(self.onehot)
            }
        }
    # just take gripper position as observation
    return Step(obs, reward, done, **info)

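# A sketch of the [-1, 1] -> action-space rescaling applied above when
# `_for_her` is set: the normalized action is first mapped to [0, 1], then
# stretched to the box bounds and clipped. `low`/`high` stand in for
# `action_space.low` / `action_space.high`.
import numpy as np

def rescale_action(action, low, high):
    action = (np.asarray(action, dtype=float) + 1.0) / 2.0   # now in [0, 1]
    action = action * (high - low) + low                     # now in [low, high]
    return np.clip(action, low, high)

print(rescale_action([-1.0, 0.0, 1.0],
                     low=np.array([0.0, 0.0, 0.0]),
                     high=np.array([0.2, 0.2, 0.2])))  # [0.  0.1 0.2]
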
def step(self, action):
    next_obs, reward, done, info = self.env.step(action)
    return Step(next_obs, reward, done, **info)

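# A minimal sketch of the pass-through pattern above: the wrapper delegates
# step() to an inner environment and repackages the gym-style 4-tuple. The
# `Step` helper here is a simplified stand-in for garage.envs.base.Step, and
# `DummyEnv` is a made-up inner environment used only for the usage example.
import collections

_Step = collections.namedtuple('Step', ['observation', 'reward', 'done', 'info'])

def Step(observation, reward, done, **info):
    return _Step(observation, reward, done, info)

class PassthroughWrapper:
    def __init__(self, env):
        self.env = env

    def step(self, action):
        next_obs, reward, done, info = self.env.step(action)
        return Step(next_obs, reward, done, **info)

class DummyEnv:
    def step(self, action):
        return action, 0.0, False, {'note': 'noop'}

print(PassthroughWrapper(DummyEnv()).step([1, 2]))
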