def step(self, action):
    self.forward_dynamics(action)
    next_obs = self.get_current_obs()
    # print(next_obs)
    distance_to_goal = self.get_distance_to_goal()
    # reward = -distance_to_goal
    # modified reward with more emphasis on the xy-position
    vec = self.get_vec_to_goal()
    reward = -np.linalg.norm(vec, ord=1)
    # print('reward = ', reward)
    done = False
    if reward > -0.03:
        done = True
        reward = reward + 100
        print("******** Reached Target ********")
    if self.kill_outside and (distance_to_goal > self.kill_radius):
        done = True
        reward = reward - 100
        print("******** OUT of region ********")
    return Step(next_obs, reward, done)
def step(self, action): self.forward_dynamics(action) com = self.get_body_com("torso") # ref_x = x + self._init_torso_x #print(self._goal_idx) goal_reward = -np.sum(np.abs(com[:2] - self.goals[self._goal_idx]) ) + 4.0 # make it happy, not suicidal lb, ub = self.action_bounds scaling = (ub - lb) * 0.5 ctrl_cost = 0.5 * 1e-2 * np.sum(np.square(action / scaling)) contact_cost = 0.5 * 1e-3 * np.sum( np.square(np.clip(self.model.data.cfrc_ext, -1, 1))), survive_reward = 0.05 reward = goal_reward - ctrl_cost - contact_cost + survive_reward state = self._state notdone = np.isfinite(state).all() \ and state[2] >= 0.2 and state[2] <= 1.0 done = not notdone ob = self.get_current_obs() return Step(ob, float(reward), done)
def step(self, action):
    self.model.data.ctrl = action
    reward = 0
    timesInHand = 0
    for _ in range(self.frame_skip):
        self.model.step()
        step_reward = self.reward()
        timesInHand += step_reward > 0
        reward += step_reward
    done = reward == 0 and self.numClose > 0  # stop if the block has been flung away
    ob = self.get_current_obs()
    new_com = self.model.data.com_subtree[0]
    self.dcom = new_com - self.current_com
    self.current_com = new_com
    return Step(ob, float(reward), done, timeInHand=timesInHand)
def step(self, act):
    self.state, r = self.qmdp.transition(self.state, act)
    self.state_img = self.process_goals(self.state)
    obs = np.zeros((self.params['grid_n'], self.params['grid_m'], 3))
    obs[:, :, 0] = self.env_img
    obs[:, :, 1] = self.goal_img
    obs[:, :, 2] = self.state_img
    done = False
    self.act = act
    self.step_count = self.step_count + 1
    # if np.isclose(r, self.params['R_obst']) or self.step_count > self.params['traj_limit']:
    current_coord = self.state_lin_to_bin(self.state)
    goal_coord = self.state_lin_to_bin(self.goal_state)
    dist = np.sqrt((current_coord[0] - goal_coord[0]) ** 2 +
                   (current_coord[1] - goal_coord[1]) ** 2)
    r += self.params['kdist'] * dist
    if self.step_count > self.params['traj_limit'] or \
            self.goal_img[current_coord[0]][current_coord[1]] == 1:
        done = True
    return Step(observation=obs, reward=r, done=done)
def step(self, action):
    self.forward_dynamics(action)
    next_obs = self.get_current_obs()
    lb, ub = self.action_bounds
    scaling = (ub - lb) * 0.5
    ctrl_cost = 0.5 * self.ctrl_cost_coeff * np.sum(
        np.square(action / scaling))
    forward_reward = np.linalg.norm(
        self.get_body_comvel("torso"))  # the swimmer cannot exploit this reward by jumping
    reward = forward_reward - ctrl_cost
    done = False
    if self.sparse_rew:
        if abs(self.get_body_com("torso")[0]) > 100.0:
            reward = 1.0
            done = True
        else:
            reward = 0.
    com = np.concatenate([self.get_body_com("torso").flat]).reshape(-1)
    ori = self.get_ori()
    return Step(next_obs, reward, done, com=com, ori=ori)
def step(self, action):
    qpos = np.copy(self.model.data.qpos)
    qpos[2, 0] += action[1]
    ori = qpos[2, 0]
    # compute increment in each direction
    dx = math.cos(ori) * action[0]
    dy = math.sin(ori) * action[0]
    # ensure that the robot is within reasonable range
    qpos[0, 0] = np.clip(qpos[0, 0] + dx, -self.size, self.size)
    qpos[1, 0] = np.clip(qpos[1, 0] + dy, -self.size, self.size)
    self.model.data.qpos = qpos
    self.model.forward()
    next_obs = self.get_current_obs()
    if self.align_mode:
        reward = max(self.reward_dir[0] * dx + self.reward_dir[1] * dy, 0)
    else:
        x, y = qpos[0, 0], qpos[1, 0]
        reward = -y * dx + x * dy
        reward /= (1 + np.abs(np.sqrt(x ** 2 + y ** 2) - self.target_dist))
    return Step(next_obs, reward, False)
def step(self, action):
    if isinstance(self._wrapped_env.action_space, Box) or isinstance(
            self._wrapped_env.action_space, gymBox):
        # rescale the action
        if isinstance(self._wrapped_env.action_space, Box):
            lb, ub = self._wrapped_env.action_space.bounds
        else:
            lb = self._wrapped_env.action_space.low
            ub = self._wrapped_env.action_space.high
        scaled_action = lb + (action + 1.) * 0.5 * (ub - lb)
        scaled_action = np.clip(scaled_action, lb, ub)
    else:
        scaled_action = action
    wrapped_step = self._wrapped_env.step(scaled_action)
    next_obs, reward, done, info = wrapped_step
    if self._normalize_obs:
        next_obs = self._apply_normalize_obs(next_obs)
    if self._normalize_reward:
        reward = self._apply_normalize_reward(reward)
    return Step(next_obs, reward * self._scale_reward, done, **info)
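# Illustrative sketch (not part of the wrapper above): the affine rescaling it
# applies, mapping a policy action in [-1, 1] elementwise onto a Box's [lb, ub]
# bounds and clipping the result. The bounds below are made-up example values.
import numpy as np

def rescale_action(action, lb, ub):
    scaled = lb + (action + 1.0) * 0.5 * (ub - lb)
    return np.clip(scaled, lb, ub)

lb, ub = np.array([-2.0, 0.0]), np.array([2.0, 1.0])
print(rescale_action(np.array([0.0, 1.0]), lb, ub))  # -> [0. 1.]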
def step(self, action):
    _, reward, done, info = self.wrapped_env.step(action)
    next_obs = self.get_current_obs()
    if self._circle_mode:
        pos = self.wrapped_env.get_body_com("torso")
        vel = self.wrapped_env.get_body_comvel("torso")
        dt = self.wrapped_env.model.opt.timestep
        x, y = pos[0], pos[1]
        dx, dy = vel[0], vel[1]
        reward = -y * dx + x * dy
        reward /= (1 + np.abs(np.sqrt(x ** 2 + y ** 2) - self._target_dist))
        if self._nonlinear_reward:
            reward *= np.abs(reward)
    self._step += 1
    if self._max_path_length_range is not None:
        if self._step > self._last_step:
            done = True
    return Step(next_obs, reward, done, **info)
def step(self, action):
    if self.MANUAL_COLLISION:
        old_pos = self.wrapped_env.get_xy()
        inner_next_obs, inner_rew, done, info = self.wrapped_env.step(action)
        new_pos = self.wrapped_env.get_xy()
        if self._is_in_collision(new_pos):
            self.wrapped_env.set_xy(old_pos)
            done = False
    else:
        inner_next_obs, inner_rew, done, info = self.wrapped_env.step(action)
    next_obs = self.get_current_obs()
    reward = self.coef_inner_rew * inner_rew
    info['inner_rew'] = inner_rew
    if self.include_maze_obs:
        info['xy_pos'] = self.goal_observation
    return Step(next_obs, reward, done, **info)
def step(self, Action):
    action = Action['action']
    action = np.tanh(action)
    data = self._conn.sendreq({"cmd": "step", "action": action.tolist()})
    prev_action = self.prev_action
    self.prev_action = action
    assert 'obs' in data
    assert 'rew' in data
    assert 'done' in data
    assert 'info' in data
    reward = data['rew']
    if Action['policy_num'] == 2:
        if data["info"]["fail"]:
            reward = 200.0
        else:
            dacc = action[2]
            dsteer = action[3]
            reward = -0.05 * (np.abs(dacc) + np.abs(dsteer))
    return Step(observation=data['obs'], reward=reward, done=data['done'])
def step(self, action):
    action = action * 0.12
    qpos = np.copy(self.model.data.qpos)
    # qpos[2, 0] += action[1]
    # ori = qpos[2, 0]
    # # compute increment in each direction
    # dx = math.cos(ori) * action[0]
    # dy = math.sin(ori) * action[0]
    # # ensure that the robot is within reasonable range
    # qpos[0, 0] = np.clip(qpos[0, 0] + dx, -7, 7)
    # qpos[1, 0] = np.clip(qpos[1, 0] + dy, -7, 7)
    # self.model.data.qpos = qpos
    # self.model.forward()
    # next_obs = self.get_current_obs()
    xy = [qpos[0, 0] + action[0], qpos[1, 0] + action[1]]
    self.set_xy(xy)
    next_obs = self.get_xy()
    return Step(next_obs, 0, False)
def step(self, action): """ Note: override this method with great care, as it post-processes the observations, etc. """ reward_computer = self.compute_reward(action) # forward the state action = self._inject_action_noise(action) for _ in range(self.frame_skip): self.forward_dynamics(action) # notifies that we have stepped the world next(reward_computer) # actually get the reward reward = next(reward_computer) self._invalidate_state_caches() self.steps += 1 done = self.is_current_done() or self.steps >= 400 next_obs = self.get_current_obs() return Step(observation=next_obs, reward=reward, done=done)
def step(self, action): #next state = what happens after taking "action" temp_state = np.copy(self._state) dt = 0.1 temp_state[0] = self._state[ 0] + self._state[2] * dt + 0.5 * action[0] * dt * dt temp_state[1] = self._state[ 1] + self._state[3] * dt + 0.5 * action[1] * dt * dt temp_state[2] = self._state[2] + action[0] * dt temp_state[3] = self._state[3] + action[1] * dt self._state = np.copy(temp_state) #make the reward what you care about x, y, vx, vy = self._state reward = vx - np.sqrt( abs(y - 0) ) #we care about moving in the forward x direction... and keeping our y value close to 0... (aka "going straight") done = 0 #x>500 #when do you consider this to be "done" (rollout stops... "terminal") next_observation = np.copy(self._state) return Step(observation=next_observation, reward=reward, done=done)
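# Worked example (hypothetical numbers) for the Euler-integrated double
# integrator above: state = [x, y, vx, vy], one step with dt = 0.1.
import numpy as np

state = np.array([0.0, 0.0, 1.0, 0.0])   # at the origin, moving forward at 1 unit/s
action = np.array([0.5, 0.0])            # accelerate in x only
dt = 0.1
x_next = state[0] + state[2] * dt + 0.5 * action[0] * dt * dt   # 0 + 0.1 + 0.0025 = 0.1025
vx_next = state[2] + action[0] * dt                             # 1 + 0.05 = 1.05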
def step(self, action):
    self.wrapped_env.forward_dynamics(action)
    next_obs = self.get_current_obs()
    alive_bonus = self.wrapped_env.alive_bonus
    data = self.wrapped_env.model.data
    comvel = self.wrapped_env.get_body_comvel("torso")
    lin_vel_reward = comvel[0]
    vel_deviation_cost = 0.5 * self.wrapped_env.vel_deviation_cost_coeff * np.sum(
        np.square(comvel[1:]))
    vel_reward = lin_vel_reward - vel_deviation_cost
    lb, ub = self.action_bounds
    scaling = (ub - lb) * 0.5
    ctrl_cost = .5 * self.wrapped_env.ctrl_cost_coeff * np.sum(
        np.square(action / scaling))
    impact_cost = .5 * self.wrapped_env.impact_cost_coeff * np.sum(
        np.square(np.clip(data.cfrc_ext, -1, 1)))
    done = data.qpos[2] < 0.8 or data.qpos[2] > 2.0
    if self._circle_mode:
        pos = self.wrapped_env.get_body_com("torso")
        vel = self.wrapped_env.get_body_comvel("torso")
        dt = self.wrapped_env.model.opt.timestep
        x, y = pos[0], pos[1]
        dx, dy = vel[0], vel[1]
        vel_reward = -y * dx + x * dy
        vel_reward /= (1 + np.abs(np.sqrt(x ** 2 + y ** 2) - self._target_dist))
    reward = vel_reward + alive_bonus - ctrl_cost - impact_cost
    self._step += 1
    if self._max_path_length_range is not None:
        if self._step > self._last_step:
            done = True
    return Step(next_obs, reward, done)
def step(self, action): if not hasattr(self, "iteration"): self.iteration = 0 import pdb; pdb.set_trace() self.init_pos = self.get_body_com("distal_4") self.frame_skip = 5 pobj = self.get_body_com("object") pdistr = self.get_body_com("distractor") pgoal = self.get_body_com("goal") ptip = self.get_body_com("distal_4") reward_ctrl = - np.square(action).sum() # if self.iteration >= 100 and self.iteration < 130:# and np.mean(self.dist[-10:]) <= 0.017: # # print('going back!') # reward_dist = - np.linalg.norm(self.init_pos[:-1]-ptip[:-1]) # reward_distr = np.linalg.norm(pdistr[:-1]-ptip[:-1]) # # reward = reward_dist + 0.01 * reward_ctrl# + 0.1 * reward_distr # reward = 1.1 * reward_dist + 0.1 * reward_ctrl + 0.1 * reward_distr# + 0.1 * reward_distr # # reward = reward_dist + 0.1 * reward_ctrl + 0.1 * reward_distr# + 0.1 * reward_distr # elif self.iteration >= 130: # reward_dist = - np.linalg.norm(pgoal[:-1]-pobj[:-1]) # reward_dist_distr = - np.linalg.norm(pgoal[:-1]-pdistr[:-1]) # reward_near = - np.linalg.norm(pdistr[:-1]-ptip[:-1]) # # reward_return = - np.linalg.norm(self.init_pos-ptip) # reward = reward_dist + reward_dist_distr + 0.1 * reward_ctrl + 0.5 * reward_near # else: # reward_dist = - np.linalg.norm(pgoal[:-1]-pobj[:-1]) # reward_near = - np.linalg.norm(pobj[:-1] - ptip[:-1]) # # reward_return = - np.linalg.norm(self.init_pos-ptip) # self.dist.append(-reward_dist) # reward = reward_dist + 0.1 * reward_ctrl + 0.5 * reward_near reward_dist = - np.linalg.norm(pgoal-pobj) reward_dist_distr = - np.linalg.norm(pgoal-pdistr) reward_near = - np.linalg.norm(pdistr-ptip) # reward_return = - np.linalg.norm(self.init_pos-ptip) reward = reward_dist + reward_dist_distr + 0.1 * reward_ctrl + 0.5 * reward_near self.forward_dynamics(action) # TODO - frame skip next_obs = self.get_current_obs() done = False self.iteration += 1 return Step(next_obs, reward, done)
def step(self, actid):
    action = self.actions[actid]
    load = self.state[0]
    Pij, Qij, Vi2, Lij, total_loss, iter_total, convergence_flag = utils.load_flow(
        action, load)
    if convergence_flag != 1:
        print('power flow did not converge')
    Vi = np.sqrt(Vi2)
    # penalty for voltage constraint violation (0.01); total loss; tap change
    # reward = -0.01*np.sum(Vi<0.95) - 0.01*np.sum(Vi>1.05) - total_loss - 0.001*np.sum(np.abs(action - self.state[24:27]))
    # reward = -total_loss*10*100
    info = {}
    info['gain'] = -(total_loss * 5 * 40 +
                     0.1 * np.sum(np.abs(action - self.state[1:6])))  # bus4: 10; bus34: 5
    info['cost'] = np.sum(Vi < 0.95) + np.sum(Vi > 1.05)
    reward = info['gain'] - info['cost']
    if (self.local_time + 1) % 168 == 0:
        done = True
    else:
        done = False
    # update to next state
    self.global_time += 1
    if self.mode == 'Train' and self.global_time >= self.load_len_train:
        self.global_time = 0
    if self.mode == 'Test' and self.global_time >= self.load_len_test:
        self.global_time = 0
    self.local_time += 1
    self.state = np.zeros(1 + 5 + 1)
    if self.mode == 'Train':
        self.state[:1] = self.load_train[self.global_time]
    else:
        self.state[:1] = self.load_test[self.global_time]
    self.state[1:6] = action  # current tap
    self.state[6] = self.local_time
    return Step(self.state, reward, done, **info)
def step(self, action):
    self.forward_dynamics(action)
    next_obs = self.get_current_obs()
    action = np.clip(action, *self.action_bounds)
    reward_ctrl = -1e-1 * 0.5 * np.sum(np.square(action))
    norm_next_obs = (next_obs - self._task_config.s_mean) / self._task_config.s_std
    reward_run = np.sum(norm_next_obs * self._task_config.goal_velocity *
                        self._task_config.coef)
    reward_run = reward_run.astype(np.float32)
    reward = reward_ctrl + reward_run
    done = False
    reward_state = norm_next_obs
    return Step(next_obs, reward, done,
                reward_ctrl=reward_ctrl, reward_state=reward_state)
def step(self, action, **kwargs):
    """
    Run one timestep of the environment's dynamics. When the end of an
    episode is reached, reset() should be called to reset the environment's
    internal state.

    Input
    -----
    action : an action provided by the agent

    Outputs
    -------
    (observation, reward, done, info)
    observation : agent's observation of the current environment
    reward [Float] : amount of reward due to the previous action
    done : a boolean, indicating whether the episode has ended
    info : a dictionary containing other diagnostic information from the
        previous action
    """
    prev_state = self._state
    self._state = prev_state + np.clip(action, -0.1, 0.1)
    reward = self.reward(prev_state, action, self._state)
    done = self.done(self._state)
    next_observation = np.copy(self._state)
    return Step(next_observation, reward, done)
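# Hypothetical usage sketch of the (observation, reward, done, info) contract
# documented above, using a throwaway dummy environment; none of the names
# below come from the original code base.
import numpy as np

class DummyEnv:
    def reset(self):
        self.t = 0
        return np.zeros(2)

    def step(self, action):
        self.t += 1
        obs = np.full(2, float(self.t))
        reward = -float(np.abs(action).sum())
        done = self.t >= 5
        return obs, reward, done, {}

env = DummyEnv()
obs, done, total_reward = env.reset(), False, 0.0
while not done:
    obs, reward, done, info = env.step(np.array([0.1, -0.1]))
    total_reward += reward
print("episode return:", total_reward)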
def step(self, action, collectingInitialData=False): xposbefore = self.get_body_com("torso")[0] self.forward_dynamics(action) comvel = self.get_body_comvel("torso") forward_reward = comvel[0] xposafter = self.get_body_com("torso")[0] lb, ub = self.action_bounds scaling = (ub - lb) * 0.5 ctrl_cost = 0.5 * 1e-2 * np.sum(np.square(action / scaling)) contact_cost = 0 survive_reward = 0.05 reward = forward_reward - ctrl_cost - contact_cost + survive_reward state = self._state notdone = np.isfinite(state).all() \ and self.get_body_com("torso")[2] >= 0.3 and self.get_body_com("torso")[2] <= 1.0 #used to be 0.2, state[2] done = not notdone ob = self.get_current_obs() return Step(ob, float(reward), done)
def step(self, action): self.forward_dynamics(action) comvel = self.get_body_comvel("torso") forward_reward = comvel[0] lb, ub = self.action_bounds scaling = (ub - lb) * 0.5 ctrl_cost = 0.5 * 1e-2 * np.sum(np.square(action / scaling)) contact_cost = 0.5 * 1e-3 * np.sum( np.square(np.clip(self.model.data.cfrc_ext, -1, 1))), survive_reward = 0.05 reward = forward_reward - ctrl_cost - contact_cost + survive_reward state = self._state notdone = np.isfinite(state).all() \ and state[2] >= 0.2 and state[2] <= 1.0 done = not notdone ob = self.get_current_obs() if forward_reward <= 3.0: reward = 0. else: reward = 1.0 return Step(ob, float(reward), done)
def step(self, action, collectingInitialData=False):
    xposbefore = self.model.data.qpos[0, 0]
    self.forward_dynamics(action)
    next_obs = self.get_current_obs()
    action = np.clip(action, *self.action_bounds)
    ctrl_cost = 1e-1 * 0.5 * np.sum(np.square(action))
    run_cost = -1 * self.get_body_comvel("torso")[0]
    cost = ctrl_cost + run_cost
    # rllab reward
    reward = -cost
    # OpenAI Gym reward
    # xposafter = self.model.data.qpos[0, 0]
    # reward_ctrl = -0.1 * np.square(action).sum()
    # reward_run = (xposafter - xposbefore) / self.model.opt.timestep
    # reward = reward_ctrl + reward_run
    done = False
    return Step(next_obs, reward, done)
def step(self, action):
    self.forward_dynamics(action)
    alive_bonus = self.alive_bonus
    data = self.model.data
    comvel = self.get_body_comvel("torso")
    lin_vel_reward = comvel[0]
    lb, ub = self.action_bounds
    scaling = (ub - lb) * 0.5
    ctrl_cost = .5 * self.ctrl_cost_coeff * np.sum(
        np.square(action / scaling))
    impact_cost = 0.
    vel_deviation_cost = 0.5 * self.vel_deviation_cost_coeff * np.sum(
        np.square(comvel[1:]))
    reward = lin_vel_reward + alive_bonus - ctrl_cost - \
        impact_cost - vel_deviation_cost
    pos = data.qpos.flat[2]
    done = pos < 0.8 or pos > 2.0
    next_obs = self.get_current_obs()
    return Step(next_obs, reward, done)
def step(self, action):
    self.forward_dynamics(action)
    next_obs = self.get_current_obs()
    blockchoice = self.goal[0]
    curr_block_xidx = int(3 + 2 * blockchoice)
    curr_block_yidx = int(4 + 2 * blockchoice)
    # TODO: maybe need to change angle here
    curr_gripper_pos = self.model.data.site_xpos[0, :2]
    curr_block_pos = np.array(
        [next_obs[curr_block_xidx], next_obs[curr_block_yidx]])
    goal_pos = self.goal[1:3]
    dist_to_block = np.linalg.norm(curr_gripper_pos - curr_block_pos)
    block_dist = np.linalg.norm(goal_pos - curr_block_pos)
    goal_dist = np.linalg.norm(goal_pos)
    if self.sparse and block_dist > 0.2:
        reward = -5 * goal_dist
    else:
        reward = -5 * block_dist
    done = False
    return Step(next_obs, reward, done)
def step(self, action): # self.forward_dynamics(action) # comvel = self.get_body_comvel("torso") # forward_reward = self.goal_direction*comvel[0] # lb, ub = self.action_bounds # scaling = (ub - lb) * 0.5 # ctrl_cost = 0.5 * 1e-2 * np.sum(np.square(action / scaling)) # contact_cost = 0.5 * 1e-3 * np.sum( # np.square(np.clip(self.model.data.cfrc_ext, -1, 1))), # survive_reward = 0.05 # reward = forward_reward - ctrl_cost - contact_cost + survive_reward # state = self._state # notdone = np.isfinite(state).all() \ # and state[2] >= 0.2 and state[2] <= 1.0 # done = not notdone # ob = self.get_current_obs() # return Step(ob, float(reward), done) # obs = self.get_current_obs() # selected_arm_mean = self.arm_means[action] # reward = float(np.random.random() < selected_arm_mean) # self.ts += 1 # done = self.ts >= self.max_path_length # state = np.zeros((2)) # state[0] = reward # state[1] = 1 # return Step(state, reward, done) ps = self.Ps[self.state, action] next_state = special.weighted_sample(ps, np.arange(self.n_states)) reward_mean = self.Rs[self.state, action] reward = reward_mean + np.random.normal() * 1 / np.sqrt(self.tau) self.ts += 1 self.state = next_state done = self.ts >= self.max_path_length state = np.zeros((2 + self.n_states)) state[self.state] = 1 state[self.n_states] = reward state[self.n_states + 1] = done return Step(state, reward, done)
def step(self, action):
    assert self.action_space.contains(
        action), "%r (%s) invalid" % (action, type(action))
    # Apply the action
    c = np.asarray(action)
    buff = struct.pack('%sf' % len(c), *c)
    self.s.send(buff)
    # Receive the state
    stateSize = self.s.recv(4)
    stateSize = struct.unpack('i', stateSize)[0]
    state = self.s.recv(1000)
    while len(state) < stateSize:
        state += self.s.recv(1000)
    state = np.asarray([
        struct.unpack('f', state[i:i + 4])[0]
        for i in range(0, len(state), 4)
    ])
    # Get the y position of the root joint
    y = state[3]
    done = y < self.y_threshold
    if not done:
        reward = 1.0
    elif self.steps_beyond_done is None:
        # skeleton just fell!
        self.steps_beyond_done = 0
        reward = -100.0
    else:
        if self.steps_beyond_done == 0:
            logger.warn(
                "You are calling 'step()' even though this environment has "
                "already returned done = True. You should always call 'reset()' "
                "once you receive 'done = True' -- any further steps are "
                "undefined behavior.")
        self.steps_beyond_done += 1
        reward = 0.0
    next_observation = np.array(self.state)
    return Step(observation=next_observation, reward=reward, done=done)
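# Illustrative sketch (separate from the socket env above) of the struct-based
# wire format it assumes: float32 values packed back-to-back in native byte
# order and decoded four bytes at a time. The values here are made up.
import struct
import numpy as np

values = np.array([0.5, -1.25, 3.0], dtype=np.float32)
buff = struct.pack('%sf' % len(values), *values)      # pack consecutive floats
decoded = [struct.unpack('f', buff[i:i + 4])[0]       # one float per 4-byte slice
           for i in range(0, len(buff), 4)]
assert np.allclose(decoded, values)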
def step(self, action):
    A, B = self._dynamics[self._get_dynamics_id()]
    if self.force_act is not None:
        action = self.force_act
    chosen_controller = self._controllers[action]
    process_noise = np.random.standard_normal(size=A.shape) * self.NOISE
    control_gain = (A - B.dot(chosen_controller)) + process_noise
    if self.normalize_action:
        assert False
        gradient = control_gain.dot(self.state)
        next_state = 0.2 * gradient / np.linalg.norm(gradient) + self.state
    else:
        next_state = 0.2 * control_gain.dot(
            self.state / np.linalg.norm(self.state)) + self.state
    reward = -np.linalg.norm(next_state) ** 2  # quad reward
    if self.observation_space.contains(next_state):
        done = abs(reward) < 0.4  # will change
    else:
        done = True
    # print reward
    self.state = next_state.copy()
    return Step(observation=self.state, reward=reward, done=done)
def step(self, action): #action = np.clip(action, -0.1, 0.1) tmp_finger = self.get_body_com("fingertip") vec = self.get_body_com("fingertip") - self._goal reward_dist = -np.linalg.norm(vec) #print(vec,reward_dist) reward_ctrl = -np.square(action).sum() sparse_reward = self.sparsify_rewards(reward_dist) reward = reward_dist + reward_ctrl sparse_reward = sparse_reward + reward_ctrl reward = sparse_reward self.do_simulation(action, self.frame_skip) ob = self._get_obs() done = False env_infos = dict(finger=tmp_finger.tolist(), reward_dist=reward_dist, reward_ctrl=reward_ctrl, sparse_reward=sparse_reward, goal=self._goal) #print(env_infos['finger']) return Step(ob, reward, done)
def step(self, action):
    if self.MANUAL_COLLISION:
        old_pos = self.wrapped_env.get_xy()
        _, _, done, info = self.wrapped_env.step(action)
        new_pos = self.wrapped_env.get_xy()
        if self._is_in_collision(new_pos):
            self.wrapped_env.set_xy(old_pos)
            done = False
    else:
        _, _, done, info = self.wrapped_env.step(action)
    next_obs = self.get_current_obs()
    x, y = self.wrapped_env.get_body_com("torso")[:2]
    # ref_x = x + self._init_torso_x
    # ref_y = y + self._init_torso_y
    reward = 0
    minx, maxx, miny, maxy = self._goal_range
    # print("goal range: x [%s,%s], y [%s,%s], now [%s,%s]" % (
    #     str(minx), str(maxx), str(miny), str(maxy), str(x), str(y)))
    if minx <= x <= maxx and miny <= y <= maxy:
        done = True
        reward = 1
    return Step(next_obs, reward, done, **info)
def step(self, action): self.forward_dynamics(action) comvel = self.get_body_comvel("torso") if self._task_config.goal_velocity == -math.inf: forward_reward = -1 * comvel[0] elif self._task_config.goal_velocity == math.inf: forward_reward = comvel[0] else: forward_reward = -np.abs(comvel[0] - self._task_config.goal_velocity) + 1.0 lb, ub = self.action_bounds scaling = (ub - lb) * 0.5 ctrl_cost = 0.5 * 1e-2 * np.sum(np.square(action / scaling)) contact_cost = 0. # contact_cost = 0.5 * 1e-3 * np.sum( # np.square(np.clip(self.model.data.cfrc_ext, -1, 1))), survive_reward = 0.05 reward = forward_reward - ctrl_cost - contact_cost + survive_reward state = self._state notdone = np.isfinite(state).all() and state[2] >= 0.2 and state[2] <= 1.0 done = not notdone ob = self.get_current_obs() return Step(ob, float(reward), done)
def step(self, action):
    self.model.data.ctrl = action
    # Taking steps in the environment
    reward = 0
    for _ in range(self.frame_skip):
        self.model.step()
        step_reward = self.timestep_reward()
        reward += step_reward
    # Reached the end of the trajectory
    done = False
    onGround = self.touching_group(
        "geom_object",
        ["ground", "goal_wall1", "goal_wall2", "goal_wall3", "goal_wall4"])
    if onGround and self.numClose > 10:
        reward += self.final_reward()
        done = True
    ob = self.get_current_obs()
    new_com = self.model.data.com_subtree[0]
    self.dcom = new_com - self.current_com
    self.current_com = new_com
    # Recording metrics
    obj_position = self.get_body_com("object")
    goal_position = self.get_body_com("goal")
    distance = np.linalg.norm((goal_position - obj_position)[:2])
    normalizedDistance = distance / self.init_block_goal_dist
    return Step(ob, float(reward), done,
                distance=distance, norm_distance=normalizedDistance)