Example #1
    def step(self, action):
        if self._symmetric_action_space:
            new_action = torch.as_tensor(
                action.round().clip(0, self._max_stock) + self._max_stock / 2,
                dtype=torch.int32)
        else:
            new_action = torch.as_tensor(action, dtype=torch.int32).clamp(
                0, self._max_stock)
        if self.day_position % self._substep_count == 0:
            order_cost = self._make_fast_order(new_action)
            (sales, availability) = \
                self._generateDemand(self.real.clamp_(0.0, 1.))
            waste = self._waste()  # Update waste and store result
            self._reduceShelfLives()
            self._step_counter += 1
            self._updateEnv()
        else:
            self.day_position += 1
            order_cost = self._make_order(new_action)
            (sales, availability) = \
                self._generateDemand(self.real.clamp_(0.0, 1.))
            waste = 0  # By default, no waste before the end of day
            self._updateObs()
        sales.sub_(order_cost)
        utility = self.utility_function.reward(sales, waste, availability)
        done = self._step_counter == self.horizon
        info = EnvInfo(sales=sales,
                       availability=availability,
                       waste=waste,
                       reward=utility,
                       traj_done=done)
        return EnvStep(self.get_obs(), utility, done, info)
Example #2
File: atari_env.py  Project: codelast/rlpyt
    def step(self, action):
        """
        在environment中向前走一步。 这个函数在Collector类(例如)的collect_batch()函数中会被调用。
        注意:policy network的前向传播过程不是在这里发生的,而是在agent类(例如DqnAgent)的step()函数里发生(由Collector类的
        collect_batch()函数调用)。environment里的step(),输入的action已经是policy network推断出来的action了,在这里做的工作主要是:
        计算该action带来的reward,判断trajectory是否结束,记录一些统计信息等。

        :param action: 一个标量,其值在 self._action_set 的index范围内。TODO: 确认是否正确?
        :return: 一个 EnvStep 对象,包含observation等数据。
        """
        a = self._action_set[action]  # 从action set(动作集)中取出一个具体的action
        game_score = np.array(
            0., dtype="float32"
        )  # 游戏分数,其实就是一个标量值。这个函数里算出来的只是当前step的score而不是整个游戏过程的score
        # 可以设置每一个step走游戏的几帧,这里就连续地执行N-1(假设N为帧数)次action
        for _ in range(self._frame_skip - 1):
            game_score += self.ale.act(
                a)  # 执行一个action,得到一个score,累加到原来已经得到的分数上,这里累加也只是累加本step内的分数
        self._get_screen(1)
        game_score += self.ale.act(a)  # 上面skip的frame,还差一帧,这里补上执行一次action
        lost_life = self._check_life(
        )  # Advances from lost_life state. 看看游戏角色是不是挂了
        if lost_life and self._episodic_lives:
            self._reset_obs()  # Internal reset.
        self._update_obs()
        # 奖励值。当设置了_clip_reward的时候使用-1,0,1作为reward,否则就使用真实的游戏分数作为reward
        reward = np.sign(game_score) if self._clip_reward else game_score
        game_over = self.ale.game_over(
        ) or self._step_counter >= self.horizon  # 判断游戏是不是结束了,当horizon达到阈值时也结束
        done = game_over or (self._episodic_lives and lost_life)  # bool类型
        info = EnvInfo(game_score=game_score,
                       traj_done=game_over)  # 当前environment的一些信息,比如游戏分数等
        self._step_counter += 1  # 用于统计走了多少个step的计数器
        return EnvStep(self.get_obs(), reward, done, info)
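The docstring above describes the sampling flow: a collector asks the agent for an action (the policy network forward pass), then hands that action to the environment, which only computes the reward and checks for termination. A minimal sketch of that loop, using hypothetical `agent` and `env` objects rather than rlpyt's actual Collector code:

    # Hypothetical sampling loop illustrating the flow described in the docstring.
    # agent.step() runs the policy network; env.step() only scores the chosen action.
    def collect_steps(agent, env, n_steps):
        obs = env.reset()
        total_score = 0.0
        for _ in range(n_steps):
            action = agent.step(obs)                    # policy forward pass happens here
            obs, reward, done, info = env.step(action)  # env computes reward / termination
            total_score += reward
            if done:
                obs = env.reset()
        return total_score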
Example #3
    def step(self, action):
        time_step = self._env.step(action)
        reward = time_step.reward
        terminal = time_step.last()
        info = time_step.info
        info.update({
            key: value
            for key, value in time_step.observation.items()
            if key not in self._observation_keys
        })
        observation = self._filter_observation(time_step.observation)

        self._step_count += 1
        info['traj_done'] = self._step_count >= self._max_path_length

        global EnvInfo
        if EnvInfo is None:
            EnvInfo = namedtuple("EnvInfo", list(info.keys()))
        info = EnvInfo(
            **{k: v
               for k, v in info.items() if k in EnvInfo._fields})

        global Observation
        if Observation is None:
            Observation = namedarraytuple("Observation",
                                          list(observation.keys()))
        observation = Observation(
            **{
                k: v.copy()
                for k, v in observation.items() if k in self._observation_keys
            })

        return EnvStep(observation, reward, terminal, info)
Example #4
    def step(self, action):
        ''' Passes the action to the env and returns the next state, reward, and terminal.

            Args:
                action (int): Int representing an action in the action space.

            Returns:
                EnvStep (named tuple array)
        '''
        reward = 0

        if self.is_action_continuous: 
            action *= np.array(self._env.action_space.get_high())

        for _ in range(self._steps_per_action):
            sensor_dict, temp_reward, terminal, _ = self._env.step(action)
            reward += temp_reward

        if self.rollout_count % self.gif_freq == 0 and self.has_img:
            self.gif_images.append(self.get_img(sensor_dict))

        state_rep = self._get_state_rep(sensor_dict)

        self.curr_step += 1
        if self.curr_step >= self._max_steps:
            terminal = True

        return EnvStep(state_rep, np.array(reward), terminal, None)
Example #5
 def step(self, action):
     o, r, d, info = self.env.step(action)
     self.time_elapsed += 1
     if self.time_limit is not None:
         d = self.time_elapsed >= self.time_limit or d
     return EnvStep(np.array(self.state), r, d,
                    EnvInfo(**info, state=self.state))
Example #6
    def step(self, action):
        """
        Returns:
            obs
            reward
            done
            log
        """
        # action is a scalar in {0, 1}: 0 moves left (if possible), 1 moves right.
        if action == 0 and self.cur_pos > 0:
            self.cur_pos -= 1
        elif action == 1:
            self.cur_pos += 1
        done = self.cur_pos >= self.end_pos

        # info = EnvInfo(game_score=game_score, traj_done=game_over)
        info = None
        reward = 1 if done else 0
        self._step_counter += 1
        return EnvStep(self.get_obs(), reward, done, info)
Example #7
 def step(self, action):
     timestep = self.env.step(action)
     self._last_observation = timestep.observation
     reward = timestep.reward or 0.
     if timestep.last():
         self.game_over = True
     return EnvStep(timestep.observation, reward, timestep.last(),
                    EnvInfo())
Example #8
    def step(self, action):
        time_step = self._env.step(action)
        _ = dict(time_step.observation)
        obs = self.render()
        reward = time_step.reward or 0
        done = time_step.last()

        info = EnvInfo(np.array(time_step.discount, np.float32), None, done)
        return EnvStep(obs, reward, done, info)
Example #9
    def step(self, action):
        if self.player_turn:
            self.player_turn = False
            a = self.player_action_space.revert(action)
            if a.size <= 1:
                a = a.item()
            o, r, d, info = self.env.step(a)
            self.last_obs = o
            self.last_action = a
            obs = self.observer_observation_space.convert(o)
            if self.time_limit:
                if "TimeLimit.truncated" in info:
                    info["timeout"] = info.pop("TimeLimit.truncated")
                else:
                    info["timeout"] = False

            self.last_info = info
            if isinstance(r, float):
                r = np.dtype("float32").type(r)  # Scalar float32.
            self.last_reward = r
            self.curr_episode_length += 1
            if self.curr_episode_length >= self.max_episode_length:
                d = True
            self.last_done = d
            return EnvStep(obs, r, d, info)

        else:
            r_action = self.observer_action_space.revert(action)
            r_action = self.obs_action_translator(r_action, self.window_size,
                                                  self.obs_size)
            self.player_turn = True
            self.last_obs_act = r_action
            masked_obs = np.multiply(r_action, self.last_obs)
            info = self.last_info
            r = self.last_reward
            d = self.last_done
            if self.add_channel:
                masked_obs = np.concatenate([r_action, masked_obs], axis=0)
            else:
                masked_obs[r_action == 0] = -1
            obs = self.player_observation_space.convert(masked_obs)

            return EnvStep(obs, r, d, info)
Example #10
    def step(self, action):
        obs, reward, done, info = super().step(action)

        # fix the labels later in GoalWrapper
        updated_info = EnvInfo(game_score=info.game_score,
                               traj_done=info.traj_done,
                               labels=self.static_info,
                               goal_labels=self.static_info)

        return EnvStep(obs, reward, done, updated_info)
Example #11
 def step(self, actions):
     """
     Action is either a single value (discrete, one-hot), or a tuple with an action for each of the
     discrete action subspaces.
     """
     action = actions.item()
     obs, rew, done, info = self.env.step(action)
     # print('Obs shape as returned by the env:', obs.shape)
     fake_info = tuple()  # do not need for testing
     return EnvStep(obs, rew, done, fake_info)
Example #12
    def step(self, action):
        action = {self._node_id: action}
        obs, reward, done, info = self._env.step(action)
        self.obs = self.transform_obs(obs)
        done = done[self._node_id]
        reward = reward[self._node_id]
        info = info[self._node_id]

        info = EnvInfo(None, 0, done)
        return EnvStep(self.obs, reward, done, info)
Example #13
File: lab.py  Project: DavidMChan/rlpyt
 def step(self, action):
     reward = self._lab.step(self._action_set[action])
     finished = not self._lab.is_running()
     if not finished:
         self._update_obs()
     self._total_reward += reward
     self._step_counter += 1
     return EnvStep(self.get_obs(), reward, finished, EnvInfo(
         total_reward=self._total_reward,
         traj_done=finished,
     ))
Example #14
 def step(self, action):
     a = self.action_space.revert(action)
     o, r, d, info = self.env.step(a)
     obs = self.observation_space.convert(o)
     if self._time_limit:
         if "TimeLimit.truncated" in info:
             info["timeout"] = info.pop("TimeLimit.truncated")
         else:
             info["timeout"] = False
     info = info_to_nt(info)
     return EnvStep(obs, r, d, info)
Example #15
    def step(self, action):
        time_step = self._env.step(action)
        obs = dict(time_step.observation)
        state_obs = np.concatenate([value for key, value in obs.items()])
        img_obs = self.render()
        reward = time_step.reward or 0
        done = time_step.last()

        info = EnvInfo(np.array(time_step.discount, np.float32), None, done)
        obs = StateObs(img_obs, state_obs) if self.use_state else img_obs
        return EnvStep(obs, reward, done, info)
Example #16
    def step(self, action):
        self.iter += 1
        velocity = np.linalg.norm(action - self.state)
        self.state = action.copy()
        dist = np.linalg.norm(self.state - self.goal)

        rewards = dict()
        rewards['goal'] = 0.9 * np.exp(-0.5 * 10 * dist)
        rewards['vel'] = 0.1 * np.exp(-0.5 * 10 * velocity)
        rewards['col'] = -1 * np.exp(-0.5 * self.colision_dist(self.state)) * self.in_collision(self.state)
        # print(rewards)
        return EnvStep(self.get_obs(), sum(rewards.values()) / self.horizon, self.iter == self.horizon, EnvInfo())
Example #17
 def step(self, action):
     assert self._step is not None, 'Must reset environment.'
     obs, reward, done, info = self.env.step(action)
     self._step += 1
     if self._step >= self._duration:
         # done = True
         # if 'discount' not in info:
         #     info['discount'] = np.array(1.0).astype(np.float32)
         if isinstance(info, EnvInfo):
             # The last attribute in EnvInfo indicates termination of the trajectory
             # we do not set done = True because it should only be controlled by the environment
             info = EnvInfo(info.discount, info.game_score, True)
         self._step = None
     return EnvStep(obs, reward, done, info)
Example #18
 def step(self, action):
     """Reverts the action from rlpyt format to gym format (i.e. if composite-to-
     dictionary spaces), steps the gym environment, converts the observation
     from gym to rlpyt format (i.e. if dict-to-composite), and converts the
     env_info from dictionary into namedtuple."""
     a = self.action_space.revert(action)
     o, r, d, info = self.env.step(a)
     obs = self.observation_space.convert(o)
     if self._time_limit:
         if "TimeLimit.truncated" in info:
             info["timeout"] = info.pop("TimeLimit.truncated")
         else:
             info["timeout"] = False
     info = info_to_nt(info, self._info_schemas)
     return EnvStep(obs, r, d, info)
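The docstring mentions converting the gym `info` dict into a namedtuple; here that is done by `info_to_nt` with the pre-built `self._info_schemas`. A rough sketch of the underlying idea, for flat dicts only (rlpyt's actual helper reuses its schemas and is more involved; this is only an illustration):

    from collections import namedtuple

    # Illustrative dict-to-namedtuple conversion; not rlpyt's actual info_to_nt.
    def dict_to_namedtuple(d, name="info"):
        fields = sorted(d.keys())
        NT = namedtuple(name, fields)
        return NT(**{k: d[k] for k in fields})

    # dict_to_namedtuple({"timeout": False})  ->  info(timeout=False)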
Example #19
    def step(self, action, sym_features=None):
        """
        :param sym_features: if given, this is a safety-wrapped environment
        """
        constraint_used = False
        if self._actions is not None:
            action = self._actions[action]
        else:
            # continuous actions from agents are in [-1, 1]; convert back here
            action = (action + 1) / 2 * self._action_range + self._action_lb
        if sym_features is not None or self.oracle_safety:
            current_state = self._env.current_oracle_state()
            if sym_features is not None:
                sym_features = sym_features.squeeze()
                nan_idx = np.isnan(sym_features)
                sym_features[nan_idx] = current_state[nan_idx]
            else:  # oracle safety
                sym_features = current_state
            if not self._env.constraint_func(action, sym_features):
                constraint_used = True
                action = self.constrained_sample(sym_features)
                if action is None:
                    if self._fallback_action is None:
                        raise ValueError(
                            "No safe action found! Consider adding fallback.")
                    action = self._fallback_action

            if self.log_unsafe_transitions:
                unsafe_info = {
                    "oracle": self._env.current_oracle_state(),
                    "sym_feats": sym_features.copy(),
                    "img": self._env.render(),
                    "action": action.copy(),
                    "constraint_used": constraint_used,
                }

        obs, reward, done, info = self._env.step(action)
        info = SafetyEnvInfo(info["unsafe"], 0, constraint_used)
        if (self.log_unsafe_transitions and info.action_unsafe
                and sym_features is not None):
            debug_dir = Path.home() / "debug"
            debug_dir.mkdir(exist_ok=True)
            i = str(np.random.randint(1000))
            unsafe_info["oracle_next"] = self._env.current_oracle_state()
            (debug_dir / f"{i}.pkl").write_bytes(pickle.dumps(unsafe_info))
        return EnvStep(obs, reward, done, info)
Example #20
 def step(self, action):
     a = self._action_set[action]
     game_score = np.array(0., dtype="float32")
     for _ in range(self._frame_skip - 1):
         game_score += self.ale.act(a)
     self._get_screen(1)
     game_score += self.ale.act(a)
     lost_life = self._check_life()  # Advances from lost_life state.
     if lost_life and self._episodic_lives:
         self._reset_obs()  # Internal reset.
     self._update_obs()
     reward = np.sign(game_score) if self._clip_reward else game_score
     game_over = self.ale.game_over() or self._step_counter >= self.horizon
     done = game_over or (self._episodic_lives and lost_life)
     info = EnvInfo(game_score=game_score, traj_done=game_over)
     self._step_counter += 1
     return EnvStep(self.get_obs(), reward, done, info)
Example #21
 def step(self, action):
     """Reverts the action from rlpyt format to gym format (i.e. if composite-to-
     dictionary spaces), steps the gym environment, converts the observation
     from gym to rlpyt format (i.e. if dict-to-composite), and converts the
     env_info from dictionary into namedtuple."""
     a = self.action_space.revert(action)
     o, r, d, info = self.env.step(a)
     obs = self.observation_space.convert(o.transpose((2, 0, 1)))
     if self._time_limit:
         if "TimeLimit.truncated" in info:
             info["timeout"] = info.pop("TimeLimit.truncated")
         else:
             info["timeout"] = False
     info = info_to_nt(info)
     if isinstance(r, float):
         r = np.dtype("float32").type(r)  # Scalar float32.
     return EnvStep(obs, r, d, info)
Example #22
 def step(self, action):
     """
     Take step with action, then observe result
     """
     self._time += 1  # Update time step
     timeout = False
     self._action[:] = action
     if self._time_limit and self._time >= self._time_limit:
         self._action[:] = -1  # Can force a reset by taking action of -1
         self._time = 0  # Reset time
         timeout = True
     self.env.act(self._action)
     r, self._o, d = self.env.observe()
     r, d = r.squeeze(), d.squeeze()
     if d: self._reset_obs()  # If done, reset the stacked frames
     self._update_obs()  # Add newest observation to stacked frames
     o = self._get_obs()  # Get stacked observation in correct order
     return EnvStep(o, r, d, EnvInfo(timeout=timeout))
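This wrapper reports the forced reset through `EnvInfo(timeout=timeout)` instead of folding it into `d`. A common way to consume such a flag downstream (a general pattern, not code from this project) is to treat a timeout as truncation rather than true termination when building the value target:

    # Hypothetical consumer of the timeout flag: bootstrap on truncation, not on true termination.
    def target_value(reward, done, timeout, next_value, discount=0.99):
        truly_terminal = done and not timeout
        return reward + (0.0 if truly_terminal else discount * next_value)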
Example #23
    def step(self, action):
        assert self._norm_action_space.contains(action)
        action = self._convert_action(action)
        assert self._true_action_space.contains(action)
        reward = 0
        extra = {'internal_state': self._env.physics.get_state().copy()}

        for _ in range(self._frame_skip):
            time_step = self._env.step(action)
            reward += time_step.reward or 0
            done = time_step.last()
            if done:
                break
        obs = self._get_obs(time_step)
        extra['discount'] = time_step.discount
        extra['traj_done'] = done
        extra['game_score'] = reward
        info = self.info_class(**extra)
        return EnvStep(obs, reward, done, info)
Example #24
    def step(self, action):
        if action == 0 and self.y < self.h - 1:
            if self.grid[self.x, self.y + 1] != 2:
                self.y += 1
        elif action == 1 and self.x < self.w - 1:
            if self.grid[self.x + 1, self.y] != 2:
                self.x += 1
        elif action == 2 and self.y > 0:
            if self.grid[self.x, self.y - 1] != 2:
                self.y -= 1
        elif action == 3 and self.x > 0:
            if self.grid[self.x - 1, self.y] != 2:
                self.x -= 1
        else:
            # stand still
            pass

        info = EnvInfo(traj_done=False, labels=None, goal_labels=None)
        return EnvStep(self._get_grid(), 0, False, info)
Example #25
 def step(self, action):
     total_reward = 0.0
     for step in range(self._action_repeat):
         _, reward, done, info = self._env.step(action)
         total_reward += reward
         if self._life_done:
             lives = self._env.ale.lives()
             done = done or lives < self._lives
             self._lives = lives
         if done:
             break
         elif step >= self._action_repeat - 2:
             index = step - (self._action_repeat - 2)
             if self._grayscale:
                 self._env.ale.getScreenGrayscale(self._buffers[index])
             else:
                 self._env.ale.getScreenRGB2(self._buffers[index])
     obs = self._get_obs()
     env_info = EnvInfo(None, total_reward, done, None)
     return EnvStep(obs, total_reward, done, env_info)
Example #26
 def step(self, action):
     a = self._action_set[action]
     game_score = np.array(0., dtype="float32")
     for _ in range(self._frame_skip - 1):
         game_score += self.ale.act(a)
     self._get_screen(1)
     game_score += self.ale.act(a)
     lost_life = self._check_life()  # Advances from lost_life state.
     if lost_life and self._episodic_lives:
         self._reset_obs()  # Internal reset.
     self._update_obs()
     reward = np.sign(game_score) if self._clip_reward else game_score
     game_over = self.ale.game_over() or self._step_counter >= self.horizon
     done = game_over or (self._episodic_lives and lost_life)
     # Include reporting of current room ID in Montezuma Revenge (stored at RAM address 3)
     info = MontezumaEnvInfo(game_score=game_score,
                             traj_done=game_over,
                             room_id=self.ale.getRAM()[3])
     self._step_counter += 1
     return EnvStep(self.get_obs(), reward, done, info)
Example #27
    def step(self, action):
        cont_action = np.array(self.action_map[action]).astype(float)
        done = False
        for _ in range(self.frame_skip - 1):
            env_step = self.env.step(cont_action, blind=True)
            if env_step.last():
                done = True
                break
        if not done:
            env_step = self.env.step(cont_action)
        done = env_step.last()
        self._update_obs(env_step.observation['pixels'])
        self.last_obs = env_step.observation

        if self.static_labels is None:
            self.static_labels = self.labels()

        info = EnvInfo(traj_done=done,
                       labels=self.static_labels,
                       goal_labels=self.static_labels)
        return EnvStep(self.get_obs(), 0, done, info)
Example #28
    def step(self, action):
        step_results = None
        if action == MiniGridEnv.Actions.forward: # Go forward action; no slippage for other actions
            if np.random.uniform() < self.slipperiness:
                action = random.choice(["left", "right"])
                # By default, the agent can only move in a direction if it's facing that way.
                # We can model slippage by turning a direction, moving that way, then turning back
                # and only counting it as a single action.
                self._env.step_count -= 2
                if action == "left":
                    self._env.step(MiniGridEnv.Actions.left)
                    self._env.step(MiniGridEnv.Actions.forward)
                    step_results = self._env.step(MiniGridEnv.Actions.right)
                else:
                    self._env.step(MiniGridEnv.Actions.right)
                    self._env.step(MiniGridEnv.Actions.forward)
                    step_results = self._env.step(MiniGridEnv.Actions.left)
        if step_results is None:
            step_results = self._env.step(action)

        obs, reward, done, info = step_results
        obs = StateObs(obs['image'], np.concatenate([obs['mission'], obs['direction']]))
        info = EnvInfo(None, None, done)
        return EnvStep(obs, reward, done, info)
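The comment in the slippage branch explains the trick: a slip is modeled as turn, move forward, turn back, with `step_count` decremented by two so the three primitive steps count as a single action. A standalone sketch of the same idea, assuming a hypothetical env that exposes `step()` and a `step_count` attribute (not MiniGrid's real API):

    import random

    def slippery_forward(env, left, forward, right, slipperiness=0.1):
        """With probability `slipperiness`, sidestep instead of moving forward,
        keeping the step counter as if a single action had been taken."""
        if random.random() < slipperiness:
            first, last = random.choice([(left, right), (right, left)])
            env.step_count -= 2  # the two extra turns should not be counted
            env.step(first)
            env.step(forward)
            return env.step(last)
        return env.step(forward)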
Example #29
File: envs.py  Project: w121211/rlpyt
    def step(self, action: np.ndarray):
        """
        Args:
            action: [int, int]
        Return:
            obs: target_im (H, W, C), cur_im (H, W, C), field_info (x0, y0)
        """
        i_item = 0  # Only the first item is moved.
        # Map the discrete action index to a (dx, dy) move.
        dmove = np.array([HOR_MOVE[action], VER_MOVE[action]], dtype=np.float32)
        xy0 = self.cur_coord[i_item, :2] + dmove
        self.cur_coord[i_item] = np.concatenate((xy0, xy0 + self.obj_wh), axis=0)
        self.cur_im = self._render(self.cur_coord)

        reward = self._reward(self.cur_coord, self.target_coord)
        done = self.cur_step >= MAX_STEP
        info = EnvInfo()
        self.cur_step += 1

        return EnvStep(self._obs(), reward, done, info)
Example #30
    def step(self, action):
        if self.player_turn:
            self.player_turn = False
            a = self.player_action_space.revert(action)
            if a.size <= 1:
                a = a.item()
            o, r, d, info = self.env.step(a)
            self.last_obs = o
            self.last_action = a
            if self.serial:
                obs = np.concatenate(
                    [np.zeros(self.last_obs_act.shape), self.last_masked_obs])
            else:
                obs = np.concatenate([self.last_obs_act, self.last_masked_obs])
            if self.inc_player_last_act:
                obs = np.append(obs, a)
            obs = self.observer_observation_space.convert(obs)
            if self.time_limit:
                if "TimeLimit.truncated" in info:
                    info["timeout"] = info.pop("TimeLimit.truncated")
                else:
                    info["timeout"] = False

            self.last_info = (info["timeout"])
            info = (False)
            if isinstance(r, float):
                r = np.dtype("float32").type(r)  # Scalar float32.
            self.last_reward = r
            # if (not d) and (self.observer_reward_shaping is not None):
            #     r = self.observer_reward_shaping(r,self.last_obs_act)
            self.curr_episode_length += 1
            if self.curr_episode_length >= self.max_episode_length:
                d = True
            self.last_done = d
            return EnvStep(obs, r, d, info)

        else:
            if not np.array_equal(action, action.astype(bool)):
                action = np.random.binomial(1, action)
            r_action = self.observer_action_space.revert(action)
            if self.serial:
                if self.fully_obs:
                    r_action = 1
                elif self.rand_obs:
                    r_action = random.randint(0, 1)
                self.ser_cum_act[self.ser_counter] = r_action
                self.ser_counter += 1
                if self.ser_counter == self.obs_size:
                    self.player_turn = True
                    self.ser_counter = 0
                    masked_obs = np.multiply(
                        np.reshape(self.ser_cum_act, self.last_obs.shape),
                        self.last_obs)
                    self.last_masked_obs = masked_obs
                    self.last_obs_act = self.ser_cum_act.copy()
                    self.ser_cum_act = np.zeros(
                        self.env.env.observation_space.shape)
                    r = self.last_reward
                    # if self.player_reward_shaping is not None:
                    #     r = self.player_reward_shaping(r, self.last_obs_act)
                    d = self.last_done
                    info = self.last_info
                    obs = np.concatenate([
                        np.reshape(self.last_obs_act, masked_obs.shape),
                        masked_obs
                    ])
                    obs = self.player_observation_space.convert(obs)
                else:
                    r = 0
                    info = (False)
                    obs = np.concatenate([
                        np.reshape(self.ser_cum_act,
                                   self.last_masked_obs.shape),
                        self.last_masked_obs
                    ])
                    if self.inc_player_last_act:
                        obs = np.append(obs, self.last_action)

                    obs = self.observer_observation_space.convert(obs)
                    d = False

            else:
                if not self.cont_act:
                    r_action = self.obs_action_translator(
                        r_action, self.power_vec, self.obs_size)
                if self.fully_obs:
                    r_action = np.ones(r_action.shape)
                elif self.rand_obs:
                    r_action = np.random.randint(0, 2, r_action.shape)
                self.player_turn = True
                self.last_obs_act = r_action
                masked_obs = np.multiply(
                    np.reshape(r_action, self.last_obs.shape), self.last_obs)
                self.last_masked_obs = masked_obs
                info = self.last_info
                r = self.last_reward
                # if self.player_reward_shaping is not None:
                #     r = self.player_reward_shaping(r, r_action)
                d = self.last_done
                obs = np.concatenate(
                    [np.reshape(r_action, masked_obs.shape), masked_obs])
                obs = self.player_observation_space.convert(obs)

            return EnvStep(obs, r, d, info)