Пример #1
0
    def before_step(self, action):
        """Check that the environment is in a steppable state.

        Args:
            action: The action about to be taken. It is not inspected here.

        Raises:
            AssertionError: If ``self.closed`` is truthy.
            error.ResetNeeded: If ``self.done`` is truthy, or if ``self.steps``
                is ``None``.
        """
        assert not self.closed

        # The episode already ended: stepping again requires a reset first.
        if self.done:
            finished_template = "Trying to step environment which is currently done. While the monitor is active for {}, you cannot step beyond the end of an episode. Call 'env.reset()' to start the next episode."
            raise error.ResetNeeded(finished_template.format(self.env_id))

        # ``steps`` is still None: per the message below, no reset has
        # happened yet.
        if self.steps is None:
            unstarted_template = "Trying to step an environment before reset. While the monitor is active for {}, you must call 'env.reset()' before taking an initial step."
            raise error.ResetNeeded(unstarted_template.format(self.env_id))
Пример #2
0
    def before_step(self, action):
        """Record ``action`` once the episode is confirmed to still be running.

        Args:
            action: The action about to be taken; appended to ``self.actions``.

        Raises:
            AssertionError: If ``self.closed`` is truthy.
            error.ResetNeeded: If ``self.done`` is truthy.
        """
        assert not self.closed

        # Happy path first: the episode is live, so just log the action.
        if not self.done:
            self.actions.append(action)
            return

        raise error.ResetNeeded(
            "Trying to step environment which is currently done. While the monitor is active, you cannot step beyond the end of an episode. Call 'env.reset()' to start the next episode."
        )
Пример #3
0
    def step(self,
             action: Tuple[int, int],
             player: Optional[int] = None) -> Tuple[Any, float, bool, Dict]:
        """Place the current player's mark on the board and advance the game.

        Args:
            action: Board location to mark. It is converted to a tuple and
                used to index ``self.board``.
            player: Optional identifier of the player making the move. When
                given (and truthy) it must equal ``self.curr_turn``; this
                guards against moving out of turn by mistake.

        Returns:
            observation, reward, done, info. ``reward`` is the float of
            whatever ``check_win`` returned when that value is truthy, and
            ``0.0`` otherwise. ``info`` is always an empty dict.

        Raises:
            error.InvalidAction: If the chosen cell is not ``0``, or if
                ``player`` does not match ``self.curr_turn``.
            error.ResetNeeded: If the game is already over.
        """
        # Validate the request before touching any state.
        action = tuple(action)
        if self.board[action] != 0:
            raise error.InvalidAction(f"action {action} is not a vaid choice")
        if self.done:
            raise error.ResetNeeded("Call reset as game is over")
        if player and player != self.curr_turn:
            raise error.InvalidAction(
                f"Player {self.curr_turn}'s turn. Move request from {player}")

        logger.debug("Selected action: %s on turn %d", action,
                     self.turns_played + 1)

        # The turn marker doubles as the value stored in the board cell.
        self.board[action] = self.curr_turn

        # A truthy result from check_win means somebody won; otherwise the
        # game only ends when the turn counter reads 9, which is a draw.
        outcome = check_win(self.board)
        if outcome or self.turns_played == 9:
            self.done = True
        else:
            # Game continues: hand the move to the next player.
            self.curr_turn = next(self.turn_iterator)

        payoff = float(outcome) if outcome else 0.0
        return self._get_obs(), payoff, self.done, {}
    def step(self, action):
        """Advance the environment by one timestep.

        Args:
            action: Action to apply. It is passed to ``self._take_action`` and
                used as a key into ``BaseEnv.action_space.lookup`` for logging.

        Returns:
            A 4-tuple ``(state, reward, done, remaining)``:
            ``state`` is the value of ``self._get_new_state()`` captured
            before the action is applied, ``reward`` comes from
            ``self._get_reward()`` after the action, ``done`` is ``True`` once
            ``self.timesteps`` reaches ``self.horizon``, and ``remaining`` is
            the number of timesteps left as a float (``0.0`` on the last step).

        Raises:
            error.ResetNeeded: If ``self.episode_number`` is falsy or the
                horizon has already been reached.
        """
        # Compare counters by value. ``is`` checks object identity, which only
        # happens to work for CPython's cached small integers and silently
        # fails for larger horizons.
        if not self.episode_number or self.timesteps == self.horizon:
            raise error.ResetNeeded()

        state = self._get_new_state()
        self._take_action(action)
        reward = self._get_reward()

        message = "Timestep {}:==: Action: {} ; Reward: {}".format(
            self.timesteps, BaseEnv.action_space.lookup[action], reward)
        self.logger.debug(message)

        self.timesteps += 1
        if self.timesteps == self.horizon:
            # Final step of the episode: nothing remains.
            return state, reward, True, 0.0

        self.current += 1
        return state, reward, False, float(self.horizon - self.timesteps)
Пример #5
0
    def step(self, action):
        """Place a stone on the board and report the resulting game status.

        Args:
            action: A ``(row, col, stone)`` triple. ``stone`` is used as an
                index into ``self.STONES``.

        Returns:
            A 4-tuple ``(board, reward, done, info)`` where ``board`` is a deep
            copy of ``self.board``, ``reward`` and ``done`` come from
            ``self._check_status()``, and ``info`` is an empty dict.

        Raises:
            error.ResetNeeded: If the game is already done.
            error.InvalidAction: If the target cell is not empty, the stone
                index is ``>= self.STONE_TYPE_COUNT``, or ``stone`` equals
                ``self.last_stone``.
        """
        if self.done:
            raise error.ResetNeeded("")

        row, col, stone = action

        occupant = self.board[row][col]
        if occupant != self.EMPTY:
            raise error.InvalidAction(
                "Stone '{}' already exists in row: {}, col: {}".format(
                    occupant, row, col))

        if stone >= self.STONE_TYPE_COUNT:
            raise error.InvalidAction("Unknown stone type '{}'".format(stone))

        # NOTE(review): ``last_stone`` is stored below as ``STONES[stone]``
        # but compared here against the raw ``stone`` index. Confirm the two
        # share the same value space.
        if stone == self.last_stone:
            raise error.InvalidAction("Need to change stone.")

        placed = self.STONES[stone]
        self.board[row][col] = placed
        self.last_stone = placed
        self.remaining_place -= 1

        reward, self.done = self._check_status()

        # Hand back a copy so callers cannot mutate the live board.
        snapshot = copy.deepcopy(self.board)
        return snapshot, reward, self.done, {}
Пример #6
0
    def step(self, action: list):
        """Advance the supply chain by one turn.

        Agents are indexed along the chain: index 0 is the retailer (it serves
        the external demand) and the last index is the manufacturer, which is
        assumed to have no supply constraints.

        Args:
            action: Order quantity for each agent for this turn. Must contain
                ``self.n_agents`` non-negative entries.

        Returns:
            A 4-tuple ``(state, rewards, done, info)`` where ``state`` comes
            from ``self._get_observations()``, ``rewards`` from
            ``self._get_rewards()``, ``done`` is ``self.done`` and ``info`` is
            an empty dict.

        Raises:
            error.ResetNeeded: If the environment is already done.
            error.InvalidAction: If ``action`` has the wrong length or holds a
                negative order.
        """
        # sanity checks
        if self.done:
            raise error.ResetNeeded(
                "Environment is finished, please run env.reset() before taking actions"
            )
        if get_init_len(action) != self.n_agents:
            raise error.InvalidAction(
                f"Length of action array must be same as n_agents({self.n_agents})"
            )
        if any(np.array(action) < 0):
            raise error.InvalidAction(
                f"You can't order negative amount. You agents actions are: {action}"
            )

        # self.prev_states is a queue of previous states: drop the oldest and
        # append the observation taken before this turn is applied
        self.prev_states.popleft()
        self.prev_states.append(self._get_observations())
        # make incoming step
        demand = self._get_demand()
        orders_inc = [order.popleft() for order in self.orders]
        # what's the demand for each agent
        self.next_incoming_orders = [demand] + orders_inc[:-1]
        ship_inc = [shipment.popleft() for shipment in self.inbound_shipments]
        # calculate inbound shipments respecting orders and stock levels;
        # the manufacturer is assumed to have no constraints, so it is
        # handled separately after the loop
        for i in range(self.n_agents - 1):
            # stock + incoming shipment
            max_possible_shipment = (max(0, self.stocks[i + 1]) +
                                     ship_inc[i + 1])
            # incoming order + stockout (backorder)
            order = orders_inc[i] + max(0, -self.stocks[i + 1])
            shipment = min(order, max_possible_shipment)
            self.inbound_shipments[i].append(shipment)
        self.inbound_shipments[-1].append(orders_inc[-1])
        # update stocks
        self.stocks = [(stock + inc)
                       for stock, inc in zip(self.stocks, ship_inc)]
        for i in range(1, self.n_agents):
            self.stocks[i] -= orders_inc[i - 1]
        self.stocks[0] -= demand  # for the retailer
        # update orders
        for i in range(self.n_agents):
            self.orders[i].append(action[i])
        # NOTE(review): _get_demand() is called a second time here, before
        # self.turn is advanced; confirm this yields the intended demand.
        self.next_incoming_orders = [self._get_demand()
                                     ] + [x[0] for x in self.orders[:-1]]

        # calculate costs
        # Use the builtin ``float`` dtype: the ``np.float`` alias was
        # deprecated in NumPy 1.20 and removed in 1.24.
        self.holding_cost = np.zeros(self.n_agents, dtype=float)
        self.stockout_cost = np.zeros(self.n_agents, dtype=float)
        for i in range(self.n_agents):
            if self.stocks[i] >= 0:
                # only applicable when stocks > 0
                self.holding_cost[i] = (max(0, self.stocks[i]) *
                                        self.score_weight[0][i])
            else:
                # only applicable when stocks < 0
                self.stockout_cost[i] = (-min(0, self.stocks[i]) *
                                         self.score_weight[1][i])
        self.cum_holding_cost += self.holding_cost
        self.cum_stockout_cost += self.stockout_cost
        # calculate reward
        rewards = self._get_rewards()

        # check if done
        if self.turn == self.n_turns - 1:
            print(
                f"\nTotal cost is: EUR {sum(self.cum_holding_cost + self.cum_stockout_cost)}"
            )
            self.done = True
        else:
            self.turn += 1
        state = self._get_observations()
        # todo flatten observation dict
        return state, rewards, self.done, {}