Example #1
    def _log_one_step(self, user_obs, doc_obs, slate, responses, reward,
                      is_terminal, sequence_example):
        """Adds one step of agent-environment interaction into SequenceExample.

    Args:
      user_obs: An array of floats representing user state observations
      doc_obs: A list of observations of the documents
      slate: An array of indices to doc_obs
      responses: A list of observations of responses for items in the slate
      reward: A float for the reward returned after this step
      is_terminal: A boolean for whether a terminal state has been reached
      sequence_example: A SequenceExample proto for logging current episode
    """
        def _add_float_feature(feature, values):
            feature.feature.add(float_list=tf.train.FloatList(value=values))

        def _add_int64_feature(feature, values):
            feature.feature.add(int64_list=tf.train.Int64List(value=values))

        if self._episode_writer is None:
            return
        fl = sequence_example.feature_lists.feature_list

        if isinstance(self._env.environment, environment.MultiUserEnvironment):
            for i, (single_user, single_slate, single_user_responses,
                    single_reward) in enumerate(
                        zip(user_obs, slate, responses, reward)):
                user_space = list(
                    self._env.observation_space.spaces['user'].spaces)[i]
                _add_float_feature(fl['user_%d' % i],
                                   spaces.flatten(user_space, single_user))
                _add_int64_feature(fl['slate_%d' % i], single_slate)
                _add_float_feature(fl['reward_%d' % i], [single_reward])
                for j, response in enumerate(single_user_responses):
                    resp_space = self._env.observation_space.spaces[
                        'response'][i][0]
                    for k in response:
                        _add_float_feature(
                            fl['response_%d_%d_%s' % (i, j, k)],
                            spaces.flatten(resp_space, response))
        else:  # single-user environment
            _add_float_feature(
                fl['user'],
                spaces.flatten(self._env.observation_space.spaces['user'],
                               user_obs))
            _add_int64_feature(fl['slate'], slate)
            for i, response in enumerate(responses):
                resp_space = self._env.observation_space.spaces['response'][0]
                for k in response:
                    _add_float_feature(fl['response_%d_%s' % (i, k)],
                                       spaces.flatten(resp_space, response))
            _add_float_feature(fl['reward'], [reward])

        for i, doc in enumerate(list(doc_obs.values())):
            doc_space = list(
                self._env.observation_space.spaces['doc'].spaces.values())[i]
            _add_float_feature(fl['doc_%d' % i],
                               spaces.flatten(doc_space, doc))

        _add_int64_feature(fl['is_terminal'], [is_terminal])
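
For readers unfamiliar with the proto API used above, here is a minimal, self-contained sketch (illustrative values only, not taken from RecSim) of how a tf.train.SequenceExample accumulates per-step feature lists and is serialized for a TFRecord writer:

import tensorflow as tf

sequence_example = tf.train.SequenceExample()
fl = sequence_example.feature_lists.feature_list

# One logged "step": a flattened user observation, a slate of document indices,
# and a scalar reward, each appended to its own feature list.
fl['user'].feature.add(float_list=tf.train.FloatList(value=[0.1, 0.4, 0.5]))
fl['slate'].feature.add(int64_list=tf.train.Int64List(value=[2, 0]))
fl['reward'].feature.add(float_list=tf.train.FloatList(value=[1.0]))

serialized = sequence_example.SerializeToString()  # ready for tf.io.TFRecordWriter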
Example #2
def test_fast_obs_2():
    env = Warehouse(3,
                    8,
                    3,
                    3,
                    2,
                    1,
                    5,
                    10,
                    None,
                    RewardType.GLOBAL,
                    fast_obs=False)
    env.reset()

    slow_obs_space = env.observation_space

    for _ in range(10):
        slow_obs = [env._make_obs(agent) for agent in env.agents]
        env._use_fast_obs()
        fast_obs = [env._make_obs(agent) for agent in env.agents]
        assert len(fast_obs) == 3
        assert len(slow_obs) == 3

        flattened_slow = [
            spaces.flatten(osp, obs)
            for osp, obs in zip(slow_obs_space, slow_obs)
        ]

        for i in range(len(fast_obs)):
            print(slow_obs[0])
            assert list(fast_obs[i]) == list(flattened_slow[i])

        env._use_slow_obs()
        env.step(env.action_space.sample())
Example #3
def test_partial_hand():
    """Given a partial hand, ensure that we are able to map this to
    a partial deck and does not return a whole array.

    This is the behavior when data and numpy are difference:

    * data - just return a list of the the data points
    * numpy array - return a fixed numpy array of max size
    """
    deck_empty = PartialDeck(cards=[])
    assert deck_empty.to_data() == []
    assert deck_empty.to_data_for_numpy() == (
        [Card.get_null_data()] * PartialDeck.get_max_size()
    )

    expected_first_card = Card.from_str("A,S")
    deck = PartialDeck(cards=[expected_first_card] * 2)
    assert deck.to_data() == [expected_first_card.to_data()] * 2

    # Test for flatten numpy data
    numpy_data = deck.to_data_for_numpy()
    assert len(numpy_data) == PartialDeck.get_max_size()
    obs_space = deck.get_observation_space()
    flattend_numpy = spaces.flatten(obs_space, numpy_data)
    assert (flattend_numpy[0:2] == expected_first_card.to_data_for_numpy()).all()
    assert (flattend_numpy[2:4] == expected_first_card.to_data_for_numpy()).all()
    assert (flattend_numpy[4:6] == Card.get_null_data()).all()
Example #4
    def get_action(self, observation):
        """ Return action choice by the agents

        :param observation: stat of environment
        :type observation: gym.Space
        """
        if not self.greedy_exploration.be_greedy(
                self.step) and self.with_exploration:
            return self.action_space.sample()

        observation = torch.tensor(
            [flatten(self.observation_space, observation)],
            device=self.device).float()

        prediction = self.network.forward(observation)

        def return_values(values):
            if isinstance(values, list):
                return [return_values(v) for v in values]

            q_values = values * self.z
            q_values = torch.sum(q_values, dim=2)
            return torch.argmax(q_values).detach().item()

        return return_values(prediction)
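
The multiplication by self.z and the sum over dim 2 in return_values correspond to taking the expectation of a categorical value distribution over its support; a minimal sketch of that step, with an invented support and random probabilities:

import torch

n_actions, n_atoms = 4, 51
z = torch.linspace(-10.0, 10.0, n_atoms)                           # hypothetical support of the distribution
probs = torch.softmax(torch.randn(1, n_actions, n_atoms), dim=2)   # (batch, actions, atoms)
q_values = torch.sum(probs * z, dim=2)                             # expected return per action
greedy_action = torch.argmax(q_values).item()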
Example #5
 def step(self, action):
     self.step_count += 1
     self.reward = self.compute_reward(action)
     self.utility += self.reward
     self.move_vehicles()
     if self.step_count >= self.task_num_per_episode:
         self.done = True
     else:
         self.done = False
         self.s["snr"] = np.array([
             min(self.snr_ref * (abs(v["position"]) / 200)**-2, 1)
             for v in self.vehicles
         ] + [0] * (self.max_v - self.num_vehicles))
         self.s["freq_remain"] = np.array(
             [v["freq_remain"] for v in self.vehicles] + [0] *
             (self.max_v - self.num_vehicles))
         self.s["u_max"] = np.array([v["u_max"] for v in self.vehicles] +
                                    [0] * (self.max_v - self.num_vehicles))
         # self.s["time_remain"] = np.array([min(-v["position"]/v["velocity"]+500/abs(v["velocity"]), 100) for v in self.vehicles] + [0]*(self.max_v-self.num_vehicles))
         task = self.tasks[self.step_count]
         self.s["serv_prob"] = np.array([
             self.compute_service_availability(task, v)
             for v in self.vehicles
         ] + [0] * (self.max_v - self.num_vehicles))
         self.s["task"] = np.array(task)
     return spaces.flatten(self.observation_space,
                           self.s), self.reward, self.done, {}
Example #6
 def step(self, action):
     action = np.clip(action, self.action_space.low, self.action_space.high)
     if self._flatten_actions:
         action = spaces.unflatten(self.env.action_space, action)
     obs, reward, done, info = self.env.step(action)
     if self._flatten_obs:
         obs = spaces.flatten(self.env.observation_space, obs)
     return obs, reward, done, info
Example #7
    def observation(self, observation):
        """Flattens an observation.

        Args:
            observation: The observation to flatten

        Returns:
            The flattened observation
        """
        return spaces.flatten(self.env.observation_space, observation)
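
Most of the snippets on this page are thin wrappers around the same call; a standalone sketch (using an arbitrary Dict space, not one from the examples above) of what spaces.flatten and spaces.unflatten do:

import numpy as np
from gym import spaces

space = spaces.Dict({
    "position": spaces.Box(low=-1.0, high=1.0, shape=(2,), dtype=np.float32),
    "mode": spaces.Discrete(3),
})
obs = space.sample()
flat = spaces.flatten(space, obs)               # 1-D array: Box values plus one-hot Discrete
assert flat.shape == (spaces.flatdim(space),)   # flatdim(space) == 2 + 3 here
restored = spaces.unflatten(space, flat)        # inverse mapping back to the Dict structure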
Example #8
 def reset(self):
     """Resets the environment and returns the start state"""
     # for _ in range(random.randint(1,10)):
     #     self.add_vehicle()
     self.move_vehicles()
     # self.add_vehicle()
     # self.generate_local_tasks()
     # self.generate_offload_tasks()
     self.step_count = 0
     self.next_state = None
     self.reward = None
     self.done = False
     for v in self.vehicles:
         v["freq"] = v["freq_init"]
         v["freq_remain"] = max(
             0, v["freq_init"] - sum([i[1] / i[2] for i in v["tasks"]]))
         v["position"] = v["position_init"]
         alpha_max = v["freq_remain"] / v["freq"]
         v["u_max"] = sum(
             [np.log(1 + alpha_max * i[2]) for i in v["tasks"]])
     with open(self.count_file, 'a') as f:
         f.write(
             str(self.utility) + ' ' +
             ' '.join([str(i) for i in self.low_count]) + ' ' +
             ' '.join([str(i) for i in self.low_delay]) + ' ' +
             ' '.join([str(i) for i in self.high_count]) + ' ' +
             ' '.join([str(i) for i in self.high_delay]) + ' ' + '\n')
     self.high_count = [0, 0, 0, 0]
     self.high_delay = [0, 0, 0, 0]
     self.low_count = [0, 0, 0, 0]
     self.low_delay = [0, 0, 0, 0]
     self.utility = 0
     task = self.tasks[0]
     self.s = {
         "snr":
         np.array([
             min(self.snr_ref * (abs(v["position"]) / 200)**-2, 1)
             for v in self.vehicles
         ] + [0] * (self.max_v - self.num_vehicles)),
         # "time_remain":np.array([min(-v["position"]/v["velocity"]+500/abs(v["velocity"]), 100) for v in self.vehicles] + [0]*(self.max_v-self.num_vehicles)),
         "freq_remain":
         np.array([v["freq_remain"] for v in self.vehicles] + [0] *
                  (self.max_v - self.num_vehicles)),
         "u_max":
         np.array([v["u_max"] for v in self.vehicles] + [0] *
                  (self.max_v - self.num_vehicles)),
         "serv_prob":
         np.array([
             self.compute_service_availability(task, v)
             for v in self.vehicles
         ] + [0] * (self.max_v - self.num_vehicles)),
         "task":
         np.array(task)
     }
     return spaces.flatten(self.observation_space, self.s)
Example #9
    def test_flatten_unflatten(self, observation_space, ordered_values):
        """
        test flatten and unflatten functions directly
        """
        original = observation_space.sample()

        flattened = flatten(observation_space, original)
        unflattened = unflatten(observation_space, flattened)

        self._check_observations(original, flattened, unflattened,
                                 ordered_values)
Example #10
def make_traj_opt_align(
    traj_optimizer: TrajOptimizer,
    env: Env,
    true_reward: np.ndarray,
    test_rewards: np.ndarray,
    epsilon: float,
    parallel: Optional[Parallel] = None,
    n_test_states: Optional[int] = None,
) -> np.ndarray:
    state_shape = env.observation_space.sample().shape
    action_shape = env.action_space.sample().shape

    if n_test_states is not None:
        raw_states = np.array([
            flatten(env.observation_space, env.observation_space.sample())
            for _ in range(n_test_states)
        ])
    else:
        n_test_states = 1
        raw_states = np.array([env.state])
    assert raw_states.shape == (n_test_states, *state_shape)

    opt_plans = make_plans(
        true_reward.reshape(1, 4),
        raw_states,
        traj_optimizer,
        parallel,
        action_shape,
        memorize=True,
    )
    assert opt_plans.shape == (
        1,
        n_test_states,
        50,
        *action_shape,
    ), f"opt_plans shape={opt_plans.shape} is not expected {(1,n_test_states,50,*action_shape)}"
    opt_values: np.ndarray = rollout_plans(env, opt_plans, raw_states)

    plans = make_plans(test_rewards, raw_states, traj_optimizer, parallel,
                       action_shape)
    assert plans.shape == (
        len(test_rewards),
        n_test_states,
        50,
        *action_shape,
    ), f"plans shape={plans.shape} is not expected {(len(test_rewards),n_test_states,50,*action_shape)}"
    values = rollout_plans(env, plans, raw_states)
    assert values.shape == (
        len(test_rewards),
        n_test_states,
    ), f"Values shape={values.shape} is not expected {(len(test_rewards), n_test_states)}"

    alignment = cast(np.ndarray, np.all(opt_values - values < epsilon, axis=1))
    return alignment
Example #11
    def encode(self, observation):
        """Encode user observation and document observations to an image."""
        # It converts the observation from the simulator to a numpy array to be
        # consumed by the DQN agent, which assumes the input is an "image".
        # The first row is the user's observation. The remaining rows are the
        # documents' observations, one row for each document.
        image = np.zeros(self._observation_shape + (self._stack_size, ),
                         dtype=self._observation_dtype)
        image[0, :, 0] = self._pad_with_zeros(
            spaces.flatten(self._input_observation_space.spaces['user'],
                           observation['user']))
        doc_space = zip(
            self._input_observation_space.spaces['doc'].spaces.values(),
            observation['doc'].values())
        image[1:, :, 0] = np.array([
            self._pad_with_zeros(spaces.flatten(doc_space, d))
            for doc_space, d in doc_space
        ])

        return image
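
A small illustrative sketch (plain numpy, not RecSim code) of the layout encode builds: row 0 holds the padded, flattened user observation, and each following row holds one padded, flattened document:

import numpy as np

user = np.array([0.2, 0.7])                                    # already-flattened user observation
docs = [np.array([1.0, 0.0, 0.5]), np.array([0.3, 0.3, 0.3])]  # already-flattened documents
width = max(len(user), *(len(d) for d in docs))
image = np.zeros((1 + len(docs), width, 1), dtype=np.float32)
image[0, :len(user), 0] = user                                 # first row: user
for row, doc in enumerate(docs, start=1):                      # remaining rows: one document each
    image[row, :len(doc), 0] = doc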
Example #12
File: dqn.py  Project: qtests/rl-workshop
    def forward(self, states):
        # Forward flattened state
        states_flattened = [
            spaces.flatten(self.env.observation_space, s) for s in states
        ]
        states_tensor = Tensor(states_flattened)

        # Move tensor to GPU if available
        if torch.cuda.is_available():
            states_tensor = states_tensor.cuda()

        return self.network(states_tensor)
Example #13
    def step(self, action):
        try:
            action = np.split(action, self.n_agents)
        except (AttributeError, IndexError):
            action = [action]

        observation, reward, done, info = super().step(action)
        observation = np.concatenate(
            [spaces.flatten(s, o) for s, o in zip(self.observation_space, observation)]
        )
        reward = np.sum(reward)
        done = all(done)
        return observation, reward, done, info
Example #14
    def get_qvalues(self, state):
        # Flatten state
        state = tuple(spaces.flatten(self.env.observation_space, state))

        # Generate new entry in table for new states
        if state not in self.q_table:
            # By adding an entry in the Q-table, we make the agent's
            # behavior dependent on previous runs and hence previous seeds!
            # This is not expected in greedy mode.
            if self.is_greedy:
                return np.random.rand(self.env.action_space.n)
            
            self.q_table[state] = np.random.rand(self.env.action_space.n)

        return self.q_table[state]
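
The tuple(...) conversion above matters because a flattened observation is a numpy array, which is not hashable and so cannot key a Q-table; a minimal sketch with an arbitrary space:

import numpy as np
from gym import spaces

space = spaces.MultiDiscrete([2, 3])            # illustrative space, not from the snippet
obs = space.sample()
key = tuple(spaces.flatten(space, obs))         # tuples are hashable, np.ndarray is not
q_table = {key: np.random.rand(4)}              # 4 hypothetical actions
assert key in q_table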
Example #15
    def learn(self, observation, action, reward, next_observation,
              done) -> None:
        """ learn from parameters

        :param observation: stat of environment
        :type observation: gym.Space
        :param action: action taken by agent
        :type action: int, float, list
        :param reward: reward win
        :type reward: int, float, np.int, np.float
        :type reward: int, np.int
        :param next_observation:
        :type next_observation: gym.Space
        :param done: if env is finished
        :type done: bool
        """
        self.memory.append([flatten(self.observation_space, observation)],
                           action, reward,
                           [flatten(self.observation_space, next_observation)],
                           done)
        self.step += 1

        if (self.step % self.step_train) == 0:
            self.train()
Example #16
    def __get_all_players_observation_with_action(
            self, state: FullState,
            decision: BaseDecision) -> List[np.ndarray]:
        """This return a map of the action space,
        with a multi-discrete action space for checking next_accepted_action
        """
        obs = [None] * self.n_agents

        next_player: int = self.next_player
        action_obs: Dict[str, np.ndarray] = decision.action_range_to_numpy()
        player_obs_space = self.state.to_player_data(next_player,
                                                     for_numpy=True)

        # TODO: only set action for the next player
        obs[next_player] = spaces.flatten(
            # Note this includes action observation
            self.observation_space,
            [action_obs, player_obs_space],
        )
        return obs
Example #17
 def transform(self, attr: AttributationLike) -> AttributationLike:
     obs_space = self.obs_space
     if self.obs_image_channel_dim is not None:
         attr = np.sum(attr, axis=self.obs_image_channel_dim)
         obs_space = remove_channel_dim_from_image_space(obs_space)
     attr = flatten(obs_space, attr)  # flatten with the (possibly channel-reduced) space
     if self.mode == AttributationNormalizationMode.ALL:
         scaling_factor = self._calculate_safe_scaling_factor(np.abs(attr))
     elif self.mode == AttributationNormalizationMode.POSITIVE:
         attr = (attr > 0) * attr
         scaling_factor = self._calculate_safe_scaling_factor(attr)
     elif self.mode == AttributationNormalizationMode.NEGATIVE:
         attr = (attr < 0) * attr
         scaling_factor = -self._calculate_safe_scaling_factor(np.abs(attr))
     elif self.mode == AttributationNormalizationMode.ABSOLUTE_VALUE:
         attr = np.abs(attr)
         scaling_factor = self._calculate_safe_scaling_factor(attr)
     else:
         raise EnumValueNotFound(self.mode, AttributationNormalizationMode)
     attr_norm = self._scale(attr, scaling_factor)
     return unflatten(obs_space, attr_norm)
Example #18
def flatten_observation(space, x=None):
    # Note that it does not preserve dtype
    def _flatten_bounds(space, bounds_type):
        if isinstance(space, spaces.Box):
            if bounds_type == 'high':
                return np.asarray(space.high).flatten()
            else:
                return np.asarray(space.low).flatten()
        elif isinstance(space, spaces.Discrete):
            if bounds_type == 'high':
                return np.ones(space.n)
            else:
                return np.zeros(space.n)
        elif isinstance(space, spaces.Tuple):
            return np.concatenate(
                [_flatten_bounds(s, bounds_type) for s in space.spaces])
        elif isinstance(space, spaces.Dict):
            return np.concatenate([
                _flatten_bounds(s, bounds_type) for s in space.spaces.values()
            ])
        elif isinstance(space, spaces.MultiBinary):
            if bounds_type == 'high':
                return np.ones(space.n)
            else:
                return np.zeros(space.n)
        elif isinstance(space, spaces.MultiDiscrete):
            if bounds_type == 'high':
                return np.ones(reduce(__mul__, space.nvec))
            else:
                return np.zeros(reduce(__mul__, space.nvec))
        else:
            raise NotImplementedError

    if x is None:
        return spaces.Box(low=_flatten_bounds(space, 'low'),
                          high=_flatten_bounds(space, 'high'),
                          dtype=np.float64)
    else:
        return spaces.flatten(space, x)
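
Recent gym releases ship a helper that performs this bounds computation directly; a sketch, assuming a gym version that provides spaces.flatten_space:

import numpy as np
from gym import spaces

space = spaces.Dict({
    "a": spaces.Box(low=0.0, high=1.0, shape=(2,), dtype=np.float32),
    "b": spaces.Discrete(4),
})
flat_space = spaces.flatten_space(space)   # a Box whose bounds concatenate the sub-spaces
sample = space.sample()
assert spaces.flatten(space, sample).shape == flat_space.shape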
Example #19
 def observation(self, observation):
     return flatten(self.env.observation_space['state'], observation)
Example #20
 def test_call_network(self):
     for ob, ac in self.list_work:
         self.network(observation_space=ob, action_space=ac)(torch.tensor(
             [flatten(ob, ob.sample())]).float())
Example #21
 def observation(self, observation):
     return spaces.flatten(self.env.observation_space, observation)
Example #22
 def observation(self, observation):
     return flatten(self.observation_space, observation)
Example #23
File: dqn.py  Project: qtests/rl-workshop
    def inspect_memory(self, top_n=10, max_col=80):
        # Functions to encode/decode states
        encode_state = lambda s: tuple(
            spaces.flatten(self.env.observation_space, s))
        decode_state = lambda s: spaces.unflatten(self.env.observation_space, s)

        # Function to create barchart from counter
        def count_barchart(counter, ax, xlabel=None, normalize=True):
            # Sort and extract key, counts
            sorted_tuples = counter.most_common()
            sorted_keys = [key for key, count in sorted_tuples]
            sorted_counts = [count for key, count in sorted_tuples]

            # Normalize counts
            if normalize:
                total = sum(counter.values())
                sorted_counts = [c / total for c in sorted_counts]

            # Plotting
            x_indexes = range(len(sorted_counts))
            ax.bar(x_indexes, sorted_counts)
            ax.set_xticks(x_indexes)
            ax.set_xticklabels(sorted_keys)
            ax.set_ylabel('proportion')
            if xlabel is not None:
                ax.set_xlabel(xlabel)
            ax.set_title('Replay Memory')

        # Function to print top states from counter
        def top_states(counter):
            for i, (state, count) in enumerate(counter.most_common(top_n), 1):
                state_label = str(decode_state(state))
                state_label = state_label.replace('\n', ' ')
                state_label = state_label[:max_col] + '..' if len(
                    state_label) > max_col else state_label
                print('{:>2}) Count: {} state: {}'.format(
                    i, count, state_label))

        # Count statistics
        counters = defaultdict(Counter)
        for state, action, reward, next_state, done in self.memory:
            counters['state'][encode_state(state)] += 1
            counters['action'][action] += 1
            counters['reward'][reward] += 1
            counters['next_state'][encode_state(next_state)] += 1
            counters['done'][done] += 1

        # Plot reward/action
        fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(12, 4))
        count_barchart(counters['reward'], ax1, 'rewards')
        count_barchart(counters['action'], ax2, 'actions')
        plt.plot()
        plt.show()

        # Print top states
        print('Top state:')
        top_states(counters['state'])
        print()

        print('Top next_state:')
        top_states(counters['next_state'])
        print()

        # Done signal
        print('Proportion of done: {:.2f}%'.format(
            100 * counters['done'][True] / sum(counters['done'].values())))
Example #24
 def _extract_state(self, observation):
     user_space = self._observation_space.spaces["user"]
     return spaces.flatten(user_space, observation["user"])
Example #25
 def _get_observations(self):
   # return (self.hands[0], self.played_cards, self.scores)
   first = min([i for i in range(self.players) if self.played_cards[i][0] != 0], default=0)
   obs = (self.hands[0], self.played_cards[first:] + self.played_cards[:first], self.scores)
   return spaces.flatten(self.unflattened_observation_space, obs)
Example #26
 def observation(self, observation):
     return spaces.flatten(self.env.observation_space,
                           np.moveaxis(observation, -1, 0)) / 255.
Example #27
 def observation(self, observation):
     import ipdb
     ipdb.set_trace()
     return spaces.flatten(self.env.observation_space, observation) / 255.
Example #28
 def reset(self, **kwargs):
     if self._flatten_obs:
         return spaces.flatten(self.env.observation_space,
                               self.env.reset(**kwargs))
     else:
         return self.env.reset(**kwargs)
Example #29
 def to_flattened_numpy_data(self, player_id: int):
     return spaces.flatten(self.get_observation_space(), self.to_data_for_numpy())
Example #30
 def observation(self, observation):
     return tuple([
         spaces.flatten(obs_space, obs)
         for obs_space, obs in zip(self.env.observation_space, observation)
     ])