Example #1
 def _clone_to_walker(self, state, obs, reward):
     if obs is None or state is None:
         return
     # Virtual reward with respect to the new state
     indexes = np.random.choice(np.arange(self.swarm.walkers.n),
                                size=self.n_comp_add)
     n_walkers = len(indexes)
     assert n_walkers == self.n_comp_add
     w_rewards = self.swarm.walkers.states.cum_rewards[indexes]
     walkers_obs = self.swarm.walkers.env_states.observs[indexes].reshape(
         n_walkers, -1)
     distances = np.linalg.norm(walkers_obs - obs.reshape(1, -1), axis=1)
     distances = relativize(
         distances.flatten())**self.swarm.walkers.dist_scale
     distances = distances / distances.sum()
     rewards = (relativize(np.concatenate(
         [w_rewards, [reward]]))**self.swarm.walkers.reward_scale)
     rewards = rewards / rewards.sum()
     w_virt_rew = 2 - distances**rewards[:-1]
     other_ix = np.random.permutation(np.arange(n_walkers))
     other_virt_rew = 2 - distances[other_ix]**rewards[-1]
     # Clone probabilities with respect to new state
     all_virtual_rewards_are_equal = (w_virt_rew == other_virt_rew).all()
     if all_virtual_rewards_are_equal:
         clone_probs = np.zeros(n_walkers, dtype=float_type)
     else:
         clone_probs = (other_virt_rew - w_virt_rew) / w_virt_rew
         clone_probs = np.sqrt(np.clip(clone_probs, 0, 1.1))
     # Clone the new state to the selected walkers
     will_clone = clone_probs > self.swarm.walkers.random_state.random_sample(
         n_walkers)
     if will_clone.sum() == 0:
         return
     new_rewards = np.full(int(will_clone.sum()), reward, dtype=float)
     try:
         # Assign through a single fancy index; chaining two fancy indexes
         # (a[indexes][will_clone] = ...) writes into a temporary copy.
         self.swarm.walkers.states.cum_rewards[indexes[will_clone]] = new_rewards
         for ix, wc in zip(indexes, will_clone):
             if wc:
                 self.swarm.walkers.env_states.states[ix] = copy.deepcopy(
                     state)
                 self.swarm.walkers.env_states.observs[ix] = copy.deepcopy(
                     obs)
         self.swarm.walkers.update_best()
     except Exception as e:
         # Print diagnostic information about the failed clone, then re-raise.
         orig_states = self.swarm.walkers.env_states.states
         msg = "indexes: %s will_clone: %s new_states: %s states shape: %s\n"
         data = (indexes, will_clone, [], orig_states.shape)
         msg_2 = "clone_probs: %s rewards: %s reward: %s state: %s\n"
         data_2 = (clone_probs, rewards, reward, state)
         x = orig_states[indexes][will_clone]
         msg_3 = "will_clone shape: %s clone_probs shape: %s SHAPE: %s DATA: %s" % (
             will_clone.shape,
             clone_probs.shape,
             type(x),
             x,
         )
         print((msg % data) + (msg_2 % data_2) + msg_3)
         raise e
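Every example in this collection calls relativize() without defining it. The helper below is a minimal sketch of that kind of normalization, assuming the convention used in the FractalAI/fragile codebase: standardize the values, then squash the non-positive part with exp and stretch the positive part with log so every output is positive and comparable. The zero-variance branch and exact dtypes in the real library may differ.

import numpy as np


def relativize(x: np.ndarray) -> np.ndarray:
    """Map raw scores to strictly positive, comparable values (sketch)."""
    std = x.std()
    if std == 0:
        # All values are equal: return a flat vector of ones.
        return np.ones(len(x))
    standard = (x - x.mean()) / std
    positive = standard > 0
    standard[positive] = np.log1p(standard[positive]) + 1.0
    standard[~positive] = np.exp(standard[~positive])
    return standard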
Example #2
    def _evaluate_model(self, points):

        func = partial(evaluate_one_reward, y=np.array(self.y), data=pd.DataFrame(self.X))
        # chunksize=points.shape[0] // multiprocessing.cpu_count() could be passed to map().
        result = self.pool.map(func, points.tolist())
        model_score, scores, ends = tuple(zip(*result))
        ends = np.zeros_like(np.array(ends))
        scores = np.array(scores)
        score = relativize(np.array(model_score)) ** 0.5
        for i in range(scores.shape[1]):
            score = score * relativize(scores[:, i])
        entropy = np.array(score)
        #ends = score < score.mean()
        self.entropy = max(self.entropy, entropy.max())
        return score, ends
Example #3
 def calculate_virtual_reward(self):
     """Apply the virtual reward formula to account for all the different goal scores."""
     rewards = -1 * self.states.cum_rewards if self.minimize else self.states.cum_rewards
     processed_rewards = relativize(rewards)
     score_reward = processed_rewards**self.reward_scale
     score_dist = self.states.distances**self.dist_scale
     virt_rw = score_reward * score_dist
     dist_prob = score_dist / score_dist.sum()
     reward_prob = score_reward / score_reward.sum()
     total_entropy = numpy.prod(2 - dist_prob**reward_prob)
     self._min_entropy = numpy.prod(2 - reward_prob**reward_prob)
     self.efficiency = self._min_entropy / total_entropy
     self.update_states(virtual_rewards=virt_rw,
                        processed_rewards=processed_rewards)
     if self.critic is not None:
         critic_states = self.critic.calculate(
             walkers_states=self.states,
             model_states=self.model_states,
             env_states=self.env_states,
         )
         self.states.update(other=critic_states)
         virt_rew = self.states.virtual_rewards * self.states.critic
     else:
         virt_rew = self.states.virtual_rewards
     self.states.update(virtual_rewards=virt_rew)
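The "virtual reward formula" that this method and example #4 apply boils down to the product of the relativized cumulative reward and the (already relativized) distance, each raised to its own scale exponent. Below is a standalone sketch reusing the relativize sketch above; the function name and default exponents are illustrative and not part of the library API.

import numpy as np


def virtual_reward(cum_rewards: np.ndarray, distances: np.ndarray,
                   reward_scale: float = 1.0, dist_scale: float = 1.0) -> np.ndarray:
    # ``distances`` is assumed to be relativized already, as in the methods above.
    return relativize(cum_rewards) ** reward_scale * distances ** dist_scale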
Example #4
    def calculate_virtual_reward(self) -> None:
        """
        Calculate the virtual reward and update the internal state.

        The cumulative_reward is transformed with the relativize function. \
        The distances stored in the :class:`StatesWalkers` are already transformed.
        """
        processed_rewards = relativize(self.states.cum_rewards)
        virt_rw = processed_rewards**self.reward_scale * self.states.distances**self.dist_scale
        self.update_states(virtual_rewards=virt_rw,
                           processed_rewards=processed_rewards)
Example #5
    def calculate_distances(self) -> None:
        """Calculate the corresponding distance function for each observation with \
        respect to another observation chosen at random.

        The internal :class:`StatesWalkers` is updated with the relativized distance values.
        """
        # Alternative companion selection: compas_ix = self.get_alive_compas()
        compas_ix = numpy.random.permutation(numpy.arange(self.n))
        obs = self.env_states.observs.reshape(self.n, -1)
        distances = self.distance_function(obs, obs[compas_ix])
        distances = relativize(distances.flatten())
        self.update_states(distances=distances, compas_dist=compas_ix)
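self.distance_function is injected elsewhere in the class; a plausible choice compatible with the call above is a per-walker Euclidean norm between each observation and its randomly permuted companion. This is an assumption for illustration, not necessarily the library's default.

import numpy as np


def euclidean_distance(x: np.ndarray, y: np.ndarray) -> np.ndarray:
    # One distance per walker: L2 norm over the flattened observation dimensions.
    return np.linalg.norm(x - y, axis=1)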
Example #6
 def test_update_clone_probs(self, walkers):
     walkers.reset()
     walkers.states.update(virtual_rewards=relativize(np.arange(walkers.n)))
     walkers.update_clone_probs()
     assert 0 < np.sum(
         walkers.states.clone_probs == walkers.states.clone_probs[0]), (
             walkers.states.virtual_rewards,
             walkers.states.clone_probs,
         )
     walkers.reset()
     walkers.update_clone_probs()
     assert np.sum(walkers.states.clone_probs ==
                   walkers.states.clone_probs[0]) == walkers.n
     assert walkers.states.clone_probs.shape[0] == walkers.n
     assert len(walkers.states.clone_probs.shape) == 1
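update_clone_probs itself is not shown in these examples. The sketch below reproduces the comparison that example #1 performs inline: each walker looks at a randomly chosen companion and clones with a probability driven by how much larger the companion's virtual reward is. The clipping bound and the square root used in example #1 are omitted here, so treat this as an approximation of the library's rule, not its definition.

import numpy as np


def clone_probabilities(virtual_rewards: np.ndarray,
                        rng: np.random.RandomState) -> np.ndarray:
    compas_ix = rng.permutation(np.arange(len(virtual_rewards)))
    # Relative improvement of the companion over the current walker; negative
    # values (the companion is worse) clip to a zero clone probability.
    diff = virtual_rewards[compas_ix] - virtual_rewards
    probs = diff / np.maximum(virtual_rewards, 1e-8)
    return np.clip(probs, 0.0, 1.0)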
Example #7
    def calculate_distances(self) -> None:
        """Calculate the corresponding distance function for each state with \
        respect to another state chosen at random.

        The internal state is updated with the relativized distance values.

        The distance is computed on the RAM memory of the Atari emulator.
        """
        compas_ix = np.random.permutation(np.arange(self.n))
        # This unpacks RAMs from Uber Go-explore custom Montezuma environment
        rams = self.env_states.states.reshape(self.n,
                                              -1)[:, :-12].astype(np.uint8)
        vec = rams - rams[compas_ix]
        dist_ram = self.distance_function(vec, axis=1).flatten()
        distances = relativize(dist_ram)
        self.update_states(distances=distances, compas_dist=compas_ix)
Example #8
 def get_z_coords(self, swarm: Swarm, X: numpy.ndarray = None):
     if swarm is None:
         return numpy.ones(self.n_points**2)  # one z value per point of the n_points x n_points grid
     if swarm.critic.bounds is None:
         swarm.critic.bounds = Bounds.from_array(X, scale=1.1)
     # target grid to interpolate to
     xi = numpy.linspace(swarm.critic.bounds.low[0],
                         swarm.critic.bounds.high[0], self.n_points)
     yi = numpy.linspace(swarm.critic.bounds.low[1],
                         swarm.critic.bounds.high[1], self.n_points)
     xx, yy = numpy.meshgrid(xi, yi)
     grid = numpy.c_[xx.ravel(), yy.ravel()]
     if swarm.swarm.critic.warmed:
         memory_values = swarm.swarm.critic.predict(grid)
         memory_values = relativize(-memory_values)
     else:
         memory_values = numpy.arange(grid.shape[0])
     return memory_values
Example #9
 def get_z_coords(self, swarm: Swarm, X: numpy.ndarray = None):
     """Return the normalized ``cum_rewards`` of the walkers."""
     rewards: numpy.ndarray = relativize(swarm.walkers.states.cum_rewards)
     return rewards
Example #10
 def get_z_coords(self, swarm: Swarm, X: numpy.ndarray = None):
     rewards: numpy.ndarray = relativize(swarm.walkers.states.cum_rewards)
     return rewards