def test_states_from_data(self, env_data, batch_size, states_dim):
    env, model_states = env_data
    states = judo.zeros((batch_size, states_dim))
    observs = judo.ones((batch_size, states_dim))
    rewards = judo.arange(batch_size)
    oobs = judo.zeros(batch_size, dtype=dtype.bool)
    state = env.states_from_data(
        batch_size=batch_size,
        states=states,
        observs=observs,
        rewards=rewards,
        oobs=oobs,
    )
    assert isinstance(state, StatesEnv)
    for val in state.vals():
        assert dtype.is_tensor(val)
        assert len(val) == batch_size
def params_to_arrays(param_dict: StateDict, n_walkers: int) -> Dict[str, Tensor]:
    """
    Create a dictionary containing the arrays specified by param_dict.

    Args:
        param_dict: Dictionary defining the attributes of the tensors.
        n_walkers: Number of items in the first dimension of the data tensors.

    Returns:
        Dictionary with the same keys as param_dict, containing arrays specified \
        by `param_dict` values.

    """
    tensor_dict = {}
    for key, val in param_dict.items():
        # If a shape is provided it already includes the number of walkers;
        # strip the walkers axis to obtain the per-walker size.
        shape = val.get("shape")
        if shape is None:
            val_size = val.get("size")
        elif len(shape) > 1:
            val_size = shape[1:]
        else:
            val_size = val.get("size")
        # Create the appropriate shape using the current number of walkers.
        sizes = n_walkers if val_size is None else tuple([n_walkers]) + val_size
        if "size" in val:
            del val["size"]
        if "shape" in val:
            del val["shape"]
        tensor_dict[key] = judo.zeros(sizes, **val)
    return tensor_dict
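# Usage sketch (illustrative, not from the source): shows how a param_dict
# maps to zero-initialized arrays with a leading walkers dimension. The keys
# and dtypes below are hypothetical.
param_dict = {
    "observs": {"size": (3,), "dtype": dtype.float},
    "oobs": {"dtype": dtype.bool},
}
tensors = params_to_arrays(param_dict, n_walkers=5)
assert tensors["observs"].shape == (5, 3)
assert tensors["oobs"].shape == (5,)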
def reset(self, batch_size: int = 1, **kwargs) -> StatesEnv:
    """
    Reset the :class:`Function` to the start of a new episode and return \
    a :class:`StatesEnv` instance describing its internal state.

    Args:
        batch_size: Number of walkers that the returned state will have.
        **kwargs: Ignored. This environment resets without using any external data.

    Returns:
        :class:`StatesEnv` instance describing the state of the :class:`Function`. \
        The first dimension of the data tensors (number of walkers) will be \
        equal to batch_size.

    """
    oobs = judo.zeros(batch_size, dtype=judo.bool)
    new_points = self.sample_bounds(batch_size=batch_size)
    rewards = self.function(new_points).flatten()
    new_states = self.states_from_data(
        states=new_points,
        observs=new_points,
        rewards=rewards,
        oobs=oobs,
        batch_size=batch_size,
    )
    return new_states
def minimize_batch(self, x: typing.Tensor) -> Tuple[typing.Tensor, typing.Tensor]:
    """
    Minimize a batch of points.

    Args:
        x: Array representing a batch of points to be optimized, stacked \
           across the first dimension.

    Returns:
        Tuple of arrays containing the local optimum found for each point, \
        and an array with the values assigned to each of the points found.

    """
    x = judo.to_numpy(judo.copy(x))
    with Backend.use_backend("numpy"):
        result = judo.zeros_like(x)
        rewards = judo.zeros((x.shape[0], 1))
        for i in range(x.shape[0]):
            new_x, reward = self.minimize_point(x[i, :])
            result[i, :] = new_x
            rewards[i, :] = float(reward)
    self.bounds.high = tensor(self.bounds.high)
    self.bounds.low = tensor(self.bounds.low)
    result, rewards = tensor(result), tensor(rewards)
    return result, rewards
def test_points_in_bounds(self, bounds_fixture):
    zeros = judo.zeros((3, 3))
    assert all(bounds_fixture.points_in_bounds(zeros))
    tens = judo.ones((3, 3)) * 10.0
    res = bounds_fixture.points_in_bounds(tens)
    assert not res.any(), (res, tens)
    tens = tensor([[-10, 0, 1], [0, 0, 0], [10, 10, 10]])
    assert sum(bounds_fixture.points_in_bounds(tens)) == 1
def test_step(self, function_env, batch_size):
    states = function_env.reset(batch_size=batch_size)
    actions = StatesModel(
        actions=judo.zeros(states.observs.shape),
        batch_size=batch_size,
        dt=judo.ones((1, 2)),
    )
    new_states: StatesEnv = function_env.step(actions, states)
    assert isinstance(new_states, StatesEnv)
    assert new_states.oobs[0].item() == 0
def small_tree():
    node_data = {"a": judo.arange(10), "b": judo.zeros(10)}
    edge_data = {"c": judo.ones(10)}
    g = networkx.DiGraph()
    for i in range(8):
        g.add_node(to_node_id(i), **node_data)
    pairs = [(0, 1), (1, 2), (2, 3), (2, 4), (2, 5), (3, 6), (3, 7)]
    for a, b in pairs:
        g.add_edge(to_node_id(a), to_node_id(b), **edge_data)
    return g
def test_calculate_end_condition(self, walkers):
    walkers.reset()
    walkers.env_states.update(oobs=judo.ones(walkers.n, dtype=dtype.bool))
    assert walkers.calculate_end_condition()
    walkers.env_states.update(oobs=judo.zeros(walkers.n, dtype=dtype.bool))
    assert not walkers.calculate_end_condition()
    walkers.max_epochs = 10
    walkers._epoch = 8
    assert not walkers.calculate_end_condition()
    walkers._epoch = 11
    assert walkers.calculate_end_condition()
def test_get_best_index(self, walkers):
    # Rewards = [1, 1, ...]  InBounds = [0, 0, ...]
    walkers.states.update(
        cum_rewards=judo.ones(walkers.n),
        in_bounds=judo.zeros(walkers.n, dtype=dtype.bool),
    )
    best_idx = walkers.get_best_index()
    # If there are no in-bounds rewards, the last walker is returned.
    assert best_idx == walkers.n - 1

    # Some out-of-bounds rewards.
    # Rewards = [0, 1, 0, ...]  InBounds = [0, 1, ...]
    oobs_best_idx = 1
    oobs_rewards = judo.zeros(walkers.n)
    oobs_rewards[oobs_best_idx] = 1
    some_oobs = judo.zeros(walkers.n)
    some_oobs[oobs_best_idx] = 1
    walkers.states.update(
        cum_rewards=oobs_rewards,
        in_bounds=judo.astype(some_oobs, dtype.bool),
    )
    best_idx = walkers.get_best_index()
    assert best_idx == oobs_best_idx

    # If the walkers are minimizing, set all but one reward to 1.0.
    # If the walkers are maximizing, set all but one reward to 0.0.
    positive_val = 0.0 if walkers.minimize else 1.0
    negative_val = 1.0 if walkers.minimize else 0.0
    # Rewards = [-, +, -, -, -, ...]  InBounds = [1, ...]
    mixed_rewards = judo.full((walkers.n,), fill_value=negative_val, dtype=dtype.float)
    mixed_best = 1  # Could be any index.
    mixed_rewards[mixed_best] = positive_val
    walkers.states.update(
        cum_rewards=mixed_rewards,
        in_bounds=judo.ones(walkers.n, dtype=dtype.bool),
    )
    best_idx = walkers.get_best_index()
    assert best_idx == mixed_best
def test_clone(self, states_class):
    batch_size = 10
    states = states_class(batch_size=batch_size)
    states.miau = judo.arange(states.n)
    states.miau_2 = judo.arange(states.n)
    will_clone = judo.zeros(states.n, dtype=judo.bool)
    will_clone[3:6] = True
    compas_ix = tensor(list(range(states.n))[::-1])
    states.clone(will_clone=will_clone, compas_ix=compas_ix)
    target_1 = judo.arange(10)
    assert bool(judo.all(target_1 == states.miau)), (target_1 - states.miau, states_class)
def _accumulate_and_update_rewards(self, rewards: Tensor):
    """
    Use as reward either the sum of all the rewards received during the \
    current run, or only the last reward value received.

    Args:
        rewards: Array containing the last rewards received by every walker.

    """
    if self._accumulate_rewards:
        if self.states.get("cum_rewards") is None:
            cum_rewards = judo.zeros(rewards.shape[0])
        else:
            cum_rewards = self.states.cum_rewards
        cum_rewards = cum_rewards + rewards
    else:
        cum_rewards = rewards
    self.update_states(cum_rewards=cum_rewards)
def test_accumulate_rewards(self, walkers):
    walkers.reset()
    walkers._accumulate_rewards = True
    # Override the array of floats and set cum_rewards to None.
    walkers.states.update(cum_rewards=[0, 0])
    walkers.states.update(cum_rewards=None)
    rewards = judo.arange(len(walkers))
    walkers._accumulate_and_update_rewards(rewards)
    assert (walkers.states.cum_rewards == rewards).all()

    walkers._accumulate_rewards = False
    walkers.states.update(cum_rewards=judo.zeros(len(walkers)))
    rewards = judo.arange(len(walkers))
    walkers._accumulate_and_update_rewards(rewards)
    assert (walkers.states.cum_rewards == rewards).all()

    walkers._accumulate_rewards = True
    walkers.states.update(cum_rewards=judo.ones(len(walkers)))
    rewards = judo.arange(len(walkers))
    walkers._accumulate_and_update_rewards(rewards)
    assert (walkers.states.cum_rewards == rewards + 1).all()
def update_clone_probs(self) -> None:
    """
    Calculate the new probability of cloning for each walker.

    Updates the :class:`StatesWalkers` with both the probability of cloning \
    and the index of the randomly chosen companions that were selected to \
    compare the virtual rewards.
    """
    all_virtual_rewards_are_equal = (
        self.states.virtual_rewards == self.states.virtual_rewards[0]
    ).all()
    if all_virtual_rewards_are_equal:
        clone_probs = judo.zeros(self.n, dtype=dtype.float)
        compas_ix = judo.arange(self.n)
    else:
        compas_ix = self.get_in_bounds_compas()
        companions = self.states.virtual_rewards[compas_ix]
        # This value can be negative!
        clone_probs = (companions - self.states.virtual_rewards) / self.states.virtual_rewards
    self.update_states(clone_probs=clone_probs, compas_clone=compas_ix)
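# A worked numeric sketch of the clone-probability formula above, using
# arbitrary virtual rewards and hand-picked companions for illustration only.
virtual_rewards = tensor([1.0, 2.0, 4.0])
compas_ix = tensor([2, 0, 1])
companions = virtual_rewards[compas_ix]  # -> [4.0, 1.0, 2.0]
clone_probs = (companions - virtual_rewards) / virtual_rewards
# clone_probs -> [3.0, -0.5, -0.5]; a negative value means the walker has a
# higher virtual reward than its companion and will not clone to it.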
def reset(self):
    """Clear the internal data of the class."""
    params = self.get_params_dict()
    other_attrs = [name for name in self.keys() if name not in params]
    for attr in other_attrs:
        setattr(self, attr, None)
    self.update(
        id_walkers=judo.zeros(self.n, dtype=judo.hash_type),
        compas_dist=judo.arange(self.n),
        compas_clone=judo.arange(self.n),
        processed_rewards=judo.zeros(self.n, dtype=judo.float),
        cum_rewards=judo.zeros(self.n, dtype=judo.float),
        virtual_rewards=judo.ones(self.n, dtype=judo.float),
        distances=judo.zeros(self.n, dtype=judo.float),
        clone_probs=judo.zeros(self.n, dtype=judo.float),
        will_clone=judo.zeros(self.n, dtype=judo.bool),
        in_bounds=judo.ones(self.n, dtype=judo.bool),
    )
def sample_bounds(self, batch_size: int) -> typing.Tensor:
    """
    Return a matrix of points sampled uniformly from the :class:`Function` \
    domain.

    Args:
        batch_size: Number of points that will be sampled.

    Returns:
        Array containing ``batch_size`` points that lie inside the \
        :class:`Function` domain, stacked across the first dimension.

    """
    new_points = judo.zeros(tuple([batch_size]) + self.shape, dtype=judo.float32)
    for i in range(batch_size):
        # Sample in float32, then cast back to the dtype of the bounds.
        values = self.random_state.uniform(
            low=judo.astype(self.bounds.low, judo.float32),
            high=judo.astype(self.bounds.high, judo.float32),
        )
        values = judo.astype(values, self.bounds.low.dtype)
        new_points[i, :] = values
    return new_points
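# Hypothetical usage sketch (not from the source): `env` stands for any
# Function-like instance exposing `sample_bounds`, `shape`, and `bounds`.
points = env.sample_bounds(batch_size=4)
assert points.shape == (4,) + env.shape
assert all(env.bounds.points_in_bounds(points))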
def fai_iteration(
    observs: Tensor,
    rewards: Tensor,
    oobs: Tensor = None,
    dist_coef: float = 1.0,
    reward_coef: float = 1.0,
    eps: float = 1e-8,
    other_reward: Tensor = 1.0,
):
    """Perform a FAI iteration: compute the virtual reward of each walker and \
    derive the companion indices and cloning decisions from it."""
    oobs = oobs if oobs is not None else judo.zeros(rewards.shape, dtype=dtype.bool)
    virtual_reward = calculate_virtual_reward(
        observs,
        rewards,
        oobs,
        dist_coef=dist_coef,
        reward_coef=reward_coef,
        other_reward=other_reward,
    )
    compas_ix, will_clone = calculate_clone(virtual_rewards=virtual_reward, oobs=oobs, eps=eps)
    return compas_ix, will_clone
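# A minimal sketch of running one FAI iteration on toy data; the observation
# and reward values are arbitrary and chosen only for illustration.
observs = judo.astype(judo.arange(16), dtype.float).reshape(8, 2)
rewards = judo.astype(judo.arange(8), dtype.float)
compas_ix, will_clone = fai_iteration(observs=observs, rewards=rewards)
assert len(compas_ix) == 8 and len(will_clone) == 8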
def test_from_judo(self, backend):
    x = judo.zeros((10, 10))
    assert judo.sqrt(x).sum() == 0
def best_state(self):
    """Return the best state of the function, which lies at the origin."""
    return judo.zeros(self.shape)
def reset(self):
    """Reset the data of the :class:`StepStatesWalkers`."""
    super(StepStatesWalkers, self).reset()
    self.update(
        init_actions=judo.zeros((len(self), 1)),
        init_dt=judo.ones((len(self), 1)),
    )
def __init__(
    self,
    n_walkers: int,
    env_state_params: StateDict,
    model_state_params: StateDict,
    reward_scale: float = 1.0,
    distance_scale: float = 1.0,
    max_epochs: int = None,
    accumulate_rewards: bool = True,
    distance_function: Optional[DistanceFunction] = None,
    ignore_clone: Optional[Dict[str, Set[str]]] = None,
    critic: Optional[BaseCritic] = None,
    minimize: bool = False,
    reward_limit: float = None,
    fix_best: bool = True,
    **kwargs,
):
    """
    Initialize a :class:`Walkers`.

    Args:
        n_walkers: Number of walkers of the instance.
        env_state_params: Dictionary to instantiate the States of an \
                          :class:`Environment`.
        model_state_params: Dictionary to instantiate the States of a :class:`Model`.
        reward_scale: Regulates the importance of the reward. Recommended to \
                      keep in the [0, 5] range. Higher values correspond to \
                      higher importance.
        distance_scale: Regulates the importance of the distance. Recommended to \
                        keep in the [0, 5] range. Higher values correspond to \
                        higher importance.
        max_epochs: Maximum number of iterations that the walkers are allowed \
                    to perform.
        accumulate_rewards: If ``True`` the rewards obtained after transitioning \
                            to a new state will accumulate. If ``False`` only the \
                            last reward will be taken into account.
        distance_function: Function to compute the distances between two \
                           groups of walkers. It will be applied row-wise \
                           to the walkers' observations and it will return a \
                           vector of scalars. Defaults to the L2 norm.
        ignore_clone: Dictionary containing the attribute values that will \
                      not be cloned. Its keys can be either "env" or \
                      "model", to reference the `env_states` and the \
                      `model_states`. Its values are a set of strings with \
                      the names of the attributes that will not be cloned.
        critic: :class:`Critic` that will be used to calculate custom rewards.
        minimize: If ``True`` the algorithm will perform a minimization \
                  process. If ``False`` it will be a maximization process.
        reward_limit: The algorithm run will stop after reaching this \
                      reward value. If you are running a minimization process \
                      it will be considered the minimum reward possible, and \
                      if you are maximizing a reward it will be the maximum \
                      value.
        fix_best: If ``True``, override the last walker of the Swarm with the \
                  best walker at the beginning of each epoch.
        kwargs: Additional attributes stored in the :class:`StatesWalkers`.

    """
    # Add data specific to the child class to the StatesWalkers as new attributes.
    if critic is not None:
        kwargs["critic_score"] = kwargs.get("critic_score", judo.zeros(n_walkers))
    self.dtype = dtype.float
    best_state, best_obs, best_reward, best_id = (None, None, numpy.NINF, None)
    super(Walkers, self).__init__(
        n_walkers=n_walkers,
        env_state_params=env_state_params,
        model_state_params=model_state_params,
        reward_scale=reward_scale,
        distance_scale=distance_scale,
        max_epochs=max_epochs,
        accumulate_rewards=accumulate_rewards,
        distance_function=distance_function,
        ignore_clone=ignore_clone,
        best_reward=best_reward,
        best_obs=best_obs,
        best_state=best_state,
        best_id=best_id,
        **kwargs,
    )
    self.critic = critic
    self.minimize = minimize
    self.efficiency = 0
    self._min_entropy = 0
    if reward_limit is None:
        reward_limit = numpy.NINF if self.minimize else numpy.inf
    self.reward_limit = reward_limit
    self.clone_to_best = fix_best
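# Hypothetical instantiation sketch, assuming 2-dimensional states and
# observations; the state-param dictionaries follow the StateDict convention
# used by params_to_arrays above and are illustrative only.
env_params = {
    "states": {"size": (2,), "dtype": dtype.float},
    "observs": {"size": (2,), "dtype": dtype.float},
    "rewards": {"dtype": dtype.float},
    "oobs": {"dtype": dtype.bool},
}
model_params = {
    "actions": {"size": (2,), "dtype": dtype.float},
    "dt": {"size": (1,), "dtype": dtype.float},
}
walkers = Walkers(
    n_walkers=16,
    env_state_params=env_params,
    model_state_params=model_params,
    minimize=True,
)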