def _compute(self, tolerance, verbose=False):
    """Compute a metric from Q-value gaps induced by a set of value functions.

    For each value function V returned by `avf_values`, the one-step
    Q-values are Q(s, a) = rewards[s, a] + gamma * transition_probs[s, a] . V.
    The distance between states s and t is the largest |Q(s, a) - Q(t, a)|
    over all value functions and all actions. The result is stored in
    `self.metric`; timing information is stored in `self.statistics`.

    Args:
      tolerance: float, unused.
      verbose: bool, unused.
    """
    del tolerance  # Unused.
    start_time = time.time()
    num_optimization_steps = 1000
    value_functions = avf_values(
        self.env,
        num_optimization_steps=num_optimization_steps,
        num_tasks=self.num_tasks)
    # Normalise to a 2-D array whose rows are indexed one value function at a
    # time below.
    value_functions = np.squeeze(value_functions)
    value_functions = np.atleast_2d(value_functions)

    num_states = self.num_states
    num_actions = self.num_actions
    # Restrict to the index ranges the metric is defined over.
    rewards = self.env.rewards[:num_states, :num_actions]
    transition_probs = self.env.transition_probs[:num_states, :num_actions, :]
    gamma = self.env.gamma

    # Starting from zeros mirrors a running maximum initialised at 0.
    max_differences = np.zeros((num_states, num_states))
    for policy_values in value_functions:
        # Q-values depend only on (state, action), so compute them once per
        # value function instead of once per pair of states.
        q_values = rewards + gamma * np.matmul(transition_probs, policy_values)
        # Entry [s, t] is the largest per-action gap between states s and t.
        pairwise_gaps = np.max(
            np.abs(q_values[:, None, :] - q_values[None, :, :]), axis=-1)
        np.maximum(max_differences, pairwise_gaps, out=max_differences)
    self.metric = max_differences

    # We don't really have a sampled version of this.
    total_time = time.time() - start_time
    self.statistics = metric.Statistics(
        0., total_time, num_optimization_steps, 0.)
def _compute(self, tolerance, verbose=False):
    """Build the metric from pairwise gaps between the environment's Q-values.

    Entry [s, t] of `self.metric` is the maximum, along the last axis of
    `self.env.q_values`, of |q_values[s] - q_values[t]|. Nothing is iterated
    or timed, so the stored statistics are all zeros.

    Args:
      tolerance: float, unused.
      verbose: bool, unused.
    """
    del tolerance
    assert self.env.q_values is not None, 'Q-Values have not been computed.'
    q_values = self.env.q_values
    # Broadcasting a "row" view against a "column" view yields the difference
    # for every ordered pair of leading-axis entries at once.
    # NOTE(review): presumably q_values is (num_states, num_actions) - confirm.
    row_view = q_values[:, None, :]
    column_view = q_values[None, :, :]
    pairwise_gaps = np.abs(row_view - column_view)
    self.metric = np.max(pairwise_gaps, axis=-1)
    self.statistics = metric.Statistics(0., 0., 0, 0.)
def _compute(self, tolerance, verbose=False):
    """Compute exact/online lax-bisimulation metric up to specified tolerance.

    Starting from the all-zeros metric, every sweep builds, for each pair of
    distinct states (s, t), a table of state-action distances
    |r(s, a) - r(t, b)| + gamma * EMD(P(s, a), P(t, b)) under the current
    metric, and then sets the new distance between s and t to the Hausdorff
    distance over that table. Sweeps repeat until the largest entry-wise
    change falls to `tolerance` or below. The result is stored in
    `self.metric` and the run statistics in `self.statistics`.

    Note: the loop only runs while `tolerance * 2 > tolerance`, so a
    non-positive tolerance performs no sweeps and leaves the metric at zero.

    Args:
      tolerance: float, maximum difference in metric estimate between
        successive iterations. Once the difference no longer exceeds this
        threshold, computation stops.
      verbose: bool, whether to print verbose messages.
    """
    num_states = self.num_states
    num_actions = self.num_actions
    transition_probs = self.env.transition_probs
    rewards = self.env.rewards
    gamma = self.gamma
    # Initial metric is all zeros.
    curr_metric = np.zeros((num_states, num_states))
    metric_difference = tolerance * 2.
    i = 1
    exact_metric_differences = []
    start_time = time.time()
    while metric_difference > tolerance:
        new_metric = np.zeros((num_states, num_states))
        for s in range(num_states):
            # Only pairs with t > s are ever read by the Hausdorff step, so
            # state-action distances are computed for those pairs alone and
            # the result is mirrored.
            for t in range(s + 1, num_states):
                # state_action_metric[a, b] is the distance between (s, a)
                # and (t, b) under the current metric.
                state_action_metric = np.zeros((num_actions, num_actions))
                for a in range(num_actions):
                    next_state_distrib_1 = transition_probs[s, a, :]
                    rew1 = rewards[s, a]
                    for b in range(num_actions):
                        next_state_distrib_2 = transition_probs[t, b, :]
                        rew2 = rewards[t, b]
                        emd = ot.emd2(
                            next_state_distrib_1, next_state_distrib_2,
                            curr_metric)
                        state_action_metric[a, b] = (
                            abs(rew1 - rew2) + gamma * emd)
                # Hausdorff metric (Definition 5 in paper):
                # first sup_x inf_y d(x, y), then sup_y inf_x d(x, y).
                max_a = np.max(np.min(state_action_metric, axis=1))
                max_b = np.max(np.min(state_action_metric, axis=0))
                new_metric[s, t] = max(max_a, max_b)
                new_metric[t, s] = new_metric[s, t]
        metric_difference = np.max(abs(new_metric - curr_metric))
        exact_metric_differences.append(metric_difference)
        if verbose:
            logging.info('Iteration %d: %f', i, metric_difference)
        curr_metric = np.copy(new_metric)
        i += 1
    total_time = time.time() - start_time
    self.metric = curr_metric
    # `i` is the 1-based index of the sweep that would run next, so the
    # number of sweeps actually performed is one less.
    num_iterations = i - 1
    self.statistics = metric.Statistics(
        tolerance, total_time, num_iterations, exact_metric_differences)
def _compute(self, tolerance, verbose=False):
    """Compute exact/online bisimulation metric up to the specified tolerance.

    Starting from the all-zeros metric, every sweep sets the distance between
    two distinct states s and t to the maximum over actions a of
    |r(s, a) - r(t, a)| + gamma * EMD(P(s, a), P(t, a)), where the EMD ground
    cost is the current metric. Sweeps repeat until the largest entry-wise
    change falls to `tolerance` or below. The result is stored in
    `self.metric` and the run statistics in `self.statistics`.

    Note: the loop only runs while `tolerance * 2 > tolerance`, so a
    non-positive tolerance performs no sweeps and leaves the metric at zero.

    Args:
      tolerance: float, maximum difference in metric estimate between
        successive iterations. Once the difference no longer exceeds this
        threshold, computation stops.
      verbose: bool, whether to print verbose messages.
    """
    num_states = self.num_states
    num_actions = self.num_actions
    transition_probs = self.env.transition_probs
    rewards = self.env.rewards
    gamma = self.gamma
    # Initial metric is all zeros.
    curr_metric = np.zeros((num_states, num_states))
    metric_difference = tolerance * 2.
    i = 1
    exact_metric_differences = []
    start_time = time.time()
    while metric_difference > tolerance:
        new_metric = np.zeros((num_states, num_states))
        for s in range(num_states):
            # The update is symmetric in (s, t) and zero on the diagonal, so
            # each unordered pair is solved once and mirrored.
            for t in range(s + 1, num_states):
                max_distance = 0.0
                for a in range(num_actions):
                    emd = ot.emd2(
                        transition_probs[s, a, :],
                        transition_probs[t, a, :],
                        curr_metric)
                    act_distance = (
                        abs(rewards[s, a] - rewards[t, a]) + gamma * emd)
                    if act_distance > max_distance:
                        max_distance = act_distance
                new_metric[s, t] = max_distance
                new_metric[t, s] = max_distance
        metric_difference = np.max(abs(new_metric - curr_metric))
        exact_metric_differences.append(metric_difference)
        if verbose:
            logging.info('Iteration %d: %f', i, metric_difference)
        curr_metric = np.copy(new_metric)
        i += 1
    total_time = time.time() - start_time
    self.metric = curr_metric
    # `i` is the 1-based index of the sweep that would run next, so the
    # number of sweeps actually performed is one less.
    num_iterations = i - 1
    self.statistics = metric.Statistics(
        tolerance, total_time, num_iterations, exact_metric_differences)
def _compute(self, tolerance, verbose=False):
    """Partition the states into equivalence classes and derive a 0/1 metric.

    All states start in a single class. On every pass, each state is checked
    against the class it belonged to at the start of the pass using
    `self._state_matches_class`; a state that no longer matches is moved to
    the first other class it matches, or to a new singleton class. Passes
    repeat until one completes without moving any state.

    The resulting metric is 0 for two states in the same class and 1
    otherwise. It is stored in `self.metric`; the elapsed time and number of
    passes are stored in `self.statistics`. Nothing is returned.

    Args:
      tolerance: float, unused.
      verbose: bool, whether to print verbose messages.
    """
    del tolerance
    num_states = self.num_states
    iteration = 0
    start_time = time.time()
    # Every state begins in one shared class.
    equivalence_classes = [list(range(num_states))]
    state_to_class = [0] * num_states
    partition_changed = True
    while partition_changed:
        partition_changed = False
        # Sticky for the whole pass: once any class has been dropped, indices
        # have shifted, so the "skip my old class" shortcut below is disabled.
        class_removed = False
        iteration += 1
        # Work on copies so membership tests keep using the partition as it
        # stood at the start of this pass.
        updated_classes = copy.deepcopy(equivalence_classes)
        updated_assignment = copy.deepcopy(state_to_class)
        for state in range(num_states):
            starting_class = equivalence_classes[state_to_class[state]]
            if self._state_matches_class(state, starting_class):
                continue
            # The state needs a new home.
            partition_changed = True
            old_index = updated_assignment[state]
            updated_assignment[state] = -1
            # Does anything remain in the class once this state leaves it?
            remaining = [
                member for member in updated_classes[old_index]
                if member != state
            ]
            if remaining:
                updated_classes[old_index] = remaining
            else:
                # The class is now empty: drop it and shift down the index of
                # every class that came after it.
                updated_classes.pop(old_index)
                class_removed = True
                for other, class_index in enumerate(updated_assignment):
                    if class_index > old_index:
                        updated_assignment[other] = class_index - 1
            for class_index, members in enumerate(updated_classes):
                if class_removed or class_index != old_index:
                    if self._state_matches_class(state, members):
                        updated_assignment[state] = class_index
                        members.append(state)
                        break
            else:
                # No existing class accepted the state; open a new one.
                updated_classes.append([state])
                updated_assignment[state] = len(updated_classes) - 1
        equivalence_classes = copy.deepcopy(updated_classes)
        state_to_class = copy.deepcopy(updated_assignment)
        if verbose and iteration % 1000 == 0:
            tf.logging.info('Iteration {}'.format(iteration))
    # Distance is 1 everywhere except within a class, where it is 0.
    self.metric = np.ones((num_states, num_states))
    for members in equivalence_classes:
        self.metric[np.ix_(members, members)] = 0.
    total_time = time.time() - start_time
    self.statistics = metric.Statistics(-1., total_time, iteration, 0.0)