import numpy as np


def abstract_tf(intervals, new_state_bounds, sink):
    # Estimate the abstract transition function by spreading the probability mass
    # of every sample over the macrostates its [min, max] new-state range covers.
    adder = 1 if sink else 0
    abs_tf = np.zeros(len(intervals) + adder)
    for ns in new_state_bounds:
        min_mcrst = helper.get_mcrst(ns[0], intervals, sink)
        max_mcrst = helper.get_mcrst(ns[1], intervals, sink)
        if min_mcrst == max_mcrst:
            # The whole range falls into a single macrostate.
            abs_tf[min_mcrst] += 1
        else:
            den = ns[1] - ns[0]
            # Fraction of the range covered by the leftmost macrostate (or the sink).
            if min_mcrst == -1:
                abs_tf[min_mcrst] += (intervals[0][0] - ns[0]) / den
            else:
                abs_tf[min_mcrst] += (intervals[min_mcrst][1] - ns[0]) / den
            # Fraction covered by the rightmost macrostate (or the sink).
            if max_mcrst == len(intervals):
                abs_tf[max_mcrst] += (ns[1] - intervals[-1][1]) / den
            else:
                abs_tf[max_mcrst] += (ns[1] - intervals[max_mcrst][0]) / den
            # Every macrostate in between is fully covered.
            for i in range(min_mcrst + 1, max_mcrst):
                abs_tf[i] += (intervals[i][1] - intervals[i][0]) / den
    return helper.normalize_array(abs_tf)
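# Usage sketch for the variant above: a minimal, hypothetical stand-in for the
# repo's helper module so the function can be run in isolation. It assumes
# helper.get_mcrst returns the index of the interval containing a state, with -1
# and len(intervals) denoting the sink macrostate on either side (the sink flag
# is ignored in this stub), and that helper.normalize_array rescales to sum 1.
import numpy as np


class helper:  # stub for this example only; the real module ships with the repo
    @staticmethod
    def get_mcrst(state, intervals, sink):
        if state < intervals[0][0]:
            return -1
        if state >= intervals[-1][1]:
            return len(intervals)
        for i, (lo, hi) in enumerate(intervals):
            if lo <= state < hi:
                return i

    @staticmethod
    def normalize_array(arr):
        return arr / arr.sum()


intervals = [[0.0, 1.0], [1.0, 2.0], [2.0, 3.0]]  # three macrostates
new_state_bounds = [[0.5, 1.5], [1.2, 1.4]]       # per-sample [min, max] new state
print(abstract_tf(intervals, new_state_bounds, sink=True))
# -> [0.25 0.75 0.   0.  ]: half of sample one in mcrst 0, the rest in mcrst 1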
def abstract_tf(intervals, new_state_bounds, sink):
    adder = 1 if sink else 0
    # Interval-valued variant: entry i holds [lower, upper] bounds on the
    # probability of arriving in macrostate i.
    abs_tf = [[0, 0] for _ in range(len(intervals) + adder)]
    for ns in new_state_bounds:
        min_mcrst = helper.get_mcrst(ns[0], intervals, sink)
        max_mcrst = helper.get_mcrst(ns[1], intervals, sink)
        # The sample certainly arrives here only if its whole range falls into one macrostate.
        if min_mcrst == max_mcrst:
            abs_tf[min_mcrst][0] += 1
        # Every covered macrostate is a possible arrival, so its upper bound grows.
        for i in range(min_mcrst, max_mcrst + 1):
            abs_tf[i][1] += 1
        # Correction: with a sink, indices -1 and len(intervals) address the same
        # entry, which the loop above counted twice.
        if min_mcrst == -1 and max_mcrst == len(intervals):
            abs_tf[min_mcrst][1] -= 1
    # Normalization over the number of samples.
    den = len(new_state_bounds)
    return [[el[0] / den, el[1] / den] for el in abs_tf]
def create_arriving_mcrst_helper(self):
    for cont in self.container:
        for act in cont.keys():
            # Evaluate the effect of act on every sample in the macrostate.
            # --> We assume the Lipschitz-0 hypothesis on delta s holds, so that
            #     fictitious samples can be added! <--
            sample = cont[act]
            delta_s = sample['new_state'] - sample['state']
            self.arriving_mcrst_helper[act] = {}  # every action is a key.
            ns_index = helper.get_mcrst(sample['new_state'], self.intervals, self.sink)
            self.arriving_mcrst_helper[act][ns_index] = 1  # every index of an arriving mcrst is a key.
            # Apply the delta s of the sample to every other state in the macrostate.
            for act2 in cont.keys():
                if act != act2:
                    # evaluate act on all the other samples in the mcrst.
                    new_state = cont[act2]['state'] + delta_s
                    index = helper.get_mcrst(new_state, self.intervals, self.sink)
                    if index in self.arriving_mcrst_helper[act]:
                        self.arriving_mcrst_helper[act][index] += 1
                    else:
                        self.arriving_mcrst_helper[act][index] = 1
import numpy as np


def abstract_tf(intervals, new_state_bounds, sink):
    adder = 1 if sink else 0
    abs_tf = np.zeros(len(intervals) + adder)
    # I obtain the min & max new state I would get by performing action act in
    # the mcrst, according to the samples.
    new_st_min = min([ns[0] for ns in new_state_bounds])
    new_st_max = max([ns[1] for ns in new_state_bounds])
    min_mcrst = helper.get_mcrst(new_st_min, intervals, sink)
    max_mcrst = helper.get_mcrst(new_st_max, intervals, sink)
    if min_mcrst == max_mcrst:
        abs_tf[min_mcrst] += 1
    else:
        # Weight each macrostate by the portion of [new_st_min, new_st_max] it covers.
        if min_mcrst == -1:
            abs_tf[min_mcrst] += (intervals[0][0] - new_st_min)
        else:
            abs_tf[min_mcrst] += (intervals[min_mcrst][1] - new_st_min)
        if max_mcrst == len(intervals):
            abs_tf[max_mcrst] += (new_st_max - intervals[-1][1])
        else:
            abs_tf[max_mcrst] += (new_st_max - intervals[max_mcrst][0])
        for i in range(min_mcrst + 1, max_mcrst):
            abs_tf[i] += (intervals[i][1] - intervals[i][0])
    return helper.normalize_array(abs_tf)
def create_arriving_mcrst_helper(self):
    for cont in self.container:
        for act in cont.keys():
            # Evaluate the effect of act on every sample in the macrostate.
            # --> We assume the Lipschitz-0 hypothesis on delta s holds, so that
            #     fictitious samples can be added! <--
            sample = cont[act]
            delta_s = sample['new_state'] - sample['state']
            self.arriving_mcrst_helper[act] = {}
            # Apply the delta s of the sample to every other state in the macrostate.
            for act2 in cont.keys():
                if act != act2:
                    new_state = cont[act2]['state'] + delta_s
                    new_state_mcrst = helper.get_mcrst(new_state, self.intervals, self.sink)
                    if new_state_mcrst in self.arriving_mcrst_helper[act]:
                        self.arriving_mcrst_helper[act][new_state_mcrst] += 1
                    else:
                        self.arriving_mcrst_helper[act][new_state_mcrst] = 1
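# Shape of self.arriving_mcrst_helper after the call above (illustrative values,
# assuming scalar actions are used as dictionary keys): each action maps to a
# counter of the macrostates its fictitious successor states fall into.
#
# arriving_mcrst_helper = {
#     -0.3: {1: 3, 2: 1},   # action -0.3: 3 samples arrive in mcrst 1, 1 in mcrst 2
#      0.7: {2: 4},         # action 0.7: all 4 samples arrive in mcrst 2
# }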
def estimate_performance_abstract_policy(env, n_episodes, n_steps, abstract_policy,
                                         init_states, interv, INTERVALS):
    acc = 0
    for i in range(0, n_episodes):
        env.reset(init_states[i])
        g = 1
        for j in range(0, n_steps):
            state = env.get_state()
            if interv is not None:
                action = abstract_policy[helper.get_mcrst(state, interv, SINK)][0]
            else:
                action = abstract_policy[helper.get_mcrst(state, INTERVALS, SINK)][0]
            new_state, r, _, _ = env.step(action)
            acc += g * r
            g *= GAMMA
    return acc / n_episodes
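# What the accumulator above computes, checked in isolation with hypothetical
# numbers: the inner loop sums gamma^t * r_t over a truncated episode, and the
# outer loop averages that discounted return over episodes.
GAMMA = 0.95
rewards = [1.0, 1.0, 0.5]                        # rewards of one 3-step episode
ret = sum(GAMMA ** t * r for t, r in enumerate(rewards))
print(ret)                                       # 1.0 + 0.95 + 0.45125 = 2.40125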
import numpy as np


def sampling_abstract_optimal_pol(abs_opt_policy, det_samples, param, interv, INTERVALS):
    fictitious_samples = []
    for sam in det_samples:
        single_sample = []
        for s in sam:
            prev_action = deterministic_action(param, s[0])
            if interv is not None:
                mcrst = helper.get_mcrst(s[0], interv, SINK)
            else:
                mcrst = helper.get_mcrst(s[0], INTERVALS, SINK)
            if prev_action in abs_opt_policy[mcrst]:
                single_sample.append([s[0], prev_action])
            else:
                # Project onto the closest optimal abstract action of the macrostate.
                index = np.argmin([abs(act - prev_action) for act in abs_opt_policy[mcrst]])
                single_sample.append([s[0], abs_opt_policy[mcrst][index]])
        fictitious_samples.append(single_sample)
    return fictitious_samples
import numpy as np


def sampling_abstract_optimal_pol(abs_opt_policy, det_samples):
    fictitious_samples = []
    for sam in det_samples:
        single_sample = []
        for s in sam:
            prev_action = deterministic_action(np.reshape(s[0], (1, 1)))
            prev_action = prev_action[0]
            mcrst = helper.get_mcrst(s[0], INTERVALS, SINK)
            if prev_action in abs_opt_policy[mcrst]:
                single_sample.append([s[0], prev_action])
            else:
                # Project onto the closest optimal abstract action of the macrostate.
                index = np.argmin([abs(act - prev_action) for act in abs_opt_policy[mcrst]])
                single_sample.append([s[0], abs_opt_policy[mcrst][index]])
        fictitious_samples.append(single_sample)
    return fictitious_samples
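# The projection step in isolation (illustrative numbers): when the deterministic
# policy proposes an action that is not among the optimal abstract actions of the
# macrostate, the closest optimal action is substituted.
import numpy as np

abs_opt_actions = [-0.5, 0.1, 0.4]   # hypothetical optimal actions of one mcrst
prev_action = 0.3
index = np.argmin([abs(act - prev_action) for act in abs_opt_actions])
print(abs_opt_actions[index])        # -> 0.4, the nearest optimal action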
import cvxpy as cp


def construct_problem(self):
    self.init_operation()  # Initialize some variables of support.
    theta = cp.Variable((self.n_actions, self.i), nonneg=True)
    # Maximize the log-likelihood of theta given the arrival counts in self.I.
    objective = cp.Minimize(-cp.sum(cp.multiply(self.I, cp.log(theta))))
    constraints = []
    # Sum of rows must be equal to 1.
    for k in range(0, self.n_actions):
        constraints.append(cp.sum(theta[k]) == 1)
    # Lipschitz hypothesis between actions in the same macrostate.
    for k in range(0, self.i):
        actions_mcrst = sorted(list(self.container[k].keys()), reverse=True)
        new_mcrst_possible = []
        for act in actions_mcrst:
            new_mcrst = helper.get_mcrst(self.container[k][act]['new_state'],
                                         self.intervals, self.sink)
            if new_mcrst not in new_mcrst_possible:
                new_mcrst_possible.append(new_mcrst)
            # The helper might contain new_mcrst that are not yet included in
            # new_mcrst_possible.
            for mcrst in self.arriving_mcrst_helper[act].keys():
                if mcrst not in new_mcrst_possible:
                    new_mcrst_possible.append(mcrst)
        # Constrain consecutive (sorted) actions to differ by at most
        # L * |action gap| in every reachable macrostate.
        for i in range(0, len(actions_mcrst) - 1):
            for k2 in new_mcrst_possible:
                constraints.append(theta[self.get_id_from_action(actions_mcrst[i])][k2] -
                                   theta[self.get_id_from_action(actions_mcrst[i + 1])][k2] <=
                                   self.L * abs(actions_mcrst[i] - actions_mcrst[i + 1]))
                constraints.append(theta[self.get_id_from_action(actions_mcrst[i])][k2] -
                                   theta[self.get_id_from_action(actions_mcrst[i + 1])][k2] >=
                                   -self.L * abs(actions_mcrst[i] - actions_mcrst[i + 1]))
    problem = cp.Problem(objective, constraints)
    problem.solve(solver=cp.ECOS, abstol=1e-4, max_iters=200)
    return theta.value
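# A minimal, self-contained sketch of the same estimation with hypothetical sizes
# and counts (not the repo's data): maximize the log-likelihood of the transition
# matrix theta under the count matrix I, subject to rows summing to 1 and a
# Lipschitz bound L between the rows of two actions.
import cvxpy as cp
import numpy as np

n_actions, n_mcrst, L = 2, 3, 0.5
actions = [0.0, 1.0]                  # scalar actions, one theta row each
I = np.array([[4., 1., 1.],           # arrival counts per (action, mcrst)
              [1., 2., 3.]])

theta = cp.Variable((n_actions, n_mcrst), nonneg=True)
constraints = [cp.sum(theta[k]) == 1 for k in range(n_actions)]
gap = L * abs(actions[0] - actions[1])
for k2 in range(n_mcrst):
    constraints += [theta[0][k2] - theta[1][k2] <= gap,
                    theta[0][k2] - theta[1][k2] >= -gap]

problem = cp.Problem(cp.Minimize(-cp.sum(cp.multiply(I, cp.log(theta)))), constraints)
problem.solve(solver=cp.ECOS)
print(theta.value)   # rows sum to 1 and differ by at most L = 0.5 per entry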
import numpy as np


def fill_I(self):
    matrix_i = np.zeros((self.n_actions, self.i))
    # Count, for every action, the macrostates its real samples arrive in.
    for cont in self.container:
        for act, single_sample in cont.items():
            new_mcrst = helper.get_mcrst(single_sample['new_state'],
                                         self.intervals, self.sink)
            # I assume that all the actions are different.
            matrix_i[self.get_id_from_action(act)][new_mcrst] += 1
    # Contribution of the fictitious samples.
    for act in self.arriving_mcrst_helper.keys():
        for mcrst in self.arriving_mcrst_helper[act].keys():
            matrix_i[self.get_id_from_action(act)][mcrst] += self.arriving_mcrst_helper[act][mcrst]
    self.I.value = matrix_i
import numpy as np


def sampling_abstract_optimal_pol(abs_opt_policy, det_samples, param):
    fictitious_samples = []
    for sam in det_samples:
        single_sample = []
        for s in sam:
            # Avoid including the sink state in the fictitious samples.
            if s[2] != 0:
                prev_action = deterministic_action(param, s[0])
                mcrst = helper.get_mcrst(s[0], INTERVALS, SINK)
                if prev_action in abs_opt_policy[mcrst]:
                    single_sample.append([s[0], prev_action])
                else:
                    # Project onto the closest optimal abstract action of the macrostate.
                    index = np.argmin([abs(act - prev_action)
                                       for act in abs_opt_policy[mcrst]])
                    single_sample.append([s[0], abs_opt_policy[mcrst][index]])
        fictitious_samples.append(single_sample)
    return fictitious_samples
def divide_samples(self, samples, problem, seed, intervals=None):
    if intervals is not None:
        self.intervals = intervals
    # container is an array of dictionaries.
    # Every dict has the actions as keys and another dict as value.
    # The inner dict has 'state', 'new_state', 'abs_reward', 'abs_tf' as keys.
    self.container = self.init_container()
    if self.sink:
        self.container.append({})
    for sam in samples:
        for i, s in enumerate(sam):
            # every s is an array with this shape: ['state', 'action', 'reward', 'new_state']
            mcrst = helper.get_mcrst(s[0], self.intervals, self.sink)
            self.container[mcrst][s[1]] = {'state': s[0], 'new_state': s[3]}
    # Subsample overly populated macrostates to avoid a slow computation.
    help_obj = Helper(seed)
    self.container = [help_obj.big_mcrst_correction(cont)
                      if len(cont) > helper.MAX_SAMPLES_IN_MCRST else cont
                      for cont in self.container]
    # Calculate the abstract reward for every sample.
    if problem == 'lqg1d':
        reward_func = helper.calc_abs_reward_lqg
    elif problem == 'cartpole1d':
        reward_func = helper.calc_abs_reward_cartpole
    elif problem == 'minigolf':
        reward_func = helper.calc_abs_reward_minigolf
    else:
        raise ValueError("Unknown problem: {}".format(problem))
    for cont in self.container:
        for act in cont.keys():
            cont[act]['abs_reward'] = reward_func(cont, act)
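# Shape of self.container after the call above (illustrative values): one dict
# per macrostate, keyed by action; 'abs_tf' is attached later, when the abstract
# transition function is estimated.
#
# container = [
#     {-0.2: {'state': 0.4, 'new_state': 0.7, 'abs_reward': -1.3}},  # mcrst 0
#     {0.5:  {'state': 1.1, 'new_state': 1.9, 'abs_reward': -0.8},
#      0.1:  {'state': 1.6, 'new_state': 1.2, 'abs_reward': -0.5}},  # mcrst 1
# ]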