def step(self, a):
    """Advance one timestep from the current state taking action a.

    Returns the classic gym 4-tuple ``(state, reward, done, info)`` with
    ``info`` carrying the sampled transition probability.
    """
    transitions = self.P[self.s][a]
    # Sample one outgoing transition weighted by its probability t[0].
    i = discrete.categorical_sample([t[0] for t in transitions], self.np_random)
    p, s, r, d = transitions[i]
    row = s // self.nrow
    # NOTE(review): the column is derived with nrow rather than ncol — this is
    # only correct for a square grid (nrow == ncol); confirm map shapes.
    col = s - (row * self.nrow)
    letter = self.desc[row, col]
    # Special tiles keep the agent moving until it settles:
    #   'L'/'D'/'R'/'U' force another move in that direction,
    #   'C' repeats the direction the agent last moved in.
    while letter in b'CLDRU':
        last_s = s
        if letter in b'LDRU':
            a = ACTIONS[letter]
            p, s, r, d = self.P[s][a][0]
        elif letter == b'C':
            p, s, r, d = self.P[s][a][
                0]  # 'C' makes the agent continue in the direction it last moved in
        row = s // self.nrow
        col = s - (row * self.nrow)
        letter = self.desc[row, col]
        if (
                last_s == s
        ):  # If we tried moving again but it put us at the same state, we tried to walk off the edge of the map and should stop trying
            break
    if letter == b'W':  # When we hit a wall, we walk into it then step back out
        reverse_a = (a + 2) % 4
        p, s, r, d = self.P[s][reverse_a][0]
    self.s = s
    self.lastaction = a
    return (int(s), r, d, {"prob": p})
def reset(self):
    """Sample a start state from the (prob, state) initial distribution.

    Returns the new state, stored as a tuple in ``self.s``.
    """
    self.lastaction = None
    weights = [pair[0] for pair in self.isd]
    chosen = self.isd[discrete.categorical_sample(weights, self.np_random)]
    self.s = tuple(chosen[1])
    return self.s
def reset(self):
    """Reset the flow-scheduling episode.

    Resamples the start state, clears flow bookkeeping, and regenerates the
    random per-(action, state) service-rate matrix (nA x nS).
    """
    self.s = discrete.categorical_sample(self.isd, self.np_random)
    self.lastaction = None
    self.rm_size = []  # remaining sizes of in-flight flows
    self.flow_time_link = 0
    self.num_flows = 0
    # NOTE(review): reseeding with a constant on every reset makes each
    # episode's self.np_random stream identical — but the weights below draw
    # from the *global* np.random stream, which seed() may not touch; confirm
    # which RNG is intended here.
    self.seed(7)
    # Per-action weight rows ~ U(mean - 0.1, mean + 0.1); means here are
    # 0.9 / 0.8 / 0.6 (the companion __init__ uses 0.9 / 0.7 / 0.5 —
    # confirm which set is intended).
    wt = [[0.2 * (np.random.random() - 0.5) + 0.9 for i in range(self.nS)]
          for j in range(self.nA)]
    for j in range(self.nA):
        if j == 1:
            wt[j][0:self.nS] = [
                0.2 * (np.random.random() - 0.5) + 0.8 for i in range(self.nS)
            ]
        if j == 2:
            wt[j][0:self.nS] = [
                0.2 * (np.random.random() - 0.5) + 0.6 for i in range(self.nS)
            ]
    # rate[a][s] = wt[a][s] * bandwidth_cap[s]
    self.rate = np.matmul(wt, np.diag(self.bandwidth_cap))
    return self.s
def step(self, a):
    """Sample a successor of the current state under action a.

    Returns the gym 4-tuple ``(state, reward, done, {"prob": p})``.
    """
    options = self.P[self.s][a]
    # Weight each candidate transition by its probability.
    idx = discrete.categorical_sample([t[0] for t in options], self.np_random)
    prob, nxt, reward, done = options[idx]
    self.s = nxt
    self.lastaction = a
    return (nxt, reward, done, {"prob": prob})
def step(self, a):
    """Sample one transition from the current state under action a.

    Empty transition entries are treated as zero-probability outcomes.
    Returns ``(state, reward, done, {"prob": p})`` with the state as int.
    """
    options = self.P[self.s][a]
    # Guard against empty tuples in the transition list: weight them 0.
    weights = [0 if not t else t[0] for t in options]
    idx = discrete.categorical_sample(weights, self.np_random)
    prob, nxt, reward, done = options[idx]
    self.s = nxt
    self.last_action = a
    return int(nxt), reward, done, {"prob": prob}
def reset(self):
    """Rebuild the map and transition model, then resample the start state.

    Also restores the per-episode step budget. Returns the new state.
    """
    # Regenerate the map first: the transition matrix depends on it.
    self.desc = self._set_start()
    self.p, self.isd = self._create_transition_matrix()
    self.last_action = None
    self.remaining_steps = MAX_STEPS
    self.s = categorical_sample(self.isd, self.np_random)
    return self.s
def step(self, a):
    """Sample a transition, enforce the episode timeout, and return it.

    Returns ``(state, reward, done, info)``; ``info`` is the transition's
    info dict augmented with the sampled probability under key ``"prob"``,
    and ``done`` is forced true once ``nsteps`` exceeds ``timeout``.
    """
    transitions = self.P[self.s][a]
    i = discrete.categorical_sample([t[0] for t in transitions], self.np_random)
    p, s, r, d, info = transitions[i]
    self.s = s
    self.lastaction = a
    # Copy before mutating: the info dict is shared with the transition table
    # in self.P, so updating it in place would leak "prob" (from a previous
    # sample) into the stored transitions across steps.
    info = dict(info)
    info["prob"] = p
    self.nsteps += 1
    # np.logical_or accepts array-valued done flags as well as plain bools.
    d = np.logical_or(d, self.nsteps > self.timeout)
    return s, r, d, info
def step(self, a):
    """Step in an env whose action space varies per state.

    Raises ValueError when ``a`` is not valid in the current state's space.
    Returns ``(state, reward, done, {"prob": p})``.
    """
    current_space = self.action_space.spaces[self.s]
    if not current_space.contains(a):
        raise ValueError(
            f"Action must be < {self.action_space.spaces[self.s].n} in space {self.s}, attempted {a}"
        )
    options = self.P[self.s][a]
    pick = categorical_sample([t[0] for t in options], self.np_random)
    prob, nxt, reward, done = options[pick]
    # Record the (state, action) pair for rendering before moving on.
    self.lastaction = (self.s, a)
    self.s = nxt
    return (nxt, reward, done, {"prob": prob})
def step(self, a):
    """Advance one step; the episode is forced done when the budget runs out.

    Returns ``(state, reward, done, {"prob": p})``.
    """
    options = self.p[self.s][a]
    pick = categorical_sample([t[0] for t in options], self.np_random)
    prob, nxt, reward, done = options[pick]
    self.s = nxt
    self.last_action = a
    # Consume one unit of the per-episode step budget.
    self.remaining_steps -= 1
    if self.remaining_steps <= 0:
        done = True
    return nxt, reward, done, {"prob": prob}
def __init__(self, nS, vA, P, isd):
    """Discrete env where each state has its own number of actions.

    Args:
        nS: number of states.
        vA: per-state action counts (sequence of length nS, nonnegative).
        P: transition table, ``P[s][a]`` -> list of transition tuples.
        isd: initial state distribution over the nS states.

    Raises:
        ValueError: if any entry of ``vA`` is negative.
    """
    self.P = P
    self.isd = isd
    self.lastaction = None  # for rendering
    self.nS = nS
    self.vA = np.array(vA)
    # Validate with an explicit raise rather than assert: asserts are
    # stripped when Python runs with -O, silently skipping the check.
    if not (self.vA >= 0).all():
        raise ValueError("Number of actions per state must be nonnegative.")
    self.observation_space = spaces.Discrete(self.nS)
    # One Discrete space per state, bundled into a Tuple space.
    self.action_space = spaces.Tuple(tuple(spaces.Discrete(nA) for nA in self.vA))
    self.seed()
    self.s = categorical_sample(self.isd, self.np_random)
def step(self, a):
    """Sample a transition and compute the reward from the move geometry.

    Returns ``(observation, reward, done, {"prob": p})`` where the
    observation may be offset by nS/2 (see note below).
    """
    transitions = self.P[self.s][a]
    i = categorical_sample([t[0] for t in transitions], self.np_random)
    p, new_s, r, d = transitions[i]
    row, col = self.coordinates(self.s)
    new_row, new_col = self.coordinates(new_s)
    new_letter = self.desc[new_row, new_col]
    # Here r is a callable: reward is a function of the tile entered and the
    # (row, col) -> (new_row, new_col) move, not a stored scalar.
    _r = r(new_letter, row, col, new_row, new_col)
    self.s = new_s
    self.lastaction = a
    # Fold the adjacent-goal flag into the observation by offsetting it.
    # NOTE(review): true division — whenever exists_adjacent_goal is nonzero
    # the observation becomes a float; confirm whether // was intended.
    new_s = new_s + self.exists_adjacent_goal * self.nS / 2
    return (new_s, _r, d, {"prob": p})
def step(self, action):
    """Advance one step; enforces the wait-while-alert rule.

    Returns ``(state, reward, done, prob)`` — note the probability is
    returned bare, not wrapped in an info dict like the sibling envs.
    """
    assert self.action_space.contains(action)
    # Domain rule: while an alert is in progress (current_state[1] strictly
    # between 1 and max_steps_from_alert), only the wait action (0) is legal;
    # the boundary values 1 and max_steps_from_alert allow any action.
    # NOTE(review): assert is stripped under python -O; consider raising.
    assert (1 < self.current_state[1] < self.max_steps_from_alert and action == 0) \
        or self.current_state[1] == 1 \
        or self.current_state[1] == self.max_steps_from_alert, \
        'Must choose wait action if alert is triggered'
    self.last_action = action
    transitions = self.P[self.current_state][action]
    i = categorical_sample([t[0] for t in transitions], self.np_random)
    prob, self.current_state, reward, done = transitions[i]
    return self.current_state, reward, done, prob
def step(self, a: int):
    """Joint multi-agent step: sample each agent's move, combine the results.

    Returns ``(state, reward, done, info)`` where info carries the joint
    transition probability and a collision flag.
    """
    locations = self.state_to_locations(self.s)
    # Terminal states are absorbing: nothing moves, zero reward.
    if self.is_terminal(locations):
        return self.s, 0, True, {"prob": 0}

    # Decompose the joint action and state into per-agent pieces.
    actions = [
        ACTIONS_TO_INT[one]
        for one in integer_action_to_vector(a, self.n_agents)
    ]
    local_states = tuple(self.loc_to_int[loc] for loc in locations)
    # movements[k] is the list of (s, s', p) outcomes available to agent k.
    movements = [
        self.single_agent_movements(local_states[agent], actions[agent])
        for agent in range(self.n_agents)
    ]

    # Sample one outcome per agent, in agent order, accumulating the joint
    # probability as the product of the independent per-agent choices.
    sampled = []
    joint_prob = 1
    for options in movements:
        choice = categorical_sample([t[2] for t in options], self.np_random)
        picked = options[choice]
        sampled.append(picked[1])
        joint_prob *= picked[2]
    next_local_states = tuple(sampled)

    # Map the sampled local states back to locations, then to a joint state.
    new_state = self.locations_to_state(
        tuple(self.valid_locations[ls] for ls in next_local_states))
    reward, done, collision = self.calc_transition_reward_from_local_states(
        local_states, a, next_local_states)
    self.s = new_state
    return new_state, reward, done, {
        "prob": joint_prob,
        "collision": collision
    }
def __init__(self):
    """Flow-scheduling toy env: nS links, nA priority classes.

    Builds a uniform transition table ``P[s][a] -> [(1/nS, next_s), ...]``
    and a random per-(action, state) service-rate matrix.
    """
    self.nS = 20
    self.nA = 3
    # Uniform initial-state distribution.
    self.isd = [1 / self.nS for x in range(self.nS)]
    self.nF = 10  # total number of flows arriving per episode
    self.rm_size = []  # remaining sizes of in-flight flows
    self.num_flows = 0
    self.flow_time_link = 0
    self.cum_flowtime = 0
    self.lastaction = None
    self.action_space = spaces.Discrete(self.nA)
    self.observation_space = spaces.Discrete(self.nS)
    self.seed(9)
    self.s = discrete.categorical_sample(self.isd, self.np_random)
    # Per-action weight rows ~ U(mean - 0.1, mean + 0.1).
    # NOTE(review): these draw from the *global* np.random stream, not
    # self.np_random, so seed(9) above may not make them reproducible.
    wt = [[0.2 * (np.random.random() - 0.5) + 0.9 for i in range(self.nS)]
          for j in range(self.nA)]
    for j in range(self.nA):
        if j == 1:
            wt[j][0:self.nS] = [
                0.2 * (np.random.random() - 0.5) + 0.7 for i in range(self.nS)
            ]
        if j == 2:
            wt[j][0:self.nS] = [
                0.2 * (np.random.random() - 0.5) + 0.5 for i in range(self.nS)
            ]
    # Mean of wt = [0.9, 0.7, 0.5] here, while reset()/step() rebuild with
    # [0.9, 0.8, 0.6] — NOTE(review): confirm which set is intended
    # (original note: "lines of MAB and A2C merge in the end").
    self.bandwidth_cap = [i + 1 for i in range(self.nS)]
    self.rate = np.matmul(wt, np.diag(
        self.bandwidth_cap))  # dimension: nA x nS
    self.P = {s: {a: [] for a in range(self.nA)} for s in range(self.nS)}
    # Fully-connected uniform dynamics; transitions are (prob, next_state)
    # 2-tuples — no reward/done stored, step() computes those.
    for s in range(self.nS):
        for a in range(self.nA):
            for next_s in range(self.nS):
                self.P[s][a].append((1 / self.nS, next_s))
def __init__(self, env_map, random_start=False):
    """Build a four-action grid world from ``env_map``.

    Args:
        env_map: the base map the grid is generated from.
        random_start: passed through to start-cell selection.
    """
    self.base_map = env_map
    self.random_start = random_start
    # Derive the grid geometry from the generated map.
    self.desc = self._set_start()
    self.nrow, self.ncol = self.desc.shape
    self.nS = self.nrow * self.ncol  # one discrete state per cell
    self.nA = 4                      # four movement actions
    self.observation_space = spaces.Discrete(self.nS)
    self.action_space = spaces.Discrete(self.nA)
    self.p, self.isd = self._create_transition_matrix()
    self.last_action = None  # for rendering
    self.remaining_steps = MAX_STEPS
    self.seed()
    self.s = categorical_sample(self.isd, self.np_random)
def step(self, a):
    """Advance the flow-scheduling env one step.

    Admits a new flow while fewer than nF have arrived, samples the next
    state, regenerates the random rate matrix, and drains flow sizes at the
    chosen rate. The episode ends once all nF flows have arrived and fully
    drained.
    """
    if self.num_flows < self.nF:
        # Need to read from a list of flow sizes
        self.newflow_size = self.nS
        self.rm_size.append(self.newflow_size)
        self.num_flows += 1
    transitions = self.P[self.s][a]
    i = discrete.categorical_sample([t[0] for t in transitions],
                                    self.np_random)
    # Transitions here are (prob, next_state) 2-tuples.
    p, newstate = transitions[i]
    self.s = newstate
    # Regenerate per-action weights each step from the global np.random
    # stream; per-action means are 0.9 / 0.8 / 0.6.
    wt = [[0.2 * (np.random.random() - 0.5) + 0.9 for i in range(self.nS)]
          for j in range(self.nA)]
    for j in range(self.nA):
        if j == 1:
            wt[j][0:self.nS] = [
                0.2 * (np.random.random() - 0.5) + 0.8 for i in range(self.nS)
            ]
        if j == 2:
            wt[j][0:self.nS] = [
                0.2 * (np.random.random() - 0.5) + 0.6 for i in range(self.nS)
            ]
    self.rate = np.matmul(wt, np.diag(self.bandwidth_cap))
    # Reward is the service rate of the state we just landed on.
    reward = self.rate[a][self.s]
    # Drain in-flight flows by the achieved rate; returns updated sizes and
    # accumulated flow time for this link.
    self.rm_size, self.flow_time_link = self._get_flow_time(
        self.rm_size, self.flow_time_link, self.rate[a][self.s])
    if self.rm_size == [] and self.num_flows >= self.nF:
        done = True
        self.cum_flowtime += self.flow_time_link
    else:
        done = False
    return (newstate, reward, done, {"prob": p})
def reset(self):
    """Resample the start state and return the (possibly offset) observation.

    The returned value folds the adjacent-goal flag into the state id by
    adding nS/2 when the flag is set.
    """
    start = categorical_sample(self.isd, self.np_random)
    self.s = start
    self.start_s = start
    self.lastaction = None
    return start + self.exists_adjacent_goal * self.nS / 2
def reset(self):
    """Start a new episode.

    Samples a start state from the initial-state distribution, then rebuilds
    the reward table and stop-probability matrices.
    """
    self.s = discrete.categorical_sample(self.isd, self.np_random)
    self.lastaction = None
    # Original author flagged this line with "Is this ok?" — recomputing the
    # grid reward on every reset; kept as-is pending review.
    self.reward = self.calc_grid_reward()
    # NOTE(review): this overwrites self.P — if self.P is also the transition
    # table used by step(), confirm that clobbering it here is intentional.
    self.P, self.Q = self.calc_reward_stop_probability()
    return self.s
def __call__(self, values):
    """Sample an action index from the distribution induced by ``values``."""
    return categorical_sample(self.get_probas(values), self.np_random)
def reset(self):
    """Draw a fresh start state from the initial distribution; return as int."""
    self.last_action = None
    self.s = discrete.categorical_sample(self.isd, self.np_random)
    return int(self.s)
def reset(self):
    """Reset to a start state sampled from ``self.isd`` and return it."""
    self.lastaction = None
    self.s = categorical_sample(self.isd, self.np_random)
    return self.s