def test_view(self):
    """Check observation layout, action space and movement under a restricted view.

    The world is the path 0-1-2-3-4 with an extra leaf 5 attached to node 1,
    one driver per node, and the environment's view limited to nodes 2, 3, 4.
    """
    graph = nx.Graph()
    graph.add_edges_from([(0, 1), (1, 2), (2, 3), (3, 4), (1, 5)])
    coords = {
        0: (0, 1),
        1: (0, 2),
        2: (0, 3),
        3: (0, 4),
        4: (0, 5),
        5: (1, 1),
    }
    nx.set_node_attributes(graph, coords, name="coords")

    # Each order is <source, destination, time, length, price>.
    orders = [(3, 2, 2, 3, 3)]
    drivers = np.array([1, 1, 1, 1, 1, 1])
    action = np.array([1, 0, 0], dtype=float)
    visible_nodes = (2, 3, 4)
    view_size = len(visible_nodes)

    env = TaxiEnv(graph, orders, 1, drivers, 10)
    env.seed(123)
    env.set_view(list(visible_nodes))
    initial_obs, _, _ = env.get_observation()

    # Observation space and content.
    assert env.observation_space_shape == initial_obs.shape
    # Default income is not included, so the layout is
    # <driver, order, idle, time_id, node_id>.
    assert env.observation_space_shape == (view_size * 4 + 10, )
    # Node 1 has degree 3, but the remaining nodes have degree 2,
    # so the action space is 2 + 1 (the staying action).
    assert env.action_space_shape == (3, )
    assert env.current_node_id in visible_nodes

    # An action [1, 0, 0] for node 2 means "go to node 3", because it is
    # the only neighbor inside the view.
    env.step(action)
    assert env.current_node_id in visible_nodes
    env.step(action)
    assert env.current_node_id in visible_nodes
    obs, _, _, _ = env.step(action)

    # Named boundaries of the observation segments.
    order_start = view_size
    idle_start = 2 * view_size
    time_start = 3 * view_size
    node_start = 3 * view_size + 10

    # There are 2 drivers in node 3 at the end: one from node 2 and one
    # from node 4.
    assert (obs[:order_start] == np.array([0.5, 1, 0])).all()
    assert (obs[order_start:idle_start] == np.array([0, 0, 0])).all()
    assert (obs[idle_start:time_start] == np.array([0.5, 1, 0])).all()

    # The next time iteration should have started.
    assert env.time == 1
    assert env.current_node_id in [2, 3]
    assert (obs[idle_start:time_start] == np.array([0.5, 1, 0])).all()
    expected_time = np.array([0, 1, 0, 0, 0, 0, 0, 0, 0, 0])
    assert (obs[time_start:node_start] == expected_time).all()

    node_part = obs[node_start:]
    assert node_part.shape == (3, )
    assert ((node_part == np.array([1, 0, 0])).all()
            or (node_part == np.array([0, 1, 0])).all())

    positions = [driver.position for driver in env.all_driver_list]
    assert positions == [0, 1, 3, 2, 3, 5]
def test_sync(self):
    """Check that an environment synced from another one behaves identically.

    One environment is advanced by a step, a fresh one is synced to it, and
    then both are seeded equally and stepped in lockstep until the first
    one reports it is done; every step result must match.
    """
    graph = nx.Graph()
    graph.add_edges_from([(0, 1), (1, 2), (2, 3)])
    coords = {0: (0, 1), 1: (0, 2), 2: (1, 1), 3: (1, 2)}
    nx.set_node_attributes(graph, coords, name="coords")

    orders = [
        (0, 1, 0, 1, 1),
        (1, 1, 1, 2, 2),
        (2, 2, 1, 3, 3),
        (3, 2, 2, 3, 3),
    ]
    drivers = np.array([1, 0, 0, 5])
    action = np.array([0.3, 0.4, 0.3], dtype=float)

    # Advance the reference environment before copying its state.
    reference = TaxiEnv(graph, orders, 1, drivers, 10)
    reference.step(action)
    replica = TaxiEnv(graph, orders, 1, drivers, 10)
    replica.sync(reference)

    reference_obs, _, _ = reference.get_observation()
    replica_obs, _, _ = replica.get_observation()
    assert (reference_obs == replica_obs).all()

    # Same seed on both sides so that any sampling stays in lockstep.
    reference.seed(1)
    replica.seed(1)
    while not reference.done:
        expected = reference.step(action)
        actual = replica.step(action)
        assert (expected[0] == actual[0]).all()  # observation
        assert expected[1] == actual[1]  # reward
        assert expected[2] == actual[2]  # done flag
        assert expected[3] == actual[3]  # info
class TaxiEnvBatch(gym.Env):
    '''
    This class is a wrapper over taxi_env, providing an interface for cA2C,
    that requires processing drivers in batches + some additional context
    information.

    A single call to step() of this class performs one step of the wrapped
    TaxiEnv for every cell that currently holds at least one driver, and
    merges the per-cell results into one observation, reward and info dict.
    '''
    metadata = {'render.modes': ['rgb_array']}

    def __init__(self,
                 world: nx.Graph,
                 orders: List[Tuple[int, int, int, int, float]],
                 order_sampling_rate: float,
                 drivers_per_node: Array[int],
                 n_intervals: int,
                 wc: float,
                 count_neighbors: bool = False,
                 weight_poorest: bool = False,
                 normalize_rewards: bool = True,
                 minimum_reward: bool = False,
                 reward_bound: float = None,
                 include_income_to_observation: bool = False,
                 poorest_first: bool = False,
                 idle_reward: bool = False) -> None:
        '''
        Build the wrapped TaxiEnv and derive the batch action/observation spaces.

        All arguments are forwarded unchanged, in this order, to TaxiEnv;
        see TaxiEnv for their meaning. Only `n_intervals` and
        `include_income_to_observation` are additionally stored here.

        The batch action is the concatenation of one per-cell action
        (max_degree + 1 entries) for every node of the world.
        '''
        self.itEnv = TaxiEnv(world, orders, order_sampling_rate,
                             drivers_per_node, n_intervals, wc,
                             count_neighbors, weight_poorest,
                             normalize_rewards, minimum_reward, reward_bound,
                             include_income_to_observation, poorest_first,
                             idle_reward)
        self.world = self.itEnv.world
        self.n_intervals = n_intervals
        self.n_drivers = self.itEnv.n_drivers
        self.time = 0
        self.include_income_to_observation = include_income_to_observation
        # Set by step(); render() needs it and reports a clear error while
        # it is still None.
        self.last_action_for_drawing = None

        self.one_cell_action_space = self.itEnv.max_degree + 1
        self.action_space = spaces.Box(
            low=0,
            high=1,
            shape=(self.one_cell_action_space * self.itEnv.world_size, ))

        per_cell_size = self.itEnv.observation_space_shape[0]
        n_nodes = len(self.world)
        if include_income_to_observation:
            assert per_cell_size == 3 * n_nodes + self.itEnv.n_intervals + 3
            # The trailing world_size + 3 per-cell entries are replaced by
            # 3 entries for every node.
            self.observation_space_shape = (per_cell_size + 2 * n_nodes - 3, )
        else:
            assert per_cell_size == 3 * n_nodes + self.itEnv.n_intervals
            # The trailing world_size per-cell entries are dropped.
            self.observation_space_shape = (per_cell_size - n_nodes, )
        self.observation_space = spaces.Box(
            low=0, high=1, shape=self.observation_space_shape)

    def reset(self) -> Array[int]:
        '''
        Reset the wrapped environment and return the initial batch observation.

        The trailing cell-specific part of the TaxiEnv observation is cut
        off. When income is included, 3 zeros per node are appended instead.
        '''
        self.time = 0
        world_size = self.itEnv.world_size
        if self.itEnv.include_income_to_observation:
            observation = self.itEnv.reset()[:-(world_size + 3)]
            # assuming all incomes are zero
            return np.concatenate((observation, np.zeros(3 * world_size)))
        return self.itEnv.reset()[:-world_size]

    def get_reset_info(self):
        '''
        Currently used only to get max_orders and max_drivers, which should
        be independent of current_cell.
        '''
        return self.itEnv.get_reset_info()

    def step(self,
             action: Array[float]) -> Tuple[Array[int], float, bool, Dict]:
        '''
        Apply `action` to every cell that holds drivers and merge the results.

        `action` is the concatenation of per-cell actions; the slice for
        cell c starts at c * one_cell_action_space.

        Returns (observation, reward, done, info), where reward is the sum
        of the per-cell rewards, done is the flag of the last per-cell step
        and info aggregates statistics over the whole time step.
        '''
        node_infos = [
            data['info'] for _, data in self.itEnv.world.nodes(data=True)
        ]
        # dtype=int keeps the counts integral even when no node matches:
        # np.sum([]) would yield the float 0.0, which range() rejects.
        cells_with_nonzero_drivers = np.sum(
            [1 for node in node_infos if node.get_driver_num() > 0],
            dtype=int)
        nodes_with_orders = np.sum(
            [1 for node in node_infos if node.get_order_num() > 0],
            dtype=int)
        total_orders = np.sum([node.get_order_num() for node in node_infos])

        world_size = self.itEnv.world_size
        include_income = self.itEnv.include_income_to_observation
        size_without_income = 2 * world_size + self.itEnv.n_intervals

        global_observation = np.zeros(5 * world_size +
                                      self.itEnv.n_intervals)
        global_done = False
        global_reward = 0
        reward_per_node = np.zeros(world_size)
        init_t = self.itEnv.time

        self.last_action_for_drawing = action

        total_served_orders = 0
        max_driver = None
        max_order = None
        for _ in range(cells_with_nonzero_drivers):
            current_cell = self.itEnv.current_node_id
            start = current_cell * self.one_cell_action_space
            action_per_cell = action[start:start +
                                     self.one_cell_action_space]
            observation, reward, done, info = self.itEnv.step(action_per_cell)

            reward_per_node[current_cell] = reward
            global_done = done
            global_reward += reward
            total_served_orders += info['served_orders']

            # updated at each step, but the final should be correct
            max_driver = info["driver normalization constant"]
            max_order = info["order normalization constant"]

            if include_income:
                assert observation.shape[
                    0] == 3 * world_size + self.itEnv.n_intervals + 3
                # The shared prefix is overwritten by every cell; the last
                # three per-cell entries go to the slot of that cell.
                global_observation[:size_without_income] = \
                    observation[:size_without_income]
                slot = size_without_income + 3 * current_cell
                global_observation[slot:slot + 3] = observation[-3:]
            else:
                global_observation = observation[:-world_size]

        assert not global_done or init_t + 1 == self.itEnv.n_intervals
        assert self.itEnv.time == init_t + 1, (
            "wrapped environment did not advance by exactly one time step",
            init_t, self.itEnv.time, cells_with_nonzero_drivers)
        self.time += 1

        idle_periods = [
            d.get_idle_period() for d in self.itEnv.all_driver_list
        ]
        global_info = {
            "reward_per_node": reward_per_node,
            "served_orders": total_served_orders,
            "nodes_with_drivers": cells_with_nonzero_drivers,
            "nodes_with_orders": nodes_with_orders,
            "driver normalization constant": max_driver,
            "order normalization constant": max_order,
            "total_orders": total_orders,
            "idle_reward": float(np.mean(idle_periods)),
            "min_idle": float(np.min(idle_periods))
        }
        return global_observation, global_reward, global_done, global_info

    def seed(self, seed):
        '''Forward the seed to the wrapped TaxiEnv.'''
        self.itEnv.seed(seed)

    def get_min_revenue(self):
        '''Return get_min_revenue() of the wrapped TaxiEnv.'''
        return self.itEnv.get_min_revenue()

    def get_total_revenue(self):
        '''Return get_total_revenue() of the wrapped TaxiEnv.'''
        return self.itEnv.get_total_revenue()

    def compute_remaining_drivers_and_orders(self, state):
        '''Delegate to the wrapped TaxiEnv.'''
        return self.itEnv.compute_remaining_drivers_and_orders(state)

    def set_income_bound(self, bound):
        '''Forward the income bound to the wrapped TaxiEnv.'''
        self.itEnv.set_income_bound(bound)

    def render(self, mode='rgb_array'):
        '''
        Draw the last action passed to step() on the world graph.

        Nodes are colored by the last entry of their per-cell action, edges
        by the action entry of the corresponding neighbor (only entries
        greater than zero are drawn). Node ids are used directly as indices
        into the action vector.

        Returns the canvas buffer as a uint8 array of shape
        (height, width, 4).

        Raises RuntimeError when called before the first step().
        '''
        action = self.last_action_for_drawing
        if action is None:
            raise RuntimeError(
                "render() requires an action; call step() first")

        fig = plt.figure(figsize=(20, 20))
        try:
            ax = fig.gca()
            ax.axis('off')

            pos = nx.get_node_attributes(self.world, 'coords')
            G = nx.DiGraph(self.world)
            act = self.itEnv.action_space_shape[0]

            nodelist = []
            edgelist = []
            node_colors = []
            edge_colors = []
            for n in self.world.nodes():
                node_action = action[act * n:act * (n + 1)]
                nodelist.append(n)
                node_colors.append(node_action[-1])
                added = 0
                for j, neighbor in enumerate(self.world.neighbors(n)):
                    if node_action[j] > 0:
                        edgelist.append((n, neighbor))
                        edge_colors.append(node_action[j])
                        added += 1
                assert abs(np.sum(node_action) - 1) < 0.00001, node_action
                assert node_action[-1] != 0 or added > 0, (node_action, n)

            # plt.get_cmap replaces matplotlib.cm.get_cmap, which newer
            # Matplotlib releases no longer provide.
            blues = plt.get_cmap("Blues")
            nx.draw_networkx(G,
                             edgelist=edgelist,
                             edge_color=edge_colors,
                             vmin=-1,
                             vmax=1,
                             node_shape='.',
                             edge_vmax=1.1,
                             cmap=blues,
                             edge_cmap=blues,
                             node_color=node_colors,
                             nodelist=nodelist,
                             pos=pos,
                             arrows=True,
                             with_labels=False,
                             ax=ax)

            canvas = FigureCanvasAgg(fig)
            canvas.draw()
            s, (width, height) = canvas.print_to_buffer()

            # Option 2a: Convert to a NumPy array
            X = np.frombuffer(s, np.uint8).reshape((height, width, 4))
        finally:
            # Always release the figure, also when drawing or a sanity
            # check above fails.
            plt.close(fig)
        return X