def reverse_normalize(self, obs):
    # Invert the flattening and normalization of the observation
    obs = obs.reshape(-1, 4)
    side_lanes = self.env.road.network.all_side_lanes(self.env.vehicle.lane_index)
    x_position_range = 3.0 * MDPVehicle.SPEED_MAX
    y_position_range = AbstractLane.DEFAULT_WIDTH * len(side_lanes)
    y_position_range_0 = 0 - AbstractLane.DEFAULT_WIDTH
    velocity_range = MDPVehicle.SPEED_MAX
    vy_range = 0.4 * MDPVehicle.SPEED_MAX
    df = pandas.DataFrame({'x': obs[:, 0], 'y': obs[:, 1],
                           'vx': obs[:, 2], 'vy': obs[:, 3]})
    df['x'] = utils.remap(df['x'], [-1, 1], [-x_position_range, x_position_range])
    df['y'] = utils.remap(df['y'], [-1, 1], [y_position_range_0, y_position_range])
    df['vx'] = utils.remap(df['vx'], [-1, 1], [-velocity_range, velocity_range])
    df['vy'] = utils.remap(df['vy'], [-1, 1], [-vy_range, vy_range])
    return df  # true state
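# All of the snippets in this section rely on utils.remap. Its implementation is not
# shown here; the following is a minimal sketch consistent with how it is called above,
# i.e. remap(value, from_range, to_range) performing a linear interpolation.
def remap(v, x, y):
    """Linearly map v from the interval x = [x0, x1] to the interval y = [y0, y1]."""
    return y[0] + (v - x[0]) * (y[1] - y[0]) / (x[1] - x[0])

# Example: remap(1.0, [-1, 1], [-60, 60]) == 60.0, so reverse_normalize above recovers
# metric positions and velocities from features normalized to [-1, 1].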
def observe(self):
    if self.absolute:
        raise NotImplementedError()
    else:
        # Add nearby traffic
        self.grid.fill(0)
        df = pd.DataFrame.from_records(
            [v.to_dict(self.env.vehicle) for v in self.env.road.vehicles])
        # Normalize
        df = self.normalize(df)
        # Fill-in features
        for layer, feature in enumerate(self.features):
            for _, vehicle in df.iterrows():
                x, y = vehicle["x"], vehicle["y"]
                # Recover unnormalized coordinates for cell index
                if "x" in self.features_range:
                    x = utils.remap(x, [-1, 1],
                                    [self.features_range["x"][0], self.features_range["x"][1]])
                if "y" in self.features_range:
                    y = utils.remap(y, [-1, 1],
                                    [self.features_range["y"][0], self.features_range["y"][1]])
                cell = (int((x - self.grid_size[0, 0]) / self.grid_step[0]),
                        int((y - self.grid_size[1, 0]) / self.grid_step[1]))
                if 0 <= cell[1] < self.grid.shape[-2] and 0 <= cell[0] < self.grid.shape[-1]:
                    self.grid[layer, cell[1], cell[0]] = vehicle[feature]
        # Clip
        obs = np.clip(self.grid, -1, 1)
        return obs
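# A standalone check of the cell-indexing rule used in observe() above; the grid
# geometry below is hypothetical, not the environment's actual configuration.
import numpy as np

grid_size = np.array([[-20., 20.], [-10., 10.]])  # x and y extents around the ego-vehicle (m)
grid_step = np.array([10., 5.])                   # cell width along x and y (m)
x, y = 7.0, 3.0                                   # a vehicle 7 m ahead, 3 m to the side
cell = (int((x - grid_size[0, 0]) / grid_step[0]),
        int((y - grid_size[1, 0]) / grid_step[1]))
assert cell == (2, 2)  # lands in the third column, third row of the grid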
def normalize(self, df):
    """
    Normalize the observation values.

    For now, assume that the road is straight along the x axis.

    :param Dataframe df: observation data
    """
    side_lanes = self.env.road.network.all_side_lanes(self.env.vehicle.lane_index)
    x_position_range = 5.0 * MDPVehicle.SPEED_MAX
    y_position_range = AbstractLane.DEFAULT_WIDTH * len(side_lanes)
    velocity_range = 2 * MDPVehicle.SPEED_MAX
    df['x'] = utils.remap(df['x'], [-x_position_range, x_position_range], [-1, 1])
    df['y'] = utils.remap(df['y'], [-y_position_range, y_position_range], [-1, 1])
    df['vx'] = utils.remap(df['vx'], [-velocity_range, velocity_range], [-1, 1])
    df['vy'] = utils.remap(df['vy'], [-velocity_range, velocity_range], [-1, 1])
    return df
def _reward(self, action):
    """
    The vehicle is rewarded for driving with high velocity on lanes to the right and
    avoiding collisions, but an additional altruistic penalty is also suffered if any
    vehicle on the merging lane has a low velocity.

    :param action: the action performed
    :return: the reward of the state-action transition
    """
    action_reward = {0: self.LANE_CHANGE_REWARD, 1: 0, 2: self.LANE_CHANGE_REWARD, 3: 0, 4: 0}
    reward = self.COLLISION_REWARD * self.vehicle.crashed \
        + self.RIGHT_LANE_REWARD * self.vehicle.lane_index[2] / 1 \
        + self.HIGH_VELOCITY_REWARD * self.vehicle.velocity_index / (self.vehicle.SPEED_COUNT - 1)
    # Altruistic penalty
    for vehicle in self.road.vehicles:
        if vehicle.lane_index == ("b", "c", 2) and isinstance(vehicle, ControlledVehicle):
            reward += self.MERGING_VELOCITY_REWARD * \
                (vehicle.target_velocity - vehicle.velocity) / vehicle.target_velocity
    return utils.remap(action_reward[action] + reward,
                       [self.COLLISION_REWARD,
                        self.HIGH_VELOCITY_REWARD + self.RIGHT_LANE_REWARD],
                       [0, 1])
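# Worked example of the final remap in _reward() above, using the remap sketch defined
# earlier and purely illustrative constants (COLLISION_REWARD = -1, RIGHT_LANE_REWARD = 0.1,
# HIGH_VELOCITY_REWARD = 0.2 are assumptions, not the environment's actual values).
raw = -1 * 0 + 0.1 * 1 / 1 + 0.2 * 2 / 2       # no crash, rightmost lane, top speed: 0.3
normalized = remap(raw, [-1, 0.2 + 0.1], [0, 1])
assert abs(normalized - 1.0) < 1e-9            # the best raw reward maps to 1, the worst to 0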
def _reward(self, actions):
    """
    Each vehicle is rewarded for driving with high velocity and avoiding collisions, but
    an additional altruistic penalty is also suffered if any vehicle on the merging lane
    has a low velocity.

    :param actions: the actions performed, one per controlled vehicle
    :return: the rewards of the state-action transition, one per vehicle
    """
    action_reward = {0: self.LANE_CHANGE_REWARD, 1: 0, 2: self.LANE_CHANGE_REWARD, 3: 0, 4: 0}
    rewards = []
    for i in range(len(self.all_vehicles)):
        vehicle = self.all_vehicles[i]
        reward = self.COLLISION_REWARD * vehicle.crashed \
            + self.HIGH_VELOCITY_REWARD * vehicle.velocity_index / (vehicle.SPEED_COUNT - 1)
        if vehicle.lane_index == ("b", "c", 2) and isinstance(vehicle, ControlledVehicle):
            reward += self.MERGING_VELOCITY_REWARD * \
                (vehicle.target_velocity - vehicle.velocity) / vehicle.target_velocity
        reward = utils.remap(action_reward[actions[i][0]] + reward,
                             [self.COLLISION_REWARD,
                              self.HIGH_VELOCITY_REWARD + self.RIGHT_LANE_REWARD],
                             [0, 1])
        rewards.append(reward)
    return np.array(rewards)
def _reward(self, action):
    reward = self.config["collision_reward"] * self.vehicle.crashed \
        + self.HIGH_VELOCITY_REWARD * (self.vehicle.velocity_index == self.vehicle.SPEED_COUNT - 1)
    reward = self.ARRIVED_REWARD if self.has_arrived else reward
    if self.config["normalize_reward"]:
        reward = utils.remap(reward,
                             [self.config["collision_reward"], self.ARRIVED_REWARD],
                             [0, 1])
    return reward
def _reward(self, action):
    reward = self.COLLISION_REWARD * self.vehicle.crashed \
        + self.HIGH_VELOCITY_REWARD * self.vehicle.velocity_index \
        / max(self.vehicle.SPEED_COUNT - 1, 1)
    reward = self.ARRIVED_REWARD if self.has_arrived else reward
    return utils.remap(reward, [self.COLLISION_REWARD, self.ARRIVED_REWARD], [0, 1])
def _reward(self, action):
    reward = self.COLLISION_REWARD * self.vehicle.crashed \
        + self.HIGH_VELOCITY_REWARD * self.vehicle.velocity_index \
        / max(self.vehicle.SPEED_COUNT - 1, 1) \
        + self.LANE_CHANGE_REWARD * (action in [0, 2])
    return utils.remap(reward,
                       [self.COLLISION_REWARD + self.LANE_CHANGE_REWARD,
                        self.HIGH_VELOCITY_REWARD],
                       [0, 1])
def _reward(self, action: int) -> float:
    reward = self.COLLISION_REWARD * self.vehicle.crashed \
        + self.HIGH_SPEED_REWARD * self.vehicle.speed_index \
        / max(self.vehicle.SPEED_COUNT - 1, 1) \
        + self.LANE_CHANGE_REWARD * (action in [0, 2])
    return utils.remap(reward,
                       [self.COLLISION_REWARD + self.LANE_CHANGE_REWARD,
                        self.HIGH_SPEED_REWARD],
                       [0, 1])
def _observation(self):
    """
    Return the observation of the current state, which must be consistent with
    self.observation_space.

    :return: the observation
    """
    # Add ego-vehicle
    df = pandas.DataFrame.from_records([self.vehicle.to_dict()])[self.OBSERVATION_FEATURES]
    # Normalize values
    MAX_ROAD_LANES = 4
    road_width = AbstractLane.DEFAULT_WIDTH * MAX_ROAD_LANES
    df.loc[0, 'x'] = 0
    df.loc[0, 'y'] = utils.remap(df.loc[0, 'y'], [0, road_width], [0, 1])
    df.loc[0, 'vx'] = utils.remap(df.loc[0, 'vx'],
                                  [MDPVehicle.SPEED_MIN, MDPVehicle.SPEED_MAX], [0, 1])
    df.loc[0, 'vy'] = utils.remap(df.loc[0, 'vy'],
                                  [-MDPVehicle.SPEED_MAX, MDPVehicle.SPEED_MAX], [-1, 1])
    # Add nearby traffic
    close_vehicles = self.road.closest_vehicles_to(self.vehicle, self.OBSERVATION_VEHICLES)
    if close_vehicles:
        df = df.append(pandas.DataFrame.from_records(
            [v.to_dict(self.vehicle)
             for v in close_vehicles[-self.OBSERVATION_VEHICLES + 1:]])[self.OBSERVATION_FEATURES],
            ignore_index=True)
        # Normalize values
        delta_v = 2 * (MDPVehicle.SPEED_MAX - MDPVehicle.SPEED_MIN)
        df.loc[1:, 'x'] = utils.remap(df.loc[1:, 'x'],
                                      [-self.PERCEPTION_DISTANCE, self.PERCEPTION_DISTANCE],
                                      [-1, 1])
        df.loc[1:, 'y'] = utils.remap(df.loc[1:, 'y'], [-road_width, road_width], [-1, 1])
        df.loc[1:, 'vx'] = utils.remap(df.loc[1:, 'vx'], [-delta_v, delta_v], [-1, 1])
        df.loc[1:, 'vy'] = utils.remap(df.loc[1:, 'vy'], [-delta_v, delta_v], [-1, 1])
    # Fill missing rows
    if df.shape[0] < self.OBSERVATION_VEHICLES:
        rows = -np.ones((self.OBSERVATION_VEHICLES - df.shape[0], len(self.OBSERVATION_FEATURES)))
        df = df.append(pandas.DataFrame(data=rows, columns=self.OBSERVATION_FEATURES),
                       ignore_index=True)
    # Reorder
    df = df[self.OBSERVATION_FEATURES]
    # Clip
    obs = np.clip(df.values, -1, 1)
    # Flatten
    obs = np.ravel(obs)
    return obs
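# Hypothetical illustration of the row-padding step in _observation() above: when fewer
# than OBSERVATION_VEHICLES rows are present, placeholder rows of -1 are appended before
# clipping and flattening. pd.concat is used here since DataFrame.append was removed in
# pandas 2.x; the original code targets an older pandas.
import numpy as np
import pandas as pd

OBSERVATION_FEATURES = ['x', 'y', 'vx', 'vy']
OBSERVATION_VEHICLES = 3
df = pd.DataFrame({'x': [0.0], 'y': [0.5], 'vx': [0.8], 'vy': [0.0]})  # ego row only
rows = -np.ones((OBSERVATION_VEHICLES - df.shape[0], len(OBSERVATION_FEATURES)))
df = pd.concat([df, pd.DataFrame(data=rows, columns=OBSERVATION_FEATURES)],
               ignore_index=True)
obs = np.ravel(np.clip(df.values, -1, 1))
assert obs.shape == (OBSERVATION_VEHICLES * len(OBSERVATION_FEATURES),)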
def _reward(self, action):
    """
    The reward is defined to foster driving at high speed, on the rightmost lanes,
    and to avoid collisions.

    :param action: the last action performed
    :return: the corresponding reward
    """
    action_reward = {0: self.LANE_CHANGE_REWARD, 1: 0, 2: self.LANE_CHANGE_REWARD, 3: 0, 4: 0}
    neighbours = self.road.network.all_side_lanes(self.vehicle.lane_index)
    state_reward = self.config["collision_reward"] * self.vehicle.crashed \
        + self.RIGHT_LANE_REWARD * self.vehicle.target_lane_index[2] / (len(neighbours) - 1) \
        + self.HIGH_VELOCITY_REWARD * self.vehicle.velocity_index / (self.vehicle.SPEED_COUNT - 1)
    return utils.remap(action_reward[action] + state_reward,
                       [self.config["collision_reward"],
                        self.HIGH_VELOCITY_REWARD + self.RIGHT_LANE_REWARD],
                       [0, 1])
def normalize(self, df):
    """
    Normalize the observation values.

    For now, assume that the road is straight along the x axis.

    :param Dataframe df: observation data
    """
    if not self.features_range:
        self.features_range = {
            "vx": [-2 * MDPVehicle.SPEED_MAX, 2 * MDPVehicle.SPEED_MAX],
            "vy": [-2 * MDPVehicle.SPEED_MAX, 2 * MDPVehicle.SPEED_MAX]
        }
    for feature, f_range in self.features_range.items():
        if feature in df:
            df[feature] = utils.remap(df[feature], [f_range[0], f_range[1]], [-1, 1])
    return df
def normalize(self, df):
    """
    Normalize the observation values.

    For now, assume that the road is straight along the x axis.

    :param Dataframe df: observation data
    """
    if not self.features_range:
        side_lanes = self.env.road.network.all_side_lanes(self.env.vehicle.lane_index)
        self.features_range = {
            "x": [-5.0 * MDPVehicle.SPEED_MAX, 5.0 * MDPVehicle.SPEED_MAX],
            "y": [-AbstractLane.DEFAULT_WIDTH * len(side_lanes),
                  AbstractLane.DEFAULT_WIDTH * len(side_lanes)],
            "vx": [-2 * MDPVehicle.SPEED_MAX, 2 * MDPVehicle.SPEED_MAX],
            "vy": [-2 * MDPVehicle.SPEED_MAX, 2 * MDPVehicle.SPEED_MAX]
        }
    for feature, f_range in self.features_range.items():
        if feature in df:
            df[feature] = utils.remap(df[feature], [f_range[0], f_range[1]], [-1, 1])
    return df
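# Hypothetical usage of normalize() above, with the remap sketch defined earlier and
# illustrative feature ranges (the real ranges are derived from MDPVehicle.SPEED_MAX
# and the lane geometry).
import pandas as pd

df = pd.DataFrame({'x': [100.0], 'y': [4.0], 'vx': [25.0], 'vy': [0.0]})
features_range = {"x": [-150, 150], "y": [-8, 8], "vx": [-60, 60], "vy": [-60, 60]}
for feature, f_range in features_range.items():
    df[feature] = remap(df[feature], f_range, [-1, 1])
# df now holds x ~ 0.667, y = 0.5, vx ~ 0.417, vy = 0.0, all within [-1, 1]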
def _reward_cav(self, infos, terminal):
    """
    Receive -1 when crashed, 1 when exiting the highway, and otherwise a speed-based reward.

    :param infos: transition information, including the scene type
    :param terminal: whether the episode has terminated
    :return: the reward of the state-action transition
    """
    CinB_reward = 0
    BinC_reward = 0
    High_velocity_reward = 1
    # The reward received when driving at 40 m/s, linearly mapped down to -1 at 20 m/s
    # speed_reward = np.clip(utils.remap(self.vehicle.velocity, [30, 40],
    #                                    [0, High_velocity_reward]),
    #                        0, High_velocity_reward)
    speed_reward = np.clip(
        utils.remap(self.vehicle.velocity, [20, 40],
                    [-High_velocity_reward, High_velocity_reward]),
        -High_velocity_reward, High_velocity_reward)
    scene_type = infos["scene_type"]
    if not scene_type:
        return speed_reward
    elif scene_type == "CinB":
        return CinB_reward
    elif scene_type == "BinC":
        return BinC_reward
    else:
        return 0
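# Worked example of the speed term in _reward_cav() above, using the remap sketch
# defined earlier: 30 m/s maps to 0, 40 m/s to +1, and speeds below 20 m/s clip to -1.
import numpy as np

for v, expected in [(30.0, 0.0), (40.0, 1.0), (10.0, -1.0)]:
    r = np.clip(remap(v, [20, 40], [-1, 1]), -1, 1)
    assert abs(r - expected) < 1e-9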
def sample(self, explore=False):
    self.step += 1
    # print(self._current_observation_n)
    if self._current_observation_n is None:
        # print('now updating')
        self._current_observation_n = self.env.reset()
    action_n = []
    supplied_observation = []
    observations = np.zeros((2, self.env.num_state))
    next_observations = np.zeros((2, self.env.num_state))
    if self.env.sim_step >= self.env.num_state - 3:
        print('wrong')
    observations[0][self.env.sim_step] = 1
    observations[1][self.env.sim_step] = 1
    next_observations[0][self.env.sim_step + 1] = 1
    next_observations[1][self.env.sim_step + 1] = 1
    # Relative longitudinal gap and velocity between the two vehicles, remapped to [-1, 1]
    relative_info = np.zeros((2, 2))
    speed_max = 40
    velocity_range = 2 * speed_max
    x_position_range = speed_max
    delta_dx = self.env.road.vehicles[1].position[0] - self.env.road.vehicles[0].position[0]
    delta_vx = self.env.road.vehicles[1].velocity - self.env.road.vehicles[0].velocity
    relative_info[0][0] = utils.remap(delta_dx, [-x_position_range, x_position_range], [-1, 1])
    relative_info[0][1] = utils.remap(delta_vx, [-velocity_range, velocity_range], [-1, 1])
    relative_info[1][0] = -relative_info[0][0]
    relative_info[1][1] = -relative_info[0][1]
    observations[:, -2:] = relative_info
    if explore:
        for i in range(self.agent_num):
            action_n.append([np.random.randint(0, self.env.action_num)])
        for i in range(self.leader_num):
            supplied_observation.append(observations[i])
        for i in range(self.leader_num, self.env.agent_num):
            mix_obs = np.hstack((observations[i],
                                 tf.one_hot(action_n[0][0], self.env.action_num))).reshape(1, -1)
            supplied_observation.append(mix_obs)
    else:
        for i in range(self.leader_num):
            supplied_observation.append(observations[i])
            action_n.append(self.train_agents[0].act(observations[i].reshape(1, -1)))
        for i in range(self.leader_num, self.env.agent_num):
            mix_obs = np.hstack((observations[i],
                                 tf.one_hot(action_n[0][0], self.env.action_num))).reshape(1, -1)
            supplied_observation.append(mix_obs)
            follower_action = self.train_agents[1].act(mix_obs.reshape(1, -1))
            action_n.append(follower_action)
    action_n = np.asarray(action_n)
    pres_valid_conditions_n = []
    next_valid_conditions_n = []
    # self.env.render()
    '''
    for i in range(5):
        for j in range(5):
            print("q value for upper agent ", i, j,
                  self.train_agents[0]._qf.get_values(np.hstack(
                      (observations[0], tf.one_hot(i, self.env.action_num),
                       tf.one_hot(j, self.env.action_num))).reshape(1, -1)))
    print()
    for i in range(5):
        for j in range(5):
            print("q value for lower agent ", i, j,
                  self.train_agents[1]._qf.get_values(np.hstack(
                      (observations[1], tf.one_hot(i, self.env.action_num),
                       tf.one_hot(j, self.env.action_num))).reshape(1, -1)))
    print('a0 = ', action_n[0])
    print('a1 = ', action_n[1])
    print(self.env.road.vehicles[0].position[0], self.env.road.vehicles[1].position[0])
    '''
    next_observation_n, reward_n, done_n, info = self.env.step(action_n)
    delta_dx = self.env.road.vehicles[1].position[0] - self.env.road.vehicles[0].position[0]
    delta_vx = self.env.road.vehicles[1].velocity - self.env.road.vehicles[0].velocity
    relative_info[0][0] = utils.remap(delta_dx, [-x_position_range, x_position_range], [-1, 1])
    relative_info[0][1] = utils.remap(delta_vx, [-velocity_range, velocity_range], [-1, 1])
    relative_info[1][0] = -relative_info[0][0]
    relative_info[1][1] = -relative_info[0][1]
    next_observations[:, -2:] = relative_info
    if self._global_reward:
        reward_n = np.array([np.sum(reward_n)] * self.agent_num)
    self._path_length += 1
    self._path_return += np.array(reward_n, dtype=np.float32)
    self._total_samples += 1
    opponent_action = np.array(action_n[[j for j in range(len(action_n))]].flatten())
    for i, agent in enumerate(self.agents):
        agent.replay_buffer.add_sample(
            # observation=self._current_observation_n[i].astype(np.float32),
            observation=supplied_observation[i],
            # action=action_n[i].astype(np.float32),
            action=tf.one_hot(action_n[i], self.env.action_num),
            # reward=reward_n[i].astype(np.float32),
            reward=np.float32(reward_n[i]),
            # terminal=done_n[i].astype(np.float32),
            terminal=np.float32(done_n[i]),
            # next_observation=next_observation_n[i].astype(np.float32),
            next_observation=np.float32(next_observations[i]),
            # opponent_action=opponent_action.astype(np.float32)
            opponent_action=np.int32(opponent_action),
        )
    self._current_observation_n = next_observation_n
    if self.step % (25 * 1000) == 0:
        print("steps: {}, episodes: {}, mean episode reward: {}".format(
            self.step, len(reward_n), np.mean(reward_n[-1000:])))
    if np.all(done_n) or self._path_length >= self._max_path_length:
        self._current_observation_n = self.env.reset()
        self._max_path_return = np.maximum(self._max_path_return, self._path_return)
        self._mean_path_return = self._path_return / self._path_length
        self._last_path_return = self._path_return
        self._path_length = 0
        self._path_return = np.zeros(self.agent_num)
        self._n_episodes += 1
        self.log_diagnostics()
        logger.log(tabular)
        logger.dump_all()
    else:
        self._current_observation_n = next_observation_n
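# A minimal check of the leader-follower observation mixing used in sample() above:
# the follower's input is its state vector concatenated with a one-hot encoding of the
# leader's discrete action. Sizes below are hypothetical; tf.one_hot returns an eager
# tensor that np.hstack converts to an array.
import numpy as np
import tensorflow as tf

num_state, action_num = 6, 5
observation = np.zeros(num_state)
leader_action = 2
mix_obs = np.hstack((observation, tf.one_hot(leader_action, action_num))).reshape(1, -1)
assert mix_obs.shape == (1, num_state + action_num)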