Example #1
    def reverse_normalize(self, obs):
        # Invert the flattening: back to one row of (x, y, vx, vy) per vehicle
        obs = obs.reshape(-1, 4)

        side_lanes = self.env.road.network.all_side_lanes(
            self.env.vehicle.lane_index)
        x_position_range = 3.0 * MDPVehicle.SPEED_MAX
        y_position_range = AbstractLane.DEFAULT_WIDTH * len(side_lanes)
        y_position_range_0 = -AbstractLane.DEFAULT_WIDTH
        velocity_range = MDPVehicle.SPEED_MAX
        vy_range = 0.4 * MDPVehicle.SPEED_MAX

        df = pandas.DataFrame({
            'x': obs[:, 0],
            'y': obs[:, 1],
            'vx': obs[:, 2],
            'vy': obs[:, 3]
        })

        df['x'] = utils.remap(df['x'], [-1, 1],
                              [-x_position_range, x_position_range])
        df['y'] = utils.remap(df['y'], [-1, 1],
                              [y_position_range_0, y_position_range])
        df['vx'] = utils.remap(df['vx'], [-1, 1],
                               [-velocity_range, velocity_range])
        df['vy'] = utils.remap(df['vy'], [-1, 1], [-vy_range, vy_range])

        return df  # true state
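Every snippet in this collection leans on `utils.remap` to rescale values linearly between two intervals. As a reference, here is a minimal sketch of that mapping (assuming the two-point linear interpolation implied by the call sites; the real helper may differ in name or details), plus a check that `reverse_normalize` undoes the normalization:

import numpy as np

def remap(v, x, y):
    # Linearly map v from the interval x = [x0, x1] onto y = [y0, y1]
    return y[0] + (v - x[0]) * (y[1] - y[0]) / (x[1] - x[0])

# Round trip: normalizing into [-1, 1] and mapping back recovers the raw value.
x_position_range = 3.0 * 30.0  # illustrative, assuming SPEED_MAX == 30 m/s
raw_x = 45.0
normalized = remap(raw_x, [-x_position_range, x_position_range], [-1, 1])
assert np.isclose(remap(normalized, [-1, 1], [-x_position_range, x_position_range]), raw_x)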
Example #2
 def observe(self):
     if self.absolute:
         raise NotImplementedError()
     else:
         # Add nearby traffic
         self.grid.fill(0)
         df = pd.DataFrame.from_records(
             [v.to_dict(self.env.vehicle) for v in self.env.road.vehicles])
         # Normalize
         df = self.normalize(df)
         # Fill-in features
         for layer, feature in enumerate(self.features):
             for _, vehicle in df.iterrows():
                 x, y = vehicle["x"], vehicle["y"]
                 # Recover unnormalized coordinates for cell index
                 if "x" in self.features_range:
                     x = utils.remap(x, [-1, 1], [self.features_range["x"][0], self.features_range["x"][1]])
                 if "y" in self.features_range:
                     y = utils.remap(y, [-1, 1], [self.features_range["y"][0], self.features_range["y"][1]])
                 cell = (int((x - self.grid_size[0, 0]) / self.grid_step[0]),
                         int((y - self.grid_size[1, 0]) / self.grid_step[1]))
                 if 0 <= cell[1] < self.grid.shape[-2] and 0 <= cell[0] < self.grid.shape[-1]:
                     self.grid[layer, cell[1], cell[0]] = vehicle[feature]
         # Clip
         obs = np.clip(self.grid, -1, 1)
         return obs
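The cell index above is simply the vehicle's offset from the grid origin divided by the cell size. A small worked sketch with assumed values (the grid extent and step here are illustrative, not taken from the environment config):

import numpy as np

grid_size = np.array([[-27.5, 27.5], [-27.5, 27.5]])  # assumed 55 m x 55 m grid around the ego vehicle
grid_step = np.array([5.0, 5.0])                      # assumed 5 m x 5 m cells -> an 11 x 11 grid

x, y = 12.0, -3.0  # un-normalized position of a nearby vehicle relative to the ego vehicle
cell = (int((x - grid_size[0, 0]) / grid_step[0]),
        int((y - grid_size[1, 0]) / grid_step[1]))
print(cell)  # (7, 4): column 7, row 4 of the occupancy grid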
Example #3
    def normalize(self, df):
        """
            Normalize the observation values.

            For now, assume that the road is straight along the x axis.
        :param Dataframe df: observation data
        """
        side_lanes = self.env.road.network.all_side_lanes(self.env.vehicle.lane_index)
        x_position_range = 5.0 * MDPVehicle.SPEED_MAX
        y_position_range = AbstractLane.DEFAULT_WIDTH * len(side_lanes)
        velocity_range = 2*MDPVehicle.SPEED_MAX
        df['x'] = utils.remap(df['x'], [-x_position_range, x_position_range], [-1, 1])
        df['y'] = utils.remap(df['y'], [-y_position_range, y_position_range], [-1, 1])
        df['vx'] = utils.remap(df['vx'], [-velocity_range, velocity_range], [-1, 1])
        df['vy'] = utils.remap(df['vy'], [-velocity_range, velocity_range], [-1, 1])
        return df
Example #4
    def _reward(self, action):
        """
            The vehicle is rewarded for driving with high velocity on lanes to the right and avoiding collisions, but
            an additional altruistic penalty is also suffered if any vehicle on the merging lane has a low velocity.
        :param action: the action performed
        :return: the reward of the state-action transition
        """
        action_reward = {
            0: self.LANE_CHANGE_REWARD,
            1: 0,
            2: self.LANE_CHANGE_REWARD,
            3: 0,
            4: 0
        }
        reward = self.COLLISION_REWARD * self.vehicle.crashed \
                 + self.RIGHT_LANE_REWARD * self.vehicle.lane_index[2] / 1 \
                 + self.HIGH_VELOCITY_REWARD * self.vehicle.velocity_index / (self.vehicle.SPEED_COUNT - 1)

        # Altruistic penalty
        for vehicle in self.road.vehicles:
            if vehicle.lane_index == ("b", "c", 2) and isinstance(
                    vehicle, ControlledVehicle):
                reward += self.MERGING_VELOCITY_REWARD * \
                          (vehicle.target_velocity - vehicle.velocity) / vehicle.target_velocity

        return utils.remap(action_reward[action] + reward, [
            self.COLLISION_REWARD,
            self.HIGH_VELOCITY_REWARD + self.RIGHT_LANE_REWARD
        ], [0, 1])
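The final `utils.remap` call squashes the raw reward into [0, 1] by treating the collision case as the floor and the fast, rightmost-lane case as the ceiling. With illustrative weights (assumed here; the actual constants live on the environment class), the mapping works out as follows:

# remap(v, [lo, hi], [0, 1]) reduces to (v - lo) / (hi - lo)
COLLISION_REWARD = -1.0        # assumed illustrative values
RIGHT_LANE_REWARD = 0.1
HIGH_VELOCITY_REWARD = 0.2

lo = COLLISION_REWARD
hi = HIGH_VELOCITY_REWARD + RIGHT_LANE_REWARD
print((lo - lo) / (hi - lo))   # 0.0: a crashed, slow vehicle sits at the floor
print((hi - lo) / (hi - lo))   # 1.0: full speed in the rightmost lane sits at the ceiling
print((0.0 - lo) / (hi - lo))  # ~0.77: a neutral state lands in between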
Example #5
    def _reward(self, actions):
        """
            The vehicle is rewarded for driving with high velocity on lanes to the right and avoiding collisions, but
            an additional altruistic penalty is also suffered if any vehicle on the merging lane has a low velocity.
        :param actions: the actions performed, one per vehicle
        :return: the array of rewards of the state-action transitions, one per vehicle
        """
        action_reward = {
            0: self.LANE_CHANGE_REWARD,
            1: 0,
            2: self.LANE_CHANGE_REWARD,
            3: 0,
            4: 0
        }
        rewards = []
        for i in range(len(self.all_vehicles)):
            reward = self.COLLISION_REWARD * self.all_vehicles[i].crashed \
                + self.HIGH_VELOCITY_REWARD * self.all_vehicles[i].velocity_index / (self.all_vehicles[i].SPEED_COUNT - 1)

            if self.all_vehicles[i].lane_index == ("b", "c", 2) and isinstance(
                    self.all_vehicles[i], ControlledVehicle):
                reward += self.MERGING_VELOCITY_REWARD * \
                            (self.all_vehicles[i].target_velocity - self.all_vehicles[i].velocity) / self.all_vehicles[i].target_velocity
            reward = utils.remap(action_reward[actions[i][0]] + reward, [
                self.COLLISION_REWARD,
                self.HIGH_VELOCITY_REWARD + self.RIGHT_LANE_REWARD
            ], [0, 1])
            rewards.append(reward)
        return np.array(rewards)
Example #6
 def _reward(self, action):
     reward = self.config["collision_reward"] * self.vehicle.crashed \
              + self.HIGH_VELOCITY_REWARD * (self.vehicle.velocity_index == self.vehicle.SPEED_COUNT - 1)
     reward = self.ARRIVED_REWARD if self.has_arrived else reward
     if self.config["normalize_reward"]:
         reward = utils.remap(reward, [self.config["collision_reward"], self.ARRIVED_REWARD], [0, 1])
     return reward
Example #7
 def _reward(self, action):
     reward = self.COLLISION_REWARD * self.vehicle.crashed \
              + self.HIGH_VELOCITY_REWARD * self.vehicle.velocity_index / max(self.vehicle.SPEED_COUNT - 1, 1)
     reward = self.ARRIVED_REWARD if self.has_arrived else reward
     return utils.remap(reward,
                        [self.COLLISION_REWARD, self.ARRIVED_REWARD],
                        [0, 1])
Example #8
 def _reward(self, action):
     reward = self.COLLISION_REWARD * self.vehicle.crashed \
              + self.HIGH_VELOCITY_REWARD * self.vehicle.velocity_index / max(self.vehicle.SPEED_COUNT - 1, 1) \
              + self.LANE_CHANGE_REWARD * (action in [0, 2])
     return utils.remap(reward, [
         self.COLLISION_REWARD + self.LANE_CHANGE_REWARD,
         self.HIGH_VELOCITY_REWARD
     ], [0, 1])
Example #9
 def _reward(self, action: int) -> float:
     reward = self.COLLISION_REWARD * self.vehicle.crashed \
              + self.HIGH_SPEED_REWARD * self.vehicle.speed_index / max(self.vehicle.SPEED_COUNT - 1, 1) \
              + self.LANE_CHANGE_REWARD * (action in [0, 2])
     return utils.remap(reward, [
         self.COLLISION_REWARD + self.LANE_CHANGE_REWARD,
         self.HIGH_SPEED_REWARD
     ], [0, 1])
Example #10
    def _observation(self):
        """
            Return the observation of the current state, which must be consistent with self.observation_space.
        :return: the observation
        """
        # Add ego-vehicle
        df = pandas.DataFrame.from_records(
            [self.vehicle.to_dict()])[self.OBSERVATION_FEATURES]
        # Normalize values
        MAX_ROAD_LANES = 4
        road_width = AbstractLane.DEFAULT_WIDTH * MAX_ROAD_LANES
        df.loc[0, 'x'] = 0
        df.loc[0, 'y'] = utils.remap(df.loc[0, 'y'], [0, road_width], [0, 1])
        df.loc[0, 'vx'] = utils.remap(
            df.loc[0, 'vx'], [MDPVehicle.SPEED_MIN, MDPVehicle.SPEED_MAX], [0, 1])
        df.loc[0, 'vy'] = utils.remap(
            df.loc[0, 'vy'], [-MDPVehicle.SPEED_MAX, MDPVehicle.SPEED_MAX],
            [-1, 1])

        # Add nearby traffic
        close_vehicles = self.road.closest_vehicles_to(
            self.vehicle, self.OBSERVATION_VEHICLES)
        if close_vehicles:
            df = df.append(pandas.DataFrame.from_records([
                v.to_dict(self.vehicle)
                for v in close_vehicles[-self.OBSERVATION_VEHICLES + 1:]
            ])[self.OBSERVATION_FEATURES],
                           ignore_index=True)

            # Normalize values
            delta_v = 2 * (MDPVehicle.SPEED_MAX - MDPVehicle.SPEED_MIN)
            df.loc[1:, 'x'] = utils.remap(
                df.loc[1:, 'x'],
                [-self.PERCEPTION_DISTANCE, self.PERCEPTION_DISTANCE], [-1, 1])
            df.loc[1:, 'y'] = utils.remap(df.loc[1:, 'y'],
                                          [-road_width, road_width], [-1, 1])
            df.loc[1:, 'vx'] = utils.remap(df.loc[1:, 'vx'],
                                           [-delta_v, delta_v], [-1, 1])
            df.loc[1:, 'vy'] = utils.remap(df.loc[1:, 'vy'],
                                           [-delta_v, delta_v], [-1, 1])

        # Fill missing rows
        if df.shape[0] < self.OBSERVATION_VEHICLES:
            rows = -np.ones((self.OBSERVATION_VEHICLES - df.shape[0],
                             len(self.OBSERVATION_FEATURES)))
            df = df.append(pandas.DataFrame(data=rows,
                                            columns=self.OBSERVATION_FEATURES),
                           ignore_index=True)

        # Reorder
        df = df[self.OBSERVATION_FEATURES]
        # Clip
        obs = np.clip(df.values, -1, 1)
        # Flatten
        obs = np.ravel(obs)
        return obs
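Beyond the remapping, the key detail of this observation is that it always has a fixed size: missing vehicles are padded with rows of -1 before clipping and flattening. A compact sketch of that padding step (OBSERVATION_VEHICLES and the feature list are illustrative, and `pd.concat` stands in for the older `DataFrame.append` used above):

import numpy as np
import pandas as pd

OBSERVATION_VEHICLES, FEATURES = 5, ["x", "y", "vx", "vy"]
df = pd.DataFrame([[0.0, 0.2, 0.5, 0.0]], columns=FEATURES)  # only the ego row was observed
missing = OBSERVATION_VEHICLES - df.shape[0]
padding = pd.DataFrame(-np.ones((missing, len(FEATURES))), columns=FEATURES)
obs = np.ravel(np.clip(pd.concat([df, padding], ignore_index=True).values, -1, 1))
print(obs.shape)  # (20,) == OBSERVATION_VEHICLES * len(FEATURES), regardless of traffic density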
Example #11
 def _reward(self, action):
     """
     The reward is defined to foster driving at high speed, on the rightmost lanes, and to avoid collisions.
     :param action: the last action performed
     :return: the corresponding reward
     """
     action_reward = {0: self.LANE_CHANGE_REWARD, 1: 0, 2: self.LANE_CHANGE_REWARD, 3: 0, 4: 0}
     neighbours = self.road.network.all_side_lanes(self.vehicle.lane_index)
     state_reward = \
         + self.config["collision_reward"] * self.vehicle.crashed \
         + self.RIGHT_LANE_REWARD * self.vehicle.target_lane_index[2] / (len(neighbours) - 1) \
         + self.HIGH_VELOCITY_REWARD * self.vehicle.velocity_index / (self.vehicle.SPEED_COUNT - 1)
     return utils.remap(action_reward[action] + state_reward,
                        [self.config["collision_reward"], self.HIGH_VELOCITY_REWARD+self.RIGHT_LANE_REWARD],
                        [0, 1])
Example #12
    def normalize(self, df):
        """
            Normalize the observation values.

            For now, assume that the road is straight along the x axis.
        :param Dataframe df: observation data
        """
        if not self.features_range:
            self.features_range = {
                "vx": [-2*MDPVehicle.SPEED_MAX, 2*MDPVehicle.SPEED_MAX],
                "vy": [-2*MDPVehicle.SPEED_MAX, 2*MDPVehicle.SPEED_MAX]
            }
        for feature, f_range in self.features_range.items():
            if feature in df:
                df[feature] = utils.remap(df[feature], [f_range[0], f_range[1]], [-1, 1])
        return df
    def normalize(self, df):
        """
            Normalize the observation values.

            For now, assume that the road is straight along the x axis.
        :param Dataframe df: observation data
        """
        if not self.features_range:
            side_lanes = self.env.road.network.all_side_lanes(self.env.vehicle.lane_index)
            self.features_range = {
                "x": [-5.0 * MDPVehicle.SPEED_MAX, 5.0 * MDPVehicle.SPEED_MAX],
                "y": [-AbstractLane.DEFAULT_WIDTH * len(side_lanes), AbstractLane.DEFAULT_WIDTH * len(side_lanes)],
                "vx": [-2*MDPVehicle.SPEED_MAX, 2*MDPVehicle.SPEED_MAX],
                "vy": [-2*MDPVehicle.SPEED_MAX, 2*MDPVehicle.SPEED_MAX]
            }
        for feature, f_range in self.features_range.items():
            if feature in df:
                df[feature] = utils.remap(df[feature], [f_range[0], f_range[1]], [-1, 1])
        return df
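The second variant makes the normalization data-driven: each feature listed in `features_range` is remapped into [-1, 1], and features without a declared range are left untouched. A toy illustration with assumed ranges (not taken from the environment):

import pandas as pd

features_range = {"x": [-150.0, 150.0], "vx": [-60.0, 60.0]}  # assumed illustrative ranges
df = pd.DataFrame({"x": [75.0, -150.0], "vx": [30.0, 0.0], "cos_h": [1.0, 0.9]})

for feature, (lo, hi) in features_range.items():
    if feature in df:
        # Same mapping as utils.remap(df[feature], [lo, hi], [-1, 1])
        df[feature] = (df[feature] - lo) / (hi - lo) * 2 - 1

print(df)  # x -> [0.5, -1.0], vx -> [0.5, 0.0]; cos_h has no declared range and is left as-is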
 def _reward_cav(self, infos, terminal):
     """
         When crashed receive -1, when exit highway receive 1, else 0
     :param action: the action performed
     :return: the reward of the state-action transition
     """
     CinB_reward = 0
     BinC_reward = 0
      High_velocity_reward = 1  # Reward received at 40 m/s, linearly mapped down to -1 at 20 m/s (0 at 30 m/s)
     # speed_reward = np.clip(utils.remap(self.vehicle.velocity, [30, 40], [0, High_velocity_reward]), 0, High_velocity_reward)
     speed_reward = np.clip(
         utils.remap(self.vehicle.velocity, [20, 40],
                     [-High_velocity_reward, High_velocity_reward]),
         -High_velocity_reward, High_velocity_reward)
     scene_type = infos["scene_type"]
     if not scene_type:
         return speed_reward
     else:
         if scene_type == "CinB":
             return CinB_reward
         elif scene_type == "BinC":
             return BinC_reward
         else:
             return 0
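The speed term above maps velocity linearly from [20, 40] m/s to [-1, 1] and clips anything outside that range, so 20 m/s earns -1, 30 m/s earns 0 and 40 m/s earns +1. A quick numeric check:

import numpy as np

for v in (15.0, 20.0, 30.0, 40.0, 45.0):
    # Equivalent to np.clip(utils.remap(v, [20, 40], [-1, 1]), -1, 1)
    speed_reward = np.clip((v - 20.0) / (40.0 - 20.0) * 2 - 1, -1, 1)
    print(v, speed_reward)  # 15 -> -1 (clipped), 20 -> -1, 30 -> 0, 40 -> 1, 45 -> 1 (clipped)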
    def sample(self, explore=False):
        self.step += 1
        # print(self._current_observation_n)

        if self._current_observation_n is None:
            # print('now updating')
            self._current_observation_n = self.env.reset()

        action_n = []
        supplied_observation = []

        observations = np.zeros((2, self.env.num_state))
        next_observations = np.zeros((2, self.env.num_state))
        if self.env.sim_step >= self.env.num_state - 3:
            print('warning: sim_step is close to num_state, the one-hot time index may overflow')
        observations[0][self.env.sim_step] = 1
        observations[1][self.env.sim_step] = 1
        next_observations[0][self.env.sim_step + 1] = 1
        next_observations[1][self.env.sim_step + 1] = 1
        relative_info = np.zeros((2, 2))
        speed_max = 40
        velocity_range = 2 * 40
        x_position_range = speed_max
        delta_dx = self.env.road.vehicles[1].position[0] - self.env.road.vehicles[0].position[0]
        delta_vx = self.env.road.vehicles[1].velocity - self.env.road.vehicles[0].velocity
        relative_info[0][0] = utils.remap(
            delta_dx, [-x_position_range, x_position_range], [-1, 1])
        relative_info[0][1] = utils.remap(delta_vx,
                                          [-velocity_range, velocity_range],
                                          [-1, 1])
        relative_info[1][0] = -relative_info[0][0]
        relative_info[1][1] = -relative_info[0][1]
        observations[:, -2:] = relative_info
        if explore:
            for i in range(self.agent_num):
                action_n.append([np.random.randint(0, self.env.action_num)])

            for i in range(self.leader_num):
                supplied_observation.append(observations[i])
            for i in range(self.leader_num, self.env.agent_num):
                mix_obs = np.hstack(
                    (observations[i],
                     tf.one_hot(action_n[0][0],
                                self.env.action_num))).reshape(1, -1)
                supplied_observation.append(mix_obs)

        else:
            for i in range(self.leader_num):
                supplied_observation.append(observations[i])
                action_n.append(self.train_agents[0].act(
                    observations[i].reshape(1, -1)))

            for i in range(self.leader_num, self.env.agent_num):
                mix_obs = np.hstack(
                    (observations[i],
                     tf.one_hot(action_n[0][0],
                                self.env.action_num))).reshape(1, -1)
                supplied_observation.append(mix_obs)
                follower_action = self.train_agents[1].act(
                    mix_obs.reshape(1, -1))
                action_n.append(follower_action)

        action_n = np.asarray(action_n)

        pres_valid_conditions_n = []
        next_valid_conditions_n = []

        #self.env.render()
        '''
        for i in range(5):
            for j in range(5):
                print("q value for upper agent ", i, j, self.train_agents[0]._qf.get_values(np.hstack((observations[0], tf.one_hot(i, self.env.action_num), tf.one_hot(j, self.env.action_num))).reshape(1, -1)))
        print()
        for i in range(5):
            for j in range(5):
                print("q value for lower agent ", i, j, self.train_agents[1]._qf.get_values(np.hstack((observations[1], tf.one_hot(i, self.env.action_num), tf.one_hot(j, self.env.action_num))).reshape(1, -1))) 
        print('a0 = ', action_n[0])
        print('a1 = ', action_n[1])
        print(self.env.road.vehicles[0].position[0], self.env.road.vehicles[1].position[0])
        '''
        next_observation_n, reward_n, done_n, info = self.env.step(action_n)

        delta_dx = self.env.road.vehicles[1].position[0] - self.env.road.vehicles[0].position[0]
        delta_vx = self.env.road.vehicles[1].velocity - self.env.road.vehicles[0].velocity
        relative_info[0][0] = utils.remap(
            delta_dx, [-x_position_range, x_position_range], [-1, 1])
        relative_info[0][1] = utils.remap(delta_vx,
                                          [-velocity_range, velocity_range],
                                          [-1, 1])
        relative_info[1][0] = -relative_info[0][0]
        relative_info[1][1] = -relative_info[0][1]
        next_observations[:, -2:] = relative_info

        if self._global_reward:
            reward_n = np.array([np.sum(reward_n)] * self.agent_num)

        self._path_length += 1
        self._path_return += np.array(reward_n, dtype=np.float32)
        self._total_samples += 1

        # All agents' actions flattened into a single vector, shared as the opponent action
        opponent_action = action_n.flatten()
        for i, agent in enumerate(self.agents):
            agent.replay_buffer.add_sample(
                # observation=self._current_observation_n[i].astype(np.float32),
                observation=supplied_observation[i],
                # action=action_n[i].astype(np.float32),
                action=tf.one_hot(action_n[i], self.env.action_num),
                # reward=reward_n[i].astype(np.float32),
                reward=np.float32(reward_n[i]),
                # terminal=done_n[i].astype(np.float32),
                terminal=np.float32(done_n[i]),
                # next_observation=next_observation_n[i].astype(np.float32),
                next_observation=np.float32(next_observations[i]),
                # opponent_action=opponent_action.astype(np.float32)
                opponent_action=np.int32(opponent_action),
            )

        self._current_observation_n = next_observation_n

        if self.step % (25 * 1000) == 0:
            print("steps: {}, episodes: {}, mean episode reward: {}".format(
                self.step, len(reward_n), np.mean(reward_n[-1000:])))

        if np.all(done_n) or self._path_length >= self._max_path_length:
            self._current_observation_n = self.env.reset()
            self._max_path_return = np.maximum(self._max_path_return,
                                               self._path_return)
            self._mean_path_return = self._path_return / self._path_length
            self._last_path_return = self._path_return
            self._path_length = 0

            self._path_return = np.zeros(self.agent_num)
            self._n_episodes += 1
            self.log_diagnostics()
            logger.log(tabular)
            logger.dump_all()
        else:
            self._current_observation_n = next_observation_n
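The follower's observation in `sample` is its own state vector with the leader's discrete action appended as a one-hot vector. A minimal numpy sketch of that augmentation (`np.eye` stands in for `tf.one_hot`; the sizes are illustrative):

import numpy as np

num_state, action_num = 10, 5          # assumed sizes
follower_obs = np.zeros(num_state)     # time one-hot plus relative info, as built above
leader_action = 3
mix_obs = np.hstack((follower_obs, np.eye(action_num)[leader_action])).reshape(1, -1)
print(mix_obs.shape)  # (1, 15): state dimensions plus one slot per leader action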