Example #1
    def encode(self, uid, trajectories):
        """Standard encoder; uses the same scheme as DeepMove.

        Re-codes POI ids and encodes each timestamp as the hour offset
        from the base time (midnight of the trajectory's first day).
        Additionally records, for each input point, its distance to the
        target (last) point of the trajectory.

        Args:
            uid ([type]): same as AbstractTrajectoryEncoder
            trajectories ([type]): same as AbstractTrajectoryEncoder
                trajectory1 = [
                    (location ID, timestamp, timezone_offset_in_minutes),
                    (location ID, timestamp, timezone_offset_in_minutes),
                    .....
                ]

        Returns:
            list: one trace per trajectory, each of the form
                [history_loc, history_tim, current_loc, current_tim,
                 target, target_tim, uid, current_dis]
        """
        # Re-encode the uid with a running counter.
        uid = self.uid
        self.uid += 1
        encoded_trajectories = []
        history_loc = []
        history_tim = []
        for traj in trajectories:
            current_loc = []
            current_tim = []
            current_longi = []
            current_lati = []
            start_time = parse_time(traj[0][1], traj[0][2])
            # Midnight of the first point's day is the time-offset origin.
            base_time = cal_basetime(start_time, True)
            for point in traj:
                loc = point[0]
                now_time = parse_time(point[1], point[2])
                if loc not in self.location2id:
                    self.location2id[loc] = self.loc_id
                    self.loc_id += 1
                current_loc.append(self.location2id[loc])
                # geo_coord is keyed by the *raw* location id.
                current_lati.append(self.geo_coord[loc][0])
                current_longi.append(self.geo_coord[loc][1])
                time_code = int(cal_timeoff(now_time, base_time))
                if time_code > self.tim_max:
                    self.tim_max = time_code
                current_tim.append(time_code)
            # The trajectory is fully encoded; now build the model input.
            target = current_loc[-1]
            target_tim = current_tim[-1]
            current_loc = current_loc[:-1]
            current_tim = current_tim[:-1]
            # Distance from every input point to the target point.
            # BUG FIX: previously the target's coordinates were looked up as
            # self.geo_coord[self.location2id[...]] — i.e. with the encoded
            # id — although geo_coord is keyed by the raw location id (see
            # the loop above). The target's coordinates are simply the last
            # entries of current_lati / current_longi.
            current_dis = euclidean_dist(
                current_lati[-1] - np.array(current_lati[:-1]),
                current_longi[-1] - np.array(current_longi[:-1]))
            trace = [history_loc, history_tim, current_loc, current_tim,
                     target, target_tim, uid, current_dis]
            encoded_trajectories.append(trace)
            if self.history_type == 'splice':
                history_loc += current_loc
                history_tim += current_tim
            else:
                history_loc.append(current_loc)
                history_tim.append(current_tim)
        return encoded_trajectories
Example #2
    def encode(self, uid, trajectories):
        """Standard encoder; follows the same scheme as DeepMove.

        Re-codes POI ids and encodes each timestamp as an integer hour
        offset from the base time of its trajectory.

        Args:
            uid ([type]): same as AbstractTrajectoryEncoder
            trajectories ([type]): same as AbstractTrajectoryEncoder
                trajectory1 = [
                    (location ID, timestamp, timezone_offset_in_minutes),
                    (location ID, timestamp, timezone_offset_in_minutes),
                    .....
                ]
        """
        # Re-encode the user id with a running counter.
        uid = self.uid
        self.uid += 1
        encoded_trajectories = []
        history_loc = []
        history_tim = []

        def extend_history(locs, tims):
            # Fold a finished trajectory into the history feature.
            if self.history_type == 'splice':
                history_loc.extend(locs)
                history_tim.extend(tims)
            else:
                history_loc.append(locs)
                history_tim.append(tims)

        for traj_no, traj in enumerate(trajectories):
            locs = []
            tims = []
            first_time = parse_time(traj[0][1], traj[0][2])
            # Midnight of the first point's day anchors the time offsets.
            anchor = cal_basetime(first_time, True)
            for point in traj:
                raw_loc = point[0]
                visit_time = parse_time(point[1], point[2])
                if raw_loc not in self.location2id:
                    self.location2id[raw_loc] = self.loc_id
                    self.loc_id += 1
                locs.append(self.location2id[raw_loc])
                code = int(cal_timeoff(visit_time, anchor))
                self.tim_max = max(self.tim_max, code)
                tims.append(code)
            if traj_no == 0:
                # The very first trajectory has no history of its own, so it
                # only seeds the history feature — it never becomes an input.
                extend_history(locs, tims)
                continue
            current_loc = locs[:-1]
            current_tim = tims[:-1]
            # trace layout: [history_loc, history_tim, current_loc,
            #               current_tim, target, target_tim, uid]
            encoded_trajectories.append([
                history_loc, history_tim, current_loc, current_tim,
                locs[-1], tims[-1], uid,
            ])
            extend_history(current_loc, current_tim)
        return encoded_trajectories
 def cutter_filter(self):
     """Cut every user's trajectory into sessions and filter short ones.

     Depending on ``window_type`` the trajectory is cut either by a time
     window or by a fixed number of points.  Sessions shorter than
     ``min_session_len`` are dropped, and users with fewer than
     ``min_sessions`` remaining sessions are dropped entirely.

     Returns:
         dict: cut trajectories, stored as
             {
                 uid: [
                     [
                         (location ID, timestamp, timezone_offset_in_minutes),
                         (location ID, timestamp, timezone_offset_in_minutes),
                         ...
                     ],
                     ...
                 ],
                 ...
             }
     """
     # load data according to config
     traj = pd.read_csv(
         os.path.join(self.data_path,
                      '{}.dyna'.format(self.config['dataset'])))
     user_set = pd.unique(traj['entity_id'])
     res = {}
     min_session_len = self.config['min_session_len']
     min_sessions = self.config['min_sessions']
     window_size = self.config['window_size']
     window_type = self.config['window_type']

     def as_point(row):
         # One trajectory point in (loc, time, tz-offset) form.
         return (row['location'], row['time'],
                 row['timezone_offset_in_minutes'])

     if window_type == 'time_window':
         # Cut by time window; for windows longer than 12h the base time
         # is local midnight.
         base_zero = window_size > 12
         for uid in user_set:
             usr_traj = traj[traj['entity_id'] == uid]
             sessions = []  # every session of this user
             session = []  # the session currently being built
             # Local time is used throughout.
             start_time = parse_time(
                 usr_traj.iloc[0]['time'],
                 int(usr_traj.iloc[0]['timezone_offset_in_minutes']))
             base_time = cal_basetime(start_time, base_zero)
             # BUG FIX: iterate positionally.  iterrows() yields the
             # DataFrame *label* index — the global row number of the
             # unfiltered frame — so the old `index == 0` test only matched
             # the very first user's first row.
             for i, (_, row) in enumerate(usr_traj.iterrows()):
                 if i == 0:
                     assert start_time.hour - base_time.hour < window_size
                     session.append(as_point(row))
                 else:
                     now_time = parse_time(
                         row['time'],
                         int(row['timezone_offset_in_minutes']))
                     time_off = cal_timeoff(now_time, base_time)
                     if 0 <= time_off < window_size:
                         session.append(as_point(row))
                     else:
                         # Point falls outside the window: close the current
                         # session and start a new one anchored here.
                         if len(session) >= min_session_len:
                             sessions.append(session)
                         session = []
                         start_time = now_time
                         base_time = cal_basetime(start_time, base_zero)
                         session.append(as_point(row))
             if len(session) >= min_session_len:
                 sessions.append(session)
             if len(sessions) >= min_sessions:
                 res[uid] = sessions
     else:
         # Cut by trajectory length.
         for uid in user_set:
             usr_traj = traj[traj['entity_id'] == uid]
             sessions = []  # every session of this user
             session = []  # the session currently being built
             for _, row in usr_traj.iterrows():
                 if len(session) < window_size:
                     session.append(as_point(row))
                 else:
                     sessions.append(session)
                     session = []
                     session.append(as_point(row))
             if len(session) >= min_session_len:
                 sessions.append(session)
             if len(sessions) >= min_sessions:
                 res[uid] = sessions
     return res
 def cutter_filter(self):
     """
     Cut each user's trajectory into sessions (stored as a dict) and attach
     to every point the semantic information (useful word list) of the
     visited POI.
     """
     """
         {
             uid: [
                 [
                     [loc, tim, [useful word list]],
                     [loc, tim, [useful word list]],
                     ...
                 ],
                 [
                     [loc, tim, [useful word list]],
                     [loc, tim, [useful word list]],
                     ...
                 ],
                 ...
             ],
             ...
         }
     """
     # load data according to config
     traj = pd.read_csv(
         os.path.join(self.data_path,
                      '{}.dyna'.format(self.config['dataset'])))
     poi = pd.read_csv(
         os.path.join(self.data_path,
                      '{}.geo'.format(self.config['dataset'])))
     # Collect the corpus words that actually occur in the trajectory data.
     useful_vec = {}
     text_vec = self.load_wordvec()  # load the word-vector corpus
     user_set = pd.unique(traj['entity_id'])
     res = {}
     min_session_len = self.config['min_session_len']
     min_sessions = self.config['min_sessions']
     time_window_size = 24  # SERM uses its own fixed time encoding (paper)
     base_zero = time_window_size > 12
     useful_uid = 0  # some users get filtered out, so uids are re-numbered
     useful_loc = {}  # locations are re-numbered for the same reason
     loc_id = 0
     for uid in user_set:
         usr_traj = traj[traj['entity_id'] == uid]
         sessions = []  # every session of this user
         session = []  # the session currently being built
         # Local time is used throughout.
         start_time = parse_time(
             usr_traj.iloc[0]['time'],
             int(usr_traj.iloc[0]['timezone_offset_in_minutes']))
         base_time = cal_basetime(start_time, base_zero)
         for index, row in usr_traj.iterrows():
             # NOTE(review): `index` is the DataFrame label (the global row
             # number of the unfiltered frame), so this branch only fires
             # for the very first user; other users' first rows fall through
             # to the else branch, which happens to handle them equivalently
             # (time_off of the first point is within the window) — confirm.
             if index == 0:
                 assert start_time.hour - base_time.hour < time_window_size
                 # Semantic info of the first point.
                 useful_words_list = []
                 if self.config['dataset'] in [
                         'foursquare_tky', 'foursquare_nyk'
                 ]:
                     # TODO: hard-coding dataset names here is fragile
                     words = poi.iloc[
                         row['location']]['venue_category_name'].split(' ')
                     for w in words:
                         w = w.lower()
                         if (w in text_vec) and (w not in useful_vec):
                             useful_vec[w] = text_vec[w]
                         if w in useful_vec:
                             useful_words_list.append(w)
                 # Hour offset from base; weekend hours are shifted by 24 so
                 # weekday/weekend hours get distinct codes (range 0..47).
                 time_code = start_time.hour - base_time.hour
                 if start_time.weekday() == 5 or start_time.weekday() == 6:
                     time_code += 24
                 session.append(
                     [row['location'], time_code, useful_words_list])
             else:
                 now_time = parse_time(
                     row['time'], int(row['timezone_offset_in_minutes']))
                 time_off = cal_timeoff(now_time, base_time)
                 # Semantic info (duplicated from the first-point branch).
                 useful_words_list = []
                 if self.config['dataset'] in [
                         'foursquare_tky', 'foursquare_nyk'
                 ]:
                     # TODO: hard-coding dataset names here is fragile
                     words = poi.iloc[
                         row['location']]['venue_category_name'].split(' ')
                     for w in words:
                         w = w.lower()
                         if (w in text_vec) and (w not in useful_vec):
                             useful_vec[w] = text_vec[w]
                         if w in useful_vec:
                             useful_words_list.append(w)
                 if time_off < time_window_size and time_off >= 0:
                     # SERM's special time encoding: hour offset, plus 24 on
                     # weekends.
                     time_code = int(time_off)
                     if now_time.weekday() in [5, 6]:
                         time_code += 24
                     assert int(time_off) < time_window_size
                     session.append(
                         [row['location'], time_code, useful_words_list])
                 else:
                     # Point falls outside the window: close the current
                     # session and start a new one anchored at this point.
                     if len(session) >= min_session_len:
                         sessions.append(session)
                     session = []
                     start_time = now_time
                     base_time = cal_basetime(start_time, base_zero)
                     time_code = start_time.hour - base_time.hour
                     if start_time.weekday() in [5, 6]:
                         time_code += 24
                     session.append(
                         [row['location'], time_code, useful_words_list])
         if len(session) >= min_session_len:
             sessions.append(session)
         if len(sessions) >= min_sessions:
             # Only now do we know every loc in `sessions` will be used, so
             # re-number them here.
             for i in range(len(sessions)):
                 for j in range(len(sessions[i])):
                     loc = sessions[i][j][0]
                     if loc not in useful_loc:
                         useful_loc[loc] = loc_id
                         loc_id += 1
                     sessions[i][j][0] = useful_loc[loc]
             res[useful_uid] = sessions
             useful_uid += 1
     # uid_size / loc_size may exceed the counts actually present in `res`
     # because some users/locations were filtered out.
     loc_size = loc_id
     uid_size = useful_uid
     # Build word_vec / word_index from useful_vec.
     word_index = {}
     word_vec = []
     text_size = len(useful_vec)
     for i, w in enumerate(useful_vec.keys()):
         word_index[w] = i
         word_vec.append(useful_vec[w])
     print('loc_size: {}, uid_size: {}, text_size: {}'.format(
         loc_size, uid_size, text_size))
     return {
         'loc_size': loc_size,
         'tim_size': 48,
         'uid_size': uid_size,
         'text_size': text_size,
         'word_vec': word_vec,
         'word_index': word_index,
         'data': res
     }
Example #5
 def cutter_filter(self):
     """Cut every user's trajectory into time-window sessions.

     Sessions shorter than ``min_session_len`` are dropped, and users with
     fewer than ``min_sessions`` remaining sessions are dropped entirely.

     Returns:
         dict: with keys 'loc_size', 'tim_size', 'uid_size', 'data';
             'data' is stored as
             {
                 uid (str): [
                     [
                         [loc, tim],
                         [loc, tim],
                         ...
                     ],
                     ...
                 ],
                 ...
             }
     """
     # load data according to config
     traj = pd.read_csv(
         os.path.join(self.data_path,
                      '{}.dyna'.format(self.config['dataset'])))
     user_set = pd.unique(traj['entity_id'])
     res = {}
     min_session_len = self.config['min_session_len']
     min_sessions = self.config['min_sessions']
     time_window_size = self.config['time_window_size']
     # For windows longer than 12h the base time is local midnight.
     base_zero = time_window_size > 12
     for uid in user_set:
         usr_traj = traj[traj['entity_id'] == uid]
         sessions = []  # every session of this user
         session = []  # the session currently being built
         # Local time is used throughout.
         start_time = parse_time(
             usr_traj.iloc[0]['time'],
             int(usr_traj.iloc[0]['timezone_offset_in_minutes']))
         base_time = cal_basetime(start_time, base_zero)
         # BUG FIX: iterate positionally.  iterrows() yields the DataFrame
         # *label* index — the global row number of the unfiltered frame —
         # so the old `index == 0` test only matched the very first user's
         # first row.
         for i, (_, row) in enumerate(usr_traj.iterrows()):
             if i == 0:
                 assert start_time.hour - base_time.hour < time_window_size
                 # time encode from 0 ~ time_window_size
                 session.append(
                     [row['location'], start_time.hour - base_time.hour])
             else:
                 now_time = parse_time(
                     row['time'], int(row['timezone_offset_in_minutes']))
                 time_off = cal_timeoff(now_time, base_time)
                 if 0 <= time_off < time_window_size:
                     session.append([row['location'], int(time_off)])
                 else:
                     # Point falls outside the window: close the current
                     # session and start a new one anchored at this point.
                     if len(session) >= min_session_len:
                         sessions.append(session)
                     session = []
                     start_time = now_time
                     base_time = cal_basetime(start_time, base_zero)
                     session.append([
                         row['location'], start_time.hour - base_time.hour
                     ])
         if len(session) >= min_session_len:
             sessions.append(session)
         if len(sessions) >= min_sessions:
             res[str(uid)] = sessions
     # uid_size / loc_size may exceed the counts actually present in `res`
     # because some users/locations were filtered out.
     poi = pd.read_csv(
         os.path.join(self.data_path,
                      '{}.geo'.format(self.config['dataset'])))
     loc_size = poi.shape[0]
     uid_size = len(user_set)
     print('loc_size: {}, uid_size: {}'.format(loc_size, uid_size))
     return {
         'loc_size': loc_size,
         'tim_size': time_window_size,
         'uid_size': uid_size,
         'data': res
     }