def get_data(self):
     """
     Trajectories are special: the atomic files store individual points rather
     than whole trajectories, so the points must first be cut into trajectories.
     """
     if self.data is None:
         if self.config['cache_dataset'] and os.path.exists(
                 self.encoder.cache_file_name):
             # Load the cached encoded data.
             with open(self.encoder.cache_file_name, 'r') as f:
                 self.data = json.load(f)
             self.pad_item = self.data['pad_item']
         else:
             cut_data = self.cutter_filter()
             encoded_data = self.encode_traj(cut_data)
             self.data = encoded_data
             self.pad_item = self.encoder.pad_item
             if self.config['cache_dataset']:
                 if not os.path.exists(self.cache_file_folder):
                     os.makedirs(self.cache_file_folder)
                 with open(self.encoder.cache_file_name, 'w') as f:
                     json.dump(encoded_data, f)
     # Split into train/eval/test sets. There are two ways: by user, or by
     # number of trajectories.
     # TODO: make this configurable; for now, split by number of trajectories.
     train_data, eval_data, test_data = self.divide_data()
     return generate_dataloader(train_data, eval_data, test_data,
                                self.encoder.feature_dict,
                                self.config['batch_size'],
                                self.config['num_workers'], self.pad_item,
                                self.encoder.feature_max_len)
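Both the cache branch above and the ones in the later examples follow the same load-or-build pattern: round-trip the encoded dict through JSON, keyed on a cache file. A minimal standalone sketch of that pattern (load_or_build and build_fn are illustrative names, not part of LibCity):

import json
import os

def load_or_build(cache_file, build_fn, use_cache=True):
    """Return cached JSON data if present; otherwise build it and cache it."""
    if use_cache and os.path.exists(cache_file):
        with open(cache_file, 'r') as f:
            return json.load(f)
    data = build_fn()
    if use_cache:
        os.makedirs(os.path.dirname(cache_file) or '.', exist_ok=True)
        with open(cache_file, 'w') as f:
            json.dump(data, f)
    return data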
Example #2
    def get_data(self):
        """
        Return the DataLoaders of the data, including the training, test and validation data.

        Returns:
            tuple: tuple contains:
                train_dataloader: Dataloader composed of Batch (class) \n
                eval_dataloader: Dataloader composed of Batch (class) \n
                test_dataloader: Dataloader composed of Batch (class)
        """
        # Load the dataset
        x_train, y_train, x_val, y_val, x_test, y_test = [], [], [], [], [], []
        if self.data is None:
            self.data = {}
            if self.cache_dataset and os.path.exists(self.cache_file_name):
                x_train, y_train, x_val, y_val, x_test, y_test = \
                    self._load_cache_train_val_test()
            else:
                x_train, y_train, x_val, y_val, x_test, y_test = \
                    self._generate_train_val_test()
        # Normalize the data
        self.feature_dim = x_train.shape[-1]
        self.scaler = self._get_scalar(x_train, y_train)
        x_train[..., :self.output_dim] = self.scaler.transform(
            x_train[..., :self.output_dim])
        y_train[..., :self.output_dim] = self.scaler.transform(
            y_train[..., :self.output_dim])
        x_val[..., :self.output_dim] = self.scaler.transform(
            x_val[..., :self.output_dim])
        y_val[..., :self.output_dim] = self.scaler.transform(
            y_val[..., :self.output_dim])
        x_test[..., :self.output_dim] = self.scaler.transform(
            x_test[..., :self.output_dim])
        y_test[..., :self.output_dim] = self.scaler.transform(
            y_test[..., :self.output_dim])
        if self.normal_external:
            x_train[..., self.output_dim:] = self.scaler.transform(
                x_train[..., self.output_dim:])
            y_train[..., self.output_dim:] = self.scaler.transform(
                y_train[..., self.output_dim:])
            x_val[..., self.output_dim:] = self.scaler.transform(
                x_val[..., self.output_dim:])
            y_val[..., self.output_dim:] = self.scaler.transform(
                y_val[..., self.output_dim:])
            x_test[..., self.output_dim:] = self.scaler.transform(
                x_test[..., self.output_dim:])
            y_test[..., self.output_dim:] = self.scaler.transform(
                y_test[..., self.output_dim:])
        # Zip the training X and y together into a list of samples; the test and
        # validation sets are handled the same way.
        # x_train/y_train: (num_samples, input_length, ..., feature_dim)
        # train_data (list): train_data[i] is the tuple (x_train[i], y_train[i])
        train_data = list(zip(x_train, y_train))
        eval_data = list(zip(x_val, y_val))
        test_data = list(zip(x_test, y_test))
        # Build the DataLoaders
        self.train_dataloader, self.eval_dataloader, self.test_dataloader = \
            generate_dataloader(train_data, eval_data, test_data, self.feature_name,
                                self.batch_size, self.num_workers, pad_with_last_sample=self.pad_with_last_sample)
        return self.train_dataloader, self.eval_dataloader, self.test_dataloader
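The repetitive transform calls above all apply one pattern: scale only the first output_dim feature channels in place, and touch the external channels only when normal_external is set. A minimal sketch of that slicing with a hand-rolled z-score scaler (this StandardScaler is illustrative, not necessarily LibCity's):

import numpy as np

class StandardScaler:
    """Z-score scaler fitted on the training targets."""
    def __init__(self, mean, std):
        self.mean, self.std = mean, std

    def transform(self, data):
        return (data - self.mean) / self.std

x_train = np.random.rand(8, 12, 4)  # (num_samples, input_length, feature_dim)
output_dim = 2
scaler = StandardScaler(x_train[..., :output_dim].mean(),
                        x_train[..., :output_dim].std())
# Only the first output_dim channels are scaled; external features stay raw.
x_train[..., :output_dim] = scaler.transform(x_train[..., :output_dim])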
Example #3
 def get_data(self):
     """
     Trajectories are special: the atomic files store individual points rather
     than whole trajectories, so the points must first be cut into trajectories.
     """
     if self.data is None:
         if self.config['cache_dataset'] and os.path.exists(
                 self.cache_file_name):
             # Load the cached data.
             with open(self.cache_file_name, 'r') as f:
                 self.data = json.load(f)
             loc_pad = self.data['loc_size'] - 1
             tim_pad = self.data['tim_size'] - 1
             self.pad_item = {
                 'current_loc': loc_pad,
                 'history_loc': loc_pad,
                 'current_tim': tim_pad,
                 'history_tim': tim_pad,
                 'text': np.zeros(self.data['text_size'])
             }
         else:
             transformed_data = self.cutter_filter()
             # Padding parameters: reserve one extra id at the end of each vocabulary.
             loc_pad = transformed_data['loc_size']
             transformed_data['loc_size'] += 1
             tim_pad = transformed_data['tim_size']
             transformed_data['tim_size'] += 1
             self.pad_item = {
                 'current_loc': loc_pad,
                 'history_loc': loc_pad,
                 'current_tim': tim_pad,
                 'history_tim': tim_pad,
                 'text': np.zeros(transformed_data['text_size'])
             }
             self.data = transformed_data
             if self.config['cache_dataset']:
                 if not os.path.exists(self.cache_file_folder):
                     os.makedirs(self.cache_file_folder)
                 with open(self.cache_file_name, 'w') as f:
                     json.dump(transformed_data, f)
     # After cutting the trajectories, build the batches.
     # Split into train/eval/test sets. There are two ways: by user, or by
     # number of trajectories.
     # TODO: make this configurable; for now, split by number of trajectories.
     train_data, eval_data, test_data = self.gen_input()
     return generate_dataloader(train_data, eval_data, test_data,
                                self.feature_name,
                                self.config['batch_size'],
                                self.config['num_workers'], self.pad_item,
                                self.pad_max_len)
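The else branch above reserves one extra id at the end of each vocabulary for padding: the old loc_size/tim_size becomes the pad id, and the size is then incremented so the pad id remains a valid embedding index. A minimal sketch of that scheme (names are illustrative):

def reserve_pad_id(vocab_size):
    """Use the current vocab_size as the pad id and grow the vocab by one."""
    pad_id = vocab_size          # real ids are 0 .. vocab_size - 1
    return pad_id, vocab_size + 1

loc_pad, loc_size = reserve_pad_id(100)  # loc_pad == 100, loc_size == 101
assert 0 <= loc_pad < loc_size           # pad id fits inside the embedding table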
Example #4
 def get_data(self):
     """
     Get the data and normalize it, then return the DataLoaders for the
     training, test and validation sets.
     :return:
         train_dataloader (pytorch.DataLoader)
         eval_dataloader (pytorch.DataLoader)
         test_dataloader (pytorch.DataLoader)
         all the dataloaders are composed of Batch (class)
     """
     # Load the dataset
     x_train, y_train, x_val, y_val, x_test, y_test = [], [], [], [], [], []
     ext_x_train, ext_y_train, ext_x_test, ext_y_test, ext_x_val, ext_y_val = [], [], [], [], [], []
     if self.data is None:
         self.data = {}
         if self.cache_dataset and os.path.exists(self.cache_file_name):
             x_train, y_train, x_val, y_val, x_test, y_test,  \
                 ext_x_train, ext_y_train, ext_x_test, ext_y_test, ext_x_val, ext_y_val \
                 = self._load_cache_train_val_test()
         else:
             x_train, y_train, x_val, y_val, x_test, y_test, \
                 ext_x_train, ext_y_train, ext_x_test, ext_y_test, ext_x_val, ext_y_val \
                 = self._generate_train_val_test()
     # Normalize the data
     self.feature_dim = x_train.shape[-1]
     self.scaler = self._get_scalar(x_train, y_train)
     x_train = self.scaler.transform(x_train)
     y_train = self.scaler.transform(y_train)
     x_val = self.scaler.transform(x_val)
     y_val = self.scaler.transform(y_val)
     x_test = self.scaler.transform(x_test)
     y_test = self.scaler.transform(y_test)
     if self.normal_external:
         ext_x_train = self.scaler.transform(ext_x_train)
         ext_y_train = self.scaler.transform(ext_y_train)
         ext_x_val = self.scaler.transform(ext_x_val)
         ext_y_val = self.scaler.transform(ext_y_val)
         ext_x_test = self.scaler.transform(ext_x_test)
         ext_y_test = self.scaler.transform(ext_y_test)
     # Zip the training X, y and external features together into a list of
     # samples; the test and validation sets are handled the same way.
     # x_train/y_train: (num_samples, input_length, ..., feature_dim)
     # train_data (list): train_data[i] is the tuple
     #     (x_train[i], y_train[i], ext_x_train[i], ext_y_train[i])
     train_data = list(zip(x_train, y_train, ext_x_train, ext_y_train))
     eval_data = list(zip(x_val, y_val, ext_x_val, ext_y_val))
     test_data = list(zip(x_test, y_test, ext_x_test, ext_y_test))
     # Build the DataLoaders
     self.train_dataloader, self.eval_dataloader, self.test_dataloader = \
         generate_dataloader(train_data, eval_data, test_data, self.feature_name,
                             self.batch_size, self.num_workers, pad_with_last_sample=self.pad_with_last_sample)
     return self.train_dataloader, self.eval_dataloader, self.test_dataloader
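list(zip(...)) above turns parallel per-sample arrays into a list of per-sample tuples, which is the shape the dataloader helper consumes. A standalone sketch of the same data layout using a plain torch DataLoader (generate_dataloader is LibCity's own helper; this only illustrates the layout, assuming torch is installed):

import numpy as np
from torch.utils.data import DataLoader

x = np.zeros((10, 12, 3), dtype=np.float32)  # (num_samples, input_length, feature_dim)
y = np.ones((10, 6, 3), dtype=np.float32)
train_data = list(zip(x, y))                 # train_data[i] == (x[i], y[i])
loader = DataLoader(train_data, batch_size=4, shuffle=True)
for batch_x, batch_y in loader:
    # default_collate stacks the numpy arrays into tensors per batch
    print(batch_x.shape, batch_y.shape)      # e.g. torch.Size([4, 12, 3]) torch.Size([4, 6, 3])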
Example #5
 def get_data(self):
     """
     获取切割后的数据集
     Returns:
     """
     if self.data is None:
         if self.config['cache_dataset_load'] and os.path.exists(
                 self.encoder.cache_file_name):
             # Load the cached data.
             with open(self.encoder.cache_file_name, 'r') as f:
                 self.data = json.load(f)
         else:
             dyna = pd.read_csv(
                 os.path.join(self.data_path, '{}.dyna'.format(
                     self.config['dataset']))).values.T
             geo = pd.read_csv(
                 os.path.join(self.data_path, '{}.geo'.format(
                     self.config['dataset']))).values.T
             self.geo = geo
             sequence, sequence_user, sequence_time, sequence_distance, id_to_geo = \
                 self.build_sequence(dyna)
             top_n = self.pop_n(sequence, self.num_sample)
             train_set = (sequence, sequence_user, sequence_time,
                          sequence_distance)
             final_train_set, final_eval_test, final_test_set = \
                 self.load_data(
                     train_set,
                     self.num_sample,
                     top_n,
                     id_to_geo,
                     True)
             self.data = {
                 "train": final_train_set,
                 "eval": final_eval_test,
                 "test": final_test_set
             }
             # Not worrying about this part for now.
             if self.config['cache_dataset_save']:
                 if not os.path.exists(self.cache_file_folder):
                     os.makedirs(self.cache_file_folder)
                 with open(self.encoder.cache_file_name, 'w') as f:
                     json.dump(self.data, f)
     print("finish getting data!")
     # 这里没传self.pad_item
     return generate_dataloader(train_data=self.data["train"],
                                eval_data=self.data["eval"],
                                test_data=self.data["test"],
                                feature_name={
                                    'loc': 'float',
                                    'tim': 'float',
                                    'dis': 'float',
                                    'uid': 'float',
                                    'loc_neg': 'float',
                                    'tim_neg': 'float',
                                    'dis_neg': 'float',
                                    'target': 'int'
                                },
                                batch_size=self.config['batch_size'],
                                num_workers=self.config['num_workers'],
                                shuffle=False)
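The feature_name dict passed above declares, for each field of a sample tuple, the dtype its batched tensor should get. A minimal sketch of how a batch builder might consume such a mapping (illustrative only, not LibCity's Batch implementation):

import torch

feature_name = {'loc': 'float', 'tim': 'float', 'target': 'int'}

def sample_to_tensors(sample, feature_name):
    """Convert one sample tuple into named tensors according to feature_name."""
    dtypes = {'float': torch.float, 'int': torch.long}
    return {name: torch.tensor(value, dtype=dtypes[dtype])
            for value, (name, dtype) in zip(sample, feature_name.items())}

print(sample_to_tensors(([1.0, 2.0], [0.5, 0.6], 3), feature_name))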