def get_data(self):
    """
    Trajectories are special: the atomic file stores individual points rather
    than whole trajectories, so the points must first be cut into trajectories.
    """
    if self.data is None:
        if self.config['cache_dataset'] and os.path.exists(
                self.encoder.cache_file_name):
            # load cache
            with open(self.encoder.cache_file_name, 'r') as f:
                self.data = json.load(f)
            self.pad_item = self.data['pad_item']
        else:
            cut_data = self.cutter_filter()
            encoded_data = self.encode_traj(cut_data)
            self.data = encoded_data
            self.pad_item = self.encoder.pad_item
            if self.config['cache_dataset']:
                if not os.path.exists(self.cache_file_folder):
                    os.makedirs(self.cache_file_folder)
                with open(self.encoder.cache_file_name, 'w') as f:
                    json.dump(encoded_data, f)
    # Split into train/eval/test sets. Two strategies are possible: by user
    # or by number of trajectories.
    # TODO: make this a config parameter; for now, split by number of trajectories.
    train_data, eval_data, test_data = self.divide_data()
    return generate_dataloader(train_data, eval_data, test_data,
                               self.encoder.feature_dict,
                               self.config['batch_size'],
                               self.config['num_workers'], self.pad_item,
                               self.encoder.feature_max_len)
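# The cache branch above follows a common load-or-compute pattern. Below is a
# minimal, self-contained sketch of that pattern for reference; the names
# load_or_compute and compute_fn are illustrative, not part of the library.
import json
import os


def load_or_compute(cache_path, compute_fn, use_cache=True):
    """Return cached JSON if present; otherwise compute and optionally cache it."""
    if use_cache and os.path.exists(cache_path):
        with open(cache_path, 'r') as f:
            return json.load(f)
    data = compute_fn()
    if use_cache:
        os.makedirs(os.path.dirname(cache_path) or '.', exist_ok=True)
        with open(cache_path, 'w') as f:
            json.dump(data, f)
    return data


# Usage: skip the cache entirely by passing use_cache=False.
data = load_or_compute('unused.json', lambda: {'pad_item': 0}, use_cache=False)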
def get_data(self):
    """
    Return DataLoaders for the training, evaluation, and test data.

    Returns:
        tuple: tuple contains:
            train_dataloader: Dataloader composed of Batch (class) \n
            eval_dataloader: Dataloader composed of Batch (class) \n
            test_dataloader: Dataloader composed of Batch (class) \n
            train_data: list of (x, y) training samples
    """
    # load the dataset
    x_train, y_train, x_val, y_val, x_test, y_test = [], [], [], [], [], []
    if self.data is None:
        self.data = {}
        if self.cache_dataset and os.path.exists(self.cache_file_name):
            x_train, y_train, x_val, y_val, x_test, y_test = \
                self._load_cache_train_val_test()
        else:
            x_train, y_train, x_val, y_val, x_test, y_test = \
                self._generate_train_val_test()
    # normalize the data
    self.feature_dim = x_train.shape[-1]
    self.scaler = self._get_scalar(x_train, y_train)
    x_train[..., :self.output_dim] = self.scaler.transform(
        x_train[..., :self.output_dim])
    y_train[..., :self.output_dim] = self.scaler.transform(
        y_train[..., :self.output_dim])
    x_val[..., :self.output_dim] = self.scaler.transform(
        x_val[..., :self.output_dim])
    y_val[..., :self.output_dim] = self.scaler.transform(
        y_val[..., :self.output_dim])
    x_test[..., :self.output_dim] = self.scaler.transform(
        x_test[..., :self.output_dim])
    y_test[..., :self.output_dim] = self.scaler.transform(
        y_test[..., :self.output_dim])
    if self.normal_external:
        x_train[..., self.output_dim:] = self.scaler.transform(
            x_train[..., self.output_dim:])
        y_train[..., self.output_dim:] = self.scaler.transform(
            y_train[..., self.output_dim:])
        x_val[..., self.output_dim:] = self.scaler.transform(
            x_val[..., self.output_dim:])
        y_val[..., self.output_dim:] = self.scaler.transform(
            y_val[..., self.output_dim:])
        x_test[..., self.output_dim:] = self.scaler.transform(
            x_test[..., self.output_dim:])
        y_test[..., self.output_dim:] = self.scaler.transform(
            y_test[..., self.output_dim:])
    # Zip X and y of the training set into a list of pairs; same for
    # the eval and test sets.
    # x_train/y_train: (num_samples, input_length, ..., feature_dim)
    # train_data (list): train_data[i] is a tuple of x_train[i] and y_train[i]
    train_data = list(zip(x_train, y_train))
    eval_data = list(zip(x_val, y_val))
    test_data = list(zip(x_test, y_test))
    # convert to DataLoaders
    self.train_dataloader, self.eval_dataloader, self.test_dataloader = \
        generate_dataloader(train_data, eval_data, test_data,
                            self.feature_name, self.batch_size,
                            self.num_workers,
                            pad_with_last_sample=self.pad_with_last_sample)
    return self.train_dataloader, self.eval_dataloader, \
        self.test_dataloader, train_data
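# The normalization step above only assumes that self.scaler exposes a
# transform method fit on the training data. _get_scalar's real implementation
# is not shown here; this StandardScaler-style stand-in is a minimal sketch of
# that assumed interface.
import numpy as np


class StandardScaler:
    """z-score normalization with statistics taken from the training set."""

    def __init__(self, mean, std):
        self.mean = mean
        self.std = std

    def transform(self, data):
        return (data - self.mean) / self.std

    def inverse_transform(self, data):
        return data * self.std + self.mean


# Example: fit on the first output_dim channels of x_train, mirroring get_data.
x_train = np.random.rand(8, 12, 207, 2)  # (num_samples, input_length, num_nodes, feature_dim)
output_dim = 1
scaler = StandardScaler(mean=x_train[..., :output_dim].mean(),
                        std=x_train[..., :output_dim].std())
x_train[..., :output_dim] = scaler.transform(x_train[..., :output_dim])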
def get_data(self):
    """
    Trajectories are special: the atomic file stores individual points rather
    than whole trajectories, so the points must first be cut into trajectories.
    """
    if self.data is None:
        if self.config['cache_dataset'] and os.path.exists(
                self.cache_file_name):
            # load cache
            with open(self.cache_file_name, 'r') as f:
                self.data = json.load(f)
            loc_pad = self.data['loc_size'] - 1
            tim_pad = self.data['tim_size'] - 1
            self.pad_item = {
                'current_loc': loc_pad,
                'history_loc': loc_pad,
                'current_tim': tim_pad,
                'history_tim': tim_pad,
                'text': np.zeros(self.data['text_size'])
            }
        else:
            transformed_data = self.cutter_filter()
            # pad parameters: reserve one extra id at the end of the loc/tim
            # vocabularies to serve as the padding value
            loc_pad = transformed_data['loc_size']
            transformed_data['loc_size'] += 1
            tim_pad = transformed_data['tim_size']
            transformed_data['tim_size'] += 1
            self.pad_item = {
                'current_loc': loc_pad,
                'history_loc': loc_pad,
                'current_tim': tim_pad,
                'history_tim': tim_pad,
                'text': np.zeros(transformed_data['text_size'])
            }
            self.data = transformed_data
            if self.config['cache_dataset']:
                if not os.path.exists(self.cache_file_folder):
                    os.makedirs(self.cache_file_folder)
                with open(self.cache_file_name, 'w') as f:
                    json.dump(transformed_data, f)
    # After cutting the trajectories, build the batches.
    # Split into train/eval/test sets. Two strategies are possible: by user
    # or by number of trajectories.
    # TODO: make this a config parameter; for now, split by number of trajectories.
    train_data, eval_data, test_data = self.gen_input()
    return generate_dataloader(train_data, eval_data, test_data,
                               self.feature_name, self.config['batch_size'],
                               self.config['num_workers'], self.pad_item,
                               self.pad_max_len)
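# pad_item above maps each padded field to its padding value: one extra id is
# reserved at the end of the loc/tim vocabularies, which is why loc_pad equals
# loc_size - 1 after the increment. The sketch below shows how such a value
# could pad variable-length trajectory batches; pad_batch is illustrative, not
# the library's actual padding routine.
def pad_batch(batch, pad_value, max_len=None):
    """Right-pad each variable-length sequence in `batch` to a common length."""
    target = max_len or max(len(seq) for seq in batch)
    return [seq + [pad_value] * (target - len(seq)) for seq in batch]


# 'current_loc' sequences padded with loc_pad:
loc_pad = 100
print(pad_batch([[3, 7, 42], [5, 9]], pad_value=loc_pad))
# [[3, 7, 42], [5, 9, 100]]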
def get_data(self):
    """
    Load the data, normalize it, and return the DataLoaders for the
    training, evaluation, and test sets.

    :return: train_dataloader (pytorch.DataLoader)
             eval_dataloader (pytorch.DataLoader)
             test_dataloader (pytorch.DataLoader)
             all the dataloaders are composed of Batch (class)
    """
    # load the dataset
    x_train, y_train, x_val, y_val, x_test, y_test = [], [], [], [], [], []
    ext_x_train, ext_y_train, ext_x_test, ext_y_test, ext_x_val, ext_y_val = \
        [], [], [], [], [], []
    if self.data is None:
        self.data = {}
        if self.cache_dataset and os.path.exists(self.cache_file_name):
            x_train, y_train, x_val, y_val, x_test, y_test, \
                ext_x_train, ext_y_train, ext_x_test, ext_y_test, \
                ext_x_val, ext_y_val = self._load_cache_train_val_test()
        else:
            x_train, y_train, x_val, y_val, x_test, y_test, \
                ext_x_train, ext_y_train, ext_x_test, ext_y_test, \
                ext_x_val, ext_y_val = self._generate_train_val_test()
    # normalize the data
    self.feature_dim = x_train.shape[-1]
    self.scaler = self._get_scalar(x_train, y_train)
    x_train = self.scaler.transform(x_train)
    y_train = self.scaler.transform(y_train)
    x_val = self.scaler.transform(x_val)
    y_val = self.scaler.transform(y_val)
    x_test = self.scaler.transform(x_test)
    y_test = self.scaler.transform(y_test)
    if self.normal_external:
        ext_x_train = self.scaler.transform(ext_x_train)
        ext_y_train = self.scaler.transform(ext_y_train)
        ext_x_val = self.scaler.transform(ext_x_val)
        ext_y_val = self.scaler.transform(ext_y_val)
        ext_x_test = self.scaler.transform(ext_x_test)
        ext_y_test = self.scaler.transform(ext_y_test)
    # Zip X and y of the training set into a list of tuples; same for
    # the eval and test sets.
    # x_train/y_train: (num_samples, input_length, ..., feature_dim)
    # train_data (list): train_data[i] is a tuple of x_train[i], y_train[i],
    # ext_x_train[i] and ext_y_train[i]
    train_data = list(zip(x_train, y_train, ext_x_train, ext_y_train))
    eval_data = list(zip(x_val, y_val, ext_x_val, ext_y_val))
    test_data = list(zip(x_test, y_test, ext_x_test, ext_y_test))
    # convert to DataLoaders
    self.train_dataloader, self.eval_dataloader, self.test_dataloader = \
        generate_dataloader(train_data, eval_data, test_data,
                            self.feature_name, self.batch_size,
                            self.num_workers,
                            pad_with_last_sample=self.pad_with_last_sample)
    return self.train_dataloader, self.eval_dataloader, self.test_dataloader
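# Each training sample above is a 4-tuple (x, y, ext_x, ext_y). A DataLoader
# must eventually stack such tuples along a batch axis; the collate sketch
# below illustrates that step under the assumption of fixed-shape samples.
# collate_quadruples is illustrative, not the library's Batch class.
import numpy as np


def collate_quadruples(samples):
    """Stack a list of (x, y, ext_x, ext_y) samples along a new batch axis."""
    xs, ys, ext_xs, ext_ys = zip(*samples)
    return np.stack(xs), np.stack(ys), np.stack(ext_xs), np.stack(ext_ys)


samples = [(np.zeros((12, 2)), np.ones((12, 2)), np.zeros(4), np.ones(4))
           for _ in range(3)]
x, y, ext_x, ext_y = collate_quadruples(samples)
print(x.shape, ext_x.shape)  # (3, 12, 2) (3, 4)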
def get_data(self):
    """
    Return the dataset after trajectory cutting.

    Returns:
        tuple: train_dataloader, eval_dataloader, test_dataloader
    """
    if self.data is None:
        if self.config['cache_dataset_load'] and os.path.exists(
                self.encoder.cache_file_name):
            # load cache
            with open(self.encoder.cache_file_name, 'r') as f:
                self.data = json.load(f)
        else:
            dyna = pd.read_csv(
                os.path.join(self.data_path, '{}.dyna'.format(
                    self.config['dataset']))).values.T
            geo = pd.read_csv(
                os.path.join(self.data_path, '{}.geo'.format(
                    self.config['dataset']))).values.T
            self.geo = geo
            sequence, sequence_user, sequence_time, sequence_distance, \
                id_to_geo = self.build_sequence(dyna)
            top_n = self.pop_n(sequence, self.num_sample)
            train_set = (sequence, sequence_user, sequence_time,
                         sequence_distance)
            final_train_set, final_eval_set, final_test_set = \
                self.load_data(train_set, self.num_sample, top_n,
                               id_to_geo, True)
            self.data = {
                "train": final_train_set,
                "eval": final_eval_set,
                "test": final_test_set
            }
            # caching can be skipped for now
            if self.config['cache_dataset_save']:
                if not os.path.exists(self.cache_file_folder):
                    os.makedirs(self.cache_file_folder)
                with open(self.encoder.cache_file_name, 'w') as f:
                    json.dump(self.data, f)
    print("finish getting data!")
    # note: self.pad_item is not passed here
    return generate_dataloader(train_data=self.data["train"],
                               eval_data=self.data["eval"],
                               test_data=self.data["test"],
                               feature_name={
                                   'loc': 'float', 'tim': 'float',
                                   'dis': 'float', 'uid': 'float',
                                   'loc_neg': 'float', 'tim_neg': 'float',
                                   'dis_neg': 'float', 'target': 'int'
                               },
                               batch_size=self.config['batch_size'],
                               num_workers=self.config['num_workers'],
                               shuffle=False)
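# The feature_name dict above pairs each field of a sample with a dtype tag.
# generate_dataloader's internals are not shown here; the sketch below is only
# a guess at what such a map plausibly drives: converting each field to a
# tensor of the requested dtype. to_tensors is illustrative.
import torch


def to_tensors(sample, feature_name):
    """Convert a tuple-like sample field-by-field according to feature_name."""
    dtype_map = {'float': torch.float32, 'int': torch.int64}
    return {key: torch.tensor(value, dtype=dtype_map[kind])
            for (key, kind), value in zip(feature_name.items(), sample)}


sample = ([1.0, 2.0], [3.5], [0.7], [4.0], [9.0], [2.5], [0.1], 1)
feature_name = {'loc': 'float', 'tim': 'float', 'dis': 'float', 'uid': 'float',
                'loc_neg': 'float', 'tim_neg': 'float', 'dis_neg': 'float',
                'target': 'int'}
print(to_tensors(sample, feature_name)['target'])  # tensor(1)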