# Module-level imports required by the constructors below.
import sys
from os import path

import numpy as np

import data_utils  # companion Criteo preprocessing module


def __init__(
    self,
    dataset,
    max_ind_range,
    sub_sample_rate,
    randomize,
    split="train",
    raw_path="",
    pro_data="",
    memory_map=False,
):
    # dataset
    # tar_fea = 1   # single target
    den_fea = 13    # 13 dense features
    # spa_fea = 26  # 26 sparse features
    # tad_fea = tar_fea + den_fea
    # tot_fea = tad_fea + spa_fea
    if dataset == "kaggle":
        days = 7
        out_file = "kaggleAdDisplayChallenge_processed"
    elif dataset == "terabyte":
        days = 24
        out_file = "terabyte_processed"
    else:
        raise ValueError("Data set option is not supported")
    self.max_ind_range = max_ind_range
    self.memory_map = memory_map

    # split the datafile into path and filename
    lstr = raw_path.split("/")
    self.d_path = "/".join(lstr[0:-1]) + "/"
    self.d_file = lstr[-1].split(".")[0] if dataset == "kaggle" else lstr[-1]
    self.npzfile = self.d_path + (
        (self.d_file + "_day") if dataset == "kaggle" else self.d_file
    )
    self.trafile = self.d_path + (
        (self.d_file + "_fea") if dataset == "kaggle" else "fea"
    )

    # check if pre-processed data is available
    data_ready = True
    if memory_map:
        for i in range(days):
            reo_data = self.npzfile + "_{0}_reordered.npz".format(i)
            if not path.exists(str(reo_data)):
                data_ready = False
    else:
        if not path.exists(str(pro_data)):
            data_ready = False

    # pre-process data if needed
    # WARNING: when memory mapping is used we get a collection of files
    if data_ready:
        print("Reading pre-processed data=%s" % (str(pro_data)))
        file = str(pro_data)
    else:
        print("Reading raw data=%s" % (str(raw_path)))
        file = data_utils.getCriteoAdData(
            raw_path,
            out_file,
            max_ind_range,
            sub_sample_rate,
            days,
            split,
            randomize,
            dataset == "kaggle",
            memory_map,
        )

    # get the number of samples per day
    total_file = self.d_path + self.d_file + "_day_count.npz"
    with np.load(total_file) as data:
        total_per_file = data["total_per_file"]
    # compute offsets per file
    self.offset_per_file = np.array([0] + [x for x in total_per_file])
    for i in range(days):
        self.offset_per_file[i + 1] += self.offset_per_file[i]
    # print(self.offset_per_file)

    # setup data
    if memory_map:
        # setup the training/testing split
        self.split = split
        if split == 'none' or split == 'train':
            self.day = 0
            self.max_day_range = days if split == 'none' else days - 1
        elif split == 'test' or split == 'val':
            self.day = days - 1
            num_samples = (
                self.offset_per_file[days] - self.offset_per_file[days - 1]
            )
            self.test_size = int(np.ceil(num_samples / 2.))
            self.val_size = num_samples - self.test_size
        else:
            sys.exit("ERROR: dataset split must be none, train, val, or test.")

        '''
        # sanity checks on the label distribution per day (disabled)
        # text
        print("text")
        for i in range(days):
            fi = self.npzfile + "_{0}".format(i)
            with open(fi) as data:
                ttt = 0
                nnn = 0
                for _j, line in enumerate(data):
                    ttt += 1
                    if np.int32(line[0]) > 0:
                        nnn += 1
                print("day=" + str(i) + " total=" + str(ttt)
                      + " non-zeros=" + str(nnn)
                      + " ratio=" + str((nnn * 100.) / ttt) + "%")
        # processed
        print("processed")
        for i in range(days):
            fi = self.npzfile + "_{0}_processed.npz".format(i)
            with np.load(fi) as data:
                yyy = data["y"]
            ttt = len(yyy)
            nnn = np.count_nonzero(yyy)
            print("day=" + str(i) + " total=" + str(ttt)
                  + " non-zeros=" + str(nnn)
                  + " ratio=" + str((nnn * 100.) / ttt) + "%")
        # reordered
        print("reordered")
        for i in range(days):
            fi = self.npzfile + "_{0}_reordered.npz".format(i)
            with np.load(fi) as data:
                yyy = data["y"]
            ttt = len(yyy)
            nnn = np.count_nonzero(yyy)
            print("day=" + str(i) + " total=" + str(ttt)
                  + " non-zeros=" + str(nnn)
                  + " ratio=" + str((nnn * 100.) / ttt) + "%")
        '''

        # load unique counts
        with np.load(self.d_path + self.d_file + "_fea_count.npz") as data:
            self.counts = data["counts"]
        self.m_den = den_fea  # X_int.shape[1]
        self.n_emb = len(self.counts)
        print("Sparse features= %d, Dense features= %d" % (self.n_emb, self.m_den))

        # load the test data (only a single day is used for testing)
        if self.split == 'test' or self.split == 'val':
            fi = self.npzfile + "_{0}_reordered.npz".format(self.day)
            with np.load(fi) as data:
                self.X_int = data["X_int"]  # continuous features
                self.X_cat = data["X_cat"]  # categorical features
                self.y = data["y"]          # target
    else:
        # load and preprocess data
        with np.load(file) as data:
            X_int = data["X_int"]  # continuous features
            X_cat = data["X_cat"]  # categorical features
            y = data["y"]          # target
            self.counts = data["counts"]
        self.m_den = X_int.shape[1]  # den_fea
        self.n_emb = len(self.counts)
        print("Sparse fea = %d, Dense fea = %d" % (self.n_emb, self.m_den))

        # create reordering
        indices = np.arange(len(y))

        if split == "none":
            # randomize all data
            if randomize == "total":
                indices = np.random.permutation(indices)
                print("Randomized indices...")

            X_int[indices] = X_int
            X_cat[indices] = X_cat
            y[indices] = y
        else:
            indices = np.array_split(indices, self.offset_per_file[1:-1])

            # randomize train data (per day)
            if randomize == "day":  # or randomize == "total":
                for i in range(len(indices) - 1):
                    indices[i] = np.random.permutation(indices[i])
                print("Randomized indices per day ...")

            train_indices = np.concatenate(indices[:-1])
            test_indices = indices[-1]
            test_indices, val_indices = np.array_split(test_indices, 2)

            print("Defined %s indices..." % (split))

            # randomize train data (across days)
            if randomize == "total":
                train_indices = np.random.permutation(train_indices)
                print("Randomized indices across days ...")

            # create training, validation, and test sets
            if split == 'train':
                self.X_int = [X_int[i] for i in train_indices]
                self.X_cat = [X_cat[i] for i in train_indices]
                self.y = [y[i] for i in train_indices]
            elif split == 'val':
                self.X_int = [X_int[i] for i in val_indices]
                self.X_cat = [X_cat[i] for i in val_indices]
                self.y = [y[i] for i in val_indices]
            elif split == 'test':
                self.X_int = [X_int[i] for i in test_indices]
                self.X_cat = [X_cat[i] for i in test_indices]
                self.y = [y[i] for i in test_indices]

            print("Split data according to indices...")
def __init__(
    self,
    args,
    sub_sample_rate,
    split="train",
    raw_path="",
    pro_data="",
):
    # dataset
    # tar_fea = 1   # single target
    den_fea = args.den_feature_num  # e.g., 13 dense features
    partitions = 7
    out_file = args.processed_data_file  # --processed-data-file

    # split the datafile into path and filename
    lstr = raw_path.split("/")
    self.d_path = "/".join(lstr[0:-1]) + "/"
    self.d_file = lstr[-1].split(".")[0]
    self.npzfile = self.d_path + (self.d_file + "_split")
    self.trafile = self.d_path + (self.d_file + "_fea")

    # Added.
    self.transfer_map = None

    # check if pre-processed data is available
    data_ready = True
    if not path.exists(str(pro_data)):
        data_ready = False

    # pre-process data if needed
    if data_ready:
        print("Reading pre-processed data=%s" % (str(pro_data)))
        file = str(pro_data)
    else:
        print("Reading raw data=%s" % (str(raw_path)))
        file = data_utils.getCriteoAdData(
            args,
            raw_path,
            out_file,
            sub_sample_rate,
            partitions,
            split,
        )

    # get the number of samples per partition
    lstr = pro_data.split("/")
    pro_path = "/".join(lstr[0:-1]) + "/"
    pro_file = lstr[-1].split(".")[0]
    total_file = pro_path + pro_file + "_part_count.npz"
    with np.load(total_file) as data:
        total_per_file = data["total_per_file"]
    # compute offsets per file
    self.offset_per_file = np.array([0] + [x for x in total_per_file])
    for i in range(partitions):
        self.offset_per_file[i + 1] += self.offset_per_file[i]
    # print(self.offset_per_file)

    # setup data: load and preprocess
    with np.load(file) as data:
        X_int = data["X_int"]  # continuous features
        X_cat = data["X_cat"]  # categorical features
        y = data["y"]          # target
        self.counts = data["counts"]
    self.m_den = X_int.shape[1]  # den_fea
    self.m_fea = X_cat.shape[1]  # Added.
    self.n_emb = len(self.counts)
    print("Sparse fea = %d, Dense fea = %d" % (self.n_emb, self.m_den))

    # create reordering
    indices = np.arange(len(y))

    if split == "none":
        # randomize all data
        indices = np.random.permutation(indices)
        print("Randomized indices...")

        X_int[indices] = X_int
        X_cat[indices] = X_cat
        y[indices] = y
    else:
        indices = np.array_split(indices, self.offset_per_file[1:-1])

        train_indices = np.concatenate(indices[:-1])
        test_indices = indices[-1]
        test_indices, val_indices = np.array_split(test_indices, 2)

        print("Defined %s indices..." % (split))

        # randomize train data (across partitions)
        train_indices = np.random.permutation(train_indices)
        print("Randomized indices across partitions ...")

        # create training, validation, and test sets
        if split == 'train':
            self.X_int = [X_int[i] for i in train_indices]
            self.X_cat = [X_cat[i] for i in train_indices]
            self.y = [y[i] for i in train_indices]
        elif split == 'val':
            self.X_int = [X_int[i] for i in val_indices]
            self.X_cat = [X_cat[i] for i in val_indices]
            self.y = [y[i] for i in val_indices]
        elif split == 'test':
            self.X_int = [X_int[i] for i in test_indices]
            self.X_cat = [X_cat[i] for i in test_indices]
            self.y = [y[i] for i in test_indices]

        print("Split data according to indices...")
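# For reference: a minimal, self-contained sketch of the index bookkeeping
# shared by both constructors when split != "none", kept disabled in the
# style of the debug block above; the partition sizes here are hypothetical.
# The flat index range is split at the interior file offsets, all but the
# last partition become training data, and the held-out last partition is
# halved into test and validation indices.
'''
import numpy as np

offset_per_file = np.array([0, 4, 8, 12])  # three hypothetical partitions
indices = np.arange(offset_per_file[-1])   # one index per sample

parts = np.array_split(indices, offset_per_file[1:-1])
train_indices = np.concatenate(parts[:-1])                # partitions 0..n-2
test_indices, val_indices = np.array_split(parts[-1], 2)  # last partition

print(train_indices)  # -> [0 1 2 3 4 5 6 7]
print(test_indices)   # -> [8 9]
print(val_indices)    # -> [10 11]
'''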