def read_dataset(
    dataset,
    mini_batch_size,
    randomize,
    num_batches,
    split=True,
    raw_data="",
    processed_data="",
    inference_only=False,
):
    # load
    print("Loading %s dataset..." % dataset)
    nbatches = 0
    num_samples = num_batches * mini_batch_size
    X_cat, X_int, y, counts = data_utils.loadDataset(
        dataset, num_samples, raw_data, processed_data
    )

    # transform
    (
        X_cat_train,
        X_int_train,
        y_train,
        X_cat_val,
        X_int_val,
        y_val,
        X_cat_test,
        X_int_test,
        y_test,
    ) = data_utils.transformCriteoAdData(X_cat, X_int, y, split, randomize, False)
    ln_emb = counts
    m_den = X_int_train.shape[1]
    n_emb = len(counts)
    print("Sparse features = %d, Dense features = %d" % (n_emb, m_den))

    # adjust parameters
    if not inference_only:
        lX = []
        lS_lengths = []
        lS_indices = []
        lT = []
        train_nsamples = len(y_train)
        data_size = train_nsamples
        nbatches = int(np.floor((data_size * 1.0) / mini_batch_size))
        print("Training data")
        if num_batches != 0 and num_batches < nbatches:
            print(
                "Limiting to %d batches of the total %d batches"
                % (num_batches, nbatches)
            )
            nbatches = num_batches
        else:
            print("Total number of batches %d" % nbatches)

        # training data main loop
        for j in range(0, nbatches):
            # number of data points in a batch
            print("Reading in batch: %d / %d" % (j + 1, nbatches), end="\r")
            n = min(mini_batch_size, data_size - (j * mini_batch_size))
            # dense feature
            idx_start = j * mini_batch_size
            # WARNING: X_int_train is a PyTorch tensor
            lX.append(
                (X_int_train[idx_start : (idx_start + n)])
                .numpy()
                .astype(np.float32)
            )
            # training targets - outputs
            # WARNING: y_train is a PyTorch tensor
            lT.append(
                (y_train[idx_start : idx_start + n])
                .numpy()
                .reshape(-1, 1)
                .astype(np.int32)
            )
            # sparse features (sparse indices)
            lS_emb_indices = []
            # for each embedding generate a list of n lookups,
            # where each lookup is composed of multiple sparse indices
            for size in range(n_emb):
                lS_batch_indices = []
                for _b in range(n):
                    # number of sparse indices to be used per embedding;
                    # for Criteo Kaggle data it is 1 because the data is categorical
                    # store indices
                    lS_batch_indices += (
                        (X_cat_train[idx_start + _b][size].view(-1))
                        .numpy()
                        .astype(np.int32)
                    ).tolist()
                lS_emb_indices.append(lS_batch_indices)
            lS_indices.append(lS_emb_indices)
            # store lengths (each lookup consists of a single index)
            lS_lengths.append(
                [(list(np.ones(n).astype(np.int32))) for _ in range(n_emb)]
            )
        print("\n")

    # adjust parameters
    lX_test = []
    lS_lengths_test = []
    lS_indices_test = []
    lT_test = []
    test_nsamples = len(y_test)
    data_size = test_nsamples
    nbatches_test = int(np.floor((data_size * 1.0) / mini_batch_size))
    print("Testing data")
    if num_batches != 0 and num_batches < nbatches_test:
        print(
            "Limiting to %d batches of the total %d batches"
            % (num_batches, nbatches_test)
        )
        nbatches_test = num_batches
    else:
        print("Total number of batches %d" % nbatches_test)

    # testing data main loop
    for j in range(0, nbatches_test):
        # number of data points in a batch
        print("Reading in batch: %d / %d" % (j + 1, nbatches_test), end="\r")
        n = min(mini_batch_size, data_size - (j * mini_batch_size))
        # dense feature
        idx_start = j * mini_batch_size
        # WARNING: X_int_test is a PyTorch tensor
        lX_test.append(
            (X_int_test[idx_start : (idx_start + n)]).numpy().astype(np.float32)
        )
        # testing targets - outputs
        # WARNING: y_test is a PyTorch tensor
        lT_test.append(
            (y_test[idx_start : idx_start + n])
            .numpy()
            .reshape(-1, 1)
            .astype(np.int32)
        )
        # sparse features (sparse indices)
        lS_emb_indices = []
        # for each embedding generate a list of n lookups,
        # where each lookup is composed of multiple sparse indices
        for size in range(n_emb):
            lS_batch_indices = []
            for _b in range(n):
                # number of sparse indices to be used per embedding;
                # for Criteo Kaggle data it is 1 because the data is categorical
                # store indices
                lS_batch_indices += (
                    (X_cat_test[idx_start + _b][size].view(-1))
                    .numpy()
                    .astype(np.int32)
                ).tolist()
            lS_emb_indices.append(lS_batch_indices)
        lS_indices_test.append(lS_emb_indices)
        # store lengths (each lookup consists of a single index)
        lS_lengths_test.append(
            [(list(np.ones(n).astype(np.int32))) for _ in range(n_emb)]
        )

    if not inference_only:
        return (
            nbatches,
            lX,
            lS_lengths,
            lS_indices,
            lT,
            nbatches_test,
            lX_test,
            lS_lengths_test,
            lS_indices_test,
            lT_test,
            ln_emb,
            m_den,
        )
    else:
        return (
            nbatches_test,
            lX_test,
            lS_lengths_test,
            lS_indices_test,
            lT_test,
            None,
            None,
            None,
            None,
            None,
            ln_emb,
            m_den,
        )
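
# Illustrative usage sketch (not part of the original source). It exercises the
# read_dataset variant defined directly above; the Criteo Kaggle file paths and
# the randomize mode passed here are assumptions, so treat this as a sketch
# rather than a definitive driver.
def _example_usage_lengths_variant():
    (
        nbatches, lX, lS_lengths, lS_indices, lT,
        nbatches_test, lX_test, lS_lengths_test, lS_indices_test, lT_test,
        ln_emb, m_den,
    ) = read_dataset(
        dataset="kaggle",
        mini_batch_size=128,
        randomize="total",  # hypothetical randomization mode
        num_batches=0,  # see the limiting check inside read_dataset
        split=True,
        raw_data="./input/train.txt",  # hypothetical raw-data path
        processed_data="./input/kaggle_processed.npz",  # hypothetical npz path
    )
    # Per batch j: lX[j] is an (n, m_den) float32 dense array, lT[j] an (n, 1)
    # int32 label array, lS_indices[j][k] the flat index list for embedding
    # table k, and lS_lengths[j][k] the per-lookup lengths (all ones here).
    for j in range(nbatches):
        dense, labels, indices = lX[j], lT[j], lS_indices[j]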
def read_dataset(
    dataset,
    mini_batch_size,
    randomize,
    num_batches,
    split=True,
    raw_data="",
    processed_data="",
    inference_only=False,
):
    # load
    print("Loading %s dataset..." % dataset)
    nbatches = 0
    num_samples = num_batches * mini_batch_size
    X_cat, X_int, y, counts = data_utils.loadDataset(
        dataset, num_samples, raw_data, processed_data
    )

    # transform
    (
        X_cat_train,
        X_int_train,
        y_train,
        X_cat_val,
        X_int_val,
        y_val,
        X_cat_test,
        X_int_test,
        y_test,
    ) = data_utils.transformCriteoAdData(X_cat, X_int, y, split, randomize, False)
    ln_emb = counts
    m_den = X_int_train.shape[1]
    n_emb = len(counts)
    print("Sparse features = %d, Dense features = %d" % (n_emb, m_den))

    # remap embedding indices
    X_cat_train_remap, X_cat_val_remap, X_cat_test_remap = embedding_index_remap(
        X_cat_train.numpy(),
        [X_cat_train.numpy(), X_cat_val.numpy(), X_cat_test.numpy()],
    )

    # adjust parameters
    if not inference_only:
        lX = []
        lS_offsets = []
        lS_indices = []
        lT = []
        train_nsamples = len(y_train)
        data_size = train_nsamples
        nbatches = int(np.floor((data_size * 1.0) / mini_batch_size))
        print("Training data")
        if num_batches != 0 and num_batches < nbatches:
            print(
                "Limiting to %d batches of the total %d batches"
                % (num_batches, nbatches)
            )
            nbatches = num_batches
        else:
            print("Total number of batches %d" % nbatches)

        lX, lT, lS_indices, lS_offsets = prepare_batches(
            nbatches,
            mini_batch_size,
            data_size,
            X_int_train.numpy(),
            y_train.numpy(),
            n_emb,
            X_cat_train.numpy(),
        )

    # adjust parameters
    lX_test = []
    lS_offsets_test = []
    lS_indices_test = []
    lT_test = []
    test_nsamples = len(y_test)
    data_size = test_nsamples
    nbatches_test = int(np.floor((data_size * 1.0) / mini_batch_size))
    print("Testing data")
    if num_batches != 0 and num_batches < nbatches_test:
        print(
            "Limiting to %d batches of the total %d batches"
            % (num_batches, nbatches_test)
        )
        nbatches_test = num_batches
    else:
        print("Total number of batches %d" % nbatches_test)

    lX_test, lT_test, lS_indices_test, lS_offsets_test = prepare_batches(
        nbatches_test,
        mini_batch_size,
        data_size,
        X_int_test.numpy(),
        y_test.numpy(),
        n_emb,
        X_cat_test.numpy(),
    )

    if not inference_only:
        return (
            nbatches,
            lX,
            lS_offsets,
            lS_indices,
            lT,
            nbatches_test,
            lX_test,
            lS_offsets_test,
            lS_indices_test,
            lT_test,
            ln_emb,
            m_den,
        )
    else:
        return (
            nbatches_test,
            lX_test,
            lS_offsets_test,
            lS_indices_test,
            lT_test,
            None,
            None,
            None,
            None,
            None,
            ln_emb,
            m_den,
        )
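
# The variant above delegates the per-batch work to prepare_batches(), which is
# not shown in this section. The sketch below is an assumed implementation,
# reconstructed from the inline loops of the other variants (a single index per
# categorical lookup, offsets instead of lengths); the real helper may differ.
def prepare_batches_sketch(nbatches, mini_batch_size, data_size, X_int, y, n_emb, X_cat):
    lX, lT, lS_indices, lS_offsets = [], [], [], []
    for j in range(nbatches):
        n = min(mini_batch_size, data_size - (j * mini_batch_size))
        idx_start = j * mini_batch_size
        # dense features and targets (inputs here are numpy arrays)
        lX.append(X_int[idx_start : idx_start + n].astype(np.float32))
        lT.append(y[idx_start : idx_start + n].reshape(-1, 1).astype(np.int32))
        # one index per lookup, so offsets are simply 0..n-1 for every table
        lS_emb_indices = []
        lS_emb_offsets = []
        for k in range(n_emb):
            lS_emb_indices.append(
                X_cat[idx_start : idx_start + n, k].astype(np.int64).tolist()
            )
            lS_emb_offsets.append(list(range(n)))
        lS_indices.append(lS_emb_indices)
        lS_offsets.append(lS_emb_offsets)
    return lX, lT, lS_indices, lS_offsets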
def read_dataset(
    dataset,
    mini_batch_size,
    randomize,
    num_batches,
    split=True,
    raw_data="",
    processed_data="",
):
    # load
    print("Loading %s dataset..." % dataset)
    nbatches = 0
    num_samples = num_batches * mini_batch_size
    X_cat, X_int, y, counts = data_utils.loadDataset(
        dataset, num_samples, raw_data, processed_data
    )

    # transform
    (
        X_cat_train,
        X_int_train,
        y_train,
        X_cat_val,
        X_int_val,
        y_val,
        X_cat_test,
        X_int_test,
        y_test,
    ) = data_utils.transformCriteoAdData(X_cat, X_int, y, split, randomize, False)
    ln_emb = counts
    m_den = X_int_train.shape[1]
    n_emb = len(counts)
    print("Sparse features = %d, Dense features = %d" % (n_emb, m_den))

    # adjust parameters
    lX = []
    lS = []
    lS_offsets = []
    lS_indices = []
    lT = []
    train_nsamples = len(y_train)
    data_size = train_nsamples
    nbatches = int(np.floor((data_size * 1.0) / mini_batch_size))
    print("Training data")
    if num_batches != 0 and num_batches < nbatches:
        print(
            "Limiting to %d batches of the total %d batches"
            % (num_batches, nbatches)
        )
        nbatches = num_batches
    else:
        print("Total number of batches %d" % nbatches)

    # training data main loop
    for j in range(0, nbatches):
        # number of data points in a batch
        print("Reading in train batch: %d / %d" % (j + 1, nbatches), end="\r")
        n = min(mini_batch_size, data_size - (j * mini_batch_size))
        # dense feature
        idx_start = j * mini_batch_size
        # WARNING: X_int_train is a PyTorch tensor
        Xt = X_int_train[idx_start : (idx_start + n)]
        Xt = Xt.numpy().astype(np.float32)
        lX.append(torch.tensor(Xt))
        # training targets - outputs
        # WARNING: y_train is a PyTorch tensor
        P = y_train[idx_start : idx_start + n]
        P = P.numpy().reshape(-1, 1).astype(np.float32)
        lT.append(torch.tensor(P))
        # sparse features (sparse indices)
        lS_emb = []
        lS_emb_offsets = []
        lS_emb_indices = []
        # for each embedding generate a list of n lookups,
        # where each lookup is composed of multiple sparse indices
        for size in range(n_emb):
            lS_batch = []
            lS_batch_offsets = []
            lS_batch_indices = []
            offset = 0
            for _b in range(n):
                # number of sparse indices to be used per embedding;
                # for Criteo Kaggle data it is 1 because the data is categorical
                sparse_group_size = np.int64(1)
                # WARNING: X_cat_train is a PyTorch tensor
                sparse_group = X_cat_train[idx_start + _b][size].view(-1)
                sparse_group = sparse_group.numpy().astype(np.int64)
                # store lengths and indices
                lS_batch.append(sparse_group.tolist())
                lS_batch_offsets += [offset]
                lS_batch_indices += sparse_group.tolist()
                # update offset for next iteration
                offset += sparse_group_size
            lS_emb.append(lS_batch)
            lS_emb_offsets.append(torch.tensor(lS_batch_offsets))
            lS_emb_indices.append(torch.tensor(lS_batch_indices))
        lS.append(lS_emb)
        lS_offsets.append(lS_emb_offsets)
        lS_indices.append(lS_emb_indices)

    # adjust parameters
    print("\n")
    lX_test = []
    lS_test = []
    lS_offsets_test = []
    lS_indices_test = []
    lT_test = []
    test_nsamples = len(y_test)
    data_size = test_nsamples
    nbatches_test = int(np.floor((data_size * 1.0) / mini_batch_size))
    print("Testing data")
    if num_batches != 0 and num_batches < nbatches_test:
        print(
            "Limiting to %d batches of the total %d batches"
            % (num_batches, nbatches_test)
        )
        nbatches_test = num_batches
    else:
        print("Total number of batches %d" % nbatches_test)

    # testing data main loop
    for j in range(0, nbatches_test):
        # number of data points in a batch
        print("Reading in test batch: %d / %d" % (j + 1, nbatches_test), end="\r")
        n = min(mini_batch_size, data_size - (j * mini_batch_size))
        # dense feature
        idx_start = j * mini_batch_size
        # WARNING: X_int_test is a PyTorch tensor
        Xt = X_int_test[idx_start : (idx_start + n)]
        Xt = Xt.numpy().astype(np.float32)
        lX_test.append(torch.tensor(Xt))
        # testing targets - outputs
        # WARNING: y_test is a PyTorch tensor
        P = y_test[idx_start : idx_start + n]
        P = P.numpy().reshape(-1, 1).astype(np.float32)
        lT_test.append(torch.tensor(P))
        # sparse features (sparse indices)
        lS_emb = []
        lS_emb_offsets = []
        lS_emb_indices = []
        # for each embedding generate a list of n lookups,
        # where each lookup is composed of multiple sparse indices
        for size in range(n_emb):
            lS_batch = []
            lS_batch_offsets = []
            lS_batch_indices = []
            offset = 0
            for _b in range(n):
                # number of sparse indices to be used per embedding;
                # for Criteo data it is 1 because the data is categorical
                sparse_group_size = np.int64(1)
                # WARNING: X_cat_test is a PyTorch tensor
                sparse_group = X_cat_test[idx_start + _b][size].view(-1)
                sparse_group = sparse_group.numpy().astype(np.int64)
                # store lengths and indices
                lS_batch.append(sparse_group.tolist())
                lS_batch_offsets += [offset]
                lS_batch_indices += sparse_group.tolist()
                # update offset for next iteration
                offset += sparse_group_size
            lS_emb.append(lS_batch)
            lS_emb_offsets.append(torch.tensor(lS_batch_offsets))
            lS_emb_indices.append(torch.tensor(lS_batch_indices))
        lS_test.append(lS_emb)
        lS_offsets_test.append(lS_emb_offsets)
        lS_indices_test.append(lS_emb_indices)

    return (
        nbatches,
        lX,
        lS,
        lS_offsets,
        lS_indices,
        lT,
        nbatches_test,
        lX_test,
        lS_test,
        lS_offsets_test,
        lS_indices_test,
        lT_test,
        ln_emb,
        m_den,
    )
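
# Illustrative sketch (not part of the original source): the per-table
# offsets/indices tensors built in the variant above are laid out the way
# torch.nn.EmbeddingBag expects, so one table of one batch can be looked up as
# shown here. The table size taken from ln_emb and the embedding dimension are
# assumptions for the example.
def _example_embedding_bag_lookup(lS_offsets, lS_indices, ln_emb, j=0, k=0, dim=16):
    # one EmbeddingBag per sparse feature; mode="sum" pools each lookup group
    emb = torch.nn.EmbeddingBag(int(ln_emb[k]), dim, mode="sum")
    indices = lS_indices[j][k]  # 1-D int64 tensor of all indices in batch j, table k
    offsets = lS_offsets[j][k]  # 1-D int64 tensor marking where each lookup starts
    return emb(indices, offsets)  # (n, dim) pooled embeddings for batch j, table k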
def read_dataset(
    dataset,
    max_ind_range,
    sub_sample_rate,
    mini_batch_size,
    num_batches,
    randomize,
    split="train",
    raw_data="",
    processed_data="",
    memory_map=False,
    inference_only=False,
    test_mini_batch_size=1,
):
    # split the datafile into path and filename
    lstr = raw_data.split("/")
    d_path = "/".join(lstr[0:-1]) + "/"
    d_file = lstr[-1].split(".")[0] if dataset == "kaggle" else lstr[-1]
    # npzfile = d_path + ((d_file + "_day") if dataset == "kaggle" else d_file)
    # trafile = d_path + ((d_file + "_fea") if dataset == "kaggle" else "fea")

    # load
    print("Loading %s dataset..." % dataset)
    nbatches = 0
    file, days = data_utils.loadDataset(
        dataset,
        max_ind_range,
        sub_sample_rate,
        randomize,
        split,
        raw_data,
        processed_data,
        memory_map,
    )

    if memory_map:
        # WARNING: at this point the data has been reordered and shuffled across files
        # e.g. day_<number>_reordered.npz, what remains is simply to read and feed
        # the data from each file, going in the order of days file-by-file, to the
        # model during training.
        train_data = CriteoDatasetWMemoryMap(
            dataset,
            max_ind_range,
            sub_sample_rate,
            randomize,
            "train",
            raw_data,
            processed_data,
        )

        test_data = CriteoDatasetWMemoryMap(
            dataset,
            max_ind_range,
            sub_sample_rate,
            randomize,
            "test",
            raw_data,
            processed_data,
        )

        train_loader = torch.utils.data.DataLoader(
            train_data,
            batch_size=mini_batch_size,
            shuffle=False,
            num_workers=0,
            collate_fn=collate_wrapper_criteo,
            pin_memory=False,
            drop_last=False,  # True
        )

        test_loader = torch.utils.data.DataLoader(
            test_data,
            batch_size=test_mini_batch_size,
            shuffle=False,
            num_workers=0,
            collate_fn=collate_wrapper_criteo,
            pin_memory=False,
            drop_last=False,  # True
        )

        return train_data, train_loader, test_data, test_loader

    else:
        # load and preprocess data
        with np.load(file) as data:
            X_int = data["X_int"]
            X_cat = data["X_cat"]
            y = data["y"]
            counts = data["counts"]

        # get a number of samples per day
        total_file = d_path + d_file + "_day_count.npz"
        with np.load(total_file) as data:
            total_per_file = data["total_per_file"]

        # transform
        (
            X_cat_train,
            X_int_train,
            y_train,
            X_cat_val,
            X_int_val,
            y_val,
            X_cat_test,
            X_int_test,
            y_test,
        ) = data_utils.transformCriteoAdData(
            X_cat, X_int, y, days, split, randomize, total_per_file
        )
        ln_emb = counts
        m_den = X_int_train.shape[1]
        n_emb = len(counts)
        print("Sparse features = %d, Dense features = %d" % (n_emb, m_den))

        # adjust parameters
        def assemble_samples(X_cat, X_int, y, max_ind_range, print_message):
            if max_ind_range > 0:
                X_cat = X_cat % max_ind_range

            nsamples = len(y)
            data_size = nsamples
            # using floor is equivalent to dropping the last mini-batch (drop_last = True)
            nbatches = int(np.floor((data_size * 1.0) / mini_batch_size))
            print(print_message)
            if num_batches != 0 and num_batches < nbatches:
                print(
                    "Limiting to %d batches of the total %d batches"
                    % (num_batches, nbatches)
                )
                nbatches = num_batches
            else:
                print("Total number of batches %d" % nbatches)

            # data main loop
            lX = []
            lS_lengths = []
            lS_indices = []
            lT = []
            for j in range(0, nbatches):
                # number of data points in a batch
                print("Reading in batch: %d / %d" % (j + 1, nbatches), end="\r")
                n = min(mini_batch_size, data_size - (j * mini_batch_size))
                # dense feature
                idx_start = j * mini_batch_size
                lX.append((X_int[idx_start : (idx_start + n)]).astype(np.float32))
                # targets - outputs
                lT.append(
                    (y[idx_start : idx_start + n]).reshape(-1, 1).astype(np.int32)
                )
                # sparse features (sparse indices)
                lS_emb_indices = []
                # for each embedding generate a list of n lookups,
                # where each lookup is composed of multiple sparse indices
                for size in range(n_emb):
                    lS_batch_indices = []
                    for _b in range(n):
                        # number of sparse indices to be used per embedding;
                        # for Criteo Kaggle data it is 1 because the data is categorical
                        # store indices
                        lS_batch_indices += (
                            (X_cat[idx_start + _b][size].reshape(-1)).astype(np.int32)
                        ).tolist()
                    lS_emb_indices.append(lS_batch_indices)
                lS_indices.append(lS_emb_indices)
                # store lengths (each lookup consists of a single index)
                lS_lengths.append(
                    [(list(np.ones(n).astype(np.int32))) for _ in range(n_emb)]
                )
            print("\n")

            return nbatches, lX, lS_lengths, lS_indices, lT

        # adjust training data
        (nbatches, lX, lS_lengths, lS_indices, lT) = assemble_samples(
            X_cat_train, X_int_train, y_train, max_ind_range, "Training data"
        )

        # adjust testing data
        (nbatches_t, lX_t, lS_lengths_t, lS_indices_t, lT_t) = assemble_samples(
            X_cat_test, X_int_test, y_test, max_ind_range, "Testing data"
        )
    # end if memory_map

    return (
        nbatches,
        lX,
        lS_lengths,
        lS_indices,
        lT,
        nbatches_t,
        lX_t,
        lS_lengths_t,
        lS_indices_t,
        lT_t,
        ln_emb,
        m_den,
    )
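
# Illustrative usage sketch (not part of the original source) for the
# memory-mapped branch of the variant above. What each DataLoader batch
# contains is determined by collate_wrapper_criteo, which is defined elsewhere
# in this module; the unpacking below assumes a (dense, offsets, indices,
# targets) layout and should be adjusted if that helper differs. The dataset
# name, paths, and numeric arguments are hypothetical.
def _example_memory_map_usage():
    train_data, train_loader, test_data, test_loader = read_dataset(
        dataset="terabyte",
        max_ind_range=10000000,
        sub_sample_rate=0.0,
        mini_batch_size=2048,
        num_batches=0,
        randomize="total",  # hypothetical randomization mode
        split="train",
        raw_data="./input/day",  # hypothetical raw-data prefix
        processed_data="./input/terabyte_processed.npz",  # hypothetical npz path
        memory_map=True,
        test_mini_batch_size=16384,
    )
    for batch in train_loader:
        X, lS_o, lS_i, T = batch  # assumed collate_wrapper_criteo output
        break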