Example #1
def read_dataset(
    dataset,
    mini_batch_size,
    randomize,
    num_batches,
    split=True,
    raw_data="",
    processed_data="",
    inference_only=False,
):
    # load
    print("Loading %s dataset..." % dataset)
    nbatches = 0
    num_samples = num_batches * mini_batch_size
    X_cat, X_int, y, counts = data_utils.loadDataset(dataset, num_samples,
                                                     raw_data, processed_data)

    # transform
    (
        X_cat_train,
        X_int_train,
        y_train,
        X_cat_val,
        X_int_val,
        y_val,
        X_cat_test,
        X_int_test,
        y_test,
    ) = data_utils.transformCriteoAdData(X_cat, X_int, y, split, randomize,
                                         False)
    ln_emb = counts
    m_den = X_int_train.shape[1]
    n_emb = len(counts)
    print("Sparse features = %d, Dense features = %d" % (n_emb, m_den))

    # adjust parameters
    if not inference_only:
        lX = []
        lS_lengths = []
        lS_indices = []
        lT = []
        train_nsamples = len(y_train)
        data_size = train_nsamples
        nbatches = int(np.floor((data_size * 1.0) / mini_batch_size))
        print("Training data")
        if num_batches != 0 and num_batches < nbatches:
            print("Limiting to %d batches of the total % d batches" %
                  (num_batches, nbatches))
            nbatches = num_batches
        else:
            print("Total number of batches %d" % nbatches)

        # training data main loop
        for j in range(0, nbatches):
            # number of data points in a batch
            print("Reading in batch: %d / %d" % (j + 1, nbatches), end="\r")
            n = min(mini_batch_size, data_size - (j * mini_batch_size))
            # dense feature
            idx_start = j * mini_batch_size
            # WARNING: X_int_train is a PyTorch tensor
            lX.append((X_int_train[idx_start:(idx_start + n)]).numpy().astype(
                np.float32))
            # Training targets - outputs
            # WARNING: y_train is a PyTorch tensor
            lT.append((y_train[idx_start:idx_start + n]).numpy().reshape(
                -1, 1).astype(np.int32))
            # sparse feature (sparse indices)
            lS_emb_indices = []
            # for each embedding generate a list of n lookups,
            # where each lookup is composed of multiple sparse indices
            for size in range(n_emb):
                lS_batch_indices = []
                for _b in range(n):
                    # num of sparse indices used per embedding lookup; for the
                    # Criteo Kaggle data it is 1 because each feature is categorical
                    lS_batch_indices += ((
                        X_cat_train[idx_start +
                                    _b][size].view(-1)).numpy().astype(
                                        np.int32)).tolist()
                lS_emb_indices.append(lS_batch_indices)
            lS_indices.append(lS_emb_indices)
            # lengths are all 1: one sparse index per lookup per embedding
            lS_lengths.append([(list(np.ones(n).astype(np.int32)))
                               for _ in range(n_emb)])
        print("\n")

    # adjust parameters
    lX_test = []
    lS_lengths_test = []
    lS_indices_test = []
    lT_test = []
    test_nsamples = len(y_test)
    data_size = test_nsamples
    nbatches_test = int(np.floor((data_size * 1.0) / mini_batch_size))
    print("Testing data")
    if num_batches != 0 and num_batches < nbatches_test:
        print("Limiting to %d batches of the total % d batches" %
              (num_batches, nbatches_test))
        nbatches_test = num_batches
    else:
        print("Total number of batches %d" % nbatches_test)

    # testing data main loop
    for j in range(0, nbatches_test):
        # number of data points in a batch
        print("Reading in batch: %d / %d" % (j + 1, nbatches_test), end="\r")
        n = min(mini_batch_size, data_size - (j * mini_batch_size))
        # dense feature
        idx_start = j * mini_batch_size
        # WARNING: X_int_test is a PyTorch tensor
        lX_test.append(
            (X_int_test[idx_start:(idx_start + n)]).numpy().astype(np.float32))
        # Testing targets - outputs
        # WARNING: y_test is a PyTorch tensor
        lT_test.append((y_test[idx_start:idx_start + n]).numpy().reshape(
            -1, 1).astype(np.int32))
        # sparse feature (sparse indices)
        lS_emb_indices = []
        # for each embedding generate a list of n lookups,
        # where each lookup is composed of multiple sparse indices
        for size in range(n_emb):
            lS_batch_indices = []
            for _b in range(n):
                # num of sparse indices used per embedding lookup; for the
                # Criteo Kaggle data it is 1 because each feature is categorical
                lS_batch_indices += ((
                    X_cat_test[idx_start + _b][size].view(-1)).numpy().astype(
                        np.int32)).tolist()
            lS_emb_indices.append(lS_batch_indices)
        lS_indices_test.append(lS_emb_indices)
        # lengths are all 1: one sparse index per lookup per embedding
        lS_lengths_test.append([(list(np.ones(n).astype(np.int32)))
                                for _ in range(n_emb)])

    if not inference_only:
        return (
            nbatches,
            lX,
            lS_lengths,
            lS_indices,
            lT,
            nbatches_test,
            lX_test,
            lS_lengths_test,
            lS_indices_test,
            lT_test,
            ln_emb,
            m_den,
        )
    else:
        return (
            nbatches_test,
            lX_test,
            lS_lengths_test,
            lS_indices_test,
            lT_test,
            None,
            None,
            None,
            None,
            None,
            ln_emb,
            m_den,
        )
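
To make the layout this variant returns concrete, here is a minimal sketch (not part of the original source) that builds the same lX / lS_lengths / lS_indices / lT structures from small synthetic tensors in place of data_utils.loadDataset; the sizes (mini_batch_size, n_emb, m_den) are illustrative assumptions.

import numpy as np
import torch

# Synthetic stand-ins for the tensors returned by data_utils.loadDataset /
# transformCriteoAdData (assumption: [num_samples, num_features] layout).
mini_batch_size, n_emb, m_den = 4, 2, 3
X_int_train = torch.rand(8, m_den)                 # dense features
X_cat_train = torch.randint(0, 10, (8, n_emb))     # one index per embedding
y_train = torch.randint(0, 2, (8,))

lX, lS_lengths, lS_indices, lT = [], [], [], []
nbatches = len(y_train) // mini_batch_size
for j in range(nbatches):
    idx_start = j * mini_batch_size
    n = min(mini_batch_size, len(y_train) - idx_start)
    lX.append(X_int_train[idx_start:idx_start + n].numpy().astype(np.float32))
    lT.append(y_train[idx_start:idx_start + n].numpy().reshape(-1, 1).astype(np.int32))
    # per embedding: a flat list of n indices (one lookup per sample)
    lS_indices.append([
        X_cat_train[idx_start:idx_start + n, k].numpy().astype(np.int32).tolist()
        for k in range(n_emb)
    ])
    lS_lengths.append([list(np.ones(n).astype(np.int32)) for _ in range(n_emb)])

print(lX[0].shape, lT[0].shape)                    # (4, 3) (4, 1)
print(len(lS_indices[0]), len(lS_indices[0][0]))   # 2 tables, 4 indices each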
Example #2
def read_dataset(
    dataset,
    mini_batch_size,
    randomize,
    num_batches,
    split=True,
    raw_data="",
    processed_data="",
    inference_only=False,
):
    # load
    print("Loading %s dataset..." % dataset)
    nbatches = 0
    num_samples = num_batches * mini_batch_size
    X_cat, X_int, y, counts = data_utils.loadDataset(
        dataset, num_samples, raw_data, processed_data
    )

    # transform
    (
        X_cat_train,
        X_int_train,
        y_train,
        X_cat_val,
        X_int_val,
        y_val,
        X_cat_test,
        X_int_test,
        y_test,
    ) = data_utils.transformCriteoAdData(X_cat, X_int, y, split, randomize, False)
    ln_emb = counts
    m_den = X_int_train.shape[1]
    n_emb = len(counts)
    print("Sparse features = %d, Dense features = %d" % (n_emb, m_den))

    # Remap embedding indices.
    X_cat_train_remap, X_cat_val_remap, X_cat_test_remap = embedding_index_remap(
        X_cat_train.numpy(),
        [X_cat_train.numpy(), X_cat_val.numpy(), X_cat_test.numpy()]
    )

    # adjust parameters
    if not inference_only:
        lX = []
        lS_offsets = []
        lS_indices = []
        lT = []
        train_nsamples = len(y_train)
        data_size = train_nsamples
        nbatches = int(np.floor((data_size * 1.0) / mini_batch_size))
        print("Training data")
        if num_batches != 0 and num_batches < nbatches:
            print(
                "Limiting to %d batches of the total % d batches"
                % (num_batches, nbatches)
            )
            nbatches = num_batches
        else:
            print("Total number of batches %d" % nbatches)

        # use the remapped categorical indices computed above
        lX, lT, lS_indices, lS_offsets = prepare_batches(
            nbatches, mini_batch_size, data_size, X_int_train.numpy(),
            y_train.numpy(), n_emb, X_cat_train_remap)

    # adjust parameters
    lX_test = []
    lS_offsets_test = []
    lS_indices_test = []
    lT_test = []
    test_nsamples = len(y_test)
    data_size = test_nsamples
    nbatches_test = int(np.floor((data_size * 1.0) / mini_batch_size))
    print("Testing data")
    if num_batches != 0 and num_batches < nbatches_test:
        print(
            "Limiting to %d batches of the total % d batches"
            % (num_batches, nbatches_test)
        )
        nbatches_test = num_batches
    else:
        print("Total number of batches %d" % nbatches_test)

    # use the remapped categorical indices computed above
    lX_test, lT_test, lS_indices_test, lS_offsets_test = prepare_batches(
        nbatches_test, mini_batch_size, data_size, X_int_test.numpy(),
        y_test.numpy(), n_emb, X_cat_test_remap)

    if not inference_only:
        return (
            nbatches,
            lX,
            lS_offsets,
            lS_indices,
            lT,
            nbatches_test,
            lX_test,
            lS_offsets_test,
            lS_indices_test,
            lT_test,
            ln_emb,
            m_den,
        )
    else:
        return (
            nbatches_test,
            lX_test,
            lS_offsets_test,
            lS_indices_test,
            lT_test,
            None,
            None,
            None,
            None,
            None,
            ln_emb,
            m_den,
        )
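
This variant relies on two helpers that are not shown here, embedding_index_remap and prepare_batches. The sketch below is a hypothetical reconstruction of prepare_batches only, assuming it packs each mini-batch the same way the explicit loops in the other variants do (dense features, targets, one sparse index per embedding per sample, and per-embedding offsets); the real helper may differ.

import numpy as np

def prepare_batches(nbatches, mini_batch_size, data_size, X_int, y, n_emb, X_cat):
    """Hypothetical sketch: pack numpy arrays into per-batch lists.

    Assumes X_int is [data_size, m_den], X_cat is [data_size, n_emb] with one
    categorical index per embedding table, and y is [data_size]."""
    lX, lT, lS_indices, lS_offsets = [], [], [], []
    for j in range(nbatches):
        idx_start = j * mini_batch_size
        n = min(mini_batch_size, data_size - idx_start)
        lX.append(X_int[idx_start:idx_start + n].astype(np.float32))
        lT.append(y[idx_start:idx_start + n].reshape(-1, 1).astype(np.int32))
        # one lookup per sample per embedding -> offsets are 0, 1, ..., n-1
        lS_indices.append([
            X_cat[idx_start:idx_start + n, k].astype(np.int32).tolist()
            for k in range(n_emb)
        ])
        lS_offsets.append([list(range(n)) for _ in range(n_emb)])
    return lX, lT, lS_indices, lS_offsets

The return order matches the unpacking at the call sites above (lX, lT, lS_indices, lS_offsets); embedding_index_remap is not reconstructed here, only its use as a drop-in replacement for the raw categorical arrays is assumed.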
Example #3
def read_dataset(
    dataset,
    mini_batch_size,
    randomize,
    num_batches,
    split=True,
    raw_data="",
    processed_data="",
):
    # load
    print("Loading %s dataset..." % dataset)
    nbatches = 0
    num_samples = num_batches * mini_batch_size
    X_cat, X_int, y, counts = data_utils.loadDataset(
        dataset, num_samples, raw_data, processed_data
    )

    # transform
    (X_cat_train, X_int_train, y_train,
     X_cat_val, X_int_val, y_val,
     X_cat_test, X_int_test, y_test) = data_utils.transformCriteoAdData(
        X_cat, X_int, y, split, randomize, False
    )
    ln_emb = counts
    m_den = X_int_train.shape[1]
    n_emb = len(counts)
    print("Sparse features = %d, Dense features = %d" % (n_emb, m_den))

    # adjust parameters
    lX = []
    lS = []
    lS_offsets = []
    lS_indices = []
    lT = []
    train_nsamples = len(y_train)
    data_size = train_nsamples
    nbatches = int(np.floor((data_size * 1.0) / mini_batch_size))
    print("Training data")
    if num_batches != 0 and num_batches < nbatches:
        print(
            "Limiting to %d batches of the total % d batches" % (num_batches, nbatches)
        )
        nbatches = num_batches
    else:
        print("Total number of batches %d" % nbatches)

    # training data main loop
    for j in range(0, nbatches):
        # number of data points in a batch
        print("Reading in train batch: %d / %d" % (j + 1, nbatches), end="\r")
        n = min(mini_batch_size, data_size - (j * mini_batch_size))
        # dense feature
        idx_start = j * mini_batch_size
        # WARNING: X_int_train is a PyTorch tensor
        Xt = X_int_train[idx_start : (idx_start + n)]
        Xt = Xt.numpy().astype(np.float32)
        lX.append(torch.tensor(Xt))
        # Training targets - outputs
        # WARNING: y_train is a PyTorch tensor
        P = y_train[idx_start : idx_start + n]
        P = P.numpy().reshape(-1, 1).astype(np.float32)
        lT.append(torch.tensor(P))
        # sparse feature (sparse indices)
        lS_emb = []
        lS_emb_offsets = []
        lS_emb_indices = []
        # for each embedding generate a list of n lookups,
        # where each lookup is composed of multiple sparse indices
        for size in range(n_emb):
            lS_batch = []
            lS_batch_offsets = []
            lS_batch_indices = []
            offset = 0
            for _b in range(n):
                # num of sparse indices to be used per embedding, e.g. for
                # Criteo Kaggle data it is 1 because data is categorical
                sparse_group_size = np.int64(1)
                # WARNING: X_cat_train is a PyTorch tensor
                sparse_group = X_cat_train[idx_start + _b][size].view(-1)
                sparse_group = sparse_group.numpy().astype(np.int64)
                # store lengths and indices
                lS_batch.append(sparse_group.tolist())
                lS_batch_offsets += [offset]
                lS_batch_indices += sparse_group.tolist()
                # update offset for next iteration
                offset += sparse_group_size
            lS_emb.append(lS_batch)
            lS_emb_offsets.append(torch.tensor(lS_batch_offsets))
            lS_emb_indices.append(torch.tensor(lS_batch_indices))
        lS.append(lS_emb)
        lS_offsets.append(lS_emb_offsets)
        lS_indices.append(lS_emb_indices)

    # adjust parameters
    print("\n")
    lX_test = []
    lS_test = []
    lS_offsets_test = []
    lS_indices_test = []
    lT_test = []
    test_nsamples = len(y_test)
    data_size = test_nsamples
    nbatches_test = int(np.floor((data_size * 1.0) / mini_batch_size))
    print("Testing data")
    if num_batches != 0 and num_batches < nbatches_test:
        print(
            "Limiting to %d batches of the total % d batches"
            % (num_batches, nbatches_test)
        )
        nbatches_test = num_batches
    else:
        print("Total number of batches %d" % nbatches_test)

    # testing data main loop
    for j in range(0, nbatches_test):
        # number of data points in a batch
        print("Reading in test batch: %d / %d" % (j + 1, nbatches_test), end="\r")
        n = min(mini_batch_size, data_size - (j * mini_batch_size))
        # dense feature
        idx_start = j * mini_batch_size
        # WARNING: X_int_test is a PyTorch tensor
        Xt = X_int_test[idx_start : (idx_start + n)]
        Xt = Xt.numpy().astype(np.float32)
        lX_test.append(torch.tensor(Xt))
        # Testing targets - outputs
        # WARNING: y_test is a PyTorch tensor
        P = y_test[idx_start : idx_start + n]
        P = P.numpy().reshape(-1, 1).astype(np.float32)
        lT_test.append(torch.tensor(P))
        # sparse feature (sparse indices)
        lS_emb = []
        lS_emb_offsets = []
        lS_emb_indices = []
        # for each embedding generate a list of n lookups,
        # where each lookup is composed of multiple sparse indices
        for size in range(n_emb):
            lS_batch = []
            lS_batch_offsets = []
            lS_batch_indices = []
            offset = 0
            for _b in range(n):
                # num of sparse indices used per embedding lookup; for the
                # Criteo Kaggle data it is 1 because each feature is categorical
                sparse_group_size = np.int64(1)
                # WARNING: X_cat_test is a PyTorch tensor
                sparse_group = X_cat_test[idx_start + _b][size].view(-1)
                sparse_group = sparse_group.numpy().astype(np.int64)
                # store lengths and indices
                lS_batch.append(sparse_group.tolist())
                lS_batch_offsets += [offset]
                lS_batch_indices += sparse_group.tolist()
                # update offset for next iteration
                offset += sparse_group_size
            lS_emb.append(lS_batch)
            lS_emb_offsets.append(torch.tensor(lS_batch_offsets))
            lS_emb_indices.append(torch.tensor(lS_batch_indices))
        lS_test.append(lS_emb)
        lS_offsets_test.append(lS_emb_offsets)
        lS_indices_test.append(lS_emb_indices)

    return (
        nbatches,
        lX,
        lS,
        lS_offsets,
        lS_indices,
        lT,
        nbatches_test,
        lX_test,
        lS_test,
        lS_offsets_test,
        lS_indices_test,
        lT_test,
        ln_emb,
        m_den,
    )
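
The per-embedding offsets/indices tensors built by this variant match the flat-indices-plus-offsets input that torch.nn.EmbeddingBag expects. The following is a minimal, self-contained sketch (sizes are illustrative assumptions, and the model code is not part of this excerpt) showing how one batch's lS_indices[j][k] / lS_offsets[j][k] pair would be consumed.

import torch

# Minimal sketch (not from the original source): consume one mini-batch's
# per-embedding offsets/indices with nn.EmbeddingBag, which expects exactly
# this flat-indices + offsets layout.
n, num_rows, emb_dim = 4, 10, 8
emb = torch.nn.EmbeddingBag(num_rows, emb_dim, mode="sum")

# Stand-ins for lS_indices[j][k] and lS_offsets[j][k] built above: one index
# per sample, so offsets are simply 0..n-1.
indices = torch.tensor([3, 7, 1, 9], dtype=torch.int64)
offsets = torch.tensor([0, 1, 2, 3], dtype=torch.int64)

pooled = emb(indices, offsets)   # shape: [n, emb_dim]
print(pooled.shape)              # torch.Size([4, 8])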
Example #4
def read_dataset(
    dataset,
    max_ind_range,
    sub_sample_rate,
    mini_batch_size,
    num_batches,
    randomize,
    split="train",
    raw_data="",
    processed_data="",
    memory_map=False,
    inference_only=False,
    test_mini_batch_size=1,
):
    # split the datafile into path and filename
    lstr = raw_data.split("/")
    d_path = "/".join(lstr[0:-1]) + "/"
    d_file = lstr[-1].split(".")[0] if dataset == "kaggle" else lstr[-1]
    # npzfile = d_path + ((d_file + "_day") if dataset == "kaggle" else d_file)
    # trafile = d_path + ((d_file + "_fea") if dataset == "kaggle" else "fea")

    # load
    print("Loading %s dataset..." % dataset)
    nbatches = 0
    file, days = data_utils.loadDataset(
        dataset,
        max_ind_range,
        sub_sample_rate,
        randomize,
        split,
        raw_data,
        processed_data,
        memory_map,
    )

    if memory_map:
        # WARNING: at this point the data has been reordered and shuffled across files
        # e.g. day_<number>_reordered.npz, what remains is simply to read and feed
        # the data from each file, going in the order of days file-by-file, to the
        # model during training.
        train_data = CriteoDatasetWMemoryMap(
            dataset,
            max_ind_range,
            sub_sample_rate,
            randomize,
            "train",
            raw_data,
            processed_data,
        )

        test_data = CriteoDatasetWMemoryMap(
            dataset,
            max_ind_range,
            sub_sample_rate,
            randomize,
            "test",
            raw_data,
            processed_data,
        )

        train_loader = torch.utils.data.DataLoader(
            train_data,
            batch_size=mini_batch_size,
            shuffle=False,
            num_workers=0,
            collate_fn=collate_wrapper_criteo,
            pin_memory=False,
            drop_last=False,  # True
        )

        test_loader = torch.utils.data.DataLoader(
            test_data,
            batch_size=test_mini_batch_size,
            shuffle=False,
            num_workers=0,
            collate_fn=collate_wrapper_criteo,
            pin_memory=False,
            drop_last=False,  # True
        )

        return train_data, train_loader, test_data, test_loader

    else:
        # load and preprocess data
        with np.load(file) as data:
            X_int = data["X_int"]
            X_cat = data["X_cat"]
            y = data["y"]
            counts = data["counts"]

        # get a number of samples per day
        total_file = d_path + d_file + "_day_count.npz"
        with np.load(total_file) as data:
            total_per_file = data["total_per_file"]

        # transform
        (
            X_cat_train,
            X_int_train,
            y_train,
            X_cat_val,
            X_int_val,
            y_val,
            X_cat_test,
            X_int_test,
            y_test,
        ) = data_utils.transformCriteoAdData(
            X_cat, X_int, y, days, split, randomize, total_per_file
        )
        ln_emb = counts
        m_den = X_int_train.shape[1]
        n_emb = len(counts)
        print("Sparse features = %d, Dense features = %d" % (n_emb, m_den))

        # adjust parameters
        def assemble_samples(X_cat, X_int, y, max_ind_range, print_message):
            if max_ind_range > 0:
                X_cat = X_cat % max_ind_range

            nsamples = len(y)
            data_size = nsamples
            # using floor is equivalent to dropping last mini-batch (drop_last = True)
            nbatches = int(np.floor((data_size * 1.0) / mini_batch_size))
            print(print_message)
            if num_batches != 0 and num_batches < nbatches:
                print(
                    "Limiting to %d batches of the total % d batches"
                    % (num_batches, nbatches)
                )
                nbatches = num_batches
            else:
                print("Total number of batches %d" % nbatches)

            # data main loop
            lX = []
            lS_lengths = []
            lS_indices = []
            lT = []
            for j in range(0, nbatches):
                # number of data points in a batch
                print("Reading in batch: %d / %d" % (j + 1, nbatches), end="\r")
                n = min(mini_batch_size, data_size - (j * mini_batch_size))
                # dense feature
                idx_start = j * mini_batch_size
                lX.append((X_int[idx_start : (idx_start + n)]).astype(np.float32))
                # Targets - outputs
                lT.append(
                    (y[idx_start : idx_start + n]).reshape(-1, 1).astype(np.int32)
                )
                # sparse feature (sparse indices)
                lS_emb_indices = []
                # for each embedding generate a list of n lookups,
                # where each lookup is composed of multiple sparse indices
                for size in range(n_emb):
                    lS_batch_indices = []
                    for _b in range(n):
                        # num of sparse indices used per embedding lookup; for the
                        # Criteo Kaggle data it is 1 because each feature is categorical
                        lS_batch_indices += (
                            (X_cat[idx_start + _b][size].reshape(-1)).astype(np.int32)
                        ).tolist()
                    lS_emb_indices.append(lS_batch_indices)
                lS_indices.append(lS_emb_indices)
                # lengths are all 1: one sparse index per lookup per embedding
                lS_lengths.append(
                    [(list(np.ones(n).astype(np.int32))) for _ in range(n_emb)]
                )
            print("\n")

            return nbatches, lX, lS_lengths, lS_indices, lT

        # adjust training data
        (nbatches, lX, lS_lengths, lS_indices, lT) = assemble_samples(
            X_cat_train, X_int_train, y_train, max_ind_range, "Training data"
        )

        # adjust testing data
        (nbatches_t, lX_t, lS_lengths_t, lS_indices_t, lT_t) = assemble_samples(
            X_cat_test, X_int_test, y_test, max_ind_range, "Testing data"
        )
    # end if memory_map

    return (
        nbatches,
        lX,
        lS_lengths,
        lS_indices,
        lT,
        nbatches_t,
        lX_t,
        lS_lengths_t,
        lS_indices_t,
        lT_t,
        ln_emb,
        m_den,
    )
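
The memory-map branch above depends on CriteoDatasetWMemoryMap and collate_wrapper_criteo, neither of which appears in this excerpt. As a rough illustration only, here is a hypothetical collate function (the name collate_criteo_like and the per-sample layout are assumptions) that batches (dense, sparse, target) samples into the dense/offsets/indices/targets format used by the non-memory-map paths; the real collate_wrapper_criteo may differ.

import torch

def collate_criteo_like(list_of_samples):
    """Hypothetical stand-in for collate_wrapper_criteo (not shown in the
    excerpt): batch (dense, sparse, target) samples into the
    dense/offsets/indices/targets layout used elsewhere in this file."""
    # each sample is assumed to be (X_int: [m_den], X_cat: [n_emb], y: scalar)
    X_int, X_cat, y = zip(*list_of_samples)
    X = torch.tensor(X_int, dtype=torch.float32)           # [n, m_den]
    T = torch.tensor(y, dtype=torch.float32).view(-1, 1)   # [n, 1]
    cat = torch.tensor(X_cat, dtype=torch.int64)           # [n, n_emb]
    n, n_emb = cat.shape
    # one lookup per sample per embedding -> offsets 0..n-1 for every table
    lS_i = [cat[:, k] for k in range(n_emb)]
    lS_o = [torch.arange(n, dtype=torch.int64) for _ in range(n_emb)]
    return X, lS_o, lS_i, T

# usage sketch with synthetic samples
samples = [([0.1, 0.2, 0.3], [4, 7], 1.0), ([0.0, 0.5, 0.9], [2, 3], 0.0)]
X, lS_o, lS_i, T = collate_criteo_like(samples)
print(X.shape, T.shape, lS_i[0], lS_o[0])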