Example #1
def fetch_GLASS(path, valid_size=0.2, test_size=0.2, seed=None):

    path = Path(path)
    data_path = path / 'glass.data'

    if not data_path.exists():
        path.mkdir(parents=True, exist_ok=True)

        download(
            'https://archive.ics.uci.edu/ml/machine-learning-databases/glass/glass.data',
            data_path)

    data = np.genfromtxt(data_path, delimiter=',')

    X, Y = (data[:, 1:-1]).astype(np.float32), (data[:, -1] - 1).astype(int)

    X_train, X_test, y_train, y_test = train_test_split(
        X, Y, stratify=Y, test_size=test_size, random_state=seed)

    X_train, X_val, y_train, y_val = train_test_split(
        X_train, y_train, stratify=y_train,
        test_size=valid_size / (1 - test_size), random_state=seed)

    return dict(X_train=X_train,
                y_train=y_train,
                X_valid=X_val,
                y_valid=y_val,
                X_test=X_test,
                y_test=y_test)
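
Note: every fetcher in these examples calls a `download(url, filename)` helper that the snippets do not define. A minimal sketch of such a helper, assuming the `requests` package (the project's real helper may differ, e.g. show a progress bar):

import requests

def download(url, filename, chunk_size=2**20):
    """Stream `url` into `filename` in 1 MiB chunks."""
    with requests.get(url, stream=True) as r:
        r.raise_for_status()
        with open(filename, 'wb') as f:
            for chunk in r.iter_content(chunk_size=chunk_size):
                f.write(chunk)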
Example #2
def fetch_MICROSOFT(path, **kwargs):
    train_path = os.path.join(path, 'msrank_train.tsv')
    test_path = os.path.join(path, 'msrank_test.tsv')
    if not all(os.path.exists(fname) for fname in (train_path, test_path)):
        os.makedirs(path, exist_ok=True)
        download(
            "https://www.dropbox.com/s/izpty5feug57kqn/msrank_train.tsv?dl=1",
            train_path)
        download(
            "https://www.dropbox.com/s/tlsmm9a6krv0215/msrank_test.tsv?dl=1",
            test_path)

        for fname in (train_path, test_path):
            # the raw files contain escaped "\t" sequences; convert to real tabs
            with open(fname) as f:
                raw = f.read().replace('\\t', '\t')
            with open(fname, 'w') as f:
                f.write(raw)

    data_train = pd.read_csv(train_path, header=None, skiprows=1, sep='\t')
    data_test = pd.read_csv(test_path, header=None, skiprows=1, sep='\t')

    train_idx_path = os.path.join(path, 'train_idx.txt')
    valid_idx_path = os.path.join(path, 'valid_idx.txt')
    if not all(
            os.path.exists(fname)
            for fname in (train_idx_path, valid_idx_path)):
        download(
            "https://www.dropbox.com/s/pba6dyibyogep46/train_idx.txt?dl=1",
            train_idx_path)
        download(
            "https://www.dropbox.com/s/yednqu9edgdd2l1/valid_idx.txt?dl=1",
            valid_idx_path)
    train_idx = pd.read_csv(train_idx_path, header=None)[0].values
    valid_idx = pd.read_csv(valid_idx_path, header=None)[0].values

    X_train = data_train.iloc[train_idx, 2:].values
    y_train = data_train.iloc[train_idx, 0].values
    query_train = data_train.iloc[train_idx, 1].values

    X_valid = data_train.iloc[valid_idx, 2:].values
    y_valid = data_train.iloc[valid_idx, 0].values
    query_valid = data_train.iloc[valid_idx, 1].values

    X_test = data_test.iloc[:, 2:].values
    y_test = data_test.iloc[:, 0].values
    query_test = data_test.iloc[:, 1].values

    return dict(
        X_train=X_train.astype(np.float32),
        y_train=y_train.astype(np.int64),
        query_train=query_train,
        X_valid=X_valid.astype(np.float32),
        y_valid=y_valid.astype(np.int64),
        query_valid=query_valid,
        X_test=X_test.astype(np.float32),
        y_test=y_test.astype(np.int64),
        query_test=query_test,
    )
Example #3
def fetch_COVTYPE(path, valid_size=0.2, test_size=0.2, seed=None):

    path = Path(path)
    data_path = path / 'covtype.data'

    if not data_path.exists():
        path.mkdir(parents=True, exist_ok=True)
        archive_path = path / 'covtype.data.gz'
        download(
            'https://archive.ics.uci.edu/ml/machine-learning-databases/covtype/covtype.data.gz',
            archive_path)

        with gzip.open(archive_path, 'rb') as f_in:
            with open(data_path, 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)

    data = np.genfromtxt(data_path, delimiter=',')

    X, Y = (data[:, :-1]).astype(np.float32), (data[:, -1] - 1).astype(int)

    X_train, X_test, y_train, y_test = train_test_split(
        X, Y, stratify=Y, test_size=test_size, random_state=seed)

    X_train, X_val, y_train, y_val = train_test_split(
        X_train, y_train, stratify=y_train,
        test_size=valid_size / (1 - test_size), random_state=seed)

    return dict(X_train=X_train,
                y_train=y_train,
                X_valid=X_val,
                y_valid=y_val,
                X_test=X_test,
                y_test=y_test)
Example #4
def fetch_HIGGS(path,
                train_size=None,
                valid_size=None,
                test_size=5 * 10**5,
                **kwargs):
    data_path = os.path.join(path, 'higgs.csv')
    if not os.path.exists(data_path):
        os.makedirs(path, exist_ok=True)
        archive_path = os.path.join(path, 'HIGGS.csv.gz')
        download(
            'https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz',
            archive_path)
        with gzip.open(archive_path, 'rb') as f_in:
            with open(data_path, 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)
    n_features = 29
    types = {i: (np.float32 if i != 0 else np.int64) for i in range(n_features)}
    data = pd.read_csv(data_path, header=None, dtype=types)
    data_train, data_test = data.iloc[:-test_size], data.iloc[-test_size:]

    X_train = data_train.iloc[:, 1:].values
    y_train = data_train.iloc[:, 0].values
    X_test = data_test.iloc[:, 1:].values
    y_test = data_test.iloc[:, 0].values

    if all(sizes is None for sizes in (train_size, valid_size)):
        train_idx_path = os.path.join(path, 'stratified_train_idx.txt')
        valid_idx_path = os.path.join(path, 'stratified_valid_idx.txt')
        if not all(
                os.path.exists(fname)
                for fname in (train_idx_path, valid_idx_path)):
            download(
                "https://www.dropbox.com/s/i2uekmwqnp9r4ix/stratified_train_idx.txt?dl=1",
                train_idx_path)
            download(
                "https://www.dropbox.com/s/wkbk74orytmb2su/stratified_valid_idx.txt?dl=1",
                valid_idx_path)
        train_idx = pd.read_csv(train_idx_path, header=None)[0].values
        valid_idx = pd.read_csv(valid_idx_path, header=None)[0].values
    else:
        assert train_size, "please provide train_size, or leave all sizes as None"
        if valid_size is None:
            valid_size = len(X_train) - train_size
            assert valid_size > 0
        if train_size + valid_size > len(X_train):
            warnings.warn(
                'train_size + valid_size = {} exceeds dataset size: {}.'.format(
                    train_size + valid_size, len(X_train)), Warning)

        shuffled_indices = np.random.permutation(np.arange(len(X_train)))
        train_idx = shuffled_indices[:train_size]
        valid_idx = shuffled_indices[train_size:train_size + valid_size]

    return dict(
        X_train=X_train[train_idx],
        y_train=y_train[train_idx],
        X_valid=X_train[valid_idx],
        y_valid=y_train[valid_idx],
        X_test=X_test,
        y_test=y_test,
    )
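
For reference, two hypothetical calls covering both splitting modes (the path is made up):

# mode 1: reuse the downloadable stratified train/valid indices
data = fetch_HIGGS('./data/higgs')
# mode 2: draw a random split of the requested sizes (unseeded permutation)
data = fetch_HIGGS('./data/higgs', train_size=10**6, valid_size=10**5)
print(data['X_train'].shape, data['X_valid'].shape)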
Example #5
def fetch_TICTACTOE(path, valid_size=0.2, test_size=0.2, seed=None):

    path = Path(path)
    data_path = path / 'tic-tac-toe.data'

    if not data_path.exists():
        path.mkdir(parents=True, exist_ok=True)

        download(
            'https://archive.ics.uci.edu/ml/machine-learning-databases/tic-tac-toe/tic-tac-toe.data',
            data_path)

    data = pd.read_csv(data_path, names=np.arange(10))
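    # OrdinalEncoder(return_df=False) matches the category_encoders package
    # (an assumption: sklearn's OrdinalEncoder has no return_df parameter)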
    encoder = OrdinalEncoder(return_df=False)
    data = encoder.fit_transform(data)

    X, Y = (data[:, :-1]).astype(np.float32), (data[:, -1] - 1).astype(int)

    X_train, X_test, y_train, y_test = train_test_split(
        X, Y, stratify=Y, test_size=test_size, random_state=seed)

    X_train, X_val, y_train, y_val = train_test_split(
        X_train, y_train, stratify=y_train,
        test_size=valid_size / (1 - test_size), random_state=seed)

    return dict(X_train=X_train,
                y_train=y_train,
                X_valid=X_val,
                y_valid=y_val,
                X_test=X_test,
                y_test=y_test)
Example #6
def fetch_YEAR(path,
               train_size=None,
               valid_size=None,
               test_size=51630,
               **kwargs):
    data_path = os.path.join(path, 'data.csv')
    if not os.path.exists(data_path):
        os.makedirs(path, exist_ok=True)
        download(
            'https://www.dropbox.com/s/l09pug0ywaqsy0e/YearPredictionMSD.txt?dl=1',
            data_path)
    n_features = 91
    types = {i: (np.float32 if i != 0 else np.int64) for i in range(n_features)}
    data = pd.read_csv(data_path, header=None, dtype=types)
    data_train, data_test = data.iloc[:-test_size], data.iloc[-test_size:]

    X_train = data_train.iloc[:, 1:].values
    y_train = data_train.iloc[:, 0].values
    X_test = data_test.iloc[:, 1:].values
    y_test = data_test.iloc[:, 0].values

    if all(sizes is None for sizes in (train_size, valid_size)):
        train_idx_path = os.path.join(path, 'stratified_train_idx.txt')
        valid_idx_path = os.path.join(path, 'stratified_valid_idx.txt')
        if not all(
                os.path.exists(fname)
                for fname in (train_idx_path, valid_idx_path)):
            download(
                "https://www.dropbox.com/s/00u6cnj9mthvzj1/stratified_train_idx.txt?dl=1",
                train_idx_path)
            download(
                "https://www.dropbox.com/s/420uhjvjab1bt7k/stratified_valid_idx.txt?dl=1",
                valid_idx_path)
        train_idx = pd.read_csv(train_idx_path, header=None)[0].values
        valid_idx = pd.read_csv(valid_idx_path, header=None)[0].values
    else:
        assert train_size, "please provide train_size, or leave all sizes as None"
        if valid_size is None:
            valid_size = len(X_train) - train_size
            assert valid_size > 0
        if train_size + valid_size > len(X_train):
            warnings.warn(
                'train_size + valid_size = {} exceeds dataset size: {}.'.format(
                    train_size + valid_size, len(X_train)), Warning)

        shuffled_indices = np.random.permutation(np.arange(len(X_train)))
        train_idx = shuffled_indices[:train_size]
        valid_idx = shuffled_indices[train_size:train_size + valid_size]

    return dict(
        X_train=X_train[train_idx],
        y_train=y_train[train_idx],
        X_valid=X_train[valid_idx],
        y_valid=y_train[valid_idx],
        X_test=X_test,
        y_test=y_test,
    )
Example #7
def fetch_ALOI(path, valid_size=0.2, test_size=0.2, seed=None):

    from PIL import Image
    from tqdm import tqdm

    path = Path(path)
    data_path = path / 'aloi_red4'
    npz_path = path / 'aloi_red4.npz'

    if not data_path.exists():
        path.mkdir(parents=True, exist_ok=True)

        for data_type in ["ill", "col", "view", "stereo"]:
            archive_path = path / f'aloi_red4_{data_type}.tar'
            download(
                f'http://aloi.science.uva.nl/tars/aloi_red4_{data_type}.tar',
                archive_path)
            with tarfile.open(archive_path, 'r') as f_in:
                f_in.extractall(path=data_path)

    if not npz_path.exists():
        # 110250 RGB images of 144x192 pixels: the uint8 array below takes
        # roughly 9 GB of RAM, so this conversion needs a large-memory machine
        X = np.empty((110250, 3, 144, 192), dtype=np.uint8)
        Y = np.empty(110250, dtype=np.uint16)

        # loop over classes
        i = 0
        for c in tqdm(range(1000), desc='Converting ALOI png to npy'):
            c_path = data_path / "png4" / str(c + 1)

            # loop over class instances
            for i_path in c_path.glob('*.png'):
                im_frame = Image.open(i_path)
                X[i] = np.transpose(np.array(im_frame), (2, 0, 1))
                Y[i] = c
                i += 1

        np.savez_compressed(npz_path, X=X, Y=Y)
    else:
        data = np.load(npz_path)
        X, Y = data['X'], data['Y']

    X_train, X_test, y_train, y_test = train_test_split(
        X, Y, stratify=Y, test_size=test_size, random_state=seed)

    X_train, X_val, y_train, y_val = train_test_split(
        X_train, y_train, stratify=y_train,
        test_size=valid_size / (1 - test_size), random_state=seed)

    return dict(X_train=X_train,
                y_train=y_train,
                X_valid=X_val,
                y_valid=y_val,
                X_test=X_test,
                y_test=y_test)
Example #8
def fetch_EPSILON(path,
                  train_size=None,
                  valid_size=None,
                  test_size=None,
                  **kwargs):
    train_path = os.path.join(path, 'epsilon_normalized')
    test_path = os.path.join(path, 'epsilon_normalized.t')
    if not all(os.path.exists(fname) for fname in (train_path, test_path)):
        os.makedirs(path, exist_ok=True)
        train_archive_path = os.path.join(path, 'epsilon_normalized.bz2')
        test_archive_path = os.path.join(path, 'epsilon_normalized.t.bz2')
        if not all(
                os.path.exists(fname)
                for fname in (train_archive_path, test_archive_path)):
            download(
                "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/epsilon_normalized.bz2",
                train_archive_path)
            download(
                "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/epsilon_normalized.t.bz2",
                test_archive_path)
        print("unpacking dataset")
        for file_name, archive_name in zip(
            (train_path, test_path), (train_archive_path, test_archive_path)):
            with bz2.BZ2File(archive_name) as f_in:
                with open(file_name, 'wb') as f_out:
                    f_out.write(f_in.read())

    print("reading dataset (it may take a long time)")
    X_train, y_train = load_svmlight_file(train_path,
                                          dtype=np.float32,
                                          n_features=2000)
    X_test, y_test = load_svmlight_file(test_path,
                                        dtype=np.float32,
                                        n_features=2000)
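    # densifying below: epsilon has 400k train / 100k test rows with 2000
    # features, so the float32 arrays need roughly 3.2 GB + 0.8 GB of RAM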
    X_train, X_test = X_train.toarray(), X_test.toarray()
    y_train, y_test = y_train.astype(np.int64), y_test.astype(np.int64)
    y_train[y_train == -1] = 0
    y_test[y_test == -1] = 0

    if all(sizes is None for sizes in (train_size, valid_size, test_size)):
        train_idx_path = os.path.join(path, 'stratified_train_idx.txt')
        valid_idx_path = os.path.join(path, 'stratified_valid_idx.txt')
        if not all(
                os.path.exists(fname)
                for fname in (train_idx_path, valid_idx_path)):
            download(
                "https://www.dropbox.com/s/wxgm94gvm6d3xn5/stratified_train_idx.txt?dl=1",
                train_idx_path)
            download(
                "https://www.dropbox.com/s/fm4llo5uucdglti/stratified_valid_idx.txt?dl=1",
                valid_idx_path)
        train_idx = pd.read_csv(train_idx_path, header=None)[0].values
        valid_idx = pd.read_csv(valid_idx_path, header=None)[0].values
    else:
        assert train_size, "please provide train_size, or leave all sizes as None"
        if valid_size is None:
            valid_size = len(X_train) - train_size
            assert valid_size > 0
        if train_size + valid_size > len(X_train):
            warnings.warn(
                'train_size + valid_size = {} exceeds dataset size: {}.'.format(
                    train_size + valid_size, len(X_train)), Warning)
        if test_size is not None:
            warnings.warn('Test set is fixed for this dataset.', Warning)

        shuffled_indices = np.random.permutation(np.arange(len(X_train)))
        train_idx = shuffled_indices[:train_size]
        valid_idx = shuffled_indices[train_size:train_size + valid_size]

    return dict(X_train=X_train[train_idx],
                y_train=y_train[train_idx],
                X_valid=X_train[valid_idx],
                y_valid=y_train[valid_idx],
                X_test=X_test,
                y_test=y_test)
Example #9
def fetch_YAHOO(path, *args):
    train_path = os.path.join(path, 'yahoo_train.tsv')
    valid_path = os.path.join(path, 'yahoo_valid.tsv')
    test_path = os.path.join(path, 'yahoo_test.tsv')
    if not all(
            os.path.exists(fname)
            for fname in (train_path, valid_path, test_path)):
        os.makedirs(path, exist_ok=True)
        train_archive_path = os.path.join(path, 'yahoo_train.tsv.gz')
        valid_archive_path = os.path.join(path, 'yahoo_valid.tsv.gz')
        test_archive_path = os.path.join(path, 'yahoo_test.tsv.gz')
        if not all(
                os.path.exists(fname)
                for fname in (train_archive_path, valid_archive_path,
                              test_archive_path)):
            download(
                "https://www.dropbox.com/s/7rq3ki5vtxm6gzx/yahoo_set_1_train.gz?dl=1",
                train_archive_path)
            download(
                "https://www.dropbox.com/s/3ai8rxm1v0l5sd1/yahoo_set_1_validation.gz?dl=1",
                valid_archive_path)
            download(
                "https://www.dropbox.com/s/3d7tdfb1an0b6i4/yahoo_set_1_test.gz?dl=1",
                test_archive_path)

        for file_name, archive_name in zip(
            (train_path, valid_path, test_path),
            (train_archive_path, valid_archive_path, test_archive_path)):
            with gzip.open(archive_name, 'rb') as f_in:
                with open(file_name, 'wb') as f_out:
                    shutil.copyfileobj(f_in, f_out)

        for fname in (train_path, valid_path, test_path):
            # the raw files contain escaped "\t" sequences; convert to real tabs
            with open(fname) as f:
                raw = f.read().replace('\\t', '\t')
            with open(fname, 'w') as f:
                f.write(raw)

    data_train = pd.read_csv(train_path, header=None, skiprows=1, sep='\t')
    data_valid = pd.read_csv(valid_path, header=None, skiprows=1, sep='\t')
    data_test = pd.read_csv(test_path, header=None, skiprows=1, sep='\t')

    X_train = data_train.iloc[:, 2:].values
    y_train = data_train.iloc[:, 0].values
    query_train = data_train.iloc[:, 1].values

    X_valid = data_valid.iloc[:, 2:].values
    y_valid = data_valid.iloc[:, 0].values
    query_valid = data_valid.iloc[:, 1].values

    X_test = data_test.iloc[:, 2:].values
    y_test = data_test.iloc[:, 0].values
    query_test = data_test.iloc[:, 1].values

    return dict(
        X_train=X_train.astype(np.float32),
        y_train=y_train,
        query_train=query_train,
        X_valid=X_valid.astype(np.float32),
        y_valid=y_valid,
        query_valid=query_valid,
        X_test=X_test.astype(np.float32),
        y_test=y_test,
        query_test=query_test,
    )
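
The query_* arrays hold one query id per row, while ranking toolkits usually expect contiguous group sizes. A small conversion sketch, assuming rows are already grouped by query id (typical for these files) and `data` is the dict returned above:

data = fetch_YAHOO('./data/yahoo')  # hypothetical path
qids = data['query_train']
boundaries = np.flatnonzero(np.diff(qids)) + 1  # indices where the query id changes
group_sizes = np.diff(np.concatenate(([0], boundaries, [len(qids)])))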
Example #10
def fetch_A9A(path,
              train_size=None,
              valid_size=None,
              test_size=None,
              **kwargs):
    train_path = os.path.join(path, 'a9a')
    test_path = os.path.join(path, 'a9a.t')
    if not all(os.path.exists(fname) for fname in (train_path, test_path)):
        os.makedirs(path, exist_ok=True)
        download("https://www.dropbox.com/s/9cqdx166iwonrj9/a9a?dl=1",
                 train_path)
        download("https://www.dropbox.com/s/sa0ds895c0v4xc6/a9a.t?dl=1",
                 test_path)

    X_train, y_train = load_svmlight_file(train_path,
                                          dtype=np.float32,
                                          n_features=123)
    X_test, y_test = load_svmlight_file(test_path,
                                        dtype=np.float32,
                                        n_features=123)
    X_train, X_test = X_train.toarray(), X_test.toarray()
    y_train[y_train == -1] = 0
    y_test[y_test == -1] = 0
    y_train, y_test = y_train.astype(np.int64), y_test.astype(np.int64)

    if all(sizes is None for sizes in (train_size, valid_size, test_size)):
        train_idx_path = os.path.join(path, 'stratified_train_idx.txt')
        valid_idx_path = os.path.join(path, 'stratified_valid_idx.txt')
        if not all(
                os.path.exists(fname)
                for fname in (train_idx_path, valid_idx_path)):
            download(
                "https://www.dropbox.com/s/xy4wwvutwikmtha/stratified_train_idx.txt?dl=1",
                train_idx_path)
            download(
                "https://www.dropbox.com/s/nthpxofymrais5s/stratified_test_idx.txt?dl=1",
                valid_idx_path)
        train_idx = pd.read_csv(train_idx_path, header=None)[0].values
        valid_idx = pd.read_csv(valid_idx_path, header=None)[0].values
    else:
        assert train_size, "please provide train_size, or leave all sizes as None"
        if valid_size is None:
            valid_size = len(X_train) - train_size
            assert valid_size > 0
        if train_size + valid_size > len(X_train):
            warnings.warn(
                'train_size + valid_size = {} exceeds dataset size: {}.'.format(
                    train_size + valid_size, len(X_train)), Warning)
        if test_size is not None:
            warnings.warn('Test set is fixed for this dataset.', Warning)

        shuffled_indices = np.random.permutation(np.arange(len(X_train)))
        train_idx = shuffled_indices[:train_size]
        valid_idx = shuffled_indices[train_size:train_size + valid_size]

    return dict(X_train=X_train[train_idx],
                y_train=y_train[train_idx],
                X_valid=X_train[valid_idx],
                y_valid=y_train[valid_idx],
                X_test=X_test,
                y_test=y_test)
Example #11
def fetch_PROTEIN(path,
                  train_size=None,
                  valid_size=None,
                  test_size=None,
                  **kwargs):
    """
    https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multiclass.html#protein
    """
    train_path = os.path.join(path, 'protein')
    test_path = os.path.join(path, 'protein.t')
    if not all(os.path.exists(fname) for fname in (train_path, test_path)):
        os.makedirs(path, exist_ok=True)
        download("https://www.dropbox.com/s/pflp4vftdj3qzbj/protein.tr?dl=1",
                 train_path)
        download("https://www.dropbox.com/s/z7i5n0xdcw57weh/protein.t?dl=1",
                 test_path)
    for fname in (train_path, test_path):
        # normalize values written as ' .' so load_svmlight_file can parse them
        with open(fname) as f:
            raw = f.read().replace(' .', '0.')
        with open(fname, 'w') as f:
            f.write(raw)

    X_train, y_train = load_svmlight_file(train_path,
                                          dtype=np.float32,
                                          n_features=357)
    X_test, y_test = load_svmlight_file(test_path,
                                        dtype=np.float32,
                                        n_features=357)
    X_train, X_test = X_train.toarray(), X_test.toarray()
    y_train, y_test = y_train.astype(np.int64), y_test.astype(np.int64)

    if all(sizes is None for sizes in (train_size, valid_size, test_size)):
        train_idx_path = os.path.join(path, 'stratified_train_idx.txt')
        valid_idx_path = os.path.join(path, 'stratified_valid_idx.txt')
        if not all(
                os.path.exists(fname)
                for fname in (train_idx_path, valid_idx_path)):
            download(
                "https://www.dropbox.com/s/wq2v9hl1wxfufs3/small_stratified_train_idx.txt?dl=1",
                train_idx_path)
            download(
                "https://www.dropbox.com/s/7o9el8pp1bvyy22/small_stratified_valid_idx.txt?dl=1",
                valid_idx_path)
        train_idx = pd.read_csv(train_idx_path, header=None)[0].values
        valid_idx = pd.read_csv(valid_idx_path, header=None)[0].values
    else:
        assert train_size, "please provide train_size, or leave all sizes as None"
        if valid_size is None:
            valid_size = len(X_train) - train_size
            assert valid_size > 0
        if train_size + valid_size > len(X_train):
            warnings.warn(
                'train_size + valid_size = {} exceeds dataset size: {}.'.format(
                    train_size + valid_size, len(X_train)), Warning)
        if test_size is not None:
            warnings.warn('Test set is fixed for this dataset.', Warning)

        shuffled_indices = np.random.permutation(np.arange(len(X_train)))
        train_idx = shuffled_indices[:train_size]
        valid_idx = shuffled_indices[train_size:train_size + valid_size]

    return dict(X_train=X_train[train_idx],
                y_train=y_train[train_idx],
                X_valid=X_train[valid_idx],
                y_valid=y_train[valid_idx],
                X_test=X_test,
                y_test=y_test)
Example #12
def fetch_CLICK(path, valid_size=100_000, seed=None, **kwargs):
    # based on: https://www.kaggle.com/slamnz/primer-airlines-delay
    csv_path = os.path.join(path, 'click.csv')
    if not os.path.exists(csv_path):
        os.makedirs(path, exist_ok=True)
        download('https://www.dropbox.com/s/w43ylgrl331svqc/click.csv?dl=1',
                 csv_path)

    data = pd.read_csv(csv_path, index_col=0)
    X, y = data.drop(columns=['target']), data['target']
    X_train, X_test = X[:-100_000].copy(), X[-100_000:].copy()
    y_train, y_test = y[:-100_000].copy(), y[-100_000:].copy()

    y_train = (y_train.values.reshape(-1) == 1).astype('int64')
    y_test = (y_test.values.reshape(-1) == 1).astype('int64')

    cat_features = [
        'url_hash', 'ad_id', 'advertiser_id', 'query_id', 'keyword_id',
        'title_id', 'description_id', 'user_id'
    ]

    X_train, X_val, y_train, y_val = train_test_split(
        X_train, y_train, test_size=valid_size, random_state=seed)

    # The snippet is truncated at this point. A plausible completion
    # (an assumption): leave-one-out-encode the categorical columns and
    # return the usual split dict.
    encoder = LeaveOneOutEncoder()  # from category_encoders (assumption)
    encoder.fit(X_train[cat_features], y_train)
    for frame in (X_train, X_val, X_test):
        frame[cat_features] = encoder.transform(frame[cat_features])

    return dict(X_train=X_train.values.astype(np.float32),
                y_train=y_train,
                X_valid=X_val.values.astype(np.float32),
                y_valid=y_val,
                X_test=X_test.values.astype(np.float32),
                y_test=y_test)