Example #1
#!/usr/bin/env python3

# This demo shows how to load a libsvm file using napkinXC's load_libsvm_file function,
# which is easier to use, faster, and more memory efficient than Sklearn's load_svmlight_file.
# This example requires Sklearn to be installed.

from time import time
from sklearn.datasets import load_svmlight_file
from napkinxc.datasets import download_dataset, load_libsvm_file

# Use download_dataset function to download one of the benchmark datasets
# from XML Repository (http://manikvarma.org/downloads/XC/XMLRepository.html).
download_dataset("eurlex-4k", "train")
file = "data/Eurlex/eurlex_train.txt"

# Load using Sklearn
# Because Sklearn's method cannot handle the header used by the XML Repository, the offset and the number of features need to be provided.
start = time()
X, Y = load_svmlight_file(file,
                          multilabel=True,
                          zero_based=True,
                          n_features=5000,
                          offset=1)
print("Sklearn's load_svmlight_file time:", time() - start)

# Load using napkinXC
# It supports two output formats for labels: a list of tuples, as in the Sklearn version, and a sparse Scipy csr_matrix; the list format is the default.
start = time()
X, Y = load_libsvm_file(file, labels_format='list')
print("napkinXC's load_libsvm_file time:", time() - start)
Example #2
    "amazon": "Amazon-670K",
    "amazon-3M": "Amazon-3M",
    "deliciousLarge": "Delicious-200K",
    "eurlex": "EURLex-4K",
    "wiki10": "Wiki10-31K",
    "wikiLSHTC": "WikiLSHTC-325K",
    "WikipediaLarge-500K": "Wikipedia-500K",
}

if __name__ == "__main__":
    if len(sys.argv) < 2:
        print(
            "Usage: download_dataset.py [dataset name] [format (optional)] [root dir (optional)]"
        )
        exit(1)

    dataset = old_aliases.get(sys.argv[1], sys.argv[1])

    format = "bow"
    if len(sys.argv) >= 3:
        format = sys.argv[2]

    root = "data"
    if len(sys.argv) >= 4:
        root = sys.argv[3]

    dataset_meta = _get_data_meta(dataset, format=format)
    download_dataset(dataset, format=format, root=root, verbose=True)
    shutil.move(os.path.join(root, dataset_meta['dir']),
                os.path.join(root, sys.argv[1]))
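For reference, a minimal sketch of the equivalent direct library call, using the same defaults as the script above ('bow' format, 'data' root, verbose output) and the 'eurlex-4k' dataset name from Example #1:

from napkinxc.datasets import download_dataset

# Downloads the EURLex-4K bag-of-words files into the "data" directory,
# mirroring the defaults of the CLI wrapper above.
download_dataset("eurlex-4k", format="bow", root="data", verbose=True)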
Example #3
import os
from time import time

import numpy as np
from sklearn.datasets import load_svmlight_file

from napkinxc.datasets import download_dataset, load_libsvm_file

# TEST_DATA_PATH is the download root defined in the test suite's
# configuration (see Example #5 and the sketch after it).


def test_load_libsvm():
    datasets = {
        "eurlex-4k": {
            "file": os.path.join(TEST_DATA_PATH, "Eurlex/eurlex_test.txt"),
            "sklearn_args": {
                "multilabel": True,
                "zero_based": True,
                "n_features": 5000,
                "offset": 1
            }
        },
        "amazonCat-13k": {
            "file": os.path.join(TEST_DATA_PATH,
                                 "AmazonCat/amazonCat_test.txt"),
            "sklearn_args": {
                "multilabel": True,
                "zero_based": True,
                "n_features": 203882,
                "offset": 1
            }
        },
        "amazonCat-14k": {
            "file":
            os.path.join(TEST_DATA_PATH,
                         "AmazonCat-14K/amazonCat-14K_test.txt"),
            "sklearn_args": {
                "multilabel": True,
                "zero_based": True,
                "n_features": 597540,
                "offset": 1
            }
        },
        "wiki10-31k": {
            "file": os.path.join(TEST_DATA_PATH, "Wiki10/wiki10_test.txt"),
            "sklearn_args": {
                "multilabel": True,
                "zero_based": True,
                "n_features": 101938,
                "offset": 1
            }
        }
    }

    for d, v in datasets.items():
        download_dataset(d, subset='test', format='bow', root=TEST_DATA_PATH)
        print("\n{} time comparison:".format(d))

        t_start = time()
        sk_X, sk_Y = load_svmlight_file(v["file"], **v["sklearn_args"])
        print(
            "\tsklearn.datasets.load_svmlight_file time: {}s".format(time() -
                                                                     t_start))

        t_start = time()
        nxc_X1, nxc_Y_list = load_libsvm_file(v["file"], labels_format="list")
        print("\tnapkinXC.datasets.load_libsvm_file time: {}s".format(time() -
                                                                      t_start))

        t_start = time()
        nxc_X2, nxc_Y_csrm = load_libsvm_file(v["file"],
                                              labels_format="csr_matrix")
        print("\tnapkinXC.datasets.load_libsvm_file time: {}s".format(time() -
                                                                      t_start))

        assert np.array_equal(nxc_X1.indptr, nxc_X2.indptr)
        assert np.array_equal(nxc_X1.indices, nxc_X2.indices)
        assert np.array_equal(nxc_X1.data, nxc_X2.data)

        assert np.array_equal(nxc_X1.indptr, sk_X.indptr)
        assert np.array_equal(nxc_X1.indices, sk_X.indices)
        assert np.allclose(nxc_X1.data, sk_X.data)
        assert nxc_X1.shape[0] == nxc_Y_csrm.shape[0]

        assert len(nxc_Y_list) == len(sk_Y)
        for nxc_y, sk_y in zip(nxc_Y_list, sk_Y):
            assert len(nxc_y) == len(sk_y)
            assert all(y1 == y2 for y1, y2 in zip(nxc_y, sk_y))
Example #4
from time import time

import numpy as np
from sklearn.datasets import load_svmlight_file

from napkinxc.datasets import download_dataset, load_libsvm_file


def test_load_libsvm():
    datasets = {
        "eurlex-4k": {
            "file": "data/Eurlex/eurlex_test.txt",
            "sklearn_args": {
                "multilabel": True,
                "zero_based": True,
                "n_features": 5000,
                "offset": 1
            }
        },
        "amazonCat-13k": {
            "file": "data/AmazonCat/amazonCat_test.txt",
            "sklearn_args": {
                "multilabel": True,
                "zero_based": True,
                "n_features": 203882,
                "offset": 1
            }
        },
        "amazonCat-14k": {
            "file": "data/AmazonCat-14K/amazonCat-14K_test.txt",
            "sklearn_args": {
                "multilabel": True,
                "zero_based": True,
                "n_features": 597540,
                "offset": 1
            }
        },
        "wiki10-31k": {
            "file": "data/Wiki10/wiki10_test.txt",
            "sklearn_args": {
                "multilabel": True,
                "zero_based": True,
                "n_features": 101938,
                "offset": 1
            }
        }
    }

    for d, v in datasets.items():
        download_dataset(d, subset='test', format='bow')
        print("\n{} time comparison:".format(d))

        t_start = time()
        sk_X, sk_Y = load_svmlight_file(v["file"], **v["sklearn_args"])
        print(
            "\tsklearn.datasets.load_svmlight_file time: {}s".format(time() -
                                                                     t_start))

        t_start = time()
        nxc_X, nxc_Y = load_libsvm_file(v["file"])
        print("\tnapkinXC.datasets.load_libsvm_file time: {}s".format(time() -
                                                                      t_start))

        assert np.array_equal(nxc_X.indptr, sk_X.indptr)
        assert np.array_equal(nxc_X.indices, sk_X.indices)
        assert np.allclose(nxc_X.data, sk_X.data)

        assert len(nxc_Y) == len(sk_Y)
        for nxc_y, sk_y in zip(nxc_Y, sk_Y):
            assert len(nxc_y) == len(sk_y)
            assert all(y1 == y2 for y1, y2 in zip(nxc_y, sk_y))
Example #5
from napkinxc.datasets import download_dataset

# pytest hook that runs once before the test session; TEST_DATASET and
# TEST_DATA_PATH are module-level constants of the test configuration
# (a sketch of possible values follows below).


def pytest_configure(config):
    print("Downloading/checking test data...")
    download_dataset(TEST_DATASET, "train", root=TEST_DATA_PATH)
    download_dataset(TEST_DATASET, "test", root=TEST_DATA_PATH)