def test_load_with_long_qid():
    # load an svmlight file with a long-integer qid attribute
    data = b"""
    1 qid:0 0:1 1:2 2:3
    0 qid:72048431380967004 0:1440446648 1:72048431380967004 2:236784985
    0 qid:-9223372036854775807 0:1440446648 1:72048431380967004 2:236784985
    3 qid:9223372036854775807 0:1440446648 1:72048431380967004 2:236784985"""
    X, y, qid = load_svmlight_file(BytesIO(data), query_id=True)

    true_X = [[1, 2, 3],
              [1440446648, 72048431380967004, 236784985],
              [1440446648, 72048431380967004, 236784985],
              [1440446648, 72048431380967004, 236784985]]
    true_y = [1, 0, 0, 3]
    trueQID = [0, 72048431380967004, -9223372036854775807, 9223372036854775807]
    assert_array_equal(y, true_y)
    assert_array_equal(X.toarray(), true_X)
    assert_array_equal(qid, trueQID)

    f = BytesIO()
    dump_svmlight_file(X, y, f, query_id=qid, zero_based=True)
    f.seek(0)
    X, y, qid = load_svmlight_file(f, query_id=True, zero_based=True)
    assert_array_equal(y, true_y)
    assert_array_equal(X.toarray(), true_X)
    assert_array_equal(qid, trueQID)

    f.seek(0)
    X, y = load_svmlight_file(f, query_id=False, zero_based=True)
    assert_array_equal(y, true_y)
    assert_array_equal(X.toarray(), true_X)

def test_load_zeros():
    # an all-zeros matrix should survive a dump/load round-trip
    # for every zero_based setting
    f = BytesIO()
    true_X = sp.csr_matrix(np.zeros(shape=(3, 4)))
    true_y = np.array([0, 1, 0])
    dump_svmlight_file(true_X, true_y, f)

    for zero_based in ['auto', True, False]:
        f.seek(0)
        X, y = load_svmlight_file(f, n_features=4, zero_based=zero_based)
        assert_array_almost_equal(y, true_y)
        assert_array_almost_equal(X.toarray(), true_X.toarray())

def test_dump_invalid():
    X, y = load_svmlight_file(datafile)

    # a 2-d y must be rejected
    f = BytesIO()
    y2d = [y]
    with pytest.raises(ValueError):
        dump_svmlight_file(X, y2d, f)

    # y must have as many entries as X has rows
    f = BytesIO()
    with pytest.raises(ValueError):
        dump_svmlight_file(X, y[:-1], f)

def test_dump_multilabel():
    X = [[1, 0, 3, 0, 5],
         [0, 0, 0, 0, 0],
         [0, 5, 0, 1, 0]]
    y_dense = [[0, 1, 0], [1, 0, 1], [1, 1, 0]]
    y_sparse = sp.csr_matrix(y_dense)
    for y in [y_dense, y_sparse]:
        f = BytesIO()
        dump_svmlight_file(X, y, f, multilabel=True)
        f.seek(0)
        # make sure it dumps multilabel correctly
        assert f.readline() == b"1 0:1 2:3 4:5\n"
        assert f.readline() == b"0,2 \n"
        assert f.readline() == b"0,1 1:5 3:1\n"

def test_dump_query_id():
    # test dumping a file with query_id
    X, y = load_svmlight_file(datafile)
    X = X.toarray()
    query_id = np.arange(X.shape[0]) // 2

    f = BytesIO()
    dump_svmlight_file(X, y, f, query_id=query_id, zero_based=True)

    f.seek(0)
    X1, y1, query_id1 = load_svmlight_file(f, query_id=True, zero_based=True)
    assert_array_almost_equal(X, X1.toarray())
    assert_array_almost_equal(y, y1)
    assert_array_almost_equal(query_id, query_id1)

def test_dump_comment():
    X, y = load_svmlight_file(datafile)
    X = X.toarray()

    f = BytesIO()
    ascii_comment = "This is a comment\nspanning multiple lines."
    dump_svmlight_file(X, y, f, comment=ascii_comment, zero_based=False)
    f.seek(0)

    X2, y2 = load_svmlight_file(f, zero_based=False)
    assert_array_almost_equal(X, X2.toarray())
    assert_array_almost_equal(y, y2)

    # XXX we have to update this to support Python 3.x
    utf8_comment = b"It is true that\n\xc2\xbd\xc2\xb2 = \xc2\xbc"
    f = BytesIO()
    with pytest.raises(UnicodeDecodeError):
        dump_svmlight_file(X, y, f, comment=utf8_comment)

    unicode_comment = utf8_comment.decode("utf-8")
    f = BytesIO()
    dump_svmlight_file(X, y, f, comment=unicode_comment, zero_based=False)
    f.seek(0)

    X2, y2 = load_svmlight_file(f, zero_based=False)
    assert_array_almost_equal(X, X2.toarray())
    assert_array_almost_equal(y, y2)

    f = BytesIO()
    with pytest.raises(ValueError):
        dump_svmlight_file(X, y, f, comment="I've got a \0.")

def test_load_offset_exhaustive_splits():
    rng = np.random.RandomState(0)
    X = np.array([
        [0, 0, 0, 0, 0, 0],
        [1, 2, 3, 4, 0, 6],
        [1, 2, 3, 4, 0, 6],
        [0, 0, 0, 0, 0, 0],
        [1, 0, 3, 0, 0, 0],
        [0, 0, 0, 0, 0, 1],
        [1, 0, 0, 0, 0, 0],
    ])
    X = sp.csr_matrix(X)
    n_samples, n_features = X.shape
    y = rng.randint(low=0, high=2, size=n_samples)
    query_id = np.arange(n_samples) // 2

    f = BytesIO()
    dump_svmlight_file(X, y, f, query_id=query_id)
    f.seek(0)

    size = len(f.getvalue())

    # load the same data in 2 parts with all the possible byte offsets to
    # locate the split, so as to test the boundary cases
    for mark in range(size):
        f.seek(0)
        X_0, y_0, q_0 = load_svmlight_file(f, n_features=n_features,
                                           query_id=True, offset=0,
                                           length=mark)
        X_1, y_1, q_1 = load_svmlight_file(f, n_features=n_features,
                                           query_id=True, offset=mark,
                                           length=-1)
        q_concat = np.concatenate([q_0, q_1])
        y_concat = np.concatenate([y_0, y_1])
        X_concat = sp.vstack([X_0, X_1])
        assert_array_almost_equal(y, y_concat)
        assert_array_equal(query_id, q_concat)
        assert_array_almost_equal(X.toarray(), X_concat.toarray())

# exercise a grid of shapes and sparsity levels, from all-zeros to fully dense
@pytest.mark.parametrize("sparsity", [0, 0.1, 0.5, 0.99, 1])
@pytest.mark.parametrize("n_samples", [13, 101])
@pytest.mark.parametrize("n_features", [2, 7, 41])
def test_load_with_offsets(sparsity, n_samples, n_features):
    rng = np.random.RandomState(0)
    X = rng.uniform(low=0.0, high=1.0, size=(n_samples, n_features))
    if sparsity:
        X[X < sparsity] = 0.0
    X = sp.csr_matrix(X)
    y = rng.randint(low=0, high=2, size=n_samples)

    f = BytesIO()
    dump_svmlight_file(X, y, f)
    f.seek(0)

    size = len(f.getvalue())

    # put some marks that are likely to happen anywhere in a row
    mark_0 = 0
    mark_1 = size // 3
    length_0 = mark_1 - mark_0
    mark_2 = 4 * size // 5
    length_1 = mark_2 - mark_1

    # load the original sparse matrix into 3 independent CSR matrices
    X_0, y_0 = load_svmlight_file(f, n_features=n_features,
                                  offset=mark_0, length=length_0)
    X_1, y_1 = load_svmlight_file(f, n_features=n_features,
                                  offset=mark_1, length=length_1)
    X_2, y_2 = load_svmlight_file(f, n_features=n_features,
                                  offset=mark_2)

    y_concat = np.concatenate([y_0, y_1, y_2])
    X_concat = sp.vstack([X_0, X_1, X_2])
    assert_array_almost_equal(y, y_concat)
    assert_array_almost_equal(X.toarray(), X_concat.toarray())

def test_dump_concise():
    one = 1
    two = 2.1
    three = 3.01
    exact = 1.000000000000001
    # loses the last decimal place
    almost = 1.0000000000000001
    X = [[one, two, three, exact, almost],
         [1e9, 2e18, 3e27, 0, 0],
         [0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0]]
    y = [one, two, three, exact, almost]

    f = BytesIO()
    dump_svmlight_file(X, y, f)
    f.seek(0)

    # make sure it's using the most concise format possible
    assert f.readline() == b"1 0:1 1:2.1 2:3.01 3:1.000000000000001 4:1\n"
    assert f.readline() == b"2.1 0:1000000000 1:2e+18 2:3e+27\n"
    assert f.readline() == b"3.01 \n"
    assert f.readline() == b"1.000000000000001 \n"
    assert f.readline() == b"1 \n"
    f.seek(0)

    # make sure it's correct too :)
    X2, y2 = load_svmlight_file(f)
    assert_array_almost_equal(X, X2.toarray())
    assert_array_almost_equal(y, y2)

def test_dump():
    X_sparse, y_dense = load_svmlight_file(datafile)
    X_dense = X_sparse.toarray()
    y_sparse = sp.csr_matrix(y_dense)

    # slicing a csr_matrix can unsort its .indices, so test that we sort
    # those correctly
    X_sliced = X_sparse[np.arange(X_sparse.shape[0])]
    y_sliced = y_sparse[np.arange(y_sparse.shape[0])]

    for X in (X_sparse, X_dense, X_sliced):
        for y in (y_sparse, y_dense, y_sliced):
            for zero_based in (True, False):
                for dtype in [np.float32, np.float64, np.int32, np.int64]:
                    f = BytesIO()
                    # we need to pass a comment to get the version info in;
                    # LibSVM doesn't grok comments so they're not put in by
                    # default anymore.

                    if sp.issparse(y) and y.shape[0] == 1:
                        # make sure y's shape is (n_samples, n_labels)
                        # when it is sparse
                        y = y.T

                    # Note: with dtype=np.int32 we are performing unsafe casts,
                    # where X.astype(dtype) overflows. The result is then
                    # platform dependent and X_dense.astype(dtype) may be
                    # different from X_sparse.astype(dtype).toarray().
                    X_input = X.astype(dtype)

                    dump_svmlight_file(X_input, y, f, comment="test",
                                       zero_based=zero_based)
                    f.seek(0)

                    comment = f.readline()
                    comment = str(comment, "utf-8")
                    assert "scikit-learn %s" % mrex.__version__ in comment

                    comment = f.readline()
                    comment = str(comment, "utf-8")
                    assert ["one", "zero"][zero_based] + "-based" in comment

                    X2, y2 = load_svmlight_file(f, dtype=dtype,
                                                zero_based=zero_based)
                    assert X2.dtype == dtype
                    assert_array_equal(X2.sorted_indices().indices, X2.indices)

                    X2_dense = X2.toarray()
                    if sp.issparse(X_input):
                        X_input_dense = X_input.toarray()
                    else:
                        X_input_dense = X_input

                    if dtype == np.float32:
                        # allow a rounding error at the last decimal place
                        assert_array_almost_equal(X_input_dense, X2_dense, 4)
                        assert_array_almost_equal(
                            y_dense.astype(dtype, copy=False), y2, 4)
                    else:
                        # allow a rounding error at the last decimal place
                        assert_array_almost_equal(X_input_dense, X2_dense, 15)
                        assert_array_almost_equal(
                            y_dense.astype(dtype, copy=False), y2, 15)

def dump_svmlight_file(self, file):
    # write self.data to `file` in svmlight format: columns 0-1 are the
    # features, column 2 is the target
    data = np.array(self.data)
    X = data[:, 0:2]
    y = data[:, 2]
    dump_svmlight_file(X, y, file)