def testTrainTestSplit(self):
    """Exercise the basic behaviour of ``train_test_split``.

    Covers four scenarios: an even split requested through
    ``train_size=.5``, row correspondence between the split features and
    labels, n-dimensional inputs split with the default sizes, and an
    unshuffled split with the test size given as a count or a fraction.
    """
    features = np.arange(100).reshape((10, 10))
    labels = np.arange(10)

    # Even split: both halves must hold the same number of labels.
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=None, train_size=.5)
    assert len(y_test) == len(y_train)

    # Row i of ``features`` starts with 10 * i, so the first column of each
    # part must equal ten times the labels that travelled with it.
    np.testing.assert_array_equal(X_train[:, 0], y_train * 10)
    np.testing.assert_array_equal(X_test[:, 0], y_test * 10)

    # n-dimensional inputs: only the leading axis is expected to be split.
    X_4d = np.arange(10 * 5 * 3 * 2).reshape(10, 5, 3, 2)
    y_3d = np.arange(10 * 7 * 11).reshape(10, 7, 11)
    parts = train_test_split(X_4d, y_3d)
    expected_shapes = [
        (7, 5, 3, 2),  # X train
        (3, 5, 3, 2),  # X test
        (7, 7, 11),    # y train
        (3, 7, 11),    # y test
    ]
    for position, shape in enumerate(expected_shapes):
        assert parts[position].shape == shape

    # Without shuffling, the test part is the tail of the input whether the
    # size is given as a sample count (2) or as a fraction (0.2).
    sequence = np.arange(10)
    for size in (2, 0.2):
        head, tail = train_test_split(sequence, shuffle=False, test_size=size)
        np.testing.assert_array_equal(tail, [8, 9])
        np.testing.assert_array_equal(head, [0, 1, 2, 3, 4, 5, 6, 7])
def testTrainTestSplitInvalidSizes2(self):
    """Out-of-range size values must be rejected with ``ValueError``.

    Each of -10, 0 and 11 is tried first as ``train_size`` and then as
    ``test_size`` on a ten-element input, with the other size fixed at 0.8.
    """
    invalid_values = (-10, 0, 11)
    # Invalid train sizes first, then invalid test sizes.
    cases = [(bad, 0.8) for bad in invalid_values]
    cases += [(0.8, bad) for bad in invalid_values]

    expected_message = r'should be .* in the \(0, 1\) range'
    for train_size, test_size in cases:
        with pytest.raises(ValueError, match=expected_message):
            train_test_split(
                range(10), train_size=train_size, test_size=test_size)
def test_train_test_split_invalid_sizes1(setup):
    """Float sizes outside the open interval (0, 1) must raise ``ValueError``.

    Each of 1.2, 1.0, 0.0 and -0.2 is tried first as ``train_size`` and then
    as ``test_size`` on a ten-element input, with the other size fixed at 0.8.

    Args:
        setup: not used in the body; presumably a pytest fixture that
            prepares the execution environment (confirm against conftest).
    """
    invalid_fractions = (1.2, 1., 0.0, -.2)
    # Invalid train sizes first, then invalid test sizes.
    cases = [(bad, 0.8) for bad in invalid_fractions]
    cases += [(0.8, bad) for bad in invalid_fractions]

    expected_message = r'should be .* in the \(0, 1\) range'
    for train_size, test_size in cases:
        with pytest.raises(ValueError, match=expected_message):
            train_test_split(
                range(10), train_size=train_size, test_size=test_size)
def test_mixied_input_type_train_test_split(setup):
    """Check that every split output keeps the type of its input.

    A 10x4 random frame is wrapped in ``md.DataFrame`` with ``chunk_size=5``.
    All columns but the last form the features and the last column forms
    the target. Both are split with a fixed ``random_state`` and each output
    is asserted to be an instance of the corresponding input's type.

    Args:
        setup: not used in the body; presumably a pytest fixture that
            prepares the execution environment (confirm against conftest).
    """
    rs = np.random.RandomState(0)
    df_raw = pd.DataFrame(rs.rand(10, 4))
    df = md.DataFrame(df_raw, chunk_size=5)
    X, y = df.iloc[:, :-1], df.iloc[:, -1]

    # NOTE(review): range(1) yields only 0, so (0, 0) is the single
    # combination exercised and the tensor conversions below never run.
    # Widening both to range(2) would cover the mixed cases the test name
    # promises; confirm those combinations are supported before doing so.
    for x_to_tensor, y_to_tensor in itertools.product(range(1), range(1)):
        x = mt.tensor(X) if x_to_tensor else X
        yy = mt.tensor(y) if y_to_tensor else y

        # Split ``yy`` rather than the raw ``y``: the assertions below compare
        # against ``type(yy)``, so the value that is split must be the same
        # object that was (possibly) converted.
        x_train, x_test, y_train, y_test = train_test_split(
            x, yy, random_state=0,
            run_kwargs={'extra_config': {'check_nsplits': False}})

        assert isinstance(x_train, type(x))
        assert isinstance(x_test, type(x))
        assert isinstance(y_train, type(yy))
        assert isinstance(y_test, type(yy))
def testDistributedSplit(self):
    """Run ``train_test_split`` through a session on the local web endpoint.

    Opens a session against ``http://127.0.0.1:<self.web_port>``, splits the
    features and target taken from a 10x4 random ``md.DataFrame``
    (``chunk_size=5``), and asserts that each output is an instance of the
    corresponding input's type.
    """
    service_ep = 'http://127.0.0.1:' + self.web_port
    # Bound the run to 120 when the CI environment variable is present;
    # -1 otherwise (presumably meaning "no timeout"; confirm).
    timeout = 120 if 'CI' in os.environ else -1

    with new_session(service_ep) as sess:
        run_kwargs = {'timeout': timeout}

        rs = np.random.RandomState(0)
        df_raw = pd.DataFrame(rs.rand(10, 4))
        df = md.DataFrame(df_raw, chunk_size=5)
        X, y = df.iloc[:, :-1], df.iloc[:, -1]

        # NOTE(review): range(1) yields only 0, so (0, 0) is the single
        # combination exercised and the tensor conversions below never run.
        # Widening both to range(2) would cover mixed inputs; confirm those
        # combinations are supported before doing so.
        for x_to_tensor, y_to_tensor in itertools.product(
                range(1), range(1)):
            x = mt.tensor(X) if x_to_tensor else X
            yy = mt.tensor(y) if y_to_tensor else y

            # Split ``yy`` rather than the raw ``y`` so the value being split
            # is the one the type assertions compare against.
            x_train, x_test, y_train, y_test = train_test_split(
                x, yy, random_state=0, session=sess, run_kwargs=run_kwargs)

            self.assertIsInstance(x_train, type(x))
            self.assertIsInstance(x_test, type(x))
            self.assertIsInstance(y_train, type(yy))
            self.assertIsInstance(y_test, type(yy))
def testTrainTestSplitDataFrame(self):
    """Splitting a data frame must yield ``DATAFRAME_TYPE`` outputs.

    The same ten ones are wrapped first in ``pd.DataFrame`` and then in
    ``md.DataFrame``; for each wrapper both parts of the split are checked
    against ``DATAFRAME_TYPE``.
    """
    ones = np.ones(10)
    for frame_cls in (pd.DataFrame, md.DataFrame):
        frame = frame_cls(ones)
        train_part, test_part = train_test_split(frame)
        for part in (train_part, test_part):
            assert isinstance(part, DATAFRAME_TYPE)
def testTrainTestSplitErrors(self):
    """Invalid calls to ``train_test_split`` must raise the expected error.

    Every case below expects ``ValueError`` except the unknown keyword
    argument, which expects ``TypeError``. The final case also checks the
    error message against a pattern.
    """
    # No arrays at all.
    with self.assertRaises(ValueError):
        train_test_split()

    # Float train size above 1.
    with self.assertRaises(ValueError):
        train_test_split(range(3), train_size=1.1)

    # Fractions that add up to more than 1, as Python floats ...
    with self.assertRaises(ValueError):
        train_test_split(range(3), test_size=0.6, train_size=0.6)

    # ... and as NumPy float32 scalars.
    with self.assertRaises(ValueError):
        train_test_split(range(3), test_size=np.float32(0.6),
                         train_size=np.float32(0.6))

    # Size that is neither an int nor a float.
    with self.assertRaises(ValueError):
        train_test_split(range(3), test_size="wrong_type")

    # Integer sizes on a three-element input.
    with self.assertRaises(ValueError):
        train_test_split(range(3), test_size=2, train_size=4)

    # Unknown keyword argument.
    with self.assertRaises(TypeError):
        train_test_split(range(3), some_argument=1.1)

    # Inputs of different lengths.
    with self.assertRaises(ValueError):
        train_test_split(range(3), range(42))

    # Stratification requested together with shuffle=False.
    with self.assertRaises(ValueError):
        train_test_split(range(10), shuffle=False, stratify=True)

    # Integer train size larger than the number of samples; the message
    # must name the offending value and the sample count.
    expected_message = (r'train_size=11 should be either positive and '
                        r'smaller than the number of samples 10 or a '
                        r'float in the \(0, 1\) range')
    with pytest.raises(ValueError, match=expected_message):
        train_test_split(range(10), train_size=11, test_size=1)
def testTrainTestplitListInput(self):
    """Labels given as a list must split the same way as an array.

    The same seven labels are supplied as a list of strings, a NumPy array
    and a list of floats. With a fixed ``random_state`` the resulting parts
    are compared across the three representations.
    """
    X = np.ones(7)
    string_labels = ['1'] * 4 + ['0'] * 3
    array_labels = np.hstack((np.ones(4), np.zeros(3)))
    list_labels = array_labels.tolist()

    # Only the non-stratified path is iterated here (presumably because
    # stratified splitting is not available; confirm).
    for stratify in (False, ):

        def split_with(labels):
            # One split of X against ``labels``, stratified on the labels
            # themselves when ``stratify`` is truthy.
            return train_test_split(
                X, labels,
                stratify=labels if stratify else None,
                random_state=0)

        X_train1, X_test1, y_train1, y_test1 = split_with(string_labels)
        X_train2, X_test2, y_train2, y_test2 = split_with(array_labels)
        X_train3, X_test3, y_train3, y_test3 = split_with(list_labels)

        np.testing.assert_equal(X_train1, X_train2)
        np.testing.assert_equal(y_train2, y_train3)
        np.testing.assert_equal(X_test1, X_test3)
        np.testing.assert_equal(y_test3, y_test2)
def testTrainTestSplitSparse(self):
    """Sparse inputs must produce parts that fetch as ``SparseNDArray``.

    A 10x10 matrix is converted to CSR, CSC and COO form. Each form is
    split both as the raw scipy matrix and wrapped in ``mt.tensor`` with
    ``chunk_size=(2, 5)``, and both fetched parts are type-checked.
    """
    dense = np.arange(100).reshape((10, 10))
    for sparse_cls in (sps.csr_matrix, sps.csc_matrix, sps.coo_matrix):
        sparse_matrix = sparse_cls(dense)
        candidates = (sparse_matrix,
                      mt.tensor(sparse_matrix, chunk_size=(2, 5)))
        for candidate in candidates:
            train_part, test_part = train_test_split(candidate)
            assert isinstance(train_part.fetch(), SparseNDArray)
            assert isinstance(test_part.fetch(), SparseNDArray)
def testMixiedInputTypeTrainTestSplit(self):
    """Check that every split output keeps the type of its input.

    A 10x4 random frame is wrapped in ``md.DataFrame`` with ``chunk_size=5``.
    All columns but the last form the features and the last column forms
    the target. Both are split with a fixed ``random_state`` and each output
    is asserted to be an instance of the corresponding input's type.
    """
    rs = np.random.RandomState(0)
    df_raw = pd.DataFrame(rs.rand(10, 4))
    df = md.DataFrame(df_raw, chunk_size=5)
    X, y = df.iloc[:, :-1], df.iloc[:, -1]

    # NOTE(review): range(1) yields only 0, so (0, 0) is the single
    # combination exercised and the tensor conversions below never run.
    # Widening both to range(2) would cover the mixed cases the test name
    # promises; confirm those combinations are supported before doing so.
    for x_to_tensor, y_to_tensor in itertools.product(range(1), range(1)):
        x = mt.tensor(X) if x_to_tensor else X
        yy = mt.tensor(y) if y_to_tensor else y

        # Split ``yy`` rather than the raw ``y``: the assertions below compare
        # against ``type(yy)``, so the value that is split must be the same
        # object that was (possibly) converted.
        x_train, x_test, y_train, y_test = train_test_split(
            x, yy, random_state=0)

        self.assertIsInstance(x_train, type(x))
        self.assertIsInstance(x_test, type(x))
        self.assertIsInstance(y_train, type(yy))
        self.assertIsInstance(y_test, type(yy))