def testLocalClassifierFromToParquet(self):
    """Round-trip a classifier through parquet: train a raw xgboost booster,
    persist the model and the input data to disk, predict lazily through the
    distributed XGBClassifier into a parquet sink, then compare the written
    result against a plain xgboost prediction on the same data.
    """
    row_count, col_count = 1000, 10
    rand_state = np.random.RandomState(0)
    features = rand_state.rand(row_count, col_count)
    target = rand_state.rand(row_count)
    frame = pd.DataFrame(features, columns=[f'c{i}' for i in range(col_count)])
    frame['id'] = [f'i{i}' for i in range(row_count)]
    booster = xgboost.train({}, xgboost.DMatrix(features, target),
                            num_boost_round=2)
    with tempfile.TemporaryDirectory() as tmp_dir:
        model_path = os.path.join(tmp_dir, 'c.model')
        result_dir = os.path.join(tmp_dir, 'result')
        os.mkdir(result_dir)
        data_dir = os.path.join(tmp_dir, 'data')
        os.mkdir(data_dir)
        booster.save_model(model_path)
        # split the input across two parquet files so the read is chunked
        frame.iloc[:500].to_parquet(os.path.join(data_dir, 'data1.parquet'))
        frame.iloc[500:].to_parquet(os.path.join(data_dir, 'data2.parquet'))

        in_df = md.read_parquet(data_dir).set_index('id')
        model = XGBClassifier()
        model.load_model(model_path)
        lazy_pred = model.predict(in_df, run=False)
        sink = md.DataFrame(lazy_pred).to_parquet(result_dir)
        # tiles to ensure no iterative tiling exists
        graph = sink.build_graph(tiled=True)
        self.assertTrue(all(isinstance(node.op, Fuse) for node in graph))
        self.assertEqual(len(graph), 2)
        sink.execute()

        actual = md.read_parquet(result_dir).to_pandas().iloc[:, 0].to_numpy()
        reference = xgboost.XGBClassifier()
        reference.load_model(model_path)
        raw_pred = reference.predict(features)
        expected = np.stack([1 - raw_pred, raw_pred]).argmax(axis=0)
        np.testing.assert_array_equal(actual, expected)
def testDistributedXGBClassifier(self):
    """Fit and predict an XGBClassifier against a distributed web session
    and verify the prediction shape and the recorded evaluation history.
    """
    service_ep = 'http://127.0.0.1:' + self.web_port
    # bound the wait on CI machines; wait indefinitely when running locally
    timeout = 120 if 'CI' in os.environ else -1
    with new_session(service_ep) as sess:
        run_kwargs = {'timeout': timeout}
        X, y = self.X, self.y
        y = (y * 10).astype(mt.int32)  # float targets -> multiclass labels
        classifier = XGBClassifier(verbosity=1, n_estimators=2)
        classifier.fit(X, y, eval_set=[(X, y)], session=sess,
                       run_kwargs=run_kwargs)
        prediction = classifier.predict(X, session=sess, run_kwargs=run_kwargs)
        self.assertEqual(prediction.ndim, 1)
        self.assertEqual(prediction.shape[0], len(self.X))
        history = classifier.evals_result()
        self.assertIsInstance(prediction, mt.Tensor)
        self.assertIsInstance(history, dict)
        self.assertEqual(list(history)[0], 'validation_0')
        # default metrics may differ across xgboost versions,
        # see https://github.com/dmlc/xgboost/pull/6183 -- accept either
        eval_metric = list(history['validation_0'])[0]
        self.assertIn(eval_metric, ('merror', 'mlogloss'))
        self.assertEqual(len(history['validation_0']), 1)
        # two boosting rounds -> two recorded metric values
        self.assertEqual(len(history['validation_0'][eval_metric]), 2)
def test_local_classifier(setup):
    """End-to-end check of XGBClassifier on local data: multiclass
    fit/predict, evaluation history, DataFrame input, sample weights,
    binary labels, and inputs/training data with unknown chunk shapes.
    """

    def fresh_clf():
        # every sub-scenario trains its own small two-round estimator
        return XGBClassifier(verbosity=1, n_estimators=2)

    labels = (y_raw * 10).astype(mt.int32)
    clf = fresh_clf()
    clf.fit(X_raw, labels, eval_set=[(X_raw, labels)])
    pred = clf.predict(X_raw)
    assert pred.ndim == 1
    assert pred.shape[0] == len(X_raw)

    history = clf.evals_result()
    assert isinstance(pred, mt.Tensor)
    assert isinstance(history, dict)
    assert list(history)[0] == 'validation_0'
    # default metrics may differ, see https://github.com/dmlc/xgboost/pull/6183
    eval_metric = list(history['validation_0'])[0]
    assert eval_metric in ('merror', 'mlogloss')
    assert len(history['validation_0']) == 1
    assert len(history['validation_0'][eval_metric]) == 2

    prob = clf.predict_proba(X_raw)
    assert prob.shape == X_raw.shape

    # test dataframe
    frame_in = X_df_raw
    clf = fresh_clf()
    clf.fit(frame_in, labels)
    pred = clf.predict(frame_in)
    assert pred.ndim == 1
    assert pred.shape[0] == len(X_raw)

    # test weight: tensor, Series and DataFrame weights are all accepted
    weight_variants = [
        mt.random.rand(X_raw.shape[0]),
        md.Series(mt.random.rand(X_raw.shape[0])),
        md.DataFrame(mt.random.rand(X_raw.shape[0])),
    ]
    labels_df = md.DataFrame(labels)
    for w in weight_variants:
        clf = fresh_clf()
        clf.fit(X_raw, labels_df, sample_weights=w)
        pred = clf.predict(X_raw)
        assert pred.ndim == 1
        assert pred.shape[0] == len(X_raw)

    # should raise error if weight.ndim > 1
    with pytest.raises(ValueError):
        fresh_clf().fit(X_raw, labels_df, sample_weights=mt.random.rand(1, 1))

    # test binary classifier
    binary_labels = (labels > 0.5).astype(mt.int32)
    clf = fresh_clf()
    clf.fit(X_raw, binary_labels)
    pred = clf.predict(X_raw)
    assert pred.ndim == 1
    assert pred.shape[0] == len(X_raw)

    # test predict data with unknown shape
    filtered = X_raw[X_raw[:, 0] > 0.1].astype(mt.int32)
    pred = clf.predict(filtered)
    assert pred.ndim == 1

    # test train with unknown shape
    mask = X_raw[:, 0] > 0
    clf = fresh_clf()
    clf.fit(X_raw[mask], labels[mask])
    pred = clf.predict(X_raw)
    assert pred.ndim == 1
    assert pred.shape[0] == len(X_raw)

    # unknown keyword arguments must be rejected by fit and predict
    clf = fresh_clf()
    with pytest.raises(TypeError):
        clf.fit(X_raw, labels, wrong_param=1)
    clf.fit(X_raw, labels)
    with pytest.raises(TypeError):
        clf.predict(X_raw, wrong_param=1)
def testLocalClassifier(self):
    """Local-mode XGBClassifier coverage: multiclass fit/predict with
    evaluation history, DataFrame input, sample weights, binary labels,
    unknown-shape predict/train inputs and keyword-argument validation.

    Relies on the fixtures ``self.X``, ``self.y`` and ``self.X_df`` set up
    elsewhere in the test class.
    """
    X, y = self.X, self.y
    # scale float targets into small integers so they act as class labels
    y = (y * 10).astype(mt.int32)
    classifier = XGBClassifier(verbosity=1, n_estimators=2)
    classifier.fit(X, y, eval_set=[(X, y)])
    prediction = classifier.predict(X)
    self.assertEqual(prediction.ndim, 1)
    self.assertEqual(prediction.shape[0], len(self.X))
    history = classifier.evals_result()
    self.assertIsInstance(prediction, mt.Tensor)
    self.assertIsInstance(history, dict)
    self.assertEqual(list(history)[0], 'validation_0')
    # default metrics may differ, see https://github.com/dmlc/xgboost/pull/6183
    eval_metric = list(history['validation_0'])[0]
    self.assertIn(eval_metric, ('merror', 'mlogloss'))
    self.assertEqual(len(history['validation_0']), 1)
    # n_estimators=2 -> one metric value per boosting round
    self.assertEqual(len(history['validation_0'][eval_metric]), 2)
    prob = classifier.predict_proba(X)
    self.assertEqual(prob.shape, X.shape)
    # test dataframe
    X_df = self.X_df
    classifier = XGBClassifier(verbosity=1, n_estimators=2)
    classifier.fit(X_df, y)
    prediction = classifier.predict(X_df)
    self.assertEqual(prediction.ndim, 1)
    self.assertEqual(prediction.shape[0], len(self.X))
    # test weight: tensor, Series and DataFrame weights are all accepted
    weights = [
        mt.random.rand(X.shape[0]),
        md.Series(mt.random.rand(X.shape[0])),
        md.DataFrame(mt.random.rand(X.shape[0]))
    ]
    y_df = md.DataFrame(self.y)
    for weight in weights:
        classifier = XGBClassifier(verbosity=1, n_estimators=2)
        classifier.fit(X, y_df, sample_weights=weight)
        prediction = classifier.predict(X)
        self.assertEqual(prediction.ndim, 1)
        self.assertEqual(prediction.shape[0], len(self.X))
    # should raise error if weight.ndim > 1
    with self.assertRaises(ValueError):
        XGBClassifier(verbosity=1, n_estimators=2).fit(X, y_df,
                                                       sample_weights=mt.random.rand(
                                                           1, 1))
    # test binary classifier
    new_y = (self.y > 0.5).astype(mt.int32)
    classifier = XGBClassifier(verbosity=1, n_estimators=2)
    classifier.fit(X, new_y)
    prediction = classifier.predict(X)
    self.assertEqual(prediction.ndim, 1)
    self.assertEqual(prediction.shape[0], len(self.X))
    # test predict data with unknown shape (boolean filtering hides the
    # result length until execution time)
    X2 = X[X[:, 0] > 0.1].astype(mt.int32)
    prediction = classifier.predict(X2)
    self.assertEqual(prediction.ndim, 1)
    # test train with unknown shape
    cond = X[:, 0] > 0
    X3 = X[cond]
    y3 = y[cond]
    classifier = XGBClassifier(verbosity=1, n_estimators=2)
    classifier.fit(X3, y3)
    prediction = classifier.predict(X)
    self.assertEqual(prediction.ndim, 1)
    self.assertEqual(prediction.shape[0], len(self.X))
    # unknown keyword arguments must be rejected by fit and predict
    classifier = XGBClassifier(verbosity=1, n_estimators=2)
    with self.assertRaises(TypeError):
        classifier.fit(X, y, wrong_param=1)
    classifier.fit(X, y)
    with self.assertRaises(TypeError):
        classifier.predict(X, wrong_param=1)
def testLocalClassifier(self):
    """Local-mode XGBClassifier coverage: multiclass fit/predict with
    evaluation history, DataFrame input, sample weights, binary labels
    and keyword-argument validation.
    """
    X, y = self.X, self.y
    y = (y * 10).astype(mt.int32)  # float targets -> multiclass labels
    classifier = XGBClassifier(verbosity=1, n_estimators=2)
    classifier.fit(X, y, eval_set=[(X, y)])
    prediction = classifier.predict(X)
    self.assertEqual(prediction.ndim, 1)
    self.assertEqual(prediction.shape[0], len(self.X))
    history = classifier.evals_result()
    self.assertIsInstance(prediction, mt.Tensor)
    self.assertIsInstance(history, dict)
    self.assertEqual(list(history)[0], 'validation_0')
    # default metrics may differ across xgboost versions,
    # see https://github.com/dmlc/xgboost/pull/6183 -- accept either
    eval_metric = list(history['validation_0'])[0]
    self.assertIn(eval_metric, ('merror', 'mlogloss'))
    self.assertEqual(len(history['validation_0']), 1)
    # n_estimators=2 -> one metric value per boosting round
    self.assertEqual(len(history['validation_0'][eval_metric]), 2)
    prob = classifier.predict_proba(X)
    self.assertEqual(prob.shape, X.shape)
    # test dataframe
    X_df = self.X_df
    classifier = XGBClassifier(verbosity=1, n_estimators=2)
    classifier.fit(X_df, y)
    prediction = classifier.predict(X_df)
    self.assertEqual(prediction.ndim, 1)
    self.assertEqual(prediction.shape[0], len(self.X))
    # test weight: tensor, Series and DataFrame weights are all accepted
    weights = [mt.random.rand(X.shape[0]),
               md.Series(mt.random.rand(X.shape[0])),
               md.DataFrame(mt.random.rand(X.shape[0]))]
    y_df = md.DataFrame(self.y)
    for weight in weights:
        classifier = XGBClassifier(verbosity=1, n_estimators=2)
        classifier.fit(X, y_df, sample_weights=weight)
        prediction = classifier.predict(X)
        self.assertEqual(prediction.ndim, 1)
        self.assertEqual(prediction.shape[0], len(self.X))
    # should raise error if weight.ndim > 1
    with self.assertRaises(ValueError):
        XGBClassifier(verbosity=1, n_estimators=2).fit(
            X, y_df, sample_weights=mt.random.rand(1, 1))
    # test binary classifier
    new_y = (self.y > 0.5).astype(mt.int32)
    classifier = XGBClassifier(verbosity=1, n_estimators=2)
    classifier.fit(X, new_y)
    prediction = classifier.predict(X)
    self.assertEqual(prediction.ndim, 1)
    self.assertEqual(prediction.shape[0], len(self.X))
    # unknown keyword arguments must be rejected by fit and predict
    classifier = XGBClassifier(verbosity=1, n_estimators=2)
    with self.assertRaises(TypeError):
        classifier.fit(X, y, wrong_param=1)
    classifier.fit(X, y)
    with self.assertRaises(TypeError):
        classifier.predict(X, wrong_param=1)
def testDistributedXGBClassifier(self):
    """Distributed fit/predict through a web session: verifies the tensor
    path (shape, eval history) and the DataFrame path (Series prediction).
    """
    endpoint = 'http://127.0.0.1:' + self.web_port
    # cap the wait on CI machines; wait indefinitely when running locally
    wait_seconds = 120 if 'CI' in os.environ else -1
    with new_session(endpoint) as sess:
        exec_kwargs = {'timeout': wait_seconds}
        X, y = self.X, self.y
        y = (y * 10).astype(mt.int32)

        clf = XGBClassifier(verbosity=1, n_estimators=2)
        clf.fit(X, y, eval_set=[(X, y)], session=sess, run_kwargs=exec_kwargs)
        pred = clf.predict(X, session=sess, run_kwargs=exec_kwargs)
        self.assertEqual(pred.ndim, 1)
        self.assertEqual(pred.shape[0], len(self.X))

        history = clf.evals_result()
        self.assertIsInstance(pred, mt.Tensor)
        self.assertIsInstance(history, dict)
        self.assertEqual(list(history)[0], 'validation_0')
        # default metrics may differ, see https://github.com/dmlc/xgboost/pull/6183
        metric = list(history['validation_0'])[0]
        self.assertIn(metric, ('merror', 'mlogloss'))
        self.assertEqual(len(history['validation_0']), 1)
        self.assertEqual(len(history['validation_0'][metric]), 2)

        # DataFrame input should come back as a Series prediction
        X = md.DataFrame(np.random.rand(100, 20), chunk_size=20)
        y = md.DataFrame(np.random.randint(0, 2, (100, 1)), chunk_size=20)
        clf = XGBClassifier(verbosity=1, n_estimators=2)
        clf.fit(X, y, session=sess, run_kwargs=exec_kwargs)
        pred = clf.predict(X, session=sess, run_kwargs=exec_kwargs)
        self.assertIsInstance(pred, md.Series)
        self.assertEqual(pred.shape[0], len(X))