예제 #1
0
    def testLocalClassifierFromToParquet(self):
        """Predict from parquet input to parquet output with a saved booster.

        A two-round booster is trained on seeded random data and saved to
        disk. The features are written as two parquet files, read back
        through ``md.read_parquet``, scored lazily (``run=False``) by an
        ``XGBClassifier`` loaded from the saved model, and the result is
        written to parquet. The tiled graph must consist of exactly two
        ``Fuse`` nodes, and the stored labels must match what
        ``xgboost.XGBClassifier`` produces from the same model file.
        """
        row_count, column_count = 1000, 10
        rng = np.random.RandomState(0)
        features = rng.rand(row_count, column_count)
        targets = rng.rand(row_count)
        column_names = [f'c{idx}' for idx in range(column_count)]
        raw_frame = pd.DataFrame(features, columns=column_names)
        raw_frame['id'] = [f'i{idx}' for idx in range(row_count)]

        trained = xgboost.train(
            {}, xgboost.DMatrix(features, targets), num_boost_round=2)

        with tempfile.TemporaryDirectory() as tmp_dir:
            model_path = os.path.join(tmp_dir, 'c.model')
            output_dir = os.path.join(tmp_dir, 'result')
            os.mkdir(output_dir)
            input_dir = os.path.join(tmp_dir, 'data')
            os.mkdir(input_dir)

            trained.save_model(model_path)

            # split the input rows across two parquet files
            first_half = raw_frame.iloc[:500]
            second_half = raw_frame.iloc[500:]
            first_half.to_parquet(os.path.join(input_dir, 'data1.parquet'))
            second_half.to_parquet(os.path.join(input_dir, 'data2.parquet'))

            indexed_input = md.read_parquet(input_dir).set_index('id')
            loaded_model = XGBClassifier()
            loaded_model.load_model(model_path)
            lazy_labels = loaded_model.predict(indexed_input, run=False)
            writer = md.DataFrame(lazy_labels).to_parquet(output_dir)

            # tiles to ensure no iterative tiling exists
            tiled_graph = writer.build_graph(tiled=True)
            only_fused = all(isinstance(node.op, Fuse) for node in tiled_graph)
            self.assertTrue(only_fused)
            self.assertEqual(len(tiled_graph), 2)
            writer.execute()

            stored = md.read_parquet(output_dir).to_pandas()
            actual = stored.iloc[:, 0].to_numpy()

            reference_model = xgboost.XGBClassifier()
            reference_model.load_model(model_path)
            reference_output = reference_model.predict(features)
            expected = np.stack(
                [1 - reference_output, reference_output]).argmax(axis=0)
            np.testing.assert_array_equal(actual, expected)
예제 #2
0
    def testDistributedXGBClassifier(self):
        """Fit and predict with XGBClassifier through a session on the web endpoint.

        The session is opened against ``http://127.0.0.1:<self.web_port>``.
        Under CI (``'CI'`` present in the environment) runs are bounded by
        a 120 second timeout; otherwise ``-1`` is passed as the timeout.
        The test checks the prediction's rank and length and the layout of
        the evaluation history returned by ``evals_result()``.
        """
        service_ep = 'http://127.0.0.1:' + self.web_port
        timeout = 120 if 'CI' in os.environ else -1
        with new_session(service_ep) as sess:
            run_kwargs = {'timeout': timeout}

            X, y = self.X, self.y
            # scale and cast the target to int32 class labels
            y = (y * 10).astype(mt.int32)
            classifier = XGBClassifier(verbosity=1, n_estimators=2)
            classifier.fit(X,
                           y,
                           eval_set=[(X, y)],
                           session=sess,
                           run_kwargs=run_kwargs)
            prediction = classifier.predict(X,
                                            session=sess,
                                            run_kwargs=run_kwargs)

            self.assertEqual(prediction.ndim, 1)
            self.assertEqual(prediction.shape[0], len(self.X))

            history = classifier.evals_result()

            self.assertIsInstance(prediction, mt.Tensor)
            self.assertIsInstance(history, dict)

            self.assertEqual(list(history)[0], 'validation_0')
            # default metrics may differ, see https://github.com/dmlc/xgboost/pull/6183
            # so accept either name instead of hard-coding 'merror'
            eval_metric = list(history['validation_0'])[0]
            self.assertIn(eval_metric, ('merror', 'mlogloss'))
            self.assertEqual(len(history['validation_0']), 1)
            # one recorded value per boosting round (n_estimators=2)
            self.assertEqual(len(history['validation_0'][eval_metric]), 2)
예제 #3
0
def test_local_classifier(setup):
    """Exercise XGBClassifier fit/predict locally over several input kinds.

    Covers tensor input with an eval set, dataframe input, sample weights
    given as tensor / series / dataframe, rejection of 2-d weights, a
    binary target, boolean-filtered inputs for prediction and training,
    and rejection of unknown keyword arguments.

    ``setup`` is not referenced in the body; presumably it is a pytest
    fixture that prepares the execution environment.
    """
    def make_classifier():
        return XGBClassifier(verbosity=1, n_estimators=2)

    def check_full_prediction(pred):
        # one label per row of the full feature tensor
        assert pred.ndim == 1
        assert pred.shape[0] == len(X_raw)

    labels = (y_raw * 10).astype(mt.int32)

    # tensor input with an evaluation set
    clf = make_classifier()
    clf.fit(X_raw, labels, eval_set=[(X_raw, labels)])
    pred = clf.predict(X_raw)
    check_full_prediction(pred)

    evals = clf.evals_result()

    assert isinstance(pred, mt.Tensor)
    assert isinstance(evals, dict)

    assert list(evals)[0] == 'validation_0'
    # default metrics may differ, see https://github.com/dmlc/xgboost/pull/6183
    metric_name = list(evals['validation_0'])[0]
    assert metric_name in ('merror', 'mlogloss')
    assert len(evals['validation_0']) == 1
    assert len(evals['validation_0'][metric_name]) == 2

    probabilities = clf.predict_proba(X_raw)
    assert probabilities.shape == X_raw.shape

    # test dataframe
    frame_features = X_df_raw
    clf = make_classifier()
    clf.fit(frame_features, labels)
    check_full_prediction(clf.predict(frame_features))

    # test weight
    n_samples = X_raw.shape[0]
    weight_variants = (
        mt.random.rand(n_samples),
        md.Series(mt.random.rand(n_samples)),
        md.DataFrame(mt.random.rand(n_samples)),
    )
    labels_frame = md.DataFrame(labels)
    for sample_weight in weight_variants:
        clf = make_classifier()
        clf.fit(X_raw, labels_frame, sample_weights=sample_weight)
        check_full_prediction(clf.predict(X_raw))

    # should raise error if weight.ndim > 1
    with pytest.raises(ValueError):
        rejected = make_classifier()
        rejected.fit(X_raw, labels_frame,
                     sample_weights=mt.random.rand(1, 1))

    # test binary classifier
    binary_labels = (labels > 0.5).astype(mt.int32)
    clf = make_classifier()
    clf.fit(X_raw, binary_labels)
    check_full_prediction(clf.predict(X_raw))

    # test predict data with unknown shape
    filtered_features = X_raw[X_raw[:, 0] > 0.1].astype(mt.int32)
    pred = clf.predict(filtered_features)
    assert pred.ndim == 1

    # test train with unknown shape
    row_mask = X_raw[:, 0] > 0
    masked_features = X_raw[row_mask]
    masked_labels = labels[row_mask]
    clf = make_classifier()
    clf.fit(masked_features, masked_labels)
    check_full_prediction(clf.predict(X_raw))

    # unknown keyword arguments are rejected by both fit and predict
    clf = make_classifier()
    with pytest.raises(TypeError):
        clf.fit(X_raw, labels, wrong_param=1)
    clf.fit(X_raw, labels)
    with pytest.raises(TypeError):
        clf.predict(X_raw, wrong_param=1)
예제 #4
0
    def testLocalClassifier(self):
        """Exercise XGBClassifier fit/predict locally over several input kinds.

        Covers tensor input with an eval set, dataframe input, sample
        weights given as tensor / series / dataframe, rejection of 2-d
        weights, a binary target, boolean-filtered inputs for prediction
        and training, and rejection of unknown keyword arguments.
        """
        def make_classifier():
            return XGBClassifier(verbosity=1, n_estimators=2)

        def check_full_prediction(pred):
            # one label per row of the full feature tensor
            self.assertEqual(pred.ndim, 1)
            self.assertEqual(pred.shape[0], len(self.X))

        features, raw_target = self.X, self.y
        labels = (raw_target * 10).astype(mt.int32)

        # tensor input with an evaluation set
        clf = make_classifier()
        clf.fit(features, labels, eval_set=[(features, labels)])
        pred = clf.predict(features)
        check_full_prediction(pred)

        evals = clf.evals_result()

        self.assertIsInstance(pred, mt.Tensor)
        self.assertIsInstance(evals, dict)

        self.assertEqual(list(evals)[0], 'validation_0')
        # default metrics may differ, see https://github.com/dmlc/xgboost/pull/6183
        metric_name = list(evals['validation_0'])[0]
        self.assertIn(metric_name, ('merror', 'mlogloss'))
        self.assertEqual(len(evals['validation_0']), 1)
        self.assertEqual(len(evals['validation_0'][metric_name]), 2)

        probabilities = clf.predict_proba(features)
        self.assertEqual(probabilities.shape, features.shape)

        # test dataframe
        frame_features = self.X_df
        clf = make_classifier()
        clf.fit(frame_features, labels)
        check_full_prediction(clf.predict(frame_features))

        # test weight
        n_samples = features.shape[0]
        weight_variants = (
            mt.random.rand(n_samples),
            md.Series(mt.random.rand(n_samples)),
            md.DataFrame(mt.random.rand(n_samples)),
        )
        # note: the frame is built from the unscaled self.y
        target_frame = md.DataFrame(self.y)
        for sample_weight in weight_variants:
            clf = make_classifier()
            clf.fit(features, target_frame, sample_weights=sample_weight)
            check_full_prediction(clf.predict(features))

        # should raise error if weight.ndim > 1
        with self.assertRaises(ValueError):
            rejected = make_classifier()
            rejected.fit(features, target_frame,
                         sample_weights=mt.random.rand(1, 1))

        # test binary classifier
        binary_labels = (self.y > 0.5).astype(mt.int32)
        clf = make_classifier()
        clf.fit(features, binary_labels)
        check_full_prediction(clf.predict(features))

        # test predict data with unknown shape
        filtered_features = features[features[:, 0] > 0.1].astype(mt.int32)
        pred = clf.predict(filtered_features)
        self.assertEqual(pred.ndim, 1)

        # test train with unknown shape
        row_mask = features[:, 0] > 0
        masked_features = features[row_mask]
        masked_labels = labels[row_mask]
        clf = make_classifier()
        clf.fit(masked_features, masked_labels)
        check_full_prediction(clf.predict(features))

        # unknown keyword arguments are rejected by both fit and predict
        clf = make_classifier()
        with self.assertRaises(TypeError):
            clf.fit(features, labels, wrong_param=1)
        clf.fit(features, labels)
        with self.assertRaises(TypeError):
            clf.predict(features, wrong_param=1)
예제 #5
0
    def testLocalClassifier(self):
        """Exercise XGBClassifier fit/predict locally over several input kinds.

        Covers tensor input with an eval set, dataframe input, sample
        weights given as tensor / series / dataframe, rejection of 2-d
        weights, a binary target, and rejection of unknown keyword
        arguments.
        """
        X, y = self.X, self.y
        # scale and cast the target to int32 class labels
        y = (y * 10).astype(mt.int32)
        classifier = XGBClassifier(verbosity=1, n_estimators=2)
        classifier.fit(X, y, eval_set=[(X, y)])
        prediction = classifier.predict(X)

        self.assertEqual(prediction.ndim, 1)
        self.assertEqual(prediction.shape[0], len(self.X))

        history = classifier.evals_result()

        self.assertIsInstance(prediction, mt.Tensor)
        self.assertIsInstance(history, dict)

        self.assertEqual(list(history)[0], 'validation_0')
        # default metrics may differ, see https://github.com/dmlc/xgboost/pull/6183
        # so accept either name instead of hard-coding 'merror'
        eval_metric = list(history['validation_0'])[0]
        self.assertIn(eval_metric, ('merror', 'mlogloss'))
        self.assertEqual(len(history['validation_0']), 1)
        # one recorded value per boosting round (n_estimators=2)
        self.assertEqual(len(history['validation_0'][eval_metric]), 2)

        prob = classifier.predict_proba(X)
        self.assertEqual(prob.shape, X.shape)

        # test dataframe
        X_df = self.X_df
        classifier = XGBClassifier(verbosity=1, n_estimators=2)
        classifier.fit(X_df, y)
        prediction = classifier.predict(X_df)

        self.assertEqual(prediction.ndim, 1)
        self.assertEqual(prediction.shape[0], len(self.X))

        # test weight
        weights = [mt.random.rand(X.shape[0]), md.Series(mt.random.rand(X.shape[0])),
                   md.DataFrame(mt.random.rand(X.shape[0]))]
        # note: the frame is built from the unscaled self.y
        y_df = md.DataFrame(self.y)
        for weight in weights:
            classifier = XGBClassifier(verbosity=1, n_estimators=2)
            classifier.fit(X, y_df, sample_weights=weight)
            prediction = classifier.predict(X)

            self.assertEqual(prediction.ndim, 1)
            self.assertEqual(prediction.shape[0], len(self.X))

        # should raise error if weight.ndim > 1
        with self.assertRaises(ValueError):
            XGBClassifier(verbosity=1, n_estimators=2).fit(
                X, y_df, sample_weights=mt.random.rand(1, 1))

        # test binary classifier
        new_y = (self.y > 0.5).astype(mt.int32)
        classifier = XGBClassifier(verbosity=1, n_estimators=2)
        classifier.fit(X, new_y)
        prediction = classifier.predict(X)

        self.assertEqual(prediction.ndim, 1)
        self.assertEqual(prediction.shape[0], len(self.X))

        # unknown keyword arguments are rejected by both fit and predict
        classifier = XGBClassifier(verbosity=1, n_estimators=2)
        with self.assertRaises(TypeError):
            classifier.fit(X, y, wrong_param=1)
        classifier.fit(X, y)
        with self.assertRaises(TypeError):
            classifier.predict(X, wrong_param=1)
예제 #6
0
    def testDistributedXGBClassifier(self):
        """Fit and predict with XGBClassifier through a session on the web endpoint.

        Two scenarios run inside one session opened against
        ``http://127.0.0.1:<self.web_port>``: tensor inputs with an eval
        set (checking the prediction and the evaluation history layout),
        and chunked dataframe inputs (checking that the prediction is an
        ``md.Series`` with one entry per input row). Under CI the runs use
        a 120 second timeout; otherwise ``-1`` is passed.
        """
        endpoint = 'http://127.0.0.1:' + self.web_port
        if 'CI' in os.environ:
            wait_limit = 120
        else:
            wait_limit = -1

        with new_session(endpoint) as session:
            exec_options = {'timeout': wait_limit}

            # --- tensor inputs with an evaluation set ---
            features, raw_target = self.X, self.y
            labels = (raw_target * 10).astype(mt.int32)
            model = XGBClassifier(verbosity=1, n_estimators=2)
            model.fit(features, labels,
                      eval_set=[(features, labels)],
                      session=session, run_kwargs=exec_options)
            predicted = model.predict(
                features, session=session, run_kwargs=exec_options)

            self.assertEqual(predicted.ndim, 1)
            self.assertEqual(predicted.shape[0], len(self.X))

            evals = model.evals_result()

            self.assertIsInstance(predicted, mt.Tensor)
            self.assertIsInstance(evals, dict)

            self.assertEqual(list(evals)[0], 'validation_0')
            # default metrics may differ, see https://github.com/dmlc/xgboost/pull/6183
            metric_name = list(evals['validation_0'])[0]
            self.assertIn(metric_name, ('merror', 'mlogloss'))
            self.assertEqual(len(evals['validation_0']), 1)
            self.assertEqual(len(evals['validation_0'][metric_name]), 2)

            # --- chunked dataframe inputs ---
            frame_features = md.DataFrame(
                np.random.rand(100, 20), chunk_size=20)
            frame_labels = md.DataFrame(
                np.random.randint(0, 2, (100, 1)), chunk_size=20)
            model = XGBClassifier(verbosity=1, n_estimators=2)
            model.fit(frame_features, frame_labels,
                      session=session, run_kwargs=exec_options)
            predicted = model.predict(
                frame_features, session=session, run_kwargs=exec_options)

            self.assertIsInstance(predicted, md.Series)
            self.assertEqual(predicted.shape[0], len(frame_features))