예제 #1
0
def test_local_classifier_from_to_parquet(setup):
    """Predict from parquet input with a pre-trained model and store labels as parquet.

    A plain ``lightgbm`` classifier is trained on random binary data, loaded
    into the Mars ``LGBMClassifier`` via ``load_model``, applied lazily
    (``run=False``) to a frame read from a parquet directory, and the
    prediction is written back to parquet.  The stored labels must equal what
    the plain ``lightgbm`` model predicts for the same rows.

    ``setup`` is not referenced in the body (presumably a pytest fixture that
    prepares the session -- confirm against the module's conftest).
    """
    n_rows, n_columns = 1000, 10
    rs = np.random.RandomState(0)
    X = rs.rand(n_rows, n_columns)
    y = (rs.rand(n_rows) > 0.5).astype(np.int32)
    column_names = [f'c{i}' for i in range(n_columns)]
    raw_df = pd.DataFrame(X, columns=column_names)

    # train the reference model with plain lightgbm
    classifier = lightgbm.LGBMClassifier(n_estimators=2)
    classifier.fit(X, y, verbose=True)

    with tempfile.TemporaryDirectory() as d:
        result_dir = os.path.join(d, 'result')
        os.mkdir(result_dir)
        data_dir = os.path.join(d, 'data')
        os.mkdir(data_dir)

        # store the input as two parquet files inside the data directory
        raw_df.iloc[:500].to_parquet(os.path.join(data_dir, 'data1.parquet'))
        raw_df.iloc[500:].to_parquet(os.path.join(data_dir, 'data2.parquet'))

        mars_df = md.read_parquet(data_dir)
        model = LGBMClassifier()
        model.load_model(classifier)
        lazy_prediction = model.predict(mars_df, run=False)
        writer = md.DataFrame(lazy_prediction).to_parquet(result_dir)

        writer.execute()

        stored = md.read_parquet(result_dir).to_pandas()
        ret = stored.iloc[:, 0].to_numpy()
        expected = classifier.predict(X)
        # stack [P(label 0), P(label 1)]-like rows and take argmax per sample
        expected = np.stack([1 - expected, expected]).argmax(axis=0)
        np.testing.assert_array_equal(ret, expected)
예제 #2
0
    def testDistributedLGBMClassifier(self):
        """Fit and predict through a web session, without and with ``eval_set``.

        Both scenarios use a fresh ``LGBMClassifier(n_estimators=2)`` and
        expect a one-dimensional ``mt.Tensor`` with one label per sample.
        """
        endpoint = 'http://127.0.0.1:' + self.web_port
        if 'CI' in os.environ:
            timeout = 120
        else:
            timeout = -1

        with new_session(endpoint) as sess:
            remote_kwargs = dict(session=sess, run_kwargs={'timeout': timeout})

            X = self.X
            y = (self.y * 10).astype(mt.int32)

            # first pass: plain fit; second pass: fit with an evaluation set
            for fit_extras in ({}, {'eval_set': [(X, y)]}):
                classifier = LGBMClassifier(n_estimators=2)
                classifier.fit(X, y, **fit_extras, **remote_kwargs)
                prediction = classifier.predict(X, **remote_kwargs)

                self.assertEqual(prediction.ndim, 1)
                self.assertEqual(prediction.shape[0], len(self.X))

                self.assertIsInstance(prediction, mt.Tensor)
        def func():
            """Train and apply a Mars ``LGBMClassifier`` on random data.

            All dependencies are imported inside the function body, so it does
            not rely on names from the enclosing module.
            NOTE(review): presumably this is meant to be shipped to and run in
            a separate environment, and the function may continue beyond the
            lines visible here -- confirm against the full test.
            """
            # lightgbm and xgboost are imported but not referenced below;
            # presumably this checks that both packages are importable where
            # the function runs -- TODO confirm.
            import lightgbm
            import xgboost
            import mars.tensor as mt
            from mars.learn.contrib.lightgbm import LGBMClassifier

            n_rows = 1000
            n_columns = 10
            chunk_size = 50
            # seeded random state so the generated data is reproducible
            rs = mt.random.RandomState(0)
            X = rs.rand(n_rows, n_columns, chunk_size=chunk_size)
            y = rs.rand(n_rows, chunk_size=chunk_size)
            # scale the random labels by 10 and cast to int32 class ids
            y = (y * 10).astype(mt.int32)
            classifier = LGBMClassifier(n_estimators=2)
            # the training data doubles as the evaluation set
            classifier.fit(X, y, eval_set=[(X, y)])
            prediction = classifier.predict(X)
예제 #4
0
    def testLocalClassifierFromToParquet(self):
        """Predict from parquet input with a pre-trained model and store labels as parquet.

        Besides comparing the stored labels with the plain ``lightgbm``
        prediction, the tiled graph of the write operation is inspected: it
        must consist of exactly two nodes, all of them ``Fuse`` operands.
        """
        n_rows, n_columns = 1000, 10
        rs = np.random.RandomState(0)
        X = rs.rand(n_rows, n_columns)
        y = (rs.rand(n_rows) > 0.5).astype(np.int32)
        column_names = [f'c{i}' for i in range(n_columns)]
        raw_df = pd.DataFrame(X, columns=column_names)

        # train the reference model with plain lightgbm
        classifier = lightgbm.LGBMClassifier(n_estimators=2)
        classifier.fit(X, y, verbose=True)

        with tempfile.TemporaryDirectory() as d:
            result_dir = os.path.join(d, 'result')
            os.mkdir(result_dir)
            data_dir = os.path.join(d, 'data')
            os.mkdir(data_dir)

            # store the input as two parquet files inside the data directory
            raw_df.iloc[:500].to_parquet(os.path.join(data_dir, 'data1.parquet'))
            raw_df.iloc[500:].to_parquet(os.path.join(data_dir, 'data2.parquet'))

            mars_df = md.read_parquet(data_dir)
            model = LGBMClassifier()
            model.load_model(classifier)
            lazy_prediction = model.predict(mars_df, run=False)
            writer = md.DataFrame(lazy_prediction).to_parquet(result_dir)

            # tiles to ensure no iterative tiling exists
            tiled_graph = writer.build_graph(tiled=True)
            self.assertTrue(
                all(isinstance(node.op, Fuse) for node in tiled_graph))
            self.assertEqual(len(tiled_graph), 2)
            writer.execute()

            stored = md.read_parquet(result_dir).to_pandas()
            ret = stored.iloc[:, 0].to_numpy()
            expected = classifier.predict(X)
            expected = np.stack([1 - expected, expected]).argmax(axis=0)
            np.testing.assert_array_equal(ret, expected)
예제 #5
0
    def testLocalClassifier(self):
        """Exercise the Mars ``LGBMClassifier`` locally on several kinds of input.

        Covered scenarios: dense tensors, sparse tensors, an empty input,
        DataFrames, sample weights given as tensor and as Series, the
        ``ValueError`` raised for weights with more than one dimension,
        binary labels, and prediction through the module-level ``predict`` /
        ``predict_proba`` helpers with a model trained by plain ``lightgbm``.
        """
        # Labels are ``(y * 10).astype(int32)``; the probability checks
        # expect one column per class, i.e. 10 columns.
        n_classes = 10

        def check_labels(labels):
            # a label prediction is a vector with one entry per sample
            self.assertEqual(labels.ndim, 1)
            self.assertEqual(labels.shape[0], len(self.X))

        X, y = self.X, self.y
        y = (y * 10).astype(mt.int32)
        classifier = LGBMClassifier(n_estimators=2)
        classifier.fit(X, y, eval_set=[(X, y)], verbose=True)
        prediction = classifier.predict(X)

        check_labels(prediction)
        self.assertIsInstance(prediction, mt.Tensor)

        # test sparse tensor
        X_sparse = self.X_sparse
        classifier = LGBMClassifier(n_estimators=2)
        classifier.fit(X_sparse, y, eval_set=[(X_sparse, y)], verbose=True)
        prediction = classifier.predict(X_sparse)

        check_labels(prediction)
        self.assertIsInstance(prediction, mt.Tensor)

        # Probabilities are (n_samples, n_classes).  Comparing against
        # ``X.shape`` would only hold while the number of feature columns
        # happens to equal the number of classes.
        prob = classifier.predict_proba(X)
        self.assertEqual(prob.shape, (len(self.X), n_classes))

        prediction_empty = classifier.predict(
            mt.array([]).reshape((0, X.shape[1])))
        self.assertEqual(prediction_empty.shape, (0, ))

        # test dataframe
        X_df = self.X_df
        classifier = LGBMClassifier(n_estimators=2)
        classifier.fit(X_df, y, verbose=True)
        prediction = classifier.predict(X_df)

        check_labels(prediction)

        prob = classifier.predict_proba(X_df)

        self.assertEqual(prob.ndim, 2)
        self.assertEqual(prob.shape, (len(self.X), n_classes))

        # test weight
        weights = [
            mt.random.rand(X.shape[0]),
            md.Series(mt.random.rand(X.shape[0]))
        ]
        y_df = md.DataFrame(y)
        for weight in weights:
            classifier = LGBMClassifier(n_estimators=2)
            classifier.fit(X, y_df, sample_weight=weight, verbose=True)
            prediction = classifier.predict(X)

            check_labels(prediction)

        # should raise error if weight.ndim > 1
        with self.assertRaises(ValueError):
            LGBMClassifier(n_estimators=2).fit(X,
                                               y_df,
                                               sample_weight=mt.random.rand(
                                                   1, 1),
                                               verbose=True)

        # test binary classifier
        new_y = (self.y > 0.5).astype(mt.int32)
        classifier = LGBMClassifier(n_estimators=2)
        classifier.fit(X, new_y, verbose=True)

        prediction = classifier.predict(X)
        check_labels(prediction)

        prediction = classifier.predict_proba(X)
        self.assertEqual(prediction.ndim, 2)
        self.assertEqual(prediction.shape[0], len(self.X))

        # test with existing model
        classifier = lightgbm.LGBMClassifier(n_estimators=2)
        classifier.fit(X, new_y, verbose=True)

        label_result = predict(classifier, X_df)
        check_labels(label_result)

        proba_result = predict_proba(classifier, X_df)
        self.assertEqual(proba_result.ndim, 2)
        self.assertEqual(proba_result.shape[0], len(self.X))
예제 #6
0
    def testDistributedLGBMClassifier(self):
        """Fit and predict through a web session for tensor and DataFrame input.

        Scenarios: a fresh fit, a second fit on the already fitted model, a
        fit with ``eval_set``, and a fit on DataFrame input whose prediction
        is expected to be an ``md.Series``.
        """
        service_ep = 'http://127.0.0.1:' + self.web_port
        timeout = 120 if 'CI' in os.environ else -1
        with new_session(service_ep) as sess:
            run_kwargs = {'timeout': timeout}
            remote_kwargs = dict(session=sess, run_kwargs=run_kwargs)

            def fit_and_predict(model, features, labels, **fit_kwargs):
                # train on (features, labels) and predict the same features
                model.fit(features, labels, **fit_kwargs, **remote_kwargs)
                return model.predict(features, **remote_kwargs)

            def check_vector(result, source):
                # a prediction is a vector with one entry per row of source
                self.assertEqual(result.ndim, 1)
                self.assertEqual(result.shape[0], len(source))

            X = self.X
            y = (self.y * 10).astype(mt.int32)

            classifier = LGBMClassifier(n_estimators=2)
            prediction = fit_and_predict(classifier, X, y)
            check_vector(prediction, self.X)

            # fit on fitted model shall work well
            prediction = fit_and_predict(classifier, X, y)
            check_vector(prediction, self.X)
            self.assertIsInstance(prediction, mt.Tensor)

            classifier = LGBMClassifier(n_estimators=2)
            prediction = fit_and_predict(classifier, X, y, eval_set=[(X, y)])
            check_vector(prediction, self.X)
            self.assertIsInstance(prediction, mt.Tensor)

            # DataFrame features and labels
            X = md.DataFrame(np.random.rand(100, 20), chunk_size=20)
            y = md.DataFrame(np.random.randint(0, 2, (100, 1)), chunk_size=20)
            classifier = LGBMClassifier(n_estimators=2)
            prediction = fit_and_predict(classifier, X, y)
            check_vector(prediction, X)
            self.assertIsInstance(prediction, md.Series)
예제 #7
0
def test_local_classifier(setup):
    """Exercise the Mars ``LGBMClassifier`` locally on several kinds of input.

    Covered scenarios: dense tensors, sparse tensors, an empty input,
    DataFrames, sample weights given as tensor and as Series, the
    ``ValueError`` raised for weights with more than one dimension, binary
    labels, and wrapping a model trained by plain ``lightgbm``.

    ``X``, ``y``, ``X_sparse`` and ``X_df`` are module-level fixtures defined
    outside this function.  ``setup`` is not referenced in the body
    (presumably a pytest fixture preparing the session -- confirm).
    """
    # Labels are ``(y * 10).astype(int32)``; the probability checks expect
    # one column per class, i.e. 10 columns.
    n_classes = 10

    def check_labels(labels):
        # a label prediction is a vector with one entry per sample
        assert labels.ndim == 1
        assert labels.shape[0] == len(X)

    y_data = (y * 10).astype(mt.int32)
    classifier = LGBMClassifier(n_estimators=2)
    classifier.fit(X, y_data, eval_set=[(X, y_data)], verbose=True)
    prediction = classifier.predict(X)

    check_labels(prediction)
    assert isinstance(prediction, mt.Tensor)

    # test sparse tensor
    X_sparse_data = X_sparse
    classifier = LGBMClassifier(n_estimators=2)
    classifier.fit(X_sparse_data, y_data,
                   eval_set=[(X_sparse_data, y_data)], verbose=True)
    prediction = classifier.predict(X_sparse_data)

    check_labels(prediction)
    assert isinstance(prediction, mt.Tensor)

    # Probabilities are (n_samples, n_classes).  Comparing against
    # ``X.shape`` would only hold while the number of feature columns
    # happens to equal the number of classes.
    prob = classifier.predict_proba(X)
    assert prob.shape == (len(X), n_classes)

    prediction_empty = classifier.predict(mt.array([]).reshape((0, X.shape[1])))
    assert prediction_empty.shape == (0,)

    # test dataframe
    X_df_data = X_df
    classifier = LGBMClassifier(n_estimators=2)
    classifier.fit(X_df_data, y_data, verbose=True)
    prediction = classifier.predict(X_df_data)

    check_labels(prediction)

    prob = classifier.predict_proba(X_df_data)

    assert prob.ndim == 2
    assert prob.shape == (len(X), n_classes)

    # test weight
    weights = [mt.random.rand(X.shape[0]), md.Series(mt.random.rand(X.shape[0]))]
    y_df = md.DataFrame(y_data)
    for weight in weights:
        classifier = LGBMClassifier(n_estimators=2)
        classifier.fit(X, y_df, sample_weight=weight, verbose=True)
        prediction = classifier.predict(X)

        check_labels(prediction)

    # should raise error if weight.ndim > 1
    with pytest.raises(ValueError):
        LGBMClassifier(n_estimators=2).fit(
            X, y_df, sample_weight=mt.random.rand(1, 1), verbose=True)

    # test binary classifier
    # Threshold the raw ``y`` rather than the scaled integer labels:
    # ``y_data > 0.5`` is true for every class id except 0 and would give a
    # heavily skewed binary target.
    new_y = (y > 0.5).astype(mt.int32)
    classifier = LGBMClassifier(n_estimators=2)
    classifier.fit(X, new_y, verbose=True)

    prediction = classifier.predict(X)
    check_labels(prediction)

    prediction = classifier.predict_proba(X)
    assert prediction.ndim == 2
    assert prediction.shape[0] == len(X)

    # test with existing model
    # plain lightgbm is trained on the fetched (materialized) data
    X_np = X.execute().fetch()
    new_y_np = new_y.execute().fetch()
    raw_classifier = lightgbm.LGBMClassifier(n_estimators=2)
    raw_classifier.fit(X_np, new_y_np, verbose=True)

    classifier = LGBMClassifier(raw_classifier)
    label_result = classifier.predict(X_df)
    check_labels(label_result)

    proba_result = classifier.predict_proba(X_df)
    assert proba_result.ndim == 2
    assert proba_result.shape[0] == len(X)