Example #1
    def test_experiment_loadsavemodel(self):
        (train, label) = get_X_y(train_file, label_column, sep=',')
        (test, label1) = get_X_y(test_file, label_column, sep=',')
        cat = OneHotVectorizer() << categorical_columns
        ftree = FastTreesBinaryClassifier()
        pipeline = Pipeline([cat, ftree])
        pipeline.fit(train, label)
        metrics1, scores1 = pipeline.test(test,
                                          label1,
                                          'binary',
                                          output_scores=True)
        sum1 = metrics1.sum().sum()
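        # create a temporary model path and release the OS-level file
        # handle so save_model can write to it by name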
        (fd, modelfilename) = tempfile.mkstemp(suffix='.model.bin')
        fl = os.fdopen(fd, 'w')
        fl.close()
        pipeline.save_model(modelfilename)

        pipeline2 = Pipeline()
        pipeline2.load_model(modelfilename)
        metrics2, scores2 = pipeline2.test(test,
                                           label1,
                                           'binary',
                                           output_scores=True)
        sum2 = metrics2.sum().sum()

        assert_equal(sum1, sum2,
                     "model metrics don't match after loading model")
Example #2
    def test_test(self):
        transformed_data, transformed_data_df = transform_data()
        fl = FastLinearRegressor(
            feature=['parity', 'in', 'sp', 'stratum'],
            label='age')
        flpipe = Pipeline([fl])
        flpipe.fit(transformed_data)
        metrics, scores = flpipe.test(transformed_data, output_scores=True)
        metrics_df, scores_df = flpipe.test(
            transformed_data_df, output_scores=True)

        assert_array_equal(scores, scores_df)
        assert_array_equal(metrics, metrics_df)

        flpipe.fit(transformed_data_df.drop('age', axis=1),
                   transformed_data_df['age'])
        metrics, scores = flpipe.test(transformed_data, output_scores=True)
        metrics_df, scores_df = flpipe.test(
            transformed_data_df, output_scores=True)

        assert_array_equal(scores, scores_df)
        assert_array_equal(metrics, metrics_df)
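Note: transform_data() in Example #2 is a test-module fixture, not a
nimbusml API. A hypothetical sketch consistent with how it is used above:
it returns the same transformed dataset twice, once as a binary IDV stream
and once as a pandas DataFrame, so the test can verify that Pipeline.test()
treats both representations identically (the dataset, the transform, and
the as_binary_data_stream option used here are assumptions):

    from nimbusml import FileDataStream
    from nimbusml.datasets import get_dataset
    from nimbusml.feature_extraction.categorical import OneHotVectorizer

    def transform_data():
        path = get_dataset('infert').as_filepath()
        data = FileDataStream.read_csv(path)
        xf = OneHotVectorizer(columns={'edu': 'education'})
        # the same transform materialized two ways
        transformed_data = xf.fit_transform(data, as_binary_data_stream=True)
        transformed_data_df = xf.fit_transform(data)
        return transformed_data, transformed_data_df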
Example #3
    def test_model_datastream(self):
        model_nimbusml = Pipeline(
            steps=[
                ('cat',
                 OneHotVectorizer() << categorical_columns),
                ('linear',
                 FastLinearBinaryClassifier(
                     shuffle=False,
                     number_of_threads=1))])

        model_nimbusml.fit(train, label)

        # Save with pickle
        pickle_filename = get_temp_file(suffix='.p')
        with open(pickle_filename, 'wb') as f:
            pickle.dump(model_nimbusml, f)

        with open(pickle_filename, "rb") as f:
            model_nimbusml_pickle = pickle.load(f)

        os.remove(pickle_filename)

        score1 = model_nimbusml.predict(test).head(5)
        score2 = model_nimbusml_pickle.predict(test).head(5)

        metrics, score = model_nimbusml.test(test, test_label, output_scores=True)
        metrics_pickle, score_pickle = model_nimbusml_pickle.test(
            test, test_label, output_scores=True)

        assert_almost_equal(score1.sum().sum(), score2.sum().sum(), decimal=2)
        assert_almost_equal(
            metrics.sum().sum(),
            metrics_pickle.sum().sum(),
            decimal=2)

        # Save load with pipeline methods
        model_filename = get_temp_file(suffix='.m')
        model_nimbusml.save_model(model_filename)
        model_nimbusml_load = Pipeline()
        model_nimbusml_load.load_model(model_filename)

        score1 = model_nimbusml.predict(test).head(5)
        score2 = model_nimbusml_load.predict(test).head(5)

        metrics, score = model_nimbusml.test(test, test_label, output_scores=True)
        metrics_load, score_load = model_nimbusml_load.test(
            test, test_label, evaltype='binary', output_scores=True)

        assert_almost_equal(score1.sum().sum(), score2.sum().sum(), decimal=2)
        assert_almost_equal(
            metrics.sum().sum(),
            metrics_load.sum().sum(),
            decimal=2)

        os.remove(model_filename)
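Note: get_temp_file() here is a small test helper, not part of the nimbusml
API. A minimal sketch of what such a helper might look like, mirroring the
tempfile.mkstemp pattern from Example #1 (the name and behavior are
assumptions based on its use above):

    import os
    import tempfile

    def get_temp_file(suffix=None):
        # create a named temporary file, then release the OS-level handle
        # so the caller can write to (and later remove) the path
        fd, file_name = tempfile.mkstemp(suffix=suffix)
        fl = os.fdopen(fd, 'w')
        fl.close()
        return file_name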
Example #4
    def test_metrics_check_output_scores(self):
        np.random.seed(0)
        df = get_dataset("iris").as_df()
        df.drop(['Species'], inplace=True, axis=1)
        df.Label = [1 if x == 1 else 0 for x in df.Label]
        X_train, X_test, y_train, y_test = \
            train_test_split(df.loc[:, df.columns != 'Label'], df['Label'])

        lr = LogisticRegressionBinaryClassifier()
        e = Pipeline([lr])
        e.fit(X_train, y_train, verbose=0)
        metrics, scores = e.test(X_test, y_test, output_scores=False)
        assert len(scores) == 0
        metrics, scores = e.test(X_test, y_test, output_scores=True)
        assert len(scores) > 0
Example #5
    def test_metrics_evaluate_clusterer(self):
        np.random.seed(0)
        df = get_dataset("iris").as_df()
        df.drop(['Species'], inplace=True, axis=1)
        df.Label = [1 if x == 1 else 0 for x in df.Label]
        X_train, X_test, y_train, y_test = \
            train_test_split(df.loc[:, df.columns != 'Label'], df['Label'])

        lr = KMeansPlusPlus(n_clusters=2, initialization_algorithm="Random")
        e = Pipeline([lr])
        e.fit(X_train, y_train.to_frame(), verbose=0)
        metrics, _ = e.test(X_test, y_test)
        # if abs(metrics['NMI'][0] - 0.7) >= 0.15:
        #    raise AssertionError("NMI loss should be %f not %f" % \
        # (0.7, metrics['NMI'][0]))
        # if abs(metrics['AvgMinScore'][0] - 0.014) >= 0.015:
        #    raise AssertionError("AvgMinScore  should be %f not %f" % (\
        # 0.014, metrics['AvgMinScore'][0]))
        assert_almost_equal(metrics['NMI'][0],
                            0.7,
                            decimal=0,
                            err_msg="NMI loss should be %s" % 0.7)
        assert_almost_equal(metrics['AvgMinScore'][0],
                            0.032,
                            decimal=2,
                            err_msg="AvgMinScore  should be %s" % 0.014)
Example #6
    def test_metrics_evaluate_regressor(self):
        np.random.seed(0)
        df = get_dataset("iris").as_df()
        df.drop(['Species'], inplace=True, axis=1)
        df.Label = [1 if x == 1 else 0 for x in df.Label]
        X_train, X_test, y_train, y_test = \
            train_test_split(df.loc[:, df.columns != 'Label'], df['Label'])

        lr = FastTreesRegressor()
        e = Pipeline([lr])
        e.fit(X_train, y_train.to_frame(), verbose=0)
        metrics, _ = e.test(X_test, y_test)
        # TODO: debug fluctuations, and increase decimal precision on checks
        assert_almost_equal(metrics['L1(avg)'][0],
                            0.107,
                            decimal=1,
                            err_msg="L1 loss should be %s" % 0.107)
        assert_almost_equal(metrics['L2(avg)'][0],
                            0.0453,
                            decimal=1,
                            err_msg="L2(avg) should be %s" % 0.0453)
        assert_almost_equal(metrics['Loss-fn(avg)'][0],
                            0.0453,
                            decimal=1,
                            err_msg="Loss-fn(avg)loss should be %s" % 0.0453)
Example #7
    def test_metrics_evaluate_binary(self):
        np.random.seed(0)
        df = get_dataset("iris").as_df()
        df.drop(['Species'], inplace=True, axis=1)
        df.Label = [1 if x == 1 else 0 for x in df.Label]
        X_train, X_test, y_train, y_test = \
            train_test_split(df.loc[:, df.columns != 'Label'], df['Label'])

        lr = LogisticRegressionBinaryClassifier()
        e = Pipeline([lr])
        e.fit(X_train, y_train, verbose=0)
        metrics, _ = e.test(X_test, y_test)
        # TODO: debug fluctuations, and increase decimal precision on checks
        assert_almost_equal(metrics['AUC'][0],
                            0.980,
                            decimal=1,
                            err_msg="AUC should be %s" % 0.980)
        assert_almost_equal(metrics['Accuracy'][0],
                            0.632,
                            decimal=1,
                            err_msg="Accuracy should be %s" % 0.632)
        assert_almost_equal(metrics['Positive precision'][0],
                            1,
                            decimal=1,
                            err_msg="Positive precision should be %s" % 1)
        assert_almost_equal(metrics['Positive recall'][0],
                            0.125,
                            decimal=1,
                            err_msg="Positive recall should be %s" % 0.125)
        assert_almost_equal(metrics['Negative precision'][0],
                            0.611,
                            decimal=1,
                            err_msg="Negative precision should be %s" % 0.611)
        assert_almost_equal(metrics['Negative recall'][0],
                            1,
                            decimal=1,
                            err_msg="Negative recall should be %s" % 1)
        assert_almost_equal(metrics['Log-loss'][0],
                            0.686,
                            decimal=1,
                            err_msg="Log-loss should be %s" % 0.686)
        assert_almost_equal(metrics['Log-loss reduction'][0],
                            0.3005,
                            decimal=3,
                            err_msg="Log-loss reduction should be %s" % 0.3005)
        assert_almost_equal(
            metrics['Test-set entropy (prior Log-Loss/instance)'][0],
            0.981,
            decimal=1,
            err_msg="Test-set entropy (prior Log-Loss/instance) should be %s" %
            0.981)
        assert_almost_equal(metrics['F1 Score'][0],
                            0.222,
                            decimal=1,
                            err_msg="F1 Score should be %s" % 0.222)
        assert_almost_equal(metrics['AUPRC'][0],
                            0.966,
                            decimal=1,
                            err_msg="AUPRC should be %s" % 0.966)
Example #8
    def test_model_datastream(self):
        model_nimbusml = Pipeline(
            steps=[('cat', OneHotVectorizer() << categorical_columns),
                   ('linear',
                    FastLinearBinaryClassifier(shuffle=False, train_threads=1)
                    )])

        model_nimbusml.fit(train, label)

        # Save with pickle
        with open('nimbusml_model.p', 'wb') as f:
            pickle.dump(model_nimbusml, f)
        with open('nimbusml_model.p', 'rb') as f:
            model_nimbusml_pickle = pickle.load(f)

        score1 = model_nimbusml.predict(test).head(5)
        score2 = model_nimbusml_pickle.predict(test).head(5)

        metrics, score = model_nimbusml.test(test,
                                             test_label,
                                             output_scores=True)
        metrics_pickle, score_pickle = model_nimbusml_pickle.test(
            test, test_label, output_scores=True)

        assert_almost_equal(score1.sum().sum(), score2.sum().sum(), decimal=2)
        assert_almost_equal(metrics.sum().sum(),
                            metrics_pickle.sum().sum(),
                            decimal=2)

        # Save load with pipeline methods
        model_nimbusml.save_model('model.nimbusml.m')
        model_nimbusml_load = Pipeline()
        model_nimbusml_load.load_model('model.nimbusml.m')

        score1 = model_nimbusml.predict(test).head(5)
        score2 = model_nimbusml_load.predict(test).head(5)

        metrics, score = model_nimbusml.test(test,
                                             test_label,
                                             output_scores=True)
        metrics_load, score_load = model_nimbusml_load.test(
            test, test_label, evaltype='binary', output_scores=True)

        assert_almost_equal(score1.sum().sum(), score2.sum().sum(), decimal=2)
        assert_almost_equal(metrics.sum().sum(),
                            metrics_load.sum().sum(),
                            decimal=2)
Example #9
    def test_uciadult_sweep(self):
        # grid search over number_of_trees, then confirm the best
        # number_of_trees with a full train
        np.random.seed(0)
        (X_train, y_train) = get_X_y(train_file,
                                     label_column,
                                     sep=',',
                                     encoding='utf-8')
        (X_test, y_test) = get_X_y(test_file,
                                   label_column,
                                   sep=',',
                                   encoding='utf-8')

        cat = OneHotHashVectorizer() << categorical_columns
        # number_of_trees=100 will never actually be run by the grid search,
        # as it's not in param_grid below
        learner = FastTreesBinaryClassifier(number_of_trees=100,
                                            number_of_leaves=5)
        pipe = Pipeline(steps=[('cat', cat), ('learner', learner)])

        param_grid = dict(learner__number_of_trees=[1, 5, 10])
        grid = GridSearchCV(pipe, param_grid)

        grid.fit(X_train, y_train)
        assert grid.best_params_['learner__number_of_trees'] == 10

        # compare AUC on number_of_trees 1, 5, 10
        pipe.set_params(learner__number_of_trees=1)
        pipe.fit(X_train, y_train)
        metrics1, _ = pipe.test(X_train, y_train)

        pipe.set_params(learner__number_of_trees=5)
        pipe.fit(X_train, y_train)
        metrics5, _ = pipe.test(X_train, y_train)

        pipe.set_params(learner__number_of_trees=10)
        pipe.fit(X_train, y_train)
        metrics10, _ = pipe.test(X_train, y_train)

        assert metrics10['AUC'][0] > metrics5['AUC'][0]
        assert metrics10['AUC'][0] > metrics1['AUC'][0]
        assert metrics10['AUC'][0] > 0.59
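Note: the sweep above works because nimbusml's Pipeline follows the
scikit-learn estimator protocol, so GridSearchCV can clone it and address
step parameters as '<step name>__<parameter>'. A quick sketch for listing
the tunable parameter names (the exact output is an assumption):

    pipe = Pipeline(steps=[('cat', cat), ('learner', learner)])
    # expected to include 'learner__number_of_trees' and
    # 'learner__number_of_leaves'
    print(sorted(pipe.get_params().keys()))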
Example #10
    def test_metrics_evaluate_ranking_group_id_from_existing_column_in_X(self):
        np.random.seed(0)
        df = get_dataset("iris").as_df().drop(['Label', 'Species'], axis=1)
        X_train, X_test = train_test_split(df)
        X_test.is_copy = False
        X_train.is_copy = False
        y_train = X_train['Setosa']
        y_test = X_test['Setosa']
        gvals_test = np.zeros(10).tolist() \
            + np.ones(10).tolist() \
            + (np.ones(10) * 2).tolist() \
            + (np.ones(8) * 3).tolist()

        gvals_train = np.zeros(30).tolist() \
            + np.ones(30).tolist() \
            + (np.ones(30) * 2).tolist() \
            + (np.ones(22) * 3).tolist()

        X_train.drop(['Setosa'], axis=1, inplace=True)
        X_test.drop(['Setosa'], axis=1, inplace=True)
        X_train['group_id'] = np.asarray(gvals_train, np.uint32)
        X_test['group_id'] = np.asarray(gvals_test, np.uint32)
        ft = FastForestRegressor()
        e = Pipeline([ft])
        e.fit(X_train, y_train, verbose=0)
        metrics, _ = e.test(X_test,
                            y_test,
                            evaltype='ranking',
                            group_id='group_id')
        assert_almost_equal(metrics['NDCG@1'][0],
                            1,
                            decimal=5,
                            err_msg="NDCG@1 should be %s" % 1)
        assert_almost_equal(metrics['NDCG@2'][0],
                            1,
                            decimal=5,
                            err_msg="NDCG@2 should be %s" % 1)
        assert_almost_equal(metrics['NDCG@3'][0],
                            1,
                            decimal=5,
                            err_msg="NDCG@3 should be %s" % 1)
        assert_almost_equal(metrics['DCG@1'][0],
                            4.32808,
                            decimal=3,
                            err_msg="DCG@1 should be %s" % 4.32808)
        assert_almost_equal(metrics['DCG@2'][0],
                            7.05880,
                            decimal=3,
                            err_msg="DCG@2 should be %s" % 7.05880)
        assert_almost_equal(metrics['DCG@3'][0],
                            8.68183,
                            decimal=3,
                            err_msg="DCG@3 should be %s" % 8.68183)
Example #11
    def test_lightgbmranker_asdataframe(self):
        # Data file
        file_path = get_dataset("gen_tickettrain").as_filepath()

        df = pd.read_csv(file_path, encoding='utf-8')
        df['group'] = df['group'].astype(np.uint32)

        e = Pipeline([ToKey(columns={'rank': 'rank', 'group': 'group'}),
                      LightGbmRanker() << {
                          Role.Feature: ['Class', 'dep_day', 'duration'],
                          Role.Label: 'rank', Role.GroupId: 'group'}])

        e.fit(df)

        metrics, _ = e.test(df)
        assert_almost_equal(metrics['NDCG@1'][0],
                            0.43571429,
                            decimal=7,
                            err_msg="NDCG@1 should be %s" % 0.43571429)
        assert_almost_equal(metrics['NDCG@2'][0],
                            0.5128226,
                            decimal=7,
                            err_msg="NDCG@2 should be %s" % 0.5128226)
        assert_almost_equal(metrics['NDCG@3'][0],
                            0.55168069,
                            decimal=7,
                            err_msg="NDCG@3 should be %s" % 0.55168069)
        assert_almost_equal(metrics['DCG@1'][0],
                            4.688759,
                            decimal=3,
                            err_msg="DCG@1 should be %s" % 4.688759)
        assert_almost_equal(metrics['DCG@2'][0],
                            9.012395,
                            decimal=3,
                            err_msg="DCG@2 should be %s" % 9.012395)
        assert_almost_equal(metrics['DCG@3'][0],
                            11.446943,
                            decimal=3,
                            err_msg="DCG@3 should be %s" % 11.446943)
Example #12
def train_data_type_ppl(fit_X_type=None, fit_Y_type=None, predict_X_type=None):
    data = [[1.0, 2.0, 3.0], [2.0, 3.0, 4.0], [1.0, 2.0, 3.0], [2.0, 2.0, 2.0]]
    label = [1, 0, 1, 1]
    if fit_X_type == "sparse":
        model = Pipeline([Binner(), LightGbmClassifier(minimum_example_count_per_leaf=1)])
    else:
        model = Pipeline([Binner(), LogisticRegressionBinaryClassifier()])
    data_with_new_type = transform_data(data, fit_X_type)
    label_with_new_type = transform_data(label, fit_Y_type)
    model.fit(data_with_new_type, label_with_new_type)
    metrics, scores = model.test(
        data_with_new_type, label_with_new_type, output_scores=True)
    test_data_with_new_type = transform_data(data, predict_X_type)
    return model.predict(test_data_with_new_type), scores, metrics
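Note: transform_data(values, data_type) in Example #12 is a test-module
fixture, not shown here. A hypothetical sketch consistent with how it is
called above (the helper name, the supported type strings, and the
container choices are assumptions):

    import numpy as np
    import pandas as pd
    from scipy.sparse import csr_matrix

    def transform_data(values, data_type=None):
        # re-express the same values as the requested container type
        if data_type is None or data_type == "list":
            return values
        if data_type == "array":
            return np.array(values)
        if data_type == "dataframe":
            return pd.DataFrame(values)
        if data_type == "series":
            return pd.Series(values)
        if data_type == "sparse":
            return csr_matrix(np.array(values))
        raise ValueError("unsupported data type: %s" % data_type)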
Example #13
    def test_metrics_evaluate_anomalydetection(self):
        np.random.seed(0)
        df = get_dataset("iris").as_df().drop(['Label', 'Species'], axis=1)
        X_train, X_test = train_test_split(df)
        X_test.is_copy = False
        X_train = X_train[X_train['Setosa'] == 1]
        y_test = X_test['Setosa'].apply(lambda x: 1 if x == 0 else 0)
        X_train.drop(['Setosa'], axis=1, inplace=True)
        X_test.drop(['Setosa'], axis=1, inplace=True)
        svm = OneClassSvmAnomalyDetector()  # noqa
        e = Pipeline([svm])
        e.fit(X_train, verbose=0)
        if e.nodes[-1].label_column_name_ is not None:
            raise ValueError("'{0}' should be None".format(
                e.nodes[-1].label_column_name_))
        assert y_test.name == 'Setosa'
        metrics, _ = e.test(X_test, y_test)
        assert_almost_equal(metrics['AUC'][0],
                            1.0,
                            decimal=5,
                            err_msg="AUC should be %s" % 1.0)
        assert_almost_equal(metrics['DR @K FP'][0],
                            1.0,
                            decimal=5,
                            err_msg="DR @K FP should be %s" % 1.0)
        assert_almost_equal(metrics['DR @P FPR'][0],
                            1.0,
                            decimal=5,
                            err_msg="DR @P FPR should be %s" % 1.0)
        assert_almost_equal(metrics['DR @NumPos'][0],
                            1.0,
                            decimal=5,
                            err_msg="DR @NumPos should be %s" % 1.0)
        assert_almost_equal(metrics['Threshold @K FP'][0],
                            -0.0788,
                            decimal=2,
                            err_msg="Threshold @K FP should be %s" % -0.0788)
        assert_almost_equal(metrics['Threshold @P FPR'][0],
                            -0.00352,
                            decimal=2,
                            err_msg="Threshold @P FPR "
                                    "should be %s" % -0.00352)
        assert_almost_equal(metrics['Threshold @NumPos'][0],
                            1.5110,
                            decimal=1,
                            err_msg="Threshold @NumPos should be %s" % 1.5110)
        assert_almost_equal(metrics['NumAnomalies'][0],
                            25,
                            decimal=5,
                            err_msg="NumAnomalies should be %s" % 25)
Example #14
    def test_lightgbmranker_asfilestream(self):
        # Data file
        file_path = get_dataset("gen_tickettrain").as_filepath()

        # Pure-nimbusml paradigm
        train_stream = FileDataStream.read_csv(file_path, encoding='utf-8')

        # pipeline
        pipeline = Pipeline([
            # the group_id column must be of key type
            ToKey(columns={
                'rank': 'rank',
                'group': 'group'
            }),
            LightGbmRanker(feature=['Class', 'dep_day', 'duration'],
                           label='rank',
                           group_id='group')
        ])

        # train
        pipeline.fit(train_stream)

        # test
        eval_stream = FileDataStream.read_csv(file_path)
        metrics, _ = pipeline.test(eval_stream)
        assert_almost_equal(metrics['NDCG@1'][0],
                            43.571429,
                            decimal=5,
                            err_msg="NDCG@1 should be %s" % 43.571429)
        assert_almost_equal(metrics['NDCG@2'][0],
                            51.28226,
                            decimal=5,
                            err_msg="NDCG@2 should be %s" % 51.28226)
        assert_almost_equal(metrics['NDCG@3'][0],
                            55.168069,
                            decimal=5,
                            err_msg="NDCG@3 should be %s" % 55.168069)
        assert_almost_equal(metrics['DCG@1'][0],
                            4.688759,
                            decimal=3,
                            err_msg="DCG@1 should be %s" % 4.688759)
        assert_almost_equal(metrics['DCG@2'][0],
                            9.012395,
                            decimal=3,
                            err_msg="DCG@2 should be %s" % 9.012395)
        assert_almost_equal(metrics['DCG@3'][0],
                            11.446943,
                            decimal=3,
                            err_msg="DCG@3 should be %s" % 11.446943)
Example #15
    def test_unfitted_pickled_pipeline_can_be_fit(self):
        pipeline = Pipeline(
            steps=[
                ('cat',
                 OneHotVectorizer() << categorical_columns),
                ('linear',
                 FastLinearBinaryClassifier(
                     shuffle=False,
                     number_of_threads=1))])

        pipeline.fit(train, label)
        metrics, score = pipeline.test(test, test_label, output_scores=True)

        # Create a new unfitted pipeline
        pipeline = Pipeline(
            steps=[
                ('cat',
                 OneHotVectorizer() << categorical_columns),
                ('linear',
                 FastLinearBinaryClassifier(
                     shuffle=False,
                     number_of_threads=1))])

        pickle_filename = get_temp_file(suffix='.p')

        # Save with pickle
        with open(pickle_filename, 'wb') as f:
            pickle.dump(pipeline, f)

        with open(pickle_filename, "rb") as f:
            pipeline_pickle = pickle.load(f)

        os.remove(pickle_filename)

        pipeline_pickle.fit(train, label)
        metrics_pickle, score_pickle = pipeline_pickle.test(
            test, test_label, output_scores=True)

        assert_almost_equal(score.sum().sum(),
                            score_pickle.sum().sum(),
                            decimal=2)

        assert_almost_equal(metrics.sum().sum(),
                            metrics_pickle.sum().sum(),
                            decimal=2)
Example #16
def train_data_type_ppl(fit_X_type=None, fit_Y_type=None, predict_X_type=None):
    data = [
        "This is sentence 1", "Talk about second", "Thrid one",
        "Final example."
    ]
    label = [1, 0, 1, 1]
    model = Pipeline([
        NGramFeaturizer(),
        LightGbmClassifier(min_data_per_leaf=1, n_thread=1)
    ])
    data_with_new_type = transform_data(data, fit_X_type)
    label_with_new_type = transform_data(label, fit_Y_type)
    model.fit(data_with_new_type, label_with_new_type)
    metrics, scores = model.test(data_with_new_type,
                                 label_with_new_type,
                                 output_scores=True)
    test_data_with_new_type = transform_data(data, predict_X_type)
    return model.predict(test_data_with_new_type), scores, metrics
Example #17
    def test_lightgbmranker_asdataframe_groupid(self):
        # Data file
        file_path = get_dataset("gen_tickettrain").as_filepath()

        df = pd.read_csv(file_path, encoding='utf-8')
        df['group'] = df['group'].astype(np.uint32)

        e = Pipeline([
            ToKey(columns={
                'rank': 'rank',
                'group': 'group'
            }),
            LightGbmRanker(feature=['Class', 'dep_day', 'duration'],
                           label='rank',
                           group_id='group')
        ])

        e.fit(df)

        metrics, _ = e.test(df)
        assert_almost_equal(metrics['NDCG@1'][0],
                            43.571429,
                            decimal=5,
                            err_msg="NDCG@1 should be %s" % 43.571429)
        assert_almost_equal(metrics['NDCG@2'][0],
                            51.28226,
                            decimal=5,
                            err_msg="NDCG@2 should be %s" % 51.28226)
        assert_almost_equal(metrics['NDCG@3'][0],
                            55.168069,
                            decimal=5,
                            err_msg="NDCG@3 should be %s" % 55.168069)
        assert_almost_equal(metrics['DCG@1'][0],
                            4.688759,
                            decimal=3,
                            err_msg="DCG@1 should be %s" % 4.688759)
        assert_almost_equal(metrics['DCG@2'][0],
                            9.012395,
                            decimal=3,
                            err_msg="DCG@2 should be %s" % 9.012395)
        assert_almost_equal(metrics['DCG@3'][0],
                            11.446943,
                            decimal=3,
                            err_msg="DCG@3 should be %s" % 11.446943)
Example #18
    def test_metrics_evaluate_binary_sklearn(self):
        np.random.seed(0)
        df = get_dataset("iris").as_df()
        df.drop(['Species'], inplace=True, axis=1)
        df.Label = [1 if x == 1 else 0 for x in df.Label]
        X_train, X_test, y_train, y_test = \
            train_test_split(df.loc[:, df.columns != 'Label'], df['Label'])

        lr = LogisticRegressionBinaryClassifier()
        e = Pipeline([lr])
        e.fit(X_train, y_train, verbose=0)

        metrics, scores = e.test(X_test, y_test, output_scores=True)
        aucnimbusml = metrics['AUC']
        precision, recall, _ = precision_recall_curve(y_test,
                                                      scores['Probability'])
        aucskpr = auc(recall, precision)
        precision, recall, _ = precision_recall_curve(y_test, scores['Score'])
        aucsksc = auc(recall, precision)
        print(aucnimbusml, aucskpr, aucsksc)
        assert aucskpr == aucsksc
Example #19
    def test_metrics_evaluate_ranking_group_id_from_new_dataframe(self):
        np.random.seed(0)
        df = get_dataset("iris").as_df().drop(['Label', 'Species'], axis=1)
        X_train, X_test = train_test_split(df)
        X_test.is_copy = False
        X_train.is_copy = False
        y_train = X_train['Setosa']
        y_test = X_test['Setosa']
        gvals_test = np.zeros(10).tolist() + np.ones(10).tolist() \
            + (np.ones(10) * 2).tolist() + (np.ones(8) * 3).tolist()

        gvals_train = np.zeros(30).tolist() \
            + np.ones(30).tolist() \
            + (np.ones(30) * 2).tolist() \
            + (np.ones(22) * 3).tolist()

        X_train.drop(['Setosa'], axis=1, inplace=True)
        X_test.drop(['Setosa'], axis=1, inplace=True)
        X_train['group_id'] = np.asarray(gvals_train, np.uint32)
        X_test['group_id'] = np.asarray(gvals_test, np.uint32)
        ft = FastForestRegressor()
        e = Pipeline([ft])
        e.fit(X_train, y_train, verbose=0)
        groups_df = pd.DataFrame(data=dict(groups=gvals_test))
        metrics, _ = e.test(X_test,
                            y_test,
                            evaltype='ranking',
                            group_id=groups_df)
        assert_almost_equal(metrics['NDCG@1'][0],
                            100,
                            decimal=5,
                            err_msg="NDCG@1 should be %s" % 100)
        assert_almost_equal(metrics['NDCG@2'][0],
                            100,
                            decimal=5,
                            err_msg="NDCG@1 should be %s" % 100)
        assert_almost_equal(metrics['NDCG@3'][0],
                            100,
                            decimal=5,
                            err_msg="NDCG@1 should be %s" % 100)
Example #20
    def test_pipeline_saves_complete_model_file_when_pickled(self):
        model_nimbusml = Pipeline(
            steps=[
                ('cat',
                 OneHotVectorizer() << categorical_columns),
                ('linear',
                 FastLinearBinaryClassifier(
                     shuffle=False,
                     number_of_threads=1))])

        model_nimbusml.fit(train, label)
        metrics, score = model_nimbusml.test(test, test_label, output_scores=True)

        pickle_filename = get_temp_file(suffix='.p')

        # Save with pickle
        with open(pickle_filename, 'wb') as f:
            pickle.dump(model_nimbusml, f)

        # Remove the pipeline model from disk so
        # that the unpickled pipeline is forced
        # to get its model from the pickled file.
        os.remove(model_nimbusml.model)

        with open(pickle_filename, "rb") as f:
            model_nimbusml_pickle = pickle.load(f)

        os.remove(pickle_filename)

        metrics_pickle, score_pickle = model_nimbusml_pickle.test(
            test, test_label, output_scores=True)

        assert_almost_equal(score.sum().sum(),
                            score_pickle.sum().sum(),
                            decimal=2)

        assert_almost_equal(metrics.sum().sum(),
                            metrics_pickle.sum().sum(),
                            decimal=2)
Example #21
    def test_metrics_evaluate_multiclass(self):
        np.random.seed(0)
        df = get_dataset("iris").as_df()
        df.drop(['Species'], inplace=True, axis=1)
        df.Label = [1 if x == 1 else 0 for x in df.Label]
        X_train, X_test, y_train, y_test = \
            train_test_split(df.loc[:, df.columns != 'Label'], df['Label'])

        lr = LogisticRegressionClassifier()
        e = Pipeline([lr])
        e.fit(X_train, y_train.to_frame(), verbose=0)
        metrics, _ = e.test(X_test, y_test)
        # TODO: debug fluctuations, and increase decimal precision on checks
        assert_almost_equal(metrics['Accuracy(micro-avg)'][0],
                            0.763,
                            decimal=1,
                            err_msg="Accuracy(micro-avg) should be %s" % 0.763)
        assert_almost_equal(metrics['Accuracy(macro-avg)'][0],
                            0.718,
                            decimal=1,
                            err_msg="Accuracy(macro-avg) should be %s" % 0.718)
        assert_almost_equal(metrics['Log-loss'][0],
                            0.419,
                            decimal=3,
                            err_msg="Log-loss should be %s" % 0.419)
        assert_almost_equal(metrics['Log-loss reduction'][0],
                            0.38476,
                            decimal=3,
                            err_msg="Log-loss reduction should be %s" %
                            0.38476)
        assert_almost_equal(metrics['(class 0)'][0],
                            0.223,
                            decimal=1,
                            err_msg="(class 0) should be %s" % 0.223)
        assert_almost_equal(metrics['(class 1)'][0],
                            0.688,
                            decimal=1,
                            err_msg="(class 1) should be %s" % 0.688)
Example #22
    def test_metrics_evaluate_binary_from_filedatastream(self):
        path = get_dataset('infert').as_filepath()
        data = FileDataStream.read_csv(path)
        e = Pipeline([
            OneHotVectorizer(columns={'edu': 'education'}),
            LightGbmRegressor(feature=['induced', 'edu'],
                              label='age',
                              number_of_threads=1)
        ])
        e.fit(data, verbose=0)
        metrics, _ = e.test(data)
        # TODO: debug fluctuations, and increase decimal precision on checks
        assert_almost_equal(metrics['L1(avg)'][0],
                            4.104164,
                            decimal=4,
                            err_msg="L1 loss should be %s" % 4.104164)
        assert_almost_equal(metrics['L2(avg)'][0],
                            24.15286,
                            decimal=4,
                            err_msg="L2(avg) should be %s" % 24.15286)
        assert_almost_equal(metrics['Loss-fn(avg)'][0],
                            24.15286,
                            decimal=4,
                            err_msg="Loss-fn(avg) should be %s" % 24.15286)
Example #23
    def test_syntax_slots_wo_pipeline(self):
        # data
        df = get_dataset("infert").as_df()
        df = df.drop(['row_num', ], axis=1)
        X = df.drop('case', axis=1)
        y = df['case']

        # transform
        xf1 = OneHotVectorizer(columns=['age', 'parity', 'education_str'])
        X_xf1 = xf1.fit_transform(X, verbose=0)
        assert "age.21" in list(X_xf1.columns)

        # learner
        # (1.a)
        model = AveragedPerceptronBinaryClassifier()

        # (1.b)
        try:
            model = AveragedPerceptronBinaryClassifier(feature=['age'])
            model.fit(X_xf1, y, verbose=0)
            cont = True
            # fit is expected to raise: OneHotVectorizer expanded 'age'
            # into slots such as 'age.21', so no plain 'age' column remains
            assert False
        except Exception as e:
            # does not work
            cont = False
            print(e)

        if cont:
            y_pred = model.predict(X_xf1)
            assert y_pred.shape == (248, 3)

        pipeline = Pipeline([
            OneHotVectorizer(columns=['age', 'parity', 'education_str']),
            AveragedPerceptronBinaryClassifier(feature='age')
        ])

        pipeline.fit(X, y, verbose=0)

        y_pred_withpipeline = pipeline.predict(X)
        print(y_pred_withpipeline.head())
        assert y_pred_withpipeline.shape == (248, 3)

        metrics, scores = pipeline.test(X, y, output_scores=True)
        print(metrics)
        assert scores.shape == (248, 3)
        assert metrics.shape == (1, 11)

        # back to X_xf1
        print(list(X_xf1.columns))
        l1 = list(sorted(set(_.split('.')[-1] for _ in X_xf1.columns)))
        levels = [['age', 'education', 'education_str', 'parity',
                   'pooled', 'spontaneous', 'stratum', 'induced'], [''] + l1]
        names = ['columns', 'slots']
        labels = [[], []]
        ages = []
        for _ in X_xf1.columns:
            spl = _.split('.')
            l1 = levels[0].index(spl[0])
            try:
                l2 = levels[1].index(spl[1])
            except IndexError:
                l2 = levels[1].index('')
            labels[0].append(l1)
            labels[1].append(l2)
            if spl[0] == 'age':
                ages.append(l2)
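        # note: pandas renamed MultiIndex's 'labels' argument to 'codes'
        # in version 0.24; this snippet targets the older API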
        X_xf1.columns = pandas.MultiIndex(
            levels=levels, labels=labels, names=names)
        print(X_xf1.head(n=2).T)

        col_ages = [('age', a) for a in ages]
        print(col_ages)
        try:
            model = AveragedPerceptronBinaryClassifier(feature=col_ages)
            model.fit(X_xf1, y, verbose=0)
            y_pred = model.predict(X_xf1)
            assert y_pred.shape == (248, 3)
        except Exception as e:
            # Does not work, probably confusion between list and tuple in nimbusml
            print(e)

        try:
            model = AveragedPerceptronBinaryClassifier(feature=['age'])
            model.fit(X_xf1, y, verbose=0)
            y_pred = model.predict(X_xf1)
            assert y_pred.shape == (248, 3)
        except Exception as e:
            # Does not work.
            print(e)
Example #24
def accuracy(ovr):
    pipe = Pipeline([ovr])
    pipe.fit(X_train, y_train)
    metrics, _ = pipe.test(X_train, y_train)
    return metrics
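Note: accuracy() expects a prebuilt one-vs-rest learner and relies on
module-level X_train and y_train. A hedged usage sketch (the wrapped binary
learner is an assumption; any nimbusml binary classifier would do):

    from nimbusml.linear_model import AveragedPerceptronBinaryClassifier
    from nimbusml.multiclass import OneVsRestClassifier

    ovr = OneVsRestClassifier(AveragedPerceptronBinaryClassifier())
    metrics = accuracy(ovr)
    print(metrics['Accuracy(micro-avg)'][0])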
Example #25
###############################################################################
# KMeansPlusPlus
import pandas
from nimbusml import Pipeline
from nimbusml.cluster import KMeansPlusPlus

# define 3 clusters with centroids (1,1,1), (11,11,11) and (-11,-11,-11)
X_train = pandas.DataFrame(data=dict(x=[0, 1, 2, 10, 11, 12, -10, -11, -12],
                                     y=[0, 1, 2, 10, 11, 12, -10, -11, -12],
                                     z=[0, 1, 2, 10, 11, 12, -10, -11, -12]))

# these should clearly belong to just 1 of the 3 clusters
X_test = pandas.DataFrame(data=dict(x=[-1, 3, 9, 13, -13, -20],
                                    y=[-1, 3, 9, 13, -13, -20],
                                    z=[-1, 3, 9, 13, -13, -20]))

y_test = pandas.DataFrame(data=dict(clusterid=[2, 2, 1, 1, 0, 0]))

pipe = Pipeline([KMeansPlusPlus(n_clusters=3)]).fit(X_train)

metrics, predictions = pipe.test(X_test, y_test, output_scores=True)

# print predictions
print(predictions.head())

# print evaluation metrics
print(metrics)
Example #26
from nimbusml import Pipeline
from nimbusml.datasets import get_dataset
from nimbusml.linear_model import LogisticRegressionClassifier
from nimbusml.preprocessing.schema import PrefixColumnConcatenator
from nimbusml.preprocessing.schema import ColumnDropper
from sklearn.model_selection import train_test_split

# use 'iris' data set to create test and train data
#    Sepal_Length  Sepal_Width  Petal_Length  Petal_Width Label Species  Setosa
# 0           5.1          3.5           1.4          0.2     0  setosa     1.0
# 1           4.9          3.0           1.4          0.2     0  setosa     1.0
df = get_dataset("iris").as_df()

X_train, X_test, y_train, y_test = \
    train_test_split(df.loc[:, df.columns != 'Label'], df['Label'])

concat = PrefixColumnConcatenator() << {'Sepal': 'Sepal_'}
concat1 = PrefixColumnConcatenator() << {'Petal': 'Petal_'}
dropcols = ColumnDropper() << [
    'Sepal_Length', 'Sepal_Width', 'Petal_Length', 'Petal_Width', 'Setosa',
    'Species'
]

pipeline = Pipeline(
    [concat, concat1, dropcols,
     LogisticRegressionClassifier()])
pipeline.fit(X_train, y_train)

# Evaluate the model
metrics, scores = pipeline.test(X_test, y_test, output_scores=True)
print(metrics)
Example #27
###############################################################################
# LightGbmRanker
import numpy as np
import pandas as pd
from nimbusml import Pipeline, Role
from nimbusml.datasets import get_dataset
from nimbusml.ensemble import LightGbmRanker

np.random.seed(0)
file_path = get_dataset("gen_tickettrain").as_filepath()

df = pd.read_csv(file_path)
df['group'] = df['group'].astype(np.uint32)

X = df.drop(['rank'], axis=1)
y = df['rank']

e = Pipeline([LightGbmRanker() << {Role.Feature: [
    'Class', 'dep_day', 'duration'], Role.Label: 'rank',
    Role.GroupId: 'group'}])

e.fit(df)

# test
metrics, scores = e.test(X, y, evaltype='ranking',
                         group_id='group', output_scores=True)
print(metrics)
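Note: the Role-based syntax above (Role.Feature / Role.Label / Role.GroupId
combined with the << operator) is interchangeable with the keyword-argument
form used in the other ranking examples:

    ranker = LightGbmRanker(feature=['Class', 'dep_day', 'duration'],
                            label='rank',
                            group_id='group')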
Example #28
import numpy as np
from nimbusml import Pipeline
from nimbusml.datasets import get_dataset
from nimbusml.feature_extraction.text import NGramFeaturizer
from nimbusml.feature_extraction.text.extractor import Ngram
from nimbusml.naive_bayes import NaiveBayesClassifier
from nimbusml.utils import get_X_y
from sklearn.model_selection import train_test_split

# use 'wiki_detox_train' data set to create test and train data
# Sentiment	SentimentText
# 1	  ==RUDE== Dude, you are rude upload that carl picture back, or else.
# 1	  == OK! ==  IM GOING TO VANDALIZE WILD ONES WIKI THEN!!!
np.random.seed(0)
train_file = get_dataset("wiki_detox_train").as_filepath()
(train, label) = get_X_y(train_file, label_column='Sentiment', sep='\t')

X_train, X_test, y_train, y_test = train_test_split(train, label)

# map text reviews to vector space
texttransform = NGramFeaturizer(
    word_feature_extractor=Ngram(),
    vector_normalizer='None') << 'SentimentText'
nb = NaiveBayesClassifier(feature=['SentimentText'])

ppl = Pipeline([texttransform, nb])
ppl.fit(X_train, y_train)

# evaluate the model
metrics, scores = ppl.test(X_test, y_test, output_scores=True)

print(metrics)