示例#1
0
    def test_pipeline_with_no_columns_raise(self):
        """Fit info is available, but fitting the tiny corpus must raise.

        An n-gram featurizer followed by a LightGBM classifier is built on a
        six-row sentiment frame. ``get_fit_info`` is expected to return two
        items, the first holding three entries, while ``fit`` is expected to
        raise ``RuntimeError``.
        """
        labels = [0, 1, 1, 0, 1, 1]
        sentences = [
            "this is train ", "review ", "sentence ", "an apple",
            "sentence 22", "another one one one"
        ]
        trainData = pd.DataFrame({
            "Sentiment": labels,
            "SentimentText": sentences
        })

        featurizer = NGramFeaturizer(word_feature_extractor=n_gram())
        classifier = LightGbmClassifier()
        ppl = Pipeline([featurizer, classifier])
        assert ppl is not None

        # Bug 147697
        info = ppl.get_fit_info(trainData[["SentimentText"]],
                                trainData["Sentiment"])
        assert len(info) == 2
        assert len(info[0]) == 3

        # Message reported when the fit fails:
        # System.InvalidOperationException:
        # 'LightGBM Error, code is -1, error message is
        # 'Cannot construct Dataset since there are not useful features.
        # It should be at least two unique rows.
        # If the num_row (num_data) is small,
        # you can set min_data=1 and min_data_in_bin=1 to fix this.
        # Otherwise please make sure you are using the right dataset.'
        with self.assertRaises(RuntimeError):
            ppl.fit(trainData[["SentimentText"]], trainData["Sentiment"])
示例#2
0
    def test_clone_sweep(self):
        """Grid-search a pipeline, then its clone; both must agree.

        The same parameter grid is swept on the original pipeline and on the
        result of ``pipe.clone()``; the selected ``number_of_trees`` has to
        be identical.
        """
        np.random.seed(0)
        read_options = dict(sep=',', encoding='utf-8')
        X_train, y_train = get_X_y(train_file, label_column, **read_options)
        X_test, y_test = get_X_y(test_file, label_column, **read_options)

        steps = [
            ('cat', OneHotHashVectorizer() << categorical_columns),
            ('learner', FastTreesBinaryClassifier(number_of_trees=100,
                                                  number_of_leaves=5)),
        ]
        pipe = Pipeline(steps=steps)

        swept_parameter = 'learner__number_of_trees'
        param_grid = {swept_parameter: [1, 5, 10]}

        grid = GridSearchCV(pipe, param_grid)
        grid.fit(X_train, y_train)

        grid1 = GridSearchCV(pipe.clone(), param_grid)
        grid1.fit(X_train, y_train)

        first_choice = grid.best_params_[swept_parameter]
        second_choice = grid1.best_params_[swept_parameter]
        assert first_choice == second_choice
示例#3
0
    def test_test(self):
        """``Pipeline.test`` must agree on both outputs of ``transform_data``.

        The pipeline is fitted twice (once on the first output of
        ``transform_data`` with the label named in the learner, once on the
        second output with the label passed separately); after each fit the
        metrics and scores for both inputs are compared.
        """
        transformed_data, transformed_data_df = transform_data()
        feature_names = ['parity', 'in', 'sp', 'stratum']
        flpipe = Pipeline(
            [FastLinearRegressor(feature=feature_names, label='age')])

        def assert_same_results():
            # Evaluate on both inputs and require identical outputs.
            metrics, scores = flpipe.test(
                transformed_data, output_scores=True)
            metrics_df, scores_df = flpipe.test(
                transformed_data_df, output_scores=True)
            assert_array_equal(scores, scores_df)
            assert_array_equal(metrics, metrics_df)

        flpipe.fit(transformed_data)
        assert_same_results()

        features_only = transformed_data_df.drop('age', axis=1)
        flpipe.fit(features_only, transformed_data_df['age'])
        assert_same_results()
示例#4
0
    def test_get_fit_info_fastl(self):
        """Check the first two fit-info entries of a Filter + regressor pipeline.

        The ``operator`` entries are stripped before comparison.
        """
        train_file = get_dataset("airquality").as_filepath()
        data = FileDataStream(train_file, DataSchema.read_schema(train_file))

        steps = [
            Filter(columns=['Ozone']),
            FastLinearRegressor(feature=['Solar_R', 'Temp'], label='Ozone'),
        ]
        pipeline = Pipeline(steps)
        info = pipeline.get_fit_info(data)

        all_columns = [
            'Unnamed0', 'Ozone', 'Solar_R', 'Wind', 'Temp', 'Month', 'Day'
        ]
        start_entry = {
            'name': None,
            'outputs': list(all_columns),
            'schema_after': list(all_columns),
            'type': 'start',
        }
        filter_entry = {
            'inputs': ['Ozone'],
            'name': 'Filter',
            'outputs': ['Ozone'],
            'schema_after': list(all_columns),
            'type': 'transform',
        }
        exp = [start_entry, filter_entry]

        described_steps = info[0]
        for entry in described_steps:
            entry.pop('operator', None)
        self.assertEqual(exp, described_steps[:2])
示例#5
0
    def test_pipeline_info(self):
        """Compare the fit info of a regression pipeline to a fixed description.

        The pipeline scales ``yy`` into ``new_y``, one-hot encodes the two
        categorical columns, drops ``yy`` and ends with a linear regressor.
        The first element returned by ``get_fit_info`` is compared, without
        its ``operator`` entries, to the expected list; on mismatch the
        actual value is raised inside an ``Exception``.
        """
        df = pandas.DataFrame({
            'education': ['A', 'B', 'A', 'B', 'A'],
            'workclass': ['X', 'X', 'Y', 'Y', 'Y'],
            'yy': [1.1, 2.2, 1.24, 3.4, 3.4],
        })

        scaler = MeanVarianceScaler() << {'new_y': 'yy'}
        encoder = OneHotVectorizer() << ['workclass', 'education']
        dropper = Drop() << 'yy'
        regressor = FastLinearRegressor() << {
            'Feature': ['workclass', 'education'],
            Role.Label: 'new_y'
        }
        pipeline = Pipeline([scaler, encoder, dropper, regressor])

        infos = pipeline.get_fit_info(df)[0]
        for entry in infos:
            entry.pop('operator', None)

        raw_schema = ['education', 'workclass', 'yy']
        schema_with_new_y = ['education', 'workclass', 'yy', 'new_y']
        schema_without_yy = ['education', 'workclass', 'new_y']
        exp = [
            {
                'name': None,
                'type': 'start',
                'outputs': list(raw_schema),
                'schema_after': list(raw_schema),
            },
            {
                'name': 'TypeConverter',
                'type': 'transform',
                'inputs': ['yy'],
                'outputs': ['new_y'],
                'schema_after': list(schema_with_new_y),
            },
            {
                'name': 'MeanVarianceScaler',
                'type': 'transform',
                'inputs': ['new_y'],
                'outputs': ['new_y'],
                'schema_after': list(schema_with_new_y),
            },
            {
                'name': 'OneHotVectorizer',
                'type': 'transform',
                'inputs': ['workclass', 'education'],
                'outputs': ['workclass', 'education'],
                'schema_after': list(schema_with_new_y),
            },
            {
                'name': 'ColumnDropper',
                'type': 'transform',
                'inputs': list(schema_with_new_y),
                'outputs': list(schema_without_yy),
                'schema_after': list(schema_without_yy),
            },
            {
                'name': 'FastLinearRegressor',
                'type': 'regressor',
                'inputs': ['Feature:education,workclass', 'Label:new_y'],
                'outputs': ['Score'],
                'schema_after': ['Score'],
            },
        ]
        if infos != exp:
            raise Exception(infos)
示例#6
0
    def test_syntax12_mixed2(self):
        """Mix constructor arguments and ``<<`` to assign column roles.

        After fitting, the last node must report the feature, label and
        weight column names; prediction must return a single ``Score``
        column with one row per input row.
        """
        X = pandas.DataFrame({
            'education': ['A', 'B', 'A', 'B', 'A'],
            'workclass': ['X', 'X', 'Y', 'Y', 'Y'],
            'weight': [10., 1., 1., 1., 1.],
            'y': [1.1, 2.2, 1.24, 3.4, 3.4],
        })

        encoder = OneHotVectorizer(columns=['workclass', 'education'])
        concat = Concat(columns={'Feature': ['workclass', 'education']})
        learner = FastTreesRegressor(
            num_trees=5, feature='Feature', weight='weight') << {
                Role.Label: 'y'}
        exp = Pipeline([encoder, concat, learner])
        exp.fit(X, verbose=0)

        expected_roles = (
            ('feature_column_', 'Feature'),
            ('label_column_', 'y'),
            ('weight_column_', 'weight'),
        )
        for attribute, expected in expected_roles:
            assert getattr(exp.nodes[-1], attribute) == expected

        # y is required here as well as weight.
        # Both are replaced by fake values.
        # The test does not fail but the weight is not taken into account.
        for column in ('y', 'weight'):
            X[column] = -5
        prediction = exp.predict(X)
        assert isinstance(prediction, pandas.DataFrame)
        assert list(prediction.columns) == ['Score']
        assert prediction.shape == (5, 1)
示例#7
0
    def test_syntax8_label(self):
        """Train on a label column produced inside the pipeline.

        ``yy`` is scaled into ``new_y``, which is then used as the label.
        Predictions must be a single ``Score`` column whose values stay
        within ``[0.4, 2.0]``.
        """
        df = pandas.DataFrame({
            'education': ['A', 'B', 'A', 'B', 'A'],
            'workclass': ['X', 'X', 'Y', 'Y', 'Y'],
            'yy': [1.1, 2.2, 1.24, 3.4, 3.4],
        })
        X = df.drop('yy', axis=1)

        steps = [
            MeanVarianceScaler() << {'new_y': 'yy'},
            OneHotVectorizer() << ['workclass', 'education'],
            Drop() << 'yy',
            FastLinearRegressor() << {'Feature': ['workclass', 'education'],
                                      Role.Label: 'new_y'},
        ]
        exp = Pipeline(steps)
        exp.fit(df, verbose=0)
        assert exp.nodes[-1].feature_column_ == 'Features'
        assert exp.nodes[-1].label_column_ == 'new_y'

        # The pipeline requires it now as it is transformed all along.
        X['yy'] = 0.0
        prediction = exp.predict(X, verbose=0)
        assert isinstance(prediction, pandas.DataFrame)
        assert list(prediction.columns) == ['Score']
        assert prediction.shape == (5, 1)
        too_low = prediction['Score'].min() < 0.4
        if too_low or prediction['Score'].max() > 2.00:
            raise Exception(prediction)
示例#8
0
def nimbus_pred(model_path, test_set_path):
    """Load a saved model and print its predictions for a CSV test set.

    Column ``c`` of the CSV is converted to the pandas ``category`` dtype
    before prediction.
    """
    frame = pd.read_csv(test_set_path)
    frame['c'] = frame['c'].astype("category")
    model = Pipeline()
    model.load_model(model_path)
    print(model.predict(frame))
示例#9
0
    def test_syntax4_dict(self):
        """Rename outputs with dict syntax and concatenate them for training.

        The classifier is trained on the ``Inputs`` column built by
        ``Concat``; prediction must yield the three binary-classifier
        output columns.
        """
        df = pandas.DataFrame({
            'education': ['A', 'B', 'A', 'B', 'A'],
            'workclass': ['X', 'X', 'Y', 'Y', 'Y'],
            'y': [1, 0, 1, 0, 0],
        })
        X = df.drop('y', axis=1)
        y = df['y']

        steps = [
            OneHotVectorizer() << {'edu1': 'education'},
            OneHotHashVectorizer() << {'edu2': 'education'},
            OneHotVectorizer(max_num_terms=2) << {'wki': 'workclass'},
            Concat() << {'Inputs': ['edu1', 'edu2', 'wki']},
            FastLinearBinaryClassifier(max_iterations=1) << 'Inputs',
        ]
        exp = Pipeline(steps)
        exp.fit(X, y)

        prediction = exp.predict(X)
        assert isinstance(prediction, pandas.DataFrame)
        output_columns = sorted(prediction.columns)
        assert output_columns == ['PredictedLabel', 'Probability', 'Score']
        assert prediction.shape == (5, 3)
示例#10
0
    def test_ensemble_supports_get_fit_info(self):
        """``get_fit_info`` must describe a trailing ``VotingRegressor``.

        Only the last entry of the first element returned by
        ``get_fit_info`` is inspected.
        """
        df = pd.DataFrame({
            'education': ['A', 'B', 'A', 'B', 'A'],
            'workclass': ['X', 'X', 'Y', 'Y', 'Y'],
            'yy': [1.1, 2.2, 1.24, 3.4, 3.4],
        })

        col_info = {'Feature': ['workclass', 'education'], Role.Label: 'new_y'}

        # Three regressors sharing the same column roles.
        estimators = [
            OrdinaryLeastSquaresRegressor(normalize="Yes") << col_info,
            OnlineGradientDescentRegressor(normalize="Yes") << col_info,
            LightGbmRegressor(normalize="Yes") << col_info,
        ]

        pipeline = Pipeline([
            MeanVarianceScaler() << {'new_y': 'yy'},
            OneHotVectorizer() << ['workclass', 'education'],
            ColumnDropper() << 'yy',
            VotingRegressor(estimators=estimators, combiner='Average')
        ])

        ensemble_info = pipeline.get_fit_info(df)[0][-1]

        self.assertEqual(ensemble_info['inputs'],
                         ['Feature:education,workclass', 'Label:new_y'])
        self.assertEqual(ensemble_info['name'], 'VotingRegressor')
        self.assertTrue(
            isinstance(ensemble_info['operator'], VotingRegressor))
        for key in ('outputs', 'schema_after'):
            self.assertEqual(ensemble_info[key], ['Score'])
        self.assertEqual(ensemble_info['type'], 'regressor')
示例#11
0
    def test_syntax6_change_role(self):
        """Drop every column but ``Features`` and train on it explicitly."""
        # REVIEW: the pipeline drops all columns but one -->
        # nimbusml still thinks the Features are education, workclass
        # and does not automatically detect that the only remaining
        # column should play that role
        # (maybe because the label column is here too even though
        # the only remaining column without a role is Features).
        df = pandas.DataFrame({
            'education': ['A', 'B', 'A', 'B', 'A'],
            'workclass': ['X', 'X', 'Y', 'Y', 'Y'],
            'y': [1, 0, 1, 0, 0],
        })
        X = df.drop('y', axis=1)
        y = df['y']

        encoders = [
            OneHotVectorizer() << {'f1': 'education'},
            OneHotHashVectorizer() << {'f2': 'education'},
            OneHotVectorizer(max_num_terms=2) << {'f3': 'workclass'},
        ]
        concat = Concat() << {'Features': ['f%d' % i for i in range(1, 4)]}
        dropper = Drop() << ['education', 'workclass', 'f1', 'f2', 'f3']
        learner = FastLinearBinaryClassifier(
            maximum_number_of_iterations=1) << ['Features']

        exp = Pipeline(encoders + [concat, dropper, learner])
        exp.fit(X, y, verbose=0)

        prediction = exp.predict(X)
        assert isinstance(prediction, pandas.DataFrame)
        output_columns = sorted(prediction.columns)
        assert output_columns == ['PredictedLabel', 'Probability', 'Score']
        assert prediction.shape == (5, 3)
示例#12
0
    def test_ensemble_supports_user_defined_transforms(self):
        """A voting ensemble behind a ``RangeFilter`` averages its members.

        Each regressor is first trained and evaluated on its own; the same
        three regressor types are then combined by an averaging
        ``VotingRegressor`` placed after a ``RangeFilter`` on ``c1``. The
        ensemble must return three rows whose scores match the average of
        the stand-alone predictions.
        """
        # DataFrame.append was removed in pandas 2.0; pd.concat is the
        # equivalent that works on every pandas version.
        extra_rows = pd.DataFrame({'c1': [9, 11], 'c2': [1, 1]})
        test2_df = pd.concat([test_df.copy(deep=True), extra_rows])

        r1 = OrdinaryLeastSquaresRegressor(**olsrArgs)
        r1.fit(train_df)
        result1 = r1.predict(test2_df)

        r2 = OnlineGradientDescentRegressor(**ogdArgs)
        r2.fit(train_df)
        result2 = r2.predict(test2_df)

        r3 = LightGbmRegressor(**lgbmArgs)
        r3.fit(train_df)
        result3 = r3.predict(test2_df)

        # Fresh, unfitted instances for the ensemble.
        r1 = OrdinaryLeastSquaresRegressor(**olsrArgs)
        r2 = OnlineGradientDescentRegressor(**ogdArgs)
        r3 = LightGbmRegressor(**lgbmArgs)

        pipeline = Pipeline([
            RangeFilter(min=0, max=10, columns='c1'),
            VotingRegressor(estimators=[r1, r2, r3], combiner='Average')
        ])
        pipeline.fit(train_df)
        result4 = pipeline.predict(test2_df)

        self.assertEqual(len(result4), 3)

        for row in range(3):
            average = (result1[row] + result2[row] + result3[row]) / 3
            self.assertAlmostEqual(average, result4.loc[row, 'Score'],
                                   places=5)
示例#13
0
    def test_predictor_loaded_from_zip_has_feature_contributions(self):
        """Feature contributions must survive a save/load round trip.

        A classifier is trained, saved to a zip file and loaded back into an
        empty ``Pipeline``. The loaded model must expose a
        ``FeatureContributions.<feature>`` column for every feature, equal
        to the column produced by the original model.
        """
        features = ['age', 'education-num', 'hours-per-week']

        model_nimbusml = FastLinearBinaryClassifier(feature=features)
        model_nimbusml.fit(train, label)
        fc = model_nimbusml.get_feature_contributions(test)

        model_filename = 'nimbusml_model.zip'
        try:
            # Save the model to zip
            model_nimbusml.save_model(model_filename)
            # Load the model from zip
            model_nimbusml_zip = Pipeline()
            model_nimbusml_zip.load_model(model_filename)

            fc_zip = model_nimbusml_zip.get_feature_contributions(test)

            columns = ['FeatureContributions.' + feature
                       for feature in features]

            # Asserting a non-empty list is always true, so the per-feature
            # checks have to be reduced with all().
            assert all(column in fc_zip.columns for column in columns)
            assert all(fc[column].equals(fc_zip[column])
                       for column in columns)
        finally:
            # Remove the model file even when a check above fails.
            if os.path.exists(model_filename):
                os.remove(model_filename)
示例#14
0
 def test_PcaTransformer_int(self):
     """Run PCA on integer and on float inputs and compare the outputs.

     The summed outputs must be almost equal for both input types, while
     the sorted lists of output dtypes are required to differ.
     """
     source = get_dataset("infert").as_df()
     X = ['age', 'parity', 'spontaneous', 'stratum']
     res = {}
     dt = {}
     for ty in (int, float):
         df = source.copy()
         for column in X:
             df[column] = df[column].astype(ty)
         steps = [
             ColumnConcatenator() << {'X': X},
             PcaTransformer(rank=3) << 'X',
         ]
         pipe = Pipeline(steps)
         y = pipe.fit_transform(df[X], verbose=0)
         res[ty] = y.sum().sum()
         dt[ty] = list(y.dtypes)

     first_total, second_total = list(res.values())
     assert_almost_equal(first_total, second_total)

     first_types, second_types = list(dt.values())
     first_types.sort()
     second_types.sort()
     assert first_types != second_types
示例#15
0
    def test_metrics_evaluate_clusterer(self):
        """Check the NMI and AvgMinScore metrics of a two-cluster K-Means.

        The iris data set is reduced to a binary label, split into train
        and test parts, and a ``KMeansPlusPlus`` pipeline is fitted and
        evaluated. Both checks are deliberately loose (``decimal=0`` and
        ``decimal=2``).
        """
        np.random.seed(0)
        df = get_dataset("iris").as_df()
        df.drop(['Species'], inplace=True, axis=1)
        df.Label = [1 if x == 1 else 0 for x in df.Label]
        X_train, X_test, y_train, y_test = \
            train_test_split(df.loc[:, df.columns != 'Label'], df['Label'])

        lr = KMeansPlusPlus(n_clusters=2, initialization_algorithm="Random")
        e = Pipeline([lr])
        e.fit(X_train, y_train.to_frame(), verbose=0)
        metrics, _ = e.test(X_test, y_test)

        # Expected values are named once so that the failure messages cannot
        # drift away from the values actually checked.
        expected_nmi = 0.7
        expected_avg_min_score = 0.032
        assert_almost_equal(metrics['NMI'][0],
                            expected_nmi,
                            decimal=0,
                            err_msg="NMI loss should be %s" % expected_nmi)
        assert_almost_equal(metrics['AvgMinScore'][0],
                            expected_avg_min_score,
                            decimal=2,
                            err_msg="AvgMinScore  should be %s" %
                                    expected_avg_min_score)
示例#16
0
    def test_syntax4_fail(self):
        """Passing several feature columns without ``Concat`` must fail.

        The ``RuntimeError`` message is expected to suggest the missing
        concatenation step.
        """
        df = pandas.DataFrame({
            'education': ['A', 'B', 'A', 'B', 'A'],
            'workclass': ['X', 'X', 'Y', 'Y', 'Y'],
            'y': [1, 0, 1, 0, 0],
        })
        X = df.drop('y', axis=1)
        y = df['y']

        steps = [
            OneHotVectorizer() << {'edu1': 'education'},
            OneHotHashVectorizer() << {'edu2': 'education'},
            OneHotVectorizer(max_num_terms=2) << {'wki': 'workclass'},
            FastLinearBinaryClassifier(max_iterations=1) <<
            ['edu1', 'edu2', 'wki'],
        ]
        exp = Pipeline(steps)

        try:
            exp.fit(X, y)
        except RuntimeError as e:
            suggestion = \
                "ConcatTransform() << {'Input': ['edu1', 'edu2', 'wki']}"
            assert suggestion in str(e)
        else:
            # Fitting succeeded although an error was expected.
            assert False
示例#17
0
    def test_syntax10_weights(self):
        """Pass the weight column through the ``weight`` argument of ``fit``.

        After fitting, the last node must report the feature, label and
        weight column names. Predictions must be a single ``Score`` column
        within ``[1.0, 3.6]`` holding at least four distinct values.
        """
        df = pandas.DataFrame({
            'education': ['A', 'B', 'A', 'B', 'A'],
            'workclass': ['X', 'X', 'Y', 'Y', 'Y'],
            'weight': [1., 1., 1., 2., 1.],
            'y': [1.1, 2.2, 1.24, 3.4, 3.4],
        })
        X = df.drop(['y', 'weight'], axis=1)
        y = df['y']
        w = df['weight']

        encoder = OneHotVectorizer() << ['workclass', 'education']
        exp = Pipeline([encoder, FastLinearRegressor()])
        exp.fit(X, y, weight=w, verbose=0)

        expected_roles = (
            ('feature_column', 'Features'),
            ('label_column', 'y'),
            ('weight_column', 'weight'),
        )
        for attribute, expected in expected_roles:
            assert getattr(exp.nodes[-1], attribute) == expected

        X['weight'] = -5
        prediction = exp.predict(X)
        assert isinstance(prediction, pandas.DataFrame)
        assert list(prediction.columns) == ['Score']
        assert prediction.shape == (5, 1)

        out_of_range = (prediction['Score'].min() < 1.
                        or prediction['Score'].max() > 3.6)
        if out_of_range or len(set(prediction['Score'])) < 4:
            raise Exception(prediction)
示例#18
0
    def test_syntax4_fail2(self):
        """Referencing an unknown feature column must fail with a clear message.

        ``edu4`` is never produced by the pipeline, so ``fit`` is expected
        to raise an exception mentioning that column.
        """
        df = pandas.DataFrame(
            dict(education=['A', 'B', 'A', 'B', 'A'],
                 workclass=['X', 'X', 'Y', 'Y', 'Y'],
                 y=[1, 0, 1, 0, 0]))
        X = df.drop('y', axis=1)
        y = df['y']

        exp = Pipeline([
            OneHotVectorizer() << {
                'edu1': 'education'
            },
            OneHotHashVectorizer() << {
                'edu2': 'education'
            },
            OneHotVectorizer(max_num_terms=2) << {
                'wki': 'workclass'
            },
            FastLinearBinaryClassifier(max_iterations=1) <<
            ['edu1', 'edu4', 'wki']
        ])
        try:
            exp.fit(X, y)
        except Exception as e:
            assert "Feature column 'edu4' not found" in str(e)
        else:
            # Raised outside the try body: AssertionError is an Exception,
            # so raising it inside would be caught by the handler above and
            # reported as a misleading message mismatch.
            raise AssertionError("The test should not reach this line.")
示例#19
0
    def test_syntax7_rename(self):
        """Convert a string label inside the pipeline and train on it.

        The prediction is expected to be a single ``Score`` column whose
        values lie strictly between 0.01 and 0.05.
        """
        # Error messages are usually not informative enough.
        # Missing column --> no indication of other columns.
        # Error is (one transform should handle it)
        # 'The label column 'y' of the training data has a data type
        # not suitable for binary classification: Vec<Key<U4, 0-1>, 2>.
        # Type must be Bool, R4, R8 or Key with two classes.

        df = pandas.DataFrame(
            dict(education=['A', 'B', 'A', 'B', 'A'],
                 workclass=['X', 'X', 'Y', 'Y', 'Y'],
                 y=['red', 'white', 'red', 'white', 'white']))
        X = df.drop('y', axis=1)
        y = df['y']

        exp = Pipeline([
            OneHotVectorizer() << 'y',
            OneHotVectorizer() << ['workclass', 'education'],
            TypeConverter(result_type='R4') << {'yi': 'y'},
            Drop() << 'y',
            FastLinearBinaryClassifier(max_iterations=1) << 'yi'
        ])
        exp.fit(X, y, verbose=0)
        prediction = exp.predict(X)
        assert isinstance(prediction, pandas.DataFrame)
        assert list(prediction.columns) == ['Score']
        assert prediction.shape == (5, 1)
        # DataFrame.min()/max() return a Series, whose truth value is
        # ambiguous and raises ValueError inside ``assert``; compare the
        # scalar extremes of the Score column instead.
        scores = prediction['Score']
        assert scores.min() > 0.01
        assert scores.max() < 0.05
示例#20
0
    def test_syntax5_regular_expression(self):
        """Select the columns to concatenate with a regular expression."""
        # REVIEW: not implemented yet
        # The best would be to handle regular expressions inside nimbusml.
        # It could be handled in entrypoint.py just before calling nimbusml.
        # It can be handled inside Pipeline if it is aware of
        # the input schema.

        df = pandas.DataFrame({
            'education': ['A', 'B', 'A', 'B', 'A'],
            'workclass': ['X', 'X', 'Y', 'Y', 'Y'],
            'y': [1, 0, 1, 0, 0],
        })
        X = df.drop('y', axis=1)
        y = df['y']

        steps = [
            OneHotVectorizer() << {'f1': 'education'},
            OneHotHashVectorizer() << {'f2': 'education'},
            OneHotVectorizer(max_num_terms=2) << {'f3': 'workclass'},
            Concat() << {'Features': 'f[0-9]+'},
            FastLinearBinaryClassifier(max_iterations=1) << 'Features',
        ]
        exp = Pipeline(steps)
        exp.fit(X, y)

        prediction = exp.predict(X)
        assert isinstance(prediction, pandas.DataFrame)
        output_columns = sorted(prediction.columns)
        assert output_columns == ['PredictedLabel', 'Probability', 'Score']
        assert prediction.shape == (5, 3)
示例#21
0
    def test_unpickled_pipeline_has_feature_contributions(self):
        """Feature contributions must survive a pickle round trip.

        A one-step pipeline is trained, pickled to a temporary file and
        loaded back. The unpickled pipeline must expose a
        ``FeatureContributions.<feature>`` column for every feature, equal
        to the column produced by the original pipeline.
        """
        features = ['age', 'education-num', 'hours-per-week']

        model_nimbusml = Pipeline(
            steps=[FastLinearBinaryClassifier(feature=features)])
        model_nimbusml.fit(train, label)
        fc = model_nimbusml.get_feature_contributions(test)

        pickle_filename = get_temp_file(suffix='.p')
        try:
            # Save with pickle
            with open(pickle_filename, 'wb') as f:
                pickle.dump(model_nimbusml, f)
            # Unpickle model (the file was written by this test just above)
            with open(pickle_filename, "rb") as f:
                model_nimbusml_pickle = pickle.load(f)

            fc_pickle = model_nimbusml_pickle.get_feature_contributions(test)

            columns = ['FeatureContributions.' + feature
                       for feature in features]

            # Asserting a non-empty list is always true, so the per-feature
            # checks have to be reduced with all().
            assert all(column in fc_pickle.columns for column in columns)
            assert all(fc[column].equals(fc_pickle[column])
                       for column in columns)
        finally:
            # Remove the pickle file even when a check above fails.
            if os.path.exists(pickle_filename):
                os.remove(pickle_filename)
示例#22
0
    def test_syntax6_regular_expression(self):
        """Drop columns with the ``'~Features'`` selector before training.

        The classifier is given no explicit column role; prediction must
        yield the three binary-classifier output columns.
        """
        df = pandas.DataFrame({
            'education': ['A', 'B', 'A', 'B', 'A'],
            'workclass': ['X', 'X', 'Y', 'Y', 'Y'],
            'y': [1, 0, 1, 0, 0],
        })
        X = df.drop('y', axis=1)
        y = df['y']

        steps = [
            OneHotVectorizer() << {'f1': 'education'},
            OneHotHashVectorizer() << {'f2': 'education'},
            OneHotVectorizer(max_num_terms=2) << {'f3': 'workclass'},
            Concat() << {'Features': ['f%d' % i for i in range(1, 4)]},
            Drop() << '~Features',
            FastLinearBinaryClassifier(max_iterations=1),
        ]
        exp = Pipeline(steps)
        exp.fit(X, y)

        prediction = exp.predict(X)
        assert isinstance(prediction, pandas.DataFrame)
        output_columns = sorted(prediction.columns)
        assert output_columns == ['PredictedLabel', 'Probability', 'Score']
        assert prediction.shape == (5, 3)
示例#23
0
    def test_pipeline_pca(self):
        """Inspect the fit info of a one-step PCA pipeline on a numpy array.

        NOTE(review): the final check passes as soon as *any* described step
        differs from the expected one, and the last ``schema_after`` entry
        is overwritten with a list rather than a string -- confirm whether
        an equality check was intended.
        """
        X = numpy.array([[1.0, 2, 3], [2, 3, 4], [3, 4, 5]])
        pipeline = Pipeline([PcaTransformer(rank=2)])
        infos = pipeline.get_fit_info(X)[0]
        for entry in infos:
            entry.pop('operator', None)

        input_columns = ['F0', 'F1', 'F2']
        expected = [
            {
                'name': None,
                'type': 'start',
                'outputs': list(input_columns),
                'schema_after': list(input_columns),
            },
            {
                'name': 'TypeConverter',
                'type': 'transform',
                'inputs': list(input_columns),
                'outputs': list(input_columns),
                'schema_after': list(input_columns),
            },
            {
                'name': 'PcaTransformer',
                'type': 'transform',
                'inputs': ['temp_'],
                'outputs': ['temp_'],
                'schema_after': input_columns + ['temp_'],
            },
        ]

        # The generated column name depends on id(node), which differs at
        # each execution, so it is overwritten before comparing.
        pca_info = infos[-1]
        pca_info["inputs"] = ["temp_"]
        pca_info["outputs"] = ["temp_"]
        pca_info["schema_after"][-1] = ["temp_"]

        self.assertTrue(any(x != y for x, y in zip(expected, infos)))
示例#24
0
    def test_syntax11_learner(self):
        """Declare features and label on the learner and fit on one frame.

        The label column is part of the frame given to ``fit``; prediction
        is run on the frame without it.
        """
        df = pandas.DataFrame({
            'education': ['A', 'B', 'A', 'B', 'A'],
            'workclass': ['X', 'X', 'Y', 'Y', 'Y'],
            'y': [1, 0, 1, 0, 0],
        })
        X = df.drop('y', axis=1)

        learner = FastLinearBinaryClassifier(max_iterations=1) << {
            'Features': ['edu1', 'edu2'],
            Role.Label: 'y'
        }
        exp = Pipeline([
            OneHotVectorizer() << {'edu1': 'education'},
            OneHotHashVectorizer() << {'edu2': 'education'},
            learner,
        ])
        exp.fit(df)

        prediction = exp.predict(X)
        assert isinstance(prediction, pandas.DataFrame)
        output_columns = sorted(prediction.columns)
        assert output_columns == ['PredictedLabel', 'Probability', 'Score']
        assert prediction.shape == (5, 3)
示例#25
0
    def test_datetime_column_parsed_from_string(self):
        """A datetime column parsed from CSV must pass through the pipeline.

        Two dates are written to a temporary CSV file, read back with
        ``parse_dates`` and sent through a pipeline that only handles
        ``c2``. The ``c1`` column of the result must keep its dtype and
        its date/time components.
        """
        dates = ["2018-01-02", "2018-02-01"]
        df = pd.DataFrame({'c1': dates, 'c2': [3, 4]})

        file_name = get_temp_file('.csv')
        try:
            df.to_csv(file_name)
            df = pd.read_csv(file_name, parse_dates=['c1'], index_col=0)

            # .iloc makes the positional lookup explicit; plain [0] on a
            # label-indexed Series is deprecated in recent pandas.
            self.assertEqual(df.dtypes.iloc[0], np.dtype('datetime64[ns]'))

            pipeline = Pipeline(steps=[Handler(columns={'c2': 'c2'})])
            result = pipeline.fit_transform(df)

            expected_dates = [(2018, 1, 2), (2018, 2, 1)]
            for row, (year, month, day) in enumerate(expected_dates):
                value = result.loc[row, 'c1']
                self.assertEqual(value.year, year)
                self.assertEqual(value.month, month)
                self.assertEqual(value.day, day)
                self.assertEqual(value.hour, 0)
                self.assertEqual(value.minute, 0)
                self.assertEqual(value.second, 0)

            self.assertEqual(len(result), 2)
            self.assertEqual(result.dtypes.iloc[0],
                             np.dtype('datetime64[ns]'))
        finally:
            # Remove the temporary file even when a check above fails.
            if os.path.exists(file_name):
                os.remove(file_name)
示例#26
0
    def test_syntax3(self):
        """Mix dict and string column selectors, with no role on the learner."""
        df = pandas.DataFrame({
            'education': ['A', 'B', 'A', 'B', 'A'],
            'workclass': ['X', 'X', 'Y', 'Y', 'Y'],
            'y': [1, 0, 1, 0, 0],
        })
        X = df.drop('y', axis=1)
        y = df['y']

        transforms = [
            OneHotVectorizer() << {'edu1': 'education'},
            OneHotHashVectorizer() << 'education',
            OneHotVectorizer(max_num_terms=2) << 'workclass',
        ]
        # Currently the learner does not use edu1
        # unless it is specified explicitly, so nimbusml
        # does not do what the syntax implicitly tells.
        # We need to modify either the bridge to look into
        # every available column at one step.
        learner = FastLinearBinaryClassifier(max_iterations=1)

        exp = Pipeline(transforms + [learner])
        exp.fit(X, y)

        prediction = exp.predict(X)
        assert isinstance(prediction, pandas.DataFrame)
        output_columns = sorted(prediction.columns)
        assert output_columns == ['PredictedLabel', 'Probability', 'Score']
        assert prediction.shape == (5, 3)
示例#27
0
 def test_model_summary_not_supported(self):
     """``summary`` must raise ``TypeError`` for every listed learner.

     Each learner from ``learners_not_supported`` is fitted behind a
     one-hot encoder on a fresh file stream before ``summary`` is called.
     """
     for learner in learners_not_supported:
         encoder = OneHotVectorizer() << categorical_columns
         pipeline = Pipeline([encoder, learner])
         train_stream = FileDataStream(train_file, schema=file_schema)
         pipeline.fit(train_stream, label_column)
         assert_raises(TypeError, pipeline.summary)
示例#28
0
    def test_metrics_evaluate_regressor(self):
        """Check the averaged loss metrics of a ``FastTreesRegressor``.

        The iris data set is reduced to a binary label, split into train
        and test parts, and the fitted pipeline is evaluated. Every metric
        is compared with ``decimal=1``.
        """
        np.random.seed(0)
        df = get_dataset("iris").as_df()
        df.drop(['Species'], inplace=True, axis=1)
        df.Label = [1 if x == 1 else 0 for x in df.Label]
        features = df.loc[:, df.columns != 'Label']
        X_train, X_test, y_train, y_test = train_test_split(
            features, df['Label'])

        e = Pipeline([FastTreesRegressor()])
        e.fit(X_train, y_train.to_frame(), verbose=0)
        metrics, _ = e.test(X_test, y_test)

        # TODO: debug fluctuations, and increase decimal precision on checks
        expectations = (
            ('L1(avg)', 0.107, "L1 loss should be %s"),
            ('L2(avg)', 0.0453, "L2(avg) should be %s"),
            ('Loss-fn(avg)', 0.0453, "Loss-fn(avg)loss should be %s"),
        )
        for metric_name, expected, message in expectations:
            assert_almost_equal(metrics[metric_name][0],
                                expected,
                                decimal=1,
                                err_msg=message % expected)
示例#29
0
    def test_globalcontrastrowscaler(self):
        """Concatenate three columns, normalize them and check the total.

        The sum over the concatenated and the normalized output columns
        must fall strictly between 17.309 and 17.3102.
        """
        in_df = pd.DataFrame(
            data=dict(Sepal_Length=[2.5, 1, 2.1, 1.0],
                      Sepal_Width=[.75, .9, .8, .76],
                      Petal_Length=[0, 2.5, 2.6, 2.4],
                      Species=["setosa", "viginica", "setosa", 'versicolor']))

        in_df.iloc[:, 0:3] = in_df.iloc[:, 0:3].astype(np.float32)

        # Gather the three numeric columns into 'concated_columns'.
        concat = ColumnConcatenator() << {
            'concated_columns':
            ['Petal_Length', 'Sepal_Width', 'Sepal_Length']
        }

        # Performs a global contrast normalization on input values:
        # Y = (s * X - M) / D, where s is a scale, M is mean and D is either
        # L2 norm or standard deviation
        normed = GlobalContrastRowScaler() << {
            'normed_columns': 'concated_columns'
        }

        pipeline = Pipeline([concat, normed])
        out_df = pipeline.fit_transform(in_df)

        slots = ['Sepal_Length', 'Sepal_Width', 'Petal_Length']
        cols = ['concated_columns.' + s for s in slots]
        cols.extend('normed_columns.' + s for s in slots)

        # Named ``total`` to avoid shadowing the builtin ``sum``; the bounds
        # are named once so each message reports the bound actually checked.
        total = out_df[cols].sum().sum()
        lower, upper = 17.309, 17.3102
        assert_greater(total, lower, "sum should be greater than %s" % lower)
        assert_less(total, upper, "sum should be less than %s" % upper)
示例#30
0
    def test_lpscaler_automatically_converts_to_single(self):
        """``LpScaler`` must accept float64 input columns.

        The sum over the concatenated and the normalized output columns
        must fall strictly between 23.24 and 23.25.
        """
        frame_data = dict(
            Sepal_Length=[2.5, 1, 2.1, 1.0],
            Sepal_Width=[.75, .9, .8, .76],
            Petal_Length=[0, 2.5, 2.6, 2.4],
            Species=["setosa", "viginica", "setosa", 'versicolor'])
        in_df = pd.DataFrame(data=frame_data)

        in_df.iloc[:, 0:3] = in_df.iloc[:, 0:3].astype(np.float64)

        src_cols = ['Sepal_Length', 'Sepal_Width', 'Petal_Length']

        concat = ColumnConcatenator() << {'concat': src_cols}
        scaler = LpScaler() << {'norm': 'concat'}
        pipeline = Pipeline([concat, scaler])
        out_df = pipeline.fit_transform(in_df)

        cols = [prefix + s
                for prefix in ('concat.', 'norm.')
                for s in src_cols]
        total = out_df[cols].sum().sum()
        lower, upper = (23.24, 23.25)
        assert_greater(total, lower,
                       "sum should be greater than %s" % lower)
        assert_less(total, upper,
                    "sum should be less than %s" % upper)