Exemplo n.º 1
0
 def test_linear_file_role(self):
     """Train from a file stream whose label is assigned through a role.

     The label column is attached to the training stream with
     ``_set_role`` instead of being passed to ``fit``. Predictions on the
     test stream are handed to ``check_accuracy`` together with 0.65
     (presumably the minimum accepted accuracy).
     """
     steps = [
         OneHotVectorizer() << categorical_columns,
         FastLinearBinaryClassifier(number_of_threads=1, shuffle=False),
     ]
     pipeline = Pipeline(steps)

     training_data = FileDataStream(train_file, schema=file_schema)
     training_data._set_role('Label', label_column)
     pipeline.fit(training_data)

     scoring_data = FileDataStream(test_file, schema=file_schema)
     predictions = pipeline.predict(scoring_data)
     check_accuracy(test_file, label_column, predictions, 0.65)
Exemplo n.º 2
0
    def test_syntax2(self):
        """Fit on a DataFrame with one vectorizer bound per column name.

        Each ``OneHotVectorizer`` is bound to a single column with ``<<``
        and the classifier receives no explicit column. The prediction must
        be a DataFrame holding exactly the three columns asserted below,
        with one row per input row.
        """
        frame = pandas.DataFrame({
            'education': ['A', 'B', 'A', 'B', 'A'],
            'workclass': ['X', 'X', 'Y', 'Y', 'Y'],
            'y': [1, 0, 1, 0, 0],
        })
        X = frame.drop('y', axis=1)
        y = frame['y']

        steps = [
            OneHotVectorizer() << 'education',
            OneHotVectorizer(max_num_terms=2) << 'workclass',
            FastLinearBinaryClassifier(max_iterations=1),
        ]
        exp = Pipeline(steps)
        exp.fit(X, y)

        prediction = exp.predict(X)
        expected_columns = ['PredictedLabel', 'Probability', 'Score']
        assert isinstance(prediction, pandas.DataFrame)
        assert sorted(prediction.columns) == expected_columns
        assert prediction.shape == (5, 3)
Exemplo n.º 3
0
    def test_syntax5(self):
        """Fit a pipeline that renames outputs and concatenates them.

        The three vectorizers write to ``f1``..``f3`` through the
        ``{output: input}`` mapping syntax, ``Concat`` merges them into
        ``Features`` and the classifier is bound to that column.
        """
        frame = pandas.DataFrame({
            'education': ['A', 'B', 'A', 'B', 'A'],
            'workclass': ['X', 'X', 'Y', 'Y', 'Y'],
            'y': [1, 0, 1, 0, 0],
        })
        X = frame.drop('y', axis=1)
        y = frame['y']

        steps = [
            OneHotVectorizer() << {'f1': 'education'},
            OneHotHashVectorizer() << {'f2': 'education'},
            OneHotVectorizer(max_num_terms=2) << {'f3': 'workclass'},
            Concat() << {'Features': ['f%d' % idx for idx in range(1, 4)]},
            FastLinearBinaryClassifier(
                maximum_number_of_iterations=1) << 'Features',
        ]
        exp = Pipeline(steps)
        exp.fit(X, y, verbose=0)

        prediction = exp.predict(X)
        expected_columns = ['PredictedLabel', 'Probability', 'Score']
        assert isinstance(prediction, pandas.DataFrame)
        assert sorted(prediction.columns) == expected_columns
        assert prediction.shape == (5, 3)
Exemplo n.º 4
0
 def test_GamBinaryClassifier(self):
     """Train ``GamBinaryClassifier`` on the infert data and check accuracy.

     The ``education_str`` column is one-hot encoded, the frame is split
     with ``train_test_split`` (label column ``case``) and the mean
     agreement between predictions and the held-out labels must exceed
     0.70.
     """
     # Seed numpy's global generator before the split, which is called
     # without an explicit random_state.
     np.random.seed(0)
     df = get_dataset("infert").as_df()
     # Drop the ': ' sequence from every column name.
     df.columns = [i.replace(': ', '') for i in df.columns]
     df = (OneHotVectorizer() << 'education_str').fit_transform(df)
     X_train, X_test, y_train, y_test = \
         train_test_split(df.loc[:, df.columns != 'case'], df['case'])
     lr = GamBinaryClassifier().fit(X_train, y_train)
     scores = lr.predict(X_test)
     acc = np.mean(y_test == list(scores))
     # The original message lacked its verb ("accuracy should  %s").
     assert_greater(acc, 0.70, "accuracy should be %s" % 0.70)
Exemplo n.º 5
0
    def test_syntax4_fail2(self):
        """Expect ``fit`` to fail when a feature column does not exist.

        The classifier is bound to ``edu4``, which no earlier step
        produces. ``fit`` must raise an exception whose text names the
        missing feature column.
        """
        df = pandas.DataFrame(dict(education=['A', 'B', 'A', 'B', 'A'],
                                   workclass=['X', 'X', 'Y', 'Y', 'Y'],
                                   y=[1, 0, 1, 0, 0]))
        X = df.drop('y', axis=1)
        y = df['y']

        exp = Pipeline([
            OneHotVectorizer() << {'edu1': 'education'},
            OneHotHashVectorizer() << {'edu2': 'education'},
            OneHotVectorizer(max_num_terms=2) << {'wki': 'workclass'},
            FastLinearBinaryClassifier(maximum_number_of_iterations=1) << [
                'edu1', 'edu4', 'wki']
        ])
        try:
            exp.fit(X, y, verbose=0)
        except Exception as e:
            assert "Feature column 'edu4' not found" in str(e)
        else:
            # Raised outside the ``try`` so the broad ``except`` above
            # cannot swallow it and report a misleading message mismatch.
            raise AssertionError("The test should not reach this line.")
Exemplo n.º 6
0
    def test_syntax4_fail(self):
        """Expect ``fit`` to reject a learner bound to a list of columns.

        The classifier is bound directly to three columns. ``fit`` must
        raise ``RuntimeError`` and its text must contain the
        ``ConcatTransform`` snippet asserted below.
        """
        frame = pandas.DataFrame({
            'education': ['A', 'B', 'A', 'B', 'A'],
            'workclass': ['X', 'X', 'Y', 'Y', 'Y'],
            'y': [1, 0, 1, 0, 0],
        })
        X = frame.drop('y', axis=1)
        y = frame['y']

        steps = [
            OneHotVectorizer() << {'edu1': 'education'},
            OneHotHashVectorizer() << {'edu2': 'education'},
            OneHotVectorizer(max_num_terms=2) << {'wki': 'workclass'},
            FastLinearBinaryClassifier(maximum_number_of_iterations=1) << [
                'edu1', 'edu2', 'wki'],
        ]
        exp = Pipeline(steps)

        expected_hint = \
            "ConcatTransform() << {'Input': ['edu1', 'edu2', 'wki']}"
        try:
            exp.fit(X, y, verbose=0)
        except RuntimeError as err:
            assert expected_hint in str(err)
        else:
            # fit() returned normally, which this test does not allow.
            assert False
Exemplo n.º 7
0
    def test_syntax4_columns(self):
        """Fit a pipeline whose columns are given via ``columns=`` kwargs.

        The transforms receive their ``{output: input}`` mappings through
        the ``columns`` constructor argument instead of ``<<``. The
        classifier is bound to the concatenated ``Inputs`` column.
        """
        frame = pandas.DataFrame({
            'education': ['A', 'B', 'A', 'B', 'A'],
            'workclass': ['X', 'X', 'Y', 'Y', 'Y'],
            'y': [1, 0, 1, 0, 0],
        })
        X = frame.drop('y', axis=1)
        y = frame['y']

        steps = [
            OneHotVectorizer(columns={'edu1': 'education'}),
            OneHotHashVectorizer(columns={'edu2': 'education'}),
            OneHotVectorizer(max_num_terms=2, columns={'wki': 'workclass'}),
            Concat(columns={'Inputs': ['edu1', 'edu2', 'wki']}),
            FastLinearBinaryClassifier(
                maximum_number_of_iterations=1) << 'Inputs',
        ]
        exp = Pipeline(steps)
        exp.fit(X, y, verbose=0)

        prediction = exp.predict(X)
        expected_columns = ['PredictedLabel', 'Probability', 'Score']
        assert isinstance(prediction, pandas.DataFrame)
        assert sorted(prediction.columns) == expected_columns
        assert prediction.shape == (5, 3)
Exemplo n.º 8
0
    def test_syntax_concat_slots(self):
        """Fit and run a transform-only pipeline ending in ``Concat``.

        There are no assertions: the test passes when ``fit`` and
        ``predict`` complete without raising.
        """
        X = pandas.DataFrame({
            'education': ['A', 'B', 'A', 'B', 'A'],
            'workclass': ['X', 'X', 'Y', 'Y', 'Y'],
            'weight': [10., 1., 1., 1., 1.],
            'y': [1.1, 2.2, 1.24, 3.4, 3.4],
        })

        encoded_columns = ['workclass', 'education']
        steps = [
            OneHotVectorizer() << ['workclass', 'education'],
            Concat() << {'newcol': encoded_columns},
        ]
        exp = Pipeline(steps)
        exp.fit(X, verbose=0)
        exp.predict(X)
Exemplo n.º 9
0
    def test_linear_file(self):
        """Train from a file stream with the label passed to ``fit``.

        Also checks that the stream's schema options expose both the
        ``sep`` and ``header`` keys before training.
        """
        steps = [
            OneHotVectorizer() << categorical_columns,
            FastLinearBinaryClassifier(train_threads=1, shuffle=False),
        ]
        pipeline = Pipeline(steps)

        training_data = FileDataStream(train_file, schema=file_schema)
        for option in ('sep', 'header'):
            assert option in training_data.schema.options
        pipeline.fit(training_data, label_column)

        scoring_data = FileDataStream(test_file, schema=file_schema)
        predictions = pipeline.predict(scoring_data)
        check_accuracy(test_file, label_column, predictions, 0.65)
Exemplo n.º 10
0
 def test_trees_file(self):
     """Train ``FastTreesBinaryClassifier`` from a file stream.

     The label column is supplied twice: through the ``Label`` role bound
     to the learner and as the second argument of ``fit``.
     """
     learner = FastTreesBinaryClassifier() << {'Label': label_column}
     pipeline = Pipeline([
         OneHotVectorizer() << categorical_columns,
         learner,
     ]) if False else Pipeline([
         OneHotVectorizer() << categorical_columns,
         learner,
     ])

     training_data = FileDataStream(train_file, schema=file_schema)
     pipeline.fit(training_data, label_column)

     scoring_data = FileDataStream(test_file, schema=file_schema)
     predictions = pipeline.predict(scoring_data)
     check_accuracy(test_file, label_column, predictions, 0.65)
Exemplo n.º 11
0
 def test_linear(self):
     """Train a named-step linear pipeline on in-memory data.

     Train and test sets are loaded with ``get_X_y``. Predictions are
     handed to ``check_accuracy_scikit`` together with 0.779.
     """
     np.random.seed(0)
     train, label = get_X_y(train_file, label_column, sep=',')
     # Only the test features are needed; its label is discarded.
     test, _ = get_X_y(test_file, label_column, sep=',')

     encoder = OneHotVectorizer() << categorical_columns
     learner = FastLinearBinaryClassifier(shuffle=False, number_of_threads=1)
     pipe = Pipeline(steps=[('cat', encoder), ('linear', learner)])

     pipe.fit(train, label)
     predictions = pipe.predict(test)
     check_accuracy_scikit(test_file, label_column, predictions, 0.779)
Exemplo n.º 12
0
    def test_model_datastream(self):
        """Check that a fitted pipeline survives pickling and save/load.

        The pipeline is fitted once, then round-tripped twice: through
        ``pickle`` and through ``save_model`` / ``load_model``. For each
        round trip the summed scores of the first five predictions and the
        summed test metrics must match the original to two decimals.

        Temporary files are removed in ``finally`` blocks so a failing
        assertion does not leave them behind.
        """
        model_nimbusml = Pipeline(
            steps=[('cat', OneHotVectorizer() << categorical_columns),
                   ('linear',
                    FastLinearBinaryClassifier(shuffle=False,
                                               number_of_threads=1))])

        model_nimbusml.fit(train, label)

        # Save with pickle
        pickle_filename = 'nimbusml_model.p'
        try:
            with open(pickle_filename, 'wb') as f:
                pickle.dump(model_nimbusml, f)

            with open(pickle_filename, "rb") as f:
                model_nimbusml_pickle = pickle.load(f)
        finally:
            if os.path.exists(pickle_filename):
                os.remove(pickle_filename)

        score1 = model_nimbusml.predict(test).head(5)
        score2 = model_nimbusml_pickle.predict(test).head(5)

        metrics, _ = model_nimbusml.test(test,
                                         test_label,
                                         output_scores=True)
        metrics_pickle, _ = model_nimbusml_pickle.test(
            test, test_label, output_scores=True)

        assert_almost_equal(score1.sum().sum(), score2.sum().sum(), decimal=2)
        assert_almost_equal(metrics.sum().sum(),
                            metrics_pickle.sum().sum(),
                            decimal=2)

        # Save load with pipeline methods
        model_filename = 'model.nimbusml.m'
        try:
            model_nimbusml.save_model(model_filename)
            model_nimbusml_load = Pipeline()
            model_nimbusml_load.load_model(model_filename)

            score1 = model_nimbusml.predict(test).head(5)
            score2 = model_nimbusml_load.predict(test).head(5)

            metrics, _ = model_nimbusml.test(test,
                                             test_label,
                                             output_scores=True)
            # Kept under its own name: the original rebound the loaded
            # pipeline variable to this metrics frame.
            metrics_load, _ = model_nimbusml_load.test(
                test, test_label, evaltype='binary', output_scores=True)

            assert_almost_equal(score1.sum().sum(),
                                score2.sum().sum(),
                                decimal=2)
            assert_almost_equal(metrics.sum().sum(),
                                metrics_load.sum().sum(),
                                decimal=2)
        finally:
            # As in the original, the saved model stays on disk until all
            # predictions with the loaded pipeline are done.
            if os.path.exists(model_filename):
                os.remove(model_filename)
Exemplo n.º 13
0
    def test_get_schema_returns_correct_value_for_vector_valued_columns(self):
        """Check the output column names after one-hot encoding ``c0``.

        ``get_output_columns`` must report exactly four names: the two
        ``c0.*`` entries plus ``c1`` and ``c2``.
        """
        pipeline = Pipeline([OneHotVectorizer() << 'c0'])
        pipeline.fit(train_df)

        output_columns = pipeline.get_output_columns()

        for expected_name in ('c0.a', 'c0.b', 'c1', 'c2'):
            self.assertTrue(expected_name in output_columns)

        self.assertEqual(len(output_columns), 4)
Exemplo n.º 14
0
    def test_unfitted_pickled_pipeline_can_be_fit(self):
        """Check that an unfitted pipeline can be pickled and then fitted.

        A reference pipeline is fitted and tested directly. An identical,
        unfitted pipeline is pickled, unpickled, fitted and tested; its
        summed scores and metrics must match the reference to two
        decimals.
        """
        def build_pipeline():
            # Both pipelines in this test share this exact definition.
            return Pipeline(steps=[
                ('cat', OneHotVectorizer() << categorical_columns),
                ('linear',
                 FastLinearBinaryClassifier(shuffle=False,
                                            number_of_threads=1)),
            ])

        reference = build_pipeline()
        reference.fit(train, label)
        metrics, score = reference.test(test, test_label, output_scores=True)

        # Create a new unfitted pipeline
        unfitted = build_pipeline()

        pickle_filename = 'nimbusml_model.p'

        # Round-trip the unfitted pipeline through pickle
        with open(pickle_filename, 'wb') as sink:
            pickle.dump(unfitted, sink)

        with open(pickle_filename, "rb") as source:
            pipeline_pickle = pickle.load(source)

        os.remove(pickle_filename)

        pipeline_pickle.fit(train, label)
        metrics_pickle, score_pickle = pipeline_pickle.test(
            test, test_label, output_scores=True)

        assert_almost_equal(
            score.sum().sum(), score_pickle.sum().sum(), decimal=2)
        assert_almost_equal(
            metrics.sum().sum(), metrics_pickle.sum().sum(), decimal=2)
Exemplo n.º 15
0
    def test_syntax11_append_insert(self):
        """Build a pipeline step by step with append, insert and del.

        After assembling three steps the pipeline is fitted and its
        prediction checked. The test then tries to modify the fitted
        pipeline and finally checks item access by index and by step name,
        and that the graph has at least as many nodes as the pipeline has
        steps.
        """
        frame = pandas.DataFrame({
            'education': ['A', 'B', 'A', 'B', 'A'],
            'workclass': ['X', 'X', 'Y', 'Y', 'Y'],
            'y': [1, 0, 1, 0, 0],
        })
        X = frame.drop('y', axis=1)

        exp = Pipeline()
        named_hash_step = (
            "OneHotHashVectorizer",
            OneHotHashVectorizer() << {'edu2': 'education'},
        )
        exp.append(named_hash_step)
        exp.insert(0, OneHotVectorizer() << {'edu1': 'education'})
        learner = FastLinearBinaryClassifier(
            maximum_number_of_iterations=1) << {
                'Features': ['edu1', 'edu2'],
                Role.Label: 'y',
            }
        exp.append(learner)
        # Add one more step and delete it again, leaving three steps.
        exp.append(OneHotHashVectorizer() << {'edu2': 'education'})
        del exp[-1]
        assert len(exp) == 3

        exp.fit(frame, verbose=0)
        prediction = exp.predict(X)
        expected_columns = ['PredictedLabel', 'Probability', 'Score']
        assert isinstance(prediction, pandas.DataFrame)
        assert sorted(prediction.columns) == expected_columns
        assert prediction.shape == (5, 3)

        # NOTE(review): the three blocks below only inspect the message
        # when a RuntimeError is raised; nothing fails if the modification
        # succeeds silently.
        locked_message = "Model is fitted and cannot be modified"
        try:
            exp.append(OneHotHashVectorizer() << {'edu2': 'education'})
        except RuntimeError as err:
            assert locked_message in str(err)
        try:
            exp.insert(0, OneHotHashVectorizer() << {'edu2': 'education'})
        except RuntimeError as err:
            assert locked_message in str(err)
        try:
            del exp[0]
        except RuntimeError as err:
            assert locked_message in str(err)

        # The same lookup is performed twice, as in the original test.
        step = exp[1][1]
        assert step.__class__.__name__ == "OneHotHashVectorizer"
        step = exp[1][1]
        assert step.__class__.__name__ == "OneHotHashVectorizer"
        matches = exp['OneHotHashVectorizer']
        assert len(matches) == 1
        graph = exp.graph_
        assert len(graph.nodes) >= len(exp)
Exemplo n.º 16
0
 def test_trees(self):
     """Train a named-step ``FastTreesBinaryClassifier`` pipeline.

     Train and test sets are loaded with ``get_X_y``. Predictions are
     handed to ``check_accuracy_scikit`` together with 0.77.
     """
     np.random.seed(0)
     train, label = get_X_y(train_file, label_column, sep=',')
     # Only the test features are needed; its label is discarded.
     test, _ = get_X_y(test_file, label_column, sep=',')

     encoder = OneHotVectorizer() << categorical_columns
     # The learner step keeps the name 'linear' used by the original test.
     learner = FastTreesBinaryClassifier()
     pipe = Pipeline(steps=[('cat', encoder), ('linear', learner)])

     pipe.fit(train, label)
     predictions = pipe.predict(test)
     check_accuracy_scikit(test_file, label_column, predictions, 0.77)
Exemplo n.º 17
0
    def test_vectorized_with_prefixconcat_output_predictor_model(self):
        """
        Score featurized data with a predictor model taken from a combined
        (featurizer + learner) pipeline, by prepending a
        PrefixColumnConcatenator transform to that predictor model.
        The first two scores must equal those of the combined pipeline.
        """
        # Fit a OneHotVectorizer on the training data and use it to
        # produce the featurized frame.
        featurizer = Pipeline([OneHotVectorizer() << 'c0'],
                              random_state=seed)
        featurizer.fit(train_df)
        featurized_df = featurizer.transform(train_df)

        # Fit and score the combined model, asking fit() to also output
        # the predictor model separately.
        combined_steps = [
            OneHotVectorizer() << 'c0',
            OnlineGradientDescentRegressor(label='c2'),
        ]
        combined_pipeline = Pipeline(combined_steps, random_state=seed)
        combined_pipeline.fit(train_df, output_predictor_model=True)
        combined_scores = combined_pipeline.predict(train_df)

        # Fit the concatenator on the featurized data.
        concat_pipeline = Pipeline(
            [PrefixColumnConcatenator(columns={'c0': 'c0.'})])
        concat_pipeline.fit(featurized_df)

        # Load the predictor model into its own pipeline.
        predictor_pipeline = Pipeline()
        predictor_pipeline.load_model(combined_pipeline.predictor_model)

        # Chain concatenator and predictor, then score the featurized data.
        chained_pipeline = Pipeline.combine_models(
            concat_pipeline, predictor_pipeline)
        chained_scores = chained_pipeline.predict(featurized_df)

        for row in (0, 1):
            self.assertEqual(combined_scores.loc[row, 'Score'],
                             chained_scores.loc[row, 'Score'])
Exemplo n.º 18
0
    def test_combine_with_classifier_trained_with_joined_X_and_y(self):
        """Combine a transform and a predictor trained on joined X and y.

        The predictor is fitted on a single data stream that contains both
        features and the ``case`` label. Predictions from the separately
        applied transform + predictor must equal the ``PredictedLabel``
        column (cast to float64) of the combined pipeline.
        """
        np.random.seed(0)

        infert_df = get_dataset("infert").as_df()
        feature_cols = [name for name in infert_df.columns if name != 'case']

        transform = OneHotVectorizer() << 'education_str'
        training_stream = transform.fit_transform(
            infert_df, as_binary_data_stream=True)

        predictor = LogisticRegressionBinaryClassifier(
            label='case', feature=feature_cols)
        predictor.fit(training_stream)

        scoring_stream = transform.transform(
            infert_df, as_binary_data_stream=True)
        separate_result = predictor.predict(scoring_stream)

        # Combine the models and predict from the raw frame.
        combined_pipeline = Pipeline.combine_models(transform, predictor)
        combined_result = combined_pipeline.predict(infert_df)

        combined_labels = combined_result['PredictedLabel'].astype(np.float64)
        self.assertTrue(separate_result.equals(combined_labels))
Exemplo n.º 19
0
 def test_syntax5_failing(self):
     """Expect a RuntimeError for input column ``education1``.

     ``fit_transform`` on the ``education1`` mapping must raise a
     ``RuntimeError`` containing the message asserted below. The same
     transform bound to ``education`` must yield a (5, 5) result.
     """
     _, X, _ = self.get_simple_df()

     failing_vec = OneHotVectorizer() << {'edu1': ['education1']}
     expected_text = ("Error: *** System.ArgumentOutOfRangeException: "
                      "'Could not find input column")
     try:
         failing_vec.fit_transform(X, verbose=2)
     except RuntimeError as err:
         assert expected_text in str(err)
     else:
         # fit_transform() returned normally, which is not allowed here.
         assert False

     working_vec = OneHotVectorizer() << {'edu1': ['education']}
     transformed = working_vec.fit_transform(X)
     assert transformed.shape == (5, 5)
Exemplo n.º 20
0
 def test_syntax5_failing(self):
     """Expect a RuntimeError for input column ``education1``.

     ``fit_transform`` on the ``education1`` mapping must raise a
     ``RuntimeError`` containing the return-code message asserted below.
     The same transform bound to ``education`` must yield a (5, 5) result.
     """
     _, X, _ = self.get_simple_df()

     failing_vec = OneHotVectorizer() << {'edu1': ['education1']}
     expected_text = "Returned code is -1. Check the log for error messages.."
     try:
         failing_vec.fit_transform(X, verbose=2)
     except RuntimeError as err:
         assert expected_text in str(err)
     else:
         # fit_transform() returned normally, which is not allowed here.
         assert False

     working_vec = OneHotVectorizer() << {'edu1': ['education']}
     transformed = working_vec.fit_transform(X)
     assert transformed.shape == (5, 5)
Exemplo n.º 21
0
    def test_passing_in_a_single_pipeline_returns_new_pipeline(self):
        """Check ``combine_models`` when given a single fitted pipeline.

        The result must be a ``Pipeline`` whose first two scores on
        ``test_df`` equal those of the pipeline that was passed in.
        """
        steps = [
            OneHotVectorizer() << 'c0',
            OnlineGradientDescentRegressor(label='c2', feature=['c0', 'c1']),
        ]
        pipeline = Pipeline(steps)
        pipeline.fit(train_df)
        original_scores = pipeline.predict(test_df)

        combined_pipeline = Pipeline.combine_models(pipeline)
        combined_scores = combined_pipeline.predict(test_df)

        for row in (0, 1):
            self.assertEqual(original_scores.loc[row, 'Score'],
                             combined_scores.loc[row, 'Score'])
        self.assertTrue(isinstance(combined_pipeline, Pipeline))
Exemplo n.º 22
0
 def test_SymSgdBinaryClassifier(self):
     """Train ``SymSgdBinaryClassifier`` on the infert data.

     The learner runs with ``shuffle=False`` on a single thread and the
     split uses ``random_state=0``. The asserted lower bound is 0.25,
     although the failure message mentions 0.65.
     """
     np.random.seed(0)
     frame = get_dataset("infert").as_df()
     # Drop the ': ' sequence from every column name.
     frame.columns = [name.replace(': ', '') for name in frame.columns]
     frame = (OneHotVectorizer() << 'education_str').fit_transform(frame)

     features = frame.loc[:, frame.columns != 'case']
     target = frame['case']
     X_train, X_test, y_train, y_test = train_test_split(
         features, target, random_state=0)

     learner = SymSgdBinaryClassifier(shuffle=False, number_of_threads=1)
     model = learner.fit(X_train, y_train)
     predicted = model.predict(X_test)
     acc = np.mean(y_test == list(predicted))
     # Turning shuffling off removes randomness but may hurt accuracy
     # because the classes are not well distributed.
     assert_greater(acc, 0.25, "accuracy should be around %s" % 0.65)
Exemplo n.º 23
0
    def test_syntax6_change_role(self):
        """Fit after dropping every column except the concatenated one.

        The classifier is bound explicitly to ``['Features']`` once the
        ``Drop`` step has removed the source and intermediate columns.
        """
        # REVIEW: the pipeline drops all columns but one -->
        # nimbusml still thinks the Features are education, workclass
        # and does not automatically detect that the only remaining
        # column should play that role
        # (maybe because the label column is here too even though
        # the only remaining column without a role is Features).
        frame = pandas.DataFrame({
            'education': ['A', 'B', 'A', 'B', 'A'],
            'workclass': ['X', 'X', 'Y', 'Y', 'Y'],
            'y': [1, 0, 1, 0, 0],
        })
        X = frame.drop('y', axis=1)
        y = frame['y']

        steps = [
            OneHotVectorizer() << {'f1': 'education'},
            OneHotHashVectorizer() << {'f2': 'education'},
            OneHotVectorizer(max_num_terms=2) << {'f3': 'workclass'},
            Concat() << {'Features': ['f%d' % idx for idx in range(1, 4)]},
            Drop() << ['education', 'workclass', 'f1', 'f2', 'f3'],
            FastLinearBinaryClassifier(
                maximum_number_of_iterations=1) << ['Features'],
        ]
        exp = Pipeline(steps)
        exp.fit(X, y)

        prediction = exp.predict(X)
        expected_columns = ['PredictedLabel', 'Probability', 'Score']
        assert isinstance(prediction, pandas.DataFrame)
        assert sorted(prediction.columns) == expected_columns
        assert prediction.shape == (5, 3)
Exemplo n.º 24
0
    def test_vectorized_output_predictor_model(self):
        """
        Show that the predictor model output by a combined
        (featurizer + learner) pipeline cannot score featurized data
        whose vector column has been expanded into separate columns.
        The test fails if that prediction does not raise.
        """
        # Fit a OneHotVectorizer on the training data and use it to
        # produce the featurized frame.
        featurizer = Pipeline([OneHotVectorizer() << 'c0'],
                              random_state=seed)
        featurizer.fit(train_df)
        featurized_df = featurizer.transform(train_df)

        # Fit the combined model and have it output the predictor model.
        combined_steps = [
            OneHotVectorizer() << 'c0',
            OnlineGradientDescentRegressor(label='c2'),
        ]
        combined_pipeline = Pipeline(combined_steps, random_state=seed)
        combined_pipeline.fit(train_df, output_predictor_model=True)
        result_1 = combined_pipeline.predict(train_df)

        # Load the predictor model into its own pipeline.
        predictor_pipeline = Pipeline()
        predictor_pipeline.load_model(combined_pipeline.predictor_model)

        try:
            # Expected to fail because the input schema does not match:
            # the model looks for a vector 'c0' with slots 'a,b' but the
            # featurized data only has columns 'c0.a' and 'c0.b'.
            predictor_pipeline.predict(featurized_df)
        except Exception:
            pass
        else:
            self.fail()
Exemplo n.º 25
0
    def test_schema_syntax_multilevel(self):
        """Bind a transform and a label through MultiIndex column tuples.

        The frame's columns are a two-level ``MultiIndex``. The vectorizer
        is bound to the tuple ``('TX', 'tx')`` and the label is passed to
        ``fit`` as ``('Y', 'yl')``.
        """
        frame = pandas.DataFrame(data={
            'X1': [0.1, 0.2],
            'X2': [0.1, 0.2],
            'yl': [1, 0],
            'tx': ['e', 'r'],
        })
        level_pairs = [('X', 'X1'), ('X', 'X2'), ('Y', 'yl'), ('TX', 'tx')]
        frame.columns = pandas.MultiIndex.from_tuples(level_pairs)

        steps = [
            OneHotVectorizer() << ('TX', 'tx'),
            FastLinearBinaryClassifier(),
        ]
        exp = Pipeline(steps)

        # The first node must keep the tuple and expose it as its input.
        assert exp.nodes[0]._columns == ('TX', 'tx')
        assert exp.nodes[0].input == [('TX', 'tx')]

        exp.fit(frame, ('Y', 'yl'))
        pred = exp.predict(frame)
        assert pred.shape == (2, 3)
Exemplo n.º 26
0
    def test_syntax5_regular_expression(self):
        """Fit a pipeline whose ``Concat`` input is the pattern 'f[0-9]+'.

        ``Concat`` receives a single string instead of a list of column
        names; the classifier is bound to the resulting ``Features``.
        """
        # REVIEW: not implemented yet
        # Ideally regular expressions would be handled inside nimbusml.
        # They could be handled in entrypoint.py just before calling
        # nimbusml, or inside Pipeline if it is aware of the input schema.
        frame = pandas.DataFrame({
            'education': ['A', 'B', 'A', 'B', 'A'],
            'workclass': ['X', 'X', 'Y', 'Y', 'Y'],
            'y': [1, 0, 1, 0, 0],
        })
        X = frame.drop('y', axis=1)
        y = frame['y']

        steps = [
            OneHotVectorizer() << {'f1': 'education'},
            OneHotHashVectorizer() << {'f2': 'education'},
            OneHotVectorizer(max_num_terms=2) << {'f3': 'workclass'},
            Concat() << {'Features': 'f[0-9]+'},
            FastLinearBinaryClassifier(
                maximum_number_of_iterations=1) << 'Features',
        ]
        exp = Pipeline(steps)
        exp.fit(X, y)

        prediction = exp.predict(X)
        expected_columns = ['PredictedLabel', 'Probability', 'Score']
        assert isinstance(prediction, pandas.DataFrame)
        assert sorted(prediction.columns) == expected_columns
        assert prediction.shape == (5, 3)
Exemplo n.º 27
0
 def test_file_no_schema(self):
     """Check the errors raised when bare file names are passed.

     ``fit`` with a file name must raise ``TypeError`` matching
     'Filenames are not allowed'. ``predict`` on the still unfitted
     pipeline must raise ``ValueError`` matching 'Model is not fitted'.
     """
     steps = [
         OneHotVectorizer() << categorical_columns,
         FastLinearBinaryClassifier(number_of_threads=1, shuffle=False),
     ]
     pipeline = Pipeline(steps)

     # (exception type, message pattern, method under test, file argument)
     expectations = (
         (TypeError, 'Filenames are not allowed', pipeline.fit, train_file),
         (ValueError, 'Model is not fitted', pipeline.predict, test_file),
     )
     for error_type, pattern, method, filename in expectations:
         assert_raises_regex(error_type, pattern, method, filename,
                             y=label_column)
Exemplo n.º 28
0
    def test_different_schema_with_filedatastream_input(self):
        """Compare a standard pipeline with a DatasetTransformer pipeline.

        ``train_df`` and ``test_df`` are written to CSV and read back as
        file data streams. A reference pipeline (one-hot + regressor) and
        a pipeline that reuses a pre-fitted transform model through
        ``DatasetTransformer`` must produce equal predictions.

        The CSV fixtures are removed in a ``finally`` block so a failure
        does not leave them in the working directory.
        """
        train_filename = "train-data.csv"
        test_filename = "test-data.csv"
        try:
            train_df.to_csv(train_filename, index=False, header=True)
            train_data_stream = FileDataStream.read_csv(
                train_filename, sep=',', header=True)

            test_df.to_csv(test_filename, index=False, header=True)
            test_data_stream = FileDataStream.read_csv(
                test_filename, sep=',', header=True)

            # Create reference pipeline
            std_pipeline = Pipeline([
                OneHotVectorizer() << 'c0',
                OnlineGradientDescentRegressor(label='c2',
                                               feature=['c0', 'c1'])
            ], random_state=seed)

            std_pipeline.fit(train_data_stream)
            result_1 = std_pipeline.predict(test_data_stream)

            # Create combined pipeline
            transform_pipeline = Pipeline([OneHotVectorizer() << 'c0'],
                                          random_state=seed)
            transform_pipeline.fit(train_data_stream)

            combined_pipeline = Pipeline([
                DatasetTransformer(
                    transform_model=transform_pipeline.model),
                OnlineGradientDescentRegressor(label='c2',
                                               feature=['c0', 'c1'])
            ], random_state=seed)
            combined_pipeline.fit(train_data_stream)

            # As in the original, the transform model file is deleted
            # before predicting with the combined pipeline.
            os.remove(transform_pipeline.model)

            result_2 = combined_pipeline.predict(test_data_stream)

            self.assertTrue(result_1.equals(result_2))
        finally:
            # Either file may be missing if the failure happened early.
            for filename in (train_filename, test_filename):
                if os.path.exists(filename):
                    os.remove(filename)
Exemplo n.º 29
0
    def test_schema_with_vectorized_column(self):
        """Compare the stream schema with the DataFrame schema.

        The infert data is one-hot encoded on ``education`` and returned
        as a binary data stream. ``schema`` must expose ``education`` as
        one vector column (9 entries in total), while
        ``get_dataframe_schema()`` must expose one scalar column per
        education slot (11 entries in total).
        """
        path = get_dataset('infert').as_filepath()
        data = FileDataStream.read_csv(path)

        featurization_pipeline = Pipeline(
            [OneHotVectorizer(columns={'education': 'education'})])
        featurization_pipeline.fit(data)
        featurized_data = featurization_pipeline.transform(
            data, as_binary_data_stream=True)

        # Expected stream schema (education is one vector over slots 1-3):
        # col=row_num:I8:0 col=education:R4:1-3 col=age:I8:4 col=parity:I8:5
        # col=induced:I8:6 col=case:I8:7 col=spontaneous:I8:8 col=stratum:I8:9
        # col=pooled.stratum:I8:10 quote+
        schema = featurized_data.schema

        self.assertEqual(len(schema), 9)
        self.assertEqual(schema['age'].Type, 'I8')
        self.assertEqual(schema['age'].Name, 'age')
        self.assertEqual(schema['age'].IsVector, False)

        self.assertEqual(schema['education'].Type, 'R4')
        self.assertEqual(schema['education'].Name, 'education')
        self.assertEqual(len(schema['education'].Pos), 3)
        self.assertEqual(schema['education'].IsVector, True)

        # The per-slot names must not appear in the stream schema.
        self.assertTrue('education.0-5yrs' not in schema)
        self.assertTrue('education.6-11yrs' not in schema)
        self.assertTrue('education.12+yrs' not in schema)

        # Expected DataFrame schema (one scalar column per education slot):
        # col=row_num:I8:0 col=education.0-5yrs:R4:1 col=education.6-11yrs:R4:2
        # col=education.12+yrs:R4:3 col=age:I8:4 col=parity:I8:5 col=induced:I8:6
        # col=case:I8:7 col=spontaneous:I8:8 col=stratum:I8:9 col=pooled.stratum:I8:10
        # quote+ header=+
        schema = featurized_data.get_dataframe_schema()

        self.assertEqual(len(schema), 11)
        self.assertEqual(schema['age'].Type, 'I8')
        self.assertEqual(schema['age'].Name, 'age')
        self.assertEqual(schema['age'].IsVector, False)

        # Here the vector name is gone and the per-slot names are present.
        self.assertTrue('education' not in schema)
        self.assertTrue('education.0-5yrs' in schema)
        self.assertTrue('education.6-11yrs' in schema)
        self.assertTrue('education.12+yrs' in schema)

        self.assertEqual(schema['education.0-5yrs'].Type, 'R4')
        self.assertEqual(schema['education.0-5yrs'].Name, 'education.0-5yrs')
        self.assertEqual(schema['education.0-5yrs'].IsVector, False)
    def test_lightgbmbinaryclassifier(self):
        """Train ``LightGbmBinaryClassifier`` on the infert data.

        The ``education_str`` column is one-hot encoded, the frame is
        split with ``train_test_split`` (label column ``case``) and the
        mean agreement between predictions and held-out labels must
        exceed 0.98.
        """
        np.random.seed(0)

        frame = get_dataset("infert").as_df()

        # Drop the ': ' sequence from column names, then encode the
        # categorical column.
        frame.columns = [name.replace(': ', '') for name in frame.columns]
        frame = (OneHotVectorizer() << 'education_str').fit_transform(frame)

        features = frame.loc[:, frame.columns != 'case']
        target = frame['case']
        X_train, X_test, y_train, y_test = train_test_split(features, target)

        model = LightGbmBinaryClassifier().fit(X_train, y_train)
        predicted = model.predict(X_test)
        accuracy = np.mean(y_test == list(predicted))
        assert_greater(accuracy, 0.98, "accuracy should be %s" % 0.98)