Example #1
    def test_pipeline_pca(self):
        X = numpy.array([[1.0, 2, 3], [2, 3, 4], [3, 4, 5]])
        exp = Pipeline([PcaTransformer(rank=2)])
        infos = exp.get_fit_info(X)[0]
        for inf in infos:
            if 'operator' in inf:
                del inf['operator']
        exp = [{
            'name': None,
            'schema_after': ['F0', 'F1', 'F2'],
            'type': 'start',
            'outputs': ['F0', 'F1', 'F2']
        }, {
            'name': 'TypeConverter',
            'inputs': ['F0', 'F1', 'F2'],
            'type': 'transform',
            'outputs': ['F0', 'F1', 'F2'],
            'schema_after': ['F0', 'F1', 'F2']
        }, {
            'name': 'PcaTransformer',
            'inputs': ['temp_'],
            'type': 'transform',
            'outputs': ['temp_'],
            'schema_after': ['F0', 'F1', 'F2', 'temp_']
        }]
        # These column names depend on id(node) and therefore change on
        # every execution, so normalize them before comparing with exp.
        infos[-1]["inputs"] = ["temp_"]
        infos[-1]["outputs"] = ["temp_"]
        infos[-1]["schema_after"][-1] = "temp_"

        self.assertEqual(exp, infos)
Example #2
    def test_pipeline_info(self):
        df = pandas.DataFrame(
            dict(education=['A', 'B', 'A', 'B', 'A'],
                 workclass=['X', 'X', 'Y', 'Y', 'Y'],
                 yy=[1.1, 2.2, 1.24, 3.4, 3.4]))

        exp = Pipeline([
            MeanVarianceScaler() << {
                'new_y': 'yy'
            },
            OneHotVectorizer() << ['workclass', 'education'],
            Drop() << 'yy',
            FastLinearRegressor() << {
                'Feature': ['workclass', 'education'],
                Role.Label: 'new_y'
            }
        ])

        infos = exp.get_fit_info(df)[0]
        for inf in infos:
            if 'operator' in inf:
                del inf['operator']
        exp = [{
            'name': None,
            'schema_after': ['education', 'workclass', 'yy'],
            'type': 'start',
            'outputs': ['education', 'workclass', 'yy']
        }, {
            'name': 'TypeConverter',
            'inputs': ['yy'],
            'outputs': ['new_y'],
            'schema_after': ['education', 'workclass', 'yy', 'new_y'],
            'type': 'transform'
        }, {
            'name': 'MeanVarianceScaler',
            'inputs': ['new_y'],
            'type': 'transform',
            'outputs': ['new_y'],
            'schema_after': ['education', 'workclass', 'yy', 'new_y']
        }, {
            'name': 'OneHotVectorizer',
            'inputs': ['workclass', 'education'],
            'type': 'transform',
            'outputs': ['workclass', 'education'],
            'schema_after': ['education', 'workclass', 'yy', 'new_y']
        }, {
            'name': 'ColumnDropper',
            'type': 'transform',
            'schema_after': ['education', 'workclass', 'new_y'],
            'inputs': ['education', 'workclass', 'yy', 'new_y'],
            'outputs': ['education', 'workclass', 'new_y']
        }, {
            'name': 'FastLinearRegressor',
            'inputs': ['Feature:education,workclass', 'Label:new_y'],
            'type': 'regressor',
            'outputs': ['Score'],
            'schema_after': ['Score']
        }]
        if infos != exp:
            raise Exception(infos)
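
The node dictionaries returned by get_fit_info can also be inspected generically. A minimal, illustrative sketch (not part of the original tests) that takes any nimbusml Pipeline and input data like the ones above and prints how columns flow from node to node, using only the keys seen in the expected output:

# Sketch: walk the list of node dictionaries returned by get_fit_info
# and print each node's name, type and column flow.
def print_fit_info(pipeline, data):
    for node in pipeline.get_fit_info(data)[0]:
        print(node.get('name'), '-', node.get('type'))
        print('  inputs :', node.get('inputs', []))
        print('  outputs:', node.get('outputs', []))
        print('  schema :', node.get('schema_after', []))
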
Example #3
    def test_ensemble_supports_get_fit_info(self):
        df = pd.DataFrame(dict(education=['A', 'B', 'A', 'B', 'A'],
                               workclass=['X', 'X', 'Y', 'Y', 'Y'],
                               yy=[1.1, 2.2, 1.24, 3.4, 3.4]))

        col_info = {'Feature': ['workclass', 'education'], Role.Label: 'new_y'}

        r1 = OrdinaryLeastSquaresRegressor(normalize="Yes") << col_info
        r2 = OnlineGradientDescentRegressor(normalize="Yes") << col_info
        r3 = LightGbmRegressor(normalize="Yes") << col_info

        pipeline = Pipeline([
            MeanVarianceScaler() << {'new_y': 'yy'},
            OneHotVectorizer() << ['workclass', 'education'],
            ColumnDropper() << 'yy',
            VotingRegressor(estimators=[r1, r2, r3], combiner='Average')
        ])

        info = pipeline.get_fit_info(df)

        last_info_node = info[0][-1]
        self.assertEqual(last_info_node['inputs'],
                         ['Feature:education,workclass', 'Label:new_y'])
        self.assertEqual(last_info_node['name'], 'VotingRegressor')
        self.assertTrue(isinstance(last_info_node['operator'], VotingRegressor))
        self.assertEqual(last_info_node['outputs'], ['Score'])
        self.assertEqual(last_info_node['schema_after'], ['Score'])
        self.assertEqual(last_info_node['type'], 'regressor')
Example #4
    def test_get_fit_info_fastl(self):
        train_file = get_dataset("airquality").as_filepath()
        schema = DataSchema.read_schema(train_file)
        data = FileDataStream(train_file, schema)

        pipeline = Pipeline([
            Filter(columns=['Ozone']),
            FastLinearRegressor(feature=['Solar_R', 'Temp'], label='Ozone')
        ])

        info = pipeline.get_fit_info(data)
        exp = [{
            'name': None,
            'outputs': ['Unnamed0', 'Ozone', 'Solar_R', 'Wind', 'Temp',
                        'Month', 'Day'],
            'schema_after': ['Unnamed0', 'Ozone', 'Solar_R', 'Wind', 'Temp',
                             'Month', 'Day'],
            'type': 'start'
        }, {
            'inputs': ['Ozone'],
            'name': 'Filter',
            'outputs': ['Ozone'],
            'schema_after': ['Unnamed0', 'Ozone', 'Solar_R', 'Wind', 'Temp',
                             'Month', 'Day'],
            'type': 'transform'
        }]
        for el in info[0]:
            if 'operator' in el:
                del el['operator']
        self.assertEqual(exp, info[0][:2])
Example #5
    def test_pipeline_with_no_columns(self):
        trainData = pd.DataFrame({
            "Sentiment": [0, 1, 1, 0, 1, 1],
            "SentimentText": [
                "this is train ", "review ", "sentence ", "an apple",
                "sentence 22", "another one one one"
            ]
        })

        ppl = Pipeline([
            NGramFeaturizer(word_feature_extractor=n_gram()),
            LightGbmClassifier(min_data_per_leaf=1, min_data_per_group=1)
        ])
        assert ppl is not None

        # Bug 147697
        info = ppl.get_fit_info(trainData[["SentimentText"]],
                                trainData["Sentiment"])
        assert len(info) == 2
        assert len(info[0]) == 3
        ppl.fit(trainData[["SentimentText"]], trainData["Sentiment"])

        ppl = Pipeline([
            NGramFeaturizer(word_feature_extractor=n_gram()),
            LightGbmClassifier(min_data_per_leaf=1, min_data_per_group=1)
        ])
        assert ppl is not None
        ppl.fit(trainData[["SentimentText"]], np.array(trainData["Sentiment"]))
Example #6
    def test_pipeline_with_no_columns_raise(self):
        trainData = pd.DataFrame({
            "Sentiment": [0, 1, 1, 0, 1, 1],
            "SentimentText": [
                "this is train ", "review ", "sentence ", "an apple",
                "sentence 22", "another one one one"
            ]
        })

        ppl = Pipeline([
            NGramFeaturizer(word_feature_extractor=n_gram()),
            LightGbmClassifier()
        ])
        assert ppl is not None

        # Bug 147697
        info = ppl.get_fit_info(trainData[["SentimentText"]],
                                trainData["Sentiment"])
        assert len(info) == 2
        assert len(info[0]) == 3
        with self.assertRaises(RuntimeError):
            # Message
            # System.InvalidOperationException:
            # 'LightGBM Error, code is -1, error message is
            # 'Cannot construct Dataset since there are not useful features.
            # It should be at least two unique rows.
            # If the num_row (num_data) is small,
            # you can set min_data=1 and min_data_in_bin=1 to fix this.
            # Otherwise please make sure you are using the right dataset.'
            ppl.fit(trainData[["SentimentText"]], trainData["Sentiment"])
Example #7
    def test_plot_fitted_cloned_pipeline(self):
        df = pd.DataFrame(
            dict(education=['A', 'B', 'A', 'B', 'A'],
                 workclass=['X', 'X', 'Y', 'Y', 'Y'],
                 y=[1.0, 3, 2, 3, 4]))
        exp = Pipeline([
            OneHotVectorizer() << ['workclass', 'education'],
            FastLinearRegressor(feature=['workclass', 'education'], label='y'),
        ])
        info1 = exp.get_fit_info(df)[0]
        res1 = dot_export_pipeline(exp, df)
        assert res1 is not None
        exp.fit(df)
        info2 = exp.get_fit_info(df)[0]
        assert len(info1) == len(info2)
        exp.fit(df)
        info3 = exp.get_fit_info(df)[0]
        assert len(info1) == len(info3)

        for i, (a, b, c) in enumerate(zip(info1, info2, info3)):
            assert list(sorted(a)) == list(sorted(b))
            assert list(sorted(a)) == list(sorted(c))
            for k in sorted(a):
                if not isinstance(a[k], (list, dict, str, int, float, tuple)):
                    continue
                if b[k] != c[k]:
                    import pprint
                    pprint.pprint(b)
                    pprint.pprint(c)
                    raise Exception(
                        "Issue with "
                        "op={0}\nk='{1}'\n---\n{2}\n---\n{3}".format(
                            i, k, b[k], c[k]))
                if a[k] != b[k]:
                    import pprint
                    pprint.pprint(a)
                    pprint.pprint(b)
                    raise Exception(
                        "Issue with "
                        "op={0}\nk='{1}'\n---\n{2}\n---\n{3}".format(
                            i, k, a[k], b[k]))
        res2 = dot_export_pipeline(exp, df)
        assert res2 is not None
        assert res1 == res2
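
The string returned by dot_export_pipeline can be written out and rendered. A minimal sketch, assuming the returned value is Graphviz DOT text (the rendering step relies on the external Graphviz 'dot' tool, not on nimbusml):

# Sketch: persist the exported graph and render it with Graphviz later.
with open('pipeline.dot', 'w') as fh:
    fh.write(res1)
# Then, from a shell with Graphviz installed:
#   dot -Tpng pipeline.dot -o pipeline.png
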
Example #8
    def test_get_fit_info(self):
        transform_pipeline = Pipeline([RangeFilter(min=0.0, max=4.5) << 'c2'])
        transform_pipeline.fit(train_df)

        combined_pipeline = Pipeline([
            DatasetTransformer(transform_model=transform_pipeline.model),
            OnlineGradientDescentRegressor(label='c2', feature=['c1'])
        ], random_state=seed)
        combined_pipeline.fit(train_df)

        info = combined_pipeline.get_fit_info(train_df)

        self.assertTrue(info[0][1]['name'] == 'DatasetTransformer')
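
Note that train_df and seed are provided by the surrounding test fixture rather than by this snippet. Hypothetical stand-ins, consistent with the columns referenced above (c1 as the single feature, c2 as the range-filtered label), might look like:

# Hypothetical fixture data for the snippet above (names and values are
# illustrative, not taken from the original test module).
import pandas as pd

seed = 0
train_df = pd.DataFrame(dict(c1=[0.5, 1.5, 2.5, 3.5, 4.5],
                             c2=[1.0, 2.0, 3.0, 4.0, 5.0]))
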
Example #9
    def test_get_fit_info_anomaly(self):
        df = get_dataset("iris").as_df()
        df.drop(['Label', 'Setosa', 'Species'], axis=1, inplace=True)
        X_train, X_test = train_test_split(df)
        svm = Pipeline([
            OneClassSvmAnomalyDetector(  # noqa
                kernel=PolynomialKernel(a=1.0))
        ])  # noqa
        svm.fit(X_train, verbose=0)
        scores = svm.predict(X_train)
        info = svm.get_fit_info(X_train)
        last = info[0][-1]
        out = last['outputs']
        assert len(scores) == len(X_train)
        assert out is not None
Example #10
    def test_get_fit_info_clustering(self):
        X_train = pandas.DataFrame(
            data=dict(x=[0, 1, 2, 10, 11, 12, -10, -11, -12],
                      y=[0, 1, 2, 10, 11, 12, -10, -11, -12],
                      z=[0, 1, 2, 10, 11, 12, -10, -11, -12]))
        y_train = pandas.DataFrame(data=dict(
            clusterid=[0, 0, 0, 1, 1, 1, 2, 2, 2]))
        pipeline = Pipeline([KMeansPlusPlus(n_clusters=3)])
        pipeline.fit(X_train, y_train, verbose=0)
        scores = pipeline.predict(X_train)
        info = pipeline.get_fit_info(X_train, y_train)
        last = info[0][-1]
        out = last['outputs']
        assert out == ['PredictedLabel', 'Score.0', 'Score.1', 'Score.2']
        assert len(scores) == 9
Example #11
    def test_averagedperceptron_unsupported_losses_syntax(self):
        df = get_dataset("infert").as_df().drop('row_num', axis=1)
        X = df
        y = df['case']

        pipeline = Pipeline([
            OneHotVectorizer(columns={
                'age1': 'age',
                'parity1': 'parity',
                'sp1': 'spontaneous'
            }),
            OneHotVectorizer(columns={'education_str': 'education_str'}),
            ColumnDuplicator(columns={'case2': 'case'}),
            AveragedPerceptronBinaryClassifier(
                feature=['age1', 'education_str'], label='case')
        ])

        try:
            model = pipeline.fit(X, y, verbose=0)
            raise AssertionError("same column name in X and y")
        except RuntimeError as e:
            assert "If any step in the pipeline has defined Label" in str(e)
        X = X.drop('case', axis=1)

        pipeline = Pipeline([
            OneHotVectorizer(columns={
                'age1': 'age',
                'parity1': 'parity',
                'sp1': 'spontaneous'
            }),
            OneHotVectorizer(columns={'education_str': 'education_str'}),
            # ColumnDuplicator(columns={'case2': 'case'}), # does not work
            AveragedPerceptronBinaryClassifier(
                feature=['age1', 'education_str'], label='case')
        ])

        info = pipeline.get_fit_info(df)[0]
        assert info[-1]['inputs'] != ['Feature:Features', 'Label:case']

        model = pipeline.fit(df)
        y_pred_withpipeline = model.predict(X)
        assert set(y_pred_withpipeline.columns) == {
            'PredictedLabel', 'Probability', 'Score'
        }
        assert y_pred_withpipeline.shape == (248, 3)
Example #12
    def test_pipeline_info_strategy_previous_2_accumulate(self):
        df = pandas.DataFrame(
            dict(education=['A', 'B', 'A', 'B', 'A'],
                 workclass=['X', 'X', 'Y', 'Y', 'Y'],
                 yy=[1.1, 2.2, 1.24, 3.4, 3.4]))
        X = df.drop('yy', axis=1)
        y = df['yy']

        exp = Pipeline([
            OneHotVectorizer() << ['workclass'],
            OneHotVectorizer() << ['education'],
            FastLinearRegressor()
        ])

        infos = exp.get_fit_info(X, y, iosklearn="accumulate")[0]
        for inf in infos:
            if 'operator' in inf:
                del inf['operator']
        exp = [{
            'name': None,
            'schema_after': ['education', 'workclass', 'yy'],
            'type': 'start',
            'outputs': ['education', 'workclass', 'yy']
        }, {
            'name': 'OneHotVectorizer',
            'inputs': ['workclass'],
            'type': 'transform',
            'outputs': ['workclass'],
            'schema_after': ['education', 'workclass', 'yy']
        }, {
            'name': 'OneHotVectorizer',
            'inputs': ['education'],
            'type': 'transform',
            'outputs': ['education'],
            'schema_after': ['education', 'workclass', 'yy']
        }, {
            'name': 'FastLinearRegressor',
            'inputs': ['Feature:education,workclass', 'Label:yy'],
            'type': 'regressor',
            'outputs': ['Score'],
            'schema_after': ['Score']
        }]
        assert infos == exp
Example #13
    def test_get_fit_info_ranker(self):
        file_path = get_dataset("gen_tickettrain").as_filepath()
        file_schema = 'sep=, col=Label_1:R4:0 col=GroupId_2:TX:1 ' \
                      'col=Features_3:R4:3-5'
        train_stream = FileDataStream(file_path, schema=file_schema)
        pipeline = Pipeline([
            ToKey() << {
                'GroupId_2': 'GroupId_2'
            },
            ColumnConcatenator() << {
                'Features': ['Features_3']
            },
            LightGbmRanker() << {
                Role.Feature: 'Features',
                Role.Label: 'Label_1',
                Role.GroupId: 'GroupId_2'
            }
        ])

        info = pipeline.get_fit_info(train_stream)
        last = info[0][-1]
        inp = last['inputs']
        assert 'GroupId:GroupId_2' in inp
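
The inputs of a learner node encode roles as 'Role:col1,col2' strings (for example 'Feature:education,workclass' in Example #2 and 'GroupId:GroupId_2' here). A minimal, illustrative sketch that turns them into a mapping for easier assertions:

# Sketch: parse the 'Role:columns' strings from a fit-info node's inputs.
def roles_of(node):
    mapping = {}
    for item in node.get('inputs', []):
        role, _, cols = item.partition(':')
        mapping[role] = cols.split(',') if cols else []
    return mapping

# e.g. roles_of(info[0][-1]).get('GroupId') == ['GroupId_2']
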
Example #14
    def test_syntax_onehot_trained_all_rename(self):
        df = pandas.DataFrame(
            dict(edu=['A', 'B', 'A', 'B', 'A'],
                 wk=['X', 'X', 'Y', 'Y', 'Y'],
                 Label=[1.1, 2.2, 1.24, 3.4, 3.4]))

        onehot = (OneHotVectorizer() << {'edu2': 'edu'}).fit(df, verbose=0)
        df2 = onehot.transform(df)
        lr = (FastLinearRegressor() << ['edu2.A', 'edu2.B']).fit(df2,
                                                                 verbose=0)

        pipe = Pipeline([onehot.clone(), lr.clone() << ['edu2.A', 'edu2.B']])
        with self.assertRaises(RuntimeError):
            # 'Feature column 'edu2.A' not found
            pipe.fit(df, verbose=0)

        pipe = Pipeline([onehot.clone(), lr.clone() << ['edu2']])
        try:
            pipe.fit(df, verbose=0)
        except RuntimeError:
            # This should work!
            import pprint
            s = pprint.pformat(pipe.get_fit_info(df)[0])
            raise RuntimeError(s)