Example No. 1
 def test_trees_file(self):
     pipeline = Pipeline([OneHotVectorizer() << categorical_columns,
                          FastTreesBinaryClassifier() << {
                              'Label': label_column}])
     train_stream = FileDataStream(train_file, schema=file_schema)
     pipeline.fit(train_stream, label_column)
     test_stream = FileDataStream(test_file, schema=file_schema)
     out_data = pipeline.predict(test_stream)
     check_accuracy(test_file, label_column, out_data, 0.65)
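Several of these tests reference module-level fixtures (train_file, test_file, file_schema, categorical_columns, label_column, check_accuracy) that are defined elsewhere in the test module. A minimal sketch of what they might look like; the dataset names, column choices, and accuracy check below are assumptions, not the original definitions:

    # hypothetical fixtures; names and values are illustrative only
    import pandas
    from nimbusml import DataSchema, FileDataStream
    from nimbusml.datasets import get_dataset

    train_file = get_dataset('uciadult_train').as_filepath()
    test_file = get_dataset('uciadult_test').as_filepath()
    file_schema = DataSchema.read_schema(train_file, sep=',')
    categorical_columns = ['education', 'marital-status', 'workclass']
    label_column = 'label'

    def check_accuracy(test_file, label_column, out_data, expected):
        # fraction of predicted labels that match the labels in the test file
        y_true = pandas.read_csv(test_file)[label_column].values
        y_pred = out_data['PredictedLabel'].values
        assert (y_true == y_pred).mean() >= expected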
Example No. 2
 def data_wt_rename(self, label_name, group_id, features):
     simpleinput_file = get_dataset("gen_tickettrain").as_filepath()
     file_schema = 'sep=, col={label}:R4:0 col={group_id}:TX:1 ' \
                   'col={features}:R4:3-5'.format(
                     label=label_name, group_id=group_id, features=features)
     data = FileDataStream(simpleinput_file, schema=file_schema)
     if label_name != 'Label':
         data._set_role(Role.Label, label_name)
     return data
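A hypothetical call, using the column layout fixed by the schema string above (label at position 0, group id at position 1, a three-column feature vector at positions 3-5); the argument values are illustrative only:

    # illustrative only: load the ticket data with 'rank' in the Label role
    data = self.data_wt_rename(label_name='rank', group_id='group',
                               features='Features')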
Example No. 3
 def test_linear_file_role(self):
     pipeline = Pipeline([OneHotVectorizer() << categorical_columns,
                          FastLinearBinaryClassifier(train_threads=1,
                                                     shuffle=False)])
     train_stream = FileDataStream(train_file, schema=file_schema)
     train_stream._set_role('Label', label_column)
     pipeline.fit(train_stream)
     test_stream = FileDataStream(test_file, schema=file_schema)
     out_data = pipeline.predict(test_stream)
     check_accuracy(test_file, label_column, out_data, 0.65)
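Unlike Example No. 1, where the label column is passed to fit, here it is attached to the stream itself with _set_role, so pipeline.fit(train_stream) needs no explicit label argument.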
Example No. 4
 def test_schema_airquality(self):
     train_file = get_dataset("airquality").as_filepath()
     found = DataSchema.read_schema(train_file)
     schema = "col=Unnamed0:I8:0 col=Ozone:R8:1 col=Solar_R:R8:2 " \
              "col=Wind:R8:3 col=Temp:I8:4 col=Month:I8:5 " \
              "col=Day:I8:6 header=+"
     assert str(found) == schema
     fds = FileDataStream(train_file, schema)
     assert str(fds.schema) == schema
     fds = FileDataStream.read_csv(train_file)
     assert str(fds.schema) == schema
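Each col=<name>:<type>:<position> entry in the schema string gives a column name, a type code (I8 for 64-bit integers, R8 for 64-bit floats, R4 for 32-bit floats, TX for text), and a zero-based position in the file; header=+ marks the first row as a header. A position can also be a range such as 3-5, which loads several file columns as a single vector-valued column (see Example No. 2).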
Example No. 5
 def test_schema_infert(self):
     train_file = get_dataset("infert").as_filepath()
     found = DataSchema.read_schema(train_file)
     schema = "col=row_num:I8:0 col=education:TX:1 col=age:I8:2 " \
              "col=parity:I8:3 col=induced:I8:4 " + \
              "col=case:I8:5 col=spontaneous:I8:6 col=stratum:I8:7 " \
              "col=pooled.stratum:I8:8 header=+"
     assert str(found) == schema
     fds = FileDataStream(train_file, schema)
     assert str(fds.schema) == schema
     fds = FileDataStream.read_csv(train_file)
     assert str(fds.schema) == schema
Example No. 6
    def test_linear_file(self):
        pipeline = Pipeline([OneHotVectorizer() << categorical_columns,
                             FastLinearBinaryClassifier(train_threads=1,
                                                        shuffle=False)])

        train_stream = FileDataStream(train_file, schema=file_schema)
        assert 'sep' in train_stream.schema.options
        assert 'header' in train_stream.schema.options
        pipeline.fit(train_stream, label_column)
        test_stream = FileDataStream(test_file, schema=file_schema)
        out_data = pipeline.predict(test_stream)
        check_accuracy(test_file, label_column, out_data, 0.65)
Example No. 7
 def test_schema_infert_R4(self):
     train_file = get_dataset("infert").as_filepath()
     found = DataSchema.read_schema(train_file, numeric_dtype=numpy.float32)
     schema = "col=row_num:R4:0 col=education:TX:1 col=age:R4:2 " \
              "col=parity:R4:3 col=induced:R4:4 " + \
              "col=case:R4:5 col=spontaneous:R4:6 col=stratum:R4:7 " \
              "col=pooled.stratum:R4:8 header=+"
     assert str(found) == schema
     fds = FileDataStream(train_file, schema)
     assert str(fds.schema) == schema
     fds = FileDataStream.read_csv(train_file, numeric_dtype=numpy.float32)
     assert str(fds.schema) == schema
Example No. 8
    @classmethod
    def setUpClass(cls):
        adult_path = get_dataset('uciadult_train').as_filepath()
        cls.classification_data = FileDataStream.read_csv(adult_path)
        binary_pipeline = Pipeline([
            OneHotVectorizer(columns=['education']),
            LogisticRegressionBinaryClassifier(feature=['age', 'education'],
                                               label='label',
                                               number_of_threads=1)
        ])
        cls.binary_model = binary_pipeline.fit(cls.classification_data)
        cls.binary_pfi = cls.binary_model.permutation_feature_importance(
            cls.classification_data)
        classifier_pipeline = Pipeline([
            OneHotVectorizer(columns=['education']),
            FastLinearClassifier(feature=['age', 'education'],
                                 label='label',
                                 number_of_threads=1,
                                 shuffle=False)
        ])
        cls.classifier_model = classifier_pipeline.fit(
            cls.classification_data)
        cls.classifier_pfi = cls.classifier_model.permutation_feature_importance(
            cls.classification_data)

        infert_path = get_dataset('infert').as_filepath()
        cls.regression_data = FileDataStream.read_csv(infert_path)
        regressor_pipeline = Pipeline([
            OneHotVectorizer(columns=['education']),
            FastLinearRegressor(feature=['induced', 'education'],
                                label='age',
                                number_of_threads=1,
                                shuffle=False)
        ])
        cls.regressor_model = regressor_pipeline.fit(cls.regression_data)
        cls.regressor_pfi = cls.regressor_model.permutation_feature_importance(
            cls.regression_data)

        ticket_path = get_dataset('gen_tickettrain').as_filepath()
        cls.ranking_data = FileDataStream.read_csv(ticket_path)
        ranker_pipeline = Pipeline([
            ToKey(columns=['group']),
            LightGbmRanker(feature=['Class', 'dep_day', 'duration'],
                           label='rank',
                           group_id='group',
                           random_state=0,
                           number_of_threads=1)
        ])
        cls.ranker_model = ranker_pipeline.fit(cls.ranking_data)
        cls.ranker_pfi = cls.ranker_model.permutation_feature_importance(
            cls.ranking_data)
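permutation_feature_importance permutes one feature at a time and reports the resulting change in the fitted model's evaluation metrics, one row per feature; the models and PFI results built here serve as shared fixtures for the class's test methods.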
Example No. 9
    def test_lightgbmranker_asfilestream(self):
        # Data file
        file_path = get_dataset("gen_tickettrain").as_filepath()

        # Pure-nimbusml paradigm
        train_stream = FileDataStream.read_csv(file_path, encoding='utf-8')

        # pipeline
        pipeline = Pipeline([
            # the group_id column must be of key type
            ToKey(columns={
                'rank': 'rank',
                'group': 'group'
            }),
            LightGbmRanker(feature=['Class', 'dep_day', 'duration'],
                           label='rank',
                           group_id='group')
        ])

        # train
        pipeline.fit(train_stream)

        # test
        eval_stream = FileDataStream.read_csv(file_path)
        metrics, _ = pipeline.test(eval_stream)
        assert_almost_equal(metrics['NDCG@1'][0],
                            43.571429,
                            decimal=5,
                            err_msg="NDCG@1 should be %s" % 43.571429)
        assert_almost_equal(metrics['NDCG@2'][0],
                            51.28226,
                            decimal=5,
                            err_msg="NDCG@2 should be %s" % 51.28226)
        assert_almost_equal(metrics['NDCG@3'][0],
                            55.168069,
                            decimal=5,
                            err_msg="NDCG@3 should be %s" % 55.168069)
        assert_almost_equal(metrics['DCG@1'][0],
                            4.688759,
                            decimal=3,
                            err_msg="DCG@1 should be %s" % 4.688759)
        assert_almost_equal(metrics['DCG@2'][0],
                            9.012395,
                            decimal=3,
                            err_msg="DCG@2 should be %s" % 9.012395)
        assert_almost_equal(metrics['DCG@3'][0],
                            11.446943,
                            decimal=3,
                            err_msg="DCG@3 should be %s" % 11.446943)
Example No. 10
    def test_ensemble_supports_cv_with_user_defined_transforms(self):
        path = get_dataset("airquality").as_filepath()
        schema = DataSchema.read_schema(path)
        data = FileDataStream(path, schema)

        ind_args = {'Ozone_ind': 'Ozone', 'Solar_R_ind': 'Solar_R'}
        handler_args = {'Solar_R': 'Solar_R', 'Ozone': 'Ozone'}
        lgbm_args = {
            'feature': ['Ozone', 'Solar_R', 'Ozone_ind', 'Solar_R_ind', 'Temp'],
            'label': 'Wind',
            'normalize': 'Yes'
        }
        ols_args = {
            'feature': ['Ozone', 'Solar_R', 'Ozone_ind', 'Solar_R_ind', 'Temp'],
            'label': 'Wind',
            'normalize': 'Yes'
        }
        ogd_args = {
            'feature': ['Ozone', 'Solar_R', 'Ozone_ind', 'Solar_R_ind', 'Temp'],
            'label': 'Wind',
            'shuffle': False,
            'normalize': 'Yes'
        }

        for split_start in ['before_transforms', 'after_transforms']:
            pipeline_steps = [
                Indicator() << ind_args,
                Handler(replace_with='Mean') << handler_args,
                LightGbmRegressor(**lgbm_args)
            ]

            cv_results = CV(pipeline_steps).fit(data, split_start=split_start)
            l2_avg_lgbm = cv_results['metrics_summary'].loc['Average', 'L2(avg)']

            r1 = OrdinaryLeastSquaresRegressor(**ols_args)
            r2 = OnlineGradientDescentRegressor(**ogd_args)
            r3 = LightGbmRegressor(**lgbm_args)

            data = FileDataStream(path, schema)
            pipeline_steps = [
                Indicator() << ind_args,
                Handler(replace_with='Mean') << handler_args,
                VotingRegressor(estimators=[r1, r2, r3], combiner='Average')
            ]

            cv_results = CV(pipeline_steps).fit(data, split_start=split_start)
            l2_avg_ensemble = cv_results['metrics_summary'].loc['Average', 'L2(avg)']

            self.assertTrue(l2_avg_ensemble < l2_avg_lgbm)
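split_start controls where the cross-validation split happens relative to the user-defined transforms: with 'before_transforms' the Indicator and Handler steps are refit inside each fold, while with 'after_transforms' they are applied once before splitting. In both modes the test expects the three-regressor voting ensemble to reach a lower average L2 loss than the single LightGbmRegressor.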
Example No. 11
    def test_with_or_without_pipeline(self):
        # Bug 227810
        # data input (as a FileDataStream)
        path = get_dataset('infert').as_filepath()

        file_schema = 'sep=, col=education:TX:1 col=Features:R4:2-4,6-8 ' \
                      'col=case:R4:5 header=+'
        data = FileDataStream(path, schema=file_schema)

        # without pipeline -- fails
        m = LogisticRegressionBinaryClassifier(feature=['Features'],
                                               label='case')
        m.fit(data)
        scores1 = m.predict(data)

        # with pipeline -- works
        m = Pipeline([
            LogisticRegressionBinaryClassifier(feature=['Features'],
                                               label='case')
        ])
        m.fit(data)
        scores2 = m.predict(data)
        diff = np.abs(scores1.values.ravel() -
                      scores2[['PredictedLabel']].values.ravel())
        assert diff.sum() <= 2
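The bare learner's predict appears to return the predicted labels directly, while the Pipeline version returns PredictedLabel, Probability, and Score columns, hence the scores2[['PredictedLabel']] selection. The diff.sum() <= 2 bound tolerates a couple of borderline rows instead of demanding bit-identical predictions from the two code paths.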
Example No. 12
    def test_filter_no_renaming(self):
        with_nans = pd.DataFrame(
            data=dict(Sepal_Length=[2.5, np.nan, 2.1, 1.0],
                      Sepal_Width=[.75, .9, .8, .76],
                      Petal_Length=[np.nan, 2.5, 2.6, 2.4],
                      Petal_Width=[.8, .7, .9, 0.7],
                      Species=["setosa", "viginica", "", 'versicolor']))

        tmpfile = 'tmpfile_with_nans.csv'
        with_nans.to_csv(tmpfile, index=False)

        file_schema = 'sep=, col=Petal_Length:R4:0 col=Petal_Width:R4:1 ' \
                      'col=Sepal_Length:R4:2 col=Sepal_Width:R4:3 ' \
                      'col=Species:TX:4 header+'
        data = FileDataStream(tmpfile, schema=file_schema)

        try:
            xf = Filter(columns={'Petal_Length': 'Petal_Length'})
            xf.fit(data)
        except TypeError as e:
            assert 'Dictionaries are not allowed to specify input ' \
                   'columns.' in str(e)

        try:
            xf = Filter(columns={'Petal_Length2': 'Petal_Length'})
            xf.fit(data)
        except TypeError as e:
            assert 'Dictionaries are not allowed to specify input ' \
                   'columns.' in str(e)

        # clean up the temporary file, mirroring test_filter below
        os.remove(tmpfile)
Example No. 13
    def test_filter(self):
        with_nans = pd.DataFrame(
            data=dict(Sepal_Length=[2.5, np.nan, 2.1, 1.0],
                      Sepal_Width=[.75, .9, .8, .76],
                      Petal_Length=[np.nan, 2.5, 2.6, 2.4],
                      Petal_Width=[.8, .7, .9, 0.7]))

        tmpfile = 'tmpfile_with_nans.csv'
        with_nans.to_csv(tmpfile, index=False, na_rep='?')

        file_schema = 'sep=, col=Petal_Length:R4:0 col=Petal_Width:R4:1 ' \
                      'col=Sepal_Length:R4:2 col=Sepal_Width:R4:3 header+'
        data = FileDataStream(tmpfile, schema=file_schema)

        xf = Filter(columns=[
            'Petal_Length', 'Petal_Width', 'Sepal_Length', 'Sepal_Width'
        ])

        features = xf.fit_transform(data)

        assert features.shape == (2, 4)
        print(features.columns)
        # columns ordering changed between 0.22 and 0.23
        assert set(features.columns) == {
            'Petal_Length', 'Petal_Width', 'Sepal_Length', 'Sepal_Width'
        }
        os.remove(tmpfile)
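Filter drops every row with a missing value in the listed columns: the first row has NaN in Petal_Length and the second in Sepal_Length, so only two of the four rows survive, hence the (2, 4) shape. The '?' written by na_rep='?' is parsed back as a missing value by the R4 columns.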
Example No. 14
 def test_model_summary_not_supported(self):
     for learner in learners_not_supported:
         pipeline = Pipeline(
             [OneHotVectorizer() << categorical_columns, learner])
         train_stream = FileDataStream(train_file, schema=file_schema)
         pipeline.fit(train_stream, label_column)
         assert_raises(TypeError, pipeline.summary)
Example No. 15
 def test_model_summary(self):
     for learner in learners:
         pipeline = Pipeline(
             [OneHotVectorizer() << categorical_columns, learner])
         train_stream = FileDataStream(train_file, schema=file_schema)
         pipeline.fit(train_stream, label_column)
         pipeline.summary()
Example No. 16
    def test_combined_models_support_predict_proba_with_more_than_2_classes(
            self):
        path = get_dataset('infert').as_filepath()
        data = FileDataStream.read_csv(path)

        featurization_pipeline = Pipeline(
            [OneHotVectorizer(columns={'education': 'education'})])
        featurization_pipeline.fit(data)
        featurized_data = featurization_pipeline.transform(data)

        feature_cols = ['education', 'age']
        training_pipeline = Pipeline([
            DatasetTransformer(featurization_pipeline.model),
            OneVsRestClassifier(LogisticRegressionBinaryClassifier(),
                                feature=feature_cols,
                                label='induced')
        ])
        training_pipeline.fit(data, output_predictor_model=True)

        concat_pipeline = Pipeline(
            [PrefixColumnConcatenator({'education': 'education.'})])
        concat_pipeline.fit(featurized_data)

        predictor_pipeline = Pipeline()
        predictor_pipeline.load_model(training_pipeline.predictor_model)

        concat_and_predictor_pipeline = Pipeline.combine_models(
            concat_pipeline, predictor_pipeline)

        result = concat_and_predictor_pipeline.predict_proba(featurized_data)
        self.assertEqual(result.shape[1], 3)
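The induced column of the infert data takes three distinct values, so the one-vs-rest model produces one probability column per class; the final assertion checks exactly that.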
Example No. 17
    def test_get_fit_info_fastl(self):
        train_file = get_dataset("airquality").as_filepath()
        schema = DataSchema.read_schema(train_file)
        data = FileDataStream(train_file, schema)

        pipeline = Pipeline([
            Filter(columns=['Ozone']),
            FastLinearRegressor(feature=['Solar_R', 'Temp'], label='Ozone')
        ])

        info = pipeline.get_fit_info(data)
        exp = [{
            'name': None,
            'outputs': ['Unnamed0', 'Ozone', 'Solar_R', 'Wind', 'Temp',
                        'Month', 'Day'],
            'schema_after': ['Unnamed0', 'Ozone', 'Solar_R', 'Wind', 'Temp',
                             'Month', 'Day'],
            'type': 'start'
        }, {
            'inputs': ['Ozone'],
            'name': 'Filter',
            'outputs': ['Ozone'],
            'schema_after': ['Unnamed0', 'Ozone', 'Solar_R', 'Wind', 'Temp',
                             'Month', 'Day'],
            'type': 'transform'
        }]
        for el in info[0]:
            if 'operator' in el:
                del el['operator']
        self.assertEqual(exp, info[0][:2])
Example No. 18
    def test_data_stream(self):
        df = pandas.DataFrame(dict(a=[0, 1], b=[0.1, 0.2]))
        with tempfile.NamedTemporaryFile(mode='w', delete=False) as f:
            df.to_csv(f, sep=',', index=False)

        fi = FileDataStream.read_csv(f.name, sep=',')
        fi2 = fi.clone()
        assert repr(fi) == repr(fi2)
        os.remove(f.name)
Example No. 19
 def test_defaults(self):
     schema = DataSchema.read_schema(infert_file, numeric_dtype=np.float32)
     data = FileDataStream.read_csv(infert_file, schema=schema)
     pipeline_steps = [
         OneHotVectorizer(columns={'edu': 'education'}),
         KMeansPlusPlus(
             n_clusters=5,
             feature=['edu', 'age', 'parity', 'spontaneous', 'stratum'])
     ]
     check_cv(pipeline_steps, data)
Example No. 20
    def test_ngramfeaturizer_single(self):

        path = get_dataset('infert').as_filepath()
        file_schema = 'sep=, col=id:TX:0 col=education:TX:1 col=age:R4:2 ' \
                      'col=parity:R4:3 col=induced:R4:4 col=case:R4:5 ' \
                      'col=spontaneous:R4:6 quote+ header=+'
        data = FileDataStream(path, schema=file_schema)
        xf = NGramFeaturizer(word_feature_extractor=n_gram(),
                             columns={'features': ['id', 'education']})

        features = xf.fit_transform(data)
        assert features.shape == (248, 652)
Example No. 21
 def test_groups(self):
     # one learner type is enough for testing sanity of groups argument
     file_schema = 'sep=, col=age:TX:2 col=Label:R4:5 ' \
                   'col=Features:R4:6-8 header=+'
     data = FileDataStream(infert_file, schema=file_schema)
     expected_metrics = {'AUC': 0.704883, 'Accuracy': 0.717414}
     pipeline = self.pipeline(learner_arguments={'feature': 'Features'},
                              transforms=[])
     check_cv(pipeline,
              data,
              groups='age',
              expected_metrics=expected_metrics)
Example No. 22
    def test_multiple_user_specified_columns_is_not_allowed(self):
        path = get_dataset('timeseries').as_filepath()
        data = FileDataStream.read_csv(path)

        try:
            pipeline = Pipeline([
                IidSpikeDetector(columns=['t2', 't3'], pvalue_history_length=5)
            ])
            pipeline.fit_transform(data)

        except RuntimeError as e:
            self.assertTrue('Only one column is allowed' in str(e))
            return

        self.fail()
Example No. 23
    def test_different_schema_with_filedatastream_input(self):
        train_filename = "train-data.csv"
        train_df.to_csv(train_filename, index=False, header=True)
        train_data_stream = FileDataStream.read_csv(train_filename, sep=',', header=True)

        test_filename = "test-data.csv"
        test_df.to_csv(test_filename, index=False, header=True)
        test_data_stream = FileDataStream.read_csv(test_filename, sep=',', header=True)

        # Create reference pipeline
        std_pipeline = Pipeline([
            OneHotVectorizer() << 'c0',
            OnlineGradientDescentRegressor(label='c2', feature=['c0', 'c1'])
        ], random_state=seed)

        std_pipeline.fit(train_data_stream)
        result_1 = std_pipeline.predict(test_data_stream)

        # Create combined pipeline
        transform_pipeline = Pipeline([OneHotVectorizer() << 'c0'], random_state=seed)
        transform_pipeline.fit(train_data_stream)

        combined_pipeline = Pipeline([
            DatasetTransformer(transform_model=transform_pipeline.model),
            OnlineGradientDescentRegressor(label='c2', feature=['c0', 'c1'])
        ], random_state=seed)
        combined_pipeline.fit(train_data_stream)

        os.remove(transform_pipeline.model)

        result_2 = combined_pipeline.predict(test_data_stream)

        self.assertTrue(result_1.equals(result_2))

        os.remove(train_filename)
        os.remove(test_filename)
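train_df, test_df, and seed are module-level fixtures defined elsewhere in the test file. A plausible sketch, assuming pandas is imported as pd; the values are purely illustrative:

    # hypothetical fixtures for the test above
    seed = 0
    train_df = pd.DataFrame({'c0': ['a', 'b', 'a', 'b'],
                             'c1': [1.0, 2.0, 3.0, 4.0],
                             'c2': [2.0, 3.0, 4.0, 5.0]})
    # per the test name, the test frame's schema differs from training;
    # for example, it may omit the label column 'c2'
    test_df = train_df.drop(['c2'], axis=1)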
Example No. 24
 def test_columns_concatenator(self):
     path = get_dataset('infert').as_filepath()
     file_schema = 'sep=, col=id:TX:0 col=education:TX:1 col=age:R4:2 ' \
                   'col=parity:R4:3 col=induced:R4:4 col=case:R4:5 ' \
                   'col=spontaneous:R4:6 header=+'
     data = FileDataStream(path, schema=file_schema)
     xf = ColumnConcatenator(
         columns={'features': ['age', 'parity', 'induced']})
     features = xf.fit_transform(data)
     assert features.shape == (248, 10)
     # columns ordering changed between 0.22 and 0.23
     assert set(features.columns) == {
         'age', 'case', 'education', 'features.age', 'features.induced',
         'features.parity', 'id', 'induced', 'parity', 'spontaneous'
     }
Example No. 25
    def test_schema_with_vectorized_column(self):
        path = get_dataset('infert').as_filepath()
        data = FileDataStream.read_csv(path)

        featurization_pipeline = Pipeline(
            [OneHotVectorizer(columns={'education': 'education'})])
        featurization_pipeline.fit(data)
        featurized_data = featurization_pipeline.transform(
            data, as_binary_data_stream=True)

        # col=row_num:I8:0 col=education:R4:1-3 col=age:I8:4 col=parity:I8:5
        # col=induced:I8:6 col=case:I8:7 col=spontaneous:I8:8 col=stratum:I8:9
        # col=pooled.stratum:I8:10 quote+
        schema = featurized_data.schema

        self.assertEqual(len(schema), 9)
        self.assertEqual(schema['age'].Type, 'I8')
        self.assertEqual(schema['age'].Name, 'age')
        self.assertEqual(schema['age'].IsVector, False)

        self.assertEqual(schema['education'].Type, 'R4')
        self.assertEqual(schema['education'].Name, 'education')
        self.assertEqual(len(schema['education'].Pos), 3)
        self.assertEqual(schema['education'].IsVector, True)

        self.assertTrue('education.0-5yrs' not in schema)
        self.assertTrue('education.6-11yrs' not in schema)
        self.assertTrue('education.12+yrs' not in schema)

        # col=row_num:I8:0 col=education.0-5yrs:R4:1 col=education.6-11yrs:R4:2
        # col=education.12+yrs:R4:3 col=age:I8:4 col=parity:I8:5 col=induced:I8:6
        # col=case:I8:7 col=spontaneous:I8:8 col=stratum:I8:9 col=pooled.stratum:I8:10
        # quote+ header=+
        schema = featurized_data.get_dataframe_schema()

        self.assertEqual(len(schema), 11)
        self.assertEqual(schema['age'].Type, 'I8')
        self.assertEqual(schema['age'].Name, 'age')
        self.assertEqual(schema['age'].IsVector, False)

        self.assertTrue('education' not in schema)
        self.assertTrue('education.0-5yrs' in schema)
        self.assertTrue('education.6-11yrs' in schema)
        self.assertTrue('education.12+yrs' in schema)

        self.assertEqual(schema['education.0-5yrs'].Type, 'R4')
        self.assertEqual(schema['education.0-5yrs'].Name, 'education.0-5yrs')
        self.assertEqual(schema['education.0-5yrs'].IsVector, False)
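The two schema views differ by design: as a binary data stream, the one-hot output remains a single three-slot vector column named education (9 columns in total), while get_dataframe_schema expands each slot into its own education.* column (11 in total).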
Example No. 26
    def test_fit_transform(self):
        import azureml.dataprep as dprep

        path = get_dataset('infert').as_filepath()
        dflow = dprep.auto_read_file(path=path)
        dprep_data = DprepDataStream(dflow)
        file_data = FileDataStream.read_csv(path)

        xf = MinMaxScaler(columns={'in': 'induced', 'sp': 'spontaneous'})
        pipe = Pipeline([xf])
        transformed_data = pipe.fit_transform(file_data)
        transformed_data1 = pipe.fit_transform(dprep_data)

        assert_array_equal(transformed_data.columns, transformed_data1.columns)
        assert_2d_array_equal(transformed_data.values,
                              transformed_data1.values)
Example No. 27
    def test_word_embedding_example2(self):
        path = get_dataset('infert').as_filepath()
        file_schema = 'sep=, col=id:TX:0 col=education:TX:1 col=age:R4:2 ' \
                      'col=parity:R4:3 col=induced:R4:4 col=case:R4:5 ' \
                      'col=spontaneous:R4:6 header=+'
        data = FileDataStream(path, schema=file_schema)

        pipeline = Pipeline([
            NGramFeaturizer(word_feature_extractor=Ngram(),
                            output_tokens=True,
                            columns={'features': ['id', 'education']}),
            WordEmbedding(columns='features_TransformedText')
        ])

        features = pipeline.fit_transform(data)
        assert features.shape == (248, 802)
        assert 'features_TransformedText.94' in list(features.columns)
Example No. 28
    def test_fit_transform(self):
        # data input (as a FileDataStream)
        path = get_dataset('infert').as_filepath()

        data = FileDataStream.read_csv(path)

        # transform usage
        xf = OneHotVectorizer(
            columns={
                'edu': 'education',
                'in': 'induced',
                'sp': 'spontaneous'})

        # fit and transform
        res1 = xf.fit_transform(data)
        res2 = xf.fit(data).transform(data)
        assert_frame_equal(res1, res2)
Example No. 29
    def test_multiple_user_specified_columns_is_not_allowed(self):
        path = get_dataset('timeseries').as_filepath()
        data = FileDataStream.read_csv(path)

        try:
            pipeline = Pipeline([
                SsaForecaster(series_length=8,
                              train_size=15,
                              window_size=5,
                              horizon=2,
                              columns=['t2', 't3'])
            ])
            pipeline.fit_transform(data)

        except RuntimeError as e:
            self.assertTrue('Only one column is allowed' in str(e))
            return

        self.fail()
Example No. 30
    def test_word_embedding_example_dict_newname(self):
        path = get_dataset('infert').as_filepath()
        file_schema = 'sep=, col=id:TX:0 col=education:TX:1 col=age:R4:2 ' \
                      'col=parity:R4:3 col=induced:R4:4 col=case:R4:5 ' \
                      'col=spontaneous:R4:6 quote+ header=+'
        data = FileDataStream(path, schema=file_schema)
        pipeline = Pipeline([
            NGramFeaturizer(word_feature_extractor=Ngram(),
                            output_tokens_column_name='features_TransformedText',
                            columns={'features': ['id', 'education']}),

            # features_TransformedText is the token column emitted by the
            # NGramFeaturizer above (named via output_tokens_column_name);
            # WordEmbedding consumes it to produce embedding features.
            WordEmbedding(
                columns={
                    'features_TransformedText2': 'features_TransformedText'})
        ])

        features = pipeline.fit_transform(data)
        assert features.shape == (248, 409)