Example #1
    def test_syntax9_slots_label(self):

        train_reviews = pandas.DataFrame(data=dict(
            review=[
                "This is great", "I hate it", "Love it", "Do not like it",
                "Really like it", "I hate it", "I like it a lot",
                "I kind of hate it", "I do like it", "I really hate it",
                "It is very good", "I hate it a bunch", "I love it a bunch",
                "I hate it", "I like it very much", "I hate it very much.",
                "I really do love it", "I really do hate it", "Love it!",
                "Hate it!", "I love it", "I hate it", "I love it", "I hate it",
                "I love it"
            ],
            like=[
                True, False, True, False, True, False, True, False, True,
                False, True, False, True, False, True, False, True, False,
                True, False, True, False, True, False, True
            ]))

        X = train_reviews.loc[:, train_reviews.columns != 'like']
        y = train_reviews[['like']]

        transform_1 = NGramFeaturizer(word_feature_extractor=n_gram())
        transform_2 = MutualInformationSelector()
        exp = Pipeline([transform_1, transform_2])
        res = exp.fit_transform(X, y)
        assert res is not None

        # Scikit compatibility (compose transforms inside a scikit-learn
        # style Pipeline). In this scenario we do not provide
        # {input, output} arguments.
        transform_1 = NGramFeaturizer(word_feature_extractor=n_gram())
        transform_2 = MutualInformationSelector(slots_in_output=2)
        pipe = Pipeline([transform_1, transform_2])
        res = pipe.fit_transform(X, y)
        assert res is not None
Example #2
    def test_multiple_user_specified_columns_is_not_allowed(self):
        path = get_dataset('timeseries').as_filepath()
        data = FileDataStream.read_csv(path)

        try:
            pipeline = Pipeline([
                IidSpikeDetector(columns=['t2', 't3'], pvalue_history_length=5)
            ])
            pipeline.fit_transform(data)

        except RuntimeError as e:
            self.assertTrue('Only one column is allowed' in str(e))
            return

        self.fail()
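Since each spike detector instance accepts only a single input column, the apparent workaround is one detector per column. A minimal sketch under that assumption; the 't2_spikes'/'t3_spikes' output names are illustrative, and the columns={output: input} form mirrors the SsaSpikeDetector example further below:

from nimbusml import Pipeline, FileDataStream
from nimbusml.datasets import get_dataset
from nimbusml.timeseries import IidSpikeDetector

data = FileDataStream.read_csv(get_dataset('timeseries').as_filepath())

# one IidSpikeDetector per input column; each instance maps a single
# input column to its own output column (output names are illustrative)
pipeline = Pipeline([
    IidSpikeDetector(columns={'t2_spikes': 't2'}, pvalue_history_length=5),
    IidSpikeDetector(columns={'t3_spikes': 't3'}, pvalue_history_length=5)
])
result = pipeline.fit_transform(data)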
Example #3
    def test_PcaTransformer_int(self):
        df_ = get_dataset("infert").as_df()
        res = {}
        dt = {}
        for ty in (int, float):
            df = df_.copy()
            df['age'] = df['age'].astype(ty)
            df['parity'] = df['parity'].astype(ty)
            df['spontaneous'] = df['spontaneous'].astype(ty)
            df['stratum'] = df['stratum'].astype(ty)
            X = ['age', 'parity', 'spontaneous', 'stratum']
            pipe = Pipeline([
                ColumnConcatenator() << {'X': X},
                PcaTransformer(rank=3) << 'X'
            ])
            y = pipe.fit_transform(df[X], verbose=0)
            res[ty] = y.sum().sum()
            dt[ty] = list(y.dtypes)
        vals = list(res.values())
        assert_almost_equal(vals[0], vals[1])
        dt = list(dt.values())
        dt[0].sort()
        dt[1].sort()
        assert dt[0] != dt[1]
Example #4
    def test_lpscaler_automatically_converts_to_single(self):
        in_df = pd.DataFrame(
            data=dict(Sepal_Length=[2.5, 1, 2.1, 1.0],
                      Sepal_Width=[.75, .9, .8, .76],
                      Petal_Length=[0, 2.5, 2.6, 2.4],
                      Species=["setosa", "viginica", "setosa", 'versicolor']))

        in_df.iloc[:, 0:3] = in_df.iloc[:, 0:3].astype(np.float64)

        src_cols = ['Sepal_Length', 'Sepal_Width', 'Petal_Length']

        pipeline = Pipeline([
            ColumnConcatenator() << {
                'concat': src_cols
            },
            LpScaler() << {
                'norm': 'concat'
            }
        ])
        out_df = pipeline.fit_transform(in_df)

        cols = ['concat.' + s for s in src_cols]
        cols.extend(['norm.' + s for s in src_cols])
        total = out_df[cols].sum().sum()
        sum_range = (23.24, 23.25)
        assert_greater(total, sum_range[0],
                       "sum should be greater than %s" % sum_range[0])
        assert_less(total, sum_range[1],
                    "sum should be less than %s" % sum_range[1])
Example #5
    def test_datetime_column_parsed_from_string(self):
        dates = ["2018-01-02", "2018-02-01"]
        df = pd.DataFrame({'c1': dates, 'c2': [3, 4]})

        file_name = get_temp_file('.csv')
        df.to_csv(file_name)
        df = pd.read_csv(file_name, parse_dates=['c1'], index_col=0)

        self.assertEqual(df.dtypes[0], np.dtype('datetime64[ns]'))

        pipeline = Pipeline(steps=[Handler(columns={'c2': 'c2'})])
        result = pipeline.fit_transform(df)

        self.assertEqual(result.loc[0, 'c1'].year, 2018)
        self.assertEqual(result.loc[0, 'c1'].month, 1)
        self.assertEqual(result.loc[0, 'c1'].day, 2)
        self.assertEqual(result.loc[0, 'c1'].hour, 0)
        self.assertEqual(result.loc[0, 'c1'].minute, 0)
        self.assertEqual(result.loc[0, 'c1'].second, 0)

        self.assertEqual(result.loc[1, 'c1'].year, 2018)
        self.assertEqual(result.loc[1, 'c1'].month, 2)
        self.assertEqual(result.loc[1, 'c1'].day, 1)
        self.assertEqual(result.loc[1, 'c1'].hour, 0)
        self.assertEqual(result.loc[1, 'c1'].minute, 0)
        self.assertEqual(result.loc[1, 'c1'].second, 0)

        self.assertEqual(len(result), 2)
        self.assertEqual(result.dtypes[0], np.dtype('datetime64[ns]'))

        os.remove(file_name)
Example #6
    def test_globalcontrastrowscaler(self):
        in_df = pd.DataFrame(
            data=dict(Sepal_Length=[2.5, 1, 2.1, 1.0],
                      Sepal_Width=[.75, .9, .8, .76],
                      Petal_Length=[0, 2.5, 2.6, 2.4],
                      Species=["setosa", "viginica", "setosa", 'versicolor']))

        in_df.iloc[:, 0:3] = in_df.iloc[:, 0:3].astype(np.float32)

        # concatenate the three input columns into a single
        # vector-valued column named 'concated_columns'
        concat = ColumnConcatenator() << {
            'concated_columns':
            ['Petal_Length', 'Sepal_Width', 'Sepal_Length']
        }

        # Performs a global contrast normalization on input values:
        # Y = (s * X - M) / D, where s is a scale, M is the mean and D
        # is either the L2 norm or the standard deviation
        normed = GlobalContrastRowScaler() << {
            'normed_columns': 'concated_columns'
        }

        pipeline = Pipeline([concat, normed])
        out_df = pipeline.fit_transform(in_df)
        cols = [
            'concated_columns.' + s
            for s in ['Sepal_Length', 'Sepal_Width', 'Petal_Length']
        ]
        cols.extend([
            'normed_columns.' + s
            for s in ['Sepal_Length', 'Sepal_Width', 'Petal_Length']
        ])
        total = out_df[cols].sum().sum()
        assert_greater(total, 17.309, "sum should be greater than %s" % 17.309)
        assert_less(total, 17.3102, "sum should be less than %s" % 17.3102)
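As a rough check of the formula in the comment, here is a numpy sketch of the per-row computation Y = (s * X - M) / D with s = 1 and D taken as the standard deviation; whether the trained transform uses the standard deviation or the L2 norm for D (and its exact scaling conventions) is an assumption here, not something this test pins down:

import numpy as np

# rows are the concatenated (Petal_Length, Sepal_Width, Sepal_Length) values
X = np.array([[0.0, 0.75, 2.5],
              [2.5, 0.9, 1.0],
              [2.6, 0.8, 2.1],
              [2.4, 0.76, 1.0]], dtype=np.float32)

s = 1.0                               # scale
M = X.mean(axis=1, keepdims=True)     # per-row mean
D = X.std(axis=1, keepdims=True)      # per-row std (alternative: L2 norm)
Y = (s * X - M) / D                   # global contrast normalization
print(Y)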
Example #7
    def test_fit_transform(self):
        import azureml.dataprep as dprep

        path = get_dataset('infert').as_filepath()
        dflow = dprep.auto_read_file(path=path)
        dprep_data = DprepDataStream(dflow)
        file_data = FileDataStream.read_csv(path)

        xf = MinMaxScaler(columns={'in': 'induced', 'sp': 'spontaneous'})
        pipe = Pipeline([xf])
        transformed_data = pipe.fit_transform(file_data)
        transformed_data1 = pipe.fit_transform(dprep_data)

        assert_array_equal(transformed_data.columns, transformed_data1.columns)
        assert_2d_array_equal(transformed_data.values,
                              transformed_data1.values)
Example #8
    def test_minmaxscaler_float_order_noint(self):
        in_df = pandas.DataFrame(data=OrderedDict(xpetal=[-1.1, -2.2, -3.3],
                                                  ipetal=[1.0, 2.0, 3.0]))

        normed = MinMaxScaler() << ['xpetal', 'ipetal']
        pipeline = Pipeline([normed])
        out_df = pipeline.fit_transform(in_df, verbose=0)
        assert_equal(out_df.shape, (3, 2))
        assert_equal(list(out_df.columns), list(in_df.columns))
Example #9
    def test_multiple_user_specified_columns_is_not_allowed(self):
        path = get_dataset('timeseries').as_filepath()
        data = FileDataStream.read_csv(path)

        try:
            pipeline = Pipeline([
                SsaForecaster(series_length=8,
                              train_size=15,
                              window_size=5,
                              horizon=2,
                              columns=['t2', 't3'])
            ])
            pipeline.fit_transform(data)

        except RuntimeError as e:
            self.assertTrue('Only one column is allowed' in str(e))
            return

        self.fail()
Example #10
    def test_with_integer_inputs(self):
        df = pandas.DataFrame(data=dict(c0=[1, 3, 5, 7, 9]))

        xf = RobustScaler(columns='c0', center=True, scale=True)
        pipeline = Pipeline([xf])
        result = pipeline.fit_transform(df)

        expected_result = pandas.Series([-1.0, -0.5, 0.0, 0.5, 1.0])

        self.assertTrue(result.loc[:, 'c0'].equals(expected_result))
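The expected series is exactly what centering by the median and scaling by the interquartile range would produce, so here is a quick pandas sketch of that computation (assuming this is what center=True/scale=True do, which the asserted values suggest):

import pandas

c0 = pandas.Series([1, 3, 5, 7, 9])
median = c0.median()                         # 5.0
iqr = c0.quantile(0.75) - c0.quantile(0.25)  # 7.0 - 3.0 = 4.0
print((c0 - median) / iqr)                   # [-1.0, -0.5, 0.0, 0.5, 1.0]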
Example #11
    def test_transform_int(self):
        in_df = pandas.DataFrame(
            data=dict(xpetal=[-1, -2, -3], ipetal=[1, 2, 3]))

        normed = MeanVarianceScaler() << ['xpetal', 'ipetal']
        pipeline = Pipeline([normed])
        out_df = pipeline.fit_transform(in_df, verbose=0)
        assert_equal(out_df.shape, (3, 2))
        assert_almost_equal(out_df.loc[2, 'xpetal'], -1.3887302, decimal=3)
        assert_almost_equal(out_df.loc[2, 'ipetal'], 1.38873, decimal=3)
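The asserted value matches dividing by the root mean square of the column (the variance taken around zero, so that 0 stays mapped to 0) rather than a classic z-score; this reading is inferred from the numbers, not from documentation:

import numpy as np

xpetal = np.array([-1.0, -2.0, -3.0])
rms = np.sqrt((xpetal ** 2).mean())  # sqrt((1 + 4 + 9) / 3) ~= 2.16025
print(xpetal[2] / rms)               # ~= -1.38873, the asserted value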
Example #12
    def test_minmaxscaler_int(self):
        in_df = pandas.DataFrame(
            data=dict(xpetal=[-1, -2, -3], ipetal=[1, 2, 3]))

        normed = MinMaxScaler() << ['xpetal', 'ipetal']
        pipeline = Pipeline([normed])
        out_df = pipeline.fit_transform(in_df, verbose=0)
        assert_equal(out_df.shape, (3, 2))
        if out_df.loc[2, 'xpetal'] != -1:
            raise Exception("Unexpected:\n" + str(out_df))
        assert_equal(out_df.loc[2, 'ipetal'], 1)
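Likewise, the asserted endpoints here (-3 -> -1 and 3 -> 1) are consistent with a min-max variant that keeps zero fixed by dividing each column by its maximum absolute value (again inferred from the asserted outputs):

import numpy as np

xpetal = np.array([-1, -2, -3])
print(xpetal / np.abs(xpetal).max())  # [-0.333..., -0.667..., -1.0]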
Example #13
    def test_PcaTransformer_no_concat(self):
        df = get_dataset("infert").as_df()
        X = [
            'age', 'parity', 'induced', 'spontaneous', 'stratum',
            'pooled.stratum'
        ]
        pipe = Pipeline([
            PcaTransformer(rank=3) << ['age', 'parity', 'spontaneous',
                                       'stratum']
        ])
        y = pipe.fit_transform(df[X].astype(numpy.float32))
        assert y is not None
Example #14
    def test_ssweembedding(self):
        wordvectors = pd.DataFrame(data=dict(
            w1=["like", "hate", "okay"],
            w2=["great", "horrible", "lukewarm"],
            w3=["awesome", "worst", "boring"]))
        mycols = ['w1', 'w2', 'w3']
        concat = ColumnConcatenator() << {'features': mycols}
        sswe = WordEmbedding() << 'features'
        pipeline = Pipeline([concat, sswe])
        y = pipeline.fit_transform(wordvectors)
        y = y[[col for col in y.columns if 'features' in col]]
        assert_almost_equal(y.sum().sum(), -97.6836, decimal=4,
                            err_msg="Sum should be %s" % -97.6836)
Example #15
    def test_holidays(self):
        df = pandas.DataFrame(
            data=dict(tokens1=[1, 2, 3, 157161600], tokens2=[10, 11, 12, 13]))

        cols_to_drop = [
            'dtHour12', 'dtDayOfWeek', 'dtDayOfQuarter', 'dtDayOfYear',
            'dtWeekOfMonth', 'dtQuarterOfYear', 'dtHalfOfYear', 'dtWeekIso',
            'dtYearIso', 'dtMonthLabel', 'dtAmPmLabel', 'dtDayOfWeekLabel',
            'dtIsPaidTimeOff'
        ]

        dts = DateTimeSplitter(prefix='dt', country='Canada') << 'tokens1'
        pipeline = Pipeline([dts, ColumnSelector(drop_columns=cols_to_drop)])
        y = pipeline.fit_transform(df)

        self.assertEqual(y.loc[3, 'dtHolidayName'], 'Christmas Day')
Example #16
    def test_PcaTransformer(self):
        df = get_dataset("infert").as_df()
        X = [
            'age', 'parity', 'induced', 'spontaneous', 'stratum',
            'pooled.stratum'
        ]
        pipe = Pipeline([
            ColumnConcatenator() << {'X': X},
            PcaTransformer(rank=3) << 'X'
        ])
        y = pipe.fit_transform(df[X].astype(numpy.float32))
        y = y[['X.0', 'X.1', 'X.2']]
        assert_almost_equal(y.sum().sum(), 11.293087, decimal=3,
                            err_msg="Sum should be %s" % 11.293087)
Example #17
    def test_ngramfeaturizer(self):
        train_df = pandas.DataFrame(data=dict(review=['one', 'two']))

        pipeline = Pipeline([
            CharTokenizer(columns={'review_transform': 'review'}),
            NGramExtractor(ngram_length=3,
                           all_lengths=False,
                           columns={'ngrams': 'review_transform'}),
            ColumnDropper(columns=['review_transform', 'review'])
        ])

        result = pipeline.fit_transform(train_df)
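        # CharTokenizer apparently adds start/end-of-word marker characters,
        # so each 3-letter review becomes 5 tokens and yields 3 distinct
        # trigrams (e.g. 'o|n|e'); two reviews * 3 trigrams = 6 ngram columns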
        self.assertEqual(len(result.columns), 6)
        self.assertEqual(result.loc[0, 'ngrams.o|n|e'], 1.0)
        self.assertEqual(result.loc[1, 'ngrams.o|n|e'], 0.0)
        self.assertEqual(result.loc[0, 'ngrams.t|w|o'], 0.0)
        self.assertEqual(result.loc[1, 'ngrams.t|w|o'], 1.0)
Example #18
    def test_word_embedding_example2(self):
        path = get_dataset('infert').as_filepath()
        file_schema = 'sep=, col=id:TX:0 col=education:TX:1 col=age:R4:2 ' \
                      'col=parity:R4:3 col=induced:R4:4 col=case:R4:5 ' \
                      'col=spontaneous:R4:6 header=+'
        data = FileDataStream(path, schema=file_schema)

        pipeline = Pipeline([
            NGramFeaturizer(word_feature_extractor=Ngram(),
                            output_tokens=True,
                            columns={'features': ['id', 'education']}),
            WordEmbedding(columns='features_TransformedText')
        ])

        features = pipeline.fit_transform(data)
        assert features.shape == (248, 802)
        assert 'features_TransformedText.94' in list(features.columns)
Example #19
    def test_three_pipelines_created_using_idv_binary_data_can_be_combined_in_to_one_model(
            self):
        """
        This test verifies that three models can be combined
        even if the transform increases the number of columns.
        """
        # Create and fit a RangeFilter transform using the training
        # data and use it to transform the training data.
        transform_pipeline_1 = Pipeline(
            [RangeFilter(min=0.0, max=4.5) << 'c2'])
        df = transform_pipeline_1.fit_transform(train_df,
                                                as_binary_data_stream=True)

        # Create and fit a OneHotVectorizer transform using
        # the transformed data from the previous step and use it
        # to transform the data from the previous step.
        transform_pipeline_2 = Pipeline([OneHotVectorizer() << 'c0'],
                                        random_state=seed)
        transform_pipeline_2.fit(df)
        df = transform_pipeline_2.transform(df, as_binary_data_stream=True)

        # Create and fit an OnlineGradientDescentRegressor using
        # the transformed training data from the previous step.
        predictor_pipeline = Pipeline(
            [OnlineGradientDescentRegressor(label='c2', feature=['c0', 'c1'])],
            random_state=seed)
        predictor_pipeline.fit(df)

        # Perform a prediction given the test data using
        # the transforms and predictor defined previously.
        df = transform_pipeline_1.transform(test_df,
                                            as_binary_data_stream=True)
        df = transform_pipeline_2.transform(df, as_binary_data_stream=True)
        result_1 = predictor_pipeline.predict(df)

        # Combine the above Pipelines in to one Pipeline and use
        # the new Pipeline to get predictions given the test data.
        combined_pipeline = Pipeline.combine_models(transform_pipeline_1,
                                                    transform_pipeline_2,
                                                    predictor_pipeline)
        result_2 = combined_pipeline.predict(test_df)

        # Verify that the predictions from the combined Pipeline
        # match the predictions from the original Pipelines.
        self.assertEqual(result_1.loc[0, 'Score'], result_2.loc[0, 'Score'])
        self.assertEqual(result_1.loc[1, 'Score'], result_2.loc[1, 'Score'])
Example #20
    def test_dprep_datastream(self):
        import azureml.dataprep as dprep

        dates = ["2018-01-02 00:00:00", "2018-02-01 10:00:00"]
        col2 = ['0', '1']
        label_array = np.repeat([0], 2)
        train_df = pd.DataFrame({
            'col1': dates,
            'col2': col2,
            'label': label_array
        })

        pipeline = Pipeline(steps=[
            Handler(columns={'2': 'col2'},
                    concat=False,
                    impute_by_slot=True,
                    replace_with='Mean')
        ])

        file_name = get_temp_file('.csv')
        train_df.to_csv(file_name)

        dataflow = dprep.read_csv(file_name, infer_column_types=True)
        dprepDataStream = DprepDataStream(dataflow)

        result = pipeline.fit_transform(dprepDataStream)

        self.assertEqual(result.loc[:, 'col1'].dtype,
                         np.dtype('datetime64[ns]'))

        self.assertEqual(result.loc[0, 'col1'].year, 2018)
        self.assertEqual(result.loc[0, 'col1'].month, 1)
        self.assertEqual(result.loc[0, 'col1'].day, 2)
        self.assertEqual(result.loc[0, 'col1'].hour, 0)
        self.assertEqual(result.loc[0, 'col1'].minute, 0)
        self.assertEqual(result.loc[0, 'col1'].second, 0)

        self.assertEqual(result.loc[1, 'col1'].year, 2018)
        self.assertEqual(result.loc[1, 'col1'].month, 2)
        self.assertEqual(result.loc[1, 'col1'].day, 1)
        self.assertEqual(result.loc[1, 'col1'].hour, 10)
        self.assertEqual(result.loc[1, 'col1'].minute, 0)
        self.assertEqual(result.loc[1, 'col1'].second, 0)

        os.remove(file_name)
Example #21
    def test_negative_values(self):
        milliseconds_in_year = 365 * 24 * 60 * 60 * 1000
        data = [i * milliseconds_in_year for i in [-1, -2, -3, -3.3]]

        df = pd.DataFrame({'c1': data, 'c2': [3, 4, 5, 6]})
        df = df.astype({'c1': np.dtype('datetime64[ms]')})

        pipeline = Pipeline(steps=[Handler(columns={'c2': 'c2'})])
        result = pipeline.fit_transform(df)

        self.assertTrue(result.loc[:, 'c1'].equals(df.loc[:, 'c1']))
        self.assertEqual(result.loc[:, 'c1'].dtype, np.dtype('datetime64[ns]'))

        self.assertEqual(result.loc[0, 'c1'].year, 1969)
        self.assertEqual(result.loc[0, 'c1'].hour, 0)
        self.assertEqual(result.loc[0, 'c1'].minute, 0)
        self.assertEqual(result.loc[0, 'c1'].second, 0)

        self.assertEqual(result.loc[3, 'c1'].year, 1966)
Example #22
    def test_word_embedding_example_dict_newname(self):
        path = get_dataset('infert').as_filepath()
        file_schema = 'sep=, col=id:TX:0 col=education:TX:1 col=age:R4:2 ' \
                      'col=parity:R4:3 col=induced:R4:4 col=case:R4:5 ' \
                      'col=spontaneous:R4:6 quote+ header=+'
        data = FileDataStream(path, schema=file_schema)
        pipeline = Pipeline([
            NGramFeaturizer(word_feature_extractor=Ngram(),
                            output_tokens_column_name='features_TransformedText',
                            columns={'features': ['id', 'education']}),

            # features_TransformedText is the extra token column produced
            # by output_tokens_column_name in the NGramFeaturizer step above
            WordEmbedding(
                columns={
                    'features_TransformedText2': 'features_TransformedText'})
        ])

        features = pipeline.fit_transform(data)
        assert features.shape == (248, 409)
Example #23
    def test_ensemble_supports_cv_without_user_defined_transforms(self):
        path = get_dataset("airquality").as_filepath()
        schema = DataSchema.read_schema(path)
        data = FileDataStream(path, schema)

        ind_args = {'Ozone_ind': 'Ozone', 'Solar_R_ind': 'Solar_R'}
        handler_args = {'Solar_R': 'Solar_R', 'Ozone': 'Ozone'}
        lgbm_args = {
            'feature': ['Ozone', 'Solar_R', 'Ozone_ind', 'Solar_R_ind', 'Temp'],
            'label': 'Wind',
            'normalize': 'Yes'
        }
        ols_args = {
            'feature': ['Ozone', 'Solar_R', 'Ozone_ind', 'Solar_R_ind', 'Temp'],
            'label': 'Wind',
            'normalize': 'Yes'
        }
        ogd_args = {
            'feature': ['Ozone', 'Solar_R', 'Ozone_ind', 'Solar_R_ind', 'Temp'],
            'label': 'Wind',
            'shuffle': False,
            'normalize': 'Yes'
        }

        pipeline = Pipeline([
            Indicator() << ind_args,
            Handler(replace_with='Mean') << handler_args
        ])
        transformed_data = pipeline.fit_transform(data, as_binary_data_stream=True)

        pipeline_steps = [LightGbmRegressor(**lgbm_args)]
        cv_results = CV(pipeline_steps).fit(transformed_data)
        l2_avg_lgbm = cv_results['metrics_summary'].loc['Average', 'L2(avg)']

        r1 = OrdinaryLeastSquaresRegressor(**ols_args)
        r2 = OnlineGradientDescentRegressor(**ogd_args)
        r3 = LightGbmRegressor(**lgbm_args)

        pipeline_steps = [VotingRegressor(estimators=[r1, r2, r3], combiner='Average')]
        cv_results = CV(pipeline_steps).fit(transformed_data)
        l2_avg_ensemble = cv_results['metrics_summary'].loc['Average', 'L2(avg)']

        self.assertTrue(l2_avg_ensemble < l2_avg_lgbm)
Example #24
    def test_LightLda(self):
        topics = pandas.DataFrame(data=dict(review=[
            "animals birds cats dogs fish horse",
            "horse birds house fish duck cats",
            "car truck driver bus pickup",
            "car truck driver bus pickup horse ",
            "car truck",
            "bus pickup",
            "space galaxy universe radiation",
            "radiation galaxy universe duck"]))

        pipeline = Pipeline([
            NGramFeaturizer(word_feature_extractor=n_gram(),
                            vector_normalizer='None') << 'review',
            LightLda(num_topic=3)
        ])
        y = pipeline.fit_transform(topics)
        assert_almost_equal(y.sum().sum(), 7.000000044, decimal=8,
                            err_msg="Sum should be %s" % 7.000000044)
Example #25
    def test_fromkey_multiple_columns(self):
        df = pandas.DataFrame(data=dict(
            num1=[0, 1, 2, 3, 4, 5, 6],
            cat1=Categorical.from_codes([0, 2, 3, 1, 2, -1, 1],
                                        categories=["a", "b", "c", "d"]),
            cat2=Categorical.from_codes([2, 0, 1, 2, 0, 1, 1],
                                        categories=["e", "f", "g"]),
            num=[0, 1, 2, 3, 4, 5, 6],
            text1=["i", "j", "i", "j", "i", "j", "i"],
            text2=["k", "l", "l", "k", "k", "l", "k"]))

        concat = ColumnConcatenator() << {'textvec': ['text1', 'text2']}
        tokey = ToKey() << ['textvec']
        pipeline = Pipeline([concat, tokey])
        data_idv = pipeline.fit_transform(df)
        assert sorted(list(data_idv.columns)) == [
            'cat1', 'cat2', 'num', 'num1', 'text1', 'text2',
            'textvec.text1', 'textvec.text2']
        assert list(data_idv['cat1'].cat.categories) == ['a', 'b', 'c', 'd']
        assert list(data_idv['cat1'].cat.codes) == [0, 2, 3, 1, 2, -1, 1]
        assert list(data_idv['cat2'].cat.categories) == ['e', 'f', 'g']
        assert list(data_idv['cat2'].cat.codes) == [2, 0, 1, 2, 0, 1, 1]
        assert list(data_idv['textvec.text1'].cat.categories) == [
            'i', 'k', 'j', 'l']
        assert list(data_idv['textvec.text1'].cat.codes) == [
            0, 2, 0, 2, 0, 2, 0]
        assert list(data_idv['textvec.text2'].cat.categories) == [
            'i', 'k', 'j', 'l']
        assert list(data_idv['textvec.text2'].cat.codes) == [
            1, 3, 3, 1, 1, 3, 1]
Example #26
    def test_timestamp_boundaries(self):
        # Here are the current min and max for a Pandas Timestamp
        # 1677-09-21 00:12:43.145225
        # 2262-04-11 23:47:16.854775807

        data = [pd.Timestamp(1677, 9, 22, 1), pd.Timestamp.max]
        df = pd.DataFrame({'c1': data, 'c2': [3, 4]})
        df = df.astype({'c1': np.dtype('datetime64[ms]')})

        pipeline = Pipeline(steps=[Handler(columns={'c2': 'c2'})])
        result = pipeline.fit_transform(df)

        self.assertTrue(result.loc[:, 'c1'].equals(df.loc[:, 'c1']))
        self.assertEqual(result.dtypes[0], np.dtype('datetime64[ns]'))

        self.assertEqual(result.loc[0, 'c1'].year, 1677)
        self.assertEqual(result.loc[0, 'c1'].month, 9)
        self.assertEqual(result.loc[0, 'c1'].day, 22)

        self.assertEqual(result.loc[1, 'c1'].year, 2262)
        self.assertEqual(result.loc[1, 'c1'].month, 4)
        self.assertEqual(result.loc[1, 'c1'].day, 11)
Example #27
from nimbusml import Pipeline, FileDataStream
from nimbusml.datasets import get_dataset
from nimbusml.timeseries import SsaSpikeDetector

# load the sample timeseries dataset (as a FileDataStream)
path = get_dataset('timeseries').as_filepath()
data = FileDataStream.read_csv(path)

print(data.head())
#      t1    t2      t3
# 0  0.01  0.01  0.0100
# 1  0.02  0.02  0.0200
# 2  0.03  0.03  0.0200
# 3  0.03  0.03  0.0250
# 4  0.03  0.03  0.0005

# define the training pipeline
pipeline = Pipeline([
    SsaSpikeDetector(columns={'t2_spikes': 't2'},
                     pvalue_history_length=4,
                     training_window_size=8,
                     seasonal_window_size=3)
])

result = pipeline.fit_transform(data)
print(result)

#      t1     t2       t3  t2_spikes.Alert  t2_spikes.Raw Score  t2_spikes.P-Value Score
# 0  0.01   0.01   0.0100              0.0            -0.111334             5.000000e-01
# 1  0.02   0.02   0.0200              0.0            -0.076755             4.862075e-01
# 2  0.03   0.03   0.0200              0.0            -0.034871             3.856320e-03
# 3  0.03   0.03   0.0250              0.0            -0.012559             8.617091e-02
# 4  0.03   0.03   0.0005              0.0            -0.015723             2.252377e-01
# 5  0.03   0.05   0.0100              0.0            -0.001133             1.767711e-01
# 6  0.05   0.07   0.0500              0.0             0.006265             9.170460e-02
# 7  0.07   0.09   0.0900              0.0             0.002383             2.701134e-01
# 8  0.09  99.00  99.0000              1.0            98.879520             1.000000e-08
# 9  1.10   0.10   0.1000              0.0           -57.817568             6.635692e-02
Example #28
import pandas
from nimbusml import Pipeline
from nimbusml.feature_extraction.text import NGramFeaturizer, WordEmbedding
from nimbusml.feature_extraction.text.extractor import Ngram

# create the data
customer_reviews = pandas.DataFrame(data=dict(review=[
    "I really did not like the taste of it", "It was surprisingly quite good!",
    "I will never ever ever go to that place again!!",
    "The best ever!! It was amazingly good and super fast",
    "I wish I had gone earlier, it was that great",
    "somewhat dissapointing. I'd probably wont try again",
    "Never visit again... rascals!"
]))

pipeline = Pipeline([
    NGramFeaturizer(word_feature_extractor=Ngram(),
                    output_tokens_column_name='review_TransformedText'),
    WordEmbedding() << 'review_TransformedText'
])
y = pipeline.fit_transform(customer_reviews)

# view a small subset of the review embeddings
print(y.iloc[:5, -3:])
#    review_TransformedText.147  review_TransformedText.148  review_TransformedText.149
# 0                    1.918661                   -0.714531                    3.062141
# 1                    1.891922                   -0.248650                    1.706620
# 2                    1.601611                    0.309785                    3.379576
# 3                    1.970666                    1.477450                    3.110802
# 4                    2.521791                    0.122538                    3.129919
Example #29
from nimbusml import Pipeline, FileDataStream
from nimbusml.datasets import get_dataset
from nimbusml.preprocessing.normalization import MinMaxScaler

# load the sample infert dataset (as a FileDataStream)
data = FileDataStream.read_csv(get_dataset('infert').as_filepath())

def transform_data():
    xf = MinMaxScaler(columns={'in': 'induced', 'sp': 'spontaneous'})
    pipe = Pipeline([xf])
    transformed_data = pipe.fit_transform(data, as_binary_data_stream=True)
    transformed_data_df = pipe.fit_transform(data)
    return transformed_data, transformed_data_df
Example #30
import pandas
from nimbusml import Pipeline
from nimbusml.preprocessing import DateTimeSplitter
from nimbusml.preprocessing.schema import ColumnSelector

df = pandas.DataFrame(data=dict(
    tokens1=[1, 2, 3, 157161600],
    tokens2=[10, 11, 12, 13]
))
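# tokens1 holds seconds since the Unix epoch: values 1-3 land in the first
# seconds of 1970-01-01, and 157161600 s = 1974-12-25 (see the output below)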

cols_to_drop = [
    'dtHour12', 'dtDayOfWeek', 'dtDayOfQuarter',
    'dtDayOfYear', 'dtWeekOfMonth', 'dtQuarterOfYear',
    'dtHalfOfYear', 'dtWeekIso', 'dtYearIso', 'dtMonthLabel',
    'dtAmPmLabel', 'dtDayOfWeekLabel', 'dtIsPaidTimeOff'
]

dts = DateTimeSplitter(prefix='dt', country='Canada') << 'tokens1'

pipeline = Pipeline([dts, ColumnSelector(drop_columns=cols_to_drop)])
y = pipeline.fit_transform(df)

# widen the pandas display so all output columns are shown
pandas.set_option('display.max_columns', None)
pandas.set_option('display.width', 1000)
print(y)
#      tokens1  tokens2  dtYear  dtMonth  dtDay  dtHour  dtMinute  dtSecond  dtAmPm   dtHolidayName
# 0          1       10    1970        1      1       0         0         1       0  New Year's Day
# 1          2       11    1970        1      1       0         0         2       0  New Year's Day
# 2          3       12    1970        1      1       0         0         3       0  New Year's Day
# 3  157161600       13    1974       12     25       0         0         0       0   Christmas Day