def test_timedelta_days(self):
    """Check the row-wise difference of two day-count columns.

    Both the "left" and "right" columns are run through ``DateDomain`` and
    ``DaysSinceYearTransformer(year=2010)``; the aliased expression
    ``X[0] - X[1]`` then subtracts the second feature from the first.
    """
    def days_since_2010(column):
        # Identical step chain for either column; only the column name varies.
        return DataFrameMapper([
            (column, [DateDomain(), DaysSinceYearTransformer(year=2010)]),
        ])

    X = DataFrame(
        [
            ["2018-12-31", "2019-01-01"],
            ["2019-01-31", "2019-01-01"],
        ],
        columns=["left", "right"],
    )

    union = FeatureUnion([
        ("left_mapper", days_since_2010("left")),
        ("right_mapper", days_since_2010("right")),
    ])
    delta = Alias(
        ExpressionTransformer("X[0] - X[1]"),
        "delta(left, right)",
        prefit=True,
    )
    pipeline = Pipeline([
        ("union", union),
        ("expression", delta),
    ])

    Xt = pipeline.fit_transform(X)

    # One day before, and thirty days after, the "right" date respectively.
    self.assertEqual([[-1], [30]], Xt.tolist())
def test_days_transform(self):
    """Check day offsets produced by ``DaysSinceYearTransformer(year=1960)``.

    Covers the first day of the reference year, dates after it, and one
    date before it (which is expected to yield a negative offset).
    """
    calendar_dates = [
        (1960, 1, 1),
        (1960, 1, 2),
        (1960, 2, 1),
        (1959, 12, 31),
        (2003, 4, 1),
    ]
    X = numpy.array([datetime(*ymd) for ymd in calendar_dates])

    days_since_1960 = DaysSinceYearTransformer(year=1960)
    observed = days_since_1960.transform(X).tolist()

    self.assertEqual([0, 1, 31, -1, 15796], observed)
def build_auto(regressor, name, fit_params=None, predict_params=None, **pmml_options):
    """Fit, verify and persist a PMML pipeline for the module-level auto dataset.

    The pipeline consists of a ``DataFrameMapper`` with a set of feature
    engineering steps, a ``SelectUnique`` selector and the given regressor.
    It is fitted on ``auto_X`` / ``auto_y``, configured with ``pmml_options``,
    verified against a 5% sample of ``auto_X``, and then passed to
    ``store_pkl``; its predictions on ``auto_X`` are passed to ``store_csv``
    as a single ``mpg`` column.

    Args:
        regressor: Final estimator of the pipeline.
        name: Identifier handed to ``store_pkl`` and ``store_csv``.
        fit_params: Optional dict of keyword arguments for ``pipeline.fit``.
            ``None`` (the default) means no extra arguments.
        predict_params: Optional dict of keyword arguments for
            ``pipeline.predict``; also forwarded to ``pipeline.verify``.
            ``None`` (the default) means no extra arguments.
        **pmml_options: Keyword arguments forwarded to ``pipeline.configure``.
    """
    # Create fresh dicts per call. The previous ``{}`` defaults were single
    # objects shared by every call, so any mutation (or retention) by the
    # pipeline would have leaked into later calls.
    fit_params = {} if fit_params is None else fit_params
    predict_params = {} if predict_params is None else predict_params

    cylinders_origin_mapping = {
        (8, 1): "8/1",
        (6, 1): "6/1",
        (4, 1): "4/1",
        (6, 2): "6/2",
        (4, 2): "4/2",
        (4, 3): "4/3",
    }
    mapper = DataFrameMapper([
        # Flag for an odd number of cylinders.
        (
            ["cylinders"],
            [
                CategoricalDomain(),
                Alias(
                    ExpressionTransformer("X[0] % 2.0 > 0.0", dtype=numpy.int8),
                    name="odd(cylinders)",
                    prefit=True,
                ),
            ],
        ),
        # (cylinders, origin) pairs looked up in the mapping above; pairs
        # that are not listed fall back to "other".
        (
            ["cylinders", "origin"],
            [
                MultiDomain([None, CategoricalDomain()]),
                MultiLookupTransformer(cylinders_origin_mapping, default_value="other"),
                OneHotEncoder(),
            ],
        ),
        # Two-digit model year -> "19YY-01-01" date -> days since 1977 ->
        # binarized at threshold 0.
        (
            ["model_year"],
            [
                CategoricalDomain(),
                CastTransformer(str),
                ExpressionTransformer("'19' + X[0] + '-01-01'"),
                CastTransformer("datetime64[D]"),
                DaysSinceYearTransformer(1977),
                Binarizer(threshold=0),
            ],
            {"alias": "bin(model_year, 1977)"},
        ),
        # "model_year/origin" string, one-hot encoded and then filtered by
        # a small random forest based feature selector.
        (
            ["model_year", "origin"],
            [
                ConcatTransformer("/"),
                OneHotEncoder(sparse=False),
                SelectorProxy(
                    SelectFromModel(
                        RandomForestRegressor(n_estimators=3, random_state=13),
                        threshold="1.25 * mean",
                    )
                ),
            ],
        ),
        (
            ["weight", "displacement"],
            [
                ContinuousDomain(),
                ExpressionTransformer("(X[0] / X[1]) + 0.5", dtype=numpy.float64),
            ],
            {"alias": "weight / displacement + 0.5"},
        ),
        (
            ["displacement", "horsepower", "weight", "acceleration"],
            [
                MultiDomain([None, ContinuousDomain(), None, ContinuousDomain()]),
                StandardScaler(),
            ],
        ),
    ])
    pipeline = PMMLPipeline([
        ("mapper", mapper),
        ("selector", SelectUnique()),
        ("regressor", regressor),
    ])
    pipeline.fit(auto_X, auto_y, **fit_params)
    pipeline.configure(**pmml_options)

    # XGBoost regressors are verified with explicit tolerances; every other
    # regressor relies on the defaults of ``pipeline.verify``.
    verify_options = {}
    if isinstance(regressor, XGBRegressor):
        verify_options = {"precision": 1e-5, "zeroThreshold": 1e-5}
    pipeline.verify(
        auto_X.sample(frac=0.05, random_state=13),
        predict_params=predict_params,
        **verify_options
    )

    store_pkl(pipeline, name)
    mpg = DataFrame(pipeline.predict(auto_X, **predict_params), columns=["mpg"])
    store_csv(mpg, name)
store_csv(df, "Apollo")


def build_apollo(mapper, name):
    """Fit a decision tree on ``df`` behind ``mapper`` and store the results.

    The fitted pipeline is passed to ``store_pkl``; the predicted label and
    the two class probabilities for every row of ``df`` are combined
    column-wise and passed to ``store_csv`` under the same ``name``.
    """
    steps = [
        ("mapper", mapper),
        ("classifier", DecisionTreeClassifier()),
    ]
    pipeline = PMMLPipeline(steps)
    pipeline.fit(df, df["success"])
    store_pkl(pipeline, name)

    labels = DataFrame(pipeline.predict(df), columns=["success"])
    probabilities = DataFrame(
        pipeline.predict_proba(df),
        columns=["probability(false)", "probability(true)"],
    )
    store_csv(pandas.concat([labels, probabilities], axis=1), name)


# Duration between the "launch" and "return" columns, as a difference of
# day counts since 1968.
mapper = DataFrameMapper([
    (
        ["launch", "return"],
        [
            DateTimeDomain(),
            DaysSinceYearTransformer(year=1968),
            ExpressionTransformer("X[1] - X[0]"),
        ],
    ),
])
build_apollo(mapper, "DurationInDaysApollo")

# Same duration, as a difference of second counts since 1968.
mapper = DataFrameMapper([
    (
        ["launch", "return"],
        [
            DateTimeDomain(),
            SecondsSinceYearTransformer(year=1968),
            ExpressionTransformer("X[1] - X[0]"),
        ],
    ),
])
build_apollo(mapper, "DurationInSecondsApollo")