예제 #1
0
 def test_timedelta_days(self):
     """Check the day difference between two date columns.

     Each column is mapped through ``DateDomain`` and
     ``DaysSinceYearTransformer(year=2010)``; the two outputs are joined by
     a ``FeatureUnion`` and subtracted with the expression ``X[0] - X[1]``.
     """
     def days_mapper(column):
         # Both columns go through the same two-step transformation.
         return DataFrameMapper([
             (column, [DateDomain(), DaysSinceYearTransformer(year=2010)])
         ])

     frame = DataFrame(
         [["2018-12-31", "2019-01-01"], ["2019-01-31", "2019-01-01"]],
         columns=["left", "right"])
     union = FeatureUnion([
         ("left_mapper", days_mapper("left")),
         ("right_mapper", days_mapper("right")),
     ])
     difference = Alias(
         ExpressionTransformer("X[0] - X[1]"),
         "delta(left, right)",
         prefit=True)
     pipeline = Pipeline([("union", union), ("expression", difference)])

     result = pipeline.fit_transform(frame)

     # 2018-12-31 is one day before 2019-01-01; 2019-01-31 is 30 days after.
     self.assertEqual([[-1], [30]], result.tolist())
예제 #2
0
 def test_days_transform(self):
     """Check ``DaysSinceYearTransformer(year=1960)`` on sample datetimes.

     The expected values are whole-day offsets from 1960-01-01: that date
     itself maps to 0 and the day before it maps to -1.
     """
     samples = [
         (1960, 1, 1),
         (1960, 1, 2),
         (1960, 2, 1),
         (1959, 12, 31),
         (2003, 4, 1),
     ]
     dates = numpy.array([datetime(*ymd) for ymd in samples])
     expected_days = [0, 1, 31, -1, 15796]

     days = DaysSinceYearTransformer(year=1960).transform(dates)

     self.assertEqual(expected_days, days.tolist())
예제 #3
0
def build_auto(regressor, name, fit_params = None, predict_params = None, **pmml_options):
	"""Fit, verify and store a PMML pipeline around ``regressor``.

	The pipeline consists of a ``DataFrameMapper`` over the module-level
	``auto_X`` frame, a ``SelectUnique`` step and the given regressor. After
	fitting against ``auto_y`` the pipeline is configured with
	``pmml_options``, verified on a 5% sample of ``auto_X``, handed to
	``store_pkl``, and its predictions for ``auto_X`` are handed to
	``store_csv`` as a single ``mpg`` column.

	Parameters
	----------
	regressor : estimator used as the final pipeline step.
	name : identifier passed to ``store_pkl`` and ``store_csv``.
	fit_params : dict or None
		Extra keyword arguments for ``pipeline.fit``; ``None`` means none.
	predict_params : dict or None
		Extra keyword arguments for prediction, also forwarded to
		``pipeline.verify``; ``None`` means none.
	**pmml_options : forwarded to ``pipeline.configure``.
	"""
	# Use None sentinels instead of ``{}`` defaults: a dict default is created
	# once at definition time and shared by every call.
	fit_params = {} if fit_params is None else fit_params
	predict_params = {} if predict_params is None else predict_params
	# Lookup table from (cylinders, origin) pairs to a label; pairs that are
	# not listed fall back to "other" (see default_value below).
	cylinders_origin_mapping = {
		(8, 1) : "8/1",
		(6, 1) : "6/1",
		(4, 1) : "4/1",
		(6, 2) : "6/2",
		(4, 2) : "4/2",
		(4, 3) : "4/3"
	}
	mapper = DataFrameMapper([
		# Flag for an odd cylinder count, emitted as int8.
		(["cylinders"], [CategoricalDomain(), Alias(ExpressionTransformer("X[0] % 2.0 > 0.0", dtype = numpy.int8), name = "odd(cylinders)", prefit = True)]),
		# One-hot encoding of the (cylinders, origin) label from the table above.
		(["cylinders", "origin"], [MultiDomain([None, CategoricalDomain()]), MultiLookupTransformer(cylinders_origin_mapping, default_value = "other"), OneHotEncoder()]),
		# Builds the string "19<model_year>-01-01", casts it to a date, takes the
		# day count relative to 1977 and binarizes it at threshold 0.
		(["model_year"], [CategoricalDomain(), CastTransformer(str), ExpressionTransformer("'19' + X[0] + '-01-01'"), CastTransformer("datetime64[D]"), DaysSinceYearTransformer(1977), Binarizer(threshold = 0)], {"alias" : "bin(model_year, 1977)"}),
		# "model_year/origin" concatenation, one-hot encoded, then filtered by a
		# small random forest based feature selector.
		(["model_year", "origin"], [ConcatTransformer("/"), OneHotEncoder(sparse = False), SelectorProxy(SelectFromModel(RandomForestRegressor(n_estimators = 3, random_state = 13), threshold = "1.25 * mean"))]),
		(["weight", "displacement"], [ContinuousDomain(), ExpressionTransformer("(X[0] / X[1]) + 0.5", dtype = numpy.float64)], {"alias" : "weight / displacement + 0.5"}),
		# Only horsepower and acceleration get a ContinuousDomain; all four
		# columns are standardized together.
		(["displacement", "horsepower", "weight", "acceleration"], [MultiDomain([None, ContinuousDomain(), None, ContinuousDomain()]), StandardScaler()])
	])
	pipeline = PMMLPipeline([
		("mapper", mapper),
		("selector", SelectUnique()),
		("regressor", regressor)
	])
	pipeline.fit(auto_X, auto_y, **fit_params)
	pipeline.configure(**pmml_options)
	verify_options = {"predict_params" : predict_params}
	if isinstance(regressor, XGBRegressor):
		# XGBoost regressors are verified with explicit 1e-5 tolerances instead
		# of verify()'s defaults.
		# NOTE(review): presumably due to reduced float precision - confirm.
		verify_options.update(precision = 1e-5, zeroThreshold = 1e-5)
	pipeline.verify(auto_X.sample(frac = 0.05, random_state = 13), **verify_options)
	store_pkl(pipeline, name)
	mpg = DataFrame(pipeline.predict(auto_X, **predict_params), columns = ["mpg"])
	store_csv(mpg, name)
예제 #4
0
# Hand the Apollo frame to store_csv under the name "Apollo".
# NOTE(review): `df` and `store_csv` are defined outside this excerpt.
store_csv(df, "Apollo")


def build_apollo(mapper, name):
    """Fit a decision tree pipeline on the module-level ``df`` and store it.

    The pipeline applies ``mapper`` and then a ``DecisionTreeClassifier``,
    using the ``success`` column of ``df`` as the target. The fitted pipeline
    is passed to ``store_pkl``; the predicted labels together with both
    class probabilities are passed to ``store_csv``. Both use ``name``.
    """
    steps = [
        ("mapper", mapper),
        ("classifier", DecisionTreeClassifier()),
    ]
    pipeline = PMMLPipeline(steps)
    pipeline.fit(df, df["success"])
    store_pkl(pipeline, name)

    labels = DataFrame(pipeline.predict(df), columns=["success"])
    probabilities = DataFrame(
        pipeline.predict_proba(df),
        columns=["probability(false)", "probability(true)"])
    # Place the probability columns to the right of the predicted label.
    store_csv(pandas.concat((labels, probabilities), axis=1), name)


# Mission duration in days: both timestamps are converted with
# DaysSinceYearTransformer(year=1968) and "launch" is subtracted from "return".
mapper = DataFrameMapper([
    (
        ["launch", "return"],
        [
            DateTimeDomain(),
            DaysSinceYearTransformer(year=1968),
            ExpressionTransformer("X[1] - X[0]"),
        ],
    ),
])

build_apollo(mapper, "DurationInDaysApollo")

# Same duration feature, this time built with SecondsSinceYearTransformer.
mapper = DataFrameMapper([
    (
        ["launch", "return"],
        [
            DateTimeDomain(),
            SecondsSinceYearTransformer(year=1968),
            ExpressionTransformer("X[1] - X[0]"),
        ],
    ),
])

build_apollo(mapper, "DurationInSecondsApollo")