示例#1
0
def build_audit_cat(classifier, name, with_proba=True, **fit_params):
    """Fit a pipeline on the "audit" dataset and store it as PKL/CSV artifacts.

    classifier: estimator placed at the end of the pipeline.
    name: base name for the stored ".pkl" and ".csv" files.
    with_proba: when truthy, also store per-class probability columns.
    fit_params: extra keyword arguments forwarded to pipeline.fit().
    """
    # Continuous columns are decorated with domain metadata only;
    # "Hours" is additionally discretized into five bins.
    mapper = DataFrameMapper(
        [([column], ContinuousDomain())
         for column in ["Age", "Income"]] + [(["Hours"], [
             ContinuousDomain(),
             CutTransformer(bins=[0, 20, 40, 60, 80, 100],
                            labels=False,
                            right=False,
                            include_lowest=True)
         ])] +
        [([column], [CategoricalDomain(), LabelEncoder()]) for column in [
            "Employment", "Education", "Marital", "Occupation", "Gender",
            "Deductions"
        ]])
    pipeline = Pipeline([("mapper", mapper), ("classifier", classifier)])
    pipeline.fit(audit_X, audit_y, **fit_params)
    pipeline = make_pmml_pipeline(pipeline, audit_X.columns.values,
                                  audit_y.name)
    store_pkl(pipeline, name + ".pkl")
    adjusted = DataFrame(pipeline.predict(audit_X), columns=["Adjusted"])
    # Idiomatic truthiness test instead of "with_proba == True"
    if with_proba:
        adjusted_proba = DataFrame(
            pipeline.predict_proba(audit_X),
            columns=["probability(0)", "probability(1)"])
        adjusted = pandas.concat((adjusted, adjusted_proba), axis=1)
    store_csv(adjusted, name + ".csv")
示例#2
0
 def test_fit_float_outlier(self):
     """ContinuousDomain outlier handling: "as_missing_values" and "as_extreme_values"."""
     # Outliers (outside [0.0, 3.0]) are first converted to missing values,
     # then replaced with the missing-value replacement (1.0).
     domain = clone(
         ContinuousDomain(outlier_treatment="as_missing_values",
                          low_value=0.0,
                          high_value=3.0,
                          missing_values=float("NaN"),
                          missing_value_replacement=1.0))
     self.assertEqual(0.0, domain.low_value)
     self.assertEqual(3.0, domain.high_value)
     X = DataFrame([[-2.0, float("NaN")], [2.0, 4.0], [float("NaN"), 0.0]])
     self.assertEqual([[False, True], [False, False], [True, False]],
                      domain._missing_value_mask(X).values.tolist())
     self.assertEqual([[True, False], [False, True], [False, False]],
                      domain._outlier_mask(X).values.tolist())
     Xt = domain.fit_transform(X)
     self.assertEqual([1.0, 2.0, 1.0], Xt[0].tolist())
     self.assertEqual([1.0, 1.0, 0.0], Xt[1].tolist())
     # Outliers are mapped onto the [0.0, 3.0] boundary values;
     # -1.0 is declared a missing value and must not count as an outlier.
     domain = clone(
         ContinuousDomain(outlier_treatment="as_extreme_values",
                          low_value=0.0,
                          high_value=3.0,
                          missing_values=-1.0))
     X = DataFrame([[-2.0, -1.0], [2.0, 4.0], [-1.0, 0.0]])
     self.assertEqual([[False, True], [False, False], [True, False]],
                      domain._missing_value_mask(X).values.tolist())
     self.assertEqual([[True, False], [False, True], [False, False]],
                      domain._outlier_mask(X).values.tolist())
     self.assertEqual([[True, False], [False, False], [False, False]],
                      domain._negative_outlier_mask(X).values.tolist())
     self.assertEqual([[False, False], [False, True], [False, False]],
                      domain._positive_outlier_mask(X).values.tolist())
     Xt = domain.fit_transform(X)
     # BUG FIX: assert on the transformed frame Xt, not the raw input X
     # (the first half of this test correctly asserts on Xt)
     self.assertEqual([0.0, 2.0, -1.0], Xt[0].tolist())
     self.assertEqual([-1.0, 3.0, 0.0], Xt[1].tolist())
示例#3
0
def build_audit(classifier, name, with_proba = True, **kwargs):
	"""Fit a FeatureUnion-based pipeline on the "audit" dataset and store PKL/CSV artifacts.

	classifier: estimator placed at the end of the pipeline.
	name: base name for the stored ".pkl" and ".csv" files.
	with_proba: when truthy, also store per-class probability columns.
	kwargs: options forwarded to customize() after fitting.
	"""
	# Continuous columns are decorated with domain metadata only
	continuous_mapper = DataFrameMapper([
		("Age", ContinuousDomain()),
		("Income", ContinuousDomain()),
		("Hours", ContinuousDomain())
	])
	# Categorical columns: domain decoration, binarization/encoding, then
	# (for some columns) model- or score-based feature selection
	categorical_mapper = DataFrameMapper([
		("Employment", [CategoricalDomain(), LabelBinarizer(), SelectorProxy(SelectFromModel(EstimatorProxy(DecisionTreeClassifier(random_state = 13))))]),
		("Education", [CategoricalDomain(), LabelBinarizer(), SelectorProxy(SelectFromModel(EstimatorProxy(RandomForestClassifier(random_state = 13, n_estimators = 3)), threshold = "1.25 * mean"))]),
		("Marital", [CategoricalDomain(), LabelBinarizer(neg_label = -1, pos_label = 1), SelectKBest(k = 3)]),
		("Occupation", [CategoricalDomain(), LabelBinarizer(), SelectorProxy(SelectKBest(k = 3))]),
		("Gender", [CategoricalDomain(), LabelBinarizer(neg_label = -3, pos_label = 3)]),
		("Deductions", [CategoricalDomain(), LabelEncoder()]),
	])
	pipeline = PMMLPipeline([
		("union", FeatureUnion([
			("continuous", continuous_mapper),
			("categorical", Pipeline([
				("mapper", categorical_mapper),
				("polynomial", PolynomialFeatures())
			]))
		])),
		("classifier", classifier)
	])
	pipeline.fit(audit_X, audit_y)
	customize(classifier, **kwargs)
	store_pkl(pipeline, name + ".pkl")
	adjusted = DataFrame(pipeline.predict(audit_X), columns = ["Adjusted"])
	# Idiomatic truthiness test instead of "if(with_proba == True):"
	if with_proba:
		adjusted_proba = DataFrame(pipeline.predict_proba(audit_X), columns = ["probability(0)", "probability(1)"])
		adjusted = pandas.concat((adjusted, adjusted_proba), axis = 1)
	store_csv(adjusted, name + ".csv")
示例#4
0
def build_audit(classifier, name, with_proba=True):
    """Fit a pipeline on the "audit" dataset and store PKL/CSV artifacts.

    classifier: estimator placed at the end of the pipeline.
    name: base name for the stored ".pkl" and ".csv" files.
    with_proba: when truthy, also store per-class probability columns.
    """
    # Categorical columns are binarized/encoded; some are additionally
    # reduced by model- or score-based feature selection
    mapper = DataFrameMapper([
        ("Age", ContinuousDomain()),
        ("Employment", [
            LabelBinarizer(),
            SelectFromModel(EstimatorProxy(
                DecisionTreeClassifier(random_state=13)),
                            threshold="1.25 * mean")
        ]),
        ("Education", [
            LabelBinarizer(),
            SelectorProxy(
                SelectFromModel(EstimatorProxy(
                    RandomForestClassifier(random_state=13, n_estimators=3)),
                                threshold="median"))
        ]), ("Marital", [LabelBinarizer(), SelectKBest(k=3)]),
        ("Occupation", [LabelBinarizer(),
                        SelectorProxy(SelectKBest(k=3))]),
        ("Income", ContinuousDomain()), ("Gender", LabelEncoder()),
        ("Deductions", LabelEncoder()), ("Hours", ContinuousDomain())
    ])
    pipeline = PMMLPipeline([("mapper", mapper), ("classifier", classifier)])
    pipeline.fit(audit_X, audit_y)
    store_pkl(pipeline, name + ".pkl")
    adjusted = DataFrame(pipeline.predict(audit_X), columns=["Adjusted"])
    # Idiomatic truthiness test instead of "(with_proba == True)"
    if with_proba:
        adjusted_proba = DataFrame(pipeline.predict_proba(audit_X),
                                   columns=["probability_0", "probability_1"])
        adjusted = pandas.concat((adjusted, adjusted_proba), axis=1)
    store_csv(adjusted, name + ".csv")
示例#5
0
def build_auto(regressor, name, fit_params = None, predict_params = None, **pmml_options):
	"""Fit a feature-engineered PMML pipeline on the "auto" dataset and store PKL/CSV artifacts.

	regressor: estimator placed at the end of the pipeline.
	name: base name for the stored artifacts.
	fit_params: optional dict of keyword arguments for pipeline.fit().
	predict_params: optional dict of keyword arguments for pipeline.predict()/verify().
	pmml_options: PMML conversion options forwarded to pipeline.configure().
	"""
	# Use None defaults instead of shared mutable dict defaults
	fit_params = fit_params or {}
	predict_params = predict_params or {}
	# Joint (cylinders, origin) categories; unseen pairs map to "other"
	cylinders_origin_mapping = {
		(8, 1) : "8/1",
		(6, 1) : "6/1",
		(4, 1) : "4/1",
		(6, 2) : "6/2",
		(4, 2) : "4/2",
		(4, 3) : "4/3"
	}
	mapper = DataFrameMapper([
		(["cylinders"], [CategoricalDomain(), Alias(ExpressionTransformer("X[0] % 2.0 > 0.0", dtype = numpy.int8), name = "odd(cylinders)", prefit = True)]),
		(["cylinders", "origin"], [MultiDomain([None, CategoricalDomain()]), MultiLookupTransformer(cylinders_origin_mapping, default_value = "other"), OneHotEncoder()]),
		(["model_year"], [CategoricalDomain(), CastTransformer(str), ExpressionTransformer("'19' + X[0] + '-01-01'"), CastTransformer("datetime64[D]"), DaysSinceYearTransformer(1977), Binarizer(threshold = 0)], {"alias" : "bin(model_year, 1977)"}),
		(["model_year", "origin"], [ConcatTransformer("/"), OneHotEncoder(sparse = False), SelectorProxy(SelectFromModel(RandomForestRegressor(n_estimators = 3, random_state = 13), threshold = "1.25 * mean"))]),
		(["weight", "displacement"], [ContinuousDomain(), ExpressionTransformer("(X[0] / X[1]) + 0.5", dtype = numpy.float64)], {"alias" : "weight / displacement + 0.5"}),
		(["displacement", "horsepower", "weight", "acceleration"], [MultiDomain([None, ContinuousDomain(), None, ContinuousDomain()]), StandardScaler()])
	])
	pipeline = PMMLPipeline([
		("mapper", mapper),
		("selector", SelectUnique()),
		("regressor", regressor)
	])
	pipeline.fit(auto_X, auto_y, **fit_params)
	pipeline.configure(**pmml_options)
	# XGBoost predictions need a looser tolerance during verification
	if isinstance(regressor, XGBRegressor):
		pipeline.verify(auto_X.sample(frac = 0.05, random_state = 13), predict_params = predict_params, precision = 1e-5, zeroThreshold = 1e-5)
	else:
		pipeline.verify(auto_X.sample(frac = 0.05, random_state = 13), predict_params = predict_params)
	store_pkl(pipeline, name)
	mpg = DataFrame(pipeline.predict(auto_X, **predict_params), columns = ["mpg"])
	store_csv(mpg, name)
示例#6
0
 def test_fit_float_missing(self):
     """ContinuousDomain missing-value replacement with a non-NaN sentinel (-1.0)."""
     domain = ContinuousDomain(missing_values=-1.0,
                               missing_value_replacement=4.0)
     # clone() must preserve the configured parameters
     domain = clone(domain)
     self.assertEqual(-1.0, domain.missing_values)
     self.assertEqual(4.0, domain.missing_value_replacement)
     self.assertFalse(domain._empty_fit())
     X = DataFrame([1.0, -1.0, 3.0, 2.0, -1.0, 2.0])
     Xt = domain.fit_transform(X)
     self.assertIsInstance(Xt, DataFrame)
     # Value range and statistics are computed over the four valid
     # (non-missing) values only
     self.assertEqual(1.0, domain.data_min_)
     self.assertEqual(3.0, domain.data_max_)
     self.assertEqual({
         "totalFreq": 6,
         "missingFreq": 2,
         "invalidFreq": 0
     }, domain.counts_)
     self.assertEqual(
         {
             "minimum": [1.0],
             "maximum": [3.0],
             "mean": [2.0],
             "standardDeviation": [0.7071067811865476],
             "median": [2.0],
             "interQuartileRange": [0.5]
         }, _array_to_list(domain.numeric_info_))
     # Both -1.0 sentinels are replaced with 4.0
     self.assertEqual([1.0, 4.0, 3.0, 2.0, 4.0, 2.0], Xt[0].tolist())
     # transform() reuses the fitted replacement on new data
     X = numpy.array([-1.0, -1.0])
     Xt = domain.transform(X)
     self.assertEqual([4.0, 4.0], Xt.tolist())
示例#7
0
def build_audit_cat(classifier, name, with_proba = True, fit_params = None):
	"""Fit a pipeline with ordinal/label-encoded categoricals on the "audit" dataset.

	classifier: estimator placed at the end of the pipeline.
	name: base name for the stored artifacts.
	with_proba: when truthy, also store per-class probability columns.
	fit_params: optional dict of keyword arguments for pipeline.fit().
	"""
	# Use a None default instead of a shared mutable dict default
	fit_params = fit_params or {}
	marital_mapping = {
		"Married-spouse-absent" : "Married"
	}
	mapper = DataFrameMapper(
		[([column], ContinuousDomain(display_name = column)) for column in ["Age", "Income"]] +
		[(["Hours"], [ContinuousDomain(display_name = "Hours"), CutTransformer(bins = [0, 20, 40, 60, 80, 100], labels = False, right = False, include_lowest = True)])] +
		[(["Employment", "Education"], [MultiDomain([CategoricalDomain(display_name = "Employment"), CategoricalDomain(display_name = "Education")]), OrdinalEncoder(dtype = numpy.int_)])] +
		[(["Marital"], [CategoricalDomain(display_name = "Marital"), FilterLookupTransformer(marital_mapping), OrdinalEncoder(dtype = numpy.uint16)])] +
		[(["Occupation"], [CategoricalDomain(display_name = "Occupation"), OrdinalEncoder(dtype = numpy.float_)])] +
		[([column], [CategoricalDomain(display_name = column), LabelEncoder()]) for column in ["Gender", "Deductions"]]
	)
	pipeline = Pipeline([
		("mapper", mapper),
		("classifier", classifier)
	])
	pipeline.fit(audit_X, audit_y, **fit_params)
	pipeline = make_pmml_pipeline(pipeline, audit_X.columns.values, audit_y.name)
	pipeline.verify(audit_X.sample(frac = 0.05, random_state = 13))
	store_pkl(pipeline, name)
	adjusted = DataFrame(pipeline.predict(audit_X), columns = ["Adjusted"])
	# Idiomatic truthiness test instead of "== True"
	if with_proba:
		adjusted_proba = DataFrame(pipeline.predict_proba(audit_X), columns = ["probability(0)", "probability(1)"])
		adjusted = pandas.concat((adjusted, adjusted_proba), axis = 1)
	store_csv(adjusted, name)
示例#8
0
def build_auto_na(regressor, name, predict_transformer = None, apply_transformer = None, **pmml_options):
	"""Fit a PMML pipeline on the "auto" dataset with missing values and store PKL/CSV artifacts.

	regressor: estimator placed at the end of the pipeline.
	name: base name for the stored artifacts.
	predict_transformer: optional transformer passed through to PMMLPipeline.
	apply_transformer: optional transformer passed through to PMMLPipeline.
	pmml_options: PMML conversion options forwarded to pipeline.configure().
	"""
	# Integer-coded categoricals use -1 as the missing value sentinel;
	# continuous columns use None. "horsepower" and "weight" additionally
	# apply outlier_treatment = "as_extreme_values" within the given bounds.
	mapper = DataFrameMapper(
		[([column], [CategoricalDomain(missing_values = -1), CategoricalImputer(missing_values = -1), PMMLLabelBinarizer()]) for column in ["cylinders", "model_year"]] +
		[(["origin"], [CategoricalDomain(missing_values = -1), SimpleImputer(missing_values = -1, strategy = "most_frequent"), OneHotEncoder()])] +
		[(["acceleration"], [ContinuousDomain(missing_values = None), CutTransformer(bins = [5, 7.5, 10, 12.5, 15, 17.5, 20, 22.5, 25], labels = False), CategoricalImputer(), LabelBinarizer()])] +
		[(["displacement"], [ContinuousDomain(missing_values = None), SimpleImputer(), CutTransformer(bins = [0, 100, 200, 300, 400, 500], labels = ["XS", "S", "M", "L", "XL"]), LabelBinarizer()])] +
		[(["horsepower"], [ContinuousDomain(missing_values = None, outlier_treatment = "as_extreme_values", low_value = 50, high_value = 225), SimpleImputer(strategy = "median")])] +
		[(["weight"], [ContinuousDomain(missing_values = None, outlier_treatment = "as_extreme_values", low_value = 2000, high_value = 5000), SimpleImputer(strategy = "median")])]
	)
	pipeline = PMMLPipeline([
		("mapper", mapper),
		("regressor", regressor)
	], predict_transformer = predict_transformer, apply_transformer = apply_transformer)
	pipeline.fit(auto_na_X, auto_na_y)
	if isinstance(regressor, DecisionTreeRegressor):
		# Export the impurity of every non-pure tree node as a PMML node extension
		tree = regressor.tree_
		node_impurity = {node_idx : tree.impurity[node_idx] for node_idx in range(0, tree.node_count) if tree.impurity[node_idx] != 0.0}
		pmml_options["node_extensions"] = {regressor.criterion : node_impurity}
	pipeline.configure(**pmml_options)
	pipeline.verify(auto_na_X.sample(frac = 0.05, random_state = 13))
	store_pkl(pipeline, name)
	mpg = DataFrame(pipeline.predict(auto_na_X), columns = ["mpg"])
	if isinstance(regressor, DecisionTreeRegressor):
		# Also store the id of the tree node that produced each prediction
		Xt = pipeline_transform(pipeline, auto_na_X)
		mpg_apply = DataFrame(regressor.apply(Xt), columns = ["nodeId"])
		mpg = pandas.concat((mpg, mpg_apply), axis = 1)
	store_csv(mpg, name)
示例#9
0
	def test_fit_float(self):
		"""ContinuousDomain fitting on float data with NaN missing values."""
		# A domain with data collection and statistics disabled has nothing to fit
		domain = clone(ContinuousDomain(with_data = False, with_statistics = False))
		self.assertTrue(domain._empty_fit())
		domain = clone(ContinuousDomain(missing_values = float("NaN"), missing_value_treatment = "as_value", missing_value_replacement = -1.0, invalid_value_treatment = "as_is", invalid_value_replacement = 0.0))
		self.assertTrue(numpy.isnan(domain.missing_values))
		self.assertEqual("as_value", domain.missing_value_treatment)
		self.assertEqual(-1.0, domain.missing_value_replacement)
		self.assertEqual("as_is", domain.invalid_value_treatment)
		self.assertEqual(0.0, domain.invalid_value_replacement)
		# Fitted attributes must not exist before fit()
		self.assertFalse(hasattr(domain, "data_min_"))
		self.assertFalse(hasattr(domain, "data_max_"))
		self.assertFalse(hasattr(domain, "counts_"))
		self.assertFalse(hasattr(domain, "numeric_info_"))
		self.assertFalse(domain._empty_fit())
		X = DataFrame(numpy.array([1.0, float("NaN"), 3.0, 2.0, float("NaN"), 2.0]))
		Xt = domain.fit_transform(X)
		self.assertIsInstance(Xt, DataFrame)
		# Value range and statistics are computed over the four valid values only
		self.assertEqual(1.0, domain.data_min_)
		self.assertEqual(3.0, domain.data_max_)
		self.assertEqual({"totalFreq" : 6, "missingFreq" : 2, "invalidFreq" : 0}, domain.counts_)
		self.assertEqual({"minimum" : [1.0], "maximum" : [3.0], "mean" : [2.0], "standardDeviation" : [0.7071067811865476], "median" : [2.0], "interQuartileRange" : [0.5]}, _array_to_list(domain.numeric_info_))
		# NaN values are replaced with -1.0
		self.assertEqual([1.0, -1.0, 3.0, 2.0, -1.0, 2.0], Xt[0].tolist())
		# transform() replaces both NaN and None with the configured replacement
		X = numpy.array([float("NaN"), None])
		Xt = domain.transform(X)
		self.assertEqual([-1.0, -1.0], Xt.tolist())
示例#10
0
	def test_fit_float(self):
		"""ContinuousDomain defaults and fitting of the valid value range."""
		domain = ContinuousDomain()
		# Default treatment for invalid values
		self.assertEqual("return_invalid", domain.invalid_value_treatment)
		# Fitted attributes must not exist before fit()
		self.assertFalse(hasattr(domain, "data_min_"))
		self.assertFalse(hasattr(domain, "data_max_"))
		# The value range is computed from the non-NaN values
		domain = domain.fit(numpy.array([1.0, float('NaN'), 3.0, 2.0, float('NaN'), 2.0]))
		self.assertEqual(1.0, domain.data_min_)
		self.assertEqual(3.0, domain.data_max_)
示例#11
0
def build_auto_na(regressor, name):
    """Fit a PMML pipeline on the "auto" dataset with missing values and store PKL/CSV artifacts.

    regressor: estimator placed at the end of the pipeline.
    name: base name for the stored ".pkl" and ".csv" files.
    """
    # Integer-coded categoricals use -1 as the missing value sentinel;
    # continuous columns use None.
    # NOTE(review): Imputer is the legacy scikit-learn imputer (superseded by
    # SimpleImputer and removed in modern releases) -- confirm the pinned
    # scikit-learn version still provides it.
    mapper = DataFrameMapper(
        [([column], [
            CategoricalDomain(missing_values=-1),
            CategoricalImputer(missing_values=-1),
            PMMLLabelBinarizer()
        ]) for column in ["cylinders", "model_year"]] +
        [(["origin"], [CategoricalImputer(missing_values=-1),
                       OneHotEncoder()])] +
        [(["acceleration"], [
            ContinuousDomain(missing_values=None),
            CutTransformer(bins=[5, 7.5, 10, 12.5, 15, 17.5, 20, 22.5, 25],
                           labels=False),
            CategoricalImputer(),
            LabelBinarizer()
        ])] + [(["displacement"], [
            ContinuousDomain(missing_values=None),
            Imputer(),
            CutTransformer(bins=[0, 100, 200, 300, 400, 500],
                           labels=["XS", "S", "M", "L", "XL"]),
            LabelBinarizer()
        ])] + [(["horsepower"], [
            ContinuousDomain(missing_values=None,
                             outlier_treatment="as_extreme_values",
                             low_value=50,
                             high_value=225),
            Imputer()
        ])] + [(["weight"], [
            ContinuousDomain(missing_values=None,
                             outlier_treatment="as_extreme_values",
                             low_value=2000,
                             high_value=5000),
            Imputer()
        ])])
    pipeline = PMMLPipeline([("mapper", mapper), ("regressor", regressor)])
    pipeline.fit(auto_na_X, auto_na_y)
    if isinstance(regressor, DecisionTreeRegressor):
        # Export the impurity of every non-pure tree node as a PMML node extension
        tree = regressor.tree_
        node_impurity = {
            node_idx: tree.impurity[node_idx]
            for node_idx in range(0, tree.node_count)
            if tree.impurity[node_idx] != 0.0
        }
        pipeline.configure(
            node_extensions={regressor.criterion: node_impurity})
    store_pkl(pipeline, name + ".pkl")
    mpg = DataFrame(pipeline.predict(auto_na_X), columns=["mpg"])
    store_csv(mpg, name + ".csv")
示例#12
0
def make_pipeline_model(numeric_feature,
                        category_feature,
                        estimator,
                        X=None,
                        y=None):
    '''
    Build a PMML pipeline from the given numeric and categorical features
    and the given estimator. If a dataset is supplied, the pipeline is
    fitted before being returned.

    numeric_feature: list of numeric feature names
    category_feature: list of categorical feature names
    estimator: classifier placed at the end of the pipeline
    X: feature data as a pandas.DataFrame
    y: target data as a pandas.Series

    return:
    pipeline_model
    '''
    # Categorical columns: domain decoration -> imputation -> one-hot binarization
    feature_def = gen_features(
        columns=category_feature,
        classes=[CategoricalDomain, CategoricalImputer, LabelBinarizer])
    # Numeric columns: domain decoration -> mean imputation -> standardization
    mapper_numerical = DataFrameMapper([(numeric_feature, [
        ContinuousDomain(),
        SimpleImputer(strategy='mean'),
        StandardScaler()
    ])])
    mapper_category = DataFrameMapper(feature_def)
    mapper = FeatureUnion([('mapper_numerical', mapper_numerical),
                           ('mapper_category', mapper_category)])
    pipeline_model = PMMLPipeline([('mapper', mapper),
                                   ('classifier', estimator)])
    # Fit only when both X and y are supplied
    if X is not None and y is not None:
        pipeline_model.fit(X, y)
    return pipeline_model
示例#13
0
def build_auto_h2o(regressor, name):
    """Fit an H2O-backed PMML pipeline on the "auto" dataset and store MOJO/PKL/CSV artifacts."""
    categorical_columns = ["cylinders", "model_year", "origin"]
    continuous_columns = ["displacement", "horsepower", "weight",
                          "acceleration"]
    # One ColumnTransformer entry per column: categoricals first, then continuous
    mappings = [(column, CategoricalDomain(), [column])
                for column in categorical_columns]
    mappings += [(column, ContinuousDomain(), [column])
                 for column in continuous_columns]
    transformer = ColumnTransformer(mappings)
    # Upload the transformed data as an H2OFrame with explicit column types
    uploader = H2OFrameCreator(
        column_names=categorical_columns + continuous_columns,
        column_types=["enum"] * len(categorical_columns) +
        ["numeric"] * len(continuous_columns))
    pipeline = PMMLPipeline([("transformer", transformer),
                             ("uploader", uploader),
                             ("regressor", regressor)])
    pipeline.fit(auto_X, H2OFrame(auto_y.to_frame()))
    pipeline.verify(auto_X.sample(frac=0.05, random_state=13))
    regressor = pipeline._final_estimator
    store_mojo(regressor, name + ".zip")
    store_pkl(pipeline, name + ".pkl")
    mpg = pipeline.predict(auto_X)
    mpg.set_names(["mpg"])
    store_csv(mpg.as_data_frame(), name + ".csv")
示例#14
0
def build_housing(regressor, name, with_kneighbors=False, **kwargs):
    """Fit a pipeline on the "housing" dataset and store PKL/CSV artifacts.

    regressor: estimator placed at the end of the pipeline.
    name: base name for the stored ".pkl" and ".csv" files.
    with_kneighbors: when truthy, also store the ids of the nearest
        neighbors reported by regressor.kneighbors().
    kwargs: options forwarded to customize() after fitting.
    """
    mapper = DataFrameMapper([(housing_X.columns.values, ContinuousDomain())])
    pipeline = Pipeline([("mapper", mapper),
                         ("transformer-pipeline",
                          Pipeline([
                              ("polynomial",
                               PolynomialFeatures(degree=2,
                                                  interaction_only=True,
                                                  include_bias=False)),
                              ("scaler", StandardScaler()),
                              ("selector",
                               SelectPercentile(score_func=f_regression,
                                                percentile=35)),
                          ])), ("regressor", regressor)])
    pipeline.fit(housing_X, housing_y)
    pipeline = make_pmml_pipeline(pipeline, housing_X.columns.values,
                                  housing_y.name)
    customize(regressor, **kwargs)
    store_pkl(pipeline, name + ".pkl")
    medv = DataFrame(pipeline.predict(housing_X), columns=["MEDV"])
    # Idiomatic truthiness test instead of "(with_kneighbors == True)"
    if with_kneighbors:
        Xt = pipeline_transform(pipeline, housing_X)
        kneighbors = regressor.kneighbors(Xt)
        # kneighbors()[1] holds 0-based row indices; shift to 1-based ids
        medv_ids = DataFrame(kneighbors[1] + 1,
                             columns=[
                                 "neighbor(" + str(x + 1) + ")"
                                 for x in range(regressor.n_neighbors)
                             ])
        medv = pandas.concat((medv, medv_ids), axis=1)
    store_csv(medv, name + ".csv")
示例#15
0
def build_iris(classifier, name, with_proba=True, **kwargs):
    """Fit a pipeline on the "iris" dataset and store PKL/CSV artifacts.

    classifier: estimator placed at the end of the pipeline.
    name: base name for the stored ".pkl" and ".csv" files.
    with_proba: when truthy, also store per-class probability columns.
    kwargs: options forwarded to customize() after fitting.
    """
    # Features are passed through both unchanged and log10-scaled
    pipeline = Pipeline([
        ("pipeline",
         Pipeline([("domain", ContinuousDomain()),
                   ("transform",
                    FeatureUnion([("normal_scale", FunctionTransformer(None)),
                                  ("log_scale",
                                   FunctionTransformer(numpy.log10))]))])),
        ("scaler", RobustScaler()),
        ("pca", IncrementalPCA(n_components=3, whiten=True)),
        ("classifier", classifier)
    ])
    pipeline.fit(iris_X, iris_y)
    pipeline = make_pmml_pipeline(pipeline, iris_X.columns.values, iris_y.name)
    customize(classifier, **kwargs)
    store_pkl(pipeline, name + ".pkl")
    species = DataFrame(pipeline.predict(iris_X), columns=["Species"])
    # Idiomatic truthiness test instead of "(with_proba == True)"
    if with_proba:
        species_proba = DataFrame(pipeline.predict_proba(iris_X),
                                  columns=[
                                      "probability(setosa)",
                                      "probability(versicolor)",
                                      "probability(virginica)"
                                  ])
        species = pandas.concat((species, species_proba), axis=1)
    store_csv(species, name + ".csv")
示例#16
0
def build_iris(classifier, name, **pmml_options):
    """Fit a PMML pipeline on the "iris" dataset and store PMML/CSV artifacts."""
    continuous_columns = [
        "Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width"
    ]
    mapper = DataFrameMapper([([column], ContinuousDomain())
                              for column in continuous_columns])
    pipeline = PMMLPipeline([("mapper", mapper), ("classifier", classifier)])
    pipeline.fit(iris_X, iris_y)
    # XGBoost predictions need a looser tolerance during verification
    verification_sample = iris_X.sample(n=3, random_state=13)
    if isinstance(classifier, XGBClassifier):
        pipeline.verify(verification_sample,
                        precision=1e-5,
                        zeroThreshold=1e-5)
    else:
        pipeline.verify(verification_sample)
    pipeline.configure(**pmml_options)
    store_pmml(pipeline, name)
    species = DataFrame(pipeline.predict(iris_X), columns=["Species"])
    species_proba = DataFrame(pipeline.predict_proba(iris_X),
                              columns=[
                                  "probability(setosa)",
                                  "probability(versicolor)",
                                  "probability(virginica)"
                              ])
    store_csv(pandas.concat((species, species_proba), axis=1), name)
示例#17
0
def build_auto(regressor, name, **kwargs):
    """Fit a pipeline on the "auto" dataset and store PKL/CSV artifacts."""
    mapper = DataFrameMapper([
        (["cylinders"], CategoricalDomain()),
        (["displacement", "horsepower", "weight", "acceleration"],
         [ContinuousDomain(), StandardScaler()]),
        # Pre/post 1973 oil crisis effects
        (["model_year"], [CategoricalDomain(),
                          Binarizer(threshold=77)],
         {"alias": "bin(model_year, 77)"}),
        (["origin"], OneHotEncoder()),
        (["weight", "displacement"],
         ExpressionTransformer("(X[:, 0] / X[:, 1]) + 0.5"),
         {"alias": "weight / displacement + 0.5"})
    ])
    pipeline = Pipeline([("mapper", mapper), ("regressor", regressor)])
    pipeline.fit(auto_X, auto_y)
    pipeline = make_pmml_pipeline(pipeline, auto_X.columns.values, auto_y.name)
    # XGBoost predictions need a looser tolerance during verification
    verification_sample = auto_X.sample(frac=0.05, random_state=13)
    if isinstance(regressor, XGBRegressor):
        pipeline.verify(verification_sample,
                        precision=1e-5,
                        zeroThreshold=1e-5)
    else:
        pipeline.verify(verification_sample)
    customize(regressor, **kwargs)
    store_pkl(pipeline, name + ".pkl")
    mpg = DataFrame(pipeline.predict(auto_X), columns=["mpg"])
    store_csv(mpg, name + ".csv")
示例#18
0
def build_auto(regressor, name, **pmml_options):
	"""Fit a PMML pipeline on the "auto" dataset and store PKL/CSV artifacts."""
	# Joint (cylinders, origin) categories; unseen pairs map to "other"
	cylinders_origin_mapping = {
		(8, 1) : "8/1",
		(6, 1) : "6/1",
		(4, 1) : "4/1",
		(6, 2) : "6/2",
		(4, 2) : "4/2",
		(6, 3) : "6/3",
		(4, 3) : "4/3"
	}
	mappings = [
		(["cylinders", "origin"], [MultiDomain([CategoricalDomain(), CategoricalDomain()]), MultiLookupTransformer(cylinders_origin_mapping, default_value = "other"), LabelBinarizer()]),
		(["model_year"], [CategoricalDomain(), Binarizer(threshold = 77)], {"alias" : "bin(model_year, 77)"}), # Pre/post 1973 oil crisis effects
		(["displacement", "horsepower", "weight", "acceleration"], [ContinuousDomain(), StandardScaler()]),
		(["weight", "displacement"], ExpressionTransformer("(X[0] / X[1]) + 0.5"), {"alias" : "weight / displacement + 0.5"})
	]
	pipeline = PMMLPipeline([
		("mapper", DataFrameMapper(mappings)),
		("selector", SelectUnique()),
		("regressor", regressor)
	])
	pipeline.fit(auto_X, auto_y)
	pipeline.configure(**pmml_options)
	# XGBoost predictions need a looser tolerance during verification
	verification_sample = auto_X.sample(frac = 0.05, random_state = 13)
	if isinstance(regressor, XGBRegressor):
		pipeline.verify(verification_sample, precision = 1e-5, zeroThreshold = 1e-5)
	else:
		pipeline.verify(verification_sample)
	store_pkl(pipeline, name)
	mpg = DataFrame(pipeline.predict(auto_X), columns = ["mpg"])
	store_csv(mpg, name)
示例#19
0
def build_wheat(kmeans, name, with_affinity = True, **pmml_options):
	"""Fit a clustering pipeline on the "wheat" dataset and store PKL/CSV artifacts.

	kmeans: clusterer placed at the end of the pipeline.
	name: base name for the stored artifacts.
	with_affinity: when truthy, also store per-cluster distance columns.
	pmml_options: PMML conversion options forwarded to pipeline.configure().
	"""
	mapper = DataFrameMapper([
		(wheat_X.columns.values, [ContinuousDomain(dtype = float), IdentityTransformer()])
	])
	# Columns 0 and 5 are scaled robustly; the remaining columns min-max scaled
	scaler = ColumnTransformer([
		("robust", RobustScaler(), [0, 5])
	], remainder = MinMaxScaler())
	pipeline = Pipeline([
		("mapper", mapper),
		("scaler", scaler),
		("clusterer", kmeans)
	])
	pipeline.fit(wheat_X)
	pipeline = make_pmml_pipeline(pipeline, wheat_X.columns.values)
	pipeline.configure(**pmml_options)
	store_pkl(pipeline, name)
	cluster = DataFrame(pipeline.predict(wheat_X), columns = ["Cluster"])
	# Idiomatic truthiness test instead of "with_affinity == True"
	if with_affinity:
		Xt = pipeline_transform(pipeline, wheat_X)
		affinity_0 = kmeans_distance(kmeans, 0, Xt)
		affinity_1 = kmeans_distance(kmeans, 1, Xt)
		affinity_2 = kmeans_distance(kmeans, 2, Xt)
		cluster_affinity = DataFrame(numpy.transpose([affinity_0, affinity_1, affinity_2]), columns = ["affinity(0)", "affinity(1)", "affinity(2)"])
		cluster = pandas.concat((cluster, cluster_affinity), axis = 1)
	store_csv(cluster, name)
示例#20
0
def build_iris(classifier, name, with_proba = True, fit_params = None, predict_params = None, predict_proba_params = None, **pmml_options):
	"""Fit a pipeline on the "iris" dataset and store PKL/CSV artifacts.

	classifier: estimator placed at the end of the pipeline.
	name: base name for the stored artifacts.
	with_proba: when truthy, also store per-class probability columns.
	fit_params / predict_params / predict_proba_params: optional dicts of
		keyword arguments for the corresponding pipeline methods.
	pmml_options: PMML conversion options forwarded to pipeline.configure().
	"""
	# Use None defaults instead of shared mutable dict defaults
	fit_params = fit_params or {}
	predict_params = predict_params or {}
	predict_proba_params = predict_proba_params or {}
	pipeline = Pipeline([
		("pipeline", Pipeline([
			("mapper", DataFrameMapper([
				(iris_X.columns.values, ContinuousDomain()),
				(["Sepal.Length", "Petal.Length"], Aggregator(function = "mean")),
				(["Sepal.Width", "Petal.Width"], Aggregator(function = "mean"))
			])),
			("transform", FeatureUnion([
				("normal_scale", FunctionTransformer(None, validate = True)),
				("log_scale", FunctionTransformer(numpy.log10, validate = True)),
				("power_scale", PowerFunctionTransformer(power = 2))
			]))
		])),
		("pca", IncrementalPCA(n_components = 3, whiten = True)),
		("classifier", classifier)
	])
	pipeline.fit(iris_X, iris_y, **fit_params)
	pipeline = make_pmml_pipeline(pipeline, iris_X.columns.values, iris_y.name)
	pipeline.configure(**pmml_options)
	# XGBoost predictions need a looser tolerance during verification
	if isinstance(classifier, XGBClassifier):
		pipeline.verify(iris_X.sample(frac = 0.10, random_state = 13), predict_params = predict_params, predict_proba_params = predict_proba_params, precision = 1e-5, zeroThreshold = 1e-5)
	else:
		pipeline.verify(iris_X.sample(frac = 0.10, random_state = 13), predict_params = predict_params, predict_proba_params = predict_proba_params)
	store_pkl(pipeline, name)
	species = DataFrame(pipeline.predict(iris_X, **predict_params), columns = ["Species"])
	# Idiomatic truthiness test instead of "== True"
	if with_proba:
		species_proba = DataFrame(pipeline.predict_proba(iris_X, **predict_proba_params), columns = ["probability(setosa)", "probability(versicolor)", "probability(virginica)"])
		species = pandas.concat((species, species_proba), axis = 1)
	store_csv(species, name)
示例#21
0
def build_housing(regressor, name, with_kneighbors = False, **pmml_options):
	"""Fit a pipeline on the "housing" dataset and store PKL/CSV artifacts.

	regressor: estimator placed at the end of the pipeline.
	name: base name for the stored artifacts.
	with_kneighbors: when truthy, also store the ids of the nearest
		neighbors reported by regressor.kneighbors().
	pmml_options: PMML conversion options forwarded to pipeline.configure().
	"""
	mapper = DataFrameMapper([
		(housing_X.columns.values, ContinuousDomain())
	])
	pipeline = Pipeline([
		("mapper", mapper),
		("transformer-pipeline", Pipeline([
			("polynomial", PolynomialFeatures(degree = 2, interaction_only = True, include_bias = False)),
			("scaler", StandardScaler()),
			("passthrough-transformer", "passthrough"),
			("selector", SelectPercentile(score_func = f_regression, percentile = 35)),
			("passthrough-final-estimator", "passthrough")
		])),
		("regressor", regressor)
	])
	pipeline.fit(housing_X, housing_y)
	pipeline = make_pmml_pipeline(pipeline, housing_X.columns.values, housing_y.name)
	pipeline.configure(**pmml_options)
	pipeline.verify(housing_X.sample(frac = 0.05, random_state = 13))
	store_pkl(pipeline, name)
	medv = DataFrame(pipeline.predict(housing_X), columns = ["MEDV"])
	# Idiomatic truthiness test instead of "with_kneighbors == True"
	if with_kneighbors:
		Xt = pipeline_transform(pipeline, housing_X)
		kneighbors = regressor.kneighbors(Xt)
		# kneighbors()[1] holds 0-based row indices; shift to 1-based ids
		medv_ids = DataFrame(kneighbors[1] + 1, columns = ["neighbor(" + str(x + 1) + ")" for x in range(regressor.n_neighbors)])
		medv = pandas.concat((medv, medv_ids), axis = 1)
	store_csv(medv, name)
示例#22
0
def build_audit(classifier, name, with_proba = True, fit_params = None, predict_params = None, predict_proba_params = None, **pmml_options):
	"""Fit a FeatureUnion-based pipeline on the "audit" dataset and store PKL/CSV artifacts.

	classifier: estimator placed at the end of the pipeline.
	name: base name for the stored artifacts.
	with_proba: when truthy, also store per-class probability columns.
	fit_params / predict_params / predict_proba_params: optional dicts of
		keyword arguments for the corresponding pipeline methods.
	pmml_options: PMML conversion options forwarded to pipeline.configure().
	"""
	# Use None defaults instead of shared mutable dict defaults
	fit_params = fit_params or {}
	predict_params = predict_params or {}
	predict_proba_params = predict_proba_params or {}
	continuous_mapper = DataFrameMapper([
		(["Age", "Income", "Hours"], MultiDomain([ContinuousDomain() for i in range(0, 3)]))
	])
	categorical_mapper = DataFrameMapper([
		(["Employment"], [CategoricalDomain(), SubstringTransformer(0, 3), OneHotEncoder(drop = ["Vol"]), SelectFromModel(DecisionTreeClassifier(random_state = 13))]),
		(["Education"], [CategoricalDomain(), ReplaceTransformer("[aeiou]", ""), OneHotEncoder(drop = "first"), SelectFromModel(RandomForestClassifier(n_estimators = 3, random_state = 13), threshold = "1.25 * mean")]),
		(["Marital"], [CategoricalDomain(), LabelBinarizer(neg_label = -1, pos_label = 1), SelectKBest(k = 3)]),
		(["Occupation"], [CategoricalDomain(), LabelBinarizer(), SelectKBest(k = 3)]),
		(["Gender"], [CategoricalDomain(), MatchesTransformer("^Male$"), CastTransformer(int)]),
		(["Deductions"], [CategoricalDomain()]),
	])
	pipeline = Pipeline([
		("union", FeatureUnion([
			("continuous", continuous_mapper),
			("categorical", Pipeline([
				("mapper", categorical_mapper),
				("polynomial", PolynomialFeatures())
			]))
		])),
		("classifier", classifier)
	])
	pipeline.fit(audit_X, audit_y, **fit_params)
	pipeline = make_pmml_pipeline(pipeline, audit_X.columns.values, audit_y.name)
	pipeline.configure(**pmml_options)
	# XGBoost predictions need a looser tolerance during verification
	if isinstance(classifier, XGBClassifier):
		pipeline.verify(audit_X.sample(frac = 0.05, random_state = 13), predict_params = predict_params, predict_proba_params = predict_proba_params, precision = 1e-5, zeroThreshold = 1e-5)
	else:
		pipeline.verify(audit_X.sample(frac = 0.05, random_state = 13), predict_params = predict_params, predict_proba_params = predict_proba_params)
	store_pkl(pipeline, name)
	adjusted = DataFrame(pipeline.predict(audit_X, **predict_params), columns = ["Adjusted"])
	# Idiomatic truthiness test instead of "== True"
	if with_proba:
		adjusted_proba = DataFrame(pipeline.predict_proba(audit_X, **predict_proba_params), columns = ["probability(0)", "probability(1)"])
		adjusted = pandas.concat((adjusted, adjusted_proba), axis = 1)
	store_csv(adjusted, name)
示例#23
0
def build_versicolor(classifier, name, with_proba = True, **pmml_options):
	"""Fit, convert to PMML, verify and store a versicolor classification pipeline.

	All dataset columns are decorated with a ContinuousDomain and robust-scaled
	via a ColumnTransformer, then expanded to degree-3 polynomial features and
	passed through a pass-through SelectKBest before reaching the classifier.
	"""
	column_names = versicolor_X.columns.values
	transformer = ColumnTransformer([
		("continuous_columns", Pipeline([
			("domain", ContinuousDomain()),
			("scaler", RobustScaler())
		]), column_names)
	])
	feature_pipeline = Pipeline([
		("polynomial", PolynomialFeatures(degree = 3)),
		("selector", SelectKBest(k = "all"))  # k = "all" keeps every feature
	])
	pipeline = Pipeline([
		("transformer", transformer),
		("transformer-selector-pipeline", feature_pipeline),
		("classifier", classifier)
	])
	pipeline.fit(versicolor_X, versicolor_y)
	pipeline = make_pmml_pipeline(pipeline, column_names, versicolor_y.name)
	pipeline.configure(**pmml_options)
	pipeline.verify(versicolor_X.sample(frac = 0.10, random_state = 13))
	store_pkl(pipeline, name)
	species = DataFrame(pipeline.predict(versicolor_X), columns = ["Species"])
	if with_proba == True:
		proba = DataFrame(pipeline.predict_proba(versicolor_X), columns = ["probability(0)", "probability(1)"])
		species = pandas.concat((species, proba), axis = 1)
	store_csv(species, name)
示例#24
0
def build_visit(regressor, name):
	"""Fit, verify and store a doctor-visits regression pipeline.

	Categorical columns are one-hot encoded; continuous columns are only
	decorated with a ContinuousDomain.
	"""
	features = [(["edlevel"], [CategoricalDomain(), OneHotEncoder()])]
	for bin_column in ["outwork", "female", "married", "kids", "self"]:
		features.append(([bin_column], [CategoricalDomain(), OneHotEncoder()]))
	features.append((["age"], ContinuousDomain()))
	features.append((["hhninc", "educ"], ContinuousDomain()))
	pipeline = PMMLPipeline([
		("mapper", DataFrameMapper(features)),
		("regressor", regressor)
	])
	pipeline.fit(visit_X, visit_y)
	pipeline.verify(visit_X.sample(frac = 0.05, random_state = 13))
	store_pkl(pipeline, name)
	docvis = DataFrame(pipeline.predict(visit_X), columns = ["docvis"])
	store_csv(docvis, name)
示例#25
0
def build_audit(classifier, name, with_proba=True, **kwargs):
    """Fit, convert to PMML, verify and store an audit classification pipeline.

    Continuous columns share a MultiDomain of per-column ContinuousDomains;
    categorical columns are binarized and (for most columns) feature-selected,
    then expanded with PolynomialFeatures. Both branches are concatenated by a
    FeatureUnion feeding the classifier.
    """
    continuous_mapper = DataFrameMapper([
        (["Age", "Income", "Hours"],
         MultiDomain([ContinuousDomain() for _ in range(3)]))
    ])
    categorical_features = [
        (["Employment"], [
            CategoricalDomain(),
            LabelBinarizer(),
            SelectFromModel(DecisionTreeClassifier(random_state=13))
        ]),
        (["Education"], [
            CategoricalDomain(),
            LabelBinarizer(),
            SelectFromModel(RandomForestClassifier(random_state=13, n_estimators=3),
                            threshold="1.25 * mean")
        ]),
        (["Marital"], [
            CategoricalDomain(),
            LabelBinarizer(neg_label=-1, pos_label=1),
            SelectKBest(k=3)
        ]),
        (["Occupation"], [
            CategoricalDomain(),
            LabelBinarizer(),
            SelectKBest(k=3)
        ]),
        (["Gender"], [
            CategoricalDomain(),
            LabelBinarizer(neg_label=-3, pos_label=3)
        ]),
        (["Deductions"], [
            CategoricalDomain(),
            LabelEncoder()
        ]),
    ]
    categorical_pipeline = Pipeline([
        ("mapper", DataFrameMapper(categorical_features)),
        ("polynomial", PolynomialFeatures())
    ])
    pipeline = Pipeline([
        ("union", FeatureUnion([
            ("continuous", continuous_mapper),
            ("categorical", categorical_pipeline)
        ])),
        ("classifier", classifier)
    ])
    pipeline.fit(audit_X, audit_y)
    pipeline = make_pmml_pipeline(pipeline, audit_X.columns.values, audit_y.name)
    # Looser tolerances for XGBoost (presumably due to single-precision arithmetic)
    if isinstance(classifier, XGBClassifier):
        pipeline.verify(audit_X.sample(frac=0.05, random_state=13),
                        precision=1e-5,
                        zeroThreshold=1e-5)
    else:
        pipeline.verify(audit_X.sample(frac=0.05, random_state=13))
    customize(classifier, **kwargs)
    store_pkl(pipeline, name + ".pkl")
    adjusted = DataFrame(pipeline.predict(audit_X), columns=["Adjusted"])
    if with_proba == True:
        adjusted_proba = DataFrame(
            pipeline.predict_proba(audit_X),
            columns=["probability(0)", "probability(1)"])
        adjusted = pandas.concat((adjusted, adjusted_proba), axis=1)
    store_csv(adjusted, name + ".csv")
示例#26
0
	def test_fit_float(self):
		"""Fitting records data range, frequency counts and numeric statistics;
		transform replaces missing values with the configured replacement."""
		domain = ContinuousDomain(missing_value_treatment = "as_value", missing_value_replacement = -1.0, invalid_value_treatment = "as_is")
		self.assertEqual("as_value", domain.missing_value_treatment)
		self.assertEqual(-1.0, domain.missing_value_replacement)
		self.assertEqual("as_is", domain.invalid_value_treatment)
		# No fitted attributes exist before the first fit
		for fitted_attr in ["data_min_", "data_max_", "counts_", "numeric_info_"]:
			self.assertFalse(hasattr(domain, fitted_attr))
		X = DataFrame(numpy.array([1.0, float("NaN"), 3.0, 2.0, float("NaN"), 2.0]))
		Xt = domain.fit_transform(X)
		self.assertEqual(1.0, domain.data_min_)
		self.assertEqual(3.0, domain.data_max_)
		# 6 values total, 2 of them NaN, none invalid
		self.assertEqual({"totalFreq" : 6, "missingFreq" : 2, "invalidFreq" : 0}, domain.counts_)
		self.assertEqual({"minimum" : [1.0], "maximum" : [3.0], "mean" : [2.0], "standardDeviation" : [0.7071067811865476], "median" : [2.0], "interQuartileRange" : [0.5]}, _array_to_list(domain.numeric_info_))
		# NaN cells are replaced with -1.0
		self.assertEqual([1.0, -1.0, 3.0, 2.0, -1.0, 2.0], Xt[0].tolist())
		X = numpy.array([float("NaN"), None])
		Xt = domain.transform(X)
		self.assertEqual([-1.0, -1.0], Xt.tolist())
示例#27
0
def build_audit_na(classifier, name, with_proba = True, fit_params = {}, predict_params = {}, predict_proba_params = {}, predict_transformer = None, predict_proba_transformer = None, apply_transformer = None, **pmml_options):
	"""Fit, verify and store an audit classification pipeline over data containing missing values.

	Each mapped column demonstrates a different missing-value strategy:
	expression-based sentinel flagging (Age, Hours), MissingIndicator (Age),
	outlier-to-missing conversion plus median imputation (Income),
	CategoricalImputer with lookup recoding (Employment), constant replacement
	plus most-frequent imputation (Education/Marital/Occupation), and constant
	imputation with a numeric lookup (Gender).

	NOTE(review): mutable default arguments (fit_params = {} etc.) are shared
	across calls; harmless here only because they are never mutated.
	"""
	# Collapse detailed employment categories into a PRIVATE/PUBLIC dichotomy
	employment_mapping = {
		"CONSULTANT" : "PRIVATE",
		"PSFEDERAL" : "PUBLIC",
		"PSLOCAL" : "PUBLIC",
		"PSSTATE" : "PUBLIC",
		"SELFEMP" : "PRIVATE",
		"PRIVATE" : "PRIVATE"
	}
	# Encode gender numerically; the imputer's constant fill maps to the midpoint 0.5
	gender_mapping = {
		"FEMALE" : 0.0,
		"MALE" : 1.0,
		"MISSING_VALUE" : 0.5
	}
	mapper = DataFrameMapper(
		# Age: replace missing with sentinel -999, then impute the sentinel with 38
		[(["Age"], [ContinuousDomain(missing_values = None, with_data = False), Alias(ExpressionTransformer("X[0] if pandas.notnull(X[0]) else -999", dtype = int), name = "flag_missing(Age, -999)"), SimpleImputer(missing_values = -999, strategy = "constant", fill_value = 38)])] +
		# Age: additionally expose a boolean missingness indicator feature
		[(["Age"], MissingIndicator())] +
		# Hours: same sentinel trick, but let SimpleImputer derive the fill value (mean) and add its own indicator
		[(["Hours"], [ContinuousDomain(missing_values = None, with_data = False), Alias(ExpressionTransformer("-999 if pandas.isnull(X[0]) else X[0]"), name = "flag_missing(Hours, -999)"), SimpleImputer(missing_values = -999, add_indicator = True)])] +
		# Income: values outside [5000, 200000] are treated as missing, then median-imputed
		[(["Income"], [ContinuousDomain(missing_values = None, outlier_treatment = "as_missing_values", low_value = 5000, high_value = 200000, with_data = False), SimpleImputer(strategy = "median", add_indicator = True)])] +
		# Employment: impute, uppercase, recode via mapping (default "OTHER"), lowercase, binarize
		[(["Employment"], [CategoricalDomain(missing_values = None, with_data = False), CategoricalImputer(missing_values = None), StringNormalizer(function = "uppercase"), LookupTransformer(employment_mapping, "OTHER"), StringNormalizer(function = "lowercase"), PMMLLabelBinarizer()])] +
		# Education/Marital/Occupation: missing -> "N/A" placeholder, then most-frequent imputation
		[([column], [CategoricalDomain(missing_values = None, missing_value_replacement = "N/A", with_data = False), SimpleImputer(missing_values = "N/A", strategy = "most_frequent"), StringNormalizer(function = "lowercase"), PMMLLabelBinarizer()]) for column in ["Education", "Marital", "Occupation"]] +
		# Gender: constant-impute, uppercase, then map to a numeric score via gender_mapping
		[(["Gender"], [CategoricalDomain(missing_values = None, with_data = False), SimpleImputer(strategy = "constant"), StringNormalizer(function = "uppercase"), LookupTransformer(gender_mapping, None)])]
	)
	pipeline = PMMLPipeline([
		("mapper", mapper),
		("classifier", classifier)
	], predict_transformer = predict_transformer, predict_proba_transformer = predict_proba_transformer, apply_transformer = apply_transformer)
	pipeline.fit(audit_na_X, audit_na_y, **fit_params)
	pipeline.configure(**pmml_options)
	# Looser verification tolerances for XGBoost models
	if isinstance(classifier, XGBClassifier):
		pipeline.verify(audit_na_X.sample(frac = 0.05, random_state = 13), predict_params = predict_params, predict_proba_params = predict_proba_params, precision = 1e-5, zeroThreshold = 1e-5)
	else:
		pipeline.verify(audit_na_X.sample(frac = 0.05, random_state = 13), predict_params = predict_params, predict_proba_params = predict_proba_params)
	store_pkl(pipeline, name)
	adjusted = DataFrame(pipeline.predict(audit_na_X, **predict_params), columns = ["Adjusted"])
	if with_proba == True:
		adjusted_proba = DataFrame(pipeline.predict_proba(audit_na_X, **predict_proba_params), columns = ["probability(0)", "probability(1)"])
		adjusted = pandas.concat((adjusted, adjusted_proba), axis = 1)
	# For decision trees, also record the leaf node id reached by each row
	if isinstance(classifier, DecisionTreeClassifier):
		Xt = pipeline_transform(pipeline, audit_na_X)
		adjusted_apply = DataFrame(classifier.apply(Xt), columns = ["nodeId"])
		adjusted = pandas.concat((adjusted, adjusted_apply), axis = 1)
	store_csv(adjusted, name)
示例#28
0
def make_fit_lgbmlr(gbdt, lr):
    """Build and fit a GBDT-plus-logistic-regression pipeline on the module-level dataset.

    Categorical columns are label-encoded and flagged to the GBDT as
    categorical features; continuous columns share one ContinuousDomain.
    """
    categorical_features = [([cat_column], [CategoricalDomain(), LabelEncoder()])
                            for cat_column in cat_columns]
    continuous_features = [(cont_columns, ContinuousDomain())]
    pipeline = PMMLPipeline([
        ("mapper", DataFrameMapper(categorical_features + continuous_features)),
        ("classifier", GBDTLRClassifier(gbdt, lr))
    ])
    # The label-encoded categorical columns occupy the first len(cat_columns) positions
    pipeline.fit(df[cat_columns + cont_columns], df[label_column],
                 classifier__gbdt__categorical_feature=range(0, len(cat_columns)))
    return pipeline
示例#29
0
def feature_union(category_feature, numeric_feature):
    """Return a FeatureUnion that preprocesses categorical and numeric columns separately."""
    # Each categorical column: domain decoration, imputation, label encoding
    mapper_category = DataFrameMapper(
        gen_features(
            columns=category_feature,
            # LabelEncoder
            classes=[CategoricalDomain, CategoricalImputer, LabelEncoder]
        )
    )
    # All numeric columns together: domain decoration, mean imputation, standardization
    mapper_numerical = DataFrameMapper([
        (numeric_feature, [ContinuousDomain(), SimpleImputer(strategy='mean'), StandardScaler()])
    ])
    return FeatureUnion([
        ('mapper_category', mapper_category),
        ('mapper_numerical', mapper_numerical)
    ])
示例#30
0
	def test_fit_transform(self):
		"""MultiDomain applies per-column missing-value replacement to both DataFrame and ndarray inputs."""
		domain = MultiDomain([ContinuousDomain(missing_value_replacement = 0.0), CategoricalDomain(missing_value_replacement = "zero")])
		X = DataFrame([[-1.0, "minus one"], [float("NaN"), None], [1.0, "one"]], columns = ["x1", "x2"])
		Xt = domain.fit_transform(X)
		self.assertTrue(isinstance(Xt, DataFrame))
		self.assertEqual([-1.0, 0.0, 1.0], Xt["x1"].tolist())
		self.assertEqual(["minus one", "zero", "one"], Xt["x2"].tolist())
		X = numpy.array([[float("NaN"), None]])
		Xt = domain.transform(X)
		self.assertTrue(isinstance(Xt, numpy.ndarray))
		# Fixed: assertTrue(expected, actual) treats the second argument as a failure
		# message and always passes on a non-empty list; assertEqual actually compares
		self.assertEqual([0.0], Xt[:, 0].tolist())
		self.assertEqual(["zero"], Xt[:, 1].tolist())
示例#31
0
	def test_mapper(self):
		# One ContinuousDomain instance fitted over two mapped columns collects
		# its statistics as per-column arrays rather than scalars.
		domain = ContinuousDomain()
		# Sparse records: "X1" is absent in one row, "X2" in two rows;
		# DataFrameMapper fills the absent cells with NaN before fitting.
		df = DataFrame([{"X1" : 2.0, "X2" : 2, "y" : 2.0}, {"X1" : 1.0, "X2" : 0.5}, {"X1" : 2}, {"X2" : 2}, {"X1" : 2.0, "y" : 1}, {"X1" : 3.0, "X2" : 3.5}])
		mapper = DataFrameMapper([
			(["X1", "X2"], [domain, SimpleImputer(), StandardScaler()]),
			("y", None)
		])
		mapper.fit_transform(df)
		# Column-wise counts: 6 rows total, 1 missing "X1" value, 2 missing "X2" values
		self.assertEqual({"totalFreq" : [6, 6], "missingFreq" : [1, 2], "invalidFreq" : [0, 0]}, _array_to_list(domain.counts_))
		# Statistics are computed over the non-missing values of each column
		self.assertEqual({"minimum" : [1.0, 0.5], "maximum" : [3.0, 3.5], "mean" : [2.0, 2.0]}, _array_to_list(dict((k, domain.numeric_info_[k]) for k in ["minimum", "maximum", "mean"])))
		self.assertEqual([1.0, 0.5], domain.data_min_.tolist())
		self.assertEqual([3.0, 3.5], domain.data_max_.tolist())