Example #1
def build_housing(regressor, name, with_kneighbors=False, **kwargs):
    mapper = DataFrameMapper([(housing_X.columns.values, ContinuousDomain())])
    pipeline = Pipeline([("mapper", mapper),
                         ("transformer-pipeline",
                          Pipeline([
                              ("polynomial",
                               PolynomialFeatures(degree=2,
                                                  interaction_only=True,
                                                  include_bias=False)),
                              ("scaler", StandardScaler()),
                              ("selector",
                               SelectPercentile(score_func=f_regression,
                                                percentile=35)),
                          ])), ("regressor", regressor)])
    pipeline.fit(housing_X, housing_y)
    pipeline = make_pmml_pipeline(pipeline, housing_X.columns.values,
                                  housing_y.name)
    pipeline.verify(housing_X.sample(frac=0.05, random_state=13))
    customize(regressor, **kwargs)
    store_pkl(pipeline, name + ".pkl")
    medv = DataFrame(pipeline.predict(housing_X), columns=["MEDV"])
    if with_kneighbors:
        Xt = pipeline_transform(pipeline, housing_X)
        kneighbors = regressor.kneighbors(Xt)
        medv_ids = DataFrame(kneighbors[1] + 1,
                             columns=[
                                 "neighbor(" + str(x + 1) + ")"
                                 for x in range(regressor.n_neighbors)
                             ])
        medv = pandas.concat((medv, medv_ids), axis=1)
    store_csv(medv, name + ".csv")
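A minimal usage sketch for the helper above (hypothetical call; it assumes the module-level housing_X/housing_y frames and store_* helpers, and that the regressor supports the kneighbors query used by the with_kneighbors branch):

from sklearn.neighbors import KNeighborsRegressor

# "KNNHousing" is an illustrative output name, not one used elsewhere in this file
build_housing(KNeighborsRegressor(n_neighbors=5), "KNNHousing", with_kneighbors=True)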
Example #2
def build_iris(classifier, name, with_proba=True, **kwargs):
    pipeline = Pipeline([
        ("pipeline",
         Pipeline([("domain", ContinuousDomain()),
                   ("transform",
                    FeatureUnion([("normal_scale", FunctionTransformer(None)),
                                  ("log_scale",
                                   FunctionTransformer(numpy.log10))]))])),
        ("scaler", RobustScaler()),
        ("pca", IncrementalPCA(n_components=3, whiten=True)),
        ("classifier", classifier)
    ])
    pipeline.fit(iris_X, iris_y)
    pipeline = make_pmml_pipeline(pipeline, iris_X.columns.values, iris_y.name)
    if isinstance(classifier, XGBClassifier):
        pipeline.verify(iris_X.sample(frac=0.10, random_state=13),
                        precision=1e-5,
                        zeroThreshold=1e-5)
    else:
        pipeline.verify(iris_X.sample(frac=0.10, random_state=13))
    customize(classifier, **kwargs)
    store_pkl(pipeline, name + ".pkl")
    species = DataFrame(pipeline.predict(iris_X), columns=["Species"])
    if with_proba:
        species_proba = DataFrame(pipeline.predict_proba(iris_X),
                                  columns=[
                                      "probability(setosa)",
                                      "probability(versicolor)",
                                      "probability(virginica)"
                                  ])
        species = pandas.concat((species, species_proba), axis=1)
    store_csv(species, name + ".csv")
Example #3
def build_audit_cat(classifier, name, with_proba=True, **fit_params):
    mapper = DataFrameMapper(
        [([column], ContinuousDomain())
         for column in ["Age", "Income"]] + [(["Hours"], [
             ContinuousDomain(),
             CutTransformer(bins=[0, 20, 40, 60, 80, 100],
                            labels=False,
                            right=False,
                            include_lowest=True)
         ])] +
        [([column], [CategoricalDomain(), LabelEncoder()]) for column in [
            "Employment", "Education", "Marital", "Occupation", "Gender",
            "Deductions"
        ]])
    pipeline = Pipeline([("mapper", mapper), ("classifier", classifier)])
    pipeline.fit(audit_X, audit_y, **fit_params)
    pipeline = make_pmml_pipeline(pipeline, audit_X.columns.values,
                                  audit_y.name)
    store_pkl(pipeline, name + ".pkl")
    adjusted = DataFrame(pipeline.predict(audit_X), columns=["Adjusted"])
    if with_proba:
        adjusted_proba = DataFrame(
            pipeline.predict_proba(audit_X),
            columns=["probability(0)", "probability(1)"])
        adjusted = pandas.concat((adjusted, adjusted_proba), axis=1)
    store_csv(adjusted, name + ".csv")
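Because **fit_params is forwarded verbatim to pipeline.fit, estimator-level fit arguments need the "classifier__" step prefix. A hedged sketch, assuming an LGBMClassifier; the categorical_feature indices follow the mapper's output order (the Hours bin plus the six label-encoded columns) and would need adjusting if that order changes:

from lightgbm import LGBMClassifier

build_audit_cat(LGBMClassifier(objective="binary"), "LGBMAuditCat",
                classifier__categorical_feature=[2, 3, 4, 5, 6, 7, 8])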
Example #4
def build_audit(classifier, name, with_proba=True, **pmml_options):
    continuous_mapper = DataFrameMapper([
        (["Age", "Income",
          "Hours"], MultiDomain([ContinuousDomain() for i in range(0, 3)]))
    ])
    categorical_mapper = DataFrameMapper([
        (["Employment"], [
            CategoricalDomain(),
            LabelBinarizer(),
            SelectFromModel(DecisionTreeClassifier(random_state=13))
        ]),
        (["Education"], [
            CategoricalDomain(),
            LabelBinarizer(),
            SelectFromModel(RandomForestClassifier(random_state=13,
                                                   n_estimators=3),
                            threshold="1.25 * mean")
        ]),
        (["Marital"], [
            CategoricalDomain(),
            LabelBinarizer(neg_label=-1, pos_label=1),
            SelectKBest(k=3)
        ]),
        (["Occupation"],
         [CategoricalDomain(),
          LabelBinarizer(),
          SelectKBest(k=3)]),
        (["Gender"],
         [CategoricalDomain(),
          LabelBinarizer(neg_label=-3, pos_label=3)]),
        (["Deductions"], [CategoricalDomain(),
                          LabelEncoder()]),
    ])
    pipeline = Pipeline([
        ("union",
         FeatureUnion([("continuous", continuous_mapper),
                       ("categorical",
                        Pipeline([("mapper", categorical_mapper),
                                  ("polynomial", PolynomialFeatures())]))])),
        ("classifier", classifier)
    ])
    pipeline.fit(audit_X, audit_y)
    pipeline = make_pmml_pipeline(pipeline, audit_X.columns.values,
                                  audit_y.name)
    pipeline.configure(**pmml_options)
    if isinstance(classifier, XGBClassifier):
        pipeline.verify(audit_X.sample(frac=0.05, random_state=13),
                        precision=1e-5,
                        zeroThreshold=1e-5)
    else:
        pipeline.verify(audit_X.sample(frac=0.05, random_state=13))
    store_pkl(pipeline, name + ".pkl")
    adjusted = DataFrame(pipeline.predict(audit_X), columns=["Adjusted"])
    if with_proba:
        adjusted_proba = DataFrame(
            pipeline.predict_proba(audit_X),
            columns=["probability(0)", "probability(1)"])
        adjusted = pandas.concat((adjusted, adjusted_proba), axis=1)
    store_csv(adjusted, name + ".csv")
Example #5
def save_model(model, current_time):
    ''' pickle and save as pmml '''

    # pickle (use a context manager so the file handle is closed)
    with open(f'outputs/model_{current_time}.sav', 'wb') as f:
        pickle.dump(model, f)

    # pmml
    pmml_object = sklearn2pmml.make_pmml_pipeline(model)
    sklearn2pmml.sklearn2pmml(pmml_object,
                              f'outputs/model_{current_time}.pmml.xml')
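A usage sketch for save_model (hypothetical; X_train/y_train stand in for whatever training data the surrounding script prepares):

import os
import time
from sklearn.tree import DecisionTreeClassifier

os.makedirs("outputs", exist_ok=True)  # save_model writes into outputs/
model = DecisionTreeClassifier().fit(X_train, y_train)  # assumes X_train/y_train exist
save_model(model, time.strftime("%Y%m%d-%H%M%S"))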
Example #6
def build_regressor(data, name):
	X, y = data
	config = make_tpot_pmml_config(regressor_config_dict)
	del config["sklearn.neighbors.KNeighborsRegressor"]
	regressor = TPOTRegressor(generations = 3, population_size = 3, random_state = 13, config_dict = config, verbosity = 2)
	regressor.fit(X, y)
	pipeline = make_pmml_pipeline(regressor.fitted_pipeline_, active_fields = X.columns.values, target_fields = [y.name])
	print(repr(pipeline))
	store_pkl(pipeline, name)
	result = DataFrame(regressor.predict(X), columns = [y.name])
	store_csv(result, name)
Example #7
def build_audit(classifier, name, with_proba=True, **kwargs):
    continuous_mapper = DataFrameMapper([("Age", ContinuousDomain()),
                                         ("Income", ContinuousDomain()),
                                         ("Hours", ContinuousDomain())])
    categorical_mapper = DataFrameMapper([
        ("Employment", [
            CategoricalDomain(),
            LabelBinarizer(),
            SelectFromModel(DecisionTreeClassifier(random_state=13))
        ]),
        ("Education", [
            CategoricalDomain(),
            LabelBinarizer(),
            SelectFromModel(RandomForestClassifier(random_state=13,
                                                   n_estimators=3),
                            threshold="1.25 * mean")
        ]),
        ("Marital", [
            CategoricalDomain(),
            LabelBinarizer(neg_label=-1, pos_label=1),
            SelectKBest(k=3)
        ]),
        ("Occupation",
         [CategoricalDomain(),
          LabelBinarizer(),
          SelectKBest(k=3)]),
        ("Gender",
         [CategoricalDomain(),
          LabelBinarizer(neg_label=-3, pos_label=3)]),
        ("Deductions", [CategoricalDomain(),
                        LabelEncoder()]),
    ])
    pipeline = Pipeline([
        ("union",
         FeatureUnion([("continuous", continuous_mapper),
                       ("categorical",
                        Pipeline([("mapper", categorical_mapper),
                                  ("polynomial", PolynomialFeatures())]))])),
        ("classifier", classifier)
    ])
    pipeline.fit(audit_X, audit_y)
    pipeline = make_pmml_pipeline(pipeline, audit_X.columns.values,
                                  audit_y.name)
    customize(classifier, **kwargs)
    store_pkl(pipeline, name + ".pkl")
    adjusted = DataFrame(pipeline.predict(audit_X), columns=["Adjusted"])
    if with_proba:
        adjusted_proba = DataFrame(
            pipeline.predict_proba(audit_X),
            columns=["probability(0)", "probability(1)"])
        adjusted = pandas.concat((adjusted, adjusted_proba), axis=1)
    store_csv(adjusted, name + ".csv")
Example #8
def build_ocsvm_housing(svm, name):
    mapper = DataFrameMapper([(housing_X.columns.values, ContinuousDomain())])
    pipeline = Pipeline([("mapper", mapper), ("scaler", MaxAbsScaler()),
                         ("estimator", svm)])
    pipeline.fit(housing_X)
    pipeline = make_pmml_pipeline(pipeline, housing_X.columns.values)
    store_pkl(pipeline, name)
    decisionFunction = DataFrame(pipeline.decision_function(housing_X),
                                 columns=["decisionFunction"])
    outlier = DataFrame(pipeline.predict(housing_X) <= 0, columns=["outlier"])
    outlier = outlier.replace(True, "true").replace(False, "false")
    store_csv(pandas.concat([decisionFunction, outlier], axis=1), name)
Example #9
def build_iforest_housing(iforest, name, **pmml_options):
    mapper = DataFrameMapper([(housing_X.columns.values, ContinuousDomain())])
    pipeline = Pipeline([("mapper", mapper), ("estimator", iforest)])
    pipeline.fit(housing_X)
    pipeline = make_pmml_pipeline(pipeline, housing_X.columns.values)
    pipeline.configure(**pmml_options)
    store_pkl(pipeline, name)
    decisionFunction = DataFrame(pipeline.decision_function(housing_X),
                                 columns=["decisionFunction"])
    outlier = DataFrame(pipeline.predict(housing_X) == -1, columns=["outlier"])
    outlier = outlier.replace(True, "true").replace(False, "false")
    store_csv(pandas.concat([decisionFunction, outlier], axis=1), name)
Example #10
def build_classifier(data, name):
	X, y = data
	categories = pandas.unique(y)
	config = make_tpot_pmml_config(classifier_config_dict)
	del config["sklearn.neighbors.KNeighborsClassifier"]
	classifier = TPOTClassifier(generations = 1, population_size = 3, random_state = 13, config_dict = config, verbosity = 2)
	classifier.fit(X, y)
	pipeline = make_pmml_pipeline(classifier.fitted_pipeline_, active_fields = X.columns.values, target_fields = [y.name])
	print(repr(pipeline))
	store_pkl(pipeline, name)
	result = DataFrame(classifier.predict(X), columns = [y.name])
	if len(categories) > 0:
		probabilities = DataFrame(classifier.predict_proba(X), columns = ["probability(" + str(category) + ")" for category in categories])
		result = pandas.concat([result, probabilities], axis = 1)
	store_csv(result, name)
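A hypothetical call, reusing the (iris_X, iris_y) frames that other examples in this file rely on:

build_classifier((iris_X, iris_y), "TPOTIris")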
Example #11
def build_audit(classifier, name, with_proba = True, fit_params = {}, predict_params = {}, predict_proba_params = {}, **pmml_options):
	continuous_mapper = DataFrameMapper([
		(["Age", "Income", "Hours"], MultiDomain([ContinuousDomain() for i in range(0, 3)]))
	])
	categorical_mapper = DataFrameMapper([
		(["Employment"], [CategoricalDomain(), SubstringTransformer(0, 3), OneHotEncoder(drop = ["Vol"]), SelectFromModel(DecisionTreeClassifier(random_state = 13))]),
		(["Education"], [CategoricalDomain(), ReplaceTransformer("[aeiou]", ""), OneHotEncoder(drop = "first"), SelectFromModel(RandomForestClassifier(n_estimators = 3, random_state = 13), threshold = "1.25 * mean")]),
		(["Marital"], [CategoricalDomain(), LabelBinarizer(neg_label = -1, pos_label = 1), SelectKBest(k = 3)]),
		(["Occupation"], [CategoricalDomain(), LabelBinarizer(), SelectKBest(k = 3)]),
		(["Gender"], [CategoricalDomain(), MatchesTransformer("^Male$"), CastTransformer(int)]),
		(["Deductions"], [CategoricalDomain()]),
	])
	pipeline = Pipeline([
		("union", FeatureUnion([
			("continuous", continuous_mapper),
			("categorical", Pipeline([
				("mapper", categorical_mapper),
				("polynomial", PolynomialFeatures())
			]))
		])),
		("classifier", classifier)
	])
	pipeline.fit(audit_X, audit_y, **fit_params)
	pipeline = make_pmml_pipeline(pipeline, audit_X.columns.values, audit_y.name)
	pipeline.configure(**pmml_options)
	if isinstance(classifier, EstimatorProxy):
		estimator = classifier.estimator
		if hasattr(estimator, "estimators_"):
			child_estimators = estimator.estimators_
			if isinstance(child_estimators, numpy.ndarray):
				child_estimators = child_estimators.flatten().tolist()
			for child_estimator in child_estimators:
				child_estimator.pmml_feature_importances_ = child_estimator.feature_importances_
	elif isinstance(classifier, XGBClassifier):
		classifier.pmml_feature_importances_ = classifier.feature_importances_
	if isinstance(classifier, XGBClassifier):
		pipeline.verify(audit_X.sample(frac = 0.05, random_state = 13), predict_params = predict_params, predict_proba_params = predict_proba_params, precision = 1e-5, zeroThreshold = 1e-5)
	else:
		pipeline.verify(audit_X.sample(frac = 0.05, random_state = 13), predict_params = predict_params, predict_proba_params = predict_proba_params)
	store_pkl(pipeline, name)
	adjusted = DataFrame(pipeline.predict(audit_X, **predict_params), columns = ["Adjusted"])
	if with_proba:
		adjusted_proba = DataFrame(pipeline.predict_proba(audit_X, **predict_proba_params), columns = ["probability(0)", "probability(1)"])
		adjusted = pandas.concat((adjusted, adjusted_proba), axis = 1)
	store_csv(adjusted, name)
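The trailing **pmml_options go straight to pipeline.configure, so converter options can be supplied per call. A hedged sketch, assuming a plain decision tree and the compact tree-layout option understood by sklearn2pmml:

from sklearn.tree import DecisionTreeClassifier

build_audit(DecisionTreeClassifier(min_samples_leaf=5, random_state=13),
            "DecisionTreeAudit", compact=False)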
Example #12
def build_wheat(kmeans, name, with_affinity=True, **pmml_options):
    mapper = DataFrameMapper([(wheat_X.columns.values, ContinuousDomain())])
    pipeline = Pipeline([("mapper", mapper), ("scaler", MinMaxScaler()),
                         ("clusterer", kmeans)])
    pipeline.fit(wheat_X)
    pipeline = make_pmml_pipeline(pipeline, wheat_X.columns.values)
    pipeline.configure(**pmml_options)
    store_pkl(pipeline, name + ".pkl")
    cluster = DataFrame(pipeline.predict(wheat_X), columns=["Cluster"])
    if with_affinity:
        Xt = pipeline_transform(pipeline, wheat_X)
        affinity_0 = kmeans_distance(kmeans, 0, Xt)
        affinity_1 = kmeans_distance(kmeans, 1, Xt)
        affinity_2 = kmeans_distance(kmeans, 2, Xt)
        cluster_affinity = DataFrame(
            numpy.transpose([affinity_0, affinity_1, affinity_2]),
            columns=["affinity(0)", "affinity(1)", "affinity(2)"])
        cluster = pandas.concat((cluster, cluster_affinity), axis=1)
    store_csv(cluster, name + ".csv")
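The affinity block hard-codes clusters 0 through 2, so the helper implicitly expects a three-cluster model:

from sklearn.cluster import KMeans

build_wheat(KMeans(n_clusters=3, random_state=13), "KMeansWheat")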
Example #13
def build_auto(regressor, name, **pmml_options):
    cylinders_origin_mapping = {
        (8, 1): "8/1",
        (6, 1): "6/1",
        (4, 1): "4/1",
        (6, 2): "6/2",
        (4, 2): "4/2",
        (6, 3): "6/3",
        (4, 3): "4/3"
    }
    mapper = DataFrameMapper([
        (["cylinders", "origin"], [
            MultiDomain([CategoricalDomain(),
                         CategoricalDomain()]),
            MultiLookupTransformer(cylinders_origin_mapping,
                                   default_value="other"),
            LabelBinarizer()
        ]),
        (["model_year"], [CategoricalDomain(),
                          Binarizer(threshold=77)], {
                              "alias": "bin(model_year, 77)"
                          }),  # Pre/post 1973 oil crisis effects
        (["displacement", "horsepower", "weight",
          "acceleration"], [ContinuousDomain(),
                            StandardScaler()]),
        (["weight", "displacement"],
         ExpressionTransformer("(X[:, 0] / X[:, 1]) + 0.5"), {
             "alias": "weight / displacement + 0.5"
         })
    ])
    pipeline = Pipeline([("mapper", mapper), ("regressor", regressor)])
    pipeline.fit(auto_X, auto_y)
    pipeline = make_pmml_pipeline(pipeline, auto_X.columns.values, auto_y.name)
    pipeline.configure(**pmml_options)
    if isinstance(regressor, XGBRegressor):
        pipeline.verify(auto_X.sample(frac=0.05, random_state=13),
                        precision=1e-5,
                        zeroThreshold=1e-5)
    else:
        pipeline.verify(auto_X.sample(frac=0.05, random_state=13))
    store_pkl(pipeline, name + ".pkl")
    mpg = DataFrame(pipeline.predict(auto_X), columns=["mpg"])
    store_csv(mpg, name + ".csv")
Example #14
def main():
    parser = argparse.ArgumentParser(
        prog=__file__, formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-i',
                        '--input_path',
                        help='Path to the .csv with coefs and classes',
                        required=True)
    args = parser.parse_args()
    df = pd.read_csv(args.input_path)
    df = df.drop(df.columns[0], axis=1)
    Y = df["target_class"]
    X = df.drop("target_class", axis=1)
    print(X.shape)
    clf = tree.DecisionTreeClassifier(criterion='entropy')

    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3)
    clf = clf.fit(X_train, Y_train)
    Y_test_predicted = clf.predict(X_test)
    pipeline = make_pmml_pipeline(clf)
    sklearn2pmml(pipeline, pmml="../models/bankrot.pmml")
Example #15
def build_versicolor(classifier, name, with_proba=True, **kwargs):
    mapper = DataFrameMapper([(versicolor_X.columns.values,
                               [ContinuousDomain(),
                                RobustScaler()])])
    pipeline = Pipeline([("mapper", mapper),
                         ("transformer-pipeline",
                          Pipeline([("polynomial",
                                     PolynomialFeatures(degree=3)),
                                    ("selector", SelectKBest(k="all"))])),
                         ("classifier", classifier)])
    pipeline.fit(versicolor_X, versicolor_y)
    pipeline = make_pmml_pipeline(pipeline, versicolor_X.columns.values,
                                  versicolor_y.name)
    customize(classifier, **kwargs)
    store_pkl(pipeline, name + ".pkl")
    species = DataFrame(pipeline.predict(versicolor_X), columns=["Species"])
    if with_proba:
        species_proba = DataFrame(pipeline.predict_proba(versicolor_X),
                                  columns=["probability(0)", "probability(1)"])
        species = pandas.concat((species, species_proba), axis=1)
    store_csv(species, name + ".csv")
Example #16
def build_iris(classifier, name, with_proba=True, **pmml_options):
    pipeline = Pipeline([
        ("pipeline",
         Pipeline([("mapper",
                    DataFrameMapper([
                        (iris_X.columns.values, ContinuousDomain()),
                        (["Sepal.Length",
                          "Petal.Length"], Aggregator(function="mean")),
                        (["Sepal.Width",
                          "Petal.Width"], Aggregator(function="mean"))
                    ])),
                   ("transform",
                    FeatureUnion([
                        ("normal_scale", FunctionTransformer(None)),
                        ("log_scale", FunctionTransformer(numpy.log10)),
                        ("power_scale", PowerFunctionTransformer(power=2))
                    ]))])), ("pca", IncrementalPCA(n_components=3,
                                                   whiten=True)),
        ("classifier", classifier)
    ])
    pipeline.fit(iris_X, iris_y)
    pipeline = make_pmml_pipeline(pipeline, iris_X.columns.values, iris_y.name)
    pipeline.configure(**pmml_options)
    if isinstance(classifier, XGBClassifier):
        pipeline.verify(iris_X.sample(frac=0.10, random_state=13),
                        precision=1e-5,
                        zeroThreshold=1e-5)
    else:
        pipeline.verify(iris_X.sample(frac=0.10, random_state=13))
    store_pkl(pipeline, name + ".pkl")
    species = DataFrame(pipeline.predict(iris_X), columns=["Species"])
    if with_proba:
        species_proba = DataFrame(pipeline.predict_proba(iris_X),
                                  columns=[
                                      "probability(setosa)",
                                      "probability(versicolor)",
                                      "probability(virginica)"
                                  ])
        species = pandas.concat((species, species_proba), axis=1)
    store_csv(species, name + ".csv")
Example #17
def build_audit_cat(classifier, name, with_proba = True, fit_params = {}):
	mapper = DataFrameMapper(
		[([column], ContinuousDomain()) for column in ["Age", "Income"]] +
		[(["Hours"], [ContinuousDomain(), CutTransformer(bins = [0, 20, 40, 60, 80, 100], labels = False, right = False, include_lowest = True)])] +
		[(["Employment", "Education"], [MultiDomain([CategoricalDomain(), CategoricalDomain()]), OrdinalEncoder(dtype = numpy.int_)])] +
		[(["Marital"], [CategoricalDomain(), OrdinalEncoder(dtype = numpy.uint16)])] +
		[(["Occupation"], [CategoricalDomain(), OrdinalEncoder(dtype = numpy.float_)])] +
		[([column], [CategoricalDomain(), LabelEncoder()]) for column in ["Gender", "Deductions"]]
	)
	pipeline = Pipeline([
		("mapper", mapper),
		("classifier", classifier)
	])
	pipeline.fit(audit_X, audit_y, **fit_params)
	pipeline = make_pmml_pipeline(pipeline, audit_X.columns.values, audit_y.name)
	pipeline.verify(audit_X.sample(frac = 0.05, random_state = 13))
	store_pkl(pipeline, name)
	adjusted = DataFrame(pipeline.predict(audit_X), columns = ["Adjusted"])
	if with_proba:
		adjusted_proba = DataFrame(pipeline.predict_proba(audit_X), columns = ["probability(0)", "probability(1)"])
		adjusted = pandas.concat((adjusted, adjusted_proba), axis = 1)
	store_csv(adjusted, name)
Example #18
def build_regressor(data, feature_pipeline, generations, population_size,
                    name):
    X, y = data
    Xt = feature_pipeline.fit_transform(X)
    Xt = Xt.astype(float)
    config = make_tpot_pmml_config(regressor_config_dict)
    config = filter_config(config)
    del config["sklearn.neighbors.KNeighborsRegressor"]
    regressor = TPOTRegressor(generations=generations,
                              population_size=population_size,
                              random_state=13,
                              config_dict=config,
                              verbosity=2)
    regressor.fit(Xt, y)
    pipeline = Pipeline(steps=feature_pipeline.steps +
                        regressor.fitted_pipeline_.steps)
    pipeline = make_pmml_pipeline(pipeline,
                                  active_fields=X.columns.values,
                                  target_fields=[y.name])
    print(repr(pipeline))
    store_pkl(pipeline, name)
    result = DataFrame(regressor.predict(Xt), columns=[y.name])
    store_csv(result, name)
Example #19
def build_auto(regressor, name, **kwargs):
    mapper = DataFrameMapper([
        (["cylinders"], CategoricalDomain()),
        (["displacement", "horsepower", "weight",
          "acceleration"], [ContinuousDomain(),
                            StandardScaler()]),
        (["model_year"], [CategoricalDomain(),
                          Binarizer(threshold=77)], {
                              "alias": "bin(model_year, 77)"
                          }),  # Pre/post 1973 oil crisis effects
        (["origin"], OneHotEncoder()),
        (["weight", "displacement"],
         ExpressionTransformer("(X[:, 0] / X[:, 1]) + 0.5"), {
             "alias": "weight / displacement + 0.5"
         })
    ])
    pipeline = Pipeline([("mapper", mapper), ("regressor", regressor)])
    pipeline.fit(auto_X, auto_y)
    pipeline = make_pmml_pipeline(pipeline, auto_X.columns.values, auto_y.name)
    customize(regressor, **kwargs)
    store_pkl(pipeline, name + ".pkl")
    mpg = DataFrame(pipeline.predict(auto_X), columns=["mpg"])
    store_csv(mpg, name + ".csv")
Example #20
def build_versicolor(classifier, name, with_proba=True, **pmml_options):
    transformer = ColumnTransformer([("continuous_columns",
                                      Pipeline([("domain", ContinuousDomain()),
                                                ("scaler", RobustScaler())]),
                                      versicolor_X.columns.values)])
    pipeline = Pipeline([("transformer", transformer),
                         ("transformer-selector-pipeline",
                          Pipeline([("polynomial",
                                     PolynomialFeatures(degree=3)),
                                    ("selector", SelectKBest(k="all"))])),
                         ("classifier", classifier)])
    pipeline.fit(versicolor_X, versicolor_y)
    pipeline = make_pmml_pipeline(pipeline, versicolor_X.columns.values,
                                  versicolor_y.name)
    pipeline.configure(**pmml_options)
    pipeline.verify(versicolor_X.sample(frac=0.10, random_state=13))
    store_pkl(pipeline, name + ".pkl")
    species = DataFrame(pipeline.predict(versicolor_X), columns=["Species"])
    if with_proba:
        species_proba = DataFrame(pipeline.predict_proba(versicolor_X),
                                  columns=["probability(0)", "probability(1)"])
        species = pandas.concat((species, species_proba), axis=1)
    store_csv(species, name + ".csv")
Example #21
def build_classifier(data, feature_pipeline, generations, population_size,
                     name):
    X, y = data
    Xt = feature_pipeline.fit_transform(X)
    Xt = Xt.astype(float)
    categories = pandas.unique(y)
    config = make_tpot_pmml_config(classifier_config_dict)
    config = filter_config(config)
    # Does not support nesting - see http://mantis.dmg.org/view.php?id=208
    del config["sklearn.naive_bayes.GaussianNB"]
    del config["sklearn.neighbors.KNeighborsClassifier"]
    # Does not support classifier.predict_proba(Xt)
    del config["sklearn.svm.LinearSVC"]
    del config["sklearn.tree.DecisionTreeClassifier"]
    classifier = TPOTClassifier(generations=generations,
                                population_size=population_size,
                                random_state=13,
                                config_dict=config,
                                verbosity=2)
    classifier.fit(Xt, y)
    pipeline = Pipeline(steps=feature_pipeline.steps +
                        classifier.fitted_pipeline_.steps)
    pipeline = make_pmml_pipeline(pipeline,
                                  active_fields=X.columns.values,
                                  target_fields=[y.name])
    pipeline.verify(X.sample(frac=0.05, random_state=13))
    print(repr(pipeline))
    store_pkl(pipeline, name)
    result = DataFrame(classifier.predict(Xt), columns=[y.name])
    if len(categories) > 0:
        probabilities = DataFrame(classifier.predict_proba(Xt),
                                  columns=[
                                      "probability(" + str(category) + ")"
                                      for category in categories
                                  ])
        result = pandas.concat([result, probabilities], axis=1)
    store_csv(result, name)
Example #22
# coding:utf-8
import joblib  # sklearn.externals.joblib was removed in newer scikit-learn releases
import pandas as pd

from sklearn2pmml import make_pmml_pipeline, sklearn2pmml

pd.set_option('display.max_columns', None)

# load a previously fitted scikit-learn model
obj = joblib.load("model.m")

pmml_pipeline = make_pmml_pipeline(obj,
                                   active_fields=[
                                       'sbp', 'tobacco', 'ldl', 'adiposity',
                                       'famhist', 'typea', 'obesity',
                                       'alcohol', 'age'
                                   ],
                                   target_fields=['chd'])
sklearn2pmml(pmml_pipeline, "result.pmml", with_repr=True, debug=True)
Example #23
# The snippet begins mid-loop; the evaluation loop head is reconstructed below as a
# hedged sketch, assuming string labels "true"/"false" and parallel sequences
# test_labels/predictions (only the trailing else-branch appeared in the original).
correct = incorrect = 0
truePositive = trueNegative = falsePositive = falseNegative = 0
total = len(test_labels)
for actual, predicted in zip(test_labels, predictions):
    if actual == predicted:
        correct += 1
        if predicted == "true":
            truePositive += 1
        else:
            trueNegative += 1
    else:
        incorrect += 1
        if predicted == "true":
            falsePositive += 1
        else:
            falseNegative += 1

sensitivity = truePositive / (truePositive + falseNegative)
specificity = trueNegative / (trueNegative + falsePositive)

# Print results
print(f"Results for model {type(model).__name__}")
print(f"Correct: {correct}")
print(f"Incorrect: {incorrect}")
print(f"Accuracy: {100 * correct / total:.2f}%")

print(f"True Positive Rate: {100 * sensitivity:.2f}%")
print(f"True Negative Rate: {100 * specificity:.2f}%")


from sklearn2pmml import sklearn2pmml
from sklearn2pmml import make_pmml_pipeline

# Export the trained model in PMML

pipeline = make_pmml_pipeline(
    model,
    active_fields= ["category", "urgency", "targetPrice", "price"],
    target_fields= ["approval"]
)
sklearn2pmml(pipeline, "order-approval.pmml")
Example #24
# The snippet begins mid-pipeline; the imports and leading steps are reconstructed
# below as a hedged sketch, assuming a TF-IDF text step (Splitter is the
# PMML-compatible tokenizer from sklearn2pmml.feature_extraction.text) feeding an
# SGD classifier. train_data/test_data, train_labels/test_labels and
# FEATURE_COLS/TARGET_COL are assumed from the elided part of the script.
import numpy
import pandas as pd
import sklearn2pmml

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn2pmml import PMMLPipeline
from sklearn2pmml.feature_extraction.text import Splitter

pipeline = PMMLPipeline([
    ('tfidf',
     TfidfVectorizer(norm=None,
                     tokenizer=Splitter())),
    (
        'linear',
        SGDClassifier(
            # params
        )),
])
# Train the model
pipeline.fit(train_data, train_labels)

# Pack verification data and verify
pipeline.verify(test_data)

# Save the pipeline + model
sklearn2pmml.sklearn2pmml(sklearn2pmml.make_pmml_pipeline(
    pipeline, active_fields=FEATURE_COLS, target_fields=TARGET_COL),
                          'model.pmml',
                          with_repr=True,
                          debug=True)

# Spot-check the decision function on a single random sample
decision = pipeline.decision_function(test_data.sample(1))

jpmml_input_data = pd.read_csv("jpmml-test-input.csv", usecols=FEATURE_COLS)
input_pred = pipeline.predict(jpmml_input_data.squeeze())
with numpy.printoptions(threshold=numpy.inf):
    print(input_pred)

pred = pipeline.predict(test_data)
print('Accuracy = {:.3f}'.format(
    sum(l == p for l, p in zip(test_labels, pred)) / len(test_labels)))
Example #25
File: module_code.py  Project: xiyucai/test
a = tree_to_rulescode(clf, dx_names, fc_file_name=False)

model_eval = pd.DataFrame(columns=['data_category', 'KS', 'AUC'])
model_eval.loc[0, :] = [
    'train_DATA',
    KS(clf, X_train, Y_train),
    auc(clf, X_train, Y_train)
]
model_eval.loc[1, :] = [
    'test_DATA',
    KS(clf, X_test, Y_test),
    auc(clf, X_test, Y_test)
]

model_eval = model_ev(clf=clf,
                      df=df,
                      dx_feacolname=dx_feature_col,
                      df_result=model_eval,
                      loan_month='type',
                      bad_='bad')

pipeline = sp.make_pmml_pipeline(clf,
                                 active_fields=dx_names,
                                 target_fields='is_bad')
pipeline.configure(node_id=True, winner_id=True,
                   numberofFields=True)  # add node_id to the PMML file

pmml_path = path_v + '\\' + '_model.pmml'
sklearn2pmml(pipeline, pmml_path, with_repr=True)