def test_fit(self):
     regressor = DummyRegressor()
     regressor_proxy = EstimatorProxy(regressor, attr_names_=["constant_"])
     self.assertFalse(hasattr(regressor_proxy, "constant_"))
     regressor_proxy.fit(numpy.array([[0], [0]]), numpy.array([0.0, 2.0]))
     self.assertEqual(1.0, regressor.constant_)
     self.assertEqual(1.0, regressor_proxy.constant_)
Exemplo n.º 2
0
def build_audit(classifier, name, with_proba=True):
    mapper = DataFrameMapper([
        ("Age", ContinuousDomain()),
        ("Employment", [
            LabelBinarizer(),
            SelectFromModel(EstimatorProxy(
                DecisionTreeClassifier(random_state=13)),
                            threshold="1.25 * mean")
        ]),
        ("Education", [
            LabelBinarizer(),
            SelectorProxy(
                SelectFromModel(EstimatorProxy(
                    RandomForestClassifier(random_state=13, n_estimators=3)),
                                threshold="median"))
        ]), ("Marital", [LabelBinarizer(), SelectKBest(k=3)]),
        ("Occupation", [LabelBinarizer(),
                        SelectorProxy(SelectKBest(k=3))]),
        ("Income", ContinuousDomain()), ("Gender", LabelEncoder()),
        ("Deductions", LabelEncoder()), ("Hours", ContinuousDomain())
    ])
    pipeline = PMMLPipeline([("mapper", mapper), ("classifier", classifier)])
    pipeline.fit(audit_X, audit_y)
    store_pkl(pipeline, name + ".pkl")
    adjusted = DataFrame(pipeline.predict(audit_X), columns=["Adjusted"])
    if (with_proba == True):
        adjusted_proba = DataFrame(pipeline.predict_proba(audit_X),
                                   columns=["probability_0", "probability_1"])
        adjusted = pandas.concat((adjusted, adjusted_proba), axis=1)
    store_csv(adjusted, name + ".csv")
Exemplo n.º 3
0
		classifier.pmml_feature_importances_ = classifier.feature_importances_
	else:
		pass
	if isinstance(classifier, XGBClassifier):
		pipeline.verify(audit_X.sample(frac = 0.05, random_state = 13), predict_params = predict_params, predict_proba_params = predict_proba_params, precision = 1e-5, zeroThreshold = 1e-5)
	else:
		pipeline.verify(audit_X.sample(frac = 0.05, random_state = 13), predict_params = predict_params, predict_proba_params = predict_proba_params)
	store_pkl(pipeline, name)
	adjusted = DataFrame(pipeline.predict(audit_X, **predict_params), columns = ["Adjusted"])
	if with_proba == True:
		adjusted_proba = DataFrame(pipeline.predict_proba(audit_X, **predict_proba_params), columns = ["probability(0)", "probability(1)"])
		adjusted = pandas.concat((adjusted, adjusted_proba), axis = 1)
	store_csv(adjusted, name)

if "Audit" in datasets:
	build_audit(EstimatorProxy(DecisionTreeClassifier(min_samples_leaf = 2, random_state = 13)), "DecisionTreeAudit", compact = False)
	build_audit(BaggingClassifier(DecisionTreeClassifier(random_state = 13, min_samples_leaf = 5), n_estimators = 3, max_features = 0.5, random_state = 13), "DecisionTreeEnsembleAudit")
	build_audit(DummyClassifier(strategy = "most_frequent"), "DummyAudit")
	build_audit(EstimatorProxy(ExtraTreesClassifier(n_estimators = 10, min_samples_leaf = 5, random_state = 13)), "ExtraTreesAudit")
	build_audit(GBDTLRClassifier(RandomForestClassifier(n_estimators = 17, random_state = 13), LogisticRegression()), "GBDTLRAudit")
	build_audit(GBDTLRClassifier(XGBClassifier(n_estimators = 17, random_state = 13), LogisticRegression()), "XGBLRAudit")
	build_audit(GBDTLRClassifier(XGBRFClassifier(n_estimators = 7, max_depth = 6, random_state = 13), SGDClassifier(loss = "log", penalty = "elasticnet", random_state = 13)), "XGBRFLRAudit")
	build_audit(EstimatorProxy(GradientBoostingClassifier(loss = "exponential", init = None, random_state = 13)), "GradientBoostingAudit")
	build_audit(HistGradientBoostingClassifier(max_iter = 71, random_state = 13), "HistGradientBoostingAudit")
	build_audit(LGBMClassifier(objective = "binary", n_estimators = 37), "LGBMAudit", predict_params = {"num_iteration" : 17}, predict_proba_params = {"num_iteration" : 17}, num_iteration = 17)
	build_audit(LinearDiscriminantAnalysis(solver = "lsqr"), "LinearDiscriminantAnalysisAudit")
	build_audit(LinearSVC(penalty = "l1", dual = False, random_state = 13), "LinearSVCAudit", with_proba = False)
	build_audit(LogisticRegression(multi_class = "multinomial", solver = "newton-cg", max_iter = 500), "MultinomialLogisticRegressionAudit")
	build_audit(LogisticRegressionCV(cv = 3, multi_class = "ovr"), "OvRLogisticRegressionAudit")
	build_audit(BaggingClassifier(LogisticRegression(), n_estimators = 3, max_features = 0.5, random_state = 13), "LogisticRegressionEnsembleAudit")
	build_audit(GaussianNB(), "NaiveBayesAudit")
Exemplo n.º 4
0
def build_audit(classifier, name, with_proba = True, **kwargs):
	continuous_mapper = DataFrameMapper([
		("Age", ContinuousDomain()),
		("Income", ContinuousDomain()),
		("Hours", ContinuousDomain())
	])
	categorical_mapper = DataFrameMapper([
		("Employment", [CategoricalDomain(), LabelBinarizer(), SelectorProxy(SelectFromModel(EstimatorProxy(DecisionTreeClassifier(random_state = 13))))]),
		("Education", [CategoricalDomain(), LabelBinarizer(), SelectorProxy(SelectFromModel(EstimatorProxy(RandomForestClassifier(random_state = 13, n_estimators = 3)), threshold = "1.25 * mean"))]),
		("Marital", [CategoricalDomain(), LabelBinarizer(neg_label = -1, pos_label = 1), SelectKBest(k = 3)]),
		("Occupation", [CategoricalDomain(), LabelBinarizer(), SelectorProxy(SelectKBest(k = 3))]),
		("Gender", [CategoricalDomain(), LabelBinarizer(neg_label = -3, pos_label = 3)]),
		("Deductions", [CategoricalDomain(), LabelEncoder()]),
	])
	pipeline = PMMLPipeline([
		("union", FeatureUnion([
			("continuous", continuous_mapper),
			("categorical", Pipeline([
				("mapper", categorical_mapper),
				("polynomial", PolynomialFeatures())
			]))
		])),
		("classifier", classifier)
	])
	pipeline.fit(audit_X, audit_y)
	customize(classifier, **kwargs)
	store_pkl(pipeline, name + ".pkl")
	adjusted = DataFrame(pipeline.predict(audit_X), columns = ["Adjusted"])
	if(with_proba == True):
		adjusted_proba = DataFrame(pipeline.predict_proba(audit_X), columns = ["probability(0)", "probability(1)"])
		adjusted = pandas.concat((adjusted, adjusted_proba), axis = 1)
	store_csv(adjusted, name + ".csv")