def test_transform(self):
    """Check the per-row output of MatchesTransformer("ar?y") on month names.

    Only "January", "February" and "May" are expected to be flagged; the
    result is compared as a list of single-element rows.
    """
    month_names = [
        "January", "February", "March", "April", "May", "June",
        "July", "August", "September", "October", "November", "December",
    ]
    flagged = {"January", "February", "May"}

    months = numpy.asarray(month_names)
    matcher = MatchesTransformer("ar?y")

    # One single-element row per input value, in input order.
    expected_rows = [[month in flagged] for month in month_names]
    actual_rows = matcher.transform(months).tolist()

    self.assertEqual(expected_rows, actual_rows)
def test_transform(self):
    """Check MatchesTransformer("ar?y") on array input and DataFrame input.

    The same twelve month names are transformed twice: first as a
    one-dimensional numpy array, then as a single-column DataFrame named
    "month". Both results must equal the same flat list of booleans, in
    which only "January", "February" and "May" are True.
    """
    X = numpy.asarray([
        "January", "February", "March", "April", "May", "June",
        "July", "August", "September", "October", "November", "December",
    ])
    Xt_exp = [
        True, True, False, False, True, False,
        False, False, False, False, False, False,
    ]

    transformer = MatchesTransformer("ar?y")
    self.assertEqual(Xt_exp, transformer.transform(X).tolist())

    X = DataFrame(X.reshape(-1, 1), columns=["month"])
    Xt = transformer.transform(X)
    # Use assertEqual for the shape check: assertTrue would take the second
    # argument as the failure message and pass for any non-empty tuple.
    # The shape is one-dimensional because the result must also equal the
    # flat Xt_exp list on the next line.
    # NOTE(review): if a (12, 1) column result is the intended contract,
    # both this shape and the flat-list comparison need to change together.
    self.assertEqual((12,), Xt.shape)
    self.assertEqual(Xt_exp, Xt.tolist())
def build_audit(classifier, name, with_proba = True, fit_params = None, predict_params = None, predict_proba_params = None, **pmml_options):
    """Fit, verify and store a PMML pipeline for the audit dataset.

    The pipeline is a FeatureUnion of a continuous mapper ("Age", "Income",
    "Hours") and a categorical mapper followed by PolynomialFeatures, with
    ``classifier`` as the final step. It is fitted on the module-level
    ``audit_X`` / ``audit_y``, wrapped with ``make_pmml_pipeline``,
    configured, verified on a 5% sample of ``audit_X``, and then passed to
    ``store_pkl``. Predictions for the full ``audit_X`` are passed to
    ``store_csv``.

    Parameters:
        classifier: Estimator used as the final "classifier" step.
        name: Identifier passed to ``store_pkl`` and ``store_csv``.
        with_proba: When truthy, the stored predictions also contain the
            "probability(0)" and "probability(1)" columns obtained from
            ``predict_proba``.
        fit_params: Keyword arguments forwarded to ``pipeline.fit``;
            ``None`` means no extra arguments.
        predict_params: Keyword arguments forwarded to ``pipeline.predict``
            and handed to ``pipeline.verify``; ``None`` means none.
        predict_proba_params: Keyword arguments forwarded to
            ``pipeline.predict_proba`` and handed to ``pipeline.verify``;
            ``None`` means none.
        **pmml_options: Forwarded to ``pipeline.configure``.
    """
    # None stands in for "no extra arguments" so that no dict object is
    # shared between calls through the function's default values.
    fit_params = {} if fit_params is None else fit_params
    predict_params = {} if predict_params is None else predict_params
    predict_proba_params = {} if predict_proba_params is None else predict_proba_params

    continuous_mapper = DataFrameMapper([
        (["Age", "Income", "Hours"], MultiDomain([ContinuousDomain() for _ in range(3)]))
    ])
    categorical_mapper = DataFrameMapper([
        (["Employment"], [CategoricalDomain(), SubstringTransformer(0, 3), OneHotEncoder(drop = ["Vol"]), SelectFromModel(DecisionTreeClassifier(random_state = 13))]),
        (["Education"], [CategoricalDomain(), ReplaceTransformer("[aeiou]", ""), OneHotEncoder(drop = "first"), SelectFromModel(RandomForestClassifier(n_estimators = 3, random_state = 13), threshold = "1.25 * mean")]),
        (["Marital"], [CategoricalDomain(), LabelBinarizer(neg_label = -1, pos_label = 1), SelectKBest(k = 3)]),
        (["Occupation"], [CategoricalDomain(), LabelBinarizer(), SelectKBest(k = 3)]),
        (["Gender"], [CategoricalDomain(), MatchesTransformer("^Male$"), CastTransformer(int)]),
        (["Deductions"], [CategoricalDomain()]),
    ])
    pipeline = Pipeline([
        ("union", FeatureUnion([
            ("continuous", continuous_mapper),
            ("categorical", Pipeline([
                ("mapper", categorical_mapper),
                ("polynomial", PolynomialFeatures())
            ]))
        ])),
        ("classifier", classifier)
    ])
    pipeline.fit(audit_X, audit_y, **fit_params)

    pipeline = make_pmml_pipeline(pipeline, audit_X.columns.values, audit_y.name)
    pipeline.configure(**pmml_options)

    # XGBClassifier pipelines are verified with explicit precision and
    # zeroThreshold values; every other classifier uses verify()'s defaults.
    verify_kwargs = {
        "predict_params": predict_params,
        "predict_proba_params": predict_proba_params,
    }
    if isinstance(classifier, XGBClassifier):
        verify_kwargs.update(precision = 1e-5, zeroThreshold = 1e-5)
    pipeline.verify(audit_X.sample(frac = 0.05, random_state = 13), **verify_kwargs)

    store_pkl(pipeline, name)

    adjusted = DataFrame(pipeline.predict(audit_X, **predict_params), columns = ["Adjusted"])
    if with_proba:
        adjusted_proba = DataFrame(pipeline.predict_proba(audit_X, **predict_proba_params), columns = ["probability(0)", "probability(1)"])
        adjusted = pandas.concat((adjusted, adjusted_proba), axis = 1)
    store_csv(adjusted, name)