Example #1
def build_audit(classifier, name, **pmml_options):
	if isinstance(classifier, LGBMClassifier):
		# LightGBM consumes label-encoded categorical features directly
		cat_columns = ["Age", "Employment", "Education", "Marital", "Occupation", "Gender", "Deductions"]
		cont_columns = ["Income", "Hours"]
		cat_mappings = [([cat_column], [cat_domain(name), label_encoder(name)]) for cat_column in cat_columns]
	else:
		# other classifiers expect binarized (one-hot encoded) categorical features
		cat_columns = ["Employment", "Education", "Marital", "Occupation", "Gender", "Deductions"]
		cont_columns = ["Age", "Income", "Hours"]
		cat_mappings = [([cat_column], [cat_domain(name), label_binarizer(name)]) for cat_column in cat_columns]
	cont_mappings = [([cont_column], cont_domain(name)) for cont_column in cont_columns]
	mapper = DataFrameMapper(cat_mappings + cont_mappings)
	pipeline = PMMLPipeline([
		("mapper", mapper),
		("classifier", classifier)
	])
	if isinstance(classifier, LGBMClassifier):
		# flag the label-encoded columns as categorical by index
		pipeline.fit(audit_X, audit_y, classifier__categorical_feature = [0, 1, 2, 3, 4, 5])
	else:
		if isinstance(classifier, XGBClassifier) and name == "XGBoostAuditNA":
			audit_X["Age"] = audit_X["Age"].astype(float)
		pipeline.fit(audit_X, audit_y)
	if isinstance(classifier, XGBClassifier):
		pipeline.verify(audit_X.sample(n = 3, random_state = 13), precision = 1e-5, zeroThreshold = 1e-5)
	else:
		pipeline.verify(audit_X.sample(n = 3, random_state = 13))
	pipeline.configure(**pmml_options)
	store_pmml(pipeline, name)
	adjusted = DataFrame(pipeline.predict(audit_X), columns = ["Adjusted"])
	adjusted_proba = DataFrame(pipeline.predict_proba(audit_X), columns = ["probability(0)", "probability(1)"])
	store_csv(pandas.concat((adjusted, adjusted_proba), axis = 1), name)
Example #2
def build_sentiment(classifier, name, with_proba=True, **pmml_options):
    pipeline = PMMLPipeline([
        ("tf-idf",
         TfidfVectorizer(
             analyzer="word",
             preprocessor=None,
             strip_accents=None,
             lowercase=True,
             token_pattern=None,
             tokenizer=Splitter(),
             stop_words="english",
             ngram_range=(1, 2),
             norm=None,
             dtype=(numpy.float32 if isinstance(
                 classifier, RandomForestClassifier) else numpy.float64))),
        ("selector", SelectKBest(f_classif, k=500)), ("classifier", classifier)
    ])
    pipeline.fit(sentiment_X, sentiment_y)
    pipeline.configure(**pmml_options)
    store_pkl(pipeline, name + ".pkl")
    score = DataFrame(pipeline.predict(sentiment_X), columns=["Score"])
    if with_proba:
        score_proba = DataFrame(pipeline.predict_proba(sentiment_X),
                                columns=["probability(0)", "probability(1)"])
        score = pandas.concat((score, score_proba), axis=1)
    store_csv(score, name + ".csv")
Example #3
def build_iris(classifier, name, **pmml_options):
    cont_columns = [
        "Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width"
    ]
    cont_mappings = [([cont_column], ContinuousDomain())
                     for cont_column in cont_columns]
    mapper = DataFrameMapper(cont_mappings)
    pipeline = PMMLPipeline([("mapper", mapper), ("classifier", classifier)])
    pipeline.fit(iris_X, iris_y)
    if isinstance(classifier, XGBClassifier):
        pipeline.verify(iris_X.sample(n=3, random_state=13),
                        precision=1e-5,
                        zeroThreshold=1e-5)
    else:
        pipeline.verify(iris_X.sample(n=3, random_state=13))
    pipeline.configure(**pmml_options)
    store_pmml(pipeline, name)
    species = DataFrame(pipeline.predict(iris_X), columns=["Species"])
    species_proba = DataFrame(pipeline.predict_proba(iris_X),
                              columns=[
                                  "probability(setosa)",
                                  "probability(versicolor)",
                                  "probability(virginica)"
                              ])
    store_csv(pandas.concat((species, species_proba), axis=1), name)
Example #4
def submodel_evaluation(train_data, valid_data, model_list,
                        category_feature, numeric_feature):
    X_train = train_data[category_feature + numeric_feature]
    y_train = train_data['user_type']
    X_valid = valid_data[category_feature + numeric_feature]
    y_valid = valid_data['user_type']

    pipeline_transformer = feature_union(category_feature, numeric_feature)
    model_result_dict = {}
    for model in model_list:
        model_name = model.__class__.__name__
        print('model %s evaluation' % model_name)

        sub_model = PMMLPipeline([
            ('mapper', pipeline_transformer),
            ('classifier', model)
        ])
        sub_model.fit(X_train, y_train)
        predict_valid = sub_model.predict_proba(X_valid)[:, 1]
        predict_label = sub_model.predict(X_valid)
        model_ks = plot_ks_curve(predict_valid, y_valid)
        model_auc = roc_auc_score(y_valid, predict_valid)
        accuracy = metrics.accuracy_score(y_valid, predict_label)
        model_result_dict[model_name] = [model_ks, model_auc, accuracy]
    return model_result_dict
Example #5
def build_iris_vec(classifier, name):
	pipeline = PMMLPipeline([
		("classifier", classifier)
	])
	pipeline.fit(iris_X, iris_y)
	store_pmml(pipeline, name)
	species = DataFrame(pipeline.predict(iris_X), columns = ["Species"])
	species_proba = DataFrame(pipeline.predict_proba(iris_X), columns = ["probability(setosa)", "probability(versicolor)", "probability(virginica)"])
	store_csv(pandas.concat((species, species_proba), axis = 1), name)
Example #6
def build_audit(mapper, classifier, name, **pmml_options):
    pipeline = PMMLPipeline([("mapper", mapper), ("classifier", classifier)])
    pipeline.fit(audit_X, audit_y)
    pipeline.configure(**pmml_options)
    store_pkl(pipeline, name)
    adjusted = DataFrame(pipeline.predict(audit_X), columns=["Adjusted"])
    adjusted_proba = DataFrame(pipeline.predict_proba(audit_X),
                               columns=["probability(0)", "probability(1)"])
    adjusted = pandas.concat((adjusted, adjusted_proba), axis=1)
    store_csv(adjusted, name)
Example #7
def build_apollo(mapper, name):
    pipeline = PMMLPipeline([("mapper", mapper),
                             ("classifier", DecisionTreeClassifier())])
    pipeline.fit(df, df["success"])
    store_pkl(pipeline, name)
    success = DataFrame(pipeline.predict(df), columns=["success"])
    success_proba = DataFrame(
        pipeline.predict_proba(df),
        columns=["probability(false)", "probability(true)"])
    success = pandas.concat((success, success_proba), axis=1)
    store_csv(success, name)
Example #8
	def test_predict_proba_transform(self):
		predict_proba_transformer = FunctionTransformer(numpy.log)
		pipeline = PMMLPipeline([("estimator", DummyClassifier(strategy = "prior"))], predict_proba_transformer = predict_proba_transformer)
		X = DataFrame([1.0, 1.0, 1.0, 1.0, 1.0, 1.0], columns = ["x"])
		y = Series(["green", "red", "yellow", "green", "red", "green"], name = "y")
		pipeline.fit(X, y)
		self.assertEqual(["green", "red", "yellow"], pipeline._final_estimator.classes_.tolist())
		y_proba = [3 / 6.0, 2 / 6.0, 1 / 6.0]
		y_probat = [numpy.log(x) for x in y_proba]
		self.assertEqual([y_proba for i in range(0, 6)], pipeline.predict_proba(X).tolist())
		self.assertEqual([y_proba + y_probat for i in range(0, 6)], pipeline.predict_proba_transform(X).tolist())
Example #9
def build_audit_dict(classifier, name, with_proba = True):
	pipeline = PMMLPipeline([
		("dict-transformer", DictVectorizer()),
		("classifier", classifier)
	])
	pipeline.fit(audit_dict_X, audit_y)
	store_pkl(pipeline, name)
	adjusted = DataFrame(pipeline.predict(audit_dict_X), columns = ["Adjusted"])
	if with_proba:
		adjusted_proba = DataFrame(pipeline.predict_proba(audit_dict_X), columns = ["probability(0)", "probability(1)"])
		adjusted = pandas.concat((adjusted, adjusted_proba), axis = 1)
	store_csv(adjusted, name)
Example #10
def build_iris_opt(classifier, name, fit_params = {}, **pmml_options):
	pipeline = PMMLPipeline([
		("classifier", classifier)
	])
	pipeline.fit(iris_X[iris_train_mask], iris_y[iris_train_mask], **fit_params)
	if isinstance(classifier, XGBClassifier):
		pipeline.verify(iris_X.sample(frac = 0.10, random_state = 13), precision = 1e-5, zeroThreshold = 1e-5)
	else:
		pipeline.verify(iris_X.sample(frac = 0.10, random_state = 13))
	store_pkl(pipeline, name)
	species = DataFrame(pipeline.predict(iris_X), columns = ["Species"])
	species_proba = DataFrame(pipeline.predict_proba(iris_X), columns = ["probability(setosa)", "probability(versicolor)", "probability(virginica)"])
	species = pandas.concat((species, species_proba), axis = 1)
	store_csv(species, name)
Example #11
def build_sentiment(classifier, transformer, name, with_proba = True, **pmml_options):
	pipeline = PMMLPipeline([
		("transformer", transformer),
		("densifier", DenseTransformer()),
		("selector", SelectKBest(f_classif, k = 500)),
		("classifier", classifier)
	])
	pipeline.fit(sentiment_X, sentiment_y)
	pipeline.configure(**pmml_options)
	store_pmml(pipeline, name)
	score = DataFrame(pipeline.predict(sentiment_X), columns = ["Score"])
	if with_proba:
		score_proba = DataFrame(pipeline.predict_proba(sentiment_X), columns = ["probability(0)", "probability(1)"])
		score = pandas.concat((score, score_proba), axis = 1)
	store_csv(score, name)
Example #12
def build_versicolor_direct(classifier, name, with_proba=True, **pmml_options):
    transformer = ColumnTransformer(
        [("all", "passthrough", ["Petal.Length", "Petal.Width"])],
        remainder="drop")
    pipeline = PMMLPipeline([("transformer", transformer),
                             ("classifier", classifier)])
    pipeline.fit(versicolor_X, versicolor_y)
    pipeline.configure(**pmml_options)
    pipeline.verify(versicolor_X.sample(frac=0.10, random_state=13))
    store_pkl(pipeline, name)
    species = DataFrame(pipeline.predict(versicolor_X), columns=["Species"])
    if with_proba:
        species_proba = DataFrame(pipeline.predict_proba(versicolor_X),
                                  columns=["probability(0)", "probability(1)"])
        species = pandas.concat((species, species_proba), axis=1)
    store_csv(species, name)
Example #13
def build_audit_na_direct(classifier, name):
	mapper = DataFrameMapper([
		(["Age", "Hours", "Income"], None),
		(["Employment", "Education", "Marital", "Occupation", "Gender"], OneHotEncoder())
	])
	pipeline = PMMLPipeline([
		("mapper", mapper),
		("classifier", classifier)
	])
	pipeline.fit(audit_na_X, audit_na_y)
	pipeline.verify(audit_na_X.sample(frac = 0.05, random_state = 13), precision = 1e-5, zeroThreshold = 1e-5)
	store_pkl(pipeline, name)
	adjusted = DataFrame(pipeline.predict(audit_na_X), columns = ["Adjusted"])
	adjusted_proba = DataFrame(pipeline.predict_proba(audit_na_X), columns = ["probability(0)", "probability(1)"])
	adjusted = pandas.concat((adjusted, adjusted_proba), axis = 1)
	store_csv(adjusted, name)
Example #14
def build_audit_na_hist(classifier, name):
	mapper = DataFrameMapper(
		[([column], ContinuousDomain()) for column in ["Age", "Hours", "Income"]] +
		[([column], [CategoricalDomain(), PMMLLabelBinarizer()]) for column in ["Employment", "Education", "Marital", "Occupation", "Gender"]]
	)
	pipeline = PMMLPipeline([
		("mapper", mapper),
		("classifier", classifier)
	])
	pipeline.fit(audit_na_X, audit_na_y)
	pipeline.verify(audit_na_X.sample(frac = 0.05, random_state = 13))
	store_pkl(pipeline, name)
	adjusted = DataFrame(pipeline.predict(audit_na_X), columns = ["Adjusted"])
	adjusted_proba = DataFrame(pipeline.predict_proba(audit_na_X), columns = ["probability(0)", "probability(1)"])
	adjusted = pandas.concat((adjusted, adjusted_proba), axis = 1)
	store_csv(adjusted, name)
Example #15
def train_model(X: DataFrame, y: Series, attr_preprocs, pmml_fname, n_estimators):
    pipeline = PMMLPipeline([
        ("attribute_preprocessor", DataFrameMapper(attr_preprocs)),
        ('classifier', XGBClassifier(n_gpus=0, objective="binary:logistic", n_jobs=30, max_depth=2,
                                     n_estimators=n_estimators, colsample_bytree=0.5, colsample_bylevel=0.5,
                                     colsample_bynode=0.5, subsample=0.5, reg_alpha=0.8, reg_lambda=2, missing=-99998))
    ])

    pipeline.fit(X, y)
    prob = pipeline.predict_proba(X)
    fpr, tpr, thresholds = metrics.roc_curve(y, prob[:, 1], pos_label=1)
    auc = metrics.auc(fpr, tpr)
    ks = np.max(tpr - fpr)

    sklearn2pmml(pipeline, pmml_fname, with_repr=True)

    return pipeline, auc, ks
Example #16
def build_audit_dict(classifier, name, with_proba = True):
	header = {
		"copyright" : "Copyright (c) 2021 Villu Ruusmann",
		"description" : "Integration test for dictionary (key-value mappings) input",
		"modelVersion" : "1.0.0"
	}
	pipeline = PMMLPipeline([
		("dict-transformer", DictVectorizer()),
		("classifier", classifier)
	], header = header)
	pipeline.fit(audit_dict_X, audit_y)
	store_pkl(pipeline, name)
	adjusted = DataFrame(pipeline.predict(audit_dict_X), columns = ["Adjusted"])
	if with_proba:
		adjusted_proba = DataFrame(pipeline.predict_proba(audit_dict_X), columns = ["probability(0)", "probability(1)"])
		adjusted = pandas.concat((adjusted, adjusted_proba), axis = 1)
	store_csv(adjusted, name)
Example #17
def build_sentiment(classifier, tokenizer, name, with_proba = True, **pmml_options):
	pipeline = PMMLPipeline([
		("union", FeatureUnion([
			("tf-idf", TfidfVectorizer(analyzer = "word", preprocessor = None, strip_accents = None, lowercase = True, tokenizer = tokenizer, stop_words = "english", ngram_range = (1, 2), norm = None, sublinear_tf = isinstance(classifier, LogisticRegressionCV), dtype = (numpy.float32 if isinstance(classifier, RandomForestClassifier) else numpy.float64))),
			("count", WordCountTransformer())
		])),
		("selector", SelectKBest(f_classif, k = 1000)),
		("classifier", classifier)
	])
	pipeline.fit(sentiment_X, sentiment_y)
	pipeline.configure(**pmml_options)
	store_pkl(pipeline, name)
	score = DataFrame(pipeline.predict(sentiment_X), columns = ["Score"])
	if with_proba:
		score_proba = DataFrame(pipeline.predict_proba(sentiment_X), columns = ["probability(0)", "probability(1)"])
		score = pandas.concat((score, score_proba), axis = 1)
	store_csv(score, name)
Example #18
def build_audit_na(classifier, name, with_proba = True, fit_params = {}, predict_params = {}, predict_proba_params = {}, predict_transformer = None, predict_proba_transformer = None, apply_transformer = None, **pmml_options):
	employment_mapping = {
		"CONSULTANT" : "PRIVATE",
		"PSFEDERAL" : "PUBLIC",
		"PSLOCAL" : "PUBLIC",
		"PSSTATE" : "PUBLIC",
		"SELFEMP" : "PRIVATE",
		"PRIVATE" : "PRIVATE"
	}
	gender_mapping = {
		"FEMALE" : 0.0,
		"MALE" : 1.0,
		"MISSING_VALUE" : 0.5
	}
	mapper = DataFrameMapper(
		[(["Age"], [ContinuousDomain(missing_values = None, with_data = False), Alias(ExpressionTransformer("X[0] if pandas.notnull(X[0]) else -999", dtype = int), name = "flag_missing(Age, -999)"), SimpleImputer(missing_values = -999, strategy = "constant", fill_value = 38)])] +
		[(["Age"], MissingIndicator())] +
		[(["Hours"], [ContinuousDomain(missing_values = None, with_data = False), Alias(ExpressionTransformer("-999 if pandas.isnull(X[0]) else X[0]"), name = "flag_missing(Hours, -999)"), SimpleImputer(missing_values = -999, add_indicator = True)])] +
		[(["Income"], [ContinuousDomain(missing_values = None, outlier_treatment = "as_missing_values", low_value = 5000, high_value = 200000, with_data = False), SimpleImputer(strategy = "median", add_indicator = True)])] +
		[(["Employment"], [CategoricalDomain(missing_values = None, with_data = False), CategoricalImputer(missing_values = None), StringNormalizer(function = "uppercase"), LookupTransformer(employment_mapping, "OTHER"), StringNormalizer(function = "lowercase"), PMMLLabelBinarizer()])] +
		[([column], [CategoricalDomain(missing_values = None, missing_value_replacement = "N/A", with_data = False), SimpleImputer(missing_values = "N/A", strategy = "most_frequent"), StringNormalizer(function = "lowercase"), PMMLLabelBinarizer()]) for column in ["Education", "Marital", "Occupation"]] +
		[(["Gender"], [CategoricalDomain(missing_values = None, with_data = False), SimpleImputer(strategy = "constant"), StringNormalizer(function = "uppercase"), LookupTransformer(gender_mapping, None)])]
	)
	pipeline = PMMLPipeline([
		("mapper", mapper),
		("classifier", classifier)
	], predict_transformer = predict_transformer, predict_proba_transformer = predict_proba_transformer, apply_transformer = apply_transformer)
	pipeline.fit(audit_na_X, audit_na_y, **fit_params)
	pipeline.configure(**pmml_options)
	if isinstance(classifier, XGBClassifier):
		pipeline.verify(audit_na_X.sample(frac = 0.05, random_state = 13), predict_params = predict_params, predict_proba_params = predict_proba_params, precision = 1e-5, zeroThreshold = 1e-5)
	else:
		pipeline.verify(audit_na_X.sample(frac = 0.05, random_state = 13), predict_params = predict_params, predict_proba_params = predict_proba_params)
	store_pkl(pipeline, name)
	adjusted = DataFrame(pipeline.predict(audit_na_X, **predict_params), columns = ["Adjusted"])
	if with_proba:
		adjusted_proba = DataFrame(pipeline.predict_proba(audit_na_X, **predict_proba_params), columns = ["probability(0)", "probability(1)"])
		adjusted = pandas.concat((adjusted, adjusted_proba), axis = 1)
	if isinstance(classifier, DecisionTreeClassifier):
		Xt = pipeline_transform(pipeline, audit_na_X)
		adjusted_apply = DataFrame(classifier.apply(Xt), columns = ["nodeId"])
		adjusted = pandas.concat((adjusted, adjusted_apply), axis = 1)
	store_csv(adjusted, name)
Example #19
def build_audit_na(classifier, name, with_proba = True, predict_proba_transformer = None, apply_transformer = None, **pmml_options):
	employment_mapping = {
		"CONSULTANT" : "PRIVATE",
		"PSFEDERAL" : "PUBLIC",
		"PSLOCAL" : "PUBLIC",
		"PSSTATE" : "PUBLIC",
		"SELFEMP" : "PRIVATE",
		"PRIVATE" : "PRIVATE"
	}
	gender_mapping = {
		"FEMALE" : 0,
		"MALE" : 1
	}
	mapper = DataFrameMapper(
		[(["Age"], [ContinuousDomain(missing_values = None, with_data = False), Alias(ExpressionTransformer("X[0] if pandas.notnull(X[0]) else -999"), name = "flag_missing(Age, -999)"), Imputer(missing_values = -999)])] +
		[(["Hours"], [ContinuousDomain(missing_values = None, with_data = False), Alias(ExpressionTransformer("-999 if pandas.isnull(X[0]) else X[0]"), name = "flag_missing(Hours, -999)"), Imputer(missing_values = -999)])] +
		[(["Income"], [ContinuousDomain(missing_values = None, outlier_treatment = "as_missing_values", low_value = 5000, high_value = 200000, with_data = False), Imputer()])] +
		[(["Employment"], [CategoricalDomain(missing_values = None, with_data = False), CategoricalImputer(), StringNormalizer(function = "uppercase"), LookupTransformer(employment_mapping, "OTHER"), StringNormalizer(function = "lowercase"), PMMLLabelBinarizer()])] +
		[([column], [CategoricalDomain(missing_values = None, with_data = False), CategoricalImputer(missing_values = None), StringNormalizer(function = "lowercase"), PMMLLabelBinarizer()]) for column in ["Education", "Marital", "Occupation"]] +
		[(["Gender"], [CategoricalDomain(missing_values = None, with_data = False), CategoricalImputer(), StringNormalizer(function = "uppercase"), LookupTransformer(gender_mapping, None)])]
	)
	pipeline = PMMLPipeline([
		("mapper", mapper),
		("classifier", classifier)
	], predict_proba_transformer = predict_proba_transformer, apply_transformer = apply_transformer)
	pipeline.fit(audit_na_X, audit_na_y)
	pipeline.configure(**pmml_options)
	store_pkl(pipeline, name + ".pkl")
	adjusted = DataFrame(pipeline.predict(audit_na_X), columns = ["Adjusted"])
	if with_proba:
		adjusted_proba = DataFrame(pipeline.predict_proba(audit_na_X), columns = ["probability(0)", "probability(1)"])
		adjusted = pandas.concat((adjusted, adjusted_proba), axis = 1)
	if isinstance(classifier, DecisionTreeClassifier):
		Xt = pipeline_transform(pipeline, audit_na_X)
		adjusted_apply = DataFrame(classifier.apply(Xt), columns = ["nodeId"])
		adjusted = pandas.concat((adjusted, adjusted_apply), axis = 1)
	store_csv(adjusted, name + ".csv")
Example #20
if "Iris" in datasets:
	pipeline = PMMLPipeline([
		("mapper", DataFrameMapper([
			(iris_X.columns.values, ContinuousDomain())
		])),
		("classifier", SelectFirstClassifier([
			("select", Pipeline([
				("classifier", DecisionTreeClassifier(random_state = 13))
			]), "X[1] <= 3"),
			("default", Pipeline([
				("scaler", StandardScaler()),
				("classifier", LogisticRegression(multi_class = "ovr", solver = "liblinear"))
			]), str(True))
		]))
	])
	pipeline.fit(iris_X, iris_y)
	store_pkl(pipeline, "SelectFirstIris")
	species = DataFrame(pipeline.predict(iris_X), columns = ["Species"])
	species_proba = DataFrame(pipeline.predict_proba(iris_X), columns = ["probability(setosa)", "probability(versicolor)", "probability(virginica)"])
	species = pandas.concat((species, species_proba), axis = 1)
	store_csv(species, "SelectFirstIris")

if "Iris" in datasets:
	classifier = RuleSetClassifier([
		("X['Petal.Length'] >= 2.45 and X['Petal.Width'] < 1.75", "versicolor"),
		("X['Petal.Length'] >= 2.45", "virginica")
	], default_score = "setosa")
	pipeline = PMMLPipeline([
		("classifier", classifier)
	])
	pipeline.fit(iris_X, iris_y)
	pipeline.verify(iris_X.sample(frac = 0.10, random_state = 13))
	store_pkl(pipeline, "RuleSetIris")
	species = DataFrame(pipeline.predict(iris_X), columns = ["Species"])
	store_csv(species, "RuleSetIris")
Example #21
import pandas as pd

from lightgbm import LGBMClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn2pmml import sklearn2pmml
from sklearn2pmml.decoration import CategoricalDomain, ContinuousDomain
from sklearn2pmml.pipeline import PMMLPipeline
from sklearn2pmml.preprocessing import PMMLLabelEncoder
from sklearn2pmml.feature_extraction.text import Splitter
from sklearn_pandas import DataFrameMapper

binary = False

data = pd.read_csv("test/support/mpg.csv")
if binary:
    data["drv"] = data["drv"].replace("r", "4")

numeric_features = ["displ", "year", "cyl"]
categorical_features = ["class"]
text_features = []

mapper = DataFrameMapper(
    [(numeric_features, [ContinuousDomain()])] +
    [([f], [CategoricalDomain(), PMMLLabelEncoder()])
     for f in categorical_features] +
    [(f, [CategoricalDomain(),
          CountVectorizer(tokenizer=Splitter())]) for f in text_features])

pipeline = PMMLPipeline([("mapper", mapper),
                         ("model", LGBMClassifier(n_estimators=1000))])
pipeline.fit(data, data["drv"], model__categorical_feature=[3])

suffix = "binary" if binary else "multiclass"
sklearn2pmml(pipeline, "test/support/python/lightgbm_" + suffix + ".pmml")

print(list(pipeline.predict(data[:10])))
print(list(pipeline.predict_proba(data[0:1])[0]))
Example #22
class AutoBuilder:
    """
    E2E classifier builder

    Builds a binary classifier, including:
        - dataset EDA (optional)
        - hyperparameter tuning (optional)
        - model performance assessment
        - SHAP-based feature analysis
        - feature selection
        - creating a deployment package (pmml & pkl)

    Attributes:
        auto_build (method): automatically builds the model and populates the
            output_dir_path directory with model artifacts and evaluation charts

    """
    def __init__(
        self,
        output_dir_path,
        csv_path,
        target_col="target",
        ignore_cols=[],
        eda_flag=True,
        tune_flag=True,
        cardinality_threshold=100,
        shap_plot_num=10,
        shap_frac=0.05,
        importance_cutoff=0.00,
        corr_cutoff=0.9,
        search_space=LGB_SEARCH_SPACE,
        tuning_iters=25,
        lgb_params={},
        random_state=1234,
    ):
        """
        Args:
            output_dir_path (string):  filepath where outputs package is created and saved
            csv_path (string): filepath to input csv, NOTE need to preprocess columns to be numeric or string type
            target_col (string, optional): target column, default 'target'
            ignore_cols (iterable, optional): columns to be dropped, default []
            eda_flag (boolean, optional): EDA plots to be generated, default True
            tune_flag (boolean, optional): Lightgbm hyperparameters to be tuned, default True
            shap_plot_num (numeric, optional): Generate SHAP dependency plots for the N most important features, default 10
            shap_frac (numeric, optional): Proportion of data sampled for SHAP analysis, default 0.05
            importance_cutoff (numeric, optional): Abs. avg. SHAP value below which dropping a feature is suggested, default 0.00
            corr_cutoff (numeric, optional): Abs. correlation above which dropping a feature is suggested, default 0.9
            search_space (iterable, optional): Tuning space for Bayesian optimisation, default is LGB_SEARCH_SPACE
            tuning_iters (numeric, optional): Number of tuning iterations for Bayesian optimisation, default is 25
            lgb_params (dict, optional): Hyperparams to use when tune_flag = False, default {}
            random_state (numeric, optional): Random seed for the train/test split and model training, default is 1234
        """
        self.output_dir_path = output_dir_path
        self.csv_path = csv_path
        self.target_col = target_col
        self.ignore_cols = ignore_cols
        self.eda_flag = eda_flag
        self.tune_flag = tune_flag
        self.cardinality_threshold = cardinality_threshold
        self.shap_plot_num = shap_plot_num
        self.shap_frac = shap_frac
        self.importance_cutoff = importance_cutoff
        self.corr_cutoff = corr_cutoff
        self.search_space = search_space
        self.tuning_iters = tuning_iters
        self.lgb_params = lgb_params
        self.random_state = random_state

    def _gen_model_dir(self):
        """
        Creates the output directory at self.output_dir_path, removing any previous output at that location.

        Also makes subdirectories
            /bin
            /plots
        """
        logger.info(f"building directory {self.csv_path}")
        if os.path.exists(self.output_dir_path) and os.path.isdir(
                self.output_dir_path):
            shutil.rmtree(self.output_dir_path)
        os.mkdir(self.output_dir_path)
        os.mkdir(self.output_dir_path + "/bin")
        os.mkdir(self.output_dir_path + "/plots")

    def _process_csv(self):
        """
        Parses csv specified in self.csv_path, saving to self.raw

        Also
            - Drops ignore columns
            - Validates target and feature columns
                Target = binary, 0-1
                Features = numeric or string
        """
        logger.info(f"loading file {self.csv_path}")
        raw = pd.read_csv(self.csv_path).drop(columns=self.ignore_cols)

        logger.info("checking valid input data")
        assert raw[self.target_col].isna().sum() == 0

        assert list(sorted(raw[self.target_col].unique())) == [0, 1]

        valid_shape = raw.select_dtypes(
            include=["int64", "float64", "object"]).shape
        assert valid_shape == raw.shape
        self.raw = raw
        raw.to_csv(f"{self.output_dir_path}/bin/raw.csv")

    def _prepare_X_y(self):
        """
        Splits self.raw into X_train, y_train, X_test, and y_test

        Also saves a csv of the training set
        """

        y = self.raw[self.target_col]
        X = self.raw.drop(columns=self.target_col)

        logger.info("train test split")
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            X, y, test_size=0.20, random_state=self.random_state)
        data_train = self.X_train.copy()
        data_train["target"] = self.y_train

        training_data_path = f"{self.output_dir_path}/bin/train.csv"
        data_train.to_csv(training_data_path, index=False)

        del X, y

    def _create_categorical_transformer(self):
        self.categorical_cols = self.X_train.select_dtypes(
            include=["object"]).columns
        self.numeric_cols = self.X_train.select_dtypes(
            include=["int64", "float64"]).columns

        self.mapper = DataFrameMapper(
            [([cat_column],
              [CategoricalDomain(), LabelEncoder()])
             for cat_column in self.categorical_cols] +
            [(self.numeric_cols, ContinuousDomain())])

        # hacky: also store a separate X_train_encoded and classifier, because SHAP and skopt could not be made to work with the e2e pipeline
        self.X_train_encoded = self.mapper.fit_transform(self.X_train)
        # the mapper emits the categorical columns first, then the numeric block
        self.var_names = list(self.categorical_cols) + list(self.numeric_cols)

    def _tune(self):
        """
        Explores tuning space, updating self.lgb_params with values that minimize cross-validated brier score
        """
        # todo, can I save memory, code and possibly tune binning strats by passing unencoded X_train into pipeline?
        logger.info(f"tuning {self.tuning_iters}")
        results = utils.bayes_hyperparam_tune(
            model=lgb.LGBMClassifier(objective="binary"),
            X=self.X_train_encoded,
            y=self.y_train,
            search_space=self.search_space,
            n_iters=self.tuning_iters,
        )
        self.lgb_params = results.best_params_
        logger.info(f"best params {self.lgb_params}")

    def _save_model(self):
        """
        Saves the sklearn pipeline as a pkl file (pmml export is currently commented out) under self.output_dir_path
        """
        pmml_path = f"{self.output_dir_path}/model-pmml.pmml"
        pkl_path = f"{self.output_dir_path}/model-bin.pkl"
        with open(pkl_path, "wb") as pkl_file:
            pickle.dump(self.pipeline, pkl_file)
        # sklearn2pmml(self.pipeline, pmml_path)

    def _generate_shap_plots(self):
        classifier = lgb.LGBMClassifier(**self.lgb_params)
        classifier.fit(self.X_train_encoded, self.y_train)
        X_shap = pd.DataFrame(data=self.X_train_encoded,
                              columns=self.var_names)
        self.feature_importance = utils.create_shap_plots(
            classifier,
            X_shap,
            output_dir=self.output_dir_path,
            N=self.shap_plot_num,
            frac=self.shap_frac,
        )

    def auto_build(self):
        """
        Populates the output_dir_path directory with model artifacts and evaluation charts
        """
        self._gen_model_dir()

        self._process_csv()

        self._prepare_X_y()

        if self.eda_flag:
            logger.info("EDA")
            utils.dataset_eda(data=self.X_train,
                              output_dir=self.output_dir_path)

        self._create_categorical_transformer()

        if self.tune_flag:
            self._tune()

        self._generate_shap_plots()

        logger.info("creating pipeline")
        classifier = lgb.LGBMClassifier(**self.lgb_params)
        self.pipeline = PMMLPipeline([("mapper", self.mapper),
                                      ("classifier", classifier)])

        self.pipeline.fit(self.X_train, self.y_train)

        logger.info("Assessing model")

        y_pred = self.pipeline.predict_proba(self.X_test)[:, 1]
        y_bm = np.repeat(self.y_train.mean(), self.y_test.shape[0])
        utils.evaluate_model(self.y_test, y_pred, y_bm, self.output_dir_path,
                             "Model")

        logger.info("suggeting features to remove")
        self.cols_to_remove = utils.find_features_to_remove(
            importance=self.feature_importance,
            X=self.X_train,
            importance_cutoff=self.importance_cutoff,
            corr_threshold=self.corr_cutoff,
        )
        logger.info(f"candidates to remove - {self.cols_to_remove}")

        logger.info(f"saving model \n{self.output_dir_path}")

        self._save_model()
        test_input = dict(self.X_test.iloc[0])
        test_score = self.pipeline.predict_proba(self.X_test.head(1))
        logger.info(
            f"test-case model inputs \n{ test_input } \n model score \n {test_score}"
        )

        logger.info("done!")