from sklearn2pmml import sklearn2pmml, PMMLPipeline


def convert_sklearn_to_pmml(model, pmml, feature_names=None, target_name=None):
    pipeline = PMMLPipeline([("regressor", model)])
    if feature_names is not None:
        pipeline.active_fields = feature_names
    if target_name is not None:
        pipeline.target_field = target_name
    sklearn2pmml(pipeline, pmml, with_repr=True, debug=True)
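A minimal call-site sketch for the helper above. The estimator, file name, and column names are illustrative assumptions, not part of the original, and a working sklearn2pmml/Java setup is assumed:

from sklearn.datasets import make_regression
from sklearn.tree import DecisionTreeRegressor

# Fit a small regressor on synthetic data, then export it via the helper.
X, y = make_regression(n_samples=100, n_features=4, random_state=0)
reg = DecisionTreeRegressor(max_depth=3).fit(X, y)
convert_sklearn_to_pmml(reg, "out.pmml",
                        feature_names=["f0", "f1", "f2", "f3"],
                        target_name="target")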
import os

import joblib
import numpy as np
from sklearn.base import is_classifier


def save_model(model, feature_names, model_path, label_text="label"):
    p, extension = os.path.splitext(model_path)
    model.feature_names = feature_names
    pickle_path = p + ".pkl"

    if extension == ".pmml":
        try:
            from sklearn2pmml import sklearn2pmml, PMMLPipeline
        except ImportError:
            raise ImportError(
                "You need to install `sklearn2pmml` to store models in pmml format"
            )

        pipeline = PMMLPipeline([("model", model)])
        pipeline.target_field = label_text
        pipeline.active_fields = np.array(feature_names)
        sklearn2pmml(pipeline, model_path)

    elif extension == ".onnx":
        try:
            from skl2onnx import convert_sklearn
            from skl2onnx.common.data_types import FloatTensorType
            from skl2onnx.helpers.onnx_helper import select_model_inputs_outputs
            from onnx.onnx_pb import StringStringEntryProto
        except ImportError:
            raise ImportError(
                "You need to install `skl2onnx` to store models in onnx format"
            )

        onnx = convert_sklearn(
            model,
            name=label_text,
            initial_types=[("input", FloatTensorType((None, len(feature_names))))],
            doc_string="Model created by aict-tools to estimate {}".format(label_text),
        )

        # This makes sure we only get the scores and that they are numpy arrays
        # and not a list of dicts.
        # Must come before setting metadata, as it clears the metadata_props.
        if hasattr(model, "predict_proba"):
            onnx = select_model_inputs_outputs(onnx, ["probabilities"])

        metadata = dict(
            model_author="aict-tools",
            aict_tools_version=__version__,  # package version of the surrounding module
            feature_names=",".join(feature_names),
            model_type="classifier" if is_classifier(model) else "regressor",
        )
        for key, value in metadata.items():
            onnx.metadata_props.append(StringStringEntryProto(key=key, value=value))

        with open(model_path, "wb") as f:
            f.write(onnx.SerializeToString())
    else:
        pickle_path = model_path

    # Always store the pickle dump, just in case
    joblib.dump(model, pickle_path, compress=4)
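A hypothetical call site for save_model, assuming the optional sklearn2pmml / skl2onnx dependencies and the module-level __version__ the function references are available; the file names, target label, and classifier are illustrative:

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

# One fitted classifier, three output formats; the extension selects the branch.
X, y = make_classification(n_samples=200, n_features=5, random_state=1)
clf = RandomForestClassifier(n_estimators=10, random_state=1).fit(X, y)
features = ["f{}".format(i) for i in range(5)]

save_model(clf, features, "separation_model.pmml", label_text="gamma")  # PMML + .pkl
save_model(clf, features, "separation_model.onnx", label_text="gamma")  # ONNX + .pkl
save_model(clf, features, "separation_model.pkl", label_text="gamma")   # pickle only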
from os import path

import joblib
import numpy as np
from sklearn2pmml import sklearn2pmml, PMMLPipeline


def pickle_model(classifier, feature_names, model_path, label_text='label'):
    p, extension = path.splitext(model_path)
    classifier.feature_names = feature_names
    if extension == '.pmml':
        # Keep a joblib pickle next to the PMML export
        joblib.dump(classifier, p + '.pkl', compress=4)
        pipeline = PMMLPipeline([('classifier', classifier)])
        pipeline.target_field = label_text
        pipeline.active_fields = np.array(feature_names)
        sklearn2pmml(pipeline, model_path)
    else:
        joblib.dump(classifier, model_path, compress=4)
def pickle_model(classifier, feature_names, model_path, label_text='label'):
    p, extension = os.path.splitext(model_path)
    classifier.feature_names = feature_names
    if extension == '.pmml':
        joblib.dump(classifier, p + '.pkl', compress=4)
        pipeline = PMMLPipeline([
            ('classifier', classifier)
        ])
        pipeline.target_field = label_text
        pipeline.active_fields = np.array(feature_names)
        sklearn2pmml(pipeline, model_path)
    else:
        joblib.dump(classifier, model_path, compress=4)
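A hypothetical round trip for pickle_model: dump, then reload the sidecar pickle with joblib. File names and the classifier are illustrative, and the PMML step assumes a working sklearn2pmml/Java setup:

import joblib
from sklearn.datasets import make_classification
from sklearn.tree import DecisionTreeClassifier

X, y = make_classification(n_samples=100, n_features=4, random_state=0)
clf = DecisionTreeClassifier().fit(X, y)
features = ["f0", "f1", "f2", "f3"]

pickle_model(clf, features, "separation.pmml", label_text="gamma")
restored = joblib.load("separation.pkl")  # the .pkl written alongside the .pmml
print(restored.feature_names)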
import xgboost
from sklearn import datasets
from sklearn2pmml import sklearn2pmml
from sklearn2pmml import PMMLPipeline

boston = datasets.load_boston()
X = boston.data
y = boston.target
feature_names = boston.feature_names

model = xgboost.XGBRegressor(learning_rate=0.1, n_estimators=10,
                             max_depth=10, silent=False)
boston_pipeline = PMMLPipeline([("regressor", model)])
boston_pipeline.active_fields = feature_names
boston_pipeline.fit(X, y)
sklearn2pmml(boston_pipeline, "boston.pmml", with_repr=True, debug=True)
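An optional check of the exported file with the third-party pypmml package is one way to verify the round trip; the use of pypmml here is an assumption (any PMML evaluator would do):

from pypmml import Model

# Score the first row through the exported PMML and compare by eye
# against boston_pipeline.predict(X[:1]).
pmml_model = Model.fromFile("boston.pmml")
first_row = dict(zip(feature_names, X[0]))
print(pmml_model.predict(first_row))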
def Train(self):
    st_global = time.time()
    CommonUtils.create_update_and_save_progress_message(
        self._dataframe_context, self._scriptWeightDict, self._scriptStages,
        self._slug, "initialization", "info",
        display=True, emptyBin=False, customMsg=None, weightKey="total")
    appType = self._dataframe_context.get_app_type()
    algosToRun = self._dataframe_context.get_algorithms_to_run()
    algoSetting = [x for x in algosToRun if x.get_algorithm_slug() == self._slug][0]
    categorical_columns = self._dataframe_helper.get_string_columns()
    uid_col = self._dataframe_context.get_uid_column()
    if self._metaParser.check_column_isin_ignored_suggestion(uid_col):
        categorical_columns = list(set(categorical_columns) - {uid_col})
    allDateCols = self._dataframe_context.get_date_columns()
    categorical_columns = list(set(categorical_columns) - set(allDateCols))
    print(categorical_columns)
    result_column = self._dataframe_context.get_result_column()
    numerical_columns = self._dataframe_helper.get_numeric_columns()
    numerical_columns = [x for x in numerical_columns if x != result_column]

    model_path = self._dataframe_context.get_model_path()
    if model_path.startswith("file"):
        model_path = model_path[7:]
    validationDict = self._dataframe_context.get_validation_dict()
    print("model_path", model_path)
    pipeline_filepath = "file://" + str(model_path) + "/" + str(self._slug) + "/pipeline/"
    model_filepath = "file://" + str(model_path) + "/" + str(self._slug) + "/model"
    pmml_filepath = "file://" + str(model_path) + "/" + str(self._slug) + "/modelPmml"
    df = self._data_frame

    if self._mlEnv == "spark":
        pipeline = MLUtils.create_pyspark_ml_pipeline(
            numerical_columns, categorical_columns, result_column, algoType="regression")
        pipelineModel = pipeline.fit(df)
        indexed = pipelineModel.transform(df)
        featureMapping = sorted(
            (attr["idx"], attr["name"]) for attr in
            chain(*indexed.schema["features"].metadata["ml_attr"]["attrs"].values()))
        MLUtils.save_pipeline_or_model(pipelineModel, pipeline_filepath)
        dtreer = DecisionTreeRegressor(labelCol=result_column,
                                       featuresCol="features",
                                       predictionCol="prediction")
        if validationDict["name"] == "kFold":
            defaultSplit = GLOBALSETTINGS.DEFAULT_VALIDATION_OBJECT["value"]
            numFold = int(validationDict["value"])
            if numFold == 0:
                numFold = 3
            trainingData, validationData = indexed.randomSplit(
                [defaultSplit, 1 - defaultSplit], seed=12345)
            # Grid over parameters that DecisionTreeRegressor actually exposes;
            # regParam/fitIntercept/elasticNetParam belong to linear models and
            # would raise an AttributeError here.
            paramGrid = ParamGridBuilder() \
                .addGrid(dtreer.maxDepth, [3, 5, 10]) \
                .addGrid(dtreer.minInstancesPerNode, [1, 10]) \
                .build()
            crossval = CrossValidator(
                estimator=dtreer,
                estimatorParamMaps=paramGrid,
                evaluator=RegressionEvaluator(predictionCol="prediction",
                                              labelCol=result_column),
                numFolds=numFold)
            st = time.time()
            cvModel = crossval.fit(indexed)
            trainingTime = time.time() - st
            print("cvModel training takes", trainingTime)
            bestModel = cvModel.bestModel
        elif validationDict["name"] == "trainAndtest":
            trainingData, validationData = indexed.randomSplit(
                [float(validationDict["value"]), 1 - float(validationDict["value"])],
                seed=12345)
            st = time.time()
            fit = dtreer.fit(trainingData)
            trainingTime = time.time() - st
            print("time to train", trainingTime)
            bestModel = fit

        featureImportance = bestModel.featureImportances
        print(featureImportance, type(featureImportance))
        print(len(featureMapping))
        featuresArray = [(name, featureImportance[idx]) for idx, name in featureMapping]
        print(featuresArray)
        MLUtils.save_pipeline_or_model(bestModel, model_filepath)

        transformed = bestModel.transform(validationData)
        transformed = transformed.withColumn(
            result_column, transformed[result_column].cast(DoubleType()))
        transformed = transformed.select(
            [result_column, "prediction",
             transformed[result_column] - transformed["prediction"]])
        transformed = transformed.withColumnRenamed(transformed.columns[-1], "difference")
        transformed = transformed.select(
            [result_column, "prediction", "difference",
             FN.abs(transformed["difference"]) * 100 / transformed[result_column]])
        transformed = transformed.withColumnRenamed(transformed.columns[-1], "mape")

        nrows = transformed.count()
        if nrows > 100:
            sampleData = transformed.sample(False, float(100) / nrows, seed=420)
        else:
            sampleData = transformed
        print(sampleData.show())

        evaluator = RegressionEvaluator(predictionCol="prediction", labelCol=result_column)
        metrics = {}
        metrics["r2"] = evaluator.evaluate(transformed, {evaluator.metricName: "r2"})
        metrics["rmse"] = evaluator.evaluate(transformed, {evaluator.metricName: "rmse"})
        metrics["mse"] = evaluator.evaluate(transformed, {evaluator.metricName: "mse"})
        metrics["mae"] = evaluator.evaluate(transformed, {evaluator.metricName: "mae"})
        runtime = round((time.time() - st_global), 2)

        mapeDf = transformed.select("mape")
        mapeStats = MLUtils.get_mape_stats(mapeDf, "mape")
        mapeStatsArr = sorted(mapeStats.items(), key=lambda x: int(x[0]))
        quantileDf = transformed.select("prediction")
        quantileSummaryDict = MLUtils.get_quantile_summary(quantileDf, "prediction")
        quantileSummaryArr = sorted(quantileSummaryDict.items(), key=lambda x: int(x[0]))

        self._model_summary.set_model_type("regression")
        self._model_summary.set_algorithm_name("dtree Regression")
        self._model_summary.set_algorithm_display_name("Decision Tree Regression")
        self._model_summary.set_slug(self._slug)
        self._model_summary.set_training_time(runtime)
        self._model_summary.set_training_time(trainingTime)
        self._model_summary.set_target_variable(result_column)
        self._model_summary.set_validation_method(validationDict["displayName"])
        self._model_summary.set_model_evaluation_metrics(metrics)
        # The Spark model has no sklearn-style get_params(); use its param map.
        self._model_summary.set_model_params(bestModel.extractParamMap())
        self._model_summary.set_quantile_summary(quantileSummaryArr)
        self._model_summary.set_mape_stats(mapeStatsArr)
        self._model_summary.set_sample_data(sampleData.toPandas().to_dict())
        self._model_summary.set_feature_importance(featureImportance)

    elif self._mlEnv == "sklearn":
        model_filepath = model_path + "/" + self._slug + "/model.pkl"
        x_train, x_test, y_train, y_test = self._dataframe_helper.get_train_test_data()
        x_train = MLUtils.create_dummy_columns(
            x_train, [x for x in categorical_columns if x != result_column])
        x_test = MLUtils.create_dummy_columns(
            x_test, [x for x in categorical_columns if x != result_column])
        x_test = MLUtils.fill_missing_columns(x_test, x_train.columns, result_column)
        st = time.time()
        est = DecisionTreeRegressor()
        CommonUtils.create_update_and_save_progress_message(
            self._dataframe_context, self._scriptWeightDict, self._scriptStages,
            self._slug, "training", "info",
            display=True, emptyBin=False, customMsg=None, weightKey="total")

        if algoSetting.is_hyperparameter_tuning_enabled():
            hyperParamInitParam = algoSetting.get_hyperparameter_params()
            evaluationMetricDict = {"name": hyperParamInitParam["evaluationMetric"]}
            evaluationMetricDict["displayName"] = \
                GLOBALSETTINGS.SKLEARN_EVAL_METRIC_NAME_DISPLAY_MAP[evaluationMetricDict["name"]]
            hyperParamAlgoName = algoSetting.get_hyperparameter_algo_name()
            params_grid = algoSetting.get_params_dict_hyperparameter()
            params_grid = {k: v for k, v in params_grid.items() if k in est.get_params()}
            print(params_grid)
            if hyperParamAlgoName == "gridsearchcv":
                estGrid = GridSearchCV(est, params_grid)
                gridParams = estGrid.get_params()
                hyperParamInitParam = {k: v for k, v in hyperParamInitParam.items()
                                       if k in gridParams}
                estGrid.set_params(**hyperParamInitParam)
                estGrid.fit(x_train, y_train)
                bestEstimator = estGrid.best_estimator_
                modelFilepath = "/".join(model_filepath.split("/")[:-1])
                sklearnHyperParameterResultObj = SklearnGridSearchResult(
                    estGrid.cv_results_, est, x_train, x_test, y_train, y_test,
                    appType, modelFilepath, evaluationMetricDict=evaluationMetricDict)
                resultArray = sklearnHyperParameterResultObj.train_and_save_models()
                self._result_setter.set_hyper_parameter_results(self._slug, resultArray)
                self._result_setter.set_metadata_parallel_coordinates(self._slug, {
                    "ignoreList": sklearnHyperParameterResultObj.get_ignore_list(),
                    "hideColumns": sklearnHyperParameterResultObj.get_hide_columns(),
                    "metricColName": sklearnHyperParameterResultObj.get_comparison_metric_colname(),
                    "columnOrder": sklearnHyperParameterResultObj.get_keep_columns()})
            elif hyperParamAlgoName == "randomsearchcv":
                estRand = RandomizedSearchCV(est, params_grid)
                estRand.set_params(**hyperParamInitParam)
                bestEstimator = None
        else:
            evaluationMetricDict = {"name": GLOBALSETTINGS.REGRESSION_MODEL_EVALUATION_METRIC}
            evaluationMetricDict["displayName"] = \
                GLOBALSETTINGS.SKLEARN_EVAL_METRIC_NAME_DISPLAY_MAP[evaluationMetricDict["name"]]
            algoParams = algoSetting.get_params_dict()
            algoParams = {k: v for k, v in algoParams.items() if k in est.get_params()}
            est.set_params(**algoParams)
            self._result_setter.set_hyper_parameter_results(self._slug, None)
            if validationDict["name"] == "kFold":
                defaultSplit = GLOBALSETTINGS.DEFAULT_VALIDATION_OBJECT["value"]
                numFold = int(validationDict["value"])
                if numFold == 0:
                    numFold = 3
                kFoldClass = SkleanrKFoldResult(
                    numFold, est, x_train, x_test, y_train, y_test, appType,
                    evaluationMetricDict=evaluationMetricDict)
                kFoldClass.train_and_save_result()
                kFoldOutput = kFoldClass.get_kfold_result()
                bestEstimator = kFoldClass.get_best_estimator()
            elif validationDict["name"] == "trainAndtest":
                est.fit(x_train, y_train)
                bestEstimator = est

        trainingTime = time.time() - st
        y_score = bestEstimator.predict(x_test)
        try:
            y_prob = bestEstimator.predict_proba(x_test)
        except Exception:
            y_prob = [0] * len(y_score)

        featureImportance = {}
        objs = {"trained_model": bestEstimator, "actual": y_test, "predicted": y_score,
                "probability": y_prob, "feature_importance": featureImportance,
                "featureList": list(x_train.columns), "labelMapping": {}}
        featureImportance = objs["trained_model"].feature_importances_
        featuresArray = [(col_name, featureImportance[idx])
                         for idx, col_name in enumerate(x_train.columns)]

        if not algoSetting.is_hyperparameter_tuning_enabled():
            modelName = "M" + "0" * (GLOBALSETTINGS.MODEL_NAME_MAX_LENGTH - 1) + "1"
            modelFilepathArr = model_filepath.split("/")[:-1]
            modelFilepathArr.append(modelName + ".pkl")
            joblib.dump(objs["trained_model"], "/".join(modelFilepathArr))

        metrics = {}
        metrics["r2"] = r2_score(y_test, y_score)
        metrics["mse"] = mean_squared_error(y_test, y_score)
        metrics["mae"] = mean_absolute_error(y_test, y_score)
        metrics["rmse"] = sqrt(metrics["mse"])

        transformed = pd.DataFrame({"prediction": y_score, result_column: y_test})
        transformed["difference"] = transformed[result_column] - transformed["prediction"]
        transformed["mape"] = np.abs(transformed["difference"]) * 100 / transformed[result_column]

        nrows = transformed.shape[0]
        if nrows > 100:
            sampleData = transformed.sample(n=100, random_state=420)
        else:
            sampleData = transformed
        print(sampleData.head())

        mapeCountArr = list(pd.cut(transformed["mape"], GLOBALSETTINGS.MAPEBINS)
                            .value_counts().to_dict().items())
        mapeStatsArr = [(str(idx), dictObj) for idx, dictObj in enumerate(
            sorted([{"count": x[1], "splitRange": (x[0].left, x[0].right)}
                    for x in mapeCountArr],
                   key=lambda x: x["splitRange"][0]))]

        predictionColSummary = transformed["prediction"].describe().to_dict()
        quantileBins = [predictionColSummary["min"], predictionColSummary["25%"],
                        predictionColSummary["50%"], predictionColSummary["75%"],
                        predictionColSummary["max"]]
        print(quantileBins)
        quantileBins = sorted(list(set(quantileBins)))
        transformed["quantileBinId"] = pd.cut(transformed["prediction"], quantileBins)
        quantileDf = transformed.groupby("quantileBinId").agg(
            {"prediction": [np.sum, np.mean, np.size]}).reset_index()
        quantileDf.columns = ["prediction", "sum", "mean", "count"]
        print(quantileDf)
        quantileArr = list(quantileDf.T.to_dict().items())
        quantileSummaryArr = [(obj[0], {
            "splitRange": (obj[1]["prediction"].left, obj[1]["prediction"].right),
            "count": obj[1]["count"],
            "mean": obj[1]["mean"],
            "sum": obj[1]["sum"]
        }) for obj in quantileArr]
        print(quantileSummaryArr)
        runtime = round((time.time() - st_global), 2)

        self._model_summary.set_model_type("regression")
        self._model_summary.set_algorithm_name("DTREE Regression")
        self._model_summary.set_algorithm_display_name("Decision Tree Regression")
        self._model_summary.set_slug(self._slug)
        self._model_summary.set_training_time(runtime)
        self._model_summary.set_training_time(trainingTime)
        self._model_summary.set_target_variable(result_column)
        self._model_summary.set_validation_method(validationDict["displayName"])
        self._model_summary.set_model_evaluation_metrics(metrics)
        self._model_summary.set_model_params(bestEstimator.get_params())
        self._model_summary.set_quantile_summary(quantileSummaryArr)
        self._model_summary.set_mape_stats(mapeStatsArr)
        self._model_summary.set_sample_data(sampleData.to_dict())
        self._model_summary.set_feature_importance(featuresArray)
        self._model_summary.set_feature_list(list(x_train.columns))

        try:
            pmml_filepath = str(model_path) + "/" + str(self._slug) + "/traindeModel.pmml"
            modelPmmlPipeline = PMMLPipeline([("pretrained-estimator", objs["trained_model"])])
            modelPmmlPipeline.target_field = result_column
            modelPmmlPipeline.active_fields = np.array(
                [col for col in x_train.columns if col != result_column])
            sklearn2pmml(modelPmmlPipeline, pmml_filepath, with_repr=True)
            pmmlfile = open(pmml_filepath, "r")
            pmmlText = pmmlfile.read()
            pmmlfile.close()
            self._result_setter.update_pmml_object({self._slug: pmmlText})
        except Exception:
            pass

        if not algoSetting.is_hyperparameter_tuning_enabled():
            modelDropDownObj = {
                "name": self._model_summary.get_algorithm_name(),
                "evaluationMetricValue": self._model_summary.get_model_accuracy(),
                "evaluationMetricName": "r2",
                "slug": self._model_summary.get_slug(),
                "Model Id": modelName
            }
            modelSummaryJson = {
                "dropdown": modelDropDownObj,
                "levelcount": self._model_summary.get_level_counts(),
                "modelFeatureList": self._model_summary.get_feature_list(),
                "levelMapping": self._model_summary.get_level_map_dict(),
                "slug": self._model_summary.get_slug(),
                "name": self._model_summary.get_algorithm_name()
            }
        else:
            modelDropDownObj = {
                "name": self._model_summary.get_algorithm_name(),
                "evaluationMetricValue": resultArray[0]["R-Squared"],
                "evaluationMetricName": "r2",
                "slug": self._model_summary.get_slug(),
                "Model Id": resultArray[0]["Model Id"]
            }
            modelSummaryJson = {
                "dropdown": modelDropDownObj,
                "levelcount": self._model_summary.get_level_counts(),
                "modelFeatureList": self._model_summary.get_feature_list(),
                "levelMapping": self._model_summary.get_level_map_dict(),
                "slug": self._model_summary.get_slug(),
                "name": self._model_summary.get_algorithm_name()
            }

        dtreerCards = [json.loads(CommonUtils.convert_python_object_to_json(cardObj))
                       for cardObj in MLUtils.create_model_summary_cards(self._model_summary)]
        for card in dtreerCards:
            self._prediction_narrative.add_a_card(card)
        self._result_setter.set_model_summary(
            {"dtreeregression": json.loads(
                CommonUtils.convert_python_object_to_json(self._model_summary))})
        self._result_setter.set_dtree_regression_model_summart(modelSummaryJson)
        self._result_setter.set_dtreer_cards(dtreerCards)
        CommonUtils.create_update_and_save_progress_message(
            self._dataframe_context, self._scriptWeightDict, self._scriptStages,
            self._slug, "completion", "info",
            display=True, emptyBin=False, customMsg=None, weightKey="total")
def Train(self):
    st_global = time.time()
    CommonUtils.create_update_and_save_progress_message(
        self._dataframe_context, self._scriptWeightDict, self._scriptStages,
        self._slug, "initialization", "info",
        display=True, emptyBin=False, customMsg=None, weightKey="total")
    algosToRun = self._dataframe_context.get_algorithms_to_run()
    algoSetting = [x for x in algosToRun if x.get_algorithm_slug() == self._slug][0]
    categorical_columns = self._dataframe_helper.get_string_columns()
    uid_col = self._dataframe_context.get_uid_column()
    if self._metaParser.check_column_isin_ignored_suggestion(uid_col):
        categorical_columns = list(set(categorical_columns) - {uid_col})
    allDateCols = self._dataframe_context.get_date_columns()
    categorical_columns = list(set(categorical_columns) - set(allDateCols))
    print(categorical_columns)
    numerical_columns = self._dataframe_helper.get_numeric_columns()
    result_column = self._dataframe_context.get_result_column()

    model_path = self._dataframe_context.get_model_path()
    if model_path.startswith("file"):
        model_path = model_path[7:]
    validationDict = self._dataframe_context.get_validation_dict()
    print("model_path", model_path)
    pipeline_filepath = "file://" + str(model_path) + "/" + str(self._slug) + "/pipeline/"
    model_filepath = "file://" + str(model_path) + "/" + str(self._slug) + "/model"
    pmml_filepath = "file://" + str(model_path) + "/" + str(self._slug) + "/modelPmml"
    df = self._data_frame

    if self._mlEnv == "spark":
        pass
    elif self._mlEnv == "sklearn":
        model_filepath = model_path + "/" + self._slug + "/model.pkl"
        pmml_filepath = str(model_path) + "/" + str(self._slug) + "/traindeModel.pmml"
        x_train, x_test, y_train, y_test = self._dataframe_helper.get_train_test_data()
        x_train = MLUtils.create_dummy_columns(
            x_train, [x for x in categorical_columns if x != result_column])
        x_test = MLUtils.create_dummy_columns(
            x_test, [x for x in categorical_columns if x != result_column])
        x_test = MLUtils.fill_missing_columns(x_test, x_train.columns, result_column)
        CommonUtils.create_update_and_save_progress_message(
            self._dataframe_context, self._scriptWeightDict, self._scriptStages,
            self._slug, "training", "info",
            display=True, emptyBin=False, customMsg=None, weightKey="total")
        st = time.time()
        levels = df[result_column].unique()
        clf = SVC(kernel='linear', probability=True)

        labelEncoder = preprocessing.LabelEncoder()
        labelEncoder.fit(np.concatenate([y_train, y_test]))
        y_train = pd.Series(labelEncoder.transform(y_train))
        y_test = labelEncoder.transform(y_test)
        classes = labelEncoder.classes_
        transformed = labelEncoder.transform(classes)
        labelMapping = dict(zip(transformed, classes))
        inverseLabelMapping = dict(zip(classes, transformed))
        posLabel = inverseLabelMapping[self._targetLevel]
        appType = self._dataframe_context.get_app_type()
        print(appType, labelMapping, inverseLabelMapping, posLabel, self._targetLevel)

        if algoSetting.is_hyperparameter_tuning_enabled():
            hyperParamInitParam = algoSetting.get_hyperparameter_params()
            evaluationMetricDict = {"name": hyperParamInitParam["evaluationMetric"]}
            evaluationMetricDict["displayName"] = \
                GLOBALSETTINGS.SKLEARN_EVAL_METRIC_NAME_DISPLAY_MAP[evaluationMetricDict["name"]]
            hyperParamAlgoName = algoSetting.get_hyperparameter_algo_name()
            params_grid = algoSetting.get_params_dict_hyperparameter()
            params_grid = {k: v for k, v in params_grid.items() if k in clf.get_params()}
            print(params_grid)
            if hyperParamAlgoName == "gridsearchcv":
                clfGrid = GridSearchCV(clf, params_grid)
                gridParams = clfGrid.get_params()
                hyperParamInitParam = {k: v for k, v in hyperParamInitParam.items()
                                       if k in gridParams}
                clfGrid.set_params(**hyperParamInitParam)
                # clfGrid.fit(x_train, y_train)
                grid_param = {}
                grid_param['params'] = ParameterGrid(params_grid)
                # bestEstimator = clfGrid.best_estimator_
                modelFilepath = "/".join(model_filepath.split("/")[:-1])
                sklearnHyperParameterResultObj = SklearnGridSearchResult(
                    grid_param, clf, x_train, x_test, y_train, y_test,
                    appType, modelFilepath, levels, posLabel, evaluationMetricDict)
                resultArray = sklearnHyperParameterResultObj.train_and_save_models()
                self._result_setter.set_hyper_parameter_results(self._slug, resultArray)
                self._result_setter.set_metadata_parallel_coordinates(self._slug, {
                    "ignoreList": sklearnHyperParameterResultObj.get_ignore_list(),
                    "hideColumns": sklearnHyperParameterResultObj.get_hide_columns(),
                    "metricColName": sklearnHyperParameterResultObj.get_comparison_metric_colname(),
                    "columnOrder": sklearnHyperParameterResultObj.get_keep_columns()})
            elif hyperParamAlgoName == "randomsearchcv":
                clfRand = RandomizedSearchCV(clf, params_grid)
                clfRand.set_params(**hyperParamInitParam)
                bestEstimator = None
        else:
            evaluationMetricDict = {"name": GLOBALSETTINGS.CLASSIFICATION_MODEL_EVALUATION_METRIC}
            evaluationMetricDict["displayName"] = \
                GLOBALSETTINGS.SKLEARN_EVAL_METRIC_NAME_DISPLAY_MAP[evaluationMetricDict["name"]]
            self._result_setter.set_hyper_parameter_results(self._slug, None)
            algoParams = algoSetting.get_params_dict()
            algoParams = {k: v for k, v in algoParams.items() if k in clf.get_params()}
            clf.set_params(**algoParams)
            print("!" * 50)
            print(clf.get_params())
            print("!" * 50)
            if validationDict["name"] == "kFold":
                defaultSplit = GLOBALSETTINGS.DEFAULT_VALIDATION_OBJECT["value"]
                numFold = int(validationDict["value"])
                if numFold == 0:
                    numFold = 3
                kFoldClass = SkleanrKFoldResult(
                    numFold, clf, x_train, x_test, y_train, y_test,
                    appType, levels, posLabel,
                    evaluationMetricDict=evaluationMetricDict)
                kFoldClass.train_and_save_result()
                kFoldOutput = kFoldClass.get_kfold_result()
                bestEstimator = kFoldClass.get_best_estimator()
            elif validationDict["name"] == "trainAndtest":
                clf.fit(x_train, y_train)
                bestEstimator = clf

        trainingTime = time.time() - st
        y_score = bestEstimator.predict(x_test)
        try:
            y_prob = bestEstimator.predict_proba(x_test)
        except Exception:
            y_prob = [0] * len(y_score)

        accuracy = metrics.accuracy_score(y_test, y_score)
        if len(levels) <= 2:
            precision = metrics.precision_score(y_test, y_score,
                                                pos_label=posLabel, average="binary")
            recall = metrics.recall_score(y_test, y_score,
                                          pos_label=posLabel, average="binary")
            auc = metrics.roc_auc_score(y_test, y_score)
        elif len(levels) > 2:
            precision = metrics.precision_score(y_test, y_score,
                                                pos_label=posLabel, average="macro")
            recall = metrics.recall_score(y_test, y_score,
                                          pos_label=posLabel, average="macro")
            # metrics.roc_auc_score is not defined for multiclass labels here
            auc = None
        y_score = labelEncoder.inverse_transform(y_score)
        y_test = labelEncoder.inverse_transform(y_test)

        # A linear-kernel SVC exposes coefficients via coef_, not feature_importances_;
        # the absolute coefficient magnitudes are used as importances here.
        importances = np.abs(bestEstimator.coef_[0])
        feature_importance = dict(sorted(zip(x_train.columns, importances),
                                         key=lambda x: x[1], reverse=True))
        for k, v in feature_importance.items():
            feature_importance[k] = CommonUtils.round_sig(v)
        objs = {
            "trained_model": bestEstimator,
            "actual": y_test,
            "predicted": y_score,
            "probability": y_prob,
            "feature_importance": feature_importance,
            "featureList": list(x_train.columns),
            "labelMapping": labelMapping
        }

        if not algoSetting.is_hyperparameter_tuning_enabled():
            modelName = "M" + "0" * (GLOBALSETTINGS.MODEL_NAME_MAX_LENGTH - 1) + "1"
            modelFilepathArr = model_filepath.split("/")[:-1]
            modelFilepathArr.append(modelName + ".pkl")
            joblib.dump(objs["trained_model"], "/".join(modelFilepathArr))
        runtime = round((time.time() - st_global), 2)

        try:
            modelPmmlPipeline = PMMLPipeline([("pretrained-estimator", objs["trained_model"])])
            modelPmmlPipeline.target_field = result_column
            modelPmmlPipeline.active_fields = np.array(
                [col for col in x_train.columns if col != result_column])
            sklearn2pmml(modelPmmlPipeline, pmml_filepath, with_repr=True)
            pmmlfile = open(pmml_filepath, "r")
            pmmlText = pmmlfile.read()
            pmmlfile.close()
            self._result_setter.update_pmml_object({self._slug: pmmlText})
        except Exception:
            pass

        cat_cols = list(set(categorical_columns) - {result_column})
        overall_precision_recall = MLUtils.calculate_overall_precision_recall(
            objs["actual"], objs["predicted"], targetLevel=self._targetLevel)

        self._model_summary = MLModelSummary()
        self._model_summary.set_algorithm_name("Svm")
        self._model_summary.set_algorithm_display_name("Support Vector Machine")
        self._model_summary.set_slug(self._slug)
        self._model_summary.set_training_time(runtime)
        self._model_summary.set_confusion_matrix(
            MLUtils.calculate_confusion_matrix(objs["actual"], objs["predicted"]))
        self._model_summary.set_feature_importance(objs["feature_importance"])
        self._model_summary.set_feature_list(objs["featureList"])
        self._model_summary.set_model_accuracy(
            round(metrics.accuracy_score(objs["actual"], objs["predicted"]), 2))
        self._model_summary.set_training_time(round((time.time() - st), 2))
        self._model_summary.set_precision_recall_stats(
            overall_precision_recall["classwise_stats"])
        self._model_summary.set_model_precision(overall_precision_recall["precision"])
        self._model_summary.set_model_recall(overall_precision_recall["recall"])
        self._model_summary.set_target_variable(result_column)
        self._model_summary.set_prediction_split(overall_precision_recall["prediction_split"])
        self._model_summary.set_validation_method("Train and Test")
        self._model_summary.set_level_map_dict(objs["labelMapping"])
        self._model_summary.set_model_features(
            [col for col in x_train.columns if col != result_column])
        self._model_summary.set_level_counts(
            self._metaParser.get_unique_level_dict(list(set(categorical_columns))))
        self._model_summary.set_num_trees(100)
        self._model_summary.set_num_rules(300)

        if not algoSetting.is_hyperparameter_tuning_enabled():
            modelDropDownObj = {
                "name": self._model_summary.get_algorithm_name(),
                "evaluationMetricValue": self._model_summary.get_model_accuracy(),
                "evaluationMetricName": "accuracy",
                "slug": self._model_summary.get_slug(),
                "Model Id": modelName
            }
            modelSummaryJson = {
                "dropdown": modelDropDownObj,
                "levelcount": self._model_summary.get_level_counts(),
                "modelFeatureList": self._model_summary.get_feature_list(),
                "levelMapping": self._model_summary.get_level_map_dict(),
                "slug": self._model_summary.get_slug(),
                "name": self._model_summary.get_algorithm_name()
            }
        else:
            modelDropDownObj = {
                "name": self._model_summary.get_algorithm_name(),
                "evaluationMetricValue": resultArray[0]["Accuracy"],
                "evaluationMetricName": "accuracy",
                "slug": self._model_summary.get_slug(),
                "Model Id": resultArray[0]["Model Id"]
            }
            modelSummaryJson = {
                "dropdown": modelDropDownObj,
                "levelcount": self._model_summary.get_level_counts(),
                "modelFeatureList": self._model_summary.get_feature_list(),
                "levelMapping": self._model_summary.get_level_map_dict(),
                "slug": self._model_summary.get_slug(),
                "name": self._model_summary.get_algorithm_name()
            }

        svmCards = [json.loads(CommonUtils.convert_python_object_to_json(cardObj))
                    for cardObj in MLUtils.create_model_summary_cards(self._model_summary)]
        for card in svmCards:
            self._prediction_narrative.add_a_card(card)
        self._result_setter.set_model_summary(
            {"svm": json.loads(
                CommonUtils.convert_python_object_to_json(self._model_summary))})
        self._result_setter.set_svm_model_summary(modelSummaryJson)
        self._result_setter.set_rf_cards(svmCards)
        CommonUtils.create_update_and_save_progress_message(
            self._dataframe_context, self._scriptWeightDict, self._scriptStages,
            self._slug, "completion", "info",
            display=True, emptyBin=False, customMsg=None, weightKey="total")
def Train(self):
    st_global = time.time()
    CommonUtils.create_update_and_save_progress_message(
        self._dataframe_context, self._scriptWeightDict, self._scriptStages,
        self._slug, "initialization", "info",
        display=True, emptyBin=False, customMsg=None, weightKey="total")
    appType = self._dataframe_context.get_app_type()
    algosToRun = self._dataframe_context.get_algorithms_to_run()
    algoSetting = [x for x in algosToRun if x.get_algorithm_slug() == self._slug][0]
    categorical_columns = self._dataframe_helper.get_string_columns()
    uid_col = self._dataframe_context.get_uid_column()
    if self._metaParser.check_column_isin_ignored_suggestion(uid_col):
        categorical_columns = list(set(categorical_columns) - {uid_col})
    allDateCols = self._dataframe_context.get_date_columns()
    categorical_columns = list(set(categorical_columns) - set(allDateCols))
    print("CATEGORICAL COLS - ", categorical_columns)
    result_column = self._dataframe_context.get_result_column()
    numerical_columns = self._dataframe_helper.get_numeric_columns()
    numerical_columns = [x for x in numerical_columns if x != result_column]

    model_path = self._dataframe_context.get_model_path()
    if model_path.startswith("file"):
        model_path = model_path[7:]
    validationDict = self._dataframe_context.get_validation_dict()
    print("model_path", model_path)
    pipeline_filepath = "file://" + str(model_path) + "/" + str(self._slug) + "/pipeline/"
    model_filepath = "file://" + str(model_path) + "/" + str(self._slug) + "/model"
    pmml_filepath = "file://" + str(model_path) + "/" + str(self._slug) + "/modelPmml"
    df = self._data_frame

    if self._mlEnv == "spark":
        pass
    elif self._mlEnv == "sklearn":
        model_filepath = model_path + "/" + self._slug + "/model.pkl"
        x_train, x_test, y_train, y_test = self._dataframe_helper.get_train_test_data()
        x_train = MLUtils.create_dummy_columns(
            x_train, [x for x in categorical_columns if x != result_column])
        x_test = MLUtils.create_dummy_columns(
            x_test, [x for x in categorical_columns if x != result_column])
        x_test = MLUtils.fill_missing_columns(x_test, x_train.columns, result_column)
        print("=" * 150)
        print("X-Train Shape - ", x_train.shape)
        print("Y-Train Shape - ", y_train.shape)
        print("X-Test Shape - ", x_test.shape)
        print("Y-Test Shape - ", y_test.shape)
        print("~" * 50)
        print("X-Train dtype - ", type(x_train))
        print("Y-Train dtype - ", type(y_train))
        print("X-Test dtype - ", type(x_test))
        print("Y-Test dtype - ", type(y_test))
        print("~" * 50)
        CommonUtils.create_update_and_save_progress_message(
            self._dataframe_context, self._scriptWeightDict, self._scriptStages,
            self._slug, "training", "info",
            display=True, emptyBin=False, customMsg=None, weightKey="total")
        st = time.time()
        self._result_setter.set_hyper_parameter_results(self._slug, None)
        evaluationMetricDict = algoSetting.get_evaluvation_metric(Type="REGRESSION")
        evaluationMetricDict = {"name": GLOBALSETTINGS.REGRESSION_MODEL_EVALUATION_METRIC}
        evaluationMetricDict["displayName"] = \
            GLOBALSETTINGS.SKLEARN_EVAL_METRIC_NAME_DISPLAY_MAP[evaluationMetricDict["name"]]

        x_train_tensored, y_train_tensored, x_test_tensored, y_test_tensored = \
            PYTORCHUTILS.get_tensored_data(x_train, y_train, x_test, y_test)
        trainset = torch_data_utils.TensorDataset(x_train_tensored, y_train_tensored)
        testset = torch_data_utils.TensorDataset(x_test_tensored, y_test_tensored)
        nnptr_params = algoSetting.get_nnptr_params_dict()[0]
        layers_for_network = PYTORCHUTILS.get_layers_for_network_module(
            nnptr_params, task_type="REGRESSION", first_layer_units=x_train.shape[1])

        # Use GPU if available
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        network = PyTorchNetwork(layers_for_network).to(device)
        network.eval()
        other_params_dict = PYTORCHUTILS.get_other_pytorch_params(
            nnptr_params, task_type="REGRESSION", network_params=network.parameters())
        print("~" * 50)
        print("NNPTR-PARAMS - ", nnptr_params)
        print("~" * 50)
        print("OTHER-PARAMS-DICT - ", other_params_dict)
        print("~" * 50)
        print("NEURAL-NETWORK - ", network)
        print("~" * 50)

        criterion = other_params_dict["loss_criterion"]
        n_epochs = other_params_dict["number_of_epochs"]
        batch_size = other_params_dict["batch_size"]
        optimizer = other_params_dict["optimizer"]
        dataloader_params = {
            "batch_size": batch_size,
            "shuffle": True
            # "num_workers":
        }
        train_loader = torch_data_utils.DataLoader(trainset, **dataloader_params)
        test_loader = torch_data_utils.DataLoader(testset, **dataloader_params)

        # Training the network. BatchNormalization(num_features) in the training
        # config must equal units_op for that layer, otherwise:
        # RuntimeError('running_mean should contain 100 elements not 200',)
        for epoch in range(n_epochs):
            batchwise_losses = []
            average_loss = 0.0
            for i, (inputs, labels) in enumerate(train_loader):
                inputs = inputs.to(device)
                labels = labels.to(device)
                # Zero the parameter gradients
                optimizer.zero_grad()
                # Forward + backward + optimize
                outputs = network(inputs.float())
                loss = criterion(outputs, labels.float())
                loss.backward()
                optimizer.step()
                average_loss += loss.item()
                batchwise_losses.append(loss.item())
            average_loss_per_epoch = old_div(average_loss, (i + 1))
            print("+" * 80)
            print("EPOCH - ", epoch)
            print("BATCHWISE_LOSSES shape - ", len(batchwise_losses))
            print("AVERAGE LOSS PER EPOCH - ", average_loss_per_epoch)
            print("+" * 80)

        trainingTime = time.time() - st
        bestEstimator = network
        outputs_x_test_tensored = network(x_test_tensored.float())
        y_score_mid = outputs_x_test_tensored.tolist()
        y_score = [x[0] for x in y_score_mid]
        print("Y-SCORE - ", y_score)
        print("Y-SCORE length - ", len(y_score))
        y_prob = None
        featureImportance = {}
        objs = {
            "trained_model": bestEstimator,
            "actual": y_test,
            "predicted": y_score,
            "probability": y_prob,
            "feature_importance": featureImportance,
            "featureList": list(x_train.columns),
            "labelMapping": {}
        }
        # A plain PyTorch network exposes no feature_importances_.
        featuresArray = []

        if not algoSetting.is_hyperparameter_tuning_enabled():
            modelName = "M" + "0" * (GLOBALSETTINGS.MODEL_NAME_MAX_LENGTH - 1) + "1"
            modelFilepathArr = model_filepath.split("/")[:-1]
            modelFilepathArr.append(modelName + ".pt")
            torch.save(objs["trained_model"], "/".join(modelFilepathArr))
        # The original subtracted an undefined hyper_st in the tuning branch;
        # measure from the training start time in both cases.
        runtime = round((time.time() - st), 2)

        try:
            modelPmmlPipeline = PMMLPipeline([("pretrained-estimator", objs["trained_model"])])
            modelPmmlPipeline.target_field = result_column
            modelPmmlPipeline.active_fields = np.array(
                [col for col in x_train.columns if col != result_column])
            sklearn2pmml(modelPmmlPipeline, pmml_filepath, with_repr=True)
            pmmlfile = open(pmml_filepath, "r")
            pmmlText = pmmlfile.read()
            pmmlfile.close()
            self._result_setter.update_pmml_object({self._slug: pmmlText})
        except Exception:
            pass

        metrics = {}
        metrics["r2"] = r2_score(y_test, y_score)
        metrics["neg_mean_squared_error"] = mean_squared_error(y_test, y_score)
        metrics["neg_mean_absolute_error"] = mean_absolute_error(y_test, y_score)
        metrics["RMSE"] = sqrt(metrics["neg_mean_squared_error"])
        metrics["explained_variance_score"] = explained_variance_score(y_test, y_score)

        transformed = pd.DataFrame({"prediction": y_score, result_column: y_test})
        print("TRANSFORMED PREDICTION TYPE - ", type(transformed["prediction"]))
        print(transformed["prediction"])
        print("TRANSFORMED RESULT COL TYPE - ", type(transformed[result_column]))
        print(transformed[result_column])
        transformed["difference"] = transformed[result_column] - transformed["prediction"]
        transformed["mape"] = old_div(np.abs(transformed["difference"]) * 100,
                                      transformed[result_column])

        nrows = transformed.shape[0]
        if nrows > 100:
            sampleData = transformed.sample(n=100, random_state=420)
        else:
            sampleData = transformed
        print(sampleData.head())

        if transformed["mape"].max() > 100:
            GLOBALSETTINGS.MAPEBINS.append(transformed["mape"].max())
            mapeCountArr = list(pd.cut(transformed["mape"], GLOBALSETTINGS.MAPEBINS)
                                .value_counts().to_dict().items())
            GLOBALSETTINGS.MAPEBINS.pop(5)
        else:
            mapeCountArr = list(pd.cut(transformed["mape"], GLOBALSETTINGS.MAPEBINS)
                                .value_counts().to_dict().items())
        mapeStatsArr = [(str(idx), dictObj) for idx, dictObj in enumerate(
            sorted([{"count": x[1], "splitRange": (x[0].left, x[0].right)}
                    for x in mapeCountArr],
                   key=lambda x: x["splitRange"][0]))]
        print(mapeStatsArr)
        print(mapeCountArr)

        predictionColSummary = transformed["prediction"].describe().to_dict()
        quantileBins = [predictionColSummary["min"], predictionColSummary["25%"],
                        predictionColSummary["50%"], predictionColSummary["75%"],
                        predictionColSummary["max"]]
        print(quantileBins)
        quantileBins = sorted(list(set(quantileBins)))
        transformed["quantileBinId"] = pd.cut(transformed["prediction"], quantileBins)
        quantileDf = transformed.groupby("quantileBinId").agg(
            {"prediction": [np.sum, np.mean, np.size]}).reset_index()
        quantileDf.columns = ["prediction", "sum", "mean", "count"]
        print(quantileDf)
        quantileArr = list(quantileDf.T.to_dict().items())
        quantileSummaryArr = [(obj[0], {
            "splitRange": (obj[1]["prediction"].left, obj[1]["prediction"].right),
            "count": obj[1]["count"],
            "mean": obj[1]["mean"],
            "sum": obj[1]["sum"]
        }) for obj in quantileArr]
        print(quantileSummaryArr)
        runtime = round((time.time() - st_global), 2)

        self._model_summary.set_model_type("regression")
        self._model_summary.set_algorithm_name("Neural Network (PyTorch)")
        self._model_summary.set_algorithm_display_name("Neural Network (PyTorch)")
        self._model_summary.set_slug(self._slug)
        self._model_summary.set_training_time(runtime)
        self._model_summary.set_training_time(trainingTime)
        self._model_summary.set_target_variable(result_column)
        self._model_summary.set_validation_method(validationDict["displayName"])
        self._model_summary.set_model_evaluation_metrics(metrics)
        self._model_summary.set_model_params(nnptr_params)
        self._model_summary.set_quantile_summary(quantileSummaryArr)
        self._model_summary.set_mape_stats(mapeStatsArr)
        self._model_summary.set_sample_data(sampleData.to_dict())
        self._model_summary.set_feature_importance(featuresArray)
        self._model_summary.set_feature_list(list(x_train.columns))
        self._model_summary.set_model_mse(metrics["neg_mean_squared_error"])
        self._model_summary.set_model_mae(metrics["neg_mean_absolute_error"])
        self._model_summary.set_rmse(metrics["RMSE"])
        self._model_summary.set_model_rsquared(metrics["r2"])
        self._model_summary.set_model_exp_variance_score(metrics["explained_variance_score"])

        try:
            pmml_filepath = str(model_path) + "/" + str(self._slug) + "/traindeModel.pmml"
            modelPmmlPipeline = PMMLPipeline([("pretrained-estimator", objs["trained_model"])])
            modelPmmlPipeline.target_field = result_column
            modelPmmlPipeline.active_fields = np.array(
                [col for col in x_train.columns if col != result_column])
            sklearn2pmml(modelPmmlPipeline, pmml_filepath, with_repr=True)
            pmmlfile = open(pmml_filepath, "r")
            pmmlText = pmmlfile.read()
            pmmlfile.close()
            self._result_setter.update_pmml_object({self._slug: pmmlText})
        except Exception:
            pass

        # The original built identical dropdown objects in both branches of an
        # if/else on hyperparameter tuning; one unconditional build is equivalent.
        modelDropDownObj = {
            "name": self._model_summary.get_algorithm_name(),
            "evaluationMetricValue": metrics[evaluationMetricDict["name"]],
            "evaluationMetricName": evaluationMetricDict["name"],
            "slug": self._model_summary.get_slug(),
            "Model Id": modelName
        }
        modelSummaryJson = {
            "dropdown": modelDropDownObj,
            "levelcount": self._model_summary.get_level_counts(),
            "modelFeatureList": self._model_summary.get_feature_list(),
            "levelMapping": self._model_summary.get_level_map_dict(),
            "slug": self._model_summary.get_slug(),
            "name": self._model_summary.get_algorithm_name()
        }

        modelmanagement_ = nnptr_params
        self._model_management = MLModelSummary()
        if not algoSetting.is_hyperparameter_tuning_enabled():
            self._model_management.set_layer_info(data=modelmanagement_['hidden_layer_info'])
            self._model_management.set_loss_function(data=modelmanagement_['loss'])
            self._model_management.set_optimizer(data=modelmanagement_['optimizer'])
            self._model_management.set_batch_size(data=modelmanagement_['batch_size'])
            self._model_management.set_no_epochs(data=modelmanagement_['number_of_epochs'])
            # self._model_management.set_model_evaluation_metrics(data=modelmanagement_['metrics'])
            self._model_management.set_job_type(
                self._dataframe_context.get_job_name())  # project name
            self._model_management.set_training_status(data="completed")  # training status
            self._model_management.set_no_of_independent_variables(
                data=x_train)  # no. of independent variables
            self._model_management.set_training_time(runtime)  # run time
            self._model_management.set_rmse(metrics["RMSE"])
            self._model_management.set_algorithm_name("Neural Network (PyTorch)")
            self._model_management.set_validation_method(
                str(validationDict["displayName"]) + "(" +
                str(validationDict["value"]) + ")")  # validation method
            self._model_management.set_target_variable(result_column)  # target column name
            self._model_management.set_creation_date(data=str(
                datetime.now().strftime('%b %d ,%Y %H:%M ')))  # creation date
            self._model_management.set_datasetName(self._datasetName)

        modelManagementSummaryJson = [
            ["Project Name", self._model_management.get_job_type()],
            ["Algorithm", self._model_management.get_algorithm_name()],
            ["Training Status", self._model_management.get_training_status()],
            ["RMSE", self._model_management.get_rmse()],
            ["RunTime", self._model_management.get_training_time()],
            # ["Owner", None],
            ["Created On", self._model_management.get_creation_date()]
        ]
        if algoSetting.is_hyperparameter_tuning_enabled():
            modelManagementModelSettingsJson = []
        else:
            modelManagementModelSettingsJson = [
                ["Training Dataset", self._model_management.get_datasetName()],
                ["Target Column", self._model_management.get_target_variable()],
                ["Number Of Independent Variables",
                 self._model_management.get_no_of_independent_variables()],
                ["Algorithm", self._model_management.get_algorithm_name()],
                ["Model Validation", self._model_management.get_validation_method()],
                ["batch_size", str(self._model_management.get_batch_size())],
                ["Loss", self._model_management.get_loss_function()],
                ["Optimizer", self._model_management.get_optimizer()],
                ["Epochs", self._model_management.get_no_epochs()],
                ["Metrics", self._model_management.get_model_evaluation_metrics()]
            ]
        for i in modelmanagement_["hidden_layer_info"]:
            key = str(modelmanagement_["hidden_layer_info"][i]["layer"]) + " " + str(i) + ":"
            string = ""
            for j in modelmanagement_["hidden_layer_info"][i]:
                string += str(j) + ":" + str(modelmanagement_["hidden_layer_info"][i][j]) + ", "
            modelManagementModelSettingsJson.append([key, string])
        print(modelManagementModelSettingsJson)

        nnptrCards = [json.loads(CommonUtils.convert_python_object_to_json(cardObj))
                      for cardObj in MLUtils.create_model_summary_cards(self._model_summary)]
        nnptrPerformanceCards = [
            json.loads(CommonUtils.convert_python_object_to_json(cardObj))
            for cardObj in MLUtils.create_model_management_cards_regression(self._model_summary)]
        nnptrOverviewCards = [
            json.loads(CommonUtils.convert_python_object_to_json(cardObj))
            for cardObj in MLUtils.create_model_management_card_overview(
                self._model_management, modelManagementSummaryJson,
                modelManagementModelSettingsJson)]
        nnptrDeploymentCards = [
            json.loads(CommonUtils.convert_python_object_to_json(cardObj))
            for cardObj in MLUtils.create_model_management_deploy_empty_card()]

        nnptr_Overview_Node = NarrativesTree()
        nnptr_Overview_Node.set_name("Overview")
        nnptr_Performance_Node = NarrativesTree()
        nnptr_Performance_Node.set_name("Performance")
        nnptr_Deployment_Node = NarrativesTree()
        nnptr_Deployment_Node.set_name("Deployment")
        for card in nnptrOverviewCards:
            nnptr_Overview_Node.add_a_card(card)
        for card in nnptrPerformanceCards:
            nnptr_Performance_Node.add_a_card(card)
        for card in nnptrDeploymentCards:
            nnptr_Deployment_Node.add_a_card(card)
        for card in nnptrCards:
            self._prediction_narrative.add_a_card(card)

        self._result_setter.set_model_summary(
            {"Neural Network (PyTorch)": json.loads(
                CommonUtils.convert_python_object_to_json(self._model_summary))})
        self._result_setter.set_nnptr_regression_model_summary(modelSummaryJson)
        self._result_setter.set_nnptr_cards(nnptrCards)
        self._result_setter.set_nnptr_nodes(
            [nnptr_Overview_Node, nnptr_Performance_Node, nnptr_Deployment_Node])
        self._result_setter.set_nnptr_fail_card(
            {"Algorithm_Name": "Neural Network (PyTorch)", "Success": "True"})
        CommonUtils.create_update_and_save_progress_message(
            self._dataframe_context, self._scriptWeightDict, self._scriptStages,
            self._slug, "completion", "info",
            display=True, emptyBin=False, customMsg=None, weightKey="total")
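A condensed standalone version of the training loop above, with synthetic data and a fixed architecture instead of the layers_for_network config (all names here are illustrative):

import torch
import torch.utils.data as torch_data_utils

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
x = torch.rand(256, 8)
y = x.sum(dim=1, keepdim=True) + 0.1 * torch.randn(256, 1)
trainset = torch_data_utils.TensorDataset(x, y)
train_loader = torch_data_utils.DataLoader(trainset, batch_size=32, shuffle=True)

network = torch.nn.Sequential(
    torch.nn.Linear(8, 16), torch.nn.ReLU(), torch.nn.Linear(16, 1)).to(device)
criterion = torch.nn.MSELoss()
optimizer = torch.optim.Adam(network.parameters(), lr=1e-3)

for epoch in range(5):
    average_loss = 0.0
    for i, (inputs, labels) in enumerate(train_loader):
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()            # zero the parameter gradients
        loss = criterion(network(inputs), labels)
        loss.backward()                  # backward pass
        optimizer.step()                 # update weights
        average_loss += loss.item()
    print(epoch, average_loss / (i + 1))  # average loss per epoch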
import numpy as np
import xgboost
from sklearn import datasets
from sklearn2pmml import sklearn2pmml
from sklearn2pmml import PMMLPipeline

iris = datasets.load_iris()
X = iris.data
y = iris.target
feature_names = iris.feature_names

model = xgboost.XGBClassifier(learning_rate=0.1, n_estimators=10,
                              max_depth=10, silent=False)
iris_pipeline = PMMLPipeline([("classifier", model)])
iris_pipeline.active_fields = np.array(feature_names)
iris_pipeline.fit(X, y)
sklearn2pmml(iris_pipeline, "iris.pmml", with_repr=True, debug=True)
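A quick sanity check on the fitted pipeline itself is cheap before trusting the exported file; since PMMLPipeline is a sklearn Pipeline subclass, the usual prediction methods are available:

# Spot-check the in-memory pipeline against a few training rows.
print(iris_pipeline.predict(X[:5]))
print(iris_pipeline.predict_proba(X[:5]))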
def Train(self): st_global = time.time() CommonUtils.create_update_and_save_progress_message( self._dataframe_context, self._scriptWeightDict, self._scriptStages, self._slug, "initialization", "info", display=True, emptyBin=False, customMsg=None, weightKey="total") appType = self._dataframe_context.get_app_type() algosToRun = self._dataframe_context.get_algorithms_to_run() algoSetting = [ x for x in algosToRun if x.get_algorithm_slug() == self._slug ][0] categorical_columns = self._dataframe_helper.get_string_columns() uid_col = self._dataframe_context.get_uid_column() if self._metaParser.check_column_isin_ignored_suggestion(uid_col): categorical_columns = list(set(categorical_columns) - {uid_col}) allDateCols = self._dataframe_context.get_date_columns() categorical_columns = list(set(categorical_columns) - set(allDateCols)) print(categorical_columns) result_column = self._dataframe_context.get_result_column() numerical_columns = self._dataframe_helper.get_numeric_columns() numerical_columns = [ x for x in numerical_columns if x != result_column ] model_path = self._dataframe_context.get_model_path() if model_path.startswith("file"): model_path = model_path[7:] validationDict = self._dataframe_context.get_validation_dict() print("model_path", model_path) pipeline_filepath = "file://" + str(model_path) + "/" + str( self._slug) + "/pipeline/" model_filepath = "file://" + str(model_path) + "/" + str( self._slug) + "/model" pmml_filepath = "file://" + str(model_path) + "/" + str( self._slug) + "/modelPmml" df = self._data_frame if self._mlEnv == "spark": pass elif self._mlEnv == "sklearn": model_filepath = model_path + "/" + self._slug + "/model.pkl" x_train, x_test, y_train, y_test = self._dataframe_helper.get_train_test_data( ) x_train = MLUtils.create_dummy_columns( x_train, [x for x in categorical_columns if x != result_column]) x_test = MLUtils.create_dummy_columns( x_test, [x for x in categorical_columns if x != result_column]) x_test = MLUtils.fill_missing_columns(x_test, x_train.columns, result_column) st = time.time() CommonUtils.create_update_and_save_progress_message( self._dataframe_context, self._scriptWeightDict, self._scriptStages, self._slug, "training", "info", display=True, emptyBin=False, customMsg=None, weightKey="total") if algoSetting.is_hyperparameter_tuning_enabled(): pass else: self._result_setter.set_hyper_parameter_results( self._slug, None) evaluationMetricDict = algoSetting.get_evaluvation_metric( Type="Regression") evaluationMetricDict[ "displayName"] = GLOBALSETTINGS.SKLEARN_EVAL_METRIC_NAME_DISPLAY_MAP[ evaluationMetricDict["name"]] params_tf = algoSetting.get_tf_params_dict() algoParams = algoSetting.get_params_dict() algoParams = {k: v for k, v in list(algoParams.items())} model = tf.keras.models.Sequential() first_layer_flag = True for i in range(len(list( params_tf['hidden_layer_info'].keys()))): if params_tf['hidden_layer_info'][str( i)]["layer"] == "Dense": if first_layer_flag: model.add( tf.keras.layers.Dense( params_tf['hidden_layer_info'][str( i)]["units"], activation=params_tf['hidden_layer_info'][ str(i)]["activation"], input_shape=(len(x_train.columns), ), use_bias=params_tf['hidden_layer_info'][ str(i)]["use_bias"], kernel_initializer=params_tf[ 'hidden_layer_info'][str( i)]["kernel_initializer"], bias_initializer=params_tf[ 'hidden_layer_info'][str( i)]["bias_initializer"], kernel_regularizer=params_tf[ 'hidden_layer_info'][str( i)]["kernel_regularizer"], bias_regularizer=params_tf[ 'hidden_layer_info'][str( i)]["bias_regularizer"], 
activity_regularizer=params_tf[ 'hidden_layer_info'][str( i)]["activity_regularizer"], kernel_constraint=params_tf[ 'hidden_layer_info'][str( i)]["kernel_constraint"], bias_constraint=params_tf[ 'hidden_layer_info'][str( i)]["bias_constraint"])) try: if params_tf['hidden_layer_info'][str( i)]["batch_normalization"] == "True": model.add( tf.keras.layers.BatchNormalization()) except: print( "BATCH_NORM_FAILED ##########################" ) pass first_layer_flag = False else: model.add( tf.keras.layers.Dense( params_tf['hidden_layer_info'][str( i)]["units"], activation=params_tf['hidden_layer_info'][ str(i)]["activation"], use_bias=params_tf['hidden_layer_info'][ str(i)]["use_bias"], kernel_initializer=params_tf[ 'hidden_layer_info'][str( i)]["kernel_initializer"], bias_initializer=params_tf[ 'hidden_layer_info'][str( i)]["bias_initializer"], kernel_regularizer=params_tf[ 'hidden_layer_info'][str( i)]["kernel_regularizer"], bias_regularizer=params_tf[ 'hidden_layer_info'][str( i)]["bias_regularizer"], activity_regularizer=params_tf[ 'hidden_layer_info'][str( i)]["activity_regularizer"], kernel_constraint=params_tf[ 'hidden_layer_info'][str( i)]["kernel_constraint"], bias_constraint=params_tf[ 'hidden_layer_info'][str( i)]["bias_constraint"])) try: if params_tf['hidden_layer_info'][str( i)]["batch_normalization"] == "True": model.add( tf.keras.layers.BatchNormalization()) except: print( "BATCH_NORM_FAILED ##########################" ) pass elif params_tf['hidden_layer_info'][str( i)]["layer"] == "Dropout": model.add( tf.keras.layers.Dropout( float(params_tf['hidden_layer_info'][str(i)] ["rate"]))) elif params_tf['hidden_layer_info'][str( i)]["layer"] == "Lambda": if params_tf['hidden_layer_info'][str( i)]["lambda"] == "Addition": model.add( tf.keras.layers.Lambda(lambda x: x + int( params_tf['hidden_layer_info'][str(i)][ "units"]))) if params_tf['hidden_layer_info'][str( i)]["lambda"] == "Multiplication": model.add( tf.keras.layers.Lambda(lambda x: x * int( params_tf['hidden_layer_info'][str(i)][ "units"]))) if params_tf['hidden_layer_info'][str( i)]["lambda"] == "Subtraction": model.add( tf.keras.layers.Lambda(lambda x: x - int( params_tf['hidden_layer_info'][str(i)][ "units"]))) if params_tf['hidden_layer_info'][str( i)]["lambda"] == "Division": model.add( tf.keras.layers.Lambda(lambda x: old_div( x, int(params_tf['hidden_layer_info'][str(i)][ "units"])))) model.compile(optimizer=algoParams["optimizer"], loss=algoParams["loss"], metrics=[algoParams['metrics']]) model.fit(x_train, y_train, epochs=algoParams["number_of_epochs"], verbose=1, batch_size=algoParams["batch_size"]) bestEstimator = model print(model.summary()) trainingTime = time.time() - st y_score = bestEstimator.predict(x_test) y_score = list(y_score.flatten()) try: y_prob = bestEstimator.predict_proba(x_test) except: y_prob = [0] * len(y_score) featureImportance = {} objs = { "trained_model": bestEstimator, "actual": y_test, "predicted": y_score, "probability": y_prob, "feature_importance": featureImportance, "featureList": list(x_train.columns), "labelMapping": {} } #featureImportance = objs["trained_model"].feature_importances_ #featuresArray = [(col_name, featureImportance[idx]) for idx, col_name in enumerate(x_train.columns)] featuresArray = [] if not algoSetting.is_hyperparameter_tuning_enabled(): modelName = "M" + "0" * (GLOBALSETTINGS.MODEL_NAME_MAX_LENGTH - 1) + "1" modelFilepathArr = model_filepath.split("/")[:-1] modelFilepathArr.append(modelName + ".h5") objs["trained_model"].save("/".join(modelFilepathArr)) 
#joblib.dump(objs["trained_model"],"/".join(modelFilepathArr)) metrics = {} metrics["r2"] = r2_score(y_test, y_score) metrics["neg_mean_squared_error"] = mean_squared_error( y_test, y_score) metrics["neg_mean_absolute_error"] = mean_absolute_error( y_test, y_score) metrics["RMSE"] = sqrt(metrics["neg_mean_squared_error"]) metrics["explained_variance_score"] = explained_variance_score( y_test, y_score) transformed = pd.DataFrame({ "prediction": y_score, result_column: y_test }) transformed["difference"] = transformed[ result_column] - transformed["prediction"] transformed["mape"] = old_div( np.abs(transformed["difference"]) * 100, transformed[result_column]) sampleData = None nrows = transformed.shape[0] if nrows > 100: sampleData = transformed.sample(n=100, random_state=420) else: sampleData = transformed print(sampleData.head()) if transformed["mape"].max() > 100: GLOBALSETTINGS.MAPEBINS.append(transformed["mape"].max()) mapeCountArr = list( pd.cut(transformed["mape"], GLOBALSETTINGS.MAPEBINS). value_counts().to_dict().items()) GLOBALSETTINGS.MAPEBINS.pop(5) else: mapeCountArr = list( pd.cut(transformed["mape"], GLOBALSETTINGS.MAPEBINS). value_counts().to_dict().items()) mapeStatsArr = [(str(idx), dictObj) for idx, dictObj in enumerate( sorted([{ "count": x[1], "splitRange": (x[0].left, x[0].right) } for x in mapeCountArr], key=lambda x: x["splitRange"][0]))] print(mapeStatsArr) print(mapeCountArr) predictionColSummary = transformed["prediction"].describe( ).to_dict() quantileBins = [ predictionColSummary["min"], predictionColSummary["25%"], predictionColSummary["50%"], predictionColSummary["75%"], predictionColSummary["max"] ] print(quantileBins) quantileBins = sorted(list(set(quantileBins))) transformed["quantileBinId"] = pd.cut(transformed["prediction"], quantileBins) quantileDf = transformed.groupby("quantileBinId").agg({ "prediction": [np.sum, np.mean, np.size] }).reset_index() quantileDf.columns = ["prediction", "sum", "mean", "count"] print(quantileDf) quantileArr = list(quantileDf.T.to_dict().items()) quantileSummaryArr = [(obj[0], { "splitRange": (obj[1]["prediction"].left, obj[1]["prediction"].right), "count": obj[1]["count"], "mean": obj[1]["mean"], "sum": obj[1]["sum"] }) for obj in quantileArr] print(quantileSummaryArr) runtime = round((time.time() - st_global), 2) self._model_summary.set_model_type("regression") self._model_summary.set_algorithm_name( "Neural Network (TensorFlow)") self._model_summary.set_algorithm_display_name( "Neural Network (TensorFlow)") self._model_summary.set_slug(self._slug) self._model_summary.set_training_time(runtime) self._model_summary.set_training_time(trainingTime) self._model_summary.set_target_variable(result_column) self._model_summary.set_validation_method( validationDict["displayName"]) self._model_summary.set_model_evaluation_metrics(metrics) self._model_summary.set_model_params(params_tf) self._model_summary.set_quantile_summary(quantileSummaryArr) self._model_summary.set_mape_stats(mapeStatsArr) self._model_summary.set_sample_data(sampleData.to_dict()) self._model_summary.set_feature_importance(featuresArray) self._model_summary.set_feature_list(list(x_train.columns)) self._model_summary.set_model_mse( metrics["neg_mean_squared_error"]) self._model_summary.set_model_mae( metrics["neg_mean_absolute_error"]) self._model_summary.set_rmse(metrics["RMSE"]) self._model_summary.set_model_rsquared(metrics["r2"]) self._model_summary.set_model_exp_variance_score( metrics["explained_variance_score"]) try: pmml_filepath = str(model_path) + "/" + 
            modelPmmlPipeline = PMMLPipeline([
                ("pretrained-estimator", objs["trained_model"])
            ])
            modelPmmlPipeline.target_field = result_column
            modelPmmlPipeline.active_fields = np.array(
                [col for col in x_train.columns if col != result_column])
            sklearn2pmml(modelPmmlPipeline, pmml_filepath, with_repr=True)
            with open(pmml_filepath, "r") as pmmlfile:
                pmmlText = pmmlfile.read()
            self._result_setter.update_pmml_object({self._slug: pmmlText})
        except Exception:
            # a raw keras model is not an sklearn estimator, so the PMML
            # export is expected to fail; the .h5 file saved above remains
            # the usable artifact (see the save/load sketch after this method)
            pass

        # the summary payload is the same with or without hyperparameter
        # tuning; note that modelName is only assigned in the non-tuning path
        modelDropDownObj = {
            "name": self._model_summary.get_algorithm_name(),
            "evaluationMetricValue": metrics[evaluationMetricDict["name"]],
            "evaluationMetricName": evaluationMetricDict["name"],
            "slug": self._model_summary.get_slug(),
            "Model Id": modelName
        }
        modelSummaryJson = {
            "dropdown": modelDropDownObj,
            "levelcount": self._model_summary.get_level_counts(),
            "modelFeatureList": self._model_summary.get_feature_list(),
            "levelMapping": self._model_summary.get_level_map_dict(),
            "slug": self._model_summary.get_slug(),
            "name": self._model_summary.get_algorithm_name()
        }

        # copy before updating so params_tf, already stored in the model
        # summary above, is not mutated
        modelmanagement_ = dict(params_tf)
        modelmanagement_.update(algoParams)
        self._model_management = MLModelSummary()
        if not algoSetting.is_hyperparameter_tuning_enabled():
            self._model_management.set_layer_info(data=modelmanagement_['hidden_layer_info'])
            self._model_management.set_loss_function(data=modelmanagement_['loss'])
            self._model_management.set_optimizer(data=modelmanagement_['optimizer'])
            self._model_management.set_batch_size(data=modelmanagement_['batch_size'])
            self._model_management.set_no_epochs(data=modelmanagement_['number_of_epochs'])
            self._model_management.set_model_evaluation_metrics(data=modelmanagement_['metrics'])
        self._model_management.set_job_type(self._dataframe_context.get_job_name())  # project name
        self._model_management.set_training_status(data="completed")  # training status
        self._model_management.set_no_of_independent_variables(data=x_train)  # number of independent variables
        self._model_management.set_training_time(runtime)  # total run time
        self._model_management.set_rmse(metrics["RMSE"])
        self._model_management.set_algorithm_name("Neural Network (TensorFlow)")  # algorithm name
        self._model_management.set_validation_method(
            str(validationDict["displayName"]) + "(" + str(validationDict["value"]) + ")")  # validation method
        self._model_management.set_target_variable(result_column)  # target column name
        self._model_management.set_creation_date(
            data=str(datetime.now().strftime('%b %d ,%Y %H:%M ')))  # creation date
        self._model_management.set_datasetName(self._datasetName)
        modelManagementSummaryJson = [
            ["Project Name", self._model_management.get_job_type()],
            ["Algorithm", self._model_management.get_algorithm_name()],
            ["Training Status", self._model_management.get_training_status()],
            ["RMSE", self._model_management.get_rmse()],
            ["RunTime", self._model_management.get_training_time()],
            # ["Owner", None],
            ["Created On", self._model_management.get_creation_date()]
        ]
        if algoSetting.is_hyperparameter_tuning_enabled():
            modelManagementModelSettingsJson = []
        else:
            modelManagementModelSettingsJson = [
                ["Training Dataset", self._model_management.get_datasetName()],
                ["Target Column", self._model_management.get_target_variable()],
                ["Number Of Independent Variables",
                 self._model_management.get_no_of_independent_variables()],
                ["Algorithm", self._model_management.get_algorithm_name()],
                ["Model Validation", self._model_management.get_validation_method()],
                ["batch_size", str(self._model_management.get_batch_size())],
                ["Loss", self._model_management.get_loss_function()],
                ["Optimizer", self._model_management.get_optimizer()],
                ["Epochs", self._model_management.get_no_epochs()],
                ["Metrics", self._model_management.get_model_evaluation_metrics()]
            ]
            # flatten every layer's settings into "layer No-<i>-<type>-<param>:" rows
            for i in range(len(modelmanagement_['hidden_layer_info'])):
                layer_info = modelmanagement_["hidden_layer_info"][str(i)]
                key = "layer No-" + str(i) + "-" + str(layer_info["layer"]) + "-"
                for j in layer_info:
                    modelManagementModelSettingsJson.append([key + j + ":", layer_info[j]])
        print(modelManagementModelSettingsJson)

        tfregCards = [
            json.loads(CommonUtils.convert_python_object_to_json(cardObj))
            for cardObj in MLUtils.create_model_summary_cards(self._model_summary)
        ]
        tfregPerformanceCards = [
            json.loads(CommonUtils.convert_python_object_to_json(cardObj))
            for cardObj in MLUtils.create_model_management_cards_regression(self._model_summary)
        ]
        tfregOverviewCards = [
            json.loads(CommonUtils.convert_python_object_to_json(cardObj))
            for cardObj in MLUtils.create_model_management_card_overview(
                self._model_management, modelManagementSummaryJson,
                modelManagementModelSettingsJson)
        ]
        tfregDeploymentCards = [
            json.loads(CommonUtils.convert_python_object_to_json(cardObj))
            for cardObj in MLUtils.create_model_management_deploy_empty_card()
        ]
        TFReg_Overview_Node = NarrativesTree()
        TFReg_Overview_Node.set_name("Overview")
        TFReg_Performance_Node = NarrativesTree()
        TFReg_Performance_Node.set_name("Performance")
        TFReg_Deployment_Node = NarrativesTree()
        TFReg_Deployment_Node.set_name("Deployment")
        for card in tfregOverviewCards:
            TFReg_Overview_Node.add_a_card(card)
        for card in tfregPerformanceCards:
            TFReg_Performance_Node.add_a_card(card)
        for card in tfregDeploymentCards:
            TFReg_Deployment_Node.add_a_card(card)
        for card in tfregCards:
            self._prediction_narrative.add_a_card(card)
        self._result_setter.set_model_summary({
            "Neural Network (TensorFlow)": json.loads(
                CommonUtils.convert_python_object_to_json(self._model_summary))
        })
        self._result_setter.set_tfreg_regression_model_summart(modelSummaryJson)
        self._result_setter.set_tfreg_cards(tfregCards)
        self._result_setter.set_tfreg_nodes([
            TFReg_Overview_Node, TFReg_Performance_Node, TFReg_Deployment_Node
        ])
        self._result_setter.set_tfreg_fail_card({
            "Algorithm_Name": "Neural Network (TensorFlow)",
            "Success": "True"
        })
        CommonUtils.create_update_and_save_progress_message(
            self._dataframe_context, self._scriptWeightDict, self._scriptStages,
            self._slug, "completion", "info",
            display=True, emptyBin=False, customMsg=None, weightKey="total")
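
# A minimal, self-contained sketch of the string-keyed `hidden_layer_info`
# config that the layer-building loop above consumes. The keys and values
# shown here are assumptions inferred from that loop, not the full schema
# used by the surrounding script.
import tensorflow as tf

hidden_layer_info = {
    "0": {"layer": "Dense", "units": 64, "activation": "relu",
          "batch_normalization": "True"},
    "1": {"layer": "Dropout", "rate": "0.2"},
    "2": {"layer": "Dense", "units": 1, "activation": "linear",
          "batch_normalization": "False"},
}

model = tf.keras.Sequential()
for idx in range(len(hidden_layer_info)):
    info = hidden_layer_info[str(idx)]
    if info["layer"] == "Dense":
        model.add(tf.keras.layers.Dense(info["units"],
                                        activation=info["activation"]))
        if info.get("batch_normalization") == "True":
            model.add(tf.keras.layers.BatchNormalization())
    elif info["layer"] == "Dropout":
        model.add(tf.keras.layers.Dropout(float(info["rate"])))
model.compile(optimizer="adam", loss="mse", metrics=["mae"])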
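
# A standalone sketch of the MAPE binning performed above, on toy data; the
# bin edges are an assumption standing in for GLOBALSETTINGS.MAPEBINS.
import pandas as pd

actual = pd.Series([100.0, 120.0, 80.0, 95.0])
predicted = pd.Series([110.0, 100.0, 85.0, 90.0])
mape = (actual - predicted).abs() * 100 / actual

mape_bins = [0, 5, 15, 25, 50, 100]  # assumed edges
# count how many predictions fall into each MAPE range
print(pd.cut(mape, mape_bins).value_counts().sort_index())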
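
# PMMLPipeline expects an sklearn-compatible estimator, so wrapping a raw
# tf.keras model as done above will typically raise inside sklearn2pmml and
# fall through to the bare except. A minimal sketch of the native keras
# save/load round-trip that does work (paths are illustrative):
import tensorflow as tf

model = tf.keras.Sequential([tf.keras.layers.Dense(1, input_shape=(4,))])
model.compile(optimizer="adam", loss="mse")
model.save("trained_model.h5")  # HDF5, as the script writes after training
restored = tf.keras.models.load_model("trained_model.h5")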