def dtRegression(df, conf):
    """
    input  : df [spark.dataframe], conf [configuration params]
    output : decision tree regression model [model]
    """
    featuresCol = conf["params"].get("featuresCol")
    impurity = conf["params"].get("impurity", "variance")
    maxDepth = conf["params"].get("maxDepth", 5)
    maxBins = conf["params"].get("maxBins", 32)
    minInstancesPerNode = conf["params"].get("minInstancesPerNode", 1)
    minInfoGain = conf["params"].get("minInfoGain", 0.0)
    maxMemoryInMB = conf["params"].get("maxMemoryInMB", 256)
    cacheNodeIds = conf["params"].get("cacheNodeIds", False)
    checkpointInterval = conf["params"].get("checkpointInterval", 10)
    seed = conf["params"].get("seed", None)
    varianceCol = conf["params"].get("varianceCol", None)

    dt = DecisionTreeRegressor(maxDepth=maxDepth, featuresCol=featuresCol)
    # featureIndexer is assumed to be defined elsewhere in the module
    pipeline = Pipeline(stages=[featureIndexer, dt])

    print("maxDepth : ", dt.getMaxDepth())

    # with ml-tuning
    if conf["tuning"]:
        # ml-tuning via cross validation
        if conf["tuning"].get("method").lower() == "crossval":
            paramGrids = conf["tuning"].get("paramGrids")
            pg = ParamGridBuilder()
            for key in paramGrids:
                # resolve the Param object from its name before adding it to the grid
                pg.addGrid(getattr(dt, key), paramGrids[key])
            grid = pg.build()
            folds = conf["tuning"].get("methodParam")
            evaluator = RegressionEvaluator()
            cv = CrossValidator(estimator=pipeline, estimatorParamMaps=grid,
                                evaluator=evaluator, numFolds=folds)
            model = cv.fit(df)
        # ml-tuning via train-validation split
        elif conf["tuning"].get("method").lower() == "trainvalsplit":
            paramGrids = conf["tuning"].get("paramGrids")
            pg = ParamGridBuilder()
            for key in paramGrids:
                pg.addGrid(getattr(dt, key), paramGrids[key])
            grid = pg.build()
            tr = conf["tuning"].get("methodParam")
            evaluator = RegressionEvaluator()
            tvs = TrainValidationSplit(estimator=pipeline, estimatorParamMaps=grid,
                                       evaluator=evaluator, trainRatio=tr)
            model = tvs.fit(df)
    # without ml-tuning
    else:
        model = pipeline.fit(df)
    return model
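# Hedged usage sketch for the conf-driven trainers in this file: the exact schema
# of `conf` is defined by the surrounding project; the keys below simply mirror the
# ones dtRegression reads, and the values are illustrative only.
dt_conf = {
    "params": {"featuresCol": "features", "maxDepth": 5},
    "tuning": {
        "method": "crossval",      # or "trainvalsplit"
        "methodParam": 3,          # numFolds for crossval, trainRatio for trainvalsplit
        "paramGrids": {"maxDepth": [3, 5, 7]}
    }
}
# model = dtRegression(train_df, dt_conf)  # train_df assumed to exist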
def estimator_gridbuilder(estimator, paramnames_values):
    """Help to abbreviate ParamGridBuilder construction from a dict."""
    pgb = ParamGridBuilder()
    for pn, vals in paramnames_values.items():
        assert hasattr(vals, '__iter__'), "List of values required for each parameter name"
        pgb.addGrid(estimator.getParam(pn), vals)
    return estimator, pgb
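# Minimal usage sketch for estimator_gridbuilder (the estimator and parameter
# names below are only illustrative assumptions, not part of the original code).
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator

lr = LogisticRegression(featuresCol="features", labelCol="label")
est, pgb = estimator_gridbuilder(lr, {"regParam": [0.0, 0.1], "maxIter": [50, 100]})
cv = CrossValidator(estimator=est,
                    estimatorParamMaps=pgb.build(),
                    evaluator=BinaryClassificationEvaluator(),
                    numFolds=3)
# cv_model = cv.fit(train_df)  # train_df assumed to exist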
def logisticClassifier(df, conf):
    feature_col = conf["params"].get("featuresCol", "features")
    label_col = conf["params"].get("labelCol", "label")
    pred_col = conf["params"].get("predictionCol", "prediction")
    prob_col = conf["params"].get("probabilityCol", "probability")
    max_iter = conf["params"].get("maxIter", 100)
    reg_param = conf["params"].get("regParam", 0.0)
    elasticNet_param = conf["params"].get("elasticNetParam", 0.0)
    tolr = conf["params"].get("tol", 1e-6)
    fit_intercept = conf["params"].get("fitIntercept", True)
    thres = conf["params"].get("threshold", 0.5)
    thresh = conf["params"].get("thresholds", None)
    std = conf["params"].get("standardization", True)
    weight = conf["params"].get("weightCol", None)
    aggr = conf["params"].get("aggregationDepth", 2)
    fml = conf["params"].get("family", "auto")

    lr = LogisticRegression(maxIter=max_iter, regParam=reg_param,
                            elasticNetParam=elasticNet_param, tol=tolr,
                            fitIntercept=fit_intercept, threshold=thres,
                            standardization=std, aggregationDepth=aggr, family=fml)

    if conf["tuning"]:
        if conf["tuning"].get("method").lower() == "crossval":
            logReg = LogisticRegression()
            paramGrids = conf["tuning"].get("paramGrids")
            folds = conf["tuning"].get("methodParam", 2)
            pg = ParamGridBuilder()
            for key in paramGrids:
                # resolve the Param object from its name before adding it to the grid
                pg.addGrid(getattr(logReg, key), paramGrids[key])
            grid = pg.build()
            evaluator = BinaryClassificationEvaluator()
            cv = CrossValidator(estimator=logReg, estimatorParamMaps=grid,
                                evaluator=evaluator, numFolds=folds)
            model = cv.fit(df)
        elif conf["tuning"].get("method").lower() == "trainvalsplit":
            paramGrids = conf["tuning"].get("paramGrids")
            tr = conf["tuning"].get("methodParam", 0.8)
            pg = ParamGridBuilder()
            for key in paramGrids:
                pg.addGrid(getattr(lr, key), paramGrids[key])
            grid = pg.build()
            evaluator = BinaryClassificationEvaluator()
            tvs = TrainValidationSplit(estimator=lr, estimatorParamMaps=grid,
                                       evaluator=evaluator, trainRatio=tr)
            model = tvs.fit(df)
    else:
        model = lr.fit(df)
    return model
def randomforestRegression(df, conf):
    """
    input  : - Dataframe train (df)
             - Hyperparameter configuration (conf)
    output : - Random Forest Regression Model
    """
    # set params with default values (if value isn't set in rfr_params)
    feature_col = conf["params"].get("featuresCol", "features")
    label_col = conf["params"].get("labelCol", "label")
    pred_col = conf["params"].get("predictionCol", "prediction")
    max_depth = conf["params"].get("maxDepth", 5)
    num_trees = conf["params"].get("numTrees", 20)
    max_bins = conf["params"].get("maxBins", 32)
    seed = conf["params"].get("seed", None)
    minInstancesPerNode = conf["params"].get("minInstancesPerNode", 1)
    minInfoGain = conf["params"].get("minInfoGain", 0.0)
    maxMemoryInMB = conf["params"].get("maxMemoryInMB", 256)
    cacheNodeIds = conf["params"].get("cacheNodeIds", False)
    checkpointInterval = conf["params"].get("checkpointInterval", 10)
    impurity = conf["params"].get("impurity", "variance")
    subSamplingRate = conf["params"].get("subsamplingRate", 1.0)
    featureSubsetStrategy = conf["params"].get("featureSubsetStrategy", "auto")

    rfr = RandomForestRegressor(featuresCol=feature_col, labelCol=label_col,
                                predictionCol=pred_col, maxDepth=max_depth,
                                numTrees=num_trees, impurity=impurity)
    # featureIndexer is assumed to be defined elsewhere in the module
    pipeline = Pipeline(stages=[featureIndexer, rfr])

    if conf["tuning"]:
        if conf["tuning"].get("method").lower() == "crossval":
            folds = conf["tuning"].get("methodParam", 4)
            # Set the hyperparameters that we want to grid, e.g. maxDepth and numTrees
            paramGrids = conf["tuning"].get("paramGrids")
            pg = ParamGridBuilder()
            for key in paramGrids:
                # resolve the Param object from its name before adding it to the grid
                pg.addGrid(getattr(rfr, key), paramGrids[key])
            grid = pg.build()
            evaluator = RegressionEvaluator()
            cv = CrossValidator(estimator=rfr, estimatorParamMaps=grid,
                                evaluator=evaluator, numFolds=folds)
            model = cv.fit(df)
        elif conf["tuning"].get("method").lower() == "trainvalsplit":
            tr = conf["tuning"].get("methodParam", 0.8)
            # Set the hyperparameters that we want to grid, e.g. maxDepth and numTrees
            paramGrids = conf["tuning"].get("paramGrids")
            pg = ParamGridBuilder()
            for key in paramGrids:
                pg.addGrid(getattr(rfr, key), paramGrids[key])
            grid = pg.build()
            evaluator = RegressionEvaluator()
            tvs = TrainValidationSplit(estimator=rfr, estimatorParamMaps=grid,
                                       evaluator=evaluator, trainRatio=tr)
            model = tvs.fit(df)
    else:
        model = pipeline.fit(df)
    return model
def aftsurvivalRegression(df, conf):
    """
    AFT Survival Regression training
    input  : - Dataframe of training (df)
             - tuning and hyperparameter configuration (conf)
    output : - AFT survival regression model (model)
    """
    feature_col = conf["params"].get("featuresCol", "features")
    label_col = conf["params"].get("labelCol", "label")
    pred_col = conf["params"].get("predictionCol", "prediction")
    cens_col = conf["params"].get("censorCol", "censor")
    fit_intercept = conf["params"].get("fitIntercept", True)
    max_iter = conf["params"].get("maxIter", 100)
    tol = conf["params"].get("tol", 1e-6)
    quant_p = conf["params"].get("quantileProbabilities",
                                 [0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99])
    quant_col = conf["params"].get("quantilesCol", None)
    agg_depth = conf["params"].get("aggregationDepth", 2)

    afts = AFTSurvivalRegression(featuresCol=feature_col, labelCol=label_col,
                                 predictionCol=pred_col, censorCol=cens_col,
                                 maxIter=max_iter, fitIntercept=fit_intercept,
                                 tol=tol, aggregationDepth=agg_depth)

    if conf["tuning"]:
        if conf["tuning"].get("method").lower() == "crossval":
            folds = conf["tuning"].get("methodParam", 2)
            # Set the hyperparameters that we want to grid, e.g. maxIter and aggregationDepth
            paramGrids = conf["tuning"].get("paramGrids")
            pg = ParamGridBuilder()
            for key in paramGrids:
                # resolve the Param object from its name before adding it to the grid
                pg.addGrid(getattr(afts, key), paramGrids[key])
            grid = pg.build()
            evaluator = RegressionEvaluator()
            cv = CrossValidator(estimator=afts, estimatorParamMaps=grid,
                                evaluator=evaluator, numFolds=folds)
            model = cv.fit(df)
        elif conf["tuning"].get("method").lower() == "trainvalsplit":
            tr = conf["tuning"].get("methodParam", 0.8)
            # Set the hyperparameters that we want to grid, e.g. maxIter and aggregationDepth
            paramGrids = conf["tuning"].get("paramGrids")
            pg = ParamGridBuilder()
            for key in paramGrids:
                pg.addGrid(getattr(afts, key), paramGrids[key])
            grid = pg.build()
            evaluator = RegressionEvaluator()
            tvs = TrainValidationSplit(estimator=afts, estimatorParamMaps=grid,
                                       evaluator=evaluator, trainRatio=tr)
            model = tvs.fit(df)
    else:
        model = afts.fit(df)
    return model
def _build_param_grid(self):
    param_grid_builder = ParamGridBuilder()
    param_grid_builder.addGrid(self.tokenizer.tokenizer, self.tokenizer_map)
    param_grid_builder.addGrid(self.ngram.n, self.ngram_map)
    param_grid_builder.addGrid(self.hashing_tf.numFeatures, self.hashing_tf_map)
    param_grid_builder.addGrid(self.clf.regParam, self.clf_map)
    return param_grid_builder.build()
def _inner_search(self, estimator_family, train, test, model_key, model_class,
                  fixed_params, hyper_params, scoring, greater_is_better, cv,
                  *args, **kwargs):
    model_object = model_class(labelCol=kwargs['variable_to_predict'],
                               **fixed_params.get(model_key, {}))
    tuned_parameters = hyper_params.get(model_key, {})
    # build hyper-parameter grid:
    pgb = ParamGridBuilder()
    for tp_key in tuned_parameters:
        pgb = pgb.addGrid(getattr(model_object, tp_key), tuned_parameters[tp_key])
    param_map = pgb.build()
    # run cross validator:
    evaluator = self._get_scorer(estimator_family, scoring, *args, **kwargs)
    cv_f = CrossValidationSpark(estimator=model_object, estimatorParamMaps=param_map,
                                evaluator=evaluator, cv=cv)
    hps, metrics = cv_f.fit(train)
    return hps, self._calc_cv_df(model_key, metrics, param_map)
def hyperparameter_tuned_model(clf, train_df):
    pipeline = Pipeline(stages=[clf])
    paramGrid = ParamGridBuilder()
    for i in extra_config:
        if i == 'numFolds':
            continue
        paramGrid = paramGrid.addGrid(eval('clf.' + i), extra_config[i])
    paramGrid = paramGrid.build()
    evaluator = MulticlassClassificationEvaluator()
    if tuning_method == 'CrossValidator':
        if 'numFolds' in extra_config:
            numFolds = extra_config['numFolds']
        else:
            numFolds = 3  # default
        val_model = CrossValidator(estimator=pipeline, estimatorParamMaps=paramGrid,
                                   evaluator=evaluator, numFolds=numFolds, seed=seed)
    if tuning_method == 'TrainValidationSplit':
        val_model = TrainValidationSplit(
            estimator=pipeline,
            estimatorParamMaps=paramGrid,
            evaluator=evaluator,
            seed=seed,
            # 80% of the data will be used for training, 20% for validation.
            trainRatio=1 - test_size)
    # Run the tuning search and choose the best set of parameters.
    return val_model.fit(train_df)
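# Hedged usage sketch: `extra_config`, `tuning_method`, `seed`, and `test_size`
# are module-level settings assumed by the function above; the values here are
# purely illustrative.
from pyspark.ml.classification import RandomForestClassifier

tuning_method = 'CrossValidator'
seed = 42
test_size = 0.2
extra_config = {'maxDepth': [5, 10], 'numTrees': [20, 50], 'numFolds': 3}

clf = RandomForestClassifier(labelCol='label', featuresCol='features')
# best_model = hyperparameter_tuned_model(clf, train_df)  # train_df assumed to exist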
def build_dag(self, paramMaps):
    # Type of tree search to use when executing the dag
    tree = BFSTree if self.tree_type.lower() == "bfs" else DFSTree
    stages = self.getStages()
    stage_prev = None

    # Locate the last estimator; training data will not be transformed after fitting it
    for i, stage in enumerate(stages):
        if isinstance(stage, Estimator):
            last_est_index = i

    nodes = []
    roots = []
    stage_nodes = {}
    for i, stage in enumerate(stages):
        # Determine what type of Node to use for the stage
        if isinstance(stage, Estimator):
            if i < last_est_index:
                Node = tree.FeatureExtractionEstimatorNode
            else:
                Node = tree.EstimatorNode
        else:
            Node = tree.TransformerNode

        # Separate the ParamMaps that belong to this stage
        temp_map = {}
        for param_map in paramMaps:
            for k, v in param_map.items():
                if k.parent == stage.uid:
                    temp_map.setdefault(k, set()).add(v)

        # Check whether we have a param grid for this stage
        if temp_map:
            grid_builder = ParamGridBuilder()
            for k, v in temp_map.items():
                grid_builder.addGrid(k, v)
            stage_param_grid = grid_builder.build()
            new_nodes = [Node(stage, param_map) for param_map in stage_param_grid]
        else:
            new_nodes = [Node(stage, {})]

        # Make child nodes for each node of the parent stage
        if stage_prev:
            temp_nodes = []
            parent_nodes = stage_nodes[stage_prev]
            for parent_node in parent_nodes:
                for node in new_nodes:
                    child_node = copy.copy(node)
                    child_node.children = []
                    child_node.parent = parent_node
                    parent_node.children.append(child_node)
                    temp_nodes.append(child_node)
            new_nodes = temp_nodes
        else:
            roots += new_nodes

        # Store all new nodes created for this stage
        stage_nodes[stage] = new_nodes
        nodes += new_nodes
        stage_prev = stage

    return roots, nodes
def Train(self): st_global = time.time() CommonUtils.create_update_and_save_progress_message( self._dataframe_context, self._scriptWeightDict, self._scriptStages, self._slug, "initialization", "info", display=True, emptyBin=False, customMsg=None, weightKey="total") algosToRun = self._dataframe_context.get_algorithms_to_run() algoSetting = [ x for x in algosToRun if x.get_algorithm_slug() == self._slug ][0] categorical_columns = self._dataframe_helper.get_string_columns() uid_col = self._dataframe_context.get_uid_column() if self._metaParser.check_column_isin_ignored_suggestion(uid_col): categorical_columns = list(set(categorical_columns) - {uid_col}) allDateCols = self._dataframe_context.get_date_columns() categorical_columns = list(set(categorical_columns) - set(allDateCols)) numerical_columns = self._dataframe_helper.get_numeric_columns() result_column = self._dataframe_context.get_result_column() categorical_columns = [ x for x in categorical_columns if x != result_column ] appType = self._dataframe_context.get_app_type() model_path = self._dataframe_context.get_model_path() if model_path.startswith("file"): model_path = model_path[7:] validationDict = self._dataframe_context.get_validation_dict() print("model_path", model_path) pipeline_filepath = "file://" + str(model_path) + "/" + str( self._slug) + "/pipeline/" model_filepath = "file://" + str(model_path) + "/" + str( self._slug) + "/model" pmml_filepath = "file://" + str(model_path) + "/" + str( self._slug) + "/modelPmml" df = self._data_frame levels = df.select(result_column).distinct().count() appType = self._dataframe_context.get_app_type() model_filepath = model_path + "/" + self._slug + "/model" pmml_filepath = str(model_path) + "/" + str( self._slug) + "/traindeModel.pmml" CommonUtils.create_update_and_save_progress_message( self._dataframe_context, self._scriptWeightDict, self._scriptStages, self._slug, "training", "info", display=True, emptyBin=False, customMsg=None, weightKey="total") st = time.time() pipeline = MLUtils.create_pyspark_ml_pipeline(numerical_columns, categorical_columns, result_column) trainingData, validationData = MLUtils.get_training_and_validation_data( df, result_column, 0.8) # indexed labelIndexer = StringIndexer(inputCol=result_column, outputCol="label") # OriginalTargetconverter = IndexToString(inputCol="label", outputCol="originalTargetColumn") # Label Mapping and Inverse labelIdx = labelIndexer.fit(trainingData) labelMapping = {k: v for k, v in enumerate(labelIdx.labels)} inverseLabelMapping = { v: float(k) for k, v in enumerate(labelIdx.labels) } if self._dataframe_context.get_trainerMode() == "autoML": automl_enable = True else: automl_enable = False clf = NaiveBayes() if not algoSetting.is_hyperparameter_tuning_enabled(): algoParams = algoSetting.get_params_dict() else: algoParams = algoSetting.get_params_dict_hyperparameter() print("=" * 100) print(algoParams) print("=" * 100) clfParams = [prm.name for prm in clf.params] algoParams = { getattr(clf, k): v if isinstance(v, list) else [v] for k, v in algoParams.items() if k in clfParams } #print("="*100) #print("ALGOPARAMS - ",algoParams) #print("="*100) paramGrid = ParamGridBuilder() # if not algoSetting.is_hyperparameter_tuning_enabled(): # for k,v in algoParams.items(): # if v == [None] * len(v): # continue # if k.name == 'thresholds': # paramGrid = paramGrid.addGrid(k,v[0]) # else: # paramGrid = paramGrid.addGrid(k,v) # paramGrid = paramGrid.build() # if not algoSetting.is_hyperparameter_tuning_enabled(): for k, v in algoParams.items(): 
print(k, v) if v == [None] * len(v): continue paramGrid = paramGrid.addGrid(k, v) paramGrid = paramGrid.build() # else: # for k,v in algoParams.items(): # print k.name, v # if v[0] == [None] * len(v[0]): # continue # paramGrid = paramGrid.addGrid(k,v[0]) # paramGrid = paramGrid.build() #print("="*143) #print("PARAMGRID - ", paramGrid) #print("="*143) if len(paramGrid) > 1: hyperParamInitParam = algoSetting.get_hyperparameter_params() evaluationMetricDict = { "name": hyperParamInitParam["evaluationMetric"] } evaluationMetricDict[ "displayName"] = GLOBALSETTINGS.SKLEARN_EVAL_METRIC_NAME_DISPLAY_MAP[ evaluationMetricDict["name"]] else: evaluationMetricDict = { "name": GLOBALSETTINGS.CLASSIFICATION_MODEL_EVALUATION_METRIC } evaluationMetricDict[ "displayName"] = GLOBALSETTINGS.SKLEARN_EVAL_METRIC_NAME_DISPLAY_MAP[ evaluationMetricDict["name"]] self._result_setter.set_hyper_parameter_results(self._slug, None) if validationDict["name"] == "kFold": numFold = int(validationDict["value"]) estimator = Pipeline(stages=[pipeline, labelIndexer, clf]) if algoSetting.is_hyperparameter_tuning_enabled(): modelFilepath = "/".join(model_filepath.split("/")[:-1]) pySparkHyperParameterResultObj = PySparkGridSearchResult( estimator, paramGrid, appType, modelFilepath, levels, evaluationMetricDict, trainingData, validationData, numFold, self._targetLevel, labelMapping, inverseLabelMapping, df) resultArray = pySparkHyperParameterResultObj.train_and_save_classification_models( ) self._result_setter.set_hyper_parameter_results( self._slug, resultArray) self._result_setter.set_metadata_parallel_coordinates( self._slug, { "ignoreList": pySparkHyperParameterResultObj.get_ignore_list(), "hideColumns": pySparkHyperParameterResultObj.get_hide_columns(), "metricColName": pySparkHyperParameterResultObj. get_comparison_metric_colname(), "columnOrder": pySparkHyperParameterResultObj.get_keep_columns() }) bestModel = pySparkHyperParameterResultObj.getBestModel() prediction = pySparkHyperParameterResultObj.getBestPrediction() else: if automl_enable: paramGrid = (ParamGridBuilder().addGrid( clf.smoothing, [1.0, 0.2]).build()) crossval = CrossValidator( estimator=estimator, estimatorParamMaps=paramGrid, evaluator=BinaryClassificationEvaluator() if levels == 2 else MulticlassClassificationEvaluator(), numFolds=3 if numFold is None else numFold) # use 3+ folds in practice cvnb = crossval.fit(trainingData) prediction = cvnb.transform(validationData) bestModel = cvnb.bestModel else: train_test_ratio = float( self._dataframe_context.get_train_test_split()) estimator = Pipeline(stages=[pipeline, labelIndexer, clf]) if algoSetting.is_hyperparameter_tuning_enabled(): modelFilepath = "/".join(model_filepath.split("/")[:-1]) pySparkHyperParameterResultObj = PySparkTrainTestResult( estimator, paramGrid, appType, modelFilepath, levels, evaluationMetricDict, trainingData, validationData, train_test_ratio, self._targetLevel, labelMapping, inverseLabelMapping, df) resultArray = pySparkHyperParameterResultObj.train_and_save_classification_models( ) self._result_setter.set_hyper_parameter_results( self._slug, resultArray) self._result_setter.set_metadata_parallel_coordinates( self._slug, { "ignoreList": pySparkHyperParameterResultObj.get_ignore_list(), "hideColumns": pySparkHyperParameterResultObj.get_hide_columns(), "metricColName": pySparkHyperParameterResultObj. 
get_comparison_metric_colname(), "columnOrder": pySparkHyperParameterResultObj.get_keep_columns() }) bestModel = pySparkHyperParameterResultObj.getBestModel() prediction = pySparkHyperParameterResultObj.getBestPrediction() else: tvs = TrainValidationSplit( estimator=estimator, estimatorParamMaps=paramGrid, evaluator=BinaryClassificationEvaluator() if levels == 2 else MulticlassClassificationEvaluator(), trainRatio=train_test_ratio) tvspnb = tvs.fit(trainingData) prediction = tvspnb.transform(validationData) bestModel = tvspnb.bestModel modelmanagement_ = { param[0].name: param[1] for param in bestModel.stages[2].extractParamMap().items() } MLUtils.save_pipeline_or_model(bestModel, model_filepath) predsAndLabels = prediction.select(['prediction', 'label']).rdd.map(tuple) # label_classes = prediction.select("label").distinct().collect() # label_classes = prediction.agg((F.collect_set('label').alias('label'))).first().asDict()['label'] #results = transformed.select(["prediction","label"]) # if len(label_classes) > 2: # metrics = MulticlassMetrics(predsAndLabels) # accuracy of the model # else: # metrics = BinaryClassificationMetrics(predsAndLabels) posLabel = inverseLabelMapping[self._targetLevel] metrics = MulticlassMetrics(predsAndLabels) trainingTime = time.time() - st f1_score = metrics.fMeasure(inverseLabelMapping[self._targetLevel], 1.0) precision = metrics.precision(inverseLabelMapping[self._targetLevel]) recall = metrics.recall(inverseLabelMapping[self._targetLevel]) accuracy = metrics.accuracy print(f1_score, precision, recall, accuracy) #gain chart implementation def cal_prob_eval(x): if len(x) == 1: if x == posLabel: return (float(x[1])) else: return (float(1 - x[1])) else: return (float(x[int(posLabel)])) column_name = 'probability' def y_prob_for_eval_udf(): return udf(lambda x: cal_prob_eval(x)) prediction = prediction.withColumn( "y_prob_for_eval", y_prob_for_eval_udf()(col(column_name))) try: pys_df = prediction.select( ['y_prob_for_eval', 'prediction', 'label']) gain_lift_ks_obj = GainLiftKS(pys_df, 'y_prob_for_eval', 'prediction', 'label', posLabel, self._spark) gain_lift_KS_dataframe = gain_lift_ks_obj.Run().toPandas() except: try: temp_df = pys_df.toPandas() gain_lift_ks_obj = GainLiftKS(temp_df, 'y_prob_for_eval', 'prediction', 'label', posLabel, self._spark) gain_lift_KS_dataframe = gain_lift_ks_obj.Rank_Ordering() except: print("gain chant failed") gain_lift_KS_dataframe = None #feature_importance = MLUtils.calculate_sparkml_feature_importance(df, bestModel.stages[-1], categorical_columns, numerical_columns) act_list = prediction.select('label').collect() actual = [int(row.label) for row in act_list] pred_list = prediction.select('prediction').collect() predicted = [int(row.prediction) for row in pred_list] prob_list = prediction.select('probability').collect() probability = [list(row.probability) for row in prob_list] # objs = {"trained_model":bestModel,"actual":prediction.select('label'),"predicted":prediction.select('prediction'), # "probability":prediction.select('probability'),"feature_importance":None, # "featureList":list(categorical_columns) + list(numerical_columns),"labelMapping":labelMapping} objs = { "trained_model": bestModel, "actual": actual, "predicted": predicted, "probability": probability, "feature_importance": None, "featureList": list(categorical_columns) + list(numerical_columns), "labelMapping": labelMapping } conf_mat_ar = metrics.confusionMatrix().toArray() print(conf_mat_ar) confusion_matrix = {} for i in range(len(conf_mat_ar)): 
confusion_matrix[labelMapping[i]] = {} for j, val in enumerate(conf_mat_ar[i]): confusion_matrix[labelMapping[i]][labelMapping[j]] = val print(confusion_matrix) # accuracy of the model '''ROC CURVE IMPLEMENTATION''' y_prob = probability y_score = predicted y_test = actual logLoss = log_loss(y_test, y_prob) if levels <= 2: positive_label_probs = [] for val in y_prob: positive_label_probs.append(val[int(posLabel)]) roc_auc = roc_auc_score(y_test, y_score) roc_data_dict = { "y_score": y_score, "y_test": y_test, "positive_label_probs": positive_label_probs, "y_prob": y_prob, "positive_label": posLabel } roc_dataframe = pd.DataFrame({ "y_score": y_score, "y_test": y_test, "positive_label_probs": positive_label_probs }) #roc_dataframe.to_csv("binary_roc_data.csv") fpr, tpr, thresholds = roc_curve(y_test, positive_label_probs, pos_label=posLabel) roc_df = pd.DataFrame({ "FPR": fpr, "TPR": tpr, "thresholds": thresholds }) roc_df["tpr-fpr"] = roc_df["TPR"] - roc_df["FPR"] optimal_index = np.argmax(np.array(roc_df["tpr-fpr"])) fpr_optimal_index = roc_df.loc[roc_df.index[optimal_index], "FPR"] tpr_optimal_index = roc_df.loc[roc_df.index[optimal_index], "TPR"] rounded_roc_df = roc_df.round({'FPR': 2, 'TPR': 4}) unique_fpr = rounded_roc_df["FPR"].unique() final_roc_df = rounded_roc_df.groupby("FPR", as_index=False)[["TPR" ]].mean() endgame_roc_df = final_roc_df.round({'FPR': 2, 'TPR': 3}) elif levels > 2: positive_label_probs = [] for val in y_prob: positive_label_probs.append(val[int(posLabel)]) y_test_roc_multi = [] for val in y_test: if val != posLabel: val = posLabel + 1 y_test_roc_multi.append(val) else: y_test_roc_multi.append(val) y_score_roc_multi = [] for val in y_score: if val != posLabel: val = posLabel + 1 y_score_roc_multi.append(val) else: y_score_roc_multi.append(val) roc_auc = roc_auc_score(y_test_roc_multi, y_score_roc_multi) fpr, tpr, thresholds = roc_curve(y_test_roc_multi, positive_label_probs, pos_label=posLabel) roc_df = pd.DataFrame({ "FPR": fpr, "TPR": tpr, "thresholds": thresholds }) roc_df["tpr-fpr"] = roc_df["TPR"] - roc_df["FPR"] optimal_index = np.argmax(np.array(roc_df["tpr-fpr"])) fpr_optimal_index = roc_df.loc[roc_df.index[optimal_index], "FPR"] tpr_optimal_index = roc_df.loc[roc_df.index[optimal_index], "TPR"] rounded_roc_df = roc_df.round({'FPR': 2, 'TPR': 4}) unique_fpr = rounded_roc_df["FPR"].unique() final_roc_df = rounded_roc_df.groupby("FPR", as_index=False)[["TPR" ]].mean() endgame_roc_df = final_roc_df.round({'FPR': 2, 'TPR': 3}) # Calculating prediction_split val_cnts = prediction.groupBy('label').count() val_cnts = map(lambda row: row.asDict(), val_cnts.collect()) prediction_split = {} total_nos = prediction.select('label').count() for item in val_cnts: print(labelMapping) classname = labelMapping[item['label']] prediction_split[classname] = round( item['count'] * 100 / float(total_nos), 2) if not algoSetting.is_hyperparameter_tuning_enabled(): modelName = "M" + "0" * (GLOBALSETTINGS.MODEL_NAME_MAX_LENGTH - 1) + "1" modelFilepathArr = model_filepath.split("/")[:-1] modelFilepathArr.append(modelName) bestModel.save("/".join(modelFilepathArr)) runtime = round((time.time() - st_global), 2) try: print(pmml_filepath) pmmlBuilder = PMMLBuilder(self._spark, trainingData, bestModel).putOption( clf, 'compact', True) pmmlBuilder.buildFile(pmml_filepath) pmmlfile = open(pmml_filepath, "r") pmmlText = pmmlfile.read() pmmlfile.close() self._result_setter.update_pmml_object({self._slug: pmmlText}) except Exception as e: print("PMML failed...", str(e)) pass cat_cols = 
list(set(categorical_columns) - {result_column}) self._model_summary = MLModelSummary() self._model_summary.set_algorithm_name("Naive Bayes") self._model_summary.set_algorithm_display_name("Naive Bayes") self._model_summary.set_slug(self._slug) self._model_summary.set_training_time(runtime) self._model_summary.set_confusion_matrix(confusion_matrix) # self._model_summary.set_feature_importance(objs["feature_importance"]) self._model_summary.set_feature_list(objs["featureList"]) self._model_summary.set_model_accuracy(accuracy) self._model_summary.set_training_time(round((time.time() - st), 2)) self._model_summary.set_precision_recall_stats([precision, recall]) self._model_summary.set_model_precision(precision) self._model_summary.set_model_recall(recall) self._model_summary.set_model_F1_score(f1_score) self._model_summary.set_model_log_loss(logLoss) self._model_summary.set_gain_lift_KS_data(gain_lift_KS_dataframe) self._model_summary.set_AUC_score(roc_auc) self._model_summary.set_target_variable(result_column) self._model_summary.set_prediction_split(prediction_split) self._model_summary.set_validation_method("KFold") self._model_summary.set_level_map_dict(objs["labelMapping"]) # self._model_summary.set_model_features(list(set(x_train.columns)-set([result_column]))) self._model_summary.set_model_features(objs["featureList"]) self._model_summary.set_level_counts( self._metaParser.get_unique_level_dict( list(set(categorical_columns)) + [result_column])) #self._model_summary.set_num_trees(objs['trained_model'].getNumTrees) self._model_summary.set_num_rules(300) self._model_summary.set_target_level(self._targetLevel) if not algoSetting.is_hyperparameter_tuning_enabled(): modelDropDownObj = { "name": self._model_summary.get_algorithm_name(), "evaluationMetricValue": accuracy, "evaluationMetricName": "accuracy", "slug": self._model_summary.get_slug(), "Model Id": modelName } modelSummaryJson = { "dropdown": modelDropDownObj, "levelcount": self._model_summary.get_level_counts(), "modelFeatureList": self._model_summary.get_feature_list(), "levelMapping": self._model_summary.get_level_map_dict(), "slug": self._model_summary.get_slug(), "name": self._model_summary.get_algorithm_name() } else: modelDropDownObj = { "name": self._model_summary.get_algorithm_name(), "evaluationMetricValue": accuracy, "evaluationMetricName": "accuracy", "slug": self._model_summary.get_slug(), "Model Id": resultArray[0]["Model Id"] } modelSummaryJson = { "dropdown": modelDropDownObj, "levelcount": self._model_summary.get_level_counts(), "modelFeatureList": self._model_summary.get_feature_list(), "levelMapping": self._model_summary.get_level_map_dict(), "slug": self._model_summary.get_slug(), "name": self._model_summary.get_algorithm_name() } self._model_management = MLModelSummary() print(modelmanagement_) self._model_management.set_job_type( self._dataframe_context.get_job_name()) #Project name self._model_management.set_training_status( data="completed") # training status self._model_management.set_target_level( self._targetLevel) # target column value self._model_management.set_training_time(runtime) # run time self._model_management.set_model_accuracy(round(metrics.accuracy, 2)) # self._model_management.set_model_accuracy(round(metrics.accuracy_score(objs["actual"], objs["predicted"]),2))#accuracy self._model_management.set_algorithm_name( "NaiveBayes") #algorithm name self._model_management.set_validation_method( str(validationDict["displayName"]) + "(" + str(validationDict["value"]) + ")") #validation method 
self._model_management.set_target_variable( result_column) #target column name self._model_management.set_creation_date(data=str( datetime.now().strftime('%b %d ,%Y %H:%M '))) #creation date self._model_management.set_datasetName(self._datasetName) self._model_management.set_model_type(data='classification') self._model_management.set_var_smoothing( data=int(modelmanagement_['smoothing'])) # self._model_management.set_no_of_independent_variables(df) #no of independent varables modelManagementSummaryJson = [ ["Project Name", self._model_management.get_job_type()], ["Algorithm", self._model_management.get_algorithm_name()], ["Training Status", self._model_management.get_training_status()], ["Accuracy", self._model_management.get_model_accuracy()], ["RunTime", self._model_management.get_training_time()], #["Owner",None], ["Created On", self._model_management.get_creation_date()] ] modelManagementModelSettingsJson = [ ["Training Dataset", self._model_management.get_datasetName()], ["Target Column", self._model_management.get_target_variable()], ["Target Column Value", self._model_management.get_target_level()], ["Algorithm", self._model_management.get_algorithm_name()], [ "Model Validation", self._model_management.get_validation_method() ], ["Model Type", self._model_management.get_model_type()], ["Smoothing", self._model_management.get_var_smoothing()], #,["priors",self._model_management.get_priors()] #,["var_smoothing",self._model_management.get_var_smoothing()] ] nbOverviewCards = [ json.loads(CommonUtils.convert_python_object_to_json(cardObj)) for cardObj in MLUtils.create_model_management_card_overview( self._model_management, modelManagementSummaryJson, modelManagementModelSettingsJson) ] nbPerformanceCards = [ json.loads(CommonUtils.convert_python_object_to_json(cardObj)) for cardObj in MLUtils.create_model_management_cards( self._model_summary, endgame_roc_df) ] nbDeploymentCards = [ json.loads(CommonUtils.convert_python_object_to_json(cardObj)) for cardObj in MLUtils.create_model_management_deploy_empty_card() ] nbCards = [ json.loads(CommonUtils.convert_python_object_to_json(cardObj)) for cardObj in MLUtils.create_model_summary_cards(self._model_summary) ] NB_Overview_Node = NarrativesTree() NB_Overview_Node.set_name("Overview") NB_Performance_Node = NarrativesTree() NB_Performance_Node.set_name("Performance") NB_Deployment_Node = NarrativesTree() NB_Deployment_Node.set_name("Deployment") for card in nbOverviewCards: NB_Overview_Node.add_a_card(card) for card in nbPerformanceCards: NB_Performance_Node.add_a_card(card) for card in nbDeploymentCards: NB_Deployment_Node.add_a_card(card) for card in nbCards: self._prediction_narrative.add_a_card(card) self._result_setter.set_model_summary({ "naivebayes": json.loads( CommonUtils.convert_python_object_to_json(self._model_summary)) }) self._result_setter.set_naive_bayes_model_summary(modelSummaryJson) self._result_setter.set_nb_cards(nbCards) self._result_setter.set_nb_nodes( [NB_Overview_Node, NB_Performance_Node, NB_Deployment_Node]) self._result_setter.set_nb_fail_card({ "Algorithm_Name": "Naive Bayes", "success": "True" }) CommonUtils.create_update_and_save_progress_message( self._dataframe_context, self._scriptWeightDict, self._scriptStages, self._slug, "completion", "info", display=True, emptyBin=False, customMsg=None, weightKey="total") print("\n\n")
def build_ParamGrid(d):
    pgb = ParamGridBuilder()
    for k in d.keys():
        pgb.addGrid(eval(k), d[k])
    ParamGrid = pgb.build()
    return ParamGrid
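# Hedged usage sketch: build_ParamGrid expects the dict keys to be Python
# expressions that eval() can resolve to Param objects in the scope where the
# function is defined (a shared script/notebook scope is assumed); the
# estimator name `lr` below is only illustrative.
from pyspark.ml.regression import LinearRegression

lr = LinearRegression(featuresCol='features', labelCol='label')
grid = build_ParamGrid({'lr.regParam': [0.01, 0.1, 1.0],
                        'lr.elasticNetParam': [0.0, 0.5, 1.0]})
print('Number of models to be tested:', len(grid))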
# Grid Search

# Linear regression with an intercept. Fit to training data.
regression = LinearRegression(labelCol='consumption', fitIntercept=True)
regression = regression.fit(cars_train)
evaluator.evaluate(regression.transform(cars_test))

# Linear regression without an intercept. Fit to training data.
regression = LinearRegression(labelCol='consumption', fitIntercept=False)
regression = regression.fit(cars_train)
evaluator.evaluate(regression.transform(cars_test))

from pyspark.ml.tuning import ParamGridBuilder

# Re-create an (unfitted) estimator for tuning; a fitted model cannot be passed
# to CrossValidator as the estimator.
regression = LinearRegression(labelCol='consumption')

# Create a parameter grid builder
params = ParamGridBuilder()

# Add grid points
params = params.addGrid(regression.fitIntercept, [True, False])

# Construct the grid
params = params.build()

# How many models?
print('Number of models to be tested: ', len(params))

# Create a cross-validator and fit to the training data
cv = CrossValidator(estimator=regression,
                    estimatorParamMaps=params,
                    evaluator=evaluator)
cv = cv.setNumFolds(10).setSeed(13).fit(cars_train)

# What's the cross-validated RMSE for each model?
print(cv.avgMetrics)

# Access the best model
print(cv.bestModel)
def _build_param_grid(self):
    param_grid_builder = ParamGridBuilder()
    param_grid_builder.addGrid(self.hashing_tf.numFeatures, self.hashing_tf_map)
    param_grid_builder.addGrid(self.lr.regParam, self.lr_map)
    return param_grid_builder.build()
def _build_param_grid(self):
    param_grid_builder = ParamGridBuilder()
    param_grid_builder.addGrid(self.lr.regParam, self.lr_map)
    return param_grid_builder.build()
def Train(self): st_global = time.time() CommonUtils.create_update_and_save_progress_message(self._dataframe_context, self._scriptWeightDict, self._scriptStages, self._slug, "initialization", "info", display=True, emptyBin=False, customMsg=None, weightKey="total") algosToRun = self._dataframe_context.get_algorithms_to_run() algoSetting = [x for x in algosToRun if x.get_algorithm_slug()==self._slug][0] categorical_columns = self._dataframe_helper.get_string_columns() uid_col = self._dataframe_context.get_uid_column() if self._metaParser.check_column_isin_ignored_suggestion(uid_col): categorical_columns = list(set(categorical_columns) - {uid_col}) allDateCols = self._dataframe_context.get_date_columns() categorical_columns = list(set(categorical_columns) - set(allDateCols)) numerical_columns = self._dataframe_helper.get_numeric_columns() result_column = self._dataframe_context.get_result_column() categorical_columns = [x for x in categorical_columns if x != result_column] appType = self._dataframe_context.get_app_type() model_path = self._dataframe_context.get_model_path() if model_path.startswith("file"): model_path = model_path[7:] validationDict = self._dataframe_context.get_validation_dict() # pipeline_filepath = "file://"+str(model_path)+"/"+str(self._slug)+"/pipeline/" # model_filepath = "file://"+str(model_path)+"/"+str(self._slug)+"/model" # pmml_filepath = "file://"+str(model_path)+"/"+str(self._slug)+"/modelPmml" df = self._data_frame levels = df.select(result_column).distinct().count() appType = self._dataframe_context.get_app_type() model_filepath = model_path + "/" + self._slug + "/model" pmml_filepath = str(model_path) + "/" + str(self._slug) + "/trainedModel.pmml" CommonUtils.create_update_and_save_progress_message(self._dataframe_context, self._scriptWeightDict, self._scriptStages, self._slug, "training", "info", display=True, emptyBin=False, customMsg=None, weightKey="total") st = time.time() pipeline = MLUtils.create_pyspark_ml_pipeline(numerical_columns, categorical_columns, result_column) vectorFeats = pipeline.getStages()[-1].transform(df) input_feats = len(vectorFeats.select('features').take(1)[0][0]) trainingData, validationData = MLUtils.get_training_and_validation_data(df, result_column, 0.8) # indexed labelIndexer = StringIndexer(inputCol=result_column, outputCol="label") # OriginalTargetconverter = IndexToString(inputCol="label", outputCol="originalTargetColumn") # Label Mapping and Inverse labelIdx = labelIndexer.fit(trainingData) labelMapping = {k: v for k, v in enumerate(labelIdx.labels)} inverseLabelMapping = {v: float(k) for k, v in enumerate(labelIdx.labels)} clf = MultilayerPerceptronClassifier() if not algoSetting.is_hyperparameter_tuning_enabled(): algoParams = algoSetting.get_params_dict() else: algoParams = algoSetting.get_params_dict_hyperparameter() clfParams = [prm.name for prm in clf.params] algoParams = {getattr(clf, k): v if isinstance(v, list) else [v] for k, v in algoParams.items() if k in clfParams} paramGrid = ParamGridBuilder() layer_param_val = algoParams[getattr(clf, 'layers')] for layer in layer_param_val: layer.insert(0, input_feats) layer.append(levels) print('layer_param_val =', layer_param_val) # if not algoSetting.is_hyperparameter_tuning_enabled(): # for k,v in algoParams.items(): # if k.name == 'layers': # paramGrid = paramGrid.addGrid(k,layer_param_val) # else: # paramGrid = paramGrid.addGrid(k,v) # paramGrid = paramGrid.build() # else: for k, v in algoParams.items(): if v == [None] * len(v): continue if k.name == 'layers': 
paramGrid = paramGrid.addGrid(k, layer_param_val) else: paramGrid = paramGrid.addGrid(k, v) paramGrid = paramGrid.build() if len(paramGrid) > 1: hyperParamInitParam = algoSetting.get_hyperparameter_params() evaluationMetricDict = {"name": hyperParamInitParam["evaluationMetric"]} evaluationMetricDict["displayName"] = GLOBALSETTINGS.SKLEARN_EVAL_METRIC_NAME_DISPLAY_MAP[ evaluationMetricDict["name"]] else: evaluationMetricDict = {"name": GLOBALSETTINGS.CLASSIFICATION_MODEL_EVALUATION_METRIC} evaluationMetricDict["displayName"] = GLOBALSETTINGS.SKLEARN_EVAL_METRIC_NAME_DISPLAY_MAP[ evaluationMetricDict["name"]] self._result_setter.set_hyper_parameter_results(self._slug, None) if validationDict["name"] == "kFold": numFold = int(validationDict["value"]) estimator = Pipeline(stages=[pipeline, labelIndexer, clf]) if algoSetting.is_hyperparameter_tuning_enabled(): modelFilepath = "/".join(model_filepath.split("/")[:-1]) pySparkHyperParameterResultObj = PySparkGridSearchResult(estimator, paramGrid, appType, modelFilepath, levels, evaluationMetricDict, trainingData, validationData, numFold, self._targetLevel, labelMapping, inverseLabelMapping, df) resultArray = pySparkHyperParameterResultObj.train_and_save_classification_models() self._result_setter.set_hyper_parameter_results(self._slug, resultArray) self._result_setter.set_metadata_parallel_coordinates(self._slug, { "ignoreList": pySparkHyperParameterResultObj.get_ignore_list(), "hideColumns": pySparkHyperParameterResultObj.get_hide_columns(), "metricColName": pySparkHyperParameterResultObj.get_comparison_metric_colname(), "columnOrder": pySparkHyperParameterResultObj.get_keep_columns()}) bestModel = pySparkHyperParameterResultObj.getBestModel() prediction = pySparkHyperParameterResultObj.getBestPrediction() bestModelName = resultArray[0]["Model Id"] else: crossval = CrossValidator(estimator=estimator, estimatorParamMaps=paramGrid, evaluator=BinaryClassificationEvaluator() if levels == 2 else MulticlassClassificationEvaluator(), numFolds=3 if numFold is None else numFold) # use 3+ folds in practice cvrf = crossval.fit(trainingData) prediction = cvrf.transform(validationData) bestModel = cvrf.bestModel bestModelName = "M" + "0" * (GLOBALSETTINGS.MODEL_NAME_MAX_LENGTH - 1) + "1" else: train_test_ratio = float(self._dataframe_context.get_train_test_split()) estimator = Pipeline(stages=[pipeline, labelIndexer, clf]) if algoSetting.is_hyperparameter_tuning_enabled(): modelFilepath = "/".join(model_filepath.split("/")[:-1]) pySparkHyperParameterResultObj = PySparkTrainTestResult(estimator, paramGrid, appType, modelFilepath, levels, evaluationMetricDict, trainingData, validationData, train_test_ratio, self._targetLevel, labelMapping, inverseLabelMapping, df) resultArray = pySparkHyperParameterResultObj.train_and_save_classification_models() self._result_setter.set_hyper_parameter_results(self._slug, resultArray) self._result_setter.set_metadata_parallel_coordinates(self._slug, { "ignoreList": pySparkHyperParameterResultObj.get_ignore_list(), "hideColumns": pySparkHyperParameterResultObj.get_hide_columns(), "metricColName": pySparkHyperParameterResultObj.get_comparison_metric_colname(), "columnOrder": pySparkHyperParameterResultObj.get_keep_columns()}) bestModel = pySparkHyperParameterResultObj.getBestModel() prediction = pySparkHyperParameterResultObj.getBestPrediction() bestModelName = resultArray[0]["Model Id"] else: tvs = TrainValidationSplit(estimator=estimator, estimatorParamMaps=paramGrid, evaluator=BinaryClassificationEvaluator() if levels == 2 
else MulticlassClassificationEvaluator(), trainRatio=train_test_ratio) tvrf = tvs.fit(trainingData) prediction = tvrf.transform(validationData) bestModel = tvrf.bestModel bestModelName = "M" + "0" * (GLOBALSETTINGS.MODEL_NAME_MAX_LENGTH - 1) + "1" MLUtils.save_pipeline_or_model(bestModel,model_filepath) predsAndLabels = prediction.select(['prediction', 'label']).rdd.map(tuple) metrics = MulticlassMetrics(predsAndLabels) posLabel = inverseLabelMapping[self._targetLevel] conf_mat_ar = metrics.confusionMatrix().toArray() print(conf_mat_ar) confusion_matrix = {} for i in range(len(conf_mat_ar)): confusion_matrix[labelMapping[i]] = {} for j, val in enumerate(conf_mat_ar[i]): confusion_matrix[labelMapping[i]][labelMapping[j]] = val print(confusion_matrix) trainingTime = time.time() - st f1_score = metrics.fMeasure(inverseLabelMapping[self._targetLevel], 1.0) precision = metrics.precision(inverseLabelMapping[self._targetLevel]) recall = metrics.recall(inverseLabelMapping[self._targetLevel]) accuracy = metrics.accuracy roc_auc = 'Undefined' if levels == 2: bin_metrics = BinaryClassificationMetrics(predsAndLabels) roc_auc = bin_metrics.areaUnderROC precision = metrics.precision(inverseLabelMapping[self._targetLevel]) recall = metrics.recall(inverseLabelMapping[self._targetLevel]) print(f1_score,precision,recall,accuracy) #gain chart implementation def cal_prob_eval(x): if len(x) == 1: if x == posLabel: return(float(x[1])) else: return(float(1 - x[1])) else: return(float(x[int(posLabel)])) column_name= 'probability' def y_prob_for_eval_udf(): return udf(lambda x:cal_prob_eval(x)) prediction = prediction.withColumn("y_prob_for_eval", y_prob_for_eval_udf()(col(column_name))) try: pys_df = prediction.select(['y_prob_for_eval','prediction','label']) gain_lift_ks_obj = GainLiftKS(pys_df, 'y_prob_for_eval', 'prediction', 'label', posLabel, self._spark) gain_lift_KS_dataframe = gain_lift_ks_obj.Run().toPandas() except: try: temp_df = pys_df.toPandas() gain_lift_ks_obj = GainLiftKS(temp_df, 'y_prob_for_eval', 'prediction', 'label', posLabel, self._spark) gain_lift_KS_dataframe = gain_lift_ks_obj.Rank_Ordering() except: print("gain chant failed") gain_lift_KS_dataframe = None objs = {"trained_model": bestModel, "actual": prediction.select('label'), "predicted": prediction.select('prediction'), "probability": prediction.select('probability'), "feature_importance": None, "featureList": list(categorical_columns) + list(numerical_columns), "labelMapping": labelMapping} # Calculating prediction_split val_cnts = prediction.groupBy('label').count() val_cnts = map(lambda row: row.asDict(), val_cnts.collect()) prediction_split = {} total_nos = objs['actual'].count() for item in val_cnts: classname = labelMapping[item['label']] prediction_split[classname] = round(item['count'] * 100 / float(total_nos), 2) if not algoSetting.is_hyperparameter_tuning_enabled(): # modelName = "M" + "0" * (GLOBALSETTINGS.MODEL_NAME_MAX_LENGTH - 1) + "1" modelFilepathArr = model_filepath.split("/")[:-1] modelFilepathArr.append(bestModelName) bestModel.save("/".join(modelFilepathArr)) runtime = round((time.time() - st_global), 2) try: print(pmml_filepath) pmmlBuilder = PMMLBuilder(self._spark, trainingData, bestModel).putOption(clf, 'compact', True) pmmlBuilder.buildFile(pmml_filepath) pmmlfile = open(pmml_filepath, "r") pmmlText = pmmlfile.read() pmmlfile.close() self._result_setter.update_pmml_object({self._slug: pmmlText}) except Exception as e: print("PMML failed...", str(e)) pass cat_cols = list(set(categorical_columns) - 
{result_column}) self._model_summary = MLModelSummary() self._model_summary.set_algorithm_name("Spark ML Multilayer Perceptron") self._model_summary.set_algorithm_display_name("Spark ML Multilayer Perceptron") self._model_summary.set_slug(self._slug) self._model_summary.set_training_time(runtime) self._model_summary.set_confusion_matrix(confusion_matrix) self._model_summary.set_feature_importance(objs["feature_importance"]) self._model_summary.set_feature_list(objs["featureList"]) self._model_summary.set_model_accuracy(accuracy) self._model_summary.set_training_time(round((time.time() - st), 2)) self._model_summary.set_precision_recall_stats([precision, recall]) self._model_summary.set_model_precision(precision) self._model_summary.set_model_recall(recall) self._model_summary.set_target_variable(result_column) self._model_summary.set_prediction_split(prediction_split) self._model_summary.set_validation_method("KFold") self._model_summary.set_level_map_dict(objs["labelMapping"]) self._model_summary.set_model_features(objs["featureList"]) self._model_summary.set_level_counts( self._metaParser.get_unique_level_dict(list(set(categorical_columns)) + [result_column])) self._model_summary.set_num_trees(None) self._model_summary.set_num_rules(300) self._model_summary.set_target_level(self._targetLevel) modelManagementJson = { "Model ID": "SPMLP-" + bestModelName, "Project Name": self._dataframe_context.get_job_name(), "Algorithm": self._model_summary.get_algorithm_name(), "Status": 'Completed', "Accuracy": accuracy, "Runtime": runtime, "Created On": "", "Owner": "", "Deployment": 0, "Action": '' } # if not algoSetting.is_hyperparameter_tuning_enabled(): # modelDropDownObj = { # "name": self._model_summary.get_algorithm_name(), # "evaluationMetricValue": locals()[evaluationMetricDict["name"]], # accuracy # "evaluationMetricName": evaluationMetricDict["displayName"], # accuracy # "slug": self._model_summary.get_slug(), # "Model Id": bestModelName # } # modelSummaryJson = { # "dropdown": modelDropDownObj, # "levelcount": self._model_summary.get_level_counts(), # "modelFeatureList": self._model_summary.get_feature_list(), # "levelMapping": self._model_summary.get_level_map_dict(), # "slug": self._model_summary.get_slug(), # "name": self._model_summary.get_algorithm_name() # } # else: modelDropDownObj = { "name": self._model_summary.get_algorithm_name(), "evaluationMetricValue": accuracy, #locals()[evaluationMetricDict["name"]], "evaluationMetricName": "accuracy", # evaluationMetricDict["name"], "slug": self._model_summary.get_slug(), "Model Id": bestModelName } modelSummaryJson = { "dropdown": modelDropDownObj, "levelcount": self._model_summary.get_level_counts(), "modelFeatureList": self._model_summary.get_feature_list(), "levelMapping": self._model_summary.get_level_map_dict(), "slug": self._model_summary.get_slug(), "name": self._model_summary.get_algorithm_name() } mlpcCards = [json.loads(CommonUtils.convert_python_object_to_json(cardObj)) for cardObj in MLUtils.create_model_summary_cards(self._model_summary)] for card in mlpcCards: self._prediction_narrative.add_a_card(card) self._result_setter.set_model_summary( {"sparkperceptron": json.loads(CommonUtils.convert_python_object_to_json(self._model_summary))}) self._result_setter.set_spark_multilayer_perceptron_model_summary(modelSummaryJson) self._result_setter.set_spark_multilayer_perceptron_management_summary(modelManagementJson) self._result_setter.set_mlpc_cards(mlpcCards) 
CommonUtils.create_update_and_save_progress_message(self._dataframe_context, self._scriptWeightDict, self._scriptStages, self._slug, "completion", "info", display=True, emptyBin=False, customMsg=None, weightKey="total")
def linearRegression(df, conf):
    """
    input  : df [spark.dataframe], conf [configuration params]
    output : linear_regression model [model]
    """
    # fetch parameters (with default values)
    featuresCol = conf["params"].get("featuresCol", "features")
    labelCol = conf["params"].get("labelCol", "label")
    predictionCol = conf["params"].get("predictionCol", "prediction")
    max_iter = conf["params"].get("maxIter", 100)
    reg_param = conf["params"].get("regParam", 0.0)
    elasticnet_param = conf["params"].get("elasticNetParam", 0.0)
    tol = conf["params"].get("tol", 1e-6)
    fitIntercept = conf["params"].get("fitIntercept", True)
    standardization = conf["params"].get("standardization", True)
    solver = conf["params"].get("solver", "auto")
    weightCol = conf["params"].get("weightCol", None)
    aggregationDepth = conf["params"].get("aggregationDepth", 2)
    loss = conf["params"].get("loss", "squaredError")
    epsilon = conf["params"].get("epsilon", 1.35)

    lr = LinearRegression(maxIter=max_iter, regParam=reg_param,
                          elasticNetParam=elasticnet_param)

    print("maxIter : ", lr.getMaxIter())
    print("regParam : ", lr.getRegParam())
    print("aggrDepth : ", lr.getAggregationDepth())

    # with ml-tuning
    if conf["tuning"]:
        # ml-tuning via cross validation
        if conf["tuning"].get("method").lower() == "crossval":
            paramGrids = conf["tuning"].get("paramGrids")
            pg = ParamGridBuilder()
            for key in paramGrids:
                # resolve the Param object from its name before adding it to the grid
                pg.addGrid(getattr(lr, key), paramGrids[key])
            grid = pg.build()
            folds = conf["tuning"].get("methodParam")
            evaluator = RegressionEvaluator()
            cv = CrossValidator(estimator=lr, estimatorParamMaps=grid,
                                evaluator=evaluator, numFolds=folds)
            model = cv.fit(df)
        # ml-tuning via train-validation split
        elif conf["tuning"].get("method").lower() == "trainvalsplit":
            paramGrids = conf["tuning"].get("paramGrids")
            pg = ParamGridBuilder()
            for key in paramGrids:
                pg.addGrid(getattr(lr, key), paramGrids[key])
            grid = pg.build()
            tr = conf["tuning"].get("methodParam")
            evaluator = RegressionEvaluator()
            tvs = TrainValidationSplit(estimator=lr, estimatorParamMaps=grid,
                                       evaluator=evaluator, trainRatio=tr)
            model = tvs.fit(df)
    # without ml-tuning
    else:
        model = lr.fit(df)
    return model
# test datasets; e.g., with k=3 folds, k-fold cross-validation will generate 3 (training, test)
# dataset pairs, each of which uses 2/3 of the data for training and 1/3 for testing.
# Each fold is used as the test set exactly once.

# 15.1 One way
grid = (ParamGridBuilder()
        .addGrid(lr.regParam, [0.01, 0.5, 2.0])        # Regularization (L2) parameter value
        .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])  # alpha: alpha = 0 => only L2, alpha = 1 => only L1
        .addGrid(lr.maxIter, [1, 5, 10])               # Max iterations
        .build())

# 15.2 Another way (this way creates problems: the result of build() is never
#      assigned, so `grid` stays a ParamGridBuilder instead of a list of ParamMaps)
grid = ParamGridBuilder()
grid.addGrid(lr.regParam, [0.01, 0.5, 2.0])
grid.addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0]).addGrid(lr.maxIter, [1, 5, 10])
grid.build()

# 15.3 Create 5-fold CrossValidator object
# Ref: https://spark.apache.org/docs/latest/api/python/pyspark.ml.html#pyspark.ml.tuning.CrossValidator
# 15.3.1 Specify the first way to evaluate CV results
lr = LogisticRegression(labelCol="label",
                        featuresCol="features")
evaluator = BinaryClassificationEvaluator()
# 15.3.2
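# Hedged sketch (not from the original): one way to wire the pieces above into a
# 5-fold CrossValidator. The grid is rebuilt from the lr defined in 15.3.1 so that
# the Param objects belong to the estimator actually being tuned.
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

grid = (ParamGridBuilder()
        .addGrid(lr.regParam, [0.01, 0.5, 2.0])
        .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])
        .build())
cv = CrossValidator(estimator=lr,
                    estimatorParamMaps=grid,
                    evaluator=evaluator,
                    numFolds=5)
# cv_model = cv.fit(train_df)  # train_df assumed to exist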
assembler = VectorAssembler(inputCols=['km', 'org_dummy', 'dow_dummy'], outputCol='features')

# Split the data into training and testing sets
flights_train, flights_test = flites.randomSplit([0.8, 0.2], seed=23)

# Create a regression object and train on training data
regression = LinearRegression(labelCol="duration")

# Combine steps into a pipeline
pipeline = Pipeline(stages=[indexer, onehot, assembler, regression])

# Create parameter grid
params = ParamGridBuilder()

# Add grids for two parameters
params = params.addGrid(regression.regParam, [0.01, 0.1, 1.0, 10.0]) \
               .addGrid(regression.elasticNetParam, [0, 0.5, 1.0])

# Build the parameter grid
params = params.build()
print('Number of models to be tested: ', len(params))

# Object to evaluate performance
evaluator = RegressionEvaluator(labelCol='duration')

# Create cross-validation object
cv = CrossValidator(estimator=pipeline,
                    estimatorParamMaps=params,
                    evaluator=evaluator,
                    numFolds=5, seed=13)
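# Hedged continuation sketch: fit the cross-validator on the training split and
# score the best pipeline on the held-out flights_test set (RMSE is the default
# metric of RegressionEvaluator).
cv_model = cv.fit(flights_train)
predictions = cv_model.transform(flights_test)
print('Test RMSE:', evaluator.evaluate(predictions))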
def generalizedLinearRegressor(dataFrame, conf):
    """
    input  : df [spark.dataFrame], conf [configuration params]
    output : generalized linear regression model [model]
    """
    # calling params
    label_col = conf["params"].get("labelCol", "label")
    features_col = conf["params"].get("featuresCol", "features")
    prediction_col = conf["params"].get("predictionCol", "prediction")
    fam = conf["params"].get("family", "gaussian")
    fit_intercept = conf["params"].get("fitIntercept", True)
    max_iter = conf["params"].get("maxIter", 25)
    tolp = conf["params"].get("tol", 1e-6)
    reg_param = conf["params"].get("regParam", 0.0)
    weight_col = conf["params"].get("weightCol", None)
    solverp = conf["params"].get("solver", "irls")
    link_prediction_col = conf["params"].get("linkPredictionCol", None)
    variance_power = conf["params"].get("variancePower", 0.0)
    link_power = conf["params"].get("linkPower", None)

    # pick a default link that matches the chosen family
    if fam == "gaussian":
        li = conf["params"].get("link", "identity")
    elif fam == "binomial":
        li = conf["params"].get("link", "logit")
    elif fam == "poisson":
        li = conf["params"].get("link", "log")
    elif fam == "gamma":
        li = conf["params"].get("link", "inverse")
    elif fam == "tweedie":
        li = conf["params"].get("link", 1 - variance_power)
    else:
        li = conf["params"].get("link", None)

    glr = GeneralizedLinearRegression(labelCol=label_col, featuresCol=features_col,
                                      predictionCol=prediction_col, family=fam, link=li,
                                      fitIntercept=fit_intercept, maxIter=max_iter,
                                      tol=tolp, regParam=reg_param, solver=solverp,
                                      linkPredictionCol=link_prediction_col,
                                      variancePower=variance_power, linkPower=link_power)

    # with tuning
    if conf["tuning"]:
        # method: cross validation
        if conf["tuning"].get("method").lower() == "crossval":
            paramGrids = conf["tuning"].get("paramGrids")
            pg = ParamGridBuilder()
            for key in paramGrids:
                # resolve the Param object from its name before adding it to the grid
                pg.addGrid(getattr(glr, key), paramGrids[key])
            grid = pg.build()
            folds = conf["tuning"].get("methodParam")
            evaluator = RegressionEvaluator()
            cv = CrossValidator(estimator=glr, estimatorParamMaps=grid,
                                evaluator=evaluator, numFolds=folds)
            model = cv.fit(dataFrame)
        # method: train validation split
        elif conf["tuning"].get("method").lower() == "trainvalsplit":
            paramGrids = conf["tuning"].get("paramGrids")
            pg = ParamGridBuilder()
            for key in paramGrids:
                pg.addGrid(getattr(glr, key), paramGrids[key])
            grid = pg.build()
            tr = conf["tuning"].get("methodParam")
            evaluator = RegressionEvaluator()
            tvs = TrainValidationSplit(estimator=glr, estimatorParamMaps=grid,
                                       evaluator=evaluator, trainRatio=tr)
            model = tvs.fit(dataFrame)
    # without tuning
    else:
        model = glr.fit(dataFrame)
    return model
print("Sample model input") print(flites.toPandas().sample(12)) # Split the data into training and testing sets flights_train, flights_test = flites.randomSplit([0.8, 0.2], seed=23) # Create model objects and train on training data #tree = DecisionTreeClassifier().fit(flights_train) #gbt = GBTClassifier().fit(flights_train) forest = RandomForestClassifier() # Create parameter grid params = ParamGridBuilder() # Add grids for two parameters params = params.addGrid(forest.featureSubsetStrategy, ['all', 'onethird', 'sqrt', 'log2']) \ .addGrid(forest.maxDepth, [2, 5, 10]) # Build the parameter grid params = params.build() # Compare AUC on testing data evaluator = BinaryClassificationEvaluator() # create cross-validation object cv = CrossValidator(estimator=forest, estimatorParamMaps=params, evaluator=evaluator, numFolds=5, seed=13) # run fit on training data
# Spark and Spark ML imports used by the pipeline below.
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import DoubleType, IntegerType, StringType
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, StandardScaler, StringIndexer, VectorAssembler
from pyspark.ml.regression import GeneralizedLinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder


def run_pipeline(name: str, data: str, save: str) -> None:
    spark = SparkSession.builder.appName(name).getOrCreate()

    # Dataset Creation #

    # read bike ride history CSVs
    df = spark.read.csv(f'{data}/rides/*', header=True)
    df = df.select(['Duration', 'Start date', 'Start station number', 'Member type'])
    df = df.withColumn('Start station number', df['Start station number'].cast(IntegerType()))
    print(f'The rides dataset has [{df.count()}] rows!')

    # read station information CSVs
    stations = spark.read.csv(f'{data}/stations/*', header=True)
    print(f'The stations dataset has {stations.count()} rows!')
    stations = stations.withColumnRenamed('LATITUDE', 'start_station_lat')
    stations = stations.withColumnRenamed('LONGITUDE', 'start_station_long')
    stations = stations.withColumn('Start station number', stations['TERMINAL_NUMBER'].cast(IntegerType()))
    stations = stations.select(['start_station_lat', 'start_station_long', 'Start station number'])

    # remove rides longer than 1.5 hours
    one_and_a_half_hours = 60 * 60 * 1.5
    df = df.filter(df['Duration'] <= one_and_a_half_hours)

    # remove rides shorter than 3 minutes
    three_minutes = 60 * 3
    df = df.filter(df['Duration'] >= three_minutes)

    # remove unknown 'Member type's
    df = df.filter(df['Member type'] != 'Unknown')

    # remove non-existent stations
    df = df.filter(~(df['Start station number'] == 31008) &
                   ~(df['Start station number'] == 32051) &
                   ~(df['Start station number'] == 32034))

    # make target feature
    df = df.withColumn('label', F.log1p(df.Duration))

    # join on 'Start station number'
    print('Merging rides and stations dataframes!')
    df = df.join(stations, on='Start station number')
    df = df.withColumn('start_station_long', df['start_station_long'].cast(DoubleType()))
    df = df.withColumn('start_station_lat', df['start_station_lat'].cast(DoubleType()))
    print(f'Complete rides and stations dataset has {df.count()} rows!')

    # Feature Transformations #
    print('Doing Feature Transformations!')

    # convert to datetime type
    df = df.withColumn('Start date', F.to_timestamp('Start date', 'yyyy-MM-dd HH:mm:ss'))
    df = df.withColumn('day_of_week', F.dayofweek('Start date'))
    df = df.withColumn('week_of_year', F.weekofyear('Start date'))
    df = df.withColumn('month', F.month('Start date'))
    df = df.withColumn('minute', F.minute('Start date'))
    df = df.withColumn('hour', F.hour('Start date'))

    # make time features cyclical
    pi = 3.141592653589793
    df = df.withColumn('sin_day_of_week', F.sin(2 * pi * df['day_of_week'] / 7))
    df = df.withColumn('sin_week_of_year', F.sin(2 * pi * df['week_of_year'] / 53))
    df = df.withColumn('sin_month', F.sin(2 * pi * (df['month'] - 1) / 12))
    df = df.withColumn('sin_minute', F.sin(2 * pi * df['minute'] / 60))
    df = df.withColumn('sin_hour', F.sin(2 * pi * df['hour'] / 24))
    df = df.withColumn('cos_day_of_week', F.cos(2 * pi * df['day_of_week'] / 7))
    df = df.withColumn('cos_week_of_year', F.cos(2 * pi * df['week_of_year'] / 53))
    df = df.withColumn('cos_month', F.cos(2 * pi * (df['month'] - 1) / 12))
    df = df.withColumn('cos_minute', F.cos(2 * pi * df['minute'] / 60))
    df = df.withColumn('cos_hour', F.cos(2 * pi * df['hour'] / 24))

    # interaction features; note that '+' does not concatenate string columns
    # in Spark, so concat_ws is used instead
    df = df.withColumn('hour_and_day_of_week',
                       F.concat_ws('_', df['hour'].cast(StringType()),
                                   df['day_of_week'].cast(StringType())))
    df = df.withColumn('member_type_and_day_of_week',
                       F.concat_ws('_', df['Member type'],
                                   df['day_of_week'].cast(StringType())))

    # drop unused columns
    drop_columns = [
        'Start date', 'Start station number', 'Duration',
        'day_of_week', 'week_of_year', 'month', 'minute', 'hour'
    ]
    df = df.drop(*drop_columns)
    # df.select([F.count(F.when(F.isnan(c), c)).alias(c) for c in df.columns]).show()

    # Model and Pipeline #

    # split training and test
    train, test = df.randomSplit([.7, .3])

    # encode categorical column 'Member type'
    member_indexer = StringIndexer(inputCol='Member type', outputCol='member_idx')
    member_encoder = OneHotEncoder(inputCol='member_idx', outputCol='member_enc')

    # create vector of features named 'features'
    vector = VectorAssembler(
        inputCols=[
            'start_station_lat', 'start_station_long',
            'sin_day_of_week', 'cos_day_of_week',
            'sin_week_of_year', 'cos_week_of_year',
            'sin_month', 'cos_month',
            'sin_minute', 'cos_minute',
            'sin_hour', 'cos_hour',
            'member_enc'
        ],
        outputCol='features'
    )

    # scale features
    scaler = StandardScaler(
        inputCol='features',
        outputCol='scaled_features'
    )

    # define model
    model = GeneralizedLinearRegression(
        featuresCol='scaled_features'
    )

    # create pipeline and fill in stages
    pipeline = Pipeline(
        stages=[
            member_indexer, member_encoder, vector, scaler, model
        ]
    )

    # evaluation method
    evaluation = RegressionEvaluator()

    # best parameter search
    grid = ParamGridBuilder()
    # grid = grid.addGrid(model.maxDepth, [5, 7])
    # grid = grid.addGrid(model.numTrees, [200, 500])
    grid = grid.addGrid(model.maxIter, [40, 50])
    grid = grid.addGrid(model.family, ['gaussian', 'gamma'])
    grid = grid.addGrid(model.regParam, [0.0, 0.1])
    grid = grid.build()

    # run cross validation
    cv = CrossValidator(
        estimator=pipeline,
        estimatorParamMaps=grid,
        evaluator=evaluation,
        numFolds=7
    )
    print('Doing Cross Validation!')
    cv_models = cv.fit(train)
    print(f'CV results: {cv_models.avgMetrics} (RMSE)')

    best_model = cv_models.bestModel
    # extract_best_params is a project-local helper defined elsewhere
    best_params = extract_best_params(best_model.stages[-1].extractParamMap())
    print(f'Best params:\n{best_params}')

    results = cv_models.transform(test)
    print(f'CV results on holdout dataset: {evaluation.evaluate(results)} (RMSE)')

    print('Re-fitting pipeline on entire dataset!')
    cv_models = cv.fit(df)

    print('Saving pipeline to S3!')
    entire_dataset_best_model = cv_models.bestModel
    entire_dataset_best_model.save(f'{save}/{name}')
    print('Done!')
    return
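# A hypothetical invocation sketch for run_pipeline(); the app name and the
# S3-style paths below are illustrative placeholders, not values from the
# original code.
if __name__ == '__main__':
    run_pipeline(
        name='bike-duration-glm',               # hypothetical Spark application name
        data='s3://example-bucket/bikeshare',   # prefix expected to contain rides/ and stations/
        save='s3://example-bucket/models'       # prefix where the fitted pipeline is saved
    )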
import json
import os
from typing import Any, Callable, Dict, List, Optional, Tuple, Union

from pyspark.sql import DataFrame
from pyspark.ml import Estimator, Pipeline, PipelineModel
from pyspark.ml.evaluation import Evaluator
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.tuning import CrossValidator, CrossValidatorModel, ParamGridBuilder

# `Metrics` (a metric-registering metaclass) and `get_logger` are project-local
# helpers defined elsewhere in the codebase.


class BaseCVModel(metaclass=Metrics):
    best_params: Dict[str, Any] = None
    __best_model: PipelineModel = None
    logger = get_logger()
    metrics: Dict[str, Callable] = dict()

    def __init__(self, estimator=None, evaluator=None):
        self.set_params(estimator, evaluator)

    def set_params(self, estimator=None, evaluator=None):
        assert estimator is None or isinstance(estimator, Estimator),\
            'estimator must be a pyspark.ml.base.Estimator.'
        assert evaluator is None or isinstance(evaluator, Evaluator),\
            'evaluator must be a pyspark.ml.evaluation.Evaluator.'
        if estimator is not None:
            self.estimator = estimator
        if evaluator is not None:
            self.evaluator = evaluator

    def _get_features(self, df: Optional[DataFrame] = None):
        """
        Returns three lists of feature names to be used in the model training.
        Specifically binary, numeric (continuous), categorical features in that order.

        Returns
        -------
        binary: List[str]
        numeric: List[str]
        categorical: List[str]
        """
        raise NotImplementedError

    def train(
        self,
        df: DataFrame,
        params_map: Optional[Dict[str, List[Any]]] = None,
        num_folds: Optional[int] = 10,
        collect_sub_models: Optional[bool] = False,
        return_cv: Optional[bool] = False
    ) -> Union[PipelineModel, Tuple[PipelineModel, CrossValidatorModel]]:
        """
        Train model.

        Params
        ------
        df: Spark DataFrame
            Input train data
        params_map: Optional[Dict[str, List[Any]]] (default=None)
            Parameters mapping to grid search over
        num_folds: Optional[int] (default=10)
            Number of cross-validation folds
        collect_sub_models: Optional[bool] (default=False)
            Collect models per fold per parameter combination
        return_cv: Optional[bool] (default=False)
            Additionally return the CrossValidatorModel object or not

        Returns
        -------
        best_model: PipelineModel
            The (best) model trained on df.
        cv_model: Optional[CrossValidatorModel]
            The CrossValidatorModel object.
        """
        # get input features
        binary, numeric, categorical = self._get_features(df)

        # convert categorical to numeric labels
        indexed_cols = [f'{c}_idx' for c in categorical]
        indexers = [
            # inputCol is the original column name, i.e. outputCol without the '_idx' suffix
            StringIndexer(inputCol=c[:-len('_idx')], outputCol=c)
            for c in indexed_cols
        ]
        self.features = binary + numeric + indexed_cols
        self.logger.info(f'Final model features list: {self.features}')

        # assemble features into feature vector
        assembler = VectorAssembler(inputCols=self.features,
                                    outputCol=self.estimator.getFeaturesCol())
        p = Pipeline(stages=indexers + [assembler]).fit(df)
        self.logger.info('Index and vector assemble features')
        df = p.transform(df)\
            .select(self.estimator.getFeaturesCol(),
                    self.estimator.getLabelCol())

        # if provided, set estimator params map
        if params_map:
            self.params_map = params_map

        # run cross-validation and choose the best set of parameters
        self.logger.info('Start Cross Validation')
        cv_params = {
            'estimator': self.estimator,
            'estimatorParamMaps': self.__params_grid,
            'evaluator': self.evaluator,
            'numFolds': num_folds,
            'collectSubModels': collect_sub_models
        }
        cv_model = CrossValidator(**cv_params).fit(df)

        # set the best model
        p.stages.append(cv_model.bestModel)
        self.best_model = p
        self.logger.info(
            f'Set the best model with best params: {self.best_params}')
        if return_cv:
            return self.best_model, cv_model
        else:
            return self.best_model

    def test(self, df: DataFrame) -> Tuple[DataFrame, Dict[str, Any]]:
        """
        Test the best model found so far.

        Params
        ------
        df: Spark DataFrame
            Input test data

        Returns
        -------
        predictions: DataFrame
            DataFrame `df` with added `prediction` column
        results: Dict[str, Any]
            Dictionary with any results from testing the model
            e.g., metrics, feature importances, plots etc.
        """
        assert self.best_model is not None, 'Call train() or load() first.'
        df = df.withColumnRenamed(self.estimator.getLabelCol(), 'label')
        self.logger.info('Get model predictions')
        predictions = self.best_model.transform(df)

        # execute all metrics
        results = {'best_params': self.best_params}
        for name, metric in self.metrics.items():
            results.update({name: metric(self, predictions)})
        self.logger.info(f'Results: {results}')
        return predictions, results

    def train_final(self, df: DataFrame):
        """
        Train final model using best parameters found on given dataframe.

        Params
        ------
        df: Spark DataFrame
            (Ideally) both train and test combined
        """
        assert self.best_params is not None, 'Call train() or load() first.'
        est = self.estimator\
            .setParams(**self.best_params)
        # for k, v in self.best_params.items():
        #     getattr(est, 'set' + k[0].upper() + k[1:])(v)
        est = Pipeline(stages=self.best_model.stages[:-1] + [est])
        self.best_model = est.fit(df)

    def score(self, df: DataFrame) -> DataFrame:
        """
        Score on given dataset using best model found so far.

        Params
        ------
        df: Spark DataFrame
            Input data to score

        Returns
        -------
        df: Spark DataFrame
            Same as input with additional prediction columns
        """
        return self.best_model.transform(df)

    @property
    def params_map(self) -> Dict[str, List[Any]]:
        return self.__params_map

    @params_map.setter
    def params_map(self, params_map: Dict[str, List[Any]]):
        assert isinstance(params_map, dict)
        self.__params_map = params_map
        self.__params_grid = ParamGridBuilder()
        for k, v in params_map.items():
            self.__params_grid.addGrid(getattr(self.estimator, k), v)
        self.__params_grid = self.__params_grid.build()

    @property
    def best_model(self) -> PipelineModel:
        return self.__best_model

    @best_model.setter
    def best_model(self, model):
        assert isinstance(model, PipelineModel),\
            'model must be of type PipelineModel.'
        self.__best_model = model
        est = model.stages[-1]
        # coalesce = lambda *x: next(y for y in x if y is not None)
        if est._java_obj.parent() is not None\
                and self.params_map is not None:
            self.best_params = {
                k: getattr(est._java_obj.parent(), 'get' + k[0].upper() + k[1:])()
                for k in self.params_map.keys()
            }
        elif hasattr(est, 'extractParamMap'):
            self.best_params = {
                param[0].name: param[1]
                for param in est.extractParamMap().items()
            }
        else:
            self.best_params = None

    def save(self, path: str):
        self.best_model.save(path)
        # save additional model metadata
        metadata = {}
        if hasattr(self, 'best_params'):
            metadata.update({'best_params': self.best_params})
        if hasattr(self, 'features'):
            metadata.update({'features': self.features})
        with open(path + '/BaseCVModel_metadata', 'w') as fp:
            json.dump(metadata, fp, separators=[',', ':'])

    def load(self, path: str):
        self.best_model = PipelineModel.load(path)
        # load additional model metadata
        metadata_exists = (os.path.isfile(path + '/BaseCVModel_metadata'),
                           os.path.isfile(path + '/BaseModel_metadata'))
        if metadata_exists[0]:
            path = path + '/BaseCVModel_metadata'
        elif metadata_exists[1]:
            # for backward compatibility
            path = path + '/BaseModel_metadata'
        if any(metadata_exists):
            with open(path, 'r') as fp:
                metadata = json.load(fp)
            if 'best_params' in metadata:
                self.best_params = metadata['best_params']
            if 'features' in metadata:
                self.features = metadata['features']
        return self

    @Metrics.register()
    def self_evaluator(self, predictions: DataFrame):
        if hasattr(self, 'evaluator')\
                and hasattr(self.evaluator, 'getMetricName'):
            return {
                self.evaluator.getMetricName():
                    self.evaluator.evaluate(predictions)
            }
import numpy as np

from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression(featuresCol='features', labelCol='Survived')
model = lr.fit(tr)
pred = model.transform(te)

import pyspark.ml.evaluation as evals

# Note: the default rawPredictionCol='rawPrediction' gives a threshold-free AUC;
# pointing the evaluator at the thresholded 'prediction' column scores hard labels instead.
evaluator = evals.BinaryClassificationEvaluator(rawPredictionCol='prediction', labelCol='Survived')
AUC = evaluator.evaluate(pred)
print('AUC:', AUC)

############# model tuning
from pyspark.ml.tuning import ParamGridBuilder

params = ParamGridBuilder()
params = params.addGrid(lr.regParam, np.arange(0, .1, .01))
params = params.addGrid(lr.elasticNetParam, [0, .5, 1])
params = params.build()
print("Number of models to be tested:", len(params))

# create the CrossValidator
from pyspark.ml.tuning import CrossValidator

cv = CrossValidator(estimator=lr, estimatorParamMaps=params, evaluator=evaluator)
cv = cv.setNumFolds(10).setSeed(24).fit(tr)

# extract the best model
best_model = cv.bestModel
pred = best_model.transform(te)
print(evaluator.evaluate(pred))
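# A minimal follow-up sketch, assuming `cv` (the fitted CrossValidatorModel)
# and `best_model` from the block above: pair each grid point with its average
# cross-validation metric and print the parameters of the winning model.
for param_map, metric in zip(cv.getEstimatorParamMaps(), cv.avgMetrics):
    settings = {p.name: v for p, v in param_map.items()}
    print(settings, '->', metric)

print('Best model params:',
      {p.name: v for p, v in best_model.extractParamMap().items()})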