def run_ML_gbt_crossValidation(model, train, test, df, name):
    """Cross-validate ``model`` over ``maxDepth`` x ``maxIter`` and report the winner.

    Follows the same recipe as ``run_ML_regression_crossValidation``: build a
    parameter grid, run a 5-fold ``CrossValidator`` scored by a
    default-configured ``BinaryClassificationEvaluator``, keep the best model.

    Args:
        model: estimator exposing ``maxDepth`` and ``maxIter`` params.
        train: dataset used to fit the cross-validator and to produce the
            returned predictions.
        test: not used by this function.
            NOTE(review): predictions are computed on ``train`` -- confirm
            that this is intended.
        df: table with a ``'Name'`` column (presumably a pandas DataFrame);
            it is copied, the argument itself is not modified.
        name: rows whose ``'Name'`` equals this value get ``'Best_Param'`` set.

    Returns:
        ``(predictions, best_model, summary)`` where ``summary`` is the
        annotated copy of ``df``.
    """
    # Only maxDepth and maxIter are searched.
    param_maps = (
        tune.ParamGridBuilder()
        .addGrid(model.maxDepth, [1, 10, 20, 30])
        .addGrid(model.maxIter, [7, 10, 14, 18])
        .build()
    )
    cross_validator = tune.CrossValidator(
        estimator=model,
        estimatorParamMaps=param_maps,
        evaluator=BinaryClassificationEvaluator(),
        numFolds=5,
    )
    best = cross_validator.fit(train).bestModel

    # Record the winning hyper-parameters on every row that matches `name`.
    summary = df.copy()
    for row_label, row in summary.iterrows():
        if row['Name'] != name:
            continue
        java_model = best._java_obj
        summary.at[row_label, 'Best_Param'] = (
            f"maxDepth: {java_model.getMaxDepth()!s}"
            f" - maxIter: {java_model.getMaxIter()!s}"
        )

    return best.transform(train), best, summary
def GBDTclf(trainingData, testData):
    """Tune the module-level ``GBDT`` estimator and score it on ``testData``.

    Runs a 3-fold cross-validation over ``maxDepth`` in ``[1, 5, 10]``,
    predicts on ``testData``, writes the predictions to
    ``res/predictedGBDT_spark.txt`` and prints a header line describing the
    metric order.

    Args:
        trainingData: dataset the cross-validator is fitted on.
        testData: dataset that is transformed for evaluation; the code reads
            its ``label`` column and the produced ``prediction`` column.

    Returns:
        Whatever ``evaluate(label, predict)`` returns (the printed header
        suggests accuracy, precision, recall and f1 -- confirm against
        ``evaluate``).
    """
    depth_candidates = [1, 5, 10]
    param_maps = (
        tune.ParamGridBuilder()
        .addGrid(GBDT.maxDepth, depth_candidates)
        .build()
    )
    scorer = ev.BinaryClassificationEvaluator(
        rawPredictionCol='probability',
        labelCol='label',
    )
    # 3-fold validation
    cross_validator = tune.CrossValidator(
        estimator=GBDT,
        estimatorParamMaps=param_maps,
        evaluator=scorer,
        numFolds=3,
    )
    scored = cross_validator.fit(trainingData).transform(testData)

    # Pull both columns to the driver as numpy arrays.
    label = scored.select("label").toPandas().values
    predict = scored.select("prediction").toPandas().values

    np.savetxt('res/predictedGBDT_spark.txt', predict, fmt='%01d')
    print("[accuracy,precision,recall,f1]")
    return evaluate(label, predict)
def create_grid(self, dict_features):
    """Build a hyper-parameter grid from a ``{param_name: values}`` mapping.

    Each key names an attribute of ``model`` (dotted names are followed
    attribute by attribute). A single ``int``/``float`` value pins that
    parameter to one fixed value; any other value is used as the collection
    of candidates to search over.

    Args:
        dict_features: mapping of parameter attribute name to either one
            number or a collection of candidate values.

    Returns:
        The result of ``ParamGridBuilder.build()``. The grid used to be
        assembled and then dropped, so the function returned ``None``.
    """
    from operator import attrgetter

    param_grid = tuning.ParamGridBuilder()
    for model_parameter, grid_values in dict_features.items():
        # Plain attribute lookup instead of eval(): same result for attribute
        # names, without executing arbitrary text taken from the dict keys.
        # NOTE(review): `model` is neither a parameter nor read from `self`;
        # it is resolved from the enclosing scope exactly as before -- confirm
        # that this is the intended estimator.
        param = attrgetter(model_parameter)(model)
        if isinstance(grid_values, (int, float)):
            # baseOn() accepts a dict or (param, value) pairs; passing the
            # param and the value as two separate positionals makes it try to
            # unpack the Param object itself and fail.
            param_grid.baseOn((param, grid_values))
        else:
            param_grid.addGrid(param, grid_values)
    return param_grid.build()
def train_evaluate(train_data, test_data):
    """Fit a cross-validated random-forest pipeline and evaluate it.

    The pipeline indexes and one-hot encodes ``alchemy_category``, assembles
    it with ``train_data.columns[4:-1]`` into ``features``, then runs a 5-fold
    ``CrossValidator`` over a random-forest grid scored by area under ROC.

    Args:
        train_data: dataset the whole pipeline is fitted on.
        test_data: dataset handed to ``evaluate_model``.

    Returns:
        ``(AUC, AP, best_parm, cv_pipeline_model)`` -- the two values returned
        by ``evaluate_model``, the result of ``get_best_param`` on the last
        pipeline stage, and the fitted pipeline model.
    """
    # Convert the textual categorical feature into numeric indices.
    category_indexer = ft.StringIndexer(
        inputCol='alchemy_category',
        outputCol="alchemy_category_Index",
    )
    category_encoder = ft.OneHotEncoder(
        dropLast=False,
        inputCol='alchemy_category_Index',
        outputCol="alchemy_category_IndexVec",
    )
    feature_assembler = ft.VectorAssembler(
        inputCols=['alchemy_category_IndexVec'] + train_data.columns[4:-1],
        outputCol="features",
    )

    forest = cl.RandomForestClassifier(labelCol="label", featuresCol="features")
    scorer = ev.BinaryClassificationEvaluator(
        rawPredictionCol="probability",
        labelCol='label',
        metricName='areaUnderROC',
    )
    param_maps = (
        tune.ParamGridBuilder()
        .addGrid(forest.impurity, ["gini", "entropy"])
        .addGrid(forest.maxDepth, [5, 10, 15])
        .addGrid(forest.maxBins, [10, 15, 20])
        .addGrid(forest.numTrees, [10, 20, 30])
        .build()
    )
    forest_cv = tune.CrossValidator(
        estimator=forest,
        estimatorParamMaps=param_maps,
        evaluator=scorer,
        numFolds=5,
    )

    stages = [category_indexer, category_encoder, feature_assembler, forest_cv]
    cv_pipeline_model = Pipeline(stages=stages).fit(train_data)

    # The last fitted stage is the cross-validation result.
    best_parm = get_best_param(cv_pipeline_model.stages[-1])
    AUC, AP = evaluate_model(cv_pipeline_model, test_data)
    return AUC, AP, best_parm, cv_pipeline_model
def run_ML_regression_crossValidation(model, train, test, df, name):
    """Pick ``regParam``, ``elasticNetParam`` and ``maxIter`` by 5-fold CV.

    A grid of every combination of the three hyper-parameters is searched;
    candidates are compared with a default-configured
    ``BinaryClassificationEvaluator`` and the best fitted model is kept.

    Args:
        model: estimator exposing ``regParam``, ``elasticNetParam`` and
            ``maxIter`` params.
        train: dataset used to fit the cross-validator and to produce the
            returned predictions.
        test: not used by this function.
            NOTE(review): predictions are computed on ``train`` -- confirm
            that this is intended.
        df: table with a ``'Name'`` column (presumably a pandas DataFrame);
            it is copied, the argument itself is not modified.
        name: rows whose ``'Name'`` equals this value get ``'Best_Param'`` set.

    Returns:
        ``(predictions, best_model, summary)`` where ``summary`` is the
        annotated copy of ``df``.
    """
    # Candidate values for each hyper-parameter.
    param_maps = (
        tune.ParamGridBuilder()
        .addGrid(model.regParam, np.arange(0, .1, .01))
        .addGrid(model.elasticNetParam, [0, 1])
        .addGrid(model.maxIter, [1, 5, 10])
        .build()
    )

    # NOTE(review): collectSubModels=True is requested, but only bestModel is
    # read from the fitted result below.
    cross_validator = tune.CrossValidator(
        estimator=model,
        estimatorParamMaps=param_maps,
        evaluator=BinaryClassificationEvaluator(),
        numFolds=5,
        collectSubModels=True,
    )
    best = cross_validator.fit(train).bestModel

    # Record the winning hyper-parameters on every row that matches `name`.
    summary = df.copy()
    for row_label, row in summary.iterrows():
        if row['Name'] != name:
            continue
        java_model = best._java_obj
        summary.at[row_label, 'Best_Param'] = (
            f"regParam: {java_model.getRegParam()!s}"
            f" - MaxIter: {java_model.getMaxIter()!s}"
            f" - elasticNetParam: {java_model.getElasticNetParam()!s}"
        )

    return best.transform(train), best, summary
def run_ML_mpc_crossValidation(model, train, test, df, name):
    """Run 5-fold cross-validation on ``model`` without searching any params.

    Same recipe as ``run_ML_regression_crossValidation``, but nothing is added
    to the parameter grid, so no alternative settings are compared. Scoring
    uses a default-configured ``MulticlassClassificationEvaluator``.

    Args:
        model: estimator to cross-validate.
        train: dataset used to fit the cross-validator and to produce the
            returned predictions.
        test: not used by this function.
            NOTE(review): predictions are computed on ``train`` -- confirm
            that this is intended.
        df: table with a ``'Name'`` column (presumably a pandas DataFrame);
            it is copied, the argument itself is not modified.
        name: rows whose ``'Name'`` equals this value get ``'Best_Param'`` set
            to ``"unspecified"``.

    Returns:
        ``(predictions, best_model, summary)`` where ``summary`` is the
        annotated copy of ``df``.
    """
    cross_validator = tune.CrossValidator(
        estimator=model,
        estimatorParamMaps=tune.ParamGridBuilder().build(),
        evaluator=MulticlassClassificationEvaluator(),
        numFolds=5,
    )
    best = cross_validator.fit(train).bestModel

    # No parameters were tuned, so there is nothing specific to report.
    summary = df.copy()
    for row_label, row in summary.iterrows():
        if row['Name'] != name:
            continue
        summary.at[row_label, 'Best_Param'] = "unspecified"

    return best.transform(train), best, summary
def run_ML_dt_crossValidation(model, train, test, df, name):
    """Cross-validate ``model`` over ``maxDepth`` x ``maxBins`` and report the winner.

    Same recipe as ``run_ML_regression_crossValidation``: 5-fold
    ``CrossValidator`` scored by a default-configured
    ``MulticlassClassificationEvaluator``.

    Args:
        model: estimator exposing ``maxDepth`` and ``maxBins`` params.
        train: dataset used to fit the cross-validator and to produce the
            returned predictions.
        test: not used by this function.
            NOTE(review): predictions are computed on ``train`` -- confirm
            that this is intended.
        df: table with a ``'Name'`` column (presumably a pandas DataFrame);
            it is copied, the argument itself is not modified.
        name: rows whose ``'Name'`` equals this value get ``'Best_Param'`` set.

    Returns:
        ``(predictions, best_model, summary)`` where ``summary`` is the
        annotated copy of ``df``.
    """
    param_maps = (
        tune.ParamGridBuilder()
        .addGrid(model.maxDepth, [4, 8])
        .addGrid(model.maxBins, [2, 4, 6])
        .build()
    )
    cross_validator = tune.CrossValidator(
        estimator=model,
        estimatorParamMaps=param_maps,
        evaluator=MulticlassClassificationEvaluator(),
        numFolds=5,
    )
    best = cross_validator.fit(train).bestModel

    # Record the winning hyper-parameters on every row that matches `name`.
    summary = df.copy()
    for row_label, row in summary.iterrows():
        if row['Name'] != name:
            continue
        java_model = best._java_obj
        summary.at[row_label, 'Best_Param'] = (
            f"maxDepth: {java_model.getMaxDepth()!s}"
            f" - maxBins: {java_model.getMaxBins()!s}"
        )

    return best.transform(train), best, summary
# Estimator, evaluation and tuning pieces used below.
from pyspark.ml.classification import LogisticRegression
import pyspark.ml.evaluation as evals
import pyspark.ml.tuning as tune

# Logistic-regression estimator with default settings.
lr = LogisticRegression()

# Candidates are compared by area under the ROC curve.
evaluator = evals.BinaryClassificationEvaluator(metricName="areaUnderROC")

# Parameter grid: ten regParam values crossed with the two elasticNetParam
# extremes (0 and 1).
grid = (
    tune.ParamGridBuilder()
    .addGrid(lr.regParam, np.arange(0, .1, .01))
    .addGrid(lr.elasticNetParam, [0, 1])
    .build()
)

# Cross-validator wired to the estimator, grid and evaluator.
cv = tune.CrossValidator(
    estimator=lr,
    estimatorParamMaps=grid,
    evaluator=evaluator,
)

# NOTE(review): the plain estimator is fitted here, not `cv`, so the grid
# above is not searched by this statement.
best_lr = lr.fit(training)
color='blue', ax=ax, width=width, position=1, legend=True) display() # COMMAND ---------- # MAGIC %md # MAGIC Applying regularization to further improve our RMSE # MAGIC First step to build a grid of two parameters - ElasticNetRegularization # COMMAND ---------- grid = tune.ParamGridBuilder() # COMMAND ---------- grid = grid.addGrid(regScaled.elasticNetParam, [0, 0.2, 0.4, 0.6, 0.8, 1]) # COMMAND ---------- grid = grid.addGrid(regScaled.regParam, np.arange(0, .1, .01)) # COMMAND ---------- grid = grid.build() # COMMAND ----------
## weightCol:
# Rows with y == 1 get weight 1.0, every other row 0.02.
data = data.withColumn(
    'weight',
    fn.when(data['y'] == 1, 1.0).otherwise(0.02),
)

train, test = data.randomSplit([0.7, 0.3], seed=1234)  # 42

# maxIter and regParam are deliberately not fixed here: both are searched by
# the grid below.
lr_model = cl.LogisticRegression(
    elasticNetParam=0,
    family='binomial',
    threshold=0.5,
    weightCol='weight',
    labelCol='y',
)

grid = (
    tune.ParamGridBuilder()
    .addGrid(lr_model.maxIter, [200, 300, 500, 800])
    .addGrid(lr_model.regParam, [0.001, 0.002])
    .build()
)

evaluator = ev.BinaryClassificationEvaluator(
    rawPredictionCol='probability',
    labelCol='y',
)

# 3-fold cross-validation over the grid.
cv = tune.CrossValidator(
    estimator=lr_model,
    estimatorParamMaps=grid,
    evaluator=evaluator,
    numFolds=3,
)

# Feature pipeline is fitted on the training split only, then applied to
# both splits.
ppline = Pipeline(stages=[featuerCreator])
train_transfomer = ppline.fit(train)

cv_model = cv.fit(train_transfomer.transform(train))
test = train_transfomer.transform(test)
import pyspark.ml.tuning as tune

# Reload the saved pipeline model and apply it to the test split.
loadedPipelineModel = PipelineModel.load(modelPath)
test_loadedModel = loadedPipelineModel.transform(births_test)
print('test_loadedModel:', test_loadedModel)

# Hyper-parameter tuning
# Use a grid search over maxIter and regParam.
logistic = cl.LogisticRegression(labelCol='INFANT_ALIVE_AT_REPORT')

grid = (
    tune.ParamGridBuilder()
    .addGrid(logistic.maxIter, [2, 10, 50])
    .addGrid(logistic.regParam, [0.01, 0.05, 0.3])
    .build()
)

evaluator = ev.BinaryClassificationEvaluator(
    rawPredictionCol='probability',
    labelCol='INFANT_ALIVE_AT_REPORT',
)

# Cross-validator tying together estimator, grid and evaluator.
cv = tune.CrossValidator(
    estimator=logistic,
    estimatorParamMaps=grid,
    evaluator=evaluator,
)
## 超参调优 Parameter hyper-tuning ### 创建评估器 Create an estimator import pyspark.ml.classification as cl logistic = cl.LogisticRegression( labelCol='label') # 对评估器的参数还需进一步进行超参调优,故先不设定超参数 ### 网格搜索 Grid search import pyspark.ml.tuning as tune grid = tune.ParamGridBuilder() \ .addGrid(logistic.maxIter, [10, 50, 80]) \ .addGrid(logistic.regParam, [0.01, 0.001]) \ .build() ### 创建管道 Create a pipeline from pyspark.ml import Pipeline pipeline = Pipeline(stages=[featuresCreator, indexer]) data_transformer = pipeline.fit(data_train) ## 模型拟合及性能评估 Fit the model & Model performance # 使用BinaryClassificationEvaluator评估模型性能 import pyspark.ml.evaluation as ev evaluator = ev.BinaryClassificationEvaluator(rawPredictionCol='probability',
# Scale -> reduce -> classify.
pipeline = sm.Pipeline(stages=[scaler, reducer, classifier])

# Evaluator that quantifies model performance (F1 on `predictedLabel`).
# A BinaryClassificationEvaluator on areaUnderROC was the earlier choice:
# evaluator = sme.BinaryClassificationEvaluator(
#     labelCol='label',
#     rawPredictionCol='predictedLabel',
#     metricName='areaUnderROC'
# )
eval_f1 = sme.MulticlassClassificationEvaluator(
    labelCol='label',
    predictionCol='predictedLabel',
    metricName='f1',
)

# Parameter grid for cross validation: reducer size, tree depth and
# subsampling rate.
param_grid = (
    smt.ParamGridBuilder()
    .addGrid(reducer.k, [10, 20, 50, 75])
    .addGrid(classifier.maxDepth, [2, 5, 10])
    .addGrid(classifier.subsamplingRate, [0.1, 0.2, 0.3])
    .build()
)

# Bring everything together in a 3-fold cross-validator.
validator = smt.CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=param_grid,
    evaluator=eval_f1,
    numFolds=3,
)

# Fit the model to the data
#######################################################################
model = validator.fit(train)

# Score both the training and the validation sets with the fitted model.
train_predictions = model.transform(train)
val_predictions = model.transform(val)
def _transform(self, *args, **kwargs):
    """Predict on the first positional argument and cast the output to double.

    Args:
        *args: the first element is the data frame handed to
            ``self._predictor.predict``; further positionals are ignored.
        **kwargs: accepted but not used.

    Returns:
        The prediction frame with the predictor's output column replaced by
        ``cast_to_double`` applied to that same column.
    """
    frame = args[0]
    column = self._predictor.output_column
    predicted = self._predictor.predict(frame)
    return predicted.withColumn(column, cast_to_double(predicted[column]))


# UDF: take element 0 of the incoming value and return it as a float.
cast_to_double = functions.udf(lambda row: float(row[0]), types.DoubleType())

# Keyword-argument grid for the estimator. baseOn() pins a single value,
# addGrid() lists the candidates for one keyword.
param_grid = (
    tuning.ParamGridBuilder()
    .baseOn(['regularizer', regularizers.l1_l2])
    .addGrid('activations', [['tanh', 'relu']])
    .addGrid('initializers', [['glorot_normal','glorot_uniform']])
    .addGrid('layer_dims', [[input_dim, 2000, 300, 1]])
    .addGrid('metrics', [['mae']])
    .baseOn(['learning_rate', 1e-2])
    .baseOn(['reg_strength', 1e-2])
    .baseOn(['reg_decay', 0.25])
    .baseOn(['lr_decay', 0.90])
    .addGrid('dropout_rate', [0.20, 0.35, 0.50, 0.65, 0.80])
    .addGrid('loss', ['mse', 'msle'])
    .build()
)

# The estimator is configured from the first combination in the grid, whose
# string keys are expanded into keyword arguments.
estimator = DistKeras(
    trainers.ADAG,
    {
        'batch_size': 256,
        'communication_window': 3,
        'num_epoch': 10,
        'num_workers': 50,
    },
    **param_grid[0]
)
def hyper_parameter_optimization_ml():
    """Grid-search a logistic regression on the births dataset with CrossValidator.

    Reads ``dataset/births_transformed.csv.gz``, builds the feature vector,
    cross-validates ``maxIter`` x ``regParam`` on the 70% training split and
    prints, for the 30% test split, area under ROC and area under PR,
    followed by the parameter combination with the highest average
    cross-validation metric.

    Returns:
        None. All results are printed.
    """
    spark = SparkSession.builder.appName('hyper-parameter-optimization-ml').getOrCreate()
    spark.sparkContext.setLogLevel('WARN')

    # (column name, Spark type) for every column of the CSV, in file order.
    labels = [
        ('INFANT_ALIVE_AT_REPORT', types.IntegerType()),
        ('BIRTH_PLACE', types.StringType()),
        ('MOTHER_AGE_YEARS', types.IntegerType()),
        ('FATHER_COMBINED_AGE', types.IntegerType()),
        ('CIG_BEFORE', types.IntegerType()),
        ('CIG_1_TRI', types.IntegerType()),
        ('CIG_2_TRI', types.IntegerType()),
        ('CIG_3_TRI', types.IntegerType()),
        ('MOTHER_HEIGHT_IN', types.IntegerType()),
        ('MOTHER_PRE_WEIGHT', types.IntegerType()),
        ('MOTHER_DELIVERY_WEIGHT', types.IntegerType()),
        ('MOTHER_WEIGHT_GAIN', types.IntegerType()),
        ('DIABETES_PRE', types.IntegerType()),
        ('DIABETES_GEST', types.IntegerType()),
        ('HYP_TENS_PRE', types.IntegerType()),
        ('HYP_TENS_GEST', types.IntegerType()),
        ('PREV_BIRTH_PRETERM', types.IntegerType()),
    ]
    schema = types.StructType(
        [types.StructField(col_name, col_type, False) for col_name, col_type in labels]
    )
    births = spark.read.csv(
        'dataset/births_transformed.csv.gz',
        header=True,
        schema=schema,
    )

    # BIRTH_PLACE is read as a string; the encoder works on its integer cast.
    births = births.withColumn(
        'BIRTH_PLACE_INT',
        births['BIRTH_PLACE'].cast(types.IntegerType()),
    )
    encoder = ml_feature.OneHotEncoder(
        inputCol='BIRTH_PLACE_INT',
        outputCol='BIRTH_PLACE_VEC',
    )

    # Features: every column after the first two, plus the encoded birth place.
    feature_columns = [col_name for col_name, _ in labels[2:]]
    feature_columns.append(encoder.getOutputCol())
    features_creator = ml_feature.VectorAssembler(
        inputCols=feature_columns,
        outputCol='features',
    )

    births_train, births_test = births.randomSplit([0.7, 0.3], seed=666)

    # Transform-only pipeline, fitted on the training split.
    data_transformer = Pipeline(stages=[encoder, features_creator]).fit(births_train)

    # Model and the parameter values to loop through.
    logistic = ml_classification.LogisticRegression(labelCol='INFANT_ALIVE_AT_REPORT')
    param_maps = (
        tune.ParamGridBuilder()
        .addGrid(logistic.maxIter, [2, 10, 50])
        .addGrid(logistic.regParam, [0.01, 0.05, 0.3])
        .build()
    )

    # Metric used to compare the candidate models.
    evaluator = ml_eval.BinaryClassificationEvaluator(
        rawPredictionCol='probability',
        labelCol='INFANT_ALIVE_AT_REPORT',
    )

    cross_validator = tune.CrossValidator(
        estimator=logistic,
        estimatorParamMaps=param_maps,
        evaluator=evaluator,
    )
    cv_model = cross_validator.fit(data_transformer.transform(births_train))

    # Score the held-out split with the cross-validated model.
    test_features = data_transformer.transform(births_test)
    predictions = cv_model.transform(test_features)
    print(evaluator.evaluate(predictions, {evaluator.metricName: 'areaUnderROC'}))
    print(evaluator.evaluate(predictions, {evaluator.metricName: 'areaUnderPR'}))

    # Pair every parameter combination with its average CV metric and print
    # the highest-scoring one.
    scored_params = []
    for param_map, avg_metric in zip(cv_model.getEstimatorParamMaps(), cv_model.avgMetrics):
        readable = [{param.name: value} for param, value in param_map.items()]
        scored_params.append((readable, avg_metric))
    ranked = sorted(scored_params, key=lambda pair: pair[1], reverse=True)
    print(ranked[0])
def train_validation_splitting_ml():
    """Grid-search a logistic regression on the births dataset with TrainValidationSplit.

    Reads ``dataset/births_transformed.csv.gz``, builds the feature vector,
    keeps the top five features chosen by ``ChiSqSelector``, tunes
    ``maxIter`` x ``regParam`` on the 70% training split and prints area
    under ROC and area under PR for the 30% test split.

    Returns:
        None. All results are printed.
    """
    spark = SparkSession.builder.appName('train-validation-splitting-ml').getOrCreate()
    spark.sparkContext.setLogLevel('WARN')

    # (column name, Spark type) for every column of the CSV, in file order.
    labels = [
        ('INFANT_ALIVE_AT_REPORT', types.IntegerType()),
        ('BIRTH_PLACE', types.StringType()),
        ('MOTHER_AGE_YEARS', types.IntegerType()),
        ('FATHER_COMBINED_AGE', types.IntegerType()),
        ('CIG_BEFORE', types.IntegerType()),
        ('CIG_1_TRI', types.IntegerType()),
        ('CIG_2_TRI', types.IntegerType()),
        ('CIG_3_TRI', types.IntegerType()),
        ('MOTHER_HEIGHT_IN', types.IntegerType()),
        ('MOTHER_PRE_WEIGHT', types.IntegerType()),
        ('MOTHER_DELIVERY_WEIGHT', types.IntegerType()),
        ('MOTHER_WEIGHT_GAIN', types.IntegerType()),
        ('DIABETES_PRE', types.IntegerType()),
        ('DIABETES_GEST', types.IntegerType()),
        ('HYP_TENS_PRE', types.IntegerType()),
        ('HYP_TENS_GEST', types.IntegerType()),
        ('PREV_BIRTH_PRETERM', types.IntegerType()),
    ]
    schema = types.StructType(
        [types.StructField(col_name, col_type, False) for col_name, col_type in labels]
    )
    births = spark.read.csv(
        'dataset/births_transformed.csv.gz',
        header=True,
        schema=schema,
    )

    # BIRTH_PLACE is read as a string; the encoder works on its integer cast.
    births = births.withColumn(
        'BIRTH_PLACE_INT',
        births['BIRTH_PLACE'].cast(types.IntegerType()),
    )
    encoder = ml_feature.OneHotEncoder(
        inputCol='BIRTH_PLACE_INT',
        outputCol='BIRTH_PLACE_VEC',
    )

    # Features: every column after the first two, plus the encoded birth place.
    feature_columns = [col_name for col_name, _ in labels[2:]]
    feature_columns.append(encoder.getOutputCol())
    features_creator = ml_feature.VectorAssembler(
        inputCols=feature_columns,
        outputCol='features',
    )

    births_train, births_test = births.randomSplit([0.7, 0.3], seed=666)

    # Keep only the top five features.
    selector = ml_feature.ChiSqSelector(
        numTopFeatures=5,
        featuresCol=features_creator.getOutputCol(),
        outputCol='selectedFeatures',
        labelCol='INFANT_ALIVE_AT_REPORT',
    )

    # Transform-only pipeline, fitted on the training split.
    data_transformer = Pipeline(
        stages=[encoder, features_creator, selector]
    ).fit(births_train)

    # The model consumes the selected features only.
    logistic = ml_classification.LogisticRegression(
        labelCol='INFANT_ALIVE_AT_REPORT',
        featuresCol='selectedFeatures',
    )
    param_maps = (
        tune.ParamGridBuilder()
        .addGrid(logistic.maxIter, [2, 10, 50])
        .addGrid(logistic.regParam, [0.01, 0.05, 0.3])
        .build()
    )

    # Metric used to compare the candidate models.
    evaluator = ml_eval.BinaryClassificationEvaluator(
        rawPredictionCol='probability',
        labelCol='INFANT_ALIVE_AT_REPORT',
    )

    splitter = tune.TrainValidationSplit(
        estimator=logistic,
        estimatorParamMaps=param_maps,
        evaluator=evaluator,
    )
    tvs_model = splitter.fit(data_transformer.transform(births_train))

    # Score the held-out split with the tuned model.
    test_features = data_transformer.transform(births_test)
    predictions = tvs_model.transform(test_features)
    print(evaluator.evaluate(predictions, {evaluator.metricName: 'areaUnderROC'}))
    print(evaluator.evaluate(predictions, {evaluator.metricName: 'areaUnderPR'}))