def _build_param_grid(self):
    param_grid_builder = ParamGridBuilder()
    param_grid_builder.addGrid(self.tokenizer.tokenizer, self.tokenizer_map)
    param_grid_builder.addGrid(self.ngram.n, self.ngram_map)
    param_grid_builder.addGrid(self.hashing_tf.numFeatures, self.hashing_tf_map)
    param_grid_builder.addGrid(self.clf.regParam, self.clf_map)
    return param_grid_builder.build()
def _get_param_grid(self):
    return ParamGridBuilder() \
        .addGrid(self.classifier.smoothing, [0.0, 0.2, 0.4, 0.6, 0.8, 1.0]) \
        .build()
# make predictions on test set
make_predictions(GBT_model, test_GBT)

# COMMAND ----------

# MAGIC %md #### Train with cross validation

# COMMAND ----------

train_cv = train_GBT.union(val_GBT)

# COMMAND ----------

# set parameter search grid
paramGrid = ParamGridBuilder()\
    .addGrid(gbt.maxDepth, [1, 10])\
    .build()

# options for classification evaluator
evaluator = BinaryClassificationEvaluator(labelCol="label")

# cross validation
cv = CrossValidator(estimator=gbt,
                    evaluator=evaluator,
                    estimatorParamMaps=paramGrid,
                    numFolds=3)

# train GBT model with cross validation
cv_model = cv.fit(train_cv)

# COMMAND ----------
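# A minimal follow-up sketch (an assumption, not an original cell): evaluate the
# cross-validated model on the held-out test set with the notebook's own
# make_predictions helper, using the standard CrossValidatorModel.bestModel attribute.
make_predictions(cv_model.bestModel, test_GBT)

# COMMAND ----------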
print('*' * 100)
print('Cross-validated model w/ grid search')
print('*' * 100)

print('*' * 60)
print('Cross-validated model - learning')
print('*' * 60)
print()

##### Cross validation - random forest

# instantiate random forest classifier
rf = RandomForestClassifier(featureSubsetStrategy='auto', impurity='gini')

# parameter grid
param_grid = ParamGridBuilder() \
    .addGrid(rf.numTrees, [10, 15, 20, 25, 30, 40, 50]) \
    .addGrid(rf.maxDepth, [2, 3, 4, 5, 6, 7, 8]) \
    .addGrid(rf.maxBins, [16, 32, 48]) \
    .build()

# create pipeline that includes preprocessing steps and model
stages = [assembler, minMaxScaler, rf]
pipeline = Pipeline(stages=stages)

# cross validator
cv = CrossValidator(estimator=pipeline,
                    estimatorParamMaps=param_grid,
                    evaluator=BinaryClassificationEvaluator(),
                    numFolds=8)

# fit model using training data
indexed = indexed.join(meta, "product_id")

# Split data into train and test data set
(training, test) = indexed. \
    select("user_id_index", "product_id_index", "score", "reviewed_at", "title"). \
    randomSplit([0.6, 0.4], seed=0)

# Train and evaluate with ALS
from pyspark.ml.tuning import TrainValidationSplit, ParamGridBuilder

als = ALS(maxIter=5, userCol="user_id_index", itemCol="product_id_index", ratingCol="score")
param_grid = ParamGridBuilder().addGrid(als.regParam, [0.01, 0.1, 1.0]).build()
evaluator = RegressionEvaluator(metricName="rmse", labelCol="score", predictionCol="prediction")
tvs = TrainValidationSplit(
    estimator=als,
    estimatorParamMaps=param_grid,
    evaluator=evaluator,
)
model = tvs.fit(training)
predictions = model.transform(test)
predictions = predictions.fillna(0, subset=['prediction'])
(10, "spark compile", 1.0), (11, "hadoop software", 0.0)], ["id", "text", "label"]) # Configure an ML pipeline, which consists of tree stages: tokenizer, hashingTF, and lr. tokenizer = Tokenizer(inputCol="text", outputCol="words") # HashingTF,使用hash的方法将序列元素转化成他们的频率 # hashingTF.transform(tokenizer.transform(training)).select('text','features').collect() 可以看到features中是hash值对上频率 hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features") lr = LogisticRegression(maxIter=10) pipeline = Pipeline(stages=[tokenizer, hashingTF, lr]) # 这里是一个取之范围,不是区间 # numFeatures 特征数量,应该和哈希值的范围决定有关 paramGrid = ParamGridBuilder() \ .addGrid(hashingTF.numFeatures, [10, 100, 1000]) \ .addGrid(lr.regParam, [0.1, 0.01]) \ .build() crossval = CrossValidator(estimator=pipeline, estimatorParamMaps=paramGrid, evaluator=BinaryClassificationEvaluator(), numFolds=2) # use 3+ folds in practice # Run cross-validation, and choose the best set of parameters. cvModel = crossval.fit(training) # Prepare test documents, which are unlabeled. test = spark.createDataFrame([(4, "spark i j k"), (5, "l m n"), (6, "mapreduce spark"), (7, "apache hadoop")], ["id", "text"])
print("Running Cross-Validation. Please wait.") start = time.time() pipeline = Pipeline( stages=[regexTokenizer, stopwordsRemover, countVectors, label_stringIdx]) pipelineFit = pipeline.fit(df) dataset = pipelineFit.transform(df) (trainingData, testData) = dataset.randomSplit([0.7, 0.3], seed=100) lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0) # Create ParamGrid for Cross Validation to test various parameters paramGrid = ( ParamGridBuilder().addGrid(lr.regParam, [0.1, 0.3, 0.5]) # regularization parameter .addGrid(lr.elasticNetParam, [0.0, 0.1, 0.2]) # Elastic Net Parameter (Ridge = 0) # .addGrid(model.maxIter, [10, 20, 50]) #Number of iterations # .addGrid(idf.numFeatures, [10, 100, 1000]) # Number of features .build()) # Create 5-fold CrossValidator cv = CrossValidator(estimator=lr, \ estimatorParamMaps=paramGrid, \ evaluator=evaluator, \ numFolds=5) cvModel = cv.fit(trainingData) predictions = cvModel.transform(testData) # Evaluate best model evaluator = MulticlassClassificationEvaluator(predictionCol="prediction") end = time.time()
lrPredictions = dtModel.transform(testingData)
lrPredictions.select("prediction", "label", "std_features").show(5)

evaluator = RegressionEvaluator(
    labelCol="label", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(lrPredictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

treeModel = dtModel.stages[2]
# summary only
print(treeModel)

paramGrid = ParamGridBuilder()\
    .addGrid(dt.maxDepth, [2, 3, 4, 5, 6, 7]) \
    .build()
crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=RegressionEvaluator(predictionCol='prediction',
                                                        labelCol='label',
                                                        metricName="r2"),
                          numFolds=3)

# Run cross-validation, and choose the best set of parameters.
cvModel = crossval.fit(trainingData)
print(cvModel.avgMetrics)
# print(cvModel.bestModel.stages[2].summary.r2)
for param in paramGrid:
    print(param)
##--------run on pyspark shell--------##

## load data into pyspark
trainData = spark.read.format('libsvm').load('trian_svm.txt')
cvData = spark.read.format('libsvm').load('cv_svm.txt')
testData = spark.read.format('libsvm').load('test_svm.txt')

## GBDT in pyspark
# design a GBDT model
gbdt = GBTClassifier(labelCol='label', featuresCol='features')

# build param grid
paramGrid = ParamGridBuilder()\
    .addGrid(gbdt.maxDepth, [6, 7, 8])\
    .addGrid(gbdt.minInstancesPerNode, [200, 500, 800])\
    .addGrid(gbdt.maxIter, [100, 120, 140])\
    .addGrid(gbdt.stepSize, [0.04, 0.08])\
    .addGrid(gbdt.subsamplingRate, [0.6, 0.8])\
    .build()

# build train-validation split
tvs = TrainValidationSplit(estimator=gbdt,
                           estimatorParamMaps=paramGrid,
                           evaluator=BinaryClassificationEvaluator(),
                           trainRatio=0.8)

# train the model
model_sp = tvs.fit(trainData)

# predict
train_pred = model_sp.transform(trainData).select('label', 'prediction')
cv_pred = model_sp.transform(cvData).select('label', 'prediction')
test_pred = model_sp.transform(testData).select('label', 'prediction')
plt.legend()
plt.show()

# ### Using ParamGrid for hyperparameter tuning
# The parameters we wish to tweak are:
# * maxIter
# * regParam
# * elasticNetParam - whether a lasso or ridge model will be best

# In[30]:

from pyspark.ml.tuning import ParamGridBuilder

paramGrid = ParamGridBuilder().addGrid(lr.maxIter, [10, 50, 100]).addGrid(
    lr.regParam, [0.1, 0.3, 1.0]).addGrid(lr.elasticNetParam, [0.0, 1.0]).build()

# #### Define the RegressionEvaluator used to evaluate the models
# We wish to minimize RMSE

# In[31]:

evaluator = RegressionEvaluator(labelCol='price',
                                predictionCol='prediction',
                                metricName='rmse')

# ### Define the CrossValidator
# This is used to put all the pieces together
# * <b>estimator: </b>Can be a standalone estimator or a pipeline with an estimator at the end. We use our pipeline
# * <b>estimatorParamMaps: </b>We add our paramGrid in order to build models with different combinations of the parameters
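# A minimal sketch of the CrossValidator described above, combining the paramGrid
# and evaluator just defined; the `pipeline` and `training` names are assumptions
# standing in for objects defined earlier in this notebook.
from pyspark.ml.tuning import CrossValidator

cv = CrossValidator(estimator=pipeline,
                    estimatorParamMaps=paramGrid,
                    evaluator=evaluator,
                    numFolds=3)
cvModel = cv.fit(training)  # assumes a `training` DataFrame with a 'price' label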
def main():
    spark = (SparkSession
             .builder
             .appName("PowerPlant")
             .getOrCreate())
    powerPlantDF = spark.read.csv("../data/CCPP/sheet*.csv", header=True, inferSchema=True)
    vectorizer = VectorAssembler(inputCols=["AT", "V", "AP", "RH"], outputCol="features")
    split20DF, split80DF = powerPlantDF.randomSplit([0.20, 0.80], seed=100)
    testSetDF = split20DF.cache()
    trainingSetDF = split80DF.cache()

    lr = LinearRegression(predictionCol="Predicted_PE",
                          labelCol="PE",
                          regParam=0.1,
                          maxIter=100)
    lrPipeline = Pipeline(stages=[vectorizer, lr])
    lrModel = lrPipeline.fit(trainingSetDF)
    intercept = lrModel.stages[1].intercept
    weights = lrModel.stages[1].coefficients
    print("The y intercept: {}".format(intercept))
    print("The coefficients: {}".format(weights))
    print("Columns: {}".format(trainingSetDF.columns))

    predictionsAndLabelsDF = lrModel.transform(testSetDF).select("AT", "V", "AP", "RH", "PE", "Predicted_PE")
    regEval = RegressionEvaluator(predictionCol="Predicted_PE", labelCol="PE", metricName="rmse")
    rmse = regEval.evaluate(predictionsAndLabelsDF)
    print("Root Mean Squared Error: %.2f" % rmse)
    r2 = regEval.evaluate(predictionsAndLabelsDF, {regEval.metricName: "r2"})
    print("r2: {0:.2f}".format(r2))

    print("========== LR Cross Validation ==========")
    crossval = CrossValidator(estimator=lrPipeline, evaluator=regEval, numFolds=3)
    regParam = [x / 100.0 for x in range(1, 11)]
    paramGrid = (ParamGridBuilder()
                 .addGrid(lr.regParam, regParam)
                 .addGrid(lr.maxIter, [50, 100, 150])
                 .addGrid(lr.elasticNetParam, [0, 1])
                 .build())
    crossval.setEstimatorParamMaps(paramGrid)
    cvModel = crossval.fit(trainingSetDF).bestModel
    predictionsAndLabelsDF = cvModel.transform(testSetDF).select("AT", "V", "AP", "RH", "PE", "Predicted_PE")
    rmseNew = regEval.evaluate(predictionsAndLabelsDF)
    r2New = regEval.evaluate(predictionsAndLabelsDF, {regEval.metricName: "r2"})
    print("Old RMSE: {0:.2f}".format(rmse))
    print("New RMSE: {0:.2f}".format(rmseNew))
    print("Old r2: {0:.2f}".format(r2))
    print("New r2: {0:.2f}".format(r2New))
    print("Best RegParam: {0}".format(cvModel.stages[-1]._java_obj.parent().getRegParam()))
    print("Best maxIter: {0}".format(cvModel.stages[-1]._java_obj.parent().getMaxIter()))
    print("Best elasticNetParam: {0}".format(cvModel.stages[-1]._java_obj.parent().getElasticNetParam()))

    print("======== Random Forest =========")
    rf = (RandomForestRegressor()
          .setLabelCol("PE")
          .setPredictionCol("Predicted_PE")
          .setFeaturesCol("features")
          .setSeed(100)
          .setMaxDepth(8)
          .setNumTrees(30))
    rfPipeline = (Pipeline()
                  .setStages([vectorizer, rf]))
    crossval.setEstimator(rfPipeline)
    paramGrid = (ParamGridBuilder()
                 .addGrid(rf.maxBins, [50, 100])
                 .addGrid(rf.maxDepth, [4, 8, 12])
                 .addGrid(rf.numTrees, [20, 30, 40])
                 .build())
    crossval.setEstimatorParamMaps(paramGrid)
    rfModel = crossval.fit(trainingSetDF).bestModel
    predictionsAndLabelsDF = (rfModel
                              .transform(testSetDF)
                              .select("AT", "V", "AP", "RH", "PE", "Predicted_PE"))
    rmseRF = regEval.evaluate(predictionsAndLabelsDF)
    r2RF = regEval.evaluate(predictionsAndLabelsDF, {regEval.metricName: "r2"})
    print("LR RMSE: {0:.2f}".format(rmseNew))
    print("RF RMSE: {0:.2f}".format(rmseRF))
    print("LR R2: {0:.2f}".format(r2New))
    print("RF R2: {0:.2f}".format(r2RF))
    print("The maxDepth is: {}".format(rfModel.stages[-1]._java_obj.parent().getMaxDepth()))
    print("The numTrees is: {}".format(rfModel.stages[-1]._java_obj.parent().getNumTrees()))
    print("The maxBins is: {}".format(rfModel.stages[-1]._java_obj.parent().getMaxBins()))
    spark.stop()
from pyspark.sql import SparkSession
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.sql.functions import col, expr
from pyspark.mllib.evaluation import RankingMetrics

spark = SparkSession.builder.appName('Recommendation_system').getOrCreate()

df_training = spark.read.parquet('hdfs:/user/tb2517/pub/goodreads/training_sample_10p.parquet')
df_validation = spark.read.parquet('hdfs:/user/tb2517/pub/goodreads/validation_sample_10p.parquet')
df_test = spark.read.parquet('hdfs:/user/tb2517/pub/goodreads/testing_sample_10p.parquet')

als = ALS(userCol="user_id", itemCol="book_id", ratingCol="rating",
          coldStartStrategy="drop", nonnegative=True)
param_grid = ParamGridBuilder() \
    .addGrid(als.rank, [15, 25, 35]) \
    .addGrid(als.maxIter, [5, 8, 10]) \
    .addGrid(als.regParam, [0.08, 0.09, 0.10]) \
    .build()
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
cv = CrossValidator(estimator=als, estimatorParamMaps=param_grid,
                    evaluator=evaluator, numFolds=3)
model = cv.fit(df_training)
best_model = model.bestModel

print("Tuned Hyperparameters: -------------")
print("Rank: ", best_model._java_obj.parent().getRank())
print("MaxIter: ", best_model._java_obj.parent().getMaxIter())
print("RegParam: ", best_model._java_obj.parent().getRegParam())

print("Recommendations: ------------------------------")
user_recs = best_model.recommendForAllUsers(500)
print(user_recs.count())
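# RankingMetrics is imported above but never used; a minimal sketch (an
# assumption, not the original script) of scoring the top-500 lists against
# the validation interactions.
from pyspark.sql.functions import collect_list

true_items = (df_validation.groupBy("user_id")
              .agg(collect_list("book_id").alias("truth")))
pairs = (user_recs.join(true_items, "user_id").rdd
         .map(lambda row: ([r.book_id for r in row.recommendations], row.truth)))
metrics = RankingMetrics(pairs)
print("MAP: ", metrics.meanAveragePrecision)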
spark.conf.set("spark.sql.execution.arrow.enabled", "true")
# Create a Spark DataFrame from a pandas DataFrame using Arrow
ratings = spark.createDataFrame(dev)
(training, test) = ratings.randomSplit([0.8, 0.2])

# ALS
als = ALS(userCol="userId", itemCol="movieId", ratingCol="rating",
          coldStartStrategy="drop")
model = als.fit(training)

# Grid search
paramGrid = ParamGridBuilder()\
    .addGrid(als.rank, [4, 8, 12]) \
    .addGrid(als.regParam, [0.1, 1, 10])\
    .addGrid(als.maxIter, [5, 10, 15])\
    .addGrid(als.alpha, [1, 2, 3])\
    .build()

# Evaluate the model by computing the RMSE on the test data
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                predictionCol="prediction")
predictions = model.transform(test)
rmse = evaluator.evaluate(predictions)

# Tune hyperparameters; the evaluator object (not the rmse value) is passed in,
# and tvs.fit(training) must be called to actually run the search
tvs = TrainValidationSplit(estimator=als,
                           estimatorParamMaps=paramGrid,
                           evaluator=evaluator,
                           trainRatio=0.8)
def _build_param_grid(self):
    param_grid_builder = ParamGridBuilder()
    param_grid_builder.addGrid(self.hashing_tf.numFeatures, self.hashing_tf_map)
    param_grid_builder.addGrid(self.lr.regParam, self.lr_map)
    return param_grid_builder.build()
training_df.head(5)

from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.ml import Pipeline
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder
from pyspark.ml.tuning import CrossValidator

tokenizer = Tokenizer(inputCol="text", outputCol="words")
hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
idf = IDF(minDocFreq=3, inputCol="features", outputCol="idf")
# point NB at the IDF-weighted column; otherwise the IDF stage output is unused
nb = NaiveBayes(featuresCol="idf")
pipeline = Pipeline(stages=[tokenizer, hashingTF, idf, nb])

paramGrid = ParamGridBuilder().addGrid(nb.smoothing, [0.0, 1.0]).build()
cv = CrossValidator(estimator=pipeline, estimatorParamMaps=paramGrid,
                    evaluator=MulticlassClassificationEvaluator(), numFolds=4)
cvModel = cv.fit(training_df)

result = cvModel.transform(test_df)
prediction_df = result.select("text", "label", "prediction")

datasci_df = prediction_df.filter(prediction_df['label'] == 0.0)
datasci_df.show(truncate=False)

ao_df = prediction_df.filter(prediction_df['label'] == 1.0)
def _build_param_grid(self):
    param_grid_builder = ParamGridBuilder()
    param_grid_builder.addGrid(self.lr.regParam, self.lr_map)
    return param_grid_builder.build()
# split the data into training and test sets
(training, test) = df.randomSplit(weights=[0.8, 0.2])

# train model
model = pipeline.fit(training)

# prediction with training data
prediction_training = model.transform(training)

# prediction with test data
prediction_test = model.transform(test)

## cross validation
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

paramGrid = ParamGridBuilder().\
    addGrid(dTree.minInfoGain, [0, 1, 2]).\
    addGrid(dTree.maxDepth, [2, 5, 10]).\
    build()

# evaluator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(labelCol='indexedLabel',
                                              predictionCol='prediction')

# 5-fold cross validation
cv = CrossValidator(estimator=pipeline,
                    estimatorParamMaps=paramGrid,
                    evaluator=evaluator,
                    numFolds=5)

# train model through cross validation
dTree_cv_model = cv.fit(training)
print('Average RMSE for {0} windows: {1}'.format(num_windows, np.mean(total_RMSE)))

feature_columns = ['previous_hour_price', 'previous_hour_high_low_range', 'previous_hour_volume']

sliding_window_evaluation(dataframe=test, feature_columns=feature_columns, num_windows=3, test_size=0.2)

#################################################################################################################
##### Hyperparameter Tuning

# Grid Search - Spark ML
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Defining our parameter grid
# (unlike scikit-learn, Spark's maxDepth must be an integer no greater than 30,
# so a fixed depth stands in for the "unlimited" setting)
paramGrid = (ParamGridBuilder()
             .addGrid(randomForest.numTrees, [10, 30, 100, 300])
             .addGrid(randomForest.maxDepth, [3, 30])
             .build())

# Cross validation with the parameter grid
crossval = CrossValidator(estimator=randomForest,
                          estimatorParamMaps=paramGrid,
                          evaluator=MulticlassClassificationEvaluator(),
                          numFolds=3)

# Reporting the number of nodes on the cluster
print('Number of nodes on the cluster:', sc._jsc.sc().getExecutorMemoryStatus().size())

# Performing the grid search
cvModel = crossval.fit(trainingDataset)
# Split dataset into "train" and "test" sets
(train, test) = df.randomSplit([trainSplit, testSplit], 42)

# Setup evaluator -- the default metric is F1 score
classEvaluator = MulticlassClassificationEvaluator(metricName="accuracy")

with mlflow.start_run():
    # Gradient-boosted tree regression
    gbt = GBTRegressor(maxIter=maxIter)

    # Setup pipeline
    pipeline = Pipeline(stages=[gbt])

    # Setup hyperparams grid
    paramGrid = ParamGridBuilder().build()

    # Setup model evaluators
    # Note: by default, RMSE reports how many units off we are in the same scale as the target
    rmseevaluator = RegressionEvaluator()
    r2evaluator = RegressionEvaluator(metricName="r2")  # select R2 as our main scoring metric

    # Setup cross validator
    cv = CrossValidator(estimator=pipeline, estimatorParamMaps=paramGrid,
                        evaluator=r2evaluator, numFolds=numberOfCVFolds)

    # Fit model on "train" set
    cvModel = cv.fit(train)

    # Get the best model based on CrossValidator
    model = cvModel.bestModel

    # Run inference on "test" set
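    # A minimal sketch (an assumption, not the original notebook's code) of
    # finishing the run: score the held-out set and log results to MLflow.
    predictions = model.transform(test)
    mlflow.log_param("maxIter", maxIter)
    mlflow.log_param("numberOfCVFolds", numberOfCVFolds)
    mlflow.log_metric("rmse", rmseevaluator.evaluate(predictions))
    mlflow.log_metric("r2", r2evaluator.evaluate(predictions))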
features = ["reviewed", "vehicle_year", "vehicle_color_encoded", "CloudCover"] assembler = VectorAssembler(inputCols=features, outputCol="features") # Specify the estimator (i.e., classification algorithm): from pyspark.ml.classification import RandomForestClassifier classifier = RandomForestClassifier(featuresCol="features", labelCol="star_rating") print(classifier.explainParams()) # Specify the hyperparameter grid: from pyspark.ml.tuning import ParamGridBuilder maxDepthList = [5, 10, 20] numTreesList = [20, 50, 100] subsamplingRateList = [0.5, 1.0] paramGrid = ParamGridBuilder() \ .addGrid(classifier.maxDepth, maxDepthList) \ .addGrid(classifier.numTrees, numTreesList) \ .addGrid(classifier.subsamplingRate, subsamplingRateList) \ .build() # Specify the evaluator: from pyspark.ml.evaluation import MulticlassClassificationEvaluator evaluator = MulticlassClassificationEvaluator(labelCol="star_rating", metricName="accuracy") # **Note:** We are treating `star_rating` as a multiclass label. # Specify the validator: from pyspark.ml.tuning import TrainValidationSplit validator = TrainValidationSplit(estimator=classifier, estimatorParamMaps=paramGrid, evaluator=evaluator) # ## Specify the pipeline
# Split the data into training and test sets (30% held out for testing)
(training_data, test_data) = assembled_df.randomSplit([0.7, 0.3], seed=1234)

num_folds = 5
evaluator = MulticlassClassificationEvaluator(labelCol="success",
                                              predictionCol="prediction",
                                              metricName="accuracy")

# Train a RandomForest model.
rf = RandomForestClassifier(labelCol="success", featuresCol="features", numTrees=500)

paramGrid = (ParamGridBuilder()
             .addGrid(param=rf.numTrees, values=[100, 300, 500])
             .build())

crossval = CrossValidator(
    estimator=rf,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=num_folds,
    seed=1234,
)
model = crossval.fit(training_data)

predictions_train = model.transform(training_data)
predictions_test = model.transform(test_data)
predictions_train.select("movie_name", "imdb_id", "prediction",
evaluator = BinaryClassificationEvaluator()
evaluator.evaluate(predictions)

# 12.4
gbt.explainParams()

############# GG. Gradient Boosting Cross-validation ##################

# 12.5 Cross validation using parameter grid
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# 12.6
paramGrid = (ParamGridBuilder()
             .addGrid(gbt.maxDepth, [2, 4, 6])
             .addGrid(gbt.maxBins, [20, 60])
             .addGrid(gbt.maxIter, [10, 20])
             .build())

# 12.7
cv = CrossValidator(estimator=gbt,
                    estimatorParamMaps=paramGrid,
                    evaluator=evaluator,
                    numFolds=5)

# 12.8 Run cross validations.
#      Takes about 6 minutes as it is training over 20 trees!
cvModel = cv.fit(train)
predictions = cvModel.transform(test)
evaluator.evaluate(predictions)
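# A follow-on sketch (not part of the original cell): inspect which grid point
# won, using the same parent-params pattern seen in other examples above.
best_gbt = cvModel.bestModel
print("Best maxDepth:", best_gbt._java_obj.parent().getMaxDepth())
print("Best maxBins:", best_gbt._java_obj.parent().getMaxBins())
print("Best maxIter:", best_gbt._java_obj.parent().getMaxIter())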
    .appName('ImageFeatureSelector') \
    .config('spark.executor.memory', '2G') \
    .config('spark.executor.cores', '2') \
    .config('spark.driver.memory', '3G') \
    .config('spark.driver.cores', '1') \
    .getOrCreate()

train_df = spark.createDataFrame(load_train_data(imagenet_path))

pre_trained_model = InceptionV3(weights="imagenet")
pre_trained_model.save('/tmp/model-full.h5')

estimator = KerasImageFileEstimator(inputCol="uri",
                                    outputCol="prediction",
                                    labelCol="one_hot_label",
                                    imageLoader=load_image_from_uri,
                                    kerasOptimizer='adam',
                                    kerasLoss='categorical_crossentropy',
                                    modelFile='/tmp/model-full.h5')  # local path of the model saved above

param_grid = (ParamGridBuilder()
              .addGrid(estimator.kerasFitParams, [{"batch_size": 32, "verbose": 0},
                                                  {"batch_size": 64, "verbose": 0}])
              .build())
binary_evaluator = BinaryClassificationEvaluator(rawPredictionCol="prediction",
                                                 labelCol="label")
cv = CrossValidator(estimator=estimator, estimatorParamMaps=param_grid,
                    evaluator=binary_evaluator, numFolds=2)

cv_model = cv.fit(train_df)
print(cv_model)
rForm = RFormula()
lr = LogisticRegression().setLabelCol("label").setFeaturesCol("features")

# COMMAND ----------

from pyspark.ml import Pipeline
stages = [rForm, lr]
pipeline = Pipeline().setStages(stages)

# COMMAND ----------

from pyspark.ml.tuning import ParamGridBuilder
params = ParamGridBuilder()\
    .addGrid(rForm.formula, [
        "lab ~ . + color:value1",
        "lab ~ . + color:value1 + color:value2"])\
    .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])\
    .addGrid(lr.regParam, [0.1, 2.0])\
    .build()

# COMMAND ----------

from pyspark.ml.evaluation import BinaryClassificationEvaluator
evaluator = BinaryClassificationEvaluator()\
    .setMetricName("areaUnderROC")\
    .setRawPredictionCol("prediction")\
    .setLabelCol("label")

# COMMAND ----------

from pyspark.ml.tuning import TrainValidationSplit
def feature_selector(df: DataFrame, ranked_features: list, output_column: str,
                     estimator_obj=RandomForestRegressor(),
                     feature_inclusion_increments: int = 1,
                     train_test_split_ratio: list = None, cv: int = -1,
                     evaluation_metric: str = 'r2'):
    """
    Trains the estimator at multiple steps, with features progressively added to the
    input list based on their ranks.

    :param df: the input dataset with features and output as columns
    :param ranked_features: the output of the feature ranking algorithm or a manually
        selected ranking scheme
    :param output_column: the name of the output column in the dataset
    :param estimator_obj: the training model object (an estimator instance, not a class)
    :param train_test_split_ratio: defaults to [0.66, 0.33]
    :param cv: if left at the default (cv = -1), changes nothing; if set to a value > 1,
        it enforces cross validation and overrides the train-test splitting
    :param feature_inclusion_increments: how many features to add at each step
    :param evaluation_metric: evaluation metric to return for predictions on the test set
        - "rmse": root mean squared error
        - "mse": mean squared error
        - "r2" (default): coefficient of determination
        - "mae": mean absolute error
    """
    if train_test_split_ratio is None:
        train_test_split_ratio = [0.66, 0.33]
    feature_count_list = list(
        range(1, len(ranked_features), feature_inclusion_increments)) + [len(ranked_features)]

    # pick feature/prediction column names that do not collide with existing columns
    estimator_features_col = 'features'
    while estimator_features_col in df.columns:
        estimator_features_col += '_'
    estimator_prediction_col = 'prediction'
    while estimator_prediction_col in df.columns:
        estimator_prediction_col += '_'

    estimator_obj.setFeaturesCol(estimator_features_col)
    estimator_obj.setPredictionCol(estimator_prediction_col)
    estimator_obj.setLabelCol(output_column)
    evaluator = RegressionEvaluator(labelCol=output_column,
                                    predictionCol=estimator_prediction_col,
                                    metricName=evaluation_metric)
    scores = []
    if cv <= 1:
        df_train, df_test = df.randomSplit(train_test_split_ratio)
        for feature_count in feature_count_list:
            input_features = ranked_features[0:feature_count]
            assembler = VectorAssembler(inputCols=input_features,
                                        outputCol=estimator_features_col)
            df_train = assembler.transform(df_train)
            fit_model = estimator_obj.fit(df_train)
            df_test = assembler.transform(df_test)
            df_test = fit_model.transform(df_test)
            score = evaluator.evaluate(df_test)
            scores.append((feature_count, score))
            df_train = df_train.drop(estimator_features_col)
            df_test = df_test.drop(estimator_features_col, estimator_prediction_col)
    else:
        for feature_count in feature_count_list:
            input_features = ranked_features[0:feature_count]
            assembler = VectorAssembler(inputCols=input_features,
                                        outputCol=estimator_features_col)
            df = assembler.transform(df)
            # a single-point grid so CrossValidator can run without a real search
            grid = ParamGridBuilder().addGrid(
                estimator_obj.featuresCol, [estimator_obj.getFeaturesCol()]).build()
            crossval = CrossValidator(estimator=estimator_obj,
                                      evaluator=evaluator,
                                      numFolds=cv,
                                      estimatorParamMaps=grid)
            fit_crossval = crossval.fit(df)
            scores.append((feature_count, fit_crossval.avgMetrics[0]))
            df = df.drop(estimator_features_col)
    return scores
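# A hypothetical usage sketch for feature_selector; the DataFrame `my_df` and
# all column names below are assumptions, not from the original source.
ranked = ['age', 'income', 'tenure', 'num_purchases']
scores = feature_selector(df=my_df, ranked_features=ranked, output_column='spend',
                          feature_inclusion_increments=2, cv=3)
for n_features, score in scores:
    print('features: {}, r2: {:.3f}'.format(n_features, score))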
corr = meas.select(sensorNameArray).toPandas().corr()
mask = np.zeros_like(corr, dtype=bool)  # np.bool is deprecated; use the builtin
mask[np.triu_indices_from(mask)] = True
cmap = sb.diverging_palette(220, 10, as_cmap=True)
sb.heatmap(corr, mask=mask, xticklabels=sensorNameArray, yticklabels=sensorNameArray,
           square=True, linewidths=.5, cbar_kws={"shrink": .5})

# #### Model Tuning
# Spark has advanced model tuning capabilities as well. Let's improve our Random Forest
# Classifier using the ML tuning API.
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit

# ParamGrids are grids of model tuning parameter values
paramGrid = ParamGridBuilder()\
    .addGrid(rf.maxDepth, [5, 10, 15])\
    .addGrid(rf.numTrees, [20, 25, 30])\
    .build()

# A TrainValidationSplit is used for hyper-parameter tuning. It takes a model estimator,
# parameter grid, and evaluator as input and runs the model multiple times to identify
# the most optimal model parameters.
tvs = TrainValidationSplit(estimator=rf,
                           estimatorParamMaps=paramGrid,
                           evaluator=MulticlassClassificationEvaluator(),
                           trainRatio=0.8)

(trainingData, testData) = li.transform(va).randomSplit([0.7, 0.3])

# Run TrainValidationSplit, and choose the best set of parameters.
model = tvs.fit(trainingData)
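# A minimal follow-up sketch (an assumption, not in the original notebook):
# score the tuned model on the held-out split with the same evaluator type.
predictions = model.transform(testData)
acc_eval = MulticlassClassificationEvaluator(metricName="accuracy")
print("Test accuracy: {:.3f}".format(acc_eval.evaluate(predictions)))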
# COMMAND ----------

# MAGIC %md Third, we wrap the model training stage within a `CrossValidator` stage. `CrossValidator` knows how to call the GBT algorithm with different hyperparameter settings. It will train multiple models and choose the best one, based on minimizing some metric. In this example, our metric is [Root Mean Squared Error (RMSE)](https://en.wikipedia.org/wiki/Root-mean-square_deviation).
# MAGIC
# MAGIC ![Image of CV](http://training.databricks.com/databricks_guide/4-cv.png)

# COMMAND ----------

from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import RegressionEvaluator

# Define a grid of hyperparameters to test:
#  - maxDepth: max depth of each decision tree in the GBT ensemble
#  - maxIter: iterations, i.e., number of trees in each GBT ensemble
# In this example notebook, we keep these values small. In practice, to get the
# highest accuracy, you would likely want to try deeper trees (10 or higher) and
# more trees in the ensemble (>100).
paramGrid = ParamGridBuilder()\
    .addGrid(gbt.maxDepth, [2, 5])\
    .addGrid(gbt.maxIter, [10, 100])\
    .build()

# Define an evaluation metric. This tells CrossValidator how well we are doing by
# comparing the true labels with predictions.
evaluator = RegressionEvaluator(metricName="rmse",
                                labelCol=gbt.getLabelCol(),
                                predictionCol=gbt.getPredictionCol())

# Declare the CrossValidator, which runs model tuning for us.
cv = CrossValidator(estimator=gbt, evaluator=evaluator, estimatorParamMaps=paramGrid)

# COMMAND ----------

# MAGIC %md Finally, we can tie our feature processing and model training stages together into a single `Pipeline`.
# MAGIC
# MAGIC ![Image of Pipeline](http://training.databricks.com/databricks_guide/5-pipeline.png)
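# COMMAND ----------

# A minimal sketch of the Pipeline described above (not an original cell);
# `featureProcessor` is an assumed name standing in for the notebook's earlier
# feature-processing stages, and `train` for its training DataFrame.
from pyspark.ml import Pipeline
pipeline = Pipeline(stages=[featureProcessor, cv])
pipelineModel = pipeline.fit(train)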
def _get_param_grid(self):
    return ParamGridBuilder() \
        .addGrid(self.classifier.getClassifier().regParam, [0.1, 0.2, 0.4, 0.6, 0.8, 1]) \
        .build()
# metrics1 = BinaryClassificationMetrics(PredictionandLabels)
# (train score/train accuracy --- )
# (train error = 1 - train score?)

metrics2 = MulticlassMetrics(PredictionandLabels)
print(metrics2.accuracy)
# Note: areaUnderPR is only available on BinaryClassificationMetrics,
# not MulticlassMetrics, so it is omitted here.
print(metrics2.confusionMatrix())

# ----------------------------------------------------------------------------
# CV / Parameter Tuning approach ---------------------------------------------
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator

paramGrid = ParamGridBuilder().addGrid(dt1.impurity, ['entropy', 'gini']).addGrid(
    dt1.maxDepth, [2, 3, 4, 5, 6]).build()
evaluator1 = MulticlassClassificationEvaluator(predictionCol='prediction',
                                               labelCol='Survived',
                                               metricName='accuracy')
crossVal4 = CrossValidator(estimator=dt1,
                           estimatorParamMaps=paramGrid,
                           evaluator=evaluator1,
                           numFolds=10)
model23 = crossVal4.fit(df3)
print(model23.avgMetrics)

# --------------------------------------------------------
def _get_param_grid(self):
    return ParamGridBuilder() \
        .addGrid(self.classifier.maxDepth, [5, 7, 11, 13]) \
        .addGrid(self.classifier.numTrees, [5, 7, 11, 13, 17]) \
        .build()
print("MLflow:") print(" run_id:",run_id) print(" experiment_id:",experiment_id) # Log MLflow parameters print("Parameters:") print(" maxDepthParams:",maxDepthParams) print(" maxBinsParams:",maxBinsParams) # Create pipeline dt = DecisionTreeRegressor(labelCol=colLabel, featuresCol=colFeatures) assembler = VectorAssembler(inputCols=data.columns[:-1], outputCol=colFeatures) pipeline = Pipeline(stages=[assembler, dt]) paramGrid = ParamGridBuilder() \ .addGrid(dt.maxDepth, maxDepthParams) \ .addGrid(dt.maxBins, maxBinsParams) \ .build() evaluator = RegressionEvaluator( labelCol=colLabel, predictionCol=colPrediction, metricName=metricName) crossval = CrossValidator(estimator=pipeline, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=numFolds) # Train model. This also runs the indexers. cvModel = crossval.fit(trainingData) model = cvModel.bestModel # Make predictions.
xs = crashes.flatMap(lambda x: x.split('\n')) \
    .map(json.loads) \
    .map(group_crashes) \
    .map(improve_times)

# convert to DF
df = spark.createDataFrame(xs)

feature_labels = df.columns
feature_labels.pop(feature_labels.index('Number of Vehicles Involved'))
df = reduce(string_to_index, feature_labels, df)

indexes = ["i-" + f for f in feature_labels]
df = VectorAssembler(inputCols=indexes, outputCol="features").transform(df)
df = StringIndexer(inputCol='Number of Vehicles Involved',
                   outputCol='label').fit(df).transform(df)

grid = ParamGridBuilder().addGrid(nb.smoothing, [1.0, 1.5]) \
    .build()
cv = CrossValidator(estimator=nb, estimatorParamMaps=grid,
                    evaluator=mce, numFolds=5, parallelism=4)
cv_model = cv.fit(df)
transformed = cv_model.transform(df)
f1 = mce.evaluate(transformed)
print("NB F1: {:0.4f}".format(f1))

cv_model.bestModel.save(sys.argv[2])
                            outputCol="features")

dtc = DecisionTreeClassifier(featuresCol="features", labelCol="Survived")

# COMMAND ----------

# MAGIC %md-sandbox
# MAGIC `ParamGridBuilder()` allows us to string together all of the different possible hyperparameters we would like to test. In this case, we test the maximum depth of the tree and the maximum number of bins used to discretize continuous features.
# MAGIC
# MAGIC <img alt="Caution" title="Caution" style="vertical-align: text-bottom; position: relative; height:1.3em; top:0.0em" src="https://files.training.databricks.com/static/images/icon-warning.svg"/> Since grid search works by exhaustively building a model for each combination of parameters, it quickly becomes a lot of different unique combinations of parameters.

# COMMAND ----------

from pyspark.ml.tuning import ParamGridBuilder

paramGrid = (ParamGridBuilder()
             .addGrid(dtc.maxDepth, [2, 3, 4, 5, 6])
             .addGrid(dtc.maxBins, [16, 32, 48, 64])
             .build())

# COMMAND ----------

# MAGIC %md-sandbox
# MAGIC ### Cross-Validation
# MAGIC
# MAGIC There are a number of different ways of conducting cross-validation, allowing us to trade off between computational expense and model performance. An exhaustive approach to cross-validation would include every possible split of the training set. More commonly, _k_-fold cross-validation is used, where the training dataset is divided into _k_ smaller sets, or folds. A model is then trained on _k_-1 folds of the training data and the last fold is used to evaluate its performance.
# MAGIC
# MAGIC <img alt="Side Note" title="Side Note" style="vertical-align: text-bottom; position: relative; height:1.75em; top:0.05em; transform:rotate(15deg)" src="https://files.training.databricks.com/static/images/icon-note.webp"/> See <a href="https://en.wikipedia.org/wiki/Cross-validation_(statistics)" target="_blank">the Wikipedia article on Cross-Validation</a> for more information.

# COMMAND ----------

# MAGIC %md
# MAGIC Create a `MulticlassClassificationEvaluator()` to evaluate our grid search experiments and a `CrossValidator()` to build our models.
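# COMMAND ----------

# A minimal sketch of the step the cell above asks for; the metric choice and
# fold count are assumptions consistent with the `dtc` estimator defined earlier.
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import CrossValidator

evaluator = MulticlassClassificationEvaluator(labelCol="Survived", metricName="accuracy")
cv = CrossValidator(estimator=dtc,
                    estimatorParamMaps=paramGrid,
                    evaluator=evaluator,
                    numFolds=3)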