def test_fit_maximize_metric(self):
    dataset = self.spark.createDataFrame([
        (10, 10.0),
        (50, 50.0),
        (100, 100.0),
        (500, 500.0)] * 10,
        ["feature", "label"])

    iee = InducedErrorEstimator()
    evaluator = RegressionEvaluator(metricName="r2")

    grid = ParamGridBuilder() \
        .addGrid(iee.inducedError, [100.0, 0.0, 10000.0]) \
        .build()
    tvs = TrainValidationSplit(estimator=iee, estimatorParamMaps=grid, evaluator=evaluator)
    tvsModel = tvs.fit(dataset)
    bestModel = tvsModel.bestModel
    bestModelMetric = evaluator.evaluate(bestModel.transform(dataset))
    validationMetrics = tvsModel.validationMetrics

    self.assertEqual(0.0, bestModel.getOrDefault('inducedError'),
                     "Best model should have zero induced error")
    self.assertEqual(1.0, bestModelMetric, "Best model has R-squared of 1")
    self.assertEqual(len(grid), len(validationMetrics),
                     "validationMetrics has the same size of grid parameter")
    self.assertEqual(1.0, max(validationMetrics))
def test_save_load_simple_estimator(self):
    # This tests saving and loading the trained model only.
    # Save/load for TrainValidationSplit will be added later: SPARK-13786
    temp_path = tempfile.mkdtemp()
    dataset = self.spark.createDataFrame(
        [(Vectors.dense([0.0]), 0.0),
         (Vectors.dense([0.4]), 1.0),
         (Vectors.dense([0.5]), 0.0),
         (Vectors.dense([0.6]), 1.0),
         (Vectors.dense([1.0]), 1.0)] * 10,
        ["features", "label"])
    lr = LogisticRegression()
    grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()
    evaluator = BinaryClassificationEvaluator()
    tvs = TrainValidationSplit(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator)
    tvsModel = tvs.fit(dataset)

    tvsPath = temp_path + "/tvs"
    tvs.save(tvsPath)
    loadedTvs = TrainValidationSplit.load(tvsPath)
    self.assertEqual(loadedTvs.getEstimator().uid, tvs.getEstimator().uid)
    self.assertEqual(loadedTvs.getEvaluator().uid, tvs.getEvaluator().uid)
    self.assertEqual(loadedTvs.getEstimatorParamMaps(), tvs.getEstimatorParamMaps())

    tvsModelPath = temp_path + "/tvsModel"
    tvsModel.save(tvsModelPath)
    loadedModel = TrainValidationSplitModel.load(tvsModelPath)
    self.assertEqual(loadedModel.bestModel.uid, tvsModel.bestModel.uid)
def test_expose_sub_models(self):
    temp_path = tempfile.mkdtemp()
    dataset = self.spark.createDataFrame(
        [(Vectors.dense([0.0]), 0.0),
         (Vectors.dense([0.4]), 1.0),
         (Vectors.dense([0.5]), 0.0),
         (Vectors.dense([0.6]), 1.0),
         (Vectors.dense([1.0]), 1.0)] * 10,
        ["features", "label"])
    lr = LogisticRegression()
    grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()
    evaluator = BinaryClassificationEvaluator()
    tvs = TrainValidationSplit(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator,
                               collectSubModels=True)
    tvsModel = tvs.fit(dataset)
    self.assertEqual(len(tvsModel.subModels), len(grid))

    # Test the default value for option "persistSubModel" to be "true"
    testSubPath = temp_path + "/testTrainValidationSplitSubModels"
    savingPathWithSubModels = testSubPath + "cvModel3"
    tvsModel.save(savingPathWithSubModels)
    tvsModel3 = TrainValidationSplitModel.load(savingPathWithSubModels)
    self.assertEqual(len(tvsModel3.subModels), len(grid))
    tvsModel4 = tvsModel3.copy()
    self.assertEqual(len(tvsModel4.subModels), len(grid))

    savingPathWithoutSubModels = testSubPath + "cvModel2"
    tvsModel.write().option("persistSubModels", "false").save(savingPathWithoutSubModels)
    tvsModel2 = TrainValidationSplitModel.load(savingPathWithoutSubModels)
    self.assertEqual(tvsModel2.subModels, None)

    for i in range(len(grid)):
        self.assertEqual(tvsModel.subModels[i].uid, tvsModel3.subModels[i].uid)
def test_copy(self):
    dataset = self.spark.createDataFrame([
        (10, 10.0),
        (50, 50.0),
        (100, 100.0),
        (500, 500.0)] * 10,
        ["feature", "label"])

    iee = InducedErrorEstimator()
    evaluator = RegressionEvaluator(metricName="r2")
    grid = ParamGridBuilder() \
        .addGrid(iee.inducedError, [100.0, 0.0, 10000.0]) \
        .build()
    tvs = TrainValidationSplit(estimator=iee, estimatorParamMaps=grid, evaluator=evaluator)
    tvsModel = tvs.fit(dataset)
    tvsCopied = tvs.copy()
    tvsModelCopied = tvsModel.copy()

    self.assertEqual(tvs.getEstimator().uid, tvsCopied.getEstimator().uid,
                     "Copied TrainValidationSplit has the same uid of Estimator")
    self.assertEqual(tvsModel.bestModel.uid, tvsModelCopied.bestModel.uid)
    self.assertEqual(len(tvsModel.validationMetrics),
                     len(tvsModelCopied.validationMetrics),
                     "Copied validationMetrics has the same size of the original")
    for index in range(len(tvsModel.validationMetrics)):
        self.assertEqual(tvsModel.validationMetrics[index],
                         tvsModelCopied.validationMetrics[index])
def build_model(training):
    # training = read_data()
    training.cache()
    columns = training.columns
    columns.remove("Occupancy")

    assembler = VectorAssembler(inputCols=columns, outputCol="featureVec")
    lr = LogisticRegression(featuresCol="featureVec", labelCol="Occupancy")
    pipeline = Pipeline(stages=[assembler, lr])

    param_grid = ParamGridBuilder() \
        .addGrid(lr.regParam, [0.0001, 0.001, 0.01, 0.1, 1.0]) \
        .build()
    evaluator = BinaryClassificationEvaluator(labelCol="Occupancy")
    validator = TrainValidationSplit(estimator=pipeline,
                                     estimatorParamMaps=param_grid,
                                     evaluator=evaluator,
                                     trainRatio=0.9)
    validator_model = validator.fit(training)
    return validator_model.bestModel
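# Hedged usage sketch for build_model above: read_data() is the helper hinted
# at in the commented-out line, and the extra hold-out split here is only for
# illustration, not part of the original snippet.
training = read_data()
train_df, test_df = training.randomSplit([0.8, 0.2], seed=42)
best_pipeline_model = build_model(train_df)
auc = BinaryClassificationEvaluator(labelCol="Occupancy").evaluate(
    best_pipeline_model.transform(test_df))
print("Hold-out AUC: {:.3f}".format(auc))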
def test_parallel_evaluation(self):
    dataset = self.spark.createDataFrame(
        [(Vectors.dense([0.0]), 0.0),
         (Vectors.dense([0.4]), 1.0),
         (Vectors.dense([0.5]), 0.0),
         (Vectors.dense([0.6]), 1.0),
         (Vectors.dense([1.0]), 1.0)] * 10,
        ["features", "label"])
    lr = LogisticRegression()
    grid = ParamGridBuilder().addGrid(lr.maxIter, [5, 6]).build()
    evaluator = BinaryClassificationEvaluator()
    tvs = TrainValidationSplit(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator)
    tvs.setParallelism(1)
    tvsSerialModel = tvs.fit(dataset)
    tvs.setParallelism(2)
    tvsParallelModel = tvs.fit(dataset)
    self.assertEqual(tvsSerialModel.validationMetrics,
                     tvsParallelModel.validationMetrics)
def test_save_load_nested_estimator(self):
    # This tests saving and loading the trained model only.
    # Save/load for TrainValidationSplit will be added later: SPARK-13786
    temp_path = tempfile.mkdtemp()
    dataset = self.spark.createDataFrame(
        [(Vectors.dense([0.0]), 0.0),
         (Vectors.dense([0.4]), 1.0),
         (Vectors.dense([0.5]), 0.0),
         (Vectors.dense([0.6]), 1.0),
         (Vectors.dense([1.0]), 1.0)] * 10,
        ["features", "label"])
    ova = OneVsRest(classifier=LogisticRegression())
    lr1 = LogisticRegression().setMaxIter(100)
    lr2 = LogisticRegression().setMaxIter(150)
    grid = ParamGridBuilder().addGrid(ova.classifier, [lr1, lr2]).build()
    evaluator = MulticlassClassificationEvaluator()

    tvs = TrainValidationSplit(estimator=ova, estimatorParamMaps=grid, evaluator=evaluator)
    tvsModel = tvs.fit(dataset)
    tvsPath = temp_path + "/tvs"
    tvs.save(tvsPath)
    loadedTvs = TrainValidationSplit.load(tvsPath)
    self.assertEqual(loadedTvs.getEstimator().uid, tvs.getEstimator().uid)
    self.assertEqual(loadedTvs.getEvaluator().uid, tvs.getEvaluator().uid)

    originalParamMap = tvs.getEstimatorParamMaps()
    loadedParamMap = loadedTvs.getEstimatorParamMaps()
    for i, param in enumerate(loadedParamMap):
        for p in param:
            if p.name == "classifier":
                self.assertEqual(param[p].uid, originalParamMap[i][p].uid)
            else:
                self.assertEqual(param[p], originalParamMap[i][p])

    tvsModelPath = temp_path + "/tvsModel"
    tvsModel.save(tvsModelPath)
    loadedModel = TrainValidationSplitModel.load(tvsModelPath)
    self.assertEqual(loadedModel.bestModel.uid, tvsModel.bestModel.uid)
def test_save_load_trained_model(self):
    # This tests saving and loading the trained model only.
    # Save/load for TrainValidationSplit will be added later: SPARK-13786
    temp_path = tempfile.mkdtemp()
    dataset = self.spark.createDataFrame(
        [(Vectors.dense([0.0]), 0.0),
         (Vectors.dense([0.4]), 1.0),
         (Vectors.dense([0.5]), 0.0),
         (Vectors.dense([0.6]), 1.0),
         (Vectors.dense([1.0]), 1.0)] * 10,
        ["features", "label"])
    lr = LogisticRegression()
    grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()
    evaluator = BinaryClassificationEvaluator()
    tvs = TrainValidationSplit(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator)
    tvsModel = tvs.fit(dataset)
    lrModel = tvsModel.bestModel

    tvsModelPath = temp_path + "/tvsModel"
    lrModel.save(tvsModelPath)
    loadedLrModel = LogisticRegressionModel.load(tvsModelPath)
    self.assertEqual(loadedLrModel.uid, lrModel.uid)
    self.assertEqual(loadedLrModel.intercept, lrModel.intercept)
    .addGrid(rf.maxDepth, [5, 10, 15])\
    .addGrid(rf.numTrees, [20, 25, 30])\
    .build()

# A TrainValidationSplit is used for hyper-parameter tuning. It takes a model estimator,
# parameter grid, and evaluator as input and runs the model multiple times to identify
# the best model parameters.
tvs = TrainValidationSplit(estimator=rf,
                           estimatorParamMaps=paramGrid,
                           evaluator=MulticlassClassificationEvaluator(),
                           trainRatio=0.8)

(trainingData, testData) = li.transform(va).randomSplit([0.7, 0.3])

# Run TrainValidationSplit, and choose the best set of parameters.
model = tvs.fit(trainingData)
predictions = model.transform(testData)

evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g" % (1.0 - accuracy))

i2s.transform(predictions).groupBy('predictedLabel', 'maintenanceType')\
    .count().toPandas()

fi = model.bestModel.featureImportances.toArray()
sensorImportances = {}
for sensorIndex in range(len(fi)):
    sensorImportances[sensorNames[sensorIndex]] = round(fi[sensorIndex] * 100)
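# Hedged follow-up: rank the sensors (sensorNames is assumed from the snippet
# above) by the importance percentages just computed; purely illustrative
# formatting of the existing dict.
for sensor, pct in sorted(sensorImportances.items(), key=lambda kv: kv[1], reverse=True):
    print("{}: {}%".format(sensor, pct))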
"lab ~ . + color:value1 + color:value2"])\ .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])\ .addGrid(lr.regParam, [0.1, 2.0])\ .build() # COMMAND ---------- from pyspark.ml.evaluation import BinaryClassificationEvaluator evaluator = BinaryClassificationEvaluator()\ .setMetricName("areaUnderROC")\ .setRawPredictionCol("prediction")\ .setLabelCol("label") # COMMAND ---------- from pyspark.ml.tuning import TrainValidationSplit tvs = TrainValidationSplit()\ .setTrainRatio(0.75)\ .setEstimatorParamMaps(params)\ .setEstimator(pipeline)\ .setEvaluator(evaluator) # COMMAND ---------- tvsFitted = tvs.fit(train) # COMMAND ---------- evaluator.evaluate(tvsFitted.transform(test)) tvsFitted.write.overwrite().save("temp/ModelLocation")
    scaler_model = scaler.fit(dataset)
    scaler_model.save('/user/ronghui_safe/hgy/nid/edw/standardScaler_model_v2')
else:
    scaler_model = StandardScalerModel.load('/user/ronghui_safe/hgy/nid/edw/standardScaler_model_v2')
dataset = scaler_model.transform(dataset)

polyExpansion = PolynomialExpansion(degree=2, inputCol='scaled_feature_vec', outputCol='polyFeatures')
dataset = polyExpansion.transform(dataset)
dataset = dataset.select(F.col('duration'), F.col('polyFeatures'), F.col('key')).cache()

glr = None
if args.mode == 'train':
    glr = GeneralizedLinearRegression(labelCol='duration', featuresCol='polyFeatures',
                                      family='Binomial', linkPredictionCol='link_pred')
    paramGrid = ParamGridBuilder() \
        .addGrid(glr.link, ['logit']) \
        .addGrid(glr.regParam, [1e-5]) \
        .build()
    tvs = TrainValidationSplit(estimator=glr,
                               estimatorParamMaps=paramGrid,
                               evaluator=RegressionEvaluator(metricName='r2', labelCol='duration'),
                               trainRatio=0.7)
    tvs_model = tvs.fit(dataset)
    print('----> {}'.format(tvs_model.validationMetrics))
    if args.save_model:
        tvs_model.write().save('/user/ronghui_safe/hgy/nid/edw/glm_binomial_model_v2')
else:
    # glr_model = GeneralizedLinearRegressionModel.load('/user/ronghui_safe/hgy/nid/models/glm_binomial_model')
    glr_model = TrainValidationSplitModel.load('/user/ronghui_safe/hgy/nid/edw/glm_binomial_model_v2')
    dataset = glr_model.transform(dataset).select(F.col('duration'), F.col('prediction'), F.col('key')).cache()
    if args.mode == 'eval':
        evaluator = RegressionEvaluator(predictionCol='prediction', labelCol='duration', metricName='r2')
        print('----> The performance on the whole dataset is {}'.format(round(evaluator.evaluate(dataset), 4)))
    dataset.drop('duration').repartition(50).write.csv(
        '/user/ronghui_safe/hgy/nid/weights/{}_{}'.format(args.query_month, args.mode), header=True)
def get_als_model():
    ### Create our SparkSession, this can take a couple minutes locally
    spark = SparkSession.builder.appName("Review_data_JSON2").config(
        'spark.sql.broadcastTimeout', '34000').getOrCreate()

    ### Open the data from review.json
    df_reviews = spark.read.json("data_source/review.json")

    ### Take only 192,000 rows from the original review.json.
    ratingsRDD = df_reviews.select("user_id", "business_id", "stars").take(192000)
    df_reviews = spark.createDataFrame(ratingsRDD)

    columns_indexing = ["user_id", "business_id"]

    ### Using StringIndexer to create a category feature for user_id and business_id.
    indexers = [
        StringIndexer(inputCol=column, outputCol=column + "_index").fit(df_reviews)
        for column in columns_indexing
    ]

    ### Creating a Pipeline to index two columns from the current dataset.
    pipeline = Pipeline(stages=indexers)

    ### Creating the new DataFrame after encoding user and business Id's.
    df_reviews_prepro = pipeline.fit(df_reviews).transform(df_reviews)

    ### Splitting into training and test datasets.
    (training_review, test_review) = df_reviews_prepro.select(
        "user_id_index", "business_id_index", "stars").randomSplit([0.8, 0.2])

    ### Creating our ALS prediction model.
    als_model = ALS(userCol="user_id_index", itemCol="business_id_index",
                    ratingCol="stars", coldStartStrategy="drop", nonnegative=True)

    ### Tuning model
    param_grid = ParamGridBuilder()\
        .addGrid(als_model.rank, [12, 13, 14])\
        .addGrid(als_model.maxIter, [18, 19, 20])\
        .addGrid(als_model.regParam, [.17, .18, .19])\
        .build()

    ### Evaluate as Root Mean Squared Error
    evaluator = RegressionEvaluator(metricName="rmse", labelCol="stars",
                                    predictionCol="prediction")

    ### Hold-out validation over the parameter grid.
    tvs = TrainValidationSplit(estimator=als_model,
                               estimatorParamMaps=param_grid,
                               evaluator=evaluator)

    ### Training the model.
    model = tvs.fit(training_review)
    best_model = model.bestModel
    return best_model
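# Hedged usage sketch: the object returned by get_als_model() is a fitted
# ALSModel, so the standard top-N recommendation API applies.
best_model = get_als_model()
best_model.recommendForAllUsers(10).show(5, truncate=False)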
gbdt = GBTClassifier(labelCol='label', featuresCol='features')

# build param grid
paramGrid = ParamGridBuilder()\
    .addGrid(gbdt.maxDepth, [6, 7, 8])\
    .addGrid(gbdt.minInstancesPerNode, [200, 500, 800])\
    .addGrid(gbdt.maxIter, [100, 120, 140])\
    .addGrid(gbdt.stepSize, [0.04, 0.08])\
    .addGrid(gbdt.subsamplingRate, [0.6, 0.8])\
    .build()

# build train validation split
tvs = TrainValidationSplit(estimator=gbdt,
                           estimatorParamMaps=paramGrid,
                           evaluator=BinaryClassificationEvaluator(),
                           trainRatio=0.8)

# train the model
model_sp = tvs.fit(trainData)

# predict
train_pred = model_sp.transform(trainData).select('label', 'prediction')
cv_pred = model_sp.transform(cvData).select('label', 'prediction')
test_pred = model_sp.transform(testData).select('label', 'prediction')

# convert spark df to pandas df
train_pred_pd = train_pred.toPandas()
cv_pred_pd = cv_pred.toPandas()
test_pred_pd = test_pred.toPandas()

# evaluate precision, recall, and the f1 score
train_precision = metrics.precision_score(train_pred_pd.label.values, train_pred_pd.prediction.values)
train_recall = metrics.recall_score(train_pred_pd.label.values, train_pred_pd.prediction.values)
train_f1 = metrics.f1_score(train_pred_pd.label.values, train_pred_pd.prediction.values)
# ## Specify the evaluator

from pyspark.ml.evaluation import BinaryClassificationEvaluator
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction",
                                          labelCol="Class",
                                          metricName="areaUnderPR")


# ## Tuning the hyperparameters using holdout cross-validation

# For large DataFrames, holdout cross-validation will be more efficient. Use
# the
# [TrainValidationSplit](http://spark.apache.org/docs/latest/api/python/pyspark.ml.html#pyspark.ml.tuning.TrainValidationSplit)
# class to specify holdout cross-validation:
from pyspark.ml.tuning import TrainValidationSplit
validator = TrainValidationSplit(estimator=rf,
                                 estimatorParamMaps=grid,
                                 evaluator=evaluator,
                                 trainRatio=0.75,
                                 seed=54321)

# Use the `fit` method to find the best set of hyperparameters:
%time cv_model = validator.fit(df_train)

# **Note:** Our train DataFrame is split again according to `trainRatio`.

# The resulting model is an instance of the
# [TrainValidationSplitModel](http://spark.apache.org/docs/latest/api/python/pyspark.ml.html#pyspark.ml.tuning.TrainValidationSplitModel)
# class:
type(cv_model)

# The cross-validation results are stored in the `validationMetrics` attribute:
cv_model.validationMetrics

# Plotting Validation Metric for each set of hyperparameters (NumTrees).
def plot_holdout_results(model):
    plt.plot(numTreesList, model.validationMetrics)
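# Hedged usage sketch for the helper above: numTreesList is assumed to be the
# same list used to build `grid` (e.g. ParamGridBuilder().addGrid(rf.numTrees,
# numTreesList).build()), so the x-axis lines up with validationMetrics, which
# holds one area-under-PR value per grid entry.
import matplotlib.pyplot as plt

plot_holdout_results(cv_model)
plt.xlabel("numTrees")
plt.ylabel("Area under PR (validation)")
plt.show()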
def SVM(trainingData, testData):
    start_time = time.time()
    print(" ")
    print("--------------------- SUPPORT VECTOR MACHINE ---------------------")

    svm = LinearSVC()
    ovr = OneVsRest(classifier=svm)

    # Parameters to tune
    paramGrid = ParamGridBuilder() \
        .addGrid(svm.regParam, [1, 0]) \
        .addGrid(svm.maxIter, [100, 1000]) \
        .build()

    # Tune over the parameter grid to pick the best model
    tvs = TrainValidationSplit(estimator=ovr,
                               estimatorParamMaps=paramGrid,
                               evaluator=MulticlassClassificationEvaluator(),
                               # Validation split: 80% training, 20% validation.
                               trainRatio=0.8)

    model = tvs.fit(trainingData)
    prediction = model.transform(testData)
    result = prediction.select('features', 'label', 'prediction')

    # Compute accuracy
    evaluator = MulticlassClassificationEvaluator(
        labelCol="label", predictionCol="prediction", metricName="accuracy")
    accuracy = evaluator.evaluate(prediction)

    evaluator = MulticlassClassificationEvaluator(labelCol="label",
                                                  predictionCol="prediction",
                                                  metricName="f1")
    f1score = evaluator.evaluate(prediction)

    # Confusion matrix
    class_temp = prediction.select("label").groupBy("label") \
        .count().sort('count', ascending=False).toPandas()
    class_temp = class_temp["label"].values.tolist()
    y_true = prediction.select("label")
    y_true = y_true.toPandas()
    y_pred = prediction.select("prediction")
    y_pred = y_pred.toPandas()
    cnf_matrix = confusion_matrix(y_true, y_pred, labels=class_temp)

    print("Accuracy Hold-Out: ", accuracy)
    print("F1-Score Hold-Out: ", f1score)
    print("")
    print("")
    print("Doc Parameters : [", model.explainParams(), "]")
    print("")
    print("")
    print("Confusion Matrix: ")
    print(cnf_matrix)
    print("SVM HoldOut Execution TIME:", time.time() - start_time)

    # Run the SVM variant that uses k-fold cross-validation
    f1score_cv, cnf_matrix_cv, cv = SVMCV(trainingData, testData)

    # Return the better model between hold-out and k-fold validation
    if (f1score <= f1score_cv):
        return (f1score_cv, cnf_matrix_cv, cv)
    else:
        return (f1score, cnf_matrix, tvs)
"""RMSE for basic model after resolving cold start problem is 0.92""" als = ALS(userCol="userid", itemCol="itemid", ratingCol="rating",coldStartStrategy="drop", nonnegative=True) #Tuning model using ParamGridBuilder param_grid=ParamGridBuilder()\ .addGrid(als.rank,(12,13,14))\ .addGrid(als.maxIter,(5,10,15))\ .addGrid(als.regParam,[0.01,0.05,0.10])\ .build() evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction") #CrossValidation tvs=TrainValidationSplit(estimator=als,estimatorParamMaps=param_grid,evaluator=evaluator) model = tvs.fit(training) best_model=model.bestModel predictions = best_model.transform(test) rmse = evaluator.evaluate(predictions) print("Root-mean-square error = " + str(rmse)) """Root-mean-square error = 0.916 Improving performance by cross validation """ from pyspark.ml.tuning import CrossValidator, ParamGridBuilder als = ALS(userCol="userid", itemCol="itemid", ratingCol="rating",coldStartStrategy="drop", nonnegative=True)
# In many cases, holdout cross-validation will be sufficient. Use the
# [TrainValidationSplit](http://spark.apache.org/docs/latest/api/python/pyspark.ml.html#pyspark.ml.tuning.TrainValidationSplit)
# class to specify holdout cross-validation:
from pyspark.ml.tuning import TrainValidationSplit
tvs = TrainValidationSplit(estimator=lr,
                           estimatorParamMaps=grid,
                           evaluator=evaluator,
                           trainRatio=0.75,
                           seed=54321)

# Background material:
# [TrainValidationSplit](https://es.wikipedia.org/wiki/Validacion_cruzada)

# For each combination of hyperparameters, the linear regression will be
# trained on a random 75% of the records from the `train` DataFrame
# and then evaluated on the remaining 25%.

# Use the `fit` method to find the best set of parameters:
%time tvs_model = tvs.fit(train)

# The result is an instance of the
# [TrainValidationSplitModel](http://spark.apache.org/docs/latest/api/python/pyspark.ml.html#pyspark.ml.tuning.TrainValidationSplitModel)
# class:
type(tvs_model)

# The validation results are stored in the `validationMetrics` attribute:
tvs_model.validationMetrics

# These are the RMSEs for each set of hyperparameters. Smaller is better.
def plot_holdout_results(model):
    plt.plot(regParamList, model.validationMetrics)
def gbtRegression(df, conf):
    """
        input  : df [spark.dataframe], conf [configuration params]
        output : gbt_regression model [model]
    """
    featuresCol = conf["params"].get("featuresCol")
    labelCol = conf["params"].get("labelCol")
    predictionCol = conf["params"].get("predictionCol")
    impurity = conf["params"].get("impurity", "variance")
    maxDepth = conf["params"].get("maxDepth", 5)
    maxIter = conf["params"].get("maxIter", 20)
    maxBin = conf["params"].get("maxBins", 32)
    minInstancesPerNode = conf["params"].get("minInstancesPerNode", 1)
    minInfoGain = conf["params"].get("minInfoGain", 0.0)
    maxMemoryInMB = conf["params"].get("maxMemoryInMB", 256)
    cacheNodeIds = conf["params"].get("cacheNodeIds", False)
    subsamplingRate = conf["params"].get("subsamplingRate", 1.0)
    checkpointInterval = conf["params"].get("checkpointInterval", 10)
    lossType = conf["params"].get("lossType", "squared")
    seed = conf["params"].get("seed", None)

    gbt = GBTRegressor(maxIter=maxIter, maxDepth=maxDepth, featuresCol="indexedFeatures")
    # featureIndexer is assumed to be a fitted VectorIndexer defined elsewhere in this module
    pipeline = Pipeline(stages=[featureIndexer, gbt])

    print("maxDepth : ", gbt.getMaxDepth())
    print("maxIter : ", gbt.getMaxIter())

    # if using ml-tuning
    if conf["tuning"]:
        # if using cross validation
        if conf["tuning"].get("method").lower() == "crossval":
            paramGrids = conf["tuning"].get("paramGrids")
            pg = ParamGridBuilder()
            for key in paramGrids:
                pg.addGrid(key, paramGrids[key])
            grid = pg.build()
            folds = conf["tuning"].get("methodParam")
            evaluator = RegressionEvaluator()
            cv = CrossValidator(estimator=pipeline, estimatorParamMaps=grid,
                                evaluator=evaluator, numFolds=folds)
            model = cv.fit(df)
        # if using train validation split
        elif conf["tuning"].get("method").lower() == "trainvalsplit":
            paramGrids = conf["tuning"].get("paramGrids")
            pg = ParamGridBuilder()
            for key in paramGrids:
                pg.addGrid(key, paramGrids[key])
            grid = pg.build()
            tr = conf["tuning"].get("methodParam")
            evaluator = RegressionEvaluator()
            tvs = TrainValidationSplit(estimator=pipeline, estimatorParamMaps=grid,
                                       evaluator=evaluator, trainRatio=tr)
            model = tvs.fit(df)
    # if not using ml-tuning
    elif conf["tuning"] == None:
        print("test")
        model = pipeline.fit(df)
    return model
#########################
param_grid = ParamGridBuilder() \
    .addGrid(lr.regParam, [1.0, 0.1, 0.01, 0.001]) \
    .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0]) \
    .build()

#######################################
# Hyperparameter Tuning - Grid Search #
#######################################
t_0 = time.time()

train_val = TrainValidationSplit(
    estimator=pipeline,
    estimatorParamMaps=param_grid,
    evaluator=BinaryClassificationEvaluator(metricName='areaUnderPR'),
    trainRatio=0.8)

model = train_val.fit(train_df)

print(model.bestModel.stages[-1].explainParam('regParam'))
print(model.bestModel.stages[-1].explainParam('elasticNetParam'))
print('Grid search took: {} seconds'.format(time.time() - t_0))

#################
# Model Metrics #
#################
t_0 = time.time()
predictions = model.transform(test_df)
print('Model training took: {} seconds'.format(time.time() - t_0))

evaluator = BinaryClassificationEvaluator()
auroc = evaluator.evaluate(predictions,
def linearRegression(df, conf):
    """
        input  : df [spark.dataframe], conf [configuration params]
        output : linear_regression model [model]
    """
    # read the parameters (with default values)
    featuresCol = conf["params"].get("featuresCol", "features")
    labelCol = conf["params"].get("labelCol", "label")
    predictionCol = conf["params"].get("predictionCol", "prediction")
    max_iter = conf["params"].get("maxIter", 100)
    reg_param = conf["params"].get("regParam", 0.0)
    elasticnet_param = conf["params"].get("elasticNetParam", 0.0)
    tol = conf["params"].get("tol", 1e-6)
    fitIntercept = conf["params"].get("fitIntercept", True)
    standardization = conf["params"].get("standardization", True)
    solver = conf["params"].get("solver", "auto")
    weightCol = conf["params"].get("weightCol", None)
    aggregationDepth = conf["params"].get("aggregationDepth", 2)
    loss = conf["params"].get("loss", "squaredError")
    epsilon = conf["params"].get("epsilon", 1.35)

    lr = LinearRegression(maxIter=max_iter, regParam=reg_param,
                          elasticNetParam=elasticnet_param)

    print("maxIter : ", lr.getMaxIter())
    print("regParam : ", lr.getRegParam())
    print("aggrDepth : ", lr.getAggregationDepth())

    # if using ml-tuning
    if conf["tuning"]:
        # if using cross validation
        if conf["tuning"].get("method").lower() == "crossval":
            paramGrids = conf["tuning"].get("paramGrids")
            pg = ParamGridBuilder()
            for key in paramGrids:
                pg.addGrid(key, paramGrids[key])
            grid = pg.build()
            folds = conf["tuning"].get("methodParam")
            evaluator = RegressionEvaluator()
            cv = CrossValidator(estimator=lr, estimatorParamMaps=grid,
                                evaluator=evaluator, numFolds=folds)
            model = cv.fit(df)
        # if using train validation split
        elif conf["tuning"].get("method").lower() == "trainvalsplit":
            paramGrids = conf["tuning"].get("paramGrids")
            pg = ParamGridBuilder()
            for key in paramGrids:
                pg.addGrid(key, paramGrids[key])
            grid = pg.build()
            tr = conf["tuning"].get("methodParam")
            evaluator = RegressionEvaluator()
            tvs = TrainValidationSplit(estimator=lr, estimatorParamMaps=grid,
                                       evaluator=evaluator, trainRatio=tr)
            model = tvs.fit(df)
    # if not using ml-tuning
    elif conf["tuning"] == None:
        print("test")
        model = lr.fit(df)
    return model
paramGrid = (ParamGridBuilder()
             .addGrid(dt.maxDepth, [1, 2, 6])
             .addGrid(dt.maxBins, [20, 40])
             .build())

# COMMAND ----------

# MAGIC %md ### We use _TrainValidationSplit_ to build a general model

# COMMAND ----------

dt_tvs = TrainValidationSplit(estimator=dtp,
                              evaluator=MulticlassClassificationEvaluator(),
                              estimatorParamMaps=paramGrid,
                              trainRatio=0.8)
dtModel = dt_tvs.fit(train)

# COMMAND ----------

# MAGIC %md ### Test the Model
# MAGIC Now that we've trained the classifier, let's see how accurately it predicts known labels in the test set.

# COMMAND ----------

prediction = dtModel.transform(test)
predicted = prediction.select("features", "prediction", "trueLabel")
predicted.show(10)

# COMMAND ----------

# MAGIC %md ## TP, FP, TN, and FN all calculated
# [0.1, 0.05, 0.01]) \
paramGrid = ParamGridBuilder()\
    .addGrid(hashingTF.numFeatures, [1000]) \
    .addGrid(lr.regParam, [0.1]) \
    .build()

crossval = TrainValidationSplit(
    estimator=pipeline,
    estimatorParamMaps=paramGrid,
    # set area under the precision-recall curve as the evaluation metric
    evaluator=BinaryClassificationEvaluator().setMetricName('areaUnderPR'),
    # 80% of the data will be used for training, 20% for validation.
    trainRatio=0.8)

cvModel = crossval.fit(training_spark_df_binary)
cvModel.bestModel.save("model")

# Make predictions
train_prediction = cvModel.transform(training_spark_df_binary)
test_prediction = cvModel.transform(testing_spark_df_binary)
otherDatasetTest = cvModel.transform(otherDatasetTest_df_binary)

pd_prediction = test_prediction.select("*").toPandas()
actual = pd_prediction["label"].tolist()
pred = pd_prediction["prediction"].tolist()

pd_prediction_other_dataset = otherDatasetTest.select("*").toPandas()
actual_otherdataset = pd_prediction_other_dataset["label"].tolist()
pred_otherdataset = pd_prediction_other_dataset["prediction"].tolist()

tn, fp, fn, tp = confusion_matrix(actual, pred).ravel()
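# Hedged sketch: cvModel.bestModel above is the fitted pipeline, so it can be
# reloaded later with PipelineModel.load (the "model" path is taken from the
# snippet; the column selection is illustrative).
from pyspark.ml import PipelineModel

reloaded_pipeline = PipelineModel.load("model")
reloaded_pipeline.transform(testing_spark_df_binary).select("label", "prediction").show(5)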
# building model
als = ALS(nonnegative=True, checkpointInterval=3, coldStartStrategy="drop")

paramGrid = ParamGridBuilder()\
    .addGrid(als.rank, [5, 30, 70])\
    .addGrid(als.regParam, [0.1, 1, 10])\
    .build()

rmse = RegressionEvaluator(metricName="rmse", labelCol="rating")

# trainRatio makes train:0.5 valid:0.25 and test:0.25
tvs = TrainValidationSplit(
    estimator=als,
    estimatorParamMaps=paramGrid,
    evaluator=rmse,
    seed=seed,
    trainRatio=0.66,
    parallelism=3
)
model = tvs.fit(dftrain)
model.transform(dftrain).show()

testPred = model.transform(dftest)
testPred.show(5)
rmse.evaluate(testPred)

model_path = os.getcwd() + '/ALS_model2'
model.save(model_path)
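# Hedged sketch: the TrainValidationSplitModel persisted above can be reloaded
# from model_path and reused for scoring; in recent Spark versions the
# per-grid-entry validation metrics are persisted as well.
from pyspark.ml.tuning import TrainValidationSplitModel

reloaded = TrainValidationSplitModel.load(model_path)
reloaded.transform(dftest).select("rating", "prediction").show(5)
print(reloaded.validationMetrics)  # one RMSE per entry in paramGrid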
EN = LinearRegression(labelCol=labelCol, featuresCol='features',
                      fitIntercept=True, standardization=False)

EN_paramGrid = ParamGridBuilder().addGrid(EN.regParam, [10, 1, 0.1, 0.01, 0.001])\
    .addGrid(EN.elasticNetParam, [0.0, 0.5, 1.0])\
    .build()

EN_tvs = TrainValidationSplit(estimator=EN,
                              estimatorParamMaps=EN_paramGrid,
                              evaluator=RegressionEvaluator(labelCol=labelCol),
                              # 80% of the data will be used for training, 20% for validation.
                              trainRatio=0.8)

EN_model = EN_tvs.fit(train)
EN_model.save("s3://buj201-two-sigma-challenge/EN_model")

GBR = GBTRegressor(labelCol=labelCol, lossType="squared")

GBR_paramGrid = ParamGridBuilder().addGrid(GBR.maxDepth, [2, 4, 6])\
    .addGrid(GBR.maxIter, [50, 100, 200])\
    .addGrid(GBR.stepSize, [0.01, 0.1, 0.3])\
    .build()

GBR_tvs = TrainValidationSplit(estimator=GBR,
                               estimatorParamMaps=GBR_paramGrid,
                               evaluator=RegressionEvaluator(labelCol=labelCol),
                               # 80% of the data will be used for training, 20% for validation.
                               trainRatio=0.8)
"s3://rtl-databricks-datascience/lpater/processed_data/bids_train.parquet/" ).groupBy("deal_id").count().orderBy( 'count', ascending=False).select("deal_id").toPandas()["deal_id"] ) #create a list of deal_ids to select from, ordered by how common they are # COMMAND ---------- #create the objects tree = DecisionTreeClassifier() paramGrid = ParamGridBuilder()\ .addGrid(tree.maxDepth, [4, 5, 6, 7, 8]) \ .build() tvs = TrainValidationSplit( estimator=tree, estimatorParamMaps=paramGrid, evaluator=BinaryClassificationEvaluator(metricName="areaUnderPR"), # 80% of the data will be used for training, 20% for validation. trainRatio=0.8) # COMMAND ---------- # Run TrainValidationSplit, and choose the best regularization paramer. for deal_id in deal_ids: training_data = market_train.withColumnRenamed(deal_id, 'label') #testing_data = market_test.withColumnRenamed(deal_id,'label') model = tvs.fit(training_data) #print({"accuracy" : model.summary.accuracy}) #print({variable_names[variable_number] : model.coefficients[variable_number.item()] for variable_number in model.coefficients.indices})
.load("data/mllib/sample_linear_regression_data.txt") train, test = data.randomSplit([0.7, 0.3]) lr = LinearRegression(maxIter=10, regParam=0.1) # We use a ParamGridBuilder to construct a grid of parameters to search over. # TrainValidationSplit will try all combinations of values and determine best model using # the evaluator. paramGrid = ParamGridBuilder()\ .addGrid(lr.regParam, [0.1, 0.01]) \ .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])\ .build() # In this case the estimator is simply the linear regression. # A TrainValidationSplit requires an Estimator, a set of Estimator ParamMaps, and an Evaluator. tvs = TrainValidationSplit( estimator=lr, estimatorParamMaps=paramGrid, evaluator=RegressionEvaluator(), # 80% of the data will be used for training, 20% for validation. trainRatio=0.8) # Run TrainValidationSplit, and choose the best set of parameters. model = tvs.fit(train) # Make predictions on test data. model is the model with combination of parameters # that performed best. prediction = model.transform(test) for row in prediction.take(5): print(row) # $example off$ spark.stop()
def generalizedLinearRegressor(dataFrame, conf):
    """
        input:  df [spark.dataFrame], conf [configuration params]
        output: generalized linear regression model [model]
    """
    # calling params
    label_col = conf["params"].get("labelCol", "label")
    features_col = conf["params"].get("featuresCol", "features")
    prediction_col = conf["params"].get("predictionCol", "prediction")
    fam = conf["params"].get("family", "gaussian")
    fit_intercept = conf["params"].get("fitIntercept", True)
    max_iter = conf["params"].get("maxIter", 25)
    tolp = conf["params"].get("tol", 1e-6)
    reg_param = conf["params"].get("regParam", 0.0)
    weight_col = conf["params"].get("weightCol", None)
    solverp = conf["params"].get("solver", "irls")
    link_prediction_col = conf["params"].get("linkPredictionCol", None)
    variance_power = conf["params"].get("variancePower", 0.0)
    link_power = conf["params"].get("linkPower", None)

    if (fam == "gaussian"):
        li = conf["params"].get("link", "identity")
    elif (fam == "binomial"):
        li = conf["params"].get("link", "logit")
    elif (fam == "poisson"):
        li = conf["params"].get("link", "log")
    elif (fam == "gamma"):
        li = conf["params"].get("link", "inverse")
    elif (fam == "tweedie"):
        li = conf["params"].get("link", 1 - variance_power)
    else:
        li = conf["params"].get("link", None)

    glr = GeneralizedLinearRegression(labelCol=label_col,
                                      featuresCol=features_col,
                                      predictionCol=prediction_col,
                                      family=fam,
                                      link=li,
                                      fitIntercept=fit_intercept,
                                      maxIter=max_iter,
                                      tol=tolp,
                                      regParam=reg_param,
                                      solver=solverp,
                                      linkPredictionCol=link_prediction_col,
                                      variancePower=variance_power,
                                      linkPower=link_power)

    # with tuning
    if conf["tuning"]:
        # method: cross validation
        if conf["tuning"].get("method").lower() == "crossval":
            paramGrids = conf["tuning"].get("paramGrids")
            pg = ParamGridBuilder()
            for key in paramGrids:
                pg.addGrid(key, paramGrids[key])
            grid = pg.build()
            folds = conf["tuning"].get("methodParam")
            evaluator = RegressionEvaluator()
            cv = CrossValidator(estimator=glr, estimatorParamMaps=grid,
                                evaluator=evaluator, numFolds=folds)
            model = cv.fit(dataFrame)
        # method: train validation split
        elif conf["tuning"].get("method").lower() == "trainvalsplit":
            paramGrids = conf["tuning"].get("paramGrids")
            pg = ParamGridBuilder()
            for key in paramGrids:
                pg.addGrid(key, paramGrids[key])
            grid = pg.build()
            tr = conf["tuning"].get("methodParam")
            evaluator = RegressionEvaluator()
            tvs = TrainValidationSplit(estimator=glr, estimatorParamMaps=grid,
                                       evaluator=evaluator, trainRatio=tr)
            model = tvs.fit(dataFrame)
    # without tuning
    else:
        model = glr.fit(dataFrame)
    return model
def test_save_load_pipeline_estimator(self):
    temp_path = tempfile.mkdtemp()
    training = self.spark.createDataFrame([
        (0, "a b c d e spark", 1.0),
        (1, "b d", 0.0),
        (2, "spark f g h", 1.0),
        (3, "hadoop mapreduce", 0.0),
        (4, "b spark who", 1.0),
        (5, "g d a y", 0.0),
        (6, "spark fly", 1.0),
        (7, "was mapreduce", 0.0),
    ], ["id", "text", "label"])

    # Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")

    ova = OneVsRest(classifier=LogisticRegression())
    lr1 = LogisticRegression().setMaxIter(5)
    lr2 = LogisticRegression().setMaxIter(10)

    pipeline = Pipeline(stages=[tokenizer, hashingTF, ova])

    paramGrid = ParamGridBuilder() \
        .addGrid(hashingTF.numFeatures, [10, 100]) \
        .addGrid(ova.classifier, [lr1, lr2]) \
        .build()

    tvs = TrainValidationSplit(estimator=pipeline,
                               estimatorParamMaps=paramGrid,
                               evaluator=MulticlassClassificationEvaluator())
    tvsPath = temp_path + "/tvs"
    tvs.save(tvsPath)
    loadedTvs = TrainValidationSplit.load(tvsPath)
    self.assert_param_maps_equal(loadedTvs.getEstimatorParamMaps(), paramGrid)
    self.assertEqual(loadedTvs.getEstimator().uid, tvs.getEstimator().uid)

    # Run train validation split, and choose the best set of parameters.
    tvsModel = tvs.fit(training)

    # test save/load of TrainValidationSplitModel
    tvsModelPath = temp_path + "/tvsModel"
    tvsModel.save(tvsModelPath)
    loadedModel = TrainValidationSplitModel.load(tvsModelPath)
    self.assertEqual(loadedModel.bestModel.uid, tvsModel.bestModel.uid)
    self.assertEqual(len(loadedModel.bestModel.stages), len(tvsModel.bestModel.stages))
    for loadedStage, originalStage in zip(loadedModel.bestModel.stages,
                                          tvsModel.bestModel.stages):
        self.assertEqual(loadedStage.uid, originalStage.uid)

    # Test nested pipeline
    nested_pipeline = Pipeline(stages=[tokenizer, Pipeline(stages=[hashingTF, ova])])
    tvs2 = TrainValidationSplit(estimator=nested_pipeline,
                                estimatorParamMaps=paramGrid,
                                evaluator=MulticlassClassificationEvaluator())
    tvs2Path = temp_path + "/tvs2"
    tvs2.save(tvs2Path)
    loadedTvs2 = TrainValidationSplit.load(tvs2Path)
    self.assert_param_maps_equal(loadedTvs2.getEstimatorParamMaps(), paramGrid)
    self.assertEqual(loadedTvs2.getEstimator().uid, tvs2.getEstimator().uid)

    # Run train validation split, and choose the best set of parameters.
    tvsModel2 = tvs2.fit(training)

    # test save/load of TrainValidationSplitModel
    tvsModelPath2 = temp_path + "/tvsModel2"
    tvsModel2.save(tvsModelPath2)
    loadedModel2 = TrainValidationSplitModel.load(tvsModelPath2)
    self.assertEqual(loadedModel2.bestModel.uid, tvsModel2.bestModel.uid)
    loaded_nested_pipeline_model = loadedModel2.bestModel.stages[1]
    original_nested_pipeline_model = tvsModel2.bestModel.stages[1]
    self.assertEqual(loaded_nested_pipeline_model.uid, original_nested_pipeline_model.uid)
    self.assertEqual(len(loaded_nested_pipeline_model.stages),
                     len(original_nested_pipeline_model.stages))
    for loadedStage, originalStage in zip(loaded_nested_pipeline_model.stages,
                                          original_nested_pipeline_model.stages):
        self.assertEqual(loadedStage.uid, originalStage.uid)
    .addGrid(lr.regParam, [0.1, 2.0])\
    .build()


# COMMAND ----------

from pyspark.ml.evaluation import BinaryClassificationEvaluator
evaluator = BinaryClassificationEvaluator()\
    .setMetricName("areaUnderROC")\
    .setRawPredictionCol("prediction")\
    .setLabelCol("label")


# COMMAND ----------

from pyspark.ml.tuning import TrainValidationSplit
tvs = TrainValidationSplit()\
    .setTrainRatio(0.75)\
    .setEstimatorParamMaps(params)\
    .setEstimator(pipeline)\
    .setEvaluator(evaluator)


# COMMAND ----------

tvsFitted = tvs.fit(train)


# COMMAND ----------
lr = LogisticRegression(maxIter=10, regParam=0.01)

paramMap = ({lr.regParam: 0.1, lr.threshold: 0.55, lr.maxIter: 100, })

paramGrid = ParamGridBuilder()\
    .addGrid(lr.regParam, [0.1, 0.01]) \
    .addGrid(lr.threshold, [0.51, 0.56])\
    .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])\
    .build()

tvs = TrainValidationSplit(estimator=lr,
                           estimatorParamMaps=paramGrid,
                           evaluator=MulticlassClassificationEvaluator(),
                           # 80% of the data will be used for training, 20% for validation.
                           trainRatio=0.8)

model = tvs.fit(X_y)  # lr.fit(train, paramMap)

################################################ TESTING_MODEL ###############################################################
print('*' * 50, 'TESTING_MODEL', '*' * 50)
predictions = model.transform(test)
result = predictions.select("features", "label", "prediction").collect()

for row in result:
    print("features=%s, label=%s -> prediction=%s"
          % (row.features, row.label, row.prediction))

evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Acc = %g " % (accuracy))
def Train(self): st_global = time.time() CommonUtils.create_update_and_save_progress_message( self._dataframe_context, self._scriptWeightDict, self._scriptStages, self._slug, "initialization", "info", display=True, emptyBin=False, customMsg=None, weightKey="total") algosToRun = self._dataframe_context.get_algorithms_to_run() algoSetting = [ x for x in algosToRun if x.get_algorithm_slug() == self._slug ][0] categorical_columns = self._dataframe_helper.get_string_columns() uid_col = self._dataframe_context.get_uid_column() if self._metaParser.check_column_isin_ignored_suggestion(uid_col): categorical_columns = list(set(categorical_columns) - {uid_col}) allDateCols = self._dataframe_context.get_date_columns() categorical_columns = list(set(categorical_columns) - set(allDateCols)) numerical_columns = self._dataframe_helper.get_numeric_columns() result_column = self._dataframe_context.get_result_column() categorical_columns = [ x for x in categorical_columns if x != result_column ] appType = self._dataframe_context.get_app_type() model_path = self._dataframe_context.get_model_path() if model_path.startswith("file"): model_path = model_path[7:] validationDict = self._dataframe_context.get_validation_dict() print("model_path", model_path) pipeline_filepath = "file://" + str(model_path) + "/" + str( self._slug) + "/pipeline/" model_filepath = "file://" + str(model_path) + "/" + str( self._slug) + "/model" pmml_filepath = "file://" + str(model_path) + "/" + str( self._slug) + "/modelPmml" df = self._data_frame levels = df.select(result_column).distinct().count() appType = self._dataframe_context.get_app_type() model_filepath = model_path + "/" + self._slug + "/model" pmml_filepath = str(model_path) + "/" + str( self._slug) + "/traindeModel.pmml" CommonUtils.create_update_and_save_progress_message( self._dataframe_context, self._scriptWeightDict, self._scriptStages, self._slug, "training", "info", display=True, emptyBin=False, customMsg=None, weightKey="total") st = time.time() pipeline = MLUtils.create_pyspark_ml_pipeline(numerical_columns, categorical_columns, result_column) trainingData, validationData = MLUtils.get_training_and_validation_data( df, result_column, 0.8) # indexed labelIndexer = StringIndexer(inputCol=result_column, outputCol="label") # OriginalTargetconverter = IndexToString(inputCol="label", outputCol="originalTargetColumn") # Label Mapping and Inverse labelIdx = labelIndexer.fit(trainingData) labelMapping = {k: v for k, v in enumerate(labelIdx.labels)} inverseLabelMapping = { v: float(k) for k, v in enumerate(labelIdx.labels) } if self._dataframe_context.get_trainerMode() == "autoML": automl_enable = True else: automl_enable = False clf = NaiveBayes() if not algoSetting.is_hyperparameter_tuning_enabled(): algoParams = algoSetting.get_params_dict() else: algoParams = algoSetting.get_params_dict_hyperparameter() print("=" * 100) print(algoParams) print("=" * 100) clfParams = [prm.name for prm in clf.params] algoParams = { getattr(clf, k): v if isinstance(v, list) else [v] for k, v in algoParams.items() if k in clfParams } #print("="*100) #print("ALGOPARAMS - ",algoParams) #print("="*100) paramGrid = ParamGridBuilder() # if not algoSetting.is_hyperparameter_tuning_enabled(): # for k,v in algoParams.items(): # if v == [None] * len(v): # continue # if k.name == 'thresholds': # paramGrid = paramGrid.addGrid(k,v[0]) # else: # paramGrid = paramGrid.addGrid(k,v) # paramGrid = paramGrid.build() # if not algoSetting.is_hyperparameter_tuning_enabled(): for k, v in algoParams.items(): 
print(k, v) if v == [None] * len(v): continue paramGrid = paramGrid.addGrid(k, v) paramGrid = paramGrid.build() # else: # for k,v in algoParams.items(): # print k.name, v # if v[0] == [None] * len(v[0]): # continue # paramGrid = paramGrid.addGrid(k,v[0]) # paramGrid = paramGrid.build() #print("="*143) #print("PARAMGRID - ", paramGrid) #print("="*143) if len(paramGrid) > 1: hyperParamInitParam = algoSetting.get_hyperparameter_params() evaluationMetricDict = { "name": hyperParamInitParam["evaluationMetric"] } evaluationMetricDict[ "displayName"] = GLOBALSETTINGS.SKLEARN_EVAL_METRIC_NAME_DISPLAY_MAP[ evaluationMetricDict["name"]] else: evaluationMetricDict = { "name": GLOBALSETTINGS.CLASSIFICATION_MODEL_EVALUATION_METRIC } evaluationMetricDict[ "displayName"] = GLOBALSETTINGS.SKLEARN_EVAL_METRIC_NAME_DISPLAY_MAP[ evaluationMetricDict["name"]] self._result_setter.set_hyper_parameter_results(self._slug, None) if validationDict["name"] == "kFold": numFold = int(validationDict["value"]) estimator = Pipeline(stages=[pipeline, labelIndexer, clf]) if algoSetting.is_hyperparameter_tuning_enabled(): modelFilepath = "/".join(model_filepath.split("/")[:-1]) pySparkHyperParameterResultObj = PySparkGridSearchResult( estimator, paramGrid, appType, modelFilepath, levels, evaluationMetricDict, trainingData, validationData, numFold, self._targetLevel, labelMapping, inverseLabelMapping, df) resultArray = pySparkHyperParameterResultObj.train_and_save_classification_models( ) self._result_setter.set_hyper_parameter_results( self._slug, resultArray) self._result_setter.set_metadata_parallel_coordinates( self._slug, { "ignoreList": pySparkHyperParameterResultObj.get_ignore_list(), "hideColumns": pySparkHyperParameterResultObj.get_hide_columns(), "metricColName": pySparkHyperParameterResultObj. get_comparison_metric_colname(), "columnOrder": pySparkHyperParameterResultObj.get_keep_columns() }) bestModel = pySparkHyperParameterResultObj.getBestModel() prediction = pySparkHyperParameterResultObj.getBestPrediction() else: if automl_enable: paramGrid = (ParamGridBuilder().addGrid( clf.smoothing, [1.0, 0.2]).build()) crossval = CrossValidator( estimator=estimator, estimatorParamMaps=paramGrid, evaluator=BinaryClassificationEvaluator() if levels == 2 else MulticlassClassificationEvaluator(), numFolds=3 if numFold is None else numFold) # use 3+ folds in practice cvnb = crossval.fit(trainingData) prediction = cvnb.transform(validationData) bestModel = cvnb.bestModel else: train_test_ratio = float( self._dataframe_context.get_train_test_split()) estimator = Pipeline(stages=[pipeline, labelIndexer, clf]) if algoSetting.is_hyperparameter_tuning_enabled(): modelFilepath = "/".join(model_filepath.split("/")[:-1]) pySparkHyperParameterResultObj = PySparkTrainTestResult( estimator, paramGrid, appType, modelFilepath, levels, evaluationMetricDict, trainingData, validationData, train_test_ratio, self._targetLevel, labelMapping, inverseLabelMapping, df) resultArray = pySparkHyperParameterResultObj.train_and_save_classification_models( ) self._result_setter.set_hyper_parameter_results( self._slug, resultArray) self._result_setter.set_metadata_parallel_coordinates( self._slug, { "ignoreList": pySparkHyperParameterResultObj.get_ignore_list(), "hideColumns": pySparkHyperParameterResultObj.get_hide_columns(), "metricColName": pySparkHyperParameterResultObj. 
get_comparison_metric_colname(), "columnOrder": pySparkHyperParameterResultObj.get_keep_columns() }) bestModel = pySparkHyperParameterResultObj.getBestModel() prediction = pySparkHyperParameterResultObj.getBestPrediction() else: tvs = TrainValidationSplit( estimator=estimator, estimatorParamMaps=paramGrid, evaluator=BinaryClassificationEvaluator() if levels == 2 else MulticlassClassificationEvaluator(), trainRatio=train_test_ratio) tvspnb = tvs.fit(trainingData) prediction = tvspnb.transform(validationData) bestModel = tvspnb.bestModel modelmanagement_ = { param[0].name: param[1] for param in bestModel.stages[2].extractParamMap().items() } MLUtils.save_pipeline_or_model(bestModel, model_filepath) predsAndLabels = prediction.select(['prediction', 'label']).rdd.map(tuple) # label_classes = prediction.select("label").distinct().collect() # label_classes = prediction.agg((F.collect_set('label').alias('label'))).first().asDict()['label'] #results = transformed.select(["prediction","label"]) # if len(label_classes) > 2: # metrics = MulticlassMetrics(predsAndLabels) # accuracy of the model # else: # metrics = BinaryClassificationMetrics(predsAndLabels) posLabel = inverseLabelMapping[self._targetLevel] metrics = MulticlassMetrics(predsAndLabels) trainingTime = time.time() - st f1_score = metrics.fMeasure(inverseLabelMapping[self._targetLevel], 1.0) precision = metrics.precision(inverseLabelMapping[self._targetLevel]) recall = metrics.recall(inverseLabelMapping[self._targetLevel]) accuracy = metrics.accuracy print(f1_score, precision, recall, accuracy) #gain chart implementation def cal_prob_eval(x): if len(x) == 1: if x == posLabel: return (float(x[1])) else: return (float(1 - x[1])) else: return (float(x[int(posLabel)])) column_name = 'probability' def y_prob_for_eval_udf(): return udf(lambda x: cal_prob_eval(x)) prediction = prediction.withColumn( "y_prob_for_eval", y_prob_for_eval_udf()(col(column_name))) try: pys_df = prediction.select( ['y_prob_for_eval', 'prediction', 'label']) gain_lift_ks_obj = GainLiftKS(pys_df, 'y_prob_for_eval', 'prediction', 'label', posLabel, self._spark) gain_lift_KS_dataframe = gain_lift_ks_obj.Run().toPandas() except: try: temp_df = pys_df.toPandas() gain_lift_ks_obj = GainLiftKS(temp_df, 'y_prob_for_eval', 'prediction', 'label', posLabel, self._spark) gain_lift_KS_dataframe = gain_lift_ks_obj.Rank_Ordering() except: print("gain chant failed") gain_lift_KS_dataframe = None #feature_importance = MLUtils.calculate_sparkml_feature_importance(df, bestModel.stages[-1], categorical_columns, numerical_columns) act_list = prediction.select('label').collect() actual = [int(row.label) for row in act_list] pred_list = prediction.select('prediction').collect() predicted = [int(row.prediction) for row in pred_list] prob_list = prediction.select('probability').collect() probability = [list(row.probability) for row in prob_list] # objs = {"trained_model":bestModel,"actual":prediction.select('label'),"predicted":prediction.select('prediction'), # "probability":prediction.select('probability'),"feature_importance":None, # "featureList":list(categorical_columns) + list(numerical_columns),"labelMapping":labelMapping} objs = { "trained_model": bestModel, "actual": actual, "predicted": predicted, "probability": probability, "feature_importance": None, "featureList": list(categorical_columns) + list(numerical_columns), "labelMapping": labelMapping } conf_mat_ar = metrics.confusionMatrix().toArray() print(conf_mat_ar) confusion_matrix = {} for i in range(len(conf_mat_ar)): 
confusion_matrix[labelMapping[i]] = {} for j, val in enumerate(conf_mat_ar[i]): confusion_matrix[labelMapping[i]][labelMapping[j]] = val print(confusion_matrix) # accuracy of the model '''ROC CURVE IMPLEMENTATION''' y_prob = probability y_score = predicted y_test = actual logLoss = log_loss(y_test, y_prob) if levels <= 2: positive_label_probs = [] for val in y_prob: positive_label_probs.append(val[int(posLabel)]) roc_auc = roc_auc_score(y_test, y_score) roc_data_dict = { "y_score": y_score, "y_test": y_test, "positive_label_probs": positive_label_probs, "y_prob": y_prob, "positive_label": posLabel } roc_dataframe = pd.DataFrame({ "y_score": y_score, "y_test": y_test, "positive_label_probs": positive_label_probs }) #roc_dataframe.to_csv("binary_roc_data.csv") fpr, tpr, thresholds = roc_curve(y_test, positive_label_probs, pos_label=posLabel) roc_df = pd.DataFrame({ "FPR": fpr, "TPR": tpr, "thresholds": thresholds }) roc_df["tpr-fpr"] = roc_df["TPR"] - roc_df["FPR"] optimal_index = np.argmax(np.array(roc_df["tpr-fpr"])) fpr_optimal_index = roc_df.loc[roc_df.index[optimal_index], "FPR"] tpr_optimal_index = roc_df.loc[roc_df.index[optimal_index], "TPR"] rounded_roc_df = roc_df.round({'FPR': 2, 'TPR': 4}) unique_fpr = rounded_roc_df["FPR"].unique() final_roc_df = rounded_roc_df.groupby("FPR", as_index=False)[["TPR" ]].mean() endgame_roc_df = final_roc_df.round({'FPR': 2, 'TPR': 3}) elif levels > 2: positive_label_probs = [] for val in y_prob: positive_label_probs.append(val[int(posLabel)]) y_test_roc_multi = [] for val in y_test: if val != posLabel: val = posLabel + 1 y_test_roc_multi.append(val) else: y_test_roc_multi.append(val) y_score_roc_multi = [] for val in y_score: if val != posLabel: val = posLabel + 1 y_score_roc_multi.append(val) else: y_score_roc_multi.append(val) roc_auc = roc_auc_score(y_test_roc_multi, y_score_roc_multi) fpr, tpr, thresholds = roc_curve(y_test_roc_multi, positive_label_probs, pos_label=posLabel) roc_df = pd.DataFrame({ "FPR": fpr, "TPR": tpr, "thresholds": thresholds }) roc_df["tpr-fpr"] = roc_df["TPR"] - roc_df["FPR"] optimal_index = np.argmax(np.array(roc_df["tpr-fpr"])) fpr_optimal_index = roc_df.loc[roc_df.index[optimal_index], "FPR"] tpr_optimal_index = roc_df.loc[roc_df.index[optimal_index], "TPR"] rounded_roc_df = roc_df.round({'FPR': 2, 'TPR': 4}) unique_fpr = rounded_roc_df["FPR"].unique() final_roc_df = rounded_roc_df.groupby("FPR", as_index=False)[["TPR" ]].mean() endgame_roc_df = final_roc_df.round({'FPR': 2, 'TPR': 3}) # Calculating prediction_split val_cnts = prediction.groupBy('label').count() val_cnts = map(lambda row: row.asDict(), val_cnts.collect()) prediction_split = {} total_nos = prediction.select('label').count() for item in val_cnts: print(labelMapping) classname = labelMapping[item['label']] prediction_split[classname] = round( item['count'] * 100 / float(total_nos), 2) if not algoSetting.is_hyperparameter_tuning_enabled(): modelName = "M" + "0" * (GLOBALSETTINGS.MODEL_NAME_MAX_LENGTH - 1) + "1" modelFilepathArr = model_filepath.split("/")[:-1] modelFilepathArr.append(modelName) bestModel.save("/".join(modelFilepathArr)) runtime = round((time.time() - st_global), 2) try: print(pmml_filepath) pmmlBuilder = PMMLBuilder(self._spark, trainingData, bestModel).putOption( clf, 'compact', True) pmmlBuilder.buildFile(pmml_filepath) pmmlfile = open(pmml_filepath, "r") pmmlText = pmmlfile.read() pmmlfile.close() self._result_setter.update_pmml_object({self._slug: pmmlText}) except Exception as e: print("PMML failed...", str(e)) pass cat_cols = 
list(set(categorical_columns) - {result_column}) self._model_summary = MLModelSummary() self._model_summary.set_algorithm_name("Naive Bayes") self._model_summary.set_algorithm_display_name("Naive Bayes") self._model_summary.set_slug(self._slug) self._model_summary.set_training_time(runtime) self._model_summary.set_confusion_matrix(confusion_matrix) # self._model_summary.set_feature_importance(objs["feature_importance"]) self._model_summary.set_feature_list(objs["featureList"]) self._model_summary.set_model_accuracy(accuracy) self._model_summary.set_training_time(round((time.time() - st), 2)) self._model_summary.set_precision_recall_stats([precision, recall]) self._model_summary.set_model_precision(precision) self._model_summary.set_model_recall(recall) self._model_summary.set_model_F1_score(f1_score) self._model_summary.set_model_log_loss(logLoss) self._model_summary.set_gain_lift_KS_data(gain_lift_KS_dataframe) self._model_summary.set_AUC_score(roc_auc) self._model_summary.set_target_variable(result_column) self._model_summary.set_prediction_split(prediction_split) self._model_summary.set_validation_method("KFold") self._model_summary.set_level_map_dict(objs["labelMapping"]) # self._model_summary.set_model_features(list(set(x_train.columns)-set([result_column]))) self._model_summary.set_model_features(objs["featureList"]) self._model_summary.set_level_counts( self._metaParser.get_unique_level_dict( list(set(categorical_columns)) + [result_column])) #self._model_summary.set_num_trees(objs['trained_model'].getNumTrees) self._model_summary.set_num_rules(300) self._model_summary.set_target_level(self._targetLevel) if not algoSetting.is_hyperparameter_tuning_enabled(): modelDropDownObj = { "name": self._model_summary.get_algorithm_name(), "evaluationMetricValue": accuracy, "evaluationMetricName": "accuracy", "slug": self._model_summary.get_slug(), "Model Id": modelName } modelSummaryJson = { "dropdown": modelDropDownObj, "levelcount": self._model_summary.get_level_counts(), "modelFeatureList": self._model_summary.get_feature_list(), "levelMapping": self._model_summary.get_level_map_dict(), "slug": self._model_summary.get_slug(), "name": self._model_summary.get_algorithm_name() } else: modelDropDownObj = { "name": self._model_summary.get_algorithm_name(), "evaluationMetricValue": accuracy, "evaluationMetricName": "accuracy", "slug": self._model_summary.get_slug(), "Model Id": resultArray[0]["Model Id"] } modelSummaryJson = { "dropdown": modelDropDownObj, "levelcount": self._model_summary.get_level_counts(), "modelFeatureList": self._model_summary.get_feature_list(), "levelMapping": self._model_summary.get_level_map_dict(), "slug": self._model_summary.get_slug(), "name": self._model_summary.get_algorithm_name() } self._model_management = MLModelSummary() print(modelmanagement_) self._model_management.set_job_type( self._dataframe_context.get_job_name()) #Project name self._model_management.set_training_status( data="completed") # training status self._model_management.set_target_level( self._targetLevel) # target column value self._model_management.set_training_time(runtime) # run time self._model_management.set_model_accuracy(round(metrics.accuracy, 2)) # self._model_management.set_model_accuracy(round(metrics.accuracy_score(objs["actual"], objs["predicted"]),2))#accuracy self._model_management.set_algorithm_name( "NaiveBayes") #algorithm name self._model_management.set_validation_method( str(validationDict["displayName"]) + "(" + str(validationDict["value"]) + ")") #validation method 
self._model_management.set_target_variable(result_column)  # target column name
self._model_management.set_creation_date(data=str(
    datetime.now().strftime('%b %d ,%Y %H:%M ')))  # creation date
self._model_management.set_datasetName(self._datasetName)
self._model_management.set_model_type(data='classification')
self._model_management.set_var_smoothing(data=int(modelmanagement_['smoothing']))
# self._model_management.set_no_of_independent_variables(df)  # no of independent variables

modelManagementSummaryJson = [
    ["Project Name", self._model_management.get_job_type()],
    ["Algorithm", self._model_management.get_algorithm_name()],
    ["Training Status", self._model_management.get_training_status()],
    ["Accuracy", self._model_management.get_model_accuracy()],
    ["RunTime", self._model_management.get_training_time()],
    # ["Owner", None],
    ["Created On", self._model_management.get_creation_date()]
]
modelManagementModelSettingsJson = [
    ["Training Dataset", self._model_management.get_datasetName()],
    ["Target Column", self._model_management.get_target_variable()],
    ["Target Column Value", self._model_management.get_target_level()],
    ["Algorithm", self._model_management.get_algorithm_name()],
    ["Model Validation", self._model_management.get_validation_method()],
    ["Model Type", self._model_management.get_model_type()],
    ["Smoothing", self._model_management.get_var_smoothing()],
    # ["priors", self._model_management.get_priors()]
    # ["var_smoothing", self._model_management.get_var_smoothing()]
]

nbOverviewCards = [
    json.loads(CommonUtils.convert_python_object_to_json(cardObj))
    for cardObj in MLUtils.create_model_management_card_overview(
        self._model_management, modelManagementSummaryJson,
        modelManagementModelSettingsJson)
]
nbPerformanceCards = [
    json.loads(CommonUtils.convert_python_object_to_json(cardObj))
    for cardObj in MLUtils.create_model_management_cards(
        self._model_summary, endgame_roc_df)
]
nbDeploymentCards = [
    json.loads(CommonUtils.convert_python_object_to_json(cardObj))
    for cardObj in MLUtils.create_model_management_deploy_empty_card()
]
nbCards = [
    json.loads(CommonUtils.convert_python_object_to_json(cardObj))
    for cardObj in MLUtils.create_model_summary_cards(self._model_summary)
]

NB_Overview_Node = NarrativesTree()
NB_Overview_Node.set_name("Overview")
NB_Performance_Node = NarrativesTree()
NB_Performance_Node.set_name("Performance")
NB_Deployment_Node = NarrativesTree()
NB_Deployment_Node.set_name("Deployment")
for card in nbOverviewCards:
    NB_Overview_Node.add_a_card(card)
for card in nbPerformanceCards:
    NB_Performance_Node.add_a_card(card)
for card in nbDeploymentCards:
    NB_Deployment_Node.add_a_card(card)
for card in nbCards:
    self._prediction_narrative.add_a_card(card)

self._result_setter.set_model_summary({
    "naivebayes": json.loads(
        CommonUtils.convert_python_object_to_json(self._model_summary))
})
self._result_setter.set_naive_bayes_model_summary(modelSummaryJson)
self._result_setter.set_nb_cards(nbCards)
self._result_setter.set_nb_nodes(
    [NB_Overview_Node, NB_Performance_Node, NB_Deployment_Node])
self._result_setter.set_nb_fail_card({
    "Algorithm_Name": "Naive Bayes",
    "success": "True"
})

CommonUtils.create_update_and_save_progress_message(
    self._dataframe_context,
    self._scriptWeightDict,
    self._scriptStages,
    self._slug,
    "completion",
    "info",
    display=True,
    emptyBin=False,
    customMsg=None,
    weightKey="total")
print("\n\n")
def hotmodel(self, sc, sets, movieRDD):
    ''' training a super hot model '''
    als = ALS(coldStartStrategy="drop")
    param_grid = ParamGridBuilder() \
        .addGrid(als.rank, [6, 8]) \
        .addGrid(als.maxIter, [8, 10, 12]) \
        .build()
    evaluator = RegressionEvaluator(
        metricName="mse", labelCol="rating", predictionCol="prediction")
    tvs = TrainValidationSplit(
        estimator=als,
        estimatorParamMaps=param_grid,
        evaluator=evaluator,
    )
    model = tvs.fit(sets['training'])
    ## should we save the model?
    best_rank = model.bestModel.rank
    best_iterations = model.bestModel._java_obj.parent().getMaxIter()
    print('hotmodel part 1')

    prediction = model.transform(sets['test'])
    prediction.alias('p')\
        .join(movieRDD.alias('m'), col('p.item') == col('m.item'))\
        .select([col('p.user'), col('m.title'), col('p.prediction'), col('p.rating')])
    mse = evaluator.evaluate(prediction)
    print("MSE = {}".format(mse))

    ''' hot model's tinder date '''
    rating59169 = [
        (118661, 9),    # Avengers
        (371746, 9),    # Iron Man 2008
        (94625, 9),     # Akira
        (1563738, 2),   # One day 2011
        (800369, 8),    # Thor
        (1981115, 9),   # Thor: The Dark World
        (3501632, 9),   # Thor: Ragnarok
        (120338, 3),    # Titanic
        (98635, 2),     # When Harry Met Sally
        (125439, 3),    # Notting Hill
        (332280, 1)     # The Notebook
    ]
    # ratingRDD, ratingRDD2 and spark are not parameters of this method; they are
    # expected to be available from the enclosing scope.
    user59169 = ratingRDD.groupBy().max('user').first()['max(user)'] + 1
    user59169DF = spark.createDataFrame(
        [Row(user=user59169, item=r[0], rating=r[1]) for r in rating59169])
    user59169DF = user59169DF.select('user', 'item', 'rating')
    # user59169DF = sc.parallelize(user59169DF)
    new_model = ALS(rank=best_rank, maxIter=best_iterations, coldStartStrategy="drop")\
        .fit(ratingRDD2)
    unseen_movies = movieRDD.alias('m')\
        .join(user59169DF.alias('r'), col('m.item') == col('r.item'), how='left_anti')\
        .select('item')
    unseen_movies_user = unseen_movies.withColumn("user", lit(user59169))
    print('hot model part 2')

    spark.conf.set("spark.sql.crossJoin.enabled", "true")
    unseen_ratings = new_model.transform(unseen_movies_user)
    unseen_ratings_titles = unseen_ratings.alias('r')\
        .join(movieRDD.alias('m'), col('r.item') == col('m.item'))\
        .select(['user', 'title', 'prediction'])
    ratings_per_movie = ratingRDD.groupBy('item').count()
    enough_ratings = ratings_per_movie.filter(col('count') < 500)
    enough_ratings.show()
    training_10 = unseen_ratings.alias('r')\
        .join(enough_ratings.alias('e'), col('r.item') == col('e.item'), how='left_anti')\
        .select(['item', 'user', 'prediction']).orderBy(col('prediction').desc())
    training_10.alias('t').join(movieRDD.alias('m'), col('t.item') == col('m.item'))\
        .select(['user', 'title', 'prediction'])\
        .orderBy(col('prediction').desc()).show(10, truncate=False)
    # spark.stop()
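# Regarding the "should we save the model?" comment inside hotmodel above: a minimal
# sketch, not part of the original code. TrainValidationSplitModel is MLWritable, so the
# fitted `model` could be persisted right after tvs.fit(...) and reloaded later; the path
# below is a placeholder.
from pyspark.ml.tuning import TrainValidationSplitModel

def save_and_reload_tvs_model(model, path="/tmp/als_tvs_model"):
    # overwrite() avoids failures if the placeholder path already exists
    model.write().overwrite().save(path)
    return TrainValidationSplitModel.load(path)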
        predictions[park] = models[park].transform(test_ds[park])
else:
    for park in park_data_with_date_dict:
        # standardScaler = StandardScaler(inputCol="unscaled_features", outputCol="features")
        # pred4data = standardScaler.transform(pred4data)
        lr = LinearRegression(maxIter=10)
        # elastic net = a form of regularization that mixes LASSO and RIDGE
        paramGrid = ParamGridBuilder() \
            .addGrid(lr.regParam, [0.1, 0.01]) \
            .addGrid(lr.fitIntercept, [False, True]) \
            .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0]) \
            .build()
        tvs = TrainValidationSplit(estimator=lr,
                                   estimatorParamMaps=paramGrid,
                                   evaluator=RegressionEvaluator(),
                                   # 80% of the data will be used for training, 20% for validation.
                                   trainRatio=0.8)
        # Run TrainValidationSplit, and choose the best set of parameters.
        models[park] = tvs.fit(train_ds[park])
        # Make predictions on test data. model is the model with the combination of
        # parameters that performed best.
        predictions[park] = models[park].transform(test_ds[park])

saveModels(models, "LinearRegressionModel_", "linear")

# COMMAND ----------

# NOTE: the prediction is STATIC, so even though the plot is drawn over time, it is only
# meant as a simple visualization (a true scatterplot of variables vs. predicted/actual
# values would require dimensionality reduction, which seems like unnecessary effort here).
def plotPredictions(code):
    fig, axes = plt.subplots()
    axes.plot([p.label for p in predictions[code].collect()], color="blue")
    axes.plot([p.prediction for p in predictions[code].collect()], color="red")
    # fig.autofmt_xdate()
    display(fig)
    .addGrid(mlpc.layers, generateLayersCombination(hidden_layers=[1, 2, 5],
                                                    input_layer=[5],
                                                    output_layer=[2])) \
    .addGrid(mlpc.stepSize, [0.5, 0.2, 0.1, 0.05, 0.02]) \
    .build()

print("Calculating best model...")

# A TrainValidationSplit requires an Estimator, a set of Estimator ParamMaps, and an Evaluator.
start = time.time()
tvs = TrainValidationSplit(
    estimator=mlpc,
    estimatorParamMaps=paramGrid,
    evaluator=RegressionEvaluator(),
    # 80% of the data will be used for training, 20% for validation.
    trainRatio=0.8)

# Run TrainValidationSplit, and choose the best set of parameters.
model = tvs.fit(train_data)

# Save the parameters of the best model into variables
bestmodel = model.bestModel
layers = list(bestmodel._java_obj.parent().getLayers())
iters = bestmodel._java_obj.parent().getMaxIter()
# solver = bestmodel._java_obj.parent().getSolver()
# tol = bestmodel._java_obj.parent().getTol()
lr = bestmodel._java_obj.parent().getStepSize()
end = time.time()

print("---------------------- Best model info ----------------------")
print("Max epochs : " + str(iters))
print("Learning rate : " + str(lr))
print("Layers : " + str(layers))
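# Follow-up sketch (not in the original script): score the selected model on a held-out
# DataFrame with the same evaluator type used for tuning, and report the tuning time.
# `test_data` is an assumption; any DataFrame with the same schema as train_data works.
from pyspark.ml.evaluation import RegressionEvaluator

test_predictions = bestmodel.transform(test_data)
test_score = RegressionEvaluator().evaluate(test_predictions)
print("Score on held-out data : " + str(test_score))
print("Tuning time (s) : " + str(end - start))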
    evaluator=evaluator,
    estimatorParamMaps=grid)

# COMMAND ----------

import mlflow
# from mlflow import spark
# import mlflow.mleap  # does this work with Spark 3.0?
# import mlflow.pyfunc
# import mleap.pyspark

# COMMAND ----------

with mlflow.start_run(run_name='TripDuration_lr'):
    tunedModel = tuning.fit(trainingData)
    # We log a custom tag, a custom metric, and the best model to the main run.
    mlflow.set_tag('Citibike_training', 'Data_team')
    rmse = evaluator.evaluate(tunedModel.transform(testData), {evaluator.metricName: "rmse"})
    r2 = evaluator.evaluate(tunedModel.transform(testData), {evaluator.metricName: "r2"})
    mae = evaluator.evaluate(tunedModel.transform(testData), {evaluator.metricName: "mae"})
    mse = evaluator.evaluate(tunedModel.transform(testData), {evaluator.metricName: "mse"})
    mlflow.log_metric('rmse', rmse)
    mlflow.log_metric('r2', r2)
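    # Sketch, not in the original notebook: log the remaining metrics and the selected
    # model to the same run. Assumes the mlflow Spark flavor is installed and that the
    # best model is MLWritable (true for the built-in Spark ML regressors); the
    # artifact_path name is arbitrary.
    import mlflow.spark
    mlflow.log_metric('mae', mae)
    mlflow.log_metric('mse', mse)
    mlflow.spark.log_model(tunedModel.bestModel, artifact_path="best_model")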
# In[27]:

paramGrid = ParamGridBuilder().addGrid(rf.numTrees, [50, 100]).addGrid(
    rf.maxDepth, [30, 15]).build()

# In[28]:

tvs = TrainValidationSplit(estimator=rf,
                           estimatorParamMaps=paramGrid,
                           evaluator=bce,
                           trainRatio=0.8)

# In[ ]:

model = tvs.fit(tr_data)

# In[ ]:

prediction = model.transform(test)

# In[5]:

# convert to .py file. Now let's submit to queue

# HW!
# Items to Work on: 3 Options:
#
# 1. ML
# * make a logistic regression model
# * use cross-validation to search a good space of logistic regression hyperparams
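# A minimal sketch for HW option 1 above (not from the original notebook): tune a logistic
# regression with CrossValidator. The DataFrame `tr_data` and its "features"/"label"
# columns are assumptions carried over from the cells above.
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

log_reg = LogisticRegression(featuresCol="features", labelCol="label")
log_reg_grid = ParamGridBuilder() \
    .addGrid(log_reg.regParam, [0.01, 0.1, 1.0]) \
    .addGrid(log_reg.elasticNetParam, [0.0, 0.5, 1.0]) \
    .build()
cv = CrossValidator(estimator=log_reg,
                    estimatorParamMaps=log_reg_grid,
                    evaluator=BinaryClassificationEvaluator(),
                    numFolds=3)
# cvModel = cv.fit(tr_data)
# cvModel.transform(test).select("label", "prediction").show(5)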
def test_copy(self):
    dataset = self.spark.createDataFrame([
        (10, 10.0), (50, 50.0), (100, 100.0), (500, 500.0)] * 10,
        ["feature", "label"])

    iee = InducedErrorEstimator()
    evaluator = RegressionEvaluator(metricName="r2")

    grid = ParamGridBuilder() \
        .addGrid(iee.inducedError, [100.0, 0.0, 10000.0]) \
        .build()
    tvs = TrainValidationSplit(
        estimator=iee,
        estimatorParamMaps=grid,
        evaluator=evaluator,
        collectSubModels=True
    )
    tvsModel = tvs.fit(dataset)
    tvsCopied = tvs.copy()
    tvsModelCopied = tvsModel.copy()

    for param in [
        lambda x: x.getCollectSubModels(),
        lambda x: x.getParallelism(),
        lambda x: x.getSeed(),
        lambda x: x.getTrainRatio(),
    ]:
        self.assertEqual(param(tvs), param(tvsCopied))

    for param in [
        lambda x: x.getSeed(),
        lambda x: x.getTrainRatio(),
    ]:
        self.assertEqual(param(tvsModel), param(tvsModelCopied))

    self.assertEqual(tvs.getEstimator().uid, tvsCopied.getEstimator().uid,
                     "Copied TrainValidationSplit has the same uid of Estimator")
    self.assertEqual(tvsModel.bestModel.uid, tvsModelCopied.bestModel.uid)
    self.assertEqual(len(tvsModel.validationMetrics),
                     len(tvsModelCopied.validationMetrics),
                     "Copied validationMetrics has the same size of the original")
    for index in range(len(tvsModel.validationMetrics)):
        self.assertEqual(tvsModel.validationMetrics[index],
                         tvsModelCopied.validationMetrics[index])
    tvsModel.validationMetrics[0] = 'foo'
    self.assertNotEqual(
        tvsModelCopied.validationMetrics[0], 'foo',
        "Changing the original validationMetrics should not affect the copied model"
    )
    tvsModel.subModels[0].getInducedError = lambda: 'foo'
    self.assertNotEqual(
        tvsModelCopied.subModels[0].getInducedError(), 'foo',
        "Changing the original subModels should not affect the copied model"
    )
data = spark.read.format("libsvm")\ .load("data/mllib/sample_linear_regression_data.txt") train, test = data.randomSplit([0.7, 0.3]) lr = LinearRegression(maxIter=10, regParam=0.1) # We use a ParamGridBuilder to construct a grid of parameters to search over. # TrainValidationSplit will try all combinations of values and determine best model using # the evaluator. paramGrid = ParamGridBuilder()\ .addGrid(lr.regParam, [0.1, 0.01]) \ .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])\ .build() # In this case the estimator is simply the linear regression. # A TrainValidationSplit requires an Estimator, a set of Estimator ParamMaps, and an Evaluator. tvs = TrainValidationSplit(estimator=lr, estimatorParamMaps=paramGrid, evaluator=RegressionEvaluator(), # 80% of the data will be used for training, 20% for validation. trainRatio=0.8) # Run TrainValidationSplit, and choose the best set of parameters. model = tvs.fit(train) # Make predictions on test data. model is the model with combination of parameters # that performed best. prediction = model.transform(test) for row in prediction.take(5): print(row) # $example off$ spark.stop()