def TVS(estimator, paramGrid, dataTrain, dataTest):
    """Fit a TrainValidationSplit over paramGrid and score the test set.

    Returns (predictions, model): `predictions` is dataTest transformed by
    the fitted model, `model` is the TrainValidationSplitModel itself.
    """
    # 80% of dataTrain is used for fitting, 20% for validation.
    splitter = TrainValidationSplit(
        estimator=estimator,
        estimatorParamMaps=paramGrid,
        evaluator=BinaryClassificationEvaluator(),
        trainRatio=0.8,
    )
    # fit() tries every grid point and keeps the best combination.
    fitted = splitter.fit(dataTrain)
    # Score the held-out test set with the selected model.
    return fitted.transform(dataTest), fitted
def process(spark, train_data, test_data):
    """Train a GBT CTR model with TrainValidationSplit and save the best model.

    train_data -- path to the training parquet file
    test_data  -- path to the evaluation parquet file (currently unused)

    Usage: python PySparkMLFit.py train.parquet validate.parquet
    """
    df = spark.read.parquet(train_data)

    # Split ONCE, up front, so held-out rows are never seen during model
    # selection. (The original fitted the TVS on the full dataset and only
    # then re-split it, leaking every "test" row into training.)
    fit_df, holdout_df = df.randomSplit([0.8, 0.2], seed=42)

    # First 7 columns are the features; "ctr" is the regression target.
    feature = VectorAssembler(inputCols=df.columns[:7], outputCol="features")
    gbt = GBTRegressor(labelCol="ctr", featuresCol="features", maxIter=10)
    pipeline = Pipeline(stages=[feature, gbt])

    paramGrid = ParamGridBuilder() \
        .addGrid(gbt.maxDepth, [2, 3, 4, 5, 6, 7, 8, 9]) \
        .addGrid(gbt.maxBins, [10, 16, 20, 24, 32, 36]) \
        .build()

    # A TrainValidationSplit requires an Estimator, ParamMaps and an
    # Evaluator; within fit_df, 80% trains and 20% validates each candidate.
    tvs = TrainValidationSplit(
        estimator=pipeline,
        estimatorParamMaps=paramGrid,
        evaluator=RegressionEvaluator(labelCol="ctr",
                                      predictionCol="prediction",
                                      metricName="rmse"),
        trainRatio=0.8)
    model = tvs.fit(fit_df)

    # Report RMSE on rows that played no part in selection.
    prediction = model.transform(holdout_df)
    evaluator = RegressionEvaluator(labelCol="ctr", predictionCol="prediction",
                                    metricName="rmse")
    rmse = evaluator.evaluate(prediction)
    print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

    # Persist the winning pipeline.
    model.bestModel.write().overwrite().save("model")
def model_training(training_data, param_info):
    """Tune the estimator produced by model_setting and return the winner.

    training_data -- DataFrame used for the 80/20 train/validation split
    param_info    -- settings handed to model_setting to build the grid
    """
    # Build the parameter grid and the estimator from the settings.
    param_grid, rf = model_setting(param_info)
    # Score candidates on the hard prediction column.
    evaluator = BinaryClassificationEvaluator(rawPredictionCol='prediction')
    # Hold out 20% of the rows for hyper-parameter validation.
    tuner = TrainValidationSplit(estimator=rf,
                                 estimatorParamMaps=param_grid,
                                 evaluator=evaluator,
                                 trainRatio=0.8)
    fitted = tuner.fit(dataset=training_data)
    # Only the best candidate model is returned.
    return fitted.bestModel
def get_validation(by='cv'):
    """Return a model-selection wrapper around the module-level pipeline.

    by -- 'cv' for a 10-fold CrossValidator, 'tvs' for an 80/20
          TrainValidationSplit; anything else prints a warning and
          returns None.
    """
    # FIX: the original compared strings with `is`, which relies on interning
    # and is not guaranteed; `==` is the correct test.
    if by == 'cv':
        return CrossValidator(estimator=pipeline,
                              estimatorParamMaps=paramGrid,
                              evaluator=evaluator,
                              numFolds=10)
    elif by == 'tvs':
        # FIX: the keyword is trainRatio; the original passed `trainratio`,
        # which raises a TypeError at call time.
        return TrainValidationSplit(estimator=pipeline,
                                    estimatorParamMaps=paramGrid,
                                    evaluator=evaluator,
                                    trainRatio=0.8)
    else:
        print("lütfen tvsden ve cvden birini seçiniz")
        return None
def kNN_with_k_fixed(df, k):
    """Fit a spark-knn classifier at one fixed k and print its score on df."""
    knn_est = KNNClassifier(featuresCol='features',
                            labelCol='label',
                            topTreeSize=1000,
                            topTreeLeafSize=10,
                            subTreeLeafSize=30)
    # One-point grid: TrainValidationSplit is used purely as a fit harness.
    single_k = ParamGridBuilder().addGrid(knn_est.k, [k]).build()
    scorer = BinaryClassificationEvaluator(rawPredictionCol='prediction',
                                           labelCol='label')
    harness = TrainValidationSplit(estimator=knn_est,
                                   estimatorParamMaps=single_k,
                                   evaluator=scorer,
                                   trainRatio=0.6666)
    fitted = harness.fit(df)
    # NOTE(review): evaluated on the same df it was fitted on.
    score = scorer.evaluate(fitted.transform(df))
    print('kNN:k', k, score)
def _run_test_save_load_trained_model(self, LogisticRegressionCls, LogisticRegressionModelCls):
    """Round-trip the best LR model and the fitted TVS model through save/load.

    This tests saving and loading the trained model only.
    Save/load for TrainValidationSplit will be added later: SPARK-13786
    """
    temp_path = tempfile.mkdtemp()
    rows = [
        (Vectors.dense([0.0]), 0.0),
        (Vectors.dense([0.4]), 1.0),
        (Vectors.dense([0.5]), 0.0),
        (Vectors.dense([0.6]), 1.0),
        (Vectors.dense([1.0]), 1.0),
    ]
    dataset = self.spark.createDataFrame(rows * 10, ["features", "label"])

    lr = LogisticRegressionCls()
    grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()
    evaluator = BinaryClassificationEvaluator()
    tvs = TrainValidationSplit(
        estimator=lr,
        estimatorParamMaps=grid,
        evaluator=evaluator,
        collectSubModels=True,
        seed=42,
    )
    tvsModel = tvs.fit(dataset)
    lrModel = tvsModel.bestModel

    # Best-model round-trip: identity and coefficients must survive.
    lrModelPath = temp_path + "/lrModel"
    lrModel.save(lrModelPath)
    loadedLrModel = LogisticRegressionModelCls.load(lrModelPath)
    self.assertEqual(loadedLrModel.uid, lrModel.uid)
    self.assertEqual(loadedLrModel.intercept, lrModel.intercept)

    # TVS-model round-trip: params must survive and remain set.
    tvsModelPath = temp_path + "/tvsModel"
    tvsModel.save(tvsModelPath)
    loadedTvsModel = TrainValidationSplitModel.load(tvsModelPath)
    for getter in (lambda x: x.getSeed(), lambda x: x.getTrainRatio()):
        self.assertEqual(getter(tvsModel), getter(loadedTvsModel))
    self.assertTrue(
        all(loadedTvsModel.isSet(param) for param in loadedTvsModel.params))
def main(name='Loan_model'):
    """End-to-end loan-model pipeline: load, weight, tune, evaluate, export.

    Returns (model, metrics): the fitted TrainValidationSplit model and the
    test-set metrics DataFrame produced by evaluate_model.
    """
    logger = logging.getLogger(__name__)
    spark = SparkSession.builder.appName(f'{name}').getOrCreate()

    data = spark.read.csv(path, inferSchema=True, header=True)
    logger.info(f'Vectorising Features')
    data = get_features(data, spark, target)
    logger.info(f'Obtaining Weight balance')
    data = data.withColumn('weights', weight_balance(data, col('label')))

    logger.info(f'Create train and testing split 80-20')
    train, test = data.randomSplit([.8, .2], seed=1234)

    logger.info(f'Training and Optimising model')
    # Columns are positional after get_features: features, label, weights.
    lr = LogisticRegression(
        featuresCol=data.columns[0],
        labelCol=data.columns[1],
        weightCol=data.columns[2],
        maxIter=100,
    )
    pipeline = Pipeline(stages=[lr])
    paramGrid = (ParamGridBuilder()
                 .addGrid(lr.regParam, [0.001, 0.01, 0.1, 1])
                 .addGrid(lr.elasticNetParam, [0.001, 0.01, 0.1, 1])
                 .build())
    # Area under the PR curve drives model selection on the 80/20 split.
    model_tune = TrainValidationSplit(
        estimator=pipeline,
        estimatorParamMaps=paramGrid,
        evaluator=BinaryClassificationEvaluator(metricName='areaUnderPR'),
        trainRatio=0.8)
    model = model_tune.fit(train)

    metrics = evaluate_model(model, test, spark)
    model.bestModel.write().overwrite().save(output_path)
    metrics.toPandas().to_csv(f'{output_path}testset_metrics.csv')
    logger.info(f'Model and metrics exported to {output_path}')
    return model, metrics
def RForest_with_maxFeatures_maxDepth_fixed(df, max_depth, max_features):
    """Fit a tree at fixed maxDepth/maxBins and print its score on df.

    NOTE(review): despite the name, this trains a DecisionTreeClassifier,
    not a random forest, and `max_features` is fed to maxBins — confirm
    this is intentional.
    """
    tree = DecisionTreeClassifier(featuresCol='features',
                                  labelCol='label',
                                  impurity='gini',
                                  maxMemoryInMB=1024)
    # One-point grids: TrainValidationSplit acts only as a fit harness.
    grid = (ParamGridBuilder()
            .addGrid(tree.maxDepth, [max_depth])
            .addGrid(tree.maxBins, [max_features])
            .build())
    scorer = BinaryClassificationEvaluator(rawPredictionCol='prediction',
                                           labelCol='label')
    harness = TrainValidationSplit(estimator=tree,
                                   estimatorParamMaps=grid,
                                   evaluator=scorer,
                                   trainRatio=0.6666)
    fitted = harness.fit(df)
    # NOTE(review): evaluated on the same df used for fitting.
    score = scorer.evaluate(fitted.transform(df))
    print('RForest:maxDepth', max_depth, ':maxBins', max_features, ':result',
          score)
def validate(estimator, train, grid):
    """Select hyper-parameters for `estimator` from `grid`, using 20% of
    `train` as the validation partition. AUC is the comparison metric.

    Prints each grid point next to the validation metric it achieved.
    """
    selector = TrainValidationSplit(
        estimator=estimator,
        estimatorParamMaps=grid,
        evaluator=BinaryClassificationEvaluator(labelCol="class"),
        trainRatio=0.8,
        seed=89)
    model = selector.fit(train)
    # One line per candidate: its parameters, the metric name, its score.
    for idx, param_map in enumerate(model.getEstimatorParamMaps()):
        described = ["%s: %s" % (p.name, str(v)) for p, v in param_map.items()]
        print(described, model.getEvaluator().getMetricName(),
              model.validationMetrics[idx])
def test_parallel_evaluation(self):
    """Serial and parallel TVS fits must yield identical validation metrics."""
    rows = [(Vectors.dense([0.0]), 0.0),
            (Vectors.dense([0.4]), 1.0),
            (Vectors.dense([0.5]), 0.0),
            (Vectors.dense([0.6]), 1.0),
            (Vectors.dense([1.0]), 1.0)]
    dataset = self.spark.createDataFrame(rows * 10, ["features", "label"])
    lr = LogisticRegression()
    grid = ParamGridBuilder().addGrid(lr.maxIter, [5, 6]).build()
    tvs = TrainValidationSplit(estimator=lr,
                               estimatorParamMaps=grid,
                               evaluator=BinaryClassificationEvaluator())
    # Fit once with one worker, then again with two.
    tvs.setParallelism(1)
    serialModel = tvs.fit(dataset)
    tvs.setParallelism(2)
    parallelModel = tvs.fit(dataset)
    self.assertEqual(serialModel.validationMetrics,
                     parallelModel.validationMetrics)
def train(data: DataFrame, esti: Estimator, eid: str, param_grid_builder: Callable[[Estimator], list]) -> PipResult:
    """Tune `esti` over the grid from `param_grid_builder`, report test RMSE,
    persist the best model, and return a PipResult ("ERROR" status on failure).
    """
    try:
        print(f"--- train {eid}")
        # Hold out 10% of the rows as a final test set.
        df_train, df_test = data.randomSplit([0.9, 0.1], seed=12345)

        # Candidate parameter maps; TrainValidationSplit tries every
        # combination and keeps the best according to the evaluator.
        params = param_grid_builder(esti)
        print(f"--- params")
        pprint(params)

        tvs = TrainValidationSplit(
            estimator=esti,
            estimatorParamMaps=params,
            evaluator=RegressionEvaluator(),
            # 80% of the data will be used for training, 20% for validation.
            trainRatio=0.8)
        trained_models: TrainValidationSplitModel = tvs.fit(df_train)

        # Score the best model on the untouched test rows.
        predictions = (trained_models.transform(df_test)
                       .select("features", "label", "prediction"))
        rmse = RegressionEvaluator(labelCol="label",
                                   predictionCol="prediction",
                                   metricName="rmse").evaluate(predictions)
        print(f"-- Root Mean Squared Error (RMSE) on test data = {rmse}")

        fnam = cm.fnam(eid)
        hlp.save_model(trained_models.bestModel, hlp.get_datadir(), fnam)
        print(f"-- saved model to {fnam}")
        return PipResult(rmse, trained_models.bestModel, "OK")
    except Exception:
        # Boundary-level catch: log the traceback, signal failure upstream.
        print(tb.format_exc())
        return PipResult(0.0, None, "ERROR")
def func2():
    """Pipeline-based ML flow: clean the data, train a decision tree, then
    grid-search its hyper-parameters with TrainValidationSplit.
    """
    row_df = sqlContext.read.format("csv").option("header", True).option("delimiter", "\t").load(Path + "train.tsv")
    # Keep url/alchemy_category as-is; coerce the remaining columns
    # (with '?' placeholders replaced) to double.
    df = row_df.select(["url", "alchemy_category"]
                       + [replace_question(col(column)).cast("double").alias(column)
                          for column in row_df.columns[4:]])
    train_df, test_df = df.randomSplit([0.7, 0.3])

    # Build the pipeline: index the category string, one-hot encode it,
    # assemble the feature vector, then classify.
    stringIndexer = StringIndexer(inputCol="alchemy_category",
                                  outputCol="alchemy_category_index")
    encoder = OneHotEncoder(dropLast=False,
                            inputCol="alchemy_category_index",
                            outputCol="alchemy_category_indexVec")
    assemblerInputs = ["alchemy_category_indexVec"] + row_df.columns[4:-1]
    assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
    dt = DecisionTreeClassifier(labelCol="label", featuresCol="features",
                                impurity="gini", maxDepth=10, maxBins=14)
    pipeline = Pipeline(stages=[stringIndexer, encoder, assembler, dt])
    print(pipeline.getStages())

    # Fit the pipeline and inspect the tree model produced at stage 3.
    pipelineModel = pipeline.fit(train_df)
    print(pipelineModel.stages[3])
    print(pipelineModel.stages[3].toDebugString)

    # Predict on the held-out split.
    predicted = pipelineModel.transform(test_df)
    print(predicted.columns)
    predicted.select("url", "features", "rawprediction", "probability", "label", "prediction").show(5)
    predicted.select("probability", "prediction").take(5)

    # Evaluate with area under the ROC curve.
    evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction",
                                              labelCol="label",
                                              metricName="areaUnderROC")
    auc = evaluator.evaluate(predicted)
    print("auc:", auc)

    # Grid-search the tree's hyper-parameters.
    # FIX: the impurity value was misspelled "entory"; DecisionTreeClassifier
    # only accepts "gini"/"entropy", so the original failed at fit time.
    paramGrid = (ParamGridBuilder()
                 .addGrid(dt.impurity, ["gini", "entropy"])
                 .addGrid(dt.maxDepth, [5, 10, 15])
                 .addGrid(dt.maxBins, [10, 15, 20])
                 .build())
    # trainRatio=0.8: the training data is further divided 8:2 into
    # train/validation for model selection.
    tvs = TrainValidationSplit(estimator=dt, evaluator=evaluator,
                               estimatorParamMaps=paramGrid, trainRatio=0.8)
    tvs_pipeline = Pipeline(stages=[stringIndexer, encoder, assembler, tvs])
    tvs_pipelineModel = tvs_pipeline.fit(train_df)
    bestModel = tvs_pipelineModel.stages[3].bestModel
    print("bestModel", bestModel)
    predictions = tvs_pipelineModel.transform(test_df)
    auc2 = evaluator.evaluate(predictions)
    print("auc2:", auc2)
def _run_test_save_load_nested_estimator(self, LogisticRegressionCls):
    """Round-trip a TVS whose grid contains nested estimators (OneVsRest).

    This tests saving and loading the trained model only.
    Save/load for TrainValidationSplit will be added later: SPARK-13786
    """
    temp_path = tempfile.mkdtemp()
    rows = [
        (Vectors.dense([0.0]), 0.0),
        (Vectors.dense([0.4]), 1.0),
        (Vectors.dense([0.5]), 0.0),
        (Vectors.dense([0.6]), 1.0),
        (Vectors.dense([1.0]), 1.0),
    ]
    dataset = self.spark.createDataFrame(rows * 10, ["features", "label"])

    ova = OneVsRest(classifier=LogisticRegressionCls())
    lr1 = LogisticRegressionCls().setMaxIter(100)
    lr2 = LogisticRegressionCls().setMaxIter(150)
    # The grid varies the *nested* classifier itself.
    grid = ParamGridBuilder().addGrid(ova.classifier, [lr1, lr2]).build()
    tvs = TrainValidationSplit(estimator=ova,
                               estimatorParamMaps=grid,
                               evaluator=MulticlassClassificationEvaluator())
    tvsModel = tvs.fit(dataset)

    # Unfitted-TVS round-trip: estimator/evaluator identities survive.
    tvsPath = temp_path + "/tvs"
    tvs.save(tvsPath)
    loadedTvs = TrainValidationSplit.load(tvsPath)
    self.assert_param_maps_equal(loadedTvs.getEstimatorParamMaps(), grid)
    self.assertEqual(loadedTvs.getEstimator().uid, tvs.getEstimator().uid)
    self.assertEqual(loadedTvs.getEvaluator().uid, tvs.getEvaluator().uid)

    # Nested estimators in the param maps must keep their uids; plain
    # values must compare equal.
    originalParamMap = tvs.getEstimatorParamMaps()
    loadedParamMap = loadedTvs.getEstimatorParamMaps()
    for i, param in enumerate(loadedParamMap):
        for p in param:
            if p.name == "classifier":
                self.assertEqual(param[p].uid, originalParamMap[i][p].uid)
            else:
                self.assertEqual(param[p], originalParamMap[i][p])

    # Fitted-TVS-model round-trip.
    tvsModelPath = temp_path + "/tvsModel"
    tvsModel.save(tvsModelPath)
    loadedModel = TrainValidationSplitModel.load(tvsModelPath)
    self.assert_param_maps_equal(loadedModel.getEstimatorParamMaps(), grid)
    self.assertEqual(loadedModel.bestModel.uid, tvsModel.bestModel.uid)
def example_train_cluster(df):
    """Pick k for KMeans on an inventory frame via a 75/25 validation split.

    Expected input: inventory (columns price, size, lat, lng).
    """
    assemble = VectorAssembler(inputCols=["price", "size", "lat", "lng"],
                               outputCol="v")
    clusterer = KMeans(featuresCol="v", predictionCol="pred")
    pipe = Pipeline(stages=[assemble, clusterer])
    # Cosine distance (alternative: squaredEuclidean).
    scorer = ClusteringEvaluator(predictionCol="pred",
                                 featuresCol="v",
                                 distanceMeasure="cosine")
    k_grid = ParamGridBuilder().addGrid(clusterer.k, [3, 4, 5]).build()
    selector = TrainValidationSplit(estimator=pipe,
                                    estimatorParamMaps=k_grid,
                                    evaluator=scorer,
                                    trainRatio=0.75)
    return selector.fit(df)
def tune_model(estimator, param_grid, evaluator, train_data):
    """Grid-search `estimator` with an 80/20 train/validation split and
    return the best model; prints the metric for every grid point.
    """
    selector = TrainValidationSplit(estimator=estimator,
                                    estimatorParamMaps=param_grid,
                                    evaluator=evaluator,
                                    # 80% train / 20% validation.
                                    trainRatio=0.8,
                                    seed=16)
    model = selector.fit(train_data)
    # Report the validation metric achieved by each parameter combination.
    for idx, param_map in enumerate(model.getEstimatorParamMaps()):
        summary = ["%s: %s" % (p.name, str(v)) for p, v in param_map.items()]
        print(summary, model.getEvaluator().getMetricName(),
              model.validationMetrics[idx])
    return model.bestModel
def trainAndEvalModelByRandomForestClassifierAndTrainValidationSplit(stages, train_df, test_df, evaluator):
    '''
    Train and validate a RandomForestClassifier with TrainValidationSplit
    and return the best model found.
    :param stages: preprocessing pipeline stages (Transformers/Estimators)
    :param train_df: training DataFrame
    :param test_df: test DataFrame
    :param evaluator: evaluator used both for selection and the final AUC
    :return: (bestModel, predictions, auc)
    '''
    rf = RandomForestClassifier(labelCol='label', featuresCol='features', numTrees=10)
    # 2 * 3 * 3 * 3 = 54 candidate parameter maps.
    paramGrid = ParamGridBuilder() \
        .addGrid(rf.impurity, ['gini', 'entropy']) \
        .addGrid(rf.maxDepth, [5, 10, 15]) \
        .addGrid(rf.maxBins, [10, 15, 20]) \
        .addGrid(rf.numTrees, [10, 20, 30]) \
        .build()
    # trainRatio=0.8: 80% of train_df trains each candidate, 20% validates.
    rftvs = TrainValidationSplit(estimator=rf, evaluator=evaluator,
                                 estimatorParamMaps=paramGrid, trainRatio=0.8)
    rftvsPipeline = Pipeline(stages=stages + [rftvs])
    rftvsPipelineModel = rftvsPipeline.fit(train_df)
    # FIX: index the TVS stage as [-1] instead of the hard-coded [3] so the
    # function works for any number of preprocessing stages.
    bestModel = rftvsPipelineModel.stages[-1].bestModel
    predictions = rftvsPipelineModel.transform(test_df)
    auc = evaluator.evaluate(predictions)
    return (bestModel, predictions, auc)
def random_forest_classifier(training_data, testing_data):
    """Tune and run a random-forest classifier; writes results via __output.

    The label is string-indexed to `indexedLabel`, the feature vector is
    category-indexed to `indexedFeatures`, and a TrainValidationSplit
    searches maxCategories/numTrees/maxDepth before scoring the test set.
    """
    # NOTE(review): "col18" is absent from the input list — confirm this
    # omission is intentional.
    assembler = VectorAssembler(inputCols=[
        "col1", "col2", "col3", "col4", "col5", "col6", "col7", "col8",
        "col9", "col10", "col11", "col12", "col13", "col14", "col15",
        "col16", "col17", "col19", "col20", "col21", "col22", "col23",
        "col24", "col25"
    ], outputCol="features")
    training_data_vector = assembler.transform(training_data)
    training_data_vector = training_data_vector.select('index', 'features', 'label')

    label_indexer = StringIndexer(
        inputCol="label", outputCol="indexedLabel").fit(training_data_vector)
    feature_indexer = VectorIndexer(inputCol="features",
                                    outputCol="indexedFeatures",
                                    maxCategories=15).fit(training_data_vector)
    # FIX: train on the indexed features produced by feature_indexer. The
    # original read the raw "features" column, silently ignoring the stage
    # whose maxCategories is tuned in the grid below.
    rf = RandomForestClassifier(labelCol="indexedLabel",
                                featuresCol="indexedFeatures",
                                numTrees=10)
    pipeline = Pipeline(stages=[label_indexer, feature_indexer, rf])

    param_grid = ParamGridBuilder() \
        .addGrid(feature_indexer.maxCategories, [5, 15, 25]) \
        .addGrid(rf.numTrees, [10, 50, 100]) \
        .addGrid(rf.maxDepth, [5, 10]) \
        .build()
    # FIX: select models with a classification evaluator on the indexed
    # label; the original used RegressionEvaluator against "label".
    tvs = TrainValidationSplit(
        estimator=pipeline,
        estimatorParamMaps=param_grid,
        evaluator=MulticlassClassificationEvaluator(labelCol="indexedLabel",
                                                    predictionCol="prediction",
                                                    metricName="accuracy"),
        trainRatio=0.8)
    model = tvs.fit(training_data_vector)

    testing_data_vector = assembler.transform(testing_data)
    predictions = model.transform(
        testing_data_vector.select('index', 'features', 'bid', 'target'))
    selected = predictions.select("index", 'bid', 'target', "probability",
                                  "prediction")
    __output('RandomForestClassifier', selected)
def predictDataForStation(stationData, columnName, station_id):
    """Train a LinearRegression for one station/measure and save the best model.

    stationData -- DataFrame of station observations
    columnName  -- the measure to model (also used as the label)
    station_id  -- station identifier, used when saving / logging
    """
    columnsList = ["max_temp", "med_temp", "min_temp", "max_pressure",
                   "min_pressure", "precip", "insolation"]
    # assembler = VectorAssembler(inputCols=columnsList,outputCol="features")
    assembler = VectorAssembler(inputCols=[columnName], outputCol="features")
    assembledData = assembler.transform(stationData)
    feature_data = assembledData.withColumn("label", stationData[columnName]).withColumn("features", assembledData.features)

    print("Getting training data...")
    # FIX: the original drew two independent samples (10% and 90%) that
    # overlap and do not cover the data; randomSplit yields disjoint sets.
    train_data, test_data = feature_data.randomSplit([0.9, 0.1])
    print("Test data: " + str(test_data.count()) + " , Train data: " + str(train_data.count()))

    # BestModel
    lr = LinearRegression()
    # FIX: dropped the duplicated 0.0001 grid point (it doubled the work
    # for that candidate without adding information).
    paramGrid = ParamGridBuilder() \
        .addGrid(lr.regParam, [0.1, 0.01, 0.001, 0.0001]) \
        .addGrid(lr.fitIntercept, [False, True]) \
        .addGrid(lr.maxIter, [1, 10, 50, 100]) \
        .build()
    try:
        print("Calculating and training the best model")
        tvs = TrainValidationSplit(estimator=lr,
                                   estimatorParamMaps=paramGrid,
                                   evaluator=RegressionEvaluator(),
                                   trainRatio=0.8)
        # Fit the model
        lrModel = tvs.fit(train_data)
        saveModel(lrModel.bestModel, station_id, columnName)
        ##### THESE ARE THE LINES TO RE-ENABLE FOR PREDICTIONS ######
        # predictions = lrModel.transform(test_data).select("measure_date","station_id",columnName,"prediction")
        # groupedPredictions = predictions.groupBy("station_id").agg(avg(columnName),avg("prediction"))
        # insertDataIntoDatabase(groupedPredictions,columnName,station_id)
    except IllegalArgumentException as error:
        print("#####IllegalArgumentException on :\t " + str(station_id) + " on " + str(columnName) + "#####")
        print("IllegalArgumentException : {0}".format(error))
    except py4j.protocol.Py4JJavaError as error:
        print("#####Py4JJavaError on :\t " + str(station_id) + " on " + str(columnName) + "#####")
        print("Py4JJavaError : {0}".format(error))
def lr_train_tvs(data):
    """Logistic regression over count-vector features, tuned on maxIter.

    Returns (AUC on the full dataset, label strings, fitted TVS model).
    """
    # Encode the string class column "_c0" as a numeric label.
    label_stringIdx = StringIndexer(inputCol="_c0", outputCol="label")
    lsmodel = label_stringIdx.fit(data)
    data = lsmodel.transform(data)
    # Bag-of-words features from the pre-filtered tokens.
    countVectors = CountVectorizer(inputCol="filtered", outputCol="cfeatures",
                                   vocabSize=10000, minDF=5)
    evaluator = BinaryClassificationEvaluator(rawPredictionCol="prediction")
    lr = LogisticRegression(regParam=0.3, elasticNetParam=0,
                            featuresCol=countVectors.getOutputCol(),
                            labelCol="label")
    pipeline = Pipeline(stages=[countVectors, lr])
    grid = ParamGridBuilder().addGrid(lr.maxIter, [10, 15, 20]).build()
    # 90/10 train/validation split over the labelled data.
    selector = TrainValidationSplit(estimator=pipeline,
                                    estimatorParamMaps=grid,
                                    evaluator=evaluator,
                                    trainRatio=0.9)
    cvmodel = selector.fit(data)
    # NOTE(review): the model is also evaluated on the data it was fitted on.
    return (evaluator.evaluate(cvmodel.transform(data)), lsmodel.labels, cvmodel)
def trainAndEvalModelByDecisionTreeClassifierAndTrainValidationSplit(stages, train_df, test_df, evaluator):
    '''
    Train and validate a DecisionTreeClassifier with TrainValidationSplit
    and return the best model found.
    :param stages: preprocessing pipeline stages (Transformers/Estimators)
    :param train_df: training DataFrame
    :param test_df: test DataFrame
    :param evaluator: evaluator used both for selection and the final AUC
    :return: (bestModel, predictions, auc)
    '''
    dt = DecisionTreeClassifier(labelCol='label', featuresCol='features',
                                impurity='gini', maxDepth=10, maxBins=14)
    # 2 * 3 * 3 = 18 candidate parameter maps.
    paramGrid = ParamGridBuilder() \
        .addGrid(dt.impurity, ['gini', 'entropy']) \
        .addGrid(dt.maxDepth, [5, 10, 15]) \
        .addGrid(dt.maxBins, [10, 15, 20]) \
        .build()
    # trainRatio=0.8: within train_df, 80% trains each candidate and 20%
    # validates it.
    tvs = TrainValidationSplit(estimator=dt, evaluator=evaluator,
                               estimatorParamMaps=paramGrid, trainRatio=0.8)
    tvsPipline = Pipeline(stages=stages + [tvs])
    tvsPiplineModel = tvsPipline.fit(train_df)
    # FIX: index the TVS stage as [-1] instead of the hard-coded [3] so the
    # function works for any number of preprocessing stages.
    bestModel = tvsPiplineModel.stages[-1].bestModel
    # print(bestModel.toDebugString[:500])  # preview of the tree rules
    predictions = tvsPiplineModel.transform(test_df)
    auc = evaluator.evaluate(predictions)
    return (bestModel, predictions, auc)
def gridCV(self):
    """Grid-search the factor model on rank, maxIter and regParam and print
    the winning settings (selection by RMSE on a validation split).
    """
    search_space = (ParamGridBuilder()
                    .addGrid(self.factor_model.rank, [5, 10, 15, 20])
                    .addGrid(self.factor_model.maxIter, [10, 15, 20, 30, 35])
                    .addGrid(self.factor_model.regParam, [0.05, 0.1, 0.15, 0.2])
                    .build())
    rmse = RegressionEvaluator(predictionCol="prediction",
                               labelCol="rating",
                               metricName="rmse")
    selector = TrainValidationSplit(estimator=self.factor_model,
                                    estimatorParamMaps=search_space,
                                    evaluator=rmse)
    best_model = selector.fit(self.train).bestModel
    print('rank', best_model.rank)
    # maxIter/regParam live on the parent estimator of the winning model.
    print('max iter', best_model._java_obj.parent().getMaxIter())
    print('reg param', best_model._java_obj.parent().getRegParam())
def tuning(classifier, paramGrid, train):
    """Tune `classifier` over `paramGrid` (AUC, 80/20 split) and return the
    best model; prints the AUC of every parameter combination.
    """
    selector = TrainValidationSplit(
        estimator=classifier,
        estimatorParamMaps=paramGrid,
        evaluator=BinaryClassificationEvaluator(),
        # 80% of the data will be used for training, 20% for validation.
        trainRatio=0.8)
    model = selector.fit(train)
    # Dump each candidate's parameters next to its validation AUC.
    for i, params in enumerate(model.getEstimatorParamMaps()):
        print("---------_", str(i), "_---------", " AUC: ",
              str(model.validationMetrics[i]))
        for param, value in params.items():
            print(param.name, ": ", str(value), "; ", end='')
        print("\n")
    return model.bestModel
def test_meta_estimator_disable_post_training_autologging(dataset_regression):
    # Verifies that when a meta estimator (TrainValidationSplit) is fitted
    # under autologging, post-training metric autologging stays disabled:
    # transforming an eval dataset must not register input datasets nor log
    # post-training metrics — only the fitted model itself is registered.
    mlflow.pyspark.ml.autolog()
    lr = LinearRegression(solver="l-bfgs", regParam=0.01)
    eval_dataset = dataset_regression.sample(fraction=0.3, seed=1)
    # Three candidate parameter maps for the split to try.
    lrParamMaps = [
        {
            lr.maxIter: 1,
            lr.standardization: False
        },
        {
            lr.maxIter: 200,
            lr.standardization: True
        },
        {
            lr.maxIter: 2,
            lr.standardization: False
        },
    ]
    eva = RegressionEvaluator(metricName="rmse")
    estimator = TrainValidationSplit(estimator=lr,
                                     estimatorParamMaps=lrParamMaps,
                                     evaluator=eva)
    # Patch the autologging managers so we can assert which hooks fire.
    with mock.patch(
            "mlflow.pyspark.ml._AutologgingMetricsManager.register_model"
    ) as mock_register_model, mock.patch(
            "mlflow.sklearn._AutologgingMetricsManager.is_metric_value_loggable"
    ) as mock_is_metric_value_loggable, mock.patch(
            "mlflow.pyspark.ml._AutologgingMetricsManager.log_post_training_metric"
    ) as mock_log_post_training_metric, mock.patch(
            "mlflow.pyspark.ml._AutologgingMetricsManager.register_prediction_input_dataset"
    ) as mock_register_prediction_input_dataset:
        with mlflow.start_run():
            model = estimator.fit(dataset_regression)
            model.transform(eval_dataset)
        # The fitted meta-model itself is registered exactly once...
        mock_register_model.assert_called_once()
        # ...but none of the post-training metric machinery runs.
        mock_is_metric_value_loggable.assert_not_called()
        mock_register_prediction_input_dataset.assert_not_called()
        mock_log_post_training_metric.assert_not_called()
def testCvWithLr():
    """Smoke-test TrainValidationSplit over a small TF network on MNIST."""
    spark = createLocalSparkSession()
    df = getDatasetMinist(spark)
    train, test = df.randomSplit([0.9, 0.1], seed=12345)
    net = TFNeuralNetwork()
    # 2 learning rates x 1 iteration count = 2 candidates.
    search = (ParamGridBuilder()
              .addGrid(net.lr, [0.1, 0.01])
              .addGrid(net.maxIter, [10])
              .build())
    selector = TrainValidationSplit(
        estimator=net,
        estimatorParamMaps=search,
        evaluator=MulticlassClassificationEvaluator(),
        # 80% of the data will be used for training, 20% for validation.
        trainRatio=0.8)
    model = selector.fit(train)
    model.transform(test).show()
def test_expose_sub_models(self):
    """Sub-models are exposed after fit and persist (or not) as requested."""
    temp_path = tempfile.mkdtemp()
    rows = [
        (Vectors.dense([0.0]), 0.0),
        (Vectors.dense([0.4]), 1.0),
        (Vectors.dense([0.5]), 0.0),
        (Vectors.dense([0.6]), 1.0),
        (Vectors.dense([1.0]), 1.0),
    ]
    dataset = self.spark.createDataFrame(rows * 10, ["features", "label"])

    lr = LogisticRegression()
    grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()
    tvs = TrainValidationSplit(estimator=lr,
                               estimatorParamMaps=grid,
                               evaluator=BinaryClassificationEvaluator(),
                               collectSubModels=True)
    tvsModel = tvs.fit(dataset)
    # One sub-model per grid point.
    self.assertEqual(len(tvsModel.subModels), len(grid))

    # Test the default value for option "persistSubModel" to be "true"
    testSubPath = temp_path + "/testTrainValidationSplitSubModels"
    savingPathWithSubModels = testSubPath + "cvModel3"
    tvsModel.save(savingPathWithSubModels)
    tvsModel3 = TrainValidationSplitModel.load(savingPathWithSubModels)
    self.assertEqual(len(tvsModel3.subModels), len(grid))
    tvsModel4 = tvsModel3.copy()
    self.assertEqual(len(tvsModel4.subModels), len(grid))

    # With persistSubModels=false the sub-models are dropped on save.
    savingPathWithoutSubModels = testSubPath + "cvModel2"
    tvsModel.write().option("persistSubModels", "false").save(savingPathWithoutSubModels)
    tvsModel2 = TrainValidationSplitModel.load(savingPathWithoutSubModels)
    self.assertEqual(tvsModel2.subModels, None)

    # Reloaded sub-models keep the uids of the originals.
    for i in range(len(grid)):
        self.assertEqual(tvsModel.subModels[i].uid, tvsModel3.subModels[i].uid)
def make_weather_trainers(trainRatio, estimator_gridbuilders, metricName=None):
    """Construct a list of TrainValidationSplit estimators for weather data
    where `estimator_gridbuilders` is a list of (Estimator, ParamGridBuilder)
    tuples and 0 < `trainRatio` <= 1 determines the fraction of rows used for
    training. The RegressionEvaluator will use a non-default `metricName`,
    if specified.
    """
    feature_cols = ['latitude', 'longitude', 'elevation', 'doy']
    column_names = dict(featuresCol="features",
                        labelCol="tmax",
                        predictionCol="tmax_pred")

    # Engineer the day-of-year feature 'doy' directly in SQL.
    query = "SELECT station,date, dayofyear(date) as doy, latitude, longitude, elevation,tmax FROM __THIS__"
    getDOY = SQLTransformer(statement=query)

    feature_assembler = VectorAssembler(inputCols=feature_cols,
                                        outputCol=column_names["featuresCol"])
    ev = (RegressionEvaluator()
          .setLabelCol(column_names["labelCol"])
          .setPredictionCol(column_names["predictionCol"]))
    if metricName:
        ev = ev.setMetricName(metricName)

    tvs_list = []
    for est, pgb in estimator_gridbuilders:
        est = est.setParams(**column_names)
        # Pipeline: SQL feature engineering -> vector assembly -> estimator.
        pl = Pipeline(stages=[getDOY, feature_assembler, est])
        tvs_list.append(TrainValidationSplit(estimator=pl,
                                             estimatorParamMaps=pgb.build(),
                                             evaluator=ev,
                                             trainRatio=trainRatio))
    return tvs_list
def train(self):
    """Assemble the pipeline stages and train, with or without a param grid.

    Side effects: sets self.pipeline, self.model and self.predictions.
    """
    print('Building stages...')
    stages = []
    # FIX: isinstance instead of `type(...) != list` — accepts list
    # subclasses and is the idiomatic type test.
    if not isinstance(self.featurestages, list):
        self.featurestages = [self.featurestages]
    stages += self.featurestages
    # Word2Vec can emit negative feature values, but NaiveBayes requires
    # non-negative features — rescale into [0.0, 1.0] in that case.
    if ('Word2Vec' in str(stages)) and ('NaiveBayes' in str(self.classifier)):
        print('Word2Vec and NaiveBayes detected, scaling to nonnegative [0.0,1.0]')
        stages[-1].setOutputCol('prefeatures')
        scaler = MinMaxScaler(inputCol='prefeatures', outputCol='features')
        stages = stages + [scaler]
    stages += [self.classifier]
    self.pipeline = Pipeline(stages=stages)
    print('Using the following stages: ' + str(self.pipeline.getStages()))

    print('Training model...')
    dftrain, dftest = self.DF.randomSplit([0.80, 0.20])
    # FIX: identity comparison with None (`is None`), not `== None`.
    if self.classifiergrid is None:
        print('Training without a Parameter Grid...')
        model = self.pipeline.fit(dftrain)
    else:
        print('Training with a Parameter Grid...')
        tvs = TrainValidationSplit(estimator=self.pipeline,
                                   estimatorParamMaps=self.classifiergrid,
                                   evaluator=BinaryClassificationEvaluator(),
                                   parallelism=4,
                                   trainRatio=0.8)
        model = tvs.fit(dftrain)
    self.predictions = model.transform(dftest)
    self.model = model
def GBT_CV(trainingData, testData):
    """
    Gradient Boosted Tree Regression Model Selection
    :param trainingData:
    :param testData:
    :return: Trained model, predictions
    """
    gbt = GBTRegressor(seed=42)
    # 6 iteration counts x 4 depths = 24 candidates.
    grid = (ParamGridBuilder()
            .addGrid(gbt.maxIter, [50, 100, 200, 300, 400, 500])
            .addGrid(gbt.maxDepth, [2, 6, 10, 14])
            .build())
    # 80% of the data will be used for training, 20% for validation.
    selector = TrainValidationSplit(estimator=gbt,
                                    estimatorParamMaps=grid,
                                    evaluator=RegressionEvaluator(),
                                    trainRatio=0.8)
    model = selector.fit(trainingData)
    return model, model.transform(testData)
def get_feature_importances(self, sdf):
    """Fit (optionally tuning over the space grid) and return the final
    model's feature importances as a numpy array.

    :param sdf: input Spark DataFrame
    :raises NotImplementedError: if no evaluator is configured
    """
    evaluator = self.evaluator
    if evaluator is None:
        raise NotImplementedError("The evaluator parameter is not set.")
    space_grid = self.get_space_grid()
    model = self.model
    if self.opt:
        # Tune over the grid with an 80/20 train/validation split.
        crossval = TrainValidationSplit(estimator=model,
                                        estimatorParamMaps=space_grid,
                                        evaluator=evaluator,
                                        trainRatio=0.8)
        cvModel = crossval.fit(sdf)
        # FIX: the original tested `isinstance(cvModel, PipelineModel)` —
        # cvModel is always a TrainValidationSplitModel, so that branch was
        # dead and a pipeline bestModel would have failed below. Inspect the
        # bestModel itself instead.
        best = cvModel.bestModel
        if isinstance(best, PipelineModel):
            # Importances live on the final stage of a fitted pipeline.
            return best.stages[-1].featureImportances.toArray()
        return best.featureImportances.toArray()
    else:
        fitted_model = model.fit(sdf)
        if isinstance(fitted_model, PipelineModel):
            return fitted_model.stages[-1].featureImportances.toArray()
        return fitted_model.featureImportances.toArray()
def build_model(training):
    """Tune an occupancy LogisticRegression over regParam and return the
    best fitted pipeline (assembler + LR).
    """
    training.cache()
    # Every column except the target feeds the feature vector.
    feature_cols = training.columns
    feature_cols.remove("Occupancy")
    assembler = VectorAssembler(inputCols=feature_cols, outputCol="featureVec")
    lr = LogisticRegression(featuresCol="featureVec", labelCol="Occupancy")
    pipeline = Pipeline(stages=[assembler, lr])
    reg_grid = (ParamGridBuilder()
                .addGrid(lr.regParam, [0.0001, 0.001, 0.01, 0.1, 1.0])
                .build())
    # 90/10 train/validation split, selected by AUC on the occupancy label.
    selector = TrainValidationSplit(
        estimator=pipeline,
        estimatorParamMaps=reg_grid,
        evaluator=BinaryClassificationEvaluator(labelCol="Occupancy"),
        trainRatio=0.9)
    return selector.fit(training).bestModel