def test_expose_sub_models(self):
    temp_path = tempfile.mkdtemp()
    dataset = self.spark.createDataFrame(
        [(Vectors.dense([0.0]), 0.0),
         (Vectors.dense([0.4]), 1.0),
         (Vectors.dense([0.5]), 0.0),
         (Vectors.dense([0.6]), 1.0),
         (Vectors.dense([1.0]), 1.0)] * 10,
        ["features", "label"])
    lr = LogisticRegression()
    grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()
    evaluator = BinaryClassificationEvaluator()
    tvs = TrainValidationSplit(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator,
                               collectSubModels=True)
    tvsModel = tvs.fit(dataset)
    self.assertEqual(len(tvsModel.subModels), len(grid))

    # The write option "persistSubModels" defaults to "true"
    testSubPath = temp_path + "/testTrainValidationSplitSubModels"
    savingPathWithSubModels = testSubPath + "cvModel3"
    tvsModel.save(savingPathWithSubModels)
    tvsModel3 = TrainValidationSplitModel.load(savingPathWithSubModels)
    self.assertEqual(len(tvsModel3.subModels), len(grid))
    tvsModel4 = tvsModel3.copy()
    self.assertEqual(len(tvsModel4.subModels), len(grid))

    savingPathWithoutSubModels = testSubPath + "cvModel2"
    tvsModel.write().option("persistSubModels", "false").save(savingPathWithoutSubModels)
    tvsModel2 = TrainValidationSplitModel.load(savingPathWithoutSubModels)
    self.assertEqual(tvsModel2.subModels, None)

    for i in range(len(grid)):
        self.assertEqual(tvsModel.subModels[i].uid, tvsModel3.subModels[i].uid)
def test_save_load_simple_estimator(self):
    # This tests saving and loading the trained model only.
    # Save/load for TrainValidationSplit will be added later: SPARK-13786
    temp_path = tempfile.mkdtemp()
    dataset = self.spark.createDataFrame(
        [(Vectors.dense([0.0]), 0.0),
         (Vectors.dense([0.4]), 1.0),
         (Vectors.dense([0.5]), 0.0),
         (Vectors.dense([0.6]), 1.0),
         (Vectors.dense([1.0]), 1.0)] * 10,
        ["features", "label"])
    lr = LogisticRegression()
    grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()
    evaluator = BinaryClassificationEvaluator()
    tvs = TrainValidationSplit(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator)
    tvsModel = tvs.fit(dataset)

    tvsPath = temp_path + "/tvs"
    tvs.save(tvsPath)
    loadedTvs = TrainValidationSplit.load(tvsPath)
    self.assertEqual(loadedTvs.getEstimator().uid, tvs.getEstimator().uid)
    self.assertEqual(loadedTvs.getEvaluator().uid, tvs.getEvaluator().uid)
    self.assert_param_maps_equal(
        loadedTvs.getEstimatorParamMaps(), tvs.getEstimatorParamMaps())

    tvsModelPath = temp_path + "/tvsModel"
    tvsModel.save(tvsModelPath)
    loadedModel = TrainValidationSplitModel.load(tvsModelPath)
    self.assertEqual(loadedModel.bestModel.uid, tvsModel.bestModel.uid)
def loadaftRegression(conf, path):
    """
    Load a saved AFT survival regression model from path.

    input  : conf [dictionary], path [string]
    output : loaded model (CrossValidatorModel / TrainValidationSplitModel /
             AFTSurvivalRegressionModel)
    """
    # Model was tuned with cross-validation
    if conf["tuning"] and conf["tuning"].get("method") == "crossval":
        loaded_model = CrossValidatorModel.load(path)
    # Model was tuned with train-validation split
    elif conf["tuning"] and conf["tuning"].get("method") == "trainval":
        loaded_model = TrainValidationSplitModel.load(path)
    # No tuning was used; guard above avoids calling .get() on a None conf["tuning"]
    else:
        loaded_model = AFTSurvivalRegressionModel.load(path)
    return loaded_model
def loadModel(conf, path):
    """
    input : conf [dictionary], path [string]
    output: model [CrossValidatorModel, TrainValidationSplitModel,
            GeneralizedLinearRegressionModel]
    """
    if conf["tuning"]:
        if conf["tuning"].get("method").lower() == "crossval":
            loading_model = CrossValidatorModel.load(path)
        elif conf["tuning"].get("method").lower() == "trainvalsplit":
            loading_model = TrainValidationSplitModel.load(path)
    # The no-tuning branch must be an outer else: nested under
    # `if conf["tuning"]:` it could never be reached.
    else:
        loading_model = GeneralizedLinearRegressionModel.load(path)
    return loading_model
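# A minimal usage sketch for loadModel (an illustration, not from the original
# source): the conf values are inferred from the branches checked above, and
# the model paths are hypothetical.
conf_trainval = {"tuning": {"method": "trainvalsplit"}}
tvs_model = loadModel(conf_trainval, "/tmp/models/glr_tvs_model")

conf_no_tuning = {"tuning": None}
plain_model = loadModel(conf_no_tuning, "/tmp/models/glr_model")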
def _run_test_save_load_trained_model(self, LogisticRegressionCls, LogisticRegressionModelCls):
    # This tests saving and loading the trained model only.
    # Save/load for TrainValidationSplit will be added later: SPARK-13786
    temp_path = tempfile.mkdtemp()
    dataset = self.spark.createDataFrame(
        [
            (Vectors.dense([0.0]), 0.0),
            (Vectors.dense([0.4]), 1.0),
            (Vectors.dense([0.5]), 0.0),
            (Vectors.dense([0.6]), 1.0),
            (Vectors.dense([1.0]), 1.0),
        ]
        * 10,
        ["features", "label"],
    )
    lr = LogisticRegressionCls()
    grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()
    evaluator = BinaryClassificationEvaluator()
    tvs = TrainValidationSplit(
        estimator=lr,
        estimatorParamMaps=grid,
        evaluator=evaluator,
        collectSubModels=True,
        seed=42,
    )
    tvsModel = tvs.fit(dataset)
    lrModel = tvsModel.bestModel

    lrModelPath = temp_path + "/lrModel"
    lrModel.save(lrModelPath)
    loadedLrModel = LogisticRegressionModelCls.load(lrModelPath)
    self.assertEqual(loadedLrModel.uid, lrModel.uid)
    self.assertEqual(loadedLrModel.intercept, lrModel.intercept)

    tvsModelPath = temp_path + "/tvsModel"
    tvsModel.save(tvsModelPath)
    loadedTvsModel = TrainValidationSplitModel.load(tvsModelPath)
    for param in [
        lambda x: x.getSeed(),
        lambda x: x.getTrainRatio(),
    ]:
        self.assertEqual(param(tvsModel), param(loadedTvsModel))

    self.assertTrue(all(loadedTvsModel.isSet(param) for param in loadedTvsModel.params))
def main():
    model_path = parse_args().model_path

    sc = SparkContext(master="local[*]", appName="tr-troll-detect")
    sc.setLogLevel("OFF")
    spark = SparkSession(sc)

    print(f"Loading model {model_path}")
    model = TrainValidationSplitModel.load(model_path)

    bert_name = "dbmdz/distilbert-base-turkish-cased"
    tokenizer = AutoTokenizer.from_pretrained(bert_name)
    bert_model = AutoModel.from_pretrained(bert_name)

    while True:
        user_tweet = input("> ")
        embedding = forward_bert(user_tweet, tokenizer, bert_model)
        tweet_in = spark.createDataFrame([(user_tweet, embedding, 0)],
                                         schema=["tweet", "features", "label"])
        troll_prob = model.transform(tweet_in).collect()[0].probability[1]
        print(f"Probability of trolling: {troll_prob}")
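# forward_bert is not defined in this snippet. A plausible implementation
# (an assumption, not the original helper) would mean-pool DistilBERT's last
# hidden state into a dense Spark ML vector:
import torch
from pyspark.ml.linalg import Vectors

def forward_bert(text, tokenizer, bert_model):
    # Tokenize a single tweet and run one forward pass without gradients.
    inputs = tokenizer(text, return_tensors="pt", truncation=True)
    with torch.no_grad():
        outputs = bert_model(**inputs)
    # Mean-pool the token embeddings into one fixed-size vector.
    pooled = outputs.last_hidden_state.mean(dim=1).squeeze(0)
    return Vectors.dense(pooled.tolist())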
def _run_test_save_load_nested_estimator(self, LogisticRegressionCls):
    # This tests saving and loading the trained model only.
    # Save/load for TrainValidationSplit will be added later: SPARK-13786
    temp_path = tempfile.mkdtemp()
    dataset = self.spark.createDataFrame(
        [
            (Vectors.dense([0.0]), 0.0),
            (Vectors.dense([0.4]), 1.0),
            (Vectors.dense([0.5]), 0.0),
            (Vectors.dense([0.6]), 1.0),
            (Vectors.dense([1.0]), 1.0),
        ]
        * 10,
        ["features", "label"],
    )
    ova = OneVsRest(classifier=LogisticRegressionCls())
    lr1 = LogisticRegressionCls().setMaxIter(100)
    lr2 = LogisticRegressionCls().setMaxIter(150)
    grid = ParamGridBuilder().addGrid(ova.classifier, [lr1, lr2]).build()
    evaluator = MulticlassClassificationEvaluator()

    tvs = TrainValidationSplit(estimator=ova, estimatorParamMaps=grid, evaluator=evaluator)
    tvsModel = tvs.fit(dataset)

    tvsPath = temp_path + "/tvs"
    tvs.save(tvsPath)
    loadedTvs = TrainValidationSplit.load(tvsPath)
    self.assert_param_maps_equal(loadedTvs.getEstimatorParamMaps(), grid)
    self.assertEqual(loadedTvs.getEstimator().uid, tvs.getEstimator().uid)
    self.assertEqual(loadedTvs.getEvaluator().uid, tvs.getEvaluator().uid)

    originalParamMap = tvs.getEstimatorParamMaps()
    loadedParamMap = loadedTvs.getEstimatorParamMaps()
    for i, param in enumerate(loadedParamMap):
        for p in param:
            if p.name == "classifier":
                self.assertEqual(param[p].uid, originalParamMap[i][p].uid)
            else:
                self.assertEqual(param[p], originalParamMap[i][p])

    tvsModelPath = temp_path + "/tvsModel"
    tvsModel.save(tvsModelPath)
    loadedModel = TrainValidationSplitModel.load(tvsModelPath)
    self.assert_param_maps_equal(loadedModel.getEstimatorParamMaps(), grid)
    self.assertEqual(loadedModel.bestModel.uid, tvsModel.bestModel.uid)
def loadgbtRegression(conf, path):
    """
    input  : conf, path
    output : model (CrossValidatorModel / TrainValidationSplitModel / PipelineModel)
    """
    # If ML tuning was used
    if conf["tuning"]:
        # Cross-validation tuning -> the saved model is a CrossValidatorModel
        if conf["tuning"].get("method").lower() == "crossval":
            loaded_model = CrossValidatorModel.load(path)
        # Train-validation split tuning -> the saved model is a TrainValidationSplitModel
        elif conf["tuning"].get("method").lower() == "trainvalsplit":
            loaded_model = TrainValidationSplitModel.load(path)
    # No ML tuning -> the saved model is a PipelineModel. This must be an
    # outer else: nested under `if conf["tuning"]:` it would be unreachable.
    else:
        loaded_model = PipelineModel.load(path)
    return loaded_model
def test_save_load_nested_estimator(self):
    # This tests saving and loading the trained model only.
    # Save/load for TrainValidationSplit will be added later: SPARK-13786
    temp_path = tempfile.mkdtemp()
    dataset = self.spark.createDataFrame(
        [(Vectors.dense([0.0]), 0.0),
         (Vectors.dense([0.4]), 1.0),
         (Vectors.dense([0.5]), 0.0),
         (Vectors.dense([0.6]), 1.0),
         (Vectors.dense([1.0]), 1.0)] * 10,
        ["features", "label"])
    ova = OneVsRest(classifier=LogisticRegression())
    lr1 = LogisticRegression().setMaxIter(100)
    lr2 = LogisticRegression().setMaxIter(150)
    grid = ParamGridBuilder().addGrid(ova.classifier, [lr1, lr2]).build()
    evaluator = MulticlassClassificationEvaluator()

    tvs = TrainValidationSplit(estimator=ova, estimatorParamMaps=grid, evaluator=evaluator)
    tvsModel = tvs.fit(dataset)

    tvsPath = temp_path + "/tvs"
    tvs.save(tvsPath)
    loadedTvs = TrainValidationSplit.load(tvsPath)
    self.assertEqual(loadedTvs.getEstimator().uid, tvs.getEstimator().uid)
    self.assertEqual(loadedTvs.getEvaluator().uid, tvs.getEvaluator().uid)

    originalParamMap = tvs.getEstimatorParamMaps()
    loadedParamMap = loadedTvs.getEstimatorParamMaps()
    for i, param in enumerate(loadedParamMap):
        for p in param:
            if p.name == "classifier":
                self.assertEqual(param[p].uid, originalParamMap[i][p].uid)
            else:
                self.assertEqual(param[p], originalParamMap[i][p])

    tvsModelPath = temp_path + "/tvsModel"
    tvsModel.save(tvsModelPath)
    loadedModel = TrainValidationSplitModel.load(tvsModelPath)
    self.assertEqual(loadedModel.bestModel.uid, tvsModel.bestModel.uid)
def test_save_load_pipeline_estimator(self):
    temp_path = tempfile.mkdtemp()
    training = self.spark.createDataFrame([
        (0, "a b c d e spark", 1.0),
        (1, "b d", 0.0),
        (2, "spark f g h", 1.0),
        (3, "hadoop mapreduce", 0.0),
        (4, "b spark who", 1.0),
        (5, "g d a y", 0.0),
        (6, "spark fly", 1.0),
        (7, "was mapreduce", 0.0),
    ], ["id", "text", "label"])

    # Configure an ML pipeline, which consists of three stages:
    # tokenizer, hashingTF, and OneVsRest.
    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
    ova = OneVsRest(classifier=LogisticRegression())
    lr1 = LogisticRegression().setMaxIter(5)
    lr2 = LogisticRegression().setMaxIter(10)
    pipeline = Pipeline(stages=[tokenizer, hashingTF, ova])

    paramGrid = ParamGridBuilder() \
        .addGrid(hashingTF.numFeatures, [10, 100]) \
        .addGrid(ova.classifier, [lr1, lr2]) \
        .build()
    tvs = TrainValidationSplit(estimator=pipeline,
                               estimatorParamMaps=paramGrid,
                               evaluator=MulticlassClassificationEvaluator())
    tvsPath = temp_path + "/tvs"
    tvs.save(tvsPath)
    loadedTvs = TrainValidationSplit.load(tvsPath)
    self.assert_param_maps_equal(loadedTvs.getEstimatorParamMaps(), paramGrid)
    self.assertEqual(loadedTvs.getEstimator().uid, tvs.getEstimator().uid)

    # Run train validation split, and choose the best set of parameters.
    tvsModel = tvs.fit(training)

    # Test save/load of TrainValidationSplitModel
    tvsModelPath = temp_path + "/tvsModel"
    tvsModel.save(tvsModelPath)
    loadedModel = TrainValidationSplitModel.load(tvsModelPath)
    self.assertEqual(loadedModel.bestModel.uid, tvsModel.bestModel.uid)
    self.assertEqual(len(loadedModel.bestModel.stages), len(tvsModel.bestModel.stages))
    for loadedStage, originalStage in zip(loadedModel.bestModel.stages,
                                          tvsModel.bestModel.stages):
        self.assertEqual(loadedStage.uid, originalStage.uid)

    # Test nested pipeline
    nested_pipeline = Pipeline(stages=[tokenizer, Pipeline(stages=[hashingTF, ova])])
    tvs2 = TrainValidationSplit(estimator=nested_pipeline,
                                estimatorParamMaps=paramGrid,
                                evaluator=MulticlassClassificationEvaluator())
    tvs2Path = temp_path + "/tvs2"
    tvs2.save(tvs2Path)
    loadedTvs2 = TrainValidationSplit.load(tvs2Path)
    self.assert_param_maps_equal(loadedTvs2.getEstimatorParamMaps(), paramGrid)
    self.assertEqual(loadedTvs2.getEstimator().uid, tvs2.getEstimator().uid)

    # Run train validation split, and choose the best set of parameters.
    tvsModel2 = tvs2.fit(training)

    # Test save/load of the nested TrainValidationSplitModel
    tvsModelPath2 = temp_path + "/tvsModel2"
    tvsModel2.save(tvsModelPath2)
    loadedModel2 = TrainValidationSplitModel.load(tvsModelPath2)
    self.assertEqual(loadedModel2.bestModel.uid, tvsModel2.bestModel.uid)
    loaded_nested_pipeline_model = loadedModel2.bestModel.stages[1]
    original_nested_pipeline_model = tvsModel2.bestModel.stages[1]
    self.assertEqual(loaded_nested_pipeline_model.uid, original_nested_pipeline_model.uid)
    self.assertEqual(len(loaded_nested_pipeline_model.stages),
                     len(original_nested_pipeline_model.stages))
    for loadedStage, originalStage in zip(loaded_nested_pipeline_model.stages,
                                          original_nested_pipeline_model.stages):
        self.assertEqual(loadedStage.uid, originalStage.uid)
if args.mode == 'train':
    scaler_model = scaler.fit(dataset)
    scaler_model.save('/user/ronghui_safe/hgy/nid/edw/standardScaler_model_v2')
else:
    scaler_model = StandardScalerModel.load('/user/ronghui_safe/hgy/nid/edw/standardScaler_model_v2')
dataset = scaler_model.transform(dataset)
polyExpansion = PolynomialExpansion(degree=2, inputCol='scaled_feature_vec', outputCol='polyFeatures')
dataset = polyExpansion.transform(dataset)
dataset = dataset.select(F.col('duration'), F.col('polyFeatures'), F.col('key')).cache()
glr = None
if args.mode == 'train':
    glr = GeneralizedLinearRegression(labelCol='duration', featuresCol='polyFeatures',
                                      family='Binomial', linkPredictionCol='link_pred')
    paramGrid = ParamGridBuilder() \
        .addGrid(glr.link, ['logit']) \
        .addGrid(glr.regParam, [1e-5]) \
        .build()
    tvs = TrainValidationSplit(estimator=glr,
                               estimatorParamMaps=paramGrid,
                               evaluator=RegressionEvaluator(metricName='r2', labelCol='duration'),
                               trainRatio=0.7)
    tvs_model = tvs.fit(dataset)
    print('----> {}'.format(tvs_model.validationMetrics))
    if args.save_model:
        tvs_model.write().save('/user/ronghui_safe/hgy/nid/edw/glm_binomial_model_v2')
else:
    #glr_model = GeneralizedLinearRegressionModel.load('/user/ronghui_safe/hgy/nid/models/glm_binomial_model')
    glr_model = TrainValidationSplitModel.load('/user/ronghui_safe/hgy/nid/edw/glm_binomial_model_v2')
    dataset = glr_model.transform(dataset).select(F.col('duration'), F.col('prediction'), F.col('key')).cache()
    if args.mode == 'eval':
        evaluator = RegressionEvaluator(predictionCol='prediction', labelCol='duration', metricName='r2')
        print('----> The performance on the whole dataset is {}'.format(round(evaluator.evaluate(dataset), 4)))
    dataset.drop('duration').repartition(50).write.csv(
        '/user/ronghui_safe/hgy/nid/weights/{}_{}'.format(args.query_month, args.mode), header=True)
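# A small follow-on sketch (not part of the original script): the loaded
# TrainValidationSplitModel wraps the winning GLM, which can be inspected
# through bestModel.
best_glm = glr_model.bestModel
print('----> intercept: {}'.format(best_glm.intercept))
print('----> coefficients: {}'.format(best_glm.coefficients))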
# Select the columns to be used as the features
featureColumns = [
    "fixed acidity", "volatile acidity", "citric acid", "residual sugar",
    "chlorides", "free sulfur dioxide", "total sulfur dioxide", "density",
    "pH", "sulphates", "alcohol"
]

# Create and configure the assembler
assembler = VectorAssembler(inputCols=featureColumns, outputCol="features")

# Transform the original data
data = assembler.transform(data)

# Load model and predict wine quality
loaded_model = TrainValidationSplitModel.load(
    "D:/Edu/big_data_project/data/temp/rfc_model")
loaded_preds = loaded_model.transform(data).select("features", "prediction")
loaded_preds.show()

# Create and sort final DataFrame
data = data.join(loaded_preds, on=["features"]).drop("features")
data = data.withColumnRenamed("prediction", "quality")
data = data.select("date", "time", "location", "quality",
                   "fixed acidity", "volatile acidity", "citric acid",
                   "residual sugar", "chlorides", "free sulfur dioxide",
                   "total sulfur dioxide", "density", "pH", "sulphates", "alcohol")
data = data.orderBy(data["date"], data["time"])
data.show()