def test_logistic_regression(self):
    """Round-trip an unfitted ``LogisticRegression`` through save/load.

    Verifies two things about the reloaded estimator:

    * its ``maxIter`` param is parented to the reloaded instance's own uid;
    * the default-param-map entry for ``maxIter`` equals the original's.

    The temporary directory is removed in a ``finally`` clause so that a
    failing save, load or assertion does not leave it behind on disk.
    """
    lr = LogisticRegression(maxIter=1)
    path = tempfile.mkdtemp()
    try:
        lr_path = path + "/logreg"
        lr.save(lr_path)
        lr2 = LogisticRegression.load(lr_path)
        self.assertEqual(
            lr2.uid,
            lr2.maxIter.parent,
            "Loaded LogisticRegression instance uid (%s) "
            "did not match Param's uid (%s)" % (lr2.uid, lr2.maxIter.parent),
        )
        self.assertEqual(
            lr._defaultParamMap[lr.maxIter],
            lr2._defaultParamMap[lr2.maxIter],
            "Loaded LogisticRegression instance default params did not match "
            + "original defaults",
        )
    finally:
        # Best-effort cleanup: a failure to delete must not mask the test result.
        try:
            rmtree(path)
        except OSError:
            pass
def test_logistic_regression(self):
    """Check that a saved ``LogisticRegression`` estimator reloads intact.

    The estimator is written to a fresh temporary directory, loaded back,
    and then checked for (a) the ``maxIter`` param's parent matching the
    loaded instance's uid and (b) an unchanged default for ``maxIter``.

    Cleanup runs in ``finally`` so the temporary directory is removed
    whether the assertions pass or fail.
    """
    lr = LogisticRegression(maxIter=1)
    path = tempfile.mkdtemp()
    try:
        lr_path = path + "/logreg"
        lr.save(lr_path)
        lr2 = LogisticRegression.load(lr_path)

        uid_message = (
            "Loaded LogisticRegression instance uid (%s) "
            "did not match Param's uid (%s)" % (lr2.uid, lr2.maxIter.parent)
        )
        self.assertEqual(lr2.uid, lr2.maxIter.parent, uid_message)

        defaults_message = (
            "Loaded LogisticRegression instance default params did not match "
            + "original defaults"
        )
        self.assertEqual(
            lr._defaultParamMap[lr.maxIter],
            lr2._defaultParamMap[lr2.maxIter],
            defaults_message,
        )
    finally:
        # Deleting the scratch directory is best-effort; ignore OS-level
        # failures so they never hide the real test outcome.
        try:
            rmtree(path)
        except OSError:
            pass
# ----------------------------------------------------------------------
## Model training and prediction
# Trains a logistic regression on `train_2`, reports wall-clock training time,
# persists the fitted model, then scores `test_2` with ROC-AUC and PR-AUC.
start_time = time.time()
print('model training start at: ', datetime.datetime.now().strftime("%I:%M%p on %B %d, %Y"))
lr = LogisticRegression(labelCol="OUTCOME", featuresCol="features", maxIter=100)

### Fit the model on training data.
trained_model_lr = lr.fit(train_2)
print('model training completed at: ', datetime.datetime.now().strftime("%I:%M%p on %B %d, %Y"))

# Break the elapsed seconds down into hours:minutes:seconds for display.
m, s = divmod(time.time() - start_time, 60)
h, m = divmod(m, 60)
print('model training run time: %d:%02d:%02d' % (h, m, s))

# Persist the *fitted* model. Saving `lr` here would only store the unfitted
# estimator's parameters and discard the result of the training run above.
# Reload with LogisticRegressionModel.load("Benchmark/trained_model/").
trained_model_lr.save("Benchmark/trained_model/")

# V. Make predictions on test data
pred_test = trained_model_lr.transform(test_2)
evaluator = BinaryClassificationEvaluator(labelCol="OUTCOME", rawPredictionCol="rawPrediction")
# The same evaluator is reused; the metric is overridden per call.
auroc = evaluator.evaluate(pred_test, {evaluator.metricName: "areaUnderROC"})
aupr = evaluator.evaluate(pred_test, {evaluator.metricName: "areaUnderPR"})
print("The ROC_AUC is %.4f and the PR_AUC is %.4f" % (auroc, aupr))
# Recorded output from a previous run:
# The ROC_AUC is 0.8365 and the PR_AUC is 0.3634
# NOTE(review): this fragment starts mid-statement. The head of the call chain
# ending in `.csv(...)` is outside the visible source; presumably it is a
# reader whose result is bound to `data` (used below) — confirm in the full file.
'header', 'true').csv('C:/Users/mrupv/bits/spa/Assignment2/paysim1/train.csv')

# Cast the balance and amount columns to double and `step` / `isFraud` to int.
df = data.withColumn("oldbalanceOrg",data["oldbalanceOrg"].cast("double"))\
    .withColumn("newbalanceOrig",data["newbalanceOrig"].cast("double"))\
    .withColumn("oldbalanceDest",data["oldbalanceDest"].cast("double"))\
    .withColumn("newbalanceDest",data["newbalanceDest"].cast("double")) \
    .withColumn("step",data["step"].cast("int")) \
    .withColumn("amount",data["amount"].cast("double")) \
    .withColumn("isFraud",data["isFraud"].cast("int"))

# One StringIndexer per string column; each writes a `<name>_index` column.
# NOTE(review): `nameOrig` / `nameDest` look like per-account identifiers, so
# their indices may be very high-cardinality features — confirm this is intended.
type_indexer = StringIndexer(inputCol='type', outputCol='type_index')
orig_indexer = StringIndexer(inputCol='nameOrig', outputCol='nameOrig_index')
dest_indexer = StringIndexer(inputCol='nameDest', outputCol='nameDest_index')

# Combine the numeric and indexed columns into the single `features` vector
# that the classifier reads.
assembler = VectorAssembler(inputCols=[
    'step', 'type_index', 'amount', 'nameOrig_index', 'oldbalanceOrg',
    'newbalanceOrig', 'nameDest_index', 'oldbalanceDest', 'newbalanceDest'
], outputCol='features')

# Here `model` is the unfitted LogisticRegression estimator (label: `isFraud`).
model = LogisticRegression(featuresCol='features', labelCol='isFraud')
pipeline = Pipeline(
    stages=[type_indexer, orig_indexer, dest_indexer, assembler, model])

# `model` is rebound: from here on it is the result of fitting the whole
# pipeline on `df`, not the bare estimator created above.
model = pipeline.fit(df)
# Disabled: scoring the training data with the fitted pipeline.
#output_df = model.transform(df)
# Persist the fitted pipeline under the relative path 'model/'.
model.save('model/')
# NOTE(review): this fragment starts inside a list literal. The opening of the
# statement is outside the visible source; presumably it builds the `bdf`
# DataFrame used below — confirm in the full file.
Row(label=0.0, weight=2.0, features=Vectors.dense(1.0, 2.0)),
Row(label=1.0, weight=3.0, features=Vectors.dense(2.0, 1.0)),
Row(label=0.0, weight=4.0, features=Vectors.dense(3.0, 3.0))
]).toDF()

# Fit a logistic regression with regParam=0.01, using the `weight` column as
# per-row instance weights.
blor = LogisticRegression(regParam=0.01, weightCol="weight")
blorModel = blor.fit(bdf)
# Bare attribute expressions: evaluated but not printed or stored when this
# runs as a script (they only display in an interactive session).
blorModel.coefficients
blorModel.intercept

# Score a single sparse 2-dimensional feature vector; the resulting prediction
# is likewise not printed or stored.
test1 = sc.parallelize([Row(features=Vectors.sparse(2, [0], [1.0]))]).toDF()
blorModel.transform(test1).head().prediction

# Hard-coded Windows output directory; the trailing separator lets the
# sub-paths below be formed by plain string concatenation.
save_path = "C:\\PySpark\\spark_ml\\saved_models\\logistic_regression_example_1\\"
estimator_path = save_path + "lr"
# Save the estimator
blor.save(estimator_path)
# Reload the estimator and read back its regParam (value is discarded).
lr2 = LogisticRegression.load(estimator_path)
lr2.getRegParam()

# Save the fitted model (separate location from the estimator above).
model_path = save_path + "lr_model"
blorModel.save(model_path)
from pyspark.ml.classification import LogisticRegressionModel
model2 = LogisticRegressionModel.load(model_path)

# Round-trip check: only the first coefficient and the intercept are compared.
print(blorModel.coefficients[0] == model2.coefficients[0])
print(blorModel.intercept == model2.intercept)
print(model2, blorModel)

spark.stop()