def evaluate_model(pipeline_model, data):
    """Score a fitted pipeline on ``data`` and return ``(AUC, AP)``.

    ``data`` is run through ``pipeline_model.transform`` and the resulting
    ``probability`` column is compared with the ``label`` column.

    Parameters
    ----------
    pipeline_model
        Fitted model exposing ``transform``.
    data
        Dataset to score.

    Returns
    -------
    tuple
        ``(areaUnderROC, areaUnderPR)`` as returned by the evaluator.
    """
    scorer = ev.BinaryClassificationEvaluator(
        rawPredictionCol="probability",
        labelCol="label",
    )
    scored = pipeline_model.transform(data)
    # One evaluator serves both metrics: the metric name is overridden per
    # call through the param map handed to ``evaluate``.
    roc_area, pr_area = (
        scorer.evaluate(scored, {scorer.metricName: metric})
        for metric in ('areaUnderROC', 'areaUnderPR')
    )
    return roc_area, pr_area
def GBDTclf(trainingData, testData):
    """Tune the module-level ``GBDT`` estimator with 3-fold CV and score it.

    ``maxDepth`` is searched over ``[1, 5, 10]``. The cross-validated model
    is applied to ``testData``, the predictions are written to
    ``res/predictedGBDT_spark.txt`` (one integer per line), and the labels
    and predictions are passed to ``evaluate``.

    Parameters
    ----------
    trainingData
        Dataset used to fit the cross-validator.
    testData
        Dataset the fitted model is applied to.

    Returns
    -------
    Whatever ``evaluate(label, predict)`` returns. The printed header
    suggests ``[accuracy, precision, recall, f1]`` -- TODO confirm against
    ``evaluate``.
    """
    max_depth = [1, 5, 10]
    grid = tune.ParamGridBuilder() \
        .addGrid(GBDT.maxDepth, max_depth) \
        .build()
    evaluator = ev.BinaryClassificationEvaluator(
        rawPredictionCol='probability', labelCol='label')

    # 3-fold validation
    cv = tune.CrossValidator(estimator=GBDT,
                             estimatorParamMaps=grid,
                             evaluator=evaluator,
                             numFolds=3)
    cvModel = cv.fit(trainingData)
    results = cvModel.transform(testData)

    # Collect both columns in ONE action. Two separate ``toPandas()`` calls
    # would run the transform twice and rely on both jobs returning rows in
    # the same order to keep labels and predictions aligned.
    collected = results.select("label", "prediction").toPandas()
    # Double brackets keep the (n, 1) shape a single-column ``.values`` had.
    label = collected[["label"]].values
    predict = collected[["prediction"]].values

    np.savetxt('res/predictedGBDT_spark.txt', predict, fmt='%01d')
    print("[accuracy,precision,recall,f1]")
    return evaluate(label, predict)
def _calc_auc_auprc(df: DataFrame, prob_col: str, label_col: str) -> Tuple[float, float]:
    r"""
    Compute area under the ROC curve and area under the precision-recall
    curve for one scored DataFrame.

    Parameters
    ----------
    df : pyspark.sql.DataFrame
        Frame holding both the score column and the label column.
    prob_col : str
        Name of the column passed to the evaluator as ``rawPredictionCol``
        (raw probabilities of being in class 1).
    label_col : str
        Name of the label column.

    Returns
    -------
    auc : float
        Area under the ROC curve.
    auprc : float
        Area under the precision-recall curve.

    Raises
    ------
    Nothing is caught here; evaluator errors propagate to the caller.

    See Also
    --------
    pyspark.ml.evaluation.BinaryClassificationEvaluator
    """
    def _area(metric_name):
        # A dedicated evaluator per metric, built and used immediately.
        return mle.BinaryClassificationEvaluator(
            rawPredictionCol=prob_col,
            labelCol=label_col,
            metricName=metric_name,
        ).evaluate(df)

    return _area('areaUnderROC'), _area('areaUnderPR')
def train_evaluate(train_data, test_data):
    """Fit a cross-validated random-forest pipeline and score it on test data.

    Pipeline stages: index ``alchemy_category`` -> one-hot encode it ->
    assemble it with ``train_data.columns[4:-1]`` into ``features`` ->
    5-fold ``CrossValidator`` around a ``RandomForestClassifier`` searched
    over impurity, maxDepth, maxBins and numTrees (2 * 3 * 3 * 3 = 54
    combinations), selected by area under ROC.

    Returns
    -------
    tuple
        ``(AUC, AP, best_parm, cv_pipeline_model)`` where ``AUC``/``AP`` come
        from ``evaluate_model`` on ``test_data`` and ``best_parm`` is what
        ``get_best_param`` returns for the fitted cross-validator stage.
    """
    # Convert the textual category feature to a numeric index, then one-hot.
    category_indexer = ft.StringIndexer(
        inputCol='alchemy_category',
        outputCol="alchemy_category_Index")
    category_encoder = ft.OneHotEncoder(
        dropLast=False,
        inputCol='alchemy_category_Index',
        outputCol="alchemy_category_IndexVec")

    feature_inputs = ['alchemy_category_IndexVec'] + train_data.columns[4:-1]
    feature_assembler = ft.VectorAssembler(
        inputCols=feature_inputs, outputCol="features")

    forest = cl.RandomForestClassifier(labelCol="label", featuresCol="features")
    roc_evaluator = ev.BinaryClassificationEvaluator(
        rawPredictionCol="probability",
        labelCol='label',
        metricName='areaUnderROC')

    # Same four grids, added in the same order as a chained builder would.
    search_space = (
        (forest.impurity, ["gini", "entropy"]),
        (forest.maxDepth, [5, 10, 15]),
        (forest.maxBins, [10, 15, 20]),
        (forest.numTrees, [10, 20, 30]),
    )
    grid_builder = tune.ParamGridBuilder()
    for param, candidates in search_space:
        grid_builder = grid_builder.addGrid(param, candidates)
    param_grid = grid_builder.build()

    forest_cv = tune.CrossValidator(
        estimator=forest,
        estimatorParamMaps=param_grid,
        evaluator=roc_evaluator,
        numFolds=5)

    stages = [category_indexer, category_encoder, feature_assembler, forest_cv]
    cv_pipeline_model = Pipeline(stages=stages).fit(train_data)

    # The last fitted stage is the cross-validator's model.
    best_parm = get_best_param(cv_pipeline_model.stages[-1])
    AUC, AP = evaluate_model(cv_pipeline_model, test_data)
    return AUC, AP, best_parm, cv_pipeline_model
# Spark ML's evaluation submodule, under the alias used below.
import pyspark.ml.evaluation as evals

# Binary-classification evaluator that reports area under the ROC curve.
evaluator = evals.BinaryClassificationEvaluator(
    metricName="areaUnderROC",
)
def main(path_data, path_parameters, dir_models):
    """Train a random-forest churn pipeline and persist it with its metrics.

    Steps: read parquet from ``path_data``; split 70/30 into training and
    validation sets; read ``feature_columns`` and ``rf_params`` from the JSON
    file at ``path_parameters``; fit ``VectorAssembler -> RandomForestClassifier``
    (label column ``churn``); compute accuracy, weighted precision, weighted
    recall, F1 and area under ROC on both splits; write everything under
    ``dir_models``.

    Files written to ``dir_models`` (created if missing):
    ``pipeline_model`` (saved pipeline), ``metrics_train.json``,
    ``metrics_validation.json`` and ``model_description.json``.

    Parameters
    ----------
    path_data
        Location of the parquet dataset.
    path_parameters
        Path to a JSON file with keys ``feature_columns`` and ``rf_params``.
    dir_models
        Output directory.
    """
    log = logging.getLogger(__name__)

    session_builder = pyspark.sql.SparkSession.builder
    session_builder = session_builder.appName(
        "Python Spark Random Forest model training")
    session = session_builder.enableHiveSupport().getOrCreate()

    log.info("Reading parquet data and splitting into test and train datasets")
    training_df, validation_df = session.read.parquet(path_data).randomSplit([0.7, 0.3])

    log.info("Constructing pipeline for prediction model")
    with open(path_parameters) as json_file:
        parameters = json.load(json_file)
    feature_columns = parameters['feature_columns']
    rf_params = parameters['rf_params']

    stages = [
        feature.VectorAssembler(inputCols=feature_columns, outputCol="features"),
        classification.RandomForestClassifier(labelCol="churn", **rf_params),
    ]
    untrained = pipeline.Pipeline(stages=stages)

    log.info("Training prediction model")
    pipeline_model = untrained.fit(training_df)

    log.info("Calculating model metrics")
    train_scored = pipeline_model.transform(training_df)
    validation_scored = pipeline_model.transform(validation_df)

    # Output key -> evaluator. Insertion order fixes the key order of the
    # metrics JSON files.
    multiclass_metrics = (
        ("accuracy", "accuracy"),
        ("precision", "weightedPrecision"),
        ("recall", "weightedRecall"),
        ("f1", "f1"),
    )
    evaluators = {
        key: evaluation.MulticlassClassificationEvaluator(
            metricName=metric_name, labelCol="churn", predictionCol="prediction")
        for key, metric_name in multiclass_metrics
    }
    evaluators["auroc"] = evaluation.BinaryClassificationEvaluator(
        metricName='areaUnderROC', labelCol="churn")

    log.info("Saving model and metrics data")
    train_metrics = {
        key: scorer.evaluate(train_scored) for key, scorer in evaluators.items()
    }
    validation_metrics = {
        key: scorer.evaluate(validation_scored) for key, scorer in evaluators.items()
    }

    # The classifier is the last fitted stage; record its parameters.
    forest_model = pipeline_model.stages[-1]
    model_description = {
        "name": "Random Forest",
        "params": {
            param.name: value
            for param, value in forest_model.extractParamMap().items()
        },
    }

    out_dir = pathlib.Path(dir_models)
    out_dir.mkdir(parents=True, exist_ok=True)

    pipeline_model.save(str(out_dir.joinpath("pipeline_model")))
    json_outputs = (
        ("metrics_train.json", train_metrics),
        ("metrics_validation.json", validation_metrics),
        ("model_description.json", model_description),
    )
    for file_name, payload in json_outputs:
        with open(out_dir.joinpath(file_name), "w") as f:
            json.dump(payload, f)
# 70/30 train/test split with a fixed seed.
train, test = data.randomSplit([0.7, 0.3], seed=1234)  # 42

# Binomial logistic regression with per-row weights. maxIter and regParam are
# not fixed here; the grid below supplies them.
lr_model = cl.LogisticRegression(
    elasticNetParam=0,
    family='binomial',
    threshold=0.5,
    weightCol='weight',
    labelCol='y',
)

# 4 maxIter values x 2 regParam values = 8 candidate settings.
grid = (
    tune.ParamGridBuilder()
    .addGrid(lr_model.maxIter, [200, 300, 500, 800])
    .addGrid(lr_model.regParam, [0.001, 0.002])
    .build()
)

evaluator = ev.BinaryClassificationEvaluator(
    rawPredictionCol='probability',
    labelCol='y',
)

cv = tune.CrossValidator(
    estimator=lr_model,
    estimatorParamMaps=grid,
    evaluator=evaluator,
    numFolds=3,
)

# The feature pipeline is fitted on the training split only, then reused for
# both splits.
ppline = Pipeline(stages=[featuerCreator])
train_transfomer = ppline.fit(train)

cv_model = cv.fit(train_transfomer.transform(train))

test = train_transfomer.transform(test)
results = cv_model.transform(test)
print('predict_results_type:', type(results))

# Report area under ROC, then area under PR, for the held-out split.
print(evaluator.evaluate(
    results, {evaluator.metricName: 'areaUnderROC'}))
print(evaluator.evaluate(
    results, {evaluator.metricName: 'areaUnderPR'}))
# Import the tuning submodule import pyspark.ml.tuning as tune from pyspark.sql.functions import udf, col # Create a LogisticRegression Estimator lr = LogisticRegression(maxIter=300) # Create Pipeline (need to drop the label) assembler = VectorAssembler( inputCols=df.select(modeling_vars).drop('label').columns, outputCol="features") pipeline = Pipeline(stages=[assembler, lr]) # Create a BinaryClassificationEvaluator evaluator = evals.BinaryClassificationEvaluator(metricName='areaUnderPR') # Create the parameter grid grid = tune.ParamGridBuilder() # Add the hyperparameter grid = grid.addGrid(lr.regParam, np.arange(0.00001, 1, 50)) # This does both l1 and l2 - list of 0 and 1 # NOTE - 1 = LASSO, 0 = Ridge regression grid = grid.addGrid(lr.elasticNetParam, [0, 1]) # Build the grid grid = grid.build() # Create the CrossValidator cv = tune.CrossValidator(estimator=pipeline,
# 70/30 split with a fixed seed.
births_train, births_test = births.randomSplit([0.7, 0.3], seed=666)


# In[10]:

# Fit on the training split and score the held-out split.
model = pipeline.fit(births_train)
test_model = model.transform(births_test)


# In[12]:

test_model.take(1)


# In[13]:

import pyspark.ml.evaluation as ev

evaluator = ev.BinaryClassificationEvaluator(
    rawPredictionCol='probability',
    labelCol='INFANT_ALIVE_AT_REPORT',
)

# Area under ROC first, then area under PR.
print(evaluator.evaluate(
    test_model, {evaluator.metricName: 'areaUnderROC'}))
print(evaluator.evaluate(
    test_model, {evaluator.metricName: 'areaUnderPR'}))


# In[14]:

# Persist the (unfitted) pipeline definition, replacing any earlier copy.
pipelinePath = './model/infant_oneHotEncoder_Logistic_Pipeline'
pipeline.write().overwrite().save(pipelinePath)


# In[15]:

# Reload the definition and check that it can still be fitted and applied.
loadedPipeline = Pipeline.load(pipelinePath)
loadedPipeline.fit(births_train).transform(births_test).take(1)


# In[16]:
def loadData4Validation(K=5):
    """Run ``K`` rounds of random hold-out validation and report the mean test AUC.

    Each round re-reads the training CSV, tags every row with a uniform
    random number, uses rows with ``random <= 0.8`` for training and rows
    with ``random > 0.8`` for testing, fits a ``RandomForestRegressor`` and
    scores both splits by area under ROC. Per-round AUCs go to ``log``; the
    mean test AUC is printed and logged at the end.

    Parameters
    ----------
    K : int, default 5
        Number of validation rounds. Must be at least 1.

    Raises
    ------
    ValueError
        If ``K`` is smaller than 1 (the mean would be undefined).

    Notes
    -----
    ``dataProcess`` and ``process01`` are defined elsewhere. ``process01``
    presumably maps the regressor output into the [0, 1] range -- TODO
    confirm.
    """
    if K < 1:
        raise ValueError("K must be at least 1")

    trainDataFile = '/user/mydata/tianchi/repeat_buyers_format2/train_format2.csv'
    log("start to validate.\n")

    # Loop invariants: the post-processing UDF, the evaluator and the metric
    # override do not depend on the round.
    udfProcessFloat201 = udf(process01, DoubleType())
    evaluator = ev.BinaryClassificationEvaluator(
        rawPredictionCol='prediction_final', labelCol='label')
    auc_metric = {evaluator.metricName: 'areaUnderROC'}

    aucTotal = 0
    for i in range(K):
        t1 = time.time()

        # Read the data and split it into disjoint train and test sets.
        df = sqlContext.read.csv(trainDataFile,
                                 header='true',
                                 inferSchema='true',
                                 sep=',').repartition(1000)
        df = df.withColumn('random', rand())
        train_df = df.filter("random<=0.8").repartition(1000)
        # The test filter must be the complement of the train filter; a
        # threshold of 0.2 here would put most training rows in the test set.
        test_df = df.filter("random>0.8").repartition(1000)

        # Training phase
        trainData = dataProcess(train_df, mode='validation')
        log("data preprocessing costs " + str(time.time() - t1) + ".\n")

        log('training it\n')
        clf = RandomForestRegressor(subsamplingRate=0.7,
                                    numTrees=50,
                                    featureSubsetStrategy='0.5')
        model = clf.fit(trainData)
        # Elapsed time is measured from the start of the round.
        log('training cost ' + str(time.time() - t1) + 's\n')

        # Testing phase
        testData = dataProcess(test_df, mode='validation')
        log('transforming them\n')
        train_prediction = model.transform(trainData)
        test_prediction = model.transform(testData)

        train_prediction = train_prediction.withColumn(
            'prediction_final',
            udfProcessFloat201(train_prediction.prediction))
        test_prediction = test_prediction.withColumn(
            'prediction_final',
            udfProcessFloat201(test_prediction.prediction))

        # Evaluate with Spark's binary-classification evaluator.
        print("#####evaluating#######\n\n\n\n\n\n\n\n\n\n\n")
        print("############\n\n\n\n\n\n\n\n\n\n\n")
        train_auc = evaluator.evaluate(train_prediction, auc_metric)
        test_auc = evaluator.evaluate(test_prediction, auc_metric)
        log(
            str(i) + " epoch auc is " + str(test_auc) + ', training auc is ' +
            str(train_auc) + '\n')
        t2 = time.time()
        print("############time cost is " + str(t2 - t1) +
              "\n\n\n\n\n\n\n\n\n\n\n")

        aucTotal += test_auc

    # Average over the rounds actually run.
    print("av auc is ", aucTotal / K)
    log(str(aucTotal / K) + '\n')
    print("############\n\n\n\n\n\n\n\n\n\n\n")
# Bring everything together: 3-fold cross-validation over ``param_grid``,
# selecting by ``eval_f1``.
validator = smt.CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=param_grid,
    evaluator=eval_f1,
    numFolds=3,
)

# Fit the model to the data
#######################################################################'
model = validator.fit(train)
train_predictions = model.transform(train)
val_predictions = model.transform(val)

# Evaluate model performance
# NOTE(review): the ROC evaluator reads 'predictedLabel' as its raw
# prediction column -- confirm that column holds numeric scores.
eval_roc = sme.BinaryClassificationEvaluator(
    labelCol='label',
    rawPredictionCol='predictedLabel',
    metricName='areaUnderROC',
)

# The three multiclass evaluators differ only in their metric name.
eval_accuracy, eval_precision, eval_recall = (
    sme.MulticlassClassificationEvaluator(
        labelCol='label',
        predictionCol='predictedLabel',
        metricName=metric_name,
    )
    for metric_name in ('accuracy', 'weightedPrecision', 'weightedRecall')
)
dt_pipe_md = pipeline.Pipeline(stages=[assembler, classific]) dt_pipe_md_model = dt_pipe_md.fit(training_df) train_predictions_df = dt_pipe_md_model.transform(training_df) validation_predictions_df = dt_pipe_md_model.transform(validation_df) test_prediction_df = dt_pipe_md_model.transform(test_df) accuracy_evaluator = evaluation.MulticlassClassificationEvaluator( metricName="accuracy", labelCol="churn", predictionCol="prediction") precision_evaluator = evaluation.MulticlassClassificationEvaluator( metricName="weightedPrecision", labelCol="churn") recall_evaluator = evaluation.MulticlassClassificationEvaluator( metricName="weightedRecall", labelCol="churn", predictionCol="prediction") auroc_evaluator = evaluation.BinaryClassificationEvaluator( metricName='areaUnderROC', labelCol="churn") f1_evaluator = evaluation.MulticlassClassificationEvaluator( metricName="f1", labelCol="churn", predictionCol="prediction") train_metrics = { "accuracy": accuracy_evaluator.evaluate(train_predictions_df), "precision": precision_evaluator.evaluate(train_predictions_df), "recall": recall_evaluator.evaluate(train_predictions_df), "f1": f1_evaluator.evaluate(train_predictions_df), "auroc": auroc_evaluator.evaluate(train_predictions_df) } test_metrics = { "accuracy": accuracy_evaluator.evaluate(test_prediction_df), "precision": precision_evaluator.evaluate(test_prediction_df),
def hyper_parameter_optimization_ml():
    """Grid-search a logistic regression on the births data with cross-validation.

    Reads ``dataset/births_transformed.csv.gz`` with an explicit schema,
    one-hot encodes ``BIRTH_PLACE``, assembles the features, and splits
    70/30 (seed 666). A ``CrossValidator`` searches ``maxIter`` in
    ``[2, 10, 50]`` and ``regParam`` in ``[0.01, 0.05, 0.3]``. The function
    prints area under ROC and area under PR on the held-out split, then the
    parameter setting with the highest average cross-validation metric.
    """
    session = SparkSession.builder.appName('hyper-parameter-optimization-ml').getOrCreate()
    session.sparkContext.setLogLevel('WARN')

    target = 'INFANT_ALIVE_AT_REPORT'
    # Integer-typed predictor columns, in file order (after the target and
    # BIRTH_PLACE).
    numeric_features = [
        'MOTHER_AGE_YEARS',
        'FATHER_COMBINED_AGE',
        'CIG_BEFORE',
        'CIG_1_TRI',
        'CIG_2_TRI',
        'CIG_3_TRI',
        'MOTHER_HEIGHT_IN',
        'MOTHER_PRE_WEIGHT',
        'MOTHER_DELIVERY_WEIGHT',
        'MOTHER_WEIGHT_GAIN',
        'DIABETES_PRE',
        'DIABETES_GEST',
        'HYP_TENS_PRE',
        'HYP_TENS_GEST',
        'PREV_BIRTH_PRETERM',
    ]
    column_types = [
        (target, types.IntegerType()),
        ('BIRTH_PLACE', types.StringType()),
    ]
    column_types.extend((name, types.IntegerType()) for name in numeric_features)
    schema = types.StructType(
        [types.StructField(name, dtype, False) for name, dtype in column_types])

    births = session.read.csv('dataset/births_transformed.csv.gz', header=True, schema=schema)

    # Create transformers.
    births = births.withColumn('BIRTH_PLACE_INT', births['BIRTH_PLACE'].cast(types.IntegerType()))

    # One-hot encode the integer-cast BIRTH_PLACE column.
    encoder = ml_feature.OneHotEncoder(inputCol='BIRTH_PLACE_INT', outputCol='BIRTH_PLACE_VEC')
    featuresCreator = ml_feature.VectorAssembler(
        inputCols=numeric_features + [encoder.getOutputCol()],
        outputCol='features')

    # Split the dataset into training and testing datasets.
    births_train, births_test = births.randomSplit([0.7, 0.3], seed=666)

    # Transform-only pipeline, fitted on the training split.
    data_transformer = Pipeline(stages=[encoder, featuresCreator]).fit(births_train)

    # Model and the parameter values to loop through.
    logistic = ml_classification.LogisticRegression(labelCol=target)
    grid_builder = tune.ParamGridBuilder()
    grid_builder = grid_builder.addGrid(logistic.maxIter, [2, 10, 50])
    grid_builder = grid_builder.addGrid(logistic.regParam, [0.01, 0.05, 0.3])
    grid = grid_builder.build()

    # How candidate models are compared.
    evaluator = ml_eval.BinaryClassificationEvaluator(
        rawPredictionCol='probability', labelCol=target)

    # Cross-validation over the grid.
    cv = tune.CrossValidator(estimator=logistic, estimatorParamMaps=grid, evaluator=evaluator)
    cvModel = cv.fit(data_transformer.transform(births_train))

    # Score the selected model on the held-out split.
    held_out = data_transformer.transform(births_test)
    scored = cvModel.transform(held_out)
    print(evaluator.evaluate(scored, {evaluator.metricName: 'areaUnderROC'}))
    print(evaluator.evaluate(scored, {evaluator.metricName: 'areaUnderPR'}))

    # Pair every parameter map with its average metric; print the best pair.
    ranked = [
        ([{param.name: value} for param, value in param_map.items()], avg_metric)
        for param_map, avg_metric in zip(cvModel.getEstimatorParamMaps(), cvModel.avgMetrics)
    ]
    ranked = sorted(ranked, key=lambda entry: entry[1], reverse=True)
    print(ranked[0])
def train_validation_splitting_ml():
    """Tune a logistic regression on the births data with ``TrainValidationSplit``.

    Reads ``dataset/births_transformed.csv.gz`` with an explicit schema,
    one-hot encodes ``BIRTH_PLACE``, assembles the features, keeps the top
    five via ``ChiSqSelector``, and splits 70/30 (seed 666). A
    ``TrainValidationSplit`` searches ``maxIter`` in ``[2, 10, 50]`` and
    ``regParam`` in ``[0.01, 0.05, 0.3]``. Area under ROC and area under PR
    on the held-out split are printed.
    """
    session = SparkSession.builder.appName('train-validation-splitting-ml').getOrCreate()
    session.sparkContext.setLogLevel('WARN')

    target = 'INFANT_ALIVE_AT_REPORT'
    # Integer-typed predictor columns, in file order (after the target and
    # BIRTH_PLACE).
    numeric_features = [
        'MOTHER_AGE_YEARS',
        'FATHER_COMBINED_AGE',
        'CIG_BEFORE',
        'CIG_1_TRI',
        'CIG_2_TRI',
        'CIG_3_TRI',
        'MOTHER_HEIGHT_IN',
        'MOTHER_PRE_WEIGHT',
        'MOTHER_DELIVERY_WEIGHT',
        'MOTHER_WEIGHT_GAIN',
        'DIABETES_PRE',
        'DIABETES_GEST',
        'HYP_TENS_PRE',
        'HYP_TENS_GEST',
        'PREV_BIRTH_PRETERM',
    ]
    column_types = [
        (target, types.IntegerType()),
        ('BIRTH_PLACE', types.StringType()),
    ]
    column_types.extend((name, types.IntegerType()) for name in numeric_features)
    schema = types.StructType(
        [types.StructField(name, dtype, False) for name, dtype in column_types])

    births = session.read.csv('dataset/births_transformed.csv.gz', header=True, schema=schema)

    # Create transformers.
    births = births.withColumn('BIRTH_PLACE_INT', births['BIRTH_PLACE'].cast(types.IntegerType()))

    # One-hot encode the integer-cast BIRTH_PLACE column.
    encoder = ml_feature.OneHotEncoder(inputCol='BIRTH_PLACE_INT', outputCol='BIRTH_PLACE_VEC')
    featuresCreator = ml_feature.VectorAssembler(
        inputCols=numeric_features + [encoder.getOutputCol()],
        outputCol='features')

    # Split the dataset into training and testing datasets.
    births_train, births_test = births.randomSplit([0.7, 0.3], seed=666)

    # Keep only the top five features.
    selector = ml_feature.ChiSqSelector(
        numTopFeatures=5,
        featuresCol=featuresCreator.getOutputCol(),
        outputCol='selectedFeatures',
        labelCol=target,
    )

    # Transform-only pipeline, fitted on the training split.
    data_transformer = Pipeline(stages=[encoder, featuresCreator, selector]).fit(births_train)

    # The classifier consumes the selected features.
    logistic = ml_classification.LogisticRegression(
        labelCol=target, featuresCol='selectedFeatures')
    grid_builder = tune.ParamGridBuilder()
    grid_builder = grid_builder.addGrid(logistic.maxIter, [2, 10, 50])
    grid_builder = grid_builder.addGrid(logistic.regParam, [0.01, 0.05, 0.3])
    grid = grid_builder.build()

    # How candidate models are compared.
    evaluator = ml_eval.BinaryClassificationEvaluator(
        rawPredictionCol='probability', labelCol=target)

    # Single train/validation split over the grid.
    tvs = tune.TrainValidationSplit(estimator=logistic, estimatorParamMaps=grid, evaluator=evaluator)
    tvsModel = tvs.fit(data_transformer.transform(births_train))

    # Score the selected model on the held-out split.
    held_out = data_transformer.transform(births_test)
    scored = tvsModel.transform(held_out)
    print(evaluator.evaluate(scored, {evaluator.metricName: 'areaUnderROC'}))
    print(evaluator.evaluate(scored, {evaluator.metricName: 'areaUnderPR'}))
def infant_survival_ml():
    """Fit, evaluate, save and reload a logistic-regression pipeline on the births data.

    Reads ``dataset/births_transformed.csv.gz`` with an explicit schema,
    builds the pipeline ``OneHotEncoder -> VectorAssembler ->
    LogisticRegression`` (``maxIter=10``, ``regParam=0.01``), fits it on a
    70% split (seed 666) and scores the remaining 30%. Prints one scored
    row, area under ROC and area under PR.

    It then saves the unfitted pipeline to
    ``./infant_oneHotEncoder_Logistic_Pipeline`` and the fitted model to
    ``./infant_oneHotEncoder_Logistic_PipelineModel`` (both overwritten if
    present), reloads each, and prints one row scored by the reloaded model.
    """
    spark = SparkSession.builder.appName('infant-survival-ml').getOrCreate()
    spark.sparkContext.setLogLevel('WARN')

    labels = [
        ('INFANT_ALIVE_AT_REPORT', types.IntegerType()),
        ('BIRTH_PLACE', types.StringType()),
        ('MOTHER_AGE_YEARS', types.IntegerType()),
        ('FATHER_COMBINED_AGE', types.IntegerType()),
        ('CIG_BEFORE', types.IntegerType()),
        ('CIG_1_TRI', types.IntegerType()),
        ('CIG_2_TRI', types.IntegerType()),
        ('CIG_3_TRI', types.IntegerType()),
        ('MOTHER_HEIGHT_IN', types.IntegerType()),
        ('MOTHER_PRE_WEIGHT', types.IntegerType()),
        ('MOTHER_DELIVERY_WEIGHT', types.IntegerType()),
        ('MOTHER_WEIGHT_GAIN', types.IntegerType()),
        ('DIABETES_PRE', types.IntegerType()),
        ('DIABETES_GEST', types.IntegerType()),
        ('HYP_TENS_PRE', types.IntegerType()),
        ('HYP_TENS_GEST', types.IntegerType()),
        ('PREV_BIRTH_PRETERM', types.IntegerType()),
    ]
    # Every field is declared non-nullable.
    schema = types.StructType([types.StructField(e[0], e[1], False) for e in labels])

    births = spark.read.csv('dataset/births_transformed.csv.gz', header=True, schema=schema)

    # Create transformers.
    births = births.withColumn('BIRTH_PLACE_INT', births['BIRTH_PLACE'].cast(types.IntegerType()))

    # Encode the BIRTH_PLACE column using the OneHotEncoder method.
    encoder = ml_feature.OneHotEncoder(inputCol='BIRTH_PLACE_INT', outputCol='BIRTH_PLACE_VEC')

    # Use the same ``ml_feature`` alias as the encoder above; the sibling
    # functions in this file do the same. labels[2:] skips the target and the
    # raw BIRTH_PLACE column.
    featuresCreator = ml_feature.VectorAssembler(
        inputCols=[col[0] for col in labels[2:]] + [encoder.getOutputCol()],
        outputCol='features')

    # Create a model.
    logistic = ml_classification.LogisticRegression(
        maxIter=10, regParam=0.01, labelCol='INFANT_ALIVE_AT_REPORT')

    # Create a pipeline.
    pipeline = Pipeline(stages=[encoder, featuresCreator, logistic])

    # Split the dataset into training and testing datasets.
    births_train, births_test = births.randomSplit([0.7, 0.3], seed=666)

    # Run the pipeline and estimate the model.
    model = pipeline.fit(births_train)
    test_model = model.transform(births_test)
    print(test_model.take(1))

    # Evaluate the performance of the model.
    evaluator = ml_eval.BinaryClassificationEvaluator(
        rawPredictionCol='probability', labelCol='INFANT_ALIVE_AT_REPORT')
    print(evaluator.evaluate(test_model, {evaluator.metricName: 'areaUnderROC'}))
    print(evaluator.evaluate(test_model, {evaluator.metricName: 'areaUnderPR'}))

    # Save the Pipeline definition.
    pipelinePath = './infant_oneHotEncoder_Logistic_Pipeline'
    pipeline.write().overwrite().save(pipelinePath)

    # Load the Pipeline definition and check it can still be fitted and
    # applied (the result is discarded).
    loadedPipeline = Pipeline.load(pipelinePath)
    loadedPipeline.fit(births_train).transform(births_test).take(1)

    # Save the PipelineModel.
    modelPath = './infant_oneHotEncoder_Logistic_PipelineModel'
    model.write().overwrite().save(modelPath)

    # Load the PipelineModel.
    loadedPipelineModel = PipelineModel.load(modelPath)
    test_reloadedModel = loadedPipelineModel.transform(births_test)
    print(test_reloadedModel.take(1))
data_piped = pipe.fit(data).transform(data) data_piped.show() data_piped = data_piped.select('features', 'Survived') # splitting into train, test set tr, te = data_piped.randomSplit([.7, .3]) # fitting models from pyspark.ml.classification import LogisticRegression lr = LogisticRegression(featuresCol = 'features', labelCol = 'Survived') model = lr.fit(tr) pred = model.transform(te) import pyspark.ml.evaluation as evals evaluator = evals.BinaryClassificationEvaluator(rawPredictionCol = 'prediction', labelCol = 'Survived') AUC = evaluator.evaluate(pred) AUC ############# model tunning from pyspark.ml.tuning import ParamGridBuilder params = ParamGridBuilder() params = params.addGrid(lr.regParam, np.arange(0, .1, .01)) params = params.addGrid(lr.elasticNetParam, [0, .5, 1]) params = params.build() print("Number of models to be tested:", len(params)) # create the CrossValidator from pyspark.ml.tuning import CrossValidator cv = CrossValidator(estimator = lr,