def run_ML_gbt_crossValidation(model, train, test, df, name):
    """Tune ``model`` over maxDepth / maxIter with 5-fold cross validation.

    A 4 x 4 grid (``maxDepth`` in 1/10/20/30, ``maxIter`` in 7/10/14/18) is
    searched and every candidate is scored with a default
    ``BinaryClassificationEvaluator``.

    Args:
        model: Estimator exposing ``maxDepth`` and ``maxIter`` params.
        train: DataFrame used to fit the cross validator; the returned
            predictions are also computed on it.
        test: Not used by this function.
        df: pandas-style frame with a ``Name`` column. It is copied, not
            modified in place.
        name: ``Name`` value selecting the row(s) to annotate.

    Returns:
        ``(predictions, best_model, result)``: the best model applied to
        ``train``, the best model itself, and the copy of ``df`` whose
        ``Best_Param`` cell is filled in for each row matching ``name``.
    """
    param_maps = (
        tune.ParamGridBuilder()
        .addGrid(model.maxDepth, [1, 10, 20, 30])
        .addGrid(model.maxIter, [7, 10, 14, 18])
        .build()
    )
    cross_validator = tune.CrossValidator(
        estimator=model,
        estimatorParamMaps=param_maps,
        evaluator=BinaryClassificationEvaluator(),
        numFolds=5,
    )
    best = cross_validator.fit(train).bestModel

    # Record the winning hyperparameters on the row(s) describing this model.
    annotated = df.copy()
    for idx, row in annotated.iterrows():
        if row['Name'] != name:
            continue
        # The values are read from the wrapped JVM model object.
        annotated.at[idx, 'Best_Param'] = "maxDepth: {} - maxIter: {}".format(
            best._java_obj.getMaxDepth(), best._java_obj.getMaxIter())

    return best.transform(train), best, annotated
예제 #2
0
def GBDTclf(trainingData, testData):
    """Cross-validate the module-level ``GBDT`` estimator and score it on test data.

    ``maxDepth`` is searched over 1/5/10 with 3-fold cross validation, using
    a ``BinaryClassificationEvaluator`` that reads the ``probability`` and
    ``label`` columns. The fitted cross-validation model is applied to
    ``testData``; the predictions are written to
    ``res/predictedGBDT_spark.txt`` and then scored.

    Args:
        trainingData: DataFrame used to fit the cross validator.
        testData: DataFrame the fitted model is applied to.

    Returns:
        Whatever ``evaluate(label, predict)`` returns. The printed header
        suggests ``[accuracy, precision, recall, f1]``, but ``evaluate`` is
        defined elsewhere -- confirm against its definition.
    """
    grid = (
        tune.ParamGridBuilder()
        .addGrid(GBDT.maxDepth, [1, 5, 10])
        .build()
    )

    evaluator = ev.BinaryClassificationEvaluator(
        rawPredictionCol='probability', labelCol='label')

    # 3-fold validation
    cv = tune.CrossValidator(estimator=GBDT,
                             estimatorParamMaps=grid,
                             evaluator=evaluator,
                             numFolds=3)

    cvModel = cv.fit(trainingData)
    results = cvModel.transform(testData)

    # Collect labels and predictions with ONE Spark action. Two separate
    # toPandas() calls would score the test set twice and would pair up rows
    # coming from two independent jobs.
    collected = results.select("label", "prediction").toPandas()
    # Double brackets keep the (n, 1) column shape the downstream code receives.
    label = collected[["label"]].values
    predict = collected[["prediction"]].values

    np.savetxt('res/predictedGBDT_spark.txt', predict, fmt='%01d')
    print("[accuracy,precision,recall,f1]")
    return evaluate(label, predict)
예제 #3
0
def train_evaluate(train_data, test_data):
    """Fit a cross-validated random-forest pipeline and evaluate it.

    The pipeline indexes and one-hot encodes ``alchemy_category``, assembles
    it with ``train_data.columns[4:-1]`` into ``features``, and then runs a
    5-fold cross validation of a ``RandomForestClassifier`` scored by area
    under ROC.

    Args:
        train_data: DataFrame the pipeline is fitted on.
        test_data: DataFrame passed to ``evaluate_model``.

    Returns:
        ``(AUC, AP, best_parm, cv_pipeline_model)``: the two values returned
        by ``evaluate_model``, the result of ``get_best_param`` on the last
        fitted stage, and the fitted pipeline model.
    """
    indexed_col = "alchemy_category_Index"
    encoded_col = "alchemy_category_IndexVec"

    # Convert the textual categorical feature into numeric indices, then
    # one-hot encode those indices (keeping the last category).
    indexer = ft.StringIndexer(inputCol='alchemy_category',
                               outputCol=indexed_col)
    one_hot = ft.OneHotEncoder(dropLast=False,
                               inputCol=indexed_col,
                               outputCol=encoded_col)

    # Encoded category first, followed by columns 4 .. second-to-last.
    feature_cols = [encoded_col, *train_data.columns[4:-1]]
    assembler = ft.VectorAssembler(inputCols=feature_cols,
                                   outputCol="features")

    forest = cl.RandomForestClassifier(labelCol="label", featuresCol="features")

    auc_evaluator = ev.BinaryClassificationEvaluator(
        rawPredictionCol="probability",
        labelCol='label',
        metricName='areaUnderROC')

    # 2 x 3 x 3 x 3 candidate settings for the forest.
    builder = tune.ParamGridBuilder()
    search_space = (
        (forest.impurity, ["gini", "entropy"]),
        (forest.maxDepth, [5, 10, 15]),
        (forest.maxBins, [10, 15, 20]),
        (forest.numTrees, [10, 20, 30]),
    )
    for param, candidates in search_space:
        builder = builder.addGrid(param, candidates)
    param_maps = builder.build()

    forest_cv = tune.CrossValidator(estimator=forest,
                                    estimatorParamMaps=param_maps,
                                    evaluator=auc_evaluator,
                                    numFolds=5)

    stages = [indexer, one_hot, assembler, forest_cv]
    fitted_pipeline = Pipeline(stages=stages).fit(train_data)

    # The last fitted stage is the cross-validation result.
    best_parm = get_best_param(fitted_pipeline.stages[-1])

    AUC, AP = evaluate_model(fitted_pipeline, test_data)

    return AUC, AP, best_parm, fitted_pipeline
def run_ML_regression_crossValidation(model, train, test, df, name):
    """Tune regParam / elasticNetParam / maxIter with 5-fold cross validation.

    A grid of 10 x 2 x 3 candidate settings is searched and each candidate is
    compared through the cross-validation score of a default
    ``BinaryClassificationEvaluator``.

    Args:
        model: Estimator exposing ``regParam``, ``elasticNetParam`` and
            ``maxIter`` params.
        train: DataFrame used to fit the cross validator; the returned
            predictions are also computed on it.
        test: Not used by this function.
        df: pandas-style frame with a ``Name`` column. It is copied, not
            modified in place.
        name: ``Name`` value selecting the row(s) to annotate.

    Returns:
        ``(predictions, best_model, result)``: the best model applied to
        ``train``, the best model itself, and the copy of ``df`` whose
        ``Best_Param`` cell is filled in for each row matching ``name``.
    """
    # The evaluator is how the different candidate models are compared.
    evaluator = BinaryClassificationEvaluator()

    # Grid of values to search over when looking for the best hyperparameters.
    grid = tune.ParamGridBuilder()
    grid = grid.addGrid(model.regParam, np.arange(0, .1, .01))
    grid = grid.addGrid(model.elasticNetParam, [0, 1])
    grid = grid.addGrid(model.maxIter, [1, 5, 10])
    grid = grid.build()

    # Sub-models are deliberately NOT collected: only ``bestModel`` leaves
    # this function, so keeping every fold/candidate model on the driver
    # would only cost memory.
    cv = tune.CrossValidator(estimator=model,
                             estimatorParamMaps=grid,
                             evaluator=evaluator,
                             numFolds=5)

    # Fit cross validation models and keep the best one.
    models = cv.fit(train)
    bestModel = models.bestModel

    # Read the winning hyperparameters once from the wrapped JVM model
    # instead of once per matching row.
    java_model = bestModel._java_obj
    best_param = ("regParam: " + str(java_model.getRegParam())
                  + " - MaxIter: " + str(java_model.getMaxIter())
                  + " - elasticNetParam: " + str(java_model.getElasticNetParam()))

    # Record them on the row(s) describing this model.
    result = df.copy()
    for index, rows in result.iterrows():
        if rows['Name'] == name:
            result.at[index, 'Best_Param'] = best_param

    finalPredictions = bestModel.transform(train)
    return finalPredictions, bestModel, result
def run_ML_mpc_crossValidation(model, train, test, df, name):
    """Run 5-fold cross validation of ``model`` without a hyperparameter search.

    No hyperparameters are added to the grid; candidates are scored with a
    default ``MulticlassClassificationEvaluator``.

    Args:
        model: Estimator to cross-validate.
        train: DataFrame used to fit the cross validator; the returned
            predictions are also computed on it.
        test: Not used by this function.
        df: pandas-style frame with a ``Name`` column. It is copied, not
            modified in place.
        name: ``Name`` value selecting the row(s) to annotate.

    Returns:
        ``(predictions, best_model, result)``: the best model applied to
        ``train``, the best model itself, and the copy of ``df`` whose
        ``Best_Param`` cell is set to ``"unspecified"`` for each row
        matching ``name``.
    """
    cross_validator = tune.CrossValidator(
        estimator=model,
        estimatorParamMaps=tune.ParamGridBuilder().build(),
        evaluator=MulticlassClassificationEvaluator(),
        numFolds=5,
    )
    best = cross_validator.fit(train).bestModel

    # Nothing was tuned, so there are no best parameters to report.
    annotated = df.copy()
    for idx, row in annotated.iterrows():
        if row['Name'] != name:
            continue
        annotated.at[idx, 'Best_Param'] = "unspecified"

    return best.transform(train), best, annotated
def run_ML_dt_crossValidation(model, train, test, df, name):
    """Tune ``model`` over maxDepth / maxBins with 5-fold cross validation.

    A 2 x 3 grid (``maxDepth`` in 4/8, ``maxBins`` in 2/4/6) is searched and
    every candidate is scored with a default
    ``MulticlassClassificationEvaluator``.

    Args:
        model: Estimator exposing ``maxDepth`` and ``maxBins`` params.
        train: DataFrame used to fit the cross validator; the returned
            predictions are also computed on it.
        test: Not used by this function.
        df: pandas-style frame with a ``Name`` column. It is copied, not
            modified in place.
        name: ``Name`` value selecting the row(s) to annotate.

    Returns:
        ``(predictions, best_model, result)``: the best model applied to
        ``train``, the best model itself, and the copy of ``df`` whose
        ``Best_Param`` cell is filled in for each row matching ``name``.
    """
    param_maps = (
        tune.ParamGridBuilder()
        .addGrid(model.maxDepth, [4, 8])
        .addGrid(model.maxBins, [2, 4, 6])
        .build()
    )
    cross_validator = tune.CrossValidator(
        estimator=model,
        estimatorParamMaps=param_maps,
        evaluator=MulticlassClassificationEvaluator(),
        numFolds=5,
    )
    best = cross_validator.fit(train).bestModel

    # Record the winning hyperparameters on the row(s) describing this model.
    annotated = df.copy()
    for idx, row in annotated.iterrows():
        if row['Name'] != name:
            continue
        # The values are read from the wrapped JVM model object.
        annotated.at[idx, 'Best_Param'] = "maxDepth: {} - maxBins: {}".format(
            best._java_obj.getMaxDepth(), best._java_obj.getMaxBins())

    return best.transform(train), best, annotated
예제 #7
0
# Import the tuning submodule
import pyspark.ml.tuning as tune

# Hyperparameter grid: ten regParam values (0.00 .. 0.09) crossed with the
# two elasticNetParam settings.
grid = (
    tune.ParamGridBuilder()
    .addGrid(lr.regParam, np.arange(0, .1, .01))
    .addGrid(lr.elasticNetParam, [0, 1])
    .build()
)

# Cross validator that scores every grid entry with `evaluator`.
cv = tune.CrossValidator(
    estimator=lr,
    estimatorParamMaps=grid,
    evaluator=evaluator,
)

# Baseline: fit the estimator directly, without any tuning, and show it.
best_lr = lr.fit(training)
print(best_lr)

# Tuned: fit the cross validation models and keep the best one
# (this rebinds best_lr).
models = cv.fit(training)
best_lr = models.bestModel

# Predict the test set with the tuned model.
test_results = best_lr.transform(test)
예제 #8
0
        elasticNetParam=0,
        family='binomial',
        threshold=0.5,
        weightCol='weight',
        labelCol='y')

    # Search grid: 4 maxIter values x 2 regParam values for lr_model.
    grid = tune.ParamGridBuilder()\
        .addGrid(lr_model.maxIter,[200,300,500,800])\
        .addGrid(lr_model.regParam,[0.001,0.002])\
        .build()

    # Scores candidates from the 'probability' column against label column 'y'.
    evaluator = ev.BinaryClassificationEvaluator(
        rawPredictionCol='probability', labelCol='y')

    # 3-fold cross validation over the grid above.
    cv = tune.CrossValidator(estimator=lr_model,
                             estimatorParamMaps=grid,
                             evaluator=evaluator,
                             numFolds=3)

    # Single-stage feature pipeline, fitted on the training data only and
    # then reused to transform both the training and the test data.
    ppline = Pipeline(stages=[featuerCreator])
    train_transfomer = ppline.fit(train)

    cv_model = cv.fit(train_transfomer.transform(train))
    # NOTE: `test` is rebound to its transformed version from here on.
    test = train_transfomer.transform(test)
    results = cv_model.transform(test)
    print('predict_results_type:', type(results))
    # Report both metrics on the test predictions by overriding metricName
    # for each evaluate() call.
    print(evaluator.evaluate(results, {evaluator.metricName: 'areaUnderROC'}))
    print(evaluator.evaluate(results, {evaluator.metricName: 'areaUnderPR'}))

    best_param = [([{
        key.name: paramValues
    } for key, paramValues in zip(params.keys(), params.values())], metric)
# Follow the Sample_ML example
# Imported before first use: time() is called below to measure the fit.
from time import time

# Create the parameter grid
grid = tune.ParamGridBuilder()

# Add the hyperparameter: 50 evenly spaced regParam values in [0.00001, 1].
# np.linspace is required here -- np.arange(0.00001, 1, 50) interprets 50 as
# the step size and therefore yields the single value 0.00001.
# NOTE: together with elasticNetParam this makes 50 x 2 = 100 candidates,
# each fitted once per fold.
grid = grid.addGrid(lr.regParam, np.linspace(0.00001, 1, 50))
# This does both l1 and l2 - list of 0 and 1
# NOTE - 1 = LASSO, 0 = Ridge regression
grid = grid.addGrid(lr.elasticNetParam, [0, 1])

# Build the grid
grid = grid.build()

# Create the CrossValidator (5 folds) around the whole pipeline
cv = tune.CrossValidator(estimator=pipeline,
                         estimatorParamMaps=grid,
                         evaluator=evaluator,
                         numFolds=5)

# Alias the already prepared development / test / validation frames.
train = df_dev
test = df_test
valid = df_val

# Build the model and record the wall-clock fit duration in seconds.
t0 = time()
logit_models = cv.fit(train)
tt = time() - t0
# Bare expression: displays the duration when run as a notebook cell.
tt
예제 #10
0
from pyspark.ml import Pipeline
import pyspark.ml.evaluation as ev

# Feature-preparation pipeline, fitted once on the training data.
pipeline = Pipeline(stages=[featuresCreator, indexer])
data_transformer = pipeline.fit(data_train)

## Fit the model & model performance

# Use a BinaryClassificationEvaluator to assess model performance.
evaluator = ev.BinaryClassificationEvaluator(
    rawPredictionCol='probability',
    labelCol='label',
)

# Run 5-fold cross validation.
cv = tune.CrossValidator(
    estimator=logistic,
    estimatorParamMaps=grid,
    evaluator=evaluator,
    numFolds=5,
)

# Fit the model on the transformed training data, then predict on the
# transformed test set.
cvModel = cv.fit(data_transformer.transform(data_train))

prediction = cvModel.transform(data_transformer.transform(data_test))

results = prediction.select("id", "prediction", "probability", "label")

# View the prediction results (first 10 rows)
예제 #11
0
#     metricName='areaUnderROC'
# )
# F1 evaluator; it is the metric the cross validator optimises below.
eval_f1 = sme.MulticlassClassificationEvaluator(
    labelCol='label',
    predictionCol='predictedLabel',
    metricName='f1',
)

# Parameter grid for cross validation: 4 reducer sizes x 3 tree depths x
# 3 subsampling rates.
param_grid = (
    smt.ParamGridBuilder()
    .addGrid(reducer.k, [10, 20, 50, 75])
    .addGrid(classifier.maxDepth, [2, 5, 10])
    .addGrid(classifier.subsamplingRate, [0.1, 0.2, 0.3])
    .build()
)

# Bring everything together: 3-fold cross validation of the whole pipeline.
validator = smt.CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=param_grid,
    evaluator=eval_f1,
    numFolds=3,
)

# Fit the model to the data.
model = validator.fit(train)

# Predictions on the training and validation sets.
train_predictions = model.transform(train)
val_predictions = model.transform(val)

# Evaluate model performance

eval_roc = sme.BinaryClassificationEvaluator(
    labelCol='label',
    rawPredictionCol='predictedLabel',
    metricName='areaUnderROC',
)

eval_accuracy = sme.MulticlassClassificationEvaluator(
예제 #12
0
# Estimator built from the ADAG trainer, a fixed training configuration and
# the first entry of param_grid expanded as keyword arguments.
estimator = DistKeras(
    trainers.ADAG,
    dict(batch_size=256,
         communication_window=3,
         num_epoch=10,
         num_workers=50),
    **param_grid[0]
)

# Candidates are compared on R^2.
evaluator = evaluation.RegressionEvaluator(metricName='r2')

# 5-fold cross validation over every entry of param_grid.
cv_estimator = tuning.CrossValidator(
    estimator=estimator,
    estimatorParamMaps=param_grid,
    evaluator=evaluator,
    numFolds=5,
)

cv_model = cv_estimator.fit(df_train)

# Predictions of the fitted cross-validation model on both splits.
df_pred_train = cv_model.transform(df_train)
df_pred_test = cv_model.transform(df_test)


예제 #13
0
def hyper_parameter_optimization_ml():
	"""Grid-search a logistic regression on the births dataset and report results.

	Reads ``dataset/births_transformed.csv.gz`` with an explicit schema,
	one-hot encodes ``BIRTH_PLACE``, assembles the feature vector, and
	cross-validates ``LogisticRegression`` over ``maxIter`` x ``regParam``
	on a 70/30 train/test split. Prints the area under ROC and under PR on
	the test split, followed by the parameter set with the highest average
	cross-validation metric.
	"""
	spark = SparkSession.builder.appName('hyper-parameter-optimization-ml').getOrCreate()
	spark.sparkContext.setLogLevel('WARN')

	target = 'INFANT_ALIVE_AT_REPORT'
	# Integer-typed columns that follow BIRTH_PLACE; all are used as features.
	numeric_features = [
		'MOTHER_AGE_YEARS',
		'FATHER_COMBINED_AGE',
		'CIG_BEFORE',
		'CIG_1_TRI',
		'CIG_2_TRI',
		'CIG_3_TRI',
		'MOTHER_HEIGHT_IN',
		'MOTHER_PRE_WEIGHT',
		'MOTHER_DELIVERY_WEIGHT',
		'MOTHER_WEIGHT_GAIN',
		'DIABETES_PRE',
		'DIABETES_GEST',
		'HYP_TENS_PRE',
		'HYP_TENS_GEST',
		'PREV_BIRTH_PRETERM',
	]
	leading_fields = [
		types.StructField(target, types.IntegerType(), False),
		types.StructField('BIRTH_PLACE', types.StringType(), False),
	]
	numeric_fields = [
		types.StructField(column, types.IntegerType(), False)
		for column in numeric_features
	]
	schema = types.StructType(leading_fields + numeric_fields)
	births = spark.read.csv('dataset/births_transformed.csv.gz', header=True, schema=schema)

	# BIRTH_PLACE is read as a string; cast it to an integer column so it
	# can be one-hot encoded.
	births = births.withColumn('BIRTH_PLACE_INT', births['BIRTH_PLACE'].cast(types.IntegerType()))
	encoder = ml_feature.OneHotEncoder(inputCol='BIRTH_PLACE_INT', outputCol='BIRTH_PLACE_VEC')

	featuresCreator = ml_feature.VectorAssembler(
		inputCols=numeric_features + [encoder.getOutputCol()],
		outputCol='features',
	)

	# Seeded 70/30 split into training and testing datasets.
	births_train, births_test = births.randomSplit([0.7, 0.3], seed=666)

	# Purely transforming pipeline, fitted on the training split.
	data_transformer = Pipeline(stages=[encoder, featuresCreator]).fit(births_train)

	# The model and the 3 x 3 grid of parameters to loop through.
	logistic = ml_classification.LogisticRegression(labelCol=target)
	grid = (
		tune.ParamGridBuilder()
		.addGrid(logistic.maxIter, [2, 10, 50])
		.addGrid(logistic.regParam, [0.01, 0.05, 0.3])
		.build()
	)
	# How candidate models are compared.
	evaluator = ml_eval.BinaryClassificationEvaluator(rawPredictionCol='probability', labelCol=target)

	# The cross validator does the validation work.
	cv = tune.CrossValidator(estimator=logistic, estimatorParamMaps=grid, evaluator=evaluator)
	cvModel = cv.fit(data_transformer.transform(births_train))

	# Score the fitted model on the held-out test split.
	test_predictions = cvModel.transform(data_transformer.transform(births_test))
	for metric_name in ('areaUnderROC', 'areaUnderPR'):
		print(evaluator.evaluate(test_predictions, {evaluator.metricName: metric_name}))

	# Pair every tried parameter map with its average metric and print the
	# best-scoring pair.
	scored = [
		([{param.name: value} for param, value in param_map.items()], avg_metric)
		for param_map, avg_metric in zip(cvModel.getEstimatorParamMaps(), cvModel.avgMetrics)
	]
	print(sorted(scored, key=lambda pair: pair[1], reverse=True)[0])
예제 #14
0
########################
#---Machine Learning---#
########################

# Run the churn data through ml_pipeline.
piped = ml_pipeline(churn)

# Random 75% / 25% split into training and testing sets.
training, testing = piped.randomSplit([0.75, 0.25])

# Evaluator reporting the area under the ROC curve.
evaluator = evals.BinaryClassificationEvaluator(metricName="areaUnderROC")

# Logistic regression: the estimator, a cross validation object wrapping it
# (constructed here but not fitted in this section), and a direct fit.
lr = LogisticRegression()
cross_validation = tune.CrossValidator(estimator=lr, evaluator=evaluator)
fitted_model_lr = lr.fit(training)

# Decision tree: the estimator and a direct fit.
dt = DecisionTreeClassifier()
fitted_model_dt = dt.fit(training)

# Plot ROC for logistic regression
roc_plot(fitted_model_lr)

# Score both fitted models on the testing set with the evaluator above.
lr_accuracy = test_roc_performance(fitted_model_lr, testing, evaluator)
dt_accuracy = test_roc_performance(fitted_model_dt, testing, evaluator)