Example #1
# --------------------------------------------------------------- PARAMETER ESTIMATION ---------------------------------------------------------
# Parameter estimation and cross-validation


from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# Create ParamGrid for Cross Validation
paramGrid = (ParamGridBuilder()
             .addGrid(lr.regParam, [0.01, 0.5, 2.0])
             .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])
             .addGrid(lr.maxIter, [1, 5, 10])
             .build())

# Create 5-fold CrossValidator
cv = CrossValidator(estimator=lr, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=5)

# Run cross-validation
cvModel = cv.fit(trainingData)
# this will likely take a fair amount of time because of the number of models being created and tested

# Best parameters
cvModel.bestModel.extractParamMap()  

# Use test set to measure the accuracy of our model on new data
predictions = cvModel.transform(testData)

# cvModel uses the best model found from the Cross Validation
# Evaluate best model
evaluator.evaluate(predictions)
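
To see which grid combination won, the best model's hyperparameters can be read back. A minimal sketch, assuming `cvModel` from above and PySpark 3.x, where fitted models expose param getters:

# Hypothetical follow-up: read the winning hyperparameters from the best model
best_lr = cvModel.bestModel
print("regParam:", best_lr.getRegParam())
print("elasticNetParam:", best_lr.getElasticNetParam())
print("maxIter:", best_lr.getMaxIter())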
Example #2
# Define the machine learning model
classifier = LogisticRegression(maxIter=2,
                                labelCol="label",
                                featuresCol="features")

# Evaluator for binary classification
evaluator = BinaryClassificationEvaluator(labelCol="label",
                                          rawPredictionCol="rawPrediction")

grid = ParamGridBuilder().addGrid(classifier.regParam,
                                  [1.0, 2.0]).addGrid(classifier.maxIter,
                                                      [10, 15]).build()

cv = CrossValidator(estimator=classifier,
                    estimatorParamMaps=grid,
                    evaluator=evaluator)

model = cv.fit(trainingData)

predictions = model.transform(testData)

predictions.show()

print(evaluator.evaluate(predictions))
'''
model = classifier.fit(trainingData)

# print(model.coefficients)
predictions = model.transform(testData)
'''
    stages.append(
        VectorAssembler(
            inputCols=["idx_{0}".format(col) for col in input_columns],
            outputCol='features'))
    dec_tree = DecisionTreeRegressor(labelCol='label',
                                     featuresCol='features',
                                     maxDepth=5)

    stages.append(dec_tree)

    # Split the data into training and test sets (30% held out for testing)
    trainingData, testData = df.randomSplit([0.7, 0.3], seed=123)
    evaluator = RegressionEvaluator(metricName='rmse', labelCol='label')
    grid = ParamGridBuilder().addGrid(dec_tree.maxDepth, [3, 5, 7, 10]).build()
    cv = CrossValidator(estimator=dec_tree,
                        estimatorParamMaps=grid,
                        evaluator=evaluator,
                        numFolds=10)
    stages
    pipeline = Pipeline(stages=stages)

    model = pipeline.fit(trainingData)

    predictions = model.transform(testData)

    output = []

    # Note: DataFrame.show() prints to stdout and returns None, so collect the rows instead
    predictions.select("prediction", "label", "features").show(5)
    output.append(
        predictions.select("prediction", "label", "features").take(5))

    output.append(model.stages[-1])
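
Note that `cv` above is built but never fitted; the pipeline fits `dec_tree` directly, so the maxDepth grid is never searched. A hedged sketch of actually running the search, reusing the same `stages`, `grid`, and `evaluator` (the getter assumes PySpark 3.x; this is not the original code):

    # Sketch only: cross-validate the assembled pipeline instead of fitting dec_tree as a plain stage
    pipeline = Pipeline(stages=stages)
    cv = CrossValidator(estimator=pipeline,
                        estimatorParamMaps=grid,
                        evaluator=evaluator,
                        numFolds=10)
    cv_model = cv.fit(trainingData)
    predictions = cv_model.transform(testData)
    print("Best maxDepth:", cv_model.bestModel.stages[-1].getMaxDepth())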
Example #4
## |                      performing binary classification model.                                      |
## |                      How has this changed your performance metrics?                               |
## +---------------------------------------------------------------------------------------------------+

rf_cv = RandomForestClassifier(labelCol='genre', featuresCol='Features')
pipeline = Pipeline(stages=[assembler, rf_cv])

paramGrid = ParamGridBuilder() \
    .addGrid(rf_cv.numTrees, [10, 50, 100]) \
    .addGrid(rf_cv.maxDepth, [3, 6, 9]) \
    .build()

cv = CrossValidator(estimator=rf_cv,
                    estimatorParamMaps=paramGrid,
                    evaluator=BinaryClassificationEvaluator(
                        rawPredictionCol="rawPrediction",
                        labelCol='genre',
                        metricName="areaUnderROC"),
                    numFolds=5)

cv_model = cv.fit(training_updownsampled)

best_cv_model = cv_model.bestModel

print("Best Depth: ", best_cv_model._java_obj.getMaxDepth())
print("Good Number of Trees: ", best_cv_model._java_obj.getNumTrees())
''' output
Best Depth:  9
Good Number of Trees:  100
'''
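
To see how each grid point fared rather than only the winner, the averaged fold metrics can be paired with the parameter maps. A small sketch, assuming `cv_model` and `paramGrid` from above:

# Sketch: pair each parameter combination with its mean cross-validated AUC
for params, auc in zip(paramGrid, cv_model.avgMetrics):
    print({p.name: v for p, v in params.items()}, "->", auc)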
Example #5
kmFeatures.groupBy("label").count().show()
labelData = kmFeatures.drop('prediction')
labelData.show()

# In[167]:

from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
#lr = LogisticRegression(maxIter=10, regParam=0.01)
lr = LogisticRegression()
grid = ParamGridBuilder().addGrid(lr.maxIter, [10]).addGrid(
    lr.regParam, [0.01, 0.05, 1.00, 2.00]).build()
evaluator = BinaryClassificationEvaluator()
cv = CrossValidator(estimator=lr,
                    numFolds=10,
                    estimatorParamMaps=grid,
                    evaluator=evaluator)
cvModel = cv.fit(labelData)
evaluator.evaluate(cvModel.transform(labelData))

# In[173]:

cvModel.bestModel.intercept

# In[97]:

training, test = labelData.randomSplit([0.7, 0.3])
lr_model = lr.fit(training)

# In[129]:
Example #6
def train_models(saved_trained_models, sanitized_comments):
    models_dir = "models/"
    parquet_dir = "parquets/"
    if not saved_trained_models:
        # Initialize six logistic regression models.
        pos_lr = LogisticRegression(labelCol="trump_pos", featuresCol="vectors", maxIter=10)
        neg_lr = LogisticRegression(labelCol="trump_neg", featuresCol="vectors", maxIter=10)
        dem_pos_lr = LogisticRegression(labelCol="dem_pos", featuresCol="vectors", maxIter=10)
        dem_neg_lr = LogisticRegression(labelCol="dem_neg", featuresCol="vectors", maxIter=10)
        rep_pos_lr = LogisticRegression(labelCol="rep_pos", featuresCol="vectors", maxIter=10)
        rep_neg_lr = LogisticRegression(labelCol="rep_neg", featuresCol="vectors", maxIter=10)

        # This is a binary classifier so we need an evaluator that knows how to deal with binary classifiers.
        pos_evaluator = BinaryClassificationEvaluator(labelCol="trump_pos")
        neg_evaluator = BinaryClassificationEvaluator(labelCol="trump_neg")
        dem_pos_evaluator = BinaryClassificationEvaluator(labelCol="dem_pos")
        dem_neg_evaluator = BinaryClassificationEvaluator(labelCol="dem_neg")
        rep_pos_evaluator = BinaryClassificationEvaluator(labelCol="rep_pos")
        rep_neg_evaluator = BinaryClassificationEvaluator(labelCol="rep_neg")

        # There are a few parameters associated with logistic regression. We do not know what they are a priori.
        # We do a grid search to find the best parameters. We can replace [1.0] with a list of values to try.
        # We will assume the parameter is 1.0. Grid search takes forever.
        pos_param_grid = ParamGridBuilder().addGrid(pos_lr.regParam, [1.0]).build()
        neg_param_grid = ParamGridBuilder().addGrid(neg_lr.regParam, [1.0]).build()
        dem_pos_param_grid = ParamGridBuilder().addGrid(dem_pos_lr.regParam, [1.0]).build()
        dem_neg_param_grid = ParamGridBuilder().addGrid(dem_neg_lr.regParam, [1.0]).build()
        rep_pos_param_grid = ParamGridBuilder().addGrid(rep_pos_lr.regParam, [1.0]).build()
        rep_neg_param_grid = ParamGridBuilder().addGrid(rep_neg_lr.regParam, [1.0]).build()

        # We initialize a 5 fold cross-validation pipeline.
        pos_cross_val = CrossValidator(
            estimator=pos_lr,
            evaluator=pos_evaluator,
            estimatorParamMaps=pos_param_grid,
            numFolds=5)
        neg_cross_val = CrossValidator(
            estimator=neg_lr,
            evaluator=neg_evaluator,
            estimatorParamMaps=neg_param_grid,
            numFolds=5)
        dem_pos_cross_val = CrossValidator(
            estimator=dem_pos_lr,
            evaluator=dem_pos_evaluator,
            estimatorParamMaps=dem_pos_param_grid,
            numFolds=5)
        dem_neg_cross_val = CrossValidator(
            estimator=dem_neg_lr,
            evaluator=dem_neg_evaluator,
            estimatorParamMaps=dem_neg_param_grid,
            numFolds=5)
        rep_pos_cross_val = CrossValidator(
            estimator=rep_pos_lr,
            evaluator=rep_pos_evaluator,
            estimatorParamMaps=rep_pos_param_grid,
            numFolds=5)
        rep_neg_cross_val = CrossValidator(
            estimator=rep_neg_lr,
            evaluator=rep_neg_evaluator,
            estimatorParamMaps=rep_neg_param_grid,
            numFolds=5)

        # Split the data 50/50
        pos_train, pos_test = sanitized_comments.randomSplit([0.5, 0.5])
        neg_train, neg_test = sanitized_comments.randomSplit([0.5, 0.5])
        dem_pos_train, dem_pos_test = sanitized_comments.randomSplit([0.5, 0.5])
        dem_neg_train, dem_neg_test = sanitized_comments.randomSplit([0.5, 0.5])
        rep_pos_train, rep_pos_test = sanitized_comments.randomSplit([0.5, 0.5])
        rep_neg_train, rep_neg_test = sanitized_comments.randomSplit([0.5, 0.5])

        # Train the models
        print("Training positive classifier...")
        pos_model = pos_cross_val.fit(pos_train)
        print("Training negative classifier...")
        neg_model = neg_cross_val.fit(neg_train)
        print("Training positive democrat classifier...")
        dem_pos_model = dem_pos_cross_val.fit(dem_pos_train)
        print("Training negative democrat classifier...")
        dem_neg_model = dem_neg_cross_val.fit(dem_neg_train)
        print("Training positive republican classifier...")
        rep_pos_model = rep_pos_cross_val.fit(rep_pos_train)
        print("Training negative republican classifier...")
        rep_neg_model = rep_neg_cross_val.fit(rep_neg_train)

        # Once we train the models, we don't want to do it again. We can save the models and load them again later.
        pos_model.save(models_dir + "pos.model")
        neg_model.save(models_dir + "neg.model")
        dem_pos_model.save(models_dir + "dem_pos.model")
        dem_neg_model.save(models_dir + "dem_neg.model")
        rep_pos_model.save(models_dir + "rep_pos.model")
        rep_neg_model.save(models_dir + "rep_neg.model")

        # save testing data
        pos_test.write.parquet(os.path.join(script_dir, parquet_dir + "pos_test.parquet"))
        neg_test.write.parquet(os.path.join(script_dir, parquet_dir + "neg_test.parquet"))
        dem_pos_test.write.parquet(os.path.join(script_dir, parquet_dir + "dem_pos_test.parquet"))
        dem_neg_test.write.parquet(os.path.join(script_dir, parquet_dir + "dem_neg_test.parquet"))
        rep_pos_test.write.parquet(os.path.join(script_dir, parquet_dir + "rep_pos_test.parquet"))
        rep_neg_test.write.parquet(os.path.join(script_dir, parquet_dir + "rep_neg_test.parquet"))
    else:
        # load models
        pos_model = CrossValidatorModel.load(models_dir + "pos.model")
        neg_model = CrossValidatorModel.load(models_dir + "neg.model")
        dem_pos_model = CrossValidatorModel.load(models_dir + "dem_pos.model")
        dem_neg_model = CrossValidatorModel.load(models_dir + "dem_neg.model")
        rep_pos_model = CrossValidatorModel.load(models_dir + "rep_pos.model")
        rep_neg_model = CrossValidatorModel.load(models_dir + "rep_neg.model")
    return pos_model, neg_model, dem_pos_model, dem_neg_model, rep_pos_model, rep_neg_model
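
The six classifier blocks above differ only in the label column. A hedged refactor sketch (not the original code) that drives the same training from one loop:

# Sketch only: one CrossValidator per label column instead of six hand-written copies
label_cols = ["trump_pos", "trump_neg", "dem_pos", "dem_neg", "rep_pos", "rep_neg"]
models = {}
for label in label_cols:
    lr = LogisticRegression(labelCol=label, featuresCol="vectors", maxIter=10)
    grid = ParamGridBuilder().addGrid(lr.regParam, [1.0]).build()
    cv = CrossValidator(estimator=lr,
                        evaluator=BinaryClassificationEvaluator(labelCol=label),
                        estimatorParamMaps=grid,
                        numFolds=5)
    train_split, test_split = sanitized_comments.randomSplit([0.5, 0.5])
    models[label] = cv.fit(train_split)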
from pyspark.ml.evaluation import BinaryClassificationEvaluator
evaluator = BinaryClassificationEvaluator(rawPredictionCol="prediction", \
                    labelCol="LOANIndex",metricName="areaUnderROC")

#==============================================================================
# Training and Evaluation
#==============================================================================

from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

pipeline_gbt = Pipeline(stages=[gbt])
paramGrid_gbt = ParamGridBuilder() \
    .addGrid(gbt.maxDepth, [6, 10]) \
    .addGrid(gbt.maxIter, [3]) \
    .addGrid(gbt.maxBins, [32, 64]) \
    .addGrid(gbt.stepSize, [0.7, 0.5, 0.6]) \
    .build()

crossval_gbt = CrossValidator(estimator=pipeline_gbt,
                              estimatorParamMaps=paramGrid_gbt,
                              evaluator=evaluator,
                              numFolds=3)  # use 3+ folds in practice

# Run cross-validation, and returns the best model.
cvModel_gbt = crossval_gbt.fit(train)

# Now predict on the test set
predictionCV_gbt = cvModel_gbt.transform(test)
print(evaluator.evaluate(predictionCV_gbt))
#print ('It took', time.time()-start, 'seconds.')
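
To recover which of the twelve grid combinations won, the fitted GBT stage can be inspected. A hedged sketch, assuming `cvModel_gbt` from above and a PySpark version whose fitted models expose param getters:

# Sketch: the best model is a PipelineModel whose only stage is the fitted GBT classifier
best_gbt = cvModel_gbt.bestModel.stages[0]
print("maxDepth:", best_gbt.getMaxDepth())
print("maxBins:", best_gbt.getMaxBins())
print("stepSize:", best_gbt.getStepSize())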
Example #8
def main():
    parser = argparse.ArgumentParser(description='Pyspark Training')
    parser.add_argument(
        '--data',
        type=str,
        default="../../../data/sample_linear_regression_data.txt",
        help='Data location.')
    parser.add_argument('--cross_val',
                        action='store_true',
                        help='whether to use cross-validation')
    args = parser.parse_args()

    data = spark.read.format("libsvm").load(args.data)

    # Split the data into training and test sets (30% held out for testing)
    (train, test) = data.randomSplit([0.7, 0.3])

    # Train a RandomForest model.
    rf = RandomForestRegressor()

    # Create a grid of hyperparameters. Each combination will be tested.
    paramGrid = ParamGridBuilder()\
        .addGrid(rf.numTrees, [2, 25]) \
        .addGrid(rf.maxDepth, [2, 6])\
        .addGrid(rf.maxBins, [15, 30])\
        .build()

    if args.cross_val:
        # Run five-fold cross-validation to find the best hyperparameters.
        crossval = CrossValidator(estimator=rf,
                                  estimatorParamMaps=paramGrid,
                                  evaluator=RegressionEvaluator(
                                      labelCol="label",
                                      predictionCol="prediction",
                                      metricName="rmse"),
                                  numFolds=5)  # use 3+ folds in practice

        model = crossval.fit(train)
    else:
        # Grid search for best hyperparameters with a single validation set.
        tvs = TrainValidationSplit(
            estimator=rf,
            estimatorParamMaps=paramGrid,
            evaluator=RegressionEvaluator(labelCol="label",
                                          predictionCol="prediction",
                                          metricName="rmse"),
            # 80% of the data will be used for training, 20% for validation.
            trainRatio=0.8)

        # Run TrainValidationSplit, and choose the best set of parameters.
        model = tvs.fit(train)

    # Make predictions.
    predictions = model.transform(test)

    # Select (prediction, true label) and compute test error
    evaluator = RegressionEvaluator(labelCol="label",
                                    predictionCol="prediction",
                                    metricName="rmse")

    rmse = evaluator.evaluate(predictions)
    print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)
Example #9
# COMMAND ----------

from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

# COMMAND ----------

grid = ParamGridBuilder() \
  .addGrid(dtc.maxDepth, [2, 3, 4, 5, 6, 7, 8]) \
  .addGrid(dtc.maxBins, [2, 4, 8]) \
  .build()

# COMMAND ----------

cv = CrossValidator(estimator=pipeline,
                    evaluator=evaluator,
                    estimatorParamMaps=grid,
                    numFolds=3)

# COMMAND ----------

# MAGIC %md Run `CrossValidator`.  `CrossValidator` checks to see if an MLflow tracking server is available.  If so, it logs runs within MLflow:
# MAGIC
# MAGIC * Under the current active run, log info for `CrossValidator`.  (Create a new run if none are active.)
# MAGIC * For each submodel (number of folds of cross-validation x number of ParamMaps tested)
# MAGIC   * Log a run for this submodel, along with the evaluation metric on the held-out data.

# COMMAND ----------

# Explicitly create a new run.
# This allows this cell to be run multiple times.
# If you omit mlflow.start_run(), then this cell could run once,
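# A minimal sketch of what this cell presumably does (the `training` DataFrame name is an
# assumption; this is not the original notebook code):
import mlflow

with mlflow.start_run():
    cvModel = cv.fit(training)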
Example #10
prediction = pipelineModel.transform(train).select(
    F.col("mean_exam_points").cast("Float"),
    F.col("predicted_points").cast("Float"))
rmse = regressionEvaluator.evaluate(prediction)
print("RMSE is " + str(rmse))

pipelineModel.transform(train).show()

from pyspark.ml.tuning import CrossValidator, ParamGridBuilder, TrainValidationSplit

paramGrid = ParamGridBuilder()  \
    .addGrid(lr.regParam, [0.01, 0.1])\
    .build()

crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=regressionEvaluator,
                          numFolds=3)  # use 3+ folds in practice

# Run cross-validation, and choose the best set of parameters.
cvModel = crossval.fit(train)

prediction = cvModel.transform(train).select(
    F.col("mean_exam_points").cast("Float"),
    F.col("predicted_points").cast("Float"))

rmse = regressionEvaluator.evaluate(prediction)
print("RMSE is " + str(rmse))
# does not improve - RMSE is 14.0434860345
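
Both RMSE numbers above are computed on `train` itself. A hedged sketch of scoring on a held-out split instead, reusing the same `crossval` and `regressionEvaluator` (not the original code):

# Sketch only: hold out part of the data and score the cross-validated model on it
train_part, test_part = train.randomSplit([0.8, 0.2], seed=42)
cvModel = crossval.fit(train_part)
test_pred = cvModel.transform(test_part).select(
    F.col("mean_exam_points").cast("Float"),
    F.col("predicted_points").cast("Float"))
print("held-out RMSE is " + str(regressionEvaluator.evaluate(test_pred)))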
Example #11
def test_gen_estimator_metadata(spark_session):  # pylint: disable=unused-argument
    tokenizer1 = Tokenizer(inputCol="text1", outputCol="words1")
    hashingTF1 = HashingTF(inputCol=tokenizer1.getOutputCol(),
                           outputCol="features1")

    tokenizer2 = Tokenizer(inputCol="text2", outputCol="words2")
    hashingTF2 = HashingTF(inputCol=tokenizer2.getOutputCol(),
                           outputCol="features2")

    vecAssembler = VectorAssembler(inputCols=["features1", "features2"],
                                   outputCol="features")

    lor = LogisticRegression(maxIter=10)
    ova = OneVsRest(classifier=lor)
    sub_pipeline1 = Pipeline(stages=[tokenizer1, hashingTF1])
    sub_pipeline2 = Pipeline(stages=[tokenizer2, hashingTF2])
    sub_pipeline3 = Pipeline(stages=[vecAssembler, ova])

    paramGrid = (ParamGridBuilder().addGrid(lor.maxIter, [10, 20]).addGrid(
        lor.regParam, [0.1, 0.01]).build())
    eva = MulticlassClassificationEvaluator()
    crossval = CrossValidator(estimator=sub_pipeline3,
                              estimatorParamMaps=paramGrid,
                              evaluator=eva,
                              numFolds=2)

    top_pipeline = Pipeline(stages=[sub_pipeline1, sub_pipeline2, crossval])

    metadata = _gen_estimator_metadata(top_pipeline)

    expected_hierarchy = {
        "name":
        "Pipeline_1",
        "stages": [
            {
                "name": "Pipeline_2",
                "stages": [{
                    "name": "Tokenizer_1"
                }, {
                    "name": "HashingTF_1"
                }]
            },
            {
                "name": "Pipeline_3",
                "stages": [{
                    "name": "Tokenizer_2"
                }, {
                    "name": "HashingTF_2"
                }]
            },
            {
                "name": "CrossValidator",
                "evaluator": {
                    "name": "MulticlassClassificationEvaluator"
                },
                "tuned_estimator": {
                    "name":
                    "Pipeline_4",
                    "stages": [
                        {
                            "name": "VectorAssembler"
                        },
                        {
                            "name": "OneVsRest",
                            "classifier": {
                                "name": "LogisticRegression"
                            }
                        },
                    ],
                },
            },
        ],
    }
    assert metadata.hierarchy == expected_hierarchy
    assert metadata.uid_to_indexed_name_map == {
        top_pipeline.uid: "Pipeline_1",
        sub_pipeline1.uid: "Pipeline_2",
        tokenizer1.uid: "Tokenizer_1",
        hashingTF1.uid: "HashingTF_1",
        sub_pipeline2.uid: "Pipeline_3",
        tokenizer2.uid: "Tokenizer_2",
        hashingTF2.uid: "HashingTF_2",
        crossval.uid: "CrossValidator",
        sub_pipeline3.uid: "Pipeline_4",
        vecAssembler.uid: "VectorAssembler",
        ova.uid: "OneVsRest",
        lor.uid: "LogisticRegression",
        eva.uid: "MulticlassClassificationEvaluator",
    }
    assert (metadata.uid_to_indexed_name_map[
        metadata.param_search_estimators[0].uid] == "CrossValidator")
from pyspark.ml.evaluation import RegressionEvaluator

# Create an R-squared (r2) evaluator using the label and predicted columns
regEval = RegressionEvaluator(predictionCol="Predicted_EXP", labelCol="TOTAL_BENEFICIARY_AMT", metricName="r2")

# Run the evaluator on the DataFrame
r2 = regEval.evaluate(train_data_output)

print("Root Mean Squared Error: %.2f" % r2)

from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# We can reuse the RegressionEvaluator, regEval, to pick the model with the best R-squared score
# Let's create our CrossValidator with 3 fold cross validation
crossval = CrossValidator(estimator=dtPipeline, evaluator=regEval, numFolds=3)

# Let's tune over our dt.maxDepth parameter on the values 6 through 9; create a parameter grid using the ParamGridBuilder
paramGrid = (ParamGridBuilder()
.addGrid(dt.maxDepth, [6,7,8,9])
.build())

# Add the grid to the CrossValidator
crossval.setEstimatorParamMaps(paramGrid)

# Now let's find and return the best model
dtModel = crossval.fit(train_data).bestModel

train_data_output=dtModel.transform(train_data)

#from pyspark.sql import functions as F
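
The R-squared above is computed on the same training data the model was tuned on. A hedged sketch of scoring a held-out split (the `test_data` DataFrame is an assumption; it would have to be split off earlier):

# Sketch only: evaluate the tuned tree on data it was not fitted on
test_data_output = dtModel.transform(test_data)
print("Held-out R-squared: %.2f" % regEval.evaluate(test_data_output))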
Example #13
    print(f"{unique_features_count} of {num_ids} have unique features.")

    delta_ = delta(start_features)

    # 2.2 Clustering
    start_clustering = get_time()
    print("Starting clustering step.")
    bkm = BisectingKMeans()
    print("Fitting data to Bisecting K Means model")
    model = bkm.fit(features_df)  # initial fit with the default k; the grid search below refits per fold
    clustering_pipeline = Pipeline(stages=[bkm])
    print("Building grid for cross-validation")
    paramGrid = ParamGridBuilder().addGrid(
        bkm.k, [2, 5, 10, 20, 50, 70, 100]).build()
    print("Starting cross-validation")
    crossval = CrossValidator(
        estimator=clustering_pipeline,
        estimatorParamMaps=paramGrid,
        evaluator=ClusteringEvaluator(),
        numFolds=3,
    )
    cvModel = crossval.fit(features_df)
    cluster_df = cvModel.transform(features_df)
    cluster_df.select("prediction").describe().show()
    print("Finished step 2.")
    delta(ml_start)

    # End execution
    print("All steps complete.")
    delta_ = delta(global_start)
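
To see which k the clustering cross-validation settled on, the best model can be inspected. A hedged sketch, assuming `cvModel` and the same k grid as above (the getter assumes PySpark 3.x):

    # Sketch: the best model is a PipelineModel whose only stage is the fitted BisectingKMeansModel
    best_bkm = cvModel.bestModel.stages[0]
    print("Best k:", best_bkm.getK())
    print("Mean silhouette per k:", list(zip([2, 5, 10, 20, 50, 70, 100], cvModel.avgMetrics)))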
result9_df = result8_transformed

splits = result9_df.randomSplit([0.8, 0.2], seed=1)

train = splits[0].cache()
valid = splits[1].cache()

train.show(n)
valid.show(n)


# step 10
lr = LogisticRegression(regParam=0.01, maxIter=100, fitIntercept=True)

bceval = BinaryClassificationEvaluator()
cv = CrossValidator().setEstimator(lr).setEvaluator(bceval).setNumFolds(n_fold)

paramGrid = ParamGridBuilder().addGrid(lr.maxIter, max_iter)\
    .addGrid(lr.regParam, reg_params).build()

cv.setEstimatorParamMaps(paramGrid)

cvmodel = cv.fit(train)

print(cvmodel.bestModel.coefficients)
print('')
print(cvmodel.bestModel.intercept)
print('')
print(cvmodel.bestModel.getMaxIter())
print('')
print(cvmodel.bestModel.getRegParam())
    'hdfs:/user/pg1910/pub/goodreads/testing_sample.parquet')

als = ALS(userCol="user_id",
          itemCol="book_id",
          ratingCol="rating",
          coldStartStrategy="drop",
          nonnegative=True)

param_grid = ParamGridBuilder().addGrid(als.rank, [15, 25, 35]).addGrid(
    als.maxIter, [5, 8, 10]).addGrid(als.regParam, [0.08, 0.09, 0.10]).build()
evaluator = RegressionEvaluator(metricName="rmse",
                                labelCol="rating",
                                predictionCol="prediction")

cv = CrossValidator(estimator=als,
                    estimatorParamMaps=param_grid,
                    evaluator=evaluator,
                    numFolds=3)
model = cv.fit(df_training)

best_model = model.bestModel

print("Tuned Hyperparameters:-------------")
print("Rank: ", best_model._java_obj.parent().getRank())
print("MaxIter: ", best_model._java_obj.parent().getMaxIter())
print("RegParam: ", best_model._java_obj.parent().getRegParam())

print("Recommendations: ------------------------------")
user_recs = best_model.recommendForAllUsers(500)

# user_recs.write.csv('hdfs:/user/pg1910/pub/goodreads/user_recs.csv')
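
The tuned model could also be scored on the held-out parquet loaded above. A hedged sketch (the test DataFrame name `df_testing` is an assumption, since the original assignment is cut off):

# Sketch only: RMSE of the best ALS model on the testing sample
test_preds = best_model.transform(df_testing)
print("Test RMSE:", evaluator.evaluate(test_preds))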
Example #16
def main(sqlContext):
    """Main function takes a Spark SQL context."""
    # YOUR CODE HERE
    # YOU MAY ADD OTHER FUNCTIONS AS NEEDED

    # load files
    label = sqlContext.read.load("labeled_data.csv",
                                 format="csv",
                                 sep=",",
                                 inferSchema="true",
                                 header="true")
    if (flag):
        comments = sqlContext.read.json("comments-minimal.json.bz2")
        submissions = sqlContext.read.json("submissions.json.bz2")
        print("loading done")
        comments.write.parquet("comments_data")
        submissions.write.parquet("submissions_data")
        print("writing done")
    else:
        comments = sqlContext.read.parquet("comments")
        submissions = sqlContext.read.parquet("submissions")
        print("loading done")
    comments.show()
    exit()  # debug exit: main() stops here; remove this line to run the tasks below
    if (save):
        # task 7 starts here
        associated = join(comments, label)
        withngrams = associated.withColumn("ngrams",
                                           makeNgrams_udf(associated['body']))
        withplabels = withngrams.withColumn("poslabel",
                                            pLabel_udf(withngrams['labeldjt']))
        withpnlabels = withplabels.withColumn(
            "neglabel", nLabel_udf(withplabels['labeldjt'])).select(
                "id", "ngrams", "poslabel", "neglabel")
        # withpnlabels.show()
        cv = CountVectorizer(binary=True,
                             inputCol="ngrams",
                             outputCol="features")
        model = cv.fit(withpnlabels)
        model.save("cv.model")
        # model.transform(withpnlabels).show()
        pos = model.transform(withpnlabels).select(
            "id",
            col("poslabel").alias("label"), "features")
        neg = model.transform(withpnlabels).select(
            "id",
            col("neglabel").alias("label"), "features")
        # pos.show()
        # neg.show()
        poslr = LogisticRegression(labelCol="label",
                                   featuresCol="features",
                                   maxIter=10)
        neglr = LogisticRegression(labelCol="label",
                                   featuresCol="features",
                                   maxIter=10)
        posEvaluator = BinaryClassificationEvaluator()
        negEvaluator = BinaryClassificationEvaluator()
        posParamGrid = ParamGridBuilder().addGrid(poslr.regParam,
                                                  [1.0]).build()
        negParamGrid = ParamGridBuilder().addGrid(neglr.regParam,
                                                  [1.0]).build()
        posCrossval = CrossValidator(estimator=poslr,
                                     evaluator=posEvaluator,
                                     estimatorParamMaps=posParamGrid,
                                     numFolds=2)  # for test
        negCrossval = CrossValidator(estimator=neglr,
                                     evaluator=negEvaluator,
                                     estimatorParamMaps=negParamGrid,
                                     numFolds=2)  # for test
        posTrain, posTest = pos.randomSplit([0.5, 0.5])
        negTrain, negTest = neg.randomSplit([0.5, 0.5])
        print("Training positive classifier...")
        posModel = posCrossval.fit(posTrain)
        print("Training negative classifier...")
        negModel = negCrossval.fit(negTrain)
        posModel.save("pos.model")
        negModel.save("neg.model")
        print("trained")
    else:
        # comments.show()
        # submissions.show()
        posModel = CrossValidatorModel.load("pos.model")
        negModel = CrossValidatorModel.load("neg.model")
        model = CountVectorizerModel.load("cv.model")
        # withngrams = comments.withColumn("ngrams", makeNgrams_udf(comments['body']))
        # cv = CountVectorizer(binary=True, inputCol="ngrams", outputCol="features")
        # model = cv.fit(withngrams)
        print("model loaded")

        if (predict == 0):
            # task 8 starts here
            temp_comments = comments.select("id", "link_id",
                                            "author_flair_text", "created_utc",
                                            "body")
            clean_comments = temp_comments.withColumn(
                "true_id", getLinkid_udf(temp_comments['link_id']))
            # print(clean_comments.count())
            clean_submissions = submissions.select(
                col("id").alias("sub_id"), "title")
            # clean_comments.show()
            # clean_submissions.show()
            com_sub = clean_comments.join(
                clean_submissions,
                clean_comments.true_id == clean_submissions.sub_id, "inner")
            com_sub.write.parquet("com_sub")
        else:
            # task 9 starts here
            com_sub = sqlContext.read.parquet("com_sub")
            com_sub = com_sub.sample(False, 0.0001, None)
            filtered = com_sub.filter(
                "body NOT LIKE '%/s%' and body NOT LIKE '>%'")
            # print(filtered.count())
            filtered_ngrams = filtered.withColumn(
                "ngrams", makeNgrams_udf(filtered['body']))
            # filtered_ngrams = filtered_ngrams.sample(False, 0.01, None)
            print("prepared")
            featuredata = model.transform(filtered_ngrams).select(
                "id", "author_flair_text", "created_utc", "sub_id", "title",
                "features")
            posResult = posModel.transform(featuredata)
            negResult = negModel.transform(featuredata)
            # posResult.show()
            # negResult.show()
            poslabel = posResult.withColumn(
                "positive", posTh_udf(posResult['probability'])
            )  # .select("id", "author_flair_text", "created_utc", "title", "positive")
            neglabel = negResult.withColumn(
                "negtive", negTh_udf(negResult['probability'])
            )  # .select(col("id").alias("nid"), "author_flair_text", "created_utc", "title", "negtive")
            print("predict done")
            # poslabel.show()
            # neglabel.show()
            # how to combine these 2 tables??? (see the hedged join sketch after this function)

            # task 10 starts here
            # c_all = poslabel.count()
            all_day = poslabel.withColumn(
                "date",
                from_unixtime('created_utc').cast(
                    DateType())).groupby("date").count()
            pos_posts = poslabel.filter("positive = 1")
            # c_pos_posts = pos_posts.count()
            # p_pos_posts = c_pos_posts/c_all
            # print(p_pos_posts)
            # neg_posts = neglabel.filter("negtive = 1")
            # c_neg_posts = neg_posts.count()
            # p_neg_posts = c_neg_posts/c_all
            # print(p_neg_posts)
            pos_day = pos_posts.withColumn(
                "pos_date",
                from_unixtime('created_utc').cast(
                    DateType())).groupby("pos_date").count().withColumnRenamed(
                        "count", "pos_count")
            p_pos_day = all_day.join(pos_day, all_day.date == pos_day.pos_date,
                                     "left").withColumn(
                                         "pos_per",
                                         col("pos_count") / col("count")).show()

            print("end")
Example #17
        (11, "jogos são legais", 0.0)
    ], ["id", "text", "label"])

    # Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
    lr = LogisticRegression(maxIter=10)
    pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])

    paramGrid = ParamGridBuilder() \
        .addGrid(hashingTF.numFeatures, [10, 100, 1000]) \
        .addGrid(lr.regParam, [0.1, 0.01]) \
        .build()

    crossval = CrossValidator(estimator=pipeline,
                              estimatorParamMaps=paramGrid,
                              evaluator=BinaryClassificationEvaluator(),
                              numFolds=3)

    cvModel = crossval.fit(training)

    test = spark.createDataFrame([
        (4, "eu gosto de jogar"),
        (5, "faz tempo que não vou a igreja"),
        (6, "jesus cristo"),
        (7, "muitos jogos legais lançados recentemente")
    ], ["id", "text"])

    prediction = cvModel.transform(test)
    selected = prediction.select("id", "text", "probability", "prediction")
    for row in selected.collect():
        print(row)
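
To check which numFeatures/regParam combination the CrossValidator picked, the fitted pipeline stages can be indexed. A hedged sketch, assuming `cvModel` from above (the regParam getter assumes PySpark 3.x):

    # Sketch: stages keep their order, so [tokenizer, hashingTF, lr] -> index 1 and 2
    best_stages = cvModel.bestModel.stages
    print("numFeatures:", best_stages[1].getNumFeatures())
    print("regParam:", best_stages[2].getRegParam())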
Example #18
# Make predictions
predictionDF = model.transform(testDF)

# Choose (observation, prediction) pairs. Need this for calculating metrics
metricDF = predictionDF.select('PE', 'prediction')

# Calculate metrics
metrics = RegressionMetrics(metricDF.rdd)
print ("Default LR metrics: RMSE = %s, R2 = %s, MSE = %s" %(metrics.rootMeanSquaredError, metrics.r2, metrics.meanSquaredError))

# b) Parameter Tuning using CV

paramGrid = ParamGridBuilder().addGrid(lr.maxIter, [10, 25, 50]).addGrid(lr.regParam, [0.1, 0.2, 0.3, 0.4]).build()

cv = CrossValidator(estimator=pipeline, estimatorParamMaps=paramGrid, evaluator=RegressionEvaluator().setLabelCol('PE').setMetricName('rmse'), numFolds=3)

cvModel = cv.fit(trainDF)

# Make predictions
predictionDF = cvModel.transform(testDF)

# Choose (observation, prediction) pairs. Need this for calculating metrics
metricDF = predictionDF.select('PE', 'prediction')

# Calculate metrics
metrics = RegressionMetrics(metricDF.rdd)
print ("CV LR metrics: RMSE = %s, R2 = %s, MSE = %s" %(metrics.rootMeanSquaredError, metrics.r2, metrics.meanSquaredError))

# print ('Best Param (regParam, maxIter): (%s, %s)' %(cvModel.bestModel.stages[1]._java_obj.parent().getRegParam(), cvModel.bestModel.stages[-1]._java_obj.parent().getMaxIter()))
Example #19
train00, test0 = data_pre.randomSplit([0.8, 0.2], seed=24)
train0, validation0 = train00.randomSplit([0.8, 0.2], seed=24)

# Fit the feature pipeline on the training split only, then apply it to all three splits
feature_model = pipeline.fit(train0)
train = feature_model.transform(train0)
test = feature_model.transform(test0)
validation = feature_model.transform(validation0)

# In[79]:

# Create model 1
lr = LogisticRegression(maxIter=20)
paramGrid1 = ParamGridBuilder().addGrid(lr.regParam, [0.3, 0.01]).addGrid(
    lr.elasticNetParam, [1.0, 0.0]).build()  # elasticNetParam: 1.0 = L1 penalty, 0.0 = L2 penalty
evaluator1 = MulticlassClassificationEvaluator(metricName="f1")
crossval = CrossValidator(estimator=lr,
                          estimatorParamMaps=paramGrid1,
                          evaluator=evaluator1,
                          numFolds=3)
lr_model = crossval.fit(train)
print('lrpre value: {}'.format(
    evaluator1.evaluate(lr_model.transform(validation))))

# In[80]:

lr_model.getEstimatorParamMaps()
# lr_model.bestModel
# lr_model.avgMetrics

# In[81]:

lr_model.bestModel
Example #20
    def IntanceFitModel(Mtype, classifier, classes, features, folds, train):

        if Mtype == "OneVsRest":
            # instantiate the base classifier.
            lr = LogisticRegression()
            # instantiate the One Vs Rest Classifier.
            OVRclassifier = OneVsRest(classifier=lr)
            #             fitModel = OVRclassifier.fit(train)
            # Add parameters of your choice here:
            paramGrid = ParamGridBuilder().addGrid(lr.regParam,
                                                   [0.1, 0.01]).build()
            #Cross Validator requires the following parameters:
            crossval = CrossValidator(
                estimator=OVRclassifier,
                estimatorParamMaps=paramGrid,
                evaluator=MulticlassClassificationEvaluator(),
                numFolds=folds)  # 3 is best practice
            # Run cross-validation, and choose the best set of parameters.
            fitModel = crossval.fit(train)
            return fitModel
        if Mtype == "MultilayerPerceptronClassifier":
            # specify layers for the neural network:
            # an input layer sized to the number of features, two intermediate layers
            # (sizes features+1 and features), and an output layer sized to the number of classes
            # Note: CrossValidator is not used for this model
            features_count = len(features[0][0])
            layers = [
                features_count, features_count + 1, features_count, classes
            ]
            MPC_classifier = MultilayerPerceptronClassifier(maxIter=100,
                                                            layers=layers,
                                                            blockSize=128,
                                                            seed=1234)
            fitModel = MPC_classifier.fit(train)
            return fitModel
        if Mtype in (
                "LinearSVC", "GBTClassifier"
        ) and classes != 2:  # These classifiers currently only accept binary classification
            print(
                Mtype,
                " could not be used because PySpark currently only accepts binary classification data for this algorithm"
            )
            return
        if Mtype in ("LogisticRegression", "NaiveBayes",
                     "RandomForestClassifier", "GBTClassifier", "LinearSVC",
                     "DecisionTreeClassifier"):

            # Add parameters of your choice here:
            if Mtype in ("LogisticRegression"):
                paramGrid = (
                    ParamGridBuilder(
                    )  #                              .addGrid(classifier.regParam, [0.1, 0.01]) \
                    .addGrid(classifier.maxIter, [10, 15, 20]).build())

            # Add parameters of your choice here:
            if Mtype in ("NaiveBayes"):
                paramGrid = (ParamGridBuilder().addGrid(
                    classifier.smoothing, [0.0, 0.2, 0.4, 0.6]).build())

            # Add parameters of your choice here:
            if Mtype in ("RandomForestClassifier"):
                paramGrid = (
                    ParamGridBuilder().addGrid(classifier.maxDepth, [2, 5, 10])
                    #                                .addGrid(classifier.maxBins, [5, 10, 20])
                    #                                .addGrid(classifier.numTrees, [5, 20, 50])
                    .build())

            # Add parameters of your choice here:
            if Mtype in ("GBTClassifier"):
                paramGrid = (
                    ParamGridBuilder(
                    )  #                              .addGrid(classifier.maxDepth, [2, 5, 10, 20, 30]) \
                    #                              .addGrid(classifier.maxBins, [10, 20, 40, 80, 100]) \
                    .addGrid(classifier.maxIter, [10, 15, 50, 100]).build())

            # Add parameters of your choice here:
            if Mtype in ("LinearSVC"):
                paramGrid = (ParamGridBuilder().addGrid(
                    classifier.maxIter,
                    [10, 15]).addGrid(classifier.regParam,
                                      [0.1, 0.01]).build())

            # Add parameters of your choice here:
            if Mtype in ("DecisionTreeClassifier"):
                paramGrid = (ParamGridBuilder() #                              .addGrid(classifier.maxDepth, [2, 5, 10, 20, 30]) \
                             .addGrid(classifier.maxBins, [10, 20, 40, 80, 100]) \
                             .build())

            #Cross Validator requires all of the following parameters:
            crossval = CrossValidator(
                estimator=classifier,
                estimatorParamMaps=paramGrid,
                evaluator=MulticlassClassificationEvaluator(),
                numFolds=folds)  # 3 + is best practice
            # Fit Model: Run cross-validation, and choose the best set of parameters.
            fitModel = crossval.fit(train)
            return fitModel
# MAGIC
# MAGIC Note, we are using `evaluatorPR` as our `evaluator` as the Precision-Recall curve is often better for an unbalanced distribution.

# COMMAND ----------

from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

# Build the grid of different parameters
paramGrid = ParamGridBuilder() \
    .addGrid(dt.maxDepth, [5, 10, 15]) \
    .addGrid(dt.maxBins, [10, 20, 30]) \
    .build()

# Build out the cross validation
crossval = CrossValidator(estimator=dt,
                          estimatorParamMaps=paramGrid,
                          evaluator=evaluatorPR,
                          numFolds=3)

pipelineCV = Pipeline(stages=[indexer, va, crossval])

# Train the model using the pipeline, parameter grid, and preceding BinaryClassificationEvaluator
cvModel_u = pipelineCV.fit(train)

# COMMAND ----------

# MAGIC %md ### Review Results
# MAGIC Review the `areaUnderPR` (area under Precision Recall curve) and `areaUnderROC` (area under Receiver operating characteristic) or `AUC` (area under curve) metrics.

# COMMAND ----------
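
# A minimal sketch of the review described above (assuming `cvModel_u`, a `test` split, and the
# `evaluatorPR` configured earlier; an assumption, not the original notebook code):
predictions_u = cvModel_u.transform(test)
print("areaUnderPR: ", evaluatorPR.evaluate(predictions_u))
print("areaUnderROC:", evaluatorPR.evaluate(predictions_u, {evaluatorPR.metricName: "areaUnderROC"}))

# COMMAND ----------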

# Build the best model (training and test datasets)
# COMMAND ----------

from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import RegressionEvaluator

regEval = RegressionEvaluator(predictionCol="Predicted_PE")
regEval.setLabelCol("PE")\
  .setMetricName("rmse")

regParam = [i / 100.0 for i in range(1, 11)]

grid = ParamGridBuilder().addGrid(lr.regParam, regParam).build()

crossval = CrossValidator(estimator=lrPipeline,
                          estimatorParamMaps=grid,
                          evaluator=regEval,
                          numFolds=5)

cvModel = crossval.fit(trainingSet)

# COMMAND ----------

# MAGIC %md Now that we have tuned, let's see which tuning parameters we got and how our RMSE compares to our initial model

# COMMAND ----------

predictionsAndLabels = cvModel.transform(testSet)

valuesAndPreds = predictionsAndLabels.select("Predicted_PE", "PE").rdd.map(
    lambda x: (x.__getitem__('Predicted_PE'), x.__getitem__('PE')))
metrics = RegressionMetrics(valuesAndPreds)
Example #23
from pyspark.ml.classification import RandomForestClassifier

rf = RandomForestClassifier(
    labelCol="label",
    featuresCol="features",
)

from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

pipeline_rf = Pipeline(stages=[rf])
paramGrid = ParamGridBuilder().addGrid(rf.maxDepth,
                                       [2, 3, 4, 5, 6, 7]).addGrid(
                                           rf.numTrees, [100, 300]).build()

evaluator = BinaryClassificationEvaluator(labelCol="label",
                                          rawPredictionCol="rawPrediction")
crossval = CrossValidator(estimator=pipeline_rf,
                          estimatorParamMaps=paramGrid,
                          evaluator=evaluator,
                          numFolds=3)

## Fitting the CV
CV_model = crossval.fit(trainingData)

## Printing best model
print(CV_model.bestModel.stages[0])

test_pred = CV_model.transform(testData)
print(evaluator.getMetricName(), evaluator.evaluate(test_pred))
Example #24
evaluations = list()   # renamed from `eval` to avoid shadowing the built-in
best_models = list()

for i in range(2):
	# define the pipeline
	pipeline = Pipeline(stages=[day_of_week_indexer, 
                            airline_indexer, 
                            hour_departure_indexer, 
                            day_of_week_encoder, 
                            airline_encoder, 
                            hour_departure_encoder, 
                            assembler, 
                            modelos[i]])

	# define the cross-validator
	crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=grids[i],
                          evaluator=RegressionEvaluator(), 
                          numFolds = 10)

	# fit the models
	cvModel = crossval.fit(train_data)

	evaluations.append(RegressionEvaluator().evaluate(cvModel.transform(test_data)))
	best_models.append(cvModel.bestModel.stages[-1])

# evaluation of model 1
print(evaluations[0])

# get the best parameters of model 1
print(best_models[0].extractParamMap().get(best_models[0].getParam('regParam')))
print(best_models[0].extractParamMap().get(best_models[0].getParam('elasticNetParam')))
Example #25
pipelineFit = pipeline.fit(data)
dataset = pipelineFit.transform(data)
(trainingData, testData) = dataset.randomSplit([0.7, 0.3], seed = 100)
# Build the model
lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
# Create ParamGrid for Cross Validation
paramGrid = (ParamGridBuilder()
             .addGrid(lr.regParam, [0.1, 0.3, 0.5]) # regularization parameter
             .addGrid(lr.elasticNetParam, [0.0, 0.1, 0.2]) # Elastic Net Parameter (Ridge = 0)
#            .addGrid(model.maxIter, [10, 20, 50]) #Number of iterations
#            .addGrid(idf.numFeatures, [10, 100, 1000]) # Number of features
             .build())
# Create 5-fold CrossValidator
cv = CrossValidator(estimator=lr, \
                    estimatorParamMaps=paramGrid, \
                    evaluator=evaluator, \
                    numFolds=5)
# Run cross-validation
cvModel = cv.fit(trainingData)  # this will likely take a fair amount of time because of the number
                                # of models being created and tested
# Use test set here so we can measure the accuracy of our model on new data
predictions = cvModel.transform(testData) # cvModel uses the best model found from the Cross Validation
# Evaluate best model
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictions) # 0.9919124901837848


# 4.Naive Bayes

from pyspark.ml.classification import NaiveBayes
# create the trainer and set its parameters
Example #26
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Setting Random Forest parameters from user input
user_rf_param_numTreeSet = [4, 8, 16, 32, 64]
user_rf_param_maxDepthSet = [10, 20, 30]
user_rf_param_impuritySet = ['gini', 'entropy']
user_rf_param_numFolds = 3

# Settings for Random Forest - parameter grid search
rf_paramGrid = ParamGridBuilder().addGrid(rfclassifier.numTrees, user_rf_param_numTreeSet).addGrid(rfclassifier.maxDepth, user_rf_param_maxDepthSet).addGrid(rfclassifier.impurity, user_rf_param_impuritySet).build()
evaluator = BinaryClassificationEvaluator()
multiEvaluator = MulticlassClassificationEvaluator()

# Setting parameters for cross-validation
rf_cv = CrossValidator( estimator=pipeline, evaluator=evaluator, estimatorParamMaps=rf_paramGrid, numFolds=user_rf_param_numFolds)
rf_cvmodel = rf_cv.fit(train)

#Evaluating Random Forest Model Performance 
from pyspark.sql.functions import udf

rf_predictions = rf_cvmodel.transform(test)
auroc = evaluator.evaluate(rf_predictions, {evaluator.metricName: "areaUnderROC"})
aupr = evaluator.evaluate(rf_predictions, {evaluator.metricName: "areaUnderPR"})
"The AUROC is %s and the AUPR is %s" % (auroc, aupr)

f1score = multiEvaluator.evaluate(rf_predictions, {multiEvaluator.metricName: "f1"})
weightedPrecision = multiEvaluator.evaluate(rf_predictions, {multiEvaluator.metricName: "weightedPrecision"})
weightedRecall = multiEvaluator.evaluate(rf_predictions, {multiEvaluator.metricName: "weightedRecall"})

"The F1 score: %s the Weighted Precision: %s the Weighted Recall is %s" % (f1score, weightedPrecision, weightedRecall)
Example #27
pipeline = Pipeline(stages=[vectorAssembler, vectorIndexer, lr])

pipelineFit = pipeline.fit(trainingData)
predictions = pipelineFit.transform(testData)

paramGrid = (ParamGridBuilder()\
             .addGrid(lr.regParam, [0.01, 0.5, 2.0])\
             .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])\
             .addGrid(lr.maxIter, [1, 10, 15])\
             .build())

evaluator = MulticlassClassificationEvaluator(predictionCol="prediction",
                                              labelCol="not_fully_paid",
                                              metricName="f1")

cv = CrossValidator(estimator=pipeline,
                    estimatorParamMaps=paramGrid,
                    evaluator=evaluator,
                    numFolds=5)

cvModel = cv.fit(trainingData)

prediction = cvModel.transform(testData)

selected = prediction.select("not_fully_paid", "prediction", "probability")\
   .orderBy("probability", ascending=False) \
      .show(n = 10, truncate = 30)

print("F1: %g" % (evaluator.evaluate(prediction)))
Example #28
                                         numTrees=20,
                                         maxDepth=5,
                                         predictionCol="prediction",
                                         rawPredictionCol="rawPrediction",
                                         probabilityCol="probability",
                                         labelCol="indexedSurvived",
                                         featuresCol="features",
                                         impurity="gini")
evaluator_node10 = MulticlassClassificationEvaluator(
    labelCol="indexedSurvived",
    predictionCol="prediction",
    metricName="accuracy")
param_grid_node11 = ParamGridBuilder().addGrid(estimator_node9.maxDepth,
                                               [3, 5, 8, 20]).build()
cv_node11 = CrossValidator(estimator=estimator_node9,
                           estimatorParamMaps=param_grid_node11,
                           evaluator=evaluator_node10)
model_node11 = cv_node11.fit(df_node7)
df_node11 = model_node11.transform(df_node7)
model_node7.save("hdfs://namenode:9000/example4/model_1/")
df_node13 = model_node7.transform(df_node3[1])
model_node11.save("hdfs://namenode:9000/example4/model_2/")
df_node14 = model_node11.transform(df_node13)

evaluator_node15 = MulticlassClassificationEvaluator(
    labelCol="indexedSurvived",
    predictionCol="prediction",
    metricName="accuracy")
score_node15 = evaluator_node15.evaluate(df_node14)
df_node15 = spark.createDataFrame([(score_node15, )], ["score"])
Example #29
predictionsDF.printSchema()

predictionsDF.select("churn", "prediction", "features", "state_code",
                     "account_length", "area_code",
                     "international_plan").show()

numFolds = 3
paramGrid = ParamGridBuilder().addGrid(dt.maxDepth,
                                       [2, 5, 10, 20, 30]).addGrid(
                                           dt.maxBins,
                                           [10, 20, 40, 80, 100]).build()

evaluator = (BinaryClassificationEvaluator().setLabelCol(
    "label").setRawPredictionCol("prediction"))

cv = (CrossValidator().setEstimator(pipeline).setEvaluator(
    evaluator).setEstimatorParamMaps(paramGrid).setNumFolds(numFolds))

with open('/dbfs/FileStore/DecisionTreeResults.txt', 'w') as f:
    print("Training model with Decision Tree  algorithm", file=f)
cvModel = cv.fit(trainDF)

predictions = cvModel.transform(testDF)
predictions.printSchema()
predictions.show()

resultDF = predictions.select("label", "prediction", "churn")
resultDF.show(10)

accuracy = evaluator.evaluate(predictions)
with open('/dbfs/FileStore/DecisionTreeResults.txt', 'a+') as f:
    print("Classification accuracy of  Decision Tree  : ", accuracy, file=f)
Example #30
def _val(target, model):
    clf, paramGrid = model
    evaluator = BinaryClassificationEvaluator(labelCol=target, rawPredictionCol='prediction')
#    validator = TrainValidationSplit(estimator=rf, estimatorParamMaps=paramGrid, evaluator=evaluator)
    validator = CrossValidator(estimator=clf, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=3)    
    return validator
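
A hedged usage sketch for _val (the classifier, grid, label column, and DataFrame names below are assumptions, not part of the original):

# Sketch only: build a (classifier, grid) pair, obtain the validator, and fit it
rf = RandomForestClassifier(labelCol="clicked", featuresCol="features")
rf_grid = ParamGridBuilder().addGrid(rf.numTrees, [20, 50]).build()
validator = _val("clicked", (rf, rf_grid))
cv_model = validator.fit(train_df)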