# --------------------------------------------------------------- PARAMETER ESTIMATION ---------------------------------------------------------
# Parameter estimation and cross-validation
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# Create ParamGrid for cross-validation
paramGrid = (ParamGridBuilder()
             .addGrid(lr.regParam, [0.01, 0.5, 2.0])
             .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])
             .addGrid(lr.maxIter, [1, 5, 10])
             .build())

# Create 5-fold CrossValidator
cv = CrossValidator(estimator=lr,
                    estimatorParamMaps=paramGrid,
                    evaluator=evaluator,
                    numFolds=5)

# Run cross-validation; this will likely take a fair amount of time because of
# the number of models being created and tested
cvModel = cv.fit(trainingData)

# Best parameters
cvModel.bestModel.extractParamMap()

# Use the test set to measure the accuracy of our model on new data
predictions = cvModel.transform(testData)  # cvModel uses the best model found by cross-validation

# Evaluate the best model
evaluator.evaluate(predictions)
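# extractParamMap() prints every parameter, not just the tuned ones. A minimal
# sketch, assuming the cvModel above, that pairs each grid point with its mean
# cross-validation metric so the winning combination is easy to spot:
for params, metric in zip(cvModel.getEstimatorParamMaps(), cvModel.avgMetrics):
    readable = {p.name: v for p, v in params.items()}  # Param objects -> plain names
    print(readable, "->", metric)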
# Apply the machine-learning model
classifier = LogisticRegression(maxIter=2, labelCol="label", featuresCol="features")

# Evaluator for scoring the candidate models
evaluator = BinaryClassificationEvaluator(labelCol="label", rawPredictionCol="rawPrediction")

grid = ParamGridBuilder() \
    .addGrid(classifier.regParam, [1.0, 2.0]) \
    .addGrid(classifier.maxIter, [10, 15]) \
    .build()

cv = CrossValidator(estimator=classifier, estimatorParamMaps=grid, evaluator=evaluator)
model = cv.fit(trainingData)

predictions = model.transform(testData)
predictions.show()
print(evaluator.evaluate(predictions))

'''
model = classifier.fit(trainingData)
# print(model.coefficients)
predictions = model.transform(testData)
stages.append(
    VectorAssembler(inputCols=["idx_{0}".format(col) for col in input_columns],
                    outputCol='features'))

dec_tree = DecisionTreeRegressor(labelCol='label', featuresCol='features', maxDepth=5)
stages.append(dec_tree)

# Split the data into training and test sets (30% held out for testing)
trainingData, testData = df.randomSplit([0.7, 0.3], seed=123)

evaluator = RegressionEvaluator(metricName='rmse', labelCol='label')
grid = ParamGridBuilder().addGrid(dec_tree.maxDepth, [3, 5, 7, 10]).build()
# Note: as written, this CrossValidator is defined but never fit; the pipeline
# below is fit directly, so the maxDepth grid has no effect.
cv = CrossValidator(estimator=dec_tree,
                    estimatorParamMaps=grid,
                    evaluator=evaluator,
                    numFolds=10)

stages
pipeline = Pipeline(stages=stages)
model = pipeline.fit(trainingData)
predictions = model.transform(testData)

output = []
# show() returns None, so print the sample rather than appending it to output
predictions.select("prediction", "label", "features").show(5)
output.append(model.stages[-1])
## | performing binary classification model.                                                           |
## | How has this changed your performance metrics?                                                    |
## +---------------------------------------------------------------------------------------------------+

rf_cv = RandomForestClassifier(labelCol='genre', featuresCol='Features')
pipeline = Pipeline(stages=[assembler, rf_cv])

paramGrid = ParamGridBuilder() \
    .addGrid(rf_cv.numTrees, [10, 50, 100]) \
    .addGrid(rf_cv.maxDepth, [3, 6, 9]) \
    .build()

cv = CrossValidator(estimator=rf_cv,
                    estimatorParamMaps=paramGrid,
                    evaluator=BinaryClassificationEvaluator(
                        rawPredictionCol="rawPrediction",
                        labelCol='genre',
                        metricName="areaUnderROC"),
                    numFolds=5)

cv_model = cv.fit(training_updownsampled)
best_cv_model = cv_model.bestModel
print("Best Depth: ", best_cv_model._java_obj.getMaxDepth())
print("Good Number of Trees: ", best_cv_model._java_obj.getNumTrees())
''' output
Best Depth:  9
Good Number of Trees:  100
'''
kmFeatures.groupBy("label").count().show()

# show() returns None, so keep the DataFrame and display it separately
labelData = kmFeatures.drop('prediction')
labelData.show()

# In[167]:

from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

# lr = LogisticRegression(maxIter=10, regParam=0.01)
lr = LogisticRegression()
grid = ParamGridBuilder() \
    .addGrid(lr.maxIter, [10]) \
    .addGrid(lr.regParam, [0.01, 0.05, 1.00, 2.00]) \
    .build()
evaluator = BinaryClassificationEvaluator()
cv = CrossValidator(estimator=lr,
                    numFolds=10,
                    estimatorParamMaps=grid,
                    evaluator=evaluator)
cvModel = cv.fit(labelData)
evaluator.evaluate(cvModel.transform(labelData))

# In[173]:

cvModel.bestModel.intercept

# In[97]:

training, test = labelData.randomSplit([0.7, 0.3])
lr_model = lr.fit(training)

# In[129]:
def train_models(saved_trained_models, sanitized_comments):
    models_dir = "models/"
    parquet_dir = "parquets/"
    if not saved_trained_models:
        # Initialize six logistic regression models.
        pos_lr = LogisticRegression(labelCol="trump_pos", featuresCol="vectors", maxIter=10)
        neg_lr = LogisticRegression(labelCol="trump_neg", featuresCol="vectors", maxIter=10)
        dem_pos_lr = LogisticRegression(labelCol="dem_pos", featuresCol="vectors", maxIter=10)
        dem_neg_lr = LogisticRegression(labelCol="dem_neg", featuresCol="vectors", maxIter=10)
        rep_pos_lr = LogisticRegression(labelCol="rep_pos", featuresCol="vectors", maxIter=10)
        rep_neg_lr = LogisticRegression(labelCol="rep_neg", featuresCol="vectors", maxIter=10)

        # These are binary classifiers, so we need evaluators that know how to deal with binary output.
        pos_evaluator = BinaryClassificationEvaluator(labelCol="trump_pos")
        neg_evaluator = BinaryClassificationEvaluator(labelCol="trump_neg")
        dem_pos_evaluator = BinaryClassificationEvaluator(labelCol="dem_pos")
        dem_neg_evaluator = BinaryClassificationEvaluator(labelCol="dem_neg")
        rep_pos_evaluator = BinaryClassificationEvaluator(labelCol="rep_pos")
        rep_neg_evaluator = BinaryClassificationEvaluator(labelCol="rep_neg")

        # Logistic regression has several hyperparameters we do not know a priori.
        # A grid search finds the best ones; replace [1.0] with a list of values to
        # try (see the sketch after this function). Here we assume regParam = 1.0,
        # since a full grid search takes a very long time.
        pos_param_grid = ParamGridBuilder().addGrid(pos_lr.regParam, [1.0]).build()
        neg_param_grid = ParamGridBuilder().addGrid(neg_lr.regParam, [1.0]).build()
        dem_pos_param_grid = ParamGridBuilder().addGrid(dem_pos_lr.regParam, [1.0]).build()
        dem_neg_param_grid = ParamGridBuilder().addGrid(dem_neg_lr.regParam, [1.0]).build()
        rep_pos_param_grid = ParamGridBuilder().addGrid(rep_pos_lr.regParam, [1.0]).build()
        rep_neg_param_grid = ParamGridBuilder().addGrid(rep_neg_lr.regParam, [1.0]).build()

        # We initialize 5-fold cross-validators.
        pos_cross_val = CrossValidator(estimator=pos_lr, evaluator=pos_evaluator,
                                       estimatorParamMaps=pos_param_grid, numFolds=5)
        neg_cross_val = CrossValidator(estimator=neg_lr, evaluator=neg_evaluator,
                                       estimatorParamMaps=neg_param_grid, numFolds=5)
        dem_pos_cross_val = CrossValidator(estimator=dem_pos_lr, evaluator=dem_pos_evaluator,
                                           estimatorParamMaps=dem_pos_param_grid, numFolds=5)
        dem_neg_cross_val = CrossValidator(estimator=dem_neg_lr, evaluator=dem_neg_evaluator,
                                           estimatorParamMaps=dem_neg_param_grid, numFolds=5)
        rep_pos_cross_val = CrossValidator(estimator=rep_pos_lr, evaluator=rep_pos_evaluator,
                                           estimatorParamMaps=rep_pos_param_grid, numFolds=5)
        rep_neg_cross_val = CrossValidator(estimator=rep_neg_lr, evaluator=rep_neg_evaluator,
                                           estimatorParamMaps=rep_neg_param_grid, numFolds=5)

        # Split the data 50/50
        pos_train, pos_test = sanitized_comments.randomSplit([0.5, 0.5])
        neg_train, neg_test = sanitized_comments.randomSplit([0.5, 0.5])
        dem_pos_train, dem_pos_test = sanitized_comments.randomSplit([0.5, 0.5])
        dem_neg_train, dem_neg_test = sanitized_comments.randomSplit([0.5, 0.5])
        rep_pos_train, rep_pos_test = sanitized_comments.randomSplit([0.5, 0.5])
        rep_neg_train, rep_neg_test = sanitized_comments.randomSplit([0.5, 0.5])

        # Train the models
        print("Training positive classifier...")
        pos_model = pos_cross_val.fit(pos_train)
        print("Training negative classifier...")
        neg_model = neg_cross_val.fit(neg_train)
        print("Training positive democrat classifier...")
        dem_pos_model = dem_pos_cross_val.fit(dem_pos_train)
        print("Training negative democrat classifier...")
        dem_neg_model = dem_neg_cross_val.fit(dem_neg_train)
        print("Training positive republican classifier...")
        rep_pos_model = rep_pos_cross_val.fit(rep_pos_train)
        print("Training negative republican classifier...")
        rep_neg_model = rep_neg_cross_val.fit(rep_neg_train)

        # Once we train the models, we don't want to do it again.
        # Save them so they can be loaded later.
        pos_model.save(models_dir + "pos.model")
        neg_model.save(models_dir + "neg.model")
        dem_pos_model.save(models_dir + "dem_pos.model")
        dem_neg_model.save(models_dir + "dem_neg.model")
        rep_pos_model.save(models_dir + "rep_pos.model")
        rep_neg_model.save(models_dir + "rep_neg.model")

        # Save the held-out testing data
        pos_test.write.parquet(os.path.join(script_dir, parquet_dir + "pos_test.parquet"))
        neg_test.write.parquet(os.path.join(script_dir, parquet_dir + "neg_test.parquet"))
        dem_pos_test.write.parquet(os.path.join(script_dir, parquet_dir + "dem_pos_test.parquet"))
        dem_neg_test.write.parquet(os.path.join(script_dir, parquet_dir + "dem_neg_test.parquet"))
        rep_pos_test.write.parquet(os.path.join(script_dir, parquet_dir + "rep_pos_test.parquet"))
        rep_neg_test.write.parquet(os.path.join(script_dir, parquet_dir + "rep_neg_test.parquet"))
    else:
        # Load the saved models
        pos_model = CrossValidatorModel.load(models_dir + "pos.model")
        neg_model = CrossValidatorModel.load(models_dir + "neg.model")
        dem_pos_model = CrossValidatorModel.load(models_dir + "dem_pos.model")
        dem_neg_model = CrossValidatorModel.load(models_dir + "dem_neg.model")
        rep_pos_model = CrossValidatorModel.load(models_dir + "rep_pos.model")
        rep_neg_model = CrossValidatorModel.load(models_dir + "rep_neg.model")
    return pos_model, neg_model, dem_pos_model, dem_neg_model, rep_pos_model, rep_neg_model
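# The [1.0] grids above pin regParam to a single value. A minimal sketch,
# assuming the pos_lr estimator above, of what a real search grid might look
# like; the specific values are illustrative, not from the original:
pos_param_grid = (ParamGridBuilder()
                  .addGrid(pos_lr.regParam, [0.01, 0.1, 1.0])        # penalty strength
                  .addGrid(pos_lr.elasticNetParam, [0.0, 0.5, 1.0])  # L2/L1 mix
                  .build())  # 3 x 3 = 9 candidate models per fold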
from pyspark.ml.evaluation import BinaryClassificationEvaluator

evaluator = BinaryClassificationEvaluator(rawPredictionCol="prediction",
                                          labelCol="LOANIndex",
                                          metricName="areaUnderROC")

#==============================================================================
# Training and Evaluation
#==============================================================================
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

pipeline_gbt = Pipeline(stages=[gbt])

paramGrid_gbt = ParamGridBuilder() \
    .addGrid(gbt.maxDepth, [6, 10]) \
    .addGrid(gbt.maxIter, [3]) \
    .addGrid(gbt.maxBins, [32, 64]) \
    .addGrid(gbt.stepSize, [0.7, 0.5, 0.6]) \
    .build()

crossval_gbt = CrossValidator(estimator=pipeline_gbt,
                              estimatorParamMaps=paramGrid_gbt,
                              evaluator=evaluator,
                              numFolds=3)  # use 3+ folds in practice

# Run cross-validation; fit() returns the best model found.
cvModel_gbt = crossval_gbt.fit(train)

# Now predict over the test set
predictionCV_gbt = cvModel_gbt.transform(test)
print(evaluator.evaluate(predictionCV_gbt))
# print('It took', time.time() - start, 'seconds.')
def main():
    parser = argparse.ArgumentParser(description='Pyspark Training')
    parser.add_argument('--data',
                        type=str,
                        default="../../../data/sample_linear_regression_data.txt",
                        help='Data location.')
    parser.add_argument('--cross_val',
                        type=bool,
                        default=False,
                        help='whether to use cross-validation')
    args = parser.parse_args()

    data = spark.read.format("libsvm").load(args.data)

    # Split the data into training and test sets (30% held out for testing)
    (train, test) = data.randomSplit([0.7, 0.3])

    # Train a RandomForest model.
    rf = RandomForestRegressor()

    # Create a grid of hyperparameters. Each combination will be tested.
    paramGrid = ParamGridBuilder() \
        .addGrid(rf.numTrees, [2, 25]) \
        .addGrid(rf.maxDepth, [2, 6]) \
        .addGrid(rf.maxBins, [15, 30]) \
        .build()

    if args.cross_val:
        # Run five-fold cross-validation to find the best hyperparameters.
        crossval = CrossValidator(estimator=rf,
                                  estimatorParamMaps=paramGrid,
                                  evaluator=RegressionEvaluator(labelCol="label",
                                                                predictionCol="prediction",
                                                                metricName="rmse"),
                                  numFolds=5)
        model = crossval.fit(train)
    else:
        # Grid search for the best hyperparameters with a single validation set.
        tvs = TrainValidationSplit(estimator=rf,
                                   estimatorParamMaps=paramGrid,
                                   evaluator=RegressionEvaluator(labelCol="label",
                                                                 predictionCol="prediction",
                                                                 metricName="rmse"),
                                   # 80% of the data will be used for training, 20% for validation.
                                   trainRatio=0.8)
        # Run TrainValidationSplit, and choose the best set of parameters.
        model = tvs.fit(train)

    # Make predictions.
    predictions = model.transform(test)

    # Select (prediction, true label) and compute the test error
    evaluator = RegressionEvaluator(labelCol="label",
                                    predictionCol="prediction",
                                    metricName="rmse")
    rmse = evaluator.evaluate(predictions)
    print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)
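# Both tuning paths above return a model with a .bestModel attribute. A minimal
# sketch of reporting which grid point won, assuming the fitted model above and
# Spark 3.x, where fitted tree models expose the training-param getters:
best = model.bestModel
print("numTrees:", best.getNumTrees)      # property on RandomForestRegressionModel
print("maxDepth:", best.getMaxDepth())
print("maxBins:", best.getMaxBins())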
# COMMAND ----------

from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

# COMMAND ----------

grid = ParamGridBuilder() \
    .addGrid(dtc.maxDepth, [2, 3, 4, 5, 6, 7, 8]) \
    .addGrid(dtc.maxBins, [2, 4, 8]) \
    .build()

# COMMAND ----------

cv = CrossValidator(estimator=pipeline,
                    evaluator=evaluator,
                    estimatorParamMaps=grid,
                    numFolds=3)

# COMMAND ----------

# MAGIC %md Run `CrossValidator`. `CrossValidator` checks to see if an MLflow tracking server is available. If so, it logs runs within MLflow:
# MAGIC
# MAGIC * Under the current active run, log info for `CrossValidator`. (Create a new run if none are active.)
# MAGIC * For each submodel (number of folds of cross-validation x number of ParamMaps tested):
# MAGIC   * Log a run for this submodel, along with the evaluation metric on the held-out data.

# COMMAND ----------

# Explicitly create a new run.
# This allows this cell to be run multiple times.
# If you omit mlflow.start_run(), then this cell could run once,
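# A minimal sketch of the pattern the truncated cell above is describing,
# assuming the cv defined earlier and a training DataFrame named trainingData
# (both the run name and the DataFrame name are illustrative):
import mlflow

# Opening a fresh run means re-running the cell creates a new top-level run
# instead of nesting under a stale one.
with mlflow.start_run(run_name="dtc-cross-validation"):
    cvModel = cv.fit(trainingData)  # submodel runs are logged under this run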
prediction = pipelineModel.transform(train).select(
    F.col("mean_exam_points").cast("Float"),
    F.col("predicted_points").cast("Float"))

rmse = regressionEvaluator.evaluate(prediction)
print("RMSE is " + str(rmse))

pipelineModel.transform(train).show()

from pyspark.ml.tuning import CrossValidator, ParamGridBuilder, TrainValidationSplit

paramGrid = ParamGridBuilder() \
    .addGrid(lr.regParam, [0.01, 0.1]) \
    .build()

crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=RegressionEvaluator(),
                          numFolds=3)  # use 3+ folds in practice

# Run cross-validation, and choose the best set of parameters.
# Note: the original called pipeline.fit(train) here, which ignores the grid
# entirely; the CrossValidator has to be fit for the tuning to take effect.
cvModel = crossval.fit(train)

prediction = cvModel.transform(train).select(
    F.col("mean_exam_points").cast("Float"),
    F.col("predicted_points").cast("Float"))

rmse = regressionEvaluator.evaluate(prediction)
print("RMSE is " + str(rmse))
# does not improve - RMSE is 14.0434860345
def test_gen_estimator_metadata(spark_session):  # pylint: disable=unused-argument
    tokenizer1 = Tokenizer(inputCol="text1", outputCol="words1")
    hashingTF1 = HashingTF(inputCol=tokenizer1.getOutputCol(), outputCol="features1")
    tokenizer2 = Tokenizer(inputCol="text2", outputCol="words2")
    hashingTF2 = HashingTF(inputCol=tokenizer2.getOutputCol(), outputCol="features2")
    vecAssembler = VectorAssembler(inputCols=["features1", "features2"], outputCol="features")
    lor = LogisticRegression(maxIter=10)
    ova = OneVsRest(classifier=lor)
    sub_pipeline1 = Pipeline(stages=[tokenizer1, hashingTF1])
    sub_pipeline2 = Pipeline(stages=[tokenizer2, hashingTF2])
    sub_pipeline3 = Pipeline(stages=[vecAssembler, ova])
    paramGrid = (ParamGridBuilder()
                 .addGrid(lor.maxIter, [10, 20])
                 .addGrid(lor.regParam, [0.1, 0.01])
                 .build())
    eva = MulticlassClassificationEvaluator()
    crossval = CrossValidator(estimator=sub_pipeline3,
                              estimatorParamMaps=paramGrid,
                              evaluator=eva,
                              numFolds=2)
    top_pipeline = Pipeline(stages=[sub_pipeline1, sub_pipeline2, crossval])

    metadata = _gen_estimator_metadata(top_pipeline)

    expected_hierarchy = {
        "name": "Pipeline_1",
        "stages": [
            {"name": "Pipeline_2", "stages": [{"name": "Tokenizer_1"}, {"name": "HashingTF_1"}]},
            {"name": "Pipeline_3", "stages": [{"name": "Tokenizer_2"}, {"name": "HashingTF_2"}]},
            {
                "name": "CrossValidator",
                "evaluator": {"name": "MulticlassClassificationEvaluator"},
                "tuned_estimator": {
                    "name": "Pipeline_4",
                    "stages": [
                        {"name": "VectorAssembler"},
                        {"name": "OneVsRest", "classifier": {"name": "LogisticRegression"}},
                    ],
                },
            },
        ],
    }
    assert metadata.hierarchy == expected_hierarchy
    assert metadata.uid_to_indexed_name_map == {
        top_pipeline.uid: "Pipeline_1",
        sub_pipeline1.uid: "Pipeline_2",
        tokenizer1.uid: "Tokenizer_1",
        hashingTF1.uid: "HashingTF_1",
        sub_pipeline2.uid: "Pipeline_3",
        tokenizer2.uid: "Tokenizer_2",
        hashingTF2.uid: "HashingTF_2",
        crossval.uid: "CrossValidator",
        sub_pipeline3.uid: "Pipeline_4",
        vecAssembler.uid: "VectorAssembler",
        ova.uid: "OneVsRest",
        lor.uid: "LogisticRegression",
        eva.uid: "MulticlassClassificationEvaluator",
    }
    assert (metadata.uid_to_indexed_name_map[
        metadata.param_search_estimators[0].uid] == "CrossValidator")
from pyspark.ml.evaluation import RegressionEvaluator

# Create an evaluator using the label and predicted columns (metric: R^2)
regEval = RegressionEvaluator(predictionCol="Predicted_EXP",
                              labelCol="TOTAL_BENEFICIARY_AMT",
                              metricName="r2")

# Run the evaluator on the DataFrame
r2 = regEval.evaluate(train_data_output)
print("R^2 on training data: %.2f" % r2)

from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# We can reuse the RegressionEvaluator, regEval, to judge each candidate model
# Let's create our CrossValidator with 3-fold cross-validation
crossval = CrossValidator(estimator=dtPipeline, evaluator=regEval, numFolds=3)

# Let's tune over the dt.maxDepth parameter; create a parameter grid using the ParamGridBuilder
paramGrid = (ParamGridBuilder()
             .addGrid(dt.maxDepth, [6, 7, 8, 9])
             .build())

# Add the grid to the CrossValidator
crossval.setEstimatorParamMaps(paramGrid)

# Now let's find and return the best model
dtModel = crossval.fit(train_data).bestModel

train_data_output = dtModel.transform(train_data)
# from pyspark.sql import functions as F
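# The original print label said RMSE while the evaluator was configured for R^2.
# A minimal sketch, assuming the regEval and train_data_output above, of
# reporting RMSE as well by temporarily switching the evaluator's metric:
rmse = regEval.setMetricName("rmse").evaluate(train_data_output)
print("Root Mean Squared Error: %.2f" % rmse)
regEval.setMetricName("r2")  # restore the metric used for model selection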
print(f"{unique_features_count} of {num_ids} have unique features.") delta_ = delta(start_features) # 2.2 Clustering start_clustering = get_time() print("Starting clustering step.") bkm = BisectingKMeans() print("Fitting data to Bisecting K Means model") model = bkm.fit(features_df) clustering_pipeline = Pipeline(stages=[bkm]) print("Building grid for cross-validation") paramGrid = ParamGridBuilder().addGrid( bkm.k, [2, 5, 10, 20, 50, 70, 100]).build() print("Starting cross-validation") crossval = CrossValidator( estimator=clustering_pipeline, estimatorParamMaps=paramGrid, evaluator=ClusteringEvaluator(), numFolds=3, ) cvModel = crossval.fit(features_df) cluster_df = cvModel.transform(features_df) cluster_df.select("prediction").describe().show() print("Finished step 2.") delta(ml_start) # End execution print("All steps complete.") delta_ = delta(global_start)
result9_df = result8_transformed
splits = result9_df.randomSplit([0.8, 0.2], seed=1)
train = splits[0].cache()
valid = splits[1].cache()
train.show(n)
valid.show(n)

# step 10
lr = LogisticRegression(regParam=0.01, maxIter=100, fitIntercept=True)
bceval = BinaryClassificationEvaluator()
cv = CrossValidator().setEstimator(lr).setEvaluator(bceval).setNumFolds(n_fold)
paramGrid = ParamGridBuilder() \
    .addGrid(lr.maxIter, max_iter) \
    .addGrid(lr.regParam, reg_params) \
    .build()
cv.setEstimatorParamMaps(paramGrid)
cvmodel = cv.fit(train)

print(cvmodel.bestModel.coefficients)
print('')
print(cvmodel.bestModel.intercept)
print('')
print(cvmodel.bestModel.getMaxIter())
print('')
print(cvmodel.bestModel.getRegParam())
    'hdfs:/user/pg1910/pub/goodreads/testing_sample.parquet')

als = ALS(userCol="user_id",
          itemCol="book_id",
          ratingCol="rating",
          coldStartStrategy="drop",
          nonnegative=True)

param_grid = ParamGridBuilder() \
    .addGrid(als.rank, [15, 25, 35]) \
    .addGrid(als.maxIter, [5, 8, 10]) \
    .addGrid(als.regParam, [0.08, 0.09, 0.10]) \
    .build()

evaluator = RegressionEvaluator(metricName="rmse",
                                labelCol="rating",
                                predictionCol="prediction")

cv = CrossValidator(estimator=als,
                    estimatorParamMaps=param_grid,
                    evaluator=evaluator,
                    numFolds=3)

model = cv.fit(df_training)
best_model = model.bestModel

print("Tuned Hyperparameters: -------------")
print("Rank: ", best_model._java_obj.parent().getRank())
print("MaxIter: ", best_model._java_obj.parent().getMaxIter())
print("RegParam: ", best_model._java_obj.parent().getRegParam())

print("Recommendations: ------------------------------")
user_recs = best_model.recommendForAllUsers(500)
# user_recs.write.csv('hdfs:/user/pg1910/pub/goodreads/user_recs.csv')
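# Reaching through _java_obj works but relies on py4j internals. A minimal
# sketch of a pure-Python alternative, assuming the model and best_model above:
import numpy as np

print("Rank:", best_model.rank)  # ALSModel exposes the trained rank directly
# The winning grid point can also be read off the CV model itself; the metric
# is RMSE, so lower is better, hence argmin rather than argmax.
best_idx = int(np.argmin(model.avgMetrics))
print({p.name: v for p, v in model.getEstimatorParamMaps()[best_idx].items()})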
def main(sqlContext):
    """Main function takes a Spark SQL context."""
    # YOUR CODE HERE
    # YOU MAY ADD OTHER FUNCTIONS AS NEEDED

    # Load files
    label = sqlContext.read.load("labeled_data.csv", format="csv", sep=",",
                                 inferSchema="true", header="true")
    if flag:
        comments = sqlContext.read.json("comments-minimal.json.bz2")
        submissions = sqlContext.read.json("submissions.json.bz2")
        print("loading done")
        comments.write.parquet("comments_data")
        submissions.write.parquet("submissions_data")
        print("writing done")
    else:
        comments = sqlContext.read.parquet("comments")
        submissions = sqlContext.read.parquet("submissions")
        print("loading done")
    comments.show()
    exit()  # Note: this stops the script here, presumably left over from debugging

    if save:
        # Task 7 starts here
        associated = join(comments, label)
        withngrams = associated.withColumn("ngrams", makeNgrams_udf(associated['body']))
        withplabels = withngrams.withColumn("poslabel", pLabel_udf(withngrams['labeldjt']))
        withpnlabels = withplabels.withColumn(
            "neglabel", nLabel_udf(withplabels['labeldjt'])).select(
                "id", "ngrams", "poslabel", "neglabel")
        # withpnlabels.show()
        cv = CountVectorizer(binary=True, inputCol="ngrams", outputCol="features")
        model = cv.fit(withpnlabels)
        model.save("cv.model")
        # model.transform(withpnlabels).show()
        pos = model.transform(withpnlabels).select(
            "id", col("poslabel").alias("label"), "features")
        neg = model.transform(withpnlabels).select(
            "id", col("neglabel").alias("label"), "features")
        # pos.show()
        # neg.show()
        poslr = LogisticRegression(labelCol="label", featuresCol="features", maxIter=10)
        neglr = LogisticRegression(labelCol="label", featuresCol="features", maxIter=10)
        posEvaluator = BinaryClassificationEvaluator()
        negEvaluator = BinaryClassificationEvaluator()
        posParamGrid = ParamGridBuilder().addGrid(poslr.regParam, [1.0]).build()
        negParamGrid = ParamGridBuilder().addGrid(neglr.regParam, [1.0]).build()
        posCrossval = CrossValidator(estimator=poslr,
                                     evaluator=posEvaluator,
                                     estimatorParamMaps=posParamGrid,
                                     numFolds=2)  # for test
        negCrossval = CrossValidator(estimator=neglr,
                                     evaluator=negEvaluator,
                                     estimatorParamMaps=negParamGrid,
                                     numFolds=2)  # for test
        posTrain, posTest = pos.randomSplit([0.5, 0.5])
        negTrain, negTest = neg.randomSplit([0.5, 0.5])
        print("Training positive classifier...")
        posModel = posCrossval.fit(posTrain)
        print("Training negative classifier...")
        negModel = negCrossval.fit(negTrain)
        posModel.save("pos.model")
        negModel.save("neg.model")
        print("trained")
    else:
        # comments.show()
        # submissions.show()
        posModel = CrossValidatorModel.load("pos.model")
        negModel = CrossValidatorModel.load("neg.model")
        model = CountVectorizerModel.load("cv.model")
        # withngrams = comments.withColumn("ngrams", makeNgrams_udf(comments['body']))
        # cv = CountVectorizer(binary=True, inputCol="ngrams", outputCol="features")
        # model = cv.fit(withngrams)
        print("model loaded")

    if predict == 0:
        # Task 8 starts here
        temp_comments = comments.select("id", "link_id", "author_flair_text",
                                        "created_utc", "body")
        clean_comments = temp_comments.withColumn(
            "true_id", getLinkid_udf(temp_comments['link_id']))
        # print(clean_comments.count())
        clean_submissions = submissions.select(col("id").alias("sub_id"), "title")
        # clean_comments.show()
        # clean_submissions.show()
        com_sub = clean_comments.join(
            clean_submissions,
            clean_comments.true_id == clean_submissions.sub_id, "inner")
        com_sub.write.parquet("com_sub")
    else:
        # Task 9 starts here
        com_sub = sqlContext.read.parquet("com_sub")
        com_sub = com_sub.sample(False, 0.0001, None)
        filtered = com_sub.filter("body NOT LIKE '%/s%' and body NOT LIKE '>%'")
        # print(filtered.count())
        filtered_ngrams = filtered.withColumn("ngrams", makeNgrams_udf(filtered['body']))
        # filtered_ngrams = filtered_ngrams.sample(False, 0.01, None)
        print("prepared")
        featuredata = model.transform(filtered_ngrams).select(
            "id", "author_flair_text", "created_utc", "sub_id", "title", "features")
        posResult = posModel.transform(featuredata)
        negResult = negModel.transform(featuredata)
        # posResult.show()
        # negResult.show()
        poslabel = posResult.withColumn(
            "positive", posTh_udf(posResult['probability'])
        )  # .select("id", "author_flair_text", "created_utc", "title", "positive")
        neglabel = negResult.withColumn(
            "negtive", negTh_udf(negResult['probability'])
        )  # .select(col("id").alias("nid"), "author_flair_text", "created_utc", "title", "negtive")
        print("predict done")
        # poslabel.show()
        # neglabel.show()
        # how to combine these 2 tables???

        # Task 10 starts here
        # c_all = poslabel.count()
        all_day = poslabel.withColumn(
            "date", from_unixtime('created_utc').cast(DateType())).groupby("date").count()
        pos_posts = poslabel.filter("positive = 1")
        # c_pos_posts = pos_posts.count()
        # p_pos_posts = c_pos_posts / c_all
        # print(p_pos_posts)
        # neg_posts = neglabel.filter("negtive = 1")
        # c_neg_posts = neg_posts.count()
        # p_neg_posts = c_neg_posts / c_all
        # print(p_neg_posts)
        pos_day = pos_posts.withColumn(
            "pos_date", from_unixtime('created_utc').cast(DateType())
        ).groupby("pos_date").count().withColumnRenamed("count", "pos_count")
        # The ratio must be computed on columns, not bare Python names
        p_pos_day = all_day.join(pos_day, all_day.date == pos_day.pos_date, "left") \
            .withColumn("pos_per", col("pos_count") / col("count"))
        p_pos_day.show()
        print("end")
(11, "jogos são legais", 0.0) ], ["id", "text", "label"]) # Configure an ML pipeline, which consists of tree stages: tokenizer, hashingTF, and lr. tokenizer = Tokenizer(inputCol="text", outputCol="words") hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features") lr = LogisticRegression(maxIter=10) pipeline = Pipeline(stages=[tokenizer, hashingTF, lr]) paramGrid = ParamGridBuilder() \ .addGrid(hashingTF.numFeatures, [10, 100, 1000]) \ .addGrid(lr.regParam, [0.1, 0.01]) \ .build() crossval = CrossValidator(estimator=pipeline, estimatorParamMaps=paramGrid, evaluator=BinaryClassificationEvaluator(), numFolds=3) cvModel = crossval.fit(training) test = spark.createDataFrame([ (4, "eu gosto de jogar"), (5, "faz tempo que não vou a igreja"), (6, "jesus cristo"), (7, "muitos jogos legais lançados recentemente") ], ["id", "text"]) prediction = cvModel.transform(test) selected = prediction.select("id", "text", "probability", "prediction") for row in selected.collect(): print(row)
# Make predictions
predictionDF = model.transform(testDF)

# Choose (observation, prediction) pairs; needed for calculating metrics
metricDF = predictionDF.select('PE', 'prediction')

# Calculate metrics
metrics = RegressionMetrics(metricDF.rdd)
print("Default LR metrics: RMSE = %s, R2 = %s, MSE = %s"
      % (metrics.rootMeanSquaredError, metrics.r2, metrics.meanSquaredError))

# b) Parameter tuning using CV
paramGrid = ParamGridBuilder() \
    .addGrid(lr.maxIter, [10, 25, 50]) \
    .addGrid(lr.regParam, [0.1, 0.2, 0.3, 0.4]) \
    .build()
cv = CrossValidator(estimator=pipeline,
                    estimatorParamMaps=paramGrid,
                    evaluator=RegressionEvaluator().setLabelCol('PE').setMetricName('rmse'),
                    numFolds=3)
cvModel = cv.fit(trainDF)

# Make predictions
predictionDF = cvModel.transform(testDF)

# Choose (observation, prediction) pairs; needed for calculating metrics
metricDF = predictionDF.select('PE', 'prediction')

# Calculate metrics
metrics = RegressionMetrics(metricDF.rdd)
print("CV LR metrics: RMSE = %s, R2 = %s, MSE = %s"
      % (metrics.rootMeanSquaredError, metrics.r2, metrics.meanSquaredError))
# print('Best Param (regParam, maxIter): (%s, %s)'
#       % (cvModel.bestModel.stages[-1]._java_obj.parent().getRegParam(),
#          cvModel.bestModel.stages[-1]._java_obj.parent().getMaxIter()))
train00, test0 = data_pre.randomSplit([0.8, 0.2], seed=24)
train0, validation0 = train00.randomSplit([0.8, 0.2], seed=24)

# Fit the feature pipeline once on the training split, then transform each split.
# (The original fit on test0 and transformed train0 three times, which both leaks
# and never actually produced test/validation features.)
fitted_pipeline = pipeline.fit(train0)
train = fitted_pipeline.transform(train0)
test = fitted_pipeline.transform(test0)
validation = fitted_pipeline.transform(validation0)

# In[79]:

# Build model 1
lr = LogisticRegression(maxIter=20)
paramGrid1 = ParamGridBuilder() \
    .addGrid(lr.regParam, [0.3, 0.01]) \
    .addGrid(lr.elasticNetParam, [1.0, 0.0]) \
    .build()  # elasticNetParam = 1.0 gives L1, 0.0 gives L2
evaluator1 = MulticlassClassificationEvaluator(metricName="f1")
crossval = CrossValidator(estimator=lr,
                          estimatorParamMaps=paramGrid1,
                          evaluator=evaluator1,
                          numFolds=3)
lr_model = crossval.fit(train)
print('lrpre value: {}'.format(evaluator1.evaluate(lr_model.transform(validation))))

# In[80]:

lr_model.getEstimatorParamMaps()
# lr_model.bestModel
# lr_model.avgMetrics

# In[81]:

lr_model.bestModel
def IntanceFitModel(Mtype, classifier, classes, features, folds, train):

    if Mtype == "OneVsRest":
        # Instantiate the base classifier.
        lr = LogisticRegression()
        # Instantiate the One-vs-Rest classifier.
        OVRclassifier = OneVsRest(classifier=lr)
        # fitModel = OVRclassifier.fit(train)
        # Add parameters of your choice here:
        paramGrid = ParamGridBuilder().addGrid(lr.regParam, [0.1, 0.01]).build()
        # CrossValidator requires the following parameters:
        crossval = CrossValidator(estimator=OVRclassifier,
                                  estimatorParamMaps=paramGrid,
                                  evaluator=MulticlassClassificationEvaluator(),
                                  numFolds=folds)  # 3 is best practice
        # Run cross-validation, and choose the best set of parameters.
        fitModel = crossval.fit(train)
        return fitModel

    if Mtype == "MultilayerPerceptronClassifier":
        # Specify layers for the neural network: an input layer the size of the
        # feature vector, two intermediate layers (features+1 and features wide),
        # and an output layer with one node per class.
        # Note: CrossValidator cannot be used here.
        features_count = len(features[0][0])
        layers = [features_count, features_count + 1, features_count, classes]
        MPC_classifier = MultilayerPerceptronClassifier(maxIter=100,
                                                        layers=layers,
                                                        blockSize=128,
                                                        seed=1234)
        fitModel = MPC_classifier.fit(train)
        return fitModel

    if Mtype in ("LinearSVC", "GBTClassifier") and classes != 2:
        # These classifiers currently only accept binary classification.
        print(Mtype, " could not be used because PySpark currently only accepts"
                     " binary classification data for this algorithm")
        return

    if Mtype in ("LogisticRegression", "NaiveBayes", "RandomForestClassifier",
                 "GBTClassifier", "LinearSVC", "DecisionTreeClassifier"):

        # Add parameters of your choice here:
        if Mtype in ("LogisticRegression"):
            paramGrid = (ParamGridBuilder()
                         # .addGrid(classifier.regParam, [0.1, 0.01])
                         .addGrid(classifier.maxIter, [10, 15, 20])
                         .build())

        # Add parameters of your choice here:
        if Mtype in ("NaiveBayes"):
            paramGrid = (ParamGridBuilder()
                         .addGrid(classifier.smoothing, [0.0, 0.2, 0.4, 0.6])
                         .build())

        # Add parameters of your choice here:
        if Mtype in ("RandomForestClassifier"):
            paramGrid = (ParamGridBuilder()
                         .addGrid(classifier.maxDepth, [2, 5, 10])
                         # .addGrid(classifier.maxBins, [5, 10, 20])
                         # .addGrid(classifier.numTrees, [5, 20, 50])
                         .build())

        # Add parameters of your choice here:
        if Mtype in ("GBTClassifier"):
            paramGrid = (ParamGridBuilder()
                         # .addGrid(classifier.maxDepth, [2, 5, 10, 20, 30])
                         # .addGrid(classifier.maxBins, [10, 20, 40, 80, 100])
                         .addGrid(classifier.maxIter, [10, 15, 50, 100])
                         .build())

        # Add parameters of your choice here:
        if Mtype in ("LinearSVC"):
            paramGrid = (ParamGridBuilder()
                         .addGrid(classifier.maxIter, [10, 15])
                         .addGrid(classifier.regParam, [0.1, 0.01])
                         .build())

        # Add parameters of your choice here:
        if Mtype in ("DecisionTreeClassifier"):
            paramGrid = (ParamGridBuilder()
                         # .addGrid(classifier.maxDepth, [2, 5, 10, 20, 30])
                         .addGrid(classifier.maxBins, [10, 20, 40, 80, 100])
                         .build())

        # CrossValidator requires all of the following parameters:
        crossval = CrossValidator(estimator=classifier,
                                  estimatorParamMaps=paramGrid,
                                  evaluator=MulticlassClassificationEvaluator(),
                                  numFolds=folds)  # 3+ is best practice
        # Fit model: run cross-validation, and choose the best set of parameters.
        fitModel = crossval.fit(train)
        return fitModel
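# A minimal usage sketch, assuming a prepared DataFrame named train with
# label/features columns (the names here are illustrative, not from the original).
# The features argument is only consulted by the MultilayerPerceptronClassifier
# branch, so None is fine for the other model types.
fitModel = IntanceFitModel("LogisticRegression", LogisticRegression(),
                           classes=2, features=None, folds=3, train=train)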
# MAGIC
# MAGIC Note that we are using `evaluatorPR` as our `evaluator`, as the Precision-Recall curve is often the better measure for an unbalanced distribution.

# COMMAND ----------

from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

# Build the grid of different parameters
paramGrid = ParamGridBuilder() \
    .addGrid(dt.maxDepth, [5, 10, 15]) \
    .addGrid(dt.maxBins, [10, 20, 30]) \
    .build()

# Build out the cross-validation
crossval = CrossValidator(estimator=dt,
                          estimatorParamMaps=paramGrid,
                          evaluator=evaluatorPR,
                          numFolds=3)
pipelineCV = Pipeline(stages=[indexer, va, crossval])

# Train the model using the pipeline, parameter grid, and preceding BinaryClassificationEvaluator
cvModel_u = pipelineCV.fit(train)

# COMMAND ----------

# MAGIC %md ### Review Results
# MAGIC Review the `areaUnderPR` (area under the Precision-Recall curve) and `areaUnderROC` (area under the Receiver Operating Characteristic curve, also called `AUC`) metrics.

# COMMAND ----------

# Build the best model (training and test datasets)
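# evaluatorPR is defined in an earlier cell of the notebook; a minimal sketch of
# what such an evaluator looks like, assuming the label column is named "label":
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# metricName="areaUnderPR" is what makes CrossValidator optimize for the PR curve.
evaluatorPR = BinaryClassificationEvaluator(labelCol="label",
                                            rawPredictionCol="rawPrediction",
                                            metricName="areaUnderPR")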
# COMMAND ----------

from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import RegressionEvaluator

regEval = RegressionEvaluator(predictionCol="Predicted_PE")
regEval.setLabelCol("PE") \
    .setMetricName("rmse")

regParam = [i / 100.0 for i in range(1, 11)]

grid = ParamGridBuilder().addGrid(lr.regParam, regParam).build()

crossval = CrossValidator(estimator=lrPipeline,
                          estimatorParamMaps=grid,
                          evaluator=regEval,
                          numFolds=5)
cvModel = crossval.fit(trainingSet)

# COMMAND ----------

# MAGIC %md Now that we have tuned, let's see what we got for tuning parameters and what our RMSE was versus our initial model

# COMMAND ----------

predictionsAndLabels = cvModel.transform(testSet)
valuesAndPreds = predictionsAndLabels.select("Predicted_PE", "PE").rdd.map(
    lambda x: (x.__getitem__('Predicted_PE'), x.__getitem__('PE')))
metrics = RegressionMetrics(valuesAndPreds)
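# The markdown cell above asks what the tuning chose. A minimal sketch, assuming
# the cvModel above and that lr is the last stage of lrPipeline; this mirrors the
# _java_obj.parent() pattern used in other snippets here:
bestLr = cvModel.bestModel.stages[-1]
print("Tuned regParam:", bestLr._java_obj.parent().getRegParam())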
from pyspark.ml.classification import RandomForestClassifier

rf = RandomForestClassifier(labelCol="label", featuresCol="features")

from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

pipeline_rf = Pipeline(stages=[rf])

paramGrid = ParamGridBuilder() \
    .addGrid(rf.maxDepth, [2, 3, 4, 5, 6, 7]) \
    .addGrid(rf.numTrees, [100, 300]) \
    .build()

evaluator = BinaryClassificationEvaluator(labelCol="label",
                                          rawPredictionCol="rawPrediction")

crossval = CrossValidator(estimator=pipeline_rf,
                          estimatorParamMaps=paramGrid,
                          evaluator=evaluator,
                          numFolds=3)

## Fitting the CV
CV_model = crossval.fit(trainingData)

## Printing best model
print(CV_model.bestModel.stages[0])

test_pred = CV_model.transform(testData)
print(evaluator.getMetricName(), evaluator.evaluate(test_pred))
evals = list()  # renamed from `eval` to avoid shadowing the Python builtin
for i in range(2):
    # Define the pipeline
    pipeline = Pipeline(stages=[day_of_week_indexer, airline_indexer,
                                hour_departure_indexer, day_of_week_encoder,
                                airline_encoder, hour_departure_encoder,
                                assembler, modelos[i]])
    # Define the cross-validator
    crossval = CrossValidator(estimator=pipeline,
                              estimatorParamMaps=grids[i],
                              evaluator=RegressionEvaluator(),
                              numFolds=10)
    # Fit the models
    cvModel = crossval.fit(train_data)
    evals.append(RegressionEvaluator().evaluate(cvModel.transform(test_data)))
    bestModel.append(cvModel.bestModel.stages[-1])  # bestModel list is assumed to be initialized earlier

# Evaluation of model 1
print(evals[0])

# Best parameters of model 1
print(bestModel[0].extractParamMap().get(bestModel[0].getParam('regParam')))
print(bestModel[0].extractParamMap().get(bestModel[0].getParam('elasticNetParam')))
pipelineFit = pipeline.fit(data)
dataset = pipelineFit.transform(data)
(trainingData, testData) = dataset.randomSplit([0.7, 0.3], seed=100)

# Build the model
lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)

from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# Create ParamGrid for cross-validation
paramGrid = (ParamGridBuilder()
             .addGrid(lr.regParam, [0.1, 0.3, 0.5])          # regularization parameter
             .addGrid(lr.elasticNetParam, [0.0, 0.1, 0.2])   # elastic-net parameter (Ridge = 0)
             # .addGrid(model.maxIter, [10, 20, 50])         # number of iterations
             # .addGrid(idf.numFeatures, [10, 100, 1000])    # number of features
             .build())

# Create 5-fold CrossValidator (the evaluator passed here is assumed to be
# defined in an earlier cell; a fresh MulticlassClassificationEvaluator is built below)
cv = CrossValidator(estimator=lr,
                    estimatorParamMaps=paramGrid,
                    evaluator=evaluator,
                    numFolds=5)

# Run cross-validation; this will likely take a fair amount of time because of
# the number of models being created and tested
cvModel = cv.fit(trainingData)

# Use the test set here so we can measure the accuracy of our model on new data
predictions = cvModel.transform(testData)  # cvModel uses the best model found by cross-validation

# Evaluate the best model
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictions)
# 0.9919124901837848

# 4. Naive Bayes
from pyspark.ml.classification import NaiveBayes

# Create the trainer and set its parameters
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Random forest parameters supplied by the user
user_rf_param_numTreeSet = [4, 8, 16, 32, 64]
user_rf_param_maxDepthSet = [10, 20, 30]
user_rf_param_impuritySet = ['gini', 'entropy']
user_rf_param_numFolds = 3

# Settings for random forest - parameter grid search
rf_paramGrid = ParamGridBuilder() \
    .addGrid(rfclassifier.numTrees, user_rf_param_numTreeSet) \
    .addGrid(rfclassifier.maxDepth, user_rf_param_maxDepthSet) \
    .addGrid(rfclassifier.impurity, user_rf_param_impuritySet) \
    .build()

evaluator = BinaryClassificationEvaluator()
multiEvaluator = MulticlassClassificationEvaluator()

# Set up cross-validation
rf_cv = CrossValidator(estimator=pipeline,
                       evaluator=evaluator,
                       estimatorParamMaps=rf_paramGrid,
                       numFolds=user_rf_param_numFolds)
rf_cvmodel = rf_cv.fit(train)

# Evaluate random forest model performance
from pyspark.sql.functions import udf

rf_predictions = rf_cvmodel.transform(test)
auroc = evaluator.evaluate(rf_predictions, {evaluator.metricName: "areaUnderROC"})
aupr = evaluator.evaluate(rf_predictions, {evaluator.metricName: "areaUnderPR"})
print("The AUROC is %s and the AUPR is %s" % (auroc, aupr))

f1score = multiEvaluator.evaluate(rf_predictions, {multiEvaluator.metricName: "f1"})
weightedPrecision = multiEvaluator.evaluate(rf_predictions,
                                            {multiEvaluator.metricName: "weightedPrecision"})
weightedRecall = multiEvaluator.evaluate(rf_predictions,
                                         {multiEvaluator.metricName: "weightedRecall"})
print("The F1 score: %s the Weighted Precision: %s the Weighted Recall is %s"
      % (f1score, weightedPrecision, weightedRecall))
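# A minimal sketch of reading back the winning grid point, assuming the
# rf_cvmodel above and that rfclassifier is the last stage of pipeline; the
# _java_obj pattern mirrors what other snippets in this collection use:
best_rf = rf_cvmodel.bestModel.stages[-1]
print("Best numTrees:", best_rf.getNumTrees)  # property on the fitted model
print("Best maxDepth:", best_rf._java_obj.parent().getMaxDepth())
print("Best impurity:", best_rf._java_obj.parent().getImpurity())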
pipeline = Pipeline(stages=[vectorAssembler, vectorIndexer, lr])
pipelineFit = pipeline.fit(trainingData)
predictions = pipelineFit.transform(testData)

paramGrid = (ParamGridBuilder()
             .addGrid(lr.regParam, [0.01, 0.5, 2.0])
             .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])
             .addGrid(lr.maxIter, [1, 10, 15])
             .build())
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction",
                                              labelCol="not_fully_paid",
                                              metricName="f1")
cv = CrossValidator(estimator=pipeline,
                    estimatorParamMaps=paramGrid,
                    evaluator=evaluator,
                    numFolds=5)
cvModel = cv.fit(trainingData)

prediction = cvModel.transform(testData)
# show() returns None, so display the top rows without assigning the result
prediction.select("not_fully_paid", "prediction", "probability") \
    .orderBy("probability", ascending=False) \
    .show(n=10, truncate=30)
print("F1: %g" % (evaluator.evaluate(prediction)))
    numTrees=20,
    maxDepth=5,
    predictionCol="prediction",
    rawPredictionCol="rawPrediction",
    probabilityCol="probability",
    labelCol="indexedSurvived",
    featuresCol="features",
    impurity="gini")

evaluator_node10 = MulticlassClassificationEvaluator(labelCol="indexedSurvived",
                                                     predictionCol="prediction",
                                                     metricName="accuracy")

param_grid_node11 = ParamGridBuilder().addGrid(estimator_node9.maxDepth,
                                               [3, 5, 8, 20]).build()
cv_node11 = CrossValidator(estimator=estimator_node9,
                           estimatorParamMaps=param_grid_node11,
                           evaluator=evaluator_node10)
model_node11 = cv_node11.fit(df_node7)
df_node11 = model_node11.transform(df_node7)

model_node7.save("hdfs://namenode:9000/example4/model_1/")
df_node13 = model_node7.transform(df_node3[1])
model_node11.save("hdfs://namenode:9000/example4/model_2/")
df_node14 = model_node11.transform(df_node13)

evaluator_node15 = MulticlassClassificationEvaluator(labelCol="indexedSurvived",
                                                     predictionCol="prediction",
                                                     metricName="accuracy")
score_node15 = evaluator_node15.evaluate(df_node14)
df_node15 = spark.createDataFrame([(score_node15, )], ["score"])
predictionsDF.printSchema()
predictionsDF.select("churn", "prediction", "features", "state_code",
                     "account_length", "area_code", "international_plan").show()

numFolds = 3
paramGrid = ParamGridBuilder() \
    .addGrid(dt.maxDepth, [2, 5, 10, 20, 30]) \
    .addGrid(dt.maxBins, [10, 20, 40, 80, 100]) \
    .build()
evaluator = (BinaryClassificationEvaluator()
             .setLabelCol("label")
             .setRawPredictionCol("prediction"))
cv = (CrossValidator()
      .setEstimator(pipeline)
      .setEvaluator(evaluator)
      .setEstimatorParamMaps(paramGrid)
      .setNumFolds(numFolds))

with open('/dbfs/FileStore/DecisionTreeResults.txt', 'w') as f:
    print("Training model with Decision Tree algorithm", file=f)

cvModel = cv.fit(trainDF)
predictions = cvModel.transform(testDF)
predictions.printSchema()
predictions.show()

resultDF = predictions.select("label", "prediction", "churn")
resultDF.show(10)

accuracy = evaluator.evaluate(predictions)
with open('/dbfs/FileStore/DecisionTreeResults.txt', 'a+') as f:
    print("Classification accuracy of Decision Tree : ", accuracy, file=f)
def _val(target, model):
    clf, paramGrid = model
    evaluator = BinaryClassificationEvaluator(labelCol=target,
                                              rawPredictionCol='prediction')
    # validator = TrainValidationSplit(estimator=clf, estimatorParamMaps=paramGrid, evaluator=evaluator)
    validator = CrossValidator(estimator=clf,
                               estimatorParamMaps=paramGrid,
                               evaluator=evaluator,
                               numFolds=3)
    return validator
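# A minimal usage sketch, assuming RandomForestClassifier/ParamGridBuilder are
# imported and a DataFrame train_df with binary "label" and "features" columns
# exists (all of these names are illustrative, not from the original):
rf = RandomForestClassifier(labelCol='label', featuresCol='features')
grid = ParamGridBuilder().addGrid(rf.maxDepth, [5, 10]).build()
validator = _val('label', (rf, grid))   # build the CrossValidator
cv_model = validator.fit(train_df)      # fit it to get a CrossValidatorModel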