def NeuralNetwork(trainingData, testData):
    start_time = time.time()
    print(" ")
    print("--------------------- NEURAL NETWORK ---------------------")
    layers = [187, 8, 5]
    nn = MultilayerPerceptronClassifier(layers=layers)

    # Parameters to tune
    paramGrid = ParamGridBuilder() \
        .addGrid(nn.stepSize, [1, 0.01]) \
        .addGrid(nn.maxIter, [100, 1000]) \
        .build()

    # Tune over the parameter grid to pick the best model
    tvs = TrainValidationSplit(
        estimator=nn,
        estimatorParamMaps=paramGrid,
        evaluator=MulticlassClassificationEvaluator(),
        # Validation split: 80% training, 20% validation.
        trainRatio=0.8)

    model = tvs.fit(trainingData)
    prediction = model.transform(testData)
    predictionAndLabels = prediction.select("prediction", "label")

    evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
    accuracy = evaluator.evaluate(predictionAndLabels)
    evaluator = MulticlassClassificationEvaluator(metricName="f1")
    f1score = evaluator.evaluate(predictionAndLabels)

    # Confusion matrix
    class_temp = prediction.select("label").groupBy("label") \
        .count().sort('count', ascending=False).toPandas()
    class_temp = class_temp["label"].values.tolist()
    y_true = prediction.select("label").toPandas()
    y_pred = prediction.select("prediction").toPandas()
    cnf_matrix = confusion_matrix(y_true, y_pred, labels=class_temp)

    print("Accuracy Hold out: ", accuracy)
    print("F1-Score Hold out: ", f1score)
    print("")
    print("")
    print("Doc Parameters : [", model.explainParams(), "]")
    print("")
    print("")
    print("Confusion Matrix: ")
    print(cnf_matrix)
    print("Neural Network HoldOut Execution TIME:", time.time() - start_time)

    # Call the NN variant that uses K-fold cross-validation
    f1score_cv, cnf_matrix_cv, cv = NeuralNetworkCV(trainingData, testData)

    # Return whichever scored better: Hold-Out or K-Fold
    if f1score <= f1score_cv:
        return (f1score_cv, cnf_matrix_cv, cv)
    else:
        return (f1score, cnf_matrix, tvs)
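# The function above calls NeuralNetworkCV, which is not defined in this
# snippet. A minimal sketch of what such a helper could look like follows --
# everything in it is an assumption (same layers, same grid, same sklearn
# confusion_matrix import), with model selection done by a k-fold
# CrossValidator instead of a single train/validation split.
def NeuralNetworkCV(trainingData, testData, k=5):
    layers = [187, 8, 5]
    nn = MultilayerPerceptronClassifier(layers=layers)
    paramGrid = ParamGridBuilder() \
        .addGrid(nn.stepSize, [1, 0.01]) \
        .addGrid(nn.maxIter, [100, 1000]) \
        .build()
    cv = CrossValidator(estimator=nn,
                        estimatorParamMaps=paramGrid,
                        evaluator=MulticlassClassificationEvaluator(),
                        numFolds=k)
    model = cv.fit(trainingData)
    prediction = model.transform(testData)
    f1score = MulticlassClassificationEvaluator(metricName="f1") \
        .evaluate(prediction.select("prediction", "label"))
    y_true = prediction.select("label").toPandas()
    y_pred = prediction.select("prediction").toPandas()
    cnf_matrix = confusion_matrix(y_true, y_pred)
    return f1score, cnf_matrix, cv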
    vectorassembler_stage
]
pipeline = Pipeline(stages=all_stages)
pipeline_model = pipeline.fit(taxi)
final_columns = feature_columns + ['features', 'label']
taxi_df = pipeline_model.transform(taxi).select(final_columns)
#taxi_df.show(5)

train, test = taxi_df.randomSplit([0.8, 0.2], seed=1234)

random_forest = RandomForestClassifier(featuresCol='features', labelCol='label')
param_grid = ParamGridBuilder().\
    addGrid(random_forest.maxDepth, [2, 3, 4]).\
    addGrid(random_forest.minInfoGain, [0.0, 0.1, 0.2, 0.3]).\
    build()
evaluator = BinaryClassificationEvaluator()
crossvalidation = CrossValidator(estimator=random_forest,
                                 estimatorParamMaps=param_grid,
                                 evaluator=evaluator)
# Note: fitting on taxi_df (the full dataset) rather than on the train split
# leaks the test rows into model selection; fitting on `train` would keep the
# evaluation below honest.
crossvalidation_mod = crossvalidation.fit(taxi_df)
pred_test = crossvalidation_mod.transform(test)
pred_test.show(5)
label_pred_test = pred_test.select('label', 'prediction')
label_pred_test.rdd.zipWithIndex().countByKey()
# BinaryClassificationEvaluator only reports ROC/PR areas, so the metric
# labels below name the actual quantities computed.
print('Area under ROC : ', evaluator.setMetricName('areaUnderROC').evaluate(pred_test))
# The snippet was truncated here; a plausible completion (assumption):
print('Area under PR  : ', evaluator.setMetricName('areaUnderPR').evaluate(pred_test))
def sparse_vector_to_array(dv):
    print(type(dv))
    new_array = list([float(x) for x in dv])
    return new_array

sparse_vector_to_array_udf = F.udf(sparse_vector_to_array, T.ArrayType(T.FloatType()))

# COMMAND ----------

# DBTITLE 1,Build a grid to test hyperparameters
from pyspark.ml.tuning import ParamGridBuilder

paramGrid = (ParamGridBuilder()
             .addGrid(rf.maxBins, [20, 30, 30])  # note: 30 appears twice, so only two distinct maxBins values are actually tried
             .addGrid(rf.maxDepth, [3, 5, 10])
             .addGrid(rf.numTrees, [10, 80])
             .build()
             )

# COMMAND ----------

# DBTITLE 1,Set validation method for comparison
from pyspark.ml.evaluation import RegressionEvaluator, MulticlassClassificationEvaluator
from pyspark.ml.tuning import CrossValidator

evaluator = MulticlassClassificationEvaluator().setLabelCol("label")

cv = CrossValidator(
    estimator = pipeline,            # Estimator (individual model or pipeline)
    estimatorParamMaps = paramGrid,  # Grid of parameters to try (grid search)
pipeline_model = pipeline.fit(data)
data_set = pipeline_model.transform(data)
train_data, test_data = data_set.randomSplit([0.7, 0.3], seed=0)
print(train_data.count(), test_data.count())

clf = LogisticRegression()
clf_model = clf.fit(train_data)
predict = clf_model.transform(test_data)
evaluator = BinaryClassificationEvaluator()
print(evaluator.evaluate(predict))

rf = RandomForestClassifier()
grid = ParamGridBuilder().addGrid(rf.numTrees, [1, 3, 5]) \
    .addGrid(rf.maxDepth, [3, 5, 7]) \
    .addGrid(rf.maxBins, [20, 30, 40]).build()
cv = CrossValidator(estimator=rf, evaluator=evaluator,
                    estimatorParamMaps=grid, numFolds=5)
cv_model = cv.fit(train_data)
cv_model_predict = cv_model.transform(test_data)
print(evaluator.evaluate(cv_model_predict))

metrics = ComputeModelStatistics(
    evaluationMetric='classification',
    labelCol='label',
    scoresCol='probability',
    lag(col('Arrest'), count=1).over(
        Window().partitionBy('Beat').orderBy('Year_Week'))).na.drop())
crime_beat_week_lagged = (crime_beat_week_lagged.withColumn(
    'Lag_Domestic',
    lag(col('Domestic'), count=1).over(
        Window().partitionBy('Beat').orderBy('Year_Week'))).na.drop())
crime_beat_week_lagged.registerTempTable('crime_beat_week_lagged')
#export_data.to_csv("q2_3_beat_cnt.csv", index=False)

# run model
input_cols = ['Lag1', 'Lag2', 'Lag3', 'Lag_Arrest', 'Lag_Domestic', 'week']
assembler = VectorAssembler(inputCols=input_cols, outputCol='features')
rf = RandomForestRegressor(numTrees=30)
stages = [assembler, rf]
pipeline = Pipeline(stages=stages)
param_grid = ParamGridBuilder().addGrid(rf.maxDepth, [5, 6, 7]).build()
evaluator = RegressionEvaluator(labelCol='label', predictionCol='prediction',
                                metricName='r2')
rf_grid = CrossValidator(estimator=pipeline, estimatorParamMaps=param_grid,
                         evaluator=evaluator,
                         numFolds=3).fit(crime_beat_week_lagged)
lagged_fitted = rf_grid.transform(crime_beat_week_lagged)

# print r-square
eval_metric = evaluator.evaluate(lagged_fitted)
print("evaluation answer")
print("-----------------------------------")
evaluator = BinaryClassificationEvaluator()
evaluator.evaluate(predictions)

# COMMAND ----------

# MAGIC %md Now we will try tuning the model with the `ParamGridBuilder` and the `CrossValidator`.
# MAGIC
# MAGIC As we indicate 4 values for maxDepth and 3 values for maxBins, this grid will have 4 x 3 = 12 parameter settings for `CrossValidator` to choose from. We will create a 3-fold CrossValidator.

# COMMAND ----------

# Create ParamGrid for Cross Validation
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

paramGrid = (ParamGridBuilder().addGrid(dt.maxDepth, [1, 2, 6, 10]).addGrid(
    dt.maxBins, [20, 40, 80]).build())

# COMMAND ----------

# Create 3-fold CrossValidator
cv = CrossValidator(estimator=pipeline, estimatorParamMaps=paramGrid,
                    evaluator=evaluator, numFolds=3)

# Run cross validations (this can take several minutes to execute)
cvModel = cv.fit(trainingData2)

# COMMAND ----------

# MAGIC %md Now let's create new predictions with which to measure the accuracy of our model.
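# COMMAND ----------

# The 12-setting claim above is easy to sanity-check: ParamGridBuilder().build()
# returns a plain Python list with one ParamMap per combination (a sketch,
# assuming the paramGrid built above).
print(len(paramGrid))   # expected: 4 maxDepth values x 3 maxBins values = 12
print(paramGrid[0])     # the first combination, as a {Param: value} map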
def SparkML(train_df, test_df=None, featuresCol='features', labelCol='label',
            binaryclass=False, multiclass=False, n_cluster=2,
            userCol='user', itemCol='item', ratingCol='rating', rank=10,
            userid=3, itemid=3, itemsCol='items',
            minSupport=0.3, minConfidence=0.8,
            stringIndexer=False, inputColStringIndexer=None,
            outputColStringIndexer=None,
            oneHotEncoder=False, inputColOneHotEncoder=None,
            outputColOneHotEncoder=None,
            vectorAssembler=False, inputColsVectorAssembler=None,
            outputColsVectorAssembler=None,
            vectorIndexer=False, inputColsVectorIndexer=None,
            outputColsVectorIndexer=None, maxCategories=None,
            classification=False, logisticregression=False,
            decisiontreeclassifier=False, linearsvc=False, naivebayes=False,
            randomforestclassifier=False, gbtclassifier=False,
            regression=False, linearregression=True,
            decisiontreeregressor=False, randomforestregressor=False,
            gbtregressor=False,
            clustering=False, kmeans=False, gaussianmixture=False, lda=False,
            recommendation=False, als=False,
            association=False, fpgrowth=False):

    # Every model branch builds the identical feature-transform stage list,
    # so build it through one local helper instead of repeating the call
    # verbatim in each branch.
    def build_stages():
        return FeaturesTransform(
            stringIndexer=stringIndexer,
            inputColStringIndexer=inputColStringIndexer,
            outputColStringIndexer=outputColStringIndexer,
            oneHotEncoder=oneHotEncoder,
            inputColOneHotEncoder=inputColOneHotEncoder,
            outputColOneHotEncoder=outputColOneHotEncoder,
            vectorAssembler=vectorAssembler,
            inputColsVectorAssembler=inputColsVectorAssembler,
            outputColsVectorAssembler=outputColsVectorAssembler,
            vectorIndexer=vectorIndexer,
            inputColsVectorIndexer=inputColsVectorIndexer,
            outputColsVectorIndexer=outputColsVectorIndexer,
            maxCategories=maxCategories)

    # The classifier branches share the same evaluator choice: ROC AUC for
    # binary problems, accuracy for multiclass. Exactly one of binaryclass /
    # multiclass should be set by the caller.
    def classifier_evaluator():
        if binaryclass:
            return BinaryClassificationEvaluator(
                rawPredictionCol="RawPrediction", labelCol=labelCol,
                metricName="areaUnderROC")
        if multiclass:
            return MulticlassClassificationEvaluator(
                labelCol=labelCol, predictionCol="Prediction",
                metricName="accuracy")

    if classification:
        if logisticregression:
            stagesList = build_stages()
            LRClassifier = LogisticRegression(
                featuresCol=featuresCol, labelCol=labelCol,
                predictionCol='Prediction', probabilityCol='Probability',
                rawPredictionCol='RawPrediction', standardization=True,
                maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-06,
                fitIntercept=True, threshold=0.5)
            paramGrid = ParamGridBuilder() \
                .addGrid(LRClassifier.maxIter,
                         [10, 20, 50, 100, 200, 300, 500, 1000, 2000, 5000]) \
                .addGrid(LRClassifier.regParam,
                         [0.0, 0.01, 0.05, 0.1, 0.5, 1.0, 2.0, 5.0, 10.0, 20.0]) \
                .build()
            evaluator = classifier_evaluator()
            LRCV = CrossValidator(estimator=LRClassifier, evaluator=evaluator,
                                  estimatorParamMaps=paramGrid, numFolds=10)
            stagesList.append(LRCV)
            LRC_Pipeline = Pipeline(stages=stagesList)
            LRC_PipelineModel = LRC_Pipeline.fit(train_df)
            LRC_Predicted = LRC_PipelineModel.transform(test_df)
            LRC_BestModel = LRC_PipelineModel.stages[-1].bestModel
            LRC_Probability = LRC_Predicted.select("Probability").toPandas()
            LRC_Prediction = LRC_Predicted.select("Prediction").toPandas()
            LRC_Score = evaluator.evaluate(LRC_Predicted)
            return LRC_BestModel, LRC_Predicted, LRC_Probability, \
                LRC_Prediction, LRC_Score

        if decisiontreeclassifier:
            stagesList = build_stages()
            DTClassifier = DecisionTreeClassifier(
                featuresCol=featuresCol, labelCol=labelCol,
                predictionCol='Prediction', probabilityCol='Probability',
                rawPredictionCol='RawPrediction', maxDepth=5, maxBins=32,
                minInstancesPerNode=1, minInfoGain=0.0, impurity='gini',
                seed=None)
            paramGrid = ParamGridBuilder() \
                .addGrid(DTClassifier.impurity, ["gini", "entropy"]) \
                .addGrid(DTClassifier.maxDepth, [3, 5, 10, 15, 20, 25]) \
                .addGrid(DTClassifier.maxBins, [3, 5, 10, 50, 100, 200]) \
                .build()
            evaluator = classifier_evaluator()
            DTCV = CrossValidator(estimator=DTClassifier, evaluator=evaluator,
                                  estimatorParamMaps=paramGrid, numFolds=10)
            stagesList.append(DTCV)
            DTC_Pipeline = Pipeline(stages=stagesList)
            DTC_PipelineModel = DTC_Pipeline.fit(train_df)
            DTC_Predicted = DTC_PipelineModel.transform(test_df)
            DTC_BestModel = DTC_PipelineModel.stages[-1].bestModel
            DTC_Probability = DTC_Predicted.select("Probability").toPandas()
            DTC_Prediction = DTC_Predicted.select("Prediction").toPandas()
            DTC_Score = evaluator.evaluate(DTC_Predicted)
            return DTC_BestModel, DTC_Predicted, DTC_Probability, \
                DTC_Prediction, DTC_Score

        if linearsvc:
            stagesList = build_stages()
            SVClassifier = LinearSVC(
                featuresCol=featuresCol, labelCol=labelCol,
                predictionCol='Prediction', rawPredictionCol='RawPrediction',
                maxIter=100, regParam=0.0, tol=1e-06, fitIntercept=True,
                standardization=True, threshold=0.0)
            paramGrid = ParamGridBuilder() \
                .addGrid(SVClassifier.maxIter,
                         [10, 20, 50, 100, 200, 300, 500, 1000, 2000, 5000]) \
                .addGrid(SVClassifier.regParam,
                         [0.0, 0.01, 0.05, 0.1, 0.5, 1.0, 2.0, 5.0, 10.0, 20.0]) \
                .build()
            evaluator = classifier_evaluator()
            SVCV = CrossValidator(estimator=SVClassifier, evaluator=evaluator,
                                  estimatorParamMaps=paramGrid, numFolds=10)
            stagesList.append(SVCV)
            SVC_Pipeline = Pipeline(stages=stagesList)
            SVC_PipelineModel = SVC_Pipeline.fit(train_df)
            SVC_Predicted = SVC_PipelineModel.transform(test_df)
            SVC_BestModel = SVC_PipelineModel.stages[-1].bestModel
            SVC_Prediction = SVC_Predicted.select("Prediction").toPandas()
            SVC_Score = evaluator.evaluate(SVC_Predicted)
            return SVC_BestModel, SVC_Predicted, SVC_Prediction, SVC_Score

        if naivebayes:
            stagesList = build_stages()
            NBClassifier = NaiveBayes(
                featuresCol=featuresCol, labelCol=labelCol,
                predictionCol='Prediction', probabilityCol='Probability',
                rawPredictionCol='RawPrediction', smoothing=1.0,
                modelType='multinomial', thresholds=None)
            paramGrid = ParamGridBuilder() \
                .addGrid(NBClassifier.smoothing,
                         [0.1, 0.5, 1.0, 2.0, 5.0, 10.0]) \
                .build()
            evaluator = classifier_evaluator()
            NBCV = CrossValidator(estimator=NBClassifier, evaluator=evaluator,
                                  estimatorParamMaps=paramGrid, numFolds=10)
            stagesList.append(NBCV)
            NBC_Pipeline = Pipeline(stages=stagesList)
            NBC_PipelineModel = NBC_Pipeline.fit(train_df)
            NBC_Predicted = NBC_PipelineModel.transform(test_df)
            NBC_BestModel = NBC_PipelineModel.stages[-1].bestModel
            NBC_Probability = NBC_Predicted.select("Probability").toPandas()
            NBC_Prediction = NBC_Predicted.select("Prediction").toPandas()
            NBC_Score = evaluator.evaluate(NBC_Predicted)
            return NBC_BestModel, NBC_Predicted, NBC_Probability, \
                NBC_Prediction, NBC_Score

        if randomforestclassifier:
            stagesList = build_stages()
            RFClassifier = RandomForestClassifier(
                featuresCol=featuresCol, labelCol=labelCol,
                predictionCol='Prediction', probabilityCol='Probability',
                rawPredictionCol='RawPrediction', maxDepth=5, maxBins=32,
                minInstancesPerNode=1, minInfoGain=0.0, impurity='gini',
                numTrees=20, featureSubsetStrategy='auto', seed=None,
                subsamplingRate=1.0)
            # NOTE: grid size multiplies across addGrid calls; this grid has
            # 2 * 6 * 6 * 6 * 6 = 2592 candidates, which is very expensive
            # with 10-fold cross-validation.
            paramGrid = ParamGridBuilder() \
                .addGrid(RFClassifier.impurity, ["gini", "entropy"]) \
                .addGrid(RFClassifier.maxDepth, [3, 5, 10, 15, 20, 25]) \
                .addGrid(RFClassifier.maxBins, [3, 5, 10, 50, 100, 200]) \
                .addGrid(RFClassifier.numTrees, [5, 10, 20, 50, 100, 200]) \
                .addGrid(RFClassifier.subsamplingRate,
                         [0.1, 0.2, 0.5, 0.8, 0.9, 1.0]) \
                .build()
            evaluator = classifier_evaluator()
            RFCV = CrossValidator(estimator=RFClassifier, evaluator=evaluator,
                                  estimatorParamMaps=paramGrid, numFolds=10)
            stagesList.append(RFCV)
            RFC_Pipeline = Pipeline(stages=stagesList)
            RFC_PipelineModel = RFC_Pipeline.fit(train_df)
            RFC_Predicted = RFC_PipelineModel.transform(test_df)
            RFC_BestModel = RFC_PipelineModel.stages[-1].bestModel
            RFC_Probability = RFC_Predicted.select("Probability").toPandas()
            RFC_Prediction = RFC_Predicted.select("Prediction").toPandas()
            RFC_Score = evaluator.evaluate(RFC_Predicted)
            return RFC_BestModel, RFC_Predicted, RFC_Probability, \
                RFC_Prediction, RFC_Score

        if gbtclassifier:
            stagesList = build_stages()
            GBClassifier = GBTClassifier(
                featuresCol=featuresCol, labelCol=labelCol,
                predictionCol='Prediction', maxDepth=5, maxBins=32,
                minInstancesPerNode=1, minInfoGain=0.0, lossType='logistic',
                maxIter=20, stepSize=0.1, seed=None, subsamplingRate=1.0)
            paramGrid = ParamGridBuilder() \
                .addGrid(GBClassifier.maxDepth, [3, 5, 10, 15, 20, 25]) \
                .addGrid(GBClassifier.maxBins, [3, 5, 10, 50, 100, 200]) \
                .addGrid(GBClassifier.maxIter, [5, 10, 20, 50, 100, 200]) \
                .addGrid(GBClassifier.stepSize,
                         [0.01, 0.05, 0.1, 0.2, 0.5, 1.0]) \
                .addGrid(GBClassifier.subsamplingRate,
                         [0.1, 0.2, 0.5, 0.8, 0.9, 1.0]) \
                .build()
            evaluator = MulticlassClassificationEvaluator(
                labelCol=labelCol, predictionCol="Prediction",
                metricName="accuracy")
            GBCV = CrossValidator(estimator=GBClassifier, evaluator=evaluator,
                                  estimatorParamMaps=paramGrid, numFolds=10)
            stagesList.append(GBCV)
            GBC_Pipeline = Pipeline(stages=stagesList)
            GBC_PipelineModel = GBC_Pipeline.fit(train_df)
            GBC_Predicted = GBC_PipelineModel.transform(test_df)
            GBC_BestModel = GBC_PipelineModel.stages[-1].bestModel
            GBC_Prediction = GBC_Predicted.select("Prediction").toPandas()
            GBC_Score = evaluator.evaluate(GBC_Predicted)
            return GBC_BestModel, GBC_Predicted, GBC_Prediction, GBC_Score

    if regression:
        if linearregression:
            stagesList = build_stages()
            LRegressor = LinearRegression(
                featuresCol=featuresCol, labelCol=labelCol,
                predictionCol='Prediction', standardization=True,
                fitIntercept=True, loss='squaredError', maxIter=100,
                regParam=0.0, elasticNetParam=0.0, tol=1e-06, epsilon=1.35)
            paramGrid = ParamGridBuilder() \
                .addGrid(LRegressor.maxIter,
                         [10, 20, 50, 100, 200, 300, 500, 1000, 2000, 5000]) \
                .addGrid(LRegressor.regParam,
                         [0.0, 0.01, 0.05, 0.1, 0.5, 1.0, 2.0, 5.0, 10.0, 20.0]) \
                .build()
            evaluator = RegressionEvaluator(labelCol=labelCol,
                                            predictionCol="Prediction",
                                            metricName="rmse")
            LRCV = CrossValidator(estimator=LRegressor, evaluator=evaluator,
                                  estimatorParamMaps=paramGrid, numFolds=10)
            stagesList.append(LRCV)
            LR_Pipeline = Pipeline(stages=stagesList)
            LR_PipelineModel = LR_Pipeline.fit(train_df)
            LR_Predicted = LR_PipelineModel.transform(test_df)
            LR_BestModel = LR_PipelineModel.stages[-1].bestModel
            LR_Prediction = LR_Predicted.select("Prediction").toPandas()
            LR_Score = evaluator.evaluate(LR_Predicted)
            return LR_BestModel, LR_Predicted, LR_Prediction, LR_Score

        if decisiontreeregressor:
            stagesList = build_stages()
            DTRegressor = DecisionTreeRegressor(
                featuresCol=featuresCol, labelCol=labelCol,
                predictionCol='Prediction', maxDepth=5, maxBins=32,
                minInstancesPerNode=1, minInfoGain=0.0, impurity='variance',
                seed=None, varianceCol=None)
            paramGrid = ParamGridBuilder() \
                .addGrid(DTRegressor.maxDepth, [3, 5, 10, 15, 20, 25]) \
                .addGrid(DTRegressor.maxBins, [3, 5, 10, 50, 100, 200]) \
                .build()
            evaluator = RegressionEvaluator(labelCol=labelCol,
                                            predictionCol="Prediction",
                                            metricName="rmse")
            DTRCV = CrossValidator(estimator=DTRegressor, evaluator=evaluator,
                                   estimatorParamMaps=paramGrid, numFolds=10)
            stagesList.append(DTRCV)
            DTR_Pipeline = Pipeline(stages=stagesList)
            DTR_PipelineModel = DTR_Pipeline.fit(train_df)
            DTR_Predicted = DTR_PipelineModel.transform(test_df)
            DTR_BestModel = DTR_PipelineModel.stages[-1].bestModel
            DTR_Prediction = DTR_Predicted.select("Prediction").toPandas()
            DTR_Score = evaluator.evaluate(DTR_Predicted)
            return DTR_BestModel, DTR_Predicted, DTR_Prediction, DTR_Score

        if randomforestregressor:
            stagesList = build_stages()
            RFRegressor = RandomForestRegressor(
                featuresCol=featuresCol, labelCol=labelCol,
                predictionCol='Prediction', maxDepth=5, maxBins=32,
                minInstancesPerNode=1, minInfoGain=0.0, impurity='variance',
                subsamplingRate=1.0, seed=None, numTrees=20)
            paramGrid = ParamGridBuilder() \
                .addGrid(RFRegressor.maxDepth, [3, 5, 10, 15, 20, 25]) \
                .addGrid(RFRegressor.maxBins, [3, 5, 10, 50, 100, 200]) \
                .addGrid(RFRegressor.numTrees, [5, 10, 20, 50, 100, 200]) \
                .addGrid(RFRegressor.subsamplingRate,
                         [0.1, 0.2, 0.5, 0.8, 0.9, 1.0]) \
                .build()
            evaluator = RegressionEvaluator(labelCol=labelCol,
                                            predictionCol="Prediction",
                                            metricName="rmse")
            RFRCV = CrossValidator(estimator=RFRegressor, evaluator=evaluator,
                                   estimatorParamMaps=paramGrid, numFolds=10)
            stagesList.append(RFRCV)
            RFR_Pipeline = Pipeline(stages=stagesList)
            RFR_PipelineModel = RFR_Pipeline.fit(train_df)
            RFR_Predicted = RFR_PipelineModel.transform(test_df)
            RFR_BestModel = RFR_PipelineModel.stages[-1].bestModel
            RFR_Prediction = RFR_Predicted.select("Prediction").toPandas()
            RFR_Score = evaluator.evaluate(RFR_Predicted)
            return RFR_BestModel, RFR_Predicted, RFR_Prediction, RFR_Score

        if gbtregressor:
            stagesList = build_stages()
            GBRegressor = GBTRegressor(
                featuresCol=featuresCol, labelCol=labelCol,
                predictionCol='Prediction', maxDepth=5, maxBins=32,
                minInstancesPerNode=1, minInfoGain=0.0, subsamplingRate=1.0,
                lossType='squared', maxIter=20, stepSize=0.1, seed=None,
                impurity='variance')
            paramGrid = ParamGridBuilder() \
                .addGrid(GBRegressor.maxDepth, [3, 5, 10, 15, 20, 25]) \
                .addGrid(GBRegressor.maxBins, [3, 5, 10, 50, 100, 200]) \
                .addGrid(GBRegressor.maxIter, [5, 10, 20, 50, 100, 200]) \
                .addGrid(GBRegressor.stepSize,
                         [0.01, 0.05, 0.1, 0.2, 0.5, 1.0]) \
                .addGrid(GBRegressor.subsamplingRate,
                         [0.1, 0.2, 0.5, 0.8, 0.9, 1.0]) \
                .build()
            evaluator = RegressionEvaluator(labelCol=labelCol,
                                            predictionCol="Prediction",
                                            metricName="rmse")
            GBRCV = CrossValidator(estimator=GBRegressor, evaluator=evaluator,
                                   estimatorParamMaps=paramGrid, numFolds=10)
            stagesList.append(GBRCV)
            GBR_Pipeline = Pipeline(stages=stagesList)
            GBR_PipelineModel = GBR_Pipeline.fit(train_df)
            GBR_Predicted = GBR_PipelineModel.transform(test_df)
            GBR_BestModel = GBR_PipelineModel.stages[-1].bestModel
            GBR_Prediction = GBR_Predicted.select("Prediction").toPandas()
            GBR_Score = evaluator.evaluate(GBR_Predicted)
            return GBR_BestModel, GBR_Predicted, GBR_Prediction, GBR_Score

    if clustering:
        if kmeans:
            stagesList = build_stages()
            KCluster = KMeans(featuresCol=featuresCol,
                              predictionCol='Prediction', k=n_cluster,
                              initMode='k-means||', initSteps=2, tol=0.0001,
                              maxIter=20, seed=None)
            # NOTE: tuning the seed over range(1001) multiplies the grid by
            # 1001 candidates; this search is enormous.
            paramGrid = ParamGridBuilder() \
                .addGrid(KCluster.initSteps, [1, 2, 5, 10, 20, 50, 100]) \
                .addGrid(KCluster.maxIter,
                         [10, 20, 50, 100, 200, 500, 1000, 2000]) \
                .addGrid(KCluster.seed, [i for i in range(1001)]) \
                .build()
            evaluator = ClusteringEvaluator(predictionCol='Prediction',
                                            featuresCol=featuresCol,
                                            metricName='silhouette')
            KMCV = CrossValidator(estimator=KCluster, evaluator=evaluator,
                                  estimatorParamMaps=paramGrid, numFolds=10)
            stagesList.append(KMCV)
            KMC_Pipeline = Pipeline(stages=stagesList)
            KMC_PipelineModel = KMC_Pipeline.fit(train_df)
            KMC_Predicted = KMC_PipelineModel.transform(train_df)
            KMC_BestModel = KMC_PipelineModel.stages[-1].bestModel
            KMC_Prediction = KMC_Predicted.select("Prediction").toPandas()
            KMC_Score = evaluator.evaluate(KMC_Predicted)
            return KMC_BestModel, KMC_Predicted, KMC_Prediction, KMC_Score

        if gaussianmixture:
            stagesList = build_stages()
            GMCluster = GaussianMixture(featuresCol=featuresCol,
                                        predictionCol='Prediction',
                                        probabilityCol='Probability',
                                        k=n_cluster, tol=0.01, maxIter=100,
                                        seed=None)
            paramGrid = ParamGridBuilder() \
                .addGrid(GMCluster.maxIter,
                         [10, 20, 50, 100, 200, 500, 1000, 2000]) \
                .addGrid(GMCluster.seed, [i for i in range(1001)]) \
                .build()
            evaluator = ClusteringEvaluator(predictionCol='Prediction',
                                            featuresCol=featuresCol,
                                            metricName='silhouette')
            GMCV = CrossValidator(estimator=GMCluster, evaluator=evaluator,
                                  estimatorParamMaps=paramGrid, numFolds=10)
            stagesList.append(GMCV)
            GMC_Pipeline = Pipeline(stages=stagesList)
            GMC_PipelineModel = GMC_Pipeline.fit(train_df)
            GMC_Predicted = GMC_PipelineModel.transform(train_df)
            GMC_BestModel = GMC_PipelineModel.stages[-1].bestModel
            GMC_Probability = GMC_Predicted.select("Probability").toPandas()
            GMC_Prediction = GMC_Predicted.select("Prediction").toPandas()
            GMC_Score = evaluator.evaluate(GMC_Predicted)
            return GMC_BestModel, GMC_Predicted, GMC_Probability, \
                GMC_Prediction, GMC_Score

        if lda:
            stagesList = build_stages()
            LDACluster = LDA(featuresCol=featuresCol, maxIter=20, seed=None,
                             k=n_cluster, learningOffset=1024.0,
                             learningDecay=0.51, subsamplingRate=0.05)
            paramGrid = ParamGridBuilder() \
                .addGrid(LDACluster.maxIter,
                         [10, 20, 50, 100, 200, 500, 1000, 2000]) \
                .addGrid(LDACluster.seed, [i for i in range(1001)]) \
                .addGrid(LDACluster.subsamplingRate,
                         [0.01, 0.05, 0.1, 0.2, 0.5, 1.0]) \
                .build()
            # Caveat: LDA's transform emits a topicDistribution column rather
            # than a 'Prediction' column, so this evaluator setup assumes an
            # upstream stage that maps topics to 'Prediction'.
            evaluator = ClusteringEvaluator(predictionCol='Prediction',
                                            featuresCol=featuresCol,
                                            metricName='silhouette')
            LDACV = CrossValidator(estimator=LDACluster, evaluator=evaluator,
                                   estimatorParamMaps=paramGrid, numFolds=10)
            stagesList.append(LDACV)
            LDA_Pipeline = Pipeline(stages=stagesList)
            LDA_PipelineModel = LDA_Pipeline.fit(train_df)
            LDA_Predicted = LDA_PipelineModel.transform(train_df)
            LDA_BestModel = LDA_PipelineModel.stages[-1].bestModel
            LDA_Topics = LDA_BestModel.describeTopics().toPandas()
            LDA_Score = evaluator.evaluate(LDA_Predicted)
            return LDA_BestModel, LDA_Topics, LDA_Score

    if recommendation:
        if als:
            ALSR = ALS(userCol=userCol, itemCol=itemCol, ratingCol=ratingCol,
                       rank=rank, maxIter=10, regParam=0.1, numUserBlocks=10,
                       numItemBlocks=10, alpha=1.0, seed=1)
            ALSR_Model = ALSR.fit(train_df)
            # recommendForAllUsers / recommendForAllItems take the number of
            # recommendations to produce, so userid/itemid act as top-N
            # counts here.
            ALSR_ForUsers = ALSR_Model.recommendForAllUsers(userid)
            ALSR_ForItems = ALSR_Model.recommendForAllItems(itemid)
            return ALSR_Model, ALSR_ForUsers, ALSR_ForItems

    if association:
        if fpgrowth:
            fpg = FPGrowth(minSupport=minSupport, minConfidence=minConfidence,
                           itemsCol=itemsCol, predictionCol='Prediction')
            fpg_model = fpg.fit(train_df)
            fpg_freqItemsets = fpg_model.freqItemsets.toPandas()
            fpg_associationRules = fpg_model.associationRules.toPandas()
            return fpg_model, fpg_freqItemsets, fpg_associationRules
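# Hypothetical usage sketch for SparkML (not from the original source): the
# DataFrame `df`, its feature columns "f1"/"f2", and the binary "label"
# column are all assumptions, chosen to exercise the logistic-regression
# branch end to end.
train_df, test_df = df.randomSplit([0.8, 0.2], seed=42)
LRC_BestModel, LRC_Predicted, LRC_Probability, LRC_Prediction, LRC_Score = SparkML(
    train_df, test_df,
    vectorAssembler=True,
    inputColsVectorAssembler=["f1", "f2"],
    outputColsVectorAssembler="features",
    classification=True, logisticregression=True,
    binaryclass=True)
print("areaUnderROC:", LRC_Score)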
assembler = VectorAssembler(
    inputCols=[x for x in train_data.columns if x not in ignore],
    outputCol='features')
train_data = (assembler.transform(train_data).select("target", "features"))

# with 500 iterations, GINI is around ~0.275 for submissions
iteration = 500
gbt = GBTClassifier(labelCol="target", featuresCol="features",
                    maxIter=iteration)
evaluator = BinaryClassificationEvaluator(labelCol="target")

# no parameter search
paramGrid = ParamGridBuilder().build()

# 6-fold cross validation
crossval = CrossValidator(estimator=gbt, estimatorParamMaps=paramGrid,
                          evaluator=evaluator, numFolds=6)
model = crossval.fit(train_data)
print("trained GBT classifier:%s" % model)

# display CV score; Gini = 2*AUC - 1 for a ROC-based metric
auc_roc = model.avgMetrics[0]
print("AUC ROC = %g" % auc_roc)
gini = (2 * auc_roc - 1)
spark = SparkSession\
    .builder\
    .appName("TrainValidationSplit")\
    .getOrCreate()

# $example on$
# Prepare training and test data.
data = spark.read.format("libsvm")\
    .load("data/mllib/sample_linear_regression_data.txt")
train, test = data.randomSplit([0.7, 0.3])

lr = LinearRegression(maxIter=10, regParam=0.1)

# We use a ParamGridBuilder to construct a grid of parameters to search over.
# TrainValidationSplit will try all combinations of values and determine best model using
# the evaluator.
paramGrid = ParamGridBuilder()\
    .addGrid(lr.regParam, [0.1, 0.01]) \
    .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])\
    .build()

# In this case the estimator is simply the linear regression.
# A TrainValidationSplit requires an Estimator, a set of Estimator ParamMaps, and an Evaluator.
tvs = TrainValidationSplit(
    estimator=lr,
    estimatorParamMaps=paramGrid,
    evaluator=RegressionEvaluator(),
    # 80% of the data will be used for training, 20% for validation.
    trainRatio=0.8)

# Run TrainValidationSplit, and choose the best set of parameters.
model = tvs.fit(train)

# Make predictions on test data. model is the model with combination of parameters
# that performed best.
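# The example stops just before the prediction step; a plausible continuation
# (a sketch using only the variables defined above) scores the held-out test
# split with the best model found by the search.
model.transform(test)\
    .select("features", "label", "prediction")\
    .show(5)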
# Select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(labelCol="label",
                                              predictionCol="prediction",
                                              metricName="accuracy")
accuracy = evaluator.evaluate(predictions_tr)
print("Decision Tree Classifier Accuracy before Cross Validation : ", accuracy)

# We now treat the Pipeline as an Estimator, wrapping it in a CrossValidator instance.
# This will allow us to jointly choose parameters for all Pipeline stages.
# A CrossValidator requires an Estimator, a set of Estimator ParamMaps, and an Evaluator.
# We use a ParamGridBuilder to construct a grid of parameters to search over.
print("Starting CrossValidation")
# Caveat: numTrees is a RandomForestClassifier parameter; if dt is a plain
# DecisionTreeClassifier, the numTrees grid below will fail.
paramGrid = (ParamGridBuilder()
             .addGrid(dt.maxDepth, [15, 20, 25])
             #.addGrid(dt.maxBins, [50, 80])
             .addGrid(dt.numTrees, [100, 110, 120])
             .build())

# Create k-fold CrossValidator
cv = CrossValidator(estimator=pipeline, estimatorParamMaps=paramGrid,
                    evaluator=evaluator, numFolds=8)

# Run cross-validation, and choose the best set of parameters.
cvModel = cv.fit(training)
# print(" | numTrees = ", cvModel.bestModel.numTrees)
# print(" | depth = ", cvModel.bestModel.maxDepth)

# Make predictions on test so we can measure the accuracy of our model on new data
#predictions_tr_cv = cvModel.transform(test)
# ## Model Selection a.k.a. hyperparameter tuning

# ##### For LR

# In[53]:

from time import time
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import BinaryClassificationEvaluator

start_time = time()

# Creating ParamGrid for Cross Validation
lr_paramGrid = (ParamGridBuilder()
                .addGrid(lr.regParam, [0.1, 0.01])
                .build())

# Creating CrossValidator
lr_crossval = CrossValidator(estimator=lr,
                             estimatorParamMaps=lr_paramGrid,
                             evaluator=BinaryClassificationEvaluator(),
                             numFolds=10)

# Run cross validations
lr_cvModel = lr_crossval.fit(trainSet)

end_time = time()
elapsed_time = end_time - start_time
print("Time to train Logistic Regression best model: %.3f seconds" % elapsed_time)

lr_cvModel.bestModel
# MAGIC
# MAGIC \\[ \begin{bmatrix} a \times a \\\ a \times b & b \times b \\\ a \times c & b \times c & c \times c \\\ a \times d & b \times d & c \times d & d \times d \end{bmatrix} \\]
# MAGIC
# MAGIC Plus the original features
# MAGIC
# MAGIC \\[ \begin{bmatrix} a & b & c & d \end{bmatrix} \\]

# COMMAND ----------

# MAGIC %md
# MAGIC Can we do better? Let's build a grid of params and search using `CrossValidator`.

# COMMAND ----------

paramGridRand = (ParamGridBuilder()
                 .addGrid(rf.maxDepth, [2, 4, 8, 12])
                 .baseOn({rf.numTrees: 20})   # baseOn takes a dict, not a set
                 .build())

cvRand = (CrossValidator()
          .setEstimator(rfPipeline)
          .setEvaluator(multiEval)
          .setEstimatorParamMaps(paramGridRand)
          .setNumFolds(2))

cvModelRand = cvRand.fit(irisTrain)
predictionsRand = cvModelRand.transform(irisTest)
print(multiEval.evaluate(predictionsRand))
print(cvModelRand.bestModel.stages[-1]._java_obj.parent().getMaxDepth())

# COMMAND ----------
model_node9.save("hdfs://namenode:9000/example5/model_1/")

estimator_node11 = RandomForestClassifier(featureSubsetStrategy="auto",
                                          numTrees=20, maxDepth=5,
                                          predictionCol="prediction",
                                          rawPredictionCol="rawPrediction",
                                          probabilityCol="probability",
                                          labelCol="indexedSurvived",
                                          featuresCol="features",
                                          impurity="gini")
evaluator_node12 = MulticlassClassificationEvaluator(
    labelCol="indexedSurvived", predictionCol="prediction",
    metricName="accuracy")
param_grid_node13 = ParamGridBuilder().addGrid(estimator_node11.maxDepth,
                                               [3, 5, 8, 20]).build()
cv_node13 = CrossValidator(estimator=estimator_node11,
                           estimatorParamMaps=param_grid_node13,
                           evaluator=evaluator_node12)
model_node13 = cv_node13.fit(df_node9)
df_node13 = model_node13.transform(df_node9)
df_node16 = model_node13.transform(df_node15)
model_node13.save("hdfs://namenode:9000/example5/model_2/")
evaluator_node17 = MulticlassClassificationEvaluator(
    labelCol="indexedSurvived", predictionCol="prediction",
    metricName="accuracy")
score_node17 = evaluator_node17.evaluate(df_node16)
df_node17 = spark.createDataFrame([(score_node17, )], ["score"])
def main(context):
    # dem(context)
    # gop(context)

    # SAVED PARQUETS
    # comments is the comments-minimal.json
    # submissions is the submissions.json
    # task7 is the result of the count vectorizer
    # commentsFull is the comments-minimal.json joined with submissions with the sarcasm removed and the > removed

    # TASK 1
    # Read from JSON
    #comments = sqlContext.read.json("comments-minimal.json.bz2")
    #comments.registerTempTable("commentsTable")
    #submissions = sqlContext.read.json("submissions.json.bz2")
    #submissions.registerTempTable("submissionsTable")

    # Write the Parquets
    #comments.write.parquet("comments.parquet")
    #submissions.write.parquet("submissions.parquet")

    # Read the parquets
    comments = sqlContext.read.parquet("comments.parquet")
    comments.registerTempTable("commentsTable")
    submissions = sqlContext.read.parquet("submissions.parquet")
    submissions.registerTempTable("submissionsTable")

    # Read the CSV
    labels = sqlContext.read.format('csv').options(
        header='true', inferSchema='true').load("labeled_data.csv")
    labels.registerTempTable("labelsTable")

    # TASK 2
    dfTask2 = sqlContext.sql("SELECT commentsTable.* FROM commentsTable INNER JOIN labelsTable ON commentsTable.id = labelsTable.Input_id")

    # TASK 4 and TASK 5
    def do_something(text):
        return parser.sanitize(text)

    udf_func = udf(do_something, ArrayType(StringType()))
    dfTask4 = dfTask2.withColumn("udf_results", udf_func(col("body")))

    # TASK 6A and TASK 6B
    if(not os.path.exists("cvModel")):
        cv = CountVectorizer(inputCol="udf_results", outputCol="features",
                             binary=True, minDF=5.0)
        model = cv.fit(dfTask4)
        model.write().overwrite().save("cvModel")
    model = CountVectorizerModel.load("cvModel")
    dfTask6A = model.transform(dfTask4)
    dfTask6A.registerTempTable("dfTask6ATable")
    dfTask6B = sqlContext.sql("SELECT dfTask6ATable.*, IF(labelsTable.labeldjt=1, 1, 0) AS pos_label, IF(labelsTable.labeldjt=-1, 1, 0) AS neg_label FROM dfTask6ATable INNER JOIN labelsTable ON dfTask6ATable.id = labelsTable.Input_id")
    dfTask6B.registerTempTable("dfTask6BTable")
    pos = sqlContext.sql('SELECT pos_label AS label, features FROM dfTask6BTable')
    neg = sqlContext.sql('SELECT neg_label AS label, features FROM dfTask6BTable')

    if(not os.path.exists("www/neg.model") or not os.path.exists("www/pos.model")):
        # Initialize two logistic regression models.
        # Replace labelCol with the column containing the label, and featuresCol with the column containing the features.
        poslr = LogisticRegression(labelCol="label", featuresCol="features",
                                   maxIter=10).setThreshold(0.2)
        neglr = LogisticRegression(labelCol="label", featuresCol="features",
                                   maxIter=10).setThreshold(0.25)
        # This is a binary classifier so we need an evaluator that knows how to deal with binary classifiers.
        posEvaluator = BinaryClassificationEvaluator()
        negEvaluator = BinaryClassificationEvaluator()
        # There are a few parameters associated with logistic regression. We do not know what they are a priori.
        # We do a grid search to find the best parameters. We can replace [1.0] with a list of values to try.
        # We will assume the parameter is 1.0. Grid search takes forever.
        posParamGrid = ParamGridBuilder().addGrid(poslr.regParam, [1.0]).build()
        negParamGrid = ParamGridBuilder().addGrid(neglr.regParam, [1.0]).build()
        # We initialize a cross-validation pipeline (2 folds here, to keep runtime down).
        posCrossval = CrossValidator(
            estimator=poslr,
            evaluator=posEvaluator,
            estimatorParamMaps=posParamGrid,
            numFolds=2)
        negCrossval = CrossValidator(
            estimator=neglr,
            evaluator=negEvaluator,
            estimatorParamMaps=negParamGrid,
            numFolds=2)
        # Although crossvalidation creates its own train/test sets for
        # tuning, we still need a labeled test set, because it is not
        # accessible from the crossvalidator (argh!)
        # Split the data 50/50
        posTrain, posTest = pos.randomSplit([0.5, 0.5])
        negTrain, negTest = neg.randomSplit([0.5, 0.5])
        # Train the models
        print("Training positive classifier...")
        posModel = posCrossval.fit(posTrain)
        print("Training negative classifier...")
        negModel = negCrossval.fit(negTrain)
        # Once we train the models, we don't want to do it again. We can save the models and load them again later.
        posModel.write().overwrite().save("www/pos.model")
        negModel.write().overwrite().save("www/neg.model")

    # TO LOAD BACK IN
    posModel = CrossValidatorModel.load("www/pos.model")
    negModel = CrossValidatorModel.load("www/neg.model")

    # Task 8
    dfTask8 = sqlContext.sql('SELECT commentsTable.id, commentsTable.body, commentsTable.created_utc, commentsTable.author_flair_text, submissionsTable.title, commentsTable.score AS comment_score, submissionsTable.score AS story_score FROM commentsTable INNER JOIN submissionsTable ON RIGHT(commentsTable.link_id, 6)=submissionsTable.id')
    dfTask8 = dfTask8.sample(False, 0.1, None)

    # TASK 4 and TASK 5 (repeated for the full data)
    def do_something(text):
        return parser.sanitize(text)

    udf_func = udf(do_something, ArrayType(StringType()))
    dfTask9_1 = dfTask8.withColumn("udf_results", udf_func(col("body")))

    # TASK 6A and TASK 6B
    model = CountVectorizerModel.load("cvModel")
    dfTask9_2 = model.transform(dfTask9_1)
    dfTask9_2.registerTempTable("dfTask9_2Table")

    # Task 9
    dfTask9_3 = sqlContext.sql("SELECT * FROM dfTask9_2Table WHERE dfTask9_2Table.body NOT LIKE '%/s%' AND dfTask9_2Table.body NOT LIKE '>%'")
    dfTask9_3.registerTempTable("dfTask9_3Table")
    posResult_1 = posModel.transform(dfTask9_3)
    posResult_1.registerTempTable("posResult_1Table")
    posResult_2 = sqlContext.sql("SELECT posResult_1Table.id, posResult_1Table.body, posResult_1Table.author_flair_text, posResult_1Table.created_utc, posResult_1Table.title, posResult_1Table.comment_score, posResult_1Table.story_score, posResult_1Table.features, posResult_1Table.prediction AS pos FROM posResult_1Table")
    finalResult_1 = negModel.transform(posResult_2)
    finalResult_1.registerTempTable("finalResult_1Table")
    finalResult_2 = sqlContext.sql("SELECT finalResult_1Table.id, finalResult_1Table.body, finalResult_1Table.created_utc, finalResult_1Table.author_flair_text, finalResult_1Table.title, finalResult_1Table.comment_score, finalResult_1Table.story_score, finalResult_1Table.pos, finalResult_1Table.prediction AS neg FROM finalResult_1Table")
    finalResult_2.registerTempTable("finalResult_2Table")
    if(not os.path.exists("final.parquet")):
        finalResult_2.write.parquet("final.parquet")
    final = sqlContext.read.parquet("final.parquet")
    final.registerTempTable("finalTable")

    # Task 10
    if(not os.path.exists("question1.csv")):
        question1 = sqlContext.sql("SELECT (100 * SUM(pos) / COUNT(*)) AS percent_pos, (100 * SUM(neg) / COUNT(*)) AS percent_neg FROM finalTable")
        question1.repartition(1).write.format("com.databricks.spark.csv").option("header", "true").save("question1.csv")
    if(not os.path.exists("question2.csv")):
        question2 = sqlContext.sql("SELECT DATE(from_unixtime(finalTable.created_utc)) AS date, 100*SUM(finalTable.pos)/COUNT(*) AS percent_pos, 100*SUM(finalTable.neg)/COUNT(*) AS percent_neg FROM finalTable GROUP BY date ORDER BY date")
        question2.repartition(1).write.format("com.databricks.spark.csv").option("header", "true").save("question2.csv")
    if(not os.path.exists("question3.csv")):
        question3 = sqlContext.sql("SELECT finalTable.author_flair_text AS place, 100*SUM(finalTable.pos)/COUNT(*) AS percent_pos, 100*SUM(finalTable.neg)/COUNT(*) AS percent_neg FROM finalTable GROUP BY place ORDER BY place")
        question3.repartition(1).write.format("com.databricks.spark.csv").option("header", "true").save("question3.csv")
    if(not os.path.exists("question4_comment.csv")):
        question4_comment = sqlContext.sql("SELECT finalTable.comment_score AS comment_score, 100*SUM(finalTable.pos)/COUNT(*) AS percent_pos, 100*SUM(finalTable.neg)/COUNT(*) AS percent_neg FROM finalTable GROUP BY comment_score ORDER BY comment_score")
        question4_comment.repartition(1).write.format("com.databricks.spark.csv").option("header", "true").save("question4_comment.csv")
    if(not os.path.exists("question4_story.csv")):
        question4_story = sqlContext.sql("SELECT finalTable.story_score AS story_score, 100*SUM(finalTable.pos)/COUNT(*) AS percent_pos, 100*SUM(finalTable.neg)/COUNT(*) AS percent_neg FROM finalTable GROUP BY story_score ORDER BY story_score")
        question4_story.repartition(1).write.format("com.databricks.spark.csv").option("header", "true").save("question4_story.csv")
def func1():
    hour_df = sqlContext.read.format("csv").option(
        "header", "true").load(Path + "hour.csv")
    print("count", hour_df.count())
    print("columns:", hour_df.columns)
    # Drop the fields we do not need
    hour_df = hour_df.drop("instant").drop("dteday").drop("yr").drop(
        "casual").drop("registered")
    print("schema:", hour_df.printSchema())
    # Cast the data to double
    hour_df = hour_df.select([
        col(column).cast("double").alias(column) for column in hour_df.columns
    ])
    print("after casting, hour_df.printSchema():", hour_df.printSchema())
    print("first 3 rows:", hour_df.show(3))
    # Split the data into train_df and test_df at a 0.7:0.3 ratio
    train_df, test_df = hour_df.randomSplit([0.7, 0.3])
    train_df.cache()
    test_df.cache()
    # Build the feature column list
    featureCols = hour_df.columns[:-1]
    print("featureCols:", featureCols)
    # Build the pipeline
    vectorAssembler = VectorAssembler(inputCols=featureCols,
                                      outputCol="aFeatures")
    vectorIndexer = VectorIndexer(inputCol="aFeatures", outputCol="features",
                                  maxCategories=24)
    dt = DecisionTreeRegressor(labelCol="cnt", featuresCol="features")
    dt_pipeline = Pipeline(stages=[vectorAssembler, vectorIndexer, dt])
    print("pipeline stages:", dt_pipeline.getStages())
    # Train
    dt_pipelineModel = dt_pipeline.fit(dataset=train_df)
    print("trained model:", dt_pipelineModel.stages[2].toDebugString[:500])
    # Predict with transform
    predicted = dt_pipelineModel.transform(test_df)
    print("new columns:", predicted.columns)
    print("prediction results:", predicted.show(2))

    ### Evaluate the model
    evaluator = RegressionEvaluator(labelCol="cnt",
                                    predictionCol="prediction",
                                    metricName="rmse")
    predicted_df = dt_pipelineModel.transform(test_df)
    rmse = evaluator.evaluate(predicted_df)
    print("rmse:", rmse)

    ## Use TrainValidationSplit to find the best model
    # Note: DecisionTreeRegressor only supports impurity="variance"
    # ("gini"/"entropy" are classifier impurities), so impurity is not tuned.
    paramGrid = ParamGridBuilder().addGrid(dt.maxDepth, [5, 10, 15]).addGrid(
        dt.maxBins, [10, 15, 20]).build()
    tvs = TrainValidationSplit(estimator=dt, evaluator=evaluator,
                               estimatorParamMaps=paramGrid,
                               trainRatio=0.8)
    # trainRatio splits the data 8:2 into training and validation sets
    tvs_pipeline = Pipeline(stages=[vectorAssembler, vectorIndexer, tvs])
    tvs_pipelineModel = tvs_pipeline.fit(dataset=train_df)
    bestmodel = tvs_pipelineModel.stages[2].bestModel
    print("bestModel:", bestmodel.toDebugString[:500])
    ## Predict with the best model (the fitted pipeline model, not the
    ## unfitted pipeline, is what can transform)
    predictions = tvs_pipelineModel.transform(test_df)
    rmse2 = evaluator.evaluate(predictions)
    print(rmse2)
TP = prediction.filter('prediction = 1 AND label = prediction').count()
FN = prediction.filter('prediction = 0 AND label != prediction').count()
FP = prediction.filter('prediction = 1 AND label != prediction').count()

# Accuracy measures the proportion of correct predictions
accuracy = (TN + TP) / (TN + TP + FN + FP)
print(accuracy)
'''
It is good to check the confusion matrix as well as accuracy. The confusion
matrix is very helpful for seeing how the model is doing and how often it
produces false positives and false negatives. Accuracy can be misleading on
an imbalanced dataset; the confusion matrix is very helpful in such
situations.
'''

# do cross validation for this model
params = (ParamGridBuilder()
          .addGrid(gbt.maxDepth, [2, 4])
          .addGrid(gbt.maxBins, [10, 20])
          .addGrid(gbt.maxIter, [5, 10])
          .build())
evaluator = BinaryClassificationEvaluator(labelCol='label')
cv = CrossValidator(estimator=pipeline, estimatorParamMaps=params,
                    evaluator=evaluator, numFolds=5)
cvmodel = cv.fit(x_train)
best_model = cvmodel.bestModel
print(evaluator.evaluate(best_model.transform(x_test)))
'''
We can also pull other details, such as the best params, out of our best
model from cross validation. We can use best_model.stages[3].extractParamMap()
to get the best params. Here [3] refers to the stage of our pipeline model at
index 3; at index 3, our pipeline model has our classification
'''
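# A minimal sketch of the inspection described in the note above, assuming --
# as that note states -- that the classifier sits at stage index 3 of the
# best pipeline model.
gbt_stage = best_model.stages[3]
for param, value in gbt_stage.extractParamMap().items():
    print(param.name, "=", value)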
# Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
tokenizer = Tokenizer(inputCol="text", outputCol="words")
hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
lr = LogisticRegression(maxIter=10)
pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])

# We now treat the Pipeline as an Estimator, wrapping it in a CrossValidator instance.
# This will allow us to jointly choose parameters for all Pipeline stages.
# A CrossValidator requires an Estimator, a set of Estimator ParamMaps, and an Evaluator.
# We use a ParamGridBuilder to construct a grid of parameters to search over.
# With 3 values for hashingTF.numFeatures and 2 values for lr.regParam,
# this grid will have 3 x 2 = 6 parameter settings for CrossValidator to choose from.
paramGrid = ParamGridBuilder() \
    .addGrid(hashingTF.numFeatures, [10, 100, 1000]) \
    .addGrid(lr.regParam, [0.1, 0.01]) \
    .build()

crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=BinaryClassificationEvaluator(),
                          numFolds=2)  # use 3+ folds in practice

# Run cross-validation, and choose the best set of parameters.
cvModel = crossval.fit(training)

# Prepare test documents, which are unlabeled.
test = spark.createDataFrame([(4, "spark i j k"),
                              (5, "l m n"),
                              (6, "mapreduce spark"),
                              (7, "apache hadoop")],
                             ["id", "text"])
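# The natural next step (a sketch, assuming only the objects above) is to run
# the fitted cross-validated pipeline over the unlabeled test documents.
prediction = cvModel.transform(test)
for row in prediction.select("id", "text", "probability", "prediction").collect():
    print(row)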
df_train = vecAssembler.transform(train_data)
pd.DataFrame(df_train.take(5), columns=df_train.columns).transpose()

dt = DecisionTreeClassifier(labelCol="deposit", featuresCol="features")
pipeline = Pipeline(stages=[vecAssembler, dt])
model = pipeline.fit(train_data)
predictions = model.transform(test_data)
predictions.select("prediction", "deposit", "features").toPandas().head(25)

# Note: scoring the hard 0/1 "prediction" column gives a single-point ROC;
# rawPrediction (or probability) is the usual column for AUC.
evaluator = BinaryClassificationEvaluator(labelCol="deposit",
                                          rawPredictionCol="prediction")
evaluator.evaluate(predictions)

paramGrid = ParamGridBuilder().addGrid(
    dt.maxDepth, [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]).build()

# Set up 3-fold cross validation
crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=evaluator,
                          numFolds=3)
CV_model = crossval.fit(train_data)

tree_model = CV_model.bestModel.stages[1]
print(tree_model)

predictions_improved = CV_model.bestModel.transform(test_data)
predictions_improved.select("prediction", "deposit", "features").toPandas().head(25)
# COMMAND ----------

print(dtc.explainParams())

# COMMAND ----------

# MAGIC %md
# MAGIC #### ![Spark Logo Tiny](https://s3-us-west-2.amazonaws.com/curriculum-release/images/105/logo_spark_tiny.png) 3-fold Cross Validation
# MAGIC
# MAGIC ![crossValidation](http://curriculum-release.s3-website-us-west-2.amazonaws.com/images/301/CrossValidation.png)

# COMMAND ----------

paramGrid = ParamGridBuilder().\
    addGrid(dtc.maxBins, [64, 128]).\
    addGrid(dtc.maxDepth, [20, 30]).\
    build()

crossval = CrossValidator(estimator=dtp,
                          estimatorParamMaps=paramGrid,
                          evaluator=evaluator,
                          numFolds=3,
                          parallelism=2)
cvModel = crossval.fit(train)

# COMMAND ----------

result = cvModel.transform(test)
display(result)
# print(train_mod02.limit(2).toPandas())

from pyspark.ml.classification import RandomForestClassifier, RandomForestClassificationModel

rfClassifer = RandomForestClassifier(labelCol="dx_factorize", numTrees=100)

from pyspark.ml import Pipeline

pipeline = Pipeline(stages=[rfClassifer])

from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

paramGrid = ParamGridBuilder() \
    .addGrid(rfClassifer.maxDepth, [12]) \
    .addGrid(rfClassifer.minInstancesPerNode, [20]) \
    .build()

evaluator = MulticlassClassificationEvaluator(labelCol="dx_factorize",
                                              predictionCol="prediction",
                                              metricName="accuracy")
# evaluator_f1 = MulticlassClassificationEvaluator(labelCol="dx_factorize", predictionCol="prediction",
#                                                  metricName="f1")

crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=evaluator,
                          numFolds=5)

cvModel = crossval.fit(train)
predictionCol="prediction", metricName="rmse") rmse = eval_.evaluate(yhat) print('rmse is %.2f' % rmse) mae = eval_.evaluate(yhat, {eval_.metricName: "mae"}) print('mae is %.2f' % mae) r2 = eval_.evaluate(yhat, {eval_.metricName: "r2"}) print('r2 is %.2f' % r2) from pyspark.ml.tuning import CrossValidator, ParamGridBuilder paramGrid = (ParamGridBuilder().addGrid(gbt.maxDepth, [5, 8, 10, 12]).addGrid( gbt.maxBins, [64]).build()) cv = CrossValidator(estimator=gbt, estimatorParamMaps=paramGrid, evaluator=eval_, numFolds=3) cvModel = cv.fit(X_train) yhat = (cvModel.transform(X_test).withColumn( "prediction", F.expm1(F.col("prediction"))).withColumn( target, F.expm1(F.col(target))).withColumn( 'fiability', 1 - F.abs(F.col(target) - F.col("prediction")) / F.col(target)).withColumn( 'fiability', F.when(F.col("fiability") < 0, 0).otherwise(F.col("fiability"))))
# Use the `explainParams` method to get the full list of hyperparameters:
print(lr.explainParams())

# Setting `elasticNetParam=1.0` corresponds to $l1$ (lasso) linear regression.
# We are interested in finding a reasonable value of `regParam`.

# ## Specify the hyperparameter grid

# Use the
# [ParamGridBuilder](http://spark.apache.org/docs/latest/api/python/pyspark.ml.html#pyspark.ml.tuning.ParamGridBuilder)
# class to specify a hyperparameter grid:
from pyspark.ml.tuning import ParamGridBuilder

regParamList = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5]
grid = ParamGridBuilder().addGrid(lr.regParam, regParamList).build()

# The resulting object is simply a list of parameter maps:
grid

# Rather than specify `elasticNetParam` in the `LinearRegression` constructor, we can specify it in our grid:
grid = ParamGridBuilder().baseOn({lr.elasticNetParam: 1.0}).addGrid(lr.regParam, regParamList).build()
grid

# ## Specify the evaluator

# In this case we will use
# [RegressionEvaluator](http://spark.apache.org/docs/latest/api/python/pyspark.ml.html#pyspark.ml.evaluation.RegressionEvaluator)
# as our evaluator and specify root-mean-squared error as the metric:
from pyspark.ml.evaluation import RegressionEvaluator
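# The excerpt ends at the import; a plausible continuation (a sketch -- the
# train_df DataFrame is hypothetical) wires the grid and evaluator into a
# CrossValidator and inspects the per-grid-point metrics:
from pyspark.ml.tuning import CrossValidator

evaluator = RegressionEvaluator(metricName="rmse")
cv = CrossValidator(estimator=lr, estimatorParamMaps=grid,
                    evaluator=evaluator, numFolds=3)
cv_model = cv.fit(train_df)
print(cv_model.avgMetrics)  # mean RMSE for each of the six regParam values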
# create TF-IDF features
idf = IDF().setInputCol("rawFeatures").setOutputCol("features").setMinDocFreq(0)

# create a Logistic regression model
lr = LogisticRegression()

# streamline all above steps into a pipeline
pipeline = Pipeline(stages=[tokenizer, remover, hashingTF, idf, lr])

# train model and predict results
# perform grid search looking for the best parameters and the best models
paramGrid = ParamGridBuilder()\
    .addGrid(hashingTF.numFeatures, [1000, 5000, 10000])\
    .addGrid(lr.regParam, [0.1, 0.01]) \
    .addGrid(lr.elasticNetParam, [0.0, 0.3, 0.6])\
    .build()

# set area under the precision-recall curve as the evaluation metric;
# 80% of the data will be used for training, 20% for validation
tvs = TrainValidationSplit(
    estimator=pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=BinaryClassificationEvaluator().setMetricName('areaUnderPR'),
    trainRatio=0.8)

# run TrainValidationSplit and choose the best set of parameters
model = tvs.fit(train_set)

# make predictions
train_prediction = model.transform(train_set)
test_prediction = model.transform(test_set)
#Create Decision Tree Estimator, set Label and Feature Columns
dTree = DecisionTreeClassifier(featuresCol='features', labelCol='label',
                               predictionCol='prediction', maxDepth=5,
                               maxBins=7000)

#Setup pipeline with feature transformers and model estimator
steps = stringIndexers + encoders + [labeler, assembler, dTree]
steps
pipeline = Pipeline(stages=steps)

#**************Train the Model*******************
#Set up a CrossValidator with the parameters, a tree estimator and evaluator
paramGrid = ParamGridBuilder().addGrid(dTree.maxDepth, [4, 5, 6]).build()
evaluator = MulticlassClassificationEvaluator(predictionCol='prediction',
                                              labelCol="label",
                                              metricName="f1")

# Set up 3-fold cross validation with paramGrid
crossVal = CrossValidator(estimator=pipeline, evaluator=evaluator,
                          estimatorParamMaps=paramGrid, numFolds=3)

#Use CrossValidator Estimator to fit the training data set
cvModel = crossVal.fit(strat_train_df)
cvModel.bestModel.stages

#Get the best Decision Tree Model
#importing necessary libraries for ALS
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS

#setting parameter grid for parameter tuning by cross-validation
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                predictionCol="prediction")

#Model
recsys = ALS(userCol="user_index", itemCol="item_index", ratingCol="rating",
             nonnegative=True, coldStartStrategy="drop")

#Parameter grid
paramGrid = ParamGridBuilder() \
    .addGrid(recsys.regParam, [0.1, 0.01, 0.001]) \
    .addGrid(recsys.rank, [5, 10, 15]) \
    .build()

#Cross validation
cvs = CrossValidator(estimator=recsys, estimatorParamMaps=paramGrid,
                     evaluator=evaluator, numFolds=5)

#Reading the Products ratings file
pathToFile = 'Products_preprocessed.csv'
ProductsDF = spark.read.csv(pathToFile, inferSchema=True, header=True)
ProductsDF.printSchema()

#Reading the metadata file
pathToFile = 'Products_meta_preprocessed.csv'
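#the CrossValidator is configured but never fitted in this excerpt; a minimal
#sketch of the missing step, assuming a hypothetical ratingsDF with the
#user_index / item_index / rating columns used above
cvs_model = cvs.fit(ratingsDF)
best_als = cvs_model.bestModel
print("best rank:", best_als.rank)
#top-5 item recommendations per user from the selected model
best_als.recommendForAllUsers(5).show(3, truncate=False)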
wine_train = assembler.transform(wine_train).select('features', 'quality')
wine_valid = assembler.transform(wine_valid).select('features', 'quality')

#initialize linear regression model
lr = LinearRegression(featuresCol='features', labelCol='quality',
                      maxIter=10, regParam=0.1, elasticNetParam=0.5)

#fitting model
model = lr.fit(wine_train)
wine_prediction = model.transform(wine_valid)

#calculate results (corr needs two numeric columns, so correlate the
#prediction with the label rather than the feature vector)
r = wine_prediction.stat.corr('prediction', 'quality')
print("R-Squared: " + str(r ** 2))

#the grid must reference Params of the estimator instance being tuned
lr_cv = LinearRegression(labelCol="quality")
crossval = CrossValidator(estimator=lr_cv,
                          estimatorParamMaps=ParamGridBuilder().addGrid(
                              lr_cv.elasticNetParam, [0, 0.5, 1.0]).build(),
                          evaluator=RegressionEvaluator(labelCol="quality",
                                                        metricName="r2"),
                          numFolds=10)

#cross validate the model and choose the best fit
cvModel = crossval.fit(wine_train)
model = cvModel.bestModel

#calculate with improved model
wine_prediction = model.transform(wine_valid)
r = wine_prediction.stat.corr('prediction', 'quality')
print("R-squared: " + str(r ** 2))

#exports model to be used
Pkl_Filename = 'TrainedModel.pkl'
# trainer
NUMBER_OF_MODELS = 10

dataPipeline = Pipeline(stages=currPipeline)
convertedData = dataPipeline.fit(df).transform(df)
trainData, testData = convertedData.randomSplit([0.9, 0.1], seed=0)

models = []
for i in range(NUMBER_OF_MODELS):
    # Bootstrap sample (with replacement) of the training data. Note that the
    # fixed seed 42 makes every sample identical; varying it (e.g. seed=i)
    # would give a true bagging ensemble.
    tmpSample = trainData.sample(True, 1.0, 42)
    reg = LogisticRegression(labelCol="label", featuresCol="features",
                             standardization=False)
    evaluator = BinaryClassificationEvaluator(labelCol="label",
                                              rawPredictionCol="prediction")
    cvPipeLine = Pipeline(stages=[reg])
    paramGrid = ParamGridBuilder().addGrid(reg.regParam,
                                           [0.01, 0.1, 1, 2, 5]).build()
    crossval = CrossValidator(estimator=cvPipeLine,
                              estimatorParamMaps=paramGrid,
                              evaluator=evaluator, numFolds=2)
    tmpModel = crossval.fit(tmpSample)
    models.append(tmpModel)

# save models
if SAVE_MODELS:
    modelIndex = 0
    for elem in models:
        elem.save("./savedModels/model_" + str(modelIndex))
        modelIndex = modelIndex + 1
# used for identifying categorical features and indexing them
vectorIndexer = VectorIndexer(inputCol="rawFeatures", outputCol="features",
                              maxCategories=4)

from pyspark.ml.regression import GBTRegressor

# Takes the "features" column and learns to predict "cnt"
gbt = GBTRegressor(labelCol="cnt")

from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import RegressionEvaluator

# define parameters such as maxDepth and maxIter to test
paramGrid = ParamGridBuilder()\
    .addGrid(gbt.maxDepth, [2, 5])\
    .addGrid(gbt.maxIter, [10, 100])\
    .build()

# this tells how well the model is doing, with the help of the labels
evaluator = RegressionEvaluator(metricName="rmse",
                                labelCol=gbt.getLabelCol(),
                                predictionCol=gbt.getPredictionCol())

# used for tuning the model
cv = CrossValidator(estimator=gbt, evaluator=evaluator,
                    estimatorParamMaps=paramGrid)

from pyspark.ml import Pipeline

# pipeline used for running feature processing, model tuning, and training
pipeline = Pipeline(stages=[vectorAssembler, vectorIndexer, cv])
pipelineModel = pipeline.fit(train)

# used for accuracy calculation
predictions = pipelineModel.transform(test)
# now evaluate the accuracy
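# the trailing comment above points at the evaluation step; a one-line sketch
# using the evaluator already defined in this snippet
rmse = evaluator.evaluate(predictions)
print("RMSE on test data = %g" % rmse)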
def test_param_grid_type_coercion(self):
    lr = LogisticRegression(maxIter=10)
    paramGrid = ParamGridBuilder().addGrid(lr.regParam, [0.5, 1]).build()
    for param in paramGrid:
        for v in param.values():
            assert type(v) == float
featVect = VectorAssembler(inputCols=["nfeatures"], outputCol="features")
#rf = RandomForestClassifier(labelCol="label", featuresCol="features", impurity="gini", featureSubsetStrategy="auto", numTrees=10, maxDepth=32, maxBins=128, seed=1234)
lr = LogisticRegression(labelCol="label", featuresCol="features",
                        maxIter=10, regParam=0.3)
pipeline = Pipeline(stages=[assembler, minMax, featVect, lr])

# COMMAND ----------

# MAGIC %md ### Tune Parameters
# MAGIC You can tune parameters to find the best model for your data. To do this you can use the **CrossValidator** class to evaluate each combination of parameters defined in a **ParameterGrid** against multiple *folds* of the data split into training and validation datasets, in order to find the best performing parameters. Note that this can take a long time to run because every parameter combination is tried multiple times.

# COMMAND ----------

paramGrid = ParamGridBuilder().addGrid(lr.regParam, [0.3, 0.01]).addGrid(
    lr.maxIter, [10, 5]).build()
# TODO: K = 2, you may test it with 5, 10
# K=2, 5, 10: Root Mean Square Error (RMSE): 13.2
cv = CrossValidator(estimator=pipeline, evaluator=RegressionEvaluator(),
                    estimatorParamMaps=paramGrid, numFolds=10)

model = cv.fit(train)

# COMMAND ----------

# MAGIC %md ### Test the Recommender
# MAGIC The model produced by the pipeline is a transformer that applies all stages of the pipeline to a specified DataFrame and uses the trained model to generate predictions. In this case, you will transform the test DataFrame using the pipeline to generate label predictions.

# COMMAND ----------