def NeuralNetwork(trainingData, testData):
    start_time = time.time()
    print(" ")
    print("--------------------- NEURAL NETWORK ---------------------")

    layers = [187, 8, 5]  # 187 input features, one hidden layer of 8 units, 5 output classes

    nn = MultilayerPerceptronClassifier(layers=layers)

    # Parameters to tune
    paramGrid = ParamGridBuilder() \
        .addGrid(nn.stepSize, [1, 0.01]) \
        .addGrid(nn.maxIter, [100, 1000]) \
        .build()

    # Tune over the parameter grid to pick the best model
    tvs = TrainValidationSplit(
        estimator=nn,
        estimatorParamMaps=paramGrid,
        evaluator=MulticlassClassificationEvaluator(),
        # Validation split: 80% training, 20% validation.
        trainRatio=0.8)

    model = tvs.fit(trainingData)

    prediction = model.transform(testData)
    predictionAndLabels = prediction.select("prediction", "label")
    evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
    accuracy = evaluator.evaluate(predictionAndLabels)
    evaluator = MulticlassClassificationEvaluator(metricName="f1")
    f1score = evaluator.evaluate(predictionAndLabels)

    # Confusion Matrix
    class_temp = prediction.select("label").groupBy("label") \
        .count().sort('count', ascending=False).toPandas()
    class_temp = class_temp["label"].values.tolist()

    y_true = prediction.select("label")
    y_true = y_true.toPandas()

    y_pred = prediction.select("prediction")
    y_pred = y_pred.toPandas()

    # sklearn's confusion_matrix (assumed to be imported elsewhere as
    # `from sklearn.metrics import confusion_matrix`), with classes ordered
    # by descending frequency via class_temp
    cnf_matrix = confusion_matrix(y_true, y_pred, labels=class_temp)

    print("Accuracy Hold out: ", accuracy)
    print("F1-Score Hold out: ", f1score)
    print("")
    print("")
    print("Doc Parameters : [", model.explainParams(), "]")
    print("")
    print("")
    print("Confusion Matrix: ")
    print(cnf_matrix)
    print("Neural Network HoldOut Execution TIME:", time.time() - start_time)

    # Call the NN variant that uses K-fold validation
    f1score_cv, cnf_matrix_cv, cv = NeuralNetworkCV(trainingData, testData)

    # Return the better result between hold-out and K-fold validation
    if f1score <= f1score_cv:
        return (f1score_cv, cnf_matrix_cv, cv)
    else:
        return (f1score, cnf_matrix, tvs)
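
# NeuralNetworkCV is called above but not defined in this snippet. A minimal
# sketch of what it presumably does, under the assumption that it mirrors the
# hold-out version but swaps TrainValidationSplit for a K-fold CrossValidator
# (the fold count and returned tuple shape are inferred from how the result
# is used above, not taken from the original source):
def NeuralNetworkCV_sketch(trainingData, testData):
    nn = MultilayerPerceptronClassifier(layers=[187, 8, 5])
    paramGrid = ParamGridBuilder() \
        .addGrid(nn.stepSize, [1, 0.01]) \
        .addGrid(nn.maxIter, [100, 1000]) \
        .build()
    cv = CrossValidator(estimator=nn,
                        estimatorParamMaps=paramGrid,
                        evaluator=MulticlassClassificationEvaluator(),
                        numFolds=3)
    model = cv.fit(trainingData)
    prediction = model.transform(testData)
    f1score = MulticlassClassificationEvaluator(metricName="f1") \
        .evaluate(prediction.select("prediction", "label"))
    y_true = prediction.select("label").toPandas()
    y_pred = prediction.select("prediction").toPandas()
    cnf_matrix = confusion_matrix(y_true, y_pred)
    return (f1score, cnf_matrix, cv)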
Example #2
all_stages = [
    # ... earlier feature-transform stages truncated in the source ...
    vectorassembler_stage
]
pipeline = Pipeline(stages=all_stages)

pipeline_model = pipeline.fit(taxi)

final_columns = feature_columns + ['features', 'label']
taxi_df = pipeline_model.transform(taxi).select(final_columns)
#taxi_df.show(5)
train, test = taxi_df.randomSplit([0.8, 0.2], seed=1234)

random_forest = RandomForestClassifier(featuresCol='features',
                                       labelCol='label')

param_grid = ParamGridBuilder().\
    addGrid(random_forest.maxDepth, [2, 3, 4]).\
    addGrid(random_forest.minInfoGain, [0.0, 0.1, 0.2, 0.3]).\
    build()
evaluator = BinaryClassificationEvaluator()
crossvalidation = CrossValidator(estimator=random_forest,
                                 estimatorParamMaps=param_grid,
                                 evaluator=evaluator)
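# NOTE: this fits on the full taxi_df, so the test split is seen during
# cross-validation and pred_test below is not a clean hold-out evaluation.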
crossvalidation_mod = crossvalidation.fit(taxi_df)

pred_test = crossvalidation_mod.transform(test)
pred_test.show(5)
label_pred_test = pred_test.select('label', 'prediction')
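# zipWithIndex pairs each (label, prediction) Row with an index; countByKey
# then tallies occurrences per distinct Row -- a quick confusion-matrix count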
label_pred_test.rdd.zipWithIndex().countByKey()

print('Area under ROC : ',
      evaluator.setMetricName('areaUnderROC').evaluate(pred_test))
print('Precision : ',
Example #3
def sparse_vector_to_array(dv):
  print(type(dv))
  new_array = list([float(x) for x in dv])
  return new_array

sparse_vector_to_array_udf = F.udf(sparse_vector_to_array, T.ArrayType(T.FloatType()))

# COMMAND ----------

# DBTITLE 1,Build a grid to test hyperparameters
from pyspark.ml.tuning import ParamGridBuilder

paramGrid = (ParamGridBuilder()
  .addGrid(rf.maxBins, [20, 30])
  .addGrid(rf.maxDepth, [3, 5, 10])
  .addGrid(rf.numTrees, [10, 80])
  .build()
)

# COMMAND ----------

# DBTITLE 1,Set validation method for comparison
from pyspark.ml.evaluation import RegressionEvaluator, MulticlassClassificationEvaluator
from pyspark.ml.tuning import CrossValidator

evaluator = MulticlassClassificationEvaluator().setLabelCol("label")

cv = CrossValidator(
  estimator = pipeline,             # Estimator (individual model or pipeline)
  estimatorParamMaps = paramGrid,   # Grid of parameters to try (grid search)
Example #4
pipeline_model = pipeline.fit(data)
data_set = pipeline_model.transform(data)

train_data, test_data = data_set.randomSplit([0.7, 0.3], seed=0)
print(train_data.count(), test_data.count())

clf = LogisticRegression()
clf_model = clf.fit(train_data)
predict = clf_model.transform(test_data)

evaluator = BinaryClassificationEvaluator()
print(evaluator.evaluate(predict))

rf = RandomForestClassifier()
grid = ParamGridBuilder().addGrid(rf.numTrees, [1, 3, 5]) \
                         .addGrid(rf.maxDepth, [3, 5, 7]) \
                         .addGrid(rf.maxBins, [20, 30, 40]).build()

cv = CrossValidator(estimator=rf,
                    evaluator=evaluator,
                    estimatorParamMaps=grid,
                    numFolds=5)

cv_model = cv.fit(train_data)
cv_model_predict = cv_model.transform(test_data)
print(evaluator.evaluate(cv_model_predict))

metrics = ComputeModelStatistics(
    evaluationMetric='classification',
    labelCol='label',
    scoresCol='probability',
Example #5
crime_beat_week_lagged = (crime_beat_week_lagged.withColumn(
    'Lag_Arrest',
    lag(col('Arrest'), count=1).over(
        Window().partitionBy('Beat').orderBy('Year_Week'))).na.drop())
crime_beat_week_lagged = (crime_beat_week_lagged.withColumn(
    'Lag_Domestic',
    lag(col('Domestic'), count=1).over(
        Window().partitionBy('Beat').orderBy('Year_Week'))).na.drop())
crime_beat_week_lagged.registerTempTable('crime_beat_week_lagged')

#export_data.to_csv("q2_3_beat_cnt.csv", index=False)
# run model
input_cols = ['Lag1', 'Lag2', 'Lag3', 'Lag_Arrest', 'Lag_Domestic', 'week']
assembler = VectorAssembler(inputCols=input_cols, outputCol='features')
rf = RandomForestRegressor(numTrees=30)
stages = [assembler, rf]
pipeline = Pipeline(stages=stages)
param_grid = ParamGridBuilder().addGrid(rf.maxDepth, [5, 6, 7]).build()
evaluator = RegressionEvaluator(labelCol='label',
                                predictionCol='prediction',
                                metricName='r2')

rf_grid = CrossValidator(estimator=pipeline,
                         estimatorParamMaps=param_grid,
                         evaluator=evaluator,
                         numFolds=3).fit(crime_beat_week_lagged)
lagged_fitted = rf_grid.transform(crime_beat_week_lagged)
# print R-squared
eval_metric = evaluator.evaluate(lagged_fitted)
print("evaluation answer")
print("-----------------------------------")
print(eval_metric)
Example #6
evaluator = BinaryClassificationEvaluator()
evaluator.evaluate(predictions)

# COMMAND ----------

# MAGIC %md Now we will try tuning the model with the `ParamGridBuilder` and the `CrossValidator`.
# MAGIC
# MAGIC As we indicate 4 values for maxDepth and 3 values for maxBins, this grid will have 4 x 3 = 12 parameter settings for `CrossValidator` to choose from. We will create a 3-fold CrossValidator.

# COMMAND ----------

from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# Create ParamGrid for Cross Validation
paramGrid = (ParamGridBuilder().addGrid(dt.maxDepth, [1, 2, 6, 10]).addGrid(
    dt.maxBins, [20, 40, 80]).build())

# COMMAND ----------

# Create 3-fold CrossValidator
cv = CrossValidator(estimator=pipeline,
                    estimatorParamMaps=paramGrid,
                    evaluator=evaluator,
                    numFolds=3)

# Run cross validations (this can take several minutes to execute)
cvModel = cv.fit(trainingData2)

# COMMAND ----------

# MAGIC %md Now let's create new predictions with which to measure the accuracy of our model.
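
# COMMAND ----------

# The notebook's next cell is truncated here; a minimal sketch of that step,
# assuming the held-out split is named testData2 (a hypothetical name chosen
# to match trainingData2 above). evaluator is the BinaryClassificationEvaluator
# defined earlier, so this reports area under ROC rather than raw accuracy:
predictions2 = cvModel.transform(testData2)
print(evaluator.evaluate(predictions2))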
Example #7
def SparkML(train_df,
            test_df=None,
            featuresCol='features',
            labelCol='label',
            binaryclass=False,
            multiclass=False,
            n_cluster=2,
            userCol='user',
            itemCol='item',
            ratingCol='rating',
            rank=10,
            userid=3,
            itemid=3,
            itemsCol='items',
            minSupport=0.3,
            minConfidence=0.8,
            stringIndexer=False,
            inputColStringIndexer=None,
            outputColStringIndexer=None,
            oneHotEncoder=False,
            inputColOneHotEncoder=None,
            outputColOneHotEncoder=None,
            vectorAssembler=False,
            inputColsVectorAssembler=None,
            outputColsVectorAssembler=None,
            vectorIndexer=False,
            inputColsVectorIndexer=None,
            outputColsVectorIndexer=None,
            maxCategories=None,
            classification=False,
            logisticregression=False,
            decisiontreeclassifier=False,
            linearsvc=False,
            naivebayes=False,
            randomforestclassifier=False,
            gbtclassifier=False,
            regression=False,
            linearregression=True,
            decisiontreeregressor=False,
            randomforestregressor=False,
            gbtregressor=False,
            clustering=False,
            kmeans=False,
            gaussianmixture=False,
            lda=False,
            recommendation=False,
            als=False,
            association=False,
            fpgrowth=False):
    if classification:
        if logisticregression:
            stagesList = FeaturesTransform(
                stringIndexer=stringIndexer,
                inputColStringIndexer=inputColStringIndexer,
                outputColStringIndexer=outputColStringIndexer,
                oneHotEncoder=oneHotEncoder,
                inputColOneHotEncoder=inputColOneHotEncoder,
                outputColOneHotEncoder=outputColOneHotEncoder,
                vectorAssembler=vectorAssembler,
                inputColsVectorAssembler=inputColsVectorAssembler,
                outputColsVectorAssembler=outputColsVectorAssembler,
                vectorIndexer=vectorIndexer,
                inputColsVectorIndexer=inputColsVectorIndexer,
                outputColsVectorIndexer=outputColsVectorIndexer,
                maxCategories=maxCategories)
            LRClassifier = LogisticRegression(featuresCol=featuresCol,
                                              labelCol=labelCol,
                                              predictionCol='Prediction',
                                              probabilityCol='Probability',
                                              rawPredictionCol='RawPrediction',
                                              standardization=True,
                                              maxIter=100,
                                              regParam=0.0,
                                              elasticNetParam=0.0,
                                              tol=1e-06,
                                              fitIntercept=True,
                                              threshold=0.5)
            paramGrid = ParamGridBuilder().addGrid(
                LRClassifier.maxIter,
                [10, 20, 50, 100, 200, 300, 500, 1000, 2000, 5000]).addGrid(
                    LRClassifier.regParam,
                    [0.0, 0.01, 0.05, 0.1, 0.5, 1.0, 2.0, 5.0, 10.0, 20.0
                     ]).build()
            if binaryclass:
                evaluator = BinaryClassificationEvaluator(
                    rawPredictionCol="RawPrediction",
                    labelCol=labelCol,
                    metricName="areaUnderROC")
            if multiclass:
                evaluator = MulticlassClassificationEvaluator(
                    labelCol=labelCol,
                    predictionCol="Prediction",
                    metricName="accuracy")
            LRCV = CrossValidator(estimator=LRClassifier,
                                  evaluator=evaluator,
                                  estimatorParamMaps=paramGrid,
                                  numFolds=10)
            stagesList.append(LRCV)
            LRC_Pipeline = Pipeline(stages=stagesList)
            LRC_PipelineModel = LRC_Pipeline.fit(train_df)
            LRC_Predicted = LRC_PipelineModel.transform(test_df)
            LRC_BestModel = LRC_PipelineModel.stages[-1].bestModel
            LRC_Probability = LRC_Predicted.select("Probability").toPandas()
            LRC_Prediction = LRC_Predicted.select("Prediction").toPandas()
            LRC_Score = evaluator.evaluate(LRC_Predicted)
            return LRC_BestModel, LRC_Predicted, LRC_Probability, LRC_Prediction, LRC_Score
        if decisiontreeclassifier:
            stagesList = FeaturesTransform(
                stringIndexer=stringIndexer,
                inputColStringIndexer=inputColStringIndexer,
                outputColStringIndexer=outputColStringIndexer,
                oneHotEncoder=oneHotEncoder,
                inputColOneHotEncoder=inputColOneHotEncoder,
                outputColOneHotEncoder=outputColOneHotEncoder,
                vectorAssembler=vectorAssembler,
                inputColsVectorAssembler=inputColsVectorAssembler,
                outputColsVectorAssembler=outputColsVectorAssembler,
                vectorIndexer=vectorIndexer,
                inputColsVectorIndexer=inputColsVectorIndexer,
                outputColsVectorIndexer=outputColsVectorIndexer,
                maxCategories=maxCategories)
            DTClassifier = DecisionTreeClassifier(
                featuresCol=featuresCol,
                labelCol=labelCol,
                predictionCol='Prediction',
                probabilityCol='Probability',
                rawPredictionCol='RawPrediction',
                maxDepth=5,
                maxBins=32,
                minInstancesPerNode=1,
                minInfoGain=0.0,
                impurity='gini',
                seed=None)
            paramGrid = ParamGridBuilder().addGrid(
                DTClassifier.impurity, ["gini", "entropy"]).addGrid(
                    DTClassifier.maxDepth, [3, 5, 10, 15, 20, 25]).addGrid(
                        DTClassifier.maxBins,
                        [3, 5, 10, 50, 100, 200]).build()
            if binaryclass:
                evaluator = BinaryClassificationEvaluator(
                    rawPredictionCol="RawPrediction",
                    labelCol=labelCol,
                    metricName="areaUnderROC")
            if multiclass:
                evaluator = MulticlassClassificationEvaluator(
                    labelCol=labelCol,
                    predictionCol="Prediction",
                    metricName="accuracy")
            DTCV = CrossValidator(estimator=DTClassifier,
                                  evaluator=evaluator,
                                  estimatorParamMaps=paramGrid,
                                  numFolds=10)
            stagesList.append(DTCV)
            DTC_Pipeline = Pipeline(stages=stagesList)
            DTC_PipelineModel = DTC_Pipeline.fit(train_df)
            DTC_Predicted = DTC_PipelineModel.transform(test_df)
            DTC_BestModel = DTC_PipelineModel.stages[-1].bestModel
            DTC_Probability = DTC_Predicted.select("Probability").toPandas()
            DTC_Prediction = DTC_Predicted.select("Prediction").toPandas()
            DTC_Score = evaluator.evaluate(DTC_Predicted)
            return DTC_BestModel, DTC_Predicted, DTC_Probability, DTC_Prediction, DTC_Score
        if linearsvc:
            stagesList = FeaturesTransform(
                stringIndexer=stringIndexer,
                inputColStringIndexer=inputColStringIndexer,
                outputColStringIndexer=outputColStringIndexer,
                oneHotEncoder=oneHotEncoder,
                inputColOneHotEncoder=inputColOneHotEncoder,
                outputColOneHotEncoder=outputColOneHotEncoder,
                vectorAssembler=vectorAssembler,
                inputColsVectorAssembler=inputColsVectorAssembler,
                outputColsVectorAssembler=outputColsVectorAssembler,
                vectorIndexer=vectorIndexer,
                inputColsVectorIndexer=inputColsVectorIndexer,
                outputColsVectorIndexer=outputColsVectorIndexer,
                maxCategories=maxCategories)
            SVClassifier = LinearSVC(featuresCol=featuresCol,
                                     labelCol=labelCol,
                                     predictionCol='Prediction',
                                     rawPredictionCol='RawPrediction',
                                     maxIter=100,
                                     regParam=0.0,
                                     tol=1e-06,
                                     fitIntercept=True,
                                     standardization=True,
                                     threshold=0.0)
            paramGrid = ParamGridBuilder().addGrid(
                SVClassifier.maxIter,
                [10, 20, 50, 100, 200, 300, 500, 1000, 2000, 5000]).addGrid(
                    SVClassifier.regParam,
                    [0.0, 0.01, 0.05, 0.1, 0.5, 1.0, 2.0, 5.0, 10.0, 20.0
                     ]).build()
            if binaryclass:
                evaluator = BinaryClassificationEvaluator(
                    rawPredictionCol="RawPrediction",
                    labelCol=labelCol,
                    metricName="areaUnderROC")
            if multiclass:
                evaluator = MulticlassClassificationEvaluator(
                    labelCol=labelCol,
                    predictionCol="Prediction",
                    metricName="accuracy")
            SVCV = CrossValidator(estimator=SVClassifier,
                                  evaluator=evaluator,
                                  estimatorParamMaps=paramGrid,
                                  numFolds=10)
            stagesList.append(SVCV)
            SVC_Pipeline = Pipeline(stages=stagesList)
            SVC_PipelineModel = SVC_Pipeline.fit(train_df)
            SVC_Predicted = SVC_PipelineModel.transform(test_df)
            SVC_BestModel = SVC_PipelineModel.stages[-1].bestModel
            SVC_Prediction = SVC_Predicted.select("Prediction").toPandas()
            SVC_Score = evaluator.evaluate(SVC_Predicted)
            return SVC_BestModel, SVC_Predicted, SVC_Prediction, SVC_Score
        if naivebayes:
            stagesList = FeaturesTransform(
                stringIndexer=stringIndexer,
                inputColStringIndexer=inputColStringIndexer,
                outputColStringIndexer=outputColStringIndexer,
                oneHotEncoder=oneHotEncoder,
                inputColOneHotEncoder=inputColOneHotEncoder,
                outputColOneHotEncoder=outputColOneHotEncoder,
                vectorAssembler=vectorAssembler,
                inputColsVectorAssembler=inputColsVectorAssembler,
                outputColsVectorAssembler=outputColsVectorAssembler,
                vectorIndexer=vectorIndexer,
                inputColsVectorIndexer=inputColsVectorIndexer,
                outputColsVectorIndexer=outputColsVectorIndexer,
                maxCategories=maxCategories)
            NBClassifier = NaiveBayes(featuresCol=featuresCol,
                                      labelCol=labelCol,
                                      predictionCol='Prediction',
                                      probabilityCol='Probability',
                                      rawPredictionCol='RawPrediction',
                                      smoothing=1.0,
                                      modelType='multinomial',
                                      thresholds=None)
            paramGrid = ParamGridBuilder().addGrid(
                NBClassifier.smoothing,
                [0.1, 0.5, 1.0, 2.0, 5.0, 10.0]).build()
            if binaryclass:
                evaluator = BinaryClassificationEvaluator(
                    rawPredictionCol="RawPrediction",
                    labelCol=labelCol,
                    metricName="areaUnderROC")
            if multiclass:
                evaluator = MulticlassClassificationEvaluator(
                    labelCol=labelCol,
                    predictionCol="Prediction",
                    metricName="accuracy")
            NBCV = CrossValidator(estimator=NBClassifier,
                                  evaluator=evaluator,
                                  estimatorParamMaps=paramGrid,
                                  numFolds=10)
            stagesList.append(NBCV)
            NBC_Pipeline = Pipeline(stages=stagesList)
            NBC_PipelineModel = NBC_Pipeline.fit(train_df)
            NBC_Predicted = NBC_PipelineModel.transform(test_df)
            NBC_BestModel = NBC_PipelineModel.stages[-1].bestModel
            NBC_Probability = NBC_Predicted.select("Probability").toPandas()
            NBC_Prediction = NBC_Predicted.select("Prediction").toPandas()
            NBC_Score = evaluator.evaluate(NBC_Predicted)
            return NBC_BestModel, NBC_Predicted, NBC_Probability, NBC_Prediction, NBC_Score
        if randomforestclassifier:
            stagesList = FeaturesTransform(
                stringIndexer=stringIndexer,
                inputColStringIndexer=inputColStringIndexer,
                outputColStringIndexer=outputColStringIndexer,
                oneHotEncoder=oneHotEncoder,
                inputColOneHotEncoder=inputColOneHotEncoder,
                outputColOneHotEncoder=outputColOneHotEncoder,
                vectorAssembler=vectorAssembler,
                inputColsVectorAssembler=inputColsVectorAssembler,
                outputColsVectorAssembler=outputColsVectorAssembler,
                vectorIndexer=vectorIndexer,
                inputColsVectorIndexer=inputColsVectorIndexer,
                outputColsVectorIndexer=outputColsVectorIndexer,
                maxCategories=maxCategories)
            RFClassifier = RandomForestClassifier(
                featuresCol=featuresCol,
                labelCol=labelCol,
                predictionCol='Prediction',
                probabilityCol='Probability',
                rawPredictionCol='RawPrediction',
                maxDepth=5,
                maxBins=32,
                minInstancesPerNode=1,
                minInfoGain=0.0,
                impurity='gini',
                numTrees=20,
                featureSubsetStrategy='auto',
                seed=None,
                subsamplingRate=1.0)
            paramGrid = ParamGridBuilder().addGrid(
                RFClassifier.impurity, ["gini", "entropy"]).addGrid(
                    RFClassifier.maxDepth, [3, 5, 10, 15, 20, 25]).addGrid(
                        RFClassifier.maxBins,
                        [3, 5, 10, 50, 100, 200]).addGrid(
                            RFClassifier.numTrees,
                            [5, 10, 20, 50, 100, 200]).addGrid(
                                RFClassifier.subsamplingRate,
                                [0.1, 0.2, 0.5, 0.8, 0.9, 1.0]).build()
            if binaryclass:
                evaluator = BinaryClassificationEvaluator(
                    rawPredictionCol="RawPrediction",
                    labelCol=labelCol,
                    metricName="areaUnderROC")
            if multiclass:
                evaluator = MulticlassClassificationEvaluator(
                    labelCol=labelCol,
                    predictionCol="Prediction",
                    metricName="accuracy")
            RFCV = CrossValidator(estimator=RFClassifier,
                                  evaluator=evaluator,
                                  estimatorParamMaps=paramGrid,
                                  numFolds=10)
            stagesList.append(RFCV)
            RFC_Pipeline = Pipeline(stages=stagesList)
            RFC_PipelineModel = RFC_Pipeline.fit(train_df)
            RFC_Predicted = RFC_PipelineModel.transform(test_df)
            RFC_BestModel = RFC_PipelineModel.stages[-1].bestModel
            RFC_Probability = RFC_Predicted.select("Probability").toPandas()
            RFC_Prediction = RFC_Predicted.select("Prediction").toPandas()
            RFC_Score = evaluator.evaluate(RFC_Predicted)
            return RFC_BestModel, RFC_Predicted, RFC_Probability, RFC_Prediction, RFC_Score
        if gbtclassifier:
            stagesList = FeaturesTransform(
                stringIndexer=stringIndexer,
                inputColStringIndexer=inputColStringIndexer,
                outputColStringIndexer=outputColStringIndexer,
                oneHotEncoder=oneHotEncoder,
                inputColOneHotEncoder=inputColOneHotEncoder,
                outputColOneHotEncoder=outputColOneHotEncoder,
                vectorAssembler=vectorAssembler,
                inputColsVectorAssembler=inputColsVectorAssembler,
                outputColsVectorAssembler=outputColsVectorAssembler,
                vectorIndexer=vectorIndexer,
                inputColsVectorIndexer=inputColsVectorIndexer,
                outputColsVectorIndexer=outputColsVectorIndexer,
                maxCategories=maxCategories)
            GBClassifier = GBTClassifier(featuresCol=featuresCol,
                                         labelCol=labelCol,
                                         predictionCol='Prediction',
                                         maxDepth=5,
                                         maxBins=32,
                                         minInstancesPerNode=1,
                                         minInfoGain=0.0,
                                         lossType='logistic',
                                         maxIter=20,
                                         stepSize=0.1,
                                         seed=None,
                                         subsamplingRate=1.0)
            paramGrid = ParamGridBuilder().addGrid(
                GBClassifier.maxDepth, [3, 5, 10, 15, 20, 25]).addGrid(
                    GBClassifier.maxBins, [3, 5, 10, 50, 100, 200]).addGrid(
                        GBClassifier.maxIter,
                        [5, 10, 20, 50, 100, 200]).addGrid(
                            GBClassifier.stepSize,
                            [0.01, 0.05, 0.1, 0.2, 0.5, 1.0]).addGrid(
                                GBClassifier.subsamplingRate,
                                [0.1, 0.2, 0.5, 0.8, 0.9, 1.0]).build()
            evaluator = MulticlassClassificationEvaluator(
                labelCol=labelCol,
                predictionCol="Prediction",
                metricName="accuracy")
            GBCV = CrossValidator(estimator=GBClassifier,
                                  evaluator=evaluator,
                                  estimatorParamMaps=paramGrid,
                                  numFolds=10)
            stagesList.append(GBCV)
            GBC_Pipeline = Pipeline(stages=stagesList)
            GBC_PipelineModel = GBC_Pipeline.fit(train_df)
            GBC_Predicted = GBC_PipelineModel.transform(test_df)
            GBC_BestModel = GBC_PipelineModel.stages[-1].bestModel
            GBC_Prediction = GBC_Predicted.select("Prediction").toPandas()
            GBC_Score = evaluator.evaluate(GBC_Predicted)
            return GBC_BestModel, GBC_Predicted, GBC_Prediction, GBC_Score
    if regression:
        if linearregression:
            stagesList = FeaturesTransform(
                stringIndexer=stringIndexer,
                inputColStringIndexer=inputColStringIndexer,
                outputColStringIndexer=outputColStringIndexer,
                oneHotEncoder=oneHotEncoder,
                inputColOneHotEncoder=inputColOneHotEncoder,
                outputColOneHotEncoder=outputColOneHotEncoder,
                vectorAssembler=vectorAssembler,
                inputColsVectorAssembler=inputColsVectorAssembler,
                outputColsVectorAssembler=outputColsVectorAssembler,
                vectorIndexer=vectorIndexer,
                inputColsVectorIndexer=inputColsVectorIndexer,
                outputColsVectorIndexer=outputColsVectorIndexer,
                maxCategories=maxCategories)
            LRegressor = LinearRegression(featuresCol=featuresCol,
                                          labelCol=labelCol,
                                          predictionCol='Prediction',
                                          standardization=True,
                                          fitIntercept=True,
                                          loss='squaredError',
                                          maxIter=100,
                                          regParam=0.0,
                                          elasticNetParam=0.0,
                                          tol=1e-06,
                                          epsilon=1.35)
            paramGrid = ParamGridBuilder().addGrid(
                LRegressor.maxIter,
                [10, 20, 50, 100, 200, 300, 500, 1000, 2000, 5000]).addGrid(
                    LRegressor.regParam,
                    [0.0, 0.01, 0.05, 0.1, 0.5, 1.0, 2.0, 5.0, 10.0, 20.0
                     ]).build()
            evaluator = RegressionEvaluator(labelCol=labelCol,
                                            predictionCol="Prediction",
                                            metricName="rmse")
            LRCV = CrossValidator(estimator=LRegressor,
                                  evaluator=evaluator,
                                  estimatorParamMaps=paramGrid,
                                  numFolds=10)
            stagesList.append(LRCV)
            LR_Pipeline = Pipeline(stages=stagesList)
            LR_PipelineModel = LR_Pipeline.fit(train_df)
            LR_Predicted = LR_PipelineModel.transform(test_df)
            LR_BestModel = LR_PipelineModel.stages[-1].bestModel
            LR_Prediction = LR_Predicted.select("Prediction").toPandas()
            LR_Score = evaluator.evaluate(LR_Predicted)
            return LR_BestModel, LR_Predicted, LR_Prediction, LR_Score
        if decisiontreeregressor:
            stagesList = FeaturesTransform(
                stringIndexer=stringIndexer,
                inputColStringIndexer=inputColStringIndexer,
                outputColStringIndexer=outputColStringIndexer,
                oneHotEncoder=oneHotEncoder,
                inputColOneHotEncoder=inputColOneHotEncoder,
                outputColOneHotEncoder=outputColOneHotEncoder,
                vectorAssembler=vectorAssembler,
                inputColsVectorAssembler=inputColsVectorAssembler,
                outputColsVectorAssembler=outputColsVectorAssembler,
                vectorIndexer=vectorIndexer,
                inputColsVectorIndexer=inputColsVectorIndexer,
                outputColsVectorIndexer=outputColsVectorIndexer,
                maxCategories=maxCategories)
            DTRegressor = DecisionTreeRegressor(featuresCol=featuresCol,
                                                labelCol=labelCol,
                                                predictionCol='Prediction',
                                                maxDepth=5,
                                                maxBins=32,
                                                minInstancesPerNode=1,
                                                minInfoGain=0.0,
                                                impurity='variance',
                                                seed=None,
                                                varianceCol=None)
            paramGrid = ParamGridBuilder().addGrid(
                DTRegressor.maxDepth, [3, 5, 10, 15, 20, 25]).addGrid(
                    DTRegressor.maxBins, [3, 5, 10, 50, 100, 200]).build()
            evaluator = RegressionEvaluator(labelCol=labelCol,
                                            predictionCol="Prediction",
                                            metricName="rmse")
            DTRCV = CrossValidator(estimator=DTRegressor,
                                   evaluator=evaluator,
                                   estimatorParamMaps=paramGrid,
                                   numFolds=10)
            stagesList.append(DTRCV)
            DTR_Pipeline = Pipeline(stages=stagesList)
            DTR_PipelineModel = DTR_Pipeline.fit(train_df)
            DTR_Predicted = DTR_PipelineModel.transform(test_df)
            DTR_BestModel = DTR_PipelineModel.stages[-1].bestModel
            DTR_Prediction = DTR_Predicted.select("Prediction").toPandas()
            DTR_Score = evaluator.evaluate(DTR_Predicted)
            return DTR_BestModel, DTR_Predicted, DTR_Prediction, DTR_Score
        if randomforestregressor:
            stagesList = FeaturesTransform(
                stringIndexer=stringIndexer,
                inputColStringIndexer=inputColStringIndexer,
                outputColStringIndexer=outputColStringIndexer,
                oneHotEncoder=oneHotEncoder,
                inputColOneHotEncoder=inputColOneHotEncoder,
                outputColOneHotEncoder=outputColOneHotEncoder,
                vectorAssembler=vectorAssembler,
                inputColsVectorAssembler=inputColsVectorAssembler,
                outputColsVectorAssembler=outputColsVectorAssembler,
                vectorIndexer=vectorIndexer,
                inputColsVectorIndexer=inputColsVectorIndexer,
                outputColsVectorIndexer=outputColsVectorIndexer,
                maxCategories=maxCategories)
            RFRegressor = RandomForestRegressor(featuresCol=featuresCol,
                                                labelCol=labelCol,
                                                predictionCol='Prediction',
                                                maxDepth=5,
                                                maxBins=32,
                                                minInstancesPerNode=1,
                                                minInfoGain=0.0,
                                                impurity='variance',
                                                subsamplingRate=1.0,
                                                seed=None,
                                                numTrees=20)
            paramGrid = ParamGridBuilder().addGrid(
                RFRegressor.maxDepth, [3, 5, 10, 15, 20, 25]).addGrid(
                    RFRegressor.maxBins, [3, 5, 10, 50, 100, 200]).addGrid(
                        RFRegressor.numTrees,
                        [5, 10, 20, 50, 100, 200]).addGrid(
                            RFRegressor.subsamplingRate,
                            [0.1, 0.2, 0.5, 0.8, 0.9, 1.0]).build()
            evaluator = RegressionEvaluator(labelCol=labelCol,
                                            predictionCol="Prediction",
                                            metricName="rmse")
            RFRCV = CrossValidator(estimator=RFRegressor,
                                   evaluator=evaluator,
                                   estimatorParamMaps=paramGrid,
                                   numFolds=10)
            stagesList.append(RFRCV)
            RFR_Pipeline = Pipeline(stages=stagesList)
            RFR_PipelineModel = RFR_Pipeline.fit(train_df)
            RFR_Predicted = RFR_PipelineModel.transform(test_df)
            RFR_BestModel = RFR_PipelineModel.stages[-1].bestModel
            RFR_Prediction = RFR_Predicted.select("Prediction").toPandas()
            RFR_Score = evaluator.evaluate(RFR_Predicted)
            return RFR_BestModel, RFR_Predicted, RFR_Prediction, RFR_Score
        if gbtregressor:
            stagesList = FeaturesTransform(
                stringIndexer=stringIndexer,
                inputColStringIndexer=inputColStringIndexer,
                outputColStringIndexer=outputColStringIndexer,
                oneHotEncoder=oneHotEncoder,
                inputColOneHotEncoder=inputColOneHotEncoder,
                outputColOneHotEncoder=outputColOneHotEncoder,
                vectorAssembler=vectorAssembler,
                inputColsVectorAssembler=inputColsVectorAssembler,
                outputColsVectorAssembler=outputColsVectorAssembler,
                vectorIndexer=vectorIndexer,
                inputColsVectorIndexer=inputColsVectorIndexer,
                outputColsVectorIndexer=outputColsVectorIndexer,
                maxCategories=maxCategories)
            GBRegressor = GBTRegressor(featuresCol=featuresCol,
                                       labelCol=labelCol,
                                       predictionCol='Prediction',
                                       maxDepth=5,
                                       maxBins=32,
                                       minInstancesPerNode=1,
                                       minInfoGain=0.0,
                                       subsamplingRate=1.0,
                                       lossType='squared',
                                       maxIter=20,
                                       stepSize=0.1,
                                       seed=None,
                                       impurity='variance')
            paramGrid = ParamGridBuilder().addGrid(
                GBRegressor.maxDepth, [3, 5, 10, 15, 20, 25]).addGrid(
                    GBRegressor.maxBins, [3, 5, 10, 50, 100, 200]).addGrid(
                        GBRegressor.maxIter,
                        [5, 10, 20, 50, 100, 200]).addGrid(
                            GBRegressor.stepSize,
                            [0.01, 0.05, 0.1, 0.2, 0.5, 1.0]).addGrid(
                                GBRegressor.subsamplingRate,
                                [0.1, 0.2, 0.5, 0.8, 0.9, 1.0]).build()
            evaluator = RegressionEvaluator(labelCol=labelCol,
                                            predictionCol="Prediction",
                                            metricName="rmse")
            GBRCV = CrossValidator(estimator=GBRegressor,
                                   evaluator=evaluator,
                                   estimatorParamMaps=paramGrid,
                                   numFolds=10)
            stagesList.append(GBRCV)
            GBR_Pipeline = Pipeline(stages=stagesList)
            GBR_PipelineModel = GBR_Pipeline.fit(train_df)
            GBR_Predicted = GBR_PipelineModel.transform(test_df)
            GBR_BestModel = GBR_PipelineModel.stages[-1].bestModel
            GBR_Prediction = GBR_Predicted.select("Prediction").toPandas()
            GBR_Score = evaluator.evaluate(GBR_Predicted)
            return GBR_BestModel, GBR_Predicted, GBR_Prediction, GBR_Score
    if clustering:
        if kmeans:
            stagesList = FeaturesTransform(
                stringIndexer=stringIndexer,
                inputColStringIndexer=inputColStringIndexer,
                outputColStringIndexer=outputColStringIndexer,
                oneHotEncoder=oneHotEncoder,
                inputColOneHotEncoder=inputColOneHotEncoder,
                outputColOneHotEncoder=outputColOneHotEncoder,
                vectorAssembler=vectorAssembler,
                inputColsVectorAssembler=inputColsVectorAssembler,
                outputColsVectorAssembler=outputColsVectorAssembler,
                vectorIndexer=vectorIndexer,
                inputColsVectorIndexer=inputColsVectorIndexer,
                outputColsVectorIndexer=outputColsVectorIndexer,
                maxCategories=maxCategories)
            KCluster = KMeans(featuresCol=featuresCol,
                              predictionCol='Prediction',
                              k=n_cluster,
                              initMode='k-means||',
                              initSteps=2,
                              tol=0.0001,
                              maxIter=20,
                              seed=None)
            paramGrid = ParamGridBuilder().addGrid(
                KCluster.initSteps, [1, 2, 5, 10, 20, 50, 100]).addGrid(
                    KCluster.maxIter,
                    [10, 20, 50, 100, 200, 500, 1000, 2000]).addGrid(
                        KCluster.seed, [i for i in range(1001)]).build()
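            # NOTE: gridding over 1001 seeds yields 7 * 8 * 1001 = 56,056
            # candidate models (times 10 folds) -- extremely expensive to fit.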
            evaluator = ClusteringEvaluator(predictionCol='Prediction',
                                            featuresCol=featuresCol,
                                            metricName='silhouette')
            KMCV = CrossValidator(estimator=KCluster,
                                  evaluator=evaluator,
                                  estimatorParamMaps=paramGrid,
                                  numFolds=10)
            stagesList.append(KMCV)
            KMC_Pipeline = Pipeline(stages=stagesList)
            KMC_PipelineModel = KMC_Pipeline.fit(train_df)
            KMC_Predicted = KMC_PipelineModel.transform(train_df)
            KMC_BestModel = KMC_PipelineModel.stages[-1].bestModel
            KMC_Prediction = KMC_Predicted.select("Prediction").toPandas()
            KMC_Score = evaluator.evaluate(KMC_Predicted)
            return KMC_BestModel, KMC_Predicted, KMC_Prediction, KMC_Score
        if gaussianmixture:
            stagesList = FeaturesTransform(
                stringIndexer=stringIndexer,
                inputColStringIndexer=inputColStringIndexer,
                outputColStringIndexer=outputColStringIndexer,
                oneHotEncoder=oneHotEncoder,
                inputColOneHotEncoder=inputColOneHotEncoder,
                outputColOneHotEncoder=outputColOneHotEncoder,
                vectorAssembler=vectorAssembler,
                inputColsVectorAssembler=inputColsVectorAssembler,
                outputColsVectorAssembler=outputColsVectorAssembler,
                vectorIndexer=vectorIndexer,
                inputColsVectorIndexer=inputColsVectorIndexer,
                outputColsVectorIndexer=outputColsVectorIndexer,
                maxCategories=maxCategories)
            GMCluster = GaussianMixture(featuresCol=featuresCol,
                                        predictionCol='Prediction',
                                        probabilityCol='Probability',
                                        k=n_cluster,
                                        tol=0.01,
                                        maxIter=100,
                                        seed=None)
            paramGrid = ParamGridBuilder().addGrid(
                GMCluster.maxIter,
                [10, 20, 50, 100, 200, 500, 1000, 2000]).addGrid(
                    GMCluster.seed, [i for i in range(1001)]).build()
            evaluator = ClusteringEvaluator(predictionCol='Prediction',
                                            featuresCol=featuresCol,
                                            metricName='silhouette')
            GMCV = CrossValidator(estimator=GMCluster,
                                  evaluator=evaluator,
                                  estimatorParamMaps=paramGrid,
                                  numFolds=10)
            stagesList.append(GMCV)
            GMC_Pipeline = Pipeline(stages=stagesList)
            GMC_PipelineModel = GMC_Pipeline.fit(train_df)
            GMC_Predicted = GMC_PipelineModel.transform(train_df)
            GMC_BestModel = GMC_PipelineModel.stages[-1].bestModel
            GMC_Probability = GMC_Predicted.select("Probability").toPandas()
            GMC_Prediction = GMC_Predicted.select("Prediction").toPandas()
            GMC_Score = evaluator.evaluate(GMC_Predicted)
            return GMC_BestModel, GMC_Predicted, GMC_Probability, GMC_Prediction, GMC_Score
        if lda:
            stagesList = FeaturesTransform(
                stringIndexer=stringIndexer,
                inputColStringIndexer=inputColStringIndexer,
                outputColStringIndexer=outputColStringIndexer,
                oneHotEncoder=oneHotEncoder,
                inputColOneHotEncoder=inputColOneHotEncoder,
                outputColOneHotEncoder=outputColOneHotEncoder,
                vectorAssembler=vectorAssembler,
                inputColsVectorAssembler=inputColsVectorAssembler,
                outputColsVectorAssembler=outputColsVectorAssembler,
                vectorIndexer=vectorIndexer,
                inputColsVectorIndexer=inputColsVectorIndexer,
                outputColsVectorIndexer=outputColsVectorIndexer,
                maxCategories=maxCategories)
            LDACluster = LDA(featuresCol=featuresCol,
                             maxIter=20,
                             seed=None,
                             k=n_cluster,
                             learningOffset=1024.0,
                             learningDecay=0.51,
                             subsamplingRate=0.05)
            paramGrid = ParamGridBuilder().addGrid(
                LDACluster.maxIter,
                [10, 20, 50, 100, 200, 500, 1000, 2000]).addGrid(
                    LDACluster.seed, [i for i in range(1001)]).addGrid(
                        LDACluster.subsamplingRate,
                        [0.01, 0.05, 0.1, 0.2, 0.5, 1.0]).build()
            evaluator = ClusteringEvaluator(predictionCol='Prediction',
                                            featuresCol=featuresCol,
                                            metricName='silhouette')
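            # NOTE: LDA.transform() emits a 'topicDistribution' column rather
            # than a 'Prediction' column, so this silhouette pairing may need
            # a post-processing step in practice.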
            LDACV = CrossValidator(estimator=LDACluster,
                                   evaluator=evaluator,
                                   estimatorParamMaps=paramGrid,
                                   numFolds=10)
            stagesList.append(LDACV)
            LDA_Pipeline = Pipeline(stages=stagesList)
            LDA_PipelineModel = LDA_Pipeline.fit(train_df)
            LDA_Predicted = LDA_PipelineModel.transform(train_df)
            LDA_BestModel = LDA_PipelineModel.stages[-1].bestModel
            LDA_Topics = LDA_BestModel.describeTopics().toPandas()
            LDA_Score = evaluator.evaluate(LDA_Predicted)
            return LDA_BestModel, LDA_Topics, LDA_Score
    if recommendation:
        if als:
            ALSR = ALS(userCol=userCol,
                       itemCol=itemCol,
                       ratingCol=ratingCol,
                       rank=rank,
                       maxIter=10,
                       regParam=0.1,
                       numUserBlocks=10,
                       numItemBlocks=10,
                       alpha=1.0,
                       seed=1)
            ALSR_Model = ALSR.fit(train_df)
            # NOTE: recommendForAllUsers/recommendForAllItems take the number
            # of recommendations to return (numItems / numUsers), not an id
            ALSR_ForUsers = ALSR_Model.recommendForAllUsers(numItems=userid)
            ALSR_ForItems = ALSR_Model.recommendForAllItems(numUsers=itemid)
            return ALSR_Model, ALSR_ForUsers, ALSR_ForItems
    if association:
        if fpgrowth:
            fpg = FPGrowth(minSupport=minSupport,
                           minConfidence=minConfidence,
                           itemsCol=itemsCol,
                           predictionCol='Prediction')
            fpg_model = fpg.fit(train_df)
            fpg_freqItemsets = fpg_model.freqItemsets.toPandas()
            fpg_associationRules = fpg_model.associationRules.toPandas()
            return fpg_model, fpg_freqItemsets, fpg_associationRules
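
# Example usage -- a hedged sketch (not from the original source) of calling
# SparkML for a binary logistic regression; the assembler input columns are
# hypothetical names:
best_model, predicted, probability, prediction, score = SparkML(
    train_df,
    test_df=test_df,
    featuresCol='features',
    labelCol='label',
    binaryclass=True,
    classification=True,
    logisticregression=True,
    vectorAssembler=True,
    inputColsVectorAssembler=['age', 'income', 'tenure'],
    outputColsVectorAssembler='features')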
Example #8
assembler = VectorAssembler(
    inputCols=[x for x in train_data.columns if x not in ignore],
    outputCol='features')

train_data = (assembler.transform(train_data).select("target", "features"))

# with 500 iterations, GINI is around ~0.275 for submissions
iteration = 500
gbt = GBTClassifier(labelCol="target",
                    featuresCol="features",
                    maxIter=iteration)

evaluator = BinaryClassificationEvaluator(labelCol="target")

# no parameter search
paramGrid = ParamGridBuilder().build()

# 6-fold cross validation
crossval = CrossValidator(estimator=gbt,
                          estimatorParamMaps=paramGrid,
                          evaluator=evaluator,
                          numFolds=6)

model = crossval.fit(train_data)

print("trained GBT classifier:%s" % model)

# display CV score
auc_roc = model.avgMetrics[0]
print("AUC ROC = %g" % auc_roc)
gini = (2 * auc_roc - 1)
Example #9
    spark = SparkSession\
        .builder\
        .appName("TrainValidationSplit")\
        .getOrCreate()
    # $example on$
    # Prepare training and test data.
    data = spark.read.format("libsvm")\
        .load("data/mllib/sample_linear_regression_data.txt")
    train, test = data.randomSplit([0.7, 0.3])
    lr = LinearRegression(maxIter=10, regParam=0.1)

    # We use a ParamGridBuilder to construct a grid of parameters to search over.
    # TrainValidationSplit will try all combinations of values and determine best model using
    # the evaluator.
    paramGrid = ParamGridBuilder()\
        .addGrid(lr.regParam, [0.1, 0.01]) \
        .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])\
        .build()

    # In this case the estimator is simply the linear regression.
    # A TrainValidationSplit requires an Estimator, a set of Estimator ParamMaps, and an Evaluator.
    tvs = TrainValidationSplit(
        estimator=lr,
        estimatorParamMaps=paramGrid,
        evaluator=RegressionEvaluator(),
        # 80% of the data will be used for training, 20% for validation.
        trainRatio=0.8)

    # Run TrainValidationSplit, and choose the best set of parameters.
    model = tvs.fit(train)
    # Make predictions on test data. model is the model with combination of parameters
    # that performed best.
Example #10
# Select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",
                                              metricName="accuracy")

accuracy = evaluator.evaluate(predictions_tr)
print("Decision Tree Classifier Accuracy before Cross Validation : ", accuracy)

# We now treat the Pipeline as an Estimator, wrapping it in a CrossValidator instance.
# This will allow us to jointly choose parameters for all Pipeline stages.
# A CrossValidator requires an Estimator, a set of Estimator ParamMaps, and an Evaluator.
# We use a ParamGridBuilder to construct a grid of parameters to search over.
print("Starting CrossValidation")
paramGrid = (ParamGridBuilder()
             .addGrid(dt.maxDepth, [15, 20, 25])
             #.addGrid(dt.maxBins, [50, 80])
             .addGrid(dt.numTrees, [100, 110, 120])
             .build())

# Create k-fold CrossValidator
cv = CrossValidator(estimator=pipeline, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=8)

# Run cross-validation, and choose the best set of parameters.
cvModel = cv.fit(training)

# print("      | numTrees = ", cvModel.bestModel.numTrees)
# print("      | depth = ", cvModel.bestModel.maxDepth)

# Make predictions on test so we can measure the accuracy of our model on new data
#predictions_tr_cv = cvModel.transform(test)

Example #11
# ## Model Selection a.k.a. hyperparameter tuning

# ##### For LR

# In[53]:


from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.mllib.evaluation import BinaryClassificationMetrics
from pyspark.ml.evaluation import BinaryClassificationEvaluator

start_time = time()

# Creating ParamGrid for Cross Validation
lr_paramGrid = ParamGridBuilder() \
    .addGrid(lr.regParam, [0.1, 0.01]) \
    .build()

# Creating CrossValidator
lr_crossval = CrossValidator(estimator=lr,
                          estimatorParamMaps=lr_paramGrid,
                          evaluator=BinaryClassificationEvaluator(),
                          numFolds=10)

# Run cross validations
lr_cvModel = lr_crossval.fit(trainSet)

end_time = time()
elapsed_time = end_time - start_time
print("Time to train Logistic Regression best model: %.3f seconds" % elapsed_time)

lr_cvModel.bestModel
Example #12
# MAGIC  
# MAGIC \\[ \begin{bmatrix} a \times a \\\ a \times b & b \times b \\\ a \times c & b \times c & c \times c \\\ a \times d & b \times d & c \times d & d \times d \end{bmatrix}  \\]
# MAGIC  
# MAGIC Plus the original features
# MAGIC  
# MAGIC \\[ \begin{bmatrix} a & b & c & d \end{bmatrix} \\]

# COMMAND ----------

# MAGIC %md
# MAGIC Can we do better?  Let's build a grid of params and search using `CrossValidator`.

# COMMAND ----------

paramGridRand = (ParamGridBuilder()
                 .addGrid(rf.maxDepth, [2, 4, 8, 12])
                 .baseOn({rf.numTrees: 20})
                 .build())

cvRand = (CrossValidator()
          .setEstimator(rfPipeline)
          .setEvaluator(multiEval)
          .setEstimatorParamMaps(paramGridRand)
          .setNumFolds(2))

cvModelRand = cvRand.fit(irisTrain)
predictionsRand = cvModelRand.transform(irisTest)
print(multiEval.evaluate(predictionsRand))
print(cvModelRand.bestModel.stages[-1]._java_obj.parent().getMaxDepth())
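# In newer PySpark releases the Python model exposes param getters directly,
# e.g. cvModelRand.bestModel.stages[-1].getMaxDepth() (assumed Spark 3.x API),
# avoiding the _java_obj round-trip.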

# COMMAND ----------
Example #13
model_node9.save("hdfs://namenode:9000/example5/model_1/")

estimator_node11 = RandomForestClassifier(featureSubsetStrategy="auto",
                                          numTrees=20,
                                          maxDepth=5,
                                          predictionCol="prediction",
                                          rawPredictionCol="rawPrediction",
                                          probabilityCol="probability",
                                          labelCol="indexedSurvived",
                                          featuresCol="features",
                                          impurity="gini")
evaluator_node12 = MulticlassClassificationEvaluator(
    labelCol="indexedSurvived",
    predictionCol="prediction",
    metricName="accuracy")
param_grid_node13 = ParamGridBuilder().addGrid(estimator_node11.maxDepth,
                                               [3, 5, 8, 20]).build()
cv_node13 = CrossValidator(estimator=estimator_node11,
                           estimatorParamMaps=param_grid_node13,
                           evaluator=evaluator_node12)
model_node13 = cv_node13.fit(df_node9)
df_node13 = model_node13.transform(df_node9)
df_node16 = model_node13.transform(df_node15)
model_node13.save("hdfs://namenode:9000/example5/model_2/")

evaluator_node17 = MulticlassClassificationEvaluator(
    labelCol="indexedSurvived",
    predictionCol="prediction",
    metricName="accuracy")
score_node17 = evaluator_node17.evaluate(df_node16)
df_node17 = spark.createDataFrame([(score_node17, )], ["score"])
Example #14
def main(context):

    # dem(context)
    # gop(context)

    # SAVED PARQUETS
    # comments is the comments-minimal.json
    # submissions is the submissions.json
    # task7 is the result of the count vectorizer
    # commentsFull is the comments-minimal.json joined with submissions with the sarcasm removed and the &gt removed

    #TASK 1

    # Read from JSON
    #comments = sqlContext.read.json("comments-minimal.json.bz2")
    #comments.registerTempTable("commentsTable")
    #submissions = sqlContext.read.json("submissions.json.bz2")
    #submissions.registerTempTable("submissionsTable")

    # Write the Parquets
    #comments.write.parquet("comments.parquet")
    #submissions.write.parquet("submissions.parquet")

    # Read the parquets
    comments = sqlContext.read.parquet("comments.parquet")
    comments.registerTempTable("commentsTable")
    submissions = sqlContext.read.parquet("submissions.parquet")
    submissions.registerTempTable("submissionsTable")

    # Read the CSV
    labels = sqlContext.read.format('csv').options(header='true', inferSchema='true').load("labeled_data.csv")
    labels.registerTempTable("labelsTable")

    #TASK 2
    dfTask2 = sqlContext.sql("SELECT commentsTable.* FROM commentsTable INNER JOIN labelsTable ON commentsTable.id = labelsTable.Input_id")

    #TASK 4 and TASK 5
    def do_something(text):
        return parser.sanitize(text)

    udf_func = udf(do_something, ArrayType(StringType()))
    dfTask4 = dfTask2.withColumn("udf_results", udf_func(col("body")))

    #TASK 6A and Task 6B
    if(not os.path.exists("cvModel")):
        cv = CountVectorizer(inputCol="udf_results", outputCol="features", binary=True, minDF=5.0)
        model = cv.fit(dfTask4)
        model.write().overwrite().save("cvModel")

    model = CountVectorizerModel.load("cvModel")
    dfTask6A = model.transform(dfTask4)
    dfTask6A.registerTempTable("dfTask6ATable")
    dfTask6B = sqlContext.sql("SELECT dfTask6ATable.*, IF(labelsTable.labeldjt=1, 1, 0) AS pos_label, if(labelsTable.labeldjt=-1, 1, 0) AS neg_label FROM dfTask6ATable INNER JOIN labelsTable ON dfTask6ATable.id = labelsTable.Input_id")
    dfTask6B.registerTempTable("dfTask6BTable")

    pos = sqlContext.sql('select pos_label as label, features from dfTask6BTable')
    neg = sqlContext.sql('select neg_label as label, features from dfTask6BTable')

    if(not os.path.exists("www/neg.model") or not os.path.exists("www/pos.model")):
        # Initialize two logistic regression models.
        # Replace labelCol with the column containing the label, and featuresCol with the column containing the features.
        poslr = LogisticRegression(labelCol="label", featuresCol="features", maxIter=10).setThreshold(0.2)
        neglr = LogisticRegression(labelCol="label", featuresCol="features", maxIter=10).setThreshold(0.25)
        # This is a binary classifier so we need an evaluator that knows how to deal with binary classifiers.
        posEvaluator = BinaryClassificationEvaluator()
        negEvaluator = BinaryClassificationEvaluator()
        # There are a few parameters associated with logistic regression. We do not know what they are a priori.
        # We do a grid search to find the best parameters. We can replace [1.0] with a list of values to try.
        # We will assume the parameter is 1.0. Grid search takes forever.
        posParamGrid = ParamGridBuilder().addGrid(poslr.regParam, [1.0]).build()
        negParamGrid = ParamGridBuilder().addGrid(neglr.regParam, [1.0]).build()
        # We initialize cross-validation pipelines (numFolds=2 here; use 5 or more folds in practice).
        posCrossval = CrossValidator(
            estimator=poslr,
            evaluator=posEvaluator,
            estimatorParamMaps=posParamGrid,
            numFolds=2)
        negCrossval = CrossValidator(
            estimator=neglr,
            evaluator=negEvaluator,
            estimatorParamMaps=negParamGrid,
            numFolds=2)
        # Although crossvalidation creates its own train/test sets for
        # tuning, we still need a labeled test set, because it is not
        # accessible from the crossvalidator (argh!)
        # Split the data 50/50
        posTrain, posTest = pos.randomSplit([0.5, 0.5])
        negTrain, negTest = neg.randomSplit([0.5, 0.5])
        # Train the models
        print("Training positive classifier...")
        posModel = posCrossval.fit(posTrain)
        print("Training negative classifier...")
        negModel = negCrossval.fit(negTrain)

        # Once we train the models, we don't want to do it again. We can save the models and load them again later.
        posModel.write().overwrite().save("www/pos.model")
        negModel.write().overwrite().save("www/neg.model")

    # TO LOAD BACK IN
    posModel = CrossValidatorModel.load("www/pos.model")
    negModel = CrossValidatorModel.load("www/neg.model")

    # Task 8
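    # link_id is of the form "t3_<submission id>", so RIGHT(link_id, 6) strips the "t3_" prefix (this assumes 6-character ids)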
    dfTask8 = sqlContext.sql('SELECT commentsTable.id, commentsTable.body, commentsTable.created_utc, commentsTable.author_flair_text, submissionsTable.title, commentsTable.score AS comment_score, submissionsTable.score AS story_score FROM commentsTable INNER JOIN submissionsTable ON RIGHT(commentsTable.link_id, 6)=submissionsTable.id')
    dfTask8 = dfTask8.sample(False, 0.1, None)

    #TASK 4 and TASK 5
    def do_something(text):
        return parser.sanitize(text)

    udf_func = udf(do_something, ArrayType(StringType()))
    dfTask9_1 = dfTask8.withColumn("udf_results", udf_func(col("body")))

    #TASK 6A and Task 6B
    model = CountVectorizerModel.load("cvModel")
    dfTask9_2 = model.transform(dfTask9_1)
    dfTask9_2.registerTempTable("dfTask9_2Table")

    # Task 9
    dfTask9_3 = sqlContext.sql("SELECT * FROM dfTask9_2Table WHERE dfTask9_2Table.body NOT LIKE '%/s%' AND dfTask9_2Table.body NOT LIKE '&gt%'")
    dfTask9_3.registerTempTable("dfTask9_3Table")

    posResult_1 = posModel.transform(dfTask9_3)
    posResult_1.registerTempTable("posResult_1Table")
    posResult_2 = sqlContext.sql("SELECT posResult_1Table.id, posResult_1Table.body, posResult_1Table.author_flair_text, posResult_1Table.created_utc, posResult_1Table.title, posResult_1Table.comment_score, posResult_1Table.story_score, posResult_1Table.features, posResult_1Table.prediction AS pos FROM posResult_1Table")
    finalResult_1 = negModel.transform(posResult_2)
    finalResult_1.registerTempTable("finalResult_1Table")
    finalResult_2 = sqlContext.sql("SELECT finalResult_1Table.id, finalResult_1Table.body, finalResult_1Table.created_utc, finalResult_1Table.author_flair_text, finalResult_1Table.title, finalResult_1Table.comment_score, finalResult_1Table.story_score, finalResult_1Table.pos, finalResult_1Table.prediction AS neg FROM finalResult_1Table")
    finalResult_2.registerTempTable("finalResult_2Table")

    if(not os.path.exists("final.parquet")):
        finalResult_2.write.parquet("final.parquet")

    final = sqlContext.read.parquet("final.parquet")
    final.registerTempTable("finalTable")

    # Task 10
    if(not os.path.exists("question1.csv")):
        question1 = sqlContext.sql("SELECT (100 * sum(pos) / COUNT(*)) AS percent_pos, (100 * sum(neg) / COUNT(*)) AS percent_neg FROM finalTable")
        question1.repartition(1).write.format("com.databricks.spark.csv").option("header", "true").save("question1.csv")

    if(not os.path.exists("question2.csv")):
        question2 = sqlContext.sql("SELECT DATE(from_unixtime(finalTable.created_utc)) AS date, 100*SUM(finalTable.pos)/COUNT(*) AS percent_pos, 100*SUM(finalTable.neg)/COUNT(*) AS percent_neg FROM finalTable GROUP BY date ORDER BY date")
        question2.repartition(1).write.format("com.databricks.spark.csv").option("header", "true").save("question2.csv")

    if(not os.path.exists("question3.csv")):
        question3 = sqlContext.sql("SELECT finalTable.author_flair_text AS place, 100*SUM(finalTable.pos)/COUNT(*) AS percent_pos, 100*SUM(finalTable.neg)/COUNT(*) AS percent_neg FROM finalTable GROUP BY place ORDER BY place")
        question3.repartition(1).write.format("com.databricks.spark.csv").option("header", "true").save("question3.csv")

    if(not os.path.exists("question4_comment.csv")):
        question4_comment = sqlContext.sql("SELECT finalTable.comment_score AS comment_score, 100*SUM(finalTable.pos)/COUNT(*) AS percent_pos, 100*SUM(finalTable.neg)/COUNT(*) AS percent_neg FROM finalTable GROUP BY comment_score ORDER BY comment_score")
        question4_comment.repartition(1).write.format("com.databricks.spark.csv").option("header", "true").save("question4_comment.csv")

    if(not os.path.exists("question4_story.csv")):
        question4_story = sqlContext.sql("SELECT finalTable.story_score AS story_score, 100*SUM(finalTable.pos)/COUNT(*) AS percent_pos, 100*SUM(finalTable.neg)/COUNT(*) AS percent_neg FROM finalTable GROUP BY story_score ORDER BY story_score")
        question4_story.repartition(1).write.format("com.databricks.spark.csv").option("header", "true").save("question4_story.csv")
Example #15
def func1():
    hour_df = sqlContext.read.format("csv").option(
        "header", "true").load(Path + "hour.csv")
    print("count", hour_df.count())
    print("columns:", hour_df.columns)
    # Drop columns we do not need
    hour_df = hour_df.drop("instant").drop("dteday").drop("yr").drop(
        "casual").drop("registered")
    print("Schema:")
    hour_df.printSchema()
    # Cast every column to double
    hour_df = hour_df.select([
        col(column).cast("double").alias(column) for column in hour_df.columns
    ])
    print("Schema after casting:")
    hour_df.printSchema()
    print("First 3 rows:")
    hour_df.show(3)
    # Split the data into train_df and test_df with a 0.7:0.3 ratio
    train_df, test_df = hour_df.randomSplit([0.7, 0.3])
    train_df.cache()
    test_df.cache()
    # Build the list of feature columns
    featureCols = hour_df.columns[:-1]
    print("featureCols:", featureCols)
    # Build the pipeline
    vectorAssembler = VectorAssembler(inputCols=featureCols,
                                      outputCol="aFeatures")
    vectorIndexer = VectorIndexer(inputCol="aFeatures",
                                  outputCol="features",
                                  maxCategories=24)
    dt = DecisionTreeRegressor(labelCol="cnt", featuresCol="features")
    dt_pipeline = Pipeline(stages=[vectorAssembler, vectorIndexer, dt])
    print("查看pipeline流程:", dt_pipeline.getStages())
    # 训练
    dt_pipelineModel = dt_pipeline.fit(dataset=train_df)
    print("查看训练完成后的模型:", dt_pipelineModel.stages[2].toDebugString[:500])
    # 使用transform预测
    predicted = dt_pipelineModel.transform(test_df)
    print("查看新增的字段:", predicted.columns)
    print("查看预测的结果:", predicted.show(2))
    ### Evaluate the model
    evaluator = RegressionEvaluator(labelCol="cnt",
                                    predictionCol="prediction",
                                    metricName="rmse")
    predicted_df = dt_pipelineModel.transform(test_df)
    rmse = evaluator.evaluate(predicted_df)
    print("rmse:", rmse)
    ## Use TrainValidationSplit to find the best model
    # DecisionTreeRegressor only supports "variance" impurity ("gini"/"entropy"
    # belong to classifiers), so tune maxDepth and maxBins instead
    paramGrid = ParamGridBuilder().addGrid(dt.maxDepth, [5, 10, 15]).addGrid(
        dt.maxBins, [10, 15, 20]).build()
    tvs = TrainValidationSplit(estimator=dt,
                               evaluator=evaluator,
                               estimatorParamMaps=paramGrid,
                               trainRatio=0.8)  # trainRatio splits the data 8:2 into training and validation sets
    tvs_pipeline = Pipeline(stages=[vectorAssembler, vectorIndexer, tvs])
    tvs_pipelineModel = tvs_pipeline.fit(dataset=train_df)
    bestmodel = tvs_pipelineModel.stages[2].bestModel
    print("bestModel:", bestmodel.toDebugString[:500])
    ## Predict with the best model (use the fitted pipeline model, not the unfitted pipeline)
    predictions = tvs_pipelineModel.transform(test_df)
    rmse2 = evaluator.evaluate(predictions)
    print(rmse2)
Example #16
TN = prediction.filter('prediction = 0 AND label = prediction').count()
TP = prediction.filter('prediction = 1 AND label = prediction').count()
FN = prediction.filter('prediction = 0 AND label != prediction').count()
FP = prediction.filter('prediction = 1 AND label != prediction').count()

# Accuracy measures the proportion of correct predictions
accuracy = (TN + TP) / (TN + TP + FN + FP)
print(accuracy)
'''
It's good to check the confusion matrix in addition to accuracy. The confusion matrix shows in
detail how the model is doing, including how often it produces false positives and false
negatives. Accuracy can be misleading on an imbalanced dataset; the confusion matrix is
especially helpful in such situations.
'''
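# A minimal sketch of that check, assuming `prediction` holds numeric `label` and
# `prediction` columns as above: pivoting predictions against labels yields the
# confusion matrix straight from the DataFrame.
confusion = (prediction
             .groupBy('label')
             .pivot('prediction', [0.0, 1.0])
             .count()
             .fillna(0)
             .orderBy('label'))
confusion.show()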
# do cross validation for this model
params = (ParamGridBuilder()
             .addGrid(gbt.maxDepth, [2, 4])
             .addGrid(gbt.maxBins, [10, 20])
             .addGrid(gbt.maxIter, [5, 10])
             .build())

evaluator = BinaryClassificationEvaluator(labelCol='label')

cv = CrossValidator(estimator=pipeline, estimatorParamMaps= params, evaluator= evaluator, numFolds=5)
cvmodel = cv.fit(x_train)
best_model = cvmodel.bestModel

print(evaluator.evaluate(best_model.transform(x_test)))

'''
We can also pull other information, such as the best params, out of the best model from cross
validation: best_model.stages[3].extractParamMap() returns the winning params. Here [3] is the
index of the stage in our pipeline model; at index 3 the pipeline holds our classification model.
'''
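# A sketch of that lookup, assuming (as the note above says) the classifier is stage 3
# of the fitted pipeline inside the best model:
best_stage = best_model.stages[3]
for param, value in best_stage.extractParamMap().items():
    print(param.name, '=', value)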
Example #17
    # Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(),
                          outputCol="features")
    lr = LogisticRegression(maxIter=10)
    pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])

    # We now treat the Pipeline as an Estimator, wrapping it in a CrossValidator instance.
    # This will allow us to jointly choose parameters for all Pipeline stages.
    # A CrossValidator requires an Estimator, a set of Estimator ParamMaps, and an Evaluator.
    # We use a ParamGridBuilder to construct a grid of parameters to search over.
    # With 3 values for hashingTF.numFeatures and 2 values for lr.regParam,
    # this grid will have 3 x 2 = 6 parameter settings for CrossValidator to choose from.
    paramGrid = ParamGridBuilder() \
        .addGrid(hashingTF.numFeatures, [10, 100, 1000]) \
        .addGrid(lr.regParam, [0.1, 0.01]) \
        .build()

    crossval = CrossValidator(estimator=pipeline,
                              estimatorParamMaps=paramGrid,
                              evaluator=BinaryClassificationEvaluator(),
                              numFolds=2)  # use 3+ folds in practice

    # Run cross-validation, and choose the best set of parameters.
    cvModel = crossval.fit(training)

    # Prepare test documents, which are unlabeled.
    test = spark.createDataFrame([(4, "spark i j k"), (5, "l m n"),
                                  (6, "mapreduce spark"),
                                  (7, "apache hadoop")], ["id", "text"])
Example #18
df_train = vecAssembler.transform(train_data)
pd.DataFrame(df_train.take(5), columns=df_train.columns).transpose()

dt = DecisionTreeClassifier(labelCol="deposit", featuresCol="features")
pipeline = Pipeline(stages=[vecAssembler, dt])
model = pipeline.fit(train_data)
predictions = model.transform(test_data)
predictions.select("prediction", "deposit", "features").toPandas().head(25)

evaluator = BinaryClassificationEvaluator(
    labelCol="deposit", rawPredictionCol="prediction")

evaluator.evaluate(predictions)

paramGrid = ParamGridBuilder().addGrid(dt.maxDepth, [2,3,4,5,6,7,8,9,10,11,12]).build()

# Set up 3-fold cross validation
crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=evaluator, 
                          numFolds=3)

CV_model = crossval.fit(train_data)

tree_model = CV_model.bestModel.stages[1]
print(tree_model)

predictions_improved = CV_model.bestModel.transform(test_data)

predictions_improved.select("prediction", "deposit", "features").toPandas().head(25)
Example #19
# COMMAND ----------

print(dtc.explainParams())

# COMMAND ----------

# MAGIC %md
# MAGIC #### ![Spark Logo Tiny](https://s3-us-west-2.amazonaws.com/curriculum-release/images/105/logo_spark_tiny.png) 3-fold Cross Validation
# MAGIC 
# MAGIC ![crossValidation](http://curriculum-release.s3-website-us-west-2.amazonaws.com/images/301/CrossValidation.png)

# COMMAND ----------

paramGrid = ParamGridBuilder().\
    addGrid(dtc.maxBins, [64, 128]).\
    addGrid(dtc.maxDepth, [20, 30]).\
    build()
    
crossval = CrossValidator(estimator=dtp,
                          estimatorParamMaps=paramGrid,
                          evaluator=evaluator,
                          numFolds=3, 
                          parallelism=2)

cvModel = crossval.fit(train)

# COMMAND ----------

result = cvModel.transform(test)
display(result)
Example #20
            # print(train_mod02.limit(2).toPandas())

            from pyspark.ml.classification import RandomForestClassifier, RandomForestClassificationModel

            rfClassifer = RandomForestClassifier(labelCol="dx_factorize", numTrees=100)

            from pyspark.ml import Pipeline

            pipeline = Pipeline(stages=[rfClassifer])

            from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
            from pyspark.ml.evaluation import MulticlassClassificationEvaluator

            paramGrid = ParamGridBuilder() \
                .addGrid(rfClassifer.maxDepth, [12]) \
                .addGrid(rfClassifer.minInstancesPerNode, [20]) \
                .build()


            evaluator = MulticlassClassificationEvaluator(labelCol="dx_factorize", predictionCol="prediction",
                                                          metricName="accuracy")
            # evaluator_f1 = MulticlassClassificationEvaluator(labelCol="dx_factorize", predictionCol="prediction",
            #                                               metricName="f1")


            crossval= CrossValidator(estimator=pipeline,
                                      estimatorParamMaps=paramGrid,
                                      evaluator=evaluator,
                                      numFolds=5)

            cvModel = crossval.fit(train)
Example #21
                            predictionCol="prediction",
                            metricName="rmse")

rmse = eval_.evaluate(yhat)

print('rmse is %.2f' % rmse)

mae = eval_.evaluate(yhat, {eval_.metricName: "mae"})
print('mae is %.2f' % mae)

r2 = eval_.evaluate(yhat, {eval_.metricName: "r2"})
print('r2 is %.2f' % r2)

from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

paramGrid = (ParamGridBuilder().addGrid(gbt.maxDepth, [5, 8, 10, 12]).addGrid(
    gbt.maxBins, [64]).build())

cv = CrossValidator(estimator=gbt,
                    estimatorParamMaps=paramGrid,
                    evaluator=eval_,
                    numFolds=3)
cvModel = cv.fit(X_train)

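# The target was apparently log1p-transformed before training, so expm1 below
# inverts it for both prediction and truth; 'fiability' is 1 minus the relative
# error, floored at 0.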
yhat = (cvModel.transform(X_test).withColumn(
    "prediction", F.expm1(F.col("prediction"))).withColumn(
        target, F.expm1(F.col(target))).withColumn(
            'fiability', 1 - F.abs(F.col(target) - F.col("prediction")) /
            F.col(target)).withColumn(
                'fiability',
                F.when(F.col("fiability") < 0,
                       0).otherwise(F.col("fiability"))))
Example #22
# Use the `explainParams` method to get the full list of hyperparameters:
print(lr.explainParams())

# Setting `elasticNetParam=1.0` corresponds to $l1$ (lasso) linear regression.
# We are interested in finding a reasonable value of `regParam`.
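# For reference, the elastic-net penalty (with $\alpha$ = `elasticNetParam` and
# $\lambda$ = `regParam`) is $\lambda (\alpha \|w\|_1 + \frac{1-\alpha}{2} \|w\|_2^2)$,
# so `elasticNetParam=1.0` keeps only the $l1$ term.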


# ## Specify the hyperparameter grid

# Use the
# [ParamGridBuilder](http://spark.apache.org/docs/latest/api/python/pyspark.ml.html#pyspark.ml.tuning.ParamGridBuilder)
# class to specify a hyperparameter grid:
from pyspark.ml.tuning import ParamGridBuilder
regParamList = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5]
grid = ParamGridBuilder().addGrid(lr.regParam, regParamList).build()

# The resulting object is simply a list of parameter maps:
grid
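# A quick sketch of what's inside: each entry maps a Param to a value.
for m in grid:
    print({p.name: v for p, v in m.items()})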

# Rather than specify `elasticNetParam` in the `LinearRegression` constructor, we can specify it in our grid:
grid = ParamGridBuilder().baseOn({lr.elasticNetParam: 1.0}).addGrid(lr.regParam, regParamList).build()
grid


# ## Specify the evaluator

# In this case we will use
# [RegressionEvaluator](http://spark.apache.org/docs/latest/api/python/pyspark.ml.html#pyspark.ml.evaluation.RegressionEvaluator)
# as our evaluator and specify root-mean-squared error as the metric:
from pyspark.ml.evaluation import RegressionEvaluator
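# A sketch of the evaluator the text describes, assuming the default "label" and
# "prediction" column names:
evaluator = RegressionEvaluator(labelCol="label",
                                predictionCol="prediction",
                                metricName="rmse")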
Example #23
# create TF_IDF features
idf = IDF().setInputCol("rawFeatures").setOutputCol("features").setMinDocFreq(
    0)

# create a Logistic regression model
lr = LogisticRegression()

# streamline all above steps into a pipeline
pipeline = Pipeline(stages=[tokenizer, remover, hashingTF, idf, lr])

# train model and predict results

# perform grid search looking for the best parameters and the best models
paramGrid = ParamGridBuilder()\
    .addGrid(hashingTF.numFeatures,[1000,5000,10000])\
    .addGrid(lr.regParam, [0.1, 0.01]) \
    .addGrid(lr.elasticNetParam, [0.0, 0.3, 0.6])\
    .build()
tvs = TrainValidationSplit(
    estimator=pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=BinaryClassificationEvaluator().setMetricName('areaUnderPR'),
    trainRatio=0.8)
# set area under precision-recall curve as the evaluation metric - 80% of data will be used for training, 20% for validation

# run TrainValidationSplit and choose the best set of parameters
model = tvs.fit(train_set)

# make predictions
train_prediction = model.transform(train_set)
test_prediction = model.transform(test_set)
Example #24
#Create Decision Tree Estimator, set Label and Feature Columns
dTree = DecisionTreeClassifier(featuresCol='features', labelCol='label', predictionCol='prediction', maxDepth=5, maxBins=7000)

#Setup pipeline with feature transformers and model estimator
steps = stringIndexers + encoders + [labeler, assembler, dTree]
steps

pipeline = Pipeline(stages=steps)

#**************Train the Model*******************

#Set up a CrossValidator with the parameters, a tree estimator and evaluator


paramGrid = ParamGridBuilder().addGrid(dTree.maxDepth, [4, 5, 6]).build()


evaluator = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol="label",metricName="f1")


# Set up 3-fold cross validation with paramGrid
crossVal = CrossValidator(estimator=pipeline, evaluator=evaluator, estimatorParamMaps=paramGrid, numFolds=3)

#Use CrossValidator Estimator to fit the training data set

cvModel = crossVal.fit(strat_train_df)

cvModel.bestModel.stages

#Get the best Decision Tree Model
Example #25
#importing necessary libraries for ALS
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS

#setting parameter grid for parameter tuning by cross-validation
evaluator = RegressionEvaluator(metricName = "rmse", labelCol = "rating",
                                predictionCol = "prediction")
#Model
recsys = ALS(userCol = "user_index", itemCol = "item_index",
             ratingCol = "rating", nonnegative = True, coldStartStrategy="drop")

#Parameter grid
paramGrid = ParamGridBuilder() \
    .addGrid(recsys.regParam, [0.1, 0.01, 0.001]) \
    .addGrid(recsys.rank, [5, 10, 15]) \
    .build()

#Cross validation 
cvs = CrossValidator(estimator = recsys,
                     estimatorParamMaps = paramGrid,
                     evaluator = evaluator,
                     numFolds=5)

#Reading the Products ratings file
pathToFile = 'Products_preprocessed.csv'
ProductsDF = spark.read.csv(pathToFile, inferSchema = True, header = True)
ProductsDF.printSchema()

#Reading the metadata file
pathToFile = 'Products_meta_preprocessed.csv'
Example #26
wine_train = assembler.transform(wine_train).select('features', 'quality')
wine_valid = assembler.transform(wine_valid).select('features', 'quality')
									 
#initialize linear regression model
lr = LinearRegression(featuresCol = 'features', labelCol='quality', maxIter = 10, regParam = 0.1, elasticNetParam = 0.5)

#fitting model
model = lr.fit(wine_train)
wine_prediction = model.transform(wine_valid)

#calculate results
r = wine_prediction.stat.corr('prediction', 'quality')  # corr needs two numeric columns, not the 'features' vector
print("R-Squared: " + str(r ** 2))

# the grid's params must come from the same estimator instance handed to CrossValidator
lr_cv = LinearRegression(labelCol="quality")
crossval = CrossValidator(estimator=lr_cv,
                          estimatorParamMaps=ParamGridBuilder().addGrid(
                              lr_cv.elasticNetParam, [0, 0.5, 1.0]).build(),
                          evaluator=RegressionEvaluator(
                              labelCol="quality", metricName="r2"),
                          numFolds=10)

#cross validate the model and choose the best fit
cvModel = crossval.fit(wine_train)
model = cvModel.bestModel

#calculate with improved model
wine_prediction = model.transform(wine_valid)
r = wine_prediction.stat.corr('prediction', 'quality')
print("R-squared: " + str(r ** 2))

#exports model to be used 
Pkl_Filename = 'TrainedModel.pkl'
Example #27
    # trainer
    NUMBER_OF_MODELS = 10
    dataPipeline = Pipeline(stages=currPipeline)
    convertedData = dataPipeline.fit(df).transform(df)
    trainData, testData = convertedData.randomSplit([0.9, 0.1], seed=0)
    models = []

    for i in range(NUMBER_OF_MODELS):
        tmpSample = trainData.sample(True, 1.0, 42)
        reg = LogisticRegression(labelCol="label",
                                 featuresCol="features",
                                 standardization=False)
        evaluator = BinaryClassificationEvaluator(
            labelCol="label", rawPredictionCol="prediction")
        cvPipeLine = Pipeline(stages=[reg])
        paramGrid = ParamGridBuilder().addGrid(reg.regParam,
                                               [0.01, 0.1, 1, 2, 5]).build()
        crossval = CrossValidator(estimator=cvPipeLine,
                                  estimatorParamMaps=paramGrid,
                                  evaluator=evaluator,
                                  numFolds=2)

        tmpModel = crossval.fit(tmpSample)
        models.append(tmpModel)

    # save models
    if SAVE_MODELS:
        modelIndex = 0
        for elem in models:
            elem.save("./savedModels/model_" + str(modelIndex))
            modelIndex = modelIndex + 1
Example #28
# used for identifying features and indexing them
vectorIndexer = VectorIndexer(inputCol="rawFeatures", outputCol="features", maxCategories=4)

from pyspark.ml.regression import GBTRegressor

# Takes the "features" column and learns to predict "cnt"
gbt = GBTRegressor(labelCol="cnt")


from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import RegressionEvaluator

# define parameters such as maxDepth and maxIter to test
paramGrid = ParamGridBuilder()\
  .addGrid(gbt.maxDepth, [2, 5])\
  .addGrid(gbt.maxIter, [10, 100])\
  .build()


# this tells how well the model is doing with the help of the labels
evaluator = RegressionEvaluator(metricName="rmse", labelCol=gbt.getLabelCol(), predictionCol=gbt.getPredictionCol())

# used for tuning the model
cv = CrossValidator(estimator=gbt, evaluator=evaluator, estimatorParamMaps=paramGrid)
from pyspark.ml import Pipeline
pipeline = Pipeline(stages=[vectorAssembler, vectorIndexer, cv])
# pipeline used for running feature processing, model tuning, and training
pipelineModel = pipeline.fit(train)
# used for accuracy calculation
predictions = pipelineModel.transform(test)
# now evaluate the accuracy
Example #29
def test_param_grid_type_coercion(self):
    lr = LogisticRegression(maxIter=10)
    paramGrid = ParamGridBuilder().addGrid(lr.regParam, [0.5, 1]).build()
    for param in paramGrid:
        for v in param.values():
            assert (type(v) == float)
Example #30
featVect = VectorAssembler(inputCols=["nfeatures"], outputCol="features")
#rf = RandomForestClassifier(labelCol="label", featuresCol="features",impurity="gini",featureSubsetStrategy="auto",numTrees=10,maxDepth=32,maxBins=128,seed=1234)
lr = LogisticRegression(labelCol="label",
                        featuresCol="features",
                        maxIter=10,
                        regParam=0.3)
pipeline = Pipeline(stages=[assembler, minMax, featVect, lr])

# COMMAND ----------

# MAGIC %md ### Tune Parameters
# MAGIC You can tune parameters to find the best model for your data. To do this you can use the **CrossValidator** class to evaluate each combination of parameters defined in a **ParamGridBuilder** grid against multiple *folds* of the data, each split into training and validation datasets, in order to find the best-performing parameters. Note that this can take a long time to run, because every parameter combination is tried multiple times.

# COMMAND ----------

paramGrid = ParamGridBuilder().addGrid(lr.regParam, [0.3, 0.01]).addGrid(
    lr.maxIter, [10, 5]).build()
# TODO: numFolds is 10 below; you may also test K = 2 or 5
# K = 2, 5, 10 all gave Root Mean Square Error (RMSE): 13.2
cv = CrossValidator(estimator=pipeline,
                    evaluator=RegressionEvaluator(),
                    estimatorParamMaps=paramGrid,
                    numFolds=10)

model = cv.fit(train)

# COMMAND ----------

# MAGIC %md  ### Test the Recommender
# MAGIC The model produced by the pipeline is a transformer that applies all the stages of the pipeline to a specified DataFrame, using the trained model to generate predictions. In this case, you will transform the test DataFrame using the pipeline to generate label predictions.

# COMMAND ----------
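# A sketch of the step described above, assuming the held-out DataFrame is named
# `test` and the label column is "label" as in the earlier cells:
prediction = model.transform(test)
prediction.select("prediction", "label").show(5)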