def train_lg(training_data, collection):
    # Configure an ML pipeline, which consists of the following stages: hashingTF, idf, and lr.
    hashingTF = HashingTF(inputCol="filtered", outputCol="TF_features")
    idf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="features")
    lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)
    # Keep every stage in a single unfitted Pipeline: if hashingTF/idf were fit
    # beforehand in a separate pipeline, the numFeatures grid below would have
    # no effect during cross-validation.
    pipeline = Pipeline(stages=[hashingTF, idf, lr])

    paramGrid = ParamGridBuilder() \
        .addGrid(hashingTF.numFeatures, [10, 100, 1000, 10000]) \
        .addGrid(lr.regParam, [0.1, 0.01]) \
        .build()

    crossval = CrossValidator(estimator=pipeline,
                              estimatorParamMaps=paramGrid,
                              evaluator=BinaryClassificationEvaluator(),
                              numFolds=5)

    # Run cross-validation, and choose the best set of parameters.
    cvModel = crossval.fit(training_data)

    # model_path = os.path.join(models_dir, time.strftime("%Y%m%d-%H%M%S") + '_'
    #                           + collection["Id"] + '_'
    #                           + collection["name"])
    # cvModel.save(sc, model_path)

    return cvModel
def build_decisionTree(path):
    df = load_data(path)
    avg_age = find_avg_age(df)
    df = data_preparation(df, avg_age)

    df = df.drop('Cabin')
    df = df.drop('Ticket')
    df = df.drop('Name')

    stringIndexer = StringIndexer(inputCol="Survived", outputCol="indexed")
    si_model = stringIndexer.fit(df)
    df = si_model.transform(df)
    df.show(truncate=False)

    dt = DecisionTreeClassifier(labelCol='indexed')
    grid = ParamGridBuilder().addGrid(dt.maxDepth, [1, 2, 3, 5, 6, 8, 10]).build()
    # The label lives in the 'indexed' column, so point the evaluator at it
    # (the default labelCol is 'label', which this DataFrame does not have).
    evaluator = BinaryClassificationEvaluator(labelCol='indexed')
    cv = CrossValidator(estimator=dt, estimatorParamMaps=grid, evaluator=evaluator)

    cvModel = cv.fit(df)
    prediction = cvModel.transform(df)
    prediction.show(truncate=False)

    print("classification evaluation :", evaluator.evaluate(prediction))

    return cvModel, avg_age
def train_with_tune(input_df):
    # https://spark.apache.org/docs/latest/ml-tuning.html
    # Build the model-training pipeline
    lr = LogisticRegression()
    pipeline = Pipeline(stages=[lr])

    # Build the hyperparameter space
    paramGrid = ParamGridBuilder() \
        .addGrid(lr.regParam, [0.1, 0.01]) \
        .build()

    # Single train/validation split instead of k-fold:
    # tvs = TrainValidationSplit(estimator=pipeline,
    #                            estimatorParamMaps=paramGrid,
    #                            evaluator=BinaryClassificationEvaluator(),
    #                            # 80% of the data will be used for training, 20% for validation.
    #                            trainRatio=0.8)

    # k-fold cross validation
    cross_val = CrossValidator(estimator=pipeline,
                               estimatorParamMaps=paramGrid,
                               evaluator=BinaryClassificationEvaluator(),
                               numFolds=3)

    # train and find the best
    cvModel = cross_val.fit(input_df)
    return cvModel.bestModel
def buildModel(data, label):
    """
    Build a pipeline to classify `label` against the rest of the classes using
    binary logistic regression.
    :param data: the training data as a DataFrame
    :param label: 0..C-1 where C is the number of classes
    :return: the model as a Transformer
    """
    logging.info('building model for label = %d, type = %s' % (label, type(label)))
    lr = LogisticRegression()
    pipeline = Pipeline(stages=[lr])
    paramGrid = ParamGridBuilder()\
        .addGrid(lr.maxIter, [100])\
        .addGrid(lr.elasticNetParam, [0.0, 1.0])\
        .addGrid(lr.fitIntercept, [True, False])\
        .build()
    crossValidator = CrossValidator(estimator=pipeline,
                                    estimatorParamMaps=paramGrid,
                                    evaluator=BinaryClassificationEvaluator(),
                                    numFolds=15)
    # One-vs-rest relabeling: the target class becomes 0, every other class 1.
    dataDF = data.map(lambda point: LabeledPoint(0 if point.label == label else 1,
                                                 point.features)).toDF()
    model = crossValidator.fit(dataDF)
    return model
def test_save_load_simple_estimator(self):
    temp_path = tempfile.mkdtemp()
    dataset = self.spark.createDataFrame(
        [(Vectors.dense([0.0]), 0.0),
         (Vectors.dense([0.4]), 1.0),
         (Vectors.dense([0.5]), 0.0),
         (Vectors.dense([0.6]), 1.0),
         (Vectors.dense([1.0]), 1.0)] * 10,
        ["features", "label"])

    lr = LogisticRegression()
    grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()
    evaluator = BinaryClassificationEvaluator()

    # test save/load of CrossValidator
    cv = CrossValidator(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator)
    cvModel = cv.fit(dataset)
    cvPath = temp_path + "/cv"
    cv.save(cvPath)
    loadedCV = CrossValidator.load(cvPath)
    self.assertEqual(loadedCV.getEstimator().uid, cv.getEstimator().uid)
    self.assertEqual(loadedCV.getEvaluator().uid, cv.getEvaluator().uid)
    self.assertEqual(loadedCV.getEstimatorParamMaps(), cv.getEstimatorParamMaps())

    # test save/load of CrossValidatorModel
    cvModelPath = temp_path + "/cvModel"
    cvModel.save(cvModelPath)
    loadedModel = CrossValidatorModel.load(cvModelPath)
    self.assertEqual(loadedModel.bestModel.uid, cvModel.bestModel.uid)
def pipelineRF(dataDF):
    """
    :param dataDF: the training data as a DataFrame
    :return:
    """
    print('pipeline starting...')
    labelIndexer_transModel = StringIndexer(inputCol='label', outputCol='indexLabel').fit(dataDF)
    featIndexer_transModel = VectorIndexer(inputCol="features", outputCol="indexed_features",
                                           maxCategories=37).fit(dataDF)
    # dtEstimator = DecisionTreeClassifier(featuresCol='indexed_features', labelCol='indexLabel',
    #                                      maxDepth=5, maxBins=40, minInstancesPerNode=1,
    #                                      minInfoGain=0.0, impurity='entropy')
    rfEstimator = RandomForestClassifier(labelCol='indexLabel', featuresCol='indexed_features',
                                         maxBins=40, seed=13)
    pipeline = Pipeline(stages=[labelIndexer_transModel, featIndexer_transModel, rfEstimator])

    paramGrid = ParamGridBuilder()\
        .addGrid(rfEstimator.maxDepth, [5, 10, 30])\
        .addGrid(rfEstimator.numTrees, [20, 50, 100]).build()
    evaluator = BinaryClassificationEvaluator(labelCol='indexLabel',
                                              rawPredictionCol='rawPrediction',
                                              metricName='areaUnderROC')
    cv = CrossValidator(estimator=pipeline,
                        estimatorParamMaps=paramGrid,
                        evaluator=evaluator,
                        numFolds=10)
    cvModel = cv.fit(dataDF)
    print("pipeline end..., cvModel was fit using parameters:\n")
    pprint(cvModel.explainParams())

    predictionDF = cvModel.transform(dataDF)
    selected = predictionDF\
        .select('label', 'indexLabel', 'prediction', 'rawPrediction', 'probability')
    for row in selected.take(5):
        print(row)

    aucMetric = evaluator.evaluate(selected)
    print("auc of test data is: %.3f" % aucMetric)
def create_models(sqlContext, modelDataframe):
    modelDataframe.registerTempTable("modelDataframeTable")
    # Create dataframes to use on the positive and negative models
    pos = sqlContext.sql("SELECT pos_label AS label, features FROM modelDataframeTable")
    neg = sqlContext.sql("SELECT neg_label AS label, features FROM modelDataframeTable")
    # Initialize two logistic regression models.
    # Replace labelCol with the column containing the label, and featuresCol with the column containing the features.
    poslr = LogisticRegression(labelCol="label", featuresCol="features", maxIter=10).setThreshold(0.2)
    neglr = LogisticRegression(labelCol="label", featuresCol="features", maxIter=10).setThreshold(0.25)
    # This is a binary classifier so we need an evaluator that knows how to deal with binary classifiers.
    posEvaluator = BinaryClassificationEvaluator()
    negEvaluator = BinaryClassificationEvaluator()
    # There are a few parameters associated with logistic regression. We do not know what they are a priori.
    # We do a grid search to find the best parameters. We can replace [1.0] with a list of values to try.
    # We will assume the parameter is 1.0. Grid search takes forever.
    posParamGrid = ParamGridBuilder().addGrid(poslr.regParam, [1.0]).build()
    negParamGrid = ParamGridBuilder().addGrid(neglr.regParam, [1.0]).build()
    # We initialize a 2-fold cross-validation pipeline.
    posCrossval = CrossValidator(
        estimator=poslr,
        evaluator=posEvaluator,
        estimatorParamMaps=posParamGrid,
        numFolds=2)
    negCrossval = CrossValidator(
        estimator=neglr,
        evaluator=negEvaluator,
        estimatorParamMaps=negParamGrid,
        numFolds=2)
    # Although crossvalidation creates its own train/test sets for
    # tuning, we still need a labeled test set, because it is not
    # accessible from the crossvalidator (argh!)
    # Split the data 50/50
    posTrain, posTest = pos.randomSplit([0.5, 0.5])
    negTrain, negTest = neg.randomSplit([0.5, 0.5])
    # Train the models
    print("Training positive classifier...")
    posModel = posCrossval.fit(posTrain)
    print("Training negative classifier...")
    negModel = negCrossval.fit(negTrain)
    # Once we train the models, we don't want to do it again. We can save the models and load them again later.
    posModel.write().overwrite().save("models/posModel")
    negModel.write().overwrite().save("models/negModel")
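# A sketch, not part of the original function: the held-out halves (posTest, negTest)
# above are split off but never scored there, so one way to close the loop is to
# evaluate the fitted models on them with the same evaluators (the names assume
# the scope of the function above).
posAUC = posEvaluator.evaluate(posModel.transform(posTest))
negAUC = negEvaluator.evaluate(negModel.transform(negTest))
print("Held-out AUC: pos=%f, neg=%f" % (posAUC, negAUC))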
def test_expose_sub_models(self):
    temp_path = tempfile.mkdtemp()
    dataset = self.spark.createDataFrame(
        [(Vectors.dense([0.0]), 0.0),
         (Vectors.dense([0.4]), 1.0),
         (Vectors.dense([0.5]), 0.0),
         (Vectors.dense([0.6]), 1.0),
         (Vectors.dense([1.0]), 1.0)] * 10,
        ["features", "label"])

    lr = LogisticRegression()
    grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()
    evaluator = BinaryClassificationEvaluator()

    numFolds = 3
    cv = CrossValidator(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator,
                        numFolds=numFolds, collectSubModels=True)

    def checkSubModels(subModels):
        self.assertEqual(len(subModels), numFolds)
        for i in range(numFolds):
            self.assertEqual(len(subModels[i]), len(grid))

    cvModel = cv.fit(dataset)
    checkSubModels(cvModel.subModels)

    # Test the default value for option "persistSubModels" to be "true"
    testSubPath = temp_path + "/testCrossValidatorSubModels"
    savingPathWithSubModels = testSubPath + "cvModel3"
    cvModel.save(savingPathWithSubModels)
    cvModel3 = CrossValidatorModel.load(savingPathWithSubModels)
    checkSubModels(cvModel3.subModels)
    cvModel4 = cvModel3.copy()
    checkSubModels(cvModel4.subModels)

    savingPathWithoutSubModels = testSubPath + "cvModel2"
    cvModel.write().option("persistSubModels", "false").save(savingPathWithoutSubModels)
    cvModel2 = CrossValidatorModel.load(savingPathWithoutSubModels)
    self.assertEqual(cvModel2.subModels, None)

    for i in range(numFolds):
        for j in range(len(grid)):
            self.assertEqual(cvModel.subModels[i][j].uid, cvModel3.subModels[i][j].uid)
def buil_lrmodel(path):
    df = load_data(path)

    # -------------------- preparing the dataset -------------------------------------------
    avg_age = find_avg_age(df)
    df = data_preparation(df, avg_age)

    print("count = ", df.count())

    df = df.drop('Cabin')
    df = df.drop('Ticket')
    df = df.drop('Name')

    # ------------------ Build a model ----------------------------------------------------
    lr = LogisticRegression(maxIter=10, regParam=0.01)
    model = lr.fit(df)

    prediction = model.transform(df)
    prediction.show(truncate=False)

    evaluator = BinaryClassificationEvaluator()
    print("classification evaluation :", evaluator.evaluate(prediction))

    # -------------- selecting models with cross validation -----------------------------------
    lr = LogisticRegression()
    grid = ParamGridBuilder().addGrid(lr.maxIter, [1, 10, 50, 150, 200, 500, 1000])\
                             .addGrid(lr.regParam, [0.01, 0.05, 0.1]).build()
    cv = CrossValidator(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator)
    cvModel = cv.fit(df)

    prediction = cvModel.transform(df)
    prediction.show(truncate=False)
    print("classification evaluation :", evaluator.evaluate(prediction))

    return cvModel, avg_age
def main():
    '''
    Takes one input argument :: Location of the directory for training and test data files.
    :return: Prints output on console for the area under the ROC curve.
    '''
    conf = SparkConf().setAppName("MLPipeline")
    sc = SparkContext(conf=conf)

    # Read training data as a DataFrame
    sqlCt = SQLContext(sc)
    trainDF = sqlCt.read.parquet("20news_train.parquet")

    # Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features", numFeatures=1000)
    lr = LogisticRegression(maxIter=20, regParam=0.1)
    pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])

    # Fit the pipeline to training data.
    model = pipeline.fit(trainDF)

    numFeatures = (1000, 5000, 10000)
    regParam = (0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9)
    paramGrid = ParamGridBuilder().addGrid(hashingTF.numFeatures, numFeatures)\
                                  .addGrid(lr.regParam, regParam).build()

    cv = CrossValidator().setEstimator(pipeline)\
                         .setEvaluator(BinaryClassificationEvaluator())\
                         .setEstimatorParamMaps(paramGrid)\
                         .setNumFolds(2)

    # Evaluate the model on testing data
    testDF = sqlCt.read.parquet("20news_test.parquet")
    prediction = model.transform(testDF)
    evaluator = BinaryClassificationEvaluator()

    model_cv = cv.fit(trainDF)
    prediction_cv = model_cv.transform(testDF)
    print(evaluator.evaluate(prediction))
    print(evaluator.evaluate(prediction_cv))
def test_fit_minimize_metric(self):
    dataset = self.spark.createDataFrame([
        (10, 10.0),
        (50, 50.0),
        (100, 100.0),
        (500, 500.0)] * 10,
        ["feature", "label"])

    iee = InducedErrorEstimator()
    evaluator = RegressionEvaluator(metricName="rmse")

    grid = (ParamGridBuilder()
            .addGrid(iee.inducedError, [100.0, 0.0, 10000.0])
            .build())
    cv = CrossValidator(estimator=iee, estimatorParamMaps=grid, evaluator=evaluator)
    cvModel = cv.fit(dataset)
    bestModel = cvModel.bestModel
    bestModelMetric = evaluator.evaluate(bestModel.transform(dataset))

    self.assertEqual(0.0, bestModel.getOrDefault('inducedError'),
                     "Best model should have zero induced error")
    self.assertEqual(0.0, bestModelMetric, "Best model has RMSE of 0")
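# Aside (a sketch, not part of the test above): CrossValidator decides whether
# to minimize or maximize by asking the evaluator, which is why the RMSE grid
# above selects the smallest induced error.
from pyspark.ml.evaluation import RegressionEvaluator, BinaryClassificationEvaluator

print(RegressionEvaluator(metricName="rmse").isLargerBetter())                     # False -> minimized
print(BinaryClassificationEvaluator(metricName="areaUnderROC").isLargerBetter())   # True  -> maximized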
def main():
    # Read training data as a DataFrame
    sqlCt = SQLContext(sc)
    trainDF = sqlCt.read.parquet(training_input)
    testDF = sqlCt.read.parquet(testing_input)

    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    evaluator = BinaryClassificationEvaluator()

    # no parameter tuning
    hashingTF_notuning = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features",
                                   numFeatures=1000)
    lr_notuning = LogisticRegression(maxIter=20, regParam=0.1)
    pipeline_notuning = Pipeline(stages=[tokenizer, hashingTF_notuning, lr_notuning])
    model_notuning = pipeline_notuning.fit(trainDF)
    prediction_notuning = model_notuning.transform(testDF)
    notuning_output = evaluator.evaluate(prediction_notuning)

    # for cross validation
    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
    lr = LogisticRegression(maxIter=20)

    paramGrid = ParamGridBuilder()\
        .addGrid(hashingTF.numFeatures, [1000, 5000, 10000])\
        .addGrid(lr.regParam, [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9])\
        .build()
    pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])
    cv = CrossValidator(estimator=pipeline, estimatorParamMaps=paramGrid,
                        evaluator=evaluator, numFolds=2)
    cvModel = cv.fit(trainDF)

    # Make predictions on test documents. cvModel uses the best model found.
    best_prediction = cvModel.transform(testDF)
    best_output = evaluator.evaluate(best_prediction)

    s = str(notuning_output) + '\n' + str(best_output)
    output_data = sc.parallelize([s])
    output_data.saveAsTextFile(output)
def test_save_load_trained_model(self):
    # This tests saving and loading the trained model only.
    # Save/load for CrossValidator will be added later: SPARK-13786
    temp_path = tempfile.mkdtemp()
    dataset = self.spark.createDataFrame(
        [(Vectors.dense([0.0]), 0.0),
         (Vectors.dense([0.4]), 1.0),
         (Vectors.dense([0.5]), 0.0),
         (Vectors.dense([0.6]), 1.0),
         (Vectors.dense([1.0]), 1.0)] * 10,
        ["features", "label"])
    lr = LogisticRegression()
    grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()
    evaluator = BinaryClassificationEvaluator()
    cv = CrossValidator(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator)
    cvModel = cv.fit(dataset)
    lrModel = cvModel.bestModel

    cvModelPath = temp_path + "/cvModel"
    lrModel.save(cvModelPath)
    loadedLrModel = LogisticRegressionModel.load(cvModelPath)
    self.assertEqual(loadedLrModel.uid, lrModel.uid)
    self.assertEqual(loadedLrModel.intercept, lrModel.intercept)
def test_copy(self):
    dataset = self.spark.createDataFrame([
        (10, 10.0),
        (50, 50.0),
        (100, 100.0),
        (500, 500.0)] * 10,
        ["feature", "label"])

    iee = InducedErrorEstimator()
    evaluator = RegressionEvaluator(metricName="rmse")

    grid = (ParamGridBuilder()
            .addGrid(iee.inducedError, [100.0, 0.0, 10000.0])
            .build())
    cv = CrossValidator(estimator=iee, estimatorParamMaps=grid, evaluator=evaluator)
    cvCopied = cv.copy()
    self.assertEqual(cv.getEstimator().uid, cvCopied.getEstimator().uid)

    cvModel = cv.fit(dataset)
    cvModelCopied = cvModel.copy()
    for index in range(len(cvModel.avgMetrics)):
        self.assertTrue(abs(cvModel.avgMetrics[index] - cvModelCopied.avgMetrics[index])
                        < 0.0001)
def train_with_tune(input_df):
    # https://spark.apache.org/docs/latest/ml-tuning.html
    # build a model with GridSearch
    xgboost_params = {
        "eta": 0.023,
        "max_depth": 10,
        "min_child_weight": 0.3,
        "subsample": 0.7,
        "colsample_bytree": 0.82,
        "colsample_bylevel": 0.9,
        "eval_metric": "auc",
        "seed": 49,
        "silent": 1,
        "objective": "binary:logistic",
        "round": 10,
        "nWorkers": 2
    }
    xgb_model = XGBoostClassifier(xgboost_params)
    pipeline = Pipeline(stages=[xgb_model])

    # build the hyperparameter space
    paramGrid = ParamGridBuilder() \
        .addGrid(xgb_model.max_depth, [3, 7]) \
        .addGrid(xgb_model.min_child_weight, [0.1, 0.2, 0.3]) \
        .build()

    # k-fold cross validation
    cross_val = CrossValidator(estimator=pipeline,
                               estimatorParamMaps=paramGrid,
                               evaluator=BinaryClassificationEvaluator(rawPredictionCol="probabilities"),
                               numFolds=3)

    # train and find the best
    cvModel = cross_val.fit(input_df)
    return cvModel.bestModel
def test_save_load_nested_estimator(self):
    temp_path = tempfile.mkdtemp()
    dataset = self.spark.createDataFrame(
        [(Vectors.dense([0.0]), 0.0),
         (Vectors.dense([0.4]), 1.0),
         (Vectors.dense([0.5]), 0.0),
         (Vectors.dense([0.6]), 1.0),
         (Vectors.dense([1.0]), 1.0)] * 10,
        ["features", "label"])

    ova = OneVsRest(classifier=LogisticRegression())
    lr1 = LogisticRegression().setMaxIter(100)
    lr2 = LogisticRegression().setMaxIter(150)
    grid = ParamGridBuilder().addGrid(ova.classifier, [lr1, lr2]).build()
    evaluator = MulticlassClassificationEvaluator()

    # test save/load of CrossValidator
    cv = CrossValidator(estimator=ova, estimatorParamMaps=grid, evaluator=evaluator)
    cvModel = cv.fit(dataset)
    cvPath = temp_path + "/cv"
    cv.save(cvPath)
    loadedCV = CrossValidator.load(cvPath)
    self.assertEqual(loadedCV.getEstimator().uid, cv.getEstimator().uid)
    self.assertEqual(loadedCV.getEvaluator().uid, cv.getEvaluator().uid)

    originalParamMap = cv.getEstimatorParamMaps()
    loadedParamMap = loadedCV.getEstimatorParamMaps()
    for i, param in enumerate(loadedParamMap):
        for p in param:
            if p.name == "classifier":
                self.assertEqual(param[p].uid, originalParamMap[i][p].uid)
            else:
                self.assertEqual(param[p], originalParamMap[i][p])

    # test save/load of CrossValidatorModel
    cvModelPath = temp_path + "/cvModel"
    cvModel.save(cvModelPath)
    loadedModel = CrossValidatorModel.load(cvModelPath)
    self.assertEqual(loadedModel.bestModel.uid, cvModel.bestModel.uid)
def test_parallel_evaluation(self):
    dataset = self.spark.createDataFrame(
        [(Vectors.dense([0.0]), 0.0),
         (Vectors.dense([0.4]), 1.0),
         (Vectors.dense([0.5]), 0.0),
         (Vectors.dense([0.6]), 1.0),
         (Vectors.dense([1.0]), 1.0)] * 10,
        ["features", "label"])

    lr = LogisticRegression()
    grid = ParamGridBuilder().addGrid(lr.maxIter, [5, 6]).build()
    evaluator = BinaryClassificationEvaluator()

    # test that serial and parallel fits produce identical metrics
    cv = CrossValidator(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator)
    cv.setParallelism(1)
    cvSerialModel = cv.fit(dataset)
    cv.setParallelism(2)
    cvParallelModel = cv.fit(dataset)
    self.assertEqual(cvSerialModel.avgMetrics, cvParallelModel.avgMetrics)
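# Outside of test code, the same knob can be set at construction time; a minimal
# sketch, assuming a `pipeline`, `paramGrid`, and training DataFrame `train_df`
# are in scope (all three names are placeholders here).
cv = CrossValidator(estimator=pipeline,
                    estimatorParamMaps=paramGrid,
                    evaluator=BinaryClassificationEvaluator(),
                    numFolds=3,
                    parallelism=4)  # evaluate up to 4 parameter settings concurrently
cvModel = cv.fit(train_df)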
lr = LogisticRegression(maxIter=10)
pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])

# We now treat the Pipeline as an Estimator, wrapping it in a CrossValidator instance.
# This will allow us to jointly choose parameters for all Pipeline stages.
# A CrossValidator requires an Estimator, a set of Estimator ParamMaps, and an Evaluator.
# We use a ParamGridBuilder to construct a grid of parameters to search over.
# With 3 values for hashingTF.numFeatures and 2 values for lr.regParam,
# this grid will have 3 x 2 = 6 parameter settings for CrossValidator to choose from.
paramGrid = ParamGridBuilder() \
    .addGrid(hashingTF.numFeatures, [10, 100, 1000]) \
    .addGrid(lr.regParam, [0.1, 0.01]) \
    .build()

crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=BinaryClassificationEvaluator(),
                          numFolds=2)  # use 3+ folds in practice

# Run cross-validation, and choose the best set of parameters.
cvModel = crossval.fit(training)

# Prepare test documents, which are unlabeled.
Document = Row("id", "text")
test = sc.parallelize([(4, "spark i j k"),
                       (5, "l m n"),
                       (6, "mapreduce spark"),
                       (7, "apache hadoop")]) \
    .map(lambda x: Document(*x)).toDF()

# Make predictions on test documents. cvModel uses the best model found (lrModel).
prediction = cvModel.transform(test)
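# A sketch of how the winning grid point can be read back off the cvModel above:
# the best model is a PipelineModel whose stages mirror [tokenizer, hashingTF, lr].
bestPipeline = cvModel.bestModel
bestHashingTF = bestPipeline.stages[1]  # the HashingTF stage
bestLrModel = bestPipeline.stages[2]    # the fitted LogisticRegressionModel
print("numFeatures:", bestHashingTF.getNumFeatures())
print("regParam:", bestLrModel.getOrDefault("regParam"))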
# In[330]:

from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(predictionCol='prediction',
                                              labelCol='target_indexed',
                                              metricName='precision')

# In[331]:

from pyspark.ml.tuning import ParamGridBuilder
from pyspark.ml.tuning import CrossValidator

grid = (ParamGridBuilder()
        .baseOn([evaluator.metricName, 'precision'])
        .addGrid(dt.maxDepth, [10, 20])
        .build())

cv = CrossValidator(estimator=dt, estimatorParamMaps=grid, evaluator=evaluator)

# In[332]:

print("Fitting the decision tree on selected features")
t0 = time()
cv_model = cv.fit(dfTrainIndexed)
tt = time() - t0
print("Done in {} seconds".format(round(tt, 3)))

# In[302]:

dfTestIndexed = string_indexer_model.transform(dfTestSelect)
# COMMAND ----------

from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

# COMMAND ----------

grid = ParamGridBuilder() \
    .addGrid(dtc.maxDepth, [2, 3, 4, 5, 6, 7, 8]) \
    .addGrid(dtc.maxBins, [2, 4, 8]) \
    .build()

# COMMAND ----------

cv = CrossValidator(estimator=pipeline, evaluator=evaluator,
                    estimatorParamMaps=grid, numFolds=3)

# COMMAND ----------

# MAGIC %md Run `CrossValidator`. `CrossValidator` checks to see if an MLflow tracking server is available. If so, it logs runs within MLflow:
# MAGIC
# MAGIC * Under the current active run, log info for `CrossValidator`. (Create a new run if none are active.)
# MAGIC * For each submodel (number of folds of cross-validation x number of ParamMaps tested):
# MAGIC   * Log a run for this submodel, along with the evaluation metric on the held-out data.

# COMMAND ----------

# Explicitly create a new run.
# This allows this cell to be run multiple times.
# If you omit mlflow.start_run(), then this cell could run once,
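# A minimal sketch of the explicit-run pattern the truncated comment above is
# describing, assuming the `cv` defined earlier and a training DataFrame
# `training` from a previous cell:
import mlflow

with mlflow.start_run():
    cvModel = cv.fit(training)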
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Setting SVM parameters from users
user_svm_param_maxIter = [16, 32, 64, 128]
user_svm_param_numFolds = 3

# Settings for SVM - parameter grid search
svm_paramGrid = ParamGridBuilder().addGrid(svmclassifier.maxIter, user_svm_param_maxIter).build()

evaluator = BinaryClassificationEvaluator()
multiEvaluator = MulticlassClassificationEvaluator()

# Setting parameters for cross-validation
svm_cv = CrossValidator(estimator=pipeline, evaluator=evaluator,
                        estimatorParamMaps=svm_paramGrid, numFolds=user_svm_param_numFolds)
svm_cvmodel = svm_cv.fit(train)

# Evaluating SVM model performance
from pyspark.sql.functions import udf

svm_predictions = svm_cvmodel.transform(test)
auroc = evaluator.evaluate(svm_predictions, {evaluator.metricName: "areaUnderROC"})
aupr = evaluator.evaluate(svm_predictions, {evaluator.metricName: "areaUnderPR"})
"The AUROC is %s and the AUPR is %s" % (auroc, aupr)

f1score = multiEvaluator.evaluate(svm_predictions, {multiEvaluator.metricName: "f1"})
dt = DecisionTreeClassifier(featuresCol='bigramVectors',
                            labelCol=string_indexer.getOutputCol(),
                            maxDepth=10)

from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(predictionCol='prediction',
                                              labelCol='target_indexed',
                                              metricName='precision')

from pyspark.ml.tuning import ParamGridBuilder
from pyspark.ml.tuning import CrossValidator

grid = (ParamGridBuilder()
        .baseOn([evaluator.metricName, 'precision'])
        .addGrid(dt.maxDepth, [10, 20])
        .build())

cv = CrossValidator(estimator=dt, estimatorParamMaps=grid, evaluator=evaluator)

from time import time

print("Start fitting")
t0 = time()
cv_model = cv.fit(featIndexed)
tt = time() - t0
print("Classifier trained in {} seconds".format(round(tt, 3)))

print("Start preprocessing test data")
t0 = time()
dfTestTok = tokenizer.transform(dfTest)
dfTestBigram = bigram.transform(dfTestTok)
featuresTest = dfTestBigram.map(partial(vectorizeBi, dico=dict_broad.value)).toDF(schema)
pipeline = Pipeline(stages=[tokenizerNoSw, hashing_tf, idf, string_indexer, dt])

# ****************************************************************
# ********************* CROSS VALIDATION: 80%/20% ****************
# ******************* Model: DecisionTreeClassifier **************
# ****************************************************************

evaluator = MulticlassClassificationEvaluator(
    predictionCol="prediction", labelCol="target_indexed", metricName="precision"
)

grid = ParamGridBuilder().baseOn([evaluator.metricName, "precision"]).addGrid(dt.maxDepth, [10, 20]).build()

print("Grid is built")

cv = CrossValidator(estimator=pipeline, estimatorParamMaps=grid, evaluator=evaluator)

print("CV estimator is defined")

cv_model = cv.fit(dfTrain)

print("Model is fitted")

df_test_pred = cv_model.transform(dfTest)

print("Labels are predicted")

print(evaluator.evaluate(df_test_pred))
rfModel = model.stages[2]
print(rfModel)  # summary only

### LR
lr = LogisticRegression(maxIter=10)
pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])

paramGrid = ParamGridBuilder() \
    .addGrid(hashingTF.numFeatures, [100, 1000, 10000]) \
    .addGrid(lr.regParam, [0.1, 0.01]) \
    .build()

crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=BinaryClassificationEvaluator(),
                          numFolds=3)  # use 3+ folds in practice

# Run cross-validation, and choose the best set of parameters.
cvModel = crossval.fit(train)

# Make predictions on test documents. cvModel uses the best model found (lrModel).
prediction = cvModel.transform(test)

selected = prediction.select("id", "text", "probability", "prediction")
for row in selected.collect():
    print(row)

# `evaluator` is assumed to be defined earlier in the notebook.
accuracy = evaluator.evaluate(prediction)
from pyspark.sql import SQLContext
from pyspark import SparkContext
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.linalg import Vectors
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

sc = SparkContext("local", "Pipeline")
sqlContext = SQLContext(sc)

dataset = sqlContext.createDataFrame(
    [(Vectors.dense([0.0]), 0.0),
     (Vectors.dense([0.4]), 1.0),
     (Vectors.dense([0.5]), 0.0),
     (Vectors.dense([0.6]), 1.0),
     (Vectors.dense([1.0]), 1.0)] * 10,
    ["features", "label"])
lr = LogisticRegression()
grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()
evaluator = BinaryClassificationEvaluator()
cv = CrossValidator(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator)
cvModel = cv.fit(dataset)
# The per-grid-point metrics live on the fitted model, not on the CrossValidator.
print(cvModel.avgMetrics)

# Example output (pasted from a larger run with a tokenizer/n-gram/HashingTF grid):
# avgMetrics: [2.80026035, 2.77896443, 2.52157438, 2.77129878, 2.68407165, 2.29883198]
# with one entry per ParamMap in the grid (tokenizer x n-gram size x numFeatures).
from pyspark.ml.tuning import ParamGridBuilder

model_new = ALS(userCol="userId", itemCol="movieId", ratingCol="rating",
                nonnegative=True, coldStartStrategy="drop")

# Parameters for tuning
paramGrid = ParamGridBuilder().addGrid(model_new.regParam, [0.1, 0.01, 0.001]).addGrid(
    model_new.rank, [5, 10, 15]).build()

crossvalidation = CrossValidator(estimator=model_new,
                                 estimatorParamMaps=paramGrid,
                                 evaluator=evaluator,
                                 numFolds=10)

# Using the Best Model
model_cv = crossvalidation.fit(training).bestModel

# Evaluate and print the predictions
print("RMSE value after solving cold start problem is: ",
      evaluator.evaluate(model_cv.transform(test)))

# As we can see, even after CV there isn't much improvement.

# # Step 5. Top 10 movies for all the users

# In[33]:
pl_rff = basePipeline + [rff]
pg_rff = ParamGridBuilder()\
    .baseOn({pipeline.stages: pl_rff})\
    .build()

# One grid from the individual grids
paramGrid = pg_lr + pg_dt + pg_rff

# COMMAND ----------

# The regression metric can be rmse, r2
# See the metrics here https://spark.apache.org/docs/latest/mllib-evaluation-metrics.html#regression-model-evaluation
# Should run more than 3 folds, but here we simplify so that it will complete
cv = CrossValidator()\
    .setEstimator(pipeline)\
    .setEvaluator(RegressionEvaluator()
                  .setMetricName("r2"))\
    .setEstimatorParamMaps(paramGrid)\
    .setNumFolds(3)

cvModel = cv.fit(df1)

# COMMAND ----------

# MAGIC %md ## Best and Worst Model

# COMMAND ----------

import numpy as np

# RegressionEvaluator metric name is r2, so higher is better
# http://gim.unmc.edu/dxtests/roc3.htm
print("Best Model")
def main(context):
    """Main Function takes a Spark SQL Context."""
    #---------------------------------------------------------------------------
    # TASK 1
    # Code for task 1...
    # df = context.read.csv('labeled_data.csv')
    # df.write.parquet("labeled_data.parquet")
    # comments = context.read.json("comments-minimal.json.bz2")
    # comments.write.parquet("comments.parquet")
    # submissions = context.read.json("submissions.json.bz2")
    # submissions.write.parquet("submissions.parquet")

    labeled_data = context.read.parquet('labeled_data.parquet')
    labeled_data = labeled_data.withColumnRenamed("_c0", "Input_id")\
                               .withColumnRenamed("_c1", "labeldem")\
                               .withColumnRenamed("_c2", "labelgop")\
                               .withColumnRenamed("_c3", "labeldjt")
    # labeled_data.show()
    comments = context.read.parquet('comments.parquet')
    # comments.show()
    submissions = context.read.parquet('submissions.parquet')
    # submissions.show()

    #---------------------------------------------------------------------------
    # TASK 2
    # Code for task 2...
    labeled_comments = labeled_data.join(comments, comments.id == labeled_data.Input_id)
    labeled_comments = labeled_comments.select('Input_id', 'labeldjt', 'body')
    # labeled_comments.show()

    #---------------------------------------------------------------------------
    # TASK 4
    # Code for task 4...
    sanitize_udf = udf(sanitize, ArrayType(StringType()))

    #---------------------------------------------------------------------------
    # TASK 5
    # Code for task 5...
    sanitized_labeled_comments = labeled_comments.select(
        'Input_id', 'labeldjt', sanitize_udf('body').alias('raw'))

    #---------------------------------------------------------------------------
    # TASK 6A
    # Code for task 6A...
    cv = CountVectorizer(binary=True, minDF=10.0, inputCol="raw", outputCol="features")
    model = cv.fit(sanitized_labeled_comments)
    sanitized_labeled_comments = model.transform(sanitized_labeled_comments)
    sanitized_labeled_comments.show(truncate=False)
    countVectorizerPath = "count_vectorizer_model"
    model.save(countVectorizerPath)

    #---------------------------------------------------------------------------
    # TASK 6B
    # Code for task 6B...
    # Labels: {1, 0, -1, -99}
    pos = sanitized_labeled_comments.select(
        sanitized_labeled_comments.features,
        sanitized_labeled_comments.labeldjt.cast(IntegerType()))
    pos = pos.withColumnRenamed("labeldjt", "label")
    pos = pos.replace(-1, 0)
    pos = pos.replace(-99, 0)
    # pos.show()

    neg = sanitized_labeled_comments.select(
        sanitized_labeled_comments.features,
        sanitized_labeled_comments.labeldjt.cast(IntegerType()))
    neg = neg.withColumnRenamed("labeldjt", "label")
    neg = neg.replace(1, 0)
    neg = neg.replace(-99, 0)
    neg = neg.replace(-1, 1)
    # neg.show()

    #---------------------------------------------------------------------------
    # TASK 7
    # Code for task 7...
    # ... MACHINE LEARNING PORTION TO TRAIN MODELS - Initialize two logistic regression models.
    # Replace labelCol with the column containing the label, and featuresCol with the column containing the features.
    poslr = LogisticRegression(labelCol="label", featuresCol="features", maxIter=10)
    neglr = LogisticRegression(labelCol="label", featuresCol="features", maxIter=10)
    # This is a binary classifier so we need an evaluator that knows how to deal with binary classifiers.
    posEvaluator = BinaryClassificationEvaluator()
    negEvaluator = BinaryClassificationEvaluator()
    # There are a few parameters associated with logistic regression. We do not know what they are a priori.
    # We do a grid search to find the best parameters. We can replace [1.0] with a list of values to try.
    # We will assume the parameter is 1.0. Grid search takes forever.
    posParamGrid = ParamGridBuilder().addGrid(poslr.regParam, [1.0]).build()
    negParamGrid = ParamGridBuilder().addGrid(neglr.regParam, [1.0]).build()
    # We initialize a 5 fold cross-validation pipeline.
    posCrossval = CrossValidator(estimator=poslr,
                                 evaluator=posEvaluator,
                                 estimatorParamMaps=posParamGrid,
                                 numFolds=5)
    negCrossval = CrossValidator(estimator=neglr,
                                 evaluator=negEvaluator,
                                 estimatorParamMaps=negParamGrid,
                                 numFolds=5)
    # Although crossvalidation creates its own train/test sets for
    # tuning, we still need a labeled test set, because it is not
    # accessible from the crossvalidator (argh!)
    # Split the data 50/50
    posTrain, posTest = pos.randomSplit([0.5, 0.5])
    negTrain, negTest = neg.randomSplit([0.5, 0.5])
    # Train the models
    print("Training positive classifier...")
    posModel = posCrossval.fit(posTrain)
    print("Training negative classifier...")
    negModel = negCrossval.fit(negTrain)

    # Once we train the models, we don't want to do it again. We can save the models and load them again later.
    posModel.save("project2/pos.model")
    negModel.save("project2/neg.model")
    # Positive Model: posModel
    # Negative Model: negModel

    #---------------------------------------------------------------------------
    # TASK 8
    # Code for task 8...
    # ... Make Final Deliverable for Unseen Data - We don't need labeled_data anymore
    strip_t3_udf = udf(strip_t3, StringType())
    sarcastic_or_quote_udf = udf(sarcastic_or_quote, BooleanType())

    # Get Unseen Data
    sanitized_final_deliverable = comments.select('created_utc',
                                                  strip_t3_udf(comments.link_id).alias('link_id'),
                                                  'author_flair_text', 'id', 'body', 'gilded',
                                                  sanitize_udf('body').alias('raw'),
                                                  comments.score.alias('c_score'))\
        .filter(sarcastic_or_quote_udf(comments['body']))
    # F.when(comments["body"].rlike('^>|\/s'), False).otherwise(True))
    # sanitized_final_deliverable.show()

    #---------------------------------------------------------------------------
    # TASK 9
    # Code for task 9...
    # Load models that we saved on previous runs of this script
    model = CountVectorizerModel.load("count_vectorizer_model")
    posModel = CrossValidatorModel.load("project2/pos.model")
    negModel = CrossValidatorModel.load("project2/neg.model")

    # Sanitize TASK 8 - Run the CountVectorizerModel on TASK 8 Relation
    sanitized_final_deliverable = model.transform(sanitized_final_deliverable)

    # Run classifier on unseen data to get positive labels
    posResult = posModel.transform(sanitized_final_deliverable)
    # Rename the 3 new columns to prevent name conflicts
    posResult = posResult.withColumnRenamed("probability", "probability_pos")\
                         .withColumnRenamed("rawPrediction", "rawPrediction_pos")\
                         .withColumnRenamed("prediction", "prediction_pos")
    # Run the classifier on previous positive result to get negative labels too
    result = negModel.transform(posResult)
    # Rename the 3 new columns to make it easier to see which is which
    result = result.withColumnRenamed("probability", "probability_neg")\
                   .withColumnRenamed("rawPrediction", "rawPrediction_neg")\
                   .withColumnRenamed("prediction", "prediction_neg")

    # UDF functions for predicting label based on thresholds
    predict_pos_udf = udf(predict_pos, IntegerType())
    predict_neg_udf = udf(predict_neg, IntegerType())

    # Make predictions based on probability and threshold:
    result = result.select('created_utc', 'author_flair_text', 'link_id', 'id', 'c_score', 'gilded',
                           predict_pos_udf(result.probability_pos).alias('pos'),
                           predict_neg_udf(result.probability_neg).alias('neg'))
    result.write.parquet("result.parquet")
    # result.show()

    #---------------------------------------------------------------------------
    # TASK 10
    # Code for task 10...
    # ... Perform Analysis on the Predictions
    result = context.read.parquet("result.parquet")
    submissions = submissions.select('id', 'title', submissions.score.alias('s_score'))
    result = result.join(submissions, result.link_id == submissions.id)  # .explain()
    result.show()
    context.registerDataFrameAsTable(result, "result")

    # 1. Percentage of Comments that Were Positive/Negative Across ALL Submissions
    task_10_1 = context.sql(
        "SELECT title, AVG(pos) AS pos_percentage, AVG(neg) AS neg_percentage FROM result GROUP BY title"
    )
    task_10_1.show()
    task_10_1.repartition(1).write.format("com.databricks.spark.csv").option(
        "header", "true").save("2task_10_1.csv")

    # 2. Percentage of Comments that Were Positive/Negative Across ALL Days
    task_10_2 = context.sql(
        "SELECT FROM_UNIXTIME(created_utc, 'Y-M-d') AS day, AVG(pos) AS pos_percentage, AVG(neg) AS neg_percentage FROM result GROUP BY day ORDER BY day asc"
    )
    task_10_2.show()
    task_10_2.repartition(1).write.format("com.databricks.spark.csv").option(
        "header", "true").save("2task_10_2.csv")

    # 3. Percentage of Comments that Were Positive/Negative Across ALL States
    context.registerFunction("check_state_udf", check_state, BooleanType())
    task_10_3 = context.sql(
        "SELECT author_flair_text AS state, AVG(pos) AS pos_percentage, AVG(neg) AS neg_percentage FROM result WHERE check_state_udf(author_flair_text) = True GROUP BY state"
    )
    task_10_3.show()
    task_10_3.repartition(1).write.format("com.databricks.spark.csv").option(
        "header", "true").save("2task_10_3.csv")

    # 4A. Percentage of Comments that Were Positive/Negative Across ALL Comments
    task_10_4A = context.sql(
        "SELECT c_score AS comment_score, AVG(pos) AS pos_percentage, AVG(neg) AS neg_percentage FROM result GROUP BY comment_score"
    )
    task_10_4A.show()
    task_10_4A.repartition(1).write.format("com.databricks.spark.csv").option(
        "header", "true").save("2task_10_4A.csv")
    # 4B. Percentage of Comments that Were Positive/Negative Across ALL Story Scores
    task_10_4B = context.sql(
        "SELECT s_score AS submission_score, AVG(pos) AS pos_percentage, AVG(neg) AS neg_percentage FROM result GROUP BY submission_score"
    )
    task_10_4B.show()
    task_10_4B.repartition(1).write.format("com.databricks.spark.csv").option(
        "header", "true").save("2task_10_4B.csv")

    #---------------------------------------------------------------------------
    # Extra Credit (Task 10)
    # 1. Percentage of Comments that Were Positive/Negative For Gilded and Non-Gilded Comments
    task_10_extra_credit = context.sql(
        "SELECT gilded, AVG(pos) AS pos_percentage, AVG(neg) AS neg_percentage FROM result GROUP BY gilded"
    )
    task_10_extra_credit.show()
    task_10_extra_credit.repartition(1).write.format(
        "com.databricks.spark.csv").option(
        "header", "true").save("task_10_extra_credit.csv")
def train(inputs_path: str):
    spark = SparkUtils.build_or_get_session('training')
    df_kids = spark.read.parquet(inputs_path)
    label_col = 'final_status'

    mlflow_tracking_ui = 'http://35.246.84.226'
    mlflow_experiment_name = 'kickstarter'
    mlflow.set_tracking_uri(mlflow_tracking_ui)
    mlflow.set_experiment(experiment_name=mlflow_experiment_name)

    numerical_columns = ['days_campaign', 'hours_prepa', 'goal']
    categorical_columns = ['country_clean', 'currency_clean']
    features = numerical_columns + categorical_columns
    df = df_kids.select(features + [label_col])

    max_iter = 15
    model_specs: Pipeline = build_model(
        numerical_columns=numerical_columns,
        categorical_columns=categorical_columns,
        label_col=label_col,
        max_iter=max_iter)

    df_train, df_test = df.randomSplit([0.8, 0.2], seed=12345)
    df_train = df_train.cache()

    evaluator = BinaryClassificationEvaluator() \
        .setMetricName('areaUnderROC') \
        .setRawPredictionCol('rawPrediction') \
        .setLabelCol('final_status')

    gbt = model_specs.getStages()[-1]
    params_grid = ParamGridBuilder()\
        .addGrid(gbt.maxDepth, [6]) \
        .addGrid(gbt.maxIter, [15]) \
        .addGrid(gbt.maxBins, [32])\
        .build()

    cross_val = CrossValidator(estimator=model_specs,
                               estimatorParamMaps=params_grid,
                               evaluator=evaluator,
                               numFolds=2)

    with mlflow.start_run() as active_run:
        logger.info(f'Cross evaluating model on {df_train.count()} lines')
        cross_val_model: CrossValidatorModel = cross_val.fit(df_train)
        model = cross_val_model.bestModel

        logger.info('Evaluating model')
        train_metrics = evaluator.evaluate(model.transform(df_train))
        metrics = {'train_auc': train_metrics}
        test_metrics = evaluator.evaluate(model.transform(df_test))
        metrics.update({'test_auc': test_metrics})
        logger.info(f'Model metrics: {metrics}')

        logger.info('Logging to mlflow')
        mlflow_params = {'model_class': 'gbt', 'max_iter': max_iter}
        mlflow.log_params(mlflow_params)
        mlflow.log_metrics(metrics)
        log_model(model, 'model')
        model_uri = mlflow.get_artifact_uri(artifact_path='model')
        logger.info(f'Model successfully trained and saved @ {model_uri}')
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import RegressionEvaluator
from mmlspark import LightGBMRegressor

# today_str = datetime.date.today().strftime("%Y%m%d")
today_str = "20190126"


def mae_metric(y_true, predict):
    mae = mean_absolute_error(y_true, predict)
    return 1 / (1 + mae)


my_score = make_scorer(mae_metric, greater_is_better=True)

train_path = "E:/lgb/train_" + today_str + ".csv"
test_path = "E:/lgb/test_" + today_str + ".csv"
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

X_train, y_train = train_df.drop(['user_id', 'target'], axis=1), train_df['target']
X_test = test_df.drop(['user_id'], axis=1).values
X_train = X_train.values
y_train = y_train.values

lgb = LightGBMRegressor(objective="quantile", alpha=0.2, learningRate=0.01, numLeaves=31)
# CrossValidator also requires a parameter grid and an evaluator; an empty grid
# (an assumption here) still cross-validates the estimator's current settings,
# and "mae" matches the mae_metric defined above.
cv = CrossValidator(estimator=lgb,
                    estimatorParamMaps=ParamGridBuilder().build(),
                    evaluator=RegressionEvaluator(metricName="mae"),
                    numFolds=5)
param_grid = ParamGridBuilder() \
    .addGrid(als.rank, [10, 50, 75, 100]) \
    .addGrid(als.maxIter, [5, 50, 75, 100]) \
    .addGrid(als.regParam, [.01, .05, .1, .15]) \
    .build()

# Define evaluator as RMSE
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                predictionCol="prediction")

# Print the number of models in the grid
print("Num models to be tested using param_grid: ", len(param_grid))

# COMMAND ----------

# Build cross validation using CrossValidator
cv = CrossValidator(estimator=als, estimatorParamMaps=param_grid,
                    evaluator=evaluator, numFolds=5)

# Fit the cross-validator and keep its best ALS model
model = cv.fit(training).bestModel
predictions = model.transform(test)
predictions.show(n=10)

# COMMAND ----------

rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

# COMMAND ----------

# Generate n recommendations for all users
ALS_recommendations = model.recommendForAllUsers(numItems=10)  # n = 10
ALS_recommendations.show(n=10)
# COMMAND ----------

from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# Create ParamGrid for Cross Validation
paramGrid = (ParamGridBuilder()
             .addGrid(lr.regParam, [0.01, 0.1, 0.5, 1.0, 2.0])
             .addGrid(lr.elasticNetParam, [0.0, 0.1, 0.5, 0.8, 1.0])
             .addGrid(lr.maxIter, [1, 5, 10, 20])
             .build())

# COMMAND ----------

# Create 5-fold CrossValidator
cv = CrossValidator(estimator=lr, estimatorParamMaps=paramGrid,
                    evaluator=evaluator, numFolds=5)

# Run cross validations
cvModel = cv.fit(trainingData)

# COMMAND ----------

# Use test set here so we can measure the accuracy of our model on new data
predictions = cvModel.transform(testData)

# COMMAND ----------

# cvModel uses the best model found from the Cross Validation
# Evaluate best model
evaluator.evaluate(predictions)
# trainingDataDF, testingDataDF = trainingData2.randomSplit([0.8, 0.2], seed=0)

# COMMAND ----------

pipeline = Pipeline(stages=[glm])
pipeline2 = Pipeline(stages=[rfr])

# COMMAND ----------

paramGrid = ParamGridBuilder().addGrid(glm.maxIter, [8, 10, 12])\
                              .addGrid(glm.regParam, [0.4, 0.6, 0.8]).build()
paramGrid2 = ParamGridBuilder().addGrid(rfr.maxDepth, [20, 25])\
                               .addGrid(rfr.maxBins, [32, 48]).build()

# COMMAND ----------

crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=RegressionEvaluator(metricName="mae"),
                          numFolds=5)
crossval2 = CrossValidator(estimator=pipeline2,
                           estimatorParamMaps=paramGrid2,
                           evaluator=RegressionEvaluator(metricName="mae"),
                           numFolds=5)

# COMMAND ----------

trainingDataSJ = trainingData2.filter("city == 'sj'")
trainingDataIQ = trainingData2.filter("city == 'iq'")
testingDataSJ = testingData2.filter("city == 'sj'")
testingDataIQ = testingData2.filter("city == 'iq'")
# testingData2SJ = testingData2.filter("city == 'sj'")
# testingData2IQ = testingData2.filter("city == 'iq'")

# COMMAND ----------

cvModel = crossval2.fit(trainingDataSJ)   # RFR
cvModel2 = crossval2.fit(trainingDataIQ)  # RFR
# COMMAND ----------

# Define Pipeline
pipeline = Pipeline(stages=[
    Neighborhood_indexer, YearBuilt_indexer, MoSold_indexer, YrSold_indexer,
    assembler, lr
])

# COMMAND ----------

paramGrid = ParamGridBuilder().addGrid(lr.regParam, [0.1, 0.05, 0.01])\
    .addGrid(lr.fitIntercept, [False, True])\
    .addGrid(lr.elasticNetParam, [0.0, 0.25, 0.5, 0.75, 1.0]).build()

evaluator = RegressionEvaluator(metricName="rmse", labelCol="label")

# Pass the evaluator defined above so RMSE on 'label' drives model selection.
crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=evaluator,
                          numFolds=3)
cvModel = crossval.fit(train)

# COMMAND ----------

prediction = cvModel.transform(test)

# COMMAND ----------

display(prediction.selectExpr("id as Id", "prediction as SalePrice"))

# COMMAND ----------
def random_forest_classifier(training_data, test_data, validation_data):
    # ROC: 0.73
    # rf = RandomForestClassifier(featuresCol='scaled_features', labelCol='label')
    # ROC: 0.75
    # rf = RandomForestClassifier(featuresCol='scaled_features', labelCol='label', numTrees=50,
    #                             maxDepth=30, maxBins=32)
    # ROC: 0.75
    # rf = RandomForestClassifier(featuresCol='scaled_features', labelCol='label', weightCol='classWeights',
    #                             numTrees=50, maxDepth=30, maxBins=32)
    # ROC: 0.75
    # rf = RandomForestClassifier(featuresCol='scaled_features', labelCol='label', numTrees=50,
    #                             impurity='entropy', maxDepth=30, maxBins=32)
    # ROC: 0.70
    # rf = RandomForestClassifier(featuresCol='scaled_features', labelCol='label', numTrees=50,
    #                             impurity='entropy', maxDepth=30, maxBins=2)
    # ROC: 0.76
    # rf = RandomForestClassifier(featuresCol='scaled_features', labelCol='label', numTrees=100,
    #                             maxDepth=30, maxBins=100)
    rf = RandomForestClassifier(featuresCol='scaled_features', labelCol='label', weightCol='classWeights',
                                numTrees=25, maxDepth=5, maxBins=32)
    rfModel = rf.fit(training_data)
    # print(rfModel.featureImportances)

    # Plot roc curve
    roc_plot(rfModel)

    predict_valid = rfModel.transform(validation_data)
    # predict_train = rfModel.transform(training_data)
    predict_valid.show(5)
    evaluate_metrics(predict_valid)

    evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", labelCol='label',
                                              metricName="areaUnderROC")
    model_evaluator(evaluator=evaluator, evaluator_name="areaUnderROC", data=predict_valid,
                    data_type="valid_data")

    # predict_final = rfModel.transform(test_data)
    # model_evaluator(evaluator=evaluator, evaluator_name="areaUnderROC", data=predict_final,
    #                 data_type="test_data")

    print("\n\nParameter Grid and cross validation")
    paramGrid = ParamGridBuilder() \
        .addGrid(rf.maxDepth, [2, 4, 6]) \
        .addGrid(rf.maxBins, [20, 60]) \
        .addGrid(rf.numTrees, [5, 20]) \
        .build()

    # Create 5-fold CrossValidator
    cv = CrossValidator(estimator=rf, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=5)

    # Run cross validations. This can take about 6 minutes since it is training over 20 trees!
    cvModel = cv.fit(training_data)

    predict_cross_valid = cvModel.transform(validation_data)
    model_evaluator(evaluator=evaluator, evaluator_name="areaUnderROC", data=predict_cross_valid,
                    data_type="valid_data")

    predict_final = cvModel.bestModel.transform(test_data)
    model_evaluator(evaluator=evaluator, evaluator_name="areaUnderROC", data=predict_final,
                    data_type="test_data")
# K fold Cross Validation
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

paramGrid = ParamGridBuilder()\
    .addGrid(logR.aggregationDepth, [2, 5, 10])\
    .addGrid(logR.elasticNetParam, [0.0, 0.5, 1.0])\
    .addGrid(logR.fitIntercept, [False, True])\
    .addGrid(logR.maxIter, [10, 100])\
    .addGrid(logR.regParam, [0.01, 0.5, 2.0]) \
    .build()

CV = CrossValidator(estimator=logR, estimatorParamMaps=paramGrid,
                    evaluator=evaluator_AUC, numFolds=5)

CVModel = CV.fit(train)

# Best Model
Best_Logm = CVModel.bestModel
print(Best_Logm.coefficients)
print(Best_Logm.intercept)

predict_train_cv = CVModel.transform(train)
predict_test_cv = CVModel.transform(test)

predict_train_cv_pd = predict_train_cv.toPandas()
predict_test_cv_pd = predict_test_cv.toPandas()
def decision_tree_classifier(training_data, test_data, validation_data):
    # ROC 0.69
    # dt = DecisionTreeClassifier(featuresCol='scaled_features', labelCol='label', maxDepth=3)
    # ROC 0.46
    # dt = DecisionTreeClassifier(featuresCol='scaled_features', labelCol='label', maxDepth=10)
    # ROC 0.68
    dt = DecisionTreeClassifier(featuresCol='scaled_features', labelCol='label', maxDepth=3,
                                impurity='entropy')
    model = dt.fit(training_data)

    predict_valid = model.transform(validation_data)
    # predict_train = model.transform(training_data)
    # predict_valid.show(10)
    evaluate_metrics(predict_valid)
    predict_valid.select('*').show(10)

    evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", labelCol='label',
                                              metricName="areaUnderROC")
    model_evaluator(evaluator=evaluator, evaluator_name="areaUnderROC", data=predict_valid,
                    data_type="valid_data")

    print("\n\nParameter Grid and cross validation")
    paramGrid = ParamGridBuilder() \
        .addGrid(dt.maxDepth, [1, 2, 6, 10]) \
        .addGrid(dt.maxBins, [20, 40, 80]) \
        .build()

    # Create 5-fold CrossValidator
    cv = CrossValidator(estimator=dt, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=5)

    # Run cross validations
    cvModel = cv.fit(training_data)
    print("numNodes = ", cvModel.bestModel.numNodes)
    print("depth = ", cvModel.bestModel.depth)

    # Use test set to measure the accuracy of the model on new data
    predict_cross_valid = cvModel.transform(validation_data)

    # cvModel uses the best model found from the Cross Validation
    # Evaluate best model
    # ROC 0.706, slightly better than Logistic Regression
    model_evaluator(evaluator=evaluator, evaluator_name="areaUnderROC", data=predict_cross_valid,
                    data_type="valid_data")
pandas_df = pd.DataFrame(X_train_t)
pandas_df["label"] = y_train
spark_df = spark.createDataFrame(pandas_df)
assembler = VectorAssembler(inputCols=[str(a) for a in pandas_df.columns[:-1]],
                            outputCol="features")

# spark ML logistic regression w/ grid search
start = time.time()
lr = LR()
pipeline = Pipeline(stages=[assembler, lr])
paramGrid = ParamGridBuilder().addGrid(lr.regParam, [10.0, 1.0, 0.1, 0.01]).build()
crossval = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=MulticlassClassificationEvaluator(),
    numFolds=5,
    parallelism=8,
)
cvModel = crossval.fit(spark_df)
print("-- spark ML LR --")
print("Train Time: {0}".format(time.time() - start))
print("Best Model CV Score: {0}".format(np.mean(cvModel.avgMetrics)))

# test holdout
pandas_df = pd.DataFrame(X_test_t)
pandas_df["label"] = y_test
eval_df = spark.createDataFrame(pandas_df)
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
# Score the held-out frame, not the training frame.
print("Holdout F1: {0}".format(evaluator.evaluate(
    cvModel.transform(eval_df))))
def weighted_logistic_regression(training_data, test_data, validation_data):
    # ROC: 0.69
    lr = LogisticRegression(featuresCol='scaled_features', labelCol='label',
                            weightCol='classWeights', maxIter=100)
    # Always the same prediction. ROC: 0.5
    # lr = LogisticRegression(featuresCol='scaled_features', labelCol='label', maxIter=100,
    #                         regParam=0.3, elasticNetParam=0.8)
    # A little better, still not very good. ROC: 0.663
    # lr = LogisticRegression(featuresCol='scaled_features', labelCol='label', maxIter=100,
    #                         regParam=0.1, elasticNetParam=0.8)
    # ROC: 0.60
    # lr = LogisticRegression(featuresCol='scaled_features', labelCol='label', maxIter=20,
    #                         regParam=0.01, elasticNetParam=0.3)
    # ROC: 0.61
    # lr = LogisticRegression(featuresCol='scaled_features', labelCol='label', maxIter=20,
    #                         regParam=1e-10, elasticNetParam=0.2)
    # ROC: 0.65. It seems that with more iterations things improve
    # lr = LogisticRegression(featuresCol='scaled_features', labelCol='label', maxIter=100,
    #                         regParam=1e-10, elasticNetParam=0.2)
    # ROC: 0.66. The best one so far; to improve further we must move to other choices
    # lr = LogisticRegression(featuresCol='scaled_features', labelCol='label', weightCol='classWeights',
    #                         maxIter=100, regParam=0.01, elasticNetParam=0.2)

    # Train model using training data
    model = lr.fit(training_data)
    metric_plotting(model)
    print("\nCoefficients: " + str(model.coefficients))
    print("Intercept: " + str(model.intercept))

    # Make predictions on test data using the transform method.
    # LogisticRegression.transform() will only use the features column.
    # predict_train = model.transform(training_data)
    # predict_test = model.transform(test_data)
    predict_valid = model.transform(validation_data)
    evaluate_metrics(predict_valid)

    # View the predictions
    predict_valid.select('*').show(10)

    # Evaluate the model
    evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", labelCol='label',
                                              metricName="areaUnderROC")
    # After experimenting and reading more about different metrics, areaUnderROC seems right for our purposes
    # evaluator3 = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", labelCol='label',
    #                                            metricName="areaUnderPR")
    # evaluator2 = MulticlassClassificationEvaluator(labelCol='label', metricName='accuracy')
    model_evaluator(evaluator=evaluator, evaluator_name="areaUnderROC", data=predict_valid,
                    data_type="valid_data")
    # model_evaluator(evaluator=evaluator2, evaluator_name="accuracy", data=predict_valid, data_type="valid_data")
    # model_evaluator(evaluator=evaluator3, evaluator_name="areaUnderPR", data=predict_valid, data_type="valid_data")

    # Create ParamGrid for Cross Validation
    # This grid takes a while. Choose another one for the next implementation.
    # ROC: 0.694. Took forever with these parameters; execute with different parameters.
    # This seems to be the best we are going to get with this model
    # paramGrid = ParamGridBuilder() \
    #     .addGrid(lr.aggregationDepth, [2, 5, 10]) \
    #     .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0]) \
    #     .addGrid(lr.fitIntercept, [False, True]) \
    #     .addGrid(lr.maxIter, [10, 100, 1000]) \
    #     .addGrid(lr.regParam, [0.01, 0.5, 2.0]) \
    #     .build()

    print("\n\nParameter Grid and cross validation")
    # ROC: 0.694
    paramGrid = ParamGridBuilder() \
        .addGrid(lr.aggregationDepth, [2, 5, 10]) \
        .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0]) \
        .addGrid(lr.fitIntercept, [False, True]) \
        .addGrid(lr.maxIter, [20, 50, 100]) \
        .addGrid(lr.regParam, [0.01, 0.2, 1.0]) \
        .build()

    # Create 5-fold CrossValidator
    cv = CrossValidator(estimator=lr, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=5)

    # Run cross validations
    cvModel = cv.fit(training_data)
    # predict_train = cvModel.transform(training_data)
    predict_cross_valid = cvModel.transform(validation_data)
    model_evaluator(evaluator=evaluator, evaluator_name="areaUnderROC", data=predict_cross_valid,
                    data_type="valid_data")
])

evaluator = MulticlassClassificationEvaluator(labelCol="HasDetections",
                                              predictionCol="prediction",
                                              metricName="accuracy")

print("Configuring CrossValidation")
params = ParamGridBuilder() \
    .addGrid(categorical_hasher.numFeatures, [2048]) \
    .addGrid(regression.fitIntercept, [True]) \
    .addGrid(regression.maxIter, [100]) \
    .addGrid(regression.threshold, [0.5]) \
    .addGrid(regression.standardization, [False]) \
    .build()
validator = CrossValidator(estimator=pipeline,
                           estimatorParamMaps=params,
                           evaluator=evaluator,
                           numFolds=3)

print("Fitting -> Training Data")
pipeline_model = validator.fit(train)

print("Transforming -> Test Data")
predictions = pipeline_model.transform(test)
predictions.select("HasDetections", "MachineIdentifier", "probability", "prediction").show(truncate=False)

print("Computing Accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test set accuracy = {0}".format(accuracy))

print("Saving Pipeline Model")
# Generate top 10 movie recommendations for a specified set of users
# Take 3 users, keeping the userIdInt column
print('==============recommendForUserSubset==============')
users = ratingSamples.select(als.getUserCol()).distinct().limit(3)
userSubsetRecs = model.recommendForUserSubset(users, 10)  # recommend movies for these three users only
userSubsetRecs.show(5, False)
# Generate top 10 user recommendations for a specified set of movies
# Take 3 movies, keeping the movieIdInt column
print('==============recommendForItemSubset==============')
movies = ratingSamples.select(als.getItemCol()).distinct().limit(3)
movieSubSetRecs = model.recommendForItemSubset(movies, 10)  # recommend users for these three movies only
movieSubSetRecs.show(5, False)
# Hyperparameter search using k-fold cross-validation
print('==============CrossValidator==============')
paramGrid = ParamGridBuilder().addGrid(als.regParam, [0.01]).build()
cv = CrossValidator(estimator=als, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=10)
cvModel = cv.fit(ratingSamples)
# Average error for every hyperparameter combination in the grid
avgMetrics = cvModel.avgMetrics
print('avgMetrics:', avgMetrics)
# Best model
print('bestModel:', cvModel.bestModel)
cvModel.bestModel.recommendForAllUsers(10).show(10, False)
spark.stop()
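# For context, a minimal sketch of the ALS setup the snippet above relies on
# (`als`, `model`, and `evaluator` are defined earlier in the original; the
# column names and coldStartStrategy here are assumptions):
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator

als = ALS(userCol='userIdInt', itemCol='movieIdInt', ratingCol='rating',
          coldStartStrategy='drop')  # drop NaN predictions so CV metrics stay finite
model = als.fit(ratingSamples)
evaluator = RegressionEvaluator(metricName='rmse', labelCol='rating',
                                predictionCol='prediction')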
# Build a RF model
rf = RandomForestRegressor(labelCol="x4", featuresCol="indexedFeatures", numTrees=3, maxDepth=29,
                           maxBins=32, featureSubsetStrategy="auto")
# Pipeline
pipeline = Pipeline(stages=[featureIndexer, rf])
# Cross validation (empty grid: CV is used purely for error estimation, not tuning)
paramGrid = ParamGridBuilder().build()
evaluator = RegressionEvaluator(predictionCol="prediction", labelCol="x4", metricName="rmse")
evaluator2 = RegressionEvaluator(predictionCol="prediction", labelCol="x4", metricName="r2")
cv = CrossValidator(estimator=pipeline, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=5)  # 5-fold CV
cvModel = cv.fit(trainDF)
cvModel1 = cv.fit(trainingData)  # second CV fit on the alternate train/test split
predict = cvModel.transform(testDF)
predict.select("prediction").show(10)
# write predictions out; the with-statement ensures the file is closed
with open('predictionFile.txt', 'w') as file1:
    file1.write('\n'.join(map(str, predict.select("prediction").collect())))
predict_cv = cvModel1.transform(testData)
rmse = evaluator.evaluate(predict_cv)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)
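# The empty grid above means the 5-fold CV only estimates error for the fixed
# RF configuration. If tuning were wanted, a small grid might look like this
# (the value ranges are assumptions):
paramGrid = ParamGridBuilder() \
    .addGrid(rf.numTrees, [3, 10, 30]) \
    .addGrid(rf.maxDepth, [5, 10, 20]) \
    .build()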
# COMMAND ----------

from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.regression import GeneralizedLinearRegression
from pyspark.ml import Pipeline
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

glr = GeneralizedLinearRegression().setFamily("gaussian").setLink("identity")
pipeline = Pipeline().setStages([glr])
params = ParamGridBuilder().addGrid(glr.regParam, [0, 0.5, 1]).build()
evaluator = RegressionEvaluator()\
    .setMetricName("rmse")\
    .setPredictionCol("prediction")\
    .setLabelCol("label")
cv = CrossValidator()\
    .setEstimator(pipeline)\
    .setEvaluator(evaluator)\
    .setEstimatorParamMaps(params)\
    .setNumFolds(2)  # should always be 3 or more, but this dataset is small
model = cv.fit(df)

# COMMAND ----------

from pyspark.mllib.evaluation import RegressionMetrics

out = model.transform(df)\
    .select("prediction", "label").rdd.map(lambda x: (float(x[0]), float(x[1])))
metrics = RegressionMetrics(out)
print("MSE: " + str(metrics.meanSquaredError))
print("RMSE: " + str(metrics.rootMeanSquaredError))
print("R-squared: " + str(metrics.r2))
print("MAE: " + str(metrics.meanAbsoluteError))
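# An optional follow-up (a sketch, not from the original): pulling the winning
# regParam out of the fitted CrossValidatorModel. bestModel is a PipelineModel
# whose only stage is the fitted glr.
best_glr = model.bestModel.stages[0]
print("best regParam: " + str(best_glr.getOrDefault('regParam')))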
# Further, we incur a cost of bias and overfitting due to artificially balancing our training set.
# 88% of ads appear in both sets, so a little overfitting cost will not be crucial.
train = train.sampleBy('label', fractions={0: .24, 1: 1.0}).cache()
rf = RandomForestClassifier()
stratified_CV_data = CV_data.sampleBy('Churn', fractions={
    0: 388. / 2278,
    1: 1.0
}).cache()
# TODO: Extend the ParamGrid
grid = ParamGridBuilder().addGrid(rf.maxDepth, [3, 5, 8]).build()
evaluator = BinaryClassificationEvaluator()
cv = CrossValidator(estimator=rf,  # the grid tunes rf, so rf (not lr) must be the estimator
                    estimatorParamMaps=grid,
                    evaluator=evaluator,
                    numFolds=5)
cvModel = cv.fit(train)
score = evaluator.evaluate(cvModel.transform(train))
# Saving the model
bestModel = cvModel.bestModel
os.system('mkdir rf')
os.chdir(os.getcwd() + '/rf')
bestModel.save(os.getcwd() + '/rfModel')
np.save('score', score)
# TODO: Make predictions on test_transformed and use APK
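# The hard-coded sampleBy fractions above assume known class counts. A sketch
# of deriving the downsampling fraction from the data instead (names and the
# seed are assumptions):
counts = dict(train.groupBy('label').count().collect())  # e.g. {0: n0, 1: n1}
frac = min(1.0, float(counts[1]) / counts[0])  # shrink the majority class toward the minority size
train_balanced = train.sampleBy('label', fractions={0: frac, 1: 1.0}, seed=42).cache()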
    baggingFraction=0.7
)
# rawPredictionCol must point at the model's raw scores, not the input features
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", labelCol="label",
                                          metricName="areaUnderPR")
paramGrid = ParamGridBuilder()\
    .addGrid(model.maxDepth, [9, 13]) \
    .addGrid(model.featureFraction, [0.9, 0.7, 0.5]) \
    .build()
print('Creating cross-validator with {} folds'.format(kFolds))
crossval = CrossValidator(
    estimator=model,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=kFolds,
    seed=1)
print('Training model...')
model_trained = crossval.fit(train_data)
print('Training finished')
try:
    for i, j in zip(model_trained.avgMetrics, paramGrid):
        print("Score {} with parameters {}".format(i, j))
except Exception:
    print("Could not zip(model.avgMetrics, paramGrid)")

# FEATURE IMPORTANCES
for i, j in zip(model_trained.bestModel.getFeatureImportances(), train_cols):
# Split the data into training and test sets (20% held out for testing)
(trainingData, testData) = dfFinal.randomSplit([0.8, 0.2])
# Train the model.
# rf = RandomForestClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures")
nb = NaiveBayes(smoothing=1.0, labelCol="indexedLabel", featuresCol="indexedFeatures")
# pipeline = Pipeline(stages=[labelIndexer, featureIndexer, rf])
pipeline = Pipeline(stages=[labelIndexer, featureIndexer, nb])
paramGrid = ParamGridBuilder().build()
crossval = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=BinaryClassificationEvaluator(),
    numFolds=5)
model = crossval.fit(trainingData)
# Compute raw scores on the test set
predictions = model.transform(testData)
predictions.select("prediction", "indexedLabel", "features").show(5)
rddPredictions = predictions.select("prediction", "indexedLabel").rdd
accuracy = rddPredictions.filter(lambda p: (p['prediction'] == p['indexedLabel'])).count() / float(testData.count())
# Taking class 1 as the positive class. Note that FP must condition on the
# prediction, not the label (the original definitions were swapped).
TP = rddPredictions.filter(lambda p: (p['prediction'] == 1 and p['prediction'] == p['indexedLabel'])).count()
TN = rddPredictions.filter(lambda p: (p['prediction'] == 0 and p['prediction'] == p['indexedLabel'])).count()
FP = rddPredictions.filter(lambda p: (p['prediction'] == 1 and p['prediction'] != p['indexedLabel'])).count()
FN = rddPredictions.filter(lambda p: (p['prediction'] == 0 and p['prediction'] != p['indexedLabel'])).count()
print("TP = ", TP)
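# With the four counts above, the usual derived metrics follow directly
# (a sketch; the zero checks guard against empty classes):
precision = TP / float(TP + FP) if (TP + FP) else 0.0
recall = TP / float(TP + FN) if (TP + FN) else 0.0
f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
print("precision = ", precision, " recall = ", recall, " f1 = ", f1)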
# Build labeled points from the data
data_class = zip(data, Y)  # if a=[1,2,3] & b=['a','b','c'] then zip(a,b)=[(1,'a'),(2,'b'),(3,'c')]
dcRDD = sc.parallelize(data_class, numSlices=16)
# Get the labeled points
labeledRDD = dcRDD.map(partial(createBinaryLabeledPoint, dictionary=dict_broad.value))
#****************************************************************
#*********************CROSS VALIDATION: 80%/20%******************
#*******************Model: logistic regression*******************
#*****************************************************************
# Create a data frame from an RDD -> features must be Vectors.sparse from pyspark.mllib.linalg
sqlContext = SQLContext(sc)
df = sqlContext.createDataFrame(labeledRDD, ['features', 'label'])
dfTrain, dfTest = df.randomSplit([0.8, 0.2])
dfTrain.show()
# Choose estimator and grid
lr = LogisticRegression()  # choose the model
# The grid searches over maxIter. (lr.elasticNetParam controls the regularization
# mix -- 0 for L2, 1 for L1 -- but it is not tuned here.)
grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()
print("Start Cross validation")
evaluator = BinaryClassificationEvaluator()  # choose the evaluator
cv = CrossValidator(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator)  # keeps the best value of maxIter
cvModel = cv.fit(dfTrain)  # perform cross-validation, then refit on the whole training set
resultat = evaluator.evaluate(cvModel.transform(dfTest))  # area under ROC on the test set
print("Area under ROC on the test set (0-1): ", resultat)
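# createBinaryLabeledPoint is defined elsewhere in the original. A hypothetical
# sketch of what such a helper typically does (binary bag-of-words features over
# a broadcast word->index dictionary; the exact behavior is an assumption):
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint

def createBinaryLabeledPoint(doc_and_label, dictionary):
    words, label = doc_and_label
    # presence (0/1) features indexed through the shared dictionary
    idx = sorted({dictionary[w] for w in words if w in dictionary})
    return LabeledPoint(label, Vectors.sparse(len(dictionary), idx, [1.0] * len(idx)))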
vectorizedData = training_data.toDF()
print("Creating MultilayerPerceptronClassifier...")
MLP = MultilayerPerceptronClassifier(labelCol='indexedLabel', featuresCol='indexedFeatures')
labelIndexer = StringIndexer(inputCol='label', outputCol='indexedLabel').fit(vectorizedData)
featureIndexer = VectorIndexer(inputCol='features', outputCol='indexedFeatures',
                               maxCategories=2).fit(data.toDF())
pipeline = Pipeline(stages=[labelIndexer, featureIndexer, MLP])
# Layers: 3072 inputs -> one hidden layer (200 or 500 units) -> 10 output classes
paramGrid_MLP = ParamGridBuilder().addGrid(MLP.layers, [[3072, neuron, 10] for neuron in [200, 500]]).build()
evaluator = MulticlassClassificationEvaluator(labelCol='indexedLabel', predictionCol='prediction',
                                              metricName='f1')
print("Processing cross-validation with 3 folds & 200/500 hidden layer units")
crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid_MLP,
                          evaluator=evaluator,
                          numFolds=3)
starttime = datetime.datetime.now()
CV_model = crossval.fit(vectorizedData)
print(CV_model.bestModel.stages[2])
print('Done fitting model: %s' % (datetime.datetime.now() - starttime))
print("Transforming testing data...")
vectorized_test_data = testing_data.toDF()
# transformed_data1 = CV_model.transform(vectorizedData)
# print(evaluator.getMetricName(), 'score:', evaluator.evaluate(transformed_data1))
transformed_data = CV_model.transform(vectorized_test_data)
# print(transformed_data.first())
print("Scoring testing data with the model...")
print(evaluator.getMetricName(), 'score:', evaluator.evaluate(transformed_data))
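# In the layer grids above, the first element (3072) must match the feature
# vector size and the last (10) the number of classes; only the hidden width is
# tuned. Retrieving the winning topology afterwards (a sketch; stage 2 is the
# MLP in this pipeline):
best_mlp = CV_model.bestModel.stages[2]
print('best layers:', best_mlp.getOrDefault('layers'))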
    .addGrid(lr.tol, (1e-4, 1e-5))\
    .addGrid(lr.elasticNetParam, (0.25, 0.75))\
    .build()

# DEFINE PIPELINE
# SIMPLY THE MODEL HERE, WITHOUT TRANSFORMATIONS
pipeline = Pipeline(stages=[lr])

# DEFINE CV WITH PARAMETER SWEEP
# Cross-validation splits the dataset into a set of folds which are used as
# separate training and test datasets; with 3 folds it generates 3
# (training, test) pairs, each using 2/3 of the data for training and 1/3 for testing.
# 8 param combinations x 3 folds
# See: https://spark.apache.org/docs/latest/ml-tuning.html#cross-validation
cv = CrossValidator(estimator=lr,
                    estimatorParamMaps=paramGrid,
                    evaluator=RegressionEvaluator(),
                    numFolds=3)

# CONVERT TO DATA FRAME, AS CROSSVALIDATOR WON'T RUN ON RDDS
# trainDataFrame = sqlContext.createDataFrame(oneHotTRAINreg, ["features", "label"])

# TRAIN WITH CROSS-VALIDATION
# cv_model = cv.fit(trainDataFrame)
cv_model = cv.fit(trainReg.toDF(['label', 'features']))

# EVALUATE MODEL ON TEST SET
# testDataFrame = sqlContext.createDataFrame(oneHotTESTreg, ["features", "label"])
testDataFrame = testReg.toDF(['label', 'features'])

# MAKE PREDICTIONS ON TEST DOCUMENTS
# MAGIC We will create a 5-fold cross-validator.

# COMMAND ----------

from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# Create ParamGrid for Cross Validation
paramGrid = (ParamGridBuilder()
             .addGrid(lr.regParam, [0.01, 0.5, 2.0])
             .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])
             .addGrid(lr.maxIter, [1, 5, 10])
             .build())

# COMMAND ----------

# Create 5-fold CrossValidator
cv = CrossValidator(estimator=lr, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=5)

# Run cross-validation; this will likely take a while because of the number of
# models being trained and evaluated (27 param combinations x 5 folds)
cvModel = cv.fit(trainingData)

# COMMAND ----------

# Use the test set to measure the accuracy of the model on new data
predictions = cvModel.transform(testData)

# COMMAND ----------

# cvModel uses the best model found during cross-validation
# Evaluate best model
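# COMMAND ----------

# A sketch of the evaluation step the cell above leads into (assumes the same
# `evaluator` used to build the CrossValidator):
print('Best model test metric: ', evaluator.evaluate(predictions))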
assembler = VectorAssembler(inputCols=["sepal_length", "sepal_width", "petal_length", "petal_width"],
                            outputCol="features")

# COMMAND ----------

rfmodel = RandomForestClassifier()\
    .setLabelCol("label")\
    .setFeaturesCol("features")
# print(rfmodel.explainParams())

# COMMAND ----------

paramGrid = ParamGridBuilder().addGrid(rfmodel.maxBins, [10, 20]).addGrid(rfmodel.maxDepth, [5, 10]).build()
pipeline = Pipeline().setStages([assembler, rfmodel])
evaluator = MulticlassClassificationEvaluator()
# renamed from `tvs`: this is a CrossValidator, not a TrainValidationSplit
cv = CrossValidator().setEstimator(pipeline).setEvaluator(evaluator).setEstimatorParamMaps(paramGrid).setNumFolds(4)

# COMMAND ----------

training, test = new_data.randomSplit([0.75, 0.25], seed=12345)
model = cv.fit(training)

# COMMAND ----------

from pyspark.sql import Row
newtest = Row(sepal_length=3.50, sepal_width=1.0, petal_length=2.00, petal_width=0.30)
df4 = sc.parallelize([newtest]).toDF()
dff = model.transform(df4)
display(dff)

# COMMAND ----------
def RandomForestClassifier(self):
    # Note: the method name shadows the imported RandomForestClassifier at class
    # scope only; the bare name below still resolves to the pyspark class.
    print("********************************************************************************************************************************************")
    print("Random Forest")
    self.t0 = time()
    rf = RandomForestClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures",
                                numTrees=100, maxDepth=4, maxBins=32, impurity="entropy")
    pipeline = Pipeline(stages=[self.labelIndexer, self.featureIndexer, rf, self.labelConverter])
    model = pipeline.fit(self.trainingData)
    self.tm = time() - self.t0
    print("Model training time: {} seconds".format(self.tm))
    self.t0 = time()
    self.predictions = model.transform(self.testData)
    self.tt = time() - self.t0
    print("Time to classify the test data: {} seconds".format(self.tt))
    self.t0 = time()
    predictions_train = model.transform(self.trainingData)
    self.te = time() - self.t0
    print("Time to classify the training data: {} seconds".format(self.te))
    self.predictions.select("features", "label", "predictedLabel", "probability").show(5)
    evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction",
                                                  metricName="accuracy")
    self.t0 = time()
    self.accuracy = evaluator.evaluate(self.predictions)
    self.tt2 = time() - self.t0
    print("Evaluation time: {} seconds. Test accuracy: {}".format(self.tt2, self.accuracy))
    self.t0 = time()
    self.train_accuracy = evaluator.evaluate(predictions_train)
    self.te2 = time() - self.t0
    print("Evaluation time: {} seconds. Training accuracy: {}".format(self.te2, self.train_accuracy))
    print("Test accuracy = %g" % (self.accuracy))
    self.testError = (1.0 - self.accuracy)
    print("Test error = %g" % (1.0 - self.accuracy))
    print("Training accuracy = %g" % (self.train_accuracy))
    self.train_Error = (1.0 - self.train_accuracy)
    print("Training error = %g" % (1.0 - self.train_accuracy))
    rfModel = model.stages[2]
    evaluatorf1 = MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction",
                                                    metricName="f1")
    self.f1 = evaluatorf1.evaluate(self.predictions)
    self.train_f1 = evaluatorf1.evaluate(predictions_train)
    print("test f1 = %g" % self.f1)
    print("training f1 = %g" % self.train_f1)
    evaluatorwp = MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction",
                                                    metricName="weightedPrecision")
    self.wp = evaluatorwp.evaluate(self.predictions)
    self.train_wp = evaluatorwp.evaluate(predictions_train)
    print("test weightedPrecision = %g" % self.wp)
    print("training weightedPrecision = %g" % self.train_wp)
    evaluatorwr = MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction",
                                                    metricName="weightedRecall")
    self.wr = evaluatorwr.evaluate(self.predictions)
    self.train_wr = evaluatorwr.evaluate(predictions_train)
    print("test weightedRecall = %g" % self.wr)
    print("training weightedRecall = %g" % self.train_wr)
    # print(rfModel._call_java('toDebugString'))
    messagebox.showinfo("Success", "Model trained")
    self.skorEkle()
    self.ModelBtn.grid_remove()
    self.SonucBtn.grid(row=7, column=2)
    self.ExportCsvBtn.grid(row=8, column=2)

    svm = LinearSVC(maxIter=5, regParam=0.01)
    LSVC = LinearSVC()
    ovr = OneVsRest(classifier=LSVC)
    paramGrid = ParamGridBuilder().addGrid(LSVC.maxIter, [10, 100]) \
        .addGrid(LSVC.regParam, [0.001, 0.01, 1.0, 10.0]).build()
    crossval = CrossValidator(estimator=ovr,
                              estimatorParamMaps=paramGrid,
                              evaluator=MulticlassClassificationEvaluator(metricName="f1"),
                              numFolds=2)
    Train_sparkframe = self.trainingData.select("features", "label")
    cvModel = crossval.fit(Train_sparkframe)
    bestModel = cvModel.bestModel
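    # Optional follow-up (a sketch, not in the original): score the held-out
    # data with the tuned one-vs-rest model; attribute names mirror those above.
    test_predictions = bestModel.transform(self.testData.select("features", "label"))
    ovr_f1 = MulticlassClassificationEvaluator(metricName="f1").evaluate(test_predictions)
    print("one-vs-rest test f1 = %g" % ovr_f1)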