def trainModel(self, infoData):
    """Fit a gradient-boosted tree classifier and push results through the pipeline.

    Pulls the dataset and the label/feature/prediction column names out of
    ``infoData``, fits a ``GBTClassifier`` on an 80/20 random split, writes
    the transformed splits and the fitted model back into ``infoData``, then
    chains the model-store and evaluation steps.

    :param infoData: dict keyed by ``pc`` constants; mutated and returned.
    :return: the updated ``infoData`` after ``storeModel`` and ``evaluation``.
    """
    dataset = infoData.get(pc.DATASET)
    labelColm = infoData.get(pc.INDEXEDCOLM)
    featuresColm = infoData.get(pc.FEATURESCOLM)
    predictionColm = infoData.get(pc.PREDICTIONCOLM)

    # Temporary split into training/testing sets; fixed seed keeps runs reproducible.
    trainDataset, testDataset = dataset.randomSplit([0.80, 0.20], seed=0)

    fittedModel = GBTClassifier(
        labelCol=labelColm,
        featuresCol=featuresColm,
        predictionCol=predictionColm,
    ).fit(trainDataset)

    scoredTrain = fittedModel.transform(trainDataset)
    scoredTest = fittedModel.transform(testDataset)

    infoData.update({
        pc.TESTDATA: scoredTest,
        pc.TRAINDATA: scoredTrain,
        pc.MODEL: fittedModel,
    })
    infoData = self.storeModel(infoData)
    infoData = self.evaluation(infoData)
    return infoData
def predictions(train, test):
    """Train a GBT classifier and print accuracy, precision, recall and F1.

    Fits a ``GBTClassifier`` on ``train``, scores ``test``, and derives binary
    metrics from the multiclass confusion matrix, treating class 0 as the
    positive class. Results are printed; nothing is returned.

    :param train: training DataFrame with "Atributos" features and "Income" label.
    :param test: evaluation DataFrame with the same schema.
    """
    # GBT technique; maxBins=41 presumably sized to the largest categorical
    # cardinality in the feature columns — TODO confirm against the data prep.
    GPT = GBTClassifier(featuresCol="Atributos", labelCol="Income", maxBins=41)
    GPT = GPT.fit(train)
    predictions = GPT.transform(test)
    results = predictions.select("Income", "prediction")

    # FIX: MulticlassMetrics expects an RDD of (prediction, label) pairs, but
    # the selected rows are (label, prediction). The original order transposed
    # the confusion matrix, silently swapping the reported precision and recall.
    predictionAndLabels = results.rdd.map(lambda row: (row[1], row[0]))
    metrics = MulticlassMetrics(predictionAndLabels)
    cm = metrics.confusionMatrix().toArray()

    # Binary metrics with class 0 as positive:
    # rows of cm are true labels, columns are predictions.
    accuracy = (cm[0][0] + cm[1][1]) / cm.sum()
    precision = cm[0][0] / (cm[0][0] + cm[1][0])
    recall = cm[0][0] / (cm[0][0] + cm[0][1])
    f1 = 2 * ((precision * recall) / (precision + recall))

    print("Metricas del modelo GBT Classifier")
    print("accuracy = {0}, precision = {1}, recall = {2}, f1 = {3}".format(
        accuracy, precision, recall, f1))
    return
params = params.build()

# --------------------------------------------------
# Exercise_9

# Import the classes required
from pyspark.ml.classification import DecisionTreeClassifier, GBTClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Fit both a single decision tree and a gradient-boosted ensemble
tree = DecisionTreeClassifier().fit(flights_train)
gbt = GBTClassifier().fit(flights_train)

# Compare AUC on the held-out flights
evaluator = BinaryClassificationEvaluator()
evaluator.evaluate(tree.transform(flights_test))
evaluator.evaluate(gbt.transform(flights_test))

# Inspect ensemble size and relative feature importances
print(gbt.getNumTrees)
print(gbt.featureImportances)

# --------------------------------------------------
# Exercise_10

# Random forest classifier with a hyper-parameter search grid
forest = RandomForestClassifier()

# Build the grid step by step instead of one chained expression
grid = ParamGridBuilder()
grid = grid.addGrid(forest.featureSubsetStrategy, ['all', 'onethird', 'sqrt', 'log2'])
grid = grid.addGrid(forest.maxDepth, [2, 5, 10])
params = grid.build()
# Machine learning: assemble the feature vector, then train/evaluate a GBT classifier.
assemblerAtributos = VectorAssembler(inputCols=[
    "age", "Workclass", "fnlwgt", "Education", "educational_num",
    "Marital_status", "Occupation", "Relationship", "Race", "Gender",
    "capital_gain", "capital_loss", "hours_per_week", "Native_country"
], outputCol="Atributos")
dfModificado = assemblerAtributos.transform(df)
dfModificado = dfModificado.select("Atributos", "Income")

train, test = dfModificado.randomSplit([0.8, 0.2], seed=1)  # 80% train, 20% test

# GBT technique; maxBins=41 presumably covers the largest categorical
# cardinality among the indexed columns — TODO confirm against the encoders.
GPT = GBTClassifier(featuresCol="Atributos", labelCol="Income", maxBins=41)
GPT = GPT.fit(train)
predictions = GPT.transform(test)
results = predictions.select("Income", "prediction")

# FIX: MulticlassMetrics expects an RDD of (prediction, label) pairs, but the
# selected rows are (label, prediction). The original order transposed the
# confusion matrix, silently swapping the reported precision and recall.
predictionAndLabels = results.rdd.map(lambda row: (row[1], row[0]))
metrics = MulticlassMetrics(predictionAndLabels)
cm = metrics.confusionMatrix().toArray()

# Binary metrics with class 0 as positive:
# rows of cm are true labels, columns are predictions.
accuracy = (cm[0][0] + cm[1][1]) / cm.sum()
precision = cm[0][0] / (cm[0][0] + cm[1][0])
recall = cm[0][0] / (cm[0][0] + cm[0][1])
f1 = 2 * ((precision * recall) / (precision + recall))

print("Metricas del modelo GBT Classifier")
print("accuracy = {0}, precision = {1}, recall = {2}, f1 = {3}".format(
    accuracy, precision, recall, f1))

spark.stop()
# Gradient-Boosted Tree classifier on the origin label.
gbt = GBTClassifier(labelCol="origin_idx", maxIter=10)

# Fit on the training split, then score the held-out split.
gbt = gbt.fit(kars_train)
prediction = gbt.transform(kars_test)
prediction.show(9, False)

print("\nforest.featureImportances:", gbt.featureImportances, '\n')

# Confusion matrix: one count per (actual, predicted) pair.
confusion_matrix = prediction.groupBy("origin_idx", "prediction").count()
confusion_matrix.show()

# Overall accuracy on the test split.
evaluator = MulticlassClassificationEvaluator(
    labelCol="origin_idx", metricName="accuracy")
accuracy = evaluator.evaluate(prediction)
print("Test set accuracy = " + str(accuracy))

# Find weighted precision