Example #1
0
    def trainModel(self, infoData):
        """Train a gradient-boosted tree classifier on an 80/20 split.

        Reads the label, feature and prediction column names plus the
        dataset from ``infoData`` (keyed by ``pc`` constants), fits the
        model on the training split, writes the transformed train/test
        frames and the fitted model back into ``infoData``, then stores
        the model and runs evaluation.

        :param infoData: dict carrying dataset and column-name config.
        :return: the updated ``infoData`` after storage and evaluation.
        """
        dataset = infoData.get(pc.DATASET)
        # Temporary 80/20 split; fixed seed keeps it reproducible.
        trainData, testData = dataset.randomSplit([0.80, 0.20], seed=0)
        model = GBTClassifier(
            labelCol=infoData.get(pc.INDEXEDCOLM),
            featuresCol=infoData.get(pc.FEATURESCOLM),
            predictionCol=infoData.get(pc.PREDICTIONCOLM),
        ).fit(trainData)
        scoredTrain = model.transform(trainData)
        scoredTest = model.transform(testData)
        infoData.update({
            pc.TESTDATA: scoredTest,
            pc.TRAINDATA: scoredTrain,
            pc.MODEL: model,
        })
        infoData = self.storeModel(infoData)
        return self.evaluation(infoData)
Example #2
0
def predictions(train, test):
    """Fit a GBT classifier on *train*, score *test* and report binary metrics.

    Both DataFrames must contain an "Atributos" feature-vector column and
    an "Income" label column.

    :param train: training DataFrame.
    :param test: testing DataFrame.
    :return: dict with keys "accuracy", "precision", "recall", "f1".
        (Previously the metrics were computed, printed and discarded;
        returning them makes the function usable programmatically while
        remaining backward compatible with callers that ignore the result.)
    """
    # Gradient-boosted trees; maxBins=41 presumably covers the largest
    # categorical cardinality in the data -- TODO confirm against encoding.
    model = GBTClassifier(featuresCol="Atributos", labelCol="Income", maxBins=41)
    model = model.fit(train)
    scored = model.transform(test)
    results = scored.select("Income", "prediction")
    metrics = MulticlassMetrics(results.rdd)
    # Rows of the confusion matrix are actual labels, columns are
    # predictions; class 0 is treated as the positive class below.
    cm = metrics.confusionMatrix().toArray()
    accuracy = (cm[0][0] + cm[1][1]) / cm.sum()
    precision = cm[0][0] / (cm[0][0] + cm[1][0])
    recall = cm[0][0] / (cm[0][0] + cm[0][1])
    # Guard the degenerate case (precision == recall == 0), which
    # previously raised ZeroDivisionError.
    if precision + recall == 0:
        f1 = 0.0
    else:
        f1 = 2 * ((precision * recall) / (precision + recall))
    print("Metricas del modelo GBT Classifier")
    print("accuracy = {0}, precision = {1}, recall = {2}, f1 = {3}".format(
        accuracy, precision, recall, f1))
    return {"accuracy": accuracy, "precision": precision,
            "recall": recall, "f1": f1}
Example #3
0
# NOTE(review): finalizes a param-grid chain started before this excerpt
# (presumably a ParamGridBuilder) -- TODO confirm against the preceding code.
params = params.build()

--------------------------------------------------
# Exercise_9
# Import the classes required
from pyspark.ml.classification import DecisionTreeClassifier, GBTClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Create model objects and train on training data
# (flights_train / flights_test are prepared earlier in the exercise, not shown here)
tree = DecisionTreeClassifier().fit(flights_train)
gbt = GBTClassifier().fit(flights_train)

# Compare AUC on testing data
# NOTE(review): the evaluator's default metric is used and the returned
# values are discarded here -- presumably shown interactively; confirm.
evaluator = BinaryClassificationEvaluator()
evaluator.evaluate(tree.transform(flights_test))
evaluator.evaluate(gbt.transform(flights_test))

# Find the number of trees and the relative importance of features
# NOTE(review): getNumTrees is printed without parentheses -- assumes it is
# a property on the fitted model in this pyspark version; confirm.
print(gbt.getNumTrees)
print(gbt.featureImportances)

--------------------------------------------------
# Exercise_10
# Create a random forest classifier
# (RandomForestClassifier / ParamGridBuilder are imported earlier in the
# exercise -- not visible in this excerpt)
forest = RandomForestClassifier()

# Create a parameter grid: 4 feature-subset strategies x 3 depths
# = 12 candidate models for the grid search
params = ParamGridBuilder() \
            .addGrid(forest.featureSubsetStrategy, ['all', 'onethird', 'sqrt', 'log2']) \
            .addGrid(forest.maxDepth, [2, 5, 10]) \
            .build()
Example #4
0
# Machine learning: assemble the raw columns into a single feature vector.
feature_columns = [
    "age", "Workclass", "fnlwgt", "Education", "educational_num",
    "Marital_status", "Occupation", "Relationship", "Race", "Gender",
    "capital_gain", "capital_loss", "hours_per_week", "Native_country",
]
assembler = VectorAssembler(inputCols=feature_columns, outputCol="Atributos")
datos = assembler.transform(df).select("Atributos", "Income")
# 80% training / 20% test, fixed seed for reproducibility.
train, test = datos.randomSplit([0.8, 0.2], seed=1)

# Fit a gradient-boosted tree classifier and score the held-out split.
modelo = GBTClassifier(featuresCol="Atributos", labelCol="Income", maxBins=41)
modelo = modelo.fit(train)
pares = modelo.transform(test).select("Income", "prediction").rdd
cm = MulticlassMetrics(pares).confusionMatrix().toArray()

# Binary metrics with class 0 as the positive class
# (confusion-matrix rows = actual label, columns = predicted label).
accuracy = (cm[0][0] + cm[1][1]) / cm.sum()
precision = cm[0][0] / (cm[0][0] + cm[1][0])
recall = cm[0][0] / (cm[0][0] + cm[0][1])
f1 = 2 * ((precision * recall) / (precision + recall))
print("Metricas del modelo GBT Classifier")
print("accuracy = {0}, precision = {1}, recall = {2}, f1 = {3}".format(
    accuracy, precision, recall, f1))

spark.stop()
Example #5
0
# Create and train a Gradient-Boosted Tree classifier predicting origin_idx.
gbt = GBTClassifier(labelCol="origin_idx", maxIter=10)
gbt = gbt.fit(kars_train)

# Score the held-out data and show a sample of the predictions.
prediction = gbt.transform(kars_test)
prediction.show(9, False)

print("\nforest.featureImportances:", gbt.featureImportances, '\n')

# Confusion matrix: one row per (actual, predicted) pair with its count.
confusion_matrix = prediction.groupBy("origin_idx", "prediction").count()
confusion_matrix.show()

# Overall accuracy on the test set.
evaluator = MulticlassClassificationEvaluator(labelCol="origin_idx",
                                              metricName="accuracy")
accuracy = evaluator.evaluate(prediction)
print("Test set accuracy = " + str(accuracy))

# Find weighted precision