Example #1
from pyspark.sql import DataFrame
from pyspark.sql.functions import col, when
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegressionModel


def predict_class(df: DataFrame, vector_assembler: VectorAssembler,
                  lrModel: LogisticRegressionModel):
    """Predict the class of each row in a DataFrame and write the result to CSV.

    Args:
        df (DataFrame): Spark DataFrame to predict on.
        vector_assembler (VectorAssembler): assembler that builds the 'features' column.
        lrModel (LogisticRegressionModel): fitted logistic regression model.
    """
    predict_df = vector_assembler.transform(df).select('features')
    predictions = lrModel.transform(predict_df)
    predictions = predictions.withColumn(
        'class',
        when(col('prediction') == 0, 'Iris-setosa').when(
            col('prediction') == 1,
            'Iris-versicolor').otherwise('Iris-virginica'))
    (predictions.select('class').coalesce(1).write.option(
        "header", "true").format('csv').save('out/out_3_2.txt'))
Example #2
# In[13]:

from pyspark.ml.classification import LogisticRegression
# Evaluate model based on area under the ROC curve
from pyspark.ml.evaluation import BinaryClassificationEvaluator
# Evaluate model based on F1 score
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
# Evaluate model based on confusion matrix
from pyspark.mllib.evaluation import MulticlassMetrics

# Fit the model on the training data; regParam sets the regularisation strength
# (the L1/L2 mix is controlled by elasticNetParam, which defaults to pure L2)
lrModel = LogisticRegression(regParam=0.2).fit(trainData)

# make prediction on test data
pred = lrModel.transform(testData)

pred.select('label', 'prediction').show()

evaluator1 = BinaryClassificationEvaluator(labelCol='label',
                                           metricName="areaUnderROC")
evaluator2 = MulticlassClassificationEvaluator(labelCol='label',
                                               metricName="f1")
metrics = MulticlassMetrics(pred.select('label', 'prediction').rdd.map(tuple))

print('AUC ROC of Logistic Regression model is %f' % evaluator1.evaluate(pred))
print('F1 score of Logistic Regression model is %f' %
      evaluator2.evaluate(pred))
# MulticlassMetrics expects (prediction, label) pairs; the transpose compensates
# for the (label, prediction) order passed in above
metrics.confusionMatrix().toArray().transpose()
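# A small hedged addition: overall accuracy from the same metrics object
# (accuracy only counts pairs where both entries match, so the swapped
# column order above does not affect it).
print('Accuracy of Logistic Regression model is %f' % metrics.accuracy)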

Example #3
def log_reg_train(train_data, test_data):
    classifier = LogisticRegression(featuresCol='features', labelCol='label')
    classifier = classifier.fit(train_data)
    pred = classifier.transform(test_data)
    cm = pred.select("label", "prediction")
    return cm
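# A minimal usage sketch, assuming train_data / test_data are DataFrames that
# already contain 'features' and 'label' columns:
cm = log_reg_train(train_data, test_data)
accuracy = cm.filter(cm.label == cm.prediction).count() / cm.count()
print('Accuracy:', accuracy)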
Example #4
FP = prediction.filter('prediction = 1 AND label != prediction').count()

# Accuracy measures the proportion of correct predictions
accuracy = (TN + TP) / (TN + TP + FN + FP)
print(accuracy)

--------------------------------------------------
# Exercise_8 
# Import the logistic regression class
from pyspark.ml.classification import LogisticRegression

# Create a classifier object and train on training data
logistic = LogisticRegression().fit(flights_train)

# Create predictions for the testing data and show confusion matrix
prediction = logistic.transform(flights_test)
prediction.groupBy('label', 'prediction').count().show()
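# Hedged sketch: one way to pull the confusion-matrix cells used in Exercise_9
# (TP, FP, TN, FN) out of the same predictions DataFrame, assuming a 0/1 label.
TP = prediction.filter('prediction = 1 AND label = 1').count()
FP = prediction.filter('prediction = 1 AND label = 0').count()
TN = prediction.filter('prediction = 0 AND label = 0').count()
FN = prediction.filter('prediction = 0 AND label = 1').count()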

--------------------------------------------------
# Exercise_9 
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator

# Calculate precision and recall
precision = TP / (TP + FP)
recall = TP / (TP + FN)
print('precision = {:.2f}\nrecall    = {:.2f}'.format(precision, recall))

# Find weighted precision
multi_evaluator = MulticlassClassificationEvaluator()
weighted_precision = multi_evaluator.evaluate(prediction, {multi_evaluator.metricName: "weightedPrecision"})
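# Print the weighted precision computed above (formatting mirrors the earlier prints)
print('weighted precision = {:.2f}'.format(weighted_precision))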
Example #5
from pyspark.ml.evaluation import BinaryClassificationEvaluator
predictions = model.evaluate(test_data)

# COMMAND ----------

evaluator = BinaryClassificationEvaluator(rawPredictionCol='prediction',
                                          labelCol='label')
evaluator.evaluate(predictions.predictions)

# COMMAND ----------

# Achieved 96.3% accuracy on the test data; let's predict on the unseen test data for the Kaggle submission.

# COMMAND ----------

results = model.transform(test_data)
results.select('features', 'prediction').show()

# COMMAND ----------

results.count()

# COMMAND ----------

results = model.transform(testData)
results.select('features', 'prediction').show()

# COMMAND ----------

results.count()
Example #6
train_1, train_2, train_3 = train_data.randomSplit([0.33, 0.33, 0.34],
                                                   seed=1234)
'''
1) Training 1 individual model with all data
'''

print('**********')
print('For 1 Model')
print('**********')

#Training the Logistic Regression Model
classifier = LogisticRegression(featuresCol='features', labelCol='label')
classifier = classifier.fit(train_data)

#Making predictions
pred = classifier.transform(test_data)
print('Predictions:')
pred.show(10)

#Model Accuracy
cm = pred.select("label", "prediction")
cm.show()

acc = cm.filter(cm.label == cm.prediction).count() / cm.count()
print(f'Accuracy for one model: {acc * 100:.2f}%')
'''
2) Training 3 individual models parallelly with 1/3 data each
'''

print('**********')
print('For 3 models trained separately used as an Ensemble')
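print('**********')

# Hedged sketch (the original snippet breaks off here): train one logistic
# regression per third of the data and majority-vote the three predictions.
# It assumes a binary 0/1 label; variable names below are illustrative.
from pyspark.sql import functions as F

test_with_id = test_data.withColumn('row_id', F.monotonically_increasing_id()).cache()
ensemble = test_with_id.select('row_id', 'label')
for i, part in enumerate([train_1, train_2, train_3]):
    part_model = LogisticRegression(featuresCol='features', labelCol='label').fit(part)
    part_pred = part_model.transform(test_with_id).select(
        'row_id', F.col('prediction').alias(f'pred_{i}'))
    ensemble = ensemble.join(part_pred, on='row_id')

# Majority vote: predict 1 when at least two of the three models do
ensemble = ensemble.withColumn(
    'prediction',
    F.when(F.col('pred_0') + F.col('pred_1') + F.col('pred_2') >= 2, 1.0).otherwise(0.0))

acc_ensemble = ensemble.filter(ensemble.label == ensemble.prediction).count() / ensemble.count()
print(f'Accuracy for the 3-model ensemble: {acc_ensemble * 100:.2f}%')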
# Convert hashed symbols to TF-IDF
idf = IDF(inputCol="hash", outputCol="features")
sms = idf.fit(hashed).transform(hashed)

# View the first four records
sms.show(4, truncate=False)

# Split the data into training and testing sets
sms_train, sms_test = sms.randomSplit([0.8, 0.2], seed=13)

# Fit a Logistic Regression model to the training data
logistic = LogisticRegression(regParam=0.2)
logistic = logistic.fit(sms_train)

# Make predictions on the testing data
prediction = logistic.transform(sms_test)

# Create a confusion matrix, comparing predictions to known labels
prediction.groupBy("label", 'prediction').count().show()

# Find weighted precision
multi_evaluator = MulticlassClassificationEvaluator()
accuracy = multi_evaluator.evaluate(prediction,
                                    {multi_evaluator.metricName: "accuracy"})
weighted_precision = multi_evaluator.evaluate(
    prediction, {multi_evaluator.metricName: "weightedPrecision"})
weighted_recall = multi_evaluator.evaluate(
    prediction, {multi_evaluator.metricName: "weightedRecall"})

# Find AUC
binary_evaluator = BinaryClassificationEvaluator()
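# Hedged sketch: evaluate the AUC with the evaluator defined above (assumes the
# prediction DataFrame still carries the default 'rawPrediction' and 'label' columns).
auc = binary_evaluator.evaluate(prediction,
                                {binary_evaluator.metricName: "areaUnderROC"})
print('AUC = {:.3f}'.format(auc))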
Example #8
selectedcols = ["label", "features", "hours_per_week", "income"]
dataset = preppedDataDF.select(selectedcols)

# COMMAND ----------

(trainingData, testData) = dataset.randomSplit([0.7, 0.3], seed=122423)

lrModel = LogisticRegression().fit(trainingData)

print("Coefficients: " + str(lrModel.coefficients))
print("Intercept: " + str(lrModel.intercept))

# COMMAND ----------

# Determine the point in "hours_per_week" where the model switches from predicting income "<=50K" to income ">50K"
predictions = lrModel.transform(testData)

selected = predictions.select("income", "label", "prediction", "probability", "hours_per_week").filter("hours_per_week > 65 and hours_per_week < 69")
display(selected)

# COMMAND ----------

# Evaluate. Note that BinaryClassificationEvaluator supports only two metrics out of the box: areaUnderROC and areaUnderPR.
bce = BinaryClassificationEvaluator(rawPredictionCol='rawPrediction')
au_roc = bce.setMetricName('areaUnderROC').evaluate(predictions)
au_prc = bce.setMetricName('areaUnderPR').evaluate(predictions)

truePositive = predictions.select("label").filter("label = 1 and prediction = 1").count()
falsePositive = predictions.select("label").filter("label = 0 and prediction = 1").count()
trueNegative = predictions.select("label").filter("label = 0 and prediction = 0").count()
falseNegative = predictions.select("label").filter("label = 1 and prediction = 0").count()
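# Hedged addition: headline metrics derived from the four counts above
# (no guard against zero denominators in this sketch).
accuracy = (truePositive + trueNegative) / float(predictions.count())
precision = truePositive / float(truePositive + falsePositive)
recall = truePositive / float(truePositive + falseNegative)
print("accuracy: %.3f  precision: %.3f  recall: %.3f" % (accuracy, precision, recall))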
Example #9
pipeline = pipeline.fit(df)

df = pipeline.transform(df)

# Model

lr = LogisticRegression(featuresCol="features",
                        labelCol=TARGET,
                        predictionCol="predictions",
                        maxIter=10,
                        regParam=0.0,
                        elasticNetParam=0.0,
                        threshold=0.5)

lr = lr.fit(df)
df = lr.transform(df)

summary = lr.summary

print("Labels")
print(summary.labels)

print("Accuracy")
print(summary.accuracy)

print("Precision by Label")
print(summary.precisionByLabel)

print("Recall by Label")
print(summary.recallByLabel)
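# A small hedged extension: the training summary also exposes weighted aggregates
# (LogisticRegressionSummary, Spark 2.3+).
print("Weighted Precision")
print(summary.weightedPrecision)

print("Weighted Recall")
print(summary.weightedRecall)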
Example #10
# Reference https://chrisalbon.com/machine_learning/trees_and_forests/random_forest_classifier_example/

import time
from pyspark.ml.classification import LogisticRegression, DecisionTreeClassifier, RandomForestClassifier, MultilayerPerceptronClassifier, LinearSVC, OneVsRest, NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark import SparkConf, SparkContext, SQLContext

conf = SparkConf().setMaster("local[*]")
sc = SparkContext(conf=conf)
spark = SQLContext(sc)

data = spark.read.format("libsvm").load(
    r"D:\Outils\Spark\data\mllib\iris_libsvm.txt")  # raw string avoids backslash escapes
(train, test) = data.randomSplit([0.8, 0.2])

model = LogisticRegression()
# model = DecisionTreeClassifier()
# model = RandomForestClassifier()
# model = MultilayerPerceptronClassifier(layers=[4, 3])
# model = OneVsRest(classifier=LinearSVC())
# model = NaiveBayes()

model = model.fit(train)
predictions = model.transform(test)

evaluator = MulticlassClassificationEvaluator(metricName="accuracy")

score = evaluator.evaluate(predictions)
print('Accuracy: ', score)
Example #11
# Estimator: takes a DataFrame in and produces a Transformer

sIndexer_02 = StringIndexer(inputCol="label", outputCol="indexed02")
si_model_02 = sIndexer_02.fit(train_data)
(trainingData02, testData02) = train_data.randomSplit([0.7, 0.3])
td_02 = si_model_02.transform(trainingData02)

# Naive Bayes: feature values cannot be negative
from pyspark.ml.classification import NaiveBayes
nb = NaiveBayes(smoothing=1.0, modelType="multinomial")

#LR
from pyspark.ml.classification import LogisticRegression, GBTClassifier, RandomForestClassifier
model_LR = LogisticRegression(maxIter=5, regParam=0.01)
model_LR = model_LR.fit(train_data)
predict_lr_testData = model_LR.transform(testData)


# Compute accuracy
def computeAcc(data):
    err = data.filter(data['label'] != data['prediction']).count()
    total = data.count()
    acc = 1.0 - float(err) / total  # accuracy = 1 - error rate
    print(err, total, acc)
    return acc


#GBT
gbt = GBTClassifier(maxIter=5, maxDepth=2, labelCol="indexed02")
model_gbt = gbt.fit(td_02)
predict_gbt_testData = model_gbt.transform(testData02)
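# Hedged usage sketch: apply the accuracy helper above to the LR predictions.
# (For the GBT predictions the label column is 'indexed02', so the helper would
# need that column name instead of 'label'.)
computeAcc(predict_lr_testData)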
test_df.show(10)

print("Training Dataset Count: " + str(train_df.count()))
print("Test Dataset Count: " + str(test_df.count()))

##################################################
# MODELING
##################################################

##################################################
# Logistic Regression
##################################################

log_model = LogisticRegression(featuresCol='features',
                               labelCol='label').fit(train_df)
y_pred = log_model.transform(test_df)
y_pred.show()

y_pred.select("label", "prediction").show()
y_pred.filter(y_pred.label == y_pred.prediction).count() / y_pred.count()

evaluator = BinaryClassificationEvaluator(labelCol="label",
                                          rawPredictionCol="prediction",
                                          metricName='areaUnderROC')

evaluatorMulti = MulticlassClassificationEvaluator(labelCol="label",
                                                   predictionCol="prediction")

acc = evaluatorMulti.evaluate(y_pred, {evaluatorMulti.metricName: "accuracy"})
precision = evaluatorMulti.evaluate(
    y_pred, {evaluatorMulti.metricName: "precisionByLabel"})
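# Hedged sketch: the binary evaluator defined above is unused in the original
# snippet; evaluating it gives the area under the ROC curve for these predictions.
roc_auc = evaluator.evaluate(y_pred)
print(f"accuracy: {acc}, precision: {precision}, roc_auc: {roc_auc}")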
Example #13
spark = SparkSession \
    .builder \
    .appName("Python Spark SQL basic example") \
    .getOrCreate()

fileStore = sys.argv[1]

df = spark.read.format("csv")\
          .options(inferSchema=True, header=True)\
          .load(fileStore)
valids = [v for v in df.columns if v not in remove]
df = df.select(valids)

#printc("%s" % df.dtypes)
inputs, df = vectorizeData(df=df, labelsCol=LEAVE)
train, test = df.randomSplit([0.7, 0.3], seed=12345)

# Train Logistic Regression
lr = LogisticRegression(regParam=0.01)
lr = lr.fit(train)
# Make predictions.
predictions = lr.transform(test)
evaluator = Evaluator()
# Select example rows to display.
#predictions.select("prediction", "label", "features").show()
# Evaluate the learned model
print("Pensiones random deads Test %s: %f" %
      (evaluator.getMetricName(), evaluator.evaluate(predictions)))
# Print important features
get_feature_importances(model=lr, featureNames=inputs, out_csv=out_csv)
Example #14
# MAGIC %md
# MAGIC #### Step 4: Fit our model by using the training data
# MAGIC When we call the `.fit()` function, the pipeline stages are executed on the data in that dataset.

# COMMAND ----------

lrModel = LogisticRegression().fit(trainingData)

# COMMAND ----------

# MAGIC %md
# MAGIC #### Step 5: Run our test data through the fit model, and view the predicted results for model evaluation

# COMMAND ----------

predictionsDF = (lrModel.transform(testData)).select("income", "label",
                                                     "prediction",
                                                     "probability")

# COMMAND ----------

predictionsDF.registerTempTable("incomePredictionsOutputDF")

# COMMAND ----------

# MAGIC %sql
# MAGIC
# MAGIC SELECT
# MAGIC   *
# MAGIC FROM incomePredictionsOutputDF
    stages_lr = stages.copy()

    inputCols = ['norm_cols'] + [
        cname + "classVec"
        for cname in categorical_cols if cname != 'native_country'
    ]
    final_assembler = VectorAssembler(inputCols=inputCols,
                                      outputCol='features')
    stages_lr += [final_assembler]

    pipeline = Pipeline(stages=stages_lr)
    fitted_pipeline = pipeline.fit(train)
    train_lr = fitted_pipeline.transform(train)
    test_lr = fitted_pipeline.transform(test)
    lr = LogisticRegression(featuresCol='features',
                            labelCol='label').fit(train_lr)
    res_lr = lr.transform(test_lr)

    #----------------- Decision and Random Forest -----------------

    # Final assembly
    inputCols = ['norm_cols'] + [cname + "classVec" for cname in categorical_cols]
    final_assembler = VectorAssembler(inputCols=inputCols,
                                      outputCol='features')
    stages += [final_assembler]

    pipeline = Pipeline(stages=stages)
    fitted_pipeline = pipeline.fit(train)
    train_final = fitted_pipeline.transform(train)
    test_final = fitted_pipeline.transform(test)

    dt = DecisionTreeClassifier(featuresCol='features',