Example #1
from pyspark.sql import DataFrame
from pyspark.sql.functions import col, when
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegressionModel


def predict_class(df: DataFrame, vector_assembler: VectorAssembler,
                  lrModel: LogisticRegressionModel):
    """Predict the class of each row in a DataFrame and write the result to CSV.

    Args:
        df (DataFrame): Spark DataFrame to predict on.
        vector_assembler (VectorAssembler): assembler that builds the 'features' column.
        lrModel (LogisticRegressionModel): fitted logistic regression model.
    """
    predict_df = vector_assembler.transform(df).select('features')
    predictions = lrModel.transform(predict_df)
    predictions = predictions.withColumn(
        'class',
        when(col('prediction') == 0, 'Iris-setosa').when(
            col('prediction') == 1,
            'Iris-versicolor').otherwise('Iris-virginica'))
    (predictions.select('class').coalesce(1).write.option(
        "header", "true").format('csv').save('out/out_3_2.txt'))
Example #2
# In[13]:

from pyspark.ml.classification import LogisticRegression
# Evaluate model based on area under the ROC curve
from pyspark.ml.evaluation import BinaryClassificationEvaluator
# Evaluate model based on F1 score
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
# Evaluate model based on confusion matrix
from pyspark.mllib.evaluation import MulticlassMetrics

# Fit the model on the training data; regParam sets the regularisation strength
# (the L1/L2 mix is controlled by elasticNetParam, which defaults to pure L2)
lrModel = LogisticRegression(regParam=0.2).fit(trainData)

# make prediction on test data
pred = lrModel.transform(testData)

pred.select('label', 'prediction').show()

evaluator1 = BinaryClassificationEvaluator(labelCol='label',
                                           metricName="areaUnderROC")
evaluator2 = MulticlassClassificationEvaluator(labelCol='label',
                                               metricName="f1")
metrics = MulticlassMetrics(pred.select('label', 'prediction').rdd.map(tuple))

print('AUC ROC of Logistic Regression model is %f' % evaluator1.evaluate(pred))
print('F1 score of Logistic Regression model is %f' %
      evaluator2.evaluate(pred))
# MulticlassMetrics expects (prediction, label) pairs; the transpose compensates
# for the (label, prediction) order passed in above
metrics.confusionMatrix().toArray().transpose()
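# A small hedged addition: overall accuracy from the same metrics object
# (accuracy only counts pairs where both entries match, so the swapped
# column order above does not affect it).
print('Accuracy of Logistic Regression model is %f' % metrics.accuracy)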

Example #3
def log_reg_train(train_data, test_data):
    classifier = LogisticRegression(featuresCol='features', labelCol='label')
    classifier = classifier.fit(train_data)
    pred = classifier.transform(test_data)
    cm = pred.select("label", "prediction")
    return cm
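# A minimal usage sketch, assuming train_data / test_data are DataFrames that
# already contain 'features' and 'label' columns:
cm = log_reg_train(train_data, test_data)
accuracy = cm.filter(cm.label == cm.prediction).count() / cm.count()
print('Accuracy:', accuracy)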
Example #4
FP = prediction.filter('prediction = 1 AND label != prediction').count()

# Accuracy measures the proportion of correct predictions
accuracy = (TN + TP) / (TN + TP + FN + FP)
print(accuracy)

--------------------------------------------------
# Exercise_8 
# Import the logistic regression class
from pyspark.ml.classification import LogisticRegression

# Create a classifier object and train on training data
logistic = LogisticRegression().fit(flights_train)

# Create predictions for the testing data and show confusion matrix
prediction = logistic.transform(flights_test)
prediction.groupBy('label', 'prediction').count().show()
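# Hedged sketch: one way to pull the confusion-matrix cells used in Exercise_9
# (TP, FP, TN, FN) out of the same predictions DataFrame, assuming a 0/1 label.
TP = prediction.filter('prediction = 1 AND label = 1').count()
FP = prediction.filter('prediction = 1 AND label = 0').count()
TN = prediction.filter('prediction = 0 AND label = 0').count()
FN = prediction.filter('prediction = 0 AND label = 1').count()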

--------------------------------------------------
# Exercise_9 
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator

# Calculate precision and recall
precision = TP / (TP + FP)
recall = TP / (TP + FN)
print('precision = {:.2f}\nrecall    = {:.2f}'.format(precision, recall))

# Find weighted precision
multi_evaluator = MulticlassClassificationEvaluator()
weighted_precision = multi_evaluator.evaluate(prediction, {multi_evaluator.metricName: "weightedPrecision"})
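# Print the weighted precision computed above (formatting mirrors the earlier prints)
print('weighted precision = {:.2f}'.format(weighted_precision))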
Example #5
from pyspark.ml.evaluation import BinaryClassificationEvaluator
predictions = model.evaluate(test_data)

# COMMAND ----------

evaluator = BinaryClassificationEvaluator(rawPredictionCol='prediction',
                                          labelCol='label')
evaluator.evaluate(predictions.predictions)

# COMMAND ----------

# Achieved 96.3% accuracy on the test data; let's predict on the unseen test data for the Kaggle submission.

# COMMAND ----------

results = model.transform(test_data)
results.select('features', 'prediction').show()

# COMMAND ----------

results.count()

# COMMAND ----------

results = model.transform(testData)
results.select('features', 'prediction').show()

# COMMAND ----------

results.count()
Example #6
train_1, train_2, train_3 = train_data.randomSplit([0.33, 0.33, 0.34],
                                                   seed=1234)
'''
1) Training 1 individual model with all data
'''

print('**********')
print('For 1 Model')
print('**********')

#Training the Logistic Regression Model
classifier = LogisticRegression(featuresCol='features', labelCol='label')
classifier = classifier.fit(train_data)

#Making predictions
pred = classifier.transform(test_data)
print('Predictions:')
pred.show(10)

#Model Accuracy
cm = pred.select("label", "prediction")
cm.show()

acc = cm.filter(cm.label == cm.prediction).count() / cm.count()
print(f'Accuracy for one model: {acc * 100:.2f}%')
'''
2) Training 3 individual models parallelly with 1/3 data each
'''

print('**********')
print('For 3 models trained separately used as an Ensemble')
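print('**********')

# Hedged sketch (the original snippet breaks off here): train one logistic
# regression per third of the data and majority-vote the three predictions.
# It assumes a binary 0/1 label; variable names below are illustrative.
from pyspark.sql import functions as F

test_with_id = test_data.withColumn('row_id', F.monotonically_increasing_id()).cache()
ensemble = test_with_id.select('row_id', 'label')
for i, part in enumerate([train_1, train_2, train_3]):
    part_model = LogisticRegression(featuresCol='features', labelCol='label').fit(part)
    part_pred = part_model.transform(test_with_id).select(
        'row_id', F.col('prediction').alias(f'pred_{i}'))
    ensemble = ensemble.join(part_pred, on='row_id')

# Majority vote: predict 1 when at least two of the three models do
ensemble = ensemble.withColumn(
    'prediction',
    F.when(F.col('pred_0') + F.col('pred_1') + F.col('pred_2') >= 2, 1.0).otherwise(0.0))

acc_ensemble = ensemble.filter(ensemble.label == ensemble.prediction).count() / ensemble.count()
print(f'Accuracy for the 3-model ensemble: {acc_ensemble * 100:.2f}%')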
# Convert hashed symbols to TF-IDF
idf = IDF(inputCol="hash", outputCol="features")
sms = idf.fit(hashed).transform(hashed)

# View the first four records
sms.show(4, truncate=False)

# Split the data into training and testing sets
sms_train, sms_test = sms.randomSplit([0.8, 0.2], seed=13)

# Fit a Logistic Regression model to the training data
logistic = LogisticRegression(regParam=0.2)
logistic = logistic.fit(sms_train)

# Make predictions on the testing data
prediction = logistic.transform(sms_test)

# Create a confusion matrix, comparing predictions to known labels
prediction.groupBy("label", 'prediction').count().show()

# Find weighted precision
multi_evaluator = MulticlassClassificationEvaluator()
accuracy = multi_evaluator.evaluate(prediction,
                                    {multi_evaluator.metricName: "accuracy"})
weighted_precision = multi_evaluator.evaluate(
    prediction, {multi_evaluator.metricName: "weightedPrecision"})
weighted_recall = multi_evaluator.evaluate(
    prediction, {multi_evaluator.metricName: "weightedRecall"})

# Find AUC
binary_evaluator = BinaryClassificationEvaluator()
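# Hedged sketch: evaluate the AUC with the evaluator defined above (assumes the
# prediction DataFrame still carries the default 'rawPrediction' and 'label' columns).
auc = binary_evaluator.evaluate(prediction,
                                {binary_evaluator.metricName: "areaUnderROC"})
print('AUC = {:.3f}'.format(auc))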
Example #8
selectedcols = ["label", "features", "hours_per_week", "income"]
dataset = preppedDataDF.select(selectedcols)

# COMMAND ----------

(trainingData, testData) = dataset.randomSplit([0.7, 0.3], seed=122423)

lrModel = LogisticRegression().fit(trainingData)

print("Coefficients: " + str(lrModel.coefficients))
print("Intercept: " + str(lrModel.intercept))

# COMMAND ----------

# Determine the point in "hours_per_week" where the model switches from predicting income "<=50K" to income ">50K"
predictions = lrModel.transform(testData)

selected = predictions.select("income", "label", "prediction", "probability", "hours_per_week").filter("hours_per_week > 65 and hours_per_week < 69")
display(selected)

# COMMAND ----------

# Evaluate. Note that BinaryClassificationEvaluator supports only two metrics out of the box: areaUnderROC and areaUnderPR.
bce = BinaryClassificationEvaluator(rawPredictionCol='rawPrediction')
au_roc = bce.setMetricName('areaUnderROC').evaluate(predictions)
au_prc = bce.setMetricName('areaUnderPR').evaluate(predictions)

truePositive = predictions.select("label").filter("label = 1 and prediction = 1").count()
falsePositive = predictions.select("label").filter("label = 0 and prediction = 1").count()
trueNegative = predictions.select("label").filter("label = 0 and prediction = 0").count()
falseNegative = predictions.select("label").filter("label = 1 and prediction = 0").count()
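# Hedged addition: headline metrics derived from the four counts above
# (no guard against zero denominators in this sketch).
accuracy = (truePositive + trueNegative) / float(predictions.count())
precision = truePositive / float(truePositive + falsePositive)
recall = truePositive / float(truePositive + falseNegative)
print("accuracy: %.3f  precision: %.3f  recall: %.3f" % (accuracy, precision, recall))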
Example #9
pipeline = pipeline.fit(df)

df = pipeline.transform(df)

# Model

lr = LogisticRegression(featuresCol="features",
                        labelCol=TARGET,
                        predictionCol="predictions",
                        maxIter=10,
                        regParam=0.0,
                        elasticNetParam=0.0,
                        threshold=0.5)

lr = lr.fit(df)
df = lr.transform(df)

summary = lr.summary

print("Labels")
print(summary.labels)

print("Accuracy")
print(summary.accuracy)

print("Precision by Label")
print(summary.precisionByLabel)

print("Recall by Label")
print(summary.recallByLabel)
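# A small hedged extension: the training summary also exposes weighted aggregates
# (LogisticRegressionSummary, Spark 2.3+).
print("Weighted Precision")
print(summary.weightedPrecision)

print("Weighted Recall")
print(summary.weightedRecall)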
Example #10
# Reference https://chrisalbon.com/machine_learning/trees_and_forests/random_forest_classifier_example/

import time
from pyspark.ml.classification import LogisticRegression, DecisionTreeClassifier, RandomForestClassifier, MultilayerPerceptronClassifier, LinearSVC, OneVsRest, NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark import SparkConf, SparkContext, SQLContext

conf = SparkConf().setMaster("local[*]")
sc = SparkContext(conf=conf)
spark = SQLContext(sc)

data = spark.read.format("libsvm").load(
    r"D:\Outils\Spark\data\mllib\iris_libsvm.txt")  # raw string avoids backslash escapes
(train, test) = data.randomSplit([0.8, 0.2])

model = LogisticRegression()
# model = DecisionTreeClassifier()
# model = RandomForestClassifier()
# model = MultilayerPerceptronClassifier(layers=[4, 3])
# model = OneVsRest(classifier=LinearSVC())
# model = NaiveBayes()

model = model.fit(train)
predictions = model.transform(test)

evaluator = MulticlassClassificationEvaluator(metricName="accuracy")

score = evaluator.evaluate(predictions)
print('Accuracy: ', score)
Example #11
# Estimator: takes a DataFrame in and produces a Transformer

sIndexer_02 = StringIndexer(inputCol="label", outputCol="indexed02")
si_model_02 = sIndexer_02.fit(train_data)
(trainingData02, testData02) = train_data.randomSplit([0.7, 0.3])
td_02 = si_model_02.transform(trainingData02)

# Naive Bayes: feature values cannot be negative
from pyspark.ml.classification import NaiveBayes
nb = NaiveBayes(smoothing=1.0, modelType="multinomial")

#LR
from pyspark.ml.classification import LogisticRegression, GBTClassifier, RandomForestClassifier
model_LR = LogisticRegression(maxIter=5, regParam=0.01)
model_LR = model_LR.fit(train_data)
predict_lr_testData = model_LR.transform(testData)


# Compute accuracy
def computeAcc(data):
    err = data.filter(data['label'] != data['prediction']).count()
    total = data.count()
    acc = 1.0 - float(err) / total  # accuracy = 1 - error rate
    print(err, total, acc)
    return acc


#GBT
gbt = GBTClassifier(maxIter=5, maxDepth=2, labelCol="indexed02")
model_gbt = gbt.fit(td_02)
predict_gbt_testData = model_gbt.transform(testData02)
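# Hedged usage sketch: apply the accuracy helper above to the LR predictions.
# (For the GBT predictions the label column is 'indexed02', so the helper would
# need that column name instead of 'label'.)
computeAcc(predict_lr_testData)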
test_df.show(10)

print("Training Dataset Count: " + str(train_df.count()))
print("Test Dataset Count: " + str(test_df.count()))

##################################################
# MODELING
##################################################

##################################################
# Logistic Regression
##################################################

log_model = LogisticRegression(featuresCol='features',
                               labelCol='label').fit(train_df)
y_pred = log_model.transform(test_df)
y_pred.show()

y_pred.select("label", "prediction").show()
y_pred.filter(y_pred.label == y_pred.prediction).count() / y_pred.count()

evaluator = BinaryClassificationEvaluator(labelCol="label",
                                          rawPredictionCol="prediction",
                                          metricName='areaUnderROC')

evaluatorMulti = MulticlassClassificationEvaluator(labelCol="label",
                                                   predictionCol="prediction")

acc = evaluatorMulti.evaluate(y_pred, {evaluatorMulti.metricName: "accuracy"})
precision = evaluatorMulti.evaluate(
    y_pred, {evaluatorMulti.metricName: "precisionByLabel"})
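# Hedged sketch: the binary evaluator defined above is unused in the original
# snippet; evaluating it gives the area under the ROC curve for these predictions.
roc_auc = evaluator.evaluate(y_pred)
print(f"accuracy: {acc}, precision: {precision}, roc_auc: {roc_auc}")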
Example #13
spark = SparkSession \
    .builder \
    .appName("Python Spark SQL basic example") \
    .getOrCreate()

fileStore = sys.argv[1]

df = spark.read.format("csv")\
          .options(inferSchema=True, header=True)\
          .load(fileStore)
valids = [v for v in df.columns if v not in remove]
df = df.select(valids)

#printc("%s" % df.dtypes)
inputs, df = vectorizeData(df=df, labelsCol=LEAVE)
train, test = df.randomSplit([0.7, 0.3], seed=12345)

# Train Logistic Regression
lr = LogisticRegression(regParam=0.01)
lr = lr.fit(train)
# Make predictions.
predictions = lr.transform(test)
evaluator = Evaluator()
# Select example rows to display.
#predictions.select("prediction", "label", "features").show()
# Evaluate the learned model
print("Pensiones random deads Test %s: %f" %
      (evaluator.getMetricName(), evaluator.evaluate(predictions)))
# Print important features
get_feature_importances(model=lr, featureNames=inputs, out_csv=out_csv)
Example #14
# MAGIC %md
# MAGIC #### Step 4: Fit our model by using the training data
# MAGIC When we call the `.fit()` function, the pipeline stages are executed on the data in that dataset.

# COMMAND ----------

lrModel = LogisticRegression().fit(trainingData)

# COMMAND ----------

# MAGIC %md
# MAGIC #### Step 5: Run our test data through the fit model, and view the predicted results for model evaluation

# COMMAND ----------

predictionsDF = (lrModel.transform(testData)).select("income", "label",
                                                     "prediction",
                                                     "probability")

# COMMAND ----------

predictionsDF.registerTempTable("incomePredictionsOutputDF")

# COMMAND ----------

# MAGIC %sql
# MAGIC
# MAGIC SELECT
# MAGIC   *
# MAGIC FROM incomePredictionsOutputDF
    stages_lr = stages.copy()

    inputCols = ['norm_cols'] + [
        cname + "classVec"
        for cname in categorical_cols if cname != 'native_country'
    ]
    final_assembler = VectorAssembler(inputCols=inputCols,
                                      outputCol='features')
    stages_lr += [final_assembler]

    pipeline = Pipeline(stages=stages_lr)
    fitted_pipeline = pipeline.fit(train)
    train_lr = fitted_pipeline.transform(train)
    test_lr = fitted_pipeline.transform(test)
    lr = LogisticRegression(featuresCol='features',
                            labelCol='label').fit(train_lr)
    res_lr = lr.transform(test_lr)

    #----------------- Decision and Random Forest -----------------

    # Final assembly
    inputCols = ['norm_cols'] + [cname + "classVec" for cname in categorical_cols]
    final_assembler = VectorAssembler(inputCols=inputCols,
                                      outputCol='features')
    stages += [final_assembler]

    pipeline = Pipeline(stages=stages)
    fitted_pipeline = pipeline.fit(train)
    train_final = fitted_pipeline.transform(train)
    test_final = fitted_pipeline.transform(test)

    dt = DecisionTreeClassifier(featuresCol='features',