Пример #1
0
from pyspark.ml.classification import LogisticRegression

# 75/25 train / hold-out split of the modelling frame.
training_df, test_df = model_df.randomSplit([0.75, 0.25])
training_df.count()
training_df.groupBy('Status').count().show()
test_df.count()
test_df.groupBy('Status').count().show()

# Fit a binary logistic-regression classifier; 'Status' is the label column.
log_reg = LogisticRegression(labelCol='Status').fit(training_df)

# Predictions on the training split.
train_results = log_reg.evaluate(training_df).predictions

# Rows where the true label is 1 and the model also predicted 1.
train_true_positives = train_results.filter(
    (train_results['Status'] == 1) & (train_results['prediction'] == 1))
train_true_positives.select(['Status', 'prediction', 'probability']).show(
    10, False)

# MAGIC %md Probability at index 0 is for class 0 and probability at index 1 is for class 1
correct_preds = train_true_positives.count()
training_df.filter(training_df['Status'] == 1).count()

# Fraction of actual positives the model recovered on the training set
# (true positives / actual positives).
float(correct_preds) / (training_df.filter(training_df['Status'] == 1).count())

# Hold-out set predictions.
results = log_reg.evaluate(test_df).predictions
results.select(['Status', 'prediction']).show(10, False)
results.printSchema()

#confusion matrix
Пример #2
0
# DBTITLE 1,Train Test Split of data
# 75/25 split with a fixed seed so the split is reproducible.
train_data, test_data = data.randomSplit([0.75, 0.25], seed=12345)

# COMMAND ----------

# DBTITLE 1,Modelling
from pyspark.ml.classification import LogisticRegression

# Regularised logistic regression; 'label' holds the binary target.
model = LogisticRegression(labelCol='label', maxIter=5, regParam=0.001)
model = model.fit(train_data)
summary = model.summary
summary.predictions.describe().show()

# COMMAND ----------

from pyspark.ml.evaluation import BinaryClassificationEvaluator
predictions = model.evaluate(test_data)

# COMMAND ----------

# FIX: the evaluator must score the continuous 'rawPrediction' column
# produced by the model (or 'probability'), not the thresholded 0/1
# 'prediction' column — hard predictions give a degenerate, misleading
# ROC curve and areaUnderROC value.
evaluator = BinaryClassificationEvaluator(rawPredictionCol='rawPrediction',
                                          labelCol='label')
evaluator.evaluate(predictions.predictions)

# COMMAND ----------

# Achieved ~96% accuracy on the held-out data; next, score data for the
# Kaggle submission.

# COMMAND ----------

# NOTE(review): this transforms the same test_data used above, not a
# separate unseen Kaggle set — confirm which frame should be scored here.
results = model.transform(test_data)
results.select('features', 'prediction').show()
Пример #3
0
model_df = df.select(['features', 'Status'])

## 3.2 Run logistic regression

from pyspark.ml.classification import LogisticRegression  # Logistic regression; supports both multinomial (softmax) and binomial models

training_df, test_df = model_df.randomSplit([0.75, 0.25
                                             ])  # Split the data: 75% for training, 25% for validation/testing

training_df.groupBy('Status').count().show()  # Inspect the class balance of each split
test_df.groupBy('Status').count().show()

log_reg = LogisticRegression(labelCol='Status').fit(
    training_df)  # Returns a fitted LogisticRegressionModel

train_results = log_reg.evaluate(
    training_df
).predictions  # Evaluate the model on the training set; evaluate() returns a BinaryLogisticRegressionSummary for this model

train_results.filter(train_results['Status'] == 1).filter(
    train_results['prediction'] == 1).select(
        ['Status', 'prediction', 'probability']).show(10, False)

print('{}{}'.format('预测准确率:',
                    log_reg.evaluate(training_df).accuracy))  # Print training-set accuracy (the label text means "prediction accuracy")

test_results = log_reg.evaluate(test_df).predictions  # Score the held-out test data with the fitted model
test_results.filter(test_results['Status'] == 1).filter(
    test_results['prediction'] == 1).select(
        ['Status', 'prediction', 'probability']).show(10, False)
from pyspark.ml.feature import VectorAssembler

# Merge the text feature vector and the token count into one input column.
df_assembler = VectorAssembler(inputCols=['features', 'token_count'],
                               outputCol='features_vec')
model_text_df = df_assembler.transform(model_text_df)
model_text_df.printSchema()

from pyspark.ml.classification import LogisticRegression

# 75/25 train / test split; show class balance of each side.
training_df, test_df = model_text_df.randomSplit([0.75, 0.25])
training_df.groupBy('Label').count().show()

test_df.groupBy('Label').count().show()

# Fit the classifier on the assembled feature vector.
log_reg = LogisticRegression(featuresCol='features_vec',
                             labelCol='Label').fit(training_df)
results = log_reg.evaluate(test_df).predictions
results.show()

from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Confusion-matrix cell counts over the test predictions.
true_postives = results.filter((results.Label == 1)
                               & (results.prediction == 1)).count()
true_negatives = results.filter((results.Label == 0)
                                & (results.prediction == 0)).count()
false_positives = results.filter((results.Label == 0)
                                 & (results.prediction == 1)).count()
false_negatives = results.filter((results.Label == 1)
                                 & (results.prediction == 0)).count()

# Recall = TP / (TP + FN) on the positive class.
recall = float(true_postives) / (true_postives + false_negatives)
Пример #5
0
# Term-frequency vectors for both splits (tf_vector was fitted upstream).
train_tf_vec = tf_vector.transform(train_df)
test_tf_vec = tf_vector.transform(test_df)

tfidf_vector = IDF(inputCol='tf_vector', outputCol='tfidf_vector')

# FIX: fit the IDF weights on the TRAINING data only and reuse that
# fitted IDFModel for the test split.  The previous code re-fitted IDF
# on the test set, which leaks test-set statistics and puts train and
# test features on two different, incomparable scales.
idf_model = tfidf_vector.fit(train_tf_vec)
train_tfidf_vec = idf_model.transform(train_tf_vec)
test_tfidf_vec = idf_model.transform(test_tf_vec)

# Assemble the TF-IDF vector and the token count into one feature column.
assembler = VectorAssembler(inputCols=['tfidf_vector', 'token_count'],
                            outputCol='X')

train_tfidf_vec = assembler.transform(train_tfidf_vec)
test_tfidf_vec = assembler.transform(test_tfidf_vec)

# Hold out 5% of the training rows as a dev set.
train_data, dev_data = train_tfidf_vec.randomSplit([0.95, 0.05])

model = LogisticRegression(featuresCol='X', labelCol='label').fit(train_data)

result_dev = model.evaluate(dev_data).predictions
# NOTE(review): evaluate() requires a 'label' column on the frame —
# confirm the Kaggle test split actually carries labels.
result_test = model.evaluate(test_tfidf_vec).predictions

# Cast the 0.0/1.0 prediction to int and write it out for submission.
result_test = result_test.withColumn('final',
                                     result_test.prediction.cast('int'))
result_test.select("final").write.csv(
    path="file:///home/root/emailclass/sub_1.csv", header="false")

# Area under the ROC curve on the dev split (the evaluator's default metric).
auc_dev = BinaryClassificationEvaluator(labelCol='label').evaluate(result_dev)

print(auc_dev)