from pyspark.ml.classification import LogisticRegression

# Split the data: 75% training, 25% test.
training_df, test_df = model_df.randomSplit([0.75, 0.25])
training_df.count()
training_df.groupBy('Status').count().show()
test_df.count()
test_df.groupBy('Status').count().show()

# Fit a logistic regression model with 'Status' as the label column.
log_reg = LogisticRegression(labelCol='Status').fit(training_df)

# Training results: predictions on the training set.
train_results = log_reg.evaluate(training_df).predictions
train_results.filter(train_results['Status'] == 1).filter(
    train_results['prediction'] == 1).select(
        ['Status', 'prediction', 'probability']).show(10, False)

# MAGIC %md Probability at 0 index is for 0 class and probability at 1 index is for 1 class

# True positives: rows whose actual Status is 1 and were predicted 1.
correct_preds = train_results.filter(train_results['Status'] == 1).filter(
    train_results['prediction'] == 1).count()

# Hoisted so the positive-class count is computed once, not twice.
actual_positives = training_df.filter(training_df['Status'] == 1).count()

# BUG FIX (label): this ratio is the RECALL (true-positive rate) on the
# training set — TP / actual positives — not overall accuracy, which the
# original comment claimed.
float(correct_preds) / actual_positives

# Test set results.
results = log_reg.evaluate(test_df).predictions
results.select(['Status', 'prediction']).show(10, False)
results.printSchema()
# confusion matrix
# DBTITLE 1,Train Test Split of data
# 75/25 split with a fixed seed so the split is reproducible across runs.
train_data, test_data = data.randomSplit([0.75, 0.25], seed=12345)

# COMMAND ----------

# DBTITLE 1,Modelling
from pyspark.ml.classification import LogisticRegression

model = LogisticRegression(labelCol='label', maxIter=5, regParam=0.001)
model = model.fit(train_data)
summary = model.summary
summary.predictions.describe().show()

# COMMAND ----------

from pyspark.ml.evaluation import BinaryClassificationEvaluator

predictions = model.evaluate(test_data)

# COMMAND ----------

# BUG FIX: the evaluator was pointed at the hard 0/1 'prediction' column,
# which collapses the ROC curve to a single threshold point and distorts
# areaUnderROC. Use the model's 'rawPrediction' scores (the evaluator's
# intended input) instead.
evaluator = BinaryClassificationEvaluator(rawPredictionCol='rawPrediction',
                                          labelCol='label')
evaluator.evaluate(predictions.predictions)

# COMMAND ----------

# NOTE(review): BinaryClassificationEvaluator reports areaUnderROC by
# default, not accuracy — the "96.3% accuracy" claim below conflates the
# two metrics; confirm which was intended.
# Achieved 96.3% accuracy on test data, lets predict on the unseen test data for kaggle submission.

# COMMAND ----------

# NOTE(review): this re-scores the same held-out split created above, not a
# separate unseen Kaggle test set — verify the intended dataset is loaded.
results = model.transform(test_data)
results.select('features', 'prediction').show()
model_df = df.select(['features', 'Status']) ## 3.2 进行逻辑回归 from pyspark.ml.classification import LogisticRegression # 逻辑回归。该类支持多项逻辑(softmax)和二项逻辑回归 training_df, test_df = model_df.randomSplit([0.75, 0.25 ]) # 划分数据,75%的数据用于训练,25%数据用于验证测试 training_df.groupBy('Status').count().show() # 查看划分后的数据 test_df.groupBy('Status').count().show() log_reg = LogisticRegression(labelCol='Status').fit( training_df) # 返回LogisticRegressionModel类型模型对象 train_results = log_reg.evaluate( training_df ).predictions # 在测试数据集中评估模型,返回对象为BinaryLogisticRegressionSummary-给定模型的二元逻辑回归结果 train_results.filter(train_results['Status'] == 1).filter( train_results['prediction'] == 1).select( ['Status', 'prediction', 'probability']).show(10, False) print('{}{}'.format('预测准确率:', log_reg.evaluate(training_df).accuracy)) # 查看预测的准确率 test_results = log_reg.evaluate(test_df).predictions # 使用模型训练测试数据 test_results.filter(test_results['Status'] == 1).filter( test_results['prediction'] == 1).select( ['Status', 'prediction', 'probability']).show(10, False)
# Combine the text feature vector and the token count into one input column.
from pyspark.ml.feature import VectorAssembler

df_assembler = VectorAssembler(inputCols=['features', 'token_count'],
                               outputCol='features_vec')
model_text_df = df_assembler.transform(model_text_df)
model_text_df.printSchema()

from pyspark.ml.classification import LogisticRegression

# Hold out 25% of the rows for testing and inspect the class balance.
training_df, test_df = model_text_df.randomSplit([0.75, 0.25])
training_df.groupBy('Label').count().show()
test_df.groupBy('Label').count().show()

# Train on the assembled vector; 'Label' is the target column.
log_reg = LogisticRegression(featuresCol='features_vec',
                             labelCol='Label').fit(training_df)
results = log_reg.evaluate(test_df).predictions
results.show()

from pyspark.ml.evaluation import BinaryClassificationEvaluator


def _confusion_cell(actual, predicted):
    """Count test rows with Label == actual and prediction == predicted."""
    return results.filter((results.Label == actual)
                          & (results.prediction == predicted)).count()


# Confusion matrix. NOTE: the 'true_postives' spelling is preserved
# deliberately — later code outside this chunk may reference the name.
true_postives = _confusion_cell(1, 1)
true_negatives = _confusion_cell(0, 0)
false_positives = _confusion_cell(0, 1)
false_negatives = _confusion_cell(1, 0)

# Recall = TP / (TP + FN).
recall = float(true_postives) / (true_postives + false_negatives)
# Term-frequency vectors for both splits (tf_vector fitted upstream).
train_tf_vec = tf_vector.transform(train_df)
test_tf_vec = tf_vector.transform(test_df)

# BUG FIX: IDF must be fit on the training data only and the resulting
# model applied to BOTH splits. The original code fit a second IDF model
# on the test set, so train and test TF-IDF weights came from different
# document-frequency statistics — inconsistent features plus test-set
# leakage.
tfidf_vector = IDF(inputCol='tf_vector', outputCol='tfidf_vector')
idf_model = tfidf_vector.fit(train_tf_vec)
train_tfidf_vec = idf_model.transform(train_tf_vec)
test_tfidf_vec = idf_model.transform(test_tf_vec)

# Assemble the TF-IDF vector and the raw token count into one feature column.
assembler = VectorAssembler(inputCols=['tfidf_vector', 'token_count'],
                            outputCol='X')
train_tfidf_vec = assembler.transform(train_tfidf_vec)
test_tfidf_vec = assembler.transform(test_tfidf_vec)

# Carve a small dev split (5%) off the training data for validation.
train_data, dev_data = train_tfidf_vec.randomSplit([0.95, 0.05])

model = LogisticRegression(featuresCol='X', labelCol='label').fit(train_data)
result_dev = model.evaluate(dev_data).predictions
result_test = model.evaluate(test_tfidf_vec).predictions

# Cast the float prediction (0.0/1.0) to int for the submission file.
result_test = result_test.withColumn('final',
                                     result_test.prediction.cast('int'))
result_test.select("final").write.csv(
    path="file:///home/root/emailclass/sub_1.csv", header="false")

# AUC on the dev split (the evaluator reads rawPrediction by default).
auc_dev = BinaryClassificationEvaluator(labelCol='label').evaluate(result_dev)
print(auc_dev)