# Feature-engineering and modelling script: derive TF-IDF features from the
# "words" column of `wrangled`, fit a regularised logistic regression on an
# 80/20 split, and inspect its predictions on the held-out part.
#
# Every step must sit on its own line: when the statements share a single
# line, everything after the first "#" is parsed as a comment and never runs.

# Remove stop words ("words" -> "terms")
wrangled = StopWordsRemover(inputCol="words", outputCol="terms").transform(wrangled)

# Apply the hashing trick ("terms" -> "hash", 1024 hash buckets)
wrangled = HashingTF(
    inputCol="terms", outputCol="hash", numFeatures=1024
).transform(wrangled)

# Convert hashed symbols to TF-IDF ("hash" -> "features"); the IDF model is
# fitted on the same DataFrame it is then applied to.
sms = IDF(inputCol="hash", outputCol="features").fit(wrangled).transform(wrangled)

# View the first four records without truncating wide columns
sms.show(4, truncate=False)

# Split the data into training (80%) and testing (20%) sets with a fixed seed
sms_train, sms_test = sms.randomSplit([0.8, 0.2], seed=13)

# Fit a Logistic Regression model to the training data. No column names are
# passed, so the estimator relies on its default feature/label columns.
logistic = LogisticRegression(regParam=0.2).fit(sms_train)

# Make predictions on the testing data
prediction = logistic.transform(sms_test)

# Create a confusion matrix, comparing predictions to known labels
prediction.groupBy("label", "prediction").count().show()

# Evaluate the predictions; the metric is chosen per call by overriding
# metricName through the param map.
multi_evaluator = MulticlassClassificationEvaluator()
accuracy = multi_evaluator.evaluate(
    prediction, {multi_evaluator.metricName: "accuracy"}
)

# NOTE(review): the weighted-precision call is cut off at the end of the
# original line, so it is kept below as the inert comment text it already
# was rather than completed by guesswork. Restore it once the rest is known.
# weighted_precision = multi_evaluator.evaluate(