# --- SMS spam pipeline: text -> TF-IDF features -> logistic regression ---

# Split each raw message into individual word tokens.
tokenizer = Tokenizer(inputCol="text", outputCol="words")
wrangled = tokenizer.transform(wrangled)

# Remove stop words, which carry little discriminative signal.
remover = StopWordsRemover(inputCol="words", outputCol="terms")
wrangled = remover.transform(wrangled)

# Apply the hashing trick: map terms into a fixed 1024-dimension space.
hasher = HashingTF(inputCol="terms", outputCol="hash", numFeatures=1024)
wrangled = hasher.transform(wrangled)

# Re-weight hashed term frequencies by inverse document frequency (TF-IDF).
idf_model = IDF(inputCol="hash", outputCol="features").fit(wrangled)
sms = idf_model.transform(wrangled)

# View the first four records.
sms.show(4, truncate=False)

# Split the data into training and testing sets (fixed seed for reproducibility).
sms_train, sms_test = sms.randomSplit([0.8, 0.2], seed=13)

# Fit an L2-regularized Logistic Regression model to the training data.
logistic = LogisticRegression(regParam=0.2).fit(sms_train)

# Make predictions on the testing data.
prediction = logistic.transform(sms_test)

# Confusion matrix: count of each (known label, predicted label) pair.
prediction.groupBy("label", "prediction").count().show()

# Evaluator for weighted precision and other multiclass metrics.
multi_evaluator = MulticlassClassificationEvaluator()
# --- Term-frequency feature extraction demo: HashingTF, IDF, CountVectorizer ---

# HashingTF is term frequency with hashing, which fixes the feature-vector
# size up front (default is 262144 dimensions; 15 here for readability).
tf_stage = HashingTF(
    inputCol="words",
    outputCol="hashing_tf",
    numFeatures=15,
)
df_tf = tf_stage.transform(df)
df_tf.show()
df_tf.select("words").show(truncate=False)
# In the sparse vector: the first list holds the word indices, the second
# holds each word's occurrence count.
df_tf.select("hashing_tf").show(truncate=False)

# IDF: re-weight the hashed term frequencies by inverse document frequency.
idf_model = IDF(inputCol="hashing_tf", outputCol="tf_idf").fit(df_tf)
df_tf_idf = idf_model.transform(df_tf)
df_tf_idf.show()
df_tf_idf.select("words").show(truncate=False)
df_tf_idf.select("hashing_tf").show(truncate=False)  # Hashing TF
df_tf_idf.select("tf_idf").show(truncate=False)      # IDF

# TF via CountVectorizer, which builds an explicit vocabulary and counts
# word occurrences across the document collection (no hashing).
df = words.select("words")
df.show(truncate=False)
cv_model = CountVectorizer(inputCol="words", outputCol="tf_cv").fit(df)
df_tf_cv = cv_model.transform(df)
df_tf_cv.show()
df_tf_cv.select("words").show(truncate=False)
df_tf_cv.select("tf_cv").show(truncate=False)