# Exercise_11
from pyspark.ml.feature import StopWordsRemover, HashingTF, IDF

# Remove stop words from the tokenized SMS messages.
# NOTE(review): assumes `sms` is a DataFrame with a 'words' array column,
# defined elsewhere in the notebook — confirm.
wrangled = StopWordsRemover(inputCol='words', outputCol='terms')\
    .transform(sms)

# Apply the hashing trick: map terms into a fixed-size (1024) feature vector.
wrangled = HashingTF(inputCol='terms', outputCol='hash', numFeatures=1024)\
    .transform(wrangled)

# Convert hashed term frequencies to TF-IDF weights.
tf_idf = IDF(inputCol='hash', outputCol='features')\
    .fit(wrangled).transform(wrangled)

tf_idf.select('terms', 'features').show(4, truncate=False)

# --------------------------------------------------
# Exercise_12
# Was missing in the original — LogisticRegression raised NameError without it.
from pyspark.ml.classification import LogisticRegression

# Split the data into training and testing sets (80/20, fixed seed for
# reproducibility).
sms_train, sms_test = sms.randomSplit([0.8, 0.2], seed=13)

# Fit a Logistic Regression model (L2 penalty, regParam=0.2) to the
# training data.
logistic = LogisticRegression(regParam=0.2).fit(sms_train)

# Make predictions on the testing data.
prediction = logistic.transform(sms_test)

# Create a confusion matrix by counting (label, prediction) pairs.
prediction.groupBy('label', 'prediction').count().show()
# Hashing TF is TF with feature hashing enabled, so the feature vector has a
# fixed, pre-set size instead of one dimension per distinct term.
# Was missing in the original — CountVectorizer raised NameError without it.
from pyspark.ml.feature import CountVectorizer

# NOTE(review): assumes `df` and `words` are DataFrames with a 'words' array
# column, defined earlier in the notebook — confirm.
df_tf = HashingTF(
    inputCol="words",
    outputCol="hashing_tf",
    numFeatures=15,  # default is 262144 dimensions
).transform(df)
df_tf.show()
df_tf.select("words").show(truncate=False)
# First list holds the hashed term indices; second list holds the term counts.
df_tf.select("hashing_tf").show(truncate=False)

# IDF: weight the hashed term frequencies by inverse document frequency.
df_tf_idf = IDF(inputCol="hashing_tf", outputCol="tf_idf").fit(df_tf).transform(df_tf)
df_tf_idf.show()
df_tf_idf.select("words").show(truncate=False)
df_tf_idf.select("hashing_tf").show(truncate=False)  # Hashing TF
df_tf_idf.select("tf_idf").show(truncate=False)  # IDF

# TF from CountVectorizer, which is used to extract words and counts from a
# document collection (an exact vocabulary, unlike the hashing trick above).
df = words.select("words")
df.show(truncate=False)
df_tf_cv = CountVectorizer(inputCol="words", outputCol="tf_cv").fit(df).transform(df)
df_tf_cv.show()
df_tf_cv.select("words").show(truncate=False)
df_tf_cv.select("tf_cv").show(truncate=False)