outputCol="words", pattern="\\W+").transform(df) words.show(truncate=False) # StopWordsRemover is feature transformer that filters out stop words from input. stop_words_removed = StopWordsRemover( inputCol="words", outputCol="stop_words_removed").transform(words) stop_words_removed.show(truncate=False) # 變成n字一組 # NGram is a feature transformer that converts the input array of strings into an array of n-grams. Null values in the input array are ignored. It returns an array of n-grams where each n-gram is represented by a space-separated string of words. When the input is empty, an empty array is returned. When the input array length is less than n (number of elements per n-gram), no n-grams are returned. ngram_df = NGram(n=2, inputCol="words", outputCol="ngrams").transform(words) ngram_df.show(truncate=False) ngram_df.select("ngrams").show(truncate=False) # TF-IDF is a numerical statistic that is intended to reflect how important a word is to a document in a collection or corpus.[1] It is often used as a weighting factor in searches of information retrieval, text mining, and user modeling. df = words.select("words") df.show(truncate=False) # Hashing TF is TF with hashing enabled to allow the feature vector to be a set value df_tf = HashingTF( inputCol="words", outputCol="hashing_tf", numFeatures=15 #預設是262144維 ).transform(df) df_tf.show() df_tf.select("words").show(truncate=False) df_tf.select("hashing_tf").show(truncate=False)
# Wrap the Python cleaning helpers as Spark UDFs operating on array<string>
# columns. NOTE: this deliberately rebinds the plain-Python function names
# to their UDF wrappers, so only the UDF form is usable from here on.
stopword_remover = f.udf(stopword_remover, ArrayType(StringType()))
replace_function = f.udf(replace_function, ArrayType(StringType()))

# Apply character replacement, then stop-word removal, to the token column.
sentenceData = sentenceData.select(
    "source", replace_function("sentence").alias("Sentence"))
sentenceData = sentenceData.select(
    "source", stopword_remover("Sentence").alias("Sentence"))

################
#   TRIGRAMS   #
################
# DROPPED FROM THE PRESENTATION // don't reliably run without very small samples
# Also provide little additional useful information: very few repeated trigram phrases
trigram_df = NGram(n=3, inputCol="Sentence", outputCol="Trigrams").transform(sentenceData)
trigram_df = trigram_df.select("source", f.explode("Trigrams").alias("Trigrams"))
triCounted = trigram_df.groupBy("Trigrams").count()
triCounted.orderBy(f.col("count").desc()).show(10)

# FOR BOOKS
# FIX: groupBy().count() names its output column "count" (lowercase). The
# original referenced f.col("Count"), which only resolved because Spark's
# column lookup is case-insensitive by default and would break under
# spark.sql.caseSensitive=true; normalized to "count" throughout.
triCounted.filter(
    (f.col("count") != 14) & (f.col("count") != 13)
).orderBy(f.col("count").desc()).show(10)

# OTHERS
triCounted.orderBy(f.col("count").desc()).show(10)

################
#   BIGRAMS    #
################
bigram_df = NGram(n=2, inputCol="Sentence", outputCol="Bigrams").transform(sentenceData)
bigram_df = bigram_df.select("source", f.explode("Bigrams").alias("Bigrams"))