    from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, NGram, HashingTF

    # RegexTokenizer splits text into tokens using a regex pattern
    # (here, splitting on runs of non-word characters).
    words = RegexTokenizer(inputCol="sentence",  # input column assumed; the call is truncated in the source
                           outputCol="words",
                           pattern="\\W+").transform(df)
    words.show(truncate=False)
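
    # Hedged sketch (not in the original): RegexTokenizer can also match tokens
    # directly instead of splitting on gaps, by setting gaps=False.
    token_matched = RegexTokenizer(inputCol="sentence",  # column name assumed, as above
                                   outputCol="tokens",
                                   pattern="\\w+", gaps=False).transform(df)
    token_matched.show(truncate=False)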

    # StopWordsRemover is a feature transformer that filters out stop words from the input.
    stop_words_removed = StopWordsRemover(
        inputCol="words", outputCol="stop_words_removed").transform(words)
    stop_words_removed.show(truncate=False)
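
    # Hedged sketch (not in the original): the stop-word list is configurable.
    # StopWordsRemover defaults to English stop words; extend or replace them
    # via the stopWords parameter.
    custom_stop_words = StopWordsRemover.loadDefaultStopWords("english") + ["spark"]
    custom_removed = StopWordsRemover(inputCol="words",
                                      outputCol="custom_stop_words_removed",
                                      stopWords=custom_stop_words).transform(words)
    custom_removed.show(truncate=False)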

    # Group the tokens into n-grams.
    # NGram is a feature transformer that converts the input array of strings
    # into an array of n-grams. Null values in the input array are ignored.
    # It returns an array of n-grams where each n-gram is represented by a
    # space-separated string of words. When the input is empty, an empty array
    # is returned. When the input array length is less than n (the number of
    # elements per n-gram), no n-grams are returned.
    ngram_df = NGram(n=2, inputCol="words",
                     outputCol="ngrams").transform(words)

    ngram_df.show(truncate=False)
    ngram_df.select("ngrams").show(truncate=False)
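
    # Hedged sketch (not in the original) of the edge case noted above: with
    # fewer than n tokens, NGram returns an empty array. Assumes an active
    # `spark` session.
    short_df = spark.createDataFrame([(["hello"],)], ["words"])
    NGram(n=2, inputCol="words", outputCol="ngrams").transform(short_df) \
        .show(truncate=False)  # ngrams column is []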

    # TF-IDF is a numerical statistic intended to reflect how important a word
    # is to a document in a collection or corpus. It is often used as a
    # weighting factor in information retrieval, text mining, and user modeling.
    df = words.select("words")
    df.show(truncate=False)

    # HashingTF computes term frequencies by hashing each term to an index,
    # which fixes the feature vector at a configurable size (numFeatures).
    df_tf = HashingTF(
        inputCol="words",
        outputCol="hashing_tf",
        numFeatures=15  # default is 262144 (2**18) dimensions
    ).transform(df)

    df_tf.show()
    df_tf.select("words").show(truncate=False)
    df_tf.select("hashing_tf").show(truncate=False)
Example #2
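# Hypothetical sketch (the original helper definitions are not shown): the
# plain Python functions wrapped as UDFs below. Their bodies here are
# assumptions for illustration only.
from pyspark.sql import functions as f
from pyspark.sql.types import ArrayType, StringType
from pyspark.ml.feature import NGram
import re

def replace_function(sentence):
    # Assumed behavior: lower-case a raw sentence, strip non-letters,
    # and split it into tokens.
    return re.sub(r"[^a-z\s]", " ", sentence.lower()).split()

def stopword_remover(tokens):
    # Assumed behavior: drop common English stop words (illustrative list).
    stop_words = {"the", "a", "an", "and", "or", "of", "to", "in", "is"}
    return [t for t in tokens if t not in stop_words]
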
# Wrap the helpers as Spark UDFs returning arrays of strings.
stopword_remover = f.udf(stopword_remover, ArrayType(StringType()))
replace_function = f.udf(replace_function, ArrayType(StringType()))

sentenceData = sentenceData.select("source", replace_function("sentence").alias("Sentence"))
sentenceData = sentenceData.select("source", stopword_remover("Sentence").alias("Sentence"))

################
#   TRIGRAMS   #
################

# DROPPED FROM THE PRESENTATION: doesn't run reliably except on very small samples.
# Trigrams also provide little additional useful information: very few trigram phrases repeat.

trigram_df = NGram(n=3, inputCol="Sentence", outputCol="Trigrams").transform(sentenceData)
trigram_df = trigram_df.select("source", f.explode("Trigrams").alias("Trigrams"))
triCounted = trigram_df.groupBy("Trigrams").count()
triCounted.orderBy(f.col("count").desc()).show(10)

# FOR BOOKS
triCounted.filter((f.col("count") != 14) & (f.col("count") != 13)).orderBy(f.col("count").desc()).show(10)

# OTHERS
triCounted.orderBy(f.col("count").desc()).show(10)

################
#   BIGRAMS    #
################

bigram_df = NGram(n=2, inputCol="Sentence", outputCol="Bigrams").transform(sentenceData)
bigram_df = bigram_df.select("source", f.explode("Bigrams").alias("Bigrams"))
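
# Hedged sketch (not in the original, mirroring the trigram counting above):
# tally the bigrams and show the most frequent ones.
biCounted = bigram_df.groupBy("Bigrams").count()
biCounted.orderBy(f.col("count").desc()).show(10)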