# Tokenization: split sentences into words, then count the tokens per row
from pyspark.ml.feature import Tokenizer, RegexTokenizer
from pyspark.sql.functions import udf, col
from pyspark.sql.types import IntegerType

sen_df.show()

tokenizer = Tokenizer(inputCol='sentence', outputCol='words')
# RegexTokenizer splits on a regex pattern; '\\W' splits on any non-word character
regex_tokenizer = RegexTokenizer(inputCol='sentence', outputCol='words', pattern='\\W')

# UDF returning the number of tokens in the 'words' array
count_tokens = udf(lambda words: len(words), IntegerType())

tokenized = tokenizer.transform(sen_df)
tokenized.show()
tokenized.withColumn('tokens', count_tokens(col('words'))).show()

rg_tokenized = regex_tokenizer.transform(sen_df)
rg_tokenized.withColumn('tokens', count_tokens(col('words'))).show()

# Stop-word removal: drop common words that carry little meaning
from pyspark.ml.feature import StopWordsRemover

sentenceDataFrame = spark.createDataFrame(
    [(0, ['I', 'saw', 'the', 'green', 'horse']),
     (1, ['Mary', 'had', 'a', 'little', 'lamb'])],
    ['id', 'tokens'])

remover = StopWordsRemover(inputCol='tokens', outputCol='filtered')
remover.transform(sentenceDataFrame).show()

# An n-gram helps capture relationships between several consecutive words
from pyspark.ml.feature import NGram
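# A minimal sketch of how NGram could be applied, continuing with the
# 'tokenized' DataFrame from above; n=2 (bigrams) is an assumed choice,
# and any n >= 1 works the same way.
ngram = NGram(n=2, inputCol='words', outputCol='grams')
ngram.transform(tokenized).select('grams').show(truncate=False)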