def test_split_words_by_semicolon(self):
    print("test_split_words_by_semicolon")
    sample_data = [Row("test;splitting;a;dataset;of;words;by;semicolon")]
    sample_data_df = self.spark.createDataFrame(sample_data)
    results_spark = w.splitWords(self.spark, sample_data_df)
    df_length = results_spark.count()
    self.assertEqual(df_length, 8)
def test_split_words_by_hyphen(self):
    print("test_split_words_by_hyphen")
    sample_data = [Row("test-splitting-a-dataset-of-words-by-hyphen")]
    sample_data_df = self.spark.createDataFrame(sample_data)
    results_spark = w.splitWords(self.spark, sample_data_df)
    df_length = results_spark.count()
    self.assertEqual(df_length, 8)

def test_split_words_by_comma(self):
    print("test_split_words_by_comma")
    sample_data = [Row("test,splitting,a,dataset,of,words,by,comma")]
    sample_data_df = self.spark.createDataFrame(sample_data)
    results_spark = w.splitWords(self.spark, sample_data_df)
    df_length = results_spark.count()
    self.assertEqual(df_length, 8)

def test_split_words_by_period(self):
    print("test_split_words_by_period")
    sample_data = [Row("test.splitting.a.dataset.of.words.by.period")]
    sample_data_df = self.spark.createDataFrame(sample_data)
    results_spark = w.splitWords(self.spark, sample_data_df)
    df_length = results_spark.count()
    self.assertEqual(df_length, 8)

def test_split_words_by_spaces(self):
    print("test_split_words_by_spaces")
    sample_data = [Row("test splitting a dataset of words by spaces")]
    sample_data_df = self.spark.createDataFrame(sample_data)
    results_spark = w.splitWords(self.spark, sample_data_df)
    df_length = results_spark.count()
    self.assertEqual(df_length, 8)
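# The implementation of w.splitWords lives in the wordcount_utils module and is
# not shown in this file. For reference, a minimal sketch (an assumption, not
# the actual implementation) that would satisfy the five delimiter tests above,
# assuming the input DataFrame has a single string column:
from pyspark.sql.functions import col, explode, split

def splitWords_sketch(spark, df):
    # Split the text column on any supported delimiter (semicolon, hyphen,
    # comma, period or space) and explode the result into one word per row.
    text_col = df.columns[0]
    return df.select(explode(split(col(text_col), "[;,\\-. ]+")).alias("word"))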
def test_case_insensitivity(self):
    print("test_case_insensitivity")
    sample_data = [Row("A TEST OF SPLITTING A DATASET OF WORDS BY SPACES")]
    sample_data_df = self.spark.createDataFrame(sample_data)
    sample_data_df_split = w.splitWords(self.spark, sample_data_df)
    word_count_df = w.countByWord(self.spark, sample_data_df_split)
    actual_word_counts = dataframe_converter(word_count_df)
    expected_word_counts = [('a', 2), ('by', 1), ('dataset', 1), ('of', 2), ('spaces', 1),
                            ('splitting', 1), ('test', 1), ('words', 1)]
    self.assertEqual(actual_word_counts, expected_word_counts)

def test_should_not_aggregate_dissimilar_words(self):
    print("test_should_not_aggregate_dissimilar_words")
    sample_data = [Row("a test of splitting a dataset of words by spaces")]
    sample_data_df = self.spark.createDataFrame(sample_data)
    sample_data_df_split = w.splitWords(self.spark, sample_data_df)
    word_count_df = w.countByWord(self.spark, sample_data_df_split)
    actual_word_counts = dataframe_converter(word_count_df)
    expected_word_counts = [('a', 2), ('by', 1), ('dataset', 1), ('of', 2), ('spaces', 1),
                            ('splitting', 1), ('test', 1), ('words', 1)]
    self.assertEqual(actual_word_counts, expected_word_counts)

def test_count_words_basic(self):
    print("test_count_words_basic")
    sample_data = [Row("a test of splitting a dataset of words by spaces")]
    sample_data_df = self.spark.createDataFrame(sample_data)
    sample_data_df_split = w.splitWords(self.spark, sample_data_df)
    word_count_df = w.countByWord(self.spark, sample_data_df_split)
    actual_word_counts = dataframe_converter(word_count_df)
    counts = [word_count[1] for word_count in actual_word_counts]
    total_count = sum(counts)
    self.assertEqual(total_count, 10)

def test_ordering_words(self):
    print("test_ordering_words")
    sample_data = [Row("a test of splitting a dataset of words by spaces")]
    sample_data_df = self.spark.createDataFrame(sample_data)
    sample_data_df_split = w.splitWords(self.spark, sample_data_df)
    word_count_df = w.countByWord(self.spark, sample_data_df_split)
    actual_word_counts = dataframe_converter(word_count_df)
    actual_word_order = [word_count[0] for word_count in actual_word_counts]
    expected_word_order = ['a', 'by', 'dataset', 'of', 'spaces', 'splitting', 'test', 'words']
    self.assertEqual(actual_word_order, expected_word_order)
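# countByWord and dataframe_converter are likewise defined elsewhere. The
# sketches below (again assumptions, not the real code) show behaviour
# consistent with the tests above: case-insensitive aggregation, alphabetical
# ordering, and conversion of the result to a list of (word, count) tuples.
from pyspark.sql.functions import col, lower

def countByWord_sketch(spark, words_df):
    # Lower-case each word, count occurrences, and order alphabetically.
    word_col = words_df.columns[0]
    return (words_df.select(lower(col(word_col)).alias("word"))
            .groupBy("word")
            .count()
            .orderBy("word"))

def dataframe_converter_sketch(word_count_df):
    # Collect the (word, count) rows into plain Python tuples for assertions.
    return [(row["word"], row["count"]) for row in word_count_df.collect()]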
def run(spark, inputPath, outputPath):
    logging.info("Reading text file from: " + inputPath)
    input_df = spark.read.text(inputPath)
    split_df = wordcount_utils.splitWords(spark, input_df)
    count_df = wordcount_utils.countByWord(spark, split_df)
    logging.info("Writing csv to directory: " + outputPath)
    count_df.show()
    count_df.write.csv(outputPath, mode='append')
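# A possible command-line entrypoint for run(); the argument handling and the
# SparkSession configuration below are assumptions, not part of the original job.
if __name__ == "__main__":
    import sys
    from pyspark.sql import SparkSession

    logging.basicConfig(level=logging.INFO)
    spark = SparkSession.builder.appName("wordcount").getOrCreate()
    run(spark, sys.argv[1], sys.argv[2])
    spark.stop()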