def test_filter_should_give_Transformable_rdd(self):
    """filter() keeps only the rows accepted by the predicate.

    Of the three input rows only "1,3" has a second field other than "2",
    so exactly one row is expected back.
    """
    initial_dataset = self.sc.parallelize(["1,2", "1,2", "1,3"])
    transformable_rdd = TransformableRDD(initial_dataset, "csv")
    rdd_filter = transformable_rdd.filter(
        lambda line: line.split(",")[1] != "2")
    collected = rdd_filter.collect()
    # Use the len() builtin rather than calling __len__() directly.
    self.assertEqual(1, len(collected))
def test_should_smooth_data_by_Simple_Moving_Average(self):
    """smooth() on column 1 with SimpleMovingAverage(3) starts at 4.0.

    4.0 equals the mean of the first three column-1 values (3, 4, 5).
    The input is parallelized into 3 partitions.
    """
    initial_dataset = self.sc.parallelize(
        ["52,3,53", "23,4,64", "23,5,64", "23,6,64",
         "23,7,64", "23,8,64", "23,9,64"], 3)
    transformable_rdd = TransformableRDD(initial_dataset, "csv")
    transformed = transformable_rdd.smooth(1, SimpleMovingAverage(3))
    expected = 4.0
    # assertEqual: the assertEquals alias is deprecated and removed in 3.12.
    self.assertEqual(expected, transformed.first())
def test_pivot_table_by_count_should_give_pivoted_table(self):
    """pivot_by_count(4, [0, 1, 2, 3]) builds a table queried by value_at().

    The test expects value_at("skips", "known") to be 6 and
    value_at("skips", "unknown") to be 3.
    """
    records = [
        "known,new,long,home,skips",
        "unknown,new,short,work,reads",
        "unknown,follow Up,long,work,skips",
        "known,follow Up,long,home,skips",
        "known,new,short,home,reads",
        "known,follow Up,long,work,skips",
        "unknown,follow Up,short,work,skips",
        "unknown,new,short,work,reads",
        "known,follow Up,long,home,skips",
        "known,new,long,work,skips",
        "unknown,follow Up,short,home,skips",
        "known,new,long,work,skips",
        "known,follow Up,short,home,reads",
        "known,new,short,work,reads",
        "known,new,short,home,reads",
        "known,follow Up,short,work,reads",
        "known,new,short,home,reads",
        "unknown,new,short,work,reads",
    ]
    source_rdd = TransformableRDD(self.sc.parallelize(records), "csv")
    pivot = source_rdd.pivot_by_count(4, [0, 1, 2, 3])
    self.assertEqual(6, pivot.value_at("skips", "known"))
    self.assertEqual(3, pivot.value_at("skips", "unknown"))
def test_should_split_given_column_indexes_split_by_delimiter_with_retain_column(self):
    """split_by_delimiter(0, " ", True) keeps column 0 and appends its parts.

    The expected row is the original row followed by the three pieces
    obtained by splitting column 0 on a space.
    """
    initial_data_set = self.sc.parallelize(
        ["FirstName LastName MiddleName,850"])
    initial_rdd = TransformableRDD(initial_data_set, "csv")
    split_with_retained_columns = initial_rdd.split_by_delimiter(0, " ", True)
    # assertEqual: the assertEquals alias is deprecated and removed in 3.12.
    self.assertEqual(
        "FirstName LastName MiddleName,850,FirstName,LastName,MiddleName",
        split_with_retained_columns.first())
def test_replace_values_should_replace_cluster_values_with_given_text(self):
    """replace_values() rewrites column 0 for members of a cluster.

    Clusters are computed on column 0 with NGramFingerprintAlgorithm(1);
    after replacing the first cluster with "Hello", the row "Hello,B" is
    expected in the collected output.
    """
    initial_dataset = self.sc.parallelize(["XA,Y", "A,B", "AX,Z", "A,Q", "A,E"])
    transformable_rdd = TransformableRDD(initial_dataset)
    clusters = transformable_rdd.clusters(0, NGramFingerprintAlgorithm(1))
    one_cluster = clusters.get_all_clusters()[0]
    values = transformable_rdd.replace_values(one_cluster, "Hello", 0).collect()
    # assertIn reports the container contents on failure, unlike
    # assertTrue(values.__contains__(...)).
    self.assertIn("Hello,B", values)
def test_to_double_rdd_should_change_string_to_double_rdd(self):
    """to_double_rdd(0) yields column 0 as floating-point numbers.

    The collected result is expected to contain 1.0, 5.0 and 8.0.
    """
    initial_dataset = self.sc.parallelize(["1,1", "5,2", "8,3"])
    transformable_rdd = TransformableRDD(initial_dataset)
    rdd = transformable_rdd.to_double_rdd(0)
    collected = rdd.collect()
    # assertIn gives a descriptive failure instead of "False is not true".
    for expected in (1.0, 5.0, 8.0):
        self.assertIn(expected, collected)
def test_multiply_column_should_multiply_two_given_column(self):
    """multiply_columns(0, 1) yields the row-wise product of both columns.

    The collected result is expected to contain 1.0, 2.0 and 3.0.
    """
    initial_dataset = self.sc.parallelize(["1,1", "1,2", "1,3"])
    transformable_rdd = TransformableRDD(initial_dataset)
    multiplied_rdd = transformable_rdd.multiply_columns(0, 1)
    collected = multiplied_rdd.collect()
    # assertIn gives a descriptive failure instead of "False is not true".
    for expected in (1.0, 2.0, 3.0):
        self.assertIn(expected, collected)
def test_clusters_should_give_clusters_of_given_column_index(self):
    """clusters(0, SimpleFingerprint()) groups column-0 values.

    The first cluster is expected to contain "CLUSTER Of Finger print"
    but not "finger print for cluster".
    """
    rdd = self.sc.parallelize(["CLUSTER Of Finger print",
                               "finger print of cluster",
                               "finger print for cluster"])
    transformable_rdd = TransformableRDD(rdd, 'csv')
    clusters = transformable_rdd.clusters(0, SimpleFingerprint())
    list_of_clusters = clusters.get_all_clusters()
    one_cluster = list_of_clusters[0]
    # The `in` operator dispatches to the cluster's __contains__, so
    # assertIn/assertNotIn replace the explicit dunder calls.
    self.assertIn("CLUSTER Of Finger print", one_cluster)
    self.assertNotIn("finger print for cluster", one_cluster)
def test_clusters_should_give_clusters_By_n_gram_fingerprint(self):
    """clusters(0, NGramFingerprintAlgorithm(1)) groups column-0 values.

    The first cluster is expected to contain both
    "CLUSTER Of Finger print" and "finger print for cluster".
    """
    rdd = self.sc.parallelize(["CLUSTER Of Finger print",
                               "finger print of cluster",
                               "finger print for cluster"])
    transformable_rdd = TransformableRDD(rdd, 'csv')
    clusters = transformable_rdd.clusters(0, NGramFingerprintAlgorithm(1))
    list_of_clusters = clusters.get_all_clusters()
    one_cluster = list_of_clusters[0]
    # The `in` operator dispatches to the cluster's __contains__, so
    # assertIn replaces the explicit dunder calls.
    self.assertIn("CLUSTER Of Finger print", one_cluster)
    self.assertIn("finger print for cluster", one_cluster)
def test_transformableRDD_can_impute_the_missing_values_by_UnivariateLinearRegressionSubstitution(
        self):
    """impute(1, UnivariateLinearRegressionSubstitution(0)) fills column 1.

    The row "64," has an empty column 1; after imputation the row
    "64,4.06" is expected in the collected output.
    """
    rdd = self.sc.parallelize(
        ["60,3.1", "61,3.6", "62,3.8", "63,4", "65,4.1", "64,"])
    transformable_rdd = TransformableRDD(rdd, 'csv')
    imputed_rdd = transformable_rdd.impute(
        1, UnivariateLinearRegressionSubstitution(0))
    # assertIn shows the collected rows on failure.
    self.assertIn("64,4.06", imputed_rdd.collect())
def test_should_split_given_column_indexes_split_by_delimiter(self):
    """split_by_delimiter(0, " ", False) replaces column 0 with its parts.

    The expected row drops the original column 0 and appends the three
    pieces after the remaining column.
    """
    initial_data_set = self.sc.parallelize(
        ["FirstName LastName MiddleName,850"])
    initial_rdd = TransformableRDD(initial_data_set, "csv")
    splitted_columns = initial_rdd.split_by_delimiter(0, " ", False)
    # assertEqual: the assertEquals alias is deprecated and removed in 3.12.
    self.assertEqual("850,FirstName,LastName,MiddleName",
                     splitted_columns.first())
def test_should_smooth_data_by_Simple_Moving_Average(self):
    """smooth() on column 1 with SimpleMovingAverage(3) starts at 4.0.

    4.0 equals the mean of the first three column-1 values (3, 4, 5).
    The input is parallelized into 3 partitions.
    """
    initial_dataset = self.sc.parallelize([
        "52,3,53", "23,4,64", "23,5,64", "23,6,64", "23,7,64", "23,8,64",
        "23,9,64"
    ], 3)
    transformable_rdd = TransformableRDD(initial_dataset, "csv")
    transformed = transformable_rdd.smooth(1, SimpleMovingAverage(3))
    expected = 4.0
    # assertEqual: the assertEquals alias is deprecated and removed in 3.12.
    self.assertEqual(expected, transformed.first())
def test_map_should_give_Transformable_rdd(self):
    """map() returns an object that still supports deduplicate().

    Each row gets "yes" appended; after deduplication two distinct rows
    are expected, one of them being "1,2yes".
    """
    initial_dataset = self.sc.parallelize(["1,2", "1,2", "1,3"])
    transformable_rdd = TransformableRDD(initial_dataset, "csv")
    rdd_map = transformable_rdd.map(lambda line: line + "yes")
    deduplicate = rdd_map.deduplicate()
    collected = deduplicate.collect()
    # Builtin len() and assertIn replace the direct dunder calls.
    self.assertEqual(2, len(collected))
    expected = "1,2yes"
    self.assertIn(expected, collected)
def test_should_split_given_column_by_field_length_with_retained_columns(self):
    """split_by_field_length(3, [3, 10], True) keeps column 3 and appends
    its fixed-width pieces.

    Four rows are expected; the expected rows also show the surrounding
    whitespace of the input fields removed.
    """
    data = [
        "John,Male,21,+914382313832,Canada",
        "Smith, Male, 30,+015314343462, UK",
        "Larry, Male, 23,+009815432975, USA",
        "Fiona, Female,18,+891015709854,USA",
    ]
    initial_data_set = self.sc.parallelize(data)
    initial_rdd = TransformableRDD(initial_data_set, "csv")
    result = initial_rdd.split_by_field_length(3, [3, 10], True).collect()
    # assertEqual/assertIn report actual values on failure, unlike
    # assertTrue over a pre-computed boolean.
    self.assertEqual(4, len(result))
    self.assertIn(
        "John,Male,21,+914382313832,Canada,+91,4382313832", result)
    self.assertIn(
        "Smith,Male,30,+015314343462,UK,+01,5314343462", result)
def test_should_merge_given_column_indexes(self):
    """merge_columns([3, 1, 0], ...) joins the columns in the given order.

    Three calls are checked: separator "_" without retaining the source
    columns, separator "_" retaining them, and the two-argument-free form
    whose expected output uses a space separator and drops the sources.
    """
    initial_data_set = self.sc.parallelize(
        ["FirstName,LastName,732,MiddleName"])
    initial_rdd = TransformableRDD(initial_data_set, "csv")

    # assertEqual: the assertEquals alias is deprecated and removed in 3.12.
    joined_column_rdd = initial_rdd.merge_columns([3, 1, 0], "_", False)
    self.assertEqual("732,MiddleName_LastName_FirstName",
                     joined_column_rdd.first())

    with_originals = initial_rdd.merge_columns([3, 1, 0], "_", True)
    self.assertEqual(
        "FirstName,LastName,732,MiddleName,MiddleName_LastName_FirstName",
        with_originals.first())

    joined_column_with_defaults = initial_rdd.merge_columns([3, 1, 0])
    self.assertEqual("732,MiddleName LastName FirstName",
                     joined_column_with_defaults.first())
def test_transformableRDD_can_impute_the_missing_values_by_NaiveBayesSubstitution(self):
    """impute(4, NaiveBayesSubstitution(0, 1, 2, 3)) fills column 4.

    The last input row has an empty column 4; after imputation the row
    "Drew,Yes,Blue,Long,Female" is expected in the collected output.
    """
    rdd = self.sc.parallelize([
        "Drew,No,Blue,Short,Male",
        "Claudia,Yes,Brown,Long,Female",
        "Drew,No,Blue,Long,Female",
        "Drew,No,Blue,Long,Female",
        "Alberto,Yes,Brown,Short,Male",
        "Karin,No,Blue,Long,Female",
        "Nina,Yes,Brown,Short,Female",
        "Sergio,Yes,Blue,Long,Male",
        "Drew,Yes,Blue,Long,",
    ])
    transformable_rdd = TransformableRDD(rdd, 'csv')
    imputed_rdd = transformable_rdd.impute(4, NaiveBayesSubstitution(0, 1, 2, 3))
    # assertIn shows the collected rows on failure.
    self.assertIn("Drew,Yes,Blue,Long,Female", imputed_rdd.collect())
def test_transformableRDD_can_impute_the_missing_values_by_NaiveBayesSubstitution(
        self):
    """impute(4, NaiveBayesSubstitution(0, 1, 2, 3)) fills column 4.

    The last input row has an empty column 4; after imputation the row
    "Drew,Yes,Blue,Long,Female" is expected in the collected output.
    """
    rdd = self.sc.parallelize([
        "Drew,No,Blue,Short,Male", "Claudia,Yes,Brown,Long,Female",
        "Drew,No,Blue,Long,Female", "Drew,No,Blue,Long,Female",
        "Alberto,Yes,Brown,Short,Male", "Karin,No,Blue,Long,Female",
        "Nina,Yes,Brown,Short,Female", "Sergio,Yes,Blue,Long,Male",
        "Drew,Yes,Blue,Long,"
    ])
    transformable_rdd = TransformableRDD(rdd, 'csv')
    imputed_rdd = transformable_rdd.impute(
        4, NaiveBayesSubstitution(0, 1, 2, 3))
    # assertIn shows the collected rows on failure.
    self.assertIn("Drew,Yes,Blue,Long,Female", imputed_rdd.collect())
def test_should_split_the_given_column_by_delimiter_into_given_number_of_split(self):
    """split_by_delimiter(3, "-", False, 2) splits column 3 into two parts.

    The data is tab-separated ("tsv"). In the expected rows column 3 is
    removed and two pieces are appended, the second still containing the
    remaining "-".
    """
    data = [
        "John\tMale\t21\t+91-4382-313832\tCanada",
        "Smith\tMale\t30\t+01-5314-343462\tUK",
        "Larry\tMale\t23\t+00-9815-432975\tUSA",
        "Fiona\tFemale\t18\t+89-1015-709854\tUSA"
    ]
    initial_data_set = self.sc.parallelize(data)
    initial_rdd = TransformableRDD(initial_data_set, "tsv")
    new_dataset = initial_rdd.split_by_delimiter(3, "-", False, 2)
    list_of_records = new_dataset.collect()
    # Builtin len() and assertIn replace the direct dunder calls.
    self.assertEqual(4, len(list_of_records))
    self.assertIn("John\tMale\t21\tCanada\t+91\t4382-313832",
                  list_of_records)
    self.assertIn("Smith\tMale\t30\tUK\t+01\t5314-343462",
                  list_of_records)
def test_should_smooth_data_by_Weighted_Moving_Average(self):
    """smooth() on column 0 with a window-3 WeightedMovingAverage.

    With weights 0.166, 0.333 and 0.5 the first smoothed value is
    expected to be 13.656, which matches
    0.166 * 10 + 0.333 * 12 + 0.5 * 16.
    """
    initial_dataset = self.sc.parallelize(
        ["10", "12", "16", "13", "17", "19",
         "15", "20", "22", "19", "21", "19"], 3)
    transformable_rdd = TransformableRDD(initial_dataset, "csv")

    weights = Weights(3)
    for weight in (0.166, 0.333, 0.5):
        weights.add(weight)

    moving_average = WeightedMovingAverage(3, weights)
    rdd = transformable_rdd.smooth(0, moving_average)
    expected = 13.656
    actual = rdd.first()
    # assertEqual: the assertEquals alias is deprecated and removed in 3.12.
    self.assertEqual(expected, actual)
def test_should_split_given_column_by_field_length(self):
    """split_by_field_length(3, [3, 10], False) replaces column 3 with its
    fixed-width pieces.

    Four rows are expected; in the expected rows column 3 is removed and
    pieces of length 3 and 10 are appended.
    """
    data = [
        "John,Male,21,+914382313832,Canada",
        "Smith, Male, 30,+015314343462, UK",
        "Larry, Male, 23,+009815432975, USA",
        "Fiona, Female,18,+891015709854,USA"
    ]
    initial_data_set = self.sc.parallelize(data)
    initial_rdd = TransformableRDD(initial_data_set, "csv")
    result = initial_rdd.split_by_field_length(3, [3, 10], False).collect()
    # assertEqual/assertIn report actual values on failure, unlike
    # assertTrue over a pre-computed boolean.
    self.assertEqual(4, len(result))
    self.assertIn("John,Male,21,Canada,+91,4382313832", result)
    self.assertIn("Smith,Male,30,UK,+01,5314343462", result)
def test_add_columns_from_should_merge_all_columns_of_other_transformable_rdd(
        self):
    """add_columns_from() appends another dataset's columns row by row.

    A "csv" dataset is combined with a "tsv" one; the expected rows are
    comma-separated and carry the numeric columns after the spelled ones.
    """
    initial_spelled_numbers = self.sc.parallelize([
        "One,Two,Three", "Four,Five,Six", "Seven,Eight,Nine",
        "Ten,Eleven,Twelve"
    ])
    spelled_numbers = TransformableRDD(initial_spelled_numbers, "csv")
    initial_numeric_data = self.sc.parallelize(
        ["1\t2\t3", "4\t5\t6", "7\t8\t9", "10\t11\t12"])
    numeric_data = TransformableRDD(initial_numeric_data, "tsv")

    result = spelled_numbers.add_columns_from(numeric_data).collect()

    # assertIn shows the collected rows on failure.
    for expected_row in ("One,Two,Three,1,2,3",
                         "Four,Five,Six,4,5,6",
                         "Seven,Eight,Nine,7,8,9",
                         "Ten,Eleven,Twelve,10,11,12"):
        self.assertIn(expected_row, result)
def test_should_smooth_data_by_Weighted_Moving_Average(self):
    """smooth() on column 0 with a window-3 WeightedMovingAverage.

    With weights 0.166, 0.333 and 0.5 the first smoothed value is
    expected to be 13.656, which matches
    0.166 * 10 + 0.333 * 12 + 0.5 * 16.
    """
    initial_dataset = self.sc.parallelize([
        "10", "12", "16", "13", "17", "19", "15", "20", "22", "19", "21",
        "19"
    ], 3)
    transformable_rdd = TransformableRDD(initial_dataset, "csv")

    weights = Weights(3)
    for weight in (0.166, 0.333, 0.5):
        weights.add(weight)

    moving_average = WeightedMovingAverage(3, weights)
    rdd = transformable_rdd.smooth(0, moving_average)
    expected = 13.656
    actual = rdd.first()
    # assertEqual: the assertEquals alias is deprecated and removed in 3.12.
    self.assertEqual(expected, actual)
def test_should_normalize_by_Z_Score_normalization(self):
    """normalize(3, ZScoreNormalizer()) rewrites column 3.

    After select(3) the collected values are compared as strings against
    the expected normalized durations.
    """
    initial_dataset = self.sc.parallelize([
        "07434677419,07371326239,Incoming,211,Wed Sep 15 19:17:44 +0100 2010",
        "07641036117,01666472054,Outgoing,0,Mon Feb 11 07:18:23 +0000 1980",
        "07641036117,07371326239,Incoming,45,Mon Feb 11 07:45:42 +0000 1980",
        "07641036117,07371326239,Incoming,45,Mon Feb 11 07:45:42 +0000 1980",
        "07641036117,07681546436,Missed,12,Mon Feb 11 08:04:42 +0000 1980"])
    transformable_rdd = TransformableRDD(initial_dataset, 'csv')
    final_rdd = transformable_rdd.normalize(3, ZScoreNormalizer())
    normalized_durations = final_rdd.select(3).collect()

    # One entry per input row; rows 3 and 4 are identical, hence the
    # repeated value.
    expected_values = (
        "1.944528306701421",
        "-0.8202659838241843",
        "-0.2306179123850742",
        "-0.2306179123850742",
        "-0.6630264981070882",
    )
    # assertIn shows the collected values on failure.
    for expected in expected_values:
        self.assertIn(expected, normalized_durations)
def test_should_normalize_by_Min_Max_normalization(self):
    """normalize(3, MinMaxNormalizer(0, 1)) rewrites column 3.

    After select(3) the collected values are compared as strings; the
    largest duration (211) is expected as "1.0" and the smallest (0) as
    "0.0".
    """
    initial_dataset = self.sc.parallelize([
        "07434677419,07371326239,Incoming,211,Wed Sep 15 19:17:44 +0100 2010",
        "07641036117,01666472054,Outgoing,0,Mon Feb 11 07:18:23 +0000 1980",
        "07641036117,07371326239,Incoming,45,Mon Feb 11 07:45:42 +0000 1980",
        "07641036117,07371326239,Incoming,45,Mon Feb 11 07:45:42 +0000 1980",
        "07641036117,07681546436,Missed,12,Mon Feb 11 08:04:42 +0000 1980"])
    transformable_rdd = TransformableRDD(initial_dataset, 'csv')
    final_rdd = transformable_rdd.normalize(3, MinMaxNormalizer(0, 1))
    normalized_durations = final_rdd.select(3).collect()

    # One entry per input row; rows 3 and 4 are identical, hence the
    # repeated value.
    expected_values = (
        "1.0",
        "0.0",
        "0.2132701421800948",
        "0.2132701421800948",
        "0.05687203791469194",
    )
    # assertIn shows the collected values on failure.
    for expected in expected_values:
        self.assertIn(expected, normalized_durations)
def test_should_split_the_given_column_by_delimiter_into_given_number_of_split(
        self):
    """split_by_delimiter(3, "-", False, 2) splits column 3 into two parts.

    The data is tab-separated ("tsv"). In the expected rows column 3 is
    removed and two pieces are appended, the second still containing the
    remaining "-".
    """
    data = [
        "John\tMale\t21\t+91-4382-313832\tCanada",
        "Smith\tMale\t30\t+01-5314-343462\tUK",
        "Larry\tMale\t23\t+00-9815-432975\tUSA",
        "Fiona\tFemale\t18\t+89-1015-709854\tUSA"
    ]
    initial_data_set = self.sc.parallelize(data)
    initial_rdd = TransformableRDD(initial_data_set, "tsv")
    new_dataset = initial_rdd.split_by_delimiter(3, "-", False, 2)
    list_of_records = new_dataset.collect()
    # Builtin len() and assertIn replace the direct dunder calls.
    self.assertEqual(4, len(list_of_records))
    self.assertIn("John\tMale\t21\tCanada\t+91\t4382-313832",
                  list_of_records)
    self.assertIn("Smith\tMale\t30\tUK\t+01\t5314-343462",
                  list_of_records)
def test_should_normalize_by_Decimal_Scale(self):
    """normalize(3, DecimalScalingNormalizer()) rewrites column 3.

    After select(3) the collected values are compared as strings; each
    expected value equals the input duration divided by 100
    (211 -> "2.11", 45 -> "0.45", 12 -> "0.12", 0 -> "0.0").
    """
    initial_dataset = self.sc.parallelize([
        "07434677419,07371326239,Incoming,211,Wed Sep 15 19:17:44 +0100 2010",
        "07641036117,01666472054,Outgoing,0,Mon Feb 11 07:18:23 +0000 1980",
        "07641036117,07371326239,Incoming,45,Mon Feb 11 07:45:42 +0000 1980",
        "07641036117,07371326239,Incoming,45,Mon Feb 11 07:45:42 +0000 1980",
        "07641036117,07681546436,Missed,12,Mon Feb 11 08:04:42 +0000 1980"])
    transformable_rdd = TransformableRDD(initial_dataset, 'csv')
    final_rdd = transformable_rdd.normalize(3, DecimalScalingNormalizer())
    normalized_durations = final_rdd.select(3).collect()

    # One entry per input row; rows 3 and 4 are identical, hence the
    # repeated value.
    expected_values = ("2.11", "0.0", "0.45", "0.45", "0.12")
    # assertIn shows the collected values on failure.
    for expected in expected_values:
        self.assertIn(expected, normalized_durations)
def test_add_columns_from_should_merge_all_columns_of_other_transformable_rdd(self):
    """add_columns_from() appends another dataset's columns row by row.

    A "csv" dataset is combined with a "tsv" one; the expected rows are
    comma-separated and carry the numeric columns after the spelled ones.
    """
    initial_spelled_numbers = self.sc.parallelize([
        "One,Two,Three",
        "Four,Five,Six",
        "Seven,Eight,Nine",
        "Ten,Eleven,Twelve"
    ])
    spelled_numbers = TransformableRDD(initial_spelled_numbers, "csv")
    initial_numeric_data = self.sc.parallelize([
        "1\t2\t3",
        "4\t5\t6",
        "7\t8\t9",
        "10\t11\t12"
    ])
    numeric_data = TransformableRDD(initial_numeric_data, "tsv")

    result = spelled_numbers.add_columns_from(numeric_data).collect()

    # assertIn shows the collected rows on failure.
    for expected_row in ("One,Two,Three,1,2,3",
                         "Four,Five,Six,4,5,6",
                         "Seven,Eight,Nine,7,8,9",
                         "Ten,Eleven,Twelve,10,11,12"):
        self.assertIn(expected_row, result)
def test_filter_should_give_Transformable_rdd(self):
    """filter() keeps only the rows accepted by the predicate.

    Of the three input rows only "1,3" has a second field other than "2",
    so exactly one row is expected back.
    """
    initial_dataset = self.sc.parallelize(["1,2", "1,2", "1,3"])
    transformable_rdd = TransformableRDD(initial_dataset, "csv")
    rdd_filter = transformable_rdd.filter(lambda line: line.split(",")[1] != "2")
    collected = rdd_filter.collect()
    # Use the len() builtin rather than calling __len__() directly.
    self.assertEqual(1, len(collected))
def test_get_duplicates_should_give_duplicates_of_given_column_indexes(
        self):
    """get_duplicates([0]) reports rows duplicated on column 0.

    "Ram,23" occurs twice in the input, and it is the first row expected
    from the result.
    """
    rows = ["Ram,23", "Ram,23", "Jill,45", "Soa,"]
    source = TransformableRDD(self.sc.parallelize(rows), 'csv')
    first_duplicate = source.get_duplicates([0]).first()
    self.assertEqual("Ram,23", first_duplicate)
def test_drop_column_should_drop_the_given_column(self):
    """drop_column(1) removes column 1 from every row.

    The first row "Ram,23,Male" is expected to become "Ram,Male".
    """
    rows = ["Ram,23,Male", "Ram,23,Male", "Jill,45,Female", "Soa,,Female,"]
    source = TransformableRDD(self.sc.parallelize(rows), 'csv')
    first_row = source.drop_column(1).first()
    self.assertEqual("Ram,Male", first_row)
def test_get_duplicates_should_give_duplicates_of_given_column_indexes(self):
    """get_duplicates([0]) reports rows duplicated on column 0.

    "Ram,23" occurs twice in the input, and it is the first row expected
    from the result.
    """
    rows = ["Ram,23", "Ram,23", "Jill,45", "Soa,"]
    source = TransformableRDD(self.sc.parallelize(rows), 'csv')
    first_duplicate = source.get_duplicates([0]).first()
    self.assertEqual("Ram,23", first_duplicate)
def test_transformableRDD_gives_a_count_of_element(self):
    """count() returns the number of rows, duplicates included (8 here)."""
    rdd = self.sc.parallelize(["2", "3", "4", "5", "6", "7", "7", "7"])
    transformable_rdd = TransformableRDD(rdd, 'csv')
    # assertEqual: the assertEquals alias is deprecated and removed in 3.12.
    self.assertEqual(8, transformable_rdd.count())
def test_transformableRDD_can_impute_the_missing_values_by_ApproxMeanSubstitution(
        self):
    """impute(1, ApproxMeanSubstitution()) fills the empty column 1.

    The row "Soa," is expected to become "Soa,33.0"; 33.0 equals the mean
    of the other three column-1 values (9, 45, 45).
    """
    rdd = self.sc.parallelize(["Ram,9", "Joe,45", "Jill,45", "Soa,"])
    transformable_rdd = TransformableRDD(rdd, 'csv')
    imputed_rdd = transformable_rdd.impute(1, ApproxMeanSubstitution())
    # assertIn shows the collected rows on failure.
    self.assertIn("Soa,33.0", imputed_rdd.collect())
def test_transformableRDD_can_deduplicate_by_given_column_index(self):
    """deduplicate([0]) leaves one row per distinct column-0 value.

    The input holds six distinct values ("7" appears three times), so a
    count of 6 is expected.
    """
    rdd = self.sc.parallelize(["2", "3", "4", "5", "6", "7", "7", "7"])
    transformable_rdd = TransformableRDD(rdd, 'csv')
    deduplicate_rdd = transformable_rdd.deduplicate([0])
    # assertEqual: the assertEquals alias is deprecated and removed in 3.12.
    self.assertEqual(6, deduplicate_rdd.count())
def test_drop_column_should_drop_the_given_column(self):
    """drop_column(1) removes column 1 from every row.

    The first row "Ram,23,Male" is expected to become "Ram,Male".
    """
    rows = ["Ram,23,Male", "Ram,23,Male", "Jill,45,Female", "Soa,,Female,"]
    source = TransformableRDD(self.sc.parallelize(rows), 'csv')
    first_row = source.drop_column(1).first()
    self.assertEqual("Ram,Male", first_row)
def test_transformableRDD_can_impute_the_missing_values_by_ApproxMeanSubstitution(self):
    """impute(1, ApproxMeanSubstitution()) fills the empty column 1.

    The row "Soa," is expected to become "Soa,33.0"; 33.0 equals the mean
    of the other three column-1 values (9, 45, 45).
    """
    rdd = self.sc.parallelize(["Ram,9", "Joe,45", "Jill,45", "Soa,"])
    transformable_rdd = TransformableRDD(rdd, 'csv')
    imputed_rdd = transformable_rdd.impute(1, ApproxMeanSubstitution())
    # assertIn shows the collected rows on failure.
    self.assertIn("Soa,33.0", imputed_rdd.collect())
def test_list_facets_should_give_facets_of_given_column_indexes(self):
    """list_facets([0, 1, 2]) facets rows on the three columns combined.

    The first entry of highest() is expected to carry "Ram,23,Male" as
    its first tuple element (read through `_1()`).
    """
    rows = ["Ram,23,Male", "Ram,23,Male", "Jill,45,Female", "Soa,,Female,"]
    source = TransformableRDD(self.sc.parallelize(rows), 'csv')
    facets = source.list_facets([0, 1, 2])
    top_entry = facets.highest()[0]
    self.assertEqual("Ram,23,Male", top_entry._1())
def test_transformableRDD_can_collect_all_the_elements(self):
    """collect() returns every input row, in the original order."""
    rdd = self.sc.parallelize(["2", "3", "4", "5", "6", "7", "7", "7"])
    transformable_rdd = TransformableRDD(rdd, 'csv')
    # assertEqual: the assertEquals alias is deprecated and removed in 3.12.
    self.assertEqual(["2", "3", "4", "5", "6", "7", "7", "7"],
                     transformable_rdd.collect())
def test_list_facets_should_give_facets(self):
    """list_facets_of(0) yields one facet per distinct column-0 value.

    Column 0 only contains "X" and "A", so a facet count of 2 is expected.
    """
    initial_dataset = self.sc.parallelize(["X,Y", "A,B", "X,Z", "A,Q", "A,E"])
    transformable_rdd = TransformableRDD(initial_dataset)
    text_facets = transformable_rdd.list_facets_of(0)
    # assertEqual: the assertEquals alias is deprecated and removed in 3.12.
    self.assertEqual(2, text_facets.count())
def test_number_of_column_should_give_number_of_column_of_dataset(self):
    """number_of_columns() reports 2 for a dataset of two-field rows."""
    rows = ["1,2", "1,2", "1,3"]
    source = TransformableRDD(self.sc.parallelize(rows), "csv")
    column_count = source.number_of_columns()
    self.assertEqual(2, column_count)
def test_should_give_highest_of_facets(self):
    """highest() on the column-0 facets puts "A" first.

    "A" appears three times in column 0 and "X" twice; the first tuple
    element (read through `_1()`) of the first entry is expected to be "A".
    """
    rows = ["X,Y", "A,B", "X,Z", "A,Q", "A,E"]
    source = TransformableRDD(self.sc.parallelize(rows))
    facets = source.list_facets_of(0)
    top_entry = facets.highest()[0]
    self.assertEqual("A", top_entry._1())
def test_exception_for_text_Facets(self):
    """list_facets_of(4) on a two-column dataset raises ApplicationException."""
    rows = ["1,2", "1,2", "1,3"]
    source = TransformableRDD(self.sc.parallelize(rows), "csv")
    with self.assertRaises(ApplicationException):
        source.list_facets_of(4)
def test_transformableRDD_can_impute_the_missing_values_by_UnivariateLinearRegressionSubstitution(self):
    """impute(1, UnivariateLinearRegressionSubstitution(0)) fills column 1.

    The row "64," has an empty column 1; after imputation the row
    "64,4.06" is expected in the collected output.
    """
    rdd = self.sc.parallelize(["60,3.1", "61,3.6", "62,3.8", "63,4", "65,4.1", "64,"])
    transformable_rdd = TransformableRDD(rdd, 'csv')
    imputed_rdd = transformable_rdd.impute(1, UnivariateLinearRegressionSubstitution(0))
    # assertIn shows the collected rows on failure.
    self.assertIn("64,4.06", imputed_rdd.collect())