def test_should_return_same_df_when_columns_param_is_empty(self):
    """Check that ``split_content("", [])`` keeps the fixture's column names."""
    pipeline = Transformation(self.test_data)
    pipeline.split_content("", [])

    columns_after = pipeline.dataframe.columns
    columns_before = self.test_data.columns

    self.assertEqual(columns_after, columns_before)
def test_should_return_same_df_when_columns_param_is_empty(self):
    """Check that ``replace_content`` with an empty mapping keeps the rows.

    The collected rows of the transformed DataFrame are compared with the
    collected rows of the fixture.
    """
    pipeline = Transformation(self.test_data)
    pipeline.replace_content("mag", {})

    rows_after = pipeline.dataframe.collect()
    rows_before = self.test_data.collect()

    self.assertEqual(rows_after, rows_before)
def test_should_return_same_df_when_column_not_exists(self):
    """Check that splitting the ``"time"`` column keeps the column names.

    NOTE(review): ``"time"`` is presumably absent from the fixture, as the
    test name suggests -- verify against the fixture setup.
    """
    new_columns = ["day", "month", "year"]
    pipeline = Transformation(self.test_data)
    pipeline.split_content("time", new_columns)

    self.assertEqual(pipeline.dataframe.columns, self.test_data.columns)
def test_should_return_same_df_when_columns_params_is_empty(self):
    """Check that ``convert_data_type({})`` keeps the fixture's dtypes."""
    pipeline = Transformation(self.test_data)
    pipeline.convert_data_type({})

    dtypes_after = pipeline.dataframe.dtypes
    dtypes_before = self.test_data.dtypes

    self.assertEqual(dtypes_after, dtypes_before)
def test_should_return_same_df_when_columns_not_exists(self):
    """Check that converting the ``"dt"`` column keeps the fixture's dtypes.

    NOTE(review): ``"dt"`` is presumably absent from the fixture, as the
    test name suggests -- verify against the fixture setup.
    """
    requested_types = {"dt": DateType()}
    pipeline = Transformation(self.test_data)
    pipeline.convert_data_type(requested_types)

    self.assertEqual(pipeline.dataframe.dtypes, self.test_data.dtypes)
def test_should_return_same_df_when_columns_not_exists_in_df(self):
    """Check that filling nulls in ``"magnitude"`` keeps the fixture's rows.

    NOTE(review): ``"magnitude"`` is presumably absent from the fixture, as
    the test name suggests -- verify against the fixture setup.
    """
    defaults = {"magnitude": 0}
    pipeline = Transformation(self.test_data)
    pipeline.replace_null_values(defaults)

    rows_after = pipeline.dataframe.collect()
    rows_before = self.test_data.collect()

    self.assertEqual(rows_after, rows_before)
def test_should_return_same_columns_when_column_not_exist_in_df(self):
    """Check that renaming ``"dt"`` to ``"date"`` keeps the column names.

    NOTE(review): ``"dt"`` is presumably absent from the fixture, as the
    test name suggests -- verify against the fixture setup.
    """
    pipeline = Transformation(self.test_data)
    pipeline.rename({"dt": "date"})

    columns_after = pipeline.dataframe.columns
    columns_before = self.test_data.columns

    self.assertEqual(columns_after, columns_before)
def test_should_return_same_columns_when_column_param_is_empty(self):
    """Check that ``rename({})`` keeps the fixture's column names."""
    pipeline = Transformation(self.test_data)
    pipeline.rename({})

    self.assertEqual(pipeline.dataframe.columns, self.test_data.columns)
def test_should_replace_two_columns_name(self):
    """Check that ``rename`` applies both entries of the mapping.

    ``"mag"`` becomes ``"magnitude"`` and ``"status"`` becomes
    ``"new_status"``; the expected column list keeps the other names and
    the same ordering.
    """
    renames = {"mag": "magnitude", "status": "new_status"}
    pipeline = Transformation(self.test_data)
    pipeline.rename(renames)

    columns_after = pipeline.dataframe.columns
    wanted_columns = [
        "date",
        "place",
        "magnitude",
        "new_status",
        "coordinates",
        "alert",
    ]

    self.assertEqual(columns_after, wanted_columns)
def test_should_convert_data_type_one_column(self):
    """Check that converting ``"mag"`` to ``IntegerType`` yields dtype ``int``.

    The full ``dtypes`` list is compared, so the remaining columns must
    match the expected types listed below.
    """
    pipeline = Transformation(self.test_data)
    pipeline.convert_data_type({"mag": IntegerType()})

    dtypes_after = pipeline.dataframe.dtypes
    wanted_dtypes = [
        ("date", "bigint"),
        ("place", "string"),
        ("mag", "int"),
        ("status", "string"),
        ("coordinates", "array<double>"),
        ("alert", "string"),
    ]

    self.assertEqual(dtypes_after, wanted_dtypes)
def test_should_split_column_content_into_three_new_columns(self):
    """Check that splitting ``"coordinates"`` produces three named columns.

    The comparison uses ``assertCountEqual``, so only the set of column
    names (with multiplicity) is checked, not their order.
    """
    targets = ["longitude", "latitude", "depth"]
    pipeline = Transformation(self.test_data)
    pipeline.split_content("coordinates", targets)

    columns_after = pipeline.dataframe.columns
    wanted_columns = [
        "date",
        "place",
        "mag",
        "status",
        "longitude",
        "latitude",
        "depth",
        "alert",
    ]

    self.assertCountEqual(columns_after, wanted_columns)
def test_should_convert_data_type_two_columns_when_one_column_name_not_exists(
        self):
    """Check ``convert_data_type`` with a three-entry type mapping.

    ``"mag"``, ``"coordinates"`` and ``"date"`` are converted and the full
    ``dtypes`` list is compared with the expected types below.

    NOTE(review): the test name mentions a missing column, but all three
    requested columns appear converted in the expected dtypes -- confirm
    the intended scenario.
    """
    requested_types = {
        "mag": IntegerType(),
        "coordinates": ArrayType(StringType()),
        "date": TimestampType(),
    }
    pipeline = Transformation(self.test_data)
    pipeline.convert_data_type(requested_types)

    dtypes_after = pipeline.dataframe.dtypes
    wanted_dtypes = [
        ("date", "timestamp"),
        ("place", "string"),
        ("mag", "int"),
        ("status", "string"),
        ("coordinates", "array<string>"),
        ("alert", "string"),
    ]

    self.assertEqual(dtypes_after, wanted_dtypes)
def test_should_remove_two_columns_from_dataframe(self):
    """Check that ``drop`` removes ``"coordinates"`` and ``"alert"``.

    Only the column names are asserted, so the expectation is the plain
    list of remaining names. Building a Spark DataFrame just to read back
    the column names it was given adds work without strengthening the
    check.
    """
    transformation = Transformation(self.test_data)
    transformation.drop(["coordinates", "alert"])

    current_result = transformation.dataframe.columns
    expected_result = ["date", "place", "mag", "status"]

    self.assertEqual(current_result, expected_result)
def test_should_return_transformed_data_using_all_pipeline_components(
        self, mock_get_data):
    """Run extraction, transformation and CSV loading end to end.

    The extracted data is turned into a DataFrame, passed through every
    ``Transformation`` step, written through ``CsvStorage`` and then read
    back from ``OUTPUT_FILEPATH`` for comparison with
    ``FAKE_EXPECTED_DATA``.

    Args:
        mock_get_data: Mock supplied to the test (presumably by a patch
            decorator outside this block -- verify); its return value is
            set to ``FAKE_INPUT_DATA``.
    """
    self.create_tmp_folder()

    # Extraction: the mocked data source returns the fake payload.
    fake_api_input = ApiInput(self.FAKE_URL)
    mock_get_data.return_value = self.FAKE_INPUT_DATA
    extraction_process = Extraction(fake_api_input)
    extraction_process.extract()
    raw_data = extraction_process.data
    raw_df = self.spark.createDataFrame(
        raw_data,
        ["date", "place", "mag", "status", "coordinates", "alert"])

    # Transformation: apply every available step once.
    transformation_process = Transformation(raw_df)
    transformation_process.drop(["alert"])
    transformation_process.rename({"mag": "magnitude", "place": "city"})
    transformation_process.replace_null_values({"status": "Automatic"})
    transformation_process.lowercase(["status"])
    transformation_process.convert_data_type({"date": IntegerType()})
    transformation_process.split_content(
        "coordinates", ["longitude", "latitude", "depth"])
    transformed_df = transformation_process.dataframe

    # Loading: write the transformed data as CSV.
    csv_storage = CsvStorage(self.OUTPUT_FILEPATH)
    loading_process = Loading(csv_storage)
    loading_process.load(transformed_df)

    # The cleanup runs even when reading back or the assertion fails, so a
    # failing run does not leave the output behind for later tests. It is
    # only reached after ``load`` has returned.
    try:
        current_result = self.spark \
            .read \
            .csv(self.OUTPUT_FILEPATH, header=True, inferSchema=True) \
            .collect()
        expected_result = self.spark \
            .createDataFrame(self.FAKE_EXPECTED_DATA,
                             ["date", "city", "magnitude", "status",
                              "longitude", "latitude", "depth"]) \
            .collect()
        self.assertEqual(current_result, expected_result)
    finally:
        self.delete_test_file()