def test_duplicate_rows_in_target_are_deduped(self, mock_job):
    glue_context = GlueContext(self.spark)
    self.__mock_staging(glue_context, [
        {
            "id": "02",
            "firstname": "Bob from staging",
            "modifieddate": "2019-01-02T00:40:32Z"
        }
    ])
    self.__mock_existing_target(glue_context, [
        {
            "id": "01",
            "first_name": "John",
            "modified_date": datetime.fromisoformat("2019-01-01T00:40:32+00:00")
        },
        {
            "id": "02",
            "first_name": "Bob",
            "modified_date": datetime.fromisoformat("2019-01-01T00:40:32+00:00")
        },
        {
            "id": "01",
            "first_name": "Bill",
            "modified_date": datetime.fromisoformat("2019-01-02T00:40:32+00:00")
        }
    ])
    glue_context.write_dynamic_frame_from_options = MagicMock()
    glue_context.purge_s3_path = MagicMock()

    merge_into_customer_dim.main(self.argv, glue_context, mock_job)

    # Only the most recent version of each id should survive the merge
    expected_df = self.spark.createDataFrame([
        ["01", "Bill", None, None, None,
         datetime.fromisoformat("2019-01-02T00:40:32+00:00")],
        ["02", "Bob from staging", None, None, None,
         datetime.fromisoformat("2019-01-02T00:40:32+00:00")]
    ], schema=self.output_schema)
    write_args, write_kwargs = glue_context.write_dynamic_frame_from_options.call_args
    self.assert_dataframe_equal(write_kwargs['frame'].toDF(), expected_df, ["id"])
def test_the_target_path_is_purged(self, mock_job):
    glue_context = GlueContext(self.spark)
    self.__mock_staging(glue_context, [
        {
            "id": "01",
            "firstname": "John",
            "lastname": "Smith",
            "birthdate": "1990-01-01",
            "zipcode": "12345",
            "modifieddate": "2019-01-01T00:40:32Z",
        }
    ])
    self.__mock_existing_target(glue_context, [])
    glue_context.write_dynamic_frame_from_options = MagicMock()
    glue_context.purge_s3_path = MagicMock()

    merge_into_customer_dim.main(self.argv, glue_context, mock_job)

    glue_context.purge_s3_path.assert_called_with(
        s3_path="s3://ut_target_path",
        options={"retentionPeriod": 0}
    )
def test_no_existing_output(self, mock_job):
    glue_context = GlueContext(self.spark)
    self.__mock_staging(glue_context, [
        {
            "id": "01",
            "firstname": "John",
            "lastname": "Smith",
            "birthdate": "1990-01-01",
            "zipcode": "12345",
            "modifieddate": "2019-01-01T00:40:32Z",
        }
    ])
    self.__mock_existing_target(glue_context, [])
    glue_context.write_dynamic_frame_from_options = MagicMock()
    glue_context.purge_s3_path = MagicMock()

    merge_into_customer_dim.main(self.argv, glue_context, mock_job)

    # With no existing target, the staging row should be written as-is
    expected_df = self.spark.createDataFrame([
        ["01", "John", "Smith", date(1990, 1, 1), "12345",
         datetime.fromisoformat("2019-01-01T00:40:32+00:00")]
    ], schema=self.output_schema)
    write_args, write_kwargs = glue_context.write_dynamic_frame_from_options.call_args
    self.assert_dataframe_equal(write_kwargs['frame'].toDF(), expected_df, ["id"])
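# The tests above call two private helpers, __mock_staging and
# __mock_existing_target, that are not shown in this excerpt. Below is a
# minimal sketch of what they might look like, under two assumptions that
# are mine, not the author's: the job reads staging data via
# create_dynamic_frame.from_catalog and the existing target via
# create_dynamic_frame_from_options, and fixtures are wrapped with
# DynamicFrame.fromDF so the job receives the same type it would in
# production.
from unittest.mock import MagicMock  # likely already imported by the test module

from awsglue.dynamicframe import DynamicFrame


def __mock_staging(self, glue_context, rows):
    # Staging fixtures become the frame returned by the catalog read;
    # the schema is inferred from the fixture dicts (hypothetical)
    df = self.spark.createDataFrame(rows)
    frame = DynamicFrame.fromDF(df, glue_context, "staging")
    glue_context.create_dynamic_frame.from_catalog = MagicMock(return_value=frame)


def __mock_existing_target(self, glue_context, rows):
    # Existing-target fixtures become the frame returned by the S3 read.
    # An explicit schema lets an empty fixture list still produce a valid,
    # zero-row DataFrame; dict keys missing from the schema map to None.
    df = self.spark.createDataFrame(rows, schema=self.output_schema)
    frame = DynamicFrame.fromDF(df, glue_context, "target")
    glue_context.create_dynamic_frame_from_options = MagicMock(return_value=frame)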
partition_by = args['partition_by']
bookmark = args['bookmark']

# Build the ApplyMapping spec from the Glue catalog table definition:
# each entry is (source_name, source_type, target_name, target_type)
glue = boto3.client('glue', region_name='us-west-2')
table = glue.get_table(DatabaseName=glue_db, Name=table_name).get('Table')
table_columns = table.get('StorageDescriptor').get('Columns')
mapping = []
for column in table_columns:
    mapping.append((column['Name'], column['Type'], column['Name'], column['Type']))

# Delete the prior run's S3 files when job bookmarks are disabled
if bookmark == 'N':
    glueContext.purge_s3_path(s3_path)

# Set connection options, partitioning the output if a key was given
if partition_by == 'None':
    connection_options = {"path": s3_path}
else:
    connection_options = {"path": s3_path, "partitionKeys": [partition_by]}

# Run the Glue job
ctx = table_name
job.init(args['JOB_NAME'], args)
datasource0 = glueContext.create_dynamic_frame.from_catalog(
    database=glue_db,
    table_name=table_name,
    transformation_ctx="datasource0" + ctx)
applymapping1 = ApplyMapping.apply(frame=datasource0,