Example #1
    def test_duplicate_rows_in_target_are_deduped(self, mock_job):
        glue_context = GlueContext(self.spark)
        self.__mock_staging(glue_context, [
            {
                "id": "02",
                "firstname": "Bob from staging",
                "modifieddate": "2019-01-02T00:40:32Z"
            }
        ])
        self.__mock_existing_target(glue_context, [
            {
                "id": "01",
                "first_name": "John",
                "modified_date": datetime.fromisoformat("2019-01-01T00:40:32+00:00")
            },
            {
                "id": "02",
                "first_name": "Bob",
                "modified_date": datetime.fromisoformat("2019-01-01T00:40:32+00:00")
            },
            {
                "id": "01",
                "first_name": "Bill",
                "modified_date": datetime.fromisoformat("2019-01-02T00:40:32+00:00")
            }
        ])
        glue_context.write_dynamic_frame_from_options = MagicMock()
        glue_context.purge_s3_path = MagicMock()

        merge_into_customer_dim.main(self.argv, glue_context, mock_job)

        expected_df = self.spark.createDataFrame([
                ["01", "Bill", None, None, None, datetime.fromisoformat("2019-01-02T00:40:32+00:00")],
                ["02", "Bob from staging", None, None, None, datetime.fromisoformat("2019-01-02T00:40:32+00:00")]
            ], schema=self.output_schema)

        write_args, write_kwargs = glue_context.write_dynamic_frame_from_options.call_args
        self.assert_dataframe_equal(write_kwargs['frame'].toDF(), expected_df, ["id"])
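
The merge logic under test is not part of this excerpt. As a sketch of the kind of dedup step the assertion implies, keeping only the newest row per id, something like the following would work in PySpark (the window/row_number approach and the dedupe_latest name are assumptions, not taken from merge_into_customer_dim):

from pyspark.sql import Window
from pyspark.sql import functions as F

def dedupe_latest(df):
    # Hypothetical sketch: rank the rows within each id by modified_date,
    # newest first, and keep only the top-ranked row. This mirrors the
    # behaviour the test asserts, not necessarily the job's actual code.
    w = Window.partitionBy("id").orderBy(F.col("modified_date").desc())
    return (df.withColumn("_rn", F.row_number().over(w))
              .filter(F.col("_rn") == 1)
              .drop("_rn"))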
Example #2
    def test_the_target_path_is_purged(self, mock_job):
        glue_context = GlueContext(self.spark)
        self.__mock_staging(glue_context, [
                {
                    "id": "01",
                    "firstname": "John",
                    "lastname": "Smith",
                    "birthdate": "1990-01-01",
                    "zipcode": "12345",
                    "modifieddate": "2019-01-01T00:40:32Z",
                }
            ])
        self.__mock_existing_target(glue_context, [])
        glue_context.write_dynamic_frame_from_options = MagicMock()
        glue_context.purge_s3_path = MagicMock()

        merge_into_customer_dim.main(self.argv, glue_context, mock_job)

        glue_context.purge_s3_path.assert_called_with(
            s3_path="s3://ut_target_path",
            options={"retentionPeriod": 0}
        )
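
A detail worth pinning down in this assertion: purge_s3_path interprets retentionPeriod in hours and defaults to 168, so a purge with default options would leave any file written in the last week in place. Asserting the exact options dict guards against the job silently falling back to that one-week default.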
Example #3
    def test_no_existing_output(self, mock_job):
        glue_context = GlueContext(self.spark)
        self.__mock_staging(glue_context, [
                {
                    "id": "01",
                    "firstname": "John",
                    "lastname": "Smith",
                    "birthdate": "1990-01-01",
                    "zipcode": "12345",
                    "modifieddate": "2019-01-01T00:40:32Z",
                }
            ])
        self.__mock_existing_target(glue_context, [])
        glue_context.write_dynamic_frame_from_options = MagicMock()
        glue_context.purge_s3_path = MagicMock()

        merge_into_customer_dim.main(self.argv, glue_context, mock_job)

        expected_df = self.spark.createDataFrame([
                ["01", "John", "Smith", date(1990, 1, 1), "12345", datetime.fromisoformat("2019-01-01T00:40:32+00:00")]
            ], schema=self.output_schema)

        write_args, write_kwargs = glue_context.write_dynamic_frame_from_options.call_args
        self.assert_dataframe_equal(write_kwargs['frame'].toDF(), expected_df, ["id"])
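
The __mock_staging and __mock_existing_target helpers referenced throughout are not shown in these excerpts. A minimal sketch of one plausible implementation, assuming the job reads its staging input via create_dynamic_frame_from_options (the real helpers may stub a different read method):

    def __mock_staging(self, glue_context, rows):
        # Hypothetical helper: build an in-memory DynamicFrame from plain
        # dicts and have the GlueContext return it instead of reading S3.
        from unittest.mock import MagicMock
        from awsglue.dynamicframe import DynamicFrame

        df = self.spark.createDataFrame(rows)
        dyf = DynamicFrame.fromDF(df, glue_context, "staging")
        glue_context.create_dynamic_frame_from_options = MagicMock(return_value=dyf)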
Example #4
partition_by = args['partition_by']
bookmark = args['bookmark']

# Get glue catalog table data for mapping
glue = boto3.client('glue', region_name='us-west-2')
table = glue.get_table(DatabaseName=glue_db, Name=table_name).get('Table')
table_storage_descriptor = table.get('StorageDescriptor')
table_columns = table_storage_descriptor.get('Columns')
mapping = []
for column in table_columns:
    mapping.append(
        (column['Name'], column['Type'], column['Name'], column['Type']))
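# Each mapping tuple is (source_name, source_type, target_name, target_type);
# source and target match here, so this is a pass-through mapping that pins
# the output to the catalog schema.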

# Delete the previous run's S3 files
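# NOTE: with no options dict, the default retentionPeriod of 168 hours
# applies, so objects written within the last week are not deleted;
# pass options={"retentionPeriod": 0} to purge everything.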
if bookmark == 'N':
    glueContext.purge_s3_path(s3_path)

# Set connection options; job arguments arrive as strings, so an unset
# partition key is passed as the literal string 'None'
if partition_by == 'None':
    connection_options = {"path": s3_path}
else:
    connection_options = {"path": s3_path, "partitionKeys": [partition_by]}

# Run glue job
ctx = table_name
job.init(args['JOB_NAME'], args)
datasource0 = glueContext.create_dynamic_frame.from_catalog(
    database=glue_db,
    table_name=table_name,
    transformation_ctx="datasource0" + ctx)
applymapping1 = ApplyMapping.apply(
    frame=datasource0,
    mappings=mapping,
    transformation_ctx="applymapping1" + ctx)
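
The excerpt ends mid-pipeline. A plausible continuation, sketched here rather than taken from the original script (the datasink2 name and the Parquet format are assumptions), writes the mapped frame using the connection options assembled above and commits the job so the bookmark advances:

# Hypothetical continuation (not part of the original excerpt): write the
# mapped frame to S3 with the connection options built earlier, then
# commit so the job bookmark records this run.
datasink2 = glueContext.write_dynamic_frame.from_options(
    frame=applymapping1,
    connection_type="s3",
    connection_options=connection_options,
    format="parquet",
    transformation_ctx="datasink2" + ctx)
job.commit()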