'''

# Standard AWS Glue job boilerplate imports
import sys

from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.dynamicframe import DynamicFrame
from awsglue.job import Job

## @params: [JOB_NAME]
args = getResolvedOptions(sys.argv, ['JOB_NAME'])

sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)
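
# The sparkSqlQuery helper and the SqlQuery0 string used below are not shown in
# this excerpt; SqlQuery0 holds the SQL text that selects from the alias
# "myDataSource". A minimal sketch of the usual Glue Studio-generated helper,
# assuming its standard signature:
def sparkSqlQuery(glueContext, query, mapping, transformation_ctx):
    # Register each input DynamicFrame under its SQL alias, run the Spark SQL
    # query, and wrap the result back into a DynamicFrame.
    for alias, frame in mapping.items():
        frame.toDF().createOrReplaceTempView(alias)
    result = spark.sql(query)
    return DynamicFrame.fromDF(result, glueContext, transformation_ctx)
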
## @type: DataSource
## @args: [connection_type = "s3", format = "parquet", connection_options = {"paths": ["s3://bond-s3-forspark/invoice-application/"], "recurse":True}, transformation_ctx = "DataSource0"]
## @return: DataSource0
## @inputs: []
DataSource0 = glueContext.create_dynamic_frame.from_options(connection_type = "s3", format = "parquet", connection_options = {"paths": ["s3://bond-s3-forspark/invoice-application/"], "recurse":True}, transformation_ctx = "DataSource0")
## @type: SqlCode
## @args: [sqlAliases = {"myDataSource": DataSource0}, sqlName = SqlQuery0, transformation_ctx = "Transform0"]
## @return: Transform0
## @inputs: [dfc = DataSource0]
Transform0 = sparkSqlQuery(glueContext, query = SqlQuery0, mapping = {"myDataSource": DataSource0}, transformation_ctx = "Transform0")
## @type: DataSink
## @args: [connection_type = "s3", catalog_database_name = "frillmart", format = "glueparquet", connection_options = {"path": "s3://bond-s3-forspark/invoice-sandbox/", "partitionKeys": ["Country"], "enableUpdateCatalog":true, "updateBehavior":"UPDATE_IN_DATABASE"}, catalog_table_name = "invoice_sandbox", transformation_ctx = "DataSink0"]
## @return: DataSink0
## @inputs: [frame = Transform0]
DataSink0 = glueContext.getSink(path = "s3://bond-s3-forspark/invoice-sandbox/", connection_type = "s3", updateBehavior = "UPDATE_IN_DATABASE", partitionKeys = ["Country"], enableUpdateCatalog = True, transformation_ctx = "DataSink0")
DataSink0.setCatalogInfo(catalogDatabase = "frillmart", catalogTableName = "invoice_sandbox")
DataSink0.setFormat("glueparquet")
DataSink0.writeFrame(Transform0)

job.commit()

Example #2
]).toDF().dropDuplicates(['archiveid'])

rownum = inventory.withColumn(
    "row_num",
    row_number().over(
        Window.orderBy(inventory['creationdate'],
                       inventory['archiveid'])).cast("long"))
merged = rownum.join(mapped, "archiveid", how='left_outer')

frame = DynamicFrame.fromDF(merged, glueContext, "merged")


def transform(rec):
    # Assign each record to a fixed-size bucket that becomes the output partition key.
    rec["part"] = rec["row_num"] // partiton_size
    # Prefer a non-empty override for the archive description, then drop the override field.
    override = rec.get("override")
    if override and override.strip():
        rec["archivedescription"] = override
    rec.pop("override", None)
    return rec


trans0 = Map.apply(frame=frame, f=transform)

sink = glueContext.getSink(connection_type="s3",
                           path='s3://' + STAGING_BUCKET + '/partitioned/',
                           enableUpdateCatalog=True,
                           partitionKeys=["part"])
sink.setFormat("glueparquet")
sink.setCatalogInfo(catalogDatabase=DATABASE, catalogTableName=OUTPUT_TABLE)
sink.writeFrame(trans0)

job.commit()
Example #3
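# `mapping` is a list of (source_name, source_type, target_name, target_type)
# tuples, here built from the frame's schema fields; apply_mapping below uses it
# to rename and re-type the columns.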
                            field.dataType.typeName()))
    dyf = dyf.apply_mapping(mapping)

    # Add partition columns
    df = dyf.toDF()
    if 'year' in partition_keys:
        df = df.withColumn('year', year(timestamp_column_name))
    if 'month' in partition_keys:
        df = df.withColumn('month', month(timestamp_column_name))
    if 'day' in partition_keys:
        df = df.withColumn('day', dayofmonth(timestamp_column_name))
    if 'hour' in partition_keys:
        df = df.withColumn('hour', hour(timestamp_column_name))
    if 'minute' in partition_keys:
        df = df.withColumn('minute', minute(timestamp_column_name))

    # Drop the temporary timestamp column; DataFrame.drop returns a new DataFrame.
    df = df.drop(col(tmp_timestamp_column_name))
    dyf = DynamicFrame.fromDF(df, glue_context, "add_partitions")

# Write DynamicFrame to S3 in glueparquet format
sink = glue_context.getSink(connection_type="s3",
                            path=output_path,
                            enableUpdateCatalog=True,
                            partitionKeys=partition_keys)
sink.setFormat("glueparquet")
sink.setCatalogInfo(catalogDatabase=output_database,
                    catalogTableName=output_table)
sink.writeFrame(dyf)

job.commit()

Example #4
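
# The boto3 `glue` client and `table_to_create` used below are not shown in this
# excerpt. A minimal sketch, with a hypothetical column schema, of a governed-table
# TableInput for glue.create_table():
import boto3

glue = boto3.client("glue")
table_to_create = {
    "Name": output_table,
    "TableType": "GOVERNED",  # Lake Formation governed table
    "PartitionKeys": [{"Name": k, "Type": "string"} for k in partition_key_names],  # types assumed
    "StorageDescriptor": {
        "Columns": [{"Name": "example_column", "Type": "string"}],  # hypothetical schema
        "Location": output_path,
        "InputFormat": "org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat",
        "OutputFormat": "org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat",
        "SerdeInfo": {
            "SerializationLibrary": "org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe"
        },
    },
}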
try:
    glue.create_table(DatabaseName=output_database, TableInput=table_to_create)
except Exception as e:
    print(
        f"Failed to create destination table. If {output_database}.{output_table} already exists, please remove it."
    )
    print(e)
    raise

# Begin Lake Formation transaction
tx_id = glue_context.start_transaction(read_only=False)

# Write DynamicFrame into Lake Formation governed table using transaction
sink = glue_context.getSink(connection_type="s3",
                            path=output_path,
                            enableUpdateCatalog=True,
                            partitionKeys=partition_key_names,
                            transactionId=tx_id)
sink.setFormat("glueparquet")
sink.setCatalogInfo(catalogDatabase=output_database,
                    catalogTableName=output_table)

try:
    sink.writeFrame(dyf)
    glue_context.commit_transaction(tx_id)
except Exception:
    glue_context.cancel_transaction(tx_id)
    raise

job.commit()
Example #5
 
glueContext.write_dynamic_frame.from_options(frame = l_history,
          connection_type = "s3",
          connection_options = {"path": "s3://leeys-virginia/output-dir/legislator_history"},
          format = "parquet") 
 
 
response = glueContext.write_dynamic_frame.from_jdbc_conf(frame = df_mapped,
                                                   catalog_connection = "red1",
                                                   connection_options = {"dbtable": "public.test1", "database": "leeysdb"},
                                                   redshift_tmp_dir = "s3://leeys-virginia/glue/temp-dir/")
 
 
response = glueContext.write_dynamic_frame.from_jdbc_conf(frame = df_mapped,
                                                   catalog_connection = "red1",
                                                   connection_options = {"dbtable": "public.test1",
                                                                         "database": "leeysdb" ,
                                                                         "preactions":"delete from public.test1;",
                                                                         "extracopyoptions":"MAXERROR 2"},
                                                   redshift_tmp_dir = "s3://leeys-virginia/glue/temp-dir/")
 
datasink4 = glueContext.write_dynamic_frame.from_options(frame = addpartitionfield,
                                                   connection_type = "s3",
                                                   connection_options = {"path": "s3://t1-saas-insights-dev-ap-southeast-2/glue/processed",
                                                                         "partitionKeys": ["date_p"]},
                                                   format = "parquet",
                                                   transformation_ctx = "datasink4")
 
 
 
 
#### use getSink rather than write_dynamic_frame
data_sink = glueContext.getSink(connection_type = "s3", path = "s3://leeys-virginia/glue/temp/", format = "json")
# (or, equivalently, set the format afterwards: data_sink.setFormat("json"))
data_sink.writeFrame(myFrame)
Example #6
import sys
from datetime import datetime, timezone

from awsglue.context import GlueContext
from awsglue.dynamicframe import DynamicFrame
from awsglue.job import Job
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from pyspark.sql import functions as f
from pyspark.sql.functions import (col, from_unixtime, to_date, to_timestamp,
                                   unix_timestamp)


def main():
    ## @params: [JOB_NAME, raw_db_name, clean_db_name, source_entity_name, target_entity_name, datetime_column, date_column, partition_column, output_bucket_name]
    args = getResolvedOptions(sys.argv, [
        'JOB_NAME', 'raw_db_name', 'clean_db_name', 'source_entity_name',
        'target_entity_name', 'datetime_column', 'date_column',
        'partition_column', 'output_bucket_name'
    ])
    job_name = args['JOB_NAME']
    raw_db_name = args['raw_db_name']
    clean_db_name = args['clean_db_name']
    source_entity_name = args['source_entity_name']
    target_entity_name = args['target_entity_name']
    partition_column = args['partition_column']
    datetime_column = args['datetime_column']
    date_column = args['date_column']
    output_bucket_name = args['output_bucket_name']

    # Constants derived from parameters
    raw_table_name = source_entity_name
    clean_table_name = target_entity_name

    processing_start_datetime = datetime.now(timezone.utc)

    # Initialization of contexts and job
    glue_context = GlueContext(SparkContext.getOrCreate())
    job = Job(glue_context)
    job.init(job_name, args)

    ## @type: DataSource
    ## @args: [database = "<db_name>", table_name = "raw_<entity_name>", transformation_ctx = "raw_data"]
    ## @return: raw_data
    ## @inputs: []
    raw_data: DynamicFrame = glue_context.create_dynamic_frame.from_catalog(
        database=raw_db_name,
        table_name=raw_table_name,
        transformation_ctx="raw_data")

    # Terminate early if there is no data to process
    if raw_data.toDF().head() is None:
        job.commit()
        return

    ## @type: CleanDataset
    ## @args: []
    ## @return: cleaned_data
    ## @inputs: [frame = raw_data]
    input_data = raw_data.toDF()
    cleaned_data = input_data.select(*[
        from_unixtime(c).alias(c) if c == 'processing_datetime' else col(c)
        for c in input_data.columns
    ])
    cleaned_data = cleaned_data.select(*[
        to_timestamp(c).alias(c) if c.endswith('_datetime') else col(c)
        for c in input_data.columns
    ])
    cleaned_data = cleaned_data.select(*[
        to_date(c).alias(c) if c.endswith('_date') else col(c)
        for c in input_data.columns
    ])
    cleaned_data = cleaned_data.select(*[
        col(c).cast('string').alias(c) if c == 'zip' else col(c)
        for c in input_data.columns
    ])
    cleaned_data = cleaned_data.select(*[
        col(c).cast('decimal(15,2)').alias(c) if dict(input_data.dtypes)[c] ==
        'double' else col(c) for c in input_data.columns
    ])

    ## @type: EnrichDataset
    ## @args: []
    ## @return: enriched_data
    ## @inputs: [frame = cleaned_data]
    enriched_data = cleaned_data.withColumn('etl_processing_datetime', unix_timestamp(f.lit(processing_start_datetime), 'yyyy-MM-dd HH:mm:ss').cast("timestamp")) \
        .withColumn(date_column, f.date_format(f.col(datetime_column), "yyyy-MM-dd").cast("date"))

    ## @type: DataSink
    ## @args: [connection_type = "s3", connection_options = {"path": "s3://<output_bucket_name>/clean/<entity_name>", "enableUpdateCatalog": "True", "updateBehavior": "UPDATE_IN_DATABASE", "partitionKeys" : "[<partition_key>]"}, format = "glueparquet"]
    ## @return: sink
    ## @inputs: [frame = enriched_data]

    sink = glue_context.getSink(connection_type="s3",
                                path="s3://" + output_bucket_name + "/" +
                                clean_table_name,
                                enableUpdateCatalog=True,
                                updateBehavior="UPDATE_IN_DATABASE",
                                partitionKeys=[partition_column])
    sink.setFormat("glueparquet")
    sink.setCatalogInfo(catalogDatabase=clean_db_name,
                        catalogTableName=clean_table_name)
    sink.writeFrame(DynamicFrame.fromDF(enriched_data, glue_context, 'result'))

    job.commit()
Example #7
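# The applyTransform function used below is not shown in this excerpt. A minimal
# sketch, assuming each record carries an epoch-seconds field named "timestamp"
# (hypothetical name), that rebuilds the year/month/day/hour partition values
# written out further down:
from datetime import datetime, timezone

def applyTransform(rec):
    ts = datetime.fromtimestamp(rec["timestamp"], tz=timezone.utc)  # hypothetical field name
    rec["year"] = ts.year
    rec["month"] = ts.month
    rec["day"] = ts.day
    rec["hour"] = ts.hour
    return rec
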
# Maps a transformation function over each record to re-build date partitions using the timestamp
# rather than the Firehose ingestion timestamp
transformed_pose_data = Map.apply(frame=resolved_pose_data, f=applyTransform)

print("---- Processed data schema: ----")
transformed_pose_data.printSchema()
record_count = transformed_pose_data.count()
print("Processed record count: {}".format(record_count))

# Avoid errors if Glue Job Bookmark detects no new data to process and records = 0.
if record_count > 0:
    try:
        sink = glueContext.getSink(
            connection_type="s3",
            path=analytics_bucket_output,
            enableUpdateCatalog=True,
            updateBehavior="UPDATE_IN_DATABASE",
            partitionKeys=["year", "month", "day", "hour"],
            transformation_ctx="output")
        sink.setFormat("glueparquet")
        sink.setCatalogInfo(catalogDatabase=db_name,
                            catalogTableName=processed_pose_data_table)
        sink.writeFrame(transformed_pose_data)
    except Exception as e:
        print("There was an error writing out the results to S3")
        print(e)
    else:
        print("Partition saved.")
else:
    print("Glue Job Bookmark detected no new files to process")

job.commit()