Пример #1
0
            "unrestrictedCompanyStockPercent"))

# Derive one (enrollmentReason, enrollmentChannel) pair per
# (userId, planOwnerId, recordKeeperId) key. When the nested
# enrollmentData.enrollmentEvents arrays exist, explode them and keep the
# most recent event for serviceTier 'ma'; otherwise emit the same key
# columns with NULL reason/channel so the downstream schema stays uniform.
if has_column(inputDF,
              'enrollmentData.enrollmentEvents.lastServiceTierChange'):
    # Explode each parallel array of enrollmentEvents into its own
    # flat column (one row per event element).
    enrollmentDF = inputDF.withColumn(
        "lastServiceTierChange",
        f.explode("enrollmentData.enrollmentEvents.lastServiceTierChange")
    ).withColumn(
        "serviceTier_new",
        f.explode("enrollmentData.enrollmentEvents.serviceTier")
    ).withColumn(
        "enrollmentChannel",
        f.explode("enrollmentData.enrollmentEvents.enrollmentChannel")
    ).withColumn(
        "enrollmentReason",
        f.explode("enrollmentData.enrollmentEvents.enrollmentReason")
    )

    enrollmentDF.registerTempTable("enrollmentTable")

    # FIX: the original called `glue_context.sql(...)`, but no `glue_context`
    # is defined in this file (it defines `glueContext` and `spark`), and the
    # else-branch already queries through `spark.sql` — use the SparkSession
    # here for consistency. SQL text is unchanged: rank events per
    # reason/channel by lastServiceTierChange, keep rank 1, then aggregate
    # per user key.
    enrollmentTFSQL = spark.sql(
        "select userId,planOwnerId,recordKeeperId,max(enrollmentReason) as enrollmentReason,max(enrollmentChannel) as enrollmentChannel from ( select userId,planOwnerId,recordKeeperId,enrollmentReason,enrollmentChannel,lastServiceTierChange,serviceTier_new, \
                                    rank() over(partition by enrollmentReason ,enrollmentChannel order by lastServiceTierChange DESC) rnk \
                                    from enrollmentTable where serviceTier_new ='ma'  \
                                    ) where rnk=1 group by userId,planOwnerId,recordKeeperId "
    )
else:
    # No enrollment events in this input: keep only the distinct key
    # columns and add NULL reason/channel cast to string so the column
    # types match the branch above.
    noenrollmentDF = inputDF.select(
        "userId", "planOwnerId", "recordKeeperId").withColumn(
            "enrollmentReason",
            lit(None).cast(StringType())).withColumn(
                "enrollmentChannel",
                lit(None).cast(StringType())).dropDuplicates()

    noenrollmentDF.registerTempTable("enrollmentTable")

    # FIX: the original was missing the closing parenthesis of this
    # spark.sql(...) call, leaving the file unparseable.
    enrollmentTFSQL = spark.sql(
        "select userId,planOwnerId,recordKeeperId,max(enrollmentReason) as enrollmentReason ,\
                                        max(enrollmentChannel) as enrollmentChannel from enrollmentTable  group by userId,planOwnerId,recordKeeperId"
    )
    # NOTE(review): the lines below look like the tail of a Glue custom
    # transform function (it returns a DynamicFrameCollection) whose `def`
    # is not visible in this chunk; `df` is never assigned in the enclosing
    # scope shown here, and a `return` is illegal at module level — confirm
    # against the full file whether this fragment was mis-spliced.

    # Repartition so rows sharing a submitteddatehour value land in the
    # same partition before writing.
    df = df.repartition("submitteddatehour")

    # Convert the Spark DataFrame back into a Glue DynamicFrame for
    # downstream Glue transforms/sinks.
    dyf = DynamicFrame.fromDF(df, glueContext, "submitteddatehour-extracted")

    return (DynamicFrameCollection({"CustomTransform0": dyf}, glueContext))


## @params: [JOB_NAME, SOURCE_BUCKET_URI, DESTINATION_BUCKET_URI]
# Resolve the required job parameters supplied on the Glue job invocation.
args = getResolvedOptions(
    sys.argv, ['JOB_NAME', 'SOURCE_BUCKET_URI', 'DESTINATION_BUCKET_URI'])

# Standard Glue bootstrap: SparkContext -> GlueContext -> SparkSession.
# The order of these statements matters; do not reorder.
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
# Allow reading parquet files whose schemas differ by merging them.
glueContext.sql("set spark.sql.parquet.mergeSchema=true")
job = Job(glueContext)
# Initialize the job for bookmarking/commit tracking under JOB_NAME.
job.init(args['JOB_NAME'], args)
## @type: DataSource
## @args: [connection_type = "s3", format = "parquet", connection_options = {"paths": ["s3://te-load-test-analytics-submission-parquet/"], "recurse":True}, transformation_ctx = "DataSource0"]
## @return: DataSource0
## @inputs: []
# Read all parquet files (recursively) under the configured source bucket
# into a DynamicFrame; transformation_ctx enables job bookmarks.
DataSource0 = glueContext.create_dynamic_frame.from_options(
    connection_type="s3",
    format="parquet",
    connection_options={
        "paths": [f"{args['SOURCE_BUCKET_URI']}/"],
        "recurse": True
    },
    transformation_ctx="DataSource0")
## @type: CustomCode