def transform_df_to_catalog_import_schema(sql_context, glue_context, df_databases, df_tables, df_partitions):
    df_databases_array = df_databases.select(df_databases['type'], array(df_databases['item']).alias('items'))
    df_tables_array = df_tables.select(df_tables['type'], df_tables['database'],
                                       array(df_tables['item']).alias('items'))
    df_partitions_array_batched = batch_metastore_partitions(sql_context=sql_context, df_parts=df_partitions)
    dyf_databases = DynamicFrame.fromDF(
        dataframe=df_databases_array, glue_ctx=glue_context, name='dyf_databases')
    dyf_tables = DynamicFrame.fromDF(
        dataframe=df_tables_array, glue_ctx=glue_context, name='dyf_tables')
    dyf_partitions = DynamicFrame.fromDF(
        dataframe=df_partitions_array_batched, glue_ctx=glue_context, name='dyf_partitions')
    return dyf_databases, dyf_tables, dyf_partitions
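# A minimal usage sketch for the helper above: the three input DataFrames are assumed to
# have already been extracted from the source metastore, and sql_context / glue_context
# are the job's SQLContext and GlueContext.
dyf_dbs, dyf_tbls, dyf_parts = transform_df_to_catalog_import_schema(
    sql_context=sql_context,
    glue_context=glue_context,
    df_databases=df_databases,
    df_tables=df_tables,
    df_partitions=df_partitions)
print(dyf_dbs.count(), dyf_tbls.count(), dyf_parts.count())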
Example 2
def write_df_to_catalog(data_frame, entity_type, glue_context, options):
    # Check if data frame is empty. There is no "empty" method for data frame, this is the closest we get.
    if data_frame.rdd.isEmpty():
        return # nothing to do
    database_name = options['catalog.database']
    nested_data_frame = nest_data_frame(data_frame, database_name, entity_type)
    dynamic_frame = DynamicFrame.fromDF(nested_data_frame, glue_context, entity_type)
    sink = glue_context.getSink('catalog', **options)
    sink.write(dynamic_frame)
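# A hedged usage sketch: write_df_to_catalog reads the target database from the
# 'catalog.database' key of the options dict (see above); 'my_metastore_db', the
# 'table' entity type and tables_df are illustrative assumptions.
options = {'catalog.database': 'my_metastore_db'}
write_df_to_catalog(tables_df, 'table', glue_context, options)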
def main():
    glueContext = GlueContext(SparkContext.getOrCreate())
    spark = glueContext.spark_session

    student_id_unavailable = '0'
    package_endtime_unavailable = 99999999999
    package_starttime_unavailable = 0
    student_level_code_unavailable = 'UNAVAILABLE'
    student_status_code_unavailable = 'UNAVAILABLE'

    package_endtime = 'package_endtime'
    package_starttime = 'package_starttime'
    student_level_code = 'student_level_code'
    student_status_code = 'student_status_code'

    ACTIVED = 'ACTIVED'

    dyf_tpe_enduser_used_product_history = glueContext.create_dynamic_frame.from_catalog(
        database="tig_market",
        table_name="tpe_enduser_used_product_history"
    )
    dyf_tpe_enduser_used_product_history = dyf_tpe_enduser_used_product_history.select_fields(
        ['_key', 'contact_id', 'used_product_id', 'status_old', 'status_new', 'status_description', 'timecreated'])
        # .rename_field('contact_id', 'contactid')

    dyf_tpe_enduser_used_product_history = dyf_tpe_enduser_used_product_history.resolveChoice(specs=[('_key', 'cast:long')])
    # try:
    #     df_flag = spark.read.parquet("s3://dtsodin/flag/flag_trang_thai_tai_khoan_active.parquet")
    #     max_key = df_flag.collect()[0]['flag']
    #     print("max_key:  ", max_key)
    #     # Only take records with _key greater than the saved max_key; do not load the full table
    #     dyf_tpe_enduser_used_product_history = Filter.apply(frame=dyf_tpe_enduser_used_product_history, f=lambda x: x["_key"] > max_key)
    # except:
    #     print('read flag file error ')
    print(dyf_tpe_enduser_used_product_history.count())
    if dyf_tpe_enduser_used_product_history.count() > 0:
        try:
            dyf_tpe_invoice_product_details = glueContext.create_dynamic_frame.from_catalog(
                database="tig_market",
                table_name="tpe_invoice_product_details"
            )
            dyf_tpe_invoice_product_details = dyf_tpe_invoice_product_details.select_fields(
                ['id', 'cat_code'])

            dyf_student_contact = glueContext.create_dynamic_frame.from_catalog(
                database="tig_advisor",
                table_name="student_contact"
            )
            dyf_student_contact = dyf_student_contact.select_fields(
                ['contact_id', 'student_id']).rename_field('contact_id', 'contactid')

            ##################### Join and Filter data
            df_tpe_enduser_used_product_history = dyf_tpe_enduser_used_product_history.toDF()
            df_tpe_used_product_history_step1 = df_tpe_enduser_used_product_history.groupby('contact_id',
                                                                                            'used_product_id').agg(
                f.max("timecreated").alias("max_timecreated")) \
                .withColumnRenamed("contact_id", "contact_id_temp")
            print(df_tpe_used_product_history_step1.count())
            df_tpe_used_product_history_step1.show()

            df_tpe_used_product_history_step2 = df_tpe_used_product_history_step1.groupby('contact_id_temp').agg(
                f.max("max_timecreated").alias("max_timecreated"),
                f.count("used_product_id").alias("count_used_product_id"))
            print(df_tpe_used_product_history_step2.count())
            df_tpe_used_product_history_step2.show()
            print("EEEEEEEEEEEEEEEEEEEEEEEEE")

            dyf_tpe_used_product_history = DynamicFrame.fromDF(df_tpe_used_product_history_step2, glueContext,
                                                               "dyf_tpe_used_product_history")

            dyf_part_one = Filter.apply(frame=dyf_tpe_used_product_history,
                                        f=lambda x: x["count_used_product_id"] <= 1)

            # dyf_part_two = Filter.apply(frame=df_tpe_enduser_used_product_history,
            #                             f=lambda x: x["used_product_id"] > 1)
            df_part_one = dyf_part_one.toDF()
            df_part_one = df_part_one.join(df_tpe_enduser_used_product_history,
                                           (df_part_one.contact_id_temp == df_tpe_enduser_used_product_history.contact_id)
                                           & (df_part_one.max_timecreated == df_tpe_enduser_used_product_history.timecreated))

            dyf_part_one = DynamicFrame.fromDF(df_part_one, glueContext, "dyf_part_one")
            dyf_part_one = dyf_part_one.select_fields(['contact_id', 'used_product_id', 'status_old',
                                                       'status_new', 'status_description', 'timecreated'])


            dyf_join_part_one_product_details = Join.apply(dyf_part_one,
                                                           dyf_tpe_invoice_product_details, 'used_product_id', 'id')

            dyf_join_part_one_product_details.printSchema()
            print "total 01: ", dyf_join_part_one_product_details.count()
            dyf_join_part_one_product_details.toDF().show(2)

            dyf_join_part_one_contact = Join.apply(dyf_join_part_one_product_details,
                                                   dyf_student_contact, 'contact_id', 'contactid')
            dyf_join_part_one_contact = dyf_join_part_one_contact \
                .select_fields(['contact_id', 'student_id', 'status_new', 'status_description', 'timecreated'])


            dyf_join_part_one_contact.printSchema()
            print "total 02: ", dyf_join_part_one_contact.count()
            dyf_join_part_one_contact.toDF().show(2)
            # df_join_part_one = dyf_join_part_one_contact.toDF()

            ######################################
            ######## START active
            dyf_join_active_status = Filter.apply(frame=dyf_join_part_one_contact,
                                                     f=lambda x: x["status_new"] == ACTIVED)
            print "dyf_join_active_status ", dyf_join_active_status.count()
            dyf_join_active_status.toDF().show(2)
            df_join_active_status = dyf_join_active_status.toDF()

            df_join_active_status = df_join_active_status \
                .withColumn("change_status_date_id",
                            from_unixtime(df_join_active_status.timecreated, 'yyyyMMdd').cast("long")) \
                .withColumn("from_status_id", f.lit(None).cast("long")) \
                .withColumn("to_status_id", f.lit(206).cast("long")) \
                .withColumn("measure1", f.lit(None).cast("long")) \
                .withColumn("measure2", f.lit(None).cast("long")) \
                .withColumn("description", df_join_active_status.status_description) \
                .withColumn("timestamp1", f.lit(None).cast("long"))
            df_join_active_status.show(3)
            dyf_join_active_status = DynamicFrame.fromDF(df_join_active_status, glueContext,
                                                            "dyf_join_active_status")

            dyf_join_active_status = dyf_join_active_status \
                .select_fields(['contact_id', 'student_id', 'change_status_date_id', 'from_status_id',
                                'to_status_id', 'measure1', 'measure2', 'description', 'timestamp1'])
            dyf_join_active_status.printSchema()
            df_join_active_status = dyf_join_active_status.toDF()
            ####### END

            df_join_active_status = df_join_active_status.withColumn("user_id", f.lit(None).cast("long"))

            dyf_join_status = DynamicFrame.fromDF(df_join_active_status, glueContext, "dyf_join_status")

            applymapping1 = ApplyMapping.apply(frame=dyf_join_status,
                                               mappings=[
                                                   ("student_id", "string", "student_id", "long"),
                                                   ("user_id", "long", "user_id", "long"),
                                                   ("change_status_date_id", "long", "change_status_date_id", "long"),
                                                   ("from_status_id", "long", "from_status_id", "long"),
                                                   ("to_status_id", "long", "to_status_id", "long"),
                                                   ("measure1", "long", "measure1", "double"),
                                                   ("measure2", "long", "measure2", "double"),
                                                   ("description", "string", "description", "string"),
                                                   ("timestamp1", "long", "timestamp1", "long"),
                                                   ("contact_id", "string", "contact_id", "string")
                                               ])

            resolvechoice1 = ResolveChoice.apply(frame=applymapping1, choice="make_cols",
                                                 transformation_ctx="resolvechoice1")
            dropnullfields1 = DropNullFields.apply(frame=resolvechoice1, transformation_ctx="dropnullfields1")
            print(resolvechoice1.count())
            resolvechoice1.printSchema()
            resolvechoice1.show(5)
            print('START WRITE TO REDSHIFT -------------------------')
            datasink1 = glueContext.write_dynamic_frame.from_jdbc_conf(frame=dropnullfields1,
                                                                       catalog_connection="glue_redshift",
                                                                       connection_options={
                                                                           "dbtable": "mapping_changed_status_student",
                                                                           "database": "dts_odin"
                                                                       },
                                                                       redshift_tmp_dir="s3a://dtsodin/temp/mapping_changed_status_student/",
                                                                       transformation_ctx="datasink1")

            print('START WRITE TO S3-------------------------')
            # datasink6 = glueContext.write_dynamic_frame.from_options(frame=dropnullfields1, connection_type="s3",
            #                                                          connection_options={
            #                                                              "path": "s3://dtsodin/student_behavior/student_behavior/",
            #                                                              "partitionKeys": ["behavior_id"]},
            #                                                          format="parquet",
            #                                                          transformation_ctx="datasink6")
            print('END WRITE TO S3-------------------------')

            df_temp = dyf_tpe_enduser_used_product_history.toDF()
            flag = df_temp.agg({"_key": "max"}).collect()[0][0]

            flag_data = [flag]
            df = spark.createDataFrame(flag_data, "long").toDF('flag')
            # overwrite the _key flag to S3
            df.write.parquet("s3a://dtsodin/flag/flag_trang_thai_tai_khoan_active.parquet", mode="overwrite")
        except Exception as e:
            print "Something was wrong ",e
## read the input data into a dynamic frame and convert it to a data frame
## (s3_input_path is an assumed job parameter)
data_df = glueContext.create_dynamic_frame.from_options(
    connection_type="s3",
    connection_options={"paths": [s3_input_path]},
    format=file_format,  ## "csv",
    format_options={
        "withHeader": True
    },
    transformation_ctx="data_df").toDF()
data_df.show(10)

## read data from input table to a data frame
#data_df=glueContext.create_dynamic_frame.from_catalog(database=database,table_name=table_name).toDF()

## run the SQL query on the dataframe created from the input dataset
data_df.createOrReplaceTempView('data_df')
data_df = spark.sql('{} from data_df'.format(querySql))

query_columns = ['werk', 'spj', 'knr', 'result', 'probability', 'time']
data_df = data_df.toDF(*query_columns)

## convert the transformed dataframe back to a dynamic frame
data_df = DynamicFrame.fromDF(data_df, glueContext, "data_df")

## Define target s3 output location

rtp_dd_output = "s3://" + s3_output_data_folder + "/" + "plant=" + plant + "/" + "appid=" + applicationId + "/"

# Store the output/final dynamicFrame to the target s3 location
outputGDF = glueContext.write_dynamic_frame.from_options(
    frame=data_df,
    connection_type="s3",
    connection_options={"path": rtp_dd_output},
    format="csv")
Example 5
                                     choice="MATCH_CATALOG",
                                     database="as-redshift-dw",
                                     table_name="as_tech_test_public_dim_user",
                                     transformation_ctx="resolvechoice3")
## @type: ResolveChoice
## @args: [choice = "make_cols", transformation_ctx = "resolvechoice4"]
## @return: resolvechoice4
## @inputs: [frame = resolvechoice3]
resolvechoice4 = ResolveChoice.apply(frame=resolvechoice3,
                                     choice="make_cols",
                                     transformation_ctx="resolvechoice4")

##get Insert Date
timestampedDf = resolvechoice4.toDF().withColumn("dim_user_insert_dt",
                                                 current_timestamp())

#Back to DynamicFrame
cleaned_datasource = DynamicFrame.fromDF(timestampedDf, glueContext,
                                         "cleaned_datasource")

## @type: DataSink
## @args: [database = "as-redshift-dw", table_name = "as_tech_test_public_dim_user", redshift_tmp_dir = TempDir, transformation_ctx = "datasink5"]
## @return: datasink5
## @inputs: [frame = resolvechoice4]
datasink5 = glueContext.write_dynamic_frame.from_catalog(
    frame=cleaned_datasource,
    database="as-redshift-dw",
    table_name="as_tech_test_public_dim_user",
    redshift_tmp_dir=args["TempDir"],
    transformation_ctx="datasink5")
job.commit()
Example 6
            		source_cd, forecast_dt, hour_num, usage_factor, esiid_cnt, unadj_load, distrib_loss_load, 
            		transmission_loss_load, ufe_loss_load, ancillary_loss_load, deration_loss_load, cap_ob, 
            		tran_ob, crdt, batch_dt, batch_hr
            		from ams__iw_growth_stnorm_hourly__df
            		order by forecast_dt ,hour_num""")
            select__df.createOrReplaceTempView('select__df')
            rowcount_df = select__df
            
    
    except Exception as e:
        # capture the backend error details before re-raising; error_message(),
        # error_severity() and error_state() are assumed to come from the source
        # engine's SQL dialect rather than built-in Spark SQL
        errormessage = str(spark.sql("""select error_message()
              ,{} = error_severity()
              ,{} = error_state()""".format(errorseverity, errorstate)).collect()[0][0])
        raise


#Write modified data frames to target
if __name__ == '__main__':
    p_manageforecastdata(*sys.argv[1:])
    try:
        for tab_df in mod_df.keys():
            if mod_df[tab_df] == org_df[tab_df]:
                continue
            dym__trans__df = DynamicFrame.fromDF(mod_df[tab_df],glueContext,'dym__trans__df')
            glueContext.write_dynamic_frame.from_options(frame = dym__trans__df, connection_type = 's3', connection_options = {'path': 's3://target/s3tables'}, format = 'csv')
    except:
        raise

Example 7
def main():
    glueContext = GlueContext(SparkContext.getOrCreate())
    spark = glueContext.spark_session

    class_topica_id = 1

    now = datetime.now()  # current date and time
    year = now.strftime("%Y%m%d")
    year = '20190901'
    print("year:", int(year))
    cur_date = int(year)
    pre_date = cur_date - 1
    print("year:", pre_date)
    ########## dyf_mapping_lo_student
    dyf_mapping_lo_student = glueContext.create_dynamic_frame.from_catalog(
        database="nvn_knowledge",
        table_name="mapping_lo_student"
    )

    # try:
    #     # # read the flag checkpoint from S3
    #     df_flag = spark.read.parquet("s3://dts-odin/flag/flag_mapping_lo_student.parquet")
    #     start_read = df_flag.collect()[0]['flag']
    #     print('read from index: ', start_read)
    #     # compare the datasource _key with the flag; keep only values with key > flag
    #     # dyf_student_contact = Filter.apply(frame=dyf_student_contact, f=lambda x: x['time_lms_created'] > start_read)
    # except:
    #     print('read flag file error ')
    # dyf_mapping_lo_student = Filter.apply(frame=dyf_mapping_lo_student, f=lambda x: x['knowledge_pass_date_id'] >= f.lit(int(year)))

    print('df_student_contact count 1:', dyf_mapping_lo_student.count())
    if dyf_mapping_lo_student.count() > 0:
        try:
            print("START......................")
            ########## dyf_mapping_lo_student
            dyf_learning_object = glueContext.create_dynamic_frame.from_catalog(
                database="nvn_knowledge",
                table_name="learning_object"
            )

            ########## dyf_learning_object_class
            dyf_learning_object_class = glueContext.create_dynamic_frame.from_catalog(
                database="nvn_knowledge",
                table_name="learning_object_class"
            )
            dyf_learning_object_class = dyf_learning_object_class.select_fields(['class_id', 'class_parent_id'])
            dyf_learning_object_class = Filter.apply(frame=dyf_learning_object_class, f=lambda x: x["class_parent_id"] == class_topica_id)

            ########## dyf_mapping_lo_class
            dyf_mapping_lo_class = glueContext.create_dynamic_frame.from_catalog(
                database="nvn_knowledge",
                table_name="mapping_lo_class"
            )
            dyf_mapping_lo_class = dyf_mapping_lo_class.select_fields(['class_id', 'learning_object_id'])\
                .rename_field('class_id', 'map_class_id').rename_field('learning_object_id', 'map_lo_id')
            ## JOIN to keep only TOPICA levels
            dyf_mapping_lo_class = Join.apply(dyf_mapping_lo_class, dyf_learning_object_class, 'map_class_id', 'class_id')


            dyf_learning_object = dyf_learning_object.select_fields(
                ['learning_object_id', 'learning_object_type']).rename_field('learning_object_id', 'lo_id')
            dyf_mapping_lo_student = Join.apply(dyf_mapping_lo_student, dyf_learning_object, 'learning_object_id',
                                                'lo_id')
            dyf_mapping_lo_student = Join.apply(dyf_mapping_lo_student, dyf_mapping_lo_class, 'learning_object_id',
                                                'map_lo_id')

            # dyf_mapping_lo_student.printSchema()
            # dyf_mapping_lo_student.show()

            df_mapping_lo_student = dyf_mapping_lo_student.toDF()

            df_mapping_lo_student = df_mapping_lo_student.groupby('student_id', 'learning_object_type', 'class_id').agg(
                f.count('knowledge_pass_date_id').alias("knowledge_number"),
                f.count('comprehension_pass_date_id').alias("comprehension_number"),
                f.count('application_pass_date_id').alias("application_number"),
                f.count('analysis_pass_date_id').alias("analysis_number"),
                f.count('synthesis_pass_date_id').alias("synthesis_number"),
                f.count('evaluation_pass_date_id').alias("evaluation_number"))
            df_mapping_lo_student = df_mapping_lo_student.withColumn("created_date_id", f.lit(str(year)))
            # print('Count:' , df_mapping_lo_student.count())
            # df_mapping_lo_student.printSchema()
            # df_mapping_lo_student.show(5)

            dyf_mapping_lo_student = DynamicFrame.fromDF(df_mapping_lo_student, glueContext,
                                                         "dyf_mapping_lo_student")
            applymapping = ApplyMapping.apply(frame=dyf_mapping_lo_student,
                                              mappings=[("student_id", "long", "student_id", "long"),
                                                        ("user_id", "long", "user_id", "long"),
                                                        ("class_id", "long", "class_id", "long"),
                                                        ("knowledge_number", "long", "knowledge_number", "long"),
                                                        (
                                                        "comprehension_number", 'long', 'comprehension_number', 'long'),
                                                        ("application_number", 'long', 'application_number', 'long'),
                                                        ("analysis_number", 'long', 'analysis_number', 'long'),
                                                        ("synthesis_number", 'long', 'synthesis_number', 'long'),
                                                        ("evaluation_number", 'long', 'evaluation_number', 'long'),
                                                        ("created_date_id", 'string', 'created_date_id', 'long'),
                                                        ("learning_object_type", 'string', 'learning_object_type',
                                                         'string')])
            resolvechoice = ResolveChoice.apply(frame=applymapping, choice="make_cols",
                                                transformation_ctx="resolvechoice2")
            dyf_student_lo_init = DropNullFields.apply(frame=resolvechoice, transformation_ctx="dyf_student_lo_init")
            datasink5 = glueContext.write_dynamic_frame.from_jdbc_conf(frame=dyf_student_lo_init,
                                                                       catalog_connection="glue_redshift",
                                                                       connection_options={
                                                                           "dbtable": "mapping_lo_student_number",
                                                                           "database": "dts_odin"
                                                                       },
                                                                       redshift_tmp_dir="s3n://dts-odin/temp1/dyf_student_lo_number",
                                                                       transformation_ctx="datasink5")

            print("END......................")
        except Exception as e:
            print("###################### Exception ##########################")
            print(e)
Example 8
vote_dataset_agg4 = vote_dataset.groupBy(col("idx_tedx")).agg(
    collect_list(
        struct(col("date"), col("time"), col("mail_user"),
               col("vote"))).alias("vote_user"))
vote_dataset_agg4.printSchema()

tedx_dataset_agg4 = tedx_dataset_agg3.join(vote_dataset_agg4, tedx_dataset_agg3._id == vote_dataset_agg4.idx_tedx, "left") \
    .drop("idx_tedx")

tedx_dataset_agg4.printSchema()

mongo_uri = "mongodb://mycluster-shard-00-00-wo6at.mongodb.net:27017,mycluster-shard-00-01-wo6at.mongodb.net:27017,mycluster-shard-00-02-wo6at.mongodb.net:27017"

write_mongo_options = {
    "uri": mongo_uri,
    "database": "unibg_tedx",
    "collection": "tedz_data",
    "username": "******",
    "password": "******",
    "ssl": "true",
    "ssl.domain_match": "false"
}
from awsglue.dynamicframe import DynamicFrame
tedx_dataset_dynamic_frame = DynamicFrame.fromDF(tedx_dataset_agg4,
                                                 glueContext, "nested")

glueContext.write_dynamic_frame.from_options(
    tedx_dataset_dynamic_frame,
    connection_type="mongodb",
    connection_options=write_mongo_options)
Example 9
def back_kup_h2472_question_type():
    dyf_jh2472_question_type = glueContext \
        .create_dynamic_frame.from_catalog(database="do_h2472",
                                           table_name="question_type")

    if is_dev:
        print('dyf_jh2472_question_type')
        dyf_jh2472_question_type.printSchema()
        dyf_jh2472_question_type.show(3)

    # root
    # | -- id: string
    # | -- created_date: string
    # | -- description: string
    # | -- group_type: string

    # | -- modified_date: string
    # | -- name: string
    # | -- active: boolean
    # | -- parent_id: string
    # | -- _key: long
    # | -- _table: string
    # | -- _schema: string

    dyf_jh2472_question_type = dyf_jh2472_question_type.resolveChoice(
        specs=[('id', 'cast:long')])
    #
    dyf_jh2472_question_type = Filter.apply(frame=dyf_jh2472_question_type,
                                            f=lambda x: x["id"] > 54)

    df_jh2472_question_type = dyf_jh2472_question_type.toDF()
    df_jh2472_question_type = df_jh2472_question_type.dropDuplicates(['id'])
    df_jh2472_question_type = df_jh2472_question_type.withColumn(
        'name', f.concat('name', f.lit('_'), 'id'))
    dyf_jh2472_question_type = DynamicFrame.fromDF(df_jh2472_question_type,
                                                   glueContext,
                                                   'dyf_jh2472_question_type')

    # #
    applymapping1 = ApplyMapping.apply(
        frame=dyf_jh2472_question_type,
        mappings=[("id", 'long', 'id', 'long'),
                  ("created_date", "string", "created_date", "timestamp"),
                  ("description", "string", "description", "string"),
                  ("group_type", "string", "group_type", "string"),
                  ("modified_date", 'string', 'modified_date', 'timestamp'),
                  ("name", "string", "name", "string"),
                  ("active", "boolean", "active", "boolean")])
    # #
    resolvechoice1 = ResolveChoice.apply(frame=applymapping1,
                                         choice="make_cols",
                                         transformation_ctx="resolvechoice1")

    if is_dev:
        print('resolvechoice1')
        resolvechoice1.printSchema()
        resolvechoice1.show(3)
    # #
    # #
    datasink5 = glueContext.write_dynamic_frame.from_jdbc_conf(
        frame=resolvechoice1,
        catalog_connection="h2474_backup",
        connection_options={
            "dbtable": "question_type",
            "database": "topicaH2472"
        },
        redshift_tmp_dir="s3a://dts-odin/topicaH2472/question_type",
        transformation_ctx="datasink5")
def sparkSqlQuery(glueContext, query, mapping, transformation_ctx) -> DynamicFrame:
    for alias, frame in mapping.items():
        frame.toDF().createOrReplaceTempView(alias)
    result = spark.sql(query)
    return DynamicFrame.fromDF(result, glueContext, transformation_ctx)
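# A small usage sketch for sparkSqlQuery: every key in mapping becomes a temp view the
# query can reference; orders_dyf and the SQL text are illustrative assumptions.
filtered_dyf = sparkSqlQuery(
    glueContext,
    query="SELECT order_id, total FROM orders WHERE total > 100",
    mapping={"orders": orders_dyf},
    transformation_ctx="filtered_dyf")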
Example 11
def write_df_to_s3(glue_context, data_frame, backup_location):
    dynamic_frame = DynamicFrame.fromDF(data_frame, glue_context, "toS3")
    sink = glue_context.getSink("s3", path=backup_location)
    sink.setFormat("json")
    sink.write(dynamic_frame)
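# A hedged usage sketch: back up a Spark DataFrame to S3 as JSON; the bucket and prefix
# are illustrative assumptions.
write_df_to_s3(glue_context, data_frame, "s3://my-backup-bucket/backups/run-1/")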
Example 12
# s3 output directories
medicare_cast = "s3://glue-sample-target/output-dir/medicare_json_cast"
medicare_project = "s3://glue-sample-target/output-dir/medicare_json_project"
medicare_cols = "s3://glue-sample-target/output-dir/medicare_json_make_cols"
medicare_struct = "s3://glue-sample-target/output-dir/medicare_json_make_struct"
medicare_sql = "s3://glue-sample-target/output-dir/medicare_json_sql"

# Read data into a dynamic frame
medicare_dyf = glueContext.create_dynamic_frame.from_catalog(database = db_name, table_name = tbl_name)

# The `provider id` field will be a choice type between long and string

# Cast the choice values to long; values that cannot be cast become null
medicare_res_cast = medicare_dyf.resolveChoice(specs = [('provider id','cast:long')])
medicare_res_project = medicare_dyf.resolveChoice(specs = [('provider id','project:long')])
medicare_res_make_cols = medicare_dyf.resolveChoice(specs = [('provider id','make_cols')])
medicare_res_make_struct = medicare_dyf.resolveChoice(specs = [('provider id','make_struct')])

# Spark SQL on a Spark dataframe
medicare_df = medicare_dyf.toDF()
medicare_df.createOrReplaceTempView("medicareTable")
medicare_sql_df = spark.sql("SELECT * FROM medicareTable WHERE `total discharges` > 30")
medicare_sql_dyf = DynamicFrame.fromDF(medicare_sql_df, glueContext, "medicare_sql_dyf")

# Write it out in Json
glueContext.write_dynamic_frame.from_options(frame = medicare_res_cast, connection_type = "s3", connection_options = {"path": medicare_cast}, format = "json")
glueContext.write_dynamic_frame.from_options(frame = medicare_res_project, connection_type = "s3", connection_options = {"path": medicare_project}, format = "json")
glueContext.write_dynamic_frame.from_options(frame = medicare_res_make_cols, connection_type = "s3", connection_options = {"path": medicare_cols}, format = "json")
glueContext.write_dynamic_frame.from_options(frame = medicare_res_make_struct, connection_type = "s3", connection_options = {"path": medicare_struct}, format = "json")
glueContext.write_dynamic_frame.from_options(frame = medicare_sql_dyf, connection_type = "s3", connection_options = {"path": medicare_sql}, format = "json")
# The `provider id` field will be a choice type between long and string

# Cast the choice values to long; values that cannot be cast become null
medicare_res = medicare_dyf.resolveChoice(specs = [('provider id','cast:long')])

# Remove erroneous records
medicare_df = medicare_res.toDF()
medicare_df = medicare_df.where("`provider id` is NOT NULL")

# Apply a lambda to remove the '$'
chop_f = udf(lambda x: x[1:], StringType())
medicare_df = medicare_df.withColumn("ACC", chop_f(medicare_df["average covered charges"])).withColumn("ATP", chop_f(medicare_df["average total payments"])).withColumn("AMP", chop_f(medicare_df["average medicare payments"]))

# Turn it back to a dynamic frame
medicare_tmp = DynamicFrame.fromDF(medicare_df, glueContext, "nested")

# Rename, cast, and nest with apply_mapping
medicare_nest = medicare_tmp.apply_mapping([('drg definition', 'string', 'drg', 'string'), 
                             ('provider id', 'long', 'provider.id', 'long'),
                             ('provider name', 'string', 'provider.name', 'string'),
                             ('provider city', 'string', 'provider.city', 'string'),
                             ('provider state', 'string', 'provider.state', 'string'),
                             ('provider zip code', 'long', 'provider.zip', 'long'),
                             ('hospital referral region description', 'string','rr', 'string'),
                             ('ACC', 'string', 'charges.covered', 'double'),
                             ('ATP', 'string', 'charges.total_pay', 'double'),
                             ('AMP', 'string', 'charges.medicare_pay', 'double')])

# Write it out in Parquet
glueContext.write_dynamic_frame.from_options(frame = medicare_nest, connection_type = "s3", connection_options = {"path": output_dir}, format = "parquet")
input_file_path = "s3://xxxxx"

df = spark.read.option("header","true")\
 .option("inferSchema","true")\
 .option("quote","\"")\
 .option("escape","\"").csv(input_file_path)

df = df.withColumn(
    'event_timestamp',
    f.to_timestamp('event_timestamp', format='MM/dd/yyyy HH:mm'))


df= df.withColumn('year',f.year(f.col('event_timestamp')))\
 .withColumn('month',f.month(f.col('event_timestamp')))

dynamic_df = DynamicFrame.fromDF(df, glueContext, "dynamic_df")

mapped_df = ResolveChoice.apply(frame=dynamic_df,
                                choice="make_cols",
                                transformation_ctx="mapped_df")

datasink = glueContext.write_dynamic_frame.from_jdbc_conf(
    frame=mapped_df,
    catalog_connection="xxxxxxx",
    connection_options={
        "dbtable": "external_data_schema.xxxxxx",
        "database": "dev"
    },
    redshift_tmp_dir=args["TempDir"],
    transformation_ctx="datasink")
NATURAL_KEY = FINAL_TUPLE_WITH_DF_AND_MD5[1]

## Take the natural key passed in the JSON file.
NATURAL_KEY_1 = NATURAL_KEY[0]

## Take the value of the source_name column (for example "HR PERSON") from FINAL_MD5_DF
POST_QUERY_SOURCE_NAME = FINAL_MD5_DF.select("source_name").limit(1).rdd.map(
    lambda a: a[0]).collect()[0]
print('#######>>>>>>>POST_QUERY_SOURCE_NAME', POST_QUERY_SOURCE_NAME)
print("finalmd5")

FINAL_MD5_DF1 = FINAL_MD5_DF.drop_duplicates()

# Final Data frame is converted to Dynamic frame
# Final Dynamic Frame will be written to Stage Table
FINAL_DYNAMIC_FRAME = DynamicFrame.fromDF(FINAL_MD5_DF1, GLUECONTEXT,
                                          "Final_dynamic_frame")

# Updates, inserts and deletes count logic here (a sketch of steps 1-2 follows the COUNT_DF assignment below)
# 1. Create a DF with counts and op_val, Group by JobId,op_val
# 2. Extract inserts, updates and deletes
# 3. Add it to Cloud Watch Logs.

COUNT_DF = FINAL_MD5_DF.withColumn('JobRunId', F.lit(str(RUN_ID)))\
                       .withColumn('JobName', F.lit(str(RUN_ID)))
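# A hedged sketch of steps 1-2 above: group by the run id and the operation flag, then
# pull out per-operation counts. The 'op_val' column and its 'I'/'U'/'D' values are
# assumptions about the upstream frame.
op_counts = COUNT_DF.groupBy('JobRunId', 'op_val').count().collect()
counts_by_op = {row['op_val']: row['count'] for row in op_counts}
print('inserts:', counts_by_op.get('I', 0),
      'updates:', counts_by_op.get('U', 0),
      'deletes:', counts_by_op.get('D', 0))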

## Truncating the stage table
PRE_QUERY = """begin;
truncate table {stage_database_name}.{stage_table};
end;""".format(stage_database_name=STAGE_DATABASE_NAME,
               stage_table=STAGE_TABLE)
Example 16
    *["*"] + [col("kvs").getItem(k).alias(k) for k in keys])

# change the data types and column names to be easier to query later
with_map = with_map \
    .withColumn("id", monotonically_increasing_id()) \
    .withColumn("resources_used_walltime_secs", get_sec("resources_used_walltime")) \
    .withColumn("resources_used_cput", get_sec("resources_used_cput")) \
    .withColumn("resources_used_mem_gb", convert_to_gb("resources_used_mem")) \
    .withColumn("resource_list_nodect", expr("CAST(resource_list_nodect AS INTEGER)")) \
    .withColumn("resource_list_cpu", expr("CAST(resource_list_cpu AS INTEGER)")) \
    .withColumn("resource_list_gpu", expr("CAST(resource_list_gpu AS INTEGER)")) \
    .withColumn("qtime", expr("CAST(qtime AS LONG)")) \
    .withColumn("start", expr("CAST(start AS LONG)")) \
    .withColumn("ctime", expr("CAST(qtime AS LONG)")) \
    .withColumn("etime", expr("CAST(qtime AS LONG)")) \
    .withColumn("end", expr("CAST(qtime AS LONG)")) \
    .withColumn("exit_status", expr("CAST(exit_status AS INTEGER)")) \
    .withColumnRenamed("group", "group_name") \
    .withColumn("resource_list_cores", expr("CAST(resource_list_nodes as LONG) * CAST(resource_list_cpu as INTEGER)")) \
    .withColumn("resources_used_walltime_hrs", expr("cast(round((resources_used_walltime_secs / 60.00 / 60.00), 3) as float)")) \
    .withColumn("resources_used_cput_hrs", expr("cast(round((resources_used_walltime_secs / 60.00 / 60.00), 3) as float)")) \
    .drop('resources_used_vmem', 'kvs', 'session', 'exec_host', 'resource_list_neednodes', 'resource_list_walltime', 'detail')
# eventually drop the detail column and the requested resources, keeping only the resources actually used

torq = DynamicFrame.fromDF(with_map, glueContext, "joined")

datasink5 = glueContext.write_dynamic_frame.from_options(frame=torq, connection_type="s3", connection_options={
                                                         "path": args['S3_OUTPUT_PATH'], "partitionKeys": ["year", "month", "day"]}, format="parquet", transformation_ctx="datasink5")

job.commit()
Example 17
    "zipcode", 'size_of_adjusted_gross_income', 'num_of_returns',
    'num_of_single_returns', 'num_of_joint_returns',
    'num_of_head_of_household_returns', 'num_with_paid_preparers_signature',
    'num_of_exemptions', 'num_of_dependents',
    'num_of_volunteer_prepared_returns_Total',
    'num_of_volunteer_prepared_returns_Num_of_volunteer_income_tax_assistance_prepared_returns',
    'num_of_volunteer_prepared_returns_Num_of_tax_counseling_for_the_elderly_prepared_returns'
]

#rename the columns
for c, n in zip(income_ny_df.columns, new_cols):
    income_ny_df = income_ny_df.withColumnRenamed(c, n)

print("new columns:   ", income_ny_df.columns)

income_ny_DyF = DynamicFrame.fromDF(income_ny_df, glueContext, "income_ny_DyF")

income_ny_DyF.printSchema()

# Print out information about this data
print("Parks Count:  ", parks_DyF.count())
parks_DyF.printSchema()

# Print out information about this data.
print("Playground Count:  ", playgrounds_DyF.count())
playgrounds_DyF.printSchema()
# Convert to Spark DataFrame for left outer join
playgrounds_df = playgrounds_DyF.toDF()
# Drop duplicate columns in parks dataframe
columns_to_drop = ['Location', 'Name', 'year', 'month', 'day']
playgrounds_df = playgrounds_df.drop(*columns_to_drop)
def hash_cc(s):
    # hashlib requires bytes on Python 3, so encode the string before hashing
    return hashlib.sha256(s.encode('utf-8')).hexdigest()

## @params: [JOB_NAME]
args = getResolvedOptions(sys.argv, ['JOB_NAME'])

sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)

datasource0 = glueContext.create_dynamic_frame.from_catalog(database = "serverless-datalake", table_name = "user-profile", transformation_ctx = "datasource0")


## @convert glue DynamicFrame to DataFrame to manipulate the columns
dataframe0 = DynamicFrame.toDF(datasource0)

hash_cc_f = udf(lambda x: hash_cc(x), StringType())

dataframe0 = dataframe0.withColumn("hash_cc", hash_cc_f(dataframe0["cc"])).withColumn("hash_ssn", hash_cc_f(dataframe0["ssn"]))
dataframe0 = dataframe0.drop('cc').drop('ssn').drop('password')

## @convert dataframe to glue DynamicFrame and write the output in parquet format
datasource1 = DynamicFrame.fromDF(dataframe0, glueContext, "name1")


datasink4 = glueContext.write_dynamic_frame.from_options(frame = datasource1, connection_type = "s3", connection_options = {"path": "s3://serverless-datalake-ingestionbucket-1jiyskijz5i03/prepared/userprofile-secure"}, format = "parquet", transformation_ctx = "datasink4")

job.commit()
def main():

    sc = SparkContext()
    glueContext = GlueContext(sc)
    spark = glueContext.spark_session
    spark.conf.set("spark.sql.session.timeZone", "GMT+07:00")
    # get dynamic frame source
    dyf_crm_contacts = glueContext.create_dynamic_frame.from_catalog(
        database='crm_native', table_name='contacts')

    dyf_crm_contacts = dyf_crm_contacts.select_fields(
        ['_key', 'Id', 'Code', 'Fullname', 'Address'])
    dyf_crm_contacts = dyf_crm_contacts.resolveChoice(specs=[('_key',
                                                              'cast:long')])

    dy_source_voxy_cache = dyf_crm_contacts.toDF()
    dy_source_voxy_cache = dy_source_voxy_cache.cache()
    dyf_crm_contacts = DynamicFrame.fromDF(dy_source_voxy_cache, glueContext,
                                           'dyf_crm_contacts')

    # try:
    #     df_flag = spark.read.parquet("s3a://dts-odin/flag/flag_user_communication_full_name.parquet")
    #     read_from_index = df_flag.collect()[0]['flag']
    #     print('read from index: ', read_from_index)
    #     dyf_crm_contacts = Filter.apply(frame=dyf_crm_contacts,
    #                                     f=lambda x: x["_key"] > read_from_index)
    # except:
    #     print('read flag file error ')

    print('the number of new contacts: ', dyf_crm_contacts.count())

    if (dyf_crm_contacts.count() > 0):

        # print('Reached this branch ------------------')
        # print('dyf_crm_contacts::----------------')
        # dyf_crm_contacts.printSchema()
        # try:
        #--------------------------------------------------------------------------------------------------------------#
        dyf_crm_contacts = Filter.apply(
            frame=dyf_crm_contacts,
            f=lambda x: x["Id"] is not None and x["Id"] != '' and x[
                "Code"] is not None and x["Code"] != '' and x[
                    "Fullname"] is not None and x["Fullname"] != '')
        # --------------------------------------------------------------------------------------------------------------#

        # --------------------------------------------------------------------------------------------------------------#
        # today = date.today()
        # today_timestamp = today.timestamp();
        # print("Today's date:", today_timestamp)

        dy_crm_contacts = dyf_crm_contacts.toDF()
        dy_crm_contacts = dy_crm_contacts.dropDuplicates(['Code'])
        dy_crm_contacts = dy_crm_contacts.withColumn(
            'communication_type_full_name', f.lit(4))
        dy_crm_contacts = dy_crm_contacts.withColumn(
            'communication_type_address', f.lit(6))
        dy_crm_contacts = dy_crm_contacts.withColumn('is_primary', f.lit(1))
        dy_crm_contacts = dy_crm_contacts.withColumn('is_deleted', f.lit(0))
        dy_crm_contacts = dy_crm_contacts.withColumn(
            'last_update_date', f.lit('2019-08-28 00:00:00'))
        dyf_crm_contacts = DynamicFrame.fromDF(dy_crm_contacts, glueContext,
                                               'dyf_crm_contacts')

        dyf_crm_contacts = dyf_crm_contacts.resolveChoice(
            specs=[('last_update_date', 'cast:long')])

        applymapping2 = ApplyMapping.apply(
            frame=dyf_crm_contacts,
            mappings=[("Id", "int", "user_id", "bigint"),
                      ("communication_type_full_name", 'int',
                       'communication_type', 'int'),
                      ("is_primary", 'int', 'is_primary', 'int'),
                      ("is_deleted", 'int', 'is_deleted', 'int'),
                      ("Fullname", 'string', 'comunication', 'string'),
                      ("last_update_date", 'string', 'last_update_date',
                       'timestamp')])

        #
        #
        resolvechoice2 = ResolveChoice.apply(
            frame=applymapping2,
            choice="make_cols",
            transformation_ctx="resolvechoice2")
        dropnullfields6 = DropNullFields.apply(
            frame=resolvechoice2, transformation_ctx="dropnullfields2")

        datasink1 = glueContext.write_dynamic_frame.from_jdbc_conf(
            frame=dropnullfields6,
            catalog_connection="glue_redshift",
            connection_options={
                "dbtable": "user_communication",
                "database": "dts_odin"
            },
            redshift_tmp_dir="s3n://dts-odin/temp/user/communication/fullname/",
            transformation_ctx="datasink4")

        dyf_crm_contacts = Filter.apply(
            frame=dyf_crm_contacts,
            f=lambda x: x["Address"] is not None and x["Address"] != '')
        #--------------------------------------------------------------------------------------------------------------#
        applymapping3 = ApplyMapping.apply(
            frame=dyf_crm_contacts,
            mappings=[("Id", "int", "user_id", "bigint"),
                      ("communication_type_address", 'int',
                       'communication_type', 'int'),
                      ("is_primary", 'int', 'is_primary', 'int'),
                      ("is_deleted", 'int', 'is_deleted', 'int'),
                      ("Address", 'string', 'comunication', 'string'),
                      ("last_update_date", 'string', 'last_update_date',
                       'timestamp')])
        #
        #
        resolvechoice3 = ResolveChoice.apply(
            frame=applymapping3,
            choice="make_cols",
            transformation_ctx="resolvechoice3")
        dropnullfields3 = DropNullFields.apply(
            frame=resolvechoice3, transformation_ctx="dropnullfields3")

        datasink3 = glueContext.write_dynamic_frame.from_jdbc_conf(
            frame=dropnullfields3,
            catalog_connection="glue_redshift",
            connection_options={
                "dbtable": "user_communication",
                "database": "dts_odin"
            },
            redshift_tmp_dir="s3n://dts-odin/temp/user/communication/address/",
            transformation_ctx="datasink3")
        # --------------------------------------------------------------------------------------------------------------#

        #insert into source_id

        # get the max _key from the datasource
        datasource = dyf_crm_contacts.toDF()
        flag = datasource.agg({"_key": "max"}).collect()[0][0]

        # overwrite the new flag to S3
        flag_data = [flag]
        df = spark.createDataFrame(flag_data, "long").toDF('flag')

        df.write.parquet(
            "s3a://dts-odin/flag/flag_user_communication_full_name.parquet",
            mode="overwrite")
Example 20
## @args: [f = lambda row : (bool(re.match("Match Finished", row["status"]))), transformation_ctx = "Transform2"]
## @return: Transform2
## @inputs: [frame = Transform7]
Transform2 = Filter.apply(frame = Transform7, f = lambda row : (bool(re.match("Match Finished", row["status"]))), transformation_ctx = "Transform2")
## @type: ApplyMapping
## @args: [mappings = [("shotsongoalhometeam", "int", "shotsongoalhometeam", "int"), ("shotsongoalawayteam", "int", "shotsongoalawayteam", "int"), ("shotsinsideboxhometeam", "int", "shotsinsideboxhometeam", "int"), ("shotsinsideboxawayteam", "int", "shotsinsideboxawayteam", "int"), ("totalshotshometeam", "int", "totalshotshometeam", "int"), ("totalshotsawayteam", "int", "totalshotsawayteam", "int"), ("ballpossessionhometeam", "string", "ballpossessionhometeam", "string"), ("ballpossessionawayteam", "string", "ballpossessionawayteam", "string"), ("idfixture", "long", "idfixture", "int"), ("date", "string", "date", "string"), ("time", "string", "time", "string"), ("idhometeam", "long", "idhometeam", "int"), ("idawayteam", "long", "idawayteam", "int"), ("goalshometeam", "long", "goalshometeam", "int"), ("goalsawayteam", "long", "goalsawayteam", "int")], transformation_ctx = "Transform6"]
## @return: Transform6
## @inputs: [frame = Transform2]
Transform6 = ApplyMapping.apply(frame = Transform2, mappings = [("shotsongoalhometeam", "int", "shotsongoalhometeam", "int"), ("shotsongoalawayteam", "int", "shotsongoalawayteam", "int"), ("shotsinsideboxhometeam", "int", "shotsinsideboxhometeam", "int"), ("shotsinsideboxawayteam", "int", "shotsinsideboxawayteam", "int"), ("totalshotshometeam", "int", "totalshotshometeam", "int"), ("totalshotsawayteam", "int", "totalshotsawayteam", "int"), ("ballpossessionhometeam", "string", "ballpossessionhometeam", "string"), ("ballpossessionawayteam", "string", "ballpossessionawayteam", "string"), ("idfixture", "long", "idfixture", "int"), ("date", "string", "date", "string"), ("time", "string", "time", "string"), ("idhometeam", "long", "idhometeam", "int"), ("idawayteam", "long", "idawayteam", "int"), ("goalshometeam", "long", "goalshometeam", "int"), ("goalsawayteam", "long", "goalsawayteam", "int")], transformation_ctx = "Transform6")
## @type: Join
## @args: [columnConditions = ["=", "="], joinType = right, keys2 = ["idfixture", "idhometeam"], keys1 = ["(predictions) idfixture", "(predictions) idteam"], transformation_ctx = "Transform4"]
## @return: Transform4
## @inputs: [frame1 = Transform1, frame2 = Transform6]
Transform1DF = Transform1.toDF()
Transform6DF = Transform6.toDF()
Transform4 = DynamicFrame.fromDF(Transform1DF.join(Transform6DF, (Transform1DF['(predictions) idfixture'] == Transform6DF['idfixture']) & (Transform1DF['(predictions) idteam'] == Transform6DF['idhometeam']), "right"), glueContext, "Transform4")
## @type: ApplyMapping
## @args: [mappings = [("(predictions) xgoals", "double", "xgoalshometeam", "double"), ("shotsongoalhometeam", "int", "shotsongoalhometeam", "int"), ("shotsongoalawayteam", "int", "shotsongoalawayteam", "int"), ("shotsinsideboxhometeam", "int", "shotsinsideboxhometeam", "int"), ("shotsinsideboxawayteam", "int", "shotsinsideboxawayteam", "int"), ("totalshotshometeam", "int", "totalshotshometeam", "int"), ("totalshotsawayteam", "int", "totalshotsawayteam", "int"), ("ballpossessionhometeam", "string", "ballpossessionhometeam", "string"), ("ballpossessionawayteam", "string", "ballpossessionawayteam", "string"), ("idfixture", "int", "idfixture", "int"), ("date", "string", "date", "string"), ("time", "string", "time", "string"), ("idhometeam", "int", "idhometeam", "int"), ("idawayteam", "int", "idawayteam", "int"), ("goalshometeam", "int", "goalshometeam", "int"), ("goalsawayteam", "int", "goalsawayteam", "int")], transformation_ctx = "Transform5"]
## @return: Transform5
## @inputs: [frame = Transform4]
Transform5 = ApplyMapping.apply(frame = Transform4, mappings = [("(predictions) xgoals", "double", "xgoalshometeam", "double"), ("shotsongoalhometeam", "int", "shotsongoalhometeam", "int"), ("shotsongoalawayteam", "int", "shotsongoalawayteam", "int"), ("shotsinsideboxhometeam", "int", "shotsinsideboxhometeam", "int"), ("shotsinsideboxawayteam", "int", "shotsinsideboxawayteam", "int"), ("totalshotshometeam", "int", "totalshotshometeam", "int"), ("totalshotsawayteam", "int", "totalshotsawayteam", "int"), ("ballpossessionhometeam", "string", "ballpossessionhometeam", "string"), ("ballpossessionawayteam", "string", "ballpossessionawayteam", "string"), ("idfixture", "int", "idfixture", "int"), ("date", "string", "date", "string"), ("time", "string", "time", "string"), ("idhometeam", "int", "idhometeam", "int"), ("idawayteam", "int", "idawayteam", "int"), ("goalshometeam", "int", "goalshometeam", "int"), ("goalsawayteam", "int", "goalsawayteam", "int")], transformation_ctx = "Transform5")
## @type: Join
## @args: [columnConditions = ["=", "="], joinType = left, keys2 = ["(predictions) idfixture", "(predictions) idteam"], keys1 = ["idfixture", "idawayteam"], transformation_ctx = "Transform8"]
## @return: Transform8
## @inputs: [frame1 = Transform5, frame2 = Transform1]
Transform5DF = Transform5.toDF()
Transform1DF = Transform1.toDF()
Transform8 = DynamicFrame.fromDF(Transform5DF.join(Transform1DF, (Transform5DF['idfixture'] == Transform1DF['(predictions) idfixture']) & (Transform5DF['idawayteam'] == Transform1DF['(predictions) idteam']), "left"), glueContext, "Transform8")
## @type: ApplyMapping
## @args: [mappings = [("date", "string", "date", "string"), ("(predictions) xgoals", "double", "xgoalsawayteam", "decimal"), ("shotsinsideboxhometeam", "int", "shotsinsideboxhometeam", "int"), ("totalshotsawayteam", "int", "totalshotsawayteam", "int"), ("totalshotshometeam", "int", "totalshotshometeam", "int"), ("xgoalshometeam", "double", "xgoalshometeam", "decimal"), ("idfixture", "int", "idfixture", "int"), ("goalshometeam", "int", "goalshometeam", "int"), ("idawayteam", "int", "idawayteam", "int"), ("goalsawayteam", "int", "goalsawayteam", "int"), ("ballpossessionhometeam", "string", "ballpossessionhometeam", "string"), ("idhometeam", "int", "idhometeam", "int"), ("shotsongoalhometeam", "int", "shotsongoalhometeam", "int"), ("shotsinsideboxawayteam", "int", "shotsinsideboxawayteam", "int"), ("time", "string", "time", "string"), ("shotsongoalawayteam", "int", "shotsongoalawayteam", "int"), ("ballpossessionawayteam", "string", "ballpossessionawayteam", "string")], transformation_ctx = "Transform0"]
## @return: Transform0
Example 21
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

## @params: [JOB_NAME]
glueContext = GlueContext(SparkContext.getOrCreate())
spark = glueContext.spark_session

ds0 = glueContext.create_dynamic_frame.from_catalog(
    database="autoglues3lineage",
    table_name="train_sm_s2adb_csv",
    transformation_ctx="ds0")

ds3 = ds0.toDF()
ds3.createOrReplaceTempView("train_sm_s2adb_csv_temp2")
ds4 = spark.sql("SELECT * FROM train_sm_s2adb_csv_temp2 WHERE age > 30")
ds5 = DynamicFrame.fromDF(ds4, glueContext, "ds5")

ds6 = glueContext.write_dynamic_frame.from_options(
    frame=ds5,
    connection_type="redshift",
    connection_options={
        "url":
        "jdbc:redshift://redshift-cluster-1.csvp5wcqqxvw.us-east-1.redshift.amazonaws.com:5439/world",
        "dbtable": "atn.gluetable312"
    },
    transformation_ctx="ds6")
ds7 = glueContext.write_dynamic_frame.from_options(
    frame=ds5,
    connection_type="s3",
    connection_options={"path": "s3://asgqatestautomation4/Targetdata312"},
    format="json",
Example 22
current_timestamp = time.strftime("%Y-%m-%d %H:%M:%S")

######################################
####        CONNECTION BLOCK      ####
######################################

## argo_carrier_visit connection
argoCV_ds = glueContext.create_dynamic_frame.from_catalog(
    database="staging_initial",
    table_name="argo_carrier_visit",
    transformation_ctx="argoCV_ds")
argoCV_regDF = argoCV_ds.toDF()
argoCV_regDF = argoCV_regDF.withColumn("sourcesystem", lit("PNCT")).withColumn(
    "dboperationtype", lit("L")).withColumn("audtdateadded",
                                            lit(current_timestamp))
argoCV_dynDF = DynamicFrame.fromDF(argoCV_regDF, glueContext, "nested")

## argo_chargeable_unit_events connection
argoCUE_ds = glueContext.create_dynamic_frame.from_catalog(
    database="staging_initial",
    table_name="argo_chargeable_unit_events",
    transformation_ctx="argoCUE_ds")
argoCUE_regDF = argoCUE_ds.toDF()
argoCUE_regDF = argoCUE_regDF.withColumn(
    "sourcesystem",
    lit("PNCT")).withColumn("dboperationtype",
                            lit("L")).withColumn("audtdateadded",
                                                 lit(current_timestamp))
argoCUE_dynDF = DynamicFrame.fromDF(argoCUE_regDF, glueContext, "nested")

## argo_visit_details connection
inventory = glueContext.create_dynamic_frame.from_catalog(
    database=DATABASE, table_name=INVENTORY_TABLE).toDF()
filelist = glueContext.create_dynamic_frame.from_catalog(
    database=DATABASE, table_name=FILENAME_TABLE)
mapped = filelist.apply_mapping([
    ("archiveid", "string", "archiveid", "string"),
    ("override", "string", "override", "string")
]).toDF().dropDuplicates(['archiveid'])

rownum = inventory.withColumn(
    "row_num",
    row_number().over(
        Window.orderBy(inventory['creationdate'],
                       inventory['archiveid'])).cast("long"))
merged = rownum.join(mapped, "archiveid", how='left_outer')

frame = DynamicFrame.fromDF(merged, glueContext, "merged")


def transform(rec):
    rec["part"] = rec["row_num"] // partiton_size
    rec["archivedescription"] = rec["override"] if rec["override"] and rec[
        "override"].strip() else rec["archivedescription"]
    rec.pop('override', None)
    return rec


trans0 = Map.apply(frame=frame, f=transform)

sink = glueContext.getSink(connection_type="s3",
                           path='s3://' + STAGING_BUCKET + '/partitioned/',
                           enableUpdateCatalog=True,
Example 24
              ("passenger_count", "long", "passenger_count", "long"),
              ("trip_distance", "double", "trip_distance", "double"),
              ("pulocationid", "long", "pulocationid", "long"),
              ("dolocationid", "long", "dolocationid", "long"),
              ("fare_amount", "double", "fare_amount", "double"),
              ("tip_amount", "double", "tip_amount", "double"),
              ("total_amount", "double", "total_amount", "double")],
    transformation_ctx="applymapping1")

resolvechoice2 = ResolveChoice.apply(frame=applymapping1,
                                     choice="make_cols",
                                     transformation_ctx="resolvechoice2")
sparkdf = resolvechoice2.toDF()
transform1 = sparkdf.where(
    func.col('tpep_pickup_datetime').between('2019-01-01', '2020-12-31'))
transform2 = transform1.dropna(subset=['passenger_count', 'trip_distance'])
result = DynamicFrame.fromDF(dataframe=transform2,
                             glue_ctx=glueContext,
                             name='result')

datasink4 = glueContext.write_dynamic_frame.from_jdbc_conf(
    frame=result,
    catalog_connection="redshift-east",
    connection_options={
        "dbtable": "yellow",
        "database": "dev"
    },
    redshift_tmp_dir=args["TempDir"],
    transformation_ctx="datasink4")

job.commit()
Example 25
def _find_row(paintings: DynamicFrame, episode_text: str):
    """ Assert a given row exists in the dynamic frame and that it contains the expected values """
    matches = paintings.filter(
        lambda x: x['season_episode_text'] == episode_text).toDF().collect()
    assert len(matches) == 1
    return matches[0]
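# A hedged usage sketch for _find_row; the sample data, column names, and the
# glue_context argument are illustrative assumptions, not part of the original tests.
def example_find_row_usage(glue_context):
    spark = glue_context.spark_session
    df = spark.createDataFrame(
        [("S01E01", "A Walk in the Woods")],
        ["season_episode_text", "painting_title"])
    paintings = DynamicFrame.fromDF(df, glue_context, "paintings")
    row = _find_row(paintings, "S01E01")
    assert row["painting_title"] == "A Walk in the Woods"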
Exemplo n.º 26
0
def main():
    def checknull(level_modified, level_study):
        if level_modified is not None:
            return level_modified
        else:
            return level_study

    checknull_ = udf(checknull, StringType())

    def concaText(student_behavior_date, behavior_id, student_id, contact_id,
                  package_code, package_endtime, package_starttime,
                  student_level_code, student_package_status_code,
                  transformed_at):
        text_concat = ""
        if student_behavior_date is not None:
            text_concat += str(student_behavior_date)
        if behavior_id is not None:
            text_concat += str(behavior_id)
        if student_id is not None:
            text_concat += str(student_id)
        if contact_id is not None:
            text_concat += str(contact_id)
        if package_code is not None:
            text_concat += str(package_code)
        if package_endtime is not None:
            text_concat += str(package_endtime)
        if package_starttime is not None:
            text_concat += str(package_starttime)
        if student_level_code is not None:
            text_concat += str(student_level_code)
        if student_package_status_code is not None:
            text_concat += str(student_package_status_code)
        if transformed_at is not None:
            text_concat += str(transformed_at)
        return text_concat

    concaText = udf(concaText, StringType())
    glueContext = GlueContext(SparkContext.getOrCreate())
    spark = glueContext.spark_session

    dyf_student_contact = glueContext.create_dynamic_frame.from_catalog(
        database="tig_advisor", table_name="student_contact")

    dyf_student_contact = dyf_student_contact.select_fields(
        ['student_id', 'contact_id', 'level_study'])

    dyf_log_student_level_study = glueContext.create_dynamic_frame.from_catalog(
        database="tig_advisor", table_name="log_student_level_study")

    dyf_log_student_level_study = dyf_log_student_level_study.select_fields([
        'contact_id', 'level_current', 'level_modified', 'package_code',
        'time_created'
    ])
    dyf_log_student_level_study = dyf_log_student_level_study.resolveChoice(
        specs=[('_key', 'cast:int')])

    dyf_tpe_invoice_product = glueContext.create_dynamic_frame.from_catalog(
        database="tig_market", table_name="tpe_invoice_product")
    dyf_tpe_invoice_product = dyf_tpe_invoice_product.select_fields([
        '_key', 'timecreated', 'user_id', 'buyer_id', 'invoice_packages_price',
        'invoice_price', 'invoice_code'
    ])
    dyf_tpe_invoice_product = dyf_tpe_invoice_product.resolveChoice(
        specs=[('_key', 'cast:long')])
    dyf_tpe_invoice_product_details = glueContext.create_dynamic_frame.from_catalog(
        database="tig_market", table_name="tpe_invoice_product_details")

    dyf_tpe_invoice_product_details = dyf_tpe_invoice_product_details.select_fields(
        ['cat_code', 'package_time', 'invoice_code'])

    dyf_student_package = glueContext.create_dynamic_frame.from_catalog(
        database="tig_advisor", table_name="log_student_package")

    # select only the required fields
    dyf_student_package = dyf_student_package.select_fields(
        ['student_id', 'start_time', 'end_time',
         'package_code']).rename_field('student_id', 'student_id1')
    dyf_student_package.printSchema()
    dyf_student_package.show(2)
    # read the checkpoint flag from S3
    try:
        # read the last processed _key flag from S3
        df_flag = spark.read.parquet(
            "s3a://dtsodin/flag/student_behavior/flag_hoc_vien_duoc_mua_goi_nap_tien.parquet"
        )
        start_read = df_flag.collect()[0]['flag']
        print('read from index: ', start_read)

        # compare the datasource _key with the flag and keep only records with _key > flag
        dyf_tpe_invoice_product = Filter.apply(
            frame=dyf_tpe_invoice_product, f=lambda x: x['_key'] > start_read)
    except:
        print('read flag file error ')

    print('the number of new contacts: ', dyf_tpe_invoice_product.count())

    if (dyf_tpe_invoice_product.count() > 0):
        df_log_student_level_study = dyf_log_student_level_study.toDF()
        df_log_student_level_study = df_log_student_level_study.groupby(
            'contact_id', 'level_current', 'level_modified',
            'package_code').agg(f.max('time_created').alias('time_created'))

        dyf_join0 = Join.apply(dyf_tpe_invoice_product,
                               dyf_tpe_invoice_product_details, 'invoice_code',
                               'invoice_code')
        print("@@@@@@@@@@@@")
        dyf_join0.printSchema()
        dyf_join0.show(2)
        dyf_log_student_level_study = DynamicFrame.fromDF(
            df_log_student_level_study, glueContext,
            "dyf_log_student_level_study")

        dyf_join1 = Join.apply(dyf_student_contact, dyf_join0, "contact_id",
                               "user_id")
        dyf_join = Join.apply(dyf_join1, dyf_log_student_level_study,
                              "user_id", "contact_id")
        print("@@@@@@@@@@@@")
        dyf_join.printSchema()
        dyf_join.show(2)
        dyf_join = Filter.apply(
            frame=dyf_join, f=lambda x: x['time_created'] <= x['timecreated'])

        dyf_data_join3 = Join.apply(dyf_join, dyf_student_package,
                                    "student_id", "student_id1")
        dyf_data_join3 = Filter.apply(
            frame=dyf_data_join3,
            f=lambda x: x['package_code'] == x['cat_code'])
        df_data_join3 = dyf_data_join3.toDF()
        df_data_join3 = df_data_join3 \
            .withColumn("student_level_code",
                        checknull_(df_data_join3.level_modified, df_data_join3.level_study)) \
            .withColumn("behavior_id", f.lit(3)) \
            .withColumn("student_package_status_code", f.lit("DEACTIVED")) \
            .withColumn("student_behavior_date", from_unixtime(df_data_join3.timecreated)) \
            .withColumn("package_starttime", df_data_join3['start_time']) \
            .withColumn("package_endtime", df_data_join3['end_time']) \
            .withColumn("transformed_at", f.lit(None))
        df_data_join3 = df_data_join3.withColumn(
            'student_behavior_id',
            f.md5(
                concaText(df_data_join3.student_behavior_date,
                          df_data_join3.behavior_id, df_data_join3.student_id,
                          df_data_join3.contact_id, df_data_join3.package_code,
                          df_data_join3.package_endtime,
                          df_data_join3.package_starttime,
                          df_data_join3.student_level_code,
                          df_data_join3.student_package_status_code,
                          df_data_join3.transformed_at)))
        df_data_join3 = df_data_join3.dropDuplicates()
        dyf_data_join3 = DynamicFrame.fromDF(df_data_join3, glueContext,
                                             "dyf_data_join3")
        dyf_data_join3 = dyf_data_join3.resolveChoice(
            specs=[('behavior_id',
                    'cast:int'), ('student_behavior_date', 'cast:timestamp')])
        dyf_data_join3.printSchema()
        dyf_data_join3.show(2)
        applymapping = ApplyMapping.apply(
            frame=dyf_data_join3,
            mappings=[("student_behavior_id", "string", "student_behavior_id",
                       "string"),
                      ("contact_id", "string", "contact_id", "string"),
                      ("student_behavior_date", "timestamp",
                       "student_behavior_date", "long"),
                      ("student_id", "string", "student_id", "long"),
                      ("cat_code", "string", "package_code", "string"),
                      ("package_starttime", "int", "package_starttime",
                       "long"),
                      ("package_endtime", "int", "package_endtime", "long"),
                      ("student_package_status_code", "string",
                       "student_status_code", "string"),
                      ("behavior_id", "int", "behavior_id", "long"),
                      ("student_level_code", "string", "student_level_code",
                       "string")])

        resolvechoice = ResolveChoice.apply(frame=applymapping,
                                            choice="make_cols",
                                            transformation_ctx="resolvechoice")

        dropnullfields = DropNullFields.apply(
            frame=resolvechoice, transformation_ctx="dropnullfields")

        print(dropnullfields.count())
        dropnullfields.toDF().show()

        glueContext.write_dynamic_frame.from_options(
            frame=dropnullfields,
            connection_type="s3",
            connection_options={
                "path": "s3://dtsodin/student_behavior/student_behavior",
                "partitionKeys": ["behavior_id"]
            },
            format="parquet")

        applymapping1 = ApplyMapping.apply(
            frame=dyf_data_join3,
            mappings=[("invoice_packages_price", "int", "measure1", "long"),
                      ("behavior_id", "int", "behavior_id", "long"),
                      ("invoice_price", "int", "measure2 ", "long")])

        resolvechoice1 = ResolveChoice.apply(
            frame=applymapping1,
            choice="make_cols",
            transformation_ctx="resolvechoice1")

        dropnullfields1 = DropNullFields.apply(
            frame=resolvechoice1, transformation_ctx="dropnullfields1")

        print(dropnullfields1.count())
        dropnullfields1.toDF().show()
        glueContext.write_dynamic_frame.from_options(
            frame=dropnullfields1,
            connection_type="s3",
            connection_options={
                "path":
                "s3://dtsodin/student_behavior/student_general_behavior",
                "partitionKeys": ["behavior_id"]
            },
            format="parquet")

        df_tpe_invoice_product = dyf_tpe_invoice_product.toDF()
        flag = df_tpe_invoice_product.agg({"_key": "max"}).collect()[0][0]

        flag_data = [flag]
        df = spark.createDataFrame(flag_data, "long").toDF('flag')

        # overwrite the new max _key flag to S3
        df.write.parquet(
            "s3a://dtsodin/flag/student_behavior/flag_hoc_vien_duoc_mua_goi_nap_tien.parquet",
            mode="overwrite")
Exemplo n.º 27
0
# extract out transactions for test/validation
n_train = int(transactions.count() * train_data_ratio)
test_ids = transactions.select_fields(TRANSACTION_ID)
get_fraud_frac = lambda series: 100 * sum(series) / len(series)
isfraud_df: DynamicFrame = transactions.select_fields("isFraud")
logger.info("Percent fraud for train transactions: {}".format(
    sum_col(transactions.toDF(), "isFraud")))
dump_df_to_s3(test_ids.toDF(), 'test', header=False)

id_cols = args['id_cols']
cat_cols = args['cat_cols']
features_df, labels_df = get_features_and_labels(transactions.toDF(), id_cols,
                                                 cat_cols)

# Creating glue dynamic frame from spark dataframe
features_dynamic_df = DynamicFrame.fromDF(features_df, glueContext,
                                          'FeaturesDF')
features_dynamic_df = GlueGremlinCsvTransforms.create_prefixed_columns(
    features_dynamic_df, [('~id', TRANSACTION_ID, 't')])
logger.info(f'Upserting transactions as vertices of graph...')
features_dynamic_df.toDF().foreachPartition(
    gremlin_client.upsert_vertices('Transaction', batch_size=50))
logger.info(f'Creating glue DF from labels dataframe')
labels_dynamic_df = DynamicFrame.fromDF(labels_df, glueContext, 'LabelsDF')
labels_dynamic_df = GlueGremlinCsvTransforms.create_prefixed_columns(
    labels_dynamic_df, [('~id', TRANSACTION_ID, 't')])
logger.info(f'Upserting transactions with isFraud property...')
labels_dynamic_df.toDF().foreachPartition(
    gremlin_client.upsert_vertices('Transaction', batch_size=100))

dump_df_to_s3(features_df, 'features')
dump_df_to_s3(labels_df, 'tags')
Exemplo n.º 28
0
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)

# Read DynamicFrame.
dynf = glueContext.create_dynamic_frame.from_catalog(database="default",
                                                     table_name="sales",
                                                     transformation_ctx="dynf")

# Convert to DataFrame.
df = dynf.toDF()

# Put table on DataFrame.
df.createOrReplaceTempView("sales_tmp")

# Run SQL.
sql_df = spark.sql(
    "SELECT id, date, store, state, product, amount * 2.1 from sales_tmp")

# Convert back to DynamicFrame.
dynf_new = DynamicFrame.fromDF(sql_df, glueContext, "df")
datasink4 = glueContext.write_dynamic_frame.from_catalog(
    frame=dynf_new,
    database="default",
    table_name="sales1",
    transformation_ctx="datasink4")

# Commit.
job.commit()
Exemplo n.º 29
0
specid ,
systemcreationdate ,
udblistingid,
to_date(effectiveto) AS effectiveto_date
FROM edw_listings WHERE row_number_seq = 1
            ''')
#df_joined.cache()
df_joined.describe()
df_joined.printSchema()
#print  df_joined.count()
s3_location_target = 's3://move-dataeng-temp-dev/glue-etl/parquet_data/listingdim_pdt_deduped_pq'

output_folder = s3_location_target  # With absolute path
print('output_folder= %s' % output_folder)
#----  PySpark section ----

#df_joined.write.mode('overwrite').parquet(output_folder)
#df_joined.write.mode('overwrite').save(output_folder)
new_dynamic_frame = DynamicFrame.fromDF(df_joined, glueContext,
                                        "new_dynamic_frame")
codec = 'snappy'
#glueContext.write_dynamic_frame.from_options(frame = m_df, connection_type = "s3", connection_options = {"path":  child_output_dir}, format = "parquet", compression=codec)
glueContext.write_dynamic_frame.from_options(
    frame=new_dynamic_frame,
    connection_type="s3",
    connection_options={"path": output_folder},
    format="parquet",
    compression=codec)

print('Done Parquet Conversion!')
Exemplo n.º 30
0
    def create_dynamic_frame_from_rdd(self, data, name, schema=None, sample_ratio=None, transformation_ctx=""):
        """Creates a DynamicFrame from an RDD."""
        df = super(GlueContext, self).createDataFrame(data, schema, sample_ratio)
        return DynamicFrame.fromDF(df, self, name)
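# A minimal usage sketch for the method above; the row values and variable names are
# illustrative, and a Glue environment providing GlueContext is assumed.
from pyspark.context import SparkContext
from pyspark.sql import Row
from awsglue.context import GlueContext

glue_ctx = GlueContext(SparkContext.getOrCreate())
rows = glue_ctx.spark_session.sparkContext.parallelize(
    [Row(id=1, name="a"), Row(id=2, name="b")])
dyf_from_rdd = glue_ctx.create_dynamic_frame_from_rdd(rows, name="from_rdd")
dyf_from_rdd.printSchema()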
Exemplo n.º 31
0
args = getResolvedOptions(sys.argv, ['JOB_NAME'])

sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)

## Read data from the REST API into a DataFrame using the DataDirect Autonomous REST Connector JDBC driver
source_df = spark.read.format("jdbc").option(
    "url",
    "jdbc:datadirect:autorest:config=yelp.rest;AuthenticationMethod=HttpHeader;AuthHeader=Authorization;SecurityToken='Bearer JcMUtuWfaqJdWJBqqLrgBxfbYh6GIUGv3zUyXOG4zsfe6wnOtlZBeroFb8rpRM-dESFzcSAUd1YDAtQm2yl0hrJwfldvHp2AdEzRXThZku69r-w4wTv80Cj7d08ZXHYx'"
).option("dbtable", "AUTOREST.BUSINESSES").option(
    "driver", "com.ddtek.jdbc.autorest.AutoRESTDriver").load()

job.init(args['JOB_NAME'], args)

print(source_df)

## Convert the DataFrame to an AWS Glue DynamicFrame
dynamic_dframe = DynamicFrame.fromDF(source_df, glueContext, "dynamic_df")

## Write the DynamicFrame to S3 in CSV format. You can also write it to RDS/Redshift by using a connection defined previously in Glue
datasink4 = glueContext.write_dynamic_frame.from_options(
    frame=dynamic_dframe,
    connection_type="s3",
    connection_options={"path": "s3://glueuserdata"},
    format="csv",
    transformation_ctx="datasink4")

job.commit()
Exemplo n.º 32
0
####        CONNECTION BLOCK      ####
######################################

## ref_bizunit_scoped connection
refBizScopedCon_ds = glueContext.create_dynamic_frame.from_catalog(
    database="nola_staging_initial",
    table_name="ref_bizunit_scoped",
    transformation_ctx="refBizScopedCon_ds")
refBizScopedCon_regDF = refBizScopedCon_ds.toDF()
refBizScopedCon_regDF = refBizScopedCon_regDF.withColumn(
    "sourcesystem",
    lit("NOLA")).withColumn("dboperationtype",
                            lit("L")).withColumn("audtdateadded",
                                                 lit(current_timestamp))
refBizScopedCon_distDF = refBizScopedCon_regDF.distinct()
refBizScopedCon_dynDF = DynamicFrame.fromDF(refBizScopedCon_distDF,
                                            glueContext, "nested")

## ref_carrier_itinerary connection
refCarItinCon_ds = glueContext.create_dynamic_frame.from_catalog(
    database="nola_staging_initial",
    table_name="ref_carrier_itinerary",
    transformation_ctx="refCarItinCon_ds")
refCarItinCon_regDF = refCarItinCon_ds.toDF()
refCarItinCon_regDF = refCarItinCon_regDF.withColumn(
    "sourcesystem",
    lit("NOLA")).withColumn("dboperationtype",
                            lit("L")).withColumn("audtdateadded",
                                                 lit(current_timestamp))
refCarItinCon_distDF = refCarItinCon_regDF.distinct()
refCarItinCon_dynDF = DynamicFrame.fromDF(refCarItinCon_distDF, glueContext,
                                          "nested")
Exemplo n.º 33
0
# Cast choice columns to integers; values that cannot be cast become null
medicare_res_cast = medicare_dyf.resolveChoice(specs=[('provider id',
                                                       'cast:long')])
medicare_res_project = medicare_dyf.resolveChoice(specs=[('provider id',
                                                          'project:long')])
medicare_res_make_cols = medicare_dyf.resolveChoice(specs=[('provider id',
                                                            'make_cols')])
medicare_res_make_struct = medicare_dyf.resolveChoice(specs=[('provider id',
                                                              'make_struct')])

# Spark SQL on a Spark dataframe
medicare_df = medicare_dyf.toDF()
medicare_df.createOrReplaceTempView("medicareTable")
medicare_sql_df = spark.sql(
    "SELECT * FROM medicareTable WHERE `total discharges` > 30")
medicare_sql_dyf = DynamicFrame.fromDF(medicare_sql_df, glueContext,
                                       "medicare_sql_dyf")

# Write it out in Json
glueContext.write_dynamic_frame.from_options(
    frame=medicare_res_cast,
    connection_type="s3",
    connection_options={"path": medicare_cast},
    format="json")
glueContext.write_dynamic_frame.from_options(
    frame=medicare_res_project,
    connection_type="s3",
    connection_options={"path": medicare_project},
    format="json")
glueContext.write_dynamic_frame.from_options(
    frame=medicare_res_make_cols,
    connection_type="s3",
Exemplo n.º 34
0
def main():
    sc = SparkContext()
    glueContext = GlueContext(sc)
    spark = glueContext.spark_session
    spark.conf.set("spark.sql.session.timeZone", "GMT+07:00")
    # get dynamic frame source
    is_dev = True
    limit = True
    # source database for contact information
    dyf_crm_contacts = glueContext.create_dynamic_frame.from_catalog(
        database='crm_native', table_name='contacts')

    # dyf_crm_contacts = Filter.apply(frame=dyf_crm_contacts,
    #                                 f=lambda x: x["Id"] < 1102)

    # print('dyf_crm_contacts::fdfdfdfdfdfdfd----------------')
    # dyf_crm_contacts.printSchema()
    dyf_crm_contacts = dyf_crm_contacts.resolveChoice(specs=[('Id',
                                                              'cast:int')])

    print('dyf_crm_contacts')
    dyf_crm_contacts.printSchema()

    # read the flag checkpoint from S3
    try:
        df_flag = spark.read.parquet(
            "s3a://dtsodin/flag/flag_user_profile.parquet")
        read_from_index = df_flag.collect()[0]['flag']
        print('read from index: ', read_from_index)
        dyf_crm_contacts = Filter.apply(frame=dyf_crm_contacts,
                                        f=lambda x: x["Id"] > read_from_index)
    except:
        print('read flag file error ')
    print('the number of new contacts: ', dyf_crm_contacts.count())

    crm_contacts_number = dyf_crm_contacts.count()
    print('crm_contacts_number: ', crm_contacts_number)
    if crm_contacts_number < 1:
        print('Stopping--- crm_contacts_number < 1')
        return

    dyf_crm_contacts = dyf_crm_contacts.select_fields(
        ['_key', 'Id', 'Code', 'Birthday', 'Gender', 'Job', 'CreatedDate'])
    dy_crm_contacts_cache = dyf_crm_contacts.toDF()
    dy_crm_contacts_cache = dy_crm_contacts_cache.dropDuplicates(['Code'])
    dy_crm_contacts_cache = dy_crm_contacts_cache.cache()
    dyf_crm_contacts = DynamicFrame.fromDF(dy_crm_contacts_cache, glueContext,
                                           'dyf_crm_contacts')

    today = date.today()
    d4 = today.strftime("%Y-%m-%d")
    print("d4 =", d4)

    # print('reached here ------------------')
    # print('dyf_crm_contacts::----------------')
    # dyf_crm_contacts.printSchema()
    # try:
    #--------------------------------------------------------------------------------------------------------------#
    dyf_crm_contacts = Filter.apply(
        frame=dyf_crm_contacts,
        f=lambda x: x["Id"] is not None and x["Id"] != '' and x[
            "Code"] is not None and x["Code"] != '')
    # --------------------------------------------------------------------------------------------------------------#

    # --------------------------------------------------------------------------------------------------------------#
    if (dyf_crm_contacts.count() > 0):
        dy_crm_contacts = dyf_crm_contacts.toDF()
        # dy_crm_contacts = dy_crm_contacts.dropDuplicates(['Code'])
        dy_crm_contacts = dy_crm_contacts.withColumn('source_type', f.lit(1))
        dy_crm_contacts = dy_crm_contacts.withColumn('is_root', f.lit(1))
        dy_crm_contacts = dy_crm_contacts.withColumn('description', f.lit(d4))
        dy_crm_contacts = dy_crm_contacts.withColumn('last_update_date',
                                                     f.lit(d4))
        dy_crm_contacts_cache_2 = dy_crm_contacts.cache()
        dyf_crm_contacts = DynamicFrame.fromDF(dy_crm_contacts_cache_2,
                                               glueContext, 'dyf_crm_contacts')

        applymapping2 = ApplyMapping.apply(
            frame=dyf_crm_contacts,
            mappings=[("Id", "int", "user_id", "bigint"),
                      ("Gender", 'int', 'gender', 'string'),
                      ("is_root", 'int', 'is_root', 'int'),
                      ("Birthday", 'string', 'birthday', 'date'),
                      ("Job", 'string', 'job', 'string'),
                      ("last_update_date", 'string', 'last_update_date',
                       'timestamp')])
        #
        #
        resolvechoice2 = ResolveChoice.apply(
            frame=applymapping2,
            choice="make_cols",
            transformation_ctx="resolvechoice2")
        dropnullfields6 = DropNullFields.apply(
            frame=resolvechoice2, transformation_ctx="dropnullfields2")

        datasink1 = glueContext.write_dynamic_frame.from_jdbc_conf(
            frame=dropnullfields6,
            catalog_connection="glue_redshift",
            connection_options={
                "dbtable": "user_profile",
                "database": "dts_odin"
            },
            redshift_tmp_dir="s3n://dts-odin/temp/user/profile/",
            transformation_ctx="datasink4")

        #insert into source_id

        # print('dyf_crm_contacts::-------source_type---------')
        # dyf_crm_contacts.printSchema()

        applymapping3 = ApplyMapping.apply(
            frame=dyf_crm_contacts,
            mappings=[("Id", "int", "user_id", "bigint"),
                      ("source_type", 'int', 'source_type', 'int'),
                      ("Code", 'string', 'source_id', 'string'),
                      ("description", 'string', 'description', 'string')])

        resolvechoice3 = ResolveChoice.apply(
            frame=applymapping3,
            choice="make_cols",
            transformation_ctx="resolvechoice3")
        dropnullfields7 = DropNullFields.apply(
            frame=resolvechoice3, transformation_ctx="dropnullfields7")

        # datasink6 = glueContext.write_dynamic_frame.from_jdbc_conf(frame=dropnullfields7,
        #                                                            catalog_connection="glue_redshift",
        #                                                            connection_options={"dbtable": "user_map",
        #                                                                                "database": "dts_odin"},
        #                                                            redshift_tmp_dir="s3n://dts-odin/temp/user/map/",
        #                                                            transformation_ctx="datasink5")

        # get the max _key from the datasource
        flag = dy_crm_contacts_cache.agg({"Id": "max"}).collect()[0][0]

        # overwrite the new flag to S3
        flag_data = [flag]
        df = spark.createDataFrame(flag_data, "int").toDF('flag')

        df.write.parquet("s3a://dtsodin/flag/flag_user_profile.parquet",
                         mode="overwrite")
        dy_crm_contacts_cache_2.unpersist()
        dy_crm_contacts_cache.unpersist()
Exemplo n.º 35
0
def main():
    today = datetime.now(ho_chi_minh_timezone)
    print('today: ', today)
    yesterday = today - timedelta(1)
    today_id = int(today.strftime("%Y%m%d"))
    yesterday_id = int(yesterday.strftime("%Y%m%d"))
    print('today_id: ', today_id)
    print('yesterday_id: ', yesterday_id)

    lastest_number_days = 30
    chosen_word_number = 24

    yesterday = date.today() - timedelta(1)
    yesterday_id = int(yesterday.strftime("%Y%m%d"))

    lasted_30_day = today - timedelta(lastest_number_days)
    lasted_30_day_id = int(lasted_30_day.strftime("%Y%m%d"))

    StructPlusNumber = StructType([
        StructField("lo_plus_number", LongType(), False),
        StructField("learning_object_id", LongType(), False),
        StructField("learning_last_date_id", LongType(), False)
    ])

    def getBestWords(plus_number_pair_list):
        plus_number_pair_list = \
            sorted(plus_number_pair_list, key=lambda x: x['lo_plus_number'], reverse=True)
        a = plus_number_pair_list[0:chosen_word_number]
        return a

    getBestWords = udf(getBestWords, ArrayType(StructPlusNumber))

    #--------------------------------------------------------
    StructMiniNumber = StructType([
        StructField("lo_minus_number", LongType(), False),
        StructField("learning_object_id", LongType(), False),
        StructField("learning_last_date_id", LongType(), False)
    ])

    def getWorstWords(minus_number_pair_list):
        minus_number_pair_list = \
            sorted(minus_number_pair_list, key=lambda x: x['lo_minus_number'], reverse=True)
        a = minus_number_pair_list[0:chosen_word_number]
        return a

    getWorstWords = udf(getWorstWords, ArrayType(StructMiniNumber))
    #----------------------------------------

    if IS_DEV:
        dyf_mapping_lo_student_history = glueContext.create_dynamic_frame.from_options(
            connection_type="redshift",
            connection_options={
                "url":
                "jdbc:redshift://datashine-dev.c4wxydftpsto.ap-southeast-1.redshift.amazonaws.com:5439/dts_odin",
                "user":
                "******",
                "password":
                "******",
                "dbtable":
                "mapping_lo_student_history_test",
                "redshiftTmpDir":
                "s3://dts-odin/temp1/mapping_lo_student_history_test/v9"
            })
    else:
        # dyf_mapping_lo_student_history = glueContext.create_dynamic_frame.from_catalog(
        #     database="nvn_knowledge",
        #     table_name="mapping_lo_student_history",
        #     additional_options={"path": "s3://dts-odin/nvn_knowledge/mapping_lo_student_history/*/*"},
        #     push_down_predicate="(partition_0=='starter_ait' or partition_0=='starter_micro')"
        # )

        dyf_mapping_lo_student_history = glueContext.create_dynamic_frame.from_catalog(
            database="nvn_knowledge",
            table_name="mapping_lo_student_history",
            additional_options={
                "path":
                "s3://dts-odin/nvn_knowledge/mapping_lo_student_history/*/*"
            },
            push_down_predicate="(partition_0=='starter_micro')")

    dyf_mapping_lo_student_history = dyf_mapping_lo_student_history.select_fields(
        [
            'student_id', 'learning_object_id', 'minus_number', 'plus_number',
            'lu_type', 'created_date_id'
        ])

    if not IS_DEV:
        dyf_mapping_lo_student_history = Filter.apply(
            frame=dyf_mapping_lo_student_history,
            f=lambda x: x["student_id"] is not None and x["student_id"] != 0
            and x["learning_object_id"] is not None and x[
                "created_date_id"] >= lasted_30_day_id and x["lu_type"] == 1)

    if IS_DEV:
        print('dyf_mapping_lo_student_history')
        # dyf_mapping_lo_student_history.printSchema()
        # dyf_mapping_lo_student_history.show(3)

    df_mapping_lo_student_history = dyf_mapping_lo_student_history.toDF()
    df_mapping_lo_student_history = df_mapping_lo_student_history.cache()

    # print('df_mapping_lo_student_history: ', df_mapping_lo_student_history.count())

    if df_mapping_lo_student_history.count() < 1:
        return

    df_group_plus_minus_number = df_mapping_lo_student_history.groupby(
        'student_id', 'learning_object_id').agg(
            f.sum('plus_number').alias('lo_plus_number'),
            f.sum('minus_number').alias('lo_minus_number'),
            f.max('created_date_id').alias('learning_last_date_id'))

    # print('df_group_plus_minus_number')
    df_group_plus_minus_number.printSchema()
    df_group_plus_minus_number.show(3)

    df_group_plus_minus_number = df_group_plus_minus_number.na.fill({
        'lo_plus_number':
        0,
        'lo_minus_number':
        0
    })

    # keep only the dominant count so plus and minus do not overlap for the same word
    df_group_plus_minus_number = df_group_plus_minus_number\
        .select(
            'student_id', 'learning_object_id',
            f.when(f.col('lo_plus_number') >= f.col('lo_minus_number'), f.col('lo_plus_number'))
                .otherwise(0).alias('lo_plus_number'),
            f.when(f.col('lo_plus_number') < f.col('lo_minus_number'), f.col('lo_minus_number'))
                .otherwise(0).alias('lo_minus_number'),

            'learning_last_date_id'
    )

    df_group_plus_minus_number = df_group_plus_minus_number.select(
        'student_id',
        f.struct('lo_plus_number', 'learning_object_id',
                 'learning_last_date_id').alias('plus_number_pair'),
        f.struct('lo_minus_number', 'learning_object_id',
                 'learning_last_date_id').alias('minus_number_pair'))

    df_group_l2 = df_group_plus_minus_number.groupby('student_id').agg(
        f.collect_list('plus_number_pair').alias('plus_number_pair_list'),
        f.collect_list('minus_number_pair').alias('minus_number_pair_list'))

    print('df_group_l2')
    df_group_l2.printSchema()
    df_group_l2.show(2)

    df_group_l2 = df_group_l2.withColumn('right_list', getBestWords(df_group_l2.plus_number_pair_list))\
            .withColumn('wrong_list', getWorstWords(df_group_l2.minus_number_pair_list))

    print('df_group_l2---')
    df_group_l2.printSchema()
    df_group_l2.show(1)

    df_group_l2_right = df_group_l2.select(
        'student_id',
        f.explode('right_list').alias('str_right_item'))
    df_group_l2_wrong = df_group_l2.select(
        'student_id',
        f.explode('wrong_list').alias('str_wrong_item'))

    df_group_l2_right = df_group_l2_right.select(
        'student_id',
        f.col('str_right_item').getItem("lo_plus_number").alias(
            "learning_object_number"),
        f.col('str_right_item').getItem("learning_object_id").alias(
            "learning_object_id"),
        f.col('str_right_item').getItem("learning_last_date_id").alias(
            "learning_last_date_id"),
        f.lit(1).cast('long').alias("number_type"))

    df_group_l2_right = df_group_l2_right.filter(
        df_group_l2_right.learning_object_number.isNotNull())

    df_group_l2_wrong = df_group_l2_wrong.select(
        'student_id',
        f.col('str_wrong_item').getItem("lo_minus_number").alias(
            "learning_object_number"),
        f.col('str_wrong_item').getItem("learning_object_id").alias(
            "learning_object_id"),
        f.col('str_wrong_item').getItem("learning_last_date_id").alias(
            "learning_last_date_id"),
        f.lit(-1).cast('long').alias("number_type"))

    df_group_l2_wrong = df_group_l2_wrong.filter(
        (df_group_l2_wrong.learning_object_number.isNotNull())
        & (df_group_l2_wrong.learning_object_number != 0))
    print('df_group_l2_right')
    df_group_l2_right.printSchema()
    df_group_l2_right.show(2)

    print('df_group_l2_wrong')
    df_group_l2_wrong.printSchema()
    df_group_l2_wrong.show(2)

    total_plus_minus = df_group_l2_right.union(df_group_l2_wrong)

    #add created_date_id
    total_plus_minus = total_plus_minus.withColumn(
        'created_date_ids', udf_get_date_list(f.lit(yesterday)))
    total_plus_minus = total_plus_minus\
        .select(
            'student_id',
            'learning_object_number',
            'learning_object_id',
            'learning_last_date_id',
            'number_type',
            f.explode('created_date_ids').alias('created_date_id')
        )

    print('total_plus_minus')
    total_plus_minus.printSchema()

    dyf_total_plus_minus = DynamicFrame.fromDF(total_plus_minus, glueContext,
                                               'dyf_total_plus_minus')

    clear_before_saving = 'DELETE student_phonetic_number_history where created_date_id >= ' + str(
        yesterday_id)

    datasink4 = glueContext.write_dynamic_frame.from_jdbc_conf(
        frame=dyf_total_plus_minus,
        catalog_connection="glue_redshift",
        connection_options={
            "preactions": clear_before_saving,
            "dbtable": "student_phonetic_number_history",
            "database": "dts_odin"
        },
        redshift_tmp_dir=
        "s3://dts-odin/temp/nvn/knowledge/student_phonetic_number_history/v4",
        transformation_ctx="datasink4")

    df_mapping_lo_student_history.unpersist()
Exemplo n.º 36
0
def back_kup_h2472_rating_answer():
    dyf_jh2472_rating_answer = glueContext \
        .create_dynamic_frame.from_catalog(database="do_h2472",
                                           table_name="rating_answer")

    if is_dev:
        print('dyf_jh2472_rating_answer')
        dyf_jh2472_rating_answer.printSchema()
        dyf_jh2472_rating_answer.show(3)

    # return

    # root
    # | -- id: string
    # | -- rating: float
    # | -- rating_date: string
    # | -- rating_user: string
    # | -- answer_id: string
    # | -- _key: string
    # | -- _table: string
    # | -- _schema: string

    dyf_jh2472_rating_answer = dyf_jh2472_rating_answer.resolveChoice(
        specs=[('id', 'cast:long'), ('rating', 'cast:double')])
    #
    dyf_jh2472_rating_answer = Filter.apply(frame=dyf_jh2472_rating_answer,
                                            f=lambda x: x["id"] > 26139)

    df_jh2472_rating_answer = dyf_jh2472_rating_answer.toDF()
    df_jh2472_rating_answer = df_jh2472_rating_answer.dropDuplicates(['id'])
    dyf_jh2472_rating_answer = DynamicFrame.fromDF(df_jh2472_rating_answer,
                                                   glueContext,
                                                   'dyf_jh2472_rating_answer')

    # #
    applymapping1 = ApplyMapping.apply(
        frame=dyf_jh2472_rating_answer,
        mappings=[("id", 'long', 'id', 'long'),
                  ("rating", "double", "rating", "double"),
                  ("rating_date", "string", "rating_date", "timestamp"),
                  ("rating_user", "string", "rating_user", "string"),
                  ("answer_id", 'string', 'answer_id', 'long')])
    # # #
    resolvechoice1 = ResolveChoice.apply(frame=applymapping1,
                                         choice="make_cols",
                                         transformation_ctx="resolvechoice1")
    #
    #
    if is_dev:
        print('resolvechoice1')
        resolvechoice1.printSchema()
        resolvechoice1.show(3)

    # #
    # #
    datasink5 = glueContext.write_dynamic_frame.from_jdbc_conf(
        frame=resolvechoice1,
        catalog_connection="h2474_backup",
        connection_options={
            "dbtable": "rating_answer",
            "database": "topicaH2472"
        },
        redshift_tmp_dir="s3a://dts-odin/topicaH2472/rating_answer",
        transformation_ctx="datasink5")
Exemplo n.º 37
0
trimmedLEOriginRequestLogs = DropFields.apply(frame = labdaEdgeOriginRequestLogs, paths=["executionregion", "distributionid", "distributionname", "requestdata", "customtraceid", "eventtype", "year", "month", "date", "hour"], transformation_ctx ="trimmedLEOriginRequestLogs")

## Rename the requestid field for Lambda@Edge origin request logs to origin requestid
modifiedLEOriginRequestLogs = RenameField.apply(frame = trimmedLEOriginRequestLogs, old_name = "requestid", new_name = "origin_requestid", transformation_ctx ="modifiedLEOriginRequestLogs" )

## Convert to DataFrame
modifiedLEOriginRequestLogsDF = modifiedLEOriginRequestLogs.toDF()

## Convert to DataFrame
modifiedLEViewerRequestLogsDF = modifiedLEViewerRequestLogs.toDF()

## Join(left outer join) the Lambda@Edge viewer-request logs with the origin-request logs based on the requestid
combinedLambdaEdgeLogsDF = modifiedLEViewerRequestLogsDF.join(modifiedLEOriginRequestLogsDF, modifiedLEViewerRequestLogsDF["requestid"] == modifiedLEOriginRequestLogsDF["origin_requestid"], "left_outer")

## Convert to DynamicFrame
combinedLambdaEdgeLogs = DynamicFrame.fromDF(combinedLambdaEdgeLogsDF, glueContext, "combinedLambdaEdgeLogs")

## Join the Lambda@Edge viewer-request logs with the origin-request logs based on the requestid
#combinedLambdaEdgeLogs = Join.apply(modifiedLEViewerRequestLogs, modifiedLEOriginRequestLogs, 'requestid', 'origin_requestid')

## Drop the origin_requestid field
lambdaEdgeLogs = DropFields.apply(frame = combinedLambdaEdgeLogs, paths=["origin_requestid"], transformation_ctx ="lambdaEdgeLogs")

## Drop the "year", "month", "date", "hour" fields
trimmedLambdaEdgeLogs = DropFields.apply(frame =lambdaEdgeLogs, paths=["year", "month", "date", "hour", "useragentstring"], transformation_ctx ="trimmedLambdaEdgeLogs")

## Convert to DataFrame
trimmedLambdaEdgeLogsDF = trimmedLambdaEdgeLogs.toDF()

# Destination S3 location for the combined Lambda@Edge logs
leLogDestPath = "s3://" + args['target_s3_bucket'] + "/combined/lelogs"
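# A hedged sketch of how the combined logs above might then be written out to
# leLogDestPath; the Parquet format choice and the frame name are assumptions.
combinedLogs = DynamicFrame.fromDF(trimmedLambdaEdgeLogsDF, glueContext, "combinedLogs")
glueContext.write_dynamic_frame.from_options(
    frame=combinedLogs,
    connection_type="s3",
    connection_options={"path": leLogDestPath},
    format="parquet")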