import argparse
import sys

from awsglue.context import GlueContext


def main():
    # arguments
    from_s3 = 'from-s3'
    from_jdbc = 'from-jdbc'
    parser = argparse.ArgumentParser(prog=sys.argv[0])
    parser.add_argument('-m', '--mode', required=True, choices=[from_s3, from_jdbc],
                        help='Choose to migrate metastore either from JDBC or from S3')
    parser.add_argument('-c', '--connection-name', required=False,
                        help='Glue Connection name for Hive metastore JDBC connection')
    parser.add_argument('-R', '--region', required=False,
                        help='AWS region of target Glue DataCatalog, defaults to "us-east-1"')
    parser.add_argument('-d', '--database-prefix', required=False,
                        help='Optional prefix for database names in Glue DataCatalog')
    parser.add_argument('-t', '--table-prefix', required=False,
                        help='Optional prefix for table names in Glue DataCatalog')
    parser.add_argument('-D', '--database-input-path', required=False,
                        help='An S3 path containing json files of metastore database entities')
    parser.add_argument('-T', '--table-input-path', required=False,
                        help='An S3 path containing json files of metastore table entities')
    parser.add_argument('-P', '--partition-input-path', required=False,
                        help='An S3 path containing json files of metastore partition entities')

    options = get_options(parser, sys.argv)
    if options['mode'] == from_s3:
        validate_options_in_mode(
            options=options, mode=from_s3,
            required_options=['database_input_path', 'table_input_path', 'partition_input_path'],
            not_allowed_options=['database_prefix', 'table_prefix']
        )
    elif options['mode'] == from_jdbc:
        validate_options_in_mode(
            options=options, mode=from_jdbc,
            required_options=['connection_name'],
            not_allowed_options=['database_input_path', 'table_input_path', 'partition_input_path']
        )
    else:
        raise AssertionError('unknown mode ' + options['mode'])

    validate_aws_regions(options['region'])

    # spark env
    (conf, sc, sql_context) = get_spark_env()
    glue_context = GlueContext(sc)

    # launch job
    if options['mode'] == from_s3:
        metastore_import_from_s3(
            sql_context=sql_context,
            glue_context=glue_context,
            db_input_dir=options['database_input_path'],
            tbl_input_dir=options['table_input_path'],
            parts_input_dir=options['partition_input_path'],
            datacatalog_name='datacatalog',
            region=options.get('region') or 'us-east-1'
        )
    elif options['mode'] == from_jdbc:
        # extract the JDBC configuration once and reuse it below
        connection = glue_context.extract_jdbc_conf(options['connection_name'])
        metastore_full_migration(
            sc=sc,
            sql_context=sql_context,
            glue_context=glue_context,
            connection=connection,
            db_prefix=options.get('database_prefix') or '',
            table_prefix=options.get('table_prefix') or '',
            datacatalog_name='datacatalog',
            region=options.get('region') or 'us-east-1'
        )


if __name__ == '__main__':
    main()
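# The helpers referenced above (get_options, validate_options_in_mode,
# validate_aws_regions, get_spark_env, and the two migration entry points)
# are presumably defined elsewhere in the migration toolkit. A minimal,
# hypothetical sketch of the two option helpers, assuming options are
# exposed as a plain dict keyed by the argparse dest names:
def get_options(parser, argv):
    # parse the CLI arguments and return them as a dict (hypothetical helper)
    return vars(parser.parse_args(argv[1:]))


def validate_options_in_mode(options, mode, required_options, not_allowed_options):
    # every required option must be present, every forbidden one absent
    for name in required_options:
        if options.get(name) is None:
            raise AssertionError('Option %s is required for mode %s' % (name, mode))
    for name in not_allowed_options:
        if options.get(name) is not None:
            raise AssertionError('Option %s is not allowed for mode %s' % (name, mode))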
import sys

from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
import boto3

target_format = "parquet"

## @params: [JOB_NAME]
args = getResolvedOptions(
    sys.argv,
    ['JOB_NAME', 'DL_BUCKET', 'DL_PREFIX', 'DL_REGION', 'GLUE_SRC_DATABASE'])

sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)

dataLakeBucket = args["DL_BUCKET"]
dataLakePrefix = args["DL_PREFIX"]
aws_region = args["DL_REGION"]
glue_database = args["GLUE_SRC_DATABASE"]
job.init(args['JOB_NAME'], args)

client = boto3.client(service_name='glue', region_name=aws_region)
responseGetTables = client.get_tables(DatabaseName=glue_database)
tableList = responseGetTables['TableList']
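# The script presumably iterates over tableList next, converting each catalog
# table to Parquet under the data-lake prefix. A minimal sketch of that loop,
# assuming table names map one-to-one onto output folders (the layout here is
# illustrative, not taken from the original):
for table in tableList:
    table_name = table['Name']
    dyf = glueContext.create_dynamic_frame.from_catalog(
        database=glue_database, table_name=table_name)
    output_path = "s3://{}/{}/{}".format(dataLakeBucket, dataLakePrefix, table_name)
    glueContext.write_dynamic_frame.from_options(
        frame=dyf,
        connection_type="s3",
        connection_options={"path": output_path},
        format=target_format)
job.commit()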
import sys
from datetime import datetime, timedelta

from pyspark.sql.functions import col, when, input_file_name
from pyspark.sql.utils import AnalysisException
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
from awsglue.dynamicframe import DynamicFrame

sc = SparkContext()
glue_context = GlueContext(sc)
spark = glue_context.spark_session
logger = glue_context.get_logger()
job = Job(glue_context)

args = getResolvedOptions(sys.argv, [
    'JOB_NAME', 'aws_region', 'zone', 'source_bucket', 'LOG_LEVEL',
    'backup_bucket', 'source_prefixes', 'partition_key_1', 'partition_key_2',
    'partition_value_1', 'file_type', 'expiry_days', 's3_backup_enabled',
    'scramble_attribute_name', 'scramble_attribute_value_list',
    'part_date_list_str', 'job_run_id'
])
job.init(args['JOB_NAME'], args)
log_level = args['LOG_LEVEL']
# import pyspark modules
from pyspark.context import SparkContext
import pyspark.sql.functions as f
from datetime import datetime

# import glue modules
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from awsglue.context import GlueContext
from awsglue.dynamicframe import DynamicFrame
from awsglue.job import Job

# initialize contexts and session
spark_context = SparkContext.getOrCreate()
glue_context = GlueContext(spark_context)
session = glue_context.spark_session

# parameters
glue_db = "imdb"
glue_tbl = "input"
s3_write_path = "s3://demo-glue-pavan/output"

############################################
# Extract (read data)
############################################

# log stream start time
dt_start = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
print("start time:", dt_start)
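# The extract step this header announces would read the catalog table into a
# DynamicFrame and convert it for Spark transforms. A minimal sketch using the
# parameters above (variable names are illustrative):
dyf = glue_context.create_dynamic_frame.from_catalog(
    database=glue_db, table_name=glue_tbl)
df = dyf.toDF()
print("record count:", df.count())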
import sys

from pyspark.context import SparkContext
from pyspark.sql import functions as f
from pyspark.sql.functions import from_unixtime, unix_timestamp
from awsglue.context import GlueContext
from awsglue.dynamicframe import DynamicFrame
from awsglue.transforms import *


def main():
    sc = SparkContext()
    glueContext = GlueContext(sc)
    spark = glueContext.spark_session
    # job = Job(glueContext)
    # job.init(args['JOB_NAME'], args)
    spark.conf.set("spark.sql.session.timeZone", "GMT+07:00")

    dyf_care_call = glueContext.create_dynamic_frame.from_catalog(
        database='tig_advisor', table_name='care_call')
    dyf_care_call = dyf_care_call.resolveChoice(specs=[('_key', 'cast:long')])

    # print schema and select fields
    print('original schema')
    dyf_care_call.printSchema()
    dyf_care_call.show(10)

    # try:
    #     df_flag = spark.read.parquet("s3a://dts-odin/flag/student_status/temp_ls_a1_dong_tien_tc.parquet")
    #     read_from_index = df_flag.collect()[0]['flag']
    #     print('read from index: ', read_from_index)
    #     dyf_care_call = Filter.apply(frame=dyf_care_call,
    #                                  f=lambda x: x["_key"] > read_from_index)
    # except:
    #     print('read flag file error ')
    # print('the number of new contacts: ', dyf_care_call.count())

    dyf_care_call = dyf_care_call.select_fields(
        ['_key', 'id', 'phone', 'duration', 'call_status', 'time_created']
    ).rename_field('time_created', 'call_date')

    dy_source_care_call_cache = dyf_care_call.toDF()
    dy_source_care_call_cache = dy_source_care_call_cache.dropDuplicates(['id'])
    dy_source_care_call_cache = dy_source_care_call_cache.cache()
    dyf_care_call = DynamicFrame.fromDF(dy_source_care_call_cache, glueContext,
                                        'dyf_care_call')

    # if (dyf_care_call.count() > 0):
    # keep only successful calls longer than 30 seconds with a valid phone and date
    dyf_care_call = Filter.apply(
        frame=dyf_care_call,
        f=lambda x: x["phone"] is not None and x["phone"] != ''
                    and (x["call_status"] == 'success' or x["call_status"] == 'call_success')
                    and x["call_date"] is not None and x["call_date"] != ''
                    and x["duration"] is not None and x["duration"] > 30)

    # print('dyf_care_call::correct')
    print('dyf_care_call number', dyf_care_call.count())

    if dyf_care_call.count() > 0:
        dyf_ad_contact_phone = glueContext.create_dynamic_frame.from_catalog(
            database='tig_advisor', table_name='student_contact_phone')
        dyf_ad_contact_phone = dyf_ad_contact_phone.select_fields(
            ['phone', 'contact_id'])
        dyf_ad_contact_phone = Filter.apply(
            frame=dyf_ad_contact_phone,
            f=lambda x: x["phone"] is not None and x["phone"] != ''
                        and x["contact_id"] is not None and x["contact_id"] != '')

        print('dyf_ad_contact_phone::schema')
        dyf_ad_contact_phone.printSchema()

        # dyf_advisor_ip_phone = glueContext.create_dynamic_frame.from_catalog(database='callcenter',
        #                                                                      table_name='advisor_ip_phone')
        # dyf_advisor_ip_phone = Filter.apply(frame=dyf_advisor_ip_phone,
        #                                     f=lambda x: x["ip_phone"] is not None and x["ip_phone"] != '')

        # ------------------------------------------------------------------ #
        join_call_contact = Join.apply(dyf_care_call, dyf_ad_contact_phone,
                                       'phone', 'phone')
        # join_call_contact = join_call_contact.select_fields(['id_time', 'answertime', 'calldate',
        #     'phonenumber_correct', 'calldate', 'ipphone', 'contact_id'])

        print('join_call_contact::schema------------')
        join_call_contact.printSchema()
        join_call_contact.show(2)
        print('join: ', join_call_contact.count())

        # ------------------------------------------------------------------ #
        # payment history (lich su dong tien)
        dyf_source_ls_dong_tien = glueContext.create_dynamic_frame.from_catalog(
            database='poss', table_name='nvn_poss_lich_su_dong_tien')
        dyf_source_ls_dong_tien = Filter.apply(
            frame=dyf_source_ls_dong_tien,
            f=lambda x: x["contact_id"] is not None and x["contact_id"] != ''
                        and x["ngay_thanhtoan"] is not None and x["ngay_thanhtoan"] != '')
        dyf_source_ls_dong_tien = dyf_source_ls_dong_tien.select_fields([
            '_key', 'id', 'contact_id', 'ngay_thanhtoan', 'ngay_tao', 'makh'
        ]).rename_field('ngay_tao', 'ngay_a0')

        dy_source_ls_dt_cache = dyf_source_ls_dong_tien.toDF()
        dy_source_ls_dt_cache = dy_source_ls_dt_cache.dropDuplicates(['id'])
        dy_source_ls_dt_cache = dy_source_ls_dt_cache.cache()
        dyf_source_ls_dong_tien = DynamicFrame.fromDF(
            dy_source_ls_dt_cache, glueContext, 'dyf_source_ls_dong_tien')

        join_call_contact_ao = Join.apply(join_call_contact, dyf_source_ls_dong_tien,
                                          'contact_id', 'contact_id')

        print('join_call_contact_ao::schema------------')
        join_call_contact_ao.printSchema()
        join_call_contact_ao.show(2)
        print('join: ', join_call_contact_ao.count())

        # join_call_contact_ao = join_call_contact_ao.resolveChoice(specs=[('calldate', 'cast:timestamp'),
        #                                                                  ('ngay_a0', 'cast:timestamp')])

        join_call_contact_ao = Filter.apply(
            frame=join_call_contact_ao,
            f=lambda x: x["call_date"] is not None and x["ngay_a0"] is not None
                        and x["call_date"] > x["ngay_a0"])

        print('join_call_contact_ao::after filter calldate > ngay_a0------------')
        # join_call_contact_ao.printSchema()
        join_call_contact_ao.show(2)
        print('join_call_contact_ao: ', join_call_contact_ao.count())

        # get the first successful welcome call per contact
        df_join_call_contact_ao = join_call_contact_ao.toDF()
        df_join_call_contact_ao = df_join_call_contact_ao.groupby(
            'contact_id', 'makh').agg(f.min('call_date').alias("ngay_a1"))
        df_join_call_contact_ao = df_join_call_contact_ao.withColumn(
            'id_time',
            from_unixtime(
                unix_timestamp(df_join_call_contact_ao.ngay_a1, "yyyy-MM-dd HH:mm:ss"),
                "yyyyMMdd"))
        dyf_result = DynamicFrame.fromDF(df_join_call_contact_ao, glueContext,
                                         'dyf_result')

        print('dyf_result------------')
        # join_call_contact_ao.printSchema()
        dyf_result.show(2)
        print('dyf_result: ', dyf_result.count())

        # select fields
        applymapping1 = ApplyMapping.apply(
            frame=dyf_result,
            mappings=[("contact_id", "string", "contact_id", "string"),
                      ("id_time", "string", "id_time", "bigint"),
                      ("makh", "int", "makh", "int"),
                      ("ngay_a1", "string", "ngay_a1", "timestamp")])

        resolvechoice2 = ResolveChoice.apply(
            frame=applymapping1, choice="make_cols",
            transformation_ctx="resolvechoice2")
        dropnullfields3 = DropNullFields.apply(
            frame=resolvechoice2, transformation_ctx="dropnullfields3")

        # print('dropnullfields3::printSchema')
        # dropnullfields3.printSchema()
        # dropnullfields3.show(2)

        # write data to Redshift
        datasink4 = glueContext.write_dynamic_frame.from_jdbc_conf(
            frame=dropnullfields3,
            catalog_connection="glue_redshift",
            connection_options={
                "dbtable": "temp_ls_dong_tien_a1_v3",
                "database": "dts_odin",
                "postactions": """
                    INSERT INTO mapping_changed_status_student(description, user_id, change_status_date_id, to_status_id, timestamp1)
                    SELECT 'contact_id: ' + temp_a1.contact_id + ' - makh: ' + temp_a1.makh,
                           um.user_id, temp_a1.id_time, 2, temp_a1.ngay_a1
                    FROM temp_ls_dong_tien_a1_v3 temp_a1
                    LEFT JOIN user_map um ON um.source_type = 1 AND um.source_id = temp_a1.contact_id;
                    DROP TABLE IF EXISTS public.temp_ls_dong_tien_a1_v3;
                    CALL update_a1_exception_from_eg()
                """
            },
            redshift_tmp_dir="s3n://dts-odin/temp/temp_ls_dong_tien/v2",
            transformation_ctx="datasink4")

        # write the new high-water mark (max _key) back to S3
        df_datasource = dyf_care_call.toDF()
        flag = df_datasource.agg({"_key": "max"}).collect()[0][0]
        flag_data = [flag]
        df = spark.createDataFrame(flag_data, "long").toDF('flag')
        df.write.parquet(
            "s3a://dts-odin/flag/student_status/temp_ls_a1_dong_tien_tc.parquet",
            mode="overwrite")

        dy_source_care_call_cache.unpersist()


if __name__ == '__main__':
    main()
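# The commented-out block near the top and the closing lines of main()
# implement an incremental "flag" checkpoint: read the last processed _key
# from a small parquet file, filter the source down to newer rows, and write
# the new max back. A minimal reusable sketch of that pattern, with the same
# single-column 'flag' layout used above (helper names are illustrative):
def read_flag(spark, path):
    # return the stored high-water mark, or None when no flag exists yet
    try:
        return spark.read.parquet(path).collect()[0]['flag']
    except Exception:
        return None


def write_flag(spark, path, value):
    # overwrite the single-row flag file with the new high-water mark
    spark.createDataFrame([value], "long").toDF('flag') \
        .write.parquet(path, mode="overwrite")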
import sys

from pyspark.context import SparkContext
from pyspark.sql import functions as f
from pyspark.sql.functions import from_unixtime, unix_timestamp
from awsglue.context import GlueContext
from awsglue.dynamicframe import DynamicFrame
from awsglue.transforms import *


def main():
    sc = SparkContext()
    glueContext = GlueContext(sc)
    spark = glueContext.spark_session
    spark.conf.set("spark.sql.session.timeZone", "GMT+07:00")

    # get dynamic frame source
    # ------------------------------------------------------------------ #
    dyf_native_talk = glueContext.create_dynamic_frame.from_catalog(
        database='native_talk', table_name='native_talk_history_log_api')
    dyf_native_talk = dyf_native_talk.resolveChoice(specs=[('id', 'cast:long')])

    try:
        df_flag = spark.read.parquet(
            "s3a://dts-odin/flag/student_status/tu_hoc/tu_hoc_native_talk.parquet")
        read_from_index = df_flag.collect()[0]['flag']
        print('read from index: ', read_from_index)
        dyf_native_talk = Filter.apply(frame=dyf_native_talk,
                                       f=lambda x: x["id"] > read_from_index)
    except:
        print('read flag file error ')

    dyf_native_talk = dyf_native_talk.select_fields(
        ['id', 'learning_date', 'speaking_dialog_score', 'username', 'updated_time'])

    dy_cache = dyf_native_talk.toDF()
    dy_cache = dy_cache.cache()
    dyf_native_talk = DynamicFrame.fromDF(dy_cache, glueContext, 'dyf_native_talk')

    print('dy_cache------------')
    dy_cache.printSchema()
    print('dy_cache: ', dy_cache.count())
    dy_cache.show(2)

    # ------------------------------------------------------------------ #
    if dyf_native_talk.count() > 0:
        # --------------------------- datasource0 ----------------------- #
        dyf_native_talk = Filter.apply(
            frame=dyf_native_talk,
            f=lambda x: x["username"] is not None and x["username"] != ''
                        and x["speaking_dialog_score"] is not None
                        and x["learning_date"] is not None and x["learning_date"] != '')

        # --------------------------- datasource1 ----------------------- #
        if dyf_native_talk.count() > 0:
            dyf_nt_account_mapping = glueContext.create_dynamic_frame.from_catalog(
                database='native_talk', table_name='native_talk_account_mapping')
            dyf_nt_account_mapping = dyf_nt_account_mapping.select_fields(
                ['contact_id', 'username']).rename_field('username', 'nativetalk_user')

            dy_cache_2 = dyf_nt_account_mapping.toDF()
            dy_cache_2 = dy_cache_2.cache()
            dyf_nt_account_mapping = DynamicFrame.fromDF(dy_cache_2, glueContext,
                                                         'dyf_nt_account_mapping')
            dyf_nt_account_mapping = Filter.apply(
                frame=dyf_nt_account_mapping,
                f=lambda x: x["nativetalk_user"] is not None and x["nativetalk_user"] != '')
            # ----------------------- datasource1 ----------------------- #

            join = Join.apply(dyf_native_talk, dyf_nt_account_mapping,
                              'username', 'nativetalk_user')
            if join.count() > 0:
                df_nativetalk = join.toDF()
                # 5 minutes expressed in hours
                df_nativetalk = df_nativetalk.withColumn('sogio', f.lit(0.083333))
                df_nativetalk = df_nativetalk.withColumn(
                    'id_time',
                    from_unixtime(unix_timestamp(df_nativetalk.learning_date, "yyyy-MM-dd"),
                                  "yyyyMMdd"))
                df_nativetalk = df_nativetalk.where("contact_id IS NOT NULL")

                data_nativetalk = DynamicFrame.fromDF(df_nativetalk, glueContext,
                                                      'data_nativetalk')
                data_nativetalk = data_nativetalk.resolveChoice(
                    specs=[('sogio', 'cast:float')])

                print('data_nativetalk----------')
                data_nativetalk.printSchema()

                # build the "fact_hieusuathoctap" table: number of study sessions
                # and study hours per student per day (id_time)
                df_hieusuathoctap = data_nativetalk.toDF()
                df_hieusuathoctap = df_hieusuathoctap.groupby('contact_id', 'id_time').agg(
                    f.sum('sogio'), f.count('contact_id'))
                df_hieusuathoctap = df_hieusuathoctap.withColumn('tu_hoc_type_id', f.lit(400))

                data_hieusuathoctap = DynamicFrame.fromDF(df_hieusuathoctap, glueContext,
                                                          'data_hieusuathoctap')
                data_hieusuathoctap = data_hieusuathoctap.resolveChoice(
                    specs=[('sum(sogio)', 'cast:double')])

                print('data_hieusuathoctap------------------------------------------')
                data_hieusuathoctap.printSchema()

                applymapping2 = ApplyMapping.apply(
                    frame=data_hieusuathoctap,
                    mappings=[("contact_id", "string", "contact_id", "string"),
                              ("id_time", 'string', 'id_time', 'bigint'),
                              ("count(contact_id)", 'long', 'soca', 'int'),
                              ("sum(sogio)", 'double', 'sogio', 'double'),
                              ("tu_hoc_type_id", 'int', "tu_hoc_type_id", "int")])
                resolvechoice2 = ResolveChoice.apply(frame=applymapping2, choice="make_cols",
                                                     transformation_ctx="resolvechoice2")
                dropnullfields2 = DropNullFields.apply(frame=resolvechoice2,
                                                       transformation_ctx="dropnullfields2")
                print('dropnullfields2 number: ', dropnullfields2.count())

                datasink2 = glueContext.write_dynamic_frame.from_jdbc_conf(
                    frame=dropnullfields2,
                    catalog_connection="glue_redshift",
                    connection_options={
                        "dbtable": "temp_staging_lich_su_tu_hoc_native_talk",
                        "database": "dts_odin",
                        "postactions": """INSERT INTO mapping_changed_status_student(user_id, change_status_date_id, to_status_id, measure1, measure2)
                                          SELECT um.user_id, hwb.id_time, 53, hwb.soca, round(hwb.sogio, 4)
                                          FROM temp_staging_lich_su_tu_hoc_native_talk hwb
                                          LEFT JOIN user_map um ON um.source_type = 1 AND um.source_id = hwb.contact_id;
                                          DROP TABLE IF EXISTS public.temp_staging_lich_su_tu_hoc_native_talk"""
                    },
                    redshift_tmp_dir="s3n://dts-odin/temp/tu-hoc/hwb/",
                    transformation_ctx="datasink2")

                # write the new high-water mark (max id) back to S3
                df_datasource = dyf_native_talk.toDF()
                flag = df_datasource.agg({"id": "max"}).collect()[0][0]
                print('flag: ', flag)
                flag_data = [flag]
                df = spark.createDataFrame(flag_data, "long").toDF('flag')
                df.write.parquet(
                    "s3a://dts-odin/flag/student_status/tu_hoc/tu_hoc_native_talk.parquet",
                    mode="overwrite")
                dy_cache.unpersist()
                dy_cache_2.unpersist()


if __name__ == '__main__':
    main()
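# Note: the mapping above relies on Spark's generated aggregate column names
# ("sum(sogio)", "count(contact_id)"), which is fragile. Aliasing the
# aggregates at the groupby site keeps the downstream ApplyMapping stable;
# a sketch (the mappings would then reference 'sogio' and 'soca' directly):
df_hieusuathoctap = df_hieusuathoctap.groupby('contact_id', 'id_time').agg(
    f.sum('sogio').alias('sogio'),
    f.count('contact_id').alias('soca'))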
JSON_FILE_NAME = TARGET_TABLE + "_" + SOURCE_SYSTEM    # ar_invc_hdr_f_must
STAGE_TABLE = ARGS['rs_stage_table']                   # ar_invc_hdr_f_stage_must
CTLG_CONNECTION = ARGS['glue_conn']                    # TestRedshift3
REDSHIFTDB = ARGS['rs_db']                             # usinnovationredshift
S3_BUCKET = ARGS['bkt_name']                           # "odp-us-innovation-raw"
MD5_COLUMN_SCD1 = TARGET_TABLE + "_md5_scd1"           # ar_invc_hdr_f_md5_scd1
TARGET_TABLE_COLUMNS = ARGS['target_cols']             # as per DDL (col1,col2,col3)
STAGE_TABLE_COLUMNS = ARGS['stage_cols']               # as per DDL (col1,col2,col3)
DBTABLE_STG = STAGE_DATABASE_NAME + "." + STAGE_TABLE
URL = ARGS["jdbc_url"]
IAM_ROLE = ARGS["iam_role"]

SC = SparkContext()
GLUECONTEXT = GlueContext(SC)
SPARK = GLUECONTEXT.spark_session
JOB = Job(GLUECONTEXT)
JOB.init(ARGS['JOB_NAME'], ARGS)
RUN_ID = ARGS['JOB_RUN_ID']
JOB_NAME = ARGS['JOB_NAME']
TEMPDIR = ARGS['TempDir']
SRC_NOTEMPTY = True

try:
    # @type: DataSource
    # @args: [database = "db_mrr_must",
    #         table_name = "billing",
    #         transformation_ctx = "billing_df"]
    # @return: DynamicFrame
    # @inputs: []
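    # The annotation above suggests the try block continues by reading the
    # billing table from the catalog. A plausible continuation, assuming the
    # names given in the annotation (db_mrr_must / billing / billing_df) and
    # that SRC_NOTEMPTY tracks whether the source returned rows:
    billing_df = GLUECONTEXT.create_dynamic_frame.from_catalog(
        database="db_mrr_must",
        table_name="billing",
        transformation_ctx="billing_df")
    SRC_NOTEMPTY = billing_df.count() > 0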
import boto3
from pyspark.context import SparkContext
from pyspark.sql import functions as f
from pyspark.sql.functions import udf, unix_timestamp, from_unixtime
from pyspark.sql.types import LongType, IntegerType
from awsglue.context import GlueContext
from awsglue.dynamicframe import DynamicFrame
from awsglue.transforms import *


def main():
    glueContext = GlueContext(SparkContext.getOrCreate())
    spark = glueContext.spark_session

    student_id_unavailable = 0
    package_endtime_unavailable = 99999999999
    package_starttime_unavailable = 0

    def do_check_endtime(val):
        if val is not None:
            return val
        return 4094592235

    check_endtime_null = udf(do_check_endtime, LongType())

    # ---------- dyf_student_contact ----------
    dyf_student_contact = glueContext.create_dynamic_frame.from_catalog(
        database="tig_advisor", table_name="student_contact")
    dyf_student_contact = dyf_student_contact.select_fields([
        '_key', 'contact_id', 'student_id', 'advisor_id', 'level_study',
        'time_lms_created', 'time_created'
    ])
    dyf_student_contact = dyf_student_contact.resolveChoice(
        specs=[('time_lms_created', 'cast:long')])
    dyf_student_contact = Filter.apply(
        frame=dyf_student_contact,
        f=lambda x: x['student_id'] is not None and x['contact_id'] is not None
                    and x['time_lms_created'] is not None)

    dyf_student_level = glueContext.create_dynamic_frame.from_catalog(
        database="tig_advisor", table_name="log_student_level_study")
    dyf_student_level = dyf_student_level.select_fields(
        ['_key', 'contact_id', 'level_current', 'level_modified', 'time_created']) \
        .rename_field('contact_id', 'contact_id1') \
        .rename_field('time_created', 'time_level_created')

    # try:
    #     df_flag = spark.read.parquet("s3://dtsodin/flag/flag_log_student_level.parquet")
    #     max_key = df_flag.collect()[0]['flag']
    #     print("max_key: ", max_key)
    #     # only keep records with _key greater than the saved max_key; don't load everything
    #     dyf_student_level = Filter.apply(frame=dyf_student_level, f=lambda x: x["_key"] > max_key)
    # except:
    #     print('read flag file error ')

    if dyf_student_level.count() > 0:
        dyf_mapping_class_lms = glueContext.create_dynamic_frame.from_catalog(
            database="nvn_knowledge", table_name="mapping_class_lms")
        dyf_mapping_class_lms = dyf_mapping_class_lms.select_fields(
            ['level_lms', 'level_list_master'])

        dyf_learning_object_class = glueContext.create_dynamic_frame.from_catalog(
            database="nvn_knowledge", table_name="learning_object_class")
        dyf_learning_object_class = dyf_learning_object_class.select_fields(
            ['class_id', 'class_name'])

        dyf_student_contact_level = Join.apply(dyf_student_contact, dyf_student_level,
                                               'contact_id', 'contact_id1')
        df_student_contact_level = dyf_student_contact_level.toDF()
        df_student_contact_level = df_student_contact_level \
            .select('student_id', 'level_current', 'level_modified',
                    'time_created', 'time_level_created')

        # cast before aliasing so the column keeps the name "count"
        df_count_contact_level = df_student_contact_level \
            .groupby('student_id') \
            .agg(f.count('time_level_created').cast('long').alias("count"))
        dyf_count_contact_level = DynamicFrame.fromDF(
            df_count_contact_level, glueContext, "dyf_count_contact_level")
        dyf_count_contact_level = Filter.apply(frame=dyf_count_contact_level,
                                               f=lambda x: x['count'] == 2)
        if dyf_count_contact_level.count() > 0:
            df_count_contact_level = dyf_count_contact_level.toDF()
            print(df_count_contact_level.count())
            df_count_contact_level = df_count_contact_level.select('student_id')
            list_filer = [list(row) for row in df_count_contact_level.collect()]
        else:
            list_filer = ['']

        dyf_student_contact_level = DynamicFrame.fromDF(
            df_student_contact_level, glueContext, "dyf_student_contact_level")
        dyf_student_contact_level = Filter.apply(
            frame=dyf_student_contact_level,
            f=lambda x: x['student_id'] not in list_filer)  # or x['student_id'] == '29439'
        dyf_student_contact_level.count()
        dyf_student_contact_level.printSchema()

        df_student_contact_level = dyf_student_contact_level.toDF()
        df_contact_level_first = df_student_contact_level.groupby('student_id').agg(
            f.min("time_level_created").alias("min_time_level_created")) \
            .withColumnRenamed('student_id', 'min_student_id')
        df_contact_level_last = df_student_contact_level.groupby('student_id').agg(
            f.max("time_level_created").alias("max_time_level_created")) \
            .withColumnRenamed('student_id', 'max_student_id')

        df_contact_level_first = df_contact_level_first.join(
            df_student_contact_level,
            (df_contact_level_first.min_student_id == df_student_contact_level.student_id) &
            (df_contact_level_first.min_time_level_created == df_student_contact_level.time_level_created),
            "left")
        df_contact_level_last = df_contact_level_last.join(
            df_student_contact_level,
            (df_contact_level_last.max_student_id == df_student_contact_level.student_id) &
            (df_contact_level_last.max_time_level_created == df_student_contact_level.time_level_created),
            "left")

        df_contact_level_last = df_contact_level_last \
            .select('student_id', 'level_modified', 'time_level_created')
        df_contact_level_last = df_contact_level_last \
            .withColumnRenamed('time_level_created', 'start_date') \
            .withColumnRenamed('level_modified', 'level_study') \
            .withColumn('end_date', f.lit(4094592235).cast('long'))
        df_contact_level_last = df_contact_level_last \
            .select('student_id', 'level_study', 'start_date', 'end_date')

        df_contact_level_first = df_contact_level_first \
            .select('student_id', 'level_current', 'time_created', 'time_level_created')
        df_contact_level_first = df_contact_level_first \
            .withColumnRenamed('level_current', 'level_study') \
            .withColumnRenamed('time_created', 'start_date') \
            .withColumnRenamed('time_level_created', 'end_date')
        df_contact_level_first = df_contact_level_first \
            .select('student_id', 'level_study', 'start_date', 'end_date')

        df_contact_level_first.show()
        df_contact_level_last.show()
        print("END FIRST_LAST")

        df_student_contact_level01 = df_student_contact_level \
            .select('student_id', 'level_current', 'level_modified', 'time_level_created')
        print("df_student_contact_level")
        df_student_contact_level01.show(20)

        df_student_contact_level02 = df_student_contact_level01
        df_student_contact_level02 = df_student_contact_level02 \
            .withColumnRenamed('student_id', 'student_id_temp') \
            .withColumnRenamed('level_current', 'level_current_temp') \
            .withColumnRenamed('level_modified', 'level_modified_temp') \
            .withColumnRenamed('time_level_created', 'time_level_created_temp')
        df_student_contact_level02.show(20)

        df_join_student_contact_level = df_student_contact_level01.join(
            df_student_contact_level02,
            (df_student_contact_level01.student_id == df_student_contact_level02.student_id_temp) &
            (df_student_contact_level01.level_current == df_student_contact_level02.level_modified_temp) &
            (df_student_contact_level01.time_level_created > df_student_contact_level02.time_level_created_temp),
            "left")
        df_join_student_contact_level.show(100)

        df_join_student_contact_level = df_join_student_contact_level \
            .groupby('student_id', 'level_current', 'time_level_created_temp') \
            .agg(f.min("time_level_created").alias("time_level_created"))
        df_join_student_contact_level = df_join_student_contact_level \
            .withColumnRenamed('time_level_created_temp', 'start_date') \
            .withColumnRenamed('time_level_created', 'end_date') \
            .withColumnRenamed('level_current', 'level_study')
        df_join_student_contact_level = df_join_student_contact_level.where(
            'start_date is not null')
        df_join_student_contact_level.count()
        df_join_student_contact_level.show()

        df_union_first_and_middle_contact = df_contact_level_first.union(
            df_join_student_contact_level)
        df_union_all_contact = df_union_first_and_middle_contact.union(
            df_contact_level_last)
        print(df_union_all_contact.count())
        df_union_all_contact.printSchema()
        df_union_all_contact.show()

        df_union_all_contact = df_union_all_contact.withColumn(
            "user_id", f.lit(None).cast('long'))
        dyf_student_contact_level = DynamicFrame.fromDF(
            df_union_all_contact, glueContext, "dyf_student_contact_level")

        dyf_join_all0 = Join.apply(dyf_mapping_class_lms, dyf_student_contact_level,
                                   "level_lms", "level_study")
        dyf_join_all = Join.apply(dyf_join_all0, dyf_learning_object_class,
                                  "level_list_master", "class_name")

        df_join_all = dyf_join_all.toDF()
        print("df_join_all ", df_join_all.count())
        df_join_all = df_join_all.dropDuplicates()
        dyf_join_all = DynamicFrame.fromDF(df_join_all, glueContext, "dyf_join_all")
        print("dyf_join_all ", dyf_join_all.count())
        dyf_join_all.printSchema()
        dyf_join_all.show(10)

        applymapping = ApplyMapping.apply(
            frame=dyf_join_all,
            mappings=[("student_id", "string", "student_id", "string"),
                      ("user_id", "long", "user_id", "long"),
                      ("class_id", "long", "class_id", "long"),
                      ("start_date", "int", "start_date", "long"),
                      ("end_date", 'long', 'end_date', 'long'),
                      ("level_study", 'string', 'level_study', 'string')])
        resolvechoice = ResolveChoice.apply(
            frame=applymapping, choice="make_cols",
            transformation_ctx="resolvechoice2")
        dropnullfields = DropNullFields.apply(
            frame=resolvechoice, transformation_ctx="dropnullfields")

        datasink5 = glueContext.write_dynamic_frame.from_jdbc_conf(
            frame=dropnullfields,
            catalog_connection="glue_redshift",
            connection_options={
                "dbtable": "mapping_class_student",
                "database": "dts_odin"
            },
            redshift_tmp_dir="s3n://dtsodin/temp1/mapping_class_student/",
            transformation_ctx="datasink5")

        print('START WRITE TO S3-------------------------')
        s3 = boto3.resource('s3')
        bucket = s3.Bucket('dtsodin')
        bucket.objects.filter(Prefix="student_behavior/student_level/").delete()
        s3 = boto3.client('s3')
        bucket_name = "dtsodin"
        directory_name = "student_behavior/student_level/"  # the S3 prefix ("folder") to recreate
        s3.put_object(Bucket=bucket_name, Key=directory_name)

        log_student_level = glueContext.create_dynamic_frame.from_options(
            connection_type="redshift",
            connection_options={
                "url": "jdbc:redshift://datashine-dev.c4wxydftpsto.ap-southeast-1.redshift.amazonaws.com:5439/dts_odin",
                "user": "******",
                "password": "******",
                "dbtable": "mapping_class_student",
                "redshiftTmpDir": "s3n://dtsodin/temp1/mapping_class_student/"
            })
        datasink6 = glueContext.write_dynamic_frame.from_options(
            frame=log_student_level,
            connection_type="s3",
            connection_options={
                "path": "s3://dtsodin/student_behavior/student_level/"
            },
            format="parquet",
            transformation_ctx="datasink6")
        print('END WRITE TO S3-------------------------')

        # write the flag: take the max _key in the data source
        datasource = dyf_student_level.toDF()
        flag = datasource.agg({"_key": "max"}).collect()[0][0]
        flag_data = [flag]
        df = spark.createDataFrame(flag_data, "long").toDF('flag')
        # overwrite the _key checkpoint in S3
        df.write.parquet("s3a://dtsodin/flag/flag_log_student_level.parquet",
                         mode="overwrite")

    ##################################################################
    dyf_student_advisor = glueContext.create_dynamic_frame.from_catalog(
        database="tig_advisor", table_name="log_change_assignment_advisor")
    dyf_student_advisor = dyf_student_advisor.select_fields(
        ['_key', 'id', 'contact_id', 'advisor_id_old', 'advisor_id_new', 'created_at']) \
        .rename_field('contact_id', 'contact_id1')

    # try:
    #     df_flag = spark.read.parquet("s3://dtsodin/flag/flag_log_student_advisor.parquet")
    #     max_key = df_flag.collect()[0]['flag']
    #     print("max_key: ", max_key)
    #     # only keep records with _key greater than the saved max_key; don't load everything
    #     dyf_student_advisor = Filter.apply(frame=dyf_student_advisor, f=lambda x: x["_key"] > max_key)
    # except:
    #     print('read flag file error ')

    if dyf_student_advisor.count() > 0:
        df_student_advisor = dyf_student_advisor.toDF()
        # df_student_advisor.printSchema()
        # df_student_advisor.show()
        df_student_advisor = df_student_advisor.withColumn(
            'created_at_1',
            unix_timestamp(df_student_advisor.created_at,
                           "yyyy-MM-dd HH:mm:ss").cast(IntegerType()))

        df_student_contact = dyf_student_contact.toDF()
        df_student_contact_advisor = df_student_contact.join(
            df_student_advisor,
            df_student_contact.contact_id == df_student_advisor.contact_id1)
        dyf_student_contact_advisor = DynamicFrame.fromDF(
            df_student_contact_advisor, glueContext, "dyf_student_contact_advisor")
        dyf_student_contact_advisor = dyf_student_contact_advisor.select_fields([
            'student_id', 'advisor_id', 'advisor_id_old', 'advisor_id_new',
            'created_at_1', 'time_created'
        ])

        df_student_contact_advisor = dyf_student_contact_advisor.toDF()
        df_student_contact_advisor = df_student_contact_advisor.dropDuplicates()

        df_advisor_temp = df_student_contact_advisor
        df_advisor_temp = df_advisor_temp \
            .withColumnRenamed('student_id', 'student_id_temp') \
            .withColumnRenamed('advisor_id', 'advisor_id_temp') \
            .withColumnRenamed('advisor_id_old', 'advisor_id_old_temp') \
            .withColumnRenamed('advisor_id_new', 'advisor_id_new_temp') \
            .withColumnRenamed('created_at_1', 'created_at_temp') \
            .withColumnRenamed('time_created', 'time_created_temp')

        df_join_data = df_student_contact_advisor.join(
            df_advisor_temp,
            (df_student_contact_advisor.student_id == df_advisor_temp.student_id_temp) &
            (df_student_contact_advisor.advisor_id_new == df_advisor_temp.advisor_id_old_temp) &
            (df_student_contact_advisor.created_at_1 <= df_advisor_temp.created_at_temp),
            "left")
        df_join_data = df_join_data.groupby('student_id', 'advisor_id_new', 'created_at_1') \
            .agg(f.min("created_at_temp").alias("created_at_temp"))
        df_join_data = df_join_data.withColumn(
            "end_time", check_endtime_null(df_join_data.created_at_temp))
        df_join_data = df_join_data.dropDuplicates()

        dyf_student_contact_advisor = DynamicFrame.fromDF(
            df_join_data, glueContext, "dyf_student_contact_advisor")
        dyf_student_contact_advisor.printSchema()

        applymapping = ApplyMapping.apply(
            frame=dyf_student_contact_advisor,
            mappings=[("student_id", "string", "student_id", "string"),
                      ("advisor_id_new", "string", "advisor_id", "string"),
                      ("created_at_1", 'int', 'start_time', 'long'),
                      ("end_time", 'long', 'end_time', 'long')])
        resolvechoice = ResolveChoice.apply(
            frame=applymapping, choice="make_cols",
            transformation_ctx="resolvechoice2")
        dropnullfields = DropNullFields.apply(
            frame=resolvechoice, transformation_ctx="dropnullfields")

        datasink5 = glueContext.write_dynamic_frame.from_jdbc_conf(
            frame=dropnullfields,
            catalog_connection="glue_redshift",
            connection_options={
                "dbtable": "fact_log_change_advisor",
                "database": "dts_odin"
            },
            redshift_tmp_dir="s3n://dtsodin/temp1/fact_log_change_advisor/",
            transformation_ctx="datasink5")

        print('START WRITE TO S3-------------------------')
        s3 = boto3.resource('s3')
        bucket = s3.Bucket('dtsodin')
        bucket.objects.filter(Prefix="student_behavior/student_advisor/").delete()
        s3 = boto3.client('s3')
        bucket_name = "dtsodin"
        directory_name = "student_behavior/student_advisor/"  # the S3 prefix ("folder") to recreate
        s3.put_object(Bucket=bucket_name, Key=directory_name)

        log_student_advisor = glueContext.create_dynamic_frame.from_options(
            connection_type="redshift",
            connection_options={
                "url": "jdbc:redshift://datashine-dev.c4wxydftpsto.ap-southeast-1.redshift.amazonaws.com:5439/dts_odin",
                "user": "******",
                "password": "******",
                "dbtable": "fact_log_change_advisor",
                "redshiftTmpDir": "s3n://dtsodin/temp1/fact_log_change_advisor/"
            })
        datasink6 = glueContext.write_dynamic_frame.from_options(
            frame=log_student_advisor,
            connection_type="s3",
            connection_options={
                "path": "s3://dtsodin/student_behavior/student_advisor/"
            },
            format="parquet",
            transformation_ctx="datasink6")
        print('END WRITE TO S3-------------------------')

        # write the flag: take the max _key in the data source
        datasource = dyf_student_advisor.toDF()
        flag = datasource.agg({"_key": "max"}).collect()[0][0]
        flag_data = [flag]
        df = spark.createDataFrame(flag_data, "long").toDF('flag')
        # overwrite the _key checkpoint in S3
        df.write.parquet("s3a://dtsodin/flag/flag_log_student_advisor.parquet",
                         mode="overwrite")

    ##################################################################
    dyf_student_product = glueContext.create_dynamic_frame.from_catalog(
        database="tig_market", table_name="tpe_enduser_used_product")
    dyf_student_product = dyf_student_product.select_fields([
        '_key', 'id', 'contact_id', 'product_id', 'starttime', 'endtime',
        'timecreated', 'balance_used', 'status', 'timemodified'
    ]).rename_field('contact_id', 'contact_id1')
    dyf_student_product = dyf_student_product.resolveChoice(
        specs=[('_key', 'cast:long')])

    try:
        # read the flag checkpoint from S3
        df_flag = spark.read.parquet(
            "s3://dtsodin/flag/flag_tpe_enduser_used_product.parquet")
        start_read = df_flag.collect()[0]['flag']
        print('read from index: ', start_read)
        # compare the datasource _key with the flag; keep only keys > flag
        # dyf_student_product = Filter.apply(frame=dyf_student_product, f=lambda x: x['_key'] > start_read)
    except:
        print('read flag file error ')

    dyf_student_product = Filter.apply(
        frame=dyf_student_product,
        f=lambda x: x["product_id"] is not None and x["product_id"] != ''
                    and x["contact_id1"] is not None and x["contact_id1"] != '')

    if dyf_student_product.count() > 0:
        dyf_product_details = glueContext.create_dynamic_frame.from_catalog(
            database="tig_market", table_name="tpe_invoice_product_details")
        dyf_product_details = dyf_product_details.select_fields(
            ['id', 'cat_code', 'package_cost', 'actual_cost', 'package_time']
        ).rename_field('id', 'id_goi')

        # filter data
        # df_goi = datasource1.toDF()
        # df_goi = df_goi.where("id_goi <> '' and id_goi is not null")
        # data_goi = DynamicFrame.fromDF(df_goi, glueContext, "data_goi")
        dyf_product_details = Filter.apply(
            frame=dyf_product_details,
            f=lambda x: x["id_goi"] is not None and x["id_goi"] != '')

        # df_lsmua = dyf_student_product.toDF()
        # df_lsmua = df_lsmua.where("product_id is not null and product_id <> ''")
        # df_lsmua = df_lsmua.where("contact_id is not null and contact_id <> ''")
        # data_lsmua = DynamicFrame.fromDF(df_lsmua, glueContext, "data_lsmua")

        # map purchase history to the package detail info
        dyf_student_package = Join.apply(dyf_student_product, dyf_product_details,
                                         'product_id', 'id_goi')
        dyf_join_data = Join.apply(dyf_student_package, dyf_student_contact,
                                   'contact_id1', 'contact_id')
        print("dyf_student_package count1: ", dyf_join_data.count())

        df_join_data = dyf_join_data.toDF()
        # drop duplicate rows
        # df_join_data = df_join_data.dropDuplicates()
        df_join_data = df_join_data.groupby('id', 'student_id').agg(
            f.first('cat_code').alias('cat_code'),
            f.first('starttime').alias('starttime'),
            f.first('endtime').alias('endtime'),
            f.first('timecreated').alias('timecreated'))
        dyf_student_package = DynamicFrame.fromDF(df_join_data, glueContext,
                                                  "dyf_student_package")

        # convert data
        # df_student_package = dyf_student_package.toDF()
        # df_student_package = df_student_package.withColumn('ngay_kich_hoat', from_unixtime(df_student_package.starttime)) \
        #     .withColumn('ngay_het_han', from_unixtime(df_student_package.endtime)) \
        #     .withColumn('ngay_mua', from_unixtime(df_student_package.timecreated)) \
        #     .withColumn('id_time', from_unixtime(df_student_package.starttime, "yyyyMMdd")) \
        #     .withColumn('ngay_thay_doi', from_unixtime(df_student_package.timemodified))
        # df_student_package = df_student_package.dropDuplicates(['contact_id', 'product_id'])
        # data = DynamicFrame.fromDF(df_student_package, glueContext, "data")
        # df_student_package.printSchema()
        # df_student_package.show()

        # choose the fields and data types to push into the DB
        applyMapping = ApplyMapping.apply(
            frame=dyf_student_package,
            mappings=[("id", "string", "id", "string"),
                      ("student_id", "string", "student_id", "string"),
                      ("cat_code", 'string', 'package_code', 'string'),
                      ("starttime", "int", "start_time", "int"),
                      ("endtime", "int", "end_time", "int"),
                      ("timecreated", "int", "timecreated", "int")])
        resolvechoice = ResolveChoice.apply(
            frame=applyMapping, choice="make_cols",
            transformation_ctx="resolvechoice2")
        dropnullfields = DropNullFields.apply(
            frame=resolvechoice, transformation_ctx="dropnullfields3")
        dropnullfields.show(10)

        # write data to the DB
        datasink4 = glueContext.write_dynamic_frame.from_jdbc_conf(
            frame=dropnullfields,
            catalog_connection="glue_redshift",
            connection_options={
                "dbtable": "fact_student_package",
                "database": "dts_odin"
            },
            redshift_tmp_dir="s3n://dtsodin/temp1/fact_student_package/",
            transformation_ctx="datasink4")

        print('START WRITE TO S3-------------------------')
        s3 = boto3.resource('s3')
        bucket = s3.Bucket('dtsodin')
        bucket.objects.filter(Prefix="student_behavior/student_package/").delete()
        s3 = boto3.client('s3')
        bucket_name = "dtsodin"
        directory_name = "student_behavior/student_package/"
        s3.put_object(Bucket=bucket_name, Key=directory_name)

        log_student_package = glueContext.create_dynamic_frame.from_options(
            connection_type="redshift",
            connection_options={
                "url": "jdbc:redshift://datashine-dev.c4wxydftpsto.ap-southeast-1.redshift.amazonaws.com:5439/dts_odin",
                "user": "******",
                "password": "******",
                "dbtable": "fact_student_package",
                "redshiftTmpDir": "s3n://dtsodin/temp1/fact_student_package/"
            })
        datasink6 = glueContext.write_dynamic_frame.from_options(
            frame=log_student_package,
            connection_type="s3",
            connection_options={
                "path": "s3://dtsodin/student_behavior/student_package/"
            },
            format="parquet",
            transformation_ctx="datasink6")
        print('END WRITE TO S3-------------------------')

        # write the flag: take the max _key in the data source
        datasource = dyf_student_product.toDF()
        flag = datasource.agg({"_key": "max"}).collect()[0][0]
        flag_data = [flag]
        df = spark.createDataFrame(flag_data, "long").toDF('flag')
        # overwrite the _key checkpoint in S3
        df.write.parquet(
            "s3a://dtsodin/flag/flag_tpe_enduser_used_product_log.parquet",
            mode="overwrite")

    ##################################################################
    def do_check_null(val):
        if val is not None:
            return val
        return 4094592235

    check_data_null = udf(do_check_null, LongType())

    dyf_tpe_enduser_used_product_history = glueContext.create_dynamic_frame.from_catalog(
        database="tig_market", table_name="tpe_enduser_used_product_history")
    dyf_tpe_enduser_used_product_history = dyf_tpe_enduser_used_product_history.select_fields(
        ['_key', 'contact_id', 'status_old', 'status_new', 'timecreated'])
    dyf_tpe_enduser_used_product_history = dyf_tpe_enduser_used_product_history.resolveChoice(
        specs=[('_key', 'cast:long')])

    try:
        # read the flag checkpoint from S3
        df_flag = spark.read.parquet(
            "s3a://dtsodin/flag/flag_tpe_enduser_used_product_history.parquet")
        start_read = df_flag.collect()[0]['flag']
        print('read from index: ', start_read)
        # compare the datasource _key with the flag; keep only keys > flag
        # dyf_tpe_enduser_used_product_history = Filter.apply(frame=dyf_tpe_enduser_used_product_history,
        #                                                     f=lambda x: x['_key'] > start_read)
    except:
        print('read flag file error ')

    dyf_tpe_enduser_used_product_history = Filter.apply(
        frame=dyf_tpe_enduser_used_product_history,
        f=lambda x: x["contact_id"] is not None and x["contact_id"] != '')

    if dyf_tpe_enduser_used_product_history.count() > 0:
        df_tpe_enduser_used_product_history01 = dyf_tpe_enduser_used_product_history.toDF()
        df_tpe_enduser_used_product_history02 = dyf_tpe_enduser_used_product_history.toDF()

        df_tpe_enduser_used_product_history01 = df_tpe_enduser_used_product_history01 \
            .withColumnRenamed('timecreated', 'start_date')
        df_tpe_enduser_used_product_history02 = df_tpe_enduser_used_product_history02 \
            .withColumnRenamed('timecreated', 'timecreated02') \
            .withColumnRenamed('status_old', 'status_old02') \
            .withColumnRenamed('status_new', 'status_new02') \
            .withColumnRenamed('contact_id', 'contact_id02')

        df_tpe_enduser_used_product_history_join = df_tpe_enduser_used_product_history01.join(
            df_tpe_enduser_used_product_history02,
            (df_tpe_enduser_used_product_history01['contact_id'] == df_tpe_enduser_used_product_history02['contact_id02']) &
            (df_tpe_enduser_used_product_history01['status_new'] == df_tpe_enduser_used_product_history02['status_old02']) &
            (df_tpe_enduser_used_product_history01['start_date'] <= df_tpe_enduser_used_product_history02['timecreated02']),
            "left")
        df_tpe_enduser_used_product_history_join = df_tpe_enduser_used_product_history_join \
            .withColumn("end_date",
                        check_data_null(df_tpe_enduser_used_product_history_join.timecreated02))

        df_tpe_enduser_used_product_history_join.printSchema()
        print(df_tpe_enduser_used_product_history_join.count())
        df_tpe_enduser_used_product_history_join.show(10)

        dyf_tpe_enduser_product_history = DynamicFrame.fromDF(
            df_tpe_enduser_used_product_history_join, glueContext,
            "dyf_tpe_enduser_product_history")
        dyf_tpe_enduser_product_history = dyf_tpe_enduser_product_history.select_fields([
            'contact_id', 'status_old', 'status_new', 'start_date', 'end_date'
        ])
        df_tpe_enduser_used_product_history = dyf_tpe_enduser_product_history.toDF()

        df_tpe_enduser_used_product_history_join.printSchema()
        print(df_tpe_enduser_used_product_history_join.count())
        df_tpe_enduser_used_product_history_join.show(10)

        dyf_tpe_enduser_product_history = DynamicFrame.fromDF(
            df_tpe_enduser_used_product_history, glueContext,
            "dyf_tpe_enduser_product_history")

        # choose the fields and data types to push into the DB
        applyMapping = ApplyMapping.apply(
            frame=dyf_tpe_enduser_product_history,
            mappings=[("contact_id", "string", "contact_id", "string"),
                      ("status_new", "string", "status_code", "string"),
                      ("status_old", "string", "last_status_code", "string"),
                      ("start_date", "int", "start_date", "long"),
                      ("end_date", "long", "end_date", "long")])
        resolvechoice = ResolveChoice.apply(
            frame=applyMapping, choice="make_cols",
            transformation_ctx="resolvechoice2")
        dropnullfields = DropNullFields.apply(
            frame=resolvechoice, transformation_ctx="dropnullfields")

        # write data to the DB
        datasink4 = glueContext.write_dynamic_frame.from_jdbc_conf(
            frame=dropnullfields,
            catalog_connection="glue_redshift",
            connection_options={
                "dbtable": "fact_log_student_status",
                "database": "dts_odin",
            },
            redshift_tmp_dir="s3n://dtsodin/temp1/tpe_enduser_used_product_history/",
            transformation_ctx="datasink4")

        print('START WRITE TO S3-------------------------')
        s3 = boto3.resource('s3')
        bucket = s3.Bucket('dtsodin')
        bucket.objects.filter(Prefix="student_behavior/student_status/").delete()
        s3 = boto3.client('s3')
        bucket_name = "dtsodin"
        directory_name = "student_behavior/student_status/"
        s3.put_object(Bucket=bucket_name, Key=directory_name)

        log_student_status = glueContext.create_dynamic_frame.from_options(
            connection_type="redshift",
            connection_options={
                "url": "jdbc:redshift://datashine-dev.c4wxydftpsto.ap-southeast-1.redshift.amazonaws.com:5439/dts_odin",
                "user": "******",
                "password": "******",
                "dbtable": "fact_log_student_status",
                "redshiftTmpDir": "s3n://dtsodin/temp1/fact_log_student_status/"
            })
        datasink6 = glueContext.write_dynamic_frame.from_options(
            frame=log_student_status,
            connection_type="s3",
            connection_options={
                "path": "s3://dtsodin/student_behavior/student_status/"
            },
            format="parquet",
            transformation_ctx="datasink6")
        print('END WRITE TO S3-------------------------')

        # write the flag: take the max _key in the data source
        datasource = dyf_tpe_enduser_used_product_history.toDF()
        flag = datasource.agg({"_key": "max"}).collect()[0][0]
        flag_data = [flag]
        df = spark.createDataFrame(flag_data, "long").toDF('flag')
        # overwrite the _key checkpoint in S3
        df.write.parquet(
            "s3a://dtsodin/flag/flag_tpe_enduser_used_product_history.parquet",
            mode="overwrite")


if __name__ == '__main__':
    main()
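# The script above repeats the same "read table back from Redshift, dump it to
# S3 as Parquet" block three times. A sketch of factoring it into one helper,
# keeping the connection settings used above (the masked credentials stay as
# placeholders; the helper name is illustrative):
def export_redshift_table_to_s3(glueContext, dbtable, tmp_dir, s3_path, ctx):
    # read the freshly loaded table back out of Redshift
    dyf = glueContext.create_dynamic_frame.from_options(
        connection_type="redshift",
        connection_options={
            "url": "jdbc:redshift://datashine-dev.c4wxydftpsto.ap-southeast-1.redshift.amazonaws.com:5439/dts_odin",
            "user": "******",
            "password": "******",
            "dbtable": dbtable,
            "redshiftTmpDir": tmp_dir
        })
    # mirror it to the S3 data lake as Parquet
    glueContext.write_dynamic_frame.from_options(
        frame=dyf,
        connection_type="s3",
        connection_options={"path": s3_path},
        format="parquet",
        transformation_ctx=ctx)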
from pyspark.context import SparkContext
from pyspark.sql import functions as f
from pyspark.sql.functions import from_unixtime
from awsglue.context import GlueContext
from awsglue.dynamicframe import DynamicFrame
from awsglue.transforms import *


def main():
    glueContext = GlueContext(SparkContext.getOrCreate())
    spark = glueContext.spark_session

    mdl_tpe_enduser_used_product_history = glueContext.create_dynamic_frame.from_catalog(
        database="tig_market", table_name="tpe_enduser_used_product_history")

    # handle the case where start_read is null
    try:
        # read the flag checkpoint from S3
        df_flag = spark.read.parquet("s3a://dts-odin/flag/fact_flag_expired.parquet")
        start_read = df_flag.collect()[0]['flag']
        print('read from index: ', start_read)
        # compare the datasource _key with the flag; keep only keys > flag
        mdl_tpe_enduser_used_product_history = Filter.apply(
            frame=mdl_tpe_enduser_used_product_history,
            f=lambda x: x['_key'] > start_read)
    except:
        print('read flag file error ')

    print('the number of new contacts: ', mdl_tpe_enduser_used_product_history.count())

    mdl_tpe_enduser_used_product_history = mdl_tpe_enduser_used_product_history.select_fields([
        '_key', 'id', 'used_product_id', 'contact_id', 'status_new', 'status_old', 'timecreated'
    ])
    mdl_tpe_enduser_used_product_history = mdl_tpe_enduser_used_product_history.resolveChoice(
        specs=[('_key', 'cast:long')])

    # df_flag = spark.read.parquet("s3a://dts-odin/flag/flag_LS_A3.parquet")
    # max_key = df_flag.collect()[0]['flag']
    # mdl_tpe_enduser_used_product_history = Filter.apply(frame=mdl_tpe_enduser_used_product_history,
    #                                                     f=lambda x: x["_key"] > max_key)

    if mdl_tpe_enduser_used_product_history.count() > 0:
        # "EXPRIED" covers a misspelled status value present in the source data
        mdl_tpe_enduser_used_product_history = Filter.apply(
            frame=mdl_tpe_enduser_used_product_history,
            f=lambda x: x["timecreated"] is not None and x["contact_id"] is not None
                        and x["used_product_id"] is not None
                        and x["status_old"] == "ACTIVED"
                        and x["status_new"] in ["EXPIRED", "EXPRIED"])

        # print(mdl_tpe_enduser_used_product_history.count())
        mdl_tpe_enduser_used_product_history = mdl_tpe_enduser_used_product_history.resolveChoice(
            specs=[('timecreated', 'cast:long')])

        df_mdl_tpe_enduser_used_product_history = mdl_tpe_enduser_used_product_history.toDF()
        df_mdl_tpe_enduser_used_product_history = df_mdl_tpe_enduser_used_product_history.withColumn(
            'change_status_date_id',
            from_unixtime(df_mdl_tpe_enduser_used_product_history['timecreated'], "yyyyMMdd")) \
            .withColumn('to_status_id', f.lit(108)) \
            .withColumn('timestamp1',
                        df_mdl_tpe_enduser_used_product_history['timecreated'] * f.lit(1000))

        # df_mdl_tpe_enduser_used_product_history = df_mdl_tpe_enduser_used_product_history.select(
        #     'used_product_id', 'contact_id', 'ngay_kich_hoat', 'id').withColumnRenamed(
        #     'used_product_id', 'id_product_buy')

        data_mdl_tpe_enduser_used_product_history = DynamicFrame.fromDF(
            df_mdl_tpe_enduser_used_product_history, glueContext,
            "data_mdl_tpe_enduser_used_product_history")
        data_mdl_tpe_enduser_used_product_history.printSchema()
        data_mdl_tpe_enduser_used_product_history.show(3)

        applymapping1 = ApplyMapping.apply(
            frame=data_mdl_tpe_enduser_used_product_history,
            mappings=[("contact_id", "string", "contact_id", "string"),
                      ("change_status_date_id", "string", "change_status_date_id", "long"),
                      ("timestamp1", "long", "timestamp1", "timestamp"),
                      ('to_status_id', 'int', 'to_status_id', 'long')])
        resolvechoice2 = ResolveChoice.apply(
            frame=applymapping1, choice="make_cols",
            transformation_ctx="resolvechoice2")
        dropnullfields3 = DropNullFields.apply(
            frame=resolvechoice2, transformation_ctx="dropnullfields3")

        datasink4 = glueContext.write_dynamic_frame.from_jdbc_conf(
            frame=dropnullfields3,
            catalog_connection="glue_redshift",
            connection_options={
                "dbtable": "mapping_changed_status_student",
                "database": "dts_odin",
                "postactions": """UPDATE mapping_changed_status_student
                                  SET user_id = (SELECT user_id FROM user_map
                                                 WHERE source_type = 1
                                                   AND source_id = mapping_changed_status_student.contact_id
                                                 LIMIT 1)
                                  WHERE user_id IS NULL AND to_status_id = 108"""
            },
            redshift_tmp_dir="s3n://datashine-dwh/temp1/",
            transformation_ctx="datasink4")

        # write the flag: take the max _key in the data source
        datasourceTmp = mdl_tpe_enduser_used_product_history.toDF()
        flag = datasourceTmp.agg({"_key": "max"}).collect()[0][0]
        flag_data = [flag]
        df = spark.createDataFrame(flag_data, "long").toDF('flag')
        # overwrite the _key checkpoint in S3
        df.write.parquet("s3a://dts-odin/flag/fact_flag_expired.parquet",
                         mode="overwrite")


if __name__ == '__main__':
    main()
def main():
    def getTimestampType(time_modified):
        time_modified = long(time_modified)
        time_modified = datetime.fromtimestamp(time_modified)
        time_modified = str(time_modified)
        return time_modified

    getTimestampType = udf(getTimestampType, StringType())

    glueContext = GlueContext(SparkContext.getOrCreate())
    spark = glueContext.spark_session

    dyf_quiz_atm = glueContext.create_dynamic_frame.from_catalog(
        database="moodle", table_name="top_quiz_attempts")
    dyf_quiz_atm = dyf_quiz_atm.select_fields(
        ['_key', 'id', 'userid', 'timemodified', 'uniqueid', 'quiz']).rename_field('id', 'attempt_id')
    dyf_quiz_atm = dyf_quiz_atm.resolveChoice(specs=[('_key', 'cast:long')])

    dyf_question_atm = glueContext.create_dynamic_frame.from_catalog(
        database="moodle", table_name="top_question_attempts")
    dyf_question_atm = dyf_question_atm.select_fields([
        'id', 'rightanswer', 'responsesummary', 'timemodified', 'maxmark',
        'questionusageid', 'questionid'
    ]).rename_field('id', 'attempt_step_id')
    # dyf_question_atm = dyf_question_atm.resolveChoice(specs=[('timemodified', 'cast:string')])

    dyf_quiz = glueContext.create_dynamic_frame.from_catalog(
        database="moodle", table_name="top_quiz")
    dyf_quiz = dyf_quiz.select_fields(['name', 'id']).rename_field('id', 'quiz_id')

    dyf_question_steps = glueContext.create_dynamic_frame.from_catalog(
        database="moodle", table_name="top_question_attempt_steps")
    dyf_question_steps = dyf_question_steps.select_fields(
        ['state', 'questionattemptid'])

    dyf_result_ai = glueContext.create_dynamic_frame.from_catalog(
        database="moodle", table_name="top_result_ai")
    # 'speech_result' had a stray leading dot; the mapping below expects 'speech_result'
    dyf_result_ai = dyf_result_ai.select_fields([
        'id', 'answer', 'speech_result', 'right_word', 'wrong_word', 'result'
    ])

    dyf_top_user = glueContext.create_dynamic_frame.from_catalog(
        database="moodle", table_name="top_user")
    dyf_top_user = dyf_top_user.select_fields(['id', 'student_id'])

    # try:
    #     # read the flag checkpoint from s3
    #     df_flag = spark.read.parquet("s3://dts-odin/flag/flag_ai_study_step_st_tuan_detail.parquet")
    #     max_key = df_flag.collect()[0]['flag']
    #     print('read from index: ', max_key)
    #     # compare the datasource _key against the flag and keep only records with key > flag
    #     dyf_quiz_atm = Filter.apply(frame=dyf_quiz_atm, f=lambda x: x['_key'] > max_key)
    # except:
    #     print('read flag error ')

    print('the number of new contacts: ', dyf_quiz_atm.count())

    # keep only records with the required fields present
    if (dyf_quiz_atm.count() > 0):
        dyf_quiz_atm = Filter.apply(
            frame=dyf_quiz_atm,
            f=lambda x: x['userid'] is not None and x[
                'uniqueid'] is not None and x['quiz'] is not None)

        join_01 = Join.apply(dyf_quiz_atm, dyf_question_atm, 'uniqueid',
                             'questionusageid').drop_fields(['questionusageid'])
        join_01.printSchema()
        join_02 = Join.apply(join_01, dyf_question_steps, 'attempt_step_id',
                             'questionattemptid')
        join_02 = Join.apply(join_02, dyf_top_user, 'userid', 'id')
        join_02.printSchema()
        df_join_02 = join_02.toDF()
        # df_join_02 = df_join_02.withColumn("source_system", f.lit("Native Test")) \
        #     .withColumn("created_at", unix_timestamp(df_join_02.timemodified, "yyyy-MM-dd HH:mm:ss"))
        df_join_02 = df_join_02.withColumn("source_system", f.lit("Native Test")) \
            .withColumn("created_at", getTimestampType(df_join_02.timemodified))
        join_02 = DynamicFrame.fromDF(df_join_02, glueContext, "join_02")
        # join_02 = join_02.resolveChoice(specs=[('created_at', 'cast:string')])
        join_02.printSchema()
        join_02.show(2)

        join_result_ai_1 = Join.apply(dyf_quiz_atm, dyf_quiz, 'quiz',
                                      'quiz_id').drop_fields(['quiz'])
        join_result_ai = Join.apply(join_result_ai_1, dyf_result_ai,
                                    'attempt_id', 'id')
        join_result_ai = Join.apply(join_result_ai, dyf_top_user, 'userid', 'id')
        print('join_result_ai schema:')
        join_result_ai.printSchema()
        df_join_result_ai = join_result_ai.toDF()
        # df_join_result_ai = df_join_result_ai.withColumn("source_system", f.lit("Native Test"))\
        #     .withColumn("created_at", unix_timestamp(df_join_result_ai.timemodified, "yyyy-MM-dd HH:mm:ss"))
        df_join_result_ai = df_join_result_ai.withColumn("source_system", f.lit("Native Test")) \
            .withColumn("created_at", getTimestampType(df_join_result_ai.timemodified))
        join_result_ai = DynamicFrame.fromDF(df_join_result_ai, glueContext,
                                             "join_result_ai")
        join_result_ai.printSchema()
        join_result_ai.show(3)

        applymapping1 = ApplyMapping.apply(
            frame=join_02,
            mappings=[("student_id", 'int', 'student_id', 'int'),
                      ("attempt_id", "long", "attempt_id", "string"),
                      ("rightanswer", "string", "correct_answer", "string"),
                      ("responsesummary", "string", "student_answer", "string"),
                      ("maxmark", "string", "received_point", "int"),
                      ("source_system", "string", "source_system", "string"),
                      ("attempt_step_id", "long", "attempt_step_id", "int"),
                      ("state", "string", "result", "string"),
                      ("created_at", "string", "created_at", "timestamp")])
        resolvechoice1 = ResolveChoice.apply(
            frame=applymapping1,
            choice="make_cols",
            transformation_ctx="resolvechoice1")
        dropnullfields1 = DropNullFields.apply(
            frame=resolvechoice1, transformation_ctx="dropnullfields1")
        datasink5 = glueContext.write_dynamic_frame.from_jdbc_conf(
            frame=dropnullfields1,
            catalog_connection="glue_redshift",
            connection_options={
                "dbtable": "student_test_detail",
                "database": "dts_odin"
            },
            redshift_tmp_dir="s3a://dts-odin/st_detail/",
            transformation_ctx="datasink5")
        print('START WRITE TO S3-------------------------')
        datasink6 = glueContext.write_dynamic_frame.from_options(
            frame=dropnullfields1,
            connection_type="s3",
            connection_options={
                "path": "s3://dts-odin/nvn_knowledge/student_test_detail/"
            },
            format="parquet",
            transformation_ctx="datasink6")
        print('END WRITE TO S3-------------------------')

        applymapping2 = ApplyMapping.apply(
            frame=join_result_ai,
            mappings=[("student_id", 'int', 'student_id', 'int'),
                      ("attempt_id", "long", "attempt_id", "string"),
                      ("name", "string", "test_type", "string"),
                      ("answer", "string", "correct_answer", "string"),
                      ("speech_result", "string", "student_answer", "string"),
                      ("source_system", "string", "source_system", "string"),
                      ("id", "int", "attempt_step_id", "int"),
                      ("result", "string", "result", "string"),
                      ("right_word", "string", "right_answer", "string"),
                      ("wrong_word", "string", "wrong_answer", "string"),
                      ("created_at", "string", "created_at", "timestamp")])
        # each transformation_ctx must be unique within the job, so this branch
        # gets its own names instead of reusing "resolvechoice1"/"dropnullfields"
        resolvechoice2 = ResolveChoice.apply(
            frame=applymapping2,
            choice="make_cols",
            transformation_ctx="resolvechoice2")
        dropnullfields2 = DropNullFields.apply(
            frame=resolvechoice2, transformation_ctx="dropnullfields2")
        datasink8 = glueContext.write_dynamic_frame.from_jdbc_conf(
            frame=dropnullfields2,
            catalog_connection="glue_redshift",
            connection_options={
                "dbtable": "student_test_detail",
                "database": "dts_odin"
            },
            redshift_tmp_dir="s3a://dts-odin/st_detail/",
            transformation_ctx="datasink8")
        print('START WRITE TO S3-------------------------')
        # write the cleaned frame, mirroring the datasink5/datasink6 pair above
        # (the original wrote the unmapped join_result_ai here)
        datasink7 = glueContext.write_dynamic_frame.from_options(
            frame=dropnullfields2,
            connection_type="s3",
            connection_options={
                "path": "s3://dts-odin/nvn_knowledge/student_test_detail/"
            },
            format="parquet",
            transformation_ctx="datasink7")
        print('END WRITE TO S3-------------------------')

        df_temp = dyf_quiz_atm.toDF()
        flag = df_temp.agg({"_key": "max"}).collect()[0][0]
        flag_data = [flag]
        df = spark.createDataFrame(flag_data, "long").toDF('flag')
        # overwrite the _key flag in s3
        df.write.parquet(
            "s3a://dts-odin/flag/flag_ai_study_step_st_tuan_detail.parquet",
            mode="overwrite")
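Every incremental job in this collection repeats the same checkpoint dance around a one-row parquet "flag" file, usually with the read half commented out. A minimal sketch of that pattern, assuming an existing SparkSession and Glue DynamicFrame; the helper names and path are illustrative, not from the source:

from awsglue.transforms import Filter

def read_flag(spark, flag_path):
    # returns the last processed _key, or None when no flag file exists yet
    try:
        return spark.read.parquet(flag_path).collect()[0]['flag']
    except Exception:
        return None

def filter_new_records(dyf, max_key):
    # keep only records strictly newer than the stored checkpoint
    if max_key is None:
        return dyf
    return Filter.apply(frame=dyf, f=lambda x: x['_key'] > max_key)

def write_flag(spark, dyf, flag_path):
    # persist the new high-water mark, overwriting the previous flag
    flag = dyf.toDF().agg({'_key': 'max'}).collect()[0][0]
    spark.createDataFrame([flag], 'long').toDF('flag') \
        .write.parquet(flag_path, mode='overwrite')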
import sys import boto3 import pyspark.sql.functions as F from awsglue.transforms import * from pyspark.context import SparkContext from awsglue.context import GlueContext from awsglue.job import Job glueContext = GlueContext(SparkContext.getOrCreate()) spark = glueContext.spark_session session = boto3.Session() glue_client = session.client(service_name='glue') ## @params: [JOB_NAME] workflowName = 'AmazonForecastWorkflow' workflow = glue_client.get_workflow(Name=workflowName) workflow_params = workflow['Workflow']['LastRun']['WorkflowRunProperties'] workflowRunId = workflow['Workflow']['LastRun']['WorkflowRunId'] PROCESSED_BUCKET = workflow_params['processedBucket'] LANDING_DB_NAME = workflow_params['landingDB'] LANDING_DB_TABLE = workflow_params['landingDBTable'] orders = glueContext.create_dynamic_frame_from_catalog( LANDING_DB_NAME, LANDING_DB_TABLE, transformation_ctx="orders") ordersDF = orders.toDF() ordersDF1 = ordersDF.select("invoicedate", "stockcode", "quantity", "storelocation")
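The job above pulls its configuration from the workflow's last run properties. For context, a hypothetical sketch of how an upstream step could publish those properties through boto3; the property values here are assumptions, not taken from the source:

import boto3

glue = boto3.client('glue')
workflow_name = 'AmazonForecastWorkflow'
run_id = glue.get_workflow(Name=workflow_name)['Workflow']['LastRun']['WorkflowRunId']
glue.put_workflow_run_properties(
    Name=workflow_name,
    RunId=run_id,
    RunProperties={
        'processedBucket': 'my-processed-bucket',  # assumed value
        'landingDB': 'my_landing_db',              # assumed value
        'landingDBTable': 'orders',                # assumed value
    })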
import sys
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.dynamicframe import DynamicFrame
from awsglue.job import Job
from pyspark.ml.feature import StringIndexer

glueContext = GlueContext(SparkContext.getOrCreate())
spark = glueContext.spark_session

args = getResolvedOptions(sys.argv, ['s3_bucket'])
s3_bucket = args['s3_bucket']
print(s3_bucket)
input_dir = f's3://{s3_bucket}/2020/'
print(input_dir)
output_dir = f's3://{s3_bucket}/output-dir'
print(output_dir)

df = glueContext.create_dynamic_frame_from_options(
    's3',
    {'paths': [input_dir], 'recurse': True, 'groupFiles': 'inPartition', 'groupSize': '1048576'},
    format="json")
df.printSchema()
df1 = df.toDF()
indexer = StringIndexer(inputCol="GENDER", outputCol="GENDER_INDEX")
indexed = indexer.fit(df1).transform(df1)
indexed.show()
# convert back to a DynamicFrame so the new GENDER_INDEX column is visible to
# apply_mapping below; the original applied the mapping to the pre-indexing frame
df = DynamicFrame.fromDF(indexed, glueContext, "indexed")
df = df.apply_mapping([
    ('FIRST', 'string', 'FIRST', 'string'),
    ('LAST', 'string', 'LAST', 'string'),
    ('AGE', 'integer', 'AGE', 'integer'),
    # StringIndexer emits a double, not an integer
    ('GENDER_INDEX', 'double', 'GENDER', 'integer'),
    ('LATITUDE', 'string', 'LATITUDE', 'string'),
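A note on the reader options used above: 'groupFiles' and 'groupSize' make Glue coalesce many small S3 objects into larger input groups, cutting per-file task overhead. A minimal sketch with illustrative values, assuming an existing glueContext; the path and sizes are placeholders:

dyf = glueContext.create_dynamic_frame_from_options(
    's3',
    {'paths': ['s3://my-bucket/many-small-files/'],
     'recurse': True,
     'groupFiles': 'inPartition',  # group small files within each partition
     'groupSize': '134217728'},    # target roughly 128 MiB per group
    format='json')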
import sys
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
from pyspark.sql import DataFrame, Row
from datetime import datetime
from awsglue import DynamicFrame

## @params: [JOB_NAME, SRC_DB_NAME, SRC_TABLE_NAME, DEST_S3_PATH]
args = getResolvedOptions(
    sys.argv, ['JOB_NAME', 'SRC_DB_NAME', 'SRC_TABLE_NAME', 'DEST_S3_PATH'])
src_db_name = args['SRC_DB_NAME']
src_table_name = args['SRC_TABLE_NAME']
dest_s3_path = args['DEST_S3_PATH']

sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)

## @type: DataSource
## @args: [database = src_db_name, table_name = src_table_name, additionalOptions = {"startingPosition": "TRIM_HORIZON", "inferSchema": "true"}, stream_type = kinesis]
## @return: datasource0
## @inputs: []
datasource0 = glueContext.create_data_frame.from_catalog(
    database=src_db_name,
    table_name=src_table_name,
    transformation_ctx="datasource0",
    additional_options={
        "startingPosition": "TRIM_HORIZON",
        "inferSchema": "true"
def main():
    glueContext = GlueContext(SparkContext.getOrCreate())
    spark = glueContext.spark_session
    spark.conf.set("spark.sql.session.timeZone", "GMT+07:00")
    ho_chi_minh_timezone = pytz.timezone('Asia/Ho_Chi_Minh')
    today = datetime.now(ho_chi_minh_timezone)
    today_second = long(today.strftime("%s"))
    print('today_id: ', today_second)
    # f.lit(today_second).cast('long').alias('transformed_at')

    satisfaction = ['1', '2', '3', '4', '5']
    student_id_unavailable = '0'
    package_endtime_unavailable = 99999999999L
    package_starttime_unavailable = 0L
    student_level_code_unavailable = 'UNAVAILABLE'
    student_status_code_unavailable = 'UNAVAILABLE'
    package_endtime = 'package_endtime'
    package_starttime = 'package_starttime'
    student_level_code = 'student_level_code'
    student_status_code = 'student_status_code'

    def doCheckModified(val1, val2):
        if val1 is not None:
            return val1
        return val2

    check_modified_null = udf(doCheckModified, StringType())

    def doCheckStudentID(code):
        # check for None before casting: str(None) is the string 'None',
        # so the original post-cast None check could never fire
        if code is None:
            return student_id_unavailable
        return str(code)

    check_student_id = udf(doCheckStudentID, StringType())

    def doCheckData(code, key):
        key = str(key)
        if code is None:
            if key == package_endtime:
                return package_endtime_unavailable
            else:
                return package_starttime_unavailable
        return code

    check_data = udf(doCheckData, IntegerType())

    def doCheckDataNull(code, key):
        # same fix as above: test None before str()
        key = str(key)
        if (code is None) and (key == student_level_code):
            return student_level_code_unavailable
        if (code is None) and (key == student_status_code):
            return student_status_code_unavailable
        return str(code)

    check_data_null = udf(doCheckDataNull, StringType())

    def concaText(student_behavior_date, behavior_id, student_id, contact_id,
                  package_code, package_endtime, package_starttime,
                  student_level_code, student_status_code, transformed_at):
        text_concat = ""
        if student_behavior_date is not None:
            text_concat += str(student_behavior_date)
        if behavior_id is not None:
            text_concat += str(behavior_id)
        if student_id is not None:
            text_concat += str(student_id)
        if contact_id is not None:
            text_concat += str(contact_id)
        if package_code is not None:
            text_concat += str(package_code)
        if package_endtime is not None:
            text_concat += str(package_endtime)
        if package_starttime is not None:
            text_concat += str(package_starttime)
        if student_level_code is not None:
            text_concat += str(student_level_code)
        if student_status_code is not None:
            text_concat += str(student_status_code)
        if transformed_at is not None:
            text_concat += str(transformed_at)
        return text_concat

    concaText = f.udf(concaText, StringType())

    dyf_ticket_log = glueContext.create_dynamic_frame.from_catalog(
        database="native_smile", table_name="ticket_log")
    dyf_ticket_log = dyf_ticket_log.select_fields([
        '_key', 'requester_email', 'satisfaction', 'satisfaction_at',
        'created_at'
    ])
    dyf_ticket_log = dyf_ticket_log.resolveChoice(specs=[('_key', 'cast:long')])

    # try:
    #     df_flag_1 = spark.read.parquet("s3://dtsodin/flag/flag_hoc_vien_rating_native_smile_caresoft.parquet")
    #     max_key = df_flag_1.collect()[0]['flag']
    #     print("max_key: ", max_key)
    #     # only take records beyond the stored max_key instead of a full load
    #     dyf_ticket_log = Filter.apply(frame=dyf_ticket_log, f=lambda x: x["_key"] > max_key)
    # except:
    #     print('read flag file error ')

    if dyf_ticket_log.count() > 0:
        dyf_student_contact_email = glueContext.create_dynamic_frame.from_catalog(
            database="tig_advisor", table_name="student_contact_email")
        dyf_student_contact_email = dyf_student_contact_email.select_fields(
            ['contact_id', 'email']) \
            .rename_field('contact_id', 'contact_id_email')

        dyf_student_contact = glueContext.create_dynamic_frame.from_catalog(
            database="tig_advisor", table_name="student_contact")
        dyf_student_contact = dyf_student_contact.select_fields(
            ['contact_id', 'student_id', 'level_study', 'time_lms_created'])

        dyf_log_student_status = glueContext.create_dynamic_frame.from_catalog(
            database="do_tig_advisor", table_name="log_student_status")
        dyf_log_student_status = dyf_log_student_status.select_fields(
            ['contact_id', 'status_code', 'last_status_code', 'start_date', 'end_date']) \
            .rename_field('contact_id', 'contact_id_status')

        dyf_log_student_package = glueContext.create_dynamic_frame.from_catalog(
            database="do_tig_advisor", table_name="log_student_package")
        dyf_log_student_package = dyf_log_student_package.select_fields(
            ['student_id', 'package_code', 'start_time', 'end_time']) \
            .rename_field('student_id', 'student_id_package') \
            .rename_field('start_time', 'start_time_package') \
            .rename_field('end_time', 'end_time_package')

        dyf_log_student_level_study = glueContext.create_dynamic_frame.from_catalog(
            database="tig_advisor", table_name="log_student_level_study")
        dyf_log_student_level_study = dyf_log_student_level_study.select_fields(
            ['contact_id', 'level_current', 'level_modified', 'package_code', 'time_created']) \
            .rename_field('contact_id', 'contact_id_level')

        dyf_ticket_log.printSchema()
        print dyf_ticket_log.count()
        dyf_rating_class = Filter.apply(
            frame=dyf_ticket_log,
            f=lambda x: x['satisfaction'] in satisfaction)
        print dyf_rating_class.count()
        try:
            df_rating_class = dyf_rating_class.toDF()
            df_rating_class = df_rating_class.limit(99999)
            df_student_contact = dyf_student_contact.toDF()
            df_student_contact_email = dyf_student_contact_email.toDF()
            df_log_student_level_study = dyf_log_student_level_study.toDF()
            df_temp = dyf_log_student_level_study.toDF()
            df_log_student_status = dyf_log_student_status.toDF()
            df_log_student_package = dyf_log_student_package.toDF()

            df_temp = df_temp.groupby(
                'contact_id_level', 'level_current', 'package_code').agg(
                    f.max("time_created").alias("time_created_max"))
            df_temp = df_temp.withColumnRenamed('contact_id_level', 'contact_id_join') \
                .withColumnRenamed('package_code', 'package_code_join')
            df_join0 = df_temp.join(
                df_log_student_level_study,
                (df_temp['contact_id_join'] == df_log_student_level_study['contact_id_level'])
                & (df_temp['package_code_join'] == df_log_student_level_study['package_code'])
                & (df_temp['time_created_max'] == df_log_student_level_study['time_created']),
                "left")
            print "=========== . ==========="
            df_join0.printSchema()
            dyf_join = DynamicFrame.fromDF(df_join0, glueContext, "dyf_join")
            dyf_join = dyf_join.select_fields([
                'contact_id_level', 'level_current', 'level_modified',
                'package_code', 'time_created'
            ])
            df_join = dyf_join.toDF()
            df_join.printSchema()
            df_join.show(10)
            print "########## . ###########"
            df_join0 = df_rating_class.join(
                df_student_contact_email,
                (df_rating_class['requester_email'] == df_student_contact_email['email']))
            df_join01 = df_join0.join(
                df_student_contact,
                (df_join0['contact_id_email'] == df_student_contact['contact_id']))
            df_join01.printSchema()
            df_join02 = df_join01.join(
                df_join,
                (df_join['contact_id_level'] == df_join01['contact_id'])
                & (df_join['time_created'] <= df_join01['time_lms_created']),
                "left")
            df_join02 = df_join02 \
                .withColumn("level_modified_new",
                            check_modified_null(df_join02.level_modified, df_join02.level_study)) \
                .withColumn("timecreated",
                            f.unix_timestamp(df_join02.created_at, "yyyy-MM-dd HH:mm:ss"))
            df_join02.printSchema()
            df_join02.show(10)
            dyf_join = DynamicFrame.fromDF(df_join02, glueContext, "dyf_join")
            dyf_join = dyf_join.select_fields([
                'timecreated', 'contact_id', 'student_id', 'level_study',
                'time_lms_created', 'level_current', 'level_modified',
                'package_code', 'time_created', 'satisfaction',
                'level_modified_new'
            ])
            # dyf_join_temp = Filter.apply(frame=dyf_join,
            #                              f=lambda x: x["level_modified_new"] is None)
            # print "count: ", dyf_join_temp.count()

            df_join02 = dyf_join.toDF()
            df_join03 = df_join02.join(
                df_log_student_status,
                (df_log_student_status['contact_id_status'] == df_join02['contact_id'])
                & (df_log_student_status['start_date'] <= df_join02['timecreated'])
                & (df_log_student_status['end_date'] >= df_join02['timecreated']),
                "left")
            df_join04 = df_join03.join(
                df_log_student_package,
                (df_log_student_package['student_id_package'] == df_join03['student_id'])
                & (df_log_student_package['start_time_package'] <= df_join03['timecreated'])
                & (df_log_student_package['end_time_package'] >= df_join03['timecreated']),
                "left")
            dyf_join = DynamicFrame.fromDF(df_join04, glueContext, "dyf_join")
            dyf_join = Filter.apply(
                frame=dyf_join,
                f=lambda x: x["start_time_package"] is not None and x[
                    "end_time_package"] is not None)
            print "dyf_join: ", dyf_join.count()
            dyf_join.show(10)
            dyf_join = dyf_join.select_fields([
                'timecreated', 'student_id', 'contact_id', 'package_code',
                'satisfaction', 'start_time_package', 'end_time_package',
                'level_modified_new', 'status_code'
            ])
            # dyf_join01 = Filter.apply(frame=dyf_join,
            #                           f=lambda x: x["level_current"] is not None)
            # print "Check null ", dyf_join01.count()

            df_join04 = dyf_join.toDF()
            df_join04 = df_join04.withColumn("transformed_at", unix_timestamp(f.current_timestamp())) \
                .withColumn("student_id", check_student_id(df_join04.student_id)) \
                .withColumn("package_endtime", check_data(df_join04.end_time_package, f.lit(package_endtime))) \
                .withColumn("package_starttime", check_data(df_join04.start_time_package, f.lit(package_starttime))) \
                .withColumn("student_level_code", check_data_null(df_join04.level_modified_new, f.lit(student_level_code))) \
                .withColumn("student_status_code", check_data_null(df_join04.status_code, f.lit(student_status_code))) \
                .withColumn("behavior_id", f.lit(26)) \
                .withColumn("rating_type", f.lit("rating_native_smile_caresoft")) \
                .withColumn("comment", f.lit("")) \
                .withColumn("rating_about", f.lit(None)) \
                .withColumn("number_rating", f.lit(1)) \
                .withColumn("value_rating", df_join04.satisfaction)
            df_join04.printSchema()
            print df_join04.count()
            df_join04.show(10)
            dyf_join = DynamicFrame.fromDF(df_join04, glueContext, "dyf_join")
            # dyf_join.printSchema()
            # print dyf_join.count()
            # dyf_join.show(10)

            dyf_rating_cara = ApplyMapping.apply(
                frame=dyf_join,
                mappings=[
                    ("timecreated", "int", "student_behavior_date", "long"),
                    ("behavior_id", "int", "behavior_id", "long"),
                    ("student_id", "string", "student_id", "long"),
                    ("contact_id", "string", "contact_id", "string"),
                    ("package_code", "string", "package_code", "string"),
                    ("package_endtime", "int", "package_endtime", "long"),
                    ("package_starttime", "int", "package_starttime", "long"),
                    ("student_level_code", "string", "student_level_code", "string"),
                    ("student_status_code", "string", "student_status_code", "string"),
                    ("transformed_at", "long", "transformed_at", "long"),
                    ("rating_type", "string", "rating_type", "string"),
                    ("comment", "string", "comment", "string"),
                    ("rating_about", "int", "rating_about", "long"),
                    ("number_rating", "int", "number_rating", "long"),
                    ("value_rating", "int", "value_rating", "long")
                ])
            df_rating_cara = dyf_rating_cara.toDF()
            df_rating_cara2 = df_rating_cara.withColumn(
                'student_behavior_id',
                f.md5(
                    concaText(df_rating_cara.student_behavior_date,
                              df_rating_cara.behavior_id,
                              df_rating_cara.student_id,
                              df_rating_cara.contact_id,
                              df_rating_cara.package_code,
                              df_rating_cara.package_endtime,
                              df_rating_cara.package_starttime,
                              df_rating_cara.student_level_code,
                              df_rating_cara.student_status_code,
                              df_rating_cara.transformed_at)))
            dyf_rating_cara = DynamicFrame.fromDF(df_rating_cara2, glueContext,
                                                  'dyf_rating_cara')
            dyf_rating_cara = Filter.apply(
                frame=dyf_rating_cara,
                f=lambda x: x["contact_id"] is not None and x["contact_id"] != '')

            applymapping0 = ApplyMapping.apply(
                frame=dyf_rating_cara,
                mappings=[("student_behavior_id", "string", "student_behavior_id", "string"),
                          ("rating_type", "string", "rating_type", "string"),
                          ("comment", "string", "comment", "string"),
                          ("rating_about", "long", "rating_about", "long"),
                          ("number_rating", "long", "number_rating", "long"),
                          ("value_rating", "long", "value_rating", "long"),
                          ("behavior_id", "long", "behavior_id", "long")])
            applymapping0.printSchema()
            print applymapping0.count()
            # applymapping0.show(5)
            # the transformation_ctx is "resolvechoice0" here so it does not
            # collide with "resolvechoice1" used further down
            resolvechoice0 = ResolveChoice.apply(
                frame=applymapping0,
                choice="make_cols",
                transformation_ctx="resolvechoice0")
            dropnullfields0 = DropNullFields.apply(
                frame=resolvechoice0, transformation_ctx="dropnullfields0")
            print resolvechoice0.count()
            # resolvechoice0.printSchema()
            resolvechoice0.show(10)

            print('START WRITE TO S3-------------------------')
            datasink0 = glueContext.write_dynamic_frame.from_options(
                frame=dropnullfields0,
                connection_type="s3",
                connection_options={
                    "path": "s3://dtsodin/student_behavior/student_rating/",
                    "partitionKeys": ["behavior_id"]
                },
                format="parquet",
                transformation_ctx="datasink0")
            print('END WRITE TO S3-------------------------')
            # datasink0 = glueContext.write_dynamic_frame.from_jdbc_conf(frame=dropnullfields0,
            #                                                            catalog_connection="glue_redshift",
            #                                                            connection_options={
            #                                                                "dbtable": "student_rating_temp",
            #                                                                "database": "dts_odin"
            #                                                            },
            #                                                            redshift_tmp_dir="s3a://dtsodin/temp/student_rating_temp/",
            #                                                            transformation_ctx="datasink0")

            applymapping1 = ApplyMapping.apply(
                frame=dyf_rating_cara,
                mappings=[
                    ("student_behavior_id", "string", "student_behavior_id", "string"),
                    ("student_behavior_date", "long", "student_behavior_date", "long"),
                    ("behavior_id", "long", "behavior_id", "long"),
                    ("student_id", "long", "student_id", "long"),
                    ("contact_id", "string", "contact_id", "string"),
                    ("package_code", "string", "package_code", "string"),
                    ("package_endtime", "long", "package_endtime", "long"),
                    ("package_starttime", "long", "package_starttime", "long"),
                    ("student_level_code", "string", "student_level_code", "string"),
                    ("student_status_code", "string", "student_status_code", "string"),
                    ("transformed_at", "long", "transformed_at", "long")
                ])
            applymapping1.printSchema()
            print applymapping1.count()
            # applymapping1.show(10)
            resolvechoice1 = ResolveChoice.apply(
                frame=applymapping1,
                choice="make_cols",
                transformation_ctx="resolvechoice1")
            dropnullfields1 = DropNullFields.apply(
                frame=resolvechoice1, transformation_ctx="dropnullfields1")
            print resolvechoice1.count()
            # resolvechoice1.printSchema()
            resolvechoice1.show(10)

            print('START WRITE TO S3-------------------------')
            datasink6 = glueContext.write_dynamic_frame.from_options(
                frame=dropnullfields1,
                connection_type="s3",
                connection_options={
                    "path": "s3://dtsodin/student_behavior/student_behavior/",
                    "partitionKeys": ["behavior_id"]
                },
                format="parquet",
                transformation_ctx="datasink6")
            print('END WRITE TO S3-------------------------')
            # datasink1 = glueContext.write_dynamic_frame.from_jdbc_conf(frame=dropnullfields1,
            #                                                            catalog_connection="glue_redshift",
            #                                                            connection_options={
            #                                                                "dbtable": "student_behavior",
            #                                                                "database": "dts_odin"
            #                                                            },
            #                                                            redshift_tmp_dir="s3a://dtsodin/temp/student_behavior",
            #                                                            transformation_ctx="datasink1")

            df_temp = dyf_ticket_log.toDF()
            flag = df_temp.agg({"_key": "max"}).collect()[0][0]
            flag_data = [flag]
            df = spark.createDataFrame(flag_data, "long").toDF('flag')
            # overwrite the flag with the new max _key in s3
            df.write.parquet(
                "s3a://dtsodin/flag/flag_hoc_vien_rating_native_smile_caresoft.parquet",
                mode="overwrite")
        except Exception as e:
            print e
def main():
    glueContext = GlueContext(SparkContext.getOrCreate())
    spark = glueContext.spark_session

    datasource0 = glueContext.create_dynamic_frame.from_catalog(
        database="topicalms", table_name="mdl_tpebbb")
    datasource0 = datasource0.select_fields([
        '_key', 'id', 'name', 'description', 'roomtype', 'calendar_code',
        'vcr_type', 'vcr_class_id'
    ])
    # cast the incremental key
    datasource0 = datasource0.resolveChoice(specs=[('_key', 'cast:long')])

    # read the flag from s3
    df_flag = spark.read.parquet("s3://dts-odin/flag/flag_lophoc.parquet")
    # renamed from 'max' to avoid shadowing the builtin
    max_key = df_flag.collect()[0]['flag']
    print "max_key: ", max_key

    # compare the datasource _key against the flag and keep only records with key > flag
    #data = datasource0.toDF()
    #data = data.where(data['_key'] > df_flag.collect()[0]['flag'])
    #datasource0 = DynamicFrame.fromDF(data, glueContext, "datasource0")
    datasource0 = Filter.apply(frame=datasource0, f=lambda x: x["_key"] > max_key)
    print "Count data 1: ", datasource0.count()

    if (datasource0.count() > 0):
        # try:
        datasource1 = glueContext.create_dynamic_frame.from_catalog(
            database="topicalms", table_name="mdl_tpe_calendar_teach")
        datasource1 = datasource1.select_fields([
            'calendar_code', 'level_class', 'teacher_type', 'student_type',
            'type_class', 'teacher_id'
        ]).rename_field('calendar_code', 'code_calendar')

        # filter the class records
        # df_lop = datasource1.toDF()
        # df_lop = df_lop.where("code_calendar <> '' and code_calendar is not null")
        # df_lop = df_lop.where("teacher_type <> 'TRAINING' AND teacher_type <> 'ORIENTATION'")
        # data_lop = DynamicFrame.fromDF(df_lop, glueContext, "data_lop")
        data_lop = Filter.apply(frame=datasource1,
                                f=lambda x: x["code_calendar"] != '' and x[
                                    "code_calendar"] is not None)
        data_lop = Filter.apply(frame=data_lop,
                                f=lambda x: x["teacher_type"] != 'TRAINING' and
                                x["teacher_type"] != 'ORIENTATION')

        # filter the class-history records
        # df_lslop = datasource0.toDF()
        # df_lslop = df_lslop.where("calendar_code is not null and calendar_code <> ''")
        # df_lslop = df_lslop.where("roomtype = 'ROOM'")
        # df_lslop = df_lslop.where("id is not null and id <> ''")
        # data_lslop = DynamicFrame.fromDF(df_lslop, glueContext, "data_lslop")
        data_lslop = Filter.apply(frame=datasource0,
                                  f=lambda x: x["calendar_code"] != '' and x[
                                      "calendar_code"] is not None)
        data_lslop = Filter.apply(
            frame=data_lslop,
            f=lambda x: x["id"] != '' and x["id"] is not None)
        data_lslop = Filter.apply(frame=data_lslop,
                                  f=lambda x: x["roomtype"] == 'ROOM')

        join = Join.apply(data_lslop, data_lop, 'calendar_code', 'code_calendar')

        # select the fields and data types to load into the db
        applyMapping = ApplyMapping.apply(
            frame=join,
            mappings=[("id", "string", "id", "string"),
                      ("name", "string", "tenlop", "string"),
                      ("description", 'string', 'mota', 'string'),
                      ("roomtype", "string", "kieuphong", "string"),
                      ("vcr_type", "string", "kieuvcr", "string"),
                      ("vcr_class_id", "string", "malopvcr", "string"),
                      ("level_class", "string", "levellop", "string"),
                      ("teacher_type", "string", "kieugiaovien", "string"),
                      ("student_type", "string", "kieuhocvien", "string"),
                      ("type_class", "string", "kieulop", "string"),
                      ("teacher_id", "string", "id_giaovien", "string")])
        resolvechoice = ResolveChoice.apply(
            frame=applyMapping,
            choice="make_cols",
            transformation_ctx="resolvechoice2")
        dropnullfields = DropNullFields.apply(
            frame=resolvechoice, transformation_ctx="dropnullfields3")
        print "Count data 2: ", dropnullfields.count()

        # write the data to redshift
        datasink = glueContext.write_dynamic_frame.from_jdbc_conf(
            frame=dropnullfields,
            catalog_connection="glue_redshift",
            connection_options={
                "dbtable": "dim_lop_hoc",
                "database": "dts_odin"
            },
            redshift_tmp_dir="s3n://dts-odin/topicalms/dim_lophoc/",
            transformation_ctx="datasink4")
        # write the data to s3
        # datasink1 = glueContext.write_dynamic_frame.from_options(frame=dropnullfields, connection_type="s3",
        #                                                          connection_options={
        #                                                              "path": "s3://dts-odin/dim_lophoc"},
        #                                                          format="parquet", transformation_ctx="datasink1")

        # get the max _key of the datasource
        datasource = datasource0.toDF()
        flag = datasource.agg({"_key": "max"}).collect()[0][0]
        # build a one-row data frame holding the flag
        flag_data = [flag]
        df = spark.createDataFrame(flag_data, "long").toDF('flag')
        # overwrite the flag in s3
        df.write.parquet("s3a://dts-odin/flag/flag_lophoc.parquet", mode="overwrite")
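Nearly every job in this collection shares the same Redshift sink shape. A minimal sketch of it, assuming a Glue connection named "glue_redshift"; the table, database, and staging path are placeholders:

def write_to_redshift(glue_context, dyf):
    # Glue stages the frame in S3, then issues a COPY into Redshift
    glue_context.write_dynamic_frame.from_jdbc_conf(
        frame=dyf,
        catalog_connection='glue_redshift',
        connection_options={'dbtable': 'my_table', 'database': 'my_db'},
        redshift_tmp_dir='s3://my-bucket/tmp/')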
from datetime import datetime from awsglue.context import GlueContext from pyspark.context import SparkContext from common.transforms.with_current_timestamp import with_current_timestamp from .data import get_test_file_as_dynamic_frame glueContext = GlueContext(SparkContext.getOrCreate()) logger = glueContext.get_logger() def test_timestamp_addition_with_default_column(): start_time = datetime.utcnow() dogs = get_test_file_as_dynamic_frame('dogs.csv', glueContext) dogs_with_timestamp = with_current_timestamp()(dogs) timestamp_values = [ record['current_timestamp'] for record in dogs_with_timestamp.toDF().collect() ] assert len(timestamp_values) == dogs.count() assertion_time = datetime.utcnow() assert all(start_time <= timestamp_value <= assertion_time for timestamp_value in timestamp_values) def test_timestamp_addition_with_custom_column(): start_time = datetime.utcnow() dogs = get_test_file_as_dynamic_frame('dogs.csv', glueContext) dogs_with_timestamp = with_current_timestamp('mowie_wowie')(dogs) timestamp_values = [
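The implementation under test is not shown in this collection; the following is a hypothetical sketch inferred from the assertions above (a curried transform that appends the current UTC time under a configurable column name), not the actual common.transforms code:

from datetime import datetime
from pyspark.sql.functions import lit
from awsglue.dynamicframe import DynamicFrame

def with_current_timestamp(column_name='current_timestamp'):
    def transform(dyf):
        # stamp every record with the driver-side UTC time
        df = dyf.toDF().withColumn(column_name, lit(datetime.utcnow()))
        return DynamicFrame.fromDF(df, dyf.glue_ctx, 'with_current_timestamp')
    return transform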
def main():
    glueContext = GlueContext(SparkContext.getOrCreate())
    spark = glueContext.spark_session

    tpe_enduser_used_product_history = glueContext.create_dynamic_frame.from_catalog(
        database="tig_market", table_name="tpe_enduser_used_product_history")
    tpe_enduser_used_product_history = tpe_enduser_used_product_history.select_fields([
        '_key', 'id', 'used_product_id', 'contact_id', 'status_new',
        'status_old', 'timecreated'
    ])
    tpe_enduser_used_product_history = tpe_enduser_used_product_history.resolveChoice(
        specs=[('_key', 'cast:long')])

    df_flag = spark.read.parquet(
        "s3a://datashine-dev-redshift-backup/flag/flag_hvdkh_LS_A3.parquet")
    max_key = df_flag.collect()[0]['flag']
    tpe_enduser_used_product_history = Filter.apply(
        frame=tpe_enduser_used_product_history, f=lambda x: x["_key"] > max_key)
    print(tpe_enduser_used_product_history.count())

    if (tpe_enduser_used_product_history.count() > 0):
        # 'EXPRIED' is kept alongside 'EXPIRED', presumably to match a
        # misspelled status value that exists in the source data
        tpe_enduser_used_product_history = Filter.apply(
            frame=tpe_enduser_used_product_history,
            f=lambda x: x["timecreated"] is not None and x["contact_id"] is not
            None and x["used_product_id"] is not None and x[
                "status_new"] is not None and x["status_new"] == 'ACTIVED' and
            (x["status_old"] == 'SUSPENDED' or x["status_old"] == 'EXPIRED' or
             x["status_old"] == 'EXPRIED'))
        if (tpe_enduser_used_product_history.count() > 0):
            try:
                tpe_enduser_used_product_history = tpe_enduser_used_product_history.resolveChoice(
                    specs=[('timecreated', 'cast:long')])
                df_tpe_enduser_used_product_history = tpe_enduser_used_product_history.toDF()
                df_tpe_enduser_used_product_history = df_tpe_enduser_used_product_history.withColumn(
                    'ngay_kich_hoat',  # activation date
                    from_unixtime(
                        df_tpe_enduser_used_product_history['timecreated'],
                        "yyyyMMdd"))
                df_tpe_enduser_used_product_history = df_tpe_enduser_used_product_history.withColumn(
                    'timestemp',
                    df_tpe_enduser_used_product_history['timecreated'] *
                    f.lit(1000))
                df_tpe_enduser_used_product_history = df_tpe_enduser_used_product_history.withColumn(
                    'to_status_id', f.lit(107))
                data_tpe_enduser_used_product_history = DynamicFrame.fromDF(
                    df_tpe_enduser_used_product_history, glueContext,
                    "data_tpe_enduser_used_product_history")
                data_tpe_enduser_used_product_history = data_tpe_enduser_used_product_history.resolveChoice(
                    specs=[('timestemp', 'cast:long')])
                data_tpe_enduser_used_product_history.printSchema()
                data_tpe_enduser_used_product_history.show(3)

                applymapping1 = ApplyMapping.apply(
                    frame=data_tpe_enduser_used_product_history,
                    mappings=[("ngay_kich_hoat", "string", "change_status_date_id", "long"),
                              ("to_status_id", "int", "to_status_id", "long"),
                              ("timestemp", "long", "timestamp1", "timestamp"),
                              ("contact_id", "string", "contact_id1", "string")])
                applymapping1.printSchema()
                applymapping1.show(20)
                resolvechoice2 = ResolveChoice.apply(
                    frame=applymapping1,
                    choice="make_cols",
                    transformation_ctx="resolvechoice2")
                dropnullfields3 = DropNullFields.apply(
                    frame=resolvechoice2, transformation_ctx="dropnullfields3")
                datasink4 = glueContext.write_dynamic_frame.from_jdbc_conf(
                    frame=dropnullfields3,
                    catalog_connection="glue_redshift",
                    connection_options={
                        "dbtable": "mapping_changed_status_student",
                        "database": "dts_odin",
                        "postactions": """UPDATE mapping_changed_status_student
                                          SET user_id = (SELECT user_id FROM user_map
                                                         WHERE source_type = 1
                                                           AND source_id = mapping_changed_status_student.contact_id1
                                                         LIMIT 1)
                                          WHERE user_id IS NULL AND to_status_id = 107"""
                    },
                    redshift_tmp_dir="s3n://datashine-dwh/temp1/",
                    transformation_ctx="datasink4")

                # write the flag
                # get the max key in the data source
                datasourceTmp = tpe_enduser_used_product_history.toDF()
                flag = datasourceTmp.agg({"_key": "max"}).collect()[0][0]
                flag_data = [flag]
                df = spark.createDataFrame(flag_data, "long").toDF('flag')
                # overwrite the _key flag in s3
                df.write.parquet(
                    "s3a://datashine-dev-redshift-backup/flag/flag_hvdkh_LS_A3.parquet",
                    mode="overwrite")
            except Exception as e:
                print("No new data")
                print(e)
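The "postactions" entry above deserves a note: SQL placed in that connection option runs on Redshift after the staged COPY completes, which is how this job backfills user_id. A stripped-down sketch with placeholder table and column names:

# illustrative only; names are assumptions, not from the source
post_sql = """UPDATE target_table
              SET user_id = (SELECT user_id FROM user_map
                             WHERE source_id = target_table.contact_id LIMIT 1)
              WHERE user_id IS NULL"""
connection_options = {
    'dbtable': 'target_table',
    'database': 'my_db',
    'postactions': post_sql,  # runs after the COPY into target_table finishes
}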
def main():
    sc = SparkContext()
    glueContext = GlueContext(sc)
    spark = glueContext.spark_session
    spark.conf.set("spark.sql.session.timeZone", "GMT+07:00")

    ho_chi_minh_timezone = pytz.timezone('Asia/Ho_Chi_Minh')
    today = datetime.now(ho_chi_minh_timezone)
    today = today.replace(hour=0, minute=0, second=0, microsecond=0)
    # presumably intended to snap to the first day of the month;
    # the original called replace() with no arguments, which is a no-op
    first_day_of_month = today.replace(day=1)
    print('today: ', today)
    yesterday = today - timedelta(1)
    print('yesterday: ', yesterday)
    today_id = long(today.strftime("%Y%m%d"))
    yesterday_id = long(yesterday.strftime("%Y%m%d"))
    today_id_0h00 = long(today.strftime("%s"))
    print('today_id: ', today_id)
    print('yesterday_id: ', yesterday_id)
    print('today_id_0h00: ', today_id_0h00)

    date_end = 1573232400L

    General = 'General'
    Vocabulary = 'Vocabulary'
    Grammar = 'Grammar'
    Speaking = 'Speaking'
    Listening = 'Listening'
    Phrasal_Verb = 'Phrasal'
    Pronunciation = 'Pronunciation'
    # question categories: Speaking (2), General (3), Phrasal Verb (4),
    # Grammar (5), Vocabulary (6), Pronunciation (7), Listening

    is_dev = True
    is_just_monthly_exam = False
    is_limit_test = False
    start_load_date = 0L

    BEHAVIOR_ID_TEST_TUAN = 22L
    BEHAVIOR_ID_TEST_THANG = 23L
    PERIOD_DAYLY = 1L
    PERIOD_WEEKLY = 2L
    PERIOD_MONTHLY = 3L

    def doCheckClassID(code):
        if code is None:
            return None
        code = str(code)
        if code == General:
            return 61L
        if code == Vocabulary:
            return 62L
        if code == Grammar:
            return 63L
        if code == Speaking:
            return 64L
        if code == Listening:
            return 65L
        if code == Pronunciation:
            return 66L
        if Phrasal_Verb in code:
            return 67L
        return None

    check_class_id = udf(doCheckClassID, LongType())

    # ------------------------------------------------------------------------------------------------------------------#
    my_partition_predicate = "(behavior_id=='22' or behavior_id=='23')"
    dyf_student_behavior = glueContext.create_dynamic_frame.from_catalog(
        database="od_student_behavior",
        table_name="student_behavior",
        push_down_predicate=my_partition_predicate)
    dyf_student_behaviors = dyf_student_behavior.resolveChoice(
        specs=[('behavior_id', 'cast:long'), ('transformed_at', 'cast:long')])

    # try:
    #     # read the flag checkpoint from s3
    #     df_flag = spark.read.parquet("s3://dts-odin/flag/flag_student_testing_history.parquet")
    #     max_key = df_flag.collect()[0]['flag']
    #     print('read from index: ', max_key)
    #     # compare the datasource key against the flag and keep only newer records
    #     dyf_student_behaviors = Filter.apply(frame=dyf_student_behaviors, f=lambda x: x['transformed_at'] > max_key)
    # except:
    #     print('read flag error ')

    if dyf_student_behaviors.count() > 0:
        dyf_student_behaviors = Filter.apply(
            frame=dyf_student_behaviors,
            f=lambda x: x["student_behavior_id"] is not None and x[
                "student_id"] is not None
            # and x["behavior_id"] in [BEHAVIOR_ID_TEST_TUAN,
            #                          BEHAVIOR_ID_TEST_THANG
            #                          ]
            and start_load_date <= x["student_behavior_date"] < today_id_0h00)

        number_dyf_student_behavior = dyf_student_behaviors.count()
        print('number_dyf_student_behavior after filtering: ',
              number_dyf_student_behavior)
        if number_dyf_student_behavior == 0:
            return

        dyf_student_behavior = dyf_student_behaviors \
            .select_fields(['student_behavior_id', 'student_behavior_date', 'student_id', 'behavior_id'])

        df_student_behavior = dyf_student_behavior.toDF()
        df_student_behavior = df_student_behavior.drop_duplicates(
            ['student_behavior_id'])
        if is_limit_test:
            df_student_behavior = df_student_behavior.limit(1000)
        df_student_behavior = df_student_behavior.repartition('behavior_id')
        df_student_behavior.cache()
        student_behavior_number = df_student_behavior.count()

        if is_dev:
            print('df_student_behavior')
            print('student_behavior_number: ', student_behavior_number)
            df_student_behavior.printSchema()
            df_student_behavior.show(3)

        if student_behavior_number == 0:
            return

        # ------------------------------------------------------------------------------------------------------------------#
        dyf_student_test_mark = glueContext.create_dynamic_frame.from_catalog(
            database="od_student_behavior",
            table_name="student_test_mark",
            push_down_predicate=my_partition_predicate)
        dyf_student_test_mark = dyf_student_test_mark.select_fields(
            ['student_behavior_id', 'question_category', 'grade'])
        # dyf_student_test_mark = Filter.apply(frame=dyf_student_test_mark,
        #                                      f=lambda x: x["behavior_id"] in [BEHAVIOR_ID_TEST_TUAN,
        #                                                                       BEHAVIOR_ID_TEST_THANG
        #                                                                       ]
        #                                      )
        df_student_test_mark = dyf_student_test_mark.toDF()
        number_student_test_mark = df_student_test_mark.count()
        if is_dev:
            print('df_student_test_mark')
            print('df_student_test_mark: ', number_student_test_mark)
            df_student_test_mark.printSchema()
            df_student_test_mark.show(3)
        if number_student_test_mark == 0:
            return

        df_student_behavior_mark = df_student_behavior \
            .join(df_student_test_mark, on='student_behavior_id', how='left')
        if is_dev:
            print('df_student_behavior_mark')
            print('df_student_behavior_mark: ', df_student_behavior_mark.count())
            df_student_behavior_mark.printSchema()
            df_student_behavior_mark.show(3)

        df_student_behavior_mark = df_student_behavior_mark.dropDuplicates([
            'student_behavior_id', 'student_id', 'behavior_id',
            'question_category'
        ])

        df_student_behavior_mark_week = df_student_behavior_mark \
            .filter(df_student_behavior_mark.behavior_id == BEHAVIOR_ID_TEST_TUAN)
        df_student_behavior_mark_month = df_student_behavior_mark.filter(
            df_student_behavior_mark.behavior_id == BEHAVIOR_ID_TEST_THANG)

        df_student_behavior_mark_week = df_student_behavior_mark_week \
            .withColumn('agg_week_id',
                        from_unixtime(df_student_behavior_mark_week.student_behavior_date, "yyyyww"))
        df_student_behavior_mark_month = df_student_behavior_mark_month \
            .withColumn('agg_month_id',
                        from_unixtime(df_student_behavior_mark_month.student_behavior_date, "yyyyMM"))

        if is_dev:
            print('df_student_behavior_mark_week')
            df_student_behavior_mark_week.printSchema()
            df_student_behavior_mark_week.show(3)
            print('df_student_behavior_mark_month')
            df_student_behavior_mark_month.printSchema()
            df_student_behavior_mark_month.show(3)

        df_student_behavior_mark_week = df_student_behavior_mark_week \
            .withColumn("class_id", check_class_id(df_student_behavior_mark_week.question_category))
        df_student_behavior_mark_week_agg = df_student_behavior_mark_week.groupby(
            'student_id', 'agg_week_id', 'class_id').agg(
                f.round(f.max(df_student_behavior_mark_week.grade)).cast(
                    'long').alias('grade_total'),
                f.lit(PERIOD_WEEKLY).alias('period_type_id'),
                f.lit(None).cast('string').alias('agg_date_id'),
                f.lit(None).cast('string').alias('agg_month_id'))

        df_student_behavior_mark_month = df_student_behavior_mark_month.na.fill(
            {'grade': 0})
        df_student_behavior_mark_month = df_student_behavior_mark_month.groupby(
            'student_behavior_id').agg(
                f.first('student_id').alias('student_id'),
                f.first('agg_month_id').alias('agg_month_id'),
                f.round(
                    f.sum('grade')).cast('long').alias('grade_total_attempt'),
            )
        df_student_behavior_mark_month_agg = df_student_behavior_mark_month.groupby(
            'student_id', 'agg_month_id').agg(
                f.max(
                    df_student_behavior_mark_month.grade_total_attempt).alias(
                        'grade_total'),
                f.lit(PERIOD_MONTHLY).alias('period_type_id'),
                f.lit(None).cast('string').alias('agg_date_id'),
                f.lit(None).cast('string').alias('agg_week_id'),
                f.lit(68L).cast('long').alias('class_id'))

        # reorder the monthly columns to match the weekly schema before union
        df_student_behavior_mark_month_agg = df_student_behavior_mark_month_agg.select(
            'student_id', 'agg_week_id', 'class_id', 'grade_total',
            'period_type_id', 'agg_date_id', 'agg_month_id')

        if is_dev:
            print('df_student_behavior_mark_week_agg')
            df_student_behavior_mark_week_agg.printSchema()
            df_student_behavior_mark_week_agg.show(3)
            print('df_student_behavior_mark_month_agg')
            df_student_behavior_mark_month_agg.printSchema()
            df_student_behavior_mark_month_agg.show(3)

        df_student_behavior_mark_agg = df_student_behavior_mark_week_agg.union(
            df_student_behavior_mark_month_agg)

        if is_dev:
            print('df_student_behavior_mark_agg')
            df_student_behavior_mark_agg.printSchema()
            df_student_behavior_mark_agg.show(3)

        dyf_student_behavior_mark_agg = DynamicFrame.fromDF(
            df_student_behavior_mark_agg, glueContext,
            'dyf_student_behavior_mark_agg')
        dyf_student_behavior_mark_agg = Filter.apply(
            frame=dyf_student_behavior_mark_agg,
            f=lambda x: x["class_id"] is not None)
        dyf_student_behavior_mark_agg.show(3)

        apply_output_month = ApplyMapping.apply(
            frame=dyf_student_behavior_mark_agg,
            mappings=[("student_id", "long", "student_id", "long"),
                      ("class_id", "long", "class_id", "long"),
                      ("period_type_id", "long", "period_type_id", "long"),
                      ("agg_date_id", "string", "created_date_id", "long"),
                      ("agg_week_id", "string", "created_week_id", "long"),
                      ("agg_month_id", "string", "created_month_id", "long"),
                      ("grade_total", "long", "measure1", "long")])
        dfy_output_month = ResolveChoice.apply(
            frame=apply_output_month,
            choice="make_cols",
            transformation_ctx="resolvechoice2")
        datasink4 = glueContext.write_dynamic_frame.from_jdbc_conf(
            frame=dfy_output_month,
            catalog_connection="nvn_knowledge",
            connection_options={
                "dbtable": "student_learning_history",
                "database": "nvn_knowledge_v2"
            },
            redshift_tmp_dir="s3n://dtsodin/temp/nvn_knowledge_v2/student_learning_history",
            transformation_ctx="datasink4")

        df_temp = dyf_student_behaviors.toDF()
        flag = df_temp.agg({"transformed_at": "max"}).collect()[0][0]
        flag_data = [flag]
        df = spark.createDataFrame(flag_data, "long").toDF('flag')
        # overwrite the flag in s3
        df.write.parquet(
            "s3a://dts-odin/flag/flag_student_testing_history.parquet",
            mode="overwrite")
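The push_down_predicate used above prunes partitions at planning time, so only S3 paths whose partition values satisfy the predicate are listed and read. A minimal sketch against the same catalog table, assuming an existing glueContext:

predicate = "(behavior_id=='22' or behavior_id=='23')"
dyf = glueContext.create_dynamic_frame.from_catalog(
    database='od_student_behavior',
    table_name='student_behavior',
    push_down_predicate=predicate)  # partitions outside the predicate are never read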
import sys
import re
import logging
import boto3
import pymysql
from botocore.exceptions import ClientError
from awsglue.utils import getResolvedOptions
from awsglue.context import GlueContext
from awsglue.job import Job
from pyspark.context import SparkContext
from pyspark.sql.functions import *

args = getResolvedOptions(
    sys.argv,
    ['JOB_NAME', 'bucketName', 'workspace', 'jdbc_url', 'username', 'pswd'])
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)

# ccn-alerts-104328-{workspace}-sfmc-migration
workspace = args['workspace']
bucketName = args['bucketName']
jdbc_url = args['jdbc_url']
username = args['username']
pswd = args['pswd']
print('args passed are: {}, {}, {}, {}'.format(workspace, bucketName,
                                               jdbc_url, username))
s3Location = 's3://{}'.format(bucketName)
import sys from awsglue.transforms import * from awsglue.utils import getResolvedOptions from pyspark.context import SparkContext from awsglue.context import GlueContext from awsglue.job import Job import datetime import boto3 args = getResolvedOptions(sys.argv, ['JOB_NAME']) sc = SparkContext() glueContext = GlueContext(sc) spark = glueContext.spark_session job = Job(glueContext) job.init(args['JOB_NAME'], args) datasource0 = glueContext.create_dynamic_frame_from_options( "s3", {'paths': ["s3://xxx-xx-logs/Glue/parquet_sample_dataset/"]}, format="parquet", transformation_ctx="datasource0") datasink3 = glueContext.write_dynamic_frame.from_options( frame=datasource0, connection_type="s3", connection_options={"path": "s3://xx-xx-logs/Glue/glue_bm_issue_11_12/"}, format="parquet", transformation_ctx="datasink3") job.commit()
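A note on the copy job above: the transformation_ctx strings are what job bookmarks key on, and job.commit() is what persists the bookmark state. A minimal sketch, assuming bookmarks are enabled on the Glue job; the path is a placeholder:

datasource = glueContext.create_dynamic_frame_from_options(
    's3', {'paths': ['s3://my-bucket/input/']},
    format='parquet',
    transformation_ctx='datasource')  # bookmark key for this source
# ... transforms and sinks ...
job.commit()  # records progress so the next run skips already-processed files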
def main():
    sc = SparkContext()
    glueContext = GlueContext(sc)
    spark = glueContext.spark_session
    spark.conf.set("spark.sql.session.timeZone", "GMT+07:00")

    score_pass = 50
    date_read_data = 20191005
    print('date_read_date: ', date_read_data)

    def doAddScoreAll(plus, minus):
        if plus is None:
            plus = 0
        if minus is None:
            minus = 0
        return plus + minus

    addScoreAll = udf(doAddScoreAll, IntegerType())

    def getPreviousDate(date_input):
        current_date = datetime.strptime(str(date_input), "%Y%m%d")
        previous_date = current_date - timedelta(1)
        previous_date_id = previous_date.strftime("%Y%m%d")
        return previous_date_id

    def addMoreScore(current_value, new_value):
        # renamed from addMoreSore (typo)
        if current_value >= score_pass:
            return score_pass
        if current_value is None:
            current_value = 0
        if new_value is None:
            new_value = 0
        a = current_value + new_value
        if a < 0:
            return 0
        if a >= score_pass:
            return score_pass
        return a

    addMoreScore = udf(addMoreScore, IntegerType())

    def getNewPassDate(current_pass_date, score_value_c, score_value_n):
        if current_pass_date != None:
            return current_pass_date
        if score_value_c is None:
            score_value_c = 0
        if score_value_n is None:
            score_value_n = 0
        if score_value_c + score_value_n >= score_pass:
            return date_read_data
        return None

    getNewPassDate = udf(getNewPassDate, IntegerType())

    # def getCurrentDate():
    #     return d4
    #
    # getCurrentDate = udf(getCurrentDate, IntegerType())

    def getModifyDate(modify_old, student_id_new):
        if student_id_new is not None:
            return date_read_data
        return modify_old

    getModifyDate = udf(getModifyDate, IntegerType())

    def getnewStudentId(student_id, student_id_new):
        if student_id is None:
            return student_id_new
        return student_id

    getnewStudentId = udf(getnewStudentId, LongType())

    def getnewStudentLearningObjectId(lo_id, lo_id_new):
        if lo_id is None:
            return lo_id_new
        return lo_id

    getnewStudentLearningObjectId = udf(getnewStudentLearningObjectId,
                                        LongType())

    def caculateScore(plus, minus):
        if plus is None:
            plus = 0
        if minus is None:
            minus = 0
        return plus + minus

    caculateScore = udf(caculateScore, LongType())

    def getModifiedDateId(student_id_new, learning_object_id_new,
                          modified_date_id_current):
        if student_id_new is not None and learning_object_id_new is not None:
            return long(date_read_data)
        return long(modified_date_id_current)

    udfGetModifiedDateId = udf(getModifiedDateId, LongType())

    def getCreatedDateId(student_id_new, learning_object_id_new,
                         created_date_id_current):
        if created_date_id_current is not None:
            return created_date_id_current
        if student_id_new is not None and learning_object_id_new is not None:
            return date_read_data
        return created_date_id_current

    getCreatedDateId = udf(getCreatedDateId, LongType())

    def getFirstLearningDate(student_id_new, learning_object_id_new,
                             created_date_id_current):
        if created_date_id_current is not None:
            return created_date_id_current
        if student_id_new is not None and learning_object_id_new is not None:
            return date_read_data
        return created_date_id_current

    getFirstLearningDate = udf(getFirstLearningDate, LongType())

    dyf_mapping_lo_student_history = glueContext.create_dynamic_frame.from_catalog(
        database="nvn_knowledge",
        table_name="mapping_lo_student_history",
        additional_options={
            "path": "s3://dts-odin/nvn_knowledge/mapping_lo_student_history/*/*"
        })

    # get the starting point for this read
    start_read = 0
    # try:
    #     # read the flag checkpoint from s3
    #     df_flag = spark.read.parquet("s3a://dts-odin/flag/nvn_knowledge/mapping_lo_student_end_read.parquet")
    #     start_read = df_flag.collect()[0]['flag']
    #     print('read start date from: ', start_read)
    # except:
    #     print('read flag file error ')
    #     start_read = None

    dyf_mapping_lo_student_history = Filter.apply(
        frame=dyf_mapping_lo_student_history,
        f=lambda x: x["student_id"] is not None and x["student_id"] != 0 and x[
            "learning_object_id"] is not None)

    # print('dyf_mapping_lo_student_history')
    # print(dyf_mapping_lo_student_history.count())
    # dyf_mapping_lo_student_history.show(3)
    # dyf_mapping_lo_student_history.printSchema()

    df_mapping_lo_student_history_cache = dyf_mapping_lo_student_history.toDF()
    # dropDuplicates returns a new DataFrame; the original discarded the result
    df_mapping_lo_student_history_cache = df_mapping_lo_student_history_cache.dropDuplicates([
        'student_id', 'learning_object_id', 'source_system', 'created_date_id'
    ])
    df_mapping_lo_student_history_cache.cache()

    df_group_source_system = df_mapping_lo_student_history_cache.groupby(
        'source_system').agg(f.max('created_date_id').alias('max_date'))
    max_allowing_date = df_group_source_system.agg({
        "max_date": "min"
    }).collect()[0][0]
    print('check date_read_data')
    print('max_allowing_date: ', max_allowing_date)
    print('date_read_data: ', date_read_data)
    if max_allowing_date <= date_read_data:
        print('stop in here::max_allowing_date <= date_read_data')
        return

    df_mapping_lo_student_history_cache = df_mapping_lo_student_history_cache.filter(
        df_mapping_lo_student_history_cache['created_date_id'] == date_read_data)
    print('df_mapping_lo_student_history_cache')
    df_mapping_lo_student_history_cache.printSchema()
    df_mapping_lo_student_history_cache.show(3)
    print('df_mapping_lo_student_history_cache::number: ',
          df_mapping_lo_student_history_cache.count())

    if df_mapping_lo_student_history_cache.count() > 0:
        df_mapping_lo_student_new = df_mapping_lo_student_history_cache \
            .groupby('student_id', 'learning_object_id').agg(
                addScoreAll(f.sum('knowledge_plus'), f.sum('knowledge_minus')).alias('knowledge_new'),
                addScoreAll(f.sum('comprehension_plus'), f.sum('comprehension_minus')).alias('comprehension_new'),
                addScoreAll(f.sum('application_plus'), f.sum('application_minus')).alias('application_new'),
                addScoreAll(f.sum('analysis_plus'), f.sum('analysis_minus')).alias('analysis_new'),
                addScoreAll(f.sum('synthesis_plus'), f.sum('synthesis_minus')).alias('synthesis_new'),
                addScoreAll(f.sum('evaluation_plus'), f.sum('evaluation_minus')).alias('evaluation_new'))
        # these renames must stay live: the join condition below references
        # student_id_new / learning_object_id_new
        df_mapping_lo_student_new = df_mapping_lo_student_new \
            .withColumnRenamed('student_id', 'student_id_new') \
            .withColumnRenamed('learning_object_id', 'learning_object_id_new')

        # dyf_df_mapping_lo_student_new = DynamicFrame.fromDF(df_mapping_lo_student_new, glueContext, 'dyf_df_mapping_lo_student_new')
        # print('dyf_df_mapping_lo_student_new')
        # dyf_df_mapping_lo_student_new.printSchema()
        # dyf_df_mapping_lo_student_new.show(3)
        # print('dyf_df_mapping_lo_student_new number: ', dyf_df_mapping_lo_student_new.count())

        # dyf_mapping_lo_student_current = glueContext.create_dynamic_frame.from_catalog(
        #     database="nvn_knowledge",
        #     table_name="mapping_lo_student"
        # )
        dyf_mapping_lo_student_current = glueContext.create_dynamic_frame.from_options(
            connection_type="redshift",
            connection_options={
                "url": "jdbc:redshift://datashine-dev.c4wxydftpsto.ap-southeast-1.redshift.amazonaws.com:5439/dts_odin",
                "user": "******",
                "password": "******",
                "dbtable": "temp_v1_mapping_lo_student",
                "redshiftTmpDir": "s3n://dts-odin/temp1/thanhtv3/temp_v1_mapping_lo_student/v8"
            })
        print('mapping_lo_student_current')
        dyf_mapping_lo_student_current.printSchema()
        dyf_mapping_lo_student_current.show(3)

        # Filter all
        dyf_mapping_lo_student_current = Filter.apply(
            frame=dyf_mapping_lo_student_current,
            f=lambda x: x["student_id"] is not None and x["student_id"] != 0
            # and x["knowledge_pass_date_id"] is None
            # and x["comprehension_pass_date_id"] is None
            # and x["application_pass_date_id"] is None
            # and x["analysis_pass_date_id"] is not None
            # and x["synthesis_pass_date_id"] is None
            # and x["evaluation_pass_date_id"] is None
        )

        # this conversion must stay live: the join and the final unpersist
        # both use dy_mapping_lo_student_current
        dy_mapping_lo_student_current = dyf_mapping_lo_student_current.toDF()
        # dy_mapping_lo_student_current = dy_mapping_lo_student_current.drop('user_id')
        dy_mapping_lo_student_current.cache()

        join_mapping = df_mapping_lo_student_new.join(
            dy_mapping_lo_student_current,
            (dy_mapping_lo_student_current['student_id'] == df_mapping_lo_student_new['student_id_new'])
            & (dy_mapping_lo_student_current['learning_object_id'] == df_mapping_lo_student_new['learning_object_id_new']),
            'left')

        print('join_new_knowledge_left')
        join_mapping.printSchema()

        join_mapping = join_mapping \
            .withColumn('knowledge_t', addMoreScore(join_mapping.knowledge, join_mapping.knowledge_new)) \
            .withColumn('comprehension_t', addMoreScore(join_mapping.comprehension, join_mapping.comprehension_new)) \
            .withColumn('application_t', addMoreScore(join_mapping.application, join_mapping.application_new)) \
            .withColumn('analysis_t', addMoreScore(join_mapping.analysis, join_mapping.analysis_new)) \
            .withColumn('synthesis_t', addMoreScore(join_mapping.synthesis, join_mapping.synthesis_new)) \
            .withColumn('evaluation_t', addMoreScore(join_mapping.evaluation, join_mapping.evaluation_new)) \
            .withColumn('student_id_t', getnewStudentId(join_mapping.student_id, join_mapping.student_id_new)) \
            .withColumn('learning_object_id_t', getnewStudentLearningObjectId(join_mapping.learning_object_id, join_mapping.learning_object_id_new)) \
            .withColumn('knowledge_pass_date_id', getNewPassDate(join_mapping.knowledge_pass_date_id, join_mapping.knowledge, join_mapping.knowledge_new)) \
            .withColumn('comprehension_pass_date_id', getNewPassDate(join_mapping.comprehension_pass_date_id, join_mapping.comprehension, join_mapping.comprehension_new)) \
            .withColumn('application_pass_date_id', getNewPassDate(join_mapping.application_pass_date_id, join_mapping.application, join_mapping.application_new)) \
            .withColumn('analysis_pass_date_id', getNewPassDate(join_mapping.analysis_pass_date_id, join_mapping.analysis, join_mapping.analysis_new)) \
            .withColumn('synthesis_pass_date_id', getNewPassDate(join_mapping.synthesis_pass_date_id, join_mapping.synthesis, join_mapping.synthesis_new)) \
            .withColumn('evaluation_pass_date_id', getNewPassDate(join_mapping.evaluation_pass_date_id, join_mapping.evaluation, join_mapping.evaluation_new)) \
            .withColumn('modified_date_id_t', udfGetModifiedDateId(join_mapping.student_id_new, join_mapping.learning_object_id_new, join_mapping.modified_date_id)) \
            .withColumn('created_date_id_t', getCreatedDateId(join_mapping.student_id_new, join_mapping.learning_object_id_new, join_mapping.created_date_id)) \
            .withColumn('first_learning_date_id_t', getFirstLearningDate(join_mapping.student_id_new, join_mapping.learning_object_id_new, join_mapping.first_learning_date_id))

        # join_mapping = join_mapping.drop('knowledge_new', 'comprehension_new', 'synthesis_new',
        #                                  'application_new', 'evaluation_new', 'analysis_new',
        #                                  'knowledge', 'comprehension', 'synthesis',
        #                                  'application', 'evaluation', 'analysis',
        #                                  'student_id_new', 'learning_object_id_new', 'created_date_id')
        # join_mapping.cache()

        join_mapping = join_mapping.select(
            'id', 'user_id', 'student_id_t', 'learning_object_id_t',
            'knowledge_t', 'knowledge_pass_date_id', 'comprehension_t',
            'comprehension_pass_date_id', 'application_t',
            'application_pass_date_id', 'analysis_t', 'analysis_pass_date_id',
            'synthesis_t', 'synthesis_pass_date_id', 'evaluation_t',
            'evaluation_pass_date_id', 'modified_date_id_t',
            'created_date_id_t', 'first_learning_date_id_t')
        print('join_mapping')
        join_mapping.printSchema()
        join_mapping.show(1)

        dyf_join_mapping = DynamicFrame.fromDF(join_mapping, glueContext,
                                               'dyf_join_mapping')
        dyf_join_mapping = dyf_join_mapping.resolveChoice(
            specs=[('user_id', 'cast:long')])
        apply_output = ApplyMapping.apply(
            frame=dyf_join_mapping,
            mappings=[
                ("user_id", "long", "user_id", "long"),
                ("student_id_t", "long", "student_id", "long"),
                ("learning_object_id_t", "long", "learning_object_id", "long"),
                ("knowledge_t", "int", "knowledge", "long"),
                ("comprehension_t", "int", "comprehension", "long"),
                ("application_t", "int", "application", "long"),
                ("analysis_t", "int", "analysis", "long"),
                ("synthesis_t", "int", "synthesis", "long"),
                ("evaluation_t", "int", "evaluation", "long"),
                ("knowledge_pass_date_id", "int", "knowledge_pass_date_id", "long"),
                ("comprehension_pass_date_id", "int", "comprehension_pass_date_id", "long"),
                ("application_pass_date_id", "int", "application_pass_date_id", "long"),
                ("analysis_pass_date_id", "int", "analysis_pass_date_id", "long"),
                ("synthesis_pass_date_id", "int", "synthesis_pass_date_id", "long"),
                ("evaluation_pass_date_id", "int", "evaluation_pass_date_id", "long"),
                ("modified_date_id_t", "long", "modified_date_id", "long"),
                ("created_date_id_t", "long", "created_date_id", "long"),
                ("first_learning_date_id_t", "long", "first_learning_date_id", "long")
            ])
        # this ResolveChoice step must stay live: datasink4 consumes dfy_output
        dfy_output = ResolveChoice.apply(frame=apply_output,
                                         choice="make_cols",
                                         transformation_ctx="resolvechoice2")
        datasink4 = glueContext.write_dynamic_frame.from_jdbc_conf(
            frame=dfy_output,
            catalog_connection="glue_redshift",
            connection_options={
                "dbtable": "temp_thanhtv3_dyf_join_mapping",
                "database": "dts_odin"
            },
            redshift_tmp_dir="s3n://dts-odin/temp/thanhtv3/nvn/knowledge/mapping_lo_student/v3",
            transformation_ctx="datasink4")

        # # save flag for next read
        # next_day = getPreviousDate(date_read_data)
        # flag_data = [next_day]
        # df = spark.createDataFrame(flag_data, "int").toDF('flag')
        # # overwrite the flag in s3
        # df.write.parquet("s3a://dts-odin/flag/nvn_knowledge/mapping_lo_student_end_read.parquet", mode="overwrite")

        # unpersist all cached frames
        df_mapping_lo_student_history_cache.unpersist()
        dy_mapping_lo_student_current.unpersist()
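A design note on the UDFs above: getnewStudentId and getnewStudentLearningObjectId just return the first non-null of two columns, which Spark's built-in coalesce does without the Python-UDF serialization cost. An equivalent alternative sketch, not the job's actual code:

import pyspark.sql.functions as f

join_mapping = join_mapping \
    .withColumn('student_id_t',
                f.coalesce(join_mapping.student_id, join_mapping.student_id_new)) \
    .withColumn('learning_object_id_t',
                f.coalesce(join_mapping.learning_object_id, join_mapping.learning_object_id_new))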
import sys
import logging
import urllib
import urllib.parse as urlparse
from pyspark.context import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.window import Window
from pyspark.sql.utils import AnalysisException
from awsglue.context import GlueContext, DynamicFrame
from awsglue.job import Job
from awsglue.utils import getResolvedOptions

logging.basicConfig()
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

sparkContext = SparkContext.getOrCreate()
glueContext = GlueContext(sparkContext)
spark = glueContext.spark_session
job = Job(glueContext)
args = getResolvedOptions(sys.argv, [
    'JOB_NAME', 'prefix', 'bucket', 'datalake_bucket', 'datalake_prefix',
    'id_prefix', 'controller_table_name', 'region', 'crawler_name'
])
job.init(args['JOB_NAME'], args)


class LoadIncremental():
    def __init__(self):
        # self.prefix = 'dms-rawdata/public/'
        # self.bucket = 'dms-rawdata'
        # self.datalake_bucket='marketboomer-datalake-table'
def main(): to_s3 = 'to-s3' to_jdbc = 'to-jdbc' parser = argparse.ArgumentParser(prog=sys.argv[0]) parser.add_argument( '-m', '--mode', required=True, choices=[to_s3, to_jdbc], help='Choose to migrate from datacatalog to s3 or to metastore') parser.add_argument( '--database-names', required=True, help= 'Semicolon-separated list of names of database in Datacatalog to export' ) parser.add_argument('-o', '--output-path', required=False, help='Output path, either local directory or S3 path') parser.add_argument( '-c', '--connection-name', required=False, help='Glue Connection name for Hive metastore JDBC connection') parser.add_argument( '-R', '--region', required=False, help='AWS region of source Glue DataCatalog, default to "us-east-1"') options = get_options(parser, sys.argv) if options['mode'] == to_s3: validate_options_in_mode(options=options, mode=to_s3, required_options=['output_path'], not_allowed_options=['connection_name']) elif options['mode'] == to_jdbc: validate_options_in_mode(options=options, mode=to_jdbc, required_options=['connection_name'], not_allowed_options=['output_path']) else: raise AssertionError('unknown mode ' + options['mode']) validate_aws_regions(options['region']) # spark env (conf, sc, sql_context) = get_spark_env() glue_context = GlueContext(sc) # extract from datacatalog reader database_arr = options['database_names'].split(';') (databases, tables, partitions) = read_databases_from_catalog(sql_context=sql_context, glue_context=glue_context, datacatalog_name='datacatalog', database_arr=database_arr, region=options.get('region') or 'us-east-1') if options['mode'] == to_s3: output_path = get_output_dir(options['output_path']) datacatalog_migrate_to_s3(databases=databases, tables=tables, partitions=partitions, output_path=output_path) elif options['mode'] == to_jdbc: connection_name = options['connection_name'] datacatalog_migrate_to_hive_metastore( sc=sc, sql_context=sql_context, databases=databases, tables=tables, partitions=partitions, connection=glue_context.extract_jdbc_conf(connection_name))
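For reference, hypothetical invocations of this export entry point; all argument values below are placeholders:

#   export the Data Catalog to S3:
#     --mode to-s3 --database-names 'db1;db2' --output-path s3://my-bucket/export/
#
#   export the Data Catalog to a Hive metastore over JDBC:
#     --mode to-jdbc --database-names 'db1;db2' --connection-name my-hive-connection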
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job

## @params: [TempDir, JOB_NAME]
args = getResolvedOptions(sys.argv, ['TempDir', 'JOB_NAME', 'db_name', 'redshift_connection', 'cis_bucket', 'geo_bucket', 'region'])

sc = SparkContext()
sc._jsc.hadoopConfiguration().set("fs.s3a.endpoint", "s3."+args['region']+".amazonaws.com.cn")
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)

"""
Copy CIS Demo Data to Redshift
"""
cis_datasource = glueContext.create_dynamic_frame_from_options("s3", \
    {"paths": ["s3://{}".format(args['cis_bucket'])],"recurse": True}, \
    format="csv", \
    format_options={ \
        "withHeader": False, \
        "separator": ";" \
    })

# headerless CSV columns are read as strings, so the source type in each mapping is "string"
cis_mapping = ApplyMapping.apply(frame = cis_datasource, mappings = [("col0", "string", "customer_id", "string"), \
    ("col1", "string", "name", "string"), \
    ("col2", "string", "zip", "long"), \
regexp_extract(df['filename'], r".+analytics-.+(202[01]-..-..-..).*parquet", 1)) df = df.repartition("submitteddatehour") dyf = DynamicFrame.fromDF(df, glueContext, "submitteddatehour-extracted") return (DynamicFrameCollection({"CustomTransform0": dyf}, glueContext)) ## @params: [JOB_NAME, SOURCE_BUCKET_URI, DESTINATION_BUCKET_URI] args = getResolvedOptions( sys.argv, ['JOB_NAME', 'SOURCE_BUCKET_URI', 'DESTINATION_BUCKET_URI']) sc = SparkContext() glueContext = GlueContext(sc) spark = glueContext.spark_session glueContext.sql("set spark.sql.parquet.mergeSchema=true") job = Job(glueContext) job.init(args['JOB_NAME'], args) ## @type: DataSource ## @args: [connection_type = "s3", format = "parquet", connection_options = {"paths": ["s3://te-load-test-analytics-submission-parquet/"], "recurse":True}, transformation_ctx = "DataSource0"] ## @return: DataSource0 ## @inputs: [] DataSource0 = glueContext.create_dynamic_frame.from_options( connection_type="s3", format="parquet", connection_options={ "paths": [f"{args['SOURCE_BUCKET_URI']}/"], "recurse": True },
def main():
    glueContext = GlueContext(SparkContext.getOrCreate())
    spark = glueContext.spark_session

    # only process quiz attempts from 01/10/2019 onwards
    timestamp = 1569888000

    # ETL TBHV
    # Custom function
    def doSplitWord(word):
        size = len(word)
        rs = [word[i:i + 2] for i in range(0, size, 1)]
        rs1 = [word[i:i + 1] for i in range(0, size, 1)]
        rs.extend(rs1)
        return rs

    state_right = 'state_right'
    state_wrong = 'state_wrong'

    # knowledge points are awarded by default
    # P1_D1; P1_D2; P1_D3; P2_D1; P2_D2; P2_D3; P3_D1; P3_D2; P4_D1; P4_D2
    knowledge = ''
    # comprehension points:
    # list of question names that earn comprehension points:
    # P1_D1; P1_D2; P1_D3; P2_D1; P2_D2; P2_D3; P3_D2; P4_D1; P4_D2
    comprehension = [
        'P1_D1', 'P1_D2', 'P1_D3', 'P2_D1', 'P2_D2', 'P2_D3', 'P3_D2',
        'P4_D1', 'P4_D2'
    ]
    # application points:
    # list of question names that earn application points:
    # P1_D3; P2_D1; P2_D2; P2_D3; P3_D2; P4_D1; P4_D2
    application = [
        'P1_D3', 'P2_D1', 'P2_D2', 'P2_D3', 'P3_D2', 'P4_D1', 'P4_D2'
    ]
    # analysis points:
    # list of question names that earn analysis points:
    # P2_D3; P3_D2; P4_D1; P4_D2
    analysis = ['P2_D3', 'P3_D2', 'P4_D1', 'P4_D2']
    # synthesis points:
    # list of question names that earn synthesis points:
    # P4_D1; P4_D2
    synthesis = ['P4_D1', 'P4_D2']
    # evaluation points:
    # list of question names that earn evaluation points:
    evaluation = ''

    def doAddScore(name, state, score_type):
        arr = ['']
        score = 0
        if score_type == 'comprehension':
            arr = comprehension
        if score_type == 'application':
            arr = application
        if score_type == 'analysis':
            arr = analysis
        if score_type == 'synthesis':
            arr = synthesis
        if state == state_right:
            score = 10
        if state == state_wrong:
            score = -5
        if name is not None:
            # lowercase only after the None check so a null name cannot raise
            name = name.lower()
            for x in arr:
                if x.lower() in name:
                    return score
        return 0

    addScore = udf(doAddScore, IntegerType())

    def doAddScoreAll(plus, minus):
        if plus is None and minus is not None:
            return minus
        if minus is None and plus is not None:
            return plus
        if minus is not None and plus is not None:
            return plus + minus
        return 0

    addScoreAll = udf(doAddScoreAll, IntegerType())

    def do_check_null(val1, val2):
        if val1 is None and val2 is not None:
            return val2
        if val2 is None and val1 is not None:
            return val1
        if val1 is not None and val2 is not None:
            return val1
        return 0

    check_data_null = udf(do_check_null, StringType())

    # special characters to strip
    special_str = '["] ;'

    splitWord = udf(lambda x: doSplitWord(x))

    ########## top_quiz_attempts
    dyf_top_quiz_attempts = glueContext.create_dynamic_frame.from_catalog(
        database="moodle", table_name="top_quiz_attempts")
    dyf_top_quiz_attempts = dyf_top_quiz_attempts.select_fields(
        ['_key', 'id', 'timestart', 'quiz'])

    dyf_top_quiz_attempts = dyf_top_quiz_attempts.resolveChoice(
        specs=[('_key', 'cast:long')])

    print(dyf_top_quiz_attempts.count())
    dyf_top_quiz_attempts.show(2)

    # try:
    #     # read the flag watermark from S3
    #     df_flag = spark.read.parquet("s3a://dtsodin/flag/flag_tu_vung_result_ai.parquet")
    #     start_read = df_flag.collect()[0]['flag']
    #     print('read from index: ', start_read)
    #
    #     # compare the datasource _key with the flag and keep only rows with key > flag
    #     dyf_top_quiz_attempts = Filter.apply(frame=dyf_top_quiz_attempts, f=lambda x: x['_key'] > start_read)
    # except:
    #     print('read flag file error ')

    dyf_top_quiz_attempts = Filter.apply(
        frame=dyf_top_quiz_attempts, f=lambda x: x["timestart"] >= timestamp)
    print(dyf_top_quiz_attempts.count())
    dyf_top_quiz_attempts.show()

    if dyf_top_quiz_attempts.count() > 0:
        ########## dyf_top_user
        dyf_top_user = glueContext.create_dynamic_frame.from_catalog(
            database="moodle", table_name="do_top_user")
        dyf_top_user = dyf_top_user.select_fields(
            ['id', 'student_id']).rename_field('id', 'top_user_id')

        ######### top_question
        dyf_top_question = glueContext.create_dynamic_frame.from_catalog(
            database="moodle", table_name="top_question")
        dyf_top_question = dyf_top_question.select_fields(
            ['id', 'name']).rename_field('id', 'quest_id')

        # dyf_top_result_ai = dyf_top_result_ai.resolveChoice(specs=[('_key', 'cast:long')])

        ######### top_result_ai
        dyf_top_result_ai = glueContext.create_dynamic_frame.from_catalog(
            database="moodle", table_name="top_result_ai")
        dyf_top_result_ai = dyf_top_result_ai.select_fields([
            'question_id', 'attempt_id', 'user_id', 'ratio', 'right_word',
            'wrong_word'
        ])

        # JOIN and FILTER the tables on the conditions below
        dyf_join01 = Join.apply(dyf_top_result_ai, dyf_top_question,
                                'question_id', 'quest_id')
        dyf_join02 = Join.apply(dyf_join01, dyf_top_quiz_attempts,
                                'attempt_id', 'id')
        dyf_join02 = Filter.apply(frame=dyf_join02,
                                  f=lambda x: x["quiz"] not in [7, 9, 918])
        dyf_join02 = Join.apply(dyf_join02, dyf_top_user, 'user_id',
                                'top_user_id')
        # dyf_join02.show()
        df_study = dyf_join02.toDF()
        df_study.cache()
        if (df_study.count() > 0):
            try:
                # print("COUNT 1:", df_study.count())
                # Strip the special characters [ ] ",
                # The data currently looks like: ["house","her","to","how","get","long"] or "environmental", ...
                # df_study = df_study.select(
                #     'quiz', 'name', 'user_id', 'timestart', 'right_word', 'wrong_word',
                #     f.translate(df_study.right_word, special_str, ''),
                #     f.translate(df_study.wrong_word, special_str, ''))
                df_study = df_study.select('quiz', 'name', 'student_id',
                                           'timestart', 'right_word',
                                           'wrong_word')
                df_study = df_study.withColumn("right_word_new", f.translate(df_study.right_word, special_str, '')) \
                    .withColumn("wrong_word_new", f.translate(df_study.wrong_word, special_str, ''))

                # Split the sentence into an array of words:
                # house, her => [house, her]
                # Process the correct words
                df_study_right = df_study.withColumn(
                    "right_word_list", f.split(df_study.right_word_new, ','))

                # Explode the array column => one row per word
                # row: [house, her] =>
                # row1: house
                # row2: her
                df_study_right = df_study_right.withColumn(
                    "right", f.explode(df_study_right.right_word_list))
                df_study_right = df_study_right.select(
                    'quiz', 'name', 'student_id', 'timestart', 'right')
                # convert to lowercase
                df_study_right = df_study_right.withColumn(
                    "right", f.lower(f.col("right")))
                # print("COUNT 2:", df_study_right.count())
                # df_study_right.printSchema()
                # df_study_right.show()

                dyf_study_right = DynamicFrame.fromDF(df_study_right,
                                                      glueContext,
                                                      "dyf_study_right")

                ## Learning Object
                dyf_learning_object = glueContext.create_dynamic_frame.from_catalog(
                    database="nvn_knowledge", table_name="learning_object")
                dyf_learning_object = dyf_learning_object.select_fields(
                    ['learning_object_id', 'learning_object_name'])
                df_learning_object = dyf_learning_object.toDF()
                # convert to lowercase
                df_learning_object = df_learning_object.withColumn(
                    "learning_object_name",
                    f.lower(f.col("learning_object_name")))
                dyf_learning_object = DynamicFrame.fromDF(
                    df_learning_object, glueContext, "dyf_learning_object")

                dyf_knowledge_right = Join.apply(dyf_study_right,
                                                 dyf_learning_object, 'right',
                                                 'learning_object_name')

                # print("COUNT 3:", dyf_knowledge_right.count())
                # dyf_knowledge_right.printSchema()

                # Add points for the correct words
                df_knowledge_right = dyf_knowledge_right.toDF()
                df_knowledge_right.cache()
                df_knowledge_right = df_knowledge_right.withColumn("knowledge", f.lit(10)) \
                    .withColumn("comprehension",
                                addScore(df_knowledge_right.name, f.lit('state_right'), f.lit('comprehension'))) \
                    .withColumn("application",
                                addScore(df_knowledge_right.name, f.lit('state_right'), f.lit('application'))) \
                    .withColumn("analysis",
                                addScore(df_knowledge_right.name, f.lit('state_right'), f.lit('analysis'))) \
                    .withColumn("synthesis",
                                addScore(df_knowledge_right.name, f.lit('state_right'), f.lit('synthesis'))) \
                    .withColumn("evaluation", f.lit(0)) \
                    .withColumn("date_id", from_unixtime(df_knowledge_right['timestart'], 'yyyyMMdd'))

                df_knowledge_right = df_knowledge_right.groupby(
                    'student_id', 'date_id', 'learning_object_id').agg(
                        f.count('knowledge').alias("count_plus"),
                        f.sum('knowledge').alias("knowledge_plus"),
                        f.sum('comprehension').alias("comprehension_plus"),
                        f.sum('application').alias("application_plus"),
                        f.sum('analysis').alias("analysis_plus"),
                        f.sum('synthesis').alias("synthesis_plus"),
                        f.sum('evaluation').alias("evaluation_plus"))
                df_knowledge_right = df_knowledge_right.where(
                    'student_id is not null')
                # df_knowledge_right.printSchema()
                # df_knowledge_right.show()

                # dyf_knowledge_right = DynamicFrame.fromDF(df_knowledge_right, glueContext, "dyf_knowledge_right")
                #
                # applymapping = ApplyMapping.apply(frame=dyf_knowledge_right,
                #                                   mappings=[("timestart", "long", "timestart", "long"),
                #                                             ("student_id", 'int', 'student_id', 'long'),
                #                                             ("learning_object_id", "int", "learning_object_id", "int"),
                #                                             ("date_id", "string", "date_id", "int"),
                #                                             ("knowledge", "int", "knowledge", "int"),
                #                                             ("comprehension", "int", "comprehension", "int"),
                #                                             ("application", "int", "application", "int"),
                #                                             ("analysis", "int", "analysis", "int"),
                #                                             ("synthesis", "int", "synthesis", "int"),
                #                                             ("evaluation", "int", "evaluation", "int")])
                # resolvechoice = ResolveChoice.apply(frame=applymapping, choice="make_cols",
                #                                     transformation_ctx="resolvechoice2")
                # dropnullfields = DropNullFields.apply(frame=resolvechoice, transformation_ctx="dropnullfields")
                #
                # datasink5 = glueContext.write_dynamic_frame.from_jdbc_conf(frame=dropnullfields,
                #                                                            catalog_connection="glue_redshift",
                #                                                            connection_options={
                #                                                                "dbtable": "temp_right_wrong_learning_object",
                #                                                                "database": "dts_odin"
                #                                                            },
                #                                                            redshift_tmp_dir="s3n://dts-odin/temp1/",
                #                                                            transformation_ctx="datasink5")
                # END adding points for correct words

                #################################################
                # Subtract points for the wrong words: handled the same way as the correct words.
                # The penalty rule is -5 points per wrong word
                df_study_wrong = df_study.withColumn(
                    "wrong_word_list", f.split(df_study.wrong_word_new, ','))

                # Explode the array column => one row per word
                # row: [house, her] =>
                # row1: house
                # row2: her
                df_study_wrong = df_study_wrong.withColumn(
                    "wrong", f.explode(df_study_wrong.wrong_word_list))
                # convert to lowercase
                df_study_wrong = df_study_wrong.withColumn(
                    "wrong", f.lower(f.col("wrong")))
                df_study_wrong = df_study_wrong.select(
                    'quiz', 'name', 'student_id', 'timestart', 'wrong')
                # print("COUNT 2:", df_study_wrong.count())
                # df_study_wrong.printSchema()
                # df_study_wrong.show()

                dyf_study_wrong = DynamicFrame.fromDF(df_study_wrong,
                                                      glueContext,
                                                      "dyf_study_wrong")

                ## Learning Object
                dyf_knowledge_wrong = Join.apply(dyf_study_wrong,
                                                 dyf_learning_object, 'wrong',
                                                 'learning_object_name')
                # print("COUNT 3:", dyf_knowledge_wrong.count())
                # print("COUNT 4:", dyf_knowledge_wrong.count())
                # dyf_knowledge_wrong.printSchema()

                # Subtract points for the wrong words
                df_knowledge_wrong = dyf_knowledge_wrong.toDF()
                df_knowledge_wrong.cache()
                df_knowledge_wrong = df_knowledge_wrong.withColumn("knowledge", f.lit(-5)) \
                    .withColumn("comprehension",
                                addScore(df_knowledge_wrong.name, f.lit('state_wrong'), f.lit('comprehension'))) \
                    .withColumn("application",
                                addScore(df_knowledge_wrong.name, f.lit('state_wrong'), f.lit('application'))) \
                    .withColumn("analysis",
                                addScore(df_knowledge_wrong.name, f.lit('state_wrong'), f.lit('analysis'))) \
                    .withColumn("synthesis",
                                addScore(df_knowledge_wrong.name, f.lit('state_wrong'), f.lit('synthesis'))) \
                    .withColumn("evaluation", f.lit(0)) \
                    .withColumn("date_id", from_unixtime(df_knowledge_wrong['timestart'], 'yyyyMMdd'))

                df_knowledge_wrong = df_knowledge_wrong.groupby('student_id', 'date_id', 'learning_object_id').agg(
                        f.count('knowledge').alias("count_minus"),
                        f.sum('knowledge').alias("knowledge_minus"),
                        f.sum('comprehension').alias("comprehension_minus"),
                        f.sum('application').alias("application_minus"),
                        f.sum('analysis').alias("analysis_minus"),
                        f.sum('synthesis').alias("synthesis_minus"),
                        f.sum('evaluation').alias("evaluation_minus")) \
                    .withColumnRenamed('student_id', 'student_id_wrong') \
                    .withColumnRenamed('date_id', 'date_id_wrong') \
                    .withColumnRenamed('learning_object_id', 'learning_object_id_wrong')
                df_knowledge_wrong = df_knowledge_wrong.where(
                    'student_id_wrong is not null')

                # df_study_all = df_study.select('student_id').withColumnRenamed('student_id', 'student_id_all')
                # df_knowledge_right.printSchema()
                # df_knowledge_right.show()

                df_knowledge = df_knowledge_right.join(
                    df_knowledge_wrong,
                    (df_knowledge_right['student_id'] == df_knowledge_wrong['student_id_wrong'])
                    & (df_knowledge_right['date_id'] == df_knowledge_wrong['date_id_wrong'])
                    & (df_knowledge_right['learning_object_id'] == df_knowledge_wrong['learning_object_id_wrong']),
                    'outer')

                df_knowledge = df_knowledge.withColumn("user_id", check_data_null(df_knowledge.student_id, df_knowledge.student_id_wrong)) \
                    .withColumn("learning_object_id",
                                check_data_null(df_knowledge.learning_object_id, df_knowledge.learning_object_id_wrong)) \
                    .withColumn("created_date_id", check_data_null(df_knowledge.date_id, df_knowledge.date_id_wrong)) \
                    .withColumn("source_system", f.lit('top_result_ai')) \
                    .withColumn("lu_id", f.lit(0))

                dyf_knowledge = DynamicFrame.fromDF(df_knowledge, glueContext,
                                                    "df_knowledge")

                applymapping2 = ApplyMapping.apply(
                    frame=dyf_knowledge,
                    mappings=[
                        ("user_id", 'string', 'student_id', 'long'),
                        ("learning_object_id", "string", "learning_object_id", "long"),
                        # ("knowledge", "int", "knowledge", "long"),
                        # ("comprehension", "int", "comprehension", "long"),
                        # ("application", "int", "application", "long"),
                        # ("analysis", "int", "analysis", "long"),
                        # ("synthesis", "int", "synthesis", "long"),
                        # ("evaluation", "int", "evaluation", "long"),
                        ("knowledge_plus", "long", "knowledge_plus", "long"),
                        ("comprehension_plus", "long", "comprehension_plus", "long"),
                        ("application_plus", "long", "application_plus", "long"),
                        ("analysis_plus", "long", "analysis_plus", "long"),
                        ("synthesis_plus", "long", "synthesis_plus", "long"),
                        ("evaluation_plus", "long", "evaluation_plus", "long"),
                        ("knowledge_minus", "long", "knowledge_minus", "long"),
                        ("comprehension_minus", "long", "comprehension_minus", "long"),
                        ("application_minus", "long", "application_minus", "long"),
                        ("analysis_minus", "long", "analysis_minus", "long"),
                        ("synthesis_minus", "long", "synthesis_minus", "long"),
                        ("evaluation_minus", "long", "evaluation_minus", "long"),
                        ("count_plus", "long", "plus_number", "long"),
                        ("count_minus", "long", "minus_number", "long"),
                        # ("lo_type", "string", "lo_type", "long"),
                        ("source_system", "string", "source_system", "string"),
                        ("created_date_id", "string", "created_date_id", "long"),
                        ("lu_id", "int", "lu_type", "long")
                        # ("student_level", "string", "student_level", "string"),
                        # ("advisor_id", "string", "advisor_id", "long"),
                        # ("package_code", "string", "package_code", "string")
                    ])
                applymapping2.printSchema()
                applymapping2.show(20)

                resolvechoice2 = ResolveChoice.apply(
                    frame=applymapping2,
                    choice="make_cols",
                    transformation_ctx="resolvechoice3")
                dropnullfields2 = DropNullFields.apply(
                    frame=resolvechoice2, transformation_ctx="dropnullfields2")

                print('COUNT df_knowledge: ', dropnullfields2.count())
                dropnullfields2.printSchema()
                dropnullfields2.show(2)

                print('START WRITE TO S3-------------------------')
                datasink6 = glueContext.write_dynamic_frame.from_options(
                    frame=dropnullfields2,
                    connection_type="s3",
                    connection_options={
                        "path": "s3://dtsodin/nvn_knowledge/mapping_lo_student_history_v2/",
                        "partitionKeys": ["created_date_id", "source_system"]
                    },
                    format="parquet",
                    transformation_ctx="datasink6")
                print('END WRITE TO S3-------------------------')

                # datasink5 = glueContext.write_dynamic_frame.from_jdbc_conf(frame=dropnullfields2,
                #                                                            catalog_connection="glue_redshift",
                #                                                            connection_options={
                #                                                                "dbtable": "mapping_lo_student_history",
                #                                                                "database": "dts_odin"
                #                                                            },
                #                                                            redshift_tmp_dir="s3n://dts-odin/temp1/top_result_ai/",
                #                                                            transformation_ctx="datasink5")
                # END subtracting points for wrong words

                # clear the cached DataFrames
                df_study.unpersist()
                df_knowledge_right.unpersist()
                df_knowledge_wrong.unpersist()
                # df_knowledge_right.unpersist()
            except Exception as e:
                print("###################### Exception ##########################")
                print(e)

        # write the flag
        # take the max _key in the data source
        mdl_dyf_top_quiz_attempts = dyf_top_quiz_attempts.toDF()
        flag = mdl_dyf_top_quiz_attempts.agg({"_key": "max"}).collect()[0][0]
        flag_data = [flag]
        df = spark.createDataFrame(flag_data, "long").toDF('flag')
        # overwrite the _key flag on S3
        df.write.parquet(
            "s3a://dtsodin/flag/flag_tu_vung_result_ai.parquet",
            mode="overwrite")
def main():
    glueContext = GlueContext(SparkContext.getOrCreate())
    spark = glueContext.spark_session

    student_id_unavailable = '0'
    package_endtime_unavailable = 99999999999
    package_starttime_unavailable = 0
    student_level_code_unavailable = 'UNAVAILABLE'
    student_status_code_unavailable = 'UNAVAILABLE'

    package_endtime = 'package_endtime'
    package_starttime = 'package_starttime'
    student_level_code = 'student_level_code'
    student_status_code = 'student_status_code'

    CANCELLED = 'CANCELLED'

    dyf_tpe_enduser_used_product_history = glueContext.create_dynamic_frame.from_catalog(
        database="tig_market", table_name="tpe_enduser_used_product_history")
    dyf_tpe_enduser_used_product_history = dyf_tpe_enduser_used_product_history.select_fields([
        '_key', 'contact_id', 'used_product_id', 'status_old', 'status_new',
        'status_description', 'timecreated'
    ])
    # .rename_field('contact_id', 'contactid')

    dyf_tpe_enduser_used_product_history = dyf_tpe_enduser_used_product_history.resolveChoice(
        specs=[('_key', 'cast:long')])

    # try:
    #     df_flag = spark.read.parquet("s3://dtsodin/flag/flag_trang_thai_tai_khoan_cancelled.parquet")
    #     max_key = df_flag.collect()[0]['flag']
    #     print("max_key: ", max_key)
    #     # Only read records with a key greater than the stored max_key instead of a full load
    #     dyf_tpe_enduser_used_product_history = Filter.apply(frame=dyf_tpe_enduser_used_product_history,
    #                                                         f=lambda x: x["_key"] > max_key)
    # except:
    #     print('read flag file error ')

    print(dyf_tpe_enduser_used_product_history.count())
    if dyf_tpe_enduser_used_product_history.count() > 0:
        try:
            dyf_tpe_invoice_product_details = glueContext.create_dynamic_frame.from_catalog(
                database="tig_market", table_name="tpe_invoice_product_details")
            dyf_tpe_invoice_product_details = dyf_tpe_invoice_product_details.select_fields(
                ['id', 'cat_code'])

            dyf_student_contact = glueContext.create_dynamic_frame.from_catalog(
                database="tig_advisor", table_name="student_contact")
            dyf_student_contact = dyf_student_contact.select_fields(
                ['contact_id', 'student_id']).rename_field('contact_id', 'contactid')

            ##################### Join and Filter data
            df_tpe_enduser_used_product_history = dyf_tpe_enduser_used_product_history.toDF()
            df_tpe_used_product_history_step1 = df_tpe_enduser_used_product_history \
                .groupby('contact_id', 'used_product_id') \
                .agg(f.max("timecreated").alias("max_timecreated")) \
                .withColumnRenamed("contact_id", "contact_id_temp")
            print(df_tpe_used_product_history_step1.count())
            df_tpe_used_product_history_step1.show(20)

            df_tpe_used_product_history_step2 = df_tpe_used_product_history_step1.groupby(
                'contact_id_temp').agg(
                    f.max("max_timecreated").alias("max_timecreated"),
                    f.count("used_product_id").alias("count_used_product_id"))
            print(df_tpe_used_product_history_step2.count())
            df_tpe_used_product_history_step2.show(20)
            print("EEEEEEEEEEEEEEEEEEEEEEEEE")

            dyf_tpe_used_product_history = DynamicFrame.fromDF(
                df_tpe_used_product_history_step2, glueContext,
                "dyf_tpe_used_product_history")

            dyf_part_one = Filter.apply(
                frame=dyf_tpe_used_product_history,
                f=lambda x: x["count_used_product_id"] <= 1)
            print(dyf_part_one.count())
            # dyf_part_two = Filter.apply(frame=df_tpe_enduser_used_product_history,
            #                             f=lambda x: x["used_product_id"] > 1)

            df_part_one = dyf_part_one.toDF()
            df_part_one = df_part_one.join(
                df_tpe_enduser_used_product_history,
                (df_part_one.contact_id_temp == df_tpe_enduser_used_product_history.contact_id)
                & (df_part_one.max_timecreated == df_tpe_enduser_used_product_history.timecreated))

            dyf_part_one = DynamicFrame.fromDF(df_part_one, glueContext,
                                               "dyf_part_one")
            dyf_part_one = dyf_part_one.select_fields([
                'contact_id', 'used_product_id', 'status_old', 'status_new',
                'status_description', 'timecreated'
            ])

            dyf_join_part_one_product_details = Join.apply(
                dyf_part_one, dyf_tpe_invoice_product_details,
                'used_product_id', 'id')
            dyf_join_part_one_product_details.printSchema()
            print("total 01: ", dyf_join_part_one_product_details.count())
            dyf_join_part_one_product_details.toDF().show(2)

            dyf_join_part_one_contact = Join.apply(
                dyf_join_part_one_product_details, dyf_student_contact,
                'contact_id', 'contactid')
            dyf_join_part_one_contact = dyf_join_part_one_contact \
                .select_fields(['contact_id', 'student_id', 'status_new', 'status_description', 'timecreated'])
            dyf_join_part_one_contact.printSchema()
            print("total 02: ", dyf_join_part_one_contact.count())
            dyf_join_part_one_contact.toDF().show(2)

            # df_join_part_one = dyf_join_part_one_contact.toDF()
            ######################################

            ######## START cancelled
            dyf_join_cancelled_status = Filter.apply(
                frame=dyf_join_part_one_contact,
                f=lambda x: x["status_new"] == CANCELLED)
            print("dyf_join_cancelled_status ", dyf_join_cancelled_status.count())
            dyf_join_cancelled_status.toDF().show(2)

            df_join_cancelled_status = dyf_join_cancelled_status.toDF()
            df_join_cancelled_status = df_join_cancelled_status \
                .withColumn("change_status_date_id",
                            from_unixtime(df_join_cancelled_status.timecreated, 'yyyyMMdd').cast("long")) \
                .withColumn("from_status_id", f.lit(None).cast("long")) \
                .withColumn("to_status_id", f.lit(208).cast("long")) \
                .withColumn("measure1", f.lit(None).cast("long")) \
                .withColumn("measure2", f.lit(None).cast("long")) \
                .withColumn("description", df_join_cancelled_status.status_description) \
                .withColumn("timestamp1", f.lit(None).cast("long"))
            df_join_cancelled_status.show(3)

            dyf_join_cancelled_status = DynamicFrame.fromDF(
                df_join_cancelled_status, glueContext, "dyf_join_cancelled_status")
            dyf_join_cancelled_status = dyf_join_cancelled_status \
                .select_fields(['contact_id', 'student_id', 'change_status_date_id', 'from_status_id',
                                'to_status_id', 'measure1', 'measure2', 'description', 'timestamp1'])
            dyf_join_cancelled_status.printSchema()
            df_join_cancelled_status = dyf_join_cancelled_status.toDF()
            ####### END

            # df_join = df_join.withColumn("to_status_id", f.lit(204).cast("long"))
            df_join_cancelled_status = df_join_cancelled_status.withColumn(
                "user_id", f.lit(None).cast("long"))
            dyf_join_status = DynamicFrame.fromDF(df_join_cancelled_status,
                                                  glueContext, "dyf_join_status")

            applymapping1 = ApplyMapping.apply(
                frame=dyf_join_status,
                mappings=[("student_id", "string", "student_id", "long"),
                          ("user_id", "long", "user_id", "long"),
                          ("change_status_date_id", "long", "change_status_date_id", "long"),
                          ("from_status_id", "long", "from_status_id", "long"),
                          ("to_status_id", "long", "to_status_id", "long"),
                          ("measure1", "long", "measure1", "double"),
                          ("measure2", "long", "measure2", "double"),
                          ("description", "string", "description", "string"),
                          ("timestamp1", "long", "timestamp1", "long"),
                          ("contact_id", "string", "contact_id", "string")])
            resolvechoice1 = ResolveChoice.apply(
                frame=applymapping1,
                choice="make_cols",
                transformation_ctx="resolvechoice1")
            dropnullfields1 = DropNullFields.apply(
                frame=resolvechoice1, transformation_ctx="dropnullfields1")
            print(resolvechoice1.count())
            resolvechoice1.printSchema()
            resolvechoice1.show(5)

            print('START WRITE TO REDSHIFT -------------------------')
            datasink1 = glueContext.write_dynamic_frame.from_jdbc_conf(
                frame=dropnullfields1,
                catalog_connection="glue_redshift",
                connection_options={
                    "dbtable": "mapping_changed_status_student",
                    "database": "dts_odin"
                },
                redshift_tmp_dir="s3a://dtsodin/temp/mapping_changed_status_student/",
                transformation_ctx="datasink1")
            print('START WRITE TO S3-------------------------')
            # datasink6 = glueContext.write_dynamic_frame.from_options(frame=dropnullfields1, connection_type="s3",
            #                                                          connection_options={
            #                                                              "path": "s3://dtsodin/student_behavior/student_behavior/",
            #                                                              "partitionKeys": ["behavior_id"]},
            #                                                          format="parquet",
            #                                                          transformation_ctx="datasink6")
            print('END WRITE TO S3-------------------------')

            df_temp = dyf_tpe_enduser_used_product_history.toDF()
            flag = df_temp.agg({"_key": "max"}).collect()[0][0]
            flag_data = [flag]
            df = spark.createDataFrame(flag_data, "long").toDF('flag')
            # overwrite the _key flag on S3
            df.write.parquet(
                "s3a://dtsodin/flag/flag_trang_thai_tai_khoan_cancelled.parquet",
                mode="overwrite")
        except Exception as e:
            print("Something went wrong: ", e)
import sys from awsglue.transforms import * from awsglue.utils import getResolvedOptions from pyspark.context import SparkContext from awsglue.context import GlueContext from pyspark.sql import SQLContext from awsglue.job import Job from awsglue.dynamicframe import DynamicFrame from pyspark.sql.functions import lit from pyspark.sql.types import DateType from datetime import datetime import os # Create a Glue context glueContext = GlueContext(SparkContext.getOrCreate()) spark = glueContext.spark_session sqlContext = SQLContext(spark) # mode = "zeppelin" mode = "glue" args = getResolvedOptions(sys.argv,['JOB_NAME','env_prefix', 'table']) jobName = args['JOB_NAME'] envPrefix = args['env_prefix'] table = args['table'] print ("Job: {}".format(jobName)) print ("Environment: {}".format(envPrefix)) print ("Table: {}".format(table)) try: sql = """WITH cte_ra_contacts as ( SELECT ACCT_ROLE.CUST_ACCOUNT_ROLE_ID AS CONTACT_ID, SUBSTR (PARTY.PERSON_LAST_NAME, 1, 50) AS last_name,
import datetime import logging from awsglue import DynamicFrame logging.basicConfig() logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) # @params: [JOB_NAME] args = getResolvedOptions(sys.argv, [ "JOB_NAME", "src_db_name", "src_tbl_name", "datalake_bkt_name", "datalake_bkt_prefix" ]) sc = SparkContext() glueContext = GlueContext(sc) spark = glueContext.spark_session job = Job(glueContext) job.init(args["JOB_NAME"], args) logger.info(f'{{"starting_job": "{args["JOB_NAME"]}"}}') data_frame_datasource0 = glueContext.create_data_frame.from_catalog( database=args["src_db_name"], table_name=args["src_tbl_name"], transformation_ctx="datasource0", additional_options={ "startingPosition": "TRIM_HORIZON", "inferSchema": "true" })
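# --- Hedged sketch: the excerpt stops right after the streaming source is created. Glue streaming
# jobs typically drain such a data frame with GlueContext.forEachBatch; the batch function and the
# checkpoint location below are illustrative assumptions, not part of the original job:

def process_batch(data_frame, batch_id):
    # land each non-empty micro-batch in the data lake bucket passed as job arguments
    if data_frame.count() > 0:
        data_frame.write.mode("append").parquet(
            "s3://{}/{}".format(args["datalake_bkt_name"], args["datalake_bkt_prefix"]))

glueContext.forEachBatch(
    frame=data_frame_datasource0,
    batch_function=process_batch,
    options={
        "windowSize": "100 seconds",
        # assumed checkpoint path; any writable S3 prefix works
        "checkpointLocation": "s3://{}/checkpoints/".format(args["datalake_bkt_name"])
    })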
import sys from awsglue.transforms import * from awsglue.dynamicframe import DynamicFrame from awsglue.utils import getResolvedOptions from pyspark.context import SparkContext from pyspark.sql.functions import lit from awsglue.context import GlueContext from awsglue.job import Job ## @params: [JOB_NAME] args = getResolvedOptions(sys.argv, ['JOB_NAME']) sc = SparkContext() glueContext = GlueContext(sc) spark = glueContext.spark_session job = Job(glueContext) job.init(args['JOB_NAME'], args) ## @type: DataSource ## @args: [database = "green__tripdata_staging", table_name = "stagingfhv_tripdata_2015_01_csv", transformation_ctx = "datasource0"] ## @return: datasource0 ## @inputs: [] origin = glueContext.create_dynamic_frame_from_options( connection_type="s3", connection_options={ "paths": ["s3://taxi-data-etl/fhv_trippdata_test.csv"] }, format="csv", format_options={ "withHeader": True, "separator": "," })
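# --- Hedged sketch: the excerpt ends after the CSV read. A typical next step (assumed here, not
# shown in the original) is to stamp the frame and land it as Parquet before committing the job:

df = origin.toDF().withColumn("source_file", lit("fhv_trippdata_test.csv"))  # lit imported above
destination = DynamicFrame.fromDF(df, glueContext, "destination")
glueContext.write_dynamic_frame.from_options(
    frame=destination,
    connection_type="s3",
    connection_options={"path": "s3://taxi-data-etl/processed/"},  # illustrative output path
    format="parquet")
job.commit()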
def main(): to_s3 = 'to-s3' to_jdbc = 'to-jdbc' parser = argparse.ArgumentParser(prog=sys.argv[0]) parser.add_argument('-m', '--mode', required=True, choices=[to_s3, to_jdbc], help='Choose to migrate from datacatalog to s3 or to metastore') parser.add_argument('--database-names', required=True, help='Semicolon-separated list of names of database in Datacatalog to export') parser.add_argument('-o', '--output-path', required=False, help='Output path, either local directory or S3 path') parser.add_argument('-c', '--connection-name', required=False, help='Glue Connection name for Hive metastore JDBC connection') parser.add_argument('-R', '--region', required=False, help='AWS region of source Glue DataCatalog, default to "us-east-1"') options = get_options(parser, sys.argv) if options['mode'] == to_s3: validate_options_in_mode( options=options, mode=to_s3, required_options=['output_path'], not_allowed_options=['connection_name'] ) elif options['mode'] == to_jdbc: validate_options_in_mode( options=options, mode=to_jdbc, required_options=['connection_name'], not_allowed_options=['output_path'] ) else: raise AssertionError('unknown mode ' + options['mode']) validate_aws_regions(options['region']) # spark env (conf, sc, sql_context) = get_spark_env() glue_context = GlueContext(sc) # extract from datacatalog reader database_arr = options['database_names'].split(';') (databases, tables, partitions) = read_databases_from_catalog( sql_context=sql_context, glue_context=glue_context, datacatalog_name='datacatalog', database_arr=database_arr, region=options.get('region') or 'us-east-1' ) if options['mode'] == to_s3: output_path = get_output_dir(options['output_path']) datacatalog_migrate_to_s3( databases=databases, tables=tables, partitions=partitions, output_path=output_path ) elif options['mode'] == to_jdbc: connection_name = options['connection_name'] datacatalog_migrate_to_hive_metastore( sc=sc, sql_context=sql_context, databases=databases, tables=tables, partitions=partitions, connection=glue_context.extract_jdbc_conf(connection_name) )
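# --- Note (hedged): as excerpted, main() is defined but never invoked; the migration scripts in
# this collection presumably end with the standard entry-point guard:

if __name__ == '__main__':
    main()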