def main():
    """Build the CANCELLED account-status fact.

    Reads the product-usage status history from the Glue catalog, keeps the
    latest status change per contact (contacts with a single used product
    only), filters to transitions whose new status is CANCELLED, and writes
    them to Redshift table ``mapping_changed_status_student``. Finally the
    max ``_key`` seen is written to S3 as the incremental-load watermark.
    """
    glueContext = GlueContext(SparkContext.getOrCreate())
    spark = glueContext.spark_session

    # Placeholder/default constants; only CANCELLED is actually used below.
    student_id_unavailable = '0'
    package_endtime_unavailable = 99999999999L
    package_starttime_unavailable = 0L
    student_level_code_unavailable = 'UNAVAILABLE'
    student_status_code_unavailable = 'UNAVAILABLE'
    package_endtime = 'package_endtime'
    package_starttime = 'package_starttime'
    student_level_code = 'student_level_code'
    student_status_code = 'student_status_code'
    CANCELLED = 'CANCELLED'

    # Source: full product-usage status-change history.
    dyf_tpe_enduser_used_product_history = glueContext.create_dynamic_frame.from_catalog(
        database="tig_market",
        table_name="tpe_enduser_used_product_history")
    dyf_tpe_enduser_used_product_history = dyf_tpe_enduser_used_product_history.select_fields(
        ['_key', 'contact_id', 'used_product_id', 'status_old', 'status_new',
         'status_description', 'timecreated'])
    # .rename_field('contact_id', 'contactid')
    dyf_tpe_enduser_used_product_history = dyf_tpe_enduser_used_product_history.resolveChoice(
        specs=[('_key', 'cast:long')])

    # Incremental load via the `_key` watermark is currently disabled, so the
    # full history is reprocessed on every run:
    # try:
    #     df_flag = spark.read.parquet("s3://dtsodin/flag/flag_trang_thai_tai_khoan_cancelled.parquet")
    #     max_key = df_flag.collect()[0]['flag']
    #     print("max_key: ", max_key)
    #     # Only take records greater than the stored max_key, do not load everything
    #     dyf_tpe_enduser_used_product_history = Filter.apply(frame=dyf_tpe_enduser_used_product_history, f=lambda x: x["_key"] > max_key)
    # except:
    #     print('read flag file error ')

    print dyf_tpe_enduser_used_product_history.count()
    if dyf_tpe_enduser_used_product_history.count() > 0:
        try:
            # Invoice details provide the category code per used product.
            dyf_tpe_invoice_product_details = glueContext.create_dynamic_frame.from_catalog(
                database="tig_market",
                table_name="tpe_invoice_product_details")
            dyf_tpe_invoice_product_details = dyf_tpe_invoice_product_details.select_fields(
                ['id', 'cat_code'])
            # Contact -> student mapping.
            dyf_student_contact = glueContext.create_dynamic_frame.from_catalog(
                database="tig_advisor",
                table_name="student_contact")
            dyf_student_contact = dyf_student_contact.select_fields(
                ['contact_id', 'student_id']).rename_field('contact_id', 'contactid')

            ##################### Join and Filter data
            df_tpe_enduser_used_product_history = dyf_tpe_enduser_used_product_history.toDF()
            # Step 1: latest change time per (contact, used product).
            df_tpe_used_product_history_step1 = df_tpe_enduser_used_product_history.groupby('contact_id', 'used_product_id').agg(
                f.max("timecreated").alias("max_timecreated")) \
                .withColumnRenamed("contact_id", "contact_id_temp")
            print df_tpe_used_product_history_step1.count()
            df_tpe_used_product_history_step1.show(20)
            # Step 2: per contact, the overall latest change and the number of
            # distinct used products.
            df_tpe_used_product_history_step2 = df_tpe_used_product_history_step1.groupby(
                'contact_id_temp').agg(
                f.max("max_timecreated").alias("max_timecreated"),
                f.count("used_product_id").alias("count_used_product_id"))
            print df_tpe_used_product_history_step2.count()
            df_tpe_used_product_history_step2.show(20)
            print "EEEEEEEEEEEEEEEEEEEEEEEEE"
            dyf_tpe_used_product_history = DynamicFrame.fromDF(
                df_tpe_used_product_history_step2, glueContext,
                "dyf_tpe_used_product_history")
            # "Part one": contacts that used at most one product.
            dyf_part_one = Filter.apply(
                frame=dyf_tpe_used_product_history,
                f=lambda x: x["count_used_product_id"] <= 1)
            print dyf_part_one.count()
            # dyf_part_two = Filter.apply(frame=df_tpe_enduser_used_product_history,
            #                             f=lambda x: x["used_product_id"] > 1)
            df_part_one = dyf_part_one.toDF()
            # Re-attach the full history row matching the latest change time.
            df_part_one = df_part_one.join(
                df_tpe_enduser_used_product_history,
                (df_part_one.contact_id_temp == df_tpe_enduser_used_product_history.contact_id) &
                (df_part_one.max_timecreated == df_tpe_enduser_used_product_history.timecreated))
            dyf_part_one = DynamicFrame.fromDF(df_part_one, glueContext,
                                               "dyf_part_one")
            dyf_part_one = dyf_part_one.select_fields([
                'contact_id', 'used_product_id', 'status_old', 'status_new',
                'status_description', 'timecreated'
            ])
            dyf_join_part_one_product_details = Join.apply(
                dyf_part_one, dyf_tpe_invoice_product_details,
                'used_product_id', 'id')
            dyf_join_part_one_product_details.printSchema()
            print "total 01: ", dyf_join_part_one_product_details.count()
            dyf_join_part_one_product_details.toDF().show(2)
            dyf_join_part_one_contact = Join.apply(
                dyf_join_part_one_product_details, dyf_student_contact,
                'contact_id', 'contactid')
            dyf_join_part_one_contact = dyf_join_part_one_contact \
                .select_fields(['contact_id', 'student_id', 'status_new',
                                'status_description', 'timecreated'])
            dyf_join_part_one_contact.printSchema()
            print "total 02: ", dyf_join_part_one_contact.count()
            dyf_join_part_one_contact.toDF().show(2)
            # df_join_part_one = dyf_join_part_one_contact.toDF()
            ######################################
            ######## START cancelled
            dyf_join_cancelled_status = Filter.apply(
                frame=dyf_join_part_one_contact,
                f=lambda x: x["status_new"] == CANCELLED)
            print "dyf_join_cancelled_status ", dyf_join_cancelled_status.count()
            dyf_join_cancelled_status.toDF().show(2)
            df_join_cancelled_status = dyf_join_cancelled_status.toDF()
            # 208 appears to be the dimension id for CANCELLED in the target
            # warehouse — TODO confirm against the status dimension table.
            df_join_cancelled_status = df_join_cancelled_status \
                .withColumn("change_status_date_id",
                            from_unixtime(df_join_cancelled_status.timecreated, 'yyyyMMdd').cast("long")) \
                .withColumn("from_status_id", f.lit(None).cast("long")) \
                .withColumn("to_status_id", f.lit(208).cast("long")) \
                .withColumn("measure1", f.lit(None).cast("long")) \
                .withColumn("measure2", f.lit(None).cast("long")) \
                .withColumn("description", df_join_cancelled_status.status_description) \
                .withColumn("timestamp1", f.lit(None).cast("long"))
            df_join_cancelled_status.show(3)
            dyf_join_cancelled_status = DynamicFrame.fromDF(
                df_join_cancelled_status, glueContext,
                "dyf_join_cancelled_status")
            dyf_join_cancelled_status = dyf_join_cancelled_status \
                .select_fields(['contact_id', 'student_id', 'change_status_date_id',
                                'from_status_id', 'to_status_id', 'measure1',
                                'measure2', 'description', 'timestamp1'])
            dyf_join_cancelled_status.printSchema()
            df_join_cancelled_status = dyf_join_cancelled_status.toDF()
            ####### END
            # df_join = df_join.withColumn("to_status_id", f.lit(204).cast("long"))
            df_join_cancelled_status = df_join_cancelled_status.withColumn(
                "user_id", f.lit(None).cast("long"))
            dyf_join_status = DynamicFrame.fromDF(df_join_cancelled_status,
                                                  glueContext,
                                                  "dyf_join_status")
            # Select output fields and coerce to the Redshift column types.
            applymapping1 = ApplyMapping.apply(
                frame=dyf_join_status,
                mappings=[("student_id", "string", "student_id", "long"),
                          ("user_id", "long", "user_id", "long"),
                          ("change_status_date_id", "long", "change_status_date_id", "long"),
                          ("from_status_id", "long", "from_status_id", "long"),
                          ("to_status_id", "long", "to_status_id", "long"),
                          ("measure1", "long", "measure1", "double"),
                          ("measure2", "long", "measure2", "double"),
                          ("description", "string", "description", "string"),
                          ("timestamp1", "long", "timestamp1", "long"),
                          ("contact_id", "string", "contact_id", "string")])
            resolvechoice1 = ResolveChoice.apply(
                frame=applymapping1,
                choice="make_cols",
                transformation_ctx="resolvechoice1")
            dropnullfields1 = DropNullFields.apply(
                frame=resolvechoice1, transformation_ctx="dropnullfields1")
            print resolvechoice1.count()
            resolvechoice1.printSchema()
            resolvechoice1.show(5)
            print('START WRITE TO REDSHIFT -------------------------')
            datasink1 = glueContext.write_dynamic_frame.from_jdbc_conf(
                frame=dropnullfields1,
                catalog_connection="glue_redshift",
                connection_options={
                    "dbtable": "mapping_changed_status_student",
                    "database": "dts_odin"
                },
                redshift_tmp_dir=
                "s3a://dtsodin/temp/mapping_changed_status_student/",
                transformation_ctx="datasink1")
            print('START WRITE TO S3-------------------------')
            # datasink6 = glueContext.write_dynamic_frame.from_options(frame=dropnullfields1, connection_type="s3",
            #                                                          connection_options={
            #                                                              "path": "s3://dtsodin/student_behavior/student_behavior/",
            #                                                              "partitionKeys": ["behavior_id"]},
            #                                                          format="parquet",
            #                                                          transformation_ctx="datasink6")
            print('END WRITE TO S3-------------------------')
            # Persist max(_key) to S3 as the watermark for the next run.
            df_temp = dyf_tpe_enduser_used_product_history.toDF()
            flag = df_temp.agg({"_key": "max"}).collect()[0][0]
            flag_data = [flag]
            df = spark.createDataFrame(flag_data, "long").toDF('flag')
            # overwrite the stored _key in S3
            df.write.parquet(
                "s3a://dtsodin/flag/flag_trang_thai_tai_khoan_cancelled.parquet",
                mode="overwrite")
        except Exception as e:
            print "Something was wrong ", e
def main():
    """ETL for student user-profile data.

    1. student_contact: derive LMS-account-creation date (status RL2130.29)
       and load it to Redshift via a temp table + stored procedure.
    2. student_contact_email / student_contact_phone: incremental loads
       (via per-table ``_key`` watermark files on S3) merged into the
       ``user_communication`` table through temp tables with postactions.

    After each section the max ``_key`` of the processed source is written
    back to its S3 watermark file.
    """
    glueContext = GlueContext(SparkContext.getOrCreate())
    spark = glueContext.spark_session

    def save_flag(dynamic_frame, path):
        # Persist max(_key) of the source as the incremental-load watermark.
        datasource = dynamic_frame.toDF()
        flag = datasource.agg({"_key": "max"}).collect()[0][0]
        flag_data = [flag]
        df = spark.createDataFrame(flag_data, "long").toDF('flag')
        # overwrite the stored _key on S3
        df.write.parquet(path, mode="overwrite")

    # 1. ETL user_map data
    # 2. ETL user_communication data: phone and email
    # 3. ETL status RL2130.29 (LMS account created successfully)
    # START
    # read datasource
    student_contact = glueContext.create_dynamic_frame.from_catalog(
        database="tig_advisor",
        table_name="student_contact")
    # select fields
    student_contact = student_contact.select_fields(
        ['_key', 'contact_id', 'student_id', 'time_lms_created'])
    # convert data type
    student_contact = student_contact.resolveChoice(specs=[('_key', 'cast:long')])
    # read the watermark flag from S3
    df_flag = spark.read.parquet("s3://dtsodin/flag/flag_HV.parquet")
    max_key = df_flag.collect()[0]['flag']
    # NOTE(review): the incremental filter below is commented out, so max_key
    # is unused and the full table is reprocessed on every run — confirm
    # whether this is intentional before re-enabling.
    # student_contact = Filter.apply(frame=student_contact, f=lambda x: x["_key"] > max_key)
    if student_contact.count() > 0:
        try:
            student_contact_df = student_contact.toDF()
            # ngay_s0: LMS-creation timestamp; id_ngay_s0: its yyyyMMdd date id.
            student_contact_df = student_contact_df.withColumn(
                'ngay_s0', from_unixtime(student_contact_df.time_lms_created))
            student_contact_df = student_contact_df.withColumn(
                'id_ngay_s0',
                from_unixtime(student_contact_df.time_lms_created, "yyyyMMdd"))
            student_contact_df.printSchema()
            student_contact = DynamicFrame.fromDF(student_contact_df,
                                                  glueContext,
                                                  "student_contact")
            # select fields and target types for the load
            applyMapping = ApplyMapping.apply(
                frame=student_contact,
                mappings=[("contact_id", "string", "contact_id", "string"),
                          ("student_id", "string", "student_id", "string"),
                          ("id_ngay_s0", "string", "id_ngay_s0", "string"),
                          ("ngay_s0", "string", "ngay_s0", "timestamp")])
            resolvechoice = ResolveChoice.apply(
                frame=applyMapping,
                choice="make_cols",
                transformation_ctx="resolvechoice")
            dropnullfields = DropNullFields.apply(
                frame=resolvechoice, transformation_ctx="dropnullfields")
            # write to Redshift; postactions merge the temp table via a proc
            print("Count data student: ", dropnullfields.count())
            datasink4 = glueContext.write_dynamic_frame.from_jdbc_conf(
                frame=dropnullfields,
                catalog_connection="glue_redshift",
                connection_options={
                    "dbtable": "temp_student_contact",
                    "database": "dts_odin",
                    "postactions": """ call proc_insert_student_contact();
                                       DROP TABLE IF EXISTS temp_student_contact """
                },
                redshift_tmp_dir=
                "s3n://dtsodin/temp/tig_advisor/user_profile_student_contact/",
                transformation_ctx="datasink4")
            # advance the watermark
            save_flag(student_contact, "s3a://dtsodin/flag/flag_HV.parquet")
        except Exception:
            # Was a bare `except:`; keep the original best-effort behaviour
            # (advance the watermark even when the load fails) but no longer
            # swallow SystemExit/KeyboardInterrupt.
            save_flag(student_contact, "s3a://dtsodin/flag/flag_HV.parquet")

    ########
    # read email data
    student_contact_email = glueContext.create_dynamic_frame.from_catalog(
        database="tig_advisor",
        table_name="student_contact_email")
    # select fields
    student_contact_email = student_contact_email.select_fields(
        ['_key', 'contact_id', 'email', 'default', 'deleted'])
    # convert data type
    student_contact_email = student_contact_email.resolveChoice(
        specs=[('_key', 'cast:long')])
    # read the watermark flag from S3
    df_flag = spark.read.parquet("s3://dtsodin/flag/flag_HV_Email.parquet")
    max_key = df_flag.collect()[0]['flag']
    print("max_key_email: ", max_key)
    # keep only records newer than the stored watermark
    student_contact_email = Filter.apply(frame=student_contact_email,
                                         f=lambda x: x["_key"] > max_key)
    print("Count data 1: ", student_contact_email.count())
    if student_contact_email.count() > 0:
        try:
            # select fields and target types for the load
            applyMappingEmail = ApplyMapping.apply(
                frame=student_contact_email,
                mappings=[("contact_id", "string", "contact_id", "string"),
                          ("email", "string", "email", "string"),
                          ("default", "int", "default", "int"),
                          ("deleted", "int", "deleted", "int")])
            resolvechoiceEmail = ResolveChoice.apply(
                frame=applyMappingEmail,
                choice="make_cols",
                transformation_ctx="resolvechoiceEmail")
            dropnullfieldsEmail = DropNullFields.apply(
                frame=resolvechoiceEmail,
                transformation_ctx="dropnullfieldsEmail")
            # write to Redshift; postactions merge into user_communication
            datasinkEmail = glueContext.write_dynamic_frame.from_jdbc_conf(
                frame=dropnullfieldsEmail,
                catalog_connection="glue_redshift",
                connection_options={
                    "dbtable": "temp_student_contact_email",
                    "database": "dts_odin",
                    "postactions": """ insert into user_communication(user_id, communication_type, comunication, is_primary, is_deleted, last_update_date)
                                       select DISTINCT user_id, 2 as communication_type, email, temp_student_contact_email.default, deleted, CURRENT_DATE as last_update_date
                                       from temp_student_contact_email
                                       join user_map on source_id = contact_id and source_type = 1;
                                       DROP TABLE IF EXISTS temp_student_contact_email """
                },
                redshift_tmp_dir=
                "s3n://dtsodin/temp/tig_advisor/user_profile_student_contact/",
                # BUGFIX: was "datasink4", duplicating the student-contact
                # sink's transformation context (breaks job-bookmark state).
                transformation_ctx="datasinkEmail")
            # advance the watermark
            save_flag(student_contact_email,
                      "s3a://dtsodin/flag/flag_HV_Email.parquet")
        except Exception:
            save_flag(student_contact_email,
                      "s3a://dtsodin/flag/flag_HV_Email.parquet")

    ########
    # read phone data
    student_contact_phone = glueContext.create_dynamic_frame.from_catalog(
        database="tig_advisor",
        table_name="student_contact_phone")
    # select fields
    student_contact_phone = student_contact_phone.select_fields(
        ['_key', 'contact_id', 'phone', 'default', 'deleted'])
    # convert data type
    student_contact_phone = student_contact_phone.resolveChoice(
        specs=[('_key', 'cast:long')])
    # read the watermark flag from S3
    df_flag = spark.read.parquet("s3://dtsodin/flag/flag_HV_Phone.parquet")
    max_key = df_flag.collect()[0]['flag']
    print("max_key_phone: ", max_key)
    # keep only records newer than the stored watermark
    student_contact_phone = Filter.apply(frame=student_contact_phone,
                                         f=lambda x: x["_key"] > max_key)
    print("Count data phone: ", student_contact_phone.count())
    if student_contact_phone.count() > 0:
        try:
            # select fields and target types for the load
            applyMappingPhone = ApplyMapping.apply(
                frame=student_contact_phone,
                mappings=[("contact_id", "string", "contact_id", "string"),
                          ("phone", "string", "phone", "string"),
                          ("default", "int", "default", "int"),
                          ("deleted", "int", "deleted", "int")])
            resolvechoicePhone = ResolveChoice.apply(
                frame=applyMappingPhone,
                choice="make_cols",
                # BUGFIX: was "resolvechoiceEmail" (copy-paste from the email
                # section), colliding with that transform's context.
                transformation_ctx="resolvechoicePhone")
            dropnullfieldsPhone = DropNullFields.apply(
                frame=resolvechoicePhone,
                transformation_ctx="dropnullfieldsPhone")
            # write to Redshift; postactions merge into user_communication
            datasinkPhone = glueContext.write_dynamic_frame.from_jdbc_conf(
                frame=dropnullfieldsPhone,
                catalog_connection="glue_redshift",
                connection_options={
                    "dbtable": "temp_student_contact_phone",
                    "database": "dts_odin",
                    "postactions": """insert into user_communication(user_id, communication_type, comunication, is_primary, is_deleted, last_update_date)
                                      select DISTINCT user_id, 1 as communication_type, phone, temp_student_contact_phone.default, deleted, CURRENT_DATE as last_update_date
                                      from temp_student_contact_phone
                                      join user_map on source_id = contact_id and source_type = 1;
                                      DROP TABLE IF EXISTS temp_student_contact_phone """
                },
                redshift_tmp_dir=
                "s3n://dtsodin/temp/tig_advisor/user_profile_student_contact/",
                # BUGFIX: was "datasink4", duplicating the student-contact
                # sink's transformation context (breaks job-bookmark state).
                transformation_ctx="datasinkPhone")
            # advance the watermark
            save_flag(student_contact_phone,
                      "s3a://dtsodin/flag/flag_HV_Phone.parquet")
        except Exception:
            save_flag(student_contact_phone,
                      "s3a://dtsodin/flag/flag_HV_Phone.parquet")
from awsglue.job import Job from pyspark.sql import SparkSession, DataFrame from awsglue.dynamicframe import DynamicFrame from awsglue.transforms.dynamicframe_filter import Filter import pyspark.sql.functions as f from pyspark.sql.functions import udf from pyspark.sql.types import StringType from pyspark.sql.types import ArrayType, IntegerType, LongType, StructType, StructField from datetime import date, datetime, timedelta import pytz ## @params: [JOB_NAME] # args = getResolvedOptions(sys.argv, ['JOB_NAME']) sc = SparkContext() glueContext = GlueContext(sc) spark = glueContext.spark_session # job = Job(glueContext) # job.init(args['JOB_NAME'], args) spark.conf.set("spark.sql.session.timeZone", "GMT+07:00") ho_chi_minh_timezone = pytz.timezone('Asia/Ho_Chi_Minh') IS_DEV = True PERIOD_DAILY = 1 PERIOD_WEEKLY = 2 period = PERIOD_WEEKLY def get_date_list(start_date): if period == PERIOD_WEEKLY:
def main(): glueContext = GlueContext(SparkContext.getOrCreate()) spark = glueContext.spark_session # thoi gian tu 01/10/2019 timestamp = 1569888000 ## Phonetic dyf_learning_object = glueContext.create_dynamic_frame.from_catalog( database="nvn_knowledge", table_name="learning_object" ) dyf_phonemic = Filter.apply(frame=dyf_learning_object, f=lambda x: x["learning_object_type"] == 'phonetic') dyf_phonemic = dyf_phonemic.select_fields(['learning_object_id', 'learning_object_name']) # df_phonemic = dyf_phonemic.toDF() # df_phonemic = df_phonemic.withColumn('lo_name', convertedudf(df_phonemic.learning_object_name)) # df_phonemic.show() # Lay ra ngu am df1 = dyf_phonemic.toDF() df1 = df1.select('learning_object_id', 'learning_object_name') # myArr = np.array(df1.select('phonemic').collect()) arrPhonetic = [row.learning_object_name for row in df1.collect()] arrPhoneticId = [[row.learning_object_name, row.learning_object_id] for row in df1.collect()] # print(unicode(arrPhonetic[2])) # print('ARR:', arrPhonetic) # print('ARR:', arrPhonetic[2].encode('utf-8', 'replace')) # print('ARR1 :', (u'i:' in arrPhonetic)) # ETL TBHV # Custom function def doAddScoreAll(plus, minus): if plus is None and minus is not None: return minus if minus is None and plus is not None: return plus if minus is not None and plus is not None: return plus + minus return 0 addScoreAll = udf(doAddScoreAll, IntegerType()) def do_get_phone_tic_id(phonetic): phonetic = phonetic.encode('utf-8', 'replace').strip() for x in arrPhoneticId: p = x[0].encode('utf-8', 'replace').strip() if p == phonetic: return x[1] get_phone_tic_id = udf(do_get_phone_tic_id, IntegerType()) def do_check_null(val1, val2): if val1 is None and val2 is not None: return val2 if val2 is None and val1 is not None: return val1 if val1 is not None and val2 is not None: return val1 return 0 check_data_null = udf(do_check_null, StringType()) def doSplitWord(word): rs = [] if word is not None: i = 0 size = len(word) while i < size: s = 
word[i:i + 2] i += 2 if s in arrPhonetic: rs.append(s) if s not in arrPhonetic: i -= 2 s = word[i:i + 1] i += 1 if s in arrPhonetic: rs.append(s) return rs splitWord = udf(lambda x: doSplitWord(x)) state_right = 'state_right' state_wrong = 'state_wrong' # mac dinh duoc cong knowledge # P1_D1; P1_D2; P1_D3; P2_D1; P2_D2; P2_D3; P3_D1; P3_D2; P4_D1; P4_D2 # knowledge = [] # cong diem comprehension: # Can list cac name duoc cong diem comprehension: # P1_D1; P1_D2; P1_D3; P2_D1; P2_D2; P2_D3; P3_D2; P4_D1; P4_D2 comprehension = ['P1_D1', 'P1_D2', 'P1_D3', 'P2_D1', 'P2_D2', 'P2_D3', 'P3_D1', 'P3_D2', 'P4_D1', 'P4_D2'] # cong diem application: # Can list cac name duoc cong diem application: # P1_D3; P2_D1; P2_D2; P2_D3; P3_D2; P4_D1; P4_D2 application = ['P1_D1', 'P1_D2', 'P1_D3', 'P2_D1', 'P2_D2', 'P2_D3', 'P3_D1', 'P3_D2', 'P4_D1', 'P4_D2'] # cong diem analysis: # Can list cac name duoc cong diem analysis # P2_D3; P3_D2; P4_D1; P4_D2 analysis = ['P2_D3', 'P3_D2', 'P4_D1', 'P4_D2'] # cong diem synthesis: # Can list cac name duoc cong diem synthesis # P4_D1; P4_D2 synthesis = [] # cong diem evaluation: # Can list cac name duoc cong diem evaluation evaluation = [] def doAddScore(name, state, type): arr = [''] score = 0 if type == 'comprehension': arr = comprehension if type == 'application': arr = application if type == 'analysis': arr = analysis if type == 'synthesis': arr = synthesis name = name.lower() if state == state_right: score = 2 if state == state_wrong: score = -1 if name is not None: for x in arr: if x.lower() in name: return score return 0 addScore = udf(doAddScore, IntegerType()) # chuoi ky tu can replace special_str = '["] ;' ########## top_quiz_attempts dyf_top_quiz_attempts = glueContext.create_dynamic_frame.from_catalog( database="moodle", table_name="top_quiz_attempts" ) dyf_top_quiz_attempts = dyf_top_quiz_attempts.select_fields(['_key', 'id', 'timestart', 'quiz']) dyf_top_quiz_attempts = dyf_top_quiz_attempts.resolveChoice(specs=[('_key', 
'cast:long')]) # print dyf_top_quiz_attempts.count() # dyf_top_quiz_attempts.show(2) dyf_top_quiz_attempts = Filter.apply(frame=dyf_top_quiz_attempts, f=lambda x: x["timestart"] >= timestamp) # print dyf_top_quiz_attempts.count() # dyf_top_quiz_attempts.show() # xu ly truong hop start_read is null # try: # # # doc moc flag tu s3 # df_flag = spark.read.parquet("s3a://dtsodin/flag/flag_knowledge_ngu_am_top_ai") # start_read = df_flag.collect()[0]['flag'] # print('read from index: ', start_read) # # # so sanh _key datasource voi flag, lay nhung gia tri co key > flag # dyf_top_quiz_attempts = Filter.apply(frame=dyf_top_quiz_attempts, f=lambda x: x['_key'] > start_read) # except: # print('read flag file error ') # print('the number of new contacts: ', dyf_top_quiz_attempts.count()) if dyf_top_quiz_attempts.count() > 0: ########## dyf_top_user dyf_top_user = glueContext.create_dynamic_frame.from_catalog( database="moodle", table_name="do_top_user" ) dyf_top_user = dyf_top_user.select_fields( ['id', 'student_id']).rename_field('id', 'top_user_id') ######### top_question dyf_top_question = glueContext.create_dynamic_frame.from_catalog( database="moodle", table_name="top_question" ) dyf_top_question = dyf_top_question.select_fields( ['id', 'name']) # dyf_top_result_ai = dyf_top_result_ai.resolveChoice(specs=[('_key', 'cast:long')]) ######### top_result_ai dyf_top_result_ai = glueContext.create_dynamic_frame.from_catalog( database="moodle", table_name="top_result_ai" ) dyf_top_result_ai = dyf_top_result_ai.select_fields( ['question_id', 'attempt_id', 'user_id', 'ratio', 'right_word', 'wrong_word']) # JOIN va FILTER cac bang theo dieu kien dyf_join01 = Join.apply(dyf_top_result_ai, dyf_top_question, 'question_id', 'id') dyf_join02 = Join.apply(dyf_join01, dyf_top_quiz_attempts, 'attempt_id', 'id') dyf_join02 = Filter.apply(frame=dyf_join02, f=lambda x: x["quiz"] not in [7, 9, 918]) dyf_join02 = Join.apply(dyf_join02, dyf_top_user, 'user_id', 'top_user_id') # dyf_join02 = 
Filter.apply(frame=dyf_join02, f=lambda x: x["student_id"] == 259442) # dyf_join02.show() df_study = dyf_join02.toDF() df_study.cache() if (df_study.count() > 0): try: # print("COUNT 1:", df_study.count()) # Loc cac ky tu dac biet [ ] " # Hien data co dang nhu sau: ["house","her","to","how","get","long"] hoac "environmental", ... # df_study = df_study.select( # 'quiz', 'name', 'user_id', 'timestart', 'right_word', 'wrong_word', f.translate(df_study.right_word, # special_str, ''), f.translate(df_study.wrong_word, # special_str, '')) df_study = df_study.select( 'quiz', 'name', 'student_id', 'timestart', 'right_word', 'wrong_word') df_study = df_study.withColumn("right_word_new", f.translate(df_study.right_word, special_str, '')) \ .withColumn("wrong_word_new", f.translate(df_study.wrong_word, special_str, '')) # Tach cau thanh array tu: # house, her => [house, her] # PHan tich tu dung df_study_right = df_study.withColumn("right_word_list", f.split( df_study.right_word_new, ',')) # Split column array => nhieu row # row: [house, her] => # row1: house # row2: her df_study_right = df_study_right.withColumn("right", f.explode(df_study_right.right_word_list)) # convert to lowercase df_study_right = df_study_right.withColumn("right", f.lower(f.col("right"))) df_study_right = df_study_right.select('quiz', 'name', 'student_id', 'timestart', 'right') # print("COUNT 2:", df_study_right.count()) # df_study_right.printSchema() # df_study_right.show() dyf_study_right = DynamicFrame.fromDF(df_study_right, glueContext, "dyf_study_right") ## Learning Object # dyf_learning_object = glueContext.create_dynamic_frame.from_catalog( # database="nvn_knowledge", # table_name="nvn_knowledge_learning_object" # ) dyf_learning_object = Filter.apply(frame=dyf_learning_object, f=lambda x: x["learning_object_type"] == 'vocabulary') dyf_learning_object = dyf_learning_object.select_fields( ['learning_object_id', 'learning_object_name', 'transcription']) df_learning_object = dyf_learning_object.toDF() 
# convert to lowercase df_learning_object = df_learning_object.withColumn("learning_object_name", f.lower(f.col("learning_object_name"))) # replace cac ky tu df_learning_object = df_learning_object.withColumn("phone_tic_new", f.translate(df_learning_object.transcription, '\',', '')) df_learning_object = df_learning_object.withColumn("phone_tic_tmp", splitWord(df_learning_object.phone_tic_new)) df_learning_object = df_learning_object.withColumn("phone_tic_tmp_01", f.translate(df_learning_object.phone_tic_tmp, '[]', '')) df_learning_object = df_learning_object.withColumn("phone_tic_arr", f.split(df_learning_object.phone_tic_tmp_01, ',')) df_learning_object = df_learning_object.withColumn("split_phonetic", f.explode(df_learning_object.phone_tic_arr)) df_learning_object = df_learning_object.select('learning_object_id', 'learning_object_name', 'split_phonetic') dyf_learning_object = DynamicFrame.fromDF(df_learning_object, glueContext, "dyf_learning_object") dyf_knowledge_right = Join.apply(dyf_study_right, dyf_learning_object, 'right', 'learning_object_name') # print("COUNT 3:", dyf_knowledge_right.count()) # dyf_knowledge_right.printSchema() # 1 df_knowledge_right = dyf_knowledge_right.toDF() # df_knowledge_right = df_knowledge_right.withColumn("right_phonetic", # f.explode(df_knowledge_right.phone_tic_arr)) df_knowledge_right = df_knowledge_right.select('timestart', 'name', 'student_id', 'split_phonetic') df_knowledge_right = df_knowledge_right.withColumn("learning_object_id", get_phone_tic_id(df_knowledge_right.split_phonetic)) # dyf_phonemic_right = DynamicFrame.fromDF(df_knowledge_right, glueContext, "dyf_phonemic_right") # dyf_phonemic_right = Join.apply(dyf_study_right, dyf_phonemic, 'split_phonetic', 'learning_object_name') # # dropnullfields = DropNullFields.apply(frame=dyf_phonemic_right, transformation_ctx="dropnullfields") # datasink6 = glueContext.write_dynamic_frame.from_jdbc_conf(frame=dropnullfields, # catalog_connection="glue_redshift", # 
connection_options={ # "dbtable": "mapping_lo_student_history_v06", # "database": "dts_odin" # }, # redshift_tmp_dir="s3n://dts-odin/temp1/top_question_attempt/", # transformation_ctx="datasink6") # dyf_knowledge_wrong.printSchema() # Cong diem cac tu dung # df_knowledge_right = dyf_phonemic_right.toDF() # print("COUNT 4:") # df_knowledge_right.printSchema() df_knowledge_right.cache() df_knowledge_right = df_knowledge_right.withColumn("knowledge", f.lit(2)) \ .withColumn("comprehension", addScore(df_knowledge_right.name, f.lit('state_right'), f.lit('comprehension'))) \ .withColumn("application", addScore(df_knowledge_right.name, f.lit('state_right'), f.lit('application'))) \ .withColumn("analysis", addScore(df_knowledge_right.name, f.lit('state_right'), f.lit('analysis'))) \ .withColumn("synthesis", addScore(df_knowledge_right.name, f.lit('state_right'), f.lit('synthesis'))) \ .withColumn("evaluation", f.lit(0)) \ .withColumn("date_id", from_unixtime(df_knowledge_right['timestart'], 'yyyyMMdd')) \ .withColumn("lo_type", f.lit(2)) dyf_knowledge_right = DynamicFrame.fromDF(df_knowledge_right, glueContext, "dyf_knowledge_right") # dropnullfields = DropNullFields.apply(frame=dyf_knowledge_right, transformation_ctx="dropnullfields") # datasink6 = glueContext.write_dynamic_frame.from_jdbc_conf(frame=dropnullfields, # catalog_connection="glue_redshift", # connection_options={ # "dbtable": "mapping_lo_student_history_v02", # "database": "dts_odin" # }, # redshift_tmp_dir="s3n://dts-odin/temp1/top_question_attempt/", # transformation_ctx="datasink6") # print("COUNT 444444444444444:", df_knowledge_right.count()) # df_knowledge_right.printSchema() # df_knowledge_right.show() # # dyf_knowledge_right = DynamicFrame.fromDF(df_knowledge_right, glueContext, "dyf_knowledge_right") # # chon cac truong va kieu du lieu day vao db # applymapping = ApplyMapping.apply(frame=dyf_knowledge_right, # mappings=[("timestart", "long", "timestart", "long"), # ("student_id", 'int', 'student_id', 
'long'), # ("name", 'string', 'name', 'string'), # ("learning_object_id", "long", "learning_object_id", "long"), # ("date_id", "string", "date_id", "long"), # ("knowledge", "int", "knowledge", "long"), # ("comprehension", "int", "comprehension", "long"), # ("application", "int", "application", "long"), # ("analysis", "int", "analysis", "long"), # ("synthesis", "int", "synthesis", "long"), # ("evaluation", "int", "evaluation", "long"), # ("lo_type", "int", "lo_type", "int")]) # resolvechoice = ResolveChoice.apply(frame=applymapping, choice="make_cols", # transformation_ctx="resolvechoice") # dropnullfields = DropNullFields.apply(frame=resolvechoice, transformation_ctx="dropnullfields") # # datasink5 = glueContext.write_dynamic_frame.from_jdbc_conf(frame=dropnullfields, # catalog_connection="glue_redshift", # connection_options={ # "dbtable": "t_temp_right_learning_object_phonetic", # "database": "dts_odin" # }, # redshift_tmp_dir="s3n://dts-odin/temp1/", # transformation_ctx="datasink5") # END Cong diem cac tu dung ################################################## # Tru diem cac tu sai: Xu lu tuong tu tu dung. 
# rule tru diem la -1 diem neu sai df_study_wrong = df_study.withColumn("wrong_word_list", f.split( df_study.wrong_word_new, ',')) # Split column array => nhieu row # row: [house, her] => # row1: house # row2: her df_study_wrong = df_study_wrong.withColumn("wrong", f.explode(df_study_wrong.wrong_word_list)) #convert to lowercase df_study_wrong = df_study_wrong.withColumn("wrong", f.lower(f.col("wrong"))) df_study_wrong = df_study_wrong.select('quiz', 'name', 'student_id', 'timestart', 'wrong') # print("COUNT 2222:", df_study_wrong.count()) # df_study_wrong.printSchema() # df_study_wrong.show() dyf_study_wrong = DynamicFrame.fromDF(df_study_wrong, glueContext, "dyf_study_wrong") ## Learning Object dyf_knowledge_wrong = Join.apply(dyf_study_wrong, dyf_learning_object, 'wrong', 'learning_object_name') df_knowledge_wrong = dyf_knowledge_wrong.toDF() # df_knowledge_wrong = df_knowledge_wrong.withColumn("wrong_phonetic", # f.explode(df_knowledge_wrong.phone_tic_arr)) df_knowledge_wrong = df_knowledge_wrong.select('timestart', 'name', 'student_id', 'split_phonetic') df_knowledge_wrong = df_knowledge_wrong.withColumn("learning_object_id", get_phone_tic_id(df_knowledge_wrong.split_phonetic)) # dyf_study_wrong = DynamicFrame.fromDF(df_knowledge_wrong, glueContext, "dyf_study_wrong") # dyf_phonemic_wrong = Join.apply(dyf_study_wrong, dyf_phonemic, 'split_phonetic', 'learning_object_name') # print("COUNT 3:", dyf_knowledge_wrong.count()) # dyf_knowledge_wrong.printSchema() # print("COUNT 4:", dyf_knowledge_wrong.count()) # dyf_knowledge_wrong.printSchema() # Cong diem cac tu dung # df_knowledge_wrong = dyf_phonemic_wrong.toDF() df_knowledge_wrong.cache() df_knowledge_wrong = df_knowledge_wrong.withColumn("knowledge", f.lit(-1)) \ .withColumn("comprehension", addScore(df_knowledge_wrong.name, f.lit('state_wrong'), f.lit('comprehension'))) \ .withColumn("application", addScore(df_knowledge_wrong.name, f.lit('state_wrong'), f.lit('application'))) \ .withColumn("analysis", 
addScore(df_knowledge_wrong.name, f.lit('state_wrong'), f.lit('analysis'))) \ .withColumn("synthesis", addScore(df_knowledge_wrong.name, f.lit('state_wrong'), f.lit('synthesis'))) \ .withColumn("evaluation", f.lit(0)) \ .withColumn("date_id", from_unixtime(df_knowledge_wrong['timestart'], 'yyyyMMdd')) # df_knowledge_wrong.printSchema() # df_knowledge_wrong.show() # # dyf_knowledge_wrong = DynamicFrame.fromDF(df_knowledge_wrong, glueContext, "dyf_knowledge_wrong") # # # chon cac truong va kieu du lieu day vao db # applymapping1 = ApplyMapping.apply(frame=dyf_knowledge_wrong, # mappings=[("timestart", "long", "timestart", "long"), # ("name", 'string', 'name', 'string'), # ("student_id", 'int', 'student_id', 'long'), # ("id", "int", "learning_object_id", 'long'), # ("date_id", "string", "date_id", "long"), # ("knowledge", "int", "knowledge", "long"), # ("comprehension", "int", "comprehension", "long"), # ("application", "int", "application", "long"), # ("analysis", "int", "analysis", "long"), # ("synthesis", "int", "synthesis", "long"), # ("evaluation", "int", "evaluation", "long")]) # resolvechoice1 = ResolveChoice.apply(frame=applymapping1, choice="make_cols", # transformation_ctx="resolvechoice1") # dropnullfields1 = DropNullFields.apply(frame=resolvechoice1, transformation_ctx="dropnullfields1") # # datasink6 = glueContext.write_dynamic_frame.from_jdbc_conf(frame=dropnullfields1, # catalog_connection="glue_redshift", # connection_options={ # "dbtable": "t_temp_right_learning_object_phonetic", # "database": "dts_odin", # "postactions": """ call proc_knowledge_ngu_am_top_result_ai () """ # }, # redshift_tmp_dir="s3n://dts-odin/temp1/", # transformation_ctx="datasink5") ### Luu bang mapping_lo_student_history df_knowledge_right = df_knowledge_right.groupby('student_id', 'date_id', 'learning_object_id').agg( f.count('knowledge').alias("count_plus"), f.sum('knowledge').alias("knowledge_plus"), f.sum('comprehension').alias("comprehension_plus"), 
f.sum('application').alias("application_plus"), f.sum('analysis').alias("analysis_plus"), f.sum('synthesis').alias("synthesis_plus"), f.sum('evaluation').alias("evaluation_plus")) df_knowledge_right = df_knowledge_right.where('student_id is not null') df_knowledge_wrong = df_knowledge_wrong.groupby('student_id', 'date_id', 'learning_object_id').agg( f.count('knowledge').alias("count_minus"), f.sum('knowledge').alias("knowledge_minus"), f.sum('comprehension').alias("comprehension_minus"), f.sum('application').alias("application_minus"), f.sum('analysis').alias("analysis_minus"), f.sum('synthesis').alias("synthesis_minus"), f.sum('evaluation').alias("evaluation_minus")) \ .withColumnRenamed('student_id', 'student_id_wrong') \ .withColumnRenamed('date_id', 'date_id_wrong') \ .withColumnRenamed('learning_object_id', 'learning_object_id_wrong') df_knowledge_wrong = df_knowledge_wrong.where('student_id_wrong is not null') df_knowledge = df_knowledge_right.join(df_knowledge_wrong, ( df_knowledge_right['student_id'] == df_knowledge_wrong['student_id_wrong']) & ( df_knowledge_right['date_id'] == df_knowledge_wrong['date_id_wrong']) & ( df_knowledge_right['learning_object_id'] == df_knowledge_wrong['learning_object_id_wrong']), 'outer') df_knowledge = df_knowledge.withColumn("user_id", check_data_null(df_knowledge.student_id, df_knowledge.student_id_wrong)) \ .withColumn("learning_object_id", check_data_null(df_knowledge.learning_object_id, df_knowledge.learning_object_id_wrong)) \ .withColumn("created_date_id", check_data_null(df_knowledge.date_id, df_knowledge.date_id_wrong)) \ .withColumn("source_system", f.lit('top_result_ai_phonetic')) \ .withColumn("lu_id", f.lit(0)) dyf_knowledge = DynamicFrame.fromDF(df_knowledge, glueContext, "df_knowledge") # dyf_knowledge.printSchema() dyf_knowledge.printSchema() dyf_knowledge.show() # dyf_knowledge = DynamicFrame.fromDF(dyf_knowledge, glueContext, "dyf_knowledge") # chon cac truong va kieu du lieu day vao db applymapping = 
ApplyMapping.apply(frame=dyf_knowledge, mappings=[("user_id", 'string', 'student_id', 'long'), ("learning_object_id", "string", "learning_object_id", "long"), # ("knowledge", "int", "knowledge", "long"), # ("comprehension", "int", "comprehension", "long"), # ("application", "int", "application", "long"), # ("analysis", "int", "analysis", "long"), # ("synthesis", "int", "synthesis", "long"), # ("evaluation", "int", "evaluation", "long"), ("knowledge_plus", "long", "knowledge_plus", "long"), ("comprehension_plus", "long", "comprehension_plus", "long"), ("application_plus", "long", "application_plus", "long"), ("analysis_plus", "long", "analysis_plus", "long"), ("synthesis_plus", "long", "synthesis_plus", "long"), ("evaluation_plus", "long", "evaluation_plus", "long"), ("knowledge_minus", "long", "knowledge_minus", "long"), ("comprehension_minus", "long", "comprehension_minus", "long"), ("application_minus", "long", "application_minus", "long"), ("analysis_minus", "long", "analysis_minus", "long"), ("synthesis_minus", "long", "synthesis_minus", "long"), ("evaluation_minus", "long", "evaluation_minus", "long"), ("count_plus", "long", "plus_number", "long"), ("count_minus", "long", "minus_number", "long"), # ("lo_type", "string", "lo_type", "long"), ("source_system", "string", "source_system", "string"), ("created_date_id", "string", "created_date_id", "long"), ("lu_id", "int", "lu_type", "long") # ("student_level", "string", "student_level", "string"), # ("advisor_id", "string", "advisor_id", "long"), # ("package_code", "string", "package_code", "string") ]) resolvechoice = ResolveChoice.apply(frame=applymapping, choice="make_cols", transformation_ctx="resolvechoice") dropnullfields = DropNullFields.apply(frame=resolvechoice, transformation_ctx="dropnullfields") print('START WRITE TO S3-------------------------') datasink6 = glueContext.write_dynamic_frame.from_options(frame=dropnullfields, connection_type="s3", connection_options={ "path": 
"s3://dtsodin/nvn_knowledge/mapping_lo_student_history_v2/", "partitionKeys": ["created_date_id", "source_system"]}, format="parquet", transformation_ctx="datasink6") print('END WRITE TO S3-------------------------') # datasink5 = glueContext.write_dynamic_frame.from_jdbc_conf(frame=dropnullfields, # catalog_connection="glue_redshift", # connection_options={ # "dbtable": "mapping_lo_student_history", # "database": "dts_odin" # }, # redshift_tmp_dir="s3n://dts-odin/temp1/top_question_attempt/", # transformation_ctx="datasink5") ### END Luu bang mapping_lo_student_history # END Tru diem cac tu sai # lay max _key tren datasource datasource = dyf_top_quiz_attempts.toDF() flag = datasource.agg({"_key": "max"}).collect()[0][0] flag_data = [flag] df = spark.createDataFrame(flag_data, "long").toDF('flag') # ghi de flag moi vao s3 df.write.parquet("s3a://dtsodin/flag/flag_knowledge_ngu_am_top_ai", mode="overwrite") # xoa cache df_study.unpersist() df_knowledge_right.unpersist() # df_knowledge_right.unpersist() except Exception as e: print("###################### Exception ##########################") print(e)
import sys  # BUG FIX: sys.argv is used below but `sys` was never imported

from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.dynamicframe import DynamicFrame
from awsglue.job import Job
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

# Glue job entry: copies one JSON dataset from SOURCE_LOCATION to
# OUTPUT_LOCATION as parquet. The dataset is selected by the last path
# segment of SOURCE_LOCATION (only "persons" is handled in this view).
args = getResolvedOptions(sys.argv,
                          ['JOB_NAME', 'SOURCE_LOCATION', 'OUTPUT_LOCATION'])
source = args['SOURCE_LOCATION']
destination = args['OUTPUT_LOCATION']
# Last path segment names the table to process, e.g. ".../persons" -> "persons".
table = source.split("/")[-1]

glueContext = GlueContext(SparkContext.getOrCreate())
job = Job(glueContext)
job.init(args['JOB_NAME'], args)

if table == "persons":
    persons = glueContext.create_dynamic_frame.from_options(
        connection_type="s3",
        format="json",
        connection_options={
            "paths": ['{}/{}'.format(source, 'persons_parsed.json')]
        },
        # NOTE(review): "withHeader" is a CSV option and is presumably a no-op
        # for JSON input — kept for byte-compatible behavior; confirm and drop.
        format_options={"withHeader": False},
        transformation_ctx="path={}".format('persons_df'))
    # Overwrite the destination partition wholesale on every run.
    persons.toDF().write.mode("overwrite").parquet(
        '{}/persons/'.format(destination))
JSON_FILE_NAME = TARGET_TABLE + \ "_" + SOURCE_SYSTEM # ar_invc_hdr_f_must STAGE_TABLE = ARGS['rs_stage_table'] # ar_invc_hdr_f_stage_must CTLG_CONNECTION = ARGS['glue_conn'] # TestRedshift3 REDSHIFTDB = ARGS['rs_db'] # usinnovationredshift S3_BUCKET = ARGS['bkt_name'] # "odp-us-innovation-raw" MD5_COLUMN_SCD1 = TARGET_TABLE + "_md5_scd1" # ar_invc_hdr_f_md5_scd1 TARGET_TABLE_COLUMNS = ARGS['target_cols'] # As per DDL(col1,col2,col3) STAGE_TABLE_COLUMNS = ARGS['stage_cols'] # As per DDL(col1,col2,col3) DBTABLE_STG = STAGE_DATABASE_NAME + "." + STAGE_TABLE URL = ARGS["jdbc_url"] IAM_ROLE = ARGS["iam_role"] SC = SparkContext() GLUECONTEXT = GlueContext(SC) SPARK = GLUECONTEXT.spark_session JOB = Job(GLUECONTEXT) JOB.init(ARGS['JOB_NAME'], ARGS) RUN_ID = ARGS['JOB_RUN_ID'] JOB_NAME = ARGS['JOB_NAME'] TEMPDIR = ARGS['TempDir'] SRC_NOTEMPTY = True try: # @type: DataSource # @args: [database = "db_mrr_must", ## table_name = "billing" # transformation_ctx = "billing_df"] # @return: DynamicFrame # @inputs: []
def main():
    """Export Glue Data Catalog databases/tables/partitions, either to S3
    (json dumps) or directly into a Hive metastore over JDBC."""
    MODE_S3 = 'to-s3'
    MODE_JDBC = 'to-jdbc'

    # Build the CLI from a declarative spec so flags and help text live in one place.
    parser = argparse.ArgumentParser(prog=sys.argv[0])
    argument_specs = [
        (('-m', '--mode'),
         dict(required=True, choices=[MODE_S3, MODE_JDBC],
              help='Choose to migrate from datacatalog to s3 or to metastore')),
        (('--database-names',),
         dict(required=True,
              help='Semicolon-separated list of names of database in Datacatalog to export')),
        (('-o', '--output-path'),
         dict(required=False,
              help='Output path, either local directory or S3 path')),
        (('-c', '--connection-name'),
         dict(required=False,
              help='Glue Connection name for Hive metastore JDBC connection')),
        (('-R', '--region'),
         dict(required=False,
              help='AWS region of source Glue DataCatalog, default to "us-east-1"')),
    ]
    for flags, kwargs in argument_specs:
        parser.add_argument(*flags, **kwargs)

    options = get_options(parser, sys.argv)
    mode = options['mode']

    # Per-mode constraint table: mode -> (required options, forbidden options).
    mode_rules = {
        MODE_S3: (['output_path'], ['connection_name']),
        MODE_JDBC: (['connection_name'], ['output_path']),
    }
    if mode not in mode_rules:
        raise AssertionError('unknown mode ' + mode)
    required, forbidden = mode_rules[mode]
    validate_options_in_mode(options=options, mode=mode,
                             required_options=required,
                             not_allowed_options=forbidden)
    validate_aws_regions(options['region'])

    # spark env
    (conf, sc, sql_context) = get_spark_env()
    glue_context = GlueContext(sc)
    region = options.get('region') or 'us-east-1'

    # Pull all requested databases (plus their tables/partitions) from the catalog.
    (databases, tables, partitions) = read_databases_from_catalog(
        sql_context=sql_context,
        glue_context=glue_context,
        datacatalog_name='datacatalog',
        database_arr=options['database_names'].split(';'),
        region=region)

    if mode == MODE_S3:
        datacatalog_migrate_to_s3(
            databases=databases,
            tables=tables,
            partitions=partitions,
            output_path=get_output_dir(options['output_path']))
    else:
        # MODE_JDBC is the only other value the earlier validation allows.
        datacatalog_migrate_to_hive_metastore(
            sc=sc,
            sql_context=sql_context,
            databases=databases,
            tables=tables,
            partitions=partitions,
            connection=glue_context.extract_jdbc_conf(options['connection_name']))
def main():
    # ETL: read students' AI speaking results (moodle.top_result_ai), credit
    # points for correctly-spoken words and deduct points for wrong ones, then
    # aggregate per (student, day, learning object) and append the deltas to
    # the mapping_lo_student_history_v2 parquet dataset on S3.
    glueContext = GlueContext(SparkContext.getOrCreate())
    spark = glueContext.spark_session
    # Only process quiz attempts from 01/10/2019 onwards (unix seconds).
    timestamp = 1569888000

    # ETL TBHV
    # Custom function: all 2-char slices plus all 1-char slices of a word.
    # Presumably intended for phonetic matching; the splitWord udf built from
    # it below is never invoked in this job — TODO confirm and remove.
    def doSplitWord(word):
        size = len(word)
        rs = [word[i:i + 2] for i in range(0, size, 1)]
        rs1 = [word[i:i + 1] for i in range(0, size, 1)]
        rs.extend(rs1)
        return rs

    state_right = 'state_right'
    state_wrong = 'state_wrong'

    # "knowledge" is credited by default for every question name:
    # P1_D1; P1_D2; P1_D3; P2_D1; P2_D2; P2_D3; P3_D1; P3_D2; P4_D1; P4_D2
    knowledge = ''
    # Question-name substrings that earn "comprehension" points:
    # P1_D1; P1_D2; P1_D3; P2_D1; P2_D2; P2_D3; P3_D2; P4_D1; P4_D2
    comprehension = [
        'P1_D1', 'P1_D2', 'P1_D3', 'P2_D1', 'P2_D2', 'P2_D3', 'P3_D2',
        'P4_D1', 'P4_D2'
    ]
    # Question-name substrings that earn "application" points:
    # P1_D3; P2_D1; P2_D2; P2_D3; P3_D2; P4_D1; P4_D2
    application = [
        'P1_D3', 'P2_D1', 'P2_D2', 'P2_D3', 'P3_D2', 'P4_D1', 'P4_D2'
    ]
    # Question-name substrings that earn "analysis" points:
    # P2_D3; P3_D2; P4_D1; P4_D2
    analysis = ['P2_D3', 'P3_D2', 'P4_D1', 'P4_D2']
    # Question-name substrings that earn "synthesis" points: P4_D1; P4_D2
    synthesis = ['P4_D1', 'P4_D2']
    # No question names earn "evaluation" points.
    evaluation = ''

    def doAddScore(name, state, type):
        # Return +10 (right) / -5 (wrong) for the given score dimension when
        # the question name matches one of that dimension's substrings, else 0.
        arr = ['']
        score = 0
        if type == 'comprehension':
            arr = comprehension
        if type == 'application':
            arr = application
        if type == 'analysis':
            arr = analysis
        if type == 'synthesis':
            arr = synthesis
        # NOTE(review): name.lower() runs before the `name is not None` guard
        # below, so a NULL name would raise inside the udf — confirm upstream
        # data never has NULL names.
        name = name.lower()
        if state == state_right:
            score = 10
        if state == state_wrong:
            score = -5
        if name is not None:
            for x in arr:
                if x.lower() in name:
                    return score
        return 0

    addScore = udf(doAddScore, IntegerType())

    def doAddScoreAll(plus, minus):
        # NULL-tolerant sum of the plus/minus aggregates (unused in this job).
        if plus is None and minus is not None:
            return minus
        if minus is None and plus is not None:
            return plus
        if minus is not None and plus is not None:
            return plus + minus
        return 0

    addScoreAll = udf(doAddScoreAll, IntegerType())

    def do_check_null(val1, val2):
        # COALESCE(val1, val2): first non-NULL of the two, else 0.
        if val1 is None and val2 is not None:
            return val2
        if val2 is None and val1 is not None:
            return val1
        if val1 is not None and val2 is not None:
            return val1
        return 0

    check_data_null = udf(do_check_null, StringType())

    # Characters stripped from the word lists before splitting: [ " ] space ;
    special_str = '["] ;'
    splitWord = udf(lambda x: doSplitWord(x))

    ########## top_quiz_attempts
    dyf_top_quiz_attempts = glueContext.create_dynamic_frame.from_catalog(
        database="moodle", table_name="top_quiz_attempts")
    dyf_top_quiz_attempts = dyf_top_quiz_attempts.select_fields(
        ['_key', 'id', 'timestart', 'quiz'])
    dyf_top_quiz_attempts = dyf_top_quiz_attempts.resolveChoice(
        specs=[('_key', 'cast:long')])
    print dyf_top_quiz_attempts.count()
    dyf_top_quiz_attempts.show(2)

    # NOTE: incremental-load bookkeeping (read the saved max `_key` flag from
    # s3a://dtsodin/flag/flag_tu_vung_result_ai.parquet and filter `_key` above
    # it) is currently disabled; the job re-reads everything past `timestamp`.

    dyf_top_quiz_attempts = Filter.apply(
        frame=dyf_top_quiz_attempts, f=lambda x: x["timestart"] >= timestamp)
    print dyf_top_quiz_attempts.count()
    dyf_top_quiz_attempts.show()

    if dyf_top_quiz_attempts.count() > 0:
        ########## dyf_top_user: maps moodle user id -> student_id
        dyf_top_user = glueContext.create_dynamic_frame.from_catalog(
            database="moodle", table_name="do_top_user")
        dyf_top_user = dyf_top_user.select_fields(
            ['id', 'student_id']).rename_field('id', 'top_user_id')
        ######### top_question: question id -> name (drives the scoring rules)
        dyf_top_question = glueContext.create_dynamic_frame.from_catalog(
            database="moodle", table_name="top_question")
        dyf_top_question = dyf_top_question.select_fields(
            ['id', 'name']).rename_field('id', 'quest_id')
        ######### top_result_ai: per-attempt right/wrong word lists
        dyf_top_result_ai = glueContext.create_dynamic_frame.from_catalog(
            database="moodle", table_name="top_result_ai")
        dyf_top_result_ai = dyf_top_result_ai.select_fields([
            'question_id', 'attempt_id', 'user_id', 'ratio', 'right_word',
            'wrong_word'
        ])

        # JOIN and FILTER the tables: results + questions + attempts + users.
        dyf_join01 = Join.apply(dyf_top_result_ai, dyf_top_question,
                                'question_id', 'quest_id')
        dyf_join02 = Join.apply(dyf_join01, dyf_top_quiz_attempts,
                                'attempt_id', 'id')
        # Quizzes 7, 9, 918 are excluded — presumably test/demo quizzes; confirm.
        dyf_join02 = Filter.apply(frame=dyf_join02,
                                  f=lambda x: x["quiz"] not in [7, 9, 918])
        dyf_join02 = Join.apply(dyf_join02, dyf_top_user, 'user_id',
                                'top_user_id')

        df_study = dyf_join02.toDF()
        df_study.cache()
        if (df_study.count() > 0):
            try:
                # Strip decoration characters from the word lists; raw data
                # looks like ["house","her","to"] or a bare "environmental".
                df_study = df_study.select('quiz', 'name', 'student_id',
                                           'timestart', 'right_word',
                                           'wrong_word')
                df_study = df_study.withColumn("right_word_new", f.translate(df_study.right_word, special_str, '')) \
                    .withColumn("wrong_word_new", f.translate(df_study.wrong_word, special_str, ''))

                # Split the comma-joined list into an array, then explode to
                # one row per word:  [house, her] -> row(house), row(her).
                df_study_right = df_study.withColumn(
                    "right_word_list", f.split(df_study.right_word_new, ','))
                df_study_right = df_study_right.withColumn(
                    "right", f.explode(df_study_right.right_word_list))
                df_study_right = df_study_right.select('quiz', 'name',
                                                       'student_id',
                                                       'timestart', 'right')
                # Lowercase so the join against learning_object_name matches.
                df_study_right = df_study_right.withColumn(
                    "right", f.lower(f.col("right")))
                dyf_study_right = DynamicFrame.fromDF(df_study_right,
                                                      glueContext,
                                                      "dyf_study_right")

                ## Learning Object: word -> learning_object_id lookup table.
                dyf_learning_object = glueContext.create_dynamic_frame.from_catalog(
                    database="nvn_knowledge", table_name="learning_object")
                dyf_learning_object = dyf_learning_object.select_fields(
                    ['learning_object_id', 'learning_object_name'])
                df_learning_object = dyf_learning_object.toDF()
                # convert to lowercase to match the lowercased words above
                df_learning_object = df_learning_object.withColumn(
                    "learning_object_name",
                    f.lower(f.col("learning_object_name")))
                dyf_learning_object = DynamicFrame.fromDF(
                    df_learning_object, glueContext, "dyf_learning_object")

                dyf_knowledge_right = Join.apply(dyf_study_right,
                                                 dyf_learning_object, 'right',
                                                 'learning_object_name')

                # Credit points for right words: +10 knowledge always, and +10
                # per extra dimension whose name-list matches the question.
                df_knowledge_right = dyf_knowledge_right.toDF()
                df_knowledge_right.cache()
                df_knowledge_right = df_knowledge_right.withColumn("knowledge", f.lit(10)) \
                    .withColumn("comprehension",
                                addScore(df_knowledge_right.name, f.lit('state_right'), f.lit('comprehension'))) \
                    .withColumn("application",
                                addScore(df_knowledge_right.name, f.lit('state_right'), f.lit('application'))) \
                    .withColumn("analysis",
                                addScore(df_knowledge_right.name, f.lit('state_right'), f.lit('analysis'))) \
                    .withColumn("synthesis",
                                addScore(df_knowledge_right.name, f.lit('state_right'), f.lit('synthesis'))) \
                    .withColumn("evaluation", f.lit(0)) \
                    .withColumn("date_id", from_unixtime(df_knowledge_right['timestart'], 'yyyyMMdd'))
                # Aggregate the credits per (student, day, learning object).
                df_knowledge_right = df_knowledge_right.groupby(
                    'student_id', 'date_id', 'learning_object_id').agg(
                        f.count('knowledge').alias("count_plus"),
                        f.sum('knowledge').alias("knowledge_plus"),
                        f.sum('comprehension').alias("comprehension_plus"),
                        f.sum('application').alias("application_plus"),
                        f.sum('analysis').alias("analysis_plus"),
                        f.sum('synthesis').alias("synthesis_plus"),
                        f.sum('evaluation').alias("evaluation_plus"))
                df_knowledge_right = df_knowledge_right.where(
                    'student_id is not null')
                # NOTE: a disabled ApplyMapping/Redshift sink for the "right"
                # aggregate used to live here (table temp_right_wrong_learning_object).
                # END credit points for right words
                #################################################

                # Deduct points for wrong words: same pipeline as right words,
                # rule: -5 points per wrong word.
                df_study_wrong = df_study.withColumn(
                    "wrong_word_list", f.split(df_study.wrong_word_new, ','))
                # Explode the array into one row per wrong word.
                df_study_wrong = df_study_wrong.withColumn(
                    "wrong", f.explode(df_study_wrong.wrong_word_list))
                # convert to lowercase
                df_study_wrong = df_study_wrong.withColumn(
                    "wrong", f.lower(f.col("wrong")))
                df_study_wrong = df_study_wrong.select('quiz', 'name',
                                                       'student_id',
                                                       'timestart', 'wrong')
                dyf_study_wrong = DynamicFrame.fromDF(df_study_wrong,
                                                      glueContext,
                                                      "dyf_study_wrong")
                ## Learning Object lookup for wrong words.
                dyf_knowledge_wrong = Join.apply(dyf_study_wrong,
                                                 dyf_learning_object, 'wrong',
                                                 'learning_object_name')
                df_knowledge_wrong = dyf_knowledge_wrong.toDF()
                df_knowledge_wrong.cache()
                df_knowledge_wrong = df_knowledge_wrong.withColumn("knowledge", f.lit(-5)) \
                    .withColumn("comprehension",
                                addScore(df_knowledge_wrong.name, f.lit('state_wrong'), f.lit('comprehension'))) \
                    .withColumn("application",
                                addScore(df_knowledge_wrong.name, f.lit('state_wrong'), f.lit('application'))) \
                    .withColumn("analysis",
                                addScore(df_knowledge_wrong.name, f.lit('state_wrong'), f.lit('analysis'))) \
                    .withColumn("synthesis",
                                addScore(df_knowledge_wrong.name, f.lit('state_wrong'), f.lit('synthesis'))) \
                    .withColumn("evaluation", f.lit(0)) \
                    .withColumn("date_id", from_unixtime(df_knowledge_wrong['timestart'], 'yyyyMMdd'))
                # Aggregate the deductions; rename keys so the outer join
                # below can tell the two sides apart.
                df_knowledge_wrong = df_knowledge_wrong.groupby('student_id', 'date_id', 'learning_object_id').agg(
                    f.count('knowledge').alias("count_minus"),
                    f.sum('knowledge').alias("knowledge_minus"),
                    f.sum('comprehension').alias("comprehension_minus"),
                    f.sum('application').alias("application_minus"),
                    f.sum('analysis').alias("analysis_minus"),
                    f.sum('synthesis').alias("synthesis_minus"),
                    f.sum('evaluation').alias("evaluation_minus"))\
                    .withColumnRenamed('student_id', 'student_id_wrong') \
                    .withColumnRenamed('date_id', 'date_id_wrong') \
                    .withColumnRenamed('learning_object_id', 'learning_object_id_wrong')
                df_knowledge_wrong = df_knowledge_wrong.where(
                    'student_id_wrong is not null')

                # Full outer join: a (student, day, learning object) may have
                # only credits, only deductions, or both.
                df_knowledge = df_knowledge_right.join(
                    df_knowledge_wrong,
                    (df_knowledge_right['student_id'] ==
                     df_knowledge_wrong['student_id_wrong']) &
                    (df_knowledge_right['date_id'] ==
                     df_knowledge_wrong['date_id_wrong']) &
                    (df_knowledge_right['learning_object_id'] ==
                     df_knowledge_wrong['learning_object_id_wrong']), 'outer')
                # COALESCE the key columns from whichever side is non-NULL.
                df_knowledge = df_knowledge.withColumn("user_id",
                                                       check_data_null(df_knowledge.student_id, df_knowledge.student_id_wrong)) \
                    .withColumn("learning_object_id",
                                check_data_null(df_knowledge.learning_object_id, df_knowledge.learning_object_id_wrong)) \
                    .withColumn("created_date_id",
                                check_data_null(df_knowledge.date_id, df_knowledge.date_id_wrong)) \
                    .withColumn("source_system", f.lit('top_result_ai')) \
                    .withColumn("lu_id", f.lit(0))

                dyf_knowledge = DynamicFrame.fromDF(df_knowledge, glueContext,
                                                    "df_knowledge")

                # Select and type the columns written to the history dataset.
                applymapping2 = ApplyMapping.apply(
                    frame=dyf_knowledge,
                    mappings=[
                        ("user_id", 'string', 'student_id', 'long'),
                        ("learning_object_id", "string", "learning_object_id",
                         "long"),
                        ("knowledge_plus", "long", "knowledge_plus", "long"),
                        ("comprehension_plus", "long", "comprehension_plus",
                         "long"),
                        ("application_plus", "long", "application_plus",
                         "long"),
                        ("analysis_plus", "long", "analysis_plus", "long"),
                        ("synthesis_plus", "long", "synthesis_plus", "long"),
                        ("evaluation_plus", "long", "evaluation_plus", "long"),
                        ("knowledge_minus", "long", "knowledge_minus", "long"),
                        ("comprehension_minus", "long", "comprehension_minus",
                         "long"),
                        ("application_minus", "long", "application_minus",
                         "long"),
                        ("analysis_minus", "long", "analysis_minus", "long"),
                        ("synthesis_minus", "long", "synthesis_minus", "long"),
                        ("evaluation_minus", "long", "evaluation_minus",
                         "long"),
                        ("count_plus", "long", "plus_number", "long"),
                        ("count_minus", "long", "minus_number", "long"),
                        ("source_system", "string", "source_system", "string"),
                        ("created_date_id", "string", "created_date_id",
                         "long"),
                        ("lu_id", "int", "lu_type", "long")
                    ])
                applymapping2.printSchema()
                applymapping2.show(20)
                resolvechoice2 = ResolveChoice.apply(
                    frame=applymapping2,
                    choice="make_cols",
                    transformation_ctx="resolvechoice3")
                dropnullfields2 = DropNullFields.apply(
                    frame=resolvechoice2, transformation_ctx="dropnullfields2")
                print('COUNT df_knowledge: ', dropnullfields2.count())
                dropnullfields2.printSchema()
                dropnullfields2.show(2)

                print('START WRITE TO S3-------------------------')
                datasink6 = glueContext.write_dynamic_frame.from_options(
                    frame=dropnullfields2,
                    connection_type="s3",
                    connection_options={
                        "path":
                        "s3://dtsodin/nvn_knowledge/mapping_lo_student_history_v2/",
                        "partitionKeys": ["created_date_id", "source_system"]
                    },
                    format="parquet",
                    transformation_ctx="datasink6")
                print('END WRITE TO S3-------------------------')
                # NOTE: a disabled Redshift sink (mapping_lo_student_history)
                # used to live here.
                # END deduct points for wrong words

                # Release the cached DataFrames.
                df_study.unpersist()
                df_knowledge_right.unpersist()
                df_knowledge_wrong.unpersist()
            except Exception as e:
                print(
                    "###################### Exception ##########################"
                )
                print(e)

        # Write the incremental-load flag: persist the max `_key` seen so a
        # future (currently disabled) run can resume from it.
        mdl_dyf_top_quiz_attempts = dyf_top_quiz_attempts.toDF()
        flag = mdl_dyf_top_quiz_attempts.agg({
            "_key": "max"
        }).collect()[0][0]
        flag_data = [flag]
        df = spark.createDataFrame(flag_data, "long").toDF('flag')
        # Overwrite the flag file on S3.
        df.write.parquet(
            "s3a://dtsodin/flag/flag_tu_vung_result_ai.parquet",
            mode="overwrite")
def main():
    """Import Hive metastore entities into the Glue Data Catalog, either from
    S3 json dumps (from-s3) or directly from the metastore DB (from-jdbc)."""
    # arguments
    from_s3 = 'from-s3'
    from_jdbc = 'from-jdbc'
    parser = argparse.ArgumentParser(prog=sys.argv[0])
    parser.add_argument(
        '-m', '--mode', required=True, choices=[from_s3, from_jdbc],
        help='Choose to migrate metastore either from JDBC or from S3')
    parser.add_argument(
        '-c', '--connection-name', required=False,
        help='Glue Connection name for Hive metastore JDBC connection')
    parser.add_argument(
        '-R', '--region', required=False,
        help='AWS region of target Glue DataCatalog, default to "us-east-1"')
    parser.add_argument(
        '-d', '--database-prefix', required=False,
        help='Optional prefix for database names in Glue DataCatalog')
    parser.add_argument(
        '-t', '--table-prefix', required=False,
        help='Optional prefix for table name in Glue DataCatalog')
    parser.add_argument(
        '-D', '--database-input-path', required=False,
        help='An S3 path containing json files of metastore database entities')
    parser.add_argument(
        '-T', '--table-input-path', required=False,
        help='An S3 path containing json files of metastore table entities')
    parser.add_argument(
        '-P', '--partition-input-path', required=False,
        help='An S3 path containing json files of metastore partition entities'
    )

    options = get_options(parser, sys.argv)

    # Enforce the per-mode option constraints before touching Spark.
    if options['mode'] == from_s3:
        validate_options_in_mode(
            options=options, mode=from_s3,
            required_options=[
                'database_input_path', 'table_input_path',
                'partition_input_path'
            ],
            not_allowed_options=['table_prefix'])
    elif options['mode'] == from_jdbc:
        validate_options_in_mode(
            options=options, mode=from_jdbc,
            required_options=['connection_name'],
            not_allowed_options=[
                'database_input_path', 'table_input_path',
                'partition_input_path'
            ])
    else:
        raise AssertionError('unknown mode ' + options['mode'])
    validate_aws_regions(options['region'])

    # spark env
    (conf, sc, sql_context) = get_spark_env()
    glue_context = GlueContext(sc)

    # launch job
    if options['mode'] == from_s3:
        metastore_import_from_s3(
            sql_context=sql_context,
            glue_context=glue_context,
            db_input_dir=options['database_input_path'],
            tbl_input_dir=options['table_input_path'],
            parts_input_dir=options['partition_input_path'],
            db_prefix=options.get('database_prefix') or '',
            datacatalog_name='datacatalog',
            region=options.get('region') or 'us-east-1')
    elif options['mode'] == from_jdbc:
        # FIX: extract_jdbc_conf was previously called twice (the first
        # result was discarded); fetch the connection configuration once.
        connection = glue_context.extract_jdbc_conf(options['connection_name'])
        metastore_full_migration(
            sc=sc,
            sql_context=sql_context,
            glue_context=glue_context,
            connection=connection,
            db_prefix=options.get('database_prefix') or '',
            table_prefix=options.get('table_prefix') or '',
            datacatalog_name='datacatalog',
            region=options.get('region') or 'us-east-1')
def main():
    # ETL: turn NativeTalk speaking-practice logs into per-student/per-day
    # self-study sessions ("tu hoc"), stage them in Redshift, and fold them
    # into mapping_changed_status_student via the sink's postactions SQL.
    sc = SparkContext()
    glueContext = GlueContext(sc)
    spark = glueContext.spark_session
    # Timestamps are interpreted in Vietnam local time.
    spark.conf.set("spark.sql.session.timeZone", "GMT+07:00")

    # get dynamic frame source
    #------------------------------------------------------------------------------------------------------------------#
    dyf_native_talk = glueContext.create_dynamic_frame.from_catalog(database='native_talk',
                                                                    table_name='native_talk_history_log_api')
    dyf_native_talk = dyf_native_talk.resolveChoice(specs=[('id', 'cast:long')])

    # Incremental load: read the last processed max id from the flag file and
    # skip older rows; on any failure fall back to a full read (best-effort).
    try:
        df_flag = spark.read.parquet("s3a://dts-odin/flag/student_status/tu_hoc/tu_hoc_native_talk.parquet")
        read_from_index = df_flag.collect()[0]['flag']
        print('read from index: ', read_from_index)
        dyf_native_talk = Filter.apply(frame=dyf_native_talk,
                                       f=lambda x: x["id"] > read_from_index)
    except:
        print('read flag file error ')

    dyf_native_talk = dyf_native_talk.select_fields(
        ['id', 'learning_date', 'speaking_dialog_score', 'username', 'updated_time'])

    # Cache the source once; it is re-counted and re-used for the flag below.
    dy_cache = dyf_native_talk.toDF()
    dy_cache = dy_cache.cache()
    dyf_native_talk = DynamicFrame.fromDF(dy_cache, glueContext, 'dyf_native_talk')
    print('dy_cache------------')
    dy_cache.printSchema()
    print('dy_cache: ', dy_cache.count())
    dy_cache.show(2)

    #------------------------------------------------------------------------------------------------------------------#
    if (dyf_native_talk.count() > 0):
        # Keep only usable rows: non-empty username and learning_date, and a
        # present speaking_dialog_score.
        dyf_native_talk = Filter.apply(frame=dyf_native_talk,
                                       f=lambda x: x["username"] is not None and x["username"] != ''
                                                   and x["speaking_dialog_score"] is not None
                                                   and x["learning_date"] is not None and x["learning_date"] != '')
        if (dyf_native_talk.count() > 0):
            # NativeTalk username -> CRM contact_id mapping table.
            dyf_nt_account_mapping = glueContext.create_dynamic_frame.from_catalog(database='native_talk',
                                                                                   table_name='native_talk_account_mapping')
            dyf_nt_account_mapping = dyf_nt_account_mapping.select_fields(
                ['contact_id', 'username']).rename_field('username', 'nativetalk_user')
            dy_cache_2 = dyf_nt_account_mapping.toDF()
            dy_cache_2 = dy_cache_2.cache()
            dyf_nt_account_mapping = DynamicFrame.fromDF(dy_cache_2, glueContext, 'dyf_nt_account_mapping')
            dyf_nt_account_mapping = Filter.apply(frame=dyf_nt_account_mapping,
                                                  f=lambda x: x["nativetalk_user"] is not None
                                                              and x["nativetalk_user"] != '')

            # Attach contact_id to each practice-log row.
            join = Join.apply(dyf_native_talk, dyf_nt_account_mapping, 'username', 'nativetalk_user')
            if(join.count() > 0):
                df_nativetalk = join.toDF()
                # 'sogio' = hours per session; fixed at 5 minutes (0.083333 h).
                df_nativetalk = df_nativetalk.withColumn('sogio', f.lit(0.083333))
                # 'id_time' = learning date formatted as a yyyyMMdd date key.
                df_nativetalk = df_nativetalk.withColumn('id_time',
                                                         from_unixtime(
                                                             unix_timestamp(df_nativetalk.learning_date,
                                                                            "yyyy-MM-dd"), "yyyyMMdd"))
                df_nativetalk = df_nativetalk.where("contact_id IS NOT NULL")
                data_nativetalk = DynamicFrame.fromDF(df_nativetalk, glueContext, 'data_nativetalk')
                data_nativetalk = data_nativetalk.resolveChoice(specs=[('sogio', 'cast:float')])

                print('data_nativetalk----------')
                data_nativetalk.printSchema()

                # Build the "learning performance" fact: per student per day,
                # total study hours (sum(sogio)) and session count
                # (count(contact_id)).
                df_hieusuathoctap = data_nativetalk.toDF()
                df_hieusuathoctap = df_hieusuathoctap.groupby('contact_id', 'id_time').agg(f.sum('sogio'),
                                                                                           f.count('contact_id'))
                # 400 = self-study-type id for NativeTalk in the warehouse;
                # presumably matches a dimension table — TODO confirm.
                df_hieusuathoctap = df_hieusuathoctap.withColumn('tu_hoc_type_id', f.lit(400))
                data_hieusuathoctap = DynamicFrame.fromDF(df_hieusuathoctap, glueContext, 'data_hieusuathoctap')
                data_hieusuathoctap = data_hieusuathoctap.resolveChoice(specs=[('sum(sogio)', 'cast:double')])

                print('data_hieusuathoctap::data_hieusuathoctap::data_hieusuathoctap------------------------------------------')
                data_hieusuathoctap.printSchema();

                # Map aggregate columns to staging-table names/types
                # (soca = session count, sogio = hours).
                applymapping2 = ApplyMapping.apply(frame=data_hieusuathoctap,
                                                   mappings=[("contact_id", "string", "contact_id", "string"),
                                                             ("id_time", 'string', 'id_time', 'bigint'),
                                                             ("count(contact_id)", 'long', 'soca', 'int'),
                                                             ("sum(sogio)", 'double', 'sogio', 'double'),
                                                             ("tu_hoc_type_id", 'int', "tu_hoc_type_id", "int")])
                resolvechoice2 = ResolveChoice.apply(frame=applymapping2, choice="make_cols",
                                                     transformation_ctx="resolvechoice2")
                dropnullfields2 = DropNullFields.apply(frame=resolvechoice2, transformation_ctx="dropnullfields2")
                print('dropnullfields2 number: ', dropnullfields2.count())

                # Load into a Redshift staging table; the postactions SQL then
                # inserts status rows (to_status_id=53) into
                # mapping_changed_status_student and drops the staging table.
                datasink2 = glueContext.write_dynamic_frame.from_jdbc_conf(frame=dropnullfields2,
                                                                           catalog_connection="glue_redshift",
                                                                           connection_options={"dbtable": "temp_staging_lich_su_tu_hoc_native_talk",
                                                                                               "database": "dts_odin",
                                                                                               "postactions": """INSERT into mapping_changed_status_student(user_id, change_status_date_id, to_status_id, measure1, measure2)
                                                                                                                         SELECT um.user_id, hwb.id_time, 53, hwb.soca, round(hwb.sogio, 4) FROM temp_staging_lich_su_tu_hoc_native_talk hwb
                                                                                                                         LEFT JOIN user_map um
                                                                                                                              ON um.source_type = 1
                                                                                                                              AND um.source_id = hwb.contact_id;
                                                                                                                         DROP TABLE IF EXISTS public.temp_staging_lich_su_tu_hoc_native_talk
                                                                                                                    """
                                                                                               },
                                                                           redshift_tmp_dir="s3n://dts-odin/temp/tu-hoc/hwb/",
                                                                           transformation_ctx="datasink2")

                # Persist the new high-water mark (max source id) so the next
                # run only reads newer rows.
                df_datasource = dyf_native_talk.toDF()
                flag = df_datasource.agg({"id": "max"}).collect()[0][0]
                print('flag: ', flag)
                flag_data = [flag]
                df = spark.createDataFrame(flag_data, "long").toDF('flag')
                df.write.parquet("s3a://dts-odin/flag/student_status/tu_hoc/tu_hoc_native_talk.parquet",
                                 mode="overwrite")
                # Release the cached source DataFrames.
                dy_cache.unpersist()
                dy_cache_2.unpersist()
def main():
    """ETL job: derive each contact's first successful care call ("A1"
    status) that happened after the contact's payment record ("A0"), and
    load the result into Redshift.

    Sources (Glue catalog):
        tig_advisor.care_call            -- raw care-call log
        tig_advisor.student_contact_phone -- phone -> contact_id mapping
        poss.nvn_poss_lich_su_dong_tien  -- payment history (ngay_a0 = creation date)
    Sink:
        Redshift staging table temp_ls_dong_tien_a1_v3, whose postactions
        merge into mapping_changed_status_student and drop the staging table.
    A high-water-mark flag (max _key ingested) is written to S3 as parquet.
    """
    sc = SparkContext()
    glueContext = GlueContext(sc)
    spark = glueContext.spark_session
    # job = Job(glueContext)
    # job.init(args['JOB_NAME'], args)
    # NOTE(review): "GMT+07:00" is not an IANA zone id; presumably intended
    # as Indochina time -- confirm Spark parses it as +07:00.
    spark.conf.set("spark.sql.session.timeZone", "GMT+07:00")

    # Load the raw care-call log.
    dyf_care_call = glueContext.create_dynamic_frame.from_catalog(
        database='tig_advisor',
        table_name='care_call')
    dyf_care_call = dyf_care_call.resolveChoice(specs=[('_key', 'cast:long')])

    # print schema and select fields
    print('original schema')
    dyf_care_call.printSchema()
    dyf_care_call.show(10)

    # Incremental-load guard (currently disabled): would keep only records
    # whose _key is greater than the flag previously stored in S3.
    # try:
    #     df_flag = spark.read.parquet("s3a://dts-odin/flag/student_status/temp_ls_a1_dong_tien_tc.parquet")
    #     read_from_index = df_flag.collect()[0]['flag']
    #     print('read from index: ', read_from_index)
    #     dyf_care_call = Filter.apply(frame=dyf_care_call,
    #                                  f=lambda x: x["_key"] > read_from_index)
    # except:
    #     print('read flag file error ')
    # print('the number of new contacts: ', dyf_care_call.count())

    dyf_care_call = dyf_care_call.select_fields(
        ['_key', 'id', 'phone', 'duration', 'call_status',
         'time_created']).rename_field('time_created', 'call_date')

    # De-duplicate calls by id and cache the DataFrame: it is re-used below
    # (filters, joins, counts, and the final flag computation).
    dy_source_care_call_cache = dyf_care_call.toDF()
    dy_source_care_call_cache = dy_source_care_call_cache.dropDuplicates(
        ['id'])
    dy_source_care_call_cache = dy_source_care_call_cache.cache()
    dyf_care_call = DynamicFrame.fromDF(dy_source_care_call_cache,
                                        glueContext, 'dyf_care_call')
    # if (dyf_care_call.count() > 0):

    # Keep only answered calls (> 30 seconds) with a usable phone and date.
    dyf_care_call = Filter.apply(
        frame=dyf_care_call,
        f=lambda x: x["phone"] is not None and x["phone"] != ''
                    and (x["call_status"] == 'success'
                         or x["call_status"] == 'call_success')
                    and x["call_date"] is not None and x["call_date"] != ''
                    and x["duration"] is not None and x["duration"] > 30)
    # print('dyf_care_call::corrcect')
    # NOTE(review): each .count() on a DynamicFrame triggers a full pass;
    # this count is computed twice in a row (print + if) -- consider reusing.
    print('dyf_care_call number', dyf_care_call.count())

    if (dyf_care_call.count() > 0):
        # Map each call's phone number to a contact_id.
        dyf_ad_contact_phone = glueContext.create_dynamic_frame.from_catalog(
            database='tig_advisor',
            table_name='student_contact_phone')
        dyf_ad_contact_phone = dyf_ad_contact_phone.select_fields(
            ['phone', 'contact_id'])
        dyf_ad_contact_phone = Filter.apply(
            frame=dyf_ad_contact_phone,
            f=lambda x: x["phone"] is not None and x["phone"] != '' and x[
                "contact_id"] is not None and x["contact_id"] != '')
        print('dyf_ad_contact_phone::schema')
        dyf_ad_contact_phone.printSchema()

        # dyf_advisor_ip_phone = glueContext.create_dynamic_frame.from_catalog(database='callcenter',
        #                                                                      table_name='advisor_ip_phone')
        #
        # dyf_advisor_ip_phone = Filter.apply(frame=dyf_advisor_ip_phone,
        #                                     f=lambda x: x["ip_phone"] is not None and x["ip_phone"] != '')
        #
        # -------------------------------------------------------------------------------------------------------------#
        join_call_contact = Join.apply(dyf_care_call, dyf_ad_contact_phone,
                                       'phone', 'phone')

        # join_call_contact = join_call_contact.select_fields(['id_time', 'answertime', 'calldate', 'phonenumber_correct', 'calldate', 'ipphone', 'contact_id'])
        # print('join_call_contact::schema------------')
        join_call_contact.printSchema()
        join_call_contact.show(2)
        print('join: ', join_call_contact.count())

        # -------------------------------------------------------------------------------------------------------------#
        # Payment history ("A0" events); ngay_tao (creation date) is renamed
        # to ngay_a0.
        dyf_source_ls_dong_tien = glueContext.create_dynamic_frame.from_catalog(
            database='poss', table_name='nvn_poss_lich_su_dong_tien')
        dyf_source_ls_dong_tien = Filter.apply(
            frame=dyf_source_ls_dong_tien,
            f=lambda x: x["contact_id"] is not None and x["contact_id"] != ''
                        and x["ngay_thanhtoan"] is not None and x["ngay_thanhtoan"
                        ] != '')
        dyf_source_ls_dong_tien = dyf_source_ls_dong_tien.select_fields([
            '_key', 'id', 'contact_id', 'ngay_thanhtoan', 'ngay_tao', 'makh'
        ]).rename_field('ngay_tao', 'ngay_a0')

        # De-duplicate payments by id and cache (re-used by the join below).
        dy_source_ls_dt_cache = dyf_source_ls_dong_tien.toDF()
        dy_source_ls_dt_cache = dy_source_ls_dt_cache.dropDuplicates(['id'])
        dy_source_ls_dt_cache = dy_source_ls_dt_cache.cache()
        dyf_source_ls_dong_tien = DynamicFrame.fromDF(
            dy_source_ls_dt_cache, glueContext, 'dyf_source_ls_dong_tien')

        # Attach each successful call to the contact's payment record.
        join_call_contact_ao = Join.apply(join_call_contact,
                                          dyf_source_ls_dong_tien,
                                          'contact_id', 'contact_id')

        # print('join_call_contact_ao::schema------------')
        join_call_contact_ao.printSchema()
        join_call_contact_ao.show(2)
        print('join: ', join_call_contact_ao.count())

        # join_call_contact_ao = join_call_contact_ao.resolveChoice(specs=[('calldate', 'cast:timestamp'),
        #                                                                  ('ngay_a0', 'cast:timestamp')])

        # Keep only calls made strictly after the payment date (A1 after A0).
        join_call_contact_ao = Filter.apply(
            frame=join_call_contact_ao,
            f=lambda x: x["call_date"] is not None and x[
                "ngay_a0"] is not None and x["call_date"] > x["ngay_a0"])

        print(
            'join_call_contact_ao::after filter calldate > ngay_a0------------'
        )
        # join_call_contact_ao.printSchema()
        join_call_contact_ao.show(2)
        print('join_call_contact_ao: ', join_call_contact_ao.count())

        # Get the first successful welcome call per contact/order: earliest
        # qualifying call_date becomes ngay_a1, with id_time = yyyyMMdd.
        df_join_call_contact_ao = join_call_contact_ao.toDF()
        df_join_call_contact_ao = df_join_call_contact_ao.groupby(
            'contact_id', 'makh').agg(f.min('call_date').alias("ngay_a1"))
        df_join_call_contact_ao = df_join_call_contact_ao.withColumn(
            'id_time',
            from_unixtime(
                unix_timestamp(df_join_call_contact_ao.ngay_a1,
                               "yyyy-MM-dd HH:mm:ss"), "yyyyMMdd"))

        dyf_result = DynamicFrame.fromDF(df_join_call_contact_ao, glueContext,
                                         'dyf_result')

        print('dyf_result------------')
        # join_call_contact_ao.printSchema()
        dyf_result.show(2)
        print('dyf_result: ', dyf_result.count())

        # Select the output fields.
        applymapping1 = ApplyMapping.apply(
            frame=dyf_result,
            mappings=[("contact_id", "string", "contact_id", "string"),
                      ("id_time", "string", "id_time", "bigint"),
                      ("makh", "int", "makh", "int"),
                      ("ngay_a1", "string", "ngay_a1", "timestamp")])

        resolvechoice2 = ResolveChoice.apply(
            frame=applymapping1,
            choice="make_cols",
            transformation_ctx="resolvechoice2")
        dropnullfields3 = DropNullFields.apply(
            frame=resolvechoice2, transformation_ctx="dropnullfields3")
        # print('dropnullfields3::printSchema')
        # dropnullfields3.printSchema()
        # dropnullfields3.show(2)

        # Write the data to Redshift.  The postactions SQL merges the staging
        # rows into mapping_changed_status_student and drops the staging
        # table afterwards.
        # NOTE(review): the SELECT uses '+' for string concatenation; in
        # Redshift string concat is '||' -- confirm this postaction actually
        # builds the description as intended.
        datasink4 = glueContext.write_dynamic_frame.from_jdbc_conf(
            frame=dropnullfields3,
            catalog_connection="glue_redshift",
            connection_options={
                "dbtable": "temp_ls_dong_tien_a1_v3",
                "database": "dts_odin",
                "postactions": """ INSERT into mapping_changed_status_student(description, user_id, change_status_date_id, to_status_id, timestamp1)
                                   SELECT 'contact_id: ' + temp_a1.contact_id +' - makh: ' + temp_a1.makh, um.user_id ,temp_a1.id_time, 2, temp_a1.ngay_a1
                                   FROM temp_ls_dong_tien_a1_v3 temp_a1
                                   LEFT JOIN user_map um ON um.source_type = 1 AND um.source_id = temp_a1.contact_id ;
                                   DROP TABLE IF EXISTS public.temp_ls_dong_tien_a1_v3;
                                   CALL update_a1_exception_from_eg() """
            },
            redshift_tmp_dir="s3n://dts-odin/temp/temp_ls_dong_tien/v2",
            transformation_ctx="datasink4")

        # Record the high-water mark (max _key ingested this run) to S3 so
        # the (currently commented-out) incremental guard above can use it.
        df_datasource = dyf_care_call.toDF()
        flag = df_datasource.agg({"_key": "max"}).collect()[0][0]
        flag_data = [flag]
        df = spark.createDataFrame(flag_data, "long").toDF('flag')
        df.write.parquet(
            "s3a://dts-odin/flag/student_status/temp_ls_a1_dong_tien_tc.parquet",
            mode="overwrite")
        dy_source_care_call_cache.unpersist()
def main():
    """ETL job: aggregate weekly/monthly test results per student.

    Reads student_behavior (behavior_id 22 = weekly test, 23 = monthly test)
    and student_test_mark from the od_student_behavior catalog, aggregates
    grades per student per week (best grade per question category) and per
    month (best total-attempt grade), then writes the combined result to the
    student_learning_history table in Redshift.  A transformed_at
    high-water-mark flag is written to S3 as parquet.
    """
    sc = SparkContext()
    glueContext = GlueContext(sc)
    spark = glueContext.spark_session
    spark.conf.set("spark.sql.session.timeZone", "GMT+07:00")

    # Work in Vietnam local time; truncate "today" to midnight.
    ho_chi_minh_timezone = pytz.timezone('Asia/Ho_Chi_Minh')
    today = datetime.now(ho_chi_minh_timezone)
    today = today.replace(hour=0, minute=0, second=0, microsecond=0)
    # NOTE(review): replace() with no arguments is a no-op copy and this
    # variable is never used below -- looks like leftover/unfinished code.
    first_day_of_month = today.replace()
    print('today: ', today)
    yesterday = today - timedelta(1)
    print('yesterday: ', yesterday)
    today_id = long(today.strftime("%Y%m%d"))
    yesterday_id = long(yesterday.strftime("%Y%m%d"))
    # NOTE(review): "%s" (epoch seconds) is a platform-specific strftime
    # extension, not portable Python -- confirm the Glue runtime supports it.
    today_id_0h00 = long(today.strftime("%s"))
    print('today_id: ', today_id)
    print('yesterday_id: ', yesterday_id)
    print('today_id_0h00: ', today_id_0h00)

    date_end = 1573232400L

    # Question-category labels as they appear in student_test_mark.
    General = 'General'
    Vocabulary = 'Vocabulary'
    Grammar = 'Grammar'
    Speaking = 'Speaking'
    Listening = 'Listening'
    Phrasal_Verb = 'Phrasal'  # matched by substring in doCheckClassID
    Pronunciation = 'Pronunciation'
    # (original scratch notes, garbled in source -- apparent category ids:
    #  Speaking 2, General 3, Phrasal Verb 4, Grammar 5, Vocabulary 6,
    #  Pronunciation 7, Listening ...)

    is_dev = True
    is_just_monthly_exam = False
    is_limit_test = False
    start_load_date = 0L

    BEHAVIOR_ID_TEST_TUAN = 22L   # weekly test
    BEHAVIOR_ID_TEST_THANG = 23L  # monthly test
    PERIOD_DAYLY = 1L
    PERIOD_WEEKLY = 2L
    PERIOD_MONTHLY = 3L

    def doCheckClassID(code):
        """Map a question-category label to its learning-object class id
        (61-67); return None for unrecognised categories."""
        if code is None:
            return None
        code = str(code)
        if code == General:
            return 61L
        if code == Vocabulary:
            return 62L
        if code == Grammar:
            return 63L
        if code == Speaking:
            return 64L
        if code == Listening:
            return 65L
        if code == Pronunciation:
            return 66L
        # 'Phrasal Verb'-style labels: substring match on 'Phrasal'.
        if Phrasal_Verb in code:
            return 67L
        return None

    check_class_id = udf(doCheckClassID, LongType())

    # ------------------------------------------------------------------------------------------------------------------#
    # Only read the weekly/monthly-test partitions.
    my_partition_predicate = "(behavior_id=='22' or behavior_id=='23')"
    dyf_student_behavior = glueContext.create_dynamic_frame.from_catalog(
        database="od_student_behavior",
        table_name="student_behavior",
        push_down_predicate=my_partition_predicate)
    dyf_student_behaviors = dyf_student_behavior.resolveChoice(
        specs=[('behavior_id', 'cast:long'), ('transformed_at', 'cast:long')])

    # Incremental-load guard (currently disabled): would keep only records
    # whose transformed_at is greater than the flag stored in S3.
    # try:
    #     # read the high-water-mark flag from s3
    #     df_flag = spark.read.parquet("s3://dts-odin/flag/flag_student_testing_history.parquet")
    #     max_key = df_flag.collect()[0]['flag']
    #     print('read from index: ', max_key)
    #
    #     # compare _key of the datasource with the flag; keep values with key > flag
    #     dyf_student_behaviors = Filter.apply(frame=dyf_student_behaviors, f=lambda x: x['transformed_at'] > max_key)
    # except:
    #     print('read flag error ')

    if dyf_student_behaviors.count() > 0:
        # Keep identified rows inside the load window
        # [start_load_date, today 00:00).
        dyf_student_behaviors = Filter.apply(
            frame=dyf_student_behaviors,
            f=lambda x: x["student_behavior_id"] is not None and x[
                "student_id"] is not None
            # and x["behavior_id"] in [BEHAVIOR_ID_TEST_TUAN,
            #                          BEHAVIOR_ID_TEST_THANG
            #                          ]
            and start_load_date <= x["student_behavior_date"] < today_id_0h00)

        number_dyf_student_behavior = dyf_student_behaviors.count()
        print('number_dyf_student_behavior after filtering: ',
              number_dyf_student_behavior)
        if number_dyf_student_behavior == 0:
            return

        dyf_student_behavior = dyf_student_behaviors \
            .select_fields(['student_behavior_id',
                            'student_behavior_date',
                            'student_id',
                            'behavior_id'])

        df_student_behavior = dyf_student_behavior.toDF()
        df_student_behavior = df_student_behavior.drop_duplicates(
            ['student_behavior_id'])

        if is_limit_test:
            df_student_behavior = df_student_behavior.limit(1000)

        # Cache: the frame feeds both the dev prints and the join below.
        df_student_behavior = df_student_behavior.repartition('behavior_id')
        df_student_behavior.cache()
        student_behavior_number = df_student_behavior.count()

        if is_dev:
            print('dy_student_behavior')
            print('student_behavior_number: ', student_behavior_number)
            df_student_behavior.printSchema()
            df_student_behavior.show(3)

        if student_behavior_number == 0:
            return

        # ------------------------------------------------------------------------------------------------------------------#
        # Per-question marks for the same behaviors.
        dyf_student_test_mark = glueContext.create_dynamic_frame.from_catalog(
            database="od_student_behavior",
            table_name="student_test_mark",
            push_down_predicate=my_partition_predicate)
        dyf_student_test_mark = dyf_student_test_mark.select_fields(
            ['student_behavior_id', 'question_category', 'grade'])

        # dyf_student_test_mark = Filter.apply(frame=dyf_student_test_mark,
        #                                      f=lambda x: x["behavior_id"] in [BEHAVIOR_ID_TEST_TUAN,
        #                                                                       BEHAVIOR_ID_TEST_THANG
        #                                                                       ]
        #                                      )

        df_student_test_mark = dyf_student_test_mark.toDF()
        number_student_test_mark = df_student_test_mark.count()
        if is_dev:
            print('df_student_test_mark')
            print('df_student_test_mark: ', number_student_test_mark)
            df_student_test_mark.printSchema()
            df_student_test_mark.show(3)

        if number_student_test_mark == 0:
            return

        # Attach marks to behaviors (left join keeps behaviors without marks;
        # their grade is filled with 0 later for the monthly branch).
        df_student_behavior_mark = df_student_behavior\
            .join(df_student_test_mark,
                  on='student_behavior_id',
                  how='left')

        if is_dev:
            print('df_student_behavior_mark')
            print('df_student_behavior_mark: ', df_student_behavior_mark)
            df_student_behavior_mark.printSchema()
            df_student_behavior_mark.show(3)

        df_student_behavior_mark = df_student_behavior_mark.dropDuplicates([
            'student_behavior_id', 'student_id', 'behavior_id',
            'question_category'
        ])

        # Split weekly vs monthly tests and derive the aggregation period ids
        # (yyyyww for weeks, yyyyMM for months) from the behavior timestamp.
        df_student_behavior_mark_week = df_student_behavior_mark\
            .filter(df_student_behavior_mark.behavior_id == BEHAVIOR_ID_TEST_TUAN)
        df_student_behavior_mark_month = df_student_behavior_mark.filter(
            df_student_behavior_mark.behavior_id == BEHAVIOR_ID_TEST_THANG)

        df_student_behavior_mark_week = df_student_behavior_mark_week\
            .withColumn('agg_week_id',
                        from_unixtime(df_student_behavior_mark_week.student_behavior_date, "yyyyww"))
        df_student_behavior_mark_month = df_student_behavior_mark_month \
            .withColumn('agg_month_id',
                        from_unixtime(df_student_behavior_mark_month.student_behavior_date, "yyyyMM"))

        if is_dev:
            print('df_student_behavior_mark_week')
            df_student_behavior_mark_week.printSchema()
            df_student_behavior_mark_week.show(3)
            print('df_student_behavior_mark_month')
            df_student_behavior_mark_month.printSchema()
            df_student_behavior_mark_month.show(3)

        # Weekly aggregate: best (max) grade per student/week/class.
        df_student_behavior_mark_week = df_student_behavior_mark_week \
            .withColumn("class_id",
                        check_class_id(df_student_behavior_mark_week.question_category))
        df_student_behavior_mark_week_agg = df_student_behavior_mark_week.groupby(
            'student_id', 'agg_week_id', 'class_id').agg(
                f.round(f.max(df_student_behavior_mark_week.grade)).cast(
                    'long').alias('grade_total'),
                f.lit(PERIOD_WEEKLY).alias('period_type_id'),
                f.lit(None).cast('string').alias('agg_date_id'),
                f.lit(None).cast('string').alias('agg_month_id'))

        # Monthly aggregate: total grade per attempt, then best attempt per
        # student/month.  class_id 68 marks the whole monthly exam.
        df_student_behavior_mark_month = df_student_behavior_mark_month.na.fill(
            {'grade': 0})
        df_student_behavior_mark_month = df_student_behavior_mark_month.groupby(
            'student_behavior_id').agg(
                f.first('student_id').alias('student_id'),
                f.first('agg_month_id').alias('agg_month_id'),
                f.round(
                    f.sum('grade')).cast('long').alias('grade_total_attempt'),
            )
        df_student_behavior_mark_month_agg = df_student_behavior_mark_month.groupby(
            'student_id', 'agg_month_id').agg(
                f.max(
                    df_student_behavior_mark_month.grade_total_attempt).alias(
                        'grade_total'),
                f.lit(PERIOD_MONTHLY).alias('period_type_id'),
                f.lit(None).cast('string').alias('agg_date_id'),
                f.lit(None).cast('string').alias('agg_week_id'),
                f.lit(68L).cast('long').alias('class_id'))

        # Align column order with the weekly aggregate before the union
        # (union is positional).
        df_student_behavior_mark_month_agg = df_student_behavior_mark_month_agg.select(
            'student_id', 'agg_week_id', 'class_id', 'grade_total',
            'period_type_id', 'agg_date_id', 'agg_month_id')

        if is_dev:
            print('df_student_behavior_mark_week_agg')
            df_student_behavior_mark_week_agg.printSchema()
            df_student_behavior_mark_week_agg.show(3)
            print('df_student_behavior_mark_month_agg')
            df_student_behavior_mark_month_agg.printSchema()
            df_student_behavior_mark_month_agg.show(3)

        df_student_behavior_mark_agg = df_student_behavior_mark_week_agg.union(
            df_student_behavior_mark_month_agg)

        if is_dev:
            print('df_student_behavior_mark_agg')
            df_student_behavior_mark_agg.printSchema()
            df_student_behavior_mark_agg.show(3)

        dyf_student_behavior_mark_agg = DynamicFrame.fromDF(
            df_student_behavior_mark_agg, glueContext,
            'dyf_student_behavior_mark_agg')
        # Drop rows whose question category could not be mapped to a class id.
        dyf_student_behavior_mark_agg = Filter.apply(
            frame=dyf_student_behavior_mark_agg,
            f=lambda x: x["class_id"] is not None)
        dyf_student_behavior_mark_agg.show(3)

        apply_output_month = ApplyMapping.apply(
            frame=dyf_student_behavior_mark_agg,
            mappings=[("student_id", "long", "student_id", "long"),
                      ("class_id", "long", "class_id", "long"),
                      ("period_type_id", "long", "period_type_id", "long"),
                      ("agg_date_id", "string", "created_date_id", "long"),
                      ("agg_week_id", "string", "created_week_id", "long"),
                      ("agg_month_id", "string", "created_month_id", "long"),
                      ("grade_total", "long", "measure1", "long")])
        dfy_output_month = ResolveChoice.apply(
            frame=apply_output_month,
            choice="make_cols",
            transformation_ctx="resolvechoice2")

        datasink4 = glueContext.write_dynamic_frame.from_jdbc_conf(
            frame=dfy_output_month,
            catalog_connection="nvn_knowledge",
            connection_options={
                "dbtable": "student_learning_history",
                "database": "nvn_knowledge_v2"
            },
            redshift_tmp_dir=
            "s3n://dtsodin/temp/nvn_knowledge_v2/student_learning_history",
            transformation_ctx="datasink4")

        # Record the high-water mark (max transformed_at seen) so the
        # (currently commented-out) incremental guard above can use it.
        df_temp = dyf_student_behaviors.toDF()
        flag = df_temp.agg({"transformed_at": "max"}).collect()[0][0]
        flag_data = [flag]
        df = spark.createDataFrame(flag_data, "long").toDF('flag')
        # overwrite the flag parquet in s3
        df.write.parquet(
            "s3a://dts-odin/flag/flag_student_testing_history.parquet",
            mode="overwrite")
def main(): glueContext = GlueContext(SparkContext.getOrCreate()) spark = glueContext.spark_session student_id_unavailable = 0L package_endtime_unavailable = 99999999999L package_starttime_unavailable = 0L def do_check_endtime(val): if val is not None: return val return 4094592235 check_endtime_null = udf(do_check_endtime, LongType()) ########## dyf_student_contact dyf_student_contact = glueContext.create_dynamic_frame.from_catalog( database="tig_advisor", table_name="student_contact") dyf_student_contact = dyf_student_contact.select_fields([ '_key', 'contact_id', 'student_id', 'advisor_id', 'level_study', 'time_lms_created', 'time_created' ]) dyf_student_contact = dyf_student_contact.resolveChoice( specs=[('time_lms_created', 'cast:long')]) dyf_student_contact = Filter.apply( frame=dyf_student_contact, f=lambda x: x['student_id'] is not None and x[ 'contact_id'] is not None and x['time_lms_created'] is not None) dyf_student_level = glueContext.create_dynamic_frame.from_catalog( database="tig_advisor", table_name="log_student_level_study") dyf_student_level = dyf_student_level.select_fields( ['_key', 'contact_id', 'level_current', 'level_modified', 'time_created']) \ .rename_field('contact_id', 'contact_id1') \ .rename_field('time_created', 'time_level_created') # try: # df_flag = spark.read.parquet("s3://dtsodin/flag/flag_log_student_level.parquet") # max_key = df_flag.collect()[0]['flag'] # print("max_key: ", max_key) # # Chi lay nhung ban ghi lon hon max_key da luu, ko load full # dyf_student_level = Filter.apply(frame=dyf_student_level, f=lambda x: x["_key"] > max_key) # except: # print('read flag file error ') if dyf_student_level.count() > 0: dyf_mapping_class_lms = glueContext.create_dynamic_frame.from_catalog( database="nvn_knowledge", table_name="mapping_class_lms") dyf_mapping_class_lms = dyf_mapping_class_lms.select_fields( ['level_lms', 'level_list_master']) dyf_learning_object_class = glueContext.create_dynamic_frame.from_catalog( 
database="nvn_knowledge", table_name="learning_object_class") dyf_learning_object_class = dyf_learning_object_class.select_fields( ['class_id', 'class_name']) dyf_student_contact_level = Join.apply(dyf_student_contact, dyf_student_level, 'contact_id', 'contact_id1') df_student_contact_level = dyf_student_contact_level.toDF() df_student_contact_level = df_student_contact_level \ .select('student_id', 'level_current', 'level_modified', 'time_created', 'time_level_created') df_count_contact_level = df_student_contact_level \ .groupby('student_id').agg(f.count('time_level_created').alias("count").cast('long')) dyf_count_contact_level = DynamicFrame.fromDF( df_count_contact_level, glueContext, "dyf_count_contact_level") dyf_count_contact_level = Filter.apply(frame=dyf_count_contact_level, f=lambda x: x['count'] == 2L) if dyf_count_contact_level.count() > 0: df_count_contact_level = dyf_count_contact_level.toDF() print df_count_contact_level.count() df_count_contact_level = df_count_contact_level.select( 'student_id') list_filer = [ list(row) for row in df_count_contact_level.collect() ] else: list_filer = [''] dyf_student_contact_level = DynamicFrame.fromDF( df_student_contact_level, glueContext, "dyf_student_contact_level") dyf_student_contact_level = Filter.apply( frame=dyf_student_contact_level, f=lambda x: x['student_id'] not in list_filer) # or x['student_id'] == '29439' dyf_student_contact_level.count() dyf_student_contact_level.printSchema() df_student_contact_level = dyf_student_contact_level.toDF() df_contact_level_first = df_student_contact_level.groupby('student_id').agg( f.min("time_level_created").alias("min_time_level_created")) \ .withColumnRenamed('student_id', 'min_student_id') df_contact_level_last = df_student_contact_level.groupby('student_id').agg( f.max("time_level_created").alias("max_time_level_created")) \ .withColumnRenamed('student_id', 'max_student_id') df_contact_level_first = df_contact_level_first \ .join(df_student_contact_level, 
(df_contact_level_first.min_student_id == df_student_contact_level.student_id) & (df_contact_level_first.min_time_level_created == df_student_contact_level.time_level_created), "left") df_contact_level_last = df_contact_level_last \ .join(df_student_contact_level, (df_contact_level_last.max_student_id == df_student_contact_level.student_id) & (df_contact_level_last.max_time_level_created == df_student_contact_level.time_level_created), "left") df_contact_level_last = df_contact_level_last \ .select('student_id', 'level_modified', 'time_level_created') df_contact_level_last = df_contact_level_last \ .withColumnRenamed('time_level_created', 'start_date') \ .withColumnRenamed('level_modified', 'level_study') \ .withColumn('end_date', f.lit(4094592235).cast('long')) df_contact_level_last = df_contact_level_last \ .select('student_id', 'level_study', 'start_date', 'end_date') df_contact_level_first = df_contact_level_first \ .select('student_id', 'level_current', 'time_created', 'time_level_created') df_contact_level_first = df_contact_level_first \ .withColumnRenamed('level_current', 'level_study') \ .withColumnRenamed('time_created', 'start_date') \ .withColumnRenamed('time_level_created', 'end_date') df_contact_level_first = df_contact_level_first \ .select('student_id', 'level_study', 'start_date', 'end_date') df_contact_level_first.show() df_contact_level_last.show(), print "END FIRST_LAST" df_student_contact_level01 = df_student_contact_level \ .select('student_id', 'level_current', 'level_modified', 'time_level_created') print "df_student_contact_level" df_student_contact_level01.show(20) df_student_contact_level02 = df_student_contact_level01 df_student_contact_level02 = df_student_contact_level02 \ .withColumnRenamed('student_id', 'student_id_temp') \ .withColumnRenamed('level_current', 'level_current_temp') \ .withColumnRenamed('level_modified', 'level_modified_temp') \ .withColumnRenamed('time_level_created', 'time_level_created_temp') 
df_student_contact_level02.show(20) df_join_student_contact_level = df_student_contact_level01 \ .join(df_student_contact_level02, (df_student_contact_level01.student_id == df_student_contact_level02.student_id_temp) & (df_student_contact_level01.level_current == df_student_contact_level02.level_modified_temp) & (df_student_contact_level01.time_level_created > df_student_contact_level02.time_level_created_temp), "left") df_join_student_contact_level.show(100) df_join_student_contact_level = df_join_student_contact_level \ .groupby('student_id', 'level_current', 'time_level_created_temp') \ .agg(f.min("time_level_created").alias("time_level_created")) df_join_student_contact_level = df_join_student_contact_level \ .withColumnRenamed('time_level_created_temp', 'start_date') \ .withColumnRenamed('time_level_created', 'end_date') \ .withColumnRenamed('level_current', 'level_study') df_join_student_contact_level = df_join_student_contact_level.where( 'start_date is not null') df_join_student_contact_level.count() df_join_student_contact_level.show() df_union_first_and_middle_contact = df_contact_level_first.union( df_join_student_contact_level) df_union_all_contact = df_union_first_and_middle_contact.union( df_contact_level_last) print df_union_all_contact.count() df_union_all_contact.printSchema() df_union_all_contact.show() df_union_all_contact = df_union_all_contact.withColumn( "user_id", f.lit(None).cast('long')) dyf_student_contact_level = DynamicFrame.fromDF( df_union_all_contact, glueContext, "dyf_student_contact_level") dyf_join_all0 = Join.apply(dyf_mapping_class_lms, dyf_student_contact_level, "level_lms", "level_study") dyf_join_all = Join.apply(dyf_join_all0, dyf_learning_object_class, "level_list_master", "class_name") df_join_all = dyf_join_all.toDF() print "df_join_all ", df_join_all.count() df_join_all = df_join_all.dropDuplicates() dyf_join_all = DynamicFrame.fromDF(df_join_all, glueContext, "dyf_join_all") print "dyf_join_all ", dyf_join_all.count() 
dyf_join_all.printSchema() dyf_join_all.show(10) applymapping = ApplyMapping.apply( frame=dyf_join_all, mappings=[("student_id", "string", "student_id", "string"), ("user_id", "long", "user_id", "long"), ("class_id", "long", "class_id", "long"), ("start_date", "int", "start_date", "long"), ("end_date", 'long', 'end_date', 'long'), ("level_study", 'string', 'level_study', 'string')]) resolvechoice = ResolveChoice.apply( frame=applymapping, choice="make_cols", transformation_ctx="resolvechoice2") dropnullfields = DropNullFields.apply( frame=resolvechoice, transformation_ctx="dropnullfields") datasink5 = glueContext.write_dynamic_frame.from_jdbc_conf( frame=dropnullfields, catalog_connection="glue_redshift", connection_options={ "dbtable": "mapping_class_student", "database": "dts_odin" }, redshift_tmp_dir="s3n://dtsodin/temp1/mapping_class_student/", transformation_ctx="datasink5") print('START WRITE TO S3-------------------------') s3 = boto3.resource('s3') bucket = s3.Bucket('dtsodin') bucket.objects.filter( Prefix="student_behavior/student_level/").delete() s3 = boto3.client('s3') bucket_name = "dtsodin" directory_name = "student_behavior/student_level/" # it's name of your folders s3.put_object(Bucket=bucket_name, Key=directory_name) ###### log_student_level = glueContext.create_dynamic_frame.from_options( connection_type="redshift", connection_options={ "url": "jdbc:redshift://datashine-dev.c4wxydftpsto.ap-southeast-1.redshift.amazonaws.com:5439/dts_odin", "user": "******", "password": "******", "dbtable": "mapping_class_student", "redshiftTmpDir": "s3n://dtsodin/temp1/mapping_class_student/" }) datasink6 = glueContext.write_dynamic_frame.from_options( frame=log_student_level, connection_type="s3", connection_options={ "path": "s3://dtsodin/student_behavior/student_level/" }, format="parquet", transformation_ctx="datasink6") print('END WRITE TO S3-------------------------') # ghi flag # lay max key trong data source datasource = dyf_student_level.toDF() flag = 
datasource.agg({"_key": "max"}).collect()[0][0] flag_data = [flag] df = spark.createDataFrame(flag_data, "long").toDF('flag') # ghi de _key vao s3 df.write.parquet("s3a://dtsodin/flag/flag_log_student_level.parquet", mode="overwrite") ############################################################################################## dyf_student_advisor = glueContext.create_dynamic_frame.from_catalog( database="tig_advisor", table_name="log_change_assignment_advisor") dyf_student_advisor = dyf_student_advisor.select_fields( ['_key', 'id', 'contact_id', 'advisor_id_old', 'advisor_id_new', 'created_at'])\ .rename_field('contact_id', 'contact_id1') # try: # df_flag = spark.read.parquet("s3://dtsodin/flag/flag_log_student_advisor.parquet") # max_key = df_flag.collect()[0]['flag'] # print("max_key: ", max_key) # # Chi lay nhung ban ghi lon hon max_key da luu, ko load full # dyf_student_advisor = Filter.apply(frame=dyf_student_advisor, f=lambda x: x["_key"] > max_key) # except: # print('read flag file error ') if dyf_student_advisor.count() > 0: df_student_advisor = dyf_student_advisor.toDF() # df_student_advisor.printSchema() # df_student_advisor.show() df_student_advisor = df_student_advisor.withColumn( 'created_at_1', unix_timestamp(df_student_advisor.created_at, "yyyy-MM-dd HH:mm:ss").cast(IntegerType())) df_student_contact = dyf_student_contact.toDF() df_student_contact_advisor = df_student_contact.join( df_student_advisor, df_student_contact.contact_id == df_student_advisor.contact_id1) dyf_student_contact_advisor = DynamicFrame.fromDF( df_student_contact_advisor, glueContext, "dyf_student_contact_advisor") dyf_student_contact_advisor = dyf_student_contact_advisor.select_fields( [ 'student_id', 'advisor_id', 'advisor_id_old', 'advisor_id_new', 'created_at_1', 'time_created' ]) df_student_contact_advisor = dyf_student_contact_advisor.toDF() df_student_contact_advisor = df_student_contact_advisor.dropDuplicates( ) df_advisor_temp = df_student_contact_advisor 
df_advisor_temp = df_advisor_temp \ .withColumnRenamed('student_id', 'student_id_temp') \ .withColumnRenamed('advisor_id', 'advisor_id_temp') \ .withColumnRenamed('advisor_id_old', 'advisor_id_old_temp') \ .withColumnRenamed('advisor_id_new', 'advisor_id_new_temp') \ .withColumnRenamed('created_at_1', 'created_at_temp') \ .withColumnRenamed('time_created', 'time_created_temp') df_join_data = df_student_contact_advisor \ .join(df_advisor_temp, (df_student_contact_advisor.student_id == df_advisor_temp.student_id_temp) & (df_student_contact_advisor.advisor_id_new == df_advisor_temp.advisor_id_old_temp) & (df_student_contact_advisor.created_at_1 <= df_advisor_temp.created_at_temp), "left") df_join_data = df_join_data.groupby('student_id', 'advisor_id_new', 'created_at_1') \ .agg(f.min("created_at_temp").alias("created_at_temp")) df_join_data = df_join_data.withColumn( "end_time", check_endtime_null(df_join_data.created_at_temp)) df_join_data = df_join_data.dropDuplicates() dyf_student_contact_advisor = DynamicFrame.fromDF( df_join_data, glueContext, "dyf_student_contact_advisor") dyf_student_contact_advisor.printSchema() applymapping = ApplyMapping.apply( frame=dyf_student_contact_advisor, mappings=[("student_id", "string", "student_id", "string"), ("advisor_id_new", "string", "advisor_id", "string"), ("created_at_1", 'int', 'start_time', 'long'), ("end_time", 'long', 'end_time', 'long')]) resolvechoice = ResolveChoice.apply( frame=applymapping, choice="make_cols", transformation_ctx="resolvechoice2") dropnullfields = DropNullFields.apply( frame=resolvechoice, transformation_ctx="dropnullfields") datasink5 = glueContext.write_dynamic_frame.from_jdbc_conf( frame=dropnullfields, catalog_connection="glue_redshift", connection_options={ "dbtable": "fact_log_change_advisor", "database": "dts_odin" }, redshift_tmp_dir="s3n://dtsodin/temp1/fact_log_change_advisor/", transformation_ctx="datasink5") print('START WRITE TO S3-------------------------') s3 = boto3.resource('s3') 
bucket = s3.Bucket('dtsodin') bucket.objects.filter( Prefix="student_behavior/student_advisor/").delete() s3 = boto3.client('s3') bucket_name = "dtsodin" directory_name = "student_behavior/student_advisor/" # it's name of your folders s3.put_object(Bucket=bucket_name, Key=directory_name) ###### log_student_advisor = glueContext.create_dynamic_frame.from_options( connection_type="redshift", connection_options={ "url": "jdbc:redshift://datashine-dev.c4wxydftpsto.ap-southeast-1.redshift.amazonaws.com:5439/dts_odin", "user": "******", "password": "******", "dbtable": "fact_log_change_advisor", "redshiftTmpDir": "s3n://dtsodin/temp1/fact_log_change_advisor/" }) datasink6 = glueContext.write_dynamic_frame.from_options( frame=log_student_advisor, connection_type="s3", connection_options={ "path": "s3://dtsodin/student_behavior/student_advisor/" }, format="parquet", transformation_ctx="datasink6") print('END WRITE TO S3-------------------------') datasource = dyf_student_advisor.toDF() flag = datasource.agg({"_key": "max"}).collect()[0][0] # convert kieu dl flag_data = [flag] df = spark.createDataFrame(flag_data, "long").toDF('flag') # ghi de _key vao s3 df.write.parquet("s3a://dtsodin/flag/flag_log_student_advisor.parquet", mode="overwrite") ############################################################################################## dyf_student_product = glueContext.create_dynamic_frame.from_catalog( database="tig_market", table_name="tpe_enduser_used_product") dyf_student_product = dyf_student_product.select_fields([ '_key', 'id', 'contact_id', 'product_id', 'starttime', 'endtime', 'timecreated', 'balance_used', 'status', 'timemodified' ]).rename_field('contact_id', 'contact_id1') dyf_student_product = dyf_student_product.resolveChoice( specs=[('_key', 'cast:long')]) try: # # doc moc flag tu s3 df_flag = spark.read.parquet( "s3://dtsodin/flag/flag_tpe_enduser_used_product.parquet") start_read = df_flag.collect()[0]['flag'] print('read from index: ', start_read) # so 
sanh _key datasource voi flag, lay nhung gia tri co key > flag # dyf_student_product = Filter.apply(frame=dyf_student_product, f=lambda x: x['_key'] > start_read) except: print('read flag file error ') dyf_student_product = Filter.apply( frame=dyf_student_product, f=lambda x: x["product_id"] is not None and x["product_id"] != '' and x["contact_id1"] is not None and x["contact_id1"] != '') if (dyf_student_product.count() > 0): # try: dyf_product_details = glueContext.create_dynamic_frame.from_catalog( database="tig_market", table_name="tpe_invoice_product_details") dyf_product_details = dyf_product_details.select_fields( ['id', 'cat_code', 'package_cost', 'actual_cost', 'package_time']).rename_field('id', 'id_goi') # loc data # df_goi = datasource1.toDF() # df_goi = df_goi.where("id_goi <> '' and id_goi is not null") # data_goi = DynamicFrame.fromDF(df_goi, glueContext, "data_goi") dyf_product_details = Filter.apply( frame=dyf_product_details, f=lambda x: x["id_goi"] is not None and x["id_goi"] != '') # df_lsmua = dyf_student_product.toDF() # df_lsmua = df_lsmua.where("product_id is not null and product_id <> ''") # df_lsmua = df_lsmua.where("contact_id is not null and contact_id <> ''") # data_lsmua = DynamicFrame.fromDF(df_lsmua, glueContext, "data_lsmua") # map ls mua goi vs thong tin chi tiet cua goi dyf_student_package = Join.apply(dyf_student_product, dyf_product_details, 'product_id', 'id_goi') dyf_join_data = Join.apply(dyf_student_package, dyf_student_contact, 'contact_id1', 'contact_id') print "dyf_student_package count1: ", dyf_join_data.count() df_join_data = dyf_join_data.toDF() # drop duplicate rows # df_join_data = df_join_data.dropDuplicates() df_join_data = df_join_data.groupby('id', 'student_id') \ .agg(f.first('cat_code').alias('cat_code'), f.first('starttime').alias('starttime'), f.first('endtime').alias('endtime'), f.first('timecreated').alias('timecreated')) dyf_student_package = DynamicFrame.fromDF(df_join_data, glueContext, 
"dyf_student_package") # convert data # df_student_package = dyf_student_package.toDF() # df_student_package = df_student_package.withColumn('ngay_kich_hoat', from_unixtime(df_student_package.starttime)) \ # .withColumn('ngay_het_han', from_unixtime(df_student_package.endtime)) \ # .withColumn('ngay_mua', from_unixtime(df_student_package.timecreated)) \ # .withColumn('id_time', from_unixtime(df_student_package.starttime, "yyyyMMdd")) \ # .withColumn('ngay_thay_doi', from_unixtime(df_student_package.timemodified)) # df_student_package = df_student_package.dropDuplicates(['contact_id', 'product_id']) # # data = DynamicFrame.fromDF(df_student_package, glueContext, "data") # df_student_package.printSchema() # df_student_package.show() # chon cac truong va kieu du lieu day vao db applyMapping = ApplyMapping.apply( frame=dyf_student_package, mappings=[("id", "string", "id", "string"), ("student_id", "string", "student_id", "string"), ("cat_code", 'string', 'package_code', 'string'), ("starttime", "int", "start_time", "int"), ("endtime", "int", "end_time", "int"), ("timecreated", "int", "timecreated", "int")]) resolvechoice = ResolveChoice.apply( frame=applyMapping, choice="make_cols", transformation_ctx="resolvechoice2") dropnullfields = DropNullFields.apply( frame=resolvechoice, transformation_ctx="dropnullfields3") dropnullfields.show(10) # ghi data vao db datasink4 = glueContext.write_dynamic_frame.from_jdbc_conf( frame=dropnullfields, catalog_connection="glue_redshift", connection_options={ "dbtable": "fact_student_package", "database": "dts_odin" }, redshift_tmp_dir="s3n://dtsodin/temp1/fact_student_package/", transformation_ctx="datasink4") print('START WRITE TO S3-------------------------') s3 = boto3.resource('s3') bucket = s3.Bucket('dtsodin') bucket.objects.filter( Prefix="student_behavior/student_package/").delete() s3 = boto3.client('s3') bucket_name = "dtsodin" directory_name = "student_behavior/student_package/" s3.put_object(Bucket=bucket_name, 
Key=directory_name) ###### log_student_package = glueContext.create_dynamic_frame.from_options( connection_type="redshift", connection_options={ "url": "jdbc:redshift://datashine-dev.c4wxydftpsto.ap-southeast-1.redshift.amazonaws.com:5439/dts_odin", "user": "******", "password": "******", "dbtable": "fact_student_package", "redshiftTmpDir": "s3n://dtsodin/temp1/fact_student_package/" }) datasink6 = glueContext.write_dynamic_frame.from_options( frame=log_student_package, connection_type="s3", connection_options={ "path": "s3://dtsodin/student_behavior/student_package/" }, format="parquet", transformation_ctx="datasink6") print('END WRITE TO S3-------------------------') # ghi flag # lay max key trong data source datasource = dyf_student_product.toDF() flag = datasource.agg({"_key": "max"}).collect()[0][0] flag_data = [flag] df = spark.createDataFrame(flag_data, "long").toDF('flag') # ghi de _key vao s3 df.write.parquet( "s3a://dtsodin/flag/flag_tpe_enduser_used_product_log.parquet", mode="overwrite") ############################################################################################## def do_check_null(val): if val is not None: return val return 4094592235 check_data_null = udf(do_check_null, LongType()) dyf_tpe_enduser_used_product_history = glueContext.create_dynamic_frame.from_catalog( database="tig_market", table_name="tpe_enduser_used_product_history") dyf_tpe_enduser_used_product_history = dyf_tpe_enduser_used_product_history.select_fields( ['_key', 'contact_id', 'status_old', 'status_new', 'timecreated']) dyf_tpe_enduser_used_product_history = dyf_tpe_enduser_used_product_history.resolveChoice( specs=[('_key', 'cast:long')]) try: # # doc moc flag tu s3 df_flag = spark.read.parquet( "s3a://dtsodin/flag/flag_tpe_enduser_used_product_history.parquet") start_read = df_flag.collect()[0]['flag'] print('read from index: ', start_read) # so sanh _key datasource voi flag, lay nhung gia tri co key > flag # dyf_tpe_enduser_used_product_history = 
Filter.apply(frame=dyf_tpe_enduser_used_product_history, f=lambda x: x['_key'] > start_read) except: print('read flag file error ') dyf_tpe_enduser_used_product_history = Filter.apply( frame=dyf_tpe_enduser_used_product_history, f=lambda x: x["contact_id"] is not None and x["contact_id"] != '') if (dyf_tpe_enduser_used_product_history.count() > 0): df_tpe_enduser_used_product_history01 = dyf_tpe_enduser_used_product_history.toDF( ) df_tpe_enduser_used_product_history02 = dyf_tpe_enduser_used_product_history.toDF( ) df_tpe_enduser_used_product_history01 = df_tpe_enduser_used_product_history01\ .withColumnRenamed('timecreated', 'start_date') df_tpe_enduser_used_product_history02 = df_tpe_enduser_used_product_history02 \ .withColumnRenamed('timecreated', 'timecreated02') \ .withColumnRenamed('status_old', 'status_old02') \ .withColumnRenamed('status_new', 'status_new02') \ .withColumnRenamed('contact_id', 'contact_id02') df_tpe_enduser_used_product_history_join = df_tpe_enduser_used_product_history01.join( df_tpe_enduser_used_product_history02, (df_tpe_enduser_used_product_history01['contact_id'] == df_tpe_enduser_used_product_history02['contact_id02']) & (df_tpe_enduser_used_product_history01['status_new'] == df_tpe_enduser_used_product_history02['status_old02']) & (df_tpe_enduser_used_product_history01['start_date'] <= df_tpe_enduser_used_product_history02['timecreated02']), "left") df_tpe_enduser_used_product_history_join = df_tpe_enduser_used_product_history_join \ .withColumn("end_date", check_data_null(df_tpe_enduser_used_product_history_join.timecreated02)) df_tpe_enduser_used_product_history_join.printSchema() print df_tpe_enduser_used_product_history_join.count() df_tpe_enduser_used_product_history_join.show(10) dyf_tpe_enduser_product_history = DynamicFrame.fromDF( df_tpe_enduser_used_product_history_join, glueContext, "dyf_tpe_enduser_product_history") dyf_tpe_enduser_product_history = dyf_tpe_enduser_product_history.select_fields( [ 'contact_id', 
'status_old', 'status_new', 'start_date', 'end_date' ]) df_tpe_enduser_used_product_history = dyf_tpe_enduser_product_history.toDF( ) df_tpe_enduser_used_product_history_join.printSchema() print df_tpe_enduser_used_product_history_join.count() df_tpe_enduser_used_product_history_join.show(10) dyf_tpe_enduser_product_history = DynamicFrame.fromDF( df_tpe_enduser_used_product_history, glueContext, "dyf_tpe_enduser_product_history") # chon cac truong va kieu du lieu day vao db applyMapping = ApplyMapping.apply( frame=dyf_tpe_enduser_product_history, mappings=[("contact_id", "string", "contact_id", "string"), ("status_new", "string", "status_code", "string"), ("status_old", "string", "last_status_code", "string"), ("start_date", "int", "start_date", "long"), ("end_date", "long", "end_date", "long")]) resolvechoice = ResolveChoice.apply( frame=applyMapping, choice="make_cols", transformation_ctx="resolvechoice2") dropnullfields = DropNullFields.apply( frame=resolvechoice, transformation_ctx="dropnullfields") # ghi data vao db datasink4 = glueContext.write_dynamic_frame.from_jdbc_conf( frame=dropnullfields, catalog_connection="glue_redshift", connection_options={ "dbtable": "fact_log_student_status", "database": "dts_odin", }, redshift_tmp_dir= "s3n://dtsodin/temp1/tpe_enduser_used_product_history/", transformation_ctx="datasink4") print('START WRITE TO S3-------------------------') s3 = boto3.resource('s3') bucket = s3.Bucket('dtsodin') bucket.objects.filter( Prefix="student_behavior/student_status/").delete() s3 = boto3.client('s3') bucket_name = "dtsodin" directory_name = "student_behavior/student_status/" s3.put_object(Bucket=bucket_name, Key=directory_name) ###### log_student_status = glueContext.create_dynamic_frame.from_options( connection_type="redshift", connection_options={ "url": "jdbc:redshift://datashine-dev.c4wxydftpsto.ap-southeast-1.redshift.amazonaws.com:5439/dts_odin", "user": "******", "password": "******", "dbtable": "fact_log_student_status", 
"redshiftTmpDir": "s3n://dtsodin/temp1/fact_log_student_status/" }) datasink6 = glueContext.write_dynamic_frame.from_options( frame=log_student_status, connection_type="s3", connection_options={ "path": "s3://dtsodin/student_behavior/student_status/" }, format="parquet", transformation_ctx="datasink6") print('END WRITE TO S3-------------------------') # ghi flag # lay max key trong data source datasource = dyf_tpe_enduser_used_product_history.toDF() flag = datasource.agg({"_key": "max"}).collect()[0][0] flag_data = [flag] df = spark.createDataFrame(flag_data, "long").toDF('flag') # ghi de _key vao s3 df.write.parquet( "s3a://dtsodin/flag/flag_tpe_enduser_used_product_history.parquet", mode="overwrite")
def main():
    """Job A3: load student re-activation events into Redshift.

    Reads tig_market.tpe_enduser_used_product_history incrementally (by _key
    checkpoint), keeps transitions back to 'ACTIVED' from a suspended/expired
    state, and writes them to mapping_changed_status_student in Redshift with
    to_status_id = 107. On success the max _key is persisted back to S3 so the
    next run only sees newer rows.
    """
    glueContext = GlueContext(SparkContext.getOrCreate())
    spark = glueContext.spark_session

    # Source: product-usage status history from the Glue catalog.
    tpe_enduser_used_product_history = glueContext.create_dynamic_frame.from_catalog(
        database="tig_market",
        table_name="tpe_enduser_used_product_history")
    tpe_enduser_used_product_history = tpe_enduser_used_product_history.select_fields(
        [
            '_key', 'id', 'used_product_id', 'contact_id', 'status_new',
            'status_old', 'timecreated'
        ])
    tpe_enduser_used_product_history = tpe_enduser_used_product_history.resolveChoice(
        specs=[('_key', 'cast:long')])

    # Incremental load: only rows with _key greater than the stored checkpoint.
    # NOTE(review): unlike the sibling jobs in this file, this read is NOT
    # wrapped in try/except - a missing flag file aborts the whole job.
    df_flag = spark.read.parquet(
        "s3a://datashine-dev-redshift-backup/flag/flag_hvdkh_LS_A3.parquet")
    max_key = df_flag.collect()[0]['flag']
    tpe_enduser_used_product_history = Filter.apply(
        frame=tpe_enduser_used_product_history,
        f=lambda x: x["_key"] > max_key)
    print(tpe_enduser_used_product_history.count())
    if (tpe_enduser_used_product_history.count() > 0):
        # Keep only re-activation transitions: status_new == 'ACTIVED' coming
        # from SUSPENDED or EXPIRED. 'EXPRIED' is also matched - presumably a
        # known misspelling present in the source data; confirm before removing.
        tpe_enduser_used_product_history = Filter.apply(
            frame=tpe_enduser_used_product_history,
            f=lambda x: x["timecreated"] is not None and x["contact_id"] is not None
            and x["used_product_id"] is not None and x["status_new"] is not None
            and x["status_new"] == 'ACTIVED'
            and (x["status_old"] == 'SUSPENDED' or x["status_old"] == 'EXPIRED'
                 or x["status_old"] == 'EXPRIED'))
        if (tpe_enduser_used_product_history.count() > 0):
            try:
                tpe_enduser_used_product_history = tpe_enduser_used_product_history.resolveChoice(
                    specs=[('timecreated', 'cast:long')])
                df_tpe_enduser_used_product_history = tpe_enduser_used_product_history.toDF()
                # Activation date id (yyyyMMdd) derived from the unix timestamp.
                df_tpe_enduser_used_product_history = df_tpe_enduser_used_product_history.withColumn(
                    'ngay_kich_hoat',
                    from_unixtime(
                        df_tpe_enduser_used_product_history['timecreated'],
                        "yyyyMMdd"))
                # Millisecond timestamp; the misspelled column name 'timestemp'
                # is kept because the ApplyMapping below references it.
                df_tpe_enduser_used_product_history = df_tpe_enduser_used_product_history.withColumn(
                    'timestemp',
                    df_tpe_enduser_used_product_history['timecreated'] *
                    f.lit(1000))
                # Hard-coded target status id; the postactions SQL below also
                # keys on to_status_id = 107 for this transition type.
                df_tpe_enduser_used_product_history = df_tpe_enduser_used_product_history.withColumn(
                    'to_status_id', f.lit(107))
                data_tpe_enduser_used_product_history = DynamicFrame.fromDF(
                    df_tpe_enduser_used_product_history, glueContext,
                    "data_tpe_enduser_used_product_history")
                data_tpe_enduser_used_product_history = data_tpe_enduser_used_product_history.resolveChoice(
                    specs=[('timestemp', 'cast:long')])
                data_tpe_enduser_used_product_history.printSchema()
                data_tpe_enduser_used_product_history.show(3)
                applymapping1 = ApplyMapping.apply(
                    frame=data_tpe_enduser_used_product_history,
                    mappings=[("ngay_kich_hoat", "string",
                               "change_status_date_id", "long"),
                              ("to_status_id", "int", "to_status_id", "long"),
                              ("timestemp", "long", "timestamp1", "timestamp"),
                              ("contact_id", "string", "contact_id1", "string")
                              ])
                applymapping1.printSchema()
                applymapping1.show(20)
                resolvechoice2 = ResolveChoice.apply(
                    frame=applymapping1,
                    choice="make_cols",
                    transformation_ctx="resolvechoice2")
                dropnullfields3 = DropNullFields.apply(
                    frame=resolvechoice2, transformation_ctx="dropnullfields3")
                # Write to Redshift; the post-action backfills user_id from
                # user_map for the rows just inserted.
                datasink4 = glueContext.write_dynamic_frame.from_jdbc_conf(
                    frame=dropnullfields3,
                    catalog_connection="glue_redshift",
                    connection_options={
                        "dbtable": "mapping_changed_status_student",
                        "database": "dts_odin",
                        "postactions": """UPDATE mapping_changed_status_student SET user_id = ( SELECT user_id FROM user_map WHERE source_type = 1 AND source_id = mapping_changed_status_student.contact_id1 LIMIT 1 ) WHERE user_id IS NULL AND to_status_id = 107"""
                    },
                    redshift_tmp_dir="s3n://datashine-dwh/temp1/",
                    transformation_ctx="datasink4")
                # Checkpoint: persist the max _key from this batch so the next
                # run is incremental.
                datasourceTmp = tpe_enduser_used_product_history.toDF()
                flag = datasourceTmp.agg({"_key": "max"}).collect()[0][0]
                flag_data = [flag]
                df = spark.createDataFrame(flag_data, "long").toDF('flag')
                # Overwrite the _key flag file on S3.
                df.write.parquet(
                    "s3a://datashine-dev-redshift-backup/flag/flag_hvdkh_LS_A3.parquet",
                    mode="overwrite")
            except Exception as e:
                # NOTE(review): ANY failure above (including a failed Redshift
                # write) is reported as "No new data", which can mask real
                # errors; only the printed exception reveals the cause.
                print("No new data")
                print(e)
def main():
    """Aggregate per-day learning-object score history into Redshift.

    Reads nvn_knowledge.mapping_lo_student_history from the Glue catalog,
    keeps rows that carry a date_id and at least one non-zero score component,
    sums the six score components per (date_id, student_id,
    learning_object_id), and stages the result into the Redshift table
    mapping_lo_student_used. Post-actions then call proc_insert_tbhv(), append
    the staged rows back into mapping_lo_student_history, and drop the staging
    table.
    """
    glue_context = GlueContext(SparkContext.getOrCreate())
    spark = glue_context.spark_session

    dyf_history = glue_context.create_dynamic_frame.from_catalog(
        database="nvn_knowledge",
        table_name="mapping_lo_student_history"
    )
    print('Count:', dyf_history.count())

    # Keep only dated rows where at least one score measure is non-zero.
    score_columns = ('knowledge', 'comprehension', 'application',
                     'analysis', 'synthesis', 'evaluation')
    dyf_history = Filter.apply(
        frame=dyf_history,
        f=lambda x: x['date_id'] is not None
        and any(x[column] != 0 for column in score_columns))

    # Nothing new to aggregate - stop here.
    if dyf_history.count() <= 0:
        return

    print('START JOB---------------')
    df_scores = dyf_history.toDF()
    df_scores = df_scores.groupby('date_id', 'student_id',
                                  'learning_object_id').agg(
        f.sum("knowledge").alias("knowledge"),
        f.sum("comprehension").alias("comprehension"),
        f.sum("application").alias("application"),
        f.sum("analysis").alias("analysis"),
        f.sum("synthesis").alias("synthesis"),
        f.sum("evaluation").alias("evaluation"))
    df_scores.printSchema()
    df_scores.show()
    print('END JOB---------------')

    dyf_used = DynamicFrame.fromDF(df_scores, glue_context,
                                   "dyf_student_lo_init")
    dyf_used = ApplyMapping.apply(
        frame=dyf_used,
        mappings=[("student_id", "long", "student_id", "long"),
                  ("learning_object_id", "long", "learning_object_id", "long"),
                  ("date_id", "int", "date_id", "long"),
                  ("knowledge", "long", "knowledge", "long"),
                  ("comprehension", "long", "comprehension", "long"),
                  ("application", "long", "application", "long"),
                  ("analysis", "long", "analysis", "long"),
                  ("synthesis", "long", "synthesis", "long"),
                  ("evaluation", "long", "evaluation", "long")])
    dyf_used = ResolveChoice.apply(frame=dyf_used,
                                   choice="make_cols",
                                   transformation_ctx="resolvechoice2")
    dyf_used = DropNullFields.apply(
        frame=dyf_used,
        transformation_ctx="dyf_mapping_lo_student_used")

    # Stage into Redshift; the post-actions fold the staged rows back into the
    # history table and clean up the staging table.
    datasink5 = glue_context.write_dynamic_frame.from_jdbc_conf(
        frame=dyf_used,
        catalog_connection="glue_redshift",
        connection_options={
            "dbtable": "mapping_lo_student_used",
            "database": "dts_odin",
            "postactions": """ call proc_insert_tbhv(); INSERT INTO mapping_lo_student_history SELECT * FROM mapping_lo_student_used; DROP TABLE IF EXISTS mapping_lo_student_used """
        },
        redshift_tmp_dir="s3n://dts-odin/temp1/dyf_student_lo_init",
        transformation_ctx="datasink5")
def main():
    """Behavior job sb1 ("dong tien" = tuition payment): build behavior facts.

    Joins POSS payment records (ghinhan_hp) with course (khoa_hoc), student
    (hoc_vien) and package (goi_sanpham) tables, resolves a CRM contact id,
    and writes two parquet datasets to S3 (student_behavior and
    student_general_behavior), partitioned by behavior_id. Incremental via a
    _key checkpoint flag stored on S3.
    """
    sc = SparkContext()
    glueContext = GlueContext(sc)
    spark = glueContext.spark_session
    spark.conf.set("spark.sql.session.timeZone", "GMT+07:00")

    # "Transformed at" timestamp, taken in Ho Chi Minh local time (py2 long).
    ho_chi_minh_timezone = pytz.timezone('Asia/Ho_Chi_Minh')
    today = datetime.now(ho_chi_minh_timezone)
    today_second = long(today.strftime("%s"))
    print('today_id: ', today_second)

    # ------------------------------------------------------------- UDFs ----
    def getSolanBaoLuu(solan_baoluu, songay_baoluu):
        # Returns the SMALLER of the two values, treating None as 0.
        # NOTE(review): despite the name, this is min(solan, songay); together
        # with getSoNgayBaoLuu (max) the two columns may end up swapped when
        # solan > songay - confirm this cross-pick is intentional.
        if solan_baoluu is None:
            solan_baoluu = 0
        if songay_baoluu is None:
            songay_baoluu = 0
        if solan_baoluu > songay_baoluu:
            return songay_baoluu
        return solan_baoluu

    # The plain function is shadowed by its UDF wrapper of the same name.
    getSolanBaoLuu = udf(getSolanBaoLuu, LongType())

    def getSoNgayBaoLuu(solan_baoluu, songay_baoluu):
        # Returns the LARGER of the two values, treating None as 0.
        if solan_baoluu is None:
            solan_baoluu = 0
        if songay_baoluu is None:
            songay_baoluu = 0
        if songay_baoluu > solan_baoluu:
            return songay_baoluu
        return solan_baoluu

    getSoNgayBaoLuu = udf(getSoNgayBaoLuu, LongType())

    def getContactId(code, contact_id_advisor):
        # Prefer the CRM contact code; fall back to the advisor mapping.
        if code is not None:
            return code
        return contact_id_advisor

    getContactId = udf(getContactId, StringType())

    def concaText(student_behavior_date, behavior_id, student_id, contact_id,
                  package_code, package_endtime, package_starttime,
                  student_level_code, student_status_code, transformed_at):
        # Concatenate all non-null fields into one string; md5 of this string
        # becomes the student_behavior_id surrogate key.
        text_concat = ""
        if student_behavior_date is not None:
            text_concat += str(student_behavior_date)
        if behavior_id is not None:
            text_concat += str(behavior_id)
        if student_id is not None:
            text_concat += str(student_id)
        if contact_id is not None:
            text_concat += str(contact_id)
        if package_code is not None:
            text_concat += str(package_code)
        if package_endtime is not None:
            text_concat += str(package_endtime)
        if package_starttime is not None:
            text_concat += str(package_starttime)
        if student_level_code is not None:
            text_concat += str(student_level_code)
        if student_status_code is not None:
            text_concat += str(student_status_code)
        if transformed_at is not None:
            text_concat += str(transformed_at)
        return text_concat

    concaText = udf(concaText, StringType())

    # ------------------------------------------- source: payment records ---
    dyf_poss_ghinhan_hp = glueContext.create_dynamic_frame.from_catalog(
        database='poss', table_name='ghinhan_hp')
    dyf_poss_ghinhan_hp = dyf_poss_ghinhan_hp.select_fields([
        '_key', 'id', 'ngay_thanhtoan', 'so_tien', 'khoa_hoc_makh',
        'trang_thai'
    ])
    dyf_poss_ghinhan_hp = dyf_poss_ghinhan_hp.resolveChoice(
        specs=[('_key', 'cast:long')])
    try:
        # Read the checkpoint flag; keep only rows newer than it.
        df_flag = spark.read.parquet(
            "s3a://dtsodin/flag/student_behavior/sb1_dong_tien.parquet")
        read_from_index = df_flag.collect()[0]['flag']
        print('read from index: ', read_from_index)
        dyf_poss_ghinhan_hp = Filter.apply(
            frame=dyf_poss_ghinhan_hp,
            f=lambda x: x["_key"] > read_from_index)
    except:
        # First run / missing flag file: fall through and process everything.
        print('read flag file error ')
    dyf_poss_ghinhan_hp_number = dyf_poss_ghinhan_hp.count()
    print('dyf_poss_ghinhan_hp_number: ', dyf_poss_ghinhan_hp_number)
    if dyf_poss_ghinhan_hp_number < 1:
        return

    # -------------------------------------- lookup tables from POSS / CRM --
    dyf_poss_khoa_hoc = glueContext.create_dynamic_frame.from_catalog(
        database='poss', table_name='khoa_hoc')
    dyf_poss_khoa_hoc = dyf_poss_khoa_hoc.select_fields(
        ['makh', 'mahv', 'goi_sanpham_id', 'trang_thai'])
    dyf_poss_hoc_vien = glueContext.create_dynamic_frame.from_catalog(
        database='poss', table_name='hoc_vien')
    dyf_poss_hoc_vien = dyf_poss_hoc_vien.select_fields(
        ['mahv', 'crm_id', 'trang_thai']).rename_field('mahv', 'mahv_hv')
    dyf_poss_goi_sanpham = glueContext.create_dynamic_frame.from_catalog(
        database='poss', table_name='goi_sanpham')
    dyf_poss_goi_sanpham = dyf_poss_goi_sanpham.select_fields(
        ['ma', 'id', 'solan_baoluu', 'songay_baoluu', 'trang_thai'])
    dyf_crm_goi_contacts = glueContext.create_dynamic_frame.from_catalog(
        database='crm_native', table_name='contacts')
    dyf_crm_goi_contacts = dyf_crm_goi_contacts.select_fields(
        ['Code']).rename_field('Code', 'code')
    dyf_crm_goi_contacts = Filter.apply(
        frame=dyf_crm_goi_contacts,
        f=lambda x: x["code"] is not None and x["code"] != '')
    dy_crm_goi_contacts = dyf_crm_goi_contacts.toDF()
    dy_crm_goi_contacts = dy_crm_goi_contacts.dropDuplicates()
    dyf_advisor_student_contact = glueContext.create_dynamic_frame.from_catalog(
        database='tig_advisor', table_name='student_contact')
    dyf_advisor_student_contact = dyf_advisor_student_contact.select_fields(
        ['student_id', 'contact_id'])
    dyf_advisor_student_contact = Filter.apply(frame=dyf_advisor_student_contact,
                                               f=lambda x: x["student_id"] is not None and x["student_id"] != ''
                                                           and x["contact_id"] is not None and x["contact_id"] != '')\
        .rename_field('student_id', 'student_id_advisor')\
        .rename_field('contact_id', 'contact_id_advisor')
    dy_advisor_student_contact = dyf_advisor_student_contact.toDF()
    dy_advisor_student_contact = dy_advisor_student_contact.dropDuplicates(
        ['student_id_advisor'])

    # ----------------------------------------------- dedupe the sources ----
    dy_poss_ghinhan_hp = dyf_poss_ghinhan_hp.toDF()
    dy_poss_ghinhan_hp = dy_poss_ghinhan_hp.dropDuplicates(['id'])
    dy_poss_khoa_hoc = dyf_poss_khoa_hoc.toDF()
    dy_poss_khoa_hoc = dy_poss_khoa_hoc.dropDuplicates(['makh', 'mahv'])
    dy_poss_hoc_vien = dyf_poss_hoc_vien.toDF()
    dy_poss_hoc_vien = dy_poss_hoc_vien.dropDuplicates(['mahv_hv'])
    dy_poss_goi_sanpham = dyf_poss_goi_sanpham.toDF()
    # NOTE(review): the next line dedupes hoc_vien a SECOND time (by crm_id)
    # while goi_sanpham is never deduplicated - this looks like a copy/paste
    # slip; confirm whether dy_poss_goi_sanpham.dropDuplicates(...) was meant.
    dy_poss_hoc_vien = dy_poss_hoc_vien.dropDuplicates(['crm_id'])
    poss_ghinhan_hp_number = dy_poss_ghinhan_hp.count()
    if poss_ghinhan_hp_number < 1:
        return

    # payment -> course -> student -> package (all left joins)
    df_dong_tien = dy_poss_ghinhan_hp.join(dy_poss_khoa_hoc,
                                           dy_poss_ghinhan_hp.khoa_hoc_makh == dy_poss_khoa_hoc.makh,
                                           'left')\
        .join(dy_poss_hoc_vien, dy_poss_hoc_vien.mahv_hv == dy_poss_khoa_hoc.mahv, 'left')\
        .join(dy_poss_goi_sanpham, dy_poss_goi_sanpham.id == dy_poss_khoa_hoc.goi_sanpham_id, 'left')
    df_dong_tien = df_dong_tien.select(
        'ngay_thanhtoan', 'ma', 'crm_id', 'so_tien',
        getSolanBaoLuu(df_dong_tien['solan_baoluu'],
                       df_dong_tien['songay_baoluu']).alias('solan_baoluu_t'),
        getSoNgayBaoLuu(
            df_dong_tien['solan_baoluu'],
            df_dong_tien['songay_baoluu']).alias('songay_baoluu_t'))

    # Resolve the contact: CRM contacts by code, advisor mapping by student id.
    df_dong_tien_student = df_dong_tien.join(dy_crm_goi_contacts,
                                             df_dong_tien.crm_id == dy_crm_goi_contacts.code,
                                             'left')\
        .join(dy_advisor_student_contact,
              df_dong_tien.crm_id == dy_advisor_student_contact.student_id_advisor,
              'left')
    # Keep rows where at least one contact source matched.
    df_dong_tien_student = df_dong_tien_student.filter(
        df_dong_tien_student.code.isNotNull()
        | (df_dong_tien_student.contact_id_advisor.isNotNull()))
    # NOTE(review): limit(100) caps every run at 100 rows - this looks like a
    # leftover debugging aid; confirm before relying on this job's output.
    df_dong_tien_student = df_dong_tien_student.limit(100)

    # Placeholder values for fields this behavior does not populate.
    student_id_unavailable = 0L
    package_endtime_unavailable = 0L
    package_starttime_unavailable = 0L
    student_level_code_unavailable = 'UNAVAILABLE'
    student_status_code_unavailable = 'UNAVAILABLE'
    measure1_unavailable = 0
    measure2_unavailable = 0
    measure3_unavailable = 0
    measure4_unavailable = float(0.0)

    # behavior_id is hard-coded to 1 for this job (see f.lit(1L) below).
    df_dong_tien_student = df_dong_tien_student.select(
        f.unix_timestamp(df_dong_tien_student.ngay_thanhtoan,
                         'yyyy-MM-dd').alias('student_behavior_date'),
        f.lit(1L).alias('behavior_id'),
        f.lit(student_id_unavailable).cast('long').alias('student_id'),
        getContactId(
            df_dong_tien_student.code,
            df_dong_tien_student.contact_id_advisor).alias('contact_id'),
        df_dong_tien_student.ma.alias('package_code'),
        f.lit(package_endtime_unavailable).cast('long').alias(
            'package_endtime'),
        f.lit(package_starttime_unavailable).cast('long').alias(
            'package_starttime'),
        f.lit(student_level_code_unavailable).cast('string').alias(
            'student_level_code'),
        f.lit(student_status_code_unavailable).cast('string').alias(
            'student_status_code'),
        f.lit(today_second).alias('transformed_at'),
        'so_tien', 'solan_baoluu_t', 'songay_baoluu_t',
        f.lit(measure4_unavailable).alias('measure4'))
    print('df_dong_tien_student--1')
    df_dong_tien_student.printSchema()
    df_dong_tien_student.show(1)

    # Surrogate key: md5 over the concatenated identifying fields.
    df_dong_tien_student2 = df_dong_tien_student.withColumn(
        'student_behavior_id',
        f.md5(
            concaText(df_dong_tien_student.student_behavior_date,
                      df_dong_tien_student.behavior_id,
                      df_dong_tien_student.student_id,
                      df_dong_tien_student.contact_id,
                      df_dong_tien_student.package_code,
                      df_dong_tien_student.package_endtime,
                      df_dong_tien_student.package_starttime,
                      df_dong_tien_student.student_level_code,
                      df_dong_tien_student.student_status_code,
                      df_dong_tien_student.transformed_at)))
    print('df_dong_tien_student2--2')
    df_dong_tien_student2.printSchema()
    df_dong_tien_student2.show(5)

    dyf_dong_tien_student = DynamicFrame.fromDF(df_dong_tien_student2,
                                                glueContext,
                                                'dyf_dong_tien_student')
    dyf_dong_tien_student = Filter.apply(
        frame=dyf_dong_tien_student,
        f=lambda x: x["contact_id"] is not None and x["contact_id"] != '')

    # Output 1: the generic student_behavior fact.
    # NOTE(review): package_code's SOURCE type is declared "long" here although
    # it is built from column 'ma' - verify against the catalog schema.
    apply_ouput = ApplyMapping.apply(
        frame=dyf_dong_tien_student,
        mappings=[
            ("student_behavior_id", "string", "student_behavior_id", "string"),
            ("student_behavior_date", "long", "student_behavior_date", "long"),
            ("behavior_id", "long", "behavior_id", "long"),
            ("student_id", "long", "student_id", "long"),
            ("contact_id", "string", "contact_id", "string"),
            ("package_code", "long", "package_code", "string"),
            ("package_endtime", "long", "package_endtime", "long"),
            ("package_starttime", "long", "package_starttime", "long"),
            ("student_level_code", "string", "student_level_code", "string"),
            ("student_status_code", "string", "student_status_code", "string"),
            ("transformed_at", "long", "transformed_at", "long")
        ])
    dfy_output = ResolveChoice.apply(frame=apply_ouput,
                                     choice="make_cols",
                                     transformation_ctx="resolvechoice2")
    glueContext.write_dynamic_frame.from_options(
        frame=dfy_output,
        connection_type="s3",
        connection_options={
            "path": "s3://dtsodin/student_behavior/student_behavior",
            "partitionKeys": ["behavior_id"]
        },
        format="parquet")

    # Output 2: measures (payment amount, reservation count/days, filler).
    apply_general = ApplyMapping.apply(
        frame=dyf_dong_tien_student,
        mappings=[("student_behavior_id", "string", "student_behavior_id",
                   "string"),
                  ("so_tien", "double", "measure1", "float"),
                  ("solan_baoluu_t", "long", "measure2", "float"),
                  ("songay_baoluu_t", "long", "measure3", "float"),
                  ("measure4", "float", "measure4", "float"),
                  ("behavior_id", "long", "behavior_id", "long")])
    dfy_output2 = ResolveChoice.apply(frame=apply_general,
                                      choice="make_cols",
                                      transformation_ctx="resolvechoice2")
    print('dfy_output2::')
    dfy_output2.show(5)
    glueContext.write_dynamic_frame.from_options(
        frame=dfy_output2,
        connection_type="s3",
        connection_options={
            "path":
            "s3://dtsodin/student_behavior/student_general_behavior",
            "partitionKeys": ["behavior_id"]
        },
        format="parquet")

    # Checkpoint: persist the max _key (from the deduped payments) to S3 so
    # the next run only processes newer rows.
    flag = dy_poss_ghinhan_hp.agg({"_key": "max"}).collect()[0][0]
    flag_data = [flag]
    df = spark.createDataFrame(flag_data, "long").toDF('flag')
    df.write.parquet(
        "s3a://dtsodin/flag/student_behavior/sb1_dong_tien.parquet",
        mode="overwrite")
# Glue job scaffold: read MongoDB collections through the CData JDBC driver
# and wrap each one as a Glue DynamicFrame.
import sys
import datetime
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.dynamicframe import DynamicFrame
from awsglue.job import Job

# Job parameters (only JOB_NAME is expected).
args = getResolvedOptions(sys.argv, ['JOB_NAME'])

# CData MongoDB JDBC connection string; the *_REPLACE placeholders are
# substituted before deployment. NOTE(review): credentials end up embedded in
# this URL - consider a Glue connection or a secrets store instead.
jdbc_url = "jdbc:mongodb:RTK=RTK_REPLACE;Server=SERVER_REPLACE;User=USER_REPLACE;Password=PASSWORD_REPLACE;Port=PORT_REPLACE;Use SSL=USE_SSL_REPLACE;Database=DATABASE_REPLACE;"

sparkContext = SparkContext()
glueContext = GlueContext(sparkContext)
sparkSession = glueContext.spark_session

glueJob = Job(glueContext)
glueJob.init(args['JOB_NAME'], args)

# Comma-separated list of collection names, substituted before deployment.
collections_input = "COLLECTIONS_REPLACE"
collections = collections_input.split(",")

dfs = []
# Loop over each collection: read it and push it onto the dataframes list.
for collection in collections:
    # Read one MongoDB collection via JDBC into a Spark DataFrame.
    source_df = sparkSession.read.format("jdbc").option(
        "url", jdbc_url).option("dbtable", collection).option(
            "driver", "cdata.jdbc.mongodb.MongoDBDriver").load()
    # Wrap as a DynamicFrame, keeping the collection name alongside it.
    dynamic_dframe = DynamicFrame.fromDF(source_df, glueContext,
                                         "dynamic_df_{}".format(collection))
    dfs.append({"dynamic_frame": dynamic_dframe, "collection": collection})
def main():
    """Rebuild the ad_student_level dimension in Redshift.

    Reads the level-change log (log_student_level_study), merges three
    level sources (explicit changes, initial levels, and levels taken
    from student_contact), collapses each contact's time-ordered history
    into [level, start, end] intervals, and writes them to Redshift.

    Depends on helpers defined elsewhere in this file:
    get_df_change_level_new, get_df_change_level_init,
    get_level_from_student_contact, get_level_start_end, and the
    module-level is_dev flag.
    """
    glueContext = GlueContext(SparkContext.getOrCreate())
    spark = glueContext.spark_session

    ########## dyf_student_contact
    # Raw level-change log; time_created is renamed so it cannot collide
    # with other *_created columns in later joins/unions.
    dyf_student_level = glueContext.create_dynamic_frame.from_catalog(
        database="tig_advisor",
        table_name="log_student_level_study")
    dyf_student_level = dyf_student_level.select_fields(
        ['_key', 'id', 'contact_id', 'level_current', 'level_modified', 'time_created']) \
        .rename_field('time_created', 'time_level_created')

    df_student_level = dyf_student_level.toDF()
    # Keep one row per log id (the source table can contain duplicates).
    df_student_level = df_student_level.dropDuplicates(['id'])
    print('df_student_level')
    df_student_level.printSchema()
    df_student_level.show(3)
    number_student_level = df_student_level.count()
    print('number_student_level: ', number_student_level)

    # Two derived views of the same log: explicit level changes and the
    # implied initial level per contact.
    df_change_level_new = get_df_change_level_new(df_student_level)
    df_change_level_init = get_df_change_level_init(df_student_level)

    # ------------------------------------------------------------------------------------------------------------------#
    # Get student and level from student_contact for every contact that
    # appears in the level log.
    df_contact_id_list = df_student_level.select('contact_id')
    df_contact_id_list = df_contact_id_list.dropDuplicates(['contact_id'])
    df_level_from_student_contact = get_level_from_student_contact(
        glueContext, df_contact_id_list)
    print('df_level_from_student_contact')
    df_level_from_student_contact.printSchema()
    df_level_from_student_contact.show(3)

    # -------------------------------------------------------------------------------------------------------------------#
    # All three sources unioned into one change stream (schemas must match
    # positionally — union() is positional, not by name).
    change_level_total = df_change_level_new \
        .union(df_change_level_init)\
        .union(df_level_from_student_contact)

    if is_dev:
        print('change_level_total')
        change_level_total.printSchema()
        change_level_total.show(3)

    # -------------------------------------------------------------------------------------------------------------------#
    # df_student_contact = get_df_student_contact(glueContext)

    # Nothing to do when the source log was empty.
    # NOTE(review): this early exit happens after the unions above were
    # already built — harmless (lazy evaluation) but could move earlier.
    if number_student_level < 1:
        return

    # Order each contact's history chronologically before collecting it.
    change_level_total = change_level_total.orderBy(
        f.asc('contact_id'),
        f.asc('time_level_created'))

    # Pack (time, level) into a struct per row so collect_list can gather
    # a contact's whole history into one array.
    # NOTE(review): the struct references df_student_level.time_level_created
    # rather than a column of change_level_total — presumably resolved by
    # name at analysis time; confirm this is intended.
    change_level_total = change_level_total.select(
        'contact_id',
        f.struct(df_student_level.time_level_created.cast('string'), 'level').alias("s_time_level"))
    print('df_student_level')
    change_level_total.printSchema()
    change_level_total.show(3)

    # One row per contact with the full ordered (time, level) history.
    df_student_level_list = change_level_total.groupBy('contact_id') \
        .agg(f.collect_list('s_time_level').alias('l_time_level'))
    print('df_student_level_list')
    df_student_level_list.printSchema()
    df_student_level_list.show(3)

    # UDF (defined elsewhere) turns the history into [level, start, end]
    # interval structs.
    df_student_level_list = df_student_level_list.select(
        'contact_id',
        get_level_start_end('l_time_level').alias('l_level_start_ends'))
    print('df_student_level_list_v2')
    df_student_level_list.printSchema()
    df_student_level_list.show(3)

    # One output row per interval.
    df_student_level_list = df_student_level_list.select(
        'contact_id',
        f.explode('l_level_start_ends').alias('level_start_end'))
    print('df_student_level_list_v2 after explode')
    df_student_level_list.printSchema()
    df_student_level_list.show(3)

    # Flatten the interval struct into plain columns.
    df_student_level_start_end = df_student_level_list.select(
        'contact_id',
        f.col("level_start_end").getItem("level").alias("level"),
        f.col("level_start_end").getItem("start").alias("start").cast('long'),
        f.col("level_start_end").getItem("end").alias("end").cast('long'))
    print('df_student_level_start_end')
    df_student_level_start_end.printSchema()
    df_student_level_start_end.show(3)

    dyf_student_level_start_end_student = DynamicFrame \
        .fromDF(df_student_level_start_end, glueContext, "dyf_student_level_start_end_student")

    applyMapping = ApplyMapping.apply(
        frame=dyf_student_level_start_end_student,
        mappings=[("contact_id", "string", "contact_id", "string"),
                  ("level", 'string', 'level_code', 'string'),
                  ("start", "long", "start_date", "long"),
                  ("end", "long", "end_date", "long")])

    resolvechoice = ResolveChoice.apply(frame=applyMapping,
                                        choice="make_cols",
                                        transformation_ctx="resolvechoice2")
    dropnullfields = DropNullFields.apply(frame=resolvechoice,
                                          transformation_ctx="dropnullfields3")

    # NOTE(review): postactions run AFTER the COPY, so this TRUNCATE empties
    # ad_student_level right after loading it — looks like it was meant to be
    # a preaction; confirm against the intended load semantics.
    glueContext.write_dynamic_frame.from_jdbc_conf(
        frame=dropnullfields,
        catalog_connection="glue_redshift",
        connection_options={
            "postactions": """TRUNCATE TABLE ad_student_level """,
            "dbtable": "ad_student_level",
            "database": "transaction_log"
        },
        redshift_tmp_dir=
        "s3n://datashine-dev-redshift-backup/translation_log/user_advisor/ad_student_level",
        transformation_ctx="datasink4")
def main():
    """Generate the date_dim calendar dimension and load it into Redshift.

    Builds one row per day between a fixed begin/end date, derives the
    usual calendar attributes (date_id, day/week/month/quarter/year), and
    writes the result to the date_dim table.
    """
    # args = getResolvedOptions(sys.argv, ['TempDir', 'JOB_NAME'])
    sc = SparkContext()
    glueContext = GlueContext(sc)
    spark = glueContext.spark_session
    # job = Job(glueContext)
    # job.init(args['JOB_NAME'], args)

    # Originally: time_begin = today + 1 day; now pinned to a fixed range.
    # time_begin = datetime.date.today() + datetime.timedelta(days=1)
    # 20190908
    time_begin = datetime.date(2017, 1, 1)
    # Originally: time_end = time_begin + 1 month - 1 day; now fixed.
    # time_end = time_begin + relativedelta(months=1) - datetime.timedelta(days=1)
    time_end = datetime.date(2020, 2, 1)

    # Single-row frame holding the date range boundaries.
    data = [(time_begin, time_end)]
    df = spark.createDataFrame(data, ["minDate", "maxDate"])
    # Normalise both columns to DateType.
    df = df.select(df.minDate.cast(DateType()).alias("minDate"),
                   df.maxDate.cast(DateType()).alias("maxDate"))

    # Expand the range into one row per day: repeat a placeholder daysDiff
    # times, posexplode it to get an index, then add that index to minDate.
    df = df.withColumn("daysDiff", f.datediff("maxDate", "minDate")) \
        .withColumn("repeat", f.expr("split(repeat(',', daysDiff), ',')")) \
        .select("*", f.posexplode("repeat").alias("date", "val")) \
        .withColumn("date", f.expr("to_date(date_add(minDate, date))")) \
        .select('date')

    # Derive the calendar attributes for each day.
    # NOTE(review): "EEEEE" yields a narrow (often single-letter) day name
    # depending on locale/Spark version — confirm the expected format.
    df = df.withColumn('date_id', date_format(df.date, "yyyyMMdd")) \
        .withColumn('day_of_month', f.dayofmonth(df.date)) \
        .withColumn('day_of_week', from_unixtime(unix_timestamp(df.date, "yyyy-MM-dd"), "EEEEE")) \
        .withColumn('week_of_year', f.weekofyear(df.date)) \
        .withColumn('month', f.month(df.date)) \
        .withColumn('quarter', f.quarter(df.date)) \
        .withColumn('year', f.year(df.date))
    # Fractional result; cast to int below via resolveChoice.
    df = df.withColumn('week_of_month', (df.day_of_month - 1) / 7 + 1)

    data_time = DynamicFrame.fromDF(df, glueContext, 'data_time')

    # Truncate week_of_month to an integer.
    data_time = data_time.resolveChoice(specs=[('week_of_month', 'cast:int')])

    # Select the output columns and target types for the load.
    applymapping1 = ApplyMapping.apply(frame=data_time,
                                       mappings=[("date_id", "string", "date_id", "bigint"),
                                                 ("day_of_month", 'int', 'day_of_month', 'int'),
                                                 ("day_of_week", "string", "day_of_week", "string"),
                                                 ("week_of_month", "int", "week_of_month", "int"),
                                                 ("week_of_year", "int", "week_of_year", "int"),
                                                 ("month", "int", "month", "int"),
                                                 ("quarter", "int", "quarter", "int"),
                                                 ("year", "int", "year", "int"),
                                                 ("date", "date", "date", "timestamp")])
    resolvechoice2 = ResolveChoice.apply(frame=applymapping1,
                                         choice="make_cols",
                                         transformation_ctx="resolvechoice2")
    dropnullfields3 = DropNullFields.apply(frame=resolvechoice2,
                                           transformation_ctx="dropnullfields3")

    # Load the dimension into Redshift.
    datasink4 = glueContext.write_dynamic_frame.from_jdbc_conf(frame=dropnullfields3,
                                                               catalog_connection="glue_redshift",
                                                               connection_options={"dbtable": "date_dim",
                                                                                   "database": "student_learning_fact"},
                                                               redshift_tmp_dir="s3://datashine-dev-redshift-backup/student_learning_fact/date_dim",
                                                               transformation_ctx="student_learning_dim")
def main():
    """Load ACTIVED -> SUSPENDED package transitions into Redshift and S3.

    Incrementally reads tpe_enduser_used_product_history (rows with _key
    greater than the flag stored on S3), keeps ACTIVED -> SUSPENDED
    transitions, maps them to status id 55, writes them to the
    mapping_changed_status_student table (backfilling user_id from
    user_map in a postaction) plus a parquet backup, and finally
    advances the S3 high-water-mark flag to the max _key processed.
    """
    glueContext = GlueContext(SparkContext.getOrCreate())
    spark = glueContext.spark_session

    mdl_tpe_enduser_used_product_history = glueContext.create_dynamic_frame.from_catalog(
        database="tig_market",
        table_name="tpe_enduser_used_product_history")
    mdl_tpe_enduser_used_product_history = mdl_tpe_enduser_used_product_history.select_fields(
        ['_key', 'id', 'used_product_id', 'contact_id', 'status_new', 'status_old', 'timecreated'])
    mdl_tpe_enduser_used_product_history = mdl_tpe_enduser_used_product_history.resolveChoice(
        specs=[('_key', 'cast:long')])

    # Incremental load: read the flag written by the previous run and keep
    # only newer rows. A missing/unreadable flag file means "first run" and
    # deliberately falls back to a full load.
    try:
        df_flag = spark.read.parquet("s3a://dts-odin/flag/fact_flag_suspended.parquet")
        start_read = df_flag.collect()[0]['flag']
        print('read from index: ', start_read)
        # Compare each _key against the flag; keep only rows with key > flag.
        mdl_tpe_enduser_used_product_history = Filter.apply(
            frame=mdl_tpe_enduser_used_product_history,
            f=lambda x: x['_key'] > start_read)
    except Exception:
        # Was a bare `except:`; narrowed so SystemExit/KeyboardInterrupt are
        # no longer swallowed while keeping the best-effort fallback.
        print('read flag file error ')

    # count() triggers a full evaluation of the frame, so compute it once
    # and reuse it (it was previously evaluated twice).
    history_count = mdl_tpe_enduser_used_product_history.count()
    print('the number of new contacts: ', history_count)

    if history_count > 0:
        # Keep only complete ACTIVED -> SUSPENDED transitions.
        # 'SUPPENDED' is a misspelling that occurs in the source data and
        # must be matched as well.
        mdl_tpe_enduser_used_product_history = Filter.apply(
            frame=mdl_tpe_enduser_used_product_history,
            f=lambda x: x["timecreated"] is not None
                        and x["contact_id"] is not None
                        and x["used_product_id"] is not None
                        and (x["status_old"] == 'ACTIVED'
                             and x["status_new"] in ['SUSPENDED', 'SUPPENDED']))
        mdl_tpe_enduser_used_product_history = mdl_tpe_enduser_used_product_history.resolveChoice(
            specs=[('timecreated', 'cast:long')])

        df_mdl_tpe_enduser_used_product_history = mdl_tpe_enduser_used_product_history.toDF()
        # Derive the date-dimension key (yyyyMMdd), the fixed target status
        # id (55 = package suspended) and a millisecond timestamp.
        df_mdl_tpe_enduser_used_product_history = df_mdl_tpe_enduser_used_product_history \
            .withColumn('change_status_date_id',
                        from_unixtime(df_mdl_tpe_enduser_used_product_history['timecreated'], "yyyyMMdd")) \
            .withColumn('to_status_id', f.lit(55)) \
            .withColumn('timestamp1',
                        df_mdl_tpe_enduser_used_product_history['timecreated'] * f.lit(1000))

        data_mdl_tpe_enduser_used_product_history = DynamicFrame.fromDF(
            df_mdl_tpe_enduser_used_product_history, glueContext,
            "data_mdl_tpe_enduser_used_product_history")
        data_mdl_tpe_enduser_used_product_history.printSchema()
        data_mdl_tpe_enduser_used_product_history.show(3)

        applymapping1 = ApplyMapping.apply(
            frame=data_mdl_tpe_enduser_used_product_history,
            mappings=[("contact_id", "string", "contact_id", "string"),
                      ("change_status_date_id", "string", "change_status_date_id", "long"),
                      ("timestamp1", "long", "timestamp1", "timestamp"),
                      ('to_status_id', 'int', 'to_status_id', 'long')])
        resolvechoice2 = ResolveChoice.apply(frame=applymapping1,
                                             choice="make_cols",
                                             transformation_ctx="resolvechoice2")
        dropnullfields3 = DropNullFields.apply(frame=resolvechoice2,
                                               transformation_ctx="dropnullfields3")

        # Write to Redshift; the postaction backfills user_id from user_map
        # for the rows this run inserted.
        datasink4 = glueContext.write_dynamic_frame.from_jdbc_conf(
            frame=dropnullfields3,
            catalog_connection="glue_redshift",
            connection_options={
                "dbtable": "mapping_changed_status_student",
                "database": "dts_odin",
                "postactions": """UPDATE mapping_changed_status_student
                                  SET user_id = (
                                      SELECT user_id FROM user_map
                                      WHERE source_type = 1
                                        AND source_id = mapping_changed_status_student.contact_id
                                      LIMIT 1
                                  )
                                  WHERE user_id IS NULL and to_status_id=55"""
            },
            redshift_tmp_dir="s3n://datashine-dwh/temp1/",
            transformation_ctx="datasink4")

        # Parquet backup of the same rows on S3.
        datasink5 = glueContext.write_dynamic_frame.from_options(
            frame=dropnullfields3,
            connection_type="s3",
            connection_options={
                "path": "s3://datashine-dev-redshift-backup/A_55_tam_dung_goi"},
            format="parquet",
            transformation_ctx="datasink5")

        # Advance the high-water mark: persist the max _key seen this run.
        # NOTE(review): the max is taken over the status-filtered frame, so
        # keys of rows filtered out above are not covered by the new flag —
        # confirm this matches the intended incremental semantics.
        datasourceTmp = mdl_tpe_enduser_used_product_history.toDF()
        flag = datasourceTmp.agg({"_key": "max"}).collect()[0][0]
        flag_data = [flag]
        df = spark.createDataFrame(flag_data, "long").toDF('flag')
        # Overwrite the _key flag on S3.
        df.write.parquet("s3a://dts-odin/flag/fact_flag_suspended.parquet",
                         mode="overwrite")
def main():
    """Build 'rating_hotline' student-behavior records from call-center data.

    Incrementally reads answered, rated 3CX hotline calls (advisorcall),
    resolves each phone number to a student/contact, enriches the row with
    the student's level, status and package that were valid at call time,
    derives a deterministic student_behavior_id (md5 over the concatenated
    fields), and writes two parquet outputs (student_rating and
    student_behavior) partitioned by behavior_id before advancing the
    S3 _key flag.

    NOTE: Python 2 module (print statements, long literals).
    """
    glueContext = GlueContext(SparkContext.getOrCreate())
    spark = glueContext.spark_session
    spark.conf.set("spark.sql.session.timeZone", "GMT+07:00")
    ho_chi_minh_timezone = pytz.timezone('Asia/Ho_Chi_Minh')
    today = datetime.now(ho_chi_minh_timezone)
    today_second = long(today.strftime("%s"))
    print('today_id: ', today_second)
    # f.lit(today_second).cast('long').alias('transformed_at')

    # Valid call ratings; sentinel values used when enrichment data is missing.
    ratings = [1, 2, 3, 4, 5]
    student_id_unavailable = '0'
    package_endtime_unavailable = 99999999999L
    package_starttime_unavailable = 0L
    student_level_code_unavailable = 'UNAVAILABLE'
    student_status_code_unavailable = 'UNAVAILABLE'
    # Column-name keys passed to the check_data* UDFs via f.lit().
    package_endtime = 'package_endtime'
    package_starttime = 'package_starttime'
    student_level_code = 'student_level_code'
    student_status_code = 'student_status_code'

    # Return val1 unless it is null, else fall back to val2 (coalesce).
    def doCheckModified(val1, val2):
        if val1 is not None:
            return val1
        return val2

    check_modified_null = udf(doCheckModified, StringType())

    # Substitute the student-id sentinel when the id is missing.
    # NOTE(review): str(code) runs first, so a null arrives here as the
    # string 'None' and the `is None` branch can never fire — confirm.
    def doCheckStudentID(code):
        code = str(code)
        if code is None:
            return student_id_unavailable
        return code

    check_student_id = udf(doCheckStudentID, StringType())

    # Substitute the start/end-time sentinel chosen by `key` when the
    # value is missing.
    # NOTE(review): declared IntegerType but the end-time sentinel
    # 99999999999L exceeds 32-bit int range — Spark will null it; confirm.
    def doCheckData(code, key):
        key = str(key)
        if code is None:
            if key == package_endtime:
                return package_endtime_unavailable
            else:
                return package_starttime_unavailable
        return code

    check_data = udf(doCheckData, IntegerType())

    # Substitute the level/status sentinel chosen by `key` when missing.
    # NOTE(review): str(code) turns a null into 'None' before the checks,
    # so the UNAVAILABLE branches appear unreachable — confirm intent.
    def doCheckDataNull(code, key):
        code = str(code)
        key = str(key)
        if (code is None) & (key == student_level_code):
            return student_level_code_unavailable
        if (code is None) & (key == student_status_code):
            return student_status_code_unavailable
        return code

    check_data_null = udf(doCheckDataNull, StringType())

    # Concatenate all non-null behavior fields into one string; md5 of this
    # becomes the deterministic student_behavior_id.
    def concaText(student_behavior_date, behavior_id, student_id, contact_id,
                  package_code, package_endtime, package_starttime,
                  student_level_code, student_status_code, transformed_at):
        text_concat = ""
        if student_behavior_date is not None:
            text_concat += str(student_behavior_date)
        if behavior_id is not None:
            text_concat += str(behavior_id)
        if student_id is not None:
            text_concat += str(student_id)
        if contact_id is not None:
            text_concat += str(contact_id)
        if package_code is not None:
            text_concat += str(package_code)
        if package_endtime is not None:
            text_concat += str(package_endtime)
        if package_starttime is not None:
            text_concat += str(package_starttime)
        if student_level_code is not None:
            text_concat += str(student_level_code)
        if student_status_code is not None:
            text_concat += str(student_status_code)
        if transformed_at is not None:
            text_concat += str(transformed_at)
        return text_concat

    # Rebind the plain function as a Spark UDF (intentional shadowing).
    concaText = f.udf(concaText, StringType())

    # ---- source: call-center ratings ---------------------------------------
    dyf_advisorcall = glueContext.create_dynamic_frame.from_catalog(
        database="callcenter", table_name="advisorcall")
    dyf_advisorcall = dyf_advisorcall.select_fields([
        '_key', 'calldate', 'phonenumber', 'rating', 'device', 'hanguptvts',
        'status'
    ])
    dyf_advisorcall = dyf_advisorcall.resolveChoice(specs=[('_key',
                                                            'cast:long')])
    # Incremental load: only records with _key above the stored flag; a
    # missing flag file falls back to a full load.
    try:
        df_flag_1 = spark.read.parquet(
            "s3://dtsodin/flag/flag_hoc_vien_rating_hotline.parquet")
        max_key = df_flag_1.collect()[0]['flag']
        print("max_key: ", max_key)
        # Only take records greater than the stored max_key; do not load full.
        dyf_advisorcall = Filter.apply(frame=dyf_advisorcall,
                                       f=lambda x: x["_key"] > max_key)
    except:
        print('read flag file error ')

    if dyf_advisorcall.count() > 0:
        # ---- lookup tables -------------------------------------------------
        # phone -> contact mapping.
        dyf_student_contact_phone = glueContext.create_dynamic_frame.from_catalog(
            database="tig_advisor", table_name="student_contact_phone")
        dyf_student_contact_phone = dyf_student_contact_phone.select_fields(
            ['contact_id', 'phone']) \
            .rename_field('contact_id', 'contact_id_phone')

        # contact -> student with current level_study.
        dyf_student_contact = glueContext.create_dynamic_frame.from_catalog(
            database="tig_advisor", table_name="student_contact")
        dyf_student_contact = dyf_student_contact.select_fields(
            ['contact_id', 'student_id', 'level_study', 'time_lms_created'])

        # Status history intervals per contact.
        dyf_log_student_status = glueContext.create_dynamic_frame.from_catalog(
            database="do_tig_advisor", table_name="log_student_status")
        dyf_log_student_status = dyf_log_student_status.select_fields(
            ['contact_id', 'status_code', 'last_status_code', 'start_date', 'end_date']) \
            .rename_field('contact_id', 'contact_id_status')

        # Package validity intervals per student.
        dyf_log_student_package = glueContext.create_dynamic_frame.from_catalog(
            database="do_tig_advisor", table_name="log_student_package")
        dyf_log_student_package = dyf_log_student_package.select_fields(
            ['student_id', 'package_code', 'start_time', 'end_time']) \
            .rename_field('student_id', 'student_id_package') \
            .rename_field('start_time', 'start_time_package') \
            .rename_field('end_time', 'end_time_package')

        # Level-change history per contact.
        dyf_log_student_level_study = glueContext.create_dynamic_frame.from_catalog(
            database="tig_advisor", table_name="log_student_level_study")
        dyf_log_student_level_study = dyf_log_student_level_study.select_fields(
            ['contact_id', 'level_current', 'level_modified', 'package_code', 'time_created']) \
            .rename_field('contact_id', 'contact_id_level')

        dyf_advisorcall.printSchema()
        print dyf_advisorcall.count()
        # Keep only rated, answered 3CX calls hung up by the advisor (tvts).
        dyf_rating_class = Filter.apply(
            frame=dyf_advisorcall,
            f=lambda x: x['rating'] in ratings and x['device'] == '3CX' and x[
                'hanguptvts'] == 1 and x['status'] == 'ANSWER')
        print dyf_rating_class.count()

        try:
            df_rating_class = dyf_rating_class.toDF()
            # df_rating_class = df_rating_class.limit(99999)
            df_student_contact = dyf_student_contact.toDF()
            df_student_contact_phone = dyf_student_contact_phone.toDF()
            df_log_student_level_study = dyf_log_student_level_study.toDF()
            df_temp = dyf_log_student_level_study.toDF()
            df_log_student_status = dyf_log_student_status.toDF()
            df_log_student_package = dyf_log_student_package.toDF()

            # Latest level-change timestamp per (contact, level, package);
            # self-join below keeps only those latest rows.
            df_temp = df_temp.groupby(
                'contact_id_level', 'level_current', 'package_code').agg(
                    f.max("time_created").alias("time_created_max"))
            df_temp = df_temp.withColumnRenamed('contact_id_level', 'contact_id_join') \
                .withColumnRenamed('package_code', 'package_code_join')
            df_join0 = df_temp.join(
                df_log_student_level_study,
                (df_temp['contact_id_join'] == df_log_student_level_study['contact_id_level'])
                & (df_temp['package_code_join'] == df_log_student_level_study['package_code'])
                & (df_temp['time_created_max'] == df_log_student_level_study['time_created']),
                "left")
            print "=========== . ==========="
            df_join0.printSchema()
            dyf_join = DynamicFrame.fromDF(df_join0, glueContext, "dyf_join")
            dyf_join = dyf_join.select_fields([
                'contact_id_level', 'level_current', 'level_modified',
                'package_code', 'time_created'
            ])
            df_join = dyf_join.toDF()
            df_join.printSchema()
            df_join.show(10)
            print "########## . ###########"

            # Resolve the calling phone number to a contact, then to the
            # student record.
            df_join0 = df_rating_class.join(
                df_student_contact_phone,
                (df_rating_class['phonenumber'] == df_student_contact_phone['phone']))
            df_join01 = df_join0.join(
                df_student_contact,
                (df_join0['contact_id_phone'] == df_student_contact['contact_id']))
            df_join01.printSchema()

            # Attach the latest level change made before LMS creation time.
            df_join02 = df_join01.join(
                df_join,
                (df_join['contact_id_level'] == df_join01['contact_id'])
                & (df_join['time_created'] <= df_join01['time_lms_created']),
                "left")
            # level_modified_new = coalesce(level_modified, level_study);
            # timecreated = call date as unix seconds.
            df_join02 = df_join02 \
                .withColumn("level_modified_new",
                            check_modified_null(df_join02.level_modified, df_join02.level_study)) \
                .withColumn("timecreated",
                            f.unix_timestamp(df_join02.calldate, "yyyy-MM-dd HH:mm:ss"))
            df_join02.printSchema()
            df_join02.show(10)
            dyf_join = DynamicFrame.fromDF(df_join02, glueContext, "dyf_join")
            dyf_join = dyf_join.select_fields([
                'timecreated', 'contact_id', 'student_id', 'level_study',
                'time_lms_created', 'level_current', 'level_modified',
                'package_code', 'time_created', 'rating', 'level_modified_new'
            ])
            # dyf_join_temp = Filter.apply(frame=dyf_join,
            #                              f=lambda x: x["level_modified_new"] is None)
            # print "count: ", dyf_join_temp.count()

            ############
            # Attach the status interval and package interval covering the
            # call time (interval joins on start/end bounds).
            df_join02 = dyf_join.toDF()
            df_join03 = df_join02.join(
                df_log_student_status,
                (df_log_student_status['contact_id_status'] == df_join02['contact_id'])
                & (df_log_student_status['start_date'] <= df_join02['timecreated'])
                & (df_log_student_status['end_date'] >= df_join02['timecreated']),
                "left")
            df_join04 = df_join03.join(
                df_log_student_package,
                (df_log_student_package['student_id_package'] == df_join03['student_id'])
                & (df_log_student_package['start_time_package'] <= df_join03['timecreated'])
                & (df_log_student_package['end_time_package'] >= df_join03['timecreated']),
                "left")
            dyf_join = DynamicFrame.fromDF(df_join04, glueContext, "dyf_join")
            # Drop calls with no covering package interval.
            dyf_join = Filter.apply(
                frame=dyf_join,
                f=lambda x: x["start_time_package"] is not None and x[
                    "end_time_package"] is not None)
            print "dyf_join: ", dyf_join.count()
            dyf_join.show(10)
            dyf_join.printSchema()
            dyf_join = dyf_join.select_fields([
                'timecreated', 'student_id', 'contact_id', 'package_code',
                'rating', 'start_time_package', 'end_time_package',
                'level_modified_new', 'status_code'
            ])
            dyf_join.printSchema()
            print "TTTTTTTTTTTTTTTTT"
            # dyf_join01 = Filter.apply(frame=dyf_join,
            #                           f=lambda x: x["level_current"] is not None)
            # # print "Check null ", dyf_join01.count()

            # Normalise the output columns, substituting sentinels for
            # missing values, and add the fixed rating_hotline attributes
            # (behavior_id 25).
            df_join04 = dyf_join.toDF()
            df_join04 = df_join04.withColumn("transformed_at", unix_timestamp(f.current_timestamp())) \
                .withColumn("student_id", check_student_id(df_join04.student_id)) \
                .withColumn("package_endtime", check_data(df_join04.end_time_package, f.lit(package_endtime))) \
                .withColumn("package_starttime", check_data(df_join04.start_time_package, f.lit(package_starttime))) \
                .withColumn("student_level_code", check_data_null(df_join04.level_modified_new, f.lit(student_level_code))) \
                .withColumn("student_status_code", check_data_null(df_join04.status_code, f.lit(student_status_code))) \
                .withColumn("behavior_id", f.lit(25)) \
                .withColumn("rating_type", f.lit("rating_hotline")) \
                .withColumn("comment", f.lit("")) \
                .withColumn("rating_about", f.lit(None)) \
                .withColumn("number_rating", f.lit(1)) \
                .withColumn("value_rating", df_join04.rating)
            print "XXXXXXXXXXXXXXXXXXXX"
            df_join04.printSchema()
            print df_join04.count()
            df_join04.show(10)
            dyf_join = DynamicFrame.fromDF(df_join04, glueContext, "dyf_join")
            # dyf_join.printSchema()
            # print dyf_join.count()
            # dyf_join.show(10)

            dyf_rating_cara = ApplyMapping.apply(
                frame=dyf_join,
                mappings=[
                    ("timecreated", "int", "student_behavior_date", "long"),
                    ("behavior_id", "int", "behavior_id", "long"),
                    ("student_id", "string", "student_id", "long"),
                    ("contact_id", "string", "contact_id", "string"),
                    ("package_code", "string", "package_code", "string"),
                    ("package_endtime", "int", "package_endtime", "long"),
                    ("package_starttime", "int", "package_starttime", "long"),
                    ("student_level_code", "string", "student_level_code", "string"),
                    ("student_status_code", "string", "student_status_code", "string"),
                    ("transformed_at", "long", "transformed_at", "long"),
                    ("rating_type", "string", "rating_type", "string"),
                    ("comment", "string", "comment", "string"),
                    ("rating_about", "int", "rating_about", "long"),
                    ("number_rating", "int", "number_rating", "long"),
                    ("value_rating", "int", "value_rating", "long")
                ])

            # Deterministic surrogate key: md5 over the concatenated fields.
            df_rating_cara = dyf_rating_cara.toDF()
            df_rating_cara2 = df_rating_cara.withColumn(
                'student_behavior_id',
                f.md5(
                    concaText(df_rating_cara.student_behavior_date,
                              df_rating_cara.behavior_id,
                              df_rating_cara.student_id,
                              df_rating_cara.contact_id,
                              df_rating_cara.package_code,
                              df_rating_cara.package_endtime,
                              df_rating_cara.package_starttime,
                              df_rating_cara.student_level_code,
                              df_rating_cara.student_status_code,
                              df_rating_cara.transformed_at)))
            dyf_rating_cara = DynamicFrame.fromDF(df_rating_cara2,
                                                  glueContext,
                                                  'dyf_rating_cara')
            dyf_rating_cara = Filter.apply(
                frame=dyf_rating_cara,
                f=lambda x: x["contact_id"] is not None and x["contact_id"] != '')

            # ---- output 1: student_rating --------------------------------
            applymapping0 = ApplyMapping.apply(
                frame=dyf_rating_cara,
                mappings=[("student_behavior_id", "string", "student_behavior_id", "string"),
                          ("rating_type", "string", "rating_type", "string"),
                          ("comment", "string", "comment", "string"),
                          ("rating_about", "long", "rating_about", "long"),
                          ("number_rating", "long", "number_rating", "long"),
                          ("value_rating", "long", "value_rating", "long"),
                          ("behavior_id", "long", "behavior_id", "long")])
            applymapping0.printSchema()
            print applymapping0.count()
            applymapping0.show(5)
            resolvechoice0 = ResolveChoice.apply(
                frame=applymapping0,
                choice="make_cols",
                transformation_ctx="resolvechoice1")
            dropnullfields0 = DropNullFields.apply(
                frame=resolvechoice0, transformation_ctx="dropnullfields0")
            print resolvechoice0.count()
            resolvechoice0.printSchema()
            # resolvechoice0.show(10)

            print('START WRITE TO S3-------------------------')
            datasink0 = glueContext.write_dynamic_frame.from_options(
                frame=dropnullfields0,
                connection_type="s3",
                connection_options={
                    "path": "s3://dtsodin/student_behavior/student_rating/",
                    "partitionKeys": ["behavior_id"]
                },
                format="parquet",
                transformation_ctx="datasink0")
            print('END WRITE TO S3-------------------------')
            # datasink0 = glueContext.write_dynamic_frame.from_jdbc_conf(frame=dropnullfields0,
            #                                                            catalog_connection="glue_redshift",
            #                                                            connection_options={
            #                                                                "dbtable": "student_rating",
            #                                                                "database": "dts_odin"
            #                                                            },
            #                                                            redshift_tmp_dir="s3a://dtsodin/temp/student_rating/",
            #                                                            transformation_ctx="datasink0")

            # ---- output 2: student_behavior ------------------------------
            applymapping1 = ApplyMapping.apply(
                frame=dyf_rating_cara,
                mappings=[
                    ("student_behavior_id", "string", "student_behavior_id", "string"),
                    ("student_behavior_date", "long", "student_behavior_date", "long"),
                    ("behavior_id", "long", "behavior_id", "long"),
                    ("student_id", "long", "student_id", "long"),
                    ("contact_id", "string", "contact_id", "string"),
                    ("package_code", "string", "package_code", "string"),
                    ("package_endtime", "long", "package_endtime", "long"),
                    ("package_starttime", "long", "package_starttime", "long"),
                    ("student_level_code", "string", "student_level_code", "string"),
                    ("student_status_code", "string", "student_status_code", "string"),
                    ("transformed_at", "long", "transformed_at", "long")
                ])
            applymapping1.printSchema()
            print applymapping1.count()
            applymapping1.show(5)
            resolvechoice1 = ResolveChoice.apply(
                frame=applymapping1,
                choice="make_cols",
                transformation_ctx="resolvechoice1")
            dropnullfields1 = DropNullFields.apply(
                frame=resolvechoice1, transformation_ctx="dropnullfields1")
            print resolvechoice1.count()
            resolvechoice1.printSchema()
            # resolvechoice1.show(10)

            print('START WRITE TO S3-------------------------')
            datasink6 = glueContext.write_dynamic_frame.from_options(
                frame=dropnullfields1,
                connection_type="s3",
                connection_options={
                    "path": "s3://dtsodin/student_behavior/student_behavior/",
                    "partitionKeys": ["behavior_id"]
                },
                format="parquet",
                transformation_ctx="datasink6")
            print('END WRITE TO S3-------------------------')
            # datasink1 = glueContext.write_dynamic_frame.from_jdbc_conf(frame=dropnullfields1,
            #                                                            catalog_connection="glue_redshift",
            #                                                            connection_options={
            #                                                                "dbtable": "student_behavior",
            #                                                                "database": "dts_odin"
            #                                                            },
            #                                                            redshift_tmp_dir="s3a://dtsodin/temp/student_behavior",
            #                                                            transformation_ctx="datasink1")

            # Advance the incremental-load flag to the max _key processed.
            df_temp = dyf_advisorcall.toDF()
            flag = df_temp.agg({"_key": "max"}).collect()[0][0]
            flag_data = [flag]
            df = spark.createDataFrame(flag_data, "long").toDF('flag')
            # Overwrite the _key flag on S3.
            df.write.parquet(
                "s3a://dtsodin/flag/flag_hoc_vien_rating_hotline.parquet",
                mode="overwrite")
        except Exception as e:
            print e
def main(): glueContext = GlueContext(SparkContext.getOrCreate()) spark = glueContext.spark_session # ----------------------------------------------DYF-----------------------------------------------------------------# dyf_tpe_enduser_used_product = glueContext.create_dynamic_frame.from_catalog( database="tig_market", table_name="tpe_enduser_used_product") dyf_tpe_enduser_used_product = dyf_tpe_enduser_used_product.select_fields( ['contact_id', 'product_id', 'timecreated']) # -----------------------------------------DYF-----------------------------------------------------------------------# dyf_tpe_invoice_product_details = glueContext.create_dynamic_frame.from_catalog( database="tig_market", table_name="tpe_invoice_product_details") dyf_tpe_invoice_product_details = dyf_tpe_invoice_product_details.select_fields( ['id', 'cat_code']) # ----------------------------------------------DYF-----------------------------------------------------------------# dyf_student_contact = glueContext.create_dynamic_frame.from_catalog( database="tig_advisor", table_name="student_contact") dyf_student_contact = dyf_student_contact.select_fields( ['contact_id', 'student_id']).rename_field('contact_id', 'ct_id') # dyf_student_contact = Filter.apply(frame=dyf_student_contact, # f=lambda x: x["contact_id"] is not None and x["contact_id"] != '' # and x["student_id"] is not None and x["student_id"] != '' # ) df_student_contact = dyf_student_contact.toDF() print('df_student_contact') df_student_contact.show() #-------------------------------------------------------------------------------------------------------------------# df_tpe_invoice_product_details = dyf_tpe_invoice_product_details.toDF() df_tpe_invoice_product_details = df_tpe_invoice_product_details.\ where("cat_code like 'TAAM%' OR cat_code like 'TENUP%' ") df_tpe_invoice_product_details = df_tpe_invoice_product_details.withColumn( 'to_status_id', f.when(df_tpe_invoice_product_details.cat_code.like('TAAM%'), f.lit(999L)).when( 
df_tpe_invoice_product_details.cat_code.like('TENUP%'), f.lit(998L)).otherwise(f.lit(999999999L))) df_tpe_invoice_product_details.show(2) df_tpe_enduser_used_product = dyf_tpe_enduser_used_product.toDF() #-----------------------------------------------____JOIN______------------------------------------------------------# df_join = df_tpe_invoice_product_details.join( df_tpe_enduser_used_product, df_tpe_invoice_product_details.id == df_tpe_enduser_used_product.product_id) df_join.printSchema() print('df_join ::', df_join.count()) df_join1 = df_join.join(df_student_contact, df_student_contact.ct_id == df_join.contact_id) df_join1 = df_join1.withColumn( 'change_status_date_id', from_unixtime(df_join1.timecreated, "yyyyMMdd")) df_join1.printSchema() print('df_join1 ::', df_join1.count()) #-------------------------------------------------------------------------------------------------------------------# df_result = df_join1.select('student_id', 'change_status_date_id', 'to_status_id', 'contact_id') df_result.printSchema() df_result.show(3) df_result = df_result.drop_duplicates() df_result.cache() print('count df_result::', df_result.count()) dyf_result = DynamicFrame.fromDF(df_result, glueContext, "dyf_result") dyf_result = Filter.apply( frame=dyf_result, f=lambda x: x["student_id"] is not None and x[ "change_status_date_id"] is not None and x[ "to_status_id"] is not None and x["contact_id"] is not None) apply_output = ApplyMapping.apply( frame=dyf_result, mappings=[ ("student_id", "string", "student_id", "long"), # ("user_id", "long", "user_id", "long"), ("change_status_date_id", "string", "change_status_date_id", "long" ), # ("from_status_id", "long", "from_status_id", "long"), ("to_status_id", "long", "to_status_id", "long"), # ("measure1", "double", "measure1", "double"), # ("measure2", "double", "measure2", "double"), # ("description", "string", "description", "string"), # ("timestamp1", "string", "timestamp1", "string"), ("contact_id", "string", "contact_id", 
"string"), # ("teacher_id", "long", "teacher_id", "long"), # ("contact_id1", "string", "contact_id1", "string"), # ("measure1_int", "int", "measure1_int", "int"), # ("measure2_int", "int", "measure2_int", "int"), # ("contact_id_str", "string", "contact_id_str", "string"), # ("lc", "string", "lc", "string"), # ("student_id_string", "string", "student_id_string", "string") ]) df_apply_output = apply_output.toDF() df_apply_output.drop_duplicates() print('df_apply_output.count', df_apply_output.count()) dyf_apply_output = DynamicFrame.fromDF(df_apply_output, glueContext, "dyf_apply_output") resolve_choice = ResolveChoice.apply(frame=dyf_apply_output, choice="make_cols", transformation_ctx="resolvechoice2") dropnullfields = DropNullFields.apply(frame=resolve_choice, transformation_ctx="dropnullfields") datasink4 = glueContext.write_dynamic_frame.from_jdbc_conf( frame=dropnullfields, catalog_connection="glue_redshift", connection_options={ "dbtable": "temp_mapping_status", "database": "dts_odin", "postactions": """ insert into mapping_changed_status_student_v1(student_id, change_status_date_id, to_status_id, contact_id) select student_id, change_status_date_id, to_status_id, contact_id from temp_mapping_status; update mapping_changed_status_student_v1 set user_id = (select user_id from user_map where source_type = 2 and source_id = student_id) where user_id is null; DROP TABLE IF EXISTS temp_mapping_status """ }, redshift_tmp_dir="s3n://datashine-dwh/temp1/", transformation_ctx="datasink4") df_result.unpersist() df_student_contact.unpersist() print( '------------------------>___complete__________------------------------------>' )
def main():
    """Job entry point: build a GlueContext and run the student-package ETL.

    NOTE(review): the category ETL call below is intentionally disabled in the
    original; re-enable by uncommenting.
    """
    glueContext = GlueContext(SparkContext.getOrCreate())
    # The original also bound `spark = glueContext.spark_session`, but the
    # session was never used in this entry point; the unused local is removed.
    # etl_package_category(glueContext)
    etl_student_package(glueContext)
def main():
    """ETL for the `dim_advisor` dimension plus the advisor/EG mapping.

    Steps:
      1. Read `tig_advisor.advisor_account` from the Glue catalog and keep only
         the columns needed for the dimension (with a few renames).
      2. Incremental load: keep rows whose `_key` is greater than the flag
         stored at s3://dts-odin/flag/flag_CVHD.parquet.
      3. Map/normalise the columns and write them to Redshift `dim_advisor`,
         then persist the new max `_key` back to the flag file.
      4. Read `dm_toa.advisor_eg` and load it into `dim_advisor_eg`; a Redshift
         post-action copies eg/bo_phan into `dim_advisor` and drops the temp
         table.
    """
    glueContext = GlueContext(SparkContext.getOrCreate())
    spark = glueContext.spark_session
    datasource0 = glueContext.create_dynamic_frame.from_catalog(
        database="tig_advisor",
        table_name="advisor_account",
        transformation_ctx="datasource0")
    datasource0 = datasource0.select_fields([
        '_key', 'user_id', 'user_name', 'user_display_name', 'user_email',
        'user_phone', 'ip_phone_number', 'level', 'advisor_deleted'
    ]).rename_field('user_id', 'id').rename_field('user_name',
                                                  'ten').rename_field(
                                                      'advisor_deleted',
                                                      'advisor_deleted_tmp')

    # Read the incremental-load flag (max _key of the previous run) from S3.
    df_flag = spark.read.parquet("s3://dts-odin/flag/flag_CVHD.parquet")

    # Compare `_key` against the flag: only rows with a larger key are new.
    data = datasource0.toDF()
    data = data.where(data['_key'] > df_flag.collect()[0]['flag'])
    data = data.withColumn('type_eg', f.lit(None))
    data = data.withColumn('advisor_type', f.lit(None))
    # `advisor_deleted_tmp` is truthy/falsy in the source; normalise to 1/0.
    data = data.withColumn(
        'advisor_deleted',
        when(data.advisor_deleted_tmp, f.lit(1)).otherwise(f.lit(0)))
    data.printSchema()
    datasource0 = DynamicFrame.fromDF(data, glueContext, "datasource0")
    # datasource0.show()
    if (datasource0.count() > 0):
        try:
            # Select/rename the fields into the shape dim_advisor expects.
            applymapping1 = ApplyMapping.apply(
                frame=datasource0,
                mappings=[("id", "int", "id", "bigint"),
                          ("ten", "string", "username", "string"),
                          ("user_display_name", "string", "name", "string"),
                          ("user_email", "string", "email", "string"),
                          ("level", "int", "level", "int"),
                          ("advisor_deleted", "int", "advisor_deleted", "int"),
                          ("type_eg", "int", "type_eg", "string"),
                          ("advisor_type", "int", "advisor_type", "string")],
                transformation_ctx="applymapping1")
            resolvechoice2 = ResolveChoice.apply(
                frame=applymapping1,
                choice="make_cols",
                transformation_ctx="resolvechoice2")
            dropnullfields3 = DropNullFields.apply(
                frame=resolvechoice2, transformation_ctx="dropnullfields3")

            # Write the dimension rows to Redshift.
            datasink4 = glueContext.write_dynamic_frame.from_jdbc_conf(
                frame=dropnullfields3,
                catalog_connection="glue_redshift",
                connection_options={
                    "dbtable": "dim_advisor",
                    "database": "dts_odin"
                },
                redshift_tmp_dir="s3n://dts-odin/backup/advisor_account/",
                transformation_ctx="datasink4")

            # Take the max `_key` seen in this run ...
            datasource = datasource0.toDF()
            flag = datasource.agg({"_key": "max"}).collect()[0][0]
            # ... and overwrite the flag file with it.
            flag_data = [flag]
            df = spark.createDataFrame(flag_data, "long").toDF('flag')
            df.write.parquet("s3a://dts-odin/flag/flag_CVHD.parquet",
                             mode="overwrite")
        except Exception as e:
            # FIX: was a bare `except:` which also caught SystemExit /
            # KeyboardInterrupt and silently hid the failure cause. Narrowed to
            # Exception and the error is now logged. The original best-effort
            # behaviour is preserved: even when the Redshift write fails (e.g.
            # no rows matched the filter) the flag is still advanced.
            print('dim_advisor load failed, still advancing flag: ', e)
            datasource = datasource0.toDF()
            flag = datasource.agg({"_key": "max"}).collect()[0][0]
            flag_data = [flag]
            df = spark.createDataFrame(flag_data, "long").toDF('flag')
            df.write.parquet("s3a://dts-odin/flag/flag_CVHD.parquet",
                             mode="overwrite")

    # EG: advisor -> eg / bo_phan mapping.
    datasource = glueContext.create_dynamic_frame.from_catalog(
        database="dm_toa", table_name="advisor_eg")
    # Keep only the needed fields.
    datasource = datasource.select_fields(
        ['_key', 'advisor_id', 'bo_phan', 'eg'])
    datasource = datasource.resolveChoice(specs=[('_key', 'cast:long')])
    data = datasource.toDF()
    # data = data.where(data['_key'] > df_flag.collect()[0]['flag'])
    # data = data.where(data['_key'] < 276961)
    datasource = DynamicFrame.fromDF(data, glueContext, "datasource")
    if (datasource.count() > 0):
        applymapping1 = ApplyMapping.apply(
            frame=datasource,
            mappings=[("advisor_id", "string", "advisor_id", "string"),
                      ("bo_phan", "string", "bo_phan", "string"),
                      ("eg", "string", "eg", "string")])
        resolvechoice2 = ResolveChoice.apply(
            frame=applymapping1,
            choice="make_cols",
            transformation_ctx="resolvechoice2")
        dropnullfields3 = DropNullFields.apply(
            frame=resolvechoice2, transformation_ctx="dropnullfields3")
        # Stage into dim_advisor_eg; the post-action folds eg/bo_phan into
        # dim_advisor and drops the staging table.
        datasink4 = glueContext.write_dynamic_frame.from_jdbc_conf(
            frame=dropnullfields3,
            catalog_connection="glue_redshift",
            connection_options={
                "dbtable": "dim_advisor_eg",
                "database": "dts_odin",
                "postactions": """update dim_advisor set type_eg = eg, advisor_type = bo_phan from 
                                                dim_advisor_eg where id=advisor_id; DROP TABLE IF EXISTS public.dim_advisor_eg"""
            },
            redshift_tmp_dir="s3n://dts-odin/backup/advisor_account/",
            transformation_ctx="datasink4")
def main():
    """Export the Redshift table `mapping_lo_class` to S3 as parquet.

    The large commented-out sections below are earlier one-off exports of
    sibling tables (learning_object, mapping_class_lms, learning_object_class),
    kept for reference.
    """
    glueContext = GlueContext(SparkContext.getOrCreate())
    spark = glueContext.spark_session
    # dyf_learning_object = glueContext.create_dynamic_frame.from_options(
    #     connection_type="redshift",
    #     connection_options={
    #         "url": "jdbc:redshift://datashine-dev.c4wxydftpsto.ap-southeast-1.redshift.amazonaws.com:5439/dts_odin",
    #         "user": "******",
    #         "password": "******",
    #         "dbtable": "learning_object",
    #         "redshiftTmpDir": "s3n://dts-odin/temp1/dyf_learning_object"}
    # )
    #
    # # resolvechoice = ResolveChoice.apply(frame=dyf_learning_object, choice="make_cols", transformation_ctx="resolvechoice2")
    # # dropnullfields = DropNullFields.apply(frame=resolvechoice, transformation_ctx="dropnullfields3")
    # print('count', dropnullfields.count())
    # dropnullfields.printSchema()
    # dropnullfields.show()
    # datasink5 = glueContext.write_dynamic_frame.from_options(frame=dropnullfields, connection_type="s3",
    #                                                          connection_options={
    #                                                              "path": "s3://dts-odin/nvn_knowledge/learning_object/"},
    #                                                          format="parquet", transformation_ctx="datasink6")

    # dyf_mapping_class_lms = glueContext.create_dynamic_frame.from_options(
    #     connection_type="redshift",
    #     connection_options={
    #         "url": "jdbc:redshift://datashine-dev.c4wxydftpsto.ap-southeast-1.redshift.amazonaws.com:5439/dts_odin",
    #         "user": "******",
    #         "password": "******",
    #         "dbtable": "mapping_class_lms",
    #         "redshiftTmpDir": "s3n://dts-odin/temp1/mapping_class_lms"}
    # )
    #
    # # dyf_mapping_class_lms = ResolveChoice.apply(frame=dyf_mapping_class_lms, choice="make_cols", transformation_ctx="resolvechoice2")
    # datasink5 = glueContext.write_dynamic_frame.from_options(frame=dyf_mapping_class_lms, connection_type="s3",
    #                                                          connection_options={
    #                                                              "path": "s3://dts-odin/nvn_knowledge/mapping_class_lms/"},
    #                                                          format="parquet", transformation_ctx="datasink6")

    # dyf_learning_object_class = glueContext.create_dynamic_frame.from_options(
    #     connection_type="redshift",
    #     connection_options={
    #         "url": "jdbc:redshift://datashine-dev.c4wxydftpsto.ap-southeast-1.redshift.amazonaws.com:5439/dts_odin",
    #         "user": "******",
    #         "password": "******",
    #         "dbtable": "learning_object_class",
    #         "redshiftTmpDir": "s3n://dts-odin/temp1/mapping_class_lms"}
    # )
    #
    # # dyf_learning_object_class = ResolveChoice.apply(frame=dyf_learning_object_class, choice="make_cols",
    # #                                                 transformation_ctx="resolvechoice2")
    # dyf_learning_object_class.printSchema()
    # dyf_learning_object_class.show()
    # datasink5 = glueContext.write_dynamic_frame.from_options(frame=dyf_learning_object_class, connection_type="s3",
    #                                                          connection_options={
    #                                                              "path": "s3://dts-odin/nvn_knowledge/learning_object_class/"},
    #                                                          format="parquet", transformation_ctx="datasink6")

    # NOTE(review): the credentials below are masked placeholders ("******") —
    # presumably substituted at deploy time; confirm, and prefer a catalogued
    # Glue connection over inline credentials.
    dyf_mapping_lo_class = glueContext.create_dynamic_frame.from_options(
        connection_type="redshift",
        connection_options={
            "url":
            "jdbc:redshift://datashine-dev.c4wxydftpsto.ap-southeast-1.redshift.amazonaws.com:5439/dts_odin",
            "user": "******",
            "password": "******",
            "dbtable": "mapping_lo_class",
            "redshiftTmpDir": "s3n://dts-odin/temp1/mapping_class_lms"
        })
    #
    # Make ambiguous column types concrete before writing parquet.
    dyf_mapping_lo_class = ResolveChoice.apply(
        frame=dyf_mapping_lo_class,
        choice="make_cols",
        transformation_ctx="resolvechoice2")
    # dyf_learning_object_class.printSchema()
    # dyf_learning_object_class.show()
    datasink5 = glueContext.write_dynamic_frame.from_options(
        frame=dyf_mapping_lo_class,
        connection_type="s3",
        connection_options={
            "path": "s3://dts-odin/nvn_knowledge/mapping_lo_class/"
        },
        format="parquet",
        transformation_ctx="datasink6")
def main():
    """ETL: detect students reaching status S1 ("welcomed successfully").

    Two sources (`myjira.historystatustoa` and `dm_toa.craw_gs_care`) are staged
    into the same Redshift table `temp_ls_trang_thai_a1`; a post-action stored
    procedure (`proc_insert_chao_mung_thanh_cong`) folds the staged rows into
    the warehouse and then drops the staging table. Each source keeps its own
    incremental-load flag (max `_key`) in S3.
    """
    sc = SparkContext()
    glueContext = GlueContext(sc)
    spark = glueContext.spark_session
    # job = Job(glueContext)
    # job.init(args['JOB_NAME'], args)
    spark.conf.set("spark.sql.session.timeZone", "GMT+07:00")
    datasource = glueContext.create_dynamic_frame.from_catalog(
        database="myjira", table_name="historystatustoa")
    # Keep only the fields we need.
    datasource = datasource.select_fields([
        '_key', 'change_group_id', 'student_id', 'modification_time',
        'old_value_code', 'new_value_code'
    ])
    datasource = datasource.resolveChoice(specs=[('_key', 'cast:long')])
    # Read the incremental flag from S3.
    df_flag = spark.read.parquet("s3a://dts-odin/flag/flag_LS_A1.parquet")
    # Compare `_key` against the flag: only rows with a larger key are new.
    # NOTE(review): the actual incremental filter below is commented out, so
    # every run currently re-reads the whole table — confirm this is intended.
    max_key = df_flag.collect()[0]['flag']
    # data = datasource.toDF()
    # data = data.where(data['_key'] > df_flag.collect()[0]['flag'])
    # data = data.where(data['_key'] < 276961)
    # datasource = DynamicFrame.fromDF(data, glueContext, "datasource")
    # datasource = Filter.apply(frame=datasource, f=lambda x: x["_key"] > max_key)
    print("max_key: ", max_key)
    print("Count data datasource: ", datasource.count())
    if (datasource.count() > 0):
        # datasource_df = datasource.toDF()
        # datasource_df = datasource_df.where("student_id IS NOT NULL")
        # datasource_df = datasource_df.where("modification_time IS NOT NULL")
        # # old status 10519 - S0: student has been assigned an advisor
        # datasource_df = datasource_df.where("old_value_code = '10519'")
        # # new status 11013 - S1: student welcomed successfully
        # datasource_df = datasource_df.where("new_value_code = '11013'")
        # # data = DynamicFrame.fromDF(datasource_df, glueContext, "data")
        # Keep only S0 (10519) -> S1 (11013) transitions that carry both a
        # student id and a modification time.
        datasource = Filter.apply(
            frame=datasource,
            f=lambda x: x["student_id"] is not None and x["modification_time"]
            is not None and x["old_value_code"] == '10519' and x[
                "new_value_code"] == '11013')
        # datasource = Filter.apply(frame=datasource, f=lambda x: x["modification_time"] is not None)
        # datasource = Filter.apply(frame=datasource, f=lambda x: x["old_value_code"] == '10519')
        # datasource = Filter.apply(frame=datasource, f=lambda x: x["new_value_code"] == '11013')
        print("Count data: ", datasource.count())
        applymapping1 = ApplyMapping.apply(
            frame=datasource,
            mappings=[("student_id", "string", "student_id", "string"),
                      ("modification_time", "string", "ngay_a1", "timestamp"),
                      ("change_group_id", "string", "id", "string")])
        resolvechoice2 = ResolveChoice.apply(
            frame=applymapping1,
            choice="make_cols",
            transformation_ctx="resolvechoice2")
        dropnullfields3 = DropNullFields.apply(
            frame=resolvechoice2, transformation_ctx="dropnullfields3")
        # Stage into Redshift; post-action runs the fold procedure and drops
        # the staging table.
        datasink4 = glueContext.write_dynamic_frame.from_jdbc_conf(
            frame=dropnullfields3,
            catalog_connection="glue_redshift",
            connection_options={
                "dbtable": "temp_ls_trang_thai_a1",
                "database": "dts_odin",
                "postactions": """ call proc_insert_chao_mung_thanh_cong(); DROP TABLE IF EXISTS temp_ls_trang_thai_a1 """
            },
            redshift_tmp_dir="s3n://dts-odin/historystatustoa/",
            transformation_ctx="datasink4")
        # Write the flag back: take the max `_key` seen in this run ...
        datasourceTmp = datasource.toDF()
        flag = datasourceTmp.agg({"_key": "max"}).collect()[0][0]
        flag_data = [flag]
        df = spark.createDataFrame(flag_data, "long").toDF('flag')
        # ... and overwrite the `_key` checkpoint in S3.
        df.write.parquet("s3a://dts-odin/flag/flag_LS_A1.parquet",
                         mode="overwrite")
    ###############################################################################################
    ### Second source: craw_gs_care
    datasourceCraw = glueContext.create_dynamic_frame.from_catalog(
        database="dm_toa", table_name="craw_gs_care")
    # Keep only the fields we need.
    datasourceCraw = datasourceCraw.select_fields(['_key', 'id', 'datetime'])
    datasourceCraw = datasourceCraw.resolveChoice(specs=[('_key',
                                                          'cast:long')])
    # Read this source's own flag from S3.
    df_flag_craw = spark.read.parquet(
        "s3://dts-odin/flag/flag_LS_A1_1.parquet")
    max_key = df_flag_craw.collect()[0]['flag']
    # Incremental filter (commented out — same caveat as above).
    # dataCraw = datasourceCraw.toDF()
    # dataCraw = dataCraw.where(data['_key'] > df_flag_craw.collect()[0]['flag'])
    # dataCraw = dataCraw.where(dataCraw['_key'] < 276961)
    # datasourceCraw = DynamicFrame.fromDF(dataCraw, glueContext, "datasourceCraw")
    # datasourceCraw = Filter.apply(frame=datasourceCraw, f=lambda x: x["_key"] > max_key)
    print("max_key: ", max_key)
    # print "Count data datasourceCraw: ", datasourceCraw.count()
    if (datasourceCraw.count() > 0):
        # datasource_df = datasourceCraw.toDF()
        # datasource_df = datasource_df.where("id IS NOT NULL")
        # datasource_df = datasource_df.where("datetime IS NOT NULL")
        datasourceCraw = Filter.apply(frame=datasourceCraw,
                                      f=lambda x: x["id"] is not None)
        datasourceCraw = Filter.apply(frame=datasourceCraw,
                                      f=lambda x: x["datetime"] is not None)
        # data = DynamicFrame.fromDF(datasource_df, glueContext, "data")
        print("Count data: ", datasourceCraw.count())
        datasourceCraw.show()
        # NOTE(review): `_key` was cast to long above but is declared "string"
        # here, and `id` (int) becomes the student_id string — confirm these
        # declared source types against the catalog schema.
        applymapping1 = ApplyMapping.apply(
            frame=datasourceCraw,
            mappings=[("id", "int", "student_id", "string"),
                      ("datetime", "string", "ngay_a1", "timestamp"),
                      ("_key", "string", "id", "string")])
        resolvechoice2 = ResolveChoice.apply(
            frame=applymapping1,
            choice="make_cols",
            transformation_ctx="resolvechoice2")
        dropnullfields3 = DropNullFields.apply(
            frame=resolvechoice2, transformation_ctx="dropnullfields3")
        datasink4 = glueContext.write_dynamic_frame.from_jdbc_conf(
            frame=dropnullfields3,
            catalog_connection="glue_redshift",
            connection_options={
                "dbtable": "temp_ls_trang_thai_a1",
                "database": "dts_odin",
                "postactions": """ call proc_insert_chao_mung_thanh_cong(); DROP TABLE IF EXISTS temp_ls_trang_thai_a1 """
            },
            redshift_tmp_dir="s3n://dts-odin/craw_gs_care/",
            transformation_ctx="datasink4")
        # Write this source's flag back (max `_key` of the run).
        datasourceTmp = datasourceCraw.toDF()
        flag = datasourceTmp.agg({"_key": "max"}).collect()[0][0]
        flag_data = [flag]
        df = spark.createDataFrame(flag_data, "long").toDF('flag')
        # Overwrite the `_key` checkpoint in S3.
        df.write.parquet("s3n://dts-odin/flag/flag_LS_A1_1.parquet",
                         mode="overwrite")
# Stand-alone Glue job boilerplate: a "Hello, World!" smoke test that
# exercises job-bookmark initialisation and commit.
import sys
from awsglue.context import GlueContext
from awsglue.job import Job
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext

# Build the Spark and Glue contexts, then initialise the Glue Job wrapper
# (required for job bookmarks) from the JOB_NAME argument.
spark_context = SparkContext()
glue_context = GlueContext(spark_context)
job = Job(glue_context)
args = getResolvedOptions(sys.argv, ["JOB_NAME"])
job.init(args["JOB_NAME"], args)
print("Hello, World!")
# Commit marks the bookmark state for this run.
job.commit()
def main():
    """Score student learning objects from `moodlestarter.ai_study_step`.

    Rows are split by `tag` into three sections — 'aip', 'micro', 'ait' — each
    scored with its own UDF and written to its own Redshift table
    (mapping_lo_student_starter_1 / _2 / mapping_lo_student_starter). Finally
    the max `_key` is written back to S3 as the incremental-load flag.
    """
    glueContext = GlueContext(SparkContext.getOrCreate())
    spark = glueContext.spark_session

    # TBHV E ngay
    # LO_TYPE: 1: vocabulary (tu vung); 2: pronunciation (ngu am);
    #          3: listening (nghe); 4: grammar (ngu phap)
    # Custom function: length of a JSON-encoded array (0 when null).
    def get_length(array_str):
        json_obj = json.loads(array_str)
        # index = 0;
        # for item in json_obj:
        #     index += 1
        length = 0
        if json_obj is not None:
            length = len(json_obj)
        return length

    udf_get_length = udf(get_length, IntegerType())

    # Learning-object code groups per skill (codes are strings).
    arr_aip_tu_vung = ['3', '4', '5', '17']
    arr_aip_ngu_phap = ['6', '7', '8', '9', '18']
    arr_aip_ngu_am = ['16']
    arr_aip_nghe = ['10', '11', '12', '13', '14', '15']
    arr_knowledge = ['3', '4', '5', '6', '7', '8', '9', '10', '11', '12',
                     '13', '14', '15', '16', '17', '18']
    arr_comprehension = ['8', '9', '14']

    def do_add_lo_type(code):
        # Map a code to its LO_TYPE (see table above); -1 when unknown.
        lo_type = -1
        code = str(code)
        for x in arr_aip_tu_vung:
            if x == code:
                lo_type = 1
        for x in arr_aip_ngu_am:
            if x == code:
                lo_type = 2
        for x in arr_aip_nghe:
            if x == code:
                lo_type = 3
        for x in arr_aip_ngu_phap:
            if x == code:
                lo_type = 4
        return lo_type

    add_lo_type = udf(do_add_lo_type, IntegerType())

    def do_add_score_aip(code, type, lo_type, correct_answer, student_answer):
        # Score an 'aip' answer for the given dimension (`type`); the score
        # only applies when `code` belongs to that dimension's group.
        code = str(code)
        score = 0
        arr = []
        # pronunciation score
        if lo_type == 2 and correct_answer == student_answer:
            score = 2
        if lo_type == 2 and correct_answer != student_answer:
            score = -1
        # scores for everything other than pronunciation
        if lo_type != 2 and correct_answer == student_answer:
            score = 10
        if lo_type != 2 and correct_answer != student_answer:
            score = -5
        if type == 'knowledge':
            arr = arr_knowledge
        for x in arr:
            if x == code:
                return score
        return 0

    add_score_aip = udf(do_add_score_aip, IntegerType())

    def do_add_score_micro(code, type, lo_type, total_step, count_step):
        # Score a 'micro' lesson: +10 when >= 70% of steps succeeded, else -5.
        # NOTE(review): under Python 2, count_step / total_step is integer
        # division when both are ints, so the ratio is 0 or 1 — confirm the
        # incoming types or the intended semantics.
        code = str(code)
        score = 0
        arr = []
        percent_success = 0.7
        if count_step / total_step >= percent_success:
            score = 10
        else:
            score = -5
        if type == 'knowledge':
            arr = arr_knowledge
        if type == 'comprehension':
            arr = arr_comprehension
        for x in arr:
            if x == code:
                return score
        return 0

    add_score_micro = udf(do_add_score_micro, IntegerType())

    def do_add_score_ait(total_step, max_step, received_point, length_answer):
        # Score an 'ait' lesson; only completed lessons (all steps) score.
        score = 0
        if total_step == max_step:
            if length_answer <= 3 and received_point >= 3:
                score = 30
            if length_answer <= 3 and received_point <= 2:
                score = 10
            if length_answer >= 4 and received_point <= 2:
                score = -15
        return score

    add_score_ait = udf(do_add_score_ait, IntegerType())

    ########## dyf_ai_study_step
    dyf_ai_study_step = glueContext.create_dynamic_frame.from_catalog(
        database="moodlestarter",
        table_name="ai_study_step"
    )
    dyf_ai_study_step = dyf_ai_study_step.select_fields(
        ['_key', 'user_id', 'lesson_id', 'tag', 'current_step', 'total_step',
         'learning_object', 'learning_object_type', 'correct_answer',
         'student_answer', 'student_answer_details', 'max_point',
         'received_point', 'created_at', 'page_style', 'session_id'])
    try:
        # # read the flag checkpoint from s3
        df_flag = spark.read.parquet(
            "s3://dts-odin/flag/flag_ai_study_step.parquet")
        max_key = df_flag.collect()[0]['flag']
        print('read from index: ', max_key)
        # Compare `_key` with the flag and keep only newer rows.
        # NOTE(review): the filter is commented out, so every run re-reads the
        # whole table even though the flag is read and rewritten — confirm.
        # dyf_ai_study_step = Filter.apply(frame=dyf_ai_study_step, f=lambda x: x['_key'] > max_key)
    except:
        print('read flag error ')
    if dyf_ai_study_step.count() > 0:
        try:
            ## Handle tag == 'aip'
            dyf_aip = Filter.apply(frame=dyf_ai_study_step,
                                   f=lambda x: x['tag'] == 'aip')
            df_aip = dyf_aip.toDF()

            def random_code():
                # NOTE(review): a *random* code 1..16 is assigned per row,
                # i.e. the learning-object group is picked at random —
                # presumably a placeholder until real codes exist; confirm.
                return random.randint(1, 16)

            add_code = udf(random_code, IntegerType())
            df_aip = df_aip.withColumn("code", add_code())
            df_aip.printSchema()
            df_aip = df_aip.withColumn("lo_type", add_lo_type(df_aip.code))
            # One score column per Bloom dimension, plus the yyyyMMdd date id.
            df_aip = df_aip.withColumn("knowledge", add_score_aip(df_aip.code, f.lit('knowledge'), df_aip.lo_type, df_aip.correct_answer, df_aip.student_answer)) \
                .withColumn("comprehension", add_score_aip(df_aip.code, f.lit('comprehension'), df_aip.lo_type, df_aip.correct_answer, df_aip.student_answer)) \
                .withColumn("application", add_score_aip(df_aip.code, f.lit('application'), df_aip.lo_type, df_aip.correct_answer, df_aip.student_answer)) \
                .withColumn("analysis", add_score_aip(df_aip.code, f.lit('analysis'), df_aip.lo_type, df_aip.correct_answer, df_aip.student_answer)) \
                .withColumn("synthesis", add_score_aip(df_aip.code, f.lit('synthesis'), df_aip.lo_type, df_aip.correct_answer, df_aip.student_answer)) \
                .withColumn("evaluation", add_score_aip(df_aip.code, f.lit('evaluation'), df_aip.lo_type, df_aip.correct_answer, df_aip.student_answer)) \
                .withColumn("date_id", from_unixtime(unix_timestamp(df_aip.created_at, "yyyy-MM-dd HH:mm:ss"), "yyyyMMdd"))
            df_aip.printSchema()
            df_aip.show()
            dyf_aip = DynamicFrame.fromDF(df_aip, glueContext, "dyf_aip")
            applymapping = ApplyMapping.apply(
                frame=dyf_aip,
                mappings=[("created_at", "string", "created_at", "timestamp"),
                          ("user_id", 'string', 'student_id', 'long'),
                          ("correct_answer", "string", "learning_object", "string"),
                          ("date_id", "string", "date_id", "int"),
                          ("knowledge", "int", "knowledge", "int"),
                          ("comprehension", "int", "comprehension", "int"),
                          ("application", "int", "application", "int"),
                          ("analysis", "int", "analysis", "int"),
                          ("synthesis", "int", "synthesis", "int"),
                          ("evaluation", "int", "evaluation", "int")])
            resolvechoice = ResolveChoice.apply(
                frame=applymapping,
                choice="make_cols",
                transformation_ctx="resolvechoice2")
            dropnullfields = DropNullFields.apply(
                frame=resolvechoice, transformation_ctx="dropnullfields")
            dropnullfields.printSchema()
            dropnullfields.show()
            datasink5 = glueContext.write_dynamic_frame.from_jdbc_conf(
                frame=dropnullfields,
                catalog_connection="glue_redshift",
                connection_options={
                    "dbtable": "mapping_lo_student_starter_1",
                    "database": "dts_odin"
                },
                redshift_tmp_dir="s3n://dts-odin/ai_study_step/",
                transformation_ctx="datasink5")
        except Exception as e:
            print("###################### Exception ##########################")
            print(e)
        try:
            ## Handle tag == 'micro'
            dyf_micro = Filter.apply(frame=dyf_ai_study_step,
                                     f=lambda x: x['tag'] == 'micro')
            df_micro = dyf_micro.toDF()
            # Max step reached per (user, lesson, session); keep sessions that
            # progressed to at least step 4.
            df_micro_max_step = df_micro.groupby('user_id', 'lesson_id', 'session_id').agg(
                f.max('current_step').alias("max_step"))
            df_micro_max_step = df_micro_max_step.where("max_step >= 4")
            df_micro_max_step = df_micro_max_step.withColumnRenamed('user_id', 'user_id1') \
                .withColumnRenamed('lesson_id', 'lesson_id1') \
                .withColumnRenamed('session_id', 'session_id1')
            # Count of fully-scored steps per (user, lesson, session).
            df_micro_received_point = df_micro.where("max_point = received_point")
            df_micro_received_point = df_micro_received_point.groupby('user_id', 'lesson_id', 'session_id').agg(
                f.count('received_point').alias("count_received_point"))
            df_micro_received_point = df_micro_received_point.withColumnRenamed('user_id', 'user_id2') \
                .withColumnRenamed('lesson_id', 'lesson_id2') \
                .withColumnRenamed('session_id', 'session_id2')
            df_micro = df_micro.join(
                df_micro_max_step,
                (df_micro['user_id'] == df_micro_max_step['user_id1']) &
                (df_micro['lesson_id'] == df_micro_max_step['lesson_id1']) &
                (df_micro['session_id'] == df_micro_max_step['session_id1']))
            df_micro = df_micro.join(
                df_micro_received_point,
                (df_micro['user_id'] == df_micro_received_point['user_id2']) &
                (df_micro['lesson_id'] == df_micro_received_point['lesson_id2']) &
                (df_micro['session_id'] == df_micro_received_point['session_id2']))

            def random_code1():
                # Random code 17..18 — same placeholder caveat as above.
                return random.randint(17, 18)

            add_code1 = udf(random_code1, IntegerType())
            df_micro = df_micro.withColumn("code", add_code1())
            df_micro = df_micro.withColumn("lo_type", add_lo_type(df_micro.code))
            df_micro = df_micro.withColumn("knowledge", add_score_micro(df_micro.code, f.lit('knowledge'), df_micro.lo_type, df_micro.total_step, df_micro.count_received_point)) \
                .withColumn("comprehension", add_score_micro(df_micro.code, f.lit('comprehension'), df_micro.lo_type, df_micro.total_step, df_micro.count_received_point)) \
                .withColumn("application", add_score_micro(df_micro.code, f.lit('application'), df_micro.lo_type, df_micro.total_step, df_micro.count_received_point)) \
                .withColumn("analysis", add_score_micro(df_micro.code, f.lit('analysis'), df_micro.lo_type, df_micro.total_step, df_micro.count_received_point)) \
                .withColumn("synthesis", add_score_micro(df_micro.code, f.lit('synthesis'), df_micro.lo_type, df_micro.total_step, df_micro.count_received_point)) \
                .withColumn("evaluation", add_score_micro(df_micro.code, f.lit('evaluation'), df_micro.lo_type, df_micro.total_step, df_micro.count_received_point)) \
                .withColumn("date_id", from_unixtime(unix_timestamp(df_micro.created_at, "yyyy-MM-dd HH:mm:ss"), "yyyyMMdd"))
            df_micro.printSchema()
            df_micro.show()
            dyf_micro = DynamicFrame.fromDF(df_micro, glueContext, "dyf_micro")
            applymapping = ApplyMapping.apply(
                frame=dyf_micro,
                mappings=[("created_at", "string", "created_at", "timestamp"),
                          ("user_id", 'string', 'student_id', 'long'),
                          ("learning_object", "string", "learning_object", "string"),
                          ("date_id", "string", "date_id", "int"),
                          ("knowledge", "int", "knowledge", "int"),
                          ("comprehension", "int", "comprehension", "int"),
                          ("application", "int", "application", "int"),
                          ("analysis", "int", "analysis", "int"),
                          ("synthesis", "int", "synthesis", "int"),
                          ("evaluation", "int", "evaluation", "int")])
            resolvechoice = ResolveChoice.apply(
                frame=applymapping,
                choice="make_cols",
                transformation_ctx="resolvechoice2")
            dropnullfields = DropNullFields.apply(
                frame=resolvechoice, transformation_ctx="dropnullfields")
            dropnullfields.printSchema()
            dropnullfields.show()
            datasink5 = glueContext.write_dynamic_frame.from_jdbc_conf(
                frame=dropnullfields,
                catalog_connection="glue_redshift",
                connection_options={
                    "dbtable": "mapping_lo_student_starter_2",
                    "database": "dts_odin"
                },
                redshift_tmp_dir="s3n://dts-odin/ai_study_step/",
                transformation_ctx="datasink5")
        except Exception as e:
            print("###################### Exception ##########################")
            print(e)
        except Exception as e:
            # NOTE(review): duplicate except clause — syntactically legal but
            # unreachable (the clause above already catches Exception);
            # presumably a copy-paste remnant. Kept byte-identical here.
            print("###################### Exception ##########################")
            print(e)
        try:
            ## Handle tag == 'ait'
            # dyf_ai_study_step.show(5)
            dyf_ait = Filter.apply(frame=dyf_ai_study_step,
                                   f=lambda x: x['tag'] == 'ait')
            # dyf_ait = Filter.apply(frame=dyf_ai_study_step,
            #                        f=lambda x: x['tag'] == 'ait'
            #                                    and x['student_answer_details'] is not None
            #                                    and x['student_answer_details'] != 'null'
            #                                    and x['correct_answer'] is not None)
            df_ait = dyf_ait.toDF()
            # udf_parse_json = udf(lambda str: parse_json(str), json_schema)
            # age_list = df_ait["student_answer_details"].tolist()
            # print ('list', age_list)
            # Number of answers in the JSON details blob.
            df_ait = df_ait.withColumn(
                'len_answer', udf_get_length(df_ait["student_answer_details"]))
            # df_ait.printSchema()
            # df_ait.show()
            # Lessons completed to the final step.
            df_ait_max_step = df_ait.groupby('user_id', 'lesson_id', 'total_step').agg(
                f.max('current_step').alias("max_step"))
            df_ait_max_step = df_ait_max_step.where('total_step = max_step')
            df_ait_max_step = df_ait_max_step.withColumnRenamed('user_id', 'user_id1').withColumnRenamed('lesson_id', 'lesson_id1').withColumnRenamed(
                'total_step', 'total_step1')
            # df_ait_max_step.printSchema()
            # df_ait_max_step.show()
            # Count of fully-scored practice answers per (user, lesson).
            df_ait_received_point = df_ait.where(
                "student_answer_details IS NOT NULL AND max_point = received_point AND page_style like '%ait_practice%'")
            df_ait_received_point = df_ait_received_point.groupby('user_id', 'lesson_id').agg(
                f.count('received_point').alias("count_received_point"))
            df_ait_received_point = df_ait_received_point.withColumnRenamed('user_id', 'user_id2').withColumnRenamed(
                'lesson_id', 'lesson_id2')
            # df_ait_received_point.printSchema()
            # df_ait_received_point.show()
            # ait_pronunciation
            df_ait = df_ait.where("max_point = received_point AND page_style like '%ait_pronunciation%'")
            df_ait = df_ait.join(df_ait_received_point, (
                df_ait['user_id'] == df_ait_received_point['user_id2']) & (
                df_ait['lesson_id'] == df_ait_received_point['lesson_id2']))
            df_ait = df_ait.join(df_ait_max_step, (
                df_ait['user_id'] == df_ait_max_step['user_id1']) & (
                df_ait['lesson_id'] == df_ait_max_step['lesson_id1']))
            # print('SCHEMA:::')
            # df_ait.printSchema()
            # df_ait.show()
            # knowledge/comprehension/application share the ait score; the
            # remaining dimensions are fixed at 0 and lo_type at 1.
            df_ait = df_ait.withColumn("knowledge", add_score_ait(df_ait.total_step, df_ait.max_step, df_ait.count_received_point, df_ait.len_answer)) \
                .withColumn("comprehension", add_score_ait(df_ait.total_step, df_ait.max_step, df_ait.count_received_point, df_ait.len_answer)) \
                .withColumn("application", add_score_ait(df_ait.total_step, df_ait.max_step, df_ait.count_received_point, df_ait.len_answer)) \
                .withColumn("analysis", f.lit(0)) \
                .withColumn("synthesis", f.lit(0)) \
                .withColumn("evaluation", f.lit(0)) \
                .withColumn("lo_type", f.lit(1)) \
                .withColumn("date_id", from_unixtime(unix_timestamp(df_ait.created_at, "yyyy-MM-dd HH:mm:ss"), "yyyyMMdd"))
            # df_ait.printSchema()
            # df_ait.show()
            dyf_ait = DynamicFrame.fromDF(df_ait, glueContext, "dyf_ait")
            applymapping = ApplyMapping.apply(
                frame=dyf_ait,
                mappings=[("created_at", "string", "created_at", "timestamp"),
                          ("user_id", 'string', 'student_id', 'long'),
                          ("correct_answer", "string", "learning_object", "string"),
                          ("date_id", "string", "date_id", "int"),
                          ("knowledge", "int", "knowledge", "int"),
                          ("comprehension", "int", "comprehension", "int"),
                          ("application", "int", "application", "int"),
                          ("analysis", "int", "analysis", "int"),
                          ("synthesis", "int", "synthesis", "int"),
                          ("evaluation", "int", "evaluation", "int")])
            resolvechoice = ResolveChoice.apply(
                frame=applymapping,
                choice="make_cols",
                transformation_ctx="resolvechoice2")
            dropnullfields = DropNullFields.apply(
                frame=resolvechoice, transformation_ctx="dropnullfields")
            dropnullfields.printSchema()
            dropnullfields.show()
            datasink5 = glueContext.write_dynamic_frame.from_jdbc_conf(
                frame=dropnullfields,
                catalog_connection="glue_redshift",
                connection_options={
                    "dbtable": "mapping_lo_student_starter",
                    "database": "dts_odin"
                },
                redshift_tmp_dir="s3n://dts-odin/ai_study_step/",
                transformation_ctx="datasink5")
        except Exception as e:
            print("###################### Exception ##########################")
            print(e)
        # Advance the flag: max `_key` seen in this run.
        df_temp = dyf_ai_study_step.toDF()
        flag = df_temp.agg({"_key": "max"}).collect()[0][0]
        flag_data = [flag]
        df = spark.createDataFrame(flag_data, "long").toDF('flag')
        # Overwrite the `_key` checkpoint in S3.
        df.write.parquet("s3a://dts-odin/flag/flag_ai_study_step.parquet",
                         mode="overwrite")
def main():
    """ETL: record status 202 for contacts assigned their *first* advisor.

    Reads `tig_advisor.log_change_assignment_advisor` incrementally (S3 flag
    holds the last processed `_key`), keeps rows where the old advisor is NULL
    and a new advisor is set, and stages them into Redshift
    `mapping_changed_status_student`; a post-action backfills `user_id` from
    `user_map`. Finally the new max `_key` is written back to the flag file.
    """
    # # @params: [TempDir, JOB_NAME]
    # args = getResolvedOptions(sys.argv, ['TempDir', 'JOB_NAME'])
    # sc = SparkContext()
    glueContext = GlueContext(SparkContext.getOrCreate())
    spark = glueContext.spark_session
    # job = Job(glueContext)
    # job.init(args['JOB_NAME'], args)
    spark.conf.set("spark.sql.session.timeZone", "GMT+07:00")
    # get dynamic frame source
    ho_chi_minh_timezone = pytz.timezone('Asia/Ho_Chi_Minh')
    today = datetime.now(ho_chi_minh_timezone)
    # Seconds since epoch for today in Ho Chi Minh time (py2 `long`).
    today_second = long(today.strftime("%s"))
    print('today_id: ', today_second)
    dyf_log_change_assignment_advisor = glueContext.create_dynamic_frame.from_catalog(
        database="tig_advisor", table_name="log_change_assignment_advisor")
    # Keep only the fields we need.
    dyf_log_change_assignment_advisor = dyf_log_change_assignment_advisor.select_fields(
        [
            '_key', 'contact_id', 'advisor_id_old', 'advisor_id_new',
            'created_at'
        ])
    dyf_log_change_assignment_advisor = dyf_log_change_assignment_advisor.resolveChoice(
        specs=[('_key', 'cast:long')])
    dyf_log_change_assignment_advisor.printSchema()
    dyf_log_change_assignment_advisor.show(2)
    # check bucket is not null
    try:
        # # read the flag checkpoint from s3
        df_flag = spark.read.parquet(
            "s3a://dtsodin/flag/toa_L3150/log_change_assignment_advisor.parquet"
        )
        start_read = df_flag.collect()[0]['flag']
        print('read from index: ', start_read)
        # Compare `_key` with the flag: keep only rows with a larger key.
        dyf_log_change_assignment_advisor = Filter.apply(
            frame=dyf_log_change_assignment_advisor,
            f=lambda x: x['_key'] > start_read)
    except:
        # First run (or missing flag file): process everything.
        print('read flag file error ')
    print('the number of new contacts: ',
          dyf_log_change_assignment_advisor.count())
    if (dyf_log_change_assignment_advisor.count() > 0):
        # First-assignment rows only: no old advisor, a new advisor, and a
        # creation time present.
        dyf_log_change_assignment_advisor = Filter.apply(
            frame=dyf_log_change_assignment_advisor,
            f=lambda x: x['contact_id'] is not None and x['advisor_id_old']
            is None and x['advisor_id_new'] is not None and x['created_at'
                                                             ] is not None)
        dyf_log_change_assignment_advisor.printSchema()
        df_log_change_assignment_advisor = dyf_log_change_assignment_advisor.toDF(
        )
        # Derive the yyyyMMdd date id and the fixed status 202.
        # NOTE(review): `f.lit(df.created_at)` wraps an existing Column —
        # presumably a pass-through copy of created_at; confirm on the target
        # Spark version.
        df_log_change_assignment_advisor = df_log_change_assignment_advisor.withColumn('change_status_date_id', from_unixtime(unix_timestamp(df_log_change_assignment_advisor.created_at, "yyyy-MM-dd"), "yyyyMMdd"))\
            .withColumn('to_status_id', f.lit(202))\
            .withColumn('timestamp1', f.lit(df_log_change_assignment_advisor.created_at))
        df_log_change_assignment_advisor = df_log_change_assignment_advisor.dropDuplicates(
        )
        dyf_log_change_assignment_advisor = DynamicFrame.fromDF(
            df_log_change_assignment_advisor, glueContext,
            'dyf_log_change_assignment_advisor')
        applymapping1 = ApplyMapping.apply(
            frame=dyf_log_change_assignment_advisor,
            mappings=[("contact_id", "string", "contact_id", "string"),
                      ("change_status_date_id", "string",
                       "change_status_date_id", "long"),
                      ("timestamp1", "string", "timestamp1", "timestamp"),
                      ('to_status_id', 'int', 'to_status_id', 'long')])
        resolvechoice2 = ResolveChoice.apply(
            frame=applymapping1,
            choice="make_cols",
            transformation_ctx="resolvechoice2")
        dropnullfields_3 = DropNullFields.apply(
            frame=resolvechoice2, transformation_ctx="dropnullfields3")
        # Stage to Redshift; the post-action backfills user_id from user_map.
        datasink4 = glueContext.write_dynamic_frame.from_jdbc_conf(
            frame=dropnullfields_3,
            catalog_connection="glue_redshift",
            connection_options={
                "dbtable": "mapping_changed_status_student",
                "database": "dts_odin",
                "postactions": """UPDATE mapping_changed_status_student SET user_id = ( SELECT user_id FROM user_map WHERE source_type = 1 AND source_id = mapping_changed_status_student.contact_id LIMIT 1 ) WHERE user_id IS NULL and to_status_id=202"""
            },
            redshift_tmp_dir="s3n://dtsodin/backup/TOA/trang_thai_S0/",
            transformation_ctx="datasink4")
        # Write the flag back: take the max `_key` of this run ...
        datasourceTmp = dyf_log_change_assignment_advisor.toDF()
        flag = datasourceTmp.agg({"_key": "max"}).collect()[0][0]
        flag_data = [flag]
        df = spark.createDataFrame(flag_data, "long").toDF('flag')
        # ... and overwrite the `_key` checkpoint in S3.
        df.write.parquet(
            "s3a://dtsodin/flag/toa_L3150/log_change_assignment_advisor.parquet",
            mode="overwrite")
def main():
    """Daily roll-up of learning-object score deltas into cumulative scores.

    Reads one day (``date_read_data``) of score deltas from the Glue catalog
    table ``nvn_knowledge.mapping_lo_student_history``, aggregates them per
    (student_id, learning_object_id), merges them onto the current cumulative
    scores held in Redshift (``temp_v1_mapping_lo_student``) and writes the
    merged rows back to Redshift (``temp_thanhtv3_dyf_join_mapping``).
    """
    sc = SparkContext()
    glueContext = GlueContext(sc)
    spark = glueContext.spark_session
    spark.conf.set("spark.sql.session.timeZone", "GMT+07:00")

    score_pass = 50            # score at which a Bloom level counts as passed
    date_read_data = 20191005  # yyyymmdd day of history being processed
    print('date_read_date: ', date_read_data)

    def doAddScoreAll(plus, minus):
        # NULL-safe sum of a plus-delta and a minus-delta.
        if plus is None:
            plus = 0
        if minus is None:
            minus = 0
        return plus + minus

    addScoreAll = udf(doAddScoreAll, IntegerType())

    def addMoreSore(current_value, new_value):
        # Add a day's delta onto the running score, clamped to [0, score_pass].
        # BUG FIX: None checks now run before the >= comparison; the original
        # compared None >= int first, which only "worked" under Python 2's
        # arbitrary None ordering (same result there, TypeError on Python 3).
        if current_value is None:
            current_value = 0
        if new_value is None:
            new_value = 0
        if current_value >= score_pass:
            return score_pass
        a = current_value + new_value
        if a < 0:
            return 0
        if a >= score_pass:
            return score_pass
        return a

    addMoreSore = udf(addMoreSore, IntegerType())

    def getNewPassDate(current_pass_date, score_value_c, score_value_n):
        # Keep an already-recorded pass date; otherwise stamp today when the
        # cumulative + new score reaches the threshold.
        if current_pass_date is not None:
            return current_pass_date
        if score_value_c is None:
            score_value_c = 0
        if score_value_n is None:
            score_value_n = 0
        if score_value_c + score_value_n >= score_pass:
            return date_read_data
        return None

    getNewPassDate = udf(getNewPassDate, IntegerType())

    def getnewStudentId(student_id, student_id_new):
        # Prefer the id already stored in Redshift; fall back to today's row.
        if student_id is None:
            return student_id_new
        return student_id

    getnewStudentId = udf(getnewStudentId, LongType())

    def getnewStudentLearningObjectId(lo_id, lo_id_new):
        # Same preference rule as getnewStudentId, for learning_object_id.
        if lo_id is None:
            return lo_id_new
        return lo_id

    getnewStudentLearningObjectId = udf(getnewStudentLearningObjectId,
                                        LongType())

    def getModifiedDateId(student_id_new, learning_object_id_new,
                          modified_date_id_current):
        # Rows touched by today's deltas are stamped with today; otherwise the
        # previous modified date is kept.
        if student_id_new is not None and learning_object_id_new is not None:
            return long(date_read_data)
        # BUG FIX: the original called long(None) when the current value was
        # NULL, raising TypeError; sibling date UDFs are all NULL-safe.
        if modified_date_id_current is None:
            return None
        return long(modified_date_id_current)

    udfGetModifiedDateId = udf(getModifiedDateId, LongType())

    def getCreatedDateId(student_id_new, learning_object_id_new,
                         created_date_id_current):
        # Keep the existing created date; brand-new rows get today's date.
        if created_date_id_current is not None:
            return created_date_id_current
        if student_id_new is not None and learning_object_id_new is not None:
            return date_read_data
        return created_date_id_current

    getCreatedDateId = udf(getCreatedDateId, LongType())

    def getFirstLearningDate(student_id_new, learning_object_id_new,
                             first_learning_date_id_current):
        # Same rule as getCreatedDateId, applied to first_learning_date_id.
        if first_learning_date_id_current is not None:
            return first_learning_date_id_current
        if student_id_new is not None and learning_object_id_new is not None:
            return date_read_data
        return first_learning_date_id_current

    getFirstLearningDate = udf(getFirstLearningDate, LongType())

    dyf_mapping_lo_student_history = glueContext.create_dynamic_frame.from_catalog(
        database="nvn_knowledge",
        table_name="mapping_lo_student_history",
        additional_options={
            "path": "s3://dts-odin/nvn_knowledge/mapping_lo_student_history/*/*"
        })

    # NOTE(review): incremental reading via an S3 flag file was disabled in
    # the original job; the whole history path is read on every run.

    # Drop rows that cannot be attributed to a student / learning object.
    dyf_mapping_lo_student_history = Filter.apply(
        frame=dyf_mapping_lo_student_history,
        f=lambda x: x["student_id"] is not None and x["student_id"] != 0
        and x["learning_object_id"] is not None)

    df_mapping_lo_student_history_cache = dyf_mapping_lo_student_history.toDF()
    # BUG FIX: dropDuplicates returns a new DataFrame; the original discarded
    # the result, so duplicate history rows were double-counted downstream.
    df_mapping_lo_student_history_cache = \
        df_mapping_lo_student_history_cache.dropDuplicates(
            ['student_id', 'learning_object_id', 'source_system',
             'created_date_id'])
    df_mapping_lo_student_history_cache.cache()

    # Only process date_read_data once every source system has delivered data
    # beyond it: min over per-source max(created_date_id) must exceed it.
    df_group_source_system = df_mapping_lo_student_history_cache.groupby(
        'source_system').agg(f.max('created_date_id').alias('max_date'))
    max_allowing_date = df_group_source_system.agg({
        "max_date": "min"
    }).collect()[0][0]
    print('check date_read_data')
    print('max_allowing_date: ', max_allowing_date)
    print('date_read_data: ', date_read_data)
    if max_allowing_date <= date_read_data:
        print('stop in here::max_allowing_date <= date_read_data')
        return

    df_mapping_lo_student_history_cache = df_mapping_lo_student_history_cache.filter(
        df_mapping_lo_student_history_cache['created_date_id'] == date_read_data)
    print('df_mapping_lo_student_history_cache')
    df_mapping_lo_student_history_cache.printSchema()
    df_mapping_lo_student_history_cache.show(3)
    print('df_mapping_lo_student_history_cache::number: ',
          df_mapping_lo_student_history_cache.count())

    if df_mapping_lo_student_history_cache.count() > 0:
        # Aggregate the day's deltas per (student, learning object).
        df_mapping_lo_student_new = df_mapping_lo_student_history_cache \
            .groupby('student_id', 'learning_object_id').agg(
                addScoreAll(f.sum('knowledge_plus'), f.sum('knowledge_minus')).alias('knowledge_new'),
                addScoreAll(f.sum('comprehension_plus'), f.sum('comprehension_minus')).alias('comprehension_new'),
                addScoreAll(f.sum('application_plus'), f.sum('application_minus')).alias('application_new'),
                addScoreAll(f.sum('analysis_plus'), f.sum('analysis_minus')).alias('analysis_new'),
                addScoreAll(f.sum('synthesis_plus'), f.sum('synthesis_minus')).alias('synthesis_new'),
                addScoreAll(f.sum('evaluation_plus'), f.sum('evaluation_minus')).alias('evaluation_new'))
        df_mapping_lo_student_new = df_mapping_lo_student_new \
            .withColumnRenamed('student_id', 'student_id_new') \
            .withColumnRenamed('learning_object_id', 'learning_object_id_new')

        # Current cumulative scores kept in Redshift.
        dyf_mapping_lo_student_current = glueContext.create_dynamic_frame.from_options(
            connection_type="redshift",
            connection_options={
                "url":
                "jdbc:redshift://datashine-dev.c4wxydftpsto.ap-southeast-1.redshift.amazonaws.com:5439/dts_odin",
                "user": "******",
                "password": "******",
                "dbtable": "temp_v1_mapping_lo_student",
                "redshiftTmpDir":
                "s3n://dts-odin/temp1/thanhtv3/temp_v1_mapping_lo_student/v8"
            })
        print('mapping_lo_student_current')
        dyf_mapping_lo_student_current.printSchema()
        dyf_mapping_lo_student_current.show(3)

        dyf_mapping_lo_student_current = Filter.apply(
            frame=dyf_mapping_lo_student_current,
            f=lambda x: x["student_id"] is not None and x["student_id"] != 0)

        dy_mapping_lo_student_current = dyf_mapping_lo_student_current.toDF()
        dy_mapping_lo_student_current.cache()

        # LEFT join: keep every delta row even when no cumulative row exists
        # yet (new student/LO pairs); current-side columns are then NULL and
        # the NULL-safe UDFs below initialise them.
        join_mapping = df_mapping_lo_student_new.join(
            dy_mapping_lo_student_current,
            (dy_mapping_lo_student_current['student_id'] == df_mapping_lo_student_new['student_id_new'])
            & (dy_mapping_lo_student_current['learning_object_id'] == df_mapping_lo_student_new['learning_object_id_new']),
            'left')
        print('join_new_knowledge_left')
        join_mapping.printSchema()
        print('thanhtv3-------------------------------')

        # Merge deltas onto current values; *_t columns are the merged output.
        join_mapping = join_mapping \
            .withColumn('knowledge_t', addMoreSore(join_mapping.knowledge, join_mapping.knowledge_new)) \
            .withColumn('comprehension_t', addMoreSore(join_mapping.comprehension, join_mapping.comprehension_new)) \
            .withColumn('application_t', addMoreSore(join_mapping.application, join_mapping.application_new)) \
            .withColumn('analysis_t', addMoreSore(join_mapping.analysis, join_mapping.analysis_new)) \
            .withColumn('synthesis_t', addMoreSore(join_mapping.synthesis, join_mapping.synthesis_new)) \
            .withColumn('evaluation_t', addMoreSore(join_mapping.evaluation, join_mapping.evaluation_new)) \
            .withColumn('student_id_t', getnewStudentId(join_mapping.student_id, join_mapping.student_id_new)) \
            .withColumn('learning_object_id_t', getnewStudentLearningObjectId(join_mapping.learning_object_id, join_mapping.learning_object_id_new)) \
            .withColumn('knowledge_pass_date_id', getNewPassDate(join_mapping.knowledge_pass_date_id, join_mapping.knowledge, join_mapping.knowledge_new)) \
            .withColumn('comprehension_pass_date_id', getNewPassDate(join_mapping.comprehension_pass_date_id, join_mapping.comprehension, join_mapping.comprehension_new)) \
            .withColumn('application_pass_date_id', getNewPassDate(join_mapping.application_pass_date_id, join_mapping.application, join_mapping.application_new)) \
            .withColumn('analysis_pass_date_id', getNewPassDate(join_mapping.analysis_pass_date_id, join_mapping.analysis, join_mapping.analysis_new)) \
            .withColumn('synthesis_pass_date_id', getNewPassDate(join_mapping.synthesis_pass_date_id, join_mapping.synthesis, join_mapping.synthesis_new)) \
            .withColumn('evaluation_pass_date_id', getNewPassDate(join_mapping.evaluation_pass_date_id, join_mapping.evaluation, join_mapping.evaluation_new)) \
            .withColumn('modified_date_id_t', udfGetModifiedDateId(join_mapping.student_id_new, join_mapping.learning_object_id_new, join_mapping.modified_date_id)) \
            .withColumn('created_date_id_t', getCreatedDateId(join_mapping.student_id_new, join_mapping.learning_object_id_new, join_mapping.created_date_id)) \
            .withColumn('first_learning_date_id_t', getFirstLearningDate(join_mapping.student_id_new, join_mapping.learning_object_id_new, join_mapping.first_learning_date_id))

        join_mapping = join_mapping.select(
            'id', 'user_id', 'student_id_t', 'learning_object_id_t',
            'knowledge_t', 'knowledge_pass_date_id', 'comprehension_t',
            'comprehension_pass_date_id', 'application_t',
            'application_pass_date_id', 'analysis_t', 'analysis_pass_date_id',
            'synthesis_t', 'synthesis_pass_date_id', 'evaluation_t',
            'evaluation_pass_date_id', 'modified_date_id_t',
            'created_date_id_t', 'first_learning_date_id_t')
        print('join_mapping')
        join_mapping.printSchema()
        join_mapping.show(1)

        dyf_join_mapping = DynamicFrame.fromDF(join_mapping, glueContext,
                                               'dyf_join_mapping')
        dyf_join_mapping = dyf_join_mapping.resolveChoice(
            specs=[('user_id', 'cast:long')])

        # Rename the merged *_t columns back to their target names and cast
        # everything to long for Redshift.
        apply_ouput = ApplyMapping.apply(
            frame=dyf_join_mapping,
            mappings=[
                ("user_id", "long", "user_id", "long"),
                ("student_id_t", "long", "student_id", "long"),
                ("learning_object_id_t", "long", "learning_object_id", "long"),
                ("knowledge_t", "int", "knowledge", "long"),
                ("comprehension_t", "int", "comprehension", "long"),
                ("application_t", "int", "application", "long"),
                ("analysis_t", "int", "analysis", "long"),
                ("synthesis_t", "int", "synthesis", "long"),
                ("evaluation_t", "int", "evaluation", "long"),
                ("knowledge_pass_date_id", "int", "knowledge_pass_date_id", "long"),
                ("comprehension_pass_date_id", "int", "comprehension_pass_date_id", "long"),
                ("application_pass_date_id", "int", "application_pass_date_id", "long"),
                ("analysis_pass_date_id", "int", "analysis_pass_date_id", "long"),
                ("synthesis_pass_date_id", "int", "synthesis_pass_date_id", "long"),
                ("evaluation_pass_date_id", "int", "evaluation_pass_date_id", "long"),
                ("modified_date_id_t", "long", "modified_date_id", "long"),
                ("created_date_id_t", "long", "created_date_id", "long"),
                ("first_learning_date_id_t", "long", "first_learning_date_id", "long")
            ])

        dfy_output = ResolveChoice.apply(
            frame=apply_ouput,
            choice="make_cols",
            transformation_ctx="resolvechoice2")

        datasink4 = glueContext.write_dynamic_frame.from_jdbc_conf(
            frame=dfy_output,
            catalog_connection="glue_redshift",
            connection_options={
                "dbtable": "temp_thanhtv3_dyf_join_mapping",
                "database": "dts_odin"
            },
            redshift_tmp_dir=
            "s3n://dts-odin/temp/thanhtv3/nvn/knowledge/mapping_lo_student/v3",
            transformation_ctx="datasink4")

        # TODO(review): persisting the next read-date flag to S3 is still
        # disabled, as in the original job.

        dy_mapping_lo_student_current.unpersist()

    df_mapping_lo_student_history_cache.unpersist()