def main():
    """Glue job: build behavior_id=3 ("student bought package / deposited money")
    records from tig_market invoices and write them to S3 as parquet.

    Flow: read source tables from the Glue catalog, keep only invoice rows whose
    `_key` is greater than the flag stored on S3 (incremental load), join invoices
    with student contacts / level-study logs / package logs, derive the behavior
    columns, write two parquet datasets, then persist the new max `_key` flag.
    """

    def checknull(level_modified, level_study):
        # Prefer the modified level when present; fall back to the level
        # recorded on the student contact.
        if level_modified is not None:
            return level_modified
        return level_study

    checknull_ = udf(checknull, StringType())

    def concaText(student_behavior_date, behavior_id, student_id, contact_id,
                  package_code, package_endtime, package_starttime,
                  student_level_code, student_package_status_code,
                  transformed_at):
        # Concatenate every non-null field into one string; the md5 of this
        # string is used below as the deterministic student_behavior_id.
        text_concat = ""
        if student_behavior_date is not None:
            text_concat += str(student_behavior_date)
        if behavior_id is not None:
            text_concat += str(behavior_id)
        if student_id is not None:
            text_concat += str(student_id)
        if contact_id is not None:
            text_concat += str(contact_id)
        if package_code is not None:
            text_concat += str(package_code)
        if package_endtime is not None:
            text_concat += str(package_endtime)
        if package_starttime is not None:
            text_concat += str(package_starttime)
        if student_level_code is not None:
            text_concat += str(student_level_code)
        if student_package_status_code is not None:
            text_concat += str(student_package_status_code)
        if transformed_at is not None:
            text_concat += str(transformed_at)
        return text_concat

    concaText = udf(concaText, StringType())

    glueContext = GlueContext(SparkContext.getOrCreate())
    spark = glueContext.spark_session

    # ---- source tables -------------------------------------------------
    dyf_student_contact = glueContext.create_dynamic_frame.from_catalog(
        database="tig_advisor", table_name="student_contact")
    dyf_student_contact = dyf_student_contact.select_fields(
        ['student_id', 'contact_id', 'level_study'])

    dyf_log_student_level_study = glueContext.create_dynamic_frame.from_catalog(
        database="tig_advisor", table_name="log_student_level_study")
    dyf_log_student_level_study = dyf_log_student_level_study.select_fields([
        'contact_id', 'level_current', 'level_modified', 'package_code',
        'time_created'
    ])
    dyf_log_student_level_study = dyf_log_student_level_study.resolveChoice(
        specs=[('_key', 'cast:int')])

    dyf_tpe_invoice_product = glueContext.create_dynamic_frame.from_catalog(
        database="tig_market", table_name="tpe_invoice_product")
    dyf_tpe_invoice_product = dyf_tpe_invoice_product.select_fields([
        '_key', 'timecreated', 'user_id', 'buyer_id', 'invoice_packages_price',
        'invoice_price', 'invoice_code'
    ])
    dyf_tpe_invoice_product = dyf_tpe_invoice_product.resolveChoice(
        specs=[('_key', 'cast:long')])

    dyf_tpe_invoice_product_details = glueContext.create_dynamic_frame.from_catalog(
        database="tig_market", table_name="tpe_invoice_product_details")
    dyf_tpe_invoice_product_details = dyf_tpe_invoice_product_details.select_fields(
        ['cat_code', 'package_time', 'invoice_code'])

    dyf_student_package = glueContext.create_dynamic_frame.from_catalog(
        database="tig_advisor", table_name="log_student_package")
    # pick only the fields we need; rename to avoid a column clash in the join
    dyf_student_package = dyf_student_package.select_fields(
        ['student_id', 'start_time', 'end_time', 'package_code']).rename_field(
            'student_id', 'student_id1')
    dyf_student_package.printSchema()
    dyf_student_package.show(2)

    # ---- incremental-load flag (max _key already processed) ------------
    try:
        df_flag = spark.read.parquet(
            "s3a://dtsodin/flag/student_behavior/flag_hoc_vien_duoc_mua_goi_nap_tien.parquet"
        )
        start_read = df_flag.collect()[0]['flag']
        print('read from index: ', start_read)
        # keep only datasource rows with _key greater than the stored flag
        dyf_tpe_invoice_product = Filter.apply(
            frame=dyf_tpe_invoice_product,
            f=lambda x: x['_key'] > start_read)
    except Exception:  # was a bare except; first run has no flag file yet
        print('read flag file error ')

    print('the number of new contacts: ', dyf_tpe_invoice_product.count())
    if dyf_tpe_invoice_product.count() > 0:
        df_log_student_level_study = dyf_log_student_level_study.toDF()
        # latest time_created per (contact, level, package) combination
        df_log_student_level_study = df_log_student_level_study.groupby(
            'contact_id', 'level_current', 'level_modified',
            'package_code').agg(f.max('time_created').alias('time_created'))

        dyf_join0 = Join.apply(dyf_tpe_invoice_product,
                               dyf_tpe_invoice_product_details,
                               'invoice_code', 'invoice_code')
        print("@@@@@@@@@@@@")
        dyf_join0.printSchema()
        dyf_join0.show(2)

        dyf_log_student_level_study = DynamicFrame.fromDF(
            df_log_student_level_study, glueContext,
            "dyf_log_student_level_study")

        dyf_join1 = Join.apply(dyf_student_contact, dyf_join0,
                               "contact_id", "user_id")
        dyf_join = Join.apply(dyf_join1, dyf_log_student_level_study,
                              "user_id", "contact_id")
        print("@@@@@@@@@@@@")
        dyf_join.printSchema()
        dyf_join.show(2)

        # keep level-log entries recorded before the invoice was created
        dyf_join = Filter.apply(
            frame=dyf_join,
            f=lambda x: x['time_created'] <= x['timecreated'])

        dyf_data_join3 = Join.apply(dyf_join, dyf_student_package,
                                    "student_id", "student_id1")
        dyf_data_join3 = Filter.apply(
            frame=dyf_data_join3,
            f=lambda x: x['package_code'] == x['cat_code'])

        # ---- derive behavior columns -----------------------------------
        df_data_join3 = dyf_data_join3.toDF()
        df_data_join3 = df_data_join3.withColumn(
            "student_level_code",
            checknull_(df_data_join3.level_modified,
                       df_data_join3.level_study))\
            .withColumn("behavior_id", f.lit(3))\
            .withColumn("student_package_status_code", f.lit("DEACTIVED"))\
            .withColumn("student_behavior_date",
                        from_unixtime(df_data_join3.timecreated))\
            .withColumn("package_starttime", df_data_join3['start_time'])\
            .withColumn("package_endtime", df_data_join3['end_time'])\
            .withColumn("transformed_at", f.lit(None))
        df_data_join3 = df_data_join3.withColumn(
            'student_behavior_id',
            f.md5(
                concaText(df_data_join3.student_behavior_date,
                          df_data_join3.behavior_id,
                          df_data_join3.student_id,
                          df_data_join3.contact_id,
                          df_data_join3.package_code,
                          df_data_join3.package_endtime,
                          df_data_join3.package_starttime,
                          df_data_join3.student_level_code,
                          df_data_join3.student_package_status_code,
                          df_data_join3.transformed_at)))
        df_data_join3 = df_data_join3.dropDuplicates()

        dyf_data_join3 = DynamicFrame.fromDF(df_data_join3, glueContext,
                                             "dyf_data_join3")
        dyf_data_join3 = dyf_data_join3.resolveChoice(
            specs=[('behavior_id', 'cast:int'),
                   ('student_behavior_date', 'cast:timestamp')])
        dyf_data_join3.printSchema()
        dyf_data_join3.show(2)

        # ---- dataset 1: student_behavior -------------------------------
        applymapping = ApplyMapping.apply(
            frame=dyf_data_join3,
            mappings=[("student_behavior_id", "string",
                       "student_behavior_id", "string"),
                      ("contact_id", "string", "contact_id", "string"),
                      ("student_behavior_date", "timestamp",
                       "student_behavior_date", "long"),
                      ("student_id", "string", "student_id", "long"),
                      ("cat_code", "string", "package_code", "string"),
                      ("package_starttime", "int", "package_starttime", "long"),
                      ("package_endtime", "int", "package_endtime", "long"),
                      ("student_package_status_code", "string",
                       "student_status_code", "string"),
                      ("behavior_id", "int", "behavior_id", "long"),
                      ("student_level_code", "string", "student_level_code",
                       "string")])
        resolvechoice = ResolveChoice.apply(
            frame=applymapping, choice="make_cols",
            transformation_ctx="resolvechoice")
        dropnullfields = DropNullFields.apply(
            frame=resolvechoice, transformation_ctx="dropnullfields")
        print(dropnullfields.count())
        dropnullfields.toDF().show()

        glueContext.write_dynamic_frame.from_options(
            frame=dropnullfields,
            connection_type="s3",
            connection_options={
                "path": "s3://dtsodin/student_behavior/student_behavior",
                "partitionKeys": ["behavior_id"]
            },
            format="parquet")

        # ---- dataset 2: student_general_behavior (measures) ------------
        applymapping1 = ApplyMapping.apply(
            frame=dyf_data_join3,
            mappings=[("invoice_packages_price", "int", "measure1", "long"),
                      ("behavior_id", "int", "behavior_id", "long"),
                      # BUG FIX: target column was "measure2 " (trailing space)
                      ("invoice_price", "int", "measure2", "long")])
        resolvechoice1 = ResolveChoice.apply(
            frame=applymapping1, choice="make_cols",
            transformation_ctx="resolvechoice1")
        # BUG FIX: previously built from `resolvechoice` (dataset 1), which
        # silently discarded the measure1/measure2 pipeline above.
        dropnullfields1 = DropNullFields.apply(
            frame=resolvechoice1, transformation_ctx="dropnullfields1")
        print(dropnullfields1.count())
        dropnullfields1.toDF().show()

        # BUG FIX: previously wrote `dropnullfields` (dataset 1) to this path
        # as well, duplicating dataset 1 and never writing dataset 2.
        glueContext.write_dynamic_frame.from_options(
            frame=dropnullfields1,
            connection_type="s3",
            connection_options={
                "path": "s3://dtsodin/student_behavior/student_general_behavior",
                "partitionKeys": ["behavior_id"]
            },
            format="parquet")

        # ---- persist new max _key so the next run is incremental -------
        df_tpe_invoice_product = dyf_tpe_invoice_product.toDF()
        flag = df_tpe_invoice_product.agg({"_key": "max"}).collect()[0][0]
        flag_data = [flag]
        df = spark.createDataFrame(flag_data, "long").toDF('flag')
        # overwrite the _key flag on S3
        df.write.parquet(
            "s3a://dtsodin/flag/student_behavior/flag_hoc_vien_duoc_mua_goi_nap_tien.parquet",
            mode="overwrite")
def main():
    """Glue job: map students to a status change (to_status_id=201).

    Finds (contact_id, student_id) pairs appearing more than once in
    student_contact, joins active POSS payment records (ghinhan_hp) through
    courses (khoa_hoc) and students (hoc_vien) back to those contacts, derives
    change_status_date_id from the payment date, and loads the result into the
    Redshift table mapping_changed_status_student_v1.
    """
    glueContext = GlueContext(SparkContext.getOrCreate())
    spark = glueContext.spark_session

    # ---- student_contact: contact <-> student mapping ------------------
    dyf_student_contact = glueContext.create_dynamic_frame.from_catalog(
        database="tig_advisor", table_name="student_contact")
    dyf_student_contact = dyf_student_contact.select_fields(
        ['contact_id', 'student_id'])

    # ---- ghinhan_hp: tuition payments, active rows only ----------------
    dyf_ghinhan_hp = glueContext.create_dynamic_frame.from_catalog(
        database="poss", table_name="ghinhan_hp")
    dyf_ghinhan_hp = dyf_ghinhan_hp.select_fields(
        ['ngay_thanhtoan', 'khoa_hoc_makh', 'trang_thai']).rename_field(
            'trang_thai', 'trang_thai_gnhp')
    dyf_ghinhan_hp = Filter.apply(frame=dyf_ghinhan_hp,
                                  f=lambda x: x["trang_thai_gnhp"] == True)

    # ---- khoa_hoc: courses, active rows only ---------------------------
    dyf_khoa_hoc = glueContext.create_dynamic_frame.from_catalog(
        database="poss", table_name="khoa_hoc")
    dyf_khoa_hoc = dyf_khoa_hoc.select_fields(['makh', 'mahv',
                                               'trang_thai']).rename_field(
        'trang_thai', 'trang_thai_kh')
    dyf_khoa_hoc = Filter.apply(frame=dyf_khoa_hoc,
                                f=lambda x: x["trang_thai_kh"] == True)

    # ---- hoc_vien: students, active rows only --------------------------
    dyf_hoc_vien = glueContext.create_dynamic_frame.from_catalog(
        database="poss", table_name="hoc_vien")
    dyf_hoc_vien = dyf_hoc_vien.select_fields([
        'mahv', 'crm_id', 'trang_thai'
    ]).rename_field('mahv', 'mahv_hv').rename_field('trang_thai',
                                                    'trang_thai_hv')
    dyf_hoc_vien = Filter.apply(frame=dyf_hoc_vien,
                                f=lambda x: x["trang_thai_hv"] == True)

    df_student_contact_1 = dyf_student_contact.toDF()
    # NOTE(review): drop_duplicates() returns a new DataFrame; the result is
    # discarded here so this line is a no-op. Left unchanged on purpose:
    # deduplicating before the groupby below would make every
    # (contact_id, student_id) group count exactly 1 and the `> 1` filter
    # would then keep nothing — confirm intent before "fixing".
    df_student_contact_1.drop_duplicates()
    # count occurrences of each (contact_id, student_id) pair
    df_student_contact = df_student_contact_1.groupby(
        'contact_id', 'student_id').agg(
        f.count('contact_id').alias("contact_id_after_count"))
    dyf_student_contact = DynamicFrame.fromDF(df_student_contact, glueContext,
                                              "dyf_student_contact")
    # keep only pairs that appear more than once
    dyf_student_contact = Filter.apply(
        frame=dyf_student_contact,
        f=lambda x: x["contact_id_after_count"] > 1)
    df_student_contact = dyf_student_contact.toDF()
    # NOTE(review): result discarded — no-op (see note above).
    df_student_contact.drop_duplicates()
    df_student_contact.cache()
    df_student_contact.printSchema()
    df_student_contact.show(2)
    print('df_student_contact count::', df_student_contact.count())

    df_ghinhan_hp = dyf_ghinhan_hp.toDF()
    df_khoa_hoc = dyf_khoa_hoc.toDF()
    df_hoc_vien = dyf_hoc_vien.toDF()

    # ---- joins: payment -> course -> student -> contact ----------------
    df_join = df_ghinhan_hp.join(
        df_khoa_hoc, df_ghinhan_hp.khoa_hoc_makh == df_khoa_hoc.makh)
    df_join.printSchema()
    print('df_join count::', df_join.count())
    df_join1 = df_join.join(df_hoc_vien, df_join.mahv == df_hoc_vien.mahv_hv)
    df_join1.printSchema()
    print('df_join1 count::', df_join1.count())
    df_join2 = df_join1.join(df_student_contact,
                             df_join1.crm_id == df_student_contact.contact_id)
    # payment date -> yyyyMMdd date id
    df_join2 = df_join2.withColumn(
        'change_status_date_id',
        from_unixtime(unix_timestamp(df_join2.ngay_thanhtoan, "yyyy-MM-dd"),
                      "yyyyMMdd"))
    # NOTE(review): result discarded — no-op; harmless here because df_result
    # below is deduplicated with an assigned drop_duplicates().
    df_join2.drop_duplicates()
    df_join2.printSchema()
    df_join2.show(2)
    print('df_join2 count::', df_join2.count())

    # ---- choose output fields ------------------------------------------
    to_status_id = 201L  # Python 2 long literal: target status for all rows
    df_result = df_join2.select('student_id', 'change_status_date_id',
                                f.lit(to_status_id).alias('to_status_id'),
                                'contact_id')
    df_result.printSchema()
    df_result.show(3)
    df_result = df_result.drop_duplicates()
    df_result.cache()
    print('count df_result::', df_result.count())
    dyf_result = DynamicFrame.fromDF(df_result, glueContext, "dyf_result")
    # drop rows with any null key column
    dyf_result = Filter.apply(
        frame=dyf_result,
        f=lambda x: x["student_id"] is not None and x[
            "change_status_date_id"] is not None and x[
            "to_status_id"] is not None and x["contact_id"] is not None)

    apply_output = ApplyMapping.apply(
        frame=dyf_result,
        mappings=[
            ("student_id", "string", "student_id", "long"),
            ("change_status_date_id", "string", "change_status_date_id",
             "long"),
            ("to_status_id", "long", "to_status_id", "long"),
            ("contact_id", "string", "contact_id", "string"),
        ])
    df_apply_output = apply_output.toDF()
    # NOTE(review): result discarded — no-op; upstream df_result is already
    # deduplicated, so this has no effect either way.
    df_apply_output.drop_duplicates()
    print('df_apply_output.count', df_apply_output.count())
    dyf_apply_output = DynamicFrame.fromDF(df_apply_output, glueContext,
                                           "dyf_apply_output")

    resolve_choice = ResolveChoice.apply(frame=dyf_apply_output,
                                         choice="make_cols",
                                         transformation_ctx="resolvechoice2")
    dropnullfields = DropNullFields.apply(frame=resolve_choice,
                                          transformation_ctx="dropnullfields")

    # ---- load into Redshift --------------------------------------------
    datasink4 = glueContext.write_dynamic_frame.from_jdbc_conf(
        frame=dropnullfields,
        catalog_connection="glue_redshift",
        connection_options={
            "dbtable": "mapping_changed_status_student_v1",
            "database": "dts_odin"
        },
        redshift_tmp_dir="s3n://datashine-dwh/temp1/",
        transformation_ctx="datasink4")

    df_result.unpersist()
    df_student_contact.unpersist()
    print(
        '------------------------>___complete__________------------------------------>'
    )
def main(): glueContext = GlueContext(SparkContext.getOrCreate()) spark = glueContext.spark_session spark.conf.set("spark.sql.session.timeZone", "GMT+07:00") ho_chi_minh_timezone = pytz.timezone('Asia/Ho_Chi_Minh') today = datetime.now(ho_chi_minh_timezone) today_second = long(today.strftime("%s")) print('today_id: ', today_second) # f.lit(today_second).cast('long').alias('transformed_at') rangeid = [14, 15, 16, 17, 18] student_id_unavailable = '0' package_endtime_unavailable = 99999999999L package_starttime_unavailable = 0L student_level_code_unavailable = 'UNAVAILABLE' student_status_code_unavailable = 'UNAVAILABLE' package_endtime = 'package_endtime' package_starttime = 'package_starttime' student_level_code = 'student_level_code' student_status_code = 'student_status_code' def doCheckModified(val1, val2): if val1 is not None: return val1 return val2 check_modified_null = udf(doCheckModified, StringType()) def doCheckStudentID(code): code = str(code) if code is None: return student_id_unavailable return code check_student_id = udf(doCheckStudentID, StringType()) def doCheckData(code, key): key = str(key) if code is None: if key == package_endtime: return package_endtime_unavailable else: return package_starttime_unavailable return code check_data = udf(doCheckData, IntegerType()) def doCheckDataNull(code, key): code = str(code) key = str(key) if (code is None) & (key == student_level_code): return student_level_code_unavailable if (code is None) & (key == student_status_code): return student_status_code_unavailable return code check_data_null = udf(doCheckDataNull, StringType()) def concaText(student_behavior_date, behavior_id, student_id, contact_id, package_code, package_endtime,package_starttime, student_level_code, student_status_code, transformed_at): text_concat = "" if student_behavior_date is not None: text_concat += str(student_behavior_date) if behavior_id is not None: text_concat += str(behavior_id) if student_id is not None: text_concat += 
str(student_id) if contact_id is not None: text_concat += str(contact_id) if package_code is not None: text_concat += str(package_code) if package_endtime is not None: text_concat += str(package_endtime) if package_starttime is not None: text_concat += str(package_starttime) if student_level_code is not None: text_concat += str(student_level_code) if student_status_code is not None: text_concat += str(student_status_code) if transformed_at is not None: text_concat += str(transformed_at) return text_concat concaText = f.udf(concaText, StringType()) dyf_tblreply_rating = glueContext.create_dynamic_frame.from_catalog( database="native_smile", table_name="tblreply_rating" ) dyf_tblreply_rating = dyf_tblreply_rating.select_fields( ['_key', 'userid', 'ratingid', 'time_rating'] ) dyf_tblreply_rating = dyf_tblreply_rating.resolveChoice(specs=[('_key', 'cast:long')]) # try: # df_flag_1 = spark.read.parquet("s3://dtsodin/flag/flag_hoc_vien_rating_native_smile_h2472.parquet") # max_key = df_flag_1.collect()[0]['flag'] # print("max_key: ", max_key) # # Chi lay nhung ban ghi lon hon max_key da luu, ko load full # dyf_tblreply_rating = Filter.apply(frame=dyf_tblreply_rating, f=lambda x: x["_key"] > max_key) # except: # print('read flag file error ') if dyf_tblreply_rating.count()> 0: dyf_mdl_user = glueContext.create_dynamic_frame.from_catalog( database="topicalms", table_name="mdl_user" ) dyf_mdl_user = dyf_mdl_user.select_fields( ['id', 'email'] ) dyf_tma_dm_tu_dien = glueContext.create_dynamic_frame.from_catalog( database="native_smile", table_name="tma_dm_tu_dien" ) dyf_tma_dm_tu_dien = dyf_tma_dm_tu_dien.select_fields( ['id', 'ma_tu_dien', 'id_dm_loai_tu_dien'] ) dyf_tma_dm_tu_dien = Filter.apply(frame=dyf_tma_dm_tu_dien, f=lambda x: x['id_dm_loai_tu_dien'] == 7 and x['id'] in rangeid) ################ join_rating_user = Join.apply(dyf_tblreply_rating, dyf_mdl_user, 'userid', 'id') join_rating_user01 = Join.apply(join_rating_user, dyf_tma_dm_tu_dien, 'ratingid', 'id') 
################ dyf_student_contact = glueContext.create_dynamic_frame.from_catalog( database="tig_advisor", table_name="student_contact" ) dyf_student_contact = dyf_student_contact.select_fields( ['contact_id', 'student_id', 'level_study', 'time_lms_created'])\ dyf_log_student_status = glueContext.create_dynamic_frame.from_catalog( database="do_tig_advisor", table_name="log_student_status" ) dyf_log_student_status = dyf_log_student_status.select_fields( ['contact_id', 'status_code', 'last_status_code', 'start_date', 'end_date']) \ .rename_field('contact_id', 'contact_id_status') dyf_log_student_package = glueContext.create_dynamic_frame.from_catalog( database="do_tig_advisor", table_name="log_student_package" ) dyf_log_student_package = dyf_log_student_package.select_fields( ['student_id', 'package_code', 'start_time', 'end_time']) \ .rename_field('student_id', 'student_id_package') \ .rename_field('start_time', 'start_time_package') \ .rename_field('end_time', 'end_time_package') dyf_log_student_level_study = glueContext.create_dynamic_frame.from_catalog( database="tig_advisor", table_name="log_student_level_study" ) dyf_log_student_level_study = dyf_log_student_level_study.select_fields( ['contact_id', 'level_current', 'level_modified', 'package_code', 'time_created']) \ .rename_field('contact_id', 'contact_id_level') join_rating_user01.printSchema() print join_rating_user01.count() print join_rating_user01.count() try: df_rating_class = join_rating_user01.toDF() df_student_contact = dyf_student_contact.toDF() df_log_student_level_study = dyf_log_student_level_study.toDF() df_temp = dyf_log_student_level_study.toDF() df_log_student_status = dyf_log_student_status.toDF() df_log_student_package = dyf_log_student_package.toDF() df_temp = df_temp.groupby('contact_id_level', 'level_current', 'package_code').agg( f.max("time_created").alias("time_created_max")) df_temp = df_temp.withColumnRenamed('contact_id_level', 'contact_id_join') \ 
.withColumnRenamed('package_code', 'package_code_join') df_join0 = df_temp.join(df_log_student_level_study, (df_temp['contact_id_join'] == df_log_student_level_study['contact_id_level']) & (df_temp['package_code_join'] == df_log_student_level_study['package_code']) & (df_temp['time_created_max'] == df_log_student_level_study['time_created']), "left") print "=========== . ===========" df_join0.printSchema() dyf_join = DynamicFrame.fromDF(df_join0, glueContext, "dyf_join") dyf_join = dyf_join.select_fields( ['contact_id_level', 'level_current', 'level_modified', 'package_code', 'time_created']) df_join = dyf_join.toDF() df_join.printSchema() df_join.show(10) print "########## . ###########" df_join01 = df_rating_class.join(df_student_contact, (df_rating_class['userid'] == df_student_contact['student_id'])) df_join01.printSchema() df_join02 = df_join01.join(df_join, (df_join['contact_id_level'] == df_join01['contact_id']) & (df_join['time_created'] <= df_join01['time_lms_created']), "left") df_join02 = df_join02\ .withColumn("level_modified_new", check_modified_null(df_join02.level_modified, df_join02.level_study)) df_join02.printSchema() df_join02.show(10) dyf_join = DynamicFrame.fromDF(df_join02, glueContext, "dyf_join") dyf_join = dyf_join.select_fields(['time_rating', 'contact_id', 'student_id', 'level_study', 'time_lms_created', 'ratingid', 'level_current', 'level_modified', 'package_code', 'time_created', 'level_modified_new']) # dyf_join_temp = Filter.apply(frame=dyf_join, # f=lambda x: x["level_modified_new"] is None) # print "count: ", dyf_join_temp.count() ############ df_join02 = dyf_join.toDF() df_join03 = df_join02.join(df_log_student_status, (df_log_student_status['contact_id_status'] == df_join02['contact_id']) & (df_log_student_status['start_date'] <= df_join02['time_rating']) & (df_log_student_status['end_date'] >= df_join02['time_rating']), "left") df_join04 = df_join03.join(df_log_student_package, (df_log_student_package['student_id_package'] == 
df_join03['student_id']) & (df_log_student_package['start_time_package'] <= df_join03['time_rating']) & (df_log_student_package['end_time_package'] >= df_join03['time_rating']), "left") dyf_join = DynamicFrame.fromDF(df_join04, glueContext, "dyf_join") dyf_join = Filter.apply(frame=dyf_join, f=lambda x: x["start_time_package"] is not None and x["end_time_package"] is not None) print "dyf_join: ", dyf_join.count() dyf_join.printSchema() dyf_join.show(10) dyf_join = dyf_join.select_fields( ['time_rating', 'student_id', 'contact_id', 'package_code', 'ratingid', 'start_time_package', 'end_time_package', 'level_modified_new', 'status_code'] ) print "TTTTTTTTTTTTTT" # dyf_join01 = Filter.apply(frame=dyf_join, # f=lambda x: x["level_current"] is not None) # # print "Check null ", dyf_join01.count() df_join04 = dyf_join.toDF() df_join04 = df_join04.withColumn("transformed_at", unix_timestamp(f.current_timestamp())) \ .withColumn("student_id", check_student_id(df_join04.student_id)) \ .withColumn("package_endtime", check_data(df_join04.end_time_package, f.lit(package_endtime))) \ .withColumn("package_starttime", check_data(df_join04.start_time_package, f.lit(package_starttime))) \ .withColumn("student_level_code", check_data_null(df_join04.level_modified_new, f.lit(student_level_code))) \ .withColumn("student_status_code", check_data_null(df_join04.status_code, f.lit(student_status_code))) \ .withColumn("behavior_id", f.lit(27)) \ .withColumn("rating_type", f.lit("rating_native_smile_h2472")) \ .withColumn("comment", f.lit("")) \ .withColumn("rating_about", f.lit(None)) \ .withColumn("number_rating", f.lit(1)) \ .withColumn("value_rating", (df_join04.ratingid - f.lit(13))) df_join04.printSchema() print df_join04.count() df_join04.show(10) dyf_join = DynamicFrame.fromDF(df_join04, glueContext, "dyf_join") # dyf_join.printSchema() # print dyf_join.count() # dyf_join.show(10) dyf_rating_cara = ApplyMapping.apply(frame=dyf_join, mappings=[("time_rating", "int", 
"student_behavior_date", "long"), ("behavior_id", "int", "behavior_id", "long"), ("student_id", "string", "student_id", "long"), ("contact_id", "string", "contact_id", "string"), ("package_code", "string", "package_code", "string"), ("package_endtime", "int", "package_endtime", "long"), ("package_starttime", "int", "package_starttime", "long"), ("student_level_code", "string", "student_level_code", "string"), ("student_status_code", "string", "student_status_code", "string"), ("transformed_at", "long", "transformed_at", "long"), ("rating_type", "string", "rating_type", "string"), ("comment", "string", "comment", "string"), ("rating_about", "int", "rating_about", "long"), ("number_rating", "int", "number_rating", "long"), ("value_rating", "int", "value_rating", "long")]) df_rating_cara = dyf_rating_cara.toDF() df_rating_cara2 = df_rating_cara.withColumn('student_behavior_id', f.md5(concaText( df_rating_cara.student_behavior_date, df_rating_cara.behavior_id, df_rating_cara.student_id, df_rating_cara.contact_id, df_rating_cara.package_code, df_rating_cara.package_endtime, df_rating_cara.package_starttime, df_rating_cara.student_level_code, df_rating_cara.student_status_code, df_rating_cara.transformed_at))) dyf_rating_cara = DynamicFrame.fromDF(df_rating_cara2, glueContext, 'dyf_rating_cara') dyf_rating_cara = Filter.apply(frame=dyf_rating_cara, f=lambda x: x["contact_id"] is not None and x["contact_id"] != '') applymapping0 = ApplyMapping.apply(frame=dyf_rating_cara, mappings=[ ("student_behavior_id", "string", "student_behavior_id", "string"), ("rating_type", "string", "rating_type", "string"), ("comment", "string", "comment", "string"), ("rating_about", "long", "rating_about", "long"), ("number_rating", "long", "number_rating", "long"), ("value_rating", "long", "value_rating", "long"), ("behavior_id", "long", "behavior_id", "long")]) applymapping0.printSchema() print applymapping0.count() applymapping0.show(5) resolvechoice0 = 
ResolveChoice.apply(frame=applymapping0, choice="make_cols", transformation_ctx="resolvechoice1") dropnullfields0 = DropNullFields.apply(frame=resolvechoice0, transformation_ctx="dropnullfields0") print resolvechoice0.count() # resolvechoice0.printSchema() # resolvechoice0.show(10) print('START WRITE TO S3-------------------------') datasink0 = glueContext.write_dynamic_frame.from_options(frame=dropnullfields0, connection_type="s3", connection_options={ "path": "s3://dtsodin/student_behavior/student_rating/", "partitionKeys": ["behavior_id"]}, format="parquet", transformation_ctx="datasink0") print('END WRITE TO S3-------------------------') # datasink0 = glueContext.write_dynamic_frame.from_jdbc_conf(frame=dropnullfields0, # catalog_connection="glue_redshift", # connection_options={ # "dbtable": "student_rating_temp", # "database": "dts_odin" # }, # redshift_tmp_dir="s3a://dtsodin/temp/student_rating_temp/", # transformation_ctx="datasink0") applymapping1 = ApplyMapping.apply(frame=dyf_rating_cara, mappings=[("student_behavior_id", "string", "student_behavior_id", "string"), ("student_behavior_date", "long", "student_behavior_date", "long"), ("behavior_id", "long", "behavior_id", "long"), ("student_id", "long", "student_id", "long"), ("contact_id", "string", "contact_id", "string"), ("package_code", "string", "package_code", "string"), ("package_endtime", "long", "package_endtime", "long"), ("package_starttime", "long", "package_starttime", "long"), ("student_level_code", "string", "student_level_code", "string"), ("student_status_code", "string", "student_status_code", "string"), ("transformed_at", "long", "transformed_at", "long")]) applymapping1.printSchema() print applymapping1.count() applymapping1.show(10) resolvechoice1 = ResolveChoice.apply(frame=applymapping1, choice="make_cols", transformation_ctx="resolvechoice1") dropnullfields1 = DropNullFields.apply(frame=resolvechoice1, transformation_ctx="dropnullfields1") print resolvechoice1.count() # 
resolvechoice1.printSchema() # resolvechoice1.show(10) print('START WRITE TO S3-------------------------') datasink6 = glueContext.write_dynamic_frame.from_options(frame=dropnullfields1, connection_type="s3", connection_options={ "path": "s3://dtsodin/student_behavior/student_behavior/", "partitionKeys": ["behavior_id"]}, format="parquet", transformation_ctx="datasink6") print('END WRITE TO S3-------------------------') # datasink1 = glueContext.write_dynamic_frame.from_jdbc_conf(frame=dropnullfields1, # catalog_connection="glue_redshift", # connection_options={ # "dbtable": "student_behavior", # "database": "dts_odin" # }, # redshift_tmp_dir="s3a://dtsodin/temp/student_behavior", # transformation_ctx="datasink1") df_temp = dyf_tblreply_rating.toDF() flag = df_temp.agg({"_key": "max"}).collect()[0][0] flag_data = [flag] df = spark.createDataFrame(flag_data, "long").toDF('flag') # ghi de _key vao s3 df.write.parquet("s3a://dtsodin/flag/flag_hoc_vien_rating_native_smile_h2472.parquet", mode="overwrite") except Exception as e: print e
def main():
    """Glue job: tag active POSS tuition payments as behavior 1.

    Reads courses, students, product packages and payment records from the
    `poss` catalog, joins payment -> course -> student -> package, adds a
    constant BehaviorId=1 column, and loads the mapped rows into the
    dts_odin.student_behavior table on Redshift.
    """
    glue_context = GlueContext(SparkContext.getOrCreate())
    spark = glue_context.spark_session

    # --- courses (khoa_hoc) ---
    dyf_khoa_hoc = glue_context.create_dynamic_frame.from_catalog(
        database="poss", table_name="khoa_hoc")
    dyf_khoa_hoc.printSchema()
    dyf_khoa_hoc.show(2)
    dyf_khoa_hoc = dyf_khoa_hoc.select_fields(
        ['_key', 'makh', 'mahv', 'goi_sanpham_id', 'trang_thai'])
    dyf_khoa_hoc = dyf_khoa_hoc.resolveChoice(specs=[('_key', 'cast:long')])
    print("khoa_hoc: ", dyf_khoa_hoc.count())

    # --- students (hoc_vien) ---
    dyf_hoc_vien = glue_context.create_dynamic_frame.from_catalog(
        database="poss", table_name="hoc_vien")
    dyf_hoc_vien = dyf_hoc_vien.select_fields(
        ['_key', 'mahv', 'crm_id', 'trang_thai'])
    dyf_hoc_vien = dyf_hoc_vien.resolveChoice(specs=[('_key', 'cast:long')])
    print("hoc_vien: ", dyf_hoc_vien.count())

    # --- product packages (goi_sanpham) ---
    dyf_goi_sanpham = glue_context.create_dynamic_frame.from_catalog(
        database="poss", table_name="goi_sanpham")
    dyf_goi_sanpham = dyf_goi_sanpham.select_fields(
        ['_key', 'id', 'ma', 'solan_baoluu', 'songay_baoluu', 'trang_thai'])
    dyf_goi_sanpham = dyf_goi_sanpham.resolveChoice(
        specs=[('_key', 'cast:long')])
    print("goi_sanpham: ", dyf_goi_sanpham.count())

    # --- tuition payments (ghinhan_hp) ---
    dyf_ghinhan_hp = glue_context.create_dynamic_frame.from_catalog(
        database="poss", table_name="ghinhan_hp")
    dyf_ghinhan_hp = dyf_ghinhan_hp.select_fields(
        ['_key', 'id', 'ngay_thanhtoan', 'so_tien', 'khoa_hoc_makh',
         'trang_thai'])
    dyf_ghinhan_hp = dyf_ghinhan_hp.resolveChoice(
        specs=[('_key', 'cast:long')])
    print("ghinhan_hp: ", dyf_ghinhan_hp.count())

    # Incremental-load flag is read from S3, but the `_key > max_key` filter
    # that would consume it is currently disabled.
    df_flag = spark.read.parquet(
        "s3a://dts-odin/flag/flag_rating_class.parquet")

    if dyf_ghinhan_hp.count() > 0:
        df_ghinhan_hp = dyf_ghinhan_hp.toDF()
        df_khoa_hoc = dyf_khoa_hoc.toDF()
        df_hoc_vien = dyf_hoc_vien.toDF()
        df_goi_sanpham = dyf_goi_sanpham.toDF()

        # payment -> course -> student -> package
        df_joined = df_ghinhan_hp.join(
            df_khoa_hoc,
            (df_ghinhan_hp['khoa_hoc_makh'] == df_khoa_hoc['makh']))
        df_joined = df_joined.join(
            df_hoc_vien, (df_joined['mahv'] == df_hoc_vien['mahv']))
        df_joined = df_joined.join(
            df_goi_sanpham,
            (df_joined['goi_sanpham_id'] == df_goi_sanpham['id']))
        # every surviving payment row is one "behavior 1" event
        df_joined = df_joined.withColumn('BehaviorId', f.lit(1))

        dyf_joined = DynamicFrame.fromDF(df_joined, glue_context,
                                         "datasource0")
        dyf_mapped = ApplyMapping.apply(
            frame=dyf_joined,
            mappings=[("ngay_thanhtoan", "string", "StudentBehaviorDate",
                       "timestamp"),
                      ("ma", "string", "PackagePOSSCode", "string"),
                      ("so_tien", "double", "measure1", "int"),
                      ("songay_baoluu", "int", "songay_baoluu", "int"),
                      ("solan_baoluu", "int", "solan_baoluu", "int"),
                      ("BehaviorId", "int", "BehaviorId", "int")])
        dyf_resolved = ResolveChoice.apply(
            frame=dyf_mapped, choice="make_cols",
            transformation_ctx="resolvechoice2")
        dyf_clean = DropNullFields.apply(
            frame=dyf_resolved, transformation_ctx="dropnullfields3")
        print(dyf_clean.count())
        dyf_clean.toDF().show()

        # load into Redshift
        glue_context.write_dynamic_frame.from_jdbc_conf(
            frame=dyf_clean,
            catalog_connection="glue_redshift",
            connection_options={
                "dbtable": "student_behavior",
                "database": "dts_odin"
            },
            redshift_tmp_dir="s3n://dts-odin/topicalms/mdl_toannt_rating_class/",
            transformation_ctx="datasink4")
def main():
    """Map TAAM/TENUP invoice purchases to student status-change rows.

    Joins tig_market invoice product details with end-user product usage and
    the tig_advisor contact -> student mapping, derives a status id from the
    product category code, and stages the rows in Redshift table
    temp_mapping_status; postactions merge them into
    mapping_changed_status_student_v1 and backfill user_id from user_map.
    """
    glueContext = GlueContext(SparkContext.getOrCreate())
    spark = glueContext.spark_session

    # ---- source: product usage events (contact, product, unix time) ----
    dyf_tpe_enduser_used_product = glueContext.create_dynamic_frame.from_catalog(
        database="tig_market", table_name="tpe_enduser_used_product")
    dyf_tpe_enduser_used_product = dyf_tpe_enduser_used_product.select_fields(
        ['contact_id', 'product_id', 'timecreated'])

    # ---- source: invoice product details (category code per product) ----
    dyf_tpe_invoice_product_details = glueContext.create_dynamic_frame.from_catalog(
        database="tig_market", table_name="tpe_invoice_product_details")
    dyf_tpe_invoice_product_details = dyf_tpe_invoice_product_details.select_fields(
        ['id', 'cat_code'])

    # ---- source: contact -> student mapping ----
    dyf_student_contact = glueContext.create_dynamic_frame.from_catalog(
        database="tig_advisor", table_name="student_contact")
    dyf_student_contact = dyf_student_contact.select_fields(
        ['contact_id', 'student_id']).rename_field('contact_id', 'ct_id')

    df_student_contact = dyf_student_contact.toDF()
    print('df_student_contact')
    df_student_contact.show()

    df_tpe_invoice_product_details = dyf_tpe_invoice_product_details.toDF()
    df_tpe_invoice_product_details = df_tpe_invoice_product_details.\
        where("cat_code like 'TAAM%' OR cat_code like 'TENUP%' ")
    # Derive the target status id from the category code.
    # FIX: the original used Python 2 long literals (999L, 998L, 999999999L),
    # which are a syntax error on Python 3; use an explicit cast to Spark's
    # long type so the ApplyMapping source type ("long") still matches.
    df_tpe_invoice_product_details = df_tpe_invoice_product_details.withColumn(
        'to_status_id',
        f.when(df_tpe_invoice_product_details.cat_code.like('TAAM%'),
               f.lit(999).cast('long')).when(
                   df_tpe_invoice_product_details.cat_code.like('TENUP%'),
                   f.lit(998).cast('long')).otherwise(
                       f.lit(999999999).cast('long')))
    df_tpe_invoice_product_details.show(2)

    df_tpe_enduser_used_product = dyf_tpe_enduser_used_product.toDF()

    # ---- joins: product details -> usage events -> student contact ----
    df_join = df_tpe_invoice_product_details.join(
        df_tpe_enduser_used_product,
        df_tpe_invoice_product_details.id ==
        df_tpe_enduser_used_product.product_id)
    df_join.printSchema()
    print('df_join ::', df_join.count())

    df_join1 = df_join.join(df_student_contact,
                            df_student_contact.ct_id == df_join.contact_id)
    # yyyyMMdd date key derived from the unix event timestamp.
    df_join1 = df_join1.withColumn(
        'change_status_date_id', from_unixtime(df_join1.timecreated,
                                               "yyyyMMdd"))
    df_join1.printSchema()
    print('df_join1 ::', df_join1.count())

    df_result = df_join1.select('student_id', 'change_status_date_id',
                                'to_status_id', 'contact_id')
    df_result.printSchema()
    df_result.show(3)
    df_result = df_result.drop_duplicates()
    df_result.cache()
    print('count df_result::', df_result.count())

    dyf_result = DynamicFrame.fromDF(df_result, glueContext, "dyf_result")
    # Drop rows missing any key column before loading.
    dyf_result = Filter.apply(
        frame=dyf_result,
        f=lambda x: x["student_id"] is not None and x[
            "change_status_date_id"] is not None and x[
                "to_status_id"] is not None and x["contact_id"] is not None)

    apply_output = ApplyMapping.apply(
        frame=dyf_result,
        mappings=[
            ("student_id", "string", "student_id", "long"),
            ("change_status_date_id", "string", "change_status_date_id",
             "long"),
            ("to_status_id", "long", "to_status_id", "long"),
            ("contact_id", "string", "contact_id", "string"),
        ])

    df_apply_output = apply_output.toDF()
    # FIX: DataFrame.drop_duplicates() returns a new DataFrame; the original
    # discarded the result, so duplicates were never actually removed here.
    df_apply_output = df_apply_output.drop_duplicates()
    print('df_apply_output.count', df_apply_output.count())
    dyf_apply_output = DynamicFrame.fromDF(df_apply_output, glueContext,
                                           "dyf_apply_output")
    resolve_choice = ResolveChoice.apply(frame=dyf_apply_output,
                                         choice="make_cols",
                                         transformation_ctx="resolvechoice2")
    dropnullfields = DropNullFields.apply(frame=resolve_choice,
                                          transformation_ctx="dropnullfields")

    # Stage into temp_mapping_status; postactions merge into the target
    # table, backfill user_id, then drop the temp table.
    datasink4 = glueContext.write_dynamic_frame.from_jdbc_conf(
        frame=dropnullfields,
        catalog_connection="glue_redshift",
        connection_options={
            "dbtable": "temp_mapping_status",
            "database": "dts_odin",
            "postactions": """
                insert into mapping_changed_status_student_v1(student_id, change_status_date_id, to_status_id, contact_id)
                select student_id, change_status_date_id, to_status_id, contact_id from temp_mapping_status;
                update mapping_changed_status_student_v1
                set user_id = (select user_id from user_map where source_type = 2 and source_id = student_id)
                where user_id is null;
                DROP TABLE IF EXISTS temp_mapping_status
            """
        },
        redshift_tmp_dir="s3n://datashine-dwh/temp1/",
        transformation_ctx="datasink4")

    df_result.unpersist()
    df_student_contact.unpersist()
    print(
        '------------------------>___complete__________------------------------------>'
    )
def main():
    # ETL: per-day counts of smile replies (tblreply_rating) per user, joined
    # with the tma_dm_tu_dien lookup table, loaded into Redshift table
    # mapping_changed_status_student. Incremental via a _key flag in S3.
    glueContext = GlueContext(SparkContext.getOrCreate())
    spark = glueContext.spark_session
    mdl_reply_rating = glueContext.create_dynamic_frame.from_catalog(
        database="native_smile", table_name="tblreply_rating")
    mdl_reply_rating = mdl_reply_rating.select_fields(
        ['_key', 'id', 'userid', 'ratingid', 'time_rating'])
    mdl_reply_rating = mdl_reply_rating.resolveChoice(specs=[('_key',
                                                             'cast:long')])

    # Read the incremental-load flag from S3 and keep only rows newer than it.
    df_flag = spark.read.parquet(
        "s3a://dts-odin/flag/flag_rating_class_h2472.parquet")
    max_key = df_flag.collect()[0]['flag']
    print("max_key: ", max_key)
    mdl_reply_rating = Filter.apply(frame=mdl_reply_rating,
                                    f=lambda x: x["_key"] > max_key)

    if (mdl_reply_rating.count() > 0):
        print(mdl_reply_rating.count())
        try:
            # Lookup table; only rows of dictionary type 7 are kept.
            mdl_dm_tu_dien = glueContext.create_dynamic_frame.from_catalog(
                database="dts_odin", table_name="tma_dm_tu_dien")
            mdl_dm_tu_dien = mdl_dm_tu_dien.select_fields(
                ['id', 'ma_tu_dien',
                 'id_dm_loai_tu_dien']).rename_field('id', 'tbl_tu_dien_id')
            mdl_dm_tu_dien = Filter.apply(
                frame=mdl_dm_tu_dien,
                f=lambda x: x["id_dm_loai_tu_dien"] == 7)
            # Only rating ids 14..18 are relevant (mapped to points 1..5
            # below).
            mdl_reply_rating = Filter.apply(frame=mdl_reply_rating,
                                            f=lambda x: x["ratingid"] in
                                            (14, 15, 16, 17, 18))
            df_mdl_reply_rating = mdl_reply_rating.toDF()
            df_mdl_dm_tu_dien = mdl_dm_tu_dien.toDF()
            df_join_rating_tu_dien = df_mdl_reply_rating.join(
                df_mdl_dm_tu_dien,
                (df_mdl_reply_rating['ratingid'] ==
                 df_mdl_dm_tu_dien['tbl_tu_dien_id']), 'left_outer')
            df_join_rating_tu_dien = df_join_rating_tu_dien.withColumn(
                'id_time',
                from_unixtime(df_join_rating_tu_dien['time_rating']))
            # yyyyMMdd date key derived from the unix rating timestamp.
            df_join_rating_tu_dien = df_join_rating_tu_dien.withColumn(
                'mapping_id_time',
                from_unixtime(df_join_rating_tu_dien['time_rating'],
                              "yyyyMMdd"))

            # Number of ratings of each value per user per day.
            df_join_rating_tu_dien = df_join_rating_tu_dien.groupby(
                'userid', 'mapping_id_time',
                'ratingid').agg(f.count('ratingid'))
            df_join_rating_tu_dien.printSchema()
            # ratingid 14..18 -> points 1..5.
            df_join_rating_tu_dien = df_join_rating_tu_dien.withColumn(
                'points',
                when(df_join_rating_tu_dien['ratingid'] == 14, f.lit(1)).when(
                    df_join_rating_tu_dien['ratingid'] == 15, f.lit(2)).when(
                        df_join_rating_tu_dien['ratingid'] == 16,
                        f.lit(3)).when(
                            df_join_rating_tu_dien['ratingid'] == 17,
                            f.lit(4)).when(
                                df_join_rating_tu_dien['ratingid'] == 18,
                                f.lit(5)))
            # Fixed status id for this feed.
            df_join_rating_tu_dien = df_join_rating_tu_dien.withColumn(
                'to_status_id', f.lit(505))
            dyf_join_rating_user_tu_dien = DynamicFrame.fromDF(
                df_join_rating_tu_dien, glueContext, "datasource0")
            applymapping_count_rating = ApplyMapping.apply(
                frame=dyf_join_rating_user_tu_dien,
                mappings=[("userid", "long", "student_id", "long"),
                          ("mapping_id_time", "string",
                           "change_status_date_id", "long"),
                          ("points", "int", "measure1", "double"),
                          ("to_status_id", "int", "to_status_id", "long"),
                          ("count(ratingid)", "long", "measure2", "double")])
            resolvechoice_count_rating = ResolveChoice.apply(
                frame=applymapping_count_rating,
                choice="make_cols",
                transformation_ctx="resolvechoice_count_rating")
            dropnullfields_count_rating_star = DropNullFields.apply(
                frame=resolvechoice_count_rating,
                transformation_ctx="dropnullfields3")
            print(dropnullfields_count_rating_star.count())
            dropnullfields_count_rating_star.toDF().show()
            # Load; postactions backfill user_id from user_map.
            datasink4 = glueContext.write_dynamic_frame.from_jdbc_conf(
                frame=dropnullfields_count_rating_star,
                catalog_connection="glue_redshift",
                connection_options={
                    "dbtable": "mapping_changed_status_student",
                    "database": "dts_odin",
                    "postactions": """ UPDATE mapping_changed_status_student SET user_id = ( SELECT user_id FROM user_map WHERE source_type = 2 AND source_id = student_id ) WHERE user_id IS NULL """
                },
                redshift_tmp_dir=
                "s3n://dts-odin/topicalms/mdl_toannt_rating_class/",
                transformation_ctx="datasink4")

            # Write back the flag: max _key of the rows just processed
            # (the frame was already filtered to _key > max_key above).
            mdl_reply_rating = mdl_reply_rating.toDF()
            flag = mdl_reply_rating.agg({"_key": "max"}).collect()[0][0]
            flag_data = [flag]
            df = spark.createDataFrame(flag_data, "long").toDF('flag')
            # Overwrite the _key flag in S3.
            df.write.parquet(
                "s3a://dts-odin/flag/flag_rating_class_h2472.parquet",
                mode="overwrite")
        except Exception as e:
            # NOTE(review): this swallows every error as "No new data" —
            # genuine failures (schema drift, Redshift errors) are hidden.
            print("No new data")
            print(e)
def main():
    """Count class ratings (1-5 stars) per student per day into Redshift.

    Reads topicalms.mdl_rating_class, aggregates the number of ratings per
    (student, day, points) and maps the star value to a status id (20-24).
    Rows are staged in test_rating_star_temp; Redshift postactions merge them
    into mapping_changed_status_student, accumulating measure2 for rows that
    already exist. Finally the max _key is written back to S3 as the
    incremental-load flag.

    Change vs. original: removed the large blocks of commented-out dead code
    (old per-rating mapping, avg-rating pipeline, log-service join); the live
    statements are unchanged.
    """
    glueContext = GlueContext(SparkContext.getOrCreate())
    spark = glueContext.spark_session
    mdl_rating_class = glueContext.create_dynamic_frame.from_catalog(
        database="topicalms", table_name="mdl_rating_class")
    mdl_rating_class = mdl_rating_class.select_fields([
        '_key', 'id', 'points', 'vote', 'student_id', 'teacher_id', 'room_id',
        'timecreated', 'opinion'
    ])
    mdl_rating_class = mdl_rating_class.resolveChoice(specs=[('_key',
                                                             'cast:long')])

    # Read the incremental-load flag from S3.
    # NOTE(review): max_key is only printed; the _key > max_key filter is not
    # applied, so every run reprocesses the full source.
    df_flag = spark.read.parquet(
        "s3a://dts-odin/flag/flag_rating_class.parquet")
    max_key = df_flag.collect()[0]['flag']
    print("max_key: ", max_key)

    if (mdl_rating_class.count() > 0):
        print(mdl_rating_class.count())
        try:
            df_mdl_rating_class = mdl_rating_class.toDF()
            df_mdl_rating_class = df_mdl_rating_class.withColumn(
                'id_time', from_unixtime(df_mdl_rating_class['timecreated']))
            # yyyyMMdd date key derived from the unix rating timestamp.
            df_mdl_rating_class = df_mdl_rating_class.withColumn(
                'mapping_id_time',
                from_unixtime(df_mdl_rating_class['timecreated'], "yyyyMMdd"))

            # Number of 1-5 star ratings per student per day.
            df_mdl_rating_class = df_mdl_rating_class.groupby(
                'student_id', 'mapping_id_time',
                'points').agg(f.count('points'))
            df_mdl_rating_class.printSchema()
            # Map star value -> status id (1->20 ... 5->24; default 20).
            df_mdl_rating_class = df_mdl_rating_class.withColumn(
                'to_status_id',
                when(df_mdl_rating_class['points'] == 1, f.lit(20)).when(
                    df_mdl_rating_class['points'] == 2,
                    f.lit(21)).when(df_mdl_rating_class['points'] == 3,
                                    f.lit(22)).when(
                                        df_mdl_rating_class['points'] == 4,
                                        f.lit(23)).when(
                                            df_mdl_rating_class['points'] == 5,
                                            f.lit(24)).otherwise(f.lit(20)))
            df_data_count_rating_star = DynamicFrame.fromDF(
                df_mdl_rating_class, glueContext, "datasource0")
            # NOTE(review): teacher_id is not in the groupby output, so the
            # ("teacher_id", ...) mapping has no source column — confirm the
            # staged table really exposes teacher_id for the postactions SQL.
            applymapping_count_rating = ApplyMapping.apply(
                frame=df_data_count_rating_star,
                mappings=[("student_id", "string", "student_id", "bigint"),
                          ("mapping_id_time", "string",
                           "change_status_date_id", "int"),
                          ("points", "int", "points", "int"),
                          ("to_status_id", "int", "to_status_id", "int"),
                          ("teacher_id", "string", "teacher_id", "bigint"),
                          ("count(points)", "long", "measure", "int")])
            resolvechoice_count_rating = ResolveChoice.apply(
                frame=applymapping_count_rating,
                choice="make_cols",
                transformation_ctx="resolvechoice_count_rating")
            dropnullfields_count_rating_star = DropNullFields.apply(
                frame=resolvechoice_count_rating,
                transformation_ctx="dropnullfields3")
            print(dropnullfields_count_rating_star.count())
            dropnullfields_count_rating_star.toDF().show()
            # Stage then merge: accumulate measure2 for existing
            # (student, day, status) rows, replace them, drop temp tables.
            datasink4 = glueContext.write_dynamic_frame.from_jdbc_conf(
                frame=dropnullfields_count_rating_star,
                catalog_connection="glue_redshift",
                connection_options={
                    "dbtable": "test_rating_star_temp",
                    "database": "dts_odin",
                    "postactions": """DROP TABLE IF EXISTS temp_rating_star;
                        CREATE TABLE temp_rating_star AS
                        SELECT mcss.id as "id", trs.student_id as "student_id", trs.change_status_date_id as "change_status_date_id", trs.to_status_id as "to_status_id", trs.points as "points",
                        COALESCE(mcss.measure2 + trs.measure, trs.measure) as "measure2_a", trs.teacher_id as "teacher_id"
                        FROM test_rating_star_temp trs
                        LEFT JOIN mapping_changed_status_student mcss
                        ON mcss.student_id = trs.student_id
                        AND mcss.change_status_date_id = trs.change_status_date_id
                        AND mcss.to_status_id IN (20,21,22,23,24);
                        DELETE mapping_changed_status_student
                        WHERE mapping_changed_status_student.id in (SELECT id FROM temp_rating_star WHERE id is not null);
                        INSERT INTO mapping_changed_status_student(student_id, change_status_date_id, to_status_id, measure1, measure2, teacher_id)
                        SELECT student_id, change_status_date_id, to_status_id, points, measure2_a, teacher_id
                        FROM temp_rating_star;
                        DROP TABLE IF EXISTS temp_rating_star;
                        DROP TABLE IF EXISTS test_rating_star_temp;"""
                },
                redshift_tmp_dir=
                "s3n://dts-odin/topicalms/mdl_toannt_rating_class/",
                transformation_ctx="datasink4")

            # Write back the new flag: max _key seen in the source.
            mdl_rating_class_tmp = mdl_rating_class.toDF()
            flag = mdl_rating_class_tmp.agg({"_key": "max"}).collect()[0][0]
            flag_data = [flag]
            df = spark.createDataFrame(flag_data, "long").toDF('flag')
            # Overwrite the _key flag in S3.
            df.write.parquet("s3a://dts-odin/flag/flag_rating_class.parquet",
                             mode="overwrite")
        except Exception as e:
            # NOTE(review): swallows every failure as "No new data".
            print("No new data")
            print(e)
def main():
    # ETL: derive activation dates (ngay_kich_hoat) from product-usage status
    # history rows that transitioned out of DEACTIVED.
    # NOTE(review): the Redshift load below is commented out — this run only
    # prints counts, yet it still overwrites the _key flag at the end, so
    # re-enabling the load later would skip the rows processed by dry runs.
    glueContext = GlueContext(SparkContext.getOrCreate())
    spark = glueContext.spark_session
    mdl_tpe_enduser_used_product_history = glueContext.create_dynamic_frame.from_catalog(
        database="tig_market", table_name="tpe_enduser_used_product_history")
    mdl_tpe_enduser_used_product_history = mdl_tpe_enduser_used_product_history.select_fields(
        ['_key', 'id', 'used_product_id', 'contact_id', 'status_new',
         'status_old', 'timecreated'])
    mdl_tpe_enduser_used_product_history = mdl_tpe_enduser_used_product_history.resolveChoice(
        specs=[('_key', 'cast:long')])

    # Incremental-load flag read is disabled; every run scans the full table.
    # df_flag = spark.read.parquet("s3a://dts-odin/flag/flag_LS_A3.parquet")
    # max_key = df_flag.collect()[0]['flag']
    # print("max_key: ", max_key)
    # mdl_tpe_enduser_used_product_history = Filter.apply(
    #     frame=mdl_tpe_enduser_used_product_history,
    #     f=lambda x: x["_key"] > max_key)

    if (mdl_tpe_enduser_used_product_history.count() > 0):
        # Keep only transitions out of DEACTIVED with all key fields present.
        mdl_tpe_enduser_used_product_history = Filter.apply(
            frame=mdl_tpe_enduser_used_product_history,
            f=lambda x: x["timecreated"] is not None and x[
                "contact_id"] is not None and x["used_product_id"] is not None
            and x["status_old"] == 'DEACTIVED' and x["status_new"] is not None)
        df_mdl_tpe_enduser_used_product_history = mdl_tpe_enduser_used_product_history.toDF()
        # yyyyMMdd activation-date key derived from the unix timestamp.
        df_mdl_tpe_enduser_used_product_history = df_mdl_tpe_enduser_used_product_history.withColumn(
            'ngay_kich_hoat',
            from_unixtime(
                df_mdl_tpe_enduser_used_product_history["timecreated"],
                "yyyyMMdd"))
        df_mdl_tpe_enduser_used_product_history = df_mdl_tpe_enduser_used_product_history.select(
            'used_product_id', 'contact_id', 'ngay_kich_hoat',
            'id').withColumnRenamed('used_product_id', 'id_product_buy')
        data_mdl_tpe_enduser_used_product_history = DynamicFrame.fromDF(
            df_mdl_tpe_enduser_used_product_history, glueContext,
            "datasource")
        applymapping1 = ApplyMapping.apply(
            frame=data_mdl_tpe_enduser_used_product_history,
            mappings=[("contact_id", "string", "contact_id", "string"),
                      ("ngay_kich_hoat", "string", "ngay_kich_hoat",
                       "timestamp"),
                      ("id", "string", "id", "string")])
        resolvechoice2 = ResolveChoice.apply(
            frame=applymapping1,
            choice="make_cols",
            transformation_ctx="resolvechoice2")
        dropnullfields3 = DropNullFields.apply(
            frame=resolvechoice2, transformation_ctx="dropnullfields3")
        print("count: ", dropnullfields3.count())
        dropnullfields3.show()
        # Redshift load disabled (kept for reference):
        # datasink4 = glueContext.write_dynamic_frame.from_jdbc_conf(
        #     frame=dropnullfields3,
        #     catalog_connection="glue_redshift",
        #     connection_options={
        #         "dbtable": "temp_ls_trang_thai_a3_1",
        #         "database": "dts_odin",
        #         "postactions": """ call tiepnv_proc_status_a3(1,'');"""},
        #     redshift_tmp_dir="s3n://datashine-dwh/temp1/",
        #     transformation_ctx="datasink4")

        # Write back the new flag: max _key of the filtered source.
        # NOTE(review): the flag is written to datashine-dev-redshift-backup
        # while the disabled read above used dts-odin — confirm the intended
        # flag path before re-enabling the incremental filter.
        datasourceTmp = mdl_tpe_enduser_used_product_history.toDF()
        flag = datasourceTmp.agg({"_key": "max"}).collect()[0][0]
        flag_data = [flag]
        # Overwrite the _key flag in S3.
        df = spark.createDataFrame(flag_data, "long").toDF('flag')
        df.write.parquet(
            "s3a://datashine-dev-redshift-backup/flag/flag_LS_A3.parquet",
            mode="overwrite")