from awsglue.context import GlueContext
from awsglue.dynamicframe import DynamicFrame
from pyspark.context import SparkContext


def main():
    # ========== init
    glue_context = GlueContext(SparkContext.getOrCreate())

    # ========== retrieve dynamic frame (the helper returns a Spark DataFrame)
    df_advisor = retrieve_dynamic_frame(
        glue_context,
        'tig_advisor',
        'advisor_account',
        ['user_id', 'user_name', 'user_email'])
    display(df_advisor, "df_advisor")

    df_advisor = df_advisor \
        .withColumnRenamed('user_id', 'advisor_id') \
        .withColumnRenamed('user_email', 'email')
    display(df_advisor, "df_advisor renamed")

    dyf_advisor = DynamicFrame.fromDF(df_advisor, glue_context, "dyf_advisor")
    display(dyf_advisor, "dyf_advisor")

    # ========== save dynamic frame to redshift
    save_data_to_redshift(
        glue_context,
        dyf_advisor,
        'student_learning_fact',
        'advisor_dim',
        's3://datashine-dev-redshift-backup/student_learning_fact/advisor_dim',
        'student_learning_dim')
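# The helpers retrieve_dynamic_frame and save_data_to_redshift are defined
# elsewhere in the project. Below is a minimal sketch of what they might look
# like, inferred from the call sites above: the parameter roles, the catalog
# connection name 'redshift-connection', and treating the last argument as a
# transformation context are assumptions, not the project's actual code.
def retrieve_dynamic_frame(glue_context, database, table_name, fields):
    """Read a Data Catalog table, keep the listed fields, return a DataFrame."""
    dyf = glue_context.create_dynamic_frame.from_catalog(
        database=database,
        table_name=table_name
    ).select_fields(fields)
    return dyf.toDF()


def save_data_to_redshift(glue_context, dyf, database, table, tmp_dir, ctx):
    """Write a DynamicFrame to Redshift, staging through the given S3 tmp dir."""
    glue_context.write_dynamic_frame.from_jdbc_conf(
        frame=dyf,
        catalog_connection='redshift-connection',  # assumed connection name
        connection_options={'dbtable': table, 'database': database},
        redshift_tmp_dir=tmp_dir,
        transformation_ctx=ctx)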
def processBatch(data_frame, batchId):
    if data_frame.count() > 0:
        datasource0 = DynamicFrame.fromDF(data_frame, glueContext, "from_data_frame")

        now = datetime.datetime.now()
        path_datasink1 = (
            f"s3://{args['datalake_bkt_name']}/{args['datalake_bkt_prefix']}"
            f"/ingest_year={now.year:0>4}/ingest_month={now.month:0>2}"
            f"/ingest_day={now.day:0>2}/ingest_hour={now.hour:0>2}/"
        )

        datasink1 = glueContext.write_dynamic_frame.from_options(
            frame=datasource0,
            connection_type="s3",
            connection_options={"path": path_datasink1},
            format="parquet",
            transformation_ctx="datasink1",
        )
        logger.info('{"batch_process_successful":True}')
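# processBatch above is presumably the callback of a Glue streaming job. A
# minimal sketch of how it could be wired up with glueContext.forEachBatch,
# assuming a streaming source table in the Data Catalog; the database/table
# names, window size, and checkpoint location below are placeholders.
sourceData = glueContext.create_data_frame.from_catalog(
    database="datalake_raw",        # assumed database name
    table_name="raw_events",        # assumed streaming source table
    transformation_ctx="sourceData",
    additional_options={"startingPosition": "TRIM_HORIZON", "inferSchema": "true"},
)

glueContext.forEachBatch(
    frame=sourceData,
    batch_function=processBatch,
    options={
        "windowSize": "100 seconds",
        "checkpointLocation": args["TempDir"] + "/checkpoint/",  # assumed
    },
)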
def processBatch(data_frame, batchId):
    now = datetime.datetime.now()

    if data_frame.count() > 0:
        dynamic_frame = DynamicFrame.fromDF(data_frame, glueContext, "from_data_frame")
        apply_mapping = ApplyMapping.apply(
            frame=dynamic_frame,
            mappings=[
                ("ventilatorid", "long", "ventilatorid", "long"),
                ("eventtime", "string", "eventtime", "timestamp"),
                ("serialnumber", "string", "serialnumber", "string"),
                ("pressurecontrol", "long", "pressurecontrol", "long"),
                ("o2stats", "long", "o2stats", "long"),
                ("minutevolume", "long", "minutevolume", "long"),
                ("manufacturer", "string", "manufacturer", "string"),
            ],
            transformation_ctx="apply_mapping",
        )
        dynamic_frame.printSchema()

        # Write to S3 Sink, partitioned by ingest time
        s3path = (
            f"{s3_target}/ingest_year={now.year:0>4}/ingest_month={now.month:0>2}"
            f"/ingest_day={now.day:0>2}/ingest_hour={now.hour:0>2}/"
        )
        s3sink = glueContext.write_dynamic_frame.from_options(
            frame=apply_mapping,
            connection_type="s3",
            connection_options={"path": s3path},
            format="parquet",
            transformation_ctx="s3sink",
        )
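# Writing under ingest_year=/ingest_month=/ingest_day=/ingest_hour= prefixes
# gives Hive-style partitions, so downstream batch reads can prune by ingest
# time. A minimal sketch, assuming the sink has been crawled into a catalog
# table; the database and table names below are made up for illustration.
dyf_one_hour = glueContext.create_dynamic_frame.from_catalog(
    database="datalake",                # assumed database name
    table_name="ventilator_metrics",    # assumed crawled table name
    # evaluated against the partition columns before any data files are read
    push_down_predicate=(
        "ingest_year = '2024' and ingest_month = '01' "
        "and ingest_day = '15' and ingest_hour = '09'"
    ),
)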
from datetime import datetime

import pytz
from awsglue.dynamicframe import DynamicFrame
from pyspark import StorageLevel
from pyspark.sql import functions as f


def main():
    ho_chi_minh_timezone = pytz.timezone('Asia/Ho_Chi_Minh')
    today = datetime.now(ho_chi_minh_timezone)
    today_second = int(today.timestamp())
    print('today_id: ', today_second)

    # ------------------------------------------------------------------------------------------------------------------#
    start_year_month_id = 201900
    end_year_month_id = int(today.strftime("%Y%m"))
    try:
        df_flag = spark.read.parquet(FLAG_BC200_ADVISOR_FILE)
        display(df_flag, "df_flag")
        start_year_month_id = int(df_flag.collect()[0]['flag'])
    except Exception:
        print('read flag file error')

    # skip the run if the flag shows this period has already been processed
    if start_year_month_id >= end_year_month_id:
        print('The data was already ETLed for this period', start_year_month_id, end_year_month_id)
        return

    print('start_year_month_id: ', start_year_month_id)
    print('end_year_month_id: ', end_year_month_id)

    # ------------------------------------------------------------------------------------------------------------------#
    push_down_predicate = "( year_month_id >= '" + str(start_year_month_id) + "' " \
                          + " and year_month_id <= '" + str(end_year_month_id) + "') "
    df_student_care_advisor = retrieve_data_frame(
        glue_context,
        database='callcenter',
        table_name='student_care_advisor',
        push_down_predicate=push_down_predicate,
        fields=[
            'idcall', 'student_behavior_date', 'ip_phone', 'student_id',
            'contact_id', 'call_status', 'answer_duration',
            'requested_rating', 'value_rating'
        ])
    if df_student_care_advisor.count() <= 0:
        return

    # -----------------------------------------------------------------------------------------------------------------#
    df_call = calculate_call(df_student_care_advisor)
    df_call.persist(StorageLevel.DISK_ONLY_2)

    df_student_advisor = calculate_advisor(glue_context=glue_context)
    df_student_advisor.persist(StorageLevel.DISK_ONLY_2)

    # -----------------------------------------------------------------------------------------------------------------#
    df_result = df_call.join(df_student_advisor, on=['ip_phone', 'month_id'], how='inner')

    # -----------------------------------------------------------------------------------------------------------------#
    df_result = df_result \
        .withColumn('period_id', f.lit(2)) \
        .withColumnRenamed('month_id', 'time_id')

    # -----------------------------------------------------------------------------------------------------------------#
    df_result = df_result.dropDuplicates()

    # -----------------------------------------------------------------------------------------------------------------#
    dyf_result = DynamicFrame.fromDF(df_result, glue_context, 'test')
    dyf_result = select_student_advisor_fact(dyf_result)
    save_data_to_redshift(glue_context, dyf_result, 'student_native_report',
                          'bc200_advisor.advisor_care_fact', REDSHIFT_TMP_DIR,
                          'advisor_care_fact')

    # -----------------------------------------------------------------------------------------------------------------#
    df_flag = get_flag(spark=spark, data_frame=df_result)
    if df_flag.collect()[0]['flag'] is not None:
        print('save_flag done')
        save_flag(df_flag, FLAG_BC200_ADVISOR_FILE)

    # -----------------------------------------------------------------------------------------------------------------#
    df_call.unpersist()
    df_student_advisor.unpersist()
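# get_flag and save_flag are project helpers; they appear to implement the
# same single-row parquet checkpoint that the try block above reads back.
# A minimal sketch under that assumption; deriving the flag value from the
# max time_id in the output is a guess, not the confirmed logic.
def get_flag(spark, data_frame):
    """Build a one-row DataFrame holding the latest processed period id."""
    return data_frame.agg(f.max('time_id').alias('flag'))


def save_flag(df_flag, flag_file_path):
    """Overwrite the checkpoint that the next run reads as its start point."""
    df_flag.write.parquet(flag_file_path, mode='overwrite')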
from awsglue.dynamicframe import DynamicFrame
from awsglue.transforms import Filter
from pyspark.sql import functions as f


def etl_gia_han_goi_hoc():
    dyf_ghi_nhan_hoc_phi = connectGlue(
        database="poss",
        table_name="ghinhan_hp",
        select_fields=["_key", "khoa_hoc_makh", "ngay_tao"],
        fillter=["khoa_hoc_makh", "ngay_tao"])

    dyf_ghi_nhan_hoc_phi = dyf_ghi_nhan_hoc_phi.resolveChoice(
        specs=[("_key", "cast:long")])

    try:
        flag_smile_care = spark.read.parquet(
            "s3://toxd-olap/transaction_log/flag/flag_behavior_ghi_nhan_hoc_phi.parquet"
        )
        max_key = flag_smile_care.collect()[0]["flag"]
        print("max_key: ", max_key)
        dyf_ghi_nhan_hoc_phi = Filter.apply(frame=dyf_ghi_nhan_hoc_phi,
                                            f=lambda x: x["_key"] > max_key)
    except Exception:
        print("read flag file error")

    if dyf_ghi_nhan_hoc_phi.count() > 0:
        df_ghi_nhan_hoc_phi = dyf_ghi_nhan_hoc_phi.toDF()
        flag = df_ghi_nhan_hoc_phi.agg({"_key": "max"}).collect()[0][0]
        prinDev(df_ghi_nhan_hoc_phi)

        dyf_khoa_hoc_poss = connectGlue(
            database="poss",
            table_name="khoa_hoc",
            select_fields=["makh", "mahv", "goi_sanpham_id"],
            fillter=["makh", "mahv", "goi_sanpham_id"],
            duplicates=["makh", "mahv", "goi_sanpham_id"]
        ).rename_field("mahv", "ma_hv").rename_field("makh", "ma_kh")

        df_khoa_hoc_poss = dyf_khoa_hoc_poss.toDF()
        df_ghi_nhan_hoc_phi = df_khoa_hoc_poss.join(
            df_ghi_nhan_hoc_phi,
            (df_khoa_hoc_poss["ma_kh"] == df_ghi_nhan_hoc_phi["khoa_hoc_makh"]))

        df_ghi_nhan_hoc_phi = df_ghi_nhan_hoc_phi \
            .withColumn("student_behavior_date",
                        f.unix_timestamp(df_ghi_nhan_hoc_phi.ngay_tao, "yyyy-MM-dd HH:mm:ss"))

        dyf_ghi_nhan_hoc_phi = DynamicFrame.fromDF(df_ghi_nhan_hoc_phi, glueContext,
                                                   "dyf_ghi_nhan_hoc_phi")
        dyf_ghi_nhan_hoc_phi = dyf_ghi_nhan_hoc_phi.resolveChoice(
            specs=[("student_behavior_date", "cast:long")])
        df_ghi_nhan_hoc_phi = dyf_ghi_nhan_hoc_phi.toDF()
        prinDev(df_ghi_nhan_hoc_phi)
        df_ghi_nhan_hoc_phi = df_ghi_nhan_hoc_phi.drop("khoa_hoc_makh")

        dyf_hoc_vien_poss = connectGlue(
            database="poss",
            table_name="hoc_vien",
            select_fields=["mahv", "crm_id"],
            fillter=["mahv", "crm_id"],
            duplicates=["mahv", "crm_id"]
        ).rename_field("crm_id", "contact_id")

        df_hoc_vien_poss = dyf_hoc_vien_poss.toDF()
        df_khoa_hoc_contact = df_ghi_nhan_hoc_phi.join(
            df_hoc_vien_poss,
            (df_ghi_nhan_hoc_phi["ma_hv"] == df_hoc_vien_poss["mahv"]),
            "left")
        df_khoa_hoc_contact = df_khoa_hoc_contact.drop("mahv")

        if is_dev:
            print("df_khoa_hoc_contact")
            df_khoa_hoc_contact.show(10)

        # -----------------------------------------------------------------------------------------------------------------#
        df_package_code = package_code()
        df_khoa_hoc_contact_package_code = df_khoa_hoc_contact.join(
            df_package_code,
            (df_khoa_hoc_contact["goi_sanpham_id"] == df_package_code["id"]))
        # drop() returns a new DataFrame, so the result must be reassigned
        df_khoa_hoc_contact_package_code = df_khoa_hoc_contact_package_code.drop(
            "goi_sanpham_id", "id")

        # -----------------------------------------------------------------------------------------------------------------#
        dyf_test_dauvao_poss = connectGlue(
            database="poss",
            table_name="test_dauvao",
            select_fields=["mahv", "trinhdo_dauvao"],
            duplicates=["mahv", "trinhdo_dauvao"],
            fillter=["mahv", "trinhdo_dauvao"])

        df_test_dauvao_poss = dyf_test_dauvao_poss.toDF()
        df_join_level_code = df_khoa_hoc_contact_package_code.join(
            df_test_dauvao_poss,
            (df_khoa_hoc_contact_package_code["ma_hv"] == df_test_dauvao_poss["mahv"]),
            "left")
        df_join_level_code = df_join_level_code.drop("mahv", "ma_hv")

        dyf_student_contact = connectGlue(
            database="tig_advisor",
            table_name="student_contact",
            select_fields=["student_id", "contact_id"],
            duplicates=["student_id", "contact_id"],
            fillter=["student_id", "contact_id"]
        ).rename_field("contact_id", "contact_id_contact")

        df_student_contact = dyf_student_contact.toDF()
        df_join_level_code = df_join_level_code.join(
            df_student_contact,
            (df_student_contact["contact_id_contact"] == df_join_level_code["contact_id"]))
        df_join_level_code = df_join_level_code.drop("contact_id_contact")

        df_join_level_code = set_package_advisor_level(df_join_level_code)
        prinDev(df_join_level_code, "end data")

        for k, v in ADD_COLLUM.items():
            df_join_level_code = df_join_level_code.withColumn(k, v)

        convertAndSaveS3(df_join_level_code)

        # overwrite the max _key flag on S3 for the next incremental run
        flag_data = [flag]
        df = spark.createDataFrame(flag_data, "long").toDF('flag')
        df.write.parquet(
            "s3a://toxd-olap/transaction_log/flag/flag_behavior_ghi_nhan_hoc_phi.parquet",
            mode="overwrite")