import argparse
import sys

from awsglue.context import GlueContext


def main():
    # arguments
    from_s3 = 'from-s3'
    from_jdbc = 'from-jdbc'
    parser = argparse.ArgumentParser(prog=sys.argv[0])
    parser.add_argument('-m', '--mode', required=True, choices=[from_s3, from_jdbc],
                        help='Choose to migrate metastore either from JDBC or from S3')
    parser.add_argument('-c', '--connection-name', required=False,
                        help='Glue Connection name for Hive metastore JDBC connection')
    parser.add_argument('-R', '--region', required=False,
                        help='AWS region of target Glue DataCatalog, defaults to "us-east-1"')
    parser.add_argument('-d', '--database-prefix', required=False,
                        help='Optional prefix for database names in Glue DataCatalog')
    parser.add_argument('-t', '--table-prefix', required=False,
                        help='Optional prefix for table names in Glue DataCatalog')
    parser.add_argument('-D', '--database-input-path', required=False,
                        help='An S3 path containing json files of metastore database entities')
    parser.add_argument('-T', '--table-input-path', required=False,
                        help='An S3 path containing json files of metastore table entities')
    parser.add_argument('-P', '--partition-input-path', required=False,
                        help='An S3 path containing json files of metastore partition entities')

    options = get_options(parser, sys.argv)
    if options['mode'] == from_s3:
        validate_options_in_mode(
            options=options, mode=from_s3,
            required_options=['database_input_path', 'table_input_path', 'partition_input_path'],
            not_allowed_options=['database_prefix', 'table_prefix']
        )
    elif options['mode'] == from_jdbc:
        validate_options_in_mode(
            options=options, mode=from_jdbc,
            required_options=['connection_name'],
            not_allowed_options=['database_input_path', 'table_input_path', 'partition_input_path']
        )
    else:
        raise AssertionError('unknown mode ' + options['mode'])

    validate_aws_regions(options['region'])

    # spark env
    (conf, sc, sql_context) = get_spark_env()
    glue_context = GlueContext(sc)

    # launch job
    if options['mode'] == from_s3:
        metastore_import_from_s3(
            sql_context=sql_context,
            glue_context=glue_context,
            db_input_dir=options['database_input_path'],
            tbl_input_dir=options['table_input_path'],
            parts_input_dir=options['partition_input_path'],
            datacatalog_name='datacatalog',
            region=options.get('region') or 'us-east-1'
        )
    elif options['mode'] == from_jdbc:
        # extract the JDBC configuration once and reuse it below
        connection = glue_context.extract_jdbc_conf(options['connection_name'])
        metastore_full_migration(
            sc=sc,
            sql_context=sql_context,
            glue_context=glue_context,
            connection=connection,
            db_prefix=options.get('database_prefix') or '',
            table_prefix=options.get('table_prefix') or '',
            datacatalog_name='datacatalog',
            region=options.get('region') or 'us-east-1'
        )


if __name__ == '__main__':
    main()
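# The helpers referenced above (get_options, validate_options_in_mode,
# validate_aws_regions, get_spark_env, and the two migration entry points)
# are presumably defined elsewhere in the migration toolkit. A minimal,
# hypothetical sketch of the two option helpers, assuming options are
# exposed as a plain dict keyed by the argparse dest names:
def get_options(parser, argv):
    # parse the CLI arguments and return them as a dict (hypothetical helper)
    return vars(parser.parse_args(argv[1:]))


def validate_options_in_mode(options, mode, required_options, not_allowed_options):
    # every required option must be present, every forbidden one absent
    for name in required_options:
        if options.get(name) is None:
            raise AssertionError('Option %s is required for mode %s' % (name, mode))
    for name in not_allowed_options:
        if options.get(name) is not None:
            raise AssertionError('Option %s is not allowed for mode %s' % (name, mode))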
import sys

from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
import boto3

target_format = "parquet"

## @params: [JOB_NAME]
args = getResolvedOptions(
    sys.argv,
    ['JOB_NAME', 'DL_BUCKET', 'DL_PREFIX', 'DL_REGION', 'GLUE_SRC_DATABASE'])

sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)

dataLakeBucket = args["DL_BUCKET"]
dataLakePrefix = args["DL_PREFIX"]
aws_region = args["DL_REGION"]
glue_database = args["GLUE_SRC_DATABASE"]
job.init(args['JOB_NAME'], args)

client = boto3.client(service_name='glue', region_name=aws_region)
responseGetTables = client.get_tables(DatabaseName=glue_database)
tableList = responseGetTables['TableList']
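# The script presumably iterates over tableList next, converting each catalog
# table to Parquet under the data-lake prefix. A minimal sketch of that loop,
# assuming table names map one-to-one onto output folders (the layout here is
# illustrative, not taken from the original):
for table in tableList:
    table_name = table['Name']
    dyf = glueContext.create_dynamic_frame.from_catalog(
        database=glue_database, table_name=table_name)
    output_path = "s3://{}/{}/{}".format(dataLakeBucket, dataLakePrefix, table_name)
    glueContext.write_dynamic_frame.from_options(
        frame=dyf,
        connection_type="s3",
        connection_options={"path": output_path},
        format=target_format)
job.commit()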
import sys
from datetime import datetime, timedelta

from pyspark.sql.functions import col, when, input_file_name
from pyspark.sql.utils import AnalysisException
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
from awsglue.dynamicframe import DynamicFrame

sc = SparkContext()
glue_context = GlueContext(sc)
spark = glue_context.spark_session
logger = glue_context.get_logger()
job = Job(glue_context)

args = getResolvedOptions(sys.argv, [
    'JOB_NAME', 'aws_region', 'zone', 'source_bucket', 'LOG_LEVEL',
    'backup_bucket', 'source_prefixes', 'partition_key_1', 'partition_key_2',
    'partition_value_1', 'file_type', 'expiry_days', 's3_backup_enabled',
    'scramble_attribute_name', 'scramble_attribute_value_list',
    'part_date_list_str', 'job_run_id'
])
job.init(args['JOB_NAME'], args)
log_level = args['LOG_LEVEL']
# import pyspark modules
from pyspark.context import SparkContext
import pyspark.sql.functions as f
from datetime import datetime

# import glue modules
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from awsglue.context import GlueContext
from awsglue.dynamicframe import DynamicFrame
from awsglue.job import Job

# initialize contexts and session
spark_context = SparkContext.getOrCreate()
glue_context = GlueContext(spark_context)
session = glue_context.spark_session

# parameters
glue_db = "imdb"
glue_tbl = "input"
s3_write_path = "s3://demo-glue-pavan/output"

############################################
# Extract (read data)
############################################

# log stream start time
dt_start = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
print("start time:", dt_start)
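# The extract step this header announces would read the catalog table into a
# DynamicFrame and convert it for Spark transforms. A minimal sketch using the
# parameters above (variable names are illustrative):
dyf = glue_context.create_dynamic_frame.from_catalog(
    database=glue_db, table_name=glue_tbl)
df = dyf.toDF()
print("record count:", df.count())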
import sys

from pyspark.context import SparkContext
from pyspark.sql import functions as f
from pyspark.sql.functions import from_unixtime, unix_timestamp
from awsglue.context import GlueContext
from awsglue.dynamicframe import DynamicFrame
from awsglue.transforms import *


def main():
    sc = SparkContext()
    glueContext = GlueContext(sc)
    spark = glueContext.spark_session
    # job = Job(glueContext)
    # job.init(args['JOB_NAME'], args)
    spark.conf.set("spark.sql.session.timeZone", "GMT+07:00")

    dyf_care_call = glueContext.create_dynamic_frame.from_catalog(
        database='tig_advisor', table_name='care_call')
    dyf_care_call = dyf_care_call.resolveChoice(specs=[('_key', 'cast:long')])

    # print schema and select fields
    print('original schema')
    dyf_care_call.printSchema()
    dyf_care_call.show(10)

    # try:
    #     df_flag = spark.read.parquet("s3a://dts-odin/flag/student_status/temp_ls_a1_dong_tien_tc.parquet")
    #     read_from_index = df_flag.collect()[0]['flag']
    #     print('read from index: ', read_from_index)
    #     dyf_care_call = Filter.apply(frame=dyf_care_call,
    #                                  f=lambda x: x["_key"] > read_from_index)
    # except:
    #     print('read flag file error ')
    # print('the number of new contacts: ', dyf_care_call.count())

    dyf_care_call = dyf_care_call.select_fields(
        ['_key', 'id', 'phone', 'duration', 'call_status', 'time_created']
    ).rename_field('time_created', 'call_date')

    dy_source_care_call_cache = dyf_care_call.toDF()
    dy_source_care_call_cache = dy_source_care_call_cache.dropDuplicates(['id'])
    dy_source_care_call_cache = dy_source_care_call_cache.cache()
    dyf_care_call = DynamicFrame.fromDF(dy_source_care_call_cache, glueContext,
                                        'dyf_care_call')

    # if (dyf_care_call.count() > 0):
    # keep only successful calls longer than 30 seconds with a valid phone and date
    dyf_care_call = Filter.apply(
        frame=dyf_care_call,
        f=lambda x: x["phone"] is not None and x["phone"] != ''
                    and (x["call_status"] == 'success' or x["call_status"] == 'call_success')
                    and x["call_date"] is not None and x["call_date"] != ''
                    and x["duration"] is not None and x["duration"] > 30)

    # print('dyf_care_call::correct')
    print('dyf_care_call number', dyf_care_call.count())

    if dyf_care_call.count() > 0:
        dyf_ad_contact_phone = glueContext.create_dynamic_frame.from_catalog(
            database='tig_advisor', table_name='student_contact_phone')
        dyf_ad_contact_phone = dyf_ad_contact_phone.select_fields(
            ['phone', 'contact_id'])
        dyf_ad_contact_phone = Filter.apply(
            frame=dyf_ad_contact_phone,
            f=lambda x: x["phone"] is not None and x["phone"] != ''
                        and x["contact_id"] is not None and x["contact_id"] != '')

        print('dyf_ad_contact_phone::schema')
        dyf_ad_contact_phone.printSchema()

        # dyf_advisor_ip_phone = glueContext.create_dynamic_frame.from_catalog(database='callcenter',
        #                                                                      table_name='advisor_ip_phone')
        # dyf_advisor_ip_phone = Filter.apply(frame=dyf_advisor_ip_phone,
        #                                     f=lambda x: x["ip_phone"] is not None and x["ip_phone"] != '')

        # ------------------------------------------------------------------ #
        join_call_contact = Join.apply(dyf_care_call, dyf_ad_contact_phone,
                                       'phone', 'phone')
        # join_call_contact = join_call_contact.select_fields(['id_time', 'answertime', 'calldate',
        #     'phonenumber_correct', 'calldate', 'ipphone', 'contact_id'])

        print('join_call_contact::schema------------')
        join_call_contact.printSchema()
        join_call_contact.show(2)
        print('join: ', join_call_contact.count())

        # ------------------------------------------------------------------ #
        # payment history (lich su dong tien)
        dyf_source_ls_dong_tien = glueContext.create_dynamic_frame.from_catalog(
            database='poss', table_name='nvn_poss_lich_su_dong_tien')
        dyf_source_ls_dong_tien = Filter.apply(
            frame=dyf_source_ls_dong_tien,
            f=lambda x: x["contact_id"] is not None and x["contact_id"] != ''
                        and x["ngay_thanhtoan"] is not None and x["ngay_thanhtoan"] != '')
        dyf_source_ls_dong_tien = dyf_source_ls_dong_tien.select_fields([
            '_key', 'id', 'contact_id', 'ngay_thanhtoan', 'ngay_tao', 'makh'
        ]).rename_field('ngay_tao', 'ngay_a0')

        dy_source_ls_dt_cache = dyf_source_ls_dong_tien.toDF()
        dy_source_ls_dt_cache = dy_source_ls_dt_cache.dropDuplicates(['id'])
        dy_source_ls_dt_cache = dy_source_ls_dt_cache.cache()
        dyf_source_ls_dong_tien = DynamicFrame.fromDF(
            dy_source_ls_dt_cache, glueContext, 'dyf_source_ls_dong_tien')

        join_call_contact_ao = Join.apply(join_call_contact, dyf_source_ls_dong_tien,
                                          'contact_id', 'contact_id')

        print('join_call_contact_ao::schema------------')
        join_call_contact_ao.printSchema()
        join_call_contact_ao.show(2)
        print('join: ', join_call_contact_ao.count())

        # join_call_contact_ao = join_call_contact_ao.resolveChoice(specs=[('calldate', 'cast:timestamp'),
        #                                                                  ('ngay_a0', 'cast:timestamp')])

        join_call_contact_ao = Filter.apply(
            frame=join_call_contact_ao,
            f=lambda x: x["call_date"] is not None and x["ngay_a0"] is not None
                        and x["call_date"] > x["ngay_a0"])

        print('join_call_contact_ao::after filter calldate > ngay_a0------------')
        # join_call_contact_ao.printSchema()
        join_call_contact_ao.show(2)
        print('join_call_contact_ao: ', join_call_contact_ao.count())

        # get the first successful welcome call per contact
        df_join_call_contact_ao = join_call_contact_ao.toDF()
        df_join_call_contact_ao = df_join_call_contact_ao.groupby(
            'contact_id', 'makh').agg(f.min('call_date').alias("ngay_a1"))
        df_join_call_contact_ao = df_join_call_contact_ao.withColumn(
            'id_time',
            from_unixtime(
                unix_timestamp(df_join_call_contact_ao.ngay_a1, "yyyy-MM-dd HH:mm:ss"),
                "yyyyMMdd"))
        dyf_result = DynamicFrame.fromDF(df_join_call_contact_ao, glueContext,
                                         'dyf_result')

        print('dyf_result------------')
        # join_call_contact_ao.printSchema()
        dyf_result.show(2)
        print('dyf_result: ', dyf_result.count())

        # select fields
        applymapping1 = ApplyMapping.apply(
            frame=dyf_result,
            mappings=[("contact_id", "string", "contact_id", "string"),
                      ("id_time", "string", "id_time", "bigint"),
                      ("makh", "int", "makh", "int"),
                      ("ngay_a1", "string", "ngay_a1", "timestamp")])

        resolvechoice2 = ResolveChoice.apply(
            frame=applymapping1, choice="make_cols",
            transformation_ctx="resolvechoice2")
        dropnullfields3 = DropNullFields.apply(
            frame=resolvechoice2, transformation_ctx="dropnullfields3")

        # print('dropnullfields3::printSchema')
        # dropnullfields3.printSchema()
        # dropnullfields3.show(2)

        # write data to Redshift
        datasink4 = glueContext.write_dynamic_frame.from_jdbc_conf(
            frame=dropnullfields3,
            catalog_connection="glue_redshift",
            connection_options={
                "dbtable": "temp_ls_dong_tien_a1_v3",
                "database": "dts_odin",
                "postactions": """
                    INSERT INTO mapping_changed_status_student(description, user_id, change_status_date_id, to_status_id, timestamp1)
                    SELECT 'contact_id: ' + temp_a1.contact_id + ' - makh: ' + temp_a1.makh,
                           um.user_id, temp_a1.id_time, 2, temp_a1.ngay_a1
                    FROM temp_ls_dong_tien_a1_v3 temp_a1
                    LEFT JOIN user_map um ON um.source_type = 1 AND um.source_id = temp_a1.contact_id;
                    DROP TABLE IF EXISTS public.temp_ls_dong_tien_a1_v3;
                    CALL update_a1_exception_from_eg()
                """
            },
            redshift_tmp_dir="s3n://dts-odin/temp/temp_ls_dong_tien/v2",
            transformation_ctx="datasink4")

        # write the new high-water mark (max _key) back to S3
        df_datasource = dyf_care_call.toDF()
        flag = df_datasource.agg({"_key": "max"}).collect()[0][0]
        flag_data = [flag]
        df = spark.createDataFrame(flag_data, "long").toDF('flag')
        df.write.parquet(
            "s3a://dts-odin/flag/student_status/temp_ls_a1_dong_tien_tc.parquet",
            mode="overwrite")

        dy_source_care_call_cache.unpersist()


if __name__ == '__main__':
    main()
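# The commented-out block near the top and the closing lines of main()
# implement an incremental "flag" checkpoint: read the last processed _key
# from a small parquet file, filter the source down to newer rows, and write
# the new max back. A minimal reusable sketch of that pattern, with the same
# single-column 'flag' layout used above (helper names are illustrative):
def read_flag(spark, path):
    # return the stored high-water mark, or None when no flag exists yet
    try:
        return spark.read.parquet(path).collect()[0]['flag']
    except Exception:
        return None


def write_flag(spark, path, value):
    # overwrite the single-row flag file with the new high-water mark
    spark.createDataFrame([value], "long").toDF('flag') \
        .write.parquet(path, mode="overwrite")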
import sys

from pyspark.context import SparkContext
from pyspark.sql import functions as f
from pyspark.sql.functions import from_unixtime, unix_timestamp
from awsglue.context import GlueContext
from awsglue.dynamicframe import DynamicFrame
from awsglue.transforms import *


def main():
    sc = SparkContext()
    glueContext = GlueContext(sc)
    spark = glueContext.spark_session
    spark.conf.set("spark.sql.session.timeZone", "GMT+07:00")

    # get dynamic frame source
    # ------------------------------------------------------------------ #
    dyf_native_talk = glueContext.create_dynamic_frame.from_catalog(
        database='native_talk', table_name='native_talk_history_log_api')
    dyf_native_talk = dyf_native_talk.resolveChoice(specs=[('id', 'cast:long')])

    try:
        df_flag = spark.read.parquet(
            "s3a://dts-odin/flag/student_status/tu_hoc/tu_hoc_native_talk.parquet")
        read_from_index = df_flag.collect()[0]['flag']
        print('read from index: ', read_from_index)
        dyf_native_talk = Filter.apply(frame=dyf_native_talk,
                                       f=lambda x: x["id"] > read_from_index)
    except:
        print('read flag file error ')

    dyf_native_talk = dyf_native_talk.select_fields(
        ['id', 'learning_date', 'speaking_dialog_score', 'username', 'updated_time'])

    dy_cache = dyf_native_talk.toDF()
    dy_cache = dy_cache.cache()
    dyf_native_talk = DynamicFrame.fromDF(dy_cache, glueContext, 'dyf_native_talk')

    print('dy_cache------------')
    dy_cache.printSchema()
    print('dy_cache: ', dy_cache.count())
    dy_cache.show(2)

    # ------------------------------------------------------------------ #
    if dyf_native_talk.count() > 0:
        # --------------------------- datasource0 ----------------------- #
        dyf_native_talk = Filter.apply(
            frame=dyf_native_talk,
            f=lambda x: x["username"] is not None and x["username"] != ''
                        and x["speaking_dialog_score"] is not None
                        and x["learning_date"] is not None and x["learning_date"] != '')

        # --------------------------- datasource1 ----------------------- #
        if dyf_native_talk.count() > 0:
            dyf_nt_account_mapping = glueContext.create_dynamic_frame.from_catalog(
                database='native_talk', table_name='native_talk_account_mapping')
            dyf_nt_account_mapping = dyf_nt_account_mapping.select_fields(
                ['contact_id', 'username']).rename_field('username', 'nativetalk_user')

            dy_cache_2 = dyf_nt_account_mapping.toDF()
            dy_cache_2 = dy_cache_2.cache()
            dyf_nt_account_mapping = DynamicFrame.fromDF(dy_cache_2, glueContext,
                                                         'dyf_nt_account_mapping')
            dyf_nt_account_mapping = Filter.apply(
                frame=dyf_nt_account_mapping,
                f=lambda x: x["nativetalk_user"] is not None and x["nativetalk_user"] != '')
            # ----------------------- datasource1 ----------------------- #

            join = Join.apply(dyf_native_talk, dyf_nt_account_mapping,
                              'username', 'nativetalk_user')
            if join.count() > 0:
                df_nativetalk = join.toDF()
                # 5 minutes expressed in hours
                df_nativetalk = df_nativetalk.withColumn('sogio', f.lit(0.083333))
                df_nativetalk = df_nativetalk.withColumn(
                    'id_time',
                    from_unixtime(unix_timestamp(df_nativetalk.learning_date, "yyyy-MM-dd"),
                                  "yyyyMMdd"))
                df_nativetalk = df_nativetalk.where("contact_id IS NOT NULL")

                data_nativetalk = DynamicFrame.fromDF(df_nativetalk, glueContext,
                                                      'data_nativetalk')
                data_nativetalk = data_nativetalk.resolveChoice(
                    specs=[('sogio', 'cast:float')])

                print('data_nativetalk----------')
                data_nativetalk.printSchema()

                # build the "fact_hieusuathoctap" table: number of study sessions
                # and study hours per student per day (id_time)
                df_hieusuathoctap = data_nativetalk.toDF()
                df_hieusuathoctap = df_hieusuathoctap.groupby('contact_id', 'id_time').agg(
                    f.sum('sogio'), f.count('contact_id'))
                df_hieusuathoctap = df_hieusuathoctap.withColumn('tu_hoc_type_id', f.lit(400))

                data_hieusuathoctap = DynamicFrame.fromDF(df_hieusuathoctap, glueContext,
                                                          'data_hieusuathoctap')
                data_hieusuathoctap = data_hieusuathoctap.resolveChoice(
                    specs=[('sum(sogio)', 'cast:double')])

                print('data_hieusuathoctap------------------------------------------')
                data_hieusuathoctap.printSchema()

                applymapping2 = ApplyMapping.apply(
                    frame=data_hieusuathoctap,
                    mappings=[("contact_id", "string", "contact_id", "string"),
                              ("id_time", 'string', 'id_time', 'bigint'),
                              ("count(contact_id)", 'long', 'soca', 'int'),
                              ("sum(sogio)", 'double', 'sogio', 'double'),
                              ("tu_hoc_type_id", 'int', "tu_hoc_type_id", "int")])
                resolvechoice2 = ResolveChoice.apply(frame=applymapping2, choice="make_cols",
                                                     transformation_ctx="resolvechoice2")
                dropnullfields2 = DropNullFields.apply(frame=resolvechoice2,
                                                       transformation_ctx="dropnullfields2")
                print('dropnullfields2 number: ', dropnullfields2.count())

                datasink2 = glueContext.write_dynamic_frame.from_jdbc_conf(
                    frame=dropnullfields2,
                    catalog_connection="glue_redshift",
                    connection_options={
                        "dbtable": "temp_staging_lich_su_tu_hoc_native_talk",
                        "database": "dts_odin",
                        "postactions": """INSERT INTO mapping_changed_status_student(user_id, change_status_date_id, to_status_id, measure1, measure2)
                                          SELECT um.user_id, hwb.id_time, 53, hwb.soca, round(hwb.sogio, 4)
                                          FROM temp_staging_lich_su_tu_hoc_native_talk hwb
                                          LEFT JOIN user_map um ON um.source_type = 1 AND um.source_id = hwb.contact_id;
                                          DROP TABLE IF EXISTS public.temp_staging_lich_su_tu_hoc_native_talk"""
                    },
                    redshift_tmp_dir="s3n://dts-odin/temp/tu-hoc/hwb/",
                    transformation_ctx="datasink2")

                # write the new high-water mark (max id) back to S3
                df_datasource = dyf_native_talk.toDF()
                flag = df_datasource.agg({"id": "max"}).collect()[0][0]
                print('flag: ', flag)
                flag_data = [flag]
                df = spark.createDataFrame(flag_data, "long").toDF('flag')
                df.write.parquet(
                    "s3a://dts-odin/flag/student_status/tu_hoc/tu_hoc_native_talk.parquet",
                    mode="overwrite")
                dy_cache.unpersist()
                dy_cache_2.unpersist()


if __name__ == '__main__':
    main()
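# Note: the mapping above relies on Spark's generated aggregate column names
# ("sum(sogio)", "count(contact_id)"), which is fragile. Aliasing the
# aggregates at the groupby site keeps the downstream ApplyMapping stable;
# a sketch (the mappings would then reference 'sogio' and 'soca' directly):
df_hieusuathoctap = df_hieusuathoctap.groupby('contact_id', 'id_time').agg(
    f.sum('sogio').alias('sogio'),
    f.count('contact_id').alias('soca'))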
JSON_FILE_NAME = TARGET_TABLE + "_" + SOURCE_SYSTEM    # ar_invc_hdr_f_must
STAGE_TABLE = ARGS['rs_stage_table']                   # ar_invc_hdr_f_stage_must
CTLG_CONNECTION = ARGS['glue_conn']                    # TestRedshift3
REDSHIFTDB = ARGS['rs_db']                             # usinnovationredshift
S3_BUCKET = ARGS['bkt_name']                           # "odp-us-innovation-raw"
MD5_COLUMN_SCD1 = TARGET_TABLE + "_md5_scd1"           # ar_invc_hdr_f_md5_scd1
TARGET_TABLE_COLUMNS = ARGS['target_cols']             # as per DDL (col1,col2,col3)
STAGE_TABLE_COLUMNS = ARGS['stage_cols']               # as per DDL (col1,col2,col3)
DBTABLE_STG = STAGE_DATABASE_NAME + "." + STAGE_TABLE
URL = ARGS["jdbc_url"]
IAM_ROLE = ARGS["iam_role"]

SC = SparkContext()
GLUECONTEXT = GlueContext(SC)
SPARK = GLUECONTEXT.spark_session
JOB = Job(GLUECONTEXT)
JOB.init(ARGS['JOB_NAME'], ARGS)
RUN_ID = ARGS['JOB_RUN_ID']
JOB_NAME = ARGS['JOB_NAME']
TEMPDIR = ARGS['TempDir']
SRC_NOTEMPTY = True

try:
    # @type: DataSource
    # @args: [database = "db_mrr_must",
    #         table_name = "billing",
    #         transformation_ctx = "billing_df"]
    # @return: DynamicFrame
    # @inputs: []
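    # The annotation above suggests the try block continues by reading the
    # billing table from the catalog. A plausible continuation, assuming the
    # names given in the annotation (db_mrr_must / billing / billing_df) and
    # that SRC_NOTEMPTY tracks whether the source returned rows:
    billing_df = GLUECONTEXT.create_dynamic_frame.from_catalog(
        database="db_mrr_must",
        table_name="billing",
        transformation_ctx="billing_df")
    SRC_NOTEMPTY = billing_df.count() > 0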
import boto3
from pyspark.context import SparkContext
from pyspark.sql import functions as f
from pyspark.sql.functions import udf, unix_timestamp, from_unixtime
from pyspark.sql.types import LongType, IntegerType
from awsglue.context import GlueContext
from awsglue.dynamicframe import DynamicFrame
from awsglue.transforms import *


def main():
    glueContext = GlueContext(SparkContext.getOrCreate())
    spark = glueContext.spark_session

    student_id_unavailable = 0
    package_endtime_unavailable = 99999999999
    package_starttime_unavailable = 0

    def do_check_endtime(val):
        if val is not None:
            return val
        return 4094592235

    check_endtime_null = udf(do_check_endtime, LongType())

    # ---------- dyf_student_contact ----------
    dyf_student_contact = glueContext.create_dynamic_frame.from_catalog(
        database="tig_advisor", table_name="student_contact")
    dyf_student_contact = dyf_student_contact.select_fields([
        '_key', 'contact_id', 'student_id', 'advisor_id', 'level_study',
        'time_lms_created', 'time_created'
    ])
    dyf_student_contact = dyf_student_contact.resolveChoice(
        specs=[('time_lms_created', 'cast:long')])
    dyf_student_contact = Filter.apply(
        frame=dyf_student_contact,
        f=lambda x: x['student_id'] is not None and x['contact_id'] is not None
                    and x['time_lms_created'] is not None)

    dyf_student_level = glueContext.create_dynamic_frame.from_catalog(
        database="tig_advisor", table_name="log_student_level_study")
    dyf_student_level = dyf_student_level.select_fields(
        ['_key', 'contact_id', 'level_current', 'level_modified', 'time_created']) \
        .rename_field('contact_id', 'contact_id1') \
        .rename_field('time_created', 'time_level_created')

    # try:
    #     df_flag = spark.read.parquet("s3://dtsodin/flag/flag_log_student_level.parquet")
    #     max_key = df_flag.collect()[0]['flag']
    #     print("max_key: ", max_key)
    #     # only keep records with _key greater than the saved max_key; don't load everything
    #     dyf_student_level = Filter.apply(frame=dyf_student_level, f=lambda x: x["_key"] > max_key)
    # except:
    #     print('read flag file error ')

    if dyf_student_level.count() > 0:
        dyf_mapping_class_lms = glueContext.create_dynamic_frame.from_catalog(
            database="nvn_knowledge", table_name="mapping_class_lms")
        dyf_mapping_class_lms = dyf_mapping_class_lms.select_fields(
            ['level_lms', 'level_list_master'])

        dyf_learning_object_class = glueContext.create_dynamic_frame.from_catalog(
            database="nvn_knowledge", table_name="learning_object_class")
        dyf_learning_object_class = dyf_learning_object_class.select_fields(
            ['class_id', 'class_name'])

        dyf_student_contact_level = Join.apply(dyf_student_contact, dyf_student_level,
                                               'contact_id', 'contact_id1')
        df_student_contact_level = dyf_student_contact_level.toDF()
        df_student_contact_level = df_student_contact_level \
            .select('student_id', 'level_current', 'level_modified',
                    'time_created', 'time_level_created')

        # cast before aliasing so the column keeps the name "count"
        df_count_contact_level = df_student_contact_level \
            .groupby('student_id') \
            .agg(f.count('time_level_created').cast('long').alias("count"))
        dyf_count_contact_level = DynamicFrame.fromDF(
            df_count_contact_level, glueContext, "dyf_count_contact_level")
        dyf_count_contact_level = Filter.apply(frame=dyf_count_contact_level,
                                               f=lambda x: x['count'] == 2)
        if dyf_count_contact_level.count() > 0:
            df_count_contact_level = dyf_count_contact_level.toDF()
            print(df_count_contact_level.count())
            df_count_contact_level = df_count_contact_level.select('student_id')
            list_filer = [list(row) for row in df_count_contact_level.collect()]
        else:
            list_filer = ['']

        dyf_student_contact_level = DynamicFrame.fromDF(
            df_student_contact_level, glueContext, "dyf_student_contact_level")
        dyf_student_contact_level = Filter.apply(
            frame=dyf_student_contact_level,
            f=lambda x: x['student_id'] not in list_filer)  # or x['student_id'] == '29439'
        dyf_student_contact_level.count()
        dyf_student_contact_level.printSchema()

        df_student_contact_level = dyf_student_contact_level.toDF()
        df_contact_level_first = df_student_contact_level.groupby('student_id').agg(
            f.min("time_level_created").alias("min_time_level_created")) \
            .withColumnRenamed('student_id', 'min_student_id')
        df_contact_level_last = df_student_contact_level.groupby('student_id').agg(
            f.max("time_level_created").alias("max_time_level_created")) \
            .withColumnRenamed('student_id', 'max_student_id')

        df_contact_level_first = df_contact_level_first.join(
            df_student_contact_level,
            (df_contact_level_first.min_student_id == df_student_contact_level.student_id) &
            (df_contact_level_first.min_time_level_created == df_student_contact_level.time_level_created),
            "left")
        df_contact_level_last = df_contact_level_last.join(
            df_student_contact_level,
            (df_contact_level_last.max_student_id == df_student_contact_level.student_id) &
            (df_contact_level_last.max_time_level_created == df_student_contact_level.time_level_created),
            "left")

        df_contact_level_last = df_contact_level_last \
            .select('student_id', 'level_modified', 'time_level_created')
        df_contact_level_last = df_contact_level_last \
            .withColumnRenamed('time_level_created', 'start_date') \
            .withColumnRenamed('level_modified', 'level_study') \
            .withColumn('end_date', f.lit(4094592235).cast('long'))
        df_contact_level_last = df_contact_level_last \
            .select('student_id', 'level_study', 'start_date', 'end_date')

        df_contact_level_first = df_contact_level_first \
            .select('student_id', 'level_current', 'time_created', 'time_level_created')
        df_contact_level_first = df_contact_level_first \
            .withColumnRenamed('level_current', 'level_study') \
            .withColumnRenamed('time_created', 'start_date') \
            .withColumnRenamed('time_level_created', 'end_date')
        df_contact_level_first = df_contact_level_first \
            .select('student_id', 'level_study', 'start_date', 'end_date')

        df_contact_level_first.show()
        df_contact_level_last.show()
        print("END FIRST_LAST")

        df_student_contact_level01 = df_student_contact_level \
            .select('student_id', 'level_current', 'level_modified', 'time_level_created')
        print("df_student_contact_level")
        df_student_contact_level01.show(20)

        df_student_contact_level02 = df_student_contact_level01
        df_student_contact_level02 = df_student_contact_level02 \
            .withColumnRenamed('student_id', 'student_id_temp') \
            .withColumnRenamed('level_current', 'level_current_temp') \
            .withColumnRenamed('level_modified', 'level_modified_temp') \
            .withColumnRenamed('time_level_created', 'time_level_created_temp')
        df_student_contact_level02.show(20)

        df_join_student_contact_level = df_student_contact_level01.join(
            df_student_contact_level02,
            (df_student_contact_level01.student_id == df_student_contact_level02.student_id_temp) &
            (df_student_contact_level01.level_current == df_student_contact_level02.level_modified_temp) &
            (df_student_contact_level01.time_level_created > df_student_contact_level02.time_level_created_temp),
            "left")
        df_join_student_contact_level.show(100)

        df_join_student_contact_level = df_join_student_contact_level \
            .groupby('student_id', 'level_current', 'time_level_created_temp') \
            .agg(f.min("time_level_created").alias("time_level_created"))
        df_join_student_contact_level = df_join_student_contact_level \
            .withColumnRenamed('time_level_created_temp', 'start_date') \
            .withColumnRenamed('time_level_created', 'end_date') \
            .withColumnRenamed('level_current', 'level_study')
        df_join_student_contact_level = df_join_student_contact_level.where(
            'start_date is not null')
        df_join_student_contact_level.count()
        df_join_student_contact_level.show()

        df_union_first_and_middle_contact = df_contact_level_first.union(
            df_join_student_contact_level)
        df_union_all_contact = df_union_first_and_middle_contact.union(
            df_contact_level_last)
        print(df_union_all_contact.count())
        df_union_all_contact.printSchema()
        df_union_all_contact.show()

        df_union_all_contact = df_union_all_contact.withColumn(
            "user_id", f.lit(None).cast('long'))
        dyf_student_contact_level = DynamicFrame.fromDF(
            df_union_all_contact, glueContext, "dyf_student_contact_level")

        dyf_join_all0 = Join.apply(dyf_mapping_class_lms, dyf_student_contact_level,
                                   "level_lms", "level_study")
        dyf_join_all = Join.apply(dyf_join_all0, dyf_learning_object_class,
                                  "level_list_master", "class_name")

        df_join_all = dyf_join_all.toDF()
        print("df_join_all ", df_join_all.count())
        df_join_all = df_join_all.dropDuplicates()
        dyf_join_all = DynamicFrame.fromDF(df_join_all, glueContext, "dyf_join_all")
        print("dyf_join_all ", dyf_join_all.count())
        dyf_join_all.printSchema()
        dyf_join_all.show(10)

        applymapping = ApplyMapping.apply(
            frame=dyf_join_all,
            mappings=[("student_id", "string", "student_id", "string"),
                      ("user_id", "long", "user_id", "long"),
                      ("class_id", "long", "class_id", "long"),
                      ("start_date", "int", "start_date", "long"),
                      ("end_date", 'long', 'end_date', 'long'),
                      ("level_study", 'string', 'level_study', 'string')])
        resolvechoice = ResolveChoice.apply(
            frame=applymapping, choice="make_cols",
            transformation_ctx="resolvechoice2")
        dropnullfields = DropNullFields.apply(
            frame=resolvechoice, transformation_ctx="dropnullfields")

        datasink5 = glueContext.write_dynamic_frame.from_jdbc_conf(
            frame=dropnullfields,
            catalog_connection="glue_redshift",
            connection_options={
                "dbtable": "mapping_class_student",
                "database": "dts_odin"
            },
            redshift_tmp_dir="s3n://dtsodin/temp1/mapping_class_student/",
            transformation_ctx="datasink5")

        print('START WRITE TO S3-------------------------')
        s3 = boto3.resource('s3')
        bucket = s3.Bucket('dtsodin')
        bucket.objects.filter(Prefix="student_behavior/student_level/").delete()
        s3 = boto3.client('s3')
        bucket_name = "dtsodin"
        directory_name = "student_behavior/student_level/"  # the S3 prefix ("folder") to recreate
        s3.put_object(Bucket=bucket_name, Key=directory_name)

        log_student_level = glueContext.create_dynamic_frame.from_options(
            connection_type="redshift",
            connection_options={
                "url": "jdbc:redshift://datashine-dev.c4wxydftpsto.ap-southeast-1.redshift.amazonaws.com:5439/dts_odin",
                "user": "******",
                "password": "******",
                "dbtable": "mapping_class_student",
                "redshiftTmpDir": "s3n://dtsodin/temp1/mapping_class_student/"
            })
        datasink6 = glueContext.write_dynamic_frame.from_options(
            frame=log_student_level,
            connection_type="s3",
            connection_options={
                "path": "s3://dtsodin/student_behavior/student_level/"
            },
            format="parquet",
            transformation_ctx="datasink6")
        print('END WRITE TO S3-------------------------')

        # write the flag: take the max _key in the data source
        datasource = dyf_student_level.toDF()
        flag = datasource.agg({"_key": "max"}).collect()[0][0]
        flag_data = [flag]
        df = spark.createDataFrame(flag_data, "long").toDF('flag')
        # overwrite the _key checkpoint in S3
        df.write.parquet("s3a://dtsodin/flag/flag_log_student_level.parquet",
                         mode="overwrite")

    ##################################################################
    dyf_student_advisor = glueContext.create_dynamic_frame.from_catalog(
        database="tig_advisor", table_name="log_change_assignment_advisor")
    dyf_student_advisor = dyf_student_advisor.select_fields(
        ['_key', 'id', 'contact_id', 'advisor_id_old', 'advisor_id_new', 'created_at']) \
        .rename_field('contact_id', 'contact_id1')

    # try:
    #     df_flag = spark.read.parquet("s3://dtsodin/flag/flag_log_student_advisor.parquet")
    #     max_key = df_flag.collect()[0]['flag']
    #     print("max_key: ", max_key)
    #     # only keep records with _key greater than the saved max_key; don't load everything
    #     dyf_student_advisor = Filter.apply(frame=dyf_student_advisor, f=lambda x: x["_key"] > max_key)
    # except:
    #     print('read flag file error ')

    if dyf_student_advisor.count() > 0:
        df_student_advisor = dyf_student_advisor.toDF()
        # df_student_advisor.printSchema()
        # df_student_advisor.show()
        df_student_advisor = df_student_advisor.withColumn(
            'created_at_1',
            unix_timestamp(df_student_advisor.created_at,
                           "yyyy-MM-dd HH:mm:ss").cast(IntegerType()))

        df_student_contact = dyf_student_contact.toDF()
        df_student_contact_advisor = df_student_contact.join(
            df_student_advisor,
            df_student_contact.contact_id == df_student_advisor.contact_id1)
        dyf_student_contact_advisor = DynamicFrame.fromDF(
            df_student_contact_advisor, glueContext, "dyf_student_contact_advisor")
        dyf_student_contact_advisor = dyf_student_contact_advisor.select_fields([
            'student_id', 'advisor_id', 'advisor_id_old', 'advisor_id_new',
            'created_at_1', 'time_created'
        ])

        df_student_contact_advisor = dyf_student_contact_advisor.toDF()
        df_student_contact_advisor = df_student_contact_advisor.dropDuplicates()

        df_advisor_temp = df_student_contact_advisor
        df_advisor_temp = df_advisor_temp \
            .withColumnRenamed('student_id', 'student_id_temp') \
            .withColumnRenamed('advisor_id', 'advisor_id_temp') \
            .withColumnRenamed('advisor_id_old', 'advisor_id_old_temp') \
            .withColumnRenamed('advisor_id_new', 'advisor_id_new_temp') \
            .withColumnRenamed('created_at_1', 'created_at_temp') \
            .withColumnRenamed('time_created', 'time_created_temp')

        df_join_data = df_student_contact_advisor.join(
            df_advisor_temp,
            (df_student_contact_advisor.student_id == df_advisor_temp.student_id_temp) &
            (df_student_contact_advisor.advisor_id_new == df_advisor_temp.advisor_id_old_temp) &
            (df_student_contact_advisor.created_at_1 <= df_advisor_temp.created_at_temp),
            "left")
        df_join_data = df_join_data.groupby('student_id', 'advisor_id_new', 'created_at_1') \
            .agg(f.min("created_at_temp").alias("created_at_temp"))
        df_join_data = df_join_data.withColumn(
            "end_time", check_endtime_null(df_join_data.created_at_temp))
        df_join_data = df_join_data.dropDuplicates()

        dyf_student_contact_advisor = DynamicFrame.fromDF(
            df_join_data, glueContext, "dyf_student_contact_advisor")
        dyf_student_contact_advisor.printSchema()

        applymapping = ApplyMapping.apply(
            frame=dyf_student_contact_advisor,
            mappings=[("student_id", "string", "student_id", "string"),
                      ("advisor_id_new", "string", "advisor_id", "string"),
                      ("created_at_1", 'int', 'start_time', 'long'),
                      ("end_time", 'long', 'end_time', 'long')])
        resolvechoice = ResolveChoice.apply(
            frame=applymapping, choice="make_cols",
            transformation_ctx="resolvechoice2")
        dropnullfields = DropNullFields.apply(
            frame=resolvechoice, transformation_ctx="dropnullfields")

        datasink5 = glueContext.write_dynamic_frame.from_jdbc_conf(
            frame=dropnullfields,
            catalog_connection="glue_redshift",
            connection_options={
                "dbtable": "fact_log_change_advisor",
                "database": "dts_odin"
            },
            redshift_tmp_dir="s3n://dtsodin/temp1/fact_log_change_advisor/",
            transformation_ctx="datasink5")

        print('START WRITE TO S3-------------------------')
        s3 = boto3.resource('s3')
        bucket = s3.Bucket('dtsodin')
        bucket.objects.filter(Prefix="student_behavior/student_advisor/").delete()
        s3 = boto3.client('s3')
        bucket_name = "dtsodin"
        directory_name = "student_behavior/student_advisor/"  # the S3 prefix ("folder") to recreate
        s3.put_object(Bucket=bucket_name, Key=directory_name)

        log_student_advisor = glueContext.create_dynamic_frame.from_options(
            connection_type="redshift",
            connection_options={
                "url": "jdbc:redshift://datashine-dev.c4wxydftpsto.ap-southeast-1.redshift.amazonaws.com:5439/dts_odin",
                "user": "******",
                "password": "******",
                "dbtable": "fact_log_change_advisor",
                "redshiftTmpDir": "s3n://dtsodin/temp1/fact_log_change_advisor/"
            })
        datasink6 = glueContext.write_dynamic_frame.from_options(
            frame=log_student_advisor,
            connection_type="s3",
            connection_options={
                "path": "s3://dtsodin/student_behavior/student_advisor/"
            },
            format="parquet",
            transformation_ctx="datasink6")
        print('END WRITE TO S3-------------------------')

        # write the flag: take the max _key in the data source
        datasource = dyf_student_advisor.toDF()
        flag = datasource.agg({"_key": "max"}).collect()[0][0]
        flag_data = [flag]
        df = spark.createDataFrame(flag_data, "long").toDF('flag')
        # overwrite the _key checkpoint in S3
        df.write.parquet("s3a://dtsodin/flag/flag_log_student_advisor.parquet",
                         mode="overwrite")

    ##################################################################
    dyf_student_product = glueContext.create_dynamic_frame.from_catalog(
        database="tig_market", table_name="tpe_enduser_used_product")
    dyf_student_product = dyf_student_product.select_fields([
        '_key', 'id', 'contact_id', 'product_id', 'starttime', 'endtime',
        'timecreated', 'balance_used', 'status', 'timemodified'
    ]).rename_field('contact_id', 'contact_id1')
    dyf_student_product = dyf_student_product.resolveChoice(
        specs=[('_key', 'cast:long')])

    try:
        # read the flag checkpoint from S3
        df_flag = spark.read.parquet(
            "s3://dtsodin/flag/flag_tpe_enduser_used_product.parquet")
        start_read = df_flag.collect()[0]['flag']
        print('read from index: ', start_read)
        # compare the datasource _key with the flag; keep only keys > flag
        # dyf_student_product = Filter.apply(frame=dyf_student_product, f=lambda x: x['_key'] > start_read)
    except:
        print('read flag file error ')

    dyf_student_product = Filter.apply(
        frame=dyf_student_product,
        f=lambda x: x["product_id"] is not None and x["product_id"] != ''
                    and x["contact_id1"] is not None and x["contact_id1"] != '')

    if dyf_student_product.count() > 0:
        dyf_product_details = glueContext.create_dynamic_frame.from_catalog(
            database="tig_market", table_name="tpe_invoice_product_details")
        dyf_product_details = dyf_product_details.select_fields(
            ['id', 'cat_code', 'package_cost', 'actual_cost', 'package_time']
        ).rename_field('id', 'id_goi')

        # filter data
        # df_goi = datasource1.toDF()
        # df_goi = df_goi.where("id_goi <> '' and id_goi is not null")
        # data_goi = DynamicFrame.fromDF(df_goi, glueContext, "data_goi")
        dyf_product_details = Filter.apply(
            frame=dyf_product_details,
            f=lambda x: x["id_goi"] is not None and x["id_goi"] != '')

        # df_lsmua = dyf_student_product.toDF()
        # df_lsmua = df_lsmua.where("product_id is not null and product_id <> ''")
        # df_lsmua = df_lsmua.where("contact_id is not null and contact_id <> ''")
        # data_lsmua = DynamicFrame.fromDF(df_lsmua, glueContext, "data_lsmua")

        # map purchase history to the package detail info
        dyf_student_package = Join.apply(dyf_student_product, dyf_product_details,
                                         'product_id', 'id_goi')
        dyf_join_data = Join.apply(dyf_student_package, dyf_student_contact,
                                   'contact_id1', 'contact_id')
        print("dyf_student_package count1: ", dyf_join_data.count())

        df_join_data = dyf_join_data.toDF()
        # drop duplicate rows
        # df_join_data = df_join_data.dropDuplicates()
        df_join_data = df_join_data.groupby('id', 'student_id').agg(
            f.first('cat_code').alias('cat_code'),
            f.first('starttime').alias('starttime'),
            f.first('endtime').alias('endtime'),
            f.first('timecreated').alias('timecreated'))
        dyf_student_package = DynamicFrame.fromDF(df_join_data, glueContext,
                                                  "dyf_student_package")

        # convert data
        # df_student_package = dyf_student_package.toDF()
        # df_student_package = df_student_package.withColumn('ngay_kich_hoat', from_unixtime(df_student_package.starttime)) \
        #     .withColumn('ngay_het_han', from_unixtime(df_student_package.endtime)) \
        #     .withColumn('ngay_mua', from_unixtime(df_student_package.timecreated)) \
        #     .withColumn('id_time', from_unixtime(df_student_package.starttime, "yyyyMMdd")) \
        #     .withColumn('ngay_thay_doi', from_unixtime(df_student_package.timemodified))
        # df_student_package = df_student_package.dropDuplicates(['contact_id', 'product_id'])
        # data = DynamicFrame.fromDF(df_student_package, glueContext, "data")
        # df_student_package.printSchema()
        # df_student_package.show()

        # choose the fields and data types to push into the DB
        applyMapping = ApplyMapping.apply(
            frame=dyf_student_package,
            mappings=[("id", "string", "id", "string"),
                      ("student_id", "string", "student_id", "string"),
                      ("cat_code", 'string', 'package_code', 'string'),
                      ("starttime", "int", "start_time", "int"),
                      ("endtime", "int", "end_time", "int"),
                      ("timecreated", "int", "timecreated", "int")])
        resolvechoice = ResolveChoice.apply(
            frame=applyMapping, choice="make_cols",
            transformation_ctx="resolvechoice2")
        dropnullfields = DropNullFields.apply(
            frame=resolvechoice, transformation_ctx="dropnullfields3")
        dropnullfields.show(10)

        # write data to the DB
        datasink4 = glueContext.write_dynamic_frame.from_jdbc_conf(
            frame=dropnullfields,
            catalog_connection="glue_redshift",
            connection_options={
                "dbtable": "fact_student_package",
                "database": "dts_odin"
            },
            redshift_tmp_dir="s3n://dtsodin/temp1/fact_student_package/",
            transformation_ctx="datasink4")

        print('START WRITE TO S3-------------------------')
        s3 = boto3.resource('s3')
        bucket = s3.Bucket('dtsodin')
        bucket.objects.filter(Prefix="student_behavior/student_package/").delete()
        s3 = boto3.client('s3')
        bucket_name = "dtsodin"
        directory_name = "student_behavior/student_package/"
        s3.put_object(Bucket=bucket_name, Key=directory_name)

        log_student_package = glueContext.create_dynamic_frame.from_options(
            connection_type="redshift",
            connection_options={
                "url": "jdbc:redshift://datashine-dev.c4wxydftpsto.ap-southeast-1.redshift.amazonaws.com:5439/dts_odin",
                "user": "******",
                "password": "******",
                "dbtable": "fact_student_package",
                "redshiftTmpDir": "s3n://dtsodin/temp1/fact_student_package/"
            })
        datasink6 = glueContext.write_dynamic_frame.from_options(
            frame=log_student_package,
            connection_type="s3",
            connection_options={
                "path": "s3://dtsodin/student_behavior/student_package/"
            },
            format="parquet",
            transformation_ctx="datasink6")
        print('END WRITE TO S3-------------------------')

        # write the flag: take the max _key in the data source
        datasource = dyf_student_product.toDF()
        flag = datasource.agg({"_key": "max"}).collect()[0][0]
        flag_data = [flag]
        df = spark.createDataFrame(flag_data, "long").toDF('flag')
        # overwrite the _key checkpoint in S3
        df.write.parquet(
            "s3a://dtsodin/flag/flag_tpe_enduser_used_product_log.parquet",
            mode="overwrite")

    ##################################################################
    def do_check_null(val):
        if val is not None:
            return val
        return 4094592235

    check_data_null = udf(do_check_null, LongType())

    dyf_tpe_enduser_used_product_history = glueContext.create_dynamic_frame.from_catalog(
        database="tig_market", table_name="tpe_enduser_used_product_history")
    dyf_tpe_enduser_used_product_history = dyf_tpe_enduser_used_product_history.select_fields(
        ['_key', 'contact_id', 'status_old', 'status_new', 'timecreated'])
    dyf_tpe_enduser_used_product_history = dyf_tpe_enduser_used_product_history.resolveChoice(
        specs=[('_key', 'cast:long')])

    try:
        # read the flag checkpoint from S3
        df_flag = spark.read.parquet(
            "s3a://dtsodin/flag/flag_tpe_enduser_used_product_history.parquet")
        start_read = df_flag.collect()[0]['flag']
        print('read from index: ', start_read)
        # compare the datasource _key with the flag; keep only keys > flag
        # dyf_tpe_enduser_used_product_history = Filter.apply(frame=dyf_tpe_enduser_used_product_history,
        #                                                     f=lambda x: x['_key'] > start_read)
    except:
        print('read flag file error ')

    dyf_tpe_enduser_used_product_history = Filter.apply(
        frame=dyf_tpe_enduser_used_product_history,
        f=lambda x: x["contact_id"] is not None and x["contact_id"] != '')

    if dyf_tpe_enduser_used_product_history.count() > 0:
        df_tpe_enduser_used_product_history01 = dyf_tpe_enduser_used_product_history.toDF()
        df_tpe_enduser_used_product_history02 = dyf_tpe_enduser_used_product_history.toDF()

        df_tpe_enduser_used_product_history01 = df_tpe_enduser_used_product_history01 \
            .withColumnRenamed('timecreated', 'start_date')
        df_tpe_enduser_used_product_history02 = df_tpe_enduser_used_product_history02 \
            .withColumnRenamed('timecreated', 'timecreated02') \
            .withColumnRenamed('status_old', 'status_old02') \
            .withColumnRenamed('status_new', 'status_new02') \
            .withColumnRenamed('contact_id', 'contact_id02')

        df_tpe_enduser_used_product_history_join = df_tpe_enduser_used_product_history01.join(
            df_tpe_enduser_used_product_history02,
            (df_tpe_enduser_used_product_history01['contact_id'] == df_tpe_enduser_used_product_history02['contact_id02']) &
            (df_tpe_enduser_used_product_history01['status_new'] == df_tpe_enduser_used_product_history02['status_old02']) &
            (df_tpe_enduser_used_product_history01['start_date'] <= df_tpe_enduser_used_product_history02['timecreated02']),
            "left")
        df_tpe_enduser_used_product_history_join = df_tpe_enduser_used_product_history_join \
            .withColumn("end_date",
                        check_data_null(df_tpe_enduser_used_product_history_join.timecreated02))

        df_tpe_enduser_used_product_history_join.printSchema()
        print(df_tpe_enduser_used_product_history_join.count())
        df_tpe_enduser_used_product_history_join.show(10)

        dyf_tpe_enduser_product_history = DynamicFrame.fromDF(
            df_tpe_enduser_used_product_history_join, glueContext,
            "dyf_tpe_enduser_product_history")
        dyf_tpe_enduser_product_history = dyf_tpe_enduser_product_history.select_fields([
            'contact_id', 'status_old', 'status_new', 'start_date', 'end_date'
        ])
        df_tpe_enduser_used_product_history = dyf_tpe_enduser_product_history.toDF()

        df_tpe_enduser_used_product_history_join.printSchema()
        print(df_tpe_enduser_used_product_history_join.count())
        df_tpe_enduser_used_product_history_join.show(10)

        dyf_tpe_enduser_product_history = DynamicFrame.fromDF(
            df_tpe_enduser_used_product_history, glueContext,
            "dyf_tpe_enduser_product_history")

        # choose the fields and data types to push into the DB
        applyMapping = ApplyMapping.apply(
            frame=dyf_tpe_enduser_product_history,
            mappings=[("contact_id", "string", "contact_id", "string"),
                      ("status_new", "string", "status_code", "string"),
                      ("status_old", "string", "last_status_code", "string"),
                      ("start_date", "int", "start_date", "long"),
                      ("end_date", "long", "end_date", "long")])
        resolvechoice = ResolveChoice.apply(
            frame=applyMapping, choice="make_cols",
            transformation_ctx="resolvechoice2")
        dropnullfields = DropNullFields.apply(
            frame=resolvechoice, transformation_ctx="dropnullfields")

        # write data to the DB
        datasink4 = glueContext.write_dynamic_frame.from_jdbc_conf(
            frame=dropnullfields,
            catalog_connection="glue_redshift",
            connection_options={
                "dbtable": "fact_log_student_status",
                "database": "dts_odin",
            },
            redshift_tmp_dir="s3n://dtsodin/temp1/tpe_enduser_used_product_history/",
            transformation_ctx="datasink4")

        print('START WRITE TO S3-------------------------')
        s3 = boto3.resource('s3')
        bucket = s3.Bucket('dtsodin')
        bucket.objects.filter(Prefix="student_behavior/student_status/").delete()
        s3 = boto3.client('s3')
        bucket_name = "dtsodin"
        directory_name = "student_behavior/student_status/"
        s3.put_object(Bucket=bucket_name, Key=directory_name)

        log_student_status = glueContext.create_dynamic_frame.from_options(
            connection_type="redshift",
            connection_options={
                "url": "jdbc:redshift://datashine-dev.c4wxydftpsto.ap-southeast-1.redshift.amazonaws.com:5439/dts_odin",
                "user": "******",
                "password": "******",
                "dbtable": "fact_log_student_status",
                "redshiftTmpDir": "s3n://dtsodin/temp1/fact_log_student_status/"
            })
        datasink6 = glueContext.write_dynamic_frame.from_options(
            frame=log_student_status,
            connection_type="s3",
            connection_options={
                "path": "s3://dtsodin/student_behavior/student_status/"
            },
            format="parquet",
            transformation_ctx="datasink6")
        print('END WRITE TO S3-------------------------')

        # write the flag: take the max _key in the data source
        datasource = dyf_tpe_enduser_used_product_history.toDF()
        flag = datasource.agg({"_key": "max"}).collect()[0][0]
        flag_data = [flag]
        df = spark.createDataFrame(flag_data, "long").toDF('flag')
        # overwrite the _key checkpoint in S3
        df.write.parquet(
            "s3a://dtsodin/flag/flag_tpe_enduser_used_product_history.parquet",
            mode="overwrite")


if __name__ == '__main__':
    main()
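# The script above repeats the same "read table back from Redshift, dump it to
# S3 as Parquet" block three times. A sketch of factoring it into one helper,
# keeping the connection settings used above (the masked credentials stay as
# placeholders; the helper name is illustrative):
def export_redshift_table_to_s3(glueContext, dbtable, tmp_dir, s3_path, ctx):
    # read the freshly loaded table back out of Redshift
    dyf = glueContext.create_dynamic_frame.from_options(
        connection_type="redshift",
        connection_options={
            "url": "jdbc:redshift://datashine-dev.c4wxydftpsto.ap-southeast-1.redshift.amazonaws.com:5439/dts_odin",
            "user": "******",
            "password": "******",
            "dbtable": dbtable,
            "redshiftTmpDir": tmp_dir
        })
    # mirror it to the S3 data lake as Parquet
    glueContext.write_dynamic_frame.from_options(
        frame=dyf,
        connection_type="s3",
        connection_options={"path": s3_path},
        format="parquet",
        transformation_ctx=ctx)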
from pyspark.context import SparkContext
from pyspark.sql import functions as f
from pyspark.sql.functions import from_unixtime
from awsglue.context import GlueContext
from awsglue.dynamicframe import DynamicFrame
from awsglue.transforms import *


def main():
    glueContext = GlueContext(SparkContext.getOrCreate())
    spark = glueContext.spark_session

    mdl_tpe_enduser_used_product_history = glueContext.create_dynamic_frame.from_catalog(
        database="tig_market", table_name="tpe_enduser_used_product_history")

    # handle the case where start_read is null
    try:
        # read the flag checkpoint from S3
        df_flag = spark.read.parquet("s3a://dts-odin/flag/fact_flag_expired.parquet")
        start_read = df_flag.collect()[0]['flag']
        print('read from index: ', start_read)
        # compare the datasource _key with the flag; keep only keys > flag
        mdl_tpe_enduser_used_product_history = Filter.apply(
            frame=mdl_tpe_enduser_used_product_history,
            f=lambda x: x['_key'] > start_read)
    except:
        print('read flag file error ')

    print('the number of new contacts: ', mdl_tpe_enduser_used_product_history.count())

    mdl_tpe_enduser_used_product_history = mdl_tpe_enduser_used_product_history.select_fields([
        '_key', 'id', 'used_product_id', 'contact_id', 'status_new', 'status_old', 'timecreated'
    ])
    mdl_tpe_enduser_used_product_history = mdl_tpe_enduser_used_product_history.resolveChoice(
        specs=[('_key', 'cast:long')])

    # df_flag = spark.read.parquet("s3a://dts-odin/flag/flag_LS_A3.parquet")
    # max_key = df_flag.collect()[0]['flag']
    # mdl_tpe_enduser_used_product_history = Filter.apply(frame=mdl_tpe_enduser_used_product_history,
    #                                                     f=lambda x: x["_key"] > max_key)

    if mdl_tpe_enduser_used_product_history.count() > 0:
        # "EXPRIED" covers a misspelled status value present in the source data
        mdl_tpe_enduser_used_product_history = Filter.apply(
            frame=mdl_tpe_enduser_used_product_history,
            f=lambda x: x["timecreated"] is not None and x["contact_id"] is not None
                        and x["used_product_id"] is not None
                        and x["status_old"] == "ACTIVED"
                        and x["status_new"] in ["EXPIRED", "EXPRIED"])

        # print(mdl_tpe_enduser_used_product_history.count())
        mdl_tpe_enduser_used_product_history = mdl_tpe_enduser_used_product_history.resolveChoice(
            specs=[('timecreated', 'cast:long')])

        df_mdl_tpe_enduser_used_product_history = mdl_tpe_enduser_used_product_history.toDF()
        df_mdl_tpe_enduser_used_product_history = df_mdl_tpe_enduser_used_product_history.withColumn(
            'change_status_date_id',
            from_unixtime(df_mdl_tpe_enduser_used_product_history['timecreated'], "yyyyMMdd")) \
            .withColumn('to_status_id', f.lit(108)) \
            .withColumn('timestamp1',
                        df_mdl_tpe_enduser_used_product_history['timecreated'] * f.lit(1000))

        # df_mdl_tpe_enduser_used_product_history = df_mdl_tpe_enduser_used_product_history.select(
        #     'used_product_id', 'contact_id', 'ngay_kich_hoat', 'id').withColumnRenamed(
        #     'used_product_id', 'id_product_buy')

        data_mdl_tpe_enduser_used_product_history = DynamicFrame.fromDF(
            df_mdl_tpe_enduser_used_product_history, glueContext,
            "data_mdl_tpe_enduser_used_product_history")
        data_mdl_tpe_enduser_used_product_history.printSchema()
        data_mdl_tpe_enduser_used_product_history.show(3)

        applymapping1 = ApplyMapping.apply(
            frame=data_mdl_tpe_enduser_used_product_history,
            mappings=[("contact_id", "string", "contact_id", "string"),
                      ("change_status_date_id", "string", "change_status_date_id", "long"),
                      ("timestamp1", "long", "timestamp1", "timestamp"),
                      ('to_status_id', 'int', 'to_status_id', 'long')])
        resolvechoice2 = ResolveChoice.apply(
            frame=applymapping1, choice="make_cols",
            transformation_ctx="resolvechoice2")
        dropnullfields3 = DropNullFields.apply(
            frame=resolvechoice2, transformation_ctx="dropnullfields3")

        datasink4 = glueContext.write_dynamic_frame.from_jdbc_conf(
            frame=dropnullfields3,
            catalog_connection="glue_redshift",
            connection_options={
                "dbtable": "mapping_changed_status_student",
                "database": "dts_odin",
                "postactions": """UPDATE mapping_changed_status_student
                                  SET user_id = (SELECT user_id FROM user_map
                                                 WHERE source_type = 1
                                                   AND source_id = mapping_changed_status_student.contact_id
                                                 LIMIT 1)
                                  WHERE user_id IS NULL AND to_status_id = 108"""
            },
            redshift_tmp_dir="s3n://datashine-dwh/temp1/",
            transformation_ctx="datasink4")

        # write the flag: take the max _key in the data source
        datasourceTmp = mdl_tpe_enduser_used_product_history.toDF()
        flag = datasourceTmp.agg({"_key": "max"}).collect()[0][0]
        flag_data = [flag]
        df = spark.createDataFrame(flag_data, "long").toDF('flag')
        # overwrite the _key checkpoint in S3
        df.write.parquet("s3a://dts-odin/flag/fact_flag_expired.parquet",
                         mode="overwrite")


if __name__ == '__main__':
    main()
def main():
    def getTimestampType(time_modified):
        time_modified = long(time_modified)
        time_modified = datetime.fromtimestamp(time_modified)
        time_modified = str(time_modified)
        return time_modified

    getTimestampType = udf(getTimestampType, StringType())

    glueContext = GlueContext(SparkContext.getOrCreate())
    spark = glueContext.spark_session

    dyf_quiz_atm = glueContext.create_dynamic_frame.from_catalog(
        database="moodle", table_name="top_quiz_attempts")
    dyf_quiz_atm = dyf_quiz_atm.select_fields(
        ['_key', 'id', 'userid', 'timemodified', 'uniqueid', 'quiz']).rename_field('id', 'attempt_id')
    dyf_quiz_atm = dyf_quiz_atm.resolveChoice(specs=[('_key', 'cast:long')])

    dyf_question_atm = glueContext.create_dynamic_frame.from_catalog(
        database="moodle", table_name="top_question_attempts")
    dyf_question_atm = dyf_question_atm.select_fields([
        'id', 'rightanswer', 'responsesummary', 'timemodified', 'maxmark',
        'questionusageid', 'questionid'
    ]).rename_field('id', 'attempt_step_id')
    # dyf_question_atm = dyf_question_atm.resolveChoice(specs=[('timemodified', 'cast:string')])

    dyf_quiz = glueContext.create_dynamic_frame.from_catalog(
        database="moodle", table_name="top_quiz")
    dyf_quiz = dyf_quiz.select_fields(['name', 'id']).rename_field('id', 'quiz_id')

    dyf_question_steps = glueContext.create_dynamic_frame.from_catalog(
        database="moodle", table_name="top_question_attempt_steps")
    dyf_question_steps = dyf_question_steps.select_fields(
        ['state', 'questionattemptid'])

    dyf_result_ai = glueContext.create_dynamic_frame.from_catalog(
        database="moodle", table_name="top_result_ai")
    # 'speech_result' had a stray leading dot; the mapping below expects 'speech_result'
    dyf_result_ai = dyf_result_ai.select_fields([
        'id', 'answer', 'speech_result', 'right_word', 'wrong_word', 'result'
    ])

    dyf_top_user = glueContext.create_dynamic_frame.from_catalog(
        database="moodle", table_name="top_user")
    dyf_top_user = dyf_top_user.select_fields(['id', 'student_id'])

    # try:
    #     # read the flag checkpoint from s3
    #     df_flag = spark.read.parquet("s3://dts-odin/flag/flag_ai_study_step_st_tuan_detail.parquet")
    #     max_key = df_flag.collect()[0]['flag']
    #     print('read from index: ', max_key)
    #     # compare the datasource _key against the flag and keep only records with key > flag
    #     dyf_quiz_atm = Filter.apply(frame=dyf_quiz_atm, f=lambda x: x['_key'] > max_key)
    # except:
    #     print('read flag error ')

    print('the number of new contacts: ', dyf_quiz_atm.count())

    # keep only records with the required fields present
    if (dyf_quiz_atm.count() > 0):
        dyf_quiz_atm = Filter.apply(
            frame=dyf_quiz_atm,
            f=lambda x: x['userid'] is not None and x[
                'uniqueid'] is not None and x['quiz'] is not None)

        join_01 = Join.apply(dyf_quiz_atm, dyf_question_atm, 'uniqueid',
                             'questionusageid').drop_fields(['questionusageid'])
        join_01.printSchema()
        join_02 = Join.apply(join_01, dyf_question_steps, 'attempt_step_id',
                             'questionattemptid')
        join_02 = Join.apply(join_02, dyf_top_user, 'userid', 'id')
        join_02.printSchema()
        df_join_02 = join_02.toDF()
        # df_join_02 = df_join_02.withColumn("source_system", f.lit("Native Test")) \
        #     .withColumn("created_at", unix_timestamp(df_join_02.timemodified, "yyyy-MM-dd HH:mm:ss"))
        df_join_02 = df_join_02.withColumn("source_system", f.lit("Native Test")) \
            .withColumn("created_at", getTimestampType(df_join_02.timemodified))
        join_02 = DynamicFrame.fromDF(df_join_02, glueContext, "join_02")
        # join_02 = join_02.resolveChoice(specs=[('created_at', 'cast:string')])
        join_02.printSchema()
        join_02.show(2)

        join_result_ai_1 = Join.apply(dyf_quiz_atm, dyf_quiz, 'quiz',
                                      'quiz_id').drop_fields(['quiz'])
        join_result_ai = Join.apply(join_result_ai_1, dyf_result_ai,
                                    'attempt_id', 'id')
        join_result_ai = Join.apply(join_result_ai, dyf_top_user, 'userid', 'id')
        print('join_result_ai schema:')
        join_result_ai.printSchema()
        df_join_result_ai = join_result_ai.toDF()
        # df_join_result_ai = df_join_result_ai.withColumn("source_system", f.lit("Native Test"))\
        #     .withColumn("created_at", unix_timestamp(df_join_result_ai.timemodified, "yyyy-MM-dd HH:mm:ss"))
        df_join_result_ai = df_join_result_ai.withColumn("source_system", f.lit("Native Test")) \
            .withColumn("created_at", getTimestampType(df_join_result_ai.timemodified))
        join_result_ai = DynamicFrame.fromDF(df_join_result_ai, glueContext,
                                             "join_result_ai")
        join_result_ai.printSchema()
        join_result_ai.show(3)

        applymapping1 = ApplyMapping.apply(
            frame=join_02,
            mappings=[("student_id", 'int', 'student_id', 'int'),
                      ("attempt_id", "long", "attempt_id", "string"),
                      ("rightanswer", "string", "correct_answer", "string"),
                      ("responsesummary", "string", "student_answer", "string"),
                      ("maxmark", "string", "received_point", "int"),
                      ("source_system", "string", "source_system", "string"),
                      ("attempt_step_id", "long", "attempt_step_id", "int"),
                      ("state", "string", "result", "string"),
                      ("created_at", "string", "created_at", "timestamp")])
        resolvechoice1 = ResolveChoice.apply(
            frame=applymapping1,
            choice="make_cols",
            transformation_ctx="resolvechoice1")
        dropnullfields1 = DropNullFields.apply(
            frame=resolvechoice1, transformation_ctx="dropnullfields1")
        datasink5 = glueContext.write_dynamic_frame.from_jdbc_conf(
            frame=dropnullfields1,
            catalog_connection="glue_redshift",
            connection_options={
                "dbtable": "student_test_detail",
                "database": "dts_odin"
            },
            redshift_tmp_dir="s3a://dts-odin/st_detail/",
            transformation_ctx="datasink5")
        print('START WRITE TO S3-------------------------')
        datasink6 = glueContext.write_dynamic_frame.from_options(
            frame=dropnullfields1,
            connection_type="s3",
            connection_options={
                "path": "s3://dts-odin/nvn_knowledge/student_test_detail/"
            },
            format="parquet",
            transformation_ctx="datasink6")
        print('END WRITE TO S3-------------------------')

        applymapping2 = ApplyMapping.apply(
            frame=join_result_ai,
            mappings=[("student_id", 'int', 'student_id', 'int'),
                      ("attempt_id", "long", "attempt_id", "string"),
                      ("name", "string", "test_type", "string"),
                      ("answer", "string", "correct_answer", "string"),
                      ("speech_result", "string", "student_answer", "string"),
                      ("source_system", "string", "source_system", "string"),
                      ("id", "int", "attempt_step_id", "int"),
                      ("result", "string", "result", "string"),
                      ("right_word", "string", "right_answer", "string"),
                      ("wrong_word", "string", "wrong_answer", "string"),
                      ("created_at", "string", "created_at", "timestamp")])
        # each transformation_ctx must be unique within the job, so this branch
        # gets its own names instead of reusing "resolvechoice1"/"dropnullfields"
        resolvechoice2 = ResolveChoice.apply(
            frame=applymapping2,
            choice="make_cols",
            transformation_ctx="resolvechoice2")
        dropnullfields2 = DropNullFields.apply(
            frame=resolvechoice2, transformation_ctx="dropnullfields2")
        datasink8 = glueContext.write_dynamic_frame.from_jdbc_conf(
            frame=dropnullfields2,
            catalog_connection="glue_redshift",
            connection_options={
                "dbtable": "student_test_detail",
                "database": "dts_odin"
            },
            redshift_tmp_dir="s3a://dts-odin/st_detail/",
            transformation_ctx="datasink8")
        print('START WRITE TO S3-------------------------')
        # write the cleaned frame, mirroring the datasink5/datasink6 pair above
        # (the original wrote the unmapped join_result_ai here)
        datasink7 = glueContext.write_dynamic_frame.from_options(
            frame=dropnullfields2,
            connection_type="s3",
            connection_options={
                "path": "s3://dts-odin/nvn_knowledge/student_test_detail/"
            },
            format="parquet",
            transformation_ctx="datasink7")
        print('END WRITE TO S3-------------------------')

        df_temp = dyf_quiz_atm.toDF()
        flag = df_temp.agg({"_key": "max"}).collect()[0][0]
        flag_data = [flag]
        df = spark.createDataFrame(flag_data, "long").toDF('flag')
        # overwrite the _key flag in s3
        df.write.parquet(
            "s3a://dts-odin/flag/flag_ai_study_step_st_tuan_detail.parquet",
            mode="overwrite")
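Every incremental job in this collection repeats the same checkpoint dance around a one-row parquet "flag" file, usually with the read half commented out. A minimal sketch of that pattern, assuming an existing SparkSession and Glue DynamicFrame; the helper names and path are illustrative, not from the source:

from awsglue.transforms import Filter

def read_flag(spark, flag_path):
    # returns the last processed _key, or None when no flag file exists yet
    try:
        return spark.read.parquet(flag_path).collect()[0]['flag']
    except Exception:
        return None

def filter_new_records(dyf, max_key):
    # keep only records strictly newer than the stored checkpoint
    if max_key is None:
        return dyf
    return Filter.apply(frame=dyf, f=lambda x: x['_key'] > max_key)

def write_flag(spark, dyf, flag_path):
    # persist the new high-water mark, overwriting the previous flag
    flag = dyf.toDF().agg({'_key': 'max'}).collect()[0][0]
    spark.createDataFrame([flag], 'long').toDF('flag') \
        .write.parquet(flag_path, mode='overwrite')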
import sys import boto3 import pyspark.sql.functions as F from awsglue.transforms import * from pyspark.context import SparkContext from awsglue.context import GlueContext from awsglue.job import Job glueContext = GlueContext(SparkContext.getOrCreate()) spark = glueContext.spark_session session = boto3.Session() glue_client = session.client(service_name='glue') ## @params: [JOB_NAME] workflowName = 'AmazonForecastWorkflow' workflow = glue_client.get_workflow(Name=workflowName) workflow_params = workflow['Workflow']['LastRun']['WorkflowRunProperties'] workflowRunId = workflow['Workflow']['LastRun']['WorkflowRunId'] PROCESSED_BUCKET = workflow_params['processedBucket'] LANDING_DB_NAME = workflow_params['landingDB'] LANDING_DB_TABLE = workflow_params['landingDBTable'] orders = glueContext.create_dynamic_frame_from_catalog( LANDING_DB_NAME, LANDING_DB_TABLE, transformation_ctx="orders") ordersDF = orders.toDF() ordersDF1 = ordersDF.select("invoicedate", "stockcode", "quantity", "storelocation")
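The job above pulls its configuration from the workflow's last run properties. For context, a hypothetical sketch of how an upstream step could publish those properties through boto3; the property values here are assumptions, not taken from the source:

import boto3

glue = boto3.client('glue')
workflow_name = 'AmazonForecastWorkflow'
run_id = glue.get_workflow(Name=workflow_name)['Workflow']['LastRun']['WorkflowRunId']
glue.put_workflow_run_properties(
    Name=workflow_name,
    RunId=run_id,
    RunProperties={
        'processedBucket': 'my-processed-bucket',  # assumed value
        'landingDB': 'my_landing_db',              # assumed value
        'landingDBTable': 'orders',                # assumed value
    })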
import sys
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.dynamicframe import DynamicFrame
from awsglue.job import Job
from pyspark.ml.feature import StringIndexer

glueContext = GlueContext(SparkContext.getOrCreate())
spark = glueContext.spark_session

args = getResolvedOptions(sys.argv, ['s3_bucket'])
s3_bucket = args['s3_bucket']
print(s3_bucket)
input_dir = f's3://{s3_bucket}/2020/'
print(input_dir)
output_dir = f's3://{s3_bucket}/output-dir'
print(output_dir)

df = glueContext.create_dynamic_frame_from_options(
    's3',
    {'paths': [input_dir], 'recurse': True, 'groupFiles': 'inPartition', 'groupSize': '1048576'},
    format="json")
df.printSchema()
df1 = df.toDF()
indexer = StringIndexer(inputCol="GENDER", outputCol="GENDER_INDEX")
indexed = indexer.fit(df1).transform(df1)
indexed.show()
# convert back to a DynamicFrame so the new GENDER_INDEX column is visible to
# apply_mapping below; the original applied the mapping to the pre-indexing frame
df = DynamicFrame.fromDF(indexed, glueContext, "indexed")
df = df.apply_mapping([
    ('FIRST', 'string', 'FIRST', 'string'),
    ('LAST', 'string', 'LAST', 'string'),
    ('AGE', 'integer', 'AGE', 'integer'),
    # StringIndexer emits a double, not an integer
    ('GENDER_INDEX', 'double', 'GENDER', 'integer'),
    ('LATITUDE', 'string', 'LATITUDE', 'string'),
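A note on the reader options used above: 'groupFiles' and 'groupSize' make Glue coalesce many small S3 objects into larger input groups, cutting per-file task overhead. A minimal sketch with illustrative values, assuming an existing glueContext; the path and sizes are placeholders:

dyf = glueContext.create_dynamic_frame_from_options(
    's3',
    {'paths': ['s3://my-bucket/many-small-files/'],
     'recurse': True,
     'groupFiles': 'inPartition',  # group small files within each partition
     'groupSize': '134217728'},    # target roughly 128 MiB per group
    format='json')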
import sys
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
from pyspark.sql import DataFrame, Row
from datetime import datetime
from awsglue import DynamicFrame

## @params: [JOB_NAME, SRC_DB_NAME, SRC_TABLE_NAME, DEST_S3_PATH]
args = getResolvedOptions(
    sys.argv, ['JOB_NAME', 'SRC_DB_NAME', 'SRC_TABLE_NAME', 'DEST_S3_PATH'])
src_db_name = args['SRC_DB_NAME']
src_table_name = args['SRC_TABLE_NAME']
dest_s3_path = args['DEST_S3_PATH']

sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)

## @type: DataSource
## @args: [database = src_db_name, table_name = src_table_name, additionalOptions = {"startingPosition": "TRIM_HORIZON", "inferSchema": "true"}, stream_type = kinesis]
## @return: datasource0
## @inputs: []
datasource0 = glueContext.create_data_frame.from_catalog(
    database=src_db_name,
    table_name=src_table_name,
    transformation_ctx="datasource0",
    additional_options={
        "startingPosition": "TRIM_HORIZON",
        "inferSchema": "true"
def main():
    glueContext = GlueContext(SparkContext.getOrCreate())
    spark = glueContext.spark_session
    spark.conf.set("spark.sql.session.timeZone", "GMT+07:00")
    ho_chi_minh_timezone = pytz.timezone('Asia/Ho_Chi_Minh')
    today = datetime.now(ho_chi_minh_timezone)
    today_second = long(today.strftime("%s"))
    print('today_id: ', today_second)
    # f.lit(today_second).cast('long').alias('transformed_at')

    satisfaction = ['1', '2', '3', '4', '5']
    student_id_unavailable = '0'
    package_endtime_unavailable = 99999999999L
    package_starttime_unavailable = 0L
    student_level_code_unavailable = 'UNAVAILABLE'
    student_status_code_unavailable = 'UNAVAILABLE'
    package_endtime = 'package_endtime'
    package_starttime = 'package_starttime'
    student_level_code = 'student_level_code'
    student_status_code = 'student_status_code'

    def doCheckModified(val1, val2):
        if val1 is not None:
            return val1
        return val2

    check_modified_null = udf(doCheckModified, StringType())

    def doCheckStudentID(code):
        # check for None before casting: str(None) is the string 'None',
        # so the original post-cast None check could never fire
        if code is None:
            return student_id_unavailable
        return str(code)

    check_student_id = udf(doCheckStudentID, StringType())

    def doCheckData(code, key):
        key = str(key)
        if code is None:
            if key == package_endtime:
                return package_endtime_unavailable
            else:
                return package_starttime_unavailable
        return code

    check_data = udf(doCheckData, IntegerType())

    def doCheckDataNull(code, key):
        # same fix as above: test None before str()
        key = str(key)
        if (code is None) and (key == student_level_code):
            return student_level_code_unavailable
        if (code is None) and (key == student_status_code):
            return student_status_code_unavailable
        return str(code)

    check_data_null = udf(doCheckDataNull, StringType())

    def concaText(student_behavior_date, behavior_id, student_id, contact_id,
                  package_code, package_endtime, package_starttime,
                  student_level_code, student_status_code, transformed_at):
        text_concat = ""
        if student_behavior_date is not None:
            text_concat += str(student_behavior_date)
        if behavior_id is not None:
            text_concat += str(behavior_id)
        if student_id is not None:
            text_concat += str(student_id)
        if contact_id is not None:
            text_concat += str(contact_id)
        if package_code is not None:
            text_concat += str(package_code)
        if package_endtime is not None:
            text_concat += str(package_endtime)
        if package_starttime is not None:
            text_concat += str(package_starttime)
        if student_level_code is not None:
            text_concat += str(student_level_code)
        if student_status_code is not None:
            text_concat += str(student_status_code)
        if transformed_at is not None:
            text_concat += str(transformed_at)
        return text_concat

    concaText = f.udf(concaText, StringType())

    dyf_ticket_log = glueContext.create_dynamic_frame.from_catalog(
        database="native_smile", table_name="ticket_log")
    dyf_ticket_log = dyf_ticket_log.select_fields([
        '_key', 'requester_email', 'satisfaction', 'satisfaction_at',
        'created_at'
    ])
    dyf_ticket_log = dyf_ticket_log.resolveChoice(specs=[('_key', 'cast:long')])

    # try:
    #     df_flag_1 = spark.read.parquet("s3://dtsodin/flag/flag_hoc_vien_rating_native_smile_caresoft.parquet")
    #     max_key = df_flag_1.collect()[0]['flag']
    #     print("max_key: ", max_key)
    #     # only take records beyond the stored max_key instead of a full load
    #     dyf_ticket_log = Filter.apply(frame=dyf_ticket_log, f=lambda x: x["_key"] > max_key)
    # except:
    #     print('read flag file error ')

    if dyf_ticket_log.count() > 0:
        dyf_student_contact_email = glueContext.create_dynamic_frame.from_catalog(
            database="tig_advisor", table_name="student_contact_email")
        dyf_student_contact_email = dyf_student_contact_email.select_fields(
            ['contact_id', 'email']) \
            .rename_field('contact_id', 'contact_id_email')

        dyf_student_contact = glueContext.create_dynamic_frame.from_catalog(
            database="tig_advisor", table_name="student_contact")
        dyf_student_contact = dyf_student_contact.select_fields(
            ['contact_id', 'student_id', 'level_study', 'time_lms_created'])

        dyf_log_student_status = glueContext.create_dynamic_frame.from_catalog(
            database="do_tig_advisor", table_name="log_student_status")
        dyf_log_student_status = dyf_log_student_status.select_fields(
            ['contact_id', 'status_code', 'last_status_code', 'start_date', 'end_date']) \
            .rename_field('contact_id', 'contact_id_status')

        dyf_log_student_package = glueContext.create_dynamic_frame.from_catalog(
            database="do_tig_advisor", table_name="log_student_package")
        dyf_log_student_package = dyf_log_student_package.select_fields(
            ['student_id', 'package_code', 'start_time', 'end_time']) \
            .rename_field('student_id', 'student_id_package') \
            .rename_field('start_time', 'start_time_package') \
            .rename_field('end_time', 'end_time_package')

        dyf_log_student_level_study = glueContext.create_dynamic_frame.from_catalog(
            database="tig_advisor", table_name="log_student_level_study")
        dyf_log_student_level_study = dyf_log_student_level_study.select_fields(
            ['contact_id', 'level_current', 'level_modified', 'package_code', 'time_created']) \
            .rename_field('contact_id', 'contact_id_level')

        dyf_ticket_log.printSchema()
        print dyf_ticket_log.count()
        dyf_rating_class = Filter.apply(
            frame=dyf_ticket_log,
            f=lambda x: x['satisfaction'] in satisfaction)
        print dyf_rating_class.count()
        try:
            df_rating_class = dyf_rating_class.toDF()
            df_rating_class = df_rating_class.limit(99999)
            df_student_contact = dyf_student_contact.toDF()
            df_student_contact_email = dyf_student_contact_email.toDF()
            df_log_student_level_study = dyf_log_student_level_study.toDF()
            df_temp = dyf_log_student_level_study.toDF()
            df_log_student_status = dyf_log_student_status.toDF()
            df_log_student_package = dyf_log_student_package.toDF()

            df_temp = df_temp.groupby(
                'contact_id_level', 'level_current', 'package_code').agg(
                    f.max("time_created").alias("time_created_max"))
            df_temp = df_temp.withColumnRenamed('contact_id_level', 'contact_id_join') \
                .withColumnRenamed('package_code', 'package_code_join')
            df_join0 = df_temp.join(
                df_log_student_level_study,
                (df_temp['contact_id_join'] == df_log_student_level_study['contact_id_level'])
                & (df_temp['package_code_join'] == df_log_student_level_study['package_code'])
                & (df_temp['time_created_max'] == df_log_student_level_study['time_created']),
                "left")
            print "=========== . ==========="
            df_join0.printSchema()
            dyf_join = DynamicFrame.fromDF(df_join0, glueContext, "dyf_join")
            dyf_join = dyf_join.select_fields([
                'contact_id_level', 'level_current', 'level_modified',
                'package_code', 'time_created'
            ])
            df_join = dyf_join.toDF()
            df_join.printSchema()
            df_join.show(10)
            print "########## . ###########"
            df_join0 = df_rating_class.join(
                df_student_contact_email,
                (df_rating_class['requester_email'] == df_student_contact_email['email']))
            df_join01 = df_join0.join(
                df_student_contact,
                (df_join0['contact_id_email'] == df_student_contact['contact_id']))
            df_join01.printSchema()
            df_join02 = df_join01.join(
                df_join,
                (df_join['contact_id_level'] == df_join01['contact_id'])
                & (df_join['time_created'] <= df_join01['time_lms_created']),
                "left")
            df_join02 = df_join02 \
                .withColumn("level_modified_new",
                            check_modified_null(df_join02.level_modified, df_join02.level_study)) \
                .withColumn("timecreated",
                            f.unix_timestamp(df_join02.created_at, "yyyy-MM-dd HH:mm:ss"))
            df_join02.printSchema()
            df_join02.show(10)
            dyf_join = DynamicFrame.fromDF(df_join02, glueContext, "dyf_join")
            dyf_join = dyf_join.select_fields([
                'timecreated', 'contact_id', 'student_id', 'level_study',
                'time_lms_created', 'level_current', 'level_modified',
                'package_code', 'time_created', 'satisfaction',
                'level_modified_new'
            ])
            # dyf_join_temp = Filter.apply(frame=dyf_join,
            #                              f=lambda x: x["level_modified_new"] is None)
            # print "count: ", dyf_join_temp.count()

            df_join02 = dyf_join.toDF()
            df_join03 = df_join02.join(
                df_log_student_status,
                (df_log_student_status['contact_id_status'] == df_join02['contact_id'])
                & (df_log_student_status['start_date'] <= df_join02['timecreated'])
                & (df_log_student_status['end_date'] >= df_join02['timecreated']),
                "left")
            df_join04 = df_join03.join(
                df_log_student_package,
                (df_log_student_package['student_id_package'] == df_join03['student_id'])
                & (df_log_student_package['start_time_package'] <= df_join03['timecreated'])
                & (df_log_student_package['end_time_package'] >= df_join03['timecreated']),
                "left")
            dyf_join = DynamicFrame.fromDF(df_join04, glueContext, "dyf_join")
            dyf_join = Filter.apply(
                frame=dyf_join,
                f=lambda x: x["start_time_package"] is not None and x[
                    "end_time_package"] is not None)
            print "dyf_join: ", dyf_join.count()
            dyf_join.show(10)
            dyf_join = dyf_join.select_fields([
                'timecreated', 'student_id', 'contact_id', 'package_code',
                'satisfaction', 'start_time_package', 'end_time_package',
                'level_modified_new', 'status_code'
            ])
            # dyf_join01 = Filter.apply(frame=dyf_join,
            #                           f=lambda x: x["level_current"] is not None)
            # print "Check null ", dyf_join01.count()

            df_join04 = dyf_join.toDF()
            df_join04 = df_join04.withColumn("transformed_at", unix_timestamp(f.current_timestamp())) \
                .withColumn("student_id", check_student_id(df_join04.student_id)) \
                .withColumn("package_endtime", check_data(df_join04.end_time_package, f.lit(package_endtime))) \
                .withColumn("package_starttime", check_data(df_join04.start_time_package, f.lit(package_starttime))) \
                .withColumn("student_level_code", check_data_null(df_join04.level_modified_new, f.lit(student_level_code))) \
                .withColumn("student_status_code", check_data_null(df_join04.status_code, f.lit(student_status_code))) \
                .withColumn("behavior_id", f.lit(26)) \
                .withColumn("rating_type", f.lit("rating_native_smile_caresoft")) \
                .withColumn("comment", f.lit("")) \
                .withColumn("rating_about", f.lit(None)) \
                .withColumn("number_rating", f.lit(1)) \
                .withColumn("value_rating", df_join04.satisfaction)
            df_join04.printSchema()
            print df_join04.count()
            df_join04.show(10)
            dyf_join = DynamicFrame.fromDF(df_join04, glueContext, "dyf_join")
            # dyf_join.printSchema()
            # print dyf_join.count()
            # dyf_join.show(10)

            dyf_rating_cara = ApplyMapping.apply(
                frame=dyf_join,
                mappings=[
                    ("timecreated", "int", "student_behavior_date", "long"),
                    ("behavior_id", "int", "behavior_id", "long"),
                    ("student_id", "string", "student_id", "long"),
                    ("contact_id", "string", "contact_id", "string"),
                    ("package_code", "string", "package_code", "string"),
                    ("package_endtime", "int", "package_endtime", "long"),
                    ("package_starttime", "int", "package_starttime", "long"),
                    ("student_level_code", "string", "student_level_code", "string"),
                    ("student_status_code", "string", "student_status_code", "string"),
                    ("transformed_at", "long", "transformed_at", "long"),
                    ("rating_type", "string", "rating_type", "string"),
                    ("comment", "string", "comment", "string"),
                    ("rating_about", "int", "rating_about", "long"),
                    ("number_rating", "int", "number_rating", "long"),
                    ("value_rating", "int", "value_rating", "long")
                ])
            df_rating_cara = dyf_rating_cara.toDF()
            df_rating_cara2 = df_rating_cara.withColumn(
                'student_behavior_id',
                f.md5(
                    concaText(df_rating_cara.student_behavior_date,
                              df_rating_cara.behavior_id,
                              df_rating_cara.student_id,
                              df_rating_cara.contact_id,
                              df_rating_cara.package_code,
                              df_rating_cara.package_endtime,
                              df_rating_cara.package_starttime,
                              df_rating_cara.student_level_code,
                              df_rating_cara.student_status_code,
                              df_rating_cara.transformed_at)))
            dyf_rating_cara = DynamicFrame.fromDF(df_rating_cara2, glueContext,
                                                  'dyf_rating_cara')
            dyf_rating_cara = Filter.apply(
                frame=dyf_rating_cara,
                f=lambda x: x["contact_id"] is not None and x["contact_id"] != '')

            applymapping0 = ApplyMapping.apply(
                frame=dyf_rating_cara,
                mappings=[("student_behavior_id", "string", "student_behavior_id", "string"),
                          ("rating_type", "string", "rating_type", "string"),
                          ("comment", "string", "comment", "string"),
                          ("rating_about", "long", "rating_about", "long"),
                          ("number_rating", "long", "number_rating", "long"),
                          ("value_rating", "long", "value_rating", "long"),
                          ("behavior_id", "long", "behavior_id", "long")])
            applymapping0.printSchema()
            print applymapping0.count()
            # applymapping0.show(5)
            # the transformation_ctx is "resolvechoice0" here so it does not
            # collide with "resolvechoice1" used further down
            resolvechoice0 = ResolveChoice.apply(
                frame=applymapping0,
                choice="make_cols",
                transformation_ctx="resolvechoice0")
            dropnullfields0 = DropNullFields.apply(
                frame=resolvechoice0, transformation_ctx="dropnullfields0")
            print resolvechoice0.count()
            # resolvechoice0.printSchema()
            resolvechoice0.show(10)

            print('START WRITE TO S3-------------------------')
            datasink0 = glueContext.write_dynamic_frame.from_options(
                frame=dropnullfields0,
                connection_type="s3",
                connection_options={
                    "path": "s3://dtsodin/student_behavior/student_rating/",
                    "partitionKeys": ["behavior_id"]
                },
                format="parquet",
                transformation_ctx="datasink0")
            print('END WRITE TO S3-------------------------')
            # datasink0 = glueContext.write_dynamic_frame.from_jdbc_conf(frame=dropnullfields0,
            #                                                            catalog_connection="glue_redshift",
            #                                                            connection_options={
            #                                                                "dbtable": "student_rating_temp",
            #                                                                "database": "dts_odin"
            #                                                            },
            #                                                            redshift_tmp_dir="s3a://dtsodin/temp/student_rating_temp/",
            #                                                            transformation_ctx="datasink0")

            applymapping1 = ApplyMapping.apply(
                frame=dyf_rating_cara,
                mappings=[
                    ("student_behavior_id", "string", "student_behavior_id", "string"),
                    ("student_behavior_date", "long", "student_behavior_date", "long"),
                    ("behavior_id", "long", "behavior_id", "long"),
                    ("student_id", "long", "student_id", "long"),
                    ("contact_id", "string", "contact_id", "string"),
                    ("package_code", "string", "package_code", "string"),
                    ("package_endtime", "long", "package_endtime", "long"),
                    ("package_starttime", "long", "package_starttime", "long"),
                    ("student_level_code", "string", "student_level_code", "string"),
                    ("student_status_code", "string", "student_status_code", "string"),
                    ("transformed_at", "long", "transformed_at", "long")
                ])
            applymapping1.printSchema()
            print applymapping1.count()
            # applymapping1.show(10)
            resolvechoice1 = ResolveChoice.apply(
                frame=applymapping1,
                choice="make_cols",
                transformation_ctx="resolvechoice1")
            dropnullfields1 = DropNullFields.apply(
                frame=resolvechoice1, transformation_ctx="dropnullfields1")
            print resolvechoice1.count()
            # resolvechoice1.printSchema()
            resolvechoice1.show(10)

            print('START WRITE TO S3-------------------------')
            datasink6 = glueContext.write_dynamic_frame.from_options(
                frame=dropnullfields1,
                connection_type="s3",
                connection_options={
                    "path": "s3://dtsodin/student_behavior/student_behavior/",
                    "partitionKeys": ["behavior_id"]
                },
                format="parquet",
                transformation_ctx="datasink6")
            print('END WRITE TO S3-------------------------')
            # datasink1 = glueContext.write_dynamic_frame.from_jdbc_conf(frame=dropnullfields1,
            #                                                            catalog_connection="glue_redshift",
            #                                                            connection_options={
            #                                                                "dbtable": "student_behavior",
            #                                                                "database": "dts_odin"
            #                                                            },
            #                                                            redshift_tmp_dir="s3a://dtsodin/temp/student_behavior",
            #                                                            transformation_ctx="datasink1")

            df_temp = dyf_ticket_log.toDF()
            flag = df_temp.agg({"_key": "max"}).collect()[0][0]
            flag_data = [flag]
            df = spark.createDataFrame(flag_data, "long").toDF('flag')
            # overwrite the flag with the new max _key in s3
            df.write.parquet(
                "s3a://dtsodin/flag/flag_hoc_vien_rating_native_smile_caresoft.parquet",
                mode="overwrite")
        except Exception as e:
            print e
def main():
    glueContext = GlueContext(SparkContext.getOrCreate())
    spark = glueContext.spark_session

    datasource0 = glueContext.create_dynamic_frame.from_catalog(
        database="topicalms", table_name="mdl_tpebbb")
    datasource0 = datasource0.select_fields([
        '_key', 'id', 'name', 'description', 'roomtype', 'calendar_code',
        'vcr_type', 'vcr_class_id'
    ])
    # cast the incremental key
    datasource0 = datasource0.resolveChoice(specs=[('_key', 'cast:long')])

    # read the flag from s3
    df_flag = spark.read.parquet("s3://dts-odin/flag/flag_lophoc.parquet")
    # renamed from 'max' to avoid shadowing the builtin
    max_key = df_flag.collect()[0]['flag']
    print "max_key: ", max_key

    # compare the datasource _key against the flag and keep only records with key > flag
    #data = datasource0.toDF()
    #data = data.where(data['_key'] > df_flag.collect()[0]['flag'])
    #datasource0 = DynamicFrame.fromDF(data, glueContext, "datasource0")
    datasource0 = Filter.apply(frame=datasource0, f=lambda x: x["_key"] > max_key)
    print "Count data 1: ", datasource0.count()

    if (datasource0.count() > 0):
        # try:
        datasource1 = glueContext.create_dynamic_frame.from_catalog(
            database="topicalms", table_name="mdl_tpe_calendar_teach")
        datasource1 = datasource1.select_fields([
            'calendar_code', 'level_class', 'teacher_type', 'student_type',
            'type_class', 'teacher_id'
        ]).rename_field('calendar_code', 'code_calendar')

        # filter the class records
        # df_lop = datasource1.toDF()
        # df_lop = df_lop.where("code_calendar <> '' and code_calendar is not null")
        # df_lop = df_lop.where("teacher_type <> 'TRAINING' AND teacher_type <> 'ORIENTATION'")
        # data_lop = DynamicFrame.fromDF(df_lop, glueContext, "data_lop")
        data_lop = Filter.apply(frame=datasource1,
                                f=lambda x: x["code_calendar"] != '' and x[
                                    "code_calendar"] is not None)
        data_lop = Filter.apply(frame=data_lop,
                                f=lambda x: x["teacher_type"] != 'TRAINING' and
                                x["teacher_type"] != 'ORIENTATION')

        # filter the class-history records
        # df_lslop = datasource0.toDF()
        # df_lslop = df_lslop.where("calendar_code is not null and calendar_code <> ''")
        # df_lslop = df_lslop.where("roomtype = 'ROOM'")
        # df_lslop = df_lslop.where("id is not null and id <> ''")
        # data_lslop = DynamicFrame.fromDF(df_lslop, glueContext, "data_lslop")
        data_lslop = Filter.apply(frame=datasource0,
                                  f=lambda x: x["calendar_code"] != '' and x[
                                      "calendar_code"] is not None)
        data_lslop = Filter.apply(
            frame=data_lslop,
            f=lambda x: x["id"] != '' and x["id"] is not None)
        data_lslop = Filter.apply(frame=data_lslop,
                                  f=lambda x: x["roomtype"] == 'ROOM')

        join = Join.apply(data_lslop, data_lop, 'calendar_code', 'code_calendar')

        # select the fields and data types to load into the db
        applyMapping = ApplyMapping.apply(
            frame=join,
            mappings=[("id", "string", "id", "string"),
                      ("name", "string", "tenlop", "string"),
                      ("description", 'string', 'mota', 'string'),
                      ("roomtype", "string", "kieuphong", "string"),
                      ("vcr_type", "string", "kieuvcr", "string"),
                      ("vcr_class_id", "string", "malopvcr", "string"),
                      ("level_class", "string", "levellop", "string"),
                      ("teacher_type", "string", "kieugiaovien", "string"),
                      ("student_type", "string", "kieuhocvien", "string"),
                      ("type_class", "string", "kieulop", "string"),
                      ("teacher_id", "string", "id_giaovien", "string")])
        resolvechoice = ResolveChoice.apply(
            frame=applyMapping,
            choice="make_cols",
            transformation_ctx="resolvechoice2")
        dropnullfields = DropNullFields.apply(
            frame=resolvechoice, transformation_ctx="dropnullfields3")
        print "Count data 2: ", dropnullfields.count()

        # write the data to redshift
        datasink = glueContext.write_dynamic_frame.from_jdbc_conf(
            frame=dropnullfields,
            catalog_connection="glue_redshift",
            connection_options={
                "dbtable": "dim_lop_hoc",
                "database": "dts_odin"
            },
            redshift_tmp_dir="s3n://dts-odin/topicalms/dim_lophoc/",
            transformation_ctx="datasink4")
        # write the data to s3
        # datasink1 = glueContext.write_dynamic_frame.from_options(frame=dropnullfields, connection_type="s3",
        #                                                          connection_options={
        #                                                              "path": "s3://dts-odin/dim_lophoc"},
        #                                                          format="parquet", transformation_ctx="datasink1")

        # get the max _key of the datasource
        datasource = datasource0.toDF()
        flag = datasource.agg({"_key": "max"}).collect()[0][0]
        # build a one-row data frame holding the flag
        flag_data = [flag]
        df = spark.createDataFrame(flag_data, "long").toDF('flag')
        # overwrite the flag in s3
        df.write.parquet("s3a://dts-odin/flag/flag_lophoc.parquet", mode="overwrite")
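Nearly every job in this collection shares the same Redshift sink shape. A minimal sketch of it, assuming a Glue connection named "glue_redshift"; the table, database, and staging path are placeholders:

def write_to_redshift(glue_context, dyf):
    # Glue stages the frame in S3, then issues a COPY into Redshift
    glue_context.write_dynamic_frame.from_jdbc_conf(
        frame=dyf,
        catalog_connection='glue_redshift',
        connection_options={'dbtable': 'my_table', 'database': 'my_db'},
        redshift_tmp_dir='s3://my-bucket/tmp/')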
from datetime import datetime from awsglue.context import GlueContext from pyspark.context import SparkContext from common.transforms.with_current_timestamp import with_current_timestamp from .data import get_test_file_as_dynamic_frame glueContext = GlueContext(SparkContext.getOrCreate()) logger = glueContext.get_logger() def test_timestamp_addition_with_default_column(): start_time = datetime.utcnow() dogs = get_test_file_as_dynamic_frame('dogs.csv', glueContext) dogs_with_timestamp = with_current_timestamp()(dogs) timestamp_values = [ record['current_timestamp'] for record in dogs_with_timestamp.toDF().collect() ] assert len(timestamp_values) == dogs.count() assertion_time = datetime.utcnow() assert all(start_time <= timestamp_value <= assertion_time for timestamp_value in timestamp_values) def test_timestamp_addition_with_custom_column(): start_time = datetime.utcnow() dogs = get_test_file_as_dynamic_frame('dogs.csv', glueContext) dogs_with_timestamp = with_current_timestamp('mowie_wowie')(dogs) timestamp_values = [
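The implementation under test is not shown in this collection; the following is a hypothetical sketch inferred from the assertions above (a curried transform that appends the current UTC time under a configurable column name), not the actual common.transforms code:

from datetime import datetime
from pyspark.sql.functions import lit
from awsglue.dynamicframe import DynamicFrame

def with_current_timestamp(column_name='current_timestamp'):
    def transform(dyf):
        # stamp every record with the driver-side UTC time
        df = dyf.toDF().withColumn(column_name, lit(datetime.utcnow()))
        return DynamicFrame.fromDF(df, dyf.glue_ctx, 'with_current_timestamp')
    return transform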
def main():
    glueContext = GlueContext(SparkContext.getOrCreate())
    spark = glueContext.spark_session

    tpe_enduser_used_product_history = glueContext.create_dynamic_frame.from_catalog(
        database="tig_market", table_name="tpe_enduser_used_product_history")
    tpe_enduser_used_product_history = tpe_enduser_used_product_history.select_fields([
        '_key', 'id', 'used_product_id', 'contact_id', 'status_new',
        'status_old', 'timecreated'
    ])
    tpe_enduser_used_product_history = tpe_enduser_used_product_history.resolveChoice(
        specs=[('_key', 'cast:long')])

    df_flag = spark.read.parquet(
        "s3a://datashine-dev-redshift-backup/flag/flag_hvdkh_LS_A3.parquet")
    max_key = df_flag.collect()[0]['flag']
    tpe_enduser_used_product_history = Filter.apply(
        frame=tpe_enduser_used_product_history, f=lambda x: x["_key"] > max_key)
    print(tpe_enduser_used_product_history.count())

    if (tpe_enduser_used_product_history.count() > 0):
        # 'EXPRIED' is kept alongside 'EXPIRED', presumably to match a
        # misspelled status value that exists in the source data
        tpe_enduser_used_product_history = Filter.apply(
            frame=tpe_enduser_used_product_history,
            f=lambda x: x["timecreated"] is not None and x["contact_id"] is not
            None and x["used_product_id"] is not None and x[
                "status_new"] is not None and x["status_new"] == 'ACTIVED' and
            (x["status_old"] == 'SUSPENDED' or x["status_old"] == 'EXPIRED' or
             x["status_old"] == 'EXPRIED'))
        if (tpe_enduser_used_product_history.count() > 0):
            try:
                tpe_enduser_used_product_history = tpe_enduser_used_product_history.resolveChoice(
                    specs=[('timecreated', 'cast:long')])
                df_tpe_enduser_used_product_history = tpe_enduser_used_product_history.toDF()
                df_tpe_enduser_used_product_history = df_tpe_enduser_used_product_history.withColumn(
                    'ngay_kich_hoat',  # activation date
                    from_unixtime(
                        df_tpe_enduser_used_product_history['timecreated'],
                        "yyyyMMdd"))
                df_tpe_enduser_used_product_history = df_tpe_enduser_used_product_history.withColumn(
                    'timestemp',
                    df_tpe_enduser_used_product_history['timecreated'] *
                    f.lit(1000))
                df_tpe_enduser_used_product_history = df_tpe_enduser_used_product_history.withColumn(
                    'to_status_id', f.lit(107))
                data_tpe_enduser_used_product_history = DynamicFrame.fromDF(
                    df_tpe_enduser_used_product_history, glueContext,
                    "data_tpe_enduser_used_product_history")
                data_tpe_enduser_used_product_history = data_tpe_enduser_used_product_history.resolveChoice(
                    specs=[('timestemp', 'cast:long')])
                data_tpe_enduser_used_product_history.printSchema()
                data_tpe_enduser_used_product_history.show(3)

                applymapping1 = ApplyMapping.apply(
                    frame=data_tpe_enduser_used_product_history,
                    mappings=[("ngay_kich_hoat", "string", "change_status_date_id", "long"),
                              ("to_status_id", "int", "to_status_id", "long"),
                              ("timestemp", "long", "timestamp1", "timestamp"),
                              ("contact_id", "string", "contact_id1", "string")])
                applymapping1.printSchema()
                applymapping1.show(20)
                resolvechoice2 = ResolveChoice.apply(
                    frame=applymapping1,
                    choice="make_cols",
                    transformation_ctx="resolvechoice2")
                dropnullfields3 = DropNullFields.apply(
                    frame=resolvechoice2, transformation_ctx="dropnullfields3")
                datasink4 = glueContext.write_dynamic_frame.from_jdbc_conf(
                    frame=dropnullfields3,
                    catalog_connection="glue_redshift",
                    connection_options={
                        "dbtable": "mapping_changed_status_student",
                        "database": "dts_odin",
                        "postactions": """UPDATE mapping_changed_status_student
                                          SET user_id = (SELECT user_id FROM user_map
                                                         WHERE source_type = 1
                                                           AND source_id = mapping_changed_status_student.contact_id1
                                                         LIMIT 1)
                                          WHERE user_id IS NULL AND to_status_id = 107"""
                    },
                    redshift_tmp_dir="s3n://datashine-dwh/temp1/",
                    transformation_ctx="datasink4")

                # write the flag
                # get the max key in the data source
                datasourceTmp = tpe_enduser_used_product_history.toDF()
                flag = datasourceTmp.agg({"_key": "max"}).collect()[0][0]
                flag_data = [flag]
                df = spark.createDataFrame(flag_data, "long").toDF('flag')
                # overwrite the _key flag in s3
                df.write.parquet(
                    "s3a://datashine-dev-redshift-backup/flag/flag_hvdkh_LS_A3.parquet",
                    mode="overwrite")
            except Exception as e:
                print("No new data")
                print(e)
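The "postactions" entry above deserves a note: SQL placed in that connection option runs on Redshift after the staged COPY completes, which is how this job backfills user_id. A stripped-down sketch with placeholder table and column names:

# illustrative only; names are assumptions, not from the source
post_sql = """UPDATE target_table
              SET user_id = (SELECT user_id FROM user_map
                             WHERE source_id = target_table.contact_id LIMIT 1)
              WHERE user_id IS NULL"""
connection_options = {
    'dbtable': 'target_table',
    'database': 'my_db',
    'postactions': post_sql,  # runs after the COPY into target_table finishes
}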
def main():
    sc = SparkContext()
    glueContext = GlueContext(sc)
    spark = glueContext.spark_session
    spark.conf.set("spark.sql.session.timeZone", "GMT+07:00")

    ho_chi_minh_timezone = pytz.timezone('Asia/Ho_Chi_Minh')
    today = datetime.now(ho_chi_minh_timezone)
    today = today.replace(hour=0, minute=0, second=0, microsecond=0)
    # presumably intended to snap to the first day of the month;
    # the original called replace() with no arguments, which is a no-op
    first_day_of_month = today.replace(day=1)
    print('today: ', today)
    yesterday = today - timedelta(1)
    print('yesterday: ', yesterday)
    today_id = long(today.strftime("%Y%m%d"))
    yesterday_id = long(yesterday.strftime("%Y%m%d"))
    today_id_0h00 = long(today.strftime("%s"))
    print('today_id: ', today_id)
    print('yesterday_id: ', yesterday_id)
    print('today_id_0h00: ', today_id_0h00)

    date_end = 1573232400L

    General = 'General'
    Vocabulary = 'Vocabulary'
    Grammar = 'Grammar'
    Speaking = 'Speaking'
    Listening = 'Listening'
    Phrasal_Verb = 'Phrasal'
    Pronunciation = 'Pronunciation'
    # question categories: Speaking (2), General (3), Phrasal Verb (4),
    # Grammar (5), Vocabulary (6), Pronunciation (7), Listening

    is_dev = True
    is_just_monthly_exam = False
    is_limit_test = False
    start_load_date = 0L

    BEHAVIOR_ID_TEST_TUAN = 22L
    BEHAVIOR_ID_TEST_THANG = 23L
    PERIOD_DAYLY = 1L
    PERIOD_WEEKLY = 2L
    PERIOD_MONTHLY = 3L

    def doCheckClassID(code):
        if code is None:
            return None
        code = str(code)
        if code == General:
            return 61L
        if code == Vocabulary:
            return 62L
        if code == Grammar:
            return 63L
        if code == Speaking:
            return 64L
        if code == Listening:
            return 65L
        if code == Pronunciation:
            return 66L
        if Phrasal_Verb in code:
            return 67L
        return None

    check_class_id = udf(doCheckClassID, LongType())

    # ------------------------------------------------------------------------------------------------------------------#
    my_partition_predicate = "(behavior_id=='22' or behavior_id=='23')"
    dyf_student_behavior = glueContext.create_dynamic_frame.from_catalog(
        database="od_student_behavior",
        table_name="student_behavior",
        push_down_predicate=my_partition_predicate)
    dyf_student_behaviors = dyf_student_behavior.resolveChoice(
        specs=[('behavior_id', 'cast:long'), ('transformed_at', 'cast:long')])

    # try:
    #     # read the flag checkpoint from s3
    #     df_flag = spark.read.parquet("s3://dts-odin/flag/flag_student_testing_history.parquet")
    #     max_key = df_flag.collect()[0]['flag']
    #     print('read from index: ', max_key)
    #     # compare the datasource key against the flag and keep only newer records
    #     dyf_student_behaviors = Filter.apply(frame=dyf_student_behaviors, f=lambda x: x['transformed_at'] > max_key)
    # except:
    #     print('read flag error ')

    if dyf_student_behaviors.count() > 0:
        dyf_student_behaviors = Filter.apply(
            frame=dyf_student_behaviors,
            f=lambda x: x["student_behavior_id"] is not None and x[
                "student_id"] is not None
            # and x["behavior_id"] in [BEHAVIOR_ID_TEST_TUAN,
            #                          BEHAVIOR_ID_TEST_THANG
            #                          ]
            and start_load_date <= x["student_behavior_date"] < today_id_0h00)

        number_dyf_student_behavior = dyf_student_behaviors.count()
        print('number_dyf_student_behavior after filtering: ',
              number_dyf_student_behavior)
        if number_dyf_student_behavior == 0:
            return

        dyf_student_behavior = dyf_student_behaviors \
            .select_fields(['student_behavior_id', 'student_behavior_date', 'student_id', 'behavior_id'])

        df_student_behavior = dyf_student_behavior.toDF()
        df_student_behavior = df_student_behavior.drop_duplicates(
            ['student_behavior_id'])
        if is_limit_test:
            df_student_behavior = df_student_behavior.limit(1000)
        df_student_behavior = df_student_behavior.repartition('behavior_id')
        df_student_behavior.cache()
        student_behavior_number = df_student_behavior.count()

        if is_dev:
            print('df_student_behavior')
            print('student_behavior_number: ', student_behavior_number)
            df_student_behavior.printSchema()
            df_student_behavior.show(3)

        if student_behavior_number == 0:
            return

        # ------------------------------------------------------------------------------------------------------------------#
        dyf_student_test_mark = glueContext.create_dynamic_frame.from_catalog(
            database="od_student_behavior",
            table_name="student_test_mark",
            push_down_predicate=my_partition_predicate)
        dyf_student_test_mark = dyf_student_test_mark.select_fields(
            ['student_behavior_id', 'question_category', 'grade'])
        # dyf_student_test_mark = Filter.apply(frame=dyf_student_test_mark,
        #                                      f=lambda x: x["behavior_id"] in [BEHAVIOR_ID_TEST_TUAN,
        #                                                                       BEHAVIOR_ID_TEST_THANG
        #                                                                       ]
        #                                      )
        df_student_test_mark = dyf_student_test_mark.toDF()
        number_student_test_mark = df_student_test_mark.count()
        if is_dev:
            print('df_student_test_mark')
            print('df_student_test_mark: ', number_student_test_mark)
            df_student_test_mark.printSchema()
            df_student_test_mark.show(3)
        if number_student_test_mark == 0:
            return

        df_student_behavior_mark = df_student_behavior \
            .join(df_student_test_mark, on='student_behavior_id', how='left')
        if is_dev:
            print('df_student_behavior_mark')
            print('df_student_behavior_mark: ', df_student_behavior_mark.count())
            df_student_behavior_mark.printSchema()
            df_student_behavior_mark.show(3)

        df_student_behavior_mark = df_student_behavior_mark.dropDuplicates([
            'student_behavior_id', 'student_id', 'behavior_id',
            'question_category'
        ])

        df_student_behavior_mark_week = df_student_behavior_mark \
            .filter(df_student_behavior_mark.behavior_id == BEHAVIOR_ID_TEST_TUAN)
        df_student_behavior_mark_month = df_student_behavior_mark.filter(
            df_student_behavior_mark.behavior_id == BEHAVIOR_ID_TEST_THANG)

        df_student_behavior_mark_week = df_student_behavior_mark_week \
            .withColumn('agg_week_id',
                        from_unixtime(df_student_behavior_mark_week.student_behavior_date, "yyyyww"))
        df_student_behavior_mark_month = df_student_behavior_mark_month \
            .withColumn('agg_month_id',
                        from_unixtime(df_student_behavior_mark_month.student_behavior_date, "yyyyMM"))

        if is_dev:
            print('df_student_behavior_mark_week')
            df_student_behavior_mark_week.printSchema()
            df_student_behavior_mark_week.show(3)
            print('df_student_behavior_mark_month')
            df_student_behavior_mark_month.printSchema()
            df_student_behavior_mark_month.show(3)

        df_student_behavior_mark_week = df_student_behavior_mark_week \
            .withColumn("class_id", check_class_id(df_student_behavior_mark_week.question_category))
        df_student_behavior_mark_week_agg = df_student_behavior_mark_week.groupby(
            'student_id', 'agg_week_id', 'class_id').agg(
                f.round(f.max(df_student_behavior_mark_week.grade)).cast(
                    'long').alias('grade_total'),
                f.lit(PERIOD_WEEKLY).alias('period_type_id'),
                f.lit(None).cast('string').alias('agg_date_id'),
                f.lit(None).cast('string').alias('agg_month_id'))

        df_student_behavior_mark_month = df_student_behavior_mark_month.na.fill(
            {'grade': 0})
        df_student_behavior_mark_month = df_student_behavior_mark_month.groupby(
            'student_behavior_id').agg(
                f.first('student_id').alias('student_id'),
                f.first('agg_month_id').alias('agg_month_id'),
                f.round(
                    f.sum('grade')).cast('long').alias('grade_total_attempt'),
            )
        df_student_behavior_mark_month_agg = df_student_behavior_mark_month.groupby(
            'student_id', 'agg_month_id').agg(
                f.max(
                    df_student_behavior_mark_month.grade_total_attempt).alias(
                        'grade_total'),
                f.lit(PERIOD_MONTHLY).alias('period_type_id'),
                f.lit(None).cast('string').alias('agg_date_id'),
                f.lit(None).cast('string').alias('agg_week_id'),
                f.lit(68L).cast('long').alias('class_id'))

        # reorder the monthly columns to match the weekly schema before union
        df_student_behavior_mark_month_agg = df_student_behavior_mark_month_agg.select(
            'student_id', 'agg_week_id', 'class_id', 'grade_total',
            'period_type_id', 'agg_date_id', 'agg_month_id')

        if is_dev:
            print('df_student_behavior_mark_week_agg')
            df_student_behavior_mark_week_agg.printSchema()
            df_student_behavior_mark_week_agg.show(3)
            print('df_student_behavior_mark_month_agg')
            df_student_behavior_mark_month_agg.printSchema()
            df_student_behavior_mark_month_agg.show(3)

        df_student_behavior_mark_agg = df_student_behavior_mark_week_agg.union(
            df_student_behavior_mark_month_agg)

        if is_dev:
            print('df_student_behavior_mark_agg')
            df_student_behavior_mark_agg.printSchema()
            df_student_behavior_mark_agg.show(3)

        dyf_student_behavior_mark_agg = DynamicFrame.fromDF(
            df_student_behavior_mark_agg, glueContext,
            'dyf_student_behavior_mark_agg')
        dyf_student_behavior_mark_agg = Filter.apply(
            frame=dyf_student_behavior_mark_agg,
            f=lambda x: x["class_id"] is not None)
        dyf_student_behavior_mark_agg.show(3)

        apply_output_month = ApplyMapping.apply(
            frame=dyf_student_behavior_mark_agg,
            mappings=[("student_id", "long", "student_id", "long"),
                      ("class_id", "long", "class_id", "long"),
                      ("period_type_id", "long", "period_type_id", "long"),
                      ("agg_date_id", "string", "created_date_id", "long"),
                      ("agg_week_id", "string", "created_week_id", "long"),
                      ("agg_month_id", "string", "created_month_id", "long"),
                      ("grade_total", "long", "measure1", "long")])
        dfy_output_month = ResolveChoice.apply(
            frame=apply_output_month,
            choice="make_cols",
            transformation_ctx="resolvechoice2")
        datasink4 = glueContext.write_dynamic_frame.from_jdbc_conf(
            frame=dfy_output_month,
            catalog_connection="nvn_knowledge",
            connection_options={
                "dbtable": "student_learning_history",
                "database": "nvn_knowledge_v2"
            },
            redshift_tmp_dir="s3n://dtsodin/temp/nvn_knowledge_v2/student_learning_history",
            transformation_ctx="datasink4")

        df_temp = dyf_student_behaviors.toDF()
        flag = df_temp.agg({"transformed_at": "max"}).collect()[0][0]
        flag_data = [flag]
        df = spark.createDataFrame(flag_data, "long").toDF('flag')
        # overwrite the flag in s3
        df.write.parquet(
            "s3a://dts-odin/flag/flag_student_testing_history.parquet",
            mode="overwrite")
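The push_down_predicate used above prunes partitions at planning time, so only S3 paths whose partition values satisfy the predicate are listed and read. A minimal sketch against the same catalog table, assuming an existing glueContext:

predicate = "(behavior_id=='22' or behavior_id=='23')"
dyf = glueContext.create_dynamic_frame.from_catalog(
    database='od_student_behavior',
    table_name='student_behavior',
    push_down_predicate=predicate)  # partitions outside the predicate are never read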
import sys
import re
import logging
import boto3
import pymysql
from botocore.exceptions import ClientError
from awsglue.utils import getResolvedOptions
from awsglue.context import GlueContext
from awsglue.job import Job
from pyspark.context import SparkContext
from pyspark.sql.functions import *

args = getResolvedOptions(
    sys.argv,
    ['JOB_NAME', 'bucketName', 'workspace', 'jdbc_url', 'username', 'pswd'])
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)

# ccn-alerts-104328-{workspace}-sfmc-migration
workspace = args['workspace']
bucketName = args['bucketName']
jdbc_url = args['jdbc_url']
username = args['username']
pswd = args['pswd']
print('args passed are: {}, {}, {}, {}'.format(workspace, bucketName,
                                               jdbc_url, username))
s3Location = 's3://{}'.format(bucketName)
import sys from awsglue.transforms import * from awsglue.utils import getResolvedOptions from pyspark.context import SparkContext from awsglue.context import GlueContext from awsglue.job import Job import datetime import boto3 args = getResolvedOptions(sys.argv, ['JOB_NAME']) sc = SparkContext() glueContext = GlueContext(sc) spark = glueContext.spark_session job = Job(glueContext) job.init(args['JOB_NAME'], args) datasource0 = glueContext.create_dynamic_frame_from_options( "s3", {'paths': ["s3://xxx-xx-logs/Glue/parquet_sample_dataset/"]}, format="parquet", transformation_ctx="datasource0") datasink3 = glueContext.write_dynamic_frame.from_options( frame=datasource0, connection_type="s3", connection_options={"path": "s3://xx-xx-logs/Glue/glue_bm_issue_11_12/"}, format="parquet", transformation_ctx="datasink3") job.commit()
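A note on the copy job above: the transformation_ctx strings are what job bookmarks key on, and job.commit() is what persists the bookmark state. A minimal sketch, assuming bookmarks are enabled on the Glue job; the path is a placeholder:

datasource = glueContext.create_dynamic_frame_from_options(
    's3', {'paths': ['s3://my-bucket/input/']},
    format='parquet',
    transformation_ctx='datasource')  # bookmark key for this source
# ... transforms and sinks ...
job.commit()  # records progress so the next run skips already-processed files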
def main():
    sc = SparkContext()
    glueContext = GlueContext(sc)
    spark = glueContext.spark_session
    spark.conf.set("spark.sql.session.timeZone", "GMT+07:00")

    score_pass = 50
    date_read_data = 20191005
    print('date_read_date: ', date_read_data)

    def doAddScoreAll(plus, minus):
        if plus is None:
            plus = 0
        if minus is None:
            minus = 0
        return plus + minus

    addScoreAll = udf(doAddScoreAll, IntegerType())

    def getPreviousDate(date_input):
        current_date = datetime.strptime(str(date_input), "%Y%m%d")
        previous_date = current_date - timedelta(1)
        previous_date_id = previous_date.strftime("%Y%m%d")
        return previous_date_id

    def addMoreScore(current_value, new_value):
        # renamed from addMoreSore (typo)
        if current_value >= score_pass:
            return score_pass
        if current_value is None:
            current_value = 0
        if new_value is None:
            new_value = 0
        a = current_value + new_value
        if a < 0:
            return 0
        if a >= score_pass:
            return score_pass
        return a

    addMoreScore = udf(addMoreScore, IntegerType())

    def getNewPassDate(current_pass_date, score_value_c, score_value_n):
        if current_pass_date != None:
            return current_pass_date
        if score_value_c is None:
            score_value_c = 0
        if score_value_n is None:
            score_value_n = 0
        if score_value_c + score_value_n >= score_pass:
            return date_read_data
        return None

    getNewPassDate = udf(getNewPassDate, IntegerType())

    # def getCurrentDate():
    #     return d4
    #
    # getCurrentDate = udf(getCurrentDate, IntegerType())

    def getModifyDate(modify_old, student_id_new):
        if student_id_new is not None:
            return date_read_data
        return modify_old

    getModifyDate = udf(getModifyDate, IntegerType())

    def getnewStudentId(student_id, student_id_new):
        if student_id is None:
            return student_id_new
        return student_id

    getnewStudentId = udf(getnewStudentId, LongType())

    def getnewStudentLearningObjectId(lo_id, lo_id_new):
        if lo_id is None:
            return lo_id_new
        return lo_id

    getnewStudentLearningObjectId = udf(getnewStudentLearningObjectId,
                                        LongType())

    def caculateScore(plus, minus):
        if plus is None:
            plus = 0
        if minus is None:
            minus = 0
        return plus + minus

    caculateScore = udf(caculateScore, LongType())

    def getModifiedDateId(student_id_new, learning_object_id_new,
                          modified_date_id_current):
        if student_id_new is not None and learning_object_id_new is not None:
            return long(date_read_data)
        return long(modified_date_id_current)

    udfGetModifiedDateId = udf(getModifiedDateId, LongType())

    def getCreatedDateId(student_id_new, learning_object_id_new,
                         created_date_id_current):
        if created_date_id_current is not None:
            return created_date_id_current
        if student_id_new is not None and learning_object_id_new is not None:
            return date_read_data
        return created_date_id_current

    getCreatedDateId = udf(getCreatedDateId, LongType())

    def getFirstLearningDate(student_id_new, learning_object_id_new,
                             created_date_id_current):
        if created_date_id_current is not None:
            return created_date_id_current
        if student_id_new is not None and learning_object_id_new is not None:
            return date_read_data
        return created_date_id_current

    getFirstLearningDate = udf(getFirstLearningDate, LongType())

    dyf_mapping_lo_student_history = glueContext.create_dynamic_frame.from_catalog(
        database="nvn_knowledge",
        table_name="mapping_lo_student_history",
        additional_options={
            "path": "s3://dts-odin/nvn_knowledge/mapping_lo_student_history/*/*"
        })

    # get the starting point for this read
    start_read = 0
    # try:
    #     # read the flag checkpoint from s3
    #     df_flag = spark.read.parquet("s3a://dts-odin/flag/nvn_knowledge/mapping_lo_student_end_read.parquet")
    #     start_read = df_flag.collect()[0]['flag']
    #     print('read start date from: ', start_read)
    # except:
    #     print('read flag file error ')
    #     start_read = None

    dyf_mapping_lo_student_history = Filter.apply(
        frame=dyf_mapping_lo_student_history,
        f=lambda x: x["student_id"] is not None and x["student_id"] != 0 and x[
            "learning_object_id"] is not None)

    # print('dyf_mapping_lo_student_history')
    # print(dyf_mapping_lo_student_history.count())
    # dyf_mapping_lo_student_history.show(3)
    # dyf_mapping_lo_student_history.printSchema()

    df_mapping_lo_student_history_cache = dyf_mapping_lo_student_history.toDF()
    # dropDuplicates returns a new DataFrame; the original discarded the result
    df_mapping_lo_student_history_cache = df_mapping_lo_student_history_cache.dropDuplicates([
        'student_id', 'learning_object_id', 'source_system', 'created_date_id'
    ])
    df_mapping_lo_student_history_cache.cache()

    df_group_source_system = df_mapping_lo_student_history_cache.groupby(
        'source_system').agg(f.max('created_date_id').alias('max_date'))
    max_allowing_date = df_group_source_system.agg({
        "max_date": "min"
    }).collect()[0][0]
    print('check date_read_data')
    print('max_allowing_date: ', max_allowing_date)
    print('date_read_data: ', date_read_data)
    if max_allowing_date <= date_read_data:
        print('stop in here::max_allowing_date <= date_read_data')
        return

    df_mapping_lo_student_history_cache = df_mapping_lo_student_history_cache.filter(
        df_mapping_lo_student_history_cache['created_date_id'] == date_read_data)
    print('df_mapping_lo_student_history_cache')
    df_mapping_lo_student_history_cache.printSchema()
    df_mapping_lo_student_history_cache.show(3)
    print('df_mapping_lo_student_history_cache::number: ',
          df_mapping_lo_student_history_cache.count())

    if df_mapping_lo_student_history_cache.count() > 0:
        df_mapping_lo_student_new = df_mapping_lo_student_history_cache \
            .groupby('student_id', 'learning_object_id').agg(
                addScoreAll(f.sum('knowledge_plus'), f.sum('knowledge_minus')).alias('knowledge_new'),
                addScoreAll(f.sum('comprehension_plus'), f.sum('comprehension_minus')).alias('comprehension_new'),
                addScoreAll(f.sum('application_plus'), f.sum('application_minus')).alias('application_new'),
                addScoreAll(f.sum('analysis_plus'), f.sum('analysis_minus')).alias('analysis_new'),
                addScoreAll(f.sum('synthesis_plus'), f.sum('synthesis_minus')).alias('synthesis_new'),
                addScoreAll(f.sum('evaluation_plus'), f.sum('evaluation_minus')).alias('evaluation_new'))
        # these renames must stay live: the join condition below references
        # student_id_new / learning_object_id_new
        df_mapping_lo_student_new = df_mapping_lo_student_new \
            .withColumnRenamed('student_id', 'student_id_new') \
            .withColumnRenamed('learning_object_id', 'learning_object_id_new')

        # dyf_df_mapping_lo_student_new = DynamicFrame.fromDF(df_mapping_lo_student_new, glueContext, 'dyf_df_mapping_lo_student_new')
        # print('dyf_df_mapping_lo_student_new')
        # dyf_df_mapping_lo_student_new.printSchema()
        # dyf_df_mapping_lo_student_new.show(3)
        # print('dyf_df_mapping_lo_student_new number: ', dyf_df_mapping_lo_student_new.count())

        # dyf_mapping_lo_student_current = glueContext.create_dynamic_frame.from_catalog(
        #     database="nvn_knowledge",
        #     table_name="mapping_lo_student"
        # )
        dyf_mapping_lo_student_current = glueContext.create_dynamic_frame.from_options(
            connection_type="redshift",
            connection_options={
                "url": "jdbc:redshift://datashine-dev.c4wxydftpsto.ap-southeast-1.redshift.amazonaws.com:5439/dts_odin",
                "user": "******",
                "password": "******",
                "dbtable": "temp_v1_mapping_lo_student",
                "redshiftTmpDir": "s3n://dts-odin/temp1/thanhtv3/temp_v1_mapping_lo_student/v8"
            })
        print('mapping_lo_student_current')
        dyf_mapping_lo_student_current.printSchema()
        dyf_mapping_lo_student_current.show(3)

        # Filter all
        dyf_mapping_lo_student_current = Filter.apply(
            frame=dyf_mapping_lo_student_current,
            f=lambda x: x["student_id"] is not None and x["student_id"] != 0
            # and x["knowledge_pass_date_id"] is None
            # and x["comprehension_pass_date_id"] is None
            # and x["application_pass_date_id"] is None
            # and x["analysis_pass_date_id"] is not None
            # and x["synthesis_pass_date_id"] is None
            # and x["evaluation_pass_date_id"] is None
        )

        # this conversion must stay live: the join and the final unpersist
        # both use dy_mapping_lo_student_current
        dy_mapping_lo_student_current = dyf_mapping_lo_student_current.toDF()
        # dy_mapping_lo_student_current = dy_mapping_lo_student_current.drop('user_id')
        dy_mapping_lo_student_current.cache()

        join_mapping = df_mapping_lo_student_new.join(
            dy_mapping_lo_student_current,
            (dy_mapping_lo_student_current['student_id'] == df_mapping_lo_student_new['student_id_new'])
            & (dy_mapping_lo_student_current['learning_object_id'] == df_mapping_lo_student_new['learning_object_id_new']),
            'left')

        print('join_new_knowledge_left')
        join_mapping.printSchema()

        join_mapping = join_mapping \
            .withColumn('knowledge_t', addMoreScore(join_mapping.knowledge, join_mapping.knowledge_new)) \
            .withColumn('comprehension_t', addMoreScore(join_mapping.comprehension, join_mapping.comprehension_new)) \
            .withColumn('application_t', addMoreScore(join_mapping.application, join_mapping.application_new)) \
            .withColumn('analysis_t', addMoreScore(join_mapping.analysis, join_mapping.analysis_new)) \
            .withColumn('synthesis_t', addMoreScore(join_mapping.synthesis, join_mapping.synthesis_new)) \
            .withColumn('evaluation_t', addMoreScore(join_mapping.evaluation, join_mapping.evaluation_new)) \
            .withColumn('student_id_t', getnewStudentId(join_mapping.student_id, join_mapping.student_id_new)) \
            .withColumn('learning_object_id_t', getnewStudentLearningObjectId(join_mapping.learning_object_id, join_mapping.learning_object_id_new)) \
            .withColumn('knowledge_pass_date_id', getNewPassDate(join_mapping.knowledge_pass_date_id, join_mapping.knowledge, join_mapping.knowledge_new)) \
            .withColumn('comprehension_pass_date_id', getNewPassDate(join_mapping.comprehension_pass_date_id, join_mapping.comprehension, join_mapping.comprehension_new)) \
            .withColumn('application_pass_date_id', getNewPassDate(join_mapping.application_pass_date_id, join_mapping.application, join_mapping.application_new)) \
            .withColumn('analysis_pass_date_id', getNewPassDate(join_mapping.analysis_pass_date_id, join_mapping.analysis, join_mapping.analysis_new)) \
            .withColumn('synthesis_pass_date_id', getNewPassDate(join_mapping.synthesis_pass_date_id, join_mapping.synthesis, join_mapping.synthesis_new)) \
            .withColumn('evaluation_pass_date_id', getNewPassDate(join_mapping.evaluation_pass_date_id, join_mapping.evaluation, join_mapping.evaluation_new)) \
            .withColumn('modified_date_id_t', udfGetModifiedDateId(join_mapping.student_id_new, join_mapping.learning_object_id_new, join_mapping.modified_date_id)) \
            .withColumn('created_date_id_t', getCreatedDateId(join_mapping.student_id_new, join_mapping.learning_object_id_new, join_mapping.created_date_id)) \
            .withColumn('first_learning_date_id_t', getFirstLearningDate(join_mapping.student_id_new, join_mapping.learning_object_id_new, join_mapping.first_learning_date_id))

        # join_mapping = join_mapping.drop('knowledge_new', 'comprehension_new', 'synthesis_new',
        #                                  'application_new', 'evaluation_new', 'analysis_new',
        #                                  'knowledge', 'comprehension', 'synthesis',
        #                                  'application', 'evaluation', 'analysis',
        #                                  'student_id_new', 'learning_object_id_new', 'created_date_id')
        # join_mapping.cache()

        join_mapping = join_mapping.select(
            'id', 'user_id', 'student_id_t', 'learning_object_id_t',
            'knowledge_t', 'knowledge_pass_date_id', 'comprehension_t',
            'comprehension_pass_date_id', 'application_t',
            'application_pass_date_id', 'analysis_t', 'analysis_pass_date_id',
            'synthesis_t', 'synthesis_pass_date_id', 'evaluation_t',
            'evaluation_pass_date_id', 'modified_date_id_t',
            'created_date_id_t', 'first_learning_date_id_t')
        print('join_mapping')
        join_mapping.printSchema()
        join_mapping.show(1)

        dyf_join_mapping = DynamicFrame.fromDF(join_mapping, glueContext,
                                               'dyf_join_mapping')
        dyf_join_mapping = dyf_join_mapping.resolveChoice(
            specs=[('user_id', 'cast:long')])
        apply_output = ApplyMapping.apply(
            frame=dyf_join_mapping,
            mappings=[
                ("user_id", "long", "user_id", "long"),
                ("student_id_t", "long", "student_id", "long"),
                ("learning_object_id_t", "long", "learning_object_id", "long"),
                ("knowledge_t", "int", "knowledge", "long"),
                ("comprehension_t", "int", "comprehension", "long"),
                ("application_t", "int", "application", "long"),
                ("analysis_t", "int", "analysis", "long"),
                ("synthesis_t", "int", "synthesis", "long"),
                ("evaluation_t", "int", "evaluation", "long"),
                ("knowledge_pass_date_id", "int", "knowledge_pass_date_id", "long"),
                ("comprehension_pass_date_id", "int", "comprehension_pass_date_id", "long"),
                ("application_pass_date_id", "int", "application_pass_date_id", "long"),
                ("analysis_pass_date_id", "int", "analysis_pass_date_id", "long"),
                ("synthesis_pass_date_id", "int", "synthesis_pass_date_id", "long"),
                ("evaluation_pass_date_id", "int", "evaluation_pass_date_id", "long"),
                ("modified_date_id_t", "long", "modified_date_id", "long"),
                ("created_date_id_t", "long", "created_date_id", "long"),
                ("first_learning_date_id_t", "long", "first_learning_date_id", "long")
            ])
        # this ResolveChoice step must stay live: datasink4 consumes dfy_output
        dfy_output = ResolveChoice.apply(frame=apply_output,
                                         choice="make_cols",
                                         transformation_ctx="resolvechoice2")
        datasink4 = glueContext.write_dynamic_frame.from_jdbc_conf(
            frame=dfy_output,
            catalog_connection="glue_redshift",
            connection_options={
                "dbtable": "temp_thanhtv3_dyf_join_mapping",
                "database": "dts_odin"
            },
            redshift_tmp_dir="s3n://dts-odin/temp/thanhtv3/nvn/knowledge/mapping_lo_student/v3",
            transformation_ctx="datasink4")

        # # save flag for next read
        # next_day = getPreviousDate(date_read_data)
        # flag_data = [next_day]
        # df = spark.createDataFrame(flag_data, "int").toDF('flag')
        # # overwrite the flag in s3
        # df.write.parquet("s3a://dts-odin/flag/nvn_knowledge/mapping_lo_student_end_read.parquet", mode="overwrite")

        # unpersist all cached frames
        df_mapping_lo_student_history_cache.unpersist()
        dy_mapping_lo_student_current.unpersist()
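A design note on the UDFs above: getnewStudentId and getnewStudentLearningObjectId just return the first non-null of two columns, which Spark's built-in coalesce does without the Python-UDF serialization cost. An equivalent alternative sketch, not the job's actual code:

import pyspark.sql.functions as f

join_mapping = join_mapping \
    .withColumn('student_id_t',
                f.coalesce(join_mapping.student_id, join_mapping.student_id_new)) \
    .withColumn('learning_object_id_t',
                f.coalesce(join_mapping.learning_object_id, join_mapping.learning_object_id_new))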
import sys
import logging
import urllib
import urllib.parse as urlparse
from pyspark.context import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.window import Window
from pyspark.sql.utils import AnalysisException
from awsglue.context import GlueContext, DynamicFrame
from awsglue.job import Job
from awsglue.utils import getResolvedOptions

logging.basicConfig()
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

sparkContext = SparkContext.getOrCreate()
glueContext = GlueContext(sparkContext)
spark = glueContext.spark_session
job = Job(glueContext)
args = getResolvedOptions(sys.argv, [
    'JOB_NAME', 'prefix', 'bucket', 'datalake_bucket', 'datalake_prefix',
    'id_prefix', 'controller_table_name', 'region', 'crawler_name'
])
job.init(args['JOB_NAME'], args)


class LoadIncremental():
    def __init__(self):
        # self.prefix = 'dms-rawdata/public/'
        # self.bucket = 'dms-rawdata'
        # self.datalake_bucket='marketboomer-datalake-table'
def main(): to_s3 = 'to-s3' to_jdbc = 'to-jdbc' parser = argparse.ArgumentParser(prog=sys.argv[0]) parser.add_argument( '-m', '--mode', required=True, choices=[to_s3, to_jdbc], help='Choose to migrate from datacatalog to s3 or to metastore') parser.add_argument( '--database-names', required=True, help= 'Semicolon-separated list of names of database in Datacatalog to export' ) parser.add_argument('-o', '--output-path', required=False, help='Output path, either local directory or S3 path') parser.add_argument( '-c', '--connection-name', required=False, help='Glue Connection name for Hive metastore JDBC connection') parser.add_argument( '-R', '--region', required=False, help='AWS region of source Glue DataCatalog, default to "us-east-1"') options = get_options(parser, sys.argv) if options['mode'] == to_s3: validate_options_in_mode(options=options, mode=to_s3, required_options=['output_path'], not_allowed_options=['connection_name']) elif options['mode'] == to_jdbc: validate_options_in_mode(options=options, mode=to_jdbc, required_options=['connection_name'], not_allowed_options=['output_path']) else: raise AssertionError('unknown mode ' + options['mode']) validate_aws_regions(options['region']) # spark env (conf, sc, sql_context) = get_spark_env() glue_context = GlueContext(sc) # extract from datacatalog reader database_arr = options['database_names'].split(';') (databases, tables, partitions) = read_databases_from_catalog(sql_context=sql_context, glue_context=glue_context, datacatalog_name='datacatalog', database_arr=database_arr, region=options.get('region') or 'us-east-1') if options['mode'] == to_s3: output_path = get_output_dir(options['output_path']) datacatalog_migrate_to_s3(databases=databases, tables=tables, partitions=partitions, output_path=output_path) elif options['mode'] == to_jdbc: connection_name = options['connection_name'] datacatalog_migrate_to_hive_metastore( sc=sc, sql_context=sql_context, databases=databases, tables=tables, partitions=partitions, connection=glue_context.extract_jdbc_conf(connection_name))
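For reference, hypothetical invocations of this export entry point; all argument values below are placeholders:

#   export the Data Catalog to S3:
#     --mode to-s3 --database-names 'db1;db2' --output-path s3://my-bucket/export/
#
#   export the Data Catalog to a Hive metastore over JDBC:
#     --mode to-jdbc --database-names 'db1;db2' --connection-name my-hive-connection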
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job

## @params: [TempDir, JOB_NAME]
args = getResolvedOptions(sys.argv, ['TempDir', 'JOB_NAME', 'db_name', 'redshift_connection', 'cis_bucket', 'geo_bucket', 'region'])

sc = SparkContext()
sc._jsc.hadoopConfiguration().set("fs.s3a.endpoint", "s3."+args['region']+".amazonaws.com.cn")
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)

"""
Copy CIS Demo Data to Redshift
"""
cis_datasource = glueContext.create_dynamic_frame_from_options("s3", \
    {"paths": ["s3://{}".format(args['cis_bucket'])],"recurse": True}, \
    format="csv", \
    format_options={ \
        "withHeader": False, \
        "separator": ";" \
    })

# headerless CSV columns are read as strings, so the source type in each mapping is "string"
cis_mapping = ApplyMapping.apply(frame = cis_datasource, mappings = [("col0", "string", "customer_id", "string"), \
    ("col1", "string", "name", "string"), \
    ("col2", "string", "zip", "long"), \
regexp_extract(df['filename'], r".+analytics-.+(202[01]-..-..-..).*parquet", 1)) df = df.repartition("submitteddatehour") dyf = DynamicFrame.fromDF(df, glueContext, "submitteddatehour-extracted") return (DynamicFrameCollection({"CustomTransform0": dyf}, glueContext)) ## @params: [JOB_NAME, SOURCE_BUCKET_URI, DESTINATION_BUCKET_URI] args = getResolvedOptions( sys.argv, ['JOB_NAME', 'SOURCE_BUCKET_URI', 'DESTINATION_BUCKET_URI']) sc = SparkContext() glueContext = GlueContext(sc) spark = glueContext.spark_session glueContext.sql("set spark.sql.parquet.mergeSchema=true") job = Job(glueContext) job.init(args['JOB_NAME'], args) ## @type: DataSource ## @args: [connection_type = "s3", format = "parquet", connection_options = {"paths": ["s3://te-load-test-analytics-submission-parquet/"], "recurse":True}, transformation_ctx = "DataSource0"] ## @return: DataSource0 ## @inputs: [] DataSource0 = glueContext.create_dynamic_frame.from_options( connection_type="s3", format="parquet", connection_options={ "paths": [f"{args['SOURCE_BUCKET_URI']}/"], "recurse": True },
def main():
    glueContext = GlueContext(SparkContext.getOrCreate())
    spark = glueContext.spark_session

    # only process quiz attempts from 01/10/2019 onwards
    timestamp = 1569888000

    # ETL TBHV
    # Custom function
    def doSplitWord(word):
        size = len(word)
        rs = [word[i:i + 2] for i in range(0, size, 1)]
        rs1 = [word[i:i + 1] for i in range(0, size, 1)]
        rs.extend(rs1)
        return rs

    state_right = 'state_right'
    state_wrong = 'state_wrong'

    # knowledge points are awarded by default
    # P1_D1; P1_D2; P1_D3; P2_D1; P2_D2; P2_D3; P3_D1; P3_D2; P4_D1; P4_D2
    knowledge = ''
    # comprehension points:
    # list of question names that earn comprehension points:
    # P1_D1; P1_D2; P1_D3; P2_D1; P2_D2; P2_D3; P3_D2; P4_D1; P4_D2
    comprehension = [
        'P1_D1', 'P1_D2', 'P1_D3', 'P2_D1', 'P2_D2', 'P2_D3', 'P3_D2',
        'P4_D1', 'P4_D2'
    ]
    # application points:
    # list of question names that earn application points:
    # P1_D3; P2_D1; P2_D2; P2_D3; P3_D2; P4_D1; P4_D2
    application = [
        'P1_D3', 'P2_D1', 'P2_D2', 'P2_D3', 'P3_D2', 'P4_D1', 'P4_D2'
    ]
    # analysis points:
    # list of question names that earn analysis points:
    # P2_D3; P3_D2; P4_D1; P4_D2
    analysis = ['P2_D3', 'P3_D2', 'P4_D1', 'P4_D2']
    # synthesis points:
    # list of question names that earn synthesis points:
    # P4_D1; P4_D2
    synthesis = ['P4_D1', 'P4_D2']
    # evaluation points:
    # list of question names that earn evaluation points:
    evaluation = ''

    def doAddScore(name, state, score_type):
        arr = ['']
        score = 0
        if score_type == 'comprehension':
            arr = comprehension
        if score_type == 'application':
            arr = application
        if score_type == 'analysis':
            arr = analysis
        if score_type == 'synthesis':
            arr = synthesis
        if state == state_right:
            score = 10
        if state == state_wrong:
            score = -5
        if name is not None:
            # lowercase only after the None check so a null name cannot raise
            name = name.lower()
            for x in arr:
                if x.lower() in name:
                    return score
        return 0

    addScore = udf(doAddScore, IntegerType())

    def doAddScoreAll(plus, minus):
        if plus is None and minus is not None:
            return minus
        if minus is None and plus is not None:
            return plus
        if minus is not None and plus is not None:
            return plus + minus
        return 0

    addScoreAll = udf(doAddScoreAll, IntegerType())

    def do_check_null(val1, val2):
        if val1 is None and val2 is not None:
            return val2
        if val2 is None and val1 is not None:
            return val1
        if val1 is not None and val2 is not None:
            return val1
        return 0

    check_data_null = udf(do_check_null, StringType())

    # special characters to strip
    special_str = '["] ;'

    splitWord = udf(lambda x: doSplitWord(x))

    ########## top_quiz_attempts
    dyf_top_quiz_attempts = glueContext.create_dynamic_frame.from_catalog(
        database="moodle", table_name="top_quiz_attempts")
    dyf_top_quiz_attempts = dyf_top_quiz_attempts.select_fields(
        ['_key', 'id', 'timestart', 'quiz'])

    dyf_top_quiz_attempts = dyf_top_quiz_attempts.resolveChoice(
        specs=[('_key', 'cast:long')])

    print(dyf_top_quiz_attempts.count())
    dyf_top_quiz_attempts.show(2)

    # try:
    #     # read the flag watermark from S3
    #     df_flag = spark.read.parquet("s3a://dtsodin/flag/flag_tu_vung_result_ai.parquet")
    #     start_read = df_flag.collect()[0]['flag']
    #     print('read from index: ', start_read)
    #
    #     # compare the datasource _key with the flag and keep only rows with key > flag
    #     dyf_top_quiz_attempts = Filter.apply(frame=dyf_top_quiz_attempts, f=lambda x: x['_key'] > start_read)
    # except:
    #     print('read flag file error ')

    dyf_top_quiz_attempts = Filter.apply(
        frame=dyf_top_quiz_attempts, f=lambda x: x["timestart"] >= timestamp)
    print(dyf_top_quiz_attempts.count())
    dyf_top_quiz_attempts.show()

    if dyf_top_quiz_attempts.count() > 0:
        ########## dyf_top_user
        dyf_top_user = glueContext.create_dynamic_frame.from_catalog(
            database="moodle", table_name="do_top_user")
        dyf_top_user = dyf_top_user.select_fields(
            ['id', 'student_id']).rename_field('id', 'top_user_id')

        ######### top_question
        dyf_top_question = glueContext.create_dynamic_frame.from_catalog(
            database="moodle", table_name="top_question")
        dyf_top_question = dyf_top_question.select_fields(
            ['id', 'name']).rename_field('id', 'quest_id')

        # dyf_top_result_ai = dyf_top_result_ai.resolveChoice(specs=[('_key', 'cast:long')])

        ######### top_result_ai
        dyf_top_result_ai = glueContext.create_dynamic_frame.from_catalog(
            database="moodle", table_name="top_result_ai")
        dyf_top_result_ai = dyf_top_result_ai.select_fields([
            'question_id', 'attempt_id', 'user_id', 'ratio', 'right_word',
            'wrong_word'
        ])

        # JOIN and FILTER the tables on the conditions below
        dyf_join01 = Join.apply(dyf_top_result_ai, dyf_top_question,
                                'question_id', 'quest_id')
        dyf_join02 = Join.apply(dyf_join01, dyf_top_quiz_attempts,
                                'attempt_id', 'id')
        dyf_join02 = Filter.apply(frame=dyf_join02,
                                  f=lambda x: x["quiz"] not in [7, 9, 918])
        dyf_join02 = Join.apply(dyf_join02, dyf_top_user, 'user_id',
                                'top_user_id')
        # dyf_join02.show()
        df_study = dyf_join02.toDF()
        df_study.cache()
        if (df_study.count() > 0):
            try:
                # print("COUNT 1:", df_study.count())
                # Strip the special characters [ ] ",
                # The data currently looks like: ["house","her","to","how","get","long"] or "environmental", ...
                # df_study = df_study.select(
                #     'quiz', 'name', 'user_id', 'timestart', 'right_word', 'wrong_word',
                #     f.translate(df_study.right_word, special_str, ''),
                #     f.translate(df_study.wrong_word, special_str, ''))
                df_study = df_study.select('quiz', 'name', 'student_id',
                                           'timestart', 'right_word',
                                           'wrong_word')
                df_study = df_study.withColumn("right_word_new", f.translate(df_study.right_word, special_str, '')) \
                    .withColumn("wrong_word_new", f.translate(df_study.wrong_word, special_str, ''))

                # Split the sentence into an array of words:
                # house, her => [house, her]
                # Process the correct words
                df_study_right = df_study.withColumn(
                    "right_word_list", f.split(df_study.right_word_new, ','))

                # Explode the array column => one row per word
                # row: [house, her] =>
                # row1: house
                # row2: her
                df_study_right = df_study_right.withColumn(
                    "right", f.explode(df_study_right.right_word_list))
                df_study_right = df_study_right.select(
                    'quiz', 'name', 'student_id', 'timestart', 'right')
                # convert to lowercase
                df_study_right = df_study_right.withColumn(
                    "right", f.lower(f.col("right")))
                # print("COUNT 2:", df_study_right.count())
                # df_study_right.printSchema()
                # df_study_right.show()

                dyf_study_right = DynamicFrame.fromDF(df_study_right,
                                                      glueContext,
                                                      "dyf_study_right")

                ## Learning Object
                dyf_learning_object = glueContext.create_dynamic_frame.from_catalog(
                    database="nvn_knowledge", table_name="learning_object")
                dyf_learning_object = dyf_learning_object.select_fields(
                    ['learning_object_id', 'learning_object_name'])
                df_learning_object = dyf_learning_object.toDF()
                # convert to lowercase
                df_learning_object = df_learning_object.withColumn(
                    "learning_object_name",
                    f.lower(f.col("learning_object_name")))
                dyf_learning_object = DynamicFrame.fromDF(
                    df_learning_object, glueContext, "dyf_learning_object")

                dyf_knowledge_right = Join.apply(dyf_study_right,
                                                 dyf_learning_object, 'right',
                                                 'learning_object_name')

                # print("COUNT 3:", dyf_knowledge_right.count())
                # dyf_knowledge_right.printSchema()

                # Add points for the correct words
                df_knowledge_right = dyf_knowledge_right.toDF()
                df_knowledge_right.cache()
                df_knowledge_right = df_knowledge_right.withColumn("knowledge", f.lit(10)) \
                    .withColumn("comprehension",
                                addScore(df_knowledge_right.name, f.lit('state_right'), f.lit('comprehension'))) \
                    .withColumn("application",
                                addScore(df_knowledge_right.name, f.lit('state_right'), f.lit('application'))) \
                    .withColumn("analysis",
                                addScore(df_knowledge_right.name, f.lit('state_right'), f.lit('analysis'))) \
                    .withColumn("synthesis",
                                addScore(df_knowledge_right.name, f.lit('state_right'), f.lit('synthesis'))) \
                    .withColumn("evaluation", f.lit(0)) \
                    .withColumn("date_id", from_unixtime(df_knowledge_right['timestart'], 'yyyyMMdd'))

                df_knowledge_right = df_knowledge_right.groupby(
                    'student_id', 'date_id', 'learning_object_id').agg(
                        f.count('knowledge').alias("count_plus"),
                        f.sum('knowledge').alias("knowledge_plus"),
                        f.sum('comprehension').alias("comprehension_plus"),
                        f.sum('application').alias("application_plus"),
                        f.sum('analysis').alias("analysis_plus"),
                        f.sum('synthesis').alias("synthesis_plus"),
                        f.sum('evaluation').alias("evaluation_plus"))
                df_knowledge_right = df_knowledge_right.where(
                    'student_id is not null')
                # df_knowledge_right.printSchema()
                # df_knowledge_right.show()

                # dyf_knowledge_right = DynamicFrame.fromDF(df_knowledge_right, glueContext, "dyf_knowledge_right")
                #
                # applymapping = ApplyMapping.apply(frame=dyf_knowledge_right,
                #                                   mappings=[("timestart", "long", "timestart", "long"),
                #                                             ("student_id", 'int', 'student_id', 'long'),
                #                                             ("learning_object_id", "int", "learning_object_id", "int"),
                #                                             ("date_id", "string", "date_id", "int"),
                #                                             ("knowledge", "int", "knowledge", "int"),
                #                                             ("comprehension", "int", "comprehension", "int"),
                #                                             ("application", "int", "application", "int"),
                #                                             ("analysis", "int", "analysis", "int"),
                #                                             ("synthesis", "int", "synthesis", "int"),
                #                                             ("evaluation", "int", "evaluation", "int")])
                # resolvechoice = ResolveChoice.apply(frame=applymapping, choice="make_cols",
                #                                     transformation_ctx="resolvechoice2")
                # dropnullfields = DropNullFields.apply(frame=resolvechoice, transformation_ctx="dropnullfields")
                #
                # datasink5 = glueContext.write_dynamic_frame.from_jdbc_conf(frame=dropnullfields,
                #                                                            catalog_connection="glue_redshift",
                #                                                            connection_options={
                #                                                                "dbtable": "temp_right_wrong_learning_object",
                #                                                                "database": "dts_odin"
                #                                                            },
                #                                                            redshift_tmp_dir="s3n://dts-odin/temp1/",
                #                                                            transformation_ctx="datasink5")
                # END adding points for correct words

                #################################################
                # Subtract points for the wrong words: handled the same way as the correct words.
                # The penalty rule is -5 points per wrong word
                df_study_wrong = df_study.withColumn(
                    "wrong_word_list", f.split(df_study.wrong_word_new, ','))

                # Explode the array column => one row per word
                # row: [house, her] =>
                # row1: house
                # row2: her
                df_study_wrong = df_study_wrong.withColumn(
                    "wrong", f.explode(df_study_wrong.wrong_word_list))
                # convert to lowercase
                df_study_wrong = df_study_wrong.withColumn(
                    "wrong", f.lower(f.col("wrong")))
                df_study_wrong = df_study_wrong.select(
                    'quiz', 'name', 'student_id', 'timestart', 'wrong')
                # print("COUNT 2:", df_study_wrong.count())
                # df_study_wrong.printSchema()
                # df_study_wrong.show()

                dyf_study_wrong = DynamicFrame.fromDF(df_study_wrong,
                                                      glueContext,
                                                      "dyf_study_wrong")

                ## Learning Object
                dyf_knowledge_wrong = Join.apply(dyf_study_wrong,
                                                 dyf_learning_object, 'wrong',
                                                 'learning_object_name')
                # print("COUNT 3:", dyf_knowledge_wrong.count())
                # print("COUNT 4:", dyf_knowledge_wrong.count())
                # dyf_knowledge_wrong.printSchema()

                # Subtract points for the wrong words
                df_knowledge_wrong = dyf_knowledge_wrong.toDF()
                df_knowledge_wrong.cache()
                df_knowledge_wrong = df_knowledge_wrong.withColumn("knowledge", f.lit(-5)) \
                    .withColumn("comprehension",
                                addScore(df_knowledge_wrong.name, f.lit('state_wrong'), f.lit('comprehension'))) \
                    .withColumn("application",
                                addScore(df_knowledge_wrong.name, f.lit('state_wrong'), f.lit('application'))) \
                    .withColumn("analysis",
                                addScore(df_knowledge_wrong.name, f.lit('state_wrong'), f.lit('analysis'))) \
                    .withColumn("synthesis",
                                addScore(df_knowledge_wrong.name, f.lit('state_wrong'), f.lit('synthesis'))) \
                    .withColumn("evaluation", f.lit(0)) \
                    .withColumn("date_id", from_unixtime(df_knowledge_wrong['timestart'], 'yyyyMMdd'))

                df_knowledge_wrong = df_knowledge_wrong.groupby('student_id', 'date_id', 'learning_object_id').agg(
                        f.count('knowledge').alias("count_minus"),
                        f.sum('knowledge').alias("knowledge_minus"),
                        f.sum('comprehension').alias("comprehension_minus"),
                        f.sum('application').alias("application_minus"),
                        f.sum('analysis').alias("analysis_minus"),
                        f.sum('synthesis').alias("synthesis_minus"),
                        f.sum('evaluation').alias("evaluation_minus")) \
                    .withColumnRenamed('student_id', 'student_id_wrong') \
                    .withColumnRenamed('date_id', 'date_id_wrong') \
                    .withColumnRenamed('learning_object_id', 'learning_object_id_wrong')
                df_knowledge_wrong = df_knowledge_wrong.where(
                    'student_id_wrong is not null')

                # df_study_all = df_study.select('student_id').withColumnRenamed('student_id', 'student_id_all')
                # df_knowledge_right.printSchema()
                # df_knowledge_right.show()

                df_knowledge = df_knowledge_right.join(
                    df_knowledge_wrong,
                    (df_knowledge_right['student_id'] == df_knowledge_wrong['student_id_wrong'])
                    & (df_knowledge_right['date_id'] == df_knowledge_wrong['date_id_wrong'])
                    & (df_knowledge_right['learning_object_id'] == df_knowledge_wrong['learning_object_id_wrong']),
                    'outer')

                df_knowledge = df_knowledge.withColumn("user_id", check_data_null(df_knowledge.student_id, df_knowledge.student_id_wrong)) \
                    .withColumn("learning_object_id",
                                check_data_null(df_knowledge.learning_object_id, df_knowledge.learning_object_id_wrong)) \
                    .withColumn("created_date_id", check_data_null(df_knowledge.date_id, df_knowledge.date_id_wrong)) \
                    .withColumn("source_system", f.lit('top_result_ai')) \
                    .withColumn("lu_id", f.lit(0))

                dyf_knowledge = DynamicFrame.fromDF(df_knowledge, glueContext,
                                                    "df_knowledge")

                applymapping2 = ApplyMapping.apply(
                    frame=dyf_knowledge,
                    mappings=[
                        ("user_id", 'string', 'student_id', 'long'),
                        ("learning_object_id", "string", "learning_object_id", "long"),
                        # ("knowledge", "int", "knowledge", "long"),
                        # ("comprehension", "int", "comprehension", "long"),
                        # ("application", "int", "application", "long"),
                        # ("analysis", "int", "analysis", "long"),
                        # ("synthesis", "int", "synthesis", "long"),
                        # ("evaluation", "int", "evaluation", "long"),
                        ("knowledge_plus", "long", "knowledge_plus", "long"),
                        ("comprehension_plus", "long", "comprehension_plus", "long"),
                        ("application_plus", "long", "application_plus", "long"),
                        ("analysis_plus", "long", "analysis_plus", "long"),
                        ("synthesis_plus", "long", "synthesis_plus", "long"),
                        ("evaluation_plus", "long", "evaluation_plus", "long"),
                        ("knowledge_minus", "long", "knowledge_minus", "long"),
                        ("comprehension_minus", "long", "comprehension_minus", "long"),
                        ("application_minus", "long", "application_minus", "long"),
                        ("analysis_minus", "long", "analysis_minus", "long"),
                        ("synthesis_minus", "long", "synthesis_minus", "long"),
                        ("evaluation_minus", "long", "evaluation_minus", "long"),
                        ("count_plus", "long", "plus_number", "long"),
                        ("count_minus", "long", "minus_number", "long"),
                        # ("lo_type", "string", "lo_type", "long"),
                        ("source_system", "string", "source_system", "string"),
                        ("created_date_id", "string", "created_date_id", "long"),
                        ("lu_id", "int", "lu_type", "long")
                        # ("student_level", "string", "student_level", "string"),
                        # ("advisor_id", "string", "advisor_id", "long"),
                        # ("package_code", "string", "package_code", "string")
                    ])
                applymapping2.printSchema()
                applymapping2.show(20)

                resolvechoice2 = ResolveChoice.apply(
                    frame=applymapping2,
                    choice="make_cols",
                    transformation_ctx="resolvechoice3")
                dropnullfields2 = DropNullFields.apply(
                    frame=resolvechoice2, transformation_ctx="dropnullfields2")

                print('COUNT df_knowledge: ', dropnullfields2.count())
                dropnullfields2.printSchema()
                dropnullfields2.show(2)

                print('START WRITE TO S3-------------------------')
                datasink6 = glueContext.write_dynamic_frame.from_options(
                    frame=dropnullfields2,
                    connection_type="s3",
                    connection_options={
                        "path": "s3://dtsodin/nvn_knowledge/mapping_lo_student_history_v2/",
                        "partitionKeys": ["created_date_id", "source_system"]
                    },
                    format="parquet",
                    transformation_ctx="datasink6")
                print('END WRITE TO S3-------------------------')

                # datasink5 = glueContext.write_dynamic_frame.from_jdbc_conf(frame=dropnullfields2,
                #                                                            catalog_connection="glue_redshift",
                #                                                            connection_options={
                #                                                                "dbtable": "mapping_lo_student_history",
                #                                                                "database": "dts_odin"
                #                                                            },
                #                                                            redshift_tmp_dir="s3n://dts-odin/temp1/top_result_ai/",
                #                                                            transformation_ctx="datasink5")
                # END subtracting points for wrong words

                # clear the cached DataFrames
                df_study.unpersist()
                df_knowledge_right.unpersist()
                df_knowledge_wrong.unpersist()
                # df_knowledge_right.unpersist()
            except Exception as e:
                print("###################### Exception ##########################")
                print(e)

        # write the flag
        # take the max _key in the data source
        mdl_dyf_top_quiz_attempts = dyf_top_quiz_attempts.toDF()
        flag = mdl_dyf_top_quiz_attempts.agg({"_key": "max"}).collect()[0][0]
        flag_data = [flag]
        df = spark.createDataFrame(flag_data, "long").toDF('flag')
        # overwrite the _key flag on S3
        df.write.parquet(
            "s3a://dtsodin/flag/flag_tu_vung_result_ai.parquet",
            mode="overwrite")
def main():
    glueContext = GlueContext(SparkContext.getOrCreate())
    spark = glueContext.spark_session

    student_id_unavailable = '0'
    package_endtime_unavailable = 99999999999
    package_starttime_unavailable = 0
    student_level_code_unavailable = 'UNAVAILABLE'
    student_status_code_unavailable = 'UNAVAILABLE'

    package_endtime = 'package_endtime'
    package_starttime = 'package_starttime'
    student_level_code = 'student_level_code'
    student_status_code = 'student_status_code'

    CANCELLED = 'CANCELLED'

    dyf_tpe_enduser_used_product_history = glueContext.create_dynamic_frame.from_catalog(
        database="tig_market", table_name="tpe_enduser_used_product_history")
    dyf_tpe_enduser_used_product_history = dyf_tpe_enduser_used_product_history.select_fields([
        '_key', 'contact_id', 'used_product_id', 'status_old', 'status_new',
        'status_description', 'timecreated'
    ])
    # .rename_field('contact_id', 'contactid')

    dyf_tpe_enduser_used_product_history = dyf_tpe_enduser_used_product_history.resolveChoice(
        specs=[('_key', 'cast:long')])

    # try:
    #     df_flag = spark.read.parquet("s3://dtsodin/flag/flag_trang_thai_tai_khoan_cancelled.parquet")
    #     max_key = df_flag.collect()[0]['flag']
    #     print("max_key: ", max_key)
    #     # Only read records with a key greater than the stored max_key instead of a full load
    #     dyf_tpe_enduser_used_product_history = Filter.apply(frame=dyf_tpe_enduser_used_product_history,
    #                                                         f=lambda x: x["_key"] > max_key)
    # except:
    #     print('read flag file error ')

    print(dyf_tpe_enduser_used_product_history.count())
    if dyf_tpe_enduser_used_product_history.count() > 0:
        try:
            dyf_tpe_invoice_product_details = glueContext.create_dynamic_frame.from_catalog(
                database="tig_market", table_name="tpe_invoice_product_details")
            dyf_tpe_invoice_product_details = dyf_tpe_invoice_product_details.select_fields(
                ['id', 'cat_code'])

            dyf_student_contact = glueContext.create_dynamic_frame.from_catalog(
                database="tig_advisor", table_name="student_contact")
            dyf_student_contact = dyf_student_contact.select_fields(
                ['contact_id', 'student_id']).rename_field('contact_id', 'contactid')

            ##################### Join and Filter data
            df_tpe_enduser_used_product_history = dyf_tpe_enduser_used_product_history.toDF()
            df_tpe_used_product_history_step1 = df_tpe_enduser_used_product_history \
                .groupby('contact_id', 'used_product_id') \
                .agg(f.max("timecreated").alias("max_timecreated")) \
                .withColumnRenamed("contact_id", "contact_id_temp")
            print(df_tpe_used_product_history_step1.count())
            df_tpe_used_product_history_step1.show(20)

            df_tpe_used_product_history_step2 = df_tpe_used_product_history_step1.groupby(
                'contact_id_temp').agg(
                    f.max("max_timecreated").alias("max_timecreated"),
                    f.count("used_product_id").alias("count_used_product_id"))
            print(df_tpe_used_product_history_step2.count())
            df_tpe_used_product_history_step2.show(20)
            print("EEEEEEEEEEEEEEEEEEEEEEEEE")

            dyf_tpe_used_product_history = DynamicFrame.fromDF(
                df_tpe_used_product_history_step2, glueContext,
                "dyf_tpe_used_product_history")

            dyf_part_one = Filter.apply(
                frame=dyf_tpe_used_product_history,
                f=lambda x: x["count_used_product_id"] <= 1)
            print(dyf_part_one.count())
            # dyf_part_two = Filter.apply(frame=df_tpe_enduser_used_product_history,
            #                             f=lambda x: x["used_product_id"] > 1)

            df_part_one = dyf_part_one.toDF()
            df_part_one = df_part_one.join(
                df_tpe_enduser_used_product_history,
                (df_part_one.contact_id_temp == df_tpe_enduser_used_product_history.contact_id)
                & (df_part_one.max_timecreated == df_tpe_enduser_used_product_history.timecreated))

            dyf_part_one = DynamicFrame.fromDF(df_part_one, glueContext,
                                               "dyf_part_one")
            dyf_part_one = dyf_part_one.select_fields([
                'contact_id', 'used_product_id', 'status_old', 'status_new',
                'status_description', 'timecreated'
            ])

            dyf_join_part_one_product_details = Join.apply(
                dyf_part_one, dyf_tpe_invoice_product_details,
                'used_product_id', 'id')
            dyf_join_part_one_product_details.printSchema()
            print("total 01: ", dyf_join_part_one_product_details.count())
            dyf_join_part_one_product_details.toDF().show(2)

            dyf_join_part_one_contact = Join.apply(
                dyf_join_part_one_product_details, dyf_student_contact,
                'contact_id', 'contactid')
            dyf_join_part_one_contact = dyf_join_part_one_contact \
                .select_fields(['contact_id', 'student_id', 'status_new', 'status_description', 'timecreated'])
            dyf_join_part_one_contact.printSchema()
            print("total 02: ", dyf_join_part_one_contact.count())
            dyf_join_part_one_contact.toDF().show(2)

            # df_join_part_one = dyf_join_part_one_contact.toDF()
            ######################################

            ######## START cancelled
            dyf_join_cancelled_status = Filter.apply(
                frame=dyf_join_part_one_contact,
                f=lambda x: x["status_new"] == CANCELLED)
            print("dyf_join_cancelled_status ", dyf_join_cancelled_status.count())
            dyf_join_cancelled_status.toDF().show(2)

            df_join_cancelled_status = dyf_join_cancelled_status.toDF()
            df_join_cancelled_status = df_join_cancelled_status \
                .withColumn("change_status_date_id",
                            from_unixtime(df_join_cancelled_status.timecreated, 'yyyyMMdd').cast("long")) \
                .withColumn("from_status_id", f.lit(None).cast("long")) \
                .withColumn("to_status_id", f.lit(208).cast("long")) \
                .withColumn("measure1", f.lit(None).cast("long")) \
                .withColumn("measure2", f.lit(None).cast("long")) \
                .withColumn("description", df_join_cancelled_status.status_description) \
                .withColumn("timestamp1", f.lit(None).cast("long"))
            df_join_cancelled_status.show(3)

            dyf_join_cancelled_status = DynamicFrame.fromDF(
                df_join_cancelled_status, glueContext, "dyf_join_cancelled_status")
            dyf_join_cancelled_status = dyf_join_cancelled_status \
                .select_fields(['contact_id', 'student_id', 'change_status_date_id', 'from_status_id',
                                'to_status_id', 'measure1', 'measure2', 'description', 'timestamp1'])
            dyf_join_cancelled_status.printSchema()
            df_join_cancelled_status = dyf_join_cancelled_status.toDF()
            ####### END

            # df_join = df_join.withColumn("to_status_id", f.lit(204).cast("long"))
            df_join_cancelled_status = df_join_cancelled_status.withColumn(
                "user_id", f.lit(None).cast("long"))
            dyf_join_status = DynamicFrame.fromDF(df_join_cancelled_status,
                                                  glueContext, "dyf_join_status")

            applymapping1 = ApplyMapping.apply(
                frame=dyf_join_status,
                mappings=[("student_id", "string", "student_id", "long"),
                          ("user_id", "long", "user_id", "long"),
                          ("change_status_date_id", "long", "change_status_date_id", "long"),
                          ("from_status_id", "long", "from_status_id", "long"),
                          ("to_status_id", "long", "to_status_id", "long"),
                          ("measure1", "long", "measure1", "double"),
                          ("measure2", "long", "measure2", "double"),
                          ("description", "string", "description", "string"),
                          ("timestamp1", "long", "timestamp1", "long"),
                          ("contact_id", "string", "contact_id", "string")])
            resolvechoice1 = ResolveChoice.apply(
                frame=applymapping1,
                choice="make_cols",
                transformation_ctx="resolvechoice1")
            dropnullfields1 = DropNullFields.apply(
                frame=resolvechoice1, transformation_ctx="dropnullfields1")
            print(resolvechoice1.count())
            resolvechoice1.printSchema()
            resolvechoice1.show(5)

            print('START WRITE TO REDSHIFT -------------------------')
            datasink1 = glueContext.write_dynamic_frame.from_jdbc_conf(
                frame=dropnullfields1,
                catalog_connection="glue_redshift",
                connection_options={
                    "dbtable": "mapping_changed_status_student",
                    "database": "dts_odin"
                },
                redshift_tmp_dir="s3a://dtsodin/temp/mapping_changed_status_student/",
                transformation_ctx="datasink1")
            print('START WRITE TO S3-------------------------')
            # datasink6 = glueContext.write_dynamic_frame.from_options(frame=dropnullfields1, connection_type="s3",
            #                                                          connection_options={
            #                                                              "path": "s3://dtsodin/student_behavior/student_behavior/",
            #                                                              "partitionKeys": ["behavior_id"]},
            #                                                          format="parquet",
            #                                                          transformation_ctx="datasink6")
            print('END WRITE TO S3-------------------------')

            df_temp = dyf_tpe_enduser_used_product_history.toDF()
            flag = df_temp.agg({"_key": "max"}).collect()[0][0]
            flag_data = [flag]
            df = spark.createDataFrame(flag_data, "long").toDF('flag')
            # overwrite the _key flag on S3
            df.write.parquet(
                "s3a://dtsodin/flag/flag_trang_thai_tai_khoan_cancelled.parquet",
                mode="overwrite")
        except Exception as e:
            print("Something went wrong: ", e)
import sys from awsglue.transforms import * from awsglue.utils import getResolvedOptions from pyspark.context import SparkContext from awsglue.context import GlueContext from pyspark.sql import SQLContext from awsglue.job import Job from awsglue.dynamicframe import DynamicFrame from pyspark.sql.functions import lit from pyspark.sql.types import DateType from datetime import datetime import os # Create a Glue context glueContext = GlueContext(SparkContext.getOrCreate()) spark = glueContext.spark_session sqlContext = SQLContext(spark) # mode = "zeppelin" mode = "glue" args = getResolvedOptions(sys.argv,['JOB_NAME','env_prefix', 'table']) jobName = args['JOB_NAME'] envPrefix = args['env_prefix'] table = args['table'] print ("Job: {}".format(jobName)) print ("Environment: {}".format(envPrefix)) print ("Table: {}".format(table)) try: sql = """WITH cte_ra_contacts as ( SELECT ACCT_ROLE.CUST_ACCOUNT_ROLE_ID AS CONTACT_ID, SUBSTR (PARTY.PERSON_LAST_NAME, 1, 50) AS last_name,
import datetime import logging from awsglue import DynamicFrame logging.basicConfig() logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) # @params: [JOB_NAME] args = getResolvedOptions(sys.argv, [ "JOB_NAME", "src_db_name", "src_tbl_name", "datalake_bkt_name", "datalake_bkt_prefix" ]) sc = SparkContext() glueContext = GlueContext(sc) spark = glueContext.spark_session job = Job(glueContext) job.init(args["JOB_NAME"], args) logger.info(f'{{"starting_job": "{args["JOB_NAME"]}"}}') data_frame_datasource0 = glueContext.create_data_frame.from_catalog( database=args["src_db_name"], table_name=args["src_tbl_name"], transformation_ctx="datasource0", additional_options={ "startingPosition": "TRIM_HORIZON", "inferSchema": "true" })
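# --- Hedged sketch: the excerpt stops right after the streaming source is created. Glue streaming
# jobs typically drain such a data frame with GlueContext.forEachBatch; the batch function and the
# checkpoint location below are illustrative assumptions, not part of the original job:

def process_batch(data_frame, batch_id):
    # land each non-empty micro-batch in the data lake bucket passed as job arguments
    if data_frame.count() > 0:
        data_frame.write.mode("append").parquet(
            "s3://{}/{}".format(args["datalake_bkt_name"], args["datalake_bkt_prefix"]))

glueContext.forEachBatch(
    frame=data_frame_datasource0,
    batch_function=process_batch,
    options={
        "windowSize": "100 seconds",
        # assumed checkpoint path; any writable S3 prefix works
        "checkpointLocation": "s3://{}/checkpoints/".format(args["datalake_bkt_name"])
    })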
import sys from awsglue.transforms import * from awsglue.dynamicframe import DynamicFrame from awsglue.utils import getResolvedOptions from pyspark.context import SparkContext from pyspark.sql.functions import lit from awsglue.context import GlueContext from awsglue.job import Job ## @params: [JOB_NAME] args = getResolvedOptions(sys.argv, ['JOB_NAME']) sc = SparkContext() glueContext = GlueContext(sc) spark = glueContext.spark_session job = Job(glueContext) job.init(args['JOB_NAME'], args) ## @type: DataSource ## @args: [database = "green__tripdata_staging", table_name = "stagingfhv_tripdata_2015_01_csv", transformation_ctx = "datasource0"] ## @return: datasource0 ## @inputs: [] origin = glueContext.create_dynamic_frame_from_options( connection_type="s3", connection_options={ "paths": ["s3://taxi-data-etl/fhv_trippdata_test.csv"] }, format="csv", format_options={ "withHeader": True, "separator": "," })
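# --- Hedged sketch: the excerpt ends after the CSV read. A typical next step (assumed here, not
# shown in the original) is to stamp the frame and land it as Parquet before committing the job:

df = origin.toDF().withColumn("source_file", lit("fhv_trippdata_test.csv"))  # lit imported above
destination = DynamicFrame.fromDF(df, glueContext, "destination")
glueContext.write_dynamic_frame.from_options(
    frame=destination,
    connection_type="s3",
    connection_options={"path": "s3://taxi-data-etl/processed/"},  # illustrative output path
    format="parquet")
job.commit()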
def main(): to_s3 = 'to-s3' to_jdbc = 'to-jdbc' parser = argparse.ArgumentParser(prog=sys.argv[0]) parser.add_argument('-m', '--mode', required=True, choices=[to_s3, to_jdbc], help='Choose to migrate from datacatalog to s3 or to metastore') parser.add_argument('--database-names', required=True, help='Semicolon-separated list of names of database in Datacatalog to export') parser.add_argument('-o', '--output-path', required=False, help='Output path, either local directory or S3 path') parser.add_argument('-c', '--connection-name', required=False, help='Glue Connection name for Hive metastore JDBC connection') parser.add_argument('-R', '--region', required=False, help='AWS region of source Glue DataCatalog, default to "us-east-1"') options = get_options(parser, sys.argv) if options['mode'] == to_s3: validate_options_in_mode( options=options, mode=to_s3, required_options=['output_path'], not_allowed_options=['connection_name'] ) elif options['mode'] == to_jdbc: validate_options_in_mode( options=options, mode=to_jdbc, required_options=['connection_name'], not_allowed_options=['output_path'] ) else: raise AssertionError('unknown mode ' + options['mode']) validate_aws_regions(options['region']) # spark env (conf, sc, sql_context) = get_spark_env() glue_context = GlueContext(sc) # extract from datacatalog reader database_arr = options['database_names'].split(';') (databases, tables, partitions) = read_databases_from_catalog( sql_context=sql_context, glue_context=glue_context, datacatalog_name='datacatalog', database_arr=database_arr, region=options.get('region') or 'us-east-1' ) if options['mode'] == to_s3: output_path = get_output_dir(options['output_path']) datacatalog_migrate_to_s3( databases=databases, tables=tables, partitions=partitions, output_path=output_path ) elif options['mode'] == to_jdbc: connection_name = options['connection_name'] datacatalog_migrate_to_hive_metastore( sc=sc, sql_context=sql_context, databases=databases, tables=tables, partitions=partitions, connection=glue_context.extract_jdbc_conf(connection_name) )
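# --- Note (hedged): as excerpted, main() is defined but never invoked; the migration scripts in
# this collection presumably end with the standard entry-point guard:

if __name__ == '__main__':
    main()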