Пример #1
0
def _run_command_step(cluster_id, step_name, command, log_label, error_label):
    """Submit one command-runner.jar step, wait for it, and verify its status.

    Args:
        cluster_id: Identifier of the EMR cluster the step is added to.
        step_name: Name passed to ``EmrClusterController.add_job_step``.
        command: Argument list handed to ``command-runner.jar``.
        log_label: Upper-case prefix used in the printed status lines.
        error_label: Prefix used in the raised error messages.

    Raises:
        RuntimeError: If the step status is anything other than "COMPLETED".
    """
    step_id = EmrClusterController.add_job_step(cluster_id, step_name, "command-runner.jar", command)
    EmrClusterController.wait_for_step_completion(cluster_id, step_id)
    status = EmrClusterController.get_step_status(cluster_id, step_id)

    if status == "COMPLETED":
        print(log_label + " COMPLETED SUCCESSFULLY")
        return
    if status == "FAILED":
        print(log_label + " FAILED")
        raise RuntimeError(error_label + " Failed During Execution: Reason documented in logs probably...?")

    # Any other status (e.g. a cancelled or interrupted step) also means the
    # command did not finish; previously this case was silently ignored.
    print("{} ENDED WITH STATUS {}".format(log_label, status))
    raise RuntimeError("{} Did Not Complete: step ended with status {!r}".format(error_label, status))


def configure_job(cluster_id, data_product):
    """Copy the AWS credentials and the data product's jar onto the cluster.

    Two steps are run in order: the credentials file is copied from S3 to
    ``/home/hadoop/.aws/``, then the jar selected by ``data_product`` is
    copied to ``/home/hadoop/``.

    Args:
        cluster_id: Identifier of the EMR cluster to configure.
        data_product: Either ``'citi_bike'`` or ``'covid'``.

    Raises:
        RuntimeError: If ``data_product`` is not a known option, or if either
            step does not end with status "COMPLETED".
    """
    # Resolve the jar first so an invalid option fails before any step is
    # submitted to the cluster.
    if data_product == 'citi_bike':
        s3_jar_path = 's3://art-emr-configuration-scripts/CitiBikeDataProduct-assembly-0.1.jar'
    elif data_product == 'covid':
        s3_jar_path = 's3://art-emr-configuration-scripts/SparkPractice-assembly-0.1.jar'
    else:
        raise RuntimeError("Invalid data_product Option")

    _run_command_step(
        cluster_id,
        "Get-Credentials",
        ["aws", "s3", "cp", "s3://art-emr-configuration-scripts/credentials", "/home/hadoop/.aws/"],
        "GET CREDENTIALS FROM S3",
        "Get Credentials",
    )
    _run_command_step(
        cluster_id,
        "Get-Jars",
        ['aws', 's3', 'cp', s3_jar_path, "/home/hadoop/"],
        "GET JAR FROM S3",
        "Get Jar",
    )
Пример #2
0
def configure_job(cluster_id, s3_jar_path):
    """Copy a jar from S3 to ``/home/hadoop/`` on the cluster.

    Args:
        cluster_id: Identifier of the EMR cluster the step is added to.
        s3_jar_path: S3 location of the jar, passed to ``aws s3 cp``.

    Raises:
        RuntimeError: If the step status is anything other than "COMPLETED".
    """
    copy_command = ['aws', 's3', 'cp', s3_jar_path, "/home/hadoop/"]
    step_id = EmrClusterController.add_job_step(cluster_id, "Get-Jars", "command-runner.jar", copy_command)
    EmrClusterController.wait_for_step_completion(cluster_id, step_id)
    status = EmrClusterController.get_step_status(cluster_id, step_id)

    if status == "COMPLETED":
        print("GET JAR FROM S3 COMPLETED SUCCESSFULLY")
        return
    if status == "FAILED":
        print("GET JAR FROM S3 FAILED")
        raise RuntimeError("Get Jar Failed During Execution: Reason documented in logs probably...?")

    # Any other status (e.g. a cancelled or interrupted step) means the jar
    # was not copied; previously this case was silently ignored.
    print("GET JAR FROM S3 ENDED WITH STATUS {}".format(status))
    raise RuntimeError("Get Jar Did Not Complete: step ended with status {!r}".format(status))
Пример #3
0
def spark_submit(cluster_id, jar_path):
    """Run ``spark-submit`` for ``com.ricardo.farias.App`` as a cluster step.

    Args:
        cluster_id: Identifier of the EMR cluster the step is added to.
        jar_path: Path of the application jar handed to ``spark-submit``.

    Raises:
        RuntimeError: If the step status is anything other than "COMPLETED".
    """
    submit_command = ['spark-submit', '--class', 'com.ricardo.farias.App', jar_path]
    step_spark_submit = EmrClusterController.add_job_step(cluster_id, "Spark-Submit", "command-runner.jar",
                                                          submit_command)
    EmrClusterController.wait_for_step_completion(cluster_id, step_spark_submit)
    status = EmrClusterController.get_step_status(cluster_id, step_spark_submit)

    if status == "COMPLETED":
        print("SPARK SUBMIT JOB COMPLETED SUCCESSFULLY")
        return
    if status == "FAILED":
        print("SPARK SUBMIT JOB FAILED")
        raise RuntimeError("Spark Job Failed During Execution: Reason documented in logs probably...?")

    # Any other status (e.g. a cancelled or interrupted step) means the job
    # did not finish; previously this case was silently ignored.
    print("SPARK SUBMIT JOB ENDED WITH STATUS {}".format(status))
    raise RuntimeError("Spark Job Did Not Complete: step ended with status {!r}".format(status))
Пример #4
0
def get_credentials(**kwargs):
    """Copy the credentials file from S3 to ``/home/hadoop/.aws/`` on the cluster.

    The cluster id is pulled from XCom under the ``create_cluster`` task id.

    Args:
        **kwargs: Must contain ``ti``, an object exposing ``xcom_pull``
            (presumably the Airflow task instance - confirm against the DAG).

    Raises:
        KeyError: If ``ti`` is missing from ``kwargs``.
        RuntimeError: If the step status is anything other than "COMPLETED".
    """
    task_instance = kwargs['ti']
    cluster_id = task_instance.xcom_pull(task_ids='create_cluster')

    copy_command = ["aws", "s3", "cp", "s3://emr-configuration-scripts/credentials", "/home/hadoop/.aws/"]
    step_id = EmrClusterController.add_job_step(cluster_id, "Get-Credentials", "command-runner.jar", copy_command)
    EmrClusterController.wait_for_step_completion(cluster_id, step_id)
    status = EmrClusterController.get_step_status(cluster_id, step_id)

    if status == "COMPLETED":
        print("GET CREDENTIALS FROM S3 COMPLETED SUCCESSFULLY")
        return
    if status == "FAILED":
        print("GET CREDENTIALS FROM S3 FAILED")
        raise RuntimeError("Get Credentials Failed During Execution: Reason documented in logs probably...?")

    # Any other status (e.g. a cancelled or interrupted step) means the
    # credentials were not copied; previously this case was silently ignored.
    print("GET CREDENTIALS FROM S3 ENDED WITH STATUS {}".format(status))
    raise RuntimeError("Get Credentials Did Not Complete: step ended with status {!r}".format(status))
Пример #5
0
def spark_submit(**kwargs):
    """Run ``spark-submit`` on ``/home/hadoop/SparkPractice-assembly-0.1.jar``.

    The cluster id is pulled from XCom under the ``create_cluster`` task id.

    Args:
        **kwargs: Must contain ``ti``, an object exposing ``xcom_pull``
            (presumably the Airflow task instance - confirm against the DAG).

    Raises:
        KeyError: If ``ti`` is missing from ``kwargs``.
        RuntimeError: If the step status is anything other than "COMPLETED".
    """
    task_instance = kwargs['ti']
    cluster_id = task_instance.xcom_pull(task_ids='create_cluster')

    submit_command = ['spark-submit', '--class', 'com.ricardo.farias.App',
                      "/home/hadoop/SparkPractice-assembly-0.1.jar"]
    step_id = EmrClusterController.add_job_step(cluster_id, "Spark-Submit", "command-runner.jar", submit_command)
    EmrClusterController.wait_for_step_completion(cluster_id, step_id)
    status = EmrClusterController.get_step_status(cluster_id, step_id)

    if status == "COMPLETED":
        print("SPARK SUBMIT JOB COMPLETED SUCCESSFULLY")
        return
    if status == "FAILED":
        print("SPARK SUBMIT JOB FAILED")
        raise RuntimeError("Spark Job Failed During Execution: Reason documented in logs probably...?")

    # Any other status (e.g. a cancelled or interrupted step) means the job
    # did not finish; previously this case was silently ignored.
    print("SPARK SUBMIT JOB ENDED WITH STATUS {}".format(status))
    raise RuntimeError("Spark Job Did Not Complete: step ended with status {!r}".format(status))
Пример #6
0
def get_jar(**kwargs):
    """Copy ``SparkPractice-assembly-0.1.jar`` from S3 to ``/home/hadoop/``.

    The cluster id is pulled from XCom under the ``create_cluster`` task id.

    Args:
        **kwargs: Must contain ``ti``, an object exposing ``xcom_pull``
            (presumably the Airflow task instance - confirm against the DAG).

    Raises:
        KeyError: If ``ti`` is missing from ``kwargs``.
        RuntimeError: If the step status is anything other than "COMPLETED".
    """
    task_instance = kwargs['ti']
    cluster_id = task_instance.xcom_pull(task_ids='create_cluster')

    copy_command = ['aws', 's3', 'cp', 's3://emr-configuration-scripts/SparkPractice-assembly-0.1.jar',
                    "/home/hadoop/"]
    step_id = EmrClusterController.add_job_step(cluster_id, "Get-Jars", "command-runner.jar", copy_command)
    EmrClusterController.wait_for_step_completion(cluster_id, step_id)
    status = EmrClusterController.get_step_status(cluster_id, step_id)

    if status == "COMPLETED":
        print("GET JAR FROM S3 COMPLETED SUCCESSFULLY")
        return
    if status == "FAILED":
        print("GET JAR FROM S3 FAILED")
        raise RuntimeError("Get Jar Failed During Execution: Reason documented in logs probably...?")

    # Any other status (e.g. a cancelled or interrupted step) means the jar
    # was not copied; previously this case was silently ignored.
    print("GET JAR FROM S3 ENDED WITH STATUS {}".format(status))
    raise RuntimeError("Get Jar Did Not Complete: step ended with status {!r}".format(status))
Пример #7
0
def spark_submit(cluster_id, data_product):
    """Run ``spark-submit`` for the jar that belongs to ``data_product``.

    Args:
        cluster_id: Identifier of the EMR cluster the step is added to.
        data_product: Either ``'citi_bike'`` or ``'covid'``; selects which jar
            under ``/home/hadoop/`` is submitted.

    Raises:
        RuntimeError: If ``data_product`` is not a known option, or if the
            step status is anything other than "COMPLETED".
    """
    if data_product == 'citi_bike':
        jar_path = '/home/hadoop/CitiBikeDataProduct-assembly-0.1.jar'
    elif data_product == 'covid':
        jar_path = '/home/hadoop/SparkPractice-assembly-0.1.jar'
    else:
        raise RuntimeError("Invalid data_product Option")

    submit_command = ['spark-submit', '--class', 'com.ricardo.farias.App', jar_path]
    step_spark_submit = EmrClusterController.add_job_step(cluster_id, "Spark-Submit", "command-runner.jar",
                                                          submit_command)
    EmrClusterController.wait_for_step_completion(cluster_id, step_spark_submit)
    status = EmrClusterController.get_step_status(cluster_id, step_spark_submit)

    if status == "COMPLETED":
        print("SPARK SUBMIT JOB COMPLETED SUCCESSFULLY")
        return
    if status == "FAILED":
        print("SPARK SUBMIT JOB FAILED")
        raise RuntimeError("Spark Job Failed During Execution: Reason documented in logs probably...?")

    # Any other status (e.g. a cancelled or interrupted step) means the job
    # did not finish; previously this case was silently ignored.
    print("SPARK SUBMIT JOB ENDED WITH STATUS {}".format(status))
    raise RuntimeError("Spark Job Did Not Complete: step ended with status {!r}".format(status))