def configure_job(cluster_id, data_product):
    # Stage the AWS credentials file from S3 onto the cluster's master node.
    step_get_credentials = EmrClusterController.add_job_step(
        cluster_id, "Get-Credentials", "command-runner.jar",
        ["aws", "s3", "cp", "s3://art-emr-configuration-scripts/credentials", "/home/hadoop/.aws/"])
    EmrClusterController.wait_for_step_completion(cluster_id, step_get_credentials)
    status = EmrClusterController.get_step_status(cluster_id, step_get_credentials)
    if status == "FAILED":
        print("GET CREDENTIALS FROM S3 FAILED")
        raise RuntimeError("Get Credentials Failed During Execution: Reason documented in logs probably...?")
    elif status == "COMPLETED":
        print("GET CREDENTIALS FROM S3 COMPLETED SUCCESSFULLY")

    # Pick the assembly jar that matches the requested data product.
    if data_product == 'citi_bike':
        s3_jar_path = 's3://art-emr-configuration-scripts/CitiBikeDataProduct-assembly-0.1.jar'
    elif data_product == 'covid':
        s3_jar_path = 's3://art-emr-configuration-scripts/SparkPractice-assembly-0.1.jar'
    else:
        raise RuntimeError("Invalid data_product Option")

    # Copy the selected jar from S3 to the master node for spark-submit to use.
    step_id = EmrClusterController.add_job_step(
        cluster_id, "Get-Jars", "command-runner.jar",
        ['aws', 's3', 'cp', s3_jar_path, "/home/hadoop/"])
    EmrClusterController.wait_for_step_completion(cluster_id, step_id)
    status = EmrClusterController.get_step_status(cluster_id, step_id)
    if status == "FAILED":
        print("GET JAR FROM S3 FAILED")
        raise RuntimeError("Get Jar Failed During Execution: Reason documented in logs probably...?")
    elif status == "COMPLETED":
        print("GET JAR FROM S3 COMPLETED SUCCESSFULLY")

def configure_job(cluster_id, s3_jar_path):
    # Variant that takes the jar's S3 path directly instead of a data_product name.
    step_id = EmrClusterController.add_job_step(
        cluster_id, "Get-Jars", "command-runner.jar",
        ['aws', 's3', 'cp', s3_jar_path, "/home/hadoop/"])
    EmrClusterController.wait_for_step_completion(cluster_id, step_id)
    status = EmrClusterController.get_step_status(cluster_id, step_id)
    if status == "FAILED":
        print("GET JAR FROM S3 FAILED")
        raise RuntimeError("Get Jar Failed During Execution: Reason documented in logs probably...?")
    elif status == "COMPLETED":
        print("GET JAR FROM S3 COMPLETED SUCCESSFULLY")

def spark_submit(cluster_id, jar_path):
    # Run spark-submit on the cluster against a jar already staged on the master node.
    step_spark_submit = EmrClusterController.add_job_step(
        cluster_id, "Spark-Submit", "command-runner.jar",
        ['spark-submit', '--class', 'com.ricardo.farias.App', jar_path])
    EmrClusterController.wait_for_step_completion(cluster_id, step_spark_submit)
    status = EmrClusterController.get_step_status(cluster_id, step_spark_submit)
    if status == "FAILED":
        print("SPARK SUBMIT JOB FAILED")
        raise RuntimeError("Spark Job Failed During Execution: Reason documented in logs probably...?")
    elif status == "COMPLETED":
        print("SPARK SUBMIT JOB COMPLETED SUCCESSFULLY")

def get_credentials(**kwargs):
    # Airflow task callable: the cluster id is pulled from the create_cluster
    # task's XCom rather than passed as an argument.
    ti = kwargs['ti']
    cluster_id = ti.xcom_pull(task_ids='create_cluster')
    step_id = EmrClusterController.add_job_step(
        cluster_id, "Get-Credentials", "command-runner.jar",
        ["aws", "s3", "cp", "s3://emr-configuration-scripts/credentials", "/home/hadoop/.aws/"])
    EmrClusterController.wait_for_step_completion(cluster_id, step_id)
    status = EmrClusterController.get_step_status(cluster_id, step_id)
    if status == "FAILED":
        print("GET CREDENTIALS FROM S3 FAILED")
        raise RuntimeError("Get Credentials Failed During Execution: Reason documented in logs probably...?")
    elif status == "COMPLETED":
        print("GET CREDENTIALS FROM S3 COMPLETED SUCCESSFULLY")

def spark_submit(**kwargs):
    # Airflow task callable: spark-submit the jar staged by get_jar.
    ti = kwargs['ti']
    cluster_id = ti.xcom_pull(task_ids='create_cluster')
    step_id = EmrClusterController.add_job_step(
        cluster_id, "Spark-Submit", "command-runner.jar",
        ['spark-submit', '--class', 'com.ricardo.farias.App', "/home/hadoop/SparkPractice-assembly-0.1.jar"])
    EmrClusterController.wait_for_step_completion(cluster_id, step_id)
    status = EmrClusterController.get_step_status(cluster_id, step_id)
    if status == "FAILED":
        print("SPARK SUBMIT JOB FAILED")
        raise RuntimeError("Spark Job Failed During Execution: Reason documented in logs probably...?")
    elif status == "COMPLETED":
        print("SPARK SUBMIT JOB COMPLETED SUCCESSFULLY")

def get_jar(**kwargs):
    # Airflow task callable: copy the application jar from S3 onto the master node.
    ti = kwargs['ti']
    cluster_id = ti.xcom_pull(task_ids='create_cluster')
    step_id = EmrClusterController.add_job_step(
        cluster_id, "Get-Jars", "command-runner.jar",
        ['aws', 's3', 'cp', 's3://emr-configuration-scripts/SparkPractice-assembly-0.1.jar', "/home/hadoop/"])
    EmrClusterController.wait_for_step_completion(cluster_id, step_id)
    status = EmrClusterController.get_step_status(cluster_id, step_id)
    if status == "FAILED":
        print("GET JAR FROM S3 FAILED")
        raise RuntimeError("Get Jar Failed During Execution: Reason documented in logs probably...?")
    elif status == "COMPLETED":
        print("GET JAR FROM S3 COMPLETED SUCCESSFULLY")

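# A minimal sketch of how the three task callables above might be wired into an
# Airflow DAG. Assumptions: a create_cluster task whose return value (the cluster
# id) lands in XCom for the downstream tasks to pull; the DAG id, schedule, and
# the EmrClusterController.create_cluster_job_execution call are hypothetical,
# not confirmed by this code. On Airflow 2 the task context (including ti) is
# injected automatically into callables that accept **kwargs; Airflow 1 would
# need provide_context=True on each operator.
from datetime import datetime

from airflow import DAG
from airflow.operators.python import PythonOperator


def create_cluster():
    # Hypothetical helper: provisions an EMR cluster and returns its id,
    # which Airflow stores as this task's XCom value.
    return EmrClusterController.create_cluster_job_execution("emr-spark-cluster")


with DAG(
    dag_id="emr_spark_job",
    start_date=datetime(2021, 1, 1),
    schedule_interval=None,
    catchup=False,
) as dag:
    create = PythonOperator(task_id="create_cluster", python_callable=create_cluster)
    credentials = PythonOperator(task_id="get_credentials", python_callable=get_credentials)
    jar = PythonOperator(task_id="get_jar", python_callable=get_jar)
    submit = PythonOperator(task_id="spark_submit", python_callable=spark_submit)

    create >> credentials >> jar >> submit
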
def spark_submit(cluster_id, data_product):
    # Pick the local jar path that configure_job staged for the requested data product.
    if data_product == 'citi_bike':
        jar_path = '/home/hadoop/CitiBikeDataProduct-assembly-0.1.jar'
    elif data_product == 'covid':
        jar_path = '/home/hadoop/SparkPractice-assembly-0.1.jar'
    else:
        raise RuntimeError("Invalid data_product Option")
    step_spark_submit = EmrClusterController.add_job_step(
        cluster_id, "Spark-Submit", "command-runner.jar",
        ['spark-submit', '--class', 'com.ricardo.farias.App', jar_path])
    EmrClusterController.wait_for_step_completion(cluster_id, step_spark_submit)
    status = EmrClusterController.get_step_status(cluster_id, step_spark_submit)
    if status == "FAILED":
        print("SPARK SUBMIT JOB FAILED")
        raise RuntimeError("Spark Job Failed During Execution: Reason documented in logs probably...?")
    elif status == "COMPLETED":
        print("SPARK SUBMIT JOB COMPLETED SUCCESSFULLY")

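# Outside Airflow, the direct-call variants compose into a short script. A sketch,
# assuming a cluster is already running; the cluster id below is a placeholder.
if __name__ == "__main__":
    cluster_id = "j-XXXXXXXXXXXXX"  # placeholder EMR cluster id
    configure_job(cluster_id, "citi_bike")  # stage credentials and the product's jar
    spark_submit(cluster_id, "citi_bike")   # run the staged jar via spark-submit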