def create_EMR_cluster(cluster_name, emr_version, subnet_ids):
    """Create an EMR cluster, publish its id via the XCom sidecar file, and wait for it.

    Args:
        cluster_name: Display name for the new EMR cluster.
        emr_version: EMR release label (e.g. "emr-5.30.0").
        subnet_ids: Subnet id(s) forwarded to the cluster-creation request.

    Returns:
        The result of ``EmrClusterController.wait_for_cluster_creation`` once the
        cluster is ready.
    """
    import os  # local import so the file's top-level imports are untouched

    cluster_id = EmrClusterController.create_cluster_job_execution(
        cluster_name, emr_version, subnet_ids)
    print("Waiting for Cluster: ", cluster_id)
    # Written for Airflow's pod XCom mechanism, which reads
    # /airflow/xcom/return.json from the task container.
    # Ensure the sidecar directory exists before writing (previously an
    # uncaught FileNotFoundError if the mount was absent).
    os.makedirs("/airflow/xcom", exist_ok=True)
    xcom_return = {"clusterId": cluster_id}
    with open("/airflow/xcom/return.json", "w") as file:
        json.dump(xcom_return, file)
    return EmrClusterController.wait_for_cluster_creation(cluster_id)
def configure_job(cluster_id, s3_jar_path):
    """Copy the job jar from S3 onto the cluster master via an EMR step.

    NOTE(review): a second `configure_job` is defined later in this file and
    shadows this one at import time — confirm which definition is intended.

    Raises:
        RuntimeError: if the copy step finishes with status FAILED.
    """
    copy_cmd = ['aws', 's3', 'cp', s3_jar_path, "/home/hadoop/"]
    step = EmrClusterController.add_job_step(
        cluster_id, "Get-Jars", "command-runner.jar", copy_cmd)
    EmrClusterController.wait_for_step_completion(cluster_id, step)
    outcome = EmrClusterController.get_step_status(cluster_id, step)
    if outcome == "FAILED":
        print("GET JAR FROM S3 FAILED")
        raise RuntimeError("Get Jar Failed During Execution: Reason documented in logs probably...?")
    if outcome == "COMPLETED":
        print("GET JAR FROM S3 COMPLETED SUCCESSFULLY")
def spark_submit(cluster_id, jar_path):
    """Run spark-submit for the given jar as an EMR step and wait for it.

    NOTE(review): `spark_submit` is redefined twice later in this file, so this
    definition is shadowed at import time — confirm which one is intended.

    Raises:
        RuntimeError: if the step finishes with status FAILED.
    """
    submit_cmd = ['spark-submit', '--class', 'com.ricardo.farias.App', jar_path]
    step = EmrClusterController.add_job_step(
        cluster_id, "Spark-Submit", "command-runner.jar", submit_cmd)
    EmrClusterController.wait_for_step_completion(cluster_id, step)
    outcome = EmrClusterController.get_step_status(cluster_id, step)
    if outcome == "FAILED":
        print("SPARK SUBMIT JOB FAILED")
        raise RuntimeError("Spark Job Failed During Execution: Reason documented in logs probably...?")
    if outcome == "COMPLETED":
        print("SPARK SUBMIT JOB COMPLETED SUCCESSFULLY")
def get_credentials(**kwargs):
    """Airflow task: copy the AWS credentials file from S3 to the master node.

    Pulls the cluster id from the `create_cluster` task's XCom.

    Raises:
        RuntimeError: if the copy step finishes with status FAILED.
    """
    cluster_id = kwargs['ti'].xcom_pull(task_ids='create_cluster')
    copy_cmd = ["aws", "s3", "cp",
                "s3://emr-configuration-scripts/credentials",
                "/home/hadoop/.aws/"]
    step = EmrClusterController.add_job_step(
        cluster_id, "Get-Credentials", "command-runner.jar", copy_cmd)
    EmrClusterController.wait_for_step_completion(cluster_id, step)
    outcome = EmrClusterController.get_step_status(cluster_id, step)
    if outcome == "FAILED":
        print("GET CREDENTIALS FROM S3 FAILED")
        raise RuntimeError("Get Credentials Failed During Execution: Reason documented in logs probably...?")
    if outcome == "COMPLETED":
        print("GET CREDENTIALS FROM S3 COMPLETED SUCCESSFULLY")
def spark_submit(**kwargs):
    """Airflow task: spark-submit the SparkPractice assembly jar on the cluster.

    Pulls the cluster id from the `create_cluster` task's XCom.

    NOTE(review): shadows the `spark_submit(cluster_id, jar_path)` defined
    earlier and is itself shadowed by a later redefinition — verify intent.

    Raises:
        RuntimeError: if the step finishes with status FAILED.
    """
    cluster_id = kwargs['ti'].xcom_pull(task_ids='create_cluster')
    submit_cmd = ['spark-submit', '--class', 'com.ricardo.farias.App',
                  "/home/hadoop/SparkPractice-assembly-0.1.jar"]
    step = EmrClusterController.add_job_step(
        cluster_id, "Spark-Submit", "command-runner.jar", submit_cmd)
    EmrClusterController.wait_for_step_completion(cluster_id, step)
    outcome = EmrClusterController.get_step_status(cluster_id, step)
    if outcome == "FAILED":
        print("SPARK SUBMIT JOB FAILED")
        raise RuntimeError("Spark Job Failed During Execution: Reason documented in logs probably...?")
    if outcome == "COMPLETED":
        print("SPARK SUBMIT JOB COMPLETED SUCCESSFULLY")
def get_jar(**kwargs):
    """Airflow task: copy the SparkPractice assembly jar from S3 to the master node.

    Pulls the cluster id from the `create_cluster` task's XCom.

    Raises:
        RuntimeError: if the copy step finishes with status FAILED.
    """
    cluster_id = kwargs['ti'].xcom_pull(task_ids='create_cluster')
    copy_cmd = ['aws', 's3', 'cp',
                's3://emr-configuration-scripts/SparkPractice-assembly-0.1.jar',
                "/home/hadoop/"]
    step = EmrClusterController.add_job_step(
        cluster_id, "Get-Jars", "command-runner.jar", copy_cmd)
    EmrClusterController.wait_for_step_completion(cluster_id, step)
    outcome = EmrClusterController.get_step_status(cluster_id, step)
    if outcome == "FAILED":
        print("GET JAR FROM S3 FAILED")
        raise RuntimeError("Get Jar Failed During Execution: Reason documented in logs probably...?")
    if outcome == "COMPLETED":
        print("GET JAR FROM S3 COMPLETED SUCCESSFULLY")
def configure_job(cluster_id, data_product):
    """Prepare the cluster to run a data product's Spark job.

    Runs two EMR steps through command-runner.jar:
      1. copy the AWS credentials file from S3 to /home/hadoop/.aws/
      2. copy the selected product's assembly jar to /home/hadoop/

    Args:
        cluster_id: Id of the running EMR cluster.
        data_product: Either 'citi_bike' or 'covid'.

    Raises:
        RuntimeError: if either copy step reports FAILED, or if
            `data_product` is not a recognized option.
    """
    cred_step = EmrClusterController.add_job_step(
        cluster_id, "Get-Credentials", "command-runner.jar",
        ["aws", "s3", "cp",
         "s3://art-emr-configuration-scripts/credentials",
         "/home/hadoop/.aws/"])
    EmrClusterController.wait_for_step_completion(cluster_id, cred_step)
    cred_status = EmrClusterController.get_step_status(cluster_id, cred_step)
    if cred_status == "FAILED":
        print("GET CREDENTIALS FROM S3 FAILED")
        raise RuntimeError("Get Credentials Failed During Execution: Reason documented in logs probably...?")
    elif cred_status == "COMPLETED":
        print("GET CREDENTIALS FROM S3 COMPLETED SUCCESSFULLY")

    # Table lookup instead of an if/elif chain; the error now names the
    # offending value instead of only saying it was invalid.
    product_jars = {
        'citi_bike': 's3://art-emr-configuration-scripts/CitiBikeDataProduct-assembly-0.1.jar',
        'covid': 's3://art-emr-configuration-scripts/SparkPractice-assembly-0.1.jar',
    }
    if data_product not in product_jars:
        raise RuntimeError(f"Invalid data_product Option: {data_product!r}")
    s3_jar_path = product_jars[data_product]

    jar_step = EmrClusterController.add_job_step(
        cluster_id, "Get-Jars", "command-runner.jar",
        ['aws', 's3', 'cp', s3_jar_path, "/home/hadoop/"])
    EmrClusterController.wait_for_step_completion(cluster_id, jar_step)
    jar_status = EmrClusterController.get_step_status(cluster_id, jar_step)
    if jar_status == "FAILED":
        print("GET JAR FROM S3 FAILED")
        raise RuntimeError("Get Jar Failed During Execution: Reason documented in logs probably...?")
    elif jar_status == "COMPLETED":
        print("GET JAR FROM S3 COMPLETED SUCCESSFULLY")
def spark_submit(cluster_id, data_product):
    """Submit the selected data product's Spark job as an EMR step and wait for it.

    Args:
        cluster_id: Id of the running EMR cluster.
        data_product: Either 'citi_bike' or 'covid'.

    Raises:
        RuntimeError: if the step reports FAILED, or if `data_product`
            is not a recognized option.
    """
    # Table lookup instead of an if/elif chain; the error now names the
    # offending value instead of only saying it was invalid.
    product_jars = {
        'citi_bike': '/home/hadoop/CitiBikeDataProduct-assembly-0.1.jar',
        'covid': '/home/hadoop/SparkPractice-assembly-0.1.jar',
    }
    if data_product not in product_jars:
        raise RuntimeError(f"Invalid data_product Option: {data_product!r}")
    jar_path = product_jars[data_product]

    step_spark_submit = EmrClusterController.add_job_step(
        cluster_id, "Spark-Submit", "command-runner.jar",
        ['spark-submit', '--class', 'com.ricardo.farias.App', jar_path])
    EmrClusterController.wait_for_step_completion(cluster_id, step_spark_submit)
    status = EmrClusterController.get_step_status(cluster_id, step_spark_submit)
    if status == "FAILED":
        print("SPARK SUBMIT JOB FAILED")
        raise RuntimeError("Spark Job Failed During Execution: Reason documented in logs probably...?")
    elif status == "COMPLETED":
        print("SPARK SUBMIT JOB COMPLETED SUCCESSFULLY")
def create_livy_session(**kwargs):
    """Airflow task: run a Scala script on the cluster via an Apache Livy session.

    Pulls the cluster id from the `create_cluster` task's XCom, resolves the
    master node DNS, opens a Spark session through Livy, waits for it to go
    idle, submits the statements in ./dags/spark/RddCreation.scala, tracks the
    statement to completion, and finally kills the session.

    NOTE(review): if an intermediate call raises, the session is not killed —
    confirm whether EmrClusterController cleans up abandoned sessions.
    """
    ti = kwargs['ti']
    cluster_id = ti.xcom_pull(task_ids='create_cluster')
    # Livy runs on the master node; all subsequent HTTP calls target this DNS.
    master_dns = EmrClusterController.get_cluster_dns(cluster_id)
    print(f"\n\n MASTER DNS: {master_dns}")
    response_headers = EmrClusterController.create_spark_session(master_dns)
    print(f"Create Spark Session: {response_headers}")
    # Block until the new session reports idle; returns the session URL used
    # for statement submission and teardown.
    session_url = EmrClusterController.wait_for_idle_session(
        master_dns, response_headers)
    spark_response = EmrClusterController.submit_statement(
        session_url, "./dags/spark/RddCreation.scala")
    print(f"Spark Command Response: {spark_response}")
    EmrClusterController.track_statement_progress(master_dns, spark_response.headers)
    EmrClusterController.kill_spark_session(session_url)
def create_emr_cluster(**kwargs):
    """Airflow task: create the EMR cluster used for Livy jobs and return its id.

    The cluster name and EMR release label default to the historical
    hard-coded values but can now be overridden through op_kwargs
    (`cluster_name`, `emr_version`) — backward compatible.

    Returns:
        The new cluster id (pushed to XCom by Airflow's return-value handling).
    """
    cluster_name = kwargs.get("cluster_name", "Livy Cluster")
    emr_version = kwargs.get("emr_version", "emr-5.30.0")
    cluster_id = EmrClusterController.create_cluster_job_execution(
        cluster_name, emr_version)
    return cluster_id
def terminate_cluster(**kwargs):
    """Airflow task: terminate the cluster created by the `create_cluster` task.

    NOTE(review): a positional-argument `terminate_cluster(cluster_id)` defined
    later in this file shadows this one at import time — verify intent.
    """
    task_instance = kwargs['ti']
    EmrClusterController.terminate_cluster(
        task_instance.xcom_pull(task_ids='create_cluster'))
def wait_for_cluster(**kwargs):
    """Airflow task: block until the cluster from `create_cluster` is ready."""
    task_instance = kwargs['ti']
    EmrClusterController.wait_for_cluster_creation(
        task_instance.xcom_pull(task_ids='create_cluster'))
def terminate_cluster(cluster_id):
    """Terminate the given EMR cluster.

    NOTE(review): shadows the **kwargs variant of `terminate_cluster`
    defined earlier in this file — confirm which one the DAG should call.
    """
    EmrClusterController.terminate_cluster(cluster_id)