def wait_for_task_execution(self, task_execution_arn, max_iterations=2 * 180):
    """
    Wait for Task Execution status to be complete (SUCCESS/ERROR).
    The ``task_execution_arn`` must exist, or a boto3 ClientError will be raised.

    :param str task_execution_arn: TaskExecutionArn
    :param int max_iterations: Maximum number of iterations before timing out.
    :return: Result of task execution.
    :rtype: bool
    :raises AirflowTaskTimeout: If maximum iterations is exceeded.
    :raises AirflowBadRequest: If ``task_execution_arn`` is empty.
    """
    if not task_execution_arn:
        raise AirflowBadRequest("task_execution_arn not specified")

    status = None
    iterations = max_iterations
    while status is None or status in self.TASK_EXECUTION_INTERMEDIATE_STATES:
        task_execution = self.get_conn().describe_task_execution(
            TaskExecutionArn=task_execution_arn
        )
        status = task_execution["Status"]
        self.log.info("status=%s", status)
        iterations -= 1
        if status in self.TASK_EXECUTION_FAILURE_STATES:
            break
        if status in self.TASK_EXECUTION_SUCCESS_STATES:
            break
        if iterations <= 0:
            break
        time.sleep(self.wait_interval_seconds)

    if status in self.TASK_EXECUTION_SUCCESS_STATES:
        return True
    if status in self.TASK_EXECUTION_FAILURE_STATES:
        return False
    if iterations <= 0:
        raise AirflowTaskTimeout("Max iterations exceeded!")
    raise AirflowException("Unknown status: %s" % status)  # Should never happen
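# --- Illustration (not part of the source) ---
# A minimal, self-contained sketch of the polling pattern wait_for_task_execution()
# implements. The state sets and the poll_until_terminal() helper are assumptions
# for illustration, not the hook's actual API.
import time

SUCCESS_STATES = {"SUCCESS"}  # hypothetical stand-in for TASK_EXECUTION_SUCCESS_STATES
FAILURE_STATES = {"ERROR"}    # hypothetical stand-in for TASK_EXECUTION_FAILURE_STATES


def poll_until_terminal(fetch_status, wait_interval_seconds=5, max_iterations=2 * 180):
    """Poll fetch_status() until it returns a terminal state, or time out."""
    for _ in range(max_iterations):
        status = fetch_status()
        if status in SUCCESS_STATES:
            return True
        if status in FAILURE_STATES:
            return False
        time.sleep(wait_interval_seconds)  # still intermediate; poll again later
    raise TimeoutError("Max iterations exceeded!")


# Usage: statuses = iter(["LAUNCHING", "TRANSFERRING", "SUCCESS"])
#        assert poll_until_terminal(lambda: next(statuses), wait_interval_seconds=0)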
def create_session(self):
    """Create a session in Livy and remember its id."""
    headers = {"X-Requested-By": "airflow", "Content-Type": "application/json"}
    unfiltered_payload = {
        "kind": self.kind,
        "proxyUser": self.proxy_user,
        "jars": self.jars,
        "pyFiles": self.py_files,
        "files": self.files,
        "driverMemory": self.driver_memory,
        "driverCores": self.driver_cores,
        "executorMemory": self.executor_memory,
        "executorCores": self.executor_cores,
        "numExecutors": self.num_executors,
        "archives": self.archives,
        "queue": self.queue,
        "name": self.name,
        "conf": self.conf,
        "heartbeatTimeoutInSecond": self.heartbeat_timeout,
    }
    # Drop empty values so Livy falls back to its own defaults.
    payload = {k: v for k, v in unfiltered_payload.items() if v}
    logging.info(
        f"Creating a session in Livy... "
        f"Payload:\n{json.dumps(payload, indent=2)}"
    )
    response = HttpHook(http_conn_id=self.http_conn_id).run(
        ENDPOINT,
        json.dumps(payload),
        headers,
    )
    try:
        session_id = json.loads(response.content)["id"]
    except (JSONDecodeError, LookupError) as ex:
        log_response_error("$.id", response)
        raise AirflowBadRequest(ex)
    if not isinstance(session_id, Number):
        raise AirflowException(
            f"ID of the created session is not a number ({session_id}). "
            "Are you sure we're calling Livy API?"
        )
    self.session_id = session_id
def create_database(self, database_name):
    """
    Creates a new database in CosmosDB.
    """
    if database_name is None:
        raise AirflowBadRequest("Database name cannot be None.")

    # We need to check to see if this database already exists so we don't try
    # to create it twice
    existing_database = list(self.get_conn().QueryDatabases({
        "query": "SELECT * FROM r WHERE r.id=@id",
        "parameters": [{"name": "@id", "value": database_name}]
    }))

    # Only create if we did not find it already existing
    if len(existing_database) == 0:
        self.get_conn().CreateDatabase({"id": database_name})
def get_documents(self, sql_string, database_name=None,
                  collection_name=None, partition_key=None):
    """
    Get a list of documents from an existing collection in the CosmosDB
    database via SQL query.
    """
    if sql_string is None:
        raise AirflowBadRequest("SQL query string cannot be None")

    # Query them in SQL
    query = {'query': sql_string}

    try:
        result_iterable = self.get_conn().QueryItems(
            get_collection_link(
                self.__get_database_name(database_name),
                self.__get_collection_name(collection_name)),
            query,
            partition_key)
        return list(result_iterable)
    except HTTPFailure:
        return None
def create_collection(self, collection_name: str,
                      database_name: Optional[str] = None) -> None:
    """Creates a new collection in the CosmosDB database."""
    if collection_name is None:
        raise AirflowBadRequest("Collection name cannot be None.")

    # We need to check to see if this container already exists so we don't try
    # to create it twice
    existing_container = list(
        self.get_conn()
        .get_database_client(self.__get_database_name(database_name))
        .query_containers(
            "SELECT * FROM r WHERE r.id=@id",
            parameters=[{"name": "@id", "value": collection_name}],
        )
    )

    # Only create if we did not find it already existing
    if len(existing_container) == 0:
        self.get_conn().get_database_client(
            self.__get_database_name(database_name)
        ).create_container(collection_name)
def submit_batch(self):
    """Submit a batch job to Livy and remember its id."""
    headers = {"X-Requested-By": "airflow", "Content-Type": "application/json"}
    unfiltered_payload = {
        "file": self.file,
        "proxyUser": self.proxy_user,
        "className": self.class_name,
        "args": self.arguments,
        "jars": self.jars,
        "pyFiles": self.py_files,
        "files": self.files,
        "driverMemory": self.driver_memory,
        "driverCores": self.driver_cores,
        "executorMemory": self.executor_memory,
        "executorCores": self.executor_cores,
        "numExecutors": self.num_executors,
        "archives": self.archives,
        "queue": self.queue,
        "name": self.name,
        "conf": self.conf,
    }
    # Drop empty values so Livy falls back to its own defaults.
    payload = {k: v for k, v in unfiltered_payload.items() if v}
    logging.info(
        "Submitting the batch to Livy... "
        "Payload:\n{payload}".format(payload=json.dumps(payload, indent=2))
    )
    response = HttpHook(http_conn_id=self.http_conn_id_livy).run(
        LIVY_ENDPOINT, json.dumps(payload), headers
    )
    try:
        batch_id = json.loads(response.content)["id"]
    except (JSONDecodeError, LookupError) as ex:
        log_response_error("$.id", response)
        raise AirflowBadRequest(ex)
    if not isinstance(batch_id, Number):
        raise AirflowException(
            "ID of the created batch is not a number ({batch_id}). "
            "Are you sure we're calling Livy API?".format(batch_id=batch_id)
        )
    self.batch_id = batch_id
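# --- Illustration (not part of the source) ---
# Both create_session() and submit_batch() drop falsy values from the payload so
# the JSON sent to Livy only carries fields the user actually set, leaving the
# rest to Livy's defaults. A self-contained sketch (the values are made up):
import json

unfiltered = {"file": "wordcount.py", "proxyUser": None, "args": [], "numExecutors": 2}
payload = {k: v for k, v in unfiltered.items() if v}
assert payload == {"file": "wordcount.py", "numExecutors": 2}
print(json.dumps(payload, indent=2))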
def create_collection(self, collection_name, database_name=None):
    """
    Creates a new collection in the CosmosDB database.
    """
    if collection_name is None:
        raise AirflowBadRequest("Collection name cannot be None.")

    # We need to check to see if this container already exists so we don't try
    # to create it twice
    existing_container = list(self.get_conn().QueryContainers(
        get_database_link(self.__get_database_name(database_name)), {
            "query": "SELECT * FROM r WHERE r.id=@id",
            "parameters": [
                {"name": "@id", "value": collection_name}
            ]
        }))

    # Only create if we did not find it already existing
    if len(existing_container) == 0:
        self.get_conn().CreateContainer(
            get_database_link(self.__get_database_name(database_name)),
            {"id": collection_name})
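# --- Illustration (not part of the source) ---
# Both create_collection() variants (and create_database()) follow the same
# "query first, create only if absent" idiom to stay idempotent. A self-contained
# sketch of the idiom over a plain dict standing in for the CosmosDB client;
# create_if_missing() is a hypothetical helper. Note the check and the create are
# not atomic, so concurrent callers can still race.
def create_if_missing(store: dict, name: str) -> None:
    """Create `name` in `store` only if it does not already exist."""
    existing = [k for k in store if k == name]  # the SELECT * FROM r WHERE r.id=@id step
    if len(existing) == 0:
        store[name] = {}  # the CreateContainer / create_container step


containers: dict = {}
create_if_missing(containers, "my_collection")
create_if_missing(containers, "my_collection")  # second call is a no-op
assert list(containers) == ["my_collection"]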
def _check_yarn_app_status(self, app_id):
    """
    Verifies whether this YARN job has succeeded or failed by querying
    the YARN Resource Manager.

    :param app_id: the YARN application ID
    :raises AirflowException: when the job is verified to have failed
    """
    self.log.info("Getting app status (id=%s) from YARN RM REST API...", app_id)
    endpoint = f"{YARN_ENDPOINT}/{app_id}"
    response = self.LocalConnHttpHook(
        self, method="GET", http_conn_id='yarn_conn_id'
    ).run(endpoint)
    try:
        status = json.loads(response.content)["app"]["finalStatus"]
    except (JSONDecodeError, LookupError, TypeError) as ex:
        self._log_response_error("$.app.finalStatus", response)
        raise AirflowBadRequest(ex)
    expected_status = "SUCCEEDED"
    if status != expected_status:
        raise AirflowException(
            f"YARN app {app_id} is '{status}', expected status: '{expected_status}'"
        )
def check_spark_app_status(self, app_id):
    """Verify that every job of a Spark application has succeeded."""
    logging.info(f"Getting app status (id={app_id}) from Spark REST API...")
    endpoint = f"{SPARK_ENDPOINT}/{app_id}/jobs"
    response = HttpHook(
        method="GET", http_conn_id=self.http_conn_id_spark
    ).run(endpoint)
    try:
        jobs = json.loads(response.content)
        expected_status = "SUCCEEDED"
        for job in jobs:
            job_id = job["jobId"]
            job_status = job["status"]
            logging.info(
                f"Job id {job_id} associated with application '{app_id}' "
                f"is '{job_status}'"
            )
            if job_status != expected_status:
                raise AirflowException(
                    f"Job id '{job_id}' associated with application '{app_id}' "
                    f"is '{job_status}', expected status is '{expected_status}'"
                )
    except (JSONDecodeError, LookupError, TypeError) as ex:
        log_response_error("$.jobId, $.status", response)
        raise AirflowBadRequest(ex)
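# --- Illustration (not part of the source) ---
# The shape of the Spark REST /applications/{app-id}/jobs response that
# check_spark_app_status() iterates over; this sample payload is made up.
import json

sample = json.loads('[{"jobId": 0, "status": "SUCCEEDED"}, {"jobId": 1, "status": "SUCCEEDED"}]')
assert all(job["status"] == "SUCCEEDED" for job in sample)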
def get_documents(
    self,
    sql_string: str,
    database_name: Optional[str] = None,
    collection_name: Optional[str] = None,
    partition_key: Optional[str] = None,
) -> Optional[list]:
    """Get a list of documents from an existing collection in the CosmosDB
    database via SQL query."""
    if sql_string is None:
        raise AirflowBadRequest("SQL query string cannot be None")

    # Query them in SQL
    query = {'query': sql_string}

    try:
        result_iterable = (
            self.get_conn()
            .get_database_client(self.__get_database_name(database_name))
            .get_container_client(self.__get_collection_name(collection_name))
            # Pass partition_key by keyword: the second positional parameter
            # of query_items() is `parameters`, not `partition_key`.
            .query_items(query, partition_key=partition_key)
        )
        return list(result_iterable)
    except CosmosHttpResponseError:
        return None
def poke(self, context):
    """Return True once the Livy batch has finished successfully."""
    logging.info("Getting batch {batch_id} status...".format(
        batch_id=self.batch_id))
    endpoint = "{LIVY_ENDPOINT}/{batch_id}".format(
        LIVY_ENDPOINT=LIVY_ENDPOINT, batch_id=self.batch_id)
    response = HttpHook(
        method="GET", http_conn_id=self.http_conn_id).run(endpoint)
    try:
        state = json.loads(response.content)["state"]
    except (JSONDecodeError, LookupError) as ex:
        log_response_error("$.state", response, self.batch_id)
        raise AirflowBadRequest(ex)
    if state in VALID_BATCH_STATES:
        logging.info(
            "Batch {batch_id} has not finished yet (state is '{state}')".format(
                batch_id=self.batch_id, state=state))
        return False
    if state == "success":
        logging.info("Batch {batch_id} has finished successfully!".format(
            batch_id=self.batch_id))
        return True
    raise AirflowException(
        "Batch {batch_id} failed with state '{state}'".format(
            batch_id=self.batch_id, state=state))
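# --- Illustration (not part of the source) ---
# poke() follows the usual Airflow sensor contract: return False to keep waiting,
# True when done, raise to fail the task. A self-contained sketch of that state
# mapping; the non-terminal state set below is a hypothetical stand-in for
# VALID_BATCH_STATES.
NON_TERMINAL_STATES = {"not_started", "starting", "running", "busy", "idle"}  # hypothetical


def classify(state: str) -> bool:
    """False = keep polling, True = success, exception = failure."""
    if state in NON_TERMINAL_STATES:
        return False
    if state == "success":
        return True
    raise RuntimeError(f"Batch failed with state '{state}'")


assert classify("running") is False
assert classify("success") is True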
def delete_database(self, database_name: str) -> None:
    """Deletes an existing database in CosmosDB."""
    if database_name is None:
        raise AirflowBadRequest("Database name cannot be None.")

    self.get_conn().delete_database(database_name)