Example No. 1
    def wait_for_task_execution(self,
                                task_execution_arn,
                                max_iterations=2 * 180):
        """
        Wait for the Task Execution to reach a terminal status (SUCCESS/ERROR).
        The ``task_execution_arn`` must exist, or a boto3 ClientError will be raised.

        :param str task_execution_arn: TaskExecutionArn
        :param int max_iterations: Maximum number of iterations before timing out.
        :return: Result of task execution.
        :rtype: bool
        :raises AirflowTaskTimeout: If the maximum number of iterations is exceeded.
        :raises AirflowBadRequest: If ``task_execution_arn`` is empty.
        """
        if not task_execution_arn:
            raise AirflowBadRequest("task_execution_arn not specified")

        status = None
        iterations = max_iterations
        while status is None or status in self.TASK_EXECUTION_INTERMEDIATE_STATES:
            task_execution = self.get_conn().describe_task_execution(
                TaskExecutionArn=task_execution_arn)
            status = task_execution["Status"]
            self.log.info("status=%s", status)
            iterations -= 1
            if status in self.TASK_EXECUTION_FAILURE_STATES:
                break
            if status in self.TASK_EXECUTION_SUCCESS_STATES:
                break
            if iterations <= 0:
                break
            time.sleep(self.wait_interval_seconds)

        if status in self.TASK_EXECUTION_SUCCESS_STATES:
            return True
        if status in self.TASK_EXECUTION_FAILURE_STATES:
            return False
        if iterations <= 0:
            raise AirflowTaskTimeout("Max iterations exceeded!")
        raise AirflowException("Unknown status: %s" %
                               status)  # Should never happen
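The loop above is a bounded poll: re-check the status, count down, sleep, and break on any terminal state. A self-contained sketch of the same pattern (the names here are illustrative only, not part of any Airflow or boto3 API):

    import time

    def poll_until_terminal(get_status, max_iterations=2 * 180,
                            wait_interval_seconds=5):
        """Bounded poll mirroring wait_for_task_execution above."""
        for _ in range(max_iterations):
            status = get_status()
            if status == "SUCCESS":
                return True
            if status == "ERROR":
                return False
            time.sleep(wait_interval_seconds)
        raise TimeoutError("Max iterations exceeded!")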
Example No. 2
    def create_session(self):
        headers = {
            "X-Requested-By": "airflow",
            "Content-Type": "application/json"
        }
        unfiltered_payload = {
            "kind": self.kind,
            "proxyUser": self.proxy_user,
            "jars": self.jars,
            "pyFiles": self.py_files,
            "files": self.files,
            "driverMemory": self.driver_memory,
            "driverCores": self.driver_cores,
            "executorMemory": self.executor_memory,
            "executorCores": self.executor_cores,
            "numExecutors": self.num_executors,
            "archives": self.archives,
            "queue": self.queue,
            "name": self.name,
            "conf": self.conf,
            "heartbeatTimeoutInSecond": self.heartbeat_timeout,
        }
        payload = {k: v for k, v in unfiltered_payload.items() if v}
        logging.info(f"Creating a session in Livy... "
                     f"Payload:\n{json.dumps(payload, indent=2)}")
        response = HttpHook(http_conn_id=self.http_conn_id).run(
            ENDPOINT,
            json.dumps(payload),
            headers,
        )
        try:
            session_id = json.loads(response.content)["id"]
        except (JSONDecodeError, LookupError) as ex:
            log_response_error("$.id", response)
            raise AirflowBadRequest(ex)

        if not isinstance(session_id, Number):
            raise AirflowException(
                f"ID of the created session is not a number ({session_id}). "
                "Are you sure we're calling Livy API?")
        self.session_id = session_id
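Note the "if v" filter when building payload: it drops every falsy value, not just None, so an explicit 0 or empty string silently disappears from the request. A self-contained illustration (the values are made up):

    unfiltered = {"kind": "pyspark", "proxyUser": None, "numExecutors": 0}
    payload = {k: v for k, v in unfiltered.items() if v}
    assert payload == {"kind": "pyspark"}  # numExecutors=0 was dropped too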
Example No. 3
    def create_database(self, database_name):
        """
        Creates a new database in CosmosDB.
        """
        if database_name is None:
            raise AirflowBadRequest("Database name cannot be None.")

        # Check whether this database already exists, so we don't try to
        # create it twice.
        existing_database = list(self.get_conn().QueryDatabases({
            "query": "SELECT * FROM r WHERE r.id=@id",
            "parameters": [
                {"name": "@id", "value": database_name}
            ]
        }))

        # Create the database only if it was not found above.
        if not existing_database:
            self.get_conn().CreateDatabase({"id": database_name})
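Because of the existence check, create_database is safe to call repeatedly. A usage sketch (the hook class name and connection id are assumptions for illustration, not taken from the example):

    # Hypothetical caller; only create_database comes from the example above.
    hook = AzureCosmosDBHook(azure_cosmos_conn_id="azure_cosmos_default")
    hook.create_database("airflow_example_db")  # no-op if it already exists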
Example No. 4
    def get_documents(self, sql_string, database_name=None, collection_name=None, partition_key=None):
        """
        Get a list of documents from an existing collection in the CosmosDB database via SQL query.
        """
        if sql_string is None:
            raise AirflowBadRequest("SQL query string cannot be None")

        # Query them in SQL
        query = {'query': sql_string}

        try:
            result_iterable = self.get_conn().QueryItems(
                get_collection_link(
                    self.__get_database_name(database_name),
                    self.__get_collection_name(collection_name)),
                query,
                partition_key)

            return list(result_iterable)
        except HTTPFailure:
            return None
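Callers must check for None, because the method swallows HTTPFailure instead of propagating it. A usage sketch (the hook object and query text are assumptions):

    docs = hook.get_documents(
        "SELECT * FROM c WHERE c.state = 'active'",  # invented query
        database_name="airflow_example_db",
        collection_name="events")
    if docs is None:
        raise RuntimeError("Cosmos DB query failed")  # the error was swallowed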
Example No. 5
    def create_collection(self,
                          collection_name: str,
                          database_name: Optional[str] = None) -> None:
        """Creates a new collection in the CosmosDB database."""
        if collection_name is None:
            raise AirflowBadRequest("Collection name cannot be None.")

        # Check whether this container already exists, so we don't try to
        # create it twice.
        existing_container = list(self.get_conn().get_database_client(
            self.__get_database_name(database_name)).query_containers(
                "SELECT * FROM r WHERE r.id=@id",
                parameters=[{"name": "@id", "value": collection_name}]))

        # Create the container only if it was not found above.
        if not existing_container:
            self.get_conn().get_database_client(
                self.__get_database_name(database_name)).create_container(
                    collection_name)
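This check-then-create is racy (another client can create the container between the query and the create call), and the azure-cosmos v4 SDK this snippet appears to target also requires a partition key for create_container, which the excerpt omits. A hedged sketch of the same intent using the SDK's create_container_if_not_exists (the partition key path is an assumption for illustration):

    from azure.cosmos import PartitionKey

    database = self.get_conn().get_database_client(
        self.__get_database_name(database_name))
    database.create_container_if_not_exists(
        id=collection_name,
        partition_key=PartitionKey(path="/id"))  # assumed partition key path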
Example No. 6
    def submit_batch(self):
        headers = {
            "X-Requested-By": "airflow",
            "Content-Type": "application/json"
        }
        unfiltered_payload = {
            "file": self.file,
            "proxyUser": self.proxy_user,
            "className": self.class_name,
            "args": self.arguments,
            "jars": self.jars,
            "pyFiles": self.py_files,
            "files": self.files,
            "driverMemory": self.driver_memory,
            "driverCores": self.driver_cores,
            "executorMemory": self.executor_memory,
            "executorCores": self.executor_cores,
            "numExecutors": self.num_executors,
            "archives": self.archives,
            "queue": self.queue,
            "name": self.name,
            "conf": self.conf,
        }
        payload = {k: v for k, v in unfiltered_payload.items() if v}
        logging.info(f"Submitting the batch to Livy... "
                     f"Payload:\n{json.dumps(payload, indent=2)}")
        response = HttpHook(http_conn_id=self.http_conn_id_livy).run(
            LIVY_ENDPOINT, json.dumps(payload), headers)
        try:
            batch_id = json.loads(response.content)["id"]
        except (JSONDecodeError, LookupError) as ex:
            log_response_error("$.id", response)
            raise AirflowBadRequest(ex)
        if not isinstance(batch_id, Number):
            raise AirflowException(
                f"ID of the created batch is not a number ({batch_id}). "
                "Are you sure we're calling Livy API?")
        self.batch_id = batch_id
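For reference, a filtered payload for a PySpark batch might serialize to something like this before being POSTed to Livy (all values invented for illustration):

    payload = {
        "file": "s3://bucket/jobs/etl.py",
        "name": "nightly-etl",
        "executorCores": 2,
        "numExecutors": 10,
    }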
Example No. 7
    def create_collection(self, collection_name, database_name=None):
        """
        Creates a new collection in the CosmosDB database.
        """
        if collection_name is None:
            raise AirflowBadRequest("Collection name cannot be None.")

        # Check whether this container already exists, so we don't try to
        # create it twice.
        existing_container = list(self.get_conn().QueryContainers(
            get_database_link(self.__get_database_name(database_name)), {
                "query": "SELECT * FROM r WHERE r.id=@id",
                "parameters": [
                    {"name": "@id", "value": collection_name}
                ]
            }))

        # Create the container only if it was not found above.
        if not existing_container:
            self.get_conn().CreateContainer(
                get_database_link(self.__get_database_name(database_name)),
                {"id": collection_name})
Example No. 8
    def _check_yarn_app_status(self, app_id):
        """
        Verifies whether this YARN job has succeeded or failed
        by querying the YARN Resource Manager.

        :param app_id: the YARN application ID
        :raises AirflowException: when the job is verified to have failed
        """
        self.log.info("Getting app status (id=%s) from YARN RM REST API...", app_id)
        endpoint = f"{YARN_ENDPOINT}/{app_id}"
        response = self.LocalConnHttpHook(
            self, method="GET", http_conn_id="yarn_conn_id").run(endpoint)
        try:
            status = json.loads(response.content)["app"]["finalStatus"]
        except (JSONDecodeError, LookupError, TypeError) as ex:
            self._log_response_error("$.app.finalStatus", response)
            raise AirflowBadRequest(ex)
        expected_status = "SUCCEEDED"
        if status != expected_status:
            raise AirflowException(
                f"YARN app {app_id} is '{status}', expected status: '{expected_status}'"
            )
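For reference, a successful response from the Resource Manager's /ws/v1/cluster/apps/{appid} endpoint has roughly this shape (abbreviated; field names per the Hadoop RM REST API):

    response_body = {
        "app": {
            "id": "application_1476912658570_0002",
            "state": "FINISHED",
            "finalStatus": "SUCCEEDED",  # or FAILED / KILLED / UNDEFINED
        }
    }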
Example No. 9
    def check_spark_app_status(self, app_id):
        logging.info(f"Getting app status (id={app_id}) from Spark REST API...")
        endpoint = f"{SPARK_ENDPOINT}/{app_id}/jobs"
        response = HttpHook(method="GET",
                            http_conn_id=self.http_conn_id_spark).run(endpoint)
        try:
            jobs = json.loads(response.content)
            expected_status = "SUCCEEDED"
            for job in jobs:
                job_id = job["jobId"]
                job_status = job["status"]
                logging.info(
                    f"Job id {job_id} associated with application '{app_id}' "
                    f"is '{job_status}'")
                if job_status != expected_status:
                    raise AirflowException(
                        f"Job id '{job_id}' associated with application '{app_id}' "
                        f"is '{job_status}', expected status is '{expected_status}'"
                    )
        except (JSONDecodeError, LookupError, TypeError) as ex:
            log_response_error("$.jobId, $.status", response)
            raise AirflowBadRequest(ex)
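Note that this treats any status other than SUCCEEDED, including a job that is still RUNNING, as a failure, so it should only be called once the application has finished. A sketch that tolerates in-flight jobs (this relaxation is an assumption, not part of the original):

    for job in jobs:
        # Only an explicit FAILED status is fatal; RUNNING jobs are ignored.
        if job["status"] == "FAILED":
            raise AirflowException(
                f"Job id '{job['jobId']}' of application '{app_id}' failed")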
Example No. 10
    def get_documents(
        self,
        sql_string: str,
        database_name: Optional[str] = None,
        collection_name: Optional[str] = None,
        partition_key: Optional[str] = None,
    ) -> Optional[list]:
        """Get a list of documents from an existing collection in the CosmosDB database via SQL query."""
        if sql_string is None:
            raise AirflowBadRequest("SQL query string cannot be None")

        try:
            # azure-cosmos v4 takes the query as a plain string; pass the
            # partition key by keyword so it is not mistaken for the
            # ``parameters`` argument.
            result_iterable = (self.get_conn().get_database_client(
                self.__get_database_name(database_name)).get_container_client(
                    self.__get_collection_name(collection_name)).query_items(
                        query=sql_string, partition_key=partition_key))

            return list(result_iterable)
        except CosmosHttpResponseError:
            return None
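As in Example No. 4, a None return signals a swallowed query error, so callers should test for it. A usage sketch (the hook object and values are assumptions):

    docs = hook.get_documents(
        "SELECT * FROM c",  # invented query
        database_name="airflow_example_db",
        collection_name="events",
        partition_key="events-p1")
    if docs is None:
        raise RuntimeError("Cosmos DB query failed")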
Example No. 11
    def poke(self, context):
        logging.info(f"Getting batch {self.batch_id} status...")
        endpoint = f"{LIVY_ENDPOINT}/{self.batch_id}"
        response = HttpHook(method="GET",
                            http_conn_id=self.http_conn_id).run(endpoint)
        try:
            state = json.loads(response.content)["state"]
        except (JSONDecodeError, LookupError) as ex:
            log_response_error("$.state", response, self.batch_id)
            raise AirflowBadRequest(ex)
        if state in VALID_BATCH_STATES:
            logging.info(
                f"Batch {self.batch_id} has not finished yet "
                f"(state is '{state}')")
            return False
        if state == "success":
            logging.info(f"Batch {self.batch_id} has finished successfully!")
            return True
        raise AirflowException(
            f"Batch {self.batch_id} failed with state '{state}'")
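poke follows Airflow's sensor contract: return False to keep waiting, True when done, raise to fail the task. The Livy batch states involved look roughly like this (the exact contents of VALID_BATCH_STATES are an assumption based on the Livy documentation):

    # States treated as "still in progress" (assumed set):
    VALID_BATCH_STATES = {"not_started", "starting", "running",
                          "busy", "idle", "shutting_down"}
    # "success" -> True; anything else -> AirflowException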
Example No. 12
    def delete_database(self, database_name: str) -> None:
        """Deletes an existing database in CosmosDB."""
        if database_name is None:
            raise AirflowBadRequest("Database name cannot be None.")

        self.get_conn().delete_database(database_name)
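There is no existence check here, so deleting a missing database raises. A hedged sketch of treating the delete as idempotent with the azure-cosmos v4 SDK (the hook object is an assumption):

    from azure.cosmos.exceptions import CosmosResourceNotFoundError

    try:
        hook.delete_database("airflow_example_db")
    except CosmosResourceNotFoundError:
        pass  # already gone; treat the delete as idempotent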