Exemplo n.º 1
0
    def poll_status(self, context):
        """Wait for the remote job's XCom result and fail on a remote error.

        When ``self.finished`` is already set, ``self.return_value`` is used
        directly. Otherwise ``try_xcom_pull`` is polled for ``self.task_id``
        with a capped exponential backoff (1, 2, 4, ... up to 60 seconds)
        until a value has been pushed or ``self.appengine_timeout`` seconds
        have elapsed.

        :param context: passed through to ``try_xcom_pull`` and
            ``self.retrieve_exception_details``.
        :return: None; the pulled value is inspected but not returned.
        :raises AirflowTaskTimeout: if no XCom appears before the timeout.
        :raises AirflowException: if the result equals ``'__EXCEPTION__'``.
        """
        retval = None

        if self.finished:
            logging.info("Job is already finished - skipping to result phase.")
            retval = self.return_value
        else:
            logging.info("Job is not finished - executing poll phase.")
            start_time = datetime.utcnow()
            attempt = 0
            # Bluecore App Engine backend instances timeout after an hour
            while True:
                elapsed_secs = (datetime.utcnow() - start_time).total_seconds()
                remaining_secs = self.appengine_timeout - elapsed_secs
                logging.info("%0.2f seconds remain until timeout",
                             remaining_secs)
                if remaining_secs <= 0:
                    raise AirflowTaskTimeout()

                # try_xcom_pull allows us to distinguish between cases where the task
                # hasn't pushed an XCom and where the task pushed an XCom with value None.
                retval_tuple = try_xcom_pull(context=context,
                                             task_ids=self.task_id)
                # if XCom not yet pushed
                if not retval_tuple[0]:
                    logging.info("XCom response not found. Sleeping.")
                    # sleep for a while and try again
                    time.sleep(min(60, 2**attempt))
                    attempt += 1
                    continue

                # Extract the value BEFORE logging it; logging first would
                # always report the stale placeholder ``None``.
                retval = retval_tuple[1]
                logging.info("XCom response received: %s", retval)

                if retval == '__EXCEPTION__':
                    # NOTE(review): presumably this fills self.exc_type,
                    # self.exc_message and self.exc_callstack - confirm.
                    self.retrieve_exception_details(context)
                break

        logging.info("Executing result phase.")
        if retval == '__EXCEPTION__':
            logging.error("Found exception %s: %s",
                          self.exc_type or '<UNKNOWN>',
                          self.exc_message or '<UNKNOWN>')

            if self.exc_callstack:
                logging.error(str(self.exc_callstack))

            raise AirflowException(self.exc_message)

        logging.info("Remote task finished successfully.")
        return
Exemplo n.º 2
0
    def poll_status(self, context):
        """Poll for this task's XCom result until it arrives or time runs out.

        ``try_xcom_pull`` is retried with a capped exponential backoff
        (1, 2, 4, ... up to 60 seconds) for at most
        ``self.appengine_timeout`` seconds.

        :param context: passed through to the XCom pull helpers.
        :return: None once a non-error value has been received.
        :raises AirflowTaskTimeout: if no XCom appears before the timeout.
        :raises AirflowException: if the value equals ``'__EXCEPTION__'``.
        """
        began_at = datetime.utcnow()
        attempt = 0

        # Bluecore App Engine backend instances timeout after an hour
        while True:
            elapsed_secs = (datetime.utcnow() - began_at).total_seconds()
            remaining_secs = self.appengine_timeout - elapsed_secs
            logging.info("%0.2f seconds remain until timeout" % remaining_secs)
            if remaining_secs <= 0:
                raise AirflowTaskTimeout()

            # try_xcom_pull tells "nothing pushed yet" apart from
            # "an XCom with value None was pushed" via its first element.
            pull_result = try_xcom_pull(context=context,
                                        task_ids=self.task_id)
            if pull_result[0]:
                break

            # Nothing pushed yet: back off, then poll again.
            logging.info("XCom response not found. Sleeping.")
            time.sleep(min(60, 2**attempt))
            attempt += 1

        retval = pull_result[1]
        logging.info("XCom response received: %s" % str(retval))

        if retval == '__EXCEPTION__':
            # Pull the details in the same order: message, type, callstack.
            detail_keys = ('__EXCEPTION_MESSAGE', '__EXCEPTION_TYPE',
                           '__EXCEPTION_CALLSTACK')
            exc_message, exc_type, exc_callstack = [
                self.safe_xcom_pull(context=context,
                                    task_ids=self.task_id,
                                    key=detail_key)
                for detail_key in detail_keys
            ]

            logging.error(
                "Found exception %s: %s" %
                (exc_type or '<UNKNOWN>', exc_message or '<UNKNOWN>'))

            if exc_callstack:
                logging.error(str(exc_callstack))

            raise AirflowException(exc_message)

        return
Exemplo n.º 3
0
    def _monitor_logging(self, ci_hook, resource_group, name):
        last_state = None
        last_message_logged = None
        last_line_logged = None
        for _ in range(43200):  # roughly 12 hours
            try:
                state, exit_code, detail_status = ci_hook.get_state_exitcode_details(
                    resource_group, name)
                if state != last_state:
                    self.log.info("Container group state changed to %s", state)
                    last_state = state

                messages = ci_hook.get_messages(resource_group, name)
                last_message_logged = self._log_last(messages,
                                                     last_message_logged)

                if state in ["Running", "Terminated"]:
                    try:
                        logs = ci_hook.get_logs(resource_group, name)
                        last_line_logged = self._log_last(
                            logs, last_line_logged)
                    except CloudError:
                        self.log.exception("Exception while getting logs from "
                                           "container instance, retrying...")

                if state == "Terminated":
                    self.log.info("Container exited with detail_status %s",
                                  detail_status)
                    return exit_code

            except CloudError as err:
                if 'ResourceNotFound' in str(err):
                    self.log.warning(
                        "ResourceNotFound, container is probably removed "
                        "by another process "
                        "(make sure that the name is unique).")
                    return 1
                else:
                    self.log.exception(
                        "Exception while getting container groups")
            except Exception:
                self.log.exception("Exception while getting container groups")

            sleep(1)

        # no return -> hence still running
        raise AirflowTaskTimeout("Did not complete on time")
Exemplo n.º 4
0
    def poll_status_files(self):
        """Poll for the job's ``succeeded`` / ``failed`` marker files.

        Sleeps with a capped exponential backoff (5, 10, 20, 40, then 60
        seconds) before each pair of ``check_gcs_file_exists`` calls, and
        stops starting new rounds once 3600 seconds have elapsed.

        :return: None as soon as the success marker is reported.
        :raises AirflowException: if the failure marker is reported.
        :raises AirflowTaskTimeout: if neither marker is reported in time.
        """
        started_at = datetime.utcnow()
        succeeded_marker = '%s/succeeded' % self.job_id
        failed_marker = '%s/failed' % self.job_id
        attempt = 0

        # Bluecore App Engine backend instances timeout after an hour
        while True:
            elapsed_secs = (datetime.utcnow() - started_at).total_seconds()
            if elapsed_secs >= 3600:
                raise AirflowTaskTimeout()

            # The wait comes first, so the first check is 5 seconds in.
            time.sleep(min(60, 5 * 2**attempt))
            attempt += 1

            if check_gcs_file_exists(succeeded_marker,
                                     self.google_cloud_conn_id, self.bucket):
                return

            if check_gcs_file_exists(failed_marker,
                                     self.google_cloud_conn_id, self.bucket):
                raise AirflowException('found failure file %s/%s' %
                                       (self.bucket, failed_marker))
Exemplo n.º 5
0
    def wait_for_task_execution(self,
                                task_execution_arn: str,
                                max_iterations: int = 2 * 180) -> bool:
        """
        Block until the Task Execution reaches a success or failure state.

        The status is fetched with ``describe_task_execution`` and re-checked
        every ``self.wait_interval_seconds`` seconds while it is in
        ``TASK_EXECUTION_INTERMEDIATE_STATES``.

        :param str task_execution_arn: TaskExecutionArn to poll.
        :param int max_iterations: Maximum number of status checks before
            timing out.
        :return: True for a success state, False for a failure state.
        :rtype: bool
        :raises AirflowTaskTimeout: If maximum iterations is exceeded.
        :raises AirflowBadRequest: If ``task_execution_arn`` is empty.
        :raises AirflowException: If the status is in none of the known sets.
        """
        if not task_execution_arn:
            raise AirflowBadRequest("task_execution_arn not specified")

        checks_left = max_iterations
        status = None
        while status is None or status in self.TASK_EXECUTION_INTERMEDIATE_STATES:
            response = self.get_conn().describe_task_execution(
                TaskExecutionArn=task_execution_arn)
            status = response["Status"]
            self.log.info("status=%s", status)
            checks_left -= 1

            # Terminal states end the wait immediately.
            if status in self.TASK_EXECUTION_SUCCESS_STATES:
                return True
            if status in self.TASK_EXECUTION_FAILURE_STATES:
                return False

            if checks_left <= 0:
                raise AirflowTaskTimeout("Max iterations exceeded!")

            time.sleep(self.wait_interval_seconds)

        # Reached only when the status is not intermediate, success or failure.
        raise AirflowException("Unknown status: %s" %
                               status)  # Should never happen
Exemplo n.º 6
0
 def handle_timeout(self, signum, frame):
     """Log that the process timed out, then raise AirflowTaskTimeout.

     ``signum`` and ``frame`` are accepted but not used by this method.
     The raised exception carries ``self.error_message``.
     """
     _log.error("Process timed out")
     timeout_error = AirflowTaskTimeout(self.error_message)
     raise timeout_error
Exemplo n.º 7
0
 def handle_timeout(self, signum, frame):
     """
     Report the timed-out process by PID and raise AirflowTaskTimeout.

     ``signum`` and ``frame`` are accepted but not used by this method.
     """
     current_pid = str(os.getpid())
     self.log.error("Process timed out, PID: %s", current_pid)
     raise AirflowTaskTimeout(self.error_message)
Exemplo n.º 8
0
 def handle_timeout(self, *args):  # pylint: disable=unused-argument
     """Record the current PID in the error log and raise AirflowTaskTimeout.

     Any positional arguments are accepted and ignored.
     """
     pid_text = str(os.getpid())
     self.log.error("Process timed out, PID: %s", pid_text)
     timeout_error = AirflowTaskTimeout(self.error_message)
     raise timeout_error
Exemplo n.º 9
0
 def handle_timeout(self, signum, frame):
     """Log the timed-out process ID and raise AirflowTaskTimeout.

     ``signum`` and ``frame`` are accepted but not used by this method.
     The exception message is ``self.error_message``.
     """
     process_id = os.getpid()
     self.log.error("Process timed out, PID: %s", str(process_id))
     raise AirflowTaskTimeout(self.error_message)