Example #1
 def test_serialize(self, session):
     event_log_model_1 = Log(event="TEST_EVENT_1", task_instance=self._create_task_instance())
     event_log_model_2 = Log(event="TEST_EVENT_2", task_instance=self._create_task_instance())
     event_logs = [event_log_model_1, event_log_model_2]
     session.add_all(event_logs)
     session.commit()
     event_log_model_1.dttm = timezone.parse(self.default_time)
     event_log_model_2.dttm = timezone.parse(self.default_time2)
     instance = EventLogCollection(event_logs=event_logs, total_entries=2)
     deserialized_event_logs = event_log_collection_schema.dump(instance)
     assert deserialized_event_logs == {
         "event_logs": [
             {
                 "event_log_id": event_log_model_1.id,
                 "event": "TEST_EVENT_1",
                 "dag_id": "TEST_DAG_ID",
                 "task_id": "TEST_TASK_ID",
                 "execution_date": self.default_time,
                 "owner": 'airflow',
                 "when": self.default_time,
                 "extra": None,
             },
             {
                 "event_log_id": event_log_model_2.id,
                 "event": "TEST_EVENT_2",
                 "dag_id": "TEST_DAG_ID",
                 "task_id": "TEST_TASK_ID",
                 "execution_date": self.default_time,
                 "owner": 'airflow',
                 "when": self.default_time2,
                 "extra": None,
             },
         ],
         "total_entries": 2,
     }
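These serialization tests construct Log rows by passing a TaskInstance and then expect dag_id, task_id, execution_date, and owner to appear in the dumped output. A minimal sketch of the constructor behavior that the assertions imply, inferred from the expected dictionaries on this page rather than copied from a specific Airflow release:

class LogSketch:
    # Sketch only: why Log(event=..., task_instance=ti) produces rows with
    # dag_id/task_id/execution_date/owner already populated.
    def __init__(self, event, task_instance, owner=None, extra=None, **kwargs):
        self.event = event
        self.extra = extra
        task_owner = None
        if task_instance:
            self.dag_id = task_instance.dag_id
            self.task_id = task_instance.task_id
            self.execution_date = task_instance.execution_date
            task_owner = task_instance.task.owner  # 'airflow' in these tests
        # Explicit keyword arguments (task_id=..., dag_id=...) take precedence,
        # as the CLI examples further down rely on.
        for key in ('dag_id', 'task_id', 'execution_date'):
            if key in kwargs:
                setattr(self, key, kwargs[key])
        self.owner = owner or task_owner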
Example #2
 def test_should_respond_200(self, session):
     log_model_1 = Log(
         event='TEST_EVENT_1',
         task_instance=self._create_task_instance(),
     )
     log_model_2 = Log(
         event='TEST_EVENT_2',
         task_instance=self._create_task_instance(),
     )
     log_model_3 = Log(event="cli_scheduler",
                       owner='root',
                       extra='{"host_name": "e24b454f002a"}')
     log_model_1.dttm = timezone.parse(self.default_time)
     log_model_2.dttm = timezone.parse(self.default_time_2)
     log_model_3.dttm = timezone.parse(self.default_time_2)
     session.add_all([log_model_1, log_model_2, log_model_3])
     session.commit()
     response = self.client.get("/api/v1/eventLogs",
                                environ_overrides={'REMOTE_USER': "******"})
     assert response.status_code == 200
     self.assertEqual(
         response.json,
         {
             "event_logs": [
                 {
                     "event_log_id": log_model_1.id,
                     "event": "TEST_EVENT_1",
                     "dag_id": "TEST_DAG_ID",
                     "task_id": "TEST_TASK_ID",
                     "execution_date": self.default_time,
                     "owner": 'airflow',
                     "when": self.default_time,
                     "extra": None,
                 },
                 {
                     "event_log_id": log_model_2.id,
                     "event": "TEST_EVENT_2",
                     "dag_id": "TEST_DAG_ID",
                     "task_id": "TEST_TASK_ID",
                     "execution_date": self.default_time,
                     "owner": 'airflow',
                     "when": self.default_time_2,
                     "extra": None,
                 },
                 {
                     "event_log_id": log_model_3.id,
                     "event": "cli_scheduler",
                     "dag_id": None,
                     "task_id": None,
                     "execution_date": None,
                     "owner": 'root',
                     "when": self.default_time_2,
                     "extra": '{"host_name": "e24b454f002a"}',
                 },
             ],
             "total_entries":
             3,
         },
     )
Example #3
 def test_should_response_200(self, session):
     log_model = Log(
         event='TEST_EVENT',
         task_instance=self._create_task_instance(),
     )
     log_model.dttm = timezone.parse(self.default_time)
     session.add(log_model)
     session.commit()
     event_log_id = log_model.id
     response = self.client.get(f"/api/v1/eventLogs/{event_log_id}",
                                environ_overrides={'REMOTE_USER': "******"})
     assert response.status_code == 200
     self.assertEqual(
         response.json,
         {
             "event_log_id": event_log_id,
             "event": "TEST_EVENT",
             "dag_id": "TEST_DAG_ID",
             "task_id": "TEST_TASK_ID",
             "execution_date": self.default_time,
             "owner": 'airflow',
             "when": self.default_time,
             "extra": None,
         },
     )
Example #4
    def wrapper(*args, **kwargs):
        __tracebackhide__ = True  # Hide from pytest traceback.

        with create_session() as session:
            if g.user.is_anonymous:
                user = '******'
            else:
                user = g.user.username

            fields_skip_logging = {'csrf_token', '_csrf_token'}
            log = Log(
                event=f.__name__,
                task_instance=None,
                owner=user,
                extra=str([(k, v) for k, v in request.values.items()
                           if k not in fields_skip_logging]),
                task_id=request.values.get('task_id'),
                dag_id=request.values.get('dag_id'),
            )

            if 'execution_date' in request.values:
                execution_date_value = request.values.get('execution_date')
                try:
                    log.execution_date = pendulum.parse(execution_date_value,
                                                        strict=False)
                except ParserError:
                    logger.exception(
                        "Failed to parse execution_date from the request: %s",
                        execution_date_value)

            session.add(log)

        return f(*args, **kwargs)
Example #5
    def wrapper(*args, **kwargs):

        with create_session() as session:
            if g.user.is_anonymous:
                user = '******'
            else:
                user = g.user.username

            fields_skip_logging = {'csrf_token', '_csrf_token'}
            log = Log(
                event=f.__name__,
                task_instance=None,
                owner=user,
                extra=str([(k, v) for k, v in request.values.items()
                           if k not in fields_skip_logging]),
                task_id=request.values.get('task_id'),
                dag_id=request.values.get('dag_id'),
            )

            if 'execution_date' in request.values:
                log.execution_date = pendulum.parse(
                    request.values.get('execution_date'), strict=False)

            session.add(log)

        return f(*args, **kwargs)
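Examples #4 and #5 show only the inner wrapper of a Flask view decorator; the enclosing function that defines f and returns wrapper is cut off by the snippet boundary. A self-contained sketch of that surrounding pattern, with an in-memory list standing in for the log table (all names here are illustrative, not Airflow's):

import functools

AUDIT_TRAIL = []  # stand-in for the log table; illustration only

def action_logging(f):
    # Sketch of the decorator the wrapper above belongs to: record the event
    # before delegating to the wrapped view function.
    @functools.wraps(f)
    def wrapper(*args, **kwargs):
        AUDIT_TRAIL.append({'event': f.__name__, 'kwargs': dict(kwargs)})
        return f(*args, **kwargs)
    return wrapper

@action_logging
def trigger_dag(dag_id):
    return f'triggered {dag_id}'

print(trigger_dag(dag_id='example_dag'))  # triggered example_dag
print(AUDIT_TRAIL)  # [{'event': 'trigger_dag', 'kwargs': {'dag_id': 'example_dag'}}]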
Example #6
def _build_metrics(func_name, namespace):
    """
    Builds a metrics dict from the function args.
    It assumes the function belongs to the airflow.bin.cli module and receives
    an argparse Namespace instance that optionally contains "dag_id",
    "task_id", and "execution_date".

    :param func_name: name of function
    :param namespace: Namespace instance from argparse
    :return: dict with metrics
    """

    metrics = {'sub_command': func_name, 'start_datetime': datetime.utcnow(),
               'full_command': '{}'.format(list(sys.argv)), 'user': getpass.getuser()}

    if not isinstance(namespace, Namespace):
        raise ValueError("namespace argument should be argparse.Namespace instance,"
                         f"but is {type(namespace)}")
    tmp_dic = vars(namespace)
    metrics['dag_id'] = tmp_dic.get('dag_id')
    metrics['task_id'] = tmp_dic.get('task_id')
    metrics['execution_date'] = tmp_dic.get('execution_date')
    metrics['host_name'] = socket.gethostname()

    extra = json.dumps({k: metrics[k] for k in ('host_name', 'full_command')})
    log = Log(
        event='cli_{}'.format(func_name),
        task_instance=None,
        owner=metrics['user'],
        extra=extra,
        task_id=metrics.get('task_id'),
        dag_id=metrics.get('dag_id'),
        execution_date=metrics.get('execution_date'))
    metrics['log'] = log
    return metrics
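A quick illustration of what _build_metrics extracts from an argparse Namespace. The argument values are made up, and it assumes the example's own imports (sys, socket, json, getpass, datetime, and the Log model) are already in place:

from argparse import Namespace

ns = Namespace(dag_id='example_dag', task_id='print_date',
               execution_date='2021-01-01T00:00:00+00:00')
metrics = _build_metrics('task_run', ns)
print(metrics['dag_id'], metrics['task_id'])  # example_dag print_date
print(metrics['log'].event)                   # cli_task_run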
Example #7
    def test_should_raises_401_unauthenticated(self, session):
        log_model_1 = Log(
            event='TEST_EVENT_1',
            task_instance=self._create_task_instance(),
        )
        log_model_2 = Log(
            event='TEST_EVENT_2',
            task_instance=self._create_task_instance(),
        )
        log_model_1.dttm = timezone.parse(self.default_time)
        log_model_2.dttm = timezone.parse(self.default_time_2)
        session.add_all([log_model_1, log_model_2])
        session.commit()

        response = self.client.get("/api/v1/eventLogs")

        assert_401(response)
Example #8
def _build_metrics(func_name, namespace):
    """
    Builds a metrics dict from the function args.
    It assumes the function belongs to the airflow.bin.cli module and receives
    an argparse Namespace instance that optionally contains "dag_id",
    "task_id", and "execution_date".

    :param func_name: name of function
    :param namespace: Namespace instance from argparse
    :return: dict with metrics
    """
    from airflow.models import Log

    sub_commands_to_check = {'users', 'connections'}
    sensitive_fields = {'-p', '--password', '--conn-password'}
    full_command = list(sys.argv)
    sub_command = full_command[1] if len(full_command) > 1 else None
    if sub_command in sub_commands_to_check:
        for idx, command in enumerate(full_command):
            if command in sensitive_fields:
                # For cases when password is passed as "--password xyz" (with space between key and value)
                full_command[idx + 1] = "*" * 8
            else:
                # For cases when password is passed as "--password=xyz" (with '=' between key and value)
                for sensitive_field in sensitive_fields:
                    if command.startswith(f'{sensitive_field}='):
                        full_command[idx] = f'{sensitive_field}={"*" * 8}'

    metrics = {
        'sub_command': func_name,
        'start_datetime': datetime.utcnow(),
        'full_command': f'{full_command}',
        'user': getuser(),
    }

    if not isinstance(namespace, Namespace):
        raise ValueError(
            "namespace argument should be argparse.Namespace instance, "
            f"but is {type(namespace)}")
    tmp_dic = vars(namespace)
    metrics['dag_id'] = tmp_dic.get('dag_id')
    metrics['task_id'] = tmp_dic.get('task_id')
    metrics['execution_date'] = tmp_dic.get('execution_date')
    metrics['host_name'] = socket.gethostname()

    extra = json.dumps({k: metrics[k] for k in ('host_name', 'full_command')})
    log = Log(
        event=f'cli_{func_name}',
        task_instance=None,
        owner=metrics['user'],
        extra=extra,
        task_id=metrics.get('task_id'),
        dag_id=metrics.get('dag_id'),
        execution_date=metrics.get('execution_date'),
    )
    metrics['log'] = log
    return metrics
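The password-masking loop is the part worth checking in isolation. A standalone run of that exact logic with assumed argv contents, covering both the space-separated and the '=' forms:

full_command = ['airflow', 'connections', 'add',
                '--conn-password', 'hunter2', '--password=hunter2']
sensitive_fields = {'-p', '--password', '--conn-password'}
for idx, command in enumerate(full_command):
    if command in sensitive_fields:
        full_command[idx + 1] = '*' * 8          # "--password xyz" form
    else:
        for sensitive_field in sensitive_fields:
            if command.startswith(f'{sensitive_field}='):
                full_command[idx] = f'{sensitive_field}={"*" * 8}'  # "--password=xyz" form
print(full_command)
# ['airflow', 'connections', 'add', '--conn-password', '********', '--password=********']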
Example #9
    def log_pod_creation(self, pod: Pod, resp, session=None):
        from openshift_plugin.executor.airflow_openshift_scheduler import AirflowOpenShiftScheduler

        execution_date = AirflowOpenShiftScheduler.label_safe_datestring_to_datetime(pod.labels['execution_date'])

        task_instance = session.query(TaskInstance) \
            .filter(TaskInstance.dag_id == pod.labels['dag_id']) \
            .filter(TaskInstance.task_id == pod.labels['task_id']) \
            .filter(TaskInstance.execution_date == execution_date).first()

        if not task_instance:
            self.log.error(
                "Could not find task instance based on the pod labels"
                " ({dag_id} {task_id} {execution_date} {try_number})".format(
                    **pod.labels))
            self.log.error("Log information will be incomplete. This is a BUG please report!!!")

        def default(o):
            if isinstance(o, (datetime.date, datetime.datetime)):
                return o.isoformat()

        kube_client = get_kube_client()

        headers = {"Authorization": kube_client.api_client.configuration.get_api_key_with_prefix('authorization')}
        url = "{0}/apis/image.openshift.io/v1/namespaces/{1}/imagestreamtags/{2}".format(
            kube_client.api_client.configuration.host,
            pod.image.split("/")[-2], quote_plus(pod.image.split("/")[-1]))

        response = requests.get(url,
                                headers=headers,
                                verify=kube_client.api_client.configuration.ssl_ca_cert)

        resp = resp.to_dict()

        if response.status_code == 200:
            image_reference = response.json()
            resp['spec']['containers'][0]['image'] = image_reference["tag"]["from"]["name"]
        else:
            image_reference = None

        log = Log(
            event=OpenShiftPodLauncer.EVENT_POD_CREATION,
            dag_id=task_instance.dag_id,
            task_instance=None,
            task_id=task_instance.task_id,
            execution_date=task_instance.execution_date,
            extra=json.dumps(
                {
                    "request": self.kube_req_factory.create(pod),
                    "response": resp,
                    "image": image_reference
                }, default=default)
        )
        session.add(log)
        session.commit()
Example #10
def _build_metrics(func_name, args, kwargs):
    """
    Builds a metrics dict from the function args.
    If the first item in args is a Namespace instance, it assumes that it
    optionally contains "dag_id", "task_id", and "execution_date".

    :param func_name: name of function
    :param args: Arguments from wrapped function, possibly including the Namespace instance from
                 argparse as the first argument
    :param kwargs: Keyword arguments from wrapped function
    :return: dict with metrics
    """
    from airflow.models import Log

    sub_commands_to_check = {'users', 'connections'}
    sensitive_fields = {'-p', '--password', '--conn-password'}
    full_command = list(sys.argv)
    sub_command = full_command[1] if len(full_command) > 1 else None
    if sub_command in sub_commands_to_check:
        for idx, command in enumerate(full_command):
            if command in sensitive_fields:
                # For cases when password is passed as "--password xyz" (with space between key and value)
                full_command[idx + 1] = "*" * 8
            else:
                # For cases when password is passed as "--password=xyz" (with '=' between key and value)
                for sensitive_field in sensitive_fields:
                    if command.startswith(f'{sensitive_field}='):
                        full_command[idx] = f'{sensitive_field}={"*" * 8}'

    metrics = {
        'sub_command': func_name,
        'start_datetime': datetime.utcnow(),
        'full_command': f'{full_command}',
        'user': getuser(),
    }

    tmp_dic = vars(args[0]) if (args and isinstance(args[0], Namespace)) else kwargs
    metrics['dag_id'] = tmp_dic.get('dag_id')
    metrics['task_id'] = tmp_dic.get('task_id')
    metrics['execution_date'] = tmp_dic.get('execution_date')
    metrics['host_name'] = socket.gethostname()

    extra = json.dumps({k: metrics[k] for k in ('host_name', 'full_command')})
    log = Log(
        event=f'cli_{func_name}',
        task_instance=None,
        owner=metrics['user'],
        extra=extra,
        task_id=metrics.get('task_id'),
        dag_id=metrics.get('dag_id'),
        execution_date=metrics.get('execution_date'),
    )
    metrics['log'] = log
    return metrics
Example #11
    def test_should_raises_401_unauthenticated(self, session):
        log_model = Log(
            event='TEST_EVENT',
            task_instance=self._create_task_instance(),
        )
        log_model.dttm = timezone.parse(self.default_time)
        session.add(log_model)
        session.commit()
        event_log_id = log_model.id

        response = self.client.get(f"/api/v1/eventLogs/{event_log_id}")

        assert_401(response)
Example #12
 def test_should_response_200(self, session):
     log_model_1 = Log(
         event='TEST_EVENT_1',
         task_instance=self._create_task_instance(),
     )
     log_model_2 = Log(
         event='TEST_EVENT_2',
         task_instance=self._create_task_instance(),
     )
     log_model_1.dttm = timezone.parse(self.default_time)
     log_model_2.dttm = timezone.parse(self.default_time_2)
     session.add_all([log_model_1, log_model_2])
     session.commit()
     response = self.client.get("/api/v1/eventLogs")
     assert response.status_code == 200
     self.assertEqual(
         response.json, {
             "event_logs": [{
                 "event_log_id": log_model_1.id,
                 "event": "TEST_EVENT_1",
                 "dag_id": "TEST_DAG_ID",
                 "task_id": "TEST_TASK_ID",
                 "execution_date": self.default_time,
                 "owner": 'airflow',
                 "when": self.default_time,
                 "extra": None
             }, {
                 "event_log_id": log_model_2.id,
                 "event": "TEST_EVENT_2",
                 "dag_id": "TEST_DAG_ID",
                 "task_id": "TEST_TASK_ID",
                 "execution_date": self.default_time,
                 "owner": 'airflow',
                 "when": self.default_time_2,
                 "extra": None
             }],
             "total_entries":
             2
         })
Example #13
def add_log(execdate, session, timezone_override=None):
    dag = DAG(dag_id='logging', default_args={'start_date': execdate})
    task = DummyOperator(task_id='dummy', dag=dag, owner='airflow')
    task_instance = TaskInstance(task=task,
                                 execution_date=execdate,
                                 state='success')
    session.merge(task_instance)
    log = Log(State.RUNNING, task_instance)
    if timezone_override:
        log.dttm = log.dttm.astimezone(timezone_override)
    session.add(log)
    session.commit()
    return log
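A hedged usage sketch for the helper above. The create_session import path is an assumption (it lives in airflow.utils.db on older releases and airflow.utils.session on newer ones), and State is assumed imported as in the helper itself:

from airflow.utils import timezone
from airflow.utils.session import create_session  # assumed import path

with create_session() as session:
    log = add_log(timezone.datetime(2021, 1, 1), session)
    # Log's first positional argument is the event, so the helper records
    # a State.RUNNING event tied to the dummy task instance it creates.
    assert log.event == State.RUNNING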
Example #14
 def test_serialize(self, session):
     event_log_model = Log(event="TEST_EVENT", task_instance=self._create_task_instance())
     session.add(event_log_model)
     session.commit()
     event_log_model.dttm = timezone.parse(self.default_time)
     log_model = session.query(Log).first()
     deserialized_log = event_log_schema.dump(log_model)
     assert deserialized_log == {
         "event_log_id": event_log_model.id,
         "event": "TEST_EVENT",
         "dag_id": "TEST_DAG_ID",
         "task_id": "TEST_TASK_ID",
         "execution_date": self.default_time,
         "owner": 'airflow',
         "when": self.default_time,
         "extra": None,
     }
Example #15
def _build_metrics(func_name, namespace):
    """
    Builds a metrics dict from the function args.
    It assumes the function belongs to the airflow.bin.cli module and receives
    an argparse Namespace instance that optionally contains "dag_id",
    "task_id", and "execution_date".

    :param func_name: name of function
    :param namespace: Namespace instance from argparse
    :return: dict with metrics
    """
    sensitive_fields = {'-p', '--password', '--conn-password'}
    full_command = list(sys.argv)
    for idx, command in enumerate(full_command):  # pylint: disable=too-many-nested-blocks
        if command in sensitive_fields:
            # For cases when password is passed as "--password xyz" (with space between key and value)
            full_command[idx + 1] = "*" * 8
        else:
            # For cases when password is passed as "--password=xyz" (with '=' between key and value)
            for sensitive_field in sensitive_fields:
                if command.startswith('{}='.format(sensitive_field)):
                    full_command[idx] = '{}={}'.format(sensitive_field, "*" * 8)

    metrics = {'sub_command': func_name, 'start_datetime': datetime.utcnow(),
               'full_command': '{}'.format(full_command), 'user': getpass.getuser()}

    assert isinstance(namespace, Namespace)
    tmp_dic = vars(namespace)
    metrics['dag_id'] = tmp_dic.get('dag_id')
    metrics['task_id'] = tmp_dic.get('task_id')
    metrics['execution_date'] = tmp_dic.get('execution_date')
    metrics['host_name'] = socket.gethostname()

    extra = json.dumps(dict((k, metrics[k]) for k in ('host_name', 'full_command')))
    log = Log(
        event='cli_{}'.format(func_name),
        task_instance=None,
        owner=metrics['user'],
        extra=extra,
        task_id=metrics.get('task_id'),
        dag_id=metrics.get('dag_id'),
        execution_date=metrics.get('execution_date'))
    metrics['log'] = log
    return metrics
Example #16
    def wrapper(*args, **kwargs):

        with create_session() as session:
            if g.user.is_anonymous:
                user = '******'
            else:
                user = g.user.username

            log = Log(event=f.__name__,
                      task_instance=None,
                      owner=user,
                      extra=str(list(request.args.items())),
                      task_id=request.args.get('task_id'),
                      dag_id=request.args.get('dag_id'))

            if 'execution_date' in request.args:
                log.execution_date = pendulum.parse(
                    request.args.get('execution_date'))

            session.add(log)

        return f(*args, **kwargs)
Example #17
def insert_dag_runs(
    session,
    dag_id="plugin_test_dag",
    dag_runs_count=1,
    task_instances_per_run=0,
    state="success",
    with_log=False,
):
    for i in range(dag_runs_count):
        execution_date = utcnow()

        dag_run = DagRun()
        dag_run.dag_id = dag_id
        dag_run.execution_date = execution_date
        dag_run._state = state
        if AIRFLOW_VERSION_2:
            dag_run.run_type = ""
        session.add(dag_run)

        if with_log:
            task_instance = FakeTaskInstance()
            task_instance.dag_id = dag_id
            task_instance.task_id = "task"
            task_instance.execution_date = execution_date
            task = FakeTask()
            task.owner = "Airflow"
            task_instance.task = task
            log = Log("success", task_instance)
            session.add(log)

        for j in range(task_instances_per_run):
            task = FakeTask(dag_id=dag_id, task_id="task{}".format(j))
            task_instance = TaskInstance(task, execution_date, state="success")
            session.add(task_instance)

    session.commit()
Example #18
def _run_raw_task(
    self,
    mark_success: bool = False,
    test_mode: bool = False,
    job_id: Optional[str] = None,
    pool: Optional[str] = None,
    error_file: Optional[str] = None,
    session=None,
) -> None:
    """
    Immediately runs the task (without checking or changing db state
    before execution) and then sets the appropriate final state after
    completion and runs any post-execute callbacks. Meant to be called
    only after another function changes the state to running.

    :param mark_success: Don't run the task, mark its state as success
    :type mark_success: bool
    :param test_mode: Doesn't record success or failure in the DB
    :type test_mode: bool
    :param pool: specifies the pool to use to run the task instance
    :type pool: str
    :param session: SQLAlchemy ORM Session
    :type session: Session
    """
    task = self.task
    self.test_mode = test_mode
    refresh_from_task(self, task, pool_override=pool)
    # self.refresh_from_db(session=session)
    self.job_id = job_id
    self.hostname = get_hostname()

    context = {}  # type: Dict
    actual_start_date = timezone.utcnow()
    try:
        if not mark_success:
            context = self.get_template_context()
            _prepare_and_execute_task_with_callbacks(self, context, task)
        self.refresh_from_db(lock_for_update=True)
        self.state = State.SUCCESS
    except AirflowSkipException as e:
        # Recording SKIP
        # log only if exception has any arguments to prevent log flooding
        if e.args:
            self.log.info(e)
        self.refresh_from_db(lock_for_update=True)
        self.state = State.SKIPPED
        self.log.info(
            'Marking task as SKIPPED. '
            'dag_id=%s, task_id=%s, execution_date=%s, start_date=%s, end_date=%s',
            self.dag_id,
            self.task_id,
            _date_or_empty(self, 'execution_date'),
            _date_or_empty(self, 'start_date'),
            _date_or_empty(self, 'end_date'),
        )
    # except AirflowRescheduleException as reschedule_exception:
    #     self.refresh_from_db()
    #     self._handle_reschedule(actual_start_date, reschedule_exception, test_mode)
    #     return
    # except AirflowFailException as e:
    #     self.refresh_from_db()
    #     self.handle_failure(e, test_mode, force_fail=True, error_file=error_file)
    #     raise
    except AirflowException as e:
        self.refresh_from_db()
        # for case when task is marked as success/failed externally
        # current behavior doesn't hit the success callback
        if self.state in {State.SUCCESS, State.FAILED}:
            return
        else:
            self.handle_failure(e, test_mode, error_file=error_file)
            raise
    except (Exception, KeyboardInterrupt) as e:
        self.handle_failure(e, test_mode, error_file=error_file)
        raise

    # Recording SUCCESS
    self.end_date = timezone.utcnow()
    self.log.info(
        'Marking task as SUCCESS. '
        'dag_id=%s, task_id=%s, execution_date=%s, start_date=%s, end_date=%s',
        self.dag_id,
        self.task_id,
        _date_or_empty(self, 'execution_date'),
        _date_or_empty(self, 'start_date'),
        _date_or_empty(self, 'end_date'),
    )
    self.set_duration()
    if not test_mode:
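        # Audit row: passing the TaskInstance (self) lets the Log constructor
        # derive dag_id, task_id, and execution_date, as the tests above expect.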
        session.add(Log(self.state, self))
        session.merge(self)

    session.commit()

    if not test_mode:
        _run_mini_scheduler_on_child_tasks(self, session)
Example #19
 def _create_event_logs(self, count):
     return [
         Log(event="TEST_EVENT_" + str(i),
             task_instance=self._create_task_instance())
         for i in range(1, count + 1)
     ]
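Bulk helpers like this usually feed pagination tests. A hedged sketch of such a test; the limit query parameter and the 100-row page size are assumptions, not taken from the examples above:

def test_should_return_limited_logs(self, session):
    logs = self._create_event_logs(110)
    session.add_all(logs)
    session.commit()
    response = self.client.get("/api/v1/eventLogs?limit=100",
                               environ_overrides={'REMOTE_USER': "******"})
    assert response.status_code == 200
    assert len(response.json["event_logs"]) == 100  # assumed page-size cap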