def test_serialize(self, session):
    event_log_model_1 = Log(event="TEST_EVENT_1", task_instance=self._create_task_instance())
    event_log_model_2 = Log(event="TEST_EVENT_2", task_instance=self._create_task_instance())
    event_logs = [event_log_model_1, event_log_model_2]
    session.add_all(event_logs)
    session.commit()
    event_log_model_1.dttm = timezone.parse(self.default_time)
    event_log_model_2.dttm = timezone.parse(self.default_time2)
    instance = EventLogCollection(event_logs=event_logs, total_entries=2)
    deserialized_event_logs = event_log_collection_schema.dump(instance)
    assert deserialized_event_logs == {
        "event_logs": [
            {
                "event_log_id": event_log_model_1.id,
                "event": "TEST_EVENT_1",
                "dag_id": "TEST_DAG_ID",
                "task_id": "TEST_TASK_ID",
                "execution_date": self.default_time,
                "owner": 'airflow',
                "when": self.default_time,
                "extra": None,
            },
            {
                "event_log_id": event_log_model_2.id,
                "event": "TEST_EVENT_2",
                "dag_id": "TEST_DAG_ID",
                "task_id": "TEST_TASK_ID",
                "execution_date": self.default_time,
                "owner": 'airflow',
                "when": self.default_time2,
                "extra": None,
            },
        ],
        "total_entries": 2,
    }
def test_should_respond_200(self, session):
    log_model_1 = Log(
        event='TEST_EVENT_1',
        task_instance=self._create_task_instance(),
    )
    log_model_2 = Log(
        event='TEST_EVENT_2',
        task_instance=self._create_task_instance(),
    )
    log_model_3 = Log(event="cli_scheduler", owner='root', extra='{"host_name": "e24b454f002a"}')
    log_model_1.dttm = timezone.parse(self.default_time)
    log_model_2.dttm = timezone.parse(self.default_time_2)
    log_model_3.dttm = timezone.parse(self.default_time_2)
    session.add_all([log_model_1, log_model_2, log_model_3])
    session.commit()
    response = self.client.get("/api/v1/eventLogs", environ_overrides={'REMOTE_USER': "******"})
    assert response.status_code == 200
    self.assertEqual(
        response.json,
        {
            "event_logs": [
                {
                    "event_log_id": log_model_1.id,
                    "event": "TEST_EVENT_1",
                    "dag_id": "TEST_DAG_ID",
                    "task_id": "TEST_TASK_ID",
                    "execution_date": self.default_time,
                    "owner": 'airflow',
                    "when": self.default_time,
                    "extra": None,
                },
                {
                    "event_log_id": log_model_2.id,
                    "event": "TEST_EVENT_2",
                    "dag_id": "TEST_DAG_ID",
                    "task_id": "TEST_TASK_ID",
                    "execution_date": self.default_time,
                    "owner": 'airflow',
                    "when": self.default_time_2,
                    "extra": None,
                },
                {
                    "event_log_id": log_model_3.id,
                    "event": "cli_scheduler",
                    "dag_id": None,
                    "task_id": None,
                    "execution_date": None,
                    "owner": 'root',
                    "when": self.default_time_2,
                    "extra": '{"host_name": "e24b454f002a"}',
                },
            ],
            "total_entries": 3,
        },
    )
def test_should_response_200(self, session):
    log_model = Log(
        event='TEST_EVENT',
        task_instance=self._create_task_instance(),
    )
    log_model.dttm = timezone.parse(self.default_time)
    session.add(log_model)
    session.commit()
    event_log_id = log_model.id
    response = self.client.get(
        f"/api/v1/eventLogs/{event_log_id}", environ_overrides={'REMOTE_USER': "******"}
    )
    assert response.status_code == 200
    self.assertEqual(
        response.json,
        {
            "event_log_id": event_log_id,
            "event": "TEST_EVENT",
            "dag_id": "TEST_DAG_ID",
            "task_id": "TEST_TASK_ID",
            "execution_date": self.default_time,
            "owner": 'airflow',
            "when": self.default_time,
            "extra": None,
        },
    )
def wrapper(*args, **kwargs):
    __tracebackhide__ = True  # Hide from pytest traceback.

    with create_session() as session:
        if g.user.is_anonymous:
            user = '******'
        else:
            user = g.user.username

        fields_skip_logging = {'csrf_token', '_csrf_token'}
        log = Log(
            event=f.__name__,
            task_instance=None,
            owner=user,
            extra=str([(k, v) for k, v in request.values.items() if k not in fields_skip_logging]),
            task_id=request.values.get('task_id'),
            dag_id=request.values.get('dag_id'),
        )

        if 'execution_date' in request.values:
            execution_date_value = request.values.get('execution_date')
            try:
                log.execution_date = pendulum.parse(execution_date_value, strict=False)
            except ParserError:
                logger.exception(
                    "Failed to parse execution_date from the request: %s", execution_date_value
                )

        session.add(log)

    return f(*args, **kwargs)
def wrapper(*args, **kwargs):
    with create_session() as session:
        if g.user.is_anonymous:
            user = '******'
        else:
            user = g.user.username

        fields_skip_logging = {'csrf_token', '_csrf_token'}
        log = Log(
            event=f.__name__,
            task_instance=None,
            owner=user,
            extra=str([(k, v) for k, v in request.values.items() if k not in fields_skip_logging]),
            task_id=request.values.get('task_id'),
            dag_id=request.values.get('dag_id'),
        )

        if 'execution_date' in request.values:
            log.execution_date = pendulum.parse(request.values.get('execution_date'), strict=False)

        session.add(log)

    return f(*args, **kwargs)
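The two wrapper variants above are the inner function of the webserver's action-logging decorator: record who called a view and with which parameters, then delegate to the view. Below is a minimal, self-contained sketch of that pattern, assuming an in-memory list in place of the Log table; the decorator name reuse and the trigger view are hypothetical.

import functools
from datetime import datetime, timezone as dt_timezone

AUDIT_TRAIL = []  # stands in for the Log table in this sketch


def action_logging(f):
    @functools.wraps(f)
    def wrapper(*args, **kwargs):
        # Record the event name, timestamp, and non-CSRF parameters before delegating.
        AUDIT_TRAIL.append(
            {
                "event": f.__name__,
                "when": datetime.now(dt_timezone.utc),
                "params": {k: v for k, v in kwargs.items() if k not in {"csrf_token", "_csrf_token"}},
            }
        )
        return f(*args, **kwargs)

    return wrapper


@action_logging
def trigger(dag_id=None, execution_date=None):  # hypothetical view function
    return f"triggered {dag_id}"


print(trigger(dag_id="example_dag"))  # -> triggered example_dag
print(AUDIT_TRAIL[-1]["event"])       # -> trigger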
def _build_metrics(func_name, namespace):
    """
    Builds a metrics dict from function arguments.

    It assumes that the arguments come from an airflow.bin.cli module function
    and include a Namespace instance that optionally contains "dag_id",
    "task_id", and "execution_date".

    :param func_name: name of function
    :param namespace: Namespace instance from argparse
    :return: dict with metrics
    """
    metrics = {
        'sub_command': func_name,
        'start_datetime': datetime.utcnow(),
        'full_command': '{}'.format(list(sys.argv)),
        'user': getpass.getuser(),
    }

    if not isinstance(namespace, Namespace):
        raise ValueError(
            "namespace argument should be argparse.Namespace instance, "
            f"but is {type(namespace)}"
        )
    tmp_dic = vars(namespace)
    metrics['dag_id'] = tmp_dic.get('dag_id')
    metrics['task_id'] = tmp_dic.get('task_id')
    metrics['execution_date'] = tmp_dic.get('execution_date')
    metrics['host_name'] = socket.gethostname()

    extra = json.dumps({k: metrics[k] for k in ('host_name', 'full_command')})
    log = Log(
        event='cli_{}'.format(func_name),
        task_instance=None,
        owner=metrics['user'],
        extra=extra,
        task_id=metrics.get('task_id'),
        dag_id=metrics.get('dag_id'),
        execution_date=metrics.get('execution_date'),
    )
    metrics['log'] = log
    return metrics
def test_should_raises_401_unauthenticated(self, session):
    log_model_1 = Log(
        event='TEST_EVENT_1',
        task_instance=self._create_task_instance(),
    )
    log_model_2 = Log(
        event='TEST_EVENT_2',
        task_instance=self._create_task_instance(),
    )
    log_model_1.dttm = timezone.parse(self.default_time)
    log_model_2.dttm = timezone.parse(self.default_time_2)
    session.add_all([log_model_1, log_model_2])
    session.commit()

    response = self.client.get("/api/v1/eventLogs")

    assert_401(response)
def _build_metrics(func_name, namespace):
    """
    Builds a metrics dict from function arguments.

    It assumes that the arguments come from an airflow.bin.cli module function
    and include a Namespace instance that optionally contains "dag_id",
    "task_id", and "execution_date".

    :param func_name: name of function
    :param namespace: Namespace instance from argparse
    :return: dict with metrics
    """
    from airflow.models import Log

    sub_commands_to_check = {'users', 'connections'}
    sensitive_fields = {'-p', '--password', '--conn-password'}
    full_command = list(sys.argv)
    sub_command = full_command[1] if len(full_command) > 1 else None
    if sub_command in sub_commands_to_check:
        for idx, command in enumerate(full_command):
            if command in sensitive_fields:
                # For cases when password is passed as "--password xyz" (with space between key and value)
                full_command[idx + 1] = "*" * 8
            else:
                # For cases when password is passed as "--password=xyz" (with '=' between key and value)
                for sensitive_field in sensitive_fields:
                    if command.startswith(f'{sensitive_field}='):
                        full_command[idx] = f'{sensitive_field}={"*" * 8}'

    metrics = {
        'sub_command': func_name,
        'start_datetime': datetime.utcnow(),
        'full_command': f'{full_command}',
        'user': getuser(),
    }

    if not isinstance(namespace, Namespace):
        raise ValueError(
            "namespace argument should be argparse.Namespace instance, "
            f"but is {type(namespace)}"
        )
    tmp_dic = vars(namespace)
    metrics['dag_id'] = tmp_dic.get('dag_id')
    metrics['task_id'] = tmp_dic.get('task_id')
    metrics['execution_date'] = tmp_dic.get('execution_date')
    metrics['host_name'] = socket.gethostname()

    extra = json.dumps({k: metrics[k] for k in ('host_name', 'full_command')})
    log = Log(
        event=f'cli_{func_name}',
        task_instance=None,
        owner=metrics['user'],
        extra=extra,
        task_id=metrics.get('task_id'),
        dag_id=metrics.get('dag_id'),
        execution_date=metrics.get('execution_date'),
    )
    metrics['log'] = log
    return metrics
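The sensitive-field masking above is easy to exercise in isolation. Here is a standalone sketch of the same loop run against sample command lines; the sample argv values are made up for illustration.

# Standalone demonstration of the argv-masking loop used in _build_metrics above.
# Only the 'users' and 'connections' sub-commands are scrubbed.
SENSITIVE_FIELDS = {'-p', '--password', '--conn-password'}
SUB_COMMANDS_TO_CHECK = {'users', 'connections'}


def mask_command(argv):
    full_command = list(argv)
    sub_command = full_command[1] if len(full_command) > 1 else None
    if sub_command in SUB_COMMANDS_TO_CHECK:
        for idx, command in enumerate(full_command):
            if command in SENSITIVE_FIELDS:
                # "--password xyz" form: mask the following token
                full_command[idx + 1] = "*" * 8
            else:
                # "--password=xyz" form: mask everything after '='
                for sensitive_field in SENSITIVE_FIELDS:
                    if command.startswith(f'{sensitive_field}='):
                        full_command[idx] = f'{sensitive_field}={"*" * 8}'
    return full_command


print(mask_command(['airflow', 'users', 'create', '--password', 'secret']))
# ['airflow', 'users', 'create', '--password', '********']
print(mask_command(['airflow', 'connections', 'add', '--conn-password=secret']))
# ['airflow', 'connections', 'add', '--conn-password=********']
print(mask_command(['airflow', 'dags', 'list', '--password', 'xyz']))
# unchanged: the 'dags' sub-command is not in the checked set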
def log_pod_creation(self, pod: Pod, resp, session=None):
    from openshift_plugin.executor.airflow_openshift_scheduler import AirflowOpenShiftScheduler

    execution_date = AirflowOpenShiftScheduler.label_safe_datestring_to_datetime(pod.labels['execution_date'])

    task_instance = (
        session.query(TaskInstance)
        .filter(TaskInstance.dag_id == pod.labels['dag_id'])
        .filter(TaskInstance.task_id == pod.labels['task_id'])
        .filter(TaskInstance.execution_date == execution_date)
        .first()
    )

    if not task_instance:
        self.log.error(
            "Could not find task instance based on the pod labels"
            " ({dag_id} {task_id} {execution_date} {try_number})".format(**pod.labels)
        )
        self.log.error("Log information will be incomplete. This is a BUG please report!!!")

    def default(o):
        # json.dumps fallback: serialize dates/datetimes as ISO strings.
        if isinstance(o, (datetime.date, datetime.datetime)):
            return o.isoformat()

    kube_client = get_kube_client()
    headers = {
        "Authorization": kube_client.api_client.configuration.get_api_key_with_prefix('authorization')
    }
    url = "{0}/apis/image.openshift.io/v1/namespaces/{1}/imagestreamtags/{2}".format(
        kube_client.api_client.configuration.host,
        pod.image.split("/")[-2],
        quote_plus(pod.image.split("/")[-1]),
    )
    response = requests.get(url, headers=headers, verify=kube_client.api_client.configuration.ssl_ca_cert)

    resp = resp.to_dict()
    if response.status_code == 200:
        image_reference = response.json()
        resp['spec']['containers'][0]['image'] = image_reference["tag"]["from"]["name"]
    else:
        image_reference = None

    log = Log(
        event=OpenShiftPodLauncer.EVENT_POD_CREATION,
        dag_id=task_instance.dag_id,
        task_instance=None,
        task_id=task_instance.task_id,
        execution_date=task_instance.execution_date,
        extra=json.dumps(
            {
                "request": self.kube_req_factory.create(pod),
                "response": resp,
                "image": image_reference,
            },
            default=default,
        ),
    )
    session.add(log)
    session.commit()
def _build_metrics(func_name, args, kwargs):
    """
    Builds a metrics dict from function arguments.

    If the first item in args is a Namespace instance, it assumes that it
    optionally contains "dag_id", "task_id", and "execution_date".

    :param func_name: name of function
    :param args: Arguments from wrapped function, possibly including the Namespace
        instance from argparse as the first argument
    :param kwargs: Keyword arguments from wrapped function
    :return: dict with metrics
    """
    from airflow.models import Log

    sub_commands_to_check = {'users', 'connections'}
    sensitive_fields = {'-p', '--password', '--conn-password'}
    full_command = list(sys.argv)
    sub_command = full_command[1] if len(full_command) > 1 else None
    if sub_command in sub_commands_to_check:
        for idx, command in enumerate(full_command):
            if command in sensitive_fields:
                # For cases when password is passed as "--password xyz" (with space between key and value)
                full_command[idx + 1] = "*" * 8
            else:
                # For cases when password is passed as "--password=xyz" (with '=' between key and value)
                for sensitive_field in sensitive_fields:
                    if command.startswith(f'{sensitive_field}='):
                        full_command[idx] = f'{sensitive_field}={"*" * 8}'

    metrics = {
        'sub_command': func_name,
        'start_datetime': datetime.utcnow(),
        'full_command': f'{full_command}',
        'user': getuser(),
    }

    tmp_dic = vars(args[0]) if (args and isinstance(args[0], Namespace)) else kwargs
    metrics['dag_id'] = tmp_dic.get('dag_id')
    metrics['task_id'] = tmp_dic.get('task_id')
    metrics['execution_date'] = tmp_dic.get('execution_date')
    metrics['host_name'] = socket.gethostname()

    extra = json.dumps({k: metrics[k] for k in ('host_name', 'full_command')})
    log = Log(
        event=f'cli_{func_name}',
        task_instance=None,
        owner=metrics['user'],
        extra=extra,
        task_id=metrics.get('task_id'),
        dag_id=metrics.get('dag_id'),
        execution_date=metrics.get('execution_date'),
    )
    metrics['log'] = log
    return metrics
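This args/kwargs variant is what a CLI action-logging decorator would call around the wrapped command. The sketch below shows one plausible wiring; cli_action_loggers.on_pre_execution and on_post_execution are real Airflow hooks, but the decorator body itself is illustrative rather than a verbatim copy of airflow.utils.cli.

# Hedged sketch of a CLI decorator that invokes the _build_metrics variant above.
import functools
from datetime import datetime

from airflow.utils import cli_action_loggers


def action_logging(f):
    @functools.wraps(f)
    def wrapper(*args, **kwargs):
        # Build metrics (including the Log row) before running the command.
        metrics = _build_metrics(f.__name__, args, kwargs)
        cli_action_loggers.on_pre_execution(**metrics)
        try:
            return f(*args, **kwargs)
        except Exception as e:
            metrics['error'] = e
            raise
        finally:
            metrics['end_datetime'] = datetime.utcnow()
            cli_action_loggers.on_post_execution(**metrics)

    return wrapper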
def test_should_raises_401_unauthenticated(self, session):
    log_model = Log(
        event='TEST_EVENT',
        task_instance=self._create_task_instance(),
    )
    log_model.dttm = timezone.parse(self.default_time)
    session.add(log_model)
    session.commit()
    event_log_id = log_model.id

    response = self.client.get(f"/api/v1/eventLogs/{event_log_id}")

    assert_401(response)
def test_should_response_200(self, session):
    log_model_1 = Log(
        event='TEST_EVENT_1',
        task_instance=self._create_task_instance(),
    )
    log_model_2 = Log(
        event='TEST_EVENT_2',
        task_instance=self._create_task_instance(),
    )
    log_model_1.dttm = timezone.parse(self.default_time)
    log_model_2.dttm = timezone.parse(self.default_time_2)
    session.add_all([log_model_1, log_model_2])
    session.commit()
    response = self.client.get("/api/v1/eventLogs")
    assert response.status_code == 200
    self.assertEqual(
        response.json,
        {
            "event_logs": [
                {
                    "event_log_id": log_model_1.id,
                    "event": "TEST_EVENT_1",
                    "dag_id": "TEST_DAG_ID",
                    "task_id": "TEST_TASK_ID",
                    "execution_date": self.default_time,
                    "owner": 'airflow',
                    "when": self.default_time,
                    "extra": None,
                },
                {
                    "event_log_id": log_model_2.id,
                    "event": "TEST_EVENT_2",
                    "dag_id": "TEST_DAG_ID",
                    "task_id": "TEST_TASK_ID",
                    "execution_date": self.default_time,
                    "owner": 'airflow',
                    "when": self.default_time_2,
                    "extra": None,
                },
            ],
            "total_entries": 2,
        },
    )
def add_log(execdate, session, timezone_override=None):
    dag = DAG(dag_id='logging', default_args={'start_date': execdate})
    task = DummyOperator(task_id='dummy', dag=dag, owner='airflow')
    task_instance = TaskInstance(task=task, execution_date=execdate, state='success')
    session.merge(task_instance)
    log = Log(State.RUNNING, task_instance)
    if timezone_override:
        log.dttm = log.dttm.astimezone(timezone_override)
    session.add(log)
    session.commit()
    return log
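A hedged usage sketch for add_log, assuming Airflow's create_session and State utilities and an initialized metadata database; the execution date below is an arbitrary example value.

import pendulum

from airflow.utils.session import create_session
from airflow.utils.state import State

with create_session() as session:
    execdate = pendulum.datetime(2021, 1, 1, tz="UTC")
    log = add_log(execdate, session)
    # Log stores the passed state string as its event; add_log commits before returning.
    assert log.event == State.RUNNING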
def test_serialize(self, session):
    event_log_model = Log(event="TEST_EVENT", task_instance=self._create_task_instance())
    session.add(event_log_model)
    session.commit()
    event_log_model.dttm = timezone.parse(self.default_time)
    log_model = session.query(Log).first()
    deserialized_log = event_log_schema.dump(log_model)
    assert deserialized_log == {
        "event_log_id": event_log_model.id,
        "event": "TEST_EVENT",
        "dag_id": "TEST_DAG_ID",
        "task_id": "TEST_TASK_ID",
        "execution_date": self.default_time,
        "owner": 'airflow',
        "when": self.default_time,
        "extra": None,
    }
def _build_metrics(func_name, namespace):
    """
    Builds a metrics dict from function arguments.

    It assumes that the arguments come from an airflow.bin.cli module function
    and include a Namespace instance that optionally contains "dag_id",
    "task_id", and "execution_date".

    :param func_name: name of function
    :param namespace: Namespace instance from argparse
    :return: dict with metrics
    """
    sensitive_fields = {'-p', '--password', '--conn-password'}
    full_command = list(sys.argv)
    for idx, command in enumerate(full_command):  # pylint: disable=too-many-nested-blocks
        if command in sensitive_fields:
            # For cases when password is passed as "--password xyz" (with space between key and value)
            full_command[idx + 1] = "*" * 8
        else:
            # For cases when password is passed as "--password=xyz" (with '=' between key and value)
            for sensitive_field in sensitive_fields:
                if command.startswith('{}='.format(sensitive_field)):
                    full_command[idx] = '{}={}'.format(sensitive_field, "*" * 8)

    metrics = {
        'sub_command': func_name,
        'start_datetime': datetime.utcnow(),
        'full_command': '{}'.format(full_command),
        'user': getpass.getuser(),
    }

    assert isinstance(namespace, Namespace)
    tmp_dic = vars(namespace)
    metrics['dag_id'] = tmp_dic.get('dag_id')
    metrics['task_id'] = tmp_dic.get('task_id')
    metrics['execution_date'] = tmp_dic.get('execution_date')
    metrics['host_name'] = socket.gethostname()

    extra = json.dumps(dict((k, metrics[k]) for k in ('host_name', 'full_command')))
    log = Log(
        event='cli_{}'.format(func_name),
        task_instance=None,
        owner=metrics['user'],
        extra=extra,
        task_id=metrics.get('task_id'),
        dag_id=metrics.get('dag_id'),
        execution_date=metrics.get('execution_date'),
    )
    metrics['log'] = log
    return metrics
def wrapper(*args, **kwargs):
    with create_session() as session:
        if g.user.is_anonymous:
            user = '******'
        else:
            user = g.user.username
        log = Log(
            event=f.__name__,
            task_instance=None,
            owner=user,
            extra=str(list(request.args.items())),
            task_id=request.args.get('task_id'),
            dag_id=request.args.get('dag_id'),
        )

        if 'execution_date' in request.args:
            log.execution_date = pendulum.parse(request.args.get('execution_date'))

        session.add(log)

    return f(*args, **kwargs)
def insert_dag_runs(
    session,
    dag_id="plugin_test_dag",
    dag_runs_count=1,
    task_instances_per_run=0,
    state="success",
    with_log=False,
):
    for i in range(dag_runs_count):
        execution_date = utcnow()
        dag_run = DagRun()
        dag_run.dag_id = dag_id
        dag_run.execution_date = execution_date
        dag_run._state = state
        if AIRFLOW_VERSION_2:
            dag_run.run_type = ""
        session.add(dag_run)

        if with_log:
            task_instance = FakeTaskInstance()
            task_instance.dag_id = dag_id
            task_instance.task_id = "task"
            task_instance.execution_date = execution_date
            task = FakeTask()
            task.owner = "Airflow"
            task_instance.task = task
            log = Log("success", task_instance)
            session.add(log)

        for j in range(task_instances_per_run):
            task = FakeTask(dag_id=dag_id, task_id="task{}".format(j))
            task_instance = TaskInstance(task, execution_date, state="success")
            session.add(task_instance)

    session.commit()
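A hedged usage sketch for insert_dag_runs; the helpers it relies on (FakeTask, FakeTaskInstance, AIRFLOW_VERSION_2, utcnow) are assumed to come from the same test module, and create_session is the standard Airflow session helper.

from airflow.models import DagRun, Log
from airflow.utils.session import create_session

with create_session() as session:
    # Two dag runs, each accompanied by one Log row because with_log=True.
    insert_dag_runs(session, dag_id="plugin_test_dag", dag_runs_count=2, with_log=True)
    assert session.query(DagRun).filter(DagRun.dag_id == "plugin_test_dag").count() == 2
    assert session.query(Log).filter(Log.dag_id == "plugin_test_dag").count() == 2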
def _run_raw_task(
    self,
    mark_success: bool = False,
    test_mode: bool = False,
    job_id: Optional[str] = None,
    pool: Optional[str] = None,
    error_file: Optional[str] = None,
    session=None,
) -> None:
    """
    Immediately runs the task (without checking or changing db state before execution),
    then sets the appropriate final state after completion and runs any post-execute
    callbacks. Meant to be called only after another function changes the state to running.

    :param mark_success: Don't run the task, mark its state as success
    :type mark_success: bool
    :param test_mode: Doesn't record success or failure in the DB
    :type test_mode: bool
    :param pool: specifies the pool to use to run the task instance
    :type pool: str
    :param session: SQLAlchemy ORM Session
    :type session: Session
    """
    task = self.task
    self.test_mode = test_mode
    refresh_from_task(self, task, pool_override=pool)
    # self.refresh_from_db(session=session)
    self.job_id = job_id
    self.hostname = get_hostname()

    context = {}  # type: Dict
    actual_start_date = timezone.utcnow()
    try:
        if not mark_success:
            context = self.get_template_context()
            _prepare_and_execute_task_with_callbacks(self, context, task)
        self.refresh_from_db(lock_for_update=True)
        self.state = State.SUCCESS
    except AirflowSkipException as e:
        # Recording SKIP
        # log only if exception has any arguments to prevent log flooding
        if e.args:
            self.log.info(e)
        self.refresh_from_db(lock_for_update=True)
        self.state = State.SKIPPED
        self.log.info(
            'Marking task as SKIPPED. '
            'dag_id=%s, task_id=%s, execution_date=%s, start_date=%s, end_date=%s',
            self.dag_id,
            self.task_id,
            _date_or_empty(self, 'execution_date'),
            _date_or_empty(self, 'start_date'),
            _date_or_empty(self, 'end_date'),
        )
    # except AirflowRescheduleException as reschedule_exception:
    #     self.refresh_from_db()
    #     self._handle_reschedule(actual_start_date, reschedule_exception, test_mode)
    #     return
    # except AirflowFailException as e:
    #     self.refresh_from_db()
    #     self.handle_failure(e, test_mode, force_fail=True, error_file=error_file)
    #     raise
    except AirflowException as e:
        self.refresh_from_db()
        # for case when task is marked as success/failed externally
        # current behavior doesn't hit the success callback
        if self.state in {State.SUCCESS, State.FAILED}:
            return
        else:
            self.handle_failure(e, test_mode, error_file=error_file)
            raise
    except (Exception, KeyboardInterrupt) as e:
        self.handle_failure(e, test_mode, error_file=error_file)
        raise

    # Recording SUCCESS
    self.end_date = timezone.utcnow()
    self.log.info(
        'Marking task as SUCCESS. '
        'dag_id=%s, task_id=%s, execution_date=%s, start_date=%s, end_date=%s',
        self.dag_id,
        self.task_id,
        _date_or_empty(self, 'execution_date'),
        _date_or_empty(self, 'start_date'),
        _date_or_empty(self, 'end_date'),
    )
    self.set_duration()
    if not test_mode:
        session.add(Log(self.state, self))
        session.merge(self)
    session.commit()

    if not test_mode:
        _run_mini_scheduler_on_child_tasks(self, session)
def _create_event_logs(self, count):
    return [
        Log(event="TEST_EVENT_" + str(i), task_instance=self._create_task_instance())
        for i in range(1, count + 1)
    ]
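A hedged sketch of how this helper might be used inside a pagination-style test; the test name, page size, and query string are illustrative values, not taken from the source.

def test_should_return_limited_event_logs(self, session):
    # Insert more rows than one page can hold, then request a single page.
    event_logs = self._create_event_logs(110)
    session.add_all(event_logs)
    session.commit()

    response = self.client.get(
        "/api/v1/eventLogs?limit=100", environ_overrides={'REMOTE_USER': "******"}
    )
    assert response.status_code == 200
    assert response.json["total_entries"] == 110
    assert len(response.json["event_logs"]) == 100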