def test_trigger_dag_with_dict_conf(self, dag_bag_mock):
    dag_id = "trigger_dag_with_dict_conf"
    dag = DAG(dag_id)
    dag_bag_mock.dags = [dag_id]
    dag_bag_mock.get_dag.return_value = dag
    conf = dict(foo="bar")
    dag_run = DagRun()
    triggers = _trigger_dag(
        dag_id,
        dag_bag_mock,
        dag_run,
        run_id=None,
        conf=conf,
        execution_date=None,
        replace_microseconds=True,
    )
    self.assertEqual(triggers[0].conf, conf)
def create_dag_run(self, dag_id, run_id, conf, session):
    """
    Creates a new DagRun. Should not be called with a nonexistent dag_id.

    Raises an exception if a DagRun for the same DAG with the same run_id
    was previously created.
    """
    if session.query(DagRun).filter(DagRun.dag_id == dag_id, DagRun.run_id == run_id).one_or_none():
        raise ValueError(f"dag_run {run_id} for dag_id {dag_id} already exists")
    run_conf = conf if isinstance(conf, dict) else json.loads(conf)
    dag_run = DagRun(dag_id=dag_id, run_id=run_id, conf=run_conf)
    session.add(dag_run)
    session.commit()
    return dag_run
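A minimal usage sketch for create_dag_run above; the `manager` object hosting the method, the dag_id, and the run_id are illustrative assumptions.

from airflow import settings

session = settings.Session()
dag_run = manager.create_dag_run(  # `manager` is a hypothetical host object
    dag_id="example_dag",
    run_id="manual__2021-01-01T00:00:00",
    conf='{"foo": "bar"}',  # a JSON string is parsed via json.loads
    session=session,
)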
def test_should_raises_401_unauthenticated(self, session):
    dagrun_model = DagRun(
        dag_id="TEST_DAG_ID",
        run_id="TEST_DAG_RUN_ID",
        run_type=DagRunType.MANUAL.value,
        execution_date=timezone.parse(self.default_time),
        start_date=timezone.parse(self.default_time),
        external_trigger=True,
    )
    session.add(dagrun_model)
    session.commit()

    response = self.client.get("api/v1/dags/TEST_DAG_ID/dagRuns/TEST_DAG_RUN_ID")

    assert_401(response)
def _create_dag_runs(self, count):
    dag_runs = [
        DagRun(
            dag_id="TEST_DAG_ID",
            run_id="TEST_DAG_RUN_ID" + str(i),
            run_type=DagRunType.MANUAL,
            execution_date=timezone.parse(self.default_time) + timedelta(minutes=i),
            start_date=timezone.parse(self.default_time),
            external_trigger=True,
        )
        for i in range(1, count + 1)
    ]
    dag = DagModel(dag_id="TEST_DAG_ID")
    with create_session() as session:
        session.add_all(dag_runs)
        session.add(dag)
def execute(self, context):
    dro = DagRunOrder(run_id='trig__' + datetime.now().isoformat())
    dro = self.python_callable(context, dro)
    if dro:
        session = settings.Session()
        dr = DagRun(
            dag_id=self.trigger_dag_id,
            run_id=dro.run_id,
            conf=dro.payload,
            external_trigger=True)
        logging.info("Creating DagRun {}".format(dr))
        session.add(dr)
        session.commit()
        session.close()
    else:
        logging.info("Criteria not met, moving on")
def test_trigger_dag_with_str_conf(self, dag_bag_mock):
    dag_id = "trigger_dag_with_str_conf"
    dag = DAG(dag_id)
    dag_bag_mock.dags = [dag_id]
    dag_bag_mock.get_dag.return_value = dag
    conf = "{\"foo\": \"bar\"}"
    dag_run = DagRun()
    triggers = _trigger_dag(
        dag_id,
        dag_bag_mock,
        dag_run,
        run_id=None,
        conf=conf,
        execution_date=None,
        replace_microseconds=True,
    )
    self.assertEqual(triggers[0].conf, json.loads(conf))
def post_dag_run(dag_id, session):
    """Trigger a DAG."""
    if not session.query(DagModel).filter(DagModel.dag_id == dag_id).first():
        raise NotFound(title="DAG not found", detail=f"DAG with dag_id: '{dag_id}' not found")

    post_body = dagrun_schema.load(request.json, session=session)
    dagrun_instance = (
        session.query(DagRun)
        .filter(DagRun.dag_id == dag_id, DagRun.run_id == post_body["run_id"])
        .first()
    )
    if not dagrun_instance:
        dag_run = DagRun(dag_id=dag_id, run_type=DagRunType.MANUAL, **post_body)
        session.add(dag_run)
        session.commit()
        return dagrun_schema.dump(dag_run)

    raise AlreadyExists(
        detail=f"DAGRun with DAG ID: '{dag_id}' and DAGRun ID: '{post_body['run_id']}' already exists"
    )
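The handler above appears to back Airflow's stable REST endpoint POST /api/v1/dags/{dag_id}/dagRuns. A hedged client-side sketch, assuming a local webserver with basic auth enabled; host, credentials, dag_id, and run id are placeholders.

import requests

response = requests.post(
    "http://localhost:8080/api/v1/dags/example_dag/dagRuns",  # assumed host and dag_id
    auth=("admin", "admin"),  # assumes basic-auth credentials are configured
    json={"dag_run_id": "manual__example", "conf": {"foo": "bar"}},
)
response.raise_for_status()
print(response.json())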
def test_trigger_dag_dag_run_exist(self, dag_bag_mock, dag_run_mock):
    dag_id = "dag_run_exist"
    dag = DAG(dag_id)
    dag_bag_mock.dags = [dag_id]
    dag_bag_mock.get_dag.return_value = dag
    dag_run_mock.find.return_value = DagRun()
    self.assertRaises(
        AirflowException,
        _trigger_dag,
        dag_id,
        dag_bag_mock,
        dag_run_mock,
        run_id=None,
        conf=None,
        execution_date=None,
        replace_microseconds=True,
    )
def test_generate_role_session_name():
    dag = DAG("a-test-dag")
    task = airflow_docker.operator.Operator(
        dag=dag,
        task_id="some-task",
        image="hello-world",
        start_date=datetime.datetime(2019, 2, 14, 15),
    )
    ti = TaskInstance(task=task, execution_date=datetime.datetime(2019, 2, 14, 15))
    dag_run = DagRun(dag_id=dag.dag_id)
    dag_run.id = 5
    context = {"dag": dag, "task_instance": ti, "dag_run": dag_run}

    session_name = airflow_docker.ext.aws.role_assumption.generate_role_session_name(context)

    assert "5__1__some-task" == session_name
def test_trigger_dag_with_valid_start_date(self, dag_bag_mock):
    dag_id = "trigger_dag_with_valid_start_date"
    dag = DAG(dag_id, default_args={'start_date': timezone.datetime(2016, 9, 5, 10, 10, 0)})
    dag_bag_mock.dags = [dag_id]
    dag_bag_mock.get_dag.return_value = dag
    dag_run = DagRun()

    triggers = _trigger_dag(
        dag_id,
        dag_bag_mock,
        dag_run,
        run_id=None,
        conf=None,
        execution_date=timezone.datetime(2018, 7, 5, 10, 10, 0),
        replace_microseconds=True,
    )

    assert len(triggers) == 1
def test_trigger_dag_with_too_early_start_date(self, dag_bag_mock):
    dag_id = "trigger_dag_with_too_early_start_date"
    dag = DAG(dag_id, default_args={'start_date': timezone.datetime(2016, 9, 5, 10, 10, 0)})
    dag_bag_mock.dags = [dag_id]
    dag_bag_mock.get_dag.return_value = dag
    dag_run = DagRun()

    self.assertRaises(
        ValueError,
        _trigger_dag,
        dag_id,
        dag_bag_mock,
        dag_run,
        run_id=None,
        conf=None,
        execution_date=timezone.datetime(2015, 7, 5, 10, 10, 0),
        replace_microseconds=True,
    )
def trigger_dag(args):
    utils.log_to_stdout()
    session = settings.Session()
    # TODO: verify dag_id
    execution_date = datetime.now()
    dr = session.query(DagRun).filter(
        DagRun.dag_id == args.dag_id, DagRun.run_id == args.run_id).first()
    if dr:
        logging.error("This run_id already exists")
    else:
        trigger = DagRun(
            dag_id=args.dag_id,
            run_id=args.run_id,
            execution_date=execution_date,
            state=State.RUNNING,
            external_trigger=True)
        session.add(trigger)
        logging.info("Created {}".format(trigger))
        session.commit()
def test_subdag_with_propagate_skipped_state(self, propagate_option, states, skip_parent,
                                             mock_get_task_instance, mock_skip):
    """
    Tests that the skipped state of leaf tasks propagates to the parent dag.
    Note that skipped state propagation only takes effect when the dagrun's
    state is SUCCESS.
    """
    dag = DAG('parent', default_args=default_args)
    subdag = DAG('parent.test', default_args=default_args)
    subdag_task = SubDagOperator(
        task_id='test',
        subdag=subdag,
        dag=dag,
        poke_interval=1,
        propagate_skipped_state=propagate_option,
    )
    dummy_subdag_tasks = [
        DummyOperator(task_id=f'dummy_subdag_{i}', dag=subdag)
        for i in range(len(states))
    ]
    dummy_dag_task = DummyOperator(task_id='dummy_dag', dag=dag)
    subdag_task >> dummy_dag_task

    subdag_task._get_dagrun = Mock()
    subdag_task._get_dagrun.return_value = self.dag_run_success
    mock_get_task_instance.side_effect = [
        TaskInstance(task=task, execution_date=DEFAULT_DATE, state=state)
        for task, state in zip(dummy_subdag_tasks, states)
    ]

    context = {
        'execution_date': DEFAULT_DATE,
        'dag_run': DagRun(),
        'task': subdag_task,
    }
    subdag_task.post_execute(context)

    if skip_parent:
        mock_skip.assert_called_once_with(
            context['dag_run'], context['execution_date'], [dummy_dag_task])
    else:
        mock_skip.assert_not_called()
def trigger_dag(
    dag_id,
    run_id=None,
    conf=None,
    execution_date=None,
    replace_microseconds=True,
):
    dagbag = DagBag()
    dag_run = DagRun()
    triggers = _trigger_dag(
        dag_id=dag_id,
        dag_run=dag_run,
        dag_bag=dagbag,
        run_id=run_id,
        conf=conf,
        execution_date=execution_date,
        replace_microseconds=replace_microseconds,
    )

    return triggers[0] if triggers else None
def test_generate_role_session_name_long_task_id():
    dag = DAG("a-test-dag")
    task = airflow_docker.operator.Operator(
        dag=dag,
        task_id="some-task-id-that-is-very-long-way-past-the-64-character-limit-foo-bar-baz",
        image="hello-world",
        start_date=datetime.datetime(2019, 2, 14, 15),
    )
    ti = TaskInstance(task=task, execution_date=datetime.datetime(2019, 2, 14, 15))
    dag_run = DagRun(dag_id=dag.dag_id)
    dag_run.id = 5
    context = {"dag": dag, "task_instance": ti, "dag_run": dag_run}

    session_name = airflow_docker.ext.aws.role_assumption.generate_role_session_name(context)

    assert "5__1__some-task-id-that-is-very-long-way-past-the-64-character-l" == session_name
def create_context(task):
    dag = DAG(dag_id="dag")
    tzinfo = pendulum.timezone("Europe/Amsterdam")
    execution_date = timezone.datetime(2016, 1, 1, 1, 0, 0, tzinfo=tzinfo)
    dag_run = DagRun(
        dag_id=dag.dag_id,
        execution_date=execution_date,
        run_id=DagRun.generate_run_id(DagRunType.MANUAL, execution_date),
    )
    task_instance = TaskInstance(task=task)
    task_instance.dag_run = dag_run
    task_instance.dag_id = dag.dag_id
    task_instance.xcom_push = mock.Mock()
    return {
        "dag": dag,
        "run_id": dag_run.run_id,
        "task": task,
        "ti": task_instance,
        "task_instance": task_instance,
    }
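A brief usage sketch for the create_context helper above; the BashOperator task is an illustrative assumption, and the helper builds its own DAG and DagRun internally.

from airflow.operators.bash import BashOperator

task = BashOperator(task_id="echo", bash_command="echo hello")
context = create_context(task)
context["task_instance"].xcom_push("key", "value")  # xcom_push is mocked, so the call is only recorded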
def execute(self, context):
    biowardrobe_uid = context['dag_run'].conf['biowardrobe_uid'] \
        if 'biowardrobe_uid' in context['dag_run'].conf else None
    if not biowardrobe_uid:
        raise Exception('biowardrobe_uid must be provided')

    run_id = context['dag_run'].conf['run_id'] \
        if 'run_id' in context['dag_run'].conf \
        else 'trig__{}__{}'.format(biowardrobe_uid, uuid.uuid4())

    _logger.info('Successfully finished: {}'.format(biowardrobe_uid))

    mysql = MySqlHook(mysql_conn_id=biowardrobe_connection_id)
    with closing(mysql.get_conn()) as conn:
        with closing(conn.cursor()) as cursor:
            cursor.execute(
                "update ems.labdata set libstatus=10, libstatustxt='downloaded' where uid=%s",
                (biowardrobe_uid,))
            conn.commit()
            data = get_biowardrobe_data(cursor=cursor, biowardrobe_uid=biowardrobe_uid)

    dag_id = os.path.basename(os.path.splitext(data['workflow'])[0])
    payload = {'biowardrobe_uid': biowardrobe_uid, 'run_id': run_id}
    _logger.info("Trigger basic analysis with: {}".format(payload))

    session = settings.Session()
    dr = DagRun(
        dag_id=dag_id,
        run_id=run_id,
        conf=payload,
        execution_date=datetime.now(),
        external_trigger=True)
    logging.info("Creating DagRun {}".format(dr))
    session.add(dr)
    session.commit()
    session.close()
def trigger_dag(
    dag_id: str,
    run_id: Optional[str] = None,
    conf: Optional[Union[dict, str]] = None,
    execution_date: Optional[datetime] = None,
    replace_microseconds: bool = True,
) -> Optional[DagRun]:
    """Triggers execution of the DAG specified by dag_id.

    :param dag_id: DAG ID
    :param run_id: ID of the dag_run
    :param conf: configuration
    :param execution_date: date of execution
    :param replace_microseconds: whether microseconds should be zeroed
    :return: the first dag run triggered (even if more than one was triggered), or None
    """
    dag_model = DagModel.get_current(dag_id)
    if dag_model is None:
        raise DagNotFound("Dag id {} not found in DagModel".format(dag_id))

    def read_store_serialized_dags():
        from airflow.configuration import conf
        return conf.getboolean('core', 'store_serialized_dags')

    dagbag = DagBag(
        dag_folder=dag_model.fileloc,
        store_serialized_dags=read_store_serialized_dags(),
    )
    dag_run = DagRun()
    triggers = _trigger_dag(
        dag_id=dag_id,
        dag_run=dag_run,
        dag_bag=dagbag,
        run_id=run_id,
        conf=conf,
        execution_date=execution_date,
        replace_microseconds=replace_microseconds,
    )

    return triggers[0] if triggers else None
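A minimal usage sketch of the trigger_dag API above; the dag_id and conf values are illustrative assumptions.

triggered = trigger_dag(dag_id="example_dag", conf={"foo": "bar"})  # assumed dag_id and conf
if triggered is not None:
    print(triggered.run_id)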
def trigger_dag(self, dag_id, run_id, conf):
    try:
        dag_path = DagModel.get_current(dag_id).fileloc
    except Exception:
        dag_path = path.join(DAGS_FOLDER, dag_id + ".py")
    dag_bag = DagBag(dag_folder=dag_path)
    if not dag_bag.dags:
        logging.info("Failed to import dag due to the following errors")
        logging.info(dag_bag.import_errors)
        logging.info("Sleep for 3 seconds and give it a second try")
        sleep(3)
        dag_bag = DagBag(dag_folder=dag_path)
    triggers = trigger_dag._trigger_dag(
        dag_id=dag_id,
        dag_run=DagRun(),
        dag_bag=dag_bag,
        run_id=run_id,
        conf=conf,
        execution_date=None,
        replace_microseconds=False)
    return triggers[0] if triggers else None
def generate_pod_yaml(args):
    """Generates yaml files for each task in the DAG. Used for testing output of KubernetesExecutor"""
    execution_date = args.execution_date
    dag = get_dag(subdir=args.subdir, dag_id=args.dag_id)
    yaml_output_path = args.output_path
    dr = DagRun(dag.dag_id, execution_date=execution_date)
    kube_config = KubeConfig()
    for task in dag.tasks:
        ti = TaskInstance(task, None)
        ti.dag_run = dr
        pod = PodGenerator.construct_pod(
            dag_id=args.dag_id,
            task_id=ti.task_id,
            pod_id=create_pod_id(args.dag_id, ti.task_id),
            try_number=ti.try_number,
            kube_image=kube_config.kube_image,
            date=ti.execution_date,
            args=ti.command_as_list(),
            pod_override_object=PodGenerator.from_obj(ti.executor_config),
            scheduler_job_id="worker-config",
            namespace=kube_config.executor_namespace,
            base_worker_pod=PodGenerator.deserialize_model_file(kube_config.pod_template_file),
        )
        pod_mutation_hook(pod)
        api_client = ApiClient()
        date_string = pod_generator.datetime_to_label_safe_datestring(execution_date)
        yaml_file_name = f"{args.dag_id}_{ti.task_id}_{date_string}.yml"
        os.makedirs(os.path.dirname(yaml_output_path + "/airflow_yaml_output/"), exist_ok=True)
        with open(yaml_output_path + "/airflow_yaml_output/" + yaml_file_name, "w") as output:
            sanitized_pod = api_client.sanitize_for_serialization(pod)
            output.write(yaml.dump(sanitized_pod))
    print(f"YAML output can be found at {yaml_output_path}/airflow_yaml_output/")
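A hedged invocation sketch for generate_pod_yaml above; SimpleNamespace stands in for the parsed CLI args object, and every field value is an assumption for illustration.

from datetime import datetime
from types import SimpleNamespace

args = SimpleNamespace(
    execution_date=datetime(2022, 1, 1),  # placeholder date
    subdir=None,                          # fall back to the default DAGs folder
    dag_id="example_dag",                 # assumed dag_id
    output_path="/tmp",                   # directory where the yaml files land
)
generate_pod_yaml(args)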
def trigger_dag(
    dag_id,
    run_id=None,
    conf=None,
    execution_date=None,
    replace_microseconds=True,
):
    dag_model = DagModel.get_current(dag_id)
    if dag_model is None:
        raise DagNotFound("Dag id {} not found in DagModel".format(dag_id))

    dagbag = DagBag(dag_folder=dag_model.fileloc)
    dag_run = DagRun()
    triggers = _trigger_dag(
        dag_id=dag_id,
        dag_run=dag_run,
        dag_bag=dagbag,
        run_id=run_id,
        conf=conf,
        execution_date=execution_date,
        replace_microseconds=replace_microseconds,
    )

    return triggers[0] if triggers else None
def trigger_dag(args):
    session = settings.Session()
    # TODO: verify dag_id
    execution_date = datetime.now()
    run_id = args.run_id or "manual__{0}".format(execution_date.isoformat())
    dr = session.query(DagRun).filter(
        DagRun.dag_id == args.dag_id, DagRun.run_id == run_id).first()
    conf = {}
    if args.conf:
        conf = json.loads(args.conf)
    if dr:
        logging.error("This run_id already exists")
    else:
        trigger = DagRun(
            dag_id=args.dag_id,
            run_id=run_id,
            execution_date=execution_date,
            state=State.RUNNING,
            conf=conf,
            external_trigger=True)
        session.add(trigger)
        logging.info("Created {}".format(trigger))
        session.commit()
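A hedged sketch of invoking the CLI-style trigger_dag above outside argparse; SimpleNamespace stands in for the parsed args object, and the dag_id and conf are assumptions.

from types import SimpleNamespace

args = SimpleNamespace(dag_id="example_dag", run_id=None, conf='{"foo": "bar"}')
trigger_dag(args)  # creates a RUNNING DagRun unless the generated run_id already exists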
def trigger_dag(self):
    """
    Triggers execution of the DAG interpreted from the report's dag_id.

    _trigger_dag iterates through the class registry, looks for any model
    that has dag_id as an attribute, and deletes all references to the
    specific dag_id.
    """
    dag_model = DagModel.get_current(self.dag_id)
    if dag_model is None:
        raise DagNotFound(f"Dag id {self.dag_id} not found in DagModel")
    dagbag = DagBag(
        dag_folder=dag_model.fileloc,
        store_serialized_dags=conf.getboolean("core", "store_serialized_dags"),
    )
    dag_run = DagRun()
    self._trigger_dag(dag_id=self.dag_id, dag_bag=dagbag, dag_run=dag_run)
def insert_dag_runs(
    session,
    dag_id="plugin_test_dag",
    dag_runs_count=1,
    task_instances_per_run=0,
    state="success",
    with_log=False,
):
    for i in range(dag_runs_count):
        execution_date = utcnow()
        dag_run = DagRun()
        dag_run.dag_id = dag_id
        dag_run.execution_date = execution_date
        dag_run._state = state
        if AIRFLOW_VERSION_2:
            dag_run.run_type = ""
        session.add(dag_run)
        if with_log:
            task_instance = FakeTaskInstance()
            task_instance.dag_id = dag_id
            task_instance.task_id = "task"
            task_instance.execution_date = execution_date
            task = FakeTask()
            task.owner = "Airflow"
            task_instance.task = task
            log = Log("success", task_instance)
            session.add(log)
        for j in range(task_instances_per_run):
            task = FakeTask(dag_id=dag_id, task_id="task{}".format(j))
            task_instance = TaskInstance(task, execution_date, state="success")
            session.add(task_instance)
    session.commit()
def test_serialize(self, session):
    dagrun_model = DagRun(
        run_id='my-dag-run',
        run_type=DagRunType.MANUAL.value,
        execution_date=timezone.parse(self.default_time),
        start_date=timezone.parse(self.default_time),
        conf='{"start": "stop"}',
    )
    session.add(dagrun_model)
    session.commit()
    dagrun_model = session.query(DagRun).first()
    deserialized_dagrun = dagrun_schema.dump(dagrun_model)
    self.assertEqual(
        deserialized_dagrun[0],
        {
            'dag_id': None,
            'dag_run_id': 'my-dag-run',
            'end_date': None,
            'state': 'running',
            'execution_date': self.default_time,
            'external_trigger': True,
            'start_date': self.default_time,
            'conf': {"start": "stop"},
        },
    )
def dag_backfill(args, dag=None):
    """Creates backfill job or dry run for a DAG"""
    logging.basicConfig(level=settings.LOGGING_LEVEL, format=settings.SIMPLE_LOG_FORMAT)
    signal.signal(signal.SIGTERM, sigint_handler)

    import warnings

    warnings.warn(
        '--ignore-first-depends-on-past is deprecated as the value is always set to True',
        category=PendingDeprecationWarning,
    )

    if args.ignore_first_depends_on_past is False:
        args.ignore_first_depends_on_past = True

    if not args.start_date and not args.end_date:
        raise AirflowException("Provide a start_date and/or end_date")

    dag = dag or get_dag(args.subdir, args.dag_id)

    # If only one date is passed, use it for both start and end
    args.end_date = args.end_date or args.start_date
    args.start_date = args.start_date or args.end_date

    if args.task_regex:
        dag = dag.partial_subset(
            task_ids_or_regex=args.task_regex, include_upstream=not args.ignore_dependencies
        )
        if not dag.task_dict:
            raise AirflowException(
                f"There are no tasks that match '{args.task_regex}' regex. Nothing to run, exiting..."
            )

    run_conf = None
    if args.conf:
        run_conf = json.loads(args.conf)

    if args.dry_run:
        print(f"Dry run of DAG {args.dag_id} on {args.start_date}")
        dr = DagRun(dag.dag_id, execution_date=args.start_date)
        for task in dag.tasks:
            print(f"Task {task.task_id}")
            ti = TaskInstance(task, run_id=None)
            ti.dag_run = dr
            ti.dry_run()
    else:
        if args.reset_dagruns:
            DAG.clear_dags(
                [dag],
                start_date=args.start_date,
                end_date=args.end_date,
                confirm_prompt=not args.yes,
                include_subdags=True,
                dag_run_state=DagRunState.QUEUED,
            )
        try:
            dag.run(
                start_date=args.start_date,
                end_date=args.end_date,
                mark_success=args.mark_success,
                local=args.local,
                donot_pickle=(args.donot_pickle or conf.getboolean('core', 'donot_pickle')),
                ignore_first_depends_on_past=args.ignore_first_depends_on_past,
                ignore_task_deps=args.ignore_dependencies,
                pool=args.pool,
                delay_on_limit_secs=args.delay_on_limit,
                verbose=args.verbose,
                conf=run_conf,
                rerun_failed_tasks=args.rerun_failed_tasks,
                run_backwards=args.run_backwards,
                continue_on_failures=args.continue_on_failures,
            )
        except ValueError as vr:
            print(str(vr))
            sys.exit(1)
def execute(self, context):
    started_at = datetime.utcnow()
    _keep_going = True
    while _keep_going:
        _force_run_data = self.get_force_run_data()
        _logger.info("Force run data: {}".format(_force_run_data))
        if not _force_run_data:
            if (datetime.utcnow() - started_at).total_seconds() > self.timeout:
                raise AirflowSkipException('Snap. Time is OUT.')
            sleep(self.poke_interval)
            continue

        for row in _force_run_data:
            _keep_going = False
            biowardrobe_uid = row['uid']

            # TODO: Check if dag is running in airflow
            # TODO: If not running!
            data = self.get_record_data(biowardrobe_uid)
            if not data:
                _logger.error('No biowardrobe data {}'.format(biowardrobe_uid))
                continue

            # Actual force run
            basedir = data['output_folder']
            try:
                os.chdir(basedir)
                for root, dirs, files in os.walk(".", topdown=False):
                    for name in files:
                        if "fastq" in name:
                            continue
                        os.remove(os.path.join(root, name))
                rmtree(os.path.join(basedir, 'tophat'), True)
            except Exception:
                pass

            if int(data['deleted']) == 0:
                cmd = 'bunzip2 {}*.fastq.bz2'.format(biowardrobe_uid)
                try:
                    check_output(cmd, shell=True)
                except Exception as e:
                    _logger.error("Can't uncompress: {} {}".format(cmd, str(e)))
                if not os.path.isfile(biowardrobe_uid + '.fastq'):
                    _logger.error("File does not exist: {}".format(biowardrobe_uid))
                    continue
                if not os.path.isfile(biowardrobe_uid + '_2.fastq') and data['pair']:
                    _logger.error("File 2 does not exist: {}".format(biowardrobe_uid))
                    continue
            else:
                rmtree(basedir, True)

            mysql = MySqlHook(mysql_conn_id=biowardrobe_connection_id)
            with closing(mysql.get_conn()) as conn:
                with closing(conn.cursor()) as cursor:
                    self.drop_sql(cursor, data)
                    if int(data['deleted']) == 0:
                        cursor.execute(
                            "update labdata set libstatustxt=%s, libstatus=10, forcerun=0, tagstotal=0,"
                            "tagsmapped=0,tagsribo=0,tagsused=0,tagssuppressed=0 where uid=%s",
                            ("Ready to be reanalyzed", biowardrobe_uid))
                        conn.commit()
                    else:
                        cursor.execute(
                            "update labdata set libstatustxt=%s,deleted=2,datedel=CURDATE() where uid=%s",
                            ("Deleted", biowardrobe_uid))
                        conn.commit()
                        _logger.info("Deleted: {}".format(biowardrobe_uid))
                        continue

            _dag_id = os.path.basename(os.path.splitext(data['workflow'])[0])
            _run_id = 'forcerun__{}__{}'.format(biowardrobe_uid, uuid.uuid4())

            session = settings.Session()
            dr = DagRun(
                dag_id=_dag_id,
                run_id=_run_id,
                conf={'biowardrobe_uid': biowardrobe_uid, 'run_id': _run_id},
                execution_date=datetime.now(),
                start_date=datetime.now(),
                external_trigger=True)
            logging.info("Creating DagRun {}".format(dr))
            session.add(dr)
            session.commit()
            session.close()
def test_lineage_backend_capture_executions(mock_emit, inlets, outlets):
    DEFAULT_DATE = datetime.datetime(2020, 5, 17)
    mock_emitter = Mock()
    mock_emit.return_value = mock_emitter
    # Using autospec on xcom_pull and xcom_push methods fails on Python 3.6.
    with mock.patch.dict(
        os.environ,
        {
            "AIRFLOW__LINEAGE__BACKEND": "datahub_provider.lineage.datahub.DatahubLineageBackend",
            "AIRFLOW__LINEAGE__DATAHUB_CONN_ID": datahub_rest_connection_config.conn_id,
            "AIRFLOW__LINEAGE__DATAHUB_KWARGS": json.dumps(
                {"graceful_exceptions": False, "capture_executions": True}
            ),
        },
    ), mock.patch("airflow.models.BaseOperator.xcom_pull"), mock.patch(
        "airflow.models.BaseOperator.xcom_push"
    ), patch_airflow_connection(datahub_rest_connection_config):
        func = mock.Mock()
        func.__name__ = "foo"

        dag = DAG(dag_id="test_lineage_is_sent_to_backend", start_date=DEFAULT_DATE)

        with dag:
            op1 = DummyOperator(
                task_id="task1_upstream",
                inlets=inlets,
                outlets=outlets,
            )
            op2 = DummyOperator(
                task_id="task2",
                inlets=inlets,
                outlets=outlets,
            )
            op1 >> op2

        # Airflow < 2.2 requires the execution_date parameter. Newer Airflow
        # versions do not require it, but will attempt to find the associated
        # run_id in the database if execution_date is provided. As such, we
        # must fake the run_id parameter for newer Airflow versions.
        if AIRFLOW_VERSION < packaging.version.parse("2.2.0"):
            ti = TaskInstance(task=op2, execution_date=DEFAULT_DATE)
            # Ignoring type here because DagRun state is just a string at Airflow 1
            dag_run = DagRun(state="success", run_id=f"scheduled_{DEFAULT_DATE}")  # type: ignore
            ti.dag_run = dag_run
            ti.start_date = datetime.datetime.utcnow()
            ti.execution_date = DEFAULT_DATE
        else:
            from airflow.utils.state import DagRunState

            ti = TaskInstance(task=op2, run_id=f"test_airflow-{DEFAULT_DATE}")
            dag_run = DagRun(state=DagRunState.SUCCESS, run_id=f"scheduled_{DEFAULT_DATE}")
            ti.dag_run = dag_run
            ti.start_date = datetime.datetime.utcnow()
            ti.execution_date = DEFAULT_DATE

        ctx1 = {
            "dag": dag,
            "task": op2,
            "ti": ti,
            "dag_run": dag_run,
            "task_instance": ti,
            "execution_date": DEFAULT_DATE,
            "ts": "2021-04-08T00:54:25.771575+00:00",
        }

        prep = prepare_lineage(func)
        prep(op2, ctx1)
        post = apply_lineage(func)
        post(op2, ctx1)

        # Verify that the inlets and outlets are registered and recognized by Airflow correctly,
        # or that our lineage backend forces it to.
        assert len(op2.inlets) == 1
        assert len(op2.outlets) == 1
        assert all(map(lambda let: isinstance(let, Dataset), op2.inlets))
        assert all(map(lambda let: isinstance(let, Dataset), op2.outlets))

        # Check that the right things were emitted.
        assert mock_emitter.emit.call_count == 17
        # Running further checks based on python version because args only exists in python 3.7+
        if sys.version_info[:3] > (3, 7):
            assert mock_emitter.method_calls[0].args[0].aspectName == "dataFlowInfo"
            assert (
                mock_emitter.method_calls[0].args[0].entityUrn
                == "urn:li:dataFlow:(airflow,test_lineage_is_sent_to_backend,prod)"
            )
            assert mock_emitter.method_calls[1].args[0].aspectName == "ownership"
            assert (
                mock_emitter.method_calls[1].args[0].entityUrn
                == "urn:li:dataFlow:(airflow,test_lineage_is_sent_to_backend,prod)"
            )
            assert mock_emitter.method_calls[2].args[0].aspectName == "globalTags"
            assert (
                mock_emitter.method_calls[2].args[0].entityUrn
                == "urn:li:dataFlow:(airflow,test_lineage_is_sent_to_backend,prod)"
            )
            assert mock_emitter.method_calls[3].args[0].aspectName == "dataJobInfo"
            assert (
                mock_emitter.method_calls[3].args[0].entityUrn
                == "urn:li:dataJob:(urn:li:dataFlow:(airflow,test_lineage_is_sent_to_backend,prod),task2)"
            )
            assert mock_emitter.method_calls[4].args[0].aspectName == "dataJobInputOutput"
            assert (
                mock_emitter.method_calls[4].args[0].entityUrn
                == "urn:li:dataJob:(urn:li:dataFlow:(airflow,test_lineage_is_sent_to_backend,prod),task2)"
            )
            assert (
                mock_emitter.method_calls[4].args[0].aspect.inputDatajobs[0]
                == "urn:li:dataJob:(urn:li:dataFlow:(airflow,test_lineage_is_sent_to_backend,prod),task1_upstream)"
            )
            assert (
                mock_emitter.method_calls[4].args[0].aspect.inputDatasets[0]
                == "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableConsumed,PROD)"
            )
            assert (
                mock_emitter.method_calls[4].args[0].aspect.outputDatasets[0]
                == "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableProduced,PROD)"
            )
            assert mock_emitter.method_calls[5].args[0].aspectName == "status"
            assert (
                mock_emitter.method_calls[5].args[0].entityUrn
                == "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableConsumed,PROD)"
            )
            assert mock_emitter.method_calls[6].args[0].aspectName == "status"
            assert (
                mock_emitter.method_calls[6].args[0].entityUrn
                == "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableProduced,PROD)"
            )
            assert mock_emitter.method_calls[7].args[0].aspectName == "ownership"
            assert (
                mock_emitter.method_calls[7].args[0].entityUrn
                == "urn:li:dataJob:(urn:li:dataFlow:(airflow,test_lineage_is_sent_to_backend,prod),task2)"
            )
            assert mock_emitter.method_calls[8].args[0].aspectName == "globalTags"
            assert (
                mock_emitter.method_calls[8].args[0].entityUrn
                == "urn:li:dataJob:(urn:li:dataFlow:(airflow,test_lineage_is_sent_to_backend,prod),task2)"
            )
            assert mock_emitter.method_calls[9].args[0].aspectName == "dataProcessInstanceProperties"
            assert (
                mock_emitter.method_calls[9].args[0].entityUrn
                == "urn:li:dataProcessInstance:b6375e5f5faeb543cfb5d7d8a47661fb"
            )
            assert mock_emitter.method_calls[10].args[0].aspectName == "dataProcessInstanceRelationships"
            assert (
                mock_emitter.method_calls[10].args[0].entityUrn
                == "urn:li:dataProcessInstance:b6375e5f5faeb543cfb5d7d8a47661fb"
            )
            assert mock_emitter.method_calls[11].args[0].aspectName == "dataProcessInstanceInput"
            assert (
                mock_emitter.method_calls[11].args[0].entityUrn
                == "urn:li:dataProcessInstance:b6375e5f5faeb543cfb5d7d8a47661fb"
            )
            assert mock_emitter.method_calls[12].args[0].aspectName == "dataProcessInstanceOutput"
            assert (
                mock_emitter.method_calls[12].args[0].entityUrn
                == "urn:li:dataProcessInstance:b6375e5f5faeb543cfb5d7d8a47661fb"
            )
            assert mock_emitter.method_calls[13].args[0].aspectName == "status"
            assert (
                mock_emitter.method_calls[13].args[0].entityUrn
                == "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableConsumed,PROD)"
            )
            assert mock_emitter.method_calls[14].args[0].aspectName == "status"
            assert (
                mock_emitter.method_calls[14].args[0].entityUrn
                == "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableProduced,PROD)"
            )
            assert mock_emitter.method_calls[15].args[0].aspectName == "dataProcessInstanceRunEvent"
            assert (
                mock_emitter.method_calls[15].args[0].entityUrn
                == "urn:li:dataProcessInstance:b6375e5f5faeb543cfb5d7d8a47661fb"
            )
            assert mock_emitter.method_calls[16].args[0].aspectName == "dataProcessInstanceRunEvent"
            assert (
                mock_emitter.method_calls[16].args[0].entityUrn
                == "urn:li:dataProcessInstance:b6375e5f5faeb543cfb5d7d8a47661fb"
            )
def create_dagrun_from_dbnd_run(
    databand_run,
    dag,
    execution_date,
    run_id,
    state=State.RUNNING,
    external_trigger=False,
    conf=None,
    session=None,
):
    """Create a new DagRun and all relevant TaskInstances"""
    dagrun = (
        session.query(DagRun)
        .filter(DagRun.dag_id == dag.dag_id, DagRun.execution_date == execution_date)
        .first()
    )
    if dagrun is None:
        dagrun = DagRun(
            run_id=run_id,
            execution_date=execution_date,
            start_date=dag.start_date,
            _state=state,
            external_trigger=external_trigger,
            dag_id=dag.dag_id,
            conf=conf,
        )
        session.add(dagrun)
    else:
        logger.warning("Running with existing airflow dag run %s", dagrun)

    dagrun.dag = dag
    dagrun.run_id = run_id
    session.commit()

    # Create the associated task instances; state is None at the moment of creation.
    # dagrun.verify_integrity(session=session)  # fetches [TaskInstance] again
    # tasks_skipped = databand_run.tasks_skipped
    # We can find a source of the completion, but also,
    # sometimes we don't know the source of the "complete".
    TI = TaskInstance
    tis = (
        session.query(TI)
        .filter(TI.dag_id == dag.dag_id, TI.execution_date == execution_date)
        .all()
    )
    tis = {ti.task_id: ti for ti in tis}

    for af_task in dag.tasks:
        ti = tis.get(af_task.task_id)
        if ti is None:
            ti = TaskInstance(af_task, execution_date=execution_date)
            ti.start_date = timezone.utcnow()
            ti.end_date = timezone.utcnow()
            session.add(ti)
        task_run = databand_run.get_task_run_by_af_id(af_task.task_id)
        # All tasks that are part of the backfill are scheduled to the dagrun.
        # Set log file path to the expected airflow log file path.
        task_run.log.local_log_file.path = ti.log_filepath.replace(
            ".log", "/{0}.log".format(ti.try_number)
        )
        if task_run.is_reused:
            # This task is completed, so we don't need to run it anymore
            ti.state = State.SUCCESS

    session.commit()
    return dagrun