Example #1
    def test_trigger_dag_with_dict_conf(self, dag_bag_mock):
        dag_id = "trigger_dag_with_dict_conf"
        dag = DAG(dag_id)
        dag_bag_mock.dags = [dag_id]
        dag_bag_mock.get_dag.return_value = dag
        conf = dict(foo="bar")
        dag_run = DagRun()
        triggers = _trigger_dag(dag_id,
                                dag_bag_mock,
                                dag_run,
                                run_id=None,
                                conf=conf,
                                execution_date=None,
                                replace_microseconds=True)

        self.assertEqual(triggers[0].conf, conf)
Example #2
 def create_dag_run(self, dag_id, run_id, conf, session):
     """
     Creates a new DagRun. Shouldn't be called with a non-existent dag_id.
     Raises an exception if a DagRun for the same DAG with the same run_id
     was previously created.
     """
     if session.query(DagRun).filter(DagRun.dag_id == dag_id,
                                     DagRun.run_id == run_id).one_or_none():
         raise ValueError(
             f"dag_run {run_id} for dag_id {dag_id} already exists")
     else:
         run_conf = conf if isinstance(conf, dict) else json.loads(conf)
         dag_run = DagRun(dag_id=dag_id, run_id=run_id, conf=run_conf)
         session.add(dag_run)
         session.commit()
         return dag_run
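Note: a minimal usage sketch for the create_dag_run helper above, assuming Airflow's create_session context manager; `svc` stands in for an instance of whatever class defines the method, and the dag_id/run_id values are illustrative.

from airflow.utils.session import create_session

with create_session() as session:
    # `svc` is a placeholder for an instance of the class defining create_dag_run.
    # Raises ValueError if a run with this run_id already exists for the DAG.
    dag_run = svc.create_dag_run(
        dag_id="example_dag",
        run_id="manual__2021-01-01T00:00:00",
        conf='{"foo": "bar"}',  # a str conf is parsed with json.loads by the helper
        session=session,
    )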
Example #3
    def test_should_raises_401_unauthenticated(self, session):
        dagrun_model = DagRun(
            dag_id="TEST_DAG_ID",
            run_id="TEST_DAG_RUN_ID",
            run_type=DagRunType.MANUAL.value,
            execution_date=timezone.parse(self.default_time),
            start_date=timezone.parse(self.default_time),
            external_trigger=True,
        )
        session.add(dagrun_model)
        session.commit()

        response = self.client.get(
            "api/v1/dags/TEST_DAG_ID/dagRuns/TEST_DAG_RUN_ID")

        assert_401(response)
Example #4
 def _create_dag_runs(self, count):
     dag_runs = [
         DagRun(
             dag_id="TEST_DAG_ID",
             run_id="TEST_DAG_RUN_ID" + str(i),
             run_type=DagRunType.MANUAL,
             execution_date=timezone.parse(self.default_time) + timedelta(minutes=i),
             start_date=timezone.parse(self.default_time),
             external_trigger=True,
         )
         for i in range(1, count + 1)
     ]
     dag = DagModel(dag_id="TEST_DAG_ID")
     with create_session() as session:
         session.add_all(dag_runs)
         session.add(dag)
Example #5
 def execute(self, context):
     dro = DagRunOrder(run_id='trig__' + datetime.now().isoformat())
     dro = self.python_callable(context, dro)
     if dro:
         session = settings.Session()
         dr = DagRun(
             dag_id=self.trigger_dag_id,
             run_id=dro.run_id,
             conf=dro.payload,
             external_trigger=True)
         logging.info("Creating DagRun {}".format(dr))
         session.add(dr)
         session.commit()
         session.close()
     else:
         logging.info("Criteria not met, moving on")
Example #6
    def test_trigger_dag_with_str_conf(self, dag_bag_mock):
        dag_id = "trigger_dag_with_str_conf"
        dag = DAG(dag_id)
        dag_bag_mock.dags = [dag_id]
        dag_bag_mock.get_dag.return_value = dag
        conf = "{\"foo\": \"bar\"}"
        dag_run = DagRun()
        triggers = _trigger_dag(dag_id,
                                dag_bag_mock,
                                dag_run,
                                run_id=None,
                                conf=conf,
                                execution_date=None,
                                replace_microseconds=True)

        self.assertEqual(triggers[0].conf, json.loads(conf))
Example #7
def post_dag_run(dag_id, session):
    """Trigger a DAG."""
    if not session.query(DagModel).filter(DagModel.dag_id == dag_id).first():
        raise NotFound(title="DAG not found", detail=f"DAG with dag_id: '{dag_id}' not found")

    post_body = dagrun_schema.load(request.json, session=session)
    dagrun_instance = (
        session.query(DagRun).filter(DagRun.dag_id == dag_id, DagRun.run_id == post_body["run_id"]).first()
    )
    if not dagrun_instance:
        dag_run = DagRun(dag_id=dag_id, run_type=DagRunType.MANUAL, **post_body)
        session.add(dag_run)
        session.commit()
        return dagrun_schema.dump(dag_run)
    raise AlreadyExists(
        detail=f"DAGRun with DAG ID: '{dag_id}' and DAGRun ID: '{post_body['run_id']}' already exists"
    )
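Note: the post_dag_run handler above backs Airflow's stable REST endpoint for creating dag runs. A hedged client-side sketch, assuming a webserver on localhost:8080 with basic auth enabled; field names follow the stable API's DAGRun schema.

import requests

resp = requests.post(
    "http://localhost:8080/api/v1/dags/example_dag/dagRuns",
    json={"dag_run_id": "manual__2021-01-01T00:00:00", "conf": {"foo": "bar"}},
    auth=("admin", "admin"),
)
# 404 if the DAG is unknown, 409 if a run with this id already exists.
resp.raise_for_status()
print(resp.json()["state"])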
Example #8
 def test_trigger_dag_dag_run_exist(self, dag_bag_mock, dag_run_mock):
     dag_id = "dag_run_exist"
     dag = DAG(dag_id)
     dag_bag_mock.dags = [dag_id]
     dag_bag_mock.get_dag.return_value = dag
     dag_run_mock.find.return_value = DagRun()
     self.assertRaises(
         AirflowException,
         _trigger_dag,
         dag_id,
         dag_bag_mock,
         dag_run_mock,
         run_id=None,
         conf=None,
         execution_date=None,
         replace_microseconds=True,
     )
Example #9
def test_generate_role_session_name():
    dag = DAG("a-test-dag")
    task = airflow_docker.operator.Operator(
        dag=dag,
        task_id="some-task",
        image="hello-world",
        start_date=datetime.datetime(2019, 2, 14, 15),
    )
    ti = TaskInstance(task=task, execution_date=datetime.datetime(2019, 2, 14, 15))
    dag_run = DagRun(dag_id=dag.dag_id)
    dag_run.id = 5

    context = {"dag": dag, "task_instance": ti, "dag_run": dag_run}

    session_name = airflow_docker.ext.aws.role_assumption.generate_role_session_name(
        context
    )
    assert "5__1__some-task" == session_name
Example #10
    def test_trigger_dag_with_valid_start_date(self, dag_bag_mock):
        dag_id = "trigger_dag_with_valid_start_date"
        dag = DAG(dag_id, default_args={'start_date': timezone.datetime(2016, 9, 5, 10, 10, 0)})
        dag_bag_mock.dags = [dag_id]
        dag_bag_mock.get_dag.return_value = dag
        dag_run = DagRun()

        triggers = _trigger_dag(
            dag_id,
            dag_bag_mock,
            dag_run,
            run_id=None,
            conf=None,
            execution_date=timezone.datetime(2018, 7, 5, 10, 10, 0),
            replace_microseconds=True,
        )

        assert len(triggers) == 1
Example #11
    def test_trigger_dag_with_too_early_start_date(self, dag_bag_mock):
        dag_id = "trigger_dag_with_too_early_start_date"
        dag = DAG(dag_id, default_args={'start_date': timezone.datetime(2016, 9, 5, 10, 10, 0)})
        dag_bag_mock.dags = [dag_id]
        dag_bag_mock.get_dag.return_value = dag
        dag_run = DagRun()

        self.assertRaises(
            ValueError,
            _trigger_dag,
            dag_id,
            dag_bag_mock,
            dag_run,
            run_id=None,
            conf=None,
            execution_date=timezone.datetime(2015, 7, 5, 10, 10, 0),
            replace_microseconds=True,
        )
Example #12
File: cli.py  Project: nave91/airflow
def trigger_dag(args):
    utils.log_to_stdout()
    session = settings.Session()
    # TODO: verify dag_id
    execution_date = datetime.now()
    dr = session.query(DagRun).filter(DagRun.dag_id == args.dag_id,
                                      DagRun.run_id == args.run_id).first()
    if dr:
        logging.error("This run_id already exists")
    else:
        trigger = DagRun(dag_id=args.dag_id,
                         run_id=args.run_id,
                         execution_date=execution_date,
                         state=State.RUNNING,
                         external_trigger=True)
        session.add(trigger)
        logging.info("Created {}".format(trigger))
    session.commit()
Example #13
    def test_subdag_with_propagate_skipped_state(self, propagate_option,
                                                 states, skip_parent,
                                                 mock_get_task_instance,
                                                 mock_skip):
        """
        Tests that skipped state of leaf tasks propagates to the parent dag.
        Note that the skipped state propagation only takes effect when the dagrun's state is SUCCESS.
        """
        dag = DAG('parent', default_args=default_args)
        subdag = DAG('parent.test', default_args=default_args)
        subdag_task = SubDagOperator(task_id='test',
                                     subdag=subdag,
                                     dag=dag,
                                     poke_interval=1,
                                     propagate_skipped_state=propagate_option)
        dummy_subdag_tasks = [
            DummyOperator(task_id=f'dummy_subdag_{i}', dag=subdag)
            for i in range(len(states))
        ]
        dummy_dag_task = DummyOperator(task_id='dummy_dag', dag=dag)
        subdag_task >> dummy_dag_task

        subdag_task._get_dagrun = Mock()
        subdag_task._get_dagrun.return_value = self.dag_run_success
        mock_get_task_instance.side_effect = [
            TaskInstance(task=task, execution_date=DEFAULT_DATE, state=state)
            for task, state in zip(dummy_subdag_tasks, states)
        ]

        context = {
            'execution_date': DEFAULT_DATE,
            'dag_run': DagRun(),
            'task': subdag_task
        }
        subdag_task.post_execute(context)

        if skip_parent:
            mock_skip.assert_called_once_with(context['dag_run'],
                                              context['execution_date'],
                                              [dummy_dag_task])
        else:
            mock_skip.assert_not_called()
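Note: for context, a hedged sketch of how propagate_skipped_state is set on a SubDagOperator in user code, assuming Airflow 2's airflow.operators.subdag module; dag ids and dates are placeholders.

from airflow import DAG
from airflow.operators.subdag import SubDagOperator, SkippedStatePropagationOptions
from airflow.utils import timezone

parent = DAG("parent", start_date=timezone.datetime(2021, 1, 1), schedule_interval=None)
child = DAG("parent.section", start_date=timezone.datetime(2021, 1, 1), schedule_interval=None)

section = SubDagOperator(
    task_id="section",
    subdag=child,
    dag=parent,
    # Skip the parent's downstream tasks when any leaf of the subdag was skipped;
    # ALL_LEAVES would require every leaf task to have been skipped.
    propagate_skipped_state=SkippedStatePropagationOptions.ANY_LEAF,
)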
Example #14
def trigger_dag(
    dag_id,
    run_id=None,
    conf=None,
    execution_date=None,
    replace_microseconds=True,
):
    dagbag = DagBag()
    dag_run = DagRun()
    triggers = _trigger_dag(
        dag_id=dag_id,
        dag_run=dag_run,
        dag_bag=dagbag,
        run_id=run_id,
        conf=conf,
        execution_date=execution_date,
        replace_microseconds=replace_microseconds,
    )

    return triggers[0] if triggers else None
Example #15
def test_generate_role_session_name_long_task_id():
    dag = DAG("a-test-dag")
    task = airflow_docker.operator.Operator(
        dag=dag,
        task_id=
        "some-task-id-that-is-very-long-way-past-the-64-character-limit-foo-bar-baz",
        image="hello-world",
        start_date=datetime.datetime(2019, 2, 14, 15),
    )
    ti = TaskInstance(task=task,
                      execution_date=datetime.datetime(2019, 2, 14, 15))
    dag_run = DagRun(dag_id=dag.dag_id)
    dag_run.id = 5

    context = {"dag": dag, "task_instance": ti, "dag_run": dag_run}

    session_name = airflow_docker.ext.aws.role_assumption.generate_role_session_name(
        context)
    assert ("5__1__some-task-id-that-is-very-long-way-past-the-64-character-l"
            == session_name)
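Note: the two tests above pin down the session-name format: dag_run id, try number, and task_id joined by double underscores and truncated to 64 characters (the AWS role session name limit). A sketch consistent with those assertions; the real airflow_docker helper may differ.

def generate_role_session_name(context):
    # Reconstructed from the assertions above, not the actual
    # airflow_docker.ext.aws.role_assumption implementation.
    dag_run = context["dag_run"]
    ti = context["task_instance"]
    name = "{}__{}__{}".format(dag_run.id, ti.try_number, ti.task_id)
    return name[:64]  # AWS caps role session names at 64 characters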
Example #16
def create_context(task):
    dag = DAG(dag_id="dag")
    tzinfo = pendulum.timezone("Europe/Amsterdam")
    execution_date = timezone.datetime(2016, 1, 1, 1, 0, 0, tzinfo=tzinfo)
    dag_run = DagRun(
        dag_id=dag.dag_id,
        execution_date=execution_date,
        run_id=DagRun.generate_run_id(DagRunType.MANUAL, execution_date),
    )
    task_instance = TaskInstance(task=task)
    task_instance.dag_run = dag_run
    task_instance.dag_id = dag.dag_id
    task_instance.xcom_push = mock.Mock()
    return {
        "dag": dag,
        "run_id": dag_run.run_id,
        "task": task,
        "ti": task_instance,
        "task_instance": task_instance,
    }
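Note: a hedged sketch of how a test might consume the create_context helper above; DummyOperator is used purely as an illustration and assumes Airflow 2 import paths.

from airflow.operators.dummy import DummyOperator

task = DummyOperator(task_id="noop")
context = create_context(task)
# The helper generates a manual run_id and wires the task instance to dag_id "dag".
assert context["run_id"].startswith("manual__")
assert context["ti"].dag_id == "dag"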
Example #17
    def execute(self, context):

        biowardrobe_uid = context['dag_run'].conf['biowardrobe_uid'] \
            if 'biowardrobe_uid' in context['dag_run'].conf else None

        if not biowardrobe_uid:
            raise Exception('biowardrobe_id must be provided')

        run_id = context['dag_run'].conf['run_id'] \
            if 'run_id' in context['dag_run'].conf else 'trig__{}__{}'.format(biowardrobe_uid, uuid.uuid4())

        _logger.info('Successfully finished: {}'.format(biowardrobe_uid))

        mysql = MySqlHook(mysql_conn_id=biowardrobe_connection_id)
        with closing(mysql.get_conn()) as conn:
            with closing(conn.cursor()) as cursor:
                cursor.execute(
                    "update ems.labdata set libstatus=10, libstatustxt='downloaded' where uid=%s",
                    (biowardrobe_uid, ))
                conn.commit()
                data = get_biowardrobe_data(cursor=cursor,
                                            biowardrobe_uid=biowardrobe_uid)
                dag_id = os.path.basename(
                    os.path.splitext(data['workflow'])[0])

                payload = {
                    'biowardrobe_uid': biowardrobe_uid,
                    'run_id': run_id
                }

                _logger.info("Trigger basic analysis with: {}".format(payload))
                session = settings.Session()
                dr = DagRun(dag_id=dag_id,
                            run_id=run_id,
                            conf=payload,
                            execution_date=datetime.now(),
                            external_trigger=True)
                logging.info("Creating DagRun {}".format(dr))
                session.add(dr)
                session.commit()
                session.close()
Example #18
def trigger_dag(
        dag_id: str,
        run_id: Optional[str] = None,
        conf: Optional[Union[dict, str]] = None,
        execution_date: Optional[datetime] = None,
        replace_microseconds: bool = True,
) -> Optional[DagRun]:
    """Triggers execution of DAG specified by dag_id

    :param dag_id: DAG ID
    :param run_id: ID of the dag_run
    :param conf: configuration
    :param execution_date: date of execution
    :param replace_microseconds: whether microseconds should be zeroed
    :return: the first dag run triggered, even if more than one was triggered, or None
    """
    dag_model = DagModel.get_current(dag_id)
    if dag_model is None:
        raise DagNotFound("Dag id {} not found in DagModel".format(dag_id))

    def read_store_serialized_dags():
        from airflow.configuration import conf
        return conf.getboolean('core', 'store_serialized_dags')
    dagbag = DagBag(
        dag_folder=dag_model.fileloc,
        store_serialized_dags=read_store_serialized_dags()
    )
    dag_run = DagRun()
    triggers = _trigger_dag(
        dag_id=dag_id,
        dag_run=dag_run,
        dag_bag=dagbag,
        run_id=run_id,
        conf=conf,
        execution_date=execution_date,
        replace_microseconds=replace_microseconds,
    )

    return triggers[0] if triggers else None
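Note: a hedged usage sketch for the trigger_dag wrapper above; the dag_id and conf are illustrative, and the call assumes the DAG is already registered in DagModel.

from airflow.utils import timezone

# Raises DagNotFound if "example_dag" is not in DagModel.
run = trigger_dag(
    dag_id="example_dag",
    run_id="manual__example",
    conf={"foo": "bar"},               # may also be passed as a JSON string
    execution_date=timezone.utcnow(),
    replace_microseconds=True,
)
if run:
    print(run.run_id, run.conf)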
Example #19
    def trigger_dag(self, dag_id, run_id, conf):
        try:
            dag_path = DagModel.get_current(dag_id).fileloc
        except Exception:
            dag_path = path.join(DAGS_FOLDER, dag_id + ".py")

        dag_bag = DagBag(dag_folder=dag_path)
        if not dag_bag.dags:
            logging.info("Failed to import dag due to the following errors")
            logging.info(dag_bag.import_errors)
            logging.info("Sleep for 3 seconds and give it a second try")
            sleep(3)
            dag_bag = DagBag(dag_folder=dag_path)

        triggers = trigger_dag._trigger_dag(dag_id=dag_id,
                                            dag_run=DagRun(),
                                            dag_bag=dag_bag,
                                            run_id=run_id,
                                            conf=conf,
                                            execution_date=None,
                                            replace_microseconds=False)
        return triggers[0] if triggers else None
Example #20
def generate_pod_yaml(args):
    """Generates yaml files for each task in the DAG. Used for testing output of KubernetesExecutor"""
    execution_date = args.execution_date
    dag = get_dag(subdir=args.subdir, dag_id=args.dag_id)
    yaml_output_path = args.output_path
    dr = DagRun(dag.dag_id, execution_date=execution_date)
    kube_config = KubeConfig()
    for task in dag.tasks:
        ti = TaskInstance(task, None)
        ti.dag_run = dr
        pod = PodGenerator.construct_pod(
            dag_id=args.dag_id,
            task_id=ti.task_id,
            pod_id=create_pod_id(args.dag_id, ti.task_id),
            try_number=ti.try_number,
            kube_image=kube_config.kube_image,
            date=ti.execution_date,
            args=ti.command_as_list(),
            pod_override_object=PodGenerator.from_obj(ti.executor_config),
            scheduler_job_id="worker-config",
            namespace=kube_config.executor_namespace,
            base_worker_pod=PodGenerator.deserialize_model_file(
                kube_config.pod_template_file),
        )
        pod_mutation_hook(pod)
        api_client = ApiClient()
        date_string = pod_generator.datetime_to_label_safe_datestring(
            execution_date)
        yaml_file_name = f"{args.dag_id}_{ti.task_id}_{date_string}.yml"
        os.makedirs(os.path.dirname(yaml_output_path +
                                    "/airflow_yaml_output/"),
                    exist_ok=True)
        with open(yaml_output_path + "/airflow_yaml_output/" + yaml_file_name,
                  "w") as output:
            sanitized_pod = api_client.sanitize_for_serialization(pod)
            output.write(yaml.dump(sanitized_pod))
    print(
        f"YAML output can be found at {yaml_output_path}/airflow_yaml_output/")
Example #21
def trigger_dag(
    dag_id,
    run_id=None,
    conf=None,
    execution_date=None,
    replace_microseconds=True,
):
    dag_model = DagModel.get_current(dag_id)
    if dag_model is None:
        raise DagNotFound("Dag id {} not found in DagModel".format(dag_id))
    dagbag = DagBag(dag_folder=dag_model.fileloc)
    dag_run = DagRun()
    triggers = _trigger_dag(
        dag_id=dag_id,
        dag_run=dag_run,
        dag_bag=dagbag,
        run_id=run_id,
        conf=conf,
        execution_date=execution_date,
        replace_microseconds=replace_microseconds,
    )

    return triggers[0] if triggers else None
Example #22
def trigger_dag(args):
    session = settings.Session()
    # TODO: verify dag_id
    execution_date = datetime.now()
    run_id = args.run_id or "manual__{0}".format(execution_date.isoformat())
    dr = session.query(DagRun).filter(DagRun.dag_id == args.dag_id,
                                      DagRun.run_id == run_id).first()

    conf = {}
    if args.conf:
        conf = json.loads(args.conf)
    if dr:
        logging.error("This run_id already exists")
    else:
        trigger = DagRun(dag_id=args.dag_id,
                         run_id=run_id,
                         execution_date=execution_date,
                         state=State.RUNNING,
                         conf=conf,
                         external_trigger=True)
        session.add(trigger)
        logging.info("Created {}".format(trigger))
    session.commit()
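Note: an illustrative call of the CLI-style trigger_dag(args) above; the Namespace only mimics the argparse attributes the function reads.

from argparse import Namespace

trigger_dag(Namespace(
    dag_id="example_dag",
    run_id=None,              # falls back to manual__<execution_date>
    conf='{"foo": "bar"}',    # parsed with json.loads
))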
Example #23
    def trigger_dag(self):
        """
        Triggers execution of DAG interpreted from the report's dag_id

        _trigger_dag iterates through the class registry and looks
        for any model that has dag_id as an attribute and deletes
        all references to the specific dag_id

        :param dag_id: DAG ID
        :param dagbag: dagbag
        :param dagrun: empty dag run to be created
        """
        dag_model = DagModel.get_current(self.dag_id)
        if dag_model is None:
            raise DagNotFound(f"Dag id {self.dag_id} not found in DagModel")

        dagbag = DagBag(
            dag_folder=dag_model.fileloc,
            store_serialized_dags=conf.getboolean("core",
                                                  "store_serialized_dags"),
        )
        dag_run = DagRun()
        self._trigger_dag(dag_id=self.dag_id, dag_bag=dagbag, dag_run=dag_run)
Example #24
def insert_dag_runs(
    session,
    dag_id="plugin_test_dag",
    dag_runs_count=1,
    task_instances_per_run=0,
    state="success",
    with_log=False,
):
    for i in range(dag_runs_count):
        execution_date = utcnow()

        dag_run = DagRun()
        dag_run.dag_id = dag_id
        dag_run.execution_date = execution_date
        dag_run._state = state
        if AIRFLOW_VERSION_2:
            dag_run.run_type = ""
        session.add(dag_run)

        if with_log:
            task_instance = FakeTaskInstance()
            task_instance.dag_id = dag_id
            task_instance.task_id = "task"
            task_instance.execution_date = execution_date
            task = FakeTask()
            task.owner = "Airflow"
            task_instance.task = task
            log = Log("success", task_instance)
            session.add(log)

        for j in range(task_instances_per_run):
            task = FakeTask(dag_id=dag_id, task_id="task{}".format(j))
            task_instance = TaskInstance(task, execution_date, state="success")
            session.add(task_instance)

    session.commit()
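Note: a minimal usage sketch for the insert_dag_runs test helper above, assuming the same create_session helper used in Example #4.

from airflow.utils.session import create_session

with create_session() as session:
    insert_dag_runs(session, dag_id="plugin_test_dag", dag_runs_count=3, with_log=True)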
Example #25
    def test_serialize(self, session):
        dagrun_model = DagRun(run_id='my-dag-run',
                              run_type=DagRunType.MANUAL.value,
                              execution_date=timezone.parse(self.default_time),
                              start_date=timezone.parse(self.default_time),
                              conf='{"start": "stop"}')
        session.add(dagrun_model)
        session.commit()
        dagrun_model = session.query(DagRun).first()
        deserialized_dagrun = dagrun_schema.dump(dagrun_model)

        self.assertEqual(
            deserialized_dagrun[0], {
                'dag_id': None,
                'dag_run_id': 'my-dag-run',
                'end_date': None,
                'state': 'running',
                'execution_date': self.default_time,
                'external_trigger': True,
                'start_date': self.default_time,
                'conf': {
                    "start": "stop"
                }
            })
Example #26
def dag_backfill(args, dag=None):
    """Creates backfill job or dry run for a DAG"""
    logging.basicConfig(level=settings.LOGGING_LEVEL,
                        format=settings.SIMPLE_LOG_FORMAT)

    signal.signal(signal.SIGTERM, sigint_handler)

    import warnings

    warnings.warn(
        '--ignore-first-depends-on-past is deprecated as the value is always set to True',
        category=PendingDeprecationWarning,
    )

    if args.ignore_first_depends_on_past is False:
        args.ignore_first_depends_on_past = True

    if not args.start_date and not args.end_date:
        raise AirflowException("Provide a start_date and/or end_date")

    dag = dag or get_dag(args.subdir, args.dag_id)

    # If only one date is passed, using same as start and end
    args.end_date = args.end_date or args.start_date
    args.start_date = args.start_date or args.end_date

    if args.task_regex:
        dag = dag.partial_subset(task_ids_or_regex=args.task_regex,
                                 include_upstream=not args.ignore_dependencies)
        if not dag.task_dict:
            raise AirflowException(
                f"There are no tasks that match '{args.task_regex}' regex. Nothing to run, exiting..."
            )

    run_conf = None
    if args.conf:
        run_conf = json.loads(args.conf)

    if args.dry_run:
        print(f"Dry run of DAG {args.dag_id} on {args.start_date}")
        dr = DagRun(dag.dag_id, execution_date=args.start_date)
        for task in dag.tasks:
            print(f"Task {task.task_id}")
            ti = TaskInstance(task, run_id=None)
            ti.dag_run = dr
            ti.dry_run()
    else:
        if args.reset_dagruns:
            DAG.clear_dags(
                [dag],
                start_date=args.start_date,
                end_date=args.end_date,
                confirm_prompt=not args.yes,
                include_subdags=True,
                dag_run_state=DagRunState.QUEUED,
            )

        try:
            dag.run(
                start_date=args.start_date,
                end_date=args.end_date,
                mark_success=args.mark_success,
                local=args.local,
                donot_pickle=(args.donot_pickle
                              or conf.getboolean('core', 'donot_pickle')),
                ignore_first_depends_on_past=args.ignore_first_depends_on_past,
                ignore_task_deps=args.ignore_dependencies,
                pool=args.pool,
                delay_on_limit_secs=args.delay_on_limit,
                verbose=args.verbose,
                conf=run_conf,
                rerun_failed_tasks=args.rerun_failed_tasks,
                run_backwards=args.run_backwards,
                continue_on_failures=args.continue_on_failures,
            )
        except ValueError as vr:
            print(str(vr))
            sys.exit(1)
Example #27
    def execute(self, context):

        started_at = datetime.utcnow()
        _keep_going = True
        while _keep_going:

            _force_run_data = self.get_force_run_data()
            _logger.info("Force run data: {}".format(_force_run_data))

            if not _force_run_data:
                if (datetime.utcnow() -
                        started_at).total_seconds() > self.timeout:
                    raise AirflowSkipException('Snap. Time is OUT.')
                sleep(self.poke_interval)
                continue

            for row in _force_run_data:
                _keep_going = False
                biowardrobe_uid = row['uid']
                #  TODO: Check if dag is running in airflow

                #  TODO: If not running!
                data = self.get_record_data(biowardrobe_uid)
                if not data:
                    _logger.error(
                        'No biowardrobe data {}'.format(biowardrobe_uid))
                    continue
                #
                #  Actual Force RUN
                basedir = data['output_folder']
                try:
                    os.chdir(basedir)

                    for root, dirs, files in os.walk(".", topdown=False):
                        for name in files:
                            if "fastq" in name:
                                continue
                            os.remove(os.path.join(root, name))
                    rmtree(os.path.join(basedir, 'tophat'), True)
                except:
                    pass

                if int(data['deleted']) == 0:
                    cmd = 'bunzip2 {}*.fastq.bz2'.format(biowardrobe_uid)
                    try:
                        check_output(cmd, shell=True)
                    except Exception as e:
                        _logger.error("Can't uncompress: {} {}".format(
                            cmd, str(e)))

                    if not os.path.isfile(biowardrobe_uid + '.fastq'):
                        _logger.error(
                            "File does not exist: {}".format(biowardrobe_uid))
                        continue
                    if not os.path.isfile(biowardrobe_uid +
                                          '_2.fastq') and data['pair']:
                        _logger.error("File 2 does not exist: {}".format(
                            biowardrobe_uid))
                        continue
                else:
                    rmtree(basedir, True)

                mysql = MySqlHook(mysql_conn_id=biowardrobe_connection_id)
                with closing(mysql.get_conn()) as conn:
                    with closing(conn.cursor()) as cursor:
                        self.drop_sql(cursor, data)
                        if int(data['deleted']) == 0:
                            cursor.execute(
                                "update labdata set libstatustxt=%s, libstatus=10, forcerun=0, tagstotal=0,"
                                "tagsmapped=0,tagsribo=0,tagsused=0,tagssuppressed=0 where uid=%s",
                                ("Ready to be reanalyzed", biowardrobe_uid))
                            conn.commit()
                        else:
                            cursor.execute(
                                "update labdata set libstatustxt=%s,deleted=2,datedel=CURDATE() where uid=%s",
                                ("Deleted", biowardrobe_uid))
                            conn.commit()
                            _logger.info("Deleted: {}".format(biowardrobe_uid))
                            continue

                _dag_id = os.path.basename(
                    os.path.splitext(data['workflow'])[0])
                _run_id = 'forcerun__{}__{}'.format(biowardrobe_uid,
                                                    uuid.uuid4())
                session = settings.Session()
                dr = DagRun(dag_id=_dag_id,
                            run_id=_run_id,
                            conf={
                                'biowardrobe_uid': biowardrobe_uid,
                                'run_id': _run_id
                            },
                            execution_date=datetime.now(),
                            start_date=datetime.now(),
                            external_trigger=True)
                logging.info("Creating DagRun {}".format(dr))
                session.add(dr)
                session.commit()
                session.close()
Example #28
def test_lineage_backend_capture_executions(mock_emit, inlets, outlets):
    DEFAULT_DATE = datetime.datetime(2020, 5, 17)
    mock_emitter = Mock()
    mock_emit.return_value = mock_emitter
    # Using autospec on xcom_pull and xcom_push methods fails on Python 3.6.
    with mock.patch.dict(
            os.environ,
        {
            "AIRFLOW__LINEAGE__BACKEND":
            "datahub_provider.lineage.datahub.DatahubLineageBackend",
            "AIRFLOW__LINEAGE__DATAHUB_CONN_ID":
            datahub_rest_connection_config.conn_id,
            "AIRFLOW__LINEAGE__DATAHUB_KWARGS":
            json.dumps({
                "graceful_exceptions": False,
                "capture_executions": True
            }),
        },
    ), mock.patch("airflow.models.BaseOperator.xcom_pull"), mock.patch(
            "airflow.models.BaseOperator.xcom_push"), patch_airflow_connection(
                datahub_rest_connection_config):
        func = mock.Mock()
        func.__name__ = "foo"

        dag = DAG(dag_id="test_lineage_is_sent_to_backend",
                  start_date=DEFAULT_DATE)

        with dag:
            op1 = DummyOperator(
                task_id="task1_upstream",
                inlets=inlets,
                outlets=outlets,
            )
            op2 = DummyOperator(
                task_id="task2",
                inlets=inlets,
                outlets=outlets,
            )
            op1 >> op2

        # Airflow < 2.2 requires the execution_date parameter. Newer Airflow
        # versions do not require it, but will attempt to find the associated
        # run_id in the database if execution_date is provided. As such, we
        # must fake the run_id parameter for newer Airflow versions.
        if AIRFLOW_VERSION < packaging.version.parse("2.2.0"):
            ti = TaskInstance(task=op2, execution_date=DEFAULT_DATE)
            # Ignoring type here because DagRun state is just a string in Airflow 1
            dag_run = DagRun(
                state="success",
                run_id=f"scheduled_{DEFAULT_DATE}")  # type: ignore
            ti.dag_run = dag_run
            ti.start_date = datetime.datetime.utcnow()
            ti.execution_date = DEFAULT_DATE

        else:
            from airflow.utils.state import DagRunState

            ti = TaskInstance(task=op2, run_id=f"test_airflow-{DEFAULT_DATE}")
            dag_run = DagRun(state=DagRunState.SUCCESS,
                             run_id=f"scheduled_{DEFAULT_DATE}")
            ti.dag_run = dag_run
            ti.start_date = datetime.datetime.utcnow()
            ti.execution_date = DEFAULT_DATE

        ctx1 = {
            "dag": dag,
            "task": op2,
            "ti": ti,
            "dag_run": dag_run,
            "task_instance": ti,
            "execution_date": DEFAULT_DATE,
            "ts": "2021-04-08T00:54:25.771575+00:00",
        }

        prep = prepare_lineage(func)
        prep(op2, ctx1)
        post = apply_lineage(func)
        post(op2, ctx1)

        # Verify that the inlets and outlets are registered and recognized by Airflow correctly,
        # or that our lineage backend forces it to.
        assert len(op2.inlets) == 1
        assert len(op2.outlets) == 1
        assert all(map(lambda let: isinstance(let, Dataset), op2.inlets))
        assert all(map(lambda let: isinstance(let, Dataset), op2.outlets))

        # Check that the right things were emitted.
        assert mock_emitter.emit.call_count == 17
        # Running further checks based on python version because args only exists in python 3.7+
        if sys.version_info[:3] > (3, 7):
            assert mock_emitter.method_calls[0].args[
                0].aspectName == "dataFlowInfo"
            assert (
                mock_emitter.method_calls[0].args[0].entityUrn ==
                "urn:li:dataFlow:(airflow,test_lineage_is_sent_to_backend,prod)"
            )

            assert mock_emitter.method_calls[1].args[
                0].aspectName == "ownership"
            assert (
                mock_emitter.method_calls[1].args[0].entityUrn ==
                "urn:li:dataFlow:(airflow,test_lineage_is_sent_to_backend,prod)"
            )

            assert mock_emitter.method_calls[2].args[
                0].aspectName == "globalTags"
            assert (
                mock_emitter.method_calls[2].args[0].entityUrn ==
                "urn:li:dataFlow:(airflow,test_lineage_is_sent_to_backend,prod)"
            )

            assert mock_emitter.method_calls[3].args[
                0].aspectName == "dataJobInfo"
            assert (
                mock_emitter.method_calls[3].args[0].entityUrn ==
                "urn:li:dataJob:(urn:li:dataFlow:(airflow,test_lineage_is_sent_to_backend,prod),task2)"
            )

            assert (mock_emitter.method_calls[4].args[0].aspectName ==
                    "dataJobInputOutput")
            assert (
                mock_emitter.method_calls[4].args[0].entityUrn ==
                "urn:li:dataJob:(urn:li:dataFlow:(airflow,test_lineage_is_sent_to_backend,prod),task2)"
            )
            assert (
                mock_emitter.method_calls[4].args[0].aspect.inputDatajobs[0] ==
                "urn:li:dataJob:(urn:li:dataFlow:(airflow,test_lineage_is_sent_to_backend,prod),task1_upstream)"
            )
            assert (
                mock_emitter.method_calls[4].args[0].aspect.inputDatasets[0] ==
                "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableConsumed,PROD)"
            )
            assert (
                mock_emitter.method_calls[4].args[0].aspect.outputDatasets[0]
                ==
                "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableProduced,PROD)"
            )

            assert mock_emitter.method_calls[5].args[0].aspectName == "status"
            assert (
                mock_emitter.method_calls[5].args[0].entityUrn ==
                "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableConsumed,PROD)"
            )

            assert mock_emitter.method_calls[6].args[0].aspectName == "status"
            assert (
                mock_emitter.method_calls[6].args[0].entityUrn ==
                "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableProduced,PROD)"
            )

            assert mock_emitter.method_calls[7].args[
                0].aspectName == "ownership"
            assert (
                mock_emitter.method_calls[7].args[0].entityUrn ==
                "urn:li:dataJob:(urn:li:dataFlow:(airflow,test_lineage_is_sent_to_backend,prod),task2)"
            )

            assert mock_emitter.method_calls[8].args[
                0].aspectName == "globalTags"
            assert (
                mock_emitter.method_calls[8].args[0].entityUrn ==
                "urn:li:dataJob:(urn:li:dataFlow:(airflow,test_lineage_is_sent_to_backend,prod),task2)"
            )

            assert (mock_emitter.method_calls[9].args[0].aspectName ==
                    "dataProcessInstanceProperties")
            assert (
                mock_emitter.method_calls[9].args[0].entityUrn ==
                "urn:li:dataProcessInstance:b6375e5f5faeb543cfb5d7d8a47661fb")

            assert (mock_emitter.method_calls[10].args[0].aspectName ==
                    "dataProcessInstanceRelationships")
            assert (
                mock_emitter.method_calls[10].args[0].entityUrn ==
                "urn:li:dataProcessInstance:b6375e5f5faeb543cfb5d7d8a47661fb")
            assert (mock_emitter.method_calls[11].args[0].aspectName ==
                    "dataProcessInstanceInput")
            assert (
                mock_emitter.method_calls[11].args[0].entityUrn ==
                "urn:li:dataProcessInstance:b6375e5f5faeb543cfb5d7d8a47661fb")
            assert (mock_emitter.method_calls[12].args[0].aspectName ==
                    "dataProcessInstanceOutput")
            assert (
                mock_emitter.method_calls[12].args[0].entityUrn ==
                "urn:li:dataProcessInstance:b6375e5f5faeb543cfb5d7d8a47661fb")
            assert mock_emitter.method_calls[13].args[0].aspectName == "status"
            assert (
                mock_emitter.method_calls[13].args[0].entityUrn ==
                "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableConsumed,PROD)"
            )
            assert mock_emitter.method_calls[14].args[0].aspectName == "status"
            assert (
                mock_emitter.method_calls[14].args[0].entityUrn ==
                "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableProduced,PROD)"
            )
            assert (mock_emitter.method_calls[15].args[0].aspectName ==
                    "dataProcessInstanceRunEvent")
            assert (
                mock_emitter.method_calls[15].args[0].entityUrn ==
                "urn:li:dataProcessInstance:b6375e5f5faeb543cfb5d7d8a47661fb")
            assert (mock_emitter.method_calls[16].args[0].aspectName ==
                    "dataProcessInstanceRunEvent")
            assert (
                mock_emitter.method_calls[16].args[0].entityUrn ==
                "urn:li:dataProcessInstance:b6375e5f5faeb543cfb5d7d8a47661fb")
Example #29
def create_dagrun_from_dbnd_run(
    databand_run,
    dag,
    execution_date,
    run_id,
    state=State.RUNNING,
    external_trigger=False,
    conf=None,
    session=None,
):
    """
    Create new DagRun and all relevant TaskInstances
    """
    dagrun = (session.query(DagRun).filter(
        DagRun.dag_id == dag.dag_id,
        DagRun.execution_date == execution_date).first())
    if dagrun is None:
        dagrun = DagRun(
            run_id=run_id,
            execution_date=execution_date,
            start_date=dag.start_date,
            _state=state,
            external_trigger=external_trigger,
            dag_id=dag.dag_id,
            conf=conf,
        )
        session.add(dagrun)
    else:
        logger.warning("Running with existing airflow dag run %s", dagrun)

    dagrun.dag = dag
    dagrun.run_id = run_id
    session.commit()

    # create the associated task instances
    # state is None at the moment of creation

    # dagrun.verify_integrity(session=session)
    # fetches [TaskInstance] again
    # tasks_skipped = databand_run.tasks_skipped

    # we can find a source of the completion, but also,
    # sometimes we don't know the source of the "complete"
    TI = TaskInstance
    tis = (session.query(TI).filter(TI.dag_id == dag.dag_id,
                                    TI.execution_date == execution_date).all())
    tis = {ti.task_id: ti for ti in tis}

    for af_task in dag.tasks:
        ti = tis.get(af_task.task_id)
        if ti is None:
            ti = TaskInstance(af_task, execution_date=execution_date)
            ti.start_date = timezone.utcnow()
            ti.end_date = timezone.utcnow()
            session.add(ti)
        task_run = databand_run.get_task_run_by_af_id(af_task.task_id)
        # all tasks part of the backfill are scheduled to dagrun

        # Set log file path to expected airflow log file path
        task_run.log.local_log_file.path = ti.log_filepath.replace(
            ".log", "/{0}.log".format(ti.try_number))
        if task_run.is_reused:
            # this task is completed and we don't need to run it anymore
            ti.state = State.SUCCESS

    session.commit()

    return dagrun