def test_backfill_rerun_failed_tasks_without_flag(self):
        dag = DAG(dag_id='test_backfill_rerun_failed',
                  start_date=DEFAULT_DATE,
                  schedule_interval='@daily')

        with dag:
            DummyOperator(task_id='test_backfill_rerun_failed_task-1', dag=dag)

        dag.clear()

        executor = TestExecutor()

        job = BackfillJob(
            dag=dag,
            executor=executor,
            start_date=DEFAULT_DATE,
            end_date=DEFAULT_DATE + datetime.timedelta(days=2),
        )
        job.run()

        ti = TI(task=dag.get_task('test_backfill_rerun_failed_task-1'),
                execution_date=DEFAULT_DATE)
        ti.refresh_from_db()
        ti.set_state(State.FAILED)

        job = BackfillJob(dag=dag,
                          executor=executor,
                          start_date=DEFAULT_DATE,
                          end_date=DEFAULT_DATE + datetime.timedelta(days=2),
                          rerun_failed_tasks=False)

        with self.assertRaises(AirflowException):
            job.run()
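
The test above pairs with test_backfill_rerun_failed_tasks later in this collection: together they exercise both sides of the rerun_failed_tasks flag. A minimal, framework-free sketch of the behaviour being asserted (handle_previously_failed is illustrative, not BackfillJob internals):

def handle_previously_failed(ti_state, rerun_failed_tasks):
    # without the flag, a previously failed task instance aborts the backfill
    if ti_state == 'failed' and not rerun_failed_tasks:
        raise RuntimeError('backfill aborted: task instance previously failed')
    # with the flag, the failed instance is reset so it can run again
    return None if ti_state == 'failed' else ti_state

assert handle_previously_failed('failed', rerun_failed_tasks=True) is None
try:
    handle_previously_failed('failed', rerun_failed_tasks=False)
except RuntimeError:
    pass  # mirrors the AirflowException asserted above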
Example #2
    def test_cli_backfill_depends_on_past(self):
        """
        Test that CLI respects -I argument
        """
        dag_id = 'test_dagrun_states_deadlock'
        run_date = DEFAULT_DATE + datetime.timedelta(days=1)
        args = [
            'backfill',
            dag_id,
            '-l',
            '-s',
            run_date.isoformat(),
        ]
        dag = self.dagbag.get_dag(dag_id)
        dag.clear()

        self.assertRaisesRegexp(
            AirflowException,
            'BackfillJob is deadlocked',
            cli.backfill,
            self.parser.parse_args(args))

        cli.backfill(self.parser.parse_args(args + ['-I']))
        ti = TI(dag.get_task('test_depends_on_past'), run_date)
        ti.refresh_from_db()
        # task ran
        self.assertEqual(ti.state, State.SUCCESS)
        dag.clear()
Example #3
    def test_backfill_depends_on_past(self):
        """
        Test that backfill respects ignore_depends_on_past
        """
        dag = self.dagbag.get_dag('test_depends_on_past')
        dag.clear()
        run_date = DEFAULT_DATE + datetime.timedelta(days=5)

        # backfill should deadlock
        self.assertRaisesRegex(
            AirflowException,
            'BackfillJob is deadlocked',
            BackfillJob(dag=dag, start_date=run_date, end_date=run_date).run)

        BackfillJob(
            dag=dag,
            start_date=run_date,
            end_date=run_date,
            executor=MockExecutor(),
            ignore_first_depends_on_past=True).run()

        # ti should have succeeded
        ti = TI(dag.tasks[0], run_date)
        ti.refresh_from_db()
        self.assertEqual(ti.state, State.SUCCESS)
Example #4
    def test_scheduler_pooled_tasks(self):
        """
        Test that the scheduler handles queued tasks correctly
        See issue #1299
        """
        session = settings.Session()
        if not (
                session.query(Pool)
                .filter(Pool.pool == 'test_queued_pool')
                .first()):
            pool = Pool(pool='test_queued_pool', slots=5)
            session.merge(pool)
            session.commit()
        session.close()

        dag_id = 'test_scheduled_queued_tasks'
        dag = self.dagbag.get_dag(dag_id)
        dag.clear()

        scheduler = SchedulerJob(dag_id, num_runs=10)
        scheduler.run()

        task_1 = dag.tasks[0]
        ti = TI(task_1, dag.start_date)
        ti.refresh_from_db()
        self.assertEqual(ti.state, State.FAILED)

        dag.clear()
Example #5
    def test_clear_task_instances_without_task(self):
        dag = DAG('test_clear_task_instances_without_task', start_date=DEFAULT_DATE,
                  end_date=DEFAULT_DATE + datetime.timedelta(days=10))
        task0 = DummyOperator(task_id='task0', owner='test', dag=dag)
        task1 = DummyOperator(task_id='task1', owner='test', dag=dag, retries=2)
        ti0 = TI(task=task0, execution_date=DEFAULT_DATE)
        ti1 = TI(task=task1, execution_date=DEFAULT_DATE)
        ti0.run()
        ti1.run()

        # Remove the task from dag.
        dag.task_dict = {}
        self.assertFalse(dag.has_task(task0.task_id))
        self.assertFalse(dag.has_task(task1.task_id))

        session = settings.Session()
        qry = session.query(TI).filter(
            TI.dag_id == dag.dag_id).all()
        clear_task_instances(qry, session)
        session.commit()
        # When dag is None, max_tries will be the maximum of the original max_tries and the current try_number.
        ti0.refresh_from_db()
        ti1.refresh_from_db()
        # Next try to run will be try 2
        self.assertEqual(ti0.try_number, 2)
        self.assertEqual(ti0.max_tries, 1)
        self.assertEqual(ti1.try_number, 2)
        self.assertEqual(ti1.max_tries, 2)
Example #6
    def test_clear_task_instances_without_dag(self):
        dag = DAG('test_clear_task_instances_without_dag',
                  start_date=DEFAULT_DATE,
                  end_date=DEFAULT_DATE + datetime.timedelta(days=10))
        task0 = DummyOperator(task_id='task_0', owner='test', dag=dag)
        task1 = DummyOperator(task_id='task_1',
                              owner='test',
                              dag=dag,
                              retries=2)
        ti0 = TI(task=task0, execution_date=DEFAULT_DATE)
        ti1 = TI(task=task1, execution_date=DEFAULT_DATE)
        ti0.run()
        ti1.run()

        with create_session() as session:
            qry = session.query(TI).filter(TI.dag_id == dag.dag_id).all()
            clear_task_instances(qry, session)

        # When dag is None, max_tries will be the maximum of the original max_tries and the current try_number.
        ti0.refresh_from_db()
        ti1.refresh_from_db()
        # Next try to run will be try 2
        self.assertEqual(ti0.try_number, 2)
        self.assertEqual(ti0.max_tries, 1)
        self.assertEqual(ti1.try_number, 2)
        self.assertEqual(ti1.max_tries, 2)
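
Both clear_task_instances variants above assert the rule spelled out in the comment: when the task is no longer known to the DAG, max_tries is bumped to cover the tries already used, while a known task gets its full retries budget back. A self-contained sketch of that arithmetic (adjusted_max_tries is a made-up name for illustration):

def adjusted_max_tries(original_max_tries, tries_used, retries=None):
    if retries is not None:
        # task still known to the DAG: allow `retries` more attempts
        return tries_used + retries
    # task removed: max_tries becomes max(original max_tries, try_number)
    return max(original_max_tries, tries_used)

# matches the assertions above: task0 (retries=0) goes from max_tries 0 to 1,
# task1 (retries=2) keeps max_tries 2 after one used try
assert adjusted_max_tries(0, 1) == 1
assert adjusted_max_tries(2, 1) == 2
# and the dag-aware variant in test_clear_task_instances below: 1 + 2 == 3
assert adjusted_max_tries(2, 1, retries=2) == 3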
Example #7
    def test_clear_task_instances(self):
        dag = DAG(
            'test_clear_task_instances',
            start_date=DEFAULT_DATE,
            end_date=DEFAULT_DATE + datetime.timedelta(days=10),
        )
        task0 = DummyOperator(task_id='0', owner='test', dag=dag)
        task1 = DummyOperator(task_id='1', owner='test', dag=dag, retries=2)
        ti0 = TI(task=task0, execution_date=DEFAULT_DATE)
        ti1 = TI(task=task1, execution_date=DEFAULT_DATE)

        dag.create_dagrun(
            execution_date=ti0.execution_date,
            state=State.RUNNING,
            run_type=DagRunType.SCHEDULED,
        )

        ti0.run()
        ti1.run()
        with create_session() as session:
            qry = session.query(TI).filter(TI.dag_id == dag.dag_id).all()
            clear_task_instances(qry, session, dag=dag)

        ti0.refresh_from_db()
        ti1.refresh_from_db()
        # Next try to run will be try 2
        assert ti0.try_number == 2
        assert ti0.max_tries == 1
        assert ti1.try_number == 2
        assert ti1.max_tries == 3
Example #8
    def test_log_file_template_with_run_task(self):
        """Verify that the taskinstance has the right context for log_filename_template"""

        with mock.patch.object(task_command, "_run_task_by_selected_method"):
            with conf_vars({('core', 'dags_folder'): self.dag_path}):
                # increment the try_number of the task to be run
                dag = DagBag().get_dag(self.dag_id)
                task = dag.get_task(self.task_id)
                with create_session() as session:
                    dag.create_dagrun(
                        execution_date=self.execution_date,
                        start_date=timezone.utcnow(),
                        state=State.RUNNING,
                        run_type=DagRunType.MANUAL,
                        session=session,
                    )
                    ti = TaskInstance(task, self.execution_date)
                    ti.refresh_from_db(session=session, lock_for_update=True)
                    ti.try_number = 1  # not running, so starts at 0
                    session.merge(ti)

                log_file_path = os.path.join(
                    os.path.dirname(self.ti_log_file_path), "2.log")

                try:
                    task_command.task_run(
                        self.parser.parse_args(self.task_args))

                    assert os.path.exists(log_file_path)
                finally:
                    try:
                        os.remove(log_file_path)
                    except OSError:
                        pass
Example #9
    def test_backfill_rerun_failed_tasks(self):
        dag = DAG(dag_id='test_backfill_rerun_failed',
                  start_date=DEFAULT_DATE,
                  schedule_interval='@daily')

        with dag:
            DummyOperator(task_id='test_backfill_rerun_failed_task-1', dag=dag)

        dag.clear()

        executor = TestExecutor()

        job = BackfillJob(
            dag=dag,
            executor=executor,
            start_date=DEFAULT_DATE,
            end_date=DEFAULT_DATE + datetime.timedelta(days=2),
        )
        job.run()

        ti = TI(task=dag.get_task('test_backfill_rerun_failed_task-1'),
                execution_date=DEFAULT_DATE)
        ti.refresh_from_db()
        ti.set_state(State.FAILED)

        job = BackfillJob(dag=dag,
                          executor=executor,
                          start_date=DEFAULT_DATE,
                          end_date=DEFAULT_DATE + datetime.timedelta(days=2),
                          rerun_failed_tasks=True)
        job.run()
        ti = TI(task=dag.get_task('test_backfill_rerun_failed_task-1'),
                execution_date=DEFAULT_DATE)
        ti.refresh_from_db()
        self.assertEqual(ti.state, State.SUCCESS)
Example #10
    def test_cli_backfill_depends_on_past(self):
        dag_id = 'test_dagrun_states_deadlock'
        run_date = DEFAULT_DATE + datetime.timedelta(days=1)
        args = [
            'backfill',
            dag_id,
            '-l',
            '-sd',
            TEST_DAGS_FOLDER,
            '-s',
            run_date.isoformat(),
        ]
        dag = get_dag(dag_id, TEST_DAGS_FOLDER)

        cli.backfill(self.parser.parse_args(args))
        ti = TI(dag.get_task('test_depends_on_past'), run_date)
        ti.refresh_from_db()
        # task did not run
        self.assertEqual(ti.state, State.NONE)

        cli.backfill(self.parser.parse_args(args + ['-I']))
        ti = TI(dag.get_task('test_depends_on_past'), run_date)
        ti.refresh_from_db()
        # task ran
        self.assertEqual(ti.state, State.SUCCESS)
Example #11
    def test_backfill_pooled_tasks(self):
        """
        Test that queued tasks are executed by BackfillJob
        """
        session = settings.Session()
        pool = Pool(pool='test_backfill_pooled_task_pool', slots=1)
        session.add(pool)
        session.commit()
        session.close()

        dag = self.dagbag.get_dag('test_backfill_pooled_task_dag')
        dag.clear()

        executor = TestExecutor(do_update=True)
        job = BackfillJob(
            dag=dag,
            start_date=DEFAULT_DATE,
            end_date=DEFAULT_DATE,
            executor=executor)

        # run with timeout because this creates an infinite loop if not
        # caught
        try:
            with timeout(seconds=5):
                job.run()
        except AirflowTaskTimeout:
            pass
        ti = TI(
            task=dag.get_task('test_backfill_pooled_task'),
            execution_date=DEFAULT_DATE)
        ti.refresh_from_db()
        self.assertEqual(ti.state, State.SUCCESS)
Example #12
    def test_backfill_pooled_tasks(self):
        """
        Test that queued tasks are executed by BackfillJob

        Test for https://github.com/airbnb/airflow/pull/1225
        """
        session = settings.Session()
        pool = Pool(pool='test_backfill_pooled_task_pool', slots=1)
        session.add(pool)
        session.commit()

        dag = self.dagbag.get_dag('test_backfill_pooled_task_dag')
        dag.clear()

        job = BackfillJob(
            dag=dag,
            start_date=DEFAULT_DATE,
            end_date=DEFAULT_DATE)

        # run with timeout because this creates an infinite loop if not
        # caught
        with timeout(seconds=30):
            job.run()

        ti = TI(
            task=dag.get_task('test_backfill_pooled_task'),
            execution_date=DEFAULT_DATE)
        ti.refresh_from_db()
        self.assertEqual(ti.state, State.SUCCESS)
Example #13
    def test_clear_task_instances_without_task(self):
        dag = DAG('test_clear_task_instances_without_task', start_date=DEFAULT_DATE,
                  end_date=DEFAULT_DATE + datetime.timedelta(days=10))
        task0 = DummyOperator(task_id='task0', owner='test', dag=dag)
        task1 = DummyOperator(task_id='task1', owner='test', dag=dag, retries=2)
        ti0 = TI(task=task0, execution_date=DEFAULT_DATE)
        ti1 = TI(task=task1, execution_date=DEFAULT_DATE)
        ti0.run()
        ti1.run()

        # Remove the task from dag.
        dag.task_dict = {}
        self.assertFalse(dag.has_task(task0.task_id))
        self.assertFalse(dag.has_task(task1.task_id))

        session = settings.Session()
        qry = session.query(TI).filter(
            TI.dag_id == dag.dag_id).all()
        clear_task_instances(qry, session)
        session.commit()
        # When dag is None, max_tries will be the maximum of the original max_tries and the current try_number.
        ti0.refresh_from_db()
        ti1.refresh_from_db()
        # Next try to run will be try 2
        self.assertEqual(ti0.try_number, 2)
        self.assertEqual(ti0.max_tries, 1)
        self.assertEqual(ti1.try_number, 2)
        self.assertEqual(ti1.max_tries, 2)
Example #14
    def test_backfill_pooled_tasks(self):
        """
        Test that queued tasks are executed by BackfillJob

        Test for https://github.com/airbnb/airflow/pull/1225
        """
        session = settings.Session()
        pool = Pool(pool='test_backfill_pooled_task_pool', slots=1)
        session.add(pool)
        session.commit()

        dag = self.dagbag.get_dag('test_backfill_pooled_task_dag')
        dag.clear()

        job = BackfillJob(dag=dag,
                          start_date=DEFAULT_DATE,
                          end_date=DEFAULT_DATE)

        # run with timeout because this creates an infinite loop if not
        # caught
        with timeout(seconds=30):
            job.run()

        ti = TI(task=dag.get_task('test_backfill_pooled_task'),
                execution_date=DEFAULT_DATE)
        ti.refresh_from_db()
        self.assertEqual(ti.state, State.SUCCESS)
Example #15
    def test_cli_backfill_depends_on_past(self):
        """
        Test that CLI respects -I argument
        """
        dag_id = 'test_dagrun_states_deadlock'
        run_date = DEFAULT_DATE + datetime.timedelta(days=1)
        args = [
            'backfill',
            dag_id,
            '-l',
            '-s',
            run_date.isoformat(),
        ]
        dag = self.dagbag.get_dag(dag_id)
        dag.clear()

        self.assertRaisesRegexp(AirflowException, 'BackfillJob is deadlocked',
                                cli.backfill, self.parser.parse_args(args))

        cli.backfill(self.parser.parse_args(args + ['-I']))
        ti = TI(dag.get_task('test_depends_on_past'), run_date)
        ti.refresh_from_db()
        # task ran
        self.assertEqual(ti.state, State.SUCCESS)
        dag.clear()
Example #16
    def test_depends_on_past(self):
        dag = DAG(dag_id='test_depends_on_past', start_date=DEFAULT_DATE)

        task = DummyOperator(
            task_id='test_dop_task',
            dag=dag,
            depends_on_past=True,
        )
        dag.clear()

        run_date = task.start_date + datetime.timedelta(days=5)

        ti = TI(task, run_date)

        # depends_on_past prevents the run
        task.run(start_date=run_date, end_date=run_date)
        ti.refresh_from_db()
        self.assertIs(ti.state, None)

        # ignore first depends_on_past to allow the run
        task.run(start_date=run_date,
                 end_date=run_date,
                 ignore_first_depends_on_past=True)
        ti.refresh_from_db()
        self.assertEqual(ti.state, State.SUCCESS)
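
The rule this test (and the other depends_on_past tests in this collection) relies on can be modelled without Airflow. A sketch, assuming the simplified semantics that the previous schedule must have succeeded unless the check is waived for the first run in the window:

def can_run(prev_state, depends_on_past, is_first, ignore_first):
    if not depends_on_past:
        return True
    if is_first and ignore_first:
        return True
    return prev_state == 'success'

# no prior run: blocked by default, allowed once the first check is waived
assert not can_run(None, depends_on_past=True, is_first=True, ignore_first=False)
assert can_run(None, depends_on_past=True, is_first=True, ignore_first=True)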
Example #17
    def test_scheduler_pooled_tasks(self):
        """
        Test that the scheduler handles queued tasks correctly
        See issue #1299
        """
        session = settings.Session()
        if not (session.query(Pool).filter(
                Pool.pool == 'test_queued_pool').first()):
            pool = Pool(pool='test_queued_pool', slots=5)
            session.merge(pool)
            session.commit()
        session.close()

        dag_id = 'test_scheduled_queued_tasks'
        dag = self.dagbag.get_dag(dag_id)
        dag.clear()

        scheduler = SchedulerJob(dag_id, num_runs=10)
        scheduler.run()

        task_1 = dag.tasks[0]
        ti = TI(task_1, dag.start_date)
        ti.refresh_from_db()
        self.assertEqual(ti.state, State.FAILED)

        dag.clear()
Example #18
    def test_success_callbak_no_race_condition(self):
        class CallbackWrapper(object):
            def wrap_task_instance(self, ti):
                self.task_id = ti.task_id
                self.dag_id = ti.dag_id
                self.execution_date = ti.execution_date
                self.task_state_in_callback = ""
                self.callback_ran = False

            def success_handler(self, context):
                self.callback_ran = True
                session = settings.Session()
                temp_instance = session.query(TI).filter(
                    TI.task_id == self.task_id).filter(
                    TI.dag_id == self.dag_id).filter(
                    TI.execution_date == self.execution_date).one()
                self.task_state_in_callback = temp_instance.state
        cw = CallbackWrapper()
        dag = DAG('test_success_callbak_no_race_condition', start_date=DEFAULT_DATE,
                  end_date=DEFAULT_DATE + datetime.timedelta(days=10))
        task = DummyOperator(task_id='op', email='*****@*****.**',
                             on_success_callback=cw.success_handler, dag=dag)
        ti = TI(task=task, execution_date=datetime.datetime.now())
        ti.state = State.RUNNING
        session = settings.Session()
        session.merge(ti)
        session.commit()
        cw.wrap_task_instance(ti)
        ti._run_raw_task()
        self.assertTrue(cw.callback_ran)
        self.assertEqual(cw.task_state_in_callback, State.RUNNING)
        ti.refresh_from_db()
        self.assertEqual(ti.state, State.SUCCESS)
Example #19
    def test_success_callbak_no_race_condition(self):
        class CallbackWrapper:
            def wrap_task_instance(self, ti):
                self.task_id = ti.task_id
                self.dag_id = ti.dag_id
                self.execution_date = ti.execution_date
                self.task_state_in_callback = ""
                self.callback_ran = False

            def success_handler(self, context):
                self.callback_ran = True
                session = settings.Session()
                temp_instance = session.query(TI).filter(
                    TI.task_id == self.task_id).filter(
                    TI.dag_id == self.dag_id).filter(
                    TI.execution_date == self.execution_date).one()
                self.task_state_in_callback = temp_instance.state

        cw = CallbackWrapper()
        dag = DAG('test_success_callbak_no_race_condition', start_date=DEFAULT_DATE,
                  end_date=DEFAULT_DATE + datetime.timedelta(days=10))
        task = DummyOperator(task_id='op', email='*****@*****.**',
                             on_success_callback=cw.success_handler, dag=dag)
        ti = TI(task=task, execution_date=datetime.datetime.now())
        ti.state = State.RUNNING
        session = settings.Session()
        session.merge(ti)
        session.commit()
        cw.wrap_task_instance(ti)
        ti._run_raw_task()
        self.assertTrue(cw.callback_ran)
        self.assertEqual(cw.task_state_in_callback, State.RUNNING)
        ti.refresh_from_db()
        self.assertEqual(ti.state, State.SUCCESS)
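
Both copies of this test pin down an ordering guarantee: on_success_callback fires while the task instance is still RUNNING, and only afterwards does the state flip to SUCCESS. A toy model of that ordering (ToyTI is hypothetical, not Airflow code):

class ToyTI:
    def __init__(self, on_success):
        self.state = 'running'
        self._on_success = on_success

    def _run_raw_task(self):
        # the callback runs first, while state is still 'running'...
        self._on_success(self)
        # ...and only then does the state transition happen
        self.state = 'success'

observed = []
ti = ToyTI(on_success=lambda t: observed.append(t.state))
ti._run_raw_task()
assert observed == ['running'] and ti.state == 'success'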
Example #20
    def test_retry_handling(self, mock_pool_full):
        """
        Test that task retries are handled properly
        """
        # Mock the pool to report open slots, since the pool doesn't actually exist
        mock_pool_full.return_value = False

        dag = models.DAG(dag_id='test_retry_handling')
        task = BashOperator(
            task_id='test_retry_handling_op',
            bash_command='exit 1',
            retries=1,
            retry_delay=datetime.timedelta(seconds=0),
            dag=dag,
            owner='airflow',
            start_date=timezone.datetime(2016, 2, 1, 0, 0, 0))

        def run_with_error(ti):
            try:
                ti.run()
            except AirflowException:
                pass

        ti = TI(
            task=task, execution_date=timezone.utcnow())
        self.assertEqual(ti.try_number, 1)

        # first run -- up for retry
        run_with_error(ti)
        self.assertEqual(ti.state, State.UP_FOR_RETRY)
        self.assertEqual(ti._try_number, 1)
        self.assertEqual(ti.try_number, 2)

        # second run -- fail
        run_with_error(ti)
        self.assertEqual(ti.state, State.FAILED)
        self.assertEqual(ti._try_number, 2)
        self.assertEqual(ti.try_number, 3)

        # Clear the TI state since you can't run a task with a FAILED state without
        # clearing it first
        dag.clear()

        # third run -- up for retry
        run_with_error(ti)
        self.assertEqual(ti.state, State.UP_FOR_RETRY)
        self.assertEqual(ti._try_number, 3)
        self.assertEqual(ti.try_number, 4)

        # fourth run -- fail
        run_with_error(ti)
        ti.refresh_from_db()
        self.assertEqual(ti.state, State.FAILED)
        self.assertEqual(ti._try_number, 4)
        self.assertEqual(ti.try_number, 5)
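
The paired assertions on _try_number and try_number rest on one detail of the TaskInstance API: when the task is not running, the public try_number reports the next attempt, i.e. the stored counter plus one. A minimal sketch of that property (TinyTI is illustrative only):

class TinyTI:
    def __init__(self):
        self._try_number = 0
        self.state = None

    @property
    def try_number(self):
        # while running, report the current attempt; otherwise the next one
        if self.state == 'running':
            return self._try_number
        return self._try_number + 1

ti = TinyTI()
assert (ti._try_number, ti.try_number) == (0, 1)
ti._try_number += 1  # a finished attempt bumps the stored counter
assert (ti._try_number, ti.try_number) == (1, 2)  # as asserted above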
Example #21
    def test_retry_handling(self, mock_pool_full):
        """
        Test that task retries are handled properly
        """
        # Mock the pool to report open slots, since the pool doesn't actually exist
        mock_pool_full.return_value = False

        dag = models.DAG(dag_id='test_retry_handling')
        task = BashOperator(
            task_id='test_retry_handling_op',
            bash_command='exit 1',
            retries=1,
            retry_delay=datetime.timedelta(seconds=0),
            dag=dag,
            owner='airflow',
            start_date=timezone.datetime(2016, 2, 1, 0, 0, 0))

        def run_with_error(ti):
            try:
                ti.run()
            except AirflowException:
                pass

        ti = TI(
            task=task, execution_date=timezone.utcnow())
        self.assertEqual(ti.try_number, 1)

        # first run -- up for retry
        run_with_error(ti)
        self.assertEqual(ti.state, State.UP_FOR_RETRY)
        self.assertEqual(ti._try_number, 1)
        self.assertEqual(ti.try_number, 2)

        # second run -- fail
        run_with_error(ti)
        self.assertEqual(ti.state, State.FAILED)
        self.assertEqual(ti._try_number, 2)
        self.assertEqual(ti.try_number, 3)

        # Clear the TI state since you can't run a task with a FAILED state without
        # clearing it first
        dag.clear()

        # third run -- up for retry
        run_with_error(ti)
        self.assertEqual(ti.state, State.UP_FOR_RETRY)
        self.assertEqual(ti._try_number, 3)
        self.assertEqual(ti.try_number, 4)

        # fourth run -- fail
        run_with_error(ti)
        ti.refresh_from_db()
        self.assertEqual(ti.state, State.FAILED)
        self.assertEqual(ti._try_number, 4)
        self.assertEqual(ti.try_number, 5)
Example #22
    def evaluate_dagrun(
            self,
            dag_id,
            first_task_state,
            second_task_state,
            dagrun_state,
            run_kwargs=None,
            advance_execution_date=False,
            session=None):
        """
        Helper for testing DagRun states with simple two-task DAGs
        """
        if run_kwargs is None:
            run_kwargs = {}

        scheduler = SchedulerJob()
        dag = get_dag(dag_id, TEST_DAGS_FOLDER)
        dr = scheduler.schedule_dag(dag)
        if advance_execution_date:
            # run a second time to schedule a dagrun after the start_date
            dr = scheduler.schedule_dag(dag)
        ex_date = dr.execution_date

        # if 'test_dagrun_states_deadlock' in dag_id and run_kwargs:
        #     import ipdb; ipdb.set_trace()

        try:
            dag.run(start_date=ex_date, end_date=ex_date, **run_kwargs)
        except AirflowException:
            pass

        # test tasks
        task_1, task_2 = dag.tasks
        ti = TI(task_1, ex_date)
        ti.refresh_from_db()
        self.assertEqual(ti.state, first_task_state)
        ti = TI(task_2, ex_date)
        ti.refresh_from_db()
        self.assertEqual(ti.state, second_task_state)

        # load dagrun
        dr = session.query(DagRun).filter(
            DagRun.dag_id == dag.dag_id,
            DagRun.execution_date == ex_date
        ).first()

        # dagrun is running
        self.assertEqual(dr.state, State.RUNNING)

        # import ipdb; ipdb.set_trace()

        dag.get_active_runs()

        # dagrun failed
        self.assertEqual(dr.state, dagrun_state)
Example #23
    def test_mark_failure_on_failure_callback(self):
        """
        Test that ensures that mark_failure in the UI fails
        the task, and executes on_failure_callback
        """
        data = {'called': False}

        def check_failure(context):
            self.assertEqual(context['dag_run'].dag_id, 'test_mark_failure')
            data['called'] = True

        def task_function(ti, **context):
            with create_session() as session:
                self.assertEqual(State.RUNNING, ti.state)
                ti.log.info("Marking TI as failed 'externally'")
                ti.state = State.FAILED
                session.merge(ti)
                session.commit()

            time.sleep(60)
            # This should not happen -- the state change should be noticed and the task should get killed
            data['reached_end_of_sleep'] = True

        with DAG(dag_id='test_mark_failure', start_date=DEFAULT_DATE) as dag:
            task = PythonOperator(task_id='test_state_succeeded1',
                                  python_callable=task_function,
                                  provide_context=True,
                                  on_failure_callback=check_failure)

        session = settings.Session()

        dag.clear()
        dag.create_dagrun(run_id="test",
                          state=State.RUNNING,
                          execution_date=DEFAULT_DATE,
                          start_date=DEFAULT_DATE,
                          session=session)
        ti = TI(task=task, execution_date=DEFAULT_DATE)
        ti.refresh_from_db()

        job1 = LocalTaskJob(task_instance=ti,
                            ignore_ti_state=True,
                            executor=SequentialExecutor())
        with timeout(30):
            # This should be _much_ shorter to run.
            # If you change this limit, make the timeout in the callable above bigger
            job1.run()

        ti.refresh_from_db()
        self.assertEqual(ti.state, State.FAILED)
        self.assertTrue(data['called'])
        self.assertNotIn(
            'reached_end_of_sleep', data,
            'Task should not have been allowed to run to completion')
Example #24
    def test_submit_task_instance_to_dask_cluster(self):
        """
        Test that the DaskExecutor properly submits tasks to the cluster
        """
        cluster = LocalCluster(nanny=False)

        executor = DaskExecutor(cluster_address=cluster.scheduler_address)

        args = dict(start_date=DEFAULT_DATE)

        def fail():
            raise ValueError('Intentional failure.')

        with DAG('test-dag', default_args=args) as dag:
            # queue should be allowed, but ignored
            success_operator = PythonOperator(task_id='success',
                                              python_callable=lambda: True,
                                              queue='queue')

            fail_operator = PythonOperator(task_id='fail',
                                           python_callable=fail)

        success_ti = TaskInstance(success_operator,
                                  execution_date=DEFAULT_DATE)

        fail_ti = TaskInstance(fail_operator, execution_date=DEFAULT_DATE)

        # queue the tasks
        executor.queue_task_instance(success_ti)
        executor.queue_task_instance(fail_ti)

        # the tasks haven't been submitted to the cluster yet
        self.assertTrue(len(executor.futures) == 0)
        # after the heartbeat, they have been submitted
        executor.heartbeat()
        self.assertTrue(len(executor.futures) == 2)

        # wait a reasonable amount of time for the tasks to complete
        for _ in range(2):
            time.sleep(0.25)
            executor.heartbeat()

        # check that the futures were completed
        if len(executor.futures) == 2:
            raise ValueError('Failed to reach cluster before timeout.')
        self.assertTrue(len(executor.futures) == 0)

        # check that the taskinstances were updated
        success_ti.refresh_from_db()
        self.assertTrue(success_ti.state == State.SUCCESS)
        fail_ti.refresh_from_db()
        self.assertTrue(fail_ti.state == State.FAILED)

        cluster.close()
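
The submit/heartbeat/poll shape of this test can be reproduced with the standard library alone; a self-contained analogue using concurrent.futures in place of the Dask cluster:

import time
from concurrent.futures import ThreadPoolExecutor

with ThreadPoolExecutor(max_workers=2) as pool:
    futures = [pool.submit(lambda: True),
               pool.submit(time.sleep, 0.1)]
    # poll until the futures drain, as the heartbeat loop above does
    while not all(f.done() for f in futures):
        time.sleep(0.05)
    assert all(f.done() for f in futures)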
Example #25
    def test_subdag_clear_parentdag_downstream_clear(self):
        dag = self.dagbag.get_dag('example_subdag_operator')
        subdag_op_task = dag.get_task('section-1')

        subdag = subdag_op_task.subdag
        subdag.schedule_interval = '@daily'

        executor = TestExecutor()
        job = BackfillJob(dag=subdag,
                          start_date=DEFAULT_DATE,
                          end_date=DEFAULT_DATE,
                          executor=executor,
                          donot_pickle=True)

        with timeout(seconds=30):
            job.run()

        ti0 = TI(
            task=subdag.get_task('section-1-task-1'),
            execution_date=DEFAULT_DATE)
        ti0.refresh_from_db()
        self.assertEqual(ti0.state, State.SUCCESS)

        sdag = subdag.sub_dag(
            task_regex='section-1-task-1',
            include_downstream=True,
            include_upstream=False)

        sdag.clear(
            start_date=DEFAULT_DATE,
            end_date=DEFAULT_DATE,
            include_parentdag=True)

        ti0.refresh_from_db()
        self.assertEqual(State.NONE, ti0.state)

        ti1 = TI(
            task=dag.get_task('some-other-task'),
            execution_date=DEFAULT_DATE)
        self.assertEqual(State.NONE, ti1.state)

        # Checks that all the Downstream tasks for Parent DAG
        # have been cleared
        for task in subdag_op_task.downstream_list:
            ti = TI(
                task=dag.get_task(task.task_id),
                execution_date=DEFAULT_DATE
            )
            self.assertEqual(State.NONE, ti.state)

        subdag.clear()
        dag.clear()
Example #26
def task_run(args, dag=None):
    """Runs a single task instance"""
    if dag:
        args.dag_id = dag.dag_id

    log = LoggingMixin().log

    # Load custom airflow config
    if args.cfg_path:
        with open(args.cfg_path, 'r') as conf_file:
            conf_dict = json.load(conf_file)

        if os.path.exists(args.cfg_path):
            os.remove(args.cfg_path)

        conf.read_dict(conf_dict, source=args.cfg_path)
        settings.configure_vars()

    # IMPORTANT: we have to use NullPool; otherwise each "run" command may
    # leave behind multiple open sleeping connections while heartbeating,
    # which could easily exceed the database connection limit when
    # processing hundreds of simultaneous tasks.
    settings.configure_orm(disable_connection_pool=True)

    if not args.pickle and not dag:
        dag = get_dag(args)
    elif not dag:
        with db.create_session() as session:
            log.info('Loading pickle id %s', args.pickle)
            dag_pickle = session.query(DagPickle).filter(
                DagPickle.id == args.pickle).first()
            if not dag_pickle:
                raise AirflowException("Who hid the pickle!? [missing pickle]")
            dag = dag_pickle.pickle

    task = dag.get_task(task_id=args.task_id)
    ti = TaskInstance(task, args.execution_date)
    ti.refresh_from_db()

    ti.init_run_context(raw=args.raw)

    hostname = get_hostname()
    log.info("Running %s on host %s", ti, hostname)

    if args.interactive:
        _run(args, dag, ti)
    else:
        with redirect_stdout(ti.log, logging.INFO), redirect_stderr(
                ti.log, logging.WARN):
            _run(args, dag, ti)
    logging.shutdown()
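
The NullPool comment is the key operational detail of task_run. A sketch of the difference it makes, using plain SQLAlchemy against an in-memory SQLite database (the engine URL is just for demonstration):

from sqlalchemy import create_engine, text
from sqlalchemy.pool import NullPool

# with NullPool, every checkout opens a fresh connection and closing it
# really closes it -- nothing idles in a pool between heartbeats
engine = create_engine('sqlite://', poolclass=NullPool)
with engine.connect() as conn:
    assert conn.execute(text('SELECT 1')).scalar() == 1
# after the context exits, no sleeping connection is left behind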
Example #27
    def evaluate_dagrun(
            self,
            dag_id,
            expected_task_states,  # dict of task_id: state
            dagrun_state,
            run_kwargs=None,
            advance_execution_date=False,
            session=None):
        """
        Helper for testing DagRun states with simple two-task DAGs.
        This is hackish: a dag run is created but its tasks are
        run by a backfill.
        """
        if run_kwargs is None:
            run_kwargs = {}

        scheduler = SchedulerJob(**self.default_scheduler_args)
        dag = self.dagbag.get_dag(dag_id)
        dag.clear()
        dr = scheduler.create_dag_run(dag)

        if advance_execution_date:
            # run a second time to schedule a dagrun after the start_date
            dr = scheduler.create_dag_run(dag)
        ex_date = dr.execution_date

        try:
            dag.run(start_date=ex_date, end_date=ex_date, **run_kwargs)
        except AirflowException:
            pass

        # test tasks
        for task_id, expected_state in expected_task_states.items():
            task = dag.get_task(task_id)
            ti = TI(task, ex_date)
            ti.refresh_from_db()
            self.assertEqual(ti.state, expected_state)

        # load dagrun
        dr = DagRun.find(dag_id=dag_id, execution_date=ex_date)
        dr = dr[0]
        dr.dag = dag

        # dagrun is running
        self.assertEqual(dr.state, State.RUNNING)

        dr.update_state()

        # dagrun failed
        self.assertEqual(dr.state, dagrun_state)
Example #28
    def evaluate_dagrun(
            self,
            dag_id,
            expected_task_states,  # dict of task_id: state
            dagrun_state,
            run_kwargs=None,
            advance_execution_date=False,
            session=None):
        """
        Helper for testing DagRun states with simple two-task DAGs.
        This is hackish: a dag run is created but its tasks are
        run by a backfill.
        """
        if run_kwargs is None:
            run_kwargs = {}

        scheduler = SchedulerJob()
        dag = self.dagbag.get_dag(dag_id)
        dag.clear()
        dr = scheduler.schedule_dag(dag)

        if advance_execution_date:
            # run a second time to schedule a dagrun after the start_date
            dr = scheduler.schedule_dag(dag)
        ex_date = dr.execution_date

        try:
            dag.run(start_date=ex_date, end_date=ex_date, **run_kwargs)
        except AirflowException:
            pass

        # test tasks
        for task_id, expected_state in expected_task_states.items():
            task = dag.get_task(task_id)
            ti = TI(task, ex_date)
            ti.refresh_from_db()
            self.assertEqual(ti.state, expected_state)

        # load dagrun
        dr = DagRun.find(dag_id=dag_id, execution_date=ex_date)
        dr = dr[0]
        dr.dag = dag

        # dagrun is running
        self.assertEqual(dr.state, State.RUNNING)

        dr.update_state()

        # dagrun failed
        self.assertEqual(dr.state, dagrun_state)
Example #29
def _get_ti(task, exec_date_or_run_id):
    """Get the task instance through DagRun.run_id, if that fails, get the TI the old way"""
    dag_run = task.dag.get_dagrun(run_id=exec_date_or_run_id)
    if not dag_run:
        try:
            execution_date = timezone.parse(exec_date_or_run_id)
            ti = TaskInstance(task, execution_date)
            ti.refresh_from_db()
            return ti
        except (ParserError, TypeError):
            raise AirflowException(f"DagRun with run_id: {exec_date_or_run_id} not found")
    ti = dag_run.get_task_instance(task.task_id)
    ti.task = task
    return ti
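
_get_ti tries the run_id lookup first and only then falls back to parsing the argument as an execution date. The same two-step resolution, stripped of Airflow types (resolve and the in-memory lookup table are stand-ins):

from datetime import datetime

def resolve(value, runs_by_id):
    run = runs_by_id.get(value)
    if run is not None:
        return run
    try:
        # stand-in for timezone.parse(exec_date_or_run_id)
        return datetime.fromisoformat(value)
    except ValueError:
        raise LookupError(f'DagRun with run_id: {value} not found')

assert resolve('manual_1', {'manual_1': 'dag_run'}) == 'dag_run'
assert resolve('2018-04-27T08:39:51', {}) == datetime(2018, 4, 27, 8, 39, 51)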
Example #30
    def test_localtaskjob_maintain_heart_rate(self):
        dagbag = models.DagBag(
            dag_folder=TEST_DAG_FOLDER,
            include_examples=False,
        )
        dag = dagbag.dags.get('test_localtaskjob_double_trigger')
        task = dag.get_task('test_localtaskjob_double_trigger_task')

        session = settings.Session()

        dag.clear()
        dag.create_dagrun(run_id="test",
                          state=State.SUCCESS,
                          execution_date=DEFAULT_DATE,
                          start_date=DEFAULT_DATE,
                          session=session)

        ti_run = TI(task=task, execution_date=DEFAULT_DATE)
        ti_run.refresh_from_db()
        job1 = LocalTaskJob(task_instance=ti_run,
                            executor=SequentialExecutor())

        # this should make sure we only heartbeat once and exit at the second
        # loop in _execute()
        return_codes = [None, 0]

        def multi_return_code():
            return return_codes.pop(0)

        time_start = time.time()
        from airflow.task.task_runner.standard_task_runner import StandardTaskRunner
        with patch.object(StandardTaskRunner, 'start',
                          return_value=None) as mock_start:
            with patch.object(StandardTaskRunner,
                              'return_code') as mock_ret_code:
                mock_ret_code.side_effect = multi_return_code
                job1.run()
                self.assertEqual(mock_start.call_count, 1)
                self.assertEqual(mock_ret_code.call_count, 2)
        time_end = time.time()

        self.assertEqual(self.mock_base_job_sleep.call_count, 1)
        self.assertEqual(job1.state, State.SUCCESS)

        # Since the sleep call is patched, the job should not be sleeping to
        # keep up with the heart rate in other, unpatched places
        #
        # We already made sure the patched sleep call is only called once
        self.assertLess(time_end - time_start, job1.heartrate)
        session.close()
Example #31
    def evaluate_dagrun(self,
                        dag_id,
                        first_task_state,
                        second_task_state,
                        dagrun_state,
                        run_kwargs=None,
                        advance_execution_date=False,
                        session=None):
        """
        Helper for testing DagRun states with simple two-task DAGs
        """
        if run_kwargs is None:
            run_kwargs = {}

        scheduler = SchedulerJob()
        dag = self.dagbag.get_dag(dag_id)
        dag.clear()
        dr = scheduler.schedule_dag(dag)
        if advance_execution_date:
            # run a second time to schedule a dagrun after the start_date
            dr = scheduler.schedule_dag(dag)
        ex_date = dr.execution_date

        try:
            dag.run(start_date=ex_date, end_date=ex_date, **run_kwargs)
        except AirflowException:
            pass

        # test tasks
        task_1, task_2 = dag.tasks
        ti = TI(task_1, ex_date)
        ti.refresh_from_db()
        self.assertEqual(ti.state, first_task_state)
        ti = TI(task_2, ex_date)
        ti.refresh_from_db()
        self.assertEqual(ti.state, second_task_state)

        # load dagrun
        dr = session.query(DagRun).filter(
            DagRun.dag_id == dag.dag_id,
            DagRun.execution_date == ex_date).first()

        # dagrun is running
        self.assertEqual(dr.state, State.RUNNING)

        dag.get_active_runs()

        # dagrun failed
        self.assertEqual(dr.state, dagrun_state)
Example #32
    def evaluate_dagrun(
            self,
            dag_id,
            expected_task_states,  # dict of task_id: state
            dagrun_state,
            run_kwargs=None,
            advance_execution_date=False,
            session=None):
        """
        Helper for testing DagRun states with simple two-task DAGs
        """
        if run_kwargs is None:
            run_kwargs = {}

        scheduler = SchedulerJob()
        dag = self.dagbag.get_dag(dag_id)
        dag.clear()
        dr = scheduler.schedule_dag(dag)
        if advance_execution_date:
            # run a second time to schedule a dagrun after the start_date
            dr = scheduler.schedule_dag(dag)
        ex_date = dr.execution_date

        try:
            dag.run(start_date=ex_date, end_date=ex_date, **run_kwargs)
        except AirflowException:
            pass

        # test tasks
        for task_id, expected_state in expected_task_states.items():
            task = dag.get_task(task_id)
            ti = TI(task, ex_date)
            ti.refresh_from_db()
            self.assertEqual(ti.state, expected_state)

        # load dagrun
        dr = session.query(DagRun).filter(
            DagRun.dag_id == dag.dag_id,
            DagRun.execution_date == ex_date
        ).first()

        # dagrun is running
        self.assertEqual(dr.state, State.RUNNING)

        dag.get_active_runs()

        # dagrun failed
        self.assertEqual(dr.state, dagrun_state)
Example #33
def task_run(args, dag=None):
    """Runs a single task instance"""

    # Load custom airflow config
    if args.cfg_path:
        with open(args.cfg_path, 'r') as conf_file:
            conf_dict = json.load(conf_file)

        if os.path.exists(args.cfg_path):
            os.remove(args.cfg_path)

        conf.read_dict(conf_dict, source=args.cfg_path)
        settings.configure_vars()

    # IMPORTANT: we have to use NullPool; otherwise each "run" command may
    # leave behind multiple open sleeping connections while heartbeating,
    # which could easily exceed the database connection limit when
    # processing hundreds of simultaneous tasks.
    settings.configure_orm(disable_connection_pool=True)

    if dag and args.pickle:
        raise AirflowException(
            "You cannot use the --pickle option when using DAG.cli() method.")
    elif args.pickle:
        print(f'Loading pickle id: {args.pickle}')
        dag = get_dag_by_pickle(args.pickle)
    elif not dag:
        dag = get_dag(args.subdir, args.dag_id)
    else:
        # Use DAG from parameter
        pass

    task = dag.get_task(task_id=args.task_id)
    ti = TaskInstance(task, args.execution_date)
    ti.refresh_from_db()

    ti.init_run_context(raw=args.raw)

    hostname = get_hostname()
    print(f"Running {ti} on host {hostname}")

    if args.interactive:
        _run_task_by_selected_method(args, dag, ti)
    else:
        with redirect_stdout(StreamLogWriter(ti.log, logging.INFO)), \
                redirect_stderr(StreamLogWriter(ti.log, logging.WARN)):
            _run_task_by_selected_method(args, dag, ti)
    logging.shutdown()
Example #34
    def test_local_run(self):
        args = self.parser.parse_args([
            'tasks', 'run', 'example_python_operator', 'print_the_context',
            '2018-04-27T08:39:51.298439+00:00', '--interactive', '--subdir',
            '/root/dags/example_python_operator.py'
        ])

        dag = get_dag(args.subdir, args.dag_id)
        reset(dag.dag_id)

        task_command.task_run(args)
        task = dag.get_task(task_id=args.task_id)
        ti = TaskInstance(task, args.execution_date)
        ti.refresh_from_db()
        state = ti.current_state()
        self.assertEqual(state, State.SUCCESS)
Example #35
    def test_local_run(self):
        args = create_mock_args(
            task_id='print_the_context',
            dag_id='example_python_operator',
            subdir='/root/dags/example_python_operator.py',
            interactive=True,
            execution_date=timezone.parse('2018-04-27T08:39:51.298439+00:00'))
        dag = get_dag(args)
        reset(dag.dag_id)

        task_command.task_run(args)
        task = dag.get_task(task_id=args.task_id)
        ti = TaskInstance(task, args.execution_date)
        ti.refresh_from_db()
        state = ti.current_state()
        self.assertEqual(state, State.SUCCESS)
Example #36
    def test_depends_on_past(self):
        dagbag = models.DagBag()
        dag = dagbag.get_dag("test_depends_on_past")
        dag.clear()
        task = dag.tasks[0]
        run_date = task.start_date + datetime.timedelta(days=5)
        ti = TI(task, run_date)

        # depends_on_past prevents the run
        task.run(start_date=run_date, end_date=run_date)
        ti.refresh_from_db()
        self.assertIs(ti.state, None)

        # ignore first depends_on_past to allow the run
        task.run(start_date=run_date, end_date=run_date, ignore_first_depends_on_past=True)
        ti.refresh_from_db()
        self.assertEqual(ti.state, State.SUCCESS)
Example #37
    def test_clear_task_instances_for_backfill_dagrun(self):
        now = timezone.utcnow()
        session = settings.Session()
        dag_id = 'test_clear_task_instances_for_backfill_dagrun'
        dag = DAG(dag_id=dag_id, start_date=now)
        self.create_dag_run(dag, execution_date=now, is_backfill=True)

        task0 = DummyOperator(task_id='backfill_task_0', owner='test', dag=dag)
        ti0 = TI(task=task0, execution_date=now)
        ti0.run()

        qry = session.query(TI).filter(TI.dag_id == dag.dag_id).all()
        clear_task_instances(qry, session)
        session.commit()
        ti0.refresh_from_db()
        dr0 = session.query(DagRun).filter(DagRun.dag_id == dag_id, DagRun.execution_date == now).first()
        assert dr0.state == State.RUNNING
Example #38
    def test_heartbeat_failed_fast(self, mock_getpid):
        """
        Test that task heartbeat will sleep when it fails fast
        """
        mock_getpid.return_value = 1

        heartbeat_records = []

        def heartbeat_recorder(**kwargs):
            heartbeat_records.append(timezone.utcnow())

        with create_session() as session:
            dagbag = models.DagBag(
                dag_folder=TEST_DAG_FOLDER,
                include_examples=False,
            )
            dag_id = 'test_heartbeat_failed_fast'
            task_id = 'test_heartbeat_failed_fast_op'
            dag = dagbag.get_dag(dag_id)
            task = dag.get_task(task_id)

            dag.create_dagrun(run_id="test_heartbeat_failed_fast_run",
                              state=State.RUNNING,
                              execution_date=DEFAULT_DATE,
                              start_date=DEFAULT_DATE,
                              session=session)
            ti = TI(task=task, execution_date=DEFAULT_DATE)
            ti.refresh_from_db()
            ti.state = State.RUNNING
            ti.hostname = get_hostname()
            ti.pid = 1
            session.commit()

            job = LocalTaskJob(task_instance=ti,
                               executor=MockExecutor(do_update=False))
            job.heartrate = 2
            job.heartbeat_callback = heartbeat_recorder
            job._execute()
            self.assertGreater(len(heartbeat_records), 1)
            for i in range(1, len(heartbeat_records)):
                time1 = heartbeat_records[i - 1]
                time2 = heartbeat_records[i]
                # Assert that difference small enough
                delta = (time2 - time1).total_seconds()
                self.assertAlmostEqual(delta, job.heartrate, delta=0.05)
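
The interval check at the end of this test is a generic pattern: record a timestamp on every callback, then compare consecutive gaps with the expected period. A stand-alone version of the same pattern (the numbers are arbitrary):

import time

period = 0.05
records = []
for _ in range(3):
    records.append(time.monotonic())
    time.sleep(period)  # stand-in for the job sleeping one heartrate

for earlier, later in zip(records, records[1:]):
    # each gap should span at least one period (small jitter tolerance)
    assert later - earlier >= period - 0.01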
Example #39
    def test_depends_on_past(self):
        dag = get_dag('test_depends_on_past', TEST_DAGS_FOLDER)
        task = dag.tasks[0]
        run_date = task.start_date + datetime.timedelta(days=5)
        ti = TI(task, run_date)

        # depends_on_past prevents the run
        task.run(start_date=run_date, end_date=run_date)
        ti.refresh_from_db()
        self.assertIs(ti.state, None)

        # ignore first depends_on_past to allow the run
        task.run(
            start_date=run_date,
            end_date=run_date,
            ignore_first_depends_on_past=True)
        ti.refresh_from_db()
        self.assertEqual(ti.state, State.SUCCESS)
Example #40
    def test_scheduler_pooled_tasks(self):
        """
        Test that the scheduler handles queued tasks correctly
        See issue #1299
        """
        session = settings.Session()
        if not (
                session.query(Pool)
                .filter(Pool.pool == 'test_queued_pool')
                .first()):
            pool = Pool(pool='test_queued_pool', slots=5)
            session.merge(pool)
            session.commit()
        session.close()

        dag_id = 'test_scheduled_queued_tasks'
        dag = self.dagbag.get_dag(dag_id)
        dag.clear()

        scheduler = SchedulerJob(dag_id,
                                 num_runs=1,
                                 executor=TestExecutor(),
                                 **self.default_scheduler_args)
        scheduler.run()

        task_1 = dag.tasks[0]
        logging.info("Trying to find task {}".format(task_1))
        ti = TI(task_1, dag.start_date)
        ti.refresh_from_db()
        logging.error("TI is: {}".format(ti))
        self.assertEqual(ti.state, State.QUEUED)

        # now we use a DIFFERENT scheduler and executor
        # to simulate the num-runs CLI arg
        scheduler2 = SchedulerJob(
            dag_id,
            num_runs=5,
            executor=DEFAULT_EXECUTOR.__class__(),
            **self.default_scheduler_args)
        scheduler2.run()

        ti.refresh_from_db()
        self.assertEqual(ti.state, State.FAILED)
        dag.clear()
Example #41
    def test_local_run(self):
        args = create_mock_args(
            task_id='print_the_context',
            dag_id='example_python_operator',
            subdir='/root/dags/example_python_operator.py',
            interactive=True,
            execution_date=timezone.parse('2018-04-27T08:39:51.298439+00:00')
        )

        reset(args.dag_id)

        with patch('argparse.Namespace', args) as mock_args:
            run(mock_args)
            dag = get_dag(mock_args)
            task = dag.get_task(task_id=args.task_id)
            ti = TaskInstance(task, args.execution_date)
            ti.refresh_from_db()
            state = ti.current_state()
            self.assertEqual(state, State.SUCCESS)
Example #42
    def test_backfill_execute_subdag_with_removed_task(self):
        """
        Ensure that subdag operators execute properly in the case where
        an associated task of the subdag has been removed from the dag
        definition, but has instances in the database from previous runs.
        """
        dag = self.dagbag.get_dag('example_subdag_operator')
        subdag = dag.get_task('section-1').subdag

        executor = MockExecutor()
        job = BackfillJob(dag=subdag,
                          start_date=DEFAULT_DATE,
                          end_date=DEFAULT_DATE,
                          executor=executor,
                          donot_pickle=True)

        removed_task_ti = TI(
            task=DummyOperator(task_id='removed_task'),
            execution_date=DEFAULT_DATE,
            state=State.REMOVED)
        removed_task_ti.dag_id = subdag.dag_id

        session = settings.Session()
        session.merge(removed_task_ti)
        session.commit()

        with timeout(seconds=30):
            job.run()

        for task in subdag.tasks:
            instance = session.query(TI).filter(
                TI.dag_id == subdag.dag_id,
                TI.task_id == task.task_id,
                TI.execution_date == DEFAULT_DATE).first()

            self.assertIsNotNone(instance)
            self.assertEqual(instance.state, State.SUCCESS)

        removed_task_ti.refresh_from_db()
        self.assertEqual(removed_task_ti.state, State.REMOVED)

        subdag.clear()
        dag.clear()
Example #43
    def test_depends_on_past(self):
        dagbag = models.DagBag()
        dag = dagbag.get_dag('test_depends_on_past')
        dag.clear()
        task = dag.tasks[0]
        run_date = task.start_date + datetime.timedelta(days=5)
        ti = TI(task, run_date)

        # depends_on_past prevents the run
        task.run(start_date=run_date, end_date=run_date)
        ti.refresh_from_db()
        self.assertIs(ti.state, None)

        # ignore first depends_on_past to allow the run
        task.run(start_date=run_date,
                 end_date=run_date,
                 ignore_first_depends_on_past=True)
        ti.refresh_from_db()
        self.assertEqual(ti.state, State.SUCCESS)
Example #44
    def test_heartbeat_failed_fast(self, mock_getpid):
        """
        Test that task heartbeat will sleep when it fails fast
        """
        mock_getpid.return_value = 1
        self.mock_base_job_sleep.side_effect = time.sleep

        with create_session() as session:
            dagbag = models.DagBag(
                dag_folder=TEST_DAG_FOLDER,
                include_examples=False,
            )
            dag_id = 'test_heartbeat_failed_fast'
            task_id = 'test_heartbeat_failed_fast_op'
            dag = dagbag.get_dag(dag_id)
            task = dag.get_task(task_id)

            dag.create_dagrun(run_id="test_heartbeat_failed_fast_run",
                              state=State.RUNNING,
                              execution_date=DEFAULT_DATE,
                              start_date=DEFAULT_DATE,
                              session=session)
            ti = TI(task=task, execution_date=DEFAULT_DATE)
            ti.refresh_from_db()
            ti.state = State.RUNNING
            ti.hostname = get_hostname()
            ti.pid = 1
            session.commit()

            job = LocalTaskJob(task_instance=ti,
                               executor=MockExecutor(do_update=False))
            job.heartrate = 2
            heartbeat_records = []
            job.heartbeat_callback = lambda session: heartbeat_records.append(
                job.latest_heartbeat)
            job._execute()
            self.assertGreater(len(heartbeat_records), 2)
            for i in range(1, len(heartbeat_records)):
                time1 = heartbeat_records[i - 1]
                time2 = heartbeat_records[i]
                self.assertGreaterEqual((time2 - time1).total_seconds(),
                                        job.heartrate)
Example #45
    def test_backfill_depends_on_past(self):
        dag = get_dag('test_depends_on_past', TEST_DAGS_FOLDER)
        run_date = dag.start_date + datetime.timedelta(days=5)
        # import ipdb; ipdb.set_trace()
        BackfillJob(dag=dag, start_date=run_date, end_date=run_date).run()

        # ti should not have run
        ti = TI(dag.tasks[0], run_date)
        ti.refresh_from_db()
        self.assertIs(ti.state, None)

        BackfillJob(
            dag=dag,
            start_date=run_date,
            end_date=run_date,
            ignore_first_depends_on_past=True).run()

        # ti should have run
        ti = TI(dag.tasks[0], run_date)
        ti.refresh_from_db()
        self.assertEqual(ti.state, State.SUCCESS)
Example #46
    def test_clear_task_instances(self):
        dag = DAG('test_clear_task_instances', start_date=DEFAULT_DATE,
                  end_date=DEFAULT_DATE + datetime.timedelta(days=10))
        task0 = DummyOperator(task_id='0', owner='test', dag=dag)
        task1 = DummyOperator(task_id='1', owner='test', dag=dag, retries=2)
        ti0 = TI(task=task0, execution_date=DEFAULT_DATE)
        ti1 = TI(task=task1, execution_date=DEFAULT_DATE)

        ti0.run()
        ti1.run()
        session = settings.Session()
        qry = session.query(TI).filter(
            TI.dag_id == dag.dag_id).all()
        clear_task_instances(qry, session, dag=dag)
        session.commit()
        ti0.refresh_from_db()
        ti1.refresh_from_db()
        self.assertEqual(ti0.try_number, 1)
        self.assertEqual(ti0.max_tries, 1)
        self.assertEqual(ti1.try_number, 1)
        self.assertEqual(ti1.max_tries, 3)
Example #47
    def test_dag_clear(self):
        dag = DAG('test_dag_clear', start_date=DEFAULT_DATE,
                  end_date=DEFAULT_DATE + datetime.timedelta(days=10))
        task0 = DummyOperator(task_id='test_dag_clear_task_0', owner='test', dag=dag)
        ti0 = TI(task=task0, execution_date=DEFAULT_DATE)
        # Next try to run will be try 1
        self.assertEqual(ti0.try_number, 1)
        ti0.run()
        self.assertEqual(ti0.try_number, 2)
        dag.clear()
        ti0.refresh_from_db()
        self.assertEqual(ti0.try_number, 2)
        self.assertEqual(ti0.state, State.NONE)
        self.assertEqual(ti0.max_tries, 1)

        task1 = DummyOperator(task_id='test_dag_clear_task_1', owner='test',
                              dag=dag, retries=2)
        ti1 = TI(task=task1, execution_date=DEFAULT_DATE)
        self.assertEqual(ti1.max_tries, 2)
        ti1.try_number = 1
        # Next try will be 2
        ti1.run()
        self.assertEqual(ti1.try_number, 3)
        self.assertEqual(ti1.max_tries, 2)

        dag.clear()
        ti0.refresh_from_db()
        ti1.refresh_from_db()
        # after clearing the dag, ti1 should show attempt 3 of 5
        self.assertEqual(ti1.max_tries, 4)
        self.assertEqual(ti1.try_number, 3)
        # after clearing the dag, ti0 should show attempt 2 of 2
        self.assertEqual(ti0.try_number, 2)
        self.assertEqual(ti0.max_tries, 1)
Example #48
    def test_backfill_depends_on_past(self):
        """
        Test that backfill respects ignore_depends_on_past
        """
        dag = self.dagbag.get_dag('test_depends_on_past')
        dag.clear()
        run_date = DEFAULT_DATE + datetime.timedelta(days=5)

        # backfill should deadlock
        self.assertRaisesRegexp(
            AirflowException,
            'BackfillJob is deadlocked',
            BackfillJob(dag=dag, start_date=run_date, end_date=run_date).run)

        BackfillJob(
            dag=dag,
            start_date=run_date,
            end_date=run_date,
            ignore_first_depends_on_past=True).run()

        # ti should have succeeded
        ti = TI(dag.tasks[0], run_date)
        ti.refresh_from_db()
        self.assertEqual(ti.state, State.SUCCESS)
Example #49
    def test_reschedule_handling(self, mock_pool_full):
        """
        Test that task reschedules are handled properly
        """
        # Mock the pool to report open slots, since the pool doesn't actually exist
        mock_pool_full.return_value = False

        # Return values of the python sensor callable, modified during tests
        done = False
        fail = False

        def callable():
            if fail:
                raise AirflowException()
            return done

        dag = models.DAG(dag_id='test_reschedule_handling')
        task = PythonSensor(
            task_id='test_reschedule_handling_sensor',
            poke_interval=0,
            mode='reschedule',
            python_callable=callable,
            retries=1,
            retry_delay=datetime.timedelta(seconds=0),
            dag=dag,
            owner='airflow',
            start_date=timezone.datetime(2016, 2, 1, 0, 0, 0))

        ti = TI(task=task, execution_date=timezone.utcnow())
        self.assertEqual(ti._try_number, 0)
        self.assertEqual(ti.try_number, 1)

        def run_ti_and_assert(run_date, expected_start_date, expected_end_date, expected_duration,
                              expected_state, expected_try_number, expected_task_reschedule_count):
            with freeze_time(run_date):
                try:
                    ti.run()
                except AirflowException:
                    if not fail:
                        raise
            ti.refresh_from_db()
            self.assertEqual(ti.state, expected_state)
            self.assertEqual(ti._try_number, expected_try_number)
            self.assertEqual(ti.try_number, expected_try_number + 1)
            self.assertEqual(ti.start_date, expected_start_date)
            self.assertEqual(ti.end_date, expected_end_date)
            self.assertEqual(ti.duration, expected_duration)
            trs = TaskReschedule.find_for_task_instance(ti)
            self.assertEqual(len(trs), expected_task_reschedule_count)

        date1 = timezone.utcnow()
        date2 = date1 + datetime.timedelta(minutes=1)
        date3 = date2 + datetime.timedelta(minutes=1)
        date4 = date3 + datetime.timedelta(minutes=1)

        # Run with multiple reschedules.
        # During reschedule the try number remains the same, but each reschedule is recorded.
        # The start date is expected to remain the initial date, hence the duration increases.
        # When finished the try number is incremented and there is no reschedule expected
        # for this try.

        done, fail = False, False
        run_ti_and_assert(date1, date1, date1, 0, State.UP_FOR_RESCHEDULE, 0, 1)

        done, fail = False, False
        run_ti_and_assert(date2, date1, date2, 60, State.UP_FOR_RESCHEDULE, 0, 2)

        done, fail = False, False
        run_ti_and_assert(date3, date1, date3, 120, State.UP_FOR_RESCHEDULE, 0, 3)

        done, fail = True, False
        run_ti_and_assert(date4, date1, date4, 180, State.SUCCESS, 1, 0)

        # Clear the task instance.
        dag.clear()
        ti.refresh_from_db()
        self.assertEqual(ti.state, State.NONE)
        self.assertEqual(ti._try_number, 1)

        # Run again after clearing with reschedules and a retry.
        # The retry increments the try number, and for that try no reschedule is expected.
        # After the retry the start date is reset, hence the duration is also reset.

        done, fail = False, False
        run_ti_and_assert(date1, date1, date1, 0, State.UP_FOR_RESCHEDULE, 1, 1)

        done, fail = False, True
        run_ti_and_assert(date2, date1, date2, 60, State.UP_FOR_RETRY, 2, 0)

        done, fail = False, False
        run_ti_and_assert(date3, date3, date3, 0, State.UP_FOR_RESCHEDULE, 2, 1)

        done, fail = True, False
        run_ti_and_assert(date4, date3, date4, 60, State.SUCCESS, 3, 0)
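
The deterministic 60-second durations asserted throughout run_ti_and_assert come from freeze_time, which pins "now" for the body of its with-block. Assuming it is freezegun's freeze_time (the import is not shown in the snippet):

import datetime
from freezegun import freeze_time

with freeze_time('2016-02-01 00:00:00'):
    start = datetime.datetime.utcnow()
with freeze_time('2016-02-01 00:01:00'):
    elapsed = datetime.datetime.utcnow() - start
assert elapsed.total_seconds() == 60.0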