Example #1
    def test_parse_execution_date(self):
        execution_date_str_wo_ms = '2017-11-02 00:00:00'
        execution_date_str_w_ms = '2017-11-05 16:18:30.989729'
        bad_execution_date_str = '2017-11-06TXX:00:00Z'

        self.assertEqual(timezone.datetime(2017, 11, 2, 0, 0, 0), dates.parse_execution_date(execution_date_str_wo_ms))
        self.assertEqual(timezone.datetime(2017, 11, 5, 16, 18, 30, 989729), dates.parse_execution_date(execution_date_str_w_ms))
        self.assertRaises(ValueError, dates.parse_execution_date, bad_execution_date_str)
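
The function under test accepts execution-date strings with or without microseconds and raises ValueError for anything else. A minimal sketch of that contract (not Airflow's implementation, which also handles ISO variants and returns timezone-aware datetimes) could look like this:

from datetime import datetime

def parse_execution_date_sketch(value):
    # Try the microsecond format first, then the plain one; anything else is an error.
    for fmt in ('%Y-%m-%d %H:%M:%S.%f', '%Y-%m-%d %H:%M:%S'):
        try:
            return datetime.strptime(value, fmt)
        except ValueError:
            continue
    raise ValueError('Unparseable execution date: {}'.format(value))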
Example #2
    def test_exec_date_after_end_date(self):
        """
        If the dag's execution date is in the future, this dep should fail.
        """
        ti = self._get_task_instance(
            dag_end_date=datetime(2016, 1, 3),
            task_end_date=datetime(2016, 1, 3),
            execution_date=datetime(2016, 1, 2),
        )
        self.assertFalse(RunnableExecDateDep().is_met(ti=ti))
Example #3
    def test_all_deps_met(self):
        """
        Test to make sure all of the conditions for the dep are met.
        """
        ti = self._get_task_instance(
            dag_end_date=datetime(2016, 1, 2),
            task_end_date=datetime(2016, 1, 2),
            execution_date=datetime(2016, 1, 1),
        )
        self.assertTrue(RunnableExecDateDep().is_met(ti=ti))
Example #4
    def test_check_task_dependencies(self, trigger_rule, successes, skipped,
                                     failed, upstream_failed, done,
                                     flag_upstream_failed,
                                     expect_state, expect_completed):
        start_date = timezone.datetime(2016, 2, 1, 0, 0, 0)
        dag = models.DAG('test-dag', start_date=start_date)
        downstream = DummyOperator(task_id='downstream',
                                   dag=dag, owner='airflow',
                                   trigger_rule=trigger_rule)
        for i in range(5):
            task = DummyOperator(task_id='runme_{}'.format(i),
                                 dag=dag, owner='airflow')
            task.set_downstream(downstream)
        run_date = task.start_date + datetime.timedelta(days=5)

        ti = TI(downstream, run_date)
        dep_results = TriggerRuleDep()._evaluate_trigger_rule(
            ti=ti,
            successes=successes,
            skipped=skipped,
            failed=failed,
            upstream_failed=upstream_failed,
            done=done,
            flag_upstream_failed=flag_upstream_failed)
        completed = all([dep.passed for dep in dep_results])

        self.assertEqual(completed, expect_completed)
        self.assertEqual(ti.state, expect_state)
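
For context, the counters passed to _evaluate_trigger_rule above describe the states of the five upstream tasks created in the loop. A compact sketch (not Airflow's implementation, and it ignores the skip-propagation details the real dep handles) of what a few common trigger rules mean in terms of those counters:

def rule_passes(rule, successes, skipped, failed, upstream_failed, done, upstream):
    if rule == 'all_success':
        return successes == upstream
    if rule == 'all_failed':
        return failed + upstream_failed == upstream
    if rule == 'one_success':
        return successes >= 1
    if rule == 'one_failed':
        return failed + upstream_failed >= 1
    if rule == 'all_done':
        return done == upstream
    raise ValueError('rule not covered by this sketch: %s' % rule)

assert rule_passes('all_success', successes=5, skipped=0, failed=0,
                   upstream_failed=0, done=5, upstream=5)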
Example #5
    def test_xcom_pull_after_success(self):
        """
        tests xcom set/clear relative to a task in a 'success' rerun scenario
        """
        key = 'xcom_key'
        value = 'xcom_value'

        dag = models.DAG(dag_id='test_xcom', schedule_interval='@monthly')
        task = DummyOperator(
            task_id='test_xcom',
            dag=dag,
            pool='test_xcom',
            owner='airflow',
            start_date=timezone.datetime(2016, 6, 2, 0, 0, 0))
        exec_date = timezone.utcnow()
        ti = TI(
            task=task, execution_date=exec_date)
        ti.run(mark_success=True)
        ti.xcom_push(key=key, value=value)
        self.assertEqual(ti.xcom_pull(task_ids='test_xcom', key=key), value)
        ti.run()
        # The second run and assert is to handle AIRFLOW-131 (don't clear on
        # prior success)
        self.assertEqual(ti.xcom_pull(task_ids='test_xcom', key=key), value)

        # Test AIRFLOW-703: Xcom shouldn't be cleared if the task doesn't
        # execute, even if dependencies are ignored
        ti.run(ignore_all_deps=True, mark_success=True)
        self.assertEqual(ti.xcom_pull(task_ids='test_xcom', key=key), value)
        # Xcom IS finally cleared once task has executed
        ti.run(ignore_all_deps=True)
        self.assertEqual(ti.xcom_pull(task_ids='test_xcom', key=key), None)
Example #6
    def test_xcom_pull_different_execution_date(self):
        """
        tests xcom fetch behavior with different execution dates, using
        both xcom_pull with "include_prior_dates" and without
        """
        key = 'xcom_key'
        value = 'xcom_value'

        dag = models.DAG(dag_id='test_xcom', schedule_interval='@monthly')
        task = DummyOperator(
            task_id='test_xcom',
            dag=dag,
            pool='test_xcom',
            owner='airflow',
            start_date=timezone.datetime(2016, 6, 2, 0, 0, 0))
        exec_date = timezone.utcnow()
        ti = TI(
            task=task, execution_date=exec_date)
        ti.run(mark_success=True)
        ti.xcom_push(key=key, value=value)
        self.assertEqual(ti.xcom_pull(task_ids='test_xcom', key=key), value)
        ti.run()
        exec_date += datetime.timedelta(days=1)
        ti = TI(
            task=task, execution_date=exec_date)
        ti.run()
        # We have set a new execution date (and did not pass in
        # 'include_prior_dates'), which means this task should now have a
        # cleared xcom value
        self.assertEqual(ti.xcom_pull(task_ids='test_xcom', key=key), None)
        # We *should* get a value using 'include_prior_dates'
        self.assertEqual(ti.xcom_pull(task_ids='test_xcom',
                                      key=key,
                                      include_prior_dates=True),
                         value)
Example #7
    def test_post_execute_hook(self):
        """
        Test that post_execute hook is called with the Operator's result.
        The result ('error') will cause an error to be raised and trapped.
        """

        class TestError(Exception):
            pass

        class TestOperator(PythonOperator):
            def post_execute(self, context, result):
                if result == 'error':
                    raise TestError('expected error.')

        dag = models.DAG(dag_id='test_post_execute_dag')
        task = TestOperator(
            task_id='test_operator',
            dag=dag,
            python_callable=lambda: 'error',
            owner='airflow',
            start_date=timezone.datetime(2017, 2, 1))
        ti = TI(task=task, execution_date=timezone.utcnow())

        with self.assertRaises(TestError):
            ti.run()
Example #8
    def setUp(self):
        configuration.load_test_config()
        args = {
            'owner': 'airflow',
            'start_date': timezone.datetime(2017, 1, 1)
        }
        self.dag = DAG('test_dag_id', default_args=args)
Example #9
    def setUp(self):
        super().setUp()
        self.remote_log_base = 's3://bucket/remote/log/location'
        self.remote_log_location = 's3://bucket/remote/log/location/1.log'
        self.remote_log_key = 'remote/log/location/1.log'
        self.local_log_location = 'local/log/location'
        self.filename_template = '{try_number}.log'
        self.s3_task_handler = S3TaskHandler(
            self.local_log_location,
            self.remote_log_base,
            self.filename_template
        )

        configuration.load_test_config()
        date = datetime(2016, 1, 1)
        self.dag = DAG('dag_for_testing_file_task_handler', start_date=date)
        task = DummyOperator(task_id='task_for_testing_file_log_handler', dag=self.dag)
        self.ti = TaskInstance(task=task, execution_date=date)
        self.ti.try_number = 1
        self.ti.state = State.RUNNING
        self.addCleanup(self.dag.clear)

        self.conn = boto3.client('s3')
        # We need to create the bucket since this is all in Moto's 'virtual'
        # AWS account
        moto.core.moto_api_backend.reset()
        self.conn.create_bucket(Bucket="bucket")
Example #10
    def test_still_in_retry_period(self):
        """
        Task instances that are in their retry period should fail this dep.
        """
        ti = self._get_task_instance(State.UP_FOR_RETRY,
                                     end_date=datetime(2016, 1, 1, 15, 30))
        self.assertTrue(ti.is_premature)
        self.assertFalse(NotInRetryPeriodDep().is_met(ti=ti))
Example #11
    def test_retry_period_finished(self):
        """
        Task instances that have had their retry period elapse should pass this dep.
        """
        ti = self._get_task_instance(State.UP_FOR_RETRY,
                                     end_date=datetime(2016, 1, 1))
        self.assertFalse(ti.is_premature)
        self.assertTrue(NotInRetryPeriodDep().is_met(ti=ti))
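
Both tests above hinge on whether the retry delay of the failed attempt has elapsed. A sketch of that condition (the 5-minute retry_delay and the "now" values below are assumptions for illustration, not taken from the test fixture):

from datetime import datetime, timedelta

def in_retry_period(state, end_date, retry_delay, now):
    # Still premature while the retry delay since the failed attempt has not elapsed.
    return state == 'up_for_retry' and now < end_date + retry_delay

assert in_retry_period('up_for_retry', datetime(2016, 1, 1, 15, 30),
                       timedelta(minutes=5), datetime(2016, 1, 1, 15, 32))
assert not in_retry_period('up_for_retry', datetime(2016, 1, 1),
                           timedelta(minutes=5), datetime(2016, 1, 1, 15, 32))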
Example #12
    def test_retry_handling(self, mock_pool_full):
        """
        Test that task retries are handled properly
        """
        # Mock the pool as having open slots, since the pool doesn't actually exist
        mock_pool_full.return_value = False

        dag = models.DAG(dag_id='test_retry_handling')
        task = BashOperator(
            task_id='test_retry_handling_op',
            bash_command='exit 1',
            retries=1,
            retry_delay=datetime.timedelta(seconds=0),
            dag=dag,
            owner='airflow',
            start_date=timezone.datetime(2016, 2, 1, 0, 0, 0))

        def run_with_error(ti):
            try:
                ti.run()
            except AirflowException:
                pass

        ti = TI(
            task=task, execution_date=timezone.utcnow())
        self.assertEqual(ti.try_number, 1)

        # first run -- up for retry
        run_with_error(ti)
        self.assertEqual(ti.state, State.UP_FOR_RETRY)
        self.assertEqual(ti._try_number, 1)
        self.assertEqual(ti.try_number, 2)

        # second run -- fail
        run_with_error(ti)
        self.assertEqual(ti.state, State.FAILED)
        self.assertEqual(ti._try_number, 2)
        self.assertEqual(ti.try_number, 3)

        # Clear the TI state since you can't run a task with a FAILED state without
        # clearing it first
        dag.clear()

        # third run -- up for retry
        run_with_error(ti)
        self.assertEqual(ti.state, State.UP_FOR_RETRY)
        self.assertEqual(ti._try_number, 3)
        self.assertEqual(ti.try_number, 4)

        # fourth run -- fail
        run_with_error(ti)
        ti.refresh_from_db()
        self.assertEqual(ti.state, State.FAILED)
        self.assertEqual(ti._try_number, 4)
        self.assertEqual(ti.try_number, 5)
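
The _try_number / try_number assertions rely on an offset between the stored counter and the public property. A simplified sketch of that relationship (inferred from the assertions above, not copied from TaskInstance):

class TryNumberSketch(object):
    def __init__(self):
        self._try_number = 0
        self.running = False

    @property
    def try_number(self):
        # Outside of a running attempt, report the number the *next* attempt will get.
        return self._try_number if self.running else self._try_number + 1

ti = TryNumberSketch()
assert ti.try_number == 1      # before the first run
ti._try_number = 1             # after the first (failed) attempt
assert ti.try_number == 2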
Example #13
    def test_skipping(self):
        latest_task = LatestOnlyOperator(
            task_id='latest',
            dag=self.dag)
        downstream_task = DummyOperator(
            task_id='downstream',
            dag=self.dag)
        downstream_task2 = DummyOperator(
            task_id='downstream_2',
            dag=self.dag)

        downstream_task.set_upstream(latest_task)
        downstream_task2.set_upstream(downstream_task)

        latest_task.run(start_date=DEFAULT_DATE, end_date=END_DATE)
        downstream_task.run(start_date=DEFAULT_DATE, end_date=END_DATE)
        downstream_task2.run(start_date=DEFAULT_DATE, end_date=END_DATE)

        latest_instances = get_task_instances('latest')
        exec_date_to_latest_state = {
            ti.execution_date: ti.state for ti in latest_instances}
        self.assertEqual({
            timezone.datetime(2016, 1, 1): 'success',
            timezone.datetime(2016, 1, 1, 12): 'success',
            timezone.datetime(2016, 1, 2): 'success'},
            exec_date_to_latest_state)

        downstream_instances = get_task_instances('downstream')
        exec_date_to_downstream_state = {
            ti.execution_date: ti.state for ti in downstream_instances}
        self.assertEqual({
            timezone.datetime(2016, 1, 1): 'skipped',
            timezone.datetime(2016, 1, 1, 12): 'skipped',
            timezone.datetime(2016, 1, 2): 'success'},
            exec_date_to_downstream_state)

        downstream_instances = get_task_instances('downstream_2')
        exec_date_to_downstream_state = {
            ti.execution_date: ti.state for ti in downstream_instances}
        self.assertEqual({
            timezone.datetime(2016, 1, 1): 'skipped',
            timezone.datetime(2016, 1, 1, 12): 'skipped',
            timezone.datetime(2016, 1, 2): 'success'},
            exec_date_to_downstream_state)
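
A minimal DAG sketch of the wiring this test exercises, using the same 1.x-era import paths as the surrounding examples (the dag id and schedule are placeholders): only the most recent scheduled run lets downstream tasks execute, and older runs are skipped.

from airflow.models import DAG
from airflow.operators.dummy_operator import DummyOperator
from airflow.operators.latest_only_operator import LatestOnlyOperator
from airflow.utils import timezone

dag = DAG('latest_only_sketch', start_date=timezone.datetime(2016, 1, 1),
          schedule_interval='@hourly')
latest = LatestOnlyOperator(task_id='latest', dag=dag)
downstream = DummyOperator(task_id='downstream', dag=dag)
downstream.set_upstream(latest)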
Example #14
    def test_task_instance_info(self):
        url_template = '/api/experimental/dags/{}/dag_runs/{}/tasks/{}'
        dag_id = 'example_bash_operator'
        task_id = 'also_run_this'
        execution_date = utcnow().replace(microsecond=0)
        datetime_string = quote_plus(execution_date.isoformat())
        wrong_datetime_string = quote_plus(
            datetime(1990, 1, 1, 1, 1, 1).isoformat()
        )

        # Create DagRun
        trigger_dag(dag_id=dag_id,
                    run_id='test_task_instance_info_run',
                    execution_date=execution_date)

        # Test Correct execution
        response = self.client.get(
            url_template.format(dag_id, datetime_string, task_id)
        )
        self.assertEqual(200, response.status_code)
        self.assertIn('state', response.data.decode('utf-8'))
        self.assertNotIn('error', response.data.decode('utf-8'))

        # Test error for nonexistent dag
        response = self.client.get(
            url_template.format('does_not_exist_dag', datetime_string,
                                task_id),
        )
        self.assertEqual(404, response.status_code)
        self.assertIn('error', response.data.decode('utf-8'))

        # Test error for nonexistent task
        response = self.client.get(
            url_template.format(dag_id, datetime_string, 'does_not_exist_task')
        )
        self.assertEqual(404, response.status_code)
        self.assertIn('error', response.data.decode('utf-8'))

        # Test error for nonexistent dag run (wrong execution_date)
        response = self.client.get(
            url_template.format(dag_id, wrong_datetime_string, task_id)
        )
        self.assertEqual(404, response.status_code)
        self.assertIn('error', response.data.decode('utf-8'))

        # Test error for bad datetime format
        response = self.client.get(
            url_template.format(dag_id, 'not_a_datetime', task_id)
        )
        self.assertEqual(400, response.status_code)
        self.assertIn('error', response.data.decode('utf-8'))
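
Outside the test client, the "correct execution" call above corresponds roughly to this request against a running webserver (host, port and the concrete execution date are assumptions, not part of the test):

import requests
from urllib.parse import quote_plus

url = 'http://localhost:8080/api/experimental/dags/{}/dag_runs/{}/tasks/{}'.format(
    'example_bash_operator', quote_plus('2016-01-02T00:00:00'), 'also_run_this')
response = requests.get(url)
print(response.status_code, response.json().get('state'))  # expect 200 and a task state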
Example #15
    def test_run_pooling_task(self, mock_pool_full):
        """
        Test that running a pooled task updates the task state to SUCCESS
        (there is no pool dependency check in ti_deps anymore, so the run
        succeeds even though the pool is mocked as full).
        """
        # Mock the pool out with a full pool because the pool doesn't actually exist
        mock_pool_full.return_value = True

        dag = models.DAG(dag_id='test_run_pooling_task')
        task = DummyOperator(task_id='test_run_pooling_task_op', dag=dag,
                             pool='test_run_pooling_task_pool', owner='airflow',
                             start_date=timezone.datetime(2016, 2, 1, 0, 0, 0))
        ti = TI(
            task=task, execution_date=timezone.utcnow())
        ti.run()
        self.assertEqual(ti.state, State.SUCCESS)
Example #16
    def test_run_pooling_task_with_mark_success(self, mock_pool_full):
        """
        Test that running a task with the mark_success param updates the task
        state to SUCCESS without actually running the task.
        """
        # Mock the pool out with a full pool because the pool doesn't actually exist
        mock_pool_full.return_value = True

        dag = models.DAG(dag_id='test_run_pooling_task_with_mark_success')
        task = DummyOperator(
            task_id='test_run_pooling_task_with_mark_success_op',
            dag=dag,
            pool='test_run_pooling_task_with_mark_success_pool',
            owner='airflow',
            start_date=timezone.datetime(2016, 2, 1, 0, 0, 0))
        ti = TI(
            task=task, execution_date=timezone.utcnow())
        ti.run(mark_success=True)
        self.assertEqual(ti.state, State.SUCCESS)
Example #17
    def test_next_retry_datetime(self):
        delay = datetime.timedelta(seconds=30)
        max_delay = datetime.timedelta(minutes=60)

        dag = models.DAG(dag_id='fail_dag')
        task = BashOperator(
            task_id='task_with_exp_backoff_and_max_delay',
            bash_command='exit 1',
            retries=3,
            retry_delay=delay,
            retry_exponential_backoff=True,
            max_retry_delay=max_delay,
            dag=dag,
            owner='airflow',
            start_date=timezone.datetime(2016, 2, 1, 0, 0, 0))
        ti = TI(
            task=task, execution_date=DEFAULT_DATE)
        ti.end_date = pendulum.instance(timezone.utcnow())

        dt = ti.next_retry_datetime()
        # between 30 * 2^-1 and 30 * 2^0 (15 and 30)
        period = ti.end_date.add(seconds=30) - ti.end_date.add(seconds=15)
        self.assertTrue(dt in period)

        ti.try_number = 3
        dt = ti.next_retry_datetime()
        # between 30 * 2^2 and 30 * 2^3 (120 and 240)
        period = ti.end_date.add(seconds=240) - ti.end_date.add(seconds=120)
        self.assertTrue(dt in period)

        ti.try_number = 5
        dt = ti.next_retry_datetime()
        # between 30 * 2^4 and 30 * 2^5 (480 and 960)
        period = ti.end_date.add(seconds=960) - ti.end_date.add(seconds=480)
        self.assertTrue(dt in period)

        ti.try_number = 9
        dt = ti.next_retry_datetime()
        self.assertEqual(dt, ti.end_date + max_delay)

        ti.try_number = 50
        dt = ti.next_retry_datetime()
        self.assertEqual(dt, ti.end_date + max_delay)
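
The commented windows follow a simple doubling pattern capped at max_retry_delay. A small arithmetic sketch of those bounds (the exponents for the first two calls are read off the comments above, the last call just illustrates the cap, and the real method also applies a deterministic jitter inside the window):

from datetime import timedelta

def backoff_window(retry_delay, exponent, max_retry_delay):
    low = min(retry_delay * 2 ** exponent, max_retry_delay)
    high = min(retry_delay * 2 ** (exponent + 1), max_retry_delay)
    return low, high

print(backoff_window(timedelta(seconds=30), -1, timedelta(minutes=60)))  # 15s .. 30s
print(backoff_window(timedelta(seconds=30), 2, timedelta(minutes=60)))   # 120s .. 240s
print(backoff_window(timedelta(seconds=30), 8, timedelta(minutes=60)))   # capped at 60 minutes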
Example #18
    def test_mapred_job_name(self, mock_get_hook):
        mock_hook = mock.MagicMock()
        mock_get_hook.return_value = mock_hook
        t = HiveOperator(
            task_id='test_mapred_job_name',
            hql=self.hql,
            dag=self.dag)

        fake_execution_date = timezone.datetime(2018, 6, 19)
        fake_ti = TaskInstance(task=t, execution_date=fake_execution_date)
        fake_ti.hostname = 'fake_hostname'
        fake_context = {'ti': fake_ti}

        t.execute(fake_context)
        self.assertEqual(
            "Airflow HiveOperator task for {}.{}.{}.{}"
            .format(fake_ti.hostname,
                    self.dag.dag_id, t.task_id,
                    fake_execution_date.isoformat()), mock_hook.mapred_job_name)
Example #19
    def test_run_pooling_task_with_skip(self):
        """
        Test that running a task which raises AirflowSkipException ends up
        in a SKIPPED state.
        """

        def raise_skip_exception():
            raise AirflowSkipException

        dag = models.DAG(dag_id='test_run_pooling_task_with_skip')
        task = PythonOperator(
            task_id='test_run_pooling_task_with_skip',
            dag=dag,
            python_callable=raise_skip_exception,
            owner='airflow',
            start_date=timezone.datetime(2016, 2, 1, 0, 0, 0))
        ti = TI(
            task=task, execution_date=timezone.utcnow())
        ti.run()
        self.assertEqual(State.SKIPPED, ti.state)
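
The same mechanism is typically used inside an ordinary callable: raising AirflowSkipException makes the task instance end as SKIPPED rather than FAILED. A tiny usage sketch (the condition below is a made-up placeholder):

from airflow.exceptions import AirflowSkipException

def maybe_skip(**context):
    nothing_to_do = True  # placeholder condition for illustration
    if nothing_to_do:
        raise AirflowSkipException('no work for this run')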
Example #20
    def test_trigger_dag_for_date(self):
        url_template = '/api/experimental/dags/{}/dag_runs'
        dag_id = 'example_bash_operator'
        hour_from_now = utcnow() + timedelta(hours=1)
        execution_date = datetime(hour_from_now.year,
                                  hour_from_now.month,
                                  hour_from_now.day,
                                  hour_from_now.hour)
        datetime_string = execution_date.isoformat()

        # Test Correct execution
        response = self.client.post(
            url_template.format(dag_id),
            data=json.dumps({'execution_date': datetime_string}),
            content_type="application/json"
        )
        self.assertEqual(200, response.status_code)

        dagbag = DagBag()
        dag = dagbag.get_dag(dag_id)
        dag_run = dag.get_dagrun(execution_date)
        self.assertTrue(dag_run,
                        'Dag Run not found for execution date {}'
                        .format(execution_date))

        # Test error for nonexistent dag
        response = self.client.post(
            url_template.format('does_not_exist_dag'),
            data=json.dumps({'execution_date': execution_date.isoformat()}),
            content_type="application/json"
        )
        self.assertEqual(404, response.status_code)

        # Test error for bad datetime format
        response = self.client.post(
            url_template.format(dag_id),
            data=json.dumps({'execution_date': 'not_a_datetime'}),
            content_type="application/json"
        )
        self.assertEqual(400, response.status_code)
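
Mirroring the sketch after Example #14, the successful trigger above corresponds roughly to this request against a running webserver (host, port and the execution date are assumptions):

import json
import requests

response = requests.post(
    'http://localhost:8080/api/experimental/dags/example_bash_operator/dag_runs',
    data=json.dumps({'execution_date': '2016-01-02T01:00:00'}),
    headers={'Content-Type': 'application/json'},
)
print(response.status_code)   # 200 on success, 400 for a malformed execution_date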
Example #21
    def test_timezone_awareness(self):
        NAIVE_DATETIME = DEFAULT_DATE.replace(tzinfo=None)

        # check ti without dag (just for bw compat)
        op_no_dag = DummyOperator(task_id='op_no_dag')
        ti = TI(task=op_no_dag, execution_date=NAIVE_DATETIME)

        self.assertEqual(ti.execution_date, DEFAULT_DATE)

        # check with dag without localized execution_date
        dag = DAG('dag', start_date=DEFAULT_DATE)
        op1 = DummyOperator(task_id='op_1')
        dag.add_task(op1)
        ti = TI(task=op1, execution_date=NAIVE_DATETIME)

        self.assertEqual(ti.execution_date, DEFAULT_DATE)

        # with dag and localized execution_date
        tz = pendulum.timezone("Europe/Amsterdam")
        execution_date = timezone.datetime(2016, 1, 1, 1, 0, 0, tzinfo=tz)
        utc_date = timezone.convert_to_utc(execution_date)
        ti = TI(task=op1, execution_date=execution_date)
        self.assertEqual(ti.execution_date, utc_date)
Example #22
    def test_retry_delay(self):
        """
        Test that retry delays are respected
        """
        dag = models.DAG(dag_id='test_retry_handling')
        task = BashOperator(
            task_id='test_retry_handling_op',
            bash_command='exit 1',
            retries=1,
            retry_delay=datetime.timedelta(seconds=3),
            dag=dag,
            owner='airflow',
            start_date=timezone.datetime(2016, 2, 1, 0, 0, 0))

        def run_with_error(ti):
            try:
                ti.run()
            except AirflowException:
                pass

        ti = TI(
            task=task, execution_date=timezone.utcnow())

        self.assertEqual(ti.try_number, 1)
        # first run -- up for retry
        run_with_error(ti)
        self.assertEqual(ti.state, State.UP_FOR_RETRY)
        self.assertEqual(ti.try_number, 2)

        # second run -- still up for retry because retry_delay hasn't expired
        run_with_error(ti)
        self.assertEqual(ti.state, State.UP_FOR_RETRY)

        # third run -- failed
        time.sleep(3)
        run_with_error(ti)
        self.assertEqual(ti.state, State.FAILED)
Example #23
    def test_xcom_pull(self):
        """
        Test xcom_pull, using different filtering methods.
        """
        dag = models.DAG(
            dag_id='test_xcom', schedule_interval='@monthly',
            start_date=timezone.datetime(2016, 6, 1, 0, 0, 0))

        exec_date = timezone.utcnow()

        # Push a value
        task1 = DummyOperator(task_id='test_xcom_1', dag=dag, owner='airflow')
        ti1 = TI(task=task1, execution_date=exec_date)
        ti1.xcom_push(key='foo', value='bar')

        # Push another value with the same key (but by a different task)
        task2 = DummyOperator(task_id='test_xcom_2', dag=dag, owner='airflow')
        ti2 = TI(task=task2, execution_date=exec_date)
        ti2.xcom_push(key='foo', value='baz')

        # Pull with no arguments
        result = ti1.xcom_pull()
        self.assertEqual(result, None)
        # Pull the value pushed most recently by any task.
        result = ti1.xcom_pull(key='foo')
        self.assertEqual(result, 'baz')
        # Pull the value pushed by the first task
        result = ti1.xcom_pull(task_ids='test_xcom_1', key='foo')
        self.assertEqual(result, 'bar')
        # Pull the value pushed by the second task
        result = ti1.xcom_pull(task_ids='test_xcom_2', key='foo')
        self.assertEqual(result, 'baz')
        # Pull the values pushed by both tasks
        result = ti1.xcom_pull(
            task_ids=['test_xcom_1', 'test_xcom_2'], key='foo')
        self.assertEqual(result, ('bar', 'baz'))
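
The same pull filters are commonly used from inside PythonOperator callables via the TaskInstance in the context; a short sketch with assumed task ids:

def push_value(**context):
    context['ti'].xcom_push(key='foo', value='bar')

def pull_values(**context):
    latest = context['ti'].xcom_pull(key='foo')                      # most recent push for the key
    from_one = context['ti'].xcom_pull(task_ids='push_task', key='foo')
    from_many = context['ti'].xcom_pull(task_ids=['push_task', 'other_task'], key='foo')
    return latest, from_one, from_many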
Example #24
    def test_clean_execution_date(self):
        clean_execution_date = self.es_task_handler._clean_execution_date(
            datetime(2016, 7, 8, 9, 10, 11, 12))
        assert '2016_07_08T09_10_11_000012' == clean_execution_date
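
The expected string can be reproduced by replacing the separators in the ISO timestamp; this is only a sketch of the format the test asserts, not the handler's own code:

from datetime import datetime

iso = datetime(2016, 7, 8, 9, 10, 11, 12).isoformat()        # '2016-07-08T09:10:11.000012'
clean = iso.replace('-', '_').replace(':', '_').replace('.', '_')
assert clean == '2016_07_08T09_10_11_000012'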
Example #25
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
import time

from airflow.models import DAG
from airflow.operators.dummy_operator import DummyOperator
from airflow.utils.timezone import datetime


class DummyWithOnKill(DummyOperator):
    def execute(self, context):
        time.sleep(10)

    def on_kill(self):
        self.log.info("Executing on_kill")
        with open("/tmp/airflow_on_kill", "w") as f:
            f.write("ON_KILL_TEST")


# DAG tests backfill with pooled tasks
# Previously backfill would queue the task but never run it
dag1 = DAG(dag_id='test_on_kill', start_date=datetime(2015, 1, 1))
dag1_task1 = DummyWithOnKill(task_id='task1', dag=dag1, owner='airflow')
Example #26
    def test_skipping_dagrun(self):
        latest_task = LatestOnlyOperator(task_id='latest', dag=self.dag)
        downstream_task = DummyOperator(task_id='downstream', dag=self.dag)
        downstream_task2 = DummyOperator(task_id='downstream_2', dag=self.dag)

        downstream_task.set_upstream(latest_task)
        downstream_task2.set_upstream(downstream_task)

        self.dag.create_dagrun(run_id="manual__1",
                               start_date=timezone.utcnow(),
                               execution_date=DEFAULT_DATE,
                               state=State.RUNNING)

        self.dag.create_dagrun(run_id="manual__2",
                               start_date=timezone.utcnow(),
                               execution_date=timezone.datetime(
                                   2016, 1, 1, 12),
                               state=State.RUNNING)

        self.dag.create_dagrun(run_id="manual__3",
                               start_date=timezone.utcnow(),
                               execution_date=END_DATE,
                               state=State.RUNNING)

        latest_task.run(start_date=DEFAULT_DATE, end_date=END_DATE)
        downstream_task.run(start_date=DEFAULT_DATE, end_date=END_DATE)
        downstream_task2.run(start_date=DEFAULT_DATE, end_date=END_DATE)

        latest_instances = get_task_instances('latest')
        exec_date_to_latest_state = {
            ti.execution_date: ti.state
            for ti in latest_instances
        }
        self.assertEqual(
            {
                timezone.datetime(2016, 1, 1): 'success',
                timezone.datetime(2016, 1, 1, 12): 'success',
                timezone.datetime(2016, 1, 2): 'success'
            }, exec_date_to_latest_state)

        downstream_instances = get_task_instances('downstream')
        exec_date_to_downstream_state = {
            ti.execution_date: ti.state
            for ti in downstream_instances
        }
        self.assertEqual(
            {
                timezone.datetime(2016, 1, 1): 'skipped',
                timezone.datetime(2016, 1, 1, 12): 'skipped',
                timezone.datetime(2016, 1, 2): 'success'
            }, exec_date_to_downstream_state)

        downstream_instances = get_task_instances('downstream_2')
        exec_date_to_downstream_state = {
            ti.execution_date: ti.state
            for ti in downstream_instances
        }
        self.assertEqual(
            {
                timezone.datetime(2016, 1, 1): 'skipped',
                timezone.datetime(2016, 1, 1, 12): 'skipped',
                timezone.datetime(2016, 1, 2): 'success'
            }, exec_date_to_downstream_state)
Example #27
# under the License.
import logging

from airflow.models import DAG
from airflow.operators.python import PythonOperator
from airflow.utils.timezone import datetime

logger = logging.getLogger(__name__)


def test_logging_fn(**kwargs):
    """
    Tests DAG logging.
    :param kwargs:
    :return:
    """
    logger.info("Log from DAG Logger")
    kwargs["ti"].log.info("Log from TI Logger")
    print("Log from Print statement")


dag = DAG(dag_id='test_logging_dag',
          schedule_interval=None,
          start_date=datetime(2016, 1, 1),
          catchup=False)

PythonOperator(
    task_id='test_task',
    python_callable=test_logging_fn,
    dag=dag,
)
Example #28
from airflow.operators.dummy_operator import DummyOperator
from airflow.operators.python_operator import BranchPythonOperator, PythonOperator
from airflow.utils import timezone

import pandas as pd

default_args = {
    'owner': 'zkan',
    'email': ['*****@*****.**'],
    'sla': timedelta(seconds=30),
}
dag = DAG(
    'branching',
    schedule_interval='0 0 * * THU',
    default_args=default_args,
    start_date=timezone.datetime(2009, 1, 1),
    catchup=False,
)


def test_condition():
    options = ['branch_a', 'branch_b']
    return random.choice(options)


branching = BranchPythonOperator(
    task_id='branching',
    python_callable=test_condition,
    dag=dag,
)
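
This excerpt stops before the branch targets are wired up. A typical continuation (task ids assumed from the callable's return values, reusing the dag and branching objects above) might be:

branch_a = DummyOperator(task_id='branch_a', dag=dag)
branch_b = DummyOperator(task_id='branch_b', dag=dag)
branching >> branch_a
branching >> branch_b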
Example #29
from airflow.operators.python_operator import PythonOperator
from airflow.sensors.named_hive_partition_sensor import NamedHivePartitionSensor
from airflow.utils import timezone

import pandas as pd

default_args = {
    'owner': 'zkan',
    'email': ['*****@*****.**'],
    'sla': timedelta(seconds=30),
}
dag = DAG(
    'product_price_range_pipeline',
    schedule_interval='0 0 * * THU',
    default_args=default_args,
    start_date=timezone.datetime(2020, 8, 15),
    catchup=False,
)

start = DummyOperator(
    task_id='start',
    dag=dag,
)

# Define your pipeline here
check_named_partition = NamedHivePartitionSensor(
    task_id='check_named_partition',
    partition_names=[
        'fact_transactions/execution_date={{ macros.ds_add(ds, -1) }}'
    ],
    metastore_conn_id='my_hive_metastore_conn',
Example #30
class TestLogView(unittest.TestCase):
    DAG_ID = 'dag_for_testing_log_view'
    TASK_ID = 'task_for_testing_log_view'
    DEFAULT_DATE = datetime(2017, 9, 1)
    ENDPOINT = '/admin/airflow/log?dag_id={dag_id}&task_id={task_id}&execution_date={execution_date}'.format(
        dag_id=DAG_ID,
        task_id=TASK_ID,
        execution_date=DEFAULT_DATE,
    )

    @classmethod
    def setUpClass(cls):
        super(TestLogView, cls).setUpClass()
        session = Session()
        session.query(TaskInstance).filter(
            TaskInstance.dag_id == cls.DAG_ID,
            TaskInstance.task_id == cls.TASK_ID,
            TaskInstance.execution_date == cls.DEFAULT_DATE).delete()
        session.commit()
        session.close()

    def setUp(self):
        super(TestLogView, self).setUp()

        # Create a custom logging configuration
        configuration.load_test_config()
        logging_config = copy.deepcopy(DEFAULT_LOGGING_CONFIG)
        current_dir = os.path.dirname(os.path.abspath(__file__))
        logging_config['handlers']['task']['base_log_folder'] = os.path.normpath(
            os.path.join(current_dir, 'test_logs'))
        logging_config['handlers']['task']['filename_template'] = \
            '{{ ti.dag_id }}/{{ ti.task_id }}/{{ ts | replace(":", ".") }}/{{ try_number }}.log'

        # Write the custom logging configuration to a file
        self.settings_folder = tempfile.mkdtemp()
        settings_file = os.path.join(self.settings_folder, "airflow_local_settings.py")
        new_logging_file = "LOGGING_CONFIG = {}".format(logging_config)
        with open(settings_file, 'w') as handle:
            handle.writelines(new_logging_file)
        sys.path.append(self.settings_folder)
        conf.set('core', 'logging_config_class', 'airflow_local_settings.LOGGING_CONFIG')

        app = application.create_app(testing=True)
        self.app = app.test_client()
        self.session = Session()
        from airflow.www.views import dagbag
        dag = DAG(self.DAG_ID, start_date=self.DEFAULT_DATE)
        task = DummyOperator(task_id=self.TASK_ID, dag=dag)
        dagbag.bag_dag(dag, parent_dag=dag, root_dag=dag)
        ti = TaskInstance(task=task, execution_date=self.DEFAULT_DATE)
        ti.try_number = 1
        self.session.merge(ti)
        self.session.commit()

    def tearDown(self):
        logging.config.dictConfig(DEFAULT_LOGGING_CONFIG)
        self.session.query(TaskInstance).filter(
            TaskInstance.dag_id == self.DAG_ID,
            TaskInstance.task_id == self.TASK_ID,
            TaskInstance.execution_date == self.DEFAULT_DATE).delete()
        self.session.commit()
        self.session.close()

        sys.path.remove(self.settings_folder)
        shutil.rmtree(self.settings_folder)
        conf.set('core', 'logging_config_class', '')

        super(TestLogView, self).tearDown()

    def test_get_file_task_log(self):
        response = self.app.get(
            TestLogView.ENDPOINT,
            follow_redirects=True,
        )
        self.assertEqual(response.status_code, 200)
        self.assertIn('Log by attempts',
                      response.data.decode('utf-8'))

    def test_get_logs_with_metadata(self):
        url_template = "/admin/airflow/get_logs_with_metadata?dag_id={}&" \
                       "task_id={}&execution_date={}&" \
                       "try_number={}&metadata={}"
        response = \
            self.app.get(url_template.format(self.DAG_ID,
                                             self.TASK_ID,
                                             quote_plus(self.DEFAULT_DATE.isoformat()),
                                             1,
                                             json.dumps({})))

        self.assertIn('"message":', response.data.decode('utf-8'))
        self.assertIn('"metadata":', response.data.decode('utf-8'))
        self.assertIn('Log for testing.', response.data.decode('utf-8'))
        self.assertEqual(200, response.status_code)

    def test_get_logs_with_null_metadata(self):
        url_template = "/admin/airflow/get_logs_with_metadata?dag_id={}&" \
                       "task_id={}&execution_date={}&" \
                       "try_number={}&metadata=null"
        response = \
            self.app.get(url_template.format(self.DAG_ID,
                                             self.TASK_ID,
                                             quote_plus(self.DEFAULT_DATE.isoformat()),
                                             1))

        self.assertIn('"message":', response.data.decode('utf-8'))
        self.assertIn('"metadata":', response.data.decode('utf-8'))
        self.assertIn('Log for testing.', response.data.decode('utf-8'))
        self.assertEqual(200, response.status_code)
Example #31
log = getLogger(__name__)


def pushes_a_value(**context):
    log.info('Doing things')
    context['ti'].xcom_push('some', {'my dict': 100, 'ключ': 'значение'})
    context['ti'].xcom_push('нечто', 'ну как бы некий текст')


def pulls_the_value(**context):
    value = context['ti'].xcom_pull(key='some', task_ids='pushing_task')
    log.info(f'The value is: {value}')


DEFAULT_ARGS = dict(
    start_date=timezone.datetime(2016, 1, 1),
    owner='Test',
    retries=0,
    depends_on_past=False,
)

with DAG(
        'test_dag_complex',
        catchup=False,
        default_args=DEFAULT_ARGS,
        schedule_interval=None,
        is_paused_upon_creation=True,
) as dag:
    bash_task = BashOperator(
        task_id='bash_task',
        bash_command="""
Example #32
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

import datetime
import mock
import unittest

from airflow import configuration, DAG
from airflow.operators.email_operator import EmailOperator
from airflow.utils import timezone

DEFAULT_DATE = timezone.datetime(2016, 1, 1)
END_DATE = timezone.datetime(2016, 1, 2)
INTERVAL = datetime.timedelta(hours=12)
FROZEN_NOW = timezone.datetime(2016, 1, 2, 12, 1, 1)

send_email_test = mock.Mock()


class TestEmailOperator(unittest.TestCase):

    def setUp(self):
        super().setUp()
        configuration.load_test_config()
        self.dag = DAG(
            'test_dag',
            default_args={
Example #33
    def __init__(
        self,
        dag_directory: str,
        max_runs: int,
        processor_factory: Callable[[str, List[CallbackRequest]],
                                    AbstractDagFileProcessorProcess],
        processor_timeout: timedelta,
        signal_conn: MultiprocessingConnection,
        dag_ids: Optional[List[str]],
        pickle_dags: bool,
        async_mode: bool = True,
    ):
        super().__init__()
        self._file_paths: List[str] = []
        self._file_path_queue: List[str] = []
        self._dag_directory = dag_directory
        self._max_runs = max_runs
        self._processor_factory = processor_factory
        self._signal_conn = signal_conn
        self._pickle_dags = pickle_dags
        self._dag_ids = dag_ids
        self._async_mode = async_mode
        self._parsing_start_time: Optional[datetime] = None

        self._parallelism = conf.getint('scheduler', 'parsing_processes')
        if 'sqlite' in conf.get('core',
                                'sql_alchemy_conn') and self._parallelism > 1:
            self.log.warning(
                "Because we cannot use more than 1 thread (parsing_processes = "
                "%d ) when using sqlite. So we set parallelism to 1.",
                self._parallelism,
            )
            self._parallelism = 1

        # Parse and schedule each file no faster than this interval.
        self._file_process_interval = conf.getint('scheduler',
                                                  'min_file_process_interval')
        # How often to print out DAG file processing stats to the log. Default to
        # 30 seconds.
        self.print_stats_interval = conf.getint('scheduler',
                                                'print_stats_interval')
        # How many seconds we wait for tasks to heartbeat before marking them as zombies.
        self._zombie_threshold_secs = conf.getint(
            'scheduler', 'scheduler_zombie_task_threshold')

        # Should store dag file source in a database?
        self.store_dag_code = STORE_DAG_CODE
        # Map from file path to the processor
        self._processors: Dict[str, AbstractDagFileProcessorProcess] = {}

        self._num_run = 0

        # Map from file path to stats about the file
        self._file_stats: Dict[str, DagFileStat] = {}

        self._last_zombie_query_time = None
        # Last time that the DAG dir was traversed to look for files
        self.last_dag_dir_refresh_time = timezone.make_aware(
            datetime.fromtimestamp(0))
        # Last time stats were printed
        self.last_stat_print_time = timezone.datetime(2000, 1, 1)
        # TODO: Remove magic number
        self._zombie_query_interval = 10
        # How long to wait before timing out a process to parse a DAG file
        self._processor_timeout = processor_timeout

        # How often to scan the DAGs directory for new files. Default to 5 minutes.
        self.dag_dir_list_interval = conf.getint('scheduler',
                                                 'dag_dir_list_interval')

        # Mapping file name and callbacks requests
        self._callback_to_execute: Dict[
            str, List[CallbackRequest]] = defaultdict(list)

        self._log = logging.getLogger('airflow.processor_manager')

        self.waitables: Dict[Any, Union[MultiprocessingConnection,
                                        AbstractDagFileProcessorProcess]] = {
                                            self._signal_conn:
                                            self._signal_conn,
                                        }
Example #34
    def __init__(self,
                 dag_directory,
                 file_paths,
                 max_runs,
                 processor_factory,
                 signal_conn,
                 stat_queue,
                 result_queue,
                 async_mode=True):
        """
        :param dag_directory: Directory where DAG definitions are kept. All
            files in file_paths should be under this directory
        :type dag_directory: unicode
        :param file_paths: list of file paths that contain DAG definitions
        :type file_paths: list[unicode]
        :param max_runs: The number of times to parse and schedule each file. -1
            for unlimited.
        :type max_runs: int
        :param processor_factory: function that creates processors for DAG
            definition files. Arguments are (dag_definition_path)
        :type processor_factory: (unicode, unicode, list) -> (AbstractDagFileProcessor)
        :param signal_conn: connection to communicate signal with processor agent.
        :type signal_conn: airflow.models.connection.Connection
        :param stat_queue: the queue to use for passing back parsing stat to agent.
        :type stat_queue: multiprocessing.Queue
        :param result_queue: the queue to use for passing back the result to agent.
        :type result_queue: multiprocessing.Queue
        :param async_mode: whether to start the manager in async mode
        :type async_mode: bool
        """
        self._file_paths = file_paths
        self._file_path_queue = []
        self._dag_directory = dag_directory
        self._max_runs = max_runs
        self._processor_factory = processor_factory
        self._signal_conn = signal_conn
        self._stat_queue = stat_queue
        self._result_queue = result_queue
        self._async_mode = async_mode

        self._parallelism = conf.getint('scheduler', 'max_threads')
        if 'sqlite' in conf.get('core', 'sql_alchemy_conn') and self._parallelism > 1:
            self.log.error("Cannot use more than 1 thread when using sqlite. "
                           "Setting parallelism to 1")
            self._parallelism = 1

        # Parse and schedule each file no faster than this interval.
        self._file_process_interval = conf.getint('scheduler',
                                                  'min_file_process_interval')
        # How often to print out DAG file processing stats to the log. Default to
        # 30 seconds.
        self.print_stats_interval = conf.getint('scheduler',
                                                'print_stats_interval')
        # How many seconds we wait for tasks to heartbeat before marking them as zombies.
        self._zombie_threshold_secs = (
            conf.getint('scheduler', 'scheduler_zombie_task_threshold'))
        # Map from file path to the processor
        self._processors = {}
        # Map from file path to the last runtime
        self._last_runtime = {}
        # Map from file path to the last finish time
        self._last_finish_time = {}
        self._last_zombie_query_time = timezone.utcnow()
        # Last time that the DAG dir was traversed to look for files
        self.last_dag_dir_refresh_time = timezone.utcnow()
        # Last time stats were printed
        self.last_stat_print_time = timezone.datetime(2000, 1, 1)
        # TODO: Remove magic number
        self._zombie_query_interval = 10
        # Map from file path to the number of runs
        self._run_count = defaultdict(int)
        # Manager heartbeat key.
        self._heart_beat_key = 'heart-beat'

        # How often to scan the DAGs directory for new files. Default to 5 minutes.
        self.dag_dir_list_interval = conf.getint('scheduler',
                                                 'dag_dir_list_interval')

        self._log = logging.getLogger('airflow.processor_manager')

        signal.signal(signal.SIGINT, self._exit_gracefully)
        signal.signal(signal.SIGTERM, self._exit_gracefully)
Example #35
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

from airflow.models import DAG
from airflow.operators.dummy import DummyOperator
from airflow.utils.timezone import datetime

# The schedule_interval specified here is an INVALID
# Cron expression. This invalid DAG will be used to
# test whether dagbag.process_file() can identify
# invalid Cron expression.
dag1 = DAG(dag_id='test_invalid_cron', start_date=datetime(2015, 1, 1), schedule_interval="0 100 * * *")
dag1_task1 = DummyOperator(task_id='task1', dag=dag1, owner='airflow')
Example #36
class TestElasticsearchTaskHandler(unittest.TestCase):
    DAG_ID = 'dag_for_testing_file_task_handler'
    TASK_ID = 'task_for_testing_file_log_handler'
    EXECUTION_DATE = datetime(2016, 1, 1)
    LOG_ID = '{dag_id}-{task_id}-2016-01-01T00:00:00+00:00-1'.format(
        dag_id=DAG_ID, task_id=TASK_ID)

    @elasticmock
    def setUp(self):
        super(TestElasticsearchTaskHandler, self).setUp()
        self.local_log_location = 'local/log/location'
        self.filename_template = '{try_number}.log'
        self.log_id_template = '{dag_id}-{task_id}-{execution_date}-{try_number}'
        self.end_of_log_mark = 'end_of_log\n'
        self.es_task_handler = ElasticsearchTaskHandler(
            self.local_log_location, self.filename_template,
            self.log_id_template, self.end_of_log_mark)

        self.es = elasticsearch.Elasticsearch(hosts=[{
            'host': 'localhost',
            'port': 9200
        }])
        self.index_name = 'test_index'
        self.doc_type = 'log'
        self.test_message = 'some random stuff'
        self.body = {
            'message': self.test_message,
            'log_id': self.LOG_ID,
            'offset': 1
        }

        self.es.index(index=self.index_name,
                      doc_type=self.doc_type,
                      body=self.body,
                      id=1)

        configuration.load_test_config()
        self.dag = DAG(self.DAG_ID, start_date=self.EXECUTION_DATE)
        task = DummyOperator(task_id=self.TASK_ID, dag=self.dag)
        self.ti = TaskInstance(task=task, execution_date=self.EXECUTION_DATE)
        self.ti.try_number = 1
        self.ti.state = State.RUNNING
        self.addCleanup(self.dag.clear)

    def tearDown(self):
        shutil.rmtree(self.local_log_location.split(os.path.sep)[0],
                      ignore_errors=True)

    def test_client(self):
        self.assertIsInstance(self.es_task_handler.client,
                              elasticsearch.Elasticsearch)

    def test_read(self):
        ts = pendulum.now()
        logs, metadatas = self.es_task_handler.read(
            self.ti, 1, {
                'offset': 0,
                'last_log_timestamp': str(ts),
                'end_of_log': False
            })
        self.assertEqual(1, len(logs))
        self.assertEqual(len(logs), len(metadatas))
        self.assertEqual(self.test_message, logs[0])
        self.assertFalse(metadatas[0]['end_of_log'])
        self.assertEqual(1, metadatas[0]['offset'])
        self.assertTrue(
            timezone.parse(metadatas[0]['last_log_timestamp']) > ts)

    def test_read_with_match_phrase_query(self):
        similar_log_id = '{task_id}-{dag_id}-2016-01-01T00:00:00+00:00-1'.format(
            dag_id=TestElasticsearchTaskHandler.DAG_ID,
            task_id=TestElasticsearchTaskHandler.TASK_ID)
        another_test_message = 'another message'

        another_body = {
            'message': another_test_message,
            'log_id': similar_log_id,
            'offset': 1
        }
        self.es.index(index=self.index_name,
                      doc_type=self.doc_type,
                      body=another_body,
                      id=1)

        ts = pendulum.now()
        logs, metadatas = self.es_task_handler.read(
            self.ti, 1, {
                'offset': 0,
                'last_log_timestamp': str(ts),
                'end_of_log': False
            })
        self.assertEqual(1, len(logs))
        self.assertEqual(len(logs), len(metadatas))
        self.assertEqual(self.test_message, logs[0])
        self.assertNotEqual(another_test_message, logs[0])

        self.assertFalse(metadatas[0]['end_of_log'])
        self.assertEqual(1, metadatas[0]['offset'])
        self.assertTrue(
            timezone.parse(metadatas[0]['last_log_timestamp']) > ts)

    def test_read_with_none_metadata(self):
        logs, metadatas = self.es_task_handler.read(self.ti, 1)
        self.assertEqual(1, len(logs))
        self.assertEqual(len(logs), len(metadatas))
        self.assertEqual(self.test_message, logs[0])
        self.assertFalse(metadatas[0]['end_of_log'])
        self.assertEqual(1, metadatas[0]['offset'])
        self.assertTrue(
            timezone.parse(metadatas[0]['last_log_timestamp']) <
            pendulum.now())

    def test_read_nonexistent_log(self):
        ts = pendulum.now()
        # In ElasticMock, search is going to return all documents with matching index
        # and doc_type regardless of match filters, so we delete the log entry instead
        # of making a new TaskInstance to query.
        self.es.delete(index=self.index_name, doc_type=self.doc_type, id=1)
        logs, metadatas = self.es_task_handler.read(
            self.ti, 1, {
                'offset': 0,
                'last_log_timestamp': str(ts),
                'end_of_log': False
            })
        self.assertEqual(1, len(logs))
        self.assertEqual(len(logs), len(metadatas))
        self.assertEqual([''], logs)
        self.assertFalse(metadatas[0]['end_of_log'])
        self.assertEqual(0, metadatas[0]['offset'])
        # last_log_timestamp won't change if no log lines read.
        self.assertTrue(
            timezone.parse(metadatas[0]['last_log_timestamp']) == ts)

    def test_read_with_empty_metadata(self):
        ts = pendulum.now()
        logs, metadatas = self.es_task_handler.read(self.ti, 1, {})
        self.assertEqual(1, len(logs))
        self.assertEqual(len(logs), len(metadatas))
        self.assertEqual(self.test_message, logs[0])
        self.assertFalse(metadatas[0]['end_of_log'])
        # offset should be initialized to 0 if not provided.
        self.assertEqual(1, metadatas[0]['offset'])
        # last_log_timestamp will be initialized using log reading time
        # if last_log_timestamp is not provided.
        self.assertTrue(
            timezone.parse(metadatas[0]['last_log_timestamp']) > ts)

        # case where offset is missing but metadata not empty.
        self.es.delete(index=self.index_name, doc_type=self.doc_type, id=1)
        logs, metadatas = self.es_task_handler.read(self.ti, 1,
                                                    {'end_of_log': False})
        self.assertEqual(1, len(logs))
        self.assertEqual(len(logs), len(metadatas))
        self.assertEqual([''], logs)
        self.assertFalse(metadatas[0]['end_of_log'])
        # offset should be initialized to 0 if not provided.
        self.assertEqual(0, metadatas[0]['offset'])
        # last_log_timestamp will be initialized using log reading time
        # if last_log_timestamp is not provided.
        self.assertTrue(
            timezone.parse(metadatas[0]['last_log_timestamp']) > ts)

    def test_read_timeout(self):
        ts = pendulum.now().subtract(minutes=5)

        self.es.delete(index=self.index_name, doc_type=self.doc_type, id=1)
        logs, metadatas = self.es_task_handler.read(
            self.ti, 1, {
                'offset': 0,
                'last_log_timestamp': str(ts),
                'end_of_log': False
            })
        self.assertEqual(1, len(logs))
        self.assertEqual(len(logs), len(metadatas))
        self.assertEqual([''], logs)
        self.assertTrue(metadatas[0]['end_of_log'])
        # offset should be initialized to 0 if not provided.
        self.assertEqual(0, metadatas[0]['offset'])
        self.assertTrue(
            timezone.parse(metadatas[0]['last_log_timestamp']) == ts)

    def test_read_raises(self):
        with mock.patch.object(self.es_task_handler.log,
                               'exception') as mock_exception:
            with mock.patch(
                    "elasticsearch_dsl.Search.execute") as mock_execute:
                mock_execute.side_effect = Exception('Failed to read')
                logs, metadatas = self.es_task_handler.read(self.ti, 1)
            msg = "Could not read log with log_id: {}".format(self.LOG_ID)
            mock_exception.assert_called_once()
            args, kwargs = mock_exception.call_args
            self.assertIn(msg, args[0])

        self.assertEqual(1, len(logs))
        self.assertEqual(len(logs), len(metadatas))
        self.assertEqual([''], logs)
        self.assertFalse(metadatas[0]['end_of_log'])
        self.assertEqual(0, metadatas[0]['offset'])

    def test_set_context(self):
        self.es_task_handler.set_context(self.ti)
        self.assertTrue(self.es_task_handler.mark_end_on_close)

    def test_close(self):
        self.es_task_handler.set_context(self.ti)
        self.es_task_handler.close()
        with open(
                os.path.join(self.local_log_location,
                             self.filename_template.format(try_number=1)),
                'r') as log_file:
            self.assertIn(self.end_of_log_mark, log_file.read())
        self.assertTrue(self.es_task_handler.closed)

    def test_close_no_mark_end(self):
        self.ti.raw = True
        self.es_task_handler.set_context(self.ti)
        self.es_task_handler.close()
        with open(
                os.path.join(self.local_log_location,
                             self.filename_template.format(try_number=1)),
                'r') as log_file:
            self.assertNotIn(self.end_of_log_mark, log_file.read())
        self.assertTrue(self.es_task_handler.closed)

    def test_close_closed(self):
        self.es_task_handler.closed = True
        self.es_task_handler.set_context(self.ti)
        self.es_task_handler.close()
        with open(
                os.path.join(self.local_log_location,
                             self.filename_template.format(try_number=1)),
                'r') as log_file:
            self.assertEqual(0, len(log_file.read()))

    def test_close_with_no_handler(self):
        self.es_task_handler.set_context(self.ti)
        self.es_task_handler.handler = None
        self.es_task_handler.close()
        with open(
                os.path.join(self.local_log_location,
                             self.filename_template.format(try_number=1)),
                'r') as log_file:
            self.assertEqual(0, len(log_file.read()))
        self.assertTrue(self.es_task_handler.closed)

    def test_close_with_no_stream(self):
        self.es_task_handler.set_context(self.ti)
        self.es_task_handler.handler.stream = None
        self.es_task_handler.close()
        with open(
                os.path.join(self.local_log_location,
                             self.filename_template.format(try_number=1)),
                'r') as log_file:
            self.assertIn(self.end_of_log_mark, log_file.read())
        self.assertTrue(self.es_task_handler.closed)

        self.es_task_handler.set_context(self.ti)
        self.es_task_handler.handler.stream.close()
        self.es_task_handler.close()
        with open(
                os.path.join(self.local_log_location,
                             self.filename_template.format(try_number=1)),
                'r') as log_file:
            self.assertIn(self.end_of_log_mark, log_file.read())
        self.assertTrue(self.es_task_handler.closed)

    def test_render_log_id(self):
        expected_log_id = 'dag_for_testing_file_task_handler-' \
                          'task_for_testing_file_log_handler-2016-01-01T00:00:00+00:00-1'
        log_id = self.es_task_handler._render_log_id(self.ti, 1)
        self.assertEqual(expected_log_id, log_id)

        # Switch to use jinja template.
        self.es_task_handler = ElasticsearchTaskHandler(
            self.local_log_location, self.filename_template,
            '{{ ti.dag_id }}-{{ ti.task_id }}-{{ ts }}-{{ try_number }}',
            self.end_of_log_mark)
        log_id = self.es_task_handler._render_log_id(self.ti, 1)
        self.assertEqual(expected_log_id, log_id)
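The test above asserts that the str.format style template and the Jinja style template resolve to the same log id. A minimal standalone sketch of that equivalence, using jinja2 directly and flattening the ti context to plain names (an assumption of this sketch, not how the handler builds its context):

import jinja2

dag_id = 'dag_for_testing_file_task_handler'
task_id = 'task_for_testing_file_log_handler'
ts = '2016-01-01T00:00:00+00:00'
try_number = 1

# str.format style, like the handler's default log_id_template
format_style = '{dag_id}-{task_id}-{execution_date}-{try_number}'.format(
    dag_id=dag_id, task_id=task_id, execution_date=ts, try_number=try_number)

# Jinja style, like the template passed to ElasticsearchTaskHandler above
jinja_style = jinja2.Template(
    '{{ dag_id }}-{{ task_id }}-{{ ts }}-{{ try_number }}').render(
    dag_id=dag_id, task_id=task_id, ts=ts, try_number=try_number)

assert format_style == jinja_style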
Exemplo n.º 37
0
import logging
import os
import unittest
from datetime import timedelta, date

from airflow import configuration
from airflow.exceptions import AirflowException
from airflow.models import TaskInstance as TI, DAG, DagRun
from airflow.operators.dummy_operator import DummyOperator
from airflow.operators.python_operator import PythonOperator, BranchPythonOperator
from airflow.operators.python_operator import ShortCircuitOperator
from airflow.settings import Session
from airflow.utils import timezone
from airflow.utils.state import State

DEFAULT_DATE = timezone.datetime(2016, 1, 1)
END_DATE = timezone.datetime(2016, 1, 2)
INTERVAL = timedelta(hours=12)
FROZEN_NOW = timezone.datetime(2016, 1, 2, 12, 1, 1)

TI_CONTEXT_ENV_VARS = ['AIRFLOW_CTX_DAG_ID',
                       'AIRFLOW_CTX_TASK_ID',
                       'AIRFLOW_CTX_EXECUTION_DATE',
                       'AIRFLOW_CTX_DAG_RUN_ID']


class Call:
    def __init__(self, *args, **kwargs):
        self.args = args
        self.kwargs = kwargs
Exemplo n.º 38
0
class ViewWithDateTimeAndNumRunsAndDagRunsFormTester:
    DAG_ID = 'dag_for_testing_dt_nr_dr_form'
    DEFAULT_DATE = datetime(2017, 9, 1)
    RUNS_DATA = [
        ('dag_run_for_testing_dt_nr_dr_form_4', datetime(2018, 4, 4)),
        ('dag_run_for_testing_dt_nr_dr_form_3', datetime(2018, 3, 3)),
        ('dag_run_for_testing_dt_nr_dr_form_2', datetime(2018, 2, 2)),
        ('dag_run_for_testing_dt_nr_dr_form_1', datetime(2018, 1, 1)),
    ]

    def __init__(self, test, endpoint):
        self.test = test
        self.endpoint = endpoint

    def setUp(self):
        configuration.load_test_config()
        app = application.create_app(testing=True)
        app.config['WTF_CSRF_METHODS'] = []
        self.app = app.test_client()
        self.session = Session()
        from airflow.www.views import dagbag
        from airflow.utils.state import State
        dag = DAG(self.DAG_ID, start_date=self.DEFAULT_DATE)
        dagbag.bag_dag(dag, parent_dag=dag, root_dag=dag)
        self.runs = []
        for rd in self.RUNS_DATA:
            run = dag.create_dagrun(
                run_id=rd[0],
                execution_date=rd[1],
                state=State.SUCCESS,
                external_trigger=True
            )
            self.runs.append(run)

    def tearDown(self):
        self.session.query(DagRun).filter(
            DagRun.dag_id == self.DAG_ID).delete()
        self.session.commit()
        self.session.close()

    def assertBaseDateAndNumRuns(self, base_date, num_runs, data):
        self.test.assertNotIn('name="base_date" value="{}"'.format(base_date), data)
        self.test.assertNotIn('<option selected="" value="{}">{}</option>'.format(
            num_runs, num_runs), data)

    def assertRunIsNotInDropdown(self, run, data):
        self.test.assertNotIn(run.execution_date.isoformat(), data)
        self.test.assertNotIn(run.run_id, data)

    def assertRunIsInDropdownNotSelected(self, run, data):
        self.test.assertIn('<option value="{}">{}</option>'.format(
            run.execution_date.isoformat(), run.run_id), data)

    def assertRunIsSelected(self, run, data):
        self.test.assertIn('<option selected value="{}">{}</option>'.format(
            run.execution_date.isoformat(), run.run_id), data)

    def test_with_default_parameters(self):
        """
        Tests graph view with no URL parameter.
        Should show all dag runs in the drop down.
        Should select the latest dag run.
        Should set base date to current date (not asserted)
        """
        response = self.app.get(
            self.endpoint
        )
        self.test.assertEqual(response.status_code, 200)
        data = response.data.decode('utf-8')
        self.test.assertIn('Base date:', data)
        self.test.assertIn('Number of runs:', data)
        self.assertRunIsSelected(self.runs[0], data)
        self.assertRunIsInDropdownNotSelected(self.runs[1], data)
        self.assertRunIsInDropdownNotSelected(self.runs[2], data)
        self.assertRunIsInDropdownNotSelected(self.runs[3], data)

    def test_with_execution_date_parameter_only(self):
        """
        Tests graph view with execution_date URL parameter.
        Scenario: click link from dag runs view.
        Should only show dag runs older than execution_date in the drop down.
        Should select the particular dag run.
        Should set base date to execution date.
        """
        response = self.app.get(
            self.endpoint + '&execution_date={}'.format(
                self.runs[1].execution_date.isoformat())
        )
        self.test.assertEqual(response.status_code, 200)
        data = response.data.decode('utf-8')
        self.assertBaseDateAndNumRuns(
            self.runs[1].execution_date,
            configuration.getint('webserver', 'default_dag_run_display_number'),
            data)
        self.assertRunIsNotInDropdown(self.runs[0], data)
        self.assertRunIsSelected(self.runs[1], data)
        self.assertRunIsInDropdownNotSelected(self.runs[2], data)
        self.assertRunIsInDropdownNotSelected(self.runs[3], data)

    def test_with_base_date_and_num_runs_parameters_only(self):
        """
        Tests graph view with base_date and num_runs URL parameters.
        Should only show dag runs older than base_date in the drop down,
        limited to num_runs.
        Should select the latest dag run.
        Should set base date and num runs to submitted values.
        """
        response = self.app.get(
            self.endpoint + '&base_date={}&num_runs=2'.format(
                self.runs[1].execution_date.isoformat())
        )
        self.test.assertEqual(response.status_code, 200)
        data = response.data.decode('utf-8')
        self.assertBaseDateAndNumRuns(self.runs[1].execution_date, 2, data)
        self.assertRunIsNotInDropdown(self.runs[0], data)
        self.assertRunIsSelected(self.runs[1], data)
        self.assertRunIsInDropdownNotSelected(self.runs[2], data)
        self.assertRunIsNotInDropdown(self.runs[3], data)

    def test_with_base_date_and_num_runs_and_execution_date_outside(self):
        """
        Tests graph view with base_date and num_runs and execution-date URL parameters.
        Scenario: change the base date and num runs and press "Go",
        the selected execution date is outside the new range.
        Should only show dag runs older than base_date in the drop down.
        Should select the latest dag run within the range.
        Should set base date and num runs to submitted values.
        """
        response = self.app.get(
            self.endpoint + '&base_date={}&num_runs=42&execution_date={}'.format(
                self.runs[1].execution_date.isoformat(),
                self.runs[0].execution_date.isoformat())
        )
        self.test.assertEqual(response.status_code, 200)
        data = response.data.decode('utf-8')
        self.assertBaseDateAndNumRuns(self.runs[1].execution_date, 42, data)
        self.assertRunIsNotInDropdown(self.runs[0], data)
        self.assertRunIsSelected(self.runs[1], data)
        self.assertRunIsInDropdownNotSelected(self.runs[2], data)
        self.assertRunIsInDropdownNotSelected(self.runs[3], data)

    def test_with_base_date_and_num_runs_and_execution_date_within(self):
        """
        Tests graph view with base_date and num_runs and execution-date URL parameters.
        Scenario: change the base date and num runs and press "Go",
        the selected execution date is within the new range.
        Should only show dag runs older than base_date in the drop down.
        Should select the dag run with the execution date.
        Should set base date and num runs to submitted values.
        """
        response = self.app.get(
            self.endpoint + '&base_date={}&num_runs=5&execution_date={}'.format(
                self.runs[2].execution_date.isoformat(),
                self.runs[3].execution_date.isoformat())
        )
        self.test.assertEqual(response.status_code, 200)
        data = response.data.decode('utf-8')
        self.assertBaseDateAndNumRuns(self.runs[2].execution_date, 5, data)
        self.assertRunIsNotInDropdown(self.runs[0], data)
        self.assertRunIsNotInDropdown(self.runs[1], data)
        self.assertRunIsInDropdownNotSelected(self.runs[2], data)
        self.assertRunIsSelected(self.runs[3], data)
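The class above is not a TestCase itself; it is a helper that receives the real test instance and the endpoint to hit. A hedged sketch of how a concrete wrapper might drive it (class name and endpoint string are illustrative, not taken from the suite):

import unittest

class GraphViewDtNrDrFormTest(unittest.TestCase):
    def setUp(self):
        # Pass this TestCase and the view endpoint into the helper.
        self.tester = ViewWithDateTimeAndNumRunsAndDagRunsFormTester(
            self, '/admin/airflow/graph?dag_id=dag_for_testing_dt_nr_dr_form')
        self.tester.setUp()

    def tearDown(self):
        self.tester.tearDown()

    def test_with_default_parameters(self):
        self.tester.test_with_default_parameters()

    def test_with_execution_date_parameter_only(self):
        self.tester.test_with_execution_date_parameter_only()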
Exemplo n.º 39
0
    def __init__(self,
                 dag_directory: str,
                 file_paths: List[str],
                 max_runs: int,
                 processor_factory: Callable[[str, List[Any]],
                                             AbstractDagFileProcessorProcess],
                 processor_timeout: timedelta,
                 signal_conn: Connection,
                 async_mode: bool = True):
        self._file_paths = file_paths
        self._file_path_queue: List[str] = []
        self._dag_directory = dag_directory
        self._max_runs = max_runs
        self._processor_factory = processor_factory
        self._signal_conn = signal_conn
        self._async_mode = async_mode
        self._parsing_start_time: Optional[datetime] = None

        self._parallelism = conf.getint('scheduler', 'max_threads')
        if 'sqlite' in conf.get('core',
                                'sql_alchemy_conn') and self._parallelism > 1:
            self.log.warning(
                "Because we cannot use more than 1 thread (max_threads = %d) "
                "when using sqlite, parallelism is set to 1.",
                self._parallelism)
            self._parallelism = 1

        # Parse and schedule each file no faster than this interval.
        self._file_process_interval = conf.getint('scheduler',
                                                  'min_file_process_interval')
        # How often to print out DAG file processing stats to the log. Default to
        # 30 seconds.
        self.print_stats_interval = conf.getint('scheduler',
                                                'print_stats_interval')
        # How many seconds we wait for tasks to heartbeat before marking them as zombies.
        self._zombie_threshold_secs = (conf.getint(
            'scheduler', 'scheduler_zombie_task_threshold'))
        # Map from file path to the processor
        self._processors: Dict[str, AbstractDagFileProcessorProcess] = {}

        self._heartbeat_count = 0

        # Map from file path to stats about the file
        self._file_stats: Dict[str, DagFileStat] = {}

        self._last_zombie_query_time = None
        # Last time that the DAG dir was traversed to look for files
        self.last_dag_dir_refresh_time = timezone.utcnow()
        # Last time stats were printed
        self.last_stat_print_time = timezone.datetime(2000, 1, 1)
        # TODO: Remove magic number
        self._zombie_query_interval = 10
        self._zombies: List[SimpleTaskInstance] = []
        # How long to wait before timing out a process to parse a DAG file
        self._processor_timeout = processor_timeout

        # How often to scan the DAGs directory for new files. Default to 5 minutes.
        self.dag_dir_list_interval = conf.getint('scheduler',
                                                 'dag_dir_list_interval')

        self._log = logging.getLogger('airflow.processor_manager')

        signal.signal(signal.SIGINT, self._exit_gracefully)
        signal.signal(signal.SIGTERM, self._exit_gracefully)
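The sqlite check in this constructor is the one place where a configured value is overridden at runtime. A standalone restatement of that clamp, as a hedged sketch (the helper name is illustrative):

from airflow.configuration import conf

def effective_parallelism():
    # sqlite cannot serve concurrent scheduler threads, so any configured
    # max_threads above 1 is forced back down to 1, mirroring __init__ above.
    parallelism = conf.getint('scheduler', 'max_threads')
    if 'sqlite' in conf.get('core', 'sql_alchemy_conn') and parallelism > 1:
        parallelism = 1
    return parallelism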
Exemplo n.º 40
0
class TestRenderedTaskInstanceFields(unittest.TestCase):
    """Unit tests for RenderedTaskInstanceFields."""

    def setUp(self):
        clear_rendered_ti_fields()

    def tearDown(self):
        clear_rendered_ti_fields()

    @parameterized.expand(
        [
            (None, None),
            ([], []),
            ({}, {}),
            ("test-string", "test-string"),
            ({"foo": "bar"}, {"foo": "bar"}),
            ("{{ task.task_id }}", "test"),
            (date(2018, 12, 6), "2018-12-06"),
            (datetime(2018, 12, 6, 10, 55), "2018-12-06 10:55:00+00:00"),
            (
                ClassWithCustomAttributes(
                    att1="{{ task.task_id }}", att2="{{ task.task_id }}", template_fields=["att1"]
                ),
                "ClassWithCustomAttributes({'att1': 'test', 'att2': '{{ task.task_id }}', "
                "'template_fields': ['att1']})",
            ),
            (
                ClassWithCustomAttributes(
                    nested1=ClassWithCustomAttributes(
                        att1="{{ task.task_id }}", att2="{{ task.task_id }}", template_fields=["att1"]
                    ),
                    nested2=ClassWithCustomAttributes(
                        att3="{{ task.task_id }}", att4="{{ task.task_id }}", template_fields=["att3"]
                    ),
                    template_fields=["nested1"],
                ),
                "ClassWithCustomAttributes({'nested1': ClassWithCustomAttributes("
                "{'att1': 'test', 'att2': '{{ task.task_id }}', 'template_fields': ['att1']}), "
                "'nested2': ClassWithCustomAttributes("
                "{'att3': '{{ task.task_id }}', 'att4': '{{ task.task_id }}', 'template_fields': ['att3']}), "
                "'template_fields': ['nested1']})",
            ),
        ]
    )
    def test_get_templated_fields(self, templated_field, expected_rendered_field):
        """
        Test that template_fields are rendered correctly, stored in the Database,
        and are correctly fetched using RTIF.get_templated_fields
        """
        dag = DAG("test_serialized_rendered_fields", start_date=START_DATE)
        with dag:
            task = BashOperator(task_id="test", bash_command=templated_field)

        ti = TI(task=task, execution_date=EXECUTION_DATE)
        rtif = RTIF(ti=ti)
        self.assertEqual(ti.dag_id, rtif.dag_id)
        self.assertEqual(ti.task_id, rtif.task_id)
        self.assertEqual(ti.execution_date, rtif.execution_date)
        self.assertEqual(expected_rendered_field, rtif.rendered_fields.get("bash_command"))

        with create_session() as session:
            session.add(rtif)

        self.assertEqual(
            {"bash_command": expected_rendered_field, "env": None}, RTIF.get_templated_fields(ti=ti)
        )

        # Test the else part of get_templated_fields
        # i.e. for the TIs that are not stored in RTIF table
        # Fetching them will return None
        with dag:
            task_2 = BashOperator(task_id="test2", bash_command=templated_field)

        ti2 = TI(task_2, EXECUTION_DATE)
        self.assertIsNone(RTIF.get_templated_fields(ti=ti2))

    @parameterized.expand(
        [
            (0, 1, 0, 1),
            (1, 1, 1, 1),
            (1, 0, 1, 0),
            (3, 1, 1, 1),
            (4, 2, 2, 1),
            (5, 2, 2, 1),
        ]
    )
    def test_delete_old_records(self, rtif_num, num_to_keep, remaining_rtifs, expected_query_count):
        """
        Test that old records are deleted from rendered_task_instance_fields table
        for a given task_id and dag_id.
        """
        session = settings.Session()
        dag = DAG("test_delete_old_records", start_date=START_DATE)
        with dag:
            task = BashOperator(task_id="test", bash_command="echo {{ ds }}")

        rtif_list = [
            RTIF(TI(task=task, execution_date=EXECUTION_DATE + timedelta(days=num)))
            for num in range(rtif_num)
        ]

        session.add_all(rtif_list)
        session.commit()

        result = session.query(RTIF).filter(RTIF.dag_id == dag.dag_id, RTIF.task_id == task.task_id).all()

        for rtif in rtif_list:
            self.assertIn(rtif, result)

        self.assertEqual(rtif_num, len(result))

        # Verify old records are deleted and only 'num_to_keep' records are kept
        with assert_queries_count(expected_query_count):
            RTIF.delete_old_records(task_id=task.task_id, dag_id=task.dag_id, num_to_keep=num_to_keep)
        result = session.query(RTIF).filter(RTIF.dag_id == dag.dag_id, RTIF.task_id == task.task_id).all()
        self.assertEqual(remaining_rtifs, len(result))

    def test_write(self):
        """
        Test records can be written and overwritten
        """
        Variable.set(key="test_key", value="test_val")

        session = settings.Session()
        result = session.query(RTIF).all()
        self.assertEqual([], result)

        with DAG("test_write", start_date=START_DATE):
            task = BashOperator(task_id="test", bash_command="echo {{ var.value.test_key }}")

        rtif = RTIF(TI(task=task, execution_date=EXECUTION_DATE))
        rtif.write()
        result = (
            session.query(RTIF.dag_id, RTIF.task_id, RTIF.rendered_fields)
            .filter(
                RTIF.dag_id == rtif.dag_id,
                RTIF.task_id == rtif.task_id,
                RTIF.execution_date == rtif.execution_date,
            )
            .first()
        )
        self.assertEqual(('test_write', 'test', {'bash_command': 'echo test_val', 'env': None}), result)

        # Test that overwrite saves new values to the DB
        Variable.delete("test_key")
        Variable.set(key="test_key", value="test_val_updated")

        with DAG("test_write", start_date=START_DATE):
            updated_task = BashOperator(task_id="test", bash_command="echo {{ var.value.test_key }}")

        rtif_updated = RTIF(TI(task=updated_task, execution_date=EXECUTION_DATE))
        rtif_updated.write()

        result_updated = (
            session.query(RTIF.dag_id, RTIF.task_id, RTIF.rendered_fields)
            .filter(
                RTIF.dag_id == rtif_updated.dag_id,
                RTIF.task_id == rtif_updated.task_id,
                RTIF.execution_date == rtif_updated.execution_date,
            )
            .first()
        )
        self.assertEqual(
            ('test_write', 'test', {'bash_command': 'echo test_val_updated', 'env': None}), result_updated
        )

    @mock.patch.dict(os.environ, {"AIRFLOW_IS_K8S_EXECUTOR_POD": "True"})
    @mock.patch("airflow.settings.pod_mutation_hook")
    def test_get_k8s_pod_yaml(self, mock_pod_mutation_hook):
        """
        Test that k8s_pod_yaml is rendered correctly, stored in the Database,
        and are correctly fetched using RTIF.get_k8s_pod_yaml
        """
        dag = DAG("test_get_k8s_pod_yaml", start_date=START_DATE)
        with dag:
            task = BashOperator(task_id="test", bash_command="echo hi")

        ti = TI(task=task, execution_date=EXECUTION_DATE)
        rtif = RTIF(ti=ti)

        # Test that pod_mutation_hook is called
        mock_pod_mutation_hook.assert_called_once_with(mock.ANY)

        self.assertEqual(ti.dag_id, rtif.dag_id)
        self.assertEqual(ti.task_id, rtif.task_id)
        self.assertEqual(ti.execution_date, rtif.execution_date)

        expected_pod_yaml = {
            'metadata': {
                'annotations': {
                    'dag_id': 'test_get_k8s_pod_yaml',
                    'execution_date': '2019-01-01T00:00:00+00:00',
                    'task_id': 'test',
                    'try_number': '1',
                },
                'labels': {
                    'airflow-worker': 'worker-config',
                    'airflow_version': version,
                    'dag_id': 'test_get_k8s_pod_yaml',
                    'execution_date': '2019-01-01T00_00_00_plus_00_00',
                    'kubernetes_executor': 'True',
                    'task_id': 'test',
                    'try_number': '1',
                },
                'name': mock.ANY,
                'namespace': 'default',
            },
            'spec': {
                'containers': [
                    {
                        'command': [
                            'airflow',
                            'tasks',
                            'run',
                            'test_get_k8s_pod_yaml',
                            'test',
                            '2019-01-01T00:00:00+00:00',
                        ],
                        'image': ':',
                        'name': 'base',
                        'env': [{'name': 'AIRFLOW_IS_K8S_EXECUTOR_POD', 'value': 'True'}],
                    }
                ]
            },
        }

        self.assertEqual(expected_pod_yaml, rtif.k8s_pod_yaml)

        with create_session() as session:
            session.add(rtif)

        self.assertEqual(expected_pod_yaml, RTIF.get_k8s_pod_yaml(ti=ti))

        # Test the else part of get_k8s_pod_yaml
        # i.e. for the TIs that are not stored in RTIF table
        # Fetching them will return None
        with dag:
            task_2 = BashOperator(task_id="test2", bash_command="echo hello")

        ti2 = TI(task_2, EXECUTION_DATE)
        self.assertIsNone(RTIF.get_k8s_pod_yaml(ti=ti2))
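Outside the test harness, the same write/read round trip looks roughly like the sketch below, reusing this module's imports plus START_DATE and EXECUTION_DATE; the dag_id and task_id are illustrative and an initialised metadata database is assumed:

with DAG("rtif_usage_sketch", start_date=START_DATE):
    task = BashOperator(task_id="sketch", bash_command="echo {{ ds }}")

ti = TI(task=task, execution_date=EXECUTION_DATE)
rtif = RTIF(ti=ti)   # templated fields are rendered when the row object is built
rtif.write()         # upsert into the rendered_task_instance_fields table

# With EXECUTION_DATE = 2019-01-01, "{{ ds }}" renders to '2019-01-01'
assert RTIF.get_templated_fields(ti=ti) == {
    'bash_command': 'echo 2019-01-01', 'env': None}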
Exemplo n.º 41
0
    def test_round_time(self):

        rt1 = round_time(datetime(2015, 1, 1, 6), timedelta(days=1))
        self.assertEqual(datetime(2015, 1, 1, 0, 0), rt1)

        rt2 = round_time(datetime(2015, 1, 2), relativedelta(months=1))
        self.assertEqual(datetime(2015, 1, 1, 0, 0), rt2)

        rt3 = round_time(datetime(2015, 9, 16, 0, 0), timedelta(1),
                         datetime(2015, 9, 14, 0, 0))
        self.assertEqual(datetime(2015, 9, 16, 0, 0), rt3)

        rt4 = round_time(datetime(2015, 9, 15, 0, 0), timedelta(1),
                         datetime(2015, 9, 14, 0, 0))
        self.assertEqual(datetime(2015, 9, 15, 0, 0), rt4)

        rt5 = round_time(datetime(2015, 9, 14, 0, 0), timedelta(1),
                         datetime(2015, 9, 14, 0, 0))
        self.assertEqual(datetime(2015, 9, 14, 0, 0), rt5)

        rt6 = round_time(datetime(2015, 9, 13, 0, 0), timedelta(1),
                         datetime(2015, 9, 14, 0, 0))
        self.assertEqual(datetime(2015, 9, 14, 0, 0), rt6)
Exemplo n.º 42
0
    def test_round_time(self):

        rt1 = dates.round_time(timezone.datetime(2015, 1, 1, 6),
                               timedelta(days=1))
        assert timezone.datetime(2015, 1, 1, 0, 0) == rt1

        rt2 = dates.round_time(timezone.datetime(2015, 1, 2),
                               relativedelta(months=1))
        assert timezone.datetime(2015, 1, 1, 0, 0) == rt2

        rt3 = dates.round_time(timezone.datetime(2015, 9, 16, 0, 0),
                               timedelta(1),
                               timezone.datetime(2015, 9, 14, 0, 0))
        assert timezone.datetime(2015, 9, 16, 0, 0) == rt3

        rt4 = dates.round_time(timezone.datetime(2015, 9, 15, 0, 0),
                               timedelta(1),
                               timezone.datetime(2015, 9, 14, 0, 0))
        assert timezone.datetime(2015, 9, 15, 0, 0) == rt4

        rt5 = dates.round_time(timezone.datetime(2015, 9, 14, 0, 0),
                               timedelta(1),
                               timezone.datetime(2015, 9, 14, 0, 0))
        assert timezone.datetime(2015, 9, 14, 0, 0) == rt5

        rt6 = dates.round_time(timezone.datetime(2015, 9, 13, 0, 0),
                               timedelta(1),
                               timezone.datetime(2015, 9, 14, 0, 0))
        assert timezone.datetime(2015, 9, 14, 0, 0) == rt6
Exemplo n.º 43
0
from airflow import DAG
from airflow.operators.bash_operator import BashOperator
from airflow.utils.timezone import datetime

dag = DAG(
    dag_id="example_dag",
    default_args={
        "start_date": datetime(2020, 5, 1),
        "owner": "airflow"
    },
    schedule_interval=None,
)

bash_task = BashOperator(
    task_id="bash_task",
    bash_command="echo Test",
    dag=dag,
)
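A hedged way to exercise this DAG without the scheduler, for example from a quick local check, is to build a TaskInstance for the start date and run it directly (this assumes an initialised Airflow metadata database):

from airflow.models import TaskInstance
from airflow.utils.timezone import datetime

ti = TaskInstance(task=bash_task, execution_date=datetime(2020, 5, 1))
ti.run(ignore_ti_state=True)   # executes "echo Test" locally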
Exemplo n.º 44
0
    def __init__(self,
                 dag_directory,
                 file_paths,
                 max_runs,
                 processor_factory,
                 processor_timeout,
                 signal_conn,
                 async_mode=True):
        """
        :param dag_directory: Directory where DAG definitions are kept. All
            files in file_paths should be under this directory
        :type dag_directory: unicode
        :param file_paths: list of file paths that contain DAG definitions
        :type file_paths: list[unicode]
        :param max_runs: The number of times to parse and schedule each file. -1
            for unlimited.
        :type max_runs: int
        :param processor_factory: function that creates processors for DAG
            definition files. Arguments are (dag_definition_path)
        :type processor_factory: (unicode, unicode, list) -> (AbstractDagFileProcessor)
        :param processor_timeout: How long to wait before timing out a DAG file processor
        :type processor_timeout: timedelta
        :param signal_conn: connection to communicate signal with processor agent.
        :type signal_conn: multiprocessing.connection.Connection
        :param async_mode: whether to start the manager in async mode
        :type async_mode: bool
        """
        self._file_paths = file_paths
        self._file_path_queue = []
        self._dag_directory = dag_directory
        self._max_runs = max_runs
        self._processor_factory = processor_factory
        self._signal_conn = signal_conn
        self._async_mode = async_mode

        self._parallelism = conf.getint('scheduler', 'max_threads')
        if 'sqlite' in conf.get('core',
                                'sql_alchemy_conn') and self._parallelism > 1:
            self.log.error("Cannot use more than 1 thread when using sqlite. "
                           "Setting parallelism to 1")
            self._parallelism = 1

        # Parse and schedule each file no faster than this interval.
        self._file_process_interval = conf.getint('scheduler',
                                                  'min_file_process_interval')
        # How often to print out DAG file processing stats to the log. Default to
        # 30 seconds.
        self.print_stats_interval = conf.getint('scheduler',
                                                'print_stats_interval')
        # Map from file path to the processor
        self._processors = {}
        # Map from file path to the last runtime
        self._last_runtime = {}
        # Map from file path to the last finish time
        self._last_finish_time = {}
        self._last_zombie_query_time = timezone.utcnow()
        # Last time that the DAG dir was traversed to look for files
        self.last_dag_dir_refresh_time = timezone.utcnow()
        # Last time stats were printed
        self.last_stat_print_time = timezone.datetime(2000, 1, 1)
        # TODO: Remove magic number
        self._zombie_query_interval = 10
        # Map from file path to the number of runs
        self._run_count = defaultdict(int)
        # Manager heartbeat key.
        self._heart_beat_key = 'heart-beat'
        # How long to wait before timing out a process to parse a DAG file
        self._processor_timeout = processor_timeout

        # How often to scan the DAGs directory for new files. Default to 5 minutes.
        self.dag_dir_list_interval = conf.getint('scheduler',
                                                 'dag_dir_list_interval')

        self._log = logging.getLogger('airflow.processor_manager')

        signal.signal(signal.SIGINT, self._exit_gracefully)
        signal.signal(signal.SIGTERM, self._exit_gracefully)
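The processor_factory documented above is only required to be a callable that returns a new processor for a given DAG definition file. A self-contained sketch with a stub class (both names are hypothetical, not Airflow classes):

class StubDagFileProcessor:
    """Hypothetical stand-in for an AbstractDagFileProcessor implementation."""

    def __init__(self, file_path, zombies):
        self.file_path = file_path
        self.zombies = zombies

def stub_processor_factory(file_path, zombies):
    # One fresh processor object per DAG definition file.
    return StubDagFileProcessor(file_path, zombies)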
Exemplo n.º 45
0
    def test_reschedule_handling(self, mock_pool_full):
        """
        Test that task reschedules are handled properly
        """
        # Mock the pool to report open slots, since the pool doesn't actually exist
        mock_pool_full.return_value = False

        # Return values of the python sensor callable, modified during tests
        done = False
        fail = False

        def callable():
            if fail:
                raise AirflowException()
            return done

        dag = models.DAG(dag_id='test_reschedule_handling')
        task = PythonSensor(task_id='test_reschedule_handling_sensor',
                            poke_interval=0,
                            mode='reschedule',
                            python_callable=callable,
                            retries=1,
                            retry_delay=datetime.timedelta(seconds=0),
                            dag=dag,
                            owner='airflow',
                            start_date=timezone.datetime(2016, 2, 1, 0, 0, 0))

        ti = TI(task=task, execution_date=timezone.utcnow())
        self.assertEqual(ti._try_number, 0)
        self.assertEqual(ti.try_number, 1)

        def run_ti_and_assert(run_date, expected_start_date, expected_end_date,
                              expected_duration, expected_state,
                              expected_try_number,
                              expected_task_reschedule_count):
            with freeze_time(run_date):
                try:
                    ti.run()
                except AirflowException:
                    if not fail:
                        raise
            ti.refresh_from_db()
            self.assertEqual(ti.state, expected_state)
            self.assertEqual(ti._try_number, expected_try_number)
            self.assertEqual(ti.try_number, expected_try_number + 1)
            self.assertEqual(ti.start_date, expected_start_date)
            self.assertEqual(ti.end_date, expected_end_date)
            self.assertEqual(ti.duration, expected_duration)
            trs = TaskReschedule.find_for_task_instance(ti)
            self.assertEqual(len(trs), expected_task_reschedule_count)

        date1 = timezone.utcnow()
        date2 = date1 + datetime.timedelta(minutes=1)
        date3 = date2 + datetime.timedelta(minutes=1)
        date4 = date3 + datetime.timedelta(minutes=1)

        # Run with multiple reschedules.
        # During reschedule the try number remains the same, but each reschedule is recorded.
        # The start date is expected to remain the initial date, hence the duration increases.
        # When finished the try number is incremented and there is no reschedule expected
        # for this try.

        done, fail = False, False
        run_ti_and_assert(date1, date1, date1, 0, State.UP_FOR_RESCHEDULE, 0,
                          1)

        done, fail = False, False
        run_ti_and_assert(date2, date1, date2, 60, State.UP_FOR_RESCHEDULE, 0,
                          2)

        done, fail = False, False
        run_ti_and_assert(date3, date1, date3, 120, State.UP_FOR_RESCHEDULE, 0,
                          3)

        done, fail = True, False
        run_ti_and_assert(date4, date1, date4, 180, State.SUCCESS, 1, 0)

        # Clear the task instance.
        dag.clear()
        ti.refresh_from_db()
        self.assertEqual(ti.state, State.NONE)
        self.assertEqual(ti._try_number, 1)

        # Run again after clearing with reschedules and a retry.
        # The retry increments the try number, and for that try no reschedule is expected.
        # After the retry the start date is reset, hence the duration is also reset.

        done, fail = False, False
        run_ti_and_assert(date1, date1, date1, 0, State.UP_FOR_RESCHEDULE, 1,
                          1)

        done, fail = False, True
        run_ti_and_assert(date2, date1, date2, 60, State.UP_FOR_RETRY, 2, 0)

        done, fail = False, False
        run_ti_and_assert(date3, date3, date3, 0, State.UP_FOR_RESCHEDULE, 2,
                          1)

        done, fail = True, False
        run_ti_and_assert(date4, date3, date4, 60, State.SUCCESS, 3, 0)
Exemplo n.º 46
0
class TestLogView(unittest.TestCase):
    DAG_ID = "dag_log_reader"
    TASK_ID = "task_log_reader"
    DEFAULT_DATE = timezone.datetime(2017, 9, 1)

    def setUp(self):
        self.maxDiff = None  # pylint: disable=invalid-name

        # Make sure that the configure_logging is not cached
        self.old_modules = dict(sys.modules)

        self.settings_folder = tempfile.mkdtemp()
        self.log_dir = tempfile.mkdtemp()

        self._configure_loggers()
        self._prepare_db()
        self._prepare_log_files()

    def _prepare_log_files(self):
        dir_path = f"{self.log_dir}/{self.DAG_ID}/{self.TASK_ID}/2017-09-01T00.00.00+00.00/"
        os.makedirs(dir_path)
        for try_number in range(1, 4):
            with open(f"{dir_path}/{try_number}.log", "w+") as file:
                file.write(f"try_number={try_number}.\n")
                file.flush()

    def _prepare_db(self):
        dag = DAG(self.DAG_ID, start_date=self.DEFAULT_DATE)
        dag.sync_to_db()
        with create_session() as session:
            op = DummyOperator(task_id=self.TASK_ID, dag=dag)
            self.ti = TaskInstance(task=op, execution_date=self.DEFAULT_DATE)
            self.ti.try_number = 3

            session.merge(self.ti)

    def _configure_loggers(self):
        logging_config = copy.deepcopy(DEFAULT_LOGGING_CONFIG)
        logging_config["handlers"]["task"]["base_log_folder"] = self.log_dir
        logging_config["handlers"]["task"][
            "filename_template"] = "{{ ti.dag_id }}/{{ ti.task_id }}/{{ ts | replace(':', '.') }}/{{ try_number }}.log"
        settings_file = os.path.join(self.settings_folder,
                                     "airflow_local_settings.py")
        with open(settings_file, "w") as handle:
            new_logging_file = f"LOGGING_CONFIG = {logging_config}"
            handle.writelines(new_logging_file)
        sys.path.append(self.settings_folder)
        with conf_vars({
            ("logging", "logging_config_class"):
                "airflow_local_settings.LOGGING_CONFIG"
        }):
            settings.configure_logging()

    def tearDown(self):
        logging.config.dictConfig(DEFAULT_LOGGING_CONFIG)
        clear_db_runs()

        # Remove any new modules imported during the test run. This lets us
        # import the same source files for more than one test.
        for mod in [m for m in sys.modules if m not in self.old_modules]:
            del sys.modules[mod]

        sys.path.remove(self.settings_folder)
        shutil.rmtree(self.settings_folder)
        shutil.rmtree(self.log_dir)
        super().tearDown()

    def test_read_log_chunks_should_read_one_try(self):
        task_log_reader = TaskLogReader()
        logs, metadatas = task_log_reader.read_log_chunks(ti=self.ti,
                                                          try_number=1,
                                                          metadata={})

        self.assertEqual(
            [(
                '',
                f"*** Reading local file: "
                f"{self.log_dir}/dag_log_reader/task_log_reader/2017-09-01T00.00.00+00.00/1.log\n"
                f"try_number=1.\n",
            )],
            logs[0],
        )
        self.assertEqual({"end_of_log": True}, metadatas)

    def test_read_log_chunks_should_read_all_files(self):
        task_log_reader = TaskLogReader()
        logs, metadatas = task_log_reader.read_log_chunks(ti=self.ti,
                                                          try_number=None,
                                                          metadata={})

        self.assertEqual(
            [
                [(
                    '',
                    "*** Reading local file: "
                    f"{self.log_dir}/dag_log_reader/task_log_reader/2017-09-01T00.00.00+00.00/1.log\n"
                    "try_number=1.\n",
                )],
                [(
                    '',
                    f"*** Reading local file: "
                    f"{self.log_dir}/dag_log_reader/task_log_reader/2017-09-01T00.00.00+00.00/2.log\n"
                    f"try_number=2.\n",
                )],
                [(
                    '',
                    f"*** Reading local file: "
                    f"{self.log_dir}/dag_log_reader/task_log_reader/2017-09-01T00.00.00+00.00/3.log\n"
                    f"try_number=3.\n",
                )],
            ],
            logs,
        )
        self.assertEqual({"end_of_log": True}, metadatas)

    def test_read_log_stream_should_read_one_try(self):
        task_log_reader = TaskLogReader()
        stream = task_log_reader.read_log_stream(ti=self.ti,
                                                 try_number=1,
                                                 metadata={})

        self.assertEqual(
            [
                "\n*** Reading local file: "
                f"{self.log_dir}/dag_log_reader/task_log_reader/2017-09-01T00.00.00+00.00/1.log\n"
                "try_number=1.\n"
                "\n"
            ],
            list(stream),
        )

    def test_read_log_stream_should_read_all_logs(self):
        task_log_reader = TaskLogReader()
        stream = task_log_reader.read_log_stream(ti=self.ti,
                                                 try_number=None,
                                                 metadata={})
        self.assertEqual(
            [
                "\n*** Reading local file: "
                f"{self.log_dir}/dag_log_reader/task_log_reader/2017-09-01T00.00.00+00.00/1.log\n"
                "try_number=1.\n"
                "\n",
                "\n*** Reading local file: "
                f"{self.log_dir}/dag_log_reader/task_log_reader/2017-09-01T00.00.00+00.00/2.log\n"
                "try_number=2.\n"
                "\n",
                "\n*** Reading local file: "
                f"{self.log_dir}/dag_log_reader/task_log_reader/2017-09-01T00.00.00+00.00/3.log\n"
                "try_number=3.\n"
                "\n",
            ],
            list(stream),
        )

    @mock.patch("airflow.utils.log.file_task_handler.FileTaskHandler.read")
    def test_read_log_stream_should_support_multiple_chunks(self, mock_read):
        first_return = ([[('', "1st line")]], [{}])
        second_return = ([[('', "2nd line")]], [{"end_of_log": False}])
        third_return = ([[('', "3rd line")]], [{"end_of_log": True}])
        fourth_return = ([[('', "should never be read")]], [{
            "end_of_log": True
        }])
        mock_read.side_effect = [
            first_return, second_return, third_return, fourth_return
        ]

        task_log_reader = TaskLogReader()
        log_stream = task_log_reader.read_log_stream(ti=self.ti,
                                                     try_number=1,
                                                     metadata={})
        self.assertEqual(["\n1st line\n", "\n2nd line\n", "\n3rd line\n"],
                         list(log_stream))

        mock_read.assert_has_calls(
            [
                mock.call(self.ti, 1, metadata={}),
                mock.call(self.ti, 1, metadata={}),
                mock.call(self.ti, 1, metadata={"end_of_log": False}),
            ],
            any_order=False,
        )

    @mock.patch("airflow.utils.log.file_task_handler.FileTaskHandler.read")
    def test_read_log_stream_should_read_each_try_in_turn(self, mock_read):
        first_return = ([[('', "try_number=1.")]], [{"end_of_log": True}])
        second_return = ([[('', "try_number=2.")]], [{"end_of_log": True}])
        third_return = ([[('', "try_number=3.")]], [{"end_of_log": True}])
        fourth_return = ([[('', "should never be read")]], [{
            "end_of_log": True
        }])
        mock_read.side_effect = [
            first_return, second_return, third_return, fourth_return
        ]

        task_log_reader = TaskLogReader()
        log_stream = task_log_reader.read_log_stream(ti=self.ti,
                                                     try_number=None,
                                                     metadata={})
        self.assertEqual(
            ['\ntry_number=1.\n', '\ntry_number=2.\n', '\ntry_number=3.\n'],
            list(log_stream))

        mock_read.assert_has_calls(
            [
                mock.call(self.ti, 1, metadata={}),
                mock.call(self.ti, 2, metadata={}),
                mock.call(self.ti, 3, metadata={}),
            ],
            any_order=False,
        )
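Outside the mocks, the reader API exercised by these tests is driven the same way; a hedged usage sketch, assuming TaskLogReader is importable from airflow.utils.log.log_reader and that ti is a TaskInstance whose log files exist on disk:

from airflow.utils.log.log_reader import TaskLogReader

reader = TaskLogReader()
# One chunk for a specific try, or pass try_number=None to read every try in turn.
logs, metadata = reader.read_log_chunks(ti=ti, try_number=1, metadata={})
for chunk in reader.read_log_stream(ti=ti, try_number=None, metadata={}):
    print(chunk, end='')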
Exemplo n.º 47
0
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
#

import unittest
from airflow import DAG, configuration, models
from airflow.contrib.sensors.weekday_sensor import DayOfWeekSensor
from airflow.contrib.utils.weekday import WeekDay
from airflow.exceptions import AirflowSensorTimeout
from airflow.models import DagBag, TaskFail
from airflow.settings import Session
from airflow.utils.timezone import datetime

DEFAULT_DATE = datetime(2018, 12, 10)
WEEKDAY_DATE = datetime(2018, 12, 20)
WEEKEND_DATE = datetime(2018, 12, 22)
TEST_DAG_ID = 'weekday_sensor_dag'
DEV_NULL = '/dev/null'


class DayOfWeekSensorTests(unittest.TestCase):

    def setUp(self):
        configuration.load_test_config()
        self.dagbag = DagBag(
            dag_folder=DEV_NULL,
            include_examples=True
        )
        self.args = {
Exemplo n.º 48
0
from parameterized import parameterized

from airflow import settings
from airflow.models import Variable
from airflow.models.dag import DAG
from airflow.models.renderedtifields import RenderedTaskInstanceFields as RTIF
from airflow.models.taskinstance import TaskInstance as TI
from airflow.operators.bash import BashOperator
from airflow.utils.session import create_session
from airflow.utils.timezone import datetime
from airflow.version import version
from tests.test_utils.asserts import assert_queries_count
from tests.test_utils.db import clear_rendered_ti_fields

TEST_DAG = DAG("example_rendered_ti_field", schedule_interval=None)
START_DATE = datetime(2018, 1, 1)
EXECUTION_DATE = datetime(2019, 1, 1)


class ClassWithCustomAttributes:
    """Class for testing purpose: allows to create objects with custom attributes in one single statement."""

    def __init__(self, **kwargs):
        for key, value in kwargs.items():
            setattr(self, key, value)

    def __str__(self):
        return "{}({})".format(ClassWithCustomAttributes.__name__, str(self.__dict__))

    def __repr__(self):
        return self.__str__()
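A quick usage sketch of the helper above, showing the one-statement construction its docstring describes:

obj = ClassWithCustomAttributes(att1="{{ task.task_id }}", template_fields=["att1"])
print(obj)
# -> ClassWithCustomAttributes({'att1': '{{ task.task_id }}', 'template_fields': ['att1']})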
Exemplo n.º 49
0
import os
import json
from datetime import timedelta
from django.test import TestCase
from airflow import DAG, configuration
from airflow.utils import timezone
from airflow_ml.edinet_flow.workflow import EDINETMixin
from airflow_ml.edinet_flow.workflow import GetEDINETDocumentListOperator
from airflow_ml.edinet_flow.workflow import GetEDINETDocumentSensor
from airflow_ml.edinet_flow.workflow import RegisterDocumentOperator
from airflow_ml.edinet_flow.workflow import ExtractDocumentFeaturesOperator
from eagle.models.masters import EDINETDocument
from eagle.models import NumberOfExecutives

DEFAULT_DATE = timezone.datetime(2018, 9, 10)
# https://disclosure.edinet-fsa.go.jp/api/v1/documents.json?type=2&date=2018-09-10


class TestExtractDocumentFeaturesOperator(TestCase):
    @classmethod
    def setUpClass(cls):
        super().setUpClass()
        configuration.load_test_config()
        dag_id = "extract_document_feature_prepare_dag"
        cls.prepare_dag = DAG(dag_id=dag_id, start_date=DEFAULT_DATE)

        get_list = GetEDINETDocumentListOperator(task_id="get_document_list",
                                                 dag=cls.prepare_dag)
        get_document = GetEDINETDocumentSensor(max_retrieve=3,
                                               document_ids=("S100E2NM",
                                                             "S100E2S2"),
Exemplo n.º 50
0
    def test_reschedule_handling(self, mock_pool_full):
        """
        Test that task reschedules are handled properly
        """
        # Mock the pool to report open slots, since the pool doesn't actually exist
        mock_pool_full.return_value = False

        # Return values of the python sensor callable, modified during tests
        done = False
        fail = False

        def callable():
            if fail:
                raise AirflowException()
            return done

        dag = models.DAG(dag_id='test_reschedule_handling')
        task = PythonSensor(
            task_id='test_reschedule_handling_sensor',
            poke_interval=0,
            mode='reschedule',
            python_callable=callable,
            retries=1,
            retry_delay=datetime.timedelta(seconds=0),
            dag=dag,
            owner='airflow',
            start_date=timezone.datetime(2016, 2, 1, 0, 0, 0))

        ti = TI(task=task, execution_date=timezone.utcnow())
        self.assertEqual(ti._try_number, 0)
        self.assertEqual(ti.try_number, 1)

        def run_ti_and_assert(run_date, expected_start_date, expected_end_date, expected_duration,
                              expected_state, expected_try_number, expected_task_reschedule_count):
            with freeze_time(run_date):
                try:
                    ti.run()
                except AirflowException:
                    if not fail:
                        raise
            ti.refresh_from_db()
            self.assertEqual(ti.state, expected_state)
            self.assertEqual(ti._try_number, expected_try_number)
            self.assertEqual(ti.try_number, expected_try_number + 1)
            self.assertEqual(ti.start_date, expected_start_date)
            self.assertEqual(ti.end_date, expected_end_date)
            self.assertEqual(ti.duration, expected_duration)
            trs = TaskReschedule.find_for_task_instance(ti)
            self.assertEqual(len(trs), expected_task_reschedule_count)

        date1 = timezone.utcnow()
        date2 = date1 + datetime.timedelta(minutes=1)
        date3 = date2 + datetime.timedelta(minutes=1)
        date4 = date3 + datetime.timedelta(minutes=1)

        # Run with multiple reschedules.
        # During reschedule the try number remains the same, but each reschedule is recorded.
        # The start date is expected to remain the initial date, hence the duration increases.
        # When finished the try number is incremented and there is no reschedule expected
        # for this try.

        done, fail = False, False
        run_ti_and_assert(date1, date1, date1, 0, State.UP_FOR_RESCHEDULE, 0, 1)

        done, fail = False, False
        run_ti_and_assert(date2, date1, date2, 60, State.UP_FOR_RESCHEDULE, 0, 2)

        done, fail = False, False
        run_ti_and_assert(date3, date1, date3, 120, State.UP_FOR_RESCHEDULE, 0, 3)

        done, fail = True, False
        run_ti_and_assert(date4, date1, date4, 180, State.SUCCESS, 1, 0)

        # Clear the task instance.
        dag.clear()
        ti.refresh_from_db()
        self.assertEqual(ti.state, State.NONE)
        self.assertEqual(ti._try_number, 1)

        # Run again after clearing with reschedules and a retry.
        # The retry increments the try number, and for that try no reschedule is expected.
        # After the retry the start date is reset, hence the duration is also reset.

        done, fail = False, False
        run_ti_and_assert(date1, date1, date1, 0, State.UP_FOR_RESCHEDULE, 1, 1)

        done, fail = False, True
        run_ti_and_assert(date2, date1, date2, 60, State.UP_FOR_RETRY, 2, 0)

        done, fail = False, False
        run_ti_and_assert(date3, date3, date3, 0, State.UP_FOR_RESCHEDULE, 2, 1)

        done, fail = True, False
        run_ti_and_assert(date4, date3, date4, 60, State.SUCCESS, 3, 0)
Exemplo n.º 51
0
from unittest.mock import patch

import pytest
from freezegun import freeze_time

from airflow import settings
from airflow.models import DAG, TaskInstance
from airflow.operators.dummy import DummyOperator
from airflow.ti_deps.deps.runnable_exec_date_dep import RunnableExecDateDep
from airflow.utils.timezone import datetime


@freeze_time('2016-11-01')
@pytest.mark.parametrize(
    "allow_trigger_in_future,schedule_interval,execution_date,is_met",
    [
        (True, None, datetime(2016, 11, 3), True),
        (True, "@daily", datetime(2016, 11, 3), False),
        (False, None, datetime(2016, 11, 3), False),
        (False, "@daily", datetime(2016, 11, 3), False),
        (False, "@daily", datetime(2016, 11, 1), True),
        (False, None, datetime(2016, 11, 1), True),
    ],
)
def test_exec_date_dep(allow_trigger_in_future, schedule_interval,
                       execution_date, is_met):
    """
    If the dag's execution date is in the future and either allow_trigger_in_future=False
    or there is no schedule_interval, this dep should fail
    """

    with patch.object(settings, 'ALLOW_FUTURE_EXEC_DATES',
Exemplo n.º 52
0
class TestElasticsearchTaskHandler(unittest.TestCase):
    DAG_ID = 'dag_for_testing_file_task_handler'
    TASK_ID = 'task_for_testing_file_log_handler'
    EXECUTION_DATE = datetime(2016, 1, 1)
    LOG_ID = f'{DAG_ID}-{TASK_ID}-2016-01-01T00:00:00+00:00-1'

    @elasticmock
    def setUp(self):
        super().setUp()
        self.local_log_location = 'local/log/location'
        self.filename_template = '{try_number}.log'
        self.log_id_template = '{dag_id}-{task_id}-{execution_date}-{try_number}'
        self.end_of_log_mark = 'end_of_log\n'
        self.write_stdout = False
        self.json_format = False
        self.json_fields = 'asctime,filename,lineno,levelname,message'
        self.es_task_handler = ElasticsearchTaskHandler(
            self.local_log_location,
            self.filename_template,
            self.log_id_template,
            self.end_of_log_mark,
            self.write_stdout,
            self.json_format,
            self.json_fields,
        )

        self.es = elasticsearch.Elasticsearch(  # pylint: disable=invalid-name
            hosts=[{
                'host': 'localhost',
                'port': 9200
            }])
        self.index_name = 'test_index'
        self.doc_type = 'log'
        self.test_message = 'some random stuff'
        self.body = {
            'message': self.test_message,
            'log_id': self.LOG_ID,
            'offset': 1
        }

        self.es.index(index=self.index_name,
                      doc_type=self.doc_type,
                      body=self.body,
                      id=1)

        self.dag = DAG(self.DAG_ID, start_date=self.EXECUTION_DATE)
        task = DummyOperator(task_id=self.TASK_ID, dag=self.dag)
        self.ti = TaskInstance(task=task, execution_date=self.EXECUTION_DATE)
        self.ti.try_number = 1
        self.ti.state = State.RUNNING
        self.addCleanup(self.dag.clear)

    def tearDown(self):
        shutil.rmtree(self.local_log_location.split(os.path.sep)[0],
                      ignore_errors=True)

    def test_client(self):
        assert isinstance(self.es_task_handler.client,
                          elasticsearch.Elasticsearch)

    def test_client_with_config(self):
        es_conf = dict(conf.getsection("elasticsearch_configs"))
        expected_dict = {
            "use_ssl": False,
            "verify_certs": True,
        }
        assert es_conf == expected_dict
        # ensure creating with configs does not fail
        ElasticsearchTaskHandler(
            self.local_log_location,
            self.filename_template,
            self.log_id_template,
            self.end_of_log_mark,
            self.write_stdout,
            self.json_format,
            self.json_fields,
            es_conf,
        )

    def test_read(self):
        ts = pendulum.now()
        logs, metadatas = self.es_task_handler.read(
            self.ti, 1, {
                'offset': 0,
                'last_log_timestamp': str(ts),
                'end_of_log': False
            })

        assert 1 == len(logs)
        assert len(logs) == len(metadatas)
        assert len(logs[0]) == 1
        assert self.test_message == logs[0][0][-1]
        assert not metadatas[0]['end_of_log']
        assert '1' == metadatas[0]['offset']
        assert timezone.parse(metadatas[0]['last_log_timestamp']) > ts

    def test_read_with_match_phrase_query(self):
        similar_log_id = '{task_id}-{dag_id}-2016-01-01T00:00:00+00:00-1'.format(
            dag_id=TestElasticsearchTaskHandler.DAG_ID,
            task_id=TestElasticsearchTaskHandler.TASK_ID)
        another_test_message = 'another message'

        another_body = {
            'message': another_test_message,
            'log_id': similar_log_id,
            'offset': 1
        }
        self.es.index(index=self.index_name,
                      doc_type=self.doc_type,
                      body=another_body,
                      id=1)

        ts = pendulum.now()
        logs, metadatas = self.es_task_handler.read(
            self.ti, 1, {
                'offset': '0',
                'last_log_timestamp': str(ts),
                'end_of_log': False,
                'max_offset': 2
            })
        assert 1 == len(logs)
        assert len(logs) == len(metadatas)
        assert self.test_message == logs[0][0][-1]
        assert another_test_message != logs[0]

        assert not metadatas[0]['end_of_log']
        assert '1' == metadatas[0]['offset']
        assert timezone.parse(metadatas[0]['last_log_timestamp']) > ts

    def test_read_with_none_metadata(self):
        logs, metadatas = self.es_task_handler.read(self.ti, 1)
        assert 1 == len(logs)
        assert len(logs) == len(metadatas)
        assert self.test_message == logs[0][0][-1]
        assert not metadatas[0]['end_of_log']
        assert '1' == metadatas[0]['offset']
        assert timezone.parse(
            metadatas[0]['last_log_timestamp']) < pendulum.now()

    def test_read_nonexistent_log(self):
        ts = pendulum.now()
        # In ElasticMock, search is going to return all documents with matching index
        # and doc_type regardless of match filters, so we delete the log entry instead
        # of making a new TaskInstance to query.
        self.es.delete(index=self.index_name, doc_type=self.doc_type, id=1)
        logs, metadatas = self.es_task_handler.read(
            self.ti, 1, {
                'offset': 0,
                'last_log_timestamp': str(ts),
                'end_of_log': False
            })
        assert 1 == len(logs)
        assert len(logs) == len(metadatas)
        assert [[]] == logs
        assert not metadatas[0]['end_of_log']
        assert '0' == metadatas[0]['offset']
        # last_log_timestamp won't change if no log lines read.
        assert timezone.parse(metadatas[0]['last_log_timestamp']) == ts

    def test_read_with_empty_metadata(self):
        ts = pendulum.now()
        logs, metadatas = self.es_task_handler.read(self.ti, 1, {})
        assert 1 == len(logs)
        assert len(logs) == len(metadatas)
        assert self.test_message == logs[0][0][-1]
        assert not metadatas[0]['end_of_log']
        # offset should be initialized to 0 if not provided.
        assert '1' == metadatas[0]['offset']
        # last_log_timestamp will be initialized using log reading time
        # if last_log_timestamp is not provided.
        assert timezone.parse(metadatas[0]['last_log_timestamp']) > ts

        # case where offset is missing but metadata not empty.
        self.es.delete(index=self.index_name, doc_type=self.doc_type, id=1)
        logs, metadatas = self.es_task_handler.read(self.ti, 1,
                                                    {'end_of_log': False})
        assert 1 == len(logs)
        assert len(logs) == len(metadatas)
        assert [[]] == logs
        assert not metadatas[0]['end_of_log']
        # offset should be initialized to 0 if not provided.
        assert '0' == metadatas[0]['offset']
        # last_log_timestamp will be initialized using log reading time
        # if last_log_timestamp is not provided.
        assert timezone.parse(metadatas[0]['last_log_timestamp']) > ts

    def test_read_timeout(self):
        ts = pendulum.now().subtract(minutes=5)

        self.es.delete(index=self.index_name, doc_type=self.doc_type, id=1)
        logs, metadatas = self.es_task_handler.read(
            self.ti, 1, {
                'offset': 0,
                'last_log_timestamp': str(ts),
                'end_of_log': False
            })
        assert 1 == len(logs)
        assert len(logs) == len(metadatas)
        assert [[]] == logs
        assert metadatas[0]['end_of_log']
        # offset stays at 0 since no log lines were read before the timeout.
        assert '0' == metadatas[0]['offset']
        assert timezone.parse(metadatas[0]['last_log_timestamp']) == ts

    def test_read_as_download_logs(self):
        ts = pendulum.now()
        logs, metadatas = self.es_task_handler.read(
            self.ti,
            1,
            {
                'offset': 0,
                'last_log_timestamp': str(ts),
                'download_logs': True,
                'end_of_log': False
            },
        )
        assert 1 == len(logs)
        assert len(logs) == len(metadatas)
        assert len(logs[0]) == 1
        assert self.test_message == logs[0][0][-1]
        assert not metadatas[0]['end_of_log']
        assert metadatas[0]['download_logs']
        assert '1' == metadatas[0]['offset']
        assert timezone.parse(metadatas[0]['last_log_timestamp']) > ts

    def test_read_raises(self):
        with mock.patch.object(self.es_task_handler.log,
                               'exception') as mock_exception:
            with mock.patch(
                    "elasticsearch_dsl.Search.execute") as mock_execute:
                mock_execute.side_effect = Exception('Failed to read')
                logs, metadatas = self.es_task_handler.read(self.ti, 1)
            assert mock_exception.call_count == 1
            args, kwargs = mock_exception.call_args
            assert "Could not read log with log_id:" in args[0]

        assert 1 == len(logs)
        assert len(logs) == len(metadatas)
        assert [[]] == logs
        assert not metadatas[0]['end_of_log']
        assert '0' == metadatas[0]['offset']

    def test_set_context(self):
        self.es_task_handler.set_context(self.ti)
        assert self.es_task_handler.mark_end_on_close

    def test_set_context_w_json_format_and_write_stdout(self):
        formatter = logging.Formatter(
            '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        self.es_task_handler.formatter = formatter
        self.es_task_handler.write_stdout = True
        self.es_task_handler.json_format = True
        self.es_task_handler.set_context(self.ti)

    def test_read_with_json_format(self):
        ts = pendulum.now()
        formatter = logging.Formatter(
            '[%(asctime)s] {%(filename)s:%(lineno)d} %(levelname)s - %(message)s'
        )
        self.es_task_handler.formatter = formatter
        self.es_task_handler.json_format = True

        self.body = {
            'message': self.test_message,
            'log_id':
            f'{self.DAG_ID}-{self.TASK_ID}-2016_01_01T00_00_00_000000-1',
            'offset': 1,
            'asctime': '2020-12-24 19:25:00,962',
            'filename': 'taskinstance.py',
            'lineno': 851,
            'levelname': 'INFO',
        }
        self.es_task_handler.set_context(self.ti)
        self.es.index(index=self.index_name,
                      doc_type=self.doc_type,
                      body=self.body,
                      id=1)

        logs, _ = self.es_task_handler.read(self.ti, 1, {
            'offset': 0,
            'last_log_timestamp': str(ts),
            'end_of_log': False
        })
        assert "[2020-12-24 19:25:00,962] {taskinstance.py:851} INFO - some random stuff" == logs[
            0][0][1]

    def test_close(self):
        formatter = logging.Formatter(
            '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        self.es_task_handler.formatter = formatter

        self.es_task_handler.set_context(self.ti)
        self.es_task_handler.close()
        with open(
                os.path.join(
                    self.local_log_location,
                    self.filename_template.format(try_number=1))) as log_file:
            # end_of_log_mark may contain characters like '\n' that are needed
            # to get the log uploaded but are not stored in Elasticsearch,
            # so strip() both sides before comparing.
            log_line = log_file.read().strip()
            assert self.end_of_log_mark.strip() == log_line
        assert self.es_task_handler.closed

    def test_close_no_mark_end(self):
        self.ti.raw = True
        self.es_task_handler.set_context(self.ti)
        self.es_task_handler.close()
        with open(
                os.path.join(
                    self.local_log_location,
                    self.filename_template.format(try_number=1))) as log_file:
            assert self.end_of_log_mark not in log_file.read()
        assert self.es_task_handler.closed

    def test_close_closed(self):
        self.es_task_handler.closed = True
        self.es_task_handler.set_context(self.ti)
        self.es_task_handler.close()
        with open(
                os.path.join(
                    self.local_log_location,
                    self.filename_template.format(try_number=1))) as log_file:
            assert 0 == len(log_file.read())

    def test_close_with_no_handler(self):
        self.es_task_handler.set_context(self.ti)
        self.es_task_handler.handler = None
        self.es_task_handler.close()
        with open(
                os.path.join(
                    self.local_log_location,
                    self.filename_template.format(try_number=1))) as log_file:
            assert 0 == len(log_file.read())
        assert self.es_task_handler.closed

    def test_close_with_no_stream(self):
        self.es_task_handler.set_context(self.ti)
        self.es_task_handler.handler.stream = None
        self.es_task_handler.close()
        with open(
                os.path.join(
                    self.local_log_location,
                    self.filename_template.format(try_number=1))) as log_file:
            assert self.end_of_log_mark in log_file.read()
        assert self.es_task_handler.closed

        self.es_task_handler.set_context(self.ti)
        self.es_task_handler.handler.stream.close()
        self.es_task_handler.close()
        with open(
                os.path.join(
                    self.local_log_location,
                    self.filename_template.format(try_number=1))) as log_file:
            assert self.end_of_log_mark in log_file.read()
        assert self.es_task_handler.closed

    def test_render_log_id(self):
        expected_log_id = (
            'dag_for_testing_file_task_handler-'
            'task_for_testing_file_log_handler-2016-01-01T00:00:00+00:00-1')
        log_id = self.es_task_handler._render_log_id(self.ti, 1)
        assert expected_log_id == log_id

        # Switch to use jinja template.
        self.es_task_handler = ElasticsearchTaskHandler(
            self.local_log_location,
            self.filename_template,
            '{{ ti.dag_id }}-{{ ti.task_id }}-{{ ts }}-{{ try_number }}',
            self.end_of_log_mark,
            self.write_stdout,
            self.json_format,
            self.json_fields,
        )
        log_id = self.es_task_handler._render_log_id(self.ti, 1)
        assert expected_log_id == log_id

    def test_clean_execution_date(self):
        clean_execution_date = self.es_task_handler._clean_execution_date(
            datetime(2016, 7, 8, 9, 10, 11, 12))
        assert '2016_07_08T09_10_11_000012' == clean_execution_date

    @parameterized.expand([
        # Common case
        ('localhost:5601/{log_id}',
         'https://localhost:5601/' + quote(LOG_ID.replace('T', ' '))),
        # Ignore the template if "{log_id}" is missing from the URL.
        ('localhost:5601', 'https://localhost:5601'),
    ])
    def test_get_external_log_url(self, es_frontend, expected_url):
        es_task_handler = ElasticsearchTaskHandler(
            self.local_log_location,
            self.filename_template,
            self.log_id_template,
            self.end_of_log_mark,
            self.write_stdout,
            self.json_format,
            self.json_fields,
            frontend=es_frontend,
        )
        url = es_task_handler.get_external_log_url(self.ti, self.ti.try_number)
        assert expected_url == url
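
# A minimal usage sketch (not part of the tests above), assuming a configured
# ElasticsearchTaskHandler instance `handler` and a TaskInstance `ti`: the
# metadata dict returned by read() (offset, last_log_timestamp, end_of_log),
# exercised in the tests, is fed back into the next call to page through a log.
def read_full_log(handler, ti, try_number=1):
    metadata = {}  # read() fills in offset and last_log_timestamp when missing
    lines = []
    while True:
        logs, metadatas = handler.read(ti, try_number, metadata)
        lines.extend(logs[0])
        metadata = metadatas[0]
        if metadata['end_of_log']:
            return lines, metadata
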
from base64 import b64encode
from unittest import TestCase, main

import six

from parameterized import parameterized
from airflow.contrib.operators.ssh_operator import SSHOperator
from airflow.models import DAG, TaskInstance
from airflow.utils import timezone
from airflow.utils.timezone import datetime
from tests.test_utils.config import conf_vars

TEST_CONN_ID = "conn_id_for_testing"
TIMEOUT = 5
TEST_DAG_ID = 'unit_tests_ssh_test_op'
DEFAULT_DATE = datetime(2017, 1, 1)
COMMAND = "echo -n airflow"
COMMAND_WITH_SUDO = "sudo " + COMMAND


class SSHOperatorTest(TestCase):
    def setUp(self):
        from airflow.contrib.hooks.ssh_hook import SSHHook
        hook = SSHHook(ssh_conn_id='ssh_default')
        hook.no_host_key_check = True
        args = {
            'owner': 'airflow',
            'start_date': DEFAULT_DATE,
            'provide_context': True
        }
        dag = DAG(TEST_DAG_ID + 'test_schedule_dag_once', default_args=args)
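        # Hypothetical continuation (the source is truncated here): an operator
        # built from the hook, DAG, and constants above would typically look
        # like this; the exact arguments used in the original setUp are not shown.
        self.task = SSHOperator(
            task_id="test_ssh_operator",
            ssh_hook=hook,
            command=COMMAND,
            timeout=TIMEOUT,
            dag=dag,
        )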
Exemplo n.º 54
0
# Define the default arguments for the DAG below.


with DAG(
        dag_id='android_test2',  # DAG name
        default_args={
            'owner': 'test',  # owner of the workflow
            'depends_on_past': False,  # whether a run depends on the previous run of the same task
            'email': ['*****@*****.**'],  # list of emails that receive notifications
            'email_on_failure': True,  # send an email when the task fails
            'email_on_retry': True,  # send an email when the task is retried
            'retries': 3,  # number of retries on failure
            'retry_delay': timedelta(seconds=5),  # delay between retries
            'start_date': timezone.datetime(2019, 11, 23, 7, 20),  # schedule start, in UTC; for testing this is usually set to the current time minus one schedule interval
            'end_date': timezone.datetime(2019, 11, 23, 7, 30),  # end date
        },
        schedule_interval='@once',  # schedule interval: run exactly once
        # schedule_interval="00 * * * *"  # cron fields are minute, hour, day of month, month, day of week; this runs at the top of every hour
        # schedule_interval=timedelta(minutes=1)  # run every minute
) as dag:
    start_task = DummyOperator(
        task_id='run_this_first',
        queue='worker',
    )

    run_this_last = DummyOperator(task_id='run_this_last', queue='worker')
    # DummyOperator is a no-op placeholder operator.
    runner_conf_list = initRunnerConfig()  # provides the various runner_conf entries
    task_id_to_cmp_list = ['adb_shell_cmp_a', 'adb_shell_cmp_b']
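
    # Hypothetical continuation (not part of the original example): wire one
    # placeholder task per entry in task_id_to_cmp_list between start_task and
    # run_this_last. The real adb shell tasks and the use of runner_conf_list
    # are not shown in the source, so DummyOperator stands in for them here.
    for cmp_task_id in task_id_to_cmp_list:
        cmp_task = DummyOperator(task_id=cmp_task_id, queue='worker')
        start_task >> cmp_task >> run_this_last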
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.


import unittest

from airflow import DAG
from airflow import configuration
from airflow.contrib.hooks.mongo_hook import MongoHook
from airflow.contrib.sensors.mongo_sensor import MongoSensor
from airflow.models import Connection
from airflow.utils import db, timezone


DEFAULT_DATE = timezone.datetime(2017, 1, 1)


class TestMongoSensor(unittest.TestCase):

    def setUp(self):
        configuration.load_test_config()
        db.merge_conn(
            Connection(
                conn_id='mongo_test', conn_type='mongo',
                host='mongo', port='27017', schema='test'))

        args = {
            'owner': 'airflow',
            'start_date': DEFAULT_DATE
        }
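        # Hypothetical continuation (the source is truncated here): the sensor
        # would typically be bound to the 'mongo_test' connection created above;
        # the collection and query values below are placeholders.
        self.dag = DAG('test_mongo_sensor_dag', default_args=args)
        self.sensor = MongoSensor(
            task_id='check_mongo_document',
            mongo_conn_id='mongo_test',
            collection='test_collection',
            query={'status': 'ok'},
            dag=self.dag,
        )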
Exemplo n.º 56
0
import datetime
import unittest
from unittest import mock

import pandas as pd
from hmsclient import HMSClient

from airflow.exceptions import AirflowException
from airflow.models.connection import Connection
from airflow.models.dag import DAG
from airflow.providers.apache.hive.hooks.hive import HiveMetastoreHook, HiveServer2Hook
from airflow.secrets.environment_variables import CONN_ENV_PREFIX
from airflow.utils import timezone
from airflow.utils.operator_helpers import AIRFLOW_VAR_NAME_FORMAT_MAPPING
from tests.test_utils.asserts import assert_equal_ignore_multiple_spaces
from tests.test_utils.mock_hooks import MockHiveCliHook, MockHiveServer2Hook
from tests.test_utils.mock_process import MockSubProcess

DEFAULT_DATE = timezone.datetime(2015, 1, 1)
DEFAULT_DATE_ISO = DEFAULT_DATE.isoformat()
DEFAULT_DATE_DS = DEFAULT_DATE_ISO[:10]


class TestHiveEnvironment(unittest.TestCase):
    def setUp(self):
        self.next_day = (DEFAULT_DATE +
                         datetime.timedelta(days=1)).isoformat()[:10]
        self.database = 'airflow'
        self.partition_by = 'ds'
        self.table = 'static_babynames_partitioned'
        with mock.patch(
                'airflow.providers.apache.hive.hooks.hive.HiveMetastoreHook.get_metastore_client'
        ) as get_metastore_mock:
            get_metastore_mock.return_value = mock.MagicMock()
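            # Hypothetical continuation (the source is truncated here): the hook
            # is typically constructed inside this patch so that its metastore
            # client is the MagicMock configured above.
            self.hook = HiveMetastoreHook()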
Exemplo n.º 57
0
from unittest import mock

import pytest
from parameterized import parameterized

from airflow.models import DagBag, DagRun, SlaMiss, TaskInstance
from airflow.security import permissions
from airflow.utils.platform import getuser
from airflow.utils.session import provide_session
from airflow.utils.state import State
from airflow.utils.timezone import datetime
from airflow.utils.types import DagRunType
from tests.test_utils.api_connexion_utils import assert_401, create_user, delete_user
from tests.test_utils.db import clear_db_runs, clear_db_sla_miss

DEFAULT_DATETIME_1 = datetime(2020, 1, 1)
DEFAULT_DATETIME_STR_1 = "2020-01-01T00:00:00+00:00"
DEFAULT_DATETIME_STR_2 = "2020-01-02T00:00:00+00:00"


@pytest.fixture(scope="module")
def configured_app(minimal_app_for_api):
    app = minimal_app_for_api
    create_user(
        app,  # type: ignore
        username="******",
        role_name="Test",
        permissions=[
            (permissions.ACTION_CAN_READ, permissions.RESOURCE_DAG),
            (permissions.ACTION_CAN_READ, permissions.RESOURCE_DAG_RUN),
            (permissions.ACTION_CAN_READ, permissions.RESOURCE_TASK_INSTANCE),
Exemplo n.º 58
0
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
import time

from airflow.models import DAG
from airflow.operators.dummy_operator import DummyOperator
from airflow.utils.timezone import datetime


class DummyWithOnKill(DummyOperator):
    def execute(self, context):
        time.sleep(10)

    def on_kill(self):
        self.log.info("Executing on_kill")
        f = open("/tmp/airflow_on_kill", "w")
        f.write("ON_KILL_TEST")
        f.close()


# DAG tests backfill with pooled tasks
# Previously backfill would queue the task but never run it
dag1 = DAG(
    dag_id='test_on_kill',
    start_date=datetime(2015, 1, 1))
dag1_task1 = DummyWithOnKill(
    task_id='task1',
    dag=dag1,
    owner='airflow')
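

# A minimal check sketch (not part of the original file): calling on_kill()
# directly should leave the marker file behind; the real tests exercise this
# through task execution and process termination instead.
if __name__ == "__main__":
    dag1_task1.on_kill()
    with open("/tmp/airflow_on_kill") as marker:
        assert marker.read() == "ON_KILL_TEST"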
import unittest

from datetime import timedelta, time

from airflow import DAG, configuration, settings
from airflow import exceptions
from airflow.exceptions import AirflowSensorTimeout
from airflow.models import TaskInstance, DagBag
from airflow.operators.bash_operator import BashOperator
from airflow.operators.dummy_operator import DummyOperator
from airflow.sensors.external_task_sensor import ExternalTaskSensor
from airflow.sensors.time_sensor import TimeSensor
from airflow.utils.state import State
from airflow.utils.timezone import datetime

configuration.load_test_config()

DEFAULT_DATE = datetime(2015, 1, 1)
TEST_DAG_ID = 'unit_test_dag'
TEST_TASK_ID = 'time_sensor_check'
DEV_NULL = '/dev/null'


class ExternalTaskSensorTests(unittest.TestCase):

    def setUp(self):
        configuration.load_test_config()
        self.dagbag = DagBag(
            dag_folder=DEV_NULL,
            include_examples=True
        )
        self.args = {
            'owner': 'airflow',
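            # Hypothetical continuation (the source is truncated here); the
            # surrounding examples pair the owner with the module-level
            # DEFAULT_DATE, so that is assumed below.
            'start_date': DEFAULT_DATE,
        }
        self.dag = DAG(TEST_DAG_ID, default_args=self.args)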
Exemplo n.º 60
0
import datetime
import json
import urllib.parse

import pytest

from airflow.security import permissions
from airflow.utils import timezone
from airflow.utils.state import State
from airflow.utils.types import DagRunType
from tests.test_utils.api_connexion_utils import create_user
from tests.test_utils.db import clear_db_runs
from tests.test_utils.www import check_content_in_response, check_content_not_in_response, client_with_login

NEXT_YEAR = datetime.datetime.now().year + 1
DEFAULT_DATE = timezone.datetime(NEXT_YEAR, 6, 1)
USER_DATA = {
    "dag_tester": (
        "dag_acl_tester",
        {
            "first_name": 'dag_test',
            "last_name": 'dag_test',
            "email": '*****@*****.**',
            "password": '******',
        },
    ),
    "dag_faker": (  # User without permission.
        "dag_acl_faker",
        {
            "first_name": 'dag_faker',
            "last_name": 'dag_faker',