Example No. 1
    def execution_parallelism(self, parallelism=0):
        executor = LocalExecutor(parallelism=parallelism)
        executor.start()

        success_key = 'success {}'
        success_command = ['true', 'some_parameter']
        fail_command = ['false', 'some_parameter']
        self.assertTrue(executor.result_queue.empty())

        execution_date = datetime.datetime.now()
        for i in range(self.TEST_SUCCESS_COMMANDS):
            key_id, command = success_key.format(i), success_command
            key = key_id, 'fake_ti', execution_date, 0
            executor.running.add(key)
            executor.execute_async(key=key, command=command)

        fail_key = 'fail', 'fake_ti', execution_date, 0
        executor.running.add(fail_key)
        executor.execute_async(key=fail_key, command=fail_command)

        executor.end()
        # By this time the queues are already shut down, so we cannot check whether they are empty
        self.assertEqual(len(executor.running), 0)

        for i in range(self.TEST_SUCCESS_COMMANDS):
            key_id = success_key.format(i)
            key = key_id, 'fake_ti', execution_date, 0
            self.assertEqual(executor.event_buffer[key], State.SUCCESS)
        self.assertEqual(executor.event_buffer[fail_key], State.FAILED)

        expected = self.TEST_SUCCESS_COMMANDS + 1 if parallelism == 0 else parallelism
        self.assertEqual(executor.workers_used, expected)
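A helper like this is usually driven by small per-case test methods; the sketch below shows one plausible way to call it (the test method names and the parallelism values are assumptions, not part of the example above).

    # Minimal sketch, assuming it sits in the same TestCase as the helper above;
    # method names and parallelism values are illustrative only.
    def test_execution_unlimited_parallelism(self):
        self.execution_parallelism(parallelism=0)

    def test_execution_limited_parallelism(self):
        self.execution_parallelism(parallelism=2)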
Example No. 2
 def test_gauge_executor_metrics(self, mock_stats_gauge, mock_trigger_tasks, mock_sync):
     executor = LocalExecutor()
     executor.heartbeat()
     calls = [mock.call('executor.open_slots', mock.ANY),
              mock.call('executor.queued_tasks', mock.ANY),
              mock.call('executor.running_tasks', mock.ANY)]
     mock_stats_gauge.assert_has_calls(calls)
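The three mock arguments in this test are normally injected by stacked @mock.patch decorators (applied bottom-up, so the decorator closest to the function supplies the first argument). A minimal sketch under that assumption follows; the exact patch targets are guesses, not taken from the example.

    # Hypothetical decorator stack; the patch targets are assumptions.
    @mock.patch('airflow.executors.local_executor.LocalExecutor.sync')
    @mock.patch('airflow.executors.base_executor.BaseExecutor.trigger_tasks')
    @mock.patch('airflow.stats.Stats.gauge')
    def test_gauge_executor_metrics(self, mock_stats_gauge, mock_trigger_tasks, mock_sync):
        ...  # body as shown above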
Example No. 3
    def _test_execute(self, parallelism, success_command, fail_command):

        executor = LocalExecutor(parallelism=parallelism)
        executor.start()

        success_key = 'success {}'
        assert executor.result_queue.empty()

        execution_date = datetime.datetime.now()
        for i in range(self.TEST_SUCCESS_COMMANDS):
            key_id, command = success_key.format(i), success_command
            key = key_id, 'fake_ti', execution_date, 0
            executor.running.add(key)
            executor.execute_async(key=key, command=command)

        fail_key = 'fail', 'fake_ti', execution_date, 0
        executor.running.add(fail_key)
        executor.execute_async(key=fail_key, command=fail_command)

        executor.end()
        # By this time the queues are already shut down, so we cannot check whether they are empty
        assert len(executor.running) == 0

        for i in range(self.TEST_SUCCESS_COMMANDS):
            key_id = success_key.format(i)
            key = key_id, 'fake_ti', execution_date, 0
            assert executor.event_buffer[key][0] == State.SUCCESS
        assert executor.event_buffer[fail_key][0] == State.FAILED

        expected = self.TEST_SUCCESS_COMMANDS + 1 if parallelism == 0 else parallelism
        assert executor.workers_used == expected
Example No. 4
def _get_executor(executor_name):
    """
    Creates a new instance of the named executor. If the executor name is not known
    in Airflow, look for it in the plugins.
    """
    if executor_name == 'LocalExecutor':
        return LocalExecutor()
    elif executor_name == 'SequentialExecutor':
        return SequentialExecutor()
    elif executor_name == 'CeleryExecutor':
        from airflow.executors.celery_executor import CeleryExecutor
        return CeleryExecutor()
    elif executor_name == 'DaskExecutor':
        from airflow.executors.dask_executor import DaskExecutor
        return DaskExecutor()
    elif executor_name == 'MesosExecutor':
        from airflow.contrib.executors.mesos_executor import MesosExecutor
        return MesosExecutor()
    else:
        # Loading plugins
        _integrate_plugins()
        executor_path = executor_name.split('.')
        if len(executor_path) != 2:
            raise AirflowException(
                "Executor {0} not supported: please specify in format plugin_module.executor"
                .format(executor_name))

        if executor_path[0] in globals():
            return globals()[executor_path[0]].__dict__[executor_path[1]]()
        else:
            raise AirflowException(
                "Executor {0} not supported.".format(executor_name))
Example No. 5
    def _get_executor(executor_name: str) -> BaseExecutor:
        """
        Creates a new instance of the named executor.
        If the executor name is unknown in Airflow,
        look for it in the plugins.
        """
        if executor_name == ExecutorLoader.LOCAL_EXECUTOR:
            from airflow.executors.local_executor import LocalExecutor
            return LocalExecutor()
        elif executor_name == ExecutorLoader.SEQUENTIAL_EXECUTOR:
            from airflow.executors.sequential_executor import SequentialExecutor
            return SequentialExecutor()
        elif executor_name == ExecutorLoader.CELERY_EXECUTOR:
            from airflow.executors.celery_executor import CeleryExecutor
            return CeleryExecutor()
        elif executor_name == ExecutorLoader.DASK_EXECUTOR:
            from airflow.executors.dask_executor import DaskExecutor
            return DaskExecutor()
        elif executor_name == ExecutorLoader.KUBERNETES_EXECUTOR:
            from airflow.executors.kubernetes_executor import KubernetesExecutor
            return KubernetesExecutor()
        else:
            # Load plugins here for executors as at that time the plugins might not have been initialized yet
            # TODO: verify the above and remove two lines below in case plugins are always initialized first
            from airflow import plugins_manager
            plugins_manager.integrate_executor_plugins()
            executor_path = executor_name.split('.')
            assert len(executor_path) == 2, f"Executor {executor_name} not supported: " \
                                            f"please specify in format plugin_module.executor"

            assert executor_path[0] in globals(
            ), f"Executor {executor_name} not supported"
            return globals()[executor_path[0]].__dict__[executor_path[1]]()
Example No. 6
def _get_executor(executor_name):
    """
    Creates a new instance of the named executor.
    If the executor name is not known in Airflow,
    look for it in the plugins.
    """
    parallelism = PARALLELISM
    if executor_name == Executors.LocalExecutor:
        return LocalExecutor(parallelism)
    elif executor_name == Executors.SequentialExecutor:
        return SequentialExecutor(parallelism)
    elif executor_name == Executors.CeleryExecutor:
        from airflow.executors.celery_executor import CeleryExecutor, execute_command
        return CeleryExecutor(parallelism, execute_command)
    elif executor_name == Executors.DaskExecutor:
        from airflow.executors.dask_executor import DaskExecutor
        cluster_address = configuration.conf.get('dask', 'cluster_address')
        tls_ca = configuration.conf.get('dask', 'tls_ca')
        tls_key = configuration.conf.get('dask', 'tls_key')
        tls_cert = configuration.conf.get('dask', 'tls_cert')
        return DaskExecutor(parallelism, cluster_address, tls_ca, tls_key,
                            tls_cert)
    elif executor_name == Executors.MesosExecutor:
        from airflow.contrib.executors.mesos_executor import MesosExecutor
        return MesosExecutor(parallelism)
    elif executor_name == Executors.KubernetesExecutor:
        from airflow.contrib.executors.kubernetes_executor import KubernetesExecutor
        return KubernetesExecutor()
    else:
        # Loading plugins
        _integrate_plugins()
        # Get the specified class from the plugin module
        args = []
        kwargs = {'parallelism': PARALLELISM}
        return create_object_from_plugin_module(executor_name, *args, **kwargs)
Example No. 7
def subdag_task(database):
    sub_dag = SubDagOperator(subdag=database_sub_dag(parent_dag_name, database,
                                                     '@once'),
                             task_id=database,
                             dag=main_dag,
                             pool='Pool_max_parallel_500',
                             executor=LocalExecutor())
    return sub_dag
Example No. 8
 def start_scheduler(self, file_path):
     self.scheduler = EventBasedSchedulerJob(
         dag_directory=file_path,
         server_uri="localhost:{}".format(self.port),
         executor=LocalExecutor(3),
         max_runs=-1,
         refresh_dag_dir_interval=30)
     print("scheduler starting")
     self.scheduler.run()
Example No. 9
    def execution_parallelism(self, mock_check_call, parallelism=0):
        success_command = ['airflow', 'tasks', 'run', 'true', 'some_parameter']
        fail_command = ['airflow', 'tasks', 'run', 'false']

        def fake_execute_command(command, close_fds=True):  # pylint: disable=unused-argument
            if command != success_command:
                raise subprocess.CalledProcessError(returncode=1, cmd=command)
            else:
                return 0

        mock_check_call.side_effect = fake_execute_command

        executor = LocalExecutor(parallelism=parallelism)
        executor.start()

        success_key = 'success {}'
        self.assertTrue(executor.result_queue.empty())

        execution_date = datetime.datetime.now()
        for i in range(self.TEST_SUCCESS_COMMANDS):
            key_id, command = success_key.format(i), success_command
            key = key_id, 'fake_ti', execution_date, 0
            executor.running.add(key)
            executor.execute_async(key=key, command=command)

        fail_key = 'fail', 'fake_ti', execution_date, 0
        executor.running.add(fail_key)
        executor.execute_async(key=fail_key, command=fail_command)

        executor.end()
        # By this time the queues are already shut down, so we cannot check whether they are empty
        self.assertEqual(len(executor.running), 0)

        for i in range(self.TEST_SUCCESS_COMMANDS):
            key_id = success_key.format(i)
            key = key_id, 'fake_ti', execution_date, 0
            self.assertEqual(executor.event_buffer[key][0], State.SUCCESS)
        self.assertEqual(executor.event_buffer[fail_key][0], State.FAILED)

        expected = self.TEST_SUCCESS_COMMANDS + 1 if parallelism == 0 else parallelism
        self.assertEqual(executor.workers_used, expected)
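The mock_check_call argument is typically supplied by a @mock.patch decorator placed directly on the helper, so test methods can call it without passing the mock explicitly. A hedged sketch of that wiring (the patch target and the test method name are assumptions):

    # Hypothetical patching; mock.patch injects mock_check_call at call time.
    @mock.patch('airflow.executors.local_executor.subprocess.check_call')
    def execution_parallelism(self, mock_check_call, parallelism=0):
        ...  # body as shown above

    def test_execution_unlimited_parallelism(self):
        self.execution_parallelism(parallelism=0)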
Example No. 10
    def start_scheduler(cls, file_path, executor=None):
        if executor is None:
            executor = LocalExecutor(3)

        scheduler = EventBasedSchedulerJob(dag_directory=file_path,
                                           server_uri="localhost:{}".format(
                                               master_port()),
                                           executor=executor,
                                           max_runs=-1,
                                           refresh_dag_dir_interval=30)
        print("scheduler starting")
        scheduler.run()
Example No. 11
    def execution_parallelism(self, parallelism=0):
        executor = LocalExecutor(parallelism=parallelism)
        executor.start()

        success_key = 'success {}'
        success_command = ['true', 'some_parameter']
        fail_command = ['false', 'some_parameter']
        self.assertTrue(executor.result_queue.empty())

        for i in range(self.TEST_SUCCESS_COMMANDS):
            key, command = success_key.format(i), success_command
            executor.running[key] = True
            executor.execute_async(key=key, command=command)

        executor.running['fail'] = True
        executor.execute_async(key='fail', command=fail_command)

        executor.end()

        if isinstance(executor.impl, LocalExecutor._LimitedParallelism):
            self.assertTrue(executor.queue.empty())
        self.assertEqual(len(executor.running), 0)
        self.assertTrue(executor.result_queue.empty())

        for i in range(self.TEST_SUCCESS_COMMANDS):
            key = success_key.format(i)
            self.assertEqual(executor.event_buffer[key], State.SUCCESS)
        self.assertEqual(executor.event_buffer['fail'], State.FAILED)

        expected = self.TEST_SUCCESS_COMMANDS + 1 if parallelism == 0 else parallelism
        self.assertEqual(executor.workers_used, expected)
Example No. 12
        start_task >> dt_s3
        dt_s3 >> dt_sf
        dt_sf >> end

    return one_dag


#############################################################################
# Defining the main DAG structure
#############################################################################

main_dag = DAG(
    dag_id=parent_dag_name,
    default_args=default_args,
    schedule_interval='@once',
    # schedule_interval=timedelta(minutes=5),
    # max_active_runs=1,
    concurrency=35)

database_list = ['database']

# Each database is an independent task that will run in parallel
for i in database_list:
    sub_dag = SubDagOperator(subdag=database_sub_dag(parent_dag_name, i,
                                                     '@once'),
                             task_id=i,
                             dag=main_dag,
                             pool='Pool_max_parallel_5',
                             executor=LocalExecutor())
Example No. 13
from airflow.exceptions import AirflowException


def _integrate_plugins():
    """Integrate plugins to the context."""
    from airflow.plugins_manager import executors_modules
    for executors_module in executors_modules:
        sys.modules[executors_module.__name__] = executors_module
        globals()[executors_module._name] = executors_module


_EXECUTOR = configuration.get('core', 'EXECUTOR')

if _EXECUTOR == 'LocalExecutor':
    DEFAULT_EXECUTOR = LocalExecutor()
elif _EXECUTOR == 'CeleryExecutor':
    DEFAULT_EXECUTOR = CeleryExecutor()
elif _EXECUTOR == 'SequentialExecutor':
    DEFAULT_EXECUTOR = SequentialExecutor()
elif _EXECUTOR == 'MesosExecutor':
    from airflow.contrib.executors.mesos_executor import MesosExecutor
    DEFAULT_EXECUTOR = MesosExecutor()
else:
    # Loading plugins
    _integrate_plugins()
    if _EXECUTOR in globals():
        DEFAULT_EXECUTOR = globals()[_EXECUTOR]()
    else:
        raise AirflowException("Executor {0} not supported.".format(_EXECUTOR))
Example No. 14
    def execution_parallelism(self, parallelism=0):
        executor = LocalExecutor(parallelism=parallelism)
        executor.start()

        success_key = 'success {}'
        success_command = 'echo {}'
        fail_command = 'exit 1'

        for i in range(self.TEST_SUCCESS_COMMANDS):
            key, command = success_key.format(i), success_command.format(i)
            executor.execute_async(key=key, command=command)
            executor.running[key] = True

        # errors are propagated for some reason
        try:
            executor.execute_async(key='fail', command=fail_command)
        except Exception:
            pass

        executor.running['fail'] = True

        if parallelism == 0:
            with timeout(seconds=5):
                executor.end()
        else:
            executor.end()

        for i in range(self.TEST_SUCCESS_COMMANDS):
            key = success_key.format(i)
            self.assertEqual(executor.event_buffer[key], State.SUCCESS)
        self.assertEqual(executor.event_buffer['fail'], State.FAILED)

        for i in range(self.TEST_SUCCESS_COMMANDS):
            self.assertNotIn(success_key.format(i), executor.running)
        self.assertNotIn('fail', executor.running)

        expected = self.TEST_SUCCESS_COMMANDS + 1 if parallelism == 0 else parallelism
        self.assertEqual(executor.workers_used, expected)
Example No. 15
    def test_scheduler_task(self):
        TEST_DAG_FOLDER = os.environ['AIRFLOW__CORE__DAGS_FOLDER']
        DEFAULT_DATE = timezone.datetime(2020, 1, 1)
        dag_id = 'test_event_based_dag'
        task_id = 'sleep_1000_secs'

        with create_session() as session:
            dag_bag = DagBag(
                dag_folder=TEST_DAG_FOLDER,
                include_examples=False,
            )

            dag = dag_bag.get_dag(dag_id)
            task = dag.get_task(task_id)
            dag.create_dagrun(
                run_id="sleep_1000_secs_run",
                state=State.RUNNING,
                execution_date=DEFAULT_DATE,
                start_date=DEFAULT_DATE,
                session=session,
            )
            ti = TaskInstance(task=task, execution_date=DEFAULT_DATE)
            ti.state = State.SCHEDULED
            dag_model = DagModel(
                dag_id=dag_id,
                is_paused=False,
                concurrency=5,
                has_task_concurrency_limits=False,
            )
            session.merge(dag_model)
            session.merge(ti)
            session.commit()
        executor = LocalExecutor(2)
        executor.start()
        executor.heartbeat()
        executor.schedule_task(ti.key, SchedulingAction.START)
        executor.heartbeat()
        time.sleep(30)

        #  wait for the task instance to start
        ti.refresh_from_db()
        self.assertEqual(ti.state, State.RUNNING)
        process = psutil.Process(ti.pid)
        self.assertIsNotNone(process)
        child = process.children(recursive=False)
        self.assertEqual(1, len(child))
        grandchild = child[0].children(recursive=False)
        self.assertEqual(1, len(grandchild))
        tes = self._check_task_execution(ti)
        self.assertEqual(1, len(tes))

        #  restart the task instance
        executor.schedule_task(ti.key, SchedulingAction.RESTART)
        executor.heartbeat()
        time.sleep(30)

        self.assertFalse(self._check_process_exist(process.pid))
        self.assertFalse(self._check_process_exist(child[0].pid))
        self.assertFalse(self._check_process_exist(grandchild[0].pid))

        ti.refresh_from_db()
        self.assertEqual(ti.state, State.RUNNING)
        process = psutil.Process(ti.pid)
        self.assertIsNotNone(process)
        child = process.children(recursive=False)
        self.assertEqual(1, len(child))
        grandchild = child[0].children(recursive=False)
        self.assertEqual(1, len(grandchild))
        tes = self._check_task_execution(ti)
        self.assertEqual(2, len(tes))
        self.assertEqual(2, tes[0].seq_num)

        executor.schedule_task(ti.key, SchedulingAction.STOP)
        ti.refresh_from_db()
        time.sleep(10)
        self.assertEqual(State.KILLED, ti.state)
        self.assertFalse(self._check_process_exist(process.pid))
        self.assertFalse(self._check_process_exist(child[0].pid))
        self.assertFalse(self._check_process_exist(grandchild[0].pid))
        self._check_task_execution(ti)

        executor.end()
Example No. 16
    def execution_parallelism(self, parallelism=0):
        executor = LocalExecutor(parallelism=parallelism)
        executor.start()

        success_key = 'success {}'
        success_command = ['true', 'some_parameter']
        fail_command = ['false', 'some_parameter']
        self.assertTrue(executor.result_queue.empty())

        for i in range(self.TEST_SUCCESS_COMMANDS):
            key, command = success_key.format(i), success_command
            executor.running[key] = True
            executor.execute_async(key=key, command=command)

        executor.running['fail'] = True
        executor.execute_async(key='fail', command=fail_command)

        executor.end()
        # By this time the queues are already shut down, so we cannot check whether they are empty
        self.assertEqual(len(executor.running), 0)

        for i in range(self.TEST_SUCCESS_COMMANDS):
            key = success_key.format(i)
            self.assertEqual(executor.event_buffer[key], State.SUCCESS)
        self.assertEqual(executor.event_buffer['fail'], State.FAILED)

        expected = self.TEST_SUCCESS_COMMANDS + 1 if parallelism == 0 else parallelism
        self.assertEqual(executor.workers_used, expected)