def setUp(self):
    self.dagbag = DagBag()
    self.dag_id = 'etl_covid_data_dag'
def run(args):
    utils.pessimistic_connection_handling()

    # Setting up logging
    log = os.path.expanduser(configuration.get('core', 'BASE_LOG_FOLDER'))
    directory = log + "/{args.dag_id}/{args.task_id}".format(args=args)
    if not os.path.exists(directory):
        os.makedirs(directory)
    args.execution_date = dateutil.parser.parse(args.execution_date)
    iso = args.execution_date.isoformat()
    filename = "{directory}/{iso}".format(**locals())

    # store old log (to help with S3 appends)
    if os.path.exists(filename):
        with open(filename, 'r') as logfile:
            old_log = logfile.read()
    else:
        old_log = None

    subdir = process_subdir(args.subdir)
    logging.root.handlers = []
    logging.basicConfig(
        filename=filename,
        level=settings.LOGGING_LEVEL,
        format=settings.LOG_FORMAT)

    if not args.pickle:
        dagbag = DagBag(subdir)
        if args.dag_id not in dagbag.dags:
            msg = 'DAG [{0}] could not be found in {1}'.format(
                args.dag_id, subdir)
            logging.error(msg)
            raise AirflowException(msg)
        dag = dagbag.dags[args.dag_id]
        task = dag.get_task(task_id=args.task_id)
    else:
        session = settings.Session()
        logging.info('Loading pickle id {args.pickle}'.format(**locals()))
        dag_pickle = session.query(DagPickle).filter(
            DagPickle.id == args.pickle).first()
        if not dag_pickle:
            raise AirflowException("Who hid the pickle!? [missing pickle]")
        dag = dag_pickle.pickle
        task = dag.get_task(task_id=args.task_id)

    task_start_date = None
    if args.task_start_date:
        task_start_date = dateutil.parser.parse(args.task_start_date)
        task.start_date = task_start_date
    ti = TaskInstance(task, args.execution_date)

    if args.local:
        print("Logging into: " + filename)
        run_job = jobs.LocalTaskJob(
            task_instance=ti,
            mark_success=args.mark_success,
            force=args.force,
            pickle_id=args.pickle,
            task_start_date=task_start_date,
            ignore_dependencies=args.ignore_dependencies,
            pool=args.pool)
        run_job.run()
    elif args.raw:
        ti.run(
            mark_success=args.mark_success,
            force=args.force,
            ignore_dependencies=args.ignore_dependencies,
            job_id=args.job_id,
            pool=args.pool,
        )
    else:
        pickle_id = None
        if args.ship_dag:
            try:
                # Running remotely, so pickling the DAG
                session = settings.Session()
                pickle = DagPickle(dag)
                session.add(pickle)
                session.commit()
                pickle_id = pickle.id
                print(('Pickled dag {dag} '
                       'as pickle_id:{pickle_id}').format(**locals()))
            except Exception as e:
                print('Could not pickle the DAG')
                print(e)
                raise e
        executor = DEFAULT_EXECUTOR
        executor.start()
        print("Sending to executor.")
        executor.queue_task_instance(
            ti,
            mark_success=args.mark_success,
            pickle_id=pickle_id,
            ignore_dependencies=args.ignore_dependencies,
            force=args.force)
        executor.heartbeat()
        executor.end()

    if configuration.get('core', 'S3_LOG_FOLDER').startswith('s3:'):
        import boto
        s3_log = filename.replace(log, configuration.get('core', 'S3_LOG_FOLDER'))
        bucket, key = s3_log.lstrip('s3:/').split('/', 1)
        if os.path.exists(filename):
            # get logs
            with open(filename, 'r') as logfile:
                new_log = logfile.read()

            # remove old logs (since they are already in S3)
            if old_log:
                new_log = new_log.replace(old_log, '')

            try:
                s3 = boto.connect_s3()
                s3_key = boto.s3.key.Key(s3.get_bucket(bucket), key)

                # append new logs to old S3 logs, if available
                if s3_key.exists():
                    old_s3_log = s3_key.get_contents_as_string().decode()
                    new_log = old_s3_log + '\n' + new_log

                # send log to S3
                encrypt = configuration.get('core', 'ENCRYPT_S3_LOGS')
                s3_key.set_contents_from_string(new_log, encrypt_key=encrypt)
            except Exception:
                print('Could not send logs to S3.')
def index(self):
    dagbag = DagBag()
    return self.render("rest_api_plugin/index.html",
                       dags=dagbag.dags,
                       airflow_webserver_base_url=airflow_webserver_base_url,
                       url_dict=url_dict)
def setUp(self):
    self.dagbag = DagBag(dag_folder=DEV_NULL, include_examples=True)
    self.args = {'owner': 'airflow', 'start_date': DEFAULT_DATE}
    self.dag = DAG(TEST_DAG_ID, default_args=self.args)
def test_dag_loading():
    dagbag = DagBag()
    dag = dagbag.get_dag(dag_id='my_dag')
    assert dagbag.import_errors == {}
    assert dag is not None
    assert len(dag.tasks) == 1
def test_should_response_200_serialized(self):
    # Create empty app with empty dagbag to check if DAG is read from db
    app_serialized = app.create_app(testing=True)
    dag_bag = DagBag(os.devnull, include_examples=False, read_dags_from_db=True)
    app_serialized.dag_bag = dag_bag
    client = app_serialized.test_client()

    SerializedDagModel.write_dag(self.dag)

    expected = {
        "catchup": True,
        "concurrency": 16,
        "dag_id": "test_dag",
        "dag_run_timeout": None,
        "default_view": "tree",
        "description": None,
        "doc_md": "details",
        "fileloc": __file__,
        "is_paused": None,
        "is_subdag": False,
        "orientation": "LR",
        "owners": [],
        "schedule_interval": {
            "__type": "TimeDelta",
            "days": 1,
            "microseconds": 0,
            "seconds": 0,
        },
        "start_date": "2020-06-15T00:00:00+00:00",
        "tags": None,
        "timezone": "Timezone('UTC')",
    }
    response = client.get(f"/api/v1/dags/{self.dag_id}/details",
                          environ_overrides={'REMOTE_USER': "******"})
    assert response.status_code == 200
    assert response.json == expected

    response = self.client.get(f"/api/v1/dags/{self.dag_id}/details",
                               environ_overrides={'REMOTE_USER': "******"})
    assert response.status_code == 200
    expected = {
        'catchup': True,
        'concurrency': 16,
        'dag_id': 'test_dag',
        'dag_run_timeout': None,
        'default_view': 'tree',
        'description': None,
        'doc_md': 'details',
        'fileloc': __file__,
        'is_paused': None,
        'is_subdag': False,
        'orientation': 'LR',
        'owners': [],
        'schedule_interval': {
            '__type': 'TimeDelta',
            'days': 1,
            'microseconds': 0,
            'seconds': 0
        },
        'start_date': '2020-06-15T00:00:00+00:00',
        'tags': None,
        'timezone': "Timezone('UTC')",
    }
    assert response.json == expected
def dag_report(args):
    """Displays dagbag stats at the command line"""
    dagbag = DagBag(process_subdir(args.subdir))
    print(tabulate(dagbag.dagbag_stats, headers="keys", tablefmt=args.output))
def setUp(self):
    self.dagbag = DagBag()
    session = settings.Session()
    session.query(models.ImportError).delete()
    session.commit()
def setUp(self):
    self.parser = cli.CLIFactory.get_parser()
    self.dagbag = DagBag(include_examples=True)
def run(args):
    utils.pessimistic_connection_handling()

    # Setting up logging
    log_base = os.path.expanduser(configuration.get('core', 'BASE_LOG_FOLDER'))
    directory = log_base + "/{args.dag_id}/{args.task_id}".format(args=args)
    if not os.path.exists(directory):
        os.makedirs(directory)
    args.execution_date = dateutil.parser.parse(args.execution_date)
    iso = args.execution_date.isoformat()
    filename = "{directory}/{iso}".format(**locals())

    subdir = process_subdir(args.subdir)
    logging.root.handlers = []
    logging.basicConfig(
        filename=filename,
        level=settings.LOGGING_LEVEL,
        format=settings.LOG_FORMAT)

    if not args.pickle:
        dagbag = DagBag(subdir)
        if args.dag_id not in dagbag.dags:
            msg = 'DAG [{0}] could not be found in {1}'.format(args.dag_id, subdir)
            logging.error(msg)
            raise AirflowException(msg)
        dag = dagbag.dags[args.dag_id]
        task = dag.get_task(task_id=args.task_id)
    else:
        session = settings.Session()
        logging.info('Loading pickle id {args.pickle}'.format(**locals()))
        dag_pickle = session.query(
            DagPickle).filter(DagPickle.id == args.pickle).first()
        if not dag_pickle:
            raise AirflowException("Who hid the pickle!? [missing pickle]")
        dag = dag_pickle.pickle
        task = dag.get_task(task_id=args.task_id)

    task_start_date = None
    if args.task_start_date:
        task_start_date = dateutil.parser.parse(args.task_start_date)
        task.start_date = task_start_date
    ti = TaskInstance(task, args.execution_date)

    if args.local:
        print("Logging into: " + filename)
        run_job = jobs.LocalTaskJob(
            task_instance=ti,
            mark_success=args.mark_success,
            force=args.force,
            pickle_id=args.pickle,
            task_start_date=task_start_date,
            ignore_dependencies=args.ignore_dependencies,
            pool=args.pool)
        run_job.run()
    elif args.raw:
        ti.run(
            mark_success=args.mark_success,
            force=args.force,
            ignore_dependencies=args.ignore_dependencies,
            job_id=args.job_id,
            pool=args.pool,
        )
    else:
        pickle_id = None
        if args.ship_dag:
            try:
                # Running remotely, so pickling the DAG
                session = settings.Session()
                pickle = DagPickle(dag)
                session.add(pickle)
                session.commit()
                pickle_id = pickle.id
                print((
                    'Pickled dag {dag} '
                    'as pickle_id:{pickle_id}').format(**locals()))
            except Exception as e:
                print('Could not pickle the DAG')
                print(e)
                raise e
        executor = DEFAULT_EXECUTOR
        executor.start()
        print("Sending to executor.")
        executor.queue_task_instance(
            ti,
            mark_success=args.mark_success,
            pickle_id=pickle_id,
            ignore_dependencies=args.ignore_dependencies,
            force=args.force,
            pool=args.pool)
        executor.heartbeat()
        executor.end()

    # store logs remotely
    remote_base = configuration.get('core', 'REMOTE_BASE_LOG_FOLDER')

    # deprecated as of March 2016
    if not remote_base and configuration.get('core', 'S3_LOG_FOLDER'):
        warnings.warn(
            'The S3_LOG_FOLDER configuration key has been replaced by '
            'REMOTE_BASE_LOG_FOLDER. Your configuration still works but please '
            'update airflow.cfg to ensure future compatibility.',
            DeprecationWarning)
        remote_base = configuration.get('core', 'S3_LOG_FOLDER')

    if os.path.exists(filename):
        # read log and remove old logs to get just the latest additions
        with open(filename, 'r') as logfile:
            log = logfile.read()

        remote_log_location = filename.replace(log_base, remote_base)
        # S3
        if remote_base.startswith('s3:/'):
            utils.S3Log().write(log, remote_log_location)
        # GCS
        elif remote_base.startswith('gs:/'):
            utils.GCSLog().write(
                log,
                remote_log_location,
                append=True)
        # Other
        elif remote_base:
            logging.error(
                'Unsupported remote log location: {}'.format(remote_base))
def setUpClass(cls):
    cls.dagbag = DagBag(include_examples=True)
    cls.parser = cli.CLIFactory.get_parser()
def setUpClass(cls):
    super().setUpClass()
    DagBag(example_bash_operator.__file__).get_dag(
        "example_bash_operator").sync_to_db()
def setUp(self):
    self.dagbag = DagBag()
    self.dag_id = self.dagbag.get_dag('sync_country_from_zendesk_pipeline')
    self.dag_login_aws = self.dag_id.tasks[0]
    self.dag_sync_country_from_zendesk_pipeline = self.dag_id.tasks[1]
def setUp(self):
    variable = Variable()
    self.dagbag = DagBag(dag_folder=variable.get('dags_folder'))
def setUp(self):
    self.dagbag = DagBag(include_examples=True)
    self.cluster = LocalCluster()
def test_retry_still_in_executor(self):
    """
    Checks that the scheduler does not put a task in limbo when the task
    is retried but is still present in the executor.
    """
    executor = TestExecutor()
    dagbag = DagBag(executor=executor)
    dagbag.dags.clear()
    dagbag.executor = executor

    dag = DAG(
        dag_id='test_retry_still_in_executor',
        start_date=DEFAULT_DATE,
        schedule_interval="@once")
    dag_task1 = BashOperator(
        task_id='test_retry_handling_op',
        bash_command='exit 1',
        retries=1,
        dag=dag,
        owner='airflow')

    dag.clear()
    dag.is_subdag = False

    session = settings.Session()
    orm_dag = DagModel(dag_id=dag.dag_id)
    orm_dag.is_paused = False
    session.merge(orm_dag)
    session.commit()

    dagbag.bag_dag(dag=dag, root_dag=dag, parent_dag=dag)

    @mock.patch('airflow.models.DagBag', return_value=dagbag)
    @mock.patch('airflow.models.DagBag.collect_dags')
    def do_schedule(function, function2):
        # Use an empty file since the above mock will return the
        # expected DAGs. Also specify only a single file so that it doesn't
        # try to schedule the above DAG repeatedly.
        scheduler = SchedulerJob(num_runs=1,
                                 executor=executor,
                                 subdir=os.path.join(models.DAGS_FOLDER,
                                                     "no_dags.py"))
        scheduler.heartrate = 0
        scheduler.run()

    do_schedule()
    self.assertEqual(1, len(executor.queued_tasks))

    def run_with_error(task):
        try:
            task.run()
        except AirflowException:
            pass

    ti_tuple = six.next(six.itervalues(executor.queued_tasks))
    (command, priority, queue, ti) = ti_tuple
    ti.task = dag_task1

    # fail execution
    run_with_error(ti)
    self.assertEqual(ti.state, State.UP_FOR_RETRY)
    self.assertEqual(ti.try_number, 1)

    ti.refresh_from_db(lock_for_update=True, session=session)
    ti.state = State.SCHEDULED
    session.merge(ti)
    session.commit()

    # do not schedule
    do_schedule()
    self.assertTrue(executor.has_task(ti))
    ti.refresh_from_db()
    self.assertEqual(ti.state, State.SCHEDULED)

    # now the executor has cleared and it should be allowed the re-queue
    executor.queued_tasks.clear()
    do_schedule()
    ti.refresh_from_db()
    self.assertEqual(ti.state, State.QUEUED)
def setUp(self): """Method to set up the DAG Validation Class instance for testing.""" self.dagbag = DagBag()
def setUp(self):
    DAGS_DIR = os.getenv('INPUT_DAGPATHS')
    os.environ['PYTHONPATH'] = f"{os.getenv('PYTHONPATH')}:{DAGS_DIR}"
    logging.info("DAGs dir : {}".format(DAGS_DIR))
    self.dagbag = DagBag(dag_folder=DAGS_DIR, include_examples=False)
def dag_list_dags(args):
    """Displays dags with or without stats at the command line"""
    dagbag = DagBag(process_subdir(args.subdir))
    dags = dagbag.dags.values()
    print(_tabulate_dags(dags, tablefmt=args.output))
def setUpClass(cls):
    cls.dagbag = DagBag(include_examples=True)
    cls.dagbag.sync_to_db()
    cls.parser = cli_parser.get_parser()
def get_dagbag():
    return DagBag()
def get_dag(args):
    dagbag = DagBag(process_subdir(args.subdir))
    if args.dag_id not in dagbag.dags:
        raise AirflowException('dag_id could not be found: {}'.format(
            args.dag_id))
    return dagbag.dags[args.dag_id]
def test_handle_failure_callback_with_zombies_are_correctly_passed_to_dag_file_processor(self):
    """
    Check that the same set of failure callback requests with zombies are passed to the
    dag file processors until the next zombie detection logic is invoked.
    """
    test_dag_path = os.path.join(TEST_DAG_FOLDER, 'test_example_bash_operator.py')
    with conf_vars({('scheduler', 'max_threads'): '1',
                    ('core', 'load_examples'): 'False'}):
        dagbag = DagBag(test_dag_path)
        with create_session() as session:
            session.query(LJ).delete()
            dag = dagbag.get_dag('test_example_bash_operator')
            dag.sync_to_db()
            task = dag.get_task(task_id='run_this_last')

            ti = TI(task, DEFAULT_DATE, State.RUNNING)
            local_job = LJ(ti)
            local_job.state = State.SHUTDOWN
            session.add(local_job)
            session.commit()

            # TODO: If there was an actual Relationship between TI and Job
            # we wouldn't need this extra commit
            session.add(ti)
            ti.job_id = local_job.id
            session.commit()

            expected_failure_callback_requests = [
                TaskCallbackRequest(
                    full_filepath=dag.full_filepath,
                    simple_task_instance=SimpleTaskInstance(ti),
                    msg="Message"
                )
            ]

        test_dag_path = os.path.join(TEST_DAG_FOLDER, 'test_example_bash_operator.py')

        child_pipe, parent_pipe = multiprocessing.Pipe()
        async_mode = 'sqlite' not in conf.get('core', 'sql_alchemy_conn')

        fake_processors = []

        def fake_processor_factory(*args, **kwargs):
            nonlocal fake_processors
            processor = FakeDagFileProcessorRunner._fake_dag_processor_factory(*args, **kwargs)
            fake_processors.append(processor)
            return processor

        manager = DagFileProcessorManager(
            dag_directory=test_dag_path,
            max_runs=1,
            processor_factory=fake_processor_factory,
            processor_timeout=timedelta.max,
            signal_conn=child_pipe,
            dag_ids=[],
            pickle_dags=False,
            async_mode=async_mode)

        self.run_processor_manager_one_loop(manager, parent_pipe)

        if async_mode:
            # Once for initial parse, and then again for the add_callback_to_queue
            assert len(fake_processors) == 2
            assert fake_processors[0]._file_path == test_dag_path
            assert fake_processors[0]._callback_requests == []
        else:
            assert len(fake_processors) == 1

        assert fake_processors[-1]._file_path == test_dag_path
        callback_requests = fake_processors[-1]._callback_requests
        assert (
            set(zombie.simple_task_instance.key for zombie in expected_failure_callback_requests)
            == set(result.simple_task_instance.key for result in callback_requests)
        )

        child_pipe.close()
        parent_pipe.close()
def test_dag_loads_with_no_errors(tmpdir):
    tmp_directory = str(tmpdir)
    dag_bag = DagBag(dag_folder=tmp_directory, include_examples=False)
    dag_bag.process_file(os.path.join(FILE_DIR, 'loader_workflow.py'))
    assert len(dag_bag.import_errors) == 0
    assert len(dag_bag.dags) == 2
def setUpClass(cls):
    cls.dagbag = DagBag(include_examples=True)
def dagbag():
    return DagBag(include_examples=False)
def list_dags(args):
    dagbag = DagBag(process_subdir(args.subdir))
    print("\n".join(sorted(dagbag.dags)))
def setUp(self):
    self.dagbag = DagBag(include_examples=True)
def make_example_dags(module_path):
    """Loads DAGs from a module for test."""
    dagbag = DagBag(module_path)
    return dagbag.dags
def setUp(self):
    self.dagbag = DagBag()