def test_spark_dag(mock_subproc_popen):
    # Hack to get around having a Connection
    os.environ["AIRFLOW_CONN_SPARK"] = "something"

    dag = DAG(
        dag_id="spark_dag",
        default_args=default_args,
        schedule_interval=None,
    )

    # pylint: disable=unused-variable
    clean_data = SparkSubmitOperator(
        task_id="run_spark",
        application="some_path.py",
        conn_id="SPARK",
        dag=dag,
    )

    pipeline = make_dagster_pipeline_from_airflow_dag(
        dag=dag,
        tags={AIRFLOW_EXECUTION_DATE_STR: get_current_datetime_in_utc().isoformat()},
    )
    execute_pipeline(pipeline)  # , instance=instance,)

    assert mock_subproc_popen.call_args_list[0][0] == (
        ["spark-submit", "--master", "", "--name", "airflow-spark", "some_path.py"],
    )
def test_long_name():
    dag_name = "dag-with.dot-dash-lo00ong" * 10
    dag = DAG(
        dag_id=dag_name,
        default_args=default_args,
        schedule_interval=None,
    )
    long_name = "task-with.dot-dash2-loong" * 10  # 250 characters, Airflow's max allowed length
    dummy_operator = DummyOperator(
        task_id=long_name,
        dag=dag,
    )

    pipeline_def = make_dagster_pipeline_from_airflow_dag(
        dag=dag,
        tags={AIRFLOW_EXECUTION_DATE_STR: get_current_datetime_in_utc().isoformat()},
    )
    result = execute_pipeline(pipeline_def)

    assert result.success
    assert (
        result.pipeline_def.name
        == "airflow_dag_with_dot_dash_lo00ongdag_with_dot_dash_lo00ongdag_with_dot_dash_lo00ongdag_with_dot_dash_lo00ongdag_with_dot_dash_lo00ongdag_with_dot_dash_lo00ongdag_with_dot_dash_lo00ongdag_with_dot_dash_lo00ongdag_with_dot_dash_lo00ongdag_with_dot_dash_lo00ong"
    )
    assert len(result.pipeline_def.solids) == 1
    assert (
        result.pipeline_def.solids[0].name
        == "airflow_task_with_dot_dash2_loongtask_with_dot_dash2_loongtask_with_dot_dash2_loongtask_with_dot_dash2_loongtask_with_dot_dash2_loongtask_with_dot_dash2_loongtask_with_dot_dash2_loongtask_with_dot_dash2_loongtask_with_dot_dash2_loongtask_with_dot_dash2_loong"
    )
def test_multi_leaf_dag(snapshot):
    dag = DAG(
        dag_id="multi_leaf_dag",
        default_args=default_args,
        schedule_interval=None,
    )
    dummy_operator_1 = DummyOperator(
        task_id="dummy_operator_1",
        dag=dag,
    )
    dummy_operator_2 = DummyOperator(
        task_id="dummy_operator_2",
        dag=dag,
    )
    dummy_operator_3 = DummyOperator(
        task_id="dummy_operator_3",
        dag=dag,
    )
    dummy_operator_4 = DummyOperator(
        task_id="dummy_operator_4",
        dag=dag,
    )

    dummy_operator_1 >> dummy_operator_2
    dummy_operator_1 >> dummy_operator_3
    dummy_operator_1 >> dummy_operator_4

    snapshot.assert_match(
        serialize_pp(
            PipelineSnapshot.from_pipeline_def(
                make_dagster_pipeline_from_airflow_dag(dag=dag)
            ).dep_structure_snapshot
        )
    )
def test_diamond_task_dag(snapshot):
    dag = DAG(
        dag_id='diamond_task_dag',
        default_args=default_args,
        schedule_interval=None,
    )
    dummy_operator_1 = DummyOperator(
        task_id='dummy_operator_1',
        dag=dag,
    )
    dummy_operator_2 = DummyOperator(
        task_id='dummy_operator_2',
        dag=dag,
    )
    dummy_operator_3 = DummyOperator(
        task_id='dummy_operator_3',
        dag=dag,
    )
    dummy_operator_4 = DummyOperator(
        task_id='dummy_operator_4',
        dag=dag,
    )

    dummy_operator_1 >> dummy_operator_2
    dummy_operator_1 >> dummy_operator_3
    dummy_operator_2 >> dummy_operator_4
    dummy_operator_3 >> dummy_operator_4

    snapshot.assert_match(
        serialize_pp(
            PipelineSnapshot.from_pipeline_def(
                make_dagster_pipeline_from_airflow_dag(dag)
            ).dep_structure_snapshot
        )
    )
def test_one_task_dag():
    dag = DAG(
        dag_id='dag',
        default_args=default_args,
        schedule_interval=None,
    )
    dummy_operator = DummyOperator(
        task_id='dummy_operator',
        dag=dag,
    )

    pipeline_def = make_dagster_pipeline_from_airflow_dag(
        dag=dag,
        tags={AIRFLOW_EXECUTION_DATE_STR: get_current_datetime_in_utc().isoformat()},
    )
    result = execute_pipeline(pipeline_def)
    assert result.success
def make_dagster_job_from_airflow_dag(
    dag, tags=None, use_airflow_template_context=False, unique_id=None
):
    """Construct a Dagster job corresponding to a given Airflow DAG.

    Tasks in the resulting job will execute the ``execute()`` method on the corresponding
    Airflow Operator. Dagster, any dependencies required by Airflow Operators, and the module
    containing your DAG definition must be available in the Python environment within which
    your Dagster ops execute.

    To set Airflow's ``execution_date`` for use with Airflow Operators' ``execute()`` methods,
    either:

    1. (Best for ad hoc runs) Execute the job directly. This will set ``execution_date`` to the
       time (in UTC) of the run.

    2. Add ``{'airflow_execution_date': utc_date_string}`` to the job tags. This will override
       behavior from (1).

       .. code-block:: python

           my_dagster_job = make_dagster_job_from_airflow_dag(
               dag=dag,
               tags={'airflow_execution_date': utc_execution_date_str},
           )
           my_dagster_job.execute_in_process()

    3. (Recommended) Add ``{'airflow_execution_date': utc_date_string}`` to the run tags,
       such as in the Dagit UI. This will override behavior from (1) and (2).

    We apply ``normalized_name()`` to the dag id and task ids when generating the job name and
    op names to ensure that names conform to Dagster's naming conventions.

    Args:
        dag (DAG): The Airflow DAG to compile into a Dagster job.
        tags (Dict[str, Field]): Job tags. Optionally include
            ``tags={'airflow_execution_date': utc_date_string}`` to specify the execution_date
            used within execution of Airflow Operators.
        use_airflow_template_context (bool): If True, will call ``get_template_context()`` on the
            Airflow TaskInstance model, which requires and modifies the DagRun table.
            (default: False)
        unique_id (int): If not None, this id will be appended to generated op names. Used by
            framework authors to enforce unique op names within a repo.

    Returns:
        JobDefinition: The generated Dagster job.
    """
    pipeline_def = make_dagster_pipeline_from_airflow_dag(
        dag, tags, use_airflow_template_context, unique_id
    )
    # pass in tags manually because pipeline_def.graph doesn't have it threaded
    return pipeline_def.graph.to_job(tags={**pipeline_def.tags})
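
# A minimal usage sketch for the factory above, assuming an Airflow install with
# DummyOperator available. The DAG id, task id, and default_args below are
# illustrative placeholders, not taken from the source.
import datetime

from airflow import DAG
from airflow.operators.dummy_operator import DummyOperator

example_default_args = {"owner": "airflow", "start_date": datetime.datetime(2021, 1, 1)}

example_dag = DAG(
    dag_id="example_dag",
    default_args=example_default_args,
    schedule_interval=None,
)
example_task = DummyOperator(task_id="example_task", dag=example_dag)

# Option 2 from the docstring: pin execution_date via job tags. Without the tag,
# executing the job would default to the current UTC time (option 1).
example_job = make_dagster_job_from_airflow_dag(
    dag=example_dag,
    tags={"airflow_execution_date": datetime.datetime.utcnow().isoformat()},
)

if __name__ == "__main__":
    # Runs each op in process by calling the wrapped operator's execute() method.
    example_job.execute_in_process()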
def test_pipeline_tags():
    dag = get_dag()

    instance = DagsterInstance.local_temp()
    manager = instance.compute_log_manager

    # When mode is default and tags are set, run with tags
    result = execute_pipeline(
        pipeline=make_dagster_pipeline_from_airflow_dag(
            dag, {AIRFLOW_EXECUTION_DATE_STR: EXECUTION_DATE_MINUS_WEEK_FMT}
        ),
        instance=instance,
    )

    check_compute_logs(manager, result, EXECUTION_DATE_MINUS_WEEK_FMT)
def test_normalize_name():
    dag = DAG(
        dag_id="dag-with.dot-dash",
        default_args=default_args,
        schedule_interval=None,
    )
    dummy_operator = DummyOperator(
        task_id="task-with.dot-dash",
        dag=dag,
    )

    pipeline_def = make_dagster_pipeline_from_airflow_dag(
        dag=dag,
        tags={AIRFLOW_EXECUTION_DATE_STR: get_current_datetime_in_utc().isoformat()},
    )
    result = execute_pipeline(pipeline_def)

    assert result.success
    assert result.pipeline_def.name == "airflow_dag_with_dot_dash"
    assert len(result.pipeline_def.solids) == 1
    assert result.pipeline_def.solids[0].name == "airflow_task_with_dot_dash"
def test_one_task_dag(snapshot):
    dag = DAG(
        dag_id='one_task_dag',
        default_args=default_args,
        schedule_interval=None,
    )
    dummy_operator = DummyOperator(
        task_id='dummy_operator',
        dag=dag,
    )

    snapshot.assert_match(
        serialize_pp(
            PipelineSnapshot.from_pipeline_def(
                make_dagster_pipeline_from_airflow_dag(dag)
            ).dep_structure_snapshot
        )
    )
def test_pipeline_auto_tag():
    dag = get_dag()

    with instance_for_test() as instance:
        manager = instance.compute_log_manager

        pre_execute_time = get_current_datetime_in_utc()

        # When tags are not set, run with current time
        result = execute_pipeline(
            pipeline=make_dagster_pipeline_from_airflow_dag(dag=dag),
            instance=instance,
        )

        post_execute_time = get_current_datetime_in_utc()

        compute_io_path = manager.get_local_path(
            result.run_id, "airflow_templated.compute", ComputeIOType.STDOUT
        )
        assert os.path.exists(compute_io_path)
        with open(compute_io_path, "r") as stdout_file:
            file_contents = normalize_file_content(stdout_file.read())

        search_str = "INFO - Running command: \n echo '"
        date_start = file_contents.find(search_str) + len(search_str)
        date_end = date_start + 10  # number of characters in YYYY-MM-DD
        date = file_contents[date_start:date_end]

        check_compute_logs(manager, result, date)

        pre_execute_time_fmt = pre_execute_time.strftime("%Y-%m-%d")
        post_execute_time_fmt = post_execute_time.strftime("%Y-%m-%d")

        assert date in [pre_execute_time_fmt, post_execute_time_fmt]
def test_template_task_dag():
    dag = DAG(
        dag_id="dag",
        default_args=default_args,
        schedule_interval=None,
    )

    t1 = BashOperator(
        task_id="print_hello",
        bash_command="echo hello dagsir",
        dag=dag,
    )

    t2 = BashOperator(
        task_id="sleep",
        bash_command="sleep 2",
        dag=dag,
    )

    templated_command = """
    {% for i in range(5) %}
        echo '{{ ds }}'
        echo '{{ macros.ds_add(ds, 7)}}'
        echo '{{ params.my_param }}'
    {% endfor %}
    """

    t3 = BashOperator(
        task_id="templated",
        depends_on_past=False,
        bash_command=templated_command,
        params={"my_param": "Parameter I passed in"},
        dag=dag,
    )

    # pylint: disable=pointless-statement
    t1 >> [t2, t3]

    instance = DagsterInstance.local_temp()
    manager = instance.compute_log_manager

    execution_date = get_current_datetime_in_utc()
    execution_date_add_one_week = execution_date + datetime.timedelta(days=7)
    execution_date_iso = execution_date.strftime("%Y-%m-%d")
    execution_date_add_one_week_iso = execution_date_add_one_week.strftime("%Y-%m-%d")

    result = execute_pipeline(
        make_dagster_pipeline_from_airflow_dag(
            dag=dag, tags={AIRFLOW_EXECUTION_DATE_STR: execution_date_iso}
        ),
        instance=instance,
    )

    compute_steps = [
        event.step_key
        for event in result.step_event_list
        if event.event_type == DagsterEventType.STEP_START
    ]
    assert compute_steps == [
        "airflow_print_hello.compute",
        "airflow_sleep.compute",
        "airflow_templated.compute",
    ]

    for step_key in compute_steps:
        compute_io_path = manager.get_local_path(result.run_id, step_key, ComputeIOType.STDOUT)
        assert os.path.exists(compute_io_path)
        with open(compute_io_path, "r") as stdout_file:
            file_contents = normalize_file_content(stdout_file.read())

        if step_key == "airflow_print_hello.compute":
            assert file_contents.count("INFO - Running command: echo hello dagsir\n") == 1
            assert file_contents.count("INFO - Command exited with return code 0") == 1

        elif step_key == "airflow_sleep.compute":
            assert file_contents.count("INFO - Running command: sleep 2\n") == 1
            assert file_contents.count("INFO - Output:\n") == 1
            assert file_contents.count("INFO - Command exited with return code 0") == 1

        elif step_key == "airflow_templated.compute":
            assert (
                file_contents.count(
                    "INFO - Running command: \n \n "
                    "echo '{execution_date_iso}'\n "
                    "echo '{execution_date_add_one_week_iso}'\n "
                    "echo 'Parameter I passed in'\n \n "
                    "echo '{execution_date_iso}'\n "
                    "echo '{execution_date_add_one_week_iso}'\n "
                    "echo 'Parameter I passed in'\n \n "
                    "echo '{execution_date_iso}'\n "
                    "echo '{execution_date_add_one_week_iso}'\n "
                    "echo 'Parameter I passed in'\n \n "
                    "echo '{execution_date_iso}'\n "
                    "echo '{execution_date_add_one_week_iso}'\n "
                    "echo 'Parameter I passed in'\n \n "
                    "echo '{execution_date_iso}'\n "
                    "echo '{execution_date_add_one_week_iso}'\n "
                    "echo 'Parameter I passed in'\n \n \n".format(
                        execution_date_iso=execution_date_iso,
                        execution_date_add_one_week_iso=execution_date_add_one_week_iso,
                    )
                )
                == 1
            )
            assert (
                file_contents.count(
                    "INFO - {execution_date_iso}\n".format(
                        execution_date_iso=execution_date_iso
                    )
                )
                == 5
            )
            assert (
                file_contents.count(
                    "INFO - {execution_date_add_one_week_iso}\n".format(
                        execution_date_add_one_week_iso=execution_date_add_one_week_iso
                    )
                )
                == 5
            )
            assert file_contents.count("INFO - Parameter I passed in\n") == 5
            assert file_contents.count("INFO - Command exited with return code 0") == 1
# start_repo_marker_0
from airflow_ingest.airflow_complex_dag import complex_dag
from airflow_ingest.airflow_simple_dag import simple_dag
from dagster_airflow.dagster_pipeline_factory import make_dagster_pipeline_from_airflow_dag

from dagster import repository

airflow_simple_dag = make_dagster_pipeline_from_airflow_dag(simple_dag)
airflow_complex_dag = make_dagster_pipeline_from_airflow_dag(complex_dag)


@repository
def airflow_ingest_example():
    return [airflow_complex_dag, airflow_simple_dag]


# end_repo_marker_0
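
# A job-based variant of the repository above: a sketch assuming
# make_dagster_job_from_airflow_dag (shown earlier) is importable from the top-level
# dagster_airflow package; the exact import path and the repository name here are
# assumptions for illustration.
from airflow_ingest.airflow_complex_dag import complex_dag
from airflow_ingest.airflow_simple_dag import simple_dag
from dagster_airflow import make_dagster_job_from_airflow_dag

from dagster import repository

airflow_simple_dag_job = make_dagster_job_from_airflow_dag(simple_dag)
airflow_complex_dag_job = make_dagster_job_from_airflow_dag(complex_dag)


@repository
def airflow_ingest_example_jobs():
    return [airflow_complex_dag_job, airflow_simple_dag_job]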
def test_complex_dag(snapshot):
    dag = DAG(dag_id="complex_dag", default_args=default_args, schedule_interval=None)

    # Create
    create_entry_group = DummyOperator(task_id="create_entry_group", dag=dag)
    create_entry_group_result = DummyOperator(task_id="create_entry_group_result", dag=dag)
    create_entry_group_result2 = DummyOperator(task_id="create_entry_group_result2", dag=dag)
    create_entry_gcs = DummyOperator(task_id="create_entry_gcs", dag=dag)
    create_entry_gcs_result = DummyOperator(task_id="create_entry_gcs_result", dag=dag)
    create_entry_gcs_result2 = DummyOperator(task_id="create_entry_gcs_result2", dag=dag)
    create_tag = DummyOperator(task_id="create_tag", dag=dag)
    create_tag_result = DummyOperator(task_id="create_tag_result", dag=dag)
    create_tag_result2 = DummyOperator(task_id="create_tag_result2", dag=dag)
    create_tag_template = DummyOperator(task_id="create_tag_template", dag=dag)
    create_tag_template_result = DummyOperator(task_id="create_tag_template_result", dag=dag)
    create_tag_template_result2 = DummyOperator(task_id="create_tag_template_result2", dag=dag)
    create_tag_template_field = DummyOperator(task_id="create_tag_template_field", dag=dag)
    create_tag_template_field_result = DummyOperator(
        task_id="create_tag_template_field_result", dag=dag
    )
    create_tag_template_field_result2 = DummyOperator(
        task_id="create_tag_template_field_result2", dag=dag
    )

    # Delete
    delete_entry = DummyOperator(task_id="delete_entry", dag=dag)
    create_entry_gcs >> delete_entry
    delete_entry_group = DummyOperator(task_id="delete_entry_group", dag=dag)
    create_entry_group >> delete_entry_group
    delete_tag = DummyOperator(task_id="delete_tag", dag=dag)
    create_tag >> delete_tag
    delete_tag_template_field = DummyOperator(task_id="delete_tag_template_field", dag=dag)
    delete_tag_template = DummyOperator(task_id="delete_tag_template", dag=dag)

    # Get
    get_entry_group = DummyOperator(task_id="get_entry_group", dag=dag)
    get_entry_group_result = DummyOperator(task_id="get_entry_group_result", dag=dag)
    get_entry = DummyOperator(task_id="get_entry", dag=dag)
    get_entry_result = DummyOperator(task_id="get_entry_result", dag=dag)
    get_tag_template = DummyOperator(task_id="get_tag_template", dag=dag)
    get_tag_template_result = DummyOperator(task_id="get_tag_template_result", dag=dag)

    # List
    list_tags = DummyOperator(task_id="list_tags", dag=dag)
    list_tags_result = DummyOperator(task_id="list_tags_result", dag=dag)

    # Lookup
    lookup_entry = DummyOperator(task_id="lookup_entry", dag=dag)
    lookup_entry_result = DummyOperator(task_id="lookup_entry_result", dag=dag)

    # Rename
    rename_tag_template_field = DummyOperator(task_id="rename_tag_template_field", dag=dag)

    # Search
    search_catalog = DummyOperator(task_id="search_catalog", dag=dag)
    search_catalog_result = DummyOperator(task_id="search_catalog_result", dag=dag)

    # Update
    update_entry = DummyOperator(task_id="update_entry", dag=dag)
    update_tag = DummyOperator(task_id="update_tag", dag=dag)
    update_tag_template = DummyOperator(task_id="update_tag_template", dag=dag)
    update_tag_template_field = DummyOperator(task_id="update_tag_template_field", dag=dag)

    # Create
    create_tasks = [
        create_entry_group,
        create_entry_gcs,
        create_tag_template,
        create_tag_template_field,
        create_tag,
    ]
    chain(*create_tasks)

    create_entry_group >> delete_entry_group
    create_entry_group >> create_entry_group_result
    create_entry_group >> create_entry_group_result2

    create_entry_gcs >> delete_entry
    create_entry_gcs >> create_entry_gcs_result
    create_entry_gcs >> create_entry_gcs_result2

    create_tag_template >> delete_tag_template_field
    create_tag_template >> create_tag_template_result
    create_tag_template >> create_tag_template_result2

    create_tag_template_field >> delete_tag_template_field
    create_tag_template_field >> create_tag_template_field_result
    create_tag_template_field >> create_tag_template_field_result2

    create_tag >> delete_tag
    create_tag >> create_tag_result
    create_tag >> create_tag_result2

    # Delete
    delete_tasks = [
        delete_tag,
        delete_tag_template_field,
        delete_tag_template,
        delete_entry_group,
        delete_entry,
    ]
    chain(*delete_tasks)

    # Get
    create_tag_template >> get_tag_template >> delete_tag_template
    get_tag_template >> get_tag_template_result

    create_entry_gcs >> get_entry >> delete_entry
    get_entry >> get_entry_result

    create_entry_group >> get_entry_group >> delete_entry_group
    get_entry_group >> get_entry_group_result

    # List
    create_tag >> list_tags >> delete_tag
    list_tags >> list_tags_result

    # Lookup
    create_entry_gcs >> lookup_entry >> delete_entry
    lookup_entry >> lookup_entry_result

    # Rename
    create_tag_template_field >> rename_tag_template_field >> delete_tag_template_field

    # Search
    chain(create_tasks, search_catalog, delete_tasks)
    search_catalog >> search_catalog_result

    # Update
    create_entry_gcs >> update_entry >> delete_entry
    create_tag >> update_tag >> delete_tag
    create_tag_template >> update_tag_template >> delete_tag_template
    create_tag_template_field >> update_tag_template_field >> rename_tag_template_field

    snapshot.assert_match(
        serialize_pp(
            PipelineSnapshot.from_pipeline_def(
                make_dagster_pipeline_from_airflow_dag(dag)
            ).dep_structure_snapshot
        )
    )