def subdag_factory(parent_dag_id, child_dag_id, default_args):
    with DAG(dag_id=f"{parent_dag_id}.{child_dag_id}", default_args=default_args) as dag:
        n_estimators = [100, 150]
        max_features = ['auto', 'sqrt']
        training_model_tasks = []
        for feature in max_features:
            for estimator in n_estimators:
                ml_id = f"{feature}_{estimator}"
                training_model_tasks.append(
                    PapermillOperator(
                        task_id=f'training_model_{ml_id}',
                        input_nb='/usr/local/airflow/include/notebooks/avocado_prediction.ipynb',
                        output_nb=f'/tmp/out-model-avocado-prediction-{ml_id}.ipynb',
                        pool='training_pool',
                        parameters={
                            'filepath': '/tmp/avocado.csv',
                            'n_estimators': estimator,
                            'max_features': feature,
                            'ml_id': ml_id
                        }
                    )
                )
    return dag
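# A minimal sketch (not part of the original file) of how subdag_factory could be
# attached to a parent DAG with SubDagOperator; the parent DAG id 'avocado_dag' and
# the task id 'trainings' are assumptions for illustration. The task_id must match
# the child_dag_id so that the generated "{parent}.{child}" DAG id lines up.
from airflow.operators.subdag import SubDagOperator

trainings = SubDagOperator(
    task_id='trainings',
    subdag=subdag_factory('avocado_dag', 'trainings', default_args),
    dag=dag
)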
def training_group():
    with TaskGroup("trainings", tooltip="Training tasks") as group:
        n_estimators = [100, 150]
        max_features = ['auto', 'sqrt']
        for feature in max_features:
            for estimator in n_estimators:
                ml_id = f"{feature}_{estimator}"
                PapermillOperator(
                    task_id=f'training_model_{ml_id}',
                    input_nb='/usr/local/airflow/include/notebooks/avocado_prediction.ipynb',
                    output_nb=f'/tmp/out-model-avocado-prediction-{ml_id}.ipynb',
                    pool='training_pool',
                    parameters={
                        'filepath': '/tmp/avocado.csv',
                        'n_estimators': estimator,
                        'max_features': feature,
                        'ml_id': ml_id
                    }
                )
    return group
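# A minimal sketch (assumed DAG id, schedule and placeholder tasks) showing how the
# TaskGroup factory above could be wired into a DAG file. Everything except the
# training_group() call is illustrative; the factory must be called inside the DAG
# context so its tasks are registered on that DAG.
from datetime import datetime
from airflow import DAG
from airflow.operators.dummy import DummyOperator

with DAG('avocado_dag', start_date=datetime(2021, 1, 1),
         schedule_interval='@daily', catchup=False) as dag:
    start = DummyOperator(task_id='start')
    done = DummyOperator(task_id='done')
    start >> training_group() >> done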
def training_groups():
    with TaskGroup("trainings") as group:
        model_settings = Variable.get('avocado_dag_model_settings', deserialize_json=True)
        for feature in model_settings['max_features']:
            for estimator in model_settings['n_estimators']:
                ml_id = f"{feature}_{estimator}"
                PapermillOperator(
                    task_id=f'training_model_{ml_id}',
                    input_nb='/usr/local/airflow/include/notebooks/avocado_prediction.ipynb',
                    output_nb=f'/tmp/out-model-avocado-prediction-{ml_id}.ipynb',
                    pool='training_pool',
                    parameters={
                        'filepath': '/tmp/avocado.csv',
                        'n_estimators': estimator,
                        'max_features': feature,
                        'ml_id': ml_id
                    }
                )
    return group
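# training_groups() expects a JSON Variable named 'avocado_dag_model_settings'
# containing the two keys read above. A minimal sketch of seeding it from Python;
# the concrete values are just an example, not prescribed by the original code.
from airflow.models import Variable

Variable.set(
    'avocado_dag_model_settings',
    {'max_features': ['auto', 'sqrt'], 'n_estimators': [100, 150]},
    serialize_json=True
)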
def test_execute(self, mock_papermill):
    in_nb = "/tmp/does_not_exist"
    out_nb = "/tmp/will_not_exist"
    parameters = {"msg": "hello_world", "train": 1}

    op = PapermillOperator(
        input_nb=in_nb,
        output_nb=out_nb,
        parameters=parameters,
        task_id="papermill_operator_test",
        dag=None
    )

    op.pre_execute(context={})  # make sure to have the inlets
    op.execute(context={})

    mock_papermill.execute_notebook.assert_called_once_with(
        in_nb,
        out_nb,
        parameters=parameters,
        progress_bar=False,
        report_mode=True
    )
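# The mock_papermill argument is injected by patching the papermill module that the
# operator imports. A hedged sketch of the surrounding test class; the exact patch
# target is an assumption based on the operator's import path and may differ in the
# provider's own test suite.
from unittest import TestCase
from unittest.mock import patch

class TestPapermillOperator(TestCase):
    @patch('airflow.providers.papermill.operators.papermill.pm')
    def test_execute(self, mock_papermill):
        ...  # body as shown above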
START_DATE = datetime(2021, 1, 1)
SCHEDULE_INTERVAL = '0 0 * * *'
DAGRUN_TIMEOUT = timedelta(minutes=60)

with DAG(
    dag_id='example_papermill_operator',
    schedule_interval=SCHEDULE_INTERVAL,
    start_date=START_DATE,
    dagrun_timeout=DAGRUN_TIMEOUT,
    tags=['example'],
    catchup=False,
) as dag_1:
    # [START howto_operator_papermill]
    run_this = PapermillOperator(
        task_id="run_example_notebook",
        input_nb="/tmp/hello_world.ipynb",
        output_nb="/tmp/out-{{ execution_date }}.ipynb",
        parameters={"msgs": "Ran from Airflow at {{ execution_date }}!"},
    )
    # [END howto_operator_papermill]

    @task
    def check_notebook(inlets, execution_date):
        """
        Verify the message in the notebook
        """
        notebook = sb.read_notebook(inlets[0].url)
        message = notebook.scraps['message']
        print(f"Message in notebook {message} for {execution_date}")
        if message.data != f"Ran from Airflow at {execution_date}!":
            return False
        return True
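# check_notebook() reads a scrap named 'message', so the hello_world notebook is
# expected to glue one. A minimal sketch of what the notebook's final cell could
# look like, assuming `msgs` is defined in the notebook's parameters cell and is
# overridden by Papermill with the value passed from the operator above.
import scrapbook as sb

msgs = "default"  # placeholder default; Papermill injects the real value
sb.glue("message", f"{msgs}")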
# specific language governing permissions and limitations
# under the License.
"""
This DAG will use Papermill to run the notebook "hello_world", based on the execution date
it will create an output notebook "out-<date>". All fields, including the keys in the
parameters, are templated.
"""
from datetime import timedelta

from airflow.models import DAG
from airflow.providers.papermill.operators.papermill import PapermillOperator
from airflow.utils.dates import days_ago

default_args = {'owner': 'airflow', 'start_date': days_ago(2)}

with DAG(
    dag_id='example_papermill_operator',
    default_args=default_args,
    schedule_interval='0 0 * * *',
    dagrun_timeout=timedelta(minutes=60),
    tags=['example'],
) as dag:
    # [START howto_operator_papermill]
    run_this = PapermillOperator(
        task_id="run_example_notebook",
        input_nb="/tmp/hello_world.ipynb",
        output_nb="/tmp/out-{{ execution_date }}.ipynb",
        parameters={"msgs": "Ran from Airflow at {{ execution_date }}!"}
    )
    # [END howto_operator_papermill]
    poke_interval=15)

n_estimators = [100, 150]
max_features = ['auto', 'sqrt']
training_model_tasks = []
for feature in max_features:
    for estimator in n_estimators:
        ml_id = f"{feature}_{estimator}"
        training_model_tasks.append(
            PapermillOperator(
                task_id=f'training_model_{ml_id}',
                input_nb='/usr/local/airflow/include/notebooks/avocado_prediction.ipynb',
                output_nb=f'/tmp/out-model-avocado-prediction-{ml_id}.ipynb',
                parameters={
                    'filepath': '/tmp/avocado.csv',
                    'n_estimators': estimator,
                    'max_features': feature,
                    'ml_id': ml_id
                }
            )
        )

evaluating_rmse = BranchSQLOperator(
    task_id='evaluating_rmse',
    sql='sql/FETCH_MIN_RMSE.sql',
    conn_id='postgres',
    follow_task_ids_if_true='accurate',
    follow_task_ids_if_false='inaccurate'
)

accurate = DummyOperator(task_id='accurate')
inaccurate = DummyOperator(task_id='inaccurate')
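# A hedged sketch of the downstream wiring implied by the branch operator above:
# all training tasks feed the RMSE evaluation, which then branches to 'accurate'
# or 'inaccurate'. The exact ordering lives in the surrounding DAG file, which is
# not shown here.
training_model_tasks >> evaluating_rmse
evaluating_rmse >> [accurate, inaccurate]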