def create_evaluate_ops(
    task_prefix: str,
    data_format: str,
    input_paths: List[str],
    prediction_path: str,
    metric_fn_and_keys: Tuple[T, Iterable[str]],
    validate_fn: T,
    batch_prediction_job_id: Optional[str] = None,
    region: Optional[str] = None,
    project_id: Optional[str] = None,
    dataflow_options: Optional[Dict] = None,
    model_uri: Optional[str] = None,
    model_name: Optional[str] = None,
    version_name: Optional[str] = None,
    dag: Optional[DAG] = None,
    py_interpreter="python3",
):
    """
    Creates Operators needed for model evaluation and returns them.

    It gets a prediction over the inputs via the Cloud ML Engine BatchPrediction API
    by calling MLEngineBatchPredictionOperator, then summarizes and validates the
    result via Cloud Dataflow using DataFlowPythonOperator.

    For details and pricing about Batch prediction, please refer to the website
    https://cloud.google.com/ml-engine/docs/how-tos/batch-predict
    and for Cloud Dataflow, https://cloud.google.com/dataflow/docs/

    It returns three chained operators for prediction, summary, and validation,
    named as ``<prefix>-prediction``, ``<prefix>-summary``, and ``<prefix>-validation``,
    respectively. (``<prefix>`` should contain only alphanumeric characters or hyphens.)

    The upstream and downstream can be set accordingly like:

    .. code-block:: python

        pred, _, val = create_evaluate_ops(...)
        pred.set_upstream(upstream_op)
        ...
        downstream_op.set_upstream(val)

    Callers will provide two python callables, metric_fn and validate_fn, in
    order to customize the evaluation behavior as they wish.

    - metric_fn receives a dictionary per instance derived from json in the
      batch prediction result. The keys might vary depending on the model.
      It should return a tuple of metrics.
    - validate_fn receives a dictionary of the averaged metrics that metric_fn
      generated over all instances. The keys/values of the dictionary match
      what's given by the metric_fn_and_keys arg. The dictionary contains an
      additional metric, 'count', to represent the total number of instances
      received for evaluation. The function should raise an exception to mark
      the task as failed, in case the validation result is not okay to proceed
      (i.e. to set the trained version as default).

    Typical examples are like this:

    .. code-block:: python

        def get_metric_fn_and_keys():
            import math  # imports should be outside of the metric_fn below.

            def error_and_squared_error(inst):
                label = float(inst["input_label"])
                classes = float(inst["classes"])  # 0 or 1
                err = abs(classes - label)
                squared_err = math.pow(classes - label, 2)
                return (err, squared_err)  # returns a tuple.

            return error_and_squared_error, ["err", "mse"]  # key order must match.


        def validate_err_and_count(summary):
            if summary["err"] > 0.2:
                raise ValueError("Too high err>0.2; summary=%s" % summary)
            if summary["mse"] > 0.05:
                raise ValueError("Too high mse>0.05; summary=%s" % summary)
            if summary["count"] < 1000:
                raise ValueError("Too few instances<1000; summary=%s" % summary)
            return summary

    For the details on the other BatchPrediction-related arguments (project_id,
    job_id, region, data_format, input_paths, prediction_path, model_uri),
    please refer to MLEngineBatchPredictionOperator too.

    :param task_prefix: a prefix for the tasks. Only alphanumeric characters and
        hyphens are allowed (no underscores), since this will be used as the
        dataflow job name, which doesn't allow other characters.
    :param data_format: either of 'TEXT', 'TF_RECORD', 'TF_RECORD_GZIP'
    :param input_paths: a list of input paths to be sent to BatchPrediction.
    :param prediction_path: GCS path to put the prediction results in.
    :param metric_fn_and_keys: a tuple of metric_fn and metric_keys:

        - metric_fn is a function that accepts a dictionary (for an instance),
          and returns a tuple of metric(s) that it calculates.
        - metric_keys is a list of strings to denote the key of each metric.
    :param validate_fn: a function to validate whether the averaged metric(s) is
        good enough to push the model.
    :param batch_prediction_job_id: the id to use for the Cloud ML Batch
        prediction job. Passed directly to the MLEngineBatchPredictionOperator as
        the job_id argument.
    :param project_id: the Google Cloud project id in which to execute Cloud ML
        Batch Prediction and Dataflow jobs. If None, then the `dag`'s
        `default_args['project_id']` will be used.
    :param region: the Google Cloud region in which to execute Cloud ML Batch
        Prediction and Dataflow jobs. If None, then the `dag`'s
        `default_args['region']` will be used.
    :param dataflow_options: options to run Dataflow jobs. If None, then the
        `dag`'s `default_args['dataflow_default_options']` will be used.
    :param model_uri: GCS path of the model exported by Tensorflow using
        ``tensorflow.estimator.export_savedmodel()``. It cannot be used with
        model_name or version_name below. See MLEngineBatchPredictionOperator
        for more detail.
    :param model_name: Used to indicate a model to use for prediction. Can be
        used in combination with version_name, but cannot be used together with
        model_uri. See MLEngineBatchPredictionOperator for more detail. If None,
        then the `dag`'s `default_args['model_name']` will be used.
    :param version_name: Used to indicate a model version to use for prediction,
        in combination with model_name. Cannot be used together with model_uri.
        See MLEngineBatchPredictionOperator for more detail. If None, then the
        `dag`'s `default_args['version_name']` will be used.
    :param dag: The `DAG` to use for all Operators.
    :param py_interpreter: Python version of the Beam pipeline. If None, this
        defaults to python3. To track python versions supported by Beam and
        related issues check: https://issues.apache.org/jira/browse/BEAM-1251

    :returns: a tuple of three operators, (prediction, summary, validation)
    :rtype: tuple(DataFlowPythonOperator, DataFlowPythonOperator, PythonOperator)
    """
    batch_prediction_job_id = batch_prediction_job_id or ""
    dataflow_options = dataflow_options or {}
    region = region or ""

    # Verify that task_prefix doesn't have any special characters except hyphen
    # '-', which is the only allowed non-alphanumeric character by Dataflow.
    if not re.match(r"^[a-zA-Z][-A-Za-z0-9]*$", task_prefix):
        raise AirflowException(
            "Malformed task_id for DataFlowPythonOperator (only alphanumeric "
            "and hyphens are allowed but got: " + task_prefix
        )

    metric_fn, metric_keys = metric_fn_and_keys
    if not callable(metric_fn):
        raise AirflowException("`metric_fn` param must be callable.")
    if not callable(validate_fn):
        raise AirflowException("`validate_fn` param must be callable.")

    if dag is not None and dag.default_args is not None:
        default_args = dag.default_args
        project_id = project_id or default_args.get('project_id')
        region = region or default_args['region']
        model_name = model_name or default_args.get('model_name')
        version_name = version_name or default_args.get('version_name')
        dataflow_options = dataflow_options or default_args.get('dataflow_default_options')

    evaluate_prediction = MLEngineStartBatchPredictionJobOperator(
        task_id=(task_prefix + "-prediction"),
        project_id=project_id,
        job_id=batch_prediction_job_id,
        region=region,
        data_format=data_format,
        input_paths=input_paths,
        output_path=prediction_path,
        uri=model_uri,
        model_name=model_name,
        version_name=version_name,
        dag=dag,
    )

    metric_fn_encoded = base64.b64encode(dill.dumps(metric_fn, recurse=True)).decode()
    evaluate_summary = BeamRunPythonPipelineOperator(
        task_id=(task_prefix + "-summary"),
        py_file=os.path.join(os.path.dirname(__file__), 'mlengine_prediction_summary.py'),
        default_pipeline_options=dataflow_options,
        pipeline_options={
            "prediction_path": prediction_path,
            "metric_fn_encoded": metric_fn_encoded,
            "metric_keys": ','.join(metric_keys),
        },
        py_interpreter=py_interpreter,
        py_requirements=['apache-beam[gcp]>=2.14.0'],
        dag=dag,
    )
    evaluate_summary.set_upstream(evaluate_prediction)

    def apply_validate_fn(*args, templates_dict, **kwargs):
        prediction_path = templates_dict["prediction_path"]
        scheme, bucket, obj, _, _ = urlsplit(prediction_path)
        if scheme != "gs" or not bucket or not obj:
            raise ValueError(f"Wrong format prediction_path: {prediction_path}")
        summary = os.path.join(obj.strip("/"), "prediction.summary.json")
        gcs_hook = GCSHook()
        summary = json.loads(gcs_hook.download(bucket, summary).decode("utf-8"))
        return validate_fn(summary)

    evaluate_validation = PythonOperator(
        task_id=(task_prefix + "-validation"),
        python_callable=apply_validate_fn,
        templates_dict={"prediction_path": prediction_path},
        dag=dag,
    )
    evaluate_validation.set_upstream(evaluate_summary)

    return evaluate_prediction, evaluate_summary, evaluate_validation
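
# ---------------------------------------------------------------------------
# Usage sketch (illustrative only, not part of the original module). It assumes
# an Airflow environment with the Google provider installed; the project,
# bucket, model, job id, and DAG names below are hypothetical placeholders.
# ---------------------------------------------------------------------------
def _example_evaluate_dag():
    import datetime
    import math

    from airflow import DAG

    def error_and_squared_error(inst):
        # metric_fn: one prediction instance in, a tuple of metrics out.
        label = float(inst["input_label"])
        classes = float(inst["classes"])
        return abs(classes - label), math.pow(classes - label, 2)

    def validate_err_and_count(summary):
        # validate_fn: raise to fail the validation task.
        if summary["err"] > 0.2:
            raise ValueError("Too high err>0.2; summary=%s" % summary)
        return summary

    example_dag = DAG(
        dag_id="example_mlengine_evaluate",  # hypothetical DAG id
        start_date=datetime.datetime(2021, 1, 1),
        schedule_interval=None,
        default_args={"project_id": "my-project", "region": "us-central1"},
    )
    return create_evaluate_ops(
        task_prefix="eval-model",  # alphanumeric characters and hyphens only
        data_format="TEXT",
        input_paths=["gs://my-bucket/eval/data*.json"],  # hypothetical path
        prediction_path="gs://my-bucket/eval/output",    # hypothetical path
        metric_fn_and_keys=(error_and_squared_error, ["err", "mse"]),
        validate_fn=validate_err_and_count,
        batch_prediction_job_id="eval_model_{{ ds_nodash }}",  # hypothetical job id
        model_name="my_model",  # hypothetical model name
        version_name="v1",
        dag=example_dag,
    )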
    if message.data != f"Ran from Airflow at {execution_date}!":
        return False
    return True


args = {
    'owner': 'airflow',
    'start_date': airflow.utils.dates.days_ago(2),
}

dag = DAG(
    dag_id='example_papermill_operator',
    default_args=args,
    schedule_interval='0 0 * * *',
    dagrun_timeout=timedelta(minutes=60),
)

run_this = PapermillOperator(
    task_id="run_example_notebook",
    dag=dag,
    input_nb=os.path.join(os.path.dirname(os.path.realpath(__file__)), "input_notebook.ipynb"),
    output_nb="/tmp/out-{{ execution_date }}.ipynb",
    parameters={"msgs": "Ran from Airflow at {{ execution_date }}!"},
)

check_output = PythonOperator(
    task_id='check_out',
    python_callable=check_notebook,
    dag=dag,
    inlets=AUTO,
)

check_output.set_upstream(run_this)

if __name__ == "__main__":
    dag.cli()
def calculator_func(**kwargs):
    ti = kwargs['ti']
    tasks = [f'push_{i}' for i in range(1, 10)]
    values = ti.xcom_pull(task_ids=tasks)
    return sum(values)


with DAG(
    dag_id='xcom_multiple_tasks',
    start_date=datetime(2021, 3, 1),
    schedule_interval='@once',
) as dag:
    tasks = []
    for i in range(1, 10):
        task = PythonOperator(
            task_id=f'push_{i}',
            python_callable=lambda i=i: i,
        )
        tasks.append(task)

    calculator = PythonOperator(
        task_id='calculator',
        python_callable=calculator_func,
    )
    calculator.set_upstream(tasks)
    'owner': 'airflow',
}

dag = DAG(
    dag_id='trigger_with_multi_dagrun_sensor',
    max_active_runs=1,
    schedule_interval='@hourly',
    default_args=args,
)

gen_target_dag_run = TriggerMultiDagRunOperator(
    task_id='gen_target_dag_run',
    dag=dag,
    trigger_dag_id='common_target',
    python_callable=generate_dag_run,
)

# Wait until there is no running instance of target DAG
wait_target_dag = MultiDagRunSensor(
    task_id='wait_target_dag',
    dag=dag,
)
wait_target_dag.set_upstream(gen_target_dag_run)

after_dags_handler_op = PythonOperator(
    task_id='after_dags_handler',
    python_callable=after_dags_handler,
    dag=dag,
)
after_dags_handler_op.set_upstream(wait_target_dag)
"B.1.617", "B.1.617.1", "B.1.617.2" ] for clade in clades: params = {} params[ 'meta-output'] = directory_output + '/' + clade + '-no-fasta.json' params["sequence-output"] = directory_output + '/' + clade + '.fas' params['only-uniques'] = False params["clades"] = [clade] export_meta_task = PythonOperator( task_id=f'export_meta_{clade}', python_callable=export_meta, op_kwargs={"config": params}, dag=dag, ) export_meta_task.set_upstream(mk_dir_task) export_sequences_task = PythonOperator( task_id=f'export_sequences_{clade}', python_callable=export_sequences, op_kwargs={"config": params}, dag=dag, ) export_sequences_task.set_upstream(mk_dir_task)
def create_dag(dag_id, schedule, window, default_args):
    with DAG(
        dag_id,
        default_args=default_args,
        description='creates sliding windows based on months',
        schedule_interval=schedule,
        start_date=datetime.datetime(2021, 4, 30),
        on_failure_callback=dag_fail_slack_alert,
        on_success_callback=dag_success_slack_alert,
        tags=['selection', 'sliding'],
    ) as dag:

        OUTPUT_DIR = WORKING_DIR + "/data/sliding-windows-bealign/" + '_'.join(window)
        default_args["params"]["output-dir"] = OUTPUT_DIR
        default_args["params"]["meta-output"] = OUTPUT_DIR + '/master-no-sequences.json'
        default_args["params"]["sequence-output"] = OUTPUT_DIR + '/sequences'

        with open(dag.params["region_cfg"], 'r') as stream:
            regions = yaml.safe_load(stream)

        mk_dir_task = BashOperator(
            task_id='make_directory',
            bash_command='mkdir -p {{params.output}}',
            params={'output': default_args['params']['output-dir']},
            dag=dag,
        )

        export_meta_task = PythonOperator(
            task_id='export_meta',
            python_callable=export_meta,
            op_kwargs={"config": default_args['params']},
            pool='mongo',
            dag=dag,
        )
        export_meta_task.set_upstream(mk_dir_task)

        export_sequences_task = PythonOperator(
            task_id='export_sequences',
            python_callable=export_sequences,
            op_kwargs={"config": default_args['params']},
            pool='mongo',
            dag=dag,
        )
        export_sequences_task.set_upstream(mk_dir_task)

        # For each region
        export_by_gene = []

        for gene in regions.keys():
            filepath_prefix = OUTPUT_DIR + '/sequences.' + gene

            nuc_sequence_output = filepath_prefix + '_nuc.fas'
            uniques_fn = filepath_prefix + '_nuc.uniques.fas'
            duplicate_output = filepath_prefix + '.duplicates.json'

            variants_csv_output = filepath_prefix + '.variants.csv'
            variants_json_output = filepath_prefix + '.variants.json'
            filtered_fasta_output = filepath_prefix + '.compressed.filtered.fas'
            filtered_json_output = filepath_prefix + '.filtered.json'
            output_edits_fn = filepath_prefix + '.filtered.edits.json'

            compressor_duplicate_out = filepath_prefix + '.duplicates.variants.json'

            tree_output = filepath_prefix + '.compressed.filtered.fas.rapidnj.bestTree'
            sto_output = filepath_prefix + '.compressed.filtered.sto'

            slac_output_fn = filepath_prefix + '.SLAC.json'
            fel_output_fn = filepath_prefix + '.FEL.json'
            meme_output_fn = filepath_prefix + '.MEME.json'

            summary_output_fn = filepath_prefix + '.json'

            default_args["params"]["nuc-sequence-output"] = nuc_sequence_output
            default_args["params"]["duplicate-output"] = duplicate_output

            with TaskGroup(f"alignment_{gene}") as alignment:

                export_bealign_task = PythonOperator(
                    task_id='export_bealign',
                    python_callable=export_bealign_sequences,
                    op_kwargs={
                        "config": default_args['params'],
                        'nuc_output_fn': nuc_sequence_output,
                        'gene': gene,
                    },
                    dag=dag,
                )

                # Occasional errors when cleaning up tmp files, so or'ing true
                cleanup_task = BashOperator(
                    task_id='cleanup',
                    bash_command="sed -i '/^>/! s/[^ACTG-]/N/g' $NUC_OUTPUT_FN || true",
                    env={'NUC_OUTPUT_FN': nuc_sequence_output, **os.environ},
                    dag=dag,
                )

                export_bealign_task >> cleanup_task

            with TaskGroup(f"duplicates_{gene}") as duplicates_group:

                compute_duplicates_task = PythonOperator(
                    task_id='write_raw_duplicates',
                    python_callable=write_nuc_raw_duplicates,
                    op_kwargs={
                        "input": nuc_sequence_output,
                        "duplicate_output": duplicate_output,
                        'uniques_output': uniques_fn,
                    },
                    dag=dag,
                )

                compute_duplicates_task

            # $HYPHY LIBPATH=$HYPHYLIBPATH $COMPRESSOR --msa ${FILE}.${GENE}.compressed.fas --regexp "epi_isl_([0-9]+)" --duplicates ${FILE}.${GENE}.duplicates.json --output ${FILE}.${GENE}.variants.csv --json ${FILE}.${GENE}.variants.json --duplicate-out ${FILE}.${GENE}.duplicates.variants.json
            with TaskGroup(f"filter_{gene}") as filter:

                COMPRESSOR = """
                {{ params.hyphy }} LIBPATH={{params.hyphy_lib_path}} {{ params.compressor }} --msa $FASTA_FN --regexp "epi_isl_([0-9]+)" --duplicates $DUPLICATE_FN --output $VARIANTS_CSV_FN --json $VARIANTS_JSON_FN --duplicate-out $COMPRESSOR_DUPLICATE_OUT
                """
                compressor_task = BashOperator(
                    task_id='compressor',
                    bash_command=COMPRESSOR,
                    env={
                        'FASTA_FN': uniques_fn,
                        'DUPLICATE_FN': duplicate_output,
                        'VARIANTS_CSV_FN': variants_csv_output,
                        'VARIANTS_JSON_FN': variants_json_output,
                        'COMPRESSOR_DUPLICATE_OUT': compressor_duplicate_out,
                        **os.environ,
                    },
                    dag=dag,
                )

                # --output-edits ${FILE}.${GENE}.filtered.edits.json
                COMPRESSOR2 = """
                {{ params.hyphy }} LIBPATH={{params.hyphy_lib_path}} {{ params.compressor2 }} --msa $FASTA_FN --duplicates $DUPLICATE_FN --csv $VARIANTS_CSV_FN --byseq $VARIANTS_JSON_FN --p 0.95 --output $FILTERED_FASTA_FN --json $FILTERED_JSON_FN --output-edits ${OUTPUT_EDITS}
                """
                compressor_two_task = BashOperator(
                    task_id='compressor_two',
                    bash_command=COMPRESSOR2,
                    env={
                        'FASTA_FN': uniques_fn,
                        'DUPLICATE_FN': compressor_duplicate_out,
                        'VARIANTS_CSV_FN': variants_csv_output,
                        'VARIANTS_JSON_FN': variants_json_output,
                        'FILTERED_FASTA_FN': filtered_fasta_output,
                        'FILTERED_JSON_FN': filtered_json_output,
                        'OUTPUT_EDITS': output_edits_fn,
                        **os.environ,
                    },
                    dag=dag,
                )

                compressor_task >> compressor_two_task

            INFER_TREE = """
            seqmagick convert $FILTERED_FASTA_FN $STO_OUTPUT;
            rapidnj $STO_OUTPUT -i sth > $TREE_OUTPUT
            sed -i "s/'//g" $TREE_OUTPUT;
            """
            infer_tree_task = BashOperator(
                task_id=f'infer_tree_{gene}',
                bash_command=INFER_TREE,
                env={
                    'FILTERED_FASTA_FN': filtered_fasta_output,
                    'STO_OUTPUT': sto_output,
                    'TREE_OUTPUT': tree_output,
                    **os.environ,
                },
                dag=dag,
            )

            slac_task = BashOperator(
                task_id=f'slac_{gene}',
                bash_command="{{ params.hyphy }} LIBPATH={{params.hyphy_lib_path}} slac --kill-zero-lengths Constrain ENV='_DO_TREE_REBALANCE_=1' --alignment $FILTERED_FASTA_FN --tree $TREE_OUTPUT --branches All --samples 0 --output $SLAC_OUTPUT",
                env={
                    'FILTERED_FASTA_FN': filtered_fasta_output,
                    'TREE_OUTPUT': tree_output,
                    'SLAC_OUTPUT': slac_output_fn,
                    **os.environ,
                },
                dag=dag,
            )

            big_data_flags = '--full-model No'

            fel_task = BashOperator(
                task_id=f'fel_{gene}',
                bash_command="{{ params.hyphy }} LIBPATH={{params.hyphy_lib_path}} fel --kill-zero-lengths Constrain ENV='_DO_TREE_REBALANCE_=1' $BIG_DATA_FLAGS --alignment $FILTERED_FASTA_FN --tree $TREE_OUTPUT --branches Internal --output $FEL_OUTPUT",
                env={
                    'FILTERED_FASTA_FN': filtered_fasta_output,
                    'TREE_OUTPUT': tree_output,
                    'FEL_OUTPUT': fel_output_fn,
                    'BIG_DATA_FLAGS': big_data_flags,
                    **os.environ,
                },
                dag=dag,
            )

            meme_task = BashOperator(
                task_id=f'meme_{gene}',
                bash_command="{{ params.hyphy }} LIBPATH={{params.hyphy_lib_path}} meme --kill-zero-lengths Constrain ENV='_DO_TREE_REBALANCE_=1' $BIG_DATA_FLAGS --alignment $FILTERED_FASTA_FN --tree $TREE_OUTPUT --branches Internal --output $MEME_OUTPUT",
                env={
                    'FILTERED_FASTA_FN': filtered_fasta_output,
                    'TREE_OUTPUT': tree_output,
                    'MEME_OUTPUT': meme_output_fn,
                    'BIG_DATA_FLAGS': big_data_flags,
                    **os.environ,
                },
                dag=dag,
            )

            annotation_file = filepath_prefix + '.annotation.json'
            copy_annotation_task = BashOperator(
                task_id=f'copy_annotation_{gene}',
                bash_command='cp {{params.working_dir}}/data/comparative-annotation.json {{params.annotation_file}}',
                params={'annotation_file': annotation_file, 'working_dir': WORKING_DIR},
                dag=dag,
            )

            summarize_gene_task = BashOperator(
                task_id=f'summarize_gene_{gene}',
                bash_command='{{ params.python }} {{params.working_dir}}/python/summarize_gene.py -T {{params.working_dir}}/data/ctl/epitopes.json -B {{params.working_dir}}/data/single_mut_effects.csv -D $MASTERNOFASTA -d $DUPLICATES -s $SLAC_OUTPUT -f $FEL_OUTPUT -m $MEME_OUTPUT -P 0.1 --output $SUMMARY_OUTPUT -c $COMPRESSED_OUTPUT_FN -E {{params.working_dir}}/data/evo_annotation.json -A {{params.working_dir}}/data/mafs.csv -V {{params.working_dir}}/data/evo_freqs.csv -F $FRAGMENT --frame_shift $ADDSHIFT --fragment_shift $SHIFT -S $OFFSET -O $ANNOTATION',
                params={'python': default_args['params']['python'], 'working_dir': WORKING_DIR},
                env={
                    'MASTERNOFASTA': default_args["params"]["meta-output"],
                    'DUPLICATES': duplicate_output,
                    'SLAC_OUTPUT': slac_output_fn,
                    'FEL_OUTPUT': fel_output_fn,
                    'MEME_OUTPUT': meme_output_fn,
                    'SUMMARY_OUTPUT': summary_output_fn,
                    'COMPRESSED_OUTPUT_FN': filtered_fasta_output,
                    'FRAGMENT': str(regions[gene]['fragment']),
                    'ADDSHIFT': str(regions[gene]['add_one']),
                    'SHIFT': str(regions[gene]['shift']),
                    'OFFSET': str(regions[gene]['offset']),
                    'ANNOTATION': annotation_file,
                    **os.environ,
                },
                dag=dag,
            )

            summarize_gene_task.set_upstream(export_meta_task)
            alignment.set_upstream(export_sequences_task)

            export_by_gene.append(
                alignment >> duplicates_group >> filter >> infer_tree_task
                >> [slac_task, fel_task, meme_task]
                >> copy_annotation_task >> summarize_gene_task
            )

        dag.doc_md = __doc__

        # Add export meta and export sequence tasks to be executed in parallel
        cross_downstream([export_meta_task, export_sequences_task], export_by_gene)

        return dag
    bash_command='mkdir -p {{params.directory_output}}',
    params={"directory_output": directory_output},
    dag=dag,
)

print(default_args['params'])

export_meta_task = PythonOperator(
    task_id='export_meta',
    python_callable=export_meta,
    op_kwargs={"config": default_args['params']},
    pool='mongo',
    dag=dag,
)
export_meta_task.set_upstream(mk_dir_task)

export_by_gene = []

for gene in regions.keys():
    gene_directory_output = directory_output + "/" + gene
    default_args["params"]["directory"] = gene_directory_output
    default_args["params"]["sequence-output"] = gene_directory_output + '/sequences'
    default_args["params"]["duplicate-output"] = gene_directory_output + '/duplicates.json'

    nuc_sequence_output = default_args["params"]["directory"] + '/sequences_nuc.fas'
    bealign_nuc_sequence_output = default_args["params"]["directory"] + '/sequences_nuc.bealign.fas'
    prot_sequence_output = default_args["params"]["directory"] + '/sequences_protein.fas'
# [START push_data_into_datastore_task]
def task_push_data_into_datastore(**context):
    logging.info("Loading resource via API")
    resource_dict = context["params"].get("resource", {})
    ckan_api_key = context["params"].get("ckan_config", {}).get("api_key")
    ckan_site_url = context["params"].get("ckan_config", {}).get("site_url")
    return load_resource_via_api(resource_dict, ckan_api_key, ckan_site_url)


push_data_into_datastore_task = PythonOperator(
    task_id="push_data_into_datastore",
    provide_context=True,
    python_callable=task_push_data_into_datastore,
    trigger_rule="none_failed_or_skipped",
    dag=dag,
    doc_md=dedent(
        """\
    #### create new datastore table
    This task pushes the data into the datastore, on a newly created or existing
    datastore table.
    """
    ),
)
# [END push_data_into_datastore_task]

# [SET WORKFLOW]
check_schema_task.set_upstream(fetch_and_read_data_task)
create_datastore_table_task.set_upstream(check_schema_task)
push_data_into_datastore_task.set_upstream(
    [create_datastore_table_task, check_schema_task]
)
# [END WORKFLOW]
dag = DAG(
    dag_id='vaccination_count_operator',
    default_args=args,
    schedule_interval=None,
)


def print_context(ds, **kwargs):
    print(kwargs)
    return 'Whatever you return gets printed in the logs'


def get_data():
    url = "https://opendata.arcgis.com/datasets/da83fdaab14e42f0b3fe198a15c5bad5_0.geojson"
    obj = requests.get(url).content
    return obj


run_this = PythonOperator(
    task_id='print_context',
    provide_context=True,
    python_callable=print_context,
    dag=dag,
)

for i in range(10):
    # Each task in the loop needs a unique task_id; reusing the same id would
    # raise a duplicate task id error when the DAG is parsed.
    task = PythonOperator(
        task_id=f"get_data_url_{i}",
        provide_context=True,
        python_callable=print_context,
        dag=dag,
    )
    task.set_upstream(run_this)
def create_dag(dag_id, schedule, window, default_args):
    with DAG(
        dag_id,
        default_args=default_args,
        description='creates sliding windows based on months',
        schedule_interval=schedule,
        start_date=datetime.datetime(2021, 4, 30),
        on_failure_callback=task_fail_slack_alert,
        on_success_callback=task_success_slack_alert,
        tags=['selection', 'sliding'],
    ) as dag:

        OUTPUT_DIR = WORKING_DIR + "/data/sliding-windows/" + '_'.join(window)
        default_args["params"]["output-dir"] = OUTPUT_DIR
        default_args["params"]["meta-output"] = OUTPUT_DIR + '/master-no-sequences.json'
        default_args["params"]["sequence-output"] = OUTPUT_DIR + '/sequences'

        with open(dag.params["region_cfg"], 'r') as stream:
            regions = yaml.safe_load(stream)

        mk_dir_task = BashOperator(
            task_id='make_directory',
            bash_command='mkdir -p {{params.output}}',
            params={'output': default_args['params']['output-dir']},
            dag=dag,
        )

        export_meta_task = PythonOperator(
            task_id='export_meta',
            python_callable=export_meta,
            op_kwargs={"config": default_args['params']},
            pool='mongo',
            dag=dag,
        )
        export_meta_task.set_upstream(mk_dir_task)

        export_sequences_task = PythonOperator(
            task_id='export_sequences',
            python_callable=export_sequences,
            op_kwargs={"config": default_args['params']},
            pool='mongo',
            dag=dag,
        )
        export_sequences_task.set_upstream(mk_dir_task)

        # For each region
        export_by_gene = []

        for gene in regions.keys():
            reference_filepath = WORKING_DIR + 'reference_genes/reference.' + gene + '_protein.fas'
            filepath_prefix = OUTPUT_DIR + '/sequences.' + gene

            nuc_sequence_output = filepath_prefix + '_nuc.fas'
            prot_sequence_output = filepath_prefix + '_protein.fas'

            initial_duplicate_output = filepath_prefix + '.initial.duplicates.json'
            protein_duplicate_output = filepath_prefix + '.protein.duplicates.json'
            duplicate_output = filepath_prefix + '.duplicates.json'
            map_output = filepath_prefix + '.map.json'

            variants_csv_output = filepath_prefix + '.variants.csv'
            variants_json_output = filepath_prefix + '.variants.json'
            filtered_fasta_output = filepath_prefix + '.compressed.filtered.fas'
            filtered_json_output = filepath_prefix + '.filtered.json'
            output_edits_fn = filepath_prefix + '.filtered.edits.json'

            compressed_output_filepath = filepath_prefix + '.compressed.fas'
            compressor_duplicate_out = filepath_prefix + '.duplicates.variants.json'

            tree_output = filepath_prefix + '.compressed.filtered.fas.rapidnj.bestTree'
            sto_output = filepath_prefix + '.compressed.filtered.sto'

            tmp_output_fn = filepath_prefix + '.tmp.msa'
            output_fn = filepath_prefix + '.msa'

            slac_output_fn = filepath_prefix + '.SLAC.json'
            fel_output_fn = filepath_prefix + '.FEL.json'
            meme_output_fn = filepath_prefix + '.MEME.json'

            summary_output_fn = filepath_prefix + '.json'

            default_args["params"]["nuc-sequence-output"] = nuc_sequence_output
            default_args["params"]["prot-sequence-output"] = prot_sequence_output
            default_args["params"]["duplicate-output"] = duplicate_output
            default_args["params"]["protein-duplicate-output"] = protein_duplicate_output
            default_args["params"]["inital-duplicate-output"] = initial_duplicate_output

            with TaskGroup(f"alignment_{gene}") as alignment:

                export_premsa_sequence_task = PythonOperator(
                    task_id=f'export_premsa_sequences_{gene}',
                    python_callable=export_premsa_sequences,
                    op_kwargs={
                        "config": default_args['params'],
                        'nuc_output_fn': nuc_sequence_output,
                        'prot_output_fn': prot_sequence_output,
                        'gene': gene,
                    },
                    pool='mongo',
                    dag=dag,
                )

                export_duplicates_task = PythonOperator(
                    task_id=f'export_duplicates_{gene}',
                    python_callable=export_duplicates,
                    op_kwargs={
                        'output_fn': initial_duplicate_output,
                        'gene': gene,
                    },
                    pool='mongo',
                    dag=dag,
                )

                MAFFT = """
                {{ params.mafft }} --thread -1 --add $INPUT_FN $REFERENCE_FILEPATH >| $TMP_OUTPUT_FN
                """
                mafft_task = BashOperator(
                    task_id=f'mafft_{gene}',
                    bash_command=MAFFT,
                    params={'mafft': default_args['params']['mafft']},
                    env={
                        'INPUT_FN': prot_sequence_output,
                        'TMP_OUTPUT_FN': tmp_output_fn,
                        'REFERENCE_FILEPATH': reference_filepath,
                    },
                    dag=dag,
                )

                # input_fn, reference_fn, output_fn
                remove_ref_task = PythonOperator(
                    task_id=f'remove_ref_{gene}',
                    python_callable=reserve_only_original_input,
                    op_kwargs={
                        "input_fn": tmp_output_fn,
                        "original_fn": prot_sequence_output,
                        "output_fn": output_fn,
                    },
                    dag=dag,
                )

                POSTMSA = """
                {{ params.hyphy }} LIBPATH={{params.hyphy_lib_path}} {{ params.post_msa }} --protein-msa $INPUT_FN --nucleotide-sequences $NUC_INPUT_FN --output $COMPRESSED_OUTPUT_FN --duplicates $DUPLICATE_OUTPUT_FN
                """
                # Run POST-MSA on the concatenated dataset to translate back to nucleotides
                reverse_translate_task = BashOperator(
                    task_id=f'post_msa_{gene}',
                    bash_command=POSTMSA,
                    env={
                        'INPUT_FN': output_fn,
                        'NUC_INPUT_FN': nuc_sequence_output,
                        'COMPRESSED_OUTPUT_FN': compressed_output_filepath,
                        'DUPLICATE_OUTPUT_FN': protein_duplicate_output,
                        **os.environ,
                    },
                    dag=dag,
                )

                cleanup_task = BashOperator(
                    task_id=f'cleanup_{gene}',
                    bash_command="sed -i '/^>/! s/[^ACTG-]/N/g' $COMPRESSED_OUTPUT_FN",
                    env={'COMPRESSED_OUTPUT_FN': compressed_output_filepath, **os.environ},
                    dag=dag,
                )

                [export_premsa_sequence_task] >> mafft_task >> remove_ref_task >> reverse_translate_task >> cleanup_task

            with TaskGroup(f"duplicates_{gene}") as duplicates_group:

                merge_duplicate_task = PythonOperator(
                    task_id=f'merge_duplicates_{gene}',
                    python_callable=merge_duplicates,
                    op_kwargs={
                        'protein_duplicates': protein_duplicate_output,
                        'nuc_duplicates': initial_duplicate_output,
                        'output': duplicate_output,
                    },
                    dag=dag,
                )

                # Fix duplicates
                fix_duplicate_task = PythonOperator(
                    task_id=f'fix_duplicates_{gene}',
                    python_callable=fix_duplicates,
                    op_kwargs={
                        'duplicates': duplicate_output,
                        'map': map_output,
                        'overwrite': True,
                    },
                    dag=dag,
                )

                # Fix header files
                # echo "$PYTHON python/update_fasta_duplicates.py -f ${FILE}.${GENE}.compressed.fas -m ${FILE}.${GENE}.map.json"
                # $PYTHON python/update_fasta_duplicates.py -f ${FILE}.${GENE}.compressed.fas -m ${FILE}.${GENE}.map.json
                update_fasta_duplicates_task = PythonOperator(
                    task_id=f'update_fasta_duplicates_{gene}',
                    python_callable=update_fasta_duplicates,
                    op_kwargs={
                        'fasta_file': compressed_output_filepath,
                        'map_file': map_output,
                    },
                    dag=dag,
                )

                merge_duplicate_task >> fix_duplicate_task >> update_fasta_duplicates_task

            # $HYPHY LIBPATH=$HYPHYLIBPATH $COMPRESSOR --msa ${FILE}.${GENE}.compressed.fas --regexp "epi_isl_([0-9]+)" --duplicates ${FILE}.${GENE}.duplicates.json --output ${FILE}.${GENE}.variants.csv --json ${FILE}.${GENE}.variants.json --duplicate-out ${FILE}.${GENE}.duplicates.variants.json
            with TaskGroup(f"filter_{gene}") as filter:

                COMPRESSOR = """
                {{ params.hyphy }} LIBPATH={{params.hyphy_lib_path}} {{ params.compressor }} --msa $COMPRESSED_FN --regexp "epi_isl_([0-9]+)" --duplicates $DUPLICATE_FN --output $VARIANTS_CSV_FN --json $VARIANTS_JSON_FN --duplicate-out $COMPRESSOR_DUPLICATE_OUT
                """
                compressor_task = BashOperator(
                    task_id=f'compressor_{gene}',
                    bash_command=COMPRESSOR,
                    env={
                        'COMPRESSED_FN': compressed_output_filepath,
                        'DUPLICATE_FN': duplicate_output,
                        'VARIANTS_CSV_FN': variants_csv_output,
                        'VARIANTS_JSON_FN': variants_json_output,
                        'COMPRESSOR_DUPLICATE_OUT': compressor_duplicate_out,
                        **os.environ,
                    },
                    dag=dag,
                )

                # --output-edits ${FILE}.${GENE}.filtered.edits.json
                COMPRESSOR2 = """
                {{ params.hyphy }} LIBPATH={{params.hyphy_lib_path}} {{ params.compressor2 }} --msa $COMPRESSED_FN --duplicates $DUPLICATE_FN --csv $VARIANTS_CSV_FN --byseq $VARIANTS_JSON_FN --p 0.95 --output $FILTERED_FASTA_FN --json $FILTERED_JSON_FN --output-edits ${OUTPUT_EDITS}
                """
                compressor_two_task = BashOperator(
                    task_id=f'compressor_two_{gene}',
                    bash_command=COMPRESSOR2,
                    env={
                        'COMPRESSED_FN': compressed_output_filepath,
                        'DUPLICATE_FN': compressor_duplicate_out,
                        'VARIANTS_CSV_FN': variants_csv_output,
                        'VARIANTS_JSON_FN': variants_json_output,
                        'FILTERED_FASTA_FN': filtered_fasta_output,
                        'FILTERED_JSON_FN': filtered_json_output,
                        'OUTPUT_EDITS': output_edits_fn,
                        **os.environ,
                    },
                    dag=dag,
                )

                compressor_task >> compressor_two_task

            INFER_TREE = """
            seqmagick convert $FILTERED_FASTA_FN $STO_OUTPUT;
            rapidnj $STO_OUTPUT -i sth > $TREE_OUTPUT
            sed -i "s/'//g" $TREE_OUTPUT;
            """
            infer_tree_task = BashOperator(
                task_id=f'infer_tree_{gene}',
                bash_command=INFER_TREE,
                env={
                    'FILTERED_FASTA_FN': filtered_fasta_output,
                    'STO_OUTPUT': sto_output,
                    'TREE_OUTPUT': tree_output,
                    **os.environ,
                },
                dag=dag,
            )

            slac_task = BashOperator(
                task_id=f'slac_{gene}',
                bash_command="{{ params.hyphy }} LIBPATH={{params.hyphy_lib_path}} slac --kill-zero-lengths Constrain ENV='_DO_TREE_REBALANCE_=1' --alignment $FILTERED_FASTA_FN --tree $TREE_OUTPUT --branches All --samples 0 --output $SLAC_OUTPUT",
                env={
                    'FILTERED_FASTA_FN': filtered_fasta_output,
                    'TREE_OUTPUT': tree_output,
                    'SLAC_OUTPUT': slac_output_fn,
                    **os.environ,
                },
                dag=dag,
            )

            big_data_flags = '--full-model No'

            fel_task = BashOperator(
                task_id=f'fel_{gene}',
                bash_command="{{ params.hyphy }} LIBPATH={{params.hyphy_lib_path}} fel --kill-zero-lengths Constrain ENV='_DO_TREE_REBALANCE_=1' $BIG_DATA_FLAGS --alignment $FILTERED_FASTA_FN --tree $TREE_OUTPUT --branches Internal --output $FEL_OUTPUT",
                env={
                    'FILTERED_FASTA_FN': filtered_fasta_output,
                    'TREE_OUTPUT': tree_output,
                    'FEL_OUTPUT': fel_output_fn,
                    'BIG_DATA_FLAGS': big_data_flags,
                    **os.environ,
                },
                dag=dag,
            )

            meme_task = BashOperator(
                task_id=f'meme_{gene}',
                bash_command="{{ params.hyphy }} LIBPATH={{params.hyphy_lib_path}} meme --kill-zero-lengths Constrain ENV='_DO_TREE_REBALANCE_=1' $BIG_DATA_FLAGS --alignment $FILTERED_FASTA_FN --tree $TREE_OUTPUT --branches Internal --output $MEME_OUTPUT",
                env={
                    'FILTERED_FASTA_FN': filtered_fasta_output,
                    'TREE_OUTPUT': tree_output,
                    'MEME_OUTPUT': meme_output_fn,
                    'BIG_DATA_FLAGS': big_data_flags,
                    **os.environ,
                },
                dag=dag,
            )

            # fubar_task = BashOperator(
            #     task_id='fubar_{gene}',
            #     bash_command='mkdir -p {{params.working_dir}}/data/fasta/{{params.date}}',
            #     dag=dag,
            # )

            # prime_task = BashOperator(
            #     task_id='prime_{gene}',
            #     bash_command='mkdir -p {{params.working_dir}}/data/fasta/{{params.date}}',
            #     dag=dag,
            # )

            annotation_file = filepath_prefix + '.annotation.json'
            copy_annotation_task = BashOperator(
                task_id=f'copy_annotation_{gene}',
                bash_command='cp {{params.working_dir}}/data/comparative-annotation.json {{params.annotation_file}}',
                params={'annotation_file': annotation_file, 'working_dir': WORKING_DIR},
                dag=dag,
            )

            summarize_gene_task = BashOperator(
                task_id=f'summarize_gene_{gene}',
                bash_command='{{ params.python }} {{params.working_dir}}/python/summarize_gene.py -T {{params.working_dir}}/data/ctl/epitopes.json -B {{params.working_dir}}/data/single_mut_effects.csv -D $MASTERNOFASTA -d $DUPLICATES -s $SLAC_OUTPUT -f $FEL_OUTPUT -m $MEME_OUTPUT -P 0.1 --output $SUMMARY_OUTPUT -c $COMPRESSED_OUTPUT_FN -E {{params.working_dir}}/data/evo_annotation.json -A {{params.working_dir}}/data/mafs.csv -V {{params.working_dir}}/data/evo_freqs.csv -F $FRAGMENT --frame_shift $ADDSHIFT --fragment_shift $SHIFT -S $OFFSET -O $ANNOTATION',
                params={'python': default_args['params']['python'], 'working_dir': WORKING_DIR},
                env={
                    'MASTERNOFASTA': default_args["params"]["meta-output"],
                    'DUPLICATES': duplicate_output,
                    'SLAC_OUTPUT': slac_output_fn,
                    'FEL_OUTPUT': fel_output_fn,
                    'MEME_OUTPUT': meme_output_fn,
                    'SUMMARY_OUTPUT': summary_output_fn,
                    'COMPRESSED_OUTPUT_FN': filtered_fasta_output,
                    'FRAGMENT': str(regions[gene]['fragment']),
                    'ADDSHIFT': str(regions[gene]['add_one']),
                    'SHIFT': str(regions[gene]['shift']),
                    'OFFSET': str(regions[gene]['offset']),
                    'ANNOTATION': annotation_file,
                    **os.environ,
                },
                dag=dag,
            )

            summarize_gene_task.set_upstream(export_meta_task)
            alignment.set_upstream(export_sequences_task)

            export_by_gene.append(
                alignment >> duplicates_group >> filter >> infer_tree_task
                >> [slac_task, fel_task, meme_task]
                >> copy_annotation_task >> summarize_gene_task
            )

        dag.doc_md = __doc__

        # Add export meta and export sequence tasks to be executed in parallel
        cross_downstream([export_meta_task, export_sequences_task], export_by_gene)

        return dag
    with open(output_path, "w") as fp:
        fp.write(input_value)
        fp.write("\n\n")
        fp.write(str(datetime.now().strftime("%Y-%m-%dT%H:%M:%S")))
    return "success"


with DAG(dag_id="dbnd_operators", default_args=default_args) as dag_operators:
    # t1, t2 and t3 are examples of tasks created by instantiating operators
    t1 = my_task(2)
    t2, t3 = my_multiple_outputs(t1)

    tp = PythonOperator(
        task_id="some_python_function",
        python_callable=some_python_function,
        op_kwargs={"input_path": t3, "output_path": "/tmp/output.txt"},
    )
    tp.set_upstream(t3.op)

    t1_op = t1.op


if __name__ == "__main__":
    ti = TaskInstance(t1_op, days_ago(0))
    ti.run(ignore_task_deps=True, ignore_ti_state=True, test_mode=True)

    # dag_operators.clear()
    # dag_operators.run()