Example #1
        echo "Error removing the lock file. Check file permissions. To re-run the DAG, ensure that the lock file has been deleted (""" + str(
    LOG_CLEANUP_PROCESS_LOCK_FILE) + """)."
        exit ${REMOVE_LOCK_FILE_EXIT_CODE}
    fi

else
    echo "Another task is already deleting logs on this worker node. \
    Skipping it!"
    echo "If you believe you're receiving this message in error, kindly check \
    if """ + str(LOG_CLEANUP_PROCESS_LOCK_FILE) + """ exists and delete it."
    exit 0
fi

"""

for log_cleanup_id in range(1, NUMBER_OF_WORKERS + 1):

    for dir_id, directory in enumerate(DIRECTORIES_TO_DELETE):

        log_cleanup_op = BashOperator(task_id='log_cleanup_worker_num_' +
                                      str(log_cleanup_id) + '_dir_' +
                                      str(dir_id),
                                      bash_command=log_cleanup,
                                      params={
                                          "directory": str(directory),
                                          "sleep_time": int(log_cleanup_id) * 3
                                      },
                                      dag=dag)

        log_cleanup_op.set_upstream(start)
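
For context, the snippet above is the tail of a longer bash template; a minimal sketch of the lock-file guard it closes, assuming only the names visible in the fragment (everything else is illustrative), might look like this:

# A minimal sketch of the surrounding lock-file guard; only the names visible
# in the fragment above are taken from it, the rest is an assumption.
log_cleanup = """
LOCK_FILE=""" + str(LOG_CLEANUP_PROCESS_LOCK_FILE) + """
sleep {{ params.sleep_time }}             # stagger workers, see params above
if [ ! -f "${LOCK_FILE}" ]; then
    touch "${LOCK_FILE}"                  # acquire the per-worker lock
    # ... delete old logs under {{ params.directory }} here ...
    rm -f "${LOCK_FILE}"
    REMOVE_LOCK_FILE_EXIT_CODE=$?
    if [ "${REMOVE_LOCK_FILE_EXIT_CODE}" != "0" ]; then
        echo "Error removing the lock file (${LOCK_FILE})."
        exit ${REMOVE_LOCK_FILE_EXIT_CODE}
    fi
else
    echo "Another task is already deleting logs on this worker node. Skipping it!"
    exit 0
fi
"""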
Example #2
def create_dag(dag_id, schedule, window, default_args):
    with DAG(
            dag_id,
            default_args=default_args,
            description='creates sliding windows based on months',
            schedule_interval=schedule,
            start_date=datetime.datetime(2021, 4, 30),
            on_failure_callback=dag_fail_slack_alert,
            on_success_callback=dag_success_slack_alert,
            tags=['selection', 'sliding'],
    ) as dag:

        OUTPUT_DIR = WORKING_DIR + "/data/sliding-windows-bealign/" + '_'.join(
            window)
        default_args["params"]["output-dir"] = OUTPUT_DIR
        default_args["params"][
            "meta-output"] = OUTPUT_DIR + '/master-no-sequences.json'
        default_args["params"]["sequence-output"] = OUTPUT_DIR + '/sequences'

        with open(dag.params["region_cfg"], 'r') as stream:
            regions = yaml.safe_load(stream)

        mk_dir_task = BashOperator(
            task_id='make_directory',
            bash_command='mkdir -p {{params.output}}',
            params={'output': default_args['params']['output-dir']},
            dag=dag,
        )

        export_meta_task = PythonOperator(
            task_id='export_meta',
            python_callable=export_meta,
            op_kwargs={"config": default_args['params']},
            pool='mongo',
            dag=dag,
        )

        export_meta_task.set_upstream(mk_dir_task)
        export_sequences_task = PythonOperator(
            task_id='export_sequences',
            python_callable=export_sequences,
            op_kwargs={"config": default_args['params']},
            pool='mongo',
            dag=dag,
        )

        export_sequences_task.set_upstream(mk_dir_task)

        # For each region
        export_by_gene = []

        for gene in regions.keys():

            filepath_prefix = OUTPUT_DIR + '/sequences.' + gene

            nuc_sequence_output = filepath_prefix + '_nuc.fas'
            uniques_fn = filepath_prefix + '_nuc.uniques.fas'
            duplicate_output = filepath_prefix + '.duplicates.json'

            variants_csv_output = filepath_prefix + '.variants.csv'
            variants_json_output = filepath_prefix + '.variants.json'
            filtered_fasta_output = filepath_prefix + '.compressed.filtered.fas'
            filtered_json_output = filepath_prefix + '.filtered.json'
            output_edits_fn = filepath_prefix + '.filtered.edits.json'

            compressor_duplicate_out = filepath_prefix + '.duplicates.variants.json'

            tree_output = filepath_prefix + '.compressed.filtered.fas.rapidnj.bestTree'
            sto_output = filepath_prefix + '.compressed.filtered.sto'

            slac_output_fn = filepath_prefix + '.SLAC.json'
            fel_output_fn = filepath_prefix + '.FEL.json'
            meme_output_fn = filepath_prefix + '.MEME.json'

            summary_output_fn = filepath_prefix + '.json'

            default_args["params"]["nuc-sequence-output"] = nuc_sequence_output
            default_args["params"]["duplicate-output"] = duplicate_output

            with TaskGroup(f"alignment_{gene}") as alignment:

                export_bealign_task = PythonOperator(
                    task_id='export_bealign',
                    python_callable=export_bealign_sequences,
                    op_kwargs={
                        "config": default_args['params'],
                        'nuc_output_fn': nuc_sequence_output,
                        'gene': gene
                    },
                    dag=dag,
                )

                # Occasional errors when cleaning up tmp files, so or'ing true
                cleanup_task = BashOperator(
                    task_id='cleanup',
                    bash_command=
                    "sed -i '/^>/! s/[^ACTG-]/N/g' $NUC_OUTPUT_FN || true",
                    env={
                        'NUC_OUTPUT_FN': nuc_sequence_output,
                        **os.environ
                    },
                    dag=dag)

                export_bealign_task >> cleanup_task

            with TaskGroup(f"duplicates_{gene}") as duplicates_group:

                compute_duplicates_task = PythonOperator(
                    task_id='write_raw_duplicates',
                    python_callable=write_nuc_raw_duplicates,
                    op_kwargs={
                        "input": nuc_sequence_output,
                        "duplicate_output": duplicate_output,
                        'uniques_output': uniques_fn
                    },
                    dag=dag,
                )

                compute_duplicates_task  # no-op reference; the task is already registered in the group

            # $HYPHY LIBPATH=$HYPHYLIBPATH $COMPRESSOR --msa ${FILE}.${GENE}.compressed.fas --regexp "epi_isl_([0-9]+)" --duplicates ${FILE}.${GENE}.duplicates.json --output ${FILE}.${GENE}.variants.csv --json ${FILE}.${GENE}.variants.json --duplicate-out ${FILE}.${GENE}.duplicates.variants.json

            with TaskGroup(f"filter_{gene}") as filter:
                COMPRESSOR = """
                {{ params.hyphy }} LIBPATH={{params.hyphy_lib_path}} {{ params.compressor }} --msa $FASTA_FN --regexp "epi_isl_([0-9]+)" --duplicates $DUPLICATE_FN --output $VARIANTS_CSV_FN  --json $VARIANTS_JSON_FN --duplicate-out $COMPRESSOR_DUPLICATE_OUT
                """
                compressor_task = BashOperator(task_id='compressor',
                                               bash_command=COMPRESSOR,
                                               env={
                                                   'FASTA_FN': uniques_fn,
                                                   'DUPLICATE_FN':
                                                   duplicate_output,
                                                   'VARIANTS_CSV_FN':
                                                   variants_csv_output,
                                                   'VARIANTS_JSON_FN':
                                                   variants_json_output,
                                                   'COMPRESSOR_DUPLICATE_OUT':
                                                   compressor_duplicate_out,
                                                   **os.environ
                                               },
                                               dag=dag)

                # --output-edits ${FILE}.${GENE}.filtered.edits.json
                COMPRESSOR2 = """
                {{ params.hyphy }} LIBPATH={{params.hyphy_lib_path}} {{ params.compressor2 }} --msa $FASTA_FN --duplicates $DUPLICATE_FN --csv $VARIANTS_CSV_FN  --byseq $VARIANTS_JSON_FN --p 0.95 --output $FILTERED_FASTA_FN --json $FILTERED_JSON_FN --output-edits ${OUTPUT_EDITS}
                """
                compressor_two_task = BashOperator(
                    task_id='compressor_two',
                    bash_command=COMPRESSOR2,
                    env={
                        'FASTA_FN': uniques_fn,
                        'DUPLICATE_FN': compressor_duplicate_out,
                        'VARIANTS_CSV_FN': variants_csv_output,
                        'VARIANTS_JSON_FN': variants_json_output,
                        'FILTERED_FASTA_FN': filtered_fasta_output,
                        'FILTERED_JSON_FN': filtered_json_output,
                        'OUTPUT_EDITS': output_edits_fn,
                        **os.environ
                    },
                    dag=dag)

                compressor_task >> compressor_two_task

            INFER_TREE = """
            seqmagick convert $FILTERED_FASTA_FN $STO_OUTPUT;
            rapidnj $STO_OUTPUT -i sth > $TREE_OUTPUT
            sed -i "s/'//g" $TREE_OUTPUT;
            """

            infer_tree_task = BashOperator(task_id=f'infer_tree_{gene}',
                                           bash_command=INFER_TREE,
                                           env={
                                               'FILTERED_FASTA_FN':
                                               filtered_fasta_output,
                                               'STO_OUTPUT': sto_output,
                                               'TREE_OUTPUT': tree_output,
                                               **os.environ
                                           },
                                           dag=dag)

            slac_task = BashOperator(
                task_id=f'slac_{gene}',
                bash_command=
                "{{ params.hyphy }} LIBPATH={{params.hyphy_lib_path}} slac --kill-zero-lengths Constrain ENV='_DO_TREE_REBALANCE_=1' --alignment $FILTERED_FASTA_FN --tree $TREE_OUTPUT --branches All --samples 0 --output $SLAC_OUTPUT",
                env={
                    'FILTERED_FASTA_FN': filtered_fasta_output,
                    'TREE_OUTPUT': tree_output,
                    'SLAC_OUTPUT': slac_output_fn,
                    **os.environ
                },
                dag=dag,
            )

            big_data_flags = '--full-model No'

            fel_task = BashOperator(
                task_id=f'fel_{gene}',
                bash_command=
                "{{ params.hyphy }} LIBPATH={{params.hyphy_lib_path}} fel --kill-zero-lengths Constrain ENV='_DO_TREE_REBALANCE_=1' $BIG_DATA_FLAGS --alignment $FILTERED_FASTA_FN --tree $TREE_OUTPUT --branches Internal --output $FEL_OUTPUT",
                env={
                    'FILTERED_FASTA_FN': filtered_fasta_output,
                    'TREE_OUTPUT': tree_output,
                    'FEL_OUTPUT': fel_output_fn,
                    'BIG_DATA_FLAGS': big_data_flags,
                    **os.environ
                },
                dag=dag,
            )

            meme_task = BashOperator(
                task_id=f'meme_{gene}',
                bash_command=
                "{{ params.hyphy }} LIBPATH={{params.hyphy_lib_path}} meme --kill-zero-lengths Constrain ENV='_DO_TREE_REBALANCE_=1' $BIG_DATA_FLAGS --alignment $FILTERED_FASTA_FN --tree $TREE_OUTPUT --branches Internal --output $MEME_OUTPUT",
                env={
                    'FILTERED_FASTA_FN': filtered_fasta_output,
                    'TREE_OUTPUT': tree_output,
                    'MEME_OUTPUT': meme_output_fn,
                    'BIG_DATA_FLAGS': big_data_flags,
                    **os.environ
                },
                dag=dag,
            )

            annotation_file = filepath_prefix + '.annotation.json'
            copy_annotation_task = BashOperator(
                task_id=f'copy_annotation_{gene}',
                bash_command=
                'cp {{params.working_dir}}/data/comparative-annotation.json {{params.annotation_file}}',
                params={
                    'annotation_file': annotation_file,
                    'working_dir': WORKING_DIR
                },
                dag=dag)

            summarize_gene_task = BashOperator(
                task_id=f'summarize_gene_{gene}',
                bash_command=
                '{{ params.python }} {{params.working_dir}}/python/summarize_gene.py -T {{params.working_dir}}/data/ctl/epitopes.json -B {{params.working_dir}}/data/single_mut_effects.csv -D $MASTERNOFASTA -d $DUPLICATES -s $SLAC_OUTPUT -f $FEL_OUTPUT -m $MEME_OUTPUT -P 0.1 --output  $SUMMARY_OUTPUT -c $COMPRESSED_OUTPUT_FN -E {{params.working_dir}}/data/evo_annotation.json -A {{params.working_dir}}/data/mafs.csv -V {{params.working_dir}}/data/evo_freqs.csv -F $FRAGMENT --frame_shift $ADDSHIFT --fragment_shift $SHIFT -S $OFFSET -O $ANNOTATION',
                params={
                    'python': default_args['params']['python'],
                    'working_dir': WORKING_DIR
                },
                env={
                    'MASTERNOFASTA': default_args["params"]["meta-output"],
                    'DUPLICATES': duplicate_output,
                    'SLAC_OUTPUT': slac_output_fn,
                    'FEL_OUTPUT': fel_output_fn,
                    'MEME_OUTPUT': meme_output_fn,
                    'SUMMARY_OUTPUT': summary_output_fn,
                    'COMPRESSED_OUTPUT_FN': filtered_fasta_output,
                    'FRAGMENT': str(regions[gene]['fragment']),
                    'ADDSHIFT': str(regions[gene]['add_one']),
                    'SHIFT': str(regions[gene]['shift']),
                    'OFFSET': str(regions[gene]['offset']),
                    'ANNOTATION': annotation_file,
                    **os.environ
                },
                dag=dag,
            )

            summarize_gene_task.set_upstream(export_meta_task)
            alignment.set_upstream(export_sequences_task)
            export_by_gene.append(
                alignment >> duplicates_group >> filter_group >> infer_tree_task >> [
                    slac_task, fel_task, meme_task
                ] >> copy_annotation_task >> summarize_gene_task)

        dag.doc_md = __doc__

        # Make each per-gene pipeline depend on both export tasks; the two exports themselves run in parallel
        cross_downstream([export_meta_task, export_sequences_task],
                         export_by_gene)

        return dag
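
A factory like create_dag above is typically invoked in a loop, with each returned DAG registered in the module's globals so the scheduler can discover it. A minimal sketch of that wiring, where the window values and schedule are assumptions for illustration only:

# A minimal sketch of the usual dynamic-DAG registration pattern; the window
# list and "@weekly" schedule are hypothetical, not taken from the source.
windows = [('2021-04', '2021-06'), ('2021-05', '2021-07')]

for window in windows:
    dag_id = 'sliding_windows_' + '_'.join(window)
    # Airflow discovers DAGs by scanning module-level globals.
    globals()[dag_id] = create_dag(dag_id, '@weekly', window, default_args)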
Example #3
""" 
Setting up Dependencies 

Let's say we have tasks t1, t2 and t3 that do not depend on each other.
Note that when executing your script, Airflow will raise exceptions 
when it finds cycles in your DAG or when a dependency is referenced more 
than once.
"""

t1.set_downstream(t2)

# This means that t2 will depend on t1
# running successfully to run.
# It is equivalent to:
t2.set_upstream(t1)

# The bit shift operator can also be
# used to chain operations:
t1 >> t2

# And the upstream dependency with the
# bit shift operator:
t2 << t1

# Chaining multiple dependencies becomes
# concise with the bit shift operator:
t1 >> t2 >> t3

# A list of tasks can also be set as
# dependencies. These operations
# all have the same effect:
t1.set_downstream([t2, t3])
t1 >> [t2, t3]
[t2, t3] << t1
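
For longer linear sequences, Airflow also provides a chain() helper; a minimal sketch (the import path shown is for Airflow 2.x; it lived in airflow.utils.helpers in 1.10):

# chain() sets the same dependencies as t1 >> t2 >> t3 in one call,
# which is handy when the sequence of tasks is built dynamically.
from airflow.models.baseoperator import chain

chain(t1, t2, t3)  # equivalent to t1 >> t2 >> t3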
Example #4
        default_args["params"]["sequence-output"] = OUTPUT_DIR + '/sequences'

    test_task = PythonOperator(
        task_id='test_task',
        python_callable=my_task,
        dag=dag,
    )

    mk_dir_task = BashOperator(
        task_id='make_directory',
        bash_command='mkdir -p {{params.output}}',
        params={'output': default_args['params']['output-dir']},
        dag=dag,
    )

    mk_dir_task.set_upstream(test_task)

    export_meta_task = PythonOperator(
        task_id='export_meta',
        python_callable=export_meta,
        op_kwargs={"config": default_args['params']},
        pool='mongo',
        dag=dag,
    )

    export_meta_task.set_upstream(mk_dir_task)

    export_sequences_task = PythonOperator(
        task_id='export_sequences',
        python_callable=export_sequences,
        op_kwargs={"config": default_args['params']},
Example #5
def create_dag(dag_id, schedule, window, default_args):
    with DAG(
        dag_id,
        default_args=default_args,
        description='creates sliding windows based on months',
        schedule_interval=schedule,
        start_date=datetime.datetime(2021, 4, 30),
        on_failure_callback=task_fail_slack_alert,
        on_success_callback=task_success_slack_alert,
        tags=['selection','sliding'],
        ) as dag:

        OUTPUT_DIR = WORKING_DIR + "/data/sliding-windows/" + '_'.join(window)
        default_args["params"]["output-dir"] = OUTPUT_DIR
        default_args["params"]["meta-output"] = OUTPUT_DIR + '/master-no-sequences.json'
        default_args["params"]["sequence-output"] = OUTPUT_DIR + '/sequences'

        with open(dag.params["region_cfg"], 'r') as stream:
            regions = yaml.safe_load(stream)

        mk_dir_task = BashOperator(
            task_id='make_directory',
            bash_command='mkdir -p {{params.output}}',
            params={'output': default_args['params']['output-dir']},
            dag=dag,
        )

        export_meta_task = PythonOperator(
                task_id='export_meta',
                python_callable=export_meta,
                op_kwargs={ "config" : default_args['params'] },
                pool='mongo',
                dag=dag,
            )

        export_meta_task.set_upstream(mk_dir_task)
        export_sequences_task = PythonOperator(
                task_id='export_sequences',
                python_callable=export_sequences,
                op_kwargs={ "config" : default_args['params'] },
                pool='mongo',
                dag=dag,
            )

        export_sequences_task.set_upstream(mk_dir_task)

        # For each region
        export_by_gene = []

        for gene in regions.keys():

            reference_filepath = WORKING_DIR + '/reference_genes/reference.' + gene + '_protein.fas'
            filepath_prefix = OUTPUT_DIR + '/sequences.' + gene

            nuc_sequence_output = filepath_prefix + '_nuc.fas'
            prot_sequence_output = filepath_prefix + '_protein.fas'

            initial_duplicate_output = filepath_prefix + '.initial.duplicates.json'
            protein_duplicate_output = filepath_prefix + '.protein.duplicates.json'
            duplicate_output = filepath_prefix + '.duplicates.json'
            map_output = filepath_prefix + '.map.json'

            variants_csv_output = filepath_prefix + '.variants.csv'
            variants_json_output = filepath_prefix + '.variants.json'
            filtered_fasta_output = filepath_prefix + '.compressed.filtered.fas'
            filtered_json_output = filepath_prefix + '.filtered.json'
            output_edits_fn = filepath_prefix + '.filtered.edits.json'

            compressed_output_filepath =  filepath_prefix + '.compressed.fas'
            compressor_duplicate_out = filepath_prefix + '.duplicates.variants.json'

            tree_output = filepath_prefix + '.compressed.filtered.fas.rapidnj.bestTree'
            sto_output = filepath_prefix + '.compressed.filtered.sto'

            tmp_output_fn = filepath_prefix + '.tmp.msa'
            output_fn = filepath_prefix + '.msa'

            slac_output_fn = filepath_prefix + '.SLAC.json'
            fel_output_fn = filepath_prefix + '.FEL.json'
            meme_output_fn = filepath_prefix + '.MEME.json'

            summary_output_fn = filepath_prefix + '.json'

            default_args["params"]["nuc-sequence-output"] = nuc_sequence_output
            default_args["params"]["prot-sequence-output"] = prot_sequence_output
            default_args["params"]["duplicate-output"] = duplicate_output
            default_args["params"]["protein-duplicate-output"] = protein_duplicate_output
            default_args["params"]["inital-duplicate-output"] = initial_duplicate_output

            with TaskGroup(f"alignment_{gene}") as alignment:

                export_premsa_sequence_task = PythonOperator(
                        task_id=f'export_premsa_sequences_{gene}',
                        python_callable=export_premsa_sequences,
                        op_kwargs={ "config" : default_args['params'], 'nuc_output_fn':  nuc_sequence_output, 'prot_output_fn' : prot_sequence_output, 'gene' : gene },
                        pool='mongo',
                        dag=dag,
                    )

                export_duplicates_task = PythonOperator(
                    task_id=f'export_duplicates_{gene}',
                    python_callable=export_duplicates,
                    op_kwargs={ 'output_fn' : initial_duplicate_output, 'gene': gene },
                    pool='mongo',
                    dag=dag,
                )


                MAFFT = """
                {{ params.mafft }} --thread -1 --add $INPUT_FN $REFERENCE_FILEPATH >| $TMP_OUTPUT_FN
                """

                mafft_task = BashOperator(
                    task_id=f'mafft_{gene}',
                    bash_command=MAFFT,
                    params={'mafft': default_args['params']['mafft']},
                    env={'INPUT_FN': prot_sequence_output, 'TMP_OUTPUT_FN': tmp_output_fn, 'REFERENCE_FILEPATH': reference_filepath },
                    dag=dag
                )

                # input_fn, reference_fn, output_fn
                remove_ref_task = PythonOperator(
                    task_id=f'remove_ref_{gene}',
                    python_callable=reserve_only_original_input,
                    op_kwargs={ "input_fn" : tmp_output_fn, "original_fn" : prot_sequence_output, "output_fn": output_fn },
                    dag=dag,
                )

                POSTMSA = """
                {{ params.hyphy }} LIBPATH={{params.hyphy_lib_path}} {{ params.post_msa }} --protein-msa $INPUT_FN --nucleotide-sequences $NUC_INPUT_FN --output $COMPRESSED_OUTPUT_FN --duplicates $DUPLICATE_OUTPUT_FN
                """

                # Run POST-MSA on the concatenated dataset to translate back to nucleotides
                reverse_translate_task = BashOperator(
                    task_id=f'post_msa_{gene}',
                    bash_command=POSTMSA,
                    env={'INPUT_FN': output_fn, 'NUC_INPUT_FN': nuc_sequence_output , 'COMPRESSED_OUTPUT_FN': compressed_output_filepath, 'DUPLICATE_OUTPUT_FN': protein_duplicate_output, **os.environ},
                    dag=dag
                )

                cleanup_task = BashOperator(
                    task_id=f'cleanup_{gene}',
                    bash_command="sed -i '/^>/! s/[^ACTG-]/N/g' $COMPRESSED_OUTPUT_FN",
                    env={'COMPRESSED_OUTPUT_FN': compressed_output_filepath, **os.environ},
                    dag=dag
                )

                export_premsa_sequence_task >> mafft_task >> remove_ref_task >> reverse_translate_task >> cleanup_task  # export_duplicates_task has no intra-group edges and runs in parallel

            with TaskGroup(f"duplicates_{gene}") as duplicates_group:
                merge_duplicate_task = PythonOperator(
                    task_id=f'merge_duplicates_{gene}',
                    python_callable=merge_duplicates,
                    op_kwargs={ 'protein_duplicates' : protein_duplicate_output, 'nuc_duplicates': initial_duplicate_output, 'output':  duplicate_output},
                    dag=dag,
                )

                # Fix duplicates
                fix_duplicate_task = PythonOperator(
                    task_id=f'fix_duplicates_{gene}',
                    python_callable=fix_duplicates,
                    op_kwargs={ 'duplicates' : duplicate_output, 'map': map_output, 'overwrite': True },
                    dag=dag,
                )

                # # Fix header files
                # echo "$PYTHON python/update_fasta_duplicates.py -f ${FILE}.${GENE}.compressed.fas -m ${FILE}.${GENE}.map.json"
                # $PYTHON python/update_fasta_duplicates.py -f ${FILE}.${GENE}.compressed.fas -m ${FILE}.${GENE}.map.json

                update_fasta_duplicates_task = PythonOperator(
                    task_id=f'update_fasta_duplicates_{gene}',
                    python_callable=update_fasta_duplicates,
                    op_kwargs={ 'fasta_file' : compressed_output_filepath, 'map_file': map_output },
                    dag=dag,
                )

                merge_duplicate_task >> fix_duplicate_task >> update_fasta_duplicates_task

            # $HYPHY LIBPATH=$HYPHYLIBPATH $COMPRESSOR --msa ${FILE}.${GENE}.compressed.fas --regexp "epi_isl_([0-9]+)" --duplicates ${FILE}.${GENE}.duplicates.json --output ${FILE}.${GENE}.variants.csv --json ${FILE}.${GENE}.variants.json --duplicate-out ${FILE}.${GENE}.duplicates.variants.json

            with TaskGroup(f"filter_{gene}") as filter:
                COMPRESSOR = """
                {{ params.hyphy }} LIBPATH={{params.hyphy_lib_path}} {{ params.compressor }} --msa $COMPRESSED_FN --regexp "epi_isl_([0-9]+)" --duplicates $DUPLICATE_FN --output $VARIANTS_CSV_FN  --json $VARIANTS_JSON_FN --duplicate-out $COMPRESSOR_DUPLICATE_OUT
                """
                compressor_task = BashOperator(
                    task_id=f'compressor_{gene}',
                    bash_command=COMPRESSOR,
                    env={'COMPRESSED_FN': compressed_output_filepath, 'DUPLICATE_FN': duplicate_output, 'VARIANTS_CSV_FN': variants_csv_output, 'VARIANTS_JSON_FN': variants_json_output, 'COMPRESSOR_DUPLICATE_OUT': compressor_duplicate_out, **os.environ},
                    dag=dag
                )

                # --output-edits ${FILE}.${GENE}.filtered.edits.json
                COMPRESSOR2 = """
                {{ params.hyphy }} LIBPATH={{params.hyphy_lib_path}} {{ params.compressor2 }} --msa $COMPRESSED_FN --duplicates $DUPLICATE_FN --csv $VARIANTS_CSV_FN  --byseq $VARIANTS_JSON_FN --p 0.95 --output $FILTERED_FASTA_FN --json $FILTERED_JSON_FN --output-edits ${OUTPUT_EDITS}
                """
                compressor_two_task = BashOperator(
                    task_id=f'compressor_two_{gene}',
                    bash_command=COMPRESSOR2,
                    env={'COMPRESSED_FN': compressed_output_filepath, 'DUPLICATE_FN': compressor_duplicate_out, 'VARIANTS_CSV_FN': variants_csv_output, 'VARIANTS_JSON_FN': variants_json_output, 'FILTERED_FASTA_FN': filtered_fasta_output, 'FILTERED_JSON_FN': filtered_json_output, 'OUTPUT_EDITS': output_edits_fn, **os.environ},
                    dag=dag
                )

                compressor_task >> compressor_two_task

            INFER_TREE = """
            seqmagick convert $FILTERED_FASTA_FN $STO_OUTPUT;
            rapidnj $STO_OUTPUT -i sth > $TREE_OUTPUT
            sed -i "s/'//g" $TREE_OUTPUT;
            """

            infer_tree_task = BashOperator(
                task_id=f'infer_tree_{gene}',
                bash_command=INFER_TREE,
                env={'FILTERED_FASTA_FN': filtered_fasta_output, 'STO_OUTPUT': sto_output, 'TREE_OUTPUT': tree_output, **os.environ},
                dag=dag
            )

            slac_task = BashOperator(
                task_id=f'slac_{gene}',
                bash_command="{{ params.hyphy }} LIBPATH={{params.hyphy_lib_path}} slac --kill-zero-lengths Constrain ENV='_DO_TREE_REBALANCE_=1' --alignment $FILTERED_FASTA_FN --tree $TREE_OUTPUT --branches All --samples 0 --output $SLAC_OUTPUT",
                env={'FILTERED_FASTA_FN': filtered_fasta_output, 'TREE_OUTPUT': tree_output, 'SLAC_OUTPUT': slac_output_fn, **os.environ},
                dag=dag,
            )

            big_data_flags = '--full-model No'

            fel_task = BashOperator(
                task_id=f'fel_{gene}',
                bash_command="{{ params.hyphy }} LIBPATH={{params.hyphy_lib_path}} fel --kill-zero-lengths Constrain ENV='_DO_TREE_REBALANCE_=1' $BIG_DATA_FLAGS --alignment $FILTERED_FASTA_FN --tree $TREE_OUTPUT --branches Internal --output $FEL_OUTPUT",
                env={'FILTERED_FASTA_FN': filtered_fasta_output, 'TREE_OUTPUT': tree_output, 'FEL_OUTPUT': fel_output_fn, 'BIG_DATA_FLAGS': big_data_flags, **os.environ},
                dag=dag,
            )

            meme_task = BashOperator(
                task_id=f'meme_{gene}',
                bash_command="{{ params.hyphy }} LIBPATH={{params.hyphy_lib_path}} meme --kill-zero-lengths Constrain ENV='_DO_TREE_REBALANCE_=1' $BIG_DATA_FLAGS --alignment $FILTERED_FASTA_FN --tree $TREE_OUTPUT --branches Internal --output $MEME_OUTPUT",
                env={'FILTERED_FASTA_FN': filtered_fasta_output, 'TREE_OUTPUT': tree_output, 'MEME_OUTPUT': meme_output_fn, 'BIG_DATA_FLAGS': big_data_flags, **os.environ},
                dag=dag,
            )

            # fubar_task = BashOperator(
            #     task_id='fubar_{gene}',
            #     bash_command='mkdir -p {{params.working_dir}}/data/fasta/{{params.date}}',
            #     dag=dag,
            # )

            # prime_task = BashOperator(
            #     task_id='prime_{gene}',
            #     bash_command='mkdir -p {{params.working_dir}}/data/fasta/{{params.date}}',
            #     dag=dag,
            # )

            annotation_file = filepath_prefix + '.annotation.json'
            copy_annotation_task = BashOperator(
                task_id=f'copy_annotation_{gene}',
                bash_command='cp {{params.working_dir}}/data/comparative-annotation.json {{params.annotation_file}}',
                params={'annotation_file': annotation_file, 'working_dir': WORKING_DIR},
                dag=dag
            )

            summarize_gene_task = BashOperator(
                task_id=f'summarize_gene_{gene}',
                bash_command='{{ params.python }} {{params.working_dir}}/python/summarize_gene.py -T {{params.working_dir}}/data/ctl/epitopes.json -B {{params.working_dir}}/data/single_mut_effects.csv -D $MASTERNOFASTA -d $DUPLICATES -s $SLAC_OUTPUT -f $FEL_OUTPUT -m $MEME_OUTPUT -P 0.1 --output  $SUMMARY_OUTPUT -c $COMPRESSED_OUTPUT_FN -E {{params.working_dir}}/data/evo_annotation.json -A {{params.working_dir}}/data/mafs.csv -V {{params.working_dir}}/data/evo_freqs.csv -F $FRAGMENT --frame_shift $ADDSHIFT --fragment_shift $SHIFT -S $OFFSET -O $ANNOTATION',
                params={'python': default_args['params']['python'], 'working_dir': WORKING_DIR},
                env={
                    'MASTERNOFASTA': default_args["params"]["meta-output"],
                    'DUPLICATES': duplicate_output,
                    'SLAC_OUTPUT': slac_output_fn,
                    'FEL_OUTPUT': fel_output_fn,
                    'MEME_OUTPUT': meme_output_fn,
                    'SUMMARY_OUTPUT': summary_output_fn,
                    'COMPRESSED_OUTPUT_FN': filtered_fasta_output,
                    'FRAGMENT': str(regions[gene]['fragment']),
                    'ADDSHIFT': str(regions[gene]['add_one']),
                    'SHIFT': str(regions[gene]['shift']),
                    'OFFSET': str(regions[gene]['offset']),
                    'ANNOTATION': annotation_file,
                    **os.environ},
                dag=dag,
            )

            summarize_gene_task.set_upstream(export_meta_task)
            alignment.set_upstream(export_sequences_task)
            export_by_gene.append(alignment >> duplicates_group >> filter_group >> infer_tree_task >> [slac_task, fel_task, meme_task] >> copy_annotation_task >> summarize_gene_task)

        dag.doc_md = __doc__

        # Make each per-gene pipeline depend on both export tasks; the two exports themselves run in parallel
        cross_downstream([export_meta_task, export_sequences_task], export_by_gene)

        return dag
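
Both factory examples finish with cross_downstream; a minimal sketch of its semantics, using toy tasks a, b, c and d assumed purely for illustration:

# cross_downstream makes every task in the first list an upstream of every
# task in the second list (a, b, c, d are hypothetical tasks).
from airflow.models.baseoperator import cross_downstream

cross_downstream([a, b], [c, d])  # same as: a >> c, a >> d, b >> c, b >> d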
Example #6
default_args = {
    "owner": "iotoi",
    "start_date": days_ago(1),
}

dag = DAG(
    "check_minio_1",
    default_args=default_args,
    schedule_interval="@once",
    tags=["iotoi-samples"],
)

t1 = BashOperator(
    task_id="bash_test",
    bash_command='echo "hello, it should work" > s3_conn_test.txt',
    dag=dag,
)

sensor = S3KeySensor(
    task_id="check_s3_for_file_in_s3",
    bucket_key="*",
    bucket_name="my-first-bucket",
    wildcard_match=True,
    aws_conn_id="local_minio",
    poke_interval=10,
    dag=dag,
)

t1.set_upstream(sensor)
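
The aws_conn_id="local_minio" above must point the Amazon provider at the MinIO endpoint. One way to define it, sketched under the assumption that MinIO runs on localhost:9000 with default credentials, is an environment variable:

# A minimal sketch (endpoint and credentials are assumptions): Airflow builds
# connections from AIRFLOW_CONN_<CONN_ID> environment variables, with extras
# passed as URL-encoded query parameters. Recent Amazon provider releases read
# the endpoint from "endpoint_url" in the extras; older ones used "host".
import os

os.environ["AIRFLOW_CONN_LOCAL_MINIO"] = (
    "aws://minioadmin:minioadmin@/"
    "?endpoint_url=http%3A%2F%2Flocalhost%3A9000"
)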