Example #1
        echo "Error removing the lock file. Check file permissions. To re-run the DAG, ensure that the lock file has been deleted (""" + str(
    LOG_CLEANUP_PROCESS_LOCK_FILE) + """)."
        exit ${REMOVE_LOCK_FILE_EXIT_CODE}
    fi

else
    echo "Another task is already deleting logs on this worker node. \
    Skipping it!"
    echo "If you believe you're receiving this message in error, kindly check \
    if """ + str(LOG_CLEANUP_PROCESS_LOCK_FILE) + """ exists and delete it."
    exit 0
fi

"""

for log_cleanup_id in range(1, NUMBER_OF_WORKERS + 1):

    for dir_id, directory in enumerate(DIRECTORIES_TO_DELETE):

        log_cleanup_op = BashOperator(task_id='log_cleanup_worker_num_' +
                                      str(log_cleanup_id) + '_dir_' +
                                      str(dir_id),
                                      bash_command=log_cleanup,
                                      params={
                                          "directory": str(directory),
                                          "sleep_time": int(log_cleanup_id) * 3
                                      },
                                      dag=dag)

        log_cleanup_op.set_upstream(start)
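
For context, the snippet above is the tail of a longer bash template; a minimal sketch of the lock-file guard it closes, assuming only the names visible in the fragment (everything else is illustrative), might look like this:

# A minimal sketch of the surrounding lock-file guard; only the names visible
# in the fragment above are taken from it, the rest is an assumption.
log_cleanup = """
LOCK_FILE=""" + str(LOG_CLEANUP_PROCESS_LOCK_FILE) + """
sleep {{ params.sleep_time }}             # stagger workers, see params above
if [ ! -f "${LOCK_FILE}" ]; then
    touch "${LOCK_FILE}"                  # acquire the per-worker lock
    # ... delete old logs under {{ params.directory }} here ...
    rm -f "${LOCK_FILE}"
    REMOVE_LOCK_FILE_EXIT_CODE=$?
    if [ "${REMOVE_LOCK_FILE_EXIT_CODE}" != "0" ]; then
        echo "Error removing the lock file (${LOCK_FILE})."
        exit ${REMOVE_LOCK_FILE_EXIT_CODE}
    fi
else
    echo "Another task is already deleting logs on this worker node. Skipping it!"
    exit 0
fi
"""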
Example #2
def create_dag(dag_id, schedule, window, default_args):
    with DAG(
            dag_id,
            default_args=default_args,
            description='creates sliding windows based on months',
            schedule_interval=schedule,
            start_date=datetime.datetime(2021, 4, 30),
            on_failure_callback=dag_fail_slack_alert,
            on_success_callback=dag_success_slack_alert,
            tags=['selection', 'sliding'],
    ) as dag:

        OUTPUT_DIR = WORKING_DIR + "/data/sliding-windows-bealign/" + '_'.join(
            window)
        default_args["params"]["output-dir"] = OUTPUT_DIR
        default_args["params"][
            "meta-output"] = OUTPUT_DIR + '/master-no-sequences.json'
        default_args["params"]["sequence-output"] = OUTPUT_DIR + '/sequences'

        with open(dag.params["region_cfg"], 'r') as stream:
            regions = yaml.safe_load(stream)

        mk_dir_task = BashOperator(
            task_id='make_directory',
            bash_command='mkdir -p {{params.output}}',
            params={'output': default_args['params']['output-dir']},
            dag=dag,
        )

        export_meta_task = PythonOperator(
            task_id='export_meta',
            python_callable=export_meta,
            op_kwargs={"config": default_args['params']},
            pool='mongo',
            dag=dag,
        )

        export_meta_task.set_upstream(mk_dir_task)
        export_sequences_task = PythonOperator(
            task_id='export_sequences',
            python_callable=export_sequences,
            op_kwargs={"config": default_args['params']},
            pool='mongo',
            dag=dag,
        )

        export_sequences_task.set_upstream(mk_dir_task)

        # For each region
        export_by_gene = []

        for gene in regions.keys():

            filepath_prefix = OUTPUT_DIR + '/sequences.' + gene

            nuc_sequence_output = filepath_prefix + '_nuc.fas'
            uniques_fn = filepath_prefix + '_nuc.uniques.fas'
            duplicate_output = filepath_prefix + '.duplicates.json'

            variants_csv_output = filepath_prefix + '.variants.csv'
            variants_json_output = filepath_prefix + '.variants.json'
            filtered_fasta_output = filepath_prefix + '.compressed.filtered.fas'
            filtered_json_output = filepath_prefix + '.filtered.json'
            output_edits_fn = filepath_prefix + '.filtered.edits.json'

            compressor_duplicate_out = filepath_prefix + '.duplicates.variants.json'

            tree_output = filepath_prefix + '.compressed.filtered.fas.rapidnj.bestTree'
            sto_output = filepath_prefix + '.compressed.filtered.sto'

            slac_output_fn = filepath_prefix + '.SLAC.json'
            fel_output_fn = filepath_prefix + '.FEL.json'
            meme_output_fn = filepath_prefix + '.MEME.json'

            summary_output_fn = filepath_prefix + '.json'

            default_args["params"]["nuc-sequence-output"] = nuc_sequence_output
            default_args["params"]["duplicate-output"] = duplicate_output

            with TaskGroup(f"alignment_{gene}") as alignment:

                export_bealign_task = PythonOperator(
                    task_id='export_bealign',
                    python_callable=export_bealign_sequences,
                    op_kwargs={
                        "config": default_args['params'],
                        'nuc_output_fn': nuc_sequence_output,
                        'gene': gene
                    },
                    dag=dag,
                )

                # Occasional errors when cleaning up tmp files, so or'ing true
                cleanup_task = BashOperator(
                    task_id='cleanup',
                    bash_command=
                    "sed -i '/^>/! s/[^ACTG-]/N/g' $NUC_OUTPUT_FN || true",
                    env={
                        'NUC_OUTPUT_FN': nuc_sequence_output,
                        **os.environ
                    },
                    dag=dag)

                export_bealign_task >> cleanup_task

            with TaskGroup(f"duplicates_{gene}") as duplicates_group:

                compute_duplicates_task = PythonOperator(
                    task_id='write_raw_duplicates',
                    python_callable=write_nuc_raw_duplicates,
                    op_kwargs={
                        "input": nuc_sequence_output,
                        "duplicate_output": duplicate_output,
                        'uniques_output': uniques_fn
                    },
                    dag=dag,
                )

                compute_duplicates_task  # no-op reference; the task is already registered in the group

            # $HYPHY LIBPATH=$HYPHYLIBPATH $COMPRESSOR --msa ${FILE}.${GENE}.compressed.fas --regexp "epi_isl_([0-9]+)" --duplicates ${FILE}.${GENE}.duplicates.json --output ${FILE}.${GENE}.variants.csv --json ${FILE}.${GENE}.variants.json --duplicate-out ${FILE}.${GENE}.duplicates.variants.json

            with TaskGroup(f"filter_{gene}") as filter:
                COMPRESSOR = """
                {{ params.hyphy }} LIBPATH={{params.hyphy_lib_path}} {{ params.compressor }} --msa $FASTA_FN --regexp "epi_isl_([0-9]+)" --duplicates $DUPLICATE_FN --output $VARIANTS_CSV_FN  --json $VARIANTS_JSON_FN --duplicate-out $COMPRESSOR_DUPLICATE_OUT
                """
                compressor_task = BashOperator(task_id='compressor',
                                               bash_command=COMPRESSOR,
                                               env={
                                                   'FASTA_FN': uniques_fn,
                                                   'DUPLICATE_FN':
                                                   duplicate_output,
                                                   'VARIANTS_CSV_FN':
                                                   variants_csv_output,
                                                   'VARIANTS_JSON_FN':
                                                   variants_json_output,
                                                   'COMPRESSOR_DUPLICATE_OUT':
                                                   compressor_duplicate_out,
                                                   **os.environ
                                               },
                                               dag=dag)

                # --output-edits ${FILE}.${GENE}.filtered.edits.json
                COMPRESSOR2 = """
                {{ params.hyphy }} LIBPATH={{params.hyphy_lib_path}} {{ params.compressor2 }} --msa $FASTA_FN --duplicates $DUPLICATE_FN --csv $VARIANTS_CSV_FN  --byseq $VARIANTS_JSON_FN --p 0.95 --output $FILTERED_FASTA_FN --json $FILTERED_JSON_FN --output-edits ${OUTPUT_EDITS}
                """
                compressor_two_task = BashOperator(
                    task_id='compressor_two',
                    bash_command=COMPRESSOR2,
                    env={
                        'FASTA_FN': uniques_fn,
                        'DUPLICATE_FN': compressor_duplicate_out,
                        'VARIANTS_CSV_FN': variants_csv_output,
                        'VARIANTS_JSON_FN': variants_json_output,
                        'FILTERED_FASTA_FN': filtered_fasta_output,
                        'FILTERED_JSON_FN': filtered_json_output,
                        'OUTPUT_EDITS': output_edits_fn,
                        **os.environ
                    },
                    dag=dag)

                compressor_task >> compressor_two_task

            INFER_TREE = """
            seqmagick convert $FILTERED_FASTA_FN $STO_OUTPUT;
            rapidnj $STO_OUTPUT -i sth > $TREE_OUTPUT
            sed -i "s/'//g" $TREE_OUTPUT;
            """

            infer_tree_task = BashOperator(task_id=f'infer_tree_{gene}',
                                           bash_command=INFER_TREE,
                                           env={
                                               'FILTERED_FASTA_FN':
                                               filtered_fasta_output,
                                               'STO_OUTPUT': sto_output,
                                               'TREE_OUTPUT': tree_output,
                                               **os.environ
                                           },
                                           dag=dag)

            slac_task = BashOperator(
                task_id=f'slac_{gene}',
                bash_command=
                "{{ params.hyphy }} LIBPATH={{params.hyphy_lib_path}} slac --kill-zero-lengths Constrain ENV='_DO_TREE_REBALANCE_=1' --alignment $FILTERED_FASTA_FN --tree $TREE_OUTPUT --branches All --samples 0 --output $SLAC_OUTPUT",
                env={
                    'FILTERED_FASTA_FN': filtered_fasta_output,
                    'TREE_OUTPUT': tree_output,
                    'SLAC_OUTPUT': slac_output_fn,
                    **os.environ
                },
                dag=dag,
            )

            big_data_flags = '--full-model No'

            fel_task = BashOperator(
                task_id=f'fel_{gene}',
                bash_command=
                "{{ params.hyphy }} LIBPATH={{params.hyphy_lib_path}} fel --kill-zero-lengths Constrain ENV='_DO_TREE_REBALANCE_=1' $BIG_DATA_FLAGS --alignment $FILTERED_FASTA_FN --tree $TREE_OUTPUT --branches Internal --output $FEL_OUTPUT",
                env={
                    'FILTERED_FASTA_FN': filtered_fasta_output,
                    'TREE_OUTPUT': tree_output,
                    'FEL_OUTPUT': fel_output_fn,
                    'BIG_DATA_FLAGS': big_data_flags,
                    **os.environ
                },
                dag=dag,
            )

            meme_task = BashOperator(
                task_id=f'meme_{gene}',
                bash_command=
                "{{ params.hyphy }} LIBPATH={{params.hyphy_lib_path}} meme --kill-zero-lengths Constrain ENV='_DO_TREE_REBALANCE_=1' $BIG_DATA_FLAGS --alignment $FILTERED_FASTA_FN --tree $TREE_OUTPUT --branches Internal --output $MEME_OUTPUT",
                env={
                    'FILTERED_FASTA_FN': filtered_fasta_output,
                    'TREE_OUTPUT': tree_output,
                    'MEME_OUTPUT': meme_output_fn,
                    'BIG_DATA_FLAGS': big_data_flags,
                    **os.environ
                },
                dag=dag,
            )

            annotation_file = filepath_prefix + '.annotation.json'
            copy_annotation_task = BashOperator(
                task_id=f'copy_annotation_{gene}',
                bash_command=
                'cp {{params.working_dir}}/data/comparative-annotation.json {{params.annotation_file}}',
                params={
                    'annotation_file': annotation_file,
                    'working_dir': WORKING_DIR
                },
                dag=dag)

            summarize_gene_task = BashOperator(
                task_id=f'summarize_gene_{gene}',
                bash_command=
                '{{ params.python }} {{params.working_dir}}/python/summarize_gene.py -T {{params.working_dir}}/data/ctl/epitopes.json -B {{params.working_dir}}/data/single_mut_effects.csv -D $MASTERNOFASTA -d $DUPLICATES -s $SLAC_OUTPUT -f $FEL_OUTPUT -m $MEME_OUTPUT -P 0.1 --output  $SUMMARY_OUTPUT -c $COMPRESSED_OUTPUT_FN -E {{params.working_dir}}/data/evo_annotation.json -A {{params.working_dir}}/data/mafs.csv -V {{params.working_dir}}/data/evo_freqs.csv -F $FRAGMENT --frame_shift $ADDSHIFT --fragment_shift $SHIFT -S $OFFSET -O $ANNOTATION',
                params={
                    'python': default_args['params']['python'],
                    'working_dir': WORKING_DIR
                },
                env={
                    'MASTERNOFASTA': default_args["params"]["meta-output"],
                    'DUPLICATES': duplicate_output,
                    'SLAC_OUTPUT': slac_output_fn,
                    'FEL_OUTPUT': fel_output_fn,
                    'MEME_OUTPUT': meme_output_fn,
                    'SUMMARY_OUTPUT': summary_output_fn,
                    'COMPRESSED_OUTPUT_FN': filtered_fasta_output,
                    'FRAGMENT': str(regions[gene]['fragment']),
                    'ADDSHIFT': str(regions[gene]['add_one']),
                    'SHIFT': str(regions[gene]['shift']),
                    'OFFSET': str(regions[gene]['offset']),
                    'ANNOTATION': annotation_file,
                    **os.environ
                },
                dag=dag,
            )

            summarize_gene_task.set_upstream(export_meta_task)
            alignment.set_upstream(export_sequences_task)
            export_by_gene.append(
                alignment >> duplicates_group >> filter_group >> infer_tree_task >> [
                    slac_task, fel_task, meme_task
                ] >> copy_annotation_task >> summarize_gene_task)

        dag.doc_md = __doc__

        # Make each per-gene pipeline depend on both export tasks; the two exports themselves run in parallel
        cross_downstream([export_meta_task, export_sequences_task],
                         export_by_gene)

        return dag
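
A factory like create_dag above is typically invoked in a loop, with each returned DAG registered in the module's globals so the scheduler can discover it. A minimal sketch of that wiring, where the window values and schedule are assumptions for illustration only:

# A minimal sketch of the usual dynamic-DAG registration pattern; the window
# list and "@weekly" schedule are hypothetical, not taken from the source.
windows = [('2021-04', '2021-06'), ('2021-05', '2021-07')]

for window in windows:
    dag_id = 'sliding_windows_' + '_'.join(window)
    # Airflow discovers DAGs by scanning module-level globals.
    globals()[dag_id] = create_dag(dag_id, '@weekly', window, default_args)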
Example #3
""" 
Setting up Dependencies 

Let's say we have tasks t1, t2 and t3 that do not depend on each other.
Note that when executing your script, Airflow will raise exceptions 
when it finds cycles in your DAG or when a dependency is referenced more 
than once.
"""

t1.set_downstream(t2)

# This means that t2 will depend on t1
# running successfully to run.
# It is equivalent to:
t2.set_upstream(t1)

# The bit shift operator can also be
# used to chain operations:
t1 >> t2

# And the upstream dependency with the
# bit shift operator:
t2 << t1

# Chaining multiple dependencies becomes
# concise with the bit shift operator:
t1 >> t2 >> t3

# A list of tasks can also be set as
# dependencies. These operations
# all have the same effect:
t1.set_downstream([t2, t3])
t1 >> [t2, t3]
[t2, t3] << t1
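
For longer linear sequences, Airflow also provides a chain() helper; a minimal sketch (the import path shown is for Airflow 2.x; it lived in airflow.utils.helpers in 1.10):

# chain() sets the same dependencies as t1 >> t2 >> t3 in one call,
# which is handy when the sequence of tasks is built dynamically.
from airflow.models.baseoperator import chain

chain(t1, t2, t3)  # equivalent to t1 >> t2 >> t3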
Example #4
        default_args["params"]["sequence-output"] = OUTPUT_DIR + '/sequences'

    test_task = PythonOperator(
        task_id='test_task',
        python_callable=my_task,
        dag=dag,
    )

    mk_dir_task = BashOperator(
        task_id='make_directory',
        bash_command='mkdir -p {{params.output}}',
        params={'output': default_args['params']['output-dir']},
        dag=dag,
    )

    mk_dir_task.set_upstream(test_task)

    export_meta_task = PythonOperator(
        task_id='export_meta',
        python_callable=export_meta,
        op_kwargs={"config": default_args['params']},
        pool='mongo',
        dag=dag,
    )

    export_meta_task.set_upstream(mk_dir_task)

    export_sequences_task = PythonOperator(
        task_id='export_sequences',
        python_callable=export_sequences,
        op_kwargs={"config": default_args['params']},
Example #5
def create_dag(dag_id, schedule, window, default_args):
    with DAG(
        dag_id,
        default_args=default_args,
        description='creates sliding windows based on months',
        schedule_interval=schedule,
        start_date=datetime.datetime(2021, 4, 30),
        on_failure_callback=task_fail_slack_alert,
        on_success_callback=task_success_slack_alert,
        tags=['selection','sliding'],
        ) as dag:

        OUTPUT_DIR = WORKING_DIR + "/data/sliding-windows/" + '_'.join(window)
        default_args["params"]["output-dir"] = OUTPUT_DIR
        default_args["params"]["meta-output"] = OUTPUT_DIR + '/master-no-sequences.json'
        default_args["params"]["sequence-output"] = OUTPUT_DIR + '/sequences'

        with open(dag.params["region_cfg"], 'r') as stream:
            regions = yaml.safe_load(stream)

        mk_dir_task = BashOperator(
            task_id='make_directory',
            bash_command='mkdir -p {{params.output}}',
            params={'output': default_args['params']['output-dir']},
            dag=dag,
        )

        export_meta_task = PythonOperator(
                task_id='export_meta',
                python_callable=export_meta,
                op_kwargs={ "config" : default_args['params'] },
                pool='mongo',
                dag=dag,
            )

        export_meta_task.set_upstream(mk_dir_task)
        export_sequences_task = PythonOperator(
                task_id='export_sequences',
                python_callable=export_sequences,
                op_kwargs={ "config" : default_args['params'] },
                pool='mongo',
                dag=dag,
            )

        export_sequences_task.set_upstream(mk_dir_task)

        # For each region
        export_by_gene = []

        for gene in regions.keys():

            reference_filepath = WORKING_DIR + '/reference_genes/reference.' + gene + '_protein.fas'
            filepath_prefix = OUTPUT_DIR + '/sequences.' + gene

            nuc_sequence_output = filepath_prefix + '_nuc.fas'
            prot_sequence_output = filepath_prefix + '_protein.fas'

            initial_duplicate_output = filepath_prefix + '.initial.duplicates.json'
            protein_duplicate_output = filepath_prefix + '.protein.duplicates.json'
            duplicate_output = filepath_prefix + '.duplicates.json'
            map_output = filepath_prefix + '.map.json'

            variants_csv_output = filepath_prefix + '.variants.csv'
            variants_json_output = filepath_prefix + '.variants.json'
            filtered_fasta_output = filepath_prefix + '.compressed.filtered.fas'
            filtered_json_output = filepath_prefix + '.filtered.json'
            output_edits_fn = filepath_prefix + '.filtered.edits.json'

            compressed_output_filepath =  filepath_prefix + '.compressed.fas'
            compressor_duplicate_out = filepath_prefix + '.duplicates.variants.json'

            tree_output = filepath_prefix + '.compressed.filtered.fas.rapidnj.bestTree'
            sto_output = filepath_prefix + '.compressed.filtered.sto'

            tmp_output_fn = filepath_prefix + '.tmp.msa'
            output_fn = filepath_prefix + '.msa'

            slac_output_fn = filepath_prefix + '.SLAC.json'
            fel_output_fn = filepath_prefix + '.FEL.json'
            meme_output_fn = filepath_prefix + '.MEME.json'

            summary_output_fn = filepath_prefix + '.json'

            default_args["params"]["nuc-sequence-output"] = nuc_sequence_output
            default_args["params"]["prot-sequence-output"] = prot_sequence_output
            default_args["params"]["duplicate-output"] = duplicate_output
            default_args["params"]["protein-duplicate-output"] = protein_duplicate_output
            default_args["params"]["inital-duplicate-output"] = initial_duplicate_output

            with TaskGroup(f"alignment_{gene}") as alignment:

                export_premsa_sequence_task = PythonOperator(
                        task_id=f'export_premsa_sequences_{gene}',
                        python_callable=export_premsa_sequences,
                        op_kwargs={ "config" : default_args['params'], 'nuc_output_fn':  nuc_sequence_output, 'prot_output_fn' : prot_sequence_output, 'gene' : gene },
                        pool='mongo',
                        dag=dag,
                    )

                export_duplicates_task = PythonOperator(
                    task_id=f'export_duplicates_{gene}',
                    python_callable=export_duplicates,
                    op_kwargs={ 'output_fn' : initial_duplicate_output, 'gene': gene },
                    pool='mongo',
                    dag=dag,
                )


                MAFFT = """
                {{ params.mafft }} --thread -1 --add $INPUT_FN $REFERENCE_FILEPATH >| $TMP_OUTPUT_FN
                """

                mafft_task = BashOperator(
                    task_id=f'mafft_{gene}',
                    bash_command=MAFFT,
                    params={'mafft': default_args['params']['mafft']},
                    env={'INPUT_FN': prot_sequence_output, 'TMP_OUTPUT_FN': tmp_output_fn, 'REFERENCE_FILEPATH': reference_filepath },
                    dag=dag
                )

                # input_fn, reference_fn, output_fn
                remove_ref_task = PythonOperator(
                    task_id=f'remove_ref_{gene}',
                    python_callable=reserve_only_original_input,
                    op_kwargs={ "input_fn" : tmp_output_fn, "original_fn" : prot_sequence_output, "output_fn": output_fn },
                    dag=dag,
                )

                POSTMSA = """
                {{ params.hyphy }} LIBPATH={{params.hyphy_lib_path}} {{ params.post_msa }} --protein-msa $INPUT_FN --nucleotide-sequences $NUC_INPUT_FN --output $COMPRESSED_OUTPUT_FN --duplicates $DUPLICATE_OUTPUT_FN
                """

                # Run POST-MSA on the concatenated dataset to translate back to nucleotides
                reverse_translate_task = BashOperator(
                    task_id=f'post_msa_{gene}',
                    bash_command=POSTMSA,
                    env={'INPUT_FN': output_fn, 'NUC_INPUT_FN': nuc_sequence_output , 'COMPRESSED_OUTPUT_FN': compressed_output_filepath, 'DUPLICATE_OUTPUT_FN': protein_duplicate_output, **os.environ},
                    dag=dag
                )

                cleanup_task = BashOperator(
                    task_id=f'cleanup_{gene}',
                    bash_command="sed -i '/^>/! s/[^ACTG-]/N/g' $COMPRESSED_OUTPUT_FN",
                    env={'COMPRESSED_OUTPUT_FN': compressed_output_filepath, **os.environ},
                    dag=dag
                )

                export_premsa_sequence_task >> mafft_task >> remove_ref_task >> reverse_translate_task >> cleanup_task  # export_duplicates_task has no intra-group edges and runs in parallel

            with TaskGroup(f"duplicates_{gene}") as duplicates_group:
                merge_duplicate_task = PythonOperator(
                    task_id=f'merge_duplicates_{gene}',
                    python_callable=merge_duplicates,
                    op_kwargs={ 'protein_duplicates' : protein_duplicate_output, 'nuc_duplicates': initial_duplicate_output, 'output':  duplicate_output},
                    dag=dag,
                )

                # Fix duplicates
                fix_duplicate_task = PythonOperator(
                    task_id=f'fix_duplicates_{gene}',
                    python_callable=fix_duplicates,
                    op_kwargs={ 'duplicates' : duplicate_output, 'map': map_output, 'overwrite': True },
                    dag=dag,
                )

                # # Fix header files
                # echo "$PYTHON python/update_fasta_duplicates.py -f ${FILE}.${GENE}.compressed.fas -m ${FILE}.${GENE}.map.json"
                # $PYTHON python/update_fasta_duplicates.py -f ${FILE}.${GENE}.compressed.fas -m ${FILE}.${GENE}.map.json

                update_fasta_duplicates_task = PythonOperator(
                    task_id=f'update_fasta_duplicates_{gene}',
                    python_callable=update_fasta_duplicates,
                    op_kwargs={ 'fasta_file' : compressed_output_filepath, 'map_file': map_output },
                    dag=dag,
                )

                merge_duplicate_task >> fix_duplicate_task >> update_fasta_duplicates_task

            # $HYPHY LIBPATH=$HYPHYLIBPATH $COMPRESSOR --msa ${FILE}.${GENE}.compressed.fas --regexp "epi_isl_([0-9]+)" --duplicates ${FILE}.${GENE}.duplicates.json --output ${FILE}.${GENE}.variants.csv --json ${FILE}.${GENE}.variants.json --duplicate-out ${FILE}.${GENE}.duplicates.variants.json

            with TaskGroup(f"filter_{gene}") as filter:
                COMPRESSOR = """
                {{ params.hyphy }} LIBPATH={{params.hyphy_lib_path}} {{ params.compressor }} --msa $COMPRESSED_FN --regexp "epi_isl_([0-9]+)" --duplicates $DUPLICATE_FN --output $VARIANTS_CSV_FN  --json $VARIANTS_JSON_FN --duplicate-out $COMPRESSOR_DUPLICATE_OUT
                """
                compressor_task = BashOperator(
                    task_id=f'compressor_{gene}',
                    bash_command=COMPRESSOR,
                    env={'COMPRESSED_FN': compressed_output_filepath, 'DUPLICATE_FN': duplicate_output, 'VARIANTS_CSV_FN': variants_csv_output, 'VARIANTS_JSON_FN': variants_json_output, 'COMPRESSOR_DUPLICATE_OUT': compressor_duplicate_out, **os.environ},
                    dag=dag
                )

                # --output-edits ${FILE}.${GENE}.filtered.edits.json
                COMPRESSOR2 = """
                {{ params.hyphy }} LIBPATH={{params.hyphy_lib_path}} {{ params.compressor2 }} --msa $COMPRESSED_FN --duplicates $DUPLICATE_FN --csv $VARIANTS_CSV_FN  --byseq $VARIANTS_JSON_FN --p 0.95 --output $FILTERED_FASTA_FN --json $FILTERED_JSON_FN --output-edits ${OUTPUT_EDITS}
                """
                compressor_two_task = BashOperator(
                    task_id=f'compressor_two_{gene}',
                    bash_command=COMPRESSOR2,
                    env={'COMPRESSED_FN': compressed_output_filepath, 'DUPLICATE_FN': compressor_duplicate_out, 'VARIANTS_CSV_FN': variants_csv_output, 'VARIANTS_JSON_FN': variants_json_output, 'FILTERED_FASTA_FN': filtered_fasta_output, 'FILTERED_JSON_FN': filtered_json_output, 'OUTPUT_EDITS': output_edits_fn, **os.environ},
                    dag=dag
                )

                compressor_task >> compressor_two_task

            INFER_TREE = """
            seqmagick convert $FILTERED_FASTA_FN $STO_OUTPUT;
            rapidnj $STO_OUTPUT -i sth > $TREE_OUTPUT
            sed -i "s/'//g" $TREE_OUTPUT;
            """

            infer_tree_task = BashOperator(
                task_id=f'infer_tree_{gene}',
                bash_command=INFER_TREE,
                env={'FILTERED_FASTA_FN': filtered_fasta_output, 'STO_OUTPUT': sto_output, 'TREE_OUTPUT': tree_output, **os.environ},
                dag=dag
            )

            slac_task = BashOperator(
                task_id=f'slac_{gene}',
                bash_command="{{ params.hyphy }} LIBPATH={{params.hyphy_lib_path}} slac --kill-zero-lengths Constrain ENV='_DO_TREE_REBALANCE_=1' --alignment $FILTERED_FASTA_FN --tree $TREE_OUTPUT --branches All --samples 0 --output $SLAC_OUTPUT",
                env={'FILTERED_FASTA_FN': filtered_fasta_output, 'TREE_OUTPUT': tree_output, 'SLAC_OUTPUT': slac_output_fn, **os.environ},
                dag=dag,
            )

            big_data_flags = '--full-model No'

            fel_task = BashOperator(
                task_id=f'fel_{gene}',
                bash_command="{{ params.hyphy }} LIBPATH={{params.hyphy_lib_path}} fel --kill-zero-lengths Constrain ENV='_DO_TREE_REBALANCE_=1' $BIG_DATA_FLAGS --alignment $FILTERED_FASTA_FN --tree $TREE_OUTPUT --branches Internal --output $FEL_OUTPUT",
                env={'FILTERED_FASTA_FN': filtered_fasta_output, 'TREE_OUTPUT': tree_output, 'FEL_OUTPUT': fel_output_fn, 'BIG_DATA_FLAGS': big_data_flags, **os.environ},
                dag=dag,
            )

            meme_task = BashOperator(
                task_id=f'meme_{gene}',
                bash_command="{{ params.hyphy }} LIBPATH={{params.hyphy_lib_path}} meme --kill-zero-lengths Constrain ENV='_DO_TREE_REBALANCE_=1' $BIG_DATA_FLAGS --alignment $FILTERED_FASTA_FN --tree $TREE_OUTPUT --branches Internal --output $MEME_OUTPUT",
                env={'FILTERED_FASTA_FN': filtered_fasta_output, 'TREE_OUTPUT': tree_output, 'MEME_OUTPUT': meme_output_fn, 'BIG_DATA_FLAGS': big_data_flags, **os.environ},
                dag=dag,
            )

            # fubar_task = BashOperator(
            #     task_id='fubar_{gene}',
            #     bash_command='mkdir -p {{params.working_dir}}/data/fasta/{{params.date}}',
            #     dag=dag,
            # )

            # prime_task = BashOperator(
            #     task_id='prime_{gene}',
            #     bash_command='mkdir -p {{params.working_dir}}/data/fasta/{{params.date}}',
            #     dag=dag,
            # )

            annotation_file = filepath_prefix + '.annotation.json'
            copy_annotation_task = BashOperator(
                task_id=f'copy_annotation_{gene}',
                bash_command='cp {{params.working_dir}}/data/comparative-annotation.json {{params.annotation_file}}',
                params={'annotation_file': annotation_file, 'working_dir': WORKING_DIR},
                dag=dag
            )

            summarize_gene_task = BashOperator(
                task_id=f'summarize_gene_{gene}',
                bash_command='{{ params.python }} {{params.working_dir}}/python/summarize_gene.py -T {{params.working_dir}}/data/ctl/epitopes.json -B {{params.working_dir}}/data/single_mut_effects.csv -D $MASTERNOFASTA -d $DUPLICATES -s $SLAC_OUTPUT -f $FEL_OUTPUT -m $MEME_OUTPUT -P 0.1 --output  $SUMMARY_OUTPUT -c $COMPRESSED_OUTPUT_FN -E {{params.working_dir}}/data/evo_annotation.json -A {{params.working_dir}}/data/mafs.csv -V {{params.working_dir}}/data/evo_freqs.csv -F $FRAGMENT --frame_shift $ADDSHIFT --fragment_shift $SHIFT -S $OFFSET -O $ANNOTATION',
                params={'python': default_args['params']['python'], 'working_dir': WORKING_DIR},
                env={
                    'MASTERNOFASTA': default_args["params"]["meta-output"],
                    'DUPLICATES': duplicate_output,
                    'SLAC_OUTPUT': slac_output_fn,
                    'FEL_OUTPUT': fel_output_fn,
                    'MEME_OUTPUT': meme_output_fn,
                    'SUMMARY_OUTPUT': summary_output_fn,
                    'COMPRESSED_OUTPUT_FN': filtered_fasta_output,
                    'FRAGMENT': str(regions[gene]['fragment']),
                    'ADDSHIFT': str(regions[gene]['add_one']),
                    'SHIFT': str(regions[gene]['shift']),
                    'OFFSET': str(regions[gene]['offset']),
                    'ANNOTATION': annotation_file,
                    **os.environ},
                dag=dag,
            )

            summarize_gene_task.set_upstream(export_meta_task)
            alignment.set_upstream(export_sequences_task)
            export_by_gene.append(alignment >> duplicates_group >> filter_group >> infer_tree_task >> [slac_task, fel_task, meme_task] >> copy_annotation_task >> summarize_gene_task)

        dag.doc_md = __doc__

        # Make each per-gene pipeline depend on both export tasks; the two exports themselves run in parallel
        cross_downstream([export_meta_task, export_sequences_task], export_by_gene)

        return dag
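
Both factory examples finish with cross_downstream; a minimal sketch of its semantics, using toy tasks a, b, c and d assumed purely for illustration:

# cross_downstream makes every task in the first list an upstream of every
# task in the second list (a, b, c, d are hypothetical tasks).
from airflow.models.baseoperator import cross_downstream

cross_downstream([a, b], [c, d])  # same as: a >> c, a >> d, b >> c, b >> d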
Example #6
default_args = {
    "owner": "iotoi",
    "start_date": days_ago(1),
}

dag = DAG(
    "check_minio_1",
    default_args=default_args,
    schedule_interval="@once",
    tags=["iotoi-samples"],
)

t1 = BashOperator(
    task_id="bash_test",
    bash_command='echo "hello, it should work" > s3_conn_test.txt',
    dag=dag,
)

sensor = S3KeySensor(
    task_id="check_s3_for_file_in_s3",
    bucket_key="*",
    bucket_name="my-first-bucket",
    wildcard_match=True,
    aws_conn_id="local_minio",
    poke_interval=10,
    dag=dag,
)

t1.set_upstream(sensor)
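
The aws_conn_id="local_minio" above must point the Amazon provider at the MinIO endpoint. One way to define it, sketched under the assumption that MinIO runs on localhost:9000 with default credentials, is an environment variable:

# A minimal sketch (endpoint and credentials are assumptions): Airflow builds
# connections from AIRFLOW_CONN_<CONN_ID> environment variables, with extras
# passed as URL-encoded query parameters. Recent Amazon provider releases read
# the endpoint from "endpoint_url" in the extras; older ones used "host".
import os

os.environ["AIRFLOW_CONN_LOCAL_MINIO"] = (
    "aws://minioadmin:minioadmin@/"
    "?endpoint_url=http%3A%2F%2Flocalhost%3A9000"
)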