예제 #1
0
def makeblastdb_fq(se_fastq_files, pe_fastq_files, dir_blast_fa_trim,
                   makeblastdb, fpatt):
    """Build nucleotide BLAST databases from the filtered FASTA read files.

    The database path(s) of every sample are stored in the sample's
    dictionary under the ``'blast_db_path'`` key, whether or not the
    database had to be built during this call.

    Args:
        se_fastq_files: Mapping of single-end sample name to a dictionary
            that contains the ``'filter_path_fa'`` key (input FASTA path).
        pe_fastq_files: Mapping of paired-end sample name to a dictionary
            whose ``'filter_path_fa'`` value is a sequence of FASTA paths.
        dir_blast_fa_trim: Directory in which one subdirectory per sample
            is created.
        makeblastdb: The makeblastdb executable passed on to
            ``make_blast_db``, or None if it is not available.
        fpatt: Output path patterns for paired-end samples. ``'@D@'`` is
            replaced with the sample directory and ``'@N@'`` with the
            sample name.

    Raises:
        SystemExit: With status 1 if there are reads to process but
            ``makeblastdb`` is None.
    """
    if se_fastq_files or pe_fastq_files:
        print()
        Log.inf('Building BLAST databases for reads.')
        if makeblastdb is None:
            Log.err('makeblastdb is not available. Cannot continue. Exiting.')
            # A missing dependency is a failure: report a non-zero status
            # instead of pretending that the run succeeded.
            raise SystemExit(1)

    for se in se_fastq_files:
        dir_blast_fa_trim_sample = opj(dir_blast_fa_trim, se)
        fa_path = se_fastq_files[se]['filter_path_fa']
        out_f = opj(dir_blast_fa_trim_sample, se)
        se_fastq_files[se]['blast_db_path'] = out_f

        # The existence of the sample directory is the only "already done"
        # marker that is checked.
        if ope(dir_blast_fa_trim_sample):
            Log.msg('BLAST database already exists:', se)
            continue

        make_dirs(dir_blast_fa_trim_sample)
        Log.msg(basename(fa_path))
        make_blast_db(exec_file=makeblastdb,
                      in_file=fa_path,
                      out_file=out_f,
                      title=se,
                      dbtype='nucl')

    for pe in pe_fastq_files:
        dir_blast_fa_trim_sample = opj(dir_blast_fa_trim, pe)
        fa_paths = pe_fastq_files[pe]['filter_path_fa']
        out_fs = [x.replace('@D@', dir_blast_fa_trim_sample).replace('@N@', pe)
                  for x in fpatt]
        pe_fastq_files[pe]['blast_db_path'] = out_fs

        if ope(dir_blast_fa_trim_sample):
            Log.msg('BLAST database already exists:', pe)
            continue

        make_dirs(dir_blast_fa_trim_sample)
        # One database per input FASTA file; inputs and outputs are paired
        # by position.
        for fa_path, out_f in zip(fa_paths, out_fs):
            Log.msg(basename(fa_path))
            make_blast_db(exec_file=makeblastdb,
                          in_file=fa_path,
                          out_file=out_f,
                          title=basename(out_f),
                          dbtype='nucl')
예제 #2
0
def makeblastdb_assemblies(assemblies, dir_prj_blast_assmbl, makeblastdb):
    """Build a BLAST database for every assembly.

    The database path of each assembly is stored in its dictionary under
    the ``'blast_db_path'`` key, whether or not the database had to be built
    during this call.

    Args:
        assemblies: Iterable of dictionaries, each with the ``'name'`` and
            ``'path'`` keys.
        dir_prj_blast_assmbl: Directory in which one subdirectory per
            assembly is created.
        makeblastdb: The makeblastdb executable passed on to
            ``make_blast_db``, or None if it is not available.

    Raises:
        SystemExit: With status 1 if there are assemblies to process but
            ``makeblastdb`` is None.
    """
    if assemblies:
        print()
        Log.inf('Building BLAST databases for assemblies.')
        if makeblastdb is None:
            Log.err('makeblastdb is not available. Cannot continue. Exiting.')
            # A missing dependency is a failure: report a non-zero status
            # instead of pretending that the run succeeded.
            raise SystemExit(1)

    for a in assemblies:
        assmbl_name = a['name']

        assmbl_blast_db_dir = opj(dir_prj_blast_assmbl, assmbl_name)
        assmbl_blast_db_file = opj(assmbl_blast_db_dir, assmbl_name)

        a['blast_db_path'] = assmbl_blast_db_file

        # The existence of the assembly directory is the only "already
        # done" marker that is checked.
        if ope(assmbl_blast_db_dir):
            Log.msg('BLAST database already exists:', assmbl_name)
            continue

        Log.msg(assmbl_name)
        make_dirs(assmbl_blast_db_dir)
        # No dbtype is passed here, so make_blast_db's default applies.
        make_blast_db(exec_file=makeblastdb,
                      in_file=a['path'],
                      out_file=assmbl_blast_db_file,
                      title=assmbl_name)
예제 #3
0
def filtered_fq_to_fa(se_fastq_files, pe_fastq_files, dir_fa_trim_data, seqtk,
                      fpatt):
    """Convert the filtered FASTQ files of every sample to FASTA.

    The FASTA path(s) of every sample are stored in the sample's dictionary
    under the ``'filter_path_fa'`` key, whether or not the conversion had to
    be performed during this call.

    Args:
        se_fastq_files: Mapping of single-end sample name to a dictionary
            that contains the ``'filter_path_fq'`` key (input FASTQ path).
        pe_fastq_files: Mapping of paired-end sample name to a dictionary
            whose ``'filter_path_fq'`` value is a sequence of FASTQ paths.
        dir_fa_trim_data: Directory in which one subdirectory per sample is
            created.
        seqtk: The seqtk executable passed on to ``seqtk_fq_to_fa``, or
            None if it is not available.
        fpatt: Output path patterns for paired-end samples. ``'@D@'`` is
            replaced with the sample directory and ``'@N@'`` with the
            sample name.

    Raises:
        SystemExit: With status 1 if there are reads to process but
            ``seqtk`` is None.
    """
    if se_fastq_files or pe_fastq_files:
        print()
        Log.inf('Converting FASTQ to FASTA using Seqtk.')
        if seqtk is None:
            Log.err('seqtk is not available. Cannot continue. Exiting.')
            # A missing dependency is a failure: report a non-zero status
            # instead of pretending that the run succeeded.
            raise SystemExit(1)

    for se in se_fastq_files:
        dir_fa_trim_data_sample = opj(dir_fa_trim_data, se)
        fq_path = se_fastq_files[se]['filter_path_fq']
        out_f = opj(dir_fa_trim_data_sample, se + '.fasta')
        se_fastq_files[se]['filter_path_fa'] = out_f

        # The existence of the sample directory is the only "already done"
        # marker that is checked.
        if ope(dir_fa_trim_data_sample):
            Log.msg('Filtered FASTA files already exist:', se)
            continue

        make_dirs(dir_fa_trim_data_sample)
        Log.msg(basename(fq_path))
        seqtk_fq_to_fa(seqtk, fq_path, out_f)

    for pe in pe_fastq_files:
        dir_fa_trim_data_sample = opj(dir_fa_trim_data, pe)
        fq_paths = pe_fastq_files[pe]['filter_path_fq']
        out_fs = [x.replace('@D@', dir_fa_trim_data_sample).replace('@N@', pe)
                  for x in fpatt]
        pe_fastq_files[pe]['filter_path_fa'] = out_fs

        if ope(dir_fa_trim_data_sample):
            Log.msg('Filtered FASTA files already exist:', pe)
            continue

        make_dirs(dir_fa_trim_data_sample)
        # Inputs and outputs are paired by position.
        for fq_path, out_f in zip(fq_paths, out_fs):
            Log.msg(basename(fq_path))
            seqtk_fq_to_fa(seqtk, fq_path, out_f)
예제 #4
0
파일: __main__.py 프로젝트: muti99/kakapo
def main() -> bool:
    """Run one complete pass of the pipeline.

    The steps, in order: check dependencies, read the configuration and
    search strategies files, prepare the output directory and the log file,
    collect the query protein sequences, obtain and filter the reads, build
    BLAST databases, run tblastn/Vsearch/SPAdes, search the assemblies, find
    and translate ORFs, write GFF3 files, optionally run InterProScan 5 and
    download CDS for the NCBI protein queries.

    Returns:
        False if the user answered the final prompt with an empty string or
        with something that starts with "y" (repeat), True otherwise.

    The function can also terminate the process through ``exit(0)`` at
    several points (see the comments below).
    """
    # Prepare initial logger (before we know the log file path) --------------
    # Messages are collected in an in-memory stream and are copied to the
    # real log file once its location is known (see "Prepare logger" below).
    prj_log_file_suffix = time_stamp() + '.log'
    log_stream = StringIO()

    Log.set_colors(COLORS)
    Log.set_file(log_stream)
    Log.set_write(True)

    # Prepare configuration directory ----------------------------------------
    if ope(DIR_CFG):
        Log.inf('Found configuration directory:', DIR_CFG)
    else:
        Log.wrn('Creating configuration directory:', DIR_CFG)
        make_dirs(DIR_CFG)

    print()

    # Check for dependencies -------------------------------------------------
    Log.inf('Checking for dependencies.')
    make_dirs(DIR_DEP)
    make_dirs(DIR_KRK)
    seqtk = deps.dep_check_seqtk(DIR_DEP, FORCE_DEPS)
    trimmomatic, adapters = deps.dep_check_trimmomatic(DIR_DEP)
    fasterq_dump = deps.dep_check_sra_toolkit(DIR_DEP, OS_ID, DIST_ID,
                                              DEBIAN_DISTS, REDHAT_DISTS,
                                              FORCE_DEPS)
    makeblastdb, _, tblastn = deps.dep_check_blast(DIR_DEP, OS_ID, DIST_ID,
                                                   DEBIAN_DISTS, REDHAT_DISTS,
                                                   FORCE_DEPS)
    vsearch = deps.dep_check_vsearch(DIR_DEP, OS_ID, DIST_ID, DEBIAN_DISTS,
                                     REDHAT_DISTS, FORCE_DEPS)
    spades = deps.dep_check_spades(DIR_DEP, OS_ID, FORCE_DEPS)
    bowtie2, bowtie2_build = deps.dep_check_bowtie2(DIR_DEP, OS_ID, FORCE_DEPS)
    rcorrector = deps.dep_check_rcorrector(DIR_DEP, FORCE_DEPS)
    kraken2, kraken2_build = deps.dep_check_kraken2(DIR_DEP, OS_ID,
                                                    RELEASE_NAME, FORCE_DEPS)

    print()

    kraken2_dbs = deps.dnld_kraken2_dbs(DIR_KRK)

    # Stop here when INSTALL_DEPS or DNLD_KRAKEN_DBS is set.
    if INSTALL_DEPS is True or DNLD_KRAKEN_DBS is True:
        exit(0)

    print()

    # Initialize NCBI taxonomy database --------------------------------------
    tax = Taxonomy()
    if tax.is_initialized() is False:
        tax.init(data_dir_path=DIR_TAX, logger=Log)
        print()

    # Parse configuration file -----------------------------------------------
    # `_` is used as a scratch name for the returned dictionary; its values
    # are unpacked into local variables right below.
    Log.inf('Reading configuration file:', CONFIG_FILE_PATH)
    _ = config_file_parse(CONFIG_FILE_PATH, tax)

    allow_no_stop_cod = _['allow_no_stop_cod']
    allow_no_strt_cod = _['allow_no_strt_cod']
    allow_non_aug = _['allow_non_aug']

    blast_1_evalue = _['blast_1_evalue']
    blast_1_max_hsps = _['blast_1_max_hsps']
    blast_1_qcov_hsp_perc = _['blast_1_qcov_hsp_perc']
    blast_1_best_hit_overhang = _['blast_1_best_hit_overhang']
    blast_1_best_hit_score_edge = _['blast_1_best_hit_score_edge']
    blast_1_max_target_seqs = _['blast_1_max_target_seqs']

    blast_2_evalue = _['blast_2_evalue']
    blast_2_max_hsps = _['blast_2_max_hsps']
    blast_2_qcov_hsp_perc = _['blast_2_qcov_hsp_perc']
    blast_2_best_hit_overhang = _['blast_2_best_hit_overhang']
    blast_2_best_hit_score_edge = _['blast_2_best_hit_score_edge']
    blast_2_max_target_seqs = _['blast_2_max_target_seqs']

    dir_out = _['output_directory']
    email = _['email']
    requery_after = _['requery_after']
    fq_pe = _['fq_pe']
    fq_se = _['fq_se']
    should_run_rcorrector = _['should_run_rcorrector']
    should_run_ipr = _['should_run_ipr']
    bt2_order = _['bt2_order']
    kraken_confidence = _['kraken_confidence']
    krkn_order = _['krkn_order']
    prepend_assmbl = _['prepend_assmbl']
    prj_name = _['project_name']
    sras = _['sras']
    tax_group = _['tax_group']
    # tax_group_name = _['tax_group_name']
    tax_ids_user = _['tax_ids']
    user_assemblies = _['assmbl']

    print()

    # Parse search strategies file -------------------------------------------
    # Without a search strategies file `sss` is empty, so every
    # `for ss in sss` loop below is a no-op.
    if SS_FILE_PATH is not None:
        Log.inf('Reading search strategies file:', SS_FILE_PATH)
        sss = ss_file_parse(SS_FILE_PATH)
    else:
        Log.wrn('Search strategies file was not provided.\n' +
                'Will process reads, assemblies and then stop.')
        sss = dict()

    print()

    # Create output directory ------------------------------------------------
    if dir_out is not None:
        if ope(dir_out):
            Log.inf('Found output directory:', dir_out)
        else:
            Log.wrn('Creating output directory:', dir_out)
            make_dirs(dir_out)

    print()

    # Write Kakapo version information to the output directory ---------------
    # NOTE(review): dir_out is checked for None above but is used here
    # unconditionally -- confirm that config_file_parse never returns None
    # for 'output_directory'.
    version_file = opj(dir_out, 'kakapo_version.txt')
    if ope(version_file):
        with open(version_file, 'r') as f:
            version_prev = f.read().strip()
            # Refuse to mix results produced by different versions.
            if __version__ != version_prev:
                Log.wrn('The output directory contains data produced by a ' +
                        'different version of Kakapo: ' + version_prev +
                        '.\nThe currently running version is: ' + __version__ +
                        '.\n' +
                        'Delete "kakapo_version.txt" file located in the ' +
                        'output directory if you would like to continue.')
                exit(0)

    with open(version_file, 'w') as f:
        f.write(__version__)

    # Create subdirectories in the output directory --------------------------
    _ = prepare_output_directories(dir_out, prj_name)

    dir_temp = _['dir_temp']
    dir_cache_pfam_acc = _['dir_cache_pfam_acc']
    dir_cache_fq_minlen = _['dir_cache_fq_minlen']
    dir_cache_prj = _['dir_cache_prj']
    dir_cache_refseqs = _['dir_cache_refseqs']
    dir_prj_logs = _['dir_prj_logs']
    dir_prj_queries = _['dir_prj_queries']
    dir_fq_data = _['dir_fq_data']
    dir_fq_cor_data = _['dir_fq_cor_data']
    dir_fq_trim_data = _['dir_fq_trim_data']
    dir_fq_filter_bt2_data = _['dir_fq_filter_bt2_data']
    dir_fq_filter_krkn2_data = _['dir_fq_filter_krkn2_data']
    dir_fa_trim_data = _['dir_fa_trim_data']
    dir_blast_fa_trim = _['dir_blast_fa_trim']
    dir_prj_blast_results_fa_trim = _['dir_prj_blast_results_fa_trim']
    dir_prj_vsearch_results_fa_trim = _['dir_prj_vsearch_results_fa_trim']
    dir_prj_spades_assemblies = _['dir_prj_spades_assemblies']
    dir_prj_blast_assmbl = _['dir_prj_blast_assmbl']
    dir_prj_assmbl_blast_results = _['dir_prj_assmbl_blast_results']
    dir_prj_transcripts = _['dir_prj_transcripts']
    dir_prj_ips = _['dir_prj_ips']
    dir_prj_transcripts_combined = _['dir_prj_transcripts_combined']

    # Prepare logger ---------------------------------------------------------
    # Copy everything logged so far into the project log file, then point
    # the logger at that file.
    prj_log_file = opj(dir_prj_logs, prj_name + '_' + prj_log_file_suffix)
    with open(prj_log_file, 'w') as f:
        f.write(SCRIPT_INFO.strip() + '\n\n' + log_stream.getvalue())

    Log.set_colors(COLORS)
    Log.set_file(prj_log_file)
    Log.set_write(True)

    log_stream.close()

    # Resolve descending taxonomy nodes --------------------------------------
    tax_ids = tax.all_descending_taxids_for_taxids([tax_group])

    # Pfam uniprot accessions ------------------------------------------------
    pfam_uniprot_acc = OrderedDict()
    for ss in sss:
        pfam_acc = sss[ss]['pfam_families']
        pfam_uniprot_acc[ss] = pfam_uniprot_accessions(ss, pfam_acc, tax_ids,
                                                       dir_cache_pfam_acc)

    # Download Pfam uniprot sequences if needed ------------------------------
    aa_uniprot_files = OrderedDict()
    for ss in sss:
        aa_uniprot_files[ss] = opj(dir_prj_queries,
                                   'aa_uniprot__' + ss + '.fasta')
        # ToDo: add support for the requery_after parameter.
        dnld_pfam_uniprot_seqs(ss, pfam_uniprot_acc[ss], aa_uniprot_files[ss],
                               dir_cache_prj)

    # User provided entrez query ---------------------------------------------
    prot_acc_user_from_query = OrderedDict()
    for ss in sss:
        entrez_queries = sss[ss]['entrez_search_queries']
        prot_acc_user_from_query[ss] = user_entrez_search(
            ss, entrez_queries, dir_cache_prj, requery_after)

    # User provided protein accessions ---------------------------------------
    prot_acc_user = OrderedDict()
    for ss in sss:
        print()
        # Merge the explicitly listed accessions with the ones found by the
        # entrez queries; duplicates are removed and the result is sorted.
        prot_acc_all = sorted(
            set(sss[ss]['ncbi_accessions_aa'] + prot_acc_user_from_query[ss]))
        prot_acc_user[ss] = user_protein_accessions(ss, prot_acc_all,
                                                    dir_cache_prj, tax)

    # Download from NCBI if needed -------------------------------------------
    aa_prot_ncbi_files = OrderedDict()
    for ss in sss:
        aa_prot_ncbi_files[ss] = opj(dir_prj_queries,
                                     'aa_prot_ncbi__' + ss + '.fasta')
        prot_acc_user[ss] = dnld_prot_seqs(ss, prot_acc_user[ss],
                                           aa_prot_ncbi_files[ss],
                                           dir_cache_prj)

    # User provided protein sequences ----------------------------------------
    aa_prot_user_files = OrderedDict()
    for ss in sss:
        user_queries = sss[ss]['fasta_files_aa']
        aa_prot_user_files[ss] = opj(dir_prj_queries,
                                     'aa_prot_user__' + ss + '.fasta')
        user_aa_fasta(ss, user_queries, aa_prot_user_files[ss])

    # Combine all AA queries -------------------------------------------------
    print()
    aa_queries_files = OrderedDict()
    for ss in sss:
        aa_queries_files[ss] = opj(dir_prj_queries, 'aa_all__' + ss + '.fasta')
        combine_aa_fasta(ss, [
            aa_uniprot_files[ss], aa_prot_ncbi_files[ss],
            aa_prot_user_files[ss]
        ], aa_queries_files[ss])

    # Filter AA queries ------------------------------------------------------
    prot_acc_user_filtered = OrderedDict()
    for ss in sss:
        min_query_length = sss[ss]['min_query_length']
        max_query_length = sss[ss]['max_query_length']
        max_query_identity = sss[ss]['max_query_identity']

        # Dereplicate all queries
        filter_queries(ss,
                       aa_queries_files[ss],
                       min_query_length,
                       max_query_length,
                       max_query_identity,
                       vsearch,
                       prot_acc_user[ss],
                       overwrite=True)

        # Dereplicate only NCBI queries. CDS for these will be downloaded
        # later for reference.
        # A search strategy gets an entry in prot_acc_user_filtered only
        # when its NCBI query file exists.
        if ope(aa_prot_ncbi_files[ss]):
            prot_acc_user_filtered[ss] = filter_queries(ss,
                                                        aa_prot_ncbi_files[ss],
                                                        min_query_length,
                                                        max_query_length,
                                                        max_query_identity,
                                                        vsearch,
                                                        prot_acc_user[ss],
                                                        overwrite=False,
                                                        logging=False)

    # Download SRA run metadata if needed ------------------------------------
    sra_runs_info, sras_acceptable = dnld_sra_info(sras, dir_cache_prj)

    # Download SRA run FASTQ files if needed ---------------------------------
    x, y, z = dnld_sra_fastq_files(sras_acceptable, sra_runs_info, dir_fq_data,
                                   fasterq_dump, THREADS, dir_temp)

    se_fastq_files_sra = x
    pe_fastq_files_sra = y
    sra_runs_info = z

    # User provided FASTQ files ----------------------------------------------
    se_fastq_files_usr, pe_fastq_files_usr = user_fastq_files(fq_se, fq_pe)

    # Collate FASTQ file info ------------------------------------------------
    # User provided samples overwrite SRA samples that have the same key.
    se_fastq_files = se_fastq_files_sra.copy()
    se_fastq_files.update(se_fastq_files_usr)
    pe_fastq_files = pe_fastq_files_sra.copy()
    pe_fastq_files.update(pe_fastq_files_usr)

    def gc_tt(k, d, tax):
        """Store genetic code ids and translation tables in d[k].

        The nuclear code is always set. The mitochondrial and plastid
        entries stay None unless the taxon is a eukaryote (and, for
        plastids, contains a plastid) and the returned code is not '0'.
        """
        taxid = d[k]['tax_id']

        gc = tax.genetic_code_for_taxid(taxid)

        d[k]['gc_id'] = gc
        d[k]['gc_tt'] = TranslationTable(gc)

        gc_mito = None
        tt_mito = None

        gc_plastid = None
        tt_plastid = None

        if tax.is_eukaryote(taxid) is True:
            gc_mito = tax.mito_genetic_code_for_taxid(taxid)
            if gc_mito != '0':
                tt_mito = TranslationTable(gc_mito)

            if tax.contains_plastid(taxid) is True:
                gc_plastid = tax.plastid_genetic_code_for_taxid(taxid)
                if gc_plastid != '0':
                    tt_plastid = TranslationTable(gc_plastid)

        d[k]['gc_id_mito'] = gc_mito
        d[k]['gc_tt_mito'] = tt_mito

        d[k]['gc_id_plastid'] = gc_plastid
        d[k]['gc_tt_plastid'] = tt_plastid

    for se in se_fastq_files:
        gc_tt(se, se_fastq_files, tax)

    for pe in pe_fastq_files:
        gc_tt(pe, pe_fastq_files, tax)

    # Minimum acceptable read length -----------------------------------------
    min_accept_read_len(se_fastq_files, pe_fastq_files, dir_temp,
                        dir_cache_fq_minlen, vsearch)

    # Run Rcorrector ---------------------------------------------------------
    run_rcorrector(se_fastq_files, pe_fastq_files, dir_fq_cor_data, rcorrector,
                   THREADS, dir_temp, should_run_rcorrector)

    # File name patterns -----------------------------------------------------
    a, b, c, d, e = file_name_patterns()

    pe_trim_fq_file_patterns = a
    pe_trim_fa_file_patterns = b
    pe_blast_db_file_patterns = c
    pe_blast_results_file_patterns = d
    pe_vsearch_results_file_patterns = e

    # Run Trimmomatic --------------------------------------------------------
    run_trimmomatic(se_fastq_files, pe_fastq_files, dir_fq_trim_data,
                    trimmomatic, adapters, pe_trim_fq_file_patterns, THREADS)

    # Run Bowtie 2 -----------------------------------------------------------
    run_bt2_fq(se_fastq_files, pe_fastq_files, dir_fq_filter_bt2_data, bowtie2,
               bowtie2_build, THREADS, dir_temp, bt2_order,
               pe_trim_fq_file_patterns, tax, dir_cache_refseqs)

    # Run Kraken2 ------------------------------------------------------------
    run_kraken2(krkn_order, kraken2_dbs, se_fastq_files, pe_fastq_files,
                dir_fq_filter_krkn2_data, kraken_confidence, kraken2, THREADS,
                dir_temp, pe_trim_fq_file_patterns)

    # From here on the samples are processed in the order of their filtered
    # FASTQ path(s).
    se_fastq_files = OrderedDict(se_fastq_files)
    pe_fastq_files = OrderedDict(pe_fastq_files)

    se_fastq_files = OrderedDict(
        sorted(se_fastq_files.items(), key=lambda x: x[1]['filter_path_fq']))

    pe_fastq_files = OrderedDict(
        sorted(pe_fastq_files.items(), key=lambda x: x[1]['filter_path_fq']))

    # Stop After Filter ------------------------------------------------------
    if STOP_AFTER_FILTER is True:
        Log.wrn('Stopping after Kraken2/Bowtie2 filtering step as requested.')
        exit(0)

    # Convert filtered FASTQ files to FASTA ----------------------------------
    filtered_fq_to_fa(se_fastq_files, pe_fastq_files, dir_fa_trim_data, seqtk,
                      pe_trim_fa_file_patterns)

    # Run makeblastdb on reads -----------------------------------------------
    makeblastdb_fq(se_fastq_files, pe_fastq_files, dir_blast_fa_trim,
                   makeblastdb, pe_blast_db_file_patterns)

    # Check if there are any query sequences.
    # A search strategy whose combined query file is empty is skipped by
    # every step below.
    any_queries = False
    for ss in sss:
        if stat(aa_queries_files[ss]).st_size == 0:
            continue
        else:
            any_queries = True

    # Run tblastn on reads ---------------------------------------------------
    for ss in sss:
        if stat(aa_queries_files[ss]).st_size == 0:
            continue
        changed_blast_1 = run_tblastn_on_reads(
            se_fastq_files, pe_fastq_files, aa_queries_files[ss], tblastn,
            blast_1_evalue, blast_1_max_hsps, blast_1_qcov_hsp_perc,
            blast_1_best_hit_overhang, blast_1_best_hit_score_edge,
            blast_1_max_target_seqs, dir_prj_blast_results_fa_trim,
            pe_blast_results_file_patterns, ss, THREADS, seqtk, vsearch,
            dir_cache_prj)

        # When run_tblastn_on_reads reports a change, the results of all
        # later steps are deleted so that they get produced again.
        if changed_blast_1 is True:
            if ope(dir_prj_vsearch_results_fa_trim):
                rmtree(dir_prj_vsearch_results_fa_trim)
            if ope(dir_prj_spades_assemblies):
                rmtree(dir_prj_spades_assemblies)
            if ope(dir_prj_blast_assmbl):
                rmtree(dir_prj_blast_assmbl)
            if ope(dir_prj_assmbl_blast_results):
                rmtree(dir_prj_assmbl_blast_results)
            if ope(dir_prj_transcripts):
                rmtree(dir_prj_transcripts)
            if ope(dir_prj_transcripts_combined):
                rmtree(dir_prj_transcripts_combined)

    # NOTE(review): presumably recreates the directories that may have been
    # removed above (the return value is not used) -- confirm.
    prepare_output_directories(dir_out, prj_name)

    # Run vsearch on reads ---------------------------------------------------
    # should_run_vsearch = False
    # for ss in sss:
    #     if stat(aa_queries_files[ss]).st_size == 0:
    #         continue
    #     else:
    #         should_run_vsearch = True
    #         break

    # if should_run_vsearch is True:
    #     print()
    #     Log.inf('Checking if Vsearch should be run.')

    for ss in sss:
        if stat(aa_queries_files[ss]).st_size == 0:
            continue
        print()
        Log.inf('Checking if Vsearch should be run:', ss)
        run_vsearch_on_reads(se_fastq_files, pe_fastq_files, vsearch,
                             dir_prj_vsearch_results_fa_trim,
                             pe_vsearch_results_file_patterns, ss, seqtk)

    # Run SPAdes -------------------------------------------------------------
    # should_run_spades = False
    # for ss in sss:
    #     if stat(aa_queries_files[ss]).st_size == 0:
    #         continue
    #     else:
    #         should_run_spades = True
    #         break

    # if should_run_spades is True:
    #     print()
    #     Log.inf('Checking if SPAdes should be run.')

    for ss in sss:
        if stat(aa_queries_files[ss]).st_size == 0:
            # No queries for this search strategy: still create the
            # 'spades_assembly__<ss>' key for every sample, set to None.
            for se in se_fastq_files:
                se_fastq_files[se]['spades_assembly' + '__' + ss] = None
            for pe in pe_fastq_files:
                pe_fastq_files[pe]['spades_assembly' + '__' + ss] = None
            continue
        print()
        Log.inf('Checking if SPAdes should be run:', ss)
        run_spades(se_fastq_files, pe_fastq_files, dir_prj_spades_assemblies,
                   spades, dir_temp, ss, THREADS, RAM)

    # Combine SPAdes and user provided assemblies ----------------------------
    assemblies = combine_assemblies(se_fastq_files, pe_fastq_files,
                                    user_assemblies, tax, sss)

    # Run makeblastdb on assemblies  -----------------------------------------
    makeblastdb_assemblies(assemblies, dir_prj_blast_assmbl, makeblastdb)

    if any_queries is False:
        Log.wrn('No query sequences were provided.')

    # Run tblastn on assemblies ----------------------------------------------
    for ss in sss:

        if stat(aa_queries_files[ss]).st_size == 0:
            continue

        # tblastn is run if the first assembly that is either user provided
        # ('user_fasta') or named with the '__<ss>' suffix is found.
        should_run_tblastn = False
        for a in assemblies:
            assmbl_src = a['src']
            assmbl_name = a['name']
            if assmbl_src != 'user_fasta':
                if assmbl_name.endswith('__' + ss):
                    should_run_tblastn = True
                    break
            else:
                should_run_tblastn = True
                break

        if should_run_tblastn is False:
            print()
            Log.inf('Will not run BLAST. No transcripts exist:', ss)
            continue

        blast_2_evalue_ss = sss[ss]['blast_2_evalue']
        blast_2_max_hsps_ss = sss[ss]['blast_2_max_hsps']
        blast_2_qcov_hsp_perc_ss = sss[ss]['blast_2_qcov_hsp_perc']
        blast_2_best_hit_overhang_ss = sss[ss]['blast_2_best_hit_overhang']
        blast_2_best_hit_score_edge_ss = sss[ss]['blast_2_best_hit_score_edge']
        blast_2_max_target_seqs_ss = sss[ss]['blast_2_max_target_seqs']

        # Values that the search strategy leaves as None fall back to the
        # values from the configuration file.
        if blast_2_evalue_ss is None:
            blast_2_evalue_ss = blast_2_evalue
        if blast_2_max_hsps_ss is None:
            blast_2_max_hsps_ss = blast_2_max_hsps
        if blast_2_qcov_hsp_perc_ss is None:
            blast_2_qcov_hsp_perc_ss = blast_2_qcov_hsp_perc
        if blast_2_best_hit_overhang_ss is None:
            blast_2_best_hit_overhang_ss = blast_2_best_hit_overhang
        if blast_2_best_hit_score_edge_ss is None:
            blast_2_best_hit_score_edge_ss = blast_2_best_hit_score_edge
        if blast_2_max_target_seqs_ss is None:
            blast_2_max_target_seqs_ss = blast_2_max_target_seqs

        run_tblastn_on_assemblies(
            ss, assemblies, aa_queries_files[ss], tblastn,
            dir_prj_assmbl_blast_results, blast_2_evalue_ss,
            blast_2_max_hsps_ss, blast_2_qcov_hsp_perc_ss,
            blast_2_best_hit_overhang_ss, blast_2_best_hit_score_edge_ss,
            blast_2_max_target_seqs_ss, THREADS, dir_cache_prj, dir_prj_ips)

    # Prepare BLAST hits for analysis: find ORFs, translate ------------------
    for ss in sss:

        if stat(aa_queries_files[ss]).st_size == 0:
            continue

        min_target_orf_len_ss = sss[ss]['min_target_orf_length']
        max_target_orf_len_ss = sss[ss]['max_target_orf_length']
        organelle = sss[ss]['organelle']

        blast_2_qcov_hsp_perc_ss = sss[ss]['blast_2_qcov_hsp_perc']

        # Same fallback to the configuration file value as above.
        if blast_2_qcov_hsp_perc_ss is None:
            blast_2_qcov_hsp_perc_ss = blast_2_qcov_hsp_perc

        find_orfs_translate(ss, assemblies, dir_prj_transcripts, seqtk,
                            dir_temp, prepend_assmbl, min_target_orf_len_ss,
                            max_target_orf_len_ss, allow_non_aug,
                            allow_no_strt_cod, allow_no_stop_cod, tax,
                            tax_group, tax_ids_user, blast_2_qcov_hsp_perc_ss,
                            organelle)

    # GFF3 files from kakapo results JSON files ------------------------------
    # print()
    for ss in sss:
        if stat(aa_queries_files[ss]).st_size == 0:
            continue
        gff_from_json(ss, assemblies, dir_prj_ips,
                      dir_prj_transcripts_combined, prj_name)

    # Run InterProScan 5 -----------------------------------------------------
    if should_run_ipr is True:
        print()
        ss_names = tuple(sss.keys())

        # Determine the length of printed strings, for better spacing --------
        max_title_a_len = 0
        max_run_id_len = 0
        for a in assemblies:
            for ss in ss_names:
                if 'transcripts_aa_orf_fasta_file__' + ss not in a:
                    continue

                aa_file = a['transcripts_aa_orf_fasta_file__' + ss]

                if aa_file is None:
                    continue

                assmbl_name = a['name']
                run_id = ss + '_' + assmbl_name
                max_run_id_len = max(len(run_id), max_run_id_len)

                seqs = seq_records_to_dict(read_fasta(aa_file, SEQ_TYPE_AA))

                # Filter all ORFs except the first one.
                for seq_def in tuple(seqs.keys()):
                    seq_def_prefix = seq_def.split(' ')[0]
                    if seq_def_prefix.endswith('ORF001'):
                        max_title_a_len = max(len(seq_def_prefix),
                                              max_title_a_len)

        max_title_a_len += 2
        max_run_id_len += 2
        # --------------------------------------------------------------------

        # NOTE(review): this is 0 when there are no search strategies;
        # confirm that Parallel(n_jobs=0) cannot be reached in that case.
        parallel_run_count = min(THREADS, len(ss_names))

        def run_inter_pro_scan_parallel(ss):
            if stat(aa_queries_files[ss]).st_size == 0:
                return

            run_inter_pro_scan(ss, assemblies, email, dir_prj_ips,
                               dir_cache_prj, parallel_run_count,
                               max_title_a_len, max_run_id_len)

            # GFF3 files from kakapo and InterProScan 5 results JSON files
            gff_from_json(ss, assemblies, dir_prj_ips,
                          dir_prj_transcripts_combined, prj_name)

        # One job per search strategy, at most THREADS at a time.
        Parallel(n_jobs=parallel_run_count, verbose=0,
                 require='sharedmem')(delayed(run_inter_pro_scan_parallel)(ss)
                                      for ss in ss_names)

    # Download CDS for NCBI protein queries ----------------------------------
    print()
    prot_cds_ncbi_files = OrderedDict()

    def dnld_cds_for_ncbi_prot_acc_parallel(ss):
        if stat(aa_queries_files[ss]).st_size == 0:
            return

        # Only search strategies that had an NCBI query file have an entry.
        if ss not in prot_acc_user_filtered:
            return

        prot_cds_ncbi_files[ss] = opj(
            dir_prj_transcripts_combined,
            prj_name + '_ncbi_query_cds__' + ss + '.fasta')

        if len(prot_acc_user_filtered[ss]) > 0:
            dnld_cds_for_ncbi_prot_acc(ss, prot_acc_user_filtered[ss],
                                       prot_cds_ncbi_files[ss], tax,
                                       dir_cache_prj)

    # The worker writes into prot_cds_ncbi_files, which is why a
    # shared-memory backend is requested.
    ss_names = tuple(sss.keys())
    Parallel(n_jobs=2, verbose=0, require='sharedmem')(
        delayed(dnld_cds_for_ncbi_prot_acc_parallel)(ss) for ss in ss_names)

    # ------------------------------------------------------------------------

    rmtree(dir_temp)

    # ------------------------------------------------------------------------

    # An empty answer counts as "yes". False means "run again", True means
    # "done".
    rerun = input('\nRepeat ([y]/n)? ').lower().strip()
    if rerun.startswith('y') or rerun == '':
        print()
        return False
    else:
        print('\nExiting...')
        return True
예제 #5
0
def run_spades(se_fastq_files, pe_fastq_files, dir_spades_assemblies,
               spades, dir_temp, ss, threads, ram):
    """Assemble the reads selected for search strategy ``ss`` with SPAdes.

    For every sample the key ``'spades_assembly__<ss>'`` is set in the
    sample's dictionary: to the path of ``transcripts.fasta`` if that file
    exists in the sample's results directory, to None otherwise.

    Args:
        se_fastq_files: Mapping of single-end sample name to a dictionary
            that contains the ``'vsearch_results_path__<ss>'`` key (one
            input path).
        pe_fastq_files: Mapping of paired-end sample name to a dictionary
            whose ``'vsearch_results_path__<ss>'`` value is a sequence of
            input paths; the first two are the paired files.
        dir_spades_assemblies: Directory in which one ``<sample>__<ss>``
            results directory per sample is created.
        spades: The SPAdes executable, or None if it is not available.
        dir_temp: Directory for the temporary combined reads file.
        ss: Name of the search strategy.
        threads: Passed on to SPAdes as ``threads``.
        ram: Passed on to SPAdes as ``memory``.

    Raises:
        SystemExit: With status 1 if there are reads to process but
            ``spades`` is None.
    """
    if se_fastq_files or pe_fastq_files:
        if spades is None:
            Log.err('SPAdes is not available. Cannot continue. Exiting.')
            # A missing dependency is a failure: report a non-zero status
            # instead of pretending that the run succeeded.
            raise SystemExit(1)

    for se in se_fastq_files:
        dir_results = opj(dir_spades_assemblies, se + '__' + ss)
        fq_path = se_fastq_files[se]['vsearch_results_path' + '__' + ss]
        # Set before SPAdes runs, so the key exists even if the run fails.
        se_fastq_files[se]['spades_assembly' + '__' + ss] = None

        # The existence of the results directory is the only "already done"
        # marker that is checked.
        if ope(dir_results):
            Log.msg('SPAdes assembly already exists:', se)
        else:
            make_dirs(dir_results)
            Log.msg('Running SPAdes on:', se)
            run_spades_se(spades,
                          out_dir=dir_results,
                          input_file=fq_path,
                          threads=threads,
                          memory=ram,
                          rna=True)

        _report_spades_assembly(se_fastq_files[se], dir_results, ss)

    for pe in pe_fastq_files:
        dir_results = opj(dir_spades_assemblies, pe + '__' + ss)
        fq_paths = pe_fastq_files[pe]['vsearch_results_path' + '__' + ss]
        pe_fastq_files[pe]['spades_assembly' + '__' + ss] = None

        if ope(dir_results):
            Log.msg('SPAdes assembly already exists:', pe)
        else:
            make_dirs(dir_results)
            Log.msg('Running SPAdes on: ' + pe)

            if osstat(fq_paths[0]).st_size > 0 and \
               osstat(fq_paths[1]).st_size > 0:

                run_spades_pe(spades,
                              out_dir=dir_results,
                              input_files=fq_paths,
                              threads=threads,
                              memory=ram,
                              rna=True)

            else:
                # At least one of the paired files is empty: combine all
                # files and assemble them in single-end mode instead.
                temp_path = opj(dir_temp, 'temp.fasta')
                combine_text_files(fq_paths, temp_path)
                try:
                    run_spades_se(spades,
                                  out_dir=dir_results,
                                  input_file=temp_path,
                                  threads=threads,
                                  memory=ram,
                                  rna=True)
                finally:
                    # Remove the temporary file even if SPAdes raised.
                    osremove(temp_path)

        _report_spades_assembly(pe_fastq_files[pe], dir_results, ss)


def _report_spades_assembly(sample, dir_results, ss):
    """Log the SPAdes outcome and record the assembly path in ``sample``."""
    assmbl_path = opj(dir_results, 'transcripts.fasta')
    if not ope(assmbl_path):
        Log.wrn('SPAdes produced no transcripts.', False)
        return

    count = len(read_fasta(assmbl_path, SEQ_TYPE_NT))
    tr_str = ' transcript.' if count == 1 else ' transcripts.'
    Log.msg('SPAdes produced ' + str(count) + tr_str, False)
    sample['spades_assembly' + '__' + ss] = assmbl_path
예제 #6
0
def run_bt2_fq(se_fastq_files, pe_fastq_files, dir_fq_filter_data, bowtie2,
               bowtie2_build, threads, dir_temp, bt2_order, fpatt, taxonomy,
               dir_cache_refseqs):
    """Filter trimmed reads through Bowtie2, one database after another.

    For every sample, ``_should_run_bt2`` decides which databases apply.
    When none apply, the sample's 'filter_path_fq' is simply set to its
    'trim_path_fq'. Otherwise the databases are processed in order: each
    Bowtie2 run writes its per-database output below
    ``<dir_fq_filter_data>/<sample>/<db>`` and a second ``*_bt2_unaligned``
    output (presumably the reads that did not align) into ``dir_temp``. That
    second output is the input of the next database. After the last
    database it is moved to ``<dir_fq_filter_data>/<sample>/`` and recorded
    as the sample's 'filter_path_fq'.

    For every (sample, database) pair a derived entry named
    ``<sample>_<db>`` is created as a deep copy of the sample entry with
    'path', 'cor_path_fq' and 'trim_path_fq' set to None, 'filter_path_fq'
    pointing at the per-database output, and 'gc_id' / 'gc_tt' replaced by
    the mitochondrial or plastid genetic code when the database is MT or PT.

    Args:
        se_fastq_files: dict of single-end samples; updated in place.
        pe_fastq_files: dict of paired-end samples; updated in place.
        dir_fq_filter_data: root directory for the filtered FASTQ output.
        bowtie2: passed through to the Bowtie2 runners.
        bowtie2_build: passed through to ``build_bt2_index``.
        threads: passed through to the index builder and the runners.
        dir_temp: directory receiving the intermediate ``*_bt2_unaligned``
            files.
        bt2_order: passed through to ``_should_run_bt2``.
        fpatt: path templates containing the '@D@' (directory) and '@N@'
            (name) placeholders. Entries 0-3 are indexed here and entry 0
            is expected to contain '_paired_1.fastq'.
        taxonomy: object providing the genetic code lookups; also passed to
            ``_should_run_bt2`` and ``dnld_refseqs_for_taxid``.
        dir_cache_refseqs: directory holding reference sequences and the
            Bowtie2 indices built from them.

    Returns:
        None. Both sample dicts are mutated in place.
    """

    # Derived "<sample>_<db>" entries are collected separately and merged in
    # after each loop, so the dict is not resized while it is being iterated.
    new_se_fastq_files = dict()
    new_pe_fastq_files = dict()

    # The 'Running Bowtie2.' header is printed once, and only if needed.
    msg_printed = False

    # SE
    for se in se_fastq_files:

        taxid = se_fastq_files[se]['tax_id']
        dbs = _should_run_bt2(taxid, taxonomy, bt2_order, bowtie2,
                              bowtie2_build)

        in_f = se_fastq_files[se]['trim_path_fq']
        # Remembered so that the original trimmed file is never moved.
        in_f_orig = in_f

        # No database applies: the trimmed reads are used as they are.
        if len(dbs) == 0:
            se_fastq_files[se]['filter_path_fq'] = in_f
            continue

        if msg_printed is False:
            print()
            Log.inf('Running Bowtie2.')
            msg_printed = True

        for i, db in enumerate(dbs):

            db_path = dbs[db]

            dir_fq_bt_data_sample = opj(dir_fq_filter_data, se, db)
            dir_fq_bt_data_sample_un = opj(dir_fq_filter_data, se)

            new_se = se + '_' + db

            out_f = opj(dir_fq_bt_data_sample, new_se + '.fastq')

            out_f_un = opj(dir_temp, new_se + '_bt2_unaligned' + '.fastq')

            sam_f = opj(dir_fq_bt_data_sample, new_se + '.sam')
            # Derived entry: same metadata as the parent sample, but without
            # any of the upstream file paths.
            new_se_fastq_files[new_se] = deepcopy(se_fastq_files[se])
            new_se_fastq_files[new_se]['path'] = None
            new_se_fastq_files[new_se]['cor_path_fq'] = None
            new_se_fastq_files[new_se]['trim_path_fq'] = None
            taxid = new_se_fastq_files[new_se]['tax_id']
            gc = new_se_fastq_files[new_se]['gc_id']
            # Organellar databases use their own genetic code.
            if db == MT:
                gc = taxonomy.mito_genetic_code_for_taxid(taxid)
                new_se_fastq_files[new_se]['gc_id'] = gc
            elif db == PT:
                gc = taxonomy.plastid_genetic_code_for_taxid(taxid)
                new_se_fastq_files[new_se]['gc_id'] = gc
            new_se_fastq_files[new_se]['gc_tt'] = TranslationTable(gc)
            new_se_fastq_files[new_se]['filter_path_fq'] = out_f
            # An existing per-database directory is treated as a finished
            # run; processing continues from the sample-level file.
            if ope(dir_fq_bt_data_sample):
                Log.msg('Bowtie2 filtered FASTQ file already exists:', new_se)
                in_f = opj(dir_fq_bt_data_sample_un, se + '.fastq')
            else:
                Log.msg('SE mode:', new_se)
                make_dirs(dir_fq_bt_data_sample)

                db_fasta_path = None
                bt2_idx_path = None
                # MT / PT: reference sequences are obtained for the taxon.
                # Anything else is used as the path of a FASTA file whose
                # index is kept in the reference sequence cache directory.
                if db_path in (MT, PT):
                    db_fasta_path = dnld_refseqs_for_taxid(taxid,
                                                           db,
                                                           taxonomy,
                                                           dir_cache_refseqs,
                                                           query='',
                                                           db='nuccore')
                    bt2_idx_path = splitext(db_fasta_path)[0]
                else:
                    db_fasta_path = db_path
                    bt2_idx_path = opj(dir_cache_refseqs,
                                       splitext(basename(db_fasta_path))[0])

                # Build the index only when its first file is missing.
                if not ope(bt2_idx_path + '.1.bt2'):
                    build_bt2_index(bowtie2_build, [db_fasta_path],
                                    bt2_idx_path, threads)

                run_bowtie2_se(bowtie2=bowtie2,
                               input_file=in_f,
                               output_file=out_f,
                               output_file_un=out_f_un,
                               sam_output_file=sam_f,
                               index=bt2_idx_path,
                               threads=threads,
                               dir_temp=dir_temp)

                # From the second database on, the input was set by an
                # earlier iteration (it is not the original trimmed file)
                # and is no longer needed.
                if i > 0:
                    remove(in_f)

                # Chain: this run's second output feeds the next database.
                in_f = out_f_un

        out_f_un = opj(dir_fq_bt_data_sample_un, se + '.fastq')
        se_fastq_files[se]['filter_path_fq'] = out_f_un

        # Move the last intermediate file to its final, sample-level place.
        if in_f != in_f_orig:
            move(in_f, out_f_un)

    se_fastq_files.update(new_se_fastq_files)

    # PE
    for pe in pe_fastq_files:

        taxid = pe_fastq_files[pe]['tax_id']
        dbs = _should_run_bt2(taxid, taxonomy, bt2_order, bowtie2,
                              bowtie2_build)

        in_fs = pe_fastq_files[pe]['trim_path_fq']
        # Snapshot used to detect whether any database replaced the inputs.
        in_fs_orig = tuple(in_fs)

        # No database applies: the trimmed reads are used as they are.
        if len(dbs) == 0:
            pe_fastq_files[pe]['filter_path_fq'] = in_fs
            continue

        if msg_printed is False:
            print()
            Log.inf('Running Bowtie2.')
            msg_printed = True

        for i, db in enumerate(dbs):

            db_path = dbs[db]

            dir_fq_bt_data_sample = opj(dir_fq_filter_data, pe, db)
            dir_fq_bt_data_sample_un = opj(dir_fq_filter_data, pe)

            new_pe = pe + '_' + db

            # Per-database output files, expanded from the templates.
            out_fs = [x.replace('@D@', dir_fq_bt_data_sample) for x in fpatt]
            out_fs = [x.replace('@N@', new_pe) for x in out_fs]

            # Intermediate '*_bt2_unaligned' files live in the temp directory.
            out_fs_un = [x.replace('@D@', dir_temp) for x in fpatt]
            out_fs_un = [
                x.replace('@N@', new_pe + '_bt2_unaligned') for x in out_fs_un
            ]

            sam_f = opj(dir_fq_bt_data_sample, new_pe + '.sam')
            # Derived entry: same metadata as the parent sample, but without
            # any of the upstream file paths.
            new_pe_fastq_files[new_pe] = deepcopy(pe_fastq_files[pe])
            new_pe_fastq_files[new_pe]['path'] = None
            new_pe_fastq_files[new_pe]['cor_path_fq'] = None
            new_pe_fastq_files[new_pe]['trim_path_fq'] = None
            taxid = new_pe_fastq_files[new_pe]['tax_id']
            gc = new_pe_fastq_files[new_pe]['gc_id']
            # Organellar databases use their own genetic code.
            if db == MT:
                gc = taxonomy.mito_genetic_code_for_taxid(taxid)
                new_pe_fastq_files[new_pe]['gc_id'] = gc
            elif db == PT:
                gc = taxonomy.plastid_genetic_code_for_taxid(taxid)
                new_pe_fastq_files[new_pe]['gc_id'] = gc
            new_pe_fastq_files[new_pe]['gc_tt'] = TranslationTable(gc)
            new_pe_fastq_files[new_pe]['filter_path_fq'] = out_fs
            # An existing per-database directory is treated as a finished
            # run; processing continues from the sample-level files.
            if ope(dir_fq_bt_data_sample):
                Log.msg('Bowtie2 filtered FASTQ files already exist:', new_pe)
                in_fs = [
                    x.replace('@D@', dir_fq_bt_data_sample_un) for x in fpatt
                ]
                in_fs = [x.replace('@N@', pe) for x in in_fs]
            else:
                Log.msg('PE mode:', new_pe)
                make_dirs(dir_fq_bt_data_sample)

                db_fasta_path = None
                bt2_idx_path = None
                # MT / PT: reference sequences are obtained for the taxon.
                # Anything else is used as the path of a FASTA file whose
                # index is kept in the reference sequence cache directory.
                if db_path in (MT, PT):
                    db_fasta_path = dnld_refseqs_for_taxid(taxid,
                                                           db,
                                                           taxonomy,
                                                           dir_cache_refseqs,
                                                           query='',
                                                           db='nuccore')
                    bt2_idx_path = splitext(db_fasta_path)[0]
                else:
                    db_fasta_path = db_path
                    bt2_idx_path = opj(dir_cache_refseqs,
                                       splitext(basename(db_fasta_path))[0])

                # Build the index only when its first file is missing.
                if not ope(bt2_idx_path + '.1.bt2'):
                    build_bt2_index(bowtie2_build, [db_fasta_path],
                                    bt2_idx_path, threads)

                # The paired outputs are passed as one pattern in which '%'
                # stands for the mate number.
                paired_out_pattern = out_fs[0].replace('_paired_1.fastq',
                                                       '_paired_%.fastq')

                paired_out_pattern_un = out_fs_un[0].replace(
                    '_paired_1.fastq', '_paired_%.fastq')

                run_bowtie2_pe(bowtie2=bowtie2,
                               input_files=in_fs,
                               paired_out_pattern=paired_out_pattern,
                               paired_out_pattern_un=paired_out_pattern_un,
                               unpaired_out_1=out_fs[2],
                               unpaired_out_2=out_fs[3],
                               unpaired_out_1_un=out_fs_un[2],
                               unpaired_out_2_un=out_fs_un[3],
                               sam_output_file=sam_f,
                               index=bt2_idx_path,
                               threads=threads,
                               dir_temp=dir_temp)

                # From the second database on, the inputs were set by an
                # earlier iteration (they are not the original trimmed
                # files) and are no longer needed.
                if i > 0:
                    remove(in_fs[0])
                    remove(in_fs[1])
                    remove(in_fs[2])
                    remove(in_fs[3])

                # Chain: this run's second outputs feed the next database.
                in_fs = out_fs_un

        out_fs_un = [x.replace('@D@', dir_fq_bt_data_sample_un) for x in fpatt]
        out_fs_un = [x.replace('@N@', pe) for x in out_fs_un]
        pe_fastq_files[pe]['filter_path_fq'] = out_fs_un

        # Move the last intermediate files to their final, sample-level place.
        if tuple(in_fs) != in_fs_orig:
            move(in_fs[0], out_fs_un[0])
            move(in_fs[1], out_fs_un[1])
            move(in_fs[2], out_fs_un[2])
            move(in_fs[3], out_fs_un[3])

    pe_fastq_files.update(new_pe_fastq_files)
예제 #7
0
def run_trimmomatic(se_fastq_files, pe_fastq_files, dir_fq_trim_data,
                    trimmomatic, adapters, fpatt, threads):
    """Trim the corrected FASTQ files of every sample with Trimmomatic.

    The input of each sample is read from its 'cor_path_fq' entry and the
    output location is stored in its 'trim_path_fq' entry. A sample whose
    output directory already exists is considered done and is not
    processed again.

    Paired-end samples that carry a third (unpaired) input file get that
    file trimmed in single-end mode; the result is appended to the first
    unpaired output file (``out_fs[2]``).

    Args:
        se_fastq_files: dict of single-end samples; updated in place.
        pe_fastq_files: dict of paired-end samples; updated in place.
        dir_fq_trim_data: root directory for the trimmed FASTQ files.
        trimmomatic: passed through to the Trimmomatic runners. If it is
            None and there is at least one sample, an error is logged and
            the process exits.
        adapters: passed through to the Trimmomatic runners.
        fpatt: path templates containing the '@D@' (directory) and '@N@'
            (name) placeholders; entries 0-3 are used as paired 1, paired 2,
            unpaired 1 and unpaired 2 outputs.
        threads: passed through to the Trimmomatic runners.

    Returns:
        None. Both sample dicts are mutated in place.
    """
    if len(se_fastq_files) > 0 or len(pe_fastq_files) > 0:
        print()
        Log.inf('Running Trimmomatic.')
        if trimmomatic is None:
            Log.err('trimmomatic is not available. Cannot continue. Exiting.')
            exit(0)

    for se in se_fastq_files:
        dir_fq_trim_data_sample = opj(dir_fq_trim_data, se)
        fq_path = se_fastq_files[se]['cor_path_fq']
        # Only the extension is needed here; it is appended to the output
        # name (presumably '' or a compression suffix - confirm against
        # plain_or_gzip).
        _, _, _, _, ext = plain_or_gzip(fq_path)
        min_acc_len = se_fastq_files[se]['min_acc_len']
        stats_f = opj(dir_fq_trim_data_sample, se + '.txt')
        out_f = opj(dir_fq_trim_data_sample, se + '.fastq' + ext)
        se_fastq_files[se]['trim_path_fq'] = out_f

        if ope(dir_fq_trim_data_sample):
            Log.msg('Trimmed FASTQ file already exists:', se)
            continue

        make_dirs(dir_fq_trim_data_sample)
        Log.msg('SE mode:', se)
        trimmomatic_se(trimmomatic=trimmomatic,
                       adapters=adapters,
                       in_file=fq_path,
                       out_file=out_f,
                       stats_file=stats_f,
                       threads=threads,
                       minlen=min_acc_len)

    for pe in pe_fastq_files:
        dir_fq_trim_data_sample = opj(dir_fq_trim_data, pe)
        cor_paths = pe_fastq_files[pe]['cor_path_fq']
        fq_path_1 = cor_paths[0]
        fq_path_2 = cor_paths[1]
        # A third entry, when present, holds the unpaired reads of the run.
        fq_path_3 = cor_paths[2] if len(cor_paths) == 3 else None
        _, w_mode, _, fqopen, ext = plain_or_gzip(fq_path_1)
        min_acc_len = pe_fastq_files[pe]['min_acc_len']
        stats_f = opj(dir_fq_trim_data_sample, pe + '.txt')
        out_fs = [
            x.replace('@D@', dir_fq_trim_data_sample).replace('@N@', pe) + ext
            for x in fpatt
        ]
        pe_fastq_files[pe]['trim_path_fq'] = out_fs

        if ope(dir_fq_trim_data_sample):
            Log.msg('Trimmed FASTQ files already exist:', pe)
            continue

        make_dirs(dir_fq_trim_data_sample)
        Log.msg('PE mode:', pe)
        trimmomatic_pe(trimmomatic=trimmomatic,
                       adapters=adapters,
                       in_file_1=fq_path_1,
                       in_file_2=fq_path_2,
                       out_file_paired_1=out_fs[0],
                       out_file_paired_2=out_fs[1],
                       out_file_unpaired_1=out_fs[2],
                       out_file_unpaired_2=out_fs[3],
                       stats_file=stats_f,
                       threads=threads,
                       minlen=min_acc_len)

        if fq_path_3 is None:
            continue

        unpaired_f = opj(dir_fq_trim_data_sample, 'unpaired.fastq' + ext)
        unpaired_stats_f = opj(dir_fq_trim_data_sample, pe + '_unpaired.txt')

        Log.msg(
            'SE mode (Paired-read SRA run contains unpaired reads):',
            pe)

        trimmomatic_se(trimmomatic=trimmomatic,
                       adapters=adapters,
                       in_file=fq_path_3,
                       out_file=unpaired_f,
                       stats_file=unpaired_stats_f,
                       threads=threads,
                       minlen=min_acc_len)

        # Concatenate the first unpaired output and the trimmed extra reads
        # into a temporary file, then put it in place of the former.
        merged_f = opj(dir_fq_trim_data_sample, 'temp.fastq' + ext)
        f_temp = fqopen(merged_f, w_mode)
        # try/finally guarantees the handle is closed even when reading or
        # writing fails part-way through.
        # NOTE(review): fileinput.hook_compressed yields bytes for
        # compressed files on Python < 3.10; confirm that w_mode matches.
        try:
            with fileinput.FileInput(
                    files=[out_fs[2], unpaired_f],
                    openhook=fileinput.hook_compressed) as f:
                for line in f:
                    f_temp.write(line)
        finally:
            f_temp.close()

        remove(out_fs[2])
        remove(unpaired_f)
        copyfile(merged_f, out_fs[2])
        remove(merged_f)
예제 #8
0
def run_rcorrector(se_fastq_files, pe_fastq_files, dir_fq_cor_data, rcorrector,
                   threads, dir_temp, should_run):
    """Error-correct the FASTQ files of every sample with Rcorrector.

    The input of each sample is read from its 'path' entry and the location
    of the corrected file(s) is stored in its 'cor_path_fq' entry. After
    Rcorrector has run, its ``*.cor.fq`` output is passed through
    ``filter_unc_se`` / ``filter_unc_pe`` and then deleted. A sample whose
    output directory already exists is considered done and is not
    processed again.

    When ``should_run`` is False nothing is executed and 'cor_path_fq' is
    set to the original input path(s).

    Args:
        se_fastq_files: dict of single-end samples; updated in place.
        pe_fastq_files: dict of paired-end samples; updated in place. A
            'path' list with three entries means the run also contains
            unpaired reads, which are corrected in single-end mode.
        dir_fq_cor_data: root directory for the corrected FASTQ files.
        rcorrector: passed through to the Rcorrector runners. If it is None
            while correction is requested and there is at least one sample,
            an error is logged and the process exits.
        threads: passed through to the Rcorrector runners.
        dir_temp: passed through to the Rcorrector runners.
        should_run: correction is skipped only when this is exactly False.

    Returns:
        None. Both sample dicts are mutated in place.
    """
    if len(se_fastq_files) > 0 or len(pe_fastq_files) > 0:
        print()
        if should_run is False:
            Log.wrn('Skipping Rcorrector as requested.')
        else:
            Log.inf('Running Rcorrector.')
            # The executable is only required when correction really runs;
            # a deliberate skip must not abort because it is missing.
            if rcorrector is None:
                Log.err('Rcorrector is not available. Cannot continue. '
                        'Exiting.')
                exit(0)

    for se in se_fastq_files:
        dir_fq_cor_data_sample = opj(dir_fq_cor_data, se)
        fq_path = se_fastq_files[se]['path']
        # Only the extension is needed; it is appended to every file name
        # derived from the input.
        _, _, _, _, ext = plain_or_gzip(fq_path)
        log_f = opj(dir_fq_cor_data_sample, se + '.txt')
        out_f = opj(dir_fq_cor_data_sample, se + '.fastq' + ext)

        if should_run is False:
            se_fastq_files[se]['cor_path_fq'] = fq_path
            continue

        se_fastq_files[se]['cor_path_fq'] = out_f

        if ope(dir_fq_cor_data_sample):
            Log.msg('Corrected FASTQ file already exists:', se)
            continue

        make_dirs(dir_fq_cor_data_sample)
        Log.msg('SE mode:', se)
        run_rcorrector_se(rcorrector=rcorrector,
                          in_file=fq_path,
                          out_dir=dir_fq_cor_data_sample,
                          threads=threads,
                          dir_temp=dir_temp)

        # Path of the file written by Rcorrector: input base name with the
        # '.cor.fq' suffix, plus the input's extension.
        fq_base_path = opj(dir_fq_cor_data_sample, basename(fq_path))
        fq_cor_path = splitext_gz(fq_base_path)[0] + '.cor.fq' + ext

        filter_unc_se(in_file=fq_cor_path, out_file=out_f, log_file=log_f)

        remove(fq_cor_path)

    for pe in pe_fastq_files:
        dir_fq_cor_data_sample = opj(dir_fq_cor_data, pe)
        in_paths = pe_fastq_files[pe]['path']
        fq_path_1 = in_paths[0]
        fq_path_2 = in_paths[1]
        # A third entry, when present, holds the unpaired reads of the run.
        fq_path_3 = in_paths[2] if len(in_paths) == 3 else None
        _, _, _, _, ext = plain_or_gzip(fq_path_1)
        log_f = opj(dir_fq_cor_data_sample, pe + '.txt')
        out_f_1 = opj(dir_fq_cor_data_sample, pe + '_R1.fastq' + ext)
        out_f_2 = opj(dir_fq_cor_data_sample, pe + '_R2.fastq' + ext)
        out_f_3 = None
        if fq_path_3 is not None:
            out_f_3 = opj(dir_fq_cor_data_sample, pe + '_R3.fastq' + ext)

        if should_run is False:
            uncorrected = [fq_path_1, fq_path_2]
            if fq_path_3 is not None:
                uncorrected.append(fq_path_3)
            pe_fastq_files[pe]['cor_path_fq'] = uncorrected
            continue

        corrected = [out_f_1, out_f_2]
        if out_f_3 is not None:
            corrected.append(out_f_3)
        pe_fastq_files[pe]['cor_path_fq'] = corrected

        if ope(dir_fq_cor_data_sample):
            Log.msg('Corrected FASTQ files already exist:', pe)
            continue

        make_dirs(dir_fq_cor_data_sample)
        Log.msg('PE mode:', pe)
        run_rcorrector_pe(rcorrector=rcorrector,
                          in_file_1=fq_path_1,
                          in_file_2=fq_path_2,
                          out_dir=dir_fq_cor_data_sample,
                          threads=threads,
                          dir_temp=dir_temp)

        fq_base_path_1 = opj(dir_fq_cor_data_sample, basename(fq_path_1))
        fq_cor_path_1 = splitext_gz(fq_base_path_1)[0] + '.cor.fq' + ext
        fq_base_path_2 = opj(dir_fq_cor_data_sample, basename(fq_path_2))
        fq_cor_path_2 = splitext_gz(fq_base_path_2)[0] + '.cor.fq' + ext

        filter_unc_pe(in_file_1=fq_cor_path_1,
                      in_file_2=fq_cor_path_2,
                      out_file_1=out_f_1,
                      out_file_2=out_f_2,
                      log_file=log_f)

        remove(fq_cor_path_1)
        remove(fq_cor_path_2)

        if fq_path_3 is None:
            continue

        Log.msg(
            'SE mode (Paired-read SRA run contains unpaired reads):',
            pe)

        run_rcorrector_se(rcorrector=rcorrector,
                          in_file=fq_path_3,
                          out_dir=dir_fq_cor_data_sample,
                          threads=threads,
                          dir_temp=dir_temp)

        fq_base_path_3 = opj(dir_fq_cor_data_sample, basename(fq_path_3))
        # The extension must be appended here exactly as for the other
        # corrected files; without it the path is wrong whenever ext is not
        # empty.
        fq_cor_path_3 = splitext_gz(fq_base_path_3)[0] + '.cor.fq' + ext
        log_f_3 = opj(dir_fq_cor_data_sample, pe + '_unpaired.txt')

        filter_unc_se(in_file=fq_cor_path_3,
                      out_file=out_f_3,
                      log_file=log_f_3)

        remove(fq_cor_path_3)
예제 #9
0
def run_kraken2(order, dbs, se_fastq_files, pe_fastq_files, dir_fq_filter_data,
                confidence, kraken2, threads, dir_temp, fpatt):
    """Filter the reads of every sample through the Kraken2 databases.

    Nothing is done when ``order`` is empty. Samples whose 'path' entry is
    None are skipped. For every remaining sample the current
    'filter_path_fq' is used as input and is then replaced by the location
    of the Kraken2 output. A sample whose output directory already exists
    is considered done and is not processed again.

    When ``order`` contains an entry whose second element is 'nuclear', the
    first element of that entry is used as the name of the sub-directory
    the output paths point into.

    Args:
        order: sequence of entries; element 0 and element 1 of each entry
            are read here, the whole sequence is passed to
            ``run_kraken_filters``.
        dbs: passed through to ``run_kraken_filters``.
        se_fastq_files: dict of single-end samples; updated in place.
        pe_fastq_files: dict of paired-end samples; updated in place.
        dir_fq_filter_data: root directory for the filtered FASTQ files.
        confidence: logged and passed through to ``run_kraken_filters``.
        kraken2: passed through to ``run_kraken_filters``. If it is None
            while there is work to do, an error is logged and the process
            exits.
        threads: passed through to ``run_kraken_filters``.
        dir_temp: passed through to ``run_kraken_filters``.
        fpatt: path templates containing the '@D@' (directory) and '@N@'
            (name) placeholders, used for the paired-end output paths.

    Returns:
        None. Both sample dicts are mutated in place.
    """
    have_reads = len(se_fastq_files) > 0 or len(pe_fastq_files) > 0
    have_order = len(order) > 0

    if have_reads and have_order:
        print()
        Log.inf('Running Kraken2.', 'Confidence: ' + str(confidence))
        if kraken2 is None:
            Log.err('kraken2 is not available. Cannot continue. Exiting.')
            exit(0)

    # Without any database to filter against there is nothing to do.
    if not have_order:
        return

    # Name taken from the first entry flagged as 'nuclear', if there is one.
    nuclear = next((item[0] for item in order if item[1] == 'nuclear'), None)

    for se in se_fastq_files:
        entry = se_fastq_files[se]

        if entry['path'] is None:
            continue

        reads_in = entry['filter_path_fq']
        sample_dir = opj(dir_fq_filter_data, se)
        fastq_name = se + '.fastq'

        if nuclear is not None:
            entry['filter_path_fq'] = opj(sample_dir, nuclear, fastq_name)
        else:
            entry['filter_path_fq'] = opj(sample_dir, fastq_name)

        if not ope(sample_dir):
            make_dirs(sample_dir)
            print()
            Log.msg('SE mode:', se)
            run_kraken_filters(order=order,
                               dbs=dbs,
                               base_name=se,
                               in_files=reads_in,
                               dir_out=sample_dir,
                               confidence=confidence,
                               kraken2=kraken2,
                               threads=threads,
                               dir_temp=dir_temp)
        else:
            Log.msg('Kraken2 filtered FASTQ files already exist:', se)

    for pe in pe_fastq_files:
        entry = pe_fastq_files[pe]

        if entry['path'] is None:
            continue

        reads_in = entry['filter_path_fq']
        sample_dir = opj(dir_fq_filter_data, pe)

        target_dir = sample_dir if nuclear is None \
            else sample_dir + ops + nuclear

        entry['filter_path_fq'] = [
            tmpl.replace('@D@', target_dir).replace('@N@', pe)
            for tmpl in fpatt
        ]

        if not ope(sample_dir):
            make_dirs(sample_dir)
            print()
            Log.msg('PE mode:', pe)
            run_kraken_filters(order=order,
                               dbs=dbs,
                               base_name=pe,
                               in_files=reads_in,
                               dir_out=sample_dir,
                               confidence=confidence,
                               kraken2=kraken2,
                               threads=threads,
                               dir_temp=dir_temp)
        else:
            Log.msg('Kraken2 filtered FASTQ files already exist:', pe)
예제 #10
0
def prepare_output_directories(dir_out, prj_name):
    """Create the output directory tree and return all of its paths.

    Four top-level directories are created below ``dir_out``: '00-temp',
    '00-cache', '01-global' and '02-project-specific/<prj_name>', each with
    the sub-directories listed below.

    Args:
        dir_out: root output directory.
        prj_name: project name, used as a directory name below
            '00-cache/projects' and '02-project-specific'.

    Returns:
        dict mapping a descriptive key (e.g. 'dir_temp', 'dir_prj') to the
        path of the corresponding directory. 'dir_prj_ips' refers to the
        same directory as 'dir_prj_transcripts'.
    """
    def _ensure(*parts):
        # Join the parts into one path, create it and hand the path back.
        path = opj(*parts)
        make_dirs(path)
        return path

    # ToDo: Lock cache files in case of parallel execution -------------------
    dir_temp = _ensure(dir_out, '00-temp')

    # Cache.
    dir_cache = _ensure(dir_out, '00-cache')
    dir_cache_pfam_acc = _ensure(dir_cache, 'pfam-uniprot-accessions')
    dir_cache_fq_minlen = _ensure(dir_cache, 'min-acceptable-read-lengths')
    dir_cache_prj = _ensure(dir_cache, 'projects', prj_name)
    dir_cache_refseqs = _ensure(dir_cache, 'ref-seqs')

    # Project-specific results.
    dir_prj = _ensure(dir_out, '02-project-specific', prj_name)
    dir_prj_logs = _ensure(dir_prj, '00-logs')
    dir_prj_queries = _ensure(dir_prj, '01-queries')
    dir_prj_blast_results_fa_trim = _ensure(dir_prj,
                                            '02-filtered-fa-blast-results')
    dir_prj_vsearch_results_fa_trim = _ensure(
        dir_prj, '03-filtered-fq-vsearch-results')
    dir_prj_spades_assemblies = _ensure(dir_prj, '04-spades-assemblies')
    dir_prj_blast_assmbl = _ensure(dir_prj, '05-assemblies-blast-db-data')
    dir_prj_assmbl_blast_results = _ensure(dir_prj,
                                           '06-assemblies-blast-results')
    dir_prj_transcripts = _ensure(dir_prj, '07-transcripts')
    # No directory of its own: shares the transcripts directory.
    dir_prj_ips = dir_prj_transcripts
    dir_prj_transcripts_combined = _ensure(dir_prj, '08-transcripts-combined')

    # Data shared between projects.
    dir_global = _ensure(dir_out, '01-global')
    dir_fq_data = _ensure(dir_global, '01-sra-fq-data')
    dir_fq_trim_data = _ensure(dir_global, '02-trimmed-fq-data')
    dir_fq_cor_data = _ensure(dir_global, '03-corrected-fq-data')
    dir_fq_filter_bt2_data = _ensure(dir_global, '04-bowtie2-filtered-fq-data')
    dir_fq_filter_krkn2_data = _ensure(dir_global,
                                       '05-kraken2-filtered-fq-data')
    dir_fa_trim_data = _ensure(dir_global, '06-fa-data')
    dir_blast_fa_trim = _ensure(dir_global, '07-fa-blast-db-data')

    return dict(
        dir_blast_fa_trim=dir_blast_fa_trim,
        dir_cache=dir_cache,
        dir_cache_fq_minlen=dir_cache_fq_minlen,
        dir_cache_pfam_acc=dir_cache_pfam_acc,
        dir_cache_prj=dir_cache_prj,
        dir_cache_refseqs=dir_cache_refseqs,
        dir_fa_trim_data=dir_fa_trim_data,
        dir_fq_cor_data=dir_fq_cor_data,
        dir_fq_data=dir_fq_data,
        dir_fq_trim_data=dir_fq_trim_data,
        dir_fq_filter_bt2_data=dir_fq_filter_bt2_data,
        dir_fq_filter_krkn2_data=dir_fq_filter_krkn2_data,
        dir_prj=dir_prj,
        dir_prj_logs=dir_prj_logs,
        dir_prj_assmbl_blast_results=dir_prj_assmbl_blast_results,
        dir_prj_blast_assmbl=dir_prj_blast_assmbl,
        dir_prj_blast_results_fa_trim=dir_prj_blast_results_fa_trim,
        dir_prj_ips=dir_prj_ips,
        dir_prj_queries=dir_prj_queries,
        dir_prj_spades_assemblies=dir_prj_spades_assemblies,
        dir_prj_transcripts=dir_prj_transcripts,
        dir_prj_transcripts_combined=dir_prj_transcripts_combined,
        dir_prj_vsearch_results_fa_trim=dir_prj_vsearch_results_fa_trim,
        dir_temp=dir_temp,
    )
예제 #11
0
def run_tblastn_on_reads(se_fastq_files, pe_fastq_files, aa_queries_file,
                         tblastn, blast_1_evalue, blast_1_max_hsps,
                         blast_1_qcov_hsp_perc, blast_1_best_hit_overhang,
                         blast_1_best_hit_score_edge, blast_1_max_target_seqs,
                         dir_blast_results_fa_trim, fpatt, ss, threads, seqtk,
                         vsearch, dir_cache_prj):
    """Run tblastn with the amino acid queries against the read databases.

    For every sample the BLAST hits are reduced to unique lines, the
    matching reads are extracted with Seqtk and converted to FASTA, and the
    FASTA file is clustered with vsearch. The location of the resulting
    file is stored in the sample's 'blast_results_path__<ss>' entry.

    The BLAST settings together with the query sequences are pickled to a
    cache file in ``dir_cache_prj``. A sample is skipped only if its result
    file exists and the cached settings are equal to the current ones. The
    cache file is rewritten at the end of every call.

    Args:
        se_fastq_files: dict of single-end samples; updated in place.
        pe_fastq_files: dict of paired-end samples; updated in place.
        aa_queries_file: FASTA file with the amino acid queries.
        tblastn, seqtk, vsearch: passed through to the respective runners.
            If any of them is None and there is at least one sample, an
            error is logged and the process exits.
        blast_1_evalue, blast_1_max_hsps, blast_1_qcov_hsp_perc,
        blast_1_best_hit_overhang, blast_1_best_hit_score_edge,
        blast_1_max_target_seqs: BLAST settings, passed to ``run_blast``.
        dir_blast_results_fa_trim: root directory for the result files.
        fpatt: path templates containing the '@D@' (directory), '@N@'
            (name) and '@Q@' (search strategy) placeholders.
        ss: search strategy name; part of file names and dict keys.
        threads: passed through to ``run_blast``.
        dir_cache_prj: directory holding the settings cache file.

    Returns:
        True if BLAST was (re)run for at least one sample, False otherwise.
    """

    changed_blast_1 = False

    if len(se_fastq_files) > 0 or len(pe_fastq_files) > 0:
        print()
        Log.inf('Running BLAST on reads:', ss)
        if tblastn is None:
            Log.err('tblastn is not available. Cannot continue. Exiting.')
            exit(0)

        if vsearch is None:
            Log.err('vsearch is not available. Cannot continue. Exiting.')
            exit(0)

        if seqtk is None:
            Log.err('seqtk is not available. Cannot continue. Exiting.')
            exit(0)

    cache_file = opj(dir_cache_prj, 'blast_1_settings_cache__' + ss)

    settings = {
        'blast_1_evalue': blast_1_evalue,
        'blast_1_max_hsps': blast_1_max_hsps,
        'blast_1_qcov_hsp_perc': blast_1_qcov_hsp_perc,
        'blast_1_best_hit_overhang': blast_1_best_hit_overhang,
        'blast_1_best_hit_score_edge': blast_1_best_hit_score_edge,
        'blast_1_max_target_seqs': blast_1_max_target_seqs,
        'queries': seq_records_to_dict(read_fasta(aa_queries_file,
                                                  SEQ_TYPE_AA))
    }

    # The cache is not modified until the very end of this function, so it
    # is read and compared once here instead of once per sample.
    # SECURITY: pickle.load can execute arbitrary code. This is acceptable
    # only because the file is written by this function into the project's
    # own cache directory; never point it at untrusted data.
    pickled = dict()
    if ope(cache_file):
        try:
            with open(cache_file, 'rb') as f:
                pickled = pickle.load(f)
        except (pickle.UnpicklingError, EOFError):
            # E.g. a cache file truncated by an interrupted run. Treat it
            # as "settings changed" so that BLAST is simply run again.
            Log.wrn('Could not read the cached BLAST settings.')
            pickled = dict()
    settings_unchanged = pickled == settings

    Log.msg('evalue:', str(blast_1_evalue))
    Log.msg('max_hsps:', str(blast_1_max_hsps))
    Log.msg('qcov_hsp_perc:', str(blast_1_qcov_hsp_perc))
    Log.msg('best_hit_overhang:', str(blast_1_best_hit_overhang))
    Log.msg('best_hit_score_edge:', str(blast_1_best_hit_score_edge))
    Log.msg('max_target_seqs:', str(blast_1_max_target_seqs))
    print()

    # FixMe: Expose in configuration files?
    ident = 0.85

    for se in se_fastq_files:
        dir_results = opj(dir_blast_results_fa_trim, se)
        blast_db_path = se_fastq_files[se]['blast_db_path']
        fq_path = se_fastq_files[se]['filter_path_fq']
        out_f = opj(dir_results, se + '__' + ss + '.txt')
        out_f_fastq = out_f.replace('.txt', '.fastq')
        out_f_fasta = out_f.replace('.txt', '.fasta')
        se_fastq_files[se]['blast_results_path' + '__' + ss] = out_f_fasta
        genetic_code = se_fastq_files[se]['gc_id']

        if settings_unchanged and ope(out_f_fasta):
            Log.msg('BLAST results already exist:', se)
            continue

        changed_blast_1 = True
        make_dirs(dir_results)
        Log.msg('Running tblastn on: ' + basename(blast_db_path), ss)
        run_blast(exec_file=tblastn,
                  task='tblastn',
                  threads=threads,
                  db_path=blast_db_path,
                  queries_file=aa_queries_file,
                  out_file=out_f,
                  evalue=blast_1_evalue,
                  max_hsps=blast_1_max_hsps,
                  qcov_hsp_perc=blast_1_qcov_hsp_perc,
                  best_hit_overhang=blast_1_best_hit_overhang,
                  best_hit_score_edge=blast_1_best_hit_score_edge,
                  max_target_seqs=blast_1_max_target_seqs,
                  db_genetic_code=genetic_code,
                  out_cols=BLST_RES_COLS_1)

        # Per-sample progress message: same log call as in the paired-end
        # branch below.
        Log.msg('Extracting unique BLAST hits using Seqtk:', ss)

        keep_unique_lines_in_file(out_f)

        seqtk_extract_reads(seqtk, fq_path, out_f_fastq, out_f)
        seqtk_fq_to_fa(seqtk, out_f_fastq, out_f_fasta)

        # Only the FASTA file is kept.
        osremove(out_f)
        osremove(out_f_fastq)

        # Cluster via a temporary copy so that the result can be written to
        # the final path.
        out_f_fasta_temp = out_f_fasta + '_temp'
        copyfile(out_f_fasta, out_f_fasta_temp)
        run_cluster_fast(vsearch, ident, out_f_fasta_temp, out_f_fasta)
        osremove(out_f_fasta_temp)

    for pe in pe_fastq_files:
        dir_results = opj(dir_blast_results_fa_trim, pe)
        blast_db_paths = pe_fastq_files[pe]['blast_db_path']
        fq_paths = pe_fastq_files[pe]['filter_path_fq']
        out_fs = [
            x.replace('@D@', dir_results).replace('@N@', pe)
            .replace('@Q@', ss)
            for x in fpatt
        ]
        out_fs_fastq = [x.replace('.txt', '.fastq') for x in out_fs]
        out_fs_fasta = [x.replace('.txt', '.fasta') for x in out_fs]
        out_f_fasta = opj(dir_results, pe + '__' + ss + '.fasta')
        pe_fastq_files[pe]['blast_results_path' + '__' + ss] = out_f_fasta
        genetic_code = pe_fastq_files[pe]['gc_id']

        if settings_unchanged and ope(out_f_fasta):
            Log.msg('BLAST results already exist:', pe)
            continue

        changed_blast_1 = True
        make_dirs(dir_results)

        # One BLAST run per read file of the sample.
        per_file = zip(blast_db_paths, out_fs, fq_paths, out_fs_fastq,
                       out_fs_fasta)
        for db_path, hits_f, reads_f, hits_fastq, hits_fasta in per_file:
            Log.msg('Running tblastn on: ' + basename(db_path), ss)
            run_blast(exec_file=tblastn,
                      task='tblastn',
                      threads=threads,
                      db_path=db_path,
                      queries_file=aa_queries_file,
                      out_file=hits_f,
                      evalue=blast_1_evalue,
                      max_hsps=blast_1_max_hsps,
                      qcov_hsp_perc=blast_1_qcov_hsp_perc,
                      best_hit_overhang=blast_1_best_hit_overhang,
                      best_hit_score_edge=blast_1_best_hit_score_edge,
                      max_target_seqs=blast_1_max_target_seqs,
                      db_genetic_code=genetic_code,
                      out_cols=BLST_RES_COLS_1)

            Log.msg('Extracting unique BLAST hits using Seqtk:', ss)

            keep_unique_lines_in_file(hits_f)

            seqtk_extract_reads(seqtk, reads_f, hits_fastq, hits_f)
            seqtk_fq_to_fa(seqtk, hits_fastq, hits_fasta)

            osremove(hits_f)
            osremove(hits_fastq)

        # Merge the per-file FASTA files into one file for the sample.
        combine_text_files(out_fs_fasta, out_f_fasta)

        out_f_fasta_temp = out_f_fasta + '_temp'
        copyfile(out_f_fasta, out_f_fasta_temp)
        run_cluster_fast(vsearch, ident, out_f_fasta_temp, out_f_fasta)
        osremove(out_f_fasta_temp)

        for x in out_fs_fasta:
            osremove(x)

    # Remember the settings used, for comparison on the next run.
    with open(cache_file, 'wb') as f:
        pickle.dump(settings, f, protocol=PICKLE_PROTOCOL)

    return changed_blast_1
예제 #12
0
def run_vsearch_on_reads(se_fastq_files, pe_fastq_files, vsearch,
                         dir_vsearch_results_fa_trim, fpatt, ss, seqtk):
    """Run vsearch against the filtered reads of every sample.

    For each sample the FASTA stored under ``'blast_results_path__<ss>'`` is
    used as the vsearch query file and the sample's ``'filter_path_fq'``
    file(s) as the database. The resulting hit list is de-duplicated, the
    matching reads are extracted with Seqtk into ``.fastq`` files, and the
    intermediate ``.txt`` hit lists are removed.

    Args:
        se_fastq_files: Mapping of single-end sample name to a per-sample
            dict. Reads ``'min_acc_len'``, ``'blast_results_path__<ss>'`` and
            ``'filter_path_fq'``; writes ``'vsearch_results_path__<ss>'``.
        pe_fastq_files: Same for paired-end samples, except that
            ``'filter_path_fq'`` is indexed 0-3 (0/1 are handled as the
            paired files, 2/3 as the unpaired files) and the stored result is
            a list of paths.
        vsearch: Passed to ``run_vsearch``; ``None`` aborts when there is at
            least one sample.
        dir_vsearch_results_fa_trim: Parent directory of the per-sample
            result directories.
        fpatt: File-name patterns containing ``@D@`` (directory), ``@N@``
            (sample name) and ``@Q@`` (``ss``) placeholders.
        ss: Label appended to dictionary keys and file names, and passed to
            the log messages.
        seqtk: Passed to ``seqtk_extract_reads``; ``None`` aborts when there
            is at least one sample.

    Returns:
        None. Results are recorded in the per-sample dicts and on disk.
    """
    have_samples = len(se_fastq_files) > 0 or len(pe_fastq_files) > 0
    if have_samples:
        if vsearch is None:
            Log.err('vsearch is not available. Cannot continue. Exiting.')
            exit(0)
        if seqtk is None:
            Log.err('seqtk is not available. Cannot continue. Exiting.')
            exit(0)

    # FixMe: Expose in configuration files?
    identity = 0.85

    # ---- Single-end samples ------------------------------------------------
    for se_name in se_fastq_files:
        sample = se_fastq_files[se_name]
        results_dir = opj(dir_vsearch_results_fa_trim, se_name)
        min_len = sample['min_acc_len']
        tag = '__' + ss
        queries_fa = sample['blast_results_path' + tag]
        reads_fq = sample['filter_path_fq']
        hits_txt = opj(results_dir, se_name + tag + '.txt')
        hits_fq = hits_txt.replace('.txt', '.fastq')
        sample['vsearch_results_path' + tag] = hits_fq

        # The extracted FASTQ doubles as the "already done" marker.
        if ope(hits_fq):
            Log.msg('Vsearch results already exist:', se_name)
            continue

        make_dirs(results_dir)
        Log.msg('Running vsearch on: ' + basename(reads_fq), ss)
        run_vsearch(vsearch,
                    ident=identity,
                    q_file=queries_fa,
                    db_file=reads_fq,
                    out_file=hits_txt,
                    minlen=min_len)

        Log.msg('Extracting unique vsearch hits using Seqtk:', ss)
        keep_unique_lines_in_file(hits_txt)
        seqtk_extract_reads(seqtk, reads_fq, hits_fq, hits_txt)
        osremove(hits_txt)

    # ---- Paired-end samples ------------------------------------------------
    for pe_name in pe_fastq_files:
        sample = pe_fastq_files[pe_name]
        results_dir = opj(dir_vsearch_results_fa_trim, pe_name)
        min_len = sample['min_acc_len']
        tag = '__' + ss
        queries_fa = sample['blast_results_path' + tag]
        reads_fqs = sample['filter_path_fq']
        hits_txts = [
            patt.replace('@D@', results_dir)
                .replace('@N@', pe_name)
                .replace('@Q@', ss)
            for patt in fpatt
        ]
        hits_fqs = [txt.replace('.txt', '.fastq') for txt in hits_txts]
        sample['vsearch_results_path' + tag] = hits_fqs

        # All four extracted FASTQ files must be present to skip the sample.
        if all(ope(hits_fqs[k]) for k in range(4)):
            Log.msg('Vsearch results already exist:', pe_name)
            continue

        make_dirs(results_dir)
        for reads_fq, hits_txt, _hits_fq in zip(reads_fqs, hits_txts,
                                                hits_fqs):
            Log.msg('Running vsearch on: ' + basename(reads_fq), ss)
            run_vsearch(vsearch,
                        ident=identity,
                        q_file=queries_fa,
                        db_file=reads_fq,
                        out_file=hits_txt,
                        minlen=min_len)

        Log.msg(
            'Extracting unique vsearch hits from paired files '
            'using Seqtk:', ss)

        paired_txts = (hits_txts[0], hits_txts[1])
        paired_fqs = (reads_fqs[0], reads_fqs[1])
        paired_outs = (hits_fqs[0], hits_fqs[1])

        # Hits from both mates are pooled so that the same read names are
        # extracted from each of the two paired files.
        pooled_txt = opj(results_dir, pe_name + tag + '_paired.txt')
        combine_text_files(list(paired_txts), pooled_txt)
        keep_unique_lines_in_file(pooled_txt)

        for reads_fq, out_fq in zip(paired_fqs, paired_outs):
            seqtk_extract_reads(seqtk, reads_fq, out_fq, pooled_txt)

        for stale in paired_txts + (pooled_txt,):
            osremove(stale)

        Log.msg(
            'Extracting unique vsearch hits from unpaired files '
            'using Seqtk:', ss)

        unpaired_txts = (hits_txts[2], hits_txts[3])
        unpaired_fqs = (reads_fqs[2], reads_fqs[3])
        unpaired_outs = (hits_fqs[2], hits_fqs[3])

        # Unpaired files are handled independently of each other.
        for txt in unpaired_txts:
            keep_unique_lines_in_file(txt)

        for reads_fq, out_fq, txt in zip(unpaired_fqs, unpaired_outs,
                                         unpaired_txts):
            seqtk_extract_reads(seqtk, reads_fq, out_fq, txt)

        for txt in unpaired_txts:
            osremove(txt)
예제 #13
0
파일: kraken.py 프로젝트: muti99/kakapo
def _kraken_filter_single_file(dbs_ordered, in_file, in_file_ext, base_name,
                               suffix, log_msg, dir_out, confidence, kraken2,
                               threads, dir_temp):
    """Pass one reads file through every Kraken2 database in sequence.

    The unclassified output of each database becomes the input of the next
    one. Classified reads and the report of each database are written to
    ``dir_out/<db>/``; whatever is left after the last database is moved to
    ``dir_out/<base_name><suffix><in_file_ext>``.

    Args:
        dbs_ordered: Ordered mapping of database name to the value passed as
            ``db`` to ``run_kraken_se``.
        in_file: Path of the reads file to filter.
        in_file_ext: Extension appended to every produced reads file.
        base_name: Sample name used as the prefix of all output files.
        suffix: Text inserted after ``base_name`` in the per-database and
            final file names (``''`` for single-end input).
        log_msg: Message logged, together with the database name, before
            each database is processed.
        dir_out: Output directory.
        confidence, kraken2, threads, dir_temp: Forwarded to
            ``run_kraken_se``; ``dir_temp`` also receives the intermediate
            unclassified files.
    """
    for i, db in enumerate(dbs_ordered):
        Log.msg(log_msg, db)
        dir_out_db = opj(dir_out, db)
        make_dirs(dir_out_db)
        report_file = opj(dir_out_db, base_name + suffix + '.txt')
        out_class_file = opj(dir_out_db, base_name + suffix + in_file_ext)
        out_unclass_file = opj(
            dir_temp,
            base_name + '_' + db + '_kraken2_unclassified' + in_file_ext)

        # Kraken2 freaks out if the file is empty.
        if stat(in_file).st_size > 0:
            run_kraken_se(kraken=kraken2,
                          db=dbs_ordered[db],
                          in_file=in_file,
                          out_class_file=out_class_file,
                          out_unclass_file=out_unclass_file,
                          report_file=report_file,
                          confidence=confidence,
                          threads=threads,
                          dir_temp=dir_temp)
        else:
            # Emulate Kraken2's outputs with copies of the empty input.
            copyfile(in_file, out_class_file)
            copyfile(in_file, out_unclass_file)

        # The very first input belongs to the caller; every later input is
        # an intermediate file created by the previous iteration.
        if i > 0:
            remove(in_file)
        in_file = out_unclass_file

    # NOTE(review): when ``dbs_ordered`` is empty the loop never runs and the
    # caller's original input file is what gets moved - confirm intended.
    move(in_file, opj(dir_out, base_name + suffix + in_file_ext))


def run_kraken_filters(order, dbs, base_name, in_files, dir_out, confidence,
                       kraken2, threads, dir_temp):
    """Filter reads through a series of Kraken2 databases.

    Reads are passed through the databases named in ``order``, one after
    another; the unclassified reads from one database are the input for the
    next. Reads classified by a database (and its report) are stored in
    ``dir_out/<db>/``, and the reads that remain unclassified after the last
    database are moved into ``dir_out``.

    Args:
        order: Iterable of sequences whose first item is a database name.
            Names missing from ``dbs`` are skipped with a warning.
        dbs: Mapping of database name to the value passed as ``db`` to the
            Kraken2 runners.
        base_name: Sample name used as the prefix of all output files.
        in_files: Either a single path (``str``/``bytes``) for single-end
            reads, or a list/tuple ``[R1, R2]``, optionally followed by an
            unpaired-forward file and then an unpaired-reverse file. Any
            other type results in no work being done.
        dir_out: Output directory.
        confidence: Forwarded to the Kraken2 runners.
        kraken2: Forwarded to the Kraken2 runners as ``kraken``.
        threads: Forwarded to the Kraken2 runners.
        dir_temp: Directory for intermediate unclassified files; also
            forwarded to the Kraken2 runners.

    Returns:
        None.

    Raises:
        AssertionError: If ``in_files`` is a list/tuple with fewer than two
            items.
    """
    dbs_ordered = OrderedDict()
    for dbn in order:
        db_name = dbn[0]
        if db_name in dbs:
            dbs_ordered[db_name] = dbs[db_name]
        else:
            Log.wrn('Kraken2 database not found:', db_name)

    # SE
    if isinstance(in_files, (str, bytes)):
        _, in_file_ext, _ = splitext_gz(in_files)
        _kraken_filter_single_file(
            dbs_ordered=dbs_ordered,
            in_file=in_files,
            in_file_ext=in_file_ext,
            base_name=base_name,
            suffix='',
            log_msg='Filtering SE reads using Kraken2 database:',
            dir_out=dir_out,
            confidence=confidence,
            kraken2=kraken2,
            threads=threads,
            dir_temp=dir_temp)

    # PE
    elif isinstance(in_files, (list, tuple)):

        assert len(in_files) > 1

        _, in_file_ext, _ = splitext_gz(in_files[0])

        in_file_R1 = in_files[0]
        in_file_R2 = in_files[1]

        # Unpaired reads are filtered as if they were single-end files.
        if len(in_files) > 2:
            _kraken_filter_single_file(
                dbs_ordered=dbs_ordered,
                in_file=in_files[2],
                in_file_ext=in_file_ext,
                base_name=base_name,
                suffix='_unpaired_1',
                log_msg=('Filtering unpaired forward reads using Kraken2 '
                         'database:'),
                dir_out=dir_out,
                confidence=confidence,
                kraken2=kraken2,
                threads=threads,
                dir_temp=dir_temp)

        if len(in_files) == 4:
            _kraken_filter_single_file(
                dbs_ordered=dbs_ordered,
                in_file=in_files[3],
                in_file_ext=in_file_ext,
                base_name=base_name,
                suffix='_unpaired_2',
                log_msg=('Filtering unpaired reverse reads using Kraken2 '
                         'database:'),
                dir_out=dir_out,
                confidence=confidence,
                kraken2=kraken2,
                threads=threads,
                dir_temp=dir_temp)

        for i, db in enumerate(dbs_ordered):
            Log.msg('Filtering paired reads using Kraken2 database:', db)
            dir_out_db = opj(dir_out, db)
            make_dirs(dir_out_db)
            report_file = opj(dir_out_db, base_name + '_paired.txt')
            # '#' is a placeholder for the mate number; this function
            # substitutes '_1' / '_2' for it when it needs the per-mate path
            # (presumably mirroring what Kraken2 does - TODO confirm).
            out_class_file = opj(dir_out_db,
                                 base_name + '_paired#' + in_file_ext)
            out_unclass_file = opj(
                dir_temp, base_name + '_' + db + '_kraken2_unclassified' +
                '_paired#' + in_file_ext)

            if stat(in_file_R1).st_size > 0 and stat(in_file_R2).st_size > 0:
                run_kraken_pe(kraken=kraken2,
                              db=dbs_ordered[db],
                              in_file_1=in_file_R1,
                              in_file_2=in_file_R2,
                              out_class_file=out_class_file,
                              out_unclass_file=out_unclass_file,
                              report_file=report_file,
                              confidence=confidence,
                              threads=threads,
                              dir_temp=dir_temp)
            else:
                # Emulate Kraken2's outputs with copies of the inputs. Each
                # file is copied exactly once (the previous nested
                # ``copyfile(src, copyfile(src, dst))`` copied it twice).
                copyfile(in_file_R1, out_class_file.replace('#', '_1'))
                copyfile(in_file_R2, out_class_file.replace('#', '_2'))
                copyfile(in_file_R1, out_unclass_file.replace('#', '_1'))
                copyfile(in_file_R2, out_unclass_file.replace('#', '_2'))

            # Only intermediate files (not the caller's inputs) are removed.
            if i > 0:
                remove(in_file_R1)
                remove(in_file_R2)

            in_file_R1 = out_unclass_file.replace('#', '_1')
            in_file_R2 = out_unclass_file.replace('#', '_2')

        move(in_file_R1, opj(dir_out, base_name + '_paired_1' + in_file_ext))
        move(in_file_R2, opj(dir_out, base_name + '_paired_2' + in_file_ext))
예제 #14
0
def run_rcorrector(se_fastq_files, pe_fastq_files, dir_fq_cor_data, rcorrector,
                   threads, dir_temp, fpatt, should_run):
    """Run Rcorrector on the trimmed reads of every sample.

    For each sample the ``'trim_path_fq'`` file(s) are corrected, the
    ``.cor.fq`` output is passed through ``filter_unc_se`` /
    ``filter_unc_pe`` and then removed. The path(s) of the final file(s) are
    stored under ``'cor_path_fq'``. A sample whose output directory already
    exists is not processed again.

    Args:
        se_fastq_files: Mapping of single-end sample name to a per-sample
            dict. Reads ``'trim_path_fq'``; writes ``'cor_path_fq'``.
        pe_fastq_files: Same for paired-end samples; ``'trim_path_fq'`` is
            indexed 0-3 (0/1 are corrected together as a pair, 2/3 are
            corrected individually as unpaired files) and ``'cor_path_fq'``
            is a list.
        dir_fq_cor_data: Parent directory of the per-sample output
            directories.
        rcorrector: Passed to the Rcorrector runners; ``None`` aborts when
            correction was requested and there is at least one sample.
        threads: Forwarded to the Rcorrector runners.
        dir_temp: Forwarded to the Rcorrector runners.
        fpatt: File-name patterns containing ``@D@`` (directory) and ``@N@``
            (sample name) placeholders.
        should_run: When ``False``, nothing is corrected and
            ``'cor_path_fq'`` is set to the trimmed input path(s).

    Returns:
        None. Results are recorded in the per-sample dicts and on disk.
    """
    if len(se_fastq_files) > 0 or len(pe_fastq_files) > 0:
        print()
        if should_run is not False:
            Log.inf('Running Rcorrector.')

            if rcorrector is None:
                Log.err(
                    'Rcorrector is not available. Cannot continue. Exiting.')
                exit(0)
        else:
            Log.wrn('Skipping Rcorrector as requested.')

    # ---- Single-end samples ------------------------------------------------
    for se_name in se_fastq_files:
        sample_dir = opj(dir_fq_cor_data, se_name)
        trimmed_fq = se_fastq_files[se_name]['trim_path_fq']
        _, _, _, _, gz_ext = plain_or_gzip(trimmed_fq)
        filter_log = opj(sample_dir, se_name + '.txt')
        final_fq = opj(sample_dir, se_name + '.fastq' + gz_ext)

        if should_run is False:
            # Downstream steps fall back to the uncorrected reads.
            se_fastq_files[se_name]['cor_path_fq'] = trimmed_fq
            continue

        se_fastq_files[se_name]['cor_path_fq'] = final_fq

        if ope(sample_dir):
            Log.msg('Corrected FASTQ file already exists:', se_name)
            continue

        make_dirs(sample_dir)
        Log.msg('SE mode:', se_name)
        run_rcorrector_se(rcorrector=rcorrector,
                          in_file=trimmed_fq,
                          out_dir=sample_dir,
                          threads=threads,
                          dir_temp=dir_temp)

        # Path of the '.cor.fq' file expected in the sample directory.
        raw_cor_fq = (splitext_gz(opj(sample_dir, basename(trimmed_fq)))[0]
                      + '.cor.fq' + gz_ext)

        filter_unc_se(in_file=raw_cor_fq, out_file=final_fq,
                      log_file=filter_log)

        remove(raw_cor_fq)

    # ---- Paired-end samples ------------------------------------------------
    for pe_name in pe_fastq_files:
        sample_dir = opj(dir_fq_cor_data, pe_name)

        trimmed = pe_fastq_files[pe_name]['trim_path_fq']
        fq_1, fq_2, fq_3, fq_4 = (trimmed[0], trimmed[1], trimmed[2],
                                  trimmed[3])

        _, _, _, _, gz_ext = plain_or_gzip(fq_1)
        paired_log = opj(sample_dir, pe_name + '_paired.txt')

        final_fqs = [
            patt.replace('@D@', sample_dir).replace('@N@', pe_name) + gz_ext
            for patt in fpatt
        ]

        if should_run is False:
            # Downstream steps fall back to the uncorrected reads.
            pe_fastq_files[pe_name]['cor_path_fq'] = [fq_1, fq_2, fq_3, fq_4]
            continue

        pe_fastq_files[pe_name]['cor_path_fq'] = final_fqs

        if ope(sample_dir):
            Log.msg('Corrected FASTQ files already exist:', pe_name)
            continue

        make_dirs(sample_dir)
        Log.msg('PE mode:', pe_name)
        run_rcorrector_pe(rcorrector=rcorrector,
                          in_file_1=fq_1,
                          in_file_2=fq_2,
                          out_dir=sample_dir,
                          threads=threads,
                          dir_temp=dir_temp)

        raw_cor_1 = (splitext_gz(opj(sample_dir, basename(fq_1)))[0]
                     + '.cor.fq' + gz_ext)
        raw_cor_2 = (splitext_gz(opj(sample_dir, basename(fq_2)))[0]
                     + '.cor.fq' + gz_ext)

        filter_unc_pe(in_file_1=raw_cor_1,
                      in_file_2=raw_cor_2,
                      out_file_1=final_fqs[0],
                      out_file_2=final_fqs[1],
                      log_file=paired_log)

        remove(raw_cor_1)
        remove(raw_cor_2)

        # Unpaired files: (input, index into final_fqs, log-file suffix).
        unpaired_jobs = (
            (fq_3, 2, '_unpaired_1.txt'),
            (fq_4, 3, '_unpaired_2.txt'),
        )
        for unpaired_fq, out_idx, log_suffix in unpaired_jobs:
            if stat(unpaired_fq).st_size == 0:
                # Nothing to correct: just create an empty output file.
                with open(final_fqs[out_idx], 'w') as f:
                    f.write('')
                continue

            run_rcorrector_se(rcorrector=rcorrector,
                              in_file=unpaired_fq,
                              out_dir=sample_dir,
                              threads=threads,
                              dir_temp=dir_temp)

            raw_cor_unp = (
                splitext_gz(opj(sample_dir, basename(unpaired_fq)))[0]
                + '.cor.fq' + gz_ext)
            unpaired_log = opj(sample_dir, pe_name + log_suffix)

            filter_unc_se(in_file=raw_cor_unp,
                          out_file=final_fqs[out_idx],
                          log_file=unpaired_log)

            remove(raw_cor_unp)