Example #1
def _parse_taxa(taxa, tax_group, taxonomy, config_file_path):
    txids = list()

    for tax in taxa:
        if tax.isdigit():
            txids.append(int(tax))
        else:
            # tax_orig = tax
            txid = taxonomy.tax_id_for_name_and_group_tax_id(
                name=tax, group_tax_id=tax_group)

            if txid is None:
                txid = taxonomy.tax_id_for_name_and_group_tax_id(
                    name=tax.split(' ')[0], group_tax_id=tax_group)

            if txid is None:
                # Keep a None placeholder so the caller can tell which
                # names failed to resolve.
                txids.append(txid)
                msg = 'NCBI taxonomy ID could not be found for:'
                Log.wrn(msg, tax)
                # replace_line_in_file(
                #     file_path=config_file_path,
                #     line_str=tax_orig,
                #     replace_str='; NCBI taxid not found: ' + tax)

            else:
                txids.append(int(txid))
                msg = 'NCBI taxonomy ID for ' + tax + ' is:'
                Log.msg(msg, str(txid))
                # replace_line_in_file(
                #     file_path=config_file_path,
                #     line_str=tax_orig,
                #     replace_str='; ' + tax + '\n' + str(txid))

    return txids
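
A minimal usage sketch (every name below is hypothetical; the taxonomy object is assumed to expose tax_id_for_name_and_group_tax_id as used above). Numeric strings pass through unchanged, while names are resolved, retrying with the genus alone:

# Illustrative call with made-up arguments.
txids = _parse_taxa(
    taxa=['9606', 'Solanum lycopersicum'],
    tax_group=33090,                # e.g. Viridiplantae
    taxonomy=taxonomy,              # assumed taxonomy service object
    config_file_path='kakapo.ini')  # hypothetical path
# -> e.g. [9606, 4081]; unresolved names leave None placeholders.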
Example #2
def dep_check_bowtie2(dir_dep, os_id, force):
    if os_id == 'mac':
        url = ('https://sourceforge.net/projects/bowtie-bio/files/bowtie2/'
               '2.4.1/bowtie2-2.4.1-macos-x86_64.zip/download')
    elif os_id == 'linux':
        url = ('https://sourceforge.net/projects/bowtie-bio/files/bowtie2/'
               '2.4.1/bowtie2-2.4.1-linux-x86_64.zip/download')

    dnld_path = opj(dir_dep, 'bowtie2.zip')

    try:
        if force is True:
            # A bare 'raise' with no active exception raises RuntimeError,
            # deliberately forcing the fallback/download path below.
            raise
        bowtie2 = which('bowtie2')
        bowtie2_build = which('bowtie2-build')
        run([bowtie2, '-h'])
        run([bowtie2_build, '-h'])
    except Exception:
        try:
            dir_bin = opj(dir_dep, get_dep_dir(dir_dep, 'bowtie2'))
            bowtie2 = opj(dir_bin, 'bowtie2')
            bowtie2_build = opj(dir_bin, 'bowtie2-build')
            run([bowtie2, '-h'])
            run([bowtie2_build, '-h'])
        except Exception:
            Log.wrn('Bowtie 2 was not found on this system, trying to '
                    'download.')
            download_file(url, dnld_path)
            zip_ref = zipfile.ZipFile(dnld_path, 'r')
            zip_ref.extractall(dir_dep)
            zip_ref.close()

            dir_bin = opj(dir_dep, get_dep_dir(dir_dep, 'bowtie2'))
            bowtie2 = opj(dir_bin, 'bowtie2')
            bowtie2_build = opj(dir_bin, 'bowtie2-build')

            bowtie2_execs = ('', '-align-l', '-align-l-debug', '-align-s',
                             '-align-s-debug', '-build', '-build-l',
                             '-build-l-debug', '-build-s', '-build-s-debug',
                             '-inspect', '-inspect-l', '-inspect-l-debug',
                             '-inspect-s', '-inspect-s-debug')

            for bt2exe in bowtie2_execs:
                chmod(
                    bowtie2 + bt2exe, stat.S_IRWXU | stat.S_IRGRP
                    | stat.S_IXGRP | stat.S_IROTH | stat.S_IXOTH)

            if not ope(bowtie2):
                Log.err('Could not download Bowtie 2.')
                return None, None

    regexp = r'^.*?version\s([\d\.]*)'
    v = get_dep_version([bowtie2, '--version'], regexp)
    Log.msg('bowtie2 is available:', v + ' ' + bowtie2)
    v = get_dep_version([bowtie2_build, '--version'], regexp)
    Log.msg('bowtie2-build is available:', v + ' ' + bowtie2_build)

    return bowtie2, bowtie2_build
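
The dep_check_* helpers in this listing share one contract: probe the PATH first, then a previously unpacked copy under dir_dep, and only then download and unpack. A hedged usage sketch (the directory is an assumption):

bowtie2, bowtie2_build = dep_check_bowtie2('/opt/deps', os_id='linux',
                                           force=False)
if bowtie2 is None:
    raise SystemExit('Bowtie 2 could not be provisioned.')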
Example #3
def dnld_refseqs_for_taxid(taxid,
                           filter_term,
                           taxonomy,
                           dir_cache_refseqs,
                           query='',
                           db='nuccore'):
    ft = None
    if filter_term == 'plastid':
        ft = '("chloroplast"[filter] OR "plastid"[filter])'
    else:
        ft = '("' + filter_term + '"[filter])'

    tax_terms = tuple(reversed(taxonomy.lineage_for_taxid(taxid)['names']))
    for tax_term in tax_terms:
        if tax_term is None:
            tax_term = taxonomy.scientific_name_for_taxid(taxid)
        term = '"RefSeq"[Keyword] AND "{}"[Primary Organism] AND {}'.format(
            tax_term, ft)
        term = query + term
        accs = set(accs_eutil(search_eutil(db, term)))
        if len(accs) > 0:
            plural = 'sequences'
            if len(accs) == 1:
                plural = 'sequence'
            Log.msg(
                'Found {} RefSeq {} {} for'.format(len(accs), filter_term,
                                                   plural), tax_term)
            # Random sample ###################################################
            if len(accs) > 10:
                Log.wrn('Using a random sample of ten RefSeq sequences.')
                random.seed(a=len(accs), version=2)
                # random.sample() does not accept sets on Python 3.11+;
                # sort first so the fixed seed gives a reproducible sample.
                accs = set(random.sample(sorted(accs), 10))
            ###################################################################
            break
        else:
            Log.wrn(
                'No RefSeq {} sequences were found for'.format(filter_term),
                tax_term)

    cache_path = opj(
        dir_cache_refseqs,
        filter_term + '__' + tax_term.replace(' ', '_') + '.fasta')
    parsed_fasta_cache = {}
    if ope(cache_path):
        parsed_fasta_cache = read_fasta(cache_path,
                                        seq_type=SEQ_TYPE_NT,
                                        def_to_first_space=True)
        parsed_fasta_cache = seq_records_to_dict(parsed_fasta_cache)
        for acc in parsed_fasta_cache:
            if acc in accs:
                accs.remove(acc)
    if len(accs) > 0:
        parsed_fasta = dnld_ncbi_seqs(db, list(accs))
        parsed_fasta = seq_records_to_dict(parsed_fasta, prepend_acc=True)
        parsed_fasta.update(parsed_fasta_cache)
        write_fasta(parsed_fasta, cache_path)

    return cache_path
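
The function returns the path to a per-lineage FASTA cache and downloads only the accessions not already present in it. A sketch with a made-up taxid and cache directory:

fasta_path = dnld_refseqs_for_taxid(
    taxid=3693,                     # hypothetical NCBI taxid
    filter_term='mitochondrion',
    taxonomy=taxonomy,              # assumed taxonomy object
    dir_cache_refseqs='/opt/cache/refseqs')
# Repeated calls for the same lineage reuse the cached sequences.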
Example #4
def user_protein_accessions(ss, prot_acc_user, dir_cache_prj, taxonomy):
    if len(prot_acc_user) > 0:
        Log.inf('Reading user provided protein accessions:', ss)
        print()
        pickle_file = opj(dir_cache_prj, 'ncbi_prot_metadata_cache__' + ss)
        acc_old = set()
        if ope(pickle_file):
            with open(pickle_file, 'rb') as f:
                pickled = pickle.load(f)
                acc_old = set([x['accessionversion'] for x in pickled])

        if acc_old == set(prot_acc_user):
            pa_info = pickled
        else:
            pa_info = summary_eutil('protein', prot_acc_user)

        prot_acc = []
        prot_info_to_print = []
        max_acc_len = 0
        for pa in pa_info:
            acc = pa['accessionversion']
            prot_acc.append(acc)
            title = pa['title']
            title_split = title.split('[')
            taxid = pa['taxid']
            if 'organism' in pa:
                organism = pa['organism']
            else:
                organism = taxonomy.scientific_name_for_taxid(taxid)
                pa['organism'] = organism
            # title = title_split[0]
            # title = title.lower().strip()
            # title = title.replace('_', ' ').replace('-', ' ')
            # title = title.replace(',', '')
            # title = title[0].upper() + title[1:] + ' [' + organism + ']'
            max_acc_len = max(max_acc_len, len(acc))
            prot_info_to_print.append((title, acc))

        prot_info_to_print = sorted(prot_info_to_print)
        for pi in prot_info_to_print:
            title = pi[0]
            acc = pi[1]
            if len(title) > 80:
                title = title[:77] + '...'
            Log.msg(acc.rjust(max_acc_len) + ':', title, False)

        with open(pickle_file, 'wb') as f:
            pickle.dump(pa_info, f, protocol=PICKLE_PROTOCOL)

        return prot_acc

    else:
        return prot_acc_user
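
The pickle file acts as an NCBI metadata cache keyed by the search-strategy name ss: when the cached accession set matches the user's list exactly, the summary_eutil request is skipped. A hypothetical call:

prot_acc = user_protein_accessions(
    ss='strategy_a',
    prot_acc_user=['NP_000509.1', 'NP_001003.1'],  # made-up accessions
    dir_cache_prj='/opt/cache/prj',
    taxonomy=taxonomy)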
Example #5
def user_aa_fasta(ss, user_queries, aa_prot_user_file):
    _ = ''
    if len(user_queries) > 0:
        print()
        Log.inf('Reading user provided AA sequences:', ss)
        for ap in user_queries:
            Log.msg(ap)
            with open(ap, 'r') as f:
                _ = _ + f.read()
    if _ != '':
        with open(aa_prot_user_file, 'w') as f:
            write_fasta(standardize_fasta_text(_, SEQ_TYPE_AA), f)
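
A usage sketch; the output file is written only if at least one query file had content (all file names are assumptions):

user_aa_fasta('strategy_a',
              ['queries_1.faa', 'queries_2.faa'],
              'aa_prot_user.fasta')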
Example #6
def dep_check_sra_toolkit(dir_dep, os_id, dist_id, debian_dists, redhat_dists,
                          force):
    if os_id == 'mac':
        url = ('https://ftp-trace.ncbi.nlm.nih.gov/sra/sdk/2.10.8/'
               'sratoolkit.2.10.8-mac64.tar.gz')
    elif os_id == 'linux':
        if dist_id in debian_dists:
            url = ('https://ftp-trace.ncbi.nlm.nih.gov/sra/sdk/2.10.8/'
                   'sratoolkit.2.10.8-ubuntu64.tar.gz')
        elif dist_id in redhat_dists:
            url = ('https://ftp-trace.ncbi.nlm.nih.gov/sra/sdk/2.10.8/'
                   'sratoolkit.2.10.8-centos_linux64.tar.gz')

    dnld_path = opj(dir_dep, 'sra-toolkit.tar.gz')

    fasterq_dump = None
    try:
        if force is True:
            # A bare 'raise' with no active exception raises RuntimeError,
            # deliberately forcing the fallback/download path below.
            raise
        fasterq_dump = which('fasterq-dump')
        # Note: strip('bin') strips the *characters* b, i, n from both ends
        # of the path, not the literal 'bin' suffix.
        dir_bin = dirname(fasterq_dump).strip('bin')
        _ensure_vdb_cfg(dir_bin)
        run(fasterq_dump)
    except Exception:
        try:
            dir_bin = opj(dir_dep, get_dep_dir(dir_dep, 'sratoolkit'))
            _ensure_vdb_cfg(dir_bin)
            fasterq_dump = opj(dir_bin, 'bin', 'fasterq-dump')
            run(fasterq_dump)
        except Exception:
            Log.wrn('SRA Toolkit was not found on this system, trying to '
                    'download.')
            download_file(url, dnld_path)
            tar_ref = tarfile.open(dnld_path, 'r:gz')
            tar_ref.extractall(dir_dep)
            tar_ref.close()

            dir_bin = opj(dir_dep, get_dep_dir(dir_dep, 'sratoolkit'))
            fasterq_dump = opj(dir_bin, 'bin', 'fasterq-dump')

            _ensure_vdb_cfg(dir_bin)

            if not ope(fasterq_dump):
                Log.err('Could not download SRA Toolkit.')
                return None

    v = get_dep_version([fasterq_dump, '--version'], r':\s([\d\.]*)')
    if v == '?':
        v = get_dep_version([fasterq_dump, '--version'], r'version\s([\d\.]*)')
    Log.msg('fasterq-dump is available:', v + ' ' + fasterq_dump)

    return fasterq_dump
Example #7
def dep_check_kakapolib(force=False, quiet=False):
    kkpl = KAKAPOLIB
    if not ope(kkpl):
        if quiet is False:
            Log.wrn('Compiling kakapolib.')
        run(['make', 'install'], cwd=DIR_C_SRC)
    if ope(kkpl):
        if quiet is False:
            Log.msg('kakapolib is available:', kkpl)
    else:
        Log.err('Compilation of kakapolib failed.')
        return None
    return ctypes.CDLL(kkpl)
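
Because the function returns a ctypes.CDLL handle, C functions from the shared library can be called once their signatures are declared. A sketch with a made-up symbol name:

lib = dep_check_kakapolib()
if lib is not None:
    # 'fq_avg_len' is a hypothetical exported function, shown only to
    # illustrate the usual ctypes workflow.
    lib.fq_avg_len.restype = ctypes.c_double
    avg = lib.fq_avg_len(b'/path/to/reads.fastq')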
Example #8
def dep_check_seqtk(dir_dep, force):
    url = 'https://github.com/lh3/seqtk/archive/master.zip'
    dnld_path = opj(dir_dep, 'seqtk.zip')
    dir_bin = opj(dir_dep, 'seqtk-master')

    # Write a tiny FASTA record to serve as a smoke test for seqtk;
    # 'lns' is the module's line-separator constant.
    fp = NamedTemporaryFile()
    fp.write(str.encode('>seq' + lns + 'ATGC'))
    fp.seek(0)
    cmd = ['', 'seq', '-r', fp.name]

    try:
        if force is True:
            # A bare 'raise' with no active exception raises RuntimeError,
            # deliberately forcing the fallback/download path below.
            raise
        seqtk = which('seqtk')
        cmd[0] = seqtk
        run(cmd, do_not_raise=True)
    except Exception:
        try:
            seqtk = opj(dir_bin, 'seqtk')
            cmd[0] = seqtk
            run(cmd, do_not_raise=True)
        except Exception:
            Log.wrn('Seqtk was not found on this system, trying to download.')
            download_file(url, dnld_path)
            zip_ref = zipfile.ZipFile(dnld_path, 'r')
            zip_ref.extractall(dir_dep)
            zip_ref.close()
            try:
                Log.wrn('Compiling Seqtk.')
                run('make', cwd=dir_bin)
                run(cmd, do_not_raise=True)
            except Exception:
                replace_line_in_file(opj(dir_bin, 'Makefile'), 'CC=gcc',
                                     'CC=cc')
                try:
                    run('make', cwd=dir_bin)
                    run(cmd, do_not_raise=True)
                except Exception:
                    Log.err(
                        'Something went wrong while trying to compile Seqtk.')
                    Log.msg('Try downloading and installing it manually from: '
                            'https://github.com/lh3/seqtk')
                    fp.close()
                    return None

    fp.close()

    v = get_dep_version([seqtk], r'Version\:\s([\d\w\.\-]*)')
    Log.msg('Seqtk is available:', v + ' ' + seqtk)

    return seqtk
Example #9
def dep_check_vsearch(dir_dep, os_id, dist_id, debian_dists, redhat_dists,
                      force):
    if os_id == 'mac':
        url = ('https://github.com/torognes/vsearch/releases/download/v2.15.0/'
               'vsearch-2.15.0-macos-x86_64.tar.gz')
    elif os_id == 'linux':
        if dist_id in debian_dists:
            url = ('https://github.com/torognes/vsearch/releases/download/'
                   'v2.15.0/vsearch-2.15.0-linux-x86_64.tar.gz')
        elif dist_id in redhat_dists:
            url = ('https://github.com/torognes/vsearch/releases/download/'
                   'v2.15.0/vsearch-2.15.0-linux-x86_64.tar.gz')

    dnld_path = opj(dir_dep, 'vsearch.tar.gz')

    try:
        if force is True:
            # A bare 'raise' with no active exception raises RuntimeError,
            # deliberately forcing the fallback/download path below.
            raise
        vsearch = which('vsearch')
        run(vsearch)
    except Exception:
        try:
            dir_bin = opj(dir_dep, get_dep_dir(dir_dep, 'vsearch'))
            vsearch = opj(dir_bin, 'bin', 'vsearch')
            run(vsearch)
        except Exception:
            Log.wrn(
                'Vsearch was not found on this system, trying to download.')
            download_file(url, dnld_path)
            tar_ref = tarfile.open(dnld_path, 'r:gz')
            tar_ref.extractall(dir_dep)
            tar_ref.close()
            try:
                dir_bin = opj(dir_dep, get_dep_dir(dir_dep, 'vsearch'))
                vsearch = opj(dir_bin, 'bin', 'vsearch')
                if not ope(vsearch):
                    Log.err('Could not download Vsearch.')
                    return None
                else:
                    run(vsearch)
            except Exception:
                Log.err('Vsearch was downloaded, but does not execute.')
                Log.msg('Try downloading and installing it manually from: '
                        'https://github.com/torognes/vsearch')
                return None

    v = get_dep_version([vsearch, '-version'], r'vsearch\sv([\d\.]*)')
    Log.msg('Vsearch is available:', v + ' ' + vsearch)

    return vsearch
Example #10
def user_fastq_files(fq_se, fq_pe):
    if len(fq_se) > 0 or len(fq_pe) > 0:
        print()
        Log.inf('Preparing user provided FASTQ files.')

    se_fastq_files = {}
    pe_fastq_files = {}

    fq_type_1_regex = r'(.*)_L\d\d\d(_R.)_\d\d\d(.*)'

    for se in fq_se:
        tax_id = se[0]
        path = se[1]
        base = basename(path)
        if plain_or_gzip(base)[4] != '':
            base = splitext(base)[0]
        base = splitext(base)[0]
        fq_type_1_match = re.findall(fq_type_1_regex, base)
        if len(fq_type_1_match) > 0 and len(fq_type_1_match[0]) == 3:
            base = fq_type_1_match[0][0]
        sample_base_name = base
        se_fastq_files[sample_base_name] = {'path': path}
        se_fastq_files[sample_base_name]['src'] = 'usr'
        se_fastq_files[sample_base_name]['avg_len'] = None
        se_fastq_files[sample_base_name]['tax_id'] = tax_id
        Log.msg(sample_base_name + ':', basename(path))

    for pe in fq_pe:
        tax_id = pe[0]
        path = pe[1]
        base = basename(path[0])
        if plain_or_gzip(base)[4] != '':
            base = splitext(base)[0]
        base = splitext(base)[0]
        fq_type_1_match = re.findall(fq_type_1_regex, base)
        if len(fq_type_1_match) > 0 and len(fq_type_1_match[0]) == 3:
            base = fq_type_1_match[0][0]
        else:
            base = basename(commonprefix(path)).rstrip('_- R')
        sample_base_name = base
        pe_fastq_files[sample_base_name] = {'path': path}
        pe_fastq_files[sample_base_name]['src'] = 'usr'
        pe_fastq_files[sample_base_name]['avg_len'] = None
        pe_fastq_files[sample_base_name]['tax_id'] = tax_id
        Log.msg(
            sample_base_name + ':',
            basename(path[0]) + '\n' + ' ' * (len(sample_base_name) + 2) +
            basename(path[1]))

    return se_fastq_files, pe_fastq_files
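
fq_type_1_regex strips Illumina-style lane/read/segment suffixes so that both mates of a pair collapse to a single sample base name. For example:

import re
base = 'SampleX_S1_L001_R1_001'
m = re.findall(r'(.*)_L\d\d\d(_R.)_\d\d\d(.*)', base)
# m == [('SampleX_S1', '_R1', '')] -> sample base name 'SampleX_S1'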
Example #11
def dep_check_spades(dir_dep, os_id, force):
    if os_id == 'mac':
        url = ('http://cab.spbu.ru/files/release3.14.1/'
               'SPAdes-3.14.1-Darwin.tar.gz')
    elif os_id == 'linux':
        url = ('http://cab.spbu.ru/files/release3.14.1/'
               'SPAdes-3.14.1-Linux.tar.gz')

    dnld_path = opj(dir_dep, 'SPAdes.tar.gz')

    try:
        if force is True:
            # A bare 'raise' with no active exception raises RuntimeError,
            # deliberately forcing the fallback/download path below.
            raise
        spades = which('spades.py')
        run([PY3, spades])
    except Exception:
        try:
            dir_bin = opj(dir_dep, get_dep_dir(dir_dep, 'SPAdes'))
            spades = opj(dir_bin, 'bin', 'spades.py')
            run([PY3, spades])
        except Exception:
            Log.wrn('SPAdes was not found on this system, trying to download.')
            try:
                download_file(url, dnld_path)
                tar_ref = tarfile.open(dnld_path, 'r:gz')
                tar_ref.extractall(dir_dep)
                tar_ref.close()
            except Exception:
                Log.err('Could not download SPAdes.')
                return None
            try:
                dir_bin = opj(dir_dep, get_dep_dir(dir_dep, 'SPAdes'))
                spades = opj(dir_bin, 'bin', 'spades.py')
                # replace_line_in_file(spades,
                #                      '#!/usr/bin/env python',
                #                      '#!/usr/bin/env python3')
                if ope(spades):
                    run([PY3, spades])
                else:
                    Log.err('Could not download SPAdes.')
                    return None
            except Exception:
                Log.err('SPAdes was downloaded, but does not execute.')
                return None

    v = get_dep_version([PY3, spades, '--version'], r'^.*SPAdes.*v([\d\.]*)')
    Log.msg('SPAdes is available:', v + ' ' + spades)

    return spades
Example #12
def _write_trimmomatic_adapters_file(dir_dep):
    path_adapters = opj(dir_dep, 'trimmomatic_adapters.fasta')

    adapters = ('>TruSeq2_SE\n'
                'AGATCGGAAGAGCTCGTATGCCGTCTTCTGCTTG\n'
                '>TruSeq2_PE_f\n'
                'AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT\n'
                '>TruSeq2_PE_r\n'
                'AGATCGGAAGAGCGGTTCAGCAGGAATGCCGAG\n'
                '>TruSeq3_IndexedAdapter\n'
                'AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC\n'
                '>TruSeq3_UniversalAdapter\n'
                'AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTA\n'
                '>PrefixPE/1\n'
                'AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT\n'
                '>PrefixPE/2\n'
                'CAAGCAGAAGACGGCATACGAGATCGGTCTCGGCATTCCTGCTGAACCGCTCTTCCGATCT\n'
                '>PCR_Primer1\n'
                'AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT\n'
                '>PCR_Primer1_rc\n'
                'AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATT\n'
                '>PCR_Primer2\n'
                'CAAGCAGAAGACGGCATACGAGATCGGTCTCGGCATTCCTGCTGAACCGCTCTTCCGATCT\n'
                '>PCR_Primer2_rc\n'
                'AGATCGGAAGAGCGGTTCAGCAGGAATGCCGAGACCGATCTCGTATGCCGTCTTCTGCTTG\n'
                '>FlowCell1\n'
                'TTTTTTTTTTAATGATACGGCGACCACCGAGATCTACAC\n'
                '>FlowCell2\n'
                'TTTTTTTTTTCAAGCAGAAGACGGCATACGA\n'
                '>PrefixPE/1\n'
                'TACACTCTTTCCCTACACGACGCTCTTCCGATCT\n'
                '>PrefixPE/2\n'
                'GTGACTGGAGTTCAGACGTGTGCTCTTCCGATCT\n'
                '>PE1\n'
                'TACACTCTTTCCCTACACGACGCTCTTCCGATCT\n'
                '>PE1_rc\n'
                'AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTA\n'
                '>PE2\n'
                'GTGACTGGAGTTCAGACGTGTGCTCTTCCGATCT\n'
                '>PE2_rc\n'
                'AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC\n')

    if not ope(path_adapters):
        Log.msg('Writing Trimmomatic adapters file: ' + path_adapters)
        with open(path_adapters, mode='w') as f:
            f.write(adapters)

    return path_adapters
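
With every entry newline-terminated, the file is valid multi-FASTA that Trimmomatic's ILLUMINACLIP step accepts directly:

path_adapters = _write_trimmomatic_adapters_file('/opt/deps')  # assumed dir
# Passed to Trimmomatic as, e.g.: ILLUMINACLIP:<path_adapters>:2:30:10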
Example #13
def dep_check_blast(dir_dep, os_id, dist_id, debian_dists, redhat_dists,
                    force):
    if os_id == 'mac':
        url = ('https://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/2.10.1/'
               'ncbi-blast-2.10.1+-x64-macosx.tar.gz')
    elif os_id == 'linux':
        if dist_id in debian_dists:
            url = ('https://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/'
                   '2.10.1/ncbi-blast-2.10.1+-x64-linux.tar.gz')
        elif dist_id in redhat_dists:
            url = ('https://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/'
                   '2.10.1/ncbi-blast-2.10.1+-x64-linux.tar.gz')

    dnld_path = opj(dir_dep, 'ncbi-blast.tar.gz')

    makeblastdb = None
    blastn = None
    tblastn = None

    try:
        if force is True:
            # A bare 'raise' with no active exception raises RuntimeError,
            # deliberately forcing the fallback/download path below.
            raise
        makeblastdb = which('makeblastdb')
        blastn = which('blastn')
        tblastn = which('tblastn')
        run([makeblastdb, '-help'])
    except Exception:
        try:
            dir_bin = opj(dir_dep, get_dep_dir(dir_dep, 'ncbi-blast'))
            makeblastdb = opj(dir_bin, 'bin', 'makeblastdb')
            blastn = opj(dir_bin, 'bin', 'blastn')
            tblastn = opj(dir_bin, 'bin', 'tblastn')
            run([makeblastdb, '-help'])
        except Exception:
            Log.wrn('BLAST+ was not found on this system, trying to download.')
            download_file(url, dnld_path)
            tar_ref = tarfile.open(dnld_path, 'r:gz')
            tar_ref.extractall(dir_dep)
            tar_ref.close()

            dir_bin = opj(dir_dep, get_dep_dir(dir_dep, 'ncbi-blast'))
            makeblastdb = opj(dir_bin, 'bin', 'makeblastdb')
            blastn = opj(dir_bin, 'bin', 'blastn')
            tblastn = opj(dir_bin, 'bin', 'tblastn')

            if not ope(makeblastdb) or \
                    not ope(blastn) or \
                    not ope(tblastn):
                Log.err('Could not download BLAST+.')
                return None, None, None

    regexp = r'\sblast\s([\d\.]*)'
    v = get_dep_version([makeblastdb, '-version'], regexp)
    Log.msg('makeblastdb is available:', v + ' ' + makeblastdb)
    v = get_dep_version([blastn, '-version'], regexp)
    Log.msg('blastn is available:', v + ' ' + blastn)
    v = get_dep_version([tblastn, '-version'], regexp)
    Log.msg('tblastn is available:', v + ' ' + tblastn)

    return makeblastdb, blastn, tblastn
Example #14
def dep_check_trimmomatic(dir_dep):
    url = ('http://www.usadellab.org/cms/uploads/supplementary/Trimmomatic/'
           'Trimmomatic-0.39.zip')
    dnld_path = opj(dir_dep, 'Trimmomatic-0.39.zip')
    dir_bin = opj(dir_dep, 'Trimmomatic-0.39')
    trimmomatic = opj(dir_bin, 'trimmomatic-0.39.jar')

    if not ope(trimmomatic):
        download_file(url, dnld_path)
        zip_ref = zipfile.ZipFile(dnld_path, 'r')
        zip_ref.extractall(dir_dep)
        zip_ref.close()

    if not ope(trimmomatic):
        Log.err('Could not download Trimmomatic.')
        return None, None

    v = get_dep_version(['java', '-jar', trimmomatic, '-version'], r'\d+\.\d+')
    Log.msg('Trimmomatic is available:', v + ' ' + trimmomatic)

    path_adapters = _write_trimmomatic_adapters_file(dir_dep)

    return trimmomatic, path_adapters
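
Unlike the compiled tools above, Trimmomatic ships as a jar, so the check reduces to a file-existence test plus a 'java -jar ... -version' probe. A usage sketch (the directory is an assumption):

trimmomatic, adapters = dep_check_trimmomatic('/opt/deps')
if trimmomatic is None:
    raise SystemExit('Trimmomatic could not be provisioned.')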
Example #15
def pfam_uniprot_accessions(ss, pfam_acc, tax_ids, dir_cache_pfam_acc):
    if len(pfam_acc) > 0:
        Log.inf('Downloading UniProt accessions for Pfam accessions:', ss)
    pfam_seqs_list = []
    for pa in pfam_acc:
        pfam_id = pfam_entry(pa)[0]['id']
        Log.msg(pa + ':', pfam_id)
        _ = opj(dir_cache_pfam_acc, pa + '__' + ss)
        if ope(_):
            with open(_, 'rb') as f:
                acc = pickle.load(f)
            pfam_seqs_list = pfam_seqs_list + acc
        else:
            # Note: the results may include "obsolete" accessions. This is
            # not a problem; they simply will not appear among the sequences
            # downloaded from UniProt.
            acc = pfam_seqs(query=pa)
            pfam_seqs_list = pfam_seqs_list + acc
            with open(_, 'wb') as f:
                pickle.dump(acc, f, protocol=PICKLE_PROTOCOL)

    pfam_uniprot_acc = prot_ids_for_tax_ids(pfam_seqs_list, tax_ids)
    return pfam_uniprot_acc
Example #16
def makeblastdb_assemblies(assemblies, dir_prj_blast_assmbl, makeblastdb):
    if len(assemblies) > 0:
        print()
        Log.inf('Building BLAST databases for assemblies.')
        if makeblastdb is None:
            Log.err('makeblastdb is not available. Cannot continue. Exiting.')
            exit(0)
    for a in assemblies:
        assmbl_name = a['name']

        assmbl_blast_db_dir = opj(dir_prj_blast_assmbl, assmbl_name)
        assmbl_blast_db_file = opj(assmbl_blast_db_dir, assmbl_name)

        a['blast_db_path'] = assmbl_blast_db_file

        if ope(assmbl_blast_db_dir):
            Log.msg('BLAST database already exists:', assmbl_name)
        else:
            Log.msg(assmbl_name)
            make_dirs(assmbl_blast_db_dir)
            make_blast_db(exec_file=makeblastdb,
                          in_file=a['path'],
                          out_file=assmbl_blast_db_file,
                          title=assmbl_name)
Example #17
def makeblastdb_fq(se_fastq_files, pe_fastq_files, dir_blast_fa_trim,
                   makeblastdb, fpatt):
    if len(se_fastq_files) > 0 or len(pe_fastq_files) > 0:
        print()
        Log.inf('Building BLAST databases for reads.')
        if makeblastdb is None:
            Log.err('makeblastdb is not available. Cannot continue. Exiting.')
            exit(0)
    for se in se_fastq_files:
        dir_blast_fa_trim_sample = opj(dir_blast_fa_trim, se)
        fa_path = se_fastq_files[se]['filter_path_fa']
        out_f = opj(dir_blast_fa_trim_sample, se)
        se_fastq_files[se]['blast_db_path'] = out_f

        if ope(dir_blast_fa_trim_sample):
            Log.msg('BLAST database already exists:', se)
        else:
            make_dirs(dir_blast_fa_trim_sample)
            Log.msg(basename(fa_path))
            make_blast_db(exec_file=makeblastdb,
                          in_file=fa_path,
                          out_file=out_f,
                          title=se,
                          dbtype='nucl')

    for pe in pe_fastq_files:
        dir_blast_fa_trim_sample = opj(dir_blast_fa_trim, pe)
        fa_paths = pe_fastq_files[pe]['filter_path_fa']
        out_fs = [x.replace('@D@', dir_blast_fa_trim_sample) for x in fpatt]
        out_fs = [x.replace('@N@', pe) for x in out_fs]
        pe_fastq_files[pe]['blast_db_path'] = out_fs

        if ope(dir_blast_fa_trim_sample):
            Log.msg('BLAST database already exists:', pe)
        else:
            make_dirs(dir_blast_fa_trim_sample)
            pe_trim_files = zip(fa_paths, out_fs)
            for x in pe_trim_files:
                Log.msg(basename(x[0]))
                make_blast_db(exec_file=makeblastdb,
                              in_file=x[0],
                              out_file=x[1],
                              title=basename(x[1]),
                              dbtype='nucl')
Example #18
def filtered_fq_to_fa(se_fastq_files, pe_fastq_files, dir_fa_trim_data, seqtk,
                      fpatt):
    if len(se_fastq_files) > 0 or len(pe_fastq_files) > 0:
        print()
        Log.inf('Converting FASTQ to FASTA using Seqtk.')
        if seqtk is None:
            Log.err('seqtk is not available. Cannot continue. Exiting.')
            exit(0)
    for se in se_fastq_files:
        dir_fa_trim_data_sample = opj(dir_fa_trim_data, se)
        fq_path = se_fastq_files[se]['filter_path_fq']
        out_f = opj(dir_fa_trim_data_sample, se + '.fasta')
        se_fastq_files[se]['filter_path_fa'] = out_f

        if ope(dir_fa_trim_data_sample):
            Log.msg('Filtered FASTA files already exist:', se)
        else:
            make_dirs(dir_fa_trim_data_sample)
            Log.msg(basename(fq_path))
            seqtk_fq_to_fa(seqtk, fq_path, out_f)

    for pe in pe_fastq_files:
        dir_fa_trim_data_sample = opj(dir_fa_trim_data, pe)
        fq_paths = pe_fastq_files[pe]['filter_path_fq']
        out_fs = [x.replace('@D@', dir_fa_trim_data_sample) for x in fpatt]
        out_fs = [x.replace('@N@', pe) for x in out_fs]
        pe_fastq_files[pe]['filter_path_fa'] = out_fs

        if ope(dir_fa_trim_data_sample):
            Log.msg('Filtered FASTA files already exist:', pe)
        else:
            make_dirs(dir_fa_trim_data_sample)
            pe_trim_files = zip(fq_paths, out_fs)
            for x in pe_trim_files:
                Log.msg(basename(x[0]))
                seqtk_fq_to_fa(seqtk, x[0], x[1])
Example #19
def filter_queries(ss,
                   aa_queries_file,
                   min_query_length,
                   max_query_length,
                   max_query_identity,
                   vsearch,
                   prot_acc_user,
                   overwrite,
                   logging=True):

    if logging is True:
        print()
        Log.inf('Filtering AA query sequences:', ss)
        Log.msg('min_query_length:', str(min_query_length))
        Log.msg('max_query_length:', str(max_query_length))
        Log.msg('max_query_identity:', str(max_query_identity))

    parsed_fasta_1 = filter_fasta_by_length(aa_queries_file, SEQ_TYPE_AA,
                                            min_query_length, max_query_length)
    tmp1 = aa_queries_file + '_temp1'
    tmp2 = aa_queries_file + '_temp2'
    for rec in parsed_fasta_1:
        rec.seq.gc_code = 1
        rec.seq = rec.seq.untranslate()
    write_fasta(parsed_fasta_1, tmp1)
    run_cluster_fast(vsearch, max_query_identity, tmp1, tmp2)
    parsed_fasta_2 = read_fasta(tmp2, SEQ_TYPE_DNA, parse_def=True)
    prot_acc_user_new = list()
    for rec in parsed_fasta_2:
        rec.seq.gc_code = 1
        rec.seq = rec.seq.translate()
        acc = rec.accession_version
        if acc in prot_acc_user:
            prot_acc_user_new.append(acc)

    if overwrite is True:
        write_fasta(parsed_fasta_2, aa_queries_file, prepend_acc=True)

    osremove(tmp1)
    osremove(tmp2)

    return prot_acc_user_new
Example #20
def dnld_sra_fastq_files(sras, sra_runs_info, dir_fq_data, fasterq_dump,
                         threads, dir_temp):

    if len(sras) > 0:
        if fasterq_dump is None:
            Log.err('fasterq-dump from SRA Toolkit is not available. ' +
                    'Cannot continue. Exiting.')
            exit(0)

        print()
        Log.inf('Downloading SRA read data.')

    se_fastq_files = {}
    pe_fastq_files = {}

    for sra in sras:
        sra_run_info = sra_runs_info[sra]
        sra_lib_layout = sra_run_info['LibraryLayout'].lower()
        sra_lib_layout_k = sra_run_info['KakapoLibraryLayout'].lower()
        sample_base_name = sra_run_info['KakapoSampleBaseName']
        sra_taxid = int(sra_run_info['TaxID'])
        avg_len = int(sra_run_info['avgLength'])

        sra_dnld_needed = False

        if sra_lib_layout == 'single' or sra_lib_layout_k == 'single':
            se_file = opj(dir_fq_data, sra + '.fastq')
            se_fastq_files[sample_base_name] = {'path': se_file}
            se_fastq_files[sample_base_name]['src'] = 'sra'
            se_fastq_files[sample_base_name]['avg_len'] = avg_len
            se_fastq_files[sample_base_name]['tax_id'] = sra_taxid
            if not ope(se_file):
                sra_dnld_needed = True

        elif sra_lib_layout == 'paired':
            pe_file_1 = opj(dir_fq_data, sra + '_1.fastq')
            pe_file_2 = opj(dir_fq_data, sra + '_2.fastq')
            pe_file_1_renamed = opj(dir_fq_data, sra + '_R1.fastq')
            pe_file_2_renamed = opj(dir_fq_data, sra + '_R2.fastq')
            pe_fastq_files[sample_base_name] = {
                'path': [pe_file_1_renamed, pe_file_2_renamed]
            }
            pe_fastq_files[sample_base_name]['src'] = 'sra'
            pe_fastq_files[sample_base_name]['avg_len'] = avg_len // 2
            pe_fastq_files[sample_base_name]['tax_id'] = sra_taxid
            if sra_lib_layout_k == 'paired_unp':
                pe_file_3 = opj(dir_fq_data, sra + '.fastq')
                pe_file_3_renamed = opj(dir_fq_data, sra + '_R3.fastq')
                pe_fastq_files[sample_base_name]['path'].append(
                    pe_file_3_renamed)
            if not ope(pe_file_1_renamed) or not ope(pe_file_2_renamed):
                sra_dnld_needed = True

        if not sra_dnld_needed:
            Log.msg('FASTQ reads are available locally:', sample_base_name)

        retry_count = 0
        while sra_dnld_needed:

            if retry_count > 50:
                Log.err('Download failed. Exiting.')
                rmtree(dir_temp)
                exit(1)

            elif retry_count > 0:
                Log.wrn('Download failed. Retrying.')
                sleep(2)

            retry_count += 1

            Log.msg('Downloading FASTQ reads for:', sample_base_name)

            cmd = [
                fasterq_dump, '--threads',
                str(threads * 2), '--split-3', '--bufsize', '819200',
                '--outdir', dir_fq_data, '--temp', dir_temp, sra
            ]

            run(cmd, do_not_raise=True)

            if sra_lib_layout == 'single' or sra_lib_layout_k == 'single':
                if not ope(se_file):
                    continue

            elif sra_lib_layout == 'paired':

                if not ope(pe_file_1) or not ope(pe_file_2):
                    continue
                else:
                    move(pe_file_1, pe_file_1_renamed)
                    move(pe_file_2, pe_file_2_renamed)

                if sra_lib_layout_k == 'paired_unp':
                    if not ope(pe_file_3):
                        continue
                    else:
                        move(pe_file_3, pe_file_3_renamed)

            sra_dnld_needed = False

            if sra_lib_layout == 'single' or sra_lib_layout_k == 'single':
                if ope(se_file):
                    Log.msg('Renaming FASTQ reads in:', se_file)
                    rename_fq_seqs(se_file, sra, '1:N:0')

            elif sra_lib_layout == 'paired':
                if ope(pe_file_1_renamed):
                    Log.msg('Renaming FASTQ reads in:', pe_file_1_renamed)
                    rename_fq_seqs(pe_file_1_renamed, sra, '1:N:0')
                if ope(pe_file_2_renamed):
                    Log.msg('Renaming FASTQ reads in:', pe_file_2_renamed)
                    rename_fq_seqs(pe_file_2_renamed, sra, '2:N:0')
                if sra_lib_layout_k == 'paired_unp':
                    if ope(pe_file_3_renamed):
                        Log.msg('Renaming FASTQ reads in:', pe_file_3_renamed)
                        rename_fq_seqs(pe_file_3_renamed, sra + '_unpaired',
                                       '1:N:0')

    return se_fastq_files, pe_fastq_files, sra_runs_info
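
Both returned dictionaries map a sample base name to a record with the keys the downstream steps expect. Roughly, with illustrative values:

se_fastq_files = {
    'SRR0000001': {
        'path': '/data/fq/SRR0000001.fastq',  # made-up path
        'src': 'sra',
        'avg_len': 150,
        'tax_id': 3693,
    },
}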
Example #21
def run_trimmomatic(se_fastq_files, pe_fastq_files, dir_fq_trim_data,
                    trimmomatic, adapters, fpatt, threads):
    if len(se_fastq_files) > 0 or len(pe_fastq_files) > 0:
        print()
        Log.inf('Running Trimmomatic.')
        if trimmomatic is None:
            Log.err('trimmomatic is not available. Cannot continue. Exiting.')
            exit(0)
    for se in se_fastq_files:
        dir_fq_trim_data_sample = opj(dir_fq_trim_data, se)
        fq_path = se_fastq_files[se]['cor_path_fq']
        r_mode, w_mode, a_mode, fqopen, ext = plain_or_gzip(fq_path)
        min_acc_len = se_fastq_files[se]['min_acc_len']
        stats_f = opj(dir_fq_trim_data_sample, se + '.txt')
        out_f = opj(dir_fq_trim_data_sample, se + '.fastq' + ext)
        se_fastq_files[se]['trim_path_fq'] = out_f

        if ope(dir_fq_trim_data_sample):
            Log.msg('Trimmed FASTQ file already exists:', se)
        else:
            make_dirs(dir_fq_trim_data_sample)
            Log.msg('SE mode:', se)
            trimmomatic_se(trimmomatic=trimmomatic,
                           adapters=adapters,
                           in_file=fq_path,
                           out_file=out_f,
                           stats_file=stats_f,
                           threads=threads,
                           minlen=min_acc_len)

    for pe in pe_fastq_files:
        dir_fq_trim_data_sample = opj(dir_fq_trim_data, pe)
        fq_path_1 = pe_fastq_files[pe]['cor_path_fq'][0]
        fq_path_2 = pe_fastq_files[pe]['cor_path_fq'][1]
        fq_path_3 = None
        r_mode, w_mode, a_mode, fqopen, ext = plain_or_gzip(fq_path_1)
        if len(pe_fastq_files[pe]['cor_path_fq']) == 3:
            fq_path_3 = pe_fastq_files[pe]['cor_path_fq'][2]
        min_acc_len = pe_fastq_files[pe]['min_acc_len']
        stats_f = opj(dir_fq_trim_data_sample, pe + '.txt')
        out_fs = [x.replace('@D@', dir_fq_trim_data_sample) for x in fpatt]
        out_fs = [x.replace('@N@', pe) for x in out_fs]
        out_fs = [x + ext for x in out_fs]
        pe_fastq_files[pe]['trim_path_fq'] = out_fs

        if ope(dir_fq_trim_data_sample):
            Log.msg('Trimmed FASTQ files already exist:', pe)
        else:
            make_dirs(dir_fq_trim_data_sample)
            Log.msg('PE mode:', pe)
            trimmomatic_pe(trimmomatic=trimmomatic,
                           adapters=adapters,
                           in_file_1=fq_path_1,
                           in_file_2=fq_path_2,
                           out_file_paired_1=out_fs[0],
                           out_file_paired_2=out_fs[1],
                           out_file_unpaired_1=out_fs[2],
                           out_file_unpaired_2=out_fs[3],
                           stats_file=stats_f,
                           threads=threads,
                           minlen=min_acc_len)

            if fq_path_3 is not None:

                out_f = opj(dir_fq_trim_data_sample, 'unpaired.fastq' + ext)
                stats_f = opj(dir_fq_trim_data_sample, pe + '_unpaired.txt')

                Log.msg(
                    'SE mode (Paired-read SRA run contains unpaired reads):',
                    pe)

                trimmomatic_se(trimmomatic=trimmomatic,
                               adapters=adapters,
                               in_file=fq_path_3,
                               out_file=out_f,
                               stats_file=stats_f,
                               threads=threads,
                               minlen=min_acc_len)

                _ = opj(dir_fq_trim_data_sample, 'temp.fastq' + ext)
                f_temp = fqopen(_, w_mode)
                with fileinput.FileInput(
                        files=[out_fs[2], out_f],
                        openhook=fileinput.hook_compressed) as f:
                    for line in f:
                        f_temp.write(line)
                f_temp.close()

                remove(out_fs[2])
                remove(out_f)
                copyfile(_, out_fs[2])
                remove(_)
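
The unpaired-read merge above leans on fileinput.hook_compressed, which opens plain and gzip members transparently. The same idea in isolation (file names are made up; mode='rb' keeps every input yielding bytes, so the gzip output is opened in binary mode too):

import fileinput
import gzip

with gzip.open('merged.fastq.gz', 'wb') as out:
    with fileinput.FileInput(files=['a.fastq.gz', 'b.fastq'],
                             mode='rb',
                             openhook=fileinput.hook_compressed) as f:
        for line in f:
            out.write(line)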
Example #22
def run_spades(se_fastq_files, pe_fastq_files, dir_spades_assemblies,
               spades, dir_temp, ss, threads, ram):

    if len(se_fastq_files) > 0 or len(pe_fastq_files) > 0:
        if spades is None:
            Log.err('SPAdes is not available. Cannot continue. Exiting.')
            exit(0)

    for se in se_fastq_files:
        dir_results = opj(dir_spades_assemblies, se + '__' + ss)
        fq_path = se_fastq_files[se]['vsearch_results_path' + '__' + ss]
        se_fastq_files[se]['spades_assembly' + '__' + ss] = None

        if ope(dir_results):
            Log.msg('SPAdes assembly already exists:', se)
        else:
            make_dirs(dir_results)
            Log.msg('Running SPAdes on:', se)
            run_spades_se(spades,
                          out_dir=dir_results,
                          input_file=fq_path,
                          threads=threads,
                          memory=ram,
                          rna=True)

        assmbl_path = opj(dir_results, 'transcripts.fasta')
        if ope(assmbl_path):
            count = len(read_fasta(assmbl_path, SEQ_TYPE_NT))
            tr_str = ' transcripts.'
            if count == 1:
                tr_str = ' transcript.'
            Log.msg('SPAdes produced ' + str(count) + tr_str, False)
            se_fastq_files[se]['spades_assembly' + '__' + ss] = assmbl_path
        else:
            Log.wrn('SPAdes produced no transcripts.', False)

    for pe in pe_fastq_files:
        dir_results = opj(dir_spades_assemblies, pe + '__' + ss)
        fq_paths = pe_fastq_files[pe]['vsearch_results_path' + '__' + ss]
        pe_fastq_files[pe]['spades_assembly' + '__' + ss] = None

        if ope(dir_results):
            Log.msg('SPAdes assembly already exists:', pe)
        else:
            make_dirs(dir_results)
            Log.msg('Running SPAdes on:', pe)

            if osstat(fq_paths[0]).st_size > 0 and \
               osstat(fq_paths[1]).st_size > 0:

                run_spades_pe(spades,
                              out_dir=dir_results,
                              input_files=fq_paths,
                              threads=threads,
                              memory=ram,
                              rna=True)

            else:
                _ = opj(dir_temp, 'temp.fasta')
                combine_text_files(fq_paths, _)
                run_spades_se(spades,
                              out_dir=dir_results,
                              input_file=_,
                              threads=threads,
                              memory=ram,
                              rna=True)
                osremove(_)

        assmbl_path = opj(dir_results, 'transcripts.fasta')
        if ope(assmbl_path):
            count = len(read_fasta(assmbl_path, SEQ_TYPE_NT))
            tr_str = ' transcripts.'
            if count == 1:
                tr_str = ' transcript.'
            Log.msg('SPAdes produced ' + str(count) + tr_str, False)
            pe_fastq_files[pe]['spades_assembly' + '__' + ss] = assmbl_path
        else:
            Log.wrn('SPAdes produced no transcripts.', False)
Example #23
def min_accept_read_len(se_fastq_files, pe_fastq_files, dir_temp,
                        dir_cache_fq_minlen, vsearch):
    # Lowest allowable minimum read length (nt).
    low = 35

    if len(se_fastq_files) > 0 or len(pe_fastq_files) > 0:
        print()
        Log.inf('Calculating minimum acceptable read length.')
        if vsearch is None:
            Log.err('vsearch is not available. Cannot continue. Exiting.')
            exit(0)
    else:
        return None

    __ = opj(dir_cache_fq_minlen, 'minlen')

    pickled = {}

    if ope(__):
        with open(__, 'rb') as f:
            pickled = pickle.load(f)

    queue = []

    for se in se_fastq_files:
        src = se_fastq_files[se]['src']
        avg_len = se_fastq_files[se]['avg_len']
        if src == 'sra':
            ml = max(avg_len // 3, low)
            se_fastq_files[se]['min_acc_len'] = ml
            Log.msg(str(ml) + ' nt:', se)
            continue

        fq_path = se_fastq_files[se]['path']
        stats_file = opj(dir_temp, se + '_stats.txt')
        queue.append([se, fq_path, stats_file, 'se'])

    for pe in pe_fastq_files:
        src = pe_fastq_files[pe]['src']
        avg_len = pe_fastq_files[pe]['avg_len']
        if src == 'sra':
            ml = max(avg_len // 3, low)
            pe_fastq_files[pe]['min_acc_len'] = ml
            Log.msg(str(ml) + ' nt:', pe)
            continue

        fq_path = pe_fastq_files[pe]['path'][0]
        stats_file = opj(dir_temp, pe + '_stats.txt')
        queue.append([pe, fq_path, stats_file, 'pe'])

    for x in queue:

        if x[0] in pickled:
            ml = pickled[x[0]]

        else:
            # ----------------------------------------------------------------
            # Use 'vsearch --fastq_stats'. About 2x slower than the
            #   approx_avg_read_len_fq function.
            #
            # cmd = [vsearch, '--fastq_stats', x[1], '--log', x[2]]
            # run(cmd, do_not_raise=True)
            # with open(x[2]) as f:
            #     stats = f.read()
            # remove(x[2])
            # ml = re.findall(r'>=\s+(\d+)', stats)
            # if len(ml) != 0:
            #     ml = max(int(ml[0]) // 3, low)
            # else:
            #     ml = None
            # ----------------------------------------------------------------
            # 22:59:12 50 nt: Hylocereus_polyrhizus_1195597_SRR7829961
            # 22:59:46 50 nt: Schlumbergera_truncata_15H-02_pol_S47    34s
            # 23:00:30 50 nt: Schlumbergera_truncata_15H-02_sty_S49    44s
            # ----------------------------------------------------------------

            # ----------------------------------------------------------------
            ml = approx_avg_read_len_fq(x[1])
            ml = max(int(ml) // 3, low)
            # ----------------------------------------------------------------
            # 23:12:06 50 nt: Hylocereus_polyrhizus_1195597_SRR7829961
            # 23:12:20 50 nt: Schlumbergera_truncata_15H-02_pol_S47    14s
            # 23:12:39 50 nt: Schlumbergera_truncata_15H-02_sty_S49    19s
            # ----------------------------------------------------------------

            pickled[x[0]] = ml

        if ml is not None:
            Log.msg(str(ml) + ' nt:', x[0])
        else:
            Log.msg(' ?' + ' nt:', x[0])
            ml = low

        if x[3] == 'se':
            se_fastq_files[x[0]]['min_acc_len'] = ml

        elif x[3] == 'pe':
            pe_fastq_files[x[0]]['min_acc_len'] = ml

        # Re-write the cache after each sample so progress survives an
        # interrupted run.
        with open(__, 'wb') as f:
            pickle.dump(pickled, f, protocol=PICKLE_PROTOCOL)
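
The heuristic is one third of the (approximate) average read length, floored at the 35 nt minimum:

low = 35
for avg_len in (300, 75):
    print(max(avg_len // 3, low))  # -> 100, then 35 (75 // 3 == 25 < low)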
Example #24
def run_rcorrector(se_fastq_files, pe_fastq_files, dir_fq_cor_data, rcorrector,
                   threads, dir_temp, should_run):
    if len(se_fastq_files) > 0 or len(pe_fastq_files) > 0:
        print()
        if should_run is False:
            Log.wrn('Skipping Rcorrector as requested.')
        else:
            Log.inf('Running Rcorrector.')

        if rcorrector is None:
            Log.err('Rcorrector is not available. Cannot continue. Exiting.')
            exit(0)

    for se in se_fastq_files:
        dir_fq_cor_data_sample = opj(dir_fq_cor_data, se)
        fq_path = se_fastq_files[se]['path']
        r_mode, w_mode, a_mode, fqopen, ext = plain_or_gzip(fq_path)
        log_f = opj(dir_fq_cor_data_sample, se + '.txt')
        out_f = opj(dir_fq_cor_data_sample, se + '.fastq' + ext)

        se_fastq_files[se]['cor_path_fq'] = out_f

        if should_run is False:
            se_fastq_files[se]['cor_path_fq'] = fq_path
            continue

        if ope(dir_fq_cor_data_sample):
            Log.msg('Corrected FASTQ file already exists:', se)
        else:
            make_dirs(dir_fq_cor_data_sample)
            Log.msg('SE mode:', se)
            run_rcorrector_se(rcorrector=rcorrector,
                              in_file=fq_path,
                              out_dir=dir_fq_cor_data_sample,
                              threads=threads,
                              dir_temp=dir_temp)

            fq_base_path = opj(dir_fq_cor_data_sample, basename(fq_path))
            fq_cor_path = splitext_gz(fq_base_path)[0] + '.cor.fq' + ext

            filter_unc_se(in_file=fq_cor_path, out_file=out_f, log_file=log_f)

            remove(fq_cor_path)

    for pe in pe_fastq_files:
        dir_fq_cor_data_sample = opj(dir_fq_cor_data, pe)
        fq_path_1 = pe_fastq_files[pe]['path'][0]
        fq_path_2 = pe_fastq_files[pe]['path'][1]
        fq_path_3 = None
        out_f_3 = None
        r_mode, w_mode, a_mode, fqopen, ext = plain_or_gzip(fq_path_1)
        log_f = opj(dir_fq_cor_data_sample, pe + '.txt')
        out_f_1 = opj(dir_fq_cor_data_sample, pe + '_R1.fastq' + ext)
        out_f_2 = opj(dir_fq_cor_data_sample, pe + '_R2.fastq' + ext)

        pe_fastq_files[pe]['cor_path_fq'] = [out_f_1, out_f_2]

        if len(pe_fastq_files[pe]['path']) == 3:
            fq_path_3 = pe_fastq_files[pe]['path'][2]
            out_f_3 = opj(dir_fq_cor_data_sample, pe + '_R3.fastq' + ext)
            pe_fastq_files[pe]['cor_path_fq'].append(out_f_3)

        if should_run is False:
            pe_fastq_files[pe]['cor_path_fq'] = [fq_path_1, fq_path_2]
            if fq_path_3 is not None:
                pe_fastq_files[pe]['cor_path_fq'].append(fq_path_3)
            continue

        if ope(dir_fq_cor_data_sample):
            Log.msg('Corrected FASTQ files already exist:', pe)
        else:
            make_dirs(dir_fq_cor_data_sample)
            Log.msg('PE mode:', pe)
            run_rcorrector_pe(rcorrector=rcorrector,
                              in_file_1=fq_path_1,
                              in_file_2=fq_path_2,
                              out_dir=dir_fq_cor_data_sample,
                              threads=threads,
                              dir_temp=dir_temp)

            fq_base_path_1 = opj(dir_fq_cor_data_sample, basename(fq_path_1))
            fq_cor_path_1 = splitext_gz(fq_base_path_1)[0] + '.cor.fq' + ext
            fq_base_path_2 = opj(dir_fq_cor_data_sample, basename(fq_path_2))
            fq_cor_path_2 = splitext_gz(fq_base_path_2)[0] + '.cor.fq' + ext

            filter_unc_pe(in_file_1=fq_cor_path_1,
                          in_file_2=fq_cor_path_2,
                          out_file_1=out_f_1,
                          out_file_2=out_f_2,
                          log_file=log_f)

            remove(fq_cor_path_1)
            remove(fq_cor_path_2)

            if fq_path_3 is not None:

                Log.msg(
                    'SE mode (Paired-read SRA run contains unpaired reads):',
                    pe)

                run_rcorrector_se(rcorrector=rcorrector,
                                  in_file=fq_path_3,
                                  out_dir=dir_fq_cor_data_sample,
                                  threads=threads,
                                  dir_temp=dir_temp)

                fq_base_path_3 = opj(dir_fq_cor_data_sample,
                                     basename(fq_path_3))
                # Match the paired outputs above: rcorrector keeps the
                # compression extension, so append it here as well.
                fq_cor_path_3 = splitext_gz(fq_base_path_3)[0] + '.cor.fq' + ext
                log_f_3 = opj(dir_fq_cor_data_sample, pe + '_unpaired.txt')

                filter_unc_se(in_file=fq_cor_path_3,
                              out_file=out_f_3,
                              log_file=log_f_3)

                remove(fq_cor_path_3)
Example #25
def run_inter_pro_scan(ss, assemblies, email, dir_prj_ips, dir_cache_prj,
                       parallel_run_count, max_title_a_len, max_run_id_len):

    delay = 0.25

    for a in assemblies:

        if 'transcripts_aa_orf_fasta_file__' + ss not in a:
            continue

        aa_file = a['transcripts_aa_orf_fasta_file__' + ss]

        if aa_file is None:
            continue

        assmbl_name = a['name']

        json_dump_file_path = opj(dir_prj_ips,
                                  assmbl_name + '_ann_ips__' + ss + '.json')

        if ope(json_dump_file_path):
            Log.inf('InterProScan results for assembly ' + assmbl_name + ', '
                    'search strategy ' + ss + ' have already been downloaded.')
            continue
        else:
            Log.inf('Running InterProScan on translated ' + ss + ' from ' +
                    assmbl_name + '.')

        seqs = seq_records_to_dict(read_fasta(aa_file, SEQ_TYPE_AA))

        # Discard all ORFs except the first one (ORF001).
        for seq_def in tuple(seqs.keys()):
            seq_def_prefix = seq_def.split(' ')[0]
            if not seq_def_prefix.endswith('ORF001'):
                del seqs[seq_def]

        seqs = OrderedDict(
            sorted(seqs.items(),
                   key=lambda x: x[0].split(' ')[1],
                   reverse=True))

        run_id = ss + '_' + assmbl_name

        _ = opj(dir_cache_prj, 'ips5_cache_done_' + run_id)

        if ope(_):
            with open(_, 'rb') as f:
                jobs = pickle.load(f)

        else:
            jobs = job_runner(email=email,
                              dir_cache=dir_cache_prj,
                              seqs=seqs,
                              run_id=run_id,
                              parallel_run_count=parallel_run_count,
                              max_title_a_len=max_title_a_len,
                              max_run_id_len=max_run_id_len)

            with open(_, 'wb') as f:
                pickle.dump(jobs, f, protocol=PICKLE_PROTOCOL)

        Log.inf('Downloading InterProScan results for ' + ss + ' in ' +
                assmbl_name + '.')

        all_ips_results = {}

        # Nicer printing
        for i, job in enumerate(jobs['finished']):

            job_id = jobs['finished'][job]

            titles_ab = split_seq_defn(job)
            title_a = titles_ab[0]

            progress = round(((i + 1) / len(jobs['finished'])) * 100)
            progress_str = '{:3d}'.format(progress) + '%'

            msg = (' ' * 12 + title_a.ljust(max_title_a_len) +
                   run_id.ljust(max_run_id_len) + progress_str.rjust(4) + ' ' +
                   job_id)

            Log.msg(msg)

            sleep(delay)

            ips_json = result_json(job_id)
            if ips_json is None:
                continue
            # ips_version = ips_json['interproscan-version']
            ips_json = ips_json['results']

            # These fields are set to 'EMBOSS_001' by default
            # Delete them
            del ips_json[0]['xref']

            job_no_def = job.split(' ')[0]

            all_ips_results[job_no_def] = ips_json

        with open(json_dump_file_path, 'w') as f:
            json.dump(all_ips_results, f, sort_keys=True, indent=4)

        # Removes cached jobs file.
        osremove(_)
Example #26
def run_bt2_fq(se_fastq_files, pe_fastq_files, dir_fq_filter_data, bowtie2,
               bowtie2_build, threads, dir_temp, bt2_order, fpatt, taxonomy,
               dir_cache_refseqs):

    new_se_fastq_files = dict()
    new_pe_fastq_files = dict()

    msg_printed = False

    # SE
    for se in se_fastq_files:

        taxid = se_fastq_files[se]['tax_id']
        dbs = _should_run_bt2(taxid, taxonomy, bt2_order, bowtie2,
                              bowtie2_build)

        in_f = se_fastq_files[se]['trim_path_fq']
        in_f_orig = in_f

        if len(dbs) == 0:
            se_fastq_files[se]['filter_path_fq'] = in_f
            continue

        if msg_printed is False:
            print()
            Log.inf('Running Bowtie2.')
            msg_printed = True

        for i, db in enumerate(dbs):

            db_path = dbs[db]

            dir_fq_bt_data_sample = opj(dir_fq_filter_data, se, db)
            dir_fq_bt_data_sample_un = opj(dir_fq_filter_data, se)

            new_se = se + '_' + db

            out_f = opj(dir_fq_bt_data_sample, new_se + '.fastq')

            out_f_un = opj(dir_temp, new_se + '_bt2_unaligned' + '.fastq')

            sam_f = opj(dir_fq_bt_data_sample, new_se + '.sam')
            new_se_fastq_files[new_se] = deepcopy(se_fastq_files[se])
            new_se_fastq_files[new_se]['path'] = None
            new_se_fastq_files[new_se]['cor_path_fq'] = None
            new_se_fastq_files[new_se]['trim_path_fq'] = None
            taxid = new_se_fastq_files[new_se]['tax_id']
            gc = new_se_fastq_files[new_se]['gc_id']
            if db == MT:
                gc = taxonomy.mito_genetic_code_for_taxid(taxid)
                new_se_fastq_files[new_se]['gc_id'] = gc
            elif db == PT:
                gc = taxonomy.plastid_genetic_code_for_taxid(taxid)
                new_se_fastq_files[new_se]['gc_id'] = gc
            new_se_fastq_files[new_se]['gc_tt'] = TranslationTable(gc)
            new_se_fastq_files[new_se]['filter_path_fq'] = out_f
            if ope(dir_fq_bt_data_sample):
                Log.msg('Bowtie2 filtered FASTQ file already exists:', new_se)
                in_f = opj(dir_fq_bt_data_sample_un, se + '.fastq')
            else:
                Log.msg('SE mode:', new_se)
                make_dirs(dir_fq_bt_data_sample)

                db_fasta_path = None
                bt2_idx_path = None
                if db_path in (MT, PT):
                    db_fasta_path = dnld_refseqs_for_taxid(taxid,
                                                           db,
                                                           taxonomy,
                                                           dir_cache_refseqs,
                                                           query='',
                                                           db='nuccore')
                    bt2_idx_path = splitext(db_fasta_path)[0]
                else:
                    db_fasta_path = db_path
                    bt2_idx_path = opj(dir_cache_refseqs,
                                       splitext(basename(db_fasta_path))[0])

                if not ope(bt2_idx_path + '.1.bt2'):
                    build_bt2_index(bowtie2_build, [db_fasta_path],
                                    bt2_idx_path, threads)

                run_bowtie2_se(bowtie2=bowtie2,
                               input_file=in_f,
                               output_file=out_f,
                               output_file_un=out_f_un,
                               sam_output_file=sam_f,
                               index=bt2_idx_path,
                               threads=threads,
                               dir_temp=dir_temp)

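                # The intermediate unaligned file from the previous pass
                # is no longer needed.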
                if i > 0:
                    remove(in_f)

                in_f = out_f_un

        out_f_un = opj(dir_fq_bt_data_sample_un, se + '.fastq')
        se_fastq_files[se]['filter_path_fq'] = out_f_un

        if in_f != in_f_orig:
            move(in_f, out_f_un)

    se_fastq_files.update(new_se_fastq_files)

    # PE
    for pe in pe_fastq_files:

        taxid = pe_fastq_files[pe]['tax_id']
        dbs = _should_run_bt2(taxid, taxonomy, bt2_order, bowtie2,
                              bowtie2_build)

        in_fs = pe_fastq_files[pe]['trim_path_fq']
        in_fs_orig = tuple(in_fs)

        if len(dbs) == 0:
            pe_fastq_files[pe]['filter_path_fq'] = in_fs
            continue

        if msg_printed is False:
            print()
            Log.inf('Running Bowtie2.')
            msg_printed = True

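        # As in SE mode, chain the databases: unaligned reads from one
        # pass feed the next.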
        for i, db in enumerate(dbs):

            db_path = dbs[db]

            dir_fq_bt_data_sample = opj(dir_fq_filter_data, pe, db)
            dir_fq_bt_data_sample_un = opj(dir_fq_filter_data, pe)

            new_pe = pe + '_' + db

            out_fs = [x.replace('@D@', dir_fq_bt_data_sample) for x in fpatt]
            out_fs = [x.replace('@N@', new_pe) for x in out_fs]

            out_fs_un = [x.replace('@D@', dir_temp) for x in fpatt]
            out_fs_un = [
                x.replace('@N@', new_pe + '_bt2_unaligned') for x in out_fs_un
            ]

            sam_f = opj(dir_fq_bt_data_sample, new_pe + '.sam')
            new_pe_fastq_files[new_pe] = deepcopy(pe_fastq_files[pe])
            new_pe_fastq_files[new_pe]['path'] = None
            new_pe_fastq_files[new_pe]['cor_path_fq'] = None
            new_pe_fastq_files[new_pe]['trim_path_fq'] = None
            taxid = new_pe_fastq_files[new_pe]['tax_id']
            gc = new_pe_fastq_files[new_pe]['gc_id']
            if db == MT:
                gc = taxonomy.mito_genetic_code_for_taxid(taxid)
                new_pe_fastq_files[new_pe]['gc_id'] = gc
            elif db == PT:
                gc = taxonomy.plastid_genetic_code_for_taxid(taxid)
                new_pe_fastq_files[new_pe]['gc_id'] = gc
            new_pe_fastq_files[new_pe]['gc_tt'] = TranslationTable(gc)
            new_pe_fastq_files[new_pe]['filter_path_fq'] = out_fs
            if ope(dir_fq_bt_data_sample):
                Log.msg('Bowtie2 filtered FASTQ files already exist:', new_pe)
                in_fs = [
                    x.replace('@D@', dir_fq_bt_data_sample_un) for x in fpatt
                ]
                in_fs = [x.replace('@N@', pe) for x in in_fs]
            else:
                Log.msg('PE mode:', new_pe)
                make_dirs(dir_fq_bt_data_sample)

                db_fasta_path = None
                bt2_idx_path = None
                if db_path in (MT, PT):
                    db_fasta_path = dnld_refseqs_for_taxid(taxid,
                                                           db,
                                                           taxonomy,
                                                           dir_cache_refseqs,
                                                           query='',
                                                           db='nuccore')
                    bt2_idx_path = splitext(db_fasta_path)[0]
                else:
                    db_fasta_path = db_path
                    bt2_idx_path = opj(dir_cache_refseqs,
                                       splitext(basename(db_fasta_path))[0])

                if not ope(bt2_idx_path + '.1.bt2'):
                    build_bt2_index(bowtie2_build, [db_fasta_path],
                                    bt2_idx_path, threads)

                paired_out_pattern = out_fs[0].replace('_paired_1.fastq',
                                                       '_paired_%.fastq')

                paired_out_pattern_un = out_fs_un[0].replace(
                    '_paired_1.fastq', '_paired_%.fastq')

                run_bowtie2_pe(bowtie2=bowtie2,
                               input_files=in_fs,
                               paired_out_pattern=paired_out_pattern,
                               paired_out_pattern_un=paired_out_pattern_un,
                               unpaired_out_1=out_fs[2],
                               unpaired_out_2=out_fs[3],
                               unpaired_out_1_un=out_fs_un[2],
                               unpaired_out_2_un=out_fs_un[3],
                               sam_output_file=sam_f,
                               index=bt2_idx_path,
                               threads=threads,
                               dir_temp=dir_temp)

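                # Intermediate unaligned files from the previous pass are
                # no longer needed.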
                if i > 0:
                    remove(in_fs[0])
                    remove(in_fs[1])
                    remove(in_fs[2])
                    remove(in_fs[3])

                in_fs = out_fs_un

        out_fs_un = [x.replace('@D@', dir_fq_bt_data_sample_un) for x in fpatt]
        out_fs_un = [x.replace('@N@', pe) for x in out_fs_un]
        pe_fastq_files[pe]['filter_path_fq'] = out_fs_un

        if tuple(in_fs) != in_fs_orig:
            move(in_fs[0], out_fs_un[0])
            move(in_fs[1], out_fs_un[1])
            move(in_fs[2], out_fs_un[2])
            move(in_fs[3], out_fs_un[3])

    pe_fastq_files.update(new_pe_fastq_files)
Example #27
0
def run_kraken2(order, dbs, se_fastq_files, pe_fastq_files, dir_fq_filter_data,
                confidence, kraken2, threads, dir_temp, fpatt):
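    """Filter FASTQ files with Kraken2 at the given confidence threshold.

    Each sample is passed through run_kraken_filters using the databases
    in 'order'; the sample's 'filter_path_fq' is then pointed at the reads
    assigned to the database tagged 'nuclear', if one is configured.
    """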

    if (len(se_fastq_files) > 0 or len(pe_fastq_files) > 0) and len(order) > 0:
        print()
        Log.inf('Running Kraken2.', 'Confidence: ' + str(confidence))
        if kraken2 is None:
            Log.err('kraken2 is not available. Cannot continue. Exiting.')
            exit(0)

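    # Find the first database tagged 'nuclear'; its subdirectory will hold
    # the final filtered reads.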
    nuclear = None
    for nuc in order:
        if nuc[1] == 'nuclear':
            nuclear = nuc[0]
            break

    for se in se_fastq_files:

        if len(order) == 0:
            continue

        if se_fastq_files[se]['path'] is None:
            continue

        fq_path = se_fastq_files[se]['filter_path_fq']
        dir_fq_filter_data_sample = opj(dir_fq_filter_data, se)

        if nuclear is None:
            out_f = opj(dir_fq_filter_data_sample, se + '.fastq')
        else:
            out_f = opj(dir_fq_filter_data_sample, nuclear, se + '.fastq')

        se_fastq_files[se]['filter_path_fq'] = out_f

        if ope(dir_fq_filter_data_sample):
            Log.msg('Kraken2 filtered FASTQ files already exist:', se)
        else:
            make_dirs(dir_fq_filter_data_sample)
            print()
            Log.msg('SE mode:', se)
            run_kraken_filters(order=order,
                               dbs=dbs,
                               base_name=se,
                               in_files=fq_path,
                               dir_out=dir_fq_filter_data_sample,
                               confidence=confidence,
                               kraken2=kraken2,
                               threads=threads,
                               dir_temp=dir_temp)

    for pe in pe_fastq_files:

        if len(order) == 0:
            continue

        if pe_fastq_files[pe]['path'] is None:
            continue

        fq_path = pe_fastq_files[pe]['filter_path_fq']
        dir_fq_filter_data_sample = opj(dir_fq_filter_data, pe)

        if nuclear is None:
            dir_name_nuclear = dir_fq_filter_data_sample
        else:
            dir_name_nuclear = dir_fq_filter_data_sample + ops + nuclear

        out_fs = [x.replace('@D@', dir_name_nuclear) for x in fpatt]
        out_fs = [x.replace('@N@', pe) for x in out_fs]

        pe_fastq_files[pe]['filter_path_fq'] = out_fs

        if ope(dir_fq_filter_data_sample):
            Log.msg('Kraken2 filtered FASTQ files already exist:', pe)
        else:
            make_dirs(dir_fq_filter_data_sample)
            print()
            Log.msg('PE mode:', pe)
            run_kraken_filters(order=order,
                               dbs=dbs,
                               base_name=pe,
                               in_files=fq_path,
                               dir_out=dir_fq_filter_data_sample,
                               confidence=confidence,
                               kraken2=kraken2,
                               threads=threads,
                               dir_temp=dir_temp)
Example #28
0
def dnld_sra_info(sras, dir_cache_prj):
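    """Download SRA run metadata and decide which runs are usable.

    Fetched records are cached (pickled) in dir_cache_prj, so only runs
    not seen before are queried. Runs that are not Illumina RNA libraries
    are reported and skipped; accepted runs are returned in
    sras_acceptable along with the full metadata dict.
    """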

    sra_runs_info = {}
    sras_acceptable = []

    if len(sras) > 0:
        print()
        Log.inf('Downloading SRA run information.')
    else:
        return sra_runs_info, sras_acceptable

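    # Path to the pickled cache of SRA run info from previous runs.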
    __ = opj(dir_cache_prj, 'sra_runs_info_cache')

    if ope(__):
        with open(__, 'rb') as f:
            sra_runs_info = pickle.load(f)

    sras_local = [k for k in sra_runs_info.keys()]
    sras_to_dnld = set(sras).difference(set(sras_local))
    if len(sras_to_dnld) > 0:
        temp = sra_run_info(list(sras_to_dnld))
        new_sra_runs_info = {i['Run']: i for i in temp}
        sra_runs_info.update(new_sra_runs_info)

    for sra in sras:

        if sra in sra_runs_info:

            info = sra_runs_info[sra]

            sra_lib_layout = info['LibraryLayout'].lower()
            sra_lib_source = info['LibrarySource'].lower()
            sra_lib_strategy = info['LibraryStrategy']
            sra_seq_platform = info['Platform'].lower().capitalize()
            sra_seq_platform_model = info['Model']
            sra_species = info['ScientificName']
            sra_taxid = info['TaxID']
            sra_spots = int(info['spots'])
            sra_spots_with_mates = int(info['spots_with_mates'])

            sample_base_name = (sra_species.replace(' ', '_') + '_' +
                                sra_taxid + '_' + sra)

            sra_runs_info[sra]['KakapoSampleBaseName'] = sample_base_name

            src_check = sra_lib_source.lower()
            strategy_check = sra_lib_strategy.lower()

            if not ('transcript' in src_check or 'rna' in src_check
                    or 'rna' in strategy_check):

                sra_info_str = ('{sra}: the SRA library source type "{ltype}" '
                                'or library strategy "{strategy}" '
                                'is not supported.').format(
                                    sra=sra,
                                    ltype=sra_lib_source,
                                    strategy=sra_lib_strategy)

                Log.err(sra_info_str, 'Skipping.')

            elif sra_seq_platform != 'Illumina':
                sra_info_str = ('{sra}: the SRA library sequencing platform '
                                '"{plat}" is not supported').format(
                                    sra=sra, plat=sra_seq_platform)

                Log.err(sra_info_str, 'Skipping.')

            else:
                # sra_info_str = ('SRA run {sra} {strategy} ({source}) '
                #                 '{layout}-end library.\n'
                #                 'Sourced from {species} '
                #                 '(TaxID: {txid}).\n'
                #                 'Sequenced using {platform} platform on '
                #                 '{model}.').format(
                #                     sra=sra,
                #                     source=sra_lib_source.title(),
                #                     strategy=sra_lib_strategy,
                #                     layout=sra_lib_layout,
                #                     platform=sra_seq_platform,
                #                     model=sra_seq_platform_model,
                #                     species=sra_species,
                #                     txid=sra_taxid)

                Log.msg(
                    '{sra}:'.format(sra=sra),
                    '{strategy} {layout}-end library ({source}).'.format(
                        strategy=sra_lib_strategy,
                        layout=sra_lib_layout,
                        source=sra_lib_source.title()))
                Log.msg(
                    '    Source:',
                    '{species} (TaxID: {txid}).'.format(species=sra_species,
                                                        txid=sra_taxid), False)
                Log.msg(
                    'Technology:', '{platform} platform on {model}.'.format(
                        platform=sra_seq_platform,
                        model=sra_seq_platform_model), False)

                sra_runs_info[sra]['KakapoLibraryLayout'] = \
                    sra_runs_info[sra]['LibraryLayout']

                if sra_lib_layout == 'paired' and sra_spots_with_mates == 0:
                    sra_runs_info[sra]['KakapoLibraryLayout'] = 'SINGLE'
                    # sra_info_str = (
                    #     sra_info_str + '\nListed as containing '
                    #     'paired-end reads, but only a single set of reads '
                    #     'is available. Treating as single-ended.')

                elif (sra_lib_layout == 'paired'
                      and sra_spots != sra_spots_with_mates):
                    sra_runs_info[sra]['KakapoLibraryLayout'] = 'PAIRED_UNP'
                    # sra_info_str = (
                    #     sra_info_str + '\nListed as containing '
                    #     'paired-end reads, but not all reads are paired.')

                sras_acceptable.append(sra)

                # Log.msg(sra_info_str)

    with open(__, 'wb') as f:
        pickle.dump(sra_runs_info, f, protocol=PICKLE_PROTOCOL)

    return sra_runs_info, sras_acceptable
Example #29
0
def run_tblastn_on_assemblies(ss, assemblies, aa_queries_file, tblastn,
                              dir_prj_assmbl_blast_results, blast_2_evalue,
                              blast_2_max_hsps, blast_2_qcov_hsp_perc,
                              blast_2_best_hit_overhang,
                              blast_2_best_hit_score_edge,
                              blast_2_max_target_seqs, threads, dir_cache_prj,
                              dir_prj_ips):
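    """Run tblastn searches of the protein queries against each assembly.

    Cached results are reused when the stored BLAST settings and query
    sequences match the current ones; otherwise tblastn is re-run and any
    stale InterProScan annotations for the assembly are deleted first.
    """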

    if len(assemblies) > 0:
        print()
        Log.inf('Running BLAST on assemblies:', ss)
        if tblastn is None:
            Log.err('tblastn is not available. Cannot continue. Exiting.')
            exit(0)
    else:
        Log.wrn('There are no assemblies. Nothing to do, stopping.')
        exit(0)

    cache_file = opj(dir_cache_prj, 'blast_2_settings_cache__' + ss)

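    # Settings from the previous run (if cached) vs. the current settings;
    # compared below to decide whether existing results can be reused.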
    pickled = dict()
    settings = {'blast_2_evalue': blast_2_evalue,
                'blast_2_max_hsps': blast_2_max_hsps,
                'blast_2_qcov_hsp_perc': blast_2_qcov_hsp_perc,
                'blast_2_best_hit_overhang': blast_2_best_hit_overhang,
                'blast_2_best_hit_score_edge': blast_2_best_hit_score_edge,
                'blast_2_max_target_seqs': blast_2_max_target_seqs,
                'queries': seq_records_to_dict(
                    read_fasta(aa_queries_file, SEQ_TYPE_AA))}

    Log.msg('evalue:', str(blast_2_evalue))
    Log.msg('max_hsps:', str(blast_2_max_hsps))
    Log.msg('qcov_hsp_perc:', str(blast_2_qcov_hsp_perc))
    Log.msg('best_hit_overhang:', str(blast_2_best_hit_overhang))
    Log.msg('best_hit_score_edge:', str(blast_2_best_hit_score_edge))
    Log.msg('max_target_seqs:', str(blast_2_max_target_seqs))
    print()

    for a in assemblies:

        assmbl_src = a['src']
        assmbl_name = a['name']

        if assmbl_src != 'user_fasta':
            if assmbl_name.endswith('__' + ss):
                assmbl_name = assmbl_name.replace('__' + ss, '')
            else:
                continue

        assmbl_blast_db_path = a['blast_db_path']
        assmbl_genetic_code = a['gc_id']

        ips_json_dump_path = opj(dir_prj_ips, assmbl_name + '_ann_ips__' + ss +
                                 '.json')

        _ = opj(dir_prj_assmbl_blast_results, assmbl_name + '__' + ss + '.tsv')

        if ope(_) and ope(cache_file):
            with open(cache_file, 'rb') as f:
                pickled = pickle.load(f)

        if ope(_) and pickled == settings:
            # Log.msg('The provided BLAST settings and query sequences did '
            #         'not change since the previous run.')
            Log.msg('BLAST results already exist:', assmbl_name)

        else:
            Log.msg('Running tblastn on: ' + assmbl_name, ss)

            if ope(ips_json_dump_path):
                osremove(ips_json_dump_path)

            run_blast(exec_file=tblastn,
                      task='tblastn',
                      threads=threads,
                      db_path=assmbl_blast_db_path,
                      queries_file=aa_queries_file,
                      out_file=_,
                      evalue=blast_2_evalue,
                      max_hsps=blast_2_max_hsps,
                      qcov_hsp_perc=blast_2_qcov_hsp_perc,
                      best_hit_overhang=blast_2_best_hit_overhang,
                      best_hit_score_edge=blast_2_best_hit_score_edge,
                      max_target_seqs=blast_2_max_target_seqs,
                      db_genetic_code=assmbl_genetic_code,
                      out_cols=BLST_RES_COLS_2)

        a['blast_hits_aa__' + ss] = parse_blast_results_file(_, BLST_RES_COLS_2)

    with open(cache_file, 'wb') as f:
        pickle.dump(settings, f, protocol=PICKLE_PROTOCOL)
Example #30
0
def download_kraken2_dbs(dbs_path):
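    """Download and extract prebuilt Kraken2 databases into dbs_path.

    Each archive is fetched only if its target directory does not already
    exist; archives are removed after extraction. Returns a dict mapping
    database names to their local paths.
    """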
    base_kraken2_url = 'ftp://ftp.ccb.jhu.edu/pub/data/kraken2_dbs/'
    msg_prefix = 'Downloading Kraken2 database: '

    # ------------------------------------------------------------------------

    base = '16S_Silva132_20200326'
    url = base_kraken2_url + base + '.tgz'
    tgz = opj(dbs_path, base + '.tgz')
    # ToDo: Use pattern matching to find the extracted directory name
    #       instead of hard-coding '16S_SILVA132_k2db'.
    p_orig = opj(dbs_path, '16S_SILVA132_k2db')
    db_name = '16S_Silva132'
    p_new = opj(dbs_path, db_name)

    if not ope(p_new):
        Log.msg(msg_prefix + db_name)
        download_file(url=url, local_path=tgz, protocol='ftp')
        tar_ref = tarfile.open(tgz, 'r:gz')
        tar_ref.extractall(dbs_path)
        tar_ref.close()
        remove(tgz)
        move(p_orig, p_new)

    # ------------------------------------------------------------------------

    base = '16S_Silva138_20200326'
    url = base_kraken2_url + base + '.tgz'
    tgz = opj(dbs_path, base + '.tgz')
    # ToDo: Use pattern matching to find the extracted directory name
    #       instead of hard-coding '16S_SILVA138_k2db'.
    p_orig = opj(dbs_path, '16S_SILVA138_k2db')
    db_name = '16S_Silva138'
    p_new = opj(dbs_path, db_name)

    if not ope(p_new):
        Log.msg(msg_prefix + db_name)
        download_file(url=url, local_path=tgz, protocol='ftp')
        tar_ref = tarfile.open(tgz, 'r:gz')
        tar_ref.extractall(dbs_path)
        tar_ref.close()
        remove(tgz)
        move(p_orig, p_new)

    # ------------------------------------------------------------------------

    base = 'minikraken_8GB_202003'
    url = base_kraken2_url + base + '.tgz'
    tgz = opj(dbs_path, base + '.tgz')
    # ToDo: Use pattern matching to find the extracted directory name
    #       instead of hard-coding 'minikraken_8GB_20200312'.
    p_orig = opj(dbs_path, 'minikraken_8GB_20200312')
    db_name = 'minikraken_8GB_2020-03-12'
    p_new = opj(dbs_path, db_name)

    if not ope(p_new):
        Log.msg(msg_prefix + db_name)
        download_file(url=url, local_path=tgz, protocol='ftp')
        tar_ref = tarfile.open(tgz, 'r:gz')
        tar_ref.extractall(dbs_path)
        tar_ref.close()
        remove(tgz)
        move(p_orig, p_new)

    # ------------------------------------------------------------------------

    base_dropbox_url = 'https://www.dropbox.com/s/'

    base = 'mitochondrion_and_plastid'
    garb = 'vkbp7iys6s76tvf/'
    url = base_dropbox_url + garb + base + '.tar.gz?dl=1'
    tgz = opj(dbs_path, base + '.tar.gz')
    p = opj(dbs_path, base)

    if not ope(p):
        Log.msg(msg_prefix + base)
        download_file(url=url, local_path=tgz, protocol='http')
        tar_ref = tarfile.open(tgz, 'r:gz')
        tar_ref.extractall(dbs_path)
        tar_ref.close()
        remove(tgz)

    # ------------------------------------------------------------------------

    base = 'mitochondrion'
    garb = '6liwneb26uvjuec/'
    url = base_dropbox_url + garb + base + '.tar.gz?dl=1'
    tgz = opj(dbs_path, base + '.tar.gz')
    p = opj(dbs_path, base)

    if not ope(p):
        Log.msg(msg_prefix + base)
        download_file(url=url, local_path=tgz, protocol='http')
        tar_ref = tarfile.open(tgz, 'r:gz')
        tar_ref.extractall(dbs_path)
        tar_ref.close()
        remove(tgz)

    # ------------------------------------------------------------------------

    base = 'plastid'
    garb = 's9vdg4mxrfy1szn/'
    url = base_dropbox_url + garb + base + '.tar.gz?dl=1'
    tgz = opj(dbs_path, base + '.tar.gz')
    p = opj(dbs_path, base)

    if not ope(p):
        Log.msg(msg_prefix + base)
        download_file(url=url, local_path=tgz, protocol='http')
        tar_ref = tarfile.open(tgz, 'r:gz')
        tar_ref.extractall(dbs_path)
        tar_ref.close()
        remove(tgz)

    # ------------------------------------------------------------------------

    base = 'viral'
    garb = '7xz31c7vw088n27/'
    url = base_dropbox_url + garb + base + '.tar.gz?dl=1'
    tgz = opj(dbs_path, base + '.tar.gz')
    p = opj(dbs_path, base)

    if not ope(p):
        Log.msg(msg_prefix + base)
        download_file(url=url, local_path=tgz, protocol='http')
        tar_ref = tarfile.open(tgz, 'r:gz')
        tar_ref.extractall(dbs_path)
        tar_ref.close()
        remove(tgz)

    # ------------------------------------------------------------------------

    dbs_available, e = list_of_dirs_at_path(dbs_path)
    kraken2_dbs = {basename(p): p for p in dbs_available}
    return kraken2_dbs