def _parse_taxa(taxa, tax_group, taxonomy, config_file_path):
    txids = list()

    for tax in taxa:
        if tax.isdigit():
            txids.append(int(tax))
        else:
            # tax_orig = tax
            txid = taxonomy.tax_id_for_name_and_group_tax_id(
                name=tax, group_tax_id=tax_group)
            if txid is None:
                txid = taxonomy.tax_id_for_name_and_group_tax_id(
                    name=tax.split(' ')[0], group_tax_id=tax_group)
            if txid is None:
                txids.append(txid)
                msg = 'NCBI taxonomy ID could not be found for:'
                Log.wrn(msg, tax)
                # replace_line_in_file(
                #     file_path=config_file_path,
                #     line_str=tax_orig,
                #     replace_str='; NCBI taxid not found: ' + tax)
            else:
                txids.append(int(txid))
                msg = 'NCBI taxonomy ID for ' + tax + ' is:'
                Log.msg(msg, str(txid))
                # replace_line_in_file(
                #     file_path=config_file_path,
                #     line_str=tax_orig,
                #     replace_str='; ' + tax + '\n' + str(txid))

    return txids

def dep_check_bowtie2(dir_dep, os_id, force):
    if os_id == 'mac':
        url = ('https://sourceforge.net/projects/bowtie-bio/files/bowtie2/'
               '2.4.1/bowtie2-2.4.1-macos-x86_64.zip/download')
    elif os_id == 'linux':
        url = ('https://sourceforge.net/projects/bowtie-bio/files/bowtie2/'
               '2.4.1/bowtie2-2.4.1-linux-x86_64.zip/download')

    dnld_path = opj(dir_dep, 'bowtie2.zip')

    try:
        if force is True:
            raise
        bowtie2 = which('bowtie2')
        bowtie2_build = which('bowtie2-build')
        run([bowtie2, '-h'])
        run([bowtie2_build, '-h'])
    except Exception:
        try:
            dir_bin = opj(dir_dep, get_dep_dir(dir_dep, 'bowtie2'))
            bowtie2 = opj(dir_bin, 'bowtie2')
            bowtie2_build = opj(dir_bin, 'bowtie2-build')
            run([bowtie2, '-h'])
            run([bowtie2_build, '-h'])
        except Exception:
            Log.wrn('Bowtie 2 was not found on this system, trying to '
                    'download.')
            download_file(url, dnld_path)
            zip_ref = zipfile.ZipFile(dnld_path, 'r')
            zip_ref.extractall(dir_dep)
            zip_ref.close()

            dir_bin = opj(dir_dep, get_dep_dir(dir_dep, 'bowtie2'))
            bowtie2 = opj(dir_bin, 'bowtie2')
            bowtie2_build = opj(dir_bin, 'bowtie2-build')

            bowtie2_execs = ('', '-align-l', '-align-l-debug', '-align-s',
                             '-align-s-debug', '-build', '-build-l',
                             '-build-l-debug', '-build-s', '-build-s-debug',
                             '-inspect', '-inspect-l', '-inspect-l-debug',
                             '-inspect-s', '-inspect-s-debug')

            for bt2exe in bowtie2_execs:
                chmod(bowtie2 + bt2exe,
                      stat.S_IRWXU | stat.S_IRGRP | stat.S_IXGRP |
                      stat.S_IROTH | stat.S_IXOTH)

            if not ope(bowtie2):
                Log.err('Could not download Bowtie 2.')
                return None, None

    regexp = r'^.*?version\s([\d\.]*)'
    v = get_dep_version([bowtie2, '--version'], regexp)
    Log.msg('bowtie2 is available:', v + ' ' + bowtie2)
    v = get_dep_version([bowtie2_build, '--version'], regexp)
    Log.msg('bowtie2-build is available:', v + ' ' + bowtie2_build)

    return bowtie2, bowtie2_build

def dnld_refseqs_for_taxid(taxid, filter_term, taxonomy, dir_cache_refseqs,
                           query='', db='nuccore'):
    ft = None
    if filter_term == 'plastid':
        ft = '("chloroplast"[filter] OR "plastid"[filter])'
    else:
        ft = '("' + filter_term + '"[filter])'

    tax_terms = tuple(reversed(taxonomy.lineage_for_taxid(taxid)['names']))
    for tax_term in tax_terms:
        if tax_term is None:
            tax_term = taxonomy.scientific_name_for_taxid(taxid)
        term = '"RefSeq"[Keyword] AND "{}"[Primary Organism] AND {}'.format(
            tax_term, ft)
        term = query + term
        accs = set(accs_eutil(search_eutil(db, term)))
        if len(accs) > 0:
            plural = 'sequences'
            if len(accs) == 1:
                plural = 'sequence'
            Log.msg('Found {} RefSeq {} {} for'.format(
                len(accs), filter_term, plural), tax_term)
            # Random sample ---------------------------------------------------
            if len(accs) > 10:
                Log.wrn('Using a random sample of ten RefSeq sequences.')
                random.seed(a=len(accs), version=2)
                # Sort first: random.sample() requires a sequence (sampling
                # from a set is removed in Python 3.11) and sets have no
                # stable order between runs.
                accs = set(random.sample(sorted(accs), 10))
            # -----------------------------------------------------------------
            break
        else:
            Log.wrn('No RefSeq {} sequences were found for'.format(
                filter_term), tax_term)

    cache_path = opj(dir_cache_refseqs,
                     filter_term + '__' + tax_term.replace(' ', '_') +
                     '.fasta')
    parsed_fasta_cache = {}
    if ope(cache_path):
        parsed_fasta_cache = read_fasta(cache_path, seq_type=SEQ_TYPE_NT,
                                        def_to_first_space=True)
        parsed_fasta_cache = seq_records_to_dict(parsed_fasta_cache)
        for acc in parsed_fasta_cache:
            if acc in accs:
                accs.remove(acc)
    if len(accs) > 0:
        parsed_fasta = dnld_ncbi_seqs(db, list(accs))
        parsed_fasta = seq_records_to_dict(parsed_fasta, prepend_acc=True)
        parsed_fasta.update(parsed_fasta_cache)
        write_fasta(parsed_fasta, cache_path)

    return cache_path

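# Illustrative sketch (not part of the pipeline): how dnld_refseqs_for_taxid()
# assembles the Entrez search term above. Only the string handling is shown;
# no NCBI calls are made. The organism name 'Solanum' is a made-up example.
def _example_refseq_search_term(filter_term='plastid', tax_term='Solanum'):
    if filter_term == 'plastid':
        ft = '("chloroplast"[filter] OR "plastid"[filter])'
    else:
        ft = '("' + filter_term + '"[filter])'
    return '"RefSeq"[Keyword] AND "{}"[Primary Organism] AND {}'.format(
        tax_term, ft)
    # -> '"RefSeq"[Keyword] AND "Solanum"[Primary Organism] AND
    #     ("chloroplast"[filter] OR "plastid"[filter])'
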
def user_protein_accessions(ss, prot_acc_user, dir_cache_prj, taxonomy):
    if len(prot_acc_user) > 0:
        Log.inf('Reading user provided protein accessions:', ss)
        print()
        pickle_file = opj(dir_cache_prj, 'ncbi_prot_metadata_cache__' + ss)
        acc_old = set()
        if ope(pickle_file):
            with open(pickle_file, 'rb') as f:
                pickled = pickle.load(f)
                acc_old = set([x['accessionversion'] for x in pickled])

        if acc_old == set(prot_acc_user):
            pa_info = pickled
        else:
            pa_info = summary_eutil('protein', prot_acc_user)

        prot_acc = []
        prot_info_to_print = []
        max_acc_len = 0
        for pa in pa_info:
            acc = pa['accessionversion']
            prot_acc.append(acc)
            title = pa['title']
            title_split = title.split('[')
            taxid = pa['taxid']
            if 'organism' in pa:
                organism = pa['organism']
            else:
                organism = taxonomy.scientific_name_for_taxid(taxid)
                pa['organism'] = organism
            # title = title_split[0]
            # title = title.lower().strip()
            # title = title.replace('_', ' ').replace('-', ' ')
            # title = title.replace(',', '')
            # title = title[0].upper() + title[1:] + ' [' + organism + ']'
            max_acc_len = max(max_acc_len, len(acc))
            prot_info_to_print.append((title, acc))

        prot_info_to_print = sorted(prot_info_to_print)
        for pi in prot_info_to_print:
            title = pi[0]
            acc = pi[1]
            if len(title) > 80:
                title = title[:77] + '...'
            Log.msg(acc.rjust(max_acc_len) + ':', title, False)

        with open(pickle_file, 'wb') as f:
            pickle.dump(pa_info, f, protocol=PICKLE_PROTOCOL)

        return prot_acc

    else:
        return prot_acc_user

def user_aa_fasta(ss, user_queries, aa_prot_user_file):
    _ = ''
    if len(user_queries) > 0:
        print()
        Log.inf('Reading user provided AA sequences:', ss)
        for ap in user_queries:
            Log.msg(ap)
            with open(ap, 'r') as f:
                _ = _ + f.read()
    if _ != '':
        with open(aa_prot_user_file, 'w') as f:
            write_fasta(standardize_fasta_text(_, SEQ_TYPE_AA), f)

def dep_check_sra_toolkit(dir_dep, os_id, dist_id, debian_dists, redhat_dists,
                          force):
    if os_id == 'mac':
        url = ('https://ftp-trace.ncbi.nlm.nih.gov/sra/sdk/2.10.8/'
               'sratoolkit.2.10.8-mac64.tar.gz')
    elif os_id == 'linux':
        if dist_id in debian_dists:
            url = ('https://ftp-trace.ncbi.nlm.nih.gov/sra/sdk/2.10.8/'
                   'sratoolkit.2.10.8-ubuntu64.tar.gz')
        elif dist_id in redhat_dists:
            url = ('https://ftp-trace.ncbi.nlm.nih.gov/sra/sdk/2.10.8/'
                   'sratoolkit.2.10.8-centos_linux64.tar.gz')

    dnld_path = opj(dir_dep, 'sra-toolkit.tar.gz')

    fasterq_dump = None
    try:
        if force is True:
            raise
        fasterq_dump = which('fasterq-dump')
        dir_bin = dirname(fasterq_dump).strip('bin')
        _ensure_vdb_cfg(dir_bin)
        run(fasterq_dump)
    except Exception:
        try:
            dir_bin = opj(dir_dep, get_dep_dir(dir_dep, 'sratoolkit'))
            _ensure_vdb_cfg(dir_bin)
            fasterq_dump = opj(dir_bin, 'bin', 'fasterq-dump')
            run(fasterq_dump)
        except Exception:
            Log.wrn('SRA Toolkit was not found on this system, trying to '
                    'download.')
            download_file(url, dnld_path)
            tar_ref = tarfile.open(dnld_path, 'r:gz')
            tar_ref.extractall(dir_dep)
            tar_ref.close()

            dir_bin = opj(dir_dep, get_dep_dir(dir_dep, 'sratoolkit'))
            fasterq_dump = opj(dir_bin, 'bin', 'fasterq-dump')
            _ensure_vdb_cfg(dir_bin)

            if not ope(fasterq_dump):
                Log.err('Could not download SRA Toolkit.')
                return None

    v = get_dep_version([fasterq_dump, '--version'], r':\s([\d\.]*)')
    if v == '?':
        v = get_dep_version([fasterq_dump, '--version'],
                            r'version\s([\d\.]*)')
    Log.msg('fasterq-dump is available:', v + ' ' + fasterq_dump)

    return fasterq_dump

def dep_check_kakapolib(force=False, quiet=False):
    kkpl = KAKAPOLIB
    if not ope(kkpl):
        if quiet is False:
            Log.wrn('Compiling kakapolib.')
        run(['make', 'install'], cwd=DIR_C_SRC)
    if ope(kkpl):
        if quiet is False:
            Log.msg('kakapolib is available:', kkpl)
    else:
        Log.err('Compilation of kakapolib failed.')
        return None
    return ctypes.CDLL(kkpl)

def dep_check_seqtk(dir_dep, force):
    url = 'https://github.com/lh3/seqtk/archive/master.zip'
    dnld_path = opj(dir_dep, 'seqtk.zip')
    dir_bin = opj(dir_dep, 'seqtk-master')

    fp = NamedTemporaryFile()
    fp.write(str.encode('>seq' + lns + 'ATGC'))
    fp.seek(0)
    cmd = ['', 'seq', '-r', fp.name]

    try:
        if force is True:
            raise
        seqtk = which('seqtk')
        cmd[0] = seqtk
        run(cmd, do_not_raise=True)
    except Exception:
        try:
            seqtk = opj(dir_bin, 'seqtk')
            cmd[0] = seqtk
            run(cmd, do_not_raise=True)
        except Exception:
            Log.wrn('Seqtk was not found on this system, trying to download.')
            download_file(url, dnld_path)
            zip_ref = zipfile.ZipFile(dnld_path, 'r')
            zip_ref.extractall(dir_dep)
            zip_ref.close()
            try:
                Log.wrn('Compiling Seqtk.')
                run('make', cwd=dir_bin)
                run(cmd, do_not_raise=True)
            except Exception:
                replace_line_in_file(opj(dir_bin, 'Makefile'),
                                     'CC=gcc', 'CC=cc')
                try:
                    run('make', cwd=dir_bin)
                    run(cmd, do_not_raise=True)
                except Exception:
                    Log.err('Something went wrong while trying to compile '
                            'Seqtk.')
                    Log.msg('Try downloading and installing it manually '
                            'from: https://github.com/lh3/seqtk')
                    fp.close()
                    return None

    fp.close()

    v = get_dep_version([seqtk], r'Version\:\s([\d\w\.\-]*)')
    Log.msg('Seqtk is available:', v + ' ' + seqtk)

    return seqtk

def dep_check_vsearch(dir_dep, os_id, dist_id, debian_dists, redhat_dists,
                      force):
    if os_id == 'mac':
        url = ('https://github.com/torognes/vsearch/releases/download/v2.15.0/'
               'vsearch-2.15.0-macos-x86_64.tar.gz')
    elif os_id == 'linux':
        if dist_id in debian_dists:
            url = ('https://github.com/torognes/vsearch/releases/download/'
                   'v2.15.0/vsearch-2.15.0-linux-x86_64.tar.gz')
        elif dist_id in redhat_dists:
            url = ('https://github.com/torognes/vsearch/releases/download/'
                   'v2.15.0/vsearch-2.15.0-linux-x86_64.tar.gz')

    dnld_path = opj(dir_dep, 'vsearch.tar.gz')

    try:
        if force is True:
            raise
        vsearch = which('vsearch')
        run(vsearch)
    except Exception:
        try:
            dir_bin = opj(dir_dep, get_dep_dir(dir_dep, 'vsearch'))
            vsearch = opj(dir_bin, 'bin', 'vsearch')
            run(vsearch)
        except Exception:
            Log.wrn(
                'Vsearch was not found on this system, trying to download.')
            download_file(url, dnld_path)
            tar_ref = tarfile.open(dnld_path, 'r:gz')
            tar_ref.extractall(dir_dep)
            tar_ref.close()
            try:
                dir_bin = opj(dir_dep, get_dep_dir(dir_dep, 'vsearch'))
                vsearch = opj(dir_bin, 'bin', 'vsearch')
                if not ope(vsearch):
                    Log.err('Could not download Vsearch.')
                    return None
                else:
                    run(vsearch)
            except Exception:
                Log.err('Vsearch was downloaded, but does not execute.')
                Log.msg('Try downloading and installing it manually from: '
                        'https://github.com/torognes/vsearch')
                return None

    v = get_dep_version([vsearch, '-version'], r'vsearch\sv([\d\.]*)')
    Log.msg('Vsearch is available:', v + ' ' + vsearch)

    return vsearch

def user_fastq_files(fq_se, fq_pe):
    if len(fq_se) > 0 or len(fq_pe) > 0:
        print()
        Log.inf('Preparing user provided FASTQ files.')

    se_fastq_files = {}
    pe_fastq_files = {}

    fq_type_1_regex = r'(.*)_L\d\d\d(_R.)_\d\d\d(.*)'

    for se in fq_se:
        tax_id = se[0]
        path = se[1]
        base = basename(path)
        if plain_or_gzip(base)[4] != '':
            base = splitext(base)[0]
        base = splitext(base)[0]
        fq_type_1_match = re.findall(fq_type_1_regex, base)
        if len(fq_type_1_match) > 0 and len(fq_type_1_match[0]) == 3:
            base = fq_type_1_match[0][0]
        sample_base_name = base
        se_fastq_files[sample_base_name] = {'path': path}
        se_fastq_files[sample_base_name]['src'] = 'usr'
        se_fastq_files[sample_base_name]['avg_len'] = None
        se_fastq_files[sample_base_name]['tax_id'] = tax_id
        Log.msg(sample_base_name + ':', basename(path))

    for pe in fq_pe:
        tax_id = pe[0]
        path = pe[1]
        base = basename(path[0])
        if plain_or_gzip(base)[4] != '':
            base = splitext(base)[0]
        base = splitext(base)[0]
        fq_type_1_match = re.findall(fq_type_1_regex, base)
        if len(fq_type_1_match) > 0 and len(fq_type_1_match[0]) == 3:
            base = fq_type_1_match[0][0]
        else:
            base = basename(commonprefix(path)).rstrip('_- R')
        sample_base_name = base
        pe_fastq_files[sample_base_name] = {'path': path}
        pe_fastq_files[sample_base_name]['src'] = 'usr'
        pe_fastq_files[sample_base_name]['avg_len'] = None
        pe_fastq_files[sample_base_name]['tax_id'] = tax_id
        Log.msg(sample_base_name + ':',
                basename(path[0]) + '\n' +
                ' ' * (len(sample_base_name) + 2) +
                basename(path[1]))

    return se_fastq_files, pe_fastq_files

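# Illustrative sketch (not part of the pipeline): how user_fastq_files()
# collapses an Illumina-style file name into a sample base name using
# fq_type_1_regex. The file name is a made-up example.
def _example_fq_sample_base_name(base='SampleA_S1_L001_R1_001'):
    import re
    fq_type_1_regex = r'(.*)_L\d\d\d(_R.)_\d\d\d(.*)'
    fq_type_1_match = re.findall(fq_type_1_regex, base)
    if len(fq_type_1_match) > 0 and len(fq_type_1_match[0]) == 3:
        base = fq_type_1_match[0][0]
    return base  # -> 'SampleA_S1'
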
def dep_check_spades(dir_dep, os_id, force):
    if os_id == 'mac':
        url = ('http://cab.spbu.ru/files/release3.14.1/'
               'SPAdes-3.14.1-Darwin.tar.gz')
    elif os_id == 'linux':
        url = ('http://cab.spbu.ru/files/release3.14.1/'
               'SPAdes-3.14.1-Linux.tar.gz')

    dnld_path = opj(dir_dep, 'SPAdes.tar.gz')

    try:
        if force is True:
            raise
        spades = which('spades.py')
        run([PY3, spades])
    except Exception:
        try:
            dir_bin = opj(dir_dep, get_dep_dir(dir_dep, 'SPAdes'))
            spades = opj(dir_bin, 'bin', 'spades.py')
            run([PY3, spades])
        except Exception:
            Log.wrn('SPAdes was not found on this system, trying to '
                    'download.')
            try:
                download_file(url, dnld_path)
                tar_ref = tarfile.open(dnld_path, 'r:gz')
                tar_ref.extractall(dir_dep)
                tar_ref.close()
            except Exception:
                Log.err('Could not download SPAdes.')
                return None
            try:
                dir_bin = opj(dir_dep, get_dep_dir(dir_dep, 'SPAdes'))
                spades = opj(dir_bin, 'bin', 'spades.py')
                # replace_line_in_file(spades,
                #                      '#!/usr/bin/env python',
                #                      '#!/usr/bin/env python3')
                if ope(spades):
                    run([PY3, spades])
                else:
                    Log.err('Could not download SPAdes.')
                    return None
            except Exception:
                Log.err('SPAdes was downloaded, but does not execute.')
                return None

    v = get_dep_version([PY3, spades, '--version'],
                        r'^.*SPAdes.*v([\d\.]*)')
    Log.msg('SPAdes is available:', v + ' ' + spades)

    return spades

def _write_trimmomatic_adapters_file(dir_dep):
    path_adapters = opj(dir_dep, 'trimmomatic_adapters.fasta')

    # Each header and sequence must end with a newline so the written file is
    # a valid FASTA that Trimmomatic can parse.
    adapters = ('>TruSeq2_SE\n'
                'AGATCGGAAGAGCTCGTATGCCGTCTTCTGCTTG\n'
                '>TruSeq2_PE_f\n'
                'AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT\n'
                '>TruSeq2_PE_r\n'
                'AGATCGGAAGAGCGGTTCAGCAGGAATGCCGAG\n'
                '>TruSeq3_IndexedAdapter\n'
                'AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC\n'
                '>TruSeq3_UniversalAdapter\n'
                'AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTA\n'
                '>PrefixPE/1\n'
                'AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT\n'
                '>PrefixPE/2\n'
                'CAAGCAGAAGACGGCATACGAGATCGGTCTCGGCATTCCTGCTGAACCGCTCTTCCGATCT\n'
                '>PCR_Primer1\n'
                'AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT\n'
                '>PCR_Primer1_rc\n'
                'AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATT\n'
                '>PCR_Primer2\n'
                'CAAGCAGAAGACGGCATACGAGATCGGTCTCGGCATTCCTGCTGAACCGCTCTTCCGATCT\n'
                '>PCR_Primer2_rc\n'
                'AGATCGGAAGAGCGGTTCAGCAGGAATGCCGAGACCGATCTCGTATGCCGTCTTCTGCTTG\n'
                '>FlowCell1\n'
                'TTTTTTTTTTAATGATACGGCGACCACCGAGATCTACAC\n'
                '>FlowCell2\n'
                'TTTTTTTTTTCAAGCAGAAGACGGCATACGA\n'
                '>PrefixPE/1\n'
                'TACACTCTTTCCCTACACGACGCTCTTCCGATCT\n'
                '>PrefixPE/2\n'
                'GTGACTGGAGTTCAGACGTGTGCTCTTCCGATCT\n'
                '>PE1\n'
                'TACACTCTTTCCCTACACGACGCTCTTCCGATCT\n'
                '>PE1_rc\n'
                'AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTA\n'
                '>PE2\n'
                'GTGACTGGAGTTCAGACGTGTGCTCTTCCGATCT\n'
                '>PE2_rc\n'
                'AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC\n')

    if not ope(path_adapters):
        Log.msg('Writing Trimmomatic adapter file: ' + path_adapters)
        with open(path_adapters, mode='w') as f:
            f.write(adapters)

    return path_adapters

def dep_check_blast(dir_dep, os_id, dist_id, debian_dists, redhat_dists,
                    force):
    if os_id == 'mac':
        url = ('https://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/2.10.1/'
               'ncbi-blast-2.10.1+-x64-macosx.tar.gz')
    elif os_id == 'linux':
        if dist_id in debian_dists:
            url = ('https://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/'
                   '2.10.1/ncbi-blast-2.10.1+-x64-linux.tar.gz')
        elif dist_id in redhat_dists:
            url = ('https://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/'
                   '2.10.1/ncbi-blast-2.10.1+-x64-linux.tar.gz')

    dnld_path = opj(dir_dep, 'ncbi-blast.tar.gz')

    makeblastdb = None
    blastn = None
    tblastn = None

    try:
        if force is True:
            raise
        makeblastdb = which('makeblastdb')
        blastn = which('blastn')
        tblastn = which('tblastn')
        run([makeblastdb, '-help'])
    except Exception:
        try:
            dir_bin = opj(dir_dep, get_dep_dir(dir_dep, 'ncbi-blast'))
            makeblastdb = opj(dir_bin, 'bin', 'makeblastdb')
            blastn = opj(dir_bin, 'bin', 'blastn')
            tblastn = opj(dir_bin, 'bin', 'tblastn')
            run([makeblastdb, '-help'])
        except Exception:
            Log.wrn('BLAST+ was not found on this system, trying to '
                    'download.')
            download_file(url, dnld_path)
            tar_ref = tarfile.open(dnld_path, 'r:gz')
            tar_ref.extractall(dir_dep)
            tar_ref.close()

            dir_bin = opj(dir_dep, get_dep_dir(dir_dep, 'ncbi-blast'))
            makeblastdb = opj(dir_bin, 'bin', 'makeblastdb')
            blastn = opj(dir_bin, 'bin', 'blastn')
            tblastn = opj(dir_bin, 'bin', 'tblastn')

            if not ope(makeblastdb) or \
               not ope(blastn) or \
               not ope(tblastn):
                Log.err('Could not download BLAST+.')
                return None, None, None

    regexp = r'\sblast\s([\d\.]*)'
    v = get_dep_version([makeblastdb, '-version'], regexp)
    Log.msg('makeblastdb is available:', v + ' ' + makeblastdb)
    v = get_dep_version([blastn, '-version'], regexp)
    Log.msg('blastn is available:', v + ' ' + blastn)
    v = get_dep_version([tblastn, '-version'], regexp)
    Log.msg('tblastn is available:', v + ' ' + tblastn)

    return makeblastdb, blastn, tblastn

def dep_check_trimmomatic(dir_dep):
    url = ('http://www.usadellab.org/cms/uploads/supplementary/Trimmomatic/'
           'Trimmomatic-0.39.zip')
    dnld_path = opj(dir_dep, 'Trimmomatic-0.39.zip')
    dir_bin = opj(dir_dep, 'Trimmomatic-0.39')
    trimmomatic = opj(dir_bin, 'trimmomatic-0.39.jar')

    if not ope(trimmomatic):
        download_file(url, dnld_path)
        zip_ref = zipfile.ZipFile(dnld_path, 'r')
        zip_ref.extractall(dir_dep)
        zip_ref.close()

        if not ope(trimmomatic):
            Log.err('Could not download Trimmomatic.')
            return None, None

    v = get_dep_version(['java', '-jar', trimmomatic, '-version'],
                        r'\d+\.\d+')
    Log.msg('Trimmomatic is available:', v + ' ' + trimmomatic)

    path_adapters = _write_trimmomatic_adapters_file(dir_dep)

    return trimmomatic, path_adapters

def pfam_uniprot_accessions(ss, pfam_acc, tax_ids, dir_cache_pfam_acc):
    if len(pfam_acc) > 0:
        Log.inf('Downloading UniProt accessions for Pfam accessions:', ss)
    pfam_seqs_list = []
    for pa in pfam_acc:
        pfam_id = pfam_entry(pa)[0]['id']
        Log.msg(pa + ':', pfam_id)
        _ = opj(dir_cache_pfam_acc, pa + '__' + ss)
        if ope(_):
            with open(_, 'rb') as f:
                acc = pickle.load(f)
            pfam_seqs_list = pfam_seqs_list + acc
        else:
            # Note: the results may include "obsolete" accessions.
            # This is not a problem, they will not appear in the set of
            # downloaded sequences from UniProt.
            acc = pfam_seqs(query=pa)
            pfam_seqs_list = pfam_seqs_list + acc
            with open(_, 'wb') as f:
                pickle.dump(acc, f, protocol=PICKLE_PROTOCOL)

    pfam_uniprot_acc = prot_ids_for_tax_ids(pfam_seqs_list, tax_ids)
    return pfam_uniprot_acc

def makeblastdb_assemblies(assemblies, dir_prj_blast_assmbl, makeblastdb):
    if len(assemblies) > 0:
        print()
        Log.inf('Building BLAST databases for assemblies.')
        if makeblastdb is None:
            Log.err('makeblastdb is not available. Cannot continue. Exiting.')
            exit(0)
    for a in assemblies:
        assmbl_name = a['name']

        assmbl_blast_db_dir = opj(dir_prj_blast_assmbl, assmbl_name)
        assmbl_blast_db_file = opj(assmbl_blast_db_dir, assmbl_name)

        a['blast_db_path'] = assmbl_blast_db_file

        if ope(assmbl_blast_db_dir):
            Log.msg('BLAST database already exists:', assmbl_name)
        else:
            Log.msg(assmbl_name)
            make_dirs(assmbl_blast_db_dir)
            make_blast_db(exec_file=makeblastdb,
                          in_file=a['path'],
                          out_file=assmbl_blast_db_file,
                          title=assmbl_name)

def makeblastdb_fq(se_fastq_files, pe_fastq_files, dir_blast_fa_trim,
                   makeblastdb, fpatt):
    if len(se_fastq_files) > 0 or len(pe_fastq_files) > 0:
        print()
        Log.inf('Building BLAST databases for reads.')
        if makeblastdb is None:
            Log.err('makeblastdb is not available. Cannot continue. Exiting.')
            exit(0)

    for se in se_fastq_files:
        dir_blast_fa_trim_sample = opj(dir_blast_fa_trim, se)
        fa_path = se_fastq_files[se]['filter_path_fa']
        out_f = opj(dir_blast_fa_trim_sample, se)
        se_fastq_files[se]['blast_db_path'] = out_f

        if ope(dir_blast_fa_trim_sample):
            Log.msg('BLAST database already exists:', se)
        else:
            make_dirs(dir_blast_fa_trim_sample)
            Log.msg(basename(fa_path))
            make_blast_db(exec_file=makeblastdb,
                          in_file=fa_path,
                          out_file=out_f,
                          title=se,
                          dbtype='nucl')

    for pe in pe_fastq_files:
        dir_blast_fa_trim_sample = opj(dir_blast_fa_trim, pe)
        fa_paths = pe_fastq_files[pe]['filter_path_fa']
        out_fs = [x.replace('@D@', dir_blast_fa_trim_sample) for x in fpatt]
        out_fs = [x.replace('@N@', pe) for x in out_fs]
        pe_fastq_files[pe]['blast_db_path'] = out_fs

        if ope(dir_blast_fa_trim_sample):
            Log.msg('BLAST database already exists:', pe)
        else:
            make_dirs(dir_blast_fa_trim_sample)
            pe_trim_files = zip(fa_paths, out_fs)
            for x in pe_trim_files:
                Log.msg(basename(x[0]))
                make_blast_db(exec_file=makeblastdb,
                              in_file=x[0],
                              out_file=x[1],
                              title=basename(x[1]),
                              dbtype='nucl')

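# Illustrative sketch (not part of the pipeline): how the '@D@' and '@N@'
# placeholders in fpatt are expanded above into per-sample file paths. The
# pattern strings and paths are made-up examples of the expected form.
def _example_fpatt_expansion():
    fpatt = ['@D@/@N@_paired_1.fasta', '@D@/@N@_paired_2.fasta',
             '@D@/@N@_unpaired_1.fasta', '@D@/@N@_unpaired_2.fasta']
    dir_blast_fa_trim_sample = '/tmp/fa-trimmed/SampleA'
    out_fs = [x.replace('@D@', dir_blast_fa_trim_sample) for x in fpatt]
    out_fs = [x.replace('@N@', 'SampleA') for x in out_fs]
    return out_fs
    # -> ['/tmp/fa-trimmed/SampleA/SampleA_paired_1.fasta', ...]
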
def filtered_fq_to_fa(se_fastq_files, pe_fastq_files, dir_fa_trim_data, seqtk,
                      fpatt):
    if len(se_fastq_files) > 0 or len(pe_fastq_files) > 0:
        print()
        Log.inf('Converting FASTQ to FASTA using Seqtk.')
        if seqtk is None:
            Log.err('seqtk is not available. Cannot continue. Exiting.')
            exit(0)

    for se in se_fastq_files:
        dir_fa_trim_data_sample = opj(dir_fa_trim_data, se)
        fq_path = se_fastq_files[se]['filter_path_fq']
        out_f = opj(dir_fa_trim_data_sample, se + '.fasta')
        se_fastq_files[se]['filter_path_fa'] = out_f

        if ope(dir_fa_trim_data_sample):
            Log.msg('Filtered FASTA files already exist:', se)
        else:
            make_dirs(dir_fa_trim_data_sample)
            Log.msg(basename(fq_path))
            seqtk_fq_to_fa(seqtk, fq_path, out_f)

    for pe in pe_fastq_files:
        dir_fa_trim_data_sample = opj(dir_fa_trim_data, pe)
        fq_paths = pe_fastq_files[pe]['filter_path_fq']
        out_fs = [x.replace('@D@', dir_fa_trim_data_sample) for x in fpatt]
        out_fs = [x.replace('@N@', pe) for x in out_fs]
        pe_fastq_files[pe]['filter_path_fa'] = out_fs

        if ope(dir_fa_trim_data_sample):
            Log.msg('Filtered FASTA files already exist:', pe)
        else:
            make_dirs(dir_fa_trim_data_sample)
            pe_trim_files = zip(fq_paths, out_fs)
            for x in pe_trim_files:
                Log.msg(basename(x[0]))
                seqtk_fq_to_fa(seqtk, x[0], x[1])

def filter_queries(ss, aa_queries_file, min_query_length, max_query_length,
                   max_query_identity, vsearch, prot_acc_user, overwrite,
                   logging=True):
    if logging is True:
        print()
        Log.inf('Filtering AA query sequences:', ss)
        Log.msg('min_query_length:', str(min_query_length))
        Log.msg('max_query_length:', str(max_query_length))
        Log.msg('max_query_identity:', str(max_query_identity))

    parsed_fasta_1 = filter_fasta_by_length(aa_queries_file, SEQ_TYPE_AA,
                                            min_query_length,
                                            max_query_length)

    tmp1 = aa_queries_file + '_temp1'
    tmp2 = aa_queries_file + '_temp2'

    for rec in parsed_fasta_1:
        rec.seq.gc_code = 1
        rec.seq = rec.seq.untranslate()

    write_fasta(parsed_fasta_1, tmp1)
    run_cluster_fast(vsearch, max_query_identity, tmp1, tmp2)
    parsed_fasta_2 = read_fasta(tmp2, SEQ_TYPE_DNA, parse_def=True)

    prot_acc_user_new = list()
    for rec in parsed_fasta_2:
        rec.seq.gc_code = 1
        rec.seq = rec.seq.translate()
        acc = rec.accession_version
        if acc in prot_acc_user:
            prot_acc_user_new.append(acc)

    if overwrite is True:
        write_fasta(parsed_fasta_2, aa_queries_file, prepend_acc=True)

    osremove(tmp1)
    osremove(tmp2)

    return prot_acc_user_new

def dnld_sra_fastq_files(sras, sra_runs_info, dir_fq_data, fasterq_dump,
                         threads, dir_temp):
    if len(sras) > 0:
        if fasterq_dump is None:
            Log.err('fasterq-dump from SRA Toolkit is not available. ' +
                    'Cannot continue. Exiting.')
            exit(0)
        print()
        Log.inf('Downloading SRA read data.')

    se_fastq_files = {}
    pe_fastq_files = {}

    for sra in sras:
        sra_run_info = sra_runs_info[sra]
        sra_lib_layout = sra_run_info['LibraryLayout'].lower()
        sra_lib_layout_k = sra_run_info['KakapoLibraryLayout'].lower()
        sample_base_name = sra_run_info['KakapoSampleBaseName']
        sra_taxid = int(sra_run_info['TaxID'])
        avg_len = int(sra_run_info['avgLength'])

        sra_dnld_needed = False

        if sra_lib_layout == 'single' or sra_lib_layout_k == 'single':
            se_file = opj(dir_fq_data, sra + '.fastq')
            se_fastq_files[sample_base_name] = {'path': se_file}
            se_fastq_files[sample_base_name]['src'] = 'sra'
            se_fastq_files[sample_base_name]['avg_len'] = avg_len
            se_fastq_files[sample_base_name]['tax_id'] = sra_taxid
            if not ope(se_file):
                sra_dnld_needed = True

        elif sra_lib_layout == 'paired':
            pe_file_1 = opj(dir_fq_data, sra + '_1.fastq')
            pe_file_2 = opj(dir_fq_data, sra + '_2.fastq')
            pe_file_1_renamed = opj(dir_fq_data, sra + '_R1.fastq')
            pe_file_2_renamed = opj(dir_fq_data, sra + '_R2.fastq')
            pe_fastq_files[sample_base_name] = {
                'path': [pe_file_1_renamed, pe_file_2_renamed]}
            pe_fastq_files[sample_base_name]['src'] = 'sra'
            pe_fastq_files[sample_base_name]['avg_len'] = avg_len // 2
            pe_fastq_files[sample_base_name]['tax_id'] = sra_taxid
            if sra_lib_layout_k == 'paired_unp':
                pe_file_3 = opj(dir_fq_data, sra + '.fastq')
                pe_file_3_renamed = opj(dir_fq_data, sra + '_R3.fastq')
                pe_fastq_files[sample_base_name]['path'].append(
                    pe_file_3_renamed)
            if not ope(pe_file_1_renamed) or not ope(pe_file_2_renamed):
                sra_dnld_needed = True

        if not sra_dnld_needed:
            Log.msg('FASTQ reads are available locally:', sample_base_name)

        retry_count = 0
        while sra_dnld_needed:

            if retry_count > 50:
                Log.err('Download failed. Exiting.')
                rmtree(dir_temp)
                exit(1)
            elif retry_count > 0:
                Log.wrn('Download failed. Retrying.')
                sleep(2)

            retry_count += 1

            Log.msg('Downloading FASTQ reads for:', sample_base_name)

            cmd = [fasterq_dump,
                   '--threads', str(threads * 2),
                   '--split-3',
                   '--bufsize', '819200',
                   '--outdir', dir_fq_data,
                   '--temp', dir_temp,
                   sra]

            run(cmd, do_not_raise=True)

            if sra_lib_layout == 'single' or sra_lib_layout_k == 'single':
                if not ope(se_file):
                    continue

            elif sra_lib_layout == 'paired':
                if not ope(pe_file_1) or not ope(pe_file_2):
                    continue
                else:
                    move(pe_file_1, pe_file_1_renamed)
                    move(pe_file_2, pe_file_2_renamed)

                if sra_lib_layout_k == 'paired_unp':
                    if not ope(pe_file_3):
                        continue
                    else:
                        move(pe_file_3, pe_file_3_renamed)

            sra_dnld_needed = False

            if sra_lib_layout == 'single' or sra_lib_layout_k == 'single':
                if ope(se_file):
                    Log.msg('Renaming FASTQ reads in:', se_file)
                    rename_fq_seqs(se_file, sra, '1:N:0')

            elif sra_lib_layout == 'paired':
                if ope(pe_file_1_renamed):
                    Log.msg('Renaming FASTQ reads in:', pe_file_1_renamed)
                    rename_fq_seqs(pe_file_1_renamed, sra, '1:N:0')
                if ope(pe_file_2_renamed):
                    Log.msg('Renaming FASTQ reads in:', pe_file_2_renamed)
                    rename_fq_seqs(pe_file_2_renamed, sra, '2:N:0')
                if sra_lib_layout_k == 'paired_unp':
                    if ope(pe_file_3_renamed):
                        Log.msg('Renaming FASTQ reads in:', pe_file_3_renamed)
                        rename_fq_seqs(pe_file_3_renamed, sra + '_unpaired',
                                       '1:N:0')

    return se_fastq_files, pe_fastq_files, sra_runs_info

def run_trimmomatic(se_fastq_files, pe_fastq_files, dir_fq_trim_data,
                    trimmomatic, adapters, fpatt, threads):
    if len(se_fastq_files) > 0 or len(pe_fastq_files) > 0:
        print()
        Log.inf('Running Trimmomatic.')
        if trimmomatic is None:
            Log.err('trimmomatic is not available. Cannot continue. Exiting.')
            exit(0)

    for se in se_fastq_files:
        dir_fq_trim_data_sample = opj(dir_fq_trim_data, se)
        fq_path = se_fastq_files[se]['cor_path_fq']
        r_mode, w_mode, a_mode, fqopen, ext = plain_or_gzip(fq_path)
        min_acc_len = se_fastq_files[se]['min_acc_len']
        stats_f = opj(dir_fq_trim_data_sample, se + '.txt')
        out_f = opj(dir_fq_trim_data_sample, se + '.fastq' + ext)

        se_fastq_files[se]['trim_path_fq'] = out_f

        if ope(dir_fq_trim_data_sample):
            Log.msg('Trimmed FASTQ file already exists:', se)
        else:
            make_dirs(dir_fq_trim_data_sample)
            Log.msg('SE mode:', se)
            trimmomatic_se(trimmomatic=trimmomatic,
                           adapters=adapters,
                           in_file=fq_path,
                           out_file=out_f,
                           stats_file=stats_f,
                           threads=threads,
                           minlen=min_acc_len)

    for pe in pe_fastq_files:
        dir_fq_trim_data_sample = opj(dir_fq_trim_data, pe)
        fq_path_1 = pe_fastq_files[pe]['cor_path_fq'][0]
        fq_path_2 = pe_fastq_files[pe]['cor_path_fq'][1]
        fq_path_3 = None
        r_mode, w_mode, a_mode, fqopen, ext = plain_or_gzip(fq_path_1)
        if len(pe_fastq_files[pe]['cor_path_fq']) == 3:
            fq_path_3 = pe_fastq_files[pe]['cor_path_fq'][2]
        min_acc_len = pe_fastq_files[pe]['min_acc_len']
        stats_f = opj(dir_fq_trim_data_sample, pe + '.txt')
        out_fs = [x.replace('@D@', dir_fq_trim_data_sample) for x in fpatt]
        out_fs = [x.replace('@N@', pe) for x in out_fs]
        out_fs = [x + ext for x in out_fs]

        pe_fastq_files[pe]['trim_path_fq'] = out_fs

        if ope(dir_fq_trim_data_sample):
            Log.msg('Trimmed FASTQ files already exist:', pe)
        else:
            make_dirs(dir_fq_trim_data_sample)
            Log.msg('PE mode:', pe)
            trimmomatic_pe(trimmomatic=trimmomatic,
                           adapters=adapters,
                           in_file_1=fq_path_1,
                           in_file_2=fq_path_2,
                           out_file_paired_1=out_fs[0],
                           out_file_paired_2=out_fs[1],
                           out_file_unpaired_1=out_fs[2],
                           out_file_unpaired_2=out_fs[3],
                           stats_file=stats_f,
                           threads=threads,
                           minlen=min_acc_len)

            if fq_path_3 is not None:

                out_f = opj(dir_fq_trim_data_sample, 'unpaired.fastq' + ext)
                stats_f = opj(dir_fq_trim_data_sample, pe + '_unpaired.txt')

                Log.msg(
                    'SE mode (Paired-read SRA run contains unpaired reads):',
                    pe)

                trimmomatic_se(trimmomatic=trimmomatic,
                               adapters=adapters,
                               in_file=fq_path_3,
                               out_file=out_f,
                               stats_file=stats_f,
                               threads=threads,
                               minlen=min_acc_len)

                _ = opj(dir_fq_trim_data_sample, 'temp.fastq' + ext)
                f_temp = fqopen(_, w_mode)
                with fileinput.FileInput(
                        files=[out_fs[2], out_f],
                        openhook=fileinput.hook_compressed) as f:
                    for line in f:
                        f_temp.write(line)
                f_temp.close()

                remove(out_fs[2])
                remove(out_f)
                copyfile(_, out_fs[2])
                remove(_)

def run_spades(se_fastq_files, pe_fastq_files, dir_spades_assemblies, spades,
               dir_temp, ss, threads, ram):
    if len(se_fastq_files) > 0 or len(pe_fastq_files) > 0:
        if spades is None:
            Log.err('SPAdes is not available. Cannot continue. Exiting.')
            exit(0)

    for se in se_fastq_files:
        dir_results = opj(dir_spades_assemblies, se + '__' + ss)
        fq_path = se_fastq_files[se]['vsearch_results_path' + '__' + ss]
        se_fastq_files[se]['spades_assembly' + '__' + ss] = None

        if ope(dir_results):
            Log.msg('SPAdes assembly already exists:', se)
        else:
            make_dirs(dir_results)
            Log.msg('Running SPAdes on:', se)
            run_spades_se(spades,
                          out_dir=dir_results,
                          input_file=fq_path,
                          threads=threads,
                          memory=ram,
                          rna=True)

        assmbl_path = opj(dir_results, 'transcripts.fasta')
        if ope(assmbl_path):
            count = len(read_fasta(assmbl_path, SEQ_TYPE_NT))
            tr_str = ' transcripts.'
            if count == 1:
                tr_str = ' transcript.'
            Log.msg('SPAdes produced ' + str(count) + tr_str, False)
            se_fastq_files[se]['spades_assembly' + '__' + ss] = assmbl_path
        else:
            Log.wrn('SPAdes produced no transcripts.', False)

    for pe in pe_fastq_files:
        dir_results = opj(dir_spades_assemblies, pe + '__' + ss)
        fq_paths = pe_fastq_files[pe]['vsearch_results_path' + '__' + ss]
        pe_fastq_files[pe]['spades_assembly' + '__' + ss] = None

        if ope(dir_results):
            Log.msg('SPAdes assembly already exists:', pe)
        else:
            make_dirs(dir_results)
            Log.msg('Running SPAdes on: ' + pe)

            if osstat(fq_paths[0]).st_size > 0 and \
               osstat(fq_paths[1]).st_size > 0:
                run_spades_pe(spades,
                              out_dir=dir_results,
                              input_files=fq_paths,
                              threads=threads,
                              memory=ram,
                              rna=True)
            else:
                _ = opj(dir_temp, 'temp.fasta')
                combine_text_files(fq_paths, _)
                run_spades_se(spades,
                              out_dir=dir_results,
                              input_file=_,
                              threads=threads,
                              memory=ram,
                              rna=True)
                osremove(_)

        assmbl_path = opj(dir_results, 'transcripts.fasta')
        if ope(assmbl_path):
            count = len(read_fasta(assmbl_path, SEQ_TYPE_NT))
            tr_str = ' transcripts.'
            if count == 1:
                tr_str = ' transcript.'
            Log.msg('SPAdes produced ' + str(count) + tr_str, False)
            pe_fastq_files[pe]['spades_assembly' + '__' + ss] = assmbl_path
        else:
            Log.wrn('SPAdes produced no transcripts.', False)

def min_accept_read_len(se_fastq_files, pe_fastq_files, dir_temp,
                        dir_cache_fq_minlen, vsearch):
    # lowest allowable
    low = 35

    if len(se_fastq_files) > 0 or len(pe_fastq_files) > 0:
        print()
        Log.inf('Calculating minimum acceptable read length.')
        if vsearch is None:
            Log.err('vsearch is not available. Cannot continue. Exiting.')
            exit(0)
    else:
        return None

    __ = opj(dir_cache_fq_minlen, 'minlen')

    pickled = {}
    if ope(__):
        with open(__, 'rb') as f:
            pickled = pickle.load(f)

    queue = []

    for se in se_fastq_files:
        src = se_fastq_files[se]['src']
        avg_len = se_fastq_files[se]['avg_len']
        if src == 'sra':
            ml = max(avg_len // 3, low)
            se_fastq_files[se]['min_acc_len'] = ml
            Log.msg(str(ml) + ' nt:', se)
            continue

        fq_path = se_fastq_files[se]['path']
        stats_file = opj(dir_temp, se + '_stats.txt')
        queue.append([se, fq_path, stats_file, 'se'])

    for pe in pe_fastq_files:
        src = pe_fastq_files[pe]['src']
        avg_len = pe_fastq_files[pe]['avg_len']
        if src == 'sra':
            ml = max(avg_len // 3, low)
            pe_fastq_files[pe]['min_acc_len'] = ml
            Log.msg(str(ml) + ' nt:', pe)
            continue

        fq_path = pe_fastq_files[pe]['path'][0]
        stats_file = opj(dir_temp, pe + '_stats.txt')
        queue.append([pe, fq_path, stats_file, 'pe'])

    for x in queue:

        if x[0] in pickled:
            ml = pickled[x[0]]
        else:
            # ----------------------------------------------------------------
            # Use 'vsearch --fastq_stats'. About 2x slower than the
            # approx_avg_read_len_fq function.
            #
            # cmd = [vsearch, '--fastq_stats', x[1], '--log', x[2]]
            # run(cmd, do_not_raise=True)
            # with open(x[2]) as f:
            #     stats = f.read()
            # remove(x[2])
            # ml = re.findall(r'>=\s+(\d+)', stats)
            # if len(ml) != 0:
            #     ml = max(int(ml[0]) // 3, low)
            # else:
            #     ml = None
            # ----------------------------------------------------------------
            # 22:59:12 50 nt: Hylocereus_polyrhizus_1195597_SRR7829961
            # 22:59:46 50 nt: Schlumbergera_truncata_15H-02_pol_S47       34s
            # 23:00:30 50 nt: Schlumbergera_truncata_15H-02_sty_S49       44s
            # ----------------------------------------------------------------

            # ----------------------------------------------------------------
            ml = approx_avg_read_len_fq(x[1])
            ml = max(int(ml) // 3, low)
            # ----------------------------------------------------------------
            # 23:12:06 50 nt: Hylocereus_polyrhizus_1195597_SRR7829961
            # 23:12:20 50 nt: Schlumbergera_truncata_15H-02_pol_S47       14s
            # 23:12:39 50 nt: Schlumbergera_truncata_15H-02_sty_S49       19s
            # ----------------------------------------------------------------

            pickled[x[0]] = ml

        if ml is not None:
            Log.msg(str(ml) + ' nt:', x[0])
        else:
            Log.msg(' ?' + ' nt:', x[0])
            ml = low

        if x[3] == 'se':
            se_fastq_files[x[0]]['min_acc_len'] = ml
        elif x[3] == 'pe':
            pe_fastq_files[x[0]]['min_acc_len'] = ml

    with open(__, 'wb') as f:
        pickle.dump(pickled, f, protocol=PICKLE_PROTOCOL)

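# Illustrative sketch (not part of the pipeline): the rule used above for the
# minimum acceptable read length, i.e. one third of the (approximate) average
# read length, but never below 35 nt. The average lengths are made-up values.
def _example_min_acc_len():
    low = 35
    return {avg_len: max(avg_len // 3, low) for avg_len in (75, 150, 301)}
    # -> {75: 35, 150: 50, 301: 100}
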
def run_rcorrector(se_fastq_files, pe_fastq_files, dir_fq_cor_data,
                   rcorrector, threads, dir_temp, should_run):
    if len(se_fastq_files) > 0 or len(pe_fastq_files) > 0:
        print()
        if should_run is False:
            Log.wrn('Skipping Rcorrector as requested.')
        else:
            Log.inf('Running Rcorrector.')
            if rcorrector is None:
                Log.err('Rcorrector is not available. Cannot continue. '
                        'Exiting.')
                exit(0)

    for se in se_fastq_files:
        dir_fq_cor_data_sample = opj(dir_fq_cor_data, se)
        fq_path = se_fastq_files[se]['path']
        r_mode, w_mode, a_mode, fqopen, ext = plain_or_gzip(fq_path)
        log_f = opj(dir_fq_cor_data_sample, se + '.txt')
        out_f = opj(dir_fq_cor_data_sample, se + '.fastq' + ext)

        se_fastq_files[se]['cor_path_fq'] = out_f

        if should_run is False:
            se_fastq_files[se]['cor_path_fq'] = fq_path
            continue

        if ope(dir_fq_cor_data_sample):
            Log.msg('Corrected FASTQ file already exists:', se)
        else:
            make_dirs(dir_fq_cor_data_sample)
            Log.msg('SE mode:', se)
            run_rcorrector_se(rcorrector=rcorrector,
                              in_file=fq_path,
                              out_dir=dir_fq_cor_data_sample,
                              threads=threads,
                              dir_temp=dir_temp)

            fq_base_path = opj(dir_fq_cor_data_sample, basename(fq_path))
            fq_cor_path = splitext_gz(fq_base_path)[0] + '.cor.fq' + ext

            filter_unc_se(in_file=fq_cor_path, out_file=out_f, log_file=log_f)

            remove(fq_cor_path)

    for pe in pe_fastq_files:
        dir_fq_cor_data_sample = opj(dir_fq_cor_data, pe)
        fq_path_1 = pe_fastq_files[pe]['path'][0]
        fq_path_2 = pe_fastq_files[pe]['path'][1]
        fq_path_3 = None
        out_f_3 = None
        r_mode, w_mode, a_mode, fqopen, ext = plain_or_gzip(fq_path_1)
        log_f = opj(dir_fq_cor_data_sample, pe + '.txt')
        out_f_1 = opj(dir_fq_cor_data_sample, pe + '_R1.fastq' + ext)
        out_f_2 = opj(dir_fq_cor_data_sample, pe + '_R2.fastq' + ext)

        pe_fastq_files[pe]['cor_path_fq'] = [out_f_1, out_f_2]

        if len(pe_fastq_files[pe]['path']) == 3:
            fq_path_3 = pe_fastq_files[pe]['path'][2]
            out_f_3 = opj(dir_fq_cor_data_sample, pe + '_R3.fastq' + ext)
            pe_fastq_files[pe]['cor_path_fq'].append(out_f_3)

        if should_run is False:
            pe_fastq_files[pe]['cor_path_fq'] = [fq_path_1, fq_path_2]
            if fq_path_3 is not None:
                pe_fastq_files[pe]['cor_path_fq'].append(fq_path_3)
            continue

        if ope(dir_fq_cor_data_sample):
            Log.msg('Corrected FASTQ files already exist:', pe)
        else:
            make_dirs(dir_fq_cor_data_sample)
            Log.msg('PE mode:', pe)
            run_rcorrector_pe(rcorrector=rcorrector,
                              in_file_1=fq_path_1,
                              in_file_2=fq_path_2,
                              out_dir=dir_fq_cor_data_sample,
                              threads=threads,
                              dir_temp=dir_temp)

            fq_base_path_1 = opj(dir_fq_cor_data_sample, basename(fq_path_1))
            fq_cor_path_1 = splitext_gz(fq_base_path_1)[0] + '.cor.fq' + ext
            fq_base_path_2 = opj(dir_fq_cor_data_sample, basename(fq_path_2))
            fq_cor_path_2 = splitext_gz(fq_base_path_2)[0] + '.cor.fq' + ext

            filter_unc_pe(in_file_1=fq_cor_path_1,
                          in_file_2=fq_cor_path_2,
                          out_file_1=out_f_1,
                          out_file_2=out_f_2,
                          log_file=log_f)

            remove(fq_cor_path_1)
            remove(fq_cor_path_2)

            if fq_path_3 is not None:

                Log.msg(
                    'SE mode (Paired-read SRA run contains unpaired reads):',
                    pe)

                run_rcorrector_se(rcorrector=rcorrector,
                                  in_file=fq_path_3,
                                  out_dir=dir_fq_cor_data_sample,
                                  threads=threads,
                                  dir_temp=dir_temp)

                fq_base_path_3 = opj(dir_fq_cor_data_sample,
                                     basename(fq_path_3))
                fq_cor_path_3 = splitext_gz(fq_base_path_3)[0] + '.cor.fq'
                log_f_3 = opj(dir_fq_cor_data_sample, pe + '_unpaired.txt')

                filter_unc_se(in_file=fq_cor_path_3, out_file=out_f_3,
                              log_file=log_f_3)

                remove(fq_cor_path_3)

def run_inter_pro_scan(ss, assemblies, email, dir_prj_ips, dir_cache_prj,
                       parallel_run_count, max_title_a_len, max_run_id_len):
    delay = 0.25

    for a in assemblies:

        if 'transcripts_aa_orf_fasta_file__' + ss not in a:
            continue

        aa_file = a['transcripts_aa_orf_fasta_file__' + ss]

        if aa_file is None:
            continue

        assmbl_name = a['name']

        json_dump_file_path = opj(dir_prj_ips, assmbl_name + '_ann_ips__' +
                                  ss + '.json')

        if ope(json_dump_file_path):
            Log.inf('InterProScan results for assembly ' + assmbl_name + ', '
                    'search strategy ' + ss + ' have already been '
                    'downloaded.')
            continue
        else:
            Log.inf('Running InterProScan on translated ' + ss + ' from ' +
                    assmbl_name + '.')

        seqs = seq_records_to_dict(read_fasta(aa_file, SEQ_TYPE_AA))

        # Filter all ORFs except the first one.
        for seq_def in tuple(seqs.keys()):
            seq_def_prefix = seq_def.split(' ')[0]
            if not seq_def_prefix.endswith('ORF001'):
                del seqs[seq_def]

        seqs = OrderedDict(sorted(seqs.items(),
                                  key=lambda x: x[0].split(' ')[1],
                                  reverse=True))

        run_id = ss + '_' + assmbl_name

        _ = opj(dir_cache_prj, 'ips5_cache_done_' + run_id)

        if ope(_):
            with open(_, 'rb') as f:
                jobs = pickle.load(f)
        else:
            jobs = job_runner(email=email,
                              dir_cache=dir_cache_prj,
                              seqs=seqs,
                              run_id=run_id,
                              parallel_run_count=parallel_run_count,
                              max_title_a_len=max_title_a_len,
                              max_run_id_len=max_run_id_len)

            with open(_, 'wb') as f:
                pickle.dump(jobs, f, protocol=PICKLE_PROTOCOL)

        Log.inf('Downloading InterProScan results for ' + ss + ' in ' +
                assmbl_name + '.')

        all_ips_results = {}

        # Nicer printing
        for i, job in enumerate(jobs['finished']):

            job_id = jobs['finished'][job]

            titles_ab = split_seq_defn(job)
            title_a = titles_ab[0]

            progress = round(((i + 1) / len(jobs['finished'])) * 100)
            progress_str = '{:3d}'.format(progress) + '%'

            msg = (' ' * 12 +
                   title_a.ljust(max_title_a_len) +
                   run_id.ljust(max_run_id_len) +
                   progress_str.rjust(4) + ' ' +
                   job_id)

            Log.msg(msg)

            sleep(delay)

            ips_json = result_json(job_id)
            if ips_json is None:
                continue
            # ips_version = ips_json['interproscan-version']
            ips_json = ips_json['results']

            # These fields are set to 'EMBOSS_001' by default
            # Delete them
            del ips_json[0]['xref']

            job_no_def = job.split(' ')[0]

            all_ips_results[job_no_def] = ips_json

        with open(json_dump_file_path, 'w') as f:
            json.dump(all_ips_results, f, sort_keys=True, indent=4)

        # Removes cached jobs file.
        osremove(_)

def run_bt2_fq(se_fastq_files, pe_fastq_files, dir_fq_filter_data, bowtie2,
               bowtie2_build, threads, dir_temp, bt2_order, fpatt, taxonomy,
               dir_cache_refseqs):

    new_se_fastq_files = dict()
    new_pe_fastq_files = dict()

    msg_printed = False

    # SE
    for se in se_fastq_files:

        taxid = se_fastq_files[se]['tax_id']

        dbs = _should_run_bt2(taxid, taxonomy, bt2_order, bowtie2,
                              bowtie2_build)

        in_f = se_fastq_files[se]['trim_path_fq']
        in_f_orig = in_f

        if len(dbs) == 0:
            se_fastq_files[se]['filter_path_fq'] = in_f
            continue

        if msg_printed is False:
            print()
            Log.inf('Running Bowtie2.')
            msg_printed = True

        for i, db in enumerate(dbs):

            db_path = dbs[db]

            dir_fq_bt_data_sample = opj(dir_fq_filter_data, se, db)
            dir_fq_bt_data_sample_un = opj(dir_fq_filter_data, se)

            new_se = se + '_' + db

            out_f = opj(dir_fq_bt_data_sample, new_se + '.fastq')
            out_f_un = opj(dir_temp, new_se + '_bt2_unaligned' + '.fastq')

            sam_f = opj(dir_fq_bt_data_sample, new_se + '.sam')
            new_se_fastq_files[new_se] = deepcopy(se_fastq_files[se])
            new_se_fastq_files[new_se]['path'] = None
            new_se_fastq_files[new_se]['cor_path_fq'] = None
            new_se_fastq_files[new_se]['trim_path_fq'] = None

            taxid = new_se_fastq_files[new_se]['tax_id']
            gc = new_se_fastq_files[new_se]['gc_id']
            if db == MT:
                gc = taxonomy.mito_genetic_code_for_taxid(taxid)
                new_se_fastq_files[new_se]['gc_id'] = gc
            elif db == PT:
                gc = taxonomy.plastid_genetic_code_for_taxid(taxid)
                new_se_fastq_files[new_se]['gc_id'] = gc
            new_se_fastq_files[new_se]['gc_tt'] = TranslationTable(gc)
            new_se_fastq_files[new_se]['filter_path_fq'] = out_f

            if ope(dir_fq_bt_data_sample):
                Log.msg('Bowtie2 filtered FASTQ file already exists:', new_se)
                in_f = opj(dir_fq_bt_data_sample_un, se + '.fastq')
            else:
                Log.msg('SE mode:', new_se)
                make_dirs(dir_fq_bt_data_sample)

                db_fasta_path = None
                bt2_idx_path = None

                if db_path in (MT, PT):
                    db_fasta_path = dnld_refseqs_for_taxid(
                        taxid, db, taxonomy, dir_cache_refseqs, query='',
                        db='nuccore')
                    bt2_idx_path = splitext(db_fasta_path)[0]
                else:
                    db_fasta_path = db_path
                    bt2_idx_path = opj(dir_cache_refseqs,
                                       splitext(basename(db_fasta_path))[0])

                if not ope(bt2_idx_path + '.1.bt2'):
                    build_bt2_index(bowtie2_build, [db_fasta_path],
                                    bt2_idx_path, threads)

                run_bowtie2_se(bowtie2=bowtie2,
                               input_file=in_f,
                               output_file=out_f,
                               output_file_un=out_f_un,
                               sam_output_file=sam_f,
                               index=bt2_idx_path,
                               threads=threads,
                               dir_temp=dir_temp)

                if i > 0:
                    remove(in_f)

                in_f = out_f_un

        out_f_un = opj(dir_fq_bt_data_sample_un, se + '.fastq')
        se_fastq_files[se]['filter_path_fq'] = out_f_un

        if in_f != in_f_orig:
            move(in_f, out_f_un)

    se_fastq_files.update(new_se_fastq_files)

    # PE
    for pe in pe_fastq_files:

        taxid = pe_fastq_files[pe]['tax_id']

        dbs = _should_run_bt2(taxid, taxonomy, bt2_order, bowtie2,
                              bowtie2_build)

        in_fs = pe_fastq_files[pe]['trim_path_fq']
        in_fs_orig = tuple(in_fs)

        if len(dbs) == 0:
            pe_fastq_files[pe]['filter_path_fq'] = in_fs
            continue

        if msg_printed is False:
            print()
            Log.inf('Running Bowtie2.')
            msg_printed = True

        for i, db in enumerate(dbs):

            db_path = dbs[db]

            dir_fq_bt_data_sample = opj(dir_fq_filter_data, pe, db)
            dir_fq_bt_data_sample_un = opj(dir_fq_filter_data, pe)

            new_pe = pe + '_' + db

            out_fs = [x.replace('@D@', dir_fq_bt_data_sample) for x in fpatt]
            out_fs = [x.replace('@N@', new_pe) for x in out_fs]

            out_fs_un = [x.replace('@D@', dir_temp) for x in fpatt]
            out_fs_un = [x.replace('@N@', new_pe + '_bt2_unaligned')
                         for x in out_fs_un]

            sam_f = opj(dir_fq_bt_data_sample, new_pe + '.sam')
            new_pe_fastq_files[new_pe] = deepcopy(pe_fastq_files[pe])
            new_pe_fastq_files[new_pe]['path'] = None
            new_pe_fastq_files[new_pe]['cor_path_fq'] = None
            new_pe_fastq_files[new_pe]['trim_path_fq'] = None

            taxid = new_pe_fastq_files[new_pe]['tax_id']
            gc = new_pe_fastq_files[new_pe]['gc_id']
            if db == MT:
                gc = taxonomy.mito_genetic_code_for_taxid(taxid)
                new_pe_fastq_files[new_pe]['gc_id'] = gc
            elif db == PT:
                gc = taxonomy.plastid_genetic_code_for_taxid(taxid)
                new_pe_fastq_files[new_pe]['gc_id'] = gc
            new_pe_fastq_files[new_pe]['gc_tt'] = TranslationTable(gc)
            new_pe_fastq_files[new_pe]['filter_path_fq'] = out_fs

            if ope(dir_fq_bt_data_sample):
                Log.msg('Bowtie2 filtered FASTQ files already exist:', new_pe)
                in_fs = [x.replace('@D@', dir_fq_bt_data_sample_un)
                         for x in fpatt]
                in_fs = [x.replace('@N@', pe) for x in in_fs]
            else:
                Log.msg('PE mode:', new_pe)
                make_dirs(dir_fq_bt_data_sample)

                db_fasta_path = None
                bt2_idx_path = None

                if db_path in (MT, PT):
                    db_fasta_path = dnld_refseqs_for_taxid(
                        taxid, db, taxonomy, dir_cache_refseqs, query='',
                        db='nuccore')
                    bt2_idx_path = splitext(db_fasta_path)[0]
                else:
                    db_fasta_path = db_path
                    bt2_idx_path = opj(dir_cache_refseqs,
                                       splitext(basename(db_fasta_path))[0])

                if not ope(bt2_idx_path + '.1.bt2'):
                    build_bt2_index(bowtie2_build, [db_fasta_path],
                                    bt2_idx_path, threads)

                paired_out_pattern = out_fs[0].replace('_paired_1.fastq',
                                                       '_paired_%.fastq')
                paired_out_pattern_un = out_fs_un[0].replace(
                    '_paired_1.fastq', '_paired_%.fastq')

                run_bowtie2_pe(bowtie2=bowtie2,
                               input_files=in_fs,
                               paired_out_pattern=paired_out_pattern,
                               paired_out_pattern_un=paired_out_pattern_un,
                               unpaired_out_1=out_fs[2],
                               unpaired_out_2=out_fs[3],
                               unpaired_out_1_un=out_fs_un[2],
                               unpaired_out_2_un=out_fs_un[3],
                               sam_output_file=sam_f,
                               index=bt2_idx_path,
                               threads=threads,
                               dir_temp=dir_temp)

                if i > 0:
                    remove(in_fs[0])
                    remove(in_fs[1])
                    remove(in_fs[2])
                    remove(in_fs[3])

                in_fs = out_fs_un

        out_fs_un = [x.replace('@D@', dir_fq_bt_data_sample_un) for x in fpatt]
        out_fs_un = [x.replace('@N@', pe) for x in out_fs_un]
        pe_fastq_files[pe]['filter_path_fq'] = out_fs_un

        if tuple(in_fs) != in_fs_orig:
            move(in_fs[0], out_fs_un[0])
            move(in_fs[1], out_fs_un[1])
            move(in_fs[2], out_fs_un[2])
            move(in_fs[3], out_fs_un[3])

    pe_fastq_files.update(new_pe_fastq_files)

def run_kraken2(order, dbs, se_fastq_files, pe_fastq_files,
                dir_fq_filter_data, confidence, kraken2, threads, dir_temp,
                fpatt):
    if (len(se_fastq_files) > 0 or len(pe_fastq_files) > 0) and \
            len(order) > 0:
        print()
        Log.inf('Running Kraken2.', 'Confidence: ' + str(confidence))
        if kraken2 is None:
            Log.err('kraken2 is not available. Cannot continue. Exiting.')
            exit(0)

    nuclear = None
    for nuc in order:
        if nuc[1] == 'nuclear':
            nuclear = nuc[0]
            break

    for se in se_fastq_files:

        if len(order) == 0:
            continue

        if se_fastq_files[se]['path'] is None:
            continue

        fq_path = se_fastq_files[se]['filter_path_fq']
        dir_fq_filter_data_sample = opj(dir_fq_filter_data, se)

        if nuclear is None:
            out_f = opj(dir_fq_filter_data_sample, se + '.fastq')
        else:
            out_f = opj(dir_fq_filter_data_sample, nuclear, se + '.fastq')

        se_fastq_files[se]['filter_path_fq'] = out_f

        if ope(dir_fq_filter_data_sample):
            Log.msg('Kraken2 filtered FASTQ files already exist:', se)
        else:
            make_dirs(dir_fq_filter_data_sample)
            print()
            Log.msg('SE mode:', se)
            run_kraken_filters(order=order,
                               dbs=dbs,
                               base_name=se,
                               in_files=fq_path,
                               dir_out=dir_fq_filter_data_sample,
                               confidence=confidence,
                               kraken2=kraken2,
                               threads=threads,
                               dir_temp=dir_temp)

    for pe in pe_fastq_files:

        if len(order) == 0:
            continue

        if pe_fastq_files[pe]['path'] is None:
            continue

        fq_path = pe_fastq_files[pe]['filter_path_fq']
        dir_fq_filter_data_sample = opj(dir_fq_filter_data, pe)

        if nuclear is None:
            dir_name_nuclear = dir_fq_filter_data_sample
        else:
            dir_name_nuclear = dir_fq_filter_data_sample + ops + nuclear

        out_fs = [x.replace('@D@', dir_name_nuclear) for x in fpatt]
        out_fs = [x.replace('@N@', pe) for x in out_fs]

        pe_fastq_files[pe]['filter_path_fq'] = out_fs

        if ope(dir_fq_filter_data_sample):
            Log.msg('Kraken2 filtered FASTQ files already exist:', pe)
        else:
            make_dirs(dir_fq_filter_data_sample)
            print()
            Log.msg('PE mode:', pe)
            run_kraken_filters(order=order,
                               dbs=dbs,
                               base_name=pe,
                               in_files=fq_path,
                               dir_out=dir_fq_filter_data_sample,
                               confidence=confidence,
                               kraken2=kraken2,
                               threads=threads,
                               dir_temp=dir_temp)

def dnld_sra_info(sras, dir_cache_prj):

    sra_runs_info = {}
    sras_acceptable = []

    if len(sras) > 0:
        print()
        Log.inf('Downloading SRA run information.')
    else:
        return sra_runs_info, sras_acceptable

    __ = opj(dir_cache_prj, 'sra_runs_info_cache')

    if ope(__):
        with open(__, 'rb') as f:
            sra_runs_info = pickle.load(f)

    sras_local = [k for k in sra_runs_info.keys()]
    sras_to_dnld = set(sras).difference(set(sras_local))
    if len(sras_to_dnld) > 0:
        temp = sra_run_info(list(sras_to_dnld))
        new_sra_runs_info = {i['Run']: i for i in temp}
        sra_runs_info.update(new_sra_runs_info)

    for sra in sras:

        if sra in sra_runs_info:

            info = sra_runs_info[sra]

            sra_lib_layout = info['LibraryLayout'].lower()
            sra_lib_source = info['LibrarySource'].lower()
            sra_lib_strategy = info['LibraryStrategy']
            sra_seq_platform = info['Platform'].lower().capitalize()
            sra_seq_platform_model = info['Model']
            sra_species = info['ScientificName']
            sra_taxid = info['TaxID']
            sra_spots = int(info['spots'])
            sra_spots_with_mates = int(info['spots_with_mates'])

            sample_base_name = (sra_species.replace(' ', '_') + '_' +
                                sra_taxid + '_' + sra)

            sra_runs_info[sra]['KakapoSampleBaseName'] = sample_base_name

            src_check = sra_lib_source.lower()
            strategy_check = sra_lib_strategy.lower()

            if not ('transcript' in src_check or
                    'rna' in src_check or
                    'rna' in strategy_check):

                sra_info_str = (
                    '{sra}: the SRA library source type "{ltype}" '
                    'or library strategy "{strategy}" '
                    'is not supported.').format(sra=sra,
                                                ltype=sra_lib_source,
                                                strategy=sra_lib_strategy)

                Log.err(sra_info_str, 'Skipping.')

            elif sra_seq_platform != 'Illumina':
                sra_info_str = ('{sra}: the SRA library sequencing platform '
                                '"{plat}" is not supported').format(
                                    sra=sra, plat=sra_seq_platform)

                Log.err(sra_info_str, 'Skipping.')

            else:
                # sra_info_str = ('SRA run {sra} {strategy} ({source}) '
                #                 '{layout}-end library.\n'
                #                 'Sourced from {species} '
                #                 '(TaxID: {txid}).\n'
                #                 'Sequenced using {platform} platform on '
                #                 '{model}.').format(
                #                     sra=sra,
                #                     source=sra_lib_source.title(),
                #                     strategy=sra_lib_strategy,
                #                     layout=sra_lib_layout,
                #                     platform=sra_seq_platform,
                #                     model=sra_seq_platform_model,
                #                     species=sra_species,
                #                     txid=sra_taxid)

                Log.msg(
                    '{sra}:'.format(sra=sra),
                    '{strategy} {layout}-end library ({source}).'.format(
                        strategy=sra_lib_strategy,
                        layout=sra_lib_layout,
                        source=sra_lib_source.title()))

                Log.msg(
                    '    Source:',
                    '{species} (TaxID: {txid}).'.format(species=sra_species,
                                                        txid=sra_taxid),
                    False)

                Log.msg(
                    'Technology:',
                    '{platform} platform on {model}.'.format(
                        platform=sra_seq_platform,
                        model=sra_seq_platform_model),
                    False)

                sra_runs_info[sra]['KakapoLibraryLayout'] = \
                    sra_runs_info[sra]['LibraryLayout']

                if sra_lib_layout == 'paired' and sra_spots_with_mates == 0:
                    sra_runs_info[sra]['KakapoLibraryLayout'] = 'SINGLE'
                    # sra_info_str = (
                    #     sra_info_str + '\nListed as containing '
                    #     'paired-end reads, but only a single set of reads '
                    #     'is available. Treating as single-ended.')

                elif (sra_lib_layout == 'paired' and
                      sra_spots != sra_spots_with_mates):
                    sra_runs_info[sra]['KakapoLibraryLayout'] = 'PAIRED_UNP'
                    # sra_info_str = (
                    #     sra_info_str + '\nListed as containing '
                    #     'paired-end reads, but not all reads are paired.')

                sras_acceptable.append(sra)

                # Log.msg(sra_info_str)

    with open(__, 'wb') as f:
        pickle.dump(sra_runs_info, f, protocol=PICKLE_PROTOCOL)

    return sra_runs_info, sras_acceptable

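# Illustrative sketch (not part of the pipeline): how dnld_sra_info() derives
# KakapoSampleBaseName from the SRA run metadata. The species, TaxID and run
# accession are made-up example values.
def _example_sra_sample_base_name(sra_species='Solanum lycopersicum',
                                  sra_taxid='4081', sra='SRR0000000'):
    return sra_species.replace(' ', '_') + '_' + sra_taxid + '_' + sra
    # -> 'Solanum_lycopersicum_4081_SRR0000000'
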
def run_tblastn_on_assemblies(ss, assemblies, aa_queries_file, tblastn,
                              dir_prj_assmbl_blast_results, blast_2_evalue,
                              blast_2_max_hsps, blast_2_qcov_hsp_perc,
                              blast_2_best_hit_overhang,
                              blast_2_best_hit_score_edge,
                              blast_2_max_target_seqs, threads, dir_cache_prj,
                              dir_prj_ips):
    if len(assemblies) > 0:
        print()
        Log.inf('Running BLAST on assemblies:', ss)
        if tblastn is None:
            Log.err('tblastn is not available. Cannot continue. Exiting.')
            exit(0)
    else:
        Log.wrn('There are no assemblies. Nothing to do, stopping.')
        exit(0)

    cache_file = opj(dir_cache_prj, 'blast_2_settings_cache__' + ss)

    pickled = dict()
    settings = {'blast_2_evalue': blast_2_evalue,
                'blast_2_max_hsps': blast_2_max_hsps,
                'blast_2_qcov_hsp_perc': blast_2_qcov_hsp_perc,
                'blast_2_best_hit_overhang': blast_2_best_hit_overhang,
                'blast_2_best_hit_score_edge': blast_2_best_hit_score_edge,
                'blast_2_max_target_seqs': blast_2_max_target_seqs,
                'queries': seq_records_to_dict(
                    read_fasta(aa_queries_file, SEQ_TYPE_AA))}

    Log.msg('evalue:', str(blast_2_evalue))
    Log.msg('max_hsps:', str(blast_2_max_hsps))
    Log.msg('qcov_hsp_perc:', str(blast_2_qcov_hsp_perc))
    Log.msg('best_hit_overhang:', str(blast_2_best_hit_overhang))
    Log.msg('best_hit_score_edge:', str(blast_2_best_hit_score_edge))
    Log.msg('max_target_seqs:', str(blast_2_max_target_seqs))
    print()

    for a in assemblies:

        assmbl_src = a['src']
        assmbl_name = a['name']

        if assmbl_src != 'user_fasta':
            if assmbl_name.endswith('__' + ss):
                assmbl_name = assmbl_name.replace('__' + ss, '')
            else:
                continue

        assmbl_blast_db_path = a['blast_db_path']
        assmbl_genetic_code = a['gc_id']

        ips_json_dump_path = opj(dir_prj_ips, assmbl_name + '_ann_ips__' +
                                 ss + '.json')

        _ = opj(dir_prj_assmbl_blast_results,
                assmbl_name + '__' + ss + '.tsv')

        if ope(_) and ope(cache_file):
            with open(cache_file, 'rb') as f:
                pickled = pickle.load(f)

        if ope(_) and pickled == settings:
            # Log.msg('The provided BLAST settings and query sequences did '
            #         'not change since the previous run.')
            Log.msg('BLAST results already exist:', assmbl_name)
        else:
            Log.msg('Running tblastn on: ' + assmbl_name, ss)

            if ope(ips_json_dump_path):
                osremove(ips_json_dump_path)

            run_blast(exec_file=tblastn,
                      task='tblastn',
                      threads=threads,
                      db_path=assmbl_blast_db_path,
                      queries_file=aa_queries_file,
                      out_file=_,
                      evalue=blast_2_evalue,
                      max_hsps=blast_2_max_hsps,
                      qcov_hsp_perc=blast_2_qcov_hsp_perc,
                      best_hit_overhang=blast_2_best_hit_overhang,
                      best_hit_score_edge=blast_2_best_hit_score_edge,
                      max_target_seqs=blast_2_max_target_seqs,
                      db_genetic_code=assmbl_genetic_code,
                      out_cols=BLST_RES_COLS_2)

        a['blast_hits_aa__' + ss] = parse_blast_results_file(
            _, BLST_RES_COLS_2)

    with open(cache_file, 'wb') as f:
        pickle.dump(settings, f, protocol=PICKLE_PROTOCOL)

def download_kraken2_dbs(dbs_path):
    base_kraken2_url = 'ftp://ftp.ccb.jhu.edu/pub/data/kraken2_dbs/'

    msg_prefix = 'Downloading Kraken2 database: '

    # ------------------------------------------------------------------------
    base = '16S_Silva132_20200326'
    url = base_kraken2_url + base + '.tgz'
    tgz = opj(dbs_path, base + '.tgz')
    # ToDo: Use pattern matching for the directory name.
    #       Instead of using 16S_SILVA132_k2db -> 16S_SILVA132
    p_orig = opj(dbs_path, '16S_SILVA132_k2db')
    db_name = '16S_Silva132'
    p_new = opj(dbs_path, db_name)
    if not ope(p_new):
        Log.msg(msg_prefix + db_name)
        download_file(url=url, local_path=tgz, protocol='ftp')
        tar_ref = tarfile.open(tgz, 'r:gz')
        tar_ref.extractall(dbs_path)
        tar_ref.close()
        remove(tgz)
        move(p_orig, p_new)

    # ------------------------------------------------------------------------
    base = '16S_Silva138_20200326'
    url = base_kraken2_url + base + '.tgz'
    tgz = opj(dbs_path, base + '.tgz')
    # ToDo: Use pattern matching for the directory name.
    #       Instead of using 16S_SILVA138_k2db -> 16S_SILVA138
    p_orig = opj(dbs_path, '16S_SILVA138_k2db')
    db_name = '16S_Silva138'
    p_new = opj(dbs_path, db_name)
    if not ope(p_new):
        Log.msg(msg_prefix + db_name)
        download_file(url=url, local_path=tgz, protocol='ftp')
        tar_ref = tarfile.open(tgz, 'r:gz')
        tar_ref.extractall(dbs_path)
        tar_ref.close()
        remove(tgz)
        move(p_orig, p_new)

    # ------------------------------------------------------------------------
    base = 'minikraken_8GB_202003'
    url = base_kraken2_url + base + '.tgz'
    tgz = opj(dbs_path, base + '.tgz')
    # ToDo: Use pattern matching for the directory name.
    #       Instead of using minikraken_8GB_20200312 -> minikraken
    p_orig = opj(dbs_path, 'minikraken_8GB_20200312')
    db_name = 'minikraken_8GB_2020-03-12'
    p_new = opj(dbs_path, db_name)
    if not ope(p_new):
        Log.msg(msg_prefix + db_name)
        download_file(url=url, local_path=tgz, protocol='ftp')
        tar_ref = tarfile.open(tgz, 'r:gz')
        tar_ref.extractall(dbs_path)
        tar_ref.close()
        remove(tgz)
        move(p_orig, p_new)

    # ------------------------------------------------------------------------
    base_dropbox_url = 'https://www.dropbox.com/s/'
    base = 'mitochondrion_and_plastid'
    garb = 'vkbp7iys6s76tvf/'
    url = base_dropbox_url + garb + base + '.tar.gz?dl=1'
    tgz = opj(dbs_path, base + '.tar.gz')
    p = opj(dbs_path, base)
    if not ope(p):
        Log.msg(msg_prefix + base)
        download_file(url=url, local_path=tgz, protocol='http')
        tar_ref = tarfile.open(tgz, 'r:gz')
        tar_ref.extractall(dbs_path)
        tar_ref.close()
        remove(tgz)

    # ------------------------------------------------------------------------
    base_dropbox_url = 'https://www.dropbox.com/s/'
    base = 'mitochondrion'
    garb = '6liwneb26uvjuec/'
    url = base_dropbox_url + garb + base + '.tar.gz?dl=1'
    tgz = opj(dbs_path, base + '.tar.gz')
    p = opj(dbs_path, base)
    if not ope(p):
        Log.msg(msg_prefix + base)
        download_file(url=url, local_path=tgz, protocol='http')
        tar_ref = tarfile.open(tgz, 'r:gz')
        tar_ref.extractall(dbs_path)
        tar_ref.close()
        remove(tgz)

    # ------------------------------------------------------------------------
    base_dropbox_url = 'https://www.dropbox.com/s/'
    base = 'plastid'
    garb = 's9vdg4mxrfy1szn/'
    url = base_dropbox_url + garb + base + '.tar.gz?dl=1'
    tgz = opj(dbs_path, base + '.tar.gz')
    p = opj(dbs_path, base)
    if not ope(p):
        Log.msg(msg_prefix + base)
        download_file(url=url, local_path=tgz, protocol='http')
        tar_ref = tarfile.open(tgz, 'r:gz')
        tar_ref.extractall(dbs_path)
        tar_ref.close()
        remove(tgz)

    # ------------------------------------------------------------------------
    base_dropbox_url = 'https://www.dropbox.com/s/'
    base = 'viral'
    garb = '7xz31c7vw088n27/'
    url = base_dropbox_url + garb + base + '.tar.gz?dl=1'
    tgz = opj(dbs_path, base + '.tar.gz')
    p = opj(dbs_path, base)
    if not ope(p):
        Log.msg(msg_prefix + base)
        download_file(url=url, local_path=tgz, protocol='http')
        tar_ref = tarfile.open(tgz, 'r:gz')
        tar_ref.extractall(dbs_path)
        tar_ref.close()
        remove(tgz)

    # ------------------------------------------------------------------------
    dbs_available, e = list_of_dirs_at_path(dbs_path)
    kraken2_dbs = {basename(p): p for p in dbs_available}

    return kraken2_dbs