def dep_check_blast(dir_dep, os_id, dist_id, debian_dists, redhat_dists,
                    force):
    if os_id == 'mac':
        url = ('https://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/2.10.1/'
               'ncbi-blast-2.10.1+-x64-macosx.tar.gz')
    elif os_id == 'linux':
        if dist_id in debian_dists:
            url = ('https://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/'
                   '2.10.1/ncbi-blast-2.10.1+-x64-linux.tar.gz')
        elif dist_id in redhat_dists:
            url = ('https://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/'
                   '2.10.1/ncbi-blast-2.10.1+-x64-linux.tar.gz')

    dnld_path = opj(dir_dep, 'ncbi-blast.tar.gz')

    makeblastdb = None
    blastn = None
    tblastn = None

    try:
        if force is True:
            raise
        makeblastdb = which('makeblastdb')
        blastn = which('blastn')
        tblastn = which('tblastn')
        run([makeblastdb, '-help'])
    except Exception:
        try:
            dir_bin = opj(dir_dep, get_dep_dir(dir_dep, 'ncbi-blast'))
            makeblastdb = opj(dir_bin, 'bin', 'makeblastdb')
            blastn = opj(dir_bin, 'bin', 'blastn')
            tblastn = opj(dir_bin, 'bin', 'tblastn')
            run([makeblastdb, '-help'])
        except Exception:
            Log.wrn('BLAST+ was not found on this system, trying to '
                    'download.')
            download_file(url, dnld_path)
            tar_ref = tarfile.open(dnld_path, 'r:gz')
            tar_ref.extractall(dir_dep)
            tar_ref.close()
            dir_bin = opj(dir_dep, get_dep_dir(dir_dep, 'ncbi-blast'))
            makeblastdb = opj(dir_bin, 'bin', 'makeblastdb')
            blastn = opj(dir_bin, 'bin', 'blastn')
            tblastn = opj(dir_bin, 'bin', 'tblastn')
            if not ope(makeblastdb) or \
                    not ope(blastn) or \
                    not ope(tblastn):
                Log.err('Could not download BLAST+.')
                return None, None, None

    regexp = r'\sblast\s([\d\.]*)'

    v = get_dep_version([makeblastdb, '-version'], regexp)
    Log.msg('makeblastdb is available:', v + ' ' + makeblastdb)
    v = get_dep_version([blastn, '-version'], regexp)
    Log.msg('blastn is available:', v + ' ' + blastn)
    v = get_dep_version([tblastn, '-version'], regexp)
    Log.msg('tblastn is available:', v + ' ' + tblastn)

    return makeblastdb, blastn, tblastn
def _parse_taxa(taxa, tax_group, taxonomy, config_file_path):
    txids = list()

    for tax in taxa:
        if tax.isdigit():
            txids.append(int(tax))
        else:
            # tax_orig = tax
            txid = taxonomy.tax_id_for_name_and_group_tax_id(
                name=tax, group_tax_id=tax_group)
            if txid is None:
                txid = taxonomy.tax_id_for_name_and_group_tax_id(
                    name=tax.split(' ')[0], group_tax_id=tax_group)
            if txid is None:
                txids.append(txid)
                msg = 'NCBI taxonomy ID could not be found for:'
                Log.wrn(msg, tax)
                # replace_line_in_file(
                #     file_path=config_file_path,
                #     line_str=tax_orig,
                #     replace_str='; NCBI taxid not found: ' + tax)
            else:
                txids.append(int(txid))
                msg = 'NCBI taxonomy ID for ' + tax + ' is:'
                Log.msg(msg, str(txid))
                # replace_line_in_file(
                #     file_path=config_file_path,
                #     line_str=tax_orig,
                #     replace_str='; ' + tax + '\n' + str(txid))

    return txids
def dnld_refseqs_for_taxid(taxid, filter_term, taxonomy, dir_cache_refseqs,
                           query='', db='nuccore'):
    ft = None
    if filter_term == 'plastid':
        ft = '("chloroplast"[filter] OR "plastid"[filter])'
    else:
        ft = '("' + filter_term + '"[filter])'

    tax_terms = tuple(reversed(taxonomy.lineage_for_taxid(taxid)['names']))
    for tax_term in tax_terms:
        if tax_term is None:
            tax_term = taxonomy.scientific_name_for_taxid(taxid)
        term = '"RefSeq"[Keyword] AND "{}"[Primary Organism] AND {}'.format(
            tax_term, ft)
        term = query + term
        accs = set(accs_eutil(search_eutil(db, term)))
        if len(accs) > 0:
            plural = 'sequences'
            if len(accs) == 1:
                plural = 'sequence'
            Log.msg('Found {} RefSeq {} {} for'.format(
                len(accs), filter_term, plural), tax_term)
            # Random sample ###################################################
            if len(accs) > 10:
                Log.wrn('Using a random sample of ten RefSeq sequences.')
                random.seed(a=len(accs), version=2)
                # Sample from a sorted list so the same ten accessions are
                # drawn on every run; random.sample() also no longer accepts
                # sets in newer Python versions.
                accs = set(random.sample(sorted(accs), 10))
            ###################################################################
            break
        else:
            Log.wrn('No RefSeq {} sequences were found for'.format(
                filter_term), tax_term)

    cache_path = opj(
        dir_cache_refseqs,
        filter_term + '__' + tax_term.replace(' ', '_') + '.fasta')
    parsed_fasta_cache = {}
    if ope(cache_path):
        parsed_fasta_cache = read_fasta(cache_path, seq_type=SEQ_TYPE_NT,
                                        def_to_first_space=True)
        parsed_fasta_cache = seq_records_to_dict(parsed_fasta_cache)
        for acc in parsed_fasta_cache:
            if acc in accs:
                accs.remove(acc)
    if len(accs) > 0:
        parsed_fasta = dnld_ncbi_seqs(db, list(accs))
        parsed_fasta = seq_records_to_dict(parsed_fasta, prepend_acc=True)
        parsed_fasta.update(parsed_fasta_cache)
        write_fasta(parsed_fasta, cache_path)

    return cache_path
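# Illustrative sketch (not part of the original pipeline): the deterministic
# subsampling idea used in dnld_refseqs_for_taxid() above. Seeding the RNG
# with the collection size and sampling from a sorted list means the same ten
# accessions come back on every rerun. The function name is hypothetical.
def _example_deterministic_sample(accessions, k=10):
    import random
    if len(accessions) <= k:
        return sorted(accessions)
    random.seed(a=len(accessions), version=2)
    return random.sample(sorted(accessions), k)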
def dep_check_bowtie2(dir_dep, os_id, force):
    if os_id == 'mac':
        url = ('https://sourceforge.net/projects/bowtie-bio/files/bowtie2/'
               '2.4.1/bowtie2-2.4.1-macos-x86_64.zip/download')
    elif os_id == 'linux':
        url = ('https://sourceforge.net/projects/bowtie-bio/files/bowtie2/'
               '2.4.1/bowtie2-2.4.1-linux-x86_64.zip/download')

    dnld_path = opj(dir_dep, 'bowtie2.zip')

    try:
        if force is True:
            raise
        bowtie2 = which('bowtie2')
        bowtie2_build = which('bowtie2-build')
        run([bowtie2, '-h'])
        run([bowtie2_build, '-h'])
    except Exception:
        try:
            dir_bin = opj(dir_dep, get_dep_dir(dir_dep, 'bowtie2'))
            bowtie2 = opj(dir_bin, 'bowtie2')
            bowtie2_build = opj(dir_bin, 'bowtie2-build')
            run([bowtie2, '-h'])
            run([bowtie2_build, '-h'])
        except Exception:
            Log.wrn('Bowtie 2 was not found on this system, trying to '
                    'download.')
            download_file(url, dnld_path)
            zip_ref = zipfile.ZipFile(dnld_path, 'r')
            zip_ref.extractall(dir_dep)
            zip_ref.close()
            dir_bin = opj(dir_dep, get_dep_dir(dir_dep, 'bowtie2'))
            bowtie2 = opj(dir_bin, 'bowtie2')
            bowtie2_build = opj(dir_bin, 'bowtie2-build')
            bowtie2_execs = ('', '-align-l', '-align-l-debug', '-align-s',
                             '-align-s-debug', '-build', '-build-l',
                             '-build-l-debug', '-build-s', '-build-s-debug',
                             '-inspect', '-inspect-l', '-inspect-l-debug',
                             '-inspect-s', '-inspect-s-debug')
            for bt2exe in bowtie2_execs:
                chmod(bowtie2 + bt2exe,
                      stat.S_IRWXU | stat.S_IRGRP | stat.S_IXGRP |
                      stat.S_IROTH | stat.S_IXOTH)
            if not ope(bowtie2):
                Log.err('Could not download Bowtie 2.')
                return None, None

    regexp = r'^.*?version\s([\d\.]*)'

    v = get_dep_version([bowtie2, '--version'], regexp)
    Log.msg('bowtie2 is available:', v + ' ' + bowtie2)
    v = get_dep_version([bowtie2_build, '--version'], regexp)
    Log.msg('bowtie2-build is available:', v + ' ' + bowtie2_build)

    return bowtie2, bowtie2_build
def find_api_key() -> str:
    global_variables = globals()
    if 'ENTREZ_KEY' in global_variables:
        api_key = global_variables['ENTREZ_KEY']
    elif 'ENTREZ_KEY' in os.environ:
        api_key = os.environ['ENTREZ_KEY']
    else:
        Log.wrn('Warning:', 'ENTREZ_KEY is not defined.')
        api_key = None
    return api_key
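# Illustrative sketch (not part of the original module): one way a caller can
# supply the NCBI Entrez API key that find_api_key() looks for. The value
# shown is a placeholder, not a real key, and the function name is
# hypothetical.
def _example_provide_entrez_key():
    os.environ.setdefault('ENTREZ_KEY', 'replace-with-your-ncbi-api-key')
    return find_api_key()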
def dep_check_sra_toolkit(dir_dep, os_id, dist_id, debian_dists, redhat_dists,
                          force):
    if os_id == 'mac':
        url = ('https://ftp-trace.ncbi.nlm.nih.gov/sra/sdk/2.10.8/'
               'sratoolkit.2.10.8-mac64.tar.gz')
    elif os_id == 'linux':
        if dist_id in debian_dists:
            url = ('https://ftp-trace.ncbi.nlm.nih.gov/sra/sdk/2.10.8/'
                   'sratoolkit.2.10.8-ubuntu64.tar.gz')
        elif dist_id in redhat_dists:
            url = ('https://ftp-trace.ncbi.nlm.nih.gov/sra/sdk/2.10.8/'
                   'sratoolkit.2.10.8-centos_linux64.tar.gz')

    dnld_path = opj(dir_dep, 'sra-toolkit.tar.gz')

    fasterq_dump = None
    try:
        if force is True:
            raise
        fasterq_dump = which('fasterq-dump')
        # Parent of the "bin" directory that contains fasterq-dump.
        dir_bin = dirname(dirname(fasterq_dump))
        _ensure_vdb_cfg(dir_bin)
        run(fasterq_dump)
    except Exception:
        try:
            dir_bin = opj(dir_dep, get_dep_dir(dir_dep, 'sratoolkit'))
            _ensure_vdb_cfg(dir_bin)
            fasterq_dump = opj(dir_bin, 'bin', 'fasterq-dump')
            run(fasterq_dump)
        except Exception:
            Log.wrn('SRA Toolkit was not found on this system, trying to '
                    'download.')
            download_file(url, dnld_path)
            tar_ref = tarfile.open(dnld_path, 'r:gz')
            tar_ref.extractall(dir_dep)
            tar_ref.close()
            dir_bin = opj(dir_dep, get_dep_dir(dir_dep, 'sratoolkit'))
            fasterq_dump = opj(dir_bin, 'bin', 'fasterq-dump')
            _ensure_vdb_cfg(dir_bin)
            if not ope(fasterq_dump):
                Log.err('Could not download SRA Toolkit.')
                return None

    v = get_dep_version([fasterq_dump, '--version'], r':\s([\d\.]*)')
    if v == '?':
        v = get_dep_version([fasterq_dump, '--version'],
                            r'version\s([\d\.]*)')
    Log.msg('fasterq-dump is available:', v + ' ' + fasterq_dump)

    return fasterq_dump
def dep_check_kakapolib(force=False, quiet=False):
    kkpl = KAKAPOLIB
    if not ope(kkpl):
        if quiet is False:
            Log.wrn('Compiling kakapolib.')
        run(['make', 'install'], cwd=DIR_C_SRC)
    if ope(kkpl):
        if quiet is False:
            Log.msg('kakapolib is available:', kkpl)
    else:
        Log.err('Compilation of kakapolib failed.')
        return None
    return ctypes.CDLL(kkpl)
def dep_check_seqtk(dir_dep, force):
    url = 'https://github.com/lh3/seqtk/archive/master.zip'
    dnld_path = opj(dir_dep, 'seqtk.zip')
    dir_bin = opj(dir_dep, 'seqtk-master')

    fp = NamedTemporaryFile()
    fp.write(str.encode('>seq' + lns + 'ATGC'))
    fp.seek(0)
    cmd = ['', 'seq', '-r', fp.name]

    try:
        if force is True:
            raise
        seqtk = which('seqtk')
        cmd[0] = seqtk
        run(cmd, do_not_raise=True)
    except Exception:
        try:
            seqtk = opj(dir_bin, 'seqtk')
            cmd[0] = seqtk
            run(cmd, do_not_raise=True)
        except Exception:
            Log.wrn('Seqtk was not found on this system, trying to download.')
            download_file(url, dnld_path)
            zip_ref = zipfile.ZipFile(dnld_path, 'r')
            zip_ref.extractall(dir_dep)
            zip_ref.close()
            try:
                Log.wrn('Compiling Seqtk.')
                run('make', cwd=dir_bin)
                run(cmd, do_not_raise=True)
            except Exception:
                replace_line_in_file(opj(dir_bin, 'Makefile'),
                                     'CC=gcc', 'CC=cc')
                try:
                    run('make', cwd=dir_bin)
                    run(cmd, do_not_raise=True)
                except Exception:
                    Log.err('Something went wrong while trying to compile '
                            'Seqtk.')
                    Log.msg('Try downloading and installing it manually '
                            'from: https://github.com/lh3/seqtk')
                    fp.close()
                    return None

    fp.close()

    v = get_dep_version([seqtk], r'Version\:\s([\d\w\.\-]*)')
    Log.msg('Seqtk is available:', v + ' ' + seqtk)

    return seqtk
def dep_check_vsearch(dir_dep, os_id, dist_id, debian_dists, redhat_dists,
                      force):
    if os_id == 'mac':
        url = ('https://github.com/torognes/vsearch/releases/download/v2.15.0/'
               'vsearch-2.15.0-macos-x86_64.tar.gz')
    elif os_id == 'linux':
        if dist_id in debian_dists:
            url = ('https://github.com/torognes/vsearch/releases/download/'
                   'v2.15.0/vsearch-2.15.0-linux-x86_64.tar.gz')
        elif dist_id in redhat_dists:
            url = ('https://github.com/torognes/vsearch/releases/download/'
                   'v2.15.0/vsearch-2.15.0-linux-x86_64.tar.gz')

    dnld_path = opj(dir_dep, 'vsearch.tar.gz')

    try:
        if force is True:
            raise
        vsearch = which('vsearch')
        run(vsearch)
    except Exception:
        try:
            dir_bin = opj(dir_dep, get_dep_dir(dir_dep, 'vsearch'))
            vsearch = opj(dir_bin, 'bin', 'vsearch')
            run(vsearch)
        except Exception:
            Log.wrn('Vsearch was not found on this system, trying to '
                    'download.')
            download_file(url, dnld_path)
            tar_ref = tarfile.open(dnld_path, 'r:gz')
            tar_ref.extractall(dir_dep)
            tar_ref.close()
            try:
                dir_bin = opj(dir_dep, get_dep_dir(dir_dep, 'vsearch'))
                vsearch = opj(dir_bin, 'bin', 'vsearch')
                if not ope(vsearch):
                    Log.err('Could not download Vsearch.')
                    return None
                else:
                    run(vsearch)
            except Exception:
                Log.err('Vsearch was downloaded, but does not execute.')
                Log.msg('Try downloading and installing it manually from: '
                        'https://github.com/torognes/vsearch')
                return None

    v = get_dep_version([vsearch, '-version'], r'vsearch\sv([\d\.]*)')
    Log.msg('Vsearch is available:', v + ' ' + vsearch)

    return vsearch
def dep_check_spades(dir_dep, os_id, force):
    if os_id == 'mac':
        url = ('http://cab.spbu.ru/files/release3.14.1/'
               'SPAdes-3.14.1-Darwin.tar.gz')
    elif os_id == 'linux':
        url = ('http://cab.spbu.ru/files/release3.14.1/'
               'SPAdes-3.14.1-Linux.tar.gz')

    dnld_path = opj(dir_dep, 'SPAdes.tar.gz')

    try:
        if force is True:
            raise
        spades = which('spades.py')
        run([PY3, spades])
    except Exception:
        try:
            dir_bin = opj(dir_dep, get_dep_dir(dir_dep, 'SPAdes'))
            spades = opj(dir_bin, 'bin', 'spades.py')
            run([PY3, spades])
        except Exception:
            Log.wrn('SPAdes was not found on this system, trying to '
                    'download.')
            try:
                download_file(url, dnld_path)
                tar_ref = tarfile.open(dnld_path, 'r:gz')
                tar_ref.extractall(dir_dep)
                tar_ref.close()
            except Exception:
                Log.err('Could not download SPAdes.')
                return None
            try:
                dir_bin = opj(dir_dep, get_dep_dir(dir_dep, 'SPAdes'))
                spades = opj(dir_bin, 'bin', 'spades.py')
                # replace_line_in_file(spades,
                #                      '#!/usr/bin/env python',
                #                      '#!/usr/bin/env python3')
                if ope(spades):
                    run([PY3, spades])
                else:
                    Log.err('Could not download SPAdes.')
                    return None
            except Exception:
                Log.err('SPAdes was downloaded, but does not execute.')
                return None

    v = get_dep_version([PY3, spades, '--version'], r'^.*SPAdes.*v([\d\.]*)')
    Log.msg('SPAdes is available:', v + ' ' + spades)

    return spades
def run_spades(se_fastq_files, pe_fastq_files, dir_spades_assemblies, spades,
               dir_temp, ss, threads, ram):
    if len(se_fastq_files) > 0 or len(pe_fastq_files) > 0:
        if spades is None:
            Log.err('SPAdes is not available. Cannot continue. Exiting.')
            exit(0)

    for se in se_fastq_files:
        dir_results = opj(dir_spades_assemblies, se + '__' + ss)
        fq_path = se_fastq_files[se]['vsearch_results_path' + '__' + ss]
        se_fastq_files[se]['spades_assembly' + '__' + ss] = None

        if ope(dir_results):
            Log.msg('SPAdes assembly already exists:', se)
        else:
            make_dirs(dir_results)
            Log.msg('Running SPAdes on:', se)
            run_spades_se(spades,
                          out_dir=dir_results,
                          input_file=fq_path,
                          threads=threads,
                          memory=ram,
                          rna=True)

        assmbl_path = opj(dir_results, 'transcripts.fasta')
        if ope(assmbl_path):
            count = len(read_fasta(assmbl_path, SEQ_TYPE_NT))
            tr_str = ' transcripts.'
            if count == 1:
                tr_str = ' transcript.'
            Log.msg('SPAdes produced ' + str(count) + tr_str, False)
            se_fastq_files[se]['spades_assembly' + '__' + ss] = assmbl_path
        else:
            Log.wrn('SPAdes produced no transcripts.', False)

    for pe in pe_fastq_files:
        dir_results = opj(dir_spades_assemblies, pe + '__' + ss)
        fq_paths = pe_fastq_files[pe]['vsearch_results_path' + '__' + ss]
        pe_fastq_files[pe]['spades_assembly' + '__' + ss] = None

        if ope(dir_results):
            Log.msg('SPAdes assembly already exists:', pe)
        else:
            make_dirs(dir_results)
            Log.msg('Running SPAdes on: ' + pe)

            if osstat(fq_paths[0]).st_size > 0 and \
                    osstat(fq_paths[1]).st_size > 0:
                run_spades_pe(spades,
                              out_dir=dir_results,
                              input_files=fq_paths,
                              threads=threads,
                              memory=ram,
                              rna=True)
            else:
                _ = opj(dir_temp, 'temp.fasta')
                combine_text_files(fq_paths, _)
                run_spades_se(spades,
                              out_dir=dir_results,
                              input_file=_,
                              threads=threads,
                              memory=ram,
                              rna=True)
                osremove(_)

        assmbl_path = opj(dir_results, 'transcripts.fasta')
        if ope(assmbl_path):
            count = len(read_fasta(assmbl_path, SEQ_TYPE_NT))
            tr_str = ' transcripts.'
            if count == 1:
                tr_str = ' transcript.'
            Log.msg('SPAdes produced ' + str(count) + tr_str, False)
            pe_fastq_files[pe]['spades_assembly' + '__' + ss] = assmbl_path
        else:
            Log.wrn('SPAdes produced no transcripts.', False)
def run_kraken_filters(order, dbs, base_name, in_files, dir_out, confidence, kraken2, threads, dir_temp): dbs_ordered = OrderedDict() for dbn in order: db_name = dbn[0] if db_name in dbs: dbs_ordered[db_name] = dbs[db_name] else: Log.wrn('Kraken2 database not found:', db_name) # SE if isinstance(in_files, (str, bytes)): in_file = in_files _, in_file_ext, _ = splitext_gz(in_file) for i, db in enumerate(dbs_ordered): Log.msg('Filtering SE reads using Kraken2 database:', db) dir_out_db = opj(dir_out, db) make_dirs(dir_out_db) report_file = opj(dir_out_db, base_name + '.txt') out_class_file = opj(dir_out_db, base_name + in_file_ext) out_unclass_file = opj( dir_temp, base_name + '_' + db + '_kraken2_unclassified' + in_file_ext) if stat(in_file ).st_size > 0: # Kraken2 freaks out if the file is empty. run_kraken_se(kraken=kraken2, db=dbs_ordered[db], in_file=in_file, out_class_file=out_class_file, out_unclass_file=out_unclass_file, report_file=report_file, confidence=confidence, threads=threads, dir_temp=dir_temp) else: copyfile(in_file, out_class_file) copyfile(in_file, out_unclass_file) if i > 0: remove(in_file) in_file = out_unclass_file move(in_file, opj(dir_out, base_name + in_file_ext)) # PE elif isinstance(in_files, (list, tuple)): assert len(in_files) > 1 _, in_file_ext, _ = splitext_gz(in_files[0]) in_file_R1 = in_files[0] in_file_R2 = in_files[1] if len(in_files) > 2: in_file = in_files[2] for i, db in enumerate(dbs_ordered): Log.msg( 'Filtering unpaired forward reads using Kraken2 database:', db) dir_out_db = opj(dir_out, db) make_dirs(dir_out_db) report_file = opj(dir_out_db, base_name + '_unpaired_1.txt') out_class_file = opj(dir_out_db, base_name + '_unpaired_1' + in_file_ext) out_unclass_file = opj( dir_temp, base_name + '_' + db + '_kraken2_unclassified' + in_file_ext) if stat(in_file).st_size > 0: run_kraken_se(kraken=kraken2, db=dbs_ordered[db], in_file=in_file, out_class_file=out_class_file, out_unclass_file=out_unclass_file, report_file=report_file, confidence=confidence, threads=threads, dir_temp=dir_temp) else: copyfile(in_file, out_class_file) copyfile(in_file, out_unclass_file) if i > 0: remove(in_file) in_file = out_unclass_file move(in_file, opj(dir_out, base_name + '_unpaired_1' + in_file_ext)) if len(in_files) == 4: in_file = in_files[3] for i, db in enumerate(dbs_ordered): Log.msg( 'Filtering unpaired reverse reads using Kraken2 database:', db) dir_out_db = opj(dir_out, db) make_dirs(dir_out_db) report_file = opj(dir_out_db, base_name + '_unpaired_2.txt') out_class_file = opj(dir_out_db, base_name + '_unpaired_2' + in_file_ext) out_unclass_file = opj( dir_temp, base_name + '_' + db + '_kraken2_unclassified' + in_file_ext) if stat(in_file).st_size > 0: run_kraken_se(kraken=kraken2, db=dbs_ordered[db], in_file=in_file, out_class_file=out_class_file, out_unclass_file=out_unclass_file, report_file=report_file, confidence=confidence, threads=threads, dir_temp=dir_temp) else: copyfile(in_file, out_class_file) copyfile(in_file, out_unclass_file) if i > 0: remove(in_file) in_file = out_unclass_file move(in_file, opj(dir_out, base_name + '_unpaired_2' + in_file_ext)) for i, db in enumerate(dbs_ordered): Log.msg('Filtering paired reads using Kraken2 database:', db) dir_out_db = opj(dir_out, db) make_dirs(dir_out_db) report_file = opj(dir_out_db, base_name + '_paired.txt') out_class_file = opj(dir_out_db, base_name + '_paired#' + in_file_ext) out_unclass_file = opj( dir_temp, base_name + '_' + db + '_kraken2_unclassified' + '_paired#' + in_file_ext) if stat(in_file_R1).st_size 
> 0 and stat(in_file_R2).st_size > 0: run_kraken_pe(kraken=kraken2, db=dbs_ordered[db], in_file_1=in_file_R1, in_file_2=in_file_R2, out_class_file=out_class_file, out_unclass_file=out_unclass_file, report_file=report_file, confidence=confidence, threads=threads, dir_temp=dir_temp) else: copyfile(in_file_R1, out_class_file.replace('#', '_1')) copyfile(in_file_R2, out_class_file.replace('#', '_2')) copyfile(in_file_R1, out_unclass_file.replace('#', '_1')) copyfile(in_file_R2, out_unclass_file.replace('#', '_2')) if i > 0: remove(in_file_R1) remove(in_file_R2) in_file_R1 = out_unclass_file.replace('#', '_1') in_file_R2 = out_unclass_file.replace('#', '_2') move(in_file_R1, opj(dir_out, base_name + '_paired_1' + in_file_ext)) move(in_file_R2, opj(dir_out, base_name + '_paired_2' + in_file_ext))
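# Minimal sketch (not part of the original module) of the chaining pattern
# run_kraken_filters() implements above: each database sees only the reads
# that every earlier database left unclassified. The filter_once callable is
# a hypothetical stand-in for one Kraken2 run.
def _example_chained_filtering(read_ids, databases, filter_once):
    """filter_once(db, ids) must return (classified_ids, unclassified_ids)."""
    remaining = list(read_ids)
    classified_by_db = {}
    for db in databases:
        classified, remaining = filter_once(db, remaining)
        classified_by_db[db] = classified
    return classified_by_db, remaining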
def run_rcorrector(se_fastq_files, pe_fastq_files, dir_fq_cor_data, rcorrector, threads, dir_temp, should_run): if len(se_fastq_files) > 0 or len(pe_fastq_files) > 0: print() if should_run is False: Log.wrn('Skipping Rcorrector as requested.') else: Log.inf('Running Rcorrector.') if rcorrector is None: Log.err('Rcorrector is not available. Cannot continue. Exiting.') exit(0) for se in se_fastq_files: dir_fq_cor_data_sample = opj(dir_fq_cor_data, se) fq_path = se_fastq_files[se]['path'] r_mode, w_mode, a_mode, fqopen, ext = plain_or_gzip(fq_path) log_f = opj(dir_fq_cor_data_sample, se + '.txt') out_f = opj(dir_fq_cor_data_sample, se + '.fastq' + ext) se_fastq_files[se]['cor_path_fq'] = out_f if should_run is False: se_fastq_files[se]['cor_path_fq'] = fq_path continue if ope(dir_fq_cor_data_sample): Log.msg('Corrected FASTQ file already exists:', se) else: make_dirs(dir_fq_cor_data_sample) Log.msg('SE mode:', se) run_rcorrector_se(rcorrector=rcorrector, in_file=fq_path, out_dir=dir_fq_cor_data_sample, threads=threads, dir_temp=dir_temp) fq_base_path = opj(dir_fq_cor_data_sample, basename(fq_path)) fq_cor_path = splitext_gz(fq_base_path)[0] + '.cor.fq' + ext filter_unc_se(in_file=fq_cor_path, out_file=out_f, log_file=log_f) remove(fq_cor_path) for pe in pe_fastq_files: dir_fq_cor_data_sample = opj(dir_fq_cor_data, pe) fq_path_1 = pe_fastq_files[pe]['path'][0] fq_path_2 = pe_fastq_files[pe]['path'][1] fq_path_3 = None out_f_3 = None r_mode, w_mode, a_mode, fqopen, ext = plain_or_gzip(fq_path_1) log_f = opj(dir_fq_cor_data_sample, pe + '.txt') out_f_1 = opj(dir_fq_cor_data_sample, pe + '_R1.fastq' + ext) out_f_2 = opj(dir_fq_cor_data_sample, pe + '_R2.fastq' + ext) pe_fastq_files[pe]['cor_path_fq'] = [out_f_1, out_f_2] if len(pe_fastq_files[pe]['path']) == 3: fq_path_3 = pe_fastq_files[pe]['path'][2] out_f_3 = opj(dir_fq_cor_data_sample, pe + '_R3.fastq' + ext) pe_fastq_files[pe]['cor_path_fq'].append(out_f_3) if should_run is False: pe_fastq_files[pe]['cor_path_fq'] = [fq_path_1, fq_path_2] if fq_path_3 is not None: pe_fastq_files[pe]['cor_path_fq'].append(fq_path_3) continue if ope(dir_fq_cor_data_sample): Log.msg('Corrected FASTQ files already exist:', pe) else: make_dirs(dir_fq_cor_data_sample) Log.msg('PE mode:', pe) run_rcorrector_pe(rcorrector=rcorrector, in_file_1=fq_path_1, in_file_2=fq_path_2, out_dir=dir_fq_cor_data_sample, threads=threads, dir_temp=dir_temp) fq_base_path_1 = opj(dir_fq_cor_data_sample, basename(fq_path_1)) fq_cor_path_1 = splitext_gz(fq_base_path_1)[0] + '.cor.fq' + ext fq_base_path_2 = opj(dir_fq_cor_data_sample, basename(fq_path_2)) fq_cor_path_2 = splitext_gz(fq_base_path_2)[0] + '.cor.fq' + ext filter_unc_pe(in_file_1=fq_cor_path_1, in_file_2=fq_cor_path_2, out_file_1=out_f_1, out_file_2=out_f_2, log_file=log_f) remove(fq_cor_path_1) remove(fq_cor_path_2) if fq_path_3 is not None: Log.msg( 'SE mode (Paired-read SRA run contains unpaired reads):', pe) run_rcorrector_se(rcorrector=rcorrector, in_file=fq_path_3, out_dir=dir_fq_cor_data_sample, threads=threads, dir_temp=dir_temp) fq_base_path_3 = opj(dir_fq_cor_data_sample, basename(fq_path_3)) fq_cor_path_3 = splitext_gz(fq_base_path_3)[0] + '.cor.fq' log_f_3 = opj(dir_fq_cor_data_sample, pe + '_unpaired.txt') filter_unc_se(in_file=fq_cor_path_3, out_file=out_f_3, log_file=log_f_3) remove(fq_cor_path_3)
def dnld_sra_fastq_files(sras, sra_runs_info, dir_fq_data, fasterq_dump, threads, dir_temp): if len(sras) > 0: if fasterq_dump is None: Log.err('fasterq-dump from SRA Toolkit is not available. ' + 'Cannot continue. Exiting.') exit(0) print() Log.inf('Downloading SRA read data.') se_fastq_files = {} pe_fastq_files = {} for sra in sras: sra_run_info = sra_runs_info[sra] sra_lib_layout = sra_run_info['LibraryLayout'].lower() sra_lib_layout_k = sra_run_info['KakapoLibraryLayout'].lower() sample_base_name = sra_run_info['KakapoSampleBaseName'] sra_taxid = int(sra_run_info['TaxID']) avg_len = int(sra_run_info['avgLength']) sra_dnld_needed = False if sra_lib_layout == 'single' or sra_lib_layout_k == 'single': se_file = opj(dir_fq_data, sra + '.fastq') se_fastq_files[sample_base_name] = {'path': se_file} se_fastq_files[sample_base_name]['src'] = 'sra' se_fastq_files[sample_base_name]['avg_len'] = avg_len se_fastq_files[sample_base_name]['tax_id'] = sra_taxid if not ope(se_file): sra_dnld_needed = True elif sra_lib_layout == 'paired': pe_file_1 = opj(dir_fq_data, sra + '_1.fastq') pe_file_2 = opj(dir_fq_data, sra + '_2.fastq') pe_file_1_renamed = opj(dir_fq_data, sra + '_R1.fastq') pe_file_2_renamed = opj(dir_fq_data, sra + '_R2.fastq') pe_fastq_files[sample_base_name] = { 'path': [pe_file_1_renamed, pe_file_2_renamed] } pe_fastq_files[sample_base_name]['src'] = 'sra' pe_fastq_files[sample_base_name]['avg_len'] = avg_len // 2 pe_fastq_files[sample_base_name]['tax_id'] = sra_taxid if sra_lib_layout_k == 'paired_unp': pe_file_3 = opj(dir_fq_data, sra + '.fastq') pe_file_3_renamed = opj(dir_fq_data, sra + '_R3.fastq') pe_fastq_files[sample_base_name]['path'].append( pe_file_3_renamed) if not ope(pe_file_1_renamed) or not ope(pe_file_2_renamed): sra_dnld_needed = True if not sra_dnld_needed: Log.msg('FASTQ reads are available locally:', sample_base_name) retry_count = 0 while sra_dnld_needed: if retry_count > 50: Log.err('Download failed. Exiting.') rmtree(dir_temp) exit(1) elif retry_count > 0: Log.wrn('Download failed. Retrying.') sleep(2) retry_count += 1 Log.msg('Downloading FASTQ reads for:', sample_base_name) cmd = [ fasterq_dump, '--threads', str(threads * 2), '--split-3', '--bufsize', '819200', '--outdir', dir_fq_data, '--temp', dir_temp, sra ] run(cmd, do_not_raise=True) if sra_lib_layout == 'single' or sra_lib_layout_k == 'single': if not ope(se_file): continue elif sra_lib_layout == 'paired': if not ope(pe_file_1) or not ope(pe_file_2): continue else: move(pe_file_1, pe_file_1_renamed) move(pe_file_2, pe_file_2_renamed) if sra_lib_layout_k == 'paired_unp': if not ope(pe_file_3): continue else: move(pe_file_3, pe_file_3_renamed) sra_dnld_needed = False if sra_lib_layout == 'single' or sra_lib_layout_k == 'single': if ope(se_file): Log.msg('Renaming FASTQ reads in:', se_file) rename_fq_seqs(se_file, sra, '1:N:0') elif sra_lib_layout == 'paired': if ope(pe_file_1_renamed): Log.msg('Renaming FASTQ reads in:', pe_file_1_renamed) rename_fq_seqs(pe_file_1_renamed, sra, '1:N:0') if ope(pe_file_2_renamed): Log.msg('Renaming FASTQ reads in:', pe_file_2_renamed) rename_fq_seqs(pe_file_2_renamed, sra, '2:N:0') if sra_lib_layout_k == 'paired_unp': if ope(pe_file_3_renamed): Log.msg('Renaming FASTQ reads in:', pe_file_3_renamed) rename_fq_seqs(pe_file_3_renamed, sra + '_unpaired', '1:N:0') return se_fastq_files, pe_fastq_files, sra_runs_info
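# Generic sketch (not part of the original module) of the bounded-retry loop
# dnld_sra_fastq_files() uses above; the real code reruns fasterq-dump up to
# 50 times with a short pause between attempts. The attempt_download callable
# is a hypothetical stand-in that returns True on success.
def _example_bounded_retry(attempt_download, attempts=50, wait_seconds=2):
    from time import sleep
    for _ in range(attempts):
        if attempt_download():
            return True
        sleep(wait_seconds)
    return False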
def dep_check_kraken2(dir_dep, os_id, release_name, force): url = 'https://github.com/karolisr/kraken2/archive/master.tar.gz' dnld_path = opj(dir_dep, 'kraken2.tar.gz') try: if force is True: raise kraken2 = which('kraken2') kraken2_build = which('kraken2-build') dir_bin = dirname(kraken2) classify_bin = opj(dir_bin, 'classify') _ = run([classify_bin], do_not_raise=True) if not _.stderr.startswith('classify: mandatory filename'): raise run([kraken2, '--help']) run([kraken2_build, '--help']) except Exception: try: dir_bin = opj(dir_dep, get_dep_dir(dir_dep, 'kraken2')) kraken2 = opj(dir_bin, 'bin', 'kraken2') kraken2_build = opj(dir_bin, 'bin', 'kraken2-build') classify_bin = opj(dir_bin, 'bin', 'classify') _ = run([classify_bin], do_not_raise=True) if not _.stderr.startswith('classify: mandatory filename'): raise run([kraken2, '--help']) run([kraken2_build, '--help']) except Exception: Log.wrn('Kraken2 was not found on this system, trying to ' 'download.') if ope(dnld_path): remove(dnld_path) download_file(url, dnld_path) tar_ref = tarfile.open(dnld_path, 'r:gz') tar_ref.extractall(dir_dep) tar_ref.close() dir_bin = opj(dir_dep, get_dep_dir(dir_dep, 'kraken2')) classify_bin = opj(dir_bin, 'bin', 'classify') kraken2 = opj(dir_bin, 'bin', 'kraken2') kraken2_build = opj(dir_bin, 'bin', 'kraken2-build') makefile = opj(dir_bin, 'src', 'Makefile') replace_line_in_file(makefile, 'cp $(PROGS) $(KRAKEN2_DIR)/', 'cp $(PROGS) "$(KRAKEN2_DIR)"/') try: Log.wrn('Compiling Kraken2 Attempt 1') run(['./install_kraken2.sh', 'bin'], cwd=dir_bin) _ = run([classify_bin], do_not_raise=True) if not _.stderr.startswith('classify: mandatory filename'): raise run([kraken2, '--help']) run([kraken2_build, '--help']) except Exception: try: Log.wrn('Compiling Kraken2 Attempt 2') dir_libomp = opj(dir_dep, 'libomp') if ope(dir_libomp): rmtree(dir_libomp) libomp_fp, v = brew_get('libomp', os_id, release_name, dir_dep) tar_ref = tarfile.open(libomp_fp, 'r:gz') tar_ref.extractall(dir_dep) tar_ref.close() dir_libomp_l = opj(dir_libomp, v, 'lib') dir_libomp_i = opj(dir_libomp, v, 'include') if os_id == 'mac': # Changes the shared library identification name of a # dynamic shared library. 
dylib_f = opj(dir_libomp_l, 'libomp.dylib') chmod( dylib_f, stat.S_IRWXU | stat.S_IRUSR | stat.S_IWUSR | stat.S_IRGRP | stat.S_IWGRP | stat.S_IROTH | stat.S_IWOTH) cmd = ['install_name_tool', '-id', dylib_f, dylib_f] run(cmd) cxx_flags = ('CXXFLAGS = -L{} -I{} -Xpreprocessor ' '-fopenmp -lomp -Wall -std=c++11 -O3') elif os_id == 'linux': cxx_flags = ('CXXFLAGS = -L{} -I{} -fopenmp -lomp ' '-static -Wall -std=c++11 -O3') cxx_flags = cxx_flags.format(dir_libomp_l, dir_libomp_i) makefile = opj(dir_bin, 'src', 'Makefile') replace_line_in_file( makefile, 'CXXFLAGS = -fopenmp -Wall -std=c++11' ' -O3', cxx_flags) run(['./install_kraken2.sh', 'bin'], cwd=dir_bin) _ = run([classify_bin], do_not_raise=True) if not _.stderr.startswith('classify: mandatory filename'): raise run([kraken2, '--help']) run([kraken2_build, '--help']) except Exception: try: Log.wrn('Compiling Kraken2 Attempt 3') makefile = opj(dir_bin, 'src', 'Makefile') replace_line_in_file( makefile, cxx_flags, 'CXXFLAGS = -Wall -std=c++11 -O3') run(['./install_kraken2.sh', 'bin'], cwd=dir_bin) _ = run([classify_bin], do_not_raise=True) if not _.stderr.startswith( 'classify: mandatory filename'): raise run([kraken2, '--help']) run([kraken2_build, '--help']) except Exception: pass if not ope(kraken2): Log.err('Something went wrong while trying to compile ' 'Kraken2.') Log.msg('Try downloading and installing it manually from: ' 'https://github.com/karolisr/kraken2') return None, None regexp = r'^.*?version\s([\d\.\-A-Za-z]*)' v = get_dep_version([kraken2, '--version'], regexp) Log.msg('kraken2 is available:', v + ' ' + kraken2) v = get_dep_version([kraken2_build, '--version'], regexp) Log.msg('kraken2-build is available:', v + ' ' + kraken2_build) return kraken2, kraken2_build
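# Minimal sketch (not part of the original module) of the install probe used
# in dep_check_kraken2() above: the classify binary is run with no arguments
# and its stderr prefix decides whether the build is usable. run_probe is a
# hypothetical stand-in for the pipeline's run(..., do_not_raise=True) helper.
def _example_kraken2_install_ok(classify_bin, run_probe):
    result = run_probe([classify_bin])
    return result.stderr.startswith('classify: mandatory filename')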
def run_rcorrector(se_fastq_files, pe_fastq_files, dir_fq_cor_data, rcorrector, threads, dir_temp, fpatt, should_run): if len(se_fastq_files) > 0 or len(pe_fastq_files) > 0: print() if should_run is False: Log.wrn('Skipping Rcorrector as requested.') else: Log.inf('Running Rcorrector.') if rcorrector is None: Log.err( 'Rcorrector is not available. Cannot continue. Exiting.') exit(0) for se in se_fastq_files: dir_fq_cor_data_sample = opj(dir_fq_cor_data, se) fq_path = se_fastq_files[se]['trim_path_fq'] r_mode, w_mode, a_mode, fqopen, ext = plain_or_gzip(fq_path) log_f = opj(dir_fq_cor_data_sample, se + '.txt') out_f = opj(dir_fq_cor_data_sample, se + '.fastq' + ext) se_fastq_files[se]['cor_path_fq'] = out_f if should_run is False: se_fastq_files[se]['cor_path_fq'] = fq_path continue if ope(dir_fq_cor_data_sample): Log.msg('Corrected FASTQ file already exists:', se) else: make_dirs(dir_fq_cor_data_sample) Log.msg('SE mode:', se) run_rcorrector_se(rcorrector=rcorrector, in_file=fq_path, out_dir=dir_fq_cor_data_sample, threads=threads, dir_temp=dir_temp) fq_base_path = opj(dir_fq_cor_data_sample, basename(fq_path)) fq_cor_path = splitext_gz(fq_base_path)[0] + '.cor.fq' + ext filter_unc_se(in_file=fq_cor_path, out_file=out_f, log_file=log_f) remove(fq_cor_path) for pe in pe_fastq_files: dir_fq_cor_data_sample = opj(dir_fq_cor_data, pe) fq_path_1 = pe_fastq_files[pe]['trim_path_fq'][0] fq_path_2 = pe_fastq_files[pe]['trim_path_fq'][1] fq_path_3 = pe_fastq_files[pe]['trim_path_fq'][2] fq_path_4 = pe_fastq_files[pe]['trim_path_fq'][3] r_mode, w_mode, a_mode, fqopen, ext = plain_or_gzip(fq_path_1) log_f = opj(dir_fq_cor_data_sample, pe + '_paired.txt') out_fs = [x.replace('@D@', dir_fq_cor_data_sample) for x in fpatt] out_fs = [x.replace('@N@', pe) for x in out_fs] out_fs = [x + ext for x in out_fs] pe_fastq_files[pe]['cor_path_fq'] = out_fs if should_run is False: pe_fastq_files[pe]['cor_path_fq'] = [ fq_path_1, fq_path_2, fq_path_3, fq_path_4 ] continue if ope(dir_fq_cor_data_sample): Log.msg('Corrected FASTQ files already exist:', pe) else: make_dirs(dir_fq_cor_data_sample) Log.msg('PE mode:', pe) run_rcorrector_pe(rcorrector=rcorrector, in_file_1=fq_path_1, in_file_2=fq_path_2, out_dir=dir_fq_cor_data_sample, threads=threads, dir_temp=dir_temp) fq_base_path_1 = opj(dir_fq_cor_data_sample, basename(fq_path_1)) fq_cor_path_1 = splitext_gz(fq_base_path_1)[0] + '.cor.fq' + ext fq_base_path_2 = opj(dir_fq_cor_data_sample, basename(fq_path_2)) fq_cor_path_2 = splitext_gz(fq_base_path_2)[0] + '.cor.fq' + ext filter_unc_pe(in_file_1=fq_cor_path_1, in_file_2=fq_cor_path_2, out_file_1=out_fs[0], out_file_2=out_fs[1], log_file=log_f) remove(fq_cor_path_1) remove(fq_cor_path_2) # unpaired 1 if stat(fq_path_3).st_size != 0: run_rcorrector_se(rcorrector=rcorrector, in_file=fq_path_3, out_dir=dir_fq_cor_data_sample, threads=threads, dir_temp=dir_temp) fq_base_path_3 = opj(dir_fq_cor_data_sample, basename(fq_path_3)) fq_cor_path_3 = splitext_gz( fq_base_path_3)[0] + '.cor.fq' + ext log_f_3 = opj(dir_fq_cor_data_sample, pe + '_unpaired_1.txt') filter_unc_se(in_file=fq_cor_path_3, out_file=out_fs[2], log_file=log_f_3) remove(fq_cor_path_3) else: with open(out_fs[2], 'w') as f: f.write('') # unpaired 2 if stat(fq_path_4).st_size != 0: run_rcorrector_se(rcorrector=rcorrector, in_file=fq_path_4, out_dir=dir_fq_cor_data_sample, threads=threads, dir_temp=dir_temp) fq_base_path_4 = opj(dir_fq_cor_data_sample, basename(fq_path_4)) fq_cor_path_4 = splitext_gz( fq_base_path_4)[0] + '.cor.fq' + ext log_f_4 = 
opj(dir_fq_cor_data_sample, pe + '_unpaired_2.txt') filter_unc_se(in_file=fq_cor_path_4, out_file=out_fs[3], log_file=log_f_4) remove(fq_cor_path_4) else: with open(out_fs[3], 'w') as f: f.write('')
def main(): """Run the script.""" # Prepare initial logger (before we know the log file path) -------------- prj_log_file_suffix = time_stamp() + '.log' log_stream = StringIO() Log.set_colors(COLORS) Log.set_file(log_stream) Log.set_write(True) # Prepare configuration directory ---------------------------------------- if ope(DIR_CFG): Log.inf('Found configuration directory:', DIR_CFG) else: Log.wrn('Creating configuration directory:', DIR_CFG) make_dirs(DIR_CFG) print() # Check for dependencies ------------------------------------------------- Log.inf('Checking for dependencies.') make_dirs(DIR_DEP) make_dirs(DIR_KRK) seqtk = deps.dep_check_seqtk(DIR_DEP, FORCE_DEPS) trimmomatic, adapters = deps.dep_check_trimmomatic(DIR_DEP) fasterq_dump = deps.dep_check_sra_toolkit(DIR_DEP, OS_ID, DIST_ID, DEBIAN_DISTS, REDHAT_DISTS, FORCE_DEPS) makeblastdb, _, tblastn = deps.dep_check_blast(DIR_DEP, OS_ID, DIST_ID, DEBIAN_DISTS, REDHAT_DISTS, FORCE_DEPS) vsearch = deps.dep_check_vsearch(DIR_DEP, OS_ID, DIST_ID, DEBIAN_DISTS, REDHAT_DISTS, FORCE_DEPS) spades = deps.dep_check_spades(DIR_DEP, OS_ID, FORCE_DEPS) bowtie2, bowtie2_build = deps.dep_check_bowtie2(DIR_DEP, OS_ID, FORCE_DEPS) rcorrector = deps.dep_check_rcorrector(DIR_DEP, FORCE_DEPS) kraken2, kraken2_build = deps.dep_check_kraken2(DIR_DEP, OS_ID, RELEASE_NAME, FORCE_DEPS) print() kraken2_dbs = deps.dnld_kraken2_dbs(DIR_KRK) if INSTALL_DEPS is True or DNLD_KRAKEN_DBS is True: exit(0) print() # Initialize NCBI taxonomy database -------------------------------------- tax = Taxonomy() if tax.is_initialized() is False: tax.init(data_dir_path=DIR_TAX, logger=Log) print() # Parse configuration file ----------------------------------------------- Log.inf('Reading configuration file:', CONFIG_FILE_PATH) _ = config_file_parse(CONFIG_FILE_PATH, tax) allow_no_stop_cod = _['allow_no_stop_cod'] allow_no_strt_cod = _['allow_no_strt_cod'] allow_non_aug = _['allow_non_aug'] blast_1_evalue = _['blast_1_evalue'] blast_1_max_hsps = _['blast_1_max_hsps'] blast_1_qcov_hsp_perc = _['blast_1_qcov_hsp_perc'] blast_1_best_hit_overhang = _['blast_1_best_hit_overhang'] blast_1_best_hit_score_edge = _['blast_1_best_hit_score_edge'] blast_1_max_target_seqs = _['blast_1_max_target_seqs'] blast_2_evalue = _['blast_2_evalue'] blast_2_max_hsps = _['blast_2_max_hsps'] blast_2_qcov_hsp_perc = _['blast_2_qcov_hsp_perc'] blast_2_best_hit_overhang = _['blast_2_best_hit_overhang'] blast_2_best_hit_score_edge = _['blast_2_best_hit_score_edge'] blast_2_max_target_seqs = _['blast_2_max_target_seqs'] dir_out = _['output_directory'] email = _['email'] requery_after = _['requery_after'] fq_pe = _['fq_pe'] fq_se = _['fq_se'] should_run_rcorrector = _['should_run_rcorrector'] should_run_ipr = _['should_run_ipr'] bt2_order = _['bt2_order'] kraken_confidence = _['kraken_confidence'] krkn_order = _['krkn_order'] prepend_assmbl = _['prepend_assmbl'] prj_name = _['project_name'] sras = _['sras'] tax_group = _['tax_group'] # tax_group_name = _['tax_group_name'] tax_ids_user = _['tax_ids'] user_assemblies = _['assmbl'] print() # Parse search strategies file ------------------------------------------- if SS_FILE_PATH is not None: Log.inf('Reading search strategies file:', SS_FILE_PATH) sss = ss_file_parse(SS_FILE_PATH) else: Log.wrn('Search strategies file was not provided.\n' + 'Will process reads, assemblies and then stop.') sss = dict() print() # Create output directory ------------------------------------------------ if dir_out is not None: if ope(dir_out): Log.inf('Found output directory:', 
dir_out) else: Log.wrn('Creating output directory:', dir_out) make_dirs(dir_out) print() # Write Kakapo version information to the output directory --------------- version_file = opj(dir_out, 'kakapo_version.txt') if ope(version_file): with open(version_file, 'r') as f: version_prev = f.read().strip() if __version__ != version_prev: Log.wrn('The output directory contains data produced by a ' + 'different version of Kakapo: ' + version_prev + '.\nThe currently running version is: ' + __version__ + '.\n' + 'Delete "kakapo_version.txt" file located in the ' + 'output directory if you would like to continue.') exit(0) with open(version_file, 'w') as f: f.write(__version__) # Create subdirectories in the output directory -------------------------- _ = prepare_output_directories(dir_out, prj_name) dir_temp = _['dir_temp'] dir_cache_pfam_acc = _['dir_cache_pfam_acc'] dir_cache_fq_minlen = _['dir_cache_fq_minlen'] dir_cache_prj = _['dir_cache_prj'] dir_cache_refseqs = _['dir_cache_refseqs'] dir_prj_logs = _['dir_prj_logs'] dir_prj_queries = _['dir_prj_queries'] dir_fq_data = _['dir_fq_data'] dir_fq_cor_data = _['dir_fq_cor_data'] dir_fq_trim_data = _['dir_fq_trim_data'] dir_fq_filter_bt2_data = _['dir_fq_filter_bt2_data'] dir_fq_filter_krkn2_data = _['dir_fq_filter_krkn2_data'] dir_fa_trim_data = _['dir_fa_trim_data'] dir_blast_fa_trim = _['dir_blast_fa_trim'] dir_prj_blast_results_fa_trim = _['dir_prj_blast_results_fa_trim'] dir_prj_vsearch_results_fa_trim = _['dir_prj_vsearch_results_fa_trim'] dir_prj_spades_assemblies = _['dir_prj_spades_assemblies'] dir_prj_blast_assmbl = _['dir_prj_blast_assmbl'] dir_prj_assmbl_blast_results = _['dir_prj_assmbl_blast_results'] dir_prj_transcripts = _['dir_prj_transcripts'] dir_prj_ips = _['dir_prj_ips'] dir_prj_transcripts_combined = _['dir_prj_transcripts_combined'] # Prepare logger --------------------------------------------------------- prj_log_file = opj(dir_prj_logs, prj_name + '_' + prj_log_file_suffix) with open(prj_log_file, 'w') as f: f.write(SCRIPT_INFO.strip() + '\n\n' + log_stream.getvalue()) Log.set_colors(COLORS) Log.set_file(prj_log_file) Log.set_write(True) log_stream.close() # Resolve descending taxonomy nodes -------------------------------------- tax_ids = tax.all_descending_taxids_for_taxids([tax_group]) # Pfam uniprot accessions ------------------------------------------------ pfam_uniprot_acc = OrderedDict() for ss in sss: pfam_acc = sss[ss]['pfam_families'] pfam_uniprot_acc[ss] = pfam_uniprot_accessions(ss, pfam_acc, tax_ids, dir_cache_pfam_acc) # Download Pfam uniprot sequences if needed ------------------------------ aa_uniprot_files = OrderedDict() for ss in sss: aa_uniprot_files[ss] = opj(dir_prj_queries, 'aa_uniprot__' + ss + '.fasta') # ToDo: add support for the requery_after parameter. 
dnld_pfam_uniprot_seqs(ss, pfam_uniprot_acc[ss], aa_uniprot_files[ss], dir_cache_prj) # User provided entrez query --------------------------------------------- prot_acc_user_from_query = OrderedDict() for ss in sss: entrez_queries = sss[ss]['entrez_search_queries'] prot_acc_user_from_query[ss] = user_entrez_search( ss, entrez_queries, dir_cache_prj, requery_after) # User provided protein accessions --------------------------------------- prot_acc_user = OrderedDict() for ss in sss: print() prot_acc_all = sorted( set(sss[ss]['ncbi_accessions_aa'] + prot_acc_user_from_query[ss])) prot_acc_user[ss] = user_protein_accessions(ss, prot_acc_all, dir_cache_prj, tax) # Download from NCBI if needed ------------------------------------------- aa_prot_ncbi_files = OrderedDict() for ss in sss: aa_prot_ncbi_files[ss] = opj(dir_prj_queries, 'aa_prot_ncbi__' + ss + '.fasta') prot_acc_user[ss] = dnld_prot_seqs(ss, prot_acc_user[ss], aa_prot_ncbi_files[ss], dir_cache_prj) # User provided protein sequences ---------------------------------------- aa_prot_user_files = OrderedDict() for ss in sss: user_queries = sss[ss]['fasta_files_aa'] aa_prot_user_files[ss] = opj(dir_prj_queries, 'aa_prot_user__' + ss + '.fasta') user_aa_fasta(ss, user_queries, aa_prot_user_files[ss]) # Combine all AA queries ------------------------------------------------- print() aa_queries_files = OrderedDict() for ss in sss: aa_queries_files[ss] = opj(dir_prj_queries, 'aa_all__' + ss + '.fasta') combine_aa_fasta(ss, [ aa_uniprot_files[ss], aa_prot_ncbi_files[ss], aa_prot_user_files[ss] ], aa_queries_files[ss]) # Filter AA queries ------------------------------------------------------ prot_acc_user_filtered = OrderedDict() for ss in sss: min_query_length = sss[ss]['min_query_length'] max_query_length = sss[ss]['max_query_length'] max_query_identity = sss[ss]['max_query_identity'] # Dereplicate all queries filter_queries(ss, aa_queries_files[ss], min_query_length, max_query_length, max_query_identity, vsearch, prot_acc_user[ss], overwrite=True) # Dereplicate only NCBI queries. CDS for these will be downloaded # later for reference. 
if ope(aa_prot_ncbi_files[ss]): prot_acc_user_filtered[ss] = filter_queries(ss, aa_prot_ncbi_files[ss], min_query_length, max_query_length, max_query_identity, vsearch, prot_acc_user[ss], overwrite=False, logging=False) # Download SRA run metadata if needed ------------------------------------ sra_runs_info, sras_acceptable = dnld_sra_info(sras, dir_cache_prj) # Download SRA run FASTQ files if needed --------------------------------- x, y, z = dnld_sra_fastq_files(sras_acceptable, sra_runs_info, dir_fq_data, fasterq_dump, THREADS, dir_temp) se_fastq_files_sra = x pe_fastq_files_sra = y sra_runs_info = z # User provided FASTQ files ---------------------------------------------- se_fastq_files_usr, pe_fastq_files_usr = user_fastq_files(fq_se, fq_pe) # Collate FASTQ file info ------------------------------------------------ se_fastq_files = se_fastq_files_sra.copy() se_fastq_files.update(se_fastq_files_usr) pe_fastq_files = pe_fastq_files_sra.copy() pe_fastq_files.update(pe_fastq_files_usr) def gc_tt(k, d, tax): taxid = d[k]['tax_id'] gc = tax.genetic_code_for_taxid(taxid) d[k]['gc_id'] = gc d[k]['gc_tt'] = TranslationTable(gc) gc_mito = None tt_mito = None gc_plastid = None tt_plastid = None if tax.is_eukaryote(taxid) is True: gc_mito = tax.mito_genetic_code_for_taxid(taxid) if gc_mito != '0': tt_mito = TranslationTable(gc_mito) if tax.contains_plastid(taxid) is True: gc_plastid = tax.plastid_genetic_code_for_taxid(taxid) if gc_plastid != '0': tt_plastid = TranslationTable(gc_plastid) d[k]['gc_id_mito'] = gc_mito d[k]['gc_tt_mito'] = tt_mito d[k]['gc_id_plastid'] = gc_plastid d[k]['gc_tt_plastid'] = tt_plastid for se in se_fastq_files: gc_tt(se, se_fastq_files, tax) for pe in pe_fastq_files: gc_tt(pe, pe_fastq_files, tax) # Minimum acceptable read length ----------------------------------------- min_accept_read_len(se_fastq_files, pe_fastq_files, dir_temp, dir_cache_fq_minlen, vsearch) # Run Rcorrector --------------------------------------------------------- run_rcorrector(se_fastq_files, pe_fastq_files, dir_fq_cor_data, rcorrector, THREADS, dir_temp, should_run_rcorrector) # File name patterns ----------------------------------------------------- a, b, c, d, e = file_name_patterns() pe_trim_fq_file_patterns = a pe_trim_fa_file_patterns = b pe_blast_db_file_patterns = c pe_blast_results_file_patterns = d pe_vsearch_results_file_patterns = e # Run Trimmomatic -------------------------------------------------------- run_trimmomatic(se_fastq_files, pe_fastq_files, dir_fq_trim_data, trimmomatic, adapters, pe_trim_fq_file_patterns, THREADS) # Run Bowtie 2 ----------------------------------------------------------- run_bt2_fq(se_fastq_files, pe_fastq_files, dir_fq_filter_bt2_data, bowtie2, bowtie2_build, THREADS, dir_temp, bt2_order, pe_trim_fq_file_patterns, tax, dir_cache_refseqs) # Run Kraken2 ------------------------------------------------------------ run_kraken2(krkn_order, kraken2_dbs, se_fastq_files, pe_fastq_files, dir_fq_filter_krkn2_data, kraken_confidence, kraken2, THREADS, dir_temp, pe_trim_fq_file_patterns) se_fastq_files = OrderedDict(se_fastq_files) pe_fastq_files = OrderedDict(pe_fastq_files) se_fastq_files = OrderedDict( sorted(se_fastq_files.items(), key=lambda x: x[1]['filter_path_fq'])) pe_fastq_files = OrderedDict( sorted(pe_fastq_files.items(), key=lambda x: x[1]['filter_path_fq'])) # Stop After Filter ------------------------------------------------------ if STOP_AFTER_FILTER is True: Log.wrn('Stopping after Kraken2/Bowtie2 filtering step as requested.') exit(0) # Convert 
filtered FASTQ files to FASTA ---------------------------------- filtered_fq_to_fa(se_fastq_files, pe_fastq_files, dir_fa_trim_data, seqtk, pe_trim_fa_file_patterns) # Run makeblastdb on reads ----------------------------------------------- makeblastdb_fq(se_fastq_files, pe_fastq_files, dir_blast_fa_trim, makeblastdb, pe_blast_db_file_patterns) # Check if there are any query sequences. any_queries = False for ss in sss: if stat(aa_queries_files[ss]).st_size == 0: continue else: any_queries = True # Run tblastn on reads --------------------------------------------------- for ss in sss: if stat(aa_queries_files[ss]).st_size == 0: continue changed_blast_1 = run_tblastn_on_reads( se_fastq_files, pe_fastq_files, aa_queries_files[ss], tblastn, blast_1_evalue, blast_1_max_hsps, blast_1_qcov_hsp_perc, blast_1_best_hit_overhang, blast_1_best_hit_score_edge, blast_1_max_target_seqs, dir_prj_blast_results_fa_trim, pe_blast_results_file_patterns, ss, THREADS, seqtk, vsearch, dir_cache_prj) if changed_blast_1 is True: if ope(dir_prj_vsearch_results_fa_trim): rmtree(dir_prj_vsearch_results_fa_trim) if ope(dir_prj_spades_assemblies): rmtree(dir_prj_spades_assemblies) if ope(dir_prj_blast_assmbl): rmtree(dir_prj_blast_assmbl) if ope(dir_prj_assmbl_blast_results): rmtree(dir_prj_assmbl_blast_results) if ope(dir_prj_transcripts): rmtree(dir_prj_transcripts) if ope(dir_prj_transcripts_combined): rmtree(dir_prj_transcripts_combined) prepare_output_directories(dir_out, prj_name) # Run vsearch on reads --------------------------------------------------- # should_run_vsearch = False # for ss in sss: # if stat(aa_queries_files[ss]).st_size == 0: # continue # else: # should_run_vsearch = True # break # if should_run_vsearch is True: # print() # Log.inf('Checking if Vsearch should be run.') for ss in sss: if stat(aa_queries_files[ss]).st_size == 0: continue print() Log.inf('Checking if Vsearch should be run:', ss) run_vsearch_on_reads(se_fastq_files, pe_fastq_files, vsearch, dir_prj_vsearch_results_fa_trim, pe_vsearch_results_file_patterns, ss, seqtk) # Run SPAdes ------------------------------------------------------------- # should_run_spades = False # for ss in sss: # if stat(aa_queries_files[ss]).st_size == 0: # continue # else: # should_run_spades = True # break # if should_run_spades is True: # print() # Log.inf('Checking if SPAdes should be run.') for ss in sss: if stat(aa_queries_files[ss]).st_size == 0: for se in se_fastq_files: se_fastq_files[se]['spades_assembly' + '__' + ss] = None for pe in pe_fastq_files: pe_fastq_files[pe]['spades_assembly' + '__' + ss] = None continue print() Log.inf('Checking if SPAdes should be run:', ss) run_spades(se_fastq_files, pe_fastq_files, dir_prj_spades_assemblies, spades, dir_temp, ss, THREADS, RAM) # Combine SPAdes and user provided assemblies ---------------------------- assemblies = combine_assemblies(se_fastq_files, pe_fastq_files, user_assemblies, tax, sss) # Run makeblastdb on assemblies ----------------------------------------- makeblastdb_assemblies(assemblies, dir_prj_blast_assmbl, makeblastdb) if any_queries is False: Log.wrn('No query sequences were provided.') # Run tblastn on assemblies ---------------------------------------------- for ss in sss: if stat(aa_queries_files[ss]).st_size == 0: continue should_run_tblastn = False for a in assemblies: assmbl_src = a['src'] assmbl_name = a['name'] if assmbl_src != 'user_fasta': if assmbl_name.endswith('__' + ss): should_run_tblastn = True break else: should_run_tblastn = True break if should_run_tblastn is False: 
print() Log.inf('Will not run BLAST. No transcripts exist:', ss) continue blast_2_evalue_ss = sss[ss]['blast_2_evalue'] blast_2_max_hsps_ss = sss[ss]['blast_2_max_hsps'] blast_2_qcov_hsp_perc_ss = sss[ss]['blast_2_qcov_hsp_perc'] blast_2_best_hit_overhang_ss = sss[ss]['blast_2_best_hit_overhang'] blast_2_best_hit_score_edge_ss = sss[ss]['blast_2_best_hit_score_edge'] blast_2_max_target_seqs_ss = sss[ss]['blast_2_max_target_seqs'] if blast_2_evalue_ss is None: blast_2_evalue_ss = blast_2_evalue if blast_2_max_hsps_ss is None: blast_2_max_hsps_ss = blast_2_max_hsps if blast_2_qcov_hsp_perc_ss is None: blast_2_qcov_hsp_perc_ss = blast_2_qcov_hsp_perc if blast_2_best_hit_overhang_ss is None: blast_2_best_hit_overhang_ss = blast_2_best_hit_overhang if blast_2_best_hit_score_edge_ss is None: blast_2_best_hit_score_edge_ss = blast_2_best_hit_score_edge if blast_2_max_target_seqs_ss is None: blast_2_max_target_seqs_ss = blast_2_max_target_seqs run_tblastn_on_assemblies( ss, assemblies, aa_queries_files[ss], tblastn, dir_prj_assmbl_blast_results, blast_2_evalue_ss, blast_2_max_hsps_ss, blast_2_qcov_hsp_perc_ss, blast_2_best_hit_overhang_ss, blast_2_best_hit_score_edge_ss, blast_2_max_target_seqs_ss, THREADS, dir_cache_prj, dir_prj_ips) # Prepare BLAST hits for analysis: find ORFs, translate ------------------ for ss in sss: if stat(aa_queries_files[ss]).st_size == 0: continue min_target_orf_len_ss = sss[ss]['min_target_orf_length'] max_target_orf_len_ss = sss[ss]['max_target_orf_length'] organelle = sss[ss]['organelle'] blast_2_qcov_hsp_perc_ss = sss[ss]['blast_2_qcov_hsp_perc'] if blast_2_qcov_hsp_perc_ss is None: blast_2_qcov_hsp_perc_ss = blast_2_qcov_hsp_perc find_orfs_translate(ss, assemblies, dir_prj_transcripts, seqtk, dir_temp, prepend_assmbl, min_target_orf_len_ss, max_target_orf_len_ss, allow_non_aug, allow_no_strt_cod, allow_no_stop_cod, tax, tax_group, tax_ids_user, blast_2_qcov_hsp_perc_ss, organelle) # GFF3 files from kakapo results JSON files ------------------------------ # print() for ss in sss: if stat(aa_queries_files[ss]).st_size == 0: continue gff_from_json(ss, assemblies, dir_prj_ips, dir_prj_transcripts_combined, prj_name) # Run InterProScan 5 ----------------------------------------------------- if should_run_ipr is True: print() ss_names = tuple(sss.keys()) # Determine the length of printed strings, for better spacing -------- max_title_a_len = 0 max_run_id_len = 0 for a in assemblies: for ss in ss_names: if 'transcripts_aa_orf_fasta_file__' + ss not in a: continue aa_file = a['transcripts_aa_orf_fasta_file__' + ss] if aa_file is None: continue assmbl_name = a['name'] run_id = ss + '_' + assmbl_name max_run_id_len = max(len(run_id), max_run_id_len) seqs = seq_records_to_dict(read_fasta(aa_file, SEQ_TYPE_AA)) # Filter all ORFs except the first one. 
for seq_def in tuple(seqs.keys()): seq_def_prefix = seq_def.split(' ')[0] if seq_def_prefix.endswith('ORF001'): max_title_a_len = max(len(seq_def_prefix), max_title_a_len) max_title_a_len += 2 max_run_id_len += 2 # -------------------------------------------------------------------- parallel_run_count = min(THREADS, len(ss_names)) def run_inter_pro_scan_parallel(ss): if stat(aa_queries_files[ss]).st_size == 0: return run_inter_pro_scan(ss, assemblies, email, dir_prj_ips, dir_cache_prj, parallel_run_count, max_title_a_len, max_run_id_len) # GFF3 files from kakapo and InterProScan 5 results JSON files gff_from_json(ss, assemblies, dir_prj_ips, dir_prj_transcripts_combined, prj_name) Parallel(n_jobs=parallel_run_count, verbose=0, require='sharedmem')(delayed(run_inter_pro_scan_parallel)(ss) for ss in ss_names) # Download CDS for NCBI protein queries ---------------------------------- print() prot_cds_ncbi_files = OrderedDict() def dnld_cds_for_ncbi_prot_acc_parallel(ss): if stat(aa_queries_files[ss]).st_size == 0: return if ss not in prot_acc_user_filtered: return prot_cds_ncbi_files[ss] = opj( dir_prj_transcripts_combined, prj_name + '_ncbi_query_cds__' + ss + '.fasta') if len(prot_acc_user_filtered[ss]) > 0: dnld_cds_for_ncbi_prot_acc(ss, prot_acc_user_filtered[ss], prot_cds_ncbi_files[ss], tax, dir_cache_prj) ss_names = tuple(sss.keys()) Parallel(n_jobs=2, verbose=0, require='sharedmem')( delayed(dnld_cds_for_ncbi_prot_acc_parallel)(ss) for ss in ss_names) # ------------------------------------------------------------------------ rmtree(dir_temp) # ------------------------------------------------------------------------ rerun = input('\nRepeat ([y]/n)? ').lower().strip() if rerun.startswith('y') or rerun == '': print() return False else: print('\nExiting...') return True
def run_tblastn_on_assemblies(ss, assemblies, aa_queries_file, tblastn, dir_prj_assmbl_blast_results, blast_2_evalue, blast_2_max_hsps, blast_2_qcov_hsp_perc, blast_2_best_hit_overhang, blast_2_best_hit_score_edge, blast_2_max_target_seqs, threads, dir_cache_prj, dir_prj_ips): if len(assemblies) > 0: print() Log.inf('Running BLAST on assemblies:', ss) if tblastn is None: Log.err('tblastn is not available. Cannot continue. Exiting.') exit(0) else: Log.wrn('There are no assemblies. Nothing to do, stopping.') exit(0) cache_file = opj(dir_cache_prj, 'blast_2_settings_cache__' + ss) pickled = dict() settings = {'blast_2_evalue': blast_2_evalue, 'blast_2_max_hsps': blast_2_max_hsps, 'blast_2_qcov_hsp_perc': blast_2_qcov_hsp_perc, 'blast_2_best_hit_overhang': blast_2_best_hit_overhang, 'blast_2_best_hit_score_edge': blast_2_best_hit_score_edge, 'blast_2_max_target_seqs': blast_2_max_target_seqs, 'queries': seq_records_to_dict( read_fasta(aa_queries_file, SEQ_TYPE_AA))} Log.msg('evalue:', str(blast_2_evalue)) Log.msg('max_hsps:', str(blast_2_max_hsps)) Log.msg('qcov_hsp_perc:', str(blast_2_qcov_hsp_perc)) Log.msg('best_hit_overhang:', str(blast_2_best_hit_overhang)) Log.msg('best_hit_score_edge:', str(blast_2_best_hit_score_edge)) Log.msg('max_target_seqs:', str(blast_2_max_target_seqs)) print() for a in assemblies: assmbl_src = a['src'] assmbl_name = a['name'] if assmbl_src != 'user_fasta': if assmbl_name.endswith('__' + ss): assmbl_name = assmbl_name.replace('__' + ss, '') else: continue assmbl_blast_db_path = a['blast_db_path'] assmbl_genetic_code = a['gc_id'] ips_json_dump_path = opj(dir_prj_ips, assmbl_name + '_ann_ips__' + ss + '.json') _ = opj(dir_prj_assmbl_blast_results, assmbl_name + '__' + ss + '.tsv') if ope(_) and ope(cache_file): with open(cache_file, 'rb') as f: pickled = pickle.load(f) if ope(_) and pickled == settings: # Log.msg('The provided BLAST settings and query sequences did ' # 'not change since the previous run.') Log.msg('BLAST results already exist:', assmbl_name) else: Log.msg('Running tblastn on: ' + assmbl_name, ss) if ope(ips_json_dump_path): osremove(ips_json_dump_path) run_blast(exec_file=tblastn, task='tblastn', threads=threads, db_path=assmbl_blast_db_path, queries_file=aa_queries_file, out_file=_, evalue=blast_2_evalue, max_hsps=blast_2_max_hsps, qcov_hsp_perc=blast_2_qcov_hsp_perc, best_hit_overhang=blast_2_best_hit_overhang, best_hit_score_edge=blast_2_best_hit_score_edge, max_target_seqs=blast_2_max_target_seqs, db_genetic_code=assmbl_genetic_code, out_cols=BLST_RES_COLS_2) a['blast_hits_aa__' + ss] = parse_blast_results_file(_, BLST_RES_COLS_2) with open(cache_file, 'wb') as f: pickle.dump(settings, f, protocol=PICKLE_PROTOCOL)
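# Minimal sketch (not part of the original module) of the settings cache used
# in run_tblastn_on_assemblies() above: BLAST is rerun only when the pickled
# settings from the previous run differ from the current ones. The function
# name is hypothetical.
def _example_blast_settings_changed(cache_file, settings):
    import pickle
    previous = None
    if ope(cache_file):
        with open(cache_file, 'rb') as f:
            previous = pickle.load(f)
    return previous != settings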
def dep_check_rcorrector(dir_dep, force):
    url = 'https://github.com/karolisr/Rcorrector/archive/master.tar.gz'
    dnld_path = opj(dir_dep, 'rcorrector.tar.gz')

    try:
        try:
            jellyfish = which('jellyfish')
            run([jellyfish, '--help'])
        except Exception:
            dir_bin = opj(dir_dep, get_dep_dir(dir_dep, 'Rcorrector'))
            jellyfish = opj(dir_bin, 'jellyfish', 'bin', 'jellyfish')
            raise
        if force is True:
            raise
        rcorrector = which('run_rcorrector.pl')
        run([rcorrector, '-version'])
    except Exception:
        try:
            dir_bin = opj(dir_dep, get_dep_dir(dir_dep, 'Rcorrector'))
            try:
                rcorrector = opj(dir_bin, 'run_rcorrector.pl')
                run([rcorrector, '-version'])
            except Exception:
                Log.wrn('Rcorrector was not found on this system, trying to '
                        'download.')
                raise
            try:
                run([jellyfish, '--version'])
            except Exception:
                Log.wrn('jellyfish is required by Rcorrector, but was not '
                        'found. Trying to download and recompile Rcorrector '
                        'and jellyfish.')
                raise
        except Exception:
            if ope(dnld_path):
                remove(dnld_path)
            if dir_bin != opj(dir_dep, ''):
                rmtree(dir_bin)
            download_file(url, dnld_path)
            tar_ref = tarfile.open(dnld_path, 'r:gz')
            tar_ref.extractall(dir_dep)
            tar_ref.close()
            dir_bin = opj(dir_dep, get_dep_dir(dir_dep, 'Rcorrector'))
            try:
                Log.wrn('Compiling Rcorrector.')
                run('make', cwd=dir_bin)
                rcorrector = opj(dir_bin, 'run_rcorrector.pl')
                jellyfish = opj(dir_bin, 'jellyfish', 'bin', 'jellyfish')
                chmod(rcorrector,
                      stat.S_IRWXU | stat.S_IRGRP | stat.S_IXGRP |
                      stat.S_IROTH | stat.S_IXOTH)
                run([rcorrector, '-version'])
                if not ope(jellyfish):
                    jellyfish = which('jellyfish')
                run([jellyfish, '--version'])
            except Exception:
                Log.err('Something went wrong while trying to compile '
                        'Rcorrector.')
                Log.msg('Try downloading and installing it manually from: '
                        'https://github.com/karolisr/Rcorrector')
                return None

    v = get_dep_version([rcorrector, '-version'], r'^Rcorrector\sv([\d\.]*)')
    Log.msg('Rcorrector is available:', v + ' ' + rcorrector)

    return rcorrector