def make_config(output_dirpath, tmp_dirpath, threads, clade_dirpath, augustus_dirpath):
    """Generate a BUSCO config file in ``output_dirpath`` and return its path.

    The template ``default_config_fname`` is read from the ``busco`` directory
    located next to this module. Every non-blank template line is split on
    whitespace; when its last token is one of the placeholder keywords below,
    that token is replaced with the corresponding value. Lines are written
    back joined by single spaces, and blank lines are dropped.

    :param output_dirpath: directory that receives the generated config;
        also substituted for the ``out_path`` placeholder.
    :param tmp_dirpath: substituted for the ``tmp_dir`` placeholder.
    :param threads: substituted (as a string) for the ``threads`` placeholder.
    :param clade_dirpath: substituted for the ``lineage_path`` placeholder.
    :param augustus_dirpath: its ``bin`` and ``scripts`` subdirectories are
        substituted for the Augustus-related placeholders.
    :return: path of the written config file.
    """
    busco_dirpath = join(dirname(realpath(__file__)), 'busco')
    augustus_bin_dirpath = join(augustus_dirpath, 'bin')

    # Placeholder keyword (last token of a template line) -> replacement.
    values = {
        'out_path': output_dirpath,
        'lineage_path': clade_dirpath,
        'domain': 'prokaryota' if qconfig.prokaryote else 'eukaryota',
        'tmp_dir': tmp_dirpath,
        'threads': str(threads),
        'tblastn_path': dirname(get_blast_fpath('tblastn')),
        'makeblastdb_path': dirname(get_blast_fpath('makeblastdb')),
        'augustus_path': augustus_bin_dirpath,
        'etraining_path': augustus_bin_dirpath,
        'augustus_scripts_path': join(augustus_dirpath, 'scripts'),
        'hmmsearch_path': busco_dirpath,
    }

    template_fpath = join(busco_dirpath, default_config_fname)
    config_fpath = join(output_dirpath, config_fname)
    with open(template_fpath) as template, open(config_fpath, 'w') as config:
        for raw_line in template:
            tokens = raw_line.split()
            if tokens:
                # Only the trailing token is ever treated as a placeholder.
                tokens[-1] = values.get(tokens[-1], tokens[-1])
                config.write(' '.join(tokens) + '\n')
    return config_fpath
def parallel_blast(contigs_fpath, label, corrected_dirpath, err_fpath, blast_res_fpath, blast_check_fpath,
                   blast_threads):
    """Run ``blastn`` for one assembly and write a check file next to the result.

    The query is searched against the module-level ``db_fpath`` database with
    tabular-with-comments output (``-outfmt 7``). After the run, a check file
    is written containing the assembly path and the value returned by
    ``md5(contigs_fpath)``.

    :param contigs_fpath: path to the assembly; if it ends with a known
        compression extension it is first unpacked into ``corrected_dirpath``.
    :param label: assembly label, used in log messages and to derive the
        per-assembly result/check file paths.
    :param corrected_dirpath: directory that receives the ``.unpacked`` copy.
    :param err_fpath: file to which the subprocess stderr is appended.
    :param blast_res_fpath: base path passed to ``get_blast_output_fpath`` to
        obtain the result file for ``label``.
    :param blast_check_fpath: base path passed to ``get_blast_output_fpath``
        to obtain the check file for ``label``.
    :param blast_threads: value passed to ``-num_threads``.
    """
    logger.info(' ' + 'processing ' + label)
    blast_query_fpath = contigs_fpath
    compress_ext = ('.gz', '.gzip', '.bz2', '.bzip2', '.zip')
    if contigs_fpath.endswith(compress_ext):
        logger.info(' ' + 'unpacking ' + label)
        unpacked_fpath = os.path.join(corrected_dirpath, os.path.basename(contigs_fpath) + '.unpacked')
        # NOTE(review): _get_fasta_file_handler presumably yields decompressed
        # text lines -- confirm against its definition.
        with _get_fasta_file_handler(contigs_fpath) as f_in:
            with open(unpacked_fpath, 'w') as f_out:
                f_out.writelines(f_in)
        blast_query_fpath = unpacked_fpath

    res_fpath = get_blast_output_fpath(blast_res_fpath, label)
    check_fpath = get_blast_output_fpath(blast_check_fpath, label)
    cmd = get_blast_fpath('blastn') + (' -query %s -db %s -outfmt 7 -num_threads %s' % (
        blast_query_fpath, db_fpath, blast_threads))
    args = shlex.split(cmd)
    # Open the redirect targets in a context manager so both handles are
    # closed once the subprocess call returns (they used to be leaked).
    with open(res_fpath, 'w') as res_file, open(err_fpath, 'a') as err_file:
        qutils.call_subprocess(args, stdout=res_file, stderr=err_file, logger=logger)
    logger.info(' ' + 'BLAST results for %s are saved to %s...' % (label, res_fpath))
    with open(check_fpath, 'w') as check_file:
        check_file.write('Assembly: %s md5 checksum: %s\n' % (contigs_fpath, md5(contigs_fpath)))
def parallel_blast(contigs_fpath, label, corrected_dirpath, err_fpath, blast_res_fpath, blast_check_fpath,
                   blast_threads):
    """Run ``blastn`` for one assembly and write a check file next to the result.

    The query is searched against the module-level ``db_fpath`` database with
    tabular-with-comments output (``-outfmt 7``). After the run, a check file
    is written containing the assembly path and its size in bytes.

    :param contigs_fpath: path to the assembly; if it ends with a known
        compression extension it is first unpacked into ``corrected_dirpath``.
    :param label: assembly label, used in log messages and to derive the
        per-assembly result/check file paths.
    :param corrected_dirpath: directory that receives the ``.unpacked`` copy.
    :param err_fpath: file to which the subprocess stderr is appended.
    :param blast_res_fpath: base path passed to ``get_blast_output_fpath`` to
        obtain the result file for ``label``.
    :param blast_check_fpath: base path passed to ``get_blast_output_fpath``
        to obtain the check file for ``label``.
    :param blast_threads: value passed to ``-num_threads``.
    """
    logger.info(' ' + 'processing ' + label)
    blast_query_fpath = contigs_fpath
    compress_ext = ('.gz', '.gzip', '.bz2', '.bzip2', '.zip')
    if contigs_fpath.endswith(compress_ext):
        logger.info(' ' + 'unpacking ' + label)
        unpacked_fpath = os.path.join(corrected_dirpath, os.path.basename(contigs_fpath) + '.unpacked')
        # NOTE(review): _get_fasta_file_handler presumably yields decompressed
        # text lines -- confirm against its definition.
        with _get_fasta_file_handler(contigs_fpath) as f_in:
            with open(unpacked_fpath, 'w') as f_out:
                f_out.writelines(f_in)
        blast_query_fpath = unpacked_fpath

    res_fpath = get_blast_output_fpath(blast_res_fpath, label)
    check_fpath = get_blast_output_fpath(blast_check_fpath, label)
    cmd = get_blast_fpath('blastn') + (' -query %s -db %s -outfmt 7 -num_threads %s' % (
        blast_query_fpath, db_fpath, blast_threads))
    args = shlex.split(cmd)
    # Open the redirect targets in a context manager so both handles are
    # closed once the subprocess call returns (they used to be leaked).
    with open(res_fpath, 'w') as res_file, open(err_fpath, 'a') as err_file:
        qutils.call_subprocess(args, stdout=res_file, stderr=err_file, logger=logger)
    logger.info(' ' + 'BLAST results for %s are saved to %s...' % (label, res_fpath))
    with open(check_fpath, 'w') as check_file:
        # The size is taken from the original (possibly compressed) input file.
        check_file.write('Assembly: %s size: %d\n' % (contigs_fpath, os.path.getsize(contigs_fpath)))
def download_blastdb(logger=logger, only_clean=False):
    """Ensure the SILVA BLAST database exists locally, or remove it.

    Sets the module-level ``blastdb_dirpath`` and, unless cleaning, ``db_fpath``.
    When the ``.nsq`` file is already present and at least ``db_nsq_fsize``
    bytes, nothing else is done. Otherwise the gzipped SILVA file is downloaded
    (unless already present), unpacked with spaces replaced by underscores, and
    turned into a nucleotide BLAST database with ``makeblastdb``.

    :param logger: logger used for progress and error messages.
    :param only_clean: if true, only delete the database directory (when it
        exists) and return.
    :return: ``True`` on success (database ready, or cleaning done); ``False``
        if no download directory is available, the download fails, or the
        resulting ``.nsq`` file is missing or too small.
    """
    global blastdb_dirpath, db_fpath

    blastdb_dirpath = get_dir_for_download('silva', 'Silva', [silva_downloaded_fname + '.nsq'], logger,
                                           only_clean=only_clean)
    if not blastdb_dirpath:
        return False

    if only_clean:
        if os.path.isdir(blastdb_dirpath):
            logger.info('Removing ' + blastdb_dirpath)
            shutil.rmtree(blastdb_dirpath)
        return True

    db_fpath = join(blastdb_dirpath, silva_downloaded_fname)
    db_nsq_fpath = db_fpath + '.nsq'
    if os.path.isfile(db_nsq_fpath) and os.path.getsize(db_nsq_fpath) >= db_nsq_fsize:
        return True

    log_fpath = os.path.join(blastdb_dirpath, 'blastdb.log')
    db_gz_fpath = os.path.join(blastdb_dirpath, silva_fname + '.gz')
    silva_fpath = os.path.join(blastdb_dirpath, silva_fname)

    logger.info()
    if os.path.isfile(db_gz_fpath):
        logger.info('SILVA 16S ribosomal RNA gene database has already been downloaded.')
    else:
        logger.info('Downloading SILVA 16S ribosomal RNA gene database...')
        if not os.path.isdir(blastdb_dirpath):
            os.makedirs(blastdb_dirpath)
        silva_download = urllib.FancyURLopener()
        silva_remote_fpath = silva_db_url + silva_fname + '.gz'
        download_fpath = db_gz_fpath + '.download'
        try:
            silva_download.retrieve(silva_remote_fpath, download_fpath, show_progress)
        except Exception:
            logger.error(
                'Failed downloading SILVA 16S rRNA gene database (%s)! The search for reference genomes cannot be performed. '
                'Try to download it manually in %s and restart your command.' % (silva_remote_fpath, blastdb_dirpath))
            return False
        # Download into a temporary name and rename only on success, so an
        # interrupted transfer is never mistaken for a complete archive.
        shutil.move(download_fpath, db_gz_fpath)

    logger.info('Processing downloaded file. Logging to %s...' % log_fpath)
    if not os.path.isfile(silva_fpath):
        logger.info('Unpacking and replacing " " with "_"...')

        unpacked_fpath = silva_fpath + ".unpacked"
        cmd = "gunzip -c %s" % db_gz_fpath
        args = shlex.split(cmd)
        # Context manager closes both redirect targets after the call
        # (they used to be leaked).
        with open(unpacked_fpath, 'w') as unpacked_file, open(log_fpath, 'a') as log_file:
            qutils.call_subprocess(args, stdout=unpacked_file, stderr=log_file, logger=logger)

        substituted_fpath = silva_fpath + ".substituted"
        with open(unpacked_fpath) as in_file:
            with open(substituted_fpath, 'w') as out_file:
                for line in in_file:
                    out_file.write(line.replace(' ', '_'))
        os.remove(unpacked_fpath)
        shutil.move(substituted_fpath, silva_fpath)

    logger.info('Making BLAST database...')
    cmd = get_blast_fpath('makeblastdb') + (' -in %s -dbtype nucl -out %s' % (silva_fpath, db_fpath))
    args = shlex.split(cmd)
    # Two separate append handles are kept so the callee receives the same
    # kind of arguments as before; both are now closed deterministically.
    with open(log_fpath, 'a') as log_out, open(log_fpath, 'a') as log_err:
        qutils.call_subprocess(args, stdout=log_out, stderr=log_err, logger=logger)

    if not os.path.exists(db_nsq_fpath) or os.path.getsize(db_nsq_fpath) < db_nsq_fsize:
        logger.error('Failed to make BLAST database ("' + blastdb_dirpath +
                     '"). See details in log. Try to make it manually: %s' % cmd)
        return False
    elif not qconfig.debug:
        # Intermediate files are kept only in debug mode.
        os.remove(db_gz_fpath)
        os.remove(silva_fpath)
    return True
def download_blastdb(logger=logger, only_clean=False):
    """Ensure the SILVA BLAST database exists locally, or remove it.

    Sets the module-level ``blastdb_dirpath`` and, unless cleaning, ``db_fpath``.
    When the ``.nsq`` file is already present and at least ``db_nsq_fsize``
    bytes, nothing else is done. Otherwise the gzipped SILVA file is downloaded
    (unless already present), unpacked with spaces replaced by underscores, and
    turned into a nucleotide BLAST database with ``makeblastdb``.

    :param logger: logger used for progress and error messages.
    :param only_clean: if true, only delete the database directory (when it
        exists) and return.
    :return: ``True`` on success (database ready, or cleaning done); ``False``
        if no download directory is available, the download fails, or the
        resulting ``.nsq`` file is missing or too small.
    """
    global blastdb_dirpath, db_fpath

    blastdb_dirpath = get_dir_for_download('silva', 'Silva', [silva_downloaded_fname + '.nsq'], logger,
                                           only_clean=only_clean)
    if not blastdb_dirpath:
        return False

    if only_clean:
        if os.path.isdir(blastdb_dirpath):
            logger.info('Removing ' + blastdb_dirpath)
            shutil.rmtree(blastdb_dirpath)
        return True

    db_fpath = join(blastdb_dirpath, silva_downloaded_fname)
    db_nsq_fpath = db_fpath + '.nsq'
    if os.path.isfile(db_nsq_fpath) and os.path.getsize(db_nsq_fpath) >= db_nsq_fsize:
        return True

    log_fpath = os.path.join(blastdb_dirpath, 'blastdb.log')
    db_gz_fpath = os.path.join(blastdb_dirpath, silva_fname + '.gz')
    silva_fpath = os.path.join(blastdb_dirpath, silva_fname)

    logger.info()
    if os.path.isfile(db_gz_fpath):
        logger.info('SILVA 16S ribosomal RNA gene database has already been downloaded.')
    else:
        logger.info('Downloading SILVA 16S ribosomal RNA gene database...')
        if not os.path.isdir(blastdb_dirpath):
            os.makedirs(blastdb_dirpath)
        silva_download = urllib.FancyURLopener()
        silva_remote_fpath = silva_db_url + silva_fname + '.gz'
        download_fpath = db_gz_fpath + '.download'
        try:
            silva_download.retrieve(silva_remote_fpath, download_fpath, show_progress)
        except Exception:
            logger.error(
                'Failed downloading SILVA 16S rRNA gene database (%s)! The search for reference genomes cannot be performed. '
                'Try to download it manually in %s and restart your command.' % (silva_remote_fpath, blastdb_dirpath))
            return False
        # Download into a temporary name and rename only on success, so an
        # interrupted transfer is never mistaken for a complete archive.
        shutil.move(download_fpath, db_gz_fpath)

    logger.info('Processing downloaded file. Logging to %s...' % log_fpath)
    if not os.path.isfile(silva_fpath):
        logger.info('Unpacking and replacing " " with "_"...')

        unpacked_fpath = silva_fpath + ".unpacked"
        cmd = "gunzip -c %s" % db_gz_fpath
        args = shlex.split(cmd)
        # Context manager closes both redirect targets after the call
        # (they used to be leaked).
        with open(unpacked_fpath, 'w') as unpacked_file, open(log_fpath, 'a') as log_file:
            qutils.call_subprocess(args, stdout=unpacked_file, stderr=log_file, logger=logger)

        substituted_fpath = silva_fpath + ".substituted"
        with open(unpacked_fpath) as in_file:
            with open(substituted_fpath, 'w') as out_file:
                for line in in_file:
                    out_file.write(line.replace(' ', '_'))
        os.remove(unpacked_fpath)
        shutil.move(substituted_fpath, silva_fpath)

    logger.info('Making BLAST database...')
    cmd = get_blast_fpath('makeblastdb') + (' -in %s -dbtype nucl -out %s' % (silva_fpath, db_fpath))
    args = shlex.split(cmd)
    # Two separate append handles are kept so the callee receives the same
    # kind of arguments as before; both are now closed deterministically.
    with open(log_fpath, 'a') as log_out, open(log_fpath, 'a') as log_err:
        qutils.call_subprocess(args, stdout=log_out, stderr=log_err, logger=logger)

    if not os.path.exists(db_nsq_fpath) or os.path.getsize(db_nsq_fpath) < db_nsq_fsize:
        logger.error('Failed to make BLAST database ("' + blastdb_dirpath +
                     '"). See details in log. Try to make it manually: %s' % cmd)
        return False
    elif not qconfig.debug:
        # Intermediate files are kept only in debug mode.
        os.remove(db_gz_fpath)
        os.remove(silva_fpath)
    return True