def _get_from_srst2_argannot(self, outprefix):
    """Fetch the ARG-ANNOT FASTA bundled with SRST2 v0.2.0 and convert it.

    The download command (wget) is handed to ``common.syscall`` and saves to
    ``outprefix + '.original.fa'``.  Every record is then renamed to
    ``<cluster_name>.<first word of the original header>`` and written to
    ``outprefix + '.fa'``, with one metadata row per record written to
    ``outprefix + '.tsv'``.  The downloaded file is deleted unless
    ``self.debug`` is true.  Usage and citation notes are printed to stdout.

    Raises ValueError (from the tuple unpacking) when a header is not two
    whitespace-separated words or its first word does not have four
    ``__``-separated fields.
    """
    srst2_version = '0.2.0'
    url = ''.join(['https://github.com/katholt/srst2/raw/v', srst2_version, '/data/ARGannot.r1.fasta'])
    downloaded_fa = outprefix + '.original.fa'
    common.syscall(' '.join(['wget -O', downloaded_fa, url]), verbose=True)

    final_fasta = outprefix + '.fa'
    final_tsv = outprefix + '.tsv'
    fasta_out = pyfastaq.utils.open_file_write(final_fasta)
    meta_out = pyfastaq.utils.open_file_write(final_tsv)

    for seq in pyfastaq.sequences.file_reader(downloaded_fa):
        original_id = seq.id
        # The unused fields are still unpacked: the unpacking doubles as a
        # format check on the header.
        name, _extra = original_id.split()
        _cluster_id, cluster_name, _allele_name, _allele_id = name.split('__')
        seq.id = '.'.join([cluster_name, name])
        print(seq, file=fasta_out)
        print(seq.id, 1, 0, '.', '.', 'Original name: ' + original_id, sep='\t', file=meta_out)

    pyfastaq.utils.close(fasta_out)
    pyfastaq.utils.close(meta_out)

    if not self.debug:
        os.unlink(downloaded_fa)

    print('Finished downloading and converting data. Final files are:', final_fasta, final_tsv, sep='\n\t', end='\n\n')
    print('You can use them with ARIBA like this:')
    print('ariba prepareref -f', final_fasta, '-m', final_tsv, 'output_directory\n')
    print('If you use this downloaded data, please cite:')
    print('"SRST2: Rapid genomic surveillance for public health and hospital microbiology labs",\nInouye et al 2014, Genome Medicine, PMID: 25422674\n')
    print(argannot_ref)
    print('and in your methods say that the ARG-ANNOT sequences were used from version', srst2_version, 'of SRST2.')
def _gap_fill_with_gapfiller(self):
    """Gap-fill the scaffolds with GapFiller, or pass them through unchanged.

    When ``self.gapfiller_exe`` is None, or ``self._has_gaps_to_fill`` reports
    nothing to fill, the scaffolds are handed straight to
    ``self._rename_scaffolds``.  Otherwise ``self.gapfill_dir`` is created, a
    GapFiller library file is written there, the perl command is passed to
    ``common.syscall`` from inside that directory, and the result is renamed
    into ``self.gapfilled_scaffolds``.

    Raises:
        Error: if the scaffolds file is missing or the directory cannot be
            created.
    """
    if not os.path.exists(self.scaffolder_scaffolds):
        raise Error('Cannot gap fill because scaffolds file not found: ' + self.scaffolder_scaffolds)

    if self.gapfiller_exe is None or not self._has_gaps_to_fill(self.scaffolder_scaffolds):
        self._rename_scaffolds(self.scaffolder_scaffolds, self.gapfilled_scaffolds)
        return

    try:
        os.mkdir(self.gapfill_dir)
    except OSError:
        raise Error('Error mkdir ' + self.gapfill_dir)

    cwd = os.getcwd()
    os.chdir(self.gapfill_dir)
    try:
        lib_file = 'lib'
        with open(lib_file, 'w') as f:
            print('LIB', 'bwa', self.reads1, self.reads2, self.reads_insert, self.sspace_sd, 'FR', file=f)

        cmd = ' '.join([
            'perl', self.gapfiller_exe,
            '-l', lib_file,
            '-s', self.scaffolder_scaffolds,
        ])
        gapfilled_scaffolds = os.path.join(self.gapfill_dir, 'standard_output', 'standard_output.gapfilled.final.fa')
        common.syscall(cmd, verbose=self.verbose)
        self._rename_scaffolds(gapfilled_scaffolds, self.gapfilled_scaffolds)
    finally:
        # Always return to the caller's directory, even if GapFiller fails.
        os.chdir(cwd)
def run(self):
    """Cluster ``self.infile`` with cd-hit-est and return the clusters.

    Work happens in a fresh temporary directory under the current working
    directory.  The representative sequences are written to ``self.outfile``
    by ``self._rename_fasta``.  The temporary directory is removed whether or
    not a step fails.

    Returns:
        The first value returned by ``self._parse_cluster_info_file``.
    """
    tmpdir = tempfile.mkdtemp(prefix='tmp.run_cd-hit.', dir=os.getcwd())
    try:
        cdhit_fasta = os.path.join(tmpdir, 'cdhit')
        cluster_info_outfile = cdhit_fasta + '.bak.clstr'
        infile_renamed = os.path.join(tmpdir, 'input.renamed.fa')

        # cd-hit truncates all names to 19 bases in its report of which
        # sequences belong to which clusters. So need to temporarily
        # rename all sequences to have short enough names.
        new_to_old_name = self._enumerate_fasta(self.infile, infile_renamed)

        cmd = ' '.join([
            'cd-hit-est',
            '-i', infile_renamed,
            '-o', cdhit_fasta,
            '-c', str(self.seq_identity_threshold),
            '-T', str(self.threads),
            '-s', str(self.length_diff_cutoff),
            '-bak 1',
        ])
        common.syscall(cmd, verbose=self.verbose)

        cluster_representatives = self._get_ids(cdhit_fasta)
        clusters, cluster_rep_to_cluster = self._parse_cluster_info_file(
            cluster_info_outfile, new_to_old_name, cluster_representatives)
        self._rename_fasta(cdhit_fasta, self.outfile, cluster_rep_to_cluster)
    finally:
        # Previously a failing step left tmp.run_cd-hit.* directories behind.
        shutil.rmtree(tmpdir)

    return clusters
def _scaffold_with_sspace(self):
    """Scaffold the assembled contigs with SSPACE, or just link them.

    ``self.scaffold_dir`` is always created.  If ``self.sspace_exe`` is None,
    a symlink named after ``self.scaffolder_scaffolds`` is made inside
    ``self.assembly_dir`` pointing at the contigs file.  Otherwise a library
    file is written in the scaffold directory, the SSPACE perl command is
    passed to ``common.syscall`` from there, and the symlink points at the
    SSPACE scaffolds instead.  The starting working directory is restored
    before returning.

    Raises:
        Error: if the contigs file is missing or the directory cannot be made.
    """
    contigs = self.assembly_contigs
    if not os.path.exists(contigs):
        raise Error('Cannot scaffold because contigs file not found: ' + contigs)

    try:
        os.mkdir(self.scaffold_dir)
    except:
        raise Error('Error mkdir '+ self.scaffold_dir)

    start_dir = os.getcwd()

    if self.sspace_exe is None:
        # No scaffolder available: the "scaffolds" are simply the contigs.
        os.chdir(self.assembly_dir)
        os.symlink(os.path.basename(contigs), os.path.basename(self.scaffolder_scaffolds))
        os.chdir(start_dir)
        return

    os.chdir(self.scaffold_dir)
    lib_file = 'lib'
    with open(lib_file, 'w') as lib_handle:
        print('LIB', self.reads1, self.reads2, int(self.reads_insert), self.sspace_sd, 'FR', file=lib_handle)

    sspace_args = ['perl', self.sspace_exe, '-k', str(self.sspace_k), '-l', lib_file, '-s', contigs]
    cmd = ' '.join(sspace_args)
    # Resolved while still inside the scaffold directory.
    sspace_scaffolds = os.path.abspath('standard_output.final.scaffolds.fasta')
    common.syscall(cmd, verbose=self.verbose)

    # The relative link target is computed after changing into the assembly directory.
    os.chdir(self.assembly_dir)
    os.symlink(os.path.relpath(sspace_scaffolds), os.path.basename(self.scaffolder_scaffolds))
    os.chdir(start_dir)
def _gap_fill_with_gapfiller(self):
    """Gap-fill the scaffolds with GapFiller, or pass them through unchanged.

    When ``self.extern_progs.exe('gapfiller')`` is None, or
    ``self._has_gaps_to_fill`` reports nothing to fill, the scaffolds are
    handed straight to ``self._rename_scaffolds``.  Otherwise
    ``self.gapfill_dir`` is created, a library file is written there, the
    perl command is passed to ``common.syscall`` (output directed to
    ``self.log_fh``) and the result is renamed into
    ``self.gapfilled_scaffolds``.  With ``self.clean`` set, the GapFiller
    directory is deleted afterwards.

    Raises:
        Error: if the scaffolds file is missing or the directory cannot be
            created.
    """
    if not os.path.exists(self.scaffolder_scaffolds):
        raise Error('Cannot gap fill because scaffolds file not found: ' + self.scaffolder_scaffolds)

    gapfiller_exe = self.extern_progs.exe('gapfiller')
    if gapfiller_exe is None or not self._has_gaps_to_fill(self.scaffolder_scaffolds):
        self._rename_scaffolds(self.scaffolder_scaffolds, self.gapfilled_scaffolds, self.scaff_name_prefix)
        return

    try:
        os.mkdir(self.gapfill_dir)
    except OSError:
        raise Error('Error mkdir ' + self.gapfill_dir)

    cwd = os.getcwd()
    os.chdir(self.gapfill_dir)
    try:
        lib_file = 'lib'
        with open(lib_file, 'w') as f:
            print('LIB', 'bwa', self.reads1, self.reads2, self.reads_insert, self.sspace_sd, 'FR', file=f)

        cmd = ' '.join([
            'perl', gapfiller_exe,
            '-l', lib_file,
            '-s', self.scaffolder_scaffolds,
        ])
        gapfilled_scaffolds = os.path.join(self.gapfill_dir, 'standard_output', 'standard_output.gapfilled.final.fa')
        common.syscall(cmd, verbose=True, verbose_filehandle=self.log_fh)
        self._rename_scaffolds(gapfilled_scaffolds, self.gapfilled_scaffolds, self.scaff_name_prefix)
    finally:
        # Always return to the caller's directory, even if GapFiller fails.
        os.chdir(cwd)

    if self.clean:
        print('Deleting GapFiller directory', self.gapfill_dir, file=self.log_fh)
        shutil.rmtree(self.gapfill_dir)
def _sketch(self, infile, individual):
    """Build a ``mash sketch -s 100000`` command for ``infile`` and submit it.

    ``-i`` is inserted before the input file when ``individual`` is truthy.
    The command is handed to ``common.syscall`` with output directed to
    ``self.log_fh``.
    """
    per_sequence_flag = ["-i"] if individual else []
    command = " ".join(
        [self.extern_progs.exe("mash"), "sketch", "-s 100000"] + per_sequence_flag + [infile]
    )
    common.syscall(command, verbose=True, verbose_filehandle=self.log_fh)
def _get_from_argannot(self, outprefix):
    """Download the ARG-ANNOT zip archive and convert it to ARIBA inputs.

    The archive is fetched with ``common.download_file`` and unzipped inside
    ``<outprefix>.tmp.download``.  Each record of
    ``Database Nt Sequences File.txt`` is renamed (first word only, with
    ``(X)`` rewritten to ``X.``) and written to ``<outprefix>.fa``, with a
    metadata row in ``<outprefix>.tsv``.  The temporary directory is removed
    unless ``self.debug`` is true.

    Raises:
        Error: if the temporary directory cannot be created or entered.
    """
    outprefix = os.path.abspath(outprefix)
    tmpdir = outprefix + '.tmp.download'
    current_dir = os.getcwd()

    try:
        os.mkdir(tmpdir)
        os.chdir(tmpdir)
    except OSError:
        raise Error('Error mkdir/chdir ' + tmpdir)

    try:
        zipfile = 'arg-annot-database_doc.zip'
        common.download_file(
            'http://www.mediterranee-infection.com/arkotheque/client/ihumed/_depot_arko/articles/304/arg-annot-database_doc.zip',
            zipfile,
            max_attempts=self.max_download_attempts,
            sleep_time=self.sleep_time,
            verbose=True,
        )
        common.syscall('unzip ' + zipfile)
    finally:
        # A failed download must not leave the process inside tmpdir.
        os.chdir(current_dir)

    print('Extracted files.')

    genes_file = os.path.join(tmpdir, 'Database Nt Sequences File.txt')
    final_fasta = outprefix + '.fa'
    final_tsv = outprefix + '.tsv'

    seq_reader = pyfastaq.sequences.file_reader(genes_file)
    f_out_tsv = pyfastaq.utils.open_file_write(final_tsv)
    f_out_fa = pyfastaq.utils.open_file_write(final_fasta)
    try:
        for seq in seq_reader:
            original_id = seq.id
            seq.id = re.sub(r'\((.*)\)', r'\1.', seq.id.split()[0])
            print(seq, file=f_out_fa)
            print(seq.id, '1', '0', '.', '.', 'Original name: ' + original_id, sep='\t', file=f_out_tsv)
    finally:
        # Close both outputs even if a record cannot be converted.
        pyfastaq.utils.close(f_out_tsv)
        pyfastaq.utils.close(f_out_fa)

    if not self.debug:
        shutil.rmtree(tmpdir)

    print('Finished. Final files are:', final_fasta, final_tsv, sep='\n\t', end='\n\n')
    print('You can use them with ARIBA like this:')
    print('ariba prepareref -f', final_fasta, '-m', final_tsv, 'output_directory\n')
    print('If you use this downloaded data, please cite:')
    print(argannot_ref)
def _get_from_resfinder(self, outprefix):
    """Download the ResFinder archive and merge its FASTA files for ARIBA.

    The curl and unzip commands are handed to ``common.syscall`` from inside
    ``<outprefix>.tmp.download``.  Every ``*.fsa`` file found there is read,
    and each record is written to ``<outprefix>.fa`` with a metadata row in
    ``<outprefix>.tsv``.  An id containing ``_`` is rewritten to
    ``<prefix>.<suffix>`` (split at the first underscore); other ids are kept
    and get ``.`` as their description.  Repeated ids get ``_<count>``
    appended.  The temporary directory is removed unless ``self.debug`` is
    true.

    Raises:
        Error: if the temporary directory cannot be created or entered.
    """
    outprefix = os.path.abspath(outprefix)
    final_fasta = outprefix + '.fa'
    final_tsv = outprefix + '.tsv'
    tmpdir = outprefix + '.tmp.download'
    current_dir = os.getcwd()

    try:
        os.mkdir(tmpdir)
        os.chdir(tmpdir)
    except OSError:
        raise Error('Error mkdir/chdir ' + tmpdir)

    try:
        zipfile = 'resfinder.zip'
        cmd = 'curl -X POST --data "folder=resfinder&filename=resfinder.zip" -o ' + zipfile + ' https://cge.cbs.dtu.dk/cge/download_data.php'
        print('Downloading data with:', cmd, sep='\n')
        common.syscall(cmd)
        common.syscall('unzip ' + zipfile)

        print('Combining downloaded fasta files...')
        fout_fa = pyfastaq.utils.open_file_write(final_fasta)
        fout_tsv = pyfastaq.utils.open_file_write(final_tsv)
        used_names = {}

        try:
            # The listing is relative: the process is still inside tmpdir.
            for filename in os.listdir():
                if not filename.endswith('.fsa'):
                    continue

                print(' ', filename)
                for seq in pyfastaq.sequences.file_reader(filename):
                    try:
                        prefix, suffix = seq.id.split('_', maxsplit=1)
                    except ValueError:
                        # No underscore in the id: keep it untouched.
                        description = '.'
                    else:
                        description = 'Original name: ' + seq.id
                        seq.id = prefix + '.' + suffix

                    # names are not unique across the files
                    if seq.id in used_names:
                        used_names[seq.id] += 1
                        seq.id += '_' + str(used_names[seq.id])
                    else:
                        used_names[seq.id] = 1

                    print(seq, file=fout_fa)
                    print(seq.id, '1', '0', '.', '.', description, sep='\t', file=fout_tsv)
        finally:
            pyfastaq.utils.close(fout_fa)
            pyfastaq.utils.close(fout_tsv)

        print('\nFinished combining files\n')
    finally:
        # Always leave tmpdir, even when the download or conversion fails.
        os.chdir(current_dir)

    if not self.debug:
        shutil.rmtree(tmpdir)

    print('Finished. Final files are:', final_fasta, final_tsv, sep='\n\t', end='\n\n')
    print('You can use them with ARIBA like this:')
    print('ariba prepareref -f', final_fasta, '-m', final_tsv, 'output_directory\n')
    print('If you use this downloaded data, please cite:')
    print('"Identification of acquired antimicrobial resistance genes", Zankari et al 2012, PMID: 22782487\n')
def _get_from_virulencefinder(self, outprefix):
    """Fetch the VirulenceFinder database and merge its FASTA files for ARIBA.

    With ``self.version == 'old'`` the zip archive is downloaded with curl and
    unzipped inside ``<outprefix>.tmp.download``; otherwise the database is
    obtained through ``RefGenesGetter._get_genetic_epi_database_from_bitbucket``
    at commit ``self.version``.  Every ``*.fsa`` file is first passed through
    ``RefGenesGetter._fix_virulencefinder_fasta_file``, then each record is
    renamed (first ``_`` becomes ``.``, spaces become ``_``, repeats get
    ``.<count>`` appended) and written to ``<outprefix>.fa`` with a metadata
    row in ``<outprefix>.tsv``.  The temporary directory is removed unless
    ``self.debug`` is true.

    Raises:
        Error: if the temporary directory cannot be created or entered
            (``'old'`` download only).
    """
    outprefix = os.path.abspath(outprefix)
    final_fasta = outprefix + '.fa'
    final_tsv = outprefix + '.tsv'
    tmpdir = outprefix + '.tmp.download'
    current_dir = os.getcwd()

    try:
        if self.version == 'old':
            try:
                os.mkdir(tmpdir)
                os.chdir(tmpdir)
            except OSError:
                raise Error('Error mkdir/chdir ' + tmpdir)

            zipfile = 'virulencefinder.zip'
            cmd = 'curl -X POST --data "folder=virulencefinder&filename=virulencefinder.zip" -o ' + zipfile + ' https://cge.cbs.dtu.dk/cge/download_data.php'
            print('Downloading data with:', cmd, sep='\n')
            common.syscall(cmd)
            common.syscall('unzip ' + zipfile)
        else:
            RefGenesGetter._get_genetic_epi_database_from_bitbucket('virulencefinder', tmpdir, git_commit=self.version)
            os.chdir(tmpdir)

        print('Combining downloaded fasta files...')
        fout_fa = pyfastaq.utils.open_file_write(final_fasta)
        fout_tsv = pyfastaq.utils.open_file_write(final_tsv)
        name_count = {}

        try:
            for filename in os.listdir(tmpdir):
                if not filename.endswith('.fsa'):
                    continue

                print(' ', filename)
                fix_file = os.path.join(tmpdir, filename + '.fix.fsa')
                RefGenesGetter._fix_virulencefinder_fasta_file(os.path.join(tmpdir, filename), fix_file)

                for seq in pyfastaq.sequences.file_reader(fix_file):
                    original_id = seq.id
                    seq.id = seq.id.replace('_', '.', 1)
                    seq.id = seq.id.replace(' ', '_')
                    if seq.id in name_count:
                        name_count[seq.id] += 1
                        seq.id = seq.id + '.' + str(name_count[seq.id])
                    else:
                        name_count[seq.id] = 1
                    print(seq, file=fout_fa)
                    print(seq.id, '0', '0', '.', '.', 'Original name was ' + original_id, sep='\t', file=fout_tsv)
        finally:
            pyfastaq.utils.close(fout_fa)
            pyfastaq.utils.close(fout_tsv)

        print('\nFinished combining files\n')
    finally:
        # Always return to the starting directory, even on failure.
        os.chdir(current_dir)

    if not self.debug:
        common.rmtree(tmpdir)

    print('Finished. Final files are:', final_fasta, final_tsv, sep='\n\t', end='\n\n')
    print('You can use them with ARIBA like this:')
    print('ariba prepareref -f', final_fasta, '-m', final_tsv, 'output_directory\n')
    print('If you use this downloaded data, please cite:')
    print('"Real-time whole-genome sequencing for routine typing, surveillance, and outbreak detection of verotoxigenic Escherichia coli", Joensen et al 2014, PMID: 24574290\n')
def run(self):
    """Submit the cd-hit command and return the clusters read from its report.

    The command comes from ``self.get_run_cmd`` and is handed to
    ``common.syscall``; all outputs go into a fresh temporary directory under
    the current working directory.  The ``.bak.clstr`` report is read by
    ``self._get_clusters_from_bak_file``.  The temporary directory is removed
    whether or not a step fails.

    Returns:
        Whatever ``self._get_clusters_from_bak_file`` returns.
    """
    tmpdir = tempfile.mkdtemp(prefix='tmp.run_cd-hit.', dir=os.getcwd())
    try:
        cdhit_fasta = os.path.join(tmpdir, 'cdhit')
        cluster_info_outfile = cdhit_fasta + '.bak.clstr'
        cmd = self.get_run_cmd(cdhit_fasta)
        common.syscall(cmd, verbose=self.verbose)
        clusters = self._get_clusters_from_bak_file(cluster_info_outfile, self.min_cluster_number)
    finally:
        # Previously a failing step left tmp.run_cd-hit.* directories behind.
        common.rmtree(tmpdir)

    return clusters
def _assemble_with_velvet(self):
    """Assemble the reads with velveth and velvetg, using the gene as reference.

    Reads are first mapped to ``self.gene_fa`` with ``mapping.run_bowtie2``;
    the unsorted BAM is given to velveth.  Both velvet commands are handed to
    ``common.syscall`` with ``allow_fail=True`` from inside
    ``self.assembly_dir``.  On failure the returned error text is written to
    ``velveth_errors`` / ``velvetg_errors`` and ``'assembly_fail'`` is added to
    ``self.status_flag``.  On success a symlink named after
    ``self.assembly_contigs`` is made to velvet's ``contigs.fa``.

    Sets ``self.velveth_ok`` and, if velveth succeeded, ``self.assembled_ok``.
    """
    # map reads to reference gene to make BAM input to velvet columbus
    mapping.run_bowtie2(
        self.reads1,
        self.reads2,
        self.gene_fa,
        self.gene_bam[:-4],
        threads=self.threads,
        sort=True,
        samtools=self.samtools_exe,
        bowtie2=self.bowtie2_exe,
        bowtie2_preset=self.bowtie2_preset,
        verbose=self.verbose,
    )

    cmd = ' '.join([
        self.velveth,
        self.assembler_dir,
        str(self.assembly_kmer),
        '-reference', self.gene_fa,
        '-shortPaired -bam', self.gene_bam[:-4] + '.unsorted.bam'
    ])

    cwd = os.getcwd()
    os.chdir(self.assembly_dir)
    try:
        velvet_contigs = os.path.join(os.path.split(self.assembler_dir)[1], 'contigs.fa')

        self.velveth_ok, err = common.syscall(cmd, verbose=self.verbose, allow_fail=True)
        if not self.velveth_ok:
            # The with-block closes the file; no explicit close() is needed.
            with open('velveth_errors', 'w') as f:
                print(err, file=f)
            self.status_flag.add('assembly_fail')
            return

        cmd = ' '.join([
            self.velvetg,
            self.assembler_dir,
            '-ins_length', str(int(self.reads_insert)),
            '-scaffolding no',
            '-exp_cov auto',
            '-very_clean yes',
            '-cov_cutoff auto',
        ])

        self.assembled_ok, err = common.syscall(cmd, verbose=self.verbose, allow_fail=True)
        if self.assembled_ok:
            os.symlink(velvet_contigs, os.path.basename(self.assembly_contigs))
        else:
            with open('velvetg_errors', 'w') as f:
                print(err, file=f)
            self.status_flag.add('assembly_fail')
    finally:
        # Covers the early return and any exception raised above.
        os.chdir(cwd)
def _get_from_plasmidfinder(self, outprefix):
    """Download the PlasmidFinder archive and merge its FASTA files for ARIBA.

    The curl and unzip commands are handed to ``common.syscall`` from inside
    ``<outprefix>.tmp.download``.  Each record of every ``*.fsa`` file is
    renamed (first ``_`` becomes ``.``; repeated names get ``.<count>``
    appended) and written to ``<outprefix>.fa`` with a metadata row in
    ``<outprefix>.tsv``.  The temporary directory is then removed.

    Raises:
        Error: if the temporary directory cannot be created or entered.
    """
    outprefix = os.path.abspath(outprefix)
    final_fasta, final_tsv = outprefix + ".fa", outprefix + ".tsv"
    tmpdir = outprefix + ".tmp.download"
    start_dir = os.getcwd()

    try:
        os.mkdir(tmpdir)
        os.chdir(tmpdir)
    except:
        raise Error("Error mkdir/chdir " + tmpdir)

    zipfile = "plasmidfinder.zip"
    cmd = "".join([
        'curl -X POST --data "folder=plasmidfinder&filename=plasmidfinder.zip" -o ',
        zipfile,
        " https://cge.cbs.dtu.dk/cge/download_data.php",
    ])
    print("Downloading data with:", cmd, sep="\n")
    common.syscall(cmd)
    common.syscall("unzip " + zipfile)

    print("Combining downloaded fasta files...")
    fasta_out = pyfastaq.utils.open_file_write(final_fasta)
    tsv_out = pyfastaq.utils.open_file_write(final_tsv)
    times_seen = {}

    for filename in os.listdir(tmpdir):
        if not filename.endswith(".fsa"):
            continue

        print(" ", filename)
        for seq in pyfastaq.sequences.file_reader(os.path.join(tmpdir, filename)):
            original_id = seq.id
            new_id = original_id.replace("_", ".", 1)
            # Count under the un-suffixed name; later repeats get a suffix.
            occurrence = times_seen.get(new_id, 0) + 1
            times_seen[new_id] = occurrence
            if occurrence > 1:
                new_id = new_id + "." + str(occurrence)
            seq.id = new_id
            print(seq, file=fasta_out)
            print(seq.id, "0", "0", ".", ".", "Original name was " + original_id, sep="\t", file=tsv_out)

    pyfastaq.utils.close(fasta_out)
    pyfastaq.utils.close(tsv_out)
    print("\nFinished combining files\n")
    os.chdir(start_dir)
    shutil.rmtree(tmpdir)

    print("Finished. Final files are:", final_fasta, final_tsv, sep="\n\t", end="\n\n")
    print("You can use them with ARIBA like this:")
    print("ariba prepareref -f", final_fasta, "-m", final_tsv, "output_directory\n")
    print("If you use this downloaded data, please cite:")
    print('"PlasmidFinder and pMLST: in silico detection and typing of plasmids", Carattoli et al 2014, PMID: 24777092\n')
def _get_genetic_epi_database_from_bitbucket(cls, db_name, outdir, git_commit=None):
    """Clone one of the genomicepidemiology Bitbucket databases into ``outdir``.

    ``db_name`` must be 'plasmidfinder', 'resfinder' or 'virulencefinder'
    (checked with an assert).  If ``git_commit`` is given, that commit is
    checked out after cloning.  The commit in use is then shown by running
    ``git log -n 1`` in ``outdir``.
    """
    known_databases = {'plasmidfinder', 'resfinder', 'virulencefinder'}
    assert db_name in known_databases

    clone_cmd = ''.join(['git clone ', 'https://bitbucket.org/genomicepidemiology/', db_name, '_db.git ', outdir])
    common.syscall(clone_cmd)

    in_outdir = 'cd ' + outdir + ' && '
    if git_commit is not None:
        common.syscall(in_outdir + 'git checkout ' + git_commit)

    print('Using this git commit for ' + db_name + ' database:')
    subprocess.check_call(in_outdir + 'git log -n 1', shell=True)
def _run_cdhit_est_2d(reference, reads, outfile, cdhitest2d, verbose=False, verbose_fh=None):
    """Submit a cd-hit-est-2d style command, then delete ``outfile``.

    The command line is ``cdhitest2d -i reference -i2 reads -G 0 -M 0 -d 0
    -aS 0.95 -o outfile`` and is handed to ``common.syscall``.  ``outfile``
    itself is removed straight afterwards.
    """
    # NOTE(review): presumably callers only need a side product of the run
    # (such as a cluster report next to outfile) - confirm against callers.
    arguments = [
        cdhitest2d,
        '-i', reference,
        '-i2', reads,
        '-G 0 -M 0 -d 0 -aS 0.95',
        '-o', outfile,
    ]
    common.syscall(' '.join(arguments), verbose=verbose, verbose_filehandle=verbose_fh)
    os.unlink(outfile)
def _get_from_plasmidfinder(self, outprefix):
    """Fetch the PlasmidFinder database and merge its FASTA files for ARIBA.

    With ``self.version == 'old'`` the zip archive is downloaded with curl and
    unzipped inside ``<outprefix>.tmp.download``; otherwise the database is
    obtained through ``RefGenesGetter._get_genetic_epi_database_from_bitbucket``
    at commit ``self.version``.  Each record of every ``*.fsa`` file is
    renamed (first ``_`` becomes ``.``; repeats get ``.<count>`` appended) and
    written to ``<outprefix>.fa`` with a metadata row in ``<outprefix>.tsv``.
    The temporary directory is removed unless ``self.debug`` is true.

    Raises:
        Error: if the temporary directory cannot be created or entered
            (``'old'`` download only).
    """
    outprefix = os.path.abspath(outprefix)
    final_fasta = outprefix + '.fa'
    final_tsv = outprefix + '.tsv'
    tmpdir = outprefix + '.tmp.download'
    current_dir = os.getcwd()

    try:
        if self.version == 'old':
            try:
                os.mkdir(tmpdir)
                os.chdir(tmpdir)
            except OSError:
                raise Error('Error mkdir/chdir ' + tmpdir)

            zipfile = 'plasmidfinder.zip'
            cmd = 'curl -X POST --data "folder=plasmidfinder&filename=plasmidfinder.zip" -o ' + zipfile + ' https://cge.cbs.dtu.dk/cge/download_data.php'
            print('Downloading data with:', cmd, sep='\n')
            common.syscall(cmd)
            common.syscall('unzip ' + zipfile)
        else:
            RefGenesGetter._get_genetic_epi_database_from_bitbucket('plasmidfinder', tmpdir, git_commit=self.version)
            os.chdir(tmpdir)

        print('Combining downloaded fasta files...')
        fout_fa = pyfastaq.utils.open_file_write(final_fasta)
        fout_tsv = pyfastaq.utils.open_file_write(final_tsv)
        name_count = {}

        try:
            for filename in os.listdir(tmpdir):
                if not filename.endswith('.fsa'):
                    continue

                print(' ', filename)
                for seq in pyfastaq.sequences.file_reader(os.path.join(tmpdir, filename)):
                    original_id = seq.id
                    seq.id = seq.id.replace('_', '.', 1)
                    if seq.id in name_count:
                        name_count[seq.id] += 1
                        seq.id = seq.id + '.' + str(name_count[seq.id])
                    else:
                        name_count[seq.id] = 1
                    print(seq, file=fout_fa)
                    print(seq.id, '0', '0', '.', '.', 'Original name was ' + original_id, sep='\t', file=fout_tsv)
        finally:
            pyfastaq.utils.close(fout_fa)
            pyfastaq.utils.close(fout_tsv)

        print('\nFinished combining files\n')
    finally:
        # Always return to the starting directory, even on failure.
        os.chdir(current_dir)

    if not self.debug:
        shutil.rmtree(tmpdir)

    print('Finished. Final files are:', final_fasta, final_tsv, sep='\n\t', end='\n\n')
    print('You can use them with ARIBA like this:')
    print('ariba prepareref -f', final_fasta, '-m', final_tsv, 'output_directory\n')
    print('If you use this downloaded data, please cite:')
    print('"PlasmidFinder and pMLST: in silico detection and typing of plasmids", Carattoli et al 2014, PMID: 24777092\n')
def _dist(self, outfile):
    """Build the ``mash dist`` pipeline for the two sketches and submit it.

    The pipeline compares ``self.reference_fa + '.msh'`` with
    ``self.query_fa + '.msh'``, sorts numerically on column 3 and redirects
    to ``outfile``.  It is handed to ``common.syscall`` with output directed
    to ``self.log_fh``.
    """
    mash = self.extern_progs.exe("mash")
    reference_sketch = self.reference_fa + ".msh"
    query_sketch = self.query_fa + ".msh"
    pipeline = " ".join([mash, "dist", reference_sketch, query_sketch, "| sort -k3n >", outfile])
    common.syscall(pipeline, verbose=True, verbose_filehandle=self.log_fh)
def _get_from_srst2_argannot(self, outprefix):
    """Fetch the SRST2 copy of ARG-ANNOT (r1 or r2) and convert it for ARIBA.

    ``self.version`` defaults to ``'r2'`` when it is None, and the default is
    stored back on the instance.  The FASTA is downloaded with wget from the
    SRST2 master branch to ``outprefix + '.original.fa'``.  Each record is
    renamed to ``<cluster_name>.<first word of the original header>`` and
    written to ``outprefix + '.fa'`` with a metadata row in
    ``outprefix + '.tsv'``.  The download is deleted unless ``self.debug`` is
    true.

    Raises:
        Error: if ``self.version`` is neither 'r1' nor 'r2'.
        ValueError: from the tuple unpacking, when a header is not two
            whitespace-separated words or its first word does not have four
            ``__``-separated fields.
    """
    if self.version is None:
        self.version = 'r2'

    filename_suffix = {'r1': '.r1', 'r2': '_r2'}
    if self.version not in filename_suffix:
        raise Error('srst2_argannot version must be r1 or r2. Got this: ' + self.version)

    url = ''.join([
        'https://raw.githubusercontent.com/katholt/srst2/master/data/ARGannot',
        filename_suffix[self.version],
        '.fasta',
    ])
    downloaded_fa = outprefix + '.original.fa'
    common.syscall(' '.join(['wget -O', downloaded_fa, url]), verbose=True)

    final_fasta = outprefix + '.fa'
    final_tsv = outprefix + '.tsv'
    fasta_out = pyfastaq.utils.open_file_write(final_fasta)
    meta_out = pyfastaq.utils.open_file_write(final_tsv)

    for seq in pyfastaq.sequences.file_reader(downloaded_fa):
        original_id = seq.id
        # The unused fields are still unpacked: the unpacking doubles as a
        # format check on the header.
        name, _extra = original_id.split()
        _cluster_id, cluster_name, _allele_name, _allele_id = name.split('__')
        seq.id = '.'.join([cluster_name, name])
        print(seq, file=fasta_out)
        print(seq.id, 1, 0, '.', '.', 'Original name: ' + original_id, sep='\t', file=meta_out)

    pyfastaq.utils.close(fasta_out)
    pyfastaq.utils.close(meta_out)

    if not self.debug:
        os.unlink(downloaded_fa)

    print('Finished downloading and converting data. Final files are:', final_fasta, final_tsv, sep='\n\t', end='\n\n')
    print('You can use them with ARIBA like this:')
    print('ariba prepareref -f', final_fasta, '-m', final_tsv, 'output_directory\n')
    print('If you use this downloaded data, please cite:')
    print('"SRST2: Rapid genomic surveillance for public health and hospital microbiology labs",\nInouye et al 2014, Genome Medicine, PMID: 25422674\n')
    print(argannot_ref)
def _newick_from_dist_matrix(cls, distance_file, outfile):
    """Write and submit an R script that clusters a distance table into a tree.

    The script (saved as ``outfile + '.tmp.R'``) loads the ``ape`` library,
    reads ``distance_file`` with ``read.table``, applies ``hclust(dist(...))``
    and calls ``write.tree`` with ``outfile`` as the destination.  It is
    handed to ``common.syscall`` as ``Rscript --no-save``.  The script, and a
    ``<script>out`` file if one exists, are deleted afterwards.
    """
    r_script = outfile + '.tmp.R'
    # Each tuple is printed with sep='' so the pieces join without spaces.
    script_lines = [
        ('library(ape)',),
        ('a=read.table("', distance_file, '", header=TRUE, row.names=1, comment.char="")'),
        ('h=hclust(dist(a))',),
        ('write.tree(as.phylo(h), file="', outfile, '")'),
    ]

    with open(r_script, 'w') as script_handle:
        for pieces in script_lines:
            print(*pieces, sep='', file=script_handle)

    common.syscall('Rscript --no-save ' + r_script)

    r_output = r_script + 'out'
    if os.path.exists(r_output):
        os.unlink(r_output)
    os.unlink(r_script)
def _get_from_virulencefinder(self, outprefix):
    """Download the VirulenceFinder archive and merge its FASTA files for ARIBA.

    The curl and unzip commands are handed to ``common.syscall`` from inside
    ``<outprefix>.tmp.download``.  Each record of every ``*.fsa`` file is
    renamed (first ``_`` becomes ``.``, spaces become ``_``, repeats get
    ``.<count>`` appended) and written to ``<outprefix>.fa`` with a metadata
    row in ``<outprefix>.tsv``.  The temporary directory is removed unless
    ``self.debug`` is true.

    Raises:
        Error: if the temporary directory cannot be created or entered.
    """
    outprefix = os.path.abspath(outprefix)
    final_fasta = outprefix + '.fa'
    final_tsv = outprefix + '.tsv'
    tmpdir = outprefix + '.tmp.download'
    current_dir = os.getcwd()

    try:
        os.mkdir(tmpdir)
        os.chdir(tmpdir)
    except OSError:
        raise Error('Error mkdir/chdir ' + tmpdir)

    try:
        # Named after the database actually fetched; this used to be the
        # copy-pasted 'plasmidfinder.zip'.
        zipfile = 'virulencefinder.zip'
        cmd = 'curl -X POST --data "folder=virulencefinder&filename=virulencefinder.zip" -o ' + zipfile + ' https://cge.cbs.dtu.dk/cge/download_data.php'
        print('Downloading data with:', cmd, sep='\n')
        common.syscall(cmd)
        common.syscall('unzip ' + zipfile)

        print('Combining downloaded fasta files...')
        fout_fa = pyfastaq.utils.open_file_write(final_fasta)
        fout_tsv = pyfastaq.utils.open_file_write(final_tsv)
        name_count = {}

        try:
            for filename in os.listdir(tmpdir):
                if not filename.endswith('.fsa'):
                    continue

                print(' ', filename)
                for seq in pyfastaq.sequences.file_reader(os.path.join(tmpdir, filename)):
                    original_id = seq.id
                    seq.id = seq.id.replace('_', '.', 1)
                    seq.id = seq.id.replace(' ', '_')
                    if seq.id in name_count:
                        name_count[seq.id] += 1
                        seq.id = seq.id + '.' + str(name_count[seq.id])
                    else:
                        name_count[seq.id] = 1
                    print(seq, file=fout_fa)
                    print(seq.id, '0', '0', '.', '.', 'Original name was ' + original_id, sep='\t', file=fout_tsv)
        finally:
            pyfastaq.utils.close(fout_fa)
            pyfastaq.utils.close(fout_tsv)

        print('\nFinished combining files\n')
    finally:
        # Always leave tmpdir, even when the download or conversion fails.
        os.chdir(current_dir)

    if not self.debug:
        shutil.rmtree(tmpdir)

    print('Finished. Final files are:', final_fasta, final_tsv, sep='\n\t', end='\n\n')
    print('You can use them with ARIBA like this:')
    print('ariba prepareref -f', final_fasta, '-m', final_tsv, 'output_directory\n')
    print('If you use this downloaded data, please cite:')
    print('"Real-time whole-genome sequencing for routine typing, surveillance, and outbreak detection of verotoxigenic Escherichia coli", Joensen et al 2014, PMID: 24574290\n')
def write_fa_subset(seq_names, infile, outfile, samtools_exe='samtools', verbose=False, verbose_filehandle=sys.stdout):
    """Write the named sequences from ``infile`` to ``outfile`` via ``samtools faidx``.

    Args:
        seq_names: iterable of sequence names to extract, in output order.
        infile: FASTA file to read; it is indexed first when
            ``infile + '.fai'`` does not exist.
        outfile: destination file.  An existing file is deleted first,
            because each sequence is appended with ``>>``.
        samtools_exe: samtools executable to call.
        verbose, verbose_filehandle: forwarded to ``common.syscall`` for the
            indexing command.
    """
    if not os.path.exists(infile + '.fai'):
        common.syscall(samtools_exe + ' faidx ' + infile, verbose=verbose, verbose_filehandle=verbose_filehandle)

    if os.path.exists(outfile):
        # os.path has no unlink(); the old os.path.unlink call raised
        # AttributeError whenever outfile already existed.
        os.unlink(outfile)

    for name in seq_names:
        common.syscall(' '.join([
            samtools_exe + ' faidx',
            infile,
            '"' + name + '"',
            '>>', outfile
        ]))
def _get_from_srst2_argannot(self, outprefix):
    """Download SRST2 v0.2.0's ARG-ANNOT FASTA and write ARIBA input files.

    Files touched, all derived from ``outprefix``:
      * ``.original.fa`` - the wget download; deleted unless ``self.debug``.
      * ``.fa`` - records renamed to ``<cluster_name>.<first header word>``.
      * ``.tsv`` - one tab-separated metadata row per record.

    A header that is not two whitespace-separated words, or whose first word
    does not have four ``__``-separated fields, makes the unpacking raise
    ValueError.  Usage and citation notes are printed to stdout at the end.
    """
    srst2_version = '0.2.0'
    source_url = 'https://github.com/katholt/srst2/raw/v' + srst2_version + '/data/ARGannot.r1.fasta'
    raw_fasta = outprefix + '.original.fa'
    wget_cmd = ' '.join(['wget -O', raw_fasta, source_url])
    common.syscall(wget_cmd, verbose=True)

    final_fasta, final_tsv = outprefix + '.fa', outprefix + '.tsv'
    fa_handle = pyfastaq.utils.open_file_write(final_fasta)
    tsv_handle = pyfastaq.utils.open_file_write(final_tsv)

    for record in pyfastaq.sequences.file_reader(raw_fasta):
        header = record.id
        # Only cluster_name is used; the other fields are unpacked so that a
        # malformed header fails here.
        first_word, _rest = header.split()
        _cluster_id, cluster_name, _allele_name, _allele_id = first_word.split('__')
        record.id = cluster_name + '.' + first_word
        print(record, file=fa_handle)
        print(record.id, 1, 0, '.', '.', 'Original name: ' + header, sep='\t', file=tsv_handle)

    pyfastaq.utils.close(fa_handle)
    pyfastaq.utils.close(tsv_handle)

    if not self.debug:
        os.unlink(raw_fasta)

    print('Finished downloading and converting data. Final files are:', final_fasta, final_tsv, sep='\n\t', end='\n\n')
    print('You can use them with ARIBA like this:')
    print('ariba prepareref -f', final_fasta, '-m', final_tsv, 'output_directory\n')
    print('If you use this downloaded data, please cite:')
    print('"SRST2: Rapid genomic surveillance for public health and hospital microbiology labs",\nInouye et al 2014, Genome Medicine, PMID: 25422674\n')
    print(argannot_ref)
    print('and in your methods say that the ARG-ANNOT sequences were used from version', srst2_version, 'of SRST2.')
def write_fa_subset(seq_names, infile, outfile, samtools_exe='samtools', verbose=False):
    """Write the named sequences from ``infile`` to ``outfile`` via ``samtools faidx``.

    Args:
        seq_names: iterable of sequence names to extract, in output order.
        infile: FASTA file to read; it is indexed first when
            ``infile + '.fai'`` does not exist.
        outfile: destination file.  An existing file is deleted first,
            because each sequence is appended with ``>>``.
        samtools_exe: samtools executable to call.
        verbose: forwarded to ``common.syscall`` for the indexing command.
    """
    if not os.path.exists(infile + '.fai'):
        common.syscall(samtools_exe + ' faidx ' + infile, verbose=verbose)

    if os.path.exists(outfile):
        # os.path has no unlink(); the old os.path.unlink call raised
        # AttributeError whenever outfile already existed.
        os.unlink(outfile)

    for name in seq_names:
        common.syscall(' '.join(
            [samtools_exe + ' faidx', infile, '"' + name + '"', '>>', outfile]))
def _assemble_with_spades(self, unittest=False):
    """Assemble the reads with SPAdes from inside ``self.assembly_dir``.

    The command is built from the reads, k-mer, thread count and
    ``self.gene_fa`` (given as untrusted contigs), plus ``self.spades_other``
    when that is set, and handed to ``common.syscall`` with
    ``allow_fail=True``.  On success a symlink named after
    ``self.assembly_contigs`` is made to ``scaffolds.fasta``.  On failure the
    returned error text is written to ``spades_errors`` and
    ``'assembly_fail'`` is added to ``self.status_flag``.

    Args:
        unittest: when true the command is not submitted; an empty
            ``scaffolds.fasta`` is created and the run is treated as
            successful.

    Sets ``self.assembled_ok``.
    """
    cmd = ' '.join([
        self.spades_exe,
        '-1', self.reads1,
        '-2', self.reads2,
        '-o', self.assembler_dir,
        '-k', str(self.assembly_kmer),
        '--threads', str(self.threads),
        '--untrusted-contigs', self.gene_fa,
    ])
    if self.spades_other is not None:
        cmd += ' ' + self.spades_other

    cwd = os.getcwd()
    os.chdir(self.assembly_dir)
    try:
        spades_contigs = os.path.join(os.path.split(self.assembler_dir)[1], 'scaffolds.fasta')

        if unittest:
            os.mkdir(self.assembler_dir)
            open(spades_contigs, 'w').close()
            self.assembled_ok = True
        else:
            self.assembled_ok, err = common.syscall(cmd, verbose=self.verbose, allow_fail=True)

        if self.assembled_ok:
            os.symlink(spades_contigs, os.path.basename(self.assembly_contigs))
        else:
            # The with-block closes the file; no explicit close() is needed.
            with open('spades_errors', 'w') as f:
                print(err, file=f)
            self.status_flag.add('assembly_fail')
    finally:
        # Restore the caller's directory even if something above raises.
        os.chdir(cwd)
def _get_from_resfinder(self, outprefix):
    """Download the ResFinder archive and merge its FASTA files for ARIBA.

    The curl and unzip commands are handed to ``common.syscall`` from inside
    ``<outprefix>.tmp.download``.  Every ``*.fsa`` file in the extracted
    ``database`` directory is read; each record id is prefixed with the file
    name up to its first dot, then the record is written to
    ``<outprefix>.fa`` with a metadata row in ``<outprefix>.tsv``.  The
    temporary directory is then removed.

    Raises:
        Error: if the temporary directory cannot be created or entered.
    """
    outprefix = os.path.abspath(outprefix)
    final_fasta, final_tsv = outprefix + ".fa", outprefix + ".tsv"
    tmpdir = outprefix + ".tmp.download"
    start_dir = os.getcwd()

    try:
        os.mkdir(tmpdir)
        os.chdir(tmpdir)
    except:
        raise Error("Error mkdir/chdir " + tmpdir)

    zipfile = "resfinder.zip"
    cmd = "".join([
        'curl -X POST --data "folder=resfinder&filename=resfinder.zip" -o ',
        zipfile,
        " https://cge.cbs.dtu.dk/cge/download_data.php",
    ])
    print("Downloading data with:", cmd, sep="\n")
    common.syscall(cmd)
    common.syscall("unzip " + zipfile)

    print("Combining downloaded fasta files...")
    fasta_out = pyfastaq.utils.open_file_write(final_fasta)
    tsv_out = pyfastaq.utils.open_file_write(final_tsv)

    # "database" is relative: the process is still inside tmpdir here.
    for filename in os.listdir("database"):
        if not filename.endswith(".fsa"):
            continue

        print(" ", filename)
        file_prefix = filename.partition(".")[0]
        for seq in pyfastaq.sequences.file_reader(os.path.join("database", filename)):
            seq.id = ".".join([file_prefix, seq.id])
            print(seq, file=fasta_out)
            print(seq.id, "1", "0", ".", ".", ".", sep="\t", file=tsv_out)

    pyfastaq.utils.close(fasta_out)
    pyfastaq.utils.close(tsv_out)
    print("\nFinished combining files\n")
    os.chdir(start_dir)
    shutil.rmtree(tmpdir)

    print("Finished. Final files are:", final_fasta, final_tsv, sep="\n\t", end="\n\n")
    print("You can use them with ARIBA like this:")
    print("ariba prepareref -f", final_fasta, "-m", final_tsv, "output_directory\n")
    print("If you use this downloaded data, please cite:")
    print('"Identification of acquired antimicrobial resistance genes", Zankari et al 2012, PMID: 22782487\n')
def _scaffold_with_sspace(self):
    """Make the scaffolds file from the assembled contigs.

    As written, the ``if True:`` branch always runs: ``self.scaffold_dir`` is
    created, then a symlink named after ``self.scaffolder_scaffolds`` is made
    inside ``self.working_dir`` pointing at ``self.assembly_contigs``, and the
    method returns.  Everything after that ``return`` (writing the SSPACE
    library file, submitting the command, copying its log to ``self.log_fh``,
    renaming its scaffolds and optional cleanup) is currently unreachable and
    kept only as a disabled option.

    Raises:
        Error: if the contigs file does not exist or the scaffold directory
            cannot be created.
    """
    if not os.path.exists(self.assembly_contigs):
        raise Error('Cannot scaffold because contigs file not found: ' + self.assembly_contigs)

    try:
        os.mkdir(self.scaffold_dir)
    except:
        raise Error('Error mkdir '+ self.scaffold_dir)

    # Remembered so the working directory can be restored before returning.
    cwd = os.getcwd()

    #if self.extern_progs.exe('sspace') is None:
    if True: # no longer use sspace, but leave the option here just in case
        os.chdir(self.working_dir)
        os.symlink(self.assembly_contigs, os.path.basename(self.scaffolder_scaffolds))
        os.chdir(cwd)
        return

    # ---- Unreachable while the condition above is hard-coded to True ----
    os.chdir(self.scaffold_dir)
    lib_file = 'lib'
    with open(lib_file, 'w') as f:
        print('LIB', self.reads1, self.reads2, int(self.reads_insert), self.sspace_sd, 'FR', file=f)

    cmd = ' '.join([
        'perl', self.extern_progs.exe('sspace'),
        '-k', str(self.sspace_k),
        '-l', lib_file,
        '-s', self.assembly_contigs
    ])

    common.syscall(cmd, verbose=True, verbose_filehandle=self.log_fh)
    # Both paths are resolved relative to the scaffold directory (the cwd here).
    sspace_scaffolds = os.path.abspath('standard_output.final.scaffolds.fasta')
    sspace_log = os.path.abspath('standard_output.logfile.txt')

    # Copy the SSPACE log into our own log, between banner lines.
    with open(sspace_log) as f:
        print('\n_______________ SSPACE log __________________\n', file=self.log_fh)
        for line in f:
            print(line.rstrip(), file=self.log_fh)
        print('_______________ End of SSPACE log __________________\n', file=self.log_fh)

    os.rename(sspace_scaffolds, self.scaffolder_scaffolds)
    os.chdir(cwd)

    if self.clean:
        print('Deleting scaffolding directory', self.scaffold_dir, file=self.log_fh)
        shutil.rmtree(self.scaffold_dir)
def _assemble_with_spades(self, unittest=False):
    """Assemble the reads with SPAdes from inside ``self.working_dir``.

    The command is handed to ``common.syscall`` with ``allow_fail=True`` and
    its output directed to ``self.log_fh``.  On success ``scaffolds.fasta`` is
    renamed to the base name of ``self.assembly_contigs``; on failure the
    returned error text is written to the log.  If present, ``spades.log``
    (after being passed to ``self._check_spades_log_file``) and
    ``warnings.log`` are copied into the log.  With ``self.clean`` set, the
    assembler directory is deleted at the end.

    Args:
        unittest: when true the command is not submitted; an empty
            ``scaffolds.fasta`` is created and the run is treated as
            successful.

    Sets ``self.assembled_ok``.

    Raises:
        Error: if ``self.working_dir`` cannot be entered.
    """
    cmd = ' '.join([
        self.extern_progs.exe('spades'),
        '-1', self.reads1,
        '-2', self.reads2,
        '-o', self.assembler_dir,
        '-k', str(self.assembly_kmer),
        '--threads 1', # otherwise defaults to 16!
        '--untrusted-contigs', self.ref_fasta,
    ])
    if self.spades_other_options is not None:
        cmd += ' ' + self.spades_other_options

    # Remembered so the working directory can be restored at the end.
    cwd = os.getcwd()
    try:
        os.chdir(self.working_dir)
    except:
        raise Error('Error chdir ' + self.working_dir)
    # Relative path: <last component of assembler_dir>/scaffolds.fasta
    spades_contigs = os.path.join(os.path.split(self.assembler_dir)[1], 'scaffolds.fasta')

    if unittest:
        # Fake a successful run by creating an empty contigs file.
        os.mkdir(self.assembler_dir)
        open(spades_contigs, 'w').close()
        self.assembled_ok = True
    else:
        self.assembled_ok, err = common.syscall(cmd, verbose=True, allow_fail=True, verbose_filehandle=self.log_fh, print_errors=False)

    if self.assembled_ok:
        os.rename(spades_contigs, os.path.basename(self.assembly_contigs))
    else:
        print('Assembly finished with errors. These are the errors:', file=self.log_fh)
        print(err, file=self.log_fh)
        print('\nEnd of spades errors\n', file=self.log_fh)

    # Copy the SPAdes log, if there is one, into our own log.
    spades_log = os.path.join(self.assembler_dir, 'spades.log')
    if os.path.exists(spades_log):
        self._check_spades_log_file(spades_log)

        with open(spades_log) as f:
            print('\n______________ SPAdes log ___________________\n', file=self.log_fh)
            for line in f:
                print(line.rstrip(), file=self.log_fh)
            print('\n______________ End of SPAdes log _________________\n', file=self.log_fh)

    # Same for the SPAdes warnings file.
    spades_warnings = os.path.join(self.assembler_dir, 'warnings.log')
    if os.path.exists(spades_warnings):
        with open(spades_warnings) as f:
            print('\n______________ SPAdes warnings ___________________\n', file=self.log_fh)
            for line in f:
                print(line.rstrip(), file=self.log_fh)
            print('\n______________ End of SPAdes warnings _________________\n', file=self.log_fh)

    os.chdir(cwd)

    if self.clean:
        print('Deleting assembly directory', self.assembler_dir, file=self.log_fh)
        shutil.rmtree(self.assembler_dir)
def run_bowtie2(
    reads_fwd,
    reads_rev,
    ref_fa,
    out_prefix,
    threads=1,
    max_insert=1000,
    sort=False,
    samtools='samtools',
    bowtie2='bowtie2',
    bowtie2_preset='very-sensitive-local',
    verbose=False
):
    """Map paired reads to ``ref_fa`` with bowtie2 and write a BAM file.

    A bowtie2 index is built at ``out_prefix + '.map_index'``, the reads are
    mapped and piped through ``samtools view`` into a BAM.  With ``sort``
    false that BAM is ``out_prefix + '.bam'``.  With ``sort`` true the mapping
    goes to ``out_prefix + '.unsorted.bam'``, which is then sorted (at most 4
    threads, 500M split between them) and indexed.  The six index files are
    deleted at the end; the unsorted BAM is not.

    All commands are handed to ``common.syscall``.
    """
    map_index = out_prefix + '.map_index'
    index_suffixes = ['1', '2', '3', '4', 'rev.1', 'rev.2']
    index_files = ['.'.join([map_index, suffix, 'bt2']) for suffix in index_suffixes]

    build_cmd = ' '.join([bowtie2 + '-build', '-q', ref_fa, map_index])

    final_bam = out_prefix + '.bam'
    intermediate_bam = out_prefix + '.unsorted.bam' if sort else final_bam

    # Mapping uses the caller's thread count; only sorting is capped below.
    map_cmd = ' '.join([
        bowtie2,
        '--threads', str(threads),
        '--' + bowtie2_preset,
        '-X', str(max_insert),
        '-x', map_index,
        '-1', reads_fwd,
        '-2', reads_rev,
        '|', samtools, 'view',
        '-bS -T', ref_fa,
        '- >', intermediate_bam
    ])

    common.syscall(build_cmd, verbose=verbose)
    common.syscall(map_cmd, verbose=verbose)

    if sort:
        sort_threads = min(4, threads)
        memory_per_thread = int(500 / sort_threads)
        sort_cmd = ' '.join([
            samtools, 'sort',
            '-@' + str(sort_threads),
            '-m', str(memory_per_thread) + 'M',
            intermediate_bam,
            out_prefix,
        ])
        bam_index_cmd = samtools + ' index ' + final_bam
        common.syscall(sort_cmd, verbose=verbose)
        common.syscall(bam_index_cmd, verbose=verbose)

    for index_file in index_files:
        os.unlink(index_file)
def bowtie2_index(ref_fa, outprefix, bowtie2='bowtie2', verbose=False, verbose_filehandle=sys.stdout):
    """Build a bowtie2 index for ``ref_fa`` unless all its files already exist.

    The six expected files are ``outprefix + '.<x>.bt2'`` for x in 1, 2, 3, 4,
    rev.1 and rev.2.  If every one exists the function returns without doing
    anything; otherwise ``<bowtie2>-build -q ref_fa outprefix`` is handed to
    ``common.syscall``.
    """
    index_suffixes = ['1', '2', '3', '4', 'rev.1', 'rev.2']
    expected_files = [outprefix + '.' + suffix + '.bt2' for suffix in index_suffixes]

    if all(os.path.exists(path) for path in expected_files):
        return

    build_args = [bowtie2 + '-build', '-q', ref_fa, outprefix]
    common.syscall(' '.join(build_args), verbose=verbose, verbose_filehandle=verbose_filehandle)
def _get_from_argannot(self, outprefix):
    """Download the ARG-ANNOT zip archive and convert it to ARIBA inputs.

    The archive is fetched with ``self._download_file`` and unzipped inside
    ``<outprefix>.tmp.download``.  Each record of
    ``Database Nt Sequences File.txt`` has ``(X)`` in its id rewritten to
    ``X.`` and is written to ``<outprefix>.fa``, with a metadata row in
    ``<outprefix>.tsv``.  The temporary directory is then removed.

    Raises:
        Error: if the temporary directory cannot be created or entered.
    """
    outprefix = os.path.abspath(outprefix)
    tmpdir = outprefix + ".tmp.download"
    start_dir = os.getcwd()

    try:
        os.mkdir(tmpdir)
        os.chdir(tmpdir)
    except:
        raise Error("Error mkdir/chdir " + tmpdir)

    zipfile = "arg-annot-database_doc.zip"
    archive_url = "http://www.mediterranee-infection.com/arkotheque/client/ihumed/_depot_arko/articles/304/arg-annot-database_doc.zip"
    self._download_file(archive_url, zipfile)
    common.syscall("unzip " + zipfile)
    os.chdir(start_dir)
    print("Extracted files.")

    genes_file = os.path.join(tmpdir, "Database Nt Sequences File.txt")
    final_fasta, final_tsv = outprefix + ".fa", outprefix + ".tsv"

    records = pyfastaq.sequences.file_reader(genes_file)
    tsv_out = pyfastaq.utils.open_file_write(final_tsv)
    fasta_out = pyfastaq.utils.open_file_write(final_fasta)

    # Greedy match from the first "(" to the last ")".
    bracketed = re.compile(r"\((.*)\)")
    for seq in records:
        original_id = seq.id
        seq.id = bracketed.sub(r"\1.", original_id)
        print(seq, file=fasta_out)
        print(seq.id, "1", "0", ".", ".", "Original name was " + original_id, sep="\t", file=tsv_out)

    pyfastaq.utils.close(tsv_out)
    pyfastaq.utils.close(fasta_out)
    shutil.rmtree(tmpdir)

    print("Finished. Final files are:", final_fasta, final_tsv, sep="\n\t", end="\n\n")
    print("You can use them with ARIBA like this:")
    print("ariba prepareref -f", final_fasta, "-m", final_tsv, "output_directory\n")
    print("If you use this downloaded data, please cite:")
    print(argannot_ref)
def _scaffold_with_sspace(self):
    """Scaffold the assembly contigs with SSPACE and symlink the result.

    If self.sspace_exe is None no scaffolding is run: the scaffolds path is
    made a symlink to the contigs file inside self.assembly_dir.  Otherwise
    SSPACE is run with perl inside self.scaffold_dir and its
    'standard_output.final.scaffolds.fasta' is symlinked from
    self.assembly_dir under the basename of self.scaffolder_scaffolds.

    The working directory is restored before returning, including when a
    step fails.

    Raises:
        Error: if the contigs file is missing or self.scaffold_dir cannot
            be created.
    """
    if not os.path.exists(self.assembly_contigs):
        raise Error('Cannot scaffold because contigs file not found: ' + self.assembly_contigs)

    try:
        os.mkdir(self.scaffold_dir)
    except OSError as err:
        raise Error('Error mkdir ' + self.scaffold_dir) from err

    cwd = os.getcwd()
    try:
        if self.sspace_exe is None:
            os.chdir(self.assembly_dir)
            os.symlink(os.path.basename(self.assembly_contigs), os.path.basename(self.scaffolder_scaffolds))
            return

        os.chdir(self.scaffold_dir)
        lib_file = 'lib'
        with open(lib_file, 'w') as f:
            print('LIB', self.reads1, self.reads2, int(self.reads_insert), self.sspace_sd, 'FR', file=f)

        cmd = ' '.join([
            'perl', self.sspace_exe,
            '-k', str(self.sspace_k),
            '-l', lib_file,
            '-s', self.assembly_contigs
        ])
        # Resolved while still inside scaffold_dir, so this is the absolute
        # path of the file SSPACE is expected to write there.
        sspace_scaffolds = os.path.abspath('standard_output.final.scaffolds.fasta')
        common.syscall(cmd, verbose=self.verbose)

        os.chdir(self.assembly_dir)
        # relpath is taken relative to assembly_dir (the new cwd).
        os.symlink(os.path.relpath(sspace_scaffolds), os.path.basename(self.scaffolder_scaffolds))
    finally:
        os.chdir(cwd)
def _get_from_argannot(self, outprefix):
    """Download the ARG-ANNOT database and write outprefix.fa / outprefix.tsv.

    The zip archive is fetched with common.download_file (using
    self.max_download_attempts and self.sleep_time) and unpacked inside
    outprefix + '.tmp.download'.  That directory is removed afterwards
    unless self.debug is set.  For each sequence only the first
    whitespace-separated field of the id is kept, with its parenthesised
    part rewritten from "(X)" to "X."; the metadata TSV records the
    original id.

    Raises:
        Error: if the temporary directory cannot be created or entered.
    """
    outprefix = os.path.abspath(outprefix)
    tmpdir = outprefix + '.tmp.download'
    current_dir = os.getcwd()

    try:
        os.mkdir(tmpdir)
        os.chdir(tmpdir)
    except OSError as err:
        raise Error('Error mkdir/chdir ' + tmpdir) from err

    zip_name = 'arg-annot-database_doc.zip'
    try:
        common.download_file(
            'http://www.mediterranee-infection.com/arkotheque/client/ihumed/_depot_arko/articles/304/arg-annot-database_doc.zip',
            zip_name,
            max_attempts=self.max_download_attempts,
            sleep_time=self.sleep_time,
            verbose=True,
        )
        common.syscall('unzip ' + zip_name)
    finally:
        # Always return to the caller's directory, even if the download or
        # the unzip step fails.
        os.chdir(current_dir)

    print('Extracted files.')
    genes_file = os.path.join(tmpdir, 'Database Nt Sequences File.txt')
    final_fasta = outprefix + '.fa'
    final_tsv = outprefix + '.tsv'

    seq_reader = pyfastaq.sequences.file_reader(genes_file)
    f_out_tsv = pyfastaq.utils.open_file_write(final_tsv)
    f_out_fa = pyfastaq.utils.open_file_write(final_fasta)
    try:
        for seq in seq_reader:
            original_id = seq.id
            # Greedy match: everything between the first "(" and the last ")".
            seq.id = re.sub(r'\((.*)\)', r'\1.', seq.id.split()[0])
            print(seq, file=f_out_fa)
            print(seq.id, '1', '0', '.', '.', 'Original name: ' + original_id, sep='\t', file=f_out_tsv)
    finally:
        pyfastaq.utils.close(f_out_tsv)
        pyfastaq.utils.close(f_out_fa)

    if not self.debug:
        shutil.rmtree(tmpdir)

    print('Finished. Final files are:', final_fasta, final_tsv, sep='\n\t', end='\n\n')
    print('You can use them with ARIBA like this:')
    print('ariba prepareref -f', final_fasta, '-m', final_tsv, 'output_directory\n')
    print('If you use this downloaded data, please cite:')
    print(argannot_ref)
def run(self):
    """Run cd-hit-est on self.infile and return the parsed clusters.

    cd-hit-est writes into a fresh temporary directory created in the
    current working directory.  The '.bak.clstr' file it produces is parsed
    by self._get_clusters_from_bak_file.  The temporary directory is removed
    whether or not the run succeeds.

    Returns:
        Whatever self._get_clusters_from_bak_file returns for the cluster
        file and self.min_cluster_number.
    """
    tmpdir = tempfile.mkdtemp(prefix='tmp.run_cd-hit.', dir=os.getcwd())
    try:
        cdhit_fasta = os.path.join(tmpdir, 'cdhit')
        cluster_info_outfile = cdhit_fasta + '.bak.clstr'

        cmd = ' '.join([
            self.cd_hit_est,
            '-i', self.infile,
            '-o', cdhit_fasta,
            '-c', str(self.seq_identity_threshold),
            '-T', str(self.threads),
            '-s', str(self.length_diff_cutoff),
            '-d 0',
            '-bak 1',
        ])

        common.syscall(cmd, verbose=self.verbose)
        clusters = self._get_clusters_from_bak_file(cluster_info_outfile, self.min_cluster_number)
    finally:
        # Do not leave the temporary directory behind when cd-hit or the
        # parsing step fails.
        shutil.rmtree(tmpdir)
    return clusters
def _gap_fill_with_gapfiller(self):
    """Gap-fill the scaffolds with GapFiller and store the renamed result.

    If self.gapfiller_exe is None, or self._has_gaps_to_fill reports nothing
    to fill, the scaffolds are passed straight to self._rename_scaffolds.
    Otherwise GapFiller is run with perl inside self.gapfill_dir and its
    'standard_output/standard_output.gapfilled.final.fa' is renamed into
    self.gapfilled_scaffolds.

    The working directory is restored before returning, including when a
    step fails.

    Raises:
        Error: if the scaffolds file is missing or self.gapfill_dir cannot
            be created.
    """
    if not os.path.exists(self.scaffolder_scaffolds):
        raise Error('Cannot gap fill because scaffolds file not found: ' + self.scaffolder_scaffolds)

    cwd = os.getcwd()

    if self.gapfiller_exe is None or not self._has_gaps_to_fill(self.scaffolder_scaffolds):
        self._rename_scaffolds(self.scaffolder_scaffolds, self.gapfilled_scaffolds)
        return

    try:
        os.mkdir(self.gapfill_dir)
    except OSError as err:
        raise Error('Error mkdir ' + self.gapfill_dir) from err

    os.chdir(self.gapfill_dir)
    try:
        lib_file = 'lib'
        with open(lib_file, 'w') as f:
            print('LIB', 'bwa', self.reads1, self.reads2, self.reads_insert, self.sspace_sd, 'FR', file=f)

        cmd = ' '.join([
            'perl', self.gapfiller_exe,
            '-l', lib_file,
            '-s', self.scaffolder_scaffolds
        ])
        gapfilled_scaffolds = os.path.join(
            self.gapfill_dir, 'standard_output', 'standard_output.gapfilled.final.fa')
        common.syscall(cmd, verbose=self.verbose)
        self._rename_scaffolds(gapfilled_scaffolds, self.gapfilled_scaffolds)
    finally:
        os.chdir(cwd)
def _get_from_resfinder(self, outprefix):
    """Download ResFinder and combine its fasta files into one file.

    The archive is fetched with curl and unpacked inside
    outprefix + '.tmp.download'.  Every '*.fsa' file in the unpacked
    'database' directory is appended to outprefix.presence_absence.fa, with
    each sequence id prefixed by the file name up to its first '.'.  Files
    are processed in sorted name order so the output is reproducible.  The
    temporary directory is removed on success.

    Raises:
        Error: if the temporary directory cannot be created or entered.
    """
    outprefix = os.path.abspath(outprefix)
    final_fasta = outprefix + '.presence_absence.fa'
    tmpdir = outprefix + '.tmp.download'
    current_dir = os.getcwd()

    try:
        os.mkdir(tmpdir)
        os.chdir(tmpdir)
    except OSError as err:
        raise Error('Error mkdir/chdir ' + tmpdir) from err

    try:
        zip_name = 'resfinder.zip'
        cmd = 'curl -X POST --data "folder=resfinder&filename=resfinder.zip" -o ' + zip_name + ' https://cge.cbs.dtu.dk/cge/download_data.php'
        print('Downloading data with:', cmd, sep='\n')
        common.syscall(cmd)
        common.syscall('unzip ' + zip_name)

        print('Combining downloaded fasta files...')
        f = pyfastaq.utils.open_file_write(final_fasta)
        try:
            # os.listdir order is arbitrary; sort for a stable output order.
            for filename in sorted(os.listdir('database')):
                if not filename.endswith('.fsa'):
                    continue
                print('   ', filename)
                prefix = filename.split('.')[0]
                file_reader = pyfastaq.sequences.file_reader(os.path.join('database', filename))
                for seq in file_reader:
                    seq.id = prefix + '.' + seq.id
                    print(seq, file=f)
        finally:
            pyfastaq.utils.close(f)

        print('\nCombined files. Final genes file is called', final_fasta, end='\n\n')
    finally:
        # Always return to the caller's directory, even on failure.
        os.chdir(current_dir)

    shutil.rmtree(tmpdir)

    print('You can use it with ARIBA like this:')
    print('ariba prepareref --ref_prefix', outprefix, 'output_directory\n')
    print('If you use this downloaded data, please cite:')
    print('"Identification of acquired antimicrobial resistance genes", Zankari et al 2012, PMID: 22782487\n')
def _make_assembly_vcf(self):
    """Produce the filtered variant VCF and tabix-indexed read depths file.

    Steps, each run as a shell command through common.syscall:
      1. `samtools mpileup` on the final assembly BAM into a temporary VCF.
      2. `bcftools call -m | bcftools query` into a temporary depths table,
         which is then compressed and indexed with pysam's tabix helpers.
      3. `bcftools call -m -v | bcftools filter` using the self.bcf_min_*
         thresholds, written to self.final_assembly_vcf.
    Both temporary files are deleted on the way.
    """
    raw_vcf = self.final_assembly_vcf + '.tmp'
    depths_tmp = self.final_assembly_read_depths + '.tmp'

    mpileup_args = [
        self.samtools_exe, 'mpileup',
        '-t INFO/DPR,DV',
        '-A',
        '-f', self.final_assembly_fa,
        '-u',
        '-v',
        self.final_assembly_bam,
        '>',
        raw_vcf
    ]
    common.syscall(' '.join(mpileup_args), verbose=self.verbose)

    # NOTE(review): the query format ends in '%DPR]' with an unmatched ']';
    # kept verbatim - confirm against the bcftools query documentation.
    depths_args = [
        self.bcftools_exe, 'call -m', raw_vcf, '|',
        self.bcftools_exe, 'query',
        r'''-f '%CHROM\t%POS\t%REF\t%ALT\t%DP\t%DPR]\n' ''',
        '>',
        depths_tmp
    ]
    common.syscall(' '.join(depths_args), verbose=self.verbose)

    pysam.tabix_compress(depths_tmp, self.final_assembly_read_depths)
    pysam.tabix_index(self.final_assembly_read_depths, seq_col=0, start_col=1, end_col=1)
    os.unlink(depths_tmp)

    # The filter expression is assembled from pieces that are later joined
    # with single spaces, exactly like the rest of the command.
    filter_expr_parts = [
        '"MIN(DP)>=' + str(self.bcf_min_dp),
        ' & MIN(DV)>=' + str(self.bcf_min_dv),
        ' & MIN(DV/DP)>=' + str(self.bcf_min_dv_over_dp),
        ' & QUAL >=', str(self.bcf_min_qual), '"',
    ]
    call_args = [
        self.bcftools_exe, 'call -m -v', raw_vcf, '|',
        self.bcftools_exe, 'filter', '-i',
    ]
    output_args = ['-o', self.final_assembly_vcf]
    common.syscall(' '.join(call_args + filter_expr_parts + output_args), verbose=self.verbose)

    os.unlink(raw_vcf)
def _get_from_argannot(self, outprefix):
    """Download ARG-ANNOT and write outprefix.presence_absence.fa.

    The zip archive is downloaded and unpacked inside
    outprefix + '.tmp.download'.  A warning is printed to stderr for every
    sequence name that occurs more than once in the download, then
    pyfastaq.tasks.to_unique_by_id writes the de-duplicated fasta.  The
    temporary directory is removed afterwards.

    Raises:
        Error: if the temporary directory cannot be created or entered.
    """
    outprefix = os.path.abspath(outprefix)
    tmpdir = outprefix + '.tmp.download'
    current_dir = os.getcwd()

    try:
        os.mkdir(tmpdir)
        os.chdir(tmpdir)
    except OSError as err:
        raise Error('Error mkdir/chdir ' + tmpdir) from err

    zip_name = 'arg-annot-database_doc.zip'
    try:
        self._download_file('http://www.mediterranee-infection.com/arkotheque/client/ihumed/_depot_arko/articles/304/arg-annot-database_doc.zip', zip_name)
        common.syscall('unzip ' + zip_name)
    finally:
        # Always return to the caller's directory, even if the download or
        # the unzip step fails.
        os.chdir(current_dir)

    print('Extracted files.')
    genes_file = os.path.join(tmpdir, 'Database Nt Sequences File.txt')
    final_fasta = outprefix + '.presence_absence.fa'

    # Count how often each sequence name occurs so duplicates can be reported.
    ids = {}
    for seq in pyfastaq.sequences.file_reader(genes_file):
        ids[seq.id] = ids.get(seq.id, 0) + 1

    for name, count in sorted(ids.items()):
        if count > 1:
            print('Warning! Sequence name', name, 'found', count, 'times in download. Keeping longest sequence', file=sys.stderr)

    pyfastaq.tasks.to_unique_by_id(genes_file, final_fasta)
    shutil.rmtree(tmpdir)

    print('Finished. Final genes file is called', final_fasta, end='\n\n')
    print('You can use it with ARIBA like this:')
    print('ariba prepareref --ref_prefix', outprefix, 'output_directory\n')
    print('If you use this downloaded data, please cite:')
    print('"ARG-ANNOT, a new bioinformatic tool to discover antibiotic resistance genes in bacterial genomes",\nGupta et al 2014, PMID: 24145532\n')
def _get_from_srst2_argannot(self, outprefix):
    """Download SRST2's ARG-ANNOT fasta and convert it to ARIBA input files.

    The fasta for SRST2 version 0.2.0 is fetched with wget into
    outprefix.original.fa.  Each id must consist of exactly two
    whitespace-separated fields, the first of which has exactly four
    '__'-separated parts; the new id is '<cluster_name>.<first field>'.
    Sequences go to outprefix.fa and metadata (including the original id)
    to outprefix.tsv.

    Raises:
        ValueError: if a sequence id does not have the expected layout.
    """
    srst2_version = "0.2.0"
    srst2_url = "https://github.com/katholt/srst2/raw/v" + srst2_version + "/data/ARGannot.r1.fasta"
    srst2_fa = outprefix + ".original.fa"
    command = "wget -O " + srst2_fa + " " + srst2_url
    common.syscall(command, verbose=True)

    final_fasta = outprefix + ".fa"
    final_tsv = outprefix + ".tsv"

    f_out_fa = pyfastaq.utils.open_file_write(final_fasta)
    f_out_meta = pyfastaq.utils.open_file_write(final_tsv)
    try:
        for seq in pyfastaq.sequences.file_reader(srst2_fa):
            original_id = seq.id
            # Strict unpacking is kept on purpose: it rejects ids that do
            # not have the expected number of fields.
            name, _extra = seq.id.split()
            _cluster_id, cluster_name, _allele_name, _allele_id = name.split("__")
            seq.id = cluster_name + "." + name
            print(seq, file=f_out_fa)
            print(seq.id, 1, 0, ".", ".", "Original name: " + original_id, sep="\t", file=f_out_meta)
    finally:
        # Close the outputs even when an id fails to parse.
        pyfastaq.utils.close(f_out_fa)
        pyfastaq.utils.close(f_out_meta)

    print(
        "Finished downloading and converting data. Final files are:", final_fasta, final_tsv, sep="\n\t", end="\n\n"
    )
    print("You can use them with ARIBA like this:")
    print("ariba prepareref -f", final_fasta, "-m", final_tsv, "output_directory\n")
    print("If you use this downloaded data, please cite:")
    print(
        '"SRST2: Rapid genomic surveillance for public health and hospital microbiology labs",\nInouye et al 2014, Genome Medicine, PMID: 25422674\n'
    )
    print(argannot_ref)
    print("and in your methods say that the ARG-ANNOT sequences were used from version", srst2_version, "of SRST2.")
def run_bowtie2(reads_fwd,
                reads_rev,
                ref_fa,
                out_prefix,
                threads=1,
                max_insert=1000,
                sort=False,
                samtools='samtools',
                bowtie2='bowtie2',
                bowtie2_preset='very-sensitive-local',
                verbose=False):
    """Map paired reads to ref_fa with bowtie2, producing out_prefix.bam.

    A bowtie2 index is built at out_prefix + '.map_index' and deleted again
    at the end.  Alignments are piped into `samtools view` to get BAM.  With
    sort=True the piped output is written to out_prefix.unsorted.bam and
    then passed to `samtools sort` / `samtools index`; this function does
    not delete the unsorted BAM.
    """
    def shell(command):
        # All external tools are run as shell strings with the same verbosity.
        common.syscall(command, verbose=verbose)

    bt2_prefix = out_prefix + '.map_index'
    bt2_files = []
    for part in ['1', '2', '3', '4', 'rev.1', 'rev.2']:
        bt2_files.append(bt2_prefix + '.' + part + '.bt2')

    bam_out = out_prefix + '.bam'
    if sort:
        bam_from_mapping = out_prefix + '.unsorted.bam'
    else:
        bam_from_mapping = bam_out

    build_index = ' '.join([bowtie2 + '-build', '-q', ref_fa, bt2_prefix])
    map_reads = ' '.join([
        bowtie2,
        '--threads', str(threads),
        '--' + bowtie2_preset,
        '-X', str(max_insert),
        '-x', bt2_prefix,
        '-1', reads_fwd,
        '-2', reads_rev,
        '|', samtools, 'view',
        '-bS -T', ref_fa,
        '- >', bam_from_mapping,
    ])

    shell(build_index)
    shell(map_reads)

    if sort:
        # No more than 4 sorting threads, sharing a 500M memory budget.
        n_sort_threads = min(4, threads)
        per_thread_mb = int(500 / n_sort_threads)
        shell(' '.join([
            samtools, 'sort',
            '-@' + str(n_sort_threads),
            '-m', str(per_thread_mb) + 'M',
            bam_from_mapping,
            out_prefix,
        ]))
        shell(' '.join([samtools, 'index', bam_out]))

    for bt2_file in bt2_files:
        os.unlink(bt2_file)
def _assemble_with_spades(self, unittest=False):
    """Run SPAdes inside self.assembly_dir and record whether it succeeded.

    On success a symlink named after self.assembly_contigs is created,
    pointing at SPAdes' 'scaffolds.fasta'.  On failure the error output is
    written to 'spades_errors' and 'assembly_fail' is added to
    self.status_flag.  self.assembled_ok is set in both cases.

    Args:
        unittest: when true SPAdes is not run; an empty scaffolds file is
            created instead and the assembly is treated as successful.

    The working directory is restored before returning, including when a
    step fails.
    """
    cmd = ' '.join([
        self.spades_exe,
        '-1', self.reads1,
        '-2', self.reads2,
        '-o', self.assembler_dir,
        '-k', str(self.assembly_kmer),
        '--threads', str(self.threads),
        '--untrusted-contigs', self.gene_fa,
    ])
    if self.spades_other is not None:
        cmd += ' ' + self.spades_other

    cwd = os.getcwd()
    os.chdir(self.assembly_dir)
    try:
        # Path of the SPAdes output relative to assembly_dir.
        spades_contigs = os.path.join(
            os.path.split(self.assembler_dir)[1], 'scaffolds.fasta')

        if unittest:
            os.mkdir(self.assembler_dir)
            open(spades_contigs, 'w').close()
            self.assembled_ok = True
        else:
            self.assembled_ok, err = common.syscall(cmd, verbose=self.verbose, allow_fail=True)

        if self.assembled_ok:
            os.symlink(spades_contigs, os.path.basename(self.assembly_contigs))
        else:
            # The with-block closes the file; no explicit close is needed.
            with open('spades_errors', 'w') as f:
                print(err, file=f)
            self.status_flag.add('assembly_fail')
    finally:
        os.chdir(cwd)
def _make_vcf_and_read_depths_files(self):
    """Produce the filtered variant VCF and tabix-indexed read depths file.

    Steps, each run as a shell command through common.syscall and logged to
    self.log_fh:
      1. `samtools mpileup` on self.bam into a temporary VCF.
      2. `bcftools call -m | bcftools query` into a temporary depths table,
         which is then compressed and indexed with pysam's tabix helpers.
      3. `bcftools call -m -v | bcftools filter` with a fixed expression,
         written to self.vcf_file.
    Both temporary files are deleted on the way.
    """
    raw_vcf = self.vcf_file + '.tmp'
    depths_tmp = self.read_depths_file + '.tmp'

    mpileup_args = [
        self.samtools_exe, 'mpileup',
        '-t INFO/AD',
        '-A',
        '-f', self.ref_fa,
        '-u',
        '-v',
        self.bam,
        '>',
        raw_vcf
    ]
    common.syscall(' '.join(mpileup_args), verbose=True, verbose_filehandle=self.log_fh)

    # NOTE(review): the query format ends in '%AD]' with an unmatched ']';
    # kept verbatim - confirm against the bcftools query documentation.
    depths_args = [
        self.bcftools_exe, 'call -m', raw_vcf, '|',
        self.bcftools_exe, 'query',
        r'''-f '%CHROM\t%POS\t%REF\t%ALT\t%DP\t%AD]\n' ''',
        '>',
        depths_tmp
    ]
    common.syscall(' '.join(depths_args), verbose=True, verbose_filehandle=self.log_fh)

    pysam.tabix_compress(depths_tmp, self.read_depths_file)
    pysam.tabix_index(self.read_depths_file, seq_col=0, start_col=1, end_col=1)
    os.unlink(depths_tmp)

    variants_args = [
        self.bcftools_exe, 'call -m -v', raw_vcf, '|',
        self.bcftools_exe, 'filter',
        '-i', '"SUM(AD)>=5 & MIN(AD)/DP>=0.1"',
        '-o', self.vcf_file
    ]
    common.syscall(' '.join(variants_args), verbose=True, verbose_filehandle=self.log_fh)

    os.unlink(raw_vcf)
def run_bowtie2(
      reads_fwd,
      reads_rev,
      ref_fa,
      out_prefix,
      threads=1,
      max_insert=1000,
      sort=False,
      bowtie2='bowtie2',
      bowtie2_preset='very-sensitive-local',
      bowtie2_version=None,
      verbose=False,
      verbose_filehandle=sys.stdout,
      remove_both_unmapped=False,
      clean_index=True,
    ):
    """Map paired reads to ref_fa with bowtie2 and write out_prefix.bam.

    If index files ref_fa.<ext> exist for every extension in
    bowtie2_index_extensions they are used directly.  Otherwise an index is
    built at out_prefix + '.map_index' and, when clean_index is true,
    deleted again at the end.

    bowtie2 writes SAM to out_prefix.unsorted.sam, which is converted to
    BAM with pysam and then removed.  With sort=True the BAM is sorted and
    indexed with pysam and the unsorted BAM is deleted.

    Args:
        bowtie2_version: version string of the bowtie2 executable.  For
            versions >= 2.3.1 '--score-min G,1,10' is added.  When None the
            version is unknown and the option is not added.
        remove_both_unmapped: drop pairs in which both mates are unmapped,
            by piping bowtie2 output through gawk.
    """
    ref_is_indexed = all(
        os.path.exists(ref_fa + '.' + ext) for ext in bowtie2_index_extensions
    )

    clean_files = []

    if ref_is_indexed:
        if verbose:
            print('Bowtie2 index files found (', ref_fa, '.*.bt2) so no need to index', sep='', file=verbose_filehandle)
        map_index = ref_fa
    else:
        map_index = out_prefix + '.map_index'
        bowtie2_index(ref_fa, map_index, bowtie2=bowtie2, verbose=verbose, verbose_filehandle=verbose_filehandle)
        if clean_index:
            clean_files = [map_index + '.' + x + '.bt2' for x in ['1', '2', '3', '4', 'rev.1', 'rev.2']]

    final_bam = out_prefix + '.bam'
    if sort:
        intermediate_bam = out_prefix + '.unsorted.bam'
    else:
        intermediate_bam = final_bam

    map_cmd = [
        bowtie2,
        '--threads', str(threads),
        '--reorder',
        '--' + bowtie2_preset,
        '-X', str(max_insert),
        '-x', map_index,
        '-1', reads_fwd,
        '-2', reads_rev,
    ]

    # LooseVersion(None) carries no parsed version, so comparing it raises
    # AttributeError; only compare when a version string was supplied.
    if bowtie2_version is not None and LooseVersion(bowtie2_version) >= LooseVersion('2.3.1'):
        map_cmd.append('--score-min G,1,10')

    # We use gawk instead of awk here as we need bitwise comparisons
    # and these are not available via awk on Mac OSX.
    if remove_both_unmapped:
        map_cmd.append(r''' | gawk ' !(and($2,4)) || !(and($2,8)) ' ''')

    tmp_sam_file = out_prefix + '.unsorted.sam'
    map_cmd.append(' > ' + tmp_sam_file)
    map_cmd = ' '.join(map_cmd)

    common.syscall(map_cmd, verbose=verbose, verbose_filehandle=verbose_filehandle)

    if verbose:
        print('Converting', tmp_sam_file, '->', intermediate_bam, file=verbose_filehandle)
    infile = pysam.AlignmentFile(tmp_sam_file, "r")
    try:
        outfile = pysam.AlignmentFile(intermediate_bam, "wb", template=infile)
        try:
            for record in infile:
                outfile.write(record)
        finally:
            outfile.close()
    finally:
        infile.close()
    os.unlink(tmp_sam_file)

    if sort:
        if verbose:
            print('Sorting', intermediate_bam, '->', final_bam, file=verbose_filehandle)
        pysam.sort('-o', final_bam, '-O', 'BAM', intermediate_bam)
        if verbose:
            print('Indexing', final_bam, file=verbose_filehandle)
        pysam.index(final_bam)
        clean_files.append(intermediate_bam)

    for fname in clean_files:
        os.unlink(fname)
def _sort_file(infile, outfile, log_fh=None):
    """Sort infile into outfile with the external `sort` command.

    The sort keys are column 1 and then column 2 numerically.  The command
    is echoed to log_fh when one is given.
    """
    sort_args = ['sort', '-k1,1', '-k', '2,2n', infile, '>', outfile]
    be_verbose = log_fh is not None
    common.syscall(' '.join(sort_args), verbose=be_verbose, verbose_filehandle=log_fh)
def _assemble_with_spades(self):
    """Assemble the reads with SPAdes and write renamed contigs.

    SPAdes is run from self.working_dir with options chosen by
    self.spades_mode ('rna', 'sc' or 'wgs'); self.spades_options, when set,
    replaces the mode's default k-mer options.  If the run succeeds, its log
    and warnings files are copied into self.log_fh and the output contigs
    are rewritten to self.all_assembly_contigs_fa with fermilite-style
    names.  self.assembled_ok is set to True only when at least one contig
    was written.  The assembler directory is deleted when self.clean is set.
    The working directory is always restored.

    Raises:
        Error: if self.working_dir cannot be entered or no SPAdes
            executable is configured.
        ValueError: if self.spades_mode is not a known mode.
    """
    cwd = os.getcwd()
    self.assembled_ok = False
    try:
        try:
            os.chdir(self.working_dir)
        except OSError as err:
            raise Error('Error chdir ' + self.working_dir) from err

        spades_exe = self.extern_progs.exe('spades')
        if not spades_exe:
            raise Error("Spades executable has not been found")

        spades_options = self.spades_options
        if spades_options is not None:
            spades_options = shlex.split(self.spades_options)

        if self.spades_mode == "rna":
            spades_options = ["--rna"] + (["-k", "127"] if spades_options is None else spades_options)
            spades_out_seq_base = "transcripts.fasta"
        elif self.spades_mode == "sc":
            spades_options = ["--sc"] + (["-k", "33,55,77,99,127", "--careful"] if spades_options is None else spades_options)
            spades_out_seq_base = "contigs.fasta"
        elif self.spades_mode == "wgs":
            spades_options = ["-k", "33,55,77,99,127", "--careful"] if spades_options is None else spades_options
            spades_out_seq_base = "contigs.fasta"
        else:
            raise ValueError("Unknown spades_mode value: {}".format(self.spades_mode))

        asm_cmd = [
            spades_exe,
            "-t", str(self.threads),
            "--pe1-1", self.reads1,
            "--pe1-2", self.reads2,
            "-o", self.assembler_dir,
        ] + spades_options
        asm_ok, err = common.syscall(asm_cmd, verbose=True, verbose_filehandle=self.log_fh, shell=False, allow_fail=True)

        if not asm_ok:
            print('Assembly finished with errors. These are the errors:', file=self.log_fh)
            print(err, file=self.log_fh)
            print('\nEnd of spades errors\n', file=self.log_fh)
        else:
            spades_log = os.path.join(self.assembler_dir, 'spades.log')
            if os.path.exists(spades_log):
                self._check_spades_log_file(spades_log)
                with open(spades_log) as f:
                    print('\n______________ SPAdes log ___________________\n', file=self.log_fh)
                    for line in f:
                        print(line.rstrip(), file=self.log_fh)
                    print('\n______________ End of SPAdes log _________________\n', file=self.log_fh)

            spades_warnings = os.path.join(self.assembler_dir, 'warnings.log')
            if os.path.exists(spades_warnings):
                with open(spades_warnings) as f:
                    print('\n______________ SPAdes warnings ___________________\n', file=self.log_fh)
                    for line in f:
                        print(line.rstrip(), file=self.log_fh)
                    print('\n______________ End of SPAdes warnings _________________\n', file=self.log_fh)

            ## fermilight module generates contig names that look like `cluster_1.l15.c17.ctg.1` where 'cluster_1'==self.contig_name_prefix
            ## the whole structure of the contig name is expected in several places downstream where it is parsed into individual components.
            ## For example, it is parsed into l and c parts in ref_seq_chooser (although the parts are not actually used).
            ## This is the code from fermilight module that generates the contig ID string:
            ## ofs << ">" << namePrefix << ".l" << overlap << ".c" << minCount << ".ctg." << i + 1 << '\n'
            ##
            ## We generate the same contig name structure here using dummy values for overlap and minCount, in order
            ## to avoid disrupting the downstream code.
            ## Note that the fermilight module generates multiple versions of the assembly on a grid of l and c values,
            ## and ref_seq_chooser then picks a single "best" (l,c) version based on coverage/identity of the nucmer
            ## alignment to the reference. Spades generates a single version of the assembly, so ref_seq_chooser
            ## can only pick that one version.
            spades_out_seq = os.path.join(self.assembler_dir, spades_out_seq_base)
            ## No need really to use general-purpose pyfastaq.sequences.file_reader here and pay performance cost for
            ## its multi-format line tests since we are only replacing the IDs in a pre-defined format
            if os.path.exists(spades_out_seq):
                with open(spades_out_seq, "r") as inp, open(self.all_assembly_contigs_fa, "w") as out:
                    pref = self.contig_name_prefix
                    i_cont = 0
                    for line in inp:
                        if line.startswith(">"):
                            i_cont += 1
                            line = ">{}.l15.c17.ctg.{}\n".format(pref, i_cont)
                        out.write(line)
                    if i_cont > 0:
                        self.assembled_ok = True

        # NOTE(review): cleanup is assumed to apply whether or not the
        # assembly succeeded - confirm the intended nesting.
        if self.clean:
            print('Deleting assembly directory', self.assembler_dir, file=self.log_fh)
            shutil.rmtree(self.assembler_dir, ignore_errors=True)
    finally:
        os.chdir(cwd)
def run_smalt(reads_fwd,
              reads_rev,
              ref_fa,
              out_prefix,
              index_k=9,
              index_s=2,
              threads=1,
              max_insert=1000,
              minid=0.9,
              sort=False,
              extra_smalt_map_ops='-x',
              samtools='samtools',
              smalt='smalt',
              verbose=False):
    """Index ref_fa with smalt, map the reads and write out_prefix.bam.

    reads_rev may be None, in which case only reads_fwd is mapped and no
    insert size option is passed.  The smalt index is built at
    out_prefix + '.map_index' and its .smi/.sma files are deleted at the
    end.  smalt output is piped through `samtools view` to get BAM.  With
    sort=True the mapping goes to out_prefix.unsorted.bam and is then passed
    to `samtools sort` / `samtools index`; the unsorted BAM is not removed
    by this function.
    """
    extra_ops = '' if extra_smalt_map_ops is None else extra_smalt_map_ops

    index_prefix = out_prefix + '.map_index'
    index_files = [index_prefix + '.' + ext for ext in ('smi', 'sma')]

    sorted_bam = out_prefix + '.bam'
    mapped_bam = out_prefix + '.unsorted.bam' if sort else sorted_bam

    index_cmd = ' '.join([
        smalt, 'index',
        '-k', str(index_k),
        '-s', str(index_s),
        index_prefix,
        ref_fa
    ])

    # depending on OS, -n can break smalt, so only use -n if it's > 1.
    thread_opts = ('-n ' + str(threads) + ' -O ') if threads > 1 else ''

    read_opts = ['-y', str(minid), index_prefix, reads_fwd]
    if reads_rev is not None:
        read_opts = ['-i', str(max_insert)] + read_opts + [reads_rev]

    # Pieces are concatenated without separators so the spacing of the
    # command string is fully determined by the literals below.
    map_cmd = ''.join([
        smalt, ' map ', extra_ops, ' ',
        thread_opts,
        ' '.join(read_opts),
        ' | ', samtools, ' view',
        ' -bS -T ', ref_fa, ' - > ', mapped_bam,
    ])

    common.syscall(index_cmd, verbose=verbose)
    common.syscall(map_cmd, verbose=verbose)

    if sort:
        # Sorting uses at most 4 threads and splits 500M between them.
        sort_threads = min(4, threads)
        mem_per_thread = int(500 / sort_threads)
        sort_cmd = ' '.join([
            samtools, 'sort',
            '-@' + str(sort_threads),
            '-m', str(mem_per_thread) + 'M',
            mapped_bam,
            out_prefix,
        ])
        common.syscall(sort_cmd, verbose=verbose)
        common.syscall(' '.join([samtools, 'index', sorted_bam]), verbose=verbose)

    for path in index_files:
        os.unlink(path)
def _get_from_card(self, outprefix):
    """Download a CARD release and convert it to ARIBA input files.

    Works inside outprefix + '.download'.  The release is self.version when
    set, otherwise the highest version key returned by
    self._get_card_versions (self.version is then updated).  The tarball is
    fetched with wget, './card.json' is extracted from it, and for every
    record the DNA sequences are written to outprefix.fa with metadata in
    outprefix.tsv.  Sequences that are skipped are reported in
    outprefix.log.  The working directory is always restored; the download
    directory is removed on success unless self.debug is set.

    Raises:
        Error: if the download directory cannot be created or entered, the
            requested version is not available, or the download is not a
            tar archive.
    """
    outprefix = os.path.abspath(outprefix)
    tmpdir = outprefix + '.download'
    current_dir = os.getcwd()

    try:
        os.mkdir(tmpdir)
        os.chdir(tmpdir)
    except OSError as err:
        raise Error('Error mkdir/chdir ' + tmpdir) from err

    try:
        versions = self._get_card_versions('download.html')
        if self.version is not None:
            key = tuple([int(x) for x in self.version.split('.')])
            if key not in versions:
                raise Error('Error! Did not find requested version ' + self.version)
        else:
            key = sorted(versions)[-1]
            self.version = '.'.join([str(x) for x in key])

        print('Getting version', self.version)
        card_tarball_url = versions[key]
        card_tarball = 'card.tar.bz2'
        print('Working in temporary directory', tmpdir)
        print('Downloading data from card:', card_tarball_url, flush=True)
        common.syscall('wget -O ' + card_tarball + ' ' + card_tarball_url, verbose=True)
        print('...finished downloading', flush=True)
        if not tarfile.is_tarfile(card_tarball):
            raise Error(
                'File ' + card_tarball + ' downloaded from ' + card_tarball_url +
                ' does not look like a valid tar archive. Cannot continue')

        json_file = './card.json'
        with tarfile.open(card_tarball, 'r') as tfile:
            tfile.extract(json_file)

        print('Extracted json data file ', json_file, '. Reading its contents...', sep='')

        final_fasta = outprefix + '.fa'
        final_tsv = outprefix + '.tsv'
        log_file = outprefix + '.log'
        f_out_fa = pyfastaq.utils.open_file_write(final_fasta)
        f_out_tsv = pyfastaq.utils.open_file_write(final_tsv)
        f_out_log = pyfastaq.utils.open_file_write(log_file)

        try:
            with open(json_file) as f:
                json_data = json.load(f)

            # Keys starting with '_' are skipped; the rest are integer ids.
            json_data = {
                int(x): json_data[x]
                for x in json_data if not x.startswith('_')
            }
            print('Found', len(json_data), 'records in the json file. Analysing...', flush=True)

            for gene_key, gene_dict in sorted(json_data.items()):
                crecord = card_record.CardRecord(gene_dict)
                data = crecord.get_data()
                # NOTE(review): after encode() this is bytes, so the
                # `!= ''` tests below are always true in Python 3 and the
                # value prints as b'...' - confirm this is intended.
                data['ARO_description'] = data['ARO_description'].encode('utf-8')
                fasta_name_prefix = '.'.join([
                    card_record.CardRecord._ARO_name_to_fasta_name(data['ARO_name']),
                    data['ARO_accession'],
                ])

                for card_key, gi, genbank_id, start, end, dna_seq, protein_seq in data['dna_seqs_and_ids']:
                    if dna_seq == '':
                        print('Empty dna sequence', gene_key, data['ARO_id'], data['ARO_accession'], sep='\t', file=f_out_log)
                        continue

                    fasta_id = '.'.join([
                        fasta_name_prefix,
                        genbank_id,
                        start + '-' + end,
                        card_key
                    ])
                    fasta = pyfastaq.sequences.Fasta(fasta_id, dna_seq)

                    if gi != 'NA':
                        gene_tuple = fasta.make_into_gene()
                        if gene_tuple is None:
                            print('Could not make gene from sequence', fasta.id, sep='\t', file=f_out_log)
                            continue
                        else:
                            translated = gene_tuple[0].translate()
                            # A recognised start codon is reported as 'M'.
                            if gene_tuple[0][:3] in pyfastaq.genetic_codes.starts[self.genetic_code]:
                                translated.seq = 'M' + translated.seq[1:]

                            # The last translated character is dropped before comparing.
                            if translated.seq[:-1] != protein_seq:
                                print('Translation of inferred gene dna sequence does not match protein sequence', fasta.id, sep='\t', file=f_out_log)
                                continue

                    print(fasta, file=f_out_fa)

                    if gi == 'NA':
                        gene_or_not = '0'
                        variant_only = '0'
                    elif len(data['snps']) == 0:
                        gene_or_not = '1'
                        variant_only = '0'
                    else:
                        gene_or_not = '1'
                        variant_only = '1'

                    print(fasta.id, gene_or_not, variant_only, '.', '.', data['ARO_name'], sep='\t', file=f_out_tsv)

                    if len(data['snps']) == 0 and data['ARO_description'] != '':
                        print(fasta.id, gene_or_not, variant_only, '.', '.', data['ARO_description'], sep='\t', file=f_out_tsv)
                    else:
                        for snp in data['snps']:
                            if data['ARO_description'] != '':
                                print(fasta.id, gene_or_not, variant_only, snp, '.', data['ARO_description'], sep='\t', file=f_out_tsv)
        finally:
            pyfastaq.utils.close(f_out_fa)
            pyfastaq.utils.close(f_out_tsv)
            pyfastaq.utils.close(f_out_log)
    finally:
        # Always return to the caller's directory; the download directory
        # is kept on failure.
        os.chdir(current_dir)

    if not self.debug:
        common.rmtree(tmpdir)

    print('Extracted data and written ARIBA input files\n')
    print('Finished. Final files are:', final_fasta, final_tsv, sep='\n\t', end='\n\n')
    print('You can use them with ARIBA like this:')
    print('ariba prepareref -f', final_fasta, '-m', final_tsv, 'output_directory\n')
    print('If you use this downloaded data, please cite:')
    print(
        '"CARD 2020: antibiotic resistome surveillance with the comprehensive antibiotic resistance database", Alcock et al 2020, PMID: 31665441'
    )
    print('and in your methods say that version', self.version, 'of the database was used')
def run_smalt(
    reads_fwd,
    reads_rev,
    ref_fa,
    out_prefix,
    index_k=9,
    index_s=2,
    threads=1,
    max_insert=1000,
    minid=0.9,
    sort=False,
    extra_smalt_map_ops='-x',
    samtools='samtools',
    smalt='smalt',
    verbose=False,
):
    """Map reads to ref_fa with smalt and produce out_prefix.bam.

    A smalt index is created at out_prefix + '.map_index' (files .smi and
    .sma) and removed before returning.  When reads_rev is None the reads
    are mapped unpaired, without the -i insert size option.  The mapper
    output is converted to BAM with `samtools view`.  If sort is true the
    BAM is first written to out_prefix.unsorted.bam, then `samtools sort`
    and `samtools index` are run; the unsorted BAM is left on disk.
    """
    def shell(command):
        # All external tools are run as shell strings with the same verbosity.
        common.syscall(command, verbose=verbose)

    if extra_smalt_map_ops is None:
        extra_smalt_map_ops = ''

    smalt_index = out_prefix + '.map_index'
    to_delete = [smalt_index + '.smi', smalt_index + '.sma']

    bam_out = out_prefix + '.bam'
    if sort:
        bam_from_mapping = out_prefix + '.unsorted.bam'
    else:
        bam_from_mapping = bam_out

    build_index = ' '.join([
        smalt, 'index',
        '-k', str(index_k),
        '-s', str(index_s),
        smalt_index,
        ref_fa,
    ])

    cmd_parts = [smalt + ' map ' + extra_smalt_map_ops + ' ']

    # depending on OS, -n can break smalt, so only use -n if it's > 1.
    if threads > 1:
        cmd_parts.append('-n ' + str(threads) + ' -O ')

    if reads_rev is None:
        mapping_args = ['-y', str(minid), smalt_index, reads_fwd]
    else:
        mapping_args = ['-i', str(max_insert), '-y', str(minid), smalt_index, reads_fwd, reads_rev]
    cmd_parts.append(' '.join(mapping_args))

    cmd_parts.append(' | ' + samtools + ' view')
    cmd_parts.append(' -bS -T ' + ref_fa + ' - > ' + bam_from_mapping)
    map_reads = ''.join(cmd_parts)

    shell(build_index)
    shell(map_reads)

    if sort:
        # No more than 4 sorting threads, sharing a 500M memory budget.
        n_sort_threads = min(4, threads)
        per_thread_mb = int(500 / n_sort_threads)
        shell(' '.join([
            samtools, 'sort',
            '-@' + str(n_sort_threads),
            '-m', str(per_thread_mb) + 'M',
            bam_from_mapping,
            out_prefix,
        ]))
        shell(' '.join([samtools, 'index', bam_out]))

    for stale in to_delete:
        os.unlink(stale)