def run(self, filtered_reads_file, debug=False):
    """
    Takes in a (filtered) reads file, returns a dict with the keys:
    command - the command that was run
    corrected_reads - the corrected fastq file
    """
    # Invocation shape: bfc <flag params> filtered_fastq_file > output_file
    bfc_dir = os.path.join(self.output_dir, "bfc")
    mkdir(bfc_dir)
    corrected_path = os.path.join(bfc_dir, "bfc_output.fastq")
    args = ["-1", "-k", "21", "-t", "10"]
    if not debug:
        # the "-s 10g" sizing flag is only applied on full (non-debug) runs
        args += ["-s", "10g"]
    args += [filtered_reads_file, ">", corrected_path]
    exit_code, command = super(BFCRunner, self).run(*args)
    if exit_code != 0:
        raise RuntimeError("An error occurred while running BFC!")
    return {
        "command": command,
        "corrected_reads": corrected_path,
        "version_string": self.version_string()
    }
def run(self, reads_file, contigs_file):
    """
    Runs BBMap to map the given reads file (FASTQ) to the contigs file (FASTA).
    Returns the paths to the SAM file, coverage stats, and overall BBMap stats as
    map_file, coverage_file, and stats_file, respectively.
    """
    out_dir = os.path.join(self.output_dir, "readMappingPairs")
    mkdir(out_dir)
    sam_path = os.path.join(out_dir, "pairedMapped.sam.gz")
    covstats_path = os.path.join(out_dir, "covstats.txt")
    stats_path = os.path.join(out_dir, "bbmap_stats.txt")
    args = [
        "-Xmx100g",
        "nodisk=true",
        "interleaved=true",
        "ambiguous=random",
        "in={}".format(reads_file),
        "ref={}".format(contigs_file),
        "out={}".format(sam_path),
        "covstats={}".format(covstats_path),
        # BBMap writes its run summary to stderr; capture it in a file
        "2>", stats_path
    ]
    exit_code, command = super(BBMapRunner, self).run(*args)
    if exit_code != 0:
        raise RuntimeError("An error occurred while running BBMap!")
    return {
        "map_file": sam_path,
        "coverage_file": covstats_path,
        "stats_file": stats_path,
        "command": command,
        "version_string": self.version_string()
    }
def run_skip(self, reads_file):
    """
    Doesn't run RQCFilter, but a dummy skip version. It returns the same result
    structure, so it doesn't derail the other pipeline steps. However, the
    "filtered_fastq_file" is the unchanged fastq file, other than gzipping it.
    run_log is just an empty (but existing!) file.
    """
    print("NOT running RQCFilter, just putting together some results.")
    # build a dummy output dir, timestamped to keep concurrent runs apart
    dummy_dir = os.path.join(
        self.scratch_dir,
        "dummy_rqcfilter_output_{}".format(int(time() * 1000)))
    mkdir(dummy_dir)
    # an empty log file stands in for the real RQCFilter run log
    dummy_log = os.path.join(dummy_dir, "dummy_rqcfilter_log.txt")
    open(dummy_log, 'w').close()
    # gzip the reads and relocate them into the dummy output dir (probably
    # don't need the move, but it keeps the layout consistent)
    dfu = DataFileUtil(self.callback_url)
    packed = dfu.pack_file({"file_path": reads_file, "pack": "gzip"})
    gzipped_reads = packed["file_path"]
    unfiltered_reads = os.path.join(dummy_dir, os.path.basename(gzipped_reads))
    os.rename(gzipped_reads, unfiltered_reads)
    return {
        "output_directory": dummy_dir,
        "filtered_fastq_file": unfiltered_reads,
        "run_log": dummy_log,
        "command": "BBTools.run_RQCFilter_local -- skipped. No command run.",
        "version_string": "KBase BBTools module"
    }
def test_mkdir_fail(self):
    # an empty path must raise a ValueError with a helpful message
    with self.assertRaises(ValueError) as cm:
        mkdir(None)
    self.assertIn("A path is required", str(cm.exception))
    # a path that already exists should be tolerated silently (no crash)
    self.assertTrue(os.path.exists("data"))
    mkdir("data")
def __init__(self, callback_url, scratch_dir):
    """
    Initialize a few things. Starting points, paths, etc.
    """
    self.callback_url = callback_url
    self.scratch_dir = scratch_dir
    # millisecond timestamp keeps each run's output directory unique
    self.timestamp = int(time.time() * 1000)
    self.output_dir = os.path.join(
        scratch_dir, "jgi_mga_output_{}".format(self.timestamp))
    mkdir(self.output_dir)
    self.file_util = FileUtil(callback_url)
def run(self, scaffold_file):
    """
    Runs the stats tool twice over the scaffold file: once for a
    tab-delimited table (format=6) and once for the standard text report.
    Both passes share one stderr file; the second appends to it.
    """
    out_dir = os.path.join(self.output_dir, "assembly_stats")
    mkdir(out_dir)
    tsv_path = os.path.join(out_dir, "assembly.scaffolds.fasta.stats.tsv")
    txt_path = os.path.join(out_dir, "assembly.scaffolds.fasta.stats.txt")
    err_path = os.path.join(out_dir, "stderr.out")
    # first pass: tab-delimited output
    first_args = [
        "format=6",
        "in={}".format(scaffold_file),
        "1>", tsv_path,
        "2>", err_path
    ]
    exit_code, first_command = super(StatsRunner, self).run(*first_args)
    if exit_code != 0:
        raise RuntimeError(
            "Unable to run first pass at stats to generate tab-delimited files!"
        )
    # second pass: plain-text output, appending stderr to the same file
    second_args = [
        "in={}".format(scaffold_file),
        "1>", txt_path,
        "2>>", err_path
    ]
    exit_code, second_command = super(StatsRunner, self).run(*second_args)
    if exit_code != 0:
        raise RuntimeError(
            "Unable to run second pass at stats to generate standard text files!"
        )
    return {
        "stats_tsv": tsv_path,
        "stats_txt": txt_path,
        "stats_err": err_path,
        "version_string": self.version_string(),
        "command": first_command + " && " + second_command
    }
def test_mkdir_ok(self):
    # mkdir should create intermediate directories for a deep, new path
    deep_path = os.path.join("a_dir", "another_dir", "a_deep_dir")
    self.assertFalse(os.path.exists(deep_path))
    mkdir(deep_path)
    self.assertTrue(os.path.exists(deep_path))
def run(self, input_file, output_file_name):
    """
    Runs readlength.sh on input_file to generate a file named output_file under
    the output_dir. It then skims that file for several values and returns them
    as a dictionary. The keys to this return dict are:
    count - the number of reads
    bases - the total number of bases
    max - the length of the longest read
    min - the length of the shortest read
    avg - the average read length
    median - the median read length
    mode - the mode of the mean lengths
    std_dev - the standard deviation of read lengths
    output_file - the output file from readlength, containing a histogram of reads info
    This also calculates the histogram, but it's left out for now. (Unless it's
    needed later)
    If the output file exists, it will be overwritten.

    :param input_file: path to the reads file to measure; must exist
    :param output_file_name: base name for the readlength output file
    :raises ValueError: if input_file doesn't exist
    :raises RuntimeError: if readlength fails or produces no output file
    """
    if not os.path.exists(input_file):
        raise ValueError(
            "The input file '{}' can't be found!".format(input_file))
    mkdir(os.path.join(self.output_dir, "readlength"))
    output_file_path = os.path.join(self.output_dir, "readlength",
                                    output_file_name)
    # "1>|" forces the shell to clobber any pre-existing output file
    readlength_params = [
        "in={}".format(input_file),
        "1>|", output_file_path
    ]
    (exit_code, command) = super(ReadLengthRunner, self).run(*readlength_params)
    if exit_code != 0:
        raise RuntimeError("An error occurred while running readlength!")
    if not os.path.exists(output_file_path):
        raise RuntimeError(
            "The output file '{}' appears not to have been made!".format(
                output_file_path))
    # The output file has some standard header lines, all starting with '#':
    # #Reads: 358
    # #Bases: 35279
    # #Max: 100
    # #Min: 89
    # #Avg: 98.5
    # #Median: 100
    # #Mode: 100
    # #Std_Dev: 4.9
    # #Read Length Histogram: (a table follows that we're not using)
    # Each recognized label is parsed with its own type - #Reads is an int,
    # #Avg is a float, etc. - and stored under the mapped key.
    line_mapping = {
        "#Reads:": ("count", int),
        "#Bases:": ("bases", int),
        "#Max:": ("max", int),
        "#Min:": ("min", int),
        "#Avg:": ("avg", float),
        "#Median:": ("median", int),
        "#Mode:": ("mode", int),
        "#Std_Dev:": ("std_dev", float),
    }
    ret_value = dict()
    with open(output_file_path, "r") as read_len_file:
        for line in read_len_file:
            chopped = line.split()
            # guard against blank lines - "".split() returns [] and the
            # original chopped[0] lookup would raise IndexError
            if not chopped:
                continue
            if chopped[0] in line_mapping:
                key, map_fn = line_mapping[chopped[0]]
                ret_value[key] = map_fn(chopped[1])
    ret_value.update({
        "output_file": output_file_path,
        "command": command,
        "version_string": self.version_string()
    })
    return ret_value
def run(self, input_file, reads_info, options):
    """
    Runs SPAdes, returns the generated output directory name. It's full of
    standard files.
    This will use (by default) k=33,55,77,99,127. However, any k greater than
    the average read length is omitted. (NOTE: the original docstring said
    "max read length", but the code has always filtered on the avg key.)
    For example, if your input reads average 100 bases, this'll omit k=127.
    :param input_file: string or path to the input paired-end reads file
    :param reads_info: dict - info about the reads from readlength.py. This
        uses the output_file and avg keys.
    :param options: dict - "max_memory" - max allowed memory in GB (default 2000)
    :raises ValueError: if the average read length is below the smallest
        k-mer size, which would otherwise produce an empty "-k" argument
        and a cryptic SPAdes failure
    :raises RuntimeError: if SPAdes exits with a non-zero exit code
    """
    spades_output_dir = os.path.join(self.output_dir, "spades", "spades3")
    mkdir(spades_output_dir)
    spades_kmers = [33, 55, 77, 99, 127]
    # drop any k-mer size larger than the average read length
    used_kmers = [k for k in spades_kmers if k <= reads_info["avg"]]
    if not used_kmers:
        # fail fast with a clear message rather than passing "-k ''" to SPAdes
        raise ValueError(
            "The average read length ({}) is below the smallest usable "
            "k-mer size ({}) - unable to run SPAdes!".format(
                reads_info["avg"], spades_kmers[0]))
    max_memory = str(options.get("max_memory", 2000))
    spades_params = [
        "--only-assembler",
        "-k", ",".join(map(str, used_kmers)),
        "--meta",
        "-t", "32",
        "-m", max_memory,
        "-o", spades_output_dir,
        "--12", input_file
    ]
    print("SPAdes input reads info:\n{}\n".format("=" * 24))
    file_to_log(reads_info["output_file"])
    print("{}\nEnd SPAdes input reads info\n".format("=" * 27))
    (exit_code, command) = super(SpadesRunner, self).run(*spades_params)
    # dump the SPAdes logs to stdout BEFORE checking the exit code, so the
    # transcripts are visible even when the run failed
    print("Done running SPAdes")
    print("See log transcripts below for details")
    log_files = ["warnings.log", "params.txt", "spades.log"]
    for f in log_files:
        log_file = os.path.join(spades_output_dir, f)
        if os.path.exists(log_file):
            print("SPAdes log file {}:\n{}\n".format(f, "=" * (17 + len(f))))
            file_to_log(log_file)
            print("{}\nEnd SPAdes log file {}\n".format(
                "=" * (20 + len(f)), f))
    if exit_code != 0:
        raise RuntimeError(
            "Errors occurred while running spades. Check the logs for details. Unable to continue pipeline."
        )
    return_dict = {
        "command": command,
        "version_string": self.version_string(),
        "output_dir": spades_output_dir,
        "run_log": os.path.join(spades_output_dir, "spades.log"),
        "params_log": os.path.join(spades_output_dir, "params.txt")
    }
    # warnings.log / scaffolds / contigs are only present on some runs;
    # include them in the result only when they exist
    warnings_log = os.path.join(spades_output_dir, "warnings.log")
    if os.path.exists(warnings_log):
        return_dict["warnings_log"] = warnings_log
    scaffolds = os.path.join(spades_output_dir, "scaffolds.fasta")
    if os.path.exists(scaffolds):
        return_dict["scaffolds_file"] = scaffolds
    contigs = os.path.join(spades_output_dir, "contigs.fasta")
    if os.path.exists(contigs):
        return_dict["contigs_file"] = contigs
    return return_dict