def make_glennfile_from_pileup(job, gam_key, condition, options):
    """
    Toil job which, assuming that the GAM has already been turned into a
    pileup in the cache for the given experimental condition, produces the
    augmented graph and associated glennfile in the cache for the current
    condition.

    Returns nothing.

    Supports a pre-execution mode: if job is None, returns True if we really
    need to run the job, and False otherwise.
    """

    # Make IOStores
    graph_store = IOStore.get(options.in_graphs)
    cache_store = IOStore.get(options.cache)

    # Determine output filenames
    out_glennfile_key = glennfile_key(gam_key, condition)
    out_augmented_graph_key = augmented_graph_key(gam_key, condition)

    if (cache_store.exists(out_glennfile_key) and
        cache_store.exists(out_augmented_graph_key)):
        # We already made these files
        return False
    elif job is None:
        # We aren't really executing yet, but we need to.
        return True

    # Get the non-augmented graph
    input_graph = graph_store.get_input_file(job, graph_key(gam_key))

    # Get the pileup from the cache
    input_pileup = cache_store.get_input_file(job,
        pileup_key(gam_key, condition))

    # Plan where the output glennfile goes
    out_glennfile = "{}/sample.glenn".format(job.fileStore.getLocalTempDir())

    # And where the output augmented graph goes
    out_augmented_graph = "{}/sample.vg".format(
        job.fileStore.getLocalTempDir())

    # Do the actual vg call-ing
    pipeline = []
    pipeline.append("vg call {} {} {} -l -c {} -t {} > {}".format(
        input_graph, input_pileup, condition.get_call_options(),
        out_glennfile, job.cores, out_augmented_graph))
    run(pipeline, fail_hard=True)

    # Save the glennfile and augmented graph back to the cache
    cache_store.write_output_file(out_glennfile, out_glennfile_key)
    cache_store.write_output_file(out_augmented_graph,
        out_augmented_graph_key)

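# The *_key helpers used above and below (graph_key, pileup_key,
# glennfile_key, augmented_graph_key, and the vcf/vcfeval variants) are
# assumed to be defined elsewhere in this script. As a rough sketch of the
# idea, not the actual implementation, a per-condition cache key can combine
# the GAM key with a digest of the condition's options, so that every
# experimental condition gets its own cache namespace. The helper name and
# hashing scheme here are hypothetical.
def _example_condition_cache_key(gam_key, condition_string, basename):
    """
    Hypothetical sketch: map a GAM key like "region/graph/sample.gam" and a
    stable string rendering of a condition to a cache key like
    "region/graph/sample.gam/<digest>/<basename>".
    """
    import hashlib
    # Digest the condition's option settings so distinct conditions never
    # collide in the cache.
    digest = hashlib.sha1(condition_string.encode("utf-8")).hexdigest()[:12]
    return "{}/{}/{}".format(gam_key, digest, basename)
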
def make_pileup(job, gam_key, condition, options):
    """
    Toil job to make a pileup from the given GAM, in the given experimental
    condition. Loads the GAM from the input GAM IOStore, and saves the pileup
    to the right place in the cache IOStore for the given sample.

    Returns nothing.

    Supports a pre-execution mode: if job is None, returns True if we really
    need to run the job, and False otherwise.
    """

    # Make IOStores
    gam_store = IOStore.get(options.in_gams)
    graph_store = IOStore.get(options.in_graphs)
    cache_store = IOStore.get(options.cache)

    # Determine output key
    out_pileup_key = pileup_key(gam_key, condition)

    if cache_store.exists(out_pileup_key):
        # We already made this file. No need to run if we aren't already.
        return False
    elif job is None:
        # We aren't really executing yet, but we need to.
        return True

    # Download the GAM
    input_gam = gam_store.get_input_file(job, gam_key)

    # And the graph it was aligned to
    input_graph = graph_store.get_input_file(job, graph_key(gam_key))

    # Plan where the output pileup goes
    out_pileup_path = "{}/pileup.vgpu".format(job.fileStore.getLocalTempDir())

    # Run the filter and pileup steps, and die if they fail
    pipeline = []
    pipeline.append("vg filter {} {}".format(input_gam,
        condition.get_read_filter_options()))
    pipeline.append("vg pileup {} - {} -t {} > {}".format(input_graph,
        condition.get_pileup_options(), job.cores, out_pileup_path))
    run(pipeline, fail_hard=True)

    # Upload the pileup to the cache for the current experimental conditions
    cache_store.write_output_file(out_pileup_path, out_pileup_key)

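# run() is assumed to come from elsewhere in this codebase: it takes a list
# of shell command strings, joins them into a single pipeline with "|" (note
# how "vg pileup" above reads the filtered GAM from stdin via "-"), and
# raises when fail_hard is set and the pipeline fails. A minimal sketch of
# that contract, not the real helper:
def _example_run(pipeline, fail_hard=False):
    """
    Hypothetical stand-in for run(): execute a list of commands as one shell
    pipeline, e.g. ["vg filter ...", "vg pileup ... > out"].
    """
    import subprocess
    command = " | ".join(pipeline)
    # Use bash with pipefail so a failure anywhere in the pipeline, not just
    # in its last stage, produces a nonzero status.
    status = subprocess.call(["/bin/bash", "-c",
        "set -o pipefail; " + command])
    if status != 0 and fail_hard:
        raise RuntimeError("Pipeline failed ({}): {}".format(status, command))
    return status
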
def get_max_f_score(job, gam_key, condition, options):
    """
    Given the GAM file key for a sample that has already had vcfeval run
    under the given conditions, parse the vcfeval ROC and return the biggest
    F score.
    """

    # Make the IOStore
    cache_store = IOStore.get(options.cache)

    # Find the ROC curve
    roc_key = vcfeval_roc_key(gam_key, condition)

    # Get the file
    roc_compressed = cache_store.get_input_file(job, roc_key)

    # Read it
    reader = tsv.TsvReader(gzip.GzipFile(roc_compressed))

    # What's the max F score we found?
    max_f_score = None

    for parts in reader:
        # Parse each line's F score
        f_score = float(parts[6])
        if max_f_score is None or f_score > max_f_score:
            # And keep the max
            max_f_score = f_score

    # Return the max F score.
    return max_f_score

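# get_max_f_score assumes the column layout of rtg vcfeval's
# weighted_roc.tsv.gz, with the F-measure in the seventh column (index 6),
# and assumes tsv.TsvReader skips the "#" header lines. A tiny rehearsal of
# the same scan on made-up (but internally consistent) rows, under a
# hypothetical layout of score, TP, FP, FN, precision, sensitivity,
# f_measure:
def _example_max_f_from_rows():
    rows = [
        ["10.0", "900", "5", "100", "0.9945", "0.9000", "0.9449"],
        ["5.0", "950", "9", "50", "0.9906", "0.9500", "0.9699"],
    ]
    best = None
    for parts in rows:
        f_score = float(parts[6])
        if best is None or f_score > best:
            best = f_score
    return best  # 0.9699 for the rows above
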
def run_experiment(job, options):
    """
    Toil job to run an experiment on a variety of conditions and compare the
    results.
    """

    # Make the IOStore we can search for GAMs
    gam_store = IOStore.get(options.in_gams)

    # And one so we can check if truth files exist
    truth_store = IOStore.get(options.truth)

    # This will hold best F score by region, graph, sample, and then
    # condition. We stick in dicts by condition.
    results = collections.defaultdict(lambda: collections.defaultdict(dict))

    # Make some experimental conditions with filter, pileup, call, and
    # glenn2vcf options.

    # First define the lists we want the product of for all the parameters
    grid = [{
        # vg filter
        "-r": [0.97], # minimum score to keep primary alignment [default=0]
        "-d": [0], # minimum (primary - secondary) score delta to keep secondary alignment
        "-e": [0], # minimum (primary - secondary) score delta to keep primary alignment
        "-a": [""], # use (secondary / primary) for delta comparisons
        "-f": [""], # normalize score based on length
        "-u": [""], # use substitution count instead of score
        "-s": [2], # minimum score to keep secondary alignment [default=0]
        "-o": [0] # filter reads whose alignments begin or end with an insert > N [default=99999]
    }, {
        # vg pileup
        "-w": [40], # size of window to apply -m option (default=0)
        "-m": [2], # ignore bases with > N mismatches within window centered on read (default=1)
        "-q": [10] # ignore bases with PHRED quality < N (default=0)
    }, {
        # vg call
        "-r": [0.0001], # Prior for being heterozygous
        "-b": [1.0], # Max strand bias
        "-f": [0.05], # Min fraction of reads required to support a variant
        "-d": [4] # Min pileup depth
    }, {
        # glenn2vcf
        "--depth": [10], # search depth, not read depth
        "--min_fraction": [0.15], # Min fraction of average coverage to call at
        "--min_count": [6], # Min total supporting reads for an allele to have it
        "--max_het_bias": [4.2] # Max bias towards one alt of a called het
    }, {
        # vcfeval
        "--all-records": [""],
        "--vcf-score-field": ["XAAD"]
    }]

    # Make the whole grid of conditions for the grid search
    conditions = [ExperimentCondition(*point) for point in make_grid(grid)]

    # Add a condition that opens everything way up so we can try and get
    # maximum recall.
    conditions.append(ExperimentCondition(
        {
            # vg filter
            "-r": 0,
            "-d": 0.05,
            "-e": 0.05,
            "-a": "",
            "-f": "",
            "-u": "",
            "-s": 10000,
            "-o": 99999
        }, {
            # vg pileup
            "-w": 40,
            "-m": 10,
            "-q": 10
        }, {
            # vg call
            "-r": 0.0001,
            "-b": 0.4,
            "-f": 0.25,
            "-d": 11
        }, {
            # glenn2vcf
            "--depth": 10,
            "--min_fraction": 0, # Min fraction of average coverage to call at
            "--min_count": 1, # Min total supporting reads for an allele to have it
            "--max_het_bias": 20 # Max bias towards one alt of a called het
        }, {
            # vcfeval
            "--all-records": "",
            "--vcf-score-field": "XAAD"
        })
    )

    RealTimeLogger.get().info("Running {} conditions...".format(
        len(conditions)))

    for region_dir in gam_store.list_input_directory(""):
        # Within every region we have samples for, look through all the
        # different graphs.

        if (options.important_regions is not None and
            region_dir not in options.important_regions):
            # Skip it if it's unimportant
            continue

        for graph_dir in gam_store.list_input_directory(region_dir):
            # Within every graph for a region, we have a collection of
            # samples.

            if ("{}:{}".format(region_dir, graph_dir) in options.blacklist or
                region_dir in options.blacklist or
                graph_dir in options.blacklist):
                # We don't want to process this region/graph pair.
                RealTimeLogger.get().info("Skipping {} graph {}".format(
                    region_dir, graph_dir))
                continue

            if (options.important_graphs is not None and
                graph_dir not in options.important_graphs):
                # Skip it if it's unimportant
                continue

            for filename in gam_store.list_input_directory("{}/{}".format(
                region_dir, graph_dir)):
                # Look at each potential sample file

                # Is this file a sample?
                match = re.match(r"(.+)\.gam$", filename)
                if not match:
                    # It's not a sample
                    continue

                if (options.important_samples is not None and
                    filename not in options.important_samples):
                    # Skip it if it's unimportant
                    continue

                # Otherwise, compose the full GAM key
                gam_key = "{}/{}/{}".format(region_dir, graph_dir, filename)

                if (not truth_store.exists(truth_compressed_key(gam_key)) or
                    not truth_store.exists(truth_index_key(gam_key))):
                    # We don't have a truth for this sample, so don't bother
                    # doing it.
                    RealTimeLogger.get().warning(
                        "Skipping missing truth for {}".format(gam_key))
                    continue

                # Kick off a pipeline to make the variant calls.
                # TODO: assumes all the extra directories we need to read
                # stuff from are set
                exp_job = job.addChildJobFn(run_conditions, gam_key,
                    conditions, options, cores=1, memory="2G", disk="10G")

                # Save the best F score by condition under this region,
                # graph, and sample filename
                results[region_dir][graph_dir][filename] = exp_job.rv()

    # Give back the results
    # TODO: we run it through JSON to fix the pickle-ability.
    return de_defaultdict(results)

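# make_grid is assumed to expand the grid specification above into one tuple
# of option dicts per point in the Cartesian product of all the option value
# lists, so that ExperimentCondition(*point) receives one dict per tool. A
# plausible sketch using itertools.product; the real implementation lives
# elsewhere in this script:
def _example_make_grid(grid):
    """
    Hypothetical make_grid: given a list of {option: [values...]} dicts (one
    per tool), yield tuples of {option: value} dicts, one per grid point.
    """
    import itertools
    # Expand each tool's dict into the list of all its {option: value}
    # combinations.
    per_tool = []
    for tool_options in grid:
        keys = list(tool_options.keys())
        combos = [dict(zip(keys, values)) for values in
            itertools.product(*(tool_options[key] for key in keys))]
        per_tool.append(combos)
    # Then take the product across tools, yielding one tuple of dicts per
    # point in the overall grid.
    return itertools.product(*per_tool)
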
def make_vcfeval_from_vcf(job, gam_key, condition, options):
    """
    Compute the performance of the given GAM (aligned to the given graph)
    against the truth set for its region. Places the vcfeval summary file for
    the given sample in the cache.

    Supports a pre-execution mode: if job is None, returns True if we really
    need to run the job, and False otherwise.
    """

    # Make IOStores
    cache_store = IOStore.get(options.cache)
    truth_store = IOStore.get(options.truth)

    # Determine where the results go
    # Summary
    out_summary_key = vcfeval_summary_key(gam_key, condition)
    # False positives VCF
    out_fp_key = vcfeval_fp_key(gam_key, condition)
    # False negatives VCF
    out_fn_key = vcfeval_fn_key(gam_key, condition)
    # ROC curve data
    out_roc_key = vcfeval_roc_key(gam_key, condition)

    if (cache_store.exists(out_summary_key) and
        cache_store.exists(out_fp_key) and
        cache_store.exists(out_fn_key) and
        cache_store.exists(out_roc_key)):
        # We already did this
        return False
    elif job is None:
        # We aren't really executing yet, but we need to.
        return True

    # Get the query VCF
    query_vcf_compressed = cache_store.get_input_file(job,
        vcf_compressed_key(gam_key, condition))
    query_vcf_index = cache_store.get_input_file(job,
        vcf_index_key(gam_key, condition))

    if query_vcf_index != query_vcf_compressed + ".tbi":
        # Hack them over to the right names with symlinks
        new_vcf_name = "{}/sample.vcf.gz".format(
            job.fileStore.getLocalTempDir())
        os.symlink(query_vcf_compressed, new_vcf_name)
        os.symlink(query_vcf_index, new_vcf_name + ".tbi")
        query_vcf_compressed = new_vcf_name
        query_vcf_index = query_vcf_compressed + ".tbi"

    # Find the truth VCF
    truth_vcf_compressed = truth_store.get_input_file(job,
        truth_compressed_key(gam_key))
    truth_vcf_index = truth_store.get_input_file(job,
        truth_index_key(gam_key))

    if truth_vcf_index != truth_vcf_compressed + ".tbi":
        # Hack them over to the right names with symlinks
        new_vcf_name = "{}/truth.vcf.gz".format(
            job.fileStore.getLocalTempDir())
        os.symlink(truth_vcf_compressed, new_vcf_name)
        os.symlink(truth_vcf_index, new_vcf_name + ".tbi")
        truth_vcf_compressed = new_vcf_name
        truth_vcf_index = truth_vcf_compressed + ".tbi"

    # Decide on an output directory
    out_dir = "{}/vcfeval".format(job.fileStore.getLocalTempDir())

    # Do the actual vcfeval comparison
    pipeline = []
    pipeline.append("rtg vcfeval -b {} -c {} -t {} -o {} {}".format(
        truth_vcf_compressed, query_vcf_compressed, options.sdf, out_dir,
        condition.get_vcfeval_options()))
    run(pipeline, fail_hard=True)

    # Save the result files back to the cache
    cache_store.write_output_file(out_dir + "/summary.txt", out_summary_key)
    cache_store.write_output_file(out_dir + "/fp.vcf.gz", out_fp_key)
    cache_store.write_output_file(out_dir + "/fn.vcf.gz", out_fn_key)
    cache_store.write_output_file(out_dir + "/weighted_roc.tsv.gz",
        out_roc_key)

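# The condition.get_*_options() methods used throughout (here
# get_vcfeval_options()) are assumed to flatten a tool's option dict into a
# command-line fragment. A hypothetical sketch of that flattening, where
# bare flags carry an empty-string value as in the grid above:
def _example_options_string(option_dict):
    """
    Hypothetical: {"-r": 0.97, "-a": ""} -> "-a -r 0.97".
    """
    parts = []
    for flag, value in sorted(option_dict.items()):
        # Bare flags are stored with "" values; everything else gets its
        # value appended after the flag.
        parts.append(flag if value == "" else "{} {}".format(flag, value))
    return " ".join(parts)
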
def make_vcf_from_glennfile(job, gam_key, condition, options):
    """
    Toil job which, assuming that the Glennfile and augmented graph have
    already been made for the given sample and experimental condition,
    produces the VCF. Needs the regions option to be specified.

    Returns nothing.

    Supports a pre-execution mode: if job is None, returns True if we really
    need to run the job, and False otherwise.
    """

    # Make IOStores
    cache_store = IOStore.get(options.cache)
    region_store = IOStore.get(options.regions)

    # Determine output keys
    out_vcf_compressed_key = vcf_compressed_key(gam_key, condition)
    out_vcf_index_key = vcf_index_key(gam_key, condition)
    out_vcf_log_key = vcf_log_key(gam_key, condition)

    if (cache_store.exists(out_vcf_compressed_key) and
        cache_store.exists(out_vcf_index_key) and
        cache_store.exists(out_vcf_log_key)):
        # We already made these files
        return False
    elif job is None:
        # We aren't really executing yet, but we need to.
        return True

    # Get the augmented graph from the cache
    input_augmented_graph = cache_store.get_input_file(job,
        augmented_graph_key(gam_key, condition))

    # Get the glennfile from the cache
    input_glennfile = cache_store.get_input_file(job,
        glennfile_key(gam_key, condition))

    # Get the BED that tells us where the region is
    region_bed = region_store.get_input_file(job,
        alignment_region_tag(gam_key).upper() + ".bed")

    with open(region_bed) as f:
        # Read the contig and offset we want our VCF to be in from the BED
        # file.
        contig, offset = f.readline().split()[0:2]

    # Get the sample name
    sample_name = alignment_sample_tag(gam_key)

    # Plan where to put the output VCF
    out_vcf = "{}/sample.vcf".format(job.fileStore.getLocalTempDir())
    # And its compressed and indexed versions
    out_vcf_compressed = out_vcf + ".gz"
    out_vcf_index = out_vcf_compressed + ".tbi"

    # Plan where to put the intermediate unsorted VCF
    unsorted_vcf = "{}/unsorted.vcf".format(job.fileStore.getLocalTempDir())

    # And the glenn2vcf error log (which has bases dropped, etc.)
    out_errlog = "{}/sample.err".format(job.fileStore.getLocalTempDir())

    # Do the actual VCF conversion
    pipeline = []
    pipeline.append("glenn2vcf {} {} -o {} -c {} -s {} {} > {} 2> {}".format(
        input_augmented_graph, input_glennfile, offset, contig, sample_name,
        condition.get_vcf_options(), unsorted_vcf, out_errlog))
    run(pipeline, fail_hard=True)

    pipeline = []
    # Sort the VCF
    pipeline.append("scripts/vcfsort {}".format(unsorted_vcf))
    # And uniquify it
    pipeline.append("vcfuniq > {}".format(out_vcf))
    run(pipeline, fail_hard=True)

    # Compress and index the VCF
    run(["bgzip {} -c > {}".format(out_vcf, out_vcf_compressed)],
        fail_hard=True)
    # TODO: This is forced to append .tbi as the index name
    run(["tabix -f -p vcf {}".format(out_vcf_compressed)], fail_hard=True)

    # Save the compressed VCF, its index, and its error log back to the cache
    cache_store.write_output_file(out_vcf_compressed, out_vcf_compressed_key)
    cache_store.write_output_file(out_vcf_index, out_vcf_index_key)
    cache_store.write_output_file(out_errlog, out_vcf_log_key)

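# Several of the jobs above support the job=None "pre-execution" probe
# described in their docstrings. A hypothetical sketch of how a driver such
# as run_conditions (defined elsewhere in this script) might use that probe
# to avoid scheduling Toil jobs whose outputs are already cached; the
# resource figures are placeholders:
def _example_schedule_if_needed(job, job_fn, gam_key, condition, options):
    """
    Hypothetical: call job_fn in pre-execution mode, and only add it as a
    child job if it reports that there is real work to do.
    """
    if job_fn(None, gam_key, condition, options):
        # Outputs are missing from the cache, so actually schedule the work.
        return job.addChildJobFn(job_fn, gam_key, condition, options,
            cores=1, memory="4G", disk="20G")
    # Everything this job would make is already cached; nothing to schedule.
    return None
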