Example #1
def make_glennfile_from_pileup(job, gam_key, condition, options):
    """
    Toil job which, assuming that the GAM has already been turned into a pileup
    in the cache for the given experimental condition, produces the augmented
    graph and associated glennfile in the cache for that same condition.
    
    Returns nothing.
    
    Supports a pre-execution mode: if job is None, returns True if we really
    need to run the job, and False otherwise.
    """
    
    # Make IOStores
    graph_store = IOStore.get(options.in_graphs)
    cache_store = IOStore.get(options.cache)
    
    # Determine output filenames
    out_glennfile_key = glennfile_key(gam_key, condition)
    out_augmented_graph_key = augmented_graph_key(gam_key, condition)
    
    if cache_store.exists(out_glennfile_key) and cache_store.exists(out_augmented_graph_key):
        # We already made these files
        return False
    elif job is None:
        # We aren't really executing yet, but we need to
        return True
    
    # Get the non-augmented graph
    input_graph = graph_store.get_input_file(job, graph_key(gam_key))
    
    # Get the pileup from the cache
    input_pileup = cache_store.get_input_file(job, pileup_key(gam_key, condition))
    
    # Plan where the output glennfile goes
    out_glennfile = "{}/sample.glenn".format(job.fileStore.getLocalTempDir())
    
    # And where the output augmented graph goes
    out_augmented_graph = "{}/sample.vg".format(job.fileStore.getLocalTempDir())
    
    # Do the actual vg call-ing
    pipeline = []
    pipeline.append("vg call {} {} {} -l -c {} -t {} > {}".format(
        input_graph, input_pileup, condition.get_call_options(),
        out_glennfile, job.cores, out_augmented_graph))
    run(pipeline, fail_hard = True)
    
    # Save the glennfile and augmented graph back to the cache
    cache_store.write_output_file(out_glennfile, out_glennfile_key)
    cache_store.write_output_file(out_augmented_graph, out_augmented_graph_key)
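
Several of these jobs share the same pre-execution convention: called with job=None, they only report whether their cached outputs still need to be produced. A minimal sketch of how a driver could use that convention to avoid scheduling redundant Toil jobs (schedule_if_needed and its resource figures are hypothetical; only the job=None protocol comes from the functions shown here):

def schedule_if_needed(parent_job, job_fn, gam_key, condition, options):
    # Probe the job function with job=None: per the convention above, it
    # returns True only when the cached outputs are missing.
    if job_fn(None, gam_key, condition, options):
        # Hypothetical resource requirements; tune them to the real workload.
        return parent_job.addChildJobFn(job_fn, gam_key, condition, options,
                                        cores=1, memory="4G", disk="20G")
    # Everything is already cached, so there is nothing to schedule.
    return None
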
Example #2
def make_pileup(job, gam_key, condition, options):
    """
    Toil job to make a pileup from the given GAM, in the given experimental
    condition.
    
    Loads the GAM from the input GAM IOStore, and saves the pileup to the right
    place in the cache IOStore for the given sample.
    
    Returns nothing.
    
    Supports a pre-execution mode: if job is None, returns True if we really
    need to run the job, and False otherwise.
    
    """
    
    # Make IOStores
    gam_store = IOStore.get(options.in_gams)
    graph_store = IOStore.get(options.in_graphs)
    cache_store = IOStore.get(options.cache)
    
    # Determine output key
    out_pileup_key = pileup_key(gam_key, condition)
    
    if cache_store.exists(out_pileup_key):
        # We already made this file. No need to run if we aren't already.
        return False
    elif job is None:
        # We aren't really executing yet, but we need to
        return True
    
    # Download the GAM
    input_gam = gam_store.get_input_file(job, gam_key)
    
    # And the graph it was aligned to
    input_graph = graph_store.get_input_file(job, graph_key(gam_key))
    
    # Plan where the output pileup goes
    out_pileup_path = "{}/pileup.vgpu".format(job.fileStore.getLocalTempDir())
    
    # Run the filter and pileup steps, and die if they fail
    pipeline = []
    pipeline.append("vg filter {} {}".format(input_gam,
        condition.get_read_filter_options()))
    pipeline.append("vg pileup {} - {} -t {} > {}".format(input_graph,
        condition.get_pileup_options(), job.cores, out_pileup_path))
    run(pipeline, fail_hard = True)
    
    # Upload the pileup to the cache for the current experimental conditions
    cache_store.write_output_file(out_pileup_path, out_pileup_key)
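
The run helper is not part of these examples. Since the vg pileup stage reads the filtered GAM from stdin (the "-" argument), run presumably joins the listed commands into one shell pipeline; a minimal sketch under that assumption:

import subprocess

def run(pipeline, fail_hard=False):
    # Join the stages with pipes and hand the whole thing to the shell, so
    # redirections such as "> pileup.vgpu" inside a stage keep working.
    command = " | ".join(pipeline)
    status = subprocess.call(command, shell=True)
    if status != 0 and fail_hard:
        raise RuntimeError("Pipeline failed with status {}: {}".format(
            status, command))
    return status
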
Example #3
def get_max_f_score(job, gam_key, condition, options):
    """
    Given the GAM file key for a sample that has already had vcfeval run under
    the given conditions, parse the vcfeval roc and return the biggest F score.
    
    """
    
    # Make the IOStore
    cache_store = IOStore.get(options.cache)
    
    # Find the ROC curve
    roc_key = vcfeval_roc_key(gam_key, condition)
    
    # Get the file
    roc_compressed = cache_store.get_input_file(job, roc_key)
    
    # Read it
    reader = tsv.TsvReader(gzip.GzipFile(roc_compressed))
    
    # What's the max F score we found?
    max_f_score = None
    for parts in reader:
        # Parse all the F scores
        f_score = float(parts[6])
        
        if max_f_score is None or f_score > max_f_score:
            # And keep the max
            max_f_score = f_score
            
    # Return the max F score.
    return max_f_score
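
tsv.TsvReader comes from an external package. The same scan can be done with only the standard library, assuming, as the code above implies, that the ROC file is tab-separated with the F measure in the seventh column and a commented header:

import gzip

def max_f_score_from_roc(roc_path):
    # Scan the gzip-compressed ROC table and keep the largest F measure seen.
    max_f_score = None
    with gzip.open(roc_path, "rt") as stream:
        for line in stream:
            parts = line.rstrip("\n").split("\t")
            if not parts[0] or parts[0].startswith("#"):
                # Skip blank lines and the commented header.
                continue
            f_score = float(parts[6])
            if max_f_score is None or f_score > max_f_score:
                max_f_score = f_score
    return max_f_score
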
Example #4
def run_experiment(job, options):
    """
    Toil job to run an experiment on a variety of conditions and compare the
    results.
    """
    
    # Make the IOStore we can search for GAMs
    gam_store = IOStore.get(options.in_gams)
    # And one so we can check if truth files exist
    truth_store = IOStore.get(options.truth)
    
    # This will hold best F score by region, graph, sample, and then condition.
    # We stick in dicts by condition.
    results = collections.defaultdict(lambda: collections.defaultdict(dict))
    
    # Make some experimental conditions with filter, pileup, call,
    # and glenn2vcf options. 
    
    # First define the lists we want the product of for all the parameters
    grid = [{ # vg filter 
            "-r": [0.97], # minimum score to keep primary alignment [default=0]
            "-d": [0], # mininum (primary - secondary) score delta to keep secondary alignment
            "-e": [0], # minimum (primary - secondary) score delta to keep primary alignment
            "-a": [""], # use (secondary / primary) for delta comparisons
            "-f": [""], # normalize score based on length
            "-u": [""], # use substitution count instead of score
            "-s": [2], # minimum score to keep secondary alignment [default=0]
            "-o": [0] #  filter reads whose alignments begin or end with an insert > N [default=99999]
        }, { # vg pileup
            "-w": [40], # size of window to apply -m option (default=0)
            "-m": [2], # ignore bases with > N mismatches within window centered on read (default=1)
            "-q": [10] # ignore bases with PHRED quality < N (default=0)
        }, { # vg call
            "-r": [0.0001], # Prior for being heterozygous
            "-b": [1.0], # Max strand bias
            "-f": [0.05], # Min fraction of reads required to support a variant
            "-d": [4] # Min pileup depth
        }, { # glenn2vcf
            "--depth": [10], # search depth not read depth
            "--min_fraction": [0.15], # Min fraction of average coverage to call at
            "--min_count": [6], # Min total supporting reads for an allele to have it
            "--max_het_bias": [4.2] # Max bias towards one alt of a called het
        }, { # vcfeval
            "--all-records": [""],
            "--vcf-score-field": ["XAAD"]
        }]
        
    # Make the whole grid of conditions for the grid search
    conditions = [ExperimentCondition(*point) for point in make_grid(grid)]

    # Add a condition that opens everything way up so we can try and
    # get maximum recall.
    conditions.append(ExperimentCondition(
        { # vg filter 
            "-r": 0,
            "-d": 0.05,
            "-e": 0.05,
            "-a": "",
            "-f": "",
            "-u": "",
            "-s": 10000,
            "-o": 99999
        }, { # vg pileup
            "-w": 40,
            "-m": 10,
            "-q": 10
        }, { # vg call
            "-r": 0.0001,
            "-b": 0.4,
            "-f": 0.25,
            "-d": 11
        }, { # glenn2vcf
            "--depth": 10,
            "--min_fraction": 0, # Min fraction of average coverage to call at
            "--min_count": 1, # Min total supporting reads for an allele to have it
            "--max_het_bias": 20 # Max bias towards one alt of a called het
        }, { # vcfeval
            "--all-records": "",
            "--vcf-score-field": "XAAD"
        })
    )
    
    RealTimeLogger.get().info("Running {} conditions...".format(len(conditions)))
    
    for region_dir in gam_store.list_input_directory(""):
        # Within every region we have samples for, look through all the
        # different graphs.
        
        if options.important_regions is not None and region_dir not in options.important_regions:
            # Skip it if it's unimportant
            continue
        
        for graph_dir in gam_store.list_input_directory(region_dir):
            # Within every graph for a region, we have a collection of samples.
            
            if ("{}:{}".format(region_dir, graph_dir) in options.blacklist or
                region_dir in options.blacklist or
                graph_dir in options.blacklist):
                # We don't want to process this region/graph pair.
                RealTimeLogger.get().info("Skipping {} graph {}".format(
                    region_dir, graph_dir))
                continue
                
            if options.important_graphs is not None and graph_dir not in options.important_graphs:
                # Skip it if it's unimportant
                continue
                
            for filename in gam_store.list_input_directory("{}/{}".format(
                region_dir, graph_dir)):
                # Look at each potential sample file
                
                # Is this file a sample?
                match = re.match("(.+)\\.gam$", filename)
                
                if not match:
                    # It's not a sample
                    continue
                    
                if options.important_samples is not None and filename not in options.important_samples:
                    # Skip it if it's unimportant
                    continue
                    
                # Otherwise, compose the full GAM key
                gam_key = "{}/{}/{}".format(region_dir, graph_dir, filename)
                
                if (not truth_store.exists(truth_compressed_key(gam_key)) or
                    not truth_store.exists(truth_index_key(gam_key))):
                    
                    # We don't have a truth for this sample, so don't bother doing it.
                    RealTimeLogger.get().warning("Skipping missing truth for {}".format(gam_key))
                    continue
                
                # Kick off a pipeline to make the variant calls.
                # TODO: assumes all the extra directories we need to read stuff from are set
                exp_job = job.addChildJobFn(run_conditions, gam_key, conditions, options,
                    cores=1, memory="2G", disk="10G")
                    
                # Save the best F score by condition under this region, graph, and sample filename
                results[region_dir][graph_dir][filename] = exp_job.rv()
                
    # Give back the results
    # TODO: we run it through JSON to fix the pickle-ability.
    return de_defaultdict(results)
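
make_grid and ExperimentCondition are defined elsewhere. From their use above, make_grid takes one {option: list of values} dict per tool and yields every combination, so that ExperimentCondition(*point) receives one concrete option dict per tool. A hypothetical implementation consistent with that usage:

import itertools

def make_grid(grid):
    # Expand each tool's {option: [values...]} dict into every concrete
    # {option: value} assignment, then take the Cartesian product across
    # tools. Each yielded point is a tuple with one option dict per tool.
    per_tool_choices = []
    for tool_options in grid:
        keys = list(tool_options.keys())
        value_lists = [tool_options[key] for key in keys]
        per_tool_choices.append([dict(zip(keys, values))
                                 for values in itertools.product(*value_lists)])
    return itertools.product(*per_tool_choices)
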
Example #5
def make_vcfeval_from_vcf(job, gam_key, condition, options):
    """
    Compute the performance of the given GAM (aligned to the given graph)
    against the truth set for its region.
    
    Places the vcfeval summary file for the given sample in the cache.
    
    Supports a pre-execution mode: if job is None, returns True if we really
    need to run the job, and False otherwise.
    """
    
    # Make IOStores
    cache_store = IOStore.get(options.cache)
    truth_store = IOStore.get(options.truth)
    
    # Determine where the results go
    # Summary
    out_summary_key = vcfeval_summary_key(gam_key, condition)
    # False positives VCF
    out_fp_key = vcfeval_fp_key(gam_key, condition)
    # False negatives VCF
    out_fn_key = vcfeval_fn_key(gam_key, condition)
    # ROC curve data
    out_roc_key = vcfeval_roc_key(gam_key, condition)
    
    if (cache_store.exists(out_summary_key) and
        cache_store.exists(out_fp_key) and
        cache_store.exists(out_fn_key) and
        cache_store.exists(out_roc_key)):
        # We already did this
        return False
    elif job is None:
        # We aren't really executing yet, but we need to
        return True

    # Get the query VCF
    query_vcf_compressed = cache_store.get_input_file(job, vcf_compressed_key(gam_key, condition))
    query_vcf_index = cache_store.get_input_file(job, vcf_index_key(gam_key, condition))
    
    if query_vcf_index != query_vcf_compressed + ".tbi":
        # Hack them over to the right names with symlinks
        new_vcf_name = "{}/sample.vcf.gz".format(job.fileStore.getLocalTempDir())
        os.symlink(query_vcf_compressed, new_vcf_name)
        os.symlink(query_vcf_index, new_vcf_name + ".tbi")
        query_vcf_compressed = new_vcf_name
        query_vcf_index = query_vcf_compressed + ".tbi"
        
    # Find the truth VCF
    truth_vcf_compressed = truth_store.get_input_file(job, truth_compressed_key(gam_key))
    truth_vcf_index = truth_store.get_input_file(job, truth_index_key(gam_key))
    
    if truth_vcf_index != truth_vcf_compressed + ".tbi":
        # Hack them over to the right names with symlinks
        new_vcf_name = "{}/truth.vcf.gz".format(job.fileStore.getLocalTempDir())
        os.symlink(truth_vcf_compressed, new_vcf_name)
        os.symlink(truth_vcf_index, new_vcf_name + ".tbi")
        truth_vcf_compressed = new_vcf_name
        truth_vcf_index = truth_vcf_compressed + ".tbi"
    
    # Decide on an output directory
    out_dir = "{}/vcfeval".format(job.fileStore.getLocalTempDir())
    
    # Do the actual VCF conversion
    pipeline = []
    pipeline.append("rtg vcfeval -b {} -c {} -t {} -o {} {}".format(
        truth_vcf_compressed, query_vcf_compressed, options.sdf, out_dir,
        condition.get_vcfeval_options()))
    run(pipeline, fail_hard=True)
    
    # Save the result files back to the cache
    cache_store.write_output_file(out_dir + "/summary.txt", out_summary_key)
    cache_store.write_output_file(out_dir + "/fp.vcf.gz", out_fp_key)
    cache_store.write_output_file(out_dir + "/fn.vcf.gz", out_fn_key)
    cache_store.write_output_file(out_dir + "/weighted_roc.tsv.gz", out_roc_key)
Example #6
def make_vcf_from_glennfile(job, gam_key, condition, options):
    """
    Toil job which, assuming that the Glennfile and augmented graph have already
    been made for the given sample and experimental condition, produces the VCF.
    
    Needs the regions option to be specified.
    
    Returns nothing.
    
    Supports a pre-execution mode: if job is None, returns True if we really
    need to run the job, and False otherwise.
    """
    
    # Make IOStores
    cache_store = IOStore.get(options.cache)
    region_store = IOStore.get(options.regions)
    
    # Determine output keys
    out_vcf_compressed_key = vcf_compressed_key(gam_key, condition)
    out_vcf_index_key = vcf_index_key(gam_key, condition)
    out_vcf_log_key = vcf_log_key(gam_key, condition)
    
    if (cache_store.exists(out_vcf_compressed_key) and 
        cache_store.exists(out_vcf_index_key) and
        cache_store.exists(out_vcf_log_key)):
        # We already made these files
        return False
    elif job is None:
        # We aren't really executing yet, but we need to
        return True
    
    # Get the augmented graph from the cache
    input_augmented_graph = cache_store.get_input_file(job, augmented_graph_key(gam_key, condition))
    
    # Get the glennfile from the cache
    input_glennfile = cache_store.get_input_file(job, glennfile_key(gam_key, condition))
    
    # Get the BED that tells us where the region is
    region_bed = region_store.get_input_file(job, alignment_region_tag(gam_key).upper() + ".bed")

    with open(region_bed) as f:
        # Read the contig and offset we want our VCF to be in from the BED file.
        contig, offset = f.readline().split()[0:2]
        
    # Get the sample name
    sample_name = alignment_sample_tag(gam_key)
    
    # Plan where to put the output VCF
    out_vcf = "{}/sample.vcf".format(job.fileStore.getLocalTempDir())
    
    # And its compressed and indexed versions
    out_vcf_compressed = out_vcf + ".gz"
    out_vcf_index = out_vcf_compressed + ".tbi"
    
    # Plan where to put the intermediate unsorted VCF
    unsorted_vcf = "{}/unsorted.vcf".format(job.fileStore.getLocalTempDir())
    
    # And the glenn2vcf error log (which has bases dropped, etc.)
    out_errlog = "{}/sample.err".format(job.fileStore.getLocalTempDir())
    
    # Do the actual VCF conversion
    pipeline = []
    pipeline.append("glenn2vcf {} {} -o {} -c {} -s {} {} > {} 2> {}".format(
        input_augmented_graph, input_glennfile, offset, contig, sample_name,
        condition.get_vcf_options(), unsorted_vcf, out_errlog))
    run(pipeline, fail_hard = True)
    
    pipeline = []
    # Sort the VCF
    pipeline.append("scripts/vcfsort {}".format(unsorted_vcf))
    # And uniquify it
    pipeline.append("vcfuniq > {}".format(out_vcf))
    run(pipeline, fail_hard = True)
    
    # Compress and index the VCF
    run(["bgzip {} -c > {}".format(out_vcf, out_vcf_compressed)], fail_hard=True)
    # TODO: This is forced to append .tbi as the index name
    run(["tabix -f -p vcf {}".format(out_vcf_compressed)], fail_hard=True)
    
    # Save the compressed VCF, its index, and its error log back to the cache
    cache_store.write_output_file(out_vcf_compressed, out_vcf_compressed_key)
    cache_store.write_output_file(out_vcf_index, out_vcf_index_key)
    cache_store.write_output_file(out_errlog, out_vcf_log_key)
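
run_conditions, which run_experiment schedules once per sample, is not included in these examples. One way it could chain the per-condition jobs above while honoring their pre-execution checks might look like the sketch below; the structure, resource figures, and the use of the condition object as a dict key are all assumptions:

def run_conditions(job, gam_key, conditions, options):
    # For each condition, chain pileup -> call -> VCF -> vcfeval, skipping
    # stages whose cached outputs already exist, then read off the best F
    # score once the chain has finished.
    best_f_scores = {}
    for condition in conditions:
        prev = None
        for step in (make_pileup, make_glennfile_from_pileup,
                     make_vcf_from_glennfile, make_vcfeval_from_vcf):
            if not step(None, gam_key, condition, options):
                # Cached outputs already present; nothing to schedule.
                continue
            if prev is None:
                prev = job.addChildJobFn(step, gam_key, condition, options,
                    cores=1, memory="4G", disk="20G")
            else:
                prev = prev.addFollowOnJobFn(step, gam_key, condition, options,
                    cores=1, memory="4G", disk="20G")
        if prev is None:
            scorer = job.addChildJobFn(get_max_f_score, gam_key, condition,
                options, cores=1, memory="2G", disk="2G")
        else:
            scorer = prev.addFollowOnJobFn(get_max_f_score, gam_key, condition,
                options, cores=1, memory="2G", disk="2G")
        # Assumes ExperimentCondition instances can be used as dict keys.
        best_f_scores[condition] = scorer.rv()
    return best_f_scores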