def run_main(config, config_file, fc_dir, run_info_yaml):
    """Process every lane, then every sample, of the flowcell in ``fc_dir``.

    Run details are read from ``run_info_yaml`` when that path is given and
    exists on disk; otherwise they are requested from the Galaxy instance
    configured in ``config``. The current working directory is used as the
    work directory, with ``alignments`` beneath it passed on as the alignment
    directory. Metrics are written after all samples have been processed.
    """
    work_dir = os.getcwd()
    fc_name, fc_date = get_flowcell_info(fc_dir)

    if not (run_info_yaml and os.path.exists(run_info_yaml)):
        log.info("Fetching run details from Galaxy instance")
        galaxy_api = GalaxyApiAccess(config["galaxy_url"], config["galaxy_api_key"])
        run_info = galaxy_api.run_details(fc_name, fc_date)
    else:
        log.info("Found YAML samplesheet, using %s instead of Galaxy API" % run_info_yaml)
        with open(run_info_yaml) as in_handle:
            # NOTE(review): yaml.load can construct arbitrary objects; confirm
            # the samplesheet always comes from a trusted source.
            run_details = yaml.load(in_handle)
        run_info = {"details": run_details, "run_id": ""}

    fastq_dir = get_fastq_dir(fc_dir)
    run_items = _add_multiplex_across_lanes(run_info["details"], fastq_dir, fc_name)
    align_dir = os.path.join(work_dir, "alignments")

    # process each flowcell lane; the loop only drains the results
    with utils.cpmap(config["algorithm"]["num_cores"]) as cpmap:
        lane_args = ((item, fastq_dir, fc_name, fc_date, align_dir, config, config_file)
                     for item in run_items)
        for _ in cpmap(process_lane, lane_args):
            pass

    # process samples, potentially multiplexed across multiple lanes
    sample_files, sample_fastq, sample_info = organize_samples(
        align_dir, fastq_dir, work_dir, fc_name, fc_date, run_items)
    with utils.cpmap(config["algorithm"]["num_cores"]) as cpmap:
        sample_args = ((name, sample_fastq[name], sample_info[name], bam_files,
                        work_dir, config, config_file)
                       for name, bam_files in sample_files)
        for _ in cpmap(process_sample, sample_args):
            pass

    write_metrics(run_info, work_dir, fc_dir, fc_name, fc_date, fastq_dir)
def _generate_fastq(fc_dir, config, compress_fastq):
    """Convert the flowcell's qseq files to fastq and return the fastq directory.

    When ``config`` has a non-empty ``postprocess_dir``, output is directed to
    ``<postprocess_dir>/<flowcell basename>/fastq`` via ``-o``. ``--gzip`` is
    passed to the converter when ``compress_fastq`` is true. The conversion is
    skipped only when the fastq directory equals ``fc_dir``.
    """
    fc_name, fc_date = get_flowcell_info(fc_dir)
    short_fc_name = "%s_%s" % (fc_date, fc_name)
    fastq_dir = get_fastq_dir(fc_dir)
    # qseq files are looked up in the parent of the default fastq directory
    basecall_dir = os.path.dirname(fastq_dir)
    postprocess_dir = config.get("postprocess_dir", "")
    if postprocess_dir:
        fastq_dir = os.path.join(postprocess_dir, os.path.basename(fc_dir), "fastq")

    if fastq_dir == fc_dir:
        return fastq_dir

    with utils.chdir(basecall_dir):
        # lane identifier is the second underscore-separated field of each name
        lane_ids = set()
        for qseq_name in glob.glob("*qseq.txt"):
            lane_ids.add(qseq_name.split("_")[1])
        cl = ["solexa_qseq_to_fastq.py", short_fc_name, ",".join(sorted(lane_ids))]
        if postprocess_dir:
            cl.extend(["-o", fastq_dir])
        if compress_fastq:
            cl.append("--gzip")
        logger2.debug("Converting qseq to fastq on all lanes.")
        subprocess.check_call(cl)
    return fastq_dir
def run_main(config, config_file, fc_dir, work_dir, run_info_yaml):
    """Run the lane, alignment and sample stages for one flowcell.

    Every parallel stage is dispatched through a runner built for the
    ``bcbio.distributed`` module. A project summary and metrics are written
    once all sample stages have completed.
    """
    align_dir = os.path.join(work_dir, "alignments")
    run_module = "bcbio.distributed"
    fc_name, fc_date, run_info = get_run_info(fc_dir, config, run_info_yaml)
    fastq_dir, galaxy_dir, config_dir = _get_full_paths(
        get_fastq_dir(fc_dir), config, config_file)
    # from here on, use the copy of the config file inside config_dir
    config_file = os.path.join(config_dir, os.path.basename(config_file))
    dirs = dict(fastq=fastq_dir, galaxy=galaxy_dir, align=align_dir,
                work=work_dir, flowcell=fc_dir, config=config_dir)
    run_parallel = parallel_runner(run_module, dirs, config, config_file)

    # process each flowcell lane
    run_items = add_multiplex_across_lanes(run_info["details"], dirs["fastq"], fc_name)
    lane_args = ((item, fc_name, fc_date, dirs, config) for item in run_items)
    align_items = run_parallel("process_alignment",
                               run_parallel("process_lane", lane_args))

    # process samples, potentially multiplexed across multiple lanes
    samples = organize_samples(align_items, dirs, config_file)
    for stage in ("merge_sample", "recalibrate_sample"):
        samples = run_parallel(stage, samples)
    samples = parallel_realign_sample(samples, run_parallel)
    samples = parallel_variantcall(samples, run_parallel)
    samples = run_parallel("process_sample", samples)
    samples = run_parallel("generate_bigwig", samples, {"programs": ["ucsc_bigwig"]})

    write_project_summary(samples)
    write_metrics(run_info, fc_name, fc_date, dirs)
def run_main(config, config_file, work_dir, parallel, fc_dir=None, run_info_yaml=None):
    """Run toplevel analysis, processing a set of input files.

    config_file -- Main YAML configuration file with system parameters
    fc_dir -- Directory of fastq files to process
    run_info_yaml -- YAML configuration file specifying inputs to process

    Lanes are processed first, then grouped by pipeline; each single-sample
    result a pipeline produces is handed to ``upload.from_sample``.
    """
    setup_logging(config)
    fc_name, fc_date, run_info = get_run_info(fc_dir, config, run_info_yaml)
    raw_fastq_dir = get_fastq_dir(fc_dir) if fc_dir else None
    fastq_dir, galaxy_dir, config_dir = _get_full_paths(raw_fastq_dir, config, config_file)
    # from here on, use the copy of the config file inside config_dir
    config_file = os.path.join(config_dir, os.path.basename(config_file))
    dirs = dict(fastq=fastq_dir, galaxy=galaxy_dir, work=work_dir,
                flowcell=fc_dir, config=config_dir)
    run_parallel = parallel_runner(parallel, dirs, config, config_file)

    # process each flowcell lane
    run_items = add_multiplex_across_lanes(run_info["details"], dirs["fastq"], fc_name)
    lane_args = ((item, fc_name, fc_date, dirs, config) for item in run_items)
    lane_items = run_parallel("process_lane", lane_args)

    pipelines = _pair_lanes_with_pipelines(lane_items)
    for pipeline, pipeline_items in pipelines.items():
        results = pipeline.run(config, config_file, run_parallel, dirs, pipeline_items)
        for xs in results:
            # each pipeline result is expected to wrap exactly one sample
            assert len(xs) == 1
            upload.from_sample(xs[0])
    write_metrics(run_info, fc_name, fc_date, dirs)
def _generate_fastq(fc_dir, config, compress_fastq):
    """Produce fastq files from the flowcell's qseq files; return their directory.

    A non-empty ``postprocess_dir`` in ``config`` redirects output to
    ``<postprocess_dir>/<flowcell basename>/fastq`` (passed with ``-o``).
    ``compress_fastq`` adds ``--gzip`` to the converter command line. Nothing
    is run when the resulting fastq directory is ``fc_dir`` itself.
    """
    fc_name, fc_date = get_flowcell_info(fc_dir)
    short_fc_name = "%s_%s" % (fc_date, fc_name)
    fastq_dir = get_fastq_dir(fc_dir)
    # the converter is run from the parent of the default fastq directory
    basecall_dir = os.path.dirname(fastq_dir)
    postprocess_dir = config.get("postprocess_dir", "")
    if postprocess_dir:
        fastq_dir = os.path.join(postprocess_dir, os.path.basename(fc_dir), "fastq")

    if fastq_dir != fc_dir:
        with utils.chdir(basecall_dir):
            # unique lane ids, taken from the second "_" field of each qseq name
            lane_ids = sorted(set(name.split("_")[1] for name in glob.glob("*qseq.txt")))
            cl = ["solexa_qseq_to_fastq.py", short_fc_name, ",".join(lane_ids)]
            if postprocess_dir:
                cl = cl + ["-o", fastq_dir]
            if compress_fastq:
                cl = cl + ["--gzip"]
            logger2.debug("Converting qseq to fastq on all lanes.")
            subprocess.check_call(cl)
    return fastq_dir
def run_main(config, config_file, fc_dir, run_info_yaml):
    """Run lane processing, alignment and sample processing for one flowcell.

    The current working directory is the work directory. Each stage is handed
    to ``_run_parallel`` together with the directory map and configuration,
    and metrics are written at the end.
    """
    work_dir = os.getcwd()
    align_dir = os.path.join(work_dir, "alignments")
    fc_name, fc_date = get_flowcell_info(fc_dir)
    run_info = _get_run_info(fc_name, fc_date, config, run_info_yaml)
    fastq_dir, galaxy_dir, config_dir = _get_full_paths(
        get_fastq_dir(fc_dir), config, config_file)
    # from here on, use the copy of the config file inside config_dir
    config_file = os.path.join(config_dir, os.path.basename(config_file))
    dirs = dict(fastq=fastq_dir, galaxy=galaxy_dir, align=align_dir,
                work=work_dir, flowcell=fc_dir, config=config_dir)
    run_items = add_multiplex_across_lanes(run_info["details"], dirs["fastq"], fc_name)

    # process each flowcell lane
    lane_args = ((item, fc_name, fc_date, dirs, config) for item in run_items)
    lane_items = _run_parallel("process_lane", lane_args, dirs, config)
    _run_parallel("process_alignment", lane_items, dirs, config)

    # process samples, potentially multiplexed across multiple lanes
    sample_files, sample_fastq, sample_info = organize_samples(
        dirs, fc_name, fc_date, run_items)
    sample_args = ((name, sample_fastq[name], sample_info[name], bam_files,
                    dirs, config, config_file)
                   for name, bam_files in sample_files)
    _run_parallel("process_sample", sample_args, dirs, config)

    write_metrics(run_info, fc_name, fc_date, dirs)
def select_upload_files(base, bc_id, fc_dir, analysis_dir):
    """Select fastq, bam alignment and summary files for upload to Galaxy.

    Yields ``(path, upload_name)`` tuples. Fastq files are searched in
    ``analysis_dir`` when ``bc_id`` is set, otherwise in the flowcell's fastq
    directory. All other files are searched in ``analysis_dir`` using names
    that start with ``base`` followed by ``-``.
    """
    def _found(suffix):
        """Return analysis_dir files named ``<base>-*<suffix>``."""
        return glob.glob(os.path.join(analysis_dir, "%s-*%s" % (base, suffix)))

    fastq_dir = analysis_dir if bc_id else get_fastq_dir(fc_dir)
    for fname in glob.glob(os.path.join(fastq_dir, "%s_*fastq.txt" % base)):
        yield (fname, os.path.basename(fname))
    for summary_file in _found("summary.pdf"):
        yield (summary_file, _name_with_ext(summary_file, "-summary.pdf"))
    for bam_file in _found("sort-dup.bam"):
        yield (bam_file, _name_with_ext(bam_file, ".bam"))
    for wig_file in _found("sort.bigwig"):
        yield (wig_file, _name_with_ext(wig_file, "-coverage.bigwig"))
    # upload any recalibrated BAM files used for SNP calling; plain
    # recalibrated files are only used when no realigned ones exist
    found_recal = False
    for bam_file in _found("gatkrecal-realign-sort.bam"):
        found_recal = True
        yield (bam_file, _name_with_ext(bam_file, "-gatkrecal-realign.bam"))
    if not found_recal:
        for bam_file in _found("gatkrecal.bam"):
            yield (bam_file, _name_with_ext(bam_file, "-gatkrecal.bam"))
    # Genotype files produced by SNP calling. The upload name is derived from
    # the SNP file itself, not from a leftover BAM loop variable that may be
    # unbound or belong to a different file.
    for snp_file in _found("snp-filter.vcf"):
        yield (snp_file, _name_with_ext(snp_file, "-snp-filter.vcf"))
    # Effect information on SNPs
    for snp_file in _found("snp-filter-effects.tsv"):
        yield (snp_file, _name_with_ext(snp_file, "-snp-effects.tsv"))
def run_main(config, config_file, work_dir, parallel, fc_dir=None, run_info_yaml=None):
    """Run toplevel analysis, processing a set of input files.

    config_file -- Main YAML configuration file with system parameters
    fc_dir -- Directory of fastq files to process
    run_info_yaml -- YAML configuration file specifying inputs to process

    A progress message is logged before each stage.
    """
    setup_logging(config)
    align_dir = os.path.join(work_dir, "alignments")
    fc_name, fc_date, run_info = get_run_info(fc_dir, config, run_info_yaml)
    raw_fastq_dir = get_fastq_dir(fc_dir) if fc_dir else None
    fastq_dir, galaxy_dir, config_dir = _get_full_paths(raw_fastq_dir, config, config_file)
    # from here on, use the copy of the config file inside config_dir
    config_file = os.path.join(config_dir, os.path.basename(config_file))
    dirs = dict(fastq=fastq_dir, galaxy=galaxy_dir, align=align_dir,
                work=work_dir, flowcell=fc_dir, config=config_dir)
    config = _set_resources(parallel, config)
    run_parallel = parallel_runner(parallel, dirs, config, config_file)

    # Lanes are parsed directly here rather than through a parallel
    # "process_lane" stage.
    logger.info(">>> Parse lane")
    lane_items = parse_lane(run_info["details"], fc_name, fc_date, dirs, config)
    logger.info(">>> Process alignment")
    align_items = run_parallel("process_alignment", lane_items)

    # process samples, potentially multiplexed across multiple lanes
    samples = organize_samples(align_items, dirs, config_file)
    logger.info(">>> Merge samples")
    samples = run_parallel("merge_sample", samples)
    logger.info(">>> Recalibrate samples")
    samples = run_parallel("recalibrate_sample", samples)
    logger.info(">>> realign sample")
    samples = parallel_realign_sample(samples, run_parallel)
    logger.info(">>> variantcall")
    samples = parallel_variantcall(samples, run_parallel)
    logger.info(">>> postprocess_variatns")
    samples = run_parallel("postprocess_variants", samples)
    logger.info(">>> combine_multiple_calles")
    samples = combine_multiple_callers(samples)
    logger.info(">>> detect_sv")
    samples = run_parallel("detect_sv", samples)
    logger.info(">>> combine_calls")
    samples = run_parallel("combine_calls", samples)

    # the results of the last two parallel stages are not kept
    logger.info(">>> process_sample")
    run_parallel("process_sample", samples)
    logger.info(">>> Generate bigwig")
    run_parallel("generate_bigwig", samples, {"programs": ["ucsc_bigwig"]})

    logger.info(">>> Writing project summary")
    write_project_summary(samples)
    logger.info(">>> Writing metrics")
    write_metrics(run_info, fc_name, fc_date, dirs)
    logger.info(">>> Done")
def select_upload_files(lane, fc_dir, analysis_dir):
    """Select fastq, bam alignment and summary files for upload to Galaxy.

    Yields ``(path, upload_name)`` tuples. Fastq files come from the
    flowcell's fastq directory; everything else comes from ``analysis_dir``.
    All matches have names starting with ``lane`` followed by ``_``.
    """
    def _found(suffix):
        """Return analysis_dir files named ``<lane>_*<suffix>``."""
        return glob.glob(os.path.join(analysis_dir, "%s_*%s" % (lane, suffix)))

    # fastq, summary and alignment file
    for fname in glob.glob(os.path.join(get_fastq_dir(fc_dir), "%s_*_fastq.txt" % lane)):
        yield (fname, os.path.basename(fname))
    for summary_file in _found("-summary.pdf"):
        yield (summary_file, _name_with_ext(summary_file, "-summary.pdf"))
    for bam_file in _found("-sort-dup.bam"):
        yield (bam_file, _name_with_ext(bam_file, ".bam"))
    for wig_file in _found("-sort.bigwig"):
        yield (wig_file, _name_with_ext(wig_file, "-coverage.bigwig"))
    # upload any recalibrated BAM files used for SNP calling; plain
    # recalibrated files are only used when no realigned ones exist
    found_recal = False
    for bam_file in _found("-gatkrecal-realign-sort.bam"):
        found_recal = True
        yield (bam_file, _name_with_ext(bam_file, "-gatkrecal-realign.bam"))
    if not found_recal:
        for bam_file in _found("-gatkrecal.bam"):
            yield (bam_file, _name_with_ext(bam_file, "-gatkrecal.bam"))
    # Genotype files produced by SNP calling. The upload name is derived from
    # the SNP file itself, not from a leftover BAM loop variable that may be
    # unbound or belong to a different file.
    for snp_file in _found("-snp-filter.vcf"):
        yield (snp_file, _name_with_ext(snp_file, "-snp-filter.vcf"))
def run_main(config, config_file, fc_dir, work_dir, run_info_yaml):
    """Run the lane, alignment and sample stages for one flowcell.

    Parallel stages go through a runner built for the ``bcbio.distributed``
    module. Sample stages include structural variant detection
    (``detect_sv``) before final sample processing. A project summary and
    metrics are written at the end.
    """
    align_dir = os.path.join(work_dir, "alignments")
    run_module = "bcbio.distributed"
    fc_name, fc_date, run_info = get_run_info(fc_dir, config, run_info_yaml)
    raw_fastq_dir = get_fastq_dir(fc_dir)
    fastq_dir, galaxy_dir, config_dir = _get_full_paths(raw_fastq_dir, config, config_file)
    # from here on, use the copy of the config file inside config_dir
    config_file = os.path.join(config_dir, os.path.basename(config_file))
    dirs = dict(fastq=fastq_dir, galaxy=galaxy_dir, align=align_dir,
                work=work_dir, flowcell=fc_dir, config=config_dir)
    run_parallel = parallel_runner(run_module, dirs, config, config_file)

    # process each flowcell lane
    run_items = add_multiplex_across_lanes(run_info["details"], dirs["fastq"], fc_name)
    lane_args = ((item, fc_name, fc_date, dirs, config) for item in run_items)
    lane_items = run_parallel("process_lane", lane_args)
    align_items = run_parallel("process_alignment", lane_items)

    # process samples, potentially multiplexed across multiple lanes
    samples = organize_samples(align_items, dirs, config_file)
    for stage in ("merge_sample", "recalibrate_sample"):
        samples = run_parallel(stage, samples)
    samples = parallel_realign_sample(samples, run_parallel)
    samples = parallel_variantcall(samples, run_parallel)
    for stage in ("detect_sv", "process_sample"):
        samples = run_parallel(stage, samples)
    samples = run_parallel("generate_bigwig", samples, {"programs": ["ucsc_bigwig"]})

    write_project_summary(samples)
    write_metrics(run_info, fc_name, fc_date, dirs)
def _run_toplevel(config, config_file, work_dir, parallel, fc_dir=None, run_info_yaml=None):
    """Run toplevel analysis, processing a set of input files.

    config_file -- Main YAML configuration file with system parameters
    fc_dir -- Directory of fastq files to process
    run_info_yaml -- YAML configuration file specifying inputs to process

    Pipelines run inside a temporary directory that is also installed as
    ``tempfile.tempdir``. Each single-sample pipeline result is passed to
    ``upload.from_sample``.
    """
    parallel = log.create_base_logger(config, parallel)
    log.setup_local_logging(config, parallel)
    raw_fastq_dir = get_fastq_dir(fc_dir) if fc_dir else None
    fastq_dir, galaxy_dir, config_dir = _get_full_paths(raw_fastq_dir, config, config_file)
    # from here on, use the copy of the config file inside config_dir
    config_file = os.path.join(config_dir, os.path.basename(config_file))
    dirs = dict(fastq=fastq_dir, galaxy=galaxy_dir, work=work_dir,
                flowcell=fc_dir, config=config_dir)
    run_items = run_info.organize(dirs, config, run_info_yaml)
    run_parallel = parallel_runner(parallel, dirs, config, config_file)

    # process each flowcell lane
    lane_items = lane.process_all_lanes(run_items, run_parallel)
    pipelines = _pair_lanes_with_pipelines(lane_items)
    # NOTE(review): results are collected here but no return statement is
    # visible for this function -- confirm whether callers expect a value.
    final = []
    with utils.curdir_tmpdir() as tmpdir:
        # route temporary files created via tempfile into this directory
        tempfile.tempdir = tmpdir
        for pipeline, pipeline_items in pipelines.items():
            pipeline_items = _add_provenance(pipeline_items, dirs, run_parallel,
                                             parallel, config)
            versioncheck.testall(pipeline_items)
            results = pipeline.run(config, config_file, run_parallel, parallel,
                                   dirs, pipeline_items)
            for xs in results:
                if len(xs) != 1:
                    continue
                upload.from_sample(xs[0])
                final.append(xs[0])
def main(config_file, fc_dir):
    """Process every lane of a flowcell and then write run metrics.

    config_file -- YAML configuration file; must provide the Galaxy URL and
                   API key plus ``algorithm.num_cores``
    fc_dir -- Flowcell directory to process

    Lanes are processed in a multiprocessing pool when ``num_cores`` is
    greater than one, otherwise one after another in this process. Fastq
    files are only located here; this function does not generate them.
    """
    work_dir = os.getcwd()
    with open(config_file) as in_handle:
        # NOTE(review): yaml.load can construct arbitrary objects; confirm the
        # configuration file always comes from a trusted source.
        config = yaml.load(in_handle)
    galaxy_api = GalaxyApiAccess(config['galaxy_url'], config['galaxy_api_key'])
    fc_name, fc_date = get_flowcell_info(fc_dir)
    run_info = galaxy_api.run_details(fc_name)
    fastq_dir = get_fastq_dir(fc_dir)

    lane_args = ((i, fastq_dir, fc_name, fc_date, config, config_file)
                 for i in run_info["details"])
    num_cores = config["algorithm"]["num_cores"]
    if num_cores > 1:
        pool = Pool(num_cores)
        try:
            pool.map(_process_wrapper, lane_args)
        except BaseException:
            # also tear workers down on KeyboardInterrupt, then re-raise
            pool.terminate()
            raise
        else:
            # release the worker processes once all lanes are done
            pool.close()
            pool.join()
    else:
        # Explicit loop: the work is a side effect, and map() is lazy on
        # Python 3, so nothing would run there.
        for args in lane_args:
            _process_wrapper(args)
    write_metrics(run_info, work_dir, fc_dir, fastq_dir)
def select_upload_files(base, bc_id, fc_dir, analysis_dir):
    """Select fastq, bam alignment and summary files for upload to Galaxy.

    Yields ``(path, upload_name)`` tuples. Fastq files are looked up in
    ``analysis_dir`` when ``bc_id`` is set and in the flowcell's fastq
    directory otherwise. The remaining file types are looked up in
    ``analysis_dir`` under names beginning with ``base`` and ``-``.
    """
    def _in_analysis(pattern):
        """Glob ``pattern`` (with ``base`` substituted) inside analysis_dir."""
        return glob.glob(os.path.join(analysis_dir, pattern % base))

    fastq_dir = analysis_dir if bc_id else get_fastq_dir(fc_dir)
    for fname in glob.glob(os.path.join(fastq_dir, "%s_*fastq.txt" % base)):
        yield (fname, os.path.basename(fname))
    for summary_file in _in_analysis("%s-*summary.pdf"):
        yield (summary_file, _name_with_ext(summary_file, "-summary.pdf"))
    for bam_file in _in_analysis("%s-*sort-dup.bam"):
        yield (bam_file, _name_with_ext(bam_file, ".bam"))
    for wig_file in _in_analysis("%s-*sort.bigwig"):
        yield (wig_file, _name_with_ext(wig_file, "-coverage.bigwig"))
    # upload any recalibrated BAM files used for SNP calling; plain
    # recalibrated files are only used when no realigned ones exist
    found_recal = False
    for bam_file in _in_analysis("%s-*gatkrecal-realign-sort.bam"):
        found_recal = True
        yield (bam_file, _name_with_ext(bam_file, "-gatkrecal-realign.bam"))
    if not found_recal:
        for bam_file in _in_analysis("%s-*gatkrecal.bam"):
            yield (bam_file, _name_with_ext(bam_file, "-gatkrecal.bam"))
    # Genotype files produced by SNP calling. Name each upload after the SNP
    # file being yielded; the BAM loop variable may be unbound (no BAM found)
    # or refer to an unrelated file.
    for snp_file in _in_analysis("%s-*snp-filter.vcf"):
        yield (snp_file, _name_with_ext(snp_file, "-snp-filter.vcf"))
    # Effect information on SNPs
    for snp_file in _in_analysis("%s-*snp-filter-effects.tsv"):
        yield (snp_file, _name_with_ext(snp_file, "-snp-effects.tsv"))
def select_upload_files(base, bc_id, fc_dir, analysis_dir):
    """Select fastq, bam alignment and summary files for upload to Galaxy.

    Yields ``(path, upload_name)`` tuples. With a barcode id, fastq files are
    taken from ``<analysis_dir>/<base>_barcode`` and every search uses the
    prefix ``<base>_<bc_id>``. Without one, the flowcell's fastq directory and
    the plain ``base`` prefix are used.
    """
    # if we have barcodes, update our search name and get local fastq files
    if bc_id:
        fastq_dir = os.path.join(analysis_dir, "%s_barcode" % base)
        base = "%s_%s" % (base, bc_id)
    # otherwise, use the original fastq files
    else:
        fastq_dir = get_fastq_dir(fc_dir)

    def _found(suffix):
        """Return analysis_dir files named ``<base>*<suffix>``."""
        return glob.glob(os.path.join(analysis_dir, "%s*%s" % (base, suffix)))

    # fastq, summary and alignment file
    for fname in glob.glob(os.path.join(fastq_dir, "%s*_fastq.txt" % base)):
        yield (fname, os.path.basename(fname))
    for summary_file in _found("-summary.pdf"):
        yield (summary_file, _name_with_ext(summary_file, "-summary.pdf"))
    for bam_file in _found("-sort-dup.bam"):
        yield (bam_file, _name_with_ext(bam_file, ".bam"))
    for wig_file in _found("-sort.bigwig"):
        yield (wig_file, _name_with_ext(wig_file, "-coverage.bigwig"))
    # upload any recalibrated BAM files used for SNP calling; plain
    # recalibrated files are only used when no realigned ones exist
    found_recal = False
    for bam_file in _found("-gatkrecal-realign-sort.bam"):
        found_recal = True
        yield (bam_file, _name_with_ext(bam_file, "-gatkrecal-realign.bam"))
    if not found_recal:
        for bam_file in _found("-gatkrecal.bam"):
            yield (bam_file, _name_with_ext(bam_file, "-gatkrecal.bam"))
    # Genotype files produced by SNP calling. Name each upload after the SNP
    # file being yielded; the BAM loop variable may be unbound (no BAM found)
    # or refer to an unrelated file.
    for snp_file in _found("-snp-filter.vcf"):
        yield (snp_file, _name_with_ext(snp_file, "-snp-filter.vcf"))
def _run_toplevel(config, config_file, work_dir, parallel, fc_dir=None, run_info_yaml=None):
    """Run toplevel analysis, processing a set of input files.

    config_file -- Main YAML configuration file with system parameters
    fc_dir -- Directory of fastq files to process
    run_info_yaml -- YAML configuration file specifying inputs to process

    Samples are grouped by pipeline and each pipeline runs inside a temporary
    directory that is also installed as ``tempfile.tempdir``. Each
    single-sample result is passed to ``upload.from_sample``.
    """
    parallel = log.create_base_logger(config, parallel)
    log.setup_local_logging(config, parallel)
    raw_fastq_dir = get_fastq_dir(fc_dir) if fc_dir else None
    fastq_dir, galaxy_dir, config_dir = _get_full_paths(raw_fastq_dir, config, config_file)
    # from here on, use the copy of the config file inside config_dir
    config_file = os.path.join(config_dir, os.path.basename(config_file))
    dirs = dict(fastq=fastq_dir, galaxy=galaxy_dir, work=work_dir,
                flowcell=fc_dir, config=config_dir)
    samples = run_info.organize(dirs, config, run_info_yaml)
    pipelines = _pair_lanes_with_pipelines(samples)
    # NOTE(review): results are collected here but no return statement is
    # visible for this function -- confirm whether callers expect a value.
    final = []
    with utils.curdir_tmpdir() as tmpdir:
        # route temporary files created via tempfile into this directory
        tempfile.tempdir = tmpdir
        for pipeline, pipeline_items in pipelines.items():
            pipeline_items = _add_provenance(pipeline_items, dirs, parallel, config)
            versioncheck.testall(pipeline_items)
            results = pipeline.run(config, config_file, parallel, dirs, pipeline_items)
            for xs in results:
                if len(xs) != 1:
                    continue
                upload.from_sample(xs[0])
                final.append(xs[0])
def select_upload_files(base, bc_id, fc_dir, analysis_dir, config, fname_out=None):
    """Select fastq, bam alignment and summary files for upload to Galaxy.

    Yields ``(path, upload_name)`` tuples. Fastq files are included unless
    ``config["algorithm"]["upload_fastq"]`` is false. Only the first BAM
    extension with matches is uploaded, in order of preference. The same
    applies to annotated variant files. ``fname_out``, when given, replaces
    the name derived from each file as the base of its upload name.
    """
    def _name_with_ext(orig_file, ext):
        """Return a normalized filename without internal processing names.

        Use the specific base out filename if specified, allowing
        configuration named output files.
        """
        if fname_out is None:
            base = os.path.basename(orig_file).split("-")[0]
        else:
            base = fname_out
        for extra in ["_trim"]:
            if base.endswith(extra):
                base = base[:-len(extra)]
        return "%s%s" % (base, ext)

    base_glob = _dir_glob(base, analysis_dir)
    # Configurable upload of fastq files -- BAM provide same information, compacted
    if config["algorithm"].get("upload_fastq", True):
        # look for fastq files in a barcode directory or the main fastq directory
        bc_base = base.rsplit("_", 1)[0] if bc_id else base
        bc_dir = os.path.join(analysis_dir, "%s_barcode" % bc_base)
        fastq_glob = "%s_*fastq.txt" % base
        found_fastq = False
        for fname in glob.glob(os.path.join(bc_dir, fastq_glob)):
            found_fastq = True
            yield (fname, os.path.basename(fname))
        if not found_fastq:
            fastq_dir = get_fastq_dir(fc_dir)
            for fname in glob.glob(os.path.join(fastq_dir, fastq_glob)):
                yield (fname, os.path.basename(fname))
    for summary_file in base_glob("summary.pdf"):
        yield (summary_file, _name_with_ext(summary_file, "-summary.pdf"))
    for wig_file in base_glob(".bigwig"):
        yield (wig_file, _name_with_ext(wig_file, "-coverage.bigwig"))
    # upload BAM files, preferring recalibrated and realigned files
    found_bam = False
    for orig_ext, new_ext in [("gatkrecal-realign-dup.bam", "-gatkrecal-realign.bam"),
                              ("gatkrecal-realign.bam", "-gatkrecal-realign.bam"),
                              ("gatkrecal.bam", "-gatkrecal.bam"),
                              ("sort-dup.bam", ".bam"),
                              ("sort.bam", ".bam")]:
        if not found_bam:
            for bam_file in base_glob(orig_ext):
                yield (bam_file, _name_with_ext(bam_file, new_ext))
                found_bam = True
    # Genotype files produced by SNP calling. Name each upload after the
    # variant file being yielded; the BAM loop variable may be unbound (no BAM
    # found) or refer to an unrelated file.
    found = False
    for orig_ext, new_ext in [("variants-combined-annotated.vcf", "-variants.vcf"),
                              ("variants-*-annotated.vcf", "-variants.vcf")]:
        if not found:
            for snp_file in base_glob(orig_ext):
                yield (snp_file, _name_with_ext(snp_file, new_ext))
                found = True
    # Effect information on SNPs
    for snp_file in base_glob("variants-*-effects.tsv"):
        yield (snp_file, _name_with_ext(snp_file, "-variants-effects.tsv"))
def _generate_fastq(fc_dir):
    """Create fastq files for the flowcell if needed; return the fastq directory.

    The converter is run only when the fastq directory differs from
    ``fc_dir`` and does not exist yet. It is run from the parent of the fastq
    directory.
    """
    fc_name, fc_date = get_flowcell_info(fc_dir)
    short_fc_name = "%s_%s" % (fc_date, fc_name)
    fastq_dir = get_fastq_dir(fc_dir)
    if fastq_dir == fc_dir or os.path.exists(fastq_dir):
        return fastq_dir

    with utils.chdir(os.path.dirname(fastq_dir)):
        # unique lane ids, taken from the second "_" field of each qseq name
        lane_ids = set()
        for qseq_name in glob.glob("*qseq.txt"):
            lane_ids.add(qseq_name.split("_")[1])
        cl = ["solexa_qseq_to_fastq.py", short_fc_name, ",".join(sorted(lane_ids))]
        subprocess.check_call(cl)
    return fastq_dir
def run_main(config, config_file, work_dir, parallel, fc_dir=None, run_info_yaml=None):
    """Run toplevel analysis, processing a set of input files.

    config_file -- Main YAML configuration file with system parameters
    fc_dir -- Directory of fastq files to process
    run_info_yaml -- YAML configuration file specifying inputs to process
    """
    setup_logging(config)
    fc_name, fc_date, run_info = get_run_info(fc_dir, config, run_info_yaml)
    raw_fastq_dir = get_fastq_dir(fc_dir) if fc_dir else None
    fastq_dir, galaxy_dir, config_dir = _get_full_paths(raw_fastq_dir, config, config_file)
    # from here on, use the copy of the config file inside config_dir
    config_file = os.path.join(config_dir, os.path.basename(config_file))
    dirs = dict(fastq=fastq_dir, galaxy=galaxy_dir, work=work_dir,
                flowcell=fc_dir, config=config_dir)
    config = _set_resources(parallel, config)
    run_parallel = parallel_runner(parallel, dirs, config, config_file)

    # process each flowcell lane
    run_items = add_multiplex_across_lanes(run_info["details"], dirs["fastq"], fc_name)
    lane_args = ((item, fc_name, fc_date, dirs, config) for item in run_items)
    lane_items = run_parallel("process_lane", lane_args)
    align_items = run_parallel("process_alignment", lane_items)

    # process samples, potentially multiplexed across multiple lanes
    samples = organize_samples(align_items, dirs, config_file)
    for stage in ("merge_sample", "prep_recal"):
        samples = run_parallel(stage, samples)
    samples = recalibrate.parallel_write_recal_bam(samples, run_parallel)
    samples = parallel_realign_sample(samples, run_parallel)
    samples = parallel_variantcall(samples, run_parallel)
    samples = run_parallel("postprocess_variants", samples)
    samples = combine_multiple_callers(samples)
    for stage in ("detect_sv", "combine_calls"):
        samples = run_parallel(stage, samples)

    # the results of these two stages are not kept
    run_parallel("process_sample", samples)
    run_parallel("generate_bigwig", samples, {"programs": ["ucsc_bigwig"]})

    write_project_summary(samples)
    write_metrics(run_info, fc_name, fc_date, dirs)
def _normalize_files(item, fc_dir):
    """Ensure the files argument is a list of absolute file names.

    Handles BAM, single and paired end fastq.

    A single file name is wrapped in a list. Relative names are resolved
    against the flowcell's fastq directory when ``fc_dir`` is given, and
    against the current working directory otherwise, so the result is
    absolute in both cases. ``item`` is updated in place and returned; items
    without files are returned untouched.
    """
    files = item.get("files")
    if files:
        if isinstance(files, basestring):
            files = [files]
        # Without a flowcell directory, fall back to the current directory so
        # relative names do not slip through unresolved.
        base_dir = get_fastq_dir(fc_dir) if fc_dir else os.getcwd()
        files = [x if os.path.isabs(x) else os.path.normpath(os.path.join(base_dir, x))
                 for x in files]
        item["files"] = files
    return item
def _normalize_files(item, fc_dir):
    """Turn ``item["files"]`` into a list of absolute file names.

    Handles BAM, single and paired end fastq. A lone file name is wrapped in
    a list. Relative names are resolved against the flowcell's fastq
    directory when ``fc_dir`` is given, else against the current working
    directory. ``item`` is updated in place and returned; items without files
    are returned untouched.
    """
    files = item.get("files")
    if not files:
        return item
    if isinstance(files, basestring):
        files = [files]
    base_dir = get_fastq_dir(fc_dir) if fc_dir else os.getcwd()
    resolved = []
    for fname in files:
        if not os.path.isabs(fname):
            fname = os.path.normpath(os.path.join(base_dir, fname))
        resolved.append(fname)
    item["files"] = resolved
    return item
def _generate_fastq(fc_dir, config):
    """Create fastq files for the flowcell if needed; return the fastq directory.

    The converter is run only when the fastq directory differs from
    ``fc_dir`` and does not exist yet. It is run from the parent of the fastq
    directory, and progress is logged around the call.
    """
    fc_name, fc_date = get_flowcell_info(fc_dir)
    short_fc_name = "%s_%s" % (fc_date, fc_name)
    fastq_dir = get_fastq_dir(fc_dir)
    basecall_dir = os.path.dirname(fastq_dir)
    if fastq_dir == fc_dir or os.path.exists(fastq_dir):
        return fastq_dir

    log.info("Generating fastq files for %s" % fc_dir)
    with utils.chdir(basecall_dir):
        # unique lane ids, taken from the second "_" field of each qseq name
        lane_ids = set()
        for qseq_name in glob.glob("*qseq.txt"):
            lane_ids.add(qseq_name.split("_")[1])
        cl = ["solexa_qseq_to_fastq.py", short_fc_name, ",".join(sorted(lane_ids))]
        log.info("Converting qseq to fastq on all lanes.")
        subprocess.check_call(cl)
        log.info("Qseq to fastq conversion completed.")
    return fastq_dir
def _generate_fastq(fc_dir, config):
    """Return the flowcell's fastq directory, converting qseq files if required.

    Conversion happens only when the fastq directory is not ``fc_dir`` itself
    and is not already present on disk. The converter runs from the fastq
    directory's parent, with log messages before and after.
    """
    fc_name, fc_date = get_flowcell_info(fc_dir)
    short_fc_name = "%s_%s" % (fc_date, fc_name)
    fastq_dir = get_fastq_dir(fc_dir)
    basecall_dir = os.path.dirname(fastq_dir)

    needs_conversion = fastq_dir != fc_dir and not os.path.exists(fastq_dir)
    if needs_conversion:
        log.info("Generating fastq files for %s" % fc_dir)
        with utils.chdir(basecall_dir):
            # lane id is the second "_"-separated field of each qseq file name
            lane_ids = sorted(set(name.split("_")[1] for name in glob.glob("*qseq.txt")))
            cl = ["solexa_qseq_to_fastq.py", short_fc_name, ",".join(lane_ids)]
            log.info("Converting qseq to fastq on all lanes.")
            subprocess.check_call(cl)
            log.info("Qseq to fastq conversion completed.")
    return fastq_dir
def main(config_file, fc_dir):
    """Process every lane of a flowcell.

    config_file -- Configuration file; must provide the Galaxy URL and API
                   key plus ``algorithm.num_cores``
    fc_dir -- Flowcell directory to process

    Lanes are processed in a multiprocessing pool when ``num_cores`` is
    greater than one, otherwise one after another in this process.
    """
    config = load_config(config_file)
    galaxy_api = GalaxyApiAccess(config['galaxy_url'], config['galaxy_api_key'])
    fc_name, fc_date = get_flowcell_info(fc_dir)
    run_info = galaxy_api.run_details(fc_name)
    fastq_dir = get_fastq_dir(fc_dir)

    lane_args = ((i, fastq_dir, fc_name, fc_date, config, config_file)
                 for i in run_info["details"])
    num_cores = config["algorithm"]["num_cores"]
    if num_cores > 1:
        pool = Pool(num_cores)
        try:
            pool.map(_process_wrapper, lane_args)
        except BaseException:
            # also tear workers down on KeyboardInterrupt, then re-raise
            pool.terminate()
            raise
        else:
            # release the worker processes once all lanes are done
            pool.close()
            pool.join()
    else:
        # Explicit loop: the work is a side effect, and map() is lazy on
        # Python 3, so nothing would run there.
        for args in lane_args:
            _process_wrapper(args)
def main(config_file, fc_dir, run_info_yaml=None):
    """Process the lanes and then the samples of a flowcell; write metrics.

    config_file -- YAML configuration file; must provide
                   ``algorithm.num_cores`` and, when no ``run_info_yaml`` is
                   given, the Galaxy URL and API key
    fc_dir -- Flowcell directory to process
    run_info_yaml -- Optional YAML file with run details, used instead of
                     asking Galaxy

    Work runs in a multiprocessing pool when ``num_cores`` is greater than
    one, otherwise serially in this process.
    """
    work_dir = os.getcwd()
    with open(config_file) as in_handle:
        # NOTE(review): yaml.load can construct arbitrary objects; confirm the
        # configuration and run info files always come from a trusted source.
        config = yaml.load(in_handle)
    # The flowcell name must be known before the Galaxy lookup below uses it.
    fc_name, fc_date = get_flowcell_info(fc_dir)
    if run_info_yaml:
        with open(run_info_yaml) as in_handle:
            run_details = yaml.load(in_handle)
        run_info = dict(details=run_details, run_id="")
    else:
        galaxy_api = GalaxyApiAccess(config['galaxy_url'], config['galaxy_api_key'])
        run_info = galaxy_api.run_details(fc_name)
    run_items = _add_multiplex_to_control(run_info["details"])
    fastq_dir = get_fastq_dir(fc_dir)
    align_dir = os.path.join(work_dir, "alignments")

    num_cores = config["algorithm"]["num_cores"]
    pool = Pool(num_cores) if num_cores > 1 else None

    def _run_all(fn, all_args):
        """Apply fn to every argument tuple, in the pool when there is one."""
        if pool:
            pool.map(fn, all_args)
        else:
            # Explicit loop: map() is lazy on Python 3 and would run nothing.
            for args in all_args:
                fn(args)

    # process each flowcell lane
    try:
        _run_all(_process_lane_wrapper,
                 ((i, fastq_dir, fc_name, fc_date, align_dir, config, config_file)
                  for i in run_items))
    except BaseException:
        # also tear workers down on KeyboardInterrupt, then re-raise
        if pool:
            pool.terminate()
        raise
    # process samples, potentially multiplexed across multiple lanes
    sample_files, sample_fastq, sample_info = organize_samples(
        align_dir, fastq_dir, work_dir, fc_name, fc_date, run_items)
    try:
        _run_all(_process_sample_wrapper,
                 ((name, sample_fastq[name], sample_info[name], bam_files,
                   work_dir, config, config_file)
                  for name, bam_files in sample_files))
    except BaseException:
        if pool:
            pool.terminate()
        raise
    if pool:
        # release the worker processes once all work is done
        pool.close()
        pool.join()
    write_metrics(run_info, work_dir, fc_dir, fc_name, fc_date, fastq_dir)
def run_main(config, config_file, work_dir, parallel, fc_dir=None, run_info_yaml=None):
    """Run toplevel analysis, processing a set of input files.

    config_file -- Main YAML configuration file with system parameters
    fc_dir -- Directory of fastq files to process
    run_info_yaml -- YAML configuration file specifying inputs to process

    Lanes are processed first and grouped by pipeline. Provenance is added to
    each group before its pipeline runs, and each single-sample result is
    handed to ``upload.from_sample``.
    """
    setup_logging(config)
    fc_name, fc_date, run_info = get_run_info(fc_dir, config, run_info_yaml)
    raw_fastq_dir = get_fastq_dir(fc_dir) if fc_dir else None
    fastq_dir, galaxy_dir, config_dir = _get_full_paths(raw_fastq_dir, config, config_file)
    # from here on, use the copy of the config file inside config_dir
    config_file = os.path.join(config_dir, os.path.basename(config_file))
    dirs = dict(fastq=fastq_dir, galaxy=galaxy_dir, work=work_dir,
                flowcell=fc_dir, config=config_dir)
    run_parallel = parallel_runner(parallel, dirs, config, config_file)

    # process each flowcell lane
    run_items = add_multiplex_across_lanes(run_info["details"], dirs["fastq"], fc_name)
    lane_args = ((item, fc_name, fc_date, dirs, config) for item in run_items)
    lane_items = lane.process_all_lanes(lane_args, run_parallel)

    pipelines = _pair_lanes_with_pipelines(lane_items)
    for pipeline, pipeline_items in pipelines.items():
        pipeline_items = _add_provenance(pipeline_items, dirs, config)
        results = pipeline.run(config, config_file, run_parallel, dirs, pipeline_items)
        for xs in results:
            # each pipeline result is expected to wrap exactly one sample
            assert len(xs) == 1
            upload.from_sample(xs[0])
    qcsummary.write_metrics(run_info, fc_name, fc_date, dirs)
def select_upload_files(base, bc_id, fc_dir, analysis_dir, config):
    """Select fastq, bam alignment and summary files for upload to Galaxy.

    Yields ``(path, upload_name)`` tuples. Fastq files are included unless
    ``config["algorithm"]["upload_fastq"]`` is false. For BAM files,
    recalibrated and realigned files are preferred, then recalibrated ones,
    then sorted de-duplicated ones.
    """
    base_glob = _dir_glob(base, analysis_dir)
    # Configurable upload of fastq files -- BAM provide same information, compacted
    if config["algorithm"].get("upload_fastq", True):
        # look for fastq files in a barcode directory or the main fastq directory
        bc_base = base.rsplit("_", 1)[0] if bc_id else base
        bc_dir = os.path.join(analysis_dir, "%s_barcode" % bc_base)
        fastq_glob = "%s_*fastq.txt" % base
        found_fastq = False
        for fname in glob.glob(os.path.join(bc_dir, fastq_glob)):
            found_fastq = True
            yield (fname, os.path.basename(fname))
        if not found_fastq:
            fastq_dir = get_fastq_dir(fc_dir)
            for fname in glob.glob(os.path.join(fastq_dir, fastq_glob)):
                yield (fname, os.path.basename(fname))
    for summary_file in base_glob("summary.pdf"):
        yield (summary_file, _name_with_ext(summary_file, "-summary.pdf"))
    for wig_file in base_glob(".bigwig"):
        yield (wig_file, _name_with_ext(wig_file, "-coverage.bigwig"))
    # upload BAM files, preferring recalibrated and realigned files
    found_recal = False
    for bam_file in base_glob("gatkrecal-realign-dup.bam"):
        found_recal = True
        yield (bam_file, _name_with_ext(bam_file, "-gatkrecal-realign.bam"))
    if not found_recal:
        for bam_file in base_glob("gatkrecal.bam"):
            found_recal = True
            yield (bam_file, _name_with_ext(bam_file, "-gatkrecal.bam"))
    if not found_recal:
        for bam_file in base_glob("sort-dup.bam"):
            yield (bam_file, _name_with_ext(bam_file, ".bam"))
    # Genotype files produced by SNP calling. Name each upload after the
    # variant file being yielded; the BAM loop variable may be unbound (no BAM
    # found) or refer to an unrelated file.
    for snp_file in base_glob("variants-combined-annotated.vcf"):
        yield (snp_file, _name_with_ext(snp_file, "-variants.vcf"))
    # Effect information on SNPs
    for snp_file in base_glob("variants-combined-effects.tsv"):
        yield (snp_file, _name_with_ext(snp_file, "-variants-effects.tsv"))
def select_upload_files(base, bc_id, fc_dir, analysis_dir, config):
    """Select fastq, bam alignment and summary files for upload to Galaxy.

    base -- Filename prefix used to glob for fastq and analysis files
    bc_id -- When truthy, the trailing "_<suffix>" of base is dropped to
             locate the "<prefix>_barcode" directory
    fc_dir -- Flowcell directory; its fastq directory is the fallback
              location for fastq files
    analysis_dir -- Directory searched for analysis outputs
    config -- Configuration; config["algorithm"]["upload_fastq"] (default
              True) toggles fastq upload

    Yields (path, upload_name) tuples.
    """
    base_glob = _dir_glob(base, analysis_dir)
    # Configurable upload of fastq files -- BAM provide same information, compacted
    if config["algorithm"].get("upload_fastq", True):
        # look for fastq files in a barcode directory or the main fastq directory
        bc_base = base.rsplit("_", 1)[0] if bc_id else base
        bc_dir = os.path.join(analysis_dir, "%s_barcode" % bc_base)
        fastq_glob = "%s_*fastq.txt" % base
        found_fastq = False
        for fname in glob.glob(os.path.join(bc_dir, fastq_glob)):
            found_fastq = True
            yield (fname, os.path.basename(fname))
        if not found_fastq:
            fastq_dir = get_fastq_dir(fc_dir)
            for fname in glob.glob(os.path.join(fastq_dir, fastq_glob)):
                yield (fname, os.path.basename(fname))
    for summary_file in base_glob("summary.pdf"):
        yield (summary_file, _name_with_ext(summary_file, "-summary.pdf"))
    for bam_file in base_glob("sort-dup.bam"):
        yield (bam_file, _name_with_ext(bam_file, ".bam"))
    for wig_file in base_glob("sort.bigwig"):
        yield (wig_file, _name_with_ext(wig_file, "-coverage.bigwig"))
    # upload any recalibrated BAM files used for SNP calling
    found_recal = False
    for bam_file in base_glob("gatkrecal-realign-sort.bam"):
        found_recal = True
        yield (bam_file, _name_with_ext(bam_file, "-gatkrecal-realign.bam"))
    if not found_recal:
        for bam_file in base_glob("gatkrecal.bam"):
            yield (bam_file, _name_with_ext(bam_file, "-gatkrecal.bam"))
    # Genotype files produced by SNP calling.
    # The upload name is derived from the variant file itself: relying on the
    # loop variable left over from the BAM loops fails with an unbound local
    # when no BAM file matched.
    for snp_file in base_glob("snp-filter.vcf"):
        yield (snp_file, _name_with_ext(snp_file, "-snp-filter.vcf"))
    # Effect information on SNPs
    for snp_file in base_glob("snp-filter-effects.tsv"):
        yield (snp_file, _name_with_ext(snp_file, "-snp-effects.tsv"))
def run_main(config, config_file, work_dir, parallel,
             fc_dir=None, run_info_yaml=None):
    """Run the toplevel analysis as a fixed chain of parallel stages.

    config -- Configuration; replaced by the result of _set_resources
              before any stage is run
    config_file -- Main YAML configuration file with system parameters
    work_dir -- Stored as the "work" entry of the directory map
    parallel -- Passed to _set_resources and parallel_runner
    fc_dir -- Directory of fastq files to process (optional)
    run_info_yaml -- YAML configuration file specifying inputs to process
    """
    setup_logging(config)
    fc_name, fc_date, run_info = get_run_info(fc_dir, config, run_info_yaml)

    # Without a flowcell directory there is no fastq directory to resolve.
    flowcell_fastq = get_fastq_dir(fc_dir) if fc_dir else None
    fastq_dir, galaxy_dir, config_dir = _get_full_paths(flowcell_fastq,
                                                        config, config_file)
    config_file = os.path.join(config_dir, os.path.basename(config_file))
    dirs = dict(fastq=fastq_dir, galaxy=galaxy_dir, work=work_dir,
                flowcell=fc_dir, config=config_dir)
    config = _set_resources(parallel, config)
    run_parallel = parallel_runner(parallel, dirs, config, config_file)

    # process each flowcell lane
    run_items = add_multiplex_across_lanes(run_info["details"],
                                           dirs["fastq"], fc_name)
    lane_args = ((item, fc_name, fc_date, dirs, config) for item in run_items)
    lane_items = run_parallel("process_lane", lane_args)
    align_items = run_parallel("process_alignment", lane_items)

    # process samples, potentially multiplexed across multiple lanes
    samples = organize_samples(align_items, dirs, config_file)
    for stage in ("merge_sample", "prep_recal"):
        samples = run_parallel(stage, samples)
    samples = recalibrate.parallel_write_recal_bam(samples, run_parallel)
    samples = parallel_realign_sample(samples, run_parallel)
    samples = parallel_variantcall(samples, run_parallel)
    samples = run_parallel("postprocess_variants", samples)
    samples = combine_multiple_callers(samples)
    for stage in ("detect_sv", "combine_calls"):
        samples = run_parallel(stage, samples)

    # The results of the last two stages are intentionally not fed back
    # into samples.
    run_parallel("process_sample", samples)
    run_parallel("generate_bigwig", samples, {"programs": ["ucsc_bigwig"]})
    write_project_summary(samples)
    write_metrics(run_info, fc_name, fc_date, dirs)
def run_main(config, config_file, fc_dir, run_info_yaml):
    """Process every lane, then every sample, of one flowcell.

    config -- Configuration; supplies the Galaxy URL/API key and
              config["algorithm"]["num_cores"]
    config_file -- Path forwarded to each lane and sample worker
    fc_dir -- Flowcell directory
    run_info_yaml -- Optional YAML samplesheet; when it exists it replaces
                     the Galaxy API lookup

    Results are written relative to the current working directory.
    """
    work_dir = os.getcwd()
    fc_name, fc_date = get_flowcell_info(fc_dir)

    have_samplesheet = run_info_yaml and os.path.exists(run_info_yaml)
    if not have_samplesheet:
        log.info("Fetching run details from Galaxy instance")
        galaxy_api = GalaxyApiAccess(config['galaxy_url'], config['galaxy_api_key'])
        run_info = galaxy_api.run_details(fc_name, fc_date)
    else:
        log.info("Found YAML samplesheet, using %s instead of Galaxy API" % run_info_yaml)
        with open(run_info_yaml) as in_handle:
            # NOTE(review): yaml.load can construct arbitrary objects;
            # consider yaml.safe_load if samplesheets may be untrusted.
            run_details = yaml.load(in_handle)
        run_info = dict(details=run_details, run_id="")

    fastq_dir = get_fastq_dir(fc_dir)
    run_items = _add_multiplex_across_lanes(run_info["details"], fastq_dir, fc_name)
    align_dir = os.path.join(work_dir, "alignments")
    num_cores = config["algorithm"]["num_cores"]

    # process each flowcell lane
    with utils.cpmap(num_cores) as cpmap:
        lane_args = ((item, fastq_dir, fc_name, fc_date, align_dir,
                      config, config_file)
                     for item in run_items)
        # iterate only to drive the workers; results are not used
        for _ in cpmap(process_lane, lane_args):
            pass

    # process samples, potentially multiplexed across multiple lanes
    sample_files, sample_fastq, sample_info = organize_samples(
        align_dir, fastq_dir, work_dir, fc_name, fc_date, run_items)
    with utils.cpmap(config["algorithm"]["num_cores"]) as cpmap:
        sample_args = ((name, sample_fastq[name], sample_info[name],
                        bam_files, work_dir, config, config_file)
                       for name, bam_files in sample_files)
        for _ in cpmap(process_sample, sample_args):
            pass
    write_metrics(run_info, work_dir, fc_dir, fc_name, fc_date, fastq_dir)
def run_main(config, config_file, fc_dir, work_dir, run_info_yaml):
    """Run the distributed analysis, recording progress after each stage.

    config -- Configuration; config["algorithm"]["compress_files"]
              (default True) enables the final compression step
    config_file -- Main configuration file, re-rooted under the resolved
                   config directory
    fc_dir -- Flowcell directory
    work_dir -- Working directory; also receives the software version file
    run_info_yaml -- YAML file describing the inputs to process

    Stage outputs are registered through _add_to_compress and compressed
    at the end of the run.
    """
    _record_sw_versions(config, os.path.join(work_dir, "bcbb_software_versions.txt"))
    prog = RecordProgress(work_dir)
    to_compress = set()
    prog.progress("analysis_start")

    def _stage_done(stage, items=None, label=None):
        """Register stage outputs for compression (if any), then log progress."""
        if label is not None:
            _add_to_compress(to_compress, items, label)
        prog.dummy()
        prog.progress(stage)

    align_dir = os.path.join(work_dir, "alignments")
    run_module = "bcbio.distributed"
    fc_name, fc_date, run_info = get_run_info(fc_dir, config, run_info_yaml)
    fastq_dir, galaxy_dir, config_dir = _get_full_paths(get_fastq_dir(fc_dir),
                                                        config, config_file)
    config_file = os.path.join(config_dir, os.path.basename(config_file))
    dirs = dict(fastq=fastq_dir, galaxy=galaxy_dir, align=align_dir,
                work=work_dir, flowcell=fc_dir, config=config_dir)
    run_parallel = parallel_runner(run_module, dirs, config, config_file)

    run_items = add_multiplex_across_lanes(run_info["details"], dirs["fastq"], fc_name)
    lane_args = ((item, fc_name, fc_date, dirs, config) for item in run_items)
    lane_items = run_parallel("process_lane", lane_args)
    _stage_done("process_lane", lane_items, 'lane_items')

    # Remove spiked in controls, contaminants etc.
    lane_items = run_parallel("remove_contaminants", lane_items)
    _stage_done("remove_contaminants", lane_items, 'lane_items')

    align_items = run_parallel("process_alignment", lane_items)
    _stage_done("process_alignment", align_items, 'align_items')

    # process samples, potentially multiplexed across multiple lanes
    samples = organize_samples(align_items, dirs, config_file)
    samples = run_parallel("merge_sample", samples)
    _stage_done("merge_sample", samples, 'samples')

    samples = run_parallel("mark_duplicates_sample", samples)
    _stage_done("mark_duplicates_sample", samples, 'samples')

    # screening neither replaces samples nor registers files to compress
    run_parallel("screen_sample_contaminants", samples)
    _stage_done("screen_sample_contaminants")

    samples = run_parallel("recalibrate_sample", samples)
    _stage_done("recalibrate_sample", samples, 'samples')

    samples = parallel_realign_sample(samples, run_parallel)
    _stage_done("realign_sample", samples, 'samples')

    samples = parallel_variantcall(samples, run_parallel)
    _stage_done("variantcall", samples, 'samples')

    samples = run_parallel("detect_sv", samples)
    _stage_done("detect_sv", samples, 'samples')

    samples = run_parallel("process_sample", samples)
    _stage_done("process_sample", samples, 'samples')

    samples = run_parallel("generate_bigwig", samples, {"programs": ["ucsc_bigwig"]})
    _stage_done("generate_bigwig", samples, 'samples')

    write_project_summary(samples)
    write_metrics(run_info, fc_name, fc_date, dirs)
    _stage_done("write_metrics")

    # Compress all files in to_compress
    if config['algorithm'].get('compress_files', True):
        sizes = run_parallel("compress_files", [[[cf]] for cf in to_compress])
        # each entry is read as (size before, size after)
        before = sum([pair[0] for pair in sizes])
        after = sum([pair[1] for pair in sizes])
        logger.info("Space used by the files before compressing (in bytes): "
                    + str(before))
        logger.info("Space used by the files after compressing (in bytes): "
                    + str(after))
        logger.info("Saved space (in bytes): " + str(before - after))
def run_main(config, config_file, fc_dir, work_dir, run_info_yaml):
    """Run the distributed analysis, recording progress after each stage.

    config -- Configuration; config["algorithm"]["compress_files"]
              (default True) enables the final compression step
    config_file -- Main configuration file, re-rooted under the resolved
                   config directory
    fc_dir -- Flowcell directory
    work_dir -- Working directory; also receives the software version file
    run_info_yaml -- YAML file describing the inputs to process

    Only files taken from the first entry of each early stage result are
    collected for compression.
    """
    _record_sw_versions(config, os.path.join(work_dir, "bcbb_software_versions.txt"))
    prog = utils.RecordProgress(work_dir)
    to_compress = set()
    prog.progress("analysis_start")

    align_dir = os.path.join(work_dir, "alignments")
    run_module = "bcbio.distributed"
    fc_name, fc_date, run_info = get_run_info(fc_dir, config, run_info_yaml)
    fastq_dir, galaxy_dir, config_dir = _get_full_paths(get_fastq_dir(fc_dir),
                                                        config, config_file)
    config_file = os.path.join(config_dir, os.path.basename(config_file))
    dirs = dict(fastq=fastq_dir, galaxy=galaxy_dir, align=align_dir,
                work=work_dir, flowcell=fc_dir, config=config_dir)
    run_parallel = parallel_runner(run_module, dirs, config, config_file)

    run_items = add_multiplex_across_lanes(run_info["details"], dirs["fastq"], fc_name)
    lane_args = ((item, fc_name, fc_date, dirs, config) for item in run_items)
    lane_items = run_parallel("process_lane", lane_args)
    to_compress.update(lane_items[0][0:2])
    prog.progress("process_lane")

    # NOTE: uploading the sequencing report to Google Docs is skipped here;
    # an external mechanism is relied on for uploading this data.

    # Remove spiked in controls, contaminants etc.
    lane_items = run_parallel("remove_contaminants", lane_items)
    to_compress.update(lane_items[0][0:2])
    prog.progress("remove_contaminants")

    align_items = run_parallel("process_alignment", lane_items)
    to_compress.update(align_items[0]['fastq'])
    prog.progress("process_alignment")

    # process samples, potentially multiplexed across multiple lanes
    samples = organize_samples(align_items, dirs, config_file)
    for stage in ("merge_sample", "mark_duplicates_sample"):
        samples = run_parallel(stage, samples)
        first_sample = samples[0][0]
        to_compress.add(first_sample['fastq1'])
        to_compress.add(first_sample['fastq2'])
        prog.progress(stage)

    # screening does not replace samples
    run_parallel("screen_sample_contaminants", samples)
    prog.progress("screen_sample_contaminants")

    samples = run_parallel("recalibrate_sample", samples)
    prog.progress("recalibrate_sample")

    samples = parallel_realign_sample(samples, run_parallel)
    prog.progress("realign_sample")

    samples = parallel_variantcall(samples, run_parallel)
    prog.progress("variantcall")

    for stage in ("detect_sv", "process_sample"):
        samples = run_parallel(stage, samples)
        prog.progress(stage)

    samples = run_parallel("generate_bigwig", samples, {"programs": ["ucsc_bigwig"]})
    prog.progress("generate_bigwig")

    write_project_summary(samples)
    write_metrics(run_info, fc_name, fc_date, dirs)
    prog.progress("write_metrics")

    # NOTE: writing statusdb metrics is skipped here; an external mechanism
    # is relied on for uploading this data.

    # Compress all files in to_compress
    if config['algorithm'].get('compress_files', True):
        before, after = utils.compress_files(to_compress)
        logger.info("Space used by the files before compressing (in bytes): "
                    + str(before))
        logger.info("Space used by the files after compressing (in bytes): "
                    + str(after))
        logger.info("Saved space (in bytes): " + str(before - after))
def select_upload_files(base, bc_id, fc_dir, analysis_dir, config, fname_out=None):
    """Select fastq, bam alignment and summary files for upload to Galaxy.

    base -- Filename prefix used to glob for fastq and analysis files
    bc_id -- When truthy, the trailing "_<suffix>" of base is dropped to
             locate the "<prefix>_barcode" directory
    fc_dir -- Flowcell directory; its fastq directory is the fallback
              location for fastq files
    analysis_dir -- Directory searched for analysis outputs
    config -- Configuration; config["algorithm"]["upload_fastq"] (default
              True) toggles fastq upload
    fname_out -- Optional fixed base name for the uploaded analysis files

    Yields (path, upload_name) tuples. Only the first BAM extension (and
    the first VCF extension) with at least one match is uploaded.
    """
    def _name_with_ext(orig_file, ext):
        """Return a normalized filename without internal processing names.

        Use specific base out filename if specific, allowing configuration
        named output files.
        """
        if fname_out is None:
            base = os.path.basename(orig_file).split("-")[0]
        else:
            base = fname_out
        for extra in ["_trim"]:
            if base.endswith(extra):
                base = base[:-len(extra)]
        return "%s%s" % (base, ext)

    base_glob = _dir_glob(base, analysis_dir)
    # Configurable upload of fastq files -- BAM provide same information, compacted
    if config["algorithm"].get("upload_fastq", True):
        # look for fastq files in a barcode directory or the main fastq directory
        bc_base = base.rsplit("_", 1)[0] if bc_id else base
        bc_dir = os.path.join(analysis_dir, "%s_barcode" % bc_base)
        fastq_glob = "%s_*fastq.txt" % base
        found_fastq = False
        for fname in glob.glob(os.path.join(bc_dir, fastq_glob)):
            found_fastq = True
            yield (fname, os.path.basename(fname))
        if not found_fastq:
            fastq_dir = get_fastq_dir(fc_dir)
            for fname in glob.glob(os.path.join(fastq_dir, fastq_glob)):
                yield (fname, os.path.basename(fname))
    for summary_file in base_glob("summary.pdf"):
        yield (summary_file, _name_with_ext(summary_file, "-summary.pdf"))
    for wig_file in base_glob(".bigwig"):
        yield (wig_file, _name_with_ext(wig_file, "-coverage.bigwig"))
    # upload BAM files, preferring recalibrated and realigned files
    found_bam = False
    for orig_ext, new_ext in [("gatkrecal-realign-dup.bam", "-gatkrecal-realign.bam"),
                              ("gatkrecal-realign.bam", "-gatkrecal-realign.bam"),
                              ("gatkrecal.bam", "-gatkrecal.bam"),
                              ("sort-dup.bam", ".bam"),
                              ("sort.bam", ".bam")]:
        if not found_bam:
            for bam_file in base_glob(orig_ext):
                yield (bam_file, _name_with_ext(bam_file, new_ext))
                found_bam = True
    # Genotype files produced by SNP calling.
    # The upload name is derived from the variant file itself: relying on the
    # loop variable left over from the BAM loop fails with an unbound local
    # when no BAM file matched.
    found = False
    for orig_ext, new_ext in [("variants-combined-annotated.vcf", "-variants.vcf"),
                              ("variants-*-annotated.vcf", "-variants.vcf")]:
        if not found:
            for snp_file in base_glob(orig_ext):
                yield (snp_file, _name_with_ext(snp_file, new_ext))
                found = True
    # Effect information on SNPs
    for snp_file in base_glob("variants-*-effects.tsv"):
        yield (snp_file, _name_with_ext(snp_file, "-variants-effects.tsv"))