def main(config_file, fc_dir, run_info_yaml=None):
    config = load_config(config_file)
    work_dir = os.getcwd()
    if config.get("log_dir", None) is None:
        config["log_dir"] = os.path.join(work_dir, "log")
    setup_logging(config)
    run_main(config, config_file, fc_dir, work_dir, run_info_yaml)
def run_main(config, config_file, work_dir, parallel, fc_dir=None, run_info_yaml=None):
    """Run toplevel analysis, processing a set of input files.

    config_file -- Main YAML configuration file with system parameters
    fc_dir -- Directory of fastq files to process
    run_info_yaml -- YAML configuration file specifying inputs to process
    """
    setup_logging(config)
    align_dir = os.path.join(work_dir, "alignments")
    fc_name, fc_date, run_info = get_run_info(fc_dir, config, run_info_yaml)
    fastq_dir, galaxy_dir, config_dir = _get_full_paths(get_fastq_dir(fc_dir) if fc_dir else None,
                                                        config, config_file)
    config_file = os.path.join(config_dir, os.path.basename(config_file))
    dirs = {"fastq": fastq_dir, "galaxy": galaxy_dir, "align": align_dir,
            "work": work_dir, "flowcell": fc_dir, "config": config_dir}
    config = _set_resources(parallel, config)
    run_parallel = parallel_runner(parallel, dirs, config, config_file)

    ## process each flowcell lane
    #run_items = add_multiplex_across_lanes(run_info["details"], dirs["fastq"], fc_name)
    #lanes = ((info, fc_name, fc_date, dirs, config) for info in run_items)
    #lane_items = run_parallel("process_lane", lanes)
    logger.info(">>> Parse lane")
    lane_items = parse_lane(run_info["details"], fc_name, fc_date, dirs, config)
    #for item in lane_items:
    #    utils.prettyprint_dict(item)
    logger.info(">>> Process alignment")
    align_items = run_parallel("process_alignment", lane_items)

    ## process samples, potentially multiplexed across multiple lanes
    samples = organize_samples(align_items, dirs, config_file)
    logger.info(">>> Merge samples")
    samples = run_parallel("merge_sample", samples)
    logger.info(">>> Recalibrate samples")
    samples = run_parallel("recalibrate_sample", samples)
    logger.info(">>> Realign samples")
    samples = parallel_realign_sample(samples, run_parallel)
    logger.info(">>> Variant calling")
    samples = parallel_variantcall(samples, run_parallel)
    logger.info(">>> Postprocess variants")
    samples = run_parallel("postprocess_variants", samples)
    logger.info(">>> Combine multiple callers")
    samples = combine_multiple_callers(samples)
    logger.info(">>> Detect structural variants")
    samples = run_parallel("detect_sv", samples)
    logger.info(">>> Combine calls")
    samples = run_parallel("combine_calls", samples)
    logger.info(">>> Process samples")
    run_parallel("process_sample", samples)
    logger.info(">>> Generate bigwig")
    run_parallel("generate_bigwig", samples, {"programs": ["ucsc_bigwig"]})
    logger.info(">>> Writing project summary")
    write_project_summary(samples)
    logger.info(">>> Writing metrics")
    write_metrics(run_info, fc_name, fc_date, dirs)
    logger.info(">>> Done")
def run_main(config, config_file, work_dir, parallel, fc_dir=None, run_info_yaml=None):
    """Run toplevel analysis, processing a set of input files.

    config_file -- Main YAML configuration file with system parameters
    fc_dir -- Directory of fastq files to process
    run_info_yaml -- YAML configuration file specifying inputs to process
    """
    setup_logging(config)
    fc_name, fc_date, run_info = get_run_info(fc_dir, config, run_info_yaml)
    fastq_dir, galaxy_dir, config_dir = _get_full_paths(get_fastq_dir(fc_dir) if fc_dir else None,
                                                        config, config_file)
    config_file = os.path.join(config_dir, os.path.basename(config_file))
    dirs = {"fastq": fastq_dir, "galaxy": galaxy_dir,
            "work": work_dir, "flowcell": fc_dir, "config": config_dir}
    run_parallel = parallel_runner(parallel, dirs, config, config_file)

    # process each flowcell lane
    run_items = add_multiplex_across_lanes(run_info["details"], dirs["fastq"], fc_name)
    lanes = ((info, fc_name, fc_date, dirs, config) for info in run_items)
    lane_items = run_parallel("process_lane", lanes)

    pipelines = _pair_lanes_with_pipelines(lane_items)
    for pipeline, pipeline_items in pipelines.items():
        for xs in pipeline.run(config, config_file, run_parallel, dirs, pipeline_items):
            assert len(xs) == 1
            upload.from_sample(xs[0])
    write_metrics(run_info, fc_name, fc_date, dirs)
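# Hypothetical sketch, not taken from the source: the minimal duck-typed
# interface that the run_main above appears to expect from the objects returned
# by _pair_lanes_with_pipelines.  Each pipeline's run() receives the shared
# configuration plus its own lane items and must yield single-sample lists,
# which are then handed to upload.from_sample.  The class name and the
# pass-through body are illustrative assumptions only.
class _ExamplePipeline(object):
    def run(self, config, config_file, run_parallel, dirs, items):
        # A real pipeline would fan work out through run_parallel here; this
        # stand-in just yields each lane item back as a one-element list.
        for item in items:
            yield [item]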
def main(config_file, fc_dir, run_info_yaml=None, num_cores=None):
    config = load_config(config_file)
    work_dir = os.getcwd()
    if config.get("log_dir", None) is None:
        config["log_dir"] = os.path.join(work_dir, "log")
    if num_cores:
        config["algorithm"]["num_cores"] = int(num_cores)
    setup_logging(config)
    run_main(config, config_file, fc_dir, work_dir, run_info_yaml)
def main(config_file, fc_dir, run_info_yaml=None): config = load_config(config_file) if config.get("qcdb", None) is None: sys.exit() else: qcdb_config = config.get("qcdb", {}) analysis = config.get("analysis", {}) setup_logging(config) qcdb_store_dir = qcdb_config.get("qcdb_store_dir", None) run_main(fc_dir, qcdb_store_dir)
def setUp(self):
    self.data_dir = os.path.join(os.path.dirname(__file__), "data", "automated")
    config_file = os.path.join(self.data_dir, "post_process-statusdb.yaml")
    config = load_config(config_file)
    setup_logging(config)
    fc_date = "110106"
    fc_name = "FC70BUKAAXX"
    run_info_yaml = os.path.join(self.data_dir, "run_info.yaml")
    workdir = os.path.join(os.path.dirname(__file__), "110106_FC70BUKAAXX")
    fc_dir = os.path.join(self.data_dir, os.pardir, "110106_FC70BUKAAXX")
def runner(parallel, fn_name, items, work_dir, config):
    """Run a task on an ipython parallel cluster, allowing alternative queue types.

    This will spawn clusters for parallel and custom queue types like multicore
    and high I/O tasks on demand.

    A checkpoint directory keeps track of finished tasks, avoiding spinning up
    clusters for sections that have been previously processed.
    """
    setup_logging(config)
    out = []
    checkpoint_dir = utils.safe_makedir(os.path.join(work_dir, "checkpoints_ipython"))
    checkpoint_file = os.path.join(checkpoint_dir, "%s.done" % fn_name)
    fn = getattr(__import__("{base}.ipythontasks".format(base=parallel["module"]),
                            fromlist=["ipythontasks"]),
                 fn_name)
    queue_type = _get_queue_type(fn)
    if queue_type:
        parallel = dictadd(parallel, "queue_type", queue_type)
    # already finished, run locally on current machine to collect details
    if os.path.exists(checkpoint_file):
        logger.info("ipython: %s -- local; checkpoint passed" % fn_name)
        for args in items:
            if args:
                data = fn(args)
                if data:
                    out.extend(data)
    # Run on a multicore queue with available cores on the same machine
    elif queue_type == "multicore":
        logger.info("ipython: %s -- multicore" % fn_name)
        with cluster_view(parallel) as view:
            for args in items:
                if args:
                    data = view.apply_sync(fn, args)
                    if data:
                        out.extend(data)
    # Run on a standard parallel queue
    else:
        logger.info("ipython: %s -- parallel" % fn_name)
        with cluster_view(parallel) as view:
            xs = [x for x in items if x is not None]
            if len(xs) > 0:
                for data in view.map_sync(fn, xs):
                    if data:
                        out.extend(data)
    with open(checkpoint_file, "w") as out_handle:
        out_handle.write("done\n")
    return out
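# Hypothetical usage sketch for runner() above; the parallel settings, task
# name and items below are illustrative assumptions, not values from the
# pipeline.  A re-run that finds an existing checkpoint file executes the task
# locally instead of spinning up a new cluster.
def _example_runner_call(work_dir):
    parallel = {"module": "bcbio.distributed", "cores": 4}
    config = {"algorithm": {}, "resources": {}}
    items = [[{"lane": 1}], [{"lane": 2}]]
    return runner(parallel, "process_alignment", items, work_dir, config)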
def main(config_file, fc_dir, run_info_yaml=None):
    config = load_config(config_file)
    work_dir = os.getcwd()
    if config.get("log_dir", None) is None:
        config["log_dir"] = os.path.join(work_dir, "log")

    def insert_command(record):
        record.extra["command"] = sys.argv
        record.extra["version"] = version.get_pipeline_version()

    setup_logging(config)
    handler = create_log_handler(config)
    with handler, logbook.Processor(insert_command):
        run_main(config, config_file, fc_dir, work_dir, run_info_yaml)
def run_main(config, config_file, work_dir, parallel, fc_dir=None, run_info_yaml=None):
    """Run toplevel analysis, processing a set of input files.

    config_file -- Main YAML configuration file with system parameters
    fc_dir -- Directory of fastq files to process
    run_info_yaml -- YAML configuration file specifying inputs to process
    """
    setup_logging(config)
    fc_name, fc_date, run_info = get_run_info(fc_dir, config, run_info_yaml)
    fastq_dir, galaxy_dir, config_dir = _get_full_paths(get_fastq_dir(fc_dir) if fc_dir else None,
                                                        config, config_file)
    config_file = os.path.join(config_dir, os.path.basename(config_file))
    dirs = {"fastq": fastq_dir, "galaxy": galaxy_dir,
            "work": work_dir, "flowcell": fc_dir, "config": config_dir}
    config = _set_resources(parallel, config)
    run_parallel = parallel_runner(parallel, dirs, config, config_file)

    # process each flowcell lane
    run_items = add_multiplex_across_lanes(run_info["details"], dirs["fastq"], fc_name)
    lanes = ((info, fc_name, fc_date, dirs, config) for info in run_items)
    lane_items = run_parallel("process_lane", lanes)
    align_items = run_parallel("process_alignment", lane_items)

    # process samples, potentially multiplexed across multiple lanes
    samples = organize_samples(align_items, dirs, config_file)
    samples = run_parallel("merge_sample", samples)
    samples = run_parallel("prep_recal", samples)
    samples = recalibrate.parallel_write_recal_bam(samples, run_parallel)
    samples = parallel_realign_sample(samples, run_parallel)
    samples = parallel_variantcall(samples, run_parallel)
    samples = run_parallel("postprocess_variants", samples)
    samples = combine_multiple_callers(samples)
    samples = run_parallel("detect_sv", samples)
    samples = run_parallel("combine_calls", samples)
    run_parallel("process_sample", samples)
    run_parallel("generate_bigwig", samples, {"programs": ["ucsc_bigwig"]})
    write_project_summary(samples)
    write_metrics(run_info, fc_name, fc_date, dirs)
def runner(parallel, fn_name, items, work_dir, config):
    """Run a task on an ipython parallel cluster, allowing alternative queue types.

    This will spawn clusters for parallel and custom queue types like multicore
    and high I/O tasks on demand.

    A checkpoint directory keeps track of finished tasks, avoiding spinning up
    clusters for sections that have been previously processed.
    """
    setup_logging(config)
    out = []
    checkpoint_dir = utils.safe_makedir(os.path.join(work_dir, "checkpoints_ipython"))
    checkpoint_file = os.path.join(checkpoint_dir, "%s.done" % fn_name)
    fn = getattr(__import__("{base}.ipythontasks".format(base=parallel["module"]),
                            fromlist=["ipythontasks"]),
                 fn_name)
    queue_type = _get_queue_type(fn)
    if queue_type:
        parallel = dictadd(parallel, "queue_type", queue_type)
    # already finished, run locally on current machine to collect details
    if os.path.exists(checkpoint_file):
        logger.info("ipython: %s -- local; checkpoint passed" % fn_name)
        for args in items:
            if args:
                data = fn(args)
                if data:
                    out.extend(data)
    # Run on a multicore queue with available cores on the same machine
    elif queue_type == "multicore":
        logger.info("ipython: %s -- multicore" % fn_name)
        with cluster_view(parallel, config) as view:
            for args in items:
                if args:
                    data = view.apply_sync(fn, args)
                    if data:
                        out.extend(data)
    # Run on a standard parallel queue
    else:
        logger.info("ipython: %s -- parallel" % fn_name)
        with cluster_view(parallel, config) as view:
            xs = [x for x in items if x is not None]
            if len(xs) > 0:
                for data in view.map_sync(fn, xs):
                    if data:
                        out.extend(data)
    with open(checkpoint_file, "w") as out_handle:
        out_handle.write("done\n")
    return out
def main(config_file, fc_dir, run_info_yaml=None):
    config = load_config(config_file)
    work_dir = os.getcwd()
    if config.get("log_dir", None) is None:
        config["log_dir"] = os.path.join(work_dir, "log")

    def insert_command(record):
        record.extra["command"] = sys.argv
        record.extra["version"] = version.get_pipeline_version()

    setup_logging(config)
    handler = create_log_handler(config)
    with handler, \
         logbook.Processor(insert_command):
        run_main(config, config_file, fc_dir, work_dir, run_info_yaml)
def main(config_file, queues=None, task_module=None, base_dir=None):
    if base_dir is None:
        base_dir = os.getcwd()
    if task_module is None:
        task_module = "bcbio.distributed.tasks"
    config = load_config(config_file)
    if config.get("log_dir", None) is None:
        config["log_dir"] = os.path.join(base_dir, "log")
    signals.setup_logging.connect(celery_logger(config))
    setup_logging(config)
    logger.info("Starting distributed worker process: {0}".format(queues if queues else ""))
    with utils.chdir(base_dir):
        with utils.curdir_tmpdir() as work_dir:
            dirs = {"work": work_dir, "config": os.path.dirname(config_file)}
            with create_celeryconfig(task_module, dirs, config,
                                     os.path.abspath(config_file)):
                run_celeryd(work_dir, queues)
def _setup_logging(args):
    if len(args) > 0:
        for check_i in [0, -1]:
            config = args[0][check_i]
            if isinstance(config, dict) and config.has_key("config"):
                config = config["config"]
                break
            elif isinstance(config, dict) and config.has_key("algorithm"):
                break
    else:
        config = None
    setup_logging(config)
    try:
        yield None
    except:
        logger.exception("Unexpected error")
        raise
def runner(parallel, fn_name, items, work_dir, config):
    """Run a task on an ipython parallel cluster, allowing alternative queue types.

    This will spawn clusters for parallel and custom queue types like multicore
    and high I/O tasks on demand.

    A checkpoint directory keeps track of finished tasks, avoiding spinning up
    clusters for sections that have been previously processed.
    """
    setup_logging(config)
    out = []
    checkpoint_dir = utils.safe_makedir(os.path.join(work_dir, "checkpoints_ipython"))
    checkpoint_file = _get_checkpoint_file(checkpoint_dir, fn_name)
    fn = getattr(__import__("{base}.ipythontasks".format(base=parallel["module"]),
                            fromlist=["ipythontasks"]),
                 fn_name)
    items = [x for x in items if x is not None]
    num_jobs, cores_per_job = find_cores_per_job(fn, parallel, items, config)
    parallel = dictadd(parallel, "cores_per_job", cores_per_job)
    parallel = dictadd(parallel, "num_jobs", num_jobs)
    # already finished, run locally on current machine to collect details
    if os.path.exists(checkpoint_file):
        logger.info("ipython: %s -- local; checkpoint passed" % fn_name)
        for args in items:
            if args:
                data = fn(args)
                if data:
                    out.extend(data)
    # Run on a standard parallel queue
    else:
        logger.info("ipython: %s" % fn_name)
        if len(items) > 0:
            items = [add_cores_to_config(x, cores_per_job) for x in items]
            with ipython_cluster.cluster_view(parallel["scheduler"].lower(), parallel["queue"],
                                              parallel["num_jobs"], parallel["cores_per_job"],
                                              profile=parallel["profile"]) as view:
                for data in view.map_sync(fn, items, track=False):
                    if data:
                        out.extend(data)
    with open(checkpoint_file, "w") as out_handle:
        out_handle.write("done\n")
    return out
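# Assumption for illustration only (not the project's actual helper): the call
# to add_cores_to_config above suggests a small transform that records the
# number of cores granted to each job inside the per-item configuration,
# roughly along these lines.
def _example_add_cores(item, cores_per_job):
    # item is assumed to be an argument tuple whose last element carries the
    # configuration dictionary
    config = item[-1]
    config.setdefault("algorithm", {})["num_cores"] = cores_per_job
    return item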
def runner(parallel, fn_name, items, work_dir, config):
    """Run a task on an ipython parallel cluster, allowing alternative queue types.

    This will spawn clusters for parallel and custom queue types like multicore
    and high I/O tasks on demand.

    A checkpoint directory keeps track of finished tasks, avoiding spinning up
    clusters for sections that have been previously processed.
    """
    setup_logging(config)
    out = []
    checkpoint_dir = utils.safe_makedir(os.path.join(work_dir, "checkpoints_ipython"))
    checkpoint_file = _get_checkpoint_file(checkpoint_dir, fn_name)
    fn = getattr(__import__("{base}.ipythontasks".format(base=parallel["module"]),
                            fromlist=["ipythontasks"]),
                 fn_name)
    items = [x for x in items if x is not None]
    num_jobs, cores_per_job = find_cores_per_job(fn, parallel, items, config)
    parallel = dictadd(parallel, "cores_per_job", cores_per_job)
    parallel = dictadd(parallel, "num_jobs", num_jobs)
    # already finished, run locally on current machine to collect details
    if os.path.exists(checkpoint_file):
        logger.info("ipython: %s -- local; checkpoint passed" % fn_name)
        for args in items:
            if args:
                data = fn(args)
                if data:
                    out.extend(data)
    # Run on a standard parallel queue
    else:
        logger.info("ipython: %s" % fn_name)
        if len(items) > 0:
            items = [add_cores_to_config(x, cores_per_job) for x in items]
            with ipython_cluster.cluster_view(parallel["scheduler"].lower(), parallel["queue"],
                                              parallel["num_jobs"], parallel["cores_per_job"],
                                              profile=parallel["profile"]) as view:
                for data in view.map_sync(fn, items, track=False):
                    if data:
                        out.extend(data)
    with open(checkpoint_file, "w") as out_handle:
        out_handle.write("done\n")
    return out
def _setup_logging(args):
    config = None
    if len(args) == 1 and isinstance(args[0], (list, tuple)):
        args = args[0]
    for arg in args:
        if ipython.is_nested_config_arg(arg):
            config = arg["config"]
            break
        elif ipython.is_std_config_arg(arg):
            config = arg
            break
    if config is not None:
        setup_logging(config)
    else:
        raise NotImplementedError("No config in %s:" % args[0])
    try:
        yield None
    except:
        logger.exception("Unexpected error")
        raise
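# Assumed usage, for illustration only: because _setup_logging yields once, it
# is presumably wrapped with contextlib.contextmanager and used to guard each
# distributed task body so failures are logged against the task's own config.
# The wrapper and the example task below are assumptions, not source code.
import contextlib

_setup_logging_cm = contextlib.contextmanager(_setup_logging)

def _example_task(*args):
    with _setup_logging_cm(args):
        pass  # the real task work would run here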
def main(config_file, fc_dir, project_dir, run_info_yaml=None, fc_alias=None,
         project_desc=None, lanes=None, barcodes=None):
    config = load_config(config_file)
    if config.get("log_dir", None) is None:
        config["log_dir"] = os.path.join(project_dir, "log")
    setup_logging(config)
    if project_desc is None and lanes is None:
        logger.error("No project description or lanes provided: cannot deliver files without this information")
        sys.exit()
    if options.customer_delivery and not fc_alias == "":
        logger.info("INFO: Ignoring flowcell_alias when doing customer_delivery")

    fc_dir = os.path.abspath(fc_dir)
    fc_name, fc_date, run_info = get_run_info(fc_dir, config, run_info_yaml)
    fp = open(run_info_yaml)
    run_info_structure = yaml.load(fp)
    original_fc = PostProcessedFlowcell(fc_name, fc_date, run_info_structure,
                                        fc_dir=fc_dir, fc_results_dir=fc_dir)
    pruned_fc = original_fc.prune_to_project(project_desc, exclude_unmatched=True)
    if pruned_fc is None or len(pruned_fc.get_lanes()) == 0:
        if not project_desc is None:
            logger.error("No lanes found with matching description %s: please check your flowcell run information" % project_desc)
            print >> sys.stderr, "Available projects: \n\t%s" % ("\n\t".join(original_fc.get_project_names()))
            sys.exit()
        if not lanes is None:
            logger.error("No lanes found with numbers %s: please check your flowcell run information" % " ".join(lanes))
            sys.exit()

    # Set up a raw data flowcell that contains the delivery information for raw data (demuxed fastq data)
    rawdata_fc = PostProcessedFlowcell(pruned_fc.get_fc_name(), pruned_fc.get_fc_date(),
                                       pruned_fc.to_structure()['details'], fc_alias=fc_alias)
    rawdata_fc.set_fc_dir(os.path.abspath(os.path.join(project_dir, "nobackup/data",
                                                       rawdata_fc.get_fc_id())))
    analysis_fc = PostProcessedFlowcell(pruned_fc.get_fc_name(), pruned_fc.get_fc_date(),
                                        pruned_fc.to_structure()['details'], fc_alias=fc_alias)
    analysis_fc.set_fc_dir(os.path.abspath(os.path.join(project_dir, "nobackup/intermediate",
                                                        rawdata_fc.get_fc_id())))

    # If customer delivery, set up some special options
    if options.customer_delivery:
        rawdata_fc.set_fc_dir(os.path.abspath(os.path.join(project_dir, project_desc,
                                                           rawdata_fc.get_fc_id())))
        rawdata_fc.set_fc_alias(rawdata_fc.get_fc_id())
        analysis_fc = rawdata_fc

    _make_delivery_directory(rawdata_fc)
    _make_delivery_directory(analysis_fc)
    run_main(pruned_fc, rawdata_fc, analysis_fc)
def run_main(config, config_file, work_dir, parallel, fc_dir=None, run_info_yaml=None):
    """Run toplevel analysis, processing a set of input files.

    config_file -- Main YAML configuration file with system parameters
    fc_dir -- Directory of fastq files to process
    run_info_yaml -- YAML configuration file specifying inputs to process
    """
    setup_logging(config)
    fc_name, fc_date, run_info = get_run_info(fc_dir, config, run_info_yaml)
    fastq_dir, galaxy_dir, config_dir = _get_full_paths(get_fastq_dir(fc_dir) if fc_dir else None,
                                                        config, config_file)
    config_file = os.path.join(config_dir, os.path.basename(config_file))
    dirs = {"fastq": fastq_dir, "galaxy": galaxy_dir,
            "work": work_dir, "flowcell": fc_dir, "config": config_dir}
    run_parallel = parallel_runner(parallel, dirs, config, config_file)

    # process each flowcell lane
    run_items = add_multiplex_across_lanes(run_info["details"], dirs["fastq"], fc_name)
    lanes = ((info, fc_name, fc_date, dirs, config) for info in run_items)
    lane_items = lane.process_all_lanes(lanes, run_parallel)

    pipelines = _pair_lanes_with_pipelines(lane_items)
    for pipeline, pipeline_items in pipelines.items():
        pipeline_items = _add_provenance(pipeline_items, dirs, config)
        for xs in pipeline.run(config, config_file, run_parallel, dirs, pipeline_items):
            assert len(xs) == 1
            upload.from_sample(xs[0])
    qcsummary.write_metrics(run_info, fc_name, fc_date, dirs)
def run_main(config, config_file, work_dir, parallel, fc_dir=None, run_info_yaml=None):
    """Run toplevel analysis, processing a set of input files.

    config_file -- Main YAML configuration file with system parameters
    fc_dir -- Directory of fastq files to process
    run_info_yaml -- YAML configuration file specifying inputs to process
    """
    setup_logging(config)
    fc_name, fc_date, run_info = get_run_info(fc_dir, config, run_info_yaml)
    fastq_dir, galaxy_dir, config_dir = _get_full_paths(get_fastq_dir(fc_dir) if fc_dir else None,
                                                        config, config_file)
    config_file = os.path.join(config_dir, os.path.basename(config_file))
    dirs = {"fastq": fastq_dir, "galaxy": galaxy_dir,
            "work": work_dir, "flowcell": fc_dir, "config": config_dir}
    config = _set_resources(parallel, config)
    run_parallel = parallel_runner(parallel, dirs, config, config_file)

    # process each flowcell lane
    run_items = add_multiplex_across_lanes(run_info["details"], dirs["fastq"], fc_name)
    lanes = ((info, fc_name, fc_date, dirs, config) for info in run_items)
    lane_items = run_parallel("process_lane", lanes)
    align_items = run_parallel("process_alignment", lane_items)

    # process samples, potentially multiplexed across multiple lanes
    samples = organize_samples(align_items, dirs, config_file)
    samples = run_parallel("merge_sample", samples)
    samples = run_parallel("prep_recal", samples)
    samples = recalibrate.parallel_write_recal_bam(samples, run_parallel)
    samples = parallel_realign_sample(samples, run_parallel)
    samples = parallel_variantcall(samples, run_parallel)
    samples = run_parallel("postprocess_variants", samples)
    samples = combine_multiple_callers(samples)
    samples = run_parallel("detect_sv", samples)
    samples = run_parallel("combine_calls", samples)
    run_parallel("process_sample", samples)
    run_parallel("generate_bigwig", samples, {"programs": ["ucsc_bigwig"]})
    write_project_summary(samples)
    write_metrics(run_info, fc_name, fc_date, dirs)
def _worker(**kwds):
    # config is not an argument here; it is resolved from the enclosing scope
    # where this worker callback is defined
    setup_logging(config)
def main(config_file, delivery_dir, run_info_yaml, analysis_dir=None): if analysis_dir is None: analysis_dir = os.path.abspath(os.path.curdir) config = load_config(config_file) if config.get("log_dir", None) is None: config["log_dir"] = os.path.join(analysis_dir, "log") setup_logging(config) if not os.path.exists(analysis_dir): logger.error("No analysis directory found!") sys.exit() with open(run_info_yaml) as fp: run_info_structure = yaml.load(fp) lane2sample = dict() infiles = dict() for info in run_info_structure["details"]: lane2sample[info["lane"]] = info["description"] infiles[info["lane"]] = dict(vcf=[], bam=[], bigwig=[], metrics=[]) # Vcf files, tsv and tranches vcftypes = ("*.vcf", "*.idx", "*.tranches", "*.eval", "*.tsv") for vcftype in vcftypes: glob_str = os.path.join(analysis_dir, str(info["lane"]) + "_" + vcftype) infiles[info["lane"]]["vcf"].extend(glob.glob(glob_str)) # Bam files glob_str = os.path.join(analysis_dir, str(info["lane"]) + "_" + options.bam_glob) bamfiles = glob.glob(glob_str) infiles[info["lane"]]["bam"] = bamfiles # Bigwig files glob_str = os.path.join(analysis_dir, str(info["lane"]) + "_" + "*.bigwig") bigwigfiles = glob.glob(glob_str) infiles[info["lane"]]["bigwig"] = bigwigfiles # metrics files glob_str = os.path.join(analysis_dir, str(info["lane"]) + "_" + "*metrics") metricsfiles = glob.glob(glob_str) infiles[info["lane"]]["metrics"] = metricsfiles # snpEff files glob_str = os.path.join(analysis_dir, "snpEff*") snpeff_files = glob.glob(glob_str) # Loop through the list and deliver if appropriate _make_dir(delivery_dir) _deliver_file(os.path.join(analysis_dir, "project-summary.csv"), os.path.join(delivery_dir, "project-summary.csv")) _deliver_file(os.path.join(analysis_dir, "run_summary.yaml"), os.path.join(delivery_dir, "run_summary.yaml")) _deliver_file(run_info_yaml, os.path.join(delivery_dir, os.path.basename(run_info_yaml))) if not options.no_vcf: for sf in snpeff_files: _deliver_file(sf, os.path.join(delivery_dir, os.path.basename(sf))) for lane_num in infiles.keys(): lane = infiles[lane_num] if not options.no_vcf: for vcf in lane["vcf"]: (src, tgt) = _rename_sample_file(vcf, lane_num, lane2sample[lane_num], delivery_dir) _deliver_file(src, tgt) if not options.no_bigwig: for bigwig in lane["bigwig"]: (src, tgt) = _rename_sample_file(bigwig, lane_num, lane2sample[lane_num], delivery_dir) _deliver_file(src, tgt) if not options.no_metrics: for metrics in lane["metrics"]: (src, tgt) = _rename_sample_file(metrics, lane_num, lane2sample[lane_num], delivery_dir) _deliver_file(src, tgt) if options.bam: for bamfile in lane["bam"]: (src, tgt) = _rename_sample_file(bamfile, lane_num, lane2sample[lane_num], delivery_dir) _deliver_file(src, tgt)
def main(config_file, fc_dir, project_dir, run_info_yaml=None, fc_alias=None,
         project_desc=None, lanes=None, barcodes=None):
    config = load_config(config_file)
    if config.get("log_dir", None) is None:
        config["log_dir"] = os.path.join(project_dir, "log")
    setup_logging(config)
    if project_desc is None and lanes is None:
        logger.error("No project description or lanes provided: cannot deliver files without this information")
        sys.exit()
    if options.customer_delivery and not fc_alias == "":
        logger.info("INFO: Ignoring flowcell_alias when doing customer_delivery")

    fc_dir = os.path.abspath(fc_dir)
    fc_name, fc_date, run_info = get_run_info(fc_dir, config, run_info_yaml)
    fp = open(run_info_yaml)
    run_info_structure = yaml.load(fp)
    original_fc = PostProcessedFlowcell(fc_name, fc_date, run_info_structure,
                                        fc_dir=fc_dir, fc_results_dir=fc_dir)
    pruned_fc = original_fc.prune_to_project(project_desc, exclude_unmatched=True)
    if pruned_fc is None or len(pruned_fc.get_lanes()) == 0:
        if not project_desc is None:
            logger.error("No lanes found with matching description %s: please check your flowcell run information" % project_desc)
            print >> sys.stderr, "Available projects: \n\t%s" % ("\n\t".join(original_fc.get_project_names()))
            sys.exit()
        if not lanes is None:
            logger.error("No lanes found with numbers %s: please check your flowcell run information" % " ".join(lanes))
            sys.exit()

    # Set up a raw data flowcell that contains the delivery information for raw data (demuxed fastq data)
    rawdata_fc = PostProcessedFlowcell(pruned_fc.get_fc_name(), pruned_fc.get_fc_date(),
                                       pruned_fc.to_structure()['details'], fc_alias=fc_alias)
    rawdata_fc.set_fc_dir(os.path.abspath(os.path.join(project_dir, "nobackup/data",
                                                       rawdata_fc.get_fc_id())))
    analysis_fc = PostProcessedFlowcell(pruned_fc.get_fc_name(), pruned_fc.get_fc_date(),
                                        pruned_fc.to_structure()['details'], fc_alias=fc_alias)
    analysis_fc.set_fc_dir(os.path.abspath(os.path.join(project_dir, "nobackup/intermediate",
                                                        rawdata_fc.get_fc_id())))

    # If customer delivery, set up some special options
    if options.customer_delivery:
        rawdata_fc.set_fc_dir(os.path.abspath(os.path.join(project_dir, project_desc,
                                                           rawdata_fc.get_fc_id())))
        rawdata_fc.set_fc_alias(rawdata_fc.get_fc_id())
        analysis_fc = rawdata_fc

    _make_delivery_directory(rawdata_fc)
    _make_delivery_directory(analysis_fc)
    run_main(pruned_fc, rawdata_fc, analysis_fc)
def main(config_file, delivery_dir, run_info_yaml, analysis_dir=None): if analysis_dir is None: analysis_dir = os.path.abspath(os.path.curdir) config = load_config(config_file) if config.get("log_dir", None) is None: config["log_dir"] = os.path.join(analysis_dir, "log") setup_logging(config) if not os.path.exists(analysis_dir): logger.error("No analysis directory found!") sys.exit() with open(run_info_yaml) as fp: run_info_structure = yaml.load(fp) lane2sample = dict() infiles = dict() for info in run_info_structure['details']: lane2sample[info['lane']] = info['description'] infiles[info['lane']] = dict(vcf=[], bam=[], bigwig=[], metrics=[]) # Vcf files, tsv and tranches vcftypes = ('*.vcf', '*.idx', '*.tranches', '*.eval', '*.tsv') for vcftype in vcftypes: glob_str = os.path.join(analysis_dir, str(info['lane']) + "_" + vcftype) infiles[info['lane']]['vcf'].extend(glob.glob(glob_str)) # Bam files glob_str = os.path.join(analysis_dir, str(info['lane']) + "_" + options.bam_glob) bamfiles = glob.glob(glob_str) infiles[info['lane']]['bam'] = bamfiles # Bigwig files glob_str = os.path.join(analysis_dir, str(info['lane']) + "_" + "*.bigwig") bigwigfiles = glob.glob(glob_str) infiles[info['lane']]['bigwig'] = bigwigfiles # metrics files glob_str = os.path.join(analysis_dir, str(info['lane']) + "_" + "*metrics") metricsfiles = glob.glob(glob_str) infiles[info['lane']]['metrics'] = metricsfiles # snpEff files glob_str = os.path.join(analysis_dir, "snpEff*") snpeff_files = glob.glob(glob_str) # Loop through the list and deliver if appropriate _make_dir(delivery_dir) _deliver_file(os.path.join(analysis_dir, "project-summary.csv"), os.path.join(delivery_dir, "project-summary.csv")) _deliver_file(os.path.join(analysis_dir, "run_summary.yaml"), os.path.join(delivery_dir, "run_summary.yaml")) _deliver_file(run_info_yaml, os.path.join(delivery_dir, os.path.basename(run_info_yaml))) if not options.no_vcf: for sf in snpeff_files: _deliver_file(sf, os.path.join(delivery_dir, os.path.basename(sf))) for lane_num in infiles.keys(): lane = infiles[lane_num] if not options.no_vcf: for vcf in lane['vcf']: (src, tgt) = _rename_sample_file(vcf, lane_num, lane2sample[lane_num], delivery_dir) _deliver_file(src, tgt) if not options.no_bigwig: for bigwig in lane['bigwig']: (src, tgt) = _rename_sample_file(bigwig, lane_num, lane2sample[lane_num], delivery_dir) _deliver_file(src, tgt) if not options.no_metrics: for metrics in lane['metrics']: (src, tgt) = _rename_sample_file(metrics, lane_num, lane2sample[lane_num], delivery_dir) _deliver_file(src, tgt) if options.bam: for bamfile in lane['bam']: (src, tgt) = _rename_sample_file(bamfile, lane_num, lane2sample[lane_num], delivery_dir) _deliver_file(src, tgt)