def organize(dirs, config, run_info_yaml, sample_names):
    """Assemble per-sample run information from an input sample YAML file.

    Every sample returned by `_run_info_from_yaml` gets the shared `dirs`, a
    two-element `name`, remote resources, a merged `config` and reference
    resources attached. The resulting list is passed through
    `_add_provenance` before being returned.

    sample_names is a list of samples to include from the overall file, for
    cases where we are running multiple pipelines from the same configuration
    file.
    """
    logger.info("Using input YAML configuration: %s" % run_info_yaml)
    assert run_info_yaml and os.path.exists(run_info_yaml), \
        "Did not find input sample YAML file: %s" % run_info_yaml
    samples = []
    for sample in _run_info_from_yaml(dirs, run_info_yaml, config, sample_names):
        sample["dirs"] = dirs
        # Normalize name into a [prefix, description] pair
        if "name" not in sample:
            sample["name"] = ["", sample["description"]]
        elif isinstance(sample["name"], basestring):
            full_name = "%s-%s" % (sample["name"], clean_name(sample["description"]))
            sample["name"] = [sample["name"], full_name]
            sample["description"] = full_name
        # Fold algorithm details into the configuration so they are only
        # specified in one place
        sample["resources"] = _add_remote_resources(sample["resources"])
        sample["config"] = config_utils.update_w_custom(config, sample)
        sample.pop("algorithm", None)
        sample = add_reference_resources(sample)
        # Temporary directory: expand environment variables, create it and
        # store the absolute path back in the configuration
        tmp_dir = tz.get_in(["config", "resources", "tmp", "dir"], sample)
        if tmp_dir:
            made_dir = utils.safe_makedir(os.path.expandvars(tmp_dir))
            sample["config"]["resources"]["tmp"]["dir"] = genome.abs_file_paths(made_dir)
        samples.append(sample)
    return _add_provenance(samples, dirs, config)
def organize(dirs, config, run_info_yaml):
    """Assemble per-sample run information from an input sample YAML file.

    Samples are read with `_run_info_from_yaml` using the flowcell directory
    from `dirs`. Each one receives a merged `config`, the shared `dirs`, a
    two-element `name` and reference resources. Returns the list of prepared
    samples.
    """
    logger.info("Using input YAML configuration: %s" % run_info_yaml)
    assert run_info_yaml and os.path.exists(run_info_yaml), \
        "Did not find input sample YAML file: %s" % run_info_yaml
    tmp_key = ("config", "resources", "tmp")
    samples = []
    for sample in _run_info_from_yaml(dirs["flowcell"], run_info_yaml, config):
        # Fold algorithm details into the configuration so they are only
        # specified in one place
        sample["config"] = config_utils.update_w_custom(config, sample)
        sample.pop("algorithm", None)
        sample["dirs"] = dirs
        # Normalize name into a [prefix, description] pair
        if "name" not in sample:
            sample["name"] = ["", sample["description"]]
        elif isinstance(sample["name"], basestring):
            full_name = "%s-%s" % (sample["name"], clean_name(sample["description"]))
            sample["name"] = [sample["name"], full_name]
            sample["description"] = full_name
        sample = add_reference_resources(sample)
        # Create the temporary directory and make the tmp settings absolute
        tmp_dir = utils.get_in(sample, tmp_key + ("dir",))
        if tmp_dir:
            utils.safe_makedir(tmp_dir)
            tmp_settings = utils.get_in(sample, tmp_key)
            sample["config"]["resources"]["tmp"] = genome.abs_file_paths(tmp_settings)
        samples.append(sample)
    return samples
def organize(dirs, config, run_info_yaml):
    """Build the list of prepared samples described by a sample YAML file.

    Reads sample details through `_run_info_from_yaml` (given the flowcell
    directory from `dirs`), then for each sample merges in the configuration,
    records `dirs`, normalizes `name` and adds reference resources.
    """
    logger.info("Using input YAML configuration: %s" % run_info_yaml)
    assert run_info_yaml and os.path.exists(run_info_yaml), \
        "Did not find input sample YAML file: %s" % run_info_yaml
    run_details = _run_info_from_yaml(dirs["flowcell"], run_info_yaml, config)
    prepared = []
    for cur in run_details:
        # Algorithm details live in the merged config; drop the duplicate
        cur["config"] = config_utils.update_w_custom(config, cur)
        cur.pop("algorithm", None)
        cur["dirs"] = dirs
        if "name" in cur:
            if isinstance(cur["name"], basestring):
                combined = "%s-%s" % (cur["name"], clean_name(cur["description"]))
                cur["name"] = [cur["name"], combined]
                cur["description"] = combined
        else:
            cur["name"] = ["", cur["description"]]
        cur = add_reference_resources(cur)
        # Create temporary directories and make absolute
        tmp_dir = utils.get_in(cur, ("config", "resources", "tmp", "dir"))
        if tmp_dir:
            utils.safe_makedir(tmp_dir)
            cur["config"]["resources"]["tmp"] = genome.abs_file_paths(
                utils.get_in(cur, ("config", "resources", "tmp")))
        prepared.append(cur)
    return prepared
def organize(dirs, config, run_info_yaml, sample_names=None, is_cwl=False,
             integrations=None):
    """Assemble per-sample run information from an input sample YAML file.

    Samples are read with `_run_info_from_yaml`. Every integration whose name
    appears in `config` then adds its remotes to the samples. Each sample is
    given `dirs`, a two-element `name`, remote resources, a merged `config`,
    reference resources, and the `qc` and `vcfanno` algorithm settings. The
    list is passed through `_add_provenance` (with `not is_cwl` as its last
    argument) before being returned.

    sample_names is a list of samples to include from the overall file, for
    cases where we are running multiple pipelines from the same configuration
    file.
    """
    from bcbio.pipeline import qcsummary
    integrations = {} if integrations is None else integrations
    logger.info("Using input YAML configuration: %s" % run_info_yaml)
    assert run_info_yaml and os.path.exists(run_info_yaml), \
        "Did not find input sample YAML file: %s" % run_info_yaml
    samples = _run_info_from_yaml(dirs, run_info_yaml, config, sample_names,
                                  is_cwl=is_cwl, integrations=integrations)
    remote_retriever = None
    for iname, retriever in integrations.items():
        if iname not in config:
            continue
        samples = retriever.add_remotes(samples, config[iname])
        remote_retriever = retriever
    prepared = []
    for sample in samples:
        sample["dirs"] = dirs
        # Normalize name into a [prefix, description] pair
        if "name" not in sample:
            sample["name"] = ["", sample["description"]]
        elif isinstance(sample["name"], basestring):
            full_name = "%s-%s" % (sample["name"], clean_name(sample["description"]))
            sample["name"] = [sample["name"], full_name]
            sample["description"] = full_name
        # Fold algorithm details into the configuration so they are only
        # specified in one place
        sample["resources"] = _add_remote_resources(sample["resources"])
        sample["config"] = config_utils.update_w_custom(config, sample)
        sample.pop("algorithm", None)
        sample = add_reference_resources(sample, remote_retriever)
        algorithm = sample["config"]["algorithm"]
        algorithm["qc"] = qcsummary.get_qc_tools(sample)
        algorithm["vcfanno"] = vcfanno.find_annotations(sample)
        tmp_dir = tz.get_in(["config", "resources", "tmp", "dir"], sample)
        if tmp_dir:
            # Without environment variables we can create and normalize the
            # directory now; otherwise it is normalized later in
            # distributed.transaction
            expanded = os.path.expandvars(tmp_dir)
            if expanded == tmp_dir:
                tmp_dir = genome.abs_file_paths(utils.safe_makedir(expanded),
                                                do_download=not integrations)
            sample["config"]["resources"]["tmp"]["dir"] = tmp_dir
        prepared.append(sample)
    return _add_provenance(prepared, dirs, config, not is_cwl)
def _run_info_from_yaml(fc_dir, run_info_yaml, config):
    """Read run information from a passed YAML file.

    fc_dir -- flowcell directory; when set, its name is parsed for a flowcell
      name and date (a ValueError from parsing is ignored).
    run_info_yaml -- path to the sample YAML file. The top level is either a
      list of samples or a dictionary with the samples under `details`.
    config -- configuration passed on to `prep_rg_names`.

    Returns the list of per-sample dictionaries after filling in lane,
    description, upload, algorithm, genome_build, rgnames and test_run, and
    after checking them with `_check_sample_config`.

    Raises ValueError if a sample has no `description` and is not a BAM input
    (as decided by `_item_is_bam`).
    """
    # NOTE(review): yaml.load without an explicit safe loader can construct
    # arbitrary Python objects. Consider yaml.safe_load if sample files can
    # come from untrusted sources; left unchanged to preserve behavior.
    with open(run_info_yaml) as in_handle:
        loaded = yaml.load(in_handle)
    fc_name, fc_date = None, None
    if fc_dir:
        try:
            fc_name, fc_date = flowcell.parse_dirname(fc_dir)
        except ValueError:
            # Best effort: the directory name did not parse as a flowcell
            pass
    global_config = {}
    global_vars = {}
    if isinstance(loaded, dict):
        global_config = copy.deepcopy(loaded)
        del global_config["details"]
        # Explicit flowcell details in the file override the directory name
        if "fc_name" in loaded and "fc_date" in loaded:
            fc_name = loaded["fc_name"].replace(" ", "_")
            fc_date = str(loaded["fc_date"]).replace(" ", "_")
        global_vars = global_config.pop("globals", {})
        loaded = loaded["details"]
    run_details = []
    for i, item in enumerate(loaded):
        item = _normalize_files(item, fc_dir)
        # Default the lane to the 1-based position in the file
        if "lane" not in item:
            item["lane"] = str(i + 1)
        item["lane"] = _clean_characters(str(item["lane"]))
        if "description" not in item:
            if _item_is_bam(item):
                item["description"] = get_sample_name(item["files"][0])
            else:
                raise ValueError("No `description` sample name provided for input #%s" % (i + 1))
        item["description"] = _clean_characters(str(item["description"]))
        if "upload" not in item:
            upload = global_config.get("upload", {})
            # Handle specifying a local directory directly in upload
            if isinstance(upload, basestring):
                upload = {"dir": upload}
            elif isinstance(upload, dict):
                # Give every sample its own copy so samples do not alias, and
                # repeatedly mutate, the single global upload dictionary
                upload = copy.deepcopy(upload)
            if fc_name and fc_date:
                upload["fc_name"] = fc_name
                upload["fc_date"] = fc_date
            upload["run_id"] = ""
            item["upload"] = upload
        item["algorithm"] = _replace_global_vars(item["algorithm"], global_vars)
        item["algorithm"] = genome.abs_file_paths(item["algorithm"],
                                                  ignore_keys=ALGORITHM_NOPATH_KEYS)
        item["genome_build"] = str(item.get("genome_build", ""))
        item["algorithm"] = _add_algorithm_defaults(item["algorithm"])
        item["rgnames"] = prep_rg_names(item, config, fc_name, fc_date)
        item["test_run"] = global_config.get("test_run", False)
        item = _clean_metadata(item)
        run_details.append(item)
    _check_sample_config(run_details, run_info_yaml)
    return run_details
def _run_info_from_yaml(fc_dir, run_info_yaml, config):
    """Read run information from a passed YAML file.

    fc_dir -- flowcell directory; when set, its name is parsed for a flowcell
      name and date (a ValueError from parsing is ignored).
    run_info_yaml -- path to the sample YAML file. The top level is either a
      list of samples or a dictionary with the samples under `details`.
    config -- configuration passed on to `prep_rg_names`.

    Returns the list of per-sample dictionaries after filling in lane,
    description, upload, algorithm, genome_build, rgnames and test_run,
    cleaning metadata and algorithm, and checking them with
    `_check_sample_config`.

    Raises ValueError if a sample has no `description` and is not a BAM input
    (as decided by `_item_is_bam`).
    """
    # NOTE(review): yaml.load without an explicit safe loader can construct
    # arbitrary Python objects. Consider yaml.safe_load if sample files can
    # come from untrusted sources; left unchanged to preserve behavior.
    with open(run_info_yaml) as in_handle:
        loaded = yaml.load(in_handle)
    fc_name, fc_date = None, None
    if fc_dir:
        try:
            fc_name, fc_date = flowcell.parse_dirname(fc_dir)
        except ValueError:
            # Best effort: the directory name did not parse as a flowcell
            pass
    global_config = {}
    global_vars = {}
    if isinstance(loaded, dict):
        global_config = copy.deepcopy(loaded)
        del global_config["details"]
        # Explicit flowcell details in the file override the directory name
        if "fc_name" in loaded and "fc_date" in loaded:
            fc_name = loaded["fc_name"].replace(" ", "_")
            fc_date = str(loaded["fc_date"]).replace(" ", "_")
        global_vars = global_config.pop("globals", {})
        loaded = loaded["details"]
    run_details = []
    for i, item in enumerate(loaded):
        item = _normalize_files(item, fc_dir)
        # Default the lane to the 1-based position in the file
        if "lane" not in item:
            item["lane"] = str(i + 1)
        item["lane"] = _clean_characters(str(item["lane"]))
        if "description" not in item:
            if _item_is_bam(item):
                item["description"] = get_sample_name(item["files"][0])
            else:
                raise ValueError("No `description` sample name provided for input #%s" % (i + 1))
        item["description"] = _clean_characters(str(item["description"]))
        if "upload" not in item:
            upload = global_config.get("upload", {})
            # Handle specifying a local directory directly in upload
            if isinstance(upload, basestring):
                upload = {"dir": upload}
            elif isinstance(upload, dict):
                # Give every sample its own copy so samples do not alias, and
                # repeatedly mutate, the single global upload dictionary
                upload = copy.deepcopy(upload)
            if fc_name and fc_date:
                upload["fc_name"] = fc_name
                upload["fc_date"] = fc_date
            upload["run_id"] = ""
            item["upload"] = upload
        item["algorithm"] = _replace_global_vars(item["algorithm"], global_vars)
        item["algorithm"] = genome.abs_file_paths(item["algorithm"],
                                                  ignore_keys=ALGORITHM_NOPATH_KEYS)
        item["genome_build"] = str(item.get("genome_build", ""))
        item["algorithm"] = _add_algorithm_defaults(item["algorithm"])
        item["rgnames"] = prep_rg_names(item, config, fc_name, fc_date)
        item["test_run"] = global_config.get("test_run", False)
        item = _clean_metadata(item)
        item = _clean_algorithm(item)
        run_details.append(item)
    _check_sample_config(run_details, run_info_yaml)
    return run_details
def _run_info_from_yaml(fc_dir, run_info_yaml, config):
    """Read run information from a passed YAML file.

    fc_dir -- flowcell directory; when set, flowcell name and date are read
      from it with `get_flowcell_info` (a ValueError is ignored).
    run_info_yaml -- path to the sample YAML file. The top level is either a
      list of samples or a dictionary with the samples under `details`.
    config -- configuration passed on to `prep_rg_names`.

    Returns the list of per-sample dictionaries after filling in lane,
    description, upload, algorithm and rgnames, and checking them with
    `_check_sample_config`.

    Raises ValueError if a sample has no `description` and is not a single
    `.bam` file input.
    """
    # NOTE(review): yaml.load without an explicit safe loader can construct
    # arbitrary Python objects. Consider yaml.safe_load if sample files can
    # come from untrusted sources; left unchanged to preserve behavior.
    with open(run_info_yaml) as in_handle:
        loaded = yaml.load(in_handle)
    fc_name, fc_date = None, None
    if fc_dir:
        try:
            fc_name, fc_date = get_flowcell_info(fc_dir)
        except ValueError:
            # Best effort: no flowcell details available from the directory
            pass
    global_config = {}
    if isinstance(loaded, dict):
        global_config = copy.deepcopy(loaded)
        del global_config["details"]
        # Explicit flowcell details in the file override the directory name.
        # `in` replaces dict.has_key, which no longer exists in Python 3.
        if "fc_name" in loaded and "fc_date" in loaded:
            fc_name = loaded["fc_name"].replace(" ", "_")
            fc_date = str(loaded["fc_date"]).replace(" ", "_")
        loaded = loaded["details"]
    run_details = []
    for i, item in enumerate(loaded):
        item = _normalize_files(item, fc_dir)
        # Default the lane to the 1-based position in the file
        if "lane" not in item:
            item["lane"] = str(i + 1)
        item["lane"] = _clean_characters(str(item["lane"]))
        if "description" not in item:
            # A single BAM input can supply its own sample name
            if len(item.get("files", [])) == 1 and item["files"][0].endswith(".bam"):
                item["description"] = get_sample_name(item["files"][0])
            else:
                raise ValueError("No `description` sample name provided for input #%s" % (i + 1))
        item["description"] = _clean_characters(str(item["description"]))
        upload = global_config.get("upload", {})
        if isinstance(upload, dict):
            # Give every sample its own copy so samples do not alias, and
            # repeatedly mutate, the single global upload dictionary
            upload = copy.deepcopy(upload)
        if fc_name and fc_date:
            upload["fc_name"] = fc_name
            upload["fc_date"] = fc_date
        upload["run_id"] = ""
        item["upload"] = upload
        item["algorithm"] = genome.abs_file_paths(
            item["algorithm"],
            ignore_keys=["variantcaller", "realign", "recalibrate", "phasing", "svcaller"])
        item["rgnames"] = prep_rg_names(item, config, fc_name, fc_date)
        run_details.append(item)
    _check_sample_config(run_details, run_info_yaml)
    return run_details
def _run_info_from_yaml(fc_dir, run_info_yaml, config):
    """Read run information from a passed YAML file.

    fc_dir -- flowcell directory; when set, flowcell name and date are read
      from it with `get_flowcell_info` (a ValueError is ignored).
    run_info_yaml -- path to the sample YAML file. The top level is either a
      list of samples or a dictionary with the samples under `details`.
    config -- configuration passed on to `prep_rg_names`.

    Returns the list of per-sample dictionaries after filling in lane,
    description, upload, algorithm, rgnames and test_run, and checking them
    with `_check_sample_config`.

    Raises ValueError if a sample has no `description` and is not a single
    `.bam` file input.
    """
    # NOTE(review): yaml.load without an explicit safe loader can construct
    # arbitrary Python objects. Consider yaml.safe_load if sample files can
    # come from untrusted sources; left unchanged to preserve behavior.
    with open(run_info_yaml) as in_handle:
        loaded = yaml.load(in_handle)
    fc_name, fc_date = None, None
    if fc_dir:
        try:
            fc_name, fc_date = get_flowcell_info(fc_dir)
        except ValueError:
            # Best effort: no flowcell details available from the directory
            pass
    global_config = {}
    if isinstance(loaded, dict):
        global_config = copy.deepcopy(loaded)
        del global_config["details"]
        # Explicit flowcell details in the file override the directory name
        if "fc_name" in loaded and "fc_date" in loaded:
            fc_name = loaded["fc_name"].replace(" ", "_")
            fc_date = str(loaded["fc_date"]).replace(" ", "_")
        loaded = loaded["details"]
    run_details = []
    for i, item in enumerate(loaded):
        item = _normalize_files(item, fc_dir)
        # Default the lane to the 1-based position in the file
        if "lane" not in item:
            item["lane"] = str(i + 1)
        item["lane"] = _clean_characters(str(item["lane"]))
        if "description" not in item:
            # A single BAM input can supply its own sample name
            if len(item.get("files", [])) == 1 and item["files"][0].endswith(".bam"):
                item["description"] = get_sample_name(item["files"][0])
            else:
                raise ValueError("No `description` sample name provided for input #%s" % (i + 1))
        item["description"] = _clean_characters(str(item["description"]))
        upload = global_config.get("upload", {})
        # Handle specifying a local directory directly in upload
        if isinstance(upload, basestring):
            upload = {"dir": upload}
        elif isinstance(upload, dict):
            # Give every sample its own copy so samples do not alias, and
            # repeatedly mutate, the single global upload dictionary
            upload = copy.deepcopy(upload)
        if fc_name and fc_date:
            upload["fc_name"] = fc_name
            upload["fc_date"] = fc_date
        upload["run_id"] = ""
        item["upload"] = upload
        item["algorithm"] = genome.abs_file_paths(
            item["algorithm"],
            ignore_keys=["variantcaller", "realign", "recalibrate", "phasing", "svcaller"])
        item["rgnames"] = prep_rg_names(item, config, fc_name, fc_date)
        item["test_run"] = global_config.get("test_run", False)
        run_details.append(item)
    _check_sample_config(run_details, run_info_yaml)
    return run_details
def organize(dirs, config, run_info_yaml, sample_names=None, add_provenance=True,
             integrations=None):
    """Assemble per-sample run information from an input sample YAML file.

    Samples are read with `_run_info_from_yaml`. Every integration whose name
    appears in `config` then adds its remotes to the samples. Each sample is
    given `dirs`, a two-element `name`, remote resources, a merged `config`,
    reference resources and the `qc` algorithm setting. The list is passed
    through `_add_provenance` (with `add_provenance` as its last argument)
    before being returned.

    sample_names is a list of samples to include from the overall file, for
    cases where we are running multiple pipelines from the same configuration
    file.
    """
    from bcbio.pipeline import qcsummary
    integrations = {} if integrations is None else integrations
    logger.info("Using input YAML configuration: %s" % run_info_yaml)
    assert run_info_yaml and os.path.exists(run_info_yaml), \
        "Did not find input sample YAML file: %s" % run_info_yaml
    samples = _run_info_from_yaml(dirs, run_info_yaml, config, sample_names,
                                  integrations=integrations)
    remote_retriever = None
    for iname, retriever in integrations.items():
        if iname not in config:
            continue
        samples = retriever.add_remotes(samples, config[iname])
        remote_retriever = retriever
    prepared = []
    for sample in samples:
        sample["dirs"] = dirs
        # Normalize name into a [prefix, description] pair
        if "name" not in sample:
            sample["name"] = ["", sample["description"]]
        elif isinstance(sample["name"], basestring):
            full_name = "%s-%s" % (sample["name"], clean_name(sample["description"]))
            sample["name"] = [sample["name"], full_name]
            sample["description"] = full_name
        # Fold algorithm details into the configuration so they are only
        # specified in one place
        sample["resources"] = _add_remote_resources(sample["resources"])
        sample["config"] = config_utils.update_w_custom(config, sample)
        sample.pop("algorithm", None)
        sample = add_reference_resources(sample, remote_retriever)
        sample["config"]["algorithm"]["qc"] = qcsummary.get_qc_tools(sample)
        tmp_dir = tz.get_in(["config", "resources", "tmp", "dir"], sample)
        if tmp_dir:
            # Without environment variables we can create and normalize the
            # directory now; otherwise it is normalized later in
            # distributed.transaction
            expanded = os.path.expandvars(tmp_dir)
            if expanded == tmp_dir:
                tmp_dir = genome.abs_file_paths(utils.safe_makedir(expanded),
                                                do_download=not integrations)
            sample["config"]["resources"]["tmp"]["dir"] = tmp_dir
        prepared.append(sample)
    return _add_provenance(prepared, dirs, config, add_provenance)
def organize(dirs, config, run_info_yaml, sample_names):
    """Assemble per-sample run information from an input sample YAML file.

    Samples are read with `_run_info_from_yaml` using the flowcell directory
    from `dirs`. Each one is given `dirs`, a two-element `name`, remote
    resources, a merged `config` and reference resources. The list is passed
    through `_add_provenance` before being returned.

    sample_names is a list of samples to include from the overall file, for
    cases where we are running multiple pipelines from the same configuration
    file.
    """
    logger.info("Using input YAML configuration: %s" % run_info_yaml)
    assert run_info_yaml and os.path.exists(run_info_yaml), \
        "Did not find input sample YAML file: %s" % run_info_yaml
    run_details = _run_info_from_yaml(dirs["flowcell"], run_info_yaml, config,
                                      sample_names)
    prepared = []
    for cur in run_details:
        cur["dirs"] = dirs
        if "name" in cur:
            if isinstance(cur["name"], basestring):
                combined = "%s-%s" % (cur["name"], clean_name(cur["description"]))
                cur["name"] = [cur["name"], combined]
                cur["description"] = combined
        else:
            cur["name"] = ["", cur["description"]]
        # Algorithm details live in the merged config; drop the duplicate
        cur["resources"] = _add_remote_resources(cur["resources"])
        cur["config"] = config_utils.update_w_custom(config, cur)
        cur.pop("algorithm", None)
        cur = add_reference_resources(cur)
        # Temporary directory: expand environment variables, create it and
        # store the absolute path back in the configuration
        tmp_dir = tz.get_in(["config", "resources", "tmp", "dir"], cur)
        if tmp_dir:
            created = utils.safe_makedir(os.path.expandvars(tmp_dir))
            cur["config"]["resources"]["tmp"]["dir"] = genome.abs_file_paths(created)
        prepared.append(cur)
    prepared = _add_provenance(prepared, dirs, config)
    return prepared
def _run_info_from_yaml(dirs, run_info_yaml, config, sample_names=None):
    """Read run information from a passed YAML file.

    dirs -- dictionary of directories; `flowcell` and `work` are read here.
    run_info_yaml -- path to the sample YAML file. The top level is either a
      list of samples or a dictionary with the samples under `details`.
    config -- configuration passed on to `prep_rg_names`,
      `vcfutils.bgzip_and_index` and `_check_sample_config`.
    sample_names -- when given, only samples whose `description` is in this
      collection are kept.

    Returns the list of per-sample dictionaries after filling in lane,
    description, upload, algorithm, genome_build, rgnames, files, vrn_file
    and resources, and checking them with `_check_sample_config`.

    Raises ValueError if a sample has no `description` and is not a BAM input
    (as decided by `_item_is_bam`).
    """
    # NOTE(review): yaml.load without an explicit safe loader can construct
    # arbitrary Python objects. Consider yaml.safe_load if sample files can
    # come from untrusted sources; left unchanged to preserve behavior.
    with open(run_info_yaml) as in_handle:
        loaded = yaml.load(in_handle)
    fc_name, fc_date = None, None
    if dirs.get("flowcell"):
        try:
            fc_name, fc_date = flowcell.parse_dirname(dirs.get("flowcell"))
        except ValueError:
            # Best effort: the directory name did not parse as a flowcell
            pass
    global_config = {}
    global_vars = {}
    resources = {}
    if isinstance(loaded, dict):
        global_config = copy.deepcopy(loaded)
        del global_config["details"]
        # Explicit flowcell details in the file override the directory name
        if "fc_name" in loaded and "fc_date" in loaded:
            fc_name = loaded["fc_name"].replace(" ", "_")
            fc_date = str(loaded["fc_date"]).replace(" ", "_")
        global_vars = global_config.pop("globals", {})
        # An empty `resources:` key loads as None; treat it as no resources
        resources = global_config.pop("resources", {}) or {}
        loaded = loaded["details"]
    if sample_names:
        loaded = [x for x in loaded if x["description"] in sample_names]
    run_details = []
    for i, item in enumerate(loaded):
        item = _normalize_files(item, dirs.get("flowcell"))
        # Default the lane to the 1-based position in the file
        if "lane" not in item:
            item["lane"] = str(i + 1)
        item["lane"] = _clean_characters(str(item["lane"]))
        if "description" not in item:
            if _item_is_bam(item):
                item["description"] = get_sample_name(item["files"][0])
            else:
                raise ValueError("No `description` sample name provided for input #%s" % (i + 1))
        item["description"] = _clean_characters(str(item["description"]))
        if "upload" not in item:
            upload = global_config.get("upload", {})
            # Handle specifying a local directory directly in upload
            if isinstance(upload, basestring):
                upload = {"dir": upload}
            elif isinstance(upload, dict):
                # Give every sample its own copy so samples do not alias, and
                # repeatedly mutate, the single global upload dictionary
                upload = copy.deepcopy(upload)
            if fc_name and fc_date:
                upload["fc_name"] = fc_name
                upload["fc_date"] = fc_date
            upload["run_id"] = ""
            if upload.get("dir"):
                upload["dir"] = _file_to_abs(upload["dir"], [dirs.get("work")], makedir=True)
            item["upload"] = upload
        item["algorithm"] = _replace_global_vars(item["algorithm"], global_vars)
        item["algorithm"] = genome.abs_file_paths(item["algorithm"],
                                                  ignore_keys=ALGORITHM_NOPATH_KEYS)
        item["genome_build"] = str(item.get("genome_build", ""))
        item["algorithm"] = _add_algorithm_defaults(item["algorithm"])
        item["rgnames"] = prep_rg_names(item, config, fc_name, fc_date)
        if item.get("files"):
            item["files"] = [genome.abs_file_paths(f) for f in item["files"]]
        elif "files" in item:
            # Drop an empty files entry entirely
            del item["files"]
        if item.get("vrn_file") and isinstance(item["vrn_file"], basestring):
            inputs_dir = utils.safe_makedir(os.path.join(dirs.get("work", os.getcwd()), "inputs"))
            item["vrn_file"] = vcfutils.bgzip_and_index(genome.abs_file_paths(item["vrn_file"]), config,
                                                        remove_orig=False, out_dir=inputs_dir)
        item = _clean_metadata(item)
        item = _clean_algorithm(item)
        # Add any global resource specifications
        if "resources" not in item:
            item["resources"] = {}
        # items() works on both Python 2 and 3, unlike iteritems()
        for prog, pkvs in resources.items():
            if prog not in item["resources"]:
                item["resources"][prog] = {}
            # A program listed without settings loads as None
            if pkvs is not None:
                for key, val in pkvs.items():
                    item["resources"][prog][key] = val
        run_details.append(item)
    _check_sample_config(run_details, run_info_yaml, config)
    return run_details
def _run_info_from_yaml(dirs, run_info_yaml, config, sample_names=None, integrations=None):
    """Read run information from a passed YAML file.

    dirs -- dictionary of directories; `flowcell` and `work` are read here.
    run_info_yaml -- path to the sample YAML file. The top level is either a
      list of samples or a dictionary with the samples under `details`.
    config -- configuration; integration settings are looked up in it by
      name, and it is passed on to `prep_rg_names`,
      `vcfutils.bgzip_and_index` and `_check_sample_config`.
    sample_names -- when given, only samples whose `description` is in this
      collection are kept.
    integrations -- optional mapping of integration name to retriever. The
      default None is treated as an empty mapping.

    Returns the list of per-sample dictionaries after filling in lane,
    description, upload, algorithm, genome_build, metadata, rgnames, files,
    vrn_file, resources and integration settings, and checking them with
    `_check_sample_config`.

    Raises ValueError if a sample has no `description` and is not a BAM input
    (as decided by `_item_is_bam`).
    """
    # The default of None previously failed on integrations.values() below
    integrations = integrations or {}
    validate_yaml(run_info_yaml, run_info_yaml)
    # NOTE(review): yaml.load without an explicit safe loader can construct
    # arbitrary Python objects. Consider yaml.safe_load if sample files can
    # come from untrusted sources; left unchanged to preserve behavior.
    with open(run_info_yaml) as in_handle:
        loaded = yaml.load(in_handle)
    fc_name, fc_date = None, None
    if dirs.get("flowcell"):
        try:
            fc_name, fc_date = flowcell.parse_dirname(dirs.get("flowcell"))
        except ValueError:
            # Best effort: the directory name did not parse as a flowcell
            pass
    global_config = {}
    global_vars = {}
    resources = {}
    integration_config = {}
    if isinstance(loaded, dict):
        global_config = copy.deepcopy(loaded)
        del global_config["details"]
        # Explicit flowcell details in the file override the directory name
        if "fc_name" in loaded:
            fc_name = loaded["fc_name"].replace(" ", "_")
        if "fc_date" in loaded:
            fc_date = str(loaded["fc_date"]).replace(" ", "_")
        global_vars = global_config.pop("globals", {})
        # An empty `resources:` key loads as None; treat it as no resources
        resources = global_config.pop("resources", {}) or {}
        for iname in ["arvados"]:
            integration_config[iname] = global_config.pop(iname, {})
        loaded = loaded["details"]
    if sample_names:
        loaded = [x for x in loaded if x["description"] in sample_names]
    for iname, retriever in integrations.items():
        if iname in config:
            loaded = retriever.add_remotes(loaded, config[iname])
    # Same value for every sample, so compute it once outside the loop
    do_download = all(not x for x in integrations.values())
    run_details = []
    for i, item in enumerate(loaded):
        item = _normalize_files(item, dirs.get("flowcell"))
        # Default the lane to the 1-based position in the file
        if "lane" not in item:
            item["lane"] = str(i + 1)
        item["lane"] = _clean_characters(str(item["lane"]))
        if "description" not in item:
            if _item_is_bam(item):
                item["description"] = get_sample_name(item["files"][0])
            else:
                raise ValueError("No `description` sample name provided for input #%s" % (i + 1))
        item["description"] = _clean_characters(str(item["description"]))
        if "upload" not in item:
            upload = global_config.get("upload", {})
            # Handle specifying a local directory directly in upload
            if isinstance(upload, basestring):
                upload = {"dir": upload}
            elif isinstance(upload, dict):
                # Give every sample its own copy so samples do not alias, and
                # repeatedly mutate, the single global upload dictionary
                upload = copy.deepcopy(upload)
            if fc_name:
                upload["fc_name"] = fc_name
            if fc_date:
                upload["fc_date"] = fc_date
            upload["run_id"] = ""
            if upload.get("dir"):
                upload["dir"] = _file_to_abs(upload["dir"], [dirs.get("work")], makedir=True)
            item["upload"] = upload
        item["algorithm"] = _replace_global_vars(item["algorithm"], global_vars)
        item["algorithm"] = genome.abs_file_paths(item["algorithm"],
                                                  ignore_keys=ALGORITHM_NOPATH_KEYS,
                                                  fileonly_keys=ALGORITHM_FILEONLY_KEYS,
                                                  do_download=do_download)
        item["genome_build"] = str(item.get("genome_build", ""))
        item["algorithm"] = _add_algorithm_defaults(item["algorithm"])
        item["metadata"] = add_metadata_defaults(item.get("metadata", {}))
        item["rgnames"] = prep_rg_names(item, config, fc_name, fc_date)
        if item.get("files"):
            item["files"] = [genome.abs_file_paths(f, do_download=do_download)
                             for f in item["files"]]
        elif "files" in item:
            # Drop an empty files entry entirely
            del item["files"]
        if item.get("vrn_file") and isinstance(item["vrn_file"], basestring):
            inputs_dir = utils.safe_makedir(os.path.join(dirs.get("work", os.getcwd()), "inputs",
                                                         item["description"]))
            item["vrn_file"] = vcfutils.bgzip_and_index(
                genome.abs_file_paths(item["vrn_file"], do_download=do_download),
                config, remove_orig=False, out_dir=inputs_dir)
        item = _clean_metadata(item)
        item = _clean_algorithm(item)
        # Add any global resource specifications
        if "resources" not in item:
            item["resources"] = {}
        for prog, pkvs in resources.items():
            if prog not in item["resources"]:
                item["resources"][prog] = {}
            # A program listed without settings loads as None
            if pkvs is not None:
                for key, val in pkvs.items():
                    item["resources"][prog][key] = val
        # Copy non-empty global integration settings onto the sample
        for iname, ivals in integration_config.items():
            if ivals:
                if iname not in item:
                    item[iname] = {}
                for k, v in ivals.items():
                    item[iname][k] = v
        run_details.append(item)
    _check_sample_config(run_details, run_info_yaml, config)
    return run_details
def _run_info_from_yaml(dirs, run_info_yaml, config, sample_names=None,
                        is_cwl=False, integrations=None):
    """Read run information from a passed YAML file.

    dirs -- dictionary of directories; `flowcell` and `work` are read here.
    run_info_yaml -- path to the sample YAML file. The top level is either a
      list of samples or a dictionary with the samples under `details`.
    config -- configuration; integration settings are looked up in it by
      name (and replaced by the retriever's `set_cache` result), and it is
      passed on to `prep_rg_names`, `vcfutils.bgzip_and_index` and
      `_check_sample_config`.
    sample_names -- when given, only samples whose `description` is in this
      collection are kept.
    is_cwl -- passed through to `_add_algorithm_defaults`.
    integrations -- optional mapping of integration name to retriever. The
      default None is treated as an empty mapping.

    Returns the list of per-sample dictionaries after filling in lane,
    description, upload, algorithm, genome_build, metadata, rgnames, files,
    vrn_file, resources and integration settings, and checking them with
    `_check_sample_config`.

    Raises ValueError if a sample has no `description` and is not a BAM input
    (as decided by `_item_is_bam`), or if a string `vrn_file` input has
    `algorithm.validate` set but no `metadata.batch`.
    """
    # The default of None previously failed on integrations.values() below
    integrations = integrations or {}
    validate_yaml(run_info_yaml, run_info_yaml)
    # NOTE(review): yaml.load without an explicit safe loader can construct
    # arbitrary Python objects. Consider yaml.safe_load if sample files can
    # come from untrusted sources; left unchanged to preserve behavior.
    with open(run_info_yaml) as in_handle:
        loaded = yaml.load(in_handle)
    fc_name, fc_date = None, None
    if dirs.get("flowcell"):
        try:
            fc_name, fc_date = flowcell.parse_dirname(dirs.get("flowcell"))
        except ValueError:
            # Best effort: the directory name did not parse as a flowcell
            pass
    global_config = {}
    global_vars = {}
    resources = {}
    integration_config = {}
    if isinstance(loaded, dict):
        global_config = copy.deepcopy(loaded)
        del global_config["details"]
        # Explicit flowcell details in the file override the directory name
        if "fc_name" in loaded:
            fc_name = loaded["fc_name"].replace(" ", "_")
        if "fc_date" in loaded:
            fc_date = str(loaded["fc_date"]).replace(" ", "_")
        global_vars = global_config.pop("globals", {})
        # An empty `resources:` key loads as None; treat it as no resources
        resources = global_config.pop("resources", {}) or {}
        for iname in ["arvados"]:
            integration_config[iname] = global_config.pop(iname, {})
        loaded = loaded["details"]
    if sample_names:
        loaded = [x for x in loaded if x["description"] in sample_names]
    for iname, retriever in integrations.items():
        if iname in config:
            config[iname] = retriever.set_cache(config[iname])
            loaded = retriever.add_remotes(loaded, config[iname])
    # Same value for every sample, so compute it once outside the loop
    do_download = all(not x for x in integrations.values())
    run_details = []
    for i, item in enumerate(loaded):
        item = _normalize_files(item, dirs.get("flowcell"))
        # Default the lane to the 1-based position in the file
        if "lane" not in item:
            item["lane"] = str(i + 1)
        item["lane"] = _clean_characters(str(item["lane"]))
        if "description" not in item:
            if _item_is_bam(item):
                item["description"] = get_sample_name(item["files"][0])
            else:
                raise ValueError("No `description` sample name provided for input #%s" % (i + 1))
        description = _clean_characters(str(item["description"]))
        item["description"] = description
        # make names R safe if we are likely to use R downstream
        # .get matches the optional lookup of `analysis` further down, and
        # slicing avoids an IndexError on an empty description
        if item.get("analysis", "").lower() in R_DOWNSTREAM_ANALYSIS:
            if description[:1].isdigit():
                valid = "X" + description
                logger.info("%s is not a valid R name, converting to %s." % (description, valid))
                item["description"] = valid
        if "upload" not in item:
            upload = global_config.get("upload", {})
            # Handle specifying a local directory directly in upload
            if isinstance(upload, basestring):
                upload = {"dir": upload}
            elif isinstance(upload, dict):
                # Give every sample its own copy so samples do not alias, and
                # repeatedly mutate, the single global upload dictionary
                upload = copy.deepcopy(upload)
            # Default upload location when none is configured
            if not upload:
                upload["dir"] = "../final"
            if fc_name:
                upload["fc_name"] = fc_name
            if fc_date:
                upload["fc_date"] = fc_date
            upload["run_id"] = ""
            if upload.get("dir"):
                upload["dir"] = _file_to_abs(upload["dir"], [dirs.get("work")], makedir=True)
            item["upload"] = upload
        item["algorithm"] = _replace_global_vars(item["algorithm"], global_vars)
        item["algorithm"] = genome.abs_file_paths(item["algorithm"],
                                                  ignore_keys=ALGORITHM_NOPATH_KEYS,
                                                  fileonly_keys=ALGORITHM_FILEONLY_KEYS,
                                                  do_download=do_download)
        item["genome_build"] = str(item.get("genome_build", ""))
        item["algorithm"] = _add_algorithm_defaults(item["algorithm"], item.get("analysis", ""), is_cwl)
        item["metadata"] = add_metadata_defaults(item.get("metadata", {}))
        item["rgnames"] = prep_rg_names(item, config, fc_name, fc_date)
        if item.get("files"):
            item["files"] = [genome.abs_file_paths(f, do_download=do_download)
                             for f in item["files"]]
        elif "files" in item:
            # Drop an empty files entry entirely
            del item["files"]
        if item.get("vrn_file") and isinstance(item["vrn_file"], basestring):
            inputs_dir = utils.safe_makedir(os.path.join(dirs.get("work", os.getcwd()), "inputs",
                                                         item["description"]))
            item["vrn_file"] = genome.abs_file_paths(item["vrn_file"], do_download=do_download)
            if os.path.isfile(item["vrn_file"]):
                # Try to prepare in place (or use ready to go inputs)
                try:
                    item["vrn_file"] = vcfutils.bgzip_and_index(item["vrn_file"], config,
                                                                remove_orig=False)
                # In case of permission errors, fix in inputs directory
                except IOError:
                    item["vrn_file"] = vcfutils.bgzip_and_index(item["vrn_file"], config,
                                                                remove_orig=False, out_dir=inputs_dir)
            if not tz.get_in(("metadata", "batch"), item) and tz.get_in(["algorithm", "validate"], item):
                raise ValueError("%s: Please specify a metadata batch for variant file (vrn_file) input.\n" %
                                 (item["description"]) +
                                 "Batching with a standard sample provides callable regions for validation.")
        item = _clean_metadata(item)
        item = _clean_algorithm(item)
        item = _clean_background(item)
        # Add any global resource specifications
        if "resources" not in item:
            item["resources"] = {}
        for prog, pkvs in resources.items():
            if prog not in item["resources"]:
                item["resources"][prog] = {}
            # A program listed without settings loads as None
            if pkvs is not None:
                for key, val in pkvs.items():
                    item["resources"][prog][key] = val
        # Copy non-empty global integration settings onto the sample
        for iname, ivals in integration_config.items():
            if ivals:
                if iname not in item:
                    item[iname] = {}
                for k, v in ivals.items():
                    item[iname][k] = v
        run_details.append(item)
    _check_sample_config(run_details, run_info_yaml, config)
    return run_details
def _run_info_from_yaml(dirs, run_info_yaml, config, sample_names=None):
    """Read run information from a passed YAML file.

    dirs -- dictionary of directories; `flowcell` and `work` are read here.
    run_info_yaml -- path to the sample YAML file. The top level is either a
      list of samples or a dictionary with the samples under `details`.
    config -- configuration passed on to `prep_rg_names`,
      `vcfutils.bgzip_and_index` and `_check_sample_config`.
    sample_names -- when given, only samples whose `description` is in this
      collection are kept.

    Returns the list of per-sample dictionaries after filling in lane,
    description, upload, algorithm, genome_build, rgnames, test_run, files,
    vrn_file and resources, and checking them with `_check_sample_config`.

    Raises ValueError if a sample has no `description` and is not a BAM input
    (as decided by `_item_is_bam`).
    """
    # NOTE(review): yaml.load without an explicit safe loader can construct
    # arbitrary Python objects. Consider yaml.safe_load if sample files can
    # come from untrusted sources; left unchanged to preserve behavior.
    with open(run_info_yaml) as in_handle:
        loaded = yaml.load(in_handle)
    fc_name, fc_date = None, None
    if dirs.get("flowcell"):
        try:
            fc_name, fc_date = flowcell.parse_dirname(dirs.get("flowcell"))
        except ValueError:
            # Best effort: the directory name did not parse as a flowcell
            pass
    global_config = {}
    global_vars = {}
    resources = {}
    if isinstance(loaded, dict):
        global_config = copy.deepcopy(loaded)
        del global_config["details"]
        # Explicit flowcell details in the file override the directory name
        if "fc_name" in loaded and "fc_date" in loaded:
            fc_name = loaded["fc_name"].replace(" ", "_")
            fc_date = str(loaded["fc_date"]).replace(" ", "_")
        global_vars = global_config.pop("globals", {})
        # An empty `resources:` key loads as None; treat it as no resources
        resources = global_config.pop("resources", {}) or {}
        loaded = loaded["details"]
    if sample_names:
        loaded = [x for x in loaded if x["description"] in sample_names]
    run_details = []
    for i, item in enumerate(loaded):
        item = _normalize_files(item, dirs.get("flowcell"))
        # Default the lane to the 1-based position in the file
        if "lane" not in item:
            item["lane"] = str(i + 1)
        item["lane"] = _clean_characters(str(item["lane"]))
        if "description" not in item:
            if _item_is_bam(item):
                item["description"] = get_sample_name(item["files"][0])
            else:
                raise ValueError("No `description` sample name provided for input #%s" % (i + 1))
        item["description"] = _clean_characters(str(item["description"]))
        if "upload" not in item:
            upload = global_config.get("upload", {})
            # Handle specifying a local directory directly in upload
            if isinstance(upload, basestring):
                upload = {"dir": upload}
            elif isinstance(upload, dict):
                # Give every sample its own copy so samples do not alias, and
                # repeatedly mutate, the single global upload dictionary
                upload = copy.deepcopy(upload)
            if fc_name and fc_date:
                upload["fc_name"] = fc_name
                upload["fc_date"] = fc_date
            upload["run_id"] = ""
            if upload.get("dir"):
                upload["dir"] = _file_to_abs(upload["dir"], [dirs.get("work")], makedir=True)
            item["upload"] = upload
        item["algorithm"] = _replace_global_vars(item["algorithm"], global_vars)
        item["algorithm"] = genome.abs_file_paths(item["algorithm"],
                                                  ignore_keys=ALGORITHM_NOPATH_KEYS)
        item["genome_build"] = str(item.get("genome_build", ""))
        item["algorithm"] = _add_algorithm_defaults(item["algorithm"])
        item["rgnames"] = prep_rg_names(item, config, fc_name, fc_date)
        item["test_run"] = global_config.get("test_run", False)
        if item.get("files"):
            item["files"] = [genome.abs_file_paths(f) for f in item["files"]]
        elif "files" in item:
            # Drop an empty files entry entirely
            del item["files"]
        if item.get("vrn_file") and isinstance(item["vrn_file"], basestring):
            item["vrn_file"] = vcfutils.bgzip_and_index(genome.abs_file_paths(item["vrn_file"]), config)
        item = _clean_metadata(item)
        item = _clean_algorithm(item)
        # Add any global resource specifications
        if "resources" not in item:
            item["resources"] = {}
        # items() works on both Python 2 and 3, unlike iteritems()
        for prog, pkvs in resources.items():
            if prog not in item["resources"]:
                item["resources"][prog] = {}
            # A program listed without settings loads as None
            if pkvs is not None:
                for key, val in pkvs.items():
                    item["resources"][prog][key] = val
        run_details.append(item)
    _check_sample_config(run_details, run_info_yaml, config)
    return run_details