Пример #1
0
def organize(dirs, config, run_info_yaml, sample_names):
    """Assemble per-sample run information from an input sample YAML file.

    Samples are read with `_run_info_from_yaml`. Each one then gets the
    directories attached, its `name` normalized to a two item list, its
    settings merged into a `config` entry, reference resources added and its
    temporary directory prepared when one is configured.

    dirs -- directory settings, stored on every sample under "dirs".
    config -- base configuration merged with each sample by
      `config_utils.update_w_custom`.
    run_info_yaml -- path to the sample YAML file; it must exist.
    sample_names -- list of samples to include from the overall file, for cases
      where multiple pipelines run from the same configuration file.

    Returns the prepared samples after they pass through `_add_provenance`.
    """
    logger.info("Using input YAML configuration: %s" % run_info_yaml)
    yaml_found = run_info_yaml and os.path.exists(run_info_yaml)
    assert yaml_found, "Did not find input sample YAML file: %s" % run_info_yaml
    samples = []
    for sample in _run_info_from_yaml(dirs, run_info_yaml, config, sample_names):
        sample["dirs"] = dirs
        if "name" in sample:
            if isinstance(sample["name"], basestring):
                # A plain string name becomes [name, "name-description"]
                combined = "%s-%s" % (sample["name"], clean_name(sample["description"]))
                sample["name"] = [sample["name"], combined]
                sample["description"] = combined
        else:
            sample["name"] = ["", sample["description"]]
        # Fold algorithm details into the configuration so they are specified once
        sample["resources"] = _add_remote_resources(sample["resources"])
        sample["config"] = config_utils.update_w_custom(config, sample)
        sample.pop("algorithm", None)
        sample = add_reference_resources(sample)
        # Expand environmental variables, create the temporary directory and
        # store its absolute location
        configured_tmp = tz.get_in(["config", "resources", "tmp", "dir"], sample)
        if configured_tmp:
            created_tmp = utils.safe_makedir(os.path.expandvars(configured_tmp))
            sample["config"]["resources"]["tmp"]["dir"] = genome.abs_file_paths(created_tmp)
        samples.append(sample)
    return _add_provenance(samples, dirs, config)
Пример #2
0
def organize(dirs, config, run_info_yaml):
    """Assemble per-sample run information from an input sample YAML file.

    Samples are read with `_run_info_from_yaml` using the "flowcell" entry of
    `dirs`. Each sample gets a merged `config`, the directories, a `name`
    normalized to a two item list, reference resources and, when configured,
    a created temporary directory with absolute paths.

    dirs -- directory settings; must contain "flowcell".
    config -- base configuration merged with each sample by
      `config_utils.update_w_custom`.
    run_info_yaml -- path to the sample YAML file; it must exist.

    Returns the list of prepared samples.
    """
    logger.info("Using input YAML configuration: %s" % run_info_yaml)
    yaml_found = run_info_yaml and os.path.exists(run_info_yaml)
    assert yaml_found, "Did not find input sample YAML file: %s" % run_info_yaml
    tmp_key = ("config", "resources", "tmp")
    prepared = []
    for sample in _run_info_from_yaml(dirs["flowcell"], run_info_yaml, config):
        # Fold algorithm details into the configuration so they are specified once
        sample["config"] = config_utils.update_w_custom(config, sample)
        sample.pop("algorithm", None)
        sample["dirs"] = dirs
        if "name" in sample:
            if isinstance(sample["name"], basestring):
                # A plain string name becomes [name, "name-description"]
                combined = "%s-%s" % (sample["name"], clean_name(sample["description"]))
                sample["name"] = [sample["name"], combined]
                sample["description"] = combined
        else:
            sample["name"] = ["", sample["description"]]
        sample = add_reference_resources(sample)
        # Create the temporary directory, then make the tmp settings absolute
        tmp_dir = utils.get_in(sample, tmp_key + ("dir",))
        if tmp_dir:
            utils.safe_makedir(tmp_dir)
            tmp_settings = utils.get_in(sample, tmp_key)
            sample["config"]["resources"]["tmp"] = genome.abs_file_paths(tmp_settings)
        prepared.append(sample)
    return prepared
Пример #3
0
def organize(dirs, config, run_info_yaml):
    """Build the high level sample structure from an input sample YAML file.

    Reads samples through `_run_info_from_yaml` (given `dirs["flowcell"]`) and
    prepares each one: merged `config`, attached directories, `name` as a two
    item list, reference resources, and a created temporary directory with
    absolute paths when one is configured.

    dirs -- directory settings; must contain "flowcell".
    config -- base configuration merged with each sample by
      `config_utils.update_w_custom`.
    run_info_yaml -- path to the sample YAML file; it must exist.

    Returns the list of prepared samples.
    """
    logger.info("Using input YAML configuration: %s" % run_info_yaml)
    input_exists = run_info_yaml and os.path.exists(run_info_yaml)
    assert input_exists, \
        "Did not find input sample YAML file: %s" % run_info_yaml
    tmp_path = ("config", "resources", "tmp")
    tmp_dir_path = tmp_path + ("dir",)
    results = []
    for entry in _run_info_from_yaml(dirs["flowcell"], run_info_yaml, config):
        # Merge algorithm details into the configuration, then drop the
        # duplicate top level copy
        entry["config"] = config_utils.update_w_custom(config, entry)
        entry.pop("algorithm", None)
        entry["dirs"] = dirs
        if "name" in entry:
            if isinstance(entry["name"], basestring):
                # A plain string name becomes [name, "name-description"]
                label = "%s-%s" % (entry["name"], clean_name(entry["description"]))
                entry["name"] = [entry["name"], label]
                entry["description"] = label
        else:
            entry["name"] = ["", entry["description"]]
        entry = add_reference_resources(entry)
        # Create the temporary directory, then make the tmp settings absolute
        wanted_tmp = utils.get_in(entry, tmp_dir_path)
        if wanted_tmp:
            utils.safe_makedir(wanted_tmp)
            entry["config"]["resources"]["tmp"] = genome.abs_file_paths(
                utils.get_in(entry, tmp_path))
        results.append(entry)
    return results
Пример #4
0
def organize(dirs,
             config,
             run_info_yaml,
             sample_names=None,
             is_cwl=False,
             integrations=None):
    """Assemble per-sample run information from an input sample YAML file.

    Samples are read with `_run_info_from_yaml`, passed through the
    `add_remotes` method of every integration named in `config`, and then
    prepared one by one: directories, normalized `name`, merged `config`,
    reference resources, QC tools, vcfanno annotations and temporary directory.

    dirs -- directory settings, stored on every sample under "dirs".
    config -- base configuration; also looked up by integration name.
    run_info_yaml -- path to the sample YAML file; it must exist.
    sample_names -- list of samples to include from the overall file, for cases
      where multiple pipelines run from the same configuration file.
    is_cwl -- forwarded to `_run_info_from_yaml`; its negation is handed to
      `_add_provenance`.
    integrations -- mapping of integration name to retriever; None means none.

    Returns the prepared samples after they pass through `_add_provenance`.
    """
    from bcbio.pipeline import qcsummary
    if integrations is None:
        integrations = {}
    logger.info("Using input YAML configuration: %s" % run_info_yaml)
    yaml_found = run_info_yaml and os.path.exists(run_info_yaml)
    assert yaml_found, "Did not find input sample YAML file: %s" % run_info_yaml
    run_details = _run_info_from_yaml(dirs, run_info_yaml, config, sample_names,
                                      is_cwl=is_cwl, integrations=integrations)
    # The last integration present in the configuration is kept as the retriever
    remote_retriever = None
    for integration_name, candidate in integrations.items():
        if integration_name not in config:
            continue
        run_details = candidate.add_remotes(run_details, config[integration_name])
        remote_retriever = candidate
    samples = []
    for sample in run_details:
        sample["dirs"] = dirs
        if "name" in sample:
            if isinstance(sample["name"], basestring):
                # A plain string name becomes [name, "name-description"]
                combined = "%s-%s" % (sample["name"], clean_name(sample["description"]))
                sample["name"] = [sample["name"], combined]
                sample["description"] = combined
        else:
            sample["name"] = ["", sample["description"]]
        # Fold algorithm details into the configuration so they are specified once
        sample["resources"] = _add_remote_resources(sample["resources"])
        sample["config"] = config_utils.update_w_custom(config, sample)
        sample.pop("algorithm", None)
        sample = add_reference_resources(sample, remote_retriever)
        algorithm = sample["config"]["algorithm"]
        algorithm["qc"] = qcsummary.get_qc_tools(sample)
        algorithm["vcfanno"] = vcfanno.find_annotations(sample)
        # Prepare the temporary directory
        tmp_dir = tz.get_in(["config", "resources", "tmp", "dir"], sample)
        if tmp_dir:
            # Without environmental variables the directory is made and
            # normalized here; otherwise normalization happens later in
            # distributed.transaction
            expanded = os.path.expandvars(tmp_dir)
            if expanded == tmp_dir:
                tmp_dir = genome.abs_file_paths(utils.safe_makedir(expanded),
                                                do_download=not integrations)
            sample["config"]["resources"]["tmp"]["dir"] = tmp_dir
        samples.append(sample)
    return _add_provenance(samples, dirs, config, not is_cwl)
Пример #5
0
def _run_info_from_yaml(fc_dir, run_info_yaml, config):
    """Read run information from a passed YAML file.

    The YAML content is either a list of samples or a dictionary holding the
    samples under `details` next to global settings (`fc_name`, `fc_date`,
    `globals`, `upload`, `test_run`).

    fc_dir -- flowcell directory, or a false value when there is none; used to
      derive the flowcell name and date and passed to `_normalize_files`.
    run_info_yaml -- path of the YAML file to read.
    config -- configuration handed to `prep_rg_names`.

    Returns the list of sample dictionaries, after checking them with
    `_check_sample_config`.

    Raises ValueError when a sample has no `description` and is not recognized
    as BAM input by `_item_is_bam`.
    """
    with open(run_info_yaml) as in_handle:
        # NOTE(review): yaml.load without an explicit Loader can construct
        # arbitrary objects; prefer yaml.safe_load if this file may be untrusted.
        loaded = yaml.load(in_handle)
    fc_name, fc_date = None, None
    if fc_dir:
        try:
            fc_name, fc_date = flowcell.parse_dirname(fc_dir)
        except ValueError:
            # Directory name does not parse as a flowcell; leave name and date unset
            pass
    global_config = {}
    global_vars = {}
    if isinstance(loaded, dict):
        global_config = copy.deepcopy(loaded)
        del global_config["details"]
        # Flowcell information in the file overrides what the directory provided
        if "fc_name" in loaded and "fc_date" in loaded:
            fc_name = loaded["fc_name"].replace(" ", "_")
            fc_date = str(loaded["fc_date"]).replace(" ", "_")
        global_vars = global_config.pop("globals", {})
        loaded = loaded["details"]

    run_details = []
    for i, item in enumerate(loaded):
        item = _normalize_files(item, fc_dir)
        # Lanes default to the 1-based position of the sample in the file
        if "lane" not in item:
            item["lane"] = str(i + 1)
        item["lane"] = _clean_characters(str(item["lane"]))
        if "description" not in item:
            if _item_is_bam(item):
                item["description"] = get_sample_name(item["files"][0])
            else:
                raise ValueError(
                    "No `description` sample name provided for input #%s" %
                    (i + 1))
        item["description"] = _clean_characters(str(item["description"]))
        if "upload" not in item:
            # Copy so every sample owns its upload settings instead of sharing
            # (and mutating) the single object stored in the global configuration
            upload = copy.deepcopy(global_config.get("upload", {}))
            # Handle specifying a local directory directly in upload
            if isinstance(upload, basestring):
                upload = {"dir": upload}
            if fc_name and fc_date:
                upload["fc_name"] = fc_name
                upload["fc_date"] = fc_date
            upload["run_id"] = ""
            item["upload"] = upload
        item["algorithm"] = _replace_global_vars(item["algorithm"],
                                                 global_vars)
        item["algorithm"] = genome.abs_file_paths(
            item["algorithm"], ignore_keys=ALGORITHM_NOPATH_KEYS)
        item["genome_build"] = str(item.get("genome_build", ""))
        item["algorithm"] = _add_algorithm_defaults(item["algorithm"])
        item["rgnames"] = prep_rg_names(item, config, fc_name, fc_date)
        item["test_run"] = global_config.get("test_run", False)
        item = _clean_metadata(item)
        run_details.append(item)
    _check_sample_config(run_details, run_info_yaml)
    return run_details
Пример #6
0
def _run_info_from_yaml(fc_dir, run_info_yaml, config):
    """Read run information from a passed YAML file.

    The YAML content is either a list of samples or a dictionary holding the
    samples under `details` next to global settings (`fc_name`, `fc_date`,
    `globals`, `upload`, `test_run`).

    fc_dir -- flowcell directory, or a false value when there is none; used to
      derive the flowcell name and date and passed to `_normalize_files`.
    run_info_yaml -- path of the YAML file to read.
    config -- configuration handed to `prep_rg_names`.

    Returns the list of sample dictionaries, after checking them with
    `_check_sample_config`.

    Raises ValueError when a sample has no `description` and is not recognized
    as BAM input by `_item_is_bam`.
    """
    with open(run_info_yaml) as in_handle:
        # NOTE(review): yaml.load without an explicit Loader can construct
        # arbitrary objects; prefer yaml.safe_load if this file may be untrusted.
        loaded = yaml.load(in_handle)
    fc_name, fc_date = None, None
    if fc_dir:
        try:
            fc_name, fc_date = flowcell.parse_dirname(fc_dir)
        except ValueError:
            # Directory name does not parse as a flowcell; leave name and date unset
            pass
    global_config = {}
    global_vars = {}
    if isinstance(loaded, dict):
        global_config = copy.deepcopy(loaded)
        del global_config["details"]
        # Flowcell information in the file overrides what the directory provided
        if "fc_name" in loaded and "fc_date" in loaded:
            fc_name = loaded["fc_name"].replace(" ", "_")
            fc_date = str(loaded["fc_date"]).replace(" ", "_")
        global_vars = global_config.pop("globals", {})
        loaded = loaded["details"]

    run_details = []
    for i, item in enumerate(loaded):
        item = _normalize_files(item, fc_dir)
        # Lanes default to the 1-based position of the sample in the file
        if "lane" not in item:
            item["lane"] = str(i + 1)
        item["lane"] = _clean_characters(str(item["lane"]))
        if "description" not in item:
            if _item_is_bam(item):
                item["description"] = get_sample_name(item["files"][0])
            else:
                raise ValueError("No `description` sample name provided for input #%s" % (i + 1))
        item["description"] = _clean_characters(str(item["description"]))
        if "upload" not in item:
            # Copy so every sample owns its upload settings instead of sharing
            # (and mutating) the single object stored in the global configuration
            upload = copy.deepcopy(global_config.get("upload", {}))
            # Handle specifying a local directory directly in upload
            if isinstance(upload, basestring):
                upload = {"dir": upload}
            if fc_name and fc_date:
                upload["fc_name"] = fc_name
                upload["fc_date"] = fc_date
            upload["run_id"] = ""
            item["upload"] = upload
        item["algorithm"] = _replace_global_vars(item["algorithm"], global_vars)
        item["algorithm"] = genome.abs_file_paths(item["algorithm"],
                                                  ignore_keys=ALGORITHM_NOPATH_KEYS)
        item["genome_build"] = str(item.get("genome_build", ""))
        item["algorithm"] = _add_algorithm_defaults(item["algorithm"])
        item["rgnames"] = prep_rg_names(item, config, fc_name, fc_date)
        item["test_run"] = global_config.get("test_run", False)
        item = _clean_metadata(item)
        item = _clean_algorithm(item)
        run_details.append(item)
    _check_sample_config(run_details, run_info_yaml)
    return run_details
Пример #7
0
def _run_info_from_yaml(fc_dir, run_info_yaml, config):
    """Read run information from a passed YAML file.

    The YAML content is either a list of samples or a dictionary holding the
    samples under `details` next to global settings (`fc_name`, `fc_date`,
    `upload`).

    fc_dir -- flowcell directory, or a false value when there is none; used to
      derive the flowcell name and date and passed to `_normalize_files`.
    run_info_yaml -- path of the YAML file to read.
    config -- configuration handed to `prep_rg_names`.

    Returns the list of sample dictionaries, after checking them with
    `_check_sample_config`.

    Raises ValueError when a sample has no `description` and its `files` are
    not a single entry ending in ".bam".
    """
    with open(run_info_yaml) as in_handle:
        # NOTE(review): yaml.load without an explicit Loader can construct
        # arbitrary objects; prefer yaml.safe_load if this file may be untrusted.
        loaded = yaml.load(in_handle)
    fc_name, fc_date = None, None
    if fc_dir:
        try:
            fc_name, fc_date = get_flowcell_info(fc_dir)
        except ValueError:
            # Flowcell details could not be derived; leave name and date unset
            pass
    global_config = {}
    if isinstance(loaded, dict):
        global_config = copy.deepcopy(loaded)
        del global_config["details"]
        # Flowcell information in the file overrides what the directory provided
        if "fc_name" in loaded and "fc_date" in loaded:
            fc_name = loaded["fc_name"].replace(" ", "_")
            fc_date = str(loaded["fc_date"]).replace(" ", "_")
        loaded = loaded["details"]
    run_details = []
    for i, item in enumerate(loaded):
        item = _normalize_files(item, fc_dir)
        # Lanes default to the 1-based position of the sample in the file
        if "lane" not in item:
            item["lane"] = str(i + 1)
        item["lane"] = _clean_characters(str(item["lane"]))
        if "description" not in item:
            # A single BAM input can provide the sample name itself
            if len(item.get("files",
                            [])) == 1 and item["files"][0].endswith(".bam"):
                item["description"] = get_sample_name(item["files"][0])
            else:
                raise ValueError(
                    "No `description` sample name provided for input #%s" %
                    (i + 1))
        item["description"] = _clean_characters(str(item["description"]))
        # Copy so every sample owns its upload settings instead of sharing
        # (and mutating) the single object stored in the global configuration
        upload = copy.deepcopy(global_config.get("upload", {}))
        if fc_name and fc_date:
            upload["fc_name"] = fc_name
            upload["fc_date"] = fc_date
        upload["run_id"] = ""
        item["upload"] = upload
        item["algorithm"] = genome.abs_file_paths(item["algorithm"],
                                                  ignore_keys=[
                                                      "variantcaller",
                                                      "realign", "recalibrate",
                                                      "phasing", "svcaller"
                                                  ])
        item["rgnames"] = prep_rg_names(item, config, fc_name, fc_date)
        run_details.append(item)
    _check_sample_config(run_details, run_info_yaml)
    return run_details
Пример #8
0
def _run_info_from_yaml(fc_dir, run_info_yaml, config):
    """Read run information from a passed YAML file.

    The YAML content is either a list of samples or a dictionary holding the
    samples under `details` next to global settings (`fc_name`, `fc_date`,
    `upload`, `test_run`).

    fc_dir -- flowcell directory, or a false value when there is none; used to
      derive the flowcell name and date and passed to `_normalize_files`.
    run_info_yaml -- path of the YAML file to read.
    config -- configuration handed to `prep_rg_names`.

    Returns the list of sample dictionaries, after checking them with
    `_check_sample_config`.

    Raises ValueError when a sample has no `description` and its `files` are
    not a single entry ending in ".bam".
    """
    with open(run_info_yaml) as in_handle:
        # NOTE(review): yaml.load without an explicit Loader can construct
        # arbitrary objects; prefer yaml.safe_load if this file may be untrusted.
        loaded = yaml.load(in_handle)
    fc_name, fc_date = None, None
    if fc_dir:
        try:
            fc_name, fc_date = get_flowcell_info(fc_dir)
        except ValueError:
            # Flowcell details could not be derived; leave name and date unset
            pass
    global_config = {}
    if isinstance(loaded, dict):
        global_config = copy.deepcopy(loaded)
        del global_config["details"]
        # Flowcell information in the file overrides what the directory provided
        if "fc_name" in loaded and "fc_date" in loaded:
            fc_name = loaded["fc_name"].replace(" ", "_")
            fc_date = str(loaded["fc_date"]).replace(" ", "_")
        loaded = loaded["details"]
    run_details = []
    for i, item in enumerate(loaded):
        item = _normalize_files(item, fc_dir)
        # Lanes default to the 1-based position of the sample in the file
        if "lane" not in item:
            item["lane"] = str(i + 1)
        item["lane"] = _clean_characters(str(item["lane"]))
        if "description" not in item:
            # A single BAM input can provide the sample name itself
            if len(item.get("files", [])) == 1 and item["files"][0].endswith(".bam"):
                item["description"] = get_sample_name(item["files"][0])
            else:
                raise ValueError("No `description` sample name provided for input #%s" % (i + 1))
        item["description"] = _clean_characters(str(item["description"]))
        # Copy so every sample owns its upload settings instead of sharing
        # (and mutating) the single object stored in the global configuration
        upload = copy.deepcopy(global_config.get("upload", {}))
        # Handle specifying a local directory directly in upload
        if isinstance(upload, basestring):
            upload = {"dir": upload}
        if fc_name and fc_date:
            upload["fc_name"] = fc_name
            upload["fc_date"] = fc_date
        upload["run_id"] = ""
        item["upload"] = upload
        item["algorithm"] = genome.abs_file_paths(
            item["algorithm"], ignore_keys=["variantcaller", "realign", "recalibrate", "phasing", "svcaller"]
        )
        item["rgnames"] = prep_rg_names(item, config, fc_name, fc_date)
        item["test_run"] = global_config.get("test_run", False)
        run_details.append(item)
    _check_sample_config(run_details, run_info_yaml)
    return run_details
Пример #9
0
def organize(dirs, config, run_info_yaml, sample_names=None, add_provenance=True,
             integrations=None):
    """Assemble per-sample run information from an input sample YAML file.

    Samples are read with `_run_info_from_yaml`, passed through the
    `add_remotes` method of every integration named in `config`, and then
    prepared one by one: directories, normalized `name`, merged `config`,
    reference resources, QC tools and temporary directory.

    dirs -- directory settings, stored on every sample under "dirs".
    config -- base configuration; also looked up by integration name.
    run_info_yaml -- path to the sample YAML file; it must exist.
    sample_names -- list of samples to include from the overall file, for cases
      where multiple pipelines run from the same configuration file.
    add_provenance -- forwarded as the last argument of `_add_provenance`.
    integrations -- mapping of integration name to retriever; None means none.

    Returns the prepared samples after they pass through `_add_provenance`.
    """
    from bcbio.pipeline import qcsummary
    if integrations is None:
        integrations = {}
    logger.info("Using input YAML configuration: %s" % run_info_yaml)
    yaml_found = run_info_yaml and os.path.exists(run_info_yaml)
    assert yaml_found, "Did not find input sample YAML file: %s" % run_info_yaml
    run_details = _run_info_from_yaml(dirs, run_info_yaml, config, sample_names,
                                      integrations=integrations)
    # The last integration present in the configuration is kept as the retriever
    remote_retriever = None
    for integration_name, candidate in integrations.items():
        if integration_name not in config:
            continue
        run_details = candidate.add_remotes(run_details, config[integration_name])
        remote_retriever = candidate
    samples = []
    for sample in run_details:
        sample["dirs"] = dirs
        if "name" in sample:
            if isinstance(sample["name"], basestring):
                # A plain string name becomes [name, "name-description"]
                combined = "%s-%s" % (sample["name"], clean_name(sample["description"]))
                sample["name"] = [sample["name"], combined]
                sample["description"] = combined
        else:
            sample["name"] = ["", sample["description"]]
        # Fold algorithm details into the configuration so they are specified once
        sample["resources"] = _add_remote_resources(sample["resources"])
        sample["config"] = config_utils.update_w_custom(config, sample)
        sample.pop("algorithm", None)
        sample = add_reference_resources(sample, remote_retriever)
        sample["config"]["algorithm"]["qc"] = qcsummary.get_qc_tools(sample)
        # Prepare the temporary directory
        tmp_dir = tz.get_in(["config", "resources", "tmp", "dir"], sample)
        if tmp_dir:
            # Without environmental variables the directory is made and
            # normalized here; otherwise normalization happens later in
            # distributed.transaction
            expanded = os.path.expandvars(tmp_dir)
            if expanded == tmp_dir:
                tmp_dir = genome.abs_file_paths(utils.safe_makedir(expanded),
                                                do_download=not integrations)
            sample["config"]["resources"]["tmp"]["dir"] = tmp_dir
        samples.append(sample)
    return _add_provenance(samples, dirs, config, add_provenance)
Пример #10
0
def organize(dirs, config, run_info_yaml, sample_names):
    """Build the high level sample structure from an input sample YAML file.

    Reads samples through `_run_info_from_yaml` (given `dirs["flowcell"]`) and
    prepares each one: attached directories, `name` as a two item list, merged
    `config`, reference resources, and a created temporary directory when one
    is configured.

    dirs -- directory settings; must contain "flowcell".
    config -- base configuration merged with each sample by
      `config_utils.update_w_custom`.
    run_info_yaml -- path to the sample YAML file; it must exist.
    sample_names -- list of samples to include from the overall file, for cases
      where multiple pipelines run from the same configuration file.

    Returns the prepared samples after they pass through `_add_provenance`.
    """
    logger.info("Using input YAML configuration: %s" % run_info_yaml)
    input_exists = run_info_yaml and os.path.exists(run_info_yaml)
    assert input_exists, \
        "Did not find input sample YAML file: %s" % run_info_yaml
    results = []
    for entry in _run_info_from_yaml(dirs["flowcell"], run_info_yaml, config,
                                     sample_names):
        entry["dirs"] = dirs
        if "name" in entry:
            if isinstance(entry["name"], basestring):
                # A plain string name becomes [name, "name-description"]
                label = "%s-%s" % (entry["name"], clean_name(entry["description"]))
                entry["name"] = [entry["name"], label]
                entry["description"] = label
        else:
            entry["name"] = ["", entry["description"]]
        # Merge algorithm details into the configuration, then drop the
        # duplicate top level copy
        entry["resources"] = _add_remote_resources(entry["resources"])
        entry["config"] = config_utils.update_w_custom(config, entry)
        entry.pop("algorithm", None)
        entry = add_reference_resources(entry)
        # Expand environmental variables, create the temporary directory and
        # store its absolute location
        wanted_tmp = tz.get_in(["config", "resources", "tmp", "dir"], entry)
        if wanted_tmp:
            made_tmp = utils.safe_makedir(os.path.expandvars(wanted_tmp))
            entry["config"]["resources"]["tmp"]["dir"] = genome.abs_file_paths(made_tmp)
        results.append(entry)
    return _add_provenance(results, dirs, config)
Пример #11
0
def _run_info_from_yaml(dirs, run_info_yaml, config, sample_names=None):
    """Read run information from a passed YAML file.

    The YAML content is either a list of samples or a dictionary holding the
    samples under `details` next to global settings (`fc_name`, `fc_date`,
    `globals`, `resources`, `upload`).

    dirs -- dictionary of directories; the "flowcell" and "work" entries are
      read when present.
    run_info_yaml -- path of the YAML file to read.
    config -- configuration handed to `prep_rg_names`,
      `vcfutils.bgzip_and_index` and `_check_sample_config`.
    sample_names -- when given, only samples whose `description` is in this
      collection are kept.

    Returns the list of sample dictionaries, after checking them with
    `_check_sample_config`.

    Raises ValueError when a sample has no `description` and is not recognized
    as BAM input by `_item_is_bam`.
    """
    with open(run_info_yaml) as in_handle:
        # NOTE(review): yaml.load without an explicit Loader can construct
        # arbitrary objects; prefer yaml.safe_load if this file may be untrusted.
        loaded = yaml.load(in_handle)
    fc_name, fc_date = None, None
    if dirs.get("flowcell"):
        try:
            fc_name, fc_date = flowcell.parse_dirname(dirs.get("flowcell"))
        except ValueError:
            # Directory name does not parse as a flowcell; leave name and date unset
            pass
    global_config = {}
    global_vars = {}
    resources = {}
    if isinstance(loaded, dict):
        global_config = copy.deepcopy(loaded)
        del global_config["details"]
        # Flowcell information in the file overrides what the directory provided
        if "fc_name" in loaded and "fc_date" in loaded:
            fc_name = loaded["fc_name"].replace(" ", "_")
            fc_date = str(loaded["fc_date"]).replace(" ", "_")
        global_vars = global_config.pop("globals", {})
        resources = global_config.pop("resources", {})
        loaded = loaded["details"]
    if sample_names:
        loaded = [x for x in loaded if x["description"] in sample_names]

    run_details = []
    for i, item in enumerate(loaded):
        item = _normalize_files(item, dirs.get("flowcell"))
        # Lanes default to the 1-based position of the sample in the file
        if "lane" not in item:
            item["lane"] = str(i + 1)
        item["lane"] = _clean_characters(str(item["lane"]))
        if "description" not in item:
            if _item_is_bam(item):
                item["description"] = get_sample_name(item["files"][0])
            else:
                raise ValueError("No `description` sample name provided for input #%s" % (i + 1))
        item["description"] = _clean_characters(str(item["description"]))
        if "upload" not in item:
            # Copy so every sample owns its upload settings instead of sharing
            # (and mutating) the single object stored in the global configuration
            upload = copy.deepcopy(global_config.get("upload", {}))
            # Handle specifying a local directory directly in upload
            if isinstance(upload, basestring):
                upload = {"dir": upload}
            if fc_name and fc_date:
                upload["fc_name"] = fc_name
                upload["fc_date"] = fc_date
            upload["run_id"] = ""
            if upload.get("dir"):
                upload["dir"] = _file_to_abs(upload["dir"], [dirs.get("work")], makedir=True)
            item["upload"] = upload
        item["algorithm"] = _replace_global_vars(item["algorithm"], global_vars)
        item["algorithm"] = genome.abs_file_paths(item["algorithm"],
                                                  ignore_keys=ALGORITHM_NOPATH_KEYS)
        item["genome_build"] = str(item.get("genome_build", ""))
        item["algorithm"] = _add_algorithm_defaults(item["algorithm"])
        item["rgnames"] = prep_rg_names(item, config, fc_name, fc_date)
        # Keep only non-empty file lists, made absolute
        if item.get("files"):
            item["files"] = [genome.abs_file_paths(f) for f in item["files"]]
        elif "files" in item:
            del item["files"]
        if item.get("vrn_file") and isinstance(item["vrn_file"], basestring):
            inputs_dir = utils.safe_makedir(os.path.join(dirs.get("work", os.getcwd()), "inputs"))
            item["vrn_file"] = vcfutils.bgzip_and_index(genome.abs_file_paths(item["vrn_file"]), config,
                                                        remove_orig=False, out_dir=inputs_dir)
        item = _clean_metadata(item)
        item = _clean_algorithm(item)
        # Add any global resource specifications
        if "resources" not in item:
            item["resources"] = {}
        for prog, pkvs in resources.items():
            if prog not in item["resources"]:
                item["resources"][prog] = {}
            # A program listed without any settings loads as None; nothing to merge
            if pkvs is not None:
                for key, val in pkvs.items():
                    item["resources"][prog][key] = val
        run_details.append(item)
    _check_sample_config(run_details, run_info_yaml, config)
    return run_details
Пример #12
0
def _run_info_from_yaml(dirs,
                        run_info_yaml,
                        config,
                        sample_names=None,
                        integrations=None):
    """Read run information from a passed YAML file.

    The file is first passed to `validate_yaml`. Its content is either a list
    of samples or a dictionary holding the samples under `details` next to
    global settings (`fc_name`, `fc_date`, `globals`, `resources`, `upload`,
    `arvados`).

    dirs -- dictionary of directories; the "flowcell" and "work" entries are
      read when present.
    run_info_yaml -- path of the YAML file to read.
    config -- configuration handed to `prep_rg_names`,
      `vcfutils.bgzip_and_index` and `_check_sample_config`; also looked up by
      integration name.
    sample_names -- when given, only samples whose `description` is in this
      collection are kept.
    integrations -- mapping of integration name to retriever; None is treated
      as no integrations.

    Returns the list of sample dictionaries, after checking them with
    `_check_sample_config`.

    Raises ValueError when a sample has no `description` and is not recognized
    as BAM input by `_item_is_bam`.
    """
    # The default of None previously failed on `integrations.values()` below
    if integrations is None:
        integrations = {}
    validate_yaml(run_info_yaml, run_info_yaml)
    with open(run_info_yaml) as in_handle:
        # NOTE(review): yaml.load without an explicit Loader can construct
        # arbitrary objects; prefer yaml.safe_load if this file may be untrusted.
        loaded = yaml.load(in_handle)
    fc_name, fc_date = None, None
    if dirs.get("flowcell"):
        try:
            fc_name, fc_date = flowcell.parse_dirname(dirs.get("flowcell"))
        except ValueError:
            # Directory name does not parse as a flowcell; leave name and date unset
            pass
    global_config = {}
    global_vars = {}
    resources = {}
    integration_config = {}
    if isinstance(loaded, dict):
        global_config = copy.deepcopy(loaded)
        del global_config["details"]
        # Flowcell information in the file overrides what the directory provided
        if "fc_name" in loaded:
            fc_name = loaded["fc_name"].replace(" ", "_")
        if "fc_date" in loaded:
            fc_date = str(loaded["fc_date"]).replace(" ", "_")
        global_vars = global_config.pop("globals", {})
        resources = global_config.pop("resources", {})
        for iname in ["arvados"]:
            integration_config[iname] = global_config.pop(iname, {})
        loaded = loaded["details"]
    if sample_names:
        loaded = [x for x in loaded if x["description"] in sample_names]

    for iname, retriever in integrations.items():
        if iname in config:
            loaded = retriever.add_remotes(loaded, config[iname])

    # True only when no integration value is truthy; computed once because it
    # does not change between samples
    do_download = all(not x for x in integrations.values())

    run_details = []
    for i, item in enumerate(loaded):
        item = _normalize_files(item, dirs.get("flowcell"))
        # Lanes default to the 1-based position of the sample in the file
        if "lane" not in item:
            item["lane"] = str(i + 1)
        item["lane"] = _clean_characters(str(item["lane"]))
        if "description" not in item:
            if _item_is_bam(item):
                item["description"] = get_sample_name(item["files"][0])
            else:
                raise ValueError(
                    "No `description` sample name provided for input #%s" %
                    (i + 1))
        item["description"] = _clean_characters(str(item["description"]))
        if "upload" not in item:
            # Copy so every sample owns its upload settings instead of sharing
            # (and mutating) the single object stored in the global configuration
            upload = copy.deepcopy(global_config.get("upload", {}))
            # Handle specifying a local directory directly in upload
            if isinstance(upload, basestring):
                upload = {"dir": upload}
            if fc_name:
                upload["fc_name"] = fc_name
            if fc_date:
                upload["fc_date"] = fc_date
            upload["run_id"] = ""
            if upload.get("dir"):
                upload["dir"] = _file_to_abs(upload["dir"], [dirs.get("work")],
                                             makedir=True)
            item["upload"] = upload
        item["algorithm"] = _replace_global_vars(item["algorithm"],
                                                 global_vars)
        item["algorithm"] = genome.abs_file_paths(
            item["algorithm"],
            ignore_keys=ALGORITHM_NOPATH_KEYS,
            fileonly_keys=ALGORITHM_FILEONLY_KEYS,
            do_download=do_download)
        item["genome_build"] = str(item.get("genome_build", ""))
        item["algorithm"] = _add_algorithm_defaults(item["algorithm"])
        item["metadata"] = add_metadata_defaults(item.get("metadata", {}))
        item["rgnames"] = prep_rg_names(item, config, fc_name, fc_date)
        # Keep only non-empty file lists, made absolute
        if item.get("files"):
            item["files"] = [
                genome.abs_file_paths(f, do_download=do_download)
                for f in item["files"]
            ]
        elif "files" in item:
            del item["files"]
        if item.get("vrn_file") and isinstance(item["vrn_file"], basestring):
            inputs_dir = utils.safe_makedir(
                os.path.join(dirs.get("work", os.getcwd()), "inputs",
                             item["description"]))
            item["vrn_file"] = vcfutils.bgzip_and_index(
                genome.abs_file_paths(item["vrn_file"], do_download=do_download),
                config,
                remove_orig=False,
                out_dir=inputs_dir)
        item = _clean_metadata(item)
        item = _clean_algorithm(item)
        # Add any global resource specifications
        if "resources" not in item:
            item["resources"] = {}
        for prog, pkvs in resources.items():
            if prog not in item["resources"]:
                item["resources"][prog] = {}
            if pkvs is not None:
                for key, val in pkvs.items():
                    item["resources"][prog][key] = val
        # Copy global integration settings onto the sample
        for iname, ivals in integration_config.items():
            if ivals:
                if iname not in item:
                    item[iname] = {}
                for k, v in ivals.items():
                    item[iname][k] = v

        run_details.append(item)
    _check_sample_config(run_details, run_info_yaml, config)
    return run_details
Пример #13
0
def _run_info_from_yaml(dirs,
                        run_info_yaml,
                        config,
                        sample_names=None,
                        is_cwl=False,
                        integrations=None):
    """Read run information from a passed YAML file.

    The YAML may be either a plain list of sample items or a dictionary with
    a `details` list plus global settings (`fc_name`, `fc_date`, `globals`,
    `resources`, `upload`, `arvados`).

    Args:
        dirs: dictionary of run directories; the "flowcell" and "work" keys
            are read here.
        run_info_yaml: path to the sample YAML file.
        config: global configuration; entries named after an integration are
            replaced by the result of that retriever's `set_cache`.
        sample_names: optional collection of sample descriptions; when given,
            only items whose `description` is in it are kept.
        is_cwl: forwarded to `_add_algorithm_defaults`.
        integrations: optional mapping of integration name to retriever
            object. Remote downloads (`do_download`) are only enabled when no
            value in this mapping is truthy, which includes the case where it
            is omitted.

    Returns:
        List of per-sample dictionaries, after `_check_sample_config` has been
        run on them.

    Raises:
        ValueError: if an item has no `description` and is not BAM input, or
            if a `vrn_file` is supplied with `algorithm: validate` set but no
            metadata batch.
    """
    # Previously a missing `integrations` crashed on `.values()` below.
    if integrations is None:
        integrations = {}
    validate_yaml(run_info_yaml, run_info_yaml)
    with open(run_info_yaml) as in_handle:
        # NOTE(review): yaml.load without an explicit Loader can construct
        # arbitrary Python objects; consider yaml.safe_load if the sample
        # file can come from an untrusted source.
        loaded = yaml.load(in_handle)
    fc_name, fc_date = None, None
    if dirs.get("flowcell"):
        try:
            fc_name, fc_date = flowcell.parse_dirname(dirs.get("flowcell"))
        except ValueError:
            # Best effort: flowcell directory names that do not parse simply
            # leave the name and date unset.
            pass
    global_config = {}
    global_vars = {}
    resources = {}
    integration_config = {}
    if isinstance(loaded, dict):
        global_config = copy.deepcopy(loaded)
        del global_config["details"]
        # Values in the YAML take precedence over the flowcell directory name
        if "fc_name" in loaded:
            fc_name = loaded["fc_name"].replace(" ", "_")
        if "fc_date" in loaded:
            fc_date = str(loaded["fc_date"]).replace(" ", "_")
        global_vars = global_config.pop("globals", {})
        resources = global_config.pop("resources", {})
        for iname in ["arvados"]:
            integration_config[iname] = global_config.pop(iname, {})
        loaded = loaded["details"]
    if sample_names:
        loaded = [x for x in loaded if x["description"] in sample_names]

    for iname, retriever in integrations.items():
        if iname in config:
            config[iname] = retriever.set_cache(config[iname])
            loaded = retriever.add_remotes(loaded, config[iname])

    # Loop invariant: only download remote inputs when no integration is active
    do_download = all(not x for x in integrations.values())

    run_details = []
    for i, item in enumerate(loaded):
        item = _normalize_files(item, dirs.get("flowcell"))
        if "lane" not in item:
            item["lane"] = str(i + 1)
        item["lane"] = _clean_characters(str(item["lane"]))
        if "description" not in item:
            if _item_is_bam(item):
                item["description"] = get_sample_name(item["files"][0])
            else:
                raise ValueError(
                    "No `description` sample name provided for input #%s" %
                    (i + 1))
        description = _clean_characters(str(item["description"]))
        item["description"] = description
        # make names R safe if we are likely to use R downstream
        if item["analysis"].lower() in R_DOWNSTREAM_ANALYSIS:
            # Slice rather than index so an empty description does not raise
            if description[:1].isdigit():
                valid = "X" + description
                logger.info("%s is not a valid R name, converting to %s." %
                            (description, valid))
                item["description"] = valid
        if "upload" not in item:
            upload = global_config.get("upload", {})
            # Handle specifying a local directory directly in upload
            if isinstance(upload, basestring):
                upload = {"dir": upload}
            else:
                # Work on a per-sample copy so samples do not share (and
                # repeatedly mutate) the single global upload dictionary.
                upload = dict(upload) if upload else {}
            if not upload:
                upload["dir"] = "../final"
            if fc_name:
                upload["fc_name"] = fc_name
            if fc_date:
                upload["fc_date"] = fc_date
            upload["run_id"] = ""
            if upload.get("dir"):
                upload["dir"] = _file_to_abs(upload["dir"], [dirs.get("work")],
                                             makedir=True)
            item["upload"] = upload
        item["algorithm"] = _replace_global_vars(item["algorithm"],
                                                 global_vars)
        item["algorithm"] = genome.abs_file_paths(
            item["algorithm"],
            ignore_keys=ALGORITHM_NOPATH_KEYS,
            fileonly_keys=ALGORITHM_FILEONLY_KEYS,
            do_download=do_download)
        item["genome_build"] = str(item.get("genome_build", ""))
        item["algorithm"] = _add_algorithm_defaults(item["algorithm"],
                                                    item.get("analysis", ""),
                                                    is_cwl)
        item["metadata"] = add_metadata_defaults(item.get("metadata", {}))
        item["rgnames"] = prep_rg_names(item, config, fc_name, fc_date)
        if item.get("files"):
            item["files"] = [
                genome.abs_file_paths(f, do_download=do_download)
                for f in item["files"]
            ]
        elif "files" in item:
            # Drop empty file specifications entirely
            del item["files"]
        if item.get("vrn_file") and isinstance(item["vrn_file"], basestring):
            inputs_dir = utils.safe_makedir(
                os.path.join(dirs.get("work", os.getcwd()), "inputs",
                             item["description"]))
            item["vrn_file"] = genome.abs_file_paths(
                item["vrn_file"], do_download=do_download)
            if os.path.isfile(item["vrn_file"]):
                # Try to prepare in place (or use ready to go inputs)
                try:
                    item["vrn_file"] = vcfutils.bgzip_and_index(
                        item["vrn_file"], config, remove_orig=False)
                # In case of permission errors, fix in inputs directory
                except IOError:
                    item["vrn_file"] = vcfutils.bgzip_and_index(
                        item["vrn_file"],
                        config,
                        remove_orig=False,
                        out_dir=inputs_dir)
            if not tz.get_in(("metadata", "batch"), item) and tz.get_in(
                ["algorithm", "validate"], item):
                raise ValueError(
                    "%s: Please specify a metadata batch for variant file (vrn_file) input.\n"
                    % (item["description"]) +
                    "Batching with a standard sample provides callable regions for validation."
                )
        item = _clean_metadata(item)
        item = _clean_algorithm(item)
        item = _clean_background(item)
        # Add any global resource specifications
        if "resources" not in item:
            item["resources"] = {}
        for prog, pkvs in resources.items():
            if prog not in item["resources"]:
                item["resources"][prog] = {}
            # A program listed without settings loads as None from YAML
            if pkvs is not None:
                for key, val in pkvs.items():
                    item["resources"][prog][key] = val
        # Copy global integration settings (e.g. arvados) onto each sample
        for iname, ivals in integration_config.items():
            if ivals:
                if iname not in item:
                    item[iname] = {}
                for k, v in ivals.items():
                    item[iname][k] = v

        run_details.append(item)
    _check_sample_config(run_details, run_info_yaml, config)
    return run_details
Пример #14
0
def _run_info_from_yaml(dirs, run_info_yaml, config, sample_names=None):
    """Read run information from a passed YAML file.

    The YAML may be either a plain list of sample items or a dictionary with
    a `details` list plus global settings (`fc_name`, `fc_date`, `globals`,
    `resources`, `upload`, `test_run`).

    Args:
        dirs: dictionary of run directories; the "flowcell" and "work" keys
            are read here.
        run_info_yaml: path to the sample YAML file.
        config: global configuration, passed to `prep_rg_names`,
            `vcfutils.bgzip_and_index` and `_check_sample_config`.
        sample_names: optional collection of sample descriptions; when given,
            only items whose `description` is in it are kept.

    Returns:
        List of per-sample dictionaries, after `_check_sample_config` has been
        run on them.

    Raises:
        ValueError: if an item has no `description` and is not BAM input.
    """
    with open(run_info_yaml) as in_handle:
        # NOTE(review): yaml.load without an explicit Loader can construct
        # arbitrary Python objects; consider yaml.safe_load if the sample
        # file can come from an untrusted source.
        loaded = yaml.load(in_handle)
    fc_name, fc_date = None, None
    if dirs.get("flowcell"):
        try:
            fc_name, fc_date = flowcell.parse_dirname(dirs.get("flowcell"))
        except ValueError:
            # Best effort: flowcell directory names that do not parse simply
            # leave the name and date unset.
            pass
    global_config = {}
    global_vars = {}
    resources = {}
    if isinstance(loaded, dict):
        global_config = copy.deepcopy(loaded)
        del global_config["details"]
        # YAML values override the flowcell directory only when both are set
        if "fc_name" in loaded and "fc_date" in loaded:
            fc_name = loaded["fc_name"].replace(" ", "_")
            fc_date = str(loaded["fc_date"]).replace(" ", "_")
        global_vars = global_config.pop("globals", {})
        resources = global_config.pop("resources", {})
        loaded = loaded["details"]
    if sample_names:
        loaded = [x for x in loaded if x["description"] in sample_names]

    run_details = []
    for i, item in enumerate(loaded):
        item = _normalize_files(item, dirs.get("flowcell"))
        if "lane" not in item:
            item["lane"] = str(i + 1)
        item["lane"] = _clean_characters(str(item["lane"]))
        if "description" not in item:
            if _item_is_bam(item):
                item["description"] = get_sample_name(item["files"][0])
            else:
                raise ValueError(
                    "No `description` sample name provided for input #%s" %
                    (i + 1))
        item["description"] = _clean_characters(str(item["description"]))
        if "upload" not in item:
            upload = global_config.get("upload", {})
            # Handle specifying a local directory directly in upload
            if isinstance(upload, basestring):
                upload = {"dir": upload}
            else:
                # Work on a per-sample copy so samples do not share (and
                # repeatedly mutate) the single global upload dictionary.
                upload = dict(upload) if upload else {}
            if fc_name and fc_date:
                upload["fc_name"] = fc_name
                upload["fc_date"] = fc_date
            upload["run_id"] = ""
            if upload.get("dir"):
                upload["dir"] = _file_to_abs(upload["dir"], [dirs.get("work")],
                                             makedir=True)
            item["upload"] = upload
        item["algorithm"] = _replace_global_vars(item["algorithm"],
                                                 global_vars)
        item["algorithm"] = genome.abs_file_paths(
            item["algorithm"], ignore_keys=ALGORITHM_NOPATH_KEYS)
        item["genome_build"] = str(item.get("genome_build", ""))
        item["algorithm"] = _add_algorithm_defaults(item["algorithm"])
        item["rgnames"] = prep_rg_names(item, config, fc_name, fc_date)
        item["test_run"] = global_config.get("test_run", False)
        if item.get("files"):
            item["files"] = [genome.abs_file_paths(f) for f in item["files"]]
        elif "files" in item:
            # Drop empty file specifications entirely
            del item["files"]
        if item.get("vrn_file") and isinstance(item["vrn_file"], basestring):
            item["vrn_file"] = vcfutils.bgzip_and_index(
                genome.abs_file_paths(item["vrn_file"]), config)
        item = _clean_metadata(item)
        item = _clean_algorithm(item)
        # Add any global resource specifications
        if "resources" not in item:
            item["resources"] = {}
        # `.items()` behaves the same as `.iteritems()` here and also works
        # on Python 3.
        for prog, pkvs in resources.items():
            if prog not in item["resources"]:
                item["resources"][prog] = {}
            # A program listed without settings loads as None from YAML
            if pkvs is not None:
                for key, val in pkvs.items():
                    item["resources"][prog][key] = val
        run_details.append(item)
    _check_sample_config(run_details, run_info_yaml, config)
    return run_details