Example #1
def merge_sample(data):
    """Merge fastq and BAM files for multiple samples.
    """
    logger.debug("Combining fastq and BAM files %s" % str(data["name"]))
    config = config_utils.update_w_custom(data["config"], data["info"])
    if config["algorithm"].get("upload_fastq", False):
        fastq1, fastq2 = combine_fastq_files(data["fastq_files"], data["dirs"]["work"], config)
    else:
        fastq1, fastq2 = None, None

    out_file = os.path.join(data["dirs"]["work"], data["info"]["rgnames"]["sample"] + ".bam")
    sort_bam = merge_bam_files(data["bam_files"], data["dirs"]["work"], config, out_file=out_file)
    return [
        [
            {
                "name": data["name"],
                "metadata": data["info"].get("metadata", {}),
                "info": data["info"],
                "genome_build": data["genome_build"],
                "sam_ref": data["sam_ref"],
                "work_bam": sort_bam,
                "fastq1": fastq1,
                "fastq2": fastq2,
                "dirs": data["dirs"],
                "config": config,
                "config_file": data["config_file"],
            }
        ]
    ]
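
The doubly nested return value ([[{...}]]) looks like a parallelization
convention: the outer list holds independently schedulable work units and each
inner list holds the sample dicts for one unit. A consumption sketch, assuming
`data` was prepared upstream as in the example:

for work_unit in merge_sample(data):
    for sample in work_unit:
        print(sample["name"], sample["work_bam"])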
Example #2
def organize(dirs, config, run_info_yaml):
    """Organize run information from a passed YAML file or the Galaxy API.

    Creates the high level structure used for subsequent processing.
    """
    if run_info_yaml and os.path.exists(run_info_yaml):
        logger.info("Using input YAML configuration: %s" % run_info_yaml)
        run_details = _run_info_from_yaml(dirs["flowcell"], run_info_yaml,
                                          config)
    else:
        logger.info("Fetching run details from Galaxy instance")
        fc_name, fc_date = get_flowcell_info(dirs["flowcell"])
        galaxy_api = GalaxyApiAccess(config['galaxy_url'],
                                     config['galaxy_api_key'])
        run_details = []
        galaxy_info = galaxy_api.run_details(fc_name, fc_date)
        for item in galaxy_info["details"]:
            item["upload"] = {
                "method": "galaxy",
                "run_id": galaxy_info["run_id"],
                "fc_name": fc_name,
                "fc_date": fc_date
            }
            run_details.append(item)
    out = []
    for item in run_details:
        item["config"] = config_utils.update_w_custom(config, item)
        item["dirs"] = dirs
        if "name" not in item:
            item["name"] = ["", item["description"]]
        item = _add_reference_resources(item)
        out.append(item)
    return out
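
Every example on this page funnels through config_utils.update_w_custom(config,
item). A minimal sketch of the merge it plausibly performs, assuming per-item
keys (notably "algorithm") take precedence over the global configuration
without mutating it; the helper below is illustrative, not the real bcbio
implementation:

import copy

def update_w_custom_sketch(config, item):
    """Illustrative only: overlay a sample's "algorithm" settings onto a
    global config without mutating it. The real config_utils.update_w_custom
    handles more sections; this shows the precedence idea.
    """
    out = copy.deepcopy(config)
    for key, val in item.get("algorithm", {}).items():
        out.setdefault("algorithm", {})[key] = val
    return out

# Usage with made-up values:
global_config = {"algorithm": {"aligner": "bwa", "quality_format": "standard"}}
sample = {"description": "S1", "algorithm": {"aligner": "novoalign"}}
merged = update_w_custom_sketch(global_config, sample)
assert merged["algorithm"]["aligner"] == "novoalign"        # sample wins
assert merged["algorithm"]["quality_format"] == "standard"  # global preserved
assert global_config["algorithm"]["aligner"] == "bwa"       # input untouched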
Example #3
def _check_for_problem_somatic_batches(items, config):
    """Identify problem batch setups for somatic calling.

    We do not support multiple tumors in a single batch and VarDict(Java) does not
    handle pooled calling, only tumor/normal.
    """
    to_check = []
    for data in items:
        data = copy.deepcopy(data)
        data["config"] = config_utils.update_w_custom(config, data)
        to_check.append(data)
    data_by_batches = collections.defaultdict(list)
    for data in to_check:
        batches = dd.get_batches(data)
        if batches:
            for batch in batches:
                data_by_batches[batch].append(data)
    for batch, items in data_by_batches.items():
        if vcfutils.get_paired(items):
            vcfutils.check_paired_problems(items)
        elif len(items) > 1:
            vcs = list(set(tz.concat([dd.get_variantcaller(data) or [] for data in items])))
            if any(x.lower().startswith("vardict") for x in vcs):
                raise ValueError("VarDict does not support pooled non-tumor/normal calling, in batch %s: %s"
                                 % (batch, [dd.get_sample_name(data) for data in items]))
            elif any(x.lower() == "mutect" for x in vcs):
                raise ValueError("Mutect requires a 'phenotype: tumor' sample for calling, in batch %s: %s"
                                 % (batch, [dd.get_sample_name(data) for data in items]))
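
The list(set(tz.concat(...))) idiom above flattens per-sample caller lists,
which may be None, into a de-duplicated collection of variant caller names. A
standalone illustration with made-up caller lists:

import toolz as tz

per_sample_callers = [["vardict", "mutect2"], None, ["vardict"]]
vcs = list(set(tz.concat([c or [] for c in per_sample_callers])))
# vcs holds "vardict" and "mutect2" exactly once each, so checks like
# any(x.lower().startswith("vardict") for x in vcs) see each caller one time.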
Example #4
def _check_for_problem_somatic_batches(items, config):
    """Identify problem batch setups for somatic calling.

    We do not support multiple tumors in a single batch and VarDict(Java) does not
    handle pooled calling, only tumor/normal.
    """
    to_check = []
    for data in items:
        data = copy.deepcopy(data)
        data["config"] = config_utils.update_w_custom(config, data)
        to_check.append(data)
    data_by_batches = collections.defaultdict(list)
    for data in to_check:
        batches = dd.get_batches(data)
        if batches:
            for batch in batches:
                data_by_batches[batch].append(data)
    for batch, items in data_by_batches.items():
        if vcfutils.get_paired(items):
            vcfutils.check_paired_problems(items)
        elif len(items) > 1:
            vcs = list(
                set(
                    tz.concat(
                        [dd.get_variantcaller(data) or [] for data in items])))
            if any(x.lower().startswith("vardict") for x in vcs):
                raise ValueError(
                    "VarDict does not support pooled non-tumor/normal calling, in batch %s: %s"
                    % (batch, [dd.get_sample_name(data) for data in items]))
Example #5
def _pair_samples_with_pipelines(run_info_yaml, config):
    """Map samples defined in input file to pipelines to run.
    """
    with open(run_info_yaml) as in_handle:
        samples = yaml.safe_load(in_handle)
        if isinstance(samples, dict):
            resources = samples.pop("resources", {})
            samples = samples["details"]
        else:
            resources = {}
    ready_samples = []
    for sample in samples:
        if "files" in sample:
            del sample["files"]
        # add any resources to this item to recalculate global configuration
        usample = copy.deepcopy(sample)
        usample.pop("algorithm", None)
        if "resources" not in usample:
            usample["resources"] = {}
        for prog, pkvs in resources.items():
            if prog not in usample["resources"]:
                usample["resources"][prog] = {}
            for key, val in pkvs.items():
                usample["resources"][prog][key] = val
        config = config_utils.update_w_custom(config, usample)
        sample["resources"] = {}
        ready_samples.append(sample)
    paired = [(x, _get_pipeline(x)) for x in ready_samples]
    d = defaultdict(list)
    for x in paired:
        d[x[1]].append([x[0]])
    return d, config
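
The inner loop copies global resources entries onto each sample before the
configuration is recalculated; note that global values overwrite sample-level
ones. A trimmed, standalone version of that merge with made-up program names:

import copy

resources = {"gatk": {"memory": "4g"}, "bwa": {"cores": 8}}
sample = {"description": "S1", "resources": {"gatk": {"memory": "8g"}}}

usample = copy.deepcopy(sample)
usample.setdefault("resources", {})
for prog, pkvs in resources.items():
    usample["resources"].setdefault(prog, {})
    for key, val in (pkvs or {}).items():
        usample["resources"][prog][key] = val
assert usample["resources"]["gatk"]["memory"] == "4g"  # global wins, as above
assert sample["resources"]["gatk"]["memory"] == "8g"   # original untouched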
Example #6
def organize(dirs, config, run_info_yaml):
    """Organize run information from a passed YAML file or the Galaxy API.

    Creates the high level structure used for subsequent processing.
    """
    logger.info("Using input YAML configuration: %s" % run_info_yaml)
    assert run_info_yaml and os.path.exists(run_info_yaml), \
        "Did not find input sample YAML file: %s" % run_info_yaml
    run_details = _run_info_from_yaml(dirs["flowcell"], run_info_yaml, config)
    out = []
    for item in run_details:
        # add algorithm details to configuration, avoid double specification
        item["config"] = config_utils.update_w_custom(config, item)
        item.pop("algorithm", None)
        item["dirs"] = dirs
        if "name" not in item:
            item["name"] = ["", item["description"]]
        elif isinstance(item["name"], basestring):
            description = "%s-%s" % (item["name"], clean_name(item["description"]))
            item["name"] = [item["name"], description]
            item["description"] = description
        item = add_reference_resources(item)
        # Create temporary directories and make absolute
        if utils.get_in(item, ("config", "resources", "tmp", "dir")):
            utils.safe_makedir(utils.get_in(item, ("config", "resources", "tmp", "dir")))
            item["config"]["resources"]["tmp"] = genome.abs_file_paths(
                utils.get_in(item, ("config", "resources", "tmp")))
        out.append(item)
    return out
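
utils.get_in(item, ("config", "resources", "tmp", "dir")) reads a nested key
without raising on missing intermediates; toolz offers the same pattern as
tz.get_in. A sketch on a made-up config, with os.makedirs standing in for
utils.safe_makedir:

import os
import toolz as tz

item = {"config": {"resources": {"tmp": {"dir": "/tmp/bcbio-work"}}}}
tmp_dir = tz.get_in(["config", "resources", "tmp", "dir"], item)
if tmp_dir:
    os.makedirs(tmp_dir, exist_ok=True)  # rough stand-in for utils.safe_makedir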
Example #7
def _check_for_problem_somatic_batches(items, config):
    """Identify problem batch setups for somatic calling.

    We do not support multiple tumors in a single batch and VarDict(Java) does not
    handle pooled calling, only tumor/normal.
    """
    to_check = []
    for data in items:
        data = copy.deepcopy(data)
        data["config"] = config_utils.update_w_custom(config, data)
        to_check.append(data)
    data_by_batches = collections.defaultdict(list)
    for data in to_check:
        batches = dd.get_batches(data)
        if batches:
            for batch in batches:
                data_by_batches[batch].append(data)
    for batch, items in data_by_batches.items():
        if vcfutils.get_paired(items):
            vcfutils.check_paired_problems(items)
        elif len(items) > 1:
            vcs = vcfutils.get_somatic_variantcallers(items)
            if "vardict" in vcs:
                raise ValueError(
                    "VarDict does not support pooled non-tumor/normal calling, in batch %s: %s"
                    % (batch, [dd.get_sample_name(data) for data in items]))
            elif "mutect" in vcs or "mutect2" in vcs:
                raise ValueError(
                    "MuTect and MuTect2 require a 'phenotype: tumor' sample for calling, "
                    "in batch %s: %s" %
                    (batch, [dd.get_sample_name(data) for data in items]))
Example #8
def organize(dirs, config, run_info_yaml):
    """Organize run information from a passed YAML file or the Galaxy API.

    Creates the high level structure used for subsequent processing.
    """
    if run_info_yaml and os.path.exists(run_info_yaml):
        logger.info("Using input YAML configuration: %s" % run_info_yaml)
        run_details = _run_info_from_yaml(dirs["flowcell"], run_info_yaml, config)
    else:
        logger.info("Fetching run details from Galaxy instance")
        fc_name, fc_date = flowcell.parse_dirname(dirs["flowcell"])
        galaxy_api = GalaxyApiAccess(config['galaxy_url'], config['galaxy_api_key'])
        run_details = []
        galaxy_info = galaxy_api.run_details(fc_name, fc_date)
        for item in galaxy_info["details"]:
            item["upload"] = {"method": "galaxy", "run_id": galaxy_info["run_id"],
                              "fc_name": fc_name, "fc_date": fc_date}
            run_details.append(item)
    out = []
    for item in run_details:
        # add algorithm details to configuration, avoid double specification
        item["config"] = config_utils.update_w_custom(config, item)
        item.pop("algorithm", None)
        item["dirs"] = dirs
        if "name" not in item:
            item["name"] = ["", item["description"]]
        item = add_reference_resources(item)
        out.append(item)
    return out
Example #9
def _pair_samples_with_pipelines(run_info_yaml, config):
    """Map samples defined in input file to pipelines to run.
    """
    samples = config_utils.load_config(run_info_yaml)
    if isinstance(samples, dict):
        resources = samples.pop("resources")
        samples = samples["details"]
    else:
        resources = {}
    ready_samples = []
    for sample in samples:
        if "files" in sample:
            del sample["files"]
        # add any resources to this item to recalculate global configuration
        usample = copy.deepcopy(sample)
        usample.pop("algorithm", None)
        if "resources" not in usample:
            usample["resources"] = {}
        for prog, pkvs in resources.items():
            if prog not in usample["resources"]:
                usample["resources"][prog] = {}
            if pkvs is not None:
                for key, val in pkvs.items():
                    usample["resources"][prog][key] = val
        config = config_utils.update_w_custom(config, usample)
        sample["resources"] = {}
        ready_samples.append(sample)
    paired = [(x, _get_pipeline(x)) for x in ready_samples]
    d = defaultdict(list)
    for x in paired:
        d[x[1]].append([x[0]])
    return d, config
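
The closing lines group samples by pipeline so each pipeline receives its own
list of single-sample work units. A standalone sketch with a stubbed,
hypothetical _get_pipeline:

from collections import defaultdict

def _get_pipeline_stub(sample):
    # Hypothetical dispatch; the real _get_pipeline inspects the sample's
    # analysis type to choose a pipeline.
    return sample.get("analysis", "variant2")

ready_samples = [{"description": "S1", "analysis": "variant2"},
                 {"description": "S2", "analysis": "rna-seq"}]
paired = [(x, _get_pipeline_stub(x)) for x in ready_samples]
d = defaultdict(list)
for x in paired:
    d[x[1]].append([x[0]])  # each sample wrapped in its own list, as above
# d maps pipeline name -> [[sample], [sample], ...]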
Example #10
File: run_info.py  Project: zeneofa/bcbio
def organize(dirs, config, run_info_yaml):
    """Organize run information from a passed YAML file or the Galaxy API.

    Creates the high level structure used for subsequent processing.
    """
    logger.info("Using input YAML configuration: %s" % run_info_yaml)
    assert run_info_yaml and os.path.exists(run_info_yaml), \
        "Did not find input sample YAML file: %s" % run_info_yaml
    run_details = _run_info_from_yaml(dirs["flowcell"], run_info_yaml, config)
    out = []
    for item in run_details:
        # add algorithm details to configuration, avoid double specification
        item["config"] = config_utils.update_w_custom(config, item)
        item.pop("algorithm", None)
        item["dirs"] = dirs
        if "name" not in item:
            item["name"] = ["", item["description"]]
        elif isinstance(item["name"], basestring):
            description = "%s-%s" % (item["name"],
                                     clean_name(item["description"]))
            item["name"] = [item["name"], description]
            item["description"] = description
        item = add_reference_resources(item)
        # Create temporary directories and make absolute
        if utils.get_in(item, ("config", "resources", "tmp", "dir")):
            utils.safe_makedir(
                utils.get_in(item, ("config", "resources", "tmp", "dir")))
            item["config"]["resources"]["tmp"] = genome.abs_file_paths(
                utils.get_in(item, ("config", "resources", "tmp")))
        out.append(item)
    return out
Example #11
def organize(dirs, config, run_info_yaml, sample_names):
    """Organize run information from a passed YAML file or the Galaxy API.

    Creates the high level structure used for subsequent processing.

    sample_names is a list of samples to include from the overall file, for cases
    where we are running multiple pipelines from the same configuration file.
    """
    logger.info("Using input YAML configuration: %s" % run_info_yaml)
    assert run_info_yaml and os.path.exists(run_info_yaml), \
        "Did not find input sample YAML file: %s" % run_info_yaml
    run_details = _run_info_from_yaml(dirs, run_info_yaml, config, sample_names)
    out = []
    for item in run_details:
        item["dirs"] = dirs
        if "name" not in item:
            item["name"] = ["", item["description"]]
        elif isinstance(item["name"], basestring):
            description = "%s-%s" % (item["name"], clean_name(item["description"]))
            item["name"] = [item["name"], description]
            item["description"] = description
        # add algorithm details to configuration, avoid double specification
        item["resources"] = _add_remote_resources(item["resources"])
        item["config"] = config_utils.update_w_custom(config, item)
        item.pop("algorithm", None)
        item = add_reference_resources(item)
        # Create temporary directories and make absolute, expanding environmental variables
        tmp_dir = tz.get_in(["config", "resources", "tmp", "dir"], item)
        if tmp_dir:
            tmp_dir = utils.safe_makedir(os.path.expandvars(tmp_dir))
            item["config"]["resources"]["tmp"]["dir"] = genome.abs_file_paths(tmp_dir)
        out.append(item)
    out = _add_provenance(out, dirs, config)
    return out
Example #12
def merge_sample(data):
    """Merge fastq and BAM files for multiple samples.
    """
    logger.debug("Combining fastq and BAM files %s" % str(data["name"]))
    config = config_utils.update_w_custom(data["config"], data["info"])
    if config["algorithm"].get("upload_fastq", False):
        fastq1, fastq2 = combine_fastq_files(data["fastq_files"],
                                             data["dirs"]["work"], config)
    else:
        fastq1, fastq2 = None, None

    out_file = os.path.join(data["dirs"]["work"],
                            data["info"]["rgnames"]["sample"] + ".bam")
    sort_bam = merge_bam_files(data["bam_files"],
                               data["dirs"]["work"],
                               config,
                               out_file=out_file)
    return [[{
        "name": data["name"],
        "metadata": data["info"].get("metadata", {}),
        "info": data["info"],
        "genome_build": data["genome_build"],
        "sam_ref": data["sam_ref"],
        "work_bam": sort_bam,
        "fastq1": fastq1,
        "fastq2": fastq2,
        "dirs": data["dirs"],
        "config": config,
        "config_file": data["config_file"]
    }]]
Example #13
def organize(dirs,
             config,
             run_info_yaml,
             sample_names=None,
             is_cwl=False,
             integrations=None):
    """Organize run information from a passed YAML file or the Galaxy API.

    Creates the high level structure used for subsequent processing.

    sample_names is a list of samples to include from the overall file, for cases
    where we are running multiple pipelines from the same configuration file.
    """
    from bcbio.pipeline import qcsummary
    if integrations is None: integrations = {}
    logger.info("Using input YAML configuration: %s" % run_info_yaml)
    assert run_info_yaml and os.path.exists(run_info_yaml), \
        "Did not find input sample YAML file: %s" % run_info_yaml
    run_details = _run_info_from_yaml(dirs,
                                      run_info_yaml,
                                      config,
                                      sample_names,
                                      is_cwl=is_cwl,
                                      integrations=integrations)
    remote_retriever = None
    for iname, retriever in integrations.items():
        if iname in config:
            run_details = retriever.add_remotes(run_details, config[iname])
            remote_retriever = retriever
    out = []
    for item in run_details:
        item["dirs"] = dirs
        if "name" not in item:
            item["name"] = ["", item["description"]]
        elif isinstance(item["name"], basestring):
            description = "%s-%s" % (item["name"],
                                     clean_name(item["description"]))
            item["name"] = [item["name"], description]
            item["description"] = description
        # add algorithm details to configuration, avoid double specification
        item["resources"] = _add_remote_resources(item["resources"])
        item["config"] = config_utils.update_w_custom(config, item)
        item.pop("algorithm", None)
        item = add_reference_resources(item, remote_retriever)
        item["config"]["algorithm"]["qc"] = qcsummary.get_qc_tools(item)
        item["config"]["algorithm"]["vcfanno"] = vcfanno.find_annotations(item)
        # Create temporary directories and make absolute, expanding environmental variables
        tmp_dir = tz.get_in(["config", "resources", "tmp", "dir"], item)
        if tmp_dir:
            # if no environmental variables, make and normalize the directory
            # otherwise we normalize later in distributed.transaction:
            if os.path.expandvars(tmp_dir) == tmp_dir:
                tmp_dir = utils.safe_makedir(os.path.expandvars(tmp_dir))
                tmp_dir = genome.abs_file_paths(tmp_dir,
                                                do_download=not integrations)
            item["config"]["resources"]["tmp"]["dir"] = tmp_dir
        out.append(item)
    out = _add_provenance(out, dirs, config, not is_cwl)
    return out
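
The os.path.expandvars(tmp_dir) == tmp_dir test checks whether the path
contains environment variables that expand to something different; if so,
normalization is deferred to distributed.transaction. A quick stdlib
illustration with a made-up variable:

import os

os.environ["DEMO_SCRATCH"] = "/scratch"  # made-up variable for the demo
plain = "/data/tmp"
with_var = "$DEMO_SCRATCH/tmp"
assert os.path.expandvars(plain) == plain  # no variables: create immediately
assert os.path.expandvars(with_var) == "/scratch/tmp"  # differs from the raw
# string, so the branch above would skip it and normalize later.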
Example #14
def process_lane(lane_items, fc_name, fc_date, dirs, config):
    """Prepare lanes, potentially splitting based on barcodes.
    """
    full_fastq1, full_fastq2 = get_fastq_files(dirs["fastq"],
                                               dirs["work"], lane_items[0], fc_name, dirs=dirs,
                                               config=config_utils.update_w_custom(config, lane_items[0]))
    bc_files = split_by_barcode(full_fastq1, full_fastq2, lane_items,
                                lane_items[0]["rgnames"]["lane"], dirs, config)
    out = []
    for item in lane_items:
        logger.debug("Preparing %s" % item["rgnames"]["lane"])
        config = config_utils.update_w_custom(config, item)
        # Can specify all barcodes but might not have actual sequences
        # Would be nice to have a good way to check this is okay here.
        if item["barcode_id"] in bc_files:
            for fastq1, fastq2, lane_ext in _prep_fastq_files(item, bc_files, dirs, config):
                if item["barcode_id"] is not None:
                    item["rgnames"]["lane"] += "_%s" % (item["barcode_id"])
                if lane_ext is not None:
                    item["rgnames"]["lane"] += "_s{0}".format(lane_ext)
                out.append((fastq1, fastq2, item, dirs, config))
    return out
Example #15
def organize(dirs, config, run_info_yaml, sample_names=None, add_provenance=True,
             integrations=None):
    """Organize run information from a passed YAML file or the Galaxy API.

    Creates the high level structure used for subsequent processing.

    sample_names is a list of samples to include from the overall file, for cases
    where we are running multiple pipelines from the same configuration file.
    """
    from bcbio.pipeline import qcsummary
    if integrations is None: integrations = {}
    logger.info("Using input YAML configuration: %s" % run_info_yaml)
    assert run_info_yaml and os.path.exists(run_info_yaml), \
        "Did not find input sample YAML file: %s" % run_info_yaml
    run_details = _run_info_from_yaml(dirs, run_info_yaml, config, sample_names,
                                      integrations=integrations)
    remote_retriever = None
    for iname, retriever in integrations.items():
        if iname in config:
            run_details = retriever.add_remotes(run_details, config[iname])
            remote_retriever = retriever
    out = []
    for item in run_details:
        item["dirs"] = dirs
        if "name" not in item:
            item["name"] = ["", item["description"]]
        elif isinstance(item["name"], basestring):
            description = "%s-%s" % (item["name"], clean_name(item["description"]))
            item["name"] = [item["name"], description]
            item["description"] = description
        # add algorithm details to configuration, avoid double specification
        item["resources"] = _add_remote_resources(item["resources"])
        item["config"] = config_utils.update_w_custom(config, item)
        item.pop("algorithm", None)
        item = add_reference_resources(item, remote_retriever)
        item["config"]["algorithm"]["qc"] = qcsummary.get_qc_tools(item)
        # Create temporary directories and make absolute, expanding environmental variables
        tmp_dir = tz.get_in(["config", "resources", "tmp", "dir"], item)
        if tmp_dir:
            # if no environmental variables, make and normalize the directory
            # otherwise we normalize later in distributed.transaction:
            if os.path.expandvars(tmp_dir) == tmp_dir:
                tmp_dir = utils.safe_makedir(os.path.expandvars(tmp_dir))
                tmp_dir = genome.abs_file_paths(tmp_dir, do_download=not integrations)
            item["config"]["resources"]["tmp"]["dir"] = tmp_dir
        out.append(item)
    out = _add_provenance(out, dirs, config, add_provenance)
    return out
Example #16
def _item_needs_compute(lanes):
    """Determine if any item needs computing resources to spin up a cluster.
    """
    for lane_items, _, _, _, config in lanes:
        # check if multiplexed
        if len(lane_items) > 1 or lane_items[0]["barcode_id"] is not None:
            return True
        # check if we need to process the input by splitting or conversion
        item = lane_items[0]
        config = config_utils.update_w_custom(config, item)
        split_size = config.get("distributed", {}).get("align_split_size",
                                                       config["algorithm"].get("align_split_size", None))
        if split_size is not None:
            return True
        if needs_fastq_conversion(item, config):
            return True
    return False
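
The align_split_size lookup cascades: a value in the distributed section wins,
then the algorithm section, then None. The same fallback chain on a made-up
config:

config = {"distributed": {}, "algorithm": {"align_split_size": 20000000}}
split_size = config.get("distributed", {}).get(
    "align_split_size", config["algorithm"].get("align_split_size", None))
assert split_size == 20000000  # no distributed value, algorithm-level wins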
Example #17
def organize(dirs, config, run_info_yaml, sample_names):
    """Organize run information from a passed YAML file or the Galaxy API.

    Creates the high level structure used for subsequent processing.

    sample_names is a list of samples to include from the overall file, for cases
    where we are running multiple pipelines from the same configuration file.
    """
    logger.info("Using input YAML configuration: %s" % run_info_yaml)
    assert run_info_yaml and os.path.exists(run_info_yaml), \
        "Did not find input sample YAML file: %s" % run_info_yaml
    run_details = _run_info_from_yaml(dirs["flowcell"], run_info_yaml, config,
                                      sample_names)
    out = []
    for item in run_details:
        item["dirs"] = dirs
        if "name" not in item:
            item["name"] = ["", item["description"]]
        elif isinstance(item["name"], basestring):
            description = "%s-%s" % (item["name"],
                                     clean_name(item["description"]))
            item["name"] = [item["name"], description]
            item["description"] = description
        # add algorithm details to configuration, avoid double specification
        item["resources"] = _add_remote_resources(item["resources"])
        item["config"] = config_utils.update_w_custom(config, item)
        item.pop("algorithm", None)
        item = add_reference_resources(item)
        # Create temporary directories and make absolute, expanding environmental variables
        tmp_dir = tz.get_in(["config", "resources", "tmp", "dir"], item)
        if tmp_dir:
            tmp_dir = utils.safe_makedir(os.path.expandvars(tmp_dir))
            item["config"]["resources"]["tmp"]["dir"] = genome.abs_file_paths(
                tmp_dir)
        out.append(item)
    out = _add_provenance(out, dirs, config)
    return out