Exemplo n.º 1
0
def _merge_by_batch(batch, fnames):
    """Combine every call file from one family batch into a single callset.

    Stale per-batch ensemble outputs are removed before regeneration;
    returns the cleaned merged BED file.
    """
    out_dir = utils.safe_makedir(os.path.join(os.getcwd(), "merged"))
    prep_dir = utils.safe_makedir(os.path.join(out_dir, "clean"))
    out_file = os.path.join(out_dir, "%s-ensemble.bed" % batch)
    if not utils.file_uptodate(out_file, fnames[0]):
        # Inputs changed: clear previous ensemble outputs so they get rebuilt.
        stale = glob.glob(os.path.join(out_dir, "%s-ensemble*" % batch))
        for stale_file in stale:
            os.remove(stale_file)
    ensemble.combine_bed_by_size(fnames, batch, out_dir, {}, delim="&&")
    return bedutils.clean_file(out_file, {}, bedprep_dir=prep_dir)
Exemplo n.º 2
0
def _merge_by_batch(batch, fnames):
    """Merge all of a family's call files into one callset.

    If the merged output is out of date relative to the first input,
    any previous per-batch ensemble files are deleted first. The merged
    BED file is cleaned before being returned.
    """
    merged = utils.safe_makedir(os.path.join(os.getcwd(), "merged"))
    cleaned = utils.safe_makedir(os.path.join(merged, "clean"))
    target = os.path.join(merged, "%s-ensemble.bed" % batch)
    if not utils.file_uptodate(target, fnames[0]):
        pattern = os.path.join(merged, "%s-ensemble*" % batch)
        for old in glob.glob(pattern):
            os.remove(old)
    ensemble.combine_bed_by_size(fnames, batch, merged, {}, delim="&&")
    return bedutils.clean_file(target, {}, bedprep_dir=cleaned)
Exemplo n.º 3
0
def _prep_call_file(fname, descr, sample_info, cur_file):
    """Rewrite one sample's call file, keeping size-filtered calls with JSON metadata."""
    with open(cur_file, "w") as out_handle:
        with open(fname) as in_handle:
            for line in in_handle:
                chrom, start, end, calls = line.strip().split("\t")[:4]
                info = {"sample": descr, "phenotype": sample_info[descr]["phenotype"],
                        "risk": sample_info[descr]["risk"],
                        "calls": calls.split(","), "region": "%s:%s-%s" % (chrom, start, end)}
                # Generator avoids materializing an intermediate list.
                is_cnv = any(x.startswith("cnv") for x in info["calls"])
                size = int(end) - int(start)  # compute once; used in both checks
                # Keep calls above the minimum size; non-CNV SVs also capped at MAX_SV_SIZE.
                if size >= MIN_SIZE and (is_cnv or size < MAX_SV_SIZE):
                    out_handle.write("\t".join([chrom, start, end, json.dumps(info)]) + "\n")

def _combine_calls(in_files, sample_info):
    """Prepare per-sample call files, then merge and filter them by batch.

    in_files -- per-sample BED-like call files; the sample description is
                encoded before the first "-" in each basename.
    sample_info -- dict of sample description -> metadata (batch, phenotype, risk).
    Returns (combined BED file across all batches, dict of sample -> input file).
    """
    sample_files = {}
    sample_dir = utils.safe_makedir(os.path.join(os.getcwd(), "prepped"))

    files_by_batch = collections.defaultdict(list)
    for fname in in_files:
        descr = os.path.basename(fname).split("-")[0]
        sample_files[descr] = fname
        cur_file = os.path.join(sample_dir, os.path.basename(fname))
        files_by_batch[sample_info[descr]["batch"]].append(cur_file)
        # Only rewrite the prepped file when the input is newer.
        if not utils.file_uptodate(cur_file, fname):
            _prep_call_file(fname, descr, sample_info, cur_file)
    samples_by_batch = collections.defaultdict(list)
    for s, x in sample_info.items():
        x["sample"] = s
        samples_by_batch[x["batch"]].append(x)
    ready_files = []
    for batch, fnames in files_by_batch.items():
        merged_file = _merge_by_batch(batch, fnames)
        filtered_file = _filter_batch(merged_file, samples_by_batch[batch])
        ready_files.append(filtered_file)
    return ensemble.combine_bed_by_size(ready_files, "combined", os.getcwd(), {}), sample_files
Exemplo n.º 4
0
def _combine_calls(in_files, sample_info):
    """Prepare per-sample call files, then merge and filter them by batch.

    in_files -- per-sample BED-like call files; the sample description is
                encoded before the first "-" in each basename.
    sample_info -- dict of sample description -> metadata (batch, phenotype, risk).
    Returns (combined BED file across all batches, dict of sample -> input file).
    """
    sample_files = {}
    sample_dir = utils.safe_makedir(os.path.join(os.getcwd(), "prepped"))

    files_by_batch = collections.defaultdict(list)
    for fname in in_files:
        descr = os.path.basename(fname).split("-")[0]
        sample_files[descr] = fname
        cur_file = os.path.join(sample_dir, os.path.basename(fname))
        files_by_batch[sample_info[descr]["batch"]].append(cur_file)
        # Only rewrite the prepped file when the input is newer.
        if not utils.file_uptodate(cur_file, fname):
            with open(cur_file, "w") as out_handle:
                with open(fname) as in_handle:
                    for line in in_handle:
                        chrom, start, end, calls = line.strip().split("\t")[:4]
                        info = {
                            "sample": descr,
                            "phenotype": sample_info[descr]["phenotype"],
                            "risk": sample_info[descr]["risk"],
                            "calls": calls.split(","),
                            "region": "%s:%s-%s" % (chrom, start, end)
                        }
                        # Generator avoids materializing an intermediate list.
                        is_cnv = any(
                            x.startswith("cnv") for x in info["calls"])
                        # Compute the interval size once; used in both checks.
                        size = int(end) - int(start)
                        # Keep calls above MIN_SIZE; non-CNV SVs also capped
                        # at MAX_SV_SIZE.
                        if size >= MIN_SIZE and (is_cnv or size < MAX_SV_SIZE):
                            out_handle.write("\t".join(
                                [chrom, start, end,
                                 json.dumps(info)]) + "\n")
    samples_by_batch = collections.defaultdict(list)
    for s, x in sample_info.items():
        x["sample"] = s
        samples_by_batch[x["batch"]].append(x)
    ready_files = []
    for batch, fnames in files_by_batch.items():
        merged_file = _merge_by_batch(batch, fnames)
        filtered_file = _filter_batch(merged_file, samples_by_batch[batch])
        ready_files.append(filtered_file)
    return ensemble.combine_bed_by_size(ready_files, "combined", os.getcwd(),
                                        {}), sample_files