def remove_highdepth_regions(in_file, items):
    """Remove high depth regions from a BED file for analyzing a set of calls.

    Tries to avoid spurious errors and slow run times in collapsed repeat
    regions. Also adds ENCODE blacklist regions which capture additional
    collapsed repeats around centromeres.

    :param in_file: BED file of regions to filter.
    :param items: list of sample dictionaries; configuration and genome
                  resources are read from them (items[0] drives transactions).
    :returns: path to the filtered (or pass-through symlinked) BED file.
    """
    from bcbio.variation import bedutils
    # Materialize into a real list: on Python 3, filter() returns a lazy
    # iterator which has no append(), breaking the encode_bed addition below.
    highdepth_beds = [x for x in set(tz.get_in(["config", "algorithm", "highdepth_regions"], d)
                                     for d in items)
                      if x is not None]
    encode_bed = tz.get_in(["genome_resources", "variation", "encode_blacklist"], items[0])
    if encode_bed and os.path.exists(encode_bed):
        highdepth_beds.append(encode_bed)
    out_file = "%s-glimit%s" % utils.splitext_plus(in_file)
    if not utils.file_uptodate(out_file, in_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            with bedtools_tmpdir(items[0]):
                all_file = "%s-all.bed" % utils.splitext_plus(tx_out_file)[0]
                if len(highdepth_beds) > 0:
                    # Concatenate all inputs, keeping only the first four BED
                    # columns so heterogeneous files merge cleanly.
                    with open(all_file, "w") as out_handle:
                        for line in fileinput.input(highdepth_beds):
                            parts = line.split("\t")
                            out_handle.write("\t".join(parts[:4]).rstrip() + "\n")
                if utils.file_exists(all_file):
                    to_remove = bedutils.sort_merge(all_file, items[0])
                    cmd = "bedtools subtract -nonamecheck -a {in_file} -b {to_remove} > {tx_out_file}"
                    do.run(cmd.format(**locals()), "Remove high depth regions")
                else:
                    # Nothing to subtract: pass the input through unchanged.
                    utils.symlink_plus(in_file, out_file)
    return out_file
def _remove_regions(in_file, remove_beds, ext, data):
    """Subtract a list of BED files from an input BED.

    General approach handling none, one and more remove_beds.

    :param in_file: BED file to filter.
    :param remove_beds: list of BED file paths to subtract (may be empty).
    :param ext: extension tag used to name the output file.
    :param data: sample dictionary driving transactions and tmp directories.
    :returns: path to the filtered (or pass-through symlinked) BED file.
    """
    from bcbio.variation import bedutils
    out_file = "%s-%s.bed" % (utils.splitext_plus(in_file)[0], ext)
    if not utils.file_uptodate(out_file, in_file):
        with file_transaction(data, out_file) as tx_out_file:
            with bedtools_tmpdir(data):
                if len(remove_beds) == 0:
                    to_remove = None
                elif len(remove_beds) == 1:
                    to_remove = remove_beds[0]
                else:
                    # Multiple inputs: concatenate into one file, keeping only
                    # the first four BED columns so mixed files merge cleanly.
                    to_remove = "%s-all.bed" % utils.splitext_plus(
                        tx_out_file)[0]
                    with open(to_remove, "w") as out_handle:
                        for b in remove_beds:
                            # open_gzipsafe handles both plain and gzipped BEDs
                            with utils.open_gzipsafe(b) as in_handle:
                                for line in in_handle:
                                    parts = line.split("\t")
                                    out_handle.write(
                                        "\t".join(parts[:4]).rstrip() + "\n")
                    if utils.file_exists(to_remove):
                        to_remove = bedutils.sort_merge(to_remove, data)
                if to_remove and utils.file_exists(to_remove):
                    # Local names in_file/to_remove/tx_out_file feed the
                    # command template via locals() -- do not rename them.
                    cmd = "bedtools subtract -nonamecheck -a {in_file} -b {to_remove} > {tx_out_file}"
                    do.run(cmd.format(**locals()), "Remove problematic regions: %s" % ext)
                else:
                    # Nothing to subtract: pass the input through unchanged.
                    utils.symlink_plus(in_file, out_file)
    return out_file
def _collapse_transcripts(in_file, window, data, out_dir):
    """Collapse transcripts into min/max coordinates and optionally add windows.

    :param in_file: BED file of transcript regions (name in column 4).
    :param window: number of bases of padding added on each side, clipped to
                   chromosome boundaries.
    :param data: sample dictionary providing the reference file and config.
    :param out_dir: output directory; defaults to in_file's directory if None.
    :returns: path to the sorted and merged collapsed BED file.
    """
    if out_dir is None:
        out_dir = os.path.dirname(in_file)
    out_file = os.path.join(out_dir, "%s-transcripts_w%s.bed" % (os.path.splitext(os.path.basename(in_file))[0],
                                                                 window))
    # Contig sizes are needed to clip padded windows at chromosome ends.
    chrom_sizes = {}
    for contig in ref.file_contigs(dd.get_ref_file(data), data["config"]):
        chrom_sizes[contig.name] = contig.size
    if not utils.file_uptodate(out_file, in_file):
        with file_transaction(data, out_file) as tx_out_file:
            prep_file = "%s-sortprep%s" % os.path.splitext(tx_out_file)
            sort_cmd = bedutils.get_sort_cmd()
            # Sort by transcript name then chromosome so groupby below sees
            # each (name, chrom) group as one contiguous run.
            cmd = "{sort_cmd} -k4,4 -k1,1 {in_file} > {prep_file}"
            do.run(cmd.format(**locals()), "Sort BED file by transcript name")
            with open(tx_out_file, "w") as out_handle:
                # Work around for segmentation fault issue with groupby
                # https://github.com/daler/pybedtools/issues/131#issuecomment-89832476
                x = pybedtools.BedTool(prep_file)

                def gen():
                    for r in x:
                        yield r
                for name, rs in itertools.groupby(gen(), lambda r: (r.name, r.chrom)):
                    rs = list(rs)
                    r = rs[0]
                    for gcoords in _group_coords(rs):
                        # Pad by window, clipping to [0, chromosome length].
                        min_pos = max(min(gcoords) - window, 0)
                        max_pos = min(max(gcoords) + window, chrom_sizes[r.chrom])
                        out_handle.write("%s\t%s\t%s\t%s\n" % (r.chrom, min_pos, max_pos, r.name))
    return bedutils.sort_merge(out_file, data)
def remove_highdepth_regions(in_file, items):
    """Remove high depth regions from a BED file for analyzing a set of calls.

    Tries to avoid spurious errors and slow run times in collapsed repeat
    regions. Also adds ENCODE blacklist regions which capture additional
    collapsed repeats around centromeres.

    :param in_file: BED file of regions to filter.
    :param items: list of sample dictionaries; configuration and genome
                  resources are read from them (items[0] drives transactions).
    :returns: path to the filtered (or pass-through symlinked) BED file.
    """
    from bcbio.variation import bedutils
    # Materialize into a real list: on Python 3, filter() returns a lazy
    # iterator which has no append(), breaking the encode_bed addition below.
    highdepth_beds = [x for x in set(tz.get_in(["config", "algorithm", "highdepth_regions"], d)
                                     for d in items)
                      if x is not None]
    encode_bed = tz.get_in(["genome_resources", "variation", "encode_blacklist"], items[0])
    if encode_bed and os.path.exists(encode_bed):
        highdepth_beds.append(encode_bed)
    out_file = "%s-glimit%s" % utils.splitext_plus(in_file)
    if not utils.file_uptodate(out_file, in_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            with bedtools_tmpdir(items[0]):
                all_file = "%s-all.bed" % utils.splitext_plus(tx_out_file)[0]
                if len(highdepth_beds) > 0:
                    # Concatenate all inputs, keeping only the first four BED
                    # columns so heterogeneous files merge cleanly.
                    with open(all_file, "w") as out_handle:
                        for line in fileinput.input(highdepth_beds):
                            parts = line.split("\t")
                            out_handle.write("\t".join(parts[:4]).rstrip() + "\n")
                if utils.file_exists(all_file):
                    to_remove = bedutils.sort_merge(all_file, items[0])
                    cmd = "bedtools subtract -nonamecheck -a {in_file} -b {to_remove} > {tx_out_file}"
                    do.run(cmd.format(**locals()), "Remove high depth regions")
                else:
                    # Nothing to subtract: pass the input through unchanged.
                    utils.symlink_plus(in_file, out_file)
    return out_file
def _get_target_access_files(cov_interval, data, work_dir):
    """Retrieve target and access files based on the type of data to process.

    pick targets, anti-targets and access files based on analysis type
    http://cnvkit.readthedocs.org/en/latest/nonhybrid.html

    :param cov_interval: analysis type ("amplicon", "genome" or hybrid capture).
    :param data: sample dictionary providing the reference file.
    :param work_dir: working directory for generated region files.
    :returns: tuple of (target BED, access file) paths.
    """
    base_regions = shared.get_base_cnv_regions(data, work_dir)
    target_bed = bedutils.sort_merge(base_regions, data, out_dir=work_dir)
    # amplicon and genome analyses both use the target regions themselves as
    # the accessible set (previously two identical duplicated branches); only
    # hybrid capture needs a separately computed genome access file.
    if cov_interval in ("amplicon", "genome"):
        return target_bed, target_bed
    else:
        access_file = _create_access_file(dd.get_ref_file(data), _sv_workdir(data), data)
        return target_bed, access_file