def genes_to_bedtool(gene_collection, hgnc_ids=None, ensembl_ids=None, build="GRCh37"):
    """Build a BedTool of gene intervals for the requested genes.

    Accepts:
        gene_collection(pymongo.collection.Collection): collection queried with ``find``
        hgnc_ids(list): HGNC gene ids to look up; takes precedence over ensembl_ids
        ensembl_ids(list): Ensembl gene ids, used only when hgnc_ids is empty/None
        build(str): genome build, GRCh37 or GRCh38

    Returns:
        bt(pybedtools.bedtool.BedTool): one tab-separated
            ``chromosome<TAB>start<TAB>end`` line per gene document returned by
            the query, or None when no ids were given or no gene matched
    """
    # Nothing to filter on: neither kind of identifier was supplied.
    if not hgnc_ids and not ensembl_ids:
        return None

    # Only one identifier type goes into the query; HGNC ids win when both are given.
    if hgnc_ids:
        id_field, wanted_ids = "hgnc_id", hgnc_ids
    else:
        id_field, wanted_ids = "ensembl_id", ensembl_ids
    query = {"build": build, id_field: {"$in": wanted_ids}}

    # One BED line per gene document returned by the database.
    bed_lines = [
        "\t".join((gene["chromosome"], str(gene["start"]), str(gene["end"]))) + "\n"
        for gene in gene_collection.find(query)
    ]
    if not bed_lines:
        return None

    return BedTool("".join(bed_lines), from_string=True)
def aggregate_by_tad(all_TADs_by_celltype, aggregations, other, extension=0.1, n_windows=100):
    """Split each (padded) TAD into windows and pivot per-window aggregates by TAD.

    Args:
        all_TADs_by_celltype: DataFrame holding the module-level ``coords``
            columns plus a string ``tad_uid`` column.
        aggregations: mapping whose keys name the value columns to pivot; it is
            also forwarded unchanged to ``coverage_by_window``.
        other: forwarded unchanged to ``coverage_by_window``.
        extension: value passed as ``l``/``r`` to ``slop`` with ``pct=True``
            (presumably the fraction of the TAD length added on each side);
            also sets the number of flanking windows.
        n_windows: number of windows attributed to the TAD body itself.

    Returns:
        A tuple ``(aggregations_by_tad, tad_start_window, tad_end_window)``.
        ``aggregations_by_tad`` maps each aggregation key to a pivot table
        indexed by ``tad_uid`` with one column per ``win_num``; the two
        integers are ``int(n_windows * extension)`` and
        ``n_windows + int(n_windows * extension)``.
    """
    flank_windows = int(n_windows * extension)
    tot_windows = n_windows + flank_windows * 2
    tad_start_window = flank_windows
    tad_end_window = n_windows + flank_windows

    regions = all_TADs_by_celltype[coords + ['tad_uid']].copy()
    # Underscores are replaced in the ids so that the "<id>_<window number>"
    # names produced below split into exactly two fields.
    regions['tad_uid'] = regions['tad_uid'].map(lambda uid: uid.replace("_", "-"))

    padded_regions = BedTool.from_dataframe(regions).slop(
        l=extension, r=extension, pct=True, genome="hg19"
    )
    windows = BedTool().window_maker(
        b=padded_regions, n=tot_windows, i='srcwinnum'
    ).to_dataframe(names=coords + ['window_uid'])

    uid_parts = windows['window_uid'].str.split("_", expand=True)
    uid_parts.columns = ['tad_uid', 'win_num']
    windows = pd.concat((windows, uid_parts), axis=1)
    windows['win_num'] = windows['win_num'].astype(int)
    windows = windows.sort_values(coords).reset_index(drop=True)

    windows_with_ctcfs = coverage_by_window(windows, other, aggregations)

    aggregations_by_tad = {}
    for agg_name in aggregations.keys():
        # Progress indicator: blank the current terminal line, then show the name.
        print(" " * 100, end='\r')
        print(agg_name, end="\r")
        per_tad = windows_with_ctcfs.pivot_table(
            index='tad_uid', columns='win_num', values=agg_name
        )
        aggregations_by_tad[agg_name] = per_tad.sort_index(axis=1)

    return aggregations_by_tad, tad_start_window, tad_end_window
def align_list_to_bed(*, align_list):
    """Merge aligned intervals per (assembly, sequence) and return them as a BedTool.

    Args:
        align_list: DataFrame with string columns ``assembly_id`` and
            ``sequence_id`` and coordinate columns ``sequence_from`` and
            ``sequence_to``.

    Returns:
        A BedTool with columns ``sequence_id, start, end, assembly_id`` holding
        the sorted-and-merged intervals, or an empty BedTool when the merge
        yields no interval.

    The intermediate BedTool files created here (``temp_bed`` and the merged
    result) are deleted before returning, on every path including errors.
    """
    # Combine both ids into one "chromosome" name so that merging never joins
    # intervals that belong to different assemblies or sequences.
    temp_align_list = align_list.assign(
        assembly_id_sequence_id=lambda x: x['assembly_id'] + ',' + x['sequence_id']
    )
    temp_bed = BedTool.from_dataframe(
        temp_align_list[['assembly_id_sequence_id', 'sequence_from', 'sequence_to']]
    )

    temp_merged_bed = None
    try:
        temp_merged_bed = temp_bed.sort().merge()
        if temp_merged_bed.count() > 0:
            temp_merged_bed_df = temp_merged_bed.to_dataframe()
            # Split the combined name back into its two components; n=1 keeps
            # any further commas inside sequence_id.
            id_columns = temp_merged_bed_df['chrom'].str.split(
                ',', n=1, expand=True
            ).rename(columns={0: 'assembly_id', 1: 'sequence_id'})
            temp_merged_bed_df = pandas.concat(
                [id_columns, temp_merged_bed_df[['start', 'end']]], axis=1
            )
            bed = BedTool.from_dataframe(
                temp_merged_bed_df[['sequence_id', 'start', 'end', 'assembly_id']]
            )
        else:
            bed = BedTool('', from_string=True)
    finally:
        # Previously the merged file was removed only in the non-empty branch,
        # leaking it for empty results and whenever an exception was raised.
        if temp_merged_bed is not None:
            os.remove(temp_merged_bed.fn)
        os.remove(temp_bed.fn)
    return bed
def windowing_by_number(all_TADs_by_celltype, n_windows):
    """Split every interval into ``n_windows`` windows and label each window.

    Args:
        all_TADs_by_celltype: DataFrame of intervals; its last column holds the
            interval identifier used to name the windows.
        n_windows: number of windows requested per interval (passed as ``n``
            to ``window_maker``).

    Returns:
        DataFrame with the same columns as the input plus ``w_num``. The last
        column holds the source identifier again (window suffix stripped) and
        ``w_num`` is the window number from the suffix minus one. Rows are
        sorted by the module-level ``coords`` columns with a fresh index.
    """
    column_names = all_TADs_by_celltype.columns.tolist()
    id_column = column_names[-1]

    windows = BedTool().window_maker(
        b=BedTool.from_dataframe(all_TADs_by_celltype), n=n_windows, i='srcwinnum'
    ).to_dataframe(names=column_names)

    # Window names look like "<source id>_<window number>". Splitting once from
    # the right keeps ids that themselves contain underscores intact, even when
    # different ids contain different numbers of underscores (a plain split
    # pads shorter rows with None and made the re-join raise TypeError).
    name_parts = windows[id_column].str.rsplit("_", n=1, expand=True)
    windows[id_column] = name_parts[0]
    windows['w_num'] = name_parts[1].astype(int) - 1

    windows = windows.sort_values(coords).reset_index(drop=True)
    return windows
def windowing_by_size(centered_boundaries, window_size):
    """Split every interval into windows of ``window_size`` and label each window.

    Args:
        centered_boundaries: DataFrame of intervals; its last column holds the
            interval identifier used to name the windows.
        window_size: window width (passed as ``w`` to ``window_maker``).

    Returns:
        DataFrame with the same columns as the input plus ``w_num``. The last
        column holds the source identifier again (window suffix stripped) and
        ``w_num`` is the window number from the suffix minus one. Rows are
        sorted by the module-level ``coords`` columns with a fresh index.
    """
    column_names = centered_boundaries.columns.tolist()
    id_column = column_names[-1]

    windows = BedTool().window_maker(
        b=BedTool.from_dataframe(centered_boundaries), w=window_size, i='srcwinnum'
    ).to_dataframe(names=column_names)

    # Window names look like "<source id>_<window number>". Splitting once from
    # the right keeps ids that themselves contain underscores intact, even when
    # different ids contain different numbers of underscores (a plain split
    # pads shorter rows with None and made the re-join raise TypeError).
    name_parts = windows[id_column].str.rsplit("_", n=1, expand=True)
    windows[id_column] = name_parts[0]
    windows['w_num'] = name_parts[1].astype(int) - 1

    windows = windows.sort_values(coords).reset_index(drop=True)
    return windows
def merge_intervals(panels):
    """Combine one or more panel BED files into a single BedTool.

    Accepts:
        panels(list): path to one or more panel bed files

    Returns:
        merged_panels(BedTool): a BedTool opened on the first panel when only
            one path is given; otherwise the result of ``cat``-ing the
            remaining panels onto the first one
    """
    first_panel = BedTool(panels[0])
    # A single panel is returned as-is; cat is only invoked when there is
    # something to concatenate.
    if len(panels) <= 1:
        return first_panel
    return first_panel.cat(*panels[1:])
def _compute_intersections(vcf_file, filter):
    """Intersect a VCF file with a set of intervals and log the counts.

    Accepts:
        vcf_file(str): path to the VCF file
        filter: object exposing ``count()`` and accepted by
            ``BedTool.intersect`` (presumably a BedTool of panel/gene
            intervals; confirm against callers)

    Returns:
        intersections: the result of ``BedTool(vcf_file).intersect(filter, header=True)``
    """
    vcf_bed = BedTool(vcf_file)

    interval_count = filter.count()
    vcf_entry_count = vcf_bed.count()
    LOG.info(
        "Extracting %s intervals from the %s total entries of the VCF file.",
        interval_count,
        vcf_entry_count,
    )

    # header=True is forwarded to intersect; presumably this keeps the VCF
    # header in the output (confirm in the pybedtools/bedtools docs).
    intersections = vcf_bed.intersect(filter, header=True)
    LOG.info("Number of variants found in the intervals:%s", intersections.count())
    return intersections