def prepare_gat( df, promoter_TATA_intersect_bed, TATA_box_locations, file_names, output_genecat_prefix, promoterpref, variable1_name, variable2_name, ): """prepare files for running gat analysis - outputs a workspace file containing all promoters, a variable promoter file and a constitutive promoter file""" # make buffer to save promoters buffer = io.StringIO() df.to_csv(buffer, sep="\t", header=None, index=False) buffer.seek(0) # select only constitutive and variable genes df = df[(df.gene_type == variable1_name) | (df.gene_type == variable2_name)] # reorder columns df_reordered = df[[ "chr", "start", "stop", "gene_type", "strand", "source", "attributes", "AGI", ]] # sort by chromosome and start sorted_motifs = df_reordered.sort_values(["chr", "start"]) # save bed file BedTool.from_dataframe(sorted_motifs).saveas( f"../../data/output/{file_names}/TATA/{output_genecat_prefix}_{promoterpref}_nocontrol.bed" ) # run bedtools intersect between TATAbox_location_renamed.bed and the extracted promoters TATAlocations = BedTool(TATA_box_locations) promoters = BedTool(buffer) promoters.intersect(TATAlocations, wao=True, output=promoter_TATA_intersect_bed) # make a new gat workspace file with all promoters (first 3 columns) BedTool.from_dataframe(sorted_motifs[["chr", "start", "stop"]]).saveas( f"../../data/output/{file_names}/TATA/gat_analysis/{output_genecat_prefix}_{promoterpref}_workspace.bed" ) # select only variable promoters variable_promoters_extended = sorted_motifs[sorted_motifs["gene_type"] == variable2_name] sorted_variable = variable_promoters_extended.sort_values(["chr", "start"]) BedTool.from_dataframe(sorted_variable).saveas( f"../../data/output/{file_names}/TATA/gat_analysis/{output_genecat_prefix}_{promoterpref}_{variable2_name}.bed" ) # make a constitutive only file constitutive_promoters = sorted_motifs[sorted_motifs["gene_type"] == variable1_name] sorted_constitutive = constitutive_promoters.sort_values(["chr", "start"]) 
BedTool.from_dataframe(sorted_constitutive).saveas( f"../../data/output/{file_names}/TATA/gat_analysis/{output_genecat_prefix}_{promoterpref}_{variable1_name}.bed" )
def Cluster2pIntronRetention(Cluster): ''''clu_5077': {'chrX\t166458208\t166462315': 0.0195360774825754, 'chrX\t166458343\t166462315': -0.0195360774825754} ''' for k1, v1 in Cluster.items(): if len(v1) > 2: for k2, v2 in v1.items(): Junction = BedTool(k2, from_string=True) Junction_Bed = Bed(k2.split("\t")) X = Junction.intersect(Genes, wb=True) m6A_string = [] splicing_type = [] if len(X) >= 1: for gene in X: g = genebed(gene[3:]) for e in g.Exons(): if Junction_Bed.overlap(Junction_Bed, e): x_l = Junction_Bed.overlapLength(e) if x_l >= 10 and x_l < e.length( ) - 2: ## if ==, which means the entire exon is inside of the intron exon = BedTool(str(e), from_string=True) alternative = Junction.intersect(exon) splicing_type.append("pfIntronRetention") m6A_string.append(m6AORnot(alternative)) if len(set(m6A_string)) == 1 and len(splicing_type) > 0: fo_pfIR.write("{0}\t{1}\t{2}\t{3}\t{4}\n".format( "|".join(set(splicing_type)), "|".join(set(m6A_string)), k1, k2, v2))
def get_enrichment(peakfile, length): peaks = BedTool(peakfile) region = BedTool([ Interval(peaks[0].chrom, args.start, args.end, strand='+', score='0', name='region') ]) ilen = len(region[0]) olen = length - ilen outside = [ float(x.attrs['topcoverage']) for x in peaks.intersect(region, f=0.5, v=True) ] inside = [ float(x.attrs['topcoverage']) for x in peaks.intersect(region, f=0.5, u=True) ] if (len(outside)): normed_count = (len(inside) / ilen) / (len(outside) / olen) normed_sum = (sum(inside) / ilen) / (sum(outside) / olen) overrepresented = int(normed_count > 2 or normed_sum > 2) else: normed_count = float('nan') normed_sum = float('nan') overrepresented = -1 return "%s\t%d\t%d\t%1.1f\t%1.1f\t%1.3f\t%1.3f\t%1.3f\t%1.3f\t%1.3f\t%1.3f\t%d" % ( os.path.basename(peakfile).split(".")[0], len(inside), len(outside), sum(inside), sum(outside), np.mean(inside), np.mean(outside), np.median(inside), np.median(outside), normed_count, normed_sum, overrepresented)
def count_stitch(cnv,probefh): to_stitch=[] first='' probes = BedTool(probefh).sort() cnvbed = BedTool(list(set(cnv))).sort(stream=True) cnv = toList(cnvbed) for i in xrange(len(cnv)): if i == 0: first=cnv[i] elif i > 0: (c1,s1,e1,cl1,id1,cf1) = first (c2,s2,e2,cl2,id2,cf2) = cnv[i] if cl1 != cl2: first = cnv[i] elif c1 != c2: first = cnv[i] else: g1 = int(e1)+1 g2 = int(s2)-1 if int(e1) == int(s2)-1: g1=int(e1) g2=int(s2) probe_spans=[] probe_spans.append(len(probes.intersect(BedTool(' '.join(map(str,(c1,s1,e1))),from_string=True),wa=True,u=True,stream=True))) probe_spans.append(len(probes.intersect(BedTool(' '.join(map(str,(c2,s2,e2))),from_string=True),wa=True,u=True,stream=True))) max_span = max(probe_spans) if len(probes.intersect(BedTool(' '.join(map(str,(c1,g1,g2))),from_string=True),wa=True,u=True,stream=True)) <= float(max_span)*0.5: to_stitch.append(i) to_stitch.sort() to_stitch = [[v[1] for v in vals] for _, vals in itertools.groupby(enumerate(to_stitch), key=lambda x: x[1] - x[0])] return len(to_stitch)
def rebin_step1(): if url == False: A = BedTool(input_signal) B = BedTool(bins) AB = A.intersect(B, wo=True) AB_inv = B.intersect(A, v=True) return AB, AB_inv elif url == True: to_download = "'" + input_signal + "'" command = "wget " + to_download p = subprocess.Popen(shlex.split(command), stdout=subprocess.PIPE) stdout, stderr = p.communicate() ninput_signal = os.path.basename(input_signal) if bigWig == True: run_bash("./kent_binaries/bigWigToBedGraph " + ninput_signal + " " + ninput_signal.replace(".bigWig", ".bedGraph").replace( ".bigwig", ".bedGraph")) run_bash("rm " + ninput_signal) ninput_signal = ninput_signal.replace(".bigWig", ".bedGraph").replace( ".bigwig", ".bedGraph") A = BedTool(ninput_signal) B = BedTool(bins) AB = A.intersect(B, wo=True) AB_inv = B.intersect(A, v=True) run_bash("rm " + ninput_signal) return AB, AB_inv
def load_beddata(genome, bed_file, use_meta, use_gencode, input_dir, is_sorted, chrom=None):
    """Load a BED file of candidate windows and build a test-data iterator.

    Windows overlapping the blacklist are removed; optionally the set is
    restricted to one chromosome and/or each window's metadata gains six
    boolean GENCODE-overlap flags (CpG island, CDS, intron, promoter,
    5'UTR, 3'UTR).

    Returns (bigwig_names, meta_names, datagen_bed, nonblacklist_bools).
    """
    bed = BedTool(bed_file)
    if not is_sorted:
        print('Sorting BED file')
        bed = bed.sort()
        is_sorted = True
    blacklist = make_blacklist()
    print('Determining which windows are valid')
    # Count blacklist overlaps per window (c=True) to build the validity mask.
    bed_intersect_blacklist_count = bed.intersect(blacklist, wa=True, c=True,
                                                  sorted=is_sorted)
    if chrom:
        nonblacklist_bools = np.array([i.chrom == chrom and i.count == 0
                                       for i in bed_intersect_blacklist_count])
    else:
        nonblacklist_bools = np.array([i.count == 0
                                       for i in bed_intersect_blacklist_count])
    print('Filtering away blacklisted windows')
    bed_filtered = bed.intersect(blacklist, wa=True, v=True, sorted=is_sorted)
    if chrom:
        print('Filtering away windows not in chromosome:', chrom)
        bed_filtered = subset_chroms([chrom], bed_filtered)
    print('Generating test data iterator')
    bigwig_names, bigwig_files_list = load_bigwigs([input_dir])
    bigwig_files = bigwig_files_list[0]
    if use_meta:
        meta_names, meta_list = load_meta([input_dir])
        meta = meta_list[0]
    else:
        meta = []
        meta_names = None
    shift = 0
    if use_gencode:
        cpg_bed = BedTool('resources/cpgisland.bed.gz')
        cds_bed = BedTool('resources/wgEncodeGencodeBasicV19.cds.merged.bed.gz')
        intron_bed = BedTool('resources/wgEncodeGencodeBasicV19.intron.merged.bed.gz')
        promoter_bed = BedTool('resources/wgEncodeGencodeBasicV19.promoter.merged.bed.gz')
        utr5_bed = BedTool('resources/wgEncodeGencodeBasicV19.utr5.merged.bed.gz')
        utr3_bed = BedTool('resources/wgEncodeGencodeBasicV19.utr3.merged.bed.gz')
        # Per-window overlap counts for each annotation class.
        peaks_cpg_bedgraph = bed_filtered.intersect(cpg_bed, wa=True, c=True)
        peaks_cds_bedgraph = bed_filtered.intersect(cds_bed, wa=True, c=True)
        peaks_intron_bedgraph = bed_filtered.intersect(intron_bed, wa=True, c=True)
        peaks_promoter_bedgraph = bed_filtered.intersect(promoter_bed, wa=True, c=True)
        peaks_utr5_bedgraph = bed_filtered.intersect(utr5_bed, wa=True, c=True)
        peaks_utr3_bedgraph = bed_filtered.intersect(utr3_bed, wa=True, c=True)
        # Append the six counts (as booleans) to each window's meta vector.
        data_bed = [(window.chrom, window.start, window.stop, 0, bigwig_files,
                     np.append(meta,
                               np.array([cpg.count, cds.count, intron.count,
                                         promoter.count, utr5.count, utr3.count],
                                        dtype=bool)))
                    for window, cpg, cds, intron, promoter, utr5, utr3 in
                    itertools.izip(bed_filtered, peaks_cpg_bedgraph,
                                   peaks_cds_bedgraph, peaks_intron_bedgraph,
                                   peaks_promoter_bedgraph, peaks_utr5_bedgraph,
                                   peaks_utr3_bedgraph)]
    else:
        data_bed = [(window.chrom, window.start, window.stop, shift,
                     bigwig_files, meta) for window in bed_filtered]
    #from data_iter import DataIterator
    from data_iter import DataIterator
    bigwig_rc_order = get_bigwig_rc_order(bigwig_names)
    datagen_bed = DataIterator(data_bed, genome, 100, L, bigwig_rc_order,
                               shuffle=False)
    return bigwig_names, meta_names, datagen_bed, nonblacklist_bools
def main(): out_dir = Path("../output/chipseq-wf/intersections") out_dir.mkdir(exist_ok=True) ref = "../output/chipseq-wf/dmel-all-r6.26_genes.bed" ref_bt = BedTool(ref).slop(b=1_000, genome="dm6") for file_name in Path("../output/chipseq-wf/bed").iterdir(): file_out = out_dir / file_name.name bt = BedTool(file_name) bt.intersect(ref_bt, wb=True).saveas(file_out)
def get_binned_modules(ma=None, a=annotations450, b='lola_vignette_data/activeDHS_universe.bed', include_last=False, min_capsule_len=2000): allcpgs = ma.beta.columns.values a = BedTool(a) b = BedTool(b) # a.saveas('a.bed') # b.saveas('b.bed') a_orig = a df = BedTool(a).to_dataframe() df.iloc[:, 0] = df.iloc[:, 0].astype(str) #.map(lambda x: 'chr'+x.split('.')[0]) df = df.set_index('name').loc[list( ma.beta)].reset_index().iloc[:, [1, 2, 3, 0]] a = BedTool.from_dataframe(df) # df_bed=pd.read_table(b,header=None) # df_bed['features']=np.arange(df_bed.shape[0]) # df_bed=df_bed.iloc[:,[0,1,2,-1]] # b=BedTool.from_dataframe(df) # a=BedTool.from_dataframe(df_bed)#('lola_vignette_data/activeDHS_universe.bed') df_bed = BedTool(b).to_dataframe() if df_bed.shape[1] < 4: df_bed['features'] = np.arange(df_bed.shape[0]) b = BedTool.from_dataframe(df_bed) try: c = b.intersect(a, wa=True, wb=True).sort() # c.saveas('c.bed') d = c.groupby(g=[1, 2, 3, 4], c=(8, 8), o=('count', 'distinct')) except: df = BedTool(a_orig).to_dataframe() df.iloc[:, 0] = df.iloc[:, 0].astype(str).map( lambda x: 'chr' + x.split('.')[0]) df = df.set_index('name').loc[list( ma.beta)].reset_index().iloc[:, [1, 2, 3, 0]] a = BedTool.from_dataframe(df) c = b.intersect(a, wa=True, wb=True).sort() # c.saveas('c.bed') d = c.groupby(g=[1, 2, 3, 4], c=(8, 8), o=('count', 'distinct')) #d.saveas('d.bed') df2 = d.to_dataframe() df3 = df2.loc[df2.iloc[:, -2] > min_capsule_len] modules = [cpgs.split(',') for cpgs in df3.iloc[:, -1].values] modulecpgs = np.array(list(set(list(reduce(lambda x, y: x + y, modules))))) if include_last: missing_cpgs = np.setdiff1d(allcpgs, modulecpgs).tolist() final_modules = modules + ([missing_cpgs] if include_last else []) module_names = (df3.iloc[:, 0] + '_' + df3.iloc[:, 1].astype(str) + '_' + df3.iloc[:, 2].astype(str)).tolist() return final_modules, modulecpgs.tolist(), module_names
def intersector(sampfile, filtfile, specfr): ###Takes two bed files and will intersect them. The specfr argument is used to tell whether to keep or remove the regions that overlap in the two files given.### sf = BedTool(sampfile) ff = BedTool(filtfile) if specfr == 'Keep': endfile = sf.intersect(ff, u=True) elif specfr == 'Discard': endfile = sf.intersect(ff, u=False) else: print "ERROR: Incorrect specifying argument given, please use 'Keep' or 'Discard'." return endfile
def generate_background(foipath,gfpath,background): """accepts a background filepath generate a background and returns as a pybedtool. Replaces the chrom fields of the foi and the gf with the interval id from the background. """ bckg = background bckgnamed = "" interval = 0 #inserts a unique interval id into the backgrounds name field for b in bckg: bckgnamed += "\t".join(b[:3])+'\t{}\t'.format(interval) + "\t".join(b[4:]) + "\n" interval += 1 bckg = BedTool(bckgnamed,from_string=True) foi = BedTool(str(foipath)) gf = BedTool(str(gfpath)) # get the interval names from the background that the gf intersects with gf = bckg.intersect(gf) gfnamed = "" # insert the interval id into the chrom field of the gf and creates a new bedtool for g in gf: gfnamed += '{}\t'.format(g.name) + "\t".join(g[1:]) + "\n" #print "GFNAMED: " + str(g) gf = BedTool(gfnamed,from_string=True) #print "GFBEDTOOL: " + str(g) # inserts the interval id into the chrom column of the foi and creates a new bedtool foi = bckg.intersect(foi) foinamed = "" for f in foi: foinamed += '{}\t'.format(f.name) + "\t".join(f[1:])+"\n" #print "FOINAMED: " + str(f) foi = BedTool(foinamed,from_string=True) #print "FOIBEDTOOL: " + str(f) bckgnamed = "" for b in bckg: bckgnamed += '{}\t'.format(b.name) + "\t".join(b[1:])+"\n" bckg = BedTool(bckgnamed,from_string=True) # converts the background to a genome dictionary chrstartend = [(g.start,g.end) for g in bckg] background = dict(zip([g.chrom for g in bckg],chrstartend)) return {"foi": foi,"gf":gf,"background":background} run_pvalue=False,run_pybedtool=False,run_jaccard=False,run_proximity=False,run_kolmogorov=False
def bed_intersect(cpg, data): """ using pybedtools, perform an intersection on our cpg file of interest and cpg count observation file :param cpg: cpg file :param data: count file :return: None """ output_name = cpg.replace(".bed", "_results.txt") # make results file # make bedtool objects out of cpg and data files a = BedTool(cpg) b = BedTool(data) # perform a left outer join intersect and move to an output file a.intersect(b, loj=True).moveto(output_name)
def gene_regions(vf, af): print "inside gene regions" v = BedTool(vf) feats = BedTool(af) # first establish all the columns in the annotation file cols = set(f[4] for f in feats) results = {} intersection = v.intersect(feats, wb=True) if len(intersection) > 0: annots = intersection.groupby(g=[1,2,3,4], c=9, o='collapse') for entry in annots: regions = {} for region in entry[4].split(','): if region in regions: regions[region] += 1 else: regions[region] = 1 results[entry.name] = Series(regions) df = DataFrame(results, index = cols) print "exiting gene regions" return df.T.fillna(0)
def get_feat(self, _input): snp_dfm = _input.loc[:, ['chrom', 'chromStart', 'chromEnd', 'name']] snp_bed_obj = BedTool(snp_dfm.to_string(index=False, header=False, index_names=False), from_string=True) seg_bed_fn = os.path.join(self.src_data_dir, self.src_data_fn) seg_bed_obj = BedTool(seg_bed_fn) results = {} # The 'intersect' operation is not 'left-join' style so its result might have less entries than the SNP bed intersection = snp_bed_obj.intersect(seg_bed_obj, wb=True) if len(intersection) > 0: annots = intersection.groupby(g=[1, 2, 3, 4], c=8, o='collapse') for entry in annots: results[entry.name] = pd.Series(entry[4].split(',')).value_counts() names = { 'CTCF': 'CTCF_REG', 'E': 'ENH', 'PF': 'TSS_FLANK', 'R': 'REP', 'T': 'TRAN', 'TSS': 'TSS', 'WE': 'WEAK_ENH' } gwava_dfm = pd.DataFrame(results, index=names.keys()).T.rename(columns=names) snp_dfm = snp_dfm.merge(gwava_dfm, how='left', left_on='name', right_index=True, copy=True) return snp_dfm.fillna(0).drop(['chrom', 'chromStart', 'chromEnd'], axis=1)
def coverage(source: BedTool, to_intersect: BedTool, presorted: bool = True) -> [float]: """ For each interval in the source compute coverage by to_intersect features. Source intervals must be non overlapping """ if not presorted: source, to_intersect = source.sort(), to_intersect.sort() intersection = source.intersect(to_intersect, wao=True) intersection, source = list(intersection), list(source) ind = 0 curinter = intersection[ind] curcov = float(curinter.fields[-1]) coverage = [] assert source[ ind] == curinter, f"Fail: expected {source[ind]}, got {curinter}" for inter in intersection: if inter == curinter: curcov += float(inter.fields[-1]) else: coverage.append(curcov / curinter.length) assert coverage[-1] <= 1 curinter = inter curcov = float(curinter.fields[-1]) ind += 1 assert curinter == source[ ind], f"Fail: expected {source[ind]}, got {curinter}" coverage.append(curcov / curinter.length) assert len(coverage) == len(source) return coverage
def calculate_ovl(nbedfile, obedfile, opts, scoresfile): nbedtool = BedTool(nbedfile) obedtool = BedTool(obedfile) ab = nbedtool.intersect(obedtool, wao=True, f=opts.f, r=opts.r, s=opts.s) cmd = """cut -f4,5,10,13 | awk -F $'\t' 'BEGIN { OFS = FS } ($3 != "."){ print $1,$3,$2,$4; }'""" sh(cmd, infile=ab.fn, outfile=scoresfile)
def load_1kgp(self,raw=None,svtype=None,gen=None,tmp_bed=None): sv = BedTool([(format_chrom(x[0]),x[1],x[2],x[3]) for x in raw if svtype in str(x[3])]).sort() sv.intersect('{}annotation_files/{}_1000Genomes_{}.bed'.format(Config().resource_path(),gen,svtype), f=0.8, F=0.8, wao=True,output=tmp_bed) with open(tmp_bed,'r') as f: for l in f: x = tuple(l.rstrip().split('\t')) locus = tokenize_sv(x)+(str(x[3]),) ovr = int(x[-1]) if ovr==0: continue ovr = format(float(x[len(x)-1])/(int(x[2])-int(x[1])),'.2f') if self._1kgp.get(locus)==None: self._1kgp[locus]=(x[len(x)-2],ovr) elif self._1kgp.get(locus)!=None and float(ovr) > float(self._1kgp[locus][1]): self._1kgp[locus]=(x[len(x)-2],ovr) else: continue os.remove(tmp_bed)
def load_chip_multiTask(input_dir): tfs, chip_beds, merged_chip_bed = get_chip_beds(input_dir) print('Removing peaks outside of X chromosome and autosomes') chroms, chroms_sizes, genome_bed = get_genome_bed() merged_chip_bed = merged_chip_bed.intersect(genome_bed, u=True, sorted=True) print('Windowing genome') genome_windows = BedTool().window_maker(g=genome_sizes_file, w=genome_window_size, s=genome_window_step) print('Extracting windows that overlap at least one ChIP interval') positive_windows = genome_windows.intersect(merged_chip_bed, u=True, f=1.0*(genome_window_size/2+1)/genome_window_size, sorted=True) # Exclude all windows that overlap a blacklisted region blacklist = make_blacklist() print('Removing windows that overlap a blacklisted region') positive_windows = positive_windows.intersect(blacklist, wa=True, v=True, sorted=True) num_positive_windows = positive_windows.count() # Binary binding target matrix of all positive windows print('Number of positive windows:', num_positive_windows) print('Number of targets:', len(tfs)) # Generate targets print('Generating target matrix of all positive windows') y_positive = parmap.map(intersect_count, chip_beds, positive_windows.fn) y_positive = np.array(y_positive, dtype=bool).T print('Positive matrix sparsity', (~y_positive).sum()*1.0/np.prod(y_positive.shape)) merged_chip_slop_bed = merged_chip_bed.slop(g=genome_sizes_file, b=genome_window_size) # Later we want to gather negative windows from the genome that do not overlap # with a blacklisted or ChIP region nonnegative_regions_bed = merged_chip_slop_bed.cat(blacklist) return tfs, positive_windows, y_positive, nonnegative_regions_bed
def gene_regions(vf, af): v = BedTool(vf) feats = BedTool(af) # first establish all the columns in the annotation file cols = set(f[4] for f in feats) results = {} intersection = v.intersect(feats, wb=True) if len(intersection) > 0: annots = intersection.groupby(g=[1,2,3,4], c=9, ops='collapse') for entry in annots: regions = {} for region in entry[4].split(','): if region in regions: regions[region] += 1 else: regions[region] = 1 results[entry.name] = Series(regions) df = DataFrame(results, index = cols) return df.T.fillna(0)
def cnv_format(self): samples_sv = self.parse_vcf() for sample in samples_sv: if os.path.exists(os.path.join(self.module, sample)): pass else: os.mkdir(os.path.join(self.module, sample)) df = pd.DataFrame() df['Samples'] = [sample for i in range( len(samples_sv[sample]['Chrom']))] df['Chrom'] = samples_sv[sample]['Chrom'] df['Start'] = samples_sv[sample]['Start'] df['End'] = samples_sv[sample]['End'] df['Length'] = samples_sv[sample]['Length'] df['Type'] = samples_sv[sample]['Type'] df['CN'] = samples_sv[sample]['CN'] df.to_csv('{module}/{sample}/cnv_igv.seg'.format(module=self.module, sample=sample), index=False, header=True, sep='\t') df = df.drop('Samples', axis=1) # df=df.drop('CN',axis=1) df.to_csv('{module}/{sample}/cnv.bed'.format(module=self.module, sample=sample), index=False, header=False, sep='\t') self.cnv_plot(df, sample) cnv = BedTool( '{module}/{sample}/cnv.bed'.format(module=self.module, sample=sample)) gene = BedTool(self.genecode) intersect = cnv.intersect(gene, wb=True, f=0.5) intersect_gene_count = self.gene_count(intersect) # e.g chr1 1406909 1406998 18358 DUP chr1 1406909 1406998 MRPL20 intersect.moveto( '{module}/{sample}/cnv.annotatedGenecodeV31.bed'.format(module=self.module, sample=sample)) self.cnv_stat(df, sample, intersect_gene_count)
def segmentations(vf, af): v = BedTool(vf) feats = BedTool(af) results = {} intersection = v.intersect(feats, wb=True) if len(intersection) > 0: sort_cmd1 = 'awk -F \'\t\' \'{print $1"\t"$2"\t"$3"\t"$4"\t"$8"\t"$5"_"$6"_"$7"_"$8"_"$9}\' %s 1<>%s' % (intersection.fn, intersection.fn) call(sort_cmd1, shell=True) annots = intersection.groupby(g=[1,2,3,4,5], c=6, ops='collapse') for entry in annots: regions = {} regions[entry[4]] = entry[5] results[entry.name] = Series(regions) names = { 'CTCF': 'CTCF_REG', 'E': 'ENH', 'PF': 'TSS_FLANK', 'R': 'REP', 'T': 'TRAN', 'TSS': 'TSS', 'WE': 'WEAK_ENH' } return DataFrame(results, index=names.keys()).T.rename(columns=names)
def _iter_pairwise_connections(
        clusterable_bedtool: pybedtools.BedTool,
        min_reciprocal_overlap: float,
        min_sample_overlap: float = 0,
        is_carrier: Mapping[Text, numpy.ndarray] = MappingProxyType({})
) -> Iterator[Tuple[Text, Text]]:
    """
    Iterate over pairs of variant intervals that meet minimum requirement for reciprocal overlap. Exclude
    self-overlaps. Optionally impose requirement of minimum Jaccard index for carrier samples.
    Parameters
    ----------
    clusterable_bedtool: BedTool
        bed object with intervals that may overlap each other
    min_reciprocal_overlap: float
        minimum reciprocal overlap for two intervals to be connected
    min_sample_overlap: float (default=0)
        minimum Jaccard index of carrier samples for two intervals to be connected
    is_carrier: Mapping[Text, numpy.ndarray]
        map from variant ID to carrier status (array boolean True/False for each sample)
    Yields
    -------
    variant_id_1, variant_id_2: Tuple[Text, Text]
        successive pairs of variant IDs that meet the overlap requiremnts
    """
    if len(clusterable_bedtool) == 0:
        return
    # Self-intersect with -f/-r to find reciprocal overlaps.
    overlap_bedtool = clusterable_bedtool.intersect(
        clusterable_bedtool, f=min_reciprocal_overlap, r=True,
        wa=True, wb=True, sorted=True, nonamecheck=True)
    # The B record's fields start after all of the A record's fields.
    offset = clusterable_bedtool.field_count()
    name_1_field, sv_type_1_field = name_field, sv_type_field
    name_2_field, sv_type_2_field = offset + name_field, offset + sv_type_field
    check_samples = min_sample_overlap > 0
    for overlap in overlap_bedtool:
        fields = overlap.fields
        if fields[sv_type_1_field] != fields[sv_type_2_field]:
            continue  # only cluster same sv_type
        name_1 = fields[name_1_field]
        name_2 = fields[name_2_field]
        if name_1 == name_2:
            continue  # exclude self-overlap
        if check_samples and jaccard_index(
                is_carrier[name_1], is_carrier[name_2]) < min_sample_overlap:
            continue  # insufficient carrier-sample agreement
        yield name_1, name_2
def snp_freq_by_window(stat_df, group_label, window_file, outdir): groups = stat_df.columns[3:] alt_freq_stat_bed = outdir / f'{group_label}.snp.plot.bed' if not alt_freq_stat_bed.is_file(): alt_freq_stat_df = stat_df.copy() alt_freq_stat_df.loc[:, 'start'] = alt_freq_stat_df.Pos - 1 bed_cols = ['Chr', 'start', 'Pos', 'Alt'] bed_cols.extend(groups) alt_freq_stat_df.to_csv(alt_freq_stat_bed, sep='\t', columns=bed_cols, header=None, index=False) window_bed = BedTool(str(window_file)) snp_bed = BedTool(str(alt_freq_stat_bed)) intersect_obj = window_bed.intersect(snp_bed, sorted=True, wo=True) intersect_obj_cols = ['Chrom', 'Start', 'End'] intersect_obj_cols.extend(['snp_Chr', 'snp_start', 'Pos', 'Alt']) intersect_obj_cols.extend(groups) intersect_obj_cols.append('overlap') intersect_str = StringIO(str(intersect_obj)) intersect_df = pd.read_csv(intersect_str, sep='\t', header=None, names=intersect_obj_cols) intersect_df.drop(['snp_Chr', 'snp_start', 'overlap'], axis=1, inplace=True) return intersect_df
def Cluster2A5SSorA3SS(Cluster): ''' 'clu_5077': {'chrX\t166458208\t166462315': 0.0195360774825754, 'chrX\t166458343\t166462315': -0.0195360774825754} ''' for k1, v1 in Cluster.items(): if len(v1) == 2: bed2list = [] for k2, v2 in v1.items(): bed2list.append(Bed(k2)) bed2list.sort(key=sortbylength) short, longer = bed2list alternative = "" ss_type = "" if short.start == longer.start: alternative = BedTool(short.chr + "\t" + str(short.end) + "\t" + str(longer.end) + "\t" + k1 + "alternative" + "\t0\t" + short.strand, from_string=True) elif short.end == longer.end: alternative = BedTool(short.chr + "\t" + str(longer.start) + "\t" + str(short.start) + "\t" + k1 + "alternative" + "\t0\t" + short.strand, from_string=True) else: pass X = alternative.intersect(m6A_bed) longer = longer.chr + "\t" + str(longer.start) + "\t" + str( longer.end) if len(X) >= 1: yield "A5SS_A3SS", "m6A", k1, v1[longer] else: yield "A5SS_A3SS", "nom6A", k1, v1[longer]
def generate_bed_file_annotations(bed_directory, output_directory, loci): """ Generates the annotation file for every bed file in the bed_directory folder """ # Loop over the bed files in the bed directory. bed_file_list = glob.glob(os.path.join(bed_directory, "*.bed")) logging.info("Start to generate BED file annotations") logging.info("Writing annotation to: {0}/".format(output_directory)) for locus in loci: zscore = os.path.join(output_directory, locus) bed_lines, rsids = _bed_from_zscore(zscore) tmp_bed = open("tmp.bed","w").writelines(bed_lines) snps = BedTool("tmp.bed") no_snps = _get_line_number(zscore) a_matrix= AnnotateLociMatrix(len(bed_file_list), no_snps) logging.info("Annotating locus: {0}, using VCF file {1}".format(locus, zscore)) for beds in bed_file_list: test_annotation = BedTool(beds) inter = snps.intersect(test_annotation) idxs = [] for inte in inter: idxs.append(rsids.index(inte.name)) zeroes = np.zeros(len(rsids)) for idx in idxs: zeroes[idx] = 1 a_matrix.add_annotation(zeroes, beds) annotations_file = os.path.join(output_directory, locus + ".annotations") logging.info("Writing annotation matrix to: {0}".format(annotations_file)) a_matrix.write_annotations(annotations_file) os.remove("tmp.bed")
def bed_overlap(ref_bed_filepath, test_bed_filepath, dir_out='.'): """ Given two bed files, ref_bed and test_bed, perform bedtools intersect -c to return the number of test_bed regions that overlap any regions in the ref_bed file. Returns the ref_bed file with a column of overlap counts """ cwd = os.getcwd() # specify the reference bed file ref_bedtool = BedTool(ref_bed_filepath) prfx_ref = ref_bed_filepath.split('/')[-1] prfx_ref = prfx_ref.split('.')[0] # specify the new ClinVar bed file test_bedtool = BedTool(test_bed_filepath) prfx_test = test_bed_filepath.split('/')[-1] prfx_test = prfx_test.split('.')[0] # specify name/path of output bed file bed_out = dir_out + '/{}_IN_{}.bed'.format(prfx_test, prfx_ref) # run bedtools intersect to get all test_bed regions NOT found in ref_bed (-v option) ref_in_test = test_bedtool.intersect(b=ref_bedtool, c=True) # save the bed overlap file ref_in_test.saveas(bed_out) # confirm file saved if os.path.isfile(bed_out): print('Success!\nFile saved to: \n{}.\n'.format( os.path.join(cwd, bed_out))) return (ref_in_test)
def vcf_to_df_worker(arg): """Convert CANVAS vcf to a dict, single thread""" canvasvcf, exonbed, i = arg logging.debug("Working on job {}: {}".format(i, canvasvcf)) samplekey = op.basename(canvasvcf).split(".")[0].rsplit("_", 1)[0] d = {"SampleKey": samplekey} exons = BedTool(exonbed) cn = parse_segments(canvasvcf) overlaps = exons.intersect(cn, wao=True) gcn_store = {} for ov in overlaps: # Example of ov.fields: # [u'chr1', u'11868', u'12227', u'ENSG00000223972.5', # u'ENST00000456328.2', u'transcribed_unprocessed_pseudogene', # u'DDX11L1', u'.', u'-1', u'-1', u'.', u'0'] gene_name = "|".join((ov.fields[6], ov.fields[3], ov.fields[5])) if gene_name not in gcn_store: gcn_store[gene_name] = defaultdict(int) cn = ov.fields[-2] if cn == ".": continue cn = int(cn) if cn > 10: cn = 10 amt = int(ov.fields[-1]) gcn_store[gene_name][cn] += amt for k, v in sorted(gcn_store.items()): v_mean, v_median = counter_mean_and_median(v) d[k + ".avgcn"] = v_mean d[k + ".medcn"] = v_median cleanup() return d
def generate_curve(bedfile, chromosome, region_start, region_stop): bedtool = BedTool(bedfile) region_of_interest = BedTool(chromosome + ' ' + str(region_start) + ' ' + str(region_stop), from_string=True) plot_region = region_of_interest.intersect(bedtool) domain = np.arange(region_start, region_stop + 1) values = np.zeros(domain.shape) for interval in plot_region: if (interval.start < region_start): start = region_start else: start = interval.start if (interval.end > region_stop): finish = region_stop else: finish = interval.end start_i = start - domain[0] finish_i = finish - domain[-1] values[start_i:finish_i] = values[start_i:finish_i] + 1 return (domain, values)
def compute_fold_change(exp1, exp2): peaks_file = meta_data.peaks_file(exp1); raw_file1 = meta_data.raw_bed_file(exp1); raw_file2 = meta_data.raw_bed_file(exp2); peaks = BedTool(peaks_file); raw1 = BedTool(raw_file1); raw2 = BedTool(raw_file2); coverage_1 = peaks.intersect(raw1, c=True); coverage_2 = peaks.intersect(raw2, c=True); output_bed = list(); #Bad way to do this, but I'm having trouble writing #to a file while iterating over BedTools for i1, i2 in zip(coverage_1, coverage_2): if(i1.count == 0): cstring = "-1"; else: cstring = str.format("{:f}",i2.count / i1.count); line = i1.chrom + "\t" + str(i1.start) + "\t" + str(i1.end) + "\t" + cstring + "\n"; output_bed.append(line); out_directory = meta_data.directory_foldchange(exp2); if(not os.path.isdir(out_directory)): os.mkdir(out_directory); out_file = out_directory + os.sep + "foldchange.bed"; with open(out_file, 'w') as fout: for line in output_bed: fout.write(line);
def main(): p = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) p.add_argument('bed', help='bed with miRNA as name') p.add_argument('--reference-beds', dest='reference', nargs='+', help='reference beds for each feature to annotate') p.add_argument('--names', nargs='+', help='names corresponding to reference files') args = p.parse_args() if not args.names and not args.reference: sys.exit(p.print_help()) bed = BedTool(args.bed) # create the reference beds reference = {} for refname, refbed in izip(args.names, args.reference): reference[refname] = BedTool(refbed) for refname in args.names: # intersect the mirna bed with the reference annotations for b in bed.intersect(reference[refname], s=True, stream=True): # Cytoscape formatting fields = (b.name, "=", refname) print " ".join(map(str, fields))
def main(): args = parse_args() naive_overlap_sorted_merged = BedTool(args.naive_overlap_sorted_merged) sample_optimal_sets = list(args.sample_optimal_sets) num_samples = len(sample_optimal_sets) sample_bedtools = [BedTool(i) for i in sample_optimal_sets] region_to_sample = dict() for sample_index in range(len(sample_bedtools)): sample = sample_bedtools[sample_index] sample_name = sample_optimal_sets[sample_index] print(sample_name) #intersect with the naive_overlap_sorted_merged peak set intersections = naive_overlap_sorted_merged.intersect(sample, wa=True) for intersection in intersections: str_intersection = str(intersection) if str_intersection not in region_to_sample: region_to_sample[str_intersection] = [sample_name] else: region_to_sample[str_intersection].append(sample_name) output_files = dict() output_files['shared'] = open(args.outf_prefix + ".shared", 'w') for sample_name in sample_optimal_sets: output_files[sample_name] = open(args.outf_prefix + "." + sample_name, 'w') for region in region_to_sample: if len(region_to_sample[region]) == 1: #unique to one sample cur_sample = region_to_sample[region][0] output_files[cur_sample].write(region) elif len(region_to_sample[region]) == num_samples: output_files['shared'].write(region)
def _bed_intersection(bed: pybedtools.BedTool, path, g, region_index, bed_sorted, fraction=0.2):
    """Return a boolean Series over `region_index` marking which regions of
    `bed` overlap the query regions loaded from `path` by at least `fraction`.

    :param bed: pre-sorted BedTool of candidate regions; their name column is
        matched against region_index
    :param path: source of the query regions (handed to _region_bed_sorted)
    :param g: genome file passed to bedtools for the sorted intersection
    :param region_index: pd.Index of all region names; the result is indexed by it
    :param bed_sorted: whether the query bed is already sorted (see _region_bed_sorted)
    :param fraction: minimum overlap fraction (bedtools -f) required to count a hit
    """
    with warnings.catch_warnings():
        # pybedtools / pandas emit noisy warnings on empty intersections
        warnings.simplefilter("ignore")
        query_bed = _region_bed_sorted(path, g, bed_sorted)
        try:
            df = bed.intersect(query_bed, wa=True, f=fraction, g=g, sorted=True).to_dataframe()
            if df.shape[0] == 0:
                regions_idx = pd.Series([])
            else:
                regions_idx = df["name"]
        except pd.errors.EmptyDataError:
            # to_dataframe() raises when the intersection output file is empty
            regions_idx = pd.Series([])
        regions = pd.Index(regions_idx.values)
        bool_series = pd.Series(region_index.isin(regions), index=region_index)
        # drop the temp file created for the query bed
        query_bed.delete_temporary_history(ask=False)
        return bool_series
def gene_regions(vf, af):
    """Annotate variants in `vf` with the gene regions from `af`.

    Intersects the variant bed with the annotation bed and collapses, per
    variant and region type, the annotation identifiers into one
    comma-separated field. Returns a DataFrame indexed by variant name with
    one column per region type found in `af`, 0 where a variant has no
    annotation of that type.
    """
    v = BedTool(vf)
    feats = BedTool(af)
    # first establish all the columns in the annotation file
    cols = set(f[4] for f in feats)
    results = {}
    intersection = v.intersect(feats, wb=True)
    if len(intersection) > 0:
        # FIX: tempfile.mktemp() is deprecated and racy (the name can be taken
        # between creation and use); NamedTemporaryFile(delete=False) actually
        # creates the file.
        tmp = tempfile.NamedTemporaryFile(delete=False)
        tmp.close()
        tempfile1 = tmp.name
        # keep variant coords/name + region type, plus a composite id of the
        # annotation fields
        sort_cmd2 = 'awk -F \'\t\' \'{print $1"\t"$2"\t"$3"\t"$4"\t"$9"\t"$5"_"$6"_"$7"_"$8"_"$9}\' %s > %s' % (intersection.fn, tempfile1)
        call(sort_cmd2, shell=True)
        intersection = BedTool(tempfile1)
        # collapse annotation ids per (chrom, start, stop, name, region type)
        annots = intersection.groupby(g=[1,2,3,4,5], c=6, ops='collapse')
        for entry in annots:
            regions = {}
            regions[entry[4]] = entry[5]
            results[entry.name] = Series(regions)
    df = DataFrame(results, index = cols)
    return df.T.fillna(0)
def annotate_peaks(notsif, beds, names):
    """Takes notsif, transforms to bed, and outputs annotation of where the
    miRNA seed is interrogating via Cytoscape edge attribute file.
    """
    strand = find_strand_from_filename(notsif)
    mirna_bed = BedTool(notsif_to_bed(notsif, strand), from_string=True)
    # create the reference beds
    reference = {}
    for name, bed in zip(names, beds):
        reference[name] = BedTool(bed)
    for name in names:
        # intersect the mirna bed with the reference annotations (same strand)
        for hit in mirna_bed.intersect(reference[name], s=True, stream=True):
            # name field returned from notsif_to_bed is delimited by "|"
            mirna_name = hit.name.split("|")[0]
            gene_name = hit.name.split("|")[1]
            # Cytoscape formatting; the bed score carries the seed length
            seed_length = "(%s)" % hit.score
            fields = (mirna_name, seed_length, gene_name, "=", name)
            print(" ".join(map(str, fields)))
def getGenesOverlappingRegion(rec, genes):
    """
    :param rec: pysam vcf record
    :param genes: BedTool object with all the gene annotations in bed format
    :return: string repr of the set of overlapping gene name(s), or NOGENE
    """
    NOGENE = str(set([".___."]))
    # translocations have no well-defined linear interval on one chromosome
    if rec.info['SVTYPE'] == "TRA" or ("TRA" in rec.info.keys() and rec.info['TRA']):
        return NOGENE
    chr1 = rec.chrom
    pos1 = rec.pos
    pos2 = rec.info['ENDPOSSV']
    # normalize so pos1 <= pos2 (inverted breakend order)
    if int(pos2) < int(pos1):
        pos1 = rec.info['ENDPOSSV']
        pos2 = rec.pos
    locus = BedTool(' '.join([chr1, str(pos1 - 1), str(pos2)]), from_string=True)
    isec = locus.intersect(genes, wao=True)
    # BUG FIX: `isec == ""` compared a BedTool to a string and was never true;
    # check for an empty result instead.
    if isec is None or len(isec) == 0:
        return NOGENE
    # column 6 holds ALL the overlapped gene names ("." when wao found none)
    gene = set(isec.to_dataframe().iloc[0::, 6])
    # BUG FIX: `gene` is a set, so the original comparisons against the
    # strings "." / "{'.'}" could never be true; compare against the set.
    if gene == {'.'}:
        return NOGENE
    return str(gene)
def get_coverage(bed_prefix, directory, file_prefix, bam):
    """
    Coverage at all positions is calculated. This is then used for coverage
    analysis and to determine read depth at any false negative sites
    :param bed_prefix: all regions in the bed files submitted are in a file generated during intersections
    :param directory: location of patient results
    :param file_prefix: prefix used for all files in pipeline i.e. worklist-patient
    :param bam: path to the BAM file depth is computed from
    :return out: filename for coverage stats
    """
    #TODO change BAM path so filename is not required
    print 'Generating coverage stats.'
    whole_bed = '/results/Analysis/MiSeq/MasterBED/GIAB/' + bed_prefix + '.whole.bed'
    out = directory + '/giab_results/whole_bed_coverage.txt'
    # per-base depth over the whole BED (min-coverage=0 keeps zero-depth rows
    # sambamba knows about; -q29 filters by mapping quality)
    command = '/results/Pipeline/program/sambamba/build/sambamba depth base --min-coverage=0 -q29 -m -L ' + whole_bed + \
              ' ' + bam + ' > ' + out + '.tmp'
    try:
        subprocess.check_call(command, shell=True)
    except subprocess.CalledProcessError as e:
        print 'Error executing command:' + str(e.returncode)
        exit(1)
    print 'Sambamba complete.'
    #issue with sambamba that leaves out regions that have 0 coverage - intersect regions to find missing and add
    # them to the file at coverage 0
    temp_bed = out.replace('.txt', '.bed.tmp')
    # turn per-base depth rows into 1-bp BED intervals, dropping the header row
    command = 'awk \'{print($1"\\t"$2"\\t"$2+1"\\t"$3)}\' ' + out + '.tmp | grep -v "COV" > ' + temp_bed
    print command
    try:
        subprocess.check_call(command, shell=True)
        print 'BED coordinates extracted.'
    except subprocess.CalledProcessError as e:
        print 'Error executing command:' + str(e.returncode)
        exit(1)
    coverage_bed = BedTool(temp_bed)
    print 'BED tool created'
    whole_bedtool = BedTool(whole_bed)
    print 'Intersecting'
    # v=True: regions in the master BED that sambamba reported nothing for
    missing_regions = whole_bedtool.intersect(coverage_bed, v=True)
    missing_file = directory + '/giab_results/regions_missing'
    missing_regions.moveto(missing_file)
    print 'Generating file'
    # sample id is the "worklist-patient" tail of the file prefix
    sample_split = file_prefix.split('-')
    sample = sample_split[1] + '-' + sample_split[2]
    # emit a zero-coverage row for every base of every missing region, in the
    # same column layout as the sambamba output
    command = '''while read i; do start=`echo "$i"|cut -f2`; end=`echo "$i"|cut -f3`; chr=`echo "$i"|cut -f1`; end_true=`echo "${end} - 1" | bc`; for j in $(seq $start $end_true); do new_end=`echo -e "${j} + 1" | bc`; echo -e "$chr\\t${j}\\t0\\t0\\t0\\t0\\t0\\t0\\t0\\t''' + sample + '''";done;done < ''' + missing_file + '> ' + directory + '/to_add'
    print command
    try:
        subprocess.check_call(command, shell=True)
    except subprocess.CalledProcessError as e:
        print 'Error executing command:' + str(e.returncode)
        exit(1)
    # append the synthesized zero-coverage rows to the sambamba output
    command = 'cat ' + out + '.tmp ' + directory + '/to_add > ' + out
    try:
        subprocess.check_call(command, shell=True)
    except subprocess.CalledProcessError as e:
        print 'Error executing command:' + str(e.returncode)
        exit(1)
    print 'fix complete.'
    return out
def main():
    """For each exonic peak, print the fraction of its gene's total exonic
    length separating the peak from the closest `feature` in the gtf
    (distance corrected to count only exonic bases)."""
    p = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
    p.add_argument('peaks', help='peaks bed')
    p.add_argument('exons', help='refseq exons from UCSC')
    p.add_argument('gtf', help='refseq gtf with feature of interest')
    p.add_argument('feature', help='feature of interest in the gtf')
    p.add_argument('-v', '--verbose', action="store_true", help='maximum verbosity')
    args = p.parse_args()
    if args.verbose:
        sys.stderr.write(">> building exon library...\n")
    exon_lib = make_exon_lib(args.exons)
    peaks = BedTool(args.peaks)
    exons = BedTool(args.exons)
    full_ref = BedTool(args.gtf)
    if args.verbose:
        sys.stderr.write(">> filtering for feature...\n")
    filtered_ref = full_ref.filter(lambda gtf: gtf[2] == args.feature)
    if args.verbose:
        sys.stderr.write(">> selecting exonic peaks...\n")
    # wo=True keeps both peak and exon columns for ComplexLine to parse
    exonic_peaks = peaks.intersect(exons, wo=True)
    if args.verbose:
        sys.stderr.write(">> calculating distance fractions...\n")
    # D for distance (returns negative if upstream)
    # NOTE(review): `p` is rebound here, shadowing the argparse parser above.
    for peak in exonic_peaks.closest(filtered_ref, D="a"):
        try:
            p = ComplexLine(peak)
            corrected_distance = 0.0
            total_exon_length = 0.0
            # parse gtf attrs
            gene_id = p.gtfattrs.split(';')[0].rstrip('"').lstrip('gene_id "')
            # looking downstream wrt peak
            if p.gtfdistance > 0:
                # exon with peak
                corrected_distance = p.exonstop - p.peakstop
                for exon in exon_lib[p.exoninfo.name]:
                    # add downstream exon lengths
                    if exon > p.exoninfo.number:
                        corrected_distance += exon_lib[p.exoninfo.name][exon]
            # looking upstream wrt peak
            else:
                # exon with peak
                corrected_distance = p.peakstart - p.exonstart
                for exon in exon_lib[p.exoninfo.name]:
                    # add upstream exon lengths
                    if exon < p.exoninfo.number:
                        corrected_distance += exon_lib[p.exoninfo.name][exon]
            for exon in exon_lib[p.exoninfo.name]:
                total_exon_length += exon_lib[p.exoninfo.name][exon]
            # fraction
            print (corrected_distance / total_exon_length)
        except ValueError:
            continue
def cpg_islands(vf, af): print "inside cpg_islands" v = BedTool(vf) cpg = BedTool(af) overlap = v.intersect(cpg, wb=True) results = dict([ (r.name, 1) for r in overlap ]) print "exit cpg_islands" return Series(results, name="cpg_island")
def calculate_ovl(nbedfile, obedfile, opts, scoresfile):
    """Intersect the new and old bed files (wao) and write a 4-column score
    table to `scoresfile`, keeping only rows with a real match."""
    from pybedtools import BedTool
    new_bed = BedTool(nbedfile)
    old_bed = BedTool(obedfile)
    overlaps = new_bed.intersect(old_bed, wao=True, f=opts.f, r=opts.r, s=opts.s)
    # keep columns 4,5,10,13, drop "." (no-match) rows, and reorder to $1,$3,$2,$4
    cmd = """cut -f4,5,10,13 | awk -F $'\t' 'BEGIN { OFS = FS } ($3 != "."){ print $1,$3,$2,$4; }'"""
    sh(cmd, infile=overlaps.fn, outfile=scoresfile)
def parse_bed(cluster, gtruth, omit):
    """Convert cluster breakpoints into 1-bp BED intervals and intersect them
    with a truth set (and optionally a set of regions to omit).

    :param cluster: file whose 7th column encodes loci as
        "x~chr:pos[-pos2]" or "x~chr1:pos1~chr2:pos2" and whose 9th column is
        the record name
    :param gtruth: truth-set bed file
    :param omit: bed file of regions to ignore, or the string 'None'
    :return: (truth_dict, omit_dict) mapping record name -> 1 for names whose
        breakpoints hit the truth / omit regions
    """
    def _point(chrm, coord, name):
        # one-basepair BED interval for a single breakpoint coordinate
        return chrm + '\t' + coord + '\t' + str(int(coord) + 1) + '\t' + name

    bed = []
    with open(cluster) as f:
        for line in f:
            # hoist the repeated line.split() work done per field in the original
            fields = line.split('\t')
            loci = fields[6].split('~')
            name = fields[8]
            if len(loci) == 2:
                pos = loci[1].split(':')
                chrm = pos[0]
                ends = pos[1].split('-')
                if len(ends) == 1:
                    bed.append(_point(chrm, ends[0], name))
                elif len(ends) == 2:
                    bed.append(_point(chrm, ends[0], name))
                    bed.append(_point(chrm, ends[1], name))
            elif len(loci) == 3:
                # inter-chromosomal pair: one breakpoint per chromosome
                bed.append(_point(loci[1].split(':')[0], loci[1].split(':')[1], name))
                bed.append(_point(loci[2].split(':')[0], loci[2].split(':')[1], name))
    sv_bed = BedTool('\n'.join(bed), from_string=True).sort()
    truth_bed = BedTool(gtruth).sort()
    intersect1 = sv_bed.intersect(truth_bed, wa=True, wb=True)
    if str(omit) != 'None':
        omit_bed = BedTool(omit).sort()
        intersect2 = sv_bed.intersect(omit_bed, wa=True, wb=True)
    else:
        intersect2 = []
    truth_dict = {}
    omit_dict = {}
    # field 4 of each hit is the record name carried through the interval
    for hit in intersect1:
        truth_dict[list(hit)[3]] = 1
    for hit in intersect2:
        omit_dict[list(hit)[3]] = 1
    return truth_dict, omit_dict
def motifs(vf, af): print "inside motif" v = BedTool(vf) cpg = BedTool(af) overlap = v.intersect(cpg, wb=True) results = dict([ (r.name, 1) for r in overlap ]) print "exit motif" return Series(results, name="pwm")
def build_vcf_intervals(reads, vcf_recs, bam_handle):
    """
    Find if any of these reads match a known SUN/indel by simple bedtools
    intersections
    """
    vcf_bed = BedTool([ChromosomeInterval(rec.CHROM, rec.start, rec.end, None)
                       for rec in vcf_recs])
    # reads spanning fewer than 3 reference positions are skipped
    read_intervals = []
    for read in reads:
        if len(read.positions) > 2:
            read_intervals.append((bam_handle.getrname(read.tid),
                                   read.positions[0], read.positions[-1]))
    reads_bed = BedTool(read_intervals)
    return list(vcf_bed.intersect(reads_bed))
def repeats(vf, af):
    """Count, per variant in `vf`, the repeat elements from `af` it overlaps.

    Returns a Series named "repeat" mapping variant name -> repeat count.
    """
    variants = BedTool(vf)
    repeat_feats = BedTool(af)
    hits = variants.intersect(repeat_feats, wb=True)
    counts = {}
    if len(hits) > 0:
        # collapse the repeat-type column (8) per variant interval
        collapsed = hits.groupby(g=[1, 2, 3, 4], c=8, ops='collapse')
        for row in collapsed:
            counts[row.name] = len(row[4].split(','))
    return Series(counts, name='repeat')
def main():
    """Bin intronic reads by their distance to the closest UTR and print a
    histogram (bin upper bound <TAB> count): 100-bp bins up to 5 kb, then
    50-kb bins covering the tail."""
    p = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
    p.add_argument('reads', help='bed format file of reads')
    p.add_argument('intron', help='intron coordinates bed')
    p.add_argument('utr', help="UTR coordinates bed")
    p.add_argument('-v', '--verbose', action='store_true', help="maximum verbosity")
    args = p.parse_args()
    bed = BedTool(args.reads)
    intron = BedTool(args.intron)
    utr = BedTool(args.utr)
    annotated_bed = {}
    if args.verbose:
        print >>sys.stderr, ">> finding intersection"
    # same-strand reads that fall within introns
    intersection = bed.intersect(intron, s=True)
    for row in intersection:
        annotated_bed[row.name] = {'chrom':row.chrom, 'start':row.start, \
                                   'stop':row.stop}
    if args.verbose:
        print >>sys.stderr, ">> annotating"
    # d=True appends the distance to the closest UTR as the last field
    for row in intersection.closest(utr, s=True, d=True):
        annotated_bed[row.name]['utr'] = int(row.fields[-1:][0])
    step = 100
    rangemax = 5000
    bin_counts = {}
    max_utr = 0
    # fine-grained bins: (ubound-step, ubound] for ubound in 100..5000
    for ubound in range(step, rangemax + 1, step):
        bincount = 0
        for name, val in annotated_bed.iteritems():
            if val['utr'] <= ubound and val['utr'] > (ubound - step):
                bincount += 1
            # track the maximum distance to size the tail loop below
            if val['utr'] > max_utr:
                max_utr = val['utr']
        fields = (ubound, bincount)
        print "\t".join(map(str, fields))
    # coarser bins for the long tail
    # NOTE(review): the bin width here is 100000 although bounds advance by
    # 50000, so consecutive tail bins overlap — confirm that is intended.
    for ubound in range(50000, max_utr, 50000):
        bincount = 0
        for name, val in annotated_bed.iteritems():
            if val['utr'] <= ubound and val['utr'] > (ubound - 100000):
                bincount += 1
        fields = (ubound, bincount)
        print "\t".join(map(str, fields))
def intersect_bed(bed_name, bed_filter): """KEEPS regions of annotation of interest that overlap with repeat-masked regions """ pybedtools.set_tempdir('/sc/orga/scratch/richtf01') if not os.path.isfile(bed_name + '.Rmsk.bed'): bed = BedTool(bed_name + '.merged.sorted.bed') print "Keeping calls in rmsk from " + bed_name + "..." bed_overlap = bed.intersect(bed_filter) bed_overlap.saveas(bed_name + '.Rmsk.bed') print bed_name + " done!" else: print bed_name + " rmsk calls already isolated"
def rebin_step1():
    """Intersect the input signal with the binning intervals.

    Relies on module-level globals: `url` (whether input_signal is remote),
    `input_signal`, `bins`, `bigWig`, and the `run_bash` helper.

    :return: (AB, AB_inv) where AB holds signal/bin overlaps (wo=True) and
        AB_inv holds bins with no signal coverage (v=True)
    """
    def _intersect(signal_path):
        # shared tail of both branches (duplicated verbatim in the original)
        A = BedTool(signal_path)
        B = BedTool(bins)
        AB = A.intersect(B, wo=True)
        AB_inv = B.intersect(A, v=True)
        return AB, AB_inv

    if url == False:
        return _intersect(input_signal)
    elif url == True:
        # download the remote signal file into the working directory
        to_download = "'" + input_signal + "'"
        command = "wget " + to_download
        p = subprocess.Popen(shlex.split(command), stdout=subprocess.PIPE)
        stdout, stderr = p.communicate()
        ninput_signal = os.path.basename(input_signal)
        if bigWig == True:
            # convert bigWig -> bedGraph, then drop the original download
            run_bash("./kent_binaries/bigWigToBedGraph " + ninput_signal + " " +
                     ninput_signal.replace(".bigWig", ".bedGraph").replace(".bigwig", ".bedGraph"))
            run_bash("rm " + ninput_signal)
            ninput_signal = ninput_signal.replace(".bigWig", ".bedGraph").replace(".bigwig", ".bedGraph")
        result = _intersect(ninput_signal)
        # clean up the downloaded/converted file once intersected
        run_bash("rm " + ninput_signal)
        return result
def intersectBamWithBed(inbam, inbed): ''' Intersects reads with genomic features, Transposable elements, and returns separately reads that map sense and antisense to the features. Input: paths to bam and bed file Output: list of tuples with a name (str) and the reads for sense and antisense piRNAs (bedTool) ''' # convert bam to bed print 'Separating sense and antisense piRNAs ' + timeStamp() piRNA = BedTool(inbam).bam_to_bed() ## create bedtool for genomic features bed = BedTool(inbed) # outname = inbam.replace('.bam', '') # outsense = outname + "sense.bed" # outantisense = outname + "antisense.bed" antisense = piRNA.intersect(bed, S=True) sense = piRNA.intersect(bed, s=True) piRNAs = [ ('sense', sense), ('antisense', antisense)] return piRNAs
def bound_motifs(vf, af):
    """Count, per variant, how many cell-line features from `af` bind it.

    Returns a Series named "bound_motifs": variant name -> cell count.
    """
    variants = BedTool(vf)
    features = BedTool(af)
    hits = features.intersect(variants, wb=True)
    counts = {}
    if len(hits) > 0:
        # groupby needs its grouping columns sorted; sort the file in place
        call('sort -k6,6 -k7,7n -k8,8n -k9,9 %s -o %s' % (hits.fn, hits.fn),
             shell=True)
        grouped = hits.groupby(g=[6, 7, 8, 9], c=4, ops='collapse')
        for row in grouped:
            counts[row.name] = len(row[4].split(','))
    return Series(counts, name='bound_motifs')
def check_overlap(feature_string, query_string):
    """
    Check overlap between two bed strings.
    Arg1: feature_string -> target string.
    Arg2: query_string -> query string.
    Returns -> True (if has overlap), False (not overlap).
    """
    target = BedTool(feature_string, from_string=True)
    query = BedTool(query_string, from_string=True)
    # an empty intersection result is falsy
    return bool(target.intersect(query))
def bed_to_snps(bed, bim_df, mergebed=False):
    """Restrict `bim_df` to SNPs whose positions fall inside `bed`.

    :param bed: pybedtools BedTool mask; sorted and merged first when
        mergebed is True
    :param bim_df: DataFrame with at least CHR and BP columns
    :return: subset of bim_df rows whose BP intersects the mask
    """
    from pybedtools import BedTool
    if mergebed:
        print('merging intervals in mask')
        bed = bed.sort().merge()
    print('creating bedtool')
    # represent each SNP as a 1-bp interval on its chromosome
    snp_intervals = [['chr' + str(chrom), pos, pos + 1]
                     for (chrom, pos) in np.array(bim_df[['CHR', 'BP']])]
    snp_bed = BedTool(snp_intervals)
    print('performing bedtools intersection')
    overlapping = snp_bed.intersect(bed)
    print('creating df and merging with refpanel')
    hit_positions = pd.DataFrame({'BP': [iv.start for iv in overlapping]})
    return pd.merge(bim_df, hit_positions, how='inner', on='BP')
def get_peak_counter(bed, ref_bed):
    """
    bed is either the pos or neg strand file of peak coordinates
    ref_bed is the reference annotation for exon, intron, etc.
    returns defaultdict(Counter)
    """
    bed = BedTool(bed)
    ref_bed = BedTool(ref_bed)
    peaks = defaultdict(Counter)
    # f=0.5: at least half of the peak must overlap an annotation feature
    for peak in bed.intersect(ref_bed,f=0.5,wo=True):
        # field 7 of the wo output is the annotation's name column
        gene_name = peak[6]
        # NOTE(review): each Counter is keyed by its own gene name, so every
        # Counter ends up with exactly one key whose value is the hit count —
        # confirm this is intended rather than counting a feature type here.
        peaks[gene_name].update([gene_name])
    return peaks
def cpg_islands(vf, af):
    """Annotate variants in `vf` with the CpG island from `af` they overlap.

    Returns a Series named "cpg_island" mapping each overlapping variant name
    to a composite "<chrom>_<start>_<stop>_<name>" id of the island record.
    """
    v = BedTool(vf)
    cpg = BedTool(af)
    overlap = v.intersect(cpg, wb=True)
    # FIX: tempfile.mktemp() is deprecated and racy (the name can be taken
    # before the file is created); NamedTemporaryFile(delete=False) actually
    # creates the file.
    tmp = tempfile.NamedTemporaryFile(delete=False)
    tmp.close()
    tempfile1 = tmp.name
    # keep the variant columns and collapse the island fields into one id
    sort_cmd2 = 'awk -F \'\t\' \'{print $1"\t"$2"\t"$3"\t"$4"\t"$5"_"$6"_"$7"_"$8}\' %s > %s' % (overlap.fn, tempfile1)
    call(sort_cmd2, shell=True)
    intersection = BedTool(tempfile1)
    results = {}
    for r in intersection:
        results[r.name] = r[4]
    return Series(results, name="cpg_island")
def consolidate(nbedfile, obedfile, cbedfile):
    """Merge the new and old bed files into a consolidated bed `cbedfile`:
    reciprocally-overlapping (same-strand) entries are merged with
    name-collapse and mean score, non-overlapping new entries are carried
    over unchanged, and the result is coordinate-sorted in place."""
    from pybedtools import BedTool
    nbedtool = BedTool(nbedfile)
    obedtool = BedTool(obedfile)
    # entries of each file that overlap the other (same strand, u=unique)
    ab = nbedtool.intersect(obedtool, s=True, u=True)
    ba = obedtool.intersect(nbedtool, s=True, u=True)
    cmd = "cat {0} {1} | sort -k1,1 -k2,2n".format(ab.fn, ba.fn)
    fp = popen(cmd)
    ovl = BedTool(fp.readlines())
    # merge overlapping entries, collapsing names (nms) and averaging scores
    abmerge = ovl.merge(s=True, nms=True, scores="mean").sort()
    cmd = "cat {0}".format(abmerge.fn)
    fp = popen(cmd, debug=False)
    ovl = BedTool(fp.readlines())
    # new entries that did not overlap anything are kept as-is (v=True)
    notovl = nbedtool.intersect(ovl.sort(), s=True, v=True)
    infile = "{0} {1}".format(notovl.fn, ovl.fn)
    tmpfile = "/tmp/reformat.{0}.bed".format(os.getpid())
    cmd = "sort -k1,1 -k2,2n"
    sh(cmd, infile=infile, outfile=tmpfile)
    fp = open(cbedfile, "w")
    bed = Bed(tmpfile)
    for b in bed:
        if ";" in b.accn:
            # de-duplicate the collapsed accession names
            accns = set()
            for accn in b.accn.split(";"):
                accns.add(accn)
            b.accn = ";".join(accns)
        print >> fp, b
    fp.close()
    os.remove(tmpfile)
    # final in-place coordinate sort of the consolidated file
    sort([cbedfile, "-i"])
def dnase_fps(vf, af):
    """Count, per variant in `vf`, the DNase footprint cells from `af` that
    cover it. Returns a Series named "dnase_fps": variant name -> cell count."""
    print "inside dnase_fps"
    v = BedTool(vf)
    feats = BedTool(af)
    results = {}
    intersection = feats.intersect(v, wb=True)
    if len(intersection) > 0:
        # groupby requires sorted grouping columns; sort the file in place
        sort_cmd = 'sort -k6,6 -k7,7n -k8,8n -k9,9 %s -o %s' % (intersection.fn, intersection.fn)
        call(sort_cmd, shell=True)
        # NOTE(review): this call passes o='collapse' while sibling functions
        # pass ops='collapse' — confirm the installed pybedtools accepts both.
        annots = intersection.groupby(g=[6,7,8,9], c=4, o='collapse')
        for entry in annots:
            cells = entry[4].split(',')
            results[entry.name] = len(cells)
    print "exiting dnase_fps"
    return Series(results, name='dnase_fps')
def motifs(vf, af):
    """Annotate variants in `vf` with the motif (PWM) records from `af` they
    overlap, collapsed per variant into one comma-separated field.

    Returns a Series named "pwm": variant name -> collapsed motif ids.
    """
    v = BedTool(vf)
    cpg = BedTool(af)
    overlap = v.intersect(cpg, wb=True)
    # groupby requires sorted grouping columns; sort the overlap file in place
    sort_cmd1 = 'sort -k1,1 -k2,2n -k3,3n -k4,4 %s -o %s' % (overlap.fn, overlap.fn)
    call(sort_cmd1, shell=True)
    # FIX: tempfile.mktemp() is deprecated and racy; NamedTemporaryFile(
    # delete=False) actually creates the file, avoiding the name race.
    tmp = tempfile.NamedTemporaryFile(delete=False)
    tmp.close()
    tempfile1 = tmp.name
    # fold all motif fields into one "__"-joined id column
    sort_cmd2 = 'awk -F \'\t\' \'{print $1"\t"$2"\t"$3"\t"$4"\t"$5"__"$6"__"$7"__"$8"__"$9"__"$10"__"$11"__"$12"__"$13}\' %s > %s' % (overlap.fn, tempfile1)
    call(sort_cmd2, shell=True)
    intersection = BedTool(tempfile1)
    annots = intersection.groupby(g=[1,2,3,4], c=5, ops='collapse')
    results = {}
    for entry in annots:
        results[entry.name] = entry[4]
    return Series(results, name="pwm")
def repeats(vf, af):
    """Annotate variants in `vf` with the repeat records from `af` they
    overlap, collapsed per variant into one comma-separated field.

    Returns a Series named "repeat": variant name -> collapsed repeat ids.
    """
    v = BedTool(vf)
    feats = BedTool(af)
    intersection = v.intersect(feats, wb=True)
    results = {}
    if len(intersection) > 0:
        # FIX: tempfile.mktemp() is deprecated and racy; NamedTemporaryFile(
        # delete=False) actually creates the file, avoiding the name race.
        tmp = tempfile.NamedTemporaryFile(delete=False)
        tmp.close()
        tempfile1 = tmp.name
        # fold the repeat fields into one "_"-joined id column
        sort_cmd2 = 'awk -F \'\t\' \'{print $1"\t"$2"\t"$3"\t"$4"\t"$5"_"$6"_"$7"_"$8"_"$9"_"$10}\' %s > %s' % (intersection.fn, tempfile1)
        call(sort_cmd2, shell=True)
        intersection = BedTool(tempfile1)
        annots = intersection.groupby(g=[1,2,3,4], c=5, ops='collapse')
        for entry in annots:
            results[entry.name] = entry[4]
    return Series(results, name='repeat')
def make_annot_files(args, bed_for_annot):
    """Write a single-column ANNOT file marking which SNPs in args.bimfile
    fall inside the annotation intervals `bed_for_annot` (1 = inside, 0 = not).
    """
    print('making annot file')
    df_bim = pd.read_csv(args.bimfile, delim_whitespace=True,
                         usecols=[0, 1, 2, 3], names=['CHR', 'SNP', 'CM', 'BP'])
    # one zero-length interval per SNP position
    iter_bim = [['chr' + str(x1), x2, x2] for (x1, x2) in np.array(df_bim[['CHR', 'BP']])]
    bimbed = BedTool(iter_bim)
    annotbed = bimbed.intersect(bed_for_annot)
    bp = [x.start for x in annotbed]
    df_int = pd.DataFrame({'BP': bp, 'ANNOT': 1})
    # left-merge back onto the full SNP list; non-hits become NaN -> 0
    df_annot = pd.merge(df_bim, df_int, how='left', on='BP')
    df_annot.fillna(0, inplace=True)
    df_annot = df_annot[['ANNOT']].astype(int)
    # BUG FIX: the original wrote text via a gzip handle opened 'wb', which
    # raises TypeError on Python 3; let pandas handle the gzip compression.
    if args.annot_file.endswith('.gz'):
        df_annot.to_csv(args.annot_file, sep="\t", index=False, compression='gzip')
    else:
        df_annot.to_csv(args.annot_file, sep="\t", index=False)
def snp_stats(vf, af, stat='avg_het', flank=500):
    """Average a per-SNP statistic from `af` over a +/- `flank` bp window
    around each variant in `vf`.

    :param stat: name given to the returned Series
    :param flank: half-width of the window added on each side of the variant
    :return: Series mapping variant name -> sum of overlapping SNP values
        divided by the window width (2 * flank)
    """
    v = BedTool(vf)
    feats = BedTool(af)
    # widen each variant by `flank` bp on both sides
    flanks = v.slop(g=pybedtools.chromsizes('hg19'), b=flank)
    intersection = feats.intersect(flanks, wb=True)
    results = {}
    if len(intersection) > 0:
        # groupby requires sorted grouping columns; sort the file in place
        sort_cmd = 'sort -k6,6 -k7,7n -k8,8n -k9,9 %s -o %s' % (intersection.fn, intersection.fn)
        call(sort_cmd, shell=True)
        annots = intersection.groupby(g=[6,7,8,9], c=5, ops='collapse')
        for entry in annots:
            rates = entry[4].split(',')
            # FIX: reduce() is not a builtin on Python 3 (NameError without a
            # functools import); sum() computes the same total.
            rate = sum(float(y) for y in rates) / (flank * 2)
            results[entry.name] = rate
    return Series(results, name=stat)