def findInsertions(bwFile, bedData, interval, x):
    """Fetch per-base bigWig signal around the x-th BED interval.

    interval selects the anchor: 'start' spans around the interval start,
    'end' around the interval end, anything else spans the whole interval.
    Flank sizes come from the module-level ``options.l`` / ``options.r``.

    Returns the raw numpy signal array; may be None when the chromosome is
    absent from the bigWig, or an all-NaN array on OverflowError.
    """
    if interval == 'start':
        sL = int(bedData[x][1]) - options.l
        sR = int(bedData[x][1]) + options.r
    elif interval == 'end':
        sL = int(bedData[x][2]) - options.l
        sR = int(bedData[x][2]) + options.r
    else:
        sL = int(bedData[x][1]) - options.l
        sR = int(bedData[x][2]) + options.r
    # get signal data
    f = open(bwFile, "rb")
    bigwig_class = BigWigFile(f)
    try:
        signal = bigwig_class.get_as_array(bedData[x][0], sL, sR)
    except OverflowError:
        # region coordinates overflow the bigWig index; emit NaNs of the
        # requested width so downstream shapes still line up
        signal = np.array([np.nan] * (sR - sL))
    f.close()
    # BUGFIX(dead code removed): the original computed np.nanmean(signal)
    # into `out` and then unconditionally overwrote it with the raw array
    # (`out = signal`), so the mean logic never had any effect.  Behavior
    # (returning the raw array) is preserved; confirm callers expect an
    # array rather than a scalar mean.
    return signal
def get_mean_phastcons(bedtool, phastcons_location, sample_size=1000):
    """Mean phastcons score per interval for a random subset of a bedtool.

    bedtool - bedtool to extract data from
    phastcons_location - location of phastcons file

    Intervals whose chromosome is missing from the bigWig are skipped.
    """
    scores = []
    with open(phastcons_location) as handle:
        bigwig = BigWigFile(handle)
        n_picked = min(len(bedtool), sample_size)
        for feature in bigwig_subset(bedtool, n_picked) if False else bedtool.random_subset(n_picked):
            values = bigwig.get_as_array(feature.chrom, feature.start,
                                         feature.stop)
            try:
                score = np.mean(values) if len(values) > 0 else 0
            except TypeError:
                # get_as_array returned None (chrom absent) -> skip interval
                continue
            scores.append(score)
    return scores
def getsignal(inputfile, outputfile, pcut, pspan):
    """Compute total cut count (TC) and a footprint occupancy score (FOS)
    for each fixed-width BED interval and write them as extra columns.

    The interval width ``ml`` is taken from the first BED line and assumed
    constant for the whole file.  Python 2 integer division (ml / 2) is
    relied on for the slice arithmetic below.
    """
    # p=BwIO(pcut)
    # chrom_len = {}
    # for i in p.chromosomeTree['nodes']:
    #     chrom_len[i['key']] = i['chromSize']
    pcutbw = BigWigFile(open(pcut, 'rb'))
    inf = open(inputfile)
    # measure the (assumed uniform) interval length from the first record
    testll = inf.readline().split()
    ml = int(testll[2]) - int(testll[1])
    inf.seek(0)
    outf = open(outputfile, 'w')
    for line in inf:
        ll = line.split()
        # if not chrom_len.has_key(ll[0]):
        #     continue
        # per-base cut counts in a 2*pspan window centred on the interval midpoint
        cut = list(
            pcutbw.summarize(ll[0],
                             int(ll[1]) + ml / 2 - pspan,
                             int(ll[1]) + ml / 2 + pspan,
                             2 * pspan).sum_data)
        TC = sum(cut)
        # centre window (C) flanked by equally sized left (L) / right (R) windows
        C = sum(cut[(pspan - ml / 2):(pspan - ml / 2 + ml)])
        L = sum(cut[(pspan - ml / 2 - ml):(pspan - ml / 2)])
        R = sum(cut[(pspan - ml / 2 + ml):(pspan - ml / 2 + 2 * ml)])
        # +1 pseudocounts avoid division by zero; more negative = deeper footprint
        FOS = -1 * ((C + 1) / (R + 1) + (C + 1) / (L + 1))
        newll = ll + [TC, FOS]
        outf.write("\t".join(map(str, newll)) + "\n")
    # NOTE(review): `inf` is never closed here -- harmless for a script,
    # but worth confirming if this is called repeatedly.
    outf.close()
def Readbw(bwfile, chrm, start, end, n):
    """Return per-bin mean signal for chrm:start-end split into bins of
    width ``n`` (Python 2 integer division determines the bin count).

    ``sudocount`` (defined elsewhere in this file) is applied to each bin's
    valid_count, presumably to avoid division by zero -- confirm.
    """
    bwHandle = BigWigFile(open(bwfile, 'rb'))
    summary = bwHandle.summarize(chrm, int(start), int(end),
                                 (int(end) - int(start)) / n)
    # BUGFIX(style): the original bound the bin totals to a local named
    # `sum`, shadowing the builtin; renamed to `totals`.
    counts = map(sudocount, summary.valid_count)
    totals = summary.sum_data
    scores = list(totals / counts)
    return scores
def getChromatinDataSeries(bigwigFile, libraryTable, sgInfoTable, tssTable,
                           colname='', naValue=0):
    """Mean bigWig chromatin signal across each sgRNA's footprint.

    Returns a pandas Series indexed like ``libraryTable``; sgRNAs whose
    (gene, transcripts) tuple is not in ``tssTable`` (negative controls) or
    whose region yields no data get ``naValue``.
    """
    bwindex = BigWigFile(open(bigwigFile))
    chromDict = tssTable['chromosome'].to_dict()
    chromatinScores = []
    for name, sgInfo in sgInfoTable.iterrows():
        geneTup = (sgInfo['gene_name'], ','.join(sgInfo['transcript_list']))
        if geneTup not in chromDict:  # negative controls
            chromatinScores.append(np.nan)
            continue
        # footprint extends from the PAM coordinate in the strand direction
        if sgInfo['strand'] == '+':
            sgRange = sgInfo['pam coordinate'] + sgInfo['length']
        else:
            sgRange = sgInfo['pam coordinate'] - sgInfo['length']
        chrom = chromDict[geneTup]
        chromatinArray = bwindex.get_as_array(
            chrom,
            min(sgInfo['pam coordinate'], sgRange),
            max(sgInfo['pam coordinate'], sgRange))
        if chromatinArray is not None and len(chromatinArray) > 0:
            chromatinScores.append(np.nanmean(chromatinArray))
        else:  # often chrY when using K562 data..
            # print name
            # print chrom, min(sgInfo['pam coordinate'], sgRange), max(sgInfo['pam coordinate'], sgRange)
            chromatinScores.append(np.nan)
    chromatinSeries = pd.Series(chromatinScores, index=libraryTable.index,
                                name=colname)
    return chromatinSeries.fillna(naValue)
def createMappabilityList(fragmentsMap, bwfile, fragmentCount, options):
    """Mean mappability per fragment from a bigWig track.

    fragmentsMap maps fragmentId -> (chrom, start, end).  Returns a float
    array of length fragmentCount; fragments with no data or query errors
    get 0.  (Python 2: uses `print >>` statements.)
    """
    # keep record which fragment has decent mappability
    mappable = np.zeros((fragmentCount, ), dtype=np.float)
    # lazy load
    from bx.intervals.io import GenomicIntervalReader
    from bx.bbi.bigwig_file import BigWigFile
    bw = BigWigFile(open(bwfile))
    for fragmentId in fragmentsMap.keys():
        (chrom, start, end) = fragmentsMap[fragmentId]
        if (options.vverbose):
            print >> sys.stdout, "- process %s %d-%d " % (chrom, start, end)
        try:
            # single-bin summary over the whole fragment
            mappable[fragmentId] = bw.query(chrom, start, end, 1)[0]["mean"]
            if (np.isnan(mappable[fragmentId])):
                mappable[fragmentId] = 0
        # NOTE(review): bare except deliberately treats any query failure
        # (missing chrom, invalid coords) as zero mappability
        except:
            mappable[fragmentId] = 0.
            # problem with invalid values
            if (options.vverbose):
                print >> sys.stderr, "Problem with bw file at %s %d-%d" % (
                    chrom, start, end)
                print traceback.format_exc()
    return mappable
def main():
    """Annotate each BED interval with the mean bigWig value over the
    interval, writing the input line plus one score column.

    argv: [0] input BED, [1] bigWig, [2] output BED.  (Python 2 script.)
    """
    p = optparse.OptionParser(__doc__)
    p.add_option('-A', '--absolute', action='store_true', dest='A',
                 default=False, help='absolute threshold')
    p.add_option('-s', '--standard_background', action='store_true',
                 dest='stdbg')
    p.add_option('-D', '--debug', action='store_true', dest='debug')
    options, args = p.parse_args()
    debug_c = 0
    BEDFILE = open(args[0], 'rU')
    BW = BigWigFile(file=open(args[1]))
    BEDout = open(args[2], 'w')
    for line in BEDFILE:
        # NOTE(review): echoes every input line to stdout -- looks like a
        # debugging leftover; confirm before relying on stdout being clean
        print(line)
        line = line.strip().split('\t')
        # single-bin query -> mean over the interval, rounded to 5 places
        x = BW.query(line[0], int(line[1]), int(line[2]), 1)
        line.append(str(round(x[0]['mean'], 5)))
        BEDout.write("\t".join(line) + "\n")
        """
        for i in x:
            print i['mean']
        """
        if options.debug:
            debug_c += 1
            if debug_c >= 10:
                break


if __name__ == '__main__':
    main()
def summary(bwfile,bedfile,topnumber,out): total_result = [] p=BwIO(bwfile) chrom_len = {} for i in p.chromosomeTree['nodes']: chrom_len[i['key']] = i['chromSize'] bwHandle=BigWigFile(open(bwfile, 'rb')) inf = open(bedfile) t = time.time() for line in inf: ll = line.split() ll[3]="-" if chrom_len.has_key(ll[0]): summary = bwHandle.summarize(ll[0],int(ll[1]),int(ll[2]),1) if summary.valid_count == 0: mean_value = 0 else: mean_value = (summary.sum_data/summary.valid_count)[0] total_result.append(ll+[mean_value]) inf.close() total_result.sort(reverse=True,key=lambda x:x[-1]) outf = open(out,'w') print "scaning 1st ",time.time()-t t=time.time() for i in range(topnumber): ll = total_result[i] summary = bwHandle.summarize(ll[0],int(ll[1]),int(ll[2]),(int(ll[2])-int(ll[1]))) additional_value = ",".join(map(str,list(summary.sum_data))) result = map(str,(ll+[additional_value])) outf.write("\t".join(result)+"\n") outf.close() print "scaning 2nd ",time.time()-t
def main():
    """Combine two bigWig files chunk-by-chunk with a chosen operation and
    write the non-zero results as a variableStep wiggle file.

    (Python 2 script; relies on module-level helpers ``load_chromsize``,
    ``all_nan``, ``replace_nan``, ``twoList`` and ``BED.tillingBed``.)
    """
    usage = "%prog [options]"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option("-i", "--bwfile1", action="store", type="string", dest="BigWig_File1", help="BigWig files")
    parser.add_option("-j", "--bwfile2", action="store", type="string", dest="BigWig_File2", help="BigWig files")
    parser.add_option("-a", "--action", action="store", type="string", dest="action", help='After pairwise align two bigwig files, perform the follow actions (Only select one keyword):"Add" = add signals. "Average" = average signals. "Division"= divide bigwig2 from bigwig1. Add 1 to both bigwig. "Max" = pick the signal that is larger. "Min" = pick the signal that is smaller. "Product" = multiply signals. "Subtract" = subtract signals in 2nd bigwig file from the corresponiding ones in the 1st bigwig file. "geometricMean" = take the geometric mean of signals.')
    parser.add_option("-o", "--output", action="store", type="string", dest="output_wig", help="Output wig file")
    parser.add_option("-s", "--chromSize", action="store", type="string", dest="chromSize", help="Chromosome size file. Tab or space separated text file with 2 columns: first column is chromosome name, second column is size of the chromosome.")
    parser.add_option("-c", "--chunk", action="store", type="int", dest="chunk_size", default=100000, help="Chromosome chunk size. Each chomosome will be cut into samll chunks of this size. Decrease chunk size will save more RAM. default=%default (bp)")
    (options, args) = parser.parse_args()
    if not (options.BigWig_File1 and options.BigWig_File2
            and options.output_wig and options.chromSize):
        parser.print_help()
        sys.exit(0)
    OUT = open(options.output_wig, 'w')
    bw1 = BigWigFile(file=open(options.BigWig_File1))
    bw2 = BigWigFile(file=open(options.BigWig_File2))
    chrom_sizes = load_chromsize(options.chromSize)
    for chr_name, chr_size in chrom_sizes.items():  # iterate each chrom
        print >>sys.stderr, "Processing " + chr_name + " ..."
        OUT.write('variableStep chrom=' + chr_name + '\n')
        for interval in BED.tillingBed(chrName=chr_name, chrSize=chr_size,
                                       stepSize=options.chunk_size):
            coord = interval[1]
            bw_signal1 = bw1.get_as_array(chr_name, interval[1], interval[2])
            bw_signal2 = bw2.get_as_array(chr_name, interval[1], interval[2])
            # skip chunks with no data in either file
            if all_nan(bw_signal1) and all_nan(bw_signal2):
                continue
            bw_signal1 = replace_nan(bw_signal1)
            bw_signal2 = replace_nan(bw_signal2)
            # dispatch to the combining function named by --action
            call_back = getattr(twoList, options.action)
            for v in call_back(bw_signal1, bw_signal2):
                coord += 1
                # wiggle is 1-based; zero values are omitted to keep files small
                if v != 0:
                    print >>OUT, "%d\t%.2f" % (coord, v)
def get_mean_phastcons(bedtool, phastcons_location):
    """Get means phastcons scores for all intervals in a bed tool.

    bedtool - bedtool to extract data from
    phastcons_location - location of phastcons file

    Returns a numpy array with one mean score per interval; intervals with
    no data (missing chromosome or empty region) get 0.
    """
    f = open(phastcons_location, 'r')
    bw = BigWigFile(file=f)
    data = np.ndarray(len(bedtool))
    for i, bedline in enumerate(bedtool):
        conservation_values = bw.get_as_array(bedline.chrom, bedline.start,
                                              bedline.stop)
        # BUGFIX: get_as_array returns None for chromosomes absent from the
        # bigWig; the original called len(None) and raised TypeError (the
        # sampling variant of this function already guards against this).
        if conservation_values is not None and len(conservation_values) > 0:
            mean_phastcons = np.mean(conservation_values)
        else:
            mean_phastcons = 0
        data[i] = mean_phastcons
    # BUGFIX: close the bigWig handle (was leaked)
    f.close()
    return data
def createMappabilityList(fragmentsMap, bwfile, fragmentCount, options):
    """Mean mappability per restriction fragment from a bigWig track.

    fragmentsMap maps fragmentId -> (chrom, start, end).  Returns a float
    array of length fragmentCount; fragments with no data or query errors
    get 0.  (Python 2: uses `print >>` statements.)
    """
    # keep record which fragment has decent mappability
    mappable = np.zeros((fragmentCount,), dtype=np.float)
    # lazy load
    from bx.intervals.io import GenomicIntervalReader
    from bx.bbi.bigwig_file import BigWigFile
    bw = BigWigFile( open( bwfile ) )
    for fragmentId in fragmentsMap.keys():
        (chrom, start, end) = fragmentsMap[fragmentId]
        if (options.vverbose):
            print >> sys.stdout, "- process %s %d-%d " % (chrom, start, end)
        try:
            # one-bin summary over the whole fragment
            mappable[fragmentId] = bw.query(chrom, start, end, 1)[0]["mean"]
            if (np.isnan(mappable[fragmentId])):
                mappable[fragmentId] = 0
        # NOTE(review): bare except deliberately maps any query failure to 0
        except:
            mappable[fragmentId] = 0.
            # problem with invalid values
            if (options.vverbose):
                print >> sys.stderr, "Problem with bw file at %s %d-%d" % (chrom, start, end)
                print traceback.format_exc()
    return mappable
def get_mean_phastcons(bedtool, phastcons_location, sample_size = 1000):
    """
    Get means phastcons scores for all intervals in a bed tool

    bedtool - bedtool to extract data from
    phastcons_location - location of phastcons file

    Only a random subset of at most ``sample_size`` intervals is scored;
    intervals whose chromosome is missing from the bigWig are skipped
    (get_as_array returns None -> len() raises TypeError, caught below).
    """
    with open(phastcons_location) as bw_file:
        bw = BigWigFile(bw_file)
        data = []
        for bedline in bedtool.random_subset(min(len(bedtool), sample_size)):
            conservation_values = bw.get_as_array(bedline.chrom,
                                                  bedline.start, bedline.stop)
            try:
                if len(conservation_values) > 0:
                    mean_phastcons = np.mean(conservation_values)
                else:
                    mean_phastcons = 0
                data.append(mean_phastcons)
            except TypeError:
                # conservation_values is None: chromosome absent from bigWig
                pass
    return data
def count_cut_nmers(fp, w_minus, lflank, rflank, single_nmer_cutoff, sequence, offset): """ count the number of cuts associated with each nmer in sequence covered by X. offset is the position of the cut to be associated with each nmer. if offset = 0 the first base of the tag is lined up with the nmer start """ # w_plus_H=BigWigFile(open(w_plus, 'rb')) w_minus_H = BigWigFile(open(w_minus, 'rb')) genome = twobitreader.TwoBitFile(sequence) # keep count of the number of occurrences of each n-mer seq_nmer_dict = {} cut_nmer_dict = {} for line in fp.readlines(): ll = line.split() chrm = ll[0] start = int(ll[1]) end = int(ll[2]) # pseq = genome[chrm][(start-lflank+offset):(end+rflank+offset)].upper() nseq = genome[chrm][(start - lflank - offset):(end + rflank - offset)].upper() # cp = list(w_plus_H.summarize(ll[0],start,end,end-start).sum_data) cn = list(w_minus_H.summarize(ll[0], start, end, end - start).sum_data) #each = (len(ll)-5)/2 #cp = (map(float,ll[5:(5+each)])) #cn = (map(float,ll[(5+each):(5+each*2)])) for k in range(len(cn)): # p_cut = cp[k] n_cut = cn[k] # p_seq = pseq[k:(k+lflank+rflank)] n_seq = nseq[(k + 1):(k + lflank + rflank + 1)] # rev_n_seq = rev(n_seq) # if 'N' not in p_seq and p_cut <= single_nmer_cutoff : # try: # cut_nmer_dict[ p_seq ] += p_cut # except: # cut_nmer_dict[ p_seq ] = p_cut # try: # seq_nmer_dict[ p_seq ] += 1 # except: # seq_nmer_dict[ p_seq ] = 1 if 'N' not in n_seq and n_cut <= single_nmer_cutoff: rev_n_seq = rev(n_seq) try: cut_nmer_dict[rev_n_seq] += n_cut except: cut_nmer_dict[rev_n_seq] = n_cut try: seq_nmer_dict[rev_n_seq] += 1 except: seq_nmer_dict[rev_n_seq] = 1 return seq_nmer_dict, cut_nmer_dict
def refine_with_summit(_soft, _mark, _tissue):
    """Re-center each top peak on its bigWig summit and write a 2kb window
    around the summit, then merge the windows and launch enrichment.

    (Python 2 script helper; paths are hard-coded to /Data/adam/dnase.)
    """
    _temp_peak = [i.rstrip().split('\t') for i in
                  open("/Data/adam/dnase/top_bed/{0}.{1}.{2}.bed"
                       .format(_soft, _mark, _tissue))]
    _temp_bw = open("/Data/adam/dnase/bigwig/{0}.{1}.rep0.bw".format(_mark, _tissue))
    _temp_enrich = open("/Data/adam/dnase/enrich_bed/{0}.{1}.{2}.bed"
                        .format(_soft, _mark, _tissue), 'w')
    _bw = BigWigFile(file=_temp_bw)
    for line in _temp_peak:
        vals = _bw.get(line[0], int(line[1]), int(line[2]))
        vals = tuple(vals)
        if len(vals) > 0:
            # find the highest-valued bigWig block -> summit
            maxs = 0
            for _key in vals:
                if float(_key[2]) > maxs:
                    maxs = float(_key[2])
                    summit = _key[:2]
            summit_p = int((float(summit[0]) + float(summit[1])) / 2)
            # 2kb window centred on the summit, clipped at chromosome start
            if summit_p - 1000 > 0:
                print >> _temp_enrich, "{0}\t{1}\t{2}".format(
                    line[0], str(summit_p - 1000), str(summit_p + 999))
            else:
                print >> _temp_enrich, "{0}\t{1}\t{2}".format(line[0], 1, 2000)
    _temp_enrich.close()
    sh('sort -k 1,1 -k 2g,2g /Data/adam/dnase/enrich_bed/{0}.{1}.{2}.bed| bedtools merge -i stdin >/Data/adam/dnase/enrich_merge_bed/{0}.{1}.{2}.bed'.format(_soft, _mark, _tissue))
    sh('bash ../get_enrich.sh /Data/adam/dnase/enrich_merge_bed/{0}.{1}.{2}.bed {1} {2} {3}'.format(_soft, _mark, _tissue, _soft))
def get_signal(inputfile, output, bwfiles, bwfolder, extend):
    """For each BED interval, append 60 binned signal values per bigWig:
    20 bins upstream (extend bp), 20 over the interval, 20 downstream.

    Minus-strand intervals (column 6 == "-") get the 60 values reversed.
    Any failure for an interval yields 60 zeros.  (Python 2.)
    """
    signalbw = bwfiles.strip().strip(',').split(',')
    if not bwfolder:
        bwfolder = "./"
    if not bwfolder.endswith('/'):
        bwfolder += '/'
    bwHs = []
    for sb in signalbw:
        # absolute paths are used as-is; everything else is relative to bwfolder
        if sb.startswith('/'):
            bwHs.append(BigWigFile(open(sb, 'rb')))
        else:
            bwHs.append(BigWigFile(open(bwfolder + sb, 'rb')))
    inf = open(inputfile)
    outf = open(output, 'w')
    for line in inf:
        ll = line.split()
        # skip alt/random contigs like chr1_gl000191_random
        if "_" in ll[0]:
            continue
        S = int(ll[1])
        E = int(ll[2])
        for bwHandle in bwHs:
            try:
                signal1 = (bwHandle.summarize(ll[0], max(0, S - extend), S, 20))
                signal2 = (bwHandle.summarize(ll[0], S, E, 20))
                signal3 = (bwHandle.summarize(ll[0], E, E + extend, 20))
                binlen1 = extend * 1.0 / 20
                binlen2 = (E - S) * 1.0 / 20
                binlen3 = extend * 1.0 / 20
                # BUGFIX: the original tested `type(x.sum_data) == None`,
                # which is always False (a type object never equals None);
                # test the values themselves.
                if signal1 is None or signal2 is None or signal3 is None \
                        or signal1.sum_data is None or signal2.sum_data is None \
                        or signal3.sum_data is None:
                    addsig = [0] * 60
                else:
                    addsig1 = signal1.sum_data / binlen1
                    addsig2 = signal2.sum_data / binlen2
                    addsig3 = signal3.sum_data / binlen3
                    addsig = list(addsig1) + list(addsig2) + list(addsig3)
            except:
                addsig = [0] * 60
        if len(ll) >= 6 and ll[5] == "-":
            ll.extend(addsig[::-1])
        else:
            ll.extend(addsig)
        outf.write("\t".join(map(str, ll)) + "\n")
    inf.close()
    outf.close()
def getsignal(inputfile, outputfile, BGmatrix, pcut, ncut, Ipcut, Incut,
              pspan, tspan, gen, left, right, fetch_length=100):
    """Per-interval cut profiles (treatment plus/minus, input plus/minus),
    sequence-bias background values, and TC/FOS scores, one row per BED line.

    Strand-aware: minus-strand intervals swap plus/minus profiles and
    reverse the background lists.  Intervals with missing cut data ('NA')
    or ambiguous sequence ('N') are skipped.  (Python 2 integer division.)
    """
    # p=BwIO(pcut)
    # chrom_len = {}
    # for i in p.chromosomeTree['nodes']:
    #     chrom_len[i['key']] = i['chromSize']
    genome = twobitreader.TwoBitFile(gen)
    pcutbw = BigWigFile(open(pcut, 'rb'))
    ncutbw = BigWigFile(open(ncut, 'rb'))
    Ipcutbw = BigWigFile(open(Ipcut, 'rb'))
    Incutbw = BigWigFile(open(Incut, 'rb'))
    inf = open(inputfile)
    # interval width measured from the first record, assumed uniform
    testll = inf.readline().split()
    ml = int(testll[2]) - int(testll[1])
    pspan = pspan - ml / 2
    inf.seek(0)
    pBG, nBG = readBG(BGmatrix)
    outf = open(outputfile, 'w')
    for line in inf:
        ll = line.split()
        chrom = ll[0]
        start = int(ll[1])
        end = int(ll[2])
        strand = ll[5]
        seq = genome[chrom][(start - pspan - left):(end + pspan + right)]
        pout = make_cut(pcutbw, ll, pspan, fetch_length)
        nout = make_cut(ncutbw, ll, pspan, fetch_length)
        Ipout = make_cut(Ipcutbw, ll, pspan, fetch_length)
        Inout = make_cut(Incutbw, ll, pspan, fetch_length)
        # minus strand: plus/minus cut profiles swap roles
        if strand == "-":
            pout, nout = nout, pout
            Ipout, Inout = Inout, Ipout
        if pout == 'NA':
            continue
        if 'N' in seq.upper():
            continue
        #print 1
        # background lookup: plus strand uses seq[:-1], minus uses seq[1:]
        pseq = seq[:-1]
        nseq = seq[1:]
        p = []
        n = []
        for k in range(len(pseq) + 1 - left - right):
            p.append(pBG[pseq[k:k + left + right].upper()])
            n.append(nBG[nseq[k:k + left + right].upper()])
        if strand != '-':
            pbglist = p
            nbglist = n
        else:
            # reverse-complement orientation for minus-strand intervals
            pbglist = n[::-1]
            nbglist = p[::-1]
        TC, FOS = makeTCFOS(pcutbw, ncutbw, ll, tspan, ml)
        newll = ll + [TC, FOS] + pout + nout + Ipout + Inout + pbglist + nbglist
        outf.write("\t".join(map(str, newll)) + "\n")
    outf.close()
    inf.close()
def Main():
    """Downsample a bigWig into fixed-size windows (optionally smoothed),
    writing a BED of window means, a variableStep wiggle, and finally a
    new bigWig via the external `wigToBigWig` tool.  (Python 2 script.)
    """
    global args
    args = ParseArg()
    bw = BigWigFile(open(args.bigwig))
    CheckFolderExist(args.output)
    fout = WriteToFile(args.output + '/' + args.name + '.bed')
    wout = WriteToFile(args.output + '/' + args.name + '.wig')
    genome = LoadGenome(args.genome)
    if args.smooth:
        logging("Options: turn on smooth mode")
    for chrom in SortGenome(genome):
        chrom_size = genome[chrom]
        logging("Process: %s\t%d" % (chrom, chrom_size))
        array = bw.get_as_array(chrom, 0, chrom_size)
        # treat missing data as zero signal
        invalid = np.isnan(array)
        array[invalid] = 0
        # aggregate into windows of args.window bp (last window may be short)
        agg_array = []
        start = 0
        stop = args.window
        for nn in range(int(math.ceil(len(array) / float(args.window)))):
            if stop >= len(array):
                stop = len(array)
                agg_array.append(np.mean(array[start:stop]))
                break
            agg_array.append(np.mean(array[start:stop]))
            start += args.window
            stop += args.window
        agg_array = np.array(agg_array)
        if args.smooth:
            smooth_array = Smooth(agg_array)
        else:
            smooth_array = agg_array
        print >> wout, "variableStep chrom=%s span=%d" % (chrom, args.window)
        for nn, value in enumerate(smooth_array):
            if nn == 0:
                print >> fout, "%s\t0\t%d\t%.6f" % (chrom, (nn + 1) * args.window, float(value))
                print >> wout, "%d\t%.6f" % (nn + 1, value)
            elif nn == len(smooth_array) - 1:
                # final window is clipped to the chromosome end, so it gets
                # its own variableStep header with the reduced span
                print >> fout, "%s\t%d\t%d\t%.6f" % (chrom, nn * args.window, chrom_size, float(value))
                print >> wout, "variableStep chrom=%s span=%d" % (
                    chrom, chrom_size - ((nn) * args.window))
                print >> wout, "%d\t%.6f" % (nn * args.window + 1, float(value))
            else:
                print >> fout, "%s\t%d\t%d\t%.6f" % (chrom, nn * args.window, (nn + 1) * args.window, float(value))
                print >> wout, "%d\t%.6f" % (nn * args.window + 1, float(value))
    fout.flush()
    wout.flush()
    wig2bw = "wigToBigWig -clip %s %s %s" % (args.output + '/' + args.name + '.wig',
                                             args.genome,
                                             args.output + '/' + args.name + '.bw')
    os.system(wig2bw)
    logging("Finish: TSA_smooth DONE!!!")
def summarize(self, interval, bins=None, method='summarize', function='mean'):
    """Summarize bigWig signal over `interval`.

    With no `bins` (or method='get_as_array') returns the per-base signal
    array with NaNs zeroed.  With method='ucsc_summarize' delegates to
    self.ucsc_summarize.  Otherwise returns a `bins`-length array computed
    by `function` ('sum', 'mean', 'min', 'max', 'std'); NaN/inf are zeroed.
    """
    # We may be dividing by zero in some cases, which raises a warning in
    # NumPy based on the IEEE 754 standard (see
    # http://docs.scipy.org/doc/numpy/reference/generated/
    # numpy.seterr.html)
    #
    # That's OK -- we're expecting that to happen sometimes. So temporarily
    # disable this error reporting for the duration of this method.
    orig = np.geterr()['invalid']
    np.seterr(invalid='ignore')
    if (bins is None) or (method == 'get_as_array'):
        bw = BigWigFile(open(self.fn))
        s = bw.get_as_array(
            interval.chrom,
            interval.start,
            interval.stop,)
        if s is None:
            s = np.zeros((interval.stop - interval.start,))
        else:
            s[np.isnan(s)] = 0
    elif method == 'ucsc_summarize':
        if function in ['mean', 'min', 'max', 'std', 'coverage']:
            return self.ucsc_summarize(interval, bins, function=function)
        else:
            # BUGFIX: the original message had no space between the two
            # adjacent literals and never interpolated the function name.
            raise ValueError('function "%s" not supported by UCSC\'s '
                             'bigWigSummary' % function)
    else:
        bw = BigWigFile(open(self.fn))
        s = bw.summarize(
            interval.chrom,
            interval.start,
            interval.stop, bins)
        if s is None:
            s = np.zeros((bins,))
        else:
            if function == 'sum':
                s = s.sum_data
            if function == 'mean':
                s = s.sum_data / s.valid_count
                s[np.isnan(s)] = 0
            if function == 'min':
                s = s.min_val
                s[np.isinf(s)] = 0
            if function == 'max':
                s = s.max_val
                s[np.isinf(s)] = 0
            if function == 'std':
                # NOTE(review): this is mean-of-squares (E[x^2]), not a
                # standard deviation -- preserved as-is; confirm intent.
                s = (s.sum_squares / s.valid_count)
                s[np.isnan(s)] = 0
    # BUGFIX: restore the 'invalid' error state that was modified above.
    # The original called np.seterr(divide=orig), restoring the wrong key
    # and leaving invalid-value warnings permanently ignored process-wide.
    np.seterr(invalid=orig)
    return s
def wig_reader(infile, chrom_sizes=None, informat='wiggle', bin_size=2000):
    '''infile: either a wiggle or bigwig format file
    chromsize: chrom_name: size, only needed is format is bigwig
    format: either 'wiggle' or 'bigwig'
    return: chrom, position (0-based), value

    Generator: yields (chrom, start, end, score) runs.  In bigwig mode,
    consecutive equal scores within a bin are merged into one run.
    '''
    if informat.upper() == 'WIGGLE':
        point_num = 1
        count = 0
        for chrom, start, end, strand, score in bx.wiggle.IntervalReader(
                infile):
            yield (chrom, start, end, score)
        """
        count += 1
        if count ==1:
            chrom = fields[0]
            up_bound = fields[1]+1
            score = fields[2]
            continue
        if (fields[0] == chrom) and (fields[1] +1 == up_bound + 1) and (fields[2] == score):
            point_num += 1
            up_bound = fields[1]+1
            continue
        else:
            yield((chrom, up_bound - point_num, up_bound, score))
            chrom = fields[0]
            score = fields[2]
            up_bound = fields[1]+1
            point_num = 1
        """
    elif informat.upper() == 'BIGWIG':
        bw_obj = BigWigFile(file=open(infile))
        for chr_name, chr_size in list(chrom_sizes.items()):
            # walk the chromosome in bin_size tiles
            for chrom, st, end in BED.tillingBed(chrName=chr_name,
                                                 chrSize=chr_size,
                                                 stepSize=bin_size):
                sig_list = bw_obj.get_as_array(chrom, st, end)
                if sig_list is None:
                    continue
                sig_list = numpy.nan_to_num(sig_list)
                if numpy.sum(sig_list) == 0:
                    continue
                # run-length encode consecutive equal scores within the bin
                low_bound = st
                point_num = 1
                score = sig_list[0]
                for value in (sig_list[1:]):
                    if value == score:
                        point_num += 1
                    else:
                        yield ((chrom, low_bound, low_bound + point_num,
                                score))
                        score = value
                        low_bound = low_bound + point_num
                        point_num = 1
                # NOTE(review): the final run of each bin is never yielded
                # here -- looks like a bug, but preserved; confirm whether
                # the trailing run should be emitted after the loop.
    else:
        raise Exception("Unknown format. Must be 'wiggle' or 'bigwig'")
def get_signal(inputfile, output, bwfiles, extend, N, bwfolder): signalbw = bwfiles.strip().strip(',').split(',') if not bwfolder: bwfolder = "./" if not bwfolder.endswith('/') and not bwfolder != "": bwfolder += '/' bwHs = [] for sb in signalbw: if sb.startswith('/') or startswith("./") or startswith("../"): bwHs.append(BigWigFile(open(sb, 'rb'))) else: bwHs.append(BigWigFile(open(bwfolder + sb, 'rb'))) inf = open(inputfile) outf = open(output, 'w') for line in inf: ll = line.split() if "_" in ll[0]: continue if len(ll) >= 6 and ll[5] == "-": start = int(ll[2]) strand_flap = 1 else: start = int(ll[1]) strand_flap = 0 S = max(0, start - extend) E = start + extend # S = int(ll[1]) # E = int(ll[2]) outdata = ll for bwHandle in bwHs: try: signal = (bwHandle.summarize(ll[0], S, E, N)) binlen = (E - S) * 1.0 / N if type(signal.sum_data) == None: print 'c1', line addsig = ["na"] * N else: addsig = list( signal.sum_data * 1.0 / (binlen)) #float(signal.sum_data/signal.valid_count) except: print 'c2', line addsig = ["na"] * N #'nan' # ll.extend(list(signal.sum_data/signal.valid_count)) if strand_flap == 1: ll.extend(addsig[::-1]) else: ll.extend(addsig) # ll.extend(list(signal.sum_data/signal.valid_count)) outf.write("\t".join(map(str, ll)) + "\n") inf.close() outf.close()
def bigwig_to_wav(args):
    """Convert bigWig signal over each BED region into a .wav audio file.

    Signal is zoomed by args.zoom, optionally smoothed with a boxcar or
    gaussian window, edge-corrected by precomputed scale factors, and sent
    to `modulate` for audio synthesis.  (Python 2: integer division is
    relied on in the slice indices below.)
    """
    import numpy as np
    from bx.bbi.bigwig_file import BigWigFile
    from scipy.signal import convolve
    from scipy.stats import norm
    from scipy.ndimage import zoom
    logger.info('read input BigWigfile: ' + args.bigwig_file)
    f_bigwig = open(args.bigwig_file, 'rb')
    logger.info('read input BED file: ' + args.bed_file)
    f_bed = open(args.bed_file, 'r')
    bigwig = BigWigFile(f_bigwig)
    smooth_filter = None
    scale_factors = None
    if args.smooth == 'boxcar':
        smooth_filter = np.ones(args.window_size, dtype=np.float32)
    elif args.smooth == 'gaussian':
        smooth_filter = norm.pdf(
            np.linspace(-3, 3, args.window_size * 3,
                        endpoint=True)).astype(np.float32)
    if args.smooth != 'none':
        # scale_factors correct for the reduced filter mass at array edges
        scale_factors = convolve(np.ones(smooth_filter.shape[0]),
                                 smooth_filter, mode='same')
    if not os.path.exists(args.output_dir):
        logger.info('create output directory: ' + args.output_dir)
        os.makedirs(args.output_dir)
    for line in f_bed:
        c = line.strip().split('\t')
        chrom = c[0]
        start = int(c[1])
        end = int(c[2])
        # missing data -> 0 signal
        x = np.nan_to_num(bigwig.get_as_array(chrom, start, end))
        # zoom the signals
        x = zoom(x, args.zoom)
        if args.smooth != 'none':
            # smooth the raw signal with a moving window
            x = convolve(x, smooth_filter, mode='same')
            # scale the signal (edges use partial-overlap factors)
            filter_length = smooth_filter.shape[0]
            x[:(filter_length / 2)] /= scale_factors[:(filter_length / 2)]
            x[(-filter_length / 2):] /= scale_factors[(-filter_length / 2):]
            if x.shape[0] > filter_length:
                x[(filter_length / 2):(-filter_length / 2)] /= np.sum(smooth_filter)
        wav_file = os.path.join(args.output_dir,
                                '%s:%d-%d.wav' % (chrom, start, end))
        logger.info('create wav file: ' + wav_file)
        modulate(x, wav_file, sample_rate=args.sample_rate,
                 n_channels=args.n_channels)
def refine_with_summit(_soft, _mark, _tissue, _reps):
    """Summit-center peaks into 2kb windows, merge them, run the external
    ROC/PR enrichment pipeline, and post-process the PR table.

    (Python 2 script helper; paths are hard-coded to /Data/adam/dnase.)
    """
    _temp_peak = [i.rstrip().split('\t') for i in
                  open("/Data/adam/dnase/sort_bed/{0}.{1}.{2}.bed"
                       .format(_soft, _mark, _tissue))]
    _temp_bw = open("/Data/adam/dnase/bigwig/{0}.{1}.{2}.bw".format(_mark, _tissue,))
    _temp_enrich = open("/Data/adam/dnase/enrich_all_bed/{0}.{1}.{2}.bed"
                        .format(_soft, _mark, _tissue), 'w')
    _bw = BigWigFile(file=_temp_bw)
    for line in _temp_peak:
        vals = _bw.get(line[0], int(line[1]), int(line[2]))
        vals = tuple(vals)
        if len(vals) > 0:
            # locate the highest-signal block -> summit
            maxs = 0
            for _key in vals:
                if float(_key[2]) > maxs:
                    maxs = float(_key[2])
                    summit = _key[:2]
            summit_p = int((float(summit[0]) + float(summit[1])) / 2)
            # 2kb window around the summit, clipped at chromosome start
            if summit_p - 1000 > 0:
                print >> _temp_enrich, "{0}\t{1}\t{2}".format(
                    line[0], str(summit_p - 1000), str(summit_p + 999))
            else:
                print >> _temp_enrich, "{0}\t{1}\t{2}".format(line[0], 1, 2000)
    _temp_enrich.close()
    awk_args = '{printf "%s\\t%s\\n", $0,NR}'
    sh("sort -k 1,1 -k 2g,2g /Data/adam/dnase/enrich_all_bed/{0}.{1}.{2}.bed| bedtools merge -i stdin | awk '{3}'>/Data/adam/dnase/enrich_all_merge_bed/{0}.{1}.{2}.bed".format(_soft, _mark, _tissue, awk_args))
    enhancer_dir = "/Data/adam/dnase/enhancer/tissue_enhancer/"
    sh("bash /Data/adam/dnase/enhancer/roc_pr.sh {0}{1}_enhancer.txt {0}negative_enhancer.txt {2} {3} {1}".format(enhancer_dir, _tissue, _soft, _mark))
    raw_pr = [i.rstrip().split('\t') for i in
              open("/Data/adam/dnase/roc_pr_value/{0}.{1}.{2}.bed"
                   .format(_soft, _mark, _tissue))]
    # fill the missing half of each PR row with the last seen counts
    pr_refine = []
    temp_positive = [0, 0]
    temp_negative = [0, 0]
    for _line in raw_pr:
        # NOTE(review): raw_pr fields are strings from split('\t') but are
        # compared to int 0 here -- these conditions look like they can
        # never be True; confirm intended comparison (e.g. == '0').
        if _line[3] == 0 and _line[4] == 0:
            temp_positive = _line[1:3]
            out_line = _line[:]
            out_line[3:5] = temp_negative
            precision = float(out_line[1]) / (float(out_line[1]) + float(out_line[3]))
            out_line.append(str(precision))
            pr_refine.append(out_line)
        elif _line[1] == 0 and _line[2] == 0:
            temp_negative = _line[3:5]
            out_line = _line[:]
            out_line[1:3] = temp_positive
            precision = float(out_line[1]) / (float(out_line[1]) + float(out_line[3]))
            out_line.append(str(precision))
            pr_refine.append(out_line)
        else:
            print "error in {0}.{1}.{2}, {3}".format(_soft, _mark, _tissue, _line)
    pr_refine2 = ['\t'.join(i) for i in pr_refine]
    with open("/Data/adam/dnase/roc_pr_final/{0}.{1}.{2}.bed".format(_soft, _mark, _tissue), 'w') as f:
        for item in pr_refine2:
            print >> f, item
def evaluateTC((signalFileName,chrom,start,end)): signalFile = open(signalFileName,"r") bw = BigWigFile(signalFile) mid = (int(start)+int(end))/2 p1 = max(mid - halfWindow,0) p2 = mid + halfWindow try: nCount = int(sum(correctBW(bw.get(chrom,p1,p2),p1,p2))) except Exception: nCount = 0 signalFile.close() return nCount
def get_signal(inputfile, output, bwfiles, bwfolder, extend, N):
    """Append N binned signal values per bigWig over a +/- extend window
    centred on each interval's midpoint.

    Minus-strand intervals (column 6 == "-") get the bins reversed; any
    failure yields N zeros.  (Python 2: midpoint uses integer division.)
    """
    signalbw = bwfiles.strip().strip(',').split(',')
    if not bwfolder:
        bwfolder = "./"
    if not bwfolder.endswith('/'):
        bwfolder += '/'
    bwHs = []
    for sb in signalbw:
        # absolute paths used as-is, others resolved against bwfolder
        if sb.startswith('/'):
            bwHs.append(BigWigFile(open(sb, 'rb')))
        else:
            bwHs.append(BigWigFile(open(bwfolder + sb, 'rb')))
    inf = open(inputfile)
    outf = open(output, 'w')
    for line in inf:
        ll = line.split()
        # skip alt/random contigs
        if "_" in ll[0]:
            continue
        C = (int(ll[1]) + int(ll[2])) / 2
        S = max(0, C - extend)
        E = C + extend
        for bwHandle in bwHs:
            try:
                signal = (bwHandle.summarize(ll[0], S, E, N))
                binlen = extend * 2.0 / N
                # BUGFIX: the original tested `type(signal.sum_data) == None`,
                # which is always False; test the value itself (and guard a
                # None summary, which previously fell through to the except).
                if signal is None or signal.sum_data is None:
                    addsig = [0] * N
                else:
                    addsig = list(signal.sum_data / binlen)
            except:
                addsig = [0] * N
        if len(ll) >= 6 and ll[5] == "-":
            ll.extend(addsig[::-1])
        else:
            ll.extend(addsig)
        outf.write("\t".join(map(str, ll)) + "\n")
    inf.close()
    outf.close()
def get_signal(inputfile, output, bwfiles, bwfolder, extend):
    """Write one output row per base of each interval, with one rounded
    per-base signal column per input bigWig ("NA" on failure).
    """
    signalbw = bwfiles.strip().strip(',').split(',')
    if not bwfolder:
        bwfolder = ""
    if not bwfolder.endswith('/'):
        bwfolder += '/'
    bwHs = []
    for sb in signalbw:
        # absolute paths used as-is, others resolved against bwfolder
        if sb.startswith('/'):
            bwHs.append(BigWigFile(open(sb, 'rb')))
        else:
            bwHs.append(BigWigFile(open(bwfolder + sb, 'rb')))
    inf = open(inputfile)
    outf = open(output, 'w')
    for line in inf:
        ll = line.split()
        # skip alt/random contigs
        if "_" in ll[0]:
            continue
        if len(ll) >= 6 and ll[5] == "-":
            strand_flap = 1
        else:
            strand_flap = 0
        # center = (int(ll[1]) + int(ll[2]))/2
        # S = max(0,center - extend)
        # E = center + extend
        S = int(ll[1])
        E = int(ll[2])
        outdata = []
        for bwHandle in bwHs:
            try:
                # one bin per base -> per-base signal values
                signal = (bwHandle.summarize(ll[0], S, E, (E - S)))
                if signal:
                    thisdata_tmp = list(signal.sum_data)
                    # if strand_flap == 1:
                    #     thisdata = map(round,thisdata_tmp,[4]*(E-S))[::-1]
                    # else:
                    thisdata = map(round, thisdata_tmp, [4] * (E - S))
                else:
                    thisdata = ["NA"] * (E - S)
            except:
                thisdata = ["NA"] * (E - S)
            outdata.append(thisdata)
        # emit one BED row per base with all bigwig columns side by side
        for pos in range(len(outdata[0])):
            newll = [ll[0], S + pos, S + pos + 1]
            for dataorder in range(len(outdata)):
                newll.append(outdata[dataorder][pos])
            outf.write("\t".join(map(str, newll)) + "\n")
    inf.close()
    outf.close()
def big_wig_summary_worker((span, bw_list, region_bed_file_name, nb_proc, verbose)): results = list() bw_label = [os.path.basename(p) for p in bw_list] bw_label = [os.path.splitext(os.path.basename(p))[0] for p in bw_list] if verbose: sys.stderr.write("Processing: " + region_bed_file_name) for big_wig, cpt in zip(bw_list, range(len(bw_list))): bigwig = BigWigFile(open(big_wig, "r")) if verbose: sys.stderr.write("Computing coverage for file: " + big_wig + " [" + str(multiprocessing.current_process()) + "], " + str(span[1] - span[0]) + " chunks to process.\n") bed_windows = pybedtools.BedTool(region_bed_file_name) chr_cur = None # Loop through bed lines (features object) for i in bed_windows[slice(span[0], span[1])]: if chr_cur == None: chr_cur = i.chrom else: if i.chrom != chr_cur: chr_cur = i.chrom # Note: bigWig is zero-based/half open as bed. bw_sum = bigwig.query(i.chrom, i.start, i.end, 1) if bw_sum is not None: bw_sum = bw_sum[0]['mean'] bw_sum = np.nan_to_num(bw_sum) bw_sum = np.round(bw_sum, 2) else: bw_sum = 0.00 results.append( (i.chrom + ":" + str(i.start), bw_label[cpt], float(bw_sum))) if verbose: sys.stderr.write("Computing coverage for file: " + big_wig + " [" + str(multiprocessing.current_process()) + "]. Job done.\n") return results
def get_mappability(mappability_file, vcf_file, out_file, region=None,
                    append_chr=True):
    """Mean mappability in a +/-100bp window around each VCF record.

    Writes a (chrom, coord, mappability) table to out_file via csvutils;
    records on chromosomes absent from the bigWig get 0.
    """
    bigwig = BigWigFile(open(mappability_file, 'rb'))
    reader = vcf.Reader(filename=vcf_file)
    if region is not None:
        chrom, beg, end = parse_region_for_vcf(region)
        try:
            reader = reader.fetch(chrom, start=beg, end=end)
        except ValueError:
            print("no data for region {} in vcf".format(region))
            reader = []
    rows = []
    for record in reader:
        # bigWig chromosomes may carry a 'chr' prefix the VCF lacks
        query_chrom = 'chr{0}'.format(record.CHROM) if append_chr else record.CHROM
        position = record.POS
        window_start = max(position - 100, 0)
        window_end = position + 100
        hit = bigwig.query(query_chrom, window_start, window_end, 1)
        score = 0 if hit is None else hit[0]['mean']
        rows.append({
            'chrom': record.CHROM,
            'coord': record.POS,
            'mappability': score
        })
    frame = pd.DataFrame(rows)
    csvutils.write_dataframe_to_csv_and_yaml(frame, out_file, dtypes())
def main(): input_filename, output_filename, loc_filename, loc_key, chrom_col, start_col = sys.argv[ 1:] # open input, output, and bigwig files location_file = LocationFile(loc_filename) bigwig_filename = location_file.get_values(loc_key) bwfh = open_or_die(bigwig_filename, message='Error opening BigWig file %s' % bigwig_filename) bw = BigWigFile(file=bwfh) ifh = open_or_die(input_filename, message='Error opening input file %s' % input_filename) ofh = open_or_die(output_filename, mode='w', message='Error opening output file %s' % output_filename) # make column numbers 0-based chrom_col = int(chrom_col) - 1 start_col = int(start_col) - 1 min_cols = max(chrom_col, start_col) # add score column to imput file line_number = 0 for line in ifh: line_number += 1 line = line.rstrip('\r\n') elems = line.split('\t') if len(elems) > min_cols: chrom = elems[chrom_col].strip() # base-0 position in chrom start = int(elems[start_col]) score_list = bw.get(chrom, start, start + 1) score_list_len = len(score_list) if score_list_len == 1: beg, end, score = score_list[0] score_val = '%1.3f' % score elif score_list_len == 0: score_val = 'NA' else: die('%s line %d: chrom=%s, start=%d, score_list_len = %d' % (input_filename, line_number, chrom, start, score_list_len)) print >> ofh, '\t'.join([line, score_val]) else: print >> ofh, line bwfh.close() ifh.close() ofh.close()
def extract_phastcons ( bedfile, phas_chrnames, width, pf_res ):
    """Extract phastcons scores from a bed file.

    Return the average scores

    bedfile -- BED file of peak regions
    phas_chrnames -- chromosome names for which a per-chromosome phastcons
        bigWig ("<chrom>.bw") is available
    width -- total window width centred on each peak midpoint
    pf_res -- profile resolution (bp per summary point); the result has
        width/pf_res points
    """
    info("read bed file...")
    bfhd = open(bedfile)
    bed = parse_BED(bfhd)
    # calculate the middle point of bed regions then extend left and right by 1/2 width
    # NOTE(review): keys() returning a sortable list is Python 2 behaviour —
    # under Python 3 bchrs.sort() would fail on a dict view; confirm target runtime.
    bchrs = bed.peaks.keys()
    bchrs.sort()
    # keep only chromosomes present both in the BED and in the phastcons set
    chrs = []
    for c in phas_chrnames:
        if c in bchrs:
            chrs.append(c)
    sumscores = []
    for chrom in chrs:
        info("processing chromosome: %s" %chrom)
        pchrom = bed.peaks[chrom]
        # one bigWig per chromosome, named "<chrom>.bw" in the working directory
        bw = BigWigFile(open(chrom+'.bw', 'rb'))
        for i in range(len(pchrom)):
            mid = int((pchrom[i][0]+pchrom[i][1])/2)
            left = int(mid - width/2)
            right = int(mid + width/2)
            if left < 0:
                # window fell off the chromosome start: slide it right
                left = 0
                right = width
            summarize = bw.summarize(chrom, left, right, width/pf_res)
            if not summarize:
                continue
            # mean score per summary bin (sum over valid bases / valid bases)
            dat = summarize.sum_data / summarize.valid_count
            #dat = dat.strip().split('\t')
            sumscores.append(dat)
    ## a list with each element is a list of conservation score at the same coordinate
    sumscores = map(list, zip(*sumscores))
    ## exclude na
    sumscores = [[t2 for t2 in t if not math.isnan(t2)] for t in sumscores]
    try:
        # average across peaks at each relative coordinate
        conscores = [sum(t)/len(t) for t in sumscores]
    except ZeroDivisionError:
        # a coordinate with no finite values anywhere: fall back to zeros
        conscores = [0] * (width/pf_res)
    return conscores
def summary(bwfile1, bwfile2, bwfile_add, bedfile, topnumber, out):
    """Rank BED regions by combined mean signal of two bigWigs, then dump
    per-base signal for the top regions.

    Pass 1: for every BED line present in both bigWigs, compute the sum of
    the two mean signals and sort descending on it.
    Pass 2: for the top ``topnumber`` regions, write the region plus
    comma-joined per-base values from bwfile1, bwfile2 and every extra
    track in ``bwfile_add``.

    NOTE: Python 2 code (has_key, print statement, list-returning map).
    """
    total_result = []
    # read chromosome sizes from each bigWig header to validate BED chroms
    p = BwIO(bwfile1)
    q = BwIO(bwfile2)
    chrom_len1 = {}
    chrom_len2 = {}
    for i in p.chromosomeTree['nodes']:
        chrom_len1[i['key']] = i['chromSize']
    for i in q.chromosomeTree['nodes']:
        chrom_len2[i['key']] = i['chromSize']
    bwHandle1 = BigWigFile(open(bwfile1, 'rb'))
    bwHandle2 = BigWigFile(open(bwfile2, 'rb'))
    inf = open(bedfile)
    t = time.time()
    for line in inf:
        ll = line.split()
        # column 4 (name) is blanked out on purpose
        ll[3] = "-"
        if chrom_len1.has_key(ll[0]) and chrom_len2.has_key(ll[0]):
            # one-bin summary -> mean signal over the whole region
            summary = bwHandle1.summarize(ll[0], int(ll[1]), int(ll[2]), 1)
            if summary.valid_count == 0:
                mean_value1 = 0
            else:
                mean_value1 = (summary.sum_data / summary.valid_count)[0]
            summary = bwHandle2.summarize(ll[0], int(ll[1]), int(ll[2]), 1)
            if summary.valid_count == 0:
                mean_value2 = 0
            else:
                mean_value2 = (summary.sum_data / summary.valid_count)[0]
            total_result.append(ll + [mean_value1 + mean_value2])
    inf.close()
    # highest combined signal first
    total_result.sort(reverse=True, key=lambda x: x[-1])
    bwHs = []
    for i in bwfile_add:
        bwHs.append(BigWigFile(open(i, 'rb')))
    outf = open(out, 'w')
    print "scaning 1st ", time.time() - t
    t = time.time()
    for i in range(min(len(total_result), topnumber)):
        ll = total_result[i]
        # one bin per base: per-base signal across the region
        summary = bwHandle1.summarize(ll[0], int(ll[1]), int(ll[2]),
                                      (int(ll[2]) - int(ll[1])))
        additional_value1 = ",".join(map(str, list(summary.sum_data)))
        summary = bwHandle2.summarize(ll[0], int(ll[1]), int(ll[2]),
                                      (int(ll[2]) - int(ll[1])))
        additional_value2 = ",".join(map(str, list(summary.sum_data)))
        result = map(str, (ll + [additional_value1, additional_value2]))
        for bwH in bwHs:
            summary = bwH.summarize(ll[0], int(ll[1]), int(ll[2]),
                                    (int(ll[2]) - int(ll[1])))
            additional_value_add = ",".join(map(str, list(summary.sum_data)))
            result.append(additional_value_add)
        outf.write("\t".join(result) + "\n")
    outf.close()
    print "scaning 2nd ", time.time() - t
def check_position(chrom, start, end):
    """Return True when at least 75% of the bigWig tracks under
    DATAPATH/data cover >= 10% of the region [start, end].

    chrom -- chromosome name to query
    start, end -- inclusive coordinates of the region

    Fixes over the original: each bigWig handle is closed (the original
    leaked one open file per track), and an empty track set returns False
    instead of raising ZeroDivisionError.
    """
    valids = 0.
    wrong = 0.
    for directory in [x[0] for x in os.walk(DATAPATH + "data")]:
        for filename in glob(directory + "/*.bigWig") + glob(directory + "/*.bw"):
            # 'with' guarantees the handle is closed even if summarize raises
            with open(filename, "r") as f:
                bigwig = BigWigFile(file=f)
                summary = bigwig.summarize(chrom, start, end + 1, 1)
                # valid_count is the number of covered bases in the region;
                # require coverage of at least 10% of the region length
                if summary.valid_count * 10 < end - start + 1:
                    wrong += 1
                else:
                    valids += 1
    if valids + wrong == 0:
        # no tracks found: the 75% criterion cannot be met
        return False
    return (valids / (valids + wrong) >= 0.75)
def get_signal(inputfile,output,signalbw,extend):
    """Append, for each BED line, the mean signal of every bigWig track in a
    window of +/- ``extend`` bp around the region midpoint.

    inputfile -- BED-like file (chrom, start, end, ...)
    output -- destination TSV; rows are written only when every track
        produced a value
    signalbw -- comma-separated list of bigWig paths
    extend -- half-width of the window around the midpoint

    NOTE: Python 2 code (has_key, integer division of the midpoint).
    """
    signalbw = signalbw.strip().strip(',').split(',')
    # chromosome sizes from the first track's header, used to skip unknown chroms
    p=BwIO(signalbw[0])
    chrom_len = {}
    for i in p.chromosomeTree['nodes']:
        chrom_len[i['key']] = i['chromSize']
    bwHandle = []
    for k in signalbw:
        bwHandle.append(BigWigFile(open(k, 'rb')))
    inf = open(inputfile)
    outf = open(output,'w')
    for line in inf:
        ll = line.split()
        inputlen = len(ll)
        if not chrom_len.has_key(ll[0]):
            continue
        for bwH in bwHandle:
            # 1 bp interval at the region midpoint, then extended both ways
            S = (int(ll[1]) + int(ll[2]))/2
            E = (int(ll[1]) + int(ll[2]))/2 + 1
            try:
                signal=bwH.summarize(ll[0],max(0,S-extend),E+extend,1)
            except:
                # any summarize failure aborts this row (see length check below)
                break
            if float(signal.valid_count) == 0:
                ll.append('0')
            else:
                ll.append(str(float(signal.sum_data/signal.valid_count)))
        # only emit rows for which every track appended a value
        if len(ll) == ( inputlen + len(bwHandle) ):
            outf.write("\t".join(ll)+"\n")
    inf.close()
    outf.close()
def test_summaries_from_file():
    """Nose generator test: compare BigWigFile.summarize against the
    pre-computed expectations in test.expectation (Python 2 version)."""
    bw = BigWigFile(file=open("test_data/bbi_tests/test.bw"))
    def check_summary(line):
        # expectation line: chrom start end nbins stat v1 v2 ... ('n/a' = NaN)
        fields = line.split()
        chrom = fields[0]
        start = int(fields[1])
        end = int(fields[2])
        n = int(fields[3])
        t = fields[4]
        values = [float(v.replace('n/a', 'NaN')) for v in fields[5:]]
        sd = bw.summarize(chrom, start, end, n)
        if t == 'mean':
            print sd.sum_data / sd.valid_count
            print values
            assert allclose(sd.sum_data / sd.valid_count, values)
        elif t == 'min':
            assert allclose(sd.min_val, values)
        elif t == 'max':
            assert allclose(sd.max_val, values)
        #elif t == 'std':
        #    assert numpy.allclose( sd.max_val, values )
    # one yielded check per expectation line (nose runs each as a test)
    for line in open("test_data/bbi_tests/test.expectation"):
        yield check_summary, line
class BigWigWrapper(object):
    """Thin adapter exposing a bx-python BigWig file through interval indexing.

    Indexing with an interval-like object (anything with .chrom, .start and
    .end) returns the per-base signal array for that span.
    """

    def __init__(self, filepath):
        # keep the BigWigFile reader; the underlying handle stays open for
        # the lifetime of this wrapper
        handle = open(filepath)
        self.bw = BigWigFile(handle)

    def __getitem__(self, iv):
        """Return the signal array for interval ``iv``."""
        return self.bw.get_as_array(iv.chrom, iv.start, iv.end)
def build(self):
    """
    Build the matrix.

    Since bigWig files are essentially pre-summarized, this just extracts
    the chrom/start/stop represented by each cell in the matrix and fills
    it with the value from the bigWig file.
    """
    self.bigwig = BigWigFile(open(self.file))
    # chrom_rc: per-chromosome (row, col) cell coordinates;
    # chrom_bins: number of matrix cells allotted to each chromosome
    chrom_rc, chrom_bins = self.chrom2rc()
    if self.chrom == 'genome':
        chroms = self.chromdict.keys()
    else:
        chroms = [self.chrom]
    for chrom in chroms:
        rc = chrom_rc[chrom]
        nbins = chrom_bins[chrom]
        start, stop = self.chromdict[chrom]
        # one summary bin per matrix cell; mean = sum / covered-base count
        results = self.bigwig.summarize(chrom, start, stop, nbins)
        values = results.sum_data / results.valid_count
        # bins with no coverage yield 0/0 -> NaN; display those as 0
        values[np.isnan(values)] = 0
        self.matrix[rc[:,0], rc[:, 1]] = values
    self._cleanup()
def test_summaries_from_file():
    """Nose generator test: compare BigWigFile.summarize against the
    pre-computed expectations in test.expectation (Python 3 version)."""
    bw = BigWigFile(file=open("test_data/bbi_tests/test.bw", 'rb'))

    def check_summary(line):
        # expectation line: chrom start end nbins stat v1 v2 ... ('n/a' = NaN)
        fields = line.split()
        chrom = fields[0]
        start = int(fields[1])
        end = int(fields[2])
        n = int(fields[3])
        t = fields[4]
        values = [float(v.replace('n/a', 'NaN')) for v in fields[5:]]
        sd = bw.summarize(chrom, start, end, n)
        if t == 'mean':
            print(sd.sum_data / sd.valid_count)
            print(values)
            assert allclose(sd.sum_data / sd.valid_count, values)
        elif t == 'min':
            assert allclose(sd.min_val, values)
        elif t == 'max':
            assert allclose(sd.max_val, values)
        # elif t == 'std':
        #     assert numpy.allclose( sd.max_val, values )

    # one yielded check per expectation line, labelled for test output
    for i, line in enumerate(open("test_data/bbi_tests/test.expectation")):
        f = partial(check_summary, line)
        f.description = "Test summaries line %d: %s" % (i, line[:40])
        yield (f, )
def load_annos(args):
    """
    Populate a dictionary of Tabixfile handles for each annotation
    file. Other modules can then access a given handle and fetch data from
    it as follows:

    dbsnp_handle = annotations.annos['dbsnp']
    hits = dbsnp_handle.fetch(chrom, start, end)

    Fix: BigWig files are binary, so they are now opened in 'rb' mode
    (consistent with the other BigWigFile call sites in this codebase;
    text mode can corrupt reads on Python 3 / Windows).
    """
    anno_files = get_anno_files(args)
    for anno in anno_files:
        try:
            # .gz denotes Tabix files.
            if anno_files[anno].endswith(".gz"):
                annos[anno] = pysam.Tabixfile(anno_files[anno])
            # .bw denotes BigWig files.
            elif anno_files[anno].endswith(".bw"):
                annos[anno] = BigWigFile(open(anno_files[anno], 'rb'))
        except IOError:
            sys.exit("Gemini cannot open this annotation file: %s. \n"
                     "Have you installed the annotation files? If so, "
                     "have they been moved or deleted? Exiting...\n\n"
                     "For more details:\n\t"
                     "http://gemini.readthedocs.org/en/latest/content/"
                     "#installation.html\#installing-annotation-files\n"
                     % anno_files[anno])
def get_phastcons(bedtool, phastcons_location, species=None, index=None, ):
    """
    Get phastcons scores for intervals in a bed tool

    bedtool -- either a single interval (has .chrom/.start/.stop) or an
        iterable bedtool of such intervals; which case applies is detected
        at runtime via the try/except below
    phastcons_location -- path to the phastcons bigWig

    Returns a scalar mean for a single interval, or a numpy array of means
    for an iterable bedtool.
    """
    if species is None and index is None:
        print "Error, must select species or index"
    f = open(phastcons_location, 'r')
    bw = BigWigFile(file=f)

    try:
        #if its a line
        #for each line fetch bigwig values
        type(bedtool)
        v = bedtool.chrom #is a single interval
        vals = bw.get(bedtool.chrom, bedtool.start, bedtool.stop)
        # each entry from .get() is (start, end, value); take the value
        consvals = list(v[-1] for v in vals)
        if len(consvals) > 0:
            mean_phastcons = np.mean(consvals)
        else:
            mean_phastcons=0
        data = mean_phastcons
    except:
        # NOTE(review): bare except also swallows real errors from the
        # single-interval branch, and `data` is unbound if bedtool is empty.
        #if bedtool
        for i, bedline in enumerate(bedtool):
            # NOTE(review): the result array is re-created on every
            # iteration, discarding earlier entries — looks like it was
            # meant to be allocated once before the loop; confirm.
            data = np.ndarray(len(bedtool))
            vals = bw.get(bedline.chrom, bedline.start, bedline.stop)
            consvals = list(v[-1] for v in vals)
            if len(consvals) > 0:
                mean_phastcons = np.mean(consvals)
            else:
                mean_phastcons=0
            data[i] = mean_phastcons
    #returns mean phastcons score for each line
    #returns inconistant data types, need to convert so it just returns an array
    return data
def main():
    """Append the BigWig score at (chrom, start) to every input row.

    argv: input_file output_file location_file location_key chrom_col start_col
    (column numbers are 1-based on the command line).
    """
    input_filename, output_filename, loc_filename, loc_key, chrom_col, start_col = sys.argv[1:]
    # open input, output, and bigwig files
    location_file = LocationFile(loc_filename)
    bigwig_filename = location_file.get_values(loc_key)
    bwfh = open_or_die(bigwig_filename,
                       message='Error opening BigWig file %s' % bigwig_filename)
    bw = BigWigFile(file=bwfh)
    ifh = open_or_die(input_filename,
                      message='Error opening input file %s' % input_filename)
    ofh = open_or_die(output_filename, mode='w',
                      message='Error opening output file %s' % output_filename)
    # convert the 1-based command-line column numbers to 0-based indices
    chrom_idx = int(chrom_col) - 1
    start_idx = int(start_col) - 1
    needed_cols = max(chrom_idx, start_idx)
    for lineno, raw in enumerate(ifh, start=1):
        row = raw.rstrip('\r\n')
        cells = row.split('\t')
        if len(cells) <= needed_cols:
            # too few columns: copy the row through unchanged
            print(row, file=ofh)
            continue
        chrom = cells[chrom_idx].strip()
        # base-0 position in chrom
        start = int(cells[start_idx])
        hits = bw.get(chrom, start, start + 1)
        if len(hits) == 1:
            _, _, score = hits[0]
            score_val = '%1.3f' % score
        elif len(hits) == 0:
            score_val = 'NA'
        else:
            die('%s line %d: chrom=%s, start=%d, score_list_len = %d'
                % (input_filename, lineno, chrom, start, len(hits)))
        print('\t'.join([row, score_val]), file=ofh)
    bwfh.close()
    ifh.close()
    ofh.close()
def getNumberOfFragmentsPerRegionFromBigWig(bw, chromSizes):
    """
    Get the number of all mapped fragments per region in all chromosomes
    from a bigWig. Utilizing bx-python.

    bw -- path to the bigWig file
    chromSizes -- list of (chrom_name, chrom_size) tuples to scan

    Fixes over the original: the file handle is closed via 'with', and a
    chromosome missing from the bigWig (bwh.get returns None) is skipped
    instead of raising TypeError.

    Test dataset with two samples covering 200 bp.
    >>> test = Tester()

    Get number of fragments in sample.
    >>> getNumberOfFragmentsPerRegionFromBigWig(test.bwFile1, [('3R', 200)])
    3.0
    >>> getNumberOfFragmentsPerRegionFromBigWig(test.bwFile2, [('3R', 200)])
    4.0
    """
    mapped = 0
    with open(bw, "rb") as handle:
        bwh = BigWigFile(handle)
        for cname, csize in chromSizes:
            # each region from .get() is (start, end, value)
            regions = bwh.get(cname, 0, csize)
            if regions is None:
                # chromosome absent from this bigWig
                continue
            for region in regions:
                mapped += region[2]
    return mapped
def get_GA_from_bw(self, plus, minus, GTF, filterfxn):
    """Build a stranded HTSeq.GenomicArray from a pair of strand-specific
    bigWig files.

    plus -- bigWig path holding '+'-strand signal
    minus -- bigWig path holding '-'-strand signal
    GTF -- iterable of features; each feature's .iv window is queried
    filterfxn -- predicate; features for which it returns False are skipped

    Fix: the original duplicated the whole load loop for each strand;
    the shared logic is extracted into one helper.
    """
    ##bx-python 'get' method is 0 based, fully closed
    ga = HTSeq.GenomicArray( "auto", typecode='d' , stranded = True)

    def _load_strand(path, strand):
        # Fill ga with per-position values from one strand's bigWig.
        with open(path) as f:
            bw_file = BigWigFile(file=f)
            for GF in GTF:
                if filterfxn( GF ) == False: continue
                window = GF.iv
                chrom, start, stop = window.chrom, window.start, window.end
                vals = bw_file.get(chrom, start, stop)
                # each val is (start, stop, value)
                for vstart, vstop, value in vals:
                    ga[ HTSeq.GenomicPosition(chrom, vstart, strand) ] = value

    _load_strand(plus, '+')
    _load_strand(minus, '-')
    return ga
def profile_bwfile(inbed,bwfile):
    '''retrieve signal from bigwig file for each entry in input bed file

    inbed -- BED file; comment/track/browser/blank lines are skipped
    bwfile -- bigWig to query

    Prints one line per region to stdout:
    chrom<TAB>start<TAB>end<TAB>v1,v2,...
    NOTE: Python 2 code (print statement, print >> sys.stderr).
    '''
    bw = BigWigFile( file=open( bwfile ) )
    for line in open(inbed):
        bw_signal=[]
        try:
            if line.startswith('#'):continue
            if line.startswith('track'):continue
            if line.startswith('browser'):continue
            if not line.strip():
                continue
            else:
                line = line.rstrip('\r\n')
                fields = line.split()
                chrom = fields[0]
                start = int(fields[1])
                end = int(fields[2])
        except:
            # malformed line: warn (trailing comma suppresses the newline) and skip
            print >>sys.stderr,"Must be chrom [space] start [space] end: " + line,
            continue
        bw_signal.extend(bw.get_as_array(chrom,start,end))
        print chrom +'\t'+ str(start) +'\t'+ str(end) + '\t' + ','.join(str(i) for i in bw_signal)
def findInsertions(bwFile, bedData, x):
    """Fetch the per-base signal for BED entry ``x``, extended by the global
    option flanks, reversed when the entry is on the '-' strand.

    bwFile -- default bigWig path (overridden per-chromosome when
        options.tn5 is set)
    bedData -- parsed BED rows (chrom, start, end[, strand], ...)
    x -- index of the row to process
    """
    rec = bedData[x]
    # per-chromosome Tn5 score track overrides the supplied path
    if options.tn5 is not None:
        bwFile = options.b + options.tn5 + "." + rec[0] + ".Scores.bw"
    lo = int(rec[1]) - options.l
    hi = int(rec[2]) + options.r
    # get signal data
    handle = open(bwFile, "rb")
    reader = BigWigFile(handle)
    try:
        signal = reader.get_as_array(rec[0], lo, hi)
    except OverflowError:
        # coordinates outside the addressable range: emit all-NaN
        signal = np.array([np.nan] * (hi - lo))
    handle.close()
    out = signal
    try:
        if rec[3] == "-":
            out = out[::-1]
    except IndexError:
        # no strand column: leave orientation as-is
        pass
    return out
def get_phastcons(bedtool, species=None, index=None):
    """
    Get phastcons scores for intervals in a bed tool

    bedtool -- either a single interval (has .chrom/.start/.stop) or an
        iterable bedtool; which case applies is detected at runtime
    species -- 'mm9' or 'hg19' selects a hard-coded phastcons bigWig path
    index -- explicit bigWig path (takes effect when species is None)

    Returns a scalar mean for a single interval, or a numpy array of
    per-line means for an iterable bedtool.
    """
    if species is None and index is None:
        print "Error, must select species or index"
    # map species shortcut to its bundled phastcons track
    if species is not None and index is None:
        if species == "mm9":
            index= basedir + "/yeolab/Conservation/phastCons/mm9_30way/placental/mm9_phastcons.bw"
        elif species == "hg19":
            index = basedir + "/yeolab/Conservation/phastCons/hg19_46way/placentalMammals/reformat/hg19_phastcons.bw"
    f = open(index, 'r')
    bw = BigWigFile(file=f)
    try:
        type(bedtool)
        v = bedtool.chrom #is a single interval
        vals = bw.get(bedtool.chrom, bedtool.start, bedtool.stop)
        # entries from .get() are (start, end, value); keep the value
        consvals = list(v[-1] for v in vals)
        if len(consvals) > 0:
            mean_phastcons = np.mean(consvals)
        else:
            mean_phastcons=0
        data = mean_phastcons
    except:
        # NOTE(review): bare except also hides real errors from the branch
        # above, and `data` is unbound when bedtool is empty.
        for i, bedline in enumerate(bedtool):
            # NOTE(review): the result array is re-created every iteration,
            # discarding earlier entries — likely meant to be hoisted; confirm.
            data = np.ndarray(len(bedtool))
            vals = bw.get(bedline.chrom, bedline.start, bedline.stop)
            consvals = list(v[-1] for v in vals)
            if len(consvals) > 0:
                mean_phastcons = np.mean(consvals)
            else:
                mean_phastcons=0
            data[i] = mean_phastcons
    return data
class TestBigWig(unittest.TestCase):
    """Unit tests for BigWigFile.query/summarize against a fixed test bigWig.
    NOTE: Python 2 code (print statement, list-returning map)."""

    def setUp(self):
        # shared reader over the checked-in fixture
        f = open( "test_data/bbi_tests/test.bw" )
        self.bw = BigWigFile(file=f)

    def test_get_summary(self):
        """query() and summarize() must agree on 10-bin means, min and max."""
        data = self.bw.query("chr1", 10000, 20000, 10)
        means = [ x['mean'] for x in data ]
        print means
        assert numpy.allclose( map(float, means), [-0.17557571594973645, -0.054009292602539061, -0.056892242431640622, -0.03650328826904297, 0.036112907409667966, 0.0064466032981872557, 0.036949024200439454, 0.076638259887695306, 0.043518108367919923, 0.01554749584197998] )

        # Summarize variant
        sd = self.bw.summarize( "chr1", 10000, 20000, 10)
        assert numpy.allclose( sd.sum_data / sd.valid_count, [-0.17557571594973645, -0.054009292602539061, -0.056892242431640622, -0.03650328826904297, 0.036112907409667966, 0.0064466032981872557, 0.036949024200439454, 0.076638259887695306, 0.043518108367919923, 0.01554749584197998] )

        # Test min and max for this entire summary region
        data = self.bw.query("chr1", 10000, 20000, 1)
        maxs = [ x['max'] for x in data ]
        mins = [ x['min'] for x in data ]
        self.assertEqual( map(float, maxs), [0.289000004529953] )
        self.assertEqual( map(float, mins), [-3.9100000858306885] )

    def test_get_leaf(self):
        """Per-base query inside a single leaf node of the bigWig index."""
        data = self.bw.query("chr1", 11000, 11005, 5)
        means = [ x['mean'] for x in data ]
        assert numpy.allclose( map(float, means), [0.050842501223087311, -2.4589500427246094, 0.050842501223087311, 0.050842501223087311, 0.050842501223087311] )

        # Test min and max for this entire leaf region
        data = self.bw.query("chr1", 11000, 11005, 1)
        maxs = [ x['max'] for x in data ]
        mins = [ x['min'] for x in data ]
        self.assertEqual( map(float, maxs), [0.050842501223087311] )
        self.assertEqual( map(float, mins), [-2.4589500427246094] )

    def test_wrong_nochrom(self):
        """Querying a chromosome absent from the file returns None."""
        data = self.bw.query("chr2", 0, 10000, 10)
        self.assertEqual( data, None )
# Script: plot a heatmap of bigWig signal over windows around selected genes.
# argv: <bigWig path> <distance>   (NOTE: `dist` and `table` are read but
# never used in this visible chunk — possibly leftovers; confirm.)
import numpy as np
fl=sys.argv[1]
dist=int(sys.argv[2])
from bx.bbi.bigwig_file import BigWigFile
# project-local helpers: read.dat / bedtools.makebed_genpos / plt are
# imported elsewhere in this file
genes=read.dat("/home/ssaberi/resources/list.genes.txt",'\t')
table=read.dat("/projects/epigenomics/MarcoJuliaPon/peaks.txt",'\t')
mygenes=read.dat("/projects/epigenomics/MarcoJuliaPon/mygenes.txt",'\t')
# map each requested gene name to its first matching Ensembl id
ens=[]
for i in mygenes:
    for gn in genes:
        if i in gn[0]:
            ens.append(gn[1])
            break
genespos=read.read_gene_pos('/home/ssaberi/resources/hg19v69_genes.TSS_2000.pc.A03480.H3K27me3.GE02.coverage')
genesbed=bedtools.makebed_genpos(ens,genespos,100000)
f = open(fl)
bw = BigWigFile(file=f)
# one row of (start, end, value) tuples per gene window
mat=[]
for bed_i in genesbed:
    vals = bw.get( bed_i[0], bed_i[1], bed_i[2])
    mat.append(np.array(vals))
mat=np.array(mat)
plt.matshow(mat,aspect='auto',cmap='YlOrBr')
# strip directory and extension from the input path for the output name
fl=fl[-fl[::-1].index('/'):-fl[::-1].index('.')]
# NOTE(review): matplotlib's saver is savefig(); plt.save() does not exist
# in modern matplotlib — confirm `plt` here is not a project wrapper.
plt.save(fl+".pdf")
def output(fragmentsMap , fragmentList, fragmentPairs, fragmentCount, fragmentsChrom):
    '''
    outputs 2 files, the first containing
    "chr extraField fragmentMid marginalizedContactCount mappable? (0/1)"
    and the second containing:
    "chr1 fragmentMid1 chr2 fragmentMid2 contactCount"

    optionally output the 2D contact matrix

    NOTE: Python 2 code (print statement, has_key, iteritems, xrange);
    relies on module-level `options` and `args`.
    '''
    if (options.verbose):
        print >> sys.stdout, "- %s START : output data " % (timeStamp())

    # --- file 1: per-fragment marginal counts + mappability -----------------
    if ( options.outputFilename != "" ):
        outfile1 = gzip.open(options.outputDir+options.outputFilename+".fragmentLists.gz","wb")
    else:
        outfile1 = gzip.open(options.outputDir+os.path.basename(args[0])+".fragmentLists.gz","wb")

    fragmentIds = fragmentsMap.keys()
    fragmentIds.sort()

    # lookup mean mappability ratio
    bw = ""
    if (options.mappability != ""):
        # lazy load
        from bx.intervals.io import GenomicIntervalReader
        from bx.bbi.bigwig_file import BigWigFile
        bw = BigWigFile( open( options.mappability ) )

    for fragmentId in fragmentIds:
        contactCounts = 0
        chrom = fragmentsMap[fragmentId][0]
        midpoint = fragmentsMap[fragmentId][1]
        if (options.vverbose):
            print >> sys.stdout, "- process %s %d " % (chrom, midpoint)
        if (fragmentList.has_key(fragmentId)):
            contactCounts = fragmentList[fragmentId]
        if (bw != ""):
            # mean mappability over one resolution-sized window at the midpoint
            try:
                mappable = bw.query(chrom, midpoint-options.resolution/2, midpoint+options.resolution/2, 1)[0]["mean"]
            except:
                mappable = 0
                # problem with invalid values
                if (options.vverbose):
                    print >> sys.stderr, "Problem with bw file at %s %d-%d" % (chrom, midpoint-options.resolution/2, midpoint+options.resolution/2)
                    print traceback.format_exc()
        elif (contactCounts>0):
            # no mappability track: any observed contact marks it mappable
            mappable=1
        outfile1.write("%s\t%d\t%s\t%f\n" % (chrom, midpoint, "NA", mappable))
    outfile1.close()

    # --- file 2: pairwise contact counts ------------------------------------
    if ( options.outputFilename != "" ):
        outfile2 = gzip.open(options.outputDir+options.outputFilename+".contactCounts.gz","wb")
    else:
        outfile2 = gzip.open(options.outputDir+os.path.basename(args[0])+".contactCounts.gz","wb")

    for fragmentIds, contactCounts in fragmentPairs.iteritems():
        chrom1 = fragmentsMap[fragmentIds[0]][0]
        midpoint1 = fragmentsMap[fragmentIds[0]][1]
        chrom2 = fragmentsMap[fragmentIds[1]][0]
        midpoint2 = fragmentsMap[fragmentIds[1]][1]
        outfile2.write("%s\t%d\t%s\t%d\t%d\n" % (chrom1, midpoint1, chrom2, midpoint2, contactCounts))
    outfile2.close()

    # --- optional dense 2D matrices -----------------------------------------
    if (options.create2DMatrix or options.create2DMatrixPerChr):
        # lazy loading
        from scipy.sparse import lil_matrix
        import numpy
        # populate sparse matrix (symmetric: store both triangles)
        A = lil_matrix((fragmentCount, fragmentCount), dtype='i')
        for fragmentIds, contactCounts in fragmentPairs.iteritems():
            A[fragmentIds[0],fragmentIds[1]] = contactCounts
            A[fragmentIds[1],fragmentIds[0]] = contactCounts
        # convert to coordinate format
        B = A.tocoo()

        if (options.create2DMatrix):
            if ( options.outputFilename != "" ):
                outfile3 = options.outputDir+options.outputFilename+".matrix"
            else:
                outfile3 = options.outputDir+os.path.basename(args[0])+".matrix"
            if (options.verbose):
                print >> sys.stdout, "- save 2Dmatrix to %s " % (outfile3)
            f_handle=open(outfile3,'w')
            C = B.tocsr()
            # write the whole genome matrix row by row
            for i in xrange(fragmentCount):
                numpy.savetxt(f_handle, C[i].toarray(),fmt='%i', delimiter='\t')
            f_handle.close()

        if (options.create2DMatrixPerChr):
            for chr in fragmentsChrom.keys():
                # slice out this chromosome's square sub-matrix
                C = B.tocsc()[:,fragmentsChrom[chr][0]:fragmentsChrom[chr][1]].tocsr()[fragmentsChrom[chr][0]:fragmentsChrom[chr][1],:]
                fragmentRange=fragmentsChrom[chr][1]-fragmentsChrom[chr][0]
                header=['d']+[ "%s%d" % i for i in zip(['r']*fragmentRange,range(fragmentRange))]
                if ( options.outputFilename != "" ):
                    outfile3 = options.outputDir+options.outputFilename+"."+chr+".matrix"
                else:
                    outfile3 = options.outputDir+os.path.basename(args[0])+"."+chr+".matrix"
                if (options.verbose):
                    print >> sys.stdout, "- save 2Dmatrix for chromosome %s to %s " % (chr, outfile3)
                f_handle=open(outfile3,'w')
                f_handle.write('\t'.join(header)+"\n")
                for i in xrange(fragmentRange):
                    f_handle.write(header[i+1]+"\t")
                    numpy.savetxt(f_handle, C[i].toarray(),fmt='%i', delimiter='\t')
                f_handle.close()

    if (options.verbose):
        print >> sys.stdout, "- %s FINISHED: output data" % (timeStamp())
#data = np.array(range(1,10)); #for f in allFunctions: # data = applyFunction(data,f); # print(data); #print(allFunctions); #exit; chromSizesFile = MYUTILS.smartGZOpen(args.chrsFile,'r'); chromSizes = {}; for line in chromSizesFile: if line is None or line == "" or line[0]=="#": continue data=line.rstrip().split("\t"); chromSizes[data[0]]=int(data[1]); curBW = BigWigFile(open(args.inBW)) outStream = MYUTILS.smartGZOpen("%s.wig.gz"%(args.outFPre),"w"); outStream.write("track type=wiggle_0\n") for chr in chromSizes.keys(): last = 0; final = chromSizes[chr]; sys.stderr.write("Outputting data for %s:\n"%(chr)); while last!=final: # this breaks it up into chunks so that I'm not piping entire (human) chromosomes at once if args.verbose>0: sys.stderr.write(" Section %i - %i:\n"%(last,curLast)); curLast = np.min([last+args.chunks,final]); curEnd = np.min([curLast+additionalFlankSize, final]); curSt = np.max([last-additionalFlankSize,0]); values = curBW.get_as_array( chr, curSt, curEnd )
# Script chunk: randomly sample genome positions, then load each bigWig
# track's values at those positions into one matrix (IDs/files/chrOrder/
# chromSizes/args come from earlier in the file; chunk is truncated below).
useThese = {};
totalLength = 0
for chr in chrOrder:
    #sample positions
    useThese[chr] = np.random.random_sample((chromSizes[chr]))<args.sample;
    totalLength = totalLength + np.sum(useThese[chr]);

#make a matrix of the data
allData = np.empty([totalLength,len(IDs)]);
if args.eliminateMissing>0:
    keepThese = np.ones([totalLength]).astype(bool); # only those for which data was observed in all tracks

for i in range(0,len(IDs)): #input GB tracks
    curBW = BigWigFile(open(files[i]))
    curTot = 0;
    if args.verbose>1: sys.stderr.write("Inputting data for %s.\n"%(IDs[i]));
    for chr in chrOrder:
        if args.verbose>1: sys.stderr.write(" Inputting data for %s.\n"%(chr));
        if args.verbose>2: sys.stderr.write(" Getting data from BW.\n");
        values = curBW.get_as_array( chr, 0, chromSizes[chr] )
        if values is None:
            # chromosome absent from this track: drop it everywhere so all
            # tracks stay aligned on the same sampled positions
            sys.stderr.write("%s is missing %s... skipping it for all\n"%(IDs[i],chr));
            # NOTE(review): chrOrder is mutated while being iterated — the
            # element after the removed one is skipped by the iterator; confirm
            # this is intended.
            chrOrder.remove(chr)
            allData = np.delete(allData, [range(curTot, (curTot+np.sum(useThese[chr])))],0);
            if args.eliminateMissing>0:
                keepThese = np.delete(keepThese, [range(curTot, (curTot+np.sum(useThese[chr])))],0);
            totalLength = totalLength - np.sum(useThese[chr]);
            del useThese[chr]
            del chromSizes[chr]
# Script setup: command-line options and I/O for a bigWig smoothing tool.
# NOTE: Python 2 code (print statement, file() builtin).
opts.add_option("-a", help="<bw> Accepts a bigwig file")
opts.add_option("-g", help="<Genome Size file>")
opts.add_option("-w", default=150,type='int', help="<Int> window size")
opts.add_option("-s", default=20,type='int', help="<Int> step size (span)")
options, arguments = opts.parse_args()

# return usage information if no argvs given
if len(sys.argv)==1:
    os.system(sys.argv[0]+" --help")
    sys.exit()

##### DEFINE FUNCTIONS #####

##### INPUTS AND OUTPUTS #####
# open bigwig
bw = BigWigFile(open(options.a))

# get gSize file
gSizes = np.loadtxt(options.g,'str')
# processing chunk length and padding, in bp
chunkSize = 1000000
padLen = 5000

# open out file (next to the input bigWig); remove any stale copy first
outName = os.path.join(os.path.dirname(options.a), 'out.smooth.bed')
try:
    os.remove(outName)
except OSError:
    pass
print "Saving to %s.."%outName
outF = file(outName, 'a')
def setUp(self):
    """Open the shared bigWig fixture used by every test in this case."""
    handle = open( "test_data/bbi_tests/test.bw" )
    self.bw = BigWigFile(file=handle)
# Script chunk: translate chromosome names while converting a bigWig to
# wiggle text, then rebuild a bigWig via the wigToBigWig binary.
# NOTE(review): the first statements continue a per-line loop over a name
# translation file whose header is above this excerpt.
if line is None or line == "" or line[0]=="#": continue
data=line.rstrip().split("\t");
# every alias on the line maps to the canonical name in column 0
for i in range(0,len(data)):
    oldToNew[data[i]] = data[0];
    transChrs.append(data[i]);
inFile.close();

chromSizesFile = MYUTILS.smartGZOpen(args.chrsFile,'r');
chromSizes = {};
for line in chromSizesFile:
    if line is None or line == "" or line[0]=="#": continue
    data=line.rstrip().split("\t");
    chromSizes[data[0]]=int(data[1]);

curBW = BigWigFile(open(args.inBW))
outStream = MYUTILS.smartGZOpen("%s.wig.gz"%(args.outFPre),"w");
outStream.write("track type=wiggle_0\n")
for chr in transChrs:
    values = curBW.get_as_array( chr, 0, chromSizes[oldToNew[chr]] )
    #print(chr);
    if values is not None:
        sys.stderr.write("Adding %s -> %s\n"%(chr, oldToNew[chr]));
        outStream.write("fixedStep chrom=%s start=1 step=1\n"%(oldToNew[chr]))
        outStream.write("\n".join(map(str,values)));
        outStream.write("\n");
# NOTE(review): outStream is not closed/flushed before wigToBigWig reads the
# .wig.gz — confirm smartGZOpen flushes on write or that this is closed later.
toBW = subprocess.Popen(["wigToBigWig","%s.wig.gz"%(args.outFPre),args.chrsFile,"%s.bw"%(args.outFPre)])
temp = toBW.communicate()
# Script chunk: classify splice sites from an annotation line (tokens),
# then scan an incidence file against phyloP scores.
# NOTE(review): this chunk continues a per-line loop from above and is
# truncated mid-expression at the end of the excerpt.
# BED-style half-open coords: store 0-based positions of 5'/3' ends per strand
if tokens[3] == '+':
    annotated_5p.add((tokens[0], int(tokens[1]) - 1))
    annotated_3p.add((tokens[0], int(tokens[2]) - 1))
elif tokens[3] == '-':
    # strand flips which end is the 5' vs 3' splice site
    annotated_3p.add((tokens[0], int(tokens[1]) - 1))
    annotated_5p.add((tokens[0], int(tokens[2]) - 1))
else:
    raise RuntimeError(
        'Invalid line in annotation file: "{}".'.format(line)
    )
unannotated_fivep_splice_site_counts = defaultdict(int)
unannotated_threep_splice_site_counts = defaultdict(int)
annotated_fivep_splice_site_counts = defaultdict(int)
annotated_threep_splice_site_counts = defaultdict(int)
from bx.bbi.bigwig_file import BigWigFile
bw = BigWigFile(open(args.phylop_bw, 'rb'))
print >>sys.stderr, '\x1b[KDone. Computing/writing matrix elements...'
with open(
        allincidence
    ) as incidence_stream, open(
        args.out, 'w'
    ) as output_stream:
    unannotated_line_counts = defaultdict(int)
    annotated_line_counts = defaultdict(int)
    splice_sites = 0
    # group incidence rows by their first (key) column
    for key, group in itertools.groupby(
            incidence_stream, lambda x: x.split('\t')[0]
        ):
        for line in group:
            print >>sys.stderr, (
                'Processed {} splice sites...\r'.format(
Create a site profile vector showing the average signal accumulated from a
bigwig file around the center of each interval from a BED file.

Output is the average signal value at that relative position across the
intervals.

usage: %prog bigwig_file.bw padding < bed_file.bed
"""
import sys

from numpy import *

from bx.intervals.io import GenomicIntervalReader
from bx.bbi.bigwig_file import BigWigFile

bw = BigWigFile( open( sys.argv[1] ) )
padding = int( sys.argv[2] )
# running sum of signal and count of covered bases at each relative position
totals = zeros( padding*2, dtype=float64 )
valid = zeros( padding*2, dtype=int32 )

for interval in GenomicIntervalReader( sys.stdin ):
    center = floor( ( interval.start + interval.end ) / 2 )
    values = bw.get_as_array( interval.chrom, center - padding, center + padding )
    # Determine which positions had data and mask the rest for totalling
    invalid = isnan( values )
    values[ invalid ] = 0
    totals += values
    valid += ( ~ invalid )

# per-position mean over intervals that actually had data there
savetxt( sys.stdout, totals/valid )
# Script chunk: finish measuring loci (continues a loop from above this
# excerpt), then open the genome track(s) in the requested format.
# Chunk is truncated at the end of the excerpt.
lengthCount+=1;
lengthSum+=curLen;
if scanThese[i][GENOMEDATA.STR]!="+" and scanThese[i][GENOMEDATA.STR]!="-":
    numStrandless+=1

avgLength = int(round(lengthSum/lengthCount))
if numStrandless>0 and args.inFile2 is not None:
    raise Exception("Error: loci contain strandless entries, but genome tracks provided for each strand!!");
if numStrandless>0:
    sys.stderr.write("Warning: Strandless loci detected; assuming forward orientation.\n");

padding = int(args.flank);

#read in track file(s)
if args.format=="BIGWIG" or args.format=="BW":
    from bx.bbi.bigwig_file import BigWigFile
    inFile1 = BigWigFile(open(args.inFile))
    if args.inFile2 is not None:
        inFile2 = BigWigFile(open(args.inFile2))
elif args.format=="BIGBED" or args.format=="BB":
    # NOTE(review): BigBedFile is imported from bigwig_file here; bx-python
    # ships it in bx.bbi.bigbed_file — confirm this import actually works.
    from bx.bbi.bigwig_file import BigBedFile
    inFile1 = BigBedFile(open(args.inFile))
    if args.inFile2 is not None:
        inFile2 = BigBedFile(open(args.inFile2))
elif args.format=="WIG" or args.format=="W":
    from bx.arrays.wiggle import WiggleReader
    inFile1 = WiggleReader(open(args.inFile))
    if args.inFile2 is not None:
        inFile2 = WiggleReader(open(args.inFile2))
elif args.format=="BEDGR" or args.format=="BG":
    from bx.arrays.bed import BedReader
    inFile1 = BedReader(open(args.inFile))
def main(args):
    """Open the bigWig named by args.bigWigFile and fetch one signal array.

    NOTE(review): `chrom`, `st` and `end` are not defined in this function —
    unless they exist as module globals this raises NameError; the result of
    get_as_array is also discarded and nothing is returned. Looks unfinished;
    confirm intent before relying on this entry point.
    """
    bw_file = BigWigFile( open(args.bigWigFile) )
    bw_file.get_as_array(chrom, st, end)
class HilbertMatrixBigWig(HilbertMatrix):
    # Need to override build(), but otherwise just like a HilbertMatrix
    def __init__(self, *args, **kwargs):
        """
        Subclass of HilbertMatrix specifically for bigWig format files
        """
        super(HilbertMatrixBigWig, self).__init__(*args, **kwargs)

    def build(self):
        """
        Build the matrix.

        Since bigWig files are essentially pre-summarized, this just extracts
        the chrom/start/stop represented by each cell in the matrix and fills
        it with the value from the bigWig file.
        """
        self.bigwig = BigWigFile(open(self.file))
        # chrom_rc: per-chromosome (row, col) cell coordinates;
        # chrom_bins: number of matrix cells allotted to each chromosome
        chrom_rc, chrom_bins = self.chrom2rc()
        if self.chrom == 'genome':
            chroms = self.chromdict.keys()
        else:
            chroms = [self.chrom]
        for chrom in chroms:
            rc = chrom_rc[chrom]
            nbins = chrom_bins[chrom]
            start, stop = self.chromdict[chrom]
            # one summary bin per matrix cell; mean = sum / covered-base count
            results = self.bigwig.summarize(chrom, start, stop, nbins)
            values = results.sum_data / results.valid_count
            # bins with no coverage yield 0/0 -> NaN; display those as 0
            values[np.isnan(values)] = 0
            self.matrix[rc[:,0], rc[:, 1]] = values
        self._cleanup()

    def chrom2rc(self):
        """
        Return a dictionary of {chrom: (rows, cols)} and {chrom: nbins}
        """
        # precomputed Hilbert-curve cell coordinates for this matrix dimension
        precomputed = np.load(
            os.path.join(
                os.path.dirname(__file__), 'precomputed.npz'))
        rc = precomputed['_%s' % self.matrix_dim]
        d = {}
        bins = {}
        last_stop = 0
        for chrom, startstop in self.chromdict.items():
            start, stop = startstop
            # each chromosome gets a share of cells proportional to its length
            frac = self.chromdict[chrom][1] / float(self.chrom_length)
            nbins = int(frac * (self.matrix_dim * self.matrix_dim))
            d_start = last_stop
            d_stop = d_start + nbins
            d[chrom] = rc[d_start:d_stop, :]
            bins[chrom] = nbins
            last_stop += nbins
        return d, bins