def refine_with_summit(_soft, _mark, _tissue):
    """Re-centre peaks on their bigWig signal summit, then merge and score them.

    Reads the peak BED for (_soft, _mark, _tissue), looks up the signal of
    each peak in the ``rep0`` bigWig of (_mark, _tissue), and writes a 2 kb
    window around the midpoint of the highest-valued interval. The windows
    are then sorted and merged with ``bedtools merge`` and handed to
    ``../get_enrich.sh`` through ``sh``.

    Peaks for which the bigWig returns no data, or no interval with a value
    above zero, are skipped: they have no summit to centre on.
    """
    peak_path = "/Data/adam/dnase/top_bed/{0}.{1}.{2}.bed".format(
        _soft, _mark, _tissue)
    bw_path = "/Data/adam/dnase/bigwig/{0}.{1}.rep0.bw".format(_mark, _tissue)
    enrich_path = "/Data/adam/dnase/enrich_bed/{0}.{1}.{2}.bed".format(
        _soft, _mark, _tissue)

    with open(peak_path) as peak_file:
        peaks = [row.rstrip().split('\t') for row in peak_file]

    # bigWig is a binary format, so open it in binary mode.
    with open(bw_path, 'rb') as bw_handle, open(enrich_path, 'w') as enrich:
        bw = BigWigFile(file=bw_handle)
        for peak in peaks:
            intervals = bw.get(peak[0], int(peak[1]), int(peak[2]))
            if intervals is None:
                # get() can return None (e.g. chromosome absent from the file).
                continue

            # The summit is reset for every peak so that a peak without
            # positive signal can never reuse the previous peak's summit.
            best_value = 0
            summit = None
            for interval in intervals:
                value = float(interval[2])
                if value > best_value:
                    best_value = value
                    summit = interval[:2]
            if summit is None:
                continue

            summit_p = int((float(summit[0]) + float(summit[1])) / 2)
            if summit_p - 1000 > 0:
                enrich.write("{0}\t{1}\t{2}\n".format(
                    peak[0], summit_p - 1000, summit_p + 999))
            else:
                # Window would start before the chromosome: clamp to 1..2000.
                enrich.write("{0}\t{1}\t{2}\n".format(peak[0], 1, 2000))

    sh('sort -k 1,1 -k 2g,2g /Data/adam/dnase/enrich_bed/{0}.{1}.{2}.bed'
       '| bedtools merge -i stdin '
       '>/Data/adam/dnase/enrich_merge_bed/{0}.{1}.{2}.bed'
       .format(_soft, _mark, _tissue))
    sh('bash ../get_enrich.sh '
       '/Data/adam/dnase/enrich_merge_bed/{0}.{1}.{2}.bed {1} {2} {3}'
       .format(_soft, _mark, _tissue, _soft))
def get_phastcons(bedtool, phastcons_location, species=None, index=None):
    """
    Get phastcons scores for intervals in a bed tool.

    bedtool -- either a single interval (an object with ``chrom``, ``start``
               and ``stop`` attributes) or an iterable of such intervals.
    phastcons_location -- path of the bigWig file holding the scores.
    species, index -- not used for the lookup; a message is printed when
                      both are None.

    Returns the mean score (0 when the bigWig has no data for the region)
    for a single interval, or a numpy array with one mean per interval for a
    collection. The return type therefore depends on the input.
    """
    if species is None and index is None:
        print("Error, must select species or index")

    with open(phastcons_location, 'rb') as f:
        bw = BigWigFile(file=f)

        def mean_score(interval):
            # Mean of the last field of every record returned by the bigWig.
            vals = bw.get(interval.chrom, interval.start, interval.stop)
            # get() can return None (e.g. chromosome absent from the file).
            consvals = [] if vals is None else [v[-1] for v in vals]
            if len(consvals) > 0:
                return np.mean(consvals)
            return 0

        # A single interval exposes ``chrom``; a collection does not.
        if hasattr(bedtool, 'chrom'):
            return mean_score(bedtool)

        # Allocate once, before the loop, so every slot keeps its value.
        data = np.zeros(len(bedtool))
        for i, bedline in enumerate(bedtool):
            data[i] = mean_score(bedline)
        return data
def refine_with_summit(_soft, _mark, _tissue, _reps):
    """Re-centre peaks on their signal summit and build a precision table.

    Steps:
      1. For every peak of (_soft, _mark, _tissue) write a 2 kb window around
         the midpoint of the highest-valued bigWig interval.
      2. Sort and merge the windows and number them with awk.
      3. Run ``roc_pr.sh`` and read the table it leaves in ``roc_pr_value``.
      4. Fill each row's missing positive or negative counts from the most
         recent row of the other kind, append the precision, and write the
         result to ``roc_pr_final``.

    _reps fills the third placeholder of the bigWig file name.
    NOTE(review): the original passed only two values for three placeholders
    (always an IndexError) and never used _reps, so _reps is presumed to be
    the missing value -- confirm against the files on disk.
    """
    peak_path = "/Data/adam/dnase/sort_bed/{0}.{1}.{2}.bed".format(
        _soft, _mark, _tissue)
    bw_path = "/Data/adam/dnase/bigwig/{0}.{1}.{2}.bw".format(
        _mark, _tissue, _reps)
    enrich_path = "/Data/adam/dnase/enrich_all_bed/{0}.{1}.{2}.bed".format(
        _soft, _mark, _tissue)

    with open(peak_path) as peak_file:
        peaks = [row.rstrip().split('\t') for row in peak_file]

    with open(bw_path, 'rb') as bw_handle, open(enrich_path, 'w') as enrich:
        bw = BigWigFile(file=bw_handle)
        for peak in peaks:
            intervals = bw.get(peak[0], int(peak[1]), int(peak[2]))
            if intervals is None:
                # get() can return None (e.g. chromosome absent from the file).
                continue
            # Reset per peak so a stale summit is never reused.
            best_value = 0
            summit = None
            for interval in intervals:
                value = float(interval[2])
                if value > best_value:
                    best_value = value
                    summit = interval[:2]
            if summit is None:
                continue
            summit_p = int((float(summit[0]) + float(summit[1])) / 2)
            if summit_p - 1000 > 0:
                enrich.write("{0}\t{1}\t{2}\n".format(
                    peak[0], summit_p - 1000, summit_p + 999))
            else:
                enrich.write("{0}\t{1}\t{2}\n".format(peak[0], 1, 2000))

    awk_args = '{printf "%s\\t%s\\n", $0,NR}'
    sh("sort -k 1,1 -k 2g,2g /Data/adam/dnase/enrich_all_bed/{0}.{1}.{2}.bed"
       "| bedtools merge -i stdin "
       "| awk '{3}'>/Data/adam/dnase/enrich_all_merge_bed/{0}.{1}.{2}.bed"
       .format(_soft, _mark, _tissue, awk_args))
    enhancer_dir = "/Data/adam/dnase/enhancer/tissue_enhancer/"
    sh("bash /Data/adam/dnase/enhancer/roc_pr.sh {0}{1}_enhancer.txt "
       "{0}negative_enhancer.txt {2} {3} {1}"
       .format(enhancer_dir, _tissue, _soft, _mark))

    with open("/Data/adam/dnase/roc_pr_value/{0}.{1}.{2}.bed".format(
            _soft, _mark, _tissue)) as raw_file:
        raw_pr = [row.rstrip().split('\t') for row in raw_file]

    def is_zero(field):
        # Fields are text read from the table, so compare them numerically.
        try:
            return float(field) == 0
        except ValueError:
            return False

    def with_precision(row):
        # precision = column 1 / (column 1 + column 3)
        numerator = float(row[1])
        denominator = numerator + float(row[3])
        # No counts on either side: precision is undefined, record NaN.
        precision = numerator / denominator if denominator else float('nan')
        row.append(str(precision))
        return row

    pr_refine = []
    # Seeds are strings so the rows can be joined with '\t' later.
    temp_positive = ['0', '0']
    temp_negative = ['0', '0']
    for _line in raw_pr:
        if len(_line) >= 5 and is_zero(_line[3]) and is_zero(_line[4]):
            # Columns 3-4 are empty: borrow them from the last such row seen.
            temp_positive = _line[1:3]
            out_line = _line[:]
            out_line[3:5] = temp_negative
            pr_refine.append(with_precision(out_line))
        elif len(_line) >= 5 and is_zero(_line[1]) and is_zero(_line[2]):
            # Columns 1-2 are empty: borrow them from the last such row seen.
            temp_negative = _line[3:5]
            out_line = _line[:]
            out_line[1:3] = temp_positive
            pr_refine.append(with_precision(out_line))
        else:
            print("error in {0}.{1}.{2}, {3}".format(
                _soft, _mark, _tissue, _line))

    with open("/Data/adam/dnase/roc_pr_final/{0}.{1}.{2}.bed".format(
            _soft, _mark, _tissue), 'w') as f:
        for out_line in pr_refine:
            f.write('\t'.join(out_line) + '\n')
def evaluateTC(args):
    """Count the signal in a window centred on a region.

    args -- a single 4-tuple ``(signalFileName, chrom, start, end)``; the
            function takes one argument so that it can be mapped over a list
            of such tuples.

    The window spans ``halfWindow`` positions on each side of the region's
    midpoint (clamped at 0). Returns the summed signal as an int, or 0 when
    the query or the summation fails.
    """
    # Unpack inside the body: tuple parameters are Python-2-only syntax.
    signalFileName, chrom, start, end = args

    mid = (int(start) + int(end)) // 2
    p1 = max(mid - halfWindow, 0)
    p2 = mid + halfWindow

    signalFile = open(signalFileName, "rb")
    try:
        bw = BigWigFile(signalFile)
        try:
            nCount = int(sum(correctBW(bw.get(chrom, p1, p2), p1, p2)))
        except Exception:
            # Best effort by design: a failed query counts as no signal.
            nCount = 0
    finally:
        signalFile.close()
    return nCount
def get_phastcons(bedtool, phastcons_location, species=None, index=None):
    """
    Get phastcons scores for intervals in a bed tool.

    ``bedtool`` is either one interval (it has ``chrom``, ``start`` and
    ``stop`` attributes) or an iterable of intervals. Scores are read from
    the bigWig at ``phastcons_location``. ``species`` and ``index`` do not
    take part in the lookup; a message is printed when both are None.

    Returns a scalar mean for one interval and a numpy array of means for a
    collection; a region without data scores 0.
    """
    if species is None and index is None:
        print("Error, must select species or index")

    def region_mean(bw, region):
        # Average the last field of each record the bigWig returns.
        records = bw.get(region.chrom, region.start, region.stop)
        if records is None:
            # get() can return None (e.g. chromosome absent from the file).
            return 0
        scores = [record[-1] for record in records]
        return np.mean(scores) if len(scores) > 0 else 0

    with open(phastcons_location, 'rb') as handle:
        bw = BigWigFile(file=handle)

        try:
            bedtool.chrom
        except AttributeError:
            # No ``chrom`` attribute: treat the input as a collection.
            pass
        else:
            return region_mean(bw, bedtool)

        # The array is created once; creating it inside the loop would
        # discard every score but the last.
        data = np.zeros(len(bedtool))
        for i, bedline in enumerate(bedtool):
            data[i] = region_mean(bw, bedline)
        return data
def get_GA_from_bw(self, plus, minus, GTF, filterfxn):
    """Load per-strand bigWig signal for selected features into a GenomicArray.

    plus, minus -- paths of the bigWig files for the '+' and '-' strand.
    GTF         -- iterable of features with an ``iv`` interval; it is walked
                   once per strand, so it must be re-iterable.
    filterfxn   -- features for which this returns a value equal to False
                   are skipped.

    Returns a stranded HTSeq.GenomicArray of doubles. For every record
    ``(start, stop, value)`` returned by the bigWig, ``value`` is stored at
    position ``start`` only.
    """
    ##bx-python 'get' method is 0 based, fully closed
    ga = HTSeq.GenomicArray("auto", typecode='d', stranded=True)

    for path, strand in ((plus, '+'), (minus, '-')):
        with open(path, 'rb') as f:
            bw_file = BigWigFile(file=f)
            for GF in GTF:
                # Kept as an equality test: a None result does not skip.
                if filterfxn(GF) == False:
                    continue
                window = GF.iv
                chrom = window.chrom
                vals = bw_file.get(chrom, window.start, window.end)
                if vals is None:
                    # get() can return None (e.g. chromosome not in the file).
                    continue
                for rec_start, _rec_stop, value in vals:
                    ga[HTSeq.GenomicPosition(chrom, rec_start, strand)] = value
    return ga
def main():
    """Append a bigWig score column to a tab-separated file.

    Command-line arguments, in order: input file, output file, location
    file, location key, 1-based chromosome column, 1-based start column.

    Every input line with enough columns is written back with one extra
    column: the score at its start position formatted as ``%1.3f``, or
    ``NA`` when the bigWig has no value there. Shorter lines are copied
    unchanged.
    """
    (input_filename, output_filename, loc_filename, loc_key,
     chrom_col, start_col) = sys.argv[1:]

    # open input, output, and bigwig files
    location_file = LocationFile(loc_filename)
    bigwig_filename = location_file.get_values(loc_key)
    bwfh = open_or_die(bigwig_filename,
                       message='Error opening BigWig file %s' % bigwig_filename)
    bw = BigWigFile(file=bwfh)
    ifh = open_or_die(input_filename,
                      message='Error opening input file %s' % input_filename)
    ofh = open_or_die(output_filename, mode='w',
                      message='Error opening output file %s' % output_filename)

    # make column numbers 0-based
    chrom_col = int(chrom_col) - 1
    start_col = int(start_col) - 1
    min_cols = max(chrom_col, start_col)

    try:
        # add score column to input file
        for line_number, line in enumerate(ifh, 1):
            line = line.rstrip('\r\n')
            elems = line.split('\t')

            if len(elems) <= min_cols:
                ofh.write(line + '\n')
                continue

            chrom = elems[chrom_col].strip()
            # base-0 position in chrom
            start = int(elems[start_col])
            score_list = bw.get(chrom, start, start + 1)
            # get() can return None (e.g. chromosome absent from the file);
            # that is reported as 'NA', like an empty result.
            score_list_len = 0 if score_list is None else len(score_list)

            if score_list_len == 1:
                beg, end, score = score_list[0]
                score_val = '%1.3f' % score
            elif score_list_len == 0:
                score_val = 'NA'
            else:
                # NOTE(review): die() is presumed not to return -- confirm.
                die('%s line %d: chrom=%s, start=%d, score_list_len = %d'
                    % (input_filename, line_number, chrom, start,
                       score_list_len))
            ofh.write('\t'.join([line, score_val]) + '\n')
    finally:
        # Release the handles even when a line cannot be processed.
        bwfh.close()
        ifh.close()
        ofh.close()
def get_phastcons(bedtool, species=None, index=None):
    """
    Get phastcons scores for intervals in a bed tool.

    bedtool -- one interval (has ``chrom``, ``start`` and ``stop``) or an
               iterable of intervals.
    species -- "mm9" or "hg19"; selects a bigWig below ``basedir`` when
               ``index`` is not given.
    index   -- explicit path of the bigWig file; takes precedence.

    Returns the mean score for a single interval, or a numpy array with one
    mean per interval for a collection. Regions without data score 0.
    Raises ValueError when no bigWig can be determined.
    """
    if species is None and index is None:
        print("Error, must select species or index")
    if species is not None and index is None:
        if species == "mm9":
            index = basedir + "/yeolab/Conservation/phastCons/mm9_30way/placental/mm9_phastcons.bw"
        elif species == "hg19":
            index = basedir + "/yeolab/Conservation/phastCons/hg19_46way/placentalMammals/reformat/hg19_phastcons.bw"
    if index is None:
        # Fail with a clear message instead of passing None to open().
        raise ValueError(
            "no phastcons bigWig for species {0!r}; pass index".format(species))

    with open(index, 'rb') as f:
        bw = BigWigFile(file=f)

        def mean_score(interval):
            vals = bw.get(interval.chrom, interval.start, interval.stop)
            # get() can return None (e.g. chromosome absent from the file).
            consvals = [] if vals is None else [v[-1] for v in vals]
            if len(consvals) > 0:
                return np.mean(consvals)
            return 0

        # A single interval exposes ``chrom``; a collection does not.
        if hasattr(bedtool, 'chrom'):
            return mean_score(bedtool)

        # Allocate once, outside the loop, so earlier scores are kept.
        data = np.zeros(len(bedtool))
        for i, bedline in enumerate(bedtool):
            data[i] = mean_score(bedline)
        return data
def main():
    """Add a bigWig score column to a tab-separated input file.

    Expects six command-line arguments: input file, output file, location
    file, location key, chromosome column and start column (both 1-based).

    Lines with enough columns get the score at their start position
    appended (``%1.3f``, or ``NA`` when there is no value); other lines are
    written through unchanged.
    """
    (input_filename, output_filename, loc_filename, loc_key,
     chrom_col, start_col) = sys.argv[1:]

    # open input, output, and bigwig files
    location_file = LocationFile(loc_filename)
    bigwig_filename = location_file.get_values(loc_key)
    bwfh = open_or_die(bigwig_filename,
                       message='Error opening BigWig file %s' % bigwig_filename)
    bw = BigWigFile(file=bwfh)
    ifh = open_or_die(input_filename,
                      message='Error opening input file %s' % input_filename)
    ofh = open_or_die(output_filename, mode='w',
                      message='Error opening output file %s' % output_filename)

    # make column numbers 0-based
    chrom_col = int(chrom_col) - 1
    start_col = int(start_col) - 1
    min_cols = max(chrom_col, start_col)

    try:
        # add score column to input file
        line_number = 0
        for line in ifh:
            line_number += 1
            line = line.rstrip('\r\n')
            elems = line.split('\t')
            if len(elems) > min_cols:
                chrom = elems[chrom_col].strip()
                # base-0 position in chrom
                start = int(elems[start_col])
                score_list = bw.get(chrom, start, start + 1)
                if score_list is None:
                    # get() can return None (e.g. unknown chromosome);
                    # handle it like an empty result.
                    score_list = []
                score_list_len = len(score_list)
                if score_list_len == 1:
                    beg, end, score = score_list[0]
                    score_val = '%1.3f' % score
                elif score_list_len == 0:
                    score_val = 'NA'
                else:
                    # NOTE(review): die() is presumed not to return -- confirm.
                    die('%s line %d: chrom=%s, start=%d, score_list_len = %d'
                        % (input_filename, line_number, chrom, start,
                           score_list_len))
                print('\t'.join([line, score_val]), file=ofh)
            else:
                print(line, file=ofh)
    finally:
        # Close all three handles even if a line fails to parse.
        bwfh.close()
        ifh.close()
        ofh.close()
def getNumberOfFragmentsPerRegionFromBigWig(bw, chromSizes):
    """
    Get the number of all mapped fragments per region in all chromosomes
    from a bigWig. Utilizing bx-python.

    bw         -- path of the bigWig file.
    chromSizes -- iterable of ``(chromosome name, chromosome size)`` pairs.

    The result is the sum of the value field of every record returned for
    each chromosome; chromosomes for which the file returns nothing
    contribute nothing.

    Test dataset with two samples covering 200 bp.
    >>> test = Tester()

    Get number of fragments in sample.
    >>> getNumberOfFragmentsPerRegionFromBigWig(test.bwFile1, [('3R', 200)])
    3.0
    >>> getNumberOfFragmentsPerRegionFromBigWig(test.bwFile2, [('3R', 200)])
    4.0
    """
    mapped = 0
    # The context manager closes the file once all chromosomes are read.
    with open(bw, "rb") as fh:
        bwh = BigWigFile(fh)
        for cname, csize in chromSizes:
            # region = bwh.get(chrom_name, start, end)
            regions = bwh.get(cname, 0, csize)
            if regions is None:
                # get() can return None (e.g. chromosome absent from the file).
                continue
            for region in regions:
                mapped += region[2]
    return mapped
def _get_resized(peakfile, bigwigs, width, outprefix):
    """Write fixed-width windows centred on each peak's signal summit.

    peakfile  -- tab-separated peak file, sorted by rank.
    bigwigs   -- path of the bigWig signal file.
    width     -- half-width of the output window.
    outprefix -- output is written to ``<outprefix>.bed``.

    Each output line holds chromosome, ``summit - width``,
    ``summit + width - 1`` and the 1-based rank of the peak in the checked
    peak list. Peaks without data, or without any interval valued above
    zero, are left out; the ranks of the remaining peaks are unaffected.
    """
    # Input is peak file name which is sort by rank
    with open(peakfile) as peak_handle:
        peaklis = [row.rstrip().split('\t') for row in peak_handle]
    peaklist = _check_file(peaklis)

    with open(bigwigs, 'rb') as bw_handle, \
            open("{0}.bed".format(outprefix), "w") as f:
        _bw = BigWigFile(file=bw_handle)
        for _ids, line in enumerate(peaklist):
            vals = _bw.get(line[0], int(line[1]), int(line[2]))
            if vals is None:
                # get() can return None (e.g. chromosome absent from the file).
                continue

            # Start from "no summit" for every peak so the previous peak's
            # summit can never leak into this one.
            maxs = 0
            summit = None
            for _key in vals:
                if float(_key[2]) > maxs:
                    maxs = float(_key[2])
                    summit = _key[:2]
            if summit is None:
                continue

            summit_p = int((float(summit[0]) + float(summit[1])) / 2)
            f.write("{0}\t{1}\t{2}\t{3}\n".format(
                line[0], str(summit_p - width), str(summit_p + width - 1),
                str(_ids + 1)))
# Fetching bed coordinates bedDict = aux.createBedDictFromSingleFile(bedFileNameList[b], separator="\t") # Iterating on chromosomes for chrName in constants.getChromList(reference=[bedDict]): # Iterating on coordinates for coord in bedDict[chrName]: # Positions mid = (coord[0] + coord[1])/2 p1 = max(mid-int(math.floor(windowSize/2.0)),0) p2 = mid+int(math.ceil(windowSize/2.0)) # Fetching sequence sequence = aux.correctBW(bw.get(chrName,p1,p2),p1,p2) if(invNeg and len(coord) >= 5 and coord[4] == "-"): sequence = sequence[::-1] # Inverting negative strand sequence = [e*normFactor for e in sequence] # Normalizing the values if(useLog): sequence = [math.log(e+1.0,2) for e in sequence] # Log the values # Updating counters counter = 0; currMean = 0.0 for v in sequence: currMean += v sumVec[b][counter] += v sqSumVec[b][counter] += v**2 counter += 1 totVec[b] += 1.0 boxplotVec[b].append(currMean/len(sequence)) # Evaluating mean and std
#############################################################################################
# Builds priorMatrix: one row per entry of mpbsList holding the MPBS score
# (column 4), the mean conservation signal over the site, and the signal of
# the TSS-distance bigWig at the site's midpoint.

# Initializing prior matrix with MPBS scores
priorMatrix = [[float(coord[4])] for coord in mpbsList]

# Creating conservation prior
# 'with' closes the bigWig even if a query fails; bigWig is binary data.
with open(consFileName, "rb") as consFile:
    bw = BigWigFile(consFile)
    for i, coord in enumerate(mpbsList):
        chrName = coord[0]
        pos1 = int(coord[1])
        pos2 = int(coord[2])
        bwQuery = aux.correctBW(bw.get(chrName, pos1, pos2), pos1, pos2)
        priorMatrix[i].append(sum(bwQuery) / float(len(bwQuery)))

# Creating TSS distance prior
with open(tssFileName, "rb") as tssFile:
    bw = BigWigFile(tssFile)
    for i, coord in enumerate(mpbsList):
        chrName = coord[0]
        pos1 = int(coord[1])
        pos2 = int(coord[2])
        # Floor division keeps the midpoint an integer coordinate.
        mid = (pos1 + pos2) // 2
        bwQuery = aux.correctBW(bw.get(chrName, mid, mid + 1), mid, mid + 1)
        priorMatrix[i].append(bwQuery[0])
outputFileName = sys.argv[3]

# Rewrites column 4 of every site in the MPBS file with a score computed
# from the signal inside the site (nc) and in the two flanks of the same
# length (nl, nr): -((nc / nr) + (nc / nl)), rounded to 4 decimals.
# Sites whose left flank would start before position 0 are dropped.

# Opening signal file and iterating on the mpbsfile to update the score.
# The context manager closes all three files, also when a line fails.
with open(signalFileName, "rb") as signalFile, \
        open(mpbsFileName, "r") as mpbsFile, \
        open(outputFileName, "w") as outputFile:
    bw = BigWigFile(signalFile)
    for line in mpbsFile:
        ll = line.strip().split()
        chrName = ll[0]
        p1 = int(ll[1])
        p2 = int(ll[2])
        mLen = p2 - p1
        p1_ext = p1 - mLen
        p2_ext = p2 + mLen
        if p1_ext < 0:
            continue
        signalVec = aux.correctBW(bw.get(chrName, p1_ext, p2_ext), p1_ext, p2_ext)
        # Pseudo-count of 1.0 on every part.
        nl = sum(signalVec[:mLen]) + 1.0
        nc = sum(signalVec[mLen:2 * mLen]) + 1.0
        nr = sum(signalVec[2 * mLen:]) + 1.0
        try:
            ll[4] = str(round(-((nc / nr) + (nc / nl)), 4))
        except Exception:
            # Score could not be computed: use the sentinel value.
            ll[4] = "-999"
        outputFile.write("\t".join(ll) + "\n")
# Storing coordinates into disctionary (already in order of score) coordList = aux.createBedListFromSingleFile(coordFileName) # Iterating on coordDict to crop desired regions for coord in coordList: # Obtaining real coordenates (rc) if(isBed): mPoint = (coord[1] + coord[2]) / 2 else: mPoint = coord[1] + coord[9] rc = [mPoint - windowLen , mPoint + windowLen] # Obtaining bw object wigFile = open(glob.glob(wigLocation+coord[0]+"_*.bw")[0],"r") bw = BigWigFile(wigFile) # Fetching and writing sequence bwQuery = bw.get(coord[0],rc[0],rc[1]) if(bwQuery == None or bwQuery == []): continue outputFile.write("fixedStep chrom="+coord[0]+" start="+str(rc[0]+1)+" step=1\n") for (c1,c2,value) in bwQuery: outputFile.write(str(round(value,6))+"\n") # Closing wig file wigFile.close() # End for chrName in chrNameList # Termination outputFile.close()
# Evaluating overlap for chrom in constants.getChromList(reference=[mpbsDict]): c = 0 for interval in mpbsDict[chrom]: didBreak = False while (c < len(tfbsDict[chrom])): check = aux.checkTuplesOverlap(interval, tfbsDict[chrom][c]) if (check == 0): interval[2] = mpbsName + ":Y" interval += [interval[0], interval[1], "0,150,0"] didBreak = True break elif (check == -1): treatSignalQ = bwTreat.get( chrom, max(interval[0] + (peakExt / 2) - (negExt / 2), 0), interval[1] - (peakExt / 2) + (negExt / 2)) controlSignalQ = bwControl.get( chrom, max(interval[0] + (peakExt / 2) - (negExt / 2), 0), interval[1] - (peakExt / 2) + (negExt / 2)) treatSum = sum([e[2] for e in treatSignalQ]) controlSum = sum([e[2] for e in controlSignalQ]) if (treatSum > controlSum): interval[2] = mpbsName + ":." interval += [interval[0], interval[1], "0,0,0"] else: interval[2] = mpbsName + ":N" interval += [interval[0], interval[1], "150,0,0"] didBreak = True break c += 1
class BigWig(object):
    """Thin wrapper around bx-python's BigWigFile that also knows chrom sizes.

    On construction the file header and the chromosome B+ tree are parsed
    directly, filling ``sizes`` (chromosome name -> size) and a number of
    header attributes; queries are delegated to ``BigWigFile``.
    """

    def __init__(self, filename):
        self.filename = filename
        self.determine_sizes()
        # This handle must stay open for as long as queries are made.
        self.bwf = BigWigFile(open(filename, "rb"))

    def determine_sizes(self):
        """Parse the header and chromosome tree of ``self.filename``.

        Sets ``sizes`` and the header attributes. Raises IOError when the
        file does not start with the bigWig magic number and ValueError when
        the chromosome tree magic number is wrong.
        """
        self.sizes = {}
        with open(self.filename, "rb") as fh:
            # read magic number to guess endianness
            # (bytes literals: the file is read in binary mode)
            magic = fh.read(4)
            if magic == b'&\xfc\x8f\x88':
                endianness = '<'
            elif magic == b'\x88\x8f\xfc&':
                endianness = '>'
            else:
                raise IOError("The file is not in bigwig format")

            # read the header
            info = struct.unpack(endianness + 'HHQQQHHQQIQ', fh.read(60))
            (self.version,
             self.zoom_levels,
             self.chromosome_tree_offset,
             self.full_data_offset,
             self.full_index_offset,
             self.field_count,
             self.defined_field_count,
             self.auto_SQL_offset,
             self.total_summary_offset,
             self.uncompress_buf_size) = info[:10]

            # go to the data
            fh.seek(self.chromosome_tree_offset)

            # read magic again
            magic = fh.read(4)
            if magic == b'\x91\x8c\xcax':
                endianness = '<'
            elif magic == b'x\xca\x8c\x91':
                endianness = '>'
            else:
                raise ValueError("Wrong magic for this bigwig data file")

            info2 = struct.unpack(endianness + 'IIIQQ', fh.read(28))
            self.block_size = info2[0]
            self.key_size = info2[1]
            self.val_size = info2[2]
            self.item_count = info2[3]

            self._read_chrom_tree(fh, endianness)

    def _read_chrom_tree(self, fh, endianness):
        """Walk the chromosome B+ tree starting at the current file position.

        ``is_leaf`` and ``count`` describe the root node. Leaf items hold
        (name, chromosome id, chromosome size); items of non-leaf nodes hold
        (name, 8-byte offset of the child node), which is followed so that
        files whose tree has more than one level are read completely.
        """
        leaf_format = endianness + str(self.key_size) + 'sII'
        inner_format = endianness + str(self.key_size) + 'sQ'
        item_size = self.key_size + 2 * 4  # same size for both item kinds

        pending = [fh.tell()]
        at_root = True
        while pending:
            fh.seek(pending.pop())
            is_leaf, _reserved, count = struct.unpack(
                endianness + 'BBH', fh.read(4))
            if at_root:
                self.is_leaf = is_leaf
                self.count = count
                at_root = False
            for _ in range(count):
                item = fh.read(item_size)
                if is_leaf:
                    key, chrom_id, chrom_size = struct.unpack(leaf_format, item)
                    # Names are NUL-padded to key_size bytes.
                    key = key.replace(b'\x00', b'')
                    if not isinstance(key, str):
                        # Python 3: turn the bytes name into text.
                        key = key.decode('ascii')
                    self.sizes[key] = chrom_size
                else:
                    _key, child_offset = struct.unpack(inner_format, item)
                    pending.append(child_offset)

    def get_as_array(self, chrom, start, end):
        """Delegate to ``BigWigFile.get_as_array``."""
        return self.bwf.get_as_array(chrom, start, end)

    def get(self, chrom, start, end):
        """Delegate to ``BigWigFile.get``."""
        return self.bwf.get(chrom, start, end)

    def query(self, chrom, start, end, number):
        """Delegate to ``BigWigFile.query``."""
        return self.bwf.query(chrom, start, end, number)
outputFile = open(outputFileName,"w") counter = 1 for line in bedFile: # Positions ll = line.strip().split("\t") mid = (int(ll[1]) + int(ll[2]))/2 if(windowSize == 0): p1 = int(ll[1]) p2 = int(ll[2]) else: p1 = max(0,mid-int(math.floor(windowSize/2.0))) p2 = mid+int(math.ceil(windowSize/2.0)) # Fetching sequence sequence1 = aux.correctBW(bw1.get(ll[0],p1,p2),p1,p2) sequence2 = aux.correctBW(bw2.get(ll[0],p1,p2),p1,p2) # Normalize sequence1 = [e*normFactor1 for e in sequence1] sequence2 = [e*normFactor2 for e in sequence2] # Log if(useLog): sequence1 = [math.log(e+1.0,2) for e in sequence1] sequence2 = [math.log(e+1.0,2) for e in sequence2] # Calculate fold change foldSequence = [sequence2[i]-sequence1[i] for i in range(0,len(sequence1))] v = np.array(foldSequence).mean()
newCoord = [r,min(r+coordLen,spCoord[1])] randDict[chrName].append(newCoord) else: randDict = spacingDict # Iterating on bed files boxplotVec = [] for bedDict in [evDict,randDict]: # Iterating on chromosomes boxplotVec.append([]) for chrName in constants.getChromList(reference=[bedDict]): # Iterating on coordinates for coord in bedDict[chrName]: sequence1 = aux.correctBW(bw1.get(chrName,coord[0],coord[1]),coord[0],coord[1]) sequence2 = aux.correctBW(bw2.get(chrName,coord[0],coord[1]),coord[0],coord[1]) sequence1 = [e*normFactor1 for e in sequence1] # Normalization 1 sequence2 = [e*normFactor2 for e in sequence2] # Normalization 2 if(useLog): sequence1 = [math.log(e+1.0,2) for e in sequence1] # Log 1 sequence2 = [math.log(e+1.0,2) for e in sequence2] # Log 2 finalSeq = [sequence2[i]-sequence1[i] for i in range(0,len(sequence1))] boxplotVec[-1].append(np.array(finalSeq).mean()) # Closing wig file wigFile1.close() wigFile2.close() ############################################################################################################### ### WRITING RESULTS
newCoord = [r, min(r + coordLen, spCoord[1])] randDict[chrName].append(newCoord) else: randDict = spacingDict # Iterating on bed files boxplotVec = [] for bedDict in [evDict, randDict]: # Iterating on chromosomes boxplotVec.append([]) for chrName in constants.getChromList(reference=[bedDict]): # Iterating on coordinates for coord in bedDict[chrName]: sequence = aux.correctBW(bw.get(chrName, coord[0], coord[1]), coord[0], coord[1]) sequence = [e * normFactor for e in sequence] # Normalization if (useLog): sequence = [math.log(e + 1.0, 2) for e in sequence] # Log boxplotVec[-1].append(np.array(sequence).mean()) # Closing wig file wigFile.close() ############################################################################################################### ### WRITING RESULTS ############################################################################################################### # Creating output file name outputFileName = outputLocation + bedLabel + "_" + wigLabel
import numpy as np

# Script: plot the bigWig signal around a list of genes as a matrix.
# Arguments: 1) bigWig file, 2) an integer distance (read but not used here).
fl = sys.argv[1]
dist = int(sys.argv[2])
from bx.bbi.bigwig_file import BigWigFile

genes = read.dat("/home/ssaberi/resources/list.genes.txt", '\t')
table = read.dat("/projects/epigenomics/MarcoJuliaPon/peaks.txt", '\t')
mygenes = read.dat("/projects/epigenomics/MarcoJuliaPon/mygenes.txt", '\t')

# For every gene of interest keep column 1 of the first row of ``genes``
# whose column 0 contains it.
ens = []
for i in mygenes:
    for gn in genes:
        if i in gn[0]:
            ens.append(gn[1])
            break

genespos = read.read_gene_pos(
    '/home/ssaberi/resources/hg19v69_genes.TSS_2000.pc.A03480.H3K27me3.GE02.coverage'
)
genesbed = bedtools.makebed_genpos(ens, genespos, 100000)

# Collect the bigWig records of every region; the file is closed afterwards.
mat = []
with open(fl, 'rb') as f:
    bw = BigWigFile(file=f)
    for bed_i in genesbed:
        vals = bw.get(bed_i[0], bed_i[1], bed_i[2])
        mat.append(np.array(vals))
mat = np.array(mat)

plt.matshow(mat, aspect='auto', cmap='YlOrBr')
# Base name of the input without directory and extension. The previous
# slicing kept the trailing '.' ("name..pdf") and raised ValueError for a
# path without '/'.
fl = fl.rsplit('/', 1)[-1].rsplit('.', 1)[0]
# NOTE(review): if plt is matplotlib.pyplot this should probably be
# plt.savefig -- confirm what plt is bound to.
plt.save(fl + ".pdf")
# Iterating on coordinate file coordFile = open(coordFileName, "r") outputFile = open(outputFileName, "w") counter = 1 for line in coordFile: # Initialization ll = line.strip().split("\t") chrName = ll[0] p1 = int(ll[1]) p2 = int(ll[2]) # Create Neph signal input nephSignalFile = open(nephSignalFileName, "w") nephSignalFile.write("\n".join( [str(e) for e in aux.correctBW(bw.get(chrName, p1, p2), p1, p2)])) nephSignalFile.close() # Apply neph nephCommand = "detect-cache " nephCommand += "--flankmin 3 --flankmax 10 --centermin 6 --centermax 40 --maxthold 10 " nephCommand += nephSignalFileName + " > " + nephResultFileName os.system(nephCommand) os.system("rm " + nephSignalFileName) # Read/Write neph results nephResultFile = open(nephResultFileName, "r") for line in nephResultFile: ll = line.strip().split("\t") if (float(ll[4]) > fosThresh): continue outputFile.write("\t".join([
# Writes, for consecutive windows of windowLen along every chromosome, the
# mean of the positive signal and the mean of the negated negative signal.

# Chrom sizes dictionary
chromSizesFileName = constants.getChromSizesLocation()
chromSizesDict = dict()
# The sizes file is closed as soon as it has been read.
with open(chromSizesFileName, "r") as chromSizesFile:
    for line in chromSizesFile:
        ll = line.strip().split("\t")
        chromSizesDict[ll[0]] = int(ll[1])
chrList = constants.getChromList(x=False, y=False)

outputFileName = outputLocation + posSigName + "_" + negSigName + "_" + str(windowLen) + ".bed"

# Signal files (binary) and output; all closed on exit, also after an error.
with open(posSignalFileName, "rb") as posSigFile, \
        open(negSignalFileName, "rb") as negSigFile, \
        open(outputFileName, "w") as outputFile:
    posBw = BigWigFile(posSigFile)
    negBw = BigWigFile(negSigFile)

    # Summarizing
    for chrName in chrList:
        chromSize = chromSizesDict[chrName]
        # Output names are "hs" plus the chromosome name minus its first
        # three characters.
        outName = "hs" + chrName[3:]
        for k in range(0, chromSize, windowLen):
            p1 = k
            p2 = min(k + windowLen, chromSize)
            posMean = np.array(aux.correctBW(posBw.get(chrName, p1, p2), p1, p2)).mean()
            negMean = np.array([-e for e in aux.correctBW(negBw.get(chrName, p1, p2), p1, p2)]).mean()
            outputFile.write("\t".join([outName, str(p1), str(p2), str(posMean)]) + "\n"
                             + "\t".join([outName, str(p1), str(p2), str(negMean)]) + "\n")
# Replaces column 4 of every site in the MPBS file with the summed signal
# in a window of halfWindow positions on each side of the site's midpoint.

# Reading input
mpbsFileName = sys.argv[1]
signalFileName = sys.argv[2]
outputFileName = sys.argv[3]

# Parameters
halfWindow = 100

# Opening signal file (binary) and iterating on the mpbsfile to update the
# score. The context manager closes all files, also when a line fails.
with open(signalFileName, "rb") as signalFile, \
        open(mpbsFileName, "r") as mpbsFile, \
        open(outputFileName, "w") as outputFile:
    bw = BigWigFile(signalFile)
    for line in mpbsFile:
        ll = line.strip().split()
        # Floor division keeps the midpoint an integer coordinate.
        mid = (int(ll[1]) + int(ll[2])) // 2
        p1 = max(mid - halfWindow, 0)
        p2 = mid + halfWindow
        try:
            nCount = int(sum(aux.correctBW(bw.get(ll[0], p1, p2), p1, p2)))
        except Exception:
            # Best effort by design: a failed query counts as zero.
            nCount = 0
        ll[4] = str(nCount)
        outputFile.write("\t".join(ll) + "\n")
if (re.match('^-BigWigFile', argument)): bw_file = argv.pop(0) elif (re.match('^-enhancers', argument)): enhancer_list = argv.pop(0) # Load BigWig file and peak coordinates. f = open(bw_file) bw = BigWigFile(file=f) enhancer_f = open(enhancer_list, 'rb') data = csv.reader(enhancer_f, delimiter='\t') enhancer_tab = [row for row in data] # Create matrix to store summit positions. summits = [[0 for x in xrange(3)] for x in xrange(len(enhancer_tab))] # Get the summit for each peak and save the coordinates. for i in range(0, len(enhancer_tab)): vals = bw.get(enhancer_tab[i][0], int(enhancer_tab[i][1]), int(enhancer_tab[i][2])) max_start, max_end, max_val = max(vals, key=lambda x: x[-1]) summits[i][0] = enhancer_tab[i][0] summits[i][1] = max_start summits[i][2] = max_end # Write bed file with summit coordinates. print "\n".join("\t".join(str(col) for col in row) for row in summits) enhancer_f.close() f.close()
import numpy as np

# Script: draw the bigWig signal around selected genes as a matrix plot.
# argv[1] is the bigWig file; argv[2] is an integer that is parsed but not
# used in this part of the script.
fl = sys.argv[1]
dist = int(sys.argv[2])
from bx.bbi.bigwig_file import BigWigFile

genes = read.dat("/home/ssaberi/resources/list.genes.txt", '\t')
table = read.dat("/projects/epigenomics/MarcoJuliaPon/peaks.txt", '\t')
mygenes = read.dat("/projects/epigenomics/MarcoJuliaPon/mygenes.txt", '\t')

# Take column 1 of the first ``genes`` row whose column 0 contains the gene.
ens = []
for i in mygenes:
    for gn in genes:
        if i in gn[0]:
            ens.append(gn[1])
            break

genespos = read.read_gene_pos('/home/ssaberi/resources/hg19v69_genes.TSS_2000.pc.A03480.H3K27me3.GE02.coverage')
genesbed = bedtools.makebed_genpos(ens, genespos, 100000)

# Read the records of every region, then let 'with' close the bigWig.
with open(fl, 'rb') as f:
    bw = BigWigFile(file=f)
    mat = [np.array(bw.get(bed_i[0], bed_i[1], bed_i[2])) for bed_i in genesbed]
mat = np.array(mat)

plt.matshow(mat, aspect='auto', cmap='YlOrBr')
# Strip directory and extension from the input name. Slicing on reversed
# indices left the '.' in place (giving "name..pdf") and failed with
# ValueError when the path had no '/'.
fl = fl.rsplit('/', 1)[-1].rsplit('.', 1)[0]
# NOTE(review): pyplot has savefig rather than save -- confirm what plt is.
plt.save(fl + ".pdf")
bedFile = aux.createBedDictFromSingleFile(outputLocation+"_".join(bedLabelList)+"temp.bed", separator="\t") os.system("rm "+outputLocation+"_".join(bedLabelList)+"temp.bed") # Fetching signal nbSig = len(wigFileNameList) sumVec = [[0.0 for e in range(0,windowSize)] for k in range(0,nbSig)] sqSumVec = [[0.0 for e in range(0,windowSize)] for k in range(0,nbSig)] totVec = [0.0 for k in range(0,nbSig)] boxplotVec = [[] for k in range(0,nbSig)] for s in range(0,nbSig): bwFile = open(wigFileNameList[s],"r") bw = BigWigFile(bwFile) for chrName in constants.getChromList(reference=[bedFile]): for coord in bedFile[chrName]: mid = (coord[0] + coord[1])/2 bwQuery = aux.correctBW(bw.get(chrName,mid-int(math.floor(windowSize/2.0)),mid+int(math.ceil(windowSize/2.0))),mid-int(math.floor(windowSize/2.0)),mid+int(math.ceil(windowSize/2.0))) if(len(bwQuery) < windowSize): continue counter = 0; currMean = 0.0 for value in bwQuery: currMean += (value*normFactorList[s]) sumVec[s][counter] += (value*normFactorList[s]) sqSumVec[s][counter] += ((value*normFactorList[s])**2) counter += 1 totVec[s] += 1.0 boxplotVec[s].append(currMean/len(bwQuery)) bwFile.close() # Evaluating mean and std meanVec = [[0.0 for e in range(0,windowSize)] for k in range(0,nbSig)] stdVec = [[0.0 for e in range(0,windowSize)] for k in range(0,nbSig)] for s in range(0,nbSig):