def changevalue(ifile, ref, ofile, gfile, step=10, suppress=False, buffer=None):
    '''
    Description:
        Build a Wig whose values are taken, line by line, from a reference
        file, and save it.
        For the i-th line of ifile, the third column is used as the chromosome
        name and the fourth column as the position; the first column of the
        i-th line of ref is the value stored at that position. Lines of ifile
        that have no counterpart in ref receive 0.0.
        NOTE(review): presumably both files are sorted so that equal line
        numbers mean equal rank -- confirm against the caller.

    Parameter:
        ifile: path to the input file that supplies chromosome and position.
        ref: path to the reference file that supplies the values.
        ofile: path handed to Wig.save().
        gfile: handed to the Wig constructor.
        step: wiggle step; handed to the Wig constructor and used to turn a
            position into an array index.
        suppress: handed to Wig.save().
        buffer: not used by this function.

    Value:
        None
    '''
    if ifile != ref:
        print('\nnormalizing', ifile, '...')
    else:
        print('\nsaving reference ...')
    tm = time()
    # Context manager so both handles are closed even if a line fails to parse.
    with open(ifile) as fi, open(ref) as fr:
        wg = Wig(step=step, gfile=gfile)
        for line in fi:
            col = line.split()
            rcol = fr.readline().split()
            if not rcol:
                # Reference exhausted (or blank line): fall back to zero.
                rcol = [0.0]
            # Floor division: the result is used as an array index, and "/"
            # would produce a float on Python 3.
            cr, pos, vl = col[2], int(col[3]) // step, float(rcol[0])
            wg.data[cr][pos] = vl
        # Whatever is left in the reference has no counterpart in the input.
        n = sum(1 for _ in fr)
    if n > 0:
        print(
            'Warning: the input genome size is smaller than the reference genome size by',
            n, 'wiggle steps!')
    wg.save(ofile, suppress=suppress)
    print('time cost:', time() - tm)
def load(self, path, suppress=False):
    '''
    Description:
        Load Wig class instances from wiggle format files and register each
        one through self.set() under its file name (a trailing '.wig' is
        stripped from the name).

    Parameter:
        path: a comma-separated list of paths. An entry that is a directory
            contributes every '*.wig' file directly inside it; an entry that
            is a regular file is loaded itself; any other entry is skipped.
        suppress: handed to the Wig constructor.

    Value:
        None
    '''
    for entry in path.split(','):
        if os.path.isdir(entry):
            wig_files = glob.glob(os.path.join(entry, '*.wig'))
        elif os.path.isfile(entry):
            wig_files = [entry]
        else:
            # Neither a directory nor a file: nothing to load for this entry.
            continue
        for wig_file in wig_files:
            label = os.path.basename(wig_file)
            if label.endswith('.wig'):
                label = label[:-4]
            self.set(label, Wig(wig_file, step=self.step, suppress=suppress))
def rawsort(ifile, sort_ofile, gfile, format, step=10, suppress=False, buffer=None):
    '''
    Description:
        Sort a file with the external ``sort`` program
        (``sort -r -n -s -k1 -k2``). When format is 'wig' the input is first
        loaded as a Wig and saved in "wiq" format, and that intermediate file
        is what gets sorted and is removed afterwards.

    Parameter:
        ifile: path to the input file.
        sort_ofile: path the sorted output is written to. For format 'wig'
            the intermediate file name is derived by replacing the last three
            characters of this path with 'raw.wiq'.
        gfile: handed to the Wig constructor and to Wig.ajust_size().
        format: 'wig' to convert ifile first; with any other value ifile is
            sorted as it is.
        step: handed to the Wig constructor and to Wig.save().
        suppress: handed to the Wig constructor and to Wig.save().
        buffer: when not None, handed to sort as --buffer-size.

    Value:
        None
    '''
    # Local imports: these modules were not needed by the shell-string version.
    import shutil
    import subprocess

    # Messages are pre-formatted into one string so the output is the same
    # under the Python 2 print statement and the Python 3 print function.
    tm = time()
    if format == 'wig':
        print('\nconverting %s ...' % (ifile,))
        raw_ofile = sort_ofile[:-3] + 'raw.wiq'
        wg = Wig(file=ifile, gfile=gfile, step=step, suppress=suppress)
        wg.ajust_size(gfile=gfile)
        wg.save(file=raw_ofile, format="wiq", step=step, suppress=suppress)
        print('time cost: %s' % (time() - tm,))
        tm = time()
    else:
        raw_ofile = ifile
    print('\nsorting %s ...' % (raw_ofile,))

    # Scratch directory for sort; pick a name that is not taken yet.
    temp = ifile[:-3] + 'temp'
    while os.path.isdir(temp):
        temp = temp + '.temp'
    os.mkdir(temp)

    # Argument list instead of a shell string, so paths containing spaces or
    # shell metacharacters reach sort unchanged.
    cmd = ['sort', '-r', '-n', '-s', '-k1', '-k2', '-o', sort_ofile]
    if buffer is not None:
        cmd += ['--buffer-size', str(buffer)]
    cmd += ['--temporary-directory', temp, raw_ofile]
    try:
        status = subprocess.call(cmd)
        if status != 0:
            print('Warning: sort exited with status %s' % (status,))
    finally:
        # Clean up even when sort could not be started.
        if format == 'wig':
            print('Removing  %s ...' % (raw_ofile,))
            if os.path.isfile(raw_ofile):
                os.remove(raw_ofile)
        print('removing %s' % (temp,))
        shutil.rmtree(temp, ignore_errors=True)
    print('time cost: %s' % (time() - tm,))
def toWig(self, fs=None, extend=0, mifrsz=10, mafrsz=300):
    '''
    Description:
        Calculate nucleosome occupancy from the reads data.
        NOTE: the per-chromosome '+' and '-' arrays held in self.data are
        resized and overwritten in place while the Wig is built.

    Parameter:
        fs: average size of fragments that are subject to sequencing and generate the reads,
            only for single-end reads. When this value is not given, a fs value will be
            inferred by the program (via self.fragSizeDis). For paired-end reads loaded by
            the function loadBedPaired(), set fs to 0.
        extend: an integer value, each read will be extended to this length. A value <= 0
            means "use fs".
        mifrsz: the minimal estimated average fragment size, only for single-end reads
        mafrsz: the maximal estimated average fragment size, only for single-end reads

    Value:
        a Wig class instance
    '''
    step = self.step
    if fs is None:
        fs = self.fragSizeDis(minsize=mifrsz, maxsize=mafrsz)
    if extend <= 0:
        extend = fs
    # Single pre-formatted string: same output under Python 2 and Python 3.
    print('extend to %s' % (extend,))
    old_extend = extend
    # Floor division: both values are used as array offsets (in wiggle steps).
    fragsize, extend = fs // (2 * step), extend // (2 * step)
    wg = Wig(step=step)
    print('generating wig ...')
    for chrom in self.data:
        strands = self.data[chrom]
        # Make sure both strand arrays are long enough for the shifts below.
        tmax = max(1000, fragsize * 4, extend * 4)
        if strands['+'].size < tmax:
            strands['+'].resize(tmax, refcheck=0)
        if strands['-'].size < tmax:
            strands['-'].resize(tmax, refcheck=0)
        wg.addChr(chrom)
        lth = strands['+'].size
        wg.resizeChr(chrom, lth * step)
        # Shift the '+' strand right by fragsize and zero the vacated head ...
        strands['+'][fragsize:lth] = strands['+'][0:(lth - fragsize)]
        strands['+'][0:fragsize] = 0
        # ... then add the '-' strand shifted left by fragsize.
        strands['+'][0:(lth - fragsize)] += strands['-'][fragsize:lth]
        # Sum the combined signal over a window of +/- extend steps.
        for p in range(-extend, extend + 1):
            wg.data[chrom][extend:(lth - extend)] += strands['+'][(extend + p):(lth - extend + p)]
    # Applied once to the whole Wig, after every chromosome has been filled.
    wg.foldChange(old_extend * 1.0 / wg.step)  ##### added by Kaifu on May29, 2014
    return wg
def changevalue(ifile, ref, ofile, gfile, step=10, suppress=False, buffer=None):
    '''
    Description:
        Build a Wig whose values are taken, line by line, from a reference
        file, and save it.
        For the i-th line of ifile, the third column is used as the chromosome
        name and the fourth column as the position; the first column of the
        i-th line of ref is the value stored at that position. Lines of ifile
        that have no counterpart in ref receive 0.0.
        NOTE(review): presumably both files are sorted so that equal line
        numbers mean equal rank -- confirm against the caller.

    Parameter:
        ifile: path to the input file that supplies chromosome and position.
        ref: path to the reference file that supplies the values.
        ofile: path handed to Wig.save().
        gfile: handed to the Wig constructor.
        step: wiggle step; handed to the Wig constructor and used to turn a
            position into an array index.
        suppress: handed to Wig.save().
        buffer: not used by this function.

    Value:
        None
    '''
    # Messages are pre-formatted into one string so the output is the same
    # under the Python 2 print statement and the Python 3 print function.
    if ifile != ref:
        print('\nnormalizing %s ...' % (ifile,))
    else:
        print('\nsaving reference ...')
    tm = time()
    # Context manager so both handles are closed even if a line fails to parse.
    with open(ifile) as fi, open(ref) as fr:
        wg = Wig(step=step, gfile=gfile)
        for line in fi:
            col = line.split()
            rcol = fr.readline().split()
            if not rcol:
                # Reference exhausted (or blank line): fall back to zero.
                rcol = [0.0]
            # Explicit floor division: the result is used as an array index.
            cr, pos, vl = col[2], int(col[3]) // step, float(rcol[0])
            wg.data[cr][pos] = vl
        # Whatever is left in the reference has no counterpart in the input.
        n = 0
        for _ in fr:
            n += 1
    if n > 0:
        print('Warning: the input genome size is smaller than the reference genome size by %s wiggle steps!' % (n,))
    wg.save(ofile, suppress=suppress)
    print('time cost: %s' % (time() - tm,))
def samplingTotal(self, region_file=None, region_out_file=None, exclude_low_percent=1,
                  exclude_high_percent=1, bnum=100000, nonzero=False):
    '''
    Description:
        Calculate, for every wig in self.data, the sum of its values after
        multiplying it with a region wig.
        Without region_file the region wig is derived here: all samples are
        scaled to the same total, averaged into one reference wig, and the
        regions whose reference value lies between the low and high percentile
        cut-offs are kept. With region_file the region wig is loaded from that
        file instead.
        NOTE: while the reference is built, every sample but the first is
        scaled and then scaled back in place.

    Parameter:
        region_file: path of a wig file defining the regions to use; None to
            derive the regions from the data.
        region_out_file: when not None, the derived region wig is saved to
            this path (only used when region_file is None).
        exclude_low_percent: lower percentile handed to percentile().
        exclude_high_percent: 100 minus this value is the upper percentile
            handed to percentile().
        bnum: handed to percentile().
        nonzero: handed to percentile() as nonzero_end.

    Value:
        None when both percentages are 0 and no region_file is given,
        otherwise a dictionary mapping each sample name to its total.
    '''
    # A real list: names is indexed and sliced below, which a Python 3
    # dictionary view does not support.
    names = list(self.data.keys())
    if exclude_low_percent == 0 and exclude_high_percent == 0 and region_file is None:
        return None
    sampling_total = {}
    # Messages are pre-formatted into one string so the output is the same
    # under the Python 2 print statement and the Python 3 print function.
    if region_file is None:
        print('calculate total signal in each sample after excluding the top %s and bottom %s '
              'percents of genomic regions with extremely high and low signal values'
              % (exclude_high_percent, exclude_low_percent))
        wsums = {}
        for name in names:
            wsums[name] = self.data[name].sum()
        wavg = sum(wsums.values()) / len(wsums)
        # Reference wig: the average of all samples after scaling each one to
        # the common total wavg.
        rfwig = deepcopy(self.data[names[0]])
        rfwig.foldChange(wavg * 1.0 / wsums[names[0]])
        for name in names[1:]:
            self.data[name].foldChange(wavg * 1.0 / wsums[name])
            rfwig.add(self.data[name])
            # Undo the temporary scaling of the sample.
            self.data[name].foldChange(wsums[name] * 1.0 / wavg)
        rfwig.foldChange(1.0 / len(names))
        lowcut, highcut = rfwig.percentile(
            p=[exclude_low_percent, 100 - exclude_high_percent],
            bnum=bnum, nonzero_end=nonzero)
        rg = rfwig.regionWithinValueRange(lowcut, highcut)
        if region_out_file is not None:
            rg.save(region_out_file)
    else:
        print('calculate total signal in each sample in genomic regions defined by %s'
              % (region_file,))
        rg = Wig(region_file)
    for name in names:
        sampling_total[name] = self.data[name].multiply(rg).sum()
    print('%s (%s%%) of %s base pairs calculated:'
          % (rg.sum(), rg.sum() * 100.0 / rg.gsize(), rg.gsize()))
    for name in names:
        print('%s %s (%s%% of total)'
              % (name, sampling_total[name],
                 sampling_total[name] * 100.0 / self.data[name].sum()))
    return sampling_total
def rawsort(ifile, sort_ofile, gfile, format, step=10, suppress=False, buffer=None):
    '''
    Description:
        Sort a file with the external ``sort`` program
        (``sort -r -n -s -k1 -k2``). When format is 'wig' the input is first
        loaded as a Wig and saved in "wiq" format, and that intermediate file
        is what gets sorted and is removed afterwards.

    Parameter:
        ifile: path to the input file.
        sort_ofile: path the sorted output is written to. For format 'wig'
            the intermediate file name is derived by replacing the last three
            characters of this path with 'raw.wiq'.
        gfile: handed to the Wig constructor and to Wig.ajust_size().
        format: 'wig' to convert ifile first; with any other value ifile is
            sorted as it is.
        step: handed to the Wig constructor and to Wig.save().
        suppress: handed to the Wig constructor and to Wig.save().
        buffer: when not None, handed to sort as --buffer-size.

    Value:
        None
    '''
    # Local imports: these modules were not needed by the shell-string version.
    import shutil
    import subprocess

    tm = time()
    if format == 'wig':
        print('\nconverting', ifile, '...')
        raw_ofile = sort_ofile[:-3] + 'raw.wiq'
        wg = Wig(file=ifile, gfile=gfile, step=step, suppress=suppress)
        wg.ajust_size(gfile=gfile)
        wg.save(file=raw_ofile, format="wiq", step=step, suppress=suppress)
        print('time cost:', time() - tm)
        tm = time()
    else:
        raw_ofile = ifile
    print('\nsorting', raw_ofile, '...')

    # Scratch directory for sort; pick a name that is not taken yet.
    temp = ifile[:-3] + 'temp'
    while os.path.isdir(temp):
        temp = temp + '.temp'
    os.mkdir(temp)

    # Argument list instead of a shell string, so paths containing spaces or
    # shell metacharacters reach sort unchanged.
    cmd = ['sort', '-r', '-n', '-s', '-k1', '-k2', '-o', sort_ofile]
    if buffer is not None:
        cmd += ['--buffer-size', str(buffer)]
    cmd += ['--temporary-directory', temp, raw_ofile]
    try:
        status = subprocess.call(cmd)
        if status != 0:
            print('Warning: sort exited with status', status)
    finally:
        # Clean up even when sort could not be started.
        if format == 'wig':
            print('Removing ', raw_ofile, '...')
            if os.path.isfile(raw_ofile):
                os.remove(raw_ofile)
        print('removing', temp)
        shutil.rmtree(temp, ignore_errors=True)
    print('time cost:', time() - tm)
def samplingTotal(self, region_file=None, region_out_file=None, exclude_low_percent=1,
                  exclude_high_percent=1, bnum=100000, nonzero=False):
    '''
    Description:
        Calculate, for every wig in self.data, the sum of its values after
        multiplying it with a region wig.
        Without region_file the region wig is derived here: all samples are
        scaled to the same total, averaged into one reference wig, and the
        regions whose reference value lies between the low and high percentile
        cut-offs are kept. With region_file the region wig is loaded from that
        file instead.
        NOTE: while the reference is built, every sample but the first is
        scaled and then scaled back in place.

    Parameter:
        region_file: path of a wig file defining the regions to use; None to
            derive the regions from the data.
        region_out_file: when not None, the derived region wig is saved to
            this path (only used when region_file is None).
        exclude_low_percent: lower percentile handed to percentile().
        exclude_high_percent: 100 minus this value is the upper percentile
            handed to percentile().
        bnum: handed to percentile().
        nonzero: handed to percentile() as nonzero_end.

    Value:
        None when both percentages are 0 and no region_file is given,
        otherwise a dictionary mapping each sample name to its total.
    '''
    names = list(self.data.keys())
    if exclude_low_percent == 0 and exclude_high_percent == 0 and region_file is None:
        return None
    sampling_total = {}
    if region_file is None:
        # The space in front of "percents" was missing, which glued the
        # number to the word ("1percents").
        sys.stdout.write(
            'calculate total signal in each sample after excluding the top '
            + str(exclude_high_percent) + ' and bottom ' + str(exclude_low_percent)
            + ' percents of genomic regions with extremely high and low signal values\n')
        wsums = {}
        for name in names:
            wsums[name] = self.data[name].sum()
        wavg = functions.div(sum(wsums.values()), len(wsums))
        # Reference wig: the average of all samples after scaling each one to
        # the common total wavg.
        rfwig = deepcopy(self.data[names[0]])
        rfwig.foldChange(functions.div(wavg * 1.0, wsums[names[0]]))
        for name in names[1:]:
            self.data[name].foldChange(functions.div(wavg * 1.0, wsums[name]))
            rfwig.add(self.data[name])
            # Undo the temporary scaling of the sample.
            self.data[name].foldChange(functions.div(wsums[name] * 1.0, wavg))
        rfwig.foldChange(functions.div(1.0, len(names)))
        lowcut, highcut = rfwig.percentile(
            p=[exclude_low_percent, 100 - exclude_high_percent],
            bnum=bnum, nonzero_end=nonzero)
        rg = rfwig.regionWithinValueRange(lowcut, highcut)
        if region_out_file is not None:
            rg.save(region_out_file)
    else:
        # Separator added between the message and the file name.
        sys.stdout.write(
            'calculate total signal in each sample in genomic regions defined by '
            + region_file + "\n")
        rg = Wig(region_file)
    for name in names:
        sampling_total[name] = self.data[name].multiply(rg).sum()
    sys.stdout.write(
        str(rg.sum()) + ' (' + str(functions.div(rg.sum() * 100.0, rg.gsize()))
        + ' %) of ' + str(rg.gsize()) + ' base pairs calculated:\n')
    for name in names:
        # Separator added between the sample name and its total.
        sys.stdout.write(
            name + ' ' + str(sampling_total[name]) + ' ('
            + str(functions.div(sampling_total[name] * 100.0, self.data[name].sum()))
            + '% of total)\n')
    return sampling_total
def samplingTotal(self, region_file=None, region_out_file=None, exclude_low_percent=1,
                  exclude_high_percent=1, bnum=100000, nonzero=False):
    '''
    Description:
        Calculate, for every wig in self.data, the sum of its values after
        multiplying it with a region wig.
        Without region_file the region wig is derived here: all samples are
        scaled to the same total, averaged into one reference wig, and the
        regions whose reference value lies between the low and high percentile
        cut-offs are kept. With region_file the region wig is loaded from that
        file instead.
        NOTE: while the reference is built, every sample but the first is
        scaled and then scaled back in place.

    Parameter:
        region_file: path of a wig file defining the regions to use; None to
            derive the regions from the data.
        region_out_file: when not None, the derived region wig is saved to
            this path (only used when region_file is None).
        exclude_low_percent: lower percentile handed to percentile().
        exclude_high_percent: 100 minus this value is the upper percentile
            handed to percentile().
        bnum: handed to percentile().
        nonzero: handed to percentile() as nonzero_end.

    Value:
        None when both percentages are 0 and no region_file is given,
        otherwise a dictionary mapping each sample name to its total.
    '''
    # A real list: names is indexed and sliced below, which a Python 3
    # dictionary view does not support.
    names = list(self.data.keys())
    if exclude_low_percent == 0 and exclude_high_percent == 0 and region_file is None:
        return None
    sampling_total = {}
    # Messages are pre-formatted into one string so the output is the same
    # under the Python 2 print statement and the Python 3 print function.
    if region_file is None:
        print('calculate total signal in each sample after excluding the top %s and bottom %s '
              'percents of genomic regions with extremely high and low signal values'
              % (exclude_high_percent, exclude_low_percent))
        wsums = dict((name, self.data[name].sum()) for name in names)
        wavg = sum(wsums.values()) / len(wsums)
        # Reference wig: the average of all samples after scaling each one to
        # the common total wavg.
        first = names[0]
        rfwig = deepcopy(self.data[first])
        rfwig.foldChange(wavg * 1.0 / wsums[first])
        for name in names[1:]:
            self.data[name].foldChange(wavg * 1.0 / wsums[name])
            rfwig.add(self.data[name])
            # Undo the temporary scaling of the sample.
            self.data[name].foldChange(wsums[name] * 1.0 / wavg)
        rfwig.foldChange(1.0 / len(names))
        cutoffs = [exclude_low_percent, 100 - exclude_high_percent]
        lowcut, highcut = rfwig.percentile(p=cutoffs, bnum=bnum, nonzero_end=nonzero)
        rg = rfwig.regionWithinValueRange(lowcut, highcut)
        if region_out_file is not None:
            rg.save(region_out_file)
    else:
        print('calculate total signal in each sample in genomic regions defined by %s'
              % (region_file,))
        rg = Wig(region_file)
    for name in names:
        sampling_total[name] = self.data[name].multiply(rg).sum()
    print('%s (%s%%) of %s base pairs calculated:'
          % (rg.sum(), rg.sum() * 100.0 / rg.gsize(), rg.gsize()))
    for name in names:
        share = sampling_total[name] * 100.0 / self.data[name].sum()
        print('%s %s (%s%% of total)' % (name, sampling_total[name], share))
    return sampling_total