def _bigwig_extractor(datafile, intervals, **kwargs): width = intervals[0].stop - intervals[0].start data = np.zeros((len(intervals), 1, 1, width)) wWigIO.open(datafile) for index, interval in enumerate(intervals): wWigIO.getData(datafile, interval.chrom, interval.start, interval.stop, data[index, 0, 0, :]) wWigIO.close(datafile) return data
def extract_bigwig_to_npy(bigwig, output_dir, dtype=np.float32):
    """Dump every chromosome of a bigWig file to ``<output_dir>/<chrom>.npy``.

    Also writes ``<output_dir>/metadata.json`` holding the per-chromosome
    array shapes, the literal type ``'array'`` and the source path.

    Args:
        bigwig: Path of the bigWig file to read.
        output_dir: Existing directory that receives the ``.npy`` files and
            ``metadata.json``.
        dtype: dtype the values are cast to before saving.
    """
    file_shapes = {}

    wWigIO.open(bigwig)
    try:
        chrom_sizes = wWigIO.getChromSize(bigwig)
        for chrom, size in zip(*chrom_sizes):
            # Buffer is allocated at default precision, filled by wWigIO,
            # and cast to the requested dtype only when written.
            data = np.empty(size)
            wWigIO.getData(bigwig, chrom, 0, size, data)
            np.save('{}.npy'.format(os.path.join(output_dir, chrom)),
                    data.astype(dtype))
            file_shapes[chrom] = data.shape
    finally:
        # Do not leak the handle when a read or a save fails.
        wWigIO.close(bigwig)

    with open(os.path.join(output_dir, 'metadata.json'), 'w') as fp:
        json.dump(
            {
                'file_shapes': file_shapes,
                'type': 'array',
                'source': bigwig
            }, fp)
def getChromSizesWigIO(self):
    """Return a ``{chromosome name: size}`` dict for the bigWig at ``self.path``.

    The file is opened for the duration of the call and closed again
    afterwards, including when the size query raises.
    """
    wWigIO.open(self.path)
    try:
        out = wWigIO.getChromSize(self.path)
        # getChromSize yields two parallel sequences: names and sizes.
        return dict(zip(out[0], out[1]))
    finally:
        wWigIO.close(self.path)
# ------------------------------------
# Misc functions
# ------------------------------------

# ------------------------------------
# Classes
# ------------------------------------

# ------------------------------------
# Main
# ------------------------------------

if __name__ == "__main__":
    # Demo round trip: bigWig -> wig -> bigWig, using files in the
    # current working directory.

    # get information from bigwig file
    wWigIO.open('test.bw')
    try:
        chroms = wWigIO.getChromSize('test.bw')
        wigs = wWigIO.getIntervals('test.bw', 'chr1', 10, 200)
    finally:
        # Close the handle even if one of the queries fails.
        wWigIO.close('test.bw')
    # Single-argument call form prints the same under Python 2 and 3.
    print(wigs)

    # bigwig -> wig
    wWigIO.bigWigToWig('test.bw', 'test.wig')

    # write the chrom sizes into test.sizes
    # Accept both a {name: size} mapping and a (names, sizes) pair of
    # parallel sequences; the latter cannot be indexed by name.
    if isinstance(chroms, dict):
        chrom_sizes = chroms.items()
    else:
        chrom_sizes = zip(*chroms)
    with open('test.sizes', 'w') as fh:
        for chrom, size in chrom_sizes:
            fh.write(chrom + "\t" + str(size) + "\n")

    # wig -> bigwig
    wWigIO.wigToBigWig('test.wig', 'test.sizes', 'test2.bw')
def main(row_ME, phylop_primates):
    """Open the vertebrate and primate phyloP bigWig tracks through wWigIO.

    ``row_ME`` is accepted but not used by this body.

    NOTE(review): ``phylop_vertebrates`` is not a parameter of this
    function, so it must be resolvable as a name from the enclosing module
    scope; otherwise the call raises ``NameError`` -- confirm against the
    rest of the file.
    """
    for track in (phylop_vertebrates, phylop_primates):
        wWigIO.open(track)
def main(sim_fastq, U2_GTAG_5_file, U2_GTAG_3_file, phylop_vertebrates,
         phylop_primates, exon_scores):
    """Score microexon candidates against a reference exon distribution.

    For every unique candidate parsed from the ``@`` header rows of
    ``sim_fastq`` one space-separated line is printed: chromosome, start,
    end, strand, U2 score and its percentile, mean vertebrate conservation
    and its percentile, mean primate conservation and its percentile,
    ``P_ME`` and the overall score.

    Args:
        sim_fastq: Tab-delimited file. Only rows whose first field starts
            with ``@`` are used; that field is split on ``_`` into six
            parts (junction, microexon sequence, start, end, coverage, n).
        U2_GTAG_5_file: Position weight matrix file, loaded with
            ``PWM_to_dict``; its first 13 positions are used.
        U2_GTAG_3_file: Position weight matrix file, loaded with
            ``PWM_to_dict``; its first 17 positions are used.
        phylop_vertebrates: bigWig path queried with ``wWigIO.getIntervals``.
        phylop_primates: bigWig path queried with ``wWigIO.getIntervals``.
        exon_scores: Space-delimited file with seven columns per row:
            chromosome, start, end, strand, U2 score, mean vertebrate
            conservation, mean primate conservation. Columns 5-7 form the
            reference distributions for the percentiles.
    """

    def mean_score(intervals):
        # Mean of the third field of each interval; 0 when none returned.
        total = 0
        for interval in intervals:
            total += interval[2]
        try:
            return total / len(intervals)
        except ZeroDivisionError:
            return total

    wWigIO.open(phylop_vertebrates)
    wWigIO.open(phylop_primates)
    try:
        U2_GTAG_5 = PWM_to_dict(U2_GTAG_5_file)
        U2_GTAG_3 = PWM_to_dict(U2_GTAG_3_file)

        # Best attainable score: the per-position maximum over the bases.
        U2_GTAG_5_max_score = sum(
            max(U2_GTAG_5[base][index] for base in "ACTG")
            for index in range(13))
        U2_GTAG_3_max_score = sum(
            max(U2_GTAG_3[base][index] for base in "ACTG")
            for index in range(17))
        TOTAL_U2_max_score = U2_GTAG_5_max_score + U2_GTAG_3_max_score

        # Reference distributions taken from the exon score table.
        gencode_U2_scores = []
        gencode_mean_conservation_vertebrates = []
        gencode_mean_conservation_primates = []
        with open(exon_scores) as handle:
            for row in csv.reader(handle, delimiter=' '):
                (_chrom, _estart, _eend, _strand, ref_U2_score,
                 ref_vertebrates, ref_primates) = row
                gencode_U2_scores.append(float(ref_U2_score))
                gencode_mean_conservation_vertebrates.append(
                    float(ref_vertebrates))
                gencode_mean_conservation_primates.append(
                    float(ref_primates))

        # Collect the unique candidates from the header rows.
        MEs = set()
        with open(sim_fastq) as handle:
            for row in csv.reader(handle, delimiter='\t'):
                # Blank lines give an empty row or an empty first field.
                if not row or not row[0].startswith("@"):
                    continue
                SJ, ME_seq, estart, eend, total_coverage, n = \
                    row[0].split("_")
                len_ME = len(ME_seq)
                SJ = SJ[1:]  # drop the leading "@"
                SJ_chr, SJ_istart, SJ_iend = re.findall(r"[\w']+", SJ)
                SJ_len = int(SJ_iend) - int(SJ_istart)
                Kmer = SJ_len - (len_ME + 1)
                P_ME = 1 - (1 - (float(1) / float(4**len_ME + 4)))**Kmer
                strand = "-" if "-" in SJ else "+"
                MEs.add((SJ_chr, strand, int(estart), int(eend), P_ME))

        for chrom, strand, estart, eend, P_ME in MEs:
            estart, eend = sorted([estart, eend])

            if strand == "-":
                E5 = str(Genome[chrom][eend - 3:eend + 14]
                         .reverse_complement()).upper()
                E3 = str(Genome[chrom][estart - 10:estart + 3]
                         .reverse_complement()).upper()
            else:
                E5 = str(Genome[chrom][estart - 14:estart + 3]).upper()
                E3 = str(Genome[chrom][eend - 3:eend + 10]).upper()

            # E5 is scored with the 3' matrix and E3 with the 5' matrix.
            # "N" bases add nothing but still occupy their position.
            U2_score = 0
            for i, N in enumerate(E5):
                if N != "N":
                    U2_score += U2_GTAG_3[N][i]
            for i, N in enumerate(E3):
                if N != "N":
                    U2_score += U2_GTAG_5[N][i]
            U2_score = percent(U2_score, TOTAL_U2_max_score)

            mean_conservation_vertebrates = mean_score(
                wWigIO.getIntervals(phylop_vertebrates, chrom,
                                    estart - 2, eend + 2))
            mean_conservation_primates = mean_score(
                wWigIO.getIntervals(phylop_primates, chrom,
                                    estart - 2, eend + 2))

            ME_percentil_U2_score = stats.percentileofscore(
                gencode_U2_scores, U2_score)
            # Each mean is ranked within the reference distribution of its
            # own track (the two lookups used to be crossed).
            ME_percentil_mean_conservation_vertebrates = \
                stats.percentileofscore(
                    gencode_mean_conservation_vertebrates,
                    mean_conservation_vertebrates)
            ME_percentil_mean_conservation_primates = \
                stats.percentileofscore(
                    gencode_mean_conservation_primates,
                    mean_conservation_primates)

            # The higher of the two conservation percentiles is used.
            best_conservation_percentil = max(
                ME_percentil_mean_conservation_vertebrates,
                ME_percentil_mean_conservation_primates)
            overall_score = P_ME * (1 - ME_percentil_U2_score / 100) * (
                1 - best_conservation_percentil / 100)

            # Single-string print: same output under Python 2 and 3.
            print(" ".join(str(value) for value in (
                chrom, estart, eend, strand, U2_score,
                ME_percentil_U2_score, mean_conservation_vertebrates,
                ME_percentil_mean_conservation_vertebrates,
                mean_conservation_primates,
                ME_percentil_mean_conservation_primates, P_ME,
                overall_score)))
    finally:
        # Release both bigWig handles, also on error.
        wWigIO.close(phylop_vertebrates)
        wWigIO.close(phylop_primates)
def main(gencode_bed, U2_GTAG_5_file, U2_GTAG_3_file, phylop_vertebrates,
         phylop_primates):
    """Print U2 splice-site and conservation scores for internal exons.

    Every block of each ``gencode_bed`` row except the first and the last
    is treated as an exon. Exons whose flanks show ``AG`` upstream and
    ``GT`` downstream are kept, de-duplicated, and printed one per line as
    space-separated chromosome, start, end, strand, U2 score, mean
    vertebrate conservation and mean primate conservation.

    Args:
        gencode_bed: Tab-delimited file; columns 0 (chromosome), 1 (start),
            5 (strand), 10 (comma-separated block sizes) and 11
            (comma-separated block starts) are read -- presumably BED12.
        U2_GTAG_5_file: Position weight matrix file, loaded with
            ``PWM_to_dict``; its first 13 positions are used.
        U2_GTAG_3_file: Position weight matrix file, loaded with
            ``PWM_to_dict``; its first 17 positions are used.
        phylop_vertebrates: bigWig path queried with ``wWigIO.getIntervals``.
        phylop_primates: bigWig path queried with ``wWigIO.getIntervals``.
    """

    def mean_score(intervals):
        # Mean of the third field of each interval; 0 when none returned.
        total = 0
        for interval in intervals:
            total += interval[2]
        try:
            return total / len(intervals)
        except ZeroDivisionError:
            return total

    # Must be raised before the reader parses anything, otherwise the
    # first row is still read under the default field size limit.
    csv.field_size_limit(1000000000)

    wWigIO.open(phylop_vertebrates)
    wWigIO.open(phylop_primates)
    try:
        U2_GTAG_5 = PWM_to_dict(U2_GTAG_5_file)
        U2_GTAG_3 = PWM_to_dict(U2_GTAG_3_file)

        # Best attainable score: the per-position maximum over the bases.
        U2_GTAG_5_max_score = sum(
            max(U2_GTAG_5[base][index] for base in "ACTG")
            for index in range(13))
        U2_GTAG_3_max_score = sum(
            max(U2_GTAG_3[base][index] for base in "ACTG")
            for index in range(17))
        TOTAL_U2_max_score = U2_GTAG_5_max_score + U2_GTAG_3_max_score

        exons = set()
        with open(gencode_bed) as handle:
            for row in csv.reader(handle, delimiter='\t'):
                # Real lists, so the slicing below also works where map()
                # returns an iterator.
                qstarts = [int(x) for x in row[11].strip(",").split(",")]
                blocksizes = [int(x) for x in row[10].strip(",").split(",")]
                start = int(row[1])
                strand = row[5]
                chrom = row[0]

                # [1:-1] skips the first and the last block of the row.
                for q1, b in zip(qstarts[1:-1], blocksizes[1:-1]):
                    estart = start + q1
                    eend = start + q1 + b

                    if strand == "-":
                        E5 = str(Genome[chrom][eend - 3:eend + 14]
                                 .reverse_complement()).upper()
                        E3 = str(Genome[chrom][estart - 10:estart + 3]
                                 .reverse_complement()).upper()
                    else:
                        E5 = str(Genome[chrom][estart - 14:estart + 3]).upper()
                        E3 = str(Genome[chrom][eend - 3:eend + 10]).upper()

                    # E5 is scored with the 3' matrix, E3 with the 5' one.
                    # "N" bases are skipped instead of being looked up in
                    # the matrices; they still occupy their position.
                    U2_score = 0
                    for i, N in enumerate(E5):
                        if N != "N":
                            U2_score += U2_GTAG_3[N][i]
                    for i, N in enumerate(E3):
                        if N != "N":
                            U2_score += U2_GTAG_5[N][i]
                    U2_score = percent(U2_score, TOTAL_U2_max_score)

                    # Keep only exons flanked by AG ... GT.
                    if E5[-5:-3] == "AG" and E3[3:5] == "GT":
                        exons.add((chrom, estart, eend, strand, U2_score))

        for chrom, estart, eend, strand, U2_score in exons:
            mean_conservation_vertebrates = mean_score(
                wWigIO.getIntervals(phylop_vertebrates, chrom,
                                    estart - 2, eend + 2))
            mean_conservation_primates = mean_score(
                wWigIO.getIntervals(phylop_primates, chrom,
                                    estart - 2, eend + 2))

            # Single-string print: same output under Python 2 and 3.
            print(" ".join(str(value) for value in (
                chrom, estart, eend, strand, U2_score,
                mean_conservation_vertebrates, mean_conservation_primates)))
    finally:
        # Release both bigWig handles, also on error.
        wWigIO.close(phylop_vertebrates)
        wWigIO.close(phylop_primates)
def __init__(self, fname):
    '''
    Open BigWig file.

    Stores ``fname`` on the instance as ``self.fname`` and opens that path
    through ``wWigIO.open``.
    '''
    self.fname = fname
    # No matching wWigIO.close is visible in this method; presumably the
    # handle is released elsewhere in the class -- TODO confirm.
    wWigIO.open(self.fname)
import os, sys
import numpy as np
import scipy.stats
import tabix
import wWigIO
import bw_bin


def show_help():
    """Write a usage example (script name plus sample arguments) to stderr."""
    print >> sys.stderr, "\n\tpython ", sys.argv[ 0], " /datd/huboqiang/test_NOM/02.SingleC/mESC_gF28_1/singleC/chr10.ACG.TCG.bed.gz COUNT_U COUNT_M 15"


# Hard-coded bigWig path; the file is opened through wWigIO as soon as this
# module is executed or imported.
in_bw = "/datd/huboqiang/test_NOM/mESC_nuc.sort.bw"
wWigIO.open(in_bw)


class NDR(object):
    # NOTE(review): only the start of this class is visible in this excerpt;
    # the constructor below stores just ``chrom`` and ``bin_len``. The other
    # arguments are accepted but not used in the visible lines -- confirm
    # against the full file.
    def __init__(self, chrom, count_u, count_m, tb_file, cutoff=1e-5, depth=3, bin_len=40, step_len=20, dist_len=140):
        self.chrom = chrom
        self.bin_len = bin_len
def __init__(self,fname):
    '''
    Open BigWig file.

    Stores ``fname`` on the instance as ``self.fname`` and opens that path
    through ``wWigIO.open``.
    '''
    self.fname=fname
    # No matching wWigIO.close is visible in this method; presumably the
    # handle is released elsewhere in the class -- TODO confirm.
    wWigIO.open(self.fname)
# ------------------------------------
# Misc functions
# ------------------------------------

# ------------------------------------
# Classes
# ------------------------------------

# ------------------------------------
# Main
# ------------------------------------

if __name__ == "__main__":
    # Demo round trip: bigWig -> wig -> bigWig, using files in the
    # current working directory.

    # get information from bigwig file
    wWigIO.open('test.bw')
    try:
        chroms = wWigIO.getChromSize('test.bw')
        wigs = wWigIO.getIntervals('test.bw', 'chr1', 10, 200)
    finally:
        # Close the handle even if one of the queries fails.
        wWigIO.close('test.bw')
    # Single-argument call form prints the same under Python 2 and 3.
    print(wigs)

    # bigwig -> wig
    wWigIO.bigWigToWig('test.bw', 'test.wig')

    # write the chrom sizes into test.sizes
    # Accept both a {name: size} mapping and a (names, sizes) pair of
    # parallel sequences; the latter cannot be indexed by name.
    if isinstance(chroms, dict):
        chrom_sizes = chroms.items()
    else:
        chrom_sizes = zip(*chroms)
    with open('test.sizes', 'w') as fh:
        for chrom, size in chrom_sizes:
            fh.write(chrom + "\t" + str(size) + "\n")

    # wig -> bigwig
    wWigIO.wigToBigWig('test.wig', 'test.sizes', 'test2.bw')
def main(sim_fastq, U2_GTAG_5_file, U2_GTAG_3_file, phylop_vertebrates,
         phylop_primates):
    """Print U2 splice-site and conservation scores for listed exons.

    For each row of ``sim_fastq`` one space-separated line is printed:
    chromosome, start, end, strand, combined U2 score, upstream-flank score,
    downstream-flank score, mean vertebrate conservation and mean primate
    conservation. Before scoring, the flanks are overwritten with ``AG``
    (upstream) and ``GT`` (downstream) at the splice-site dinucleotide
    positions.

    Args:
        sim_fastq: Tab-delimited file with exactly six columns per row:
            chromosome, start, end, exon, exon length, strand.
        U2_GTAG_5_file: Position weight matrix file, loaded with
            ``PWM_to_dict``; its first 13 positions are used.
        U2_GTAG_3_file: Position weight matrix file, loaded with
            ``PWM_to_dict``; its first 17 positions are used.
        phylop_vertebrates: bigWig path queried with ``wWigIO.getIntervals``.
        phylop_primates: bigWig path queried with ``wWigIO.getIntervals``.
    """

    def mean_score(intervals):
        # Mean of the third field of each interval; 0 when none returned.
        total = 0
        for interval in intervals:
            total += interval[2]
        try:
            return total / len(intervals)
        except ZeroDivisionError:
            return total

    wWigIO.open(phylop_vertebrates)
    wWigIO.open(phylop_primates)
    try:
        U2_GTAG_5 = PWM_to_dict(U2_GTAG_5_file)
        U2_GTAG_3 = PWM_to_dict(U2_GTAG_3_file)

        # Best attainable score: the per-position maximum over the bases.
        U2_GTAG_5_max_score = sum(
            max(U2_GTAG_5[base][index] for base in "ACTG")
            for index in range(13))
        U2_GTAG_3_max_score = sum(
            max(U2_GTAG_3[base][index] for base in "ACTG")
            for index in range(17))
        TOTAL_U2_max_score = U2_GTAG_5_max_score + U2_GTAG_3_max_score

        with open(sim_fastq) as handle:
            for row in csv.reader(handle, delimiter='\t'):
                chrom, estart, eend, exon, exon_len, strand = row
                estart = int(estart)
                eend = int(eend)

                if strand == "-":
                    E5 = str(Genome[chrom][eend - 3:eend + 14]
                             .reverse_complement()).upper()
                    E3 = str(Genome[chrom][estart - 10:estart + 3]
                             .reverse_complement()).upper()
                else:
                    E5 = str(Genome[chrom][estart - 14:estart + 3]).upper()
                    E3 = str(Genome[chrom][eend - 3:eend + 10]).upper()

                # Force the AG / GT dinucleotides on both strands.
                E5 = E5[:-5] + "AG" + E5[-3:]
                E3 = E3[:3] + "GT" + E3[5:]

                # E5 is scored with the 3' matrix and E3 with the 5' matrix.
                # "N" bases add nothing but still occupy their position.
                U2_score = 0
                ME5_U2_score = 0
                ME3_U2_score = 0
                for i, N in enumerate(E5):
                    if N != "N":
                        U2_score += U2_GTAG_3[N][i]
                        ME5_U2_score += U2_GTAG_3[N][i]
                for i, N in enumerate(E3):
                    if N != "N":
                        U2_score += U2_GTAG_5[N][i]
                        ME3_U2_score += U2_GTAG_5[N][i]

                ME3_U2_score = percent(ME3_U2_score, U2_GTAG_5_max_score)
                ME5_U2_score = percent(ME5_U2_score, U2_GTAG_3_max_score)
                U2_score = percent(U2_score, TOTAL_U2_max_score)

                mean_conservation_vertebrates = mean_score(
                    wWigIO.getIntervals(phylop_vertebrates, chrom,
                                        estart - 2, eend + 2))
                mean_conservation_primates = mean_score(
                    wWigIO.getIntervals(phylop_primates, chrom,
                                        estart - 2, eend + 2))

                # Single-string print: same output under Python 2 and 3.
                print(" ".join(str(value) for value in (
                    chrom, estart, eend, strand, U2_score, ME5_U2_score,
                    ME3_U2_score, mean_conservation_vertebrates,
                    mean_conservation_primates)))
    finally:
        # Release both bigWig handles, also on error.
        wWigIO.close(phylop_vertebrates)
        wWigIO.close(phylop_primates)
def main(sim_fastq, U2_GTAG_5_file, U2_GTAG_3_file, phylop_vertebrates,
         phylop_primates):
    """Print U2 splice-site and conservation scores for listed exons.

    For each row of ``sim_fastq`` one space-separated line is printed:
    chromosome, start, end, strand, combined U2 score, upstream-flank score,
    downstream-flank score, mean vertebrate conservation and mean primate
    conservation. Before scoring, the flanks are overwritten with ``AG``
    (upstream) and ``GT`` (downstream) at the splice-site dinucleotide
    positions.

    Args:
        sim_fastq: Tab-delimited file with exactly six columns per row:
            chromosome, start, end, exon, exon length, strand.
        U2_GTAG_5_file: Position weight matrix file, loaded with
            ``PWM_to_dict``; its first 13 positions are used.
        U2_GTAG_3_file: Position weight matrix file, loaded with
            ``PWM_to_dict``; its first 17 positions are used.
        phylop_vertebrates: bigWig path queried with ``wWigIO.getIntervals``.
        phylop_primates: bigWig path queried with ``wWigIO.getIntervals``.
    """

    def mean_score(intervals):
        # Mean of the third field of each interval; 0 when none returned.
        total = 0
        for interval in intervals:
            total += interval[2]
        try:
            return total / len(intervals)
        except ZeroDivisionError:
            return total

    wWigIO.open(phylop_vertebrates)
    wWigIO.open(phylop_primates)
    try:
        U2_GTAG_5 = PWM_to_dict(U2_GTAG_5_file)
        U2_GTAG_3 = PWM_to_dict(U2_GTAG_3_file)

        # Best attainable score: the per-position maximum over the bases.
        U2_GTAG_5_max_score = sum(
            max(U2_GTAG_5[base][index] for base in "ACTG")
            for index in range(13))
        U2_GTAG_3_max_score = sum(
            max(U2_GTAG_3[base][index] for base in "ACTG")
            for index in range(17))
        TOTAL_U2_max_score = U2_GTAG_5_max_score + U2_GTAG_3_max_score

        with open(sim_fastq) as handle:
            for row in csv.reader(handle, delimiter='\t'):
                chrom, estart, eend, exon, exon_len, strand = row
                estart = int(estart)
                eend = int(eend)

                if strand == "-":
                    E5 = str(Genome[chrom][eend - 3:eend + 14]
                             .reverse_complement()).upper()
                    E3 = str(Genome[chrom][estart - 10:estart + 3]
                             .reverse_complement()).upper()
                else:
                    E5 = str(Genome[chrom][estart - 14:estart + 3]).upper()
                    E3 = str(Genome[chrom][eend - 3:eend + 10]).upper()

                # Force the AG / GT dinucleotides on both strands.
                E5 = E5[:-5] + "AG" + E5[-3:]
                E3 = E3[:3] + "GT" + E3[5:]

                # E5 is scored with the 3' matrix and E3 with the 5' matrix.
                # "N" bases add nothing but still occupy their position.
                U2_score = 0
                ME5_U2_score = 0
                ME3_U2_score = 0
                for i, N in enumerate(E5):
                    if N != "N":
                        U2_score += U2_GTAG_3[N][i]
                        ME5_U2_score += U2_GTAG_3[N][i]
                for i, N in enumerate(E3):
                    if N != "N":
                        U2_score += U2_GTAG_5[N][i]
                        ME3_U2_score += U2_GTAG_5[N][i]

                ME3_U2_score = percent(ME3_U2_score, U2_GTAG_5_max_score)
                ME5_U2_score = percent(ME5_U2_score, U2_GTAG_3_max_score)
                U2_score = percent(U2_score, TOTAL_U2_max_score)

                mean_conservation_vertebrates = mean_score(
                    wWigIO.getIntervals(phylop_vertebrates, chrom,
                                        estart - 2, eend + 2))
                mean_conservation_primates = mean_score(
                    wWigIO.getIntervals(phylop_primates, chrom,
                                        estart - 2, eend + 2))

                # Single-string print: same output under Python 2 and 3.
                print(" ".join(str(value) for value in (
                    chrom, estart, eend, strand, U2_score, ME5_U2_score,
                    ME3_U2_score, mean_conservation_vertebrates,
                    mean_conservation_primates)))
    finally:
        # Release both bigWig handles, also on error.
        wWigIO.close(phylop_vertebrates)
        wWigIO.close(phylop_primates)
def main(gencode_bed, U2_GTAG_5_file, U2_GTAG_3_file, phylop_vertebrates,
         phylop_primates):
    """Print U2 splice-site and conservation scores for internal exons.

    Every block of each ``gencode_bed`` row except the first and the last
    is treated as an exon. Exons whose flanks show ``AG`` upstream and
    ``GT`` downstream are kept, de-duplicated, and printed one per line as
    space-separated chromosome, start, end, strand, U2 score, mean
    vertebrate conservation and mean primate conservation.

    Args:
        gencode_bed: Tab-delimited file; columns 0 (chromosome), 1 (start),
            5 (strand), 10 (comma-separated block sizes) and 11
            (comma-separated block starts) are read -- presumably BED12.
        U2_GTAG_5_file: Position weight matrix file, loaded with
            ``PWM_to_dict``; its first 13 positions are used.
        U2_GTAG_3_file: Position weight matrix file, loaded with
            ``PWM_to_dict``; its first 17 positions are used.
        phylop_vertebrates: bigWig path queried with ``wWigIO.getIntervals``.
        phylop_primates: bigWig path queried with ``wWigIO.getIntervals``.
    """

    def mean_score(intervals):
        # Mean of the third field of each interval; 0 when none returned.
        total = 0
        for interval in intervals:
            total += interval[2]
        try:
            return total / len(intervals)
        except ZeroDivisionError:
            return total

    # Must be raised before the reader parses anything, otherwise the
    # first row is still read under the default field size limit.
    csv.field_size_limit(1000000000)

    wWigIO.open(phylop_vertebrates)
    wWigIO.open(phylop_primates)
    try:
        U2_GTAG_5 = PWM_to_dict(U2_GTAG_5_file)
        U2_GTAG_3 = PWM_to_dict(U2_GTAG_3_file)

        # Best attainable score: the per-position maximum over the bases.
        U2_GTAG_5_max_score = sum(
            max(U2_GTAG_5[base][index] for base in "ACTG")
            for index in range(13))
        U2_GTAG_3_max_score = sum(
            max(U2_GTAG_3[base][index] for base in "ACTG")
            for index in range(17))
        TOTAL_U2_max_score = U2_GTAG_5_max_score + U2_GTAG_3_max_score

        exons = set()
        with open(gencode_bed) as handle:
            for row in csv.reader(handle, delimiter='\t'):
                # Real lists, so the slicing below also works where map()
                # returns an iterator.
                qstarts = [int(x) for x in row[11].strip(",").split(",")]
                blocksizes = [int(x) for x in row[10].strip(",").split(",")]
                start = int(row[1])
                strand = row[5]
                chrom = row[0]

                # [1:-1] skips the first and the last block of the row.
                for q1, b in zip(qstarts[1:-1], blocksizes[1:-1]):
                    estart = start + q1
                    eend = start + q1 + b

                    if strand == "-":
                        E5 = str(Genome[chrom][eend - 3:eend + 14]
                                 .reverse_complement()).upper()
                        E3 = str(Genome[chrom][estart - 10:estart + 3]
                                 .reverse_complement()).upper()
                    else:
                        E5 = str(Genome[chrom][estart - 14:estart + 3]).upper()
                        E3 = str(Genome[chrom][eend - 3:eend + 10]).upper()

                    # E5 is scored with the 3' matrix, E3 with the 5' one.
                    # "N" bases are skipped instead of being looked up in
                    # the matrices; they still occupy their position.
                    U2_score = 0
                    for i, N in enumerate(E5):
                        if N != "N":
                            U2_score += U2_GTAG_3[N][i]
                    for i, N in enumerate(E3):
                        if N != "N":
                            U2_score += U2_GTAG_5[N][i]
                    U2_score = percent(U2_score, TOTAL_U2_max_score)

                    # Keep only exons flanked by AG ... GT.
                    if E5[-5:-3] == "AG" and E3[3:5] == "GT":
                        exons.add((chrom, estart, eend, strand, U2_score))

        for chrom, estart, eend, strand, U2_score in exons:
            mean_conservation_vertebrates = mean_score(
                wWigIO.getIntervals(phylop_vertebrates, chrom,
                                    estart - 2, eend + 2))
            mean_conservation_primates = mean_score(
                wWigIO.getIntervals(phylop_primates, chrom,
                                    estart - 2, eend + 2))

            # Single-string print: same output under Python 2 and 3.
            print(" ".join(str(value) for value in (
                chrom, estart, eend, strand, U2_score,
                mean_conservation_vertebrates, mean_conservation_primates)))
    finally:
        # Release both bigWig handles, also on error.
        wWigIO.close(phylop_vertebrates)
        wWigIO.close(phylop_primates)