def plot_one_chr_fragment(matrix, figure_path, chr_name, ch_start, ch_end):
    """Plot one region of a chromosome's matrix as a rotated and a square image.

    Two PNG files (dpi=900) are written, both showing the natural log of the
    selected sub-matrix with the 'OrRd' colormap:

    * ``<figure_path>_<chr_name>_bin_<start>_to_<end>.rotated.png`` drawn with
      ``plotting.plot_rotated_matrix``;
    * ``<figure_path>_<chr_name>_bin_<start>_to_<end>.png`` drawn with
      ``plotting.plot_matrix``.

    Relies on the module-level names ``domain_res``, ``genome_db``,
    ``plotting``, ``plt`` and ``np``.

    Args:
        matrix: container indexed by the chromosome index taken from
            ``genome_db.label2idx``; each item is sliced as a 2-D array.
        figure_path: prefix of the output file names.
        chr_name: chromosome label, a key of ``genome_db.label2idx``.
        ch_start: region start; divided by ``domain_res`` to get the first bin.
        ch_end: region end; divided by ``domain_res`` to get the end bin
            (exclusive, as it is used as a slice bound).
    """
    print("Plotting picture")
    # Floor division keeps the bin numbers integral so they remain valid slice
    # bounds regardless of the division semantics in effect.
    ch_start = ch_start // domain_res
    ch_end = ch_end // domain_res

    i = genome_db.label2idx[chr_name]
    q2_0 = matrix[i][ch_start:ch_end, ch_start:ch_end]
    name_stem = (figure_path + "_" + chr_name + "_bin_" + str(ch_start)
                 + "_to_" + str(ch_end))

    # Rotated view.
    plotting.plot_rotated_matrix(np.log(q2_0), 0, cmap='OrRd')
    print("%s %s" % (ch_start, ch_end))
    plt.subplots_adjust(bottom=0.15)
    fp1 = name_stem + ".rotated.png"
    print("Saving figure " + fp1)
    # savefig opens and closes the target itself; no separate handle is needed.
    plt.savefig(fp1, dpi=900)
    plt.clf()

    # Square view of the same region.
    plotting.plot_matrix(np.log(q2_0), cmap='OrRd')
    fp1 = name_stem + ".png"
    print("Saving figure " + fp1)
    plt.savefig(fp1, dpi=900)
def iterativeFiltering(genome_db, fragments):
    """Filter the binned 1M heatmap and iteratively correct it.

    Reads ``heatmap-res-1M.hdf5`` from ``options.outputDir``, applies the
    filters listed below, exports the result as ``IC-heatmap-res-1M.hdf5`` in
    the same directory and plots the natural log of the corrected matrix.

    Relies on the module-level names ``options``, ``h5dict``, ``binnedData``,
    ``plotting`` and ``np``. ``fragments`` is accepted but not used here.
    """
    source_path = options.outputDir + 'heatmap-res-1M.hdf5'

    # The bin size is stored inside the heatmap file itself.
    heatmap_store = h5dict.h5dict(source_path, mode='r')
    bin_size = int(heatmap_store['resolution'])

    binned = binnedData.binnedData(bin_size, genome_db)
    binned.simpleLoad(source_path, options.experiment)

    # Drop contacts between loci that fall into the same bin.
    binned.removeDiagonal()
    # Drop bins with less than half of their length sequenced.
    binned.removeBySequencedCount(0.5)
    # Drop the 1% of regions with the lowest coverage.
    binned.removePoorRegions(cutoff=1)
    # Truncate the top 0.05% of interchromosomal counts (possibly PCR blowouts).
    binned.truncTrans(high=0.0005)
    # Iterative correction proper.
    binned.iterativeCorrectWithoutSS()

    # Persist the corrected heatmap, then show it on a log scale.
    binned.export(options.experiment,
                  options.outputDir + 'IC-heatmap-res-1M.hdf5')
    corrected = binned.dataDict[options.experiment]
    plotting.plot_matrix(np.log(corrected))
def step3(hiclib_path, sraid, res=1000000):
    """3. Filter and iteratively correct heatmaps.

    http://mirnylab.bitbucket.org/hiclib/tutorial/03_heatmap_processing.html

    Reads ``<sraid>_map-res<res/1000>k.hdf5`` and writes, with the same
    prefix: ``.pdf`` (log of the raw heatmap), ``-ic.hdf5`` (corrected
    heatmap), ``-ic.pdf`` (log of the corrected heatmap) and ``-ic-bias.txt``
    (one tab-separated row per bin: chromosome, start, start + res, bias).

    Args:
        hiclib_path: directory that contains ``fasta/hg19``.
        sraid: prefix shared by the input and all output files.
        res: bin size; used for the file-name suffix and for the end
            coordinate written to the bias table.
    """
    import matplotlib.pyplot as plt
    import numpy as np
    from mirnylib import genome
    from mirnylib import h5dict
    from mirnylib import plotting
    from hiclib import binnedData

    genome_db = genome.Genome(hiclib_path + '/fasta/hg19',
                              readChrms=['#', 'X'])

    # Floor division keeps the suffix integral (e.g. "1000k") even where "/"
    # performs true division.
    prefix = sraid + '_map-res%sk' % (res // 1000)

    # Read resolution from the dataset.
    raw_heatmap = h5dict.h5dict(prefix + '.hdf5', mode='r')
    resolution = int(raw_heatmap['resolution'])

    # Create a binnedData object, load the data.
    BD = binnedData.binnedData(resolution, genome_db)
    BD.simpleLoad(prefix + '.hdf5', 'DataName')

    # Plot the heatmap directly.
    plotting.plot_matrix(np.log(BD.dataDict['DataName']))
    plt.savefig(prefix + '.pdf')
    plt.clf()

    # Remove the contacts between loci located within the same bin.
    BD.removeDiagonal()
    # Remove bins with less than half of a bin sequenced.
    BD.removeBySequencedCount(0.5)
    # Remove 1% of regions with low coverage.
    BD.removePoorRegions(cutoff=1)
    # Truncate top 0.05% of interchromosomal counts (possibly, PCR blowouts).
    BD.truncTrans(high=0.0005)
    # Perform iterative correction.
    BD.iterativeCorrectWithoutSS()

    # Save the iteratively corrected heatmap.
    BD.export('DataName', prefix + '-ic.hdf5')

    # Plot the corrected heatmap.
    plotting.plot_matrix(np.log(BD.dataDict['DataName']))
    plt.savefig(prefix + '-ic.pdf')
    plt.clf()

    # Save the per-bin bias; the context manager closes the file even if a
    # lookup or write fails part-way through.
    with open(prefix + "-ic-bias.txt", "w") as outfile:
        for i, chrom_idx in enumerate(BD.chromosomeIndex):
            chro = BD.genome.idx2label[chrom_idx]
            posi = BD.positionIndex[i]
            outfile.write("chr%s\t%s\t%s" % (chro, posi, posi + res))
            outfile.write("\t%s" % BD.biasDict['DataName'][i])
            outfile.write("\n")
def plot_one_chr_pict(matrix, figure_path, chr_numb):
    """Plot the contact matrix of one whole chromosome and save it as a PNG.

    The chromosome block is cut out of ``matrix`` with the module-level
    ``st``/``end`` bin boundaries, its natural log is drawn with the 'OrRd'
    colormap, and the figure is written (dpi=900) to
    ``<figure_path><domain_res>_1000KB_chr<chr_numb + 1>.png``. The current
    figure is cleared afterwards.

    Relies on the module-level names ``st``, ``end``, ``domain_res``,
    ``plotting``, ``plt`` and ``np``.

    Args:
        matrix: 2-D array indexed by genome-wide bin number.
        figure_path: prefix of the output file name.
        chr_numb: zero-based chromosome index into ``st`` and ``end``.
    """
    print("Plotting picture")
    i = chr_numb
    q2_0 = matrix[st[i]:end[i], st[i]:end[i]]
    plotting.plot_matrix(np.log(q2_0), cmap='OrRd')
    plt.subplots_adjust(bottom=0.15)
    # File names use a one-based chromosome number.
    fp1 = figure_path + str(domain_res) + "_1000KB_chr" + str(i + 1) + ".png"
    print("Saving figure " + fp1)
    # savefig opens and closes the target itself; no separate handle is needed.
    plt.savefig(fp1, dpi=900)
    plt.clf()
def step4(hiclib_path, sraid, res=1000000):
    """4. Eigen vector decomposition.

    /examples/iterativeCorrectionEigenvectorExpansion/eigenvectorAnalysis.py

    Reads ``<sraid>_map-res<res/1000>k.hdf5``, computes the first 30
    eigenvectors of the filtered trans map and writes, with the same prefix:
    ``-eig.pdf`` (log of ``eig_v.T * diag(eig) * eig_v``) and ``-ic-eig.txt``
    (one tab-separated row per bin: chromosome, start, start + res, followed
    by that bin's value in every eigenvector).

    Args:
        hiclib_path: directory that contains ``fasta/hg19``.
        sraid: prefix shared by the input and all output files.
        res: bin size; used for the file-name suffix and for the end
            coordinate written to the eigenvector table.
    """
    import matplotlib.pyplot as plt
    import numpy as np
    from mirnylib import genome
    from mirnylib import h5dict
    from mirnylib import plotting
    from hiclib import binnedData

    genome_db = genome.Genome(hiclib_path + '/fasta/hg19',
                              readChrms=['#', 'X'])

    # Floor division keeps the suffix integral (e.g. "1000k") even where "/"
    # performs true division.
    prefix = sraid + '_map-res%sk' % (res // 1000)

    # Read resolution from the dataset.
    raw_heatmap = h5dict.h5dict(prefix + '.hdf5', mode='r')
    resolution = int(raw_heatmap['resolution'])

    # Create a binnedData object, load the data.
    BD = binnedData.binnedData(resolution, genome_db)
    BD.simpleLoad(prefix + '.hdf5', 'DataName')

    # Do eigen decomposition.
    BD.removeDiagonal()
    BD.removeBySequencedCount(0.5)
    BD.removeCis()
    BD.truncTrans(high=0.0005)
    BD.removePoorRegions(cutoff=1)
    BD.fakeCis()
    BD.removeZeros()
    BD.doEig(numPCs=30, force=True)  # First 30 EIGs
    BD.restoreZeros(value=0)
    eig = BD.eigEigenvalueDict['DataName']
    eig_v = BD.EigDict['DataName']

    # Plot the matrix rebuilt from the eigenvectors and eigenvalues.
    plotting.plot_matrix(np.log(np.dot(np.dot(eig_v.T, np.diag(eig)), eig_v)))
    plt.savefig(prefix + '-eig.pdf')
    plt.clf()

    # The context manager closes the table even if a lookup or write fails
    # part-way through.
    with open(prefix + "-ic-eig.txt", "w") as outfile:
        for i, chrom_idx in enumerate(BD.chromosomeIndex):
            chro = BD.genome.idx2label[chrom_idx]
            posi = BD.positionIndex[i]
            outfile.write("chr%s\t%s\t%s" % (chro, posi, posi + res))
            for eigenvector in eig_v:
                outfile.write("\t%s" % eigenvector[i])
            outfile.write("\n")
def step4(hiclib_path, sraid, res=1000000):
    """4. Eigen vector decomposition.

    /examples/iterativeCorrectionEigenvectorExpansion/eigenvectorAnalysis.py

    Input is ``<sraid>_map-res<res/1000>k.hdf5``. Outputs share that prefix:

    * ``-eig.pdf``: log of ``eig_v.T * diag(eig) * eig_v`` for the first 30
      eigenvectors of the filtered trans map;
    * ``-ic-eig.txt``: one tab-separated row per bin holding chromosome,
      start, start + res and the bin's value in every eigenvector.

    Args:
        hiclib_path: directory that contains ``fasta/hg19``.
        sraid: prefix shared by the input and all output files.
        res: bin size; used for the file-name suffix and for the end
            coordinate written to the eigenvector table.
    """
    import matplotlib.pyplot as plt
    import numpy as np
    from mirnylib import genome
    from mirnylib import h5dict
    from mirnylib import plotting
    from hiclib import binnedData

    genome_db = genome.Genome(hiclib_path + '/fasta/hg19',
                              readChrms=['#', 'X'])

    # Floor division keeps the suffix integral (e.g. "1000k") even where "/"
    # performs true division.
    prefix = sraid + '_map-res%sk' % (res // 1000)
    heatmap_path = prefix + '.hdf5'

    # Read resolution from the dataset.
    raw_heatmap = h5dict.h5dict(heatmap_path, mode='r')
    resolution = int(raw_heatmap['resolution'])

    # Create a binnedData object, load the data.
    BD = binnedData.binnedData(resolution, genome_db)
    BD.simpleLoad(heatmap_path, 'DataName')

    # Do eigen decomposition.
    BD.removeDiagonal()
    BD.removeBySequencedCount(0.5)
    BD.removeCis()
    BD.truncTrans(high=0.0005)
    BD.removePoorRegions(cutoff=1)
    BD.fakeCis()
    BD.removeZeros()
    BD.doEig(numPCs=30, force=True)  # First 30 EIGs
    BD.restoreZeros(value=0)
    eig = BD.eigEigenvalueDict['DataName']
    eig_v = BD.EigDict['DataName']

    # Plot the matrix rebuilt from the eigenvectors and eigenvalues.
    plotting.plot_matrix(np.log(np.dot(np.dot(eig_v.T, np.diag(eig)), eig_v)))
    plt.savefig(prefix + '-eig.pdf')
    plt.clf()

    # `with` guarantees the table is flushed and closed even when writing a
    # row raises.
    with open(prefix + "-ic-eig.txt", "w") as outfile:
        for i, chrom_idx in enumerate(BD.chromosomeIndex):
            chro = BD.genome.idx2label[chrom_idx]
            posi = BD.positionIndex[i]
            outfile.write("chr%s\t%s\t%s" % (chro, posi, posi + res))
            for eigenvector in eig_v:
                outfile.write("\t%s" % eigenvector[i])
            outfile.write("\n")
def plot_one_chr_fragment(matrix, figure_path, chr_numb, ch_start, ch_end):
    """Plot one region of a chromosome's contact matrix and save it as a PNG.

    The chromosome block is cut out of ``matrix`` with the module-level
    ``st``/``end`` bin boundaries, narrowed to the requested bins, and its
    natural log is drawn with the 'OrRd' colormap. The figure is written
    (dpi=900) to
    ``<figure_path><domain_res>_40KB_chr<chr_numb + 1>_bin_<start>_to_<end>.png``
    and the current figure is cleared afterwards.

    Relies on the module-level names ``st``, ``end``, ``domain_res``,
    ``plotting``, ``plt`` and ``np``.

    Args:
        matrix: 2-D array indexed by genome-wide bin number.
        figure_path: prefix of the output file name.
        chr_numb: zero-based chromosome index into ``st`` and ``end``.
        ch_start: region start; divided by ``domain_res`` to get the first bin
            within the chromosome.
        ch_end: region end; divided by ``domain_res`` to get the end bin
            (exclusive, as it is used as a slice bound).
    """
    print("Plotting picture")
    # Floor division keeps the bin numbers integral so they remain valid slice
    # bounds regardless of the division semantics in effect.
    ch_start = ch_start // domain_res
    ch_end = ch_end // domain_res

    i = chr_numb
    q2_0 = matrix[st[i]:end[i], st[i]:end[i]]
    q2_0 = q2_0[ch_start:ch_end, ch_start:ch_end]
    plotting.plot_matrix(np.log(q2_0), cmap='OrRd')
    plt.subplots_adjust(bottom=0.15)
    # File names use a one-based chromosome number.
    fp1 = (figure_path + str(domain_res) + "_40KB_chr" + str(i + 1)
           + "_bin_" + str(ch_start) + "_to_" + str(ch_end) + ".png")
    print("Saving figure " + fp1)
    # savefig opens and closes the target itself; no separate handle is needed.
    plt.savefig(fp1, dpi=900)
    plt.clf()
def iterativeFiltering(genome_db, filesuffix):
    """Filter one binned heatmap and iteratively correct it.

    The input file is ``options.outputDir + options.experiment + filesuffix``.
    After filtering and correction the heatmap is exported next to it with
    ``-IC`` inserted before the suffix, and the natural log of the corrected
    matrix is drawn on a new figure that is then handed to ``pp.savefig()``.

    Relies on the module-level names ``options``, ``h5dict``, ``binnedData``,
    ``plotting``, ``plt``, ``np`` and ``pp``.
    """
    source_path = options.outputDir + options.experiment + filesuffix

    # The bin size is stored inside the heatmap file itself.
    heatmap_store = h5dict.h5dict(source_path, mode='r')
    bin_size = int(heatmap_store['resolution'])

    binned = binnedData.binnedData(bin_size, genome_db)
    binned.simpleLoad(source_path, options.experiment)

    # Drop contacts between loci that fall into the same bin.
    binned.removeDiagonal()
    # Drop bins with less than half of their length sequenced.
    binned.removeBySequencedCount(0.5)
    # Drop the 1% of regions with the lowest coverage.
    binned.removePoorRegions(cutoff=1)
    # Truncate the top 0.05% of interchromosomal counts (possibly PCR blowouts).
    binned.truncTrans(high=0.0005)
    # Drop empty bins.
    binned.removeZeros()
    # Iterative correction proper.
    binned.iterativeCorrectWithoutSS()

    # Persist the corrected heatmap.
    target_path = options.outputDir + options.experiment + '-IC' + filesuffix
    binned.export(options.experiment, target_path)

    # Draw the corrected heatmap on its own figure and add it to `pp`.
    plt.figure()
    corrected = binned.dataDict[options.experiment]
    plotting.plot_matrix(np.log(corrected))
    pp.savefig()
# Filter the binned heatmap stored at `heatmap_filepath`, run iterative
# correction on it, export the result and save a picture of it.

# Open the heatmap file read-only to look up the resolution stored in it.
raw_heatmap = h5dict.h5dict(heatmap_filepath, mode='r')
resolution = int(raw_heatmap['resolution'])

# Create a binnedData object, load the data under the name 'HindIII_GM_1'.
BD = binnedData.binnedData(resolution, genome_db)
BD.simpleLoad(heatmap_filepath, 'HindIII_GM_1')

# Remove the contacts between loci located within the same bin.
BD.removeDiagonal()

# Remove bins with less than half of a bin sequenced.
BD.removeBySequencedCount(0.5)

# Remove 1% of regions with low coverage.
BD.removePoorRegions(cutoff=1)

# Truncate top 0.05% of interchromosomal counts (possibly, PCR blowouts).
BD.truncTrans(high=0.0005)

# Perform iterative correction.
BD.iterativeCorrectWithoutSS()

# Save the iteratively corrected heatmap next to the input, with the suffix
# '_corrected' appended to the input path.
BD.export('HindIII_GM_1', heatmap_filepath + '_corrected')

# Plot the natural log of the corrected heatmap and write it to 'Sp.png' in
# the current working directory.
plotting.plot_matrix(np.log(BD.dataDict['HindIII_GM_1']))
#plt.show()
plt.savefig('Sp.png')
# Load the 1M heatmap produced by the read-filtering step, filter it, run
# iterative correction, export the result and save a picture of it.

# Open the heatmap file read-only to look up the resolution stored in it.
raw_heatmap = h5dict.h5dict('../2_filtering_reads/heatmap-res-1M.hdf5', mode='r')
resolution = int(raw_heatmap['resolution'])

# Create a binnedData object and load the data under the name 'Rao2014_10M'.
BD = binnedData.binnedData(resolution, genome_db)
BD.simpleLoad('../2_filtering_reads/heatmap-res-1M.hdf5', 'Rao2014_10M')

# Diagonal removal is deliberately disabled here, so same-bin contacts stay in.
#BD.removeDiagonal()
BD.removeBySequencedCount(0.5)
BD.removePoorRegions(cutoff=1)
BD.truncTrans(high=0.0005)
BD.iterativeCorrectWithoutSS()

# Save the iteratively corrected heatmap in the current directory.
BD.export('Rao2014_10M', './IC-heatmap-res-1M.hdf5')

# Plot log(counts + 1.0); the offset keeps zero entries finite (log(1) == 0).
fig = plt.figure()
plotting.plot_matrix(np.log(BD.dataDict['Rao2014_10M'] + 1.0))
fig.savefig('./heatmap.pdf')

# The string below is a bare expression statement: the eigendecomposition code
# inside it is disabled and never executes.
''' print 'Eigendecomposition...' BD.removeCis() BD.fakeCis() BD.removeZeros() BD.doEig(numPCs=1) # first principal component BD.restoreZeros(value=0) eigenvectors = BD.EigDict['Rao2014_10M'] print eigenvectors[:,:10] np.savetxt('egvecs.txt', eigenvectors) '''
l = genome_db1.chrmLensBin[i] for j in xrange(0,l): #Adjusting everything except main diagonal idx1 = range(st[i],end[i]-j) #i-indices for diagonal elements idx2 = range(st[i]+j,end[i]) #j-indices for diagonal elements if (N1[j] == 0) or (N2[j]==0): q2[idx1,idx2] = 0 else: if (Err[j] <= 0.05): #applay only for points where error is less then 5% q2[idx1,idx2] = q2[idx1,idx2]*(N1[j]/N2[j]) for i in xrange(0,len(q2)): #Since we have modified only 1/2 of array, we need to applay all same to the bottom half now for j in xrange(i,len(q2)): q2[j,i] = q2[i,j] print "Adjusting total contact number after correction" s3 = np.sum([np.sum(BD2.dataDict['HindIII_GM_1'][st[i]:end[i],st[i]:end[i]]) for i in chrms])/ np.sum([np.sum(q2[st[i]:end[i],st[i]:end[i]]) for i in chrms]) for i in chrms: q2[st[i]:end[i],st[i]:end[i]] *= s3 print "Exporting heatmap to ",out_heatmap_filepath2 BD2.dataDict['HindIII_GM_1'] = q2 BD2.export('HindIII_GM_1',out_heatmap_filepath2) print "Plotting contact matrix" plotting.plot_matrix(np.log(q2)) plt.subplots_adjust(bottom=0.15) print "Saving figure "+figure_path f = open(figure_path, "wb") plt.savefig(figure_path,dpi=600) f.close()
E1_values = np.genfromtxt(E1_file, dtype=None)['f2'] assert len(E1_values) == len(q) saddles, strength = doSaddles(q, E1_values, genome_db) vmin = np.min(saddles["all_average"]) vmax = np.max(saddles["all_average"]) print "Min = ", vmin, "Max = ", vmax #Shufle E1 to get boostrap control import pickle SD = [] for i in range(100): if i % 5 == 0: print "Bootstraping ", i, "%" np.random.shuffle(E1_values) SD.append(doSaddles(q, E1_values, genome_db)) with open(figure_path + "bootstrup_dump", "w") as f: pickle.dump([SD], f) print "Strength=", strength, "+/-", np.std([i[1] for i in SD]) print "Bootstrap average = ", np.average([i[1] for i in SD]) for i in saddles: print "saving ", i plt.clf() np.savetxt(figure_path + i + ".txt", saddles[i]) plotting.plot_matrix(np.log(saddles[i])) if i == "all_average": plt.title("Strength=" + str(strength)) plt.savefig(figure_path + i + ".png", dpi=300)
np.logical_not(mask_pval), mask_hugeDifference) # take out elements that are already in p-val N_hugeDif = N_significant - np.sum(mask_hugeDifference) print "Number of values with difference >= ", use_weights, " times = ", N_hugeDif, " out of ", N_significant, "(", ( float(N_hugeDif) / N_significant) * 100, "%)" print "Applying mask_mapability" Q = Q * np.logical_not( mask_mapability ) + mask_mapability * 1000.0 #all points where reads were not mapable set to 1000 print "Applying p-value mask" Q = Q * np.logical_not( mask_pval ) - mask_pval * 500.0 #all points where reads were p-value > alpha set to -1000 if use_weights != 1: print "Applying weights mask" Q = Q * np.logical_not( mask_hugeDifference ) + mask_hugeDifference * 500.0 #all points where difference is less then use_weights times if domain_res < 1000000: Q = Q[0:5000, 0:5000] print "Warning: saving only a part of picture due to high resolution" print "Plotting contact matrix" plotting.plot_matrix(Q) plt.subplots_adjust(bottom=0.15) print "Saving figure " + figure_path f = open(figure_path, "wb") plt.savefig(figure_path, dpi=600) f.close()
if use_weights != 1: print "Applying weights mask" Q = Q * np.logical_not( mask_hugeDifference ) + mask_hugeDifference * 500.0 #all points where difference is less then use_weights times if domain_res < 1000000: Q = Q[0:5000, 0:5000] print "Warning: saving only a part of picture due to high resolution" st = genome_db1.chrmStartsBinCont[18] end = genome_db1.chrmEndsBinCont[18] chr19 = Q[st:end, st:end] print "Plotting contact matrix" plotting.plot_matrix(Q) plt.subplots_adjust(bottom=0.15) print "Saving figure " + figure_path f = open(figure_path, "wb") plt.savefig(figure_path, dpi=600) f.close() plt.clf() print "Plotting contact matrix" plotting.plot_matrix(chr19) plt.subplots_adjust(bottom=0.15) print "Saving figure " + figure_path + ".chr19.png" f = open(figure_path + ".chr19.png", "wb") plt.savefig(figure_path + ".chr19.png", dpi=600) f.close() np.save(figure_path + "chr19.txt", chr19)
def filter_hires_heatmap(mode="cis", hm_file=""):
    """Filter and iteratively correct a high-resolution heatmap.

    The resolution is taken from the file name via
    ``extractResolutionFromFileName``. Three PNG snapshots (dpi=300) of the
    top-left corner (at most 10000 x 10000 bins) of the ``(0, 0)`` data block
    are saved as ``<hm_file>stage1.png`` (as loaded), ``<hm_file>stage2.png``
    (after diagonal removal) and ``<hm_file>stage3.png`` (after poor-region
    removal and iterative correction). The corrected heatmap is exported to
    ``<hm_file>.IC.<mode>.hdf5``.

    Relies on the module-level names ``genome_db``, ``np`` and
    ``extractResolutionFromFileName``.

    Args:
        mode: passed to ``HiResHiC.loadData`` and used in the export name.
        hm_file: path of the heatmap file to process.

    Raises:
        ValueError: if no resolution can be extracted from ``hm_file``.
    """
    from hiclib import highResBinnedData

    resolution = extractResolutionFromFileName(hm_file)
    if resolution is None:
        # There is no active exception to re-raise at this point, so fail
        # with an explicit, descriptive error instead of a bare `raise`.
        raise ValueError(
            "Cannot extract resolution from file name: %r" % (hm_file,))

    # Create a object, load the data.
    print("creating an object")
    hmap = highResBinnedData.HiResHiC(genome_db, resolution)
    print("loading data")
    hmap.loadData(hm_file, mode=mode)

    print("saving pict of heatmap")
    import matplotlib
    matplotlib.use('Agg')
    import matplotlib.pyplot as plt
    from mirnylib import plotting

    def _save_snapshot(block, stage):
        # Plot the log of the top-left maxlen x maxlen corner of `block` and
        # save it as <hm_file><stage>.png, then clear the figure.
        to_plot = block[0:maxlen, 0:maxlen]
        figure_path = hm_file + stage + ".png"
        print("%s %s" % ("saving ", figure_path))
        plotting.plot_matrix(np.log(to_plot))
        plt.subplots_adjust(bottom=0.15)
        # savefig opens and closes the target itself; no separate handle.
        plt.savefig(figure_path, dpi=300)
        plt.clf()

    chr0array = hmap.data[(0, 0)].getData()
    # Cap the plotted area so very large matrices stay drawable.
    maxlen = min(10000, len(chr0array))
    _save_snapshot(chr0array, "stage1")

    # Remove the contacts between loci located within the same bin +/- 1 bin.
    hmap.removeDiagonal(m=1)
    _save_snapshot(hmap.data[(0, 0)].getData(), "stage2")

    # Removes 0.5 percent of regions with low coverage.
    hmap.removePoorRegions(percent=0.5)

    # Perform iterative correction.
    hmap.iterativeCorrection()
    _save_snapshot(hmap.data[(0, 0)].getData(), "stage3")

    # Save the iteratively corrected heatmap.
    hmap.export(hm_file + ".IC." + mode + ".hdf5")
#chrm2,nt_end = position.chrm,position.nt #assert chrm1==chrm2 #chrm=chrm1 chrmLabel = chrm chrm = genome_db_contig.label2idx[chrm] for data,hmpath,domains in zip([data_dict[chrm],data_dict_second_hmap[chrm]], [hm_file,second_hm_file], [[domains_files_Arm,domains_files_Dix], [second_domains_files_Arm,second_domains_files_Dix]]): left_border = (nt_start-distance)/resolution right_border = (nt_end+distance)/resolution to_plot = np.log(data[left_border:right_border,left_border:right_border]) plotting.plot_matrix(to_plot) plt.plot([distance/resolution,right_border-left_border-(distance)/resolution,(right_border-left_border)-distance/resolution], [distance/resolution,distance/resolution,-(distance/resolution)+right_border-left_border], ls="dashed",color='k') #find_domains_inside_region for domain_file,domains_color in zip(domains,["black","white"]): domain = np.genfromtxt(domain_file,dtype=np.dtype([('chrm','S10'),('start',np.uint32),('end',np.uint32)]),usecols = (0,1,2)) domain = np.sort(domain,order=["chrm","start"]) domains_in_region = [d for d in domain if d["start"]>=(nt_start-distance) and d["end"]<=(nt_end+distance) and d["chrm"].upper()==chrmLabel.upper()] for d in domains_in_region: moving_constant = nt_start-distance st = (d["start"] - moving_constant)/resolution end =(d["end"] - moving_constant)/resolution plt.plot([st,st,end],
for domain,domain_color in zip(domains,colors): if ("chr" in domain[0]["chrm"]) and (not "CHR" in chrmLabel.upper()): addition = "CHR" else: addition = "" domains_in_region = [d for d in domain if d["start"]>=nt_start and d["end"]<=nt_end and d["chrm"].upper()==addition+chrmLabel.upper()] for d in domains_in_region: moving_constant = nt_start st = (d["start"] - moving_constant)/resolution end =(d["end"] - moving_constant)/resolution plt.plot([st,st,end], [st,end,end], ls="solid",linewidth=points_per_dot,color=domain_color) plotting.plot_matrix(to_plot,cmap='OrRd') if right_border-left_border > 10: tick_coeff = 10 else: tick_coeff = 1 plt.yticks(list(range(0,right_border-left_border,((right_border-left_border)/tick_coeff)))) plt.xticks(list(range(0,right_border-left_border,((right_border-left_border)/tick_coeff)))) plt.gca().set_yticklabels(["%.5g" % i for i in range(left_border*resolution, right_border*resolution, ((right_border-left_border)/tick_coeff)*resolution)]) plt.gca().set_xticklabels(["%.5g" % i for i in range(left_border*resolution, right_border*resolution, ((right_border-left_border)/tick_coeff)*resolution)], rotation="vertical") plt.title(title)
vmin = min([np.min(i) for i in saddles.values()]) vmax = max([np.max(i) for i in saddles.values()]) if not os.path.isdir(figure_path): os.mkdir(figure_path) figure_path += "/" print "Saving results to", figure_path for i in saddles: print "saving ", i plt.clf() np.savetxt(figure_path + i + ".txt", saddles[i]) plotting.plot_matrix(np.log(saddles[i])) plt.savefig(figure_path + i + ".png", dpi=300) all_average = np.zeros((5, 5), dtype=float) for i in range(5): for j in range(5): all_average[i, j] = np.average([ saddles[c][i, j] for c in saddles if not np.isnan(saddles[c][i, j]) ]) all_mean = np.zeros((5, 5), dtype=float) for i in range(5): for j in range(5): all_mean[i, j] = np.nanmean([ saddles[c][i, j] for c in saddles if not np.isnan(saddles[c][i, j]) ])
def filter_bychr_heatmap(hm_file):
    """Filter and iteratively correct a binned heatmap.

    The resolution is taken from the file name via
    ``extractResolutionFromFileName``. Three PNG snapshots of the top-left
    corner (at most 10000 x 10000 bins) of the heatmap are saved as
    ``<hm_file>stage1.png`` (as loaded, dpi=600), ``<hm_file>stage2.png``
    (after filtering, dpi=200) and ``<hm_file>stage3.png`` (after iterative
    correction, dpi=600). The corrected heatmap is exported to
    ``<hm_file>.IC.hdf5``.

    Relies on the module-level names ``genome_db``, ``np`` and
    ``extractResolutionFromFileName``.

    Args:
        hm_file: path of the heatmap file to process.

    Raises:
        ValueError: if no resolution can be extracted from ``hm_file``.
    """
    resolution = extractResolutionFromFileName(hm_file)
    if resolution is None:
        # There is no active exception to re-raise at this point, so fail
        # with an explicit, descriptive error instead of a bare `raise`.
        raise ValueError(
            "Cannot extract resolution from file name: %r" % (hm_file,))

    from hiclib import binnedData

    # Create a object, load the data.
    print("creating an object")
    hmap = binnedData.binnedData(resolution, genome_db)
    print("loading data")
    hmap.simpleLoad(hm_file, "heatmap")

    print("saving pict of heatmap")
    import matplotlib
    matplotlib.use('Agg')
    import matplotlib.pyplot as plt
    from mirnylib import plotting

    def _save_snapshot(stage, dpi):
        # Plot the log of the current top-left maxlen x maxlen corner of the
        # heatmap and save it as <hm_file><stage>.png, then clear the figure.
        a = hmap.dataDict["heatmap"][0:maxlen, 0:maxlen]
        figure_path = hm_file + stage + ".png"
        print("%s %s" % ("saving ", figure_path))
        plotting.plot_matrix(np.log(a))
        plt.subplots_adjust(bottom=0.15)
        # savefig opens and closes the target itself; no separate handle.
        plt.savefig(figure_path, dpi=dpi)
        plt.clf()

    # Cap the plotted area so very large matrices stay drawable.
    maxlen = min(10000, len(hmap.dataDict["heatmap"]))
    _save_snapshot("stage1", 600)

    # Remove the contacts between loci located within the same bin +/- 1 bin.
    hmap.removeDiagonal(m=1)

    # New filter: omit all bins with less than 0.5 coverage by sequenced bases
    # (i.e. bases present in the genome).
    hmap.removeBySequencedCount()

    # Remove .5% bins with the lowest number of records (i.e. non-zero entries
    # in the matrix). This filter was updated to remove bins which have zero
    # contacts and one PCR blowout. Those bins would have many reads, but all
    # reads will be with one or few other bins.
    hmap.removePoorRegions(cutoff=0.5, coverage=True)

    # Remove PCR blowouts from trans data.
    hmap.truncTrans()
    _save_snapshot("stage2", 200)

    # Do iterative correction.
    hmap.iterativeCorrectWithoutSS(force=True)
    _save_snapshot("stage3", 600)

    # Save the iteratively corrected heatmap.
    hmap.export("heatmap", hm_file + ".IC.hdf5", False)
def step3(hiclib_path, sraid, res=1000000):
    """3. Filter and iteratively correct heatmaps.

    http://mirnylab.bitbucket.org/hiclib/tutorial/03_heatmap_processing.html

    Input is ``<sraid>_map-res<res/1000>k.hdf5``. Outputs share that prefix:

    * ``.pdf``: log of the heatmap as loaded;
    * ``-ic.hdf5``: the iteratively corrected heatmap;
    * ``-ic.pdf``: log of the corrected heatmap;
    * ``-ic-bias.txt``: one tab-separated row per bin holding chromosome,
      start, start + res and the bin's bias value.

    Args:
        hiclib_path: directory that contains ``fasta/hg19``.
        sraid: prefix shared by the input and all output files.
        res: bin size; used for the file-name suffix and for the end
            coordinate written to the bias table.
    """
    import matplotlib.pyplot as plt
    import numpy as np
    from mirnylib import genome
    from mirnylib import h5dict
    from mirnylib import plotting
    from hiclib import binnedData

    genome_db = genome.Genome(hiclib_path + '/fasta/hg19',
                              readChrms=['#', 'X'])

    # Floor division keeps the suffix integral (e.g. "1000k") even where "/"
    # performs true division.
    prefix = sraid + '_map-res%sk' % (res // 1000)
    heatmap_path = prefix + '.hdf5'

    # Read resolution from the dataset.
    raw_heatmap = h5dict.h5dict(heatmap_path, mode='r')
    resolution = int(raw_heatmap['resolution'])

    # Create a binnedData object, load the data.
    BD = binnedData.binnedData(resolution, genome_db)
    BD.simpleLoad(heatmap_path, 'DataName')

    # Plot the heatmap directly.
    plotting.plot_matrix(np.log(BD.dataDict['DataName']))
    plt.savefig(prefix + '.pdf')
    plt.clf()

    # Remove the contacts between loci located within the same bin.
    BD.removeDiagonal()
    # Remove bins with less than half of a bin sequenced.
    BD.removeBySequencedCount(0.5)
    # Remove 1% of regions with low coverage.
    BD.removePoorRegions(cutoff=1)
    # Truncate top 0.05% of interchromosomal counts (possibly, PCR blowouts).
    BD.truncTrans(high=0.0005)
    # Perform iterative correction.
    BD.iterativeCorrectWithoutSS()

    # Save the iteratively corrected heatmap.
    BD.export('DataName', prefix + '-ic.hdf5')

    # Plot the corrected heatmap.
    plotting.plot_matrix(np.log(BD.dataDict['DataName']))
    plt.savefig(prefix + '-ic.pdf')
    plt.clf()

    # Save the per-bin bias; `with` guarantees the file is flushed and closed
    # even when writing a row raises.
    with open(prefix + "-ic-bias.txt", "w") as outfile:
        for i, chrom_idx in enumerate(BD.chromosomeIndex):
            chro = BD.genome.idx2label[chrom_idx]
            posi = BD.positionIndex[i]
            outfile.write("chr%s\t%s\t%s" % (chro, posi, posi + res))
            outfile.write("\t%s" % BD.biasDict['DataName'][i])
            outfile.write("\n")
# Disabled code kept for reference: eigenvector sign flip, eigenvector plot and
# text export, and a fixed colour-range check for q1.
#   currentEIG[0]=-currentEIG[0]
#print currentEIG
#print "Saving figure "+figure_path+".eig.png"
#f = open(figure_path+".eig.png", "wb")
#plt.plot(range(len(currentEIG[0])),currentEIG[0],"bo")
#plt.savefig(figure_path+".eig.png",dpi=600)
#f.close()
#plt.clf()
#f1 = open(figure_path+".eig", "wb")
#for i in range(len(currentEIG[0])):
#   s = "chr"+str(i)+"\t"+str(currentEIG[0][i])+"\n"
#   f1.write(s)
#f1.close()
#mi=0.8 #Min and Max values for pictures
#ma=1.2 #To make colors same
#if (np.max(q1) > ma) or (np.min(q1) < mi):
#   print "Current max and min are ",np.max(q1),np.min(q1)
#   raise Exception("Array values out of "+str(mi)+"\t"+str(ma))

# Draw q1 with the 'seismic' colormap (colour range chosen by plot_matrix, the
# fixed vmin/vmax variant is disabled) and save it to figure_path at dpi=600.
print("Plotting contact matrix")
#plotting.plot_matrix(q1,vmin=mi, vmax=ma, cmap='seismic')
plotting.plot_matrix(q1, cmap='seismic')
plt.subplots_adjust(bottom=0.15)
print("Saving figure " + figure_path)
# savefig opens and closes the target itself; no separate handle is needed.
plt.savefig(figure_path, dpi=600)
def sortfunct(s): if s in ["chrW","chrZ"]: return 10000000 else: return 100*len(s) sortorder = sorted(range(len(order)), key=lambda k: (sortfunct(order[k]),order[k])) print order print sortorder print order[sortorder] for i in xrange(len(q2)): for j in xrange(len(q2)): q3[i,j] = q2[sortorder[i],sortorder[j]] print order[sortorder] plotting.plot_matrix(q3,cmap='seismic',vmin=-1.5,vmax=4,ticklabels1=order[sortorder]) plt.subplots_adjust(bottom=0.15) print "Saving figure "+figure_path f = open(figure_path, "wb") plt.savefig(figure_path,dpi=300) f.close() for ind,st,end in zip(sortorder,genome_db.chrmStartsBinCont[sortorder],genome_db.chrmEndsBinCont[sortorder]): L = genome_db.chrmLens[ind] Total_L = sum(genome_db.chrmLens) intra = float(np.sum(q[st:end,st:end])) inter = np.sum(q[st:end])-intra intra /= 2. intra = intra / L**2 inter = inter / (L*(Total_L-L))
continue #remove contacts with cromosome itself s1 = float(np.sum(res[i, :])) s2 = float(np.sum(res[:, j])) p1 = (s1 / total) * (s2 / (total - s1)) p2 = (s2 / total) * (s1 / (total - s2)) znam = (p1 + p2) * (total / 2.0) if znam == 0: res_probablities[i, j] = None else: res_probablities[i, j] = res[i, j] / znam #res_probablities=np.log2(res_probablities) mi = np.min(res_probablities) #0.8 #Min and Max values for pictures ma = np.max(res_probablities) #1.4 #To make colors same if (np.max(res_probablities) > ma) or (np.min(res_probablities) < mi): print "Current max and min are ", np.max(res_probablities), np.min( res_probablities) raise Exception("Array values out of " + str(mi) + "\t" + str(ma)) print "Plotting contact matrix" #plotting.plot_matrix(res_probablities,vmin=mi, vmax=ma) plotting.plot_matrix(res_probablities) plt.subplots_adjust(bottom=0.15) print "Saving figure " + figure_path f = open(figure_path, "wb") plt.savefig(figure_path, dpi=600) f.close()
# Report the total of all values in `counts`. The trailing comma keeps both
# print statements on the same output line.
print "sum = " ,
print sum(counts.values())

# Save a 1 Mb heatmap, load it back as binnedData and save a picture of it.
for res in [1000000]:
    #TR.saveCooler(out_file+".{0}.cool".format(res), res)
    #pass
    f1.saveHeatmap(out_file+".{0}.hm".format(res), res)
    BD = binnedData.binnedData(res, genome_db)
    BD.simpleLoad(out_file+".{0}.hm".format(res), enzyme)
    plotting.plot_matrix(BD.dataDict[enzyme], clip_min = 0, clip_max = 1, cmap='Blues')
    #plt.colorbar(extend='both')
    #plt.clim(0, 1); #, label = "'viridis'"
    # NOTE(review): figsize is normally a property of the Figure rather than an
    # argument of savefig - confirm the installed matplotlib accepts/honours it.
    plt.savefig(out_file+".{0}.png".format(res), dpi=300, figsize=(16, 16))
    plt.close()

# Write cooler and heatmap files at both resolutions. The 1 Mb heatmap file was
# already written by the loop above and is written again here.
for res in [1000000, 200000]:
    f1.saveCooler(out_file+".{0}.cool".format(res), res)
    f1.saveHeatmap(out_file+".{0}.hm".format(res), res)
import scipy.spatial.distance as ssd # convert the redundant n*n square matrix form into a condensed nC2 array print len(interchr_contacts2), genome_db.chrmCount distArray = ssd.squareform( interchr_contacts2 ) # distArray[{n choose 2}-{n-i choose 2} + (j-i-1)] is the distance between points i and j from scipy.cluster.hierarchy import dendrogram, linkage Z = linkage(distArray, 'single') from scipy.cluster.hierarchy import fcluster result = fcluster(Z, 40, criterion='maxclust') print result print max(result) plt.figure(figsize=(25, 10)) plt.title('Hierarchical Clustering Dendrogram') plt.xlabel('sample index') plt.ylabel('distance') dendrogram( Z, leaf_rotation=90., # rotates the x axis labels leaf_font_size=8., # font size for the x axis labels ) plt.savefig("test.png") plt.clf() plt.figure(figsize=(25, 10)) plotting.plot_matrix(interchr_contacts) plt.savefig("test2.png") plt.clf()
# For every heatmap file coarser than `min_resolution` kb: filter it, run
# iterative correction and save a picture of log(counts) next to the file.
min_resolution = 200
# The resolution is parsed from the file name: the text between the last "-"
# and the first following "k" is read as an integer number of kb.
files = [
    i for i in files
    if int(i.split("-")[-1].split("k")[0]) > min_resolution
]
print("%s %s" % ("Processing following files:", "\n".join(files)))

for file in files:
    file = base_folder + "/" + file
    figure_path = file + '.png'

    # Read resolution from the dataset.
    raw_heatmap = h5dict.h5dict(file, mode='r')
    resolution = int(raw_heatmap['resolution'])

    BD = binnedData.binnedData(resolution, genome_db)
    BD.simpleLoad(file, 'HindIII')

    # Remove bins with less than half of a bin sequenced.
    BD.removeBySequencedCount(0.5)
    # Remove 1% of regions with low coverage.
    BD.removePoorRegions(cutoff=1)
    # Truncate top 0.05% of interchromosomal counts (possibly, PCR blowouts).
    BD.truncTrans(high=0.0005)
    # Perform iterative correction.
    BD.iterativeCorrectWithoutSS()

    q = np.log(BD.dataDict['HindIII'])
    #if domain_res < 1000000:
    #   print "Matrix is too big, have to resize it"
    #   q=resize_matrix(q,1000000/domain_res,np.max)

    print("%s %s" % ("saving ", figure_path))
    plotting.plot_matrix(q)
    plt.subplots_adjust(bottom=0.15)
    # savefig opens and closes the target itself; no separate handle is needed.
    plt.savefig(figure_path, dpi=600)
    plt.clf()