예제 #1
0
def plot_one_chr_fragment(matrix, figure_path, chr_name, ch_start, ch_end):
    print "Plotting picture"
    ch_start = ch_start / domain_res
    ch_end = ch_end / domain_res
    # domain_st = domain_st / domain_res
    # domain_end = domain_end / domain_res
    i = genome_db.label2idx[chr_name]
    q2_0 = matrix[i]
    q2_0 = q2_0[ch_start:ch_end, ch_start:ch_end]
    #	np.savetxt("test_for_plot.txt",q2_0)
    plotting.plot_rotated_matrix(np.log(q2_0), 0, cmap='OrRd')
    print ch_start, ch_end  #,domain_st,domain_end
    #	print domain_st-ch_start,domain_end-ch_start
    #	plt.axvline(x=domain_st-ch_start)
    #	plt.axvline(x=domain_end-ch_start)
    plt.subplots_adjust(bottom=0.15)
    fp1 = figure_path + "_" + chr_name + "_bin_" + str(
        ch_start) + "_to_" + str(ch_end) + ".rotated.png"
    print "Saving figure " + fp1
    f = open(fp1, "wb")
    plt.savefig(fp1, dpi=900)
    f.close()

    plt.clf()
    plotting.plot_matrix(np.log(q2_0), cmap='OrRd')
    fp1 = figure_path + "_" + chr_name + "_bin_" + str(
        ch_start) + "_to_" + str(ch_end) + ".png"
    print "Saving figure " + fp1
    f = open(fp1, "wb")
    plt.savefig(fp1, dpi=900)
    f.close()
예제 #2
0
def iterativeFiltering(genome_db, fragments):
    """Filter the binned heatmap and run iterative correction on it.

    Reads ``heatmap-res-1M.hdf5`` from ``options.outputDir``, applies the
    filters below to the dataset named ``options.experiment``, writes the
    result to ``IC-heatmap-res-1M.hdf5`` in the same directory and plots the
    log of the corrected matrix.  ``fragments`` is not used by this function.
    """
    heatmap_file = options.outputDir + 'heatmap-res-1M.hdf5'
    dataset = options.experiment

    # The resolution is stored inside the heatmap file itself.
    source = h5dict.h5dict(heatmap_file, mode='r')
    bin_size = int(source['resolution'])

    binned = binnedData.binnedData(bin_size, genome_db)
    binned.simpleLoad(heatmap_file, dataset)

    # Drop contacts between loci that fall into the same bin.
    binned.removeDiagonal()
    # Drop bins with less than half of a bin sequenced.
    binned.removeBySequencedCount(0.5)
    # Drop 1% of regions with low coverage.
    binned.removePoorRegions(cutoff=1)
    # Truncate top 0.05% of interchromosomal counts (possibly, PCR blowouts).
    binned.truncTrans(high=0.0005)

    binned.iterativeCorrectWithoutSS()

    # Persist the corrected heatmap, then show it on a log scale.
    binned.export(dataset, options.outputDir + 'IC-heatmap-res-1M.hdf5')
    plotting.plot_matrix(np.log(binned.dataDict[dataset]))
예제 #3
0
def step3(hiclib_path, sraid, res=1000000):
    """3. Filter and iteratively correct heatmaps.

    http://mirnylab.bitbucket.org/hiclib/tutorial/03_heatmap_processing.html

    Args:
        hiclib_path: directory that contains ``fasta/hg19``.
        sraid: prefix shared by the input and output file names.
        res: value used to build the file names (``res / 1000`` followed by
            ``k``) and the end coordinate of each row of the bias table.

    Reads ``<sraid>_map-res<res/1000>k.hdf5`` and writes, with the same
    prefix, ``.pdf`` (plot of the loaded heatmap), ``-ic.hdf5`` (corrected
    heatmap), ``-ic.pdf`` (plot of the corrected heatmap) and
    ``-ic-bias.txt`` (one tab-separated line per index of
    ``BD.chromosomeIndex``: ``chr<label>``, position, position + res, bias).
    """
    import matplotlib.pyplot as plt
    import numpy as np

    from mirnylib import genome
    from mirnylib import h5dict
    from mirnylib import plotting
    from hiclib import binnedData

    genome_db = genome.Genome(hiclib_path+'/fasta/hg19', readChrms=['#', 'X'])

    # Every file of this step shares the same name prefix.
    prefix = sraid + '_map-res%sk' % (res / 1000)
    heatmap_path = prefix + '.hdf5'

    # Read resolution from the dataset.
    raw_heatmap = h5dict.h5dict(heatmap_path, mode='r')
    resolution = int(raw_heatmap['resolution'])

    # Create a binnedData object, load the data.
    BD = binnedData.binnedData(resolution, genome_db)
    BD.simpleLoad(heatmap_path, 'DataName')

    # Plot the heatmap directly.
    plotting.plot_matrix(np.log(BD.dataDict['DataName']))
    plt.savefig(prefix + '.pdf')
    plt.clf()

    # Remove the contacts between loci located within the same bin.
    BD.removeDiagonal()

    # Remove bins with less than half of a bin sequenced.
    BD.removeBySequencedCount(0.5)

    # Remove 1% of regions with low coverage.
    BD.removePoorRegions(cutoff=1)

    # Truncate top 0.05% of interchromosomal counts (possibly, PCR blowouts).
    BD.truncTrans(high=0.0005)

    # Perform iterative correction.
    BD.iterativeCorrectWithoutSS()

    # Save the iteratively corrected heatmap.
    BD.export('DataName', prefix + '-ic.hdf5')

    # Plot the corrected heatmap.
    plotting.plot_matrix(np.log(BD.dataDict['DataName']))
    plt.savefig(prefix + '-ic.pdf')
    plt.clf()

    # Save Bias.  The context manager closes the file even if a write fails.
    bias = BD.biasDict['DataName']
    with open(prefix + "-ic-bias.txt", "w") as outfile:
        for i in xrange(len(BD.chromosomeIndex)):
            chro = BD.genome.idx2label[BD.chromosomeIndex[i]]
            posi = BD.positionIndex[i]
            # NOTE: the end coordinate uses the ``res`` argument, not the
            # resolution read from the file.
            outfile.write("chr%s\t%s\t%s\t%s\n" %
                          (chro, posi, posi + res, bias[i]))
def plot_one_chr_pict(matrix,figure_path,chr_numb): #chr_numb is zero-based
	print "Plotting picture"
	i=chr_numb
	q2_0=matrix[st[i]:end[i],st[i]:end[i]]
	plotting.plot_matrix(np.log(q2_0),cmap='OrRd')
	plt.subplots_adjust(bottom=0.15)
	fp1=figure_path+str(domain_res)+"_1000KB_chr"+str(i+1)+".png"
	print "Saving figure "+fp1
	f = open(fp1, "wb")
	plt.savefig(fp1,dpi=900)
	f.close()
	plt.clf()
예제 #5
0
def step4(hiclib_path, sraid, res=1000000):
    """4. Eigen vector decomposition

    /examples/iterativeCorrectionEigenvectorExpansion/eigenvectorAnalysis.py

    Args:
        hiclib_path: directory that contains ``fasta/hg19``.
        sraid: prefix shared by the input and output file names.
        res: value used to build the file names (``res / 1000`` followed by
            ``k``) and the end coordinate of each row of the output table.

    Reads ``<sraid>_map-res<res/1000>k.hdf5`` and writes, with the same
    prefix, ``-eig.pdf`` (log plot of ``eig_v.T * diag(eig) * eig_v``) and
    ``-ic-eig.txt`` (one tab-separated line per index of
    ``BD.chromosomeIndex``: ``chr<label>``, position, position + res, then
    one value per row of ``BD.EigDict['DataName']``).
    """
    import matplotlib.pyplot as plt
    import numpy as np
    from mirnylib import genome
    from mirnylib import h5dict
    from mirnylib import plotting
    from hiclib import binnedData

    genome_db = genome.Genome(hiclib_path + '/fasta/hg19',
                              readChrms=['#', 'X'])

    # Every file of this step shares the same name prefix.
    prefix = sraid + '_map-res%sk' % (res / 1000)
    heatmap_path = prefix + '.hdf5'

    # Read resolution from the dataset.
    raw_heatmap = h5dict.h5dict(heatmap_path, mode='r')
    resolution = int(raw_heatmap['resolution'])

    # Create a binnedData object, load the data.
    BD = binnedData.binnedData(resolution, genome_db)
    BD.simpleLoad(heatmap_path, 'DataName')

    # Do eigen decomposition
    BD.removeDiagonal()
    BD.removeBySequencedCount(0.5)
    BD.removeCis()
    BD.truncTrans(high=0.0005)
    BD.removePoorRegions(cutoff=1)
    BD.fakeCis()
    BD.removeZeros()
    BD.doEig(numPCs=30, force=True)  ## First 30 EIGs
    BD.restoreZeros(value=0)

    eig = BD.eigEigenvalueDict['DataName']
    eig_v = BD.EigDict['DataName']

    # Plot the matrix rebuilt from the eigenvectors and eigenvalues.
    plotting.plot_matrix(np.log(np.dot(np.dot(eig_v.T, np.diag(eig)), eig_v)))
    plt.savefig(prefix + '-eig.pdf')
    plt.clf()

    # The context manager closes the file even if a write fails.
    with open(prefix + "-ic-eig.txt", "w") as outfile:
        for i in xrange(len(BD.chromosomeIndex)):
            chro = BD.genome.idx2label[BD.chromosomeIndex[i]]
            posi = BD.positionIndex[i]
            # NOTE: the end coordinate uses the ``res`` argument, not the
            # resolution read from the file.
            fields = ["chr%s" % chro, "%s" % posi, "%s" % (posi + res)]
            fields.extend("%s" % eigenvector[i] for eigenvector in eig_v)
            outfile.write("\t".join(fields) + "\n")
예제 #6
0
def step4(hiclib_path, sraid, res=1000000):
    """4. Eigen vector decomposition

    /examples/iterativeCorrectionEigenvectorExpansion/eigenvectorAnalysis.py

    Args:
        hiclib_path: directory that contains ``fasta/hg19``.
        sraid: prefix shared by the input and output file names.
        res: value used to build the file names (``res / 1000`` followed by
            ``k``) and the end coordinate of each row of the output table.

    Reads ``<sraid>_map-res<res/1000>k.hdf5`` and writes, with the same
    prefix, ``-eig.pdf`` (log plot of ``eig_v.T * diag(eig) * eig_v``) and
    ``-ic-eig.txt`` (one tab-separated line per index of
    ``BD.chromosomeIndex``: ``chr<label>``, position, position + res, then
    one value per row of ``BD.EigDict['DataName']``).
    """
    import matplotlib.pyplot as plt
    import numpy as np
    from mirnylib import genome
    from mirnylib import h5dict
    from mirnylib import plotting
    from hiclib import binnedData

    genome_db = genome.Genome(hiclib_path+'/fasta/hg19', readChrms=['#', 'X'])

    # Input and output names are all derived from one prefix.
    name_prefix = sraid + '_map-res%sk' % (res/1000)
    input_path = name_prefix + '.hdf5'

    # Read resolution from the dataset.
    raw_heatmap = h5dict.h5dict(input_path, mode='r')
    resolution = int(raw_heatmap['resolution'])

    # Create a binnedData object, load the data.
    BD = binnedData.binnedData(resolution, genome_db)
    BD.simpleLoad(input_path, 'DataName')

    # Do eigen decomposition
    BD.removeDiagonal()
    BD.removeBySequencedCount(0.5)
    BD.removeCis()
    BD.truncTrans(high=0.0005)
    BD.removePoorRegions(cutoff=1)
    BD.fakeCis()
    BD.removeZeros()
    BD.doEig(numPCs=30, force=True) ## First 30 EIGs
    BD.restoreZeros(value=0)

    eig = BD.eigEigenvalueDict['DataName']
    eig_v = BD.EigDict['DataName']

    # Plot the matrix rebuilt from the eigenvectors and eigenvalues.
    plotting.plot_matrix(np.log(np.dot(np.dot(eig_v.T, np.diag(eig)), eig_v)))
    plt.savefig(name_prefix + '-eig.pdf')
    plt.clf()

    # ``with`` guarantees the table is flushed and closed on any exit path.
    with open(name_prefix + "-ic-eig.txt", "w") as outfile:
        for i in xrange(len(BD.chromosomeIndex)):
            chro = BD.genome.idx2label[BD.chromosomeIndex[i]]
            posi = BD.positionIndex[i]
            # NOTE: the end coordinate uses the ``res`` argument, not the
            # resolution read from the file.
            columns = ["chr%s" % chro, "%s" % posi, "%s" % (posi+res)]
            columns.extend("%s" % eigenvector[i] for eigenvector in eig_v)
            outfile.write("\t".join(columns) + "\n")
예제 #7
0
def plot_one_chr_fragment(matrix, figure_path, chr_numb, ch_start,
                          ch_end):  #chr_numb is zero-based
    print "Plotting picture"
    ch_start = ch_start / domain_res
    ch_end = ch_end / domain_res
    i = chr_numb
    q2_0 = matrix[st[i]:end[i], st[i]:end[i]]
    q2_0 = q2_0[ch_start:ch_end, ch_start:ch_end]
    plotting.plot_matrix(np.log(q2_0), cmap='OrRd')
    plt.subplots_adjust(bottom=0.15)
    fp1 = figure_path + str(domain_res) + "_40KB_chr" + str(
        i + 1) + "_bin_" + str(ch_start) + "_to_" + str(ch_end) + ".png"
    print "Saving figure " + fp1
    f = open(fp1, "wb")
    plt.savefig(fp1, dpi=900)
    f.close()
    plt.clf()
예제 #8
0
def iterativeFiltering(genome_db, filesuffix):
    """Filter the binned heatmap and run iterative correction on it.

    The input file is ``options.outputDir + options.experiment +
    filesuffix``; the corrected heatmap is exported next to it with
    ``-IC`` inserted before ``filesuffix``.  A log-scaled plot of the
    corrected matrix is drawn on a new figure and saved through the
    module-level ``pp`` object.
    """
    dataset = options.experiment
    input_path = options.outputDir + dataset + filesuffix

    # The resolution is stored inside the heatmap file itself.
    source = h5dict.h5dict(input_path, mode='r')
    bin_size = int(source['resolution'])

    binned = binnedData.binnedData(bin_size, genome_db)
    binned.simpleLoad(input_path, dataset)

    # Drop contacts between loci that fall into the same bin.
    binned.removeDiagonal()
    # Drop bins with less than half of a bin sequenced.
    binned.removeBySequencedCount(0.5)
    # Drop 1% of regions with low coverage.
    binned.removePoorRegions(cutoff=1)
    # Truncate top 0.05% of interchromosomal counts (possibly, PCR blowouts).
    binned.truncTrans(high=0.0005)
    # Drop empty bins.
    binned.removeZeros()

    binned.iterativeCorrectWithoutSS()

    # Persist the corrected heatmap.
    corrected_path = options.outputDir + dataset + '-IC' + filesuffix
    binned.export(dataset, corrected_path)

    plt.figure()
    plotting.plot_matrix(np.log(binned.dataDict[dataset]))
    pp.savefig()
예제 #9
0
# Script: load the heatmap stored at heatmap_filepath under the name
# 'HindIII_GM_1', filter it, run iterative correction, export the result to
# heatmap_filepath + '_corrected' and save a log-scaled picture to 'Sp.png'.

# Read the resolution from the heatmap file itself (opened read-only).
raw_heatmap = h5dict.h5dict(heatmap_filepath, mode='r')
resolution = int(raw_heatmap['resolution'])

# Create a binnedData object, load the data.
BD = binnedData.binnedData(resolution, genome_db)
BD.simpleLoad(heatmap_filepath, 'HindIII_GM_1')

# Remove the contacts between loci located within the same bin.
BD.removeDiagonal()

# Remove bins with less than half of a bin sequenced.
BD.removeBySequencedCount(0.5)

# Remove 1% of regions with low coverage.
BD.removePoorRegions(cutoff=1)

# Truncate top 0.05% of interchromosomal counts (possibly, PCR blowouts).
BD.truncTrans(high=0.0005)

# Perform iterative correction.
BD.iterativeCorrectWithoutSS()

# Save the iteratively corrected heatmap.
BD.export('HindIII_GM_1', heatmap_filepath + '_corrected')

# Plot the corrected heatmap on a log scale.
plotting.plot_matrix(np.log(BD.dataDict['HindIII_GM_1']))

# Interactive display is disabled; the picture goes to a fixed file name in
# the current working directory.
#plt.show()
plt.savefig('Sp.png')
예제 #10
0
# Script: load the 'Rao2014_10M' dataset from the 1M heatmap, filter it, run
# iterative correction, export it to ./IC-heatmap-res-1M.hdf5 and save a
# log-scaled picture to ./heatmap.pdf.

# Read the resolution from the heatmap file itself (opened read-only).
raw_heatmap = h5dict.h5dict('../2_filtering_reads/heatmap-res-1M.hdf5',
                            mode='r')
resolution = int(raw_heatmap['resolution'])

BD = binnedData.binnedData(resolution, genome_db)
BD.simpleLoad('../2_filtering_reads/heatmap-res-1M.hdf5', 'Rao2014_10M')

# NOTE: diagonal removal is switched off in this script.
#BD.removeDiagonal()
BD.removeBySequencedCount(0.5)
BD.removePoorRegions(cutoff=1)
BD.truncTrans(high=0.0005)
BD.iterativeCorrectWithoutSS()

BD.export('Rao2014_10M', './IC-heatmap-res-1M.hdf5')

fig = plt.figure()
# 1.0 is added before taking the log, so zero entries map to 0 instead of -inf.
plotting.plot_matrix(np.log(BD.dataDict['Rao2014_10M'] + 1.0))
fig.savefig('./heatmap.pdf')
# The string literal below is disabled code (an eigendecomposition step);
# it is evaluated as a bare string expression and has no effect.
'''
print 'Eigendecomposition...'
BD.removeCis()
BD.fakeCis()
BD.removeZeros()
BD.doEig(numPCs=1) # first principal component
BD.restoreZeros(value=0)
eigenvectors = BD.EigDict['Rao2014_10M']
print eigenvectors[:,:10]
np.savetxt('egvecs.txt', eigenvectors)
'''
예제 #11
0
	l = genome_db1.chrmLensBin[i]
	for j in xrange(0,l): #Adjusting everything except main diagonal
		idx1 = range(st[i],end[i]-j) #i-indices for diagonal elements
		idx2 = range(st[i]+j,end[i]) #j-indices for diagonal elements
		if (N1[j] == 0) or (N2[j]==0):
			q2[idx1,idx2] = 0
		else:
			if (Err[j] <= 0.05): #applay only for points where error is less then 5%
				q2[idx1,idx2] = q2[idx1,idx2]*(N1[j]/N2[j])

for i in xrange(0,len(q2)): #Since we have modified only 1/2 of array, we need to applay all same to the bottom half now
	for j in xrange(i,len(q2)):
		q2[j,i] = q2[i,j]

print "Adjusting total contact number after correction"

s3 = np.sum([np.sum(BD2.dataDict['HindIII_GM_1'][st[i]:end[i],st[i]:end[i]]) for i in chrms])/ np.sum([np.sum(q2[st[i]:end[i],st[i]:end[i]]) for i in chrms])
for i in chrms:
	q2[st[i]:end[i],st[i]:end[i]] *= s3

print "Exporting heatmap to ",out_heatmap_filepath2
BD2.dataDict['HindIII_GM_1'] = q2
BD2.export('HindIII_GM_1',out_heatmap_filepath2)

print "Plotting contact matrix"
plotting.plot_matrix(np.log(q2))
plt.subplots_adjust(bottom=0.15)
print "Saving figure "+figure_path
f = open(figure_path, "wb")
plt.savefig(figure_path,dpi=600)
f.close()
예제 #12
0
# Script: compute saddle matrices and their strength from q and the E1
# track, estimate the spread of the strength by shuffling E1 100 times, and
# save every saddle matrix as text and as a log-scaled picture.

# With dtype=None genfromtxt returns a structured array; 'f2' is its third
# field.  There must be one value per row of q.
E1_values = np.genfromtxt(E1_file, dtype=None)['f2']
assert len(E1_values) == len(q)

saddles, strength = doSaddles(q, E1_values, genome_db)

vmin = np.min(saddles["all_average"])
vmax = np.max(saddles["all_average"])
print "Min = ", vmin, "Max = ", vmax

# Shuffle E1 to get a bootstrap control.
# NOTE: np.random.shuffle works in place, so after this loop E1_values no
# longer holds the original order.
import pickle
SD = []
for i in range(100):
    if i % 5 == 0:
        print "Bootstraping ", i, "%"
    np.random.shuffle(E1_values)
    SD.append(doSaddles(q, E1_values, genome_db))
# The results are dumped wrapped in one extra list.
# NOTE(review): the file is opened in text mode ("w") — confirm this is
# intended for the pickle protocol and platform in use.
with open(figure_path + "bootstrup_dump", "w") as f:
    pickle.dump([SD], f)

# Element 1 of every doSaddles result is the strength value.
print "Strength=", strength, "+/-", np.std([i[1] for i in SD])
print "Bootstrap average = ", np.average([i[1] for i in SD])

# One .txt (raw values) and one .png (log scale) per saddle matrix; only the
# "all_average" picture gets the strength in its title.
for i in saddles:
    print "saving ", i
    plt.clf()
    np.savetxt(figure_path + i + ".txt", saddles[i])
    plotting.plot_matrix(np.log(saddles[i]))
    if i == "all_average":
        plt.title("Strength=" + str(strength))
    plt.savefig(figure_path + i + ".png", dpi=300)
예제 #13
0
        np.logical_not(mask_pval),
        mask_hugeDifference)  # take out elements that are already in p-val
    N_hugeDif = N_significant - np.sum(mask_hugeDifference)
    print "Number of values with difference >= ", use_weights, " times = ", N_hugeDif, " out of ", N_significant, "(", (
        float(N_hugeDif) / N_significant) * 100, "%)"

print "Applying mask_mapability"
Q = Q * np.logical_not(
    mask_mapability
) + mask_mapability * 1000.0  #all points where reads were not mapable set to 1000
print "Applying p-value mask"
Q = Q * np.logical_not(
    mask_pval
) - mask_pval * 500.0  #all points where reads were p-value > alpha set to -1000
if use_weights != 1:
    print "Applying weights mask"
    Q = Q * np.logical_not(
        mask_hugeDifference
    ) + mask_hugeDifference * 500.0  #all points where difference is less then use_weights times

if domain_res < 1000000:
    Q = Q[0:5000, 0:5000]
    print "Warning: saving only a part of picture due to high resolution"

print "Plotting contact matrix"
plotting.plot_matrix(Q)
plt.subplots_adjust(bottom=0.15)
print "Saving figure " + figure_path
f = open(figure_path, "wb")
plt.savefig(figure_path, dpi=600)
f.close()
if use_weights != 1:
    print "Applying weights mask"
    Q = Q * np.logical_not(
        mask_hugeDifference
    ) + mask_hugeDifference * 500.0  #all points where difference is less then use_weights times

if domain_res < 1000000:
    Q = Q[0:5000, 0:5000]
    print "Warning: saving only a part of picture due to high resolution"

st = genome_db1.chrmStartsBinCont[18]
end = genome_db1.chrmEndsBinCont[18]
chr19 = Q[st:end, st:end]

print "Plotting contact matrix"
plotting.plot_matrix(Q)
plt.subplots_adjust(bottom=0.15)
print "Saving figure " + figure_path
f = open(figure_path, "wb")
plt.savefig(figure_path, dpi=600)
f.close()

plt.clf()
print "Plotting contact matrix"
plotting.plot_matrix(chr19)
plt.subplots_adjust(bottom=0.15)
print "Saving figure " + figure_path + ".chr19.png"
f = open(figure_path + ".chr19.png", "wb")
plt.savefig(figure_path + ".chr19.png", dpi=600)
f.close()
np.save(figure_path + "chr19.txt", chr19)
예제 #15
0
def filter_hires_heatmap(mode="cis", hm_file=""):
    from hiclib import highResBinnedData

    resolution = extractResolutionFromFileName(hm_file)
    if resolution == None:
        raise

    # Create a  object, load the data.
    print "creating an object"
    hmap = highResBinnedData.HiResHiC(genome_db, resolution)

    print "loading data"
    hmap.loadData(hm_file, mode=mode)

    print "saving pict of heatmap"
    import matplotlib
    matplotlib.use('Agg')
    import matplotlib.pyplot as plt
    from mirnylib import plotting

    chr0array = hmap.data[(0, 0)].getData()
    maxlen = min(10000, len(chr0array))

    to_plot = chr0array[0:maxlen, 0:maxlen]
    figure_path = hm_file + "stage1.png"
    print "saving ", figure_path
    plotting.plot_matrix(np.log(to_plot))
    plt.subplots_adjust(bottom=0.15)
    f = open(figure_path, "wb")
    plt.savefig(figure_path, dpi=300)
    f.close()
    plt.clf()

    # Remove the contacts between loci located within the same bin +/- 1 bin.
    hmap.removeDiagonal(m=1)

    to_plot = hmap.data[(0, 0)].getData()[0:maxlen, 0:maxlen]
    figure_path = hm_file + "stage2.png"
    print "saving ", figure_path
    plotting.plot_matrix(np.log(to_plot))
    plt.subplots_adjust(bottom=0.15)
    f = open(figure_path, "wb")
    plt.savefig(figure_path, dpi=300)
    f.close()
    plt.clf()

    # Removes 0.5 percent of regions with low coverage.
    hmap.removePoorRegions(percent=0.5)

    # Perform iterative correction.
    hmap.iterativeCorrection()

    to_plot = hmap.data[(0, 0)].getData()[0:maxlen, 0:maxlen]
    figure_path = hm_file + "stage3.png"
    print "saving ", figure_path
    plotting.plot_matrix(np.log(to_plot))
    plt.subplots_adjust(bottom=0.15)
    f = open(figure_path, "wb")
    plt.savefig(figure_path, dpi=300)
    f.close()
    plt.clf()

    # Save the iteratively corrected heatmap.
    hmap.export(hm_file + ".IC." + mode + ".hdf5")
예제 #16
0
		#chrm2,nt_end = position.chrm,position.nt
		#assert chrm1==chrm2
		#chrm=chrm1
		
		chrmLabel = chrm
		chrm = genome_db_contig.label2idx[chrm]
		
		for data,hmpath,domains in zip([data_dict[chrm],data_dict_second_hmap[chrm]],
										[hm_file,second_hm_file],
										[[domains_files_Arm,domains_files_Dix],
										[second_domains_files_Arm,second_domains_files_Dix]]):
			left_border = (nt_start-distance)/resolution
			right_border = (nt_end+distance)/resolution
			to_plot = np.log(data[left_border:right_border,left_border:right_border])
			
			plotting.plot_matrix(to_plot)
			
			plt.plot([distance/resolution,right_border-left_border-(distance)/resolution,(right_border-left_border)-distance/resolution],
					[distance/resolution,distance/resolution,-(distance/resolution)+right_border-left_border],
					ls="dashed",color='k')
			
			#find_domains_inside_region
			for domain_file,domains_color in zip(domains,["black","white"]):
				domain = np.genfromtxt(domain_file,dtype=np.dtype([('chrm','S10'),('start',np.uint32),('end',np.uint32)]),usecols = (0,1,2))
				domain = np.sort(domain,order=["chrm","start"])
				domains_in_region = [d for d in domain if d["start"]>=(nt_start-distance) and d["end"]<=(nt_end+distance) and d["chrm"].upper()==chrmLabel.upper()]
				for d in domains_in_region:
					moving_constant = nt_start-distance
					st = (d["start"] - moving_constant)/resolution
					end =(d["end"] - moving_constant)/resolution
					plt.plot([st,st,end],
예제 #17
0
		for domain,domain_color in zip(domains,colors):
			if ("chr" in domain[0]["chrm"]) and (not "CHR" in chrmLabel.upper()):
				addition = "CHR"
			else:
				addition = ""
			domains_in_region = [d for d in domain if d["start"]>=nt_start and 
														d["end"]<=nt_end and 
														d["chrm"].upper()==addition+chrmLabel.upper()]
			for d in domains_in_region:
					moving_constant = nt_start
					st = (d["start"] - moving_constant)/resolution
					end =(d["end"] - moving_constant)/resolution
					plt.plot([st,st,end],
							[st,end,end],
							ls="solid",linewidth=points_per_dot,color=domain_color)
		plotting.plot_matrix(to_plot,cmap='OrRd')
		if right_border-left_border > 10:
			tick_coeff = 10
		else:
			tick_coeff = 1
		plt.yticks(list(range(0,right_border-left_border,((right_border-left_border)/tick_coeff))))
		plt.xticks(list(range(0,right_border-left_border,((right_border-left_border)/tick_coeff))))
		plt.gca().set_yticklabels(["%.5g" % i for i in range(left_border*resolution,
															right_border*resolution,
															((right_border-left_border)/tick_coeff)*resolution)])
		plt.gca().set_xticklabels(["%.5g" % i for i in range(left_border*resolution,
															right_border*resolution,
															((right_border-left_border)/tick_coeff)*resolution)],
								rotation="vertical")

		plt.title(title)
# Script: save every matrix in `saddles` as text and as a log-scaled
# picture inside the figure_path directory, then build two 5 x 5 matrices
# that average the saddles cell by cell.

# Overall value range across all saddle matrices.
# NOTE(review): np.min/np.max return NaN when a matrix contains NaN (the
# averaging below filters NaN explicitly) — confirm this is acceptable.
vmin = min([np.min(i) for i in saddles.values()])
vmax = max([np.max(i) for i in saddles.values()])

# figure_path is used as a directory; create it when missing.
if not os.path.isdir(figure_path):
    os.mkdir(figure_path)

figure_path += "/"

print "Saving results to", figure_path

# One .txt (raw values) and one .png (log scale) per saddle matrix.
for i in saddles:
    print "saving ", i
    plt.clf()
    np.savetxt(figure_path + i + ".txt", saddles[i])
    plotting.plot_matrix(np.log(saddles[i]))
    plt.savefig(figure_path + i + ".png", dpi=300)

# Cell-wise average over all saddles, skipping NaN cells.
all_average = np.zeros((5, 5), dtype=float)
for i in range(5):
    for j in range(5):
        all_average[i, j] = np.average([
            saddles[c][i, j] for c in saddles if not np.isnan(saddles[c][i, j])
        ])

# Same input values as all_average (NaN cells are already filtered out
# before nanmean is applied).
all_mean = np.zeros((5, 5), dtype=float)
for i in range(5):
    for j in range(5):
        all_mean[i, j] = np.nanmean([
            saddles[c][i, j] for c in saddles if not np.isnan(saddles[c][i, j])
        ])
예제 #19
0
def filter_bychr_heatmap(hm_file):

    resolution = extractResolutionFromFileName(hm_file)
    if resolution == None:
        raise
    from hiclib import binnedData
    # Create a  object, load the data.
    print "creating an object"
    hmap = binnedData.binnedData(resolution, genome_db)

    print "loading data"
    hmap.simpleLoad(hm_file, "heatmap")

    print "saving pict of heatmap"
    import matplotlib
    matplotlib.use('Agg')
    import matplotlib.pyplot as plt
    from mirnylib import plotting

    maxlen = min(10000, len(hmap.dataDict["heatmap"]))

    a = hmap.dataDict["heatmap"][0:maxlen, 0:maxlen]
    figure_path = hm_file + "stage1.png"
    print "saving ", figure_path
    plotting.plot_matrix(np.log(a))
    plt.subplots_adjust(bottom=0.15)
    f = open(figure_path, "wb")
    plt.savefig(figure_path, dpi=600)
    f.close()
    plt.clf()

    # Remove the contacts between loci located within the same bin +/- 1 bin.
    hmap.removeDiagonal(m=1)

    hmap.removeBySequencedCount(
    )  # new filter: omit all bins with less than 0.5 coverage by sequenced bases (i.e. bases present in the genome)

    hmap.removePoorRegions(
        cutoff=0.5, coverage=True
    )  # remove .5% bins with the lowest number of records (i.e. non-zero entrees in the matrix)
    # This filter was updated to remove bins which have zero contacts and one PCR blowout. Those bins would have many reads, but all reads will be with one or few other bins.

    hmap.truncTrans()  # remove PCR blowouts from trans data

    a = hmap.dataDict["heatmap"][0:maxlen, 0:maxlen]
    figure_path = hm_file + "stage2.png"
    print "saving ", figure_path
    plotting.plot_matrix(np.log(a))
    plt.subplots_adjust(bottom=0.15)
    f = open(figure_path, "wb")
    plt.savefig(figure_path, dpi=200)
    f.close()
    plt.clf()

    hmap.iterativeCorrectWithoutSS(force=True)  #do iterative correction

    a = hmap.dataDict["heatmap"][0:maxlen, 0:maxlen]
    figure_path = hm_file + "stage3.png"
    print "saving ", figure_path
    plotting.plot_matrix(np.log(a))
    plt.subplots_adjust(bottom=0.15)
    f = open(figure_path, "wb")
    plt.savefig(figure_path, dpi=600)
    f.close()
    plt.clf()

    # Save the iteratively corrected heatmap.
    hmap.export("heatmap", hm_file + ".IC.hdf5", False)
예제 #20
0
def step3(hiclib_path, sraid, res=1000000):
    """3. Filter and iteratively correct heatmaps.

    http://mirnylab.bitbucket.org/hiclib/tutorial/03_heatmap_processing.html

    Args:
        hiclib_path: directory that contains ``fasta/hg19``.
        sraid: prefix shared by the input and output file names.
        res: value used to build the file names (``res / 1000`` followed by
            ``k``) and the end coordinate of each row of the bias table.

    Reads ``<sraid>_map-res<res/1000>k.hdf5`` and writes, with the same
    prefix, ``.pdf`` (plot of the loaded heatmap), ``-ic.hdf5`` (corrected
    heatmap), ``-ic.pdf`` (plot of the corrected heatmap) and
    ``-ic-bias.txt`` (one tab-separated line per index of
    ``BD.chromosomeIndex``: ``chr<label>``, position, position + res, bias).
    """
    import matplotlib.pyplot as plt
    import numpy as np

    from mirnylib import genome
    from mirnylib import h5dict
    from mirnylib import plotting
    from hiclib import binnedData

    genome_db = genome.Genome(hiclib_path + '/fasta/hg19',
                              readChrms=['#', 'X'])

    # Input and output names are all derived from one prefix.
    name_prefix = sraid + '_map-res%sk' % (res / 1000)
    input_path = name_prefix + '.hdf5'

    # Read resolution from the dataset.
    raw_heatmap = h5dict.h5dict(input_path, mode='r')
    resolution = int(raw_heatmap['resolution'])

    # Create a binnedData object, load the data.
    BD = binnedData.binnedData(resolution, genome_db)
    BD.simpleLoad(input_path, 'DataName')

    # Plot the heatmap directly.
    plotting.plot_matrix(np.log(BD.dataDict['DataName']))
    plt.savefig(name_prefix + '.pdf')
    plt.clf()

    # Remove the contacts between loci located within the same bin.
    BD.removeDiagonal()

    # Remove bins with less than half of a bin sequenced.
    BD.removeBySequencedCount(0.5)

    # Remove 1% of regions with low coverage.
    BD.removePoorRegions(cutoff=1)

    # Truncate top 0.05% of interchromosomal counts (possibly, PCR blowouts).
    BD.truncTrans(high=0.0005)

    # Perform iterative correction.
    BD.iterativeCorrectWithoutSS()

    # Save the iteratively corrected heatmap.
    BD.export('DataName', name_prefix + '-ic.hdf5')

    # Plot the corrected heatmap.
    plotting.plot_matrix(np.log(BD.dataDict['DataName']))
    plt.savefig(name_prefix + '-ic.pdf')
    plt.clf()

    # Save Bias.  ``with`` guarantees the table is closed on any exit path.
    bias_values = BD.biasDict['DataName']
    with open(name_prefix + "-ic-bias.txt", "w") as outfile:
        for i in xrange(len(BD.chromosomeIndex)):
            chro = BD.genome.idx2label[BD.chromosomeIndex[i]]
            posi = BD.positionIndex[i]
            # NOTE: the end coordinate uses the ``res`` argument, not the
            # resolution read from the file.
            outfile.write("chr%s\t%s\t%s\t%s\n" %
                          (chro, posi, posi + res, bias_values[i]))
예제 #21
0
#	currentEIG[0]=-currentEIG[0]
#print currentEIG

#print "Saving figure "+figure_path+".eig.png"
#f = open(figure_path+".eig.png", "wb")
#plt.plot(range(len(currentEIG[0])),currentEIG[0],"bo")
#plt.savefig(figure_path+".eig.png",dpi=600)
#f.close()
#plt.clf()

#f1 = open(figure_path+".eig", "wb")
#for i in range(len(currentEIG[0])):
#	s = "chr"+str(i)+"\t"+str(currentEIG[0][i])+"\n"
#	f1.write(s)
#f1.close()

#mi=0.8 #Min and Max values for pictures
#ma=1.2 #To make colors same
#if (np.max(q1) > ma) or (np.min(q1) < mi):
#	print "Current max and min are ",np.max(q1),np.min(q1)
#	raise Exception("Array values out of "+str(mi)+"\t"+str(ma))

print "Plotting contact matrix"
#plotting.plot_matrix(q1,vmin=mi, vmax=ma, cmap='seismic')
plotting.plot_matrix(q1, cmap='seismic')
plt.subplots_adjust(bottom=0.15)
print "Saving figure " + figure_path
f = open(figure_path, "wb")
plt.savefig(figure_path, dpi=600)
f.close()
def sortfunct(s):
    """Return the primary sort key for a chromosome label.

    The labels "chrW" and "chrZ" get a fixed large key (10000000); every
    other label gets 100 times its length.
    """
    return 10000000 if s in ("chrW", "chrZ") else 100 * len(s)

sortorder = sorted(range(len(order)), key=lambda k: (sortfunct(order[k]),order[k]))
print order
print sortorder
print order[sortorder]
for i in xrange(len(q2)):
	for j in xrange(len(q2)):
		q3[i,j] = q2[sortorder[i],sortorder[j]]

print order[sortorder]
plotting.plot_matrix(q3,cmap='seismic',vmin=-1.5,vmax=4,ticklabels1=order[sortorder])
plt.subplots_adjust(bottom=0.15)
print "Saving figure "+figure_path
f = open(figure_path, "wb")
plt.savefig(figure_path,dpi=300)
f.close()

# For every chromosome (visited in sortorder) compute a length-normalized
# intra-chromosomal and inter-chromosomal total from q.
# NOTE(review): the values computed here are not used in the visible code;
# the loop body appears to continue beyond this excerpt — confirm.
for ind,st,end in zip(sortorder,genome_db.chrmStartsBinCont[sortorder],genome_db.chrmEndsBinCont[sortorder]):
	L = genome_db.chrmLens[ind]
	# Sum of all chromosome lengths (recomputed on every iteration).
	Total_L = sum(genome_db.chrmLens)
	# Total of the diagonal block st:end of q.
	intra = float(np.sum(q[st:end,st:end]))
	# Total of rows st:end of q minus the diagonal block.
	inter = np.sum(q[st:end])-intra
	# presumably halved because a symmetric block holds every contact twice — TODO confirm
	intra /= 2.
	
	# Normalize by L^2 and by L * (total length of all other chromosomes).
	intra = intra / L**2
	inter = inter / (L*(Total_L-L))
            continue  #remove contacts with cromosome itself

        s1 = float(np.sum(res[i, :]))
        s2 = float(np.sum(res[:, j]))
        p1 = (s1 / total) * (s2 / (total - s1))
        p2 = (s2 / total) * (s1 / (total - s2))

        znam = (p1 + p2) * (total / 2.0)
        if znam == 0:
            res_probablities[i, j] = None
        else:
            res_probablities[i, j] = res[i, j] / znam

#res_probablities=np.log2(res_probablities)

mi = np.min(res_probablities)  #0.8 #Min and Max values for pictures
ma = np.max(res_probablities)  #1.4 #To make colors same
if (np.max(res_probablities) > ma) or (np.min(res_probablities) < mi):
    print "Current max and min are ", np.max(res_probablities), np.min(
        res_probablities)
    raise Exception("Array values out of " + str(mi) + "\t" + str(ma))

print "Plotting contact matrix"
#plotting.plot_matrix(res_probablities,vmin=mi, vmax=ma)
plotting.plot_matrix(res_probablities)
plt.subplots_adjust(bottom=0.15)
print "Saving figure " + figure_path
f = open(figure_path, "wb")
plt.savefig(figure_path, dpi=600)
f.close()
예제 #24
0
    
        print "sum = " ,
        print sum(counts.values())  
        
        
    for res in [1000000]: 
        #TR.saveCooler(out_file+".{0}.cool".format(res), res)
        #pass
        f1.saveHeatmap(out_file+".{0}.hm".format(res), res)   

        
        BD = binnedData.binnedData(res, genome_db)
        
        BD.simpleLoad(out_file+".{0}.hm".format(res), enzyme)
        
        plotting.plot_matrix(BD.dataDict[enzyme], clip_min = 0, clip_max = 1,  cmap='Blues')
        
        
        #plt.colorbar(extend='both')
        #plt.clim(0, 1);
        #, label = "'viridis'"
        plt.savefig(out_file+".{0}.png".format(res), dpi=300, figsize=(16, 16))    
        
        
        
        plt.close()
        
        for res in [1000000, 200000]: 
            f1.saveCooler(out_file+".{0}.cool".format(res), res)
            f1.saveHeatmap(out_file+".{0}.hm".format(res), res)  
        
예제 #25
0
# Script: hierarchical clustering of interchr_contacts2 (used as a distance
# matrix), a dendrogram saved to test.png and a picture of
# interchr_contacts saved to test2.png.
import scipy.spatial.distance as ssd
# convert the redundant n*n square matrix form into a condensed nC2 array
print len(interchr_contacts2), genome_db.chrmCount
distArray = ssd.squareform(
    interchr_contacts2
)  # distArray[{n choose 2}-{n-i choose 2} + (j-i-1)] is the distance between points i and j
from scipy.cluster.hierarchy import dendrogram, linkage
# Build the linkage with the 'single' method.
Z = linkage(distArray, 'single')
from scipy.cluster.hierarchy import fcluster
# Flat cluster labels, requested with criterion 'maxclust' and t=40.
result = fcluster(Z, 40, criterion='maxclust')
print result
print max(result)

plt.figure(figsize=(25, 10))
plt.title('Hierarchical Clustering Dendrogram')
plt.xlabel('sample index')
plt.ylabel('distance')
dendrogram(
    Z,
    leaf_rotation=90.,  # rotates the x axis labels
    leaf_font_size=8.,  # font size for the x axis labels
)
plt.savefig("test.png")
plt.clf()

# NOTE: the picture shows interchr_contacts, not the interchr_contacts2
# matrix that was clustered above.
plt.figure(figsize=(25, 10))
plotting.plot_matrix(interchr_contacts)
plt.savefig("test2.png")
plt.clf()
예제 #26
0
min_resolution = 200
files = [
    i for i in files if int(i.split("-")[-1].split("k")[0]) > min_resolution
]
print "Processing following files:", "\n".join(files)
for file in files:
    file = base_folder + "/" + file
    figure_path = file + '.png'
    raw_heatmap = h5dict.h5dict(file, mode='r')
    resolution = int(raw_heatmap['resolution'])
    BD = binnedData.binnedData(resolution, genome_db)
    BD.simpleLoad(file, 'HindIII')
    BD.removeBySequencedCount(0.5)
    # Remove 1% of regions with low coverage.
    BD.removePoorRegions(cutoff=1)
    # Truncate top 0.05% of interchromosomal counts (possibly, PCR blowouts).
    BD.truncTrans(high=0.0005)
    # Perform iterative correction.
    BD.iterativeCorrectWithoutSS()

    q = np.log(BD.dataDict['HindIII'])
    #if domain_res < 1000000:
    #	print "Matrix is too big, have to resize it"
    #	q=resize_matrix(q,1000000/domain_res,np.max)
    print "saving ", figure_path
    plotting.plot_matrix(q)
    plt.subplots_adjust(bottom=0.15)
    f = open(figure_path, "wb")
    plt.savefig(figure_path, dpi=600)
    f.close()
    plt.clf()