def gene_bin(para, data='YeastSyn'):
    """Print, for every bin of a contact map, how many genes start in it.

    Args:
        para: mapping with a 'DataPath' entry; the gene table is read from
            ``para['DataPath'] + '/SGD/SGD_features.tab'``.
        data: dataset name handed to ``ContactMap.load``.
    """
    show(""" Check gene distributions among the bins in heatmap """, True)
    map1 = ContactMap()
    assert map1.load(data, False)
    GE = read_gene(para['DataPath'] + '/SGD/SGD_features.tab')
    bin_idx = []
    for gid in GE:
        ch, st, _ed = GE[gid]
        try:
            idx = map1.choose_map_loc([int(ch)], [int(st) - 1])
            for i in idx:
                if i >= 0:
                    bin_idx.append(i)
        except Exception:
            # Best effort: genes that cannot be placed are reported and
            # skipped.  ``Exception`` (not a bare except) keeps Ctrl-C and
            # interpreter shutdown working.
            print('Skip %s %s %s' % (ch, st, gid))
    n = map1.contact_map.shape[0]
    cout = histogram(bin_idx, range(n), False)
    # One output row per bin: chromosome, start, end, gene count.
    for i in range(n):
        show(map1.idx2chr[map1.frag_chr[i]])
        show(map1.frag_sta[i])
        show(map1.frag_end[i])
        show(cout[i])
        show()
def run1(name='demo'):
    """Track how the decomposition changes as the iteration cap grows.

    A reference solution is computed first; then, for each iteration cap,
    a fresh copy is decomposed and every reference column is matched to its
    best 'PCC' dimension.  One row of statistics is shown per cap.

    NOTE(review): loop extents were reconstructed from a collapsed source
    line; confirm the trailing ``show()`` belongs inside the outer loop.
    """
    show(''' Compare decomposation results during iterations ''', True)
    map1 = ContactMap(name)
    assert map1.load()
    map1.decompose_auto()
    n, r = map1.contact_group.shape
    print('%s %s' % (n, r))
    map1.decompose('NND', dim_num=r, A=map1.get_null_map())

    reference_map = map1.duplicate()
    reference_map.decompose(dim_num=r)
    ref = np.array(reference_map.contact_group)

    show('Iter\tObj.\t# out of %s' % r)
    show('Corr. Mean\tCorr. STD\tMetric\n')
    from contact_map import gini_impurity

    iteration_caps = [1, 5, 10, 50] + list(range(100, 1201, 100))
    for cap in iteration_caps:
        show(cap)
        trial = map1.duplicate()
        show(trial.decompose(dim_num=r, max_iter=cap, stop_thrd=0))
        # Match each reference dimension to its best-scoring dimension.
        matched = []
        scores = []
        for col in range(r):
            srt, val, pval = trial.test_enrichment(ref[:, col], 'PCC')
            top = srt[0]
            matched.append(top)
            scores.append(val[top])
        show(len(set(matched)))
        show(mean_std(scores))
        show(gini_impurity(np.diag(trial.group_map)))
        show()
def bin_count(infile, pdf):
    """Test the locations listed in *infile* for enrichment in 'YeastHiC'.

    Does nothing when *infile* does not exist.  Otherwise it shows summary
    statistics, optionally writes ``infile + '_val.csv'`` with the group
    memberships of each location, and finally deletes *infile*.

    Args:
        infile: path of the location file; removed at the end.
        pdf: plot target forwarded to ``test_enrichment``.
    """
    if not os.path.exists(infile):
        return
    from contact_map import ContactMap
    map1 = ContactMap()
    map1.load('YeastHiC')
    idx, name = map1.get_locations(infile)
    srt, val, pval = map1.test_enrichment(idx, 'AUC', title=infile,
                                          plot=pdf, pages=9)
    show(infile)
    show(len(idx))
    show(len(set(idx)))
    show(pval[srt[0]])
    sign = [i for i in srt if pval[i] < 0.01]
    show(sign)
    show()
    if any(tag in infile for tag in ('telomere', 'tRNA', 'paralogs')):
        # ``with`` closes the CSV even if a write fails part-way.
        with open(infile + '_val.csv', 'w') as outfile:
            outfile.write('Name,Bin Idx,Membership\n')
            for Name, Idx in zip(name, idx):
                outfile.write('%s,%s' % (Name, Idx))
                for i in sign:
                    outfile.write(',%s' % map1.contact_group[Idx, i])
                outfile.write('\n')
    # NOTE(review): the collapsed source does not show whether this removal
    # was inside the ``if`` above; it is kept unconditional here - confirm.
    os.remove(infile)
def __init__(self, exepath, name='DataName', enzyme='Enzyme', sparse=False):
    """Initialise the map and remember the path of the external executable.

    Args:
        exepath: path of the executable; stored in absolute form as
            ``self.exepath``.
        name, enzyme, sparse: forwarded unchanged to ``ContactMap.__init__``.

    Raises:
        SystemExit: with status 1 when *exepath* does not exist.
    """
    ContactMap.__init__(self, name, enzyme, sparse)
    path = os.path.abspath(exepath)
    if not os.path.exists(path):
        import sys
        print('Please install domaincall to %s' % path)
        print('The project is at https://github.com/kingsfordgroup/armatus')
        # A missing dependency is a failure: exit non-zero, and use
        # sys.exit because the ``exit`` builtin is only installed by the
        # ``site`` module.
        sys.exit(1)
    self.exepath = path
def one_cell(path, pdf, cell, genome):
    """Call ``one_chr`` for each chromosome of *genome*, in sorted key order.

    The chromosome table comes from ``path + '/<genome>_chr_len.txt'``.
    The per-chromosome results are not used.
    """
    genome_map = ContactMap()
    genome_map.genome_info(path + '/%s_chr_len.txt' % genome)
    for chrom_key in sorted(genome_map.idx2chr.keys()):
        chrom_name = genome_map.idx2chr[chrom_key]
        one_chr(path=path, cell=cell, genome=genome, ch=chrom_name, pdf=pdf)
def run4(name='demo'):
    """Show auto-decomposition values across bin sizes and iteration caps.

    First the map is re-binned at 1x, 2x and 3x its bin size; then the
    original map is re-decomposed with iteration caps 0, 100, ..., 900.

    NOTE(review): loop extents were reconstructed from a collapsed source
    line; confirm that the shape row is printed once, after the first loop.
    """
    show(
        ''' Compare the change of clusters under different resolutions and iterations ''',
        True)
    base = ContactMap(name)
    assert base.load()
    dims = list(range(10, 51, 5))
    base.decompose_auto(dim_num=dims)
    memb = np.array(base.contact_group * base.group_map)
    show('')
    show(dims, True)

    for ratio in (1, 2, 3):
        coarse = base.duplicate()
        coarse.get_interactions()
        coarse.create_binnedmap(binsize=base.get_binsize() * ratio)
        coarse.mask_diag()
        bins, vals = zip(*coarse.decompose_auto(dim_num=dims))
        # Results below are not shown; the calls are kept as in the original.
        idx, val = base.get_locations(coarse.output_groups(), st=0, ch=0,
                                      po=1)
        newv = np.array(coarse.contact_group * coarse.group_map)
        show(coarse.get_binsize())
        show(vals, True)

    show(base.contact_group.shape, True)
    show('\n# of Iter.')
    show(dims, True)
    for it in range(0, 1000, 100):
        bins, vals = zip(*base.decompose_auto(max_iter=it, update=True,
                                              dim_num=dims))
        show(it)
        show(vals, True)
def bin_count(infile, pdf):
    """Run an 'AUC' enrichment test for the locations in *infile*.

    Returns immediately when *infile* is missing.  For telomere, tRNA and
    paralog files a ``<infile>_val.csv`` membership table is also written.
    *infile* is deleted once processing is finished.

    Args:
        infile: path of the location file (consumed by this call).
        pdf: plot target passed to ``test_enrichment``.
    """
    if not os.path.exists(infile):
        return
    from contact_map import ContactMap
    map1 = ContactMap()
    map1.load('YeastHiC')
    idx, name = map1.get_locations(infile)
    srt, val, pval = map1.test_enrichment(
        idx, 'AUC', title=infile, plot=pdf, pages=9)
    for item in (infile, len(idx), len(set(idx)), pval[srt[0]]):
        show(item)
    sign = [i for i in srt if pval[i] < 0.01]
    show(sign)
    show()
    wants_csv = (infile.find('telomere') > -1 or infile.find('tRNA') > -1
                 or infile.find('paralogs') > -1)
    if wants_csv:
        # Context manager: the handle is released even when a write raises.
        with open(infile + '_val.csv', 'w') as outfile:
            outfile.write('Name,Bin Idx,Membership\n')
            for Name, Idx in zip(name, idx):
                row = ['%s,%s' % (Name, Idx)]
                row.extend(',%s' % map1.contact_group[Idx, i] for i in sign)
                row.append('\n')
                outfile.write(''.join(row))
    # NOTE(review): placement of this removal (inside vs. after the ``if``)
    # cannot be read off the collapsed source; kept unconditional - confirm.
    os.remove(infile)
def run2(name='demo'):
    """Show how objective values spread over 100 re-initialised runs.

    The histogram of the objectives is shown with their mean and standard
    deviation, followed by the objective reached from a 'Null'
    initialisation and its Z-score against that distribution.
    """
    show(''' Show the distribution of objective values ''', True)
    map1 = ContactMap(name)
    assert map1.load()
    map1.decompose_auto(max_iter=50)
    n, r = map1.contact_group.shape
    map2 = map1.duplicate()
    objs = []
    for _ in range(100):
        map2.reset_solution()
        objs.append(map2.decompose(dim_num=r, max_iter=50))
    max_obj = int(max(objs)) + 1
    min_obj = int(min(objs)) - 1
    # The span is at least 2, but anything below 10 used to give a step of
    # 0, which makes range() raise ValueError; clamp the step to 1.
    step = max(1, (max_obj - min_obj) // 10)
    bins = list(range(min_obj, max_obj, step))
    show('')
    show(bins, True)
    show('Frequency')
    show(histogram(objs, bins), True)
    show('Mean\tSTD\n')
    m, s = mean_std(objs)
    show([m, s], True)
    map2.decompose('Null', dim_num=r)
    obj1 = map2.decompose(dim_num=r)
    show('Use Null init has obj.')
    show(obj1)
    show('and Z-score is')
    show((obj1 - m) / s, True)
def map_contacts(fname):
    """Plot and save the residue contact map of a trajectory's first frame.

    The figure is written next to *fname* with the extension replaced by
    ``.png``, then displayed.

    Returns:
        The ``ContactMap`` built from the first frame.
    """
    first_frame = md.load(fname)[0]
    contacts = ContactMap(first_frame)
    _fig, _ax = contacts.residue_contacts.plot(cmap='seismic', vmin=-1,
                                               vmax=1)
    plt.xlabel("Residue")
    plt.ylabel("Residue")
    stem, _extension = os.path.splitext(fname)
    plt.savefig(stem + '.png', bbox_inches='tight')
    plt.show()
    return contacts
def go_test(para, data='YeastHiC'):
    """Test the genes of every GO term for enrichment with 'AvgCCD'.

    GO slim and protein-complex mappings are merged; each term's genes are
    written to a temporary file, located on the map and tested.  Terms with
    at least one group below p < 0.01 are reported.

    Args:
        para: mapping with a 'DataPath' entry pointing at the SGD tables.
        data: dataset name handed to ``ContactMap.load``.
    """
    show(""" Check gene groups relating to the same GO term """, True)
    sgd = para['DataPath'] + '/SGD/'
    MP = read_map(sgd + 'go_slim_mapping.tab')
    CO = read_complex(sgd + 'go_protein_complex_slim.tab')
    GO = read_go(sgd + 'go_terms.tab')
    GE = read_gene(sgd + 'SGD_features.tab')
    cc = 0
    go2gene = MP.copy()
    go2gene.update(CO)
    map1 = ContactMap()
    assert map1.load(data)
    map1.output_groups()
    for go in go2gene:
        go = go.strip()
        fname = save_gogene(go, go2gene, GE)
        try:
            idx, names = map1.get_locations(fname, st=1, ch=1, po=2, nm=0)
        finally:
            # Remove the temporary gene file even if locating fails.
            os.remove(fname)
        if len(idx) < 1:
            continue
        srt, val, pval = map1.test_enrichment(idx, method='AvgCCD')
        cc += 1
        sign = [i for i in srt if pval[i] < 0.01]
        if sign:
            show(go)
            show(len(idx))
            show(GO[int(go.split(':')[1])])
            show(pval[sign[0]])
            show(pval[sign[-1]])
            show(sign)
            show()
    show('We tested %s GO terms for %s.\n' % (cc, data))
def chr_detail(path, cell, genome, ch, loci, st=0, ed=None, pdf=None):
    # Decompose the contact map of one chromosome region and, when `pdf` is
    # given, plot group memberships, TAD spans and the chosen loci.
    #
    # path/cell/genome/ch select the input files; st/ed are forwarded to
    # focus_chromosome; loci is forwarded to choose_map_loc (None = none).
    # Returns the ContactMap object.
    #
    # NOTE(review): indentation was reconstructed from a collapsed source
    # line; block extents (especially the `if not map1.load()` body) should
    # be confirmed against the original file.
    # NOTE(review): `domain` is not defined in this function - presumably a
    # module-level file name; verify.
    map1 = ContactMap('tad-detail-%s-in-%s'%(ch,cell))
    # map1.clear()
    if not map1.load():
        # No saved map: build it from the per-chromosome matrix file.
        map1.genome_info(path+'/%s_chr_len.txt'%genome)
        map1.focus_chromosome(ch)
        map1.create_binnedmap(40e3, lazy=True)
        # map1.contact_map = np.asmatrix(np.loadtxt(path+'/'+cell+'/uij.'+ch))
        map1.contact_map = np.asmatrix(np.loadtxt(path+'/'+cell+'/nij/nij.'+ch))
        print cell, ch, map1.frag_sta.shape[0], map1.contact_map.shape[0]
        # The loaded matrix must have one row per bin.
        assert map1.frag_sta.shape[0] == map1.contact_map.shape[0]
        map1.get_sparse_interactions()
        map1.focus_chromosome(ch, st=st, ed=ed)
        map1.create_contactmap(throw=0)
        map1.save()
    show(cell)
    show(ch)
    show(map1.contact_map.shape)
    map1.mask_diag()
    map1.mask_short()
    map1.decompose_auto(par_lam=1, beta=3, update=False)
    map1.sort_groups()
    # map1.add_bias_back()
    show(map1.contact_group.shape)
    show()
    if pdf is not None:
        map1.plot_map()
        pdf.savefig(); plt.clf()
        map1.plot_map(map1.contact_group*map1.group_map*map1.contact_group.T, vmin=0.01, title='H*S*H.T')
        pdf.savefig(); plt.clf()
        map1.plot_submap()
        pdf.savefig(); plt.clf()
    # TAD start and end bins are read from the same file (columns 1 and 2).
    TAD_st, _ = map1.get_locations(path+'/'+cell+'/'+domain, st=0, ch=0, po=1, add=0)
    TAD_ed, _ = map1.get_locations(path+'/'+cell+'/'+domain, st=0, ch=0, po=2, add=-1)
    TAD = zip(TAD_st, TAD_ed)
    # W = H*S, scaled by the mean of its non-zero row sums.
    W = np.asarray(map1.contact_group * map1.group_map)
    Wsum = W.sum(1)
    W /= Wsum[Wsum>0].mean()
    gini = 1-np.power(W,2).sum(1)
    gini[Wsum==0] = 0
    if loci is not None:
        loc = map1.choose_map_loc(loci)
    else:
        loc = []
    # Groups with any weight on the selected loci.
    grps = W[loc,:].sum(0) > 0
    map1.output_groups()
    show(loc, True)
    # Only the first 40 bins are plotted below.
    sel = np.arange(0, 40)
    # if pdf is not None:
    #     plt.plot(sel, gini[sel], 'k.--')
    for i in xrange(W.shape[1]):
        if grps[i]:
            if pdf is not None:
                plt.plot(sel, W[sel,i], label='C%s'%i)
    tad = []  # TAD lengths; collected but not used afterwards
    for i,j in TAD:
        tad.append(j-i)
        if i in sel and j in sel:
            if pdf is not None:
                plt.plot([i,j], [1.1,1.1], 'k-', linewidth=2)
    if pdf is not None:
        plt.plot(loc, [1]*len(loc), 'r.')
        # Integer tick step; relies on `/` between ints flooring (Python 2).
        xt = sel[::(len(sel)/5)]
        plt.xticks(xt, ['%sM'%(X*0.04+st*1e-6) for X in xt])
        plt.ylim([0,1.2])
        plt.xlim([sel.min(), sel.max()])
        pdf.savefig(); plt.clf()
    if pdf is not None:
        map1.plot_map(map1.contact_map[sel,:][:,sel])
        pdf.savefig(); plt.clf()
        map1.plot_map(map1.contact_group[sel,:]*\
                      map1.group_map*\
                      map1.contact_group[sel,:].T, vmin=0.01, title='H*S*H.T')
        pdf.savefig(); plt.clf()
    return map1
def run1(para):
    """Write PDB files marking the members of the first three clusters.

    Builds a 10 kb binned map from the Duan2010N interaction files,
    decomposes it, and for clusters 0-2 marks every 3D-model position whose
    bin has a membership above 1.

    Args:
        para: mapping with a 'DataPath' entry.
    """
    show(''' Decompose headmap and show clusters in PDB format ''', True)
    path = para['DataPath'] + '/Duan2010N'
    map1 = ContactMap('PDBMAP')
    map1.genome_info(path + '/restriction_fragments_mappable_HindIII.txt',
                     i2=3, i3=0)
    for suffix in ('inter', 'intra'):
        map1.add_interactions(
            path + '/interactions_HindIII_fdr0.01_%s.txt' % suffix)
    map1.create_binnedmap(10000)
    map1.decompose_auto(dims=list(range(5, 51, 5)))
    map_idx, pdb_idx = map1.get_locations(
        path + '/3d_model_of_yeast_with_genomic_positions.txt',
        st=1, ch=0, po=1, nm=-1)
    H = map1.contact_group
    n, r = H.shape
    for cluster in range(3):
        members = set(row for row in range(n) if H[row, cluster] > 1)
        mark_idx = []
        for bin_id, pdb_id in zip(map_idx, pdb_idx):
            if bin_id in members:
                mark_idx.append(pdb_id)
        output_pdb('Yeast3D-C%s.pdb' % cluster,
                   path + '/3d_model_of_yeast_genome.pdb', mark_idx)
# Script: plot the residue contacts between segment TOX and segments R1-R4
# for the first frame of a PDB pose.
# NOTE(review): `pd` and `plt` are not referenced in the visible lines, and
# the residue_for_atoms_* helpers are not defined here - presumably they are
# defined elsewhere in the file; verify.
import pandas as pd
import matplotlib.pyplot as plt
import mdtraj as md
from contact_map import ContactMap, ContactFrequency, ContactDifference

pdb = 'poses/snx_chanel'  # path stem; '.pdb' is appended when loading
traj = md.load_pdb(pdb + '.pdb')
print(traj)
topology = traj.topology
# Atom selections: the query (TOX) and the haystack it is tested against.
tox = topology.select("segname TOX")
cav = topology.select("segname R1 R2 R3 R4")
frame_contacts = ContactMap(traj[0], query=tox, haystack=cav, cutoff=0.35)
#print (frame_contacts.residue_contacts.df)
df = frame_contacts.residue_contacts.df
(fig, ax) = frame_contacts.residue_contacts.plot(cmap='seismic', vmin=-1, vmax=1)
# Residue ids / names for both selections.
tox_residues_id = residue_for_atoms_id(tox, traj.topology)
cav_residues_id = residue_for_atoms_id(cav, traj.topology)
tox_residues = residue_for_atoms_name(tox_residues_id, traj.topology)
cav_residues = residue_for_atoms_name(cav_residues_id, traj.topology)
cav_residues_ori = residue_for_atoms_original(cav_residues_id, traj.topology)
# Restrict the axes to the residue id ranges of the two selections.
ax.set_xlim(min(cav_residues_id), max(cav_residues_id) + 1)
ax.set_ylim(min(tox_residues_id), max(tox_residues_id) + 1)
#segment_for_residue(topology)
def decompose_dist(pdf, curve, r=None):
    """Decompose a synthetic heatmap derived from distances along *curve*.

    Pairwise Euclidean distances of the curve vertices are turned into a
    heatmap ``V = max(ds) * (ds + 1) ** -2``.  V is decomposed by EIG, by
    NMF (initialised with the EIG result) and, when scikit-learn can be
    used, by K-means.  Each step adds pages to *pdf*.

    Args:
        pdf: object with ``savefig()`` that collects the current figure.
        curve: sequence of vertex coordinates.
        r: number of dimensions.  When None, ``choose_size`` is consulted.

    NOTE(review): the collapsed source does not show whether ``r = 4`` sat
    inside the ``r is None`` branch; it is kept inside here - confirm.
    """
    from contact_map import ContactMap, EIG, NMF_main
    from scipy.spatial.distance import pdist, squareform

    def _page():
        # Store the current figure as a page and start a fresh one.
        pdf.savefig()
        plt.clf()

    map1 = ContactMap()
    curve_show(curve)
    _page()
    verts = np.array(curve)
    map1.plot_map(verts, title="Verteces", log=False)
    _page()

    ds = squareform(pdist(verts, 'euclidean'))
    V = ds.max() * ((ds + 1) ** -2)
    map1.plot_map(V, title="Synthetic Heatmap", log=True)
    _page()

    plt.loglog([(i + 1.0) / V.shape[0] for i in range(V.shape[0])],
               trace_sum(V), linestyle='-.')
    plt.title('Distribution of interactions along 1D')
    plt.xlabel('Ratio of linked locations to the total length')
    plt.ylabel('Number of observed links')
    _page()

    if r is None:
        r = choose_size(pdf, V, 9)
        show('Best number of dimentions is %s\n' % r)
        r = 4  # hard-coded override of the suggested size

    # The disabled PCA variant (an ``if False`` branch) was unreachable and
    # has been dropped.
    Q, M = EIG(V, r)
    map1.plot_map(Q, title='Eig. Decomp. - Q Matrix', log=False)
    _page()
    map1.plot_map(M, title='Eig. Decomp. - M Matrix', log=False)
    _page()
    map1.plot_map(Q * M * Q.T, title='Eig. Decomp. - Recovered', log=False)
    _page()
    sep_map_show(pdf, verts, Q)

    H, S, obj = NMF_main(V, J='NMF-PoissonManifoldEqual', H=Q, S=M, r=r)
    map1.plot_map(H * S * H.T, title='NMF Decomp. - Recovered', log=False)
    _page()
    map1.plot_map(H, title='NMF Decomp. - H Matrix', log=False)
    _page()
    map1.plot_map(S, title='NMF Decomp. - S Matrix', log=False)
    _page()
    srt = np.argsort(np.argmax(np.asarray(H), 0))
    sep_map_show(pdf, verts, H[:, srt])

    try:
        from sklearn.cluster import KMeans
        km = KMeans(n_clusters=r)
        H = -np.matrix(km.fit_transform(V))
        srt = np.argsort(np.argmax(np.asarray(H), 0))
        map1.plot_map(H, title='K-means Decomp. - H Matrix', log=False)
        _page()
        sep_map_show(pdf, verts, H[:, srt])
    except Exception:
        # Still best effort, but no longer swallows KeyboardInterrupt or
        # SystemExit as the bare ``except:`` did.
        print('Please install SK-kit to run K-means')
def get_syn_map(para, bin_size=3200, with_bias=True):
    """Build a synthetic contact map from 'syn_link.npy' and decompose it.

    The link matrix is optionally multiplied by a seeded random bias, dumped
    as a tab-separated interaction list, binned at *bin_size*, and
    decomposed with 'NND'.  Two heatmaps go to ``<ExeFile>plot1.pdf``.

    Args:
        para: mapping with 'ExeFile' and 'DataPath' entries.
        bin_size: bin size of the final map (the initial map is fixed at
            3200).
        with_bias: whether to apply the random bias.

    Returns:
        ``(map1, dist)``: the decomposed map and the 'syn_dist.npy' sub-matrix
        for the bins reported by ``map1.output_groups()``.

    NOTE(review): indentation was reconstructed from a collapsed line.
    Confirm that the statistics print and ``np.floor`` belong inside the
    ``with_bias`` branch and that the final blank line is written once.
    """
    pdf = PdfPages(para['ExeFile'] + 'plot1.pdf')
    ## prepare
    map1 = ContactMap('Syn3D')
    map1.genome_info(para['DataPath'] + '/Tjong2012GR/yeast_chr_len-Tjong.txt')
    map1.create_binnedmap(3200)  ## fixed
    map2 = map1.duplicate()
    ## obtain links from PDB
    link_map = np.load('syn_link.npy')
    if with_bias:  ## add random bias
        np.random.seed(0)
        bias = np.random.random(link_map.shape[0])
        link_map *= np.outer(bias, bias)
        print('%s %s %s' % (link_map.min(), link_map.max(), link_map.mean()))
        link_map = np.floor(link_map)  ## sampling bias
    map1.contact_map = np.matrix(link_map, dtype='float')

    ch = map1.frag_chr
    po = (map1.frag_sta + map1.frag_end) / 2
    out_name = 'syn_yeast_map_bin%s%s.txt' % (
        bin_size, 'bias' if with_bias else '')
    # ``with`` guarantees the dump is flushed and closed even on error.
    with open(out_name, 'w') as output:
        for i in range(link_map.shape[0]):
            for j in range(link_map.shape[1]):
                if link_map[i, j] > 0:
                    output.write('%s\t%s\t%s\t%s\t0\t%s\t1e-10\t1e-10\n' %
                                 (ch[i], po[i], ch[j], po[j], link_map[i, j]))
        output.write('\n')

    map1.get_interactions()
    map1.create_binnedmap(bin_size)
    map1.mask_diag()
    map1.plot_map(title='Heatmap for the number of links')
    pdf.savefig()
    plt.clf()
    map1.decompose('NND')
    idx, names = map2.get_locations(map1.output_groups(), st=0, ch=0, po=1,
                                    nm=0, add=0)
    dist_map = np.load('syn_dist.npy')
    dist = dist_map[idx, :][:, idx]
    map1.plot_map(dist, title='Heatmap for the average distances', log=False)
    pdf.savefig()
    plt.clf()
    pdf.close()
    return map1, dist
# Script: plot the residue contact map of two PDB frames and their
# difference.
import matplotlib.pyplot as plt
import mdtraj as md
from contact_map import ContactMap

pdb_list = [
    "../pdb_dir_1_500ns/frame0.pdb",
    "../pdb_dir_5001_6000ns/frame4164.pdb"]

# Building the contact maps is slow: the script takes several minutes.
# Each map is kept so the difference can be computed afterwards (the
# original referenced `contacts` without ever filling it).
contacts = []
for i, pdb_path in enumerate(pdb_list):
    pdb = md.load_pdb(pdb_path)
    frame_contacts = ContactMap(pdb[0], cutoff=1.5)
    contacts.append(frame_contacts)
    (fig, ax) = frame_contacts.residue_contacts.plot(cmap='seismic',
                                                     vmin=-1, vmax=1)
    plt.xlabel("Residue")
    plt.ylabel("Residue")
    fig.savefig(f'cont-map-{i}.pdf', format='pdf', dpi=500)
    plt.close()

# Calculate the difference between the two contact maps.
diff = contacts[1] - contacts[0]
(fig, ax) = diff.residue_contacts.plot(cmap='seismic', vmin=-1, vmax=1)
plt.xlabel("Residue")
plt.ylabel("Residue")
fig.savefig('cont-map-diff.pdf', format='pdf', dpi=500)
plt.close()
def one_region(path, cell, genome, ch, bi, loci, st=0, ed=None, pdf=None):
    """Decompose one chromosome region and export the groups covering *loci*.

    Raw observed counts are read from the ``<bi>_resolution_intrachromosomal``
    directory, divided by the KR norm factors of both bins, binned and
    decomposed.  For every locus the group with the largest weight is
    written as a wiggle track to ``loop-<ch>-in-<cell>_groups.wig``; plots
    are added to *pdf* when it is given.

    Args:
        path, cell, genome, ch: select the input files.
        bi: resolution label ending in 'kb' or 'mb' (e.g. '5kb').
        loci: locations passed to ``choose_map_loc``.
        st, ed: region bounds; ``ed=None`` means no upper bound.
        pdf: optional object with ``savefig()``.

    Raises:
        ValueError: if *bi* does not end in 'kb' or 'mb'.

    NOTE(review): indentation was reconstructed from collapsed source
    lines; confirm the extent of the ``if not map1.load()`` body.
    """
    if bi.endswith('kb'):
        reso = int(bi.replace('kb', '')) * 1000
    elif bi.endswith('mb'):
        reso = int(bi.replace('mb', '')) * 1000000
    else:
        raise ValueError('Unknow unit %s' % bi)
    map1 = ContactMap('loop-%s-in-%s' % (ch, cell))
    map1.clear()
    if not map1.load():
        map1.genome_info(path + '/%s_chr_len.txt' % genome)
        map1.create_binnedmap(reso, lazy=True)
        map1.focus_chromosome(ch, st=st, ed=ed)
        prefix = (path + '/' + cell + '/' + bi +
                  '_resolution_intrachromosomal/' + ch + '/MAPQGE30/' + ch +
                  '_' + bi)
        # Input files are opened with ``with`` so they are always closed.
        with open(prefix + '.KRnorm') as infile:
            norm = [float(line) for line in infile]
        with open(prefix + '.KRexpected') as infile:
            expect = [float(line) for line in infile]
        expect.append(1.0)
        print('%s %s' % (len(norm), len(expect)))
        p1 = []
        p2 = []
        val = []
        with open(prefix + '.RAWobserved', 'r', 2 << 9) as infile:
            for line in infile:
                P1, P2, Val = line.split()
                pos1 = int(P1)
                pos2 = int(P2)
                # Keep only pairs with both ends inside [st, ed).
                if pos1 < st or (ed is not None and pos1 >= ed):
                    continue
                if pos2 < st or (ed is not None and pos2 >= ed):
                    continue
                p1.append(pos1)
                p2.append(pos2)
                # KR normalisation; ``//`` keeps the bin index an integer.
                val.append(float(Val) /
                           (norm[pos1 // reso] * norm[pos2 // reso]))
        map1.inter_loc1 = np.array(p1, dtype='int')
        map1.inter_loc2 = np.array(p2, dtype='int')
        map1.inter_freq = np.array(val, dtype='float')
        chidx = map1.chr2idx[ch]
        map1.inter_chr1 = chidx * np.ones(len(p1), dtype='int')
        map1.inter_chr2 = chidx * np.ones(len(p2), dtype='int')
        map1.create_binnedmap(reso)
        map1.save()
    show(cell)
    show(ch)
    if pdf is not None:
        map1.plot_map()
        pdf.savefig()
        plt.clf()
    map1.decompose_auto()
    map1.sort_groups()
    show(map1.contact_group.shape)
    show()

    bins = map1.choose_map_loc(loci)
    # W = H*S, scaled by the mean of its non-zero row sums.
    W = np.asarray(map1.contact_group * map1.group_map)
    n = W.shape[0]
    wm = W.sum(1)
    W /= np.mean(wm[wm > 0])
    gini = 1 - np.power(W, 2).sum(1)
    gini[wm == 0] = 0

    jj = []
    # The original never closed this file (its close call was commented
    # out); the context manager flushes and closes it.
    with open('loop-%s-in-%s_groups.wig' % (ch, cell), 'w') as outfile:
        for j in W[bins, :].argmax(1):
            outfile.write(
                'track type=wiggle_0 name="C%s' % (j + 1) +
                '" description="BNMF" visibility=full autoScale=off viewLimits=0:200 color=0,0,0 maxHeightPixels=100:50:20 graphType=bar priority=20\nfixedStep chrom='
                + ch + ' start=%d' % st + ' step=%d' % reso +
                ' span=%d\n' % reso)
            for i in range(n):
                outfile.write('%d\n' % int(1000 * W[i, j]))
            jj.append(j)

    sel = list(range(n))
    lab = ['%dk' % ((i * reso + st) / 1000) for i in sel]
    # About five ticks; the step is clamped so short regions cannot give 0.
    five = np.arange(0, len(sel), max(1, len(sel) // 5))
    if pdf is not None:
        map1.plot_map(map1.contact_group * map1.group_map *
                      map1.contact_group.T, log=False)
        pdf.savefig()
        plt.clf()
        map1.plot_map(map1.contact_group[:, jj] *
                      map1.group_map[jj, :][:, jj] *
                      map1.contact_group[:, jj].T, log=False)
        pdf.savefig()
        plt.clf()
        map1.plot_submap()
        pdf.savefig()
        plt.clf()
        plt.plot(sel, gini[sel], '--k')
        for j in jj:
            plt.plot(sel, W[sel, j], '-', label='C%s' % (j + 1))
        plt.plot(bins, [1.1] * len(bins), 'ro')
        plt.legend()
        plt.xticks([sel[j] for j in five], [lab[j] for j in five])
        pdf.savefig()
        plt.clf()
    return
def get_syn_map(para, bin_size=3200, with_bias=True):
    """Create, dump and decompose the synthetic link map.

    Loads 'syn_link.npy', optionally applies a seeded random bias, writes
    the positive entries to ``syn_yeast_map_bin<bin_size>[bias].txt``,
    re-bins at *bin_size* and runs an 'NND' decomposition.  Two heatmaps are
    saved to ``<ExeFile>plot1.pdf``.

    Args:
        para: mapping providing 'ExeFile' and 'DataPath'.
        bin_size: bin size used for the final map.
        with_bias: apply the random bias when True.

    Returns:
        Tuple ``(map1, dist)``.

    NOTE(review): block extents were rebuilt from a collapsed line; verify
    that the print and ``np.floor`` are part of the ``with_bias`` branch and
    that the trailing newline is written once after all rows.
    """
    pdf = PdfPages(para['ExeFile']+'plot1.pdf')
    ## prepare
    map1 = ContactMap('Syn3D')
    map1.genome_info(para['DataPath']+'/Tjong2012GR/yeast_chr_len-Tjong.txt')
    map1.create_binnedmap(3200)  ## fixed
    map2 = map1.duplicate()
    ## obtain links from PDB
    link_map = np.load('syn_link.npy')
    if with_bias:  ## add random bias
        np.random.seed(0)
        bias = np.random.random(link_map.shape[0])
        link_map *= np.outer(bias, bias)
        print('%s %s %s' % (link_map.min(), link_map.max(), link_map.mean()))
        link_map = np.floor(link_map)  ## sampling bias
    map1.contact_map = np.matrix(link_map, dtype='float')

    ch = map1.frag_chr
    po = (map1.frag_sta+map1.frag_end)/2
    row_fmt = '%s\t%s\t%s\t%s\t0\t%s\t1e-10\t1e-10\n'
    n_rows, n_cols = link_map.shape[0], link_map.shape[1]
    # Managed handle: closed even if a write raises.
    with open('syn_yeast_map_bin%s%s.txt' % (bin_size, 'bias' if with_bias else ''), 'w') as output:
        for i in range(n_rows):
            for j in range(n_cols):
                if link_map[i, j] > 0:
                    output.write(row_fmt % (ch[i], po[i], ch[j], po[j],
                                            link_map[i, j]))
        output.write('\n')

    map1.get_interactions()
    map1.create_binnedmap(bin_size)
    map1.mask_diag()
    map1.plot_map(title='Heatmap for the number of links')
    pdf.savefig()
    plt.clf()
    map1.decompose('NND')
    idx, names = map2.get_locations(map1.output_groups(), st=0, ch=0, po=1,
                                    nm=0, add=0)
    dist_map = np.load('syn_dist.npy')
    dist = dist_map[idx, :][:, idx]
    map1.plot_map(dist, title='Heatmap for the average distances', log=False)
    pdf.savefig()
    plt.clf()
    pdf.close()
    return map1, dist
def plot2(para):
    """Plot the 10 kb yeast map and its NMF reconstructions for several
    ``par_lam`` values into ``<ExeFile>plot2.pdf``.

    Args:
        para: mapping with an 'ExeFile' entry used as the PDF name prefix.
    """
    pdf = PdfPages(para['ExeFile'] + 'plot2.pdf')

    def next_page():
        # Store the current figure and clear it for the next plot.
        pdf.savefig()
        plt.clf()

    # Build the masked, binned map from both interaction files.
    map1 = ContactMap('plot2')
    map1.genome_info('../data/yeast_chr_len.txt')
    for kind in ('inter', 'intra'):
        map1.add_interactions(
            '../data/Duan2010N/interactions_HindIII_fdr0.01_%s.txt' % kind)
    map1.create_binnedmap(binsize=10e3)
    map1.mask_diag()
    map1.mask_short()
    map1.mask_low()

    map1.plot_map(map1.contact_map, log=True, vmin=1, vmax=100)
    next_page()
    window = np.arange(200, 400)
    map1.plot_map(map1.contact_map[window, :][:, window], log=True, vmin=1,
                  vmax=100)
    next_page()

    for lam in (0, 0.1, 1, 10):
        map1.reset_solution()
        map1.decompose('NMF-PoissonManifoldEqual', dim_num=55, par_lam=lam)
        recovered = map1.contact_group * map1.group_map * map1.contact_group.T
        map1.plot_map(recovered[window, :][:, window], vmin=1, vmax=100,
                      title=str(lam))
        next_page()
    pdf.close()
def run5(name='demo'):
    """Map the clusters of a 30-cluster solution onto solutions of 5-30.

    For each cluster count the Gini impurity of the group-map diagonal is
    shown, followed by the partner of every reference cluster (blank when
    ``best_cor`` reports none).
    """
    show(
        ''' Mapping clusters by changing the number of total clusters ''',
        True)
    reference = ContactMap(name)
    candidate = ContactMap(name)
    assert reference.load()
    assert candidate.load()
    show('Bin Size\tMetric')
    reference.decompose_auto(dim_num=30)
    full = np.arange(reference.contact_group.shape[1])
    show(full.tolist(), True)
    from contact_map import gini_impurity
    for r in range(5, 31):
        show(r)
        candidate.decompose_auto(dim_num=r)
        show(gini_impurity(np.diag(candidate.group_map)))
        partner = {}
        for src, dst in reference.best_cor(candidate, dims=True):
            partner[src] = dst
        for cluster in full:
            show(partner[cluster] if cluster in partner else '')
        show()
def one_cell(path, pdf, cell, genome):
    # Run one_chr on every chromosome of `genome`, pool the results and add
    # summary plots for the cell type to `pdf`.
    #
    # NOTE(review): `resolution`, `plot_left` and `plot_right` are not
    # defined in this function - presumably module-level settings; verify.
    # NOTE(review): indentation was reconstructed from collapsed source
    # lines; confirm that only the accumulation is inside the loop.
    map1 = ContactMap()
    map1.genome_info(path+'/%s_chr_len.txt'%genome)
    # Accumulators over all chromosomes.
    cci = 0; ccj = 0
    ni = 0; nj = 0
    tadlen = []
    tadtype = []
    grptype=[]
    gini = []
    entropy = []
    for i in sorted(map1.idx2chr.keys()):
        info = one_chr(path=path, cell=cell, genome=genome, ch=map1.idx2chr[i], pdf=pdf)
        CCI,CCJ,NI,NJ,TADLEN,TADTYPE,GRPTYPE,GINI,ENTRO = info
        cci += CCI
        ccj += CCJ
        ni += NI
        nj += NJ
        tadlen += TADLEN
        tadtype += TADTYPE
        grptype += GRPTYPE
        # Only strictly positive scores are pooled.
        gini += GINI[GINI>0].tolist()
        entropy += ENTRO[ENTRO>0].tolist()
    tadlen = np.array(tadlen)
    plt.hist(tadlen*resolution, np.arange(tadlen.max())*resolution)
    plt.title('Distribution of TAD sizes in %s'%cell)
    pdf.savefig(); plt.clf()
    # Histogram of the per-TAD counts returned by one_chr.
    bins = range(max(tadtype)+1)
    count = histogram(tadtype, bins, False)
    show(bins, True)
    show(count, True)
    tadtype = np.array(tadtype)
    plt.hist(tadtype, np.arange(tadtype.max()+1))
    plt.title('Distribution of covered clusters in %s'%cell)
    pdf.savefig(); plt.clf()
    # Histogram of the per-group counts returned by one_chr.
    bins = range(max(grptype)+1)
    count = histogram(grptype, bins, False)
    show(bins, True)
    show(count, True)
    grptype = np.array(grptype)
    plt.hist(grptype, np.arange(grptype.max()+1))
    plt.title('Distribution of covered TADs in %s'%cell)
    pdf.savefig(); plt.clf()
    # plt.plot(np.arange(plot_left, plot_right), cci/ni, '.-r', label='TAD start')
    # plt.plot(np.arange(plot_left, plot_right), ccj/nj, '.-b', label='TAD end')
    # Average score profile around TAD starts and ends combined.
    plt.plot(np.arange(plot_left, plot_right), (cci+ccj)/(ni+nj), '.-k', label='TAD boundary')
    plt.xlabel('Genomic distances (kb)')
    plt.ylabel('Gini impurity score')
    plt.xticks(np.arange(plot_left, plot_right), np.arange(plot_left, plot_right)*resolution)
    plt.xlim([plot_left, plot_right])
    plt.title('Average scores around TAD in %s'%cell)
    plt.legend()
    pdf.savefig(); plt.clf()
    show(mean_std(gini))
    plt.hist(gini, np.arange(0,1.001,0.05))
    plt.title('Distribution of Gini impurity scores')
    plt.xlabel('Gini impurity scores')
    plt.ylabel('Frequency')
    pdf.savefig(); plt.clf()
    show(mean_std(entropy))
    plt.hist(entropy, np.arange(0,6,0.2))
    plt.title('Distribution of entropy at TAD boundaries')
    plt.xlabel('Entropy')
    plt.ylabel('Frequency')
    pdf.savefig(); plt.clf()
def one_chr(path, cell, genome, ch, pdf=None):
    # Decompose the contact map of one chromosome and compare the resulting
    # groups with the TADs listed in the domain file.
    #
    # Returns the tuple
    #   (cci, ccj, ni, nj, tadlen, tadtype, grptype, gini, entropy)
    # where cci/ccj are summed score windows around TAD starts/ends and
    # ni/nj the number of windows summed.
    #
    # NOTE(review): `resolution`, `domain`, `plot_left` and `plot_right` are
    # not defined here - presumably module-level settings; verify.
    # NOTE(review): indentation was reconstructed from collapsed source
    # lines; confirm block extents against the original file.
    map1 = ContactMap('tad-%s-in-%s'%(ch,cell))
    # map1.clear()
    if not map1.load():
        # No saved map: build it from the per-chromosome matrix file.
        map1.genome_info(path+'/%s_chr_len.txt'%genome)
        map1.focus_chromosome(ch)
        map1.create_binnedmap(40e3, lazy=True)
        map1.contact_map = np.loadtxt(path+'/'+cell+'/uij.'+ch)
        print cell, ch, map1.frag_sta.shape[0], map1.contact_map.shape[0]
        # The loaded matrix must have one row per bin.
        assert map1.frag_sta.shape[0] == map1.contact_map.shape[0]
        map1.get_sparse_interactions()
        map1.create_binnedmap(resolution*1000)
        map1.mask_diag()
        map1.mask_short()
        map1.mask_low()
        map1.save()
    show(cell)
    show(ch)
    if pdf is not None:
        map1.plot_map()
        pdf.savefig(); plt.clf()
    map1.decompose_auto(update=False)
    map1.sort_groups()
    show(map1.contact_group.shape)
    if pdf is not None:
        map1.plot_submap()
        pdf.savefig(); plt.clf()
    # TAD start and end bins are read from the same file (columns 1 and 2).
    TAD_st, _ = map1.get_locations(path+'/'+cell+'/'+domain, st=0, ch=0, po=1, add=0)
    TAD_ed, _ = map1.get_locations(path+'/'+cell+'/'+domain, st=0, ch=0, po=2, add=-1)
    # TAD is iterated several times below, which needs zip() to return a
    # list (Python 2 behaviour).
    TAD = zip(TAD_st, TAD_ed)
    # W = H*S, scaled by the mean of its non-zero row sums.
    W = np.asarray(map1.contact_group * map1.group_map)
    wm = W.sum(1)
    W /= np.mean(wm[wm>0])
    group = np.argmax(W,1)+1
    group[wm==0] = -1 ## masked regions
    gini = 1-np.power(W,2).sum(1)
    gini[wm==0] = -1 ## masked regions
    log2W = np.log2(W)
    log2W[W==0] = 0
    entropy = (-W*log2W).sum(1)
    entropy[wm==0] = 0
    # NOTE(review): `score` is the same array object as `gini` (no copy), so
    # the next line also replaces the -1 markers in `gini` by 0 and every
    # later `gini>=0` test is true for all bins - confirm this is intended.
    score = gini
    score[score<0] = 0 ## for ploting
    # Plot two 50-bin windows: window 1 and the one holding the entropy
    # maximum (integer division assumed, Python 2).
    for i in [1,np.argmax(entropy)/50]:
        sel = np.arange(i*50, min(W.shape[0],(i+1)*50))
        pos = ['%.fM'%(j*resolution*1e-3) for j in sel]
        if pdf is not None:
            fig = plt.figure()
            axis = fig.add_subplot(211)
            # axis.plot(sel, score[sel], '--k')
        # The inner loops reuse the name `i`; the outer loop rebinds it on
        # its next iteration.
        for i in xrange(W.shape[1]):
            # if W[sel,i].max() > 0.1:
            if pdf is not None:
                axis.plot(sel, W[sel,i], label='C%s'%i)
        for i,j in TAD:
            if i in sel and j in sel:
                if pdf is not None:
                    axis.plot([i,j], [1,1], 'k-', linewidth=2)
        if pdf is not None:
            plt.ylim([0,1.2])
            plt.xticks(sel[::int(len(sel)/5)], pos[::int(len(sel)/5)])
            axis = fig.add_subplot(212)
            from matplotlib.colors import LogNorm
            axis.imshow(map1.contact_map[sel,:][:,sel],
                        interpolation='none', norm=LogNorm(), aspect='equal', cmap='OrRd')
            axis.legend()
            fig.savefig(pdf, format='pdf')
            plt.clf()
    # tad[k] holds a per-TAD label (derived from the TAD midpoint) for bins
    # strictly inside a TAD, 0 elsewhere.
    tad = np.zeros_like(gini)
    tadlen = []
    for i,j in TAD:
        for k in xrange(i+1, j-1):
            tad[k] = (i+j+1)/2 ## regions in the domain
        tadlen.append(j-i)
    # Number of distinct groups inside each TAD.
    tadtype = []
    for i in np.unique(tad):
        if i > 0:
            tadtype.append(len(np.unique(group[tad==i])))
    # Number of distinct TAD labels covered by each group.
    grptype = []
    for i in np.unique(group):
        if i > 0:
            grptype.append(len(np.unique(tad[group==i])))
    show(np.sum(np.logical_and(tad==0,gini>=0))) ## TADs
    for cut in [0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]:
        show(np.sum(gini>=cut)) ## clusters
        show(np.sum(np.logical_and(tad==0,gini>=cut))) ## TADs % clusters
    cci = 0; ccj = 0 ## scores around the TAD boundaries
    ni = 0; nj = 0
    for i,j in TAD:
        # Only windows lying fully inside the score array are summed.
        if i+plot_left >=0 and i+plot_right < len(score):
            cci += score[(i+plot_left):(i+plot_right)]
            ni += 1
        if j+plot_left >=0 and j+plot_right < len(score):
            ccj += score[(j+plot_left):(j+plot_right)]
            nj += 1
    show()
    return cci,ccj,ni,nj,tadlen,tadtype,grptype,gini,entropy
def decompose_dist(pdf, curve, r=None):
    """Decompose the Euclidean-distance heatmap of *curve* in three ways.

    The heatmap is ``V = max(ds) * (ds + 1) ** -2`` where ``ds`` holds the
    pairwise vertex distances.  EIG, NMF (seeded with the EIG factors) and,
    if scikit-learn is usable, K-means results are plotted into *pdf*.

    Args:
        pdf: object with ``savefig()`` receiving one page per plot.
        curve: sequence of vertex coordinates.
        r: number of dimensions; ``choose_size`` is consulted when None.

    NOTE(review): whether ``r = 4`` was inside the ``r is None`` branch is
    not visible in the collapsed source; it is kept inside here - confirm.
    """
    from contact_map import ContactMap, EIG, NMF_main
    map1 = ContactMap()
    curve_show(curve)
    pdf.savefig(); plt.clf()
    verts = np.array(curve)
    map1.plot_map(verts, title="Verteces", log=False)
    pdf.savefig(); plt.clf()
    from scipy.spatial.distance import pdist, squareform
    ds = squareform(pdist(verts, 'euclidean'))
    ## transform distances into a similarity-like heatmap
    V = ds.max() * ((ds+1)**-2)
    map1.plot_map(V, title="Synthetic Heatmap", log=True)
    pdf.savefig(); plt.clf()
    size = V.shape[0]
    plt.loglog([(i+1.0)/size for i in range(size)], trace_sum(V),
               linestyle='-.')
    plt.title('Distribution of interactions along 1D')
    plt.xlabel('Ratio of linked locations to the total length')
    plt.ylabel('Number of observed links')
    pdf.savefig(); plt.clf()
    if r is None:  # identity test instead of ``== None``
        r = choose_size(pdf, V, 9)
        show('Best number of dimentions is %s\n' % r)
        r = 4  # hard-coded override of the suggested size
    # The unreachable ``if False`` PCA variant has been removed.
    Q, M = EIG(V, r)
    for matrix, title in ((Q, 'Eig. Decomp. - Q Matrix'),
                          (M, 'Eig. Decomp. - M Matrix'),
                          (Q*M*Q.T, 'Eig. Decomp. - Recovered')):
        map1.plot_map(matrix, title=title, log=False)
        pdf.savefig(); plt.clf()
    sep_map_show(pdf, verts, Q)
    H, S, obj = NMF_main(V, J='NMF-PoissonManifoldEqual', H=Q, S=M, r=r)
    for matrix, title in ((H*S*H.T, 'NMF Decomp. - Recovered'),
                          (H, 'NMF Decomp. - H Matrix'),
                          (S, 'NMF Decomp. - S Matrix')):
        map1.plot_map(matrix, title=title, log=False)
        pdf.savefig(); plt.clf()
    maxp = np.argmax(np.asarray(H), 0)
    srt = np.argsort(maxp)
    sep_map_show(pdf, verts, H[:, srt])
    try:
        from sklearn.cluster import KMeans
        km = KMeans(n_clusters=r)
        H = -np.matrix(km.fit_transform(V))
        maxp = np.argmax(np.asarray(H), 0)
        srt = np.argsort(maxp)
        map1.plot_map(H, title='K-means Decomp. - H Matrix', log=False)
        pdf.savefig(); plt.clf()
        sep_map_show(pdf, verts, H[:, srt])
    except Exception:
        # Best effort is kept, but Ctrl-C / SystemExit are no longer
        # swallowed as they were by the bare ``except:``.
        print('Please install SK-kit to run K-means')
def plot2(para):
    """Write ``<ExeFile>plot2.pdf``: the masked 10 kb map, a zoom on bins
    200-399, and the zoomed NMF reconstruction for four ``par_lam`` values.

    Args:
        para: mapping whose 'ExeFile' entry prefixes the PDF file name.
    """
    pdf = PdfPages(para['ExeFile']+'plot2.pdf')
    ## initialisation
    map1 = ContactMap('plot2')
    map1.genome_info('../data/yeast_chr_len.txt')
    map1.add_interactions(
        '../data/Duan2010N/interactions_HindIII_fdr0.01_inter.txt')
    map1.add_interactions(
        '../data/Duan2010N/interactions_HindIII_fdr0.01_intra.txt')
    map1.create_binnedmap(binsize=10e3)
    for mask in (map1.mask_diag, map1.mask_short, map1.mask_low):
        mask()

    zoom = np.arange(200, 400)
    views = (map1.contact_map, map1.contact_map[zoom, :][:, zoom])
    for view in views:
        map1.plot_map(view, log=True, vmin=1, vmax=100)
        pdf.savefig()
        plt.clf()

    lambdas = [0, 0.1, 1, 10]
    for l in lambdas:
        map1.reset_solution()
        map1.decompose('NMF-PoissonManifoldEqual', dim_num=55, par_lam=l)
        R = map1.contact_group * map1.group_map * map1.contact_group.T
        map1.plot_map(R[zoom, :][:, zoom], vmin=1, vmax=100, title=str(l))
        pdf.savefig()
        plt.clf()
    pdf.close()
def plot1(para):
    """Plot the 20 kb yeast map and the factors of its decomposition into
    ``<ExeFile>plot1.pdf``.

    Args:
        para: mapping whose 'ExeFile' entry prefixes the PDF file name.

    NOTE(review): the extent of the ``if not map1.load()`` body was
    reconstructed from a collapsed source line; confirm.
    """
    pdf = PdfPages(para['ExeFile'] + 'plot1.pdf')

    def page():
        # Store the current figure as a PDF page and clear it.
        pdf.savefig()
        plt.clf()

    map1 = ContactMap('plot1')
    map1.clear()
    loaded = map1.load()
    if not loaded:
        # Build, mask, decompose and save the map.
        map1.genome_info('../data/yeast_chr_len.txt')
        for kind in ('inter', 'intra'):
            map1.add_interactions(
                '../data/Duan2010N/interactions_HindIII_fdr0.01_%s.txt' % kind)
        map1.create_binnedmap(binsize=20e3)
        map1.mask_diag()
        map1.mask_short()
        map1.mask_low()
        map1.decompose_auto(plot=pdf)
        map1.sort_groups()
        map1.save()

    map1.plot_map(vmin=1, vmax=1000, title='$X$')
    page()
    map1.plot_map(np.diag(map1.bias_vector), log=False, title='$B$')
    page()
    map1.plot_map(map1.contact_group, log=False, title='$H$')
    page()
    map1.plot_map(map1.group_map, log=False, title='$S$')
    page()
    map1.plot_map(map1.group_map * map1.contact_group.T, log=False,
                  title='$W=SH^T$')
    page()
    map1.plot_map(map1.contact_group * map1.group_map * map1.contact_group.T,
                  vmin=1, vmax=1000, title='$R=HSH^T$')
    page()

    grps = map1.label_groups(plot=pdf)
    r = map1.contact_group.shape[1]
    # Rank-one maps of the first and the last two groups.
    for col in (0, r - 2, r - 1):
        column = map1.contact_group[:, col]
        map1.plot_map(column * column.T, vmin=1, title=str(col + 1))
        page()
    map1.plot_map(np.outer(map1.bias_vector, map1.bias_vector), log=False)
    page()
    map1.add_bias_back()
    map1.plot_map(map1.contact_group * map1.group_map * map1.contact_group.T,
                  vmin=1, vmax=1000, title='$R=HSH^T$')
    page()
    pdf.close()
def _intra_cluster_distances(contact_group, dist, r):
    """Pool the pairwise distances between the selected bins of each cluster.

    For every column ``j < r`` of ``contact_group`` the rows whose value
    exceeds that column's mean are selected; the strictly upper-triangular
    entries of ``dist`` restricted to those rows/columns are appended to one
    flat list, which is returned.
    """
    weights = np.asarray(contact_group)
    pooled = []
    for j in range(r):
        members = weights[:, j] > float(contact_group[:, j].mean())
        sub = dist[members, :][:, members]
        # k=1 skips the diagonal, so each unordered pair is counted once.
        pooled += sub[np.triu_indices(sub.shape[0], k=1)].tolist()
    return pooled


def run3(para, name='demo'):
    """Track the NMF objective against average intra-cluster distances.

    Loads ``syn_link.npy`` as the contact map and ``syn_dist.npy`` as the
    distance map, selects the cluster number with ``decompose_auto`` and then
    advances two decompositions (``map1`` started with ``'NND'``, ``map3``
    only reset) 100 times by 20 iterations each.  After every round the
    objective value and the mean pooled intra-cluster distance of both maps
    are reported through ``show``.  The curves for ``map1``, the correlation
    of objective vs. distance and both sub-maps are written to
    ``para['ExeFile'] + 'plot.pdf'``.

    Args:
        para: dict-like object; ``para['ExeFile']`` prefixes the output PDF
            path and ``para['DataPath']`` locates the chromosome-length file.
        name: not used by this function; kept for call compatibility.
    """
    show(''' Compare objective values in NMF and average distances in PDB. ''',
         True)
    pdf = PdfPages(para['ExeFile'] + 'plot.pdf')
    # Close the PDF even when a step below raises, so the handle is not leaked.
    try:
        map1 = ContactMap('Syn3D')
        map1.genome_info(para['DataPath'] +
                         '/Tjong2012GR/yeast_chr_len-Tjong.txt')
        map1.create_binnedmap(3200)
        map2 = map1.duplicate()
        map1.contact_map = np.matrix(np.load('syn_link.npy'), dtype='float')
        map1.get_interactions()
        map1.create_binnedmap(32000)
        map1.mask_diag()
        paras = map1.decompose_auto()
        r = paras[-1][0]
        map3 = map1.duplicate()
        show(r)
        show('is the selected cluster number\n')
        print(map1.contact_map.shape)

        idx, _names = map2.get_locations(map1.output_groups(),
                                         st=0, ch=0, po=1, nm=0, add=0)
        print(len(idx))
        dist_map = np.load('syn_dist.npy')
        show(dist_map.mean())
        show('is the average of all bins\n')
        dist = dist_map[idx, :][:, idx]  ## distance among bins
        show(dist.mean())
        show('is the average of selected bins\n')

        inum = []
        objs = []
        avgs = []
        objs3 = []
        avgs3 = []
        show('\tObjective function values'
             '\tAverage intra-cluster distances\tcase2\n')
        map1.reset_solution()
        map1.decompose('NND', dim_num=r)
        map3.reset_solution()
        # map3.decompose('NND', dim_num=r)
        icc = 0
        step = 20
        for _ in range(100):
            # NOTE(review): presumably each decompose() call resumes from the
            # current solution, making icc a cumulative iteration count --
            # confirm against ContactMap.decompose.
            icc += step
            show(icc)
            inum.append(icc)
            obj = map1.decompose(dim_num=r, par_lam=1, max_iter=step,
                                 stop_thrd=0)
            obj3 = map3.decompose(dim_num=r, par_lam=1, max_iter=step,
                                  stop_thrd=0)
            map1.sort_groups('diagnal')
            show(obj)
            # Summarise each pooled distance list once and reuse the value for
            # both the report and the recorded series.
            avg_mean = mean_std(
                _intra_cluster_distances(map1.contact_group, dist, r))[0]
            avg3_mean = mean_std(
                _intra_cluster_distances(map3.contact_group, dist, r))[0]
            show(avg_mean)
            show(avg3_mean)
            show()
            objs.append(obj)
            objs3.append(obj3)
            avgs.append(avg_mean)
            avgs3.append(avg3_mean)

        plt.plot(inum, objs, 'r-', label='NNDSVD Initialization')
        # plt.plot(inum, objs3, 'b--', label='Random Initialization')
        plt.legend()
        plt.xlabel('Number of iterations')
        plt.ylabel('Objective function values for NMF')
        pdf.savefig()
        plt.clf()

        plt.plot(objs, avgs, 'r.', label='NNDSVD Initialization')
        # plt.plot(objs3, avgs3, 'b+', label='Random Initialization')
        plt.legend()
        plt.xlabel('Objective function values for NMF')
        plt.ylabel('Average intra cluster distances (nm)')
        pdf.savefig()
        plt.clf()

        show('\nCorrelation of objective with the average distances\n')
        show('Pearson Coef.')
        show(correlation(objs, avgs), True)
        show('Spearman Rank Coef.')
        show(correlation(objs, avgs, rank=True), True)

        map1.plot_submap()
        pdf.savefig()
        plt.clf()
        map3.plot_submap()
        pdf.savefig()
        plt.clf()
    finally:
        pdf.close()
# NOTE: this redefines an earlier `plot1` in the same file; being defined
# later, this is the definition that is in effect after the module loads.
def plot1(para):
    """Decompose the 'plot1' contact map and plot its factors to a PDF.

    The output PDF is written to ``para['ExeFile'] + 'plot1.pdf'``.  Pages
    are produced in this order: the default ``plot_map`` view titled ``$X$``,
    ``diag(bias_vector)`` (``$B$``), ``contact_group`` (``$H$``),
    ``group_map`` (``$S$``), ``group_map * contact_group.T`` (``$W=SH^T$``),
    ``contact_group * group_map * contact_group.T`` (``$R=HSH^T$``), whatever
    ``label_groups(plot=pdf)`` draws, the products of columns ``0``, ``r-2``
    and ``r-1`` of ``contact_group`` with their own transposes, the outer
    product of ``bias_vector`` with itself, and finally ``$R=HSH^T$`` again
    after ``add_bias_back()``.

    Args:
        para: dict-like object; only ``para['ExeFile']`` is read, as the
            prefix of the output PDF path.
    """
    pdf = PdfPages(para['ExeFile'] + 'plot1.pdf')
    # Close the PDF even when a step below raises, so the handle is not leaked.
    try:
        ## initialization
        map1 = ContactMap('plot1')
        map1.clear()
        ## read chromosome sizes
        # NOTE(review): the rebuild branch is assumed to run through
        # `map1.save()` (build, mask, decompose, sort, save) -- confirm.
        if not map1.load():
            map1.genome_info('../data/yeast_chr_len.txt')
            datafiles = [
                '../data/Duan2010N/interactions_HindIII_fdr0.01_inter.txt',
                '../data/Duan2010N/interactions_HindIII_fdr0.01_intra.txt',
            ]
            for datafile in datafiles:
                map1.add_interactions(datafile)
            map1.create_binnedmap(binsize=20e3)
            map1.mask_diag()
            map1.mask_short()
            map1.mask_low()
            map1.decompose_auto(plot=pdf)
            map1.sort_groups()
            map1.save()

        map1.plot_map(vmin=1, vmax=1000, title='$X$')
        pdf.savefig()
        plt.clf()
        map1.plot_map(np.diag(map1.bias_vector), log=False, title='$B$')
        pdf.savefig()
        plt.clf()
        map1.plot_map(map1.contact_group, log=False, title='$H$')
        pdf.savefig()
        plt.clf()
        map1.plot_map(map1.group_map, log=False, title='$S$')
        pdf.savefig()
        plt.clf()
        map1.plot_map(map1.group_map * map1.contact_group.T,
                      log=False, title='$W=SH^T$')
        pdf.savefig()
        plt.clf()
        map1.plot_map(
            map1.contact_group * map1.group_map * map1.contact_group.T,
            vmin=1, vmax=1000, title='$R=HSH^T$')
        pdf.savefig()
        plt.clf()

        # The return value was never used; the call is kept for the pages it
        # is asked to add through `plot=pdf`.
        map1.label_groups(plot=pdf)

        r = map1.contact_group.shape[1]
        # First, second-to-last and last columns; titles are 1-based.
        for i in [0, r - 2, r - 1]:
            map1.plot_map(
                map1.contact_group[:, i] * map1.contact_group[:, i].T,
                vmin=1, title=str(i + 1))
            pdf.savefig()
            plt.clf()

        map1.plot_map(np.outer(map1.bias_vector, map1.bias_vector), log=False)
        pdf.savefig()
        plt.clf()

        map1.add_bias_back()
        map1.plot_map(
            map1.contact_group * map1.group_map * map1.contact_group.T,
            vmin=1, vmax=1000, title='$R=HSH^T$')
        pdf.savefig()
        plt.clf()
    finally:
        pdf.close()