def test_to_igraph():
    """Check that the igraph export has the same structure as the SLT adjacency matrix."""
    T1 = SuchTree(gopher_tree)
    T2 = SuchTree(lice_tree)
    links = pd.read_csv(gl_links, index_col=0)
    SLT = SuchLinkedTrees(T1, T2, links)
    g = SLT.to_igraph()

    # igraph returns an unweighted adjacency matrix, so convert the
    # weighted SuchLinkedTrees adjacency matrix to an unweighted form.
    saj = numpy.ceil(SLT.adjacency())

    # igraph's Matrix class does not implement the standard numpy
    # interface, so go through nested lists to build an array.
    iaj = numpy.array(list(map(list, g.get_adjacency())))

    # The matrices must be the same shape...
    assert saj.shape == iaj.shape
    # ...and every element must be equal. `.all()` replaces a hand-rolled
    # `reduce`, which is not a builtin on Python 3.
    assert (saj == iaj).all()
def test_to_igraph():
    """Check that the igraph export has the same structure as the SLT adjacency matrix."""
    T1 = SuchTree(gopher_tree)
    T2 = SuchTree(lice_tree)
    links = pd.read_csv(gl_links, index_col=0)
    SLT = SuchLinkedTrees(T1, T2, links)
    g = SLT.to_igraph()

    # igraph returns an unweighted adjacency matrix, so the weighted
    # SuchLinkedTrees adjacency matrix is rounded up to 0/1 first.
    saj = numpy.ceil(SLT.adjacency())

    # igraph has its own Matrix class without a numpy interface; convert
    # it row by row.
    iaj = numpy.array([list(row) for row in g.get_adjacency()])

    # Same shape and same entries everywhere. `.all()` is used instead of
    # `reduce(lambda a, b: a and b, ...)`, which needs a functools import on
    # Python 3 and loops at Python speed.
    assert saj.shape == iaj.shape
    assert (saj == iaj).all()
def test_get_column_leafs_by_name_as_row_ids():
    """Check get_column_leafs(name, as_row_ids=True) against the link table.

    For every column, the leafs linked in the random link table are mapped
    to their positions in ``SLT.col_ids`` and compared with what
    ``get_column_leafs`` returns when asked for row ids.
    """
    T = SuchTree(test_tree)
    leaf_names = list(T.leafs.keys())
    # randint's upper bound is exclusive, so (0, 4) draws from 0..3 just
    # like the deprecated random_integers(0, 3) did.
    links = pd.DataFrame(numpy.random.randint(0, 4, size=(N, N)),
                         columns=leaf_names,
                         index=leaf_names)
    SLT = SuchLinkedTrees(T, T, links)

    # Loop invariants: boolean view of the links and the id lookup list.
    linked = links.astype(bool)
    col_ids = list(SLT.col_ids)

    for colname in links.columns:
        s = linked[colname]
        leafs1 = {col_ids.index(T.leafs[name]) for name in s[s].index}
        leafs2 = set(SLT.get_column_leafs(colname, as_row_ids=True))
        assert leafs1 == leafs2
def test_get_column_leafs():
    """Check get_column_leafs(n) against the link table, column by column.

    The expected value for column ``n`` is the set of leaf ids of every row
    whose entry in that column of the random link table is non-zero.
    """
    T = SuchTree(test_tree)
    leaf_names = list(T.leafs.keys())
    # randint(0, 4) covers 0..3, matching the deprecated
    # random_integers(0, 3), whose upper bound was inclusive.
    links = pd.DataFrame(numpy.random.randint(0, 4, size=(N, N)),
                         columns=leaf_names,
                         index=leaf_names)
    SLT = SuchLinkedTrees(T, T, links)

    # Convert to booleans once instead of once per column.
    linked = links.astype(bool)

    for n, colname in enumerate(links.columns):
        s = linked[colname]
        leafs1 = {T.leafs[name] for name in s[s].index}
        leafs2 = set(SLT.get_column_leafs(n))
        assert leafs1 == leafs2
def test_get_column_leafs():
    """Check get_column_leafs(n) against the link table, column by column.

    For each column index ``n`` the leaf ids returned by
    ``SLT.get_column_leafs(n)`` must equal the leaf ids of the rows that
    have a non-zero link in that column.
    """
    T = SuchTree(test_tree)
    names = list(T.leafs.keys())
    # random_integers is deprecated; randint with an exclusive upper bound
    # of 4 yields the same 0..3 range.
    links = pd.DataFrame(numpy.random.randint(0, 4, size=(N, N)),
                         columns=names,
                         index=names)
    SLT = SuchLinkedTrees(T, T, links)

    # astype(bool) replaces the deprecated element-wise applymap(bool) and
    # is hoisted out of the loop.
    has_link = links.astype(bool)

    for n, colname in enumerate(links.columns):
        column = has_link[colname]
        expected = {T.leafs[name] for name in column[column].index}
        observed = set(SLT.get_column_leafs(n))
        assert expected == observed
def test_get_column_links():
    """Check get_column_links(n) against the link table, entry by entry.

    The link table's rows are deliberately shuffled relative to the tree's
    leaf order, so the comparison is made by row name via ``SLT.row_names``.
    """
    T = SuchTree(test_tree)
    row_names = list(T.leafs.keys())
    numpy.random.shuffle(row_names)
    # randint(0, 4) covers 0..3, matching the deprecated
    # random_integers(0, 3), whose upper bound was inclusive.
    links = pd.DataFrame(numpy.random.randint(0, 4, size=(N, N)),
                         columns=list(T.leafs.keys()),
                         index=row_names)
    SLT = SuchLinkedTrees(T, T, links)

    # Convert to booleans once instead of once per column.
    linked = links.astype(bool)

    for n, colname in enumerate(links.columns):
        s = linked[colname]
        c = SLT.get_column_links(n)
        for m, rowname in enumerate(SLT.row_names):
            assert s[rowname] == c[m]
def test_get_column_links():
    """Check get_column_links(n) against the link table, entry by entry.

    Rows of the random link table are shuffled before constructing the
    SuchLinkedTrees object; each entry of the returned column is then
    compared with the table entry for the row named at the same position
    in ``SLT.row_names``.
    """
    T = SuchTree(test_tree)
    row_names = list(T.leafs.keys())
    numpy.random.shuffle(row_names)
    # random_integers is deprecated; randint with an exclusive upper bound
    # of 4 yields the same 0..3 range.
    links = pd.DataFrame(numpy.random.randint(0, 4, size=(N, N)),
                         columns=list(T.leafs.keys()),
                         index=row_names)
    SLT = SuchLinkedTrees(T, T, links)

    # astype(bool) replaces the deprecated element-wise applymap(bool) and
    # is hoisted out of the loop.
    has_link = links.astype(bool)

    for n, colname in enumerate(links.columns):
        expected = has_link[colname]
        observed = SLT.get_column_links(n)
        for m, rowname in enumerate(SLT.row_names):
            assert expected[rowname] == observed[m]
def test_get_column_leafs_by_name_as_row_ids():
    """Check get_column_leafs(name, as_row_ids=True) against the link table.

    The expected ids for a column are the positions, within
    ``SLT.col_ids``, of the leaf ids of every row linked in that column.
    """
    T = SuchTree(test_tree)
    names = list(T.leafs.keys())
    # random_integers is deprecated; randint with an exclusive upper bound
    # of 4 yields the same 0..3 range.
    links = pd.DataFrame(numpy.random.randint(0, 4, size=(N, N)),
                         columns=names,
                         index=names)
    SLT = SuchLinkedTrees(T, T, links)

    # Built once: the original rebuilt list(SLT.col_ids) and the boolean
    # frame on every column.
    has_link = links.astype(bool)
    id_positions = list(SLT.col_ids)

    for colname in links.columns:
        column = has_link[colname]
        expected = {id_positions.index(T.leafs[name])
                    for name in column[column].index}
        observed = set(SLT.get_column_leafs(colname, as_row_ids=True))
        assert expected == observed
def test_init_both_trees_by_file():
    """Check that SuchLinkedTrees can be built from two tree file paths."""
    # The tree is loaded here only to obtain leaf names for the link table.
    T = SuchTree(test_tree)
    leaf_names = list(T.leafs.keys())
    # randint(0, 4) covers 0..3, matching the deprecated
    # random_integers(0, 3), whose upper bound was inclusive.
    links = pd.DataFrame(numpy.random.randint(0, 4, size=(N, N)),
                         columns=leaf_names,
                         index=leaf_names)
    # Pass the paths rather than SuchTree objects.
    SLT = SuchLinkedTrees(test_tree, test_tree, links)
    assert isinstance(SLT, SuchLinkedTrees)
def test_link_identities():
    """Check that linklist reports exactly the links placed in the table.

    Two small trees are written to temporary files and loaded, a link table
    is filled from a fixed list of (row leaf, column leaf) name pairs, and
    the ids in ``SLT.linklist`` are translated back to names and compared
    with that list.
    """
    with tempfile.NamedTemporaryFile() as host_file:
        host_file.file.write(b'(A:1,(B:1,(C:1,D:1)E:1)F:1)G:1;')
        # Close the underlying handle so the contents reach disk before
        # the tree is read back by name.
        host_file.file.close()
        T1 = SuchTree(host_file.name)

    with tempfile.NamedTemporaryFile() as guest_file:
        guest_file.file.write(b'((a:1,b:1)e:1,(c:1,d:1)f:1)g:1;')
        guest_file.file.close()
        T2 = SuchTree(guest_file.name)

    ll = (('A', 'a'), ('B', 'c'), ('B', 'd'), ('C', 'd'), ('D', 'd'))

    links = pd.DataFrame(numpy.zeros((4, 4), dtype=int),
                         index=list(T1.leafs.keys()),
                         columns=list(T2.leafs.keys()))
    for row_name, col_name in ll:
        links.at[row_name, col_name] = 1

    SLT = SuchLinkedTrees(T1, T2, links)

    # Invert the name -> id maps so ids can be turned back into names.
    t1_names_by_id = {leaf_id: name for name, leaf_id in T1.leafs.items()}
    t2_names_by_id = {leaf_id: name for name, leaf_id in T2.leafs.items()}

    # Each linklist entry is unpacked as (T2 leaf id, T1 leaf id).
    observed = {(t1_names_by_id[t1_id], t2_names_by_id[t2_id])
                for t2_id, t1_id in SLT.linklist.tolist()}

    assert set(ll) == observed
def test_subset_b():
    """Check that subset_b(1) leaves only the links of the selected columns.

    The expected links are computed from the columns of the link table
    named by ``SLT.TreeB.get_leafs(1)``, and compared with ``SLT.linklist``
    after calling ``SLT.subset_b(1)``.
    """
    T = SuchTree(test_tree)
    row_names = list(T.leafs.keys())
    numpy.random.shuffle(row_names)
    # randint(0, 4) covers 0..3, matching the deprecated
    # random_integers(0, 3), whose upper bound was inclusive.
    links = pd.DataFrame(numpy.random.randint(0, 4, size=(N, N)),
                         columns=list(T.leafs.keys()),
                         index=row_names)
    SLT = SuchLinkedTrees(T, T, links)

    # Reverse lookup: leaf id -> leaf name for TreeB.
    sfeal = dict(zip(SLT.TreeB.leafs.values(), SLT.TreeB.leafs.keys()))
    subset_links = links[[sfeal[leaf_id] for leaf_id in SLT.TreeB.get_leafs(1)]]

    # unstack() yields a Series indexed by (column name, row name).
    stacked = subset_links.unstack()

    SLT.subset_b(1)

    A = {(SLT.TreeB.leafs[col], SLT.TreeA.leafs[row])
         for col, row in stacked[stacked > 0].index}
    B = {(link[0], link[1]) for link in SLT.linklist}
    assert A == B
def test_subset_b():
    """Check that subset_b(1) leaves only the links of the selected columns.

    The columns of the link table named by ``SLT.TreeB.get_leafs(1)`` give
    the expected (TreeB leaf id, TreeA leaf id) pairs; ``SLT.linklist``
    must match them once ``SLT.subset_b(1)`` has been applied.
    """
    T = SuchTree(test_tree)
    row_names = list(T.leafs.keys())
    numpy.random.shuffle(row_names)
    # random_integers is deprecated; randint with an exclusive upper bound
    # of 4 yields the same 0..3 range.
    links = pd.DataFrame(numpy.random.randint(0, 4, size=(N, N)),
                         columns=list(T.leafs.keys()),
                         index=row_names)
    SLT = SuchLinkedTrees(T, T, links)

    # Map TreeB leaf ids back to their names to select table columns.
    names_by_id = dict(zip(SLT.TreeB.leafs.values(), SLT.TreeB.leafs.keys()))
    selected = [names_by_id[leaf_id] for leaf_id in SLT.TreeB.get_leafs(1)]

    # Series indexed by (column name, row name); keep the non-zero entries.
    pairs = links[selected].unstack()
    nonzero = pairs[pairs > 0].index

    SLT.subset_b(1)

    A = {(SLT.TreeB.leafs[col], SLT.TreeA.leafs[row]) for col, row in nonzero}
    B = {(link[0], link[1]) for link in SLT.linklist}
    assert A == B
def test_row_names():
    """Check that row_names matches the leaf names used as the table index."""
    T = SuchTree(test_tree)
    leaf_names = list(T.leafs.keys())
    # randint(0, 4) covers 0..3, matching the deprecated
    # random_integers(0, 3), whose upper bound was inclusive.
    links = pd.DataFrame(numpy.random.randint(0, 4, size=(N, N)),
                         columns=leaf_names,
                         index=leaf_names)
    SLT = SuchLinkedTrees(T, T, links)
    assert SLT.row_names == list(T.leafs.keys())
def test_col_ids():
    """Check that col_ids lists the tree's leaf ids in the same order."""
    T = SuchTree(test_tree)
    leaf_names = list(T.leafs.keys())
    # randint(0, 4) covers 0..3, matching the deprecated
    # random_integers(0, 3), whose upper bound was inclusive.
    links = pd.DataFrame(numpy.random.randint(0, 4, size=(N, N)),
                         columns=leaf_names,
                         index=leaf_names)
    SLT = SuchLinkedTrees(T, T, links)

    col_ids = SLT.col_ids
    leaf_ids = T.leafs.values()
    # Check the lengths first, because zip() would silently truncate.
    assert len(col_ids) == len(leaf_ids)
    for col_id, leaf_id in zip(col_ids, leaf_ids):
        assert col_id == leaf_id
def test_linkmatrix_property():
    """Check every linkmatrix entry against the (row-shuffled) link table.

    ``SLT.linkmatrix[row_id][col_id]`` must equal the truth value of the
    table entry for the row and column names found at those positions in
    ``SLT.row_names`` and ``SLT.col_names``.
    """
    T = SuchTree(test_tree)
    row_names = list(T.leafs.keys())
    numpy.random.shuffle(row_names)
    # randint(0, 4) covers 0..3, matching the deprecated
    # random_integers(0, 3), whose upper bound was inclusive.
    links = pd.DataFrame(numpy.random.randint(0, 4, size=(N, N)),
                         columns=list(T.leafs.keys()),
                         index=row_names)
    SLT = SuchLinkedTrees(T, T, links)

    # enumerate() supplies the positions directly, instead of searching the
    # name lists with .index() for every cell.
    for col_id, col in enumerate(SLT.col_names):
        for row_id, row in enumerate(SLT.row_names):
            assert bool(links.loc[row, col]) == SLT.linkmatrix[row_id][col_id]
def test_linklist_property():
    """Check that linklist contains exactly the non-zero links of the table.

    Expected pairs are (TreeB leaf id, TreeA leaf id) for every non-zero
    entry of the row-shuffled random link table.
    """
    T = SuchTree(test_tree)
    row_names = list(T.leafs.keys())
    numpy.random.shuffle(row_names)
    # randint(0, 4) covers 0..3, matching the deprecated
    # random_integers(0, 3), whose upper bound was inclusive.
    links = pd.DataFrame(numpy.random.randint(0, 4, size=(N, N)),
                         columns=list(T.leafs.keys()),
                         index=row_names)
    SLT = SuchLinkedTrees(T, T, links)

    # unstack() yields a Series indexed by (column name, row name).
    stacked = links.unstack()

    A = {(SLT.TreeB.leafs[col], SLT.TreeA.leafs[row])
         for col, row in stacked[stacked > 0].index}
    B = {(link[0], link[1]) for link in SLT.linklist}
    assert A == B
def simtree(prefix,
            birth_rate=0.3,
            death_rate=0.1,
            min_host_leafs=8,
            max_host_leafs=64,
            min_guest_leafs=4,
            max_guest_leafs=128,
            duplication_rate=0.2,
            loss_rate=0.1,
            switch_rate=0.05,
            k=2.0,
            theta=0.5):
    '''
    Simulate a host tree and a guest tree with JPrIME, link them, and
    write summary files into the directory `prefix`.

    Time interval is always 1.0 units, and GuestTreeGen stops after
    1000 attempts.

    Steps, in order:
      1. Create `prefix` if it does not exist.
      2. Run HostTreeGen and BranchRelaxer for the host tree, then
         GuestTreeGen and BranchRelaxer for the guest tree
         (`java ... -jar jprime.jar <tool> ...`).
      3. Load both relaxed trees and build a 0/1 link matrix from the
         guest leaf names, which are split on '_' into (guest, host).
      4. Write links.csv, adjacency.png, cophylo.pdf (via R),
         eigenvalues.csv, densities.txt, distances.txt, correlation.png,
         moments.csv and data.csv under `prefix`.

    Parameters
    ----------
    prefix : output directory; also used to build every output path.
    birth_rate, death_rate : passed to HostTreeGen.
    min_host_leafs, max_host_leafs : passed to HostTreeGen as -min / -max.
    min_guest_leafs, max_guest_leafs : passed to GuestTreeGen as -min / -max.
    duplication_rate, loss_rate, switch_rate : passed to GuestTreeGen.
    k, theta : passed to BranchRelaxer after 'IIDGamma'.

    Returns nothing; all results are written to disk.

    Raises
    ------
    JPrIMEError if any of the JPrIME invocations exits non-zero.
    '''

    max_guest_attempts = 1000

    def path(name):
        # Every output lives directly under `prefix`. Plain '/' joins are
        # kept because the same strings are pasted into the R script.
        return prefix + '/' + name

    def run_jprime(arguments, failure_message):
        # Run one JPrIME tool and turn a non-zero exit into JPrIMEError.
        status = subprocess.call(['java'] + java_ops
                                 + ['-jar', 'jprime.jar'] + arguments)
        if not status == 0:
            raise JPrIMEError(failure_message)

    # make output directory
    if not exists(prefix):
        mkdir(prefix)

    # build the host tree
    run_jprime(['HostTreeGen', '-bi',
                '-min', str(min_host_leafs),
                '-max', str(max_host_leafs),
                '1.0', str(birth_rate), str(death_rate),
                path('host')],
               'HostTreeGen failed.')

    run_jprime(['BranchRelaxer',
                '-o', path('host.relaxed.tree'),
                path('host.pruned.tree'),
                'IIDGamma', str(k), str(theta)],
               'BranchRelaxer failed on host tree.')

    # build the guest tree
    run_jprime(['GuestTreeGen',
                '--max-attempts', str(max_guest_attempts),
                '-min', str(min_guest_leafs),
                '-max', str(max_guest_leafs),
                path('host.pruned.tree'),
                str(duplication_rate), str(loss_rate), str(switch_rate),
                path('guest')],
               'GuestTreeGen failed.')

    run_jprime(['BranchRelaxer',
                '-o', path('guest.relaxed.tree'),
                path('guest.pruned.tree'),
                'IIDGamma', str(k), str(theta)],
               'BranchRelaxer failed on guest tree.')

    # load the trees
    T1 = SuchTree(path('host.relaxed.tree'))
    T2 = SuchTree(path('guest.relaxed.tree'))

    # populate the link matrix using the leaf names
    # On Python 3, dict.keys() is a view without .index(), so materialise
    # the names as lists and look host positions up through a dict.
    hostnames = list(T1.leafs.keys())
    guestnames = list(T2.leafs.keys())
    host_index = {name: i for i, name in enumerate(hostnames)}

    l = zeros((T1.n_leafs, T2.n_leafs), dtype=int)
    for j, guest_leaf in enumerate(guestnames):
        # Guest leaf names have the form '<guest>_<host>'.
        _guest, host = guest_leaf.split('_')
        l[host_index[host], j] = 1

    links = pandas.DataFrame(l, index=hostnames, columns=guestnames)
    links.to_csv(path('links.csv'))

    # initialize the SuchLinkedTrees object
    SLT = SuchLinkedTrees(T1, T2, links)

    # plot the adjacency matrix
    aj = SLT.adjacency()
    lp_plot = seaborn.heatmap(aj.T, cmap='viridis',
                              vmin=0, vmax=1,
                              cbar=False, square=True,
                              xticklabels=False, yticklabels=False)
    lp_plot.invert_yaxis()
    fig = lp_plot.get_figure()
    # NOTE(review): `size` is not a documented savefig argument; confirm
    # the installed matplotlib accepts it.
    fig.savefig(path('adjacency.png'), size=6)
    fig.clf()

    # plot cophylogeny using R
    r_code = '''
    tr1 <- read.tree( "HOST_TREE" )
    tr2 <- read.tree( "GUEST_TREE" )
    links <- read.csv( "LINKS", row.names=1, stringsAsFactors = F )
    im <- graph_from_incidence_matrix( as.matrix( links ) )
    assoc <- as_edgelist( im )
    obj <- cophylo( tr1, tr2, assoc=assoc )
    pdf( "OUTFILE", width = 10, height = 12 )
    plot( obj )
    dev.off()
    '''

    r_code = r_code.replace('HOST_TREE', path('host.relaxed.tree'))
    r_code = r_code.replace('GUEST_TREE', path('guest.relaxed.tree'))
    r_code = r_code.replace('LINKS', path('links.csv'))
    r_code = r_code.replace('OUTFILE', path('cophylo.pdf'))

    robjects.r(r_code)

    # calculate spectral densities
    lambdas = SLT.spectrum()
    a_lambd = eigvalsh(SLT.TreeA.laplacian()['laplacian'])
    b_lambd = eigvalsh(SLT.TreeB.laplacian()['laplacian'])

    with open(path('eigenvalues.csv'), 'w') as f:
        f.write('graph ' + ','.join(map(str, lambdas)) + '\n')
        f.write('TreeA ' + ','.join(map(str, a_lambd)) + '\n')
        f.write('TreeB ' + ','.join(map(str, b_lambd)) + '\n')

    # Each spectrum is scaled by its largest eigenvalue before the kernel
    # density is evaluated on a common grid.
    bandwidth = 0.4
    X = linspace(-0.5, 1.5, 200)
    density = gaussian_kde(lambdas / max(lambdas), bw_method=bandwidth).pdf(X)
    a_dnsty = gaussian_kde(a_lambd / max(a_lambd), bw_method=bandwidth).pdf(X)
    b_dnsty = gaussian_kde(b_lambd / max(b_lambd), bw_method=bandwidth).pdf(X)

    with open(path('densities.txt'), 'w') as f:
        f.write('graph ' + ','.join(map(str, density)) + '\n')
        f.write('TreeA ' + ','.join(map(str, a_dnsty)) + '\n')
        f.write('TreeB ' + ','.join(map(str, b_dnsty)) + '\n')

    # calculate Hommola correlation
    d = SLT.linked_distances()
    r, p = pearsonr(d['TreeA'], d['TreeB'])

    with open(path('distances.txt'), 'w') as f:
        f.write('TreeA ' + ','.join(map(str, d['TreeA'])) + '\n')
        f.write('TreeB ' + ','.join(map(str, d['TreeB'])) + '\n')

    # save jointplot of patristic distances
    # NOTE(review): newer seaborn releases take `height` instead of `size`
    # and keyword-only data arguments; confirm against the pinned version.
    jp = seaborn.jointplot(d['TreeA'], d['TreeB'], size=6)
    jp.savefig(path('correlation.png'))
    jp.fig.clf()

    # output moment data
    moments = {
        'eigengap': lambdas[-1] - lambdas[-2],
        'skew': skew(density),
        'kurtosis': kurtosis(density),
        'treedist': pdd(a_dnsty, b_dnsty),
        'occupancy': (2.0 * SLT.n_links)
                     / (SLT.TreeA.n_leafs + SLT.TreeB.n_leafs),
        'squareness': float(SLT.TreeA.n_leafs) / SLT.TreeB.n_leafs,
        'r': r,
        'p': p,
    }

    with open(path('moments.csv'), 'w') as f:
        f.write(','.join(moments.keys()) + '\n')
        f.write(','.join(map(str, moments.values())))

    # output simulation parameters
    data = {
        'prefix': prefix,
        'host_leafs': T1.n_leafs,
        'guest_leafs': T2.n_leafs,
        'links': SLT.n_links,
        'birth_rate': birth_rate,
        'death_rate': death_rate,
        'min_host_leafs': min_host_leafs,
        'max_host_leafs': max_host_leafs,
        'min_guest_leafs': min_guest_leafs,
        'max_guest_leafs': max_guest_leafs,
        'duplication_rate': duplication_rate,
        'loss_rate': loss_rate,
        'switch_rate': switch_rate,
        'k': k,
        'theta': theta,
    }

    with open(path('data.csv'), 'w') as f:
        f.write(','.join(data.keys()) + '\n')
        f.write(','.join(map(str, data.values())))