def test_good_manipulation(self):
    """Exercise valid item access, assignment, deletion and insertion."""
    dm = DistanceMatrix(self.names, self.matrix)

    def check_state(expected_names, expected_rows):
        # Compare both the labels and the lower-triangular rows in one go.
        self.assertEqual(dm.names, expected_names)
        self.assertEqual(dm.matrix, expected_rows)

    # Reading: whole rows and single cells, by position and by name.
    self.assertEqual(dm[1], [1, 0, 3, 5])
    for row, col in ((2, 1), (1, 2)):
        self.assertEqual(dm[row, col], 3)
        self.assertEqual(dm[row][col], 3)
    self.assertEqual(dm['Alpha'], [0, 1, 2, 4])
    self.assertEqual(dm['Gamma', 'Delta'], 6)

    # Writing: replace a whole row addressed by name.
    dm['Alpha'] = [0, 10, 20, 40]
    self.assertEqual(dm['Alpha'], [0, 10, 20, 40])

    # Delete by position, then put the entry back at the same position.
    del dm[1]
    check_state(['Alpha', 'Gamma', 'Delta'], [[0], [20, 0], [40, 6, 0]])
    dm.insert('Beta', [1, 0, 3, 5], 1)
    check_state(self.names, [[0], [1, 0], [20, 3, 0], [40, 5, 6, 0]])

    # Delete by name, then insert without a position (lands at the end).
    del dm['Alpha']
    check_state(['Beta', 'Gamma', 'Delta'], [[0], [3, 0], [5, 6, 0]])
    dm.insert('Alpha', [1, 2, 4, 0])
    check_state(['Beta', 'Gamma', 'Delta', 'Alpha'],
                [[0], [3, 0], [5, 6, 0], [1, 2, 4, 0]])
def test_format_phylip(self):
    """Check the PHYLIP dump: a count header followed by one row per name."""
    dm = DistanceMatrix(self.names, self.matrix)
    buffer = StringIO()
    dm.format_phylip(buffer)
    written = buffer.getvalue().splitlines()

    # One header line plus one line per entry in the matrix.
    self.assertEqual(len(written), len(dm) + 1)
    # The header ends with the number of entries.
    self.assertTrue(written[0].endswith(str(len(dm))))
    # Every data line begins with the matching name, in order.
    for expected_name, row in zip(self.names, written[1:]):
        self.assertTrue(row.startswith(expected_name))
def score_to_matrix(list_with_scores):
    """Help function that returns a distance matrix for guide tree generation.

    Each entry is read as ``[graph_a, graph_b, score, ...]``. The two graph
    names of every entry are ordered, the entries are sorted, and the scores
    are grouped by the first name to form lower-triangular rows.

    The input list and its entries are left untouched (entries may therefore
    also be tuples); all reordering happens on private copies.

    Raises:
        IndexError: if ``list_with_scores`` is empty.
    """
    # Normalise every pair so the larger name comes first, on a copy.
    ordered = []
    for entry in list_with_scores:
        low, high = sorted((entry[0], entry[1]))
        ordered.append([high, low, *entry[2:]])

    # Descending lexicographic order of the normalised entries.
    ordered.sort()
    ordered.reverse()

    # Consecutive entries sharing the same first name form one score row.
    names = []
    scores = []
    for entry in ordered:
        if entry[0] not in names:
            names.append(entry[0])
            scores.append([0, entry[2]])
        else:
            scores[-1].append(entry[2])

    # The smallest name never appears in first position; add it with a
    # diagonal-only row.
    names.append(ordered[-1][1])
    scores.append([0])

    # Rows were built back to front; flip names, rows and each row's content
    # so the diagonal zero ends up last in every row.
    names.reverse()
    scores.reverse()
    for row in scores:
        row.reverse()

    return DistanceMatrix(names, scores)
def construct_tree(gene_name, with_marburg=1, algorithm='UPGMA'):
    """Build, save and return a distance tree for one gene.

    Args:
        gene_name: Gene whose edit matrix is read from
            ``./Output/edit_matrices/``.
        with_marburg: NOTE the inverted meaning. ``1`` (the default) builds
            the tree for the five viruses *without* Marburg; any other value
            adds Marburg, reads its genome and runs the ``Alignment`` steps.
        algorithm: ``'NJ'`` selects neighbor joining; every other value
            falls back to UPGMA.

    Returns:
        The tree produced by the selected constructor method.
    """
    if with_marburg == 1:
        print('Constructing Tree with All Viruses without Marburg')
        filename = algorithm + '_' + gene_name
        names = ['Bundibugyo', 'Reston', 'Sudan', 'TaiForest', 'Zaire']
    else:
        print('Constructing {0}\'s Tree with All Viruses with Marburg'.format(gene_name))
        filename = algorithm + '_' + gene_name + '_with_Marburg'
        names = ['Bundibugyo', 'Reston', 'Sudan', 'TaiForest', 'Zaire', 'Marburg']
        marburg_genome = SeqIO.read("./Data/Marburg_genome.fasta", "fasta")
        Alignment.read_data()
        print('Aligning Genes for marburg_genome')
        # The Marburg variant uses its own edit-matrix file.
        gene_name += '_with_marburg'
        Alignment.read_genes(marburg_genome)

    print('Reading edit matrix and construct tree')
    csv_path = "./Output/edit_matrices/" + gene_name + ".csv"
    raw_matrix = pd.read_csv(csv_path, header=None)
    constructor = DistanceTreeConstructor()
    # DistanceMatrix wants the lower-triangular form of the edit distances.
    lower_triangle = convert_tu_lower_triangular(raw_matrix)
    distance_matrix = DistanceMatrix(names=names, matrix=lower_triangle)

    build = constructor.nj if algorithm == 'NJ' else constructor.upgma
    tree = build(distance_matrix)

    save_tree(tree, filename)
    return tree
def summarise_dist(self, rf_results: RfResults, dir_out):
    """Write an NJ tree and a clustered heatmap of the Robinson-Foulds distances.

    Runs twice, once for the normalised and once for the un-normalised
    distance, and writes into ``dir_out``:

    * ``rf_normed.tree`` / ``rf_un_normed.tree`` (Newick)
    * ``rf_normed_heatmap.svg`` / ``rf_un_normed_heatmap.svg``

    Args:
        rf_results: Object whose ``data`` maps ``(tid_a, tid_b)`` to
            ``(rf, norm_rf)``.
        dir_out: Directory the output files are written to.

    Raises:
        KeyError: if a pair of tree ids has no entry in ``rf_results.data``.
    """
    cmap = sns.cubehelix_palette(100, reverse=True)
    fig_size = (15, 15)

    for use_norm in (True, False):
        if use_norm:
            path_out = os.path.join(dir_out, 'rf_normed.tree')
            path_hm = os.path.join(dir_out, 'rf_normed_heatmap.svg')
            plt_title = 'Normalised Robinson-Foulds Distance'
        else:
            path_out = os.path.join(dir_out, 'rf_un_normed.tree')
            path_hm = os.path.join(dir_out, 'rf_un_normed_heatmap.svg')
            plt_title = '(un)Normalised Robinson-Foulds Distance'

        # Symmetric lookup table: metrics[a][b] == metrics[b][a].
        metrics = defaultdict(dict)
        for (tid_a, tid_b), (rf, norm_rf) in rf_results.data.items():
            value = norm_rf if use_norm else rf
            metrics[tid_a][tid_b] = value
            metrics[tid_b][tid_a] = value

        # Every id seen on either side of a pair is a key of ``metrics``.
        labels = sorted(metrics)

        # Build the ragged lower triangle (for DistanceMatrix) and the full
        # square matrix (for the heatmap) together.
        mat_vals = []
        mat = np.zeros((len(labels), len(labels)))
        for i, tid_a in enumerate(labels):
            cur_row = []
            for j in range(i):
                dist = metrics[tid_a][labels[j]]
                cur_row.append(dist)
                mat[i, j] = dist
            cur_row.append(0.0)  # diagonal
            mat_vals.append(cur_row)
        mat = mat + mat.T  # mirror the lower triangle into the upper one

        # Newick
        dm = DistanceMatrix(names=labels, matrix=mat_vals)
        tree = DistanceTreeConstructor().nj(dm)
        Phylo.write(tree, path_out, 'newick')

        # Heatmap
        sns.set(font_scale=1)
        rf_df = pd.DataFrame(mat, columns=labels, index=labels)
        grid = sns.clustermap(rf_df, annot=True, fmt='.3f', cmap=cmap,
                              figsize=fig_size)
        grid.fig.suptitle(plt_title)
        plt.savefig(path_hm)
        # Release the figure; otherwise each pass leaves one open.
        plt.close(grid.fig)
def buildPhyloDM(groups):
    """Build a DistanceMatrix over ``groups`` with descriptive row names.

    Each name is the group's position; when the group has ``types`` it is
    followed by ``"|"`` and the ``List2String`` rendering of the
    ``[type, count]`` pairs whose share of the group exceeds ``pcut``,
    most frequent first. Distances come from ``distanceij``; the diagonal
    is 0.

    Args:
        groups: Indexable sequence of objects exposing a ``types`` attribute
            (``None`` or a sequence supporting ``count``).

    Returns:
        DistanceMatrix with one entry per group.
    """
    pcut = 0.2  # minimum fraction of the group a type needs to be listed

    names = []
    for position, group in enumerate(groups):
        if group.types is not None:
            total = len(group.types)
            # Count each distinct type once and reuse the result.
            counts = [[item, group.types.count(item)] for item in set(group.types)]
            frequent = [pair for pair in counts if pair[1] * 1.0 / total > pcut]
            frequent = sorted(frequent, key=lambda pair: pair[1], reverse=True)
            label = str(position) + "|" + List2String(frequent)
        else:
            label = str(position)
        names.append(label)

    # Lower-triangular rows: pairwise distances followed by the diagonal 0.
    matrix = [
        [distanceij(groups[i], groups[j]) for j in range(i)] + [0]
        for i in range(len(groups))
    ]
    return DistanceMatrix(names, matrix)
def _convert(m, names):
    """Convert a square matrix into a lower-triangular DistanceMatrix.

    Args:
        m: Matrix supporting ``len(m)`` and ``m[i, :k].tolist()``
            (e.g. a 2-D numpy array).
        names: Labels for the rows, or ``None`` to generate
            ``S1``, ``S2``, ... automatically.

    Returns:
        DistanceMatrix holding rows ``m[i, :i + 1]``.
    """
    # Previously caller-supplied names were never used and the function
    # failed with an unbound local; fall back to generated names only
    # when none are given.
    if names is None:
        names = [f"S{index}" for index in range(1, len(m) + 1)]

    # Row i keeps columns 0..i (the diagonal included).
    lower_triangle = [m[i, :i + 1].tolist() for i in range(len(m))]
    return DistanceMatrix(names=list(names), matrix=lower_triangle)
def score_to_distance(score_matrix):
    """Turn a score matrix into a DistanceMatrix scaled by the maximum score.

    Every score ``v`` becomes ``(v + max - 2 * v) / max``, so the highest
    score maps to 0. Only the lower triangle (diagonal included) of the
    result is kept, and the names of ``score_matrix`` are reused.
    """
    raw_scores = np.array(list(score_matrix))
    top_score = np.max(raw_scores)

    def flip(value):
        # Operation order kept as-is so floating-point results do not shift.
        return (value + top_score - 2 * value) / top_score

    distances = np.vectorize(flip)(raw_scores)

    lower_triangle = []
    for row_index, row in enumerate(distances):
        lower_triangle.append([float(cell) for cell in row[:row_index + 1]])

    return DistanceMatrix(score_matrix.names, matrix=lower_triangle)
def test_good_construction(self):
    """A valid names/matrix pair yields a correctly populated DistanceMatrix."""
    dm = DistanceMatrix(self.names, self.matrix)
    expected_repr = (
        "DistanceMatrix(names=['Alpha', 'Beta', 'Gamma', 'Delta'], "
        "matrix=[[0], [1, 0], [2, 3, 0], [4, 5, 6, 0]])"
    )

    self.assertIsInstance(dm, TreeConstruction.DistanceMatrix)
    self.assertEqual(dm.names[0], 'Alpha')
    self.assertEqual(dm.matrix[2][1], 3)
    self.assertEqual(len(dm), 4)
    self.assertEqual(repr(dm), expected_repr)
def test_correct_answer(self):
    """NJ_tree must match DistanceTreeConstructor.nj on six fixture files.

    Trees are compared as undirected networkx graphs via isomorphism, so
    only the topology is checked.
    """
    for case in range(6):
        names, matrix = read_matrix('tests/test{}.txt'.format(case))

        expected_tree = DistanceTreeConstructor().nj(DistanceMatrix(names, matrix))
        actual_tree = NJ_tree().create_tree(names, matrix)

        expected_graph = Phylo.to_networkx(expected_tree).to_undirected()
        actual_graph = Phylo.to_networkx(actual_tree).to_undirected()
        self.assertTrue(nx.is_isomorphic(expected_graph, actual_graph))
def print_trees(country, position_table):
    """Draw NJ and UPGMA trees for the sequences furthest from the consensus.

    Args:
        country: Label printed (upper-cased) above the trees.
        position_table: DataFrame with a ``seqid`` column; the remaining
            columns are compared value by value between sequences.

    ``display`` is not defined here; presumably the notebook helper.
    """
    # Consensus sequence: the most common value of every non-id column.
    consensus_seq = position_table.drop('seqid', axis=1).mode(axis=0).T[0]
    position_table = position_table.set_index('seqid')

    # Rank samples by the number of columns that differ from the consensus.
    mismatch_counts = position_table.apply(
        lambda row: sum(row != consensus_seq), axis=1)
    ranked = mismatch_counts.sort_values(ascending=False)

    # Keep the 10 most divergent sequences for this first analysis.
    subset_seqs = ranked[:10].index

    # Pairwise mismatch counts, stored symmetrically.
    pairwise = {}
    for i, seqid1 in enumerate(subset_seqs):
        pairwise[seqid1, seqid1] = 0
        for seqid2 in subset_seqs[i + 1:]:
            mismatches = sum(
                position_table.loc[seqid1] != position_table.loc[seqid2])
            pairwise[seqid1, seqid2] = mismatches
            pairwise[seqid2, seqid1] = mismatches
    square = pd.Series(pairwise).unstack()

    # DistanceMatrix wants ragged lower-triangular rows (diagonal included).
    lower_triangle = [
        row[:i + 1] for i, row in enumerate(np.tril(square.values).tolist())
    ]
    dm = DistanceMatrix(list(square.index), lower_triangle)

    constructor = DistanceTreeConstructor()
    nj_tree = constructor.nj(dm)
    print(country.upper())
    print("Neighbor Joining Tree")
    nj_tree.ladderize()  # Flip branches so deeper clades are displayed at top
    display(Phylo.draw(nj_tree))

    # **Please see the guidance at the top of the page for what to try**
    if len(dm) > 1:
        # Clustering with the Unweighted Pair Group Method with Arithmetic
        # Mean (UPGMA) -- stepwise differences.
        upgma_tree = constructor.upgma(dm)
        print("UPGMA Tree")
        upgma_tree.ladderize()  # Flip branches so deeper clades are displayed at top
        display(Phylo.draw(upgma_tree))
def draw(self):
    """Visualize the phylo tree as ASCII art, built with UPGMA.

    Reads ``self.distMat`` (anything with ``tolist()`` returning square
    rows) and ``self.names``.
    """
    # Take the lower triangle by position. Filtering on ``value > 0``
    # would also drop the diagonal zeros and any zero distance between
    # distinct entries, leaving rows of the wrong length for
    # DistanceMatrix.
    lower_triangle = [
        row[:i + 1] for i, row in enumerate(self.distMat.tolist())
    ]
    constructor = DistanceTreeConstructor()
    upgmatree = constructor.upgma(DistanceMatrix(self.names, lower_triangle))
    Phylo.draw_ascii(upgmatree)
def dendrogram_biopython(condensed_distance_matrix_jaccard, organisms):
    """
    Create a lower triangle matrix. Then create a biopython dendrogram.

    The neighbor-joining tree is printed as ASCII art and the distance
    matrix is written in PHYLIP format to ``test.phy`` in the current
    working directory.

    Parameters
    ----------
    condensed_distance_matrix_jaccard: ndarray
        Condensed Jaccard distance matrix
    organisms: list
        organisms names
    """
    from Bio.Phylo.TreeConstruction import DistanceTreeConstructor, DistanceMatrix
    from Bio.Phylo import draw_ascii

    square = squareform(condensed_distance_matrix_jaccard)
    # Row i keeps columns 0..i (diagonal included).
    lower_triangle_matrix = [list(row[:i + 1]) for i, row in enumerate(square)]

    constructor = DistanceTreeConstructor()
    dm = DistanceMatrix(organisms, lower_triangle_matrix)
    tree = constructor.nj(dm)
    draw_ascii(tree)

    # Close the file deterministically instead of leaking the handle.
    with open('test.phy', 'w') as handle:
        dm.format_phylip(handle)
def random_score_matrix(list_of_graph_names):
    """Help function that generates random scores for a distance matrix.

    Row ``i`` holds ``i`` random scores drawn from 0.01..1.00 followed by
    the diagonal 0.
    """
    scores = []
    for row_index in range(len(list_of_graph_names)):
        # Draw in order, then flip, so the row matches the draw order of
        # the append-then-reverse construction.
        row = [randint(1, 100) / 100 for _ in range(row_index)]
        row.reverse()
        row.append(0)
        scores.append(row)

    # generating distance matrix
    return DistanceMatrix(list_of_graph_names, scores)
def run_optimization():
    """Compare all sample curves pairwise, plot the distances and draw trees.

    Loads the samples with ``get_data``, fills a symmetric matrix with
    ``abs(compare_curves(...))`` for every pair, shows it with matplotlib
    and prints neighbor-joining and UPGMA trees as ASCII art.

    Returns:
        The square ``num_samples x num_samples`` numpy distance matrix.
    """
    params = get_data()
    num_samples = 16
    NUM_OF_VERTICES = 200
    sample_names = ['a1', 'a2', 'a3', 'a4', 'b1', 'b2', 'b3', 'b4',
                    'c1', 'c2', 'c3', 'c4', 'd1', 'd2', 'd3', 'd4']

    # Pairwise distances; only the upper triangle is computed, then mirrored.
    distances = np.zeros((num_samples, num_samples))
    for i in range(num_samples):
        for j in range(i + 1, num_samples):
            print("working on the pair", (i, j))
            distances[i, j] = np.abs(
                compare_curves(params[i], params[j],
                               num_of_verts=NUM_OF_VERTICES))
            distances[j, i] = distances[i, j]

    # Plot distance matrix and make phylogenetic tree
    plt.matshow(distances)
    plt.colorbar()
    # Was a bare attribute reference (missing call parentheses), so the
    # figure was never displayed.
    plt.show()

    # Lower triangle sized by num_samples rather than a repeated literal.
    lower_triangle = [list(distances[i, :i + 1]) for i in range(num_samples)]
    distance_matrix = DistanceMatrix(names=sample_names, matrix=lower_triangle)

    constructor = DistanceTreeConstructor()
    tree_up = constructor.upgma(distance_matrix)
    tree_nj = constructor.nj(distance_matrix)
    Phylo.draw_ascii(tree_nj)
    Phylo.draw_ascii(tree_up)
    return distances
def get_phylogenetic_tree(max_str_len=1, norm="JSD", cpc_function="Square25", joining_alg="nj"):
    """Build a tree from the input files and write it in Newick format.

    The tree is written to ``phylo-tree/result.xml``; nothing is returned.

    Args:
        max_str_len: Forwarded to ``pd_matrix``.
        norm: Forwarded to ``pd_matrix``.
        cpc_function: Forwarded to ``pd_matrix``.
        joining_alg: ``"nj"`` or ``"upgma"``.

    Raises:
        ValueError: if ``joining_alg`` is not a supported algorithm.
    """
    # Fail before doing any work; an unknown value used to surface only
    # later as an unbound ``tree``.
    if joining_alg not in ("nj", "upgma"):
        raise ValueError(
            "joining_alg must be 'nj' or 'upgma', got {!r}".format(joining_alg))

    desc, genes = iter_over_files()
    # Pass the caller's cpc_function through instead of a fixed literal.
    pm = pd_matrix(genes, max_str_len=max_str_len, norm=norm,
                   cpc_function=cpc_function)
    pm = convert_triangle(pm)
    dm = DistanceMatrix(names=desc, matrix=pm)

    constructor = DistanceTreeConstructor()
    if joining_alg == "nj":
        tree = constructor.nj(dm)
    else:
        tree = constructor.upgma(dm)
    Phylo.write(tree, 'phylo-tree/result.xml', 'newick')
def main():
    """Build and draw a neighbor-joining tree for the sequences in argv[1].

    Converts ``scores.txt`` to distances, loads ``distances.txt`` as a
    matrix, prints some diagnostics and draws the tree with matplotlib.
    """
    sequences = read_files(sys.argv[1])
    gen_score_file_to_distance_file("scores.txt", sequences)
    matrix, seq_ids = gen_matrix_from_pair_ids_and_value(path='distances.txt')

    # Diagnostics: the raw matrix and the sizes that must agree.
    print(matrix)
    print(len(matrix))
    print(len(seq_ids))

    distance_matrix = DistanceMatrix(names=seq_ids, matrix=matrix)
    print(distance_matrix)

    tree = DistanceTreeConstructor().nj(distance_matrix)

    figure = plt.figure(figsize=(12, 5), dpi=100)
    tree_axes = figure.add_subplot(1, 1, 1)
    Phylo.draw(tree, axes=tree_axes)
def leaf_distance_from_tree(tree):
    """Return the leaf-to-leaf path distances of ``tree`` as a DistanceMatrix.

    The distance between two leaves is the sum of each leaf's
    ``node_distance`` to their most recent common ancestor. Names are the
    taxon labels of ``tree.leaf_nodes()`` in iteration order.
    """
    labels = [leaf.taxon.label for leaf in tree.leaf_nodes()]
    # Resolve every label to its node once, not once per pair.
    nodes = [tree.find_node_with_taxon_label(label) for label in labels]

    rows = []
    for i, label_i in enumerate(labels):
        row = []
        for j in range(i):
            ancestor = tree.mrca(taxon_labels=[label_i, labels[j]])
            dist_i = node_distance(nodes[i], ancestor)
            dist_j = node_distance(nodes[j], ancestor)
            row.append(dist_i + dist_j)
        row.append(0)  # diagonal
        rows.append(row)
    return DistanceMatrix(labels, rows)
def run(self):
    """Build a neighbor-joining tree from the required matrix and save it.

    Takes ``(labels, matrix)`` from ``self.requires().as_matrix()`` and
    writes the tree in Newick format to ``self.output()``.
    """
    self.output().makedirs()
    labels, mat = self.requires().as_matrix()

    # DistanceMatrix expects ragged lower-triangular rows: row r keeps
    # columns 0..r.
    lower_triangle = [
        [mat[r][c] for c in range(r + 1)]
        for r in range(len(labels))
    ]
    dm = DistanceMatrix(names=labels, matrix=lower_triangle)
    tree = DistanceTreeConstructor().nj(dm)

    # Write the tree to disk.
    with self.output().open('w') as fh:
        Phylo.write(tree, fh, 'newick')
def process_input_matrix(input_matrix):
    """
    Converts an array-of-arrays containing sample IDs and distances into a
    BioPython DistanceMatrix object.

    The first row of ``input_matrix`` is a header and is skipped; in every
    remaining row the first cell is the sample ID and the other cells are
    the distances. ``input_matrix`` itself is not modified.

    A full matrix such as::

        [[0 1 2]
         [1 0 1]
         [2 1 0]]

    is reduced to the ragged lower triangle DistanceMatrix() expects::

        [[0]
         [1 0]
         [2 1 0]]

    Raises:
        ValueError: if a distance cell cannot be converted to float.
    """
    data_rows = input_matrix[1:]
    sample_names = [row[0] for row in data_rows]

    # Convert every cell, so bad values above the diagonal are still
    # rejected.
    full_rows = [[float(cell) for cell in row[1:]] for row in data_rows]

    # Row i keeps columns 0..i (diagonal included).
    lower_triangle = [row[:i + 1] for i, row in enumerate(full_rows)]

    return DistanceMatrix(names=sample_names, matrix=lower_triangle)
def test_bad_manipulation(self):
    """Invalid keys and values must raise the expected exception types."""
    dm = DistanceMatrix(self.names, self.matrix)

    # getitem
    bad_lookups = [
        (ValueError, 'A'),
        (ValueError, ('Alpha', 'A')),
        (TypeError, (1, 'A')),
        (TypeError, (1, 1.2)),
        (IndexError, 6),
        (IndexError, (10, 10)),
    ]
    for error, key in bad_lookups:
        self.assertRaises(error, dm.__getitem__, key)

    bad_assignments = [
        # setitem: item or index test
        (ValueError, 'A', [1, 3, 4]),
        (ValueError, ('Alpha', 'A'), 4),
        (TypeError, (1, 'A'), 3),
        (TypeError, (1, 1.2), 2),
        (IndexError, 6, [1, 3, 4]),
        (IndexError, (10, 10), 1),
        # setitem: value test
        (ValueError, 0, [1, 2]),
        (TypeError, ('Alpha', 'Beta'), 'a'),
        (TypeError, 'Alpha', ['a', 'b', 'c']),
    ]
    for error, key, value in bad_assignments:
        self.assertRaises(error, dm.__setitem__, key, value)
def estimate_parameters4(kmer_distance_matrices, mu=None):
    """Estimate pairwise branch lengths from two k-mer distance matrices.

    Uses the expected k-mer distance formula which does not use the
    coalescent model to account for variation in divergence time.
    Minimises ``objfn4_T`` with SLSQP over one non-negative variable per
    unordered pair of names, starting from 0.5.

    Args:
        kmer_distance_matrices: Mapping with exactly two keys, each mapped
            to a matrix; the names are taken from the first one.
        mu: Unused.

    Returns:
        DistanceMatrix holding the optimised pairwise values.
    """
    # Dict views cannot be indexed on Python 3; materialise them first.
    k = list(kmer_distance_matrices.keys())
    assert (len(k) == 2)
    names = list(kmer_distance_matrices.values())[0].names
    n = len(names)

    # Integer division: the count is used as a list multiplier and, below,
    # as part of an index; true division would produce floats.
    num_pairs = (n**2 - n) // 2

    # Estimate branch lengths and theta
    bnds = tuple([(0, None)] * num_pairs)
    opt_result = minimize(objfn4_T,
                          tuple([0.5] * num_pairs),
                          args=(kmer_distance_matrices[k[0]],
                                kmer_distance_matrices[k[1]], k[0], k[1]),
                          bounds=bnds,
                          method='SLSQP')

    # Unpack the flat solution vector into lower-triangular rows; the pair
    # (i, j) with j < i lives at offset i * (i - 1) / 2 + j.
    that = [[opt_result.x[i * (i - 1) // 2 + j] for j in range(i)] + [0.0]
            for i in range(n)]
    thatdm = DistanceMatrix(names, that)
    return thatdm
def test_bad_manipulation(self):
    """Reject bad keys and bad values on both item reads and item writes."""
    dm = DistanceMatrix(self.names, self.matrix)

    def rejects_get(error, key):
        with self.assertRaises(error):
            dm[key]

    def rejects_set(error, key, value):
        with self.assertRaises(error):
            dm[key] = value

    # getitem
    rejects_get(ValueError, "A")
    rejects_get(ValueError, ("Alpha", "A"))
    rejects_get(TypeError, (1, "A"))
    rejects_get(TypeError, (1, 1.2))
    rejects_get(IndexError, 6)
    rejects_get(IndexError, (10, 10))

    # setitem: item or index test
    rejects_set(ValueError, "A", [1, 3, 4])
    rejects_set(ValueError, ("Alpha", "A"), 4)
    rejects_set(TypeError, (1, "A"), 3)
    rejects_set(TypeError, (1, 1.2), 2)
    rejects_set(IndexError, 6, [1, 3, 4])
    rejects_set(IndexError, (10, 10), 1)

    # setitem: value test
    rejects_set(ValueError, 0, [1, 2])
    rejects_set(TypeError, ("Alpha", "Beta"), "a")
    rejects_set(TypeError, "Alpha", ["a", "b", "c"])
def get_distance(self, msa):
    """Return the pairwise distance matrix of an alignment.

    Every record is tagged with its position through ``record.index``
    (presumably read by ``self._pairwise``; not visible here), then each
    unordered pair of records is scored with ``self._pairwise``.

    Raises:
        TypeError: if ``msa`` is not a MultipleSeqAlignment.
    """
    if not isinstance(msa, MultipleSeqAlignment):
        raise TypeError("Must provide a MultipleSeqAlignment object.")

    for position, record in enumerate(msa):
        record.index = position

    dm = DistanceMatrix([record.id for record in msa])
    for first, second in itertools.combinations(msa, 2):
        dm[first.id, second.id] = self._pairwise(first, second)
    return dm
def make_score_matrix(records):
    """Fill a matrix with length-normalised BLAST bit scores between records.

    For every record a blastp search is run (query ``query.txt`` against
    database ``db``, XML output in ``./result.txt``). For each hit, the
    bit score of its best-scoring HSP divided by the HSP's number of
    fragments is stored at ``matrix[i, j]``, where ``j`` is the position
    of the hit's id in ``records``. Hits that cannot be scored are skipped.

    Args:
        records: Sequence of records exposing ``id`` and ``seq``.

    Returns:
        DistanceMatrix named after the record ids.
    """
    record_ids = [record.id for record in records]
    matrix = DistanceMatrix(names=record_ids)

    for i, sequence_a in enumerate(tqdm(records)):
        prepare_query(sequence_a.seq)
        blastp_cline = NcbiblastpCommandline(query='query.txt',
                                             db='db',
                                             outfmt=5,
                                             out='./result.txt')
        blastp_cline()
        results = SearchIO.read('./result.txt', format='blast-xml')

        for hit in results:
            best_hsp = max(list(hit), key=lambda hsp: hsp.bitscore)
            score = best_hsp.bitscore
            length = len(list(best_hsp.fragments))
            try:
                j = record_ids.index(best_hsp.hit_id)
                matrix[i, j] = score / length
            except (ValueError, ZeroDivisionError, TypeError):
                # Best effort: skip hits outside the input set or with an
                # unusable score. Narrowed from a bare ``except`` so that
                # interrupts and real bugs are no longer hidden.
                continue
    return matrix
def build_distance_matrix(ids: List[str], distance_file: TextIO) -> DistanceMatrix:
    r"""Build a distance matrix.

    Parameters
    ----------
    ids : List[str]
        List of sequence IDs
    distance_file : TextIO
        File containing distances in f"{id1}\t{id2}\t{dist}\n" format.
        Fields are split on any whitespace; blank lines are ignored.

    Returns
    -------
    DistanceMatrix

    Raises
    ------
    ValueError
        If a non-blank line does not hold exactly three fields or the
        distance is not a number.
    """
    dm = DistanceMatrix(names=ids)
    for line in distance_file:
        fields = line.split()
        if not fields:
            # Tolerate blank lines, e.g. a trailing newline at end of file.
            continue
        id1, id2, distance = fields
        dm[id1, id2] = float(distance)
    return dm
def JukesCantorDistanceMatrix(msa):
    """Return a lower-triangular DistanceMatrix for ``msa``.

    For each pair, positions where neither sequence has a gap (``'-'``)
    and the characters differ are counted; the fraction ``p`` of the row
    sequence's full length is turned into ``-0.75 * log(1 - 4/3 * p)``.
    Each row stops at the record's own id, where a 0 is stored.
    """
    names = [record.id for record in msa]
    matrix = []
    for row in msa:
        distances = []
        for col in msa:
            if col.id == row.id:
                # Reached the diagonal: close the row.
                distances.append(0)
                break
            seq_len = len(row.seq)
            mismatches = sum(
                1 for pos in range(seq_len)
                if row.seq[pos] != '-' and col.seq[pos] != '-'
                and row.seq[pos] != col.seq[pos])
            distances.append(
                -0.75 * np.log(1 - 4. / 3 * (1.0 * mismatches / seq_len)))
        matrix.append(distances)
    return DistanceMatrix(names, matrix)
def distance_matrix(self):
    """Return the distance matrix of the query and all of its subjects.

    The query comes first, followed by every object of
    ``self.blastsubject_set.all()``. Pairs are scored with ``prot_dist``
    when ``self.is_prot()`` is true and with ``nucl_dist`` otherwise; the
    diagonal is 0.0.
    """
    names = [str(self.query_id)]
    seqs = ["".join(self.query_seq)]
    for subject in self.blastsubject_set.all():
        names.append(str(subject.subject_id))
        seqs.append("".join(subject.subject_seq))

    matrix = []
    for i, seq_i in enumerate(seqs):
        row = []
        for seq_j in seqs[:i]:
            measure = prot_dist if self.is_prot() else nucl_dist
            row.append(measure(seq_i, seq_j))
        row.append(0.0)  # diagonal
        matrix.append(row)

    return DistanceMatrix(names=names, matrix=matrix)
def tree_from_distance_matrix(X):
    """Distance matrix to phylo tree.

    Pairwise distances between the rows of ``X`` are computed with
    ``Bio.Cluster.distancematrix`` and turned into a neighbor-joining tree.

    Args:
        X: A pandas DataFrame (row labels come from its index) or a plain
            2-D array (rows are then labelled "0", "1", ...).

    Returns:
        The neighbor-joining tree.
    """
    from Bio.Phylo.TreeConstruction import DistanceMatrix, DistanceTreeConstructor
    from Bio.Cluster import distancematrix

    # Read the index only when there is one; plain arrays get generated
    # labels.
    if isinstance(X, pd.DataFrame):
        names = list(X.index)
        X = X.values
    else:
        names = [str(i) for i in range(len(X))]

    mat = distancematrix(X)

    # Row i of ``mat`` holds the distances to items 0..i-1 and no diagonal.
    # DistanceMatrix wants the diagonal as the LAST cell of each row, so
    # the zero is appended; prepending it shifts every distance one
    # column to the right.
    lower_triangle = [np.asarray(row).tolist() + [0.0] for row in mat]

    dm = DistanceMatrix(names, lower_triangle)
    constructor = DistanceTreeConstructor()
    tree = constructor.nj(dm)
    return tree
def ArgMinSumOfDistancesFromCoalescentJCExpectedKmerPairDistanceParameterizationMap(
        kmer_distance_matrices, mu=None):
    """Estimate pairwise branch lengths and theta from two k-mer matrices.

    Minimises
    ``SumOfDistancesFromCoalescentJCExpectedKmerPairDistanceParameterizationMap``
    with SLSQP over one non-negative variable per unordered pair of names
    (start 0.5) plus one trailing non-negative variable (start 1).

    Args:
        kmer_distance_matrices: Mapping with exactly two keys, each mapped
            to a matrix; the names are taken from the first one.
        mu: Unused.

    Returns:
        Tuple ``(thatdm, thetahat)``: the DistanceMatrix of optimised
        pairwise values and the last component of the solution vector.
    """
    # Dict views cannot be indexed on Python 3; materialise them first.
    k = list(kmer_distance_matrices.keys())
    assert (len(k) == 2)
    names = list(kmer_distance_matrices.values())[0].names
    n = len(names)

    # Integer division: the count is used as a list multiplier and, below,
    # as part of an index; true division would produce floats.
    num_pairs = n * (n - 1) // 2

    # Estimate branch lengths and theta
    bnds = tuple([(0, None)] * num_pairs + [(0, None)])
    opt_result = minimize(
        SumOfDistancesFromCoalescentJCExpectedKmerPairDistanceParameterizationMap,
        tuple([0.5] * num_pairs + [1]),
        args=(kmer_distance_matrices[k[0]], kmer_distance_matrices[k[1]],
              k[0], k[1]),
        bounds=bnds,
        method='SLSQP')
    thetahat = opt_result.x[-1]

    # Unpack the pairwise part of the solution into lower-triangular rows;
    # the pair (i, j) with j < i lives at offset i * (i - 1) / 2 + j.
    that = [[opt_result.x[i * (i - 1) // 2 + j] for j in range(i)] + [0.0]
            for i in range(n)]
    thatdm = DistanceMatrix(names, that)
    return (thatdm, thetahat)