def get_dn_ds_tree(self, dn_ds_method="NG86", tree_method="UPGMA", codon_table=default_codon_table):
    """Construct a dn tree and a ds tree.

    Arguments:
     - dn_ds_method - Available methods include NG86, LWL85, YN00 and ML.
     - tree_method  - Available methods include UPGMA and NJ.
     - codon_table  - Codon table forwarded to ``get_dn_ds_matrix``.

    Returns a ``(dn_tree, ds_tree)`` tuple.

    Raises RuntimeError if ``tree_method`` is neither "UPGMA" nor "NJ".
    """
    # Validate before the dn/ds matrices are computed so that a bad
    # tree_method is reported without doing that work first.
    if tree_method not in ("UPGMA", "NJ"):
        raise RuntimeError("Unknown tree method ({0}). Only NJ and UPGMA "
                           "are accepted.".format(tree_method))

    from Bio.Phylo.TreeConstruction import DistanceTreeConstructor

    dn_dm, ds_dm = self.get_dn_ds_matrix(method=dn_ds_method, codon_table=codon_table)
    dn_constructor = DistanceTreeConstructor()
    ds_constructor = DistanceTreeConstructor()
    if tree_method == "UPGMA":
        dn_tree = dn_constructor.upgma(dn_dm)
        ds_tree = ds_constructor.upgma(ds_dm)
    else:
        dn_tree = dn_constructor.nj(dn_dm)
        ds_tree = ds_constructor.nj(ds_dm)
    return dn_tree, ds_tree
def get_dn_ds_tree(self, dn_ds_method="NG86", tree_method="UPGMA", codon_table=None):
    """Construct a dn tree and a ds tree.

    Arguments:
     - dn_ds_method - Available methods include NG86, LWL85, YN00 and ML.
     - tree_method  - Available methods include UPGMA and NJ.
     - codon_table  - Codon table forwarded to ``get_dn_ds_matrix``;
       ``CodonTable.generic_by_id[1]`` is used when None.

    Returns a ``(dn_tree, ds_tree)`` tuple.

    Raises RuntimeError if ``tree_method`` is neither "UPGMA" nor "NJ".
    """
    # Validate before the dn/ds matrices are computed so that a bad
    # tree_method is reported without doing that work first.
    if tree_method not in ("UPGMA", "NJ"):
        raise RuntimeError(f"Unknown tree method ({tree_method})."
                           " Only NJ and UPGMA are accepted.")

    from Bio.Phylo.TreeConstruction import DistanceTreeConstructor

    if codon_table is None:
        codon_table = CodonTable.generic_by_id[1]
    dn_dm, ds_dm = self.get_dn_ds_matrix(method=dn_ds_method, codon_table=codon_table)
    dn_constructor = DistanceTreeConstructor()
    ds_constructor = DistanceTreeConstructor()
    if tree_method == "UPGMA":
        dn_tree = dn_constructor.upgma(dn_dm)
        ds_tree = ds_constructor.upgma(ds_dm)
    else:
        dn_tree = dn_constructor.nj(dn_dm)
        ds_tree = ds_constructor.nj(ds_dm)
    return dn_tree, ds_tree
def dna(file_path, file_format, algorithm):
    """Read an alignment, print its distance matrix and draw a tree.

    Arguments:
     - file_path   - Path passed to ``AlignIO.read``.
     - file_format - Alignment format name passed to ``AlignIO.read``.
     - algorithm   - 'upgma' or 'nj' (case-insensitive).

    For any other algorithm, 'Invalid algorithm!' is echoed and the
    function returns without drawing.
    """
    # Read the alignment and print it
    aln = AlignIO.read(file_path, file_format)
    print(aln)

    # Calculate the identity-based distance matrix
    calculator = DistanceCalculator('identity')
    dm = calculator.get_distance(aln)

    # Print the matrix itself (the original printed the calculator object)
    print('\nDistance Matrix\n===================')
    print(dm)

    # Construct the phylogenetic tree using the chosen algorithm
    constructor = DistanceTreeConstructor()
    method = algorithm.lower()
    if method == 'upgma':
        tree = constructor.upgma(dm)
    elif method == 'nj':
        tree = constructor.nj(dm)
    else:
        click.echo('Invalid algorithm!')
        # No tree was built; stop instead of failing on an unbound name.
        return

    # Draw the phylogenetic tree
    Phylo.draw(tree)

    # Print the phylogenetic tree in the terminal
    print('\nPhylogenetic Tree\n===================')
    Phylo.draw_ascii(tree)
def build_trees(filename, tree_name):
    """Align ``<filename>.fa`` with ClustalW and plot UPGMA and NJ trees.

    The alignment is read back from ``<filename>.aln``; distances use the
    'blosum62' model. Each plot is titled with ``tree_name`` and the method.
    """
    # Run ClustalW, then load the alignment it is expected to have written
    run_clustalw = ClustalwCommandline("clustalw", infile="{}.fa".format(filename))
    run_clustalw()
    msa = AlignIO.read("{}.aln".format(filename), format="clustal")

    # Distance matrix shared by both tree builders
    distances = DistanceCalculator('blosum62').get_distance(msa)

    builder = DistanceTreeConstructor()
    titled_trees = (
        ("{} × upgma", builder.upgma(distances)),
        ("{} × nj", builder.nj(distances)),
    )

    def label_func(clade):
        # Suppress labels for clades whose name starts with "Inner"
        if clade.name.startswith("Inner"):
            return ""
        return clade

    for title_template, tree in titled_trees:
        Phylo.draw(tree, label_func=label_func, do_show=False)
        plt.title(title_template.format(tree_name))
        plt.show()
def make_newick_tree(dm, outgroup="KE136308.1"):
    """Build UPGMA and NJ trees from ``dm`` and root both on ``outgroup``.

    Arguments:
     - dm       - Distance matrix accepted by ``DistanceTreeConstructor``.
     - outgroup - Name of the terminal to root on; defaults to the
       previously hard-coded "KE136308.1".

    Returns a ``(upgma_tree, nj_tree)`` tuple.
    """
    constructor = DistanceTreeConstructor()
    upgmatree = constructor.upgma(dm)
    njtree = constructor.nj(dm)
    for tree in (upgmatree, njtree):
        tree.root_with_outgroup({'name': outgroup})
    return upgmatree, njtree
def buildTree(FASTAFile):
    """Build a midpoint-rooted NJ tree from a FASTA alignment.

    The tree is drawn, serialised to plain newick, parsed into nested
    Python literals and passed through ``NewicktoRLR``, whose result is
    returned.
    """
    myAlignment = AlignIO.read(FASTAFile, "fasta")

    # Compute an identity distance matrix and construct the tree.
    # The tree is built with neighbor joining (the original local was
    # misleadingly named upgmaTree).
    calculator = DistanceCalculator("identity")
    myMatrix = calculator.get_distance(myAlignment)
    constructor = DistanceTreeConstructor()
    njTree = constructor.nj(myMatrix)
    njTree.root_at_midpoint()
    Phylo.draw(njTree)

    # Quote terminal names so the newick text can be evaluated as a
    # Python literal below.
    for clade in njTree.get_terminals():
        clade.name = "\"" + clade.name + "\""

    # Write newick to an in-memory buffer, then strip what literal_eval
    # cannot handle: inner-node labels and the trailing ';'.
    buf = cStringIO.StringIO()
    Phylo.write(njTree, buf, 'newick', plain = True)
    tree = buf.getvalue()
    tree = re.sub(r'Inner\d*', '', tree)
    tree = tree.replace(";", "")
    tree = literal_eval(tree)  # newick format

    # RLR tree required for maxParsimony function
    tree = NewicktoRLR(tree)
    return tree
def construct_tree(matrix, nj=True):
    """Build a tree from a distance matrix.

    Can either use neighbor-joining (nj) or UPGMA.

    Arguments:
     - matrix - Non-empty list of rows, passed to ``_DistanceMatrix``.
       Taxa are named "0", "1", ... by row index.
     - nj     - Use neighbor joining when true, UPGMA otherwise.

    Returns the tree, or None (after printing a message) when ``matrix``
    is not a non-empty list.
    """
    # isinstance is checked first so that objects with ambiguous truth
    # values are rejected cleanly instead of raising.
    if not isinstance(matrix, list) or not matrix:
        print("matrix has invalid value")
        return None

    dm = _DistanceMatrix(names=[str(i) for i in range(len(matrix))], matrix=matrix)
    constructor = DistanceTreeConstructor()
    tree = constructor.nj(dm) if nj else constructor.upgma(dm)

    # this will remove the names from the inner nodes
    # this is critical for seq-gen to read in the tree
    for clade in tree.get_nonterminals():
        clade.name = ''
    return tree
def main():
    """Time neighbor joining and UPGMA on the alignment in data/coding.fa.

    Each tree is drawn with ``get_label`` as the label function.
    """
    file_name = "data/coding.fa"
    # file_name = "data/cons_noncode.fa"

    # Assemble the alignment one FASTA record at a time
    alignment = MultipleSeqAlignment([], Gapped(IUPAC.unambiguous_dna, "-"))
    for record in SeqIO.parse(file_name, "fasta"):
        alignment.extend([record])
    print("Number of characters in alignment:", len(alignment[0]))

    dm = DistanceCalculator('identity').get_distance(alignment)
    constructor = DistanceTreeConstructor()

    # --- Neighbor joining ---
    started = time.time()
    nj_tree = constructor.nj(dm)
    finished = time.time()
    print("Neighbor joining ran in {} seconds.".format(finished - started))
    Phylo.draw(nj_tree, label_func=get_label)

    # --- UPGMA ---
    started = time.time()
    upgma_tree = constructor.upgma(dm)
    finished = time.time()
    print("UPGMA ran in {} seconds.".format(finished - started))
    Phylo.draw(upgma_tree, label_func=get_label)
def distance_matrix(cls, cluster_list):
    """Return a ladderized NJ tree (newick string) for ``cluster_list``.

    Pairwise distances are looked up from ``Distance`` rows whose
    ``rep_accnum1``/``rep_accnum2`` are both in ``cluster_list``; either
    orientation of a pair is accepted.

    Raises ValueError if a pair has no stored distance.
    """
    print(cluster_list)
    dists = Distance.objects.filter(rep_accnum1__in=cluster_list, rep_accnum2__in=cluster_list)
    distance_pairs = {g.rep_accnum1 + '_' + g.rep_accnum2: g.distance for g in dists.all()}

    # Build the lower-triangular matrix (diagonal included) row by row
    matrix = []
    for i in range(0, len(cluster_list)):
        matrix_iteration = []
        for j in range(0, i + 1):
            forward_key = cluster_list[i] + '_' + cluster_list[j]
            reverse_key = cluster_list[j] + '_' + cluster_list[i]
            if i == j:
                matrix_iteration.append(0)
            elif forward_key in distance_pairs:
                matrix_iteration.append(distance_pairs[forward_key])
            elif reverse_key in distance_pairs:
                matrix_iteration.append(distance_pairs[reverse_key])
            else:
                # The original raised a bare string, which itself fails
                # with TypeError; raise a real exception instead.
                raise ValueError("Error, can't find pair!")
        matrix.append(matrix_iteration)

    cluster_list = [s.encode('ascii', 'ignore') for s in cluster_list]
    matrix_obj = _DistanceMatrix(names=cluster_list, matrix=matrix)
    constructor = DistanceTreeConstructor()
    tree = constructor.nj(matrix_obj)
    tree.ladderize()

    output = StringIO.StringIO()
    Phylo.write(tree, output, 'newick')
    tree_str = output.getvalue()
    return tree_str
class DistanceTreeConstructorTest(unittest.TestCase):
    """Test DistanceTreeConstructor"""

    def setUp(self):
        # Close the alignment file deterministically once it is parsed.
        with open('TreeConstruction/msa.phy') as handle:
            self.aln = AlignIO.read(handle, 'phylip')
        calculator = DistanceCalculator('blosum62')
        self.dm = calculator.get_distance(self.aln)
        self.constructor = DistanceTreeConstructor(calculator)

    def _assert_matches_reference(self, tree, ref_path):
        """Check ``tree`` is a Tree whose newick text equals line 1 of ``ref_path``."""
        self.assertTrue(isinstance(tree, BaseTree.Tree))
        tree_file = StringIO.StringIO()
        Phylo.write(tree, tree_file, 'newick')
        # ``with`` closes the reference file even when the assertion fails.
        with open(ref_path) as ref_tree:
            self.assertEqual(tree_file.getvalue(), ref_tree.readline())

    def test_upgma(self):
        tree = self.constructor.upgma(self.dm)
        self._assert_matches_reference(tree, './TreeConstruction/upgma.tre')

    def test_nj(self):
        tree = self.constructor.nj(self.dm)
        self._assert_matches_reference(tree, './TreeConstruction/nj.tre')

    def test_built_tree(self):
        tree = self.constructor.build_tree(self.aln)
        self._assert_matches_reference(tree, './TreeConstruction/nj.tre')
def summarise_dist(self, rf_results: RfResults, dir_out):
    """Write an NJ tree and a clustered heatmap of Robinson-Foulds distances.

    Runs twice, once with the normalised and once with the raw value of
    each pair in ``rf_results.data``. Each pass writes a newick tree and
    an SVG heatmap into ``dir_out``.
    """
    for use_norm in (True, False):
        if use_norm:
            path_out = os.path.join(dir_out, 'rf_normed.tree')
            path_hm = os.path.join(dir_out, 'rf_normed_heatmap.svg')
            plt_title = 'Normalised Robinson-Foulds Distance'
        else:
            path_out = os.path.join(dir_out, 'rf_un_normed.tree')
            path_hm = os.path.join(dir_out, 'rf_un_normed_heatmap.svg')
            plt_title = '(un)Normalised Robinson-Foulds Distance'

        # Symmetric lookup table of the selected metric
        metrics = defaultdict(dict)
        names = set()
        for (tid_a, tid_b), (rf, norm_rf) in rf_results.data.items():
            chosen = norm_rf if use_norm else rf
            metrics[tid_a][tid_b] = chosen
            metrics[tid_b][tid_a] = chosen
            names.update((tid_a, tid_b))

        labels = sorted(names)
        size = len(labels)

        # mat_vals: lower-triangular rows for the tree builder.
        # mat: dense array, filled below the diagonal then mirrored.
        mat_vals = []
        mat = np.zeros((size, size))
        for i, tid_a in enumerate(labels):
            cur_row = []
            for j, tid_b in enumerate(labels[:i + 1]):
                if tid_a == tid_b:
                    cur_row.append(0.0)
                    continue
                cur_row.append(metrics[tid_a][tid_b])
                mat[i, j] = metrics[tid_a][tid_b]
            mat_vals.append(cur_row)
        mat = mat + mat.T

        # Newick
        dm = DistanceMatrix(names=labels, matrix=mat_vals)
        tree = DistanceTreeConstructor().nj(dm)
        Phylo.write(tree, path_out, 'newick')

        # Heatmap
        cmap = sns.cubehelix_palette(100, reverse=True)
        sns.set(font_scale=1)
        fig_size = (15, 15)
        rf_df = pd.DataFrame(mat, columns=labels, index=labels)
        grid = sns.clustermap(rf_df, annot=True, fmt='.3f', cmap=cmap, figsize=fig_size)
        grid.fig.suptitle(plt_title)
        plt.savefig(path_hm)
def tree_reconstruction(phy_file, method, model, phyformat):
    '''Construct tree with given method and model.

    Arguments:
     - phy_file  - PHYLIP alignment path.
     - method    - 'upgma' or 'nj'.
     - model     - Distance model name for ``DistanceCalculator``.
     - phyformat - Suffix appended to 'phylip-' to form the format name.

    Writes ``tree.nwk`` and ``tree.svg`` under the module-level
    ``args.output`` directory.

    Raises ValueError for an unsupported ``method``.
    '''
    # Reject unknown methods up front; the original fell through and
    # failed later on an unbound ``tree``.
    if method not in ('upgma', 'nj'):
        raise ValueError(
            "Unknown tree method ({0}). Only upgma and nj are accepted.".format(method))

    aln = AlignIO.read(phy_file, 'phylip-' + phyformat)
    constructor = DistanceTreeConstructor()
    calculator = DistanceCalculator(model)
    dm = calculator.get_distance(aln)
    if method == 'upgma':
        tree = constructor.upgma(dm)
    else:
        tree = constructor.nj(dm)
    tree.ladderize()

    # Blank out generated inner-node labels; skip clades without a name.
    for c in tree.find_clades():
        if c.name and 'Inner' in c.name:
            c.name = ''
    Phylo.write(tree, args.output + '/tree.nwk', 'newick')

    plt.rcParams['font.style'] = 'italic'
    plt.rc('font', size=8)
    plt.rc('axes', titlesize=14)
    plt.rc('xtick', labelsize=10)
    plt.rc('ytick', labelsize=10)
    plt.rc('figure', titlesize=18)
    # do_show=False keeps the figure open so it can be saved below.
    draw(tree, do_show=False)
    plt.savefig(args.output + "/tree.svg", format='svg', dpi=1200)
def get_tree(aln, kind='nj'):
    """Return ``(distance_matrix, tree)`` for an alignment.

    Arguments:
     - aln  - Alignment passed to ``DistanceCalculator('identity')``.
     - kind - 'nj' (default) or 'upgma', case-insensitive. The original
       accepted this argument but always built an NJ tree.

    Raises ValueError for any other ``kind``.
    """
    method = str(kind).lower()
    if method not in ('nj', 'upgma'):
        raise ValueError(
            "Unknown tree kind ({0}). Only nj and upgma are accepted.".format(kind))

    from Bio.Phylo.TreeConstruction import DistanceCalculator, DistanceTreeConstructor

    calculator = DistanceCalculator('identity')
    dm = calculator.get_distance(aln)
    constructor = DistanceTreeConstructor()
    tree = constructor.nj(dm) if method == 'nj' else constructor.upgma(dm)
    return dm, tree
class DistanceTreeConstructorTest(unittest.TestCase):
    """Test DistanceTreeConstructor"""

    def setUp(self):
        self.aln = AlignIO.read('TreeConstruction/msa.phy', 'phylip')
        calculator = DistanceCalculator('blosum62')
        self.dm = calculator.get_distance(self.aln)
        self.constructor = DistanceTreeConstructor(calculator)

    def _assert_same_topology(self, tree, ref_path):
        """Check ``tree`` is a Tree with the topology of the newick file ``ref_path``."""
        # assertIsInstance reports the actual type on failure.
        self.assertIsInstance(tree, BaseTree.Tree)
        ref_tree = Phylo.read(ref_path, 'newick')
        self.assertTrue(Consensus._equal_topology(tree, ref_tree))

    def test_upgma(self):
        tree = self.constructor.upgma(self.dm)
        self._assert_same_topology(tree, './TreeConstruction/upgma.tre')

    def test_nj(self):
        tree = self.constructor.nj(self.dm)
        self._assert_same_topology(tree, './TreeConstruction/nj.tre')

    def test_built_tree(self):
        tree = self.constructor.build_tree(self.aln)
        self._assert_same_topology(tree, './TreeConstruction/nj.tre')
def nj_tree_constructor(x):
    """Build an NJ tree from alignment ``x``, print it and draw it as ASCII."""
    tree_builder = DistanceTreeConstructor()
    # Identity-based distances between the aligned sequences
    distances = DistanceCalculator("identity").get_distance(x)
    tree = tree_builder.nj(distances)
    print(tree)
    Phylo.draw_ascii(tree)
def get_tree(self, chrom, start=1, end=None, samples=None, return_format="tree_obj"):
    """Build a neighbour-joining tree for a region.

    Arguments:
     - chrom, start, end, samples - Forwarded to ``self.get_matrix``.
     - return_format - "tree_obj" to return the tree object, "newick" to
       return its stripped newick string.

    Raises ValueError for any other ``return_format``. The original
    built the whole tree and then silently returned None in that case.
    """
    if return_format not in ("tree_obj", "newick"):
        raise ValueError(
            "Unknown return_format ({0}). Only tree_obj and newick are "
            "accepted.".format(return_format))

    print("chrom: {} start: {} end: {} samples: {}".format(
        chrom, start, end, samples))
    names, matrix = self.get_matrix(chrom, start=start, end=end,
                                    samples=samples, return_format="Phylo")
    distance_matrix = _DistanceMatrix(names, matrix)
    constructor = DistanceTreeConstructor()
    tree = constructor.nj(distance_matrix)  # neighbour joining tree

    if return_format == "tree_obj":
        return tree

    # return_format == "newick"
    treeIO = StringIO()
    Phylo.write(tree, treeIO, "newick")
    return treeIO.getvalue().strip()
def construct_tree(gene_name, with_marburg=1, algorithm='UPGMA'):
    """Build, save and return a tree from a gene's edit-distance matrix.

    The matrix is read from ``./Output/edit_matrices/<gene_name>.csv``.
    ``algorithm == 'NJ'`` selects neighbor joining; anything else uses
    UPGMA (the default).

    NOTE(review): ``with_marburg == 1`` takes the branch that prints
    "without Marburg" and omits Marburg from the names; any other value
    takes the Marburg branch. That looks inverted relative to the
    parameter name - confirm against callers before changing it.
    """
    names = ['Bundibugyo', 'Reston', 'Sudan', 'TaiForest', 'Zaire']
    if with_marburg == 1:
        print('Constructing Tree with All Viruses without Marburg')
        filename = algorithm + '_' + gene_name
    else:
        print("Constructing {0}'s Tree with All Viruses with Marburg".format(gene_name))
        filename = algorithm + '_' + gene_name + '_with_Marburg'
        names = names + ['Marburg']
        marburg_genome = SeqIO.read("./Data/Marburg_genome.fasta", "fasta")
        Alignment.read_data()
        print('Aligning Genes for marburg_genome')
        # The CSV for this variant uses a lower-case suffix
        gene_name += '_with_marburg'
        Alignment.read_genes(marburg_genome)

    print('Reading edit matrix and construct tree')
    csv_path = "./Output/edit_matrices/" + gene_name + ".csv"
    edit_matrix = pd.read_csv(csv_path, header=None)
    constructor = DistanceTreeConstructor()
    # DistanceMatrix is given the matrix in lower-triangular form
    lower_triangular = convert_tu_lower_triangular(edit_matrix)
    distance_matrix = DistanceMatrix(names=names, matrix=lower_triangular)

    # Neighbor-Joining when requested, UPGMA otherwise
    build = constructor.nj if algorithm == 'NJ' else constructor.upgma
    tree = build(distance_matrix)

    save_tree(tree, filename)
    return tree
class DistanceTreeConstructorTest(unittest.TestCase):
    """Test DistanceTreeConstructor."""

    def setUp(self):
        self.aln = AlignIO.read("TreeConstruction/msa.phy", "phylip")
        blosum_calc = DistanceCalculator("blosum62")
        self.dm = blosum_calc.get_distance(self.aln)
        self.constructor = DistanceTreeConstructor(blosum_calc)

    def test_upgma(self):
        built = self.constructor.upgma(self.dm)
        self.assertIsInstance(built, BaseTree.Tree)
        expected = Phylo.read("./TreeConstruction/upgma.tre", "newick")
        self.assertTrue(Consensus._equal_topology(built, expected))

    def test_nj(self):
        built = self.constructor.nj(self.dm)
        self.assertIsInstance(built, BaseTree.Tree)
        expected = Phylo.read("./TreeConstruction/nj.tre", "newick")
        self.assertTrue(Consensus._equal_topology(built, expected))

        # create a matrix of length 2 by dropping trailing entries
        self.min_dm = DistanceCalculator("blosum62").get_distance(self.aln)
        while len(self.min_dm) > 2:
            del self.min_dm[len(self.min_dm) - 1]
        min_tree = self.constructor.nj(self.min_dm)
        self.assertIsInstance(min_tree, BaseTree.Tree)
        expected_min = Phylo.read("./TreeConstruction/nj_min.tre", "newick")
        self.assertTrue(Consensus._equal_topology(min_tree, expected_min))

    def test_built_tree(self):
        built = self.constructor.build_tree(self.aln)
        self.assertIsInstance(built, BaseTree.Tree)
        expected = Phylo.read("./TreeConstruction/nj.tre", "newick")
        self.assertTrue(Consensus._equal_topology(built, expected))
def get_tree():
    """Return the NJ tree built from the clustal alignment 'agc.aln'."""
    alignment = AlignIO.read('agc.aln', 'clustal')
    # Identity distances feed the neighbor-joining builder
    distances = DistanceCalculator('identity').get_distance(alignment)
    return DistanceTreeConstructor().nj(distances)
def build_nj_tree(self):
    """Return the newick text of the NJ tree built from ``self.distance_matrix()``."""
    matrix = self.distance_matrix()
    nj_tree = DistanceTreeConstructor().nj(matrix)

    # Serialise through an in-memory buffer and release it afterwards
    buffer = StringIO.StringIO()
    Phylo.write(nj_tree, buffer, 'newick')
    newick_text = buffer.getvalue()
    buffer.close()
    return newick_text
def build_tree(aln, kind='nj'):
    """Build a tree with bio.phylo module.

    Arguments:
     - aln  - Alignment passed to ``DistanceCalculator('identity')``.
     - kind - 'nj' (default) or 'upgma', case-insensitive. The original
       accepted this argument but always built an NJ tree.

    Returns ``(distance_matrix, tree)``.

    Raises ValueError for any other ``kind``.
    """
    method = str(kind).lower()
    if method not in ('nj', 'upgma'):
        raise ValueError(
            "Unknown tree kind ({0}). Only nj and upgma are accepted.".format(kind))

    from Bio.Phylo.TreeConstruction import DistanceCalculator, DistanceTreeConstructor

    calculator = DistanceCalculator('identity')
    dm = calculator.get_distance(aln)
    constructor = DistanceTreeConstructor()
    tree = constructor.nj(dm) if method == 'nj' else constructor.upgma(dm)
    return dm, tree
def print_trees(country, position_table):
    """Draw NJ and UPGMA trees for the ten rows farthest from the consensus.

    Arguments:
     - country        - Label printed (upper-cased) above the trees.
     - position_table - DataFrame with a 'seqid' column; the remaining
       columns are compared row against row.

    Returns None.
    """
    ### Pull out the consensus sequence (per-column mode)
    concensus_seq = position_table.drop('seqid', axis=1).mode(axis=0).T[0]
    position_table = position_table.set_index('seqid')

    ### Determine which samples are farthest from the consensus sequence
    distance_from_concensus_seq = position_table.apply(
        lambda row: sum(row != concensus_seq), axis=1)
    distance_from_concensus_seq_sorted = distance_from_concensus_seq.sort_values(
        ascending=False)

    ### Select 10 sequences to do our first analysis
    subset_seqs = distance_from_concensus_seq_sorted[:10].index

    ### Construct a distance matrix for our sequences
    distances = {}
    for i, seqid1 in enumerate(subset_seqs):
        distances[seqid1, seqid1] = 0
        for j in range(i + 1, len(subset_seqs)):
            seqid2 = subset_seqs[j]
            distances[seqid1, seqid2] = sum(
                position_table.loc[seqid1] != position_table.loc[seqid2])
            distances[seqid2, seqid1] = distances[seqid1, seqid2]
    distances = pd.Series(distances).unstack()

    # Keep only the lower triangle (diagonal included), one row per taxon
    matrix = np.tril(distances.values).tolist()
    matrix = [row[:i + 1] for i, row in enumerate(matrix)]
    dm = DistanceMatrix(list(distances.index), matrix)

    ### Now construct our tree
    constructor = DistanceTreeConstructor()
    tree = constructor.nj(dm)
    print(country.upper())
    print("Neighbor Joining Tree")
    tree.ladderize()  # Flip branches so deeper clades are displayed at top
    # Call draw directly rather than display() its return value, which
    # is expected to be None - see the Bio.Phylo.draw documentation.
    Phylo.draw(tree)

    if len(dm) > 1:
        # UPGMA: Unweighted Pair Group Method with Arithmetic Mean
        tree2 = constructor.upgma(dm)
        print("UPGMA Tree")
        tree2.ladderize()  # Flip branches so deeper clades are displayed at top
        Phylo.draw(tree2)
    return
def build_tree_NJ(msa, distanceMatrix=None):
    """Return the NJ tree as a newick string prefixed with "[&R] ".

    When ``distanceMatrix`` is falsy, identity distances are computed
    from ``msa``.
    """
    if not distanceMatrix:
        distanceMatrix = DistanceCalculator("identity").get_distance(msa)

    # Construct the tree with the distance matrix
    nj_tree = DistanceTreeConstructor().nj(distanceMatrix)

    # Midpoint rooting is intentionally left disabled:
    # nj_tree.root_at_midpoint()
    newick_text = nj_tree.format("newick").strip()
    return "[&R] " + newick_text
def generar_arbol(file, indice):
    """Build an NJ tree from a clustal alignment and save it as a PNG.

    Arguments:
     - file   - Path of the clustal alignment file.
     - indice - String appended to the output file name.

    The image is written to
    ``./static/assets/arbol_filogenetico<indice>.png``.
    """
    with open(file, "r") as aln:
        alineamiento = AlignIO.read(aln, "clustal")
    calculator = DistanceCalculator('identity')
    dm = calculator.get_distance(alineamiento)
    constructor = DistanceTreeConstructor(calculator)
    nj = constructor.nj(dm)  # Neighbor Joining

    # do_show=False: draw without showing, so the figure is still the
    # current one when savefig runs (showing first can save a blank image).
    Phylo.draw(nj, do_show=False)
    path = './static/assets/arbol_filogenetico' + indice + '.png'
    pylab.savefig(path, format='png')
def get_dn_ds_tree(self, dn_ds_method="NG86", tree_method="UPGMA"):
    """Construct a dn tree and a ds tree.

    Arguments:
     - dn_ds_method - Available methods include NG86, LWL85, YN00 and ML.
     - tree_method  - Available methods include UPGMA and NJ.

    Returns a ``(dn_tree, ds_tree)`` tuple.

    Raises RuntimeError if ``tree_method`` is neither "UPGMA" nor "NJ".
    """
    # Validate before the dn/ds matrices are computed so that a bad
    # tree_method is reported without doing that work first.
    if tree_method not in ("UPGMA", "NJ"):
        raise RuntimeError("Unknown tree method ({0}). Only NJ and UPGMA "
                           "are accepted.".format(tree_method))

    from Bio.Phylo.TreeConstruction import DistanceTreeConstructor

    dn_dm, ds_dm = self.get_dn_ds_matrix(method=dn_ds_method)
    dn_constructor = DistanceTreeConstructor()
    ds_constructor = DistanceTreeConstructor()
    if tree_method == "UPGMA":
        dn_tree = dn_constructor.upgma(dn_dm)
        ds_tree = ds_constructor.upgma(ds_dm)
    else:
        dn_tree = dn_constructor.nj(dn_dm)
        ds_tree = ds_constructor.nj(ds_dm)
    return dn_tree, ds_tree
def createNJPhyloTree(align, distanceModel="identity", alignName="anonymous"):
    """Return an NJ tree for ``align`` using ``distanceModel`` distances.

    ``alignName`` is only used in the progress messages.
    """
    matrix_message = (
        "[INFO] Calculating distance matrix for {} alignment and {} distance model"
        .format(alignName, distanceModel))
    print(matrix_message)
    distances = DistanceCalculator(distanceModel).get_distance(align)

    tree_message = "[INFO] Constructing NJ phylogenetic tree for {} alignment".format(
        alignName)
    print(tree_message)
    return DistanceTreeConstructor().nj(distances)
def construct_tree(X_2d, acc, title):
    """Write an NJ tree of the rows of ``X_2d`` to ``<title>.nwk``.

    ``acc`` supplies the taxon names; distances come from
    ``pairwise_distances`` with NaNs replaced by 0.
    """
    taxa = list(acc)
    dense = pairwise_distances(X_2d).astype('float')
    dense[np.isnan(dense)] = 0

    # Lower-triangular rows (diagonal included), as _DistanceMatrix expects
    lower_rows = [list(dense[row, :row + 1]) for row in range(dense.shape[0])]

    dm = _DistanceMatrix(taxa, matrix=lower_rows)
    nj_tree = DistanceTreeConstructor().nj(dm)
    Phylo.write(nj_tree, title + ".nwk", 'newick')
def GenerarArbol():
    """Build and draw a phylogenetic tree from the alignment 'protsec.aln'.

    NOTE(review): the original comments and variable name said UPGMA,
    but the tree has always been built with neighbor joining (``.nj``).
    The names below follow the code; confirm which algorithm is intended.
    """
    alineamientos = AlignIO.read("protsec.aln","clustal")

    # Compute the identity distance matrix.
    calculo_matriz = DistanceCalculator('identity')
    matriz_distancia = calculo_matriz.get_distance(alineamientos)
    # Function-call form works on both Python 2 and Python 3.
    print(matriz_distancia)

    # Build the tree (neighbor joining).
    creador_arbol = DistanceTreeConstructor()
    arbol_nj = creador_arbol.nj(matriz_distancia)
    Phylo.draw_ascii(arbol_nj)
    Phylo.draw(arbol_nj)
def create_NJ_tree(alignment):
    """Build an NJ tree from ``alignment``, draw it as ASCII and return it."""
    # Import Phylo library for tree constructor and draw methods
    from Bio import Phylo
    from Bio.Phylo.TreeConstruction import DistanceCalculator, DistanceTreeConstructor

    constructor = DistanceTreeConstructor()
    calculator = DistanceCalculator('identity')

    # Calculate the distances between the sequences in alignment
    dists = calculator.get_distance(alignment)

    # Create the phylo tree
    tree = constructor.nj(dists)

    # draw_ascii writes the drawing itself; wrapping it in print() only
    # added a trailing "None" line.
    Phylo.draw_ascii(tree)

    return tree
def nj_tree(fichero_clw):
    """Print the distance matrix of an alignment file and draw its NJ tree.

    NOTE(review): the parameter name and the original comment refer to
    clustal format, but the file is read as "fasta" - confirm which is
    intended.
    """
    with open(fichero_clw, "r") as handle:
        # Use AlignIO to read the alignment file
        msa = AlignIO.read(handle, "fasta")

    # Compute the identity distance matrix and show it
    identity_calc = DistanceCalculator('identity')
    distances = identity_calc.get_distance(msa)
    print(distances)

    # Initialise the tree constructor with our distance calculator
    builder = DistanceTreeConstructor(identity_calc)

    # Build the tree by neighbor joining
    tree = builder.nj(distances)
    Phylo.draw(tree)
def build_guide_trees(distance_matrix): # build distance matrix biopython object matrix = [distance_matrix[i, :i + 1].tolist() for i in range(len(distance_matrix))] names = ['S' + str(i) for i in range(len(distance_matrix))] dm = _DistanceMatrix(names, matrix) print('Constructed matrix') constructor = DistanceTreeConstructor() # construct neighbour joining tree t = time.time() tree = constructor.nj(dm) print('Constructed nj tree in {:.4f}s'.format(time.time() - t)) Phylo.write(tree, "njtree.dnd", "newick") remove_inner_nodes_tree("njtree.dnd") """
def ex01():
    """Draw UPGMA and NJ trees (blosum62 distances) for each named alignment.

    ``get_alignments()`` is expected to yield ``(alignment, name)`` pairs.
    """
    named_alignments = get_alignments()
    blosum_calc = DistanceCalculator('blosum62')
    builder = DistanceTreeConstructor()
    for alignment, label in named_alignments:
        distances = blosum_calc.get_distance(alignment)
        trees = (builder.upgma(distances), builder.nj(distances))
        print("\n\n>>> {}".format(label))
        # UPGMA is drawn first, then NJ
        for tree in trees:
            draw(tree)
def test_correct_res(self):
    """The fitted tree must be isomorphic to Biopython's NJ tree."""
    # Fit the tree under test on the CSV distance matrix
    csv_matrix = pd.read_csv("data/wiki_tree.csv", index_col=0)
    self.tree.set_distance_matrix(csv_matrix)
    self.tree.fit()

    # Reference tree built by Biopython from a lower-triangular matrix
    reference_dm = _DistanceMatrix(
        names=['a', 'b', 'c', 'd', 'e'],
        matrix=[[0], [5, 0], [9, 10, 0], [9, 10, 8, 0], [8, 9, 7, 3, 0]])
    lib_tree = DistanceTreeConstructor().nj(reference_dm)

    expected_graph = Phylo.to_networkx(lib_tree).to_undirected()
    actual_graph = Phylo.to_networkx(self.tree.get_tree()).to_undirected()
    self.assertTrue(is_isomorphic(expected_graph, actual_graph))
def run_optimization():
    """Compute pairwise curve distances and draw UPGMA and NJ trees.

    Distances between the 16 parameter sets from ``get_data()`` are
    ``abs(compare_curves(...))``. The matrix is plotted, both trees are
    drawn as ASCII, and the dense symmetric distance array is returned.
    """
    params = get_data()
    num_samples = 16
    NUM_OF_VERTICES = 200

    # Fill the upper triangle and mirror it
    distances = np.zeros((num_samples, num_samples))
    for i in range(num_samples):
        for j in range(i + 1, num_samples):
            print("working on the pair", (i, j))
            distances[i, j] = np.abs(compare_curves(params[i], params[j], num_of_verts=NUM_OF_VERTICES))
            distances[j, i] = distances[i, j]

    # Plot distance matrix and make phylogenetic tree
    plt.matshow(distances)
    plt.colorbar()
    # The original wrote ``plt.show`` without calling it, so the plot
    # was never shown.
    plt.show()

    # Lower-triangular rows, sized by num_samples instead of a second
    # hard-coded 16.
    distaceMat = [list(distances[i, :i+1]) for i in range(num_samples)]
    distaceMatrix = DistanceMatrix(names=['a1', 'a2', 'a3', 'a4', 'b1', 'b2', 'b3', 'b4',
                                          'c1', 'c2', 'c3', 'c4', 'd1', 'd2', 'd3', 'd4'],
                                   matrix=distaceMat)
    constructor = DistanceTreeConstructor()
    tree_up = constructor.upgma(distaceMatrix)
    tree_nj = constructor.nj(distaceMatrix)
    Phylo.draw_ascii(tree_nj)
    Phylo.draw_ascii(tree_up)
    return distances
def MakePlot(x, org_names, ckm30, ckm50, outgroup, outfile, outfilexml, sum_x):
    """Render an NJ tree with abundance bubbles and export it as PhyloXML.

    An NJ tree is built from distances derived from the two normalised
    ckm matrices. Hypothetical nodes are inserted along branches, each
    node gets a bubble scaled by its entry in ``x``, the tree is rendered
    to ``outfile`` and a PhyloXML project is written to ``outfilexml``.

    NOTE(review): relies on Python 2 semantics - ``map`` results are
    indexed and searched as lists (``x[...]``, ``dists.index``).
    NOTE(review): ``names = org_names`` is an alias, so the de-duplication
    loop below also renames entries of the caller's ``org_names`` list.
    ``sum_x`` is only used for the legend text.
    """
    #Make sure names are unique
    names = org_names
    for name in names:
        if names.count(name)>1:
            temp_name = name
            i=1
            for dummy in range(0,names.count(name)-1): #Don't change the last one, just to make sure we don't conflict with the outgroup
                names[names.index(temp_name)] = temp_name + "_" + str(i)
                i = i +1

    #Normalize the x vector
    x = map(lambda y: y/sum(x),x)
    # Scale each ckm matrix by the reciprocal of its diagonal
    ckm30_norm = np.multiply(ckm30,1/np.diag(ckm30))
    ckm50_norm = np.multiply(ckm50,1/np.diag(ckm50))
    num_rows = ckm30_norm.shape[0]
    num_cols = ckm30_norm.shape[1]  # not used below
    # Lower-triangular distance rows: average over both matrices of
    # 1 minus the symmetrised normalised value
    matrix=list()
    for i in range(num_rows):
        matrix.append([.5*(1-.5*ckm30_norm[i,j]-.5*ckm30_norm[j,i])+.5*(1-.5*ckm50_norm[i,j]-.5*ckm50_norm[j,i]) for j in range(i+1)])

    #Make the list of distances (ave of the two ckm matrices)
    ckm_ave_train = .5*ckm30_norm+.5*ckm50_norm
    ckm_ave_train_dist = dict()
    for i in range(len(org_names)):
        ckm_ave_train_dist[org_names[i]] = [.5*ckm_ave_train[i,j]+.5*ckm_ave_train[j,i] for j in range(len(org_names))]

    #Construct the tree. Note I could use RapidNJ here, but a few tests have shown that the trees that RapidNJ creates are rubbish.
    dm = _DistanceMatrix(names, matrix)
    constructor = DistanceTreeConstructor()
    tree = constructor.nj(dm)
    # Re-parse the newick text into a Tree object; the rest of the
    # function works on ``t``
    t=Tree(tree.format('newick'),format=1)
    #tree.format('newick')
    #Phylo.draw_ascii(tree)

    #Now I will put internal nodes in a certain phylogenetic distance between the root and a given node.
    #Function to insert a node at a given distance
    def insert_node(t, name_to_insert, insert_above, dist_along):
        # Splits the branch above ``insert_above``: the new node sits
        # ``dist_along`` below the parent and the detached node is
        # re-attached beneath it with the remaining branch length.
        insert_at_node = t.search_nodes(name=insert_above)[0]
        parent = (t&insert_above).up
        orig_branch_length = t.get_distance(insert_at_node,parent)
        if orig_branch_length < dist_along:
            raise ValueError("error: dist_along larger than orig_branch_length in PlotPackage.py")
        removed_node = insert_at_node.detach()
        removed_node.dist = orig_branch_length - dist_along
        added_node = parent.add_child(name=name_to_insert, dist=dist_along)
        added_node.add_child(removed_node)

    #Function to insert a node some % along a branch, taking into account the ckm distances and nodes already created in the NJ tree (and what distance their descendants are from everyone else)
    def insert_hyp_node(t, leaf_name, percent, ckm_ave_train_dist, org_names):
        dists = map(lambda y: abs(y-percent), ckm_ave_train_dist[leaf_name])
        nearby_indicies = list()
        #Add all the organisms that are within 0.05 of the given percent
        # for i in range(len(dists)):
        #     if dists[i]<=.05:
        #         nearby_indicies.append(i)
        # With the loop above disabled, nearby_indicies is always empty,
        # so only the single closest organism is ever used.
        nearby_names = list()
        #If there are no nearby indicies, add the closest organism to the given percent
        if nearby_indicies==[]:
            nearby_names.append(org_names[dists.index(min(dists))])
        else:
            for i in range(len(nearby_indicies)):
                nearby_names.append(org_names[i])
        mean_dist = np.mean(map(lambda y: ckm_ave_train_dist[leaf_name][org_names.index(y)],nearby_names))
        nearby_names.append(leaf_name)
        LCA = t.get_common_ancestor(nearby_names)
        LCA_to_leaf_dist = t.get_distance(LCA,leaf_name)
        #divide the dist to the right/left of the LCA node by the number of percentage points in there
        if LCA.name==t.name:
            percent_dist = percent*LCA_to_leaf_dist
            if mean_dist <= percent:
                child_node = (t&leaf_name)
            else:
                child_node = (t&nearby_names[0])#This means "go up from root" in the direction of the nearest guy
            ancestor_node = (t&child_node.name).up
        elif mean_dist <= percent:
            percent_dist = t.get_distance(LCA) + abs(percent-mean_dist)*(LCA_to_leaf_dist)/(1-mean_dist)
            child_node = (t&leaf_name)
            ancestor_node = (t&child_node.name).up
        else:
            percent_dist = t.get_distance(LCA) - abs(percent-mean_dist)*(t.get_distance(LCA))/(mean_dist)
            child_node = (t&leaf_name)
            ancestor_node = (t&child_node.name).up
        # Climb towards the root until the ancestor is no farther from
        # the root than percent_dist
        while t.get_distance(t.name, ancestor_node) > percent_dist:
            child_node = ancestor_node
            ancestor_node = (t&child_node.name).up
        insert_node(t, leaf_name+"_"+str(percent), child_node.name, percent_dist-t.get_distance(t.name, ancestor_node))

    #Set outgroup
    if outgroup in names:
        t.set_outgroup(t&outgroup) #I will need to check that this outgroup is actually one of the names...
    else:
        print("WARNING: the chosen outgroup " + outgroup + " is not in the given taxonomy: ")
        print(names)
        print("Proceeding without setting an outgroup. This may cause results to be uninterpretable.")

    #Insert hypothetical nodes
    hyp_node_names = dict()
    cutoffs = [.9,.8,.7,.6,.5,.4,.3,.2,.1]
    # Each cutoff is transformed by a fixed cubic polynomial
    cutoffs = [-.5141*(val**3)+1.0932*(val**2)+0.3824*val for val in cutoffs]
    for i in range(len(org_names)):
        # Every len(org_names)-th entry of x starting at i: element 0 is
        # the organism's own entry, element j pairs with cutoffs[j-1]
        xi = x[i:len(x):len(org_names)]
        for j in range(1,len(cutoffs)+1):
            if xi[j]>0:
                insert_hyp_node(t, org_names[i], cutoffs[j-1],ckm_ave_train_dist, org_names)
                hyp_node_names[org_names[i]+"_"+str(cutoffs[j-1])] = [org_names[i], cutoffs[j-1], j-1] #in case there are "_" in the file names

    size_factor=250
    font_size=55

    #Now put the bubbles on the nodes
    def layout(node):
        node_style = NodeStyle()
        node_style["hz_line_width"] = 10
        node_style["vt_line_width"] = 10
        node.set_style(node_style)
        #print(node)
        if node.is_leaf():
            if node.name in org_names:
                #make reconstructed bubble
                size = x[org_names.index(node.name)]
                F = CircleFace(radius=size_factor*math.sqrt(size), color="RoyalBlue", style="sphere")
                F.border.width = None
                F.opacity = 0.6
                faces.add_face_to_node(F,node, 0, position="branch-right")
                #Denote that this was a training organism
                nameFace = AttrFace("name", fsize=font_size, fgcolor='black')
                faces.add_face_to_node(nameFace, node, 0, position="branch-right")
            elif node.name in hyp_node_names: #Otherwise it's a hypothetical node, just use recon x
                node_base_name = hyp_node_names[node.name][0]
                percent = hyp_node_names[node.name][1]  # not used below
                if node_base_name in org_names:
                    idx = hyp_node_names[node.name][2]
                    # Entry of x for this organism at cutoff idx
                    # (same layout as the xi slice above)
                    size = x[org_names.index(node_base_name)+(idx+1)*len(org_names)]
                    F = CircleFace(radius=size_factor*math.sqrt(size), color="RoyalBlue", style="sphere")
                    F.border.width = None
                    F.opacity = 0.6
                    faces.add_face_to_node(F,node, 0, position="branch-right")
                    #This is if I want the names of the hypothetical nodes to be printed as well
                    #nameFace = AttrFace("name", fsize=font_size, fgcolor='black')
                    #faces.add_face_to_node(nameFace, node, 0, position="branch-right")
                else:
                    size=0
            else:
                size=0

    ts = TreeStyle()
    ts.layout_fn = layout
    ts.mode = "r"
    #ts.mode = "c"
    ts.scale = 2*1000
    ts.show_leaf_name = False
    ts.min_leaf_separation = 50
    # Legend: one reference bubble plus two text lines
    F = CircleFace(radius=.87*size_factor, color="RoyalBlue", style="sphere")
    F.border.width = None
    F.opacity = 0.6
    ts.legend.add_face(F,0)
    ts.legend.add_face(TextFace(" Inferred relative abundance",fsize=1.5*font_size,fgcolor="Blue"),1)
    ts.legend.add_face(TextFace(" Total absolute abundance depicted " + str(sum_x)[0:8], fsize=1.5*font_size,fgcolor="Black"),1)
    ts.legend_position=4
    #t.show(tree_style=ts)
    t.render(outfile, w=550, units="mm", tree_style=ts)

    #Render the XML file
    project = Phyloxml()
    phylo = phyloxml.PhyloxmlTree(newick=t.write(format=0, features=[]))
    project.add_phylogeny(phylo)
    # NOTE(review): the file object opened here is never explicitly closed.
    project.export(open(outfilexml,'w'))
# CAGTTCGCCACAA Gamma # Several thigns can be done witht he alignment: get a distance matrix from it: dstcalc = DistanceCalculator('identity') dm = dstcalc.get_distance(aln) # DistanceMatrix(names=['Alpha', 'Beta', 'Gamma', 'Delta', 'Epsilon'], matrix=[[0], [0.23076923076923073, 0], [0.3846153846153846, 0.23076923076923073, 0], [0.5384615384615384, 0.5384615384615384, 0.5384615384615384, 0], [0.6153846153846154, 0.3846153846153846, 0.46153846153846156, 0.15384615384615385, 0]]) print "What's the get_distance(aln) from DistanceCalculator('identity') object?" print type(dm) print dm # Alpha 0 # Beta 0.230769230769 0 # Gamma 0.384615384615 0.230769230769 0 # Delta 0.538461538462 0.538461538462 0.538461538462 0 # Epsilon 0.615384615385 0.384615384615 0.461538461538 0.153846153846 0 # build a tree from it. from Bio.Phylo.TreeConstruction import DistanceTreeConstructor construc0 = DistanceTreeConstructor(dstcalc, 'nj') tre0 = construc0.build_tree(aln) print type(tre0) # as you can see from abovedstcalc is needed for te constructor and then # to build the tree the alignment is needed. That's two things which need to originae fromt he same thing. # A bit of a tall order # You can build the tree from a distance matrix only, by leaving out the aln argument # by not using the build_tree method on the constructor, but rather the .nj method construc2 = DistanceTreeConstructor() tre2 = construc2.nj(dm) print type(tre2)
def nj_tree(distanceMatrix, out_path="geneContentTree.newick"):
    """Build a neighbor-joining tree from a distance matrix and save it.

    Args:
        distanceMatrix: distance matrix passed unchanged to
            ``DistanceTreeConstructor.nj``.
        out_path: destination of the Newick file. Defaults to the
            previously hard-coded ``"geneContentTree.newick"``.

    Returns:
        The tree object produced by ``DistanceTreeConstructor.nj`` (earlier
        versions returned ``None``; callers ignoring the result are
        unaffected).
    """
    # Single-argument parenthesised print() works on Python 2 and Python 3.
    print("Constructing Neighbor Joining Tree")
    constructor = DistanceTreeConstructor()
    tree = constructor.nj(distanceMatrix)
    Phylo.write(tree, out_path, "newick")
    print("Done constructing tree")
    return tree
from Bio import Phylo
from Bio.Phylo.TreeConstruction import DistanceTreeConstructor
from Bio.Phylo.TreeConstruction import _DistanceMatrix


def dm_to_tree(dm):
    """Convert a square distance table into an unnamed-internal-node NJ tree.

    Args:
        dm: object exposing ``astype``, ``values``, ``columns`` and ``len``
            (presumably a square pandas DataFrame - confirm against callers).

    Returns:
        The tree built by ``DistanceTreeConstructor.nj`` with the name of
        every non-terminal clade reset to ``None``.

    Raises:
        Exception: whatever ``_DistanceMatrix`` raises for invalid input is
            re-raised after diagnostic information has been printed.
    """
    dm = dm.astype(float)
    # Row i of the lower triangle holds columns 0..i (diagonal included).
    distance_triangular = [list(dm.values[i, : i + 1]) for i in range(len(dm))]
    try:
        dm = _DistanceMatrix(names=[str(i) for i in dm.columns],
                             matrix=distance_triangular)
    except Exception:
        # `dm` is still the input table here because the assignment failed.
        # Dump what was handed to _DistanceMatrix to ease debugging.
        print(list(dm.columns))
        print([type(i) for i in dm.columns])
        print(type(distance_triangular))
        print(type(distance_triangular[0]))
        print(set(str(type(i)) for j in distance_triangular for i in j))
        print(distance_triangular)
        # Bare raise keeps the original traceback on Python 2 and 3.
        raise
    constructor = DistanceTreeConstructor()
    tree = constructor.nj(dm)
    # Drop the names assigned to internal nodes.
    for c in tree.get_nonterminals():
        c.name = None
    return tree
def best_elements_order_tree(relations, elements = None, filter_order = None):
    """Pick an element ordering using a neighbor-joining tree as a guide.

    Elements are compared pairwise by counting the properties that only one
    of the two has, an NJ tree is built from those distances, candidate
    orders are enumerated from the tree and the best-scoring one is returned.

    Args:
        relations: passed to ``relations_2_model`` to obtain the elements,
            the properties and the property -> element -> relation mapping.
        elements: elements to order; when falsy, the elements returned by
            ``relations_2_model`` are used.
        filter_order: not used by this function body.

    Returns:
        The first item of the pair returned by ``best`` (presumably the
        highest-scoring order - confirm against ``best``).
    """
    present_elements, present_element_groups, properties, property_groups, element_2_property_2_relation, property_2_element_2_relation = relations_2_model(relations)
    if not elements: elements = present_elements

    # Disabled alternative: similarity-based distance normalised by the
    # number of properties.
    # distances = {}
    # for e1 in elements:
    #     for e2 in elements:
    #         if (e1 is e2) or (id(e1) > id(e2)): continue
    #         nb_similarity = 0
    #         for property in properties[:]:
    #             if True == (e1 in property_2_element_2_relation[property]) == (e2 in property_2_element_2_relation[property]):
    #                 nb_similarity += 2
    #             elif (e1 in property_2_element_2_relation[property]) == (e2 in property_2_element_2_relation[property]):
    #                 nb_similarity += 1
    #         distances[e1, e2] = distances[e2, e1] = 1.0 - nb_similarity / len(properties)

    # Distance between two elements = number of properties held by exactly
    # one of them. The id() comparison makes each unordered pair be
    # processed once; both key orders are stored.
    distances = {}
    for e1 in elements:
        for e2 in elements:
            if (e1 is e2) or (id(e1) > id(e2)): continue
            d = 0
            for property in properties[:]:
                if (e1 in property_2_element_2_relation[property]) != (e2 in property_2_element_2_relation[property]):
                    d += 1.0
            distances[e1, e2] = distances[e2, e1] = d

    # Tree leaves are named by element label; this maps them back.
    label_2_element = { element.label : element for element in elements }

    from Bio.Phylo.TreeConstruction import _DistanceMatrix as DistanceMatrix, DistanceTreeConstructor
    dm = DistanceMatrix([element.label for element in elements])
    for e1 in elements:
        for e2 in elements:
            if (e1 is e2) or (id(e1) > id(e2)): continue
            dm[e1.label, e2.label] = distances[e1, e2]
    print(dm, file = sys.stderr)

    treebuilder = DistanceTreeConstructor(None)
    tree = treebuilder.nj(dm)
    #tree = treebuilder.upgma(dm)
    print(tree, file = sys.stderr)

    def walker(clade):
        # Return the list of candidate element orders for the subtree rooted
        # at `clade`: a leaf yields one single-element order, an internal
        # node combines its children's candidates through all_orders() and
        # all_combinations() (helpers defined elsewhere).
        if clade.clades:
            results = []
            partss = [walker(child) for child in clade.clades]
            for ordered_parts in all_orders(partss):
                combinations = all_combinations(ordered_parts)
                results.extend(combinations)
            return results
        else:
            element = label_2_element[clade.name]
            return [ [element] ]

    orders = walker(tree.root)
    print(len(orders), file = sys.stderr)

    def score_order(order):
        # Score an order by the gaps ("holes") it leaves inside each
        # property's run of elements; every component is negated, so fewer
        # and shorter holes give a larger tuple.
        nb_hole = 0
        nb_prop_with_hole = 0
        total_hole_length = 0
        for property in properties:
            start = None   # index of the first element having the property
            end = None     # index of the last element having the property
            in_hole = False
            for i, element in enumerate(order):
                if element in property_2_element_2_relation[property]:
                    if start is None: start = i
                    end = i
                    in_hole = False
                else:
                    # First non-member after the run has started opens a hole.
                    if (not start is None) and (not in_hole):
                        in_hole = True
                        nb_hole += 1
            # After end, it is not a hole!
            # NOTE(review): this also decrements when the property matched no
            # element (end is None), and `i` is unbound for an empty order -
            # confirm that is intended.
            if end != i: nb_hole -= 1
            if not end is None:
                length = end - start + 1
                # A span longer than the property's member count contains
                # at least one non-member.
                if length > len(property_2_element_2_relation[property]):
                    total_hole_length += length - len(property_2_element_2_relation[property])
                    nb_prop_with_hole += 1
        return (-nb_prop_with_hole, -nb_hole * 2 + -total_hole_length)

    order, score = best(orders, score_order, score0 = (-sys.maxsize, -sys.maxsize))
    return order
from Bio import Phylo
from Bio.Phylo.TreeConstruction import _DistanceMatrix
from Bio.Phylo.TreeConstruction import DistanceTreeConstructor
from io import StringIO
import re


# hamming distance
def hamming(seq1, seq2):
    """Return the number of positions at which seq1 and seq2 differ.

    Positions are paired with zip(), so any surplus tail of the longer
    sequence is ignored.
    """
    # assert len(seq1) == len(seq2), 'unequal reads!'
    return sum(a != b for a, b in zip(seq1, seq2))


# First line: species names. Remaining lines: one character row per line;
# transposing the rows yields one character string per species.
with open('rosalind_chbp.txt') as f:
    species = f.readline().rstrip().split()
    table = [''.join(i) for i in zip(*f.read().rstrip().split())]
n = len(table)

# For the Phylo.TreeConstruction to work, integers in the distance matrix
# must be Python int and not numpy.int64.
# Lower-triangular matrix (diagonal included) of pairwise Hamming distances.
dm = [[hamming(table[i], table[j]) for j in range(i+1)] for i in range(n)]

constructor = DistanceTreeConstructor()
tree = constructor.nj(_DistanceMatrix(names=species, matrix=dm))

handle = StringIO()
Phylo.write(tree, handle, format='newick', plain=True)
result = handle.getvalue()
# Strip the names given to internal nodes from the Newick text.
result = re.sub('Inner[0-9]+', '', result)

with open('rosalind_chbp_sub.txt', 'wt') as out:
    out.write(result)
def main():
    """Run the isolate QC pipeline selected on the command line.

    Relies on module-level state defined elsewhere in this file (``ARGS``,
    ``PARSER``, ``Isolate`` and the ``*_multiprocessing`` workers, among
    others). For the "run" sub-command it:

    1. Reads isolate IDs from ``ARGS.LIMS_request_sheet`` and locates their
       QC folders.
    2. Assigns every isolate a short temporary ID and, when andi is
       requested, symlinks its assembly into a temporary directory.
    3. If ``ARGS.metadata_run``: gathers kraken, contig/read metrics,
       abricate and MLST results into one metadata table, prints it as CSV
       and writes it to an Excel file.
    4. If ``ARGS.andi_run``: runs andi, infers an NJ tree from its distance
       matrix, makes branch lengths non-negative, midpoint-roots the tree
       and writes it to file.
    5. If ``ARGS.roary_run``: runs prokka where needed, then roary per
       consensus species, plus tree re-rooting, SNV distances and the
       fripan export.

    NOTE(review): the roary step reads ``metadata_overall`` and the species
    grouping, both of which are only populated when ``ARGS.metadata_run`` is
    set - confirm the two flags are always used together.
    """
    global YIELD_FILE
    global MLST_FILE
    global FORCE_MLST_SCHEME
    #Set up the file names for Nullarbor folder structure
    YIELD_FILE = 'yield.tab'
    MLST_FILE = 'mlst.tab'
    #Add MLST schemes to force their usage if that species is encountered
    #Only force schemes if there are two (e.g., A baumannii and E coli)
    FORCE_MLST_SCHEME = {"Acinetobacter baumannii": "abaumannii_2",
                         "Campylobacter jejuni": "campylobacter",
                         #"Citrobacter freundii": "cfreundii",
                         #"Cronobacter": "cronobacter",
                         "Enterobacter cloacae": "ecloacae",
                         "Escherichia coli": "ecoli",
                         #"Klebsiella oxytoca": "koxytoca",
                         #"Klebsiella pneumoniae": "kpneumoniae",
                         #"Pseudomonas aeruginosa": "paeruginosa"
                         "Shigella sonnei": "ecoli",
                         "Salmonella enterica": "senterica",
                         "Vibrio cholerae": "vcholerae"
                        }

    # Guard clauses: both branches terminate the process.
    if not ARGS.subparser_name:
        PARSER.print_help()
        sys.exit()
    if ARGS.subparser_name == 'version':
        from .utils.version import Version
        Version()
        sys.exit()

    # From here on: ARGS.subparser_name == "run"
    if ARGS.Nullarbor_folders:
        print('Nullarbor folder structure selected.')
        YIELD_FILE = 'yield.clean.tab'
        MLST_FILE = 'mlst2.tab'
    sheet_stem = os.path.splitext(os.path.basename(ARGS.LIMS_request_sheet))[0]
    EXCEL_OUT = f"{sheet_stem}_results.xlsx"
    if ARGS.threads > cpu_count():
        sys.exit(f'Number of requested threads must be less than {cpu_count()}.')
    print(str(ARGS.threads) +' CPU processors requested.')

    #Check if final slash in manually specified wgs_qc path
    # endswith() also copes with an empty path, where [-1] would raise.
    if not ARGS.wgs_qc.endswith('/'):
        print('\n-wgs_qc path is entered as '+ARGS.wgs_qc)
        print('You are missing a final \'/\' on this path.')
        print('Exiting now.\n')
        sys.exit()

    #i) read in the IDs from file
    xls_table = get_isolate_request_IDs(ARGS.LIMS_request_sheet)
    IDs = list(set(xls_table.index.values))
    #base should be a global, given that it is used in other functions too.
    base = os.path.splitext(ARGS.LIMS_request_sheet)[0]

    #ii) Return a folder path to the QC data for each available ID
    #    using a wildcard search of the ID in IDs in ARGS.wgs_qc path.
    iso_paths = isolates_available(IDs)
    #Drop the path and keep the folder name
    isos = [i.split('/')[-1] for i in iso_paths]

    #iii) make tempdir to store the temp_contigs there for 'andi' analysis.
    assembly_tempdir = make_tempdir()

    #vi) Copy contigs to become temp_contigs into tempdir, only if andi
    #requested.
    #Translation dict to store {original filename: short temporary ID}
    iso_ID_trans = {}
    #Dict to store each isolate under each consensus species
    from collections import defaultdict
    isos_grouped_by_cons_spp = defaultdict(list)
    for iso in isos:
        #Instantiate an Isolate class for each isolate in isolates
        sample = Isolate(iso)
        #Next, we could just use iso_path+/contigs.fa, but that would skip
        #the if os.path.exists() test in sample.assembly(iso).
        assembly_path = sample.assembly()
        short_id = shortened_ID()
        #Store key,value as original_name,short_id for later retrieval.
        iso_ID_trans[iso] = short_id
        if ARGS.andi_run:
            cmd = 'ln -s '+assembly_path+' '+assembly_tempdir+'/'+short_id+\
                  '_contigs.fa'
            os.system(cmd)
            print('Creating symlink:', cmd)

    if iso_ID_trans:
        with open(base+'_temp_names.txt', 'w') as tmp_names:
            print('\nTranslated isolate IDs:\nShort\tOriginal')
            for original_name, short_name in iso_ID_trans.items():
                print(short_name+'\t'+original_name)
                tmp_names.write(short_name+'\t'+original_name+'\n')

    if ARGS.metadata_run:
        #summary_frames will store all of the metaDataFrames herein
        summary_frames = []
        n_isos = len(isos)
        if n_isos == 0:
            print('\nNo isolates detected in the path '+ARGS.wgs_qc+'.')
            print('Exiting now.\n')
            sys.exit()

        #Kraken set at 2 threads, so 36 processes can run on 72 CPUs
        #Pool size is capped by the number of isolates; max(1, ...) keeps a
        #single-thread request from asking for an empty pool. The with-block
        #releases the worker processes once the results are collected.
        with Pool(max(1, min(n_isos, ARGS.threads//2))) as p:
            print(f'\nRunning kraken on the assemblies ({ARGS.assembly_name} files):')
            results_k_cntgs = p.map(kraken_contigs_multiprocessing, isos)
        print(results_k_cntgs)
        #concat the dataframe objects
        res_k_cntgs = pd.concat(results_k_cntgs, axis=0, sort=False)
        print('\nKraken_contigs results gathered from kraken on contigs...')

        #The remaining gathering jobs use a single process per job.
        with Pool(max(1, min(n_isos, ARGS.threads))) as p:
            #Multiprocessor retrieval of kraken results on reads.
            results_k_reads = p.map(kraken_reads_multiprocessing, isos)
            res_k_reads = pd.concat(results_k_reads, axis=0)
            print('Kraken_reads results gathered from kraken.tab files...')

            #Multiprocessor retrieval of contig metrics.
            results_metrics_contigs = p.map(metricsContigs_multiprocessing, isos)
            res_m_cntgs = pd.concat(results_metrics_contigs, axis=0)
            print('Contig metrics gathered using \'fa -t\'...')

            #Multiprocessor retrieval of read metrics.
            results_metrics_reads = p.map(metricsReads_multiprocessing, isos)
            res_m_reads = pd.concat(results_metrics_reads, axis=0)
            print('Read metrics gathered from '+YIELD_FILE+' files...')

            #Multiprocessor retrieval of abricate results.
            results_abricate = p.map(abricate_multiprocessing, isos)
            res_all_abricate = pd.concat(results_abricate, axis=0, sort=False)
            res_all_abricate.fillna('', inplace=True)
            print('Resistome hits gathered from abricate.tab files...')

        #append the dfs to the summary list of dfs
        summary_frames.extend([res_k_cntgs, res_k_reads, res_m_cntgs,
                               res_m_reads, res_all_abricate])

        #These next steps build up the metadata not yet obtained
        #(via multiprocesses above).
        #Let's store the metadata for each isolate in summary_isos,
        #populated isolate by isolate (in series)
        summary_isos = []
        for iso in isos:
            sample = Isolate(iso)
            species_cntgs = res_k_cntgs.loc[iso, 'sp_krkn1_cntgs']
            species_reads = res_k_reads.loc[iso, 'sp_krkn1_reads']
            #Consensus species only when reads and contigs agree
            if species_cntgs == species_reads:
                species = species_cntgs
            else:
                species = 'indet'
            mlst_df = sample.mlst(species, sample.assembly())
            species_cons_df = pd.DataFrame(
                [{'sp_krkn_ReadAndContigConsensus':species}], index=[iso])
            summary_isos.append(pd.concat([mlst_df, species_cons_df], axis=1))

        #Glue the isolate by isolate metadata into a single df
        summary_isos_df = pd.concat(summary_isos)
        #Glue the dataframes built during multiprocessing processes
        summary_frames_df = pd.concat(summary_frames, axis=1)
        #Finish up with everything in one table!
        metadata_overall = pd.concat([xls_table, summary_isos_df,
                                      summary_frames_df], axis=1, sort=False)
        metadata_overall.fillna('', inplace=True)
        metadata_overall.index.name = 'ISOLATE'
        print('\nMetadata super-matrix:')
        #Write this supermatrix (metadata_overall) to stdout and to Excel
        metadata_overall.to_csv(sys.stdout)
        #The context manager saves and closes the workbook; the explicit
        #writer.save() call no longer exists in recent pandas releases.
        with pd.ExcelWriter(EXCEL_OUT) as writer:
            metadata_overall.to_excel(writer, sheet_name='Sheet 1',
                                      freeze_panes=(1, 1))
        print(f"\nResults written to {os.path.abspath(EXCEL_OUT)}")
        for k, v in zip(metadata_overall['sp_krkn_ReadAndContigConsensus'],
                        metadata_overall.index):
            isos_grouped_by_cons_spp[k.replace(' ', '_')].append(v)

    #Run andi?
    if ARGS.andi_run:
        #Run andi
        andi_mat = 'andi_'+ARGS.model_andi_distance+'dist_'+base+'.mat'
        andi_c = 'nice andi -j -m '+ARGS.model_andi_distance+' -t '+\
                 str(ARGS.threads)+' '+assembly_tempdir+'/*_contigs.fa > '+\
                 andi_mat
        print('\nRunning andi with: \''+andi_c+'\'')
        os.system(andi_c)

        #Read in the andi dist matrix, convert to lower triangle
        dm = read_file_lines(andi_mat)[1:]
        dm = lower_tri(dm)
        #Correct the names in the matrix: swap each short ID back to the
        #original isolate name (the original author timed this nested loop
        #as slightly faster than list.index()).
        for iso in isos:
            for i in range(0, len(dm.names)):
                #iso_ID_trans[iso] is the short_id
                if dm.names[i] == iso_ID_trans[iso]:
                    dm.names[i] = iso

        #From the distance matrix in dm, infer the NJ tree
        from Bio.Phylo.TreeConstruction import DistanceTreeConstructor
        constructor = DistanceTreeConstructor()
        njtree = constructor.nj(dm)
        njtree.rooted = True
        from Bio import Phylo
        Phylo.write(njtree, 'temp.tre', 'newick')
        from ete3 import Tree
        t = Tree('temp.tre', format=1)
        #Get rid of negative branch lengths (an artefact, not an error, of NJ)
        for node in t.traverse():
            node.dist = abs(node.dist)
        t.set_outgroup(t.get_midpoint_outgroup())
        t_out = base+'_andi_NJ_'+ARGS.model_andi_distance+'dist.nwk.tre'
        t.write(format=1, outfile=t_out)
        print('Final tree (midpoint-rooted, NJ under '+\
              ARGS.model_andi_distance+' distance) looks like this:')
        #Print the ascii tree
        print(t)
        #Remove the temp.tre
        os.remove('temp.tre')
        print('Tree (NJ under '+ARGS.model_andi_distance+\
              ' distance, midpoint-rooted) written to '+t_out+'.')

    #Run roary?
    if ARGS.roary_run:
        #Entries of the roary output folder that survive the cleanup below
        roary_keepers = [
            "accessory.header.embl",
            "accessory.tab",
            "accessory_binary_genes.fa",
            "accessory_binary_genes.fa.newick",
            "accessory_binary_genes_midpoint.nwk.tre",
            "accessory_graph.dot",
            "blast_identity_frequency.Rtab",
            "clustered_proteins",
            "core_accessory.header.embl",
            "core_accessory.tab",
            "core_accessory_graph.dot",
            "core_gene_alignment.aln",
            "gene_presence_absence.Ltab.csv",
            "gene_presence_absence.Rtab",
            "gene_presence_absence.csv",
            "number_of_conserved_genes.Rtab",
            "number_of_genes_in_pan_genome.Rtab",
            "number_of_new_genes.Rtab",
            "number_of_unique_genes.Rtab",
            "pan_genome_reference.fa",
            "pan_genome_sequences",
            "summary_statistics.txt"
            ]
        #Only annotate isolates without an existing prokka folder
        params = [(i, 'prokka') for i in isos if not
                  os.path.exists('prokka/'+i)]
        if params:
            print('\nRunning prokka:')
            with Pool(max(1, min(len(params), ARGS.threads//2))) as p:
                p.map(prokka, params)
        else:
            print('\nProkka files already exist. Let\'s move on to '+\
                  'the roary analysis...')

        #Run Roary on the species_consensus subsets.
        print('Now, let\'s run roary!')
        for k, v in list(isos_grouped_by_cons_spp.items()):
            print(k, v)
            n_isos = len(v)
            if n_isos > 1:
                roary_dir = base+'_'+k+'_roary'
                shutil.rmtree(roary_dir, ignore_errors=True)
                roary(base, k, ' '.join(['prokka/'+iso+'/*.gff' for iso in v]))
                roary_genes = pd.read_table(roary_dir+
                                            '/gene_presence_absence.'+\
                                            'Rtab',
                                            index_col=0, header=0)
                roary_genes = roary_genes.transpose()
                roary_genes.to_csv(roary_dir+
                                   '/gene_presence_absence.Ltab.csv',
                                   mode='w', index=True, index_label='name')
                if n_isos > 2:
                    from ete3 import Tree
                    t = Tree(roary_dir+
                             '/accessory_binary_genes.fa.newick', format=1)
                    #Get rid of negative branch lengths (an artefact,
                    #not an error, of NJ)
                    for node in t.traverse():
                        node.dist = abs(node.dist)
                    t.set_outgroup(t.get_midpoint_outgroup())
                    t_out = roary_dir+\
                            '/accessory_binary_genes_midpoint.nwk.tre'
                    t.write(format=1, outfile=t_out)
                    print('\nWritten midpoint-rooted roary tree.\n')

                #Delete everything in the roary folder that is not a keeper.
                wd = os.getcwd()
                os.chdir(roary_dir)
                for f_name in glob.glob('*'):
                    if f_name in roary_keepers:
                        continue
                    #Real directories need rmtree; files and symlinks need
                    #os.remove. Calling both unconditionally raised on any
                    #directory that rmtree had just deleted.
                    if os.path.isdir(f_name) and not os.path.islink(f_name):
                        shutil.rmtree(f_name, ignore_errors=True)
                    else:
                        os.remove(f_name)
                os.chdir(wd)

                if n_isos <= 2:
                    print('Need more than two isolates to have a meaningful '+\
                          'pangenome tree. No mid-point rooting of the ' +\
                          'pangenome tree performed.')

                wd = os.getcwd()
                os.chdir(roary_dir)
                os.system('python ../collapseSites.py -f core_gene_alignment.aln -i fasta -t '+str(ARGS.threads))
                if os.path.exists('core_gene_alignment_collapsed.fasta'):
                    os.system('FastTree -nt -gtr < core_gene_alignment_collapsed.fasta > core_gene_FastTree_SNVs.tre')
                    #calc pairwise snp dist and write to file
                    with open('core_gene_alignment_collapsed.fasta', 'r') as inf:
                        from Bio import AlignIO
                        aln = AlignIO.read(inf, 'fasta')
                    #One job per row i of the lower triangle: (aln, i, 0..i)
                    pairs = [[(aln, i, j) for j in range(0, i+1)]
                             for i in range(0, len(aln))]
                    with Pool(max(1, min(len(pairs), ARGS.threads))) as p:
                        print('Running pw comparisons in parallel...')
                        result = p.map(pw_calc, pairs)
                    summary = pd.concat(result, axis=0, sort=False)
                    summary.fillna('', inplace=True)
                    with open('core_gene_alignment_SNV_distances.tab', 'w') as distmat:
                        summary.to_csv(distmat, mode='w', sep='\t',
                                       index=True, index_label='name')

                #convert roary output to fripan compatible
                os.system('python ../roary2fripan.py '+base+'_'+k)
                roary2fripan_strains_file = pd.read_table(base+'_'+k+
                                                          '.strains',
                                                          index_col=0,
                                                          header=0)
                #Append all metadata columns for this species' isolates
                strains_info_out = pd.concat(
                    [roary2fripan_strains_file, metadata_overall.loc[v, :]],
                    axis=1, sort=False)
                strains_info_out.to_csv(base+'_'+k+'.strains', mode='w',
                                        sep='\t', index=True,
                                        index_label='ID')
                print('Updated '+base+'_'+k+'.strains with all metadata.')
                os.system('cp '+base+'_'+k+'* ~/public_html/fripan')
                os.chdir(wd)
            else:
                print('Only one isolate in '+k+'. Need at least 2 isolates '+\
                      'to run roary.  Moving on...')

    #Keep the tempdirs created during the run
    if not ARGS.keep_tempdirs:
        shutil.rmtree(assembly_tempdir, ignore_errors=True)
        print('\nDeleted tempdir '+assembly_tempdir+'.')
    else:
        print('\nTempdir '+assembly_tempdir+' not deleted.')
    print('\nRun finished.')
# rosalind_ba7b
'''
Limb Length Problem
Find the limb length for a leaf in a tree.

Given: An integer n, followed by an integer j between 0 and n - 1, followed
by a space-separated additive distance matrix D (whose elements are integers).

Return: The limb length of the leaf in Tree(D) corresponding to row j of this
distance matrix (use 0-based indexing).
'''
import numpy as np
from Bio.Phylo.TreeConstruction import _DistanceMatrix
from Bio.Phylo.TreeConstruction import DistanceTreeConstructor

# The with-block closes the input file once the matrix has been read.
with open('rosalind_ba7b.txt') as f:
    n = int(f.readline().rstrip())
    j = int(f.readline().rstrip())
    D = np.fromfile(f, sep=' ', dtype=int).reshape(n, n)

# For the Phylo.TreeConstruction to work, integers must be Python int and not
# numpy.int64. Only the lower triangle (diagonal included) is passed on.
# `row`/`col` are used so the leaf index `j` is not visually shadowed.
dm = [[int(D[row, col]) for col in range(row + 1)] for row in range(n)]
names = [str(i) for i in range(n)]

constructor = DistanceTreeConstructor()
tree = constructor.nj(_DistanceMatrix(names, dm))
# Leaves are named by row index; the limb length is the branch length of
# the clade found for leaf j.
print(round(tree.find_any(str(j)).branch_length))
def main(argv):
    """Plot a neighbor-joining tree decorated with abundance bubbles.

    Parses the command line, reads the abundance vector ``x`` and the
    common-k-mer matrices, builds an NJ tree from the derived distances,
    inserts hypothetical ancestor nodes along the root-to-leaf paths, then
    renders the tree to an image and exports it as PhyloXML.

    Args:
        argv: command-line arguments without the program name. Recognised
            options: -h, -i <x file>, -D <data dir>, -l, -n, -r, -t,
            -o <image file>, -w <width>, -x <xml file>.

    Notes:
        ``-l`` / ``-n`` are parsed but the resulting flags are not used by
        this function. Written to run on both Python 2 and Python 3.
    """
    input_file = ''
    title = 'Title'
    label_internal_nodes = False
    label_leaves = False
    out_file = ''
    width = 750
    out_file_xml = ''
    plot_rectangular = False
    common_kmer_data_path = ''
    taxonomic_names_on_leaves = False
    usage = './PlotNJTree.py -i <InputCommonKmerXFile> -D <CommonKmerDataPath> -l <LabelLeavesFlag> -n <LabelInternalNodesFlag> -r <RectangularPlotFlag> -t <TaxonomicNamesOnLeavesFlag> -o <OutFile.png> -x <Outfile.xml> -w <Width>'
    try:
        # "h" takes no argument (it was "h:", which made a bare -h an error).
        opts, args = getopt.getopt(argv, "hi:lnrto:w:x:D:",
                                   ["Help=", "InputCommonKmerXFile=", "LabelLeaves=",
                                    "LabelInternalNodes=", "Rectangular=",
                                    "TaxonomicNamesOnLeaves=", "OutFile=", "Width=",
                                    "OutFileXML=", "CommonKmerDataPath="])
    except getopt.GetoptError:
        print('Unknown option, call using: ' + usage)
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print(usage)
            sys.exit(2)
        elif opt in ("-i", "--InputCommonKmerXFile"):
            input_file = arg
        elif opt in ("-l", "--LabelLeaves"):
            label_leaves = True
        elif opt in ("-n", "--LabelInternalNodes"):
            label_internal_nodes = True
        elif opt in ("-o", "--OutFile"):
            out_file = arg
        elif opt in ("-w", "--Width"):
            width = int(arg)
        elif opt in ("-x", "--OutFileXML"):
            out_file_xml = arg
        elif opt in ("-D", "--CommonKmerDataPath"):
            common_kmer_data_path = arg
        elif opt in ("-r", "--Rectangular"):
            plot_rectangular = True
        elif opt in ("-t", "--TaxonomicNamesOnLeaves"):
            taxonomic_names_on_leaves = True

    #Read in the x vector (one float per line). A real list is required
    #because x is indexed and sliced below.
    with open(input_file, 'r') as fid:
        x = [float(line) for line in fid]
    #Normalize the x vector
    #x = map(lambda y: y/sum(x),x)

    #Read in the taxonomy
    taxonomy = list()
    with open(os.path.join(common_kmer_data_path, "Taxonomy.txt"), 'r') as fid:
        for line in fid:
            #Just take the first field of the taxonomy (erasing the taxID)
            taxonomy.append('_'.join(line.split()[0].split("_")[1:]))

    #Read in the basis for the ckm matrices
    x_file_names = list()
    with open(os.path.join(common_kmer_data_path, "FileNames.txt"), 'r') as fid:
        for line in fid:
            x_file_names.append(os.path.basename(line.strip()))

    #Read in the common kmer matrices; np.array copies the data so the HDF5
    #files can be closed straight away.
    with h5py.File(os.path.join(common_kmer_data_path, 'CommonKmerMatrix-30mers.h5'), 'r') as f:
        ckm30 = np.array(f['common_kmers'], dtype=np.float64)
    with h5py.File(os.path.join(common_kmer_data_path, 'CommonKmerMatrix-50mers.h5'), 'r') as f:
        ckm50 = np.array(f['common_kmers'], dtype=np.float64)
    ckm30_norm = np.multiply(ckm30, 1/np.diag(ckm30))
    ckm50_norm = np.multiply(ckm50, 1/np.diag(ckm50))
    num_rows = ckm30_norm.shape[0]
    names = x_file_names
    #Lower-triangular distance matrix: average over the two k-mer sizes of
    #one minus the symmetrised normalised common-k-mer value.
    matrix = list()
    for i in range(num_rows):
        matrix.append([.5*(1-.5*ckm30_norm[i,j]-.5*ckm30_norm[j,i])+.5*(1-.5*ckm50_norm[i,j]-.5*ckm50_norm[j,i]) for j in range(i+1)])

    #Construct the tree. Note I could use RapidNJ here, but a few tests have shown that the trees that RapidNJ creates are rubbish.
    dm = _DistanceMatrix(names, matrix)
    constructor = DistanceTreeConstructor()
    tree = constructor.nj(dm)
    t = Tree(tree.format('newick'), format=1)
    #tree.format('newick')
    #Phylo.draw_ascii(tree)

    #Now I will put internal nodes in a certain phylogenetic distance between the root and a given node.
    #Function to insert a node at a given distance
    def insert_node(t, name_to_insert, insert_above, dist_along):
        """Split the branch above `insert_above`, adding a node `dist_along` below its parent."""
        insert_at_node = t.search_nodes(name=insert_above)[0]
        parent = (t&insert_above).up
        orig_branch_length = t.get_distance(insert_at_node, parent)
        if orig_branch_length < dist_along:
            raise ValueError("error: dist_along larger than orig_branch_length")
        removed_node = insert_at_node.detach()
        removed_node.dist = orig_branch_length - dist_along
        added_node = parent.add_child(name=name_to_insert, dist=dist_along)
        added_node.add_child(removed_node)

    #Function to insert a node some % along a branch
    def insert_hyp_node(t, leaf_name, percent):
        """Insert a node at fraction `percent` of the root-to-leaf distance of `leaf_name`."""
        total_dist = t.get_distance(t.name, leaf_name)
        percent_dist = percent*total_dist
        child_node = (t&leaf_name)
        ancestor_node = (t&child_node.name).up
        #Climb until the ancestor is no farther from the root than the target
        while t.get_distance(t.name, ancestor_node) > percent_dist:
            child_node = ancestor_node
            ancestor_node = (t&child_node.name).up
        insert_node(t, leaf_name+"_"+str(percent), child_node.name, percent_dist-t.get_distance(t.name, ancestor_node))

    #Insert hypothetical nodes
    hyp_node_names = dict()
    cutoffs = [.9,.8,.7,.6,.5,.4,.3,.2,.1]
    #A list (not a lazy map object) because len() and indexing are used below
    cutoffs = [y**1.5 for y in cutoffs]
    for i in range(len(x_file_names)):
        #Entries of x belonging to organism i: the leaf value first, then
        #one value per cutoff.
        xi = x[i:len(x):len(x_file_names)]
        for j in range(1, len(cutoffs)+1):
            if xi[j] > 0:
                insert_hyp_node(t, x_file_names[i], cutoffs[j-1])
                #Keep the parts separately in case there are "_" in the file names
                hyp_node_names[x_file_names[i]+"_"+str(cutoffs[j-1])] = [x_file_names[i], cutoffs[j-1], j-1]
                #insert_hyp_node(t, x_file_names[i],.5/t.get_distance(t.name,t&x_file_names[i])*cutoffs[j])

    #Now put the bubbles on the nodes
    def layout(node):
        """Attach an abundance bubble (and a name on leaves) to `node`."""
        #print(node)
        if node.is_leaf():
            if node.name in x_file_names:
                #make reconstructed bubble
                size = x[x_file_names.index(node.name)]
                F = CircleFace(radius=500*math.sqrt(size), color="RoyalBlue", style="sphere")
                F.border.width = None
                F.opacity = 0.6
                faces.add_face_to_node(F, node, 0, position="branch-right")
                if taxonomic_names_on_leaves:
                    nameFace = AttrFace("name", fsize=25, fgcolor='black', text_suffix="_"+taxonomy[x_file_names.index(node.name)])
                else:
                    nameFace = AttrFace("name", fsize=25, fgcolor='black')
                faces.add_face_to_node(nameFace, node, 0, position="branch-right")
        elif node.name in hyp_node_names:
            #Otherwise it's a hypothetical node, just use recon x
            node_base_name = hyp_node_names[node.name][0]
            if node_base_name in x_file_names:
                idx = hyp_node_names[node.name][2]
                size = x[x_file_names.index(node_base_name)+(idx+1)*len(x_file_names)]
                F = CircleFace(radius=500*math.sqrt(size), color="RoyalBlue", style="sphere")
                F.border.width = None
                F.opacity = 0.6
                faces.add_face_to_node(F, node, 0, position="branch-right")
                #print node
                #print size
        #print(size)

    ts = TreeStyle()
    ts.layout_fn = layout
    if plot_rectangular:
        ts.mode = "r"
    else:
        ts.mode = "c"
    ts.show_leaf_name = False
    ts.min_leaf_separation = 50

    #Export the tree to a png image
    t.render(out_file, w=width, units="mm", tree_style=ts)

    #Export the xml file
    project = Phyloxml()
    phylo = phyloxml.PhyloxmlTree(newick=t.write(format=0, features=[]))
    phylo.phyloxml_phylogeny.set_name(title)
    project.add_phylogeny(phylo)
    with open(out_file_xml, 'w') as xml_handle:
        project.export(xml_handle)