def get_tree(aln, kind='nj'):
    """Build a distance tree from an alignment using the 'identity' model.

    :param aln: alignment passed to ``DistanceCalculator.get_distance``.
    :param kind: tree-building algorithm, ``'nj'`` (default) or ``'upgma'``.
    :return: tuple ``(dm, tree)`` with the distance matrix and the tree.
    :raises ValueError: if ``kind`` is neither ``'nj'`` nor ``'upgma'``.
    """
    # Validate first so a bad ``kind`` fails before any work is done.
    if kind not in ('nj', 'upgma'):
        raise ValueError(
            "Unsupported tree kind %r; expected 'nj' or 'upgma'" % (kind,))
    from Bio.Phylo.TreeConstruction import DistanceCalculator, DistanceTreeConstructor

    calculator = DistanceCalculator('identity')
    dm = calculator.get_distance(aln)
    constructor = DistanceTreeConstructor()
    # Previously ``kind`` was ignored and neighbor joining always ran.
    tree = constructor.nj(dm) if kind == 'nj' else constructor.upgma(dm)
    return dm, tree
def consensus(msa):
    """Build a neighbor-joining tree (identity distances) from ``msa`` and print it.

    :param msa: records accepted by the ``MultipleSeqAlignment`` constructor.
    """
    alignment = MultipleSeqAlignment(msa)
    # build_tree() obtains the distance matrix through the calculator it was
    # given, so no separate get_distance() call is needed here.
    constructor = DistanceTreeConstructor(DistanceCalculator('identity'), 'nj')
    tree = constructor.build_tree(alignment)
    # Function-call form prints the same text on Python 2 and Python 3.
    print(tree)
def buildTree(FASTAFile):
    """Build a midpoint-rooted NJ tree from a FASTA alignment and return it in RLR form.

    The tree is drawn with ``Phylo.draw``, serialised to plain Newick text,
    turned into nested tuples with ``literal_eval`` and finally converted by
    ``NewicktoRLR``.

    :param FASTAFile: path or handle of the FASTA alignment.
    :return: whatever ``NewicktoRLR`` returns for the parsed tree.
    """
    # cStringIO only exists on Python 2; fall back to io on Python 3.
    try:
        from cStringIO import StringIO
    except ImportError:
        from io import StringIO

    alignment = AlignIO.read(FASTAFile, "fasta")

    # Identity distances -> neighbor-joining tree, rooted at its midpoint.
    matrix = DistanceCalculator("identity").get_distance(alignment)
    njTree = DistanceTreeConstructor().nj(matrix)
    njTree.root_at_midpoint()
    Phylo.draw(njTree)

    # Quote tip names so the Newick text becomes a valid Python literal.
    for clade in njTree.get_terminals():
        clade.name = "\"" + clade.name + "\""

    buf = StringIO()
    Phylo.write(njTree, buf, 'newick', plain = True)
    tree = buf.getvalue()
    # Strip the generated inner-node labels and the terminating semicolon.
    tree = re.sub(r'Inner\d*', '', tree)
    tree = tree.replace(";", "")
    tree = literal_eval(tree)  # nested tuples mirroring the Newick structure

    # RLR tree required for maxParsimony function
    return NewicktoRLR(tree)
def dna(file_path, file_format, algorithm):
    """Read an alignment, print its identity distance matrix and draw a tree.

    :param file_path: alignment file passed to ``AlignIO.read``.
    :param file_format: alignment format name passed to ``AlignIO.read``.
    :param algorithm: ``'upgma'`` or ``'nj'`` (case-insensitive).
    """
    # Read the sequences and align
    aln = AlignIO.read(file_path, file_format)
    print(aln)

    # Calculate the distance matrix
    calculator = DistanceCalculator('identity')
    dm = calculator.get_distance(aln)

    print('\nDistance Matrix\n===================')
    # Print the matrix itself (the original printed the calculator object).
    print(dm)

    # Construct the phylogenetic tree using the chosen algorithm
    constructor = DistanceTreeConstructor()
    builders = {'upgma': constructor.upgma, 'nj': constructor.nj}
    build = builders.get(algorithm.lower())
    if build is None:
        click.echo('Invalid algorithm!')
        # Stop here: there is no tree to draw for an unknown algorithm.
        return
    tree = build(dm)

    # Draw the phylogenetic tree
    Phylo.draw(tree)

    # Print the phylogenetic tree in the terminal
    print('\nPhylogenetic Tree\n===================')
    Phylo.draw_ascii(tree)
def main_new(fastafile, bkp):
    """Return the name of the sequence left out of the most stable pair.

    The alignment is split at column ``bkp``; blosum62 distances are computed
    for both segments, and the pair (ab, ac or bc) whose distance changes
    least across the split is found. The name of the remaining sequence is
    returned.

    NOTE(review): the indexing assumes the file holds three sequences -- confirm.

    :param fastafile: path of the FASTA alignment.
    :param bkp: column index at which the alignment is split.
    :return: record id from the FASTA file.
    """
    seq_name = list(SeqIO.to_dict(SeqIO.parse(fastafile, "fasta")))
    # Pass the path so AlignIO opens and closes the file itself.
    aln = AlignIO.read(fastafile, 'fasta')
    calculator = DistanceCalculator('blosum62')
    segment_1 = calculator.get_distance(aln[:, :bkp])
    segment_2 = calculator.get_distance(aln[:, bkp:])

    def pair_distances(dm):
        # Distances in the order ab, ac, bc.
        return [dm[seq_name[1]][0], dm[seq_name[2]][0], dm[seq_name[2]][1]]

    compare_distance = [
        abs(left - right)
        for left, right in zip(pair_distances(segment_1), pair_distances(segment_2))
    ]
    stable_pair = compare_distance.index(min(compare_distance))
    # Pair index 0 (ab) leaves c, 1 (ac) leaves b, 2 (bc) leaves a.
    return seq_name[2 - stable_pair]
def fastaToNJTree(fastaFile, outputFile):
    """Build an NJ tree (identity distances) from a FASTA alignment and write it as Newick.

    :param fastaFile: FASTA alignment passed to ``AlignIO.read``.
    :param outputFile: destination passed to ``Phylo.write``.
    """
    aln = AlignIO.read(fastaFile, 'fasta')
    # build_tree() computes the distances through the calculator it holds,
    # so the matrix is not computed separately.
    constructor = DistanceTreeConstructor(DistanceCalculator('identity'), 'nj')
    tree = constructor.build_tree(aln)
    Phylo.write(tree, outputFile, 'newick')
def construct_id_dm(seq_df, seq_fpath, align_outpath="tmp/iddm_align.fasta", ordered=False, aligned=False,
                    kalign_silent=True):
    """Return the identity distance matrix of the records in seq_df as an np.ndarray.

    :param seq_df: DataFrame of sequence records; only records in seq_df.index are used
    :param seq_fpath: Path of a fasta file holding at least the records in seq_df. It is
        filtered to a temporary fasta (tmp/alias_matches.fasta) via filter_fasta_infile
    :param align_outpath: File the kalign output is written to (default tmp/iddm_align.fasta)
    :param ordered: boolean, forwarded to filter_fasta_infile as its ``ordered`` argument
    :param aligned: boolean. True: kalign is skipped and the filtered fasta is read as the
        alignment; False: the filtered fasta is piped through kalign into align_outpath
    :param kalign_silent: boolean. True: kalign's stderr is captured through a pipe;
        False: stderr is left untouched
    :return: id_dm: np.ndarray built from the rows of the identity distance matrix
    :return: align_srs: result of fasta_to_srs on the alignment file
    """
    from Bio.Phylo.TreeConstruction import DistanceCalculator
    from Bio import AlignIO

    # Keep only the records of seq_df.index in a scratch fasta file.
    filtered_fpath = "tmp/alias_matches.fasta"
    filter_fasta_infile(seq_df.index, seq_fpath, outfile_path=filtered_fpath, ordered=ordered)

    if aligned:
        # Input is already aligned: read the filtered file directly.
        align_outpath = filtered_fpath
    else:
        run_options = {"text": True}
        if kalign_silent:
            run_options["stderr"] = subprocess.PIPE
        # kalign reads the filtered fasta on stdin and writes the alignment to stdout.
        with open(filtered_fpath, 'r') as filtered_f, open(align_outpath, 'wt', encoding='utf-8') as align_f:
            subprocess.run(args=['kalign'], stdin=filtered_f, stdout=align_f, **run_options)

    align_srs = fasta_to_srs(align_outpath)
    with open(align_outpath) as aligned_f:
        aln = AlignIO.read(aligned_f, 'fasta')
    id_dm_obj = DistanceCalculator('identity').get_distance(aln)

    # Stack the matrix rows; a single row stays one-dimensional.
    rows = [row for row in id_dm_obj]
    id_dm = np.array(rows[0]) if len(rows) == 1 else np.vstack(rows)
    return id_dm, align_srs
def printGeneTree(self):
    """
    Print the distance matrix and a UPGMA gene tree for the alignment in self.newPhylip.

    The PHYLIP alignment is read, an identity distance matrix is computed and
    printed, and the UPGMA tree built from it is shown with matplotlib and as
    ASCII art in the terminal.

    input: self.newPhylip, a .phy file holding the multiple sequence alignment
    output: distance matrix and tree printed to the terminal, plus a matplotlib figure
    """
    alignment = AlignIO.read(self.newPhylip, 'phylip')
    distances = DistanceCalculator('identity').get_distance(alignment)

    print(
        '\n======================================== DISTANCE MATRIX =======================================\n'
    )
    print(distances, "\n\n")

    # UPGMA tree from the identity distances
    gene_tree = DistanceTreeConstructor().upgma(distances)

    print(
        '\n========================================= GENE TREE ===========================================\n'
    )
    Phylo.draw(gene_tree)  # graphical rendering (needs matplotlib)
    Phylo.draw_ascii(gene_tree)  # text rendering in the terminal
def plot_alignment_heatmap(alignments, trans_dict=None, title="Percent difference"):
    """Plot the identity distance matrix of an alignment as an annotated heatmap.

    :param alignments: alignment passed to ``DistanceCalculator.get_distance``
    :param trans_dict: optional mapping from record id to display label; when
        omitted, words 2-3 of each record description are used
    :param title: title of the figure
    :return: the current matplotlib figure
    """
    # calculate distance - https://biopython.org/wiki/Phylo
    dm = DistanceCalculator('identity').get_distance(alignments)

    if trans_dict is None:
        # derive human understandable labels from the record descriptions
        trans_dict = {}
        for record in alignments:
            trans_dict[record.id] = " ".join(record.description.split()[1:3])

    # dataframe of the distance matrix for easier plotting
    row_labels = [trans_dict[name] for name in dm.names]
    column_labels = list(row_labels)
    df = pd.DataFrame(dm.matrix, index=row_labels, columns=column_labels)

    plt.figure()
    heatmap_options = dict(
        fmt='3.2f',
        annot=True,
        linewidths=0.5,
        cmap=sns.light_palette("navy"),
        cbar=False,
        square=True,
    )
    sns.heatmap(df * 100, **heatmap_options)
    plt.title(title)
    plt.tight_layout()
    return plt.gcf()
def plot_phylo_tree(align: MultipleSeqAlignment, accession_numbers: dict):
    """
    Plots a phylogenetic tree

    :param align: MultipleSeqAlignment with the alignment result to be plotted
    :param accession_numbers: dict of accession numbers and their translation to human-understandable names
    :return: figure-handle of the plotted phylogenetic tree
    :raises KeyError: if a (version-stripped) terminal name is missing from accession_numbers
    """
    # calculate distance - https://biopython.org/wiki/Phylo
    dm = DistanceCalculator('identity').get_distance(align)

    # construct a tree
    tree = DistanceTreeConstructor().upgma(dm)

    # remove the names for the non-terminals for better visual appeal
    for non_terminal in tree.get_nonterminals():
        non_terminal.name = ''

    # change accession numbers into more human-understandable names
    for terminal in tree.get_terminals():
        # Raw string avoids the invalid-escape warning for "\S" and "\.".
        match = re.match(r"(^\S*)(?=\.)", terminal.name)
        # Names without a ".version" suffix are looked up unchanged instead
        # of failing on a None match.
        key = match[0] if match else terminal.name
        terminal.name = accession_numbers[key]

    # draw_ascii() prints by itself and returns None, so it is not wrapped
    # in print() (which only added a stray "None" line).
    Phylo.draw_ascii(tree)

    # plot the tree
    fig, ax = plt.subplots(1, 1)
    Phylo.draw(tree, show_confidence=False, axes=ax, do_show=False)
    ax.set_xlim(right=0.8)
    return fig
def main():
    """Time neighbor joining and UPGMA on data/coding.fa and draw both trees."""
    file_name = "data/coding.fa"
    # alternative input: "data/cons_noncode.fa"

    # Collect the FASTA records into one gapped DNA alignment.
    alignment = MultipleSeqAlignment([], Gapped(IUPAC.unambiguous_dna, "-"))
    for seq_record in SeqIO.parse(file_name, "fasta"):
        alignment.extend([seq_record])
    print("Number of characters in alignment:", len(alignment[0]))

    dm = DistanceCalculator('identity').get_distance(alignment)
    constructor = DistanceTreeConstructor()

    # --- Neighbor joining ---
    nj_started = time.time()
    tree = constructor.nj(dm)
    nj_elapsed = time.time() - nj_started
    print("Neighbor joining ran in {} seconds.".format(nj_elapsed))
    Phylo.draw(tree, label_func=get_label)

    # --- UPGMA ---
    upgma_started = time.time()
    tree = constructor.upgma(dm)
    upgma_elapsed = time.time() - upgma_started
    print("UPGMA ran in {} seconds.".format(upgma_elapsed))
    Phylo.draw(tree, label_func=get_label)
def nj_tree_constructor(x):
    """Print the neighbor-joining tree (identity distances) of alignment ``x``.

    The tree object is printed first, followed by its ASCII drawing.
    """
    builder = DistanceTreeConstructor()
    distances = DistanceCalculator("identity").get_distance(x)
    nj_result = builder.nj(distances)
    print(nj_result)
    Phylo.draw_ascii(nj_result)
def tree(from_cluster,to_cluster, grupa):
    """Write majority-consensus trees for clusters ``from_cluster``..``to_cluster - 1``.

    For every cluster, 50 bootstrap NJ trees are built from its alignment.
    Trees having any internal node with support below 50 are dropped and a
    majority consensus of the remaining trees is collected. All consensus
    trees are written to ``drzewa_wynikowe_<grupa>`` in Newick format.

    :param from_cluster: first cluster number (inclusive)
    :param to_cluster: last cluster number (exclusive)
    :param grupa: suffix of the output file name
    """
    consensus_trees = []
    for i in range(from_cluster, to_cluster):
        # Raw string: same characters as before, without relying on the
        # invalid escape sequence "\m".
        msa = AlignIO.read(r'msa\msa_rodzina_' + str(i) + '_s.fasta', 'fasta')
        print(i)
        constructor = DistanceTreeConstructor(DistanceCalculator('identity'), 'nj')
        try:
            trees_list = list(bootstrap_trees(msa, 50, constructor))
            # Keep only trees whose internal nodes all reach 50% support.
            supported = [
                candidate for candidate in trees_list
                if all(node.confidence >= 50
                       for node in get_support(candidate, trees_list).get_nonterminals())
            ]
            if supported:
                consensus_trees.append(majority_consensus(supported))
        except ValueError:
            # The original wrote "except: ValueError", i.e. a bare except
            # that swallowed every error; only ValueError is skipped now.
            continue
    Phylo.write(consensus_trees,"drzewa_wynikowe_" + str(grupa),"newick")
def build_phylogeny_trees():
    """Align every file in out/homologous_gene_sequences/ and build a UPGMA tree for it.

    Each file is aligned with Clustal Omega (run through ``os.system``), the
    identity distance matrix of the alignment is turned into a UPGMA tree,
    and the tree is drawn, printed as ASCII and written in Nexus format to
    out/phylogenetic_trees/.
    """
    source_dir = "out/homologous_gene_sequences/"
    aligned_dir = "out/aligned_homologous_gene_sequences/"

    for gene_file in os.listdir(source_dir):
        unaligned_path = source_dir + gene_file
        aligned_path = aligned_dir + gene_file

        command = ClustalOmegaCommandline(
            infile=unaligned_path, outfile=aligned_path, verbose=True, auto=True)
        os.system(str(command))

        alignment = AlignIO.read(aligned_path, 'fasta')

        # Distance matrix -> UPGMA tree
        distances = DistanceCalculator('identity').get_distance(alignment)
        upgma_tree = DistanceTreeConstructor().upgma(distances)

        Phylo.draw(upgma_tree)
        print('\nPhylogenetic Tree\n', gene_file)
        Phylo.draw_ascii(upgma_tree)

        nexus_path = 'out/phylogenetic_trees/{}_tree.nex'.format(gene_file)
        Phylo.write([upgma_tree], nexus_path, 'nexus')
def build_trees(filename, tree_name):
    """Align ``<filename>.fa`` with ClustalW and show its UPGMA and NJ trees.

    :param filename: path without extension; ``.fa`` is read and the ``.aln``
        file next to it is loaded as the alignment
    :param tree_name: label used in the two figure titles
    """
    # Run ClustalW on the input sequences
    run_clustalw = ClustalwCommandline("clustalw", infile="{}.fa".format(filename))
    run_clustalw()
    alignment = AlignIO.read("{}.aln".format(filename), format="clustal")

    # blosum62 distance matrix
    dist_matrix = DistanceCalculator('blosum62').get_distance(alignment)

    # One tree per method
    builder = DistanceTreeConstructor()
    upgma_tree = builder.upgma(dist_matrix)
    nj_tree = builder.nj(dist_matrix)

    def label_func(clade):
        # Hide the generated "Inner..." labels, keep everything else.
        if clade.name.startswith("Inner"):
            return ""
        return clade

    # Draw the trees
    Phylo.draw(upgma_tree, label_func=label_func, do_show=False)
    plt.title("{} × upgma".format(tree_name))
    plt.show()

    Phylo.draw(nj_tree, label_func=label_func, do_show=False)
    plt.title("{} × nj".format(tree_name))
    plt.show()
def upgma_tree_constructor(x):
    """Print the UPGMA tree (identity distances) of alignment ``x``.

    The tree object is printed first, followed by its ASCII drawing.
    """
    builder = DistanceTreeConstructor()
    distances = DistanceCalculator("identity").get_distance(x)
    upgma_result = builder.upgma(distances)
    print(upgma_result)
    Phylo.draw_ascii(upgma_result)
def tree_reconstruction(phy_file, method, model, phyformat):
    '''Construct tree with given method and model

    The tree is ladderized, written to ``<args.output>/tree.nwk`` and drawn
    to ``<args.output>/tree.svg`` (``args`` is read from the enclosing scope).

    :param phy_file: PHYLIP alignment file
    :param method: ``'upgma'`` or ``'nj'``
    :param model: model name passed to ``DistanceCalculator``
    :param phyformat: suffix appended to ``'phylip-'`` to form the format name
    :raises ValueError: if ``method`` is not ``'upgma'`` or ``'nj'``
    '''
    # Reject unknown methods up front; previously this surfaced later as an
    # UnboundLocalError on ``tree``.
    if method not in ('upgma', 'nj'):
        raise ValueError(
            "Unsupported method %r; expected 'upgma' or 'nj'" % (method,))

    aln = AlignIO.read(phy_file, 'phylip-' + phyformat)
    constructor = DistanceTreeConstructor()
    dm = DistanceCalculator(model).get_distance(aln)
    tree = constructor.upgma(dm) if method == 'upgma' else constructor.nj(dm)

    tree.ladderize()
    # Blank the generated inner-node labels; skip unnamed clades.
    for c in tree.find_clades():
        if c.name and 'Inner' in c.name:
            c.name = ''
    Phylo.write(tree, args.output + '/tree.nwk', 'newick')

    plt.rcParams['font.style'] = 'italic'
    plt.rc('font', size=8)
    plt.rc('axes', titlesize=14)
    plt.rc('xtick', labelsize=10)
    plt.rc('ytick', labelsize=10)
    plt.rc('figure', titlesize=18)
    draw(tree, do_show=False)
    plt.savefig(args.output + "/tree.svg", format='svg', dpi=1200)
def main():
    """Draw the ladderized UPGMA tree (identity distances) of protein.fasta."""
    # Pass the path so AlignIO opens and closes the file itself (the
    # original left an open() handle unclosed).
    alignment = AlignIO.read("protein.fasta", "fasta")
    # build_tree() computes the distances through its calculator, so no
    # separate get_distance() call is needed.
    constructor = DistanceTreeConstructor(DistanceCalculator('identity'), 'upgma')
    tree = constructor.build_tree(alignment)
    tree.ladderize()
    Phylo.draw(tree)
def get_tree():
    """Return the neighbor-joining tree built from the Clustal alignment agc.aln.

    Distances come from the 'identity' model.
    """
    alignment = AlignIO.read('agc.aln', 'clustal')
    distances = DistanceCalculator('identity').get_distance(alignment)
    return DistanceTreeConstructor().nj(distances)
def build_tree(aln, kind='nj'):
    """Build a tree with bio.phylo module

    :param aln: alignment passed to ``DistanceCalculator.get_distance``.
    :param kind: tree-building algorithm, ``'nj'`` (default) or ``'upgma'``.
    :return: tuple ``(dm, tree)`` with the identity distance matrix and the tree.
    :raises ValueError: if ``kind`` is neither ``'nj'`` nor ``'upgma'``.
    """
    # Validate first so a bad ``kind`` fails before any work is done.
    if kind not in ('nj', 'upgma'):
        raise ValueError(
            "Unsupported tree kind %r; expected 'nj' or 'upgma'" % (kind,))
    from Bio.Phylo.TreeConstruction import DistanceCalculator, DistanceTreeConstructor

    calculator = DistanceCalculator('identity')
    dm = calculator.get_distance(aln)
    constructor = DistanceTreeConstructor()
    # Previously ``kind`` was ignored and neighbor joining always ran.
    tree = constructor.nj(dm) if kind == 'nj' else constructor.upgma(dm)
    return dm, tree
def test_known_matrices(self):
    """Check the Alpha/Beta distance of msa.phy under four scoring models."""
    aln = AlignIO.read('TreeConstruction/msa.phy', 'phylip')
    # (model name, expected Alpha-Beta distance), checked in this order
    expectations = (
        ('identity', 1 - (10 * 1.0 / 13)),
        ('blastn', 1 - (38 * 1.0 / 65)),
        ('trans', 1 - (49 * 1.0 / 78)),
        ('blosum62', 1 - (53 * 1.0 / 84)),
    )
    for model, expected in expectations:
        dm = DistanceCalculator(model).get_distance(aln)
        self.assertEqual(dm['Alpha', 'Beta'], expected)
def test_known_matrices(self):
    """Check the Alpha/Beta distance of msa.phy under four scoring models."""
    aln = AlignIO.read("TreeConstruction/msa.phy", "phylip")
    # model name -> expected Alpha-Beta distance; checked in insertion order
    model_names = ["identity", "blastn", "trans", "blosum62"]
    expected_values = [
        1 - (10 * 1.0 / 13),
        1 - (38 * 1.0 / 65),
        1 - (49 * 1.0 / 78),
        1 - (53 * 1.0 / 84),
    ]
    for model, expected in zip(model_names, expected_values):
        calculator = DistanceCalculator(model)
        dm = calculator.get_distance(aln)
        self.assertEqual(dm["Alpha", "Beta"], expected)
def test_distance_calculator(self):
    """Check the Alpha/Beta distance of msa.phy under four scoring models."""
    # Read inside a with-block so the file handle is closed (the original
    # passed an open() result that was never closed).
    with open('TreeConstruction/msa.phy') as handle:
        aln = AlignIO.read(handle, 'phylip')

    calculator = DistanceCalculator('identity')
    dm = calculator.get_distance(aln)
    self.assertEqual(dm['Alpha', 'Beta'], 1 - (10 * 1.0 / 13))

    calculator = DistanceCalculator('blastn')
    dm = calculator.get_distance(aln)
    self.assertEqual(dm['Alpha', 'Beta'], 1 - (38 * 1.0 / 65))

    calculator = DistanceCalculator('trans')
    dm = calculator.get_distance(aln)
    self.assertEqual(dm['Alpha', 'Beta'], 1 - (49 * 1.0 / 78))

    calculator = DistanceCalculator('blosum62')
    dm = calculator.get_distance(aln)
    self.assertEqual(dm['Alpha', 'Beta'], 1 - (53 * 1.0 / 84))
def createTree(file):
    """Write the UPGMA tree of a PHYLIP alignment to new.xml in phyloXML format.

    :param file: PHYLIP alignment passed to ``AlignIO.read``
    """
    alignment = AlignIO.read(file, 'phylip')
    # identity distances -> UPGMA tree
    distances = DistanceCalculator('identity').get_distance(alignment)
    upgma_tree = DistanceTreeConstructor().upgma(distances)
    Phylo.write(upgma_tree, 'new.xml', 'phyloxml')
def __init__(self, input_filename, searcher):
    """Load a Clustal alignment and precompute its identity distance matrix.

    :param input_filename: Clustal alignment file
    :param searcher: stored unchanged on ``self.searcher``
    """
    # Aligned sequences from the Clustal file
    self.aligned_file = AlignIO.read(input_filename, format='clustal')
    self.searcher = searcher

    # Identity distance matrix of the alignment
    identity_calculator = DistanceCalculator('identity')
    self.distance_matrix = identity_calculator.get_distance(self.aligned_file)

    # Global matplotlib font size
    matplotlib.rc('font', size=6)
def D_seq_matrix(fasta_file):
    """Return the identity distance matrix of a FASTA alignment.

    As a side effect the UPGMA tree of that matrix is written to
    ``ph_seq.nre`` (Newick) and the matrix names are printed.

    :param fasta_file: FASTA alignment passed to ``AlignIO.read``
    :return: the distance matrix
    """
    aln = AlignIO.read(fasta_file, 'fasta')
    dm = DistanceCalculator('identity').get_distance(aln)
    tree_seq = DistanceTreeConstructor().upgma(dm)
    Phylo.write(tree_seq,'ph_seq.nre','newick')
    # Function-call form prints the same text on Python 2 and Python 3.
    print(dm.names)
    return dm
def cluster_by_cdr3(results_table, output_dir):
    """Cluster cells by CDR3 sequence identity and write the annotated table.

    Rows of ``results_table`` with a string "CDR3 first" value are written to
    ``<output_dir>/cdr3.fasta``, aligned through ``align_func.clustalw_align``
    and clustered (average linkage, distance cut-off 0.05) on the identity
    distance matrix. Outputs in ``output_dir``: ``dm.pkl``, ``distmat.pkl``
    and ``full_results.csv``.

    :param results_table: CSV file read with pandas
    :param output_dir: directory receiving all output files
    """
    df1 = pd.read_csv(results_table)

    fasta_file = output_dir + "/cdr3.fasta"
    with open(fasta_file, 'w') as fas:
        for _, row in df1.iterrows():
            cdr3 = row["CDR3 first"]
            # Missing values are not strings; skip those rows.
            if not isinstance(cdr3, str):
                continue
            fas.write(">" + "_".join(row["Patient"].split(" ")) + ":" +
                      row["well_id"] + ":" + row["V first"] + "\n")
            fas.write(cdr3 + "\n")

    alignment_file = align_func.clustalw_align(fasta_file, sys.stdout)
    aln = AlignIO.read(alignment_file, 'clustal')
    dm = DistanceCalculator('identity').get_distance(aln)
    with open(output_dir + '/dm.pkl', 'wb') as f:
        pickle.dump(dm, f, protocol=0)

    # Condensed distance vector: one entry per pair (i < j), read from the
    # lower-triangular matrix at [j][i].
    pairs = combinations(range(len(dm.names)), 2)
    distmat = np.array([dm.matrix[j][i] for i, j in pairs], dtype=float)
    with open(output_dir + '/distmat.pkl', 'wb') as f:
        pickle.dump(distmat, f, protocol=0)

    Z = linkage(distmat, method='average')
    max_d = 0.05
    clusters = fcluster(Z, max_d, criterion='distance')

    # Recover patient and well id from the alignment record names.
    patient_col = [name.split("_W")[0] for name in dm.names]
    well_col = ['W' + name.split("_W")[1].split("_")[0] for name in dm.names]
    df2 = pd.DataFrame(data={
        "cluster": clusters,
        "Patient": patient_col,
        "well_id": well_col
    })

    table = pd.merge(df2, df1, on=["Patient", "well_id"], how="inner")
    table = table[[
        'cluster', 'Patient', 'Amp Batch', 'well_id', 'cell_name', '#reads',
        '#umi distribution', "V first", "V first counts", "V second",
        "V second counts", "D first", "D first counts", "D second",
        "D second counts", "J first", "J first counts", "J second",
        "J second counts", "CDR3 first", "CDR3 first counts", "CDR3 second",
        "CDR3 second counts"
    ]]
    table.to_csv(output_dir + '/full_results.csv')
def phyloxml_from_msa(msa, phyloxml):
    """Write the UPGMA tree of a FASTA alignment as phyloXML.

    :param msa: FASTA alignment passed to ``AlignIO.read``
    :param phyloxml: destination passed to ``Phylo.write``
    """
    from Bio import AlignIO
    from Bio import Phylo
    from Bio.Phylo.TreeConstruction import DistanceCalculator
    from Bio.Phylo.TreeConstruction import DistanceTreeConstructor

    alignment = AlignIO.read(msa, "fasta")
    # NOTE(review): "ident" is not the same model name as "identity" -- confirm
    # it is accepted by the installed Biopython version.
    distances = DistanceCalculator("ident").get_distance(alignment)
    upgma_tree = DistanceTreeConstructor().upgma(distances)
    Phylo.write(upgma_tree, phyloxml, "phyloxml")
def build_tree_NJ(msa, distanceMatrix=None):
    """Return the neighbor-joining tree as a Newick string prefixed with "[&R] ".

    :param msa: alignment used to compute identity distances when no matrix is given
    :param distanceMatrix: optional precomputed distance matrix
    :return: ``"[&R] "`` followed by the stripped Newick text of the tree
    """
    # Test for None explicitly: a truthiness test would also discard any
    # matrix object that happens to evaluate as false.
    if distanceMatrix is None:
        distanceMatrix = DistanceCalculator("identity").get_distance(msa)

    # Construct the tree with the distance matrix
    tree = DistanceTreeConstructor().nj(distanceMatrix)

    # The tree is not re-rooted here (root_at_midpoint is left disabled).
    return "[&R] " + tree.format("newick").strip()
def MSAOBJ(Align):
    """Build a MultipleSeqAlignment from ``Align`` and return it with its identity distances.

    For every label in ``Align.index`` the first row carrying that label is
    selected and its first value becomes the sequence of a record whose id is
    the label.

    :return: tuple ``(align, dm)``
    """
    calculator = DistanceCalculator('identity')
    records = []
    for label in Align.index:
        # Boolean mask of the rows carrying this label
        matches = [candidate == label for candidate in Align.index]
        first_row = Align[matches].iloc[0]
        sequence = Seq(list(first_row)[0])
        records.append(SeqRecord(sequence, id=label))
    align = MultipleSeqAlignment(records)
    dm = calculator.get_distance(align)
    return (align, dm)
def generar_arbol(file, indice):
    """Save the neighbor-joining tree of a Clustal alignment as a PNG image.

    The image is written to ``./static/assets/arbol_filogenetico<indice>.png``.

    :param file: path of the Clustal alignment
    :param indice: string appended to the image file name
    """
    with open(file, "r") as aln:
        alineamiento = AlignIO.read(aln, "clustal")

    calculator = DistanceCalculator('identity')
    dm = calculator.get_distance(alineamiento)
    constructor = DistanceTreeConstructor(calculator)
    nj = constructor.nj(dm)  # Neighbor Joining

    # do_show=False keeps the figure open so savefig() below captures it,
    # instead of displaying it first.
    Phylo.draw(nj, do_show=False)
    path = './static/assets/arbol_filogenetico' + indice + '.png'
    pylab.savefig(path, format='png')
def calculate_distance_matrix(self, type, file):
    """Return the distance matrix of a FASTA alignment.

    :param type: ``'DNA'`` selects the 'blastn' model; anything else selects 'blosum62'
    :param file: FASTA alignment passed to ``AlignIO.read``
    """
    model_name = 'blastn' if type == 'DNA' else 'blosum62'
    calculator = DistanceCalculator(model_name)
    alignment = AlignIO.read(file, "fasta")
    return calculator.get_distance(alignment)
def blosumnj(filename):
    """Print the alignment, its blosum62 distance matrix and the NJ tree built from it.

    :param filename: path of the FASTA alignment
    """
    from Bio.Phylo.TreeConstruction import DistanceTreeConstructor

    # The with-block closes the handle the original left open.
    with open(filename) as handle:
        aln = AlignIO.read(handle, 'fasta')
    print(aln)

    calculator = DistanceCalculator('blosum62')
    dm = calculator.get_distance(aln)
    print(dm)

    constructor = DistanceTreeConstructor(calculator, 'nj')
    tree = constructor.build_tree(aln)
    print(tree)
def distance(inFile, model='identity'):
    """
    Given an alignment file (in fasta format), return its distance matrix.

    Modules required:
    - AlignIO (from Bio)
    - DistanceCalculator (from Bio.Phylo.TreeConstruction)

    Usage: <inFile> <model (default = 'identity')>
    """
    alignment = AlignIO.read(inFile, 'fasta')
    # the model selects how the pairwise distance is scored
    return DistanceCalculator(model).get_distance(alignment)
def __get_dm(self):
    """Return the identity distances of self.aln as a full (self.ns x self.ns) array."""
    from Bio.Phylo.TreeConstruction import DistanceCalculator
    import numpy as np

    dm = DistanceCalculator('identity').get_distance(self.aln)
    size = self.ns
    dm_array = np.zeros(shape=(size, size))
    for row in range(size):
        for cln in range(size):
            # Always index with the larger position first.
            first, second = (cln, row) if cln > row else (row, cln)
            dm_array[row, cln] = dm[first][second]
    return dm_array
def get_dist_matrix (file):
    """Return the pairwise blosum62 distances of ``tmp/<file>`` as a flat list.

    Only pairs with i < j are collected, i.e. n = (N^2 - N) / 2 values for N
    sequences. 'New Row!' and every collected value are printed on the way.

    :param file: name of a Clustal alignment inside ``tmp/``
    :return: list of distances
    """
    # Pass the path so AlignIO opens and closes the file itself (the
    # original left an open() handle unclosed).
    aln = AlignIO.read('tmp/' + file, 'clustal')
    dist_matrix = DistanceCalculator('blosum62').get_distance(aln)

    da_list = []
    for i, row in enumerate(dist_matrix):
        print ('New Row!')
        # Entries with j <= i (the diagonal and mirrored half) are skipped.
        for j in range(i + 1, len(row)):
            print (dist_matrix[i,j])
            da_list.append(dist_matrix[i,j])
    return da_list
def test_nj(self):
    """NJ trees for the full matrix and for a 2-entry matrix match the reference files."""
    full_tree = self.constructor.nj(self.dm)
    self.assertTrue(isinstance(full_tree, BaseTree.Tree))
    expected_full = Phylo.read('./TreeConstruction/nj.tre', 'newick')
    self.assertTrue(Consensus._equal_topology(full_tree, expected_full))

    # create a matrix of length 2 by dropping trailing entries
    self.min_dm = DistanceCalculator('blosum62').get_distance(self.aln)
    while len(self.min_dm) > 2:
        del self.min_dm[len(self.min_dm) - 1]

    small_tree = self.constructor.nj(self.min_dm)
    self.assertTrue(isinstance(small_tree, BaseTree.Tree))
    expected_small = Phylo.read('./TreeConstruction/nj_min.tre', 'newick')
    self.assertTrue(Consensus._equal_topology(small_tree, expected_small))
# NNIheuristic(FASTAFile, sampleSize, threshold, outputDir)
#
# Purpose: starting from a UPGMA tree built on identity distances of the FASTA
# alignment, repeatedly sample nearest-neighbour-interchange (NNI) neighbours
# and move to a better tree, logging every step. Always returns None.
#
# Parameters (as used by the visible code):
#   FASTAFile  - path of the alignment; ".align" is replaced by ".out" to form
#                the log file name, and any leading directory part is cut off.
#   sampleSize - number of NNI neighbours sampled per iteration; lowered to
#                len(NNIs) - 1 when fewer neighbours exist.
#   threshold  - number of consecutive iterations without a feasible
#                neighbour after which the search gives up.
#   outputDir  - directory in which the log file is opened for writing.
#
# Flow:
#   1. Seed `random` with 0, open the log, record file name, start time,
#      sample size and threshold.
#   2. Read the alignment; tipMapping maps record id -> sequence string.
#   3. Build the UPGMA tree, quote the tip names, write it as plain Newick to a
#      buffer, strip "Inner<n>" labels and ";", literal_eval the text and
#      convert it with NewicktoRLR; score it with maxParsimony.
#   4. Loop: sample neighbours from allNNIs(tree), split them into feasible /
#      infeasible via isFeasible(graph, leaves).
#      - With feasible neighbours: adopt the best-scored one if the current
#        tree is infeasible or the neighbour scores lower; otherwise log the
#        result and stop.
#      - Without feasible neighbours: stop if the current tree is feasible;
#        otherwise count towards `threshold`, then continue from the first
#        infeasible neighbour scoring above the current score, or from the
#        highest-scored one if none does.
#
# NOTE(review): time.clock() was removed in Python 3.8 and cStringIO exists
#   only on Python 2 -- confirm the target interpreter.
# NOTE(review): the sampling loop uses `tree` as its loop variable, so the
#   current tree is rebound to the last sampled neighbour before it is logged.
# NOTE(review): the feasibility if/else appears a second time right after the
#   counts are logged; it looks like a leftover -- confirm against history.
# NOTE(review): no close() call on `output` is visible on any return path.
def NNIheuristic(FASTAFile, sampleSize, threshold, outputDir): """"Find the maximum parsimony score for that tree""" random.seed(0) outputFile = FASTAFile.replace(".align", ".out") if "/" in outputFile: outputFile = outputFile[outputFile.rfind("/"):] output = open(outputDir + "/" + outputFile, 'w') output.write("*****************RUN STARTS HERE!*****************") #start time startTime = time.clock() output.write("\n" + "Filename: " + FASTAFile + "\n") output.write("Program Start: {:%Y-%m-%d %H:%M:%S}".format(datetime.datetime.now()) + "\n") output.write("Sample Size: " + str(sampleSize) + "\nThreshold: " + str(threshold) + "\n\n") # Import fasta alignment file myAlignment = AlignIO.read(FASTAFile, "fasta") # Create a tip mapping from the fasta file tipMapping = {} for record in myAlignment: tipMapping[record.id] = str(record.seq) # Compute a distance matrix and construct tree calculator = DistanceCalculator("identity") myMatrix = calculator.get_distance(myAlignment) output.write("matrix constructed here") constructor = DistanceTreeConstructor() upgmaTree = constructor.upgma(myMatrix) output.write("constructed upgma tree") # Convert phyloxml tree to newick # biopython does not provide a function to do this so it was necessary # to write to a buffer in newick to convert then get rid of unneeded info for clade in upgmaTree.get_terminals(): clade.name = "\"" + clade.name + "\"" buf = cStringIO.StringIO() Phylo.write(upgmaTree, buf, 'newick', plain = True) tree = buf.getvalue() tree = re.sub(r'Inner\d*', '', tree) tree = tree.replace(";", "") tree = literal_eval(tree) #newick format output.write("created the original tree into newick format") # RLR tree required for maxParsimony function tree = NewicktoRLR(tree) score = maxParsimony(tree, tipMapping) graph = nx.Graph() makeGraph(graph, tree) output.write("made a graph") leaves = getLeaves(tree) currentFeasible = isFeasible(graph,leaves) output.write("tested isFeasible") # Perform NNI heuristic counter = 0 loopCounter = 
0 while True: output.write("in the while loop") loopCounter += 1 output.write("Loop Iteration: " + str(loopCounter) + "\n") output.write("Loop Start Time: {:%H:%M:%S}".format(datetime.datetime.now()) + "\n") output.write("Current Tree\nFeasibility: " + str(currentFeasible) + "\nScore: " + str(score) + "\nTree:\n" + str(tree) + "\n\n") NNIs = allNNIs(tree) if len(NNIs)-1 < sampleSize: sampleSize = len(NNIs)-1 toScore = random.sample(NNIs, sampleSize) # add feasibility test output.write("starting feasibility test") feasible = [] infeasible = [] for tree in toScore: graph = nx.Graph() makeGraph(graph, tree) leaves = getLeaves(tree) if isFeasible(graph, leaves): #if this tree is possible feasible.append(tree) else: infeasible.append(tree) #if this tree is not possible output.write("Number of Feasible Neighbor Trees: " + str(len(feasible)) + "\n") output.write("Number of Infeasible Neighbor Trees: " + str(len(infeasible)) + "\n") if len(feasible) != 0: #if feasible trees were found if isFeasible(graph, leaves): #if this NNI is possible feasible.append(tree) else: infeasible.append(tree) #if this NNI is not possible if len(feasible) != 0: #if feasible NNIs were found scoredList = map(lambda x: (maxParsimony(x, tipMapping), x), feasible) sortedList = sorted(scoredList) counter = 0 if not currentFeasible or sortedList[0][0] < score: score = sortedList[0][0] tree = sortedList[0][1] currentFeasible = True output.write("Found a New Feasible Tree!\n\n") else: output.write("Best Possible Feasible Tree Found\n" + str(tree) + "\n" + "Score: " + str(score) + "\n\n") break else: #if no possible trees we're found if currentFeasible: #checks if the original tree was feasible output.write("No Feasible Neighbors, Best Possible Feasible Tree\n" + str(tree) + "\n\n") break counter += 1 output.write("Threshold counter: " + str(counter) + "\n\n") if counter >= threshold: output.write("Threshold Met: No Feasible Tree Found\n") stopTime = (time.clock() - startTime) output.write("Program 
Stop: " + str(stopTime) + " seconds\n\n") return output.write("Searching Infeasible Space\n") scoredList = map(lambda x: (maxParsimony(x, tipMapping), x), infeasible) sortedList = sorted(scoredList) choseNeighbor = False for neighbor in sortedList: #if the original tree was infeasible and no feasible neighbors were found, take the next best infeasible tree and run again if neighbor[0] > score: score = neighbor[0] tree = neighbor[1] choseNeighbor = True break if not choseNeighbor: score = sortedList[-1][0] tree = sortedList[-1][1] currentFeasible = False output.write("Next Best Infeasible Tree\n\n") endTime = (time.clock() - startTime) output.write("Program End: " + str(endTime) + " seconds\n\n") #outputTree = RLRtoNewick(tree) #print "Final score", score return
## Disabled alternative: pad sequences so that they all have the same length
#for record in records:
#    if len(record.seq) != maxlen:
#        sequence = str(record.seq).ljust(maxlen, '.')
#        record.seq = Seq.Seq(sequence)
#assert all(len(record.seq) == maxlen for record in records)

## Disabled alternative: write to temporary file and do alignment
#output_file = '{}_padded.fasta'.format(os.path.splitext(input_file)[0])
#with open(output_file, 'w') as f:
#    SeqIO.write(records, f, 'fasta')
#alignment = AlignIO.read(output_file, "fasta")

## Disabled alternative: ClustalW instead of MUSCLE
#cline = ClustalwCommandline("clustalw2", infile=input_file)
#print(cline)
#print type(cline)

# Align input_file with MUSCLE; the FASTA alignment arrives on stdout.
muscle_cline = MuscleCommandline(input=input_file)
stdout, stderr = muscle_cline()
alignment = AlignIO.read(StringIO(stdout), "fasta")
print(alignment)

#alignment = AlignIO.read('../data/ls_orchid.fasta', 'fasta')
#print alignment

# Distance matrix -> UPGMA tree -> phyloXML file
# NOTE(review): 'ident' is not the same model name as 'identity' -- confirm
# it is accepted by the installed Biopython version.
calculator = DistanceCalculator('ident')
dm = calculator.get_distance(alignment)
constructor = DistanceTreeConstructor()
tree = constructor.upgma(dm)
Phylo.write(tree, 'phyloxml.xml', 'phyloxml')
def setUp(self):
    """Load msa.phy and prepare the blosum62 distance matrix and tree constructor."""
    # Read inside a with-block so the file handle is closed (the original
    # passed an open() result that was never closed).
    with open('TreeConstruction/msa.phy') as handle:
        self.aln = AlignIO.read(handle, 'phylip')
    calculator = DistanceCalculator('blosum62')
    self.dm = calculator.get_distance(self.aln)
    self.constructor = DistanceTreeConstructor(calculator)
from Bio.Phylo.TreeConstruction import DistanceCalculator, DistanceTreeConstructor
from Bio import AlignIO
from Bio.Phylo.Consensus import *
from Bio import Phylo

clusters = 508
consensus_trees = []  # consensus trees for all clusters

# Clusters 100..507, with cluster 354 left out.
for i in range(100, clusters):
    if i == 354:
        continue
    msa = AlignIO.read('msa_klaster' + str(i) + '_s.fasta', 'fasta')
    calculator = DistanceCalculator('identity')
    # The constructor computes distances itself for every bootstrap
    # replicate, so no matrix is computed for the original alignment here.
    constructor = DistanceTreeConstructor(calculator, 'nj')
    trees_list = list(bootstrap_trees(msa, 50, constructor))

    # Keep only trees whose internal nodes all reach 50% support.
    trees = [
        candidate for candidate in trees_list
        if all(node.confidence >= 50
               for node in get_support(candidate, trees_list).get_nonterminals())
    ]
    if trees:
        consensus_trees.append(majority_consensus(trees))
# Disabled earlier approach, kept for reference:
# i = 0
# while i < len(sequences):
#     if sequences[i] in temp_dict:
#         i += 14
#     else:
#         temp_dict[sequences[i]] = sequences[i + 1 : i + 13]
#         new_file.write(str(i) + "\n")
#         for item in temp_dict[sequences[i]]:
#             new_file.write(item)
#         i += 14

# Write the identity distance matrix of every FASTA alignment to
# "<name>_dm.txt" in the muscle_alignments directory.
fasta_files = file_handlers.find_files(file_paths, "fasta")
for path in fasta_files:
    file_name = file_handlers.get_file_name(path)
    print(file_name)
    name_list = file_name.split(".")
    # derep_out_file = ''.join(name_list[0] + '_uniques.fasta')
    dm_out_file = name_list[0] + "_dm.txt"
    # cmd = ['usearch -derep_fulllength ' + path + ' -fastaout ' + derep_out_file]
    # subprocess.call(cmd, shell=True)

    # The with-block closes the file even if reading the alignment fails.
    with open("/Users/andrea/repositories/AMPHORA2/muscle_alignments/" + dm_out_file, "w") as new_file:
        aln = AlignIO.read(path, "fasta")
        # 'identity' is the scoring model used to calculate the distances.
        calculator = DistanceCalculator("identity")
        dm = calculator.get_distance(aln)
        # write() needs text, so the matrix is converted with str().
        new_file.write(str(dm))
# ICMC - USP
# Python program to build phylogenetic tree
from Bio import AlignIO
from Bio import Phylo
import numpy as np
from Bio.Phylo.TreeConstruction import DistanceCalculator
from Bio.Phylo.TreeConstruction import DistanceTreeConstructor


def _jukes_cantor_inplace(dm):
    """Replace every distance x in ``dm.matrix`` by -3/4 * ln(1 - 4/3 * x); return ``dm``."""
    for row in dm.matrix:
        # Float literals keep the result correct even where "/" is integer
        # division (Python 2 evaluates -3/4 as -1 and 4/3 as 1).
        row[:] = [-0.75 * np.log(1 - 4.0 / 3.0 * x) for x in row]
    return dm


# Reads the alignment files
alignApe = AlignIO.read('genomes/human_primates_aligned.fasta', 'fasta')
alignHIV = AlignIO.read('genomes/alignedHIV.fasta', 'fasta')

# Creates the distance matrix
# NOTE(review): 'ident' is not the same model name as 'identity' -- confirm
# it is accepted by the installed Biopython version.
calculator = DistanceCalculator('ident')
dm_ape = calculator.get_distance(alignApe)
dm_hiv = calculator.get_distance(alignHIV)

# Jukes Cantor corrections. The correction is applied in place, so
# dm_ape_corrected / dm_hiv_corrected are the same objects as dm_ape / dm_hiv.
dm_ape_corrected = _jukes_cantor_inplace(dm_ape)
dm_hiv_corrected = _jukes_cantor_inplace(dm_hiv)

# Constructs the tree using the upgma algorithm
constructor = DistanceTreeConstructor()
def makeDistanceTree():
    """Build a neighbor-joining tree from the PHYLIP test alignment.

    Reads 'Tests/TreeConstruction/msa.phy' and hands the alignment to a
    DistanceTreeConstructor configured with the 'identity' distance model
    and the 'nj' method.

    Returns:
        The tree produced by ``DistanceTreeConstructor.build_tree``.
    """
    aln = AlignIO.read('Tests/TreeConstruction/msa.phy', 'phylip')
    calculator = DistanceCalculator('identity')
    # build_tree() is given the alignment and the calculator, so no separate
    # distance matrix has to be computed here.
    constructor = DistanceTreeConstructor(calculator, 'nj')
    return constructor.build_tree(aln)
from Bio.Phylo.TreeConstruction import DistanceCalculator
from Bio.Phylo.TreeConstruction import DistanceTreeConstructor
from Bio import AlignIO

# The alignment is pretty much the elementary structure.
aln = AlignIO.read('./msa.phy', 'phylip')
# print aln
# SingleLetterAlphabet() alignment with 5 rows and 13 columns
# AACGTGGCCACAT Alpha
# AAGGTCGCCACAC Beta
# GAGATTTCCGCCT Delta
# GAGATCTCCGCCC Epsilon
# CAGTTCGCCACAA Gamma

# Several things can be done with the alignment, e.g. get a distance matrix
# from it:
dstcalc = DistanceCalculator('identity')
dm = dstcalc.get_distance(aln)
# DistanceMatrix(names=['Alpha', 'Beta', 'Gamma', 'Delta', 'Epsilon'], matrix=[[0], [0.23076923076923073, 0], [0.3846153846153846, 0.23076923076923073, 0], [0.5384615384615384, 0.5384615384615384, 0.5384615384615384, 0], [0.6153846153846154, 0.3846153846153846, 0.46153846153846156, 0.15384615384615385, 0]])

# Single-argument print(...) calls behave the same on Python 2 and Python 3.
print("What's the get_distance(aln) from DistanceCalculator('identity') object?")
print(type(dm))
print(dm)
# Alpha 0
# Beta 0.230769230769 0
# Gamma 0.384615384615 0.230769230769 0
# Delta 0.538461538462 0.538461538462 0.538461538462 0
# Epsilon 0.615384615385 0.384615384615 0.461538461538 0.153846153846 0

# Build a tree from it.
construc0 = DistanceTreeConstructor(dstcalc, 'nj')
tre0 = construc0.build_tree(aln)
def noFeasibleTest(FASTAFile, sampleSize, outputDir):
    """Run the NNI parsimony search on a FASTA alignment and log the run.

    Builds a UPGMA tree from the alignment in ``FASTAFile``, converts it to
    RLR format and then repeatedly samples NNI neighbours of the current
    tree, moving to the best sampled neighbour whenever its parsimony score
    is strictly lower than the current one (no feasibility check is done).
    The search stops when no sampled neighbour improves the score.

    Progress and the final tree are written to a log file in ``outputDir``
    whose name is the file name of ``FASTAFile`` with ".align" replaced by
    ".out".  The random module is re-seeded with 0 on every call.

    Args:
        FASTAFile: path of the FASTA alignment to read.
        sampleSize: number of NNI neighbours scored per iteration; lowered
            to ``len(neighbours) - 1`` when fewer neighbours are available.
        outputDir: directory that receives the log file.

    Returns:
        None.
    """
    random.seed(0)

    outputFile = FASTAFile.replace(".align", ".out")
    if "/" in outputFile:
        # Keep only the file name; the separator itself is skipped so the
        # joined path below does not contain a doubled "/".
        outputFile = outputFile[outputFile.rfind("/") + 1:]

    # The "with" block guarantees the log is flushed and closed even when
    # the search raises.
    with open(outputDir + "/" + outputFile, 'w') as output:
        output.write("*****************RUN STARTS HERE!*****************")
        # start time
        startTime = time.clock()
        output.write("\n" + "Filename: " + FASTAFile + "\n")
        output.write("Program Start: {:%Y-%m-%d %H:%M:%S}".format(datetime.datetime.now()) + "\n")
        output.write("Sample Size: " + str(sampleSize) + "\n\n")

        # Import fasta alignment file
        myAlignment = AlignIO.read(FASTAFile, "fasta")

        # Create a tip mapping (record id -> sequence string) from the fasta file
        tipMapping = {}
        for record in myAlignment:
            tipMapping[record.id] = str(record.seq)

        # Compute a distance matrix and construct tree
        calculator = DistanceCalculator("identity")
        myMatrix = calculator.get_distance(myAlignment)
        constructor = DistanceTreeConstructor()
        upgmaTree = constructor.upgma(myMatrix)

        # Convert the tree to newick: write it to an in-memory buffer in
        # newick format, then strip the information that is not needed.
        for clade in upgmaTree.get_terminals():
            # Quote tip names so the newick text can be parsed as a literal.
            clade.name = "\"" + clade.name + "\""
        buf = cStringIO.StringIO()
        Phylo.write(upgmaTree, buf, 'newick', plain = True)
        tree = buf.getvalue()
        tree = re.sub(r'Inner\d*', '', tree)  # drop internal node labels
        tree = tree.replace(";", "")
        tree = literal_eval(tree)  # newick format

        # RLR tree required for maxParsimony function
        tree = NNI.NewicktoRLR(tree)
        score = NNI.maxParsimony(tree, tipMapping)

        # Perform NNI heuristic
        loopCounter = 0
        while True:
            loopCounter += 1
            output.write("Loop Iteration: " + str(loopCounter) + "\n")
            output.write("Loop Start Time: {:%H:%M:%S}".format(datetime.datetime.now()) + "\n")
            output.write("Current Tree\nScore: " + str(score) +
                         "\nTree:\n" + str(tree) + "\n\n")

            NNIs = NNI.allNNIs(tree)
            if len(NNIs) - 1 < sampleSize:
                # Never ask random.sample() for a negative number of items.
                sampleSize = max(len(NNIs) - 1, 0)
            toScore = random.sample(NNIs, sampleSize)
            scoredList = [(NNI.maxParsimony(candidate, tipMapping), candidate)
                          for candidate in toScore]
            if not scoredList:
                # Nothing was sampled, so there is no neighbour to move to.
                break

            # Only the best (score, tree) pair is needed; min() picks the
            # same pair a full sort would put first.
            bestScore, bestTree = min(scoredList)
            if bestScore < score:
                score = bestScore
                tree = bestTree
                output.write("Found A More Parsimonious Tree!\n\n")
            else:
                break

        output.write("No Neighbors With Better Scores Found\n\n")
        output.write("Final Tree:\n" + str(tree) + "\nScore: " + str(score) + "\n\n")
        endTime = (time.clock() - startTime)
        output.write("Program End: " + str(endTime) + " seconds\n\n")
    return