def extensions_onto_foundation(otu_file_fh, extension_taxonomy_fh,
                               extension_seq_fh, foundation_alignment_fh,
                               ghost_tree_fp):
    """Combines two genetic databases into one phylogenetic tree.

    Some genetic databases provide finer taxonomic resolution,
    but high sequence variability causes poor multiple sequence alignments
    (these are the "extension trees"). Other databases provide high quality
    phylogenetic information (hence it is used as the "foundation"), but poor
    taxonomic resolution. This script combines two genetic databases into
    one phylogenetic tree in .nwk format, taking advantage of the benefits
    of both databases, but allowing sequencing to be performed using the
    "extension trees" primer set.

    Parameters
    ----------
    otu_file_fh : filehandle
        Tab-delimited text file containing OTU clusters in rows containing
        accession numbers only. Format can be 1) where the accession number
        is in the first column with only one column or 2) it can contain
        accession numbers clustered in tab-delimited rows containing more
        accession numbers, which are part of that OTU cluster (as in output
        of "ghost-tree group-extensions"). This file refers to the
        "extension trees". File references to sequence reads or sample
        numbers/names are not valid here. This is not an OTU .biom table.
    extension_taxonomy_fh : filehandle
        Tab-delimited text file related to "extension trees" with the 1st
        column being an accession number (same accession numbers in
        otu_file_fh and extension_taxonomy_fh) and the 2nd column is the
        taxonomy ranking in the following format:
        k__Fungi;p__Basidiomycota;c__Agaricomycetes;o__Sebacinales;
        f__Sebacinaceae;g__unidentified;s__Sebacina
    extension_seq_fh : filehandle
        The .fasta formatted sequences for the "extension trees" genetic
        dataset. Sequence identifiers are the accession numbers. These
        accession numbers are the same as in the otu_file_fh and
        extension_taxonomy_fh.
    foundation_alignment_fh : filehandle
        File containing pre-aligned sequences from a genetic marker database
        in .fasta format. This file refers to the "foundation" of the
        ghost-tree. Contains accession numbers and taxonomy labels.
    ghost_tree_fp : folder
        Output folder contains files including:
        a) The Newick formatted ghost-tree, which is the final output of the
           ghost-tree tool. This is a phylogenetic tree designed for
           downstream diversity analyses.
        b) Accession IDs from the ghost-tree.nwk file that you can use for
           downstream analyses tools
        c) log error file (this is an optional file that you can have if you
           type '--stderr')

    Returns
    -------
    tuple
        ``(newick, all_std_error)``: the stripped string form of the final
        tree, and the accumulated stderr text (whatever
        ``_make_foundation_tree`` returned plus one "FastTree warnings"
        entry per genus that was processed successfully).

    Notes
    -----
    Creates a working directory named ``tmp`` in the current directory and
    creates ``ghost_tree_fp``; both ``os.mkdir`` calls raise if the
    directory already exists. ``tmp`` is removed again before returning,
    including when an error propagates out of the tree-building steps.
    """
    global foundation_accession_genus_dic  # needs global assignment for flake8
    foundation_accession_genus_dic = {}

    # Probe for the external tools. A missing tool only produces a printed
    # hint; execution continues regardless, as before.
    process = subprocess.Popen("muscle", shell=True,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE)
    _, std_error = process.communicate()
    if re.search("command not found", std_error):
        print("muscle, multiple sequence aligner, is not found. Is it"
              " installed? Is it in your path?")
    process = subprocess.Popen("fasttree", shell=True,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE)
    _, std_error = process.communicate()
    # The check must see fasttree's real stderr; resetting the variable
    # before this test made the warning unreachable.
    if re.search("command not found", std_error):
        print("fasttree, phylogenetic tree builder, is not found. Is it"
              " installed? Is it in your path?")
    # The probe output is not part of the log: hand an empty string to
    # _make_foundation_tree as the starting stderr text.
    std_error = ""

    os.mkdir("tmp")
    try:
        os.mkdir(ghost_tree_fp)
        extension_genus_accession_list_dic = \
            _extension_genus_accession_dic(otu_file_fh,
                                           extension_taxonomy_fh)
        skbio.write(_make_nr_foundation_alignment(
            foundation_alignment_fh, extension_genus_accession_list_dic),
            into=ghost_tree_fp + "/nr_foundation_alignment_gt.fasta",
            format="fasta")
        foundation_tree, all_std_error = _make_foundation_tree(
            ghost_tree_fp + "/nr_foundation_alignment_gt.fasta", std_error,
            ghost_tree_fp)
        seqs = SequenceCollection.read(extension_seq_fh)
        for node in foundation_tree.tips():
            # str(node) is split on ":" into exactly two parts; the first
            # part is the accession used to look up the genus.
            key_node, _ = str(node).split(":")
            key_node = foundation_accession_genus_dic[key_node]
            try:
                _make_mini_otu_files(key_node,
                                     extension_genus_accession_list_dic,
                                     seqs)
                process = subprocess.Popen(
                    "muscle -in tmp/mini_seq_gt.fasta" + " -out" +
                    " tmp/mini_alignment_gt.fasta -quiet" +
                    " -maxiters 2 -diags1",
                    shell=True,
                    stdout=subprocess.PIPE,
                    stderr=subprocess.PIPE)
                _, std_error = process.communicate()
                process = subprocess.Popen("fasttree -nt -quiet" +
                                           " tmp/mini_alignment_gt.fasta >" +
                                           " tmp/mini_tree_gt.nwk",
                                           shell=True,
                                           stdout=subprocess.PIPE,
                                           stderr=subprocess.PIPE)
                _, std_error = process.communicate()
                all_std_error += "FastTree warnings for genus " + key_node + \
                    " are:\n" + std_error + "\n"
                mini_tree = read("tmp/mini_tree_gt.nwk", format='newick',
                                 into=TreeNode)
                node.extend(mini_tree.root_at_midpoint().children[:])
            except Exception:
                # Best effort: a genus whose extension tree cannot be built
                # is left as a plain foundation tip. Catching Exception
                # (not a bare except) lets Ctrl-C / SystemExit propagate.
                continue
    finally:
        # Remove the scratch directory even when a step above raised.
        shutil.rmtree("tmp")

    with open(ghost_tree_fp + "/ghost_tree.nwk", "w") as ghost_tree_nwk:
        ghost_tree_nwk.write(str(foundation_tree))
    _make_accession_id_file(ghost_tree_fp)
    return str(foundation_tree).strip(), all_std_error
def scaffold_extensions_into_foundation(otu_file_fh, extension_taxonomy_fh,
                                        extension_seq_fh,
                                        foundation_alignment_fh,
                                        ghost_tree_fp):
    """Combines two genetic databases into one phylogenetic tree.

    Some genetic databases provide finer taxonomic resolution,
    but high sequence variability causes poor multiple sequence alignments
    (these are the "extension trees"). Other databases provide high quality
    phylogenetic information (hence it is used as the "foundation"), but poor
    taxonomic resolution. This script combines two genetic databases into
    one phylogenetic tree in .nwk format, taking advantage of the benefits
    of both databases, but allowing sequencing to be performed using the
    "extension trees" primer set.

    Parameters
    ----------
    otu_file_fh : filehandle
        Tab-delimited text file containing OTU clusters in rows containing
        accession numbers only. Format can be 1) where the accession number
        is in the first column with only one column or 2) it can contain
        accession numbers clustered in tab-delimited rows containing more
        accession numbers, which are part of that OTU cluster (as in output
        of "ghost-tree group-extensions"). This file refers to the
        "extension trees". File references to sequence reads or sample
        numbers/names are not valid here. This is not an OTU .biom table.
    extension_taxonomy_fh : filehandle
        Tab-delimited text file related to "extension trees" with the 1st
        column being an accession number (same accession numbers in
        otu_file_fh and extension_taxonomy_fh) and the 2nd column is the
        taxonomy ranking in the following format:
        k__Fungi;p__Basidiomycota;c__Agaricomycetes;o__Sebacinales;
        f__Sebacinaceae;g__unidentified;s__Sebacina
    extension_seq_fh : filehandle
        The .fasta formatted sequences for the "extension trees" genetic
        dataset. Sequence identifiers are the accession numbers. These
        accession numbers are the same as in the otu_file_fh and
        extension_taxonomy_fh.
    foundation_alignment_fh : filehandle
        File containing pre-aligned sequences from a genetic marker database
        in .fasta format. This file refers to the "foundation" of the
        ghost-tree. Contains accession numbers and taxonomy labels.
    ghost_tree_fp : filehandle
        Writable handle that receives the Newick formatted ghost-tree, the
        final output of the ghost-tree tool. This is a phylogenetic tree
        designed for downstream diversity analyses.

    Returns
    -------
    str
        The string form of the final tree with surrounding whitespace
        stripped.

    Notes
    -----
    Uses a scratch directory named ``tmp`` in the current directory; it is
    removed with ``rm -r`` before returning, including when an error
    propagates. The intermediate ``nr_foundation_alignment_gt.fasta`` is
    written to the current directory and is not removed here.
    """
    global foundation_accession_genus_dic
    # Kept module-global as before; other code may read it after the call.
    global seqs
    foundation_accession_genus_dic = {}

    os.system("mkdir tmp")
    try:
        extension_genus_accession_list_dic = \
            _extension_genus_accession_dic(otu_file_fh,
                                           extension_taxonomy_fh)
        skbio.write(_make_nr_foundation_alignment(
            foundation_alignment_fh, extension_genus_accession_list_dic),
            into="nr_foundation_alignment_gt.fasta",
            format="fasta")
        foundation_tree = _make_foundation_tree(
            "nr_foundation_alignment_gt.fasta")
        seqs = SequenceCollection.read(extension_seq_fh)
        for node in foundation_tree.tips():
            # str(node) is split on ":" into exactly two parts; the first
            # part is the accession used to look up the genus.
            key_node, _ = str(node).split(":")
            key_node = foundation_accession_genus_dic[key_node]
            try:
                _make_mini_otu_files(key_node,
                                     extension_genus_accession_list_dic,
                                     seqs)
                os.system("muscle -in tmp/mini_seq_gt.fasta -out" +
                          " tmp/mini_alignment_gt.fasta -quiet" +
                          " -maxiters 2 -diags1")
                os.system("fasttree -nt -quiet tmp/mini_alignment_gt.fasta >" +
                          " tmp/mini_tree_gt.nwk")
                mini_tree = read("tmp/mini_tree_gt.nwk", format='newick',
                                 into=TreeNode)
                node.extend(mini_tree.children[:])
            except Exception:
                # Best effort: a genus whose extension tree cannot be built
                # is left as a plain foundation tip. Catching Exception
                # (not a bare except) lets Ctrl-C / SystemExit propagate.
                continue
    finally:
        # Remove the scratch directory even when a step above raised.
        os.system("rm -r tmp")

    ghost_tree_fp.write(str(foundation_tree))
    return str(foundation_tree).strip()
def scaffold_extensions_into_foundation(otu_file_fh, extension_taxonomy_fh,
                                        extension_seq_fh,
                                        foundation_alignment_fh,
                                        ghost_tree_fp):
    """Combines two genetic databases into one phylogenetic tree.

    Some genetic databases provide finer taxonomic resolution,
    but high sequence variability causes poor multiple sequence alignments
    (these are the "extension trees"). Other databases provide high quality
    phylogenetic information (hence it is used as the "foundation"), but poor
    taxonomic resolution. This script combines two genetic databases into
    one phylogenetic tree in .nwk format, taking advantage of the benefits
    of both databases, but allowing sequencing to be performed using the
    "extension trees" primer set.

    Parameters
    ----------
    otu_file_fh : filehandle
        Tab-delimited text file containing OTU clusters in rows containing
        accession numbers only. Format can be 1) where the accession number
        is in the first column with only one column or 2) it can contain
        accession numbers clustered in tab-delimited rows containing more
        accession numbers, which are part of that OTU cluster (as in output
        of "ghost-tree group-extensions"). This file refers to the
        "extension trees". File references to sequence reads or sample
        numbers/names are not valid here. This is not an OTU .biom table.
    extension_taxonomy_fh : filehandle
        Tab-delimited text file related to "extension trees" with the 1st
        column being an accession number (same accession numbers in
        otu_file_fh and extension_taxonomy_fh) and the 2nd column is the
        taxonomy ranking in the following format:
        k__Fungi;p__Basidiomycota;c__Agaricomycetes;o__Sebacinales;
        f__Sebacinaceae;g__unidentified;s__Sebacina
    extension_seq_fh : filehandle
        The .fasta formatted sequences for the "extension trees" genetic
        dataset. Sequence identifiers are the accession numbers. These
        accession numbers are the same as in the otu_file_fh and
        extension_taxonomy_fh.
    foundation_alignment_fh : filehandle
        File containing pre-aligned sequences from a genetic marker database
        in .fasta format. This file refers to the "foundation" of the
        ghost-tree. Contains accession numbers and taxonomy labels.
    ghost_tree_fp : filehandle
        Writable handle that receives the Newick formatted ghost-tree, the
        final output of the ghost-tree tool. This is a phylogenetic tree
        designed for downstream diversity analyses. Its string form is also
        used to derive the log file name.

    Returns
    -------
    str
        The string form of the final tree with surrounding whitespace
        stripped.

    Notes
    -----
    Creates a scratch directory named ``tmp`` in the current directory
    (``os.mkdir`` raises if it already exists) and removes it before
    returning, including when an error propagates. A log file named
    ``ghost-tree_log_<name>.txt`` is written to the current directory and
    is always closed, even on failure.
    """
    global foundation_accession_genus_dic  # needs global assignment for flake8
    foundation_accession_genus_dic = {}

    # NOTE(review): the log-file suffix is cut out of str(ghost_tree_fp) by
    # fixed offsets, which depends on the exact text that object renders
    # to -- confirm against the caller before changing how it is opened.
    ghost_tree_output = str(ghost_tree_fp)[16:-4]

    # Probe for the external tools. A missing tool only produces a printed
    # hint; execution continues regardless, as before.
    process = subprocess.Popen("muscle", shell=True,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE)
    output, error = process.communicate()
    if re.search("command not found", error):
        print("muscle, multiple sequence aligner, is not found. Is it"
              " installed? Is it in your path?")
    process = subprocess.Popen("fasttree", shell=True,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE)
    output, error = process.communicate()
    if re.search("command not found", error):
        print("fasttree, phylogenetic tree builder, is not found. Is it"
              " installed? Is it in your path?")

    os.mkdir("tmp")
    try:
        # The context manager closes the log even if a later step raises.
        with open("ghost-tree_log_" + ghost_tree_output + ".txt",
                  "w") as logfile:
            extension_genus_accession_list_dic = \
                _extension_genus_accession_dic(otu_file_fh,
                                               extension_taxonomy_fh)
            skbio.write(_make_nr_foundation_alignment(
                foundation_alignment_fh,
                extension_genus_accession_list_dic),
                into="nr_foundation_alignment_gt.fasta",
                format="fasta")
            foundation_tree = _make_foundation_tree(
                "nr_foundation_alignment_gt.fasta", logfile)
            seqs = SequenceCollection.read(extension_seq_fh)
            for node in foundation_tree.tips():
                # str(node) is split on ":" into exactly two parts; the
                # first part is the accession used to look up the genus.
                key_node, _ = str(node).split(":")
                key_node = foundation_accession_genus_dic[key_node]
                try:
                    _make_mini_otu_files(key_node,
                                         extension_genus_accession_list_dic,
                                         seqs)
                    process = subprocess.Popen(
                        "muscle -in tmp/mini_seq_gt.fasta" + " -out" +
                        " tmp/mini_alignment_gt.fasta -quiet" +
                        " -maxiters 2 -diags1",
                        shell=True,
                        stdout=subprocess.PIPE,
                        stderr=subprocess.PIPE)
                    output, error = process.communicate()
                    process = subprocess.Popen(
                        "fasttree -nt -quiet" +
                        " tmp/mini_alignment_gt.fasta >" +
                        " tmp/mini_tree_gt.nwk",
                        shell=True,
                        stdout=subprocess.PIPE,
                        stderr=subprocess.PIPE)
                    output, error = process.communicate()
                    logfile.write("FastTree warnings for genus " + key_node +
                                  " are:\n" + error + "\n")
                    mini_tree = read("tmp/mini_tree_gt.nwk", format='newick',
                                     into=TreeNode)
                    node.extend(mini_tree.root_at_midpoint().children[:])
                except Exception:
                    # Best effort: a genus whose extension tree cannot be
                    # built is left as a plain foundation tip. Catching
                    # Exception (not a bare except) lets Ctrl-C /
                    # SystemExit propagate.
                    continue
    finally:
        # Remove the scratch directory even when a step above raised.
        shutil.rmtree("tmp")

    ghost_tree_fp.write(str(foundation_tree))
    return str(foundation_tree).strip()