def get_tree_from_fasta(in_fasta):
    """
    @summary: Returns the tree of taxa described in the records of a fasta file.
              The description of each record is read as a taxonomy where the
              taxa are separated by ';' (one trailing ';' is ignored). The node
              of the last taxon of each record stores the ids of the
              corresponding records in metadata["seq_ids"].
    @param in_fasta: [str] The fasta file to read (opened with FastaIO).
    @return: [Node] The root node of the tree or None if the file does not
             contain any record.
    @warning: The root node must be present: the first taxon of the first
              record becomes the root and the first taxon of the following
              records is never compared to it.
    """
    databank_tree = None
    FH_databank = FastaIO(in_fasta)
    try:
        for record in FH_databank:
            if record.description.endswith(";"):
                record.description = record.description[:-1]
            taxonomy = record.description.split(";")
            if databank_tree is None:
                databank_tree = Node(taxonomy[0])
            parent = databank_tree
            leaf_depth = len(taxonomy) - 1
            for rank_depth, taxa in enumerate(taxonomy[1:], 1):
                if not parent.has_child(taxa):
                    # NOTE(review): Node(taxa, parent) presumably registers the
                    # new node as child of parent (it is retrieved just below
                    # with get_child) - confirm against the Node class.
                    Node(taxa, parent)
                taxa_node = parent.get_child(taxa)
                if rank_depth == leaf_depth:  # Current node is leaf
                    # The node may already exist without "seq_ids" when a
                    # previous record used it as an intermediate taxon.
                    if "seq_ids" not in taxa_node.metadata:
                        taxa_node.metadata["seq_ids"] = list()
                    taxa_node.metadata["seq_ids"].append(record.id)
                parent = taxa_node
    finally:
        # Release the file even if a record cannot be processed
        FH_databank.close()
    return databank_tree
def update_tree_for_sample( biom, tree, sample_name, taxonomy_key, sample_id=None ):
    """
    @summary: Adds in the tree the taxa observed in a sample and the sample count on the node of the last taxon of each observation.
    @param biom: [Biom] The Biom object.
    @param tree: [Node] The root node of the tree to update.
    @param sample_name: [str] The name of the processed sample.
    @param taxonomy_key: [str] The metadata title for the taxonomy in biom.
    @param sample_id: [str] If it is provided, this id is used instead of the sample name as count key in nodes metadata.
    @return: [Node] The root node received in parameter (updated in place).
    """
    if sample_id is None:
        count_key = sample_name
    else:
        count_key = str(sample_id)
    for observation in biom.get_observations_by_sample( sample_name ):
        # Observations without taxonomy are not reported in tree
        if taxonomy_key not in observation["metadata"] or observation["metadata"][taxonomy_key] is None:
            continue
        # Walk the taxonomy from the root and create the missing nodes
        node = tree
        for taxon in biom.get_observation_taxonomy( observation["id"], taxonomy_key ):
            if not node.has_child( taxon ):
                node.add_child( Node(taxon) )
            node = node.get_child( taxon )
        # Add the sample count on the reached node
        observation_count = biom.get_count( observation["id"], sample_name )
        if count_key in node.metadata:
            node.metadata[count_key] += observation_count
        else:
            node.metadata[count_key] = 0 + observation_count
    return tree
def get_tree_with_count( input_biom, compress=False, taxonomy_key="taxonomy" ):
    """
    @summary: Builds from a BIOM file the tree of taxa with their counts by sample.
    @param input_biom: [str] Path to the BIOM file processed.
    @param compress: [bool] If true the samples names are replaced in tree by the samples indexes (position in the returned list of names).
    @param taxonomy_key: [str] The metadata title for the taxonomy in biom.
    @return: [Node, list] The root node of the generated tree and the ordered list of samples names (useful to retrieve a name from its index when compress is used).
    """
    tree = Node("root")
    biom = BiomIO.from_json( input_biom )
    ordered_samples_names = []
    for sample_idx, sample_name in enumerate( biom.get_samples_names() ):
        ordered_samples_names.append( sample_name )
        # With compress the key stored in nodes is the index of the sample
        sample_id = sample_idx if compress else None
        update_tree_for_sample( biom, tree, sample_name, taxonomy_key, sample_id )
    return tree, ordered_samples_names
def process(input_fasta, input_tax, output_fasta, output_tax, domains_filtered=None, ranks=None):
    """
    @summary: Writes the sequences file and the taxonomy file for the RDP
              retrain classifier from the silva sequences and taxonomy files.
              A sequence is skipped (with a message on stdout) when the
              taxonomy of its parent does not exist in the taxonomy file.
    @param input_fasta: [str] The silva sequences file.
    @param input_tax: [str] The silva taxonomy file.
    @param output_fasta: [str] The RDP retrain classifier sequence file.
    @param output_tax: [str] The RDP retrain classifier taxonomy file.
    @param domains_filtered: [list] These domains are not kept in output.
                             They are compared with the lower-cased domain of
                             each sequence.
    @param ranks: [list] The ranks reported in outputs. Default: "rootrank",
                  "domain", "phylum", "class", "order", "family", "genus",
                  "species".
    @raise Exception: When a sequence must be kept and 'species' is not in
                      ranks.
    """
    if ranks is None:
        ranks = ["rootrank", "domain", "phylum", "class", "order", "family", "genus", "species"]

    # Create tax tree
    taxonomy_ref, taxonomy_tree, new_taxon_id = silva_tax_2_tree(input_tax, ranks, domains_filtered)

    # Write fasta
    # is_filtered stays None (falsy) until the first header is read: the
    # sequence lines located before any header are written.
    is_filtered = None
    with open(output_fasta, "w") as FH_cleanFasta:
        with open(input_fasta) as FH_fasta:
            for line in FH_fasta:
                if not line.startswith(">"):
                    # Sequence line of the last header: u/U are replaced by
                    # t/T (presumably RNA to DNA alphabet)
                    if not is_filtered:
                        FH_cleanFasta.write(line.strip().replace('u', 't').replace('U', 'T') + "\n")
                    continue

                # Header line: '>id taxon;taxon;...;species'
                line_fields = line.strip().split()
                evaluated_id = line_fields[0]
                evaluated_taxonomy = " ".join(line_fields[1:]).split(";")
                # taxonomy_ref is indexed by the lower-cased taxonomy without species
                parent_key = ";".join(evaluated_taxonomy[:-1]).lower()
                if domains_filtered is not None and evaluated_taxonomy[0].strip().lower() in domains_filtered:
                    is_filtered = True
                elif parent_key not in taxonomy_ref:
                    is_filtered = True
                    print("The sequence '" + evaluated_id + "' is skipped because the node for '" + ";".join(evaluated_taxonomy) + "' does not exist in the taxonomy file.")
                else:
                    is_filtered = False
                    if "species" not in ranks:
                        raise Exception("The execution without 'species' rank is not implemented.")
                    parent_node = taxonomy_ref[parent_key]
                    # Go to genus: the missing ranks are filled with 'unknown <rank>' nodes
                    while parent_node.metadata["rank"] != "genus":
                        missing_rank = ranks[ranks.index(parent_node.metadata["rank"]) + 1]
                        missing_name = "unknown " + missing_rank
                        if parent_node.has_child(missing_name):
                            parent_node = parent_node.get_child(missing_name)
                        else:
                            parent_node = Node(missing_name, parent_node, {}, {
                                "clean_name": missing_name + " [id:" + str(new_taxon_id) + "]",
                                "id": new_taxon_id,
                                "rank": missing_rank
                            })
                            new_taxon_id += 1
                    # Add species to tree
                    species_name = get_cleaned_sp(evaluated_taxonomy[-1])
                    if parent_node.has_child(species_name):
                        species_node = parent_node.get_child(species_name)
                    else:
                        species_node = Node(species_name, parent_node, {}, {
                            "clean_name": species_name.replace('*', ' ').replace('<', ' ').replace('>', ' ') + " [id:" + str(new_taxon_id) + "]",
                            "id": new_taxon_id,
                            "rank": "species"
                        })
                        new_taxon_id += 1
                    clean_taxonomy = get_taxonomy(species_node)
                    FH_cleanFasta.write(evaluated_id + '\tRoot' + ';' + clean_taxonomy + "\n")

    # Write RDP tax file
    with open(output_tax, "w") as FH_cleanTax:
        write_rdp_tax(FH_cleanTax, taxonomy_tree)
def silva_tax_2_tree(taxonomy_file, authorized_ranks, domains_filtered=None):
    """
    @summary: Returns a taxonomic tree from the silva taxonomy.
    @param taxonomy_file: [str] The silva taxonomy file (provide taxon name,
                          taxon level and taxon rank name). Line example:
                          'Archaea;Crenarchaeota;Thermoprotei; 7 class 119'
    @param authorized_ranks: [list] The ranks reported in tree.
    @param domains_filtered: [list] These domains are not kept in tree. They
                             are compared with the lower-cased domain of each
                             line.
    @return: [dict, Node, int] The link between complete taxonomy (all levels,
             lower-cased and joined by ';') and node (only authorized ranks),
             the root node of the tree, the id of the next new element added
             in tree.
    @raise Exception: When a non-empty line does not have the expected format
                      or when the rank of a taxon is not below the rank of its
                      ancestor.
    @raise KeyError: When the ancestor of a taxon has not been seen before the
                     taxon in the file.
    """
    next_new_id = 1000000000
    taxonomy_ref = dict()
    taxonomy_tree = Node("Root", None, None, {
        'clean_name': 'Root',
        'id': 0,
        'rank': 'rootrank'
    })
    # '<taxon>;<taxon>;...; <id> <rank>'
    line_pattern = re.compile(r"^(.+);\s+(\d+)\s+([^\s]+)")
    with open(taxonomy_file) as FH_tax:
        for line in FH_tax:
            line = line.strip()
            if line == "":
                continue

            # Parse line
            matches = line_pattern.search(line)
            if matches is None:
                raise Exception("Incorrect line content : '" + line + "'.")
            evaluated_taxonomy = matches.group(1).split(";")
            evaluated_id = matches.group(2)
            evaluated_rank = matches.group(3)
            if domains_filtered is not None and evaluated_taxonomy[0].strip().lower() in domains_filtered:
                continue

            # Go to the most downstream already existing ancestor of the element
            parent_node = taxonomy_tree
            if len(evaluated_taxonomy) > 1:
                parent_node = taxonomy_ref[";".join(evaluated_taxonomy[:-1]).lower()]
            current_node = None

            # Add evaluated node if it has a valid rank
            if evaluated_rank in authorized_ranks:
                parent_rank = parent_node.metadata["rank"]
                if authorized_ranks.index(evaluated_rank) <= authorized_ranks.index(parent_rank):
                    if evaluated_taxonomy[-1] == "uncultured" and parent_node.name == evaluated_taxonomy[-2]:
                        # 'uncultured' tagged with the rank of its parent is
                        # moved to the rank following the rank of the parent
                        evaluated_rank = authorized_ranks[authorized_ranks.index(parent_rank) + 1]
                    else:
                        raise Exception("The taxonomy in file '" + taxonomy_file + "' seems to be incoherent. The taxon '" + ";".join(evaluated_taxonomy) + "' is tagged as '" + evaluated_rank + "' and its ancestor '" + get_taxonomy(parent_node) + "' is tagged as '" + parent_node.metadata["rank"] + "'.")

                # Complete missing ranks between parent and evaluated
                evaluated_rank_depth = authorized_ranks.index(evaluated_rank)
                while authorized_ranks[evaluated_rank_depth - 1] != parent_node.metadata["rank"]:
                    missing_rank_depth = authorized_ranks.index(parent_node.metadata["rank"]) + 1
                    missing_name = "unknown " + authorized_ranks[missing_rank_depth]
                    if parent_node.has_child(missing_name):
                        parent_node = parent_node.get_child(missing_name)
                    else:
                        parent_node = Node(missing_name, parent_node, {}, {
                            "clean_name": missing_name + " [id:" + str(next_new_id) + "]",
                            "id": next_new_id,
                            "rank": authorized_ranks[missing_rank_depth]
                        })
                        next_new_id += 1

                # Add evaluated node
                unknown_name = "unknown " + evaluated_rank
                evaluated_name = evaluated_taxonomy[-1].strip()
                # The evaluated name is lower-cased for the comparison, so the
                # parent name must be lower-cased too to detect
                # '<Parent> Incertae Sedis'.
                if evaluated_name.lower() in [
                    "unidentified",
                    "uncultured",
                    "incertae sedis",
                    unknown_name,
                    parent_node.name.lower() + " incertae sedis"
                ]:
                    # Clean unknown taxon name
                    evaluated_name = unknown_name
                if evaluated_name == unknown_name and parent_node.has_child(evaluated_name):
                    # Is unknown and unknown already exists
                    current_node = parent_node.get_child(evaluated_name)
                else:
                    # Is not unknown or is unknown but does not already exist
                    current_node = Node(evaluated_name, parent_node, {}, {
                        "clean_name": evaluated_name + " [id:" + evaluated_id + "]",
                        "id": evaluated_id,
                        "rank": evaluated_rank
                    })

            # Store link between complete taxonomy and node in tree: a taxon
            # without authorized rank is linked to its closest kept ancestor
            if current_node is None:
                current_node = parent_node
            taxonomy_ref[";".join(evaluated_taxonomy).lower()] = current_node
    return taxonomy_ref, taxonomy_tree, next_new_id