예제 #1
0
def get_tree_from_fasta(in_fasta):
    """
    @summary: Builds a tree of taxa from the descriptions of a FASTA file.
              Each record description is read as a ';'-separated taxonomy
              (an optional trailing ';' is ignored). The id of the record is
              stored in the "seq_ids" metadata list of the node of the last
              taxon of its taxonomy.
    @param in_fasta: [str] The FASTA file, as accepted by FastaIO.
    @return: [Node] The root node of the tree or None if the file does not
             contain any record.
    @warning: The root node must be present. The root is created from the
              first element of the first record only; the first element of
              the following records is not checked against it.
    """
    databank_tree = None
    FH_databank = FastaIO(in_fasta)
    try:
        for record in FH_databank:
            description = record.description
            if description.endswith(";"):
                description = description[:-1]
            taxonomy = description.split(";")
            if databank_tree is None:
                databank_tree = Node(taxonomy[0])
            parent = databank_tree
            leaf_depth = len(taxonomy) - 2  # Index of the last taxon in taxonomy[1:]
            for rank_depth, taxa in enumerate(taxonomy[1:]):
                if parent.has_child(taxa):
                    taxa_node = parent.get_child(taxa)
                else:
                    taxa_node = Node(taxa, parent)
                if rank_depth == leaf_depth:  # Current node is leaf
                    # The node may already exist as an internal node of a
                    # longer taxonomy: in this case it has no "seq_ids" yet.
                    if "seq_ids" not in taxa_node.metadata:
                        taxa_node.metadata["seq_ids"] = list()
                    taxa_node.metadata["seq_ids"].append(record.id)
                parent = taxa_node
    finally:
        # Release the file handle even if a record cannot be processed
        FH_databank.close()
    return databank_tree
예제 #2
0
def get_tree_from_fasta(in_fasta):
    """
    @summary: Builds a tree of taxa from the descriptions of a FASTA file.
              Each record description is read as a ';'-separated taxonomy
              (an optional trailing ';' is ignored). The id of the record is
              stored in the "seq_ids" metadata list of the node of the last
              taxon of its taxonomy.
    @param in_fasta: [str] The FASTA file, as accepted by FastaIO.
    @return: [Node] The root node of the tree or None if the file does not
             contain any record.
    @warning: The root node must be present. The root is created from the
              first element of the first record only; the first element of
              the following records is not checked against it.
    """
    databank_tree = None
    FH_databank = FastaIO(in_fasta)
    try:
        for record in FH_databank:
            description = record.description
            if description.endswith(";"):
                description = description[:-1]
            taxonomy = description.split(";")
            if databank_tree is None:
                databank_tree = Node(taxonomy[0])
            parent = databank_tree
            leaf_depth = len(taxonomy) - 2  # Index of the last taxon in taxonomy[1:]
            for rank_depth, taxa in enumerate(taxonomy[1:]):
                if parent.has_child(taxa):
                    taxa_node = parent.get_child(taxa)
                else:
                    taxa_node = Node(taxa, parent)
                if rank_depth == leaf_depth:  # Current node is leaf
                    # The node may already exist as an internal node of a
                    # longer taxonomy: in this case it has no "seq_ids" yet.
                    if "seq_ids" not in taxa_node.metadata:
                        taxa_node.metadata["seq_ids"] = list()
                    taxa_node.metadata["seq_ids"].append(record.id)
                parent = taxa_node
    finally:
        # Release the file handle even if a record cannot be processed
        FH_databank.close()
    return databank_tree
예제 #3
0
def update_tree_for_sample( biom, tree, sample_name, taxonomy_key, sample_id=None ):
    """
    @summary: Adds to a tree the taxa of the observations of one sample and
              accumulates the sample counts on the node of the last taxon of
              each observation.
    @param biom: [Biom] The Biom object.
    @param tree: [Node] The root node of the tree to update.
    @param sample_name: [str] The name of the processed sample.
    @param taxonomy_key: [str] The metadata title for the taxonomy in biom.
    @param sample_id: [str] If it is provided, it is used (as string) instead
                      of the sample name as count key in nodes metadata.
    @return: [Node] The root node received in parameter.
    """
    if sample_id is None:
        count_key = sample_name
    else:
        count_key = str(sample_id)
    for obs in biom.get_observations_by_sample( sample_name ):
        obs_metadata = obs["metadata"]
        # Observations without taxonomy are not reported in tree
        if taxonomy_key not in obs_metadata or obs_metadata[taxonomy_key] is None:
            continue
        # Go down from root to the last taxon and create the missing nodes
        node = tree
        for taxon in biom.get_observation_taxonomy( obs["id"], taxonomy_key ):
            if not node.has_child( taxon ):
                node.add_child( Node(taxon) )
            node = node.get_child( taxon )
        # Add the count of the sample on the last node
        if count_key not in node.metadata:
            node.metadata[count_key] = 0
        node.metadata[count_key] += biom.get_count( obs["id"], sample_name )
    return tree
예제 #4
0
def get_tree_with_count( input_biom, compress=False, taxonomy_key="taxonomy" ):
    """
    @summary: Builds from a BIOM file the tree of taxa with their counts by sample.
    @param input_biom: [str] Path to the BIOM file processed.
    @param compress: [bool] If true, the counts are stored in tree under the index of the sample instead of its name.
    @param taxonomy_key: [str] The metadata title for the taxonomy in biom.
    @return: [tuple] The root node of the generated tree and the ordered list of samples names (with compress, the index of a name in this list is the key used in tree).
    """
    root = Node("root")
    biom = BiomIO.from_json( input_biom )
    samples_names = []
    for sample_idx, sample_name in enumerate( biom.get_samples_names() ):
        samples_names.append( sample_name )
        # The position of the sample in the list is its compressed id
        tree_sample_id = sample_idx if compress else None
        update_tree_for_sample( biom, root, sample_name, taxonomy_key, tree_sample_id )
    return root, samples_names
예제 #5
0
def process(input_fasta,
            input_tax,
            output_fasta,
            output_tax,
            domains_filtered=None,
            ranks=[
                "rootrank", "domain", "phylum", "class", "order", "family",
                "genus", "species"
            ]):
    """
    @summary: Writes the sequence file and the taxonomy file for the RDP
              retrain classifier from the silva sequences and taxonomy.
              The sequences of the filtered domains and the sequences
              without node in the taxonomy file are skipped. In the kept
              sequences every 'u'/'U' is replaced by 't'/'T'.
    @param input_fasta: [str] The silva sequences file.
    @param input_tax: [str] The silva taxonomy file.
    @param output_fasta: [str] The RDP retrain classifier sequence file.
    @param output_tax: [str] The RDP retrain classifier taxonomy file.
    @param domains_filtered: [list] These domains are not kept in output.
                             They are compared with the lowercased domain
                             of each sequence.
    @param ranks: [list] The ranks reported in outputs. The default list is
                  shared between calls: it must not be modified in place.
    @raise Exception: When a sequence is kept and 'species' is not in ranks.
    """
    # Create tax tree
    taxonomy_ref, taxonomy_tree, new_taxon_id = silva_tax_2_tree(
        input_tax, ranks, domains_filtered)
    has_species_rank = "species" in ranks

    # Write fasta (the context managers close the files even on error)
    with open(output_fasta, "w") as FH_cleanFasta:
        with open(input_fasta) as FH_fasta:
            # None until the first header: the lines found before any
            # header are written like the lines of a kept sequence.
            is_filtered = None
            for line in FH_fasta:
                if not line.startswith(">"):
                    # Sequence line
                    if not is_filtered:
                        FH_cleanFasta.write(
                            line.strip().replace('u', 't').replace('U', 'T') +
                            "\n")
                    continue

                # Header line: '>id taxon;taxon;...;species'
                line_fields = line.strip().split()
                evaluated_id = line_fields[0]
                evaluated_taxonomy = " ".join(line_fields[1:]).split(";")
                # Key of the taxonomy without its last element (the species)
                parent_key = ";".join(evaluated_taxonomy[:-1]).lower()

                if domains_filtered is not None and evaluated_taxonomy[
                        0].strip().lower() in domains_filtered:
                    is_filtered = True
                    continue
                if parent_key not in taxonomy_ref:
                    is_filtered = True
                    print("The sequence '" + evaluated_id +
                          "' is skipped because the node for '" +
                          ";".join(evaluated_taxonomy) +
                          "' does not exist in the taxonomy file.")
                    continue

                is_filtered = False
                if not has_species_rank:
                    raise Exception(
                        "The execution without 'species' rank is not implemented."
                    )
                parent_node = taxonomy_ref[parent_key]
                # Go to genus: complete the missing ranks with 'unknown' nodes
                while parent_node.metadata["rank"] != "genus":
                    next_rank = ranks[
                        ranks.index(parent_node.metadata["rank"]) + 1]
                    missing_name = "unknown " + next_rank
                    if parent_node.has_child(missing_name):
                        parent_node = parent_node.get_child(missing_name)
                    else:
                        parent_node = Node(missing_name, parent_node, {}, {
                            "clean_name":
                            missing_name + " [id:" + str(new_taxon_id) + "]",
                            "id":
                            new_taxon_id,
                            "rank":
                            next_rank
                        })
                        new_taxon_id += 1
                # Add species to tree
                species_name = get_cleaned_sp(evaluated_taxonomy[-1])
                if parent_node.has_child(species_name):
                    species_node = parent_node.get_child(species_name)
                else:
                    species_node = Node(species_name, parent_node, {}, {
                        "clean_name":
                        species_name.replace('*', ' ').replace(
                            '<', ' ').replace('>', ' ') + " [id:" +
                        str(new_taxon_id) + "]",
                        "id":
                        new_taxon_id,
                        "rank":
                        "species"
                    })
                    new_taxon_id += 1
                clean_taxonomy = get_taxonomy(species_node)
                FH_cleanFasta.write(line_fields[0] + '\tRoot' + ';' +
                                    clean_taxonomy + "\n")

    # Write RDP tax file
    with open(output_tax, "w") as FH_cleanTax:
        write_rdp_tax(FH_cleanTax, taxonomy_tree)
예제 #6
0
def silva_tax_2_tree(taxonomy_file, authorized_ranks, domains_filtered=None):
    """
    @summary: Returns a taxonomic tree from the silva taxonomy.
    @param taxonomy_file: [str] The silva taxonomy file (provide taxon name, taxon level and taxon rank name). Line example: 'Archaea;Crenarchaeota;Thermoprotei;    7    class        119'
    @param authorized_ranks: [list] The ranks reported in tree.
    @param domains_filtered: [list] These domains are not kept in tree.
    @return: [dict, Node, int] The link between complete taxonomy (all levels, lowercased) and node (only authorized ranks), the root node of the tree, the id of the next new element added in tree.
    @raise Exception: When a line cannot be parsed or when the rank of a taxon is not deeper than the rank of its ancestor.
    @note: The 'id' metadata of the nodes coming from the file is the string read in file. The 'id' of the nodes generated for the missing ranks is an integer.
    """
    next_new_id = 1000000000
    taxonomy_ref = dict()
    taxonomy_tree = Node("Root", None, None, {
        'clean_name': 'Root',
        'id': 0,
        'rank': 'rootrank'
    })
    # '<taxonomy>;  <taxon id>  <rank name>' (raw string, compiled once)
    line_pattern = re.compile(r"^(.+);\s+(\d+)\s+([^\s]+)")
    # The context manager closes the file even when a line is rejected
    with open(taxonomy_file) as FH_tax:
        for line in FH_tax:
            line = line.strip()
            if line == "":
                continue
            # Parse line
            matches = line_pattern.search(line)
            if matches is None:
                raise Exception("Incorrect line content : '" + line + "'.")
            evaluated_taxonomy = matches.group(1).split(";")
            evaluated_id = matches.group(2)
            evaluated_rank = matches.group(3)
            if domains_filtered is not None and evaluated_taxonomy[0].strip(
            ).lower() in domains_filtered:
                continue
            # Go to the most downstream already existing ancestor of the element
            parent_node = taxonomy_tree
            if len(evaluated_taxonomy) > 1:
                parent_node = taxonomy_ref[";".join(
                    evaluated_taxonomy[:-1]).lower()]
            current_node = None
            # Add evaluated node if it has a valid rank
            if evaluated_rank in authorized_ranks:
                parent_rank_depth = authorized_ranks.index(
                    parent_node.metadata["rank"])
                if authorized_ranks.index(
                        evaluated_rank) <= parent_rank_depth:
                    if evaluated_taxonomy[
                            -1] == "uncultured" and parent_node.name == evaluated_taxonomy[
                                -2]:
                        # An 'uncultured' directly under its parent takes
                        # the rank following the rank of the parent
                        evaluated_rank = authorized_ranks[parent_rank_depth
                                                          + 1]
                    else:
                        raise Exception(
                            "The taxonomy in file '" + taxonomy_file +
                            "' seems to be incoherent. The taxon '" +
                            ";".join(evaluated_taxonomy) +
                            "' is tagged as '" + evaluated_rank +
                            "' and its ancestor '" +
                            get_taxonomy(parent_node) +
                            "' is tagged as '" +
                            parent_node.metadata["rank"] + "'.")
                # Complete missing ranks between parent and evaluated
                evaluated_rank_depth = authorized_ranks.index(evaluated_rank)
                while authorized_ranks[evaluated_rank_depth -
                                       1] != parent_node.metadata["rank"]:
                    missing_rank_depth = authorized_ranks.index(
                        parent_node.metadata["rank"]) + 1
                    missing_name = "unknown " + authorized_ranks[
                        missing_rank_depth]
                    if parent_node.has_child(missing_name):
                        missing_node = parent_node.get_child(missing_name)
                    else:
                        missing_node = Node(
                            missing_name, parent_node, {}, {
                                "clean_name":
                                missing_name + " [id:" + str(next_new_id) +
                                "]",
                                "id":
                                next_new_id,
                                "rank":
                                authorized_ranks[missing_rank_depth]
                            })
                        next_new_id += 1
                    parent_node = missing_node
                # Add evaluated node
                evaluated_name = evaluated_taxonomy[-1].strip()
                # The evaluated name is lowercased for the comparison, so
                # the parent name must be lowercased too: otherwise
                # '<Parent> Incertae Sedis' is never recognized.
                if evaluated_name.lower() in [
                        "unidentified", "uncultured", "incertae sedis",
                        "unknown " + evaluated_rank,
                        parent_node.name.lower() + " incertae sedis"
                ]:  # Clean unknown taxon name
                    evaluated_name = "unknown " + evaluated_rank
                if evaluated_name == "unknown " + evaluated_rank and parent_node.has_child(
                        evaluated_name
                ):  # Is unknown and unknown already exists
                    current_node = parent_node.get_child(evaluated_name)
                else:  # Is not unknown or is unknown but does not already exist
                    current_node = Node(evaluated_name, parent_node, {}, {
                        "clean_name":
                        evaluated_name + " [id:" + evaluated_id + "]",
                        "id":
                        evaluated_id,
                        "rank":
                        evaluated_rank
                    })
            # Store link between complete taxonomy and node in tree. A taxon
            # with an unauthorized rank is linked to its ancestor.
            if current_node is None:
                current_node = parent_node
            taxonomy_ref[";".join(evaluated_taxonomy).lower()] = current_node

    return taxonomy_ref, taxonomy_tree, next_new_id